def load_model(self, model_path, model_config, wavernn_path, use_cuda):
    """Load the Tacotron model, the WaveRNN vocoder and a FIR filter.

    Args:
        model_path: Path of the Tacotron checkpoint file.
        model_config: Path of the configuration file passed to ``load_config``.
        wavernn_path: Path handed to the vocoder's ``loadWeights``.
        use_cuda: When true the checkpoint is loaded with its stored device
            placement and the model is moved to the GPU; otherwise storages
            are kept on the CPU.
    """
    self.model_file = model_path
    print(" > Loading model ...")
    print(" | > model config: ", model_config)
    print(" | > model file: ", self.model_file)

    cfg = load_config(model_config)
    self.config = cfg
    self.use_cuda = use_cuda
    self.use_phonemes = cfg.use_phonemes
    self.ap = AudioProcessor(**cfg.audio)

    # The adapters read self.config at call time, not at definition time.
    def _phoneme_adapter(sen):
        return phoneme_to_sequence(sen, [self.config.text_cleaner],
                                   self.config.phoneme_language)

    def _text_adapter(sen):
        return text_to_sequence(sen, [self.config.text_cleaner])

    if self.use_phonemes:
        self.input_size = len(phonemes)
        self.input_adapter = _phoneme_adapter
    else:
        self.input_size = len(symbols)
        self.input_adapter = _text_adapter

    self.model = Tacotron(self.input_size,
                          cfg.embedding_size,
                          self.ap.num_freq,
                          self.ap.num_mels,
                          cfg.r,
                          attn_windowing=True)
    self.model.decoder.max_decoder_steps = 8000

    # load model state; without CUDA every storage stays where it was
    # deserialized (CPU)
    load_kwargs = {}
    if not use_cuda:
        load_kwargs["map_location"] = lambda storage, loc: storage
    cp = torch.load(self.model_file, **load_kwargs)

    # load the model
    self.model.load_state_dict(cp['model'])
    if use_cuda:
        self.model.cuda()
    self.model.eval()

    # vocoder and FIR coefficients (1025 taps, band edges 65 and 7600 at fs=16000)
    self.vocoder = WaveRNNVocoder.Vocoder()
    self.vocoder.loadWeights(wavernn_path)
    self.firwin = signal.firwin(1025, [65, 7600], pass_zero=False, fs=16000)
def load_tts(self, model_path, model_file, model_config, use_cuda):
    """Load the TTS model described by a config/checkpoint pair.

    Args:
        model_path: Directory that contains both the config and the checkpoint.
        model_file: Checkpoint file name, relative to ``model_path``.
        model_config: Config file name, relative to ``model_path``.
        use_cuda: When true the model is moved to the GPU after loading;
            otherwise checkpoint storages are kept on the CPU.
    """
    config_file = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_file)
    print(" > Loading TTS model ...")
    print(" | > model config: ", config_file)
    # note: the bare file name is printed here, not the joined path
    print(" | > model file: ", model_file)

    self.tts_config = load_config(config_file)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)

    # The adapters read self.tts_config at call time, not at definition time.
    def _phoneme_adapter(sen):
        return phoneme_to_sequence(sen, [self.tts_config.text_cleaner],
                                   self.tts_config.phoneme_language,
                                   self.tts_config.enable_eos_bos_chars)

    def _text_adapter(sen):
        return text_to_sequence(sen, [self.tts_config.text_cleaner])

    if self.use_phonemes:
        self.input_size = len(phonemes)
        self.input_adapter = _phoneme_adapter
    else:
        self.input_size = len(symbols)
        self.input_adapter = _text_adapter

    self.tts_model = setup_model(self.input_size, self.tts_config)

    # load model state
    load_kwargs = {}
    if not use_cuda:
        load_kwargs["map_location"] = lambda storage, loc: storage
    cp = torch.load(self.model_file, **load_kwargs)

    # load the model
    self.tts_model.load_state_dict(cp['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000
def __init__(self, csv_file, root_dir, outputs_per_step, sample_rate,
             text_cleaner, num_mels, min_level_db, frame_shift_ms,
             frame_length_ms, preemphasis, ref_level_db, num_freq, power,
             min_seq_len=0):
    """Read the metadata file and set up the audio processor.

    Args:
        csv_file: Path of a UTF-8 text file with ``|``-separated fields,
            one record per line.
        root_dir: Dataset root directory (stored and printed).
        outputs_per_step: Stored as ``self.outputs_per_step``.
        sample_rate: Stored and forwarded to ``AudioProcessor``.
        text_cleaner: Stored as ``self.cleaners``.
        num_mels, min_level_db, frame_shift_ms, frame_length_ms, preemphasis,
        ref_level_db, num_freq, power: Forwarded positionally to
            ``AudioProcessor`` (after ``sample_rate``).
        min_seq_len: Stored as ``self.min_seq_len``.
    """
    # Lines are split as-is, so the last field of each record keeps its
    # trailing newline.
    with open(csv_file, "r", encoding="utf8") as metadata:
        records = [entry.split('|') for entry in metadata]
    self.frames = records

    self.root_dir = root_dir
    self.outputs_per_step = outputs_per_step
    self.sample_rate = sample_rate
    self.cleaners = text_cleaner
    self.min_seq_len = min_seq_len

    audio_args = (sample_rate, num_mels, min_level_db, frame_shift_ms,
                  frame_length_ms, preemphasis, ref_level_db, num_freq, power)
    self.ap = AudioProcessor(*audio_args)

    print(" > Reading LJSpeech from - {}".format(root_dir))
    print(" | > Number of instances : {}".format(len(self.frames)))
    self._sort_frames()
def load_model(self, model_path, model_name, model_config, use_cuda):
    """Load a Tacotron model and its configuration from ``model_path``.

    Args:
        model_path: Directory that contains both the config and the checkpoint.
        model_name: Checkpoint file name, relative to ``model_path``.
        model_config: Config file name, relative to ``model_path``.
        use_cuda: When true the model is moved to the GPU after loading;
            otherwise checkpoint storages are kept on the CPU.
    """
    model_config = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_name)
    print(" > Loading model ...")
    print(" | > model config: ", model_config)
    print(" | > model file: ", self.model_file)

    cfg = load_config(model_config)
    self.config = cfg
    self.use_cuda = use_cuda
    self.use_phonemes = cfg.use_phonemes
    self.ap = AudioProcessor(**cfg.audio)

    # The adapters read self.config at call time, not at definition time.
    def _phoneme_adapter(sen):
        return phoneme_to_sequence(sen, [self.config.text_cleaner],
                                   self.config.phoneme_language)

    def _text_adapter(sen):
        return text_to_sequence(sen, [self.config.text_cleaner])

    if self.use_phonemes:
        self.input_size = len(phonemes)
        self.input_adapter = _phoneme_adapter
    else:
        self.input_size = len(symbols)
        self.input_adapter = _text_adapter

    self.model = Tacotron(self.input_size, cfg.embedding_size,
                          self.ap.num_freq, self.ap.num_mels, cfg.r)

    # load model state
    load_kwargs = {}
    if not use_cuda:
        load_kwargs["map_location"] = lambda storage, loc: storage
    cp = torch.load(self.model_file, **load_kwargs)

    # load the model
    self.model.load_state_dict(cp['model'])
    if use_cuda:
        self.model.cuda()
    self.model.eval()
def load_tts(self, model_path, model_file, model_config, use_cuda):
    """Load a (possibly multi-speaker) TTS model from ``model_path``.

    Args:
        model_path: Directory that contains the config, the checkpoint and,
            if configured, the speaker mapping file.
        model_file: Checkpoint file name, relative to ``model_path``.
        model_config: Config file name, relative to ``model_path``.
        use_cuda: When true the model is moved to the GPU after loading.
            When false the checkpoint storages are kept on the CPU so that a
            checkpoint saved on a GPU can still be loaded on a CPU-only host.
    """
    tts_config = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_file)
    print(" > Loading TTS model ...")
    print(" | > model config: ", tts_config)
    # note: the bare file name is printed here, not the joined path
    print(" | > model file: ", model_file)

    self.tts_config = load_config(tts_config)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)
    self.input_size = len(phonemes) if self.use_phonemes else len(symbols)

    # load speakers
    # NOTE(review): this reads self.config, which is not assigned in this
    # method -- presumably a separate config set elsewhere on the instance;
    # confirm it is not meant to be self.tts_config.
    if self.config.tts_speakers is not None:
        self.tts_speakers = load_speaker_mapping(
            os.path.join(model_path, self.config.tts_speakers))
        num_speakers = len(self.tts_speakers)
    else:
        num_speakers = 0

    self.tts_model = setup_model(self.input_size,
                                 num_speakers=num_speakers,
                                 c=self.tts_config)

    # load model state; without CUDA keep every storage on the CPU instead of
    # trying to restore the device recorded in the checkpoint
    if use_cuda:
        cp = torch.load(self.model_file)
    else:
        cp = torch.load(self.model_file,
                        map_location=lambda storage, loc: storage)

    # load the model
    self.tts_model.load_state_dict(cp['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000

    # restore the reduction factor stored with the checkpoint, if any
    if 'r' in cp and self.tts_config.model in ["Tacotron", "TacotronGST"]:
        self.tts_model.decoder.set_r(cp['r'])
def main():
    """Synthesize ``args.text`` with a Tacotron checkpoint and save it as a wav.

    Reads the module-level ``args`` (``model_path``, ``text``, ``out_path``)
    and ``device``. The output file name is the input text with spaces
    replaced by underscores plus a ``.wav`` suffix.
    """
    ap = AudioProcessor()

    # load model
    model = Tacotron(len(phonemes)).to(device)
    checkpoint = torch.load(args.model_path)
    model.load_state_dict(checkpoint['model'])
    model.eval()

    text = args.text
    print('Text: {}'.format(text))
    wav = tts(model, text, ap)

    wav_name = text.replace(' ', '_') + '.wav'
    ap.save_wav(wav, os.path.join(args.out_path, wav_name))
def load_model(self, model_path, model_name, model_config, use_cuda):
    """Load a Tacotron model whose hyper-parameters come from the config.

    Args:
        model_path: Directory that contains both the config and the checkpoint.
        model_name: Checkpoint file name, relative to ``model_path``.
        model_config: Config file name, relative to ``model_path``.
        use_cuda: When true the model is moved to the GPU after loading;
            otherwise checkpoint storages are kept on the CPU.
    """
    # build the config's and the model's paths
    config_path = os.path.join(model_path, model_config)
    checkpoint_path = os.path.join(model_path, model_name)
    print(" > Loading model ...")
    print(" | > Model config path: ", config_path)
    print(" | > Model file path: ", checkpoint_path)

    config = load_config(config_path)
    self.use_cuda = use_cuda
    self.use_phonemes = config.use_phonemes
    self.ap = AudioProcessor(**config.audio)

    # The adapters close over the local config object.
    def _phoneme_adapter(sen):
        return phoneme_to_sequence(sen, [config.text_cleaner],
                                   config.phoneme_language)

    def _text_adapter(sen):
        return text_to_sequence(sen, [config.text_cleaner])

    if self.use_phonemes:
        self.input_size = len(phonemes)
        self.input_adapter = _phoneme_adapter
    else:
        self.input_size = len(symbols)
        self.input_adapter = _text_adapter

    # the number of characters is taken from the config, not from input_size
    self.model = Tacotron(num_chars=config['num_chars'],
                          embedding_dim=config['embedding_size'],
                          linear_dim=self.ap.num_freq,
                          mel_dim=self.ap.num_mels,
                          r=config['r'])

    # load model state
    load_kwargs = {}
    if not use_cuda:
        load_kwargs["map_location"] = lambda storage, loc: storage
    cp = torch.load(checkpoint_path, **load_kwargs)

    # load the model
    self.model.load_state_dict(cp['model'])
    # move the model to the GPU when requested
    if use_cuda:
        self.model.cuda()
    # switch the model to evaluation mode
    self.model.eval()
def load_model(self, model_path, model_name, model_config, use_cuda):
    """Load a Tacotron model with a fixed input vocabulary of 61 entries.

    Args:
        model_path: Directory that contains both the config and the checkpoint.
        model_name: Checkpoint file name, relative to ``model_path``.
        model_config: Config file name, relative to ``model_path``.
        use_cuda: When true the model is moved to the GPU after loading;
            otherwise checkpoint storages are kept on the CPU.
    """
    model_config = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_name)
    print(" > Loading model ...")
    print(" | > model config: ", model_config)
    print(" | > model file: ", self.model_file)

    cfg = load_config(model_config)
    self.config = cfg
    self.use_cuda = use_cuda
    self.ap = AudioProcessor(**cfg.audio)
    # 61 is the hard-coded first argument of Tacotron in this loader
    self.model = Tacotron(61, cfg.embedding_size, self.ap.num_freq,
                          self.ap.num_mels, cfg.r)

    # load model state
    load_kwargs = {}
    if not use_cuda:
        load_kwargs["map_location"] = lambda storage, loc: storage
    cp = torch.load(self.model_file, **load_kwargs)

    # load the model
    self.model.load_state_dict(cp['model'])
    if use_cuda:
        self.model.cuda()
    self.model.eval()
raise FileNotFoundError('{} not found'.format(metadata_file)) melspec_dir = os.path.join(args.data_root, 'melspec') if not os.path.exists(melspec_dir): os.makedirs(melspec_dir, exist_ok=True) spec_dir = os.path.join(args.data_root, 'spec') if not os.path.exists(spec_dir): os.makedirs(spec_dir, exist_ok=True) phoneme_dir = os.path.join(args.data_root, 'phoneme') if not os.path.exists(phoneme_dir): os.makedirs(phoneme_dir, exist_ok=True) items = load_metadata(metadata_file) ap = AudioProcessor() for text, wav_file in tqdm(items): prefix = wav_file.replace('.wav', '') # 音素系列を生成 generate_phoneme_sequence(text, os.path.join(phoneme_dir, prefix + '.npy')) wav = np.array(ap.load_wav(os.path.join(wav_dir, wav_file)), dtype=np.float32) # メルスペクトログラムを生成 melspec = ap.melspectrogram(wav).astype('float32') np.save(os.path.join(melspec_dir, prefix + '.npy'), melspec)
def __init__(self, *args, **kwargs):
    """Initialise the test case with an audio processor and a loader limit."""
    super(TestTTSDataset, self).__init__(*args, **kwargs)
    # audio processor built from the module-level config's audio section
    audio_params = c.audio
    self.ap = AudioProcessor(**audio_params)
    self.max_loader_iter = 4
if args.data_path != "": CONFIG.data_path = args.data_path DATA_PATH = CONFIG.data_path # DISTRUBUTED if num_gpus > 1: init_distributed( args.rank, num_gpus, args.group_id, CONFIG.distributed["backend"], CONFIG.distributed["url"], ) global ap ap = AudioProcessor(**CONFIG.audio) mode = CONFIG.mode # setup output paths and read configs _ = os.path.dirname(os.path.realpath(__file__)) if args.data_path != "": CONFIG.data_path = args.data_path if args.output_path == "": OUT_PATH = os.path.join(_, CONFIG.output_path) else: OUT_PATH = args.output_path if args.group_id == "": OUT_PATH = create_experiment_folder(OUT_PATH, CONFIG.model_name)
def main(args): #pylint: disable=redefined-outer-name # Audio processor ap = AudioProcessor(**c.audio) # DISTRUBUTED if num_gpus > 1:
def main(args):
    """Train a Tacotron model, optionally resuming from ``args.restore_path``.

    The dataset class (``MyDataset``) and the ``AudioProcessor`` class are
    imported dynamically from the modules named in the module-level config
    ``c``. ``args.restore_step`` is set to the checkpoint step when restoring
    and to 0 otherwise. The epoch loop always starts at 0, also when
    restoring.

    Args:
        args: Parsed command-line arguments; ``restore_path`` is read and
            ``restore_step`` is written.
    """
    dataset = importlib.import_module('datasets.' + c.dataset)
    Dataset = getattr(dataset, 'MyDataset')
    audio = importlib.import_module('utils.' + c.audio_processor)
    AudioProcessor = getattr(audio, 'AudioProcessor')

    ap = AudioProcessor(sample_rate=c.sample_rate,
                        num_mels=c.num_mels,
                        min_level_db=c.min_level_db,
                        frame_shift_ms=c.frame_shift_ms,
                        frame_length_ms=c.frame_length_ms,
                        ref_level_db=c.ref_level_db,
                        num_freq=c.num_freq,
                        power=c.power,
                        preemphasis=c.preemphasis)

    # Setup the dataset
    train_dataset = Dataset(c.data_path,
                            c.meta_file_train,
                            c.r,
                            c.text_cleaner,
                            ap=ap,
                            min_seq_len=c.min_seq_len)
    train_loader = DataLoader(train_dataset,
                              batch_size=c.batch_size,
                              shuffle=False,
                              collate_fn=train_dataset.collate_fn,
                              drop_last=False,
                              num_workers=c.num_loader_workers,
                              pin_memory=True)

    if c.run_eval:
        val_dataset = Dataset(c.data_path,
                              c.meta_file_val,
                              c.r,
                              c.text_cleaner,
                              ap=ap)
        val_loader = DataLoader(val_dataset,
                                batch_size=c.eval_batch_size,
                                shuffle=False,
                                collate_fn=val_dataset.collate_fn,
                                drop_last=False,
                                num_workers=4,
                                pin_memory=True)
    else:
        val_loader = None

    model = Tacotron(c.embedding_size, ap.num_freq, c.num_mels, c.r)
    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    optimizer = optim.Adam(model.parameters(), lr=c.lr)
    # the stop network gets its own optimizer
    optimizer_st = optim.Adam(model.decoder.stopnet.parameters(), lr=c.lr)

    criterion = L1LossMasked()
    criterion_st = nn.BCELoss()

    # Overwritten below when a checkpoint is restored.
    best_loss = float('inf')

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        model.load_state_dict(checkpoint['model'])
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()
        optimizer.load_state_dict(checkpoint['optimizer'])
        # the stop-network optimizer state is not restored
        if use_cuda:
            # Move the restored optimizer state next to the model parameters.
            # Only done with CUDA enabled so that a CPU restore does not fail.
            for state in optimizer.state.values():
                for k, v in state.items():
                    if torch.is_tensor(v):
                        state[k] = v.cuda()
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        best_loss = checkpoint['linear_loss']
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0
        print("\n > Starting a new training", flush=True)
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()

    scheduler = AnnealLR(optimizer, warmup_steps=c.warmup_steps)
    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params), flush=True)

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    for epoch in range(0, c.epochs):
        train_loss, current_step = train(model, criterion, criterion_st,
                                         train_loader, optimizer,
                                         optimizer_st, scheduler, ap, epoch)
        # NOTE(review): val_loader is None when c.run_eval is false; evaluate
        # is still called -- confirm it handles that case.
        val_loss = evaluate(model, criterion, criterion_st, val_loader, ap,
                            current_step)
        print(" | > Train Loss: {:.5f} Validation Loss: {:.5f}".format(
            train_loss, val_loss), flush=True)
        # the best model is selected on the training loss
        best_loss = save_best_model(model, optimizer, train_loss, best_loss,
                                    OUT_PATH, current_step, epoch)
def __init__(self, *args, **kwargs):
    """Initialise the test case with an audio processor built from the config."""
    super(TestAudio, self).__init__(*args, **kwargs)
    # audio section of the module-level config, expanded as keyword arguments
    audio_params = c.audio
    self.ap = AudioProcessor(**audio_params)
type=str, help='Path to save final wav file.', ) args = parser.parse_args() try: path = os.path.realpath(os.path.dirname(__file__)) except NameError as e: path = './' C = load_config(os.path.join(path, 'pretrained_models/TTS/config.json')) C.forward_attn_mask = False C.windowing = True # load the audio processor ap = AudioProcessor(**C.audio) num_speakers = 0 # load the model num_chars = len(phonemes) if C.use_phonemes else len(symbols) model = setup_model(num_chars, num_speakers, C) cp = torch.load(os.path.join(path, 'pretrained_models/TTS/best_model.pth.tar'), map_location='cpu') model.load_state_dict(cp['model'], strict=False) model.r = cp['r'] model.decoder.r = cp['r'] model.eval() if use_cuda: model.cuda()
def main():
    """Train Tacotron on LJSpeech and keep the checkpoint with the best validation loss.

    Uses the module-level settings ``r``, ``batch_size``, ``lr``, ``epochs``,
    ``device`` and ``writer``. Runs ``epochs + 1`` epochs (0..epochs) and
    writes ``best_model.pth`` into ``writer.logdir`` whenever the validation
    loss improves.
    """
    ap = AudioProcessor()

    train_dataset = TTSDataset('data/LJSpeech-1.1', 'train.list',
                               outputs_per_step=r)
    valid_dataset = TTSDataset('data/LJSpeech-1.1', 'valid.list',
                               outputs_per_step=r)
    print('train data:', len(train_dataset))
    print('valid data:', len(valid_dataset))

    def _make_loader(dataset, shuffle):
        # Both loaders share every setting except shuffling.
        return torch.utils.data.DataLoader(dataset,
                                           batch_size=batch_size,
                                           shuffle=shuffle,
                                           collate_fn=dataset.collate_fn,
                                           drop_last=False,
                                           num_workers=0,
                                           pin_memory=False)

    train_loader = _make_loader(train_dataset, True)
    valid_loader = _make_loader(valid_dataset, False)

    # Create models
    model = Tacotron(len(phonemes), r=r).to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0.0)
    # StopNet is a binary classification task, so it is trained separately
    # with its own optimizer
    optimizer_st = optim.Adam(model.decoder.stopnet.parameters(),
                              lr=lr,
                              weight_decay=0.0)

    criterion = L1LossMasked()
    criterion_st = nn.BCEWithLogitsLoss()

    print('Model has {} parameters'.format(count_parameters(model)))

    # Training
    best_loss = float('inf')
    global_step = 0
    for epoch in range(epochs + 1):
        train_loss, global_step = train(train_loader, model, criterion,
                                        criterion_st, optimizer,
                                        optimizer_st, ap, global_step, epoch)
        valid_loss = evaluate(valid_loader, model, criterion, criterion_st,
                              ap, global_step, epoch)
        print('Epoch [{}/{}] train_loss: {:.5f} valid_loss: {:.5f}'.format(
            epoch, epochs, train_loss, valid_loss))

        if not valid_loss < best_loss:
            continue

        print(' => valid_loss improved from {:.5f} to {:.5f}!'.format(
            best_loss, valid_loss))
        snapshot = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'epoch': epoch,
            'linear_loss': valid_loss
        }
        best_loss = valid_loss
        torch.save(snapshot, os.path.join(writer.logdir, 'best_model.pth'))
def main(args):  #pylint: disable=redefined-outer-name
    """Train a TTS model, optionally resuming from ``args.restore_path``.

    Builds the model with ``setup_model``, sets up the Ranger optimizers and
    the loss functions selected by the module-level config ``c``, restores a
    checkpoint if requested, then runs the epoch loop. Validation is run on
    every fifth epoch (starting with epoch 0); the most recent validation
    loss is reused in between. ``args.restore_step`` is set to the checkpoint
    step when restoring and to 0 otherwise.

    Args:
        args: Parsed command-line arguments; ``rank``, ``group_id`` and
            ``restore_path`` are read and ``restore_step`` is written.
    """
    # Audio processor
    ap = AudioProcessor(**c.audio)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    if c.use_speaker_embedding:
        speakers = get_speakers(c.data_path, c.meta_file_train, c.dataset)
        if args.restore_path:
            prev_out_path = os.path.dirname(args.restore_path)
            speaker_mapping = load_speaker_mapping(prev_out_path)
            assert all(speaker in speaker_mapping
                       for speaker in speakers), "As of now you, you cannot " \
                                                 "introduce new speakers to " \
                                                 "a previously trained model."
        else:
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers,
                                                     ", ".join(speakers)))
    else:
        num_speakers = 0

    model = setup_model(num_chars, num_speakers, c)

    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    optimizer = Ranger(model.parameters(), lr=c.lr, weight_decay=c.wd)
    optimizer_gst = Ranger(model.textgst.parameters(), lr=c.lr,
                           weight_decay=c.wd) if c.text_gst else None

    if c.stopnet and c.separate_stopnet:
        optimizer_st = Ranger(model.decoder.stopnet.parameters(), lr=c.lr)
    else:
        optimizer_st = None

    # Tacotron variants use an L1 loss, the other models an MSE loss.
    uses_l1 = c.model in ["Tacotron", "TacotronGST"]
    if c.loss_masking:
        criterion = L1LossMasked() if uses_l1 else MSELossMasked()
    else:
        criterion = nn.L1Loss() if uses_l1 else nn.MSELoss()
    criterion_st = nn.BCEWithLogitsLoss() if c.stopnet else None
    criterion_gst = nn.L1Loss() if c.text_gst else None

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore; the optimizer state is currently not restored
            if c.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except Exception:  # deliberately broad fallback, but no longer
            # swallows KeyboardInterrupt / SystemExit like a bare except
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint, c)
            model.load_state_dict(model_dict)
            del model_dict
        # the learning rate always comes from the current config
        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model = model.cuda()
        criterion.cuda()
        if criterion_st:
            criterion_st.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.lr_decay:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # No best loss is carried over from a checkpoint in this function.
    best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
        print(" > Number of outputs per iteration:", model.decoder.r)

        train_loss, global_step = train(model, criterion, criterion_st,
                                        optimizer, optimizer_st, scheduler,
                                        ap, global_step, epoch,
                                        criterion_gst=criterion_gst,
                                        optimizer_gst=optimizer_gst)
        # validation runs on every fifth epoch only; epoch 0 always
        # evaluates, so val_loss is defined from the first iteration on
        if epoch % 5 == 0:
            val_loss = evaluate(model, criterion, criterion_st, criterion_gst,
                                ap, global_step, epoch)
        print(" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format(
            train_loss, val_loss), flush=True)

        target_loss = train_loss
        if c.run_eval:
            target_loss = val_loss
        best_loss = save_best_model(model, optimizer, optimizer_st,
                                    optimizer_gst, target_loss, best_loss,
                                    OUT_PATH, global_step, epoch)
def __init__(self, *args, **kwargs):
    """Initialise the test case from the config stored next to the cached data."""
    super(TestTTSDatasetCached, self).__init__(*args, **kwargs)
    self.max_loader_iter = 4
    # config.json inside the cache directory named by the module-level config
    cached_config_path = os.path.join(c.data_path_cache, 'config.json')
    self.c = load_config(cached_config_path)
    self.ap = AudioProcessor(**self.c.audio)
if args.restore_path: new_fields["restore_path"] = args.restore_path new_fields["github_branch"] = get_git_branch() copy_config_file(args.config_path, os.path.join(OUT_PATH, 'config.json'), new_fields) os.chmod(AUDIO_PATH, 0o775) os.chmod(OUT_PATH, 0o775) if args.rank == 0: LOG_DIR = OUT_PATH tb_logger = Logger(LOG_DIR) # Conditional imports preprocessor = importlib.import_module('datasets.preprocess') preprocessor = getattr(preprocessor, c.dataset.lower()) # Audio processor ap = AudioProcessor(**c.audio) try: main(args) except KeyboardInterrupt: try: sys.exit(0) except SystemExit: os._exit(0) except Exception: remove_experiment_folder(OUT_PATH) traceback.print_exc() sys.exit(1)
# Set constants ROOT_PATH = '' MODEL_PATH = ROOT_PATH + 'best_model.pth.tar' CONFIG_PATH = ROOT_PATH + 'config.json' OUT_FOLDER = ROOT_PATH + '' CONFIG = load_config(CONFIG_PATH) use_cuda = False # load the model model = Tacotron(61, CONFIG.embedding_size, CONFIG.audio['num_freq'], CONFIG.audio['num_mels'], CONFIG.r) # load the audio processor ap = AudioProcessor(CONFIG.audio['sample_rate'], CONFIG.audio['num_mels'], CONFIG.audio['min_level_db'], CONFIG.audio['frame_shift_ms'], CONFIG.audio['frame_length_ms'], CONFIG.audio['preemphasis'], CONFIG.audio['ref_level_db'], CONFIG.audio['num_freq'], CONFIG.audio['power'], griffin_lim_iters=30) # load model state if use_cuda: cp = torch.load(MODEL_PATH) else: cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage) new=list(cp['model'].items()) my_model_kvpair=model.state_dict() count=0 for key,value in my_model_kvpair.items(): layer_name,weights=new[count]