def _create_random_model(self):
    config = load_config(
        os.path.join(get_tests_output_path(), 'dummy_model_config.json'))
    num_chars = len(phonemes) if config.use_phonemes else len(symbols)
    model = setup_model(num_chars, 0, config)
    output_path = get_tests_output_path()
    save_checkpoint(model, None, None, None, output_path, 10, 10)
def load_tts(self, tts_checkpoint, tts_config, use_cuda):
    print(" > Loading TTS model ...")
    print(" | > model config: ", tts_config)
    print(" | > checkpoint file: ", tts_checkpoint)
    self.tts_config = load_config(tts_config)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)
    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)
    # TODO: fix this for multi-speaker model - load speakers
    if self.config.tts_speakers is not None:
        self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
        num_speakers = len(self.tts_speakers)
    else:
        num_speakers = 0
    self.tts_model = setup_model(
        self.input_size, num_speakers=num_speakers, c=self.tts_config)
    # load model state
    cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
    # load the model
    self.tts_model.load_state_dict(cp['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000
    if 'r' in cp:
        self.tts_model.decoder.set_r(cp['r'])
def load_tts(self, model_path, model_file, model_config, use_cuda):
    tts_config = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_file)
    print(" > Loading TTS model ...")
    print(" | > model config: ", tts_config)
    print(" | > model file: ", model_file)
    self.tts_config = load_config(tts_config)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)
    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)
    # load speakers
    if self.config.tts_speakers is not None:
        self.tts_speakers = load_speaker_mapping(
            os.path.join(model_path, self.config.tts_speakers))
        num_speakers = len(self.tts_speakers)
    else:
        num_speakers = 0
    self.tts_model = setup_model(
        self.input_size, num_speakers=num_speakers, c=self.tts_config)
    # load model state
    cp = torch.load(self.model_file)
    # load the model
    self.tts_model.load_state_dict(cp['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000
    if 'r' in cp and self.tts_config.model in ["Tacotron", "TacotronGST"]:
        self.tts_model.decoder.set_r(cp['r'])
def _create_random_model(self):
    # pylint: disable=global-statement
    global symbols, phonemes
    config = load_config(
        os.path.join(get_tests_output_path(), 'dummy_model_config.json'))
    if 'characters' in config.keys():
        symbols, phonemes = make_symbols(**config.characters)
    num_chars = len(phonemes) if config.use_phonemes else len(symbols)
    model = setup_model(num_chars, 0, config)
    output_path = get_tests_output_path()
    save_checkpoint(model, None, None, None, output_path, 10, 10)
def setup():
    use_cuda = True

    # model paths
    TTS_MODEL = "tts_model.pth.tar"
    TTS_CONFIG = "config.json"
    VOCODER_MODEL = "vocoder_model.pth.tar"
    VOCODER_CONFIG = "config_vocoder.json"

    # Load configs
    TTS_CONFIG = load_config(TTS_CONFIG)
    VOCODER_CONFIG = load_config(VOCODER_CONFIG)

    ap = AudioProcessor(**TTS_CONFIG.audio)

    # LOAD TTS MODEL
    # multi speaker
    speaker_id = None
    speakers = []

    # load the model
    num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
    model = setup_model(num_chars, len(speakers), TTS_CONFIG)

    # load model state
    cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))

    # load the model
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()

    # set model stepsize
    if 'r' in cp:
        model.decoder.set_r(cp['r'])

    from TTS.vocoder.utils.generic_utils import setup_generator

    # LOAD VOCODER MODEL
    vocoder_model = setup_generator(VOCODER_CONFIG)
    vocoder_model.load_state_dict(
        torch.load(VOCODER_MODEL, map_location="cpu")["model"])
    vocoder_model.remove_weight_norm()
    vocoder_model.inference_padding = 0

    ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])
    if use_cuda:
        vocoder_model.cuda()
    vocoder_model.eval()

    return model, vocoder_model, speaker_id, TTS_CONFIG, use_cuda, ap
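# A minimal usage sketch for setup() above. This is an illustration, not
# code from the repo: text_to_sequence, model.inference() and the vocoder's
# inference() signature are assumed from the TTS version these snippets
# target and may differ in other releases.
import torch
from TTS.utils.text import text_to_sequence

model, vocoder_model, speaker_id, TTS_CONFIG, use_cuda, ap = setup()
# convert raw text into a [1, T] batch of symbol ids
inputs = torch.LongTensor(
    text_to_sequence("Hello world.", [TTS_CONFIG.text_cleaner])).unsqueeze(0)
if use_cuda:
    inputs = inputs.cuda()
with torch.no_grad():
    # Tacotron-style models return decoder outputs, postnet outputs,
    # alignments and stop tokens from inference()
    _, postnet_out, _, _ = model.inference(inputs)
    # GAN vocoders in this repo are assumed to consume a [B, C, T] mel batch
    waveform = vocoder_model.inference(postnet_out.transpose(1, 2))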
def __init__(self, use_cuda=False, verbose=False):
    self.use_cuda = use_cuda
    self.verbose = verbose

    # load configs
    self.TTS_CONFIG = load_config(TTS_CONFIG)
    self.VOCODER_CONFIG = load_config(VOCODER_CONFIG)

    # load the audio processor
    self.ap = AudioProcessor(**self.TTS_CONFIG.audio)

    # LOAD TTS MODEL
    self.speaker_id = None
    speakers = []

    # load the model
    num_chars = len(phonemes) if self.TTS_CONFIG.use_phonemes else len(symbols)
    self.model = setup_model(num_chars, len(speakers), self.TTS_CONFIG)

    # load model state
    cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))

    # load the model
    self.model.load_state_dict(cp['model'])
    if self.use_cuda:
        self.model.cuda()
    self.model.eval()

    # set model stepsize
    if 'r' in cp:
        self.model.decoder.set_r(cp['r'])

    # LOAD VOCODER MODEL
    self.vocoder_model = setup_generator(self.VOCODER_CONFIG)
    self.vocoder_model.load_state_dict(
        torch.load(VOCODER_MODEL, map_location="cpu")["model"])
    self.vocoder_model.remove_weight_norm()
    self.vocoder_model.inference_padding = 0

    ap_vocoder = AudioProcessor(**self.VOCODER_CONFIG['audio'])
    if self.use_cuda:
        self.vocoder_model.cuda()
    self.vocoder_model.eval()
def __init__(self, TTS_MODEL, TTS_CONFIG, VOCODER_MODEL, VOCODER_CONFIG,
             use_cuda, use_gl):
    self.use_cuda = use_cuda
    self.use_gl = use_gl

    # model paths
    self.tts_config = load_config(TTS_CONFIG)
    vocoder_config = load_config(VOCODER_CONFIG)

    # load audio processor
    self.ap = AudioProcessor(**self.tts_config.audio)

    # LOAD TTS MODEL
    # multi speaker
    self.speaker_id = None
    speakers = []

    # load the model
    num_chars = len(phonemes) if self.tts_config.use_phonemes else len(symbols)
    self.model = setup_model(num_chars, len(speakers), self.tts_config)

    # load model state
    self.cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))

    # load the model
    self.model.load_state_dict(self.cp['model'])
    if self.use_cuda:
        self.model.cuda()
    self.model.train(False)
    self.model.eval()

    # set model stepsize
    if 'r' in self.cp:
        self.model.decoder.set_r(self.cp['r'])

    # LOAD VOCODER MODEL
    self.vocoder_model = setup_generator(vocoder_config)
    self.vocoder_model.load_state_dict(
        torch.load(VOCODER_MODEL, map_location="cpu")["model"])
    self.vocoder_model.remove_weight_norm()
    self.vocoder_model.inference_padding = 0
    # ap_vocoder = AudioProcessor(**vocoder_config['audio'])
    if use_cuda:
        self.vocoder_model.cuda()
    self.vocoder_model.train(False)
    self.vocoder_model.eval()

    # get sample rate
    self.sample_rate = self.ap.sample_rate
    gc.collect(2)
def _load_tts(self):
    # LOAD TTS MODEL
    from TTS.utils.text.symbols import symbols, phonemes
    from TTS.utils.audio import AudioProcessor
    from TTS.utils.generic_utils import setup_model

    # load the model
    num_chars = len(phonemes) if self.tts_config.use_phonemes else len(symbols)
    self.tts_model = setup_model(num_chars, self.tts_config)

    # load the audio processor
    self._ap = AudioProcessor(**self.tts_config.audio)

    # load model state
    cp = torch.load(self.tts_model_path,
                    map_location=lambda storage, loc: storage)

    # load the model
    self.tts_model.load_state_dict(cp['model'])
    self.tts_model.to(self.device)
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 2000
def load_tts(self, tts_checkpoint, tts_config, use_cuda):
    global symbols, phonemes
    print(" > Loading TTS model ...")
    print(" | > model config: ", tts_config)
    print(" | > checkpoint file: ", tts_checkpoint)
    self.tts_config = load_config(tts_config)
    if 'text' in self.tts_config.keys():
        symbols, phonemes = make_symbols(**self.tts_config.text)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)
    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)
    # load speakers
    if self.config.tts_speakers is not None:
        self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
        num_speakers = len(self.tts_speakers)
    else:
        num_speakers = 0
    self.tts_model = setup_model(
        self.input_size, num_speakers=num_speakers, c=self.tts_config)
    # load model state
    cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
    # load the model
    self.tts_model.load_state_dict(cp['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000
    if 'r' in cp:
        self.tts_model.decoder.set_r(cp['r'])
ap = AudioProcessor(**C.audio)

# if the vocabulary was passed, replace the default
if 'characters' in C.keys():
    symbols, phonemes = make_symbols(**C.characters)

# load speakers
if args.speakers_json != '':
    speakers = json.load(open(args.speakers_json, 'r'))
    num_speakers = len(speakers)
else:
    num_speakers = 0

# load the model
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
model = setup_model(num_chars, num_speakers, C)
cp = torch.load(args.model_path)
model.load_state_dict(cp['model'])
model.eval()
if args.use_cuda:
    model.cuda()
model.decoder.set_r(cp['r'])

# load vocoder model
if args.vocoder_path != "":
    VC = load_config(args.vocoder_config_path)
    ap_vocoder = AudioProcessor(**VC.audio)
    bits = 10
    vocoder_model = VocoderModel(rnn_dims=512,
                                 fc_dims=512,
                                 mode=VC.mode,
def main(args):  # pylint: disable=redefined-outer-name
    # Audio processor
    ap = AudioProcessor(**c.audio)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    if c.use_speaker_embedding:
        speakers = get_speakers(c.data_path, c.meta_file_train, c.dataset)
        if args.restore_path:
            prev_out_path = os.path.dirname(args.restore_path)
            speaker_mapping = load_speaker_mapping(prev_out_path)
            assert all([speaker in speaker_mapping
                        for speaker in speakers]), "As of now, you cannot " \
                                                   "introduce new speakers to " \
                                                   "a previously trained model."
        else:
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers,
                                                     ", ".join(speakers)))
    else:
        num_speakers = 0

    model = setup_model(num_chars, num_speakers, c)

    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    optimizer = RAdam(model.parameters(), lr=c.lr, weight_decay=0)
    if c.stopnet and c.separate_stopnet:
        optimizer_st = RAdam(model.decoder.stopnet.parameters(),
                             lr=c.lr,
                             weight_decay=0)
    else:
        optimizer_st = None

    if c.loss_masking:
        criterion = L1LossMasked() if c.model in ["Tacotron", "TacotronGST"] \
            else MSELossMasked()
    else:
        criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST"] \
            else nn.MSELoss()
    criterion_st = nn.BCEWithLogitsLoss() if c.stopnet else None

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except:
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint, c)
            model.load_state_dict(model_dict)
            del model_dict
        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model = model.cuda()
        criterion.cuda()
        if criterion_st:
            criterion_st.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.lr_decay:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
        print(" > Number of outputs per iteration:", model.decoder.r)

        train_loss, global_step = train(model, criterion, criterion_st,
                                        optimizer, optimizer_st, scheduler,
                                        ap, global_step, epoch)
        val_loss = evaluate(model, criterion, criterion_st, ap, global_step,
                            epoch)
        print(" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format(
            train_loss, val_loss),
              flush=True)
        target_loss = train_loss
        if c.run_eval:
            target_loss = val_loss
        best_loss = save_best_model(model, optimizer, target_loss, best_loss,
                                    OUT_PATH, global_step, epoch)
    )
    print(" > Run-time: {}".format(time.time() - t_1))
    return alignment, mel_postnet_spec, stop_tokens, waveform


use_cuda = True
batched_wavernn = True

# initialize TTS
CONFIG = load_config(tts_pretrained_model_config)
print(CONFIG)

# load the model
num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, CONFIG)

# load the audio processor
ap = AudioProcessor(**CONFIG.audio)

# load model state
if use_cuda:
    cp = torch.load(tts_pretrained_model)
else:
    cp = torch.load(tts_pretrained_model,
                    map_location=lambda storage, loc: storage)

# load the model
model.load_state_dict(cp["model"])
if use_cuda:
    model.cuda()
model.eval()
print(cp["step"])
model.decoder.max_decoder_steps = 2000
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, symbols, phonemes
    # Audio processor
    ap = AudioProcessor(**c.audio)
    if 'characters' in c.keys():
        symbols, phonemes = make_symbols(**c.characters)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # parse speakers
    if c.use_speaker_embedding:
        speakers = get_speakers(meta_data_train)
        if args.restore_path:
            prev_out_path = os.path.dirname(args.restore_path)
            speaker_mapping = load_speaker_mapping(prev_out_path)
            assert all([speaker in speaker_mapping
                        for speaker in speakers]), "As of now, you cannot " \
                                                   "introduce new speakers to " \
                                                   "a previously trained model."
        else:
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers,
                                                     ", ".join(speakers)))
    else:
        num_speakers = 0

    model = setup_model(num_chars, num_speakers, c)

    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    params = set_weight_decay(model, c.wd)
    optimizer = RAdam(params, lr=c.lr, weight_decay=0)
    if c.stopnet and c.separate_stopnet:
        optimizer_st = RAdam(model.decoder.stopnet.parameters(),
                             lr=c.lr,
                             weight_decay=0)
    else:
        optimizer_st = None

    # setup criterion
    criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path, map_location='cpu')
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except:
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint, c)
            model.load_state_dict(model_dict)
            del model_dict
        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.noam_schedule:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)
        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
            if c.bidirectional_decoder:
                model.decoder_backward.set_r(r)
        print("\n > Number of output frames:", model.decoder.r)

        train_avg_loss_dict, global_step = train(model, criterion, optimizer,
                                                 optimizer_st, scheduler, ap,
                                                 global_step, epoch)
        eval_avg_loss_dict = evaluate(model, criterion, ap, global_step,
                                      epoch)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        target_loss = train_avg_loss_dict['avg_postnet_loss']
        if c.run_eval:
            target_loss = eval_avg_loss_dict['avg_postnet_loss']
        best_loss = save_best_model(target_loss, best_loss, model, optimizer,
                                    global_step, epoch, c.r, OUT_PATH)
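# A hedged illustration of the gradual-training schedule consumed by the
# training loops above. Judging from how the scheduler's result is unpacked
# (r, c.batch_size = gradual_training_scheduler(global_step, c)), each entry
# is assumed to be [start_step, r, batch_size], with the scheduler picking
# the entry matching the current global step; the exact semantics may differ
# between versions. A hypothetical config fragment:
gradual_training_example = [
    [0, 7, 64],      # from step 0: decode 7 frames per step, batch size 64
    [10000, 5, 64],  # from step 10k: r=5
    [50000, 3, 32],  # from step 50k: r=3, smaller batches
]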
def main(**kwargs):
    global symbols, phonemes  # pylint: disable=global-statement
    current_date = date.today()
    current_date = current_date.strftime("%B %d %Y")
    start_time = time.time()

    # read passed variables from gui
    text = kwargs['text']                    # text to generate speech from
    use_cuda = kwargs['use_cuda']            # if gpu exists default is true
    project = kwargs['project']              # path to project folder
    vocoder_type = kwargs['vocoder']         # vocoder type, default is GL
    use_gst = kwargs['use_gst']              # use style_wave for prosody
    style_dict = kwargs['style_input']       # use style_wave for prosody
    speaker_id = kwargs['speaker_id']        # name of the selected speaker
    sentence_file = kwargs['sentence_file']  # path to file if generate from file
    out_path = kwargs['out_path']            # path to save the output wav

    batched_vocoder = True

    # load speakers
    speakers_file_path = Path(project, "speakers.json")
    if speakers_file_path.is_file():
        speaker_data = json.load(open(speakers_file_path, 'r'))
        num_speakers = len(speaker_data)
        # get the speaker name for the selected speaker id
        if speaker_id >= num_speakers:
            print('Speaker ID outside of number of speakers range. Using default 0.')
            speaker_id = 0
        speaker_name = [speaker for speaker, idx in speaker_data.items()
                        if speaker_id == idx][0]
    else:
        speaker_name = 'Default'
        num_speakers = 0
        speaker_id = None

    # load the config
    config_path = Path(project, "config.json")
    C = load_config(config_path)

    style_input = None
    if use_gst and style_dict is not None:
        style_input = style_dict

    # load the audio processor
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if 'characters' in C.keys():
        symbols, phonemes = make_symbols(**C.characters)

    # find the tts model file in project folder
    try:
        tts_model_file = glob(str(Path(project, '*.pth.tar')))
        if not tts_model_file:
            raise FileNotFoundError
        model_path = tts_model_file[0]
    except FileNotFoundError:
        print('[!] TTS Model not found in path: "{}"'.format(project))
        sys.exit(1)

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, C)

    # if gpu is not available use cpu
    model, state = load_checkpoint(model, model_path, use_cuda=use_cuda)
    model.decoder.max_decoder_steps = 2000
    model.eval()
    print(' > Model step:', state['step'])
    print(' > Model r: ', state['r'])

    # load vocoder
    if vocoder_type == 'MelGAN':
        try:
            model_file = glob(str(Path(project, 'vocoder/*.pth.tar')))
            vocoder, ap_vocoder = load_vocoder(
                str(Path('TTS')), str(model_file[0]),
                str(Path(project, 'vocoder/config.json')), use_cuda)
        except Exception:
            print('[!] Error loading vocoder: "{}"'.format(project))
            sys.exit(0)
    elif vocoder_type == 'WaveRNN':
        try:
            model_file = glob(str(Path(project, 'vocoder/*.pkl')))
            vocoder, ap_vocoder = load_vocoder(
                str(Path('TTS')), str(model_file[0]),
                str(Path(project, 'config.yml')), use_cuda)
        except Exception:
            print('[!] Error loading vocoder: "{}"'.format(project))
            sys.exit(0)
    else:
        vocoder, ap_vocoder = None, None

    print(" > Vocoder: {}".format(vocoder_type))
    print(' > Using style input: {}\n'.format(style_input))

    if sentence_file != '':
        with open(sentence_file, "r", encoding='utf8') as f:
            list_of_sentences = [s.strip() for s in f.readlines()]
    else:
        list_of_sentences = [text.strip()]

    # iterate over every passed sentence and synthesize
    for _, tts_sentence in enumerate(list_of_sentences):
        wav_list = []
        # remove characters that are not alphanumeric or in ',. '
        tts_sentence = clean_sentence(tts_sentence)
        print(" > Text: {}".format(tts_sentence))
        # build filename
        current_time = datetime.now().strftime("%H%M%S")
        file_name = ' '.join(tts_sentence.split(" ")[:10])

        # if multiple sentences in one line -> split them
        tts_sentence = split_into_sentences(tts_sentence)

        # if sentence was split in sub-sentences -> iterate over them
        for sentence in tts_sentence:
            # synthesize voice
            _, _, _, wav = tts(model,
                               vocoder,
                               C,
                               None,
                               sentence,
                               ap,
                               ap_vocoder,
                               use_cuda,
                               batched_vocoder,
                               speaker_id=speaker_id,
                               style_input=style_input,
                               figures=False)
            # join sub-sentences back together and add a filler between them
            wav_list += list(wav)
            wav_list += [0] * 10000

        wav = np.array(wav_list)

        # finalize filename
        file_name = "_".join([str(current_time), file_name])
        file_name = file_name.translate(
            str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'

        if out_path == "":
            out_dir = str(Path(project, 'output', current_date, speaker_name))
            out_path = os.path.join(out_dir, file_name)
        else:
            out_dir = os.path.dirname(out_path)

        # create output directory if it doesn't exist
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir, exist_ok=True)

        # save generated wav to disk
        ap.save_wav(wav, out_path)
        end_time = time.time()
        print(" > Run-time: {}".format(end_time - start_time))
        print(" > Saving output to {}\n".format(out_path))
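# A hypothetical invocation of main() above; every kwarg mirrors a key the
# function reads, but the concrete values are placeholders for illustration.
main(text="Hello there.",
     use_cuda=False,
     project="projects/my_voice",  # hypothetical project folder
     vocoder="Griffin-Lim",        # anything other than MelGAN/WaveRNN falls back to GL
     use_gst=False,
     style_input=None,
     speaker_id=0,
     sentence_file='',
     out_path='')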
# load configs
TTS_CONFIG = load_config(TTS_CONFIG)
VOCODER_CONFIG = load_config(VOCODER_CONFIG)

# load the audio processor
ap = AudioProcessor(**TTS_CONFIG.audio)

# LOAD TTS MODEL
# multi speaker
speaker_id = None
speakers = []

# load the model
num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, len(speakers), TTS_CONFIG)

# load model state
cp = torch.load(TTS_MODEL, map_location=torch.device("cpu"))

# load the model
model.load_state_dict(cp["model"])
if use_cuda:
    model.cuda()
model.eval()

# set model stepsize
if "r" in cp:
    model.decoder.set_r(cp["r"])
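# A minimal Griffin-Lim synthesis sketch continuing from the loading code
# above (no neural vocoder). text_to_sequence, model.inference() and
# ap.inv_melspectrogram() are assumed from this TTS version; names can vary
# between releases.
import torch
from TTS.utils.text import text_to_sequence

inputs = torch.LongTensor(
    text_to_sequence("This is a test.", [TTS_CONFIG.text_cleaner])).unsqueeze(0)
if use_cuda:
    inputs = inputs.cuda()
with torch.no_grad():
    _, postnet_out, _, _ = model.inference(inputs)
# invert the predicted mel spectrogram to a waveform with Griffin-Lim
mel = postnet_out[0].cpu().numpy()
wav = ap.inv_melspectrogram(mel.T)
ap.save_wav(wav, "test.wav")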