def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None):
    '''Converts a string of text to a sequence built from its phonemes.

    The text is cleaned with ``_clean_text``, converted with ``text2phone``
    and every non-empty ``|``-separated chunk of the result is mapped through
    ``_phoneme_to_sequence``.

    Args:
        text: string to convert to a sequence
        cleaner_names: names of the cleaner functions to run the text through
        language: forwarded unchanged to ``text2phone``
        enable_eos_bos: when true, the result is passed through
            ``pad_with_eos_bos`` before being returned
        tp: optional keyword arguments for ``make_symbols``; when given, the
            module-level ``_phonemes_to_id`` table is rebuilt from them

    Returns:
        The concatenation of the ``_phoneme_to_sequence`` results. If the
        phoneme conversion yields ``None`` the sequence is empty (apart from
        the optional BOS/EOS padding).
    '''
    # pylint: disable=global-statement
    global _phonemes_to_id
    if tp:
        _, _phonemes = make_symbols(**tp)
        _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)}

    sequence = []
    text = text.replace(":", "")
    clean_text = _clean_text(text, cleaner_names)
    to_phonemes = text2phone(clean_text, language)
    if to_phonemes is None:
        print("!! After phoneme conversion the result is None. -- {} ".format(
            clean_text))
        # Fall back to "no phonemes" instead of crashing on None.split().
        to_phonemes = ''
    # iterate by skipping empty strings - NOTE: might be useful to keep it to have a better intonation.
    for phoneme in filter(None, to_phonemes.split('|')):
        sequence += _phoneme_to_sequence(phoneme)
    # Append EOS char
    if enable_eos_bos:
        sequence = pad_with_eos_bos(sequence, tp=tp)
    return sequence
def text_to_sequence(text, cleaner_names, tp=None):
    '''Turns a text string into the list of IDs of its symbols.

    Segments wrapped in curly braces are handled as ARPAbet, e.g.
    "Turn left on {HH AW1 S S T AH0 N} Street."

    Args:
        text: string to convert to a sequence
        cleaner_names: names of the cleaner functions to run the text through
        tp: optional keyword arguments for ``make_symbols``; when given, the
            module-level ``_symbol_to_id`` table is rebuilt from them

    Returns:
        List of integers corresponding to the symbols in the text
    '''
    # pylint: disable=global-statement
    global _symbol_to_id
    if tp:
        char_table, _ = make_symbols(**tp)
        _symbol_to_id = {char: index for index, char in enumerate(char_table)}

    ids = []
    remaining = text
    # Peel off one "plain text + {ARPAbet}" pair per iteration.
    while remaining:
        match = _CURLY_RE.match(remaining)
        if not match:
            # No braces left: everything that remains is plain text.
            ids.extend(_symbols_to_sequence(_clean_text(remaining, cleaner_names)))
            break
        ids.extend(_symbols_to_sequence(_clean_text(match.group(1), cleaner_names)))
        ids.extend(_arpabet_to_sequence(match.group(2)))
        remaining = match.group(3)
    return ids
def pad_with_eos_bos(phoneme_sequence, tp=None):
    '''Returns ``phoneme_sequence`` as a list wrapped in the BOS and EOS IDs.

    Args:
        phoneme_sequence: iterable of IDs to wrap
        tp: optional dictionary with ``'bos'`` and ``'eos'`` entries that is
            also passed as keyword arguments to ``make_symbols``; when given,
            the module-level ``_bos``, ``_eos`` and ``_PHONEMES_TO_ID`` are
            replaced

    Returns:
        ``[bos_id] + list(phoneme_sequence) + [eos_id]``
    '''
    global _PHONEMES_TO_ID, _bos, _eos
    if tp:
        _bos = tp['bos']
        _eos = tp['eos']
        _, phoneme_table = make_symbols(**tp)
        _PHONEMES_TO_ID = {
            phoneme: index for index, phoneme in enumerate(phoneme_table)
        }

    padded = [_PHONEMES_TO_ID[_bos]]
    padded.extend(phoneme_sequence)
    padded.append(_PHONEMES_TO_ID[_eos])
    return padded
def _create_random_model(self):
    '''Builds a model from the dummy test config and saves it as a checkpoint.

    Loads ``dummy_model_config.json`` from the tests output path, sizes the
    model input from the symbol or phoneme table (depending on
    ``config.use_phonemes``) and writes a checkpoint to the same path.
    '''
    # Without this declaration the conditional assignment below makes both
    # names local, and a config without a 'text' entry fails with
    # UnboundLocalError instead of using the module-level defaults.
    # pylint: disable=global-statement
    global symbols, phonemes
    config = load_config(
        os.path.join(get_tests_output_path(), 'dummy_model_config.json'))
    # if the vocabulary was passed, replace the default
    if 'text' in config.keys():
        symbols, phonemes = make_symbols(**config.text)

    num_chars = len(phonemes) if config.use_phonemes else len(symbols)
    model = setup_model(num_chars, 0, config)
    output_path = os.path.join(get_tests_output_path())
    save_checkpoint(model, None, None, None, output_path, 10, 10)
def pad_with_eos_bos(phoneme_sequence, tp=None):
    '''Wraps ``phoneme_sequence`` with the IDs of the BOS and EOS symbols.

    Args:
        phoneme_sequence: iterable of IDs to wrap
        tp: optional dictionary with ``'bos'`` and ``'eos'`` entries that is
            also passed as keyword arguments to ``make_symbols``; when given,
            the module-level ``_bos``, ``_eos`` and ``_phonemes_to_id`` are
            replaced

    Returns:
        A new list: BOS ID, the items of ``phoneme_sequence``, EOS ID.
    '''
    # pylint: disable=global-statement
    global _phonemes_to_id, _bos, _eos
    if tp:
        _bos = tp['bos']
        _eos = tp['eos']
        _, phoneme_table = make_symbols(**tp)
        _phonemes_to_id = {
            phoneme: index for index, phoneme in enumerate(phoneme_table)
        }

    bos_id = _phonemes_to_id[_bos]
    body = list(phoneme_sequence)
    eos_id = _phonemes_to_id[_eos]
    return [bos_id, *body, eos_id]
def _create_random_model(self):
    '''Creates a model from the dummy test config and stores a checkpoint.

    The config is read from ``dummy_model_config.json`` in the tests output
    path; a ``characters`` entry, if present, replaces the module-level
    ``symbols`` and ``phonemes`` tables before the model is sized.
    '''
    # pylint: disable=global-statement
    global symbols, phonemes
    config_file = os.path.join(get_tests_output_path(),
                               'dummy_model_config.json')
    config = load_config(config_file)

    # a custom vocabulary in the config overrides the defaults
    if 'characters' in config.keys():
        symbols, phonemes = make_symbols(**config.characters)

    if config.use_phonemes:
        num_chars = len(phonemes)
    else:
        num_chars = len(symbols)

    model = setup_model(num_chars, 0, config)
    checkpoint_dir = os.path.join(get_tests_output_path())
    save_checkpoint(model, None, None, None, checkpoint_dir, 10, 10)
def sequence_to_phoneme(sequence, tp=None):
    '''Converts a sequence of IDs back to a string

    Args:
        sequence: iterable of IDs; IDs missing from the lookup table are
            skipped
        tp: optional keyword arguments for ``make_symbols``; when given, the
            module-level ``_ID_TO_PHONEMES`` table is rebuilt from them

    Returns:
        The looked-up entries joined together, with every ``'}{'`` replaced
        by a single space.
    '''
    global _ID_TO_PHONEMES
    if tp:
        _, phonemes = make_symbols(**tp)
        _ID_TO_PHONEMES = {i: s for i, s in enumerate(phonemes)}

    # The accumulator has to exist before the loop (it used to be unbound,
    # which made every call fail).
    result = ''
    for symbol_id in sequence:
        if symbol_id in _ID_TO_PHONEMES:
            result += _ID_TO_PHONEMES[symbol_id]
    return result.replace('}{', ' ')
def sequence_to_phoneme(sequence, tp=None):
    # pylint: disable=global-statement
    '''Maps a sequence of IDs back to the string they encode.

    Args:
        sequence: iterable of IDs; IDs without an entry in the lookup table
            are ignored
        tp: optional keyword arguments for ``make_symbols``; when given, the
            module-level ``_id_to_phonemes`` table is rebuilt from them

    Returns:
        The concatenated table entries with each ``'}{'`` turned into a space.
    '''
    global _id_to_phonemes
    if tp:
        _, phoneme_table = make_symbols(**tp)
        _id_to_phonemes = dict(enumerate(phoneme_table))

    pieces = [
        _id_to_phonemes[symbol_id]
        for symbol_id in sequence
        if symbol_id in _id_to_phonemes
    ]
    return ''.join(pieces).replace('}{', ' ')
def sequence_to_text(sequence, tp=None):
    '''Rebuilds the text string encoded by a sequence of symbol IDs.

    Args:
        sequence: iterable of IDs; IDs without an entry in the lookup table
            are ignored
        tp: optional keyword arguments for ``make_symbols``; when given, the
            module-level ``_ID_TO_SYMBOL`` table is rebuilt from them

    Returns:
        The decoded string. Multi-character symbols starting with ``'@'``
        are emitted inside curly braces, and adjacent ``'}{'`` pairs are
        collapsed into a single space.
    '''
    global _ID_TO_SYMBOL
    if tp:
        symbol_table, _ = make_symbols(**tp)
        _ID_TO_SYMBOL = dict(enumerate(symbol_table))

    chunks = []
    for symbol_id in sequence:
        if symbol_id not in _ID_TO_SYMBOL:
            continue
        token = _ID_TO_SYMBOL[symbol_id]
        # Enclose ARPAbet back in curly braces:
        if len(token) > 1 and token[0] == '@':
            token = '{%s}' % token[1:]
        chunks.append(token)
    return ''.join(chunks).replace('}{', ' ')
def sequence_to_text(sequence, tp=None):
    '''Decodes a sequence of symbol IDs into its text form.

    Args:
        sequence: iterable of IDs; unknown IDs are dropped
        tp: optional keyword arguments for ``make_symbols``; when given, the
            module-level ``_id_to_symbol`` table is rebuilt from them

    Returns:
        The decoded string, with ``'@'``-prefixed multi-character symbols
        wrapped in curly braces and every ``'}{'`` replaced by a space.
    '''
    # pylint: disable=global-statement
    global _id_to_symbol
    if tp:
        symbol_table, _ = make_symbols(**tp)
        _id_to_symbol = {
            index: symbol for index, symbol in enumerate(symbol_table)
        }

    def _restore_braces(symbol):
        # Enclose ARPAbet back in curly braces:
        if len(symbol) > 1 and symbol[0] == '@':
            return '{%s}' % symbol[1:]
        return symbol

    decoded = ''.join(
        _restore_braces(_id_to_symbol[symbol_id])
        for symbol_id in sequence
        if symbol_id in _id_to_symbol
    )
    return decoded.replace('}{', ' ')
args = parser.parse_args() if args.vocoder_path != "": assert args.use_cuda, " [!] Enable cuda for vocoder." from WaveRNN.models.wavernn import Model as VocoderModel # load the config C = load_config(args.config_path) C.forward_attn_mask = True # load the audio processor ap = AudioProcessor(**C.audio) # if the vocabulary was passed, replace the default if 'characters' in C.keys(): symbols, phonemes = make_symbols(**C.characters) # load speakers if args.speakers_json != '': speakers = json.load(open(args.speakers_json, 'r')) num_speakers = len(speakers) else: num_speakers = 0 # load the model num_chars = len(phonemes) if C.use_phonemes else len(symbols) model = setup_model(num_chars, num_speakers, C) cp = torch.load(args.model_path) model.load_state_dict(cp['model']) model.eval() if args.use_cuda:
def main(args):  # pylint: disable=redefined-outer-name
    '''Sets up the model, optimizers and losses and runs the training loop.

    Reads the module-level ``c`` (config), ``num_gpus``, ``use_cuda`` and
    ``OUT_PATH``; may replace the module-level ``symbols``/``phonemes`` and
    sets ``meta_data_train``/``meta_data_eval``.

    Args:
        args: parsed command-line arguments; ``rank``, ``group_id`` and
            ``restore_path`` are read and ``restore_step`` is set here.
    '''
    global meta_data_train, meta_data_eval, symbols, phonemes
    # Audio processor
    ap = AudioProcessor(**c.audio)

    print(" > TTS symbols {}".format(len(symbols)))
    print(symbols)
    print(" > TTS phonemes {}".format(len(phonemes)))
    print(phonemes)
    print('-' * 50)
    # if the vocabulary was passed, replace the default
    if 'text' in c.keys():
        symbols, phonemes = make_symbols(**c.text)

    print(" > TTS symbols {}".format(len(symbols)))
    print(symbols)
    print(" > TTS phonemes {}".format(len(phonemes)))
    print(phonemes)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # parse speakers
    if c.use_speaker_embedding:
        speakers = get_speakers(meta_data_train)
        if args.restore_path:
            prev_out_path = os.path.dirname(args.restore_path)
            speaker_mapping = load_speaker_mapping(prev_out_path)
            assert all(speaker in speaker_mapping
                       for speaker in speakers), "As of now you, you cannot " \
                                                 "introduce new speakers to " \
                                                 "a previously trained model."
        else:
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers,
                                                     ", ".join(speakers)))
    else:
        num_speakers = 0

    print(" | > Num chars : {}".format(num_chars))
    model = setup_model(num_chars, num_speakers, c)

    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    params = set_weight_decay(model, c.wd)
    optimizer = RAdam(params, lr=c.lr, weight_decay=0)
    if c.stopnet and c.separate_stopnet:
        optimizer_st = RAdam(model.decoder.stopnet.parameters(),
                             lr=c.lr,
                             weight_decay=0)
    else:
        optimizer_st = None

    uses_l1 = c.model in ["Tacotron", "TacotronGST"]
    if c.loss_masking:
        criterion = L1LossMasked() if uses_l1 else MSELossMasked()
    else:
        criterion = nn.L1Loss() if uses_l1 else nn.MSELoss()
    criterion_st = nn.BCEWithLogitsLoss(
        pos_weight=torch.tensor(10)) if c.stopnet else None

    if args.restore_path:
        checkpoint = torch.load(args.restore_path, map_location='cpu')
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                # Deliberately jump to the partial-initialization fallback.
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except Exception:  # pylint: disable=broad-except
            # Any loading failure falls back to partial initialization, but
            # KeyboardInterrupt/SystemExit are no longer swallowed here.
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint, c)
            model.load_state_dict(model_dict)
            del model_dict
        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()
        if criterion_st:
            criterion_st.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.noam_schedule:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # Nothing assigns best_loss earlier in this function, so start from +inf.
    best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
            if c.bidirectional_decoder:
                model.decoder_backward.set_r(r)
        print(" > Number of outputs per iteration:", model.decoder.r)

        train_loss, global_step = train(model, criterion, criterion_st,
                                        optimizer, optimizer_st, scheduler,
                                        ap, global_step, epoch)
        val_loss = evaluate(model, criterion, criterion_st, ap, global_step,
                            epoch)
        print(" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format(
            train_loss, val_loss),
              flush=True)
        # Track the validation loss when evaluation is enabled.
        target_loss = train_loss
        if c.run_eval:
            target_loss = val_loss
        best_loss = save_best_model(model, optimizer, target_loss, best_loss,
                                    OUT_PATH, global_step, epoch)
def main(args):  # pylint: disable=redefined-outer-name
    '''Sets up the model, optimizers and loss and runs the training loop.

    Reads the module-level ``c`` (config), ``num_gpus``, ``use_cuda``,
    ``c_logger`` and ``OUT_PATH``; may replace the module-level
    ``symbols``/``phonemes`` and sets ``meta_data_train``/``meta_data_eval``.

    Args:
        args: parsed command-line arguments; ``rank``, ``group_id`` and
            ``restore_path`` are read and ``restore_step`` is set here.
    '''
    # pylint: disable=global-variable-undefined
    global meta_data_train, meta_data_eval, symbols, phonemes
    # Audio processor
    ap = AudioProcessor(**c.audio)
    # if the vocabulary was passed, replace the default
    if 'characters' in c.keys():
        symbols, phonemes = make_symbols(**c.characters)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    # load data instances
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    # parse speakers
    if c.use_speaker_embedding:
        speakers = get_speakers(meta_data_train)
        if args.restore_path:
            prev_out_path = os.path.dirname(args.restore_path)
            speaker_mapping = load_speaker_mapping(prev_out_path)
            assert all(speaker in speaker_mapping
                       for speaker in speakers), "As of now you, you cannot " \
                                                 "introduce new speakers to " \
                                                 "a previously trained model."
        else:
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers,
                                                     ", ".join(speakers)))
    else:
        num_speakers = 0

    model = setup_model(num_chars, num_speakers, c)

    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    params = set_weight_decay(model, c.wd)
    optimizer = RAdam(params, lr=c.lr, weight_decay=0)
    if c.stopnet and c.separate_stopnet:
        optimizer_st = RAdam(model.decoder.stopnet.parameters(),
                             lr=c.lr,
                             weight_decay=0)
    else:
        optimizer_st = None

    # setup criterion
    criterion = TacotronLoss(c, stopnet_pos_weight=10.0, ga_sigma=0.4)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path, map_location='cpu')
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called before
            # optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                # Deliberately jump to the partial-initialization fallback.
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except Exception:  # pylint: disable=broad-except
            # Any loading failure falls back to partial initialization, but
            # KeyboardInterrupt/SystemExit are no longer swallowed here.
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint, c)
            model.load_state_dict(model_dict)
            del model_dict
        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model.cuda()
        criterion.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.noam_schedule:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # Nothing assigns best_loss earlier in this function, so start from +inf.
    best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)
        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
            if c.bidirectional_decoder:
                model.decoder_backward.set_r(r)
            print("\n > Number of output frames:", model.decoder.r)

        train_avg_loss_dict, global_step = train(model, criterion, optimizer,
                                                 optimizer_st, scheduler, ap,
                                                 global_step, epoch)
        eval_avg_loss_dict = evaluate(model, criterion, ap, global_step, epoch)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        # Track the evaluation loss when evaluation is enabled.
        target_loss = train_avg_loss_dict['avg_postnet_loss']
        if c.run_eval:
            target_loss = eval_avg_loss_dict['avg_postnet_loss']
        best_loss = save_best_model(target_loss, best_loss, model, optimizer,
                                    global_step, epoch, c.r, OUT_PATH)
def main(**kwargs):
    '''Synthesizes speech for the given text or sentence file and saves wavs.

    Keyword Args:
        text: text to generate speech from (used when no sentence file is set)
        use_cuda: forwarded to checkpoint/vocoder loading and synthesis
        project: path to the project folder holding ``config.json``, the
            ``*.pth.tar`` model, optional ``speakers.json`` and ``vocoder/``
        vocoder: ``'MelGAN'``, ``'WaveRNN'`` or anything else for no vocoder
        use_gst: when true, ``style_input`` is forwarded to synthesis
        style_input: style input used when ``use_gst`` is set
        speaker_id: numeric ID of the selected speaker
        sentence_file: path of a file with one input per line, or ``''``
        out_path: wav path to write, or ``''`` to generate one per input
            under ``<project>/output/<date>/<speaker>/``

    Raises:
        FileNotFoundError: if no ``*.pth.tar`` model exists in ``project``.
    '''
    global symbols, phonemes  # pylint: disable=global-statement
    current_date = date.today()
    current_date = current_date.strftime("%B %d %Y")
    start_time = time.time()

    # read passed variables from gui
    text = kwargs['text']  # text to generate speech from
    use_cuda = kwargs['use_cuda']  # if gpu exists default is true
    project = kwargs['project']  # path to project folder
    vocoder_type = kwargs['vocoder']  # vocoder type, default is GL
    use_gst = kwargs['use_gst']  # use style_wave for prosody
    style_dict = kwargs['style_input']  # style input used with use_gst
    speaker_id = kwargs['speaker_id']  # name of the selected speaker
    sentence_file = kwargs['sentence_file']  # path to file if generate from file
    out_path = kwargs['out_path']  # path to save the output wav

    batched_vocoder = True

    # load speakers
    speakers_file_path = Path(project, "speakers.json")
    if speakers_file_path.is_file():
        with open(speakers_file_path, 'r') as speakers_file:
            speaker_data = json.load(speakers_file)
        num_speakers = len(speaker_data)
        # get the speaker id for selected speaker
        if speaker_id >= num_speakers:
            print('Speaker ID outside of number of speakers range. Using default 0.')
            speaker_id = 0
        speaker_name = [
            speaker for speaker, idx in speaker_data.items()
            if speaker_id == idx
        ][0]
    else:
        speaker_name = 'Default'
        num_speakers = 0
        speaker_id = None

    # load the config
    config_path = Path(project, "config.json")
    C = load_config(config_path)

    # Always bind style_input: it is printed and forwarded to tts() below even
    # when GST is disabled or no style was supplied.
    style_input = style_dict if use_gst else None

    # load the audio processor
    ap = AudioProcessor(**C.audio)

    # if the vocabulary was passed, replace the default
    if 'characters' in C.keys():
        symbols, phonemes = make_symbols(**C.characters)

    # find the tts model file in project folder
    tts_model_files = glob(str(Path(project, '*.pth.tar')))
    if not tts_model_files:
        print('[!] TTS Model not found in path: "{}"'.format(project))
        # Without a model nothing below can work; fail with a clear error
        # instead of a NameError on the unset model path.
        raise FileNotFoundError(
            'TTS Model not found in path: "{}"'.format(project))
    model_path = tts_model_files[0]

    # load the model
    num_chars = len(phonemes) if C.use_phonemes else len(symbols)
    model = setup_model(num_chars, num_speakers, C)

    # if gpu is not available use cpu
    model, state = load_checkpoint(model, model_path, use_cuda=use_cuda)

    model.decoder.max_decoder_steps = 2000

    model.eval()
    print(' > Model step:', state['step'])
    print(' > Model r: ', state['r'])

    # load vocoder (compare strings by value, not identity)
    if vocoder_type == 'MelGAN':
        try:
            model_file = glob(str(Path(project, 'vocoder/*.pth.tar')))
            vocoder, ap_vocoder = load_vocoder(
                str(Path('TTS')), str(model_file[0]),
                str(Path(project, 'vocoder/config.json')), use_cuda)
        except Exception:  # pylint: disable=broad-except
            print('[!] Error loading vocoder: "{}"'.format(project))
            sys.exit(0)
    elif vocoder_type == 'WaveRNN':
        try:
            model_file = glob(str(Path(project, 'vocoder/*.pkl')))
            vocoder, ap_vocoder = load_vocoder(
                str(Path('TTS')), str(model_file[0]),
                str(Path(project, 'config.yml')), use_cuda)
        except Exception:  # pylint: disable=broad-except
            print('[!] Error loading vocoder: "{}"'.format(project))
            sys.exit(0)
    else:
        vocoder, ap_vocoder = None, None

    print(" > Vocoder: {}".format(vocoder_type))
    print(' > Using style input: {}\n'.format(style_input))

    if sentence_file != '':
        with open(sentence_file, "r", encoding='utf8') as f:
            list_of_sentences = [s.strip() for s in f.readlines()]
    else:
        list_of_sentences = [text.strip()]

    # iterate over every passed sentence and synthesize
    for tts_sentence in list_of_sentences:
        wav_list = []
        # remove character which are not alphanumerical or contain ',. '
        tts_sentence = clean_sentence(tts_sentence)

        print(" > Text: {}".format(tts_sentence))
        # build filename
        current_time = datetime.now().strftime("%H%M%S")
        file_name = ' '.join(tts_sentence.split(" ")[:10])

        # if multiple sentences in one line -> split them
        tts_sentence = split_into_sentences(tts_sentence)

        # if sentence was split in sub-sentences -> iterate over them
        for sentence in tts_sentence:
            # synthesize voice
            _, _, _, wav = tts(model,
                               vocoder,
                               C,
                               None,
                               sentence,
                               ap,
                               ap_vocoder,
                               use_cuda,
                               batched_vocoder,
                               speaker_id=speaker_id,
                               style_input=style_input,
                               figures=False)

            # join sub-sentences back together and add a filler between them
            wav_list += list(wav)
            wav_list += [0] * 10000

        wav = np.array(wav_list)

        # finalize filename
        file_name = "_".join([str(current_time), file_name])
        file_name = file_name.translate(
            str.maketrans('', '', string.punctuation.replace('_', ''))) + '.wav'

        # Resolve the target per sentence; overwriting out_path itself would
        # make every later sentence reuse the first generated file name.
        if out_path == "":
            out_dir = str(Path(project, 'output', current_date, speaker_name))
            wav_path = os.path.join(out_dir, file_name)
        else:
            out_dir = os.path.dirname(out_path)
            wav_path = out_path

        # create output directory if it doesn't exist (an empty out_dir means
        # the current directory, which makedirs cannot create)
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir, exist_ok=True)

        # save generated wav to disk
        ap.save_wav(wav, wav_path)
        end_time = time.time()
        print(" > Run-time: {}".format(end_time - start_time))
        print(" > Saving output to {}\n".format(wav_path))
args = parser.parse_args() if args.vocoder_path != "": assert args.use_cuda, " [!] Enable cuda for vocoder." from WaveRNN.models.wavernn import Model as VocoderModel # load the config C = load_config(args.config_path) C.forward_attn_mask = True # load the audio processor ap = AudioProcessor(**C.audio) # if the vocabulary was passed, replace the default if 'text' in C.keys(): symbols, phonemes = make_symbols(**C.text) # load speakers if args.speakers_json != '': speakers = json.load(open(args.speakers_json, 'r')) num_speakers = len(speakers) else: num_speakers = 0 # load the model num_chars = len(phonemes) if C.use_phonemes else len(symbols) model = setup_model(num_chars, num_speakers, C) cp = torch.load(args.model_path) model.load_state_dict(cp['model']) model.eval() if args.use_cuda: