def synthesize(text):
    """Synthesize speech for ``text`` and write it to ``./temp/result.wav``.

    The text is turned into a spectrogram by the model built from the
    Multilingual_Text_to_Speech dependency, and the spectrogram is then
    passed to a WaveRNN model through ``gen_wavernn.generate``.

    Args:
        text: The text to synthesize.

    Note:
        ``lang`` is not a parameter or local of this function; it is read
        from an enclosing scope that is not visible here.
    """
    # Renamed from `input` so the builtin is not shadowed.
    tts_input = text + "|00-" + lang + "|" + lang

    base_dir = os.path.dirname(__file__)

    # Change to Multi_TTS path
    sys.path.append(
        os.path.join(base_dir, "dependencies/Multilingual_Text_to_Speech"))
    # Drop any cached top-level `utils` module so the import below is
    # resolved again instead of reusing a previously imported `utils`.
    if "utils" in sys.modules:
        del sys.modules["utils"]
    from synthesize import synthesize as synthesize_spectrogram
    from utils import build_model

    # Load Multilingual pretrained model
    model = build_model(
        os.path.abspath("./dependencies/checkpoints/generated_switching.pyt"))
    model.eval()

    # Generate spectrogram
    spectrogram = synthesize_spectrogram(model, "|" + tts_input)

    # Change to WaveRNN path
    sys.path.append(os.path.join(base_dir, "dependencies/WaveRNN"))
    # Same cache reset as above: WaveRNN is imported through its own `utils`.
    if "utils" in sys.modules:
        del sys.modules["utils"]
    from models.fatchord_version import WaveRNN
    from utils import hparams as hp
    from gen_wavernn import generate
    import torch

    # Load WaveRNN pretrained model
    hp.configure("hparams.py")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = WaveRNN(
        rnn_dims=hp.voc_rnn_dims,
        fc_dims=hp.voc_fc_dims,
        bits=hp.bits,
        pad=hp.voc_pad,
        upsample_factors=hp.voc_upsample_factors,
        feat_dims=hp.num_mels,
        compute_dims=hp.voc_compute_dims,
        res_out_dims=hp.voc_res_out_dims,
        res_blocks=hp.voc_res_blocks,
        hop_length=hp.hop_length,
        sample_rate=hp.sample_rate,
        mode=hp.voc_mode).to(device)
    model.load(
        os.path.join(base_dir, "dependencies/checkpoints/wavernn_weight.pyt"))

    # Fix: the original passed the undefined name `s` here and never used
    # the spectrogram it had just computed.
    waveform = generate(model, spectrogram, hp.voc_gen_batched,
                        hp.voc_target, hp.voc_overlap)

    # NOTE(review): `write` is not defined in this function; it must come
    # from the enclosing module. Confirm it returns a file-like object that
    # accepts `waveform`.
    f = write("./temp/result.wav", "x")
    try:
        f.write(waveform)
    finally:
        # Close the handle even when the write fails.
        f.close()
def get_wavernn_model(model_path):
    """Build a WaveRNN model from the current ``hp`` values and load weights.

    Args:
        model_path: Checkpoint path handed to ``model.load``.

    Returns:
        The WaveRNN instance with the checkpoint loaded, placed on CUDA when
        it is available and on the CPU otherwise.
    """
    # Fall back to the CPU rather than failing on hosts without CUDA.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print()
    model = WaveRNN(
        rnn_dims=hp.voc_rnn_dims,
        fc_dims=hp.voc_fc_dims,
        bits=hp.bits,
        pad=hp.voc_pad,
        upsample_factors=hp.voc_upsample_factors,
        feat_dims=hp.num_mels,
        compute_dims=hp.voc_compute_dims,
        res_out_dims=hp.voc_res_out_dims,
        res_blocks=hp.voc_res_blocks,
        hop_length=hp.hop_length,
        sample_rate=hp.sample_rate,
        mode=hp.voc_mode).to(device)
    model.load(model_path)
    return model
print("Cur Dir", os.getcwd())

# Forget any cached top-level `utils` module so the import below is resolved
# again rather than reusing a previously imported `utils`.
sys.modules.pop("utils", None)
sys.path.append(WAVERNN_FOLDER)

from gen_wavernn import generate
from utils import hparams as hp
from models.fatchord_version import WaveRNN

hp.configure(f"{WAVERNN_FOLDER}/hparams.py")

# Build the vocoder on the CPU from the configured hyperparameters.
model = WaveRNN(
    rnn_dims=hp.voc_rnn_dims,
    fc_dims=hp.voc_fc_dims,
    bits=hp.bits,
    pad=hp.voc_pad,
    upsample_factors=hp.voc_upsample_factors,
    feat_dims=hp.num_mels,
    compute_dims=hp.voc_compute_dims,
    res_out_dims=hp.voc_res_out_dims,
    res_blocks=hp.voc_res_blocks,
    hop_length=hp.hop_length,
    sample_rate=hp.sample_rate,
    mode=hp.voc_mode,
).to('cpu')
model.load(f"{CHECKPOINTS_FOLDER}/{wavernn_chpt}")

# Collect the consecutively numbered arrays 1.npy, 2.npy, ... from
# CHR_FOLDER, stopping at the first missing index.
y = []
ix = 1
while True:
    npy_path = f"{CHR_FOLDER}/{ix}.npy"
    if not os.path.exists(npy_path):
        break
    print("Found", npy_path)
    y.append(np.load(npy_path))
    ix += 1

# Run every loaded array through the vocoder and write wg-<n>.wav files
# numbered from 1 in load order.
idx = 1
for s in y:
    waveform = generate(model, s, hp.voc_gen_batched, hp.voc_target,
                        hp.voc_overlap)
    sf.write(f"wg-{idx}.wav", waveform, hp.sample_rate)
    idx += 1
# Instantiate WaveRNN Model voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims, fc_dims=hp.voc_fc_dims, bits=hp.bits, pad=hp.voc_pad, upsample_factors=hp.voc_upsample_factors, feat_dims=hp.num_mels, compute_dims=hp.voc_compute_dims, res_out_dims=hp.voc_res_out_dims, res_blocks=hp.voc_res_blocks, hop_length=hp.hop_length, sample_rate=hp.sample_rate, mode='MOL').to(device) voc_model.load('quick_start/voc_weights/latest_weights.pyt') print('\nInitialising Tacotron Model...\n') # Instantiate Tacotron Model tts_model = Tacotron(embed_dims=hp.tts_embed_dims, num_chars=len(symbols), encoder_dims=hp.tts_encoder_dims, decoder_dims=hp.tts_decoder_dims, n_mels=hp.num_mels, fft_bins=hp.num_mels, postnet_dims=hp.tts_postnet_dims, encoder_K=hp.tts_encoder_K, lstm_dims=hp.tts_lstm_dims, postnet_K=hp.tts_postnet_K, num_highways=hp.tts_num_highways,
# Instantiate WaveRNN Model voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims, fc_dims=hp.voc_fc_dims, bits=hp.bits, pad=hp.voc_pad, upsample_factors=hp.voc_upsample_factors, feat_dims=hp.num_mels, compute_dims=hp.voc_compute_dims, res_out_dims=hp.voc_res_out_dims, res_blocks=hp.voc_res_blocks, hop_length=hp.hop_length, sample_rate=hp.sample_rate, mode=hp.voc_mode).to(device) voc_load_path = args.voc_weights if args.voc_weights else paths.voc_latest_weights voc_model.load(voc_load_path) print('\nInitialising Forward TTS Model...\n') tts_model = ForwardTacotron( embed_dims=hp.forward_embed_dims, num_chars=len(phonemes), durpred_rnn_dims=hp.forward_durpred_rnn_dims, durpred_conv_dims=hp.forward_durpred_conv_dims, durpred_dropout=hp.forward_durpred_dropout, pitch_rnn_dims=hp.forward_pitch_rnn_dims, pitch_conv_dims=hp.forward_pitch_conv_dims, pitch_dropout=hp.forward_pitch_dropout, pitch_emb_dims=hp.forward_pitch_emb_dims, pitch_proj_dropout=hp.forward_pitch_proj_dropout, rnn_dim=hp.forward_rnn_dims, postnet_k=hp.forward_postnet_K,
bits=hp.bits, pad=hp.voc_pad, upsample_factors=hp.voc_upsample_factors, feat_dims=hp.num_mels, compute_dims=hp.voc_compute_dims, res_out_dims=hp.voc_res_out_dims, res_blocks=hp.voc_res_blocks, hop_length=hp.hop_length, sample_rate=hp.sample_rate, mode=hp.voc_mode).to(device) paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id) voc_weights = args.voc_weights if args.voc_weights else paths.voc_latest_weights model.load(voc_weights) simple_table([('Generation Mode', 'Batched' if batched else 'Unbatched'), ('Target Samples', target if batched else 'N/A'), ('Overlap Samples', overlap if batched else 'N/A')]) if gta: save_path = paths.voc_output / 'gta' else: save_path = paths.voc_output / 'natural' save_path.mkdir(parents=False, exist_ok=True) print(f'Saving to {save_path}') # import pdb; pdb.set_trace() if file: file = Path(file).expanduser()
def _env_flag(name, default):
    """Return environment variable ``name`` interpreted as a boolean.

    An unset variable yields ``default``. The values ``''``, ``'0'``,
    ``'false'``, ``'no'`` and ``'off'`` (case-insensitive, surrounding
    whitespace ignored) are False; any other value is True.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() not in ('', '0', 'false', 'no', 'off')


def thak():
    """Build the TTS and vocoder models from environment-variable settings.

    Settings are read from ``FORCE_CPU``, ``VOCODER``, ``BATCHED``,
    ``TARGET``, ``OVERLAP``, ``SAVE_ATTN`` and ``GL_ITERS``; hyperparameters
    are loaded from ``hparams.py``.

    Returns:
        A tuple ``(args, voc_model, tts_model, batched, target, overlap,
        save_attn)``. ``voc_model``, ``batched``, ``target`` and ``overlap``
        are ``None`` when the Griffin-Lim vocoder is selected.

    Raises:
        argparse.ArgumentError: If ``VOCODER`` is not one of
            ``griffinlim``, ``gl``, ``wavernn`` or ``wr``.
        ValueError: If ``GL_ITERS``, ``TARGET`` or ``OVERLAP`` is set to a
            value that is not an integer.
    """
    class Tshamsoo():
        # os.getenv returns strings, so flags are parsed explicitly;
        # otherwise FORCE_CPU=0 or BATCHED=false would count as true.
        force_cpu = _env_flag('FORCE_CPU', False)
        hp_file = 'hparams.py'
        vocoder = os.getenv('VOCODER', 'wavernn')
        batched = _env_flag('BATCHED', True)
        # An empty value is treated like an unset one, so the hparams
        # default applies.
        target = os.getenv('TARGET') or None
        overlap = os.getenv('OVERLAP') or None
        tts_weights = None
        save_attn = _env_flag('SAVE_ATTN', False)
        voc_weights = None
        # Always an int, whether it comes from the default or the environment.
        iters = int(os.getenv('GL_ITERS', 32))

    args = Tshamsoo()

    if args.vocoder in ['griffinlim', 'gl']:
        args.vocoder = 'griffinlim'
    elif args.vocoder in ['wavernn', 'wr']:
        args.vocoder = 'wavernn'
    else:
        # ArgumentError takes (argument, message); with the single argument
        # used before, this line raised TypeError instead.
        raise argparse.ArgumentError(
            None, 'Must provide a valid vocoder type!')

    hp.configure(args.hp_file)  # Load hparams from file

    tts_weights = args.tts_weights
    save_attn = args.save_attn

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    if args.vocoder == 'wavernn':
        # set defaults for any arguments that depend on hparams
        if args.target is None:
            args.target = hp.voc_target
        if args.overlap is None:
            args.overlap = hp.voc_overlap

        batched = args.batched
        target = int(args.target)
        overlap = int(args.overlap)

        print('\nInitialising WaveRNN Model...\n')
        # Instantiate WaveRNN Model
        voc_model = WaveRNN(
            rnn_dims=hp.voc_rnn_dims,
            fc_dims=hp.voc_fc_dims,
            bits=hp.bits,
            pad=hp.voc_pad,
            upsample_factors=hp.voc_upsample_factors,
            feat_dims=hp.num_mels,
            compute_dims=hp.voc_compute_dims,
            res_out_dims=hp.voc_res_out_dims,
            res_blocks=hp.voc_res_blocks,
            hop_length=hp.hop_length,
            sample_rate=hp.sample_rate,
            mode=hp.voc_mode).to(device)

        voc_load_path = (args.voc_weights if args.voc_weights
                         else paths.voc_latest_weights)
        voc_model.load(voc_load_path)
    else:
        voc_model = None
        batched = None
        target = None
        overlap = None

    print('\nInitialising Tacotron Model...\n')

    # Instantiate Tacotron Model
    tts_model = Tacotron(
        embed_dims=hp.tts_embed_dims,
        num_chars=len(symbols),
        encoder_dims=hp.tts_encoder_dims,
        decoder_dims=hp.tts_decoder_dims,
        n_mels=hp.num_mels,
        fft_bins=hp.num_mels,
        postnet_dims=hp.tts_postnet_dims,
        encoder_K=hp.tts_encoder_K,
        lstm_dims=hp.tts_lstm_dims,
        postnet_K=hp.tts_postnet_K,
        num_highways=hp.tts_num_highways,
        dropout=hp.tts_dropout,
        stop_threshold=hp.tts_stop_threshold).to(device)

    tts_load_path = tts_weights if tts_weights else paths.tts_latest_weights
    tts_model.load(tts_load_path)

    return args, voc_model, tts_model, batched, target, overlap, save_attn
def main():
    """Export a WaveRNN vocoder to two ONNX files under ``./onnx``.

    Command-line arguments select the test mel file (``--mel``), the
    hyperparameter file (``--hp_file``), batched generation (``--batched``)
    and the vocoder checkpoint (``--voc_weights``). The model is built on
    the CPU, wrapped in ``WaveRNNUpsamplerONNX`` and ``WaveRNNONNX``, and
    exported as ``wavernn_upsampler.onnx`` and ``wavernn_rnn.onnx``.
    """
    # Parse Arguments
    parser = argparse.ArgumentParser(description='TTS Generator')
    parser.add_argument('--mel',
                        type=str,
                        help='[string/path] path to test mel file')
    parser.add_argument('--hp_file',
                        metavar='FILE',
                        default='hparams.py',
                        help='The file to use for the hyperparameters')
    parser.add_argument('--batched',
                        '-b',
                        dest='batched',
                        action='store_true',
                        help='Fast Batched Generation')
    parser.add_argument(
        '--voc_weights',
        type=str,
        help='[string/path] Load in different FastSpeech weights',
        default="pretrained/wave_800K.pyt")

    args = parser.parse_args()

    # Create the output directory without a separate existence check, which
    # could race with another process creating it.
    os.makedirs('onnx', exist_ok=True)

    hp.configure(args.hp_file)

    device = torch.device('cpu')
    print('Using device:', device)

    #####
    print('\nInitialising WaveRNN Model...\n')
    # Instantiate WaveRNN Model
    voc_model = WaveRNN(
        rnn_dims=hp.voc_rnn_dims,
        fc_dims=hp.voc_fc_dims,
        bits=hp.bits,
        pad=hp.voc_pad,
        upsample_factors=hp.voc_upsample_factors,
        feat_dims=hp.num_mels,
        compute_dims=hp.voc_compute_dims,
        res_out_dims=hp.voc_res_out_dims,
        res_blocks=hp.voc_res_blocks,
        hop_length=hp.hop_length,
        sample_rate=hp.sample_rate,
        mode=hp.voc_mode).to(device)

    voc_load_path = args.voc_weights
    voc_model.load(voc_load_path)

    voc_upsampler = WaveRNNUpsamplerONNX(voc_model, args.batched,
                                         hp.voc_target, hp.voc_overlap)
    voc_infer = WaveRNNONNX(voc_model)

    voc_model.eval()
    voc_upsampler.eval()
    voc_infer.eval()

    opset_version = 11

    with torch.no_grad():
        # Load the test mel and add a leading dimension before padding.
        mels = np.load(args.mel)
        mels = torch.from_numpy(mels)
        mels = mels.unsqueeze(0)
        mels = voc_upsampler.pad_tensor(mels)

        # Export on a copy so the tensor used below is a separate object.
        mels_onnx = mels.clone()

        torch.onnx.export(voc_upsampler,
                          mels_onnx,
                          "./onnx/wavernn_upsampler.onnx",
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["mels"],
                          output_names=["upsample_mels", "aux"])

        mels, aux = voc_upsampler(mels)
        # NOTE(review): 550 is hard-coded; presumably it trims the padding
        # added by pad_tensor -- confirm against WaveRNNUpsamplerONNX.
        mels = mels[:, 550:-550, :]

        mels, aux = voc_upsampler.fold(mels, aux)

        h1, h2, x = voc_infer.get_initial_parameters(mels)

        aux_split = voc_infer.split_aux(aux)

        b_size, seq_len, _ = mels.size()

        # The RNN step is exported using the first time step as example
        # input, so there is nothing to export for an empty sequence.
        if seq_len:
            m_t = mels[:, 0, :]

            a1_t, a2_t, a3_t, a4_t = \
                (a[:, 0, :] for a in aux_split)

            rnn_input = (m_t, a1_t, a2_t, a3_t, a4_t, h1, h2, x)
            torch.onnx.export(voc_infer,
                              rnn_input,
                              "./onnx/wavernn_rnn.onnx",
                              opset_version=opset_version,
                              do_constant_folding=True,
                              input_names=[
                                  "m_t", "a1_t", "a2_t", "a3_t", "a4_t",
                                  "h1", "h2", "x"
                              ],
                              output_names=["logits", "h1", "h2"])

    print('Done!')