def load_model(self):
    """Load the Tacotron model, the WaveGlow vocoder and its denoiser on CPU.

    Populates ``self.model``, ``self.waveglow`` and ``self.denoiser``. Both
    checkpoints are read from hard-coded paths and mapped to the CPU device.
    """
    cpu = torch.device("cpu")

    # TODO 1: load the trained model.
    # Build the Tacotron model from the hparams, then restore the weights
    # stored under 'state_dict' in the trained checkpoint.
    checkpoint_path = "/home/ubuntu/test/TTS/checkpoint_28000"
    self.model = train.load_model(self.hparams)
    tacotron_checkpoint = torch.load(checkpoint_path, map_location=cpu)
    self.model.load_state_dict(tacotron_checkpoint['state_dict'])
    # Full precision on CPU (the half-precision variant is not used here).
    self.model.cpu().eval()

    # WaveGlow: the checkpoint holds the model object under the 'model' key.
    waveglow_path = "/home/ubuntu/test/TTS/waveglow.pt"
    self.waveglow = torch.load(waveglow_path, map_location=cpu)['model']
    self.waveglow.cpu().eval()
    for inv_conv in self.waveglow.convinv:
        inv_conv.float()

    self.denoiser = Denoiser(self.waveglow)
def main(args):
    """Vocode one mel ``.npy`` file to a wav file with ``ModifiedGenerator``.

    Reads ``args.checkpoint_path``, ``args.config``, ``args.input`` and
    ``args.d`` (run the denoiser when truthy). The output path is
    ``args.input`` with ``.npy`` replaced by a suffix carrying the checkpoint
    epoch.
    """
    ckpt = torch.load(args.checkpoint_path)
    # An explicit config file wins over the hparams string in the checkpoint.
    if args.config is None:
        hp = load_hparam_str(ckpt['hp_str'])
    else:
        hp = HParam(args.config)

    generator = ModifiedGenerator(
        hp.audio.n_mel_channels,
        hp.model.n_residual_layers,
        ratios=hp.model.generator_ratio,
        mult=hp.model.mult,
        out_band=hp.model.out_channels,
    ).cuda()
    generator.load_state_dict(ckpt['model_g'])
    generator.eval(inference=True)

    with torch.no_grad():
        mel = torch.from_numpy(np.load(args.input))
        if mel.dim() == 2:
            # Add the leading batch axis the generator is called with.
            mel = mel.unsqueeze(0)
        wav = generator.inference(mel.cuda())

        wav = wav.squeeze(0)  # collapse all dimension except time axis
        if args.d:
            denoiser = Denoiser(generator).cuda()
            wav = denoiser(wav, 0.01)
        wav = wav.squeeze()
        # Drop the last ten hops of samples.
        wav = wav[:-(hp.audio.hop_length * 10)]
        # Scale to the 16-bit range and saturate before the integer cast.
        wav = (MAX_WAV_VALUE * wav).clamp(min=-MAX_WAV_VALUE,
                                          max=MAX_WAV_VALUE - 1)
        samples = wav.short().cpu().detach().numpy()

        suffix = '_reconstructed_epoch%04d.wav' % ckpt['epoch']
        out_path = args.input.replace('.npy', suffix)
        write(out_path, hp.audio.sampling_rate, samples)
def main(waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Vocode every ``*.npy`` mel file in the current directory with WaveGlow.

    Args:
        waveglow_path: Checkpoint whose ``'model'`` entry is the WaveGlow model.
        sigma: Passed to ``waveglow.infer``.
        output_dir: Directory receiving ``waveglow_<name>.wav`` files.
        sampling_rate: Sampling rate written to each wav file.
        is_fp16: When true, wrap the model with apex amp (``O3``) and feed it
            half-precision mels.
        denoiser_strength: Strength passed to the denoiser; the denoiser is
            skipped when this is not positive.
    """
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    denoiser = Denoiser(waveglow).cuda() if denoiser_strength > 0 else None

    for file_path in glob.glob('*.npy'):
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        mel = torch.from_numpy(np.load(file_path))
        mel = torch.unsqueeze(mel, 0).cuda()
        mel = mel.half() if is_fp16 else mel
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser is not None:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        # Saturate before the int16 cast: out-of-range samples would
        # otherwise wrap around. Clamping is done in float32 because 32767 is
        # not representable in half precision.
        audio = audio.float().clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1)
        audio = audio.squeeze().cpu().numpy()
        audio_path = os.path.join(output_dir, f'waveglow_{file_name}.wav')
        write(audio_path, sampling_rate, audio.astype('int16'))
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Vocode a list of saved mel tensors to wav files with WaveGlow.

    Args:
        mel_files: Passed to ``files_to_list`` to obtain the mel file paths;
            each file is read with ``torch.load``.
        waveglow_path: Checkpoint whose ``'model'`` entry is the WaveGlow model.
        sigma: Passed to ``waveglow.infer``.
        output_dir: Directory receiving ``<name>_synthesis.wav`` files.
        sampling_rate: Sampling rate written to each wav file.
        is_fp16: When true, wrap the model with apex amp (``O3``) and feed it
            half-precision mels.
        denoiser_strength: Strength passed to the denoiser; the denoiser is
            skipped when this is not positive.
    """
    mel_files = files_to_list(mel_files)
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    denoiser = Denoiser(waveglow).cuda() if denoiser_strength > 0 else None

    for file_path in mel_files:
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        mel = torch.load(file_path)
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser is not None:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        # Saturate before the int16 cast: out-of-range samples would
        # otherwise wrap around. Clamping is done in float32 because 32767 is
        # not representable in half precision.
        audio = audio.float().clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1)
        audio = audio.squeeze().cpu().numpy().astype('int16')
        audio_path = os.path.join(
            output_dir, "{}_synthesis.wav".format(file_name))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
def main(args):
    """Vocode one mel ``.npy`` file to a wav file with ``Generator``.

    Reads ``args.checkpoint_path``, ``args.config``, ``args.input`` and
    ``args.d`` (run the denoiser when truthy). The output path is
    ``args.input`` with ``.npy`` replaced by a suffix carrying the checkpoint
    epoch.
    """
    ckpt = torch.load(args.checkpoint_path)
    # An explicit config file wins over the hparams string in the checkpoint.
    if args.config is None:
        hp = load_hparam_str(ckpt['hp_str'])
    else:
        hp = HParam(args.config)

    generator = Generator(hp.audio.n_mel_channels).cuda()
    generator.load_state_dict(ckpt['model_g'])
    generator.eval()

    with torch.no_grad():
        mel = torch.from_numpy(np.load(args.input))
        if mel.dim() == 2:
            # Add the leading batch axis the generator is called with.
            mel = mel.unsqueeze(0)
        wav = generator(mel.cuda())

        # For multi-band inference
        print(wav.shape)

        wav = wav.squeeze(0)  # collapse all dimension except time axis
        if args.d:
            denoiser = Denoiser(generator).cuda()
            wav = denoiser(wav, 0.1)
        wav = wav.squeeze()
        # Drop the last ten hops of samples.
        wav = wav[:-(hp.audio.hop_length * 10)]
        # Scale to the 16-bit range and saturate before the integer cast.
        wav = (MAX_WAV_VALUE * wav).clamp(min=-MAX_WAV_VALUE,
                                          max=MAX_WAV_VALUE - 1)
        samples = wav.short().cpu().detach().numpy()

        suffix = '_hifi_GAN_epoch%04d.wav' % ckpt['epoch']
        out_path = args.input.replace('.npy', suffix)
        write(out_path, hp.audio.sampling_rate, samples)
def load_waveglow(chk_pt_path):
    """Load a half-precision WaveGlow model on the GPU together with a denoiser.

    Args:
        chk_pt_path: Checkpoint whose ``'model'`` entry is the WaveGlow model.

    Returns:
        A ``(waveglow, denoiser)`` tuple.
    """
    vocoder = torch.load(chk_pt_path)['model']
    vocoder.cuda().eval().half()
    # The modules in ``convinv`` are switched back to float32.
    for inv_conv in vocoder.convinv:
        inv_conv.float()
    return vocoder, Denoiser(vocoder)
def create_models(dataset_dir):
    """Generate the inline models and train the classifier (local runs only).

    Exits with status 1 when not executing locally and with status 2 when
    ``dataset_dir`` is not an existing directory.

    Parameters:
        dataset_dir (:func:`str`): Path to the training set
    """
    logger.debug("Creating models...")
    if not local_exec:
        logger.error("Models can only be generated locally")
        exit(1)

    # Point the configuration at the local root before building the denoiser.
    app_config['root'] = os.environ['ROOT']
    denoiser = Denoiser(app_config)

    if not (exists(dataset_dir) and isdir(dataset_dir)):
        logger.error(dataset_dir + " is not a valid directory")
        exit(2)

    # Every entry of the directory is handed over, joined to its full path.
    dataset = [join(dataset_dir, entry) for entry in listdir(dataset_dir)]

    denoiser.generate_models(dataset)
    logger.info("Inline models generated")
    denoiser.train(dataset)
    logger.info("Classifier trained")
def generate_from_file(tacotron2_path, waveglow_path, text_file,
                       output_directory):
    """Synthesize one wav file per line of ``text_file``.

    Each line is optionally converted to IPA or ARPAbet (depending on
    ``hparams.preprocessing``), turned into a symbol sequence, passed through
    Tacotron 2 and WaveGlow, denoised and written to ``output_directory``.
    The wav name is built from the first four space-separated words of the
    line, so lines sharing those words overwrite each other.

    Args:
        tacotron2_path: Checkpoint holding the Tacotron 2 ``'state_dict'``.
        waveglow_path: Checkpoint whose ``'model'`` entry is the WaveGlow model.
        text_file: Text file with one utterance per line.
        output_directory: Directory receiving the wav files; created if missing.
    """
    # Make synthesis paths
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
        print("Creating directory " + output_directory + "...")

    hparams = create_hparams()
    hparams.sampling_rate = 22050

    print("Loading models...")
    model = load_model(hparams)
    model.load_state_dict(torch.load(tacotron2_path)['state_dict'])
    _ = model.cuda().eval().half()

    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval().half()
    for k in waveglow.convinv:
        k.float()
    denoiser = Denoiser(waveglow)

    with open(text_file) as file:
        genlist = [line.strip() for line in file]

    # Built once: the transliterator does not depend on the entry.
    epi = epitran.Epitran('eng-Latn', ligatures=True)

    for entry in genlist:
        wav_name = "_".join(entry.split(" ")[:4]).lower() + ".wav"

        if hparams.preprocessing == "ipa":
            entry = ipa.convert(english_cleaners(entry))
            # Words still carrying a trailing '*' after conversion are
            # transliterated instead (the '*' itself is dropped).
            foreign_words = re.findall(r"[^ ]{0,}\*", entry)
            for word in foreign_words:
                entry = entry.replace(word, epi.transliterate(word[:-1]))
        if hparams.preprocessing == "arpabet":
            entry = make_arpabet(entry)

        # Text sequencer: preprocessed text is passed without cleaners.
        if hparams.preprocessing is not None:
            sequence = np.array(text_to_sequence(entry, None))[None, :]
        else:
            sequence = np.array(
                text_to_sequence(entry, ['english_cleaners']))[None, :]
        sequence = torch.autograd.Variable(
            torch.from_numpy(sequence)).cuda().long()

        # Synthesis; everything runs under no_grad so no autograd graph is
        # kept alive for the Tacotron outputs.
        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, _, alignments = \
                model.inference(sequence)
            audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
        audio_denoised = denoiser(audio, strength=0.01)[:, 0]

        # Save audio
        print("Saving " + wav_name)
        write(os.path.join(output_directory, wav_name),
              hparams.sampling_rate,
              audio_denoised[0].data.cpu().numpy())
def __init__(self, ds_name, ds_path, lr, iterations, batch_size, print_freq,
             k, eps, is_normalized, adv_momentum, store_adv=None,
             load_adv_dir=None, load_adv_name=None, load_dir=None,
             load_name=None, save_dir=None):
    """Set up data loaders, the attack generator, the target model and denoiser.

    ``k`` is accepted but not used in this constructor. The denoiser is moved
    to the GPU unconditionally, whatever ``torch.cuda.is_available()`` says.
    """
    # Data: the preprocessor yields both datasets and their sizes.
    self.data_processor = Preprocessor(ds_name, ds_path, is_normalized)
    (self.train_data, self.test_data,
     self.N_train, self.N_test) = self.data_processor.datasets()
    self.train_loader = DataLoader(self.train_data,
                                   batch_size=batch_size,
                                   shuffle=True)
    self.test_loader = DataLoader(self.test_data, batch_size=batch_size)

    # Bookkeeping and hyperparameters.
    self.save_dir = save_dir
    self.store_adv = store_adv
    self.learning_rate = lr
    self.iterations = iterations
    self.print_freq = print_freq
    self.cuda = torch.cuda.is_available()

    # Model that the adversarial examples are generated from.
    adversarial_model = self.load_model(self.cuda, load_adv_dir,
                                        load_adv_name, TEST)
    self.adversarial_generator = Attacks(adversarial_model,
                                         eps,
                                         self.N_train,
                                         self.N_test,
                                         self.data_processor.get_const(),
                                         adv_momentum,
                                         is_normalized,
                                         store_adv)

    # Target model.
    self.target_model = self.load_model(self.cuda, load_dir, load_name, TEST)

    # Denoiser.
    self.denoiser = Denoiser(x_h=32, x_w=32).cuda()
def __init__(self):
    """Load Tacotron 2, WaveGlow and the denoiser on the GPU in half precision.

    Checkpoint locations come from ``constants.TACOTRON_PT`` and
    ``constants.WAVEGLOW_PT``.
    """
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    # Tacotron 2: build from hparams, then restore the saved weights.
    tacotron_state = torch.load(constants.TACOTRON_PT)['state_dict']
    self.model = load_model(hparams)
    self.model.load_state_dict(tacotron_state)
    self.model.cuda().eval().half()

    # WaveGlow: the checkpoint holds the model object under 'model'.
    self.waveglow = torch.load(constants.WAVEGLOW_PT)['model']
    self.waveglow.cuda().eval().half()
    # The modules in ``convinv`` are switched back to float32.
    for inv_conv in self.waveglow.convinv:
        inv_conv.float()

    self.denoiser = Denoiser(self.waveglow)
def __init__(self, lang):
    """Load Tacotron 2 and WaveGlow, build a denoiser and record the language.

    Settings are read from the module-level ``args`` and ``parser``.
    """
    # NOTE(review): the models, the denoiser and the scripted Tacotron 2 are
    # only bound to locals here and are not stored on ``self`` - confirm that
    # this is intended.
    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.amp_run, args.cpu_run,
                                     forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.amp_run, args.cpu_run,
                                    forward_is_infer=True)

    denoiser = Denoiser(waveglow, args.cpu_run)
    if not args.cpu_run:
        denoiser = denoiser.cuda()

    jitted_tacotron2 = torch.jit.script(tacotron2)

    self.language = lang
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Vocode a list of saved mel tensors to wav files with WaveGlow.

    Besides writing the audio, the value range of every mel and the peak
    absolute amplitude of every generated signal are reported/recorded.

    Args:
        mel_files: Passed to ``files_to_list`` to obtain the mel file paths;
            each file is read with ``torch.load``.
        waveglow_path: Checkpoint whose ``'model'`` entry is the WaveGlow model.
        sigma: Passed to ``waveglow.infer``.
        output_dir: Directory receiving the wav files.
        sampling_rate: Sampling rate written to each wav file.
        is_fp16: When true, wrap the model with apex amp (``O3``) and feed it
            half-precision mels.
        denoiser_strength: Strength passed to the denoiser; the denoiser is
            skipped when this is not positive.
    """
    mel_files = files_to_list(mel_files)
    waveglow = torch.load(waveglow_path)['model']
    # Force 'padding_mode' on every Conv module of the unpickled model -
    # presumably needed for checkpoints saved by an older torch; TODO confirm.
    for m in waveglow.modules():
        if 'Conv' in str(type(m)):
            setattr(m, 'padding_mode', 'zeros')
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    denoiser = Denoiser(waveglow).cuda() if denoiser_strength > 0 else None

    for file_path in mel_files:
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        mel = torch.load(file_path)
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel
        print(torch.min(mel), torch.max(mel))
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser is not None:
                audio = denoiser(audio, denoiser_strength)
            # NOTE(review): ``k`` is not defined in this function; it is
            # assumed to be a module-level list collecting peak amplitudes.
            k.append(abs(audio).max().item())
            audio = audio * MAX_WAV_VALUE
        # Saturate before the int16 cast: out-of-range samples would
        # otherwise wrap around. Clamping is done in float32 because 32767 is
        # not representable in half precision.
        audio = audio.float().clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1)
        audio = audio.squeeze().cpu().numpy().astype('int16')
        audio_path = os.path.join(
            output_dir, "{}_synthesis_sig0.7_d_0.1.wav".format(file_name))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
def main(text):
    """Synthesize ``text`` and append the audio to ``out.wav``.

    Loads Tacotron 2 and WaveGlow from fixed checkpoint names, plots the mel
    outputs and the alignment, and writes the result at 22050 Hz. When
    ``out.wav`` already exists the new samples are appended to its content.
    """
    hparams = create_hparams()
    hparams.sampling_rate = 22050
    hparams.gate_threshold = 0.1
    hparams.max_decoder_steps = 5000

    # Tacotron 2 from checkpoint.
    checkpoint_path = "tacotron2_statedict.pt"
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    model.cuda().eval().half()

    # WaveGlow (mel -> audio) and its denoiser.
    waveglow_path = 'waveglow_256channels.pt'
    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval().half()
    # Force 'padding_mode' on every Conv module of the unpickled model -
    # presumably needed for checkpoints saved by an older torch; TODO confirm.
    for module in waveglow.modules():
        if 'Conv' in str(type(module)):
            setattr(module, 'padding_mode', 'zeros')
    for inv_conv in waveglow.convinv:
        inv_conv.float()
    denoiser = Denoiser(waveglow)

    # Text -> symbol ids, with a leading batch axis.
    symbol_ids = np.array(text_to_sequence(text, ['english_cleaners']))
    sequence = torch.from_numpy(symbol_ids[None, :])
    sequence = torch.autograd.Variable(sequence).cuda().long()

    # Decode and plot the spectrograms and the alignment.
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    plots = (
        mel_outputs.float().data.cpu().numpy()[0],
        mel_outputs_postnet.float().data.cpu().numpy()[0],
        alignments.float().data.cpu().numpy()[0].T,
    )
    plot_data(plots)

    # Spectrogram -> audio.
    with torch.no_grad():
        audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)

    # NOTE(review): the denoised signal is computed but the raw ``audio`` is
    # what gets saved below - confirm which one is intended.
    audio_denoised = denoiser(audio, strength=0.01)[:, 0]

    # Save, appending to an existing out.wav.
    if os.path.isfile("out.wav"):
        previous, sr = librosa.load("out.wav")
        out = np.append(previous,
                        audio[0].data.cpu().numpy().astype(np.float32))
    else:
        out = audio[0].data.cpu().numpy().astype(np.float32)
    librosa.output.write_wav('./out.wav', out, 22050)
def load_waveglow(self, vocoder_path, config_fpath):
    """Build a WaveGlow model from a JSON config and restore its checkpoint.

    Args:
        vocoder_path: Checkpoint providing ``'model'`` (a state dict),
            ``'iteration'`` and ``'speaker_lookup'``.
        config_fpath: JSON file with ``train_config``, ``data_config``,
            ``dist_config`` and ``waveglow_config`` sections.

    Returns:
        ``(waveglow, denoiser, training_sigma, speaker_lookup)``.
    """
    # Load config file
    with open(config_fpath) as f:
        config = json.load(f)
    train_config = config["train_config"]
    data_config = config["data_config"]
    dist_config = config["dist_config"]
    # The vocoder config is the waveglow section plus two STFT settings
    # taken from the data section.
    vocoder_config = dict(config["waveglow_config"])
    vocoder_config.update(win_length=data_config['win_length'],
                          hop_length=data_config['hop_length'])
    print(vocoder_config)
    print(f"Config File from '{config_fpath}' successfully loaded.")

    # Import the model implementation that matches the config.
    if self.is_ax(vocoder_config):
        from efficient_model_ax import WaveGlow
    elif vocoder_config["yoyo"]:
        from efficient_model import WaveGlow
    else:
        from glow import WaveGlow

    # Initialize the model on the GPU.
    print(f"intializing WaveGlow model... ", end="")
    waveglow = WaveGlow(**vocoder_config).cuda()
    print(f"Done!")

    # Overwrite the freshly initialized weights with the checkpointed ones,
    # then switch to half-precision evaluation on the GPU.
    print(f"loading WaveGlow checkpoint... ", end="")
    checkpoint = torch.load(vocoder_path)
    waveglow.load_state_dict(checkpoint['model'])
    waveglow.cuda().eval().half()
    print(f"Done!")

    print(f"initializing Denoiser... ", end="")
    denoiser = Denoiser(waveglow)
    print(f"Done!")

    vocoder_iters = checkpoint['iteration']
    print(f"WaveGlow trained for {vocoder_iters} iterations")
    speaker_lookup = checkpoint['speaker_lookup']  # ids lookup
    training_sigma = train_config['sigma']

    return waveglow, denoiser, training_sigma, speaker_lookup
def init_model():
    """Load the Tacotron 2 and WaveGlow checkpoints into module-level globals.

    Sets ``tacotron2_model`` and ``waveglow_model``. ``denoiser`` is only
    assigned when the local ``denoiser_strength`` is positive, which it is
    not with the current hard-coded value of 0.0.
    """
    global tacotron2_model
    global waveglow_model
    global denoiser

    print("init model!!!!")

    tacotron2_path = "outdir_finetune/checkpoint_62500"
    waveglow_path = "../waveglow-fix/checkpoints_finetune/waveglow_478000"
    sampling_rate = 22050
    denoiser_strength = 0.0

    hparams = create_hparams()
    hparams.sampling_rate = sampling_rate
    hparams.training = False

    # Tacotron 2: build from hparams, then restore the saved weights.
    tacotron2_model = load_model(hparams)
    tacotron_state = torch.load(tacotron2_path)['state_dict']
    tacotron2_model.load_state_dict(tacotron_state)
    tacotron2_model.cuda().eval().half()

    # WaveGlow: the checkpoint holds the model object under 'model'.
    waveglow_model = torch.load(waveglow_path)['model']
    waveglow_model = waveglow_model.remove_weightnorm(waveglow_model)
    waveglow_model.cuda().eval().half()
    # The modules in ``convinv`` are switched back to float32.
    for inv_conv in waveglow_model.convinv:
        inv_conv.float()

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow_model)
def tacotron2_init(self):
    """Load Tacotron 2, WaveGlow and the denoiser on the GPU in half precision.

    Checkpoints are read from ``TACOTRON_CHECKPOINT_FILE`` and
    ``WAVEGLOW_CHECKPOINT_FILE``; plotting is switched off.
    """
    self.plot_wav_data = False

    # Hyperparameters.
    self.hparams = create_hparams()
    self.hparams.sampling_rate = 22050

    # Tacotron 2: build from hparams, then restore the saved weights.
    self.model = load_model(self.hparams)
    tacotron_state = torch.load(TACOTRON_CHECKPOINT_FILE)['state_dict']
    self.model.load_state_dict(tacotron_state)
    self.model.cuda().eval().half()

    # WaveGlow: the checkpoint holds the model object under 'model'.
    self.waveglow = torch.load(WAVEGLOW_CHECKPOINT_FILE)['model']
    self.waveglow.cuda().eval().half()
    # The modules in ``convinv`` are switched back to float32.
    for inv_conv in self.waveglow.convinv:
        inv_conv.float()

    self.denoiser = Denoiser(self.waveglow)
def text2audio(waveglow_path, sigma, output_dir, sampling_rate, mel):
    """Vocode ``mel`` with WaveGlow and write ``pred2.wav`` into ``output_dir``.

    The audio is written as produced by the model: it is neither denoised
    nor rescaled by ``MAX_WAV_VALUE``.
    """
    vocoder = torch.load(waveglow_path)['model']
    vocoder = vocoder.remove_weightnorm(vocoder)
    vocoder.cuda().eval()

    # NOTE(review): the denoiser is constructed but never applied here;
    # confirm whether it can be removed.
    denoiser = Denoiser(vocoder).cuda()

    with torch.no_grad():
        audio = vocoder.infer(mel.cuda(), sigma=sigma)

    samples = audio.squeeze().cpu().numpy()
    target = os.path.join(output_dir, "pred2.wav")
    sf.write(target, samples, sampling_rate)
def main(text_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Run Tacotron 2 encoder outputs through WaveGlow and save the result.

    For every batch (size 1) of ``text_files`` the output of
    ``waveglow.infer`` is squeezed and saved with ``torch.save`` as
    ``<index>_synthesis.pt`` in ``output_dir``. ``sampling_rate`` is not used,
    and the denoiser, when created, is not applied.
    """
    hparams = create_hparams()
    Taco2 = load_pretrained_taco('tacotron2.pt', hparams)

    testset = TextMelLoader(text_files, hparams)
    collate_fn = TextMelCollate()
    test_loader = DataLoader(
        testset,
        num_workers=0,
        shuffle=False,
        sampler=None,
        batch_size=1,
        pin_memory=False,
        drop_last=True,
        collate_fn=collate_fn,
    )

    # Weight normalization is left in place for this model.
    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    for index, batch in enumerate(test_loader):
        (text_padded, input_lengths, mel_padded,
         max_len, output_lengths) = parse_batch(batch)
        taco_inputs = (text_padded, input_lengths, mel_padded, max_len,
                       output_lengths)
        enc_outputs, _ = Taco2(taco_inputs)

        with torch.no_grad():
            mel = waveglow.infer(enc_outputs, input_lengths, sigma=sigma)

        print(mel)
        mel = mel.squeeze()
        print(mel.size())

        mel_path = os.path.join(output_dir, "{}_synthesis.pt".format(index))
        torch.save(mel, mel_path)
        print(mel_path)
def main(mel_files, squeezewave_path, sigma, output_dir, sampling_rate,
         is_fp16, denoiser_strength):
    """Vocode saved mel tensors with SqueezeWave and report timing per file.

    Args:
        mel_files: Passed to ``files_to_list`` to obtain the mel file paths;
            each file is read with ``torch.load``.
        squeezewave_path: Checkpoint whose ``'model'`` entry is the model.
        sigma: Passed to ``squeezewave.infer``; also embedded in the wav name.
        output_dir: Directory receiving ``<name>_s<sigma>.wav`` files.
        sampling_rate: Sampling rate written to each wav file and used to
            convert the sample count to seconds.
        is_fp16: When true, wrap the model with apex amp (``O3``) and feed it
            half-precision mels.
        denoiser_strength: Strength passed to the denoiser; the denoiser is
            skipped when this is not positive.
    """
    tic_prepare = time.time()
    mel_files = files_to_list(mel_files)
    squeezewave = torch.load(squeezewave_path)['model']
    squeezewave = squeezewave.remove_weightnorm(squeezewave)
    squeezewave.cuda().eval()
    if is_fp16:
        from apex import amp
        squeezewave, _ = amp.initialize(squeezewave, [], opt_level="O3")

    denoiser = Denoiser(squeezewave).cuda() if denoiser_strength > 0 else None
    toc_prepare = time.time()
    dur_prepare = toc_prepare - tic_prepare
    print("prepare model {:3.2}sec".format(dur_prepare) )

    for file_path in mel_files:
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        mel = torch.load(file_path)
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel

        # Timed region: inference, optional denoising and scaling.
        tic = time.time()
        with torch.no_grad():
            audio = squeezewave.infer(mel, sigma=sigma).float()
            if denoiser is not None:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        toc = time.time()
        dur = toc - tic

        # Saturate before the int16 cast: out-of-range samples would
        # otherwise wrap around. Done in float32, outside the timed region.
        audio = audio.float().clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1)
        audio = audio.squeeze().cpu().numpy()

        len_wav = len(audio)
        sec_wav = len_wav / sampling_rate
        samples_sec = len_wav / dur

        audio = audio.astype('int16')
        audio_path = os.path.join(
            output_dir, "{}_s{}.wav".format(file_name, sigma))
        write(audio_path, sampling_rate, audio)
        print("{} it took {:4.3f}sec for {:4.3f}sec {:4.2f}K sample 22Khz Audio files : RTF {:4.3f} {:4.3f}X {:4.2f}Ksamples/sec "
              .format(audio_path, dur, sec_wav, len_wav/1000, dur/sec_wav,
                      sec_wav/dur, samples_sec/1000))
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Vocode a list of saved mel tensors to wav files with WaveGlow.

    Args:
        mel_files: Passed to ``files_to_list`` to obtain the mel file paths;
            each file is read with ``torch.load``.
        waveglow_path: Checkpoint whose ``'model'`` entry is the WaveGlow model.
        sigma: Passed to ``waveglow.infer``.
        output_dir: Directory receiving ``<name>_synthesis.wav`` files.
        sampling_rate: Sampling rate written to each wav file.
        is_fp16: When true, wrap the model with apex amp (``O3``) and feed it
            half-precision mels.
        denoiser_strength: Strength passed to the denoiser; the denoiser is
            skipped when this is not positive.
    """
    # List of the mel-spectrogram files to synthesize.
    mel_files = files_to_list(mel_files)
    # Load the model and strip its weight normalization.
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    # Move to the GPU and switch to evaluation mode.
    waveglow.cuda().eval()
    # Optional apex mixed-precision wrapping.
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    denoiser = Denoiser(waveglow).cuda() if denoiser_strength > 0 else None

    for file_path in mel_files:
        # Base name of the mel file, reused for the wav file.
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        # Load the saved features and move them to the GPU.
        mel = torch.load(file_path)
        mel = torch.autograd.Variable(mel.cuda())
        # Add a leading batch axis (original author's example:
        # 80 x 375 -> 1 x 80 x 375).
        mel = torch.unsqueeze(mel, 0)
        # Half-precision input when apex is in use.
        mel = mel.half() if is_fp16 else mel
        # No gradients are tracked during synthesis.
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser is not None:
                audio = denoiser(audio, denoiser_strength)
            # Scale towards the 16-bit sample range.
            audio = audio * MAX_WAV_VALUE
        # Saturate before the int16 cast: out-of-range samples would
        # otherwise wrap around. Clamping is done in float32 because 32767 is
        # not representable in half precision.
        audio = audio.float().clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1)
        # Drop singleton axes, move to the CPU and convert to int16 samples.
        audio = audio.squeeze().cpu().numpy().astype('int16')
        # Output location.
        audio_path = os.path.join(
            output_dir, "{}_synthesis.wav".format(file_name))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
def __init__(self):
    """Load the WaveGlow vocoder and its denoiser on the GPU.

    The WaveGlow source directories are put at the front of ``sys.path`` so
    that the pickled model and the ``denoiser`` module can be imported.
    Warnings are silenced only while the model is being loaded.
    """
    for module_path in './waveglow/', './waveglow/tacotron2':
        if module_path not in sys.path:
            sys.path.insert(0, module_path)

    import warnings

    # catch_warnings() puts the caller's warning filters back on exit, even
    # when loading raises; resetwarnings() would instead discard every filter,
    # including ones configured elsewhere.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        self.waveglow = torch.load('waveglow_256channels_ljs_v2.pt')['model']
        self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)
        self.waveglow.cuda().eval()

        from denoiser import Denoiser
        self.denoiser = Denoiser(self.waveglow).cuda()
def inference_plc(mel, waveglow, sigma, is_fp16, denoiser_strength):
    """Vocode ``mel`` with an already loaded WaveGlow model.

    Args:
        mel: Mel tensor; moved to the GPU (and to half precision when
            ``is_fp16``) before inference.
        waveglow: Loaded WaveGlow model.
        sigma: Passed to ``waveglow.infer``.
        is_fp16: When true, wrap the model with apex amp (``O3``).
        denoiser_strength: Strength passed to the denoiser; the denoiser is
            skipped when this is not positive.

    Returns:
        The squeezed audio tensor scaled by ``MAX_WAV_VALUE``.
    """
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    apply_denoiser = denoiser_strength > 0
    if apply_denoiser:
        # Built before inference, as in the original statement order.
        denoiser = Denoiser(waveglow).cuda()

    mel = torch.autograd.Variable(mel.cuda())
    if is_fp16:
        mel = mel.half()

    with torch.no_grad():
        audio = waveglow.infer(mel, sigma=sigma)
        if apply_denoiser:
            audio = denoiser(audio, denoiser_strength)
        audio = audio * MAX_WAV_VALUE

    return audio.squeeze()
def main(tacotron2_path, waveglow_path, sigma, output_dir, sampling_rate,
         denoiser_strength, text, file_idx, inference_name, zip_file, hparams):
    """Check whether two Tacotron 2 inference passes give identical output.

    The RNGs are seeded from ``hparams.seed``, both models are loaded, and
    ``text`` is run through ``model.inference`` twice. The first 30 values of
    each post-net output are printed, followed by "same!!" or "different!!".
    No audio is generated; ``sigma``, ``output_dir``, ``file_idx``,
    ``inference_name`` and ``zip_file`` are not used.
    """
    hparams.sampling_rate = sampling_rate

    seed = hparams.seed
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    random.seed(seed)

    # Tacotron 2: build from hparams, then restore the saved weights.
    model = load_model(hparams)
    model.load_state_dict(torch.load(tacotron2_path)['state_dict'])
    model.cuda().eval().half()

    # WaveGlow is loaded (and a denoiser optionally built) but not run.
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval().half()
    for inv_conv in waveglow.convinv:
        inv_conv.float()

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow)

    # Text -> symbol ids, with a leading batch axis.
    sequence = np.array(
        text_to_sequence(text, ['transliteration_cleaners']))[None, :]
    print(sequence)
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    # Two passes over the same input.
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    mel_outputs, mel_outputs_postnet2, _, alignments = model.inference(
        sequence)

    MAX_WAV_VALUE = 32768.0

    print(mel_outputs_postnet.cpu().data.numpy()[0][0][:30])
    print(mel_outputs_postnet2.cpu().data.numpy()[0][0][:30])

    identical = np.array_equal(mel_outputs_postnet.cpu().data.numpy(),
                               mel_outputs_postnet2.cpu().data.numpy())
    if identical:
        print("same!!")
    else:
        print("different!!")
def __init__(self, lang):
    """Load WaveGlow and its denoiser, then load the model for ``lang``.

    The WaveGlow checkpoint path is read from ``config.json`` under
    ``model`` -> ``waveglow``. The language-specific model is set up by
    ``self.update_model``.
    """
    self.language = lang

    self.hparams = create_hparams()
    self.hparams.sampling_rate = 22050

    with open('config.json', 'r') as config_file:
        self.config = json.load(config_file)

    self.waveglow_path = self.config.get('model').get('waveglow')
    self.waveglow = torch.load(self.waveglow_path)['model']
    self.waveglow.cuda().eval().half()
    # Force 'padding_mode' on every Conv module of the unpickled model -
    # presumably needed for checkpoints saved by an older torch; TODO confirm.
    for module in self.waveglow.modules():
        if 'Conv' in str(type(module)):
            setattr(module, 'padding_mode', 'zeros')
    # The modules in ``convinv`` are switched back to float32.
    for inv_conv in self.waveglow.convinv:
        inv_conv.float()

    self.denoiser = Denoiser(self.waveglow)

    self.update_model(lang)
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Vocode mel files stored as ``.pt`` or ``.npy`` to wav with WaveGlow.

    Args:
        mel_files: Passed to ``files_to_list`` to obtain the mel file paths.
            Paths containing ``.pt`` are read with ``torch.load``, paths
            containing ``.npy`` with ``numpy.load``.
        waveglow_path: Checkpoint whose ``'model'`` entry is the WaveGlow model.
        sigma: Passed to ``waveglow.infer``.
        output_dir: Directory receiving ``<name>_synthesis.wav`` files;
            created if missing.
        sampling_rate: Sampling rate written to each wav file.
        is_fp16: When true, wrap the model with apex amp (``O3``) and feed it
            half-precision mels.
        denoiser_strength: Strength passed to the denoiser; the denoiser is
            skipped when this is not positive.

    Raises:
        ValueError: If a file path matches neither supported format.
    """
    mel_files = files_to_list(mel_files)
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    denoiser = Denoiser(waveglow).cuda() if denoiser_strength > 0 else None

    # Create the output directory once instead of checking it for every file.
    os.makedirs(output_dir, exist_ok=True)

    for file_path in mel_files:
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        print('Loading file: ', file_path)
        if '.pt' in file_path:
            print('load by torch')
            mel = torch.load(file_path)
        elif '.npy' in file_path:
            print('load by numpy')
            mel = torch.from_numpy(np.load(file_path))
        else:
            # Without this branch ``mel`` would be undefined for the first
            # file, or silently reuse the previous file's spectrogram.
            raise ValueError(
                "Unsupported mel file format: {}".format(file_path))

        print(f"original mel shape: {mel.shape}")
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel
        print(f"mel shape right before using waveglow: {mel.shape}")
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser is not None:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        # Saturate before the int16 cast: out-of-range samples would
        # otherwise wrap around. Clamping is done in float32 because 32767 is
        # not representable in half precision.
        audio = audio.float().clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1)
        audio = audio.squeeze().cpu().numpy().astype('int16')

        audio_path = os.path.join(
            output_dir, "{}_synthesis.wav".format(file_name))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
def __init__(self, model_path, device, sigma=0.666, strength=0.1):
    """Load a WaveGlow model and its denoiser onto ``device``.

    Falls back to the CPU when CUDA is unavailable. The model runs in
    float32 on the CPU and in half precision on any other device.

    Args:
        model_path: Checkpoint whose ``"model"`` entry is the WaveGlow model.
        device: Requested device; ignored when CUDA is unavailable.
        sigma: Stored as ``self.sigma``.
        strength: Stored as ``self.strength``.
    """
    self.sigma = sigma
    self.strength = strength

    chosen = device if torch.cuda.is_available() else "cpu"
    self.device = torch.device(chosen)
    if self.device.type == "cpu":
        self.dtype = torch.float
    else:
        self.dtype = torch.half

    self.model = torch.load(model_path, map_location=self.device)["model"]
    self.model.device = self.device
    # Force 'padding_mode' on every Conv module of the unpickled model -
    # presumably needed for checkpoints saved by an older torch; TODO confirm.
    for module in self.model.modules():
        if "Conv" in str(type(module)):
            setattr(module, "padding_mode", "zeros")
    self.model.eval().to(device=self.device, dtype=self.dtype)
    # The modules in ``convinv`` are switched back to float32.
    for inv_conv in self.model.convinv:
        inv_conv.float()

    self.denoiser = Denoiser(self.model, device=self.device)
def waveglow_gen(waveglow_path, mel, sigma=0.666, denoiser_strength=0.1,
                 fp16=False):
    """Generate denoised audio from ``mel`` with a WaveGlow checkpoint.

    The model is loaded for this call only and released afterwards; the CUDA
    cache is emptied before and after. ``mel`` is not moved to the GPU here.

    Returns:
        The audio on the CPU, viewed as ``(1, -1)``.
    """
    torch.cuda.empty_cache()

    vocoder = torch.load(waveglow_path)['model']
    vocoder.cuda().eval()
    if fp16:
        vocoder = vocoder.half()
        mel = mel.half()
        # The modules in ``convinv`` are switched back to float32.
        for inv_conv in vocoder.convinv:
            inv_conv.float()

    denoiser = Denoiser(vocoder)
    with torch.no_grad():
        raw_audio = vocoder.infer(mel, sigma)
        audio = denoiser(raw_audio, denoiser_strength)

    # Drop the model references so the cache flush can release their memory.
    del vocoder, denoiser
    torch.cuda.empty_cache()

    return audio.cpu().view(1, -1)
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Vocode raw binary mel files to wav files with WaveGlow.

    Each mel file starts with two int32 values giving the array shape,
    followed (at byte offset 8) by the float32 data, which is transposed
    before being fed to the model.

    Args:
        mel_files: Passed to ``files_to_list`` to obtain the mel file paths.
        waveglow_path: Checkpoint whose ``'model'`` entry is the WaveGlow model.
        sigma: Passed to ``waveglow.infer``.
        output_dir: Directory receiving ``<name>.wav`` files.
        sampling_rate: Sampling rate written to each wav file.
        is_fp16: When true, wrap the model with apex amp (``O3``) and feed it
            half-precision mels.
        denoiser_strength: Strength passed to the denoiser; the denoiser is
            skipped when this is not positive.
    """
    mel_files = files_to_list(mel_files)
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    denoiser = Denoiser(waveglow).cuda() if denoiser_strength > 0 else None

    for file_path in mel_files:
        file_name = os.path.splitext(os.path.basename(file_path))[0]

        # Generic mel files: shape header, then the float32 payload. (The
        # unreachable ``torch.load`` alternative has been dropped.)
        shape = tuple(np.fromfile(file_path, count=2, dtype=np.int32))
        mel = np.memmap(file_path, offset=8, dtype=np.float32, shape=shape)
        mel = torch.from_numpy(mel.transpose())

        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser is not None:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        # Saturate before the int16 cast: out-of-range samples would
        # otherwise wrap around. Clamping is done in float32 because 32767 is
        # not representable in half precision.
        audio = audio.float().clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1)
        audio = audio.squeeze().cpu().numpy().astype('int16')
        audio_path = os.path.join(output_dir, "{}.wav".format(file_name))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
def inference_plc(mel, waveglow_path, sigma, sampling_rate, is_fp16,
                  denoiser_strength):
    """Load WaveGlow from ``waveglow_path`` and vocode ``mel``.

    Args:
        mel: Mel tensor; moved to the GPU (and to half precision when
            ``is_fp16``) before inference.
        waveglow_path: Checkpoint whose ``'model'`` entry is the WaveGlow model.
        sigma: Passed to ``waveglow.infer``.
        sampling_rate: Not used by this function.
        is_fp16: When true, wrap the model with apex amp (``O3``).
        denoiser_strength: Strength passed to the denoiser; the denoiser is
            skipped when this is not positive.

    Returns:
        The squeezed audio tensor scaled by ``MAX_WAV_VALUE``.
    """
    vocoder = torch.load(waveglow_path)['model']
    vocoder = vocoder.remove_weightnorm(vocoder)
    vocoder.cuda().eval()
    if is_fp16:
        from apex import amp
        vocoder, _ = amp.initialize(vocoder, [], opt_level="O3")

    apply_denoiser = denoiser_strength > 0
    if apply_denoiser:
        # Built before inference, as in the original statement order.
        denoiser = Denoiser(vocoder).cuda()

    mel = torch.autograd.Variable(mel.cuda())
    if is_fp16:
        mel = mel.half()

    with torch.no_grad():
        audio = vocoder.infer(mel, sigma=sigma)
        if apply_denoiser:
            audio = denoiser(audio, denoiser_strength)
        audio = audio * MAX_WAV_VALUE

    return audio.squeeze()
def predict(hp, model, mel, denoise=False, device="cuda"):
    """Vocode ``mel`` with ``model`` and return int16 samples as a numpy array.

    Args:
        hp: Hyperparameters; ``hp.model.out_channels`` and
            ``hp.audio.hop_length`` are read.
        model: Generator exposing ``inference``.
        mel: Mel tensor; a two-dimensional input gets a leading axis added.
        denoise: When true, run the denoiser (strength 0.1) and average its
            output over the first axis.
        device: Device the mel, the PQMF and the denoiser are placed on.
    """
    with torch.no_grad():
        if mel.dim() == 2:
            mel = mel.unsqueeze(0)
        wav = model.inference(mel.to(device))

        # For multi-band inference: merge the sub-bands with the PQMF.
        if hp.model.out_channels > 1:
            pqmf = PQMF(device=device)
            wav = pqmf.synthesis(wav).squeeze(0)

        if denoise:
            denoiser = Denoiser(model, device=device).to(device)
            wav = denoiser(wav, 0.1).mean(0)

        wav = wav.squeeze()
        # Drop the last ten hops of samples.
        wav = wav[:-(hp.audio.hop_length * 10)]
        # Scale to the 16-bit range and saturate before the integer cast.
        wav = (MAX_WAV_VALUE * wav).clamp(min=-MAX_WAV_VALUE,
                                          max=MAX_WAV_VALUE - 1)
        samples = wav.short().cpu().detach().numpy()

    return samples