def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):
    k = model.get_step() // 1000
    file_name = load_path.stem
    suffix = load_path.suffix

    if suffix == ".wav":
        wav = load_wav(load_path)
        save_wav(wav, save_path/f'__{file_name}__{k}k_steps_target.wav')
        mel = melspectrogram(wav)
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            raise ValueError(f'Expected a numpy array shaped (n_mels, n_hops), but got {mel.shape}!')
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(f'Expected spectrogram range in [0, 1] but was instead [{_min}, {_max}]')
    else:
        raise ValueError(f'Expected an extension of .wav or .npy, but got {suffix}!')

    mel = torch.tensor(mel).unsqueeze(0)

    batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
    save_str = save_path/f'__{file_name}__{k}k_steps_{batch_str}.wav'

    _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
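
# Usage sketch for gen_from_file above. The constructor arguments mirror the
# hp.voc_* names used later in this file; the checkpoint and input paths are
# assumptions for illustration, not part of the original scripts.
def demo_gen_from_file():
    model = WaveRNN(rnn_dims=hp.voc_rnn_dims, fc_dims=hp.voc_fc_dims,
                    bits=hp.bits, pad=hp.voc_pad,
                    upsample_factors=hp.voc_upsample_factors,
                    feat_dims=hp.num_mels, compute_dims=hp.voc_compute_dims,
                    res_out_dims=hp.voc_res_out_dims, res_blocks=hp.voc_res_blocks,
                    hop_length=hp.hop_length, sample_rate=hp.sample_rate,
                    mode=hp.voc_mode)
    model.restore('checkpoints/voc_latest_weights.pyt')  # hypothetical checkpoint path
    model.eval()
    gen_from_file(model, Path('mels/sample.npy'), Path('model_outputs'),
                  batched=hp.voc_gen_batched, target=hp.voc_target, overlap=hp.voc_overlap)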
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path: Path):
    k = model.get_step() // 1000

    for i, (m, x) in enumerate(test_set, 1):
        if i > samples:
            break

        print('\n| Generating: %i/%i' % (i, samples))

        x = x[0].numpy()
        bits = 16 if hp.voc_mode == 'MOL' else hp.bits

        if hp.mu_law and hp.voc_mode != 'MOL':
            x = decode_mu_law(x, 2**bits, from_labels=True)
        else:
            x = label_2_float(x, bits)

        # note: the format string must be applied before the path join, since
        # `/` and `%` share precedence and Path objects do not support `%`
        save_wav(x, save_path / ('%sk_steps_%s_target.wav' % (repr1(k), repr1(i))))

        batch_str = ('gen_batched_target%s_overlap%s' % (repr1(target), repr1(overlap))) if batched else 'gen_NOT_BATCHED'
        save_str = str(save_path / ('%sk_steps_%s_%s.wav' % (repr1(k), repr1(i), repr1(batch_str))))

        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path: Path):
    k = model.get_step() // 1000

    for i, (m, x) in enumerate(test_set, 1):
        if i > samples:
            break

        print('\n| Generating: %i/%i' % (i, samples))

        x = x[0].numpy()
        bits = 16 if hp.voc_mode == 'MOL' else hp.bits

        if hp.mu_law and hp.voc_mode != 'MOL':
            x = decode_mu_law(x, 2**bits, from_labels=True)
        else:
            x = label_2_float(x, bits)

        save_wav(x, save_path/f'{k}k_steps_{i}_target.wav')

        batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
        save_str = str(save_path/f'{k}k_steps_{i}_{batch_str}.wav')

        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
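
# Minimal sketch of the (mel, quantized-wav) pairs that gen_testset expects
# from `test_set`. The directory layout and file naming are assumptions; any
# iterable of such pairs (e.g. a DataLoader over the validation split) works
# the same way.
def make_test_set(mel_dir: Path, quant_dir: Path):
    for mel_file in sorted(mel_dir.glob('*.npy')):
        m = torch.tensor(np.load(mel_file)).unsqueeze(0)                   # (1, n_mels, n_hops)
        x = torch.tensor(np.load(quant_dir / mel_file.name)).unsqueeze(0)  # (1, T) label indices
        yield m, x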
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):
    k = model.get_step() // 1000
    os.makedirs(save_path/'test', exist_ok=True)

    for file_name in tqdm(os.listdir(load_path)):
        if file_name.endswith('.npy'):
            mel = np.load(os.path.join(load_path, file_name))
            mel = torch.tensor(mel).unsqueeze(0)
            batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
            save_str = save_path/f'test/{file_name}__{k}k_steps_{batch_str}.wav'
            _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
def generate_samples(self, model: WaveRNN, session: VocSession) -> Tuple[float, list]:
    """
    Generates audio samples to cherry-pick models. To evaluate audio quality
    we calculate the l1 distance between mels of predictions and targets.
    """
    model.eval()
    mel_losses = []
    gen_wavs = []
    device = next(model.parameters()).device

    for i, sample in enumerate(session.val_set_samples, 1):
        m, x = sample['mel'], sample['x']
        if i > self.train_cfg['num_gen_samples']:
            break
        x = x[0].numpy()
        bits = 16 if self.dsp.voc_mode == 'MOL' else self.dsp.bits
        if self.dsp.mu_law and self.dsp.voc_mode != 'MOL':
            x = DSP.decode_mu_law(x, 2**bits, from_labels=True)
        else:
            x = DSP.label_2_float(x, bits)

        gen_wav = model.generate(mels=m,
                                 batched=self.train_cfg['gen_batched'],
                                 target=self.train_cfg['target'],
                                 overlap=self.train_cfg['overlap'],
                                 mu_law=self.dsp.mu_law,
                                 silent=True)
        gen_wavs.append(gen_wav)

        y_mel = self.dsp.wav_to_mel(x.squeeze(), normalize=False)
        y_mel = torch.tensor(y_mel).to(device)
        y_hat_mel = self.dsp.wav_to_mel(gen_wav, normalize=False)
        y_hat_mel = torch.tensor(y_hat_mel).to(device)
        loss = F.l1_loss(y_hat_mel, y_mel)
        mel_losses.append(loss.item())

        self.writer.add_audio(tag=f'Validation_Samples/target_{i}', snd_tensor=x,
                              global_step=model.step, sample_rate=self.dsp.sample_rate)
        self.writer.add_audio(tag=f'Validation_Samples/generated_{i}', snd_tensor=gen_wav,
                              global_step=model.step, sample_rate=self.dsp.sample_rate)

    return sum(mel_losses) / len(mel_losses), gen_wavs[0]
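
# Standalone sketch of the mel-L1 metric computed in generate_samples above,
# for reuse outside the trainer. `wav_to_mel` stands in for the session's mel
# extractor (self.dsp.wav_to_mel here); the length truncation is an added
# assumption to guard against generated and target wavs differing by a few hops.
def mel_l1(target_wav, gen_wav, wav_to_mel) -> float:
    y_mel = torch.tensor(wav_to_mel(target_wav))
    y_hat_mel = torch.tensor(wav_to_mel(gen_wav))
    n = min(y_mel.shape[-1], y_hat_mel.shape[-1])  # trim to the shorter mel
    return F.l1_loss(y_hat_mel[..., :n], y_mel[..., :n]).item()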
def generate_samples(self, model: WaveRNN, session: VocSession) -> Tuple[float, list]:
    """
    Generates audio samples to cherry-pick models. To evaluate audio quality
    we calculate the l1 distance between mels of predictions and targets.
    """
    model.eval()
    mel_losses = []
    gen_wavs = []
    device = next(model.parameters()).device

    for i, (m, x) in enumerate(session.val_set_samples, 1):
        if i > hp.voc_gen_num_samples:
            break
        x = x[0].numpy()
        bits = 16 if hp.voc_mode == 'MOL' else hp.bits
        if hp.mu_law and hp.voc_mode != 'MOL':
            x = decode_mu_law(x, 2**bits, from_labels=True)
        else:
            x = label_2_float(x, bits)

        gen_wav = model.generate(mels=m,
                                 save_path=None,
                                 batched=hp.voc_gen_batched,
                                 target=hp.voc_target,
                                 overlap=hp.voc_overlap,
                                 mu_law=hp.mu_law,
                                 silent=True)
        gen_wavs.append(gen_wav)

        y_mel = raw_melspec(x.squeeze())
        y_mel = torch.tensor(y_mel).to(device)
        y_hat_mel = raw_melspec(gen_wav)
        y_hat_mel = torch.tensor(y_hat_mel).to(device)
        loss = F.l1_loss(y_hat_mel, y_mel)
        mel_losses.append(loss.item())

        self.writer.add_audio(tag=f'Validation_Samples/target_{i}', snd_tensor=x,
                              global_step=model.step, sample_rate=hp.sample_rate)
        self.writer.add_audio(tag=f'Validation_Samples/generated_{i}', snd_tensor=gen_wav,
                              global_step=model.step, sample_rate=hp.sample_rate)

    return sum(mel_losses) / len(mel_losses), gen_wavs[0]
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path: Path):
    k = model.get_step() // 1000
    mypqmf = PQMF()

    for i, (m, x) in enumerate(test_set, 1):
        if i > samples:
            break

        print('\n| Generating: %i/%i' % (i, samples))

        x = x[0].numpy()
        bits = 16 if hp.voc_mode == 'MOL' else hp.bits

        if hp.mu_law and hp.voc_mode != 'MOL':
            x = decode_mu_law(x, 2 ** bits, from_labels=True)
        else:
            x = label_2_float(x, bits)

        if hp.voc_multiband:
            # (1, sub_band, T//sub_band) -> (1, 1, T)
            source = mypqmf.synthesis(torch.tensor(x, dtype=torch.float).unsqueeze(0)).numpy()
            source = source.squeeze()  # (T,)
            save_wav(source, save_path/f'{k}k_steps_{i}_target.wav')
            # np.save(save_path/f'{k}k_steps_{i}_target.npy', x, allow_pickle=False)
        else:
            save_wav(x, save_path/f'{k}k_steps_{i}_target.wav')

        batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
        save_str = str(save_path/f'{k}k_steps_{i}_{batch_str}.wav')

        # generate returns the waveform after PQMF synthesis
        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
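
# Round-trip sketch for the multiband path above, assuming the PQMF class
# also exposes analysis() as the inverse of the synthesis() call used in
# gen_testset: analysis splits a full-band signal (1, 1, T) into sub-bands
# (1, sub_band, T//sub_band), and synthesis reassembles the full-band signal.
def pqmf_roundtrip(wav):
    pqmf = PQMF()
    full = torch.tensor(wav, dtype=torch.float).reshape(1, 1, -1)
    subbands = pqmf.analysis(full)    # (1, sub_band, T//sub_band)
    recon = pqmf.synthesis(subbands)  # (1, 1, T)
    return recon.squeeze().numpy()    # (T,)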
def gen_testset(model: WaveRNN, test_set_wav, samples, batched, target, overlap, save_path: Path):
    '''
    :param model:
    :param test_set_wav: test set directory, containing mel (or sp+f0) features and the source audio files
    :param samples: number of samples, i.e. audio files, to generate
    :param batched: batched is True in this script
    :param target: 11000
    :param overlap: 550
    :param save_path: model_outputs_*
    :return: the generated audio files
    '''
    for i in os.listdir(test_set_wav):
        m = np.expand_dims(np.load(join(test_set_wav, i)).T, 0)
        filename = basename(i)[:-4]
        wave_path = "/emotion_wav/"
        save_str = wave_path + str(filename) + ".wav"
        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):
    suffix = load_path.suffix

    if suffix == ".wav":
        wav = load_wav(load_path)
        save_wav(wav, os.path.join(save_path, "target", os.path.basename(load_path)))
        print("Generating from {0}".format(load_path))
        mel = melspectrogram(wav)
        print("Melspectrograms generated!")
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            raise ValueError(
                f'Expected a numpy array shaped (n_mels, n_hops), but got {mel.shape}!')
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(
                f'Expected spectrogram range in [0, 1] but was instead [{_min}, {_max}]')
    else:
        raise ValueError(f"Expected an extension of .wav or .npy, but got {suffix}!")

    mel = torch.tensor(mel).unsqueeze(0)

    batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
    save_str = os.path.join(save_path, os.path.basename(load_path))

    beg = time.time()
    print("Start generating... [{0}]".format(beg))
    output = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
    end = time.time()
    print("Done generating... [{0}] -> delta: [{1}]".format(end, end - beg))

    save_wav(output, save_str)
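
# Follow-up sketch: the timing printout above is easier to compare across runs
# as a real-time factor (seconds spent generating per second of audio). This
# helper is illustrative, not part of the original script.
def real_time_factor(num_samples: int, gen_seconds: float, sample_rate: int) -> float:
    audio_seconds = num_samples / sample_rate
    return gen_seconds / audio_seconds

# e.g. real_time_factor(len(output), end - beg, hp.sample_rate)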
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):
    k = model.get_step() // 1000
    file_name = load_path.stem
    suffix = load_path.suffix

    if suffix == ".wav":
        wav = load_wav(load_path)
        # the format string must be applied before the path join, since `/`
        # and `%` share precedence and Path objects do not support `%`
        save_wav(wav, save_path / ('__%s__%sk_steps_target.wav' % (repr1(file_name), repr1(k))))
        mel = melspectrogram(wav)
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            raise ValueError(
                'Expected a numpy array shaped (n_mels, n_hops), but got %s!' % (repr1(mel.shape)))
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(
                'Expected spectrogram range in [0, 1] but was instead [%s, %s]' % (repr1(_min), repr1(_max)))
    else:
        raise ValueError('Expected an extension of .wav or .npy, but got %s!' % (repr1(suffix)))

    mel = torch.tensor(mel).unsqueeze(0)

    batch_str = ('gen_batched_target%s_overlap%s' % (repr1(target), repr1(overlap))) if batched else 'gen_NOT_BATCHED'
    save_str = save_path / ('__%s__%sk_steps_%s.wav' % (repr1(file_name), repr1(k), repr1(batch_str)))

    _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap, prefix=''):
    # `prefix` (an optional output file name prefix) was undefined in the
    # original snippet; it is added as a parameter here so the function is
    # self-contained.
    k = model.get_step() // 1000
    file_name = load_path.stem
    suffix = load_path.suffix

    if suffix == ".wav":
        wav = load_wav(load_path)
        save_wav(wav, save_path / f'{prefix}{file_name}.target.wav')
        mel = melspectrogram(wav)
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            raise ValueError(
                f'Expected a numpy array shaped (n_mels, n_hops), but got {mel.shape}!')
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(
                f'Expected spectrogram range in [0, 1] but was instead [{_min}, {_max}]')
    else:
        raise ValueError(f"Expected an extension of .wav or .npy, but got {suffix}!")

    m = torch.tensor(mel).unsqueeze(0)

    save_str_wavernn = save_path / f'{prefix}{file_name}.wavernn.wav'
    save_str_griffinlim = save_path / f'{prefix}{file_name}.griffinlim.wav'

    wav = reconstruct_waveform(mel, n_iter=32)
    save_wav(wav, save_str_griffinlim)

    _ = model.generate(m, save_str_wavernn, batched, target, overlap, hp.mu_law)
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path: Path):
    '''
    :param model:
    :param test_set: test set, containing mel (or sp+f0) features and the loaded source audio
    :param samples: number of samples, i.e. audio files, to generate
    :param batched: batched is True in this script
    :param target: 11000
    :param overlap: 550
    :param save_path: model_outputs_*
    :return: the generated audio files
    '''
    k = model.get_step() // 1000

    for i, (m, x) in enumerate(test_set, 1):
        if i > samples:
            break

        print('\n| Generating: %i/%i' % (i, samples))

        x = x[0].numpy()
        bits = 16 if hp.voc_mode == 'MOL' else hp.bits

        if hp.mu_law and hp.voc_mode != 'MOL':
            x = decode_mu_law(x, 2**bits, from_labels=True)
        else:
            x = label_2_float(x, bits)

        save_wav(x, save_path / f'{k}k_steps_{i}_target.wav')  # save the original (target) audio

        batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
        save_str = str(save_path / f'{k}k_steps_{i}_{batch_str}.wav')

        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
_, m, dur, pitch = tts_model.generate(x, alpha=args.alpha, pitch_function=pitch_function)

if args.vocoder == 'griffinlim':
    v_type = args.vocoder
elif args.vocoder == 'wavernn' and args.batched:
    v_type = 'wavernn_batched'
else:
    v_type = 'wavernn_unbatched'

if input_text:
    save_path = paths.forward_output / f'{input_text[:10]}_{args.alpha}_{v_type}_{tts_k}k_amp{args.amp}.wav'
else:
    save_path = paths.forward_output / f'{i}_{v_type}_{tts_k}k_alpha{args.alpha}_amp{args.amp}.wav'

if args.vocoder == 'wavernn':
    m = torch.tensor(m).unsqueeze(0)
    voc_model.generate(m, save_path, batched, hp.voc_target, hp.voc_overlap, hp.mu_law)
elif args.vocoder == 'melgan':
    m = torch.tensor(m).unsqueeze(0)
    torch.save(m, paths.forward_output / f'{i}_{tts_k}_alpha{args.alpha}_amp{args.amp}.mel')
elif args.vocoder == 'griffinlim':
    wav = reconstruct_waveform(m, n_iter=args.iters)
    save_wav(wav, save_path)

print('\n\nDone.\n')
                res_blocks=hp.voc_res_blocks,
                hop_length=hp.hop_length,
                sample_rate=hp.sample_rate,
                pad_val=hp.voc_pad_val,
                mode=hp.voc_mode).cuda()

paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

restore_path = args.weights if args.weights else paths.voc_latest_weights
model.restore(restore_path)
model.eval()

if hp.amp:
    model, _ = amp.initialize(model, [], opt_level='O3')

simple_table([('Generation Mode', 'Batched' if batched else 'Unbatched'),
              ('Target Samples', target if batched else 'N/A'),
              ('Overlap Samples', overlap if batched else 'N/A')])

k = model.get_step() // 1000

for file_name in os.listdir(args.dir):
    if file_name.endswith('.npy'):
        mel = np.load(os.path.join(args.dir, file_name))
        mel = torch.tensor(mel).unsqueeze(0)
        batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
        save_str = f'{file_name}__{k}k_steps_{batch_str}.wav'
        model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
def TTS_Wave(self):
    os.makedirs('quick_start/tts_weights/', exist_ok=True)
    os.makedirs('quick_start/voc_weights/', exist_ok=True)

    zip_ref = zipfile.ZipFile('pretrained/ljspeech.wavernn.mol.800k.zip', 'r')
    zip_ref.extractall('quick_start/voc_weights/')
    zip_ref.close()

    zip_ref = zipfile.ZipFile('pretrained/ljspeech.tacotron.r2.180k.zip', 'r')
    zip_ref.extractall('quick_start/tts_weights/')
    zip_ref.close()

    # Parse Arguments
    parser = argparse.ArgumentParser(description='TTS Generator')
    parser.add_argument('-name', metavar='name', type=str, help='name of pdf')
    parser.add_argument('--input_text', '-i', type=str, help='[string] Type in something here and TTS will generate it!')
    parser.add_argument('--batched', '-b', dest='batched', action='store_true', help='Fast Batched Generation (lower quality)')
    parser.add_argument('--unbatched', '-u', dest='batched', action='store_false', help='Slower Unbatched Generation (better quality)')
    parser.add_argument('--target', '-t', type=int, help='[int] number of samples in each batch index')
    parser.add_argument('--overlap', '-o', type=int, help='[int] number of crossover samples')
    parser.add_argument('--force_cpu', '-c', action='store_true', help='Forces CPU-only training, even when in CUDA capable environment')
    parser.set_defaults(batched=hp.voc_gen_batched)
    parser.set_defaults(target=hp.voc_target)
    parser.set_defaults(overlap=hp.voc_overlap)
    parser.set_defaults(input_text=None)
    parser.set_defaults(weights_path=None)
    args = parser.parse_args()

    batched = args.batched
    target = args.target
    overlap = args.overlap
    input_text = args.input_text
    weights_path = args.weights_path

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        torch.cuda.set_device(0)
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising WaveRNN Model...\n')

    # Instantiate WaveRNN Model
    voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                        fc_dims=hp.voc_fc_dims,
                        bits=hp.bits,
                        pad=hp.voc_pad,
                        upsample_factors=hp.voc_upsample_factors,
                        feat_dims=hp.num_mels,
                        compute_dims=hp.voc_compute_dims,
                        res_out_dims=hp.voc_res_out_dims,
                        res_blocks=hp.voc_res_blocks,
                        hop_length=hp.hop_length,
                        sample_rate=hp.sample_rate,
                        mode='MOL').to(device)

    voc_model.restore('quick_start/voc_weights/latest_weights.pyt')

    print('\nInitialising Tacotron Model...\n')

    # Instantiate Tacotron Model
    tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                         num_chars=len(symbols),
                         encoder_dims=hp.tts_encoder_dims,
                         decoder_dims=hp.tts_decoder_dims,
                         n_mels=hp.num_mels,
                         fft_bins=hp.num_mels,
                         postnet_dims=hp.tts_postnet_dims,
                         encoder_K=hp.tts_encoder_K,
                         lstm_dims=hp.tts_lstm_dims,
                         postnet_K=hp.tts_postnet_K,
                         num_highways=hp.tts_num_highways,
                         dropout=hp.tts_dropout).to(device)

    tts_model.restore('quick_start/tts_weights/latest_weights.pyt')

    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('final.txt') as f:
            inputs = [text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f]

    voc_k = voc_model.get_step() // 1000
    tts_k = tts_model.get_step() // 1000
    r = tts_model.get_r()

    simple_table([('WaveRNN', str(voc_k) + 'k'),
                  (f'Tacotron(r={r})', str(tts_k) + 'k'),
                  ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                  ('Target Samples', target if batched else 'N/A'),
                  ('Overlap Samples', overlap if batched else 'N/A')])

    for i, x in enumerate(inputs, 1):
        print(f'\n| Generating {i}/{len(inputs)}')
        _, m, attention = tts_model.generate(x)
        save_path = './output_audio/' + str(i) + '.wav'
        save_attention(attention, save_path)

        m = torch.tensor(m).unsqueeze(0)
        m = (m + 4) / 8

        voc_model.generate(m, save_path, batched, hp.voc_target, hp.voc_overlap, hp.mu_law)

        if i == 2:
            temp1 = AudioSegment.from_wav("./output_audio/" + str(i-1) + ".wav")
            temp2 = AudioSegment.from_wav("./output_audio/" + str(i) + ".wav")
            combined_sounds = temp1 + temp2
            os.remove("./output_audio/" + str(i-1) + ".wav")
            os.remove("./output_audio/" + str(i) + ".wav")
            combined_sounds.export("./output_audio/" + self.path[:-4] + ".wav", format="wav")
        elif i > 2:
            preTemp = AudioSegment.from_wav("./output_audio/" + self.path[:-4] + ".wav")
            newTemp = AudioSegment.from_wav("./output_audio/" + str(i) + ".wav")
            combined_sounds = preTemp + newTemp
            os.remove("./output_audio/" + self.path[:-4] + ".wav")
            os.remove("./output_audio/" + str(i) + ".wav")
            combined_sounds.export("./output_audio/" + self.path[:-4] + ".wav", format="wav")

    print("Done")
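
# One-pass alternative to the incremental pydub merge above: collect the
# per-sentence wav paths first, then concatenate once and export a single
# file. Illustrative sketch; the paths in the usage example are assumptions.
def merge_wavs(paths, out_path):
    combined = AudioSegment.empty()
    for p in paths:
        combined += AudioSegment.from_wav(p)
    combined.export(out_path, format='wav')

# e.g. merge_wavs([f'./output_audio/{n}.wav' for n in range(1, 4)], './output_audio/merged.wav')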