def load_data(datapath, glob_file_str, scale=True, data_split=[0.8, 0.1]):
    data = defaultdict(list)
    stft = TacotronSTFT(filter_length=1024, hop_length=160, win_length=1024,
                        sampling_rate=16000, n_mel_channels=64, mel_fmin=0,
                        mel_fmax=None, representation='asrgen')
    for folderpath in sorted(glob.glob(os.path.join(datapath, '*/'))):
        label = os.path.basename(os.path.normpath(folderpath))
        filepaths = glob.glob(
            os.path.join(os.path.join(datapath, label), glob_file_str))
        for filepath in filepaths:
            audio = load_wav_to_torch(filepath, stft.sampling_rate)
            audio_norm = audio / MAX_WAV_VALUE
            audio_norm = audio_norm / torch.max(audio_norm.abs())
            audio_norm = audio_norm.unsqueeze(0)
            audio_norm = torch.autograd.Variable(audio_norm,
                                                 requires_grad=False)
            mel_spec = stft.mel_spectrogram(audio_norm)[0]
            # Rescale the mel spectrogram to [-1, 1]
            mel_spec -= mel_spec.min()
            mel_spec = mel_spec / torch.max(mel_spec)
            mel_spec = (mel_spec * 2) - 1
            # Split each utterance along the time axis into train/valid/test
            train_end = int(mel_spec.size(1) * data_split[0])
            val_end = int(mel_spec.size(1) * (data_split[0] + data_split[1]))
            data['train'].append([mel_spec[:, :train_end], label])
            data['valid'].append([mel_spec[:, train_end:val_end], label])
            data['test'].append([mel_spec[:, val_end:], label])
    return data
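# Usage sketch for load_data above. The dataset root and glob pattern are
# placeholders; the layout is assumed to be one subfolder per class label,
# each holding wav files.
data = load_data('datasets/commands', '*.wav')
mel, label = data['train'][0]
print(mel.shape, label)  # mel: (n_mel_channels, frames) tensor in [-1, 1]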
def mel_spectrogram_and_waveform_generation(checkpoint_path, text, hparams):
    # Griffin-Lim iterations
    n_iter = 60

    # #### Load model from checkpoint
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.eval()

    # #### Prepare text input
    # text = "amor é fogo que arde sem se ver."
    sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    # #### Decode text input
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)
    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling

    waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                           taco_stft.stft_fn, n_iter)
    return waveform
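# Hedged end-to-end example for the function above; the checkpoint path is a
# placeholder and write comes from scipy.io.wavfile.
from scipy.io.wavfile import write

hparams = create_hparams()
waveform = mel_spectrogram_and_waveform_generation(
    'checkpoints/tacotron2_statedict.pt', 'amor é fogo que arde sem se ver.',
    hparams)
write('sample.wav', hparams.sampling_rate, waveform[0].data.cpu().numpy())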
def __init__(self, training_files, val_files, segment_length, filter_length,
             hop_length, win_length, sampling_rate, mel_fmin, mel_fmax,
             val_flag=False):
    self.audio_files = files_to_list(training_files)
    if val_flag:
        self.audio_files = files_to_list(val_files)
    # Drop clips shorter than one second; filter into a new list instead of
    # removing items while iterating, which silently skips entries.
    kept = []
    i = 0
    for file in self.audio_files:
        audio_data, sample_r = load_wav_to_torch(file)
        if audio_data.size(0) < sampling_rate:
            i += 1
        else:
            kept.append(file)
    self.audio_files = kept
    print("{} files shorter than segment_len".format(i))
    random.seed(1234)
    random.shuffle(self.audio_files)
    self.stft = TacotronSTFT(filter_length=filter_length,
                             hop_length=hop_length,
                             win_length=win_length,
                             sampling_rate=sampling_rate,
                             mel_fmin=mel_fmin, mel_fmax=mel_fmax)
    self.segment_length = segment_length
    self.sampling_rate = sampling_rate
def test_MCD_and_f0():
    hparams = create_hparams()
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)
    audio_path = 'kakao/1/1_0001.wav'
    mel_path = 'kakao/1/1_0001.mel.npy'

    srcMel = torch.from_numpy(np.load(mel_path)).unsqueeze(0)
    srcMel = torch.clamp(srcMel, -4.0, 4.0)
    # print(srcMel.shape, srcMel.max(), srcMel.min())

    audio, sr = load_wav_to_torch(audio_path)
    # print(audio.shape, audio.max(), audio.min())
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    # print(audio_norm.shape, audio_norm.max(), audio_norm.min())
    dstMel = stft.mel_spectrogram(audio_norm)
    # print(dstMel.shape, dstMel.max(), dstMel.min())

    # mcc = stft.cepstrum_from_audio(audio_norm)
    # print('mcc', mcc.shape, mcc.max(), mcc.min())

    log_MCD = MCD_from_mels(stft, srcMel, dstMel)
    print(log_MCD.data, 'log')

    sqrtDiffF0 = sqDiffF0_from_mels(stft, srcMel, dstMel)
    print(sqrtDiffF0)
    meanSqrtDiffF0 = torch.mean(sqrtDiffF0)
    print(meanSqrtDiffF0.data, '100hz')
def test(hparams, mel, output_path="test.wav", ref_level_db=20,
         magnitude_power=1.5):
    taco_stft = TacotronSTFT(hparams)
    stime = time.time()
    mel_decompress = mel_denormalize(mel).unsqueeze(0)
    mel_decompress = taco_stft.spectral_de_normalize(
        mel_decompress + ref_level_db)**(1 / magnitude_power)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling
    waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :]),
                           taco_stft.stft_fn, 60)
    waveform = waveform[0].data.cpu().numpy()
    waveform = waveform / abs(waveform).max() * 0.99 * 2**15
    waveform = waveform.astype(dtype=np.int16)
    dec_time = time.time() - stime
    len_audio = float(len(waveform)) / float(hparams.sampling_rate)
    # Renamed from `str` to avoid shadowing the builtin
    msg = "audio length: {:.2f} sec, mel_to_wave time: {:.2f}".format(
        len_audio, dec_time)
    print(msg)
    write(output_path, hparams.sampling_rate, waveform)
def inference_texts(model, hp, target_texts, step, model_name, vocoder,
                    waveglow, f_type='mel', _type='train', postnet=True):
    model.eval()
    for param in model.parameters():
        param.requires_grad = False
    sample_rate = 22050
    original_audio, texts = target_texts
    save_target = 'generate/{}-step-{}'.format(model_name, step)
    stft = TacotronSTFT(hp.filter_length, hp.hop_length, hp.win_length,
                        hp.n_mel_channels, hp.sampling_rate, hp.mel_fmin,
                        hp.mel_fmax)
    os.makedirs(save_target, exist_ok=True)
    for i, text in enumerate(texts):
        print(text)
        if original_audio:
            target_name = '{}-target-{}.wav'.format(_type, i)
            path = os.path.join(save_target, target_name)
            shutil.copy2(original_audio[i], path)
        inputs = prepare_inputs(hp, text)
        if torch.cuda.device_count() > 1:
            with torch.no_grad():
                predict = model.module.inference(inputs, postnet=postnet)
        else:
            with torch.no_grad():
                predict = model.inference(inputs, postnet=postnet)
        name = '{}-{}-{}-{}.wav'.format(_type, f_type, i, vocoder)
        path = os.path.join(save_target, name)
        if vocoder == 'griffin_lim':
            mel_decompress = stft.spectral_de_normalize(predict)
            mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
            spec_from_mel_scaling = 1000
            spec_from_mel = torch.mm(mel_decompress[0], stft.mel_basis)
            spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
            spec_from_mel = spec_from_mel * spec_from_mel_scaling
            print(spec_from_mel.size())
            waveform = griffin_lim(
                torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                stft.stft_fn, 60)
            write(path, sample_rate, waveform[0].data.cpu().numpy())
        elif vocoder == 'waveglow' and waveglow:
            with torch.no_grad():
                audio = MAX_WAV_VALUE * waveglow.infer(predict, sigma=1.0)[0]
            audio = audio.cpu().numpy()
            audio = audio.astype('int16')
            write(path, sample_rate, audio)
def inference(args):
    hparams = create_hparams()

    sentences = get_sentences(args)
    # sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)]

    model = load_model(hparams)
    model.load_state_dict(torch.load(args.checkpoint_path)['state_dict'])
    model.cuda().eval()  # .half()

    test_set = TextMelLoaderEval(sentences, hparams)
    test_collate_fn = TextMelCollateEval(hparams)
    # Sample from the evaluation set (the original referenced an undefined
    # valset) and keep evaluation order deterministic.
    test_sampler = DistributedSampler(
        test_set) if hparams.distributed_run else None
    test_loader = DataLoader(test_set, num_workers=0, shuffle=False,
                             sampler=test_sampler,
                             batch_size=hparams.batch_size, pin_memory=False,
                             drop_last=True, collate_fn=test_collate_fn)

    taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)

    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
                batch)
            for j in range(mel_outputs.size(0)):
                # unsqueeze so the (n_mel, T) slice has a batch dimension
                mel_decompress = taco_stft.spectral_de_normalize(
                    mel_outputs_postnet[j].unsqueeze(0))
                mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
                spec_from_mel_scaling = 1000
                spec_from_mel = torch.mm(mel_decompress[0],
                                         taco_stft.mel_basis)
                spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
                spec_from_mel = spec_from_mel * spec_from_mel_scaling

                audio = griffin_lim(
                    torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                    taco_stft.stft_fn, args.griffin_iters)
                audio = audio.squeeze()
                audio = audio.cpu().numpy()
                # audio = audio.astype('int16')
                # audio_path = os.path.join('samples', "{}_synthesis.wav".format(args.out_filename))
                audio_path = os.path.join(
                    args.out_filename,
                    'batch_{}_sentence_{}.wav'.format(i, j))
                write(audio_path, hparams.sampling_rate, audio)
                print(audio_path)
def __init__(self, audio_files, segment_length, filter_length, hop_length,
             win_length, sampling_rate, mel_fmin, mel_fmax):
    self.audio_files = files_to_list(audio_files)
    random.seed(1234)
    random.shuffle(self.audio_files)
    self.stft = TacotronSTFT(filter_length=filter_length,
                             hop_length=hop_length,
                             win_length=win_length,
                             sampling_rate=sampling_rate,
                             mel_fmin=mel_fmin, mel_fmax=mel_fmax)
    self.segment_length = segment_length
    self.sampling_rate = sampling_rate
def get_mel(filename, hparams):
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)
    audio = load_wav_to_torch(filename, hparams.sampling_rate)
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = torch.squeeze(melspec, 0)
    return melspec
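# Quick sanity check for get_mel; the wav path is a placeholder. The result
# has shape (n_mel_channels, frames).
hparams = create_hparams()
melspec = get_mel('audio/sample.wav', hparams)
print(melspec.shape, melspec.min().item(), melspec.max().item())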
def synthesis_griffin_lim(mel, hparams):
    taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)
    mel_decompress = taco_stft.spectral_de_normalize(mel)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling

    waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                           taco_stft.stft_fn, 60)
    return waveform
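# Sketch: vocode a saved mel with synthesis_griffin_lim and write it to disk.
# The .npy path is hypothetical; the mel is assumed to be stored as
# (n_mel_channels, frames) in the model's normalized domain.
from scipy.io.wavfile import write

hparams = create_hparams()
mel = torch.from_numpy(np.load('mels/sample_mel.npy')).unsqueeze(0)
waveform = synthesis_griffin_lim(mel, hparams)
write('sample_gl.wav', hparams.sampling_rate, waveform[0].data.cpu().numpy())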
def __init__(self, ckpt, wglw, n_speakers=123):
    print("[Loading Model]")
    self.ckpt = ckpt
    self.hparams = create_hparams()
    self.hparams.n_speakers = n_speakers
    self.stft = TacotronSTFT(self.hparams.filter_length,
                             self.hparams.hop_length,
                             self.hparams.win_length,
                             self.hparams.n_mel_channels,
                             self.hparams.sampling_rate,
                             self.hparams.mel_fmin, self.hparams.mel_fmax)
    self.mellotron = load_model(self.hparams).cuda().eval()
    self.waveglow = torch.load(wglw)['model'].cuda().eval()
    self.denoiser = Denoiser(self.waveglow).cuda().eval()
    self.arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
    self.mellotron.load_state_dict(torch.load(ckpt)['state_dict'])
    print('[Loaded Model]')
def load_mel(path):
    hparams = create_hparams()
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)
    audio, sampling_rate = librosa.core.load(path, sr=hparams.sampling_rate)
    audio = torch.from_numpy(audio)
    if sampling_rate != hparams.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, stft.sampling_rate))
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = melspec.cpu()
    return melspec
def main(text, checkpoint_path, path, name):
    #### Setup hparams
    hparams = create_hparams("distributed_run=False,mask_padding=False")
    hparams.filter_length = 1024
    hparams.hop_length = 256
    hparams.win_length = 1024

    #### Load model from checkpoint
    model = get_model(hparams, checkpoint_path)

    #### Prepare text input
    sequence = get_input(get_pinyin(text))

    #### inference
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
        sequence, drop_prob=0.25)

    #### tacotron result
    taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)
    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling
    waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                           taco_stft.stft_fn, 60)
    write(
        os.path.join(path, name) + '_tacotron.wav', 16000,
        waveform[0].data.cpu().numpy())

    #### transform tacotron mel to wavenet mel
    wavenet_mel = to_wavenet_mel(mel_outputs_postnet.data.cpu().numpy()[0].T)

    #### save
    np.save(
        os.path.join(path, name) + '_mel.npy',
        mel_outputs_postnet.data.cpu().numpy()[0])
    np.save(
        os.path.join(path, name) + '_alig.npy',
        alignments.data.cpu().numpy()[0])
    np.save(os.path.join(path, name) + '.npy', wavenet_mel)
class TextMelLoader(torch.utils.data.Dataset):
    """
    1) loads audio, text pairs
    2) normalizes text and converts them to sequences of one-hot vectors
    3) computes mel-spectrograms from audio files.
    """
    def __init__(self, audiopaths_and_text, hparams):
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.load_mel_from_disk = hparams.load_mel_from_disk
        self.stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                                 hparams.win_length, hparams.n_mel_channels,
                                 hparams.sampling_rate, hparams.mel_fmin,
                                 hparams.mel_fmax)
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)

    def get_mel_text_pair(self, audiopath_and_text):
        # separate filename and text
        audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
        text = self.get_text(text)
        mel = self.get_mel(audiopath)
        return (text, mel)

    def get_mel(self, filename):
        if not self.load_mel_from_disk:
            audio, sampling_rate = load_wav_to_torch(filename)
            if sampling_rate != self.stft.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.stft.sampling_rate))
            audio_norm = audio / self.max_wav_value
            audio_norm = audio_norm.unsqueeze(0)
            audio_norm = torch.autograd.Variable(audio_norm,
                                                 requires_grad=False)
            melspec = self.stft.mel_spectrogram(audio_norm)
            melspec = torch.squeeze(melspec, 0)
        else:
            melspec = torch.from_numpy(np.load(filename))
            assert melspec.size(0) == self.stft.n_mel_channels, (
                'Mel dimension mismatch: given {}, expected {}'.format(
                    melspec.size(0), self.stft.n_mel_channels))
        return melspec

    def get_text(self, text):
        text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners))
        return text_norm

    def __getitem__(self, index):
        return self.get_mel_text_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)
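# Typical wiring of TextMelLoader into a DataLoader, mirroring the training
# scripts elsewhere in this collection; the filelist path is a placeholder
# and TextMelCollate is assumed to be the companion collate class.
from torch.utils.data import DataLoader

hparams = create_hparams()
trainset = TextMelLoader('filelists/train_filelist.txt', hparams)
collate_fn = TextMelCollate(hparams.n_frames_per_step)
train_loader = DataLoader(trainset, num_workers=1, shuffle=True,
                          batch_size=hparams.batch_size, drop_last=True,
                          collate_fn=collate_fn)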
def prepare_training_data(hparams, out_dir, for_wavenet, for_m2m, dataset):
    mel_dir = os.path.join(out_dir, 'mels')
    wav_dir = os.path.join(out_dir, 'audio')
    os.makedirs(out_dir, exist_ok=True)
    os.makedirs(mel_dir, exist_ok=True)
    os.makedirs(wav_dir, exist_ok=True)
    metadatas = open(os.path.join(dataset, 'metadata.csv'), 'r',
                     encoding='utf-8').readlines()
    audio_paths = []
    sentences = []
    mels = []
    mus = []
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)
    for i, m in enumerate(metadatas):
        audio_path, sentence = m.strip().split('|')
        audio_path = os.path.join(dataset, 'wavs', audio_path)
        sentences.append(sentence)
        audio_paths.append(audio_path)
        audio = get_audio(audio_path)
        # print(audio.shape, audio.max(), audio.min())
        mel = get_mel(stft, audio)
        mels.append(mel)
        # print(mel.shape, mel.max(), mel.min(), mel.size(0))
        audio = audio.data.cpu().numpy()
        # print(len(audio), hparams.hop_length * mel.size(2))
        # Trim or pad so the waveform length is exactly hop_length * n_frames.
        # Note the diff == 0 case must leave audio untouched: audio[:-0]
        # would return an empty array.
        diff = len(audio) - hparams.hop_length * mel.size(2)
        if diff > 0:
            audio = audio[:-diff]
        elif diff < 0:
            audio = np.append(audio, [0.] * -diff)
        # print(len(audio)%hparams.hop_length ==0, len(audio)//mel.size(2) == hparams.hop_length, len(audio), len(audio)//mel.size(2))
        mu = mulaw_quantize(audio)
        mus.append(mu)
        # print(mu.shape, mu.max(), mu.min())
        if i % 100 == 0:
            print(i)
    if for_wavenet:
        save_wavenet_map(out_dir, sentences, mels, mus)
    elif for_m2m:
        save_m2m_metadata(out_dir, sentences, mels)
def generate(self, text=None):
    text = ch2p(text)
    sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()
    mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(
        sequence)
    taco_stft = TacotronSTFT(self.hparams.filter_length,
                             self.hparams.hop_length,
                             self.hparams.win_length,
                             sampling_rate=self.hparams.sampling_rate)
    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling
    waveform = griffin_lim(
        torch.autograd.Variable(spec_from_mel[:, :, :-1]),
        taco_stft.stft_fn, 60)
    return waveform
def run(hparams, output_dir, checkpoint_path, sentence_path, speaker_id,
        trans_con, condition, logvar, cleaner, removing_silence_mel_padding,
        adding_silence_mel_padding, is_GL, is_melout, is_metaout):
    f = open(sentence_path, mode='r', encoding='utf-8-sig')
    sentences = [x.strip() for x in f.readlines()]
    print('All sentences to infer:', sentences)
    f.close()

    os.makedirs(output_dir, exist_ok=True)
    stft = TacotronSTFT(hparams)
    mels = generate_mels(hparams, checkpoint_path, sentences, speaker_id,
                         trans_con, condition, logvar, cleaner,
                         removing_silence_mel_padding,
                         adding_silence_mel_padding, is_GL, output_dir)
    if is_GL:
        mels_to_wavs_GL(hparams, mels, stft, output_dir)

    mel_paths = []
    if is_melout:
        mel_dir = os.path.join(output_dir, 'mels')
        os.makedirs(mel_dir, exist_ok=True)
        for i, mel in enumerate(mels):
            mel_path = os.path.join(mel_dir, "mel-{}.npy".format(i))
            mel_paths.append(mel_path)
            if (list(mel.shape)[1] >=
                    hparams.max_decoder_steps - removing_silence_mel_padding):
                continue
            np.save(mel_path, mel)
    if is_metaout:
        with open(os.path.join(output_dir, 'metadata.csv'), 'w',
                  encoding='utf-8') as file:
            lines = []
            for i, s in enumerate(sentences):
                mel_path = mel_paths[i]
                if (list(mels[i].shape)[1] >=
                        hparams.max_decoder_steps -
                        removing_silence_mel_padding):
                    continue
                lines.append('{}|{}\n'.format(mel_path, s))
            file.writelines(lines)
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin, mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)),
                'constant').data

        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE

        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
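# Sketch of Mel2Samp as a WaveGlow-style dataset; the filelist and the STFT
# settings below are common 22.05 kHz defaults, not values taken from this
# code.
from torch.utils.data import DataLoader

trainset = Mel2Samp('filelists/train_files.txt', segment_length=16000,
                    filter_length=1024, hop_length=256, win_length=1024,
                    sampling_rate=22050, mel_fmin=0.0, mel_fmax=8000.0)
train_loader = DataLoader(trainset, num_workers=4, shuffle=True,
                          batch_size=4, drop_last=True)
mel, audio = trainset[0]  # mel: (n_mel_channels, frames); audio in [-1, 1]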
def infer(checkpoint_path, griffin_iters, text, out_filename):
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.cuda().eval()  # .half()

    sequence = np.array(text_to_sequence(text, ['chinese_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)

    taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)
    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling

    audio = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                        taco_stft.stft_fn, griffin_iters)
    audio = audio.squeeze()
    audio = audio.cpu().numpy()
    # audio = audio.astype('int16')

    audio_path = os.path.join('samples',
                              "{}_synthesis.wav".format(out_filename))
    write(audio_path, hparams.sampling_rate, audio)
    print(audio_path)
    plot_alignment_to_numpy(
        alignments.squeeze().cpu().detach().numpy().T,
        os.path.join('samples', "{}_attention.png".format(out_filename)))
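# Example invocation of infer above; the checkpoint path, iteration count,
# and text are placeholders.
infer('checkpoints/tacotron2_chinese.pt', 60, '你好，世界。', 'hello_world')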
class MelSpectrogramCreator():
    tacotron_stft = TacotronSTFT(hparams.fft_size, hparams.hop_size,
                                 hparams.win_length, hparams.num_mels,
                                 hparams.sample_rate, hparams.fmin,
                                 hparams.fmax)

    @classmethod
    def mel_spectrogram(cls, wav, method):
        if method == 'original':
            mel = audio.logmelspectrogram(wav)
        elif method == 'tacotron':
            wav_tensor = torch.Tensor(wav).unsqueeze(0)
            mel_tensor = cls.tacotron_stft.mel_spectrogram(wav_tensor)
            mel = mel_tensor.squeeze().data.numpy()
        else:
            raise ValueError
        return mel.astype(np.float32).T
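# Hedged comparison of the two spectrogram paths; the wav path is a
# placeholder and hparams.sample_rate is assumed to match the loaded audio.
import librosa

wav, _ = librosa.load('audio/sample.wav', sr=hparams.sample_rate)
mel_taco = MelSpectrogramCreator.mel_spectrogram(wav, 'tacotron')
mel_orig = MelSpectrogramCreator.mel_spectrogram(wav, 'original')
print(mel_taco.shape, mel_orig.shape)  # both (frames, num_mels)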
def generate_mels_by_ref_audio(model, waveglow, hparams, sequence, ref_wav,
                               denoiser, denoiser_strength=0.01,
                               device=torch.device('cpu'), *,
                               outpath='output.wav'):
    # Prepare ref audio input
    ref_audio_mel = load_mel(
        ref_wav,
        TacotronSTFT(hparams.filter_length, hparams.hop_length,
                     hparams.win_length, hparams.n_mel_channels,
                     hparams.sampling_rate, hparams.mel_fmin,
                     hparams.mel_fmax), hparams, device)

    # Decode text input
    mel_outputs, mel_outputs_postnet, _, alignments = \
        model.inference_by_ref_audio(sequence, ref_audio_mel)

    # Plot results
    # plot_data('mel.png', plot_spectrogram_to_numpy(mel_outputs.data.cpu().numpy()[0]))

    # Synthesize audio from spectrogram using WaveGlow
    with torch.no_grad():
        audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
    write(outpath, hparams.sampling_rate, audio[0].data.cpu().numpy())

    # (Optional) Remove WaveGlow bias
    if denoiser_strength > 0:
        audio_denoised = denoiser(audio, strength=denoiser_strength)[:, 0]
        audio_denoised = audio_denoised * hparams.max_wav_value
        write("denoised_output.wav", hparams.sampling_rate,
              audio_denoised.squeeze().cpu().numpy().astype('int16'))
def parameter_experiment():
    tmc = [2, 4, 6, 8, 10, 12, 14, 16]  # time masking chunk
    fmc = [2, 4, 6, 8, 10, 12, 14, 16]  # frequency masking chunk
    # tmn = [(1,8), (2,4), (4,2), (8,1)] # time masking chunk number
    # fmn = [(1,6), (2,3), (3,2), (6,1)] # frequency masking chunk number
    twlr = [2, 4, 6, 8, 10, 12, 14, 16]  # time warping length ratio
    fwl = [2, 4, 6, 8, 10, 12, 14, 16]
    # tlar = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
    flar = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3]  # time length adjust ratio
    var = [2, 4, 8, 16, 32, 64]
    lrtw = [0.4, 0.4, 0.4, 0.4, 0.4, 0.4]  # [0.2, 0.4, 0.8, 1]
    try_nums = np.arange(1, 2)

    hparams = create_hparams()
    stft = TacotronSTFT(hparams)

    # taking filelist about validation data
    with open('./filelists/meta_val.txt', encoding='utf-8-sig') as f:
        files = [x.strip().split('|')[0] for x in f.readlines()]

    # file to mel
    mels = []
    for x in files:
        mel = get_mel(stft, x, hparams, 0).squeeze(0)
        mels.append(mel)
        test(hparams, mel, "./test/test.wav")
        # plot_data(mel, 100)

    # average length of mel
    avg_len = np.average([mel.size(1) for mel in mels])
    print(avg_len)

    # griffin lim
    # os.makedirs('gl', exist_ok=True)
    # for i, mel in enumerate(mels):
    #     path = 'gl' + '/{}.wav'.format(i)
    #     test(hparams, mel, path)

    for try_num in try_nums:
        output_dir = 'try{}'.format(try_num)
        os.makedirs(output_dir, exist_ok=True)

        # making a directory for time warping length rate
        flar_path = output_dir + '/FLAR'
        lrtw_path = output_dir + '/LRTW'

        ## warping part
        # time warping length rate
        # ex = 0
        # for r in lrtw:
        #     dir = lrtw_path + '/{}'.format(r)
        #     os.makedirs(dir, exist_ok=True)
        #     for i, mel in enumerate(mels):
        #         path = dir + '/{}.wav'.format(i)
        #         mel_ = local_random_time_warping(mel, 0.4)
        #         plot_data(mel_, ex)
        #         ex += 1
        #         test(hparams, mel_, path)
        print("--------------------------------------------")
        # for r in twlr:
        #     dir = twlr_path + '/{}'.format(r)
        #     os.makedirs(dir, exist_ok=True)
        #     for i, mel in enumerate(mels):
        #         path = dir + '/{}.wav'.format(i)
        #         mel_ = time_warping(mel, r/100.0)
        #         test(hparams, mel_, path)
        #
        # # frequency warping length
        # for l in fwl:
        #     dir = fwl_path + '/{}'.format(l)
        #     os.makedirs(dir, exist_ok=True)
        #     for i, mel in enumerate(mels):
        #         path = dir + '/{}.wav'.format(i)
        #         mel_ = freq_warping(mel,l)
        #         test(hparams, mel_, path)
        #
        # # time length adjustment rate
        # for r in tlar:
        #     dir = tlar_path + '/{}'.format(r)
        #     os.makedirs(dir, exist_ok=True)
        #     for i, mel in enumerate(mels):
        #         path = dir + '/{}.wav'.format(i)
        #         mel_ = local_random_time_warping(mel, warping_range(r/100.0))
        #         print(mel_.size())
        #         test(hparams, mel_, path)

        ex = 0  # output-file counter; must be initialized before the loop
        for r in flar:
            dir_path = flar_path  # + '/{}'.format(r)
            os.makedirs(dir_path, exist_ok=True)
            for i, mel in enumerate(mels):
                path = dir_path + '/{}.wav'.format(ex)
                mel_ = local_random_freq_warping(mel, r)
                plot_data(mel_, ex)
                ex += 1
                test(hparams, mel_, path)
import librosa
import torch
from torch.utils.data import DataLoader

from model import parse_batch
from configs.two_way_0730 import create_hparams
from train import initiate_model
from waveglow.denoiser import Denoiser
from layers import TacotronSTFT
from data_utils import TextMelLoader, TextMelCollate
from text import cmudict, text_to_sequence
from mellotron_utils import get_data_from_musicxml

hparams = create_hparams()
hparams.batch_size = 1
stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                    hparams.win_length, hparams.n_mel_channels,
                    hparams.sampling_rate, hparams.mel_fmin, hparams.mel_fmax)

speaker = "nes"
checkpoint_path = '/mnt/sdd1/backup_149/checkpoints/supervised/checkpoint_180000'
model = initiate_model(hparams).cuda().eval()
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])

waveglow_path = '/home/admin/projects/mellotron_init_with_single/models/waveglow_256channels_v4.pt'
waveglow = torch.load(waveglow_path)['model'].cuda().eval()
denoiser = Denoiser(waveglow).cuda().eval()
arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')

test_text_path = 'filelists/emotion/neutral2.txt'
test_set = TextMelLoader(test_text_path, hparams)
datacollate = TextMelCollate(1)
# The original call was truncated here; the batch_size/collate_fn arguments
# below are assumed from the surrounding setup.
dataloader = DataLoader(test_set, num_workers=1, shuffle=False,
                        batch_size=hparams.batch_size,
                        collate_fn=datacollate)
# (fragment from a per-utterance metadata loop over id/text pairs)
id = os.path.splitext(id)[0]
clean_char = custom_english_cleaners(text.rstrip())
clean_phone = []
for s in g2p(clean_char.lower()):
    if '@' + s in symbol_to_id:
        clean_phone.append('@' + s)
    else:
        clean_phone.append(s)
metadata[id] = {'char': clean_char, 'phone': clean_phone}

stft = TacotronSTFT(filter_length=1024, hop_length=256, win_length=1024,
                    n_mel_channels=80, sampling_rate=16000, mel_fmin=55.0,
                    mel_fmax=7600.0)

def text2seq(text):
    sequence = [symbol_to_id['^']]
    sequence.extend([symbol_to_id[c] for c in text])
    sequence.append(symbol_to_id['~'])
    return sequence

def get_mel(filename):
    wav, sr = librosa.load(filename, sr=16000)
    # The original trim call was cut off here; top_db=60 is librosa's
    # default and stands in for the missing arguments.
    wav, _ = librosa.effects.trim(wav, top_db=60)
def measure(output_directory, log_directory, checkpoint_path, warm_start,
            n_gpus, rank, group_name, hparams):
    """Handles all the validation scoring and printing"""
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)
    mellotron = load_model(hparams).cuda().eval()
    mellotron.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    waveglow_path = '/media/arsh/New Volume/Models/speech/waveglow_256channels_v4.pt'
    waveglow = torch.load(waveglow_path)['model'].cuda().eval()
    denoiser = Denoiser(waveglow).cuda().eval()
    arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
    audio_paths = 'filelists/libritts_train_clean_100_audiopath_text_sid_atleast5min_val_filelist.txt'
    dataloader = TextMelLoader(audio_paths, hparams)
    datacollate = TextMelCollate(1)
    speaker_ids = TextMelLoader(
        "filelists/libritts_train_clean_100_audiopath_text_sid_shorterthan10s_atleast5min_train_filelist.txt",
        hparams).speaker_ids
    speakers = pd.read_csv('filelists/libritts_speakerinfo.txt',
                           engine='python', header=None, comment=';',
                           sep=r' *\| *',
                           names=['ID', 'SEX', 'SUBSET', 'MINUTES', 'NAME'])
    speakers['MELLOTRON_ID'] = speakers['ID'].apply(
        lambda x: speaker_ids[x] if x in speaker_ids else -1)
    female_speakers = cycle(
        speakers.query("SEX == 'F' and MINUTES > 20 and MELLOTRON_ID >= 0")
        ['MELLOTRON_ID'].sample(frac=1).tolist())
    male_speakers = cycle(
        speakers.query("SEX == 'M' and MINUTES > 20 and MELLOTRON_ID >= 0")
        ['MELLOTRON_ID'].sample(frac=1).tolist())

    file_idx = 0
    MEL_DTW = []
    TPP_DTW = []
    RAND_DTW = []
    logSpecDbConst = 10.0 / math.log(10.0) * math.sqrt(2.0)
    while file_idx < len(dataloader):
        audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]

        # get audio path, encoded text, pitch contour and mel for gst
        text_encoded = torch.LongTensor(
            text_to_sequence(text, hparams.text_cleaners,
                             arpabet_dict))[None, :].cuda()
        pitch_contour = dataloader[file_idx][3][None].cuda()
        mel = load_mel(audio_path, stft)
        fs, audio = read(audio_path)

        # load source data to obtain rhythm using tacotron 2 as a forced aligner
        x, y = mellotron.parse_batch(datacollate([dataloader[file_idx]]))

        with torch.no_grad():
            # get rhythm (alignment map) using tacotron 2
            mel_outputs, mel_outputs_postnet, gate_outputs, rhythm, gst, \
                tpse_gst = mellotron.forward(x)
            rhythm = rhythm.permute(1, 0, 2)

        speaker_id = next(female_speakers) if np.random.randint(2) else next(
            male_speakers)
        speaker_id = torch.LongTensor([speaker_id]).cuda()

        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, gate_outputs, _ = \
                mellotron.inference_noattention(
                    (text_encoded, mel, speaker_id, pitch_contour, rhythm),
                    with_tpse=False)
        with torch.no_grad():
            audio_mel = denoiser(
                waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]

        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, gate_outputs, _ = \
                mellotron.inference_noattention(
                    (text_encoded, mel, speaker_id, pitch_contour, rhythm),
                    with_tpse=True)
        with torch.no_grad():
            audio_tpp = denoiser(
                waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]

        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, gate_outputs, _ = \
                mellotron.inference_noattention(
                    (text_encoded, np.random.randint(0, 9), speaker_id,
                     pitch_contour, rhythm), with_tpse=False)
        with torch.no_grad():
            audio_rand = denoiser(
                waveglow.infer(mel_outputs_postnet, sigma=0.8), 0.01)[:, 0]

        audio = np.pad(audio, 128)
        MEL_DTW.append(
            logSpecDbConst *
            np.log(dtw(audio_mel.data.cpu().numpy(), audio, eucCepDist)[0]))
        TPP_DTW.append(
            logSpecDbConst *
            np.log(dtw(audio_tpp.data.cpu().numpy(), audio, eucCepDist)[0]))
        RAND_DTW.append(
            logSpecDbConst *
            np.log(dtw(audio_rand.data.cpu().numpy(), audio, eucCepDist)[0]))
        print(MEL_DTW[-1], TPP_DTW[-1], RAND_DTW[-1])
        print("MEL DTW, Mean: ", np.mean(MEL_DTW), " SD: ", np.std(MEL_DTW))
        print("TPP DTW, Mean: ", np.mean(TPP_DTW), " SD: ", np.std(TPP_DTW))
        print("RAND DTW, Mean: ", np.mean(RAND_DTW), " SD: ",
              np.std(RAND_DTW))
        file_idx += 1
def infer(output_directory, checkpoint_path, warm_start, hparams,
          debug=False):
    """Inference with teacher forcing

    Params
    ------
    output_directory (string): directory for the spectrograms
    checkpoint_path (string): checkpoint path
    hparams (object): comma separated list of "name=value" pairs.
    """
    os.makedirs(output_directory, exist_ok=True)
    taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    return_file_name = True
    trainset = TextMelLoader(hparams.training_files, hparams,
                             return_file_name=return_file_name)
    collate_fn = TextMelCollate(hparams.n_frames_per_step,
                                return_file_name=return_file_name)
    train_sampler = None
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False, collate_fn=collate_fn)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model,
                                     hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.eval()
    for i, batch in enumerate(train_loader):
        x, y = model.parse_batch(batch[:-1])
        files_name = batch[-1]
        mel_outputs, mel_outputs_postnet, _, alignments = model(x)
        _, _, mel_expected_padded, _, mel_lengths = x
        for idx in range(mel_outputs_postnet.size(0)):
            name = os.path.basename(files_name[idx]).replace(".wav", '')
            mel_padded = mel_outputs_postnet[idx]
            mel_length = mel_lengths[idx]
            mel = mel_padded[:, :mel_length]
            np.save(os.path.join(output_directory, name + '.npy'),
                    mel.detach().cpu().numpy())
            if debug:
                print("Debug Mode ON: Saving Wave files and Spectrograms "
                      "Plot in:", output_directory)
                # plot audios
                librosa.output.write_wav(
                    os.path.join(output_directory, name + '.wav'),
                    spec_to_waveform(taco_stft, mel).detach().cpu().numpy(),
                    sr=hparams.sampling_rate)
                librosa.output.write_wav(
                    os.path.join(output_directory, name + '_padded.wav'),
                    spec_to_waveform(taco_stft,
                                     mel_padded).detach().cpu().numpy(),
                    sr=hparams.sampling_rate)
                librosa.output.write_wav(
                    os.path.join(output_directory,
                                 name + '_expected_padded.wav'),
                    spec_to_waveform(
                        taco_stft,
                        mel_expected_padded[idx]).detach().cpu().numpy(),
                    sr=hparams.sampling_rate)
                # plot figures (the first call was missing its output path;
                # the filename follows the pattern of the other two)
                plot_spectrogram(
                    mel.detach().cpu().numpy(),
                    os.path.join(output_directory, name + '.png'))
                plot_spectrogram(
                    mel_padded.detach().cpu().numpy(),
                    os.path.join(output_directory, name + '_padded.png'))
                plot_spectrogram(
                    mel_expected_padded[idx].detach().cpu().numpy(),
                    os.path.join(output_directory,
                                 name + '_expect_padded.png'))
def taco_stft():
    n_fft, hop_length, win_length = _stft_parameters()
    stft = TacotronSTFT(n_fft, hop_length, win_length)
    return stft
class Synthesizer(object):
    def __init__(self):
        super().__init__()
        self.hparams = create_hparams()
        self.hparams.sampling_rate = 16000
        self.hparams.max_decoder_steps = 600
        self.stft = TacotronSTFT(
            self.hparams.filter_length, self.hparams.hop_length,
            self.hparams.win_length, self.hparams.n_mel_channels,
            self.hparams.sampling_rate, self.hparams.mel_fmin,
            self.hparams.mel_fmax)

    def load_mel(self, path):
        audio, sampling_rate = load_wav_to_torch(path)
        if sampling_rate != self.hparams.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.stft.sampling_rate))
        audio_norm = audio / self.hparams.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = melspec.cuda()
        return melspec

    # def close(self):
    #     tf.reset_default_graph()
    #     self.sess.close()

    def load(self, checkpoint_path, waveglow_path):
        self.model = load_model(self.hparams)
        self.model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
        _ = self.model.eval()
        self.waveglow = torch.load(waveglow_path)['model']
        self.waveglow.cuda()

        path = './web/static/uploads/koemo_spk_emo_all_test.txt'
        with open(path, encoding='utf-8') as f:
            filepaths_and_text = [line.strip().split("|") for line in f]

        base_path = os.path.dirname(checkpoint_path)
        data_path = os.path.basename(checkpoint_path) + '_' + \
            path.rsplit('_', 1)[1].split('.')[0] + '.npz'
        npz_path = os.path.join(base_path, data_path)

        if os.path.exists(npz_path):
            d = np.load(npz_path)
            zs = d['zs']
            emotions = d['emotions']
        else:
            emotions = []
            zs = []
            for audio_path, _, _, emotion in tqdm(filepaths_and_text):
                melspec = self.load_mel(audio_path)
                _, _, _, z = self.model.vae_gst(melspec)
                zs.append(z.cpu().data)
                emotions.append(int(emotion))
            emotions = np.array(emotions)  # must be an ndarray, not a list
            zs = torch.cat(zs, dim=0).data.numpy()
            d = {'zs': zs, 'emotions': emotions}
            np.savez(npz_path, **d)

        self.neu = np.mean(zs[emotions == 0, :], axis=0)
        self.sad = np.mean(zs[emotions == 1, :], axis=0)
        self.ang = np.mean(zs[emotions == 2, :], axis=0)
        self.hap = np.mean(zs[emotions == 3, :], axis=0)

    def synthesize(self, text, path, condition_on_ref, ref_audio, ratios):
        print(ratios)
        sequence = np.array(text_to_sequence(text,
                                             ['korean_cleaners']))[None, :]
        sequence = torch.autograd.Variable(
            torch.from_numpy(sequence)).cuda().long()

        inputs = self.model.parse_input(sequence)
        transcript_embedded_inputs = self.model.transcript_embedding(
            inputs).transpose(1, 2)
        transcript_outputs = self.model.encoder.inference(
            transcript_embedded_inputs)
        print(condition_on_ref)

        if condition_on_ref:
            # ref_audio = '/data1/jinhan/KoreanEmotionSpeech/wav/hap/hap_00000001.wav'
            ref_audio_mel = self.load_mel(ref_audio)
            latent_vector, _, _, _ = self.model.vae_gst(ref_audio_mel)
            latent_vector = latent_vector.unsqueeze(1).expand_as(
                transcript_outputs)
        else:  # condition on emotion ratio
            latent_vector = ratios[0] * self.neu + ratios[1] * self.sad + \
                ratios[2] * self.hap + ratios[3] * self.ang
            latent_vector = torch.FloatTensor(latent_vector).cuda()
            latent_vector = self.model.vae_gst.fc3(latent_vector)

        encoder_outputs = transcript_outputs + latent_vector

        decoder_input = self.model.decoder.get_go_frame(encoder_outputs)
        self.model.decoder.initialize_decoder_states(encoder_outputs,
                                                     mask=None)
        mel_outputs, gate_outputs, alignments = [], [], []
        while True:
            decoder_input = self.model.decoder.prenet(decoder_input)
            mel_output, gate_output, alignment = self.model.decoder.decode(
                decoder_input)

            mel_outputs += [mel_output]
            gate_outputs += [gate_output]
            alignments += [alignment]

            if torch.sigmoid(gate_output.data) > self.hparams.gate_threshold:
                # print(torch.sigmoid(gate_output.data), gate_output.data)
                break
            if len(mel_outputs) == self.hparams.max_decoder_steps:
                print("Warning! Reached max decoder steps")
                break

            decoder_input = mel_output

        mel_outputs, gate_outputs, alignments = \
            self.model.decoder.parse_decoder_outputs(
                mel_outputs, gate_outputs, alignments)
        mel_outputs_postnet = self.model.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet
        # print(mel_outputs_postnet.shape)

        with torch.no_grad():
            # vocode the postnet-refined mel
            synth = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
        # return synth[0].data.cpu().numpy()

        # path = add_postfix(path, idx)
        # print(path)
        librosa.output.write_wav(path, synth[0].data.cpu().numpy(), 16000)