def my_getitem(self, idx):
    """Load one (mel, audio) pair; random-crop a training segment in train mode."""
    wavpath = self.wav_list[idx]
    melpath = wavpath.replace('.wav', '.mel')
    sr, audio = read_wav_np(wavpath)

    # Zero-pad short clips so a full training segment can always be sliced.
    shortfall = self.hp.audio.segment_length + self.hp.audio.pad_short - len(audio)
    if shortfall > 0:
        audio = np.pad(audio, (0, shortfall),
                       mode='constant', constant_values=0.0)

    audio = torch.from_numpy(audio).unsqueeze(0)
    mel = torch.load(melpath).squeeze(0)  # my_tacotron2_model
    # mel = np.load(melpath)
    # mel = torch.from_numpy(mel).squeeze(0)

    if self.train:
        # Pick a random mel window and the matching audio window.
        mel_start = random.randint(0, mel.size(1) - self.mel_segment_length)
        mel = mel[:, mel_start:mel_start + self.mel_segment_length]
        audio_start = mel_start * self.hp.audio.hop_length
        audio = audio[:, audio_start:audio_start + self.hp.audio.segment_length]
        # 1-LSB dither (16-bit scale) so silence is never exactly zero.
        audio = audio + (1 / 32768) * torch.randn_like(audio)

    return mel, audio
def my_getitem(self, idx):
    """Load waveform plus precomputed mel (.npy); random-crop when training.

    Returns:
        mel:   FloatTensor [num_mel, T] (cropped to mel_segment_length in train mode)
        audio: 1-D FloatTensor (segment_length samples in train mode, full clip otherwise)
    """
    wavpath = self.wav_list[idx]
    # Renamed from `id`: never shadow the builtin.
    utt_id = os.path.basename(wavpath).split(".")[0]
    mel_path = "{}/{}.npy".format(self.hp.data.mel_path, utt_id)
    sr, audio = read_wav_np(wavpath)

    # Zero-pad clips too short to yield a full training segment.
    min_len = self.hp.audio.segment_length + self.hp.audio.pad_short
    if len(audio) < min_len:
        audio = np.pad(audio, (0, min_len - len(audio)),
                       mode='constant', constant_values=0.0)

    audio = torch.from_numpy(audio).unsqueeze(0)
    # mel = torch.load(melpath).squeeze(0)  # # [num_mel, T]
    mel = torch.from_numpy(np.load(mel_path))

    if self.train:
        max_mel_start = mel.size(1) - self.mel_segment_length
        mel_start = random.randint(0, max_mel_start)
        mel_end = mel_start + self.mel_segment_length
        mel = mel[:, mel_start:mel_end]
        audio_start = mel_start * self.hp.audio.hop_length
        audio = audio[:, audio_start:audio_start + self.hp.audio.segment_length]
        # 1-LSB dither (16-bit scale) so silence is never exactly zero.
        audio = audio + (1 / 32768) * torch.randn_like(audio)

    # Single squeeze: the original's second .squeeze(0) was a no-op on a 1-D tensor.
    audio = audio.squeeze(0)
    return mel, audio
def load_testset():
    """Yield (filename, text, wav) for the held-out 5% tail of the Blizzard prompts."""
    hp = HParam('./config/blizzard_compressed_experiments.yaml')

    with open(os.path.join(hp.data.path, 'prompts.gui'), 'r') as f:
        lines = f.read().splitlines()
    # prompts.gui layout: filename line, sentence line, extra line — repeating.
    raw_data = list(zip(lines[::3], lines[1::3]))

    # Same seeded shuffle as training, so the trailing 5% is the validation split.
    random.seed(123)
    random.shuffle(raw_data)
    raw_data = raw_data[int(0.95 * len(raw_data)):]

    dataset = []
    for filename, sentence in tqdm(raw_data, total=len(raw_data)):
        wav_path = os.path.join(hp.data.path, 'wavn', filename + '.wav')
        # Keep only clips shorter than the configured maximum duration.
        if get_length(wav_path, hp.audio.sr) < hp.audio.duration:
            dataset.append((wav_path, sentence))

    for wav_path, text in dataset:
        wav = read_wav_np(wav_path, sample_rate=hp.audio.sr)
        yield os.path.basename(wav_path), text, wav
def main(hp, args):
    """Precompute a [num_mel, T] .npy mel for every wav under args.data_path."""
    stft = TacotronSTFT(filter_length=hp.audio.filter_length,
                        hop_length=hp.audio.hop_length,
                        win_length=hp.audio.win_length,
                        n_mel_channels=hp.audio.n_mel_channels,
                        sampling_rate=hp.audio.sampling_rate,
                        mel_fmin=hp.audio.mel_fmin,
                        mel_fmax=hp.audio.mel_fmax)

    wav_files = glob.glob(os.path.join(args.data_path, '**', '*.wav'), recursive=True)
    mel_dir = hp.data.mel_path
    os.makedirs(mel_dir, exist_ok=True)

    for wavpath in tqdm.tqdm(wav_files, desc='preprocess wav to mel'):
        sr, wav = read_wav_np(wavpath)
        assert sr == hp.audio.sampling_rate, \
            "sample rate mismatch. expected %d, got %d at %s" % \
            (hp.audio.sampling_rate, sr, wavpath)

        # Zero-pad clips shorter than one training segment (+ safety margin).
        min_len = hp.audio.segment_length + hp.audio.pad_short
        if len(wav) < min_len:
            wav = np.pad(wav, (0, min_len - len(wav)),
                         mode='constant', constant_values=0.0)

        wav = torch.from_numpy(wav).unsqueeze(0)
        mel = stft.mel_spectrogram(wav).squeeze(0)  # [1, num_mel, T] -> [num_mel, T]
        utt_id = os.path.basename(wavpath).split(".")[0]
        np.save('{}/{}.npy'.format(mel_dir, utt_id), mel.numpy(), allow_pickle=False)
def __getitem__(self, idx):
    """Return the (source, target) tier pair for one utterance."""
    # Read, trim to the configured length, then mel-normalize.
    wav = cut_wav(self.wavlen, read_wav_np(self.file_list[idx]))
    mel = self.melgen.get_normalized_mel(wav)
    return self.tierutil.cut_divide_tiers(mel, self.tier)
def __getitem__(self, idx):
    """Return the (source, target) tier pair for one full-length utterance."""
    wav = read_wav_np(self.file_list[idx], sample_rate=self.hp.audio.sr)
    # NOTE: the clip is used full-length here — cut_wav is intentionally not applied.
    mel = self.melgen.get_normalized_mel(wav)
    source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
    # (Debug-only plotting / tier-interleave round-trip code removed; use the
    # melgen/tierutil helpers directly if reconstruction inspection is needed.)
    return source, target
def __getitem__(self, idx):
    """Return (seq, source, target): encoded text plus the tier-split mel."""
    wav_path, sentence = self.dataset[idx][0], self.dataset[idx][1]
    seq = seq_to_array(sentence)
    wav = read_wav_np(wav_path, sample_rate=self.hp.audio.sr)
    mel = self.melgen.get_normalized_mel(wav)
    source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
    return seq, source, target
def main(hp, args):
    """Compute and cache a .mel tensor next to every wav under args.data_path."""
    stft = TacotronSTFT(filter_length=hp.audio.filter_length,
                        hop_length=hp.audio.hop_length,
                        win_length=hp.audio.win_length,
                        n_mel_channels=hp.audio.n_mel_channels,
                        sampling_rate=hp.audio.sampling_rate,
                        mel_fmin=hp.audio.mel_fmin,
                        mel_fmax=hp.audio.mel_fmax)

    wav_files = glob.glob(os.path.join(args.data_path, '**', '*.wav'), recursive=True)
    for wavpath in tqdm.tqdm(wav_files, desc='preprocess wav to mel'):
        try:
            sr, wav = read_wav_np(wavpath)
        except Exception as e:
            # Was a bare `except:` — keep the best-effort skip of unreadable
            # files, but catch only Exception (not KeyboardInterrupt/SystemExit)
            # and say what was skipped instead of failing silently.
            print('skipping unreadable wav %s (%s)' % (wavpath, e))
            continue
        assert sr == hp.audio.sampling_rate, \
            "sample rate mismatch. expected %d, got %d at %s" % \
            (hp.audio.sampling_rate, sr, wavpath)

        # Zero-pad clips shorter than one training segment (+ safety margin).
        if len(wav) < hp.audio.segment_length + hp.audio.pad_short:
            wav = np.pad(wav, (0, hp.audio.segment_length + hp.audio.pad_short - len(wav)), \
                mode='constant', constant_values=0.0)

        wav = torch.from_numpy(wav).unsqueeze(0)
        mel = stft.mel_spectrogram(wav)
        melpath = wavpath.replace('.wav', '.mel')
        torch.save(mel, melpath)
def get_audio():
    """Yield (filename, wav) for the held-out 5% tail of the shuffled file list."""
    hp = HParam('./config/blizzard_compressed_experiments.yaml')
    file_list = glob.glob(os.path.join(hp.data.path, '**', hp.data.extension),
                          recursive=True)
    # Same seed as the training split, so the trailing 5% matches validation.
    random.seed(123)
    random.shuffle(file_list)
    # Iterate paths directly instead of `for idx in range(len(file_list))`.
    for path in file_list[int(0.95 * len(file_list)):]:
        wav = read_wav_np(path, sample_rate=hp.audio.sr)
        yield os.path.basename(path), wav
def __getitem__(self, idx):
    """Return (seq, source, target) for one utterance.

    Raises:
        ValueError: if hp.data.name is not a supported dataset. Previously an
            unknown name fell through and crashed later with an unbound `seq`.
    """
    text = self.dataset[idx][1]
    if self.hp.data.name == 'KSS':
        seq = text_to_sequence(text)
    elif self.hp.data.name == 'Blizzard':
        seq = process_blizzard(text)
    else:
        raise ValueError('unsupported dataset name: %s' % self.hp.data.name)
    wav = read_wav_np(self.dataset[idx][0], sample_rate=self.hp.audio.sr)
    # wav = cut_wav(self.wavlen, wav)
    mel = self.melgen.get_normalized_mel(wav)
    source, target = self.tierutil.cut_divide_tiers(mel, self.tier)
    return seq, source, target
def main(hp, args):
    """Precompute TTS-style .npy mels for every wav via AudioProcessor."""
    ap = AudioProcessor(sample_rate=22050, num_mels=80, min_level_db=-100,
                        frame_shift_ms=None, frame_length_ms=None,
                        hop_length=256, win_length=1024, ref_level_db=20,
                        fft_size=1024, power=1.5, preemphasis=0.98,
                        signal_norm=True, symmetric_norm=True, max_norm=4.0,
                        mel_fmin=0.0, mel_fmax=8000.0, spec_gain=20.0,
                        stft_pad_mode="reflect", clip_norm=True,
                        griffin_lim_iters=60, do_trim_silence=False, trim_db=60)

    wav_files = glob.glob(os.path.join(args.data_path, '**', '*.wav'), recursive=True)
    mel_path = hp.data.mel_path

    # Create all folders
    os.makedirs(mel_path, exist_ok=True)

    for wavpath in tqdm.tqdm(wav_files, desc='preprocess wav to mel'):
        sr, wav = read_wav_np(wavpath)
        assert sr == hp.audio.sampling_rate, \
            "sample rate mismatch. expected %d, got %d at %s" % \
            (hp.audio.sampling_rate, sr, wavpath)

        # Zero-pad clips shorter than one training segment (+ safety margin).
        if len(wav) < hp.audio.segment_length + hp.audio.pad_short:
            wav = np.pad(wav, (0, hp.audio.segment_length + hp.audio.pad_short - len(wav)), \
                mode='constant', constant_values=0.0)

        # The original round-tripped wav and mel through torch
        # (from_numpy -> unsqueeze(0) -> squeeze(0) -> numpy) — exact inverses,
        # so operate on the numpy array directly.
        mel = np.float32(ap.melspectrogram(wav))  # [num_mel, T]
        utt_id = os.path.basename(wavpath).split(".")[0]
        np.save('{}/{}.npy'.format(mel_path, utt_id), mel, allow_pickle=False)
def main(hp, args):
    """Split wavs into train/val folders (first 300 shuffled go to val) and
    cache a .mel tensor alongside each copied wav."""
    stft = TacotronSTFT(filter_length=hp.audio.filter_length,
                        hop_length=hp.audio.hop_length,
                        win_length=hp.audio.win_length,
                        n_mel_channels=hp.audio.n_mel_channels,
                        sampling_rate=hp.audio.sampling_rate,
                        mel_fmin=hp.audio.mel_fmin,
                        mel_fmax=hp.audio.mel_fmax)

    wav_files = glob.glob(os.path.join(args.data_path, '**', '*.wav'), recursive=True)
    save_train_mel_path = 'melgan_train_mel_data'
    save_val_mel_path = 'melgan_val_mel_data'
    os.makedirs(save_train_mel_path, exist_ok=True)
    os.makedirs(save_val_mel_path, exist_ok=True)

    random.shuffle(wav_files)
    count = 0
    for wavpath in tqdm.tqdm(wav_files, desc='preprocess wav to mel'):
        sr, wav = read_wav_np(wavpath)
        assert sr == hp.audio.sampling_rate, \
            "sample rate mismatch. expected %d, got %d at %s" % \
            (hp.audio.sampling_rate, sr, wavpath)

        # Zero-pad clips shorter than one training segment (+ safety margin).
        if len(wav) < hp.audio.segment_length + hp.audio.pad_short:
            wav = np.pad(wav, (0, hp.audio.segment_length + hp.audio.pad_short - len(wav)), \
                mode='constant', constant_values=0.0)

        wav = torch.from_numpy(wav).unsqueeze(0)
        mel = stft.mel_spectrogram(wav)

        # os.path.basename instead of `path.split('/')[5]`: the old indexing
        # broke for any data_path at a different directory depth and on
        # Windows path separators.
        wav_name = os.path.basename(wavpath)
        mel_name = wav_name.replace('.wav', '.mel')

        dest = save_val_mel_path if count < 300 else save_train_mel_path
        torch.save(mel, os.path.join(dest, mel_name))
        shutil.copy(wavpath, os.path.join(dest, wav_name))
        count += 1
def my_getitem(self, idx):
    """Load one (mel, audio) pair; random-crop a training segment in train mode."""
    wavpath = self.wav_list[idx]
    melpath = wavpath.replace('.wav', '.mel')
    sr, audio = read_wav_np(wavpath)

    # Consistency fix: pad short clips the way the sibling loader does;
    # without it, `random.randint` below gets a negative upper bound (crash)
    # or the audio slice runs past the end of the clip.
    min_len = self.hp.audio.segment_length + self.hp.audio.pad_short
    if len(audio) < min_len:
        audio = np.pad(audio, (0, min_len - len(audio)),
                       mode='constant', constant_values=0.0)

    audio = torch.from_numpy(audio).unsqueeze(0)
    mel = torch.load(melpath).squeeze(0)

    if self.train:
        max_mel_start = mel.size(1) - self.mel_segment_length
        mel_start = random.randint(0, max_mel_start)
        mel_end = mel_start + self.mel_segment_length
        mel = mel[:, mel_start:mel_end]
        audio_start = mel_start * self.hp.audio.hop_length
        audio = audio[:, audio_start:audio_start + self.hp.audio.segment_length]
        # 1-LSB dither (16-bit scale) so silence is never exactly zero.
        audio = audio + (1 / 32768) * torch.randn_like(audio)

    return mel, audio
def main(hp, args):
    """Compute and cache a .mel tensor next to every wav under args.data_path."""
    stft = TacotronSTFT(filter_length=hp.audio.filter_length,
                        hop_length=hp.audio.hop_length,
                        win_length=hp.audio.win_length,
                        n_mel_channels=hp.audio.n_mel_channels,
                        sampling_rate=hp.audio.sampling_rate,
                        mel_fmin=hp.audio.mel_fmin,
                        mel_fmax=hp.audio.mel_fmax)

    wav_files = glob.glob(os.path.join(args.data_path, '**', '*.wav'), recursive=True)
    for wavpath in tqdm.tqdm(wav_files, desc='preprocess wav to mel'):
        sr, wav = read_wav_np(wavpath)
        assert sr == hp.audio.sampling_rate, \
            "sample rate mismatch. expected %d, got %d at %s" % \
            (hp.audio.sampling_rate, sr, wavpath)

        # Consistency fix: the sibling preprocessors pad clips shorter than one
        # training segment; without it short clips produce mels the training
        # loader cannot slice a full segment from.
        if len(wav) < hp.audio.segment_length + hp.audio.pad_short:
            wav = np.pad(wav, (0, hp.audio.segment_length + hp.audio.pad_short - len(wav)),
                         mode='constant', constant_values=0.0)

        wav = torch.from_numpy(wav).unsqueeze(0)
        mel = stft.mel_spectrogram(wav)
        melpath = wavpath.replace('.wav', '.mel')
        torch.save(mel, melpath)
def main(args, hp):
    """Dump a [num_mel, T] .npy mel for every wav under args.data_path."""
    stft = TacotronSTFT(filter_length=hp.filter_length,
                        hop_length=hp.hop_length,
                        win_length=hp.win_length,
                        n_mel_channels=hp.n_mel_channels,
                        sampling_rate=hp.sampling_rate,
                        mel_fmin=hp.mel_fmin,
                        mel_fmax=hp.mel_fmax)

    out_dir = hp.data_path
    os.makedirs(out_dir, exist_ok=True)
    print("Sample Rate : ", hp.sampling_rate)

    wav_files = glob.glob(os.path.join(args.data_path, '**', '*.wav'), recursive=True)
    for wavpath in tqdm.tqdm(wav_files, desc='preprocess wav to mel'):
        sr, wav = read_wav_np(wavpath, hp.sampling_rate)
        wav_tensor = torch.from_numpy(wav).unsqueeze(0)
        mel, mag = stft.mel_spectrogram(wav_tensor)  # mel [1, 80, T] mag [1, num_mag, T]
        mel = mel.squeeze(0)  # [num_mel, T]
        utt_id = os.path.basename(wavpath).split(".")[0]
        np.save('{}/{}.npy'.format(out_dir, utt_id), mel.numpy(), allow_pickle=False)
def main(args):
    """Extract mel/energy/pitch per wav, then text/duration/symbol features
    from the alignment file, all saved as per-utterance .npy files."""
    stft = TacotronSTFT(filter_length=hp.n_fft,
                        hop_length=hp.hop_length,
                        win_length=hp.win_length,
                        n_mel_channels=hp.n_mels,
                        sampling_rate=hp.sampling_rate,
                        mel_fmin=hp.fmin,
                        mel_fmax=hp.fmax)

    # wav_file loacation
    wav_files = glob.glob(os.path.join(args.wav_root_path, '**', '*.wav'), recursive=True)

    # Define all the paths corresponding to each feature kind.
    text_path = os.path.join(hp.data_path, 'text')
    mel_path = os.path.join(hp.data_path, 'mels')
    duration_path = os.path.join(hp.data_path, 'alignment')
    energy_path = os.path.join(hp.data_path, 'energy')
    pitch_path = os.path.join(hp.data_path, 'pitch')
    symbol_path = os.path.join(hp.data_path, 'symbol')
    for d in (text_path, duration_path, mel_path, energy_path, pitch_path, symbol_path):
        os.makedirs(d, exist_ok=True)

    for wavpath in tqdm.tqdm(wav_files, desc='preprocess wav to mel, energy, and pitch'):
        sr, wav = read_wav_np(wavpath)
        p = pitch(wav)  # [T, ] T = Number of frames
        wav = torch.from_numpy(wav).unsqueeze(0)
        mel, mag = stft.mel_spectrogram(wav)  # mel [1, 80, T] mag [1, num_mag, T]
        mel = mel.squeeze(0)  # [num_mel, T]
        mag = mag.squeeze(0)  # [num_mag, T]
        e = torch.norm(mag, dim=0)  # [T, ] per-frame energy
        # Trim pitch to the mel frame count before saving.
        p = np.array(p[:mel.shape[1]], dtype='float32')
        # Renamed from `id`: never shadow the builtin.
        utt_id = os.path.basename(wavpath).split(".")[0]

        # save the features
        np.save('{}/{}.npy'.format(mel_path, utt_id), mel.numpy(), allow_pickle=False)
        np.save('{}/{}.npy'.format(energy_path, utt_id), e.numpy(), allow_pickle=False)
        np.save('{}/{}.npy'.format(pitch_path, utt_id), p, allow_pickle=False)

    # os.path.join handles a missing trailing slash on filelist_alignment_dir;
    # the old string concatenation silently produced a wrong path in that case.
    with open(os.path.join(hp.filelist_alignment_dir, "alignment.txt"), encoding='utf-8') as f:
        # add all 13100 examples to filelist.txt
        for lines in f:
            content = lines.split('|')
            utt_id = content[4].split()[0].split('.')[0]
            if not os.path.exists(os.path.join(args.wav_root_path, utt_id + '.wav')):
                continue
            text = content[0]
            dur = np.array(content[2].split(), dtype='float32')
            phoneme = content[3]
            symbol_sequence = phonemes_to_sequence(phoneme)
            # NOTE(review): numpy coerces (text, phoneme) to a unicode array so
            # allow_pickle=False works — confirm the downstream loader expects that.
            np.save('{}/{}.npy'.format(text_path, utt_id), (text, phoneme),
                    allow_pickle=False)
            np.save('{}/{}.npy'.format(duration_path, utt_id), dur, allow_pickle=False)
            np.save('{}/{}.npy'.format(symbol_path, utt_id), symbol_sequence,
                    allow_pickle=False)
# NOTE(review): the next four statements use final_mel, target_mel and filename
# before this excerpt assigns them — they appear to be the tail of a
# save_images(final_mel, target_mel, filename) helper whose `def` line is
# outside this view (see the commented-out call below). Confirm against the
# full file before reformatting.
spectrogram1 = plot_spectrogram_to_numpy(final_mel)
# plot_spectrogram_to_numpy presumably returns a CHW image array; transpose to HWC for imsave.
plt.imsave(os.path.join('validation_tests', filename + '_generated.png'), spectrogram1.transpose((1, 2, 0)))
spectrogram2 = plot_spectrogram_to_numpy(target_mel)
plt.imsave(os.path.join('validation_tests', filename + '_target.png'), spectrogram2.transpose((1, 2, 0)))
# breakdown = None
# audio_files = get_audio()
# for filename, wav in audio_files:
#     breakdown = deconstruct_audio(wav)
#     reconstruct_audio(filename, breakdown)
#     hp = HParam('./config/blizzard_alldata_v5.yaml')
#     melgen = MelGen(hp)
#     melgen.save_audio('original_'+filename, wav)
#     print('')
#     print('')
#     break
# First deconstruct the wav file
filename = os.path.basename(WAV_FILE)
wav = read_wav_np(WAV_FILE, sample_rate=22050)
tier_to_breakdown = deconstruct_audio(wav)
# Now run inference
final_mel = run_inference(SENTENCE, timestep, tier_to_breakdown)
# target_mel = tier_to_breakdown[7][0]
# save_images(final_mel, target_mel, filename)
# save_audio('generated_'+filename, final_mel)
# save_audio('target_'+filename, target_mel)