def load_mel(path):
    """Compute a mel spectrogram from a wav file. Assumes torch plus the
    repo's TacotronSTFT and load_wav_to_torch helpers are imported."""
    stft = TacotronSTFT()
    audio, sampling_rate = load_wav_to_torch(path)
    if sampling_rate != 16000:  # hard-coded target; assumes stft.sampling_rate is 16000 too
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, stft.sampling_rate))
    audio_norm = audio / 32768.0  # hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)  # add batch dimension
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    # melspec = melspec.cuda()
    melspec = torch.squeeze(melspec, 0)
    return melspec
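# A minimal usage sketch, not part of the original source: compute a mel
# with load_mel() above and cache it as .npy; "sample.wav" is a placeholder
# for any 16 kHz mono wav.
import numpy as np

mel = load_mel("sample.wav")            # (n_mel_channels, n_frames)
np.save("sample_mel.npy", mel.numpy())  # reusable by load-from-disk datasets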
def get_mel_for_test(file_path):
    """
    Library: from common.layers import TacotronSTFT
    """
    audio, sr = load_wav_to_torch(file_path)
    audio_norm = audio / MAX_WAV_VALUE
    audio_norm = audio_norm.unsqueeze(0)  # add batch dimension
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    stft = TacotronSTFT(filter_length=1024,
                        hop_length=160,
                        win_length=1024,
                        sampling_rate=sr,
                        mel_fmin=0.0,
                        mel_fmax=8000.0)
    melspec = stft.mel_spectrogram(audio_norm)
    return melspec
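# Sanity-check sketch (assumed, not from the source): with hop_length=160,
# a 16 kHz input yields one mel frame per 10 ms, i.e. roughly
# num_samples / 160 frames. "test.wav" is a placeholder path.
melspec = get_mel_for_test("test.wav")  # (1, n_mel_channels, n_frames)
print(melspec.size(2))                  # ~ len(audio) / 160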
class TextMelLoader(torch.utils.data.Dataset):
    """
    1) loads audio, text pairs
    2) normalizes text and converts it to sequences of one-hot vectors
    3) computes mel-spectrograms from audio files
    """
    def __init__(self, args, files):
        self.audiopaths_and_text = load_filepaths_and_text(
            args.dataset_path, files)
        self.text_cleaners = args.text_cleaners
        self.max_wav_value = args.max_wav_value
        self.sampling_rate = args.sampling_rate
        # self.load_mel_from_disk = args.load_mel_from_disk
        self.stft = TacotronSTFT(args.filter_length, args.hop_length,
                                 args.win_length, args.n_mel_channels,
                                 args.sampling_rate, args.mel_fmin,
                                 args.mel_fmax)
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)

    def get_mel_text_pair(self, audiopath_and_text):
        # separate filename and text
        audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
        text_len = len(text)
        text = self.get_text(text)
        mel = self.get_mel(audiopath)
        return (text, mel, text_len)

    def get_mel(self, filename):
        if False:  # not self.load_mel_from_disk: on-the-fly mel computation is disabled
            audio, sampling_rate = load_wav_to_torch(filename)
            if sampling_rate != self.stft.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.stft.sampling_rate))
            audio_norm = audio / self.max_wav_value
            audio_norm = audio_norm.unsqueeze(0)
            melspec = self.stft.mel_spectrogram(audio_norm)
            melspec = torch.squeeze(melspec, 0)
        else:
            # mels are precomputed .npy files stored as (n_frames, n_mels)
            melspec = torch.from_numpy(np.load(filename).T)
        assert melspec.size(0) == self.stft.n_mel_channels, (
            'Mel dimension mismatch: given {}, expected {}'.format(
                melspec.size(0), self.stft.n_mel_channels))
        return melspec

    def get_text(self, text):
        return text_to_sequence(text, self.text_cleaners)

    def __getitem__(self, index):
        return self.get_mel_text_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)
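# Minimal usage sketch; `args` is assumed to come from the repo's training
# argument parser and "train_files.txt" is a placeholder file list. Items
# are variable-length, so a real training loop would add a padding
# collate_fn before batching with a DataLoader.
dataset = TextMelLoader(args, "train_files.txt")
text, mel, text_len = dataset[0]
print(len(text), mel.shape, text_len)   # mel: (n_mel_channels, n_frames)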
def apply_griffin_lim(args, mel):
    taco_stft = TacotronSTFT(
        args.filter_length,
        args.hop_length,
        args.win_length,
        80,  # n_mel_channels
        args.sampling_rate,
    )
    with torch.no_grad():
        # undo the log-compression applied when the mel was computed
        mel_decompress = taco_stft.spectral_de_normalize(mel)
        # mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
        mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
        # project the mel back to a linear-frequency spectrogram
        spec_from_mel_scaling = 1000
        spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
        spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
        spec_from_mel = spec_from_mel * spec_from_mel_scaling
        audios = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                             taco_stft.stft_fn, args.griffin_iters)
        audios = audios.squeeze()
        audios = [audios.cpu()]
    return audios
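# Hedged usage sketch: vocode a mel with the Griffin-Lim helper above and
# write the result to disk. Assumes `args` carries the repo's STFT settings,
# `mel` is a (1, n_mel_channels, n_frames) tensor, and scipy is available;
# the output filename is a placeholder.
from scipy.io.wavfile import write

audios = apply_griffin_lim(args, mel)
write("griffin_lim_eval.wav", args.sampling_rate, audios[0].numpy())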
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, segment_length, filter_length, hop_length, win_length,
                 sampling_rate, mel_fmin, mel_fmax, h5_melfile):
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        # round the segment up to a whole number of mel frames
        self.segment_framelength = math.ceil(segment_length / hop_length)
        self.segment_length = self.segment_framelength * hop_length
        self.hop_length = hop_length
        self.sampling_rate = sampling_rate
        self.h5_melfile = h5_melfile
        self.h5_mel = None  # opened lazily so each worker gets its own handle

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        if self.h5_mel is None:
            self.h5_mel = h5py.File(self.h5_melfile, "r")
        audio_gp = self.h5_mel[str(index)]["24k"]
        # take a random fixed-length crop of the utterance
        audio_start = random.randint(0,
                                     audio_gp.shape[0] - self.segment_length)
        audio = torch.FloatTensor(audio_gp[audio_start:audio_start +
                                           self.segment_length])
        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE
        return (mel, audio)

    def __len__(self):
        # the original returned len(self.audio_files), which this class never
        # defines; count the per-utterance groups in the HDF5 file instead
        if self.h5_mel is None:
            self.h5_mel = h5py.File(self.h5_melfile, "r")
        return len(self.h5_mel)
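# A minimal DataLoader sketch, not from the source: the parameter values
# below are illustrative and "mels_24k.h5" is a hypothetical HDF5 file laid
# out as "<index>/24k" raw-sample datasets, matching __getitem__ above.
# The lazy h5py open means each worker process opens its own file handle.
from torch.utils.data import DataLoader

dataset = Mel2Samp(segment_length=16000, filter_length=1024, hop_length=256,
                   win_length=1024, sampling_rate=24000, mel_fmin=0.0,
                   mel_fmax=12000.0, h5_melfile="mels_24k.h5")
loader = DataLoader(dataset, batch_size=4, num_workers=2)
mel, audio = next(iter(loader))  # mel: (4, n_mels, frames); audio in [-1, 1]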
def main():
    """
    Launches Tacotron 2 inference in ground-truth-aligned (GTA) mode: a
    synthesized mel is saved for each training utterance. Inference is
    executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_training_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE,
                           iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file,
                         logging_scope=dllg.TRAIN_ITER_SCOPE,
                         iteration_interval=1)
    ])
    LOGGER.register_metric("tacotron2_frames_per_sec",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("tacotron2_latency",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)

    model, args = load_and_setup_model(parser, args)

    log_hardware()
    log_args(args)

    os.makedirs(args.output_dir, exist_ok=True)

    LOGGER.iteration_start()

    measurements = {}

    anchor_dirs = [
        os.path.join(args.dataset_path, anchor)
        for anchor in args.training_anchor_dirs
    ]
    metadatas = [load_metadata(anchor) for anchor in anchor_dirs]
    stft = TacotronSTFT(args.filter_length, args.hop_length, args.win_length,
                        args.n_mel_channels, args.sampling_rate,
                        args.mel_fmin, args.mel_fmax)
    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        for speaker_id in range(len(anchor_dirs)):
            metadata = metadatas[speaker_id]
            for npy_path, text in tqdm(metadata):
                seq = text_to_sequence(text, speaker_id, ['basic_cleaners'])
                seqs = torch.from_numpy(np.stack(seq)).unsqueeze(0)
                seq_lens = torch.IntTensor([len(text)])
                wav = load_wav_to_torch(npy_path)
                mel = stft.mel_spectrogram(wav.unsqueeze(0))
                mel = mel.squeeze()
                # pad the target mel to a multiple of n_frames_per_step
                max_target_len = mel.size(1) - 1
                max_target_len += args.n_frames_per_step - max_target_len % args.n_frames_per_step
                padded_mel = np.pad(
                    mel, [(0, 0), (0, max_target_len - mel.size(1))],
                    mode='constant',
                    constant_values=args.mel_pad_val)
                # keep every n_frames_per_step-th frame as the decoder target
                target = padded_mel[:, ::args.n_frames_per_step]
                targets = torch.from_numpy(np.stack(target)).unsqueeze(0)
                target_lengths = torch.IntTensor([target.shape[1]])
                outputs = model.infer(
                    to_gpu(seqs).long(), to_gpu(seq_lens).int(),
                    to_gpu(targets).half(), to_gpu(target_lengths).int())
                _, mel_out, _, _ = [
                    output.cpu() for output in outputs if output is not None
                ]
                # trim to the reference length and save the GTA mel
                mel_out = mel_out.squeeze()[:, :mel.size(-1) - 1]
                assert (mel_out.shape[-1] == wav.shape[-1] // args.hop_length)
                fname = os.path.basename(npy_path)
                np.save(os.path.join(args.output_dir, fname),
                        mel_out,
                        allow_pickle=False)
                # GTA synthesis
                # magnitudes = stft.inv_mel_spectrogram(mel_out.squeeze())
                # wav = griffin_lim(magnitudes, stft.stft_fn, 60)
                # save_wav(wav, os.path.join(args.output_dir, 'eval.wav'))

    LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time'])
    LOGGER.log(key="latency", value=(measurements['tacotron2_time']))
    LOGGER.iteration_stop()
    LOGGER.finish()
def main():
    """
    Launches text to speech (inference). Inference is executed on a single
    GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_training_args(parser)
    parser = parse_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE,
                           iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file,
                         logging_scope=dllg.TRAIN_ITER_SCOPE,
                         iteration_interval=1)
    ])
    LOGGER.register_metric("tacotron2_frames_per_sec",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("tacotron2_latency",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)

    model, args = load_and_setup_model(parser, args)

    log_hardware()
    log_args(args)

    # read the input sentences; fall back to GBK for non-UTF-8 files
    try:
        with open(args.input_file) as f:
            sentences = [line.strip() for line in f]
    except UnicodeDecodeError:
        with open(args.input_file, encoding='gbk') as f:
            sentences = [line.strip() for line in f]

    os.makedirs(args.output_dir, exist_ok=True)

    LOGGER.iteration_start()

    measurements = {}

    sequences, text_lengths, ids_sorted_decreasing = prepare_input_sequence(
        sentences, args.speaker_id)
    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        outputs = model.infer(sequences, text_lengths)
        _, mels, _, _, mel_lengths = [output.cpu() for output in outputs]

    tacotron2_infer_perf = mels.size(0) * mels.size(
        2) / measurements['tacotron2_time']

    LOGGER.log(key="tacotron2_frames_per_sec", value=tacotron2_infer_perf)
    LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time'])
    LOGGER.log(key="latency", value=(measurements['tacotron2_time']))
    LOGGER.iteration_stop()
    LOGGER.finish()

    # recover the original sentence order and concatenate the mels
    stft = TacotronSTFT(args.filter_length, args.hop_length, args.win_length,
                        args.n_mel_channels, args.sampling_rate,
                        args.mel_fmin, args.mel_fmax)
    ids_sorted_decreasing = ids_sorted_decreasing.numpy().tolist()
    mels = [mel[:, :length] for mel, length in zip(mels, mel_lengths)]
    mels = [
        mels[ids_sorted_decreasing.index(i)]
        for i in range(len(ids_sorted_decreasing))
    ]
    magnitudes = stft.inv_mel_spectrogram(torch.cat(mels, axis=-1))
    wav = griffin_lim(magnitudes, stft.stft_fn)
    save_wav(wav, os.path.join(args.output_dir, 'eval.wav'))
    np.save(os.path.join(args.output_dir, 'eval.npy'),
            np.concatenate(mels, axis=-1),
            allow_pickle=False)
class TextMelDataset(torch.utils.data.Dataset):
    """
    1) loads audio, text pairs
    2) normalizes text and converts it to sequences of one-hot vectors
    3) computes mel-spectrograms from audio files
    """
    def __init__(self, args, anchor_dirs):
        self.speaker_num = len(anchor_dirs)
        self.meta_dirs = [
            os.path.join(args.dataset_path, anchor_dirs[i])
            for i in range(self.speaker_num)
        ]
        self.metadatas = [
            load_metadata(meta_dir) for meta_dir in self.meta_dirs
        ]
        self.offsets = [0] * self.speaker_num
        self.text_cleaners = args.text_cleaners
        self.sampling_rate = args.sampling_rate
        self.load_mel_from_disk = args.load_mel_from_disk
        self.stft = TacotronSTFT(args.filter_length, args.hop_length,
                                 args.win_length, args.n_mel_channels,
                                 args.sampling_rate, args.mel_fmin,
                                 args.mel_fmax)
        random.seed(1234)
        for i in range(self.speaker_num):
            random.shuffle(self.metadatas[i])

    def get_mel_text_pair(self, speaker_id, metadata):
        mel_path, text = metadata
        seq_len = len(text)
        seq = self.get_sequence(text, speaker_id)
        mel = self.get_mel(mel_path)
        return (seq, mel, seq_len)

    def get_mel(self, filename):
        if not self.load_mel_from_disk:
            audio = load_wav_to_torch(filename)
            melspec = self.stft.mel_spectrogram(audio.unsqueeze(0))
            melspec = torch.squeeze(melspec, 0)
        else:
            melspec = torch.from_numpy(np.load(filename))
        assert melspec.size(0) == self.stft.n_mel_channels, (
            'Mel dimension mismatch: given {}, expected {}'.format(
                melspec.size(0), self.stft.n_mel_channels))
        return melspec

    def get_sequence(self, text, speaker_id):
        return text_to_sequence(text, speaker_id, self.text_cleaners)

    def __getitem__(self, index):
        # one example per speaker, advancing a round-robin offset per speaker
        group = [
            self.get_mel_text_pair(i, self.metadatas[i][self.offsets[i]])
            for i in range(self.speaker_num)
        ]
        self.offsets = [(self.offsets[i] + 1) % len(self.metadatas[i])
                        for i in range(self.speaker_num)]
        return group

    def __len__(self):
        return sum([len(m) for m in self.metadatas]) // self.speaker_num
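# Usage sketch (hypothetical names): each __getitem__ returns one
# (seq, mel, seq_len) triple per speaker, drawn round-robin, so a "batch"
# here is a list of per-speaker examples. `args` is assumed to come from
# the surrounding training script.
dataset = TextMelDataset(args, args.training_anchor_dirs)
group = dataset[0]
for seq, mel, seq_len in group:
    print(len(seq), mel.shape, seq_len)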
denoiser = Denoiser(waveglow_for_denoiser, mode=denoiser_mode)
denoiser_strength = 0.005
# End of parameters

logging.debug('Tacotron: %s', checkpoint_path)
logging.debug('Waveglow: %s', waveglow_path)
logging.debug('AM: SI model')
logging.debug('is_clip: %d', is_clip)
logging.debug('Fs: %d', fs)
logging.debug('Sigma: %f', waveglow_sigma)
logging.debug('Denoiser strength: %f', denoiser_strength)
logging.debug('Denoiser mode: %s', denoiser_mode)

hparams = create_hparams_stage()
taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                         hparams.win_length, hparams.n_acoustic_feat_dims,
                         hparams.sampling_rate, hparams.mel_fmin,
                         hparams.mel_fmax)

# Load models.
tacotron_model = load_model(hparams)
tacotron_model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
_ = tacotron_model.eval()
waveglow_model = load_waveglow_model(waveglow_path)

deps = ppg.DependenciesPPG()
if os.path.isfile(teacher_utt_path):
    logging.info('Perform AC on %s', teacher_utt_path)
    # convert the teacher utterance: PPG features -> mel (Tacotron) ->
    # waveform (WaveGlow)
    teacher_ppg = get_ppg(teacher_utt_path, deps)
    ac_mel = get_inference(teacher_ppg, tacotron_model, is_clip)
    ac_wav = waveglow_audio(ac_mel, waveglow_model, waveglow_sigma, True)