def get_mel(self, filename):
    """Return the mel spectrogram for *filename*.

    If ``self.load_mel_from_disk`` is false, the wav file is read, peak
    normalized by ``self.max_wav_value`` and converted through
    ``self.stft.mel_spectrogram``; otherwise a precomputed mel tensor is
    loaded directly with ``torch.load``.

    Args:
        filename: path to a wav file, or to a saved mel tensor when
            ``self.load_mel_from_disk`` is set.

    Returns:
        torch.Tensor of shape ``(n_mel_channels, n_frames)``.

    Raises:
        ValueError: if the wav sample rate differs from the STFT target rate.
    """
    if not self.load_mel_from_disk:
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.stft.sampling_rate:
            # Bug fix: the format string has three placeholders but was given
            # only two arguments, so raising this error crashed with
            # IndexError instead. The first placeholder is the filename.
            raise ValueError("{} {} SR doesn't match target {} SR".format(
                filename, sampling_rate, self.stft.sampling_rate))
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        # torch.autograd.Variable is deprecated and a no-op since PyTorch 0.4;
        # freshly loaded tensors already have requires_grad=False.
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
    else:
        melspec = torch.load(filename)
        assert melspec.size(0) == self.stft.n_mel_channels, (
            'Mel dimension mismatch: given {}, expected {}'.format(
                melspec.size(0), self.stft.n_mel_channels))
    return melspec
def get_mel_audio_pair(self, filename):
    """Load *filename* and return a ``(mel, audio, length)`` training triple.

    The waveform is cropped (random start) or zero-padded to exactly
    ``self.segment_length`` samples, normalized by ``self.max_wav_value``,
    and converted to a mel spectrogram with ``self.stft``.

    Args:
        filename: path to a wav file; resampled on load to
            ``self.sampling_rate`` by ``load_wav_to_torch``.

    Returns:
        Tuple of (mel spectrogram tensor, normalized 1-D audio tensor of
        ``self.segment_length`` samples, segment length in samples).

    Raises:
        ValueError: if the loaded sample rate differs from the STFT target.
    """
    audio, sampling_rate = load_wav_to_torch(filename, sr=self.sampling_rate)
    if sampling_rate != self.stft.sampling_rate:
        raise ValueError(f"{sampling_rate} SR doesn't match target {self.stft.sampling_rate} SR")

    # Take a random fixed-length segment so every batch item has exactly
    # self.segment_length samples; short clips are right-padded with zeros.
    if audio.size(0) >= self.segment_length:
        max_audio_start = audio.size(0) - self.segment_length
        audio_start = random.randint(0, max_audio_start)
        audio = audio[audio_start:audio_start + self.segment_length]
    else:
        audio = torch.nn.functional.pad(
            audio, (0, self.segment_length - audio.size(0)), 'constant').data

    audio = audio / self.max_wav_value
    # Fix: dropped the deprecated torch.autograd.Variable wrapper — it has
    # been a no-op since PyTorch 0.4 and requires_grad is already False here.
    audio_norm = audio.unsqueeze(0)
    melspec = self.stft.mel_spectrogram(audio_norm)
    melspec = melspec.squeeze(0)
    return (melspec, audio, len(audio))
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    # Parse CLI args; parse_known_args tolerates extra flags consumed elsewhere.
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_training_args(parser)
    args, _ = parser.parse_known_args()

    # Configure DLLogger-style logging: stdout + JSON file, per-iteration scope.
    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file,
                         logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1)
    ])
    LOGGER.register_metric("tacotron2_frames_per_sec", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("tacotron2_latency", metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)

    model, args = load_and_setup_model(parser, args)

    log_hardware()
    log_args(args)

    os.makedirs(args.output_dir, exist_ok=True)

    LOGGER.iteration_start()

    measurements = {}

    # One anchor directory per speaker; metadata maps audio paths to texts.
    anchor_dirs = [os.path.join(args.dataset_path, anchor)
                   for anchor in args.training_anchor_dirs]
    metadatas = [load_metadata(anchor) for anchor in anchor_dirs]
    stft = TacotronSTFT(args.filter_length, args.hop_length, args.win_length,
                        args.n_mel_channels, args.sampling_rate,
                        args.mel_fmin, args.mel_fmax)
    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        # speaker_id doubles as the index into anchor_dirs/metadatas and as
        # the speaker embedding id passed to text_to_sequence.
        for speaker_id in range(len(anchor_dirs)):
            metadata = metadatas[speaker_id]
            for npy_path, text in tqdm(metadata):
                # Encode text to a symbol-id sequence; batch dim added via unsqueeze.
                seq = text_to_sequence(text, speaker_id, ['basic_cleaners'])
                seqs = torch.from_numpy(np.stack(seq)).unsqueeze(0)
                seq_lens = torch.IntTensor([len(text)])
                # Ground-truth mel computed from the reference wav.
                wav = load_wav_to_torch(npy_path)
                mel = stft.mel_spectrogram(wav.unsqueeze(0))
                mel = mel.squeeze()
                # Pad the target length up to a multiple of n_frames_per_step
                # (the decoder emits n_frames_per_step frames per step).
                max_target_len = mel.size(1) - 1
                max_target_len += args.n_frames_per_step - max_target_len % args.n_frames_per_step
                padded_mel = np.pad(mel, [(0, 0), (0, max_target_len - mel.size(1))],
                                    mode='constant', constant_values=args.mel_pad_val)
                # Keep every n_frames_per_step-th frame as the teacher-forcing target.
                target = padded_mel[:, ::args.n_frames_per_step]
                targets = torch.from_numpy(np.stack(target)).unsqueeze(0)
                target_lengths = torch.IntTensor([target.shape[1]])
                # NOTE(review): targets are cast to .half() — presumably the model
                # was loaded in fp16; confirm load_and_setup_model uses amp/half.
                outputs = model.infer(to_gpu(seqs).long(), to_gpu(seq_lens).int(),
                                      to_gpu(targets).half(), to_gpu(target_lengths).int())
                # Keep only the non-None outputs; second one is the mel prediction.
                _, mel_out, _, _ = [output.cpu() for output in outputs if output is not None]
                # Trim the prediction back to the unpadded ground-truth length.
                mel_out = mel_out.squeeze()[:, :mel.size(-1) - 1]
                assert (mel_out.shape[-1] == wav.shape[-1] // args.hop_length)
                # Save the predicted (teacher-forced / GTA-style) mel alongside
                # the original basename for downstream vocoder training.
                fname = os.path.basename(npy_path)
                np.save(os.path.join(args.output_dir, fname), mel_out, allow_pickle=False)
                # GTA synthesis
                # magnitudes = stft.inv_mel_spectrogram(mel_out.squeeze())
                # wav = griffin_lim(magnitudes, stft.stft_fn, 60)
                # save_wav(wav, os.path.join(args.output_dir, 'eval.wav'))

    LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time'])
    LOGGER.log(key="latency", value=(measurements['tacotron2_time']))
    LOGGER.iteration_stop()
    LOGGER.finish()
def audio2mel2audio(dataset_path, audiopaths_and_text, melpaths_and_text, args, use_intermed=None):
    """Round-trip each wav through a magnitude spectrogram and Griffin-Lim.

    For every entry in *audiopaths_and_text*, loads the wav, computes a
    normalized magnitude spectrogram with torchaudio, inverts it with
    Griffin-Lim and writes the reconstructed audio to
    ``griffin_lim_inv_audio_custom7/<basename>``.

    Args:
        dataset_path: root passed to ``load_filepaths_and_text``.
        audiopaths_and_text: filelist of (audio path, text) entries to process.
        melpaths_and_text: filelist of (mel path, text) entries; loaded but
            currently unused by the reconstruction loop.
        args: namespace providing ``filter_length``, ``win_length``,
            ``hop_length`` and ``n_iters``.
        use_intermed: unused; kept for interface compatibility with callers.
    """
    melpaths_and_text_list = \
        load_filepaths_and_text(dataset_path, melpaths_and_text)
    audiopaths_and_text_list = \
        load_filepaths_and_text(dataset_path, audiopaths_and_text)

    # torchaudio forward/inverse transforms; power=1 keeps magnitudes (not
    # power spectra) so GriffinLim inverts what Spectrogram produced.
    spec = T.Spectrogram(
        n_fft=args.filter_length,
        win_length=args.win_length,
        hop_length=args.hop_length,
        power=1,
        normalized=True,
    )
    griffin_lim = T.GriffinLim(
        n_fft=args.filter_length,
        win_length=args.win_length,
        hop_length=args.hop_length,
        n_iter=args.n_iters,
        power=1,
        normalized=True,
    )
    print(args)
    # TODO(review): hardcoded input directory — should come from args/config.
    data_path = "/data/logotypografia_simple/cleaned_wavs/"

    # Bug fix: the loop previously ranged over len(melpaths_and_text_list)
    # while indexing audiopaths_and_text_list[i], which raised IndexError
    # whenever the two filelists differed in length (and silently skipped
    # files when the mel list was shorter). Iterate the audio list directly.
    for entry in audiopaths_and_text_list:
        basename = entry[0].split("/")[-1]
        wav_name = data_path + basename
        audio, sampling_rate = load_wav_to_torch(wav_name)
        magnitude_spec = spec(audio)
        inv_waveform = griffin_lim(magnitude_spec)
        # TODO(review): hardcoded output directory; create it or parameterize.
        inv_wav_name = "griffin_lim_inv_audio_custom7/" + basename
        print(f"Saving reconstructed wav with name {inv_wav_name}")
        # NOTE(review): output rate is hardcoded to 16000 even though the wav
        # was loaded at `sampling_rate` — confirm they match, else pitch shifts.
        write(inv_wav_name, 16000, inv_waveform.detach().cpu().numpy())