def __init__(self, n_fft=1024, n_mels=80):
    """Build the mel-to-waveform inversion pipeline.

    Creates an ``InverseMelScale`` stage (mel bins -> linear-frequency
    bins, iterative solve capped at 2048 steps) followed by a
    ``GriffinLim`` stage that reconstructs a waveform from the linear
    spectrogram.

    Args:
        n_fft: FFT size; the linear spectrogram has ``n_fft // 2 + 1`` bins.
        n_mels: number of mel bands expected in the input.
    """
    super().__init__()
    freq_bins = n_fft // 2 + 1
    # NOTE(review): `sample_rate` is a free (module-level) name here — confirm
    # it is defined where this class lives.
    self.mel_to_lin = transforms.InverseMelScale(
        n_stft=freq_bins,
        n_mels=n_mels,
        sample_rate=sample_rate,
        max_iter=2048,
    )
    self.griffin_lim = transforms.GriffinLim(n_fft=n_fft, hop_length=256)
def test_griffinlim(self, momentum, rand_init):
    """Verify GriffinLim is differentiable w.r.t. its spectrogram input."""
    n_fft = 400
    power = 1
    n_iter = 3
    noise = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
    spec = get_spectrogram(noise, n_fft=n_fft, power=power)
    # Wrap so the (otherwise random-init) transform is deterministic for
    # the gradient check.
    griffin_lim = T.GriffinLim(
        n_fft=n_fft,
        n_iter=n_iter,
        momentum=momentum,
        rand_init=rand_init,
        power=power,
    )
    transform = _DeterministicWrapper(griffin_lim)
    self.assert_grad(transform, [spec])
def _spec_to_wav(spec, sr=sample_rate, engine='librosa'):
    """Invert a spectrogram to a waveform with the Griffin-Lim algorithm.

    ``engine`` selects the backend: ``'librosa'`` or ``'torch'``.
    Raises ``ValueError`` for any other value.
    """
    if engine == 'torch':
        return tf.GriffinLim(**stft_params, power=power)(spec)
    if engine == 'librosa':
        return librosa.griffinlim(
            spec,
            hop_length=stft_params['hop_length'],
            win_length=stft_params['win_length'],
        )
    raise ValueError(engine)
# Show and play the original signal before round-tripping it.
plot_waveform(waveform, sample_rate, title="Original")
play_audio(waveform, sample_rate)

# STFT parameters shared by the forward and inverse transforms.
n_fft = 1024
win_length = None
hop_length = 512

# Compute the spectrogram (phase is not kept around for reconstruction).
spec = T.Spectrogram(
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
)(waveform)

# Griffin-Lim iteratively estimates the missing phase and returns a waveform.
griffin_lim = T.GriffinLim(
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
)
waveform = griffin_lim(spec)

# Compare the reconstruction against the original heard above.
plot_waveform(waveform, sample_rate, title="Reconstructed")
play_audio(waveform, sample_rate)

######################################################################
# Mel Filter Bank
# ---------------
#
# ``torchaudio.functional.create_fb_matrix`` generates the filter bank
# for converting frequency bins to mel-scale bins.
#
# Since this function does not require input audio/features, there is no
def test_GriffinLim(self):
    """Consistency check for GriffinLim with deterministic (zero) phase init."""
    spectrogram = torch.rand((1, 201, 6))
    transform = T.GriffinLim(length=1000, rand_init=False)
    self._assert_consistency(transform, spectrogram)
def audio2mel2audio(dataset_path, audiopaths_and_text, melpaths_and_text,
                    args, use_intermed=None):
    """Round-trip each wav through a spectrogram and Griffin-Lim, saving results.

    For every entry in the filelists, loads the wav, computes a normalized
    magnitude spectrogram (power=1), inverts it with Griffin-Lim, and writes
    the reconstructed audio under ``griffin_lim_inv_audio_custom7/``.

    Args:
        dataset_path: root passed to ``load_filepaths_and_text``.
        audiopaths_and_text: filelist of (audio path, text) entries.
        melpaths_and_text: filelist of (mel path, text) entries; only its
            length drives the loop.
        args: namespace with ``filter_length``, ``win_length``,
            ``hop_length``, ``n_iters``.
        use_intermed: unused; kept for backward compatibility with callers.
    """
    melpaths_and_text_list = \
        load_filepaths_and_text(dataset_path, melpaths_and_text)
    audiopaths_and_text_list = \
        load_filepaths_and_text(dataset_path, audiopaths_and_text)

    # Build the transforms once, outside the loop; both use power=1
    # (magnitude) so the Griffin-Lim inversion matches the forward transform.
    spec = T.Spectrogram(
        n_fft=args.filter_length,
        win_length=args.win_length,
        hop_length=args.hop_length,
        power=1,
        normalized=True,
    )
    griffin_lim = T.GriffinLim(
        n_fft=args.filter_length,
        win_length=args.win_length,
        hop_length=args.hop_length,
        n_iter=args.n_iters,
        power=1,
        normalized=True,
    )
    print(args)

    # TODO(review): hard-coded corpus location — consider deriving this from
    # dataset_path instead.
    data_path = "/data/logotypografia_simple/cleaned_wavs/"

    # NOTE(review): the loop is sized by the mel filelist but indexes the
    # audio filelist; an IndexError will surface if the two files are out
    # of sync — confirm they are parallel.
    for i in range(len(melpaths_and_text_list)):
        # Compute the basename once; it is reused for input and output paths.
        wav_basename = audiopaths_and_text_list[i][0].split("/")[-1]
        audio, sampling_rate = load_wav_to_torch(data_path + wav_basename)

        _spectrogram = spec(audio)
        inv_waveform = griffin_lim(_spectrogram)

        inv_wav_name = "griffin_lim_inv_audio_custom7/" + wav_basename
        print(f"Saving reconstructed wav with name {inv_wav_name}")
        # TODO(review): writes at a fixed 16 kHz even though `sampling_rate`
        # was just read from the source file — confirm the corpus is 16 kHz
        # or pass `sampling_rate` here.
        write(inv_wav_name, 16000, inv_waveform.detach().cpu().numpy())