def mels_to_wavs_GL(mels, taco_stft, ref_level_db=0, magnitude_power=1.5):
    """Invert a sequence of mel spectrograms to audio with Griffin-Lim.

    NOTE(review): this definition is shadowed by a later ``mels_to_wavs_GL``
    in the same file — confirm which variant callers actually use.

    Args:
        mels: iterable of numpy mel spectrograms (one per sentence).
        taco_stft: STFT helper providing ``spectral_de_normalize``,
            ``mel_basis`` and ``stft_fn``.
        ref_level_db: reference level added back before de-normalization.
        magnitude_power: exponent used when the magnitudes were compressed;
            inverted here via ``** (1 / magnitude_power)``.

    Returns:
        The waveform (numpy array) of the LAST mel processed, or ``None``
        when ``mels`` is empty (the original raised NameError in that case).
    """
    waveform = None
    for i, mel in enumerate(mels):
        stime = time.time()
        # Undo [-max_abs, max_abs] normalization, then log/db compression.
        mel_decompress = mel_denormalize(
            torch.from_numpy(mel).cuda().unsqueeze(0), hp.max_abs_value)
        mel_decompress = taco_stft.spectral_de_normalize(
            mel_decompress + ref_level_db) ** (1 / magnitude_power)
        mel_decompress_ = mel_decompress.transpose(1, 2).data.cpu()
        # Project mel bins back to linear-frequency bins; the scaling factor
        # compensates for the mel-basis magnitude (empirical constant).
        spec_from_mel_scaling = 1000
        spec_from_mel = torch.mm(mel_decompress_[0], taco_stft.mel_basis)
        spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
        spec_from_mel = spec_from_mel * spec_from_mel_scaling
        # torch.autograd.Variable is a no-op since PyTorch 0.4; pass the
        # tensor directly. 60 = Griffin-Lim iterations.
        waveform = griffin_lim(spec_from_mel, taco_stft.stft_fn, 60)
        waveform = waveform[0].data.cpu().numpy()
        dec_time = time.time() - stime
        len_audio = float(len(waveform)) / float(hp.sample_rate)
        # Renamed from ``str`` to avoid shadowing the builtin.
        msg = "{}th sentence, audio length: {:.2f} sec, mel_to_wave time: {:.2f}".format(
            i, len_audio, dec_time)
        print(msg)
    return waveform
def mels_to_wavs_GL(hparams, mels, taco_stft, output_dir="", ref_level_db=0, magnitude_power=1.5):
    """Invert mel spectrograms with Griffin-Lim and write one WAV per sentence.

    Args:
        hparams: hyper-parameter object; only ``sampling_rate`` is read here.
        mels: iterable of numpy mel spectrograms (one per sentence).
        taco_stft: STFT helper providing ``spectral_de_normalize``,
            ``mel_basis`` and ``stft_fn``.
        output_dir: directory where ``sentence_{i}.wav`` files are written.
        ref_level_db: reference level added back before de-normalization.
        magnitude_power: compression exponent, inverted via
            ``** (1 / magnitude_power)``.

    Side effects:
        Writes ``sentence_{i}.wav`` into ``output_dir`` and prints timing
        per sentence. Returns ``None``.
    """
    for i, mel in enumerate(mels):
        stime = time.time()
        # Undo normalization and log/db compression of the mel spectrogram.
        mel_decompress = mel_denormalize(
            torch.from_numpy(mel).cuda().unsqueeze(0))
        mel_decompress = taco_stft.spectral_de_normalize(
            mel_decompress + ref_level_db) ** (1 / magnitude_power)
        mel_decompress_ = mel_decompress.transpose(1, 2).data.cpu()
        # Map mel bins back to the linear spectrogram; 1000 is an empirical
        # magnitude-compensation constant.
        spec_from_mel_scaling = 1000
        spec_from_mel = torch.mm(mel_decompress_[0], taco_stft.mel_basis)
        spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
        spec_from_mel = spec_from_mel * spec_from_mel_scaling
        # Variable wrapper removed (no-op since PyTorch 0.4).
        # 60 = Griffin-Lim iterations.
        waveform = griffin_lim(spec_from_mel, taco_stft.stft_fn, 60)
        waveform = waveform[0].data.cpu().numpy()
        dec_time = time.time() - stime
        len_audio = float(len(waveform)) / float(hparams.sampling_rate)
        # Renamed from ``str`` to avoid shadowing the builtin.
        msg = "{}th sentence, audio length: {:.2f} sec, mel_to_wave time: {:.2f}".format(
            i, len_audio, dec_time)
        print(msg)
        write(os.path.join(output_dir, "sentence_{}.wav".format(i)),
              hparams.sampling_rate, waveform)
def test(hparams, mel, output_path="test.wav", ref_level_db=20, magnitude_power=1.5):
    """Invert a single mel spectrogram to a 16-bit WAV file (sanity check).

    Args:
        hparams: hyper-parameter object; ``sampling_rate`` is read, and the
            whole object is passed to ``TacotronSTFT``.
        mel: mel-spectrogram tensor accepted by ``mel_denormalize``
            (2-D, unsqueezed to a batch of 1 here — TODO confirm shape).
        output_path: destination WAV path.
        ref_level_db: reference level added back before de-normalization.
        magnitude_power: compression exponent, inverted via
            ``** (1 / magnitude_power)``.

    Side effects:
        Writes a peak-normalized int16 WAV to ``output_path`` and prints
        timing. Returns ``None``.
    """
    taco_stft = TacotronSTFT(hparams)
    stime = time.time()
    mel_decompress = mel_denormalize(mel).unsqueeze(0)
    mel_decompress = taco_stft.spectral_de_normalize(
        mel_decompress + ref_level_db) ** (1 / magnitude_power)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    # Mel -> linear projection; 1000 is an empirical scaling constant.
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling
    # Variable wrapper removed (no-op since PyTorch 0.4).
    waveform = griffin_lim(spec_from_mel, taco_stft.stft_fn, 60)
    waveform = waveform[0].data.cpu().numpy()
    # Peak-normalize to 99% full-scale int16.
    # NOTE(review): divides by abs(waveform).max() — an all-zero waveform
    # would produce NaNs; confirm inputs are never silent.
    waveform = waveform / abs(waveform).max() * 0.99 * 2 ** 15
    waveform = waveform.astype(dtype=np.int16)
    dec_time = time.time() - stime
    len_audio = float(len(waveform)) / float(hparams.sampling_rate)
    # Renamed from ``str`` to avoid shadowing the builtin.
    msg = "audio length: {:.2f} sec, mel_to_wave time: {:.2f}".format(
        len_audio, dec_time)
    print(msg)
    # os.path.join with a single argument was the identity — dropped.
    write(output_path, hparams.sampling_rate, waveform)
def cepstrum_from_mel(self, mel, ref_level_db=20, magnitude_power=1.5, filter_length=1024):
    """Compute mel-cepstral coefficients (MCC) from a normalized mel spectrogram.

    Args:
        mel: normalized mel spectrogram with values in
            ``[-self.max_abs_mel_value, self.max_abs_mel_value]``
            (checked by the asserts below).
        ref_level_db: reference level added back before de-normalization.
        magnitude_power: compression exponent, inverted via
            ``** (1 / magnitude_power)``.
        filter_length: FFT size used to turn magnitudes into a power
            spectrum. Was a hard-coded ``1024`` (the original comment said
            "if filter_length = 1024") — now a parameter with the same
            default, so behavior is unchanged for existing callers.

    Returns:
        Orthonormal DCT of the dB power spectrum (mel cepstrum).
    """
    assert (torch.min(mel.data) >= -self.max_abs_mel_value)
    assert (torch.max(mel.data) <= self.max_abs_mel_value)
    # Undo [-max_abs, max_abs] normalization, then log compression.
    spec = mel_denormalize(mel, self.max_abs_mel_value)
    magnitudes = self.spectral_de_normalize(spec + ref_level_db) ** (
        1 / magnitude_power)
    # Power spectrum normalized by the FFT size.
    pow_spec = (magnitudes ** 2) / filter_length
    # NOTE(review): dB conversion uses natural log * 20, not 10*log10 —
    # unconventional scaling; confirm this matches the training pipeline.
    db_pow_spec = torch.log(torch.clamp(pow_spec, min=1e-5)) * 20  # db
    mcc = dct(db_pow_spec, 'ortho')
    return mcc