# Example 1 (score: 0)
def mels_to_wavs_GL(mels, taco_stft, ref_level_db=0, magnitude_power=1.5):
    """Convert mel spectrograms to waveforms with the Griffin-Lim algorithm.

    Args:
        mels: iterable of numpy mel spectrograms, normalized into
            ``[-hp.max_abs_value, hp.max_abs_value]`` (denormalized here).
        taco_stft: STFT helper exposing ``spectral_de_normalize``,
            ``mel_basis`` and ``stft_fn``.
        ref_level_db: reference level (dB) added back before dB inversion.
        magnitude_power: power-law compression exponent to invert.

    Returns:
        The waveform of the LAST entry in ``mels`` as a 1-D numpy array,
        or ``None`` when ``mels`` is empty (the original code raised
        ``NameError`` in that case).
    """
    waveform = None  # fix: avoid NameError at return when mels is empty
    for i, mel in enumerate(mels):
        stime = time.time()
        # Undo the [-max_abs, max_abs] normalization, then invert the
        # dB + power-law compression applied at feature-extraction time.
        mel_decompress = mel_denormalize(
            torch.from_numpy(mel).cuda().unsqueeze(0), hp.max_abs_value)
        mel_decompress = taco_stft.spectral_de_normalize(
            mel_decompress + ref_level_db)**(1 / magnitude_power)
        mel_decompress_ = mel_decompress.transpose(1, 2).data.cpu()
        # Project mel bins back onto linear-frequency STFT magnitudes.
        # NOTE(review): 1000 is an empirical gain constant — confirm.
        spec_from_mel_scaling = 1000
        spec_from_mel = torch.mm(mel_decompress_[0], taco_stft.mel_basis)
        spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
        spec_from_mel = spec_from_mel * spec_from_mel_scaling
        # 60 Griffin-Lim iterations; Variable() is a no-op on torch >= 0.4.
        waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :]),
                               taco_stft.stft_fn, 60)
        waveform = waveform[0].data.cpu().numpy()
        dec_time = time.time() - stime
        len_audio = float(len(waveform)) / float(hp.sample_rate)
        # fix: renamed local from ``str`` (shadowed the builtin).
        msg = "{}th sentence, audio length: {:.2f} sec,  mel_to_wave time: {:.2f}".format(
            i, len_audio, dec_time)
        print(msg)

    return waveform
# Example 2 (score: 0)
def mels_to_wavs_GL(hparams,
                    mels,
                    taco_stft,
                    output_dir="",
                    ref_level_db=0,
                    magnitude_power=1.5):
    """Convert mel spectrograms to waveforms via Griffin-Lim and save each
    one as ``<output_dir>/sentence_<i>.wav``.

    Args:
        hparams: hyper-parameter object; only ``sampling_rate`` is read.
        mels: iterable of numpy mel spectrograms (normalized; denormalized
            here with ``mel_denormalize``'s default range).
        taco_stft: STFT helper exposing ``spectral_de_normalize``,
            ``mel_basis`` and ``stft_fn``.
        output_dir: directory the per-sentence WAV files are written to.
        ref_level_db: reference level (dB) added back before dB inversion.
        magnitude_power: power-law compression exponent to invert.

    Returns:
        None; output is written to disk and progress printed to stdout.
    """
    for i, mel in enumerate(mels):
        stime = time.time()
        # Undo normalization, then invert the dB + power-law compression.
        mel_decompress = mel_denormalize(
            torch.from_numpy(mel).cuda().unsqueeze(0))
        mel_decompress = taco_stft.spectral_de_normalize(
            mel_decompress + ref_level_db)**(1 / magnitude_power)
        mel_decompress_ = mel_decompress.transpose(1, 2).data.cpu()
        # Project mel bins back onto linear-frequency STFT magnitudes.
        # NOTE(review): 1000 is an empirical gain constant — confirm.
        spec_from_mel_scaling = 1000
        spec_from_mel = torch.mm(mel_decompress_[0], taco_stft.mel_basis)
        spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
        spec_from_mel = spec_from_mel * spec_from_mel_scaling
        # 60 Griffin-Lim iterations; Variable() is a no-op on torch >= 0.4.
        waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :]),
                               taco_stft.stft_fn, 60)
        # waveform = signal.lfilter([1], [1, -0.97], waveform)
        waveform = waveform[0].data.cpu().numpy()
        dec_time = time.time() - stime
        len_audio = float(len(waveform)) / float(hparams.sampling_rate)
        # fix: renamed local from ``str`` (shadowed the builtin).
        msg = "{}th sentence, audio length: {:.2f} sec,  mel_to_wave time: {:.2f}".format(
            i, len_audio, dec_time)
        print(msg)
        write(os.path.join(output_dir, "sentence_{}.wav".format(i)),
              hparams.sampling_rate, waveform)
# Example 3 (score: 0)
def test(hparams,
         mel,
         output_path="test.wav",
         ref_level_db=20,
         magnitude_power=1.5):
    """Synthesize a single mel spectrogram to a 16-bit PCM WAV file.

    Args:
        hparams: hyper-parameter object; used to build ``TacotronSTFT``
            and for ``sampling_rate``.
        mel: normalized mel-spectrogram tensor (denormalized here).
        output_path: path of the WAV file to write.
        ref_level_db: reference level (dB) added back before dB inversion.
        magnitude_power: power-law compression exponent to invert.

    Returns:
        None; the waveform is written to ``output_path``.
    """
    taco_stft = TacotronSTFT(hparams)
    stime = time.time()
    # Undo normalization, then invert the dB + power-law compression.
    mel_decompress = mel_denormalize(mel).unsqueeze(0)
    mel_decompress = taco_stft.spectral_de_normalize(
        mel_decompress + ref_level_db)**(1 / magnitude_power)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    # Project mel bins back onto linear-frequency STFT magnitudes.
    # NOTE(review): 1000 is an empirical gain constant — confirm.
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling
    # 60 Griffin-Lim iterations; Variable() is a no-op on torch >= 0.4.
    waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :]),
                           taco_stft.stft_fn, 60)
    waveform = waveform[0].data.cpu().numpy()
    # Peak-normalize to ~99% of int16 full scale.
    # fix: guard against an all-zero waveform (division by zero -> NaN).
    peak = abs(waveform).max()
    if peak > 0:
        waveform = waveform / peak * 0.99 * 2**15
    waveform = waveform.astype(dtype=np.int16)
    dec_time = time.time() - stime
    len_audio = float(len(waveform)) / float(hparams.sampling_rate)
    # fix: renamed local from ``str`` (shadowed the builtin).
    msg = "audio length: {:.2f} sec,  mel_to_wave time: {:.2f}".format(
        len_audio, dec_time)
    print(msg)
    # fix: single-argument os.path.join was a no-op; use the path directly.
    write(output_path, hparams.sampling_rate, waveform)
# Example 4 (score: 0)
 def cepstrum_from_mel(self, mel, ref_level_db=20, magnitude_power=1.5,
                       filter_length=1024):
     """Compute mel-cepstral coefficients (MCC) from a normalized mel
     spectrogram.

     Args:
         mel: mel-spectrogram tensor normalized into
             ``[-self.max_abs_mel_value, self.max_abs_mel_value]``.
         ref_level_db: reference level (dB) added back before dB inversion.
         magnitude_power: power-law compression exponent to invert.
         filter_length: STFT filter length used to scale the power
             spectrum (generalized from the previously hard-coded 1024).

     Returns:
         Cepstral-coefficient tensor produced by ``dct(..., 'ortho')``.
     """
     # NOTE(review): asserts are stripped under ``python -O``; out-of-range
     # input would then pass silently. Kept as asserts so callers catching
     # AssertionError are unaffected.
     assert (torch.min(mel.data) >= -self.max_abs_mel_value)
     assert (torch.max(mel.data) <= self.max_abs_mel_value)
     # Undo normalization, then invert the dB + power-law compression.
     spec = mel_denormalize(mel, self.max_abs_mel_value)
     magnitudes = self.spectral_de_normalize(spec + ref_level_db)**(
         1 / magnitude_power)
     # Power spectrum scaled by the STFT filter length.
     pow_spec = (magnitudes**2) / filter_length
     # Clamp avoids log(0); NOTE(review): natural log * 20 (not 10*log10)
     # reproduces the original scaling — confirm intent.
     db_pow_spec = torch.log(torch.clamp(pow_spec, min=1e-5)) * 20
     mcc = dct(db_pow_spec, 'ortho')
     return mcc