Example #1
0
def convert(predictor, df):
    """Run one datapoint through ``predictor`` and vocode both spectrograms.

    Args:
        predictor: Callable applied to one datapoint taken from
            ``df().get_data()``; must return ``(pred_spec, y_spec, ppgs)``.
        df: Zero-argument callable whose result exposes ``get_data()``, an
            iterator of datapoints.

    Returns:
        ``(audio, y_audio, ppgs)`` where ``audio`` / ``y_audio`` are the
        waveforms rebuilt from the predicted / target spectrograms and
        ``ppgs`` is handed back exactly as the predictor produced it.
    """
    datapoint = next(df().get_data())
    pred_spec, y_spec, ppgs = predictor(datapoint)

    def _vocode(normalized_specs):
        # Undo the dB normalisation and move to linear amplitude.
        amplitude = db2amp(
            denormalize_db(normalized_specs, hp.Default.max_db, hp.Default.min_db))

        # Emphasize the magnitude.
        amplitude = np.power(amplitude, hp.Convert.emphasis_magnitude)

        # Rebuild one waveform per spectrogram in the batch.
        waves = []
        for single_spec in amplitude:
            waves.append(
                spec2wav(single_spec.T, hp.Default.n_fft, hp.Default.win_length,
                         hp.Default.hop_length, hp.Default.n_iter))

        # Undo the pre-emphasis filter on the stacked waveforms.
        return inv_preemphasis(np.array(waves), coeff=hp.Default.preemphasis)

    audio = _vocode(pred_spec)
    y_audio = _vocode(y_spec)

    return audio, y_audio, ppgs
Example #2
0
def convert(predictor, df):
    """Predict from one three-part datapoint and vocode both spectrograms.

    Args:
        predictor: Callable taking the three components of a datapoint and
            returning ``(pred_spec, r_spec)``.
        df: Zero-argument callable whose result exposes ``get_data()``, an
            iterator of three-element datapoints.

    Returns:
        ``(audio, y_audio)``: waveforms rebuilt from ``pred_spec`` and
        ``r_spec`` respectively.
    """
    first, second, third = next(df().get_data())
    pred_spec, r_spec = predictor(first, second, third)

    def _vocode(normalized_specs):
        # Undo the dB normalisation and move to linear amplitude.
        amplitude = db2amp(
            denormalize_db(normalized_specs, hp.default.max_db, hp.default.min_db))

        # Emphasize the magnitude.
        amplitude = np.power(amplitude, hp.convert.emphasis_magnitude)

        # Rebuild one waveform per spectrogram in the batch.
        waves = np.array([
            spec2wav(single_spec.T, hp.default.n_fft, hp.default.win_length,
                     hp.default.hop_length, hp.default.n_iter)
            for single_spec in amplitude
        ])

        # Undo the pre-emphasis filter.
        return inv_preemphasis(waves, coeff=hp.default.preemphasis)

    audio = _vocode(pred_spec)
    y_audio = _vocode(r_spec)

    return audio, y_audio
Example #3
0
def do_convert(predictor, input_name, logdir2):
    """Convert one audio file and write the result plus debug files to disk.

    The input is loaded with ``librosa`` at ``hp.default.sr``, analysed with
    the ``pw`` vocoder functions (presumably pyworld -- confirm against the
    file's imports), its spectral envelope is converted through
    ``convert_spectral_envelope`` and the result is re-synthesised.

    Files written (the directories must already exist):
        * ``./converted/debug/input_sp_en_original.png``
        * ``./converted/<input_name>`` -- converted audio as float32 wav
        * ``./converted/debug/<input_name>.png`` and ``.npy`` -- the PPGs

    Args:
        predictor: Passed through to ``convert_spectral_envelope``.
        input_name: Path of the audio file to load; also reused verbatim as
            the output file name under ``./converted/``.
        logdir2: Passed through to ``f0_adapt``.

    Returns:
        None. Results are written to disk and timings are printed.
    """
    convert_s = datetime.datetime.now()

    # Load input audio
    input_audio, _ = librosa.load(input_name, sr=hp.default.sr, dtype=np.float64)

    # Extract F0 from input audio first
    input_f0, t_table = pw.dio(input_audio, hp.default.sr)
    input_f0 = pw.stonemask(input_audio, input_f0, t_table, hp.default.sr)

    # Get MFCC, Spectral Envelope, and Aperiodicity
    mfcc = _get_mfcc(input_audio, hp.default.n_fft, hp.default.win_length, hp.default.hop_length)
    mfcc = np.expand_dims(mfcc, axis=0)

    input_ap = pw.d4c(input_audio, input_f0, t_table, hp.default.sr, fft_size=hp.default.n_fft)

    input_sp_en = _get_spectral_envelope(preemphasis(input_audio, coeff=hp.default.preemphasis), hp.default.n_fft)
    plt.imsave('./converted/debug/input_sp_en_original.png', input_sp_en, cmap='binary')
    input_sp_en = np.expand_dims(input_sp_en, axis=0)

    # Convert Spectral Envelope
    output_sp_en, ppgs = convert_spectral_envelope(predictor, mfcc, input_sp_en)
    output_sp_en = np.squeeze(output_sp_en.astype(np.float64), axis=0)

    preproc_s = datetime.datetime.now()
    # Denormalization
    output_sp_en = denormalize_db(output_sp_en, hp.default.max_db, hp.default.min_db)

    # Db to amp
    output_sp_en = librosa.db_to_amplitude(output_sp_en)

    # Emphasize the magnitude
    output_sp_en = np.power(output_sp_en, hp.convert.emphasis_magnitude)

    preproc_e = datetime.datetime.now()
    preproc_t = preproc_e - preproc_s
    # total_seconds() keeps the sub-second part; timedelta.seconds is only the
    # whole-seconds field, so this short step would otherwise always show 0.
    print("Pre-Processing time:{}s".format(preproc_t.total_seconds()))

    # F0 transformation with WORLD Vocoder
    output_f0 = f0_adapt(input_f0, logdir2)

    # Synthesize audio and de-emphasize
    output_audio = pw.synthesize(output_f0, output_sp_en, input_ap, hp.default.sr)
    output_audio = inv_preemphasis(output_audio, coeff=hp.default.preemphasis)

    # Saving output_audio to 32-bit Float wav file
    output_audio = output_audio.astype(np.float32)
    librosa.output.write_wav(path="./converted/"+input_name,y=output_audio,sr=hp.default.sr)

    # Saving PPGS data to Grayscale Image and raw binary file
    ppgs = np.squeeze(ppgs, axis=0)
    plt.imsave('./converted/debug/'+input_name+'.png', ppgs, cmap='binary')
    np.save('./converted/debug/'+input_name+'.npy', ppgs)

    convert_e = datetime.datetime.now()
    convert_time = convert_e - convert_s
    print("Total Converting Time:{}s".format(convert_time.total_seconds()))
Example #4
0
File: convert.py Project: v-ilin/vc
def convert(predictor, df):
    """Run one datapoint through ``predictor`` and vocode both spectrograms.

    Also prints the shape of the datapoint's first element and writes the
    first predicted waveform (before inverse pre-emphasis) to a hard-coded
    debug path.

    Args:
        predictor: Callable applied to one datapoint; must return
            ``(pred_spec, y_spec, ppgs)``.
        df: Zero-argument callable whose result exposes ``get_data()``, an
            iterator of datapoints.

    Returns:
        ``(audio, y_audio, ppgs)``: waveforms rebuilt from the predicted and
        target spectrograms, plus ``ppgs`` unchanged from the predictor.
    """
    # Fetch the datapoint once so the shape printed below describes the very
    # batch that is converted, and the dataflow is not constructed twice.
    batch = next(df().get_data())
    print(batch[0].shape)
    pred_spec, y_spec, ppgs = predictor(batch)

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    y_spec = denormalize_db(y_spec, hp.default.max_db, hp.default.min_db)

    # Db to amp
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform. A list comprehension is used rather than
    # np.array(map(...)): on Python 3 ``map`` is a lazy iterator and np.array
    # would wrap it in a 0-d object array instead of stacking the waveforms.
    audio = np.array([
        spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                 hp.default.hop_length, hp.default.n_iter)
        for spec in pred_spec
    ])
    # NOTE(review): absolute, machine-specific debug output path -- confirm
    # whether this dump is still wanted.
    librosa.output.write_wav(
        '/home/user/vilin/deep-voice-conversion/output/file_trim_8.wav',
        audio[0], hp.default.sr)

    y_audio = np.array([
        spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                 hp.default.hop_length, hp.default.n_iter)
        for spec in y_spec
    ])

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.default.preemphasis)

    return audio, y_audio, ppgs
Example #5
0
def convert(predictor, data):
    """Convert a single un-batched example and vocode both spectrograms.

    Args:
        predictor: Callable taking ``(x_mfccs, y_spec, y_mel)``, each with a
            leading axis of length 1 added here, and returning
            ``(pred_spec, y_spec, ppgs)``.
        data: Iterable of exactly three array-likes
            ``(x_mfccs, y_spec, y_mel)`` for one example.

    Returns:
        ``(audio, y_audio, ppgs)``: waveforms rebuilt from the predicted and
        target spectrograms, plus ``ppgs`` unchanged from the predictor. When
        ``hp.Convert.one_full_wav`` is truthy both waveform arrays are
        flattened into a single row.
    """
    # Unpack once; a second unpack of the same object is redundant and would
    # fail on a one-shot iterable.
    x_mfccs, y_spec, y_mel = data

    # Copy each input and prepend an axis of length 1. Indexing with
    # np.newaxis gives the same result as reshape((-1,) + shape) but does not
    # require ``.shape`` on the raw input and also works for zero-size arrays.
    x_mfccs = np.array(x_mfccs)[np.newaxis]
    y_spec = np.array(y_spec)[np.newaxis]
    y_mel = np.array(y_mel)[np.newaxis]
    pred_spec, y_spec, ppgs = predictor(x_mfccs, y_spec, y_mel)

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.Default.max_db, hp.Default.min_db)
    y_spec = denormalize_db(y_spec, hp.Default.max_db, hp.Default.min_db)

    # Db to amp
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.Convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.Convert.emphasis_magnitude)

    # Spectrogram to waveform
    audio = np.array([
        spec2wav(spec.T, hp.Default.n_fft, hp.Default.win_length,
                 hp.Default.hop_length, hp.Default.n_iter)
        for spec in pred_spec
    ])
    y_audio = np.array([
        spec2wav(spec.T, hp.Default.n_fft, hp.Default.win_length,
                 hp.Default.hop_length, hp.Default.n_iter)
        for spec in y_spec
    ])

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.Default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.Default.preemphasis)

    if hp.Convert.one_full_wav:
        # Concatenate to a wav
        y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
        audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio, ppgs
Example #6
0
 def predict(self, path_to_wav):
     """Predict spectrograms for a wav file and vocode them to waveforms.

     Features come from ``get_mfccs_and_spectrogram(path_to_wav)`` and are
     fed to ``self.predictor``; the predicted and target spectrograms are
     then denormalised, converted to amplitude, emphasised and passed
     through ``spec2wav``.

     NOTE(review): the visible body ends after building ``audio`` and
     ``y_audio`` without returning them -- the snippet looks cut off;
     confirm against the full source before relying on a return value.

     Args:
         path_to_wav: Passed to ``get_mfccs_and_spectrogram``.
     """
     x_mfcc, y_spec, mel = get_mfccs_and_spectrogram(path_to_wav)
     pred_spec, y_spec, ppgs = self.predictor(x_mfcc, y_spec, mel)

     # Denormalization
     pred_spec = denormalize_db(pred_spec, hp.default.max_db,
                                hp.default.min_db)
     y_spec = denormalize_db(y_spec, hp.default.max_db, hp.default.min_db)

     # Db to amp
     pred_spec = db2amp(pred_spec)
     y_spec = db2amp(y_spec)

     # Emphasize the magnitude
     pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
     y_spec = np.power(y_spec, hp.convert.emphasis_magnitude)

     # Spectrogram to waveform. A list comprehension is used rather than
     # np.array(map(...)): on Python 3 ``map`` is lazy, so np.array would
     # store the iterator itself and spec2wav would never be called.
     audio = np.array([
         spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                  hp.default.hop_length, hp.default.n_iter)
         for spec in pred_spec
     ])
     y_audio = np.array([
         spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                  hp.default.hop_length, hp.default.n_iter)
         for spec in y_spec
     ])
Example #7
0
def convert(predictor, df):
    """Run one datapoint through ``predictor`` and vocode both spectrograms.

    Args:
        predictor: Callable applied to one datapoint; must return
            ``(pred_spec, y_spec, ppgs)``.
        df: Zero-argument callable whose result exposes ``get_data()``, an
            iterator of datapoints.

    Returns:
        ``(audio, y_audio, ppgs)``: waveforms rebuilt from the predicted and
        target spectrograms, plus ``ppgs`` unchanged from the predictor. When
        ``hp.convert.one_full_wav`` is truthy both waveform arrays are
        flattened into a single row.
    """
    # TODO need to fix reading in with duration
    pred_spec, y_spec, ppgs = predictor(next(df().get_data()))

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    y_spec = denormalize_db(y_spec, hp.default.max_db, hp.default.min_db)

    # Db to amp
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform. A list comprehension is used rather than
    # np.array(map(...)): on Python 3 ``map`` is a lazy iterator and np.array
    # would wrap it in a 0-d object array, breaking the steps below.
    audio = np.array([
        spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                 hp.default.hop_length, hp.default.n_iter)
        for spec in pred_spec
    ])
    y_audio = np.array([
        spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                 hp.default.hop_length, hp.default.n_iter)
        for spec in y_spec
    ])

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    y_audio = inv_preemphasis(y_audio, coeff=hp.default.preemphasis)

    if hp.convert.one_full_wav:
        # Concatenate to a wav
        y_audio = np.reshape(y_audio, (1, y_audio.size), order='C')
        audio = np.reshape(audio, (1, audio.size), order='C')

    return audio, y_audio, ppgs
Example #8
0
def sumspecimage(spec, spec_name):
    """Emit an image summary and an audio summary for a batch of spectrograms.

    Args:
        spec: dB-normalised spectrogram batch; it is denormalised with
            ``hp.max_db`` / ``hp.min_db`` and converted to amplitude before
            use. It is transposed with ``transpose(0, 2, 1)``, so it must be
            3-dimensional.
        spec_name: Name used for both ``tf.summary.image`` and
            ``tf.summary.audio``.

    Returns:
        None. The function only registers the two summaries.
    """
    amplitude = db2amp(denormalize_db(spec, hp.max_db, hp.min_db))

    # Image summary: swap the last two axes and append a trailing axis.
    swapped = amplitude.transpose(0, 2, 1)
    tf.summary.image(spec_name, np.expand_dims(swapped, 3),
                     max_outputs=swapped.shape[0])

    # Clamp negatives; the exponent is fixed at 1 (hp.emphasis_magnitude is
    # deliberately not applied here).
    clamped = np.power(np.maximum(amplitude, 0), 1)

    # Rebuild one waveform per spectrogram in the batch.
    waves = []
    for item in clamped:
        waves.append(
            spec2wav(item.T, hp.n_fft, hp.win_length, hp.hop_length, hp.n_iter))
    out_audio = inv_preemphasis(np.array(waves), coeff=hp.preemphasis)

    tf.summary.audio(spec_name, out_audio, hp.sr, max_outputs=hp.batch_size)
def convert(predictor, mfcc, spec, mel_spec):
    """Predict a spectrogram and rebuild the first item's waveform via LWS.

    Progress markers and per-stage timings are printed along the way.

    Args:
        predictor: Callable taking ``(mfcc, spec, mel_spec)`` and returning
            three values; the first is the predicted spectrogram batch and
            the third the PPGs (the second is ignored).
        mfcc: First predictor input.
        spec: Second predictor input.
        mel_spec: Third predictor input.

    Returns:
        ``(audio, ppgs)``: the waveform rebuilt from ``pred_spec[0]`` after
        inverse pre-emphasis, and the PPGs unchanged from the predictor.
    """
    print("convert")
    pred_s = datetime.datetime.now()
    pred_spec, _, ppgs = predictor(mfcc, spec, mel_spec)
    pred_t = datetime.datetime.now() - pred_s
    # total_seconds() keeps the sub-second part; timedelta.seconds is only the
    # whole-seconds field and would report 0 for any stage under one second.
    print("Predicting time:{}s".format(pred_t.total_seconds()))

    preproc_s = datetime.datetime.now()
    # Denormalization
    print("denormalize_db")
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)

    # Db to amp
    print("db2amp")
    pred_spec = db2amp(pred_spec)

    # Emphasize the magnitude
    print("emphasize")
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)

    preproc_t = datetime.datetime.now() - preproc_s
    print("Pre-Processing time:{}s".format(preproc_t.total_seconds()))

    # Spectrogram to waveform (only the first item of the batch is converted)
    recon_s = datetime.datetime.now()

    print("spec2wav")
    wave = spec2wav_lws(pred_spec[0], hp.default.n_fft, hp.default.win_length,
                        hp.default.hop_length, hp.default.lws_mode)
    recon_t = datetime.datetime.now() - recon_s
    print("Converting Spectrogram-to-Wave time:{}s".format(recon_t.total_seconds()))

    # Keep the leading batch axis of length 1 for inv_preemphasis, as before.
    audio = np.array([wave])

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)
    return audio[0], ppgs
Example #10
0
def convert(predictor, tensor):
    """Run ``tensor`` through ``predictor`` and vocode the predicted spectrogram.

    Only the predicted spectrogram is converted to audio; the target
    spectrogram returned by the predictor is ignored.

    Args:
        predictor: Callable applied to ``tensor``; must return three values,
            of which the first is the predicted spectrogram batch and the
            third the PPGs.
        tensor: Input handed to ``predictor`` unchanged.

    Returns:
        ``(audio, ppgs)``: waveforms rebuilt from the predicted spectrograms
        after inverse pre-emphasis, and the PPGs unchanged from the predictor.
    """
    pred_spec, _, ppgs = predictor(tensor)

    # Denormalization
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)

    # Db to amp
    pred_spec = db2amp(pred_spec)

    # Emphasize the magnitude
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform. A list comprehension is used rather than
    # np.array(map(...)): on Python 3 ``map`` is a lazy iterator and np.array
    # would wrap it in a 0-d object array instead of stacking the waveforms.
    audio = np.array([
        spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                 hp.default.hop_length, hp.default.n_iter)
        for spec in pred_spec
    ])

    # Apply inverse pre-emphasis
    audio = inv_preemphasis(audio, coeff=hp.default.preemphasis)

    return audio, ppgs