Example 1
# Net2, get_mfccs_and_spectrogram, convert, write_wav and hp are
# project-specific helpers from the deep_voice_conversion code base.
import os

import numpy as np
import tensorflow as tf
from tensorpack.predict import OfflinePredictor, PredictConfig
from tensorpack.tfutils.sessinit import ChainInit, SaverRestore


def do_convert(args, logdir1, logdir2):
    # Load the Net2 conversion model.
    model = Net2()

    data = get_mfccs_and_spectrogram(args.file)

    ckpt1 = ('{}/{}'.format(logdir1, args.net1) if args.net1
             else tf.train.latest_checkpoint(logdir1))
    ckpt2 = ('{}/{}'.format(logdir2, args.net2) if args.net2
             else tf.train.latest_checkpoint(logdir2))
    session_inits = []
    if ckpt2:
        session_inits.append(SaverRestore(ckpt2))
    if ckpt1:
        session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))
    pred_conf = PredictConfig(model=model,
                              input_names=get_eval_input_names(),
                              output_names=get_eval_output_names(),
                              session_init=ChainInit(session_inits))
    predictor = OfflinePredictor(pred_conf)

    audio, y_audio, ppgs = convert(predictor, data)

    target_file = os.path.basename(args.file)
    # Reuse the input's base name with a .wav extension for the output.
    converted_file = os.path.splitext(target_file)[0] + '.wav'
    write_wav(audio[0], hp.Default.sr,
              os.path.join(args.savepath, converted_file))

    # Log the reference and converted audio as TensorBoard summaries.
    tf.summary.audio('A',
                     y_audio,
                     hp.Default.sr,
                     max_outputs=hp.Convert.batch_size)
    tf.summary.audio('B',
                     audio,
                     hp.Default.sr,
                     max_outputs=hp.Convert.batch_size)

    # Visualize PPGs
    heatmap = np.expand_dims(ppgs, 3)  # channel=1
    tf.summary.image('PPG', heatmap, max_outputs=ppgs.shape[0])

    writer = tf.summary.FileWriter(args.savepath)
    with tf.Session() as sess:
        summ = sess.run(tf.summary.merge_all())
    writer.add_summary(summ)
    writer.close()
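
For reference, a minimal invocation sketch for this example. The CLI wrapper below is hypothetical: the argument names simply mirror the attributes do_convert reads (file, net1, net2, savepath), and the logdir paths are assumptions.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('file', help='source wav to convert')
parser.add_argument('--net1', help='Net1 checkpoint name inside logdir1')
parser.add_argument('--net2', help='Net2 checkpoint name inside logdir2')
parser.add_argument('--savepath', default='./outputs/')
args = parser.parse_args()

# Hypothetical log directories for the two training stages.
do_convert(args, logdir1='./logdir/train1', logdir2='./logdir/train2')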
Example 2
import numpy as np

# get_mfccs_and_spectrogram, denormalize_db, db2amp, spec2wav and hp are
# project-specific helpers.


def predict(self, path_to_wav):
    x_mfcc, y_spec, mel = get_mfccs_and_spectrogram(path_to_wav)
    pred_spec, y_spec, ppgs = self.predictor(x_mfcc, y_spec, mel)

    # Undo the dB normalization applied during preprocessing.
    pred_spec = denormalize_db(pred_spec, hp.default.max_db, hp.default.min_db)
    y_spec = denormalize_db(y_spec, hp.default.max_db, hp.default.min_db)

    # dB -> linear magnitude, then apply magnitude emphasis.
    pred_spec = db2amp(pred_spec)
    y_spec = db2amp(y_spec)
    pred_spec = np.power(pred_spec, hp.convert.emphasis_magnitude)
    y_spec = np.power(y_spec, hp.convert.emphasis_magnitude)

    # Spectrogram to waveform. The original used np.array(map(...)), which
    # yields a useless 0-d object array on Python 3; list comprehensions
    # are equivalent and version-safe.
    audio = np.array([
        spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                 hp.default.hop_length, hp.default.n_iter)
        for spec in pred_spec
    ])
    y_audio = np.array([
        spec2wav(spec.T, hp.default.n_fft, hp.default.win_length,
                 hp.default.hop_length, hp.default.n_iter)
        for spec in y_spec
    ])
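
predict is an instance method, so it assumes a surrounding class that builds self.predictor (not shown in the snippet). A hedged usage sketch, with a hypothetical ConvertNet wrapper standing in for that class:

# ConvertNet is hypothetical; it is assumed to construct self.predictor
# (e.g. a tensorpack OfflinePredictor) in its __init__.
converter = ConvertNet()
converter.predict('./samples/source.wav')  # path is illustrative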
Example 3
import os

import numpy as np


def generate_npz(wav_files):
    for wav_file in wav_files:
        # Swap only the extension; str.replace('wav', 'npz') would also
        # rewrite any 'wav' occurring elsewhere in the path.
        f_name = os.path.splitext(wav_file)[0] + '.npz'
        mfccs, mag_db, mel_db = get_mfccs_and_spectrogram(wav_file)
        np.savez(f_name, mfccs=mfccs, mag_db=mag_db, mel_db=mel_db)
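
A minimal usage sketch, assuming the training wavs live under a hypothetical ./data directory:

import glob

# Collect every wav in the dataset and cache its features as .npz files.
wav_files = sorted(glob.glob('./data/**/*.wav', recursive=True))
generate_npz(wav_files)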
Example 4
import datetime
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from PIL import Image
from tensorpack.predict import OfflinePredictor, PredictConfig
from tensorpack.tfutils.sessinit import ChainInit, SaverRestore

# CKPT_DIR, hp, data_load, init_predictor, Net2ForConvert, convert,
# write_wav, load, f0_adapt and the get_eval_* helpers are
# project-specific.


def get_network_output(
    wav,
    ckpt_dir=CKPT_DIR,
    out_path_fmt="extern/deep_voice_conversion/outputs/test2_{:04d}_{:04d}.png"
):
    """Computes PPGs (phonetic posteriorgrams) for loaded wav audio.

    The input is split into two-second clips, each clip is run through the
    'Net1' phoneme classifier, and each resulting PPG is written as an
    image to a path built from out_path_fmt.

    :param wav: Loaded wav audio as a 1-D sample array.
    :param ckpt_dir: Directory holding the pretrained 'Net1' weights.
    :param out_path_fmt: Format template for each per-clip output path.
    """
    assert os.path.isdir(ckpt_dir)

    # Make sure the output directory exists before writing heatmaps.
    out_dir = os.path.dirname(out_path_fmt)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # Initialize Offline Predictor.
    predictor = init_predictor(ckpt_dir)

    # Split wav into 2-second clips.
    length = hp.default.sr * hp.default.duration
    splits = list(range(length, wav.shape[0], length))
    wavs = np.array_split(wav, splits, axis=0)

    print("Original wav length is", len(wav), "with sample rate",
          hp.default.sr)
    print("Length of wavs is ", [len(x) for x in wavs])

    # mfcc_batch: [b=num_splits, time/length, feats]
    mfcc_batch = np.array(
        [data_load.get_mfccs_and_spectrogram(wav=wav_)[0] for wav_ in wavs])
    print("Length of mfccs is ", [len(x) for x in mfcc_batch])

    # inp.shape: [b=num_splits, time/length, feats]
    # ppgs: (N, T, V);
    preds = predictor(mfcc_batch)
    assert len(preds) == 1
    ppgs = preds[0]
    print("Length of ppgs is ", [len(x) for x in ppgs])

    # Output each ppg.
    heatmaps = []
    for i, heatmap in enumerate(ppgs):
        assert 0 <= np.min(heatmap) <= np.max(heatmap) <= 1.0
        out_path = out_path_fmt.format(i, len(splits) - 1)
        print("Writing heatmap '{}' of shape '{}' to '{}'".format(
            i, heatmap.shape, out_path))

        # scipy.misc.toimage is deprecated, so do the conversion explicitly
        # with PIL: the heatmap's asserted [0, 1] range is scaled to
        # [0, 255] and written as a grayscale image.
        heatmap = 255 * heatmap
        im = Image.fromarray(heatmap.astype(np.uint8))

        # Save the image to disk.
        im.save(out_path)

        # Accumulate heatmap.
        heatmaps.append(heatmap)
    return heatmaps
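
A hedged invocation sketch for get_network_output, assuming librosa for loading; the sample path is hypothetical, and hp.default.sr is the project sample rate used above:

import librosa

# Load mono audio at the sample rate the Net1 features expect.
wav, _ = librosa.load('./samples/test.wav', sr=hp.default.sr)
heatmaps = get_network_output(wav)
print("Computed {} two-second PPG heatmaps".format(len(heatmaps)))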
def do_convert(args, logdir1, logdir2):
    print("do_convert")
    # Resolve the checkpoints to restore: the latest Net1 checkpoint from
    # logdir1, and either a specific Net2 checkpoint (args.ckpt) or the
    # latest one from logdir2.
    ckpt1 = tf.train.latest_checkpoint(logdir1)
    ckpt2 = ('{}/{}'.format(logdir2, args.ckpt) if args.ckpt
             else tf.train.latest_checkpoint(logdir2))
    model = Net2ForConvert()

    session_inits = []
    if ckpt2:
        session_inits.append(SaverRestore(ckpt2))
    if ckpt1:
        session_inits.append(SaverRestore(ckpt1, ignore=['global_step']))
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    print("PredictConfig")
    pred_conf = PredictConfig(model=model,
                              input_names=get_eval_input_names(),
                              output_names=get_eval_output_names(),
                              session_init=ChainInit(session_inits))

    print("OfflinePredictor")
    set_env_s = datetime.datetime.now()
    predictor = OfflinePredictor(pred_conf)
    set_env_e = datetime.datetime.now()
    set_env_t = set_env_e - set_env_s
    print("Setting Environment time:{}s".format(set_env_t.seconds))

    while True:
        input_name = input("Write your audio file's path for converting : ")
        if input_name == 'quit':
            break
        elif not glob.glob(input_name):
            print("That audio file doesn't exist! Try something else.")
            continue

        convert_s = datetime.datetime.now()
        mfcc, spec, mel_spec = get_mfccs_and_spectrogram(input_name,
                                                         trim=False,
                                                         isConverting=True)
        mfcc = np.expand_dims(mfcc, axis=0)
        spec = np.expand_dims(spec, axis=0)
        mel_spec = np.expand_dims(mel_spec, axis=0)
        output_audio, ppgs = convert(predictor, mfcc, spec, mel_spec)

        input_audio, samplerate = load(input_name,
                                       sr=hp.default.sr,
                                       dtype=np.float64)
        """
        # F0 adaptation with WORLD Vocoder
        f0_conv_s = datetime.datetime.now()
        output_audio = f0_adapt(input_audio, output_audio, logdir2, samplerate)
        f0_conv_e = datetime.datetime.now()
        f0_conv_time = f0_conv_e - f0_conv_s
        print("F0 Adapting Time:{}s".format(f0_conv_time.seconds))
        """

        # Save the voice-converted audio as a 32-bit float wav file;
        # basename guards against input paths that include directories.
        output_audio = output_audio.astype(np.float32)
        out_name = os.path.basename(input_name)
        write_wav(path=os.path.join("./converted", out_name),
                  y=output_audio,
                  sr=hp.default.sr)

        # Save the PPG data as a grayscale image and a raw .npy file.
        ppgs = np.squeeze(ppgs, axis=0)
        plt.imsave(os.path.join('./converted/debug', out_name + '.png'),
                   ppgs,
                   cmap='binary')
        np.save(os.path.join('./converted/debug', out_name + '.npy'), ppgs)

        convert_e = datetime.datetime.now()
        convert_time = convert_e - convert_s
        print("Total Converting Time:{}s".format(convert_time.seconds))