Exemple #1
0
def main(_):
    """Export every recording from the reader as a WAV file plus an index.

    Each element produced by `ByteWavWholeReader` is passed through
    `mu_law_decode`, encoded as WAV with a sample rate argument of 16000 and
    written to `<args.output_dir>/<name>.wav`. A `name, length` row per file
    is written to `vctk.csv` in the current working directory.

    Args:
        _: Unused; configuration is read from the global `args`.
    """
    tf.gfile.MkDir(args.output_dir)

    data = ByteWavWholeReader(speaker_list=txt2list(args.speaker_list),
                              filenames=tf.gfile.Glob(args.file_pattern),
                              num_epoch=1)

    XNOM = data.f[0]
    XWAV = tf.expand_dims(mu_law_decode(data.x[0, :]), -1)
    XBIN = tf.contrib.ffmpeg.encode_audio(XWAV, 'wav', 16000)

    # The fetches never change, so the dict is built once instead of per step.
    fetch = {'xbin': XBIN, 'xwav': XWAV, 'wav_name': XNOM}

    sess_config = tf.ConfigProto(allow_soft_placement=True,
                                 gpu_options=tf.GPUOptions(allow_growth=True))
    with tf.Session(config=sess_config) as sess:
        sess.run(tf.tables_initializer())
        sess.run(data.iterator.initializer)
        counter = 1
        # `with` closes (and flushes) the index file even when a step raises;
        # the handle is not called `csv` to avoid shadowing the stdlib module.
        with open('vctk.csv', 'w') as csv_file:
            while True:
                try:
                    result = sess.run(fetch)
                except tf.errors.OutOfRangeError:
                    # The single epoch requested from the reader is exhausted.
                    print('\nEpoch complete')
                    break

                wav_name = result['wav_name'].decode('utf8')
                print('\rFile {:05d}: Processing {}'.format(counter, wav_name),
                      end='')
                csv_file.write(
                    '{}, {:d}\n'.format(wav_name, len(result['xwav'])))
                filename = os.path.join(args.output_dir, wav_name) + '.wav'
                with open(filename, 'wb') as fp:
                    fp.write(result['xbin'])
                counter += 1
        print()
Exemple #2
0
def main(unused_args):
    """Serialize every audio file matching `args.file_pattern` to a TFRecord.

    NOTE: the directory structure must be [args.dir_to_wav]/[Set]/[speakers]

    Each file is loaded with librosa, passed through `mu_law_encode`, combined
    with the text returned by `read_text` and written to
    `<args.output_dir>/<speaker>/<basename>.tfr`, where the speaker is the part
    of the file's basename before the first underscore.

    Args:
        unused_args: Ignored; configuration is read from the global `args`.

    Raises:
        ValueError: If `args.output_dir` is empty or unset.
    """
    if not args.output_dir:
        raise ValueError('`output_dir` (output dir) should be specified')

    print('[WARNING] Protobuf is super slow (~7 examples per sec). \n'
          'This could take 2 hours or more.')

    reader = tf.WholeFileReader()
    files = tf.gfile.Glob(args.file_pattern)
    filename_queue = tf.train.string_input_producer(files,
                                                    num_epochs=1,
                                                    shuffle=False)

    # Only the key is fetched below (it is used as the file name); the file
    # contents returned by the reader are not used.
    key, _ = reader.read(filename_queue)
    # Disabled in-graph alternative to the librosa path used below:
    #   wav = tf.contrib.ffmpeg.decode_audio(val, args.ext, args.fs, 1)
    #   wav = tf.reshape(wav, [-1, ])
    #   mulaw = mu_law_encode(wav)

    # Create one output directory per speaker up front.
    for s in txt2list(args.speaker_list):
        tf.gfile.MakeDirs(join(args.output_dir, s))

    counter = 1
    N = len(files)
    with tf.train.MonitoredSession() as sess:
        while not sess.should_stop():
            filename = sess.run(key).decode('utf8')
            # The sample rate returned by librosa is not needed here.
            waveform, _ = librosa.load(filename)
            x_int = mu_law_encode(waveform)

            text = read_text(filename)

            # '<dir>/<speaker>_<id>.<ext>' -> basename '<speaker>_<id>'
            stem, _ = splitext(filename)
            _, basename = split(stem)
            speaker = basename.split('_')[0]

            ex = make_mu_law_speaker_length(x_int, speaker, text, basename)

            # Context manager so the writer is closed even if `write` raises.
            record_path = join(args.output_dir, speaker,
                               '{}.tfr'.format(basename))
            with tf.python_io.TFRecordWriter(record_path) as writer:
                writer.write(ex.SerializeToString())

            print('\rFile {:5d}/{:5d}: {}'.format(counter, N, basename),
                  end='')
            counter += 1

    print()
def main(unused_args):
    """Build and save a usage histogram of the encoder's exemplar indices.

    The model described by the first `arch*.json` found in `args.logdir` is
    restored from its checkpoint, every input is encoded, and the number of
    times each index appears in the first batch row is accumulated. The counts
    are dumped to `histogram.npf` and plotted (log10 scale) to `histogram.png`.

    Args:
        unused_args: Ignored; configuration is read from the global `args`.

    Raises:
        ValueError: If `args.logdir` is None.
    """
    if args.logdir is None:
        raise ValueError('Please specify the dir to the checkpoint')

    arch_files = tf.gfile.Glob(join(args.logdir, 'arch*.json'))
    arch = json2dict(arch_files[0])

    net = VQVAE(arch)

    speakers = txt2list(args.speaker_list)
    input_files = tf.gfile.Glob(args.file_pattern)
    data = ByteWavWholeReader(speaker_list=speakers, filenames=input_files)

    ZH = net.encode(data.x, args.mode)

    # Restore each trainable variable from the name its moving average would
    # be stored under.
    ema = tf.train.ExponentialMovingAverage(decay=0.995)
    trg_vars = {}
    for variable in tf.trainable_variables():
        trg_vars[ema.average_name(variable)] = variable
    saver = tf.train.Saver(trg_vars)

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess_config = tf.ConfigProto(allow_soft_placement=True,
                                 gpu_options=gpu_options)
    with tf.Session(config=sess_config) as sess:
        sess.run(tf.tables_initializer())
        sess.run(data.iterator.initializer)
        sess.run(tf.global_variables_initializer())
        load(saver, sess, args.logdir, ckpt=args.ckpt)

        hist = np.zeros([arch['num_exemplar']], dtype=np.int64)
        counter = 1
        while True:
            try:
                z_ids = sess.run(ZH)
            except tf.errors.OutOfRangeError:
                # Input exhausted: terminate the progress line and stop.
                print()
                break

            print('\rNum of processed files: {:d}'.format(counter), end='')
            counter += 1
            # Only the first batch row is counted (batch size is 1).
            for exemplar_id in z_ids[0]:
                hist[exemplar_id] += 1

        with open('histogram.npf', 'wb') as fp:
            hist.tofile(fp)

        # +1 keeps never-used exemplars at log10(1) = 0 instead of -inf.
        log_counts = np.log10(hist + 1)
        plt.figure(figsize=[10, 2])
        plt.plot(log_counts, '.')
        plt.xlim([0, arch['num_exemplar'] - 1])
        plt.ylabel('log-frequency')
        plt.xlabel('exemplar index')
        plt.savefig('histogram.png')
        plt.close()
Exemple #4
0
def main(_):
  """Set up the VQVAE from the command-line arguments and train it.

  Args:
    _: Unused; configuration is read from the global `args`.
  """
  speakers = txt2list(args.speaker_list)
  log_dirs = validate_log_dirs(args)

  # Architecture = JSON file + validated log dirs + requested checkpoint.
  arch = json2dict(args.arch)
  for extra in (log_dirs, {'ckpt': args.ckpt}):
    arch.update(extra)
  copy_arch_file(args.arch, arch['logdir'])

  net = VQVAE(arch)
  n_pad = net.n_padding()
  seconds = n_pad / arch['fs']
  print('Receptive field: {} samples ({:.2f} sec)\n'.format(n_pad, seconds))

  reader_kwargs = {
    'T': arch['T'],
    'batch_size': arch['training']['batch_size'],
    'buffer_size': 5000,
  }
  data = ByteWavReader(speakers, args.file_pattern, **reader_kwargs)
  net.train(data)
Exemple #5
0
def main(_):
    """Build the VQVAE described by `args.arch` and train it on the input data.

    Args:
        _: Unused; configuration is read from the global `args`.
    """
    # Gather the configuration: speakers, log directories, architecture.
    speakers = txt2list(args.speaker_list)
    log_dirs = validate_log_dirs(args)
    arch = json2dict(args.arch)
    arch.update(log_dirs)
    arch['ckpt'] = args.ckpt
    copy_arch_file(args.arch, arch['logdir'])

    # Instantiate the model and report its padding, in samples and seconds.
    net = VQVAE(arch)
    n_samples = net.n_padding()
    duration = n_samples / arch['fs']
    message = 'Receptive field: {} samples ({:.2f} sec)'
    print(message.format(n_samples, duration))

    # Input pipeline over the files selected on the command line.
    segment_length = arch['T']
    batch_size = arch['training']['batch_size']
    data = ByteWavReader(
        speakers,
        args.file_pattern,
        T=segment_length,
        batch_size=batch_size,
        buffer_size=5000,
    )

    # Hand the pipeline to the model's training loop.
    net.train(data)
Exemple #6
0
def main(unused_args):
    """Generate waveforms sample by sample from a trained VQVAE.

    Two hard-coded TFRecord files are read, cropped to a common window and
    stacked into a batch of four rows: each source appears twice, paired once
    with its own `y` and once with the other file's `y` (presumably speaker
    labels -- confirm against `ByteWavWholeReader`). The model is primed with
    the first `net.n_padding() + 1` samples of each row and then extended one
    sample at a time. Results are written to `testwav-<row>.wav` in the
    current working directory.

    When `args.period > 0` the whole generation is repeated, reloading the
    checkpoint each round, after sleeping `args.period * 60` seconds; a
    KeyboardInterrupt during the sleep stops the loop.

    Args:
        unused_args: Ignored; configuration is read from the global `args`.

    Raises:
        ValueError: If `args.logdir` is None.
    """
    if args.logdir is None:
        raise ValueError('Please specify the dir to the checkpoint')

    speaker_list = txt2list(args.speaker_list)

    arch = tf.gfile.Glob(os.path.join(args.logdir, 'arch*.json'))[0]
    arch = json2dict(arch)

    net = VQVAE(arch)

    # they start roughly at the same position but end very differently (3 is longest)
    filenames = [
        'dataset/VCTK/tfr/p227/p227_363.tfr',
        # 'dataset/VCTK/tfr/p240/p240_341.tfr',
        # 'dataset/VCTK/tfr/p243/p243_359.tfr',
        'dataset/VCTK/tfr/p225/p225_001.tfr'
    ]
    data = ByteWavWholeReader(speaker_list, filenames)

    X = tf.placeholder(dtype=tf.int64, shape=[None, None])
    Y = tf.placeholder(dtype=tf.int64, shape=[None])
    ZH = net.encode(X, args.mode)
    XH = net.generate(X, ZH, Y)

    # Restore each trainable variable from the name its moving average would
    # be stored under.
    ema = tf.train.ExponentialMovingAverage(decay=0.995)
    trg_vars = {ema.average_name(v): v for v in tf.trainable_variables()}
    saver = tf.train.Saver(trg_vars)

    # NOTE: `logdir` is created but the wav files below are currently written
    # to the working directory, not into it.
    logdir = get_default_logdir(args.logdir)
    tf.gfile.MkDir(logdir)

    # Built once: creating this op inside the generation loop would add a new
    # node to the graph on every round.
    init_op = tf.global_variables_initializer()

    sess_config = tf.ConfigProto(allow_soft_placement=True,
                                 gpu_options=tf.GPUOptions(allow_growth=True))
    with tf.Session(config=sess_config) as sess:
        sess.run(tf.tables_initializer())
        sess.run(data.iterator.initializer)

        # One fetch per input file.
        results = [sess.run({'x': data.x, 'y': data.y}) for _ in filenames]

        length_input = net.n_padding() + 1  # same as padding + 1

        # Hand-picked crop window (sample indices); presumably tuned to the
        # utterances listed above -- confirm before changing `filenames`.
        ini = 15149 - length_input
        end = 42285
        for result in results:
            x = result['x']
            if x.shape[-1] < end:
                # Pad short inputs up to `end` by repeating their first sample
                # value (np.zeros is float, so the padded array is upcast).
                x = np.concatenate(
                    [x, x[0, 0] + np.zeros([1, end - x.shape[-1]])], -1)
            result['x'] = x[:, ini:end]

        x_source = np.concatenate([
            results[0]['x'], results[0]['x'], results[1]['x'], results[1]['x']
        ], 0)

        B = x_source.shape[0]

        y_input = np.concatenate([
            results[0]['y'], results[1]['y'], results[1]['y'], results[0]['y']
        ], 0)

        length_target = x_source.shape[1] - length_input

        while True:
            sess.run(init_op)
            load(saver, sess, args.logdir, ckpt=args.ckpt)

            z_blend = sess.run(ZH, feed_dict={X: x_source})
            # Copy: the window is shifted in place below, and a plain slice
            # would be a view that corrupts `x_source` for the next periodic
            # round.
            x_input = x_source[:, :length_input].copy()

            z_input = z_blend[:, :length_input, :]

            # Allocated before the `try` so the `finally` clause can always
            # decode whatever has been generated so far.
            x_gen = np.zeros([B, length_target], dtype=np.int64)

            # Generate
            try:
                for i in range(length_target):
                    xh = sess.run(XH,
                                  feed_dict={
                                      X: x_input,
                                      ZH: z_input,
                                      Y: y_input
                                  })
                    # Slide both windows forward by one step and append the
                    # newly generated sample.
                    z_input = z_blend[:, i + 1:i + 1 + length_input, :]
                    x_input[:, :-1] = x_input[:, 1:]
                    x_input[:, -1] = xh[:, -1]
                    x_gen[:, i] = xh[:, -1]
                    print('\rGenerating {:5d}/{:5d}... x={:3d}'.format(
                        i + 1, length_target, xh[0, -1]),
                          end='',
                          flush=True)
            except KeyboardInterrupt:
                print("Interrupted by the user.")
            finally:
                # Runs after an interrupt too, so partial output is saved.
                print()
                x_wav = mu_law_decode(x_gen)
                for i in range(x_wav.shape[0]):
                    x_1ch = np.expand_dims(x_wav[i], -1)
                    librosa.output.write_wav('testwav-{}.wav'.format(i), x_1ch,
                                             arch['fs'])

            # For periodic gen.
            if args.period > 0:
                try:
                    print('Sleep for a while')
                    sleep(args.period * 60)
                    logdir = get_default_logdir(args.logdir)
                    tf.gfile.MkDir(logdir)
                except KeyboardInterrupt:
                    print('Stop periodic gen.')
                    break
                finally:
                    print('all finished')
            else:
                break