def test_librispeech():
    """Prime a full-window WaveNet on a LibriSpeech batch and synthesize audio.

    Restores the newest checkpoint under ``ckpt_path`` (if any), then
    autoregressively extends a primed window out to ``total_length`` samples
    and writes one ``synthesis-{i}.wav`` file per batch item at 16 kHz.
    """
    # Hyperparameters must match the checkpoint encoded into ckpt_path below.
    batch_size = 24
    filter_length = 2
    n_stages = 7
    n_layers_per_stage = 9
    n_hidden = 48
    n_skip = 384
    total_length = 16000
    # Window size of the dilated stack; the prime fills exactly one window.
    sequence_length = get_sequence_length(n_stages, n_layers_per_stage)
    prime_length = sequence_length
    ckpt_path = 'wavenet/wavenet_filterlen{}_batchsize{}_sequencelen{}_stages{}_layers{}_hidden{}_skips{}/'.format(
        filter_length, batch_size, sequence_length, n_stages,
        n_layers_per_stage, n_hidden, n_skip)
    dataset = librispeech.get_dataset()
    # batch_generator yields tuples; element [0] is the audio batch.
    batch = next(librispeech.batch_generator(dataset, batch_size, prime_length))[0]
    sess = tf.Session()
    net = create_wavenet(
        batch_size=batch_size,
        filter_length=filter_length,
        n_hidden=n_hidden,
        n_skip=n_skip,
        n_layers_per_stage=n_layers_per_stage,
        n_stages=n_stages,
        shift=False)
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
    saver = tf.train.Saver()
    if tf.train.latest_checkpoint(ckpt_path) is not None:
        saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
    else:
        # Best-effort: synthesis proceeds with freshly initialized weights.
        print('Could not find checkpoint')
    synth = np.zeros([batch_size, total_length], dtype=np.float32)
    synth[:, :prime_length] = batch
    print('Synthesize...')
    for sample_i in range(0, total_length - prime_length):
        print('{}/{}/{}'.format(sample_i, prime_length, total_length),
              end='\r')
        # Feed a sliding window that ends just before the sample to predict.
        probs = sess.run(net["probs"], feed_dict={
            net["X"]: synth[:, sample_i:sample_i + sequence_length]
        })
        # idxs are sampled mu-law class indices, presumably in 0..255
        # (inv_mu_law_numpy is applied to idxs - 128) — TODO confirm.
        idxs = sample_categorical(probs)
        idxs = idxs.reshape((batch_size, sequence_length))
        if sample_i == 0:
            # First step: replace the entire prime with the model's own
            # decoded output for the window (consistent with shift=False,
            # where output t appears to correspond to input t — verify).
            audio = wnu.inv_mu_law_numpy(idxs - 128)
            synth[:, :prime_length] = audio
        else:
            # Subsequent steps: keep only the final predicted sample.
            audio = wnu.inv_mu_law_numpy(idxs[:, -1] - 128)
            synth[:, prime_length + sample_i] = audio
    for i in range(batch_size):
        wavfile.write('synthesis-{}.wav'.format(i), 16000, synth[i])
def test_librispeech():
    """Prime a full-window WaveNet on a LibriSpeech batch and synthesize audio.

    NOTE(review): this is a near-verbatim redefinition of the
    ``test_librispeech`` immediately above; only the later definition in the
    module is ever visible to callers. Consider deleting one copy.

    Restores the newest checkpoint under ``ckpt_path`` (if any), then
    autoregressively extends a primed window out to ``total_length`` samples
    and writes one ``synthesis-{i}.wav`` file per batch item at 16 kHz.
    """
    # Hyperparameters must match the checkpoint encoded into ckpt_path below.
    batch_size = 24
    filter_length = 2
    n_stages = 7
    n_layers_per_stage = 9
    n_hidden = 48
    n_skip = 384
    total_length = 16000
    # Window size of the dilated stack; the prime fills exactly one window.
    sequence_length = get_sequence_length(n_stages, n_layers_per_stage)
    prime_length = sequence_length
    ckpt_path = 'wavenet/wavenet_filterlen{}_batchsize{}_sequencelen{}_stages{}_layers{}_hidden{}_skips{}/'.format(
        filter_length, batch_size, sequence_length, n_stages,
        n_layers_per_stage, n_hidden, n_skip)
    dataset = librispeech.get_dataset()
    # batch_generator yields tuples; element [0] is the audio batch.
    batch = next(
        librispeech.batch_generator(dataset, batch_size, prime_length))[0]
    sess = tf.Session()
    net = create_wavenet(
        batch_size=batch_size,
        filter_length=filter_length,
        n_hidden=n_hidden,
        n_skip=n_skip,
        n_layers_per_stage=n_layers_per_stage,
        n_stages=n_stages,
        shift=False)
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
    saver = tf.train.Saver()
    if tf.train.latest_checkpoint(ckpt_path) is not None:
        saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
    else:
        # Best-effort: synthesis proceeds with freshly initialized weights.
        print('Could not find checkpoint')
    synth = np.zeros([batch_size, total_length], dtype=np.float32)
    synth[:, :prime_length] = batch
    print('Synthesize...')
    for sample_i in range(0, total_length - prime_length):
        print('{}/{}/{}'.format(sample_i, prime_length, total_length),
              end='\r')
        # Feed a sliding window that ends just before the sample to predict.
        probs = sess.run(
            net["probs"],
            feed_dict={net["X"]: synth[:, sample_i:sample_i + sequence_length]})
        # idxs are sampled mu-law class indices, presumably in 0..255
        # (inv_mu_law_numpy is applied to idxs - 128) — TODO confirm.
        idxs = sample_categorical(probs)
        idxs = idxs.reshape((batch_size, sequence_length))
        if sample_i == 0:
            # First step: replace the entire prime with the model's own
            # decoded output for the window (consistent with shift=False,
            # where output t appears to correspond to input t — verify).
            audio = wnu.inv_mu_law_numpy(idxs - 128)
            synth[:, :prime_length] = audio
        else:
            # Subsequent steps: keep only the final predicted sample.
            audio = wnu.inv_mu_law_numpy(idxs[:, -1] - 128)
            synth[:, prime_length + sample_i] = audio
    for i in range(batch_size):
        wavfile.write('synthesis-{}.wav'.format(i), 16000, synth[i])
def test_librispeech():
    """Prime a fast-generation WaveNet on LibriSpeech and synthesize audio.

    Uses the incremental (queue-based) generation model: one sample is fed per
    step and the per-layer states are advanced via ``push_ops``. After the
    prime is consumed, each sampled prediction is written back into ``synth``
    and one ``synthesis-{i}.wav`` per batch item is saved at 16 kHz.

    NOTE(review): this redefines ``test_librispeech`` from earlier in the
    file; only this later definition is visible to the test runner.
    """
    prime_length = 6144
    total_length = 16000 * 3
    # Hyperparameters must match the checkpoint encoded into ckpt_path below.
    batch_size = 32
    n_stages = 6
    n_layers_per_stage = 9
    n_hidden = 32
    filter_length = 2
    n_skip = 256
    onehot = False
    sequence_length = get_sequence_length(n_stages, n_layers_per_stage)
    ckpt_path = 'vctk-wavenet/wavenet_filterlen{}_batchsize{}_sequencelen{}_stages{}_layers{}_hidden{}_skips{}/'.format(
        filter_length, batch_size, sequence_length, n_stages,
        n_layers_per_stage, n_hidden, n_skip)
    dataset = librispeech.get_dataset()
    # batch_generator yields tuples; element [0] is the audio batch.
    batch = next(
        librispeech.batch_generator(dataset, batch_size, prime_length))[0]
    with tf.Graph().as_default(), tf.Session() as sess:
        net = create_generation_model(
            batch_size=batch_size,
            filter_length=filter_length,
            n_hidden=n_hidden,
            n_skip=n_skip,
            n_layers_per_stage=n_layers_per_stage,
            n_stages=n_stages,
            onehot=onehot)
        saver = tf.train.Saver()
        if tf.train.latest_checkpoint(ckpt_path) is not None:
            saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
        else:
            # Best-effort: synthesis proceeds with uninitialized weights.
            print('Could not find checkpoint')
        # Reset the generation queues before feeding the prime.
        sess.run(net['init_ops'])
        synth = np.zeros([batch_size, total_length], dtype=np.float32)
        synth[:, :prime_length] = batch
        print('Synthesize...')
        for sample_i in range(total_length - 1):
            print('{}/{}/{}'.format(sample_i, prime_length, total_length),
                  end='\r')
            # Feed a single sample; push_ops advances the cached layer state.
            probs = sess.run(
                [net["probs"], net["push_ops"]],
                feed_dict={net["X"]: synth[:, [sample_i]]})[0]
            idxs = sample_categorical(probs)
            audio = wnu.inv_mu_law_numpy(idxs - 128)
            # BUG FIX: the guard was `sample_i >= prime_length`, which
            # discarded the prediction made while feeding the last prime
            # sample, so synth[:, prime_length] stayed at its zero
            # initialization and that zero was fed back to the model.
            # Write every prediction for positions >= prime_length instead.
            if sample_i >= prime_length - 1:
                synth[:, sample_i + 1] = audio
        for i in range(batch_size):
            wavfile.write('synthesis-{}.wav'.format(i), 16000, synth[i])
def train_librispeech():
    """Train a WaveNet generation model on the LibriSpeech corpus.

    Builds the graph in a fresh session, restores the newest checkpoint when
    one exists, then loops over epochs of fixed-length batches. The loss is
    printed every iteration; summaries are logged and a checkpoint is saved
    every 100 iterations.
    """
    dataset = librispeech.get_dataset(convert_to_wav=False)
    step = 0
    n_epochs = 10000
    # Hyperparameters are baked into ckpt_path so distinct configurations
    # checkpoint into distinct directories.
    batch_size = 32
    n_stages = 6
    n_layers_per_stage = 9
    n_hidden = 32
    filter_length = 2
    n_skip = 256
    sequence_length = get_sequence_length(n_stages, n_layers_per_stage)
    ckpt_path = 'vctk-wavenet/wavenet_filterlen{}_batchsize{}_sequencelen{}_stages{}_layers{}_hidden{}_skips{}/'.format(
        filter_length, batch_size, sequence_length, n_stages,
        n_layers_per_stage, n_hidden, n_skip)
    with tf.Graph().as_default() as g, tf.Session(graph=g) as sess:
        model = create_generation_model(
            n_stages=n_stages,
            n_layers_per_stage=n_layers_per_stage,
            n_hidden=n_hidden,
            batch_size=batch_size,
            n_skip=n_skip,
            filter_length=filter_length)
        train_op = tf.train.AdamOptimizer(
            learning_rate=0.00001).minimize(model['loss'])
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        writer = tf.summary.FileWriter(ckpt_path)
        latest = tf.train.latest_checkpoint(ckpt_path)
        if latest is not None:
            saver.restore(sess, latest)
        for _ in range(n_epochs):
            # Second element of each batch tuple is unused during training.
            for xs, _hs in librispeech.batch_generator(
                    dataset, batch_size, sequence_length):
                loss, _ = sess.run([model['loss'], train_op],
                                   feed_dict={model['X']: xs})
                print(loss)
                if step % 100 == 0:
                    summary = sess.run(model['summaries'],
                                       feed_dict={model['X']: xs})
                    writer.add_summary(summary, step)
                    saver.save(sess,
                               os.path.join(ckpt_path, 'model.ckpt'),
                               global_step=step)
                step += 1
def test_librispeech_batch_generator():
    """The default batch generator yields (32, 6144)-shaped audio batches."""
    dataset = librispeech.get_dataset()
    first = next(librispeech.batch_generator(dataset, batch_size=32))
    expected_shape = (32, 6144)
    assert first[0].shape == expected_shape
def test_librispeech_dataset():
    """With wav conversion enabled, the dataset index has 106717 entries."""
    dataset = librispeech.get_dataset(convert_to_wav=True)
    n_entries = len(dataset)
    assert n_entries == 106717
def test_librispeech_batch():
    """A batch is a pair: (32, 6144) audio plus a length-32 companion array."""
    dataset = librispeech.get_dataset()
    audio, labels = next(librispeech.batch_generator(dataset, batch_size=32))
    assert audio.shape == (32, 6144)
    assert labels.shape == (32,)
def test_librispeech_dataset():
    """The default LibriSpeech dataset index has 106717 entries.

    NOTE(review): this redefines ``test_librispeech_dataset`` from earlier in
    the file; only this later definition is visible to the test runner.
    """
    dataset = librispeech.get_dataset()
    n_entries = len(dataset)
    assert n_entries == 106717