def nsynth_to_melspec(dataset, hparams, stats=None):
    # Optionally filter the dataset by instrument family and source
    if 'instrument' in hparams and hparams['instrument'] is not None:
        instrument = hparams['instrument']
        if 'family' in instrument and instrument['family'] is not None:
            dataset = pro.filter(
                instrument_families_filter(instrument['family']))(dataset)
        if 'source' in instrument and instrument['source'] is not None:
            dataset = pro.filter(
                instrument_sources_filter(instrument['source']))(dataset)

    # One-hot encode the pitch as the condition vector
    dataset = pro.index_map(
        'pitch',
        pro.pipeline([
            pro.map_transform(lambda x: x - 24),
            pro.one_hot(hparams['cond_vector_size']),
            pro.map_transform(lambda x: tf.cast(x, tf.float32)),
        ]))(dataset)

    # Create the preprocessing pipeline for the mel spectrograms
    dataset = pro.index_map(
        'audio',
        pro.pipeline([
            pro.melspec(sr=hparams['sample_rate']),
            pro.pad([[0, 0], [0, 2]],
                    'CONSTANT',
                    constant_values=hparams['log_amin']),
        ]))(dataset)

    if stats is not None:
        dataset = pro.index_map(
            'audio', pro.normalize(normalization='specgan',
                                   stats=stats))(dataset)

    return dataset
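# A minimal usage sketch of nsynth_to_melspec, assuming the tfds
# 'nsynth/gansynth_subset' split used elsewhere in this repo; the
# hparams values and the 'gan_stats.npz' stats file are illustrative
# assumptions, not project defaults.
def example_melspec_dataset():
    hparams = {
        'sample_rate': 16000,
        'log_amin': 1e-6,
        'cond_vector_size': 61,  # illustrative; must match the model
    }
    stats = np.load('gan_stats.npz')
    dataset = tfds.load('nsynth/gansynth_subset', split='train')
    return nsynth_to_melspec(dataset, hparams, stats=stats)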
def generate(hparams):
    # Event vocabulary: four ranges of 128 values each
    input_vocab_size = 128 + 128 + 128 + 128
    target_vocab_size = 128 + 128 + 128 + 128
    dataset = tf.data.Dataset.list_files(
        '/home/big/datasets/maestro-v2.0.0/**/*.midi')
    dataset_single = pro.pipeline([
        pro.midi(),
        pro.frame(hparams['frame_size'], hparams['frame_size'], True),
        pro.unbatch(),
    ])(dataset).skip(16000).as_numpy_iterator()
    transformer = Transformer(input_vocab_size=input_vocab_size,
                              target_vocab_size=target_vocab_size,
                              pe_input=input_vocab_size,
                              pe_target=target_vocab_size,
                              hparams=hparams)
    trainer = Trainer(dataset, hparams)
    ckpt = tf.train.Checkpoint(step=trainer.step,
                               transformer=transformer,
                               optimizer=transformer.optimizer)
    trainer.init_checkpoint(ckpt)
    return generate_from_model(hparams, transformer, dataset_single)
def generate(hparams, seed, pitches):
    gan_stats = np.load('gan_stats.npz')
    gan = GAN((256, 128), hparams)
    trainer = Trainer(None, hparams)
    ckpt = tf.train.Checkpoint(
        step=trainer.step,
        generator=gan.generator,
        discriminator=gan.discriminator,
        gen_optimizer=gan.generator_optimizer,
        disc_optimizer=gan.discriminator_optimizer,
    )
    trainer.init_checkpoint(ckpt)
    # seed = tf.repeat(seed, count, axis=0)
    pitches = tf.one_hot(pitches, hparams['cond_vector_size'], axis=1)
    samples = tf.reshape(gan.generator([seed, pitches], training=False),
                         [-1, 256, 128])
    audio = pro.pipeline([
        pro.denormalize(normalization='specgan', stats=gan_stats),
        pro.invert_log_melspec(hparams['sample_rate']),
        # Workaround: invert_log_melspec only handles one np.array
        # spectrogram at a time
        list,
    ])(samples)
    return samples, audio
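# Sketch of driving generate() above: one random latent vector per
# requested pitch, mirroring generate_tones() below. The pitch values
# are illustrative indices into the condition vector.
def example_generate(hparams):
    pitches = [36, 48, 60]
    seed = tf.random.normal((len(pitches), hparams['latent_size']))
    return generate(hparams, seed, pitches)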
def maestro_from_files(root_path, frame_size):
    dataset = tf.data.Dataset.list_files(os.path.join(root_path, '**/*.wav'))
    dataset = pro.pipeline([
        pro.read_file(),
        pro.decode_wav(desired_channels=1),
        pro.map_transform(lambda x: x[0]),
        pro.reshape([-1]),
        pro.frame(frame_size, frame_size),
        pro.unbatch()
    ])(dataset)
    return dataset
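# Sketch: a shuffled, batched training set of fixed-size audio frames
# from a local MAESTRO copy. The path, frame size and batch settings
# are illustrative; pro.shuffle/pro.batch/pro.prefetch are used the
# same way in the training scripts below.
def example_maestro_dataset():
    dataset = maestro_from_files('/home/big/datasets/maestro-v2.0.0',
                                 frame_size=16000)
    return pro.pipeline([
        pro.shuffle(4096),
        pro.batch(32),
        pro.prefetch(),
    ])(dataset)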
def generate_tones(pitches):
    seed = tf.random.normal((len(pitches), gan_hparams['latent_size']))
    pitches = tf.one_hot(pitches, gan_hparams['cond_vector_size'], axis=1)
    samples = gan.generator([seed, pitches], training=False)
    samples = tf.reshape(samples, [-1, 256, 128])
    audio = pro.pipeline([
        pro.denormalize(normalization='specgan', stats=gan_stats),
        pro.invert_log_melspec(gan_hparams['sample_rate']),
        # Workaround: invert_log_melspec only handles one np.array
        # spectrogram at a time
        list,
    ])(samples)
    return audio
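# Sketch: render a few tones and write them to one wav file.
# librosa.output.write_wav matches the demo script below (librosa
# < 0.8); the pitches are illustrative.
def example_write_tones():
    audio = generate_tones([60, 64, 67])  # one sample per pitch
    librosa.output.write_wav('tones.wav', np.concatenate(audio),
                             gan_hparams['sample_rate'])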
def start(hparams):
    dataset = tf.data.Dataset.list_files('dataset/*.midi')
    dataset = pro.pipeline([
        pro.midi(),
        pro.prefetch(),
        pro.frame(hparams['frame_size'], hparams['frame_size'], True),
        pro.unbatch(),
        # pro.map_transform(tf_serialize_example)
    ])(dataset)

    def generator():
        for features in dataset:
            yield serialize_example(features)

    serialized_features_dataset = tf.data.Dataset.from_generator(
        generator, output_types=tf.string, output_shapes=())
    filename = 'midi_dataset.tfrecord'
    writer = tf.data.experimental.TFRecordWriter(filename)
    writer.write(serialized_features_dataset)
def nsynth_from_tfrecord(nsynth_tfrecord_path):
    dataset = tf.data.TFRecordDataset([nsynth_tfrecord_path])
    return pro.pipeline([
        pro.parse_tfrecord({
            "note_str": tf.io.FixedLenFeature([], dtype=tf.string),
            "pitch": tf.io.FixedLenFeature([1], dtype=tf.int64),
            "velocity": tf.io.FixedLenFeature([1], dtype=tf.int64),
            "audio": tf.io.FixedLenFeature([64000], dtype=tf.float32),
            "qualities": tf.io.FixedLenFeature([10], dtype=tf.int64),
            "instrument_source": tf.io.FixedLenFeature([1], dtype=tf.int64),
            "instrument_family": tf.io.FixedLenFeature([1], dtype=tf.int64),
        }),
    ])(dataset)
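# Sketch: the local-TFRecord path through the same preprocessing as
# the tfds path; the file location is an illustrative assumption.
def example_local_nsynth(hparams):
    dataset = nsynth_from_tfrecord(
        '/home/big/datasets/nsynth/nsynth-train.tfrecord')
    return nsynth_to_melspec(dataset, hparams)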
def start(hparams):
    # Load the MAESTRO wav dataset (the tfds nsynth load is kept for
    # reference)
    # dataset = tfds.load('nsynth/gansynth_subset', split='train', shuffle_files=True)
    dataset = tf.data.Dataset.list_files(
        '/home/big/datasets/maestro-v2.0.0/**/*.wav')
    dataset = pro.pipeline([
        pro.wav(),
        # pro.resample(16000, hparams['sample_rate'], tf.float32),
        pro.normalize(),
        pro.frame(hparams['window_samples'], hparams['window_samples']),
        pro.unbatch(),
        pro.set_channels(1),
        pro.dupe(),
        pro.shuffle(hparams['buffer_size']),
        pro.batch(hparams['batch_size']),
        pro.prefetch()
    ])(dataset)

    vae = VAE(hparams)
    vae.vae.summary()

    trainer = Trainer(dataset, hparams)
    ckpt = tf.train.Checkpoint(
        step=trainer.step,
        encoder=vae.encoder,
        decoder=vae.decoder,
        vae=vae.vae,
    )
    trainer.on_epoch_start = on_epoch_start
    trainer.on_step = on_step
    trainer.init_tensorboard()
    trainer.init_checkpoint(ckpt)
    trainer.set_train_step(vae.train_step)
    trainer.run()
import matplotlib.pyplot as plt
import librosa
import numpy as np
import tensorflow as tf
# `pro` is the project's preprocessing pipeline module, imported as
# elsewhere in this repo.

dataset = tf.data.Dataset.list_files('src/audio/*.wav')
stats = np.load('gan_stats.npz')
hparams = {
    'sample_rate': 16000,
    'log_amin': 1e-6,
}

# Round-trip check: wav -> normalized log mel spectrogram -> wav
dataset = pro.pipeline([
    pro.wav(),
    pro.melspec(hparams['sample_rate']),
    pro.pad([[0, 0], [0, 2]],
            'CONSTANT',
            constant_values=hparams['log_amin']),
    pro.normalize(normalization='specgan', stats=stats),
    pro.numpy(),
])(dataset)
x = next(dataset)
plt.imshow(x)
plt.savefig('result.png')

dataset = pro.pipeline([
    pro.denormalize(normalization='specgan', stats=stats),
    pro.invert_log_melspec(hparams['sample_rate'])
])(dataset)
x = next(dataset)
librosa.output.write_wav('result.wav', x, hparams['sample_rate'])
def start(hparams):
    gc.collect()
    # Vocabulary sizes match generate(): four event ranges of 128
    # values each
    input_vocab_size = 128 + 128 + 128 + 128
    target_vocab_size = 128 + 128 + 128 + 128
    dataset = tf.data.Dataset.list_files(
        '/home/big/datasets/maestro-v2.0.0/**/*.midi')
    dataset_single = pro.pipeline([
        pro.midi(),
        pro.frame(hparams['frame_size'] * 2, hparams['frame_hop_len'], True),
        pro.unbatch(),
    ])(dataset)

    def _reshape(inp, tar):
        inp = tf.reshape(inp, [hparams['frame_size']])
        tar = tf.reshape(tar, [hparams['frame_size']])
        return inp, tar

    dataset = pro.pipeline([
        pro.split(2),
        # pro.batch(2, True),
        # pro.dupe(),
        pro.map_transform(_reshape),
        pro.cache(),
        pro.shuffle(hparams['buffer_size']),
        pro.batch(hparams['batch_size'], True),
        pro.prefetch(),
    ])(dataset_single)
    dataset_single = pro.shuffle(hparams['buffer_size'] // 4)(dataset_single)
    dataset_single = dataset_single.as_numpy_iterator()

    transformer = Transformer(input_vocab_size=input_vocab_size,
                              target_vocab_size=target_vocab_size,
                              pe_input=input_vocab_size,
                              pe_target=target_vocab_size,
                              hparams=hparams)

    # pop_size = 10
    # generations = 100
    # mutation_rate = 0.2
    # pool.populate(pop_size, 1)
    # for generation in range(generations):
    #     pool.evaluate(evaluate(dataset, hparams))
    #     print(f"--- GENERATION: {generation} ---")
    #     print("BEST:", pool.best, pool.fitness)
    #     pool.select(pop_size)
    #     pool.populate(pop_size, mutation_rate)

    image_save_step = hparams.get('image_save_step', 2000)

    def generate_image(step, tsw):
        print("Generating sample...")
        encoded, seed = generate_from_model(hparams, transformer,
                                            dataset_single)
        print("Generating sample done.")
        print("Decoding midi...")
        decoded_seed = pro.decode_midi()(seed)
        decoded = pro.decode_midi()(encoded)
        print("Decoding midi done.")
        print("Saving midi...")
        with open(f'gen_transformer_{step}.midi', 'wb') as f:
            M.write_midi(f, decoded)
        with open(f'prior_transformer_{step}.midi', 'wb') as f:
            M.write_midi(f, decoded_seed)
        print("Saving midi done.")
        print("Plotting midi...")
        plt.title('Prior')
        M.display_midi(decoded_seed)
        image_seed = util.get_plot_image()
        plt.clf()
        plt.title('Generated')
        M.display_midi(decoded)
        image = util.get_plot_image()
        plt.clf()
        image_conc = tf.concat([image_seed, image], axis=1)
        print("Plotting done.")
        with tsw.as_default():
            tf.summary.image('image', image_conc, step=step)
        print("Complete.")

    # This runs at every step in the training (for each batch in the
    # dataset)
    def on_step(epoch, step, stats, tsw):
        loss, tar_real, predictions = stats
        train_loss(loss)
        train_accuracy(tar_real, predictions)
        if step % 100 == 0:
            print(
                f"Epoch: {epoch}, Step: {step}, Loss: {train_loss.result()}, Accuracy: {train_accuracy.result()}"
            )
        if step % image_save_step == 0:
            generate_image(step, tsw)
        with tsw.as_default():
            tf.summary.scalar('loss', train_loss.result(), step=step)

    trainer = Trainer(dataset, hparams)
    ckpt = tf.train.Checkpoint(step=trainer.step,
                               transformer=transformer,
                               optimizer=transformer.optimizer)
    trainer.init_checkpoint(ckpt)
    trainer.init_tensorboard()
    trainer.set_train_step(transformer.train_step)
    trainer.on_epoch_start = on_epoch_start
    trainer.on_step = on_step
    trainer.on_epoch_complete = on_epoch_complete
    # generate_image(trainer.step.numpy(), trainer.train_summary_writer)
    trainer.run()
def start(hparams):
    # Load the nsynth dataset from tfds
    dataset = tfds.load('nsynth/gansynth_subset',
                        split='train',
                        shuffle_files=True)
    gan_stats = calculate_dataset_stats(hparams, dataset)
    # gan_stats = np.load('gan_stats.npz')
    dataset = nsynth_to_melspec(dataset, hparams, gan_stats)

    # Determine the shape of the spectrograms in the dataset
    spec_shape = None
    for x in dataset.take(1):
        e = x['audio']
        cond = x['pitch']
        spec_shape = e.shape
        print(cond)
        print(f'Spectrogram shape: {spec_shape}')

    # Make sure we got a shape before continuing
    assert spec_shape is not None, "Could not get spectrogram shape"

    # Make sure the dimensions of the spectrogram are divisible by 4,
    # because the generator upscales its state twice by a factor of 2.
    assert spec_shape[0] % 4 == 0 and spec_shape[1] % 4 == 0, \
        "Spectrogram dimensions are not divisible by 4"

    dataset = pro.index_map('audio', pro.reshape([*spec_shape, 1]))(dataset)

    # Create preprocessing pipeline for shuffling and batching
    dataset = pro.pipeline([
        pro.cache(),
        pro.shuffle(hparams['buffer_size']),
        pro.batch(hparams['batch_size']),
        pro.prefetch()
    ])(dataset)

    gan = GAN(spec_shape, hparams)
    gan.discriminator.summary()
    gan.generator.summary()

    # This runs at the end of every epoch and is used to display metrics
    def on_epoch_complete(epoch, step, duration, tsw):
        # display.clear_output(wait=True)
        count = 6
        seed = tf.random.normal((count, gan.hparams['latent_size']))
        mid = gan.hparams['cond_vector_size'] // 2
        pitches = tf.one_hot(range(mid - count // 2, mid + count // 2),
                             gan.hparams['cond_vector_size'],
                             axis=1)
        samples = tf.reshape(gan.generator([seed, pitches], training=False),
                             [-1, 128, 128])
        img = tf.unstack(samples)
        img = tf.reverse(tf.concat(img, axis=1), axis=[0])
        plt.axis('off')
        plt.imshow(img)
        buf = io.BytesIO()
        plt.savefig(buf, format='png')
        buf.seek(0)
        # Convert the PNG buffer to a TF image
        image = tf.image.decode_png(buf.getvalue(), channels=4)
        # Add the batch dimension
        image = tf.expand_dims(image, 0)
        with tsw.as_default():
            tf.summary.image('Spectrogram', image, step=step)
        print(
            f"Epoch: {epoch}, Step: {step}, Gen Loss: {gen_loss_avg.result()}, Disc Loss: {disc_loss_avg.result()}, Duration: {duration} s"
        )

    trainer = Trainer(dataset, hparams)
    ckpt = tf.train.Checkpoint(
        step=trainer.step,
        generator=gan.generator,
        discriminator=gan.discriminator,
        gen_optimizer=gan.generator_optimizer,
        disc_optimizer=gan.discriminator_optimizer,
    )
    trainer.init_checkpoint(ckpt)
    trainer.init_tensorboard()
    trainer.set_train_step(gan.train_step)
    trainer.on_epoch_start = on_epoch_start
    trainer.on_step = on_step
    trainer.on_epoch_complete = on_epoch_complete
    trainer.run()
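# Illustrative hparams for start() above, listing only the keys this
# training path reads; real values belong in the project's config.
example_hparams = {
    'buffer_size': 4096,
    'batch_size': 32,
    'latent_size': 100,
    'cond_vector_size': 61,
    'sample_rate': 16000,
    'log_amin': 1e-6,
}
# start(example_hparams)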