Example 1
def generate(hparams):
    input_vocab_size = 128 + 128 + 128 + 128
    target_vocab_size = 128 + 128 + 128 + 128

    dataset = tf.data.Dataset.list_files(
        '/home/big/datasets/maestro-v2.0.0/**/*.midi')

    dataset_single = pro.pipeline([
        pro.midi(),
        pro.frame(hparams['frame_size'], hparams['frame_size'], True),
        pro.unbatch(),
    ])(dataset).skip(16000).as_numpy_iterator()

    transformer = Transformer(input_vocab_size=input_vocab_size,
                              target_vocab_size=target_vocab_size,
                              pe_input=input_vocab_size,
                              pe_target=target_vocab_size,
                              hparams=hparams)

    trainer = Trainer(dataset, hparams)
    ckpt = tf.train.Checkpoint(step=trainer.step,
                               transformer=transformer,
                               optimizer=transformer.optimizer)

    trainer.init_checkpoint(ckpt)

    return generate_from_model(hparams, transformer, dataset_single)
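
A minimal driver for this generate(), following the decode-and-write pattern from Example 7 (generate_from_model returns the encoded sequence together with the seed it was primed with); the output file name is illustrative:

# Hypothetical usage of Example 1's generate(); the decode and write
# calls mirror Example 7.
encoded, seed = generate(hparams)
decoded = pro.decode_midi()(encoded)
with open('gen_transformer.midi', 'wb') as f:
    M.write_midi(f, decoded)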
Example 2
def generate(hparams, seed, pitches):
    gan_stats = np.load('gan_stats.npz')

    gan = GAN((256, 128), hparams)

    trainer = Trainer(None, hparams)

    ckpt = tf.train.Checkpoint(
        step=trainer.step,
        generator=gan.generator,
        discriminator=gan.discriminator,
        gen_optimizer=gan.generator_optimizer,
        disc_optimizer=gan.discriminator_optimizer,
    )

    trainer.init_checkpoint(ckpt)

    #seed = tf.repeat(seed, count, axis=0)
    pitches = tf.one_hot(pitches, hparams['cond_vector_size'], axis=1)

    samples = tf.reshape(gan.generator([seed, pitches], training=False),
                         [-1, 256, 128])
    audio = pro.pipeline([
        pro.denormalize(normalization='specgan', stats=gan_stats),
        pro.invert_log_melspec(hparams['sample_rate']),
        list,  # Workaround: invert_log_melspec only handles
        np.array,  # one spectrogram at a time
    ])(samples)
    return samples, audio
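
This generate() expects a batch of latent vectors plus integer pitch indices. A minimal invocation sketch, assuming hparams carries the 'latent_size' key used in Examples 5 and 8, with placeholder pitch values:

# Hypothetical call into Example 2's generate(); the seed construction
# follows Example 5.
count = 4
seed = tf.random.normal((count, hparams['latent_size']))
pitches = tf.constant([60, 62, 64, 65], dtype=tf.int32)
samples, audio = generate(hparams, seed, pitches)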
Example 3
def start(hparams):
    vae = VAE(hparams)

    trainer = Trainer(None, hparams)

    ckpt = tf.train.Checkpoint(
        step=trainer.step,
        encoder=vae.encoder,
        decoder=vae.decoder,
        vae=vae.vae,
    )

    trainer.init_checkpoint(ckpt)

    samples = tf.reshape(vae.sample(100), [-1]).numpy()

    # Note: librosa.output.write_wav was removed in librosa 0.8.0, so this
    # call requires librosa < 0.8.
    librosa.output.write_wav('vae_sample2.wav',
                             samples,
                             hparams['sample_rate'],
                             norm=False)
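
librosa.output was removed in librosa 0.8.0; on newer versions the same file can be written with the soundfile package. A sketch, reusing the samples and hparams from the function above:

# Equivalent write on librosa >= 0.8 via soundfile.
import soundfile as sf

sf.write('vae_sample2.wav', samples, hparams['sample_rate'])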
Example 4
    def _eval(model):
        trainer = Trainer(dataset, hparams)

        trainer.set_train_step(model.train_step)
        trainer.on_epoch_start = on_epoch_start
        trainer.on_step = on_step
        trainer.on_epoch_complete = on_epoch_complete

        stats = trainer.run()
        if stats is not None:
            loss, _, _ = stats
            return 1 / tf.reduce_mean(loss).numpy()
        else:
            return 0
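
_eval turns the mean loss into a fitness score (lower loss, higher fitness), which matches the evolutionary search loop left commented out in Example 7. A reconstruction of that loop, assuming pool exposes the populate/evaluate/select API those comments use and that evaluate(dataset, hparams) is the factory producing _eval:

# Sketch of the evolutionary loop from Example 7's commented-out code;
# the `pool` object and its API are assumptions taken from those comments.
pop_size = 10
generations = 100
mutation_rate = 0.2

pool.populate(pop_size, 1)
for generation in range(generations):
    pool.evaluate(evaluate(dataset, hparams))  # fitness = 1 / mean loss
    print(f"--- GENERATION: {generation} ---")
    print("BEST:", pool.best, pool.fitness)
    pool.select(pop_size)
    pool.populate(pop_size, mutation_rate)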
Example 5
print(midi)

pitches = tf.cast(
    [a.pitch - 24 for a in midi if isinstance(a, M.Midi.NoteEvent)], tf.int32)
amp = tf.cast(
    [a.velocity / 127 for a in midi if isinstance(a, M.Midi.NoteEvent)],
    tf.float32)
vel = tf.ones_like(pitches) * 2

sr = 16000
samples_per_note = 8000
tone_length = sr

gan_stats = np.load('gan_stats.npz')
gan = GAN((256, 128), gan_hparams)
gan_trainer = Trainer(None, gan_hparams)
gan_ckpt = tf.train.Checkpoint(
    step=gan_trainer.step,
    generator=gan.generator,
    discriminator=gan.discriminator,
    gen_optimizer=gan.generator_optimizer,
    disc_optimizer=gan.discriminator_optimizer,
)
gan_trainer.init_checkpoint(gan_ckpt)


def generate_tones(pitches):
    seed = tf.random.normal((len(pitches), gan_hparams['latent_size']))
    pitches = tf.one_hot(pitches, gan_hparams['cond_vector_size'], axis=1)

    samples = gan.generator([seed, pitches], training=False)
    return samples
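
The snippet ends at the generator call; a plausible continuation converts the generated spectrograms to audio with the same denormalize-and-invert pipeline as Example 2 (an assumption, not the snippet's own ending):

# Hypothetical continuation of generate_tones(): spectrograms to audio,
# mirroring Example 2's pipeline; sr is the 16000 defined above.
def tones_to_audio(samples):
    samples = tf.reshape(samples, [-1, 256, 128])
    return pro.pipeline([
        pro.denormalize(normalization='specgan', stats=gan_stats),
        pro.invert_log_melspec(sr),
        list,      # invert_log_melspec handles one spectrogram at a time
        np.array,
    ])(samples)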
Example 6
def start(hparams):
    # Load nsynth dataset
    # dataset = tfds.load('nsynth/gansynth_subset', split='train', shuffle_files=True)
    dataset = tf.data.Dataset.list_files(
        '/home/big/datasets/maestro-v2.0.0/**/*.wav')
    dataset = pro.pipeline([
        pro.wav(),
        # pro.resample(16000, hparams['sample_rate'], tf.float32),
        pro.normalize(),
        pro.frame(hparams['window_samples'], hparams['window_samples']),
        pro.unbatch(),
        pro.set_channels(1),
        pro.dupe(),
        pro.shuffle(hparams['buffer_size']),
        pro.batch(hparams['batch_size']),
        pro.prefetch()
    ])(dataset)

    vae = VAE(hparams)

    vae.vae.summary()

    trainer = Trainer(dataset, hparams)

    ckpt = tf.train.Checkpoint(
        step=trainer.step,
        encoder=vae.encoder,
        decoder=vae.decoder,
        vae=vae.vae,
    )

    trainer.on_epoch_start = on_epoch_start
    trainer.on_step = on_step

    trainer.init_tensorboard()
    trainer.init_checkpoint(ckpt)
    trainer.set_train_step(vae.train_step)
    trainer.run()
Example 7
def start(hparams):
    gc.collect()

    dataset = tf.data.Dataset.list_files(
        '/home/big/datasets/maestro-v2.0.0/**/*.midi')

    dataset_single = pro.pipeline([
        pro.midi(),
        pro.frame(hparams['frame_size'] * 2, hparams['frame_hop_len'], True),
        pro.unbatch(),
    ])(dataset)

    def _reshape(inp, tar):
        inp = tf.reshape(inp, [hparams['frame_size']])
        tar = tf.reshape(tar, [hparams['frame_size']])
        return inp, tar

    dataset = pro.pipeline([
        pro.split(2),
        #pro.batch(2, True),
        # pro.dupe(),
        pro.map_transform(_reshape),
        pro.cache(),
        pro.shuffle(hparams['buffer_size']),
        pro.batch(hparams['batch_size'], True),
        pro.prefetch(),
    ])(dataset_single)

    dataset_single = pro.shuffle(hparams['buffer_size'] // 4)(dataset_single)
    dataset_single = dataset_single.as_numpy_iterator()

    # Vocabulary sizes are not defined in this snippet; Example 1 uses four
    # 128-wide event ranges for both, so the same values are assumed here.
    input_vocab_size = 128 + 128 + 128 + 128
    target_vocab_size = 128 + 128 + 128 + 128

    transformer = Transformer(input_vocab_size=input_vocab_size,
                              target_vocab_size=target_vocab_size,
                              pe_input=input_vocab_size,
                              pe_target=target_vocab_size,
                              hparams=hparams)

    # pop_size = 10
    # generations = 100
    # mutation_rate = 0.2
    # pool.populate(pop_size, 1)

    # for generation in range(generations):
    #     pool.evaluate(evaluate(dataset, hparams))
    #     print(f"--- GENERATION: {generation} ---")
    #     print("BEST:", pool.best, pool.fitness)
    #     pool.select(pop_size)
    #     pool.populate(pop_size, mutation_rate)
    #

    image_save_step = hparams.get('image_save_step', 2000)

    def generate_image(step, tsw):
        print("Generating sample...")
        encoded, seed = generate_from_model(hparams, transformer,
                                            dataset_single)
        print("Generating sample done.")

        print("Decoding midi...")
        decoded_seed = pro.decode_midi()(seed)
        decoded = pro.decode_midi()(encoded)
        print("Decoding midi done.")

        print("Saving midi...")
        with open(f'gen_transformer_{step}.midi', 'wb') as f:
            M.write_midi(f, decoded)
        with open(f'prior_transformer_{step}.midi', 'wb') as f:
            M.write_midi(f, decoded_seed)
        print("Saving midi done.")

        print("Plotting midi...")
        plt.title('Prior')
        M.display_midi(decoded_seed)
        image_seed = util.get_plot_image()
        plt.clf()

        plt.title('Generated')
        M.display_midi(decoded)
        image = util.get_plot_image()
        plt.clf()

        image_conc = tf.concat([image_seed, image], axis=1)
        print("Plotting done.")

        with tsw.as_default():
            tf.summary.image('image', image_conc, step=step)
        print("Complete.")

    # This runs at every step in the training (for each batch in dataset)
    def on_step(epoch, step, stats, tsw):
        loss, tar_real, predictions = stats
        train_loss(loss)
        train_accuracy(tar_real, predictions)
        if step % 100 == 0:
            print(
                f"Epoch: {epoch}, Step: {step}, Loss: {train_loss.result()}, Accuracy: {train_accuracy.result()}"
            )
        if step % image_save_step == 0:
            generate_image(step, tsw)

        with tsw.as_default():
            tf.summary.scalar('loss', train_loss.result(), step=step)

    trainer = Trainer(dataset, hparams)
    ckpt = tf.train.Checkpoint(step=trainer.step,
                               transformer=transformer,
                               optimizer=transformer.optimizer)

    trainer.init_checkpoint(ckpt)
    trainer.init_tensorboard()
    trainer.set_train_step(transformer.train_step)
    trainer.on_epoch_start = on_epoch_start
    trainer.on_step = on_step
    trainer.on_epoch_complete = on_epoch_complete

    #generate_image(trainer.step.numpy(), trainer.train_summary_writer)

    trainer.run()
Example 8
def start(hparams):

    # Load nsynth dataset from tfds
    dataset = tfds.load('nsynth/gansynth_subset',
                        split='train',
                        shuffle_files=True)

    gan_stats = calculate_dataset_stats(hparams, dataset)
    #gan_stats = np.load('gan_stats.npz')

    dataset = nsynth_to_melspec(dataset, hparams, gan_stats)

    # Determine the shape of the spectrograms in the dataset
    spec_shape = None
    for x in dataset.take(1):
        e = x['audio']
        cond = x['pitch']
        spec_shape = e.shape
        print(cond)
        print(f'Spectrogram shape: {spec_shape}')

    # Make sure we got a shape before continuing
    assert spec_shape is not None, "Could not get spectrogram shape"

    # Make sure the spectrogram dimensions are divisible by 4, because the
    # generator upscales its state twice by a factor of 2.
    assert spec_shape[0] % 4 == 0 and spec_shape[1] % 4 == 0, \
        "Spectrogram dimensions are not divisible by 4"

    dataset = pro.index_map('audio', pro.reshape([*spec_shape, 1]))(dataset)

    # Create preprocessing pipeline for shuffling and batching
    dataset = pro.pipeline([
        pro.cache(),
        pro.shuffle(hparams['buffer_size']),
        pro.batch(hparams['batch_size']),
        pro.prefetch()
    ])(dataset)

    gan = GAN(spec_shape, hparams)
    gan.discriminator.summary()
    gan.generator.summary()

    # This runs at the end of every epoch and is used to display metrics
    def on_epoch_complete(epoch, step, duration, tsw):
        #display.clear_output(wait=True)
        count = 6
        seed = tf.random.normal((count, gan.hparams['latent_size']))
        mid = gan.hparams['cond_vector_size'] // 2
        pitches = tf.one_hot(range(mid - count // 2, mid + count // 2),
                             gan.hparams['cond_vector_size'],
                             axis=1)

        samples = tf.reshape(gan.generator([seed, pitches], training=False),
                             [-1, 128, 128])
        img = tf.unstack(samples)
        img = tf.reverse(tf.concat(img, axis=1), axis=[0])
        plt.axis('off')
        plt.imshow(img)

        buf = io.BytesIO()
        plt.savefig(buf, format='png')
        buf.seek(0)

        # Convert PNG buffer to TF image
        image = tf.image.decode_png(buf.getvalue(), channels=4)

        # Add the batch dimension
        image = tf.expand_dims(image, 0)

        with tsw.as_default():
            tf.summary.image('Spectrogram', image, step=step)
        print(
            f"Epoch: {epoch}, Step: {step}, Gen Loss: {gen_loss_avg.result()}, Disc Loss: {disc_loss_avg.result()}, Duration: {duration} s"
        )

    trainer = Trainer(dataset, hparams)

    ckpt = tf.train.Checkpoint(
        step=trainer.step,
        generator=gan.generator,
        discriminator=gan.discriminator,
        gen_optimizer=gan.generator_optimizer,
        disc_optimizer=gan.discriminator_optimizer,
    )

    trainer.init_checkpoint(ckpt)
    trainer.init_tensorboard()
    trainer.set_train_step(gan.train_step)
    trainer.on_epoch_start = on_epoch_start
    trainer.on_step = on_step
    trainer.on_epoch_complete = on_epoch_complete

    trainer.run()