Example #1
def nsynth_to_melspec(dataset, hparams, stats=None):
    if 'instrument' in hparams and hparams['instrument'] is not None:
        instrument = hparams['instrument']
        if 'family' in instrument and instrument['family'] is not None:
            dataset = pro.filter(
                instrument_families_filter(instrument['family']))(dataset)
        if 'source' in instrument and instrument['source'] is not None:
            dataset = pro.filter(
                instrument_sources_filter(instrument['source']))(dataset)

    dataset = pro.index_map(
        'pitch',
        pro.pipeline([
            pro.map_transform(lambda x: x - 24),
            pro.one_hot(hparams['cond_vector_size']),
            pro.map_transform(lambda x: tf.cast(x, tf.float32)),
        ]))(dataset)

    dataset = pro.index_map(
        'audio',
        pro.pipeline([
            pro.melspec(sr=hparams['sample_rate']),
            pro.pad([[0, 0], [0, 2]],
                    'CONSTANT',
                    constant_values=hparams['log_amin']),
        ]))(dataset)

    if stats is not None:
        dataset = pro.index_map(
            'audio', pro.normalize(normalization='specgan',
                                   stats=stats))(dataset)

    # Return the preprocessed mel spectrogram dataset
    return dataset
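
A usage sketch mirroring the call in Example #11 below; the hparams values here are illustrative assumptions, not the project's actual configuration (61 for 'cond_vector_size' assumes a 24-84 pitch range, matching the x - 24 shift above).

import tensorflow_datasets as tfds

hparams = {
    'sample_rate': 16000,    # as in Example #9
    'log_amin': 1e-6,        # as in Example #9
    'cond_vector_size': 61,  # assumed pitch range 24-84
    'instrument': None,      # or {'family': ..., 'source': ...} to filter
}

dataset = tfds.load('nsynth/gansynth_subset', split='train', shuffle_files=True)
dataset = nsynth_to_melspec(dataset, hparams)  # stats=None skips normalization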
Example #2
def generate(hparams):
    # 512-token event vocabulary (four 128-value ranges)
    input_vocab_size = 128 + 128 + 128 + 128
    target_vocab_size = 128 + 128 + 128 + 128

    dataset = tf.data.Dataset.list_files(
        '/home/big/datasets/maestro-v2.0.0/**/*.midi')

    dataset_single = pro.pipeline([
        pro.midi(),
        pro.frame(hparams['frame_size'], hparams['frame_size'], True),
        pro.unbatch(),
    ])(dataset).skip(16000).as_numpy_iterator()

    transformer = Transformer(input_vocab_size=input_vocab_size,
                              target_vocab_size=target_vocab_size,
                              pe_input=input_vocab_size,
                              pe_target=target_vocab_size,
                              hparams=hparams)

    trainer = Trainer(dataset, hparams)
    ckpt = tf.train.Checkpoint(step=trainer.step,
                               transformer=transformer,
                               optimizer=transformer.optimizer)

    trainer.init_checkpoint(ckpt)

    return generate_from_model(hparams, transformer, dataset_single)
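
A hedged call sketch; 'frame_size' is the only hparams key this function reads directly, and the value below is illustrative (Transformer, Trainer and generate_from_model read further keys).

hparams = {'frame_size': 256}  # illustrative; extend with model/trainer settings
generated = generate(hparams)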
Example #3
def generate(hparams, seed, pitches):
    gan_stats = np.load('gan_stats.npz')

    gan = GAN((256, 128), hparams)

    trainer = Trainer(None, hparams)

    ckpt = tf.train.Checkpoint(
        step=trainer.step,
        generator=gan.generator,
        discriminator=gan.discriminator,
        gen_optimizer=gan.generator_optimizer,
        disc_optimizer=gan.discriminator_optimizer,
    )

    trainer.init_checkpoint(ckpt)

    #seed = tf.repeat(seed, count, axis=0)
    pitches = tf.one_hot(pitches, hparams['cond_vector_size'], axis=1)

    samples = tf.reshape(gan.generator([seed, pitches], training=False),
                         [-1, 256, 128])
    audio = pro.pipeline([
        pro.denormalize(normalization='specgan', stats=gan_stats),
        pro.invert_log_melspec(hparams['sample_rate']),
        list,  # Workaround because invert_log_melspec only handles
        np.array,  # one spectrogram at a time
    ])(samples)
    return samples, audio
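
A usage sketch following the same seed/pitch pattern as Example #5; the batch size and MIDI pitches are illustrative, and 'latent_size' is the hparams key also used in Example #11.

seed = tf.random.normal((3, hparams['latent_size']))
samples, audio = generate(hparams, seed, [60, 64, 67])  # three illustrative pitches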
Example #4
def maestro_from_files(root_path, frame_size):
    dataset = tf.data.Dataset.list_files(os.path.join(root_path, '**/*.wav'))
    dataset = pro.pipeline([
        pro.read_file(),
        pro.decode_wav(desired_channels=1),
        pro.map_transform(lambda x: x[0]),
        pro.reshape([-1]),
        pro.frame(frame_size, frame_size),
        pro.unbatch()
    ])(dataset)

    return dataset
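
A usage sketch; the MAESTRO root path matches the one used elsewhere on this page, while the frame size is an illustrative value.

dataset = maestro_from_files('/home/big/datasets/maestro-v2.0.0', 16384)
for frame in dataset.take(1):
    print(frame.shape)  # (16384,)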
Example #5
def generate_tones(pitches):
    seed = tf.random.normal((len(pitches), gan_hparams['latent_size']))
    pitches = tf.one_hot(pitches, gan_hparams['cond_vector_size'], axis=1)

    samples = gan.generator([seed, pitches], training=False)
    samples = tf.reshape(samples, [-1, 256, 128])
    audio = pro.pipeline([
        pro.denormalize(normalization='specgan', stats=gan_stats),
        pro.invert_log_melspec(gan_hparams['sample_rate']),
        list,  # Workaround because invert_log_melspec only handles
        np.array,  # one spectrogram at a time
    ])(samples)
    return audio
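
Called with a list of MIDI pitches once gan, gan_hparams and gan_stats have been restored as module-level globals (see Example #3 for the checkpoint setup); the pitch values here are illustrative.

audio = generate_tones([60, 64, 67])  # one audio clip per requested pitch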
Example #6
def start(hparams):
    dataset = tf.data.Dataset.list_files('dataset/*.midi')

    dataset = pro.pipeline([
        pro.midi(),
        pro.prefetch(),
        pro.frame(hparams['frame_size'], hparams['frame_size'], True),
        pro.unbatch(),
        # pro.map_transform(tf_serialize_example)
    ])(dataset)

    def generator():
        for features in dataset:
            yield serialize_example(features)

    serialized_features_dataset = tf.data.Dataset.from_generator(
        generator, output_types=tf.string, output_shapes=())
    filename = 'midi_dataset.tfrecord'
    writer = tf.data.experimental.TFRecordWriter(filename)
    writer.write(serialized_features_dataset)
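
The file can be read back with a plain TFRecordDataset; since the feature spec used by serialize_example is not shown here, this sketch only restores the raw serialized strings.

raw_dataset = tf.data.TFRecordDataset('midi_dataset.tfrecord')
for raw_record in raw_dataset.take(1):
    print(raw_record.dtype)  # tf.string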
Example #7
def nsynth_from_tfrecord(nsynth_tfrecord_path):
    dataset = tf.data.TFRecordDataset([nsynth_tfrecord_path])
    return pro.pipeline([
        pro.parse_tfrecord({
            "note_str": tf.io.FixedLenFeature([], dtype=tf.string),
            "pitch": tf.io.FixedLenFeature([1], dtype=tf.int64),
            "velocity": tf.io.FixedLenFeature([1], dtype=tf.int64),
            "audio": tf.io.FixedLenFeature([64000], dtype=tf.float32),
            "qualities": tf.io.FixedLenFeature([10], dtype=tf.int64),
            "instrument_source": tf.io.FixedLenFeature([1], dtype=tf.int64),
            "instrument_family": tf.io.FixedLenFeature([1], dtype=tf.int64),
        }),
    ])(dataset)
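
Inspecting one parsed example; the tfrecord path below is a placeholder.

dataset = nsynth_from_tfrecord('/path/to/nsynth-train.tfrecord')
for example in dataset.take(1):
    print(example['pitch'].numpy(), example['audio'].shape)  # e.g. [60] (64000,)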
Example #8
def start(hparams):
    # Load the MAESTRO dataset from wav files
    # (tfds nsynth alternative kept for reference):
    # dataset = tfds.load('nsynth/gansynth_subset', split='train', shuffle_files=True)
    dataset = tf.data.Dataset.list_files(
        '/home/big/datasets/maestro-v2.0.0/**/*.wav')
    dataset = pro.pipeline([
        pro.wav(),
        # pro.resample(16000, hparams['sample_rate'], tf.float32),
        pro.normalize(),
        pro.frame(hparams['window_samples'], hparams['window_samples']),
        pro.unbatch(),
        pro.set_channels(1),
        pro.dupe(),
        pro.shuffle(hparams['buffer_size']),
        pro.batch(hparams['batch_size']),
        pro.prefetch()
    ])(dataset)

    vae = VAE(hparams)

    vae.vae.summary()

    trainer = Trainer(dataset, hparams)

    ckpt = tf.train.Checkpoint(
        step=trainer.step,
        encoder=vae.encoder,
        decoder=vae.decoder,
        vae=vae.vae,
    )

    trainer.on_epoch_start = on_epoch_start
    trainer.on_step = on_step

    trainer.init_tensorboard()
    trainer.init_checkpoint(ckpt)
    trainer.set_train_step(vae.train_step)
    trainer.run()
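
An illustrative hparams sketch covering only the keys this script reads directly; all values are assumptions, and VAE and Trainer read further keys.

start({
    'window_samples': 16384,  # samples per training window
    'buffer_size': 10000,     # shuffle buffer size
    'batch_size': 64,
})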
Example #9
import matplotlib.pyplot as plt
import librosa
import numpy as np
import tensorflow as tf
# Note: this snippet also assumes the project's `pro` pipeline module is in scope

dataset = tf.data.Dataset.list_files('src/audio/*.wav')
stats = np.load('gan_stats.npz')

hparams = {
    'sample_rate': 16000,
    'log_amin': 1e-6,
}

dataset = pro.pipeline([
    pro.wav(),
    pro.melspec(hparams['sample_rate']),
    pro.pad([[0, 0], [0, 2]], 'CONSTANT', constant_values=hparams['log_amin']),
    pro.normalize(normalization='specgan', stats=stats),
    pro.numpy(),
])(dataset)
x = next(dataset)
plt.imshow(x)
plt.savefig('result.png')

dataset = pro.pipeline([
    pro.denormalize(normalization='specgan', stats=stats),
    pro.invert_log_melspec(hparams['sample_rate'])
])(dataset)

x = next(dataset)
librosa.output.write_wav('result.wav', x, hparams['sample_rate'])
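
Note that librosa.output.write_wav was removed in librosa 0.8; on a current install, writing through the soundfile package is the usual replacement.

import soundfile as sf
sf.write('result.wav', x, hparams['sample_rate'])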
Example #10
def start(hparams):
    gc.collect()

    dataset = tf.data.Dataset.list_files(
        '/home/big/datasets/maestro-v2.0.0/**/*.midi')

    dataset_single = pro.pipeline([
        pro.midi(),
        pro.frame(hparams['frame_size'] * 2, hparams['frame_hop_len'], True),
        pro.unbatch(),
    ])(dataset)

    def _reshape(inp, tar):
        inp = tf.reshape(inp, [hparams['frame_size']])
        tar = tf.reshape(tar, [hparams['frame_size']])
        return inp, tar

    dataset = pro.pipeline([
        pro.split(2),
        #pro.batch(2, True),
        # pro.dupe(),
        pro.map_transform(_reshape),
        pro.cache(),
        pro.shuffle(hparams['buffer_size']),
        pro.batch(hparams['batch_size'], True),
        pro.prefetch(),
    ])(dataset_single)

    dataset_single = pro.shuffle(hparams['buffer_size'] // 4)(dataset_single)
    dataset_single = dataset_single.as_numpy_iterator()

    # Same 512-token event vocabulary as in Example #2
    input_vocab_size = 128 + 128 + 128 + 128
    target_vocab_size = 128 + 128 + 128 + 128

    transformer = Transformer(input_vocab_size=input_vocab_size,
                              target_vocab_size=target_vocab_size,
                              pe_input=input_vocab_size,
                              pe_target=target_vocab_size,
                              hparams=hparams)

    # pop_size = 10
    # generations = 100
    # mutation_rate = 0.2
    # pool.populate(pop_size, 1)

    # for generation in range(generations):
    #     pool.evaluate(evaluate(dataset, hparams))
    #     print(f"--- GENERATION: {generation} ---")
    #     print("BEST:", pool.best, pool.fitness)
    #     pool.select(pop_size)
    #     pool.populate(pop_size, mutation_rate)
    #

    image_save_step = hparams.get('image_save_step', 2000)

    def generate_image(step, tsw):
        print("Generating sample...")
        encoded, seed = generate_from_model(hparams, transformer,
                                            dataset_single)
        print("Generating sample done.")

        print("Decoding midi...")
        decoded_seed = pro.decode_midi()(seed)
        decoded = pro.decode_midi()(encoded)
        print("Decoding midi done.")

        print("Saving midi...")
        with open(f'gen_transformer_{step}.midi', 'wb') as f:
            M.write_midi(f, decoded)
        with open(f'prior_transformer_{step}.midi', 'wb') as f:
            M.write_midi(f, decoded_seed)
        print("Saving midi done.")

        print("Plotting midi...")
        plt.title('Prior')
        M.display_midi(decoded_seed)
        image_seed = util.get_plot_image()
        plt.clf()

        plt.title('Generated')
        M.display_midi(decoded)
        image = util.get_plot_image()
        plt.clf()

        image_conc = tf.concat([image_seed, image], axis=1)
        print("Plotting done.")

        with tsw.as_default():
            tf.summary.image('image', image_conc, step=step)
        print("Complete.")

    # This runs at every step in the training (for each batch in dataset)
    def on_step(epoch, step, stats, tsw):
        loss, tar_real, predictions = stats
        train_loss(loss)
        train_accuracy(tar_real, predictions)
        if step % 100 == 0:
            print(
                f"Epoch: {epoch}, Step: {step}, Loss: {train_loss.result()}, Accuracy: {train_accuracy.result()}"
            )
        if step % image_save_step == 0:
            generate_image(step, tsw)

        with tsw.as_default():
            tf.summary.scalar('loss', train_loss.result(), step=step)

    trainer = Trainer(dataset, hparams)
    ckpt = tf.train.Checkpoint(step=trainer.step,
                               transformer=transformer,
                               optimizer=transformer.optimizer)

    trainer.init_checkpoint(ckpt)
    trainer.init_tensorboard()
    trainer.set_train_step(transformer.train_step)
    trainer.on_epoch_start = on_epoch_start
    trainer.on_step = on_step
    trainer.on_epoch_complete = on_epoch_complete

    #generate_image(trainer.step.numpy(), trainer.train_summary_writer)

    trainer.run()
Example #11
def start(hparams):

    # Load nsynth dataset from tfds
    dataset = tfds.load('nsynth/gansynth_subset',
                        split='train',
                        shuffle_files=True)

    gan_stats = calculate_dataset_stats(hparams, dataset)
    #gan_stats = np.load('gan_stats.npz')

    dataset = nsynth_to_melspec(dataset, hparams, gan_stats)

    # Determine the shape of the spectrograms in the dataset
    spec_shape = None
    for x in dataset.take(1):
        e = x['audio']
        cond = x['pitch']
        spec_shape = e.shape
        print(cond)
        print(f'Spectrogram shape: {spec_shape}')

    # Make sure we got a shape before continuing
    assert spec_shape is not None, "Could not get spectrogram shape"

    # Make sure the spectrogram dimensions are divisible by 4,
    # because the generator upscales its state twice by a factor of 2.
    assert spec_shape[0] % 4 == 0 and spec_shape[1] % 4 == 0, \
        "Spectrogram dimensions are not divisible by 4"

    dataset = pro.index_map('audio', pro.reshape([*spec_shape, 1]))(dataset)

    # Create preprocessing pipeline for shuffling and batching
    dataset = pro.pipeline([
        pro.cache(),
        pro.shuffle(hparams['buffer_size']),
        pro.batch(hparams['batch_size']),
        pro.prefetch()
    ])(dataset)

    gan = GAN(spec_shape, hparams)
    gan.discriminator.summary()
    gan.generator.summary()

    # This runs at the end of every epoch and is used to display metrics
    def on_epoch_complete(epoch, step, duration, tsw):
        #display.clear_output(wait=True)
        count = 6
        seed = tf.random.normal((count, gan.hparams['latent_size']))
        mid = gan.hparams['cond_vector_size'] // 2
        pitches = tf.one_hot(range(mid - count // 2, mid + count // 2),
                             gan.hparams['cond_vector_size'],
                             axis=1)

        samples = tf.reshape(gan.generator([seed, pitches], training=False),
                             [-1, 128, 128])
        img = tf.unstack(samples)
        img = tf.reverse(tf.concat(img, axis=1), axis=[0])
        plt.axis('off')
        plt.imshow(img)

        buf = io.BytesIO()
        plt.savefig(buf, format='png')
        buf.seek(0)

        # Convert PNG buffer to TF image
        image = tf.image.decode_png(buf.getvalue(), channels=4)

        # Add the batch dimension
        image = tf.expand_dims(image, 0)

        with tsw.as_default():
            tf.summary.image('Spectrogram', image, step=step)
        print(
            f"Epoch: {epoch}, Step: {step}, Gen Loss: {gen_loss_avg.result()}, Disc Loss: {disc_loss_avg.result()}, Duration: {duration} s"
        )

    trainer = Trainer(dataset, hparams)

    ckpt = tf.train.Checkpoint(
        step=trainer.step,
        generator=gan.generator,
        discriminator=gan.discriminator,
        gen_optimizer=gan.generator_optimizer,
        disc_optimizer=gan.discriminator_optimizer,
    )

    trainer.init_checkpoint(ckpt)
    trainer.init_tensorboard()
    trainer.set_train_step(gan.train_step)
    trainer.on_epoch_start = on_epoch_start
    trainer.on_step = on_step
    trainer.on_epoch_complete = on_epoch_complete

    trainer.run()
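
Finally, an illustrative hparams sketch for this script; all values are assumptions (61 again assumes the 24-84 pitch range from Example #1), and GAN and Trainer read further keys.

start({
    'sample_rate': 16000,
    'log_amin': 1e-6,
    'buffer_size': 10000,
    'batch_size': 64,
    'latent_size': 100,
    'cond_vector_size': 61,
})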