Example #1
def construct(args):
    network = NetworkMelody(args)

    with network.session.graph.as_default():
        spectrogram_function, spectrogram_thumb, spectrogram_info = common.spectrograms(
            args)

        def preload_fn(aa):
            # Rescale the annotation hop from the 44.1 kHz reference grid to
            # the runtime sample rate, then cache the resampled audio and its
            # spectrogram (spectrogram_info[2] is the hop length).
            aa.annotation = datasets.Annotation.from_time_series(
                *aa.annotation, args.frame_width * args.samplerate / 44100)
            aa.audio.load_resampled_audio(args.samplerate).load_spectrogram(
                spectrogram_function, spectrogram_thumb, spectrogram_info[2])

        def dataset_transform(tf_dataset, dataset):
            return tf_dataset.map(dataset.prepare_example,
                                  num_parallel_calls=args.threads).batch(
                                      args.batch_size_evaluation).prefetch(10)

        def dataset_transform_train(tf_dataset, dataset):
            return tf_dataset.shuffle(
                10**5).map(dataset.prepare_example,
                           num_parallel_calls=args.threads).batch(
                               args.batch_size).prefetch(10)

        train_dataset, test_datasets, validation_datasets = common.prepare_datasets(
            args.datasets, args, preload_fn, dataset_transform,
            dataset_transform_train)

        network.construct(args,
                          create_model,
                          train_dataset.dataset.output_types,
                          train_dataset.dataset.output_shapes,
                          spectrogram_info=spectrogram_info)

    return network, train_dataset, validation_datasets, test_datasets
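Every example on this page feeds the network through the same tf.data chain: shuffle with a large buffer, parallel map over prepare_example, batch, prefetch. Below is a self-contained sketch of that chain on placeholder tensors; the shapes, constants and the prepare_example body are illustrative, not taken from this codebase.

import tensorflow as tf

# Placeholder (spectrogram, annotation) pairs standing in for the windows
# that common.prepare_datasets produces; shapes and sizes are illustrative.
pairs = tf.data.Dataset.from_tensor_slices(
    (tf.random.uniform((100, 360, 32)), tf.random.uniform((100, 32))))

def prepare_example(spectrogram, annotation):
    # Stand-in for dataset.prepare_example: per-window decoding/normalization.
    return spectrogram / 65535.0, annotation

train = (pairs
         .shuffle(10**5)                        # buffer larger than the dataset
         .map(prepare_example, num_parallel_calls=4)
         .batch(32)
         .prefetch(10))                         # overlap input prep with the train step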
Example #2
def construct(args):
    network = NetworkMelody(args)

    with network.session.graph.as_default():
        spectrogram_function, spectrogram_thumb, spectrogram_info = common.spectrograms(args)
        # save spectrogram_thumb to hyperparams
        args.spectrogram_thumb = spectrogram_thumb

        # Annotation hop in samples at the runtime rate; the reference
        # annotations are defined on a 44.1 kHz grid.
        hop_samples = args.frame_width*args.samplerate/44100
        print("hop_samples", hop_samples)
        def preload_fn(aa):
            aa.annotation = datasets.Annotation.from_time_series(*aa.annotation, hop_samples=hop_samples)
            aa.audio.load_resampled_audio(args.samplerate).load_spectrogram(spectrogram_function, spectrogram_thumb, spectrogram_info[2])

        def dataset_transform(tf_dataset, dataset):
            return tf_dataset.map(dataset.prepare_example, num_parallel_calls=args.threads).batch(args.batch_size_evaluation).prefetch(10)

        def dataset_transform_train(tf_dataset, dataset):
            return tf_dataset.shuffle(10**5).map(dataset.prepare_example, num_parallel_calls=args.threads).batch(args.batch_size).prefetch(10)

        valid_hooks = [MetricsHook(), VisualOutputHook(False, False, True, True), SaveBestModelHook(args.logdir), CSVOutputWriterHook(), AdjustVoicingHook()]
        train_dataset, test_datasets, validation_datasets = common.prepare_datasets(args.datasets, args, preload_fn, dataset_transform, dataset_transform_train, valid_hooks=valid_hooks)

        network.construct(args, create_model, train_dataset.dataset.output_types, train_dataset.dataset.output_shapes, spectrogram_info=spectrogram_info)

    return network, train_dataset, validation_datasets, test_datasets
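The hop_samples expression rescales the annotation hop from the 44.1 kHz reference grid to the runtime sample rate. A quick check with illustrative values (the real ones come from args):

frame_width = 256            # annotation hop in samples on the 44.1 kHz grid
samplerate = 16000           # runtime sample rate
hop_samples = frame_width * samplerate / 44100
print(hop_samples)           # ~92.88: the same ~5.8 ms hop, now in 16 kHz samples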
Example #3
def construct(args):
    network = NetworkMelody(args)

    with network.session.graph.as_default():
        def spec_function(audio, samplerate):
            # Harmonic CQT (HCQT): one CQT per harmonic multiple of FMIN,
            # stacked along a channel axis.
            cqt_list = []
            shapes = []
            for h in HARMONICS:
                cqt = librosa.cqt(
                    audio, sr=samplerate, hop_length=HOP_LENGTH, fmin=FMIN*float(h),
                    n_bins=N_BINS,
                    bins_per_octave=BINS_PER_OCTAVE
                )
                cqt_list.append(cqt)
                shapes.append(cqt.shape)

            # The per-harmonic CQTs can differ by a frame; trim them all to
            # the shortest time axis so they stack cleanly.
            if not all(s == shapes[0] for s in shapes):
                print("NOT ALL", shapes)
                min_time = np.min([s[1] for s in shapes])
                cqt_list = [cqt[:, :min_time] for cqt in cqt_list]

            # Map dB in [-80, 0] to [0, 1] and quantize to uint16 for caching.
            log_hcqt = ((1.0/80.0) * librosa.core.amplitude_to_db(
                np.abs(np.array(cqt_list)), ref=np.max)) + 1.0

            return (log_hcqt*65535).astype(np.uint16)

            # cqt = librosa.core.cqt(audio, samplerate, hop_length=HOP_LENGTH, fmin=FMIN, n_bins=N_BINS, bins_per_octave=BINS_PER_OCTAVE)
            # # log scaling
            # cqt = librosa.amplitude_to_db(np.abs(cqt), ref=np.max)
            # # uint8 compression
            # cqt = ((cqt/80+1)*255).astype(np.uint8)
            # return cqt

        spectrogram_thumb = "hcqt-fmin{}-oct{}-octbins{}-hop{}-db-uint16".format(FMIN, N_BINS/BINS_PER_OCTAVE, BINS_PER_OCTAVE, HOP_LENGTH)

        def preload_fn(aa):
            aa.annotation = datasets.Annotation.from_time_series(*aa.annotation, 512)
            aa.audio.load_resampled_audio(args.samplerate).load_spectrogram(spec_function, spectrogram_thumb, HOP_LENGTH)

        def dataset_transform(tf_dataset, dataset):
            return tf_dataset.map(dataset.prepare_example, num_parallel_calls=args.threads).batch(args.batch_size_evaluation).prefetch(10)

        def dataset_transform_train(tf_dataset, dataset):
            return tf_dataset.shuffle(10**5).map(dataset.prepare_example, num_parallel_calls=args.threads).batch(args.batch_size).prefetch(10)

        valid_hooks = [MetricsHook(write_estimations=True), VisualOutputHook(False, False, True, True), SaveBestModelHook(args.logdir), AdjustVoicingHook()]
        train_dataset, test_datasets, validation_datasets = common.prepare_datasets(args.datasets, args, preload_fn, dataset_transform, dataset_transform_train, valid_hooks=valid_hooks)

        network.construct(args, create_model, train_dataset.dataset.output_types, train_dataset.dataset.output_shapes)

    return network, train_dataset, validation_datasets, test_datasets
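For context, here is a stand-alone sketch of the harmonic CQT computed above, runnable with just numpy and librosa. The constants mirror the usual HCQT setup for f0 salience models and are assumptions here, since the excerpt defines HARMONICS, FMIN, BINS_PER_OCTAVE, N_BINS and HOP_LENGTH elsewhere in the module.

import numpy as np
import librosa

# Illustrative constants (assumed, not from this codebase).
HARMONICS = [0.5, 1, 2, 3, 4, 5]
FMIN = 32.7                      # C1
BINS_PER_OCTAVE = 60
N_BINS = 6 * BINS_PER_OCTAVE     # six octaves
HOP_LENGTH = 512

sr = 22050
y = np.sin(2 * np.pi * 440.0 * np.arange(5 * sr) / sr).astype(np.float32)  # 5 s of A4

# One CQT per harmonic of FMIN; stacking aligns the h-th harmonic of any f0
# onto the same bin across channels.
cqts = [np.abs(librosa.cqt(y, sr=sr, hop_length=HOP_LENGTH, fmin=FMIN * h,
                           n_bins=N_BINS, bins_per_octave=BINS_PER_OCTAVE))
        for h in HARMONICS]
min_time = min(c.shape[1] for c in cqts)            # frame counts can differ by one
hcqt = np.array([c[:, :min_time] for c in cqts])    # (harmonics, bins, frames)

# Same quantization as the excerpt: dB in [-80, 0] -> [0, 1] -> uint16.
log_hcqt = librosa.amplitude_to_db(hcqt, ref=np.max) / 80.0 + 1.0
packed = (log_hcqt * 65535).astype(np.uint16)
print(packed.shape, packed.dtype)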
Example #4
def construct(args):
    network = NetworkMelody(args)

    with network.session.graph.as_default():
        spectrogram_function, spectrogram_thumb, spectrogram_info = common.spectrograms(
            args)
        # save spectrogram_thumb to hyperparams
        args.spectrogram_thumb = spectrogram_thumb

        def preload_fn(aa):
            annot_path, uid = aa.annotation
            if uid.startswith("mdb_"):
                uid = uid + "_mel4"
            aa.annotation = datasets.Annotation.from_time_series(
                annot_path,
                uid,
                hop_samples=args.frame_width * args.samplerate / 44100,
                unique_mf0=True)
            aa.audio.load_resampled_audio(args.samplerate).load_spectrogram(
                spectrogram_function, spectrogram_thumb, spectrogram_info[2])

        def dataset_transform(tf_dataset, dataset):
            return tf_dataset.map(dataset.prepare_example,
                                  num_parallel_calls=args.threads).batch(
                                      args.batch_size_evaluation).prefetch(10)

        def dataset_transform_train(tf_dataset, dataset):
            return tf_dataset.shuffle(
                10**5).map(dataset.prepare_example,
                           num_parallel_calls=args.threads).batch(
                               args.batch_size).prefetch(10)

        train_dataset, test_datasets, validation_datasets = common.prepare_datasets(
            args.datasets, args, preload_fn, dataset_transform,
            dataset_transform_train)

        if not args.voicing:
            # Add the voicing-adjustment hook to the full-size validation datasets.
            for vd in validation_datasets:
                if not vd.name.startswith("small_"):
                    vd.hooks.append(AdjustVoicingHook())

        network.construct(args,
                          create_model,
                          train_dataset.dataset.output_types,
                          train_dataset.dataset.output_shapes,
                          spectrogram_info=spectrogram_info)

    return network, train_dataset, validation_datasets, test_datasets
Example #5
def construct(args):
    network = NetworkMelody(args)

    with network.session.graph.as_default():
        def preload_fn(aa):
            aa.annotation = datasets.Annotation.from_time_series(*aa.annotation)
            aa.audio.load_resampled_audio(args.samplerate)

        def dataset_transform(tf_dataset, dataset):
            return tf_dataset.map(dataset.prepare_example, num_parallel_calls=args.threads).batch(args.batch_size_evaluation).prefetch(10)

        def dataset_transform_train(tf_dataset, dataset):
            return tf_dataset.shuffle(10**5).map(dataset.prepare_example, num_parallel_calls=args.threads).filter(dataset.is_example_voiced).batch(args.batch_size).prefetch(10)

        train_dataset, test_datasets, validation_datasets = common.prepare_datasets(args.datasets, args, preload_fn, dataset_transform, dataset_transform_train)

        network.construct(args, create_model, train_dataset.dataset.output_types, train_dataset.dataset.output_shapes)

    return network, train_dataset, validation_datasets, test_datasets
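This variant filters unvoiced windows out of the training stream. A minimal sketch of a tf.data predicate in that spirit; the real dataset.is_example_voiced is defined elsewhere, so the names, shapes and voicing criterion below are assumptions.

import tensorflow as tf

def is_example_voiced(spectrogram, annotation):
    # Keep a training window only if it has at least one annotated
    # (nonzero) frame. The predicate must return a scalar boolean.
    return tf.reduce_any(annotation > 0)

examples = tf.data.Dataset.from_tensor_slices(
    (tf.zeros((10, 8, 8)), tf.zeros((10, 8))))
voiced_only = examples.filter(is_example_voiced)  # drops every all-zero window here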
Example #6
def construct(args):
    network = NetworkMelody(args)

    with network.session.graph.as_default():

        def preload_fn(aa):
            aa.annotation = datasets.Annotation.from_time_series(
                *aa.annotation)
            aa.audio.load_resampled_audio(args.samplerate)

        # augment_audio_basa = datasets.Audio("/mnt/tera/jirka/V1/MatthewEntwistle_FairerHopes/MatthewEntwistle_FairerHopes_STEMS/MatthewEntwistle_FairerHopes_STEM_07.wav",
        #                                     "augment_low").load_resampled_audio(args.samplerate).slice(20, 30)
        # augment_audio_perkuse = datasets.Audio("/mnt/tera/jirka/V1/MatthewEntwistle_FairerHopes/MatthewEntwistle_FairerHopes_STEMS/MatthewEntwistle_FairerHopes_STEM_08.wav",
        #                                        "augment_low").load_resampled_audio(args.samplerate).slice(20, 30)

        # augment_audio = augment_audio_basa.samples*10 + augment_audio_perkuse.samples*10

        def dataset_transform(tf_dataset, dataset):
            return tf_dataset.map(dataset.prepare_example).batch(
                args.batch_size_evaluation).prefetch(10)
            # return tf_dataset.map(dataset.prepare_example).map(dataset.mix_example_with(augment_audio)).batch(args.batch_size_evaluation).prefetch(1)

        def dataset_transform_train(tf_dataset, dataset):
            return tf_dataset.shuffle(10**5).map(
                dataset.prepare_example,
                num_parallel_calls=4).filter(dataset.is_example_voiced).batch(
                    args.batch_size).prefetch(10)

        train_dataset, test_datasets, validation_datasets = common.prepare_datasets(
            args.datasets, args, preload_fn, dataset_transform,
            dataset_transform_train)

        # Add voicing hook to the validation dataset
        for vd in validation_datasets:
            if not vd.name.startswith("small_"):
                vd.hooks.append(AdjustVoicingHook())

        network.construct(args, create_model,
                          train_dataset.dataset.output_types,
                          train_dataset.dataset.output_shapes)

    return network, train_dataset, validation_datasets, test_datasets
Example #7
def construct(args):
    network = NetworkMultiInstrumentRecognition(args)

    with network.session.graph.as_default():
        if args.spectrogram == "YunNingHung_cqt":
            HOP_LENGTH = args.frame_width
            FMIN = 27.5
            BINS_PER_OCTAVE = 12
            N_BINS = 88
            top_db = args.spectrogram_top_db
            filter_scale = args.spectrogram_filter_scale

            def spectrogram_function(audio, samplerate):
                print(np.min(audio), np.max(audio))  # debug: input range
                cqt = librosa.cqt(audio,
                                  sr=samplerate,
                                  hop_length=HOP_LENGTH,
                                  fmin=FMIN,
                                  n_bins=N_BINS,
                                  bins_per_octave=BINS_PER_OCTAVE,
                                  filter_scale=filter_scale)
                print(np.min(cqt), np.max(cqt))  # debug: CQT range

                # Map dB in [-top_db, 0] to [0, 1], then quantize to uint16.
                log_cqt = (librosa.core.amplitude_to_db(
                    np.abs(cqt), ref=np.max, top_db=top_db) / top_db) + 1.0
                log_cqt = np.expand_dims(log_cqt, 0)
                return (log_cqt * 65535).astype(np.uint16)

            spectrogram_thumb = "YunNingHung_cqt-fmin{}-oct{}-octbins{}-hop{}-db{}-fs{}-uint16".format(
                FMIN, N_BINS / BINS_PER_OCTAVE, BINS_PER_OCTAVE, HOP_LENGTH,
                top_db, filter_scale)
            spectrogram_info = (1, N_BINS, HOP_LENGTH, FMIN)
        else:
            spectrogram_function, spectrogram_thumb, spectrogram_info = common.spectrograms(
                args)

        # save spectrogram_thumb to hyperparams
        args.spectrogram_thumb = spectrogram_thumb

        # all instruments in MusicNet
        # mapping between MIDI instrument and position in output probability tensor
        instrument_mappings = {
            1: {
                "id": 0,
                "instrument": "piano"
            },
            7: {
                "id": 1,
                "instrument": "harpsichord"
            },
            41: {
                "id": 2,
                "instrument": "violin"
            },
            42: {
                "id": 3,
                "instrument": "viola"
            },
            43: {
                "id": 4,
                "instrument": "cello"
            },
            44: {
                "id": 5,
                "instrument": "contrabass"
            },
            61: {
                "id": 6,
                "instrument": "french horn"
            },
            69: {
                "id": 7,
                "instrument": "oboe"
            },
            71: {
                "id": 8,
                "instrument": "bassoon"
            },
            72: {
                "id": 9,
                "instrument": "clarinet"
            },
            74: {
                "id": 10,
                "instrument": "flute"
            },
        }

        def preload_fn(aa):
            annot_path, uid = aa.annotation
            if uid.startswith("musicnet_mir"):
                aa.annotation = datasets.Annotation.from_musicnet_csv(
                    annot_path,
                    uid,
                    hop_samples=args.frame_width * args.samplerate / 44100,
                    unique_mf0=True,
                    instrument_mappings=instrument_mappings)
            aa.audio.load_resampled_audio(args.samplerate).load_spectrogram(
                spectrogram_function, spectrogram_thumb, spectrogram_info[2])

        def dataset_transform(tf_dataset, dataset):
            return tf_dataset.map(dataset.prepare_example,
                                  num_parallel_calls=args.threads).batch(
                                      args.batch_size_evaluation).prefetch(10)

        def dataset_transform_train(tf_dataset, dataset):
            return tf_dataset.shuffle(
                10**5).map(dataset.prepare_example,
                           num_parallel_calls=args.threads).batch(
                               args.batch_size).prefetch(10)

        small_hooks = [
            MetricsHook_mir(instrument_mappings),
            VisualOutputHook_mir()
        ]
        valid_hooks = [
            AdjustVoicingHook_mir(),
            MetricsHook_mir(instrument_mappings),
            SaveBestModelHook(args.logdir, "micro f1"),
            BatchOutputWriterHook_mir(split="valid", output_reference=True)
        ]
        test_hooks = [
            MetricsHook_mir(instrument_mappings,
                            write_summaries=True,
                            print_detailed=True,
                            split="test"),
            BatchOutputWriterHook_mir(output_reference=True)
        ]
        if args.save_salience:
            test_hooks.append(SaveSaliencesHook())
        print("preparing datasets...")
        train_dataset, test_datasets, validation_datasets = common.prepare_datasets(
            args.datasets,
            args,
            preload_fn,
            dataset_transform,
            dataset_transform_train,
            small_hooks_mf0=small_hooks,
            valid_hooks=valid_hooks,
            test_hooks=test_hooks)
        print("done preparing datasets")

        # Estimate per-note class priors from the training annotations and
        # derive inverse-frequency weights (tempered by the 0.3 exponent) so
        # that rare notes contribute more to the loss.
        all_notes = train_dataset.all_notes()
        notes_count = np.zeros((args.note_range, ))
        for note_frame in all_notes:
            for note in note_frame:
                notes_count[int(note)] += 1

        class_priors = notes_count / np.sum(notes_count)
        mean_prior = 1 / args.note_range
        class_weights = mean_prior / class_priors * (1 - class_priors) / (
            1 - mean_prior)
        class_weights = class_weights**0.3
        print("weights", class_weights)

        # if not args.voicing:
        #     for vd in validation_datasets:
        #         if not vd.name.startswith("small_"):
        #             vd.hooks.append(AdjustVoicingHook())

        network.construct(args,
                          create_model,
                          train_dataset.dataset.output_types,
                          train_dataset.dataset.output_shapes,
                          spectrogram_info=spectrogram_info,
                          class_weights=class_weights)

    return network, train_dataset, validation_datasets, test_datasets
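The class-weight computation above turns note counts into tempered inverse-frequency loss weights. Here is the same arithmetic on a toy histogram (the numbers are illustrative):

import numpy as np

# Toy note histogram over a 5-note range.
notes_count = np.array([900.0, 300.0, 150.0, 40.0, 10.0])
note_range = len(notes_count)

class_priors = notes_count / np.sum(notes_count)
mean_prior = 1 / note_range
# Odds ratio against a uniform prior: > 1 for rare notes, < 1 for common
# ones, tempered by the 0.3 exponent so the correction stays moderate.
class_weights = (mean_prior / class_priors * (1 - class_priors) / (1 - mean_prior)) ** 0.3
print(np.round(class_weights, 2))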
Example #8
def construct(args):
    network = NetworkMultif0(args)

    with network.session.graph.as_default():
        spectrogram_function, spectrogram_thumb, spectrogram_info = common.spectrograms(
            args)
        # save spectrogram_thumb to hyperparams
        args.spectrogram_thumb = spectrogram_thumb

        def preload_fn(aa):
            annot_path, uid = aa.annotation
            if uid.startswith("mdb_"):
                uid = uid + "_mel4"
            # MAPS ground truth comes as MIDI; the other datasets provide
            # time-series annotations.
            if uid.startswith("maps_"):
                aa.annotation = datasets.Annotation.from_midi(
                    annot_path,
                    uid,
                    hop_samples=args.frame_width * args.samplerate / 44100,
                    unique_mf0=True)
            else:
                aa.annotation = datasets.Annotation.from_time_series(
                    annot_path,
                    uid,
                    hop_samples=args.frame_width * args.samplerate / 44100,
                    unique_mf0=True)
            aa.audio.load_resampled_audio(args.samplerate).load_spectrogram(
                spectrogram_function, spectrogram_thumb, spectrogram_info[2])

        def dataset_transform(tf_dataset, dataset):
            return tf_dataset.map(dataset.prepare_example,
                                  num_parallel_calls=args.threads).batch(
                                      args.batch_size_evaluation).prefetch(10)

        def dataset_transform_train(tf_dataset, dataset):
            return tf_dataset.shuffle(
                10**5).map(dataset.prepare_example,
                           num_parallel_calls=args.threads).batch(
                               args.batch_size).prefetch(10)

        small_hooks = [MetricsHook_mf0(), VisualOutputHook_mf0()]
        valid_hooks = [
            MetricsHook_mf0(),
            SaveBestModelHook(args.logdir, "Accuracy")
        ]
        test_hooks = [
            MetricsHook_mf0(write_summaries=True,
                            print_detailed=False,
                            split="test"),
            CSVBatchOutputWriterHook_mf0(output_reference=True)
        ]

        print("preparing datasets...")
        train_dataset, test_datasets, validation_datasets = common.prepare_datasets(
            args.datasets,
            args,
            preload_fn,
            dataset_transform,
            dataset_transform_train,
            small_hooks_mf0=small_hooks,
            valid_hooks=valid_hooks,
            test_hooks=test_hooks)
        print("done preparing datasets")

        # if not args.voicing:
        #     for vd in validation_datasets:
        #         if not vd.name.startswith("small_"):
        #             vd.hooks.append(AdjustVoicingHook())

        network.construct(args,
                          create_model,
                          train_dataset.dataset.output_types,
                          train_dataset.dataset.output_shapes,
                          spectrogram_info=spectrogram_info)

    return network, train_dataset, validation_datasets, test_datasets
Example #9
def construct(args):
    network = NetworkMultif0(args)

    HOP_LENGTH = args.frame_width
    # https://github.com/rainerkelz/framewise_2016/blob/master/datasets.py
    if args.spectrogram == "kelz":
        FMIN = 30
        FMAX = 8000
        NUMBANDS = 48

        def spectrogram_function(audio, samplerate):
            # Log-filterbank spectrogram following Kelz et al.;
            # LogarithmicFilterbank and LogarithmicFilteredSpectrogram are
            # madmom classes (imported elsewhere in the module).
            audio_options = dict(num_channels=1,
                                 sample_rate=samplerate,
                                 filterbank=LogarithmicFilterbank,
                                 frame_size=4096,
                                 fft_size=4096,
                                 hop_size=HOP_LENGTH,
                                 num_bands=NUMBANDS,
                                 fmin=FMIN,
                                 fmax=FMAX,
                                 fref=440.0,
                                 norm_filters=True,
                                 unique_filters=True,
                                 circular_shift=False,
                                 norm=True)
            x = LogarithmicFilteredSpectrogram(audio, **audio_options)
            x = x.T                       # (bins, frames)
            x = x / np.max(x)             # peak-normalize to [0, 1]
            x = np.expand_dims(x, 0)      # add a channel axis
            return (np.array(x) * 65535).astype(np.uint16)

        N_BINS = 229

        spectrogram_thumb = "kelz-fmin{}-fmax{}-bands{}-hop{}-uint16".format(
            FMIN, FMAX, NUMBANDS, HOP_LENGTH)
        spectrogram_info = (1, N_BINS, HOP_LENGTH, FMIN)

    elif args.spectrogram == "cqt_kelz":
        FMIN = 32.7
        BINS_PER_OCTAVE = 48
        N_BINS = BINS_PER_OCTAVE * 5
        top_db = 110
        filter_scale = 1.0

        def spectrogram_function(audio, samplerate):
            cqt = librosa.cqt(audio,
                              sr=samplerate,
                              hop_length=HOP_LENGTH,
                              fmin=FMIN,
                              n_bins=N_BINS,
                              bins_per_octave=BINS_PER_OCTAVE,
                              filter_scale=filter_scale)

            log_cqt = (librosa.core.amplitude_to_db(
                np.abs(cqt), ref=np.max, top_db=top_db) / top_db) + 1.0
            log_cqt = np.expand_dims(log_cqt, 0)
            return (log_cqt * 65535).astype(np.uint16)

        spectrogram_thumb = "cqt-fmin{}-oct{}-octbins{}-hop{}-db{}-fs{}-uint16".format(
            FMIN, N_BINS / BINS_PER_OCTAVE, BINS_PER_OCTAVE, HOP_LENGTH,
            top_db, filter_scale)
        spectrogram_info = (1, N_BINS, HOP_LENGTH, FMIN)

    else:
        spectrogram_function, spectrogram_thumb, spectrogram_info = common.spectrograms(
            args)

    # save spectrogram_thumb to hyperparams
    args.spectrogram_thumb = spectrogram_thumb

    with network.session.graph.as_default():

        def preload_fn(aa):
            annot_path, uid = aa.annotation
            if uid.startswith("mdb_"):
                uid = uid + "_mel4"
            if uid.startswith("maps_"):
                aa.annotation = datasets.Annotation.from_midi(
                    annot_path,
                    uid,
                    hop_samples=args.frame_width * args.samplerate / 44100,
                    unique_mf0=True)
            else:
                aa.annotation = datasets.Annotation.from_time_series(
                    annot_path,
                    uid,
                    hop_samples=args.frame_width * args.samplerate / 44100,
                    unique_mf0=True)
            aa.audio.load_resampled_audio(args.samplerate).load_spectrogram(
                spectrogram_function, spectrogram_thumb, spectrogram_info[2])

        def dataset_transform(tf_dataset, dataset):
            return tf_dataset.map(dataset.prepare_example,
                                  num_parallel_calls=args.threads).batch(
                                      args.batch_size_evaluation).prefetch(10)

        def dataset_transform_train(tf_dataset, dataset):
            return tf_dataset.shuffle(
                10**5).map(dataset.prepare_example,
                           num_parallel_calls=args.threads).batch(
                               args.batch_size).prefetch(10)

        small_hooks = [MetricsHook_mf0(), VisualOutputHook_mf0()]
        valid_hooks = [
            MetricsHook_mf0(),
            SaveBestModelHook(args.logdir, "Accuracy")
        ]
        test_hooks = [
            MetricsHook_mf0(write_summaries=True,
                            print_detailed=False,
                            split="test"),
            CSVBatchOutputWriterHook_mf0(output_reference=True)
        ]
        train_dataset, test_datasets, validation_datasets = common.prepare_datasets(
            args.datasets,
            args,
            preload_fn,
            dataset_transform,
            dataset_transform_train,
            small_hooks_mf0=small_hooks,
            valid_hooks=valid_hooks,
            test_hooks=test_hooks)

        network.construct(args,
                          create_model,
                          train_dataset.dataset.output_types,
                          train_dataset.dataset.output_shapes,
                          spectrogram_info=spectrogram_info)

    return network, train_dataset, validation_datasets, test_datasets
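All spectrogram variants on this page cache their output as uint16 in [0, 65535]. A consumer would invert that quantization roughly like this (the helper name is hypothetical):

import numpy as np

def unpack_spectrogram(packed):
    # Inverse of "(x * 65535).astype(np.uint16)": back to floats in [0, 1].
    return packed.astype(np.float32) / 65535.0

# E.g. to recover (clipped) decibels for a top_db of 80:
# db = (unpack_spectrogram(packed) - 1.0) * 80.0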