Example #1
def test_mux_of_muxes_single():
    # Check on Issue #79
    abc = pescador.Streamer('abc')
    xyz = pescador.Streamer('xyz')
    mux1 = pescador.Mux([abc, xyz],
                        k=2,
                        rate=None,
                        revive=True,
                        with_replacement=False,
                        prune_empty_streams=False)

    n123 = pescador.Streamer('123')
    n456 = pescador.Streamer('456')
    mux2 = pescador.Mux([n123, n456],
                        k=2,
                        rate=None,
                        revive=True,
                        with_replacement=False,
                        prune_empty_streams=False)

    mux3 = pescador.Mux([mux1, mux2],
                        k=2,
                        rate=None,
                        with_replacement=False,
                        revive=True,
                        prune_empty_streams=False)
    samples3 = list(mux3.iterate(max_iter=10000))
    count3 = collections.Counter(samples3)
    print(samples3[:10], count3)
    assert set('abcxyz123456') == set(count3.keys())
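
For orientation, here is a minimal, self-contained sketch of the pattern this test exercises, assuming the same pescador 1.x API the tests use: with rate=None, with_replacement=False, and revive=True, every substream stays active and is restarted on exhaustion, so a mux of muxes keeps yielding from all of its leaves.

import collections

import pescador

# Two leaf streamers behind an inner mux...
inner = pescador.Mux([pescador.Streamer('ab'), pescador.Streamer('cd')],
                     k=2, rate=None, with_replacement=False, revive=True)
# ...and an outer mux over the inner mux plus one more leaf.
# A Mux is itself a Streamer, so nesting works directly.
outer = pescador.Mux([inner, pescador.Streamer('ef')],
                     k=2, rate=None, with_replacement=False, revive=True)
print(collections.Counter(outer.iterate(max_iter=600)))
# All of 'abcdef' should appear in the counts.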
Example #2
def test_sampled_mux_of_muxes():
    def _cycle(values):
        while True:
            for v in values:
                yield v

    # Build some sample streams
    ab = pescador.Streamer(_cycle, 'ab')
    cd = pescador.Streamer(_cycle, 'cd')
    ef = pescador.Streamer(_cycle, 'ef')
    mux1 = pescador.Mux([ab, cd, ef],
                        k=3,
                        rate=None,
                        with_replacement=False,
                        revive=False)

    # And inspect the first mux
    samples1 = list(mux1(max_iter=6 * 10))
    count1 = collections.Counter(samples1)
    print(count1)
    assert set(count1.keys()) == set('abcdef')

    # Build another set of streams
    gh = pescador.Streamer(_cycle, 'gh')
    ij = pescador.Streamer(_cycle, 'ij')
    kl = pescador.Streamer(_cycle, 'kl')
    mux2 = pescador.Mux([gh, ij, kl],
                        k=3,
                        rate=None,
                        with_replacement=False,
                        revive=False)

    # And inspect the second mux
    samples2 = list(mux2(max_iter=6 * 10))
    count2 = collections.Counter(samples2)
    print(count2)
    assert set(count2.keys()) == set('ghijkl')

    # Merge the muxes together.
    mux3 = pescador.Mux([mux1, mux2],
                        k=2,
                        rate=None,
                        with_replacement=False,
                        revive=False)
    samples3 = list(mux3.iterate(max_iter=10000))
    count3 = collections.Counter(samples3)
    print(count3)
    assert set('abcdefghijkl') == set(count3.keys())
    max_count, min_count = max(count3.values()), min(count3.values())
    assert (max_count - min_count) / max_count < 0.2
Example #3
def data_generator(working, tracks, sampler, k, batch_size=32,
                   augmentation=False, weights=None, **kwargs):
    '''Generate a data stream from a collection of tracks and a sampler'''

    seeds = []
    pool_weights = []

    for track in tracks:
        fname = os.path.join(working, 'pump',
                             os.path.extsep.join([track, 'npz']))
        seeds.append(pescador.Streamer(data_sampler, fname, sampler))

        if weights is not None:
            pool_weights.append(weights.loc[track])

        if augmentation:
            for fname in sorted(glob(os.path.join(working, 'pump',
                                                  '{}.*.npz'.format(track)))):
                seeds.append(pescador.Streamer(data_sampler, fname, sampler))
                if weights is not None:
                    pool_weights.append(weights.loc[track])

    # Send it all to a mux
    if not pool_weights:
        pool_weights = None

    mux = pescador.Mux(seeds, k, pool_weights=pool_weights, **kwargs)

    if batch_size == 1:
        return mux
    else:
        return pescador.BufferedStreamer(mux, batch_size)
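
A hypothetical usage sketch for the generator above; the working directory, track list, sampler, batch keys, and model are placeholders, and data_sampler comes from the source project. Since this example's Mux takes the older lam/pool_weights keywords, extra Mux options are passed the same way:

gen = data_generator('/path/to/working', train_tracks, sampler, k=8,
                     batch_size=32, lam=None, with_replacement=True)
# BufferedStreamer exposes the Streamer interface, so iteration can be capped.
for batch in gen.iterate(max_iter=100):
    model.train_on_batch(batch['X'], batch['Y'])  # hypothetical keys and model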
Example #4
def data_generator(working,
                   tracks,
                   sampler,
                   k,
                   augment=True,
                   augment_drc=True,
                   batch_size=32,
                   **kwargs):
    '''Generate a data stream from a collection of tracks and a sampler'''

    seeds = []

    for track in tqdm(tracks):
        fname = os.path.join(working, os.path.extsep.join([str(track), 'h5']))
        seeds.append(pescador.Streamer(data_sampler, fname, sampler))

        if augment:
            for aug in range(4):
                augname = fname.replace('.h5', '.{:d}.h5'.format(aug))
                seeds.append(pescador.Streamer(data_sampler, augname, sampler))

    # Send it all to a mux
    mux = pescador.Mux(seeds, k, **kwargs)

    if batch_size == 1:
        return mux
    else:
        return pescador.BufferedStreamer(mux, batch_size)
Example #5
def data_generator(working,
                   tracks,
                   sampler,
                   k,
                   threshold,
                   augment=True,
                   batch_size=32,
                   **kwargs):
    '''Generate a data stream from a collection of tracks and a sampler'''

    seeds = []

    for track in tqdm(tracks):
        fname = os.path.join(working, os.path.extsep.join([str(track), 'h5']))
        seeds.append(pescador.Streamer(data_sampler, fname, sampler,
                                       threshold))

        if augment:
            for augname in sorted(
                    glob(os.path.join(working, '{}.*.h5'.format(track)))):
                seeds.append(
                    pescador.Streamer(data_sampler, augname, sampler,
                                      threshold))

    # Send it all to a mux
    mux = pescador.Mux(seeds, k, **kwargs)

    if batch_size == 1:
        return mux
    else:
        return pescador.BufferedStreamer(mux, batch_size)
Example #6
def test_mux_bad_weights():
    with pytest.raises(pescador.PescadorError):
        streamers = [
            pescador.Streamer(T.finite_generator, 10) for _ in range(5)
        ]

        # 5 streamers, all-zeros weight vector should trigger an error
        pescador.Mux(streamers, None, weights=np.zeros(5))
Example #7
def test_mux_bad_streamers():
    with pytest.raises(pescador.PescadorError):
        streamers = [
            pescador.Streamer(T.finite_generator, 10) for _ in range(5)
        ]

        # 5 streamers, 10 weights: the mismatch should trigger an error
        pescador.Mux(streamers, None, weights=np.random.randn(10))
Example #8
def test_restart_mux():
    s1 = pescador.Streamer('abc')
    s2 = pescador.Streamer('def')
    mux = pescador.Mux([s1, s2],
                       k=2,
                       rate=None,
                       revive=True,
                       with_replacement=False,
                       random_state=1234)
    assert len(list(mux(max_iter=100))) == len(list(mux(max_iter=100)))
Example #9
def data_generator(subset_path, k=32, batch_size=64, random_state=20171021,
                   precompute=False, num_distractors=1, augment=False, rate=32,
                   max_videos=None, include_metadata=False, cycle=True):
    """Sample video and audio from data_dir, returns a streamer that yield samples infinitely.

    Args:
        subset_path: path to subset file
        k: number of concurrent open streamer
        batch_size: batch size
        random_state: Value used to initialize state of RNG
        num_distractors: Number of pairs to generate a stream for each video

    Returns:
        A generator that yield infinite video and audio samples from data_dir

    """

    random.seed(random_state)
    np.random.seed(random_state)


    LOGGER.info("Loading subset list")
    file_list = read_csv_as_dicts(subset_path)

    LOGGER.info("Creating streamers...")
    if max_videos is not None and max_videos < len(file_list):
        LOGGER.info("Using a subset of {} videos".format(max_videos))
        random.shuffle(file_list)
        file_list = file_list[:max_videos]

    seeds = []
    for video_1 in tqdm(file_list):
        for _ in range(num_distractors):
            video_2 = video_1
            # Make sure we sample a different file
            while video_2 == video_1:
                video_2 = random.choice(file_list)

            streamer = pescador.Streamer(sampler, video_1, video_2,
                                         rate=rate, augment=augment,
                                         precompute=precompute,
                                         include_metadata=include_metadata)
            seeds.append(streamer)

    # Randomly shuffle the seeds
    random.shuffle(seeds)

    mux = pescador.Mux(seeds, k, rate=rate, random_state=random_state)
    if cycle:
        mux = mux.cycle()

    if batch_size == 1:
        return mux
    else:
        return pescador.maps.buffer_stream(mux, batch_size)
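
A hypothetical consumption sketch (the subset path is a placeholder): with the default cycle=True the stream is endless, so itertools.islice caps the number of batches drawn.

import itertools

import numpy as np

batches = data_generator('subsets/train.csv', k=32, batch_size=64)
for batch in itertools.islice(batches, 10):
    # Each buffered batch should be a dict of stacked arrays.
    print({key: np.shape(value) for key, value in batch.items()})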
Example #10
def test_mux_of_muxes_itered():
    # Check on Issue #79
    abc = pescador.Streamer('abc')
    xyz = pescador.Streamer('xyz')
    mux1 = pescador.Mux([abc, xyz],
                        k=10,
                        rate=None,
                        prune_empty_streams=False,
                        revive=True,
                        random_state=135)
    samples1 = mux1.iterate(max_iter=1000)
    count1 = collections.Counter(samples1)
    print(count1)
    assert set('abcxyz') == set(count1.keys())

    n123 = pescador.Streamer('123')
    n456 = pescador.Streamer('456')
    mux2 = pescador.Mux([n123, n456],
                        k=10,
                        rate=None,
                        prune_empty_streams=False,
                        revive=True,
                        random_state=246)
    samples2 = mux2.iterate(max_iter=1000)
    count2 = collections.Counter(samples2)
    print(count2)
    assert set('123456') == set(count2.keys())

    # Note that (random_state=987, k=2) fails.
    mux3 = pescador.Mux([mux1, mux2],
                        k=10,
                        rate=None,
                        prune_empty_streams=False,
                        revive=True,
                        random_state=987)
    samples3 = mux3.iterate(max_iter=1000)
    count3 = collections.Counter(samples3)
    print(count3)
    assert set('abcxyz123456') == set(count3.keys())
Example #11
def test_critical_mux():
    # Check on Issue #80
    chars = 'abcde'
    streamers = [pescador.Streamer(x * 5) for x in chars]
    mux = pescador.Mux(streamers,
                       k=len(chars),
                       rate=None,
                       with_replacement=False,
                       revive=True,
                       prune_empty_streams=False,
                       random_state=135)
    samples = mux.iterate(max_iter=1000)
    print(collections.Counter(samples))
Example #12
def test_critical_mux_of_rate_limited_muxes():
    # Check on Issue #79
    def _choice(vals):
        while True:
            yield random.choice(vals)

    ab = pescador.Streamer(_choice, 'ab')
    cd = pescador.Streamer(_choice, 'cd')
    ef = pescador.Streamer(_choice, 'ef')
    mux1 = pescador.Mux([ab, cd, ef],
                        k=2,
                        rate=2,
                        with_replacement=False,
                        revive=True)

    gh = pescador.Streamer(_choice, 'gh')
    ij = pescador.Streamer(_choice, 'ij')
    kl = pescador.Streamer(_choice, 'kl')

    mux2 = pescador.Mux([gh, ij, kl],
                        k=2,
                        rate=2,
                        with_replacement=False,
                        revive=True)

    mux3 = pescador.Mux([mux1, mux2],
                        k=2,
                        rate=None,
                        with_replacement=False,
                        revive=True)
    samples = list(mux3.iterate(max_iter=10000))
    count = collections.Counter(samples)
    max_count, min_count = max(count.values()), min(count.values())
    assert (max_count - min_count) / max_count < 0.2
    print(count)
    assert set('abcdefghijkl') == set(count.keys())
Example #13
def keras_generator(data_list, input_patch_size):
    """Generator to be passed to a keras model
    """
    streams = []
    for fpath_in, fpath_out in data_list:
        streams.append(
            pescador.Streamer(patch_generator,
                              fpath_in,
                              fpath_out,
                              input_patch_size=input_patch_size))

    stream_mux = pescador.Mux(streams,
                              10,
                              with_replacement=True,
                              lam=500,
                              random_state=RANDOM_STATE)

    batch_generator = pescador.BufferedStreamer(stream_mux, 16)

    for batch in batch_generator.tuples('X', 'Y'):
        yield batch
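
A hypothetical training sketch for the generator above (the model, file list, and patch size are placeholders; patch_generator comes from the source project). The generator yields (X, Y) tuples, which is what fit_generator in older Keras versions expects:

train_gen = keras_generator(train_list, input_patch_size=(360, 50))
model.fit_generator(train_gen, steps_per_epoch=100, epochs=10)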
Example #14
def multiplex_tfr(aug_kind_str,
                  fold_units,
                  n_hops,
                  batch_size,
                  tfr_str="logmelspec"):
    # Parse augmentation kind string (aug_kind_str).
    if aug_kind_str == "none":
        augs = ["original"]
    elif aug_kind_str == "pitch":
        augs = ["original", "pitch"]
    elif aug_kind_str == "stretch":
        augs = ["original", "stretch"]
    elif aug_kind_str == "all-but-noise":
        augs = ["original", "pitch", "stretch"]
    else:
        noise_augs = ["noise-" + unit_str for unit_str in fold_units]
        if aug_kind_str == "all":
            augs = noise_augs + ["original", "pitch", "stretch"]
        elif aug_kind_str == "noise":
            augs = noise_augs + ["original"]

    # Generate a Pescador streamer for every HDF5 container, that is,
    # every unit-augmentation-instance triplet.
    aug_dict = get_augmentations()
    data_dir = get_data_dir()
    dataset_name = get_dataset_name()
    tfr_name = "_".join([dataset_name, "clip-" + tfr_str])
    tfr_dir = os.path.join(data_dir, tfr_name)
    streams = []
    for aug_str in augs:
        aug_dir = os.path.join(tfr_dir, aug_str)
        if aug_str == "original":
            instances = [aug_str]
        else:
            n_instances = aug_dict[aug_str]
            instances = [
                "-".join([aug_str, str(instance_id)])
                for instance_id in range(n_instances)
            ]
        if aug_str[:5] == "noise" and tfr_str == "logmelspec":
            bias = np.float32(-17.0)
        else:
            bias = np.float32(0.0)
        for instanced_aug_str in instances:
            for unit_str in fold_units:
                lms_name = "_".join(
                    [dataset_name, instanced_aug_str, unit_str])
                lms_path = os.path.join(aug_dir, lms_name + ".hdf5")
                stream = pescador.Streamer(yield_tfr, lms_path, n_hops, bias,
                                           tfr_str)
                streams.append(stream)

    # Multiplex streamers together.
    mux = pescador.Mux(streams,
                       k=len(streams),
                       lam=None,
                       with_replacement=True,
                       revive=True)

    # Create buffered streamer with specified batch size.
    buffered_streamer = pescador.BufferedStreamer(mux, batch_size)

    return buffered_streamer.tuples("X", "y", cycle=True)
Example #15
def data_generator_balanced(working,
                            tracks,
                            sampler,
                            k,
                            augment=True,
                            augment_drc=True,
                            batch_size=32,
                            **kwargs):
    '''Generate a data stream from a collection of tracks and a sampler'''

    labelfile = (
        '/beegfs/js7561/datasets/dcase2017/task4_official/combined/metadata/'
        'labels/groundtruth_weak_label_training_set.csv')
    labels = pd.read_csv(labelfile, header=None, sep='\t')
    labels.columns = ['filename', 'start_time', 'end_time', 'label']

    muxes = []

    for l in DCASE_CLASSES:
        lclass = labels[labels.label == l]
        filenames = lclass.filename.values
        filenames = [('Y{}'.format(fn)).replace('.wav', '')
                     for fn in filenames]
        tracks_str = [str(t) for t in tracks]
        filenames = np.intersect1d(np.array(tracks_str), np.array(filenames))
        print(l, len(filenames))

        seeds = []

        for track in tqdm(filenames):
            fname = os.path.join(working,
                                 os.path.extsep.join([str(track), 'h5']))
            seeds.append(pescador.Streamer(data_sampler, fname, sampler))

            if augment:
                for aug in range(10):
                    augname = fname.replace('.h5', '.{:d}.h5'.format(aug))
                    seeds.append(
                        pescador.Streamer(data_sampler, augname, sampler))

            if augment_drc:
                for aug in range(10, 14):
                    augname = fname.replace('.h5', '.{:d}.h5'.format(aug))
                    seeds.append(
                        pescador.Streamer(data_sampler, augname, sampler))

        # Send it all to a mux
        n_active = k // len(DCASE_CLASSES)
        mux = pescador.Mux(seeds, n_active, **kwargs)
        # Add mux to list
        muxes.append(mux)

    # Create mux from muxes
    supermux = pescador.Mux(muxes,
                            len(muxes),
                            lam=None,
                            pool_weights=None,
                            with_replacement=True)

    if batch_size == 1:
        return supermux
    else:
        return pescador.BufferedStreamer(supermux, batch_size)
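
The balancing idea above generalizes: build one mux per class, then multiplex the class muxes so that every class contributes equally no matter how many files it has. A toy sketch with character streams, assuming the same older lam/with_replacement API this example uses:

import collections

import pescador

# Class 'a' has nine source streams, class 'b' only one.
class_a = pescador.Mux([pescador.Streamer('a' * 10) for _ in range(9)],
                       1, lam=None, with_replacement=True)
class_b = pescador.Mux([pescador.Streamer('b' * 10)],
                       1, lam=None, with_replacement=True)
# Sampling without replacement keeps each class in its own slot.
balanced = pescador.Mux([class_a, class_b], 2,
                        lam=None, with_replacement=False, revive=True)
print(collections.Counter(balanced.iterate(max_iter=1000)))
# Expect roughly equal counts of 'a' and 'b' despite the 9:1 imbalance.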
Example #16
def multiplex_lms_with_background(aug_kind_str, fold_units, n_input_hops,
                                  batch_size):

    # Define constants.
    aug_dict = localmodule.get_augmentations()
    data_dir = localmodule.get_data_dir()
    dataset_name = localmodule.get_dataset_name()
    tfr_name = "_".join([dataset_name, "clip-logmelspec"])
    tfr_dir = os.path.join(data_dir, tfr_name)
    bg_name = "_".join([dataset_name, "clip-logmelspec-backgrounds"])
    bg_dir = os.path.join(data_dir, bg_name)
    T_str = "T-" + str(bg_duration).zfill(4)
    T_dir = os.path.join(bg_dir, T_str)

    # Parse augmentation kind string (aug_kind_str).
    if aug_kind_str == "none":
        augs = ["original"]
    elif aug_kind_str == "pitch":
        augs = ["original", "pitch"]
    elif aug_kind_str == "stretch":
        augs = ["original", "stretch"]
    elif aug_kind_str == "all-but-noise":
        augs = ["original", "pitch", "stretch"]
    else:
        noise_augs = ["noise-" + unit_str for unit_str in fold_units]
        if aug_kind_str == "all":
            augs = noise_augs + ["original", "pitch", "stretch"]
        elif aug_kind_str == "noise":
            augs = noise_augs + ["original"]

    # Loop over augmentations.
    streams = []
    for aug_str in augs:

        # Define instances.
        aug_dir = os.path.join(tfr_dir, aug_str)
        if aug_str == "original":
            instances = [aug_str]
        else:
            n_instances = aug_dict[aug_str]
            instances = [
                "-".join([aug_str, str(instance_id)])
                for instance_id in range(n_instances)
            ]

        # Define bias.
        if aug_str[:5] == "noise":
            bias = np.float32(-17.0)
        else:
            bias = np.float32(0.0)

        # Loop over instances.
        for instanced_aug_str in instances:

            # Loop over units.
            for unit_str in fold_units:

                # Define path to time-frequency representation.
                lms_name = "_".join(
                    [dataset_name, instanced_aug_str, unit_str])
                lms_path = os.path.join(aug_dir, lms_name + ".hdf5")

                # Define path to background.
                bg_name = "_".join([
                    dataset_name, "background_summaries", unit_str,
                    T_str + ".hdf5"
                ])
                bg_path = os.path.join(T_dir, bg_name)

                # Define pescador streamer.
                stream = pescador.Streamer(yield_lms_and_background, lms_path,
                                           n_input_hops, bias, bg_path)
                streams.append(stream)

    # Multiplex streamers together.
    mux = pescador.Mux(streams,
                       k=len(streams),
                       lam=None,
                       with_replacement=True,
                       revive=True)

    # Create buffered streamer with specified batch size.
    buffered_streamer = pescador.BufferedStreamer(mux, batch_size)

    return pescador.maps.keras_tuples(buffered_streamer,
                                      inputs=["X_spec", "X_bg"],
                                      outputs=["y"])
Example #17
##############################################
# Put it all together
##############################################
input_shape, (X_train, Y_train), (X_test, Y_test) = setup_data()
steps_per_epoch = len(X_train) // batch_size

# Create two streams from the same data, where one of the streams
# adds a small amount of Gaussian noise. You could easily perform
# other data augmentations using the same 'map' strategy.
stream = pescador.Streamer(sampler, X_train, Y_train)
noisy_stream = pescador.Streamer(additive_noise, stream, 'X')

# Multiplex the two streamers together.
mux = pescador.Mux(
    [stream, noisy_stream],
    # Two streams, always active.
    k=2,
    # We want to sample from each stream infinitely.
    rate=None)

# Buffer the stream into minibatches.
batches = pescador.buffer_stream(mux, batch_size)

model = build_model(input_shape)
try:
    print("Start time: {}".format(datetime.datetime.now()))
    model.fit_generator(pescador.tuples(batches, 'X', 'y'),
                        steps_per_epoch=steps_per_epoch,
                        epochs=epochs,
                        verbose=1,
                        validation_data=(X_test, Y_test))
except KeyboardInterrupt:
    print("Stopped at: {}".format(datetime.datetime.now()))


def npz_generator(npz_path):
    """Yield random samples from a single npz archive.

    Assumes the archive stores arrays under the keys 'X' and 'Y'.
    """
    npz_data = np.load(npz_path)
    X = npz_data['X']
    y = npz_data['Y']
    n = X.shape[0]
    while True:
        i = np.random.randint(0, n)
        yield {'X': X[i], 'Y': y[i]}


streams = [pescador.Streamer(npz_generator, x) for x in datasets]

##############################################
# Option 1: Stream equally from each dataset
##############################################
# If you can easily fit all the datasets in memory and you want to
# sample from them equally, you would set up your Mux as follows:

mux = pescador.Mux(
    streams,
    # Three streams, always active.
    k=len(streams),
    # We want to sample from each stream infinitely,
    # so we turn off the rate parameter, which
    # controls how long to sample from each stream.
    rate=None)

##############################################
# Option 2: Sample from one at a time.
##############################################
# Another approach might be to restrict sampling to one stream at a time.
# Now, the rate parameter controls (statistically) how long to sample
# from a stream before activating a new stream.

mux = pescador.Mux(
    streams,
    # Only allow one active stream
    k=1,
    # Sample a finite number of items from each active stream before
    # switching to a new one; the value here is illustrative.
    rate=16)
Example #19
def multitask_generator(mtrack_list,
                        json_path=JSON_PATH,
                        data_types=DATA_TYPES,
                        tasks=TASKS,
                        mux_weights=None,
                        add_frequency=False,
                        augment=True,
                        n_harms=5):

    typed_data = get_grouped_data(json_path, mtrack_list)
    task_pairs = get_all_task_pairs(typed_data)

    # make a streamer for each data type and task
    data_streamers = {}
    for data_type in data_types:
        if data_type == 'XA':
            data_streamers[data_type] = {'melody': [], 'bass': [], 'vocal': []}
        elif data_type == 'XB':
            data_streamers[data_type] = {
                'multif0': [],
                'melody': [],
                'bass': [],
                'vocal': [],
                'piano': [],
                'guitar': []
            }
        elif data_type == 'XC':
            data_streamers[data_type] = {
                'multif0': [],
                'melody': [],
                'bass': [],
                'vocal': [],
                'piano': []
            }
        else:
            data_streamers[data_type] = {
                'multif0': [],
                'melody': [],
                'bass': [],
                'vocal': []
            }

        for task in data_streamers[data_type].keys():
            for pair in task_pairs[data_type][task]:
                data_streamers[data_type][task].append(
                    pescador.Streamer(multitask_patch_generator, pair[0],
                                      pair[1], tasks, 20, (360, 50),
                                      add_frequency, augment, n_harms))

    # for each data type make a mux
    n_active = 10
    data_muxes = {}
    for data_type in data_types:
        data_muxes[data_type] = {}
        for task in data_streamers[data_type].keys():
            data_muxes[data_type][task] = pescador.Mux(
                data_streamers[data_type][task],
                n_active,
                with_replacement=True,
                lam=250,
                random_state=42)

    # for each task make a mux that samples from the data muxes
    task_streams = {}
    for task in tasks:
        task_streams[task] = []
        for data_type in data_types:
            if task in data_muxes[data_type].keys():
                task_streams[task].append(data_muxes[data_type][task])

    if mux_weights is None:
        mux_weights = {}
        for task in tasks:
            n_data_types = len(task_streams[task])
            mux_weights[task] = np.ones((n_data_types, )) / float(n_data_types)

    task_muxes = {}
    for task in tasks:
        task_muxes[task] = pescador.Mux(task_streams[task],
                                        1,
                                        pool_weights=mux_weights[task])

    batch_gen = multitask_batch_generator(task_muxes, tasks)

    for batch in batch_gen:
        yield batch
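
Finally, a hypothetical smoke-test sketch (the track list is a placeholder; the helper functions come from the source project): multitask_generator is itself a generator, so batches can be pulled from it directly.

import itertools

for batch in itertools.islice(multitask_generator(train_mtracks), 5):
    print(type(batch))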