def create_zmq_mux(streamers, num_cpus, active_streamers, streamer_rate, weights=None):
    num_streamers = len(streamers)
    if weights is None:
        weights = np.ones((num_streamers,))

    weights = np.array(weights)

    # Normalize to sum to 1
    weights = weights / weights.sum()

    # Ceiling division so that every streamer lands in some partition,
    # even when num_streamers is not divisible by num_cpus.
    partition_size = max(int(np.ceil(num_streamers / float(num_cpus))), 1)

    zmq_streamers = []
    zmq_weights = []

    actual_num_cpus = 0

    for idx in range(num_cpus):
        start = partition_size * idx
        stop = min(len(streamers), start + partition_size)
        if start >= stop:
            break

        actual_num_cpus += 1

        weight = sum(weights[start:stop])
        zmq_weights.append(weight)

        sub_weights = np.array(weights[start:stop]) / weight
        zmq_streamers.append(pescador.ZMQStreamer(pescador.StochasticMux(streamers[start:stop],
                                                               n_active=int(active_streamers * weight),
                                                               rate=streamer_rate,
                                                               weights=sub_weights)))

    return pescador.StochasticMux(zmq_streamers, n_active=actual_num_cpus, rate=None, weights=zmq_weights)
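# A hedged usage sketch for create_zmq_mux. The sample_patches generator
# below is hypothetical (real streamers would typically read from HDF5 or
# audio files); it only illustrates how the streamers are spread over
# num_cpus ZMQ worker processes.
import numpy as np
import pescador

def sample_patches(seed):
    rng = np.random.RandomState(seed)
    while True:
        yield {'X': rng.randn(16), 'y': rng.randint(2)}

streamers = [pescador.Streamer(sample_patches, i) for i in range(64)]
mux = create_zmq_mux(streamers, num_cpus=4,
                     active_streamers=32, streamer_rate=1024)

# Iterating this draws minibatches of 8 samples; the ZMQ worker processes
# are spawned lazily on first iteration.
batches = pescador.maps.buffer_stream(mux, 8)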
Example #2
    def _build_streamer(self, start_index: int,
                        end_index: int) -> pescador.Streamer:
        """Create a pescador streamer for the provided indecies into the dataset."""
        if (self.streamer_settings["n_frames"] is None
                or self.streamer_settings["n_target_frames"] is None):
            raise ValueError(
                "n_famres and n_target frames are currently required in the config "
                "for an Iterable dataset.")

        audiofile_streamers = [
            _gen_frames(
                self.audioset_dataset,
                index,
                self.streamer_settings["n_frames"],
                self.streamer_settings["n_target_frames"],
            ) for index in range(start_index, end_index)
        ]

        if self.evaluate:
            audiofile_mux = pescador.RoundRobinMux(audiofile_streamers)
        else:
            audiofile_mux = pescador.StochasticMux(
                audiofile_streamers,
                # todo: eventually, this should probably be a function of
                #   <batch size> & <# workers>
                # should probably be (batch_size / num_workers)
                n_active=self.streamer_settings["n_active"],
                # on average how many samples are generated from a stream before it dies
                rate=self.streamer_settings["rate"],
            )

        return audiofile_mux
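# _gen_frames is referenced above but not shown. A hedged sketch of what such
# a frame-level streamer could look like; the dataset interface, key names,
# and shapes are assumptions, not the original implementation.
import numpy as np
import pescador


def _gen_frames(dataset, index, n_frames, n_target_frames):
    def _sample():
        # assumed: dataset[index] -> (features [time, feat], targets [time, classes])
        features, targets = dataset[index]
        max_start = features.shape[0] - n_frames
        while True:
            start = np.random.randint(0, max_start + 1)
            yield {
                'X': features[start:start + n_frames],
                'y': targets[start:start + n_target_frames],
            }

    return pescador.Streamer(_sample)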
def lstm_data_generator(speech_list, noise_dir, srir_dir, sc_to_pos_dict,
                        num_frames, num_frames_hop, fft_size,
                        hop_size, sr, batch_size,
                        active_streamers, rate, random_state=12345678):

    sc_list = get_sc_list(sc_to_pos_dict)
    azi_list, elv_list = zip(*sc_list)
    azi_list = np.array(list(azi_list))
    elv_list = np.array(list(elv_list))
    steer_mat = steer_vector(azi_list, elv_list)

    seeds = []
    for speech_path in speech_list:
        if not speech_path.endswith('.wav'):
            continue

        streamer = pescador.Streamer(lstm_speech_mask_sampler,
                                     speech_path, noise_dir, srir_dir, sc_to_pos_dict,
                                     azi_list, elv_list, steer_mat,
                                     num_frames, num_frames_hop,
                                     fft_size, hop_size, sr)
        seeds.append(streamer)

    # Randomly shuffle the seeds
    random.shuffle(seeds)

    mux = pescador.StochasticMux(seeds, active_streamers, rate=rate, random_state=random_state)

    if batch_size == 1:
        return mux
    else:
        return pescador.maps.buffer_stream(mux, batch_size)
    def __init__(self,
                 source_filepath,
                 seq_len=512,
                 hop=None,
                 normalize=True,
                 transform=None,
                 restart_streams=False):
        super().__init__()
        source_folder = Path(source_filepath)
        self.seq_len = seq_len

        if hop is None:
            hop = seq_len

        self.hop = hop

        self.normalize = normalize
        self.transform = transform

        # get songs' path
        songs = []
        for root, dirs, files in os.walk(source_folder):
            for name in files:
                songs.append(os.path.join(root, name))

        # let's restrict to wav files (damn .DS_Store)
        songs = [song for song in songs if song.endswith('.wav')]

        # get songs length
        data = []
        for song in songs:
            # get audio info
            song_info = torchaudio.info(song)
            data.append({
                "path": song,
                "len": int(song_info[0].length / song_info[0].channels)
            })

        self.data = data

        # muxing different streams
        if restart_streams:
            streams = [
                pescador.Streamer(generate_rnd_chunk, track['path'],
                                  track['len'], seq_len, normalize, transform)
                for track in data
            ]
            self.mux = pescador.ShuffledMux(streams)
        else:
            streams = [
                pescador.Streamer(generate_chunk, track['path'], track['len'],
                                  seq_len, hop, normalize, transform)
                for track in data
            ]
            self.mux = pescador.StochasticMux(streams,
                                              len(streams),
                                              rate=None,
                                              mode='exhaustive')
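# generate_chunk and generate_rnd_chunk are referenced above but not defined
# in this snippet. A hedged sketch of the sequential variant, assuming a
# torchaudio backend; the mono mixdown, normalization scheme, and output key
# are illustrative assumptions only.
import torchaudio


def generate_chunk(path, track_len, seq_len, hop, normalize, transform):
    audio, _ = torchaudio.load(path)      # (channels, samples)
    audio = audio.mean(dim=0)             # mix down to mono
    for start in range(0, track_len - seq_len + 1, hop):
        chunk = audio[start:start + seq_len]
        if normalize and chunk.abs().max() > 0:
            chunk = chunk / chunk.abs().max()
        if transform is not None:
            chunk = transform(chunk)
        yield {'audio': chunk}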
def data_generator(directories, sampler, k, rate, batch_size=16, slices=None, **kwargs):
    '''Generate a data stream from a collection of tracks and a sampler'''

    seeds = []
    for working in directories:
        for track in tqdm(find_files(working, ext='h5')):
            fname = os.path.join(working, track)
            seeds.append(data_sampler(fname, sampler, slices))

    # Send it all to a mux
    mux = pescador.StochasticMux(seeds, k, rate, mode='with_replacement', **kwargs)

    return pescador.buffer_stream(mux, batch_size, axis=0)
Example #6
def data_generator(working, tracks, sampler, k, augment=True, rate=8, **kwargs):
    '''Generate a data stream from a collection of tracks and a sampler'''

    seeds = []

    for track in tracks:
        fname = os.path.join(working,
                             os.path.extsep.join([track, 'h5']))
        seeds.append(pescador.Streamer(data_sampler, fname, sampler))

        if augment:
            for fname in sorted(glob(os.path.join(working,
                                                  '{}.*.h5'.format(track)))):
                seeds.append(pescador.Streamer(data_sampler, fname, sampler))

    # Send it all to a mux
    return pescador.StochasticMux(seeds, k, rate, **kwargs)
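# data_sampler is referenced in the generators above but not shown. A hedged
# sketch of a generator that draws patches from an HDF5 container; the field
# layout and the assumption that `sampler` is a callable patch sampler
# yielding dictionaries are illustrative only.
import h5py


def data_sampler(fname, sampler):
    with h5py.File(fname, mode='r') as hf:
        data = {key: hf[key][()] for key in hf.keys()}
    # `sampler(data)` is assumed to yield dictionaries of fixed-size excerpts.
    for example in sampler(data):
        yield example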
def keras_generator(data_list, input_patch_size, batch_size=16, active_str=200, muxrate=20):
    """Generator to be passed to a keras model
    """
    streams = []
    for fpath_in, fpath_out in data_list:

        print("Data list shape is {}".format(len(data_list)))

        streams.append(
            pescador.Streamer(
                patch_generator, fpath_in, fpath_out,
                input_patch_size=input_patch_size
            )
        )

    stream_mux = pescador.StochasticMux(streams, active_str, rate=muxrate, mode='with_replacement', random_state=RANDOM_STATE)

    batch_generator = pescador.buffer_stream(stream_mux, batch_size)

    for batch in batch_generator:
        print("\n Batch length: ".format(len(batch['X1'])))
        yield [batch['X1'], batch['X2']], batch['Y']
Example #8
def data_generator(working,
                   tracks,
                   sampler,
                   k,
                   batch_size=32,
                   slices=None,
                   **kwargs):
    '''Generate a data stream from a collection of tracks and a sampler'''

    seeds = []

    for track in tqdm(tracks):
        fname = working + os.path.extsep.join([str(track), 'h5'])
        seeds.append(
            pescador.Streamer(data_sampler, fname, sampler, slices=slices))

    # Send it all to a mux
    # updated!
    mux = pescador.StochasticMux(seeds, k, **kwargs)

    if batch_size == 1:
        return mux
    else:
        return pescador.buffer_stream(mux, batch_size, axis=0)
Example #9
##############################################
# Put it all together
##############################################
input_shape, (X_train, Y_train), (X_test, Y_test) = setup_data()
steps_per_epoch = len(X_train) // batch_size

# Create two streams from the same data, where one of the streams
# adds a small amount of Gaussian noise. You could easily perform
# other data augmentations using the same 'map' strategy.
stream = sampler(X_train, Y_train)
noisy_stream = additive_noise(stream, 'X')

# Multiplex the two streamers together.
mux = pescador.StochasticMux(
    [stream, noisy_stream],
    # Two streams, always active.
    n_active=2,
    # We want to sample from each stream infinitely.
    rate=None)

# Buffer the stream into minibatches.
batches = pescador.buffer_stream(mux, batch_size)

model = build_model(input_shape)
try:
    print("Start time: {}".format(datetime.datetime.now()))
    model.fit_generator(pescador.tuples(batches, 'X', 'y'),
                        steps_per_epoch=steps_per_epoch,
                        epochs=epochs,
                        verbose=1,
                        validation_data=(X_test, Y_test))
except KeyboardInterrupt:
    pass  # allow interrupting training manually without a traceback
Example #10
# First, let's make a simple generator that makes an infinite
# sequence of a given letter.
def letter(c):
    while True:
        yield c


# Let's make the two populations of streamers
pop1 = [pescador.Streamer(letter, c) for c in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ']
pop2 = [pescador.Streamer(letter, c) for c in 'abcdefghijklmnopqrstuvwxyz']

# We'll sample population 1 with 3 streamers active at any time.
# Each streamer will generate, on average, 5 samples before being
# replaced.
mux1 = pescador.StochasticMux(pop1, 3, 5)

# Let's have 5 active streamers for population 2, and replace
# them after 2 examples on average.
mux2 = pescador.StochasticMux(pop2, 5, 2)

####################
# Mux composition
####################
# We multiplex the two populations using a ShuffledMux.
# The ShuffledMux keeps all of its input streamers active,
# and draws samples independently at random from each one.

# This should generate an approximately equal number of upper- and
# lower-case letters, with more diversity among the lower-case letters.
hier_mux = pescador.ShuffledMux([mux1, mux2])
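# A quick sketch of drawing from the composed mux: roughly half of the
# letters should be upper-case (from mux1) and half lower-case (from mux2),
# with more variety among the lower-case letters.
print(''.join(hier_mux.iterate(max_iter=40)))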
                                                    config['classes_vector'],
                                                    label2ids_train,
                                                    label2ids_val, config)
        [ids_train, ids_val, _] = tmp_data

        # pescador train: define streamer
        train_pack = [
            config, config['train_sampling'], config['param_train_sampling']
        ]
        train_streams = [
            pescador.Streamer(data_gen, id, id2audio_repr_path[id],
                              id2gt_train[id], train_pack) for id in ids_train
        ]
        train_mux_stream = pescador.StochasticMux(
            train_streams,
            n_active=config['batch_size'] * 2,
            rate=None,
            mode='exhaustive')
        train_batch_streamer = pescador.Streamer(
            pescador.buffer_stream,
            train_mux_stream,
            buffer_size=config['batch_size'],
            partial=True)

        # pescador val: define streamer
        val_batch_size = np.min([len(ids_val), config['val_batch_size']])
        val_pack = [config, 'overlap_sampling', 1]
        val_streams = [
            pescador.Streamer(data_gen, id, id2audio_repr_path[id],
                              id2gt_val[id], val_pack) for id in ids_val
        ]
Example #12
#    previously used streamers to be re-activated.
#
# For epoch-based sampling, we will use `exhaustive` mode to ensure
# that streamers are not reactivated within the epoch.
#
# Since each data stream produces exactly `M` examples, this would lead
# to a finite sample stream (i.e., only one epoch).
# To prevent the mux from exiting after the first epoch, we'll use `cycle` mode.
#

k = 100  # or however many streamers you want simultaneously active

# We'll use `rate=None` here so that the number of samples per stream is
# determined by the streamer (`M`) and not the mux.

mux = pescador.StochasticMux(streams, k, rate=None, mode='exhaustive')

epoch_stream = mux(cycle=True)

####################
# The `epoch_stream` will produce an infinite sequence of iterates.
# The same samples are presented (in random order) in the
# first `N*M`, second `N*M`, etc. disjoint sub-sequences, each
# of which may be considered as an *epoch*.
#
# *NOTE*: for this approach to work with something like `keras`'s
# `fit_generator` method, you need to be able to explicitly calculate
# the duration of an epoch, which means that the number of samples
# per streamer (`M` here) must be known in advance.
#
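# Sketch of the epoch-length bookkeeping described above, assuming N streams
# of exactly M samples each (M and batch_size below are illustrative values,
# not taken from the original example).
M = 50
batch_size = 32
samples_per_epoch = len(streams) * M
steps_per_epoch = samples_per_epoch // batch_size

batches = pescador.buffer_stream(epoch_stream, batch_size)
# model.fit_generator(batches, steps_per_epoch=steps_per_epoch, ...)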
Example #13
def multiplex_tfr(data_dir, n_hops, batch_size, mode="inference", aug_kind_str="none", tfr_str="logmelspec",
                  label_inputs=False, partial_labels=True, structured=True, active_streamers=32, streamer_rate=1024,
                  num_cpus=1, multi_label=False, align_perturb=False, single_output="fine"):
    tfr_dir = os.path.join(data_dir, tfr_str)
    streams = []

    # Parse augmentation kind string (aug_kind_str).
    if mode == "train":
        if aug_kind_str == "none":
            augs = ["original"]
        elif aug_kind_str == "pitch":
            augs = ["original", "pitch"]
        elif aug_kind_str == "stretch":
            augs = ["original", "stretch"]
        elif aug_kind_str == "all-but-noise":
            augs = ["original", "pitch", "stretch"]
        elif aug_kind_str == "all":
            augs = ["original", "pitch", "stretch", "noise"]
        elif aug_kind_str == "noise":
            augs = ["original", "noise"]
        else:
            raise ValueError('Invalid augmentation kind: {}'.format(aug_kind_str))

        # Generate a Pescador streamer for every HDF5 container, that is,
        # every unit-augmentation-instance triplet.
        aug_dict = get_augmentations()
        aug_list = []
        class_list = []
        class_count = Counter()

        for aug_str in augs:
            if aug_str == "original":
                instances = [aug_str]
            else:
                n_instances = aug_dict[aug_str]
                instances = ["-".join([aug_str, str(instance_id+1)])
                    for instance_id in range(n_instances)]
            if aug_str == "noise" and tfr_str == "logmelspec":
                bias = np.float32(-17.0)
            else:
                bias = np.float32(0.0)
            for instanced_aug_str in instances:
                aug_dir = os.path.join(tfr_dir, instanced_aug_str)
                lms_name = "_".join(["*", instanced_aug_str])

                lms_pattern = os.path.join(aug_dir, lms_name + ".h5*")
                for lms_path in glob.glob(lms_pattern):
                    if not is_valid_data_hdf5(lms_path, partial_labels):
                        continue

                    taxonomy_code = os.path.splitext(os.path.basename(lms_path))[0].split('_')[1].replace('-', '.')

                    triplet = annotations.get_taxonomy_code_idx_triplet(taxonomy_code)
                    coarse_idx, medium_idx, fine_idx = triplet

                    if structured or single_output == "fine":
                        bal_idx = fine_idx
                    elif single_output == "medium":
                        bal_idx = medium_idx
                    elif single_output == "coarse":
                        bal_idx = coarse_idx
                    else:
                        raise ValueError("Invalid single output mode:{}".format(single_output))

                    class_list.append(bal_idx)
                    class_count[bal_idx] += 1

                    aug_list.append(aug_str)

                    stream = pescador.Streamer(yield_tfr, lms_path, n_hops, bias, tfr_str, mode, label_inputs, multi_label, align_perturb)
                    streams.append(stream)

        num_streamers = len(streams)
        num_fine_classes = len(class_count)
        num_aug = len([k for k in aug_dict.keys() if k != "original"])
        class_weights = {cls: (num_streamers / float(num_fine_classes * count)) for cls, count in class_count.items()}
        aug_weights = {aug: 1.0 if aug == "original" else 1.0 / num_aug for aug, n_inst in aug_dict.items()}

        # Weight examples to balance classes, so that within each class every file is
        # sampled evenly. Additionally, balance so that sampling any augmentation type
        # (or the original) is equally likely, regardless of the number of instances per
        # augmentation. Within an augmentation type, instances are equally likely.
        weights = [class_weights[cls] * aug_weights[aug] for cls, aug in zip(class_list, aug_list)]

        # Multiplex streamers together.
        if num_cpus > 1:
            mux = create_zmq_mux(streams, num_cpus, active_streamers, streamer_rate, weights=weights)
        else:
            mux = pescador.StochasticMux(streams, n_active=active_streamers, rate=streamer_rate, weights=weights)

        # Create buffered streamer with specified batch size.
        buffered_streamer = pescador.maps.buffer_stream(mux, batch_size)
    else:
        # If not dealing with augmentations, just go through all HDF5 files
        weights = None
        bias = np.float32(0.0)
        for fname in os.listdir(data_dir):
            lms_path = os.path.join(data_dir, fname)

            if not is_valid_data_hdf5(lms_path, partial_labels):
                continue

            stream = pescador.Streamer(yield_tfr, lms_path, n_hops, bias, tfr_str, mode, label_inputs, multi_label, align_perturb)
            streams.append(stream)

        # Multiplex streamers together, but iterate exhaustively.
        mux = pescador.ChainMux(streams, mode='exhaustive')

        # Create buffered streamer with specified batch size.
        buffered_streamer = cycle_partial_buffer_stream(mux, batch_size)

    inputs = ["tfr_input"]
    if mode in ('train', 'valid') and structured and label_inputs:
        inputs += ["coarse_label_input", "medium_label_input"]

    if structured:
        outputs = ["y_coarse", "y_medium", "y_fine"]
    else:
        outputs = ["y_" + single_output]

    return pescador.maps.keras_tuples(buffered_streamer,
                                      inputs=inputs,
                                      outputs=outputs)
Example #14
        i = np.random.randint(0, n)
        yield {'X': X[i], 'Y': y[i]}


streams = [pescador.Streamer(npz_generator, x) for x in datasets]

##############################################
# Option 1: Stream equally from each dataset
##############################################
# If you can easily fit all the datasets in memory and you want to
# sample from them equally, you would set up your Mux as follows:

mux = pescador.StochasticMux(
    streams,
    # Three streams, always active.
    n_active=len(streams),
    # We want to sample from each stream infinitely,
    # so we turn off the rate parameter, which
    # controls how long to sample from each stream.
    rate=None)
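
# A hedged sketch of drawing a few samples from the balanced mux above
# (it relies on the `datasets` list of .npz files from the original example,
# which is not shown in this excerpt):
for sample in mux.iterate(max_iter=10):
    print(sample['X'].shape, sample['Y'])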

##############################################
# Option 2: Sample from one at a time.
##############################################
# Another approach might be to restrict sampling to one stream at a time.
# Now, the rate parameter controls (statistically) how long to sample
# from a stream before activating a new stream.

mux = pescador.StochasticMux(
    streams,
    # Only allow one active stream
    n_active=1,
    # load ground truth
    FILE_GROUND_TRUTH_TEST = config_file.DATA_FOLDER + 'index/' + DATASET + '/gt_' + DATASET + '_fold' + str(
        config['fold']) + '_test.tsv'
    [ids, id2gt] = shared.load_id2gt(FILE_GROUND_TRUTH_TEST)
    [_, id2label] = shared.load_id2label(FILE_GROUND_TRUTH_TEST)
    print(FILE_GROUND_TRUTH_TEST)

    # pescador: define (finite, batched & parallel) streamer
    pack = [config, 'overlap_sampling', 1]
    streams = [
        pescador.Streamer(sl_train.data_gen, id, id2audio_repr_path[id],
                          id2gt[id], pack) for id in ids
    ]
    mux_stream = pescador.StochasticMux(streams,
                                        n_active=TEST_BATCH_SIZE * 2,
                                        rate=None,
                                        mode='exhaustive')
    batch_streamer = pescador.Streamer(pescador.buffer_stream,
                                       mux_stream,
                                       buffer_size=TEST_BATCH_SIZE)
    #batch_streamer = pescador.ZMQStreamer(batch_streamer)

    # tensorflow: define model and cost
    [x, y_, is_train, y, normalized_y,
     cost] = sl_train.tf_define_model_and_cost(config)

    # tensorflow: compute the accuracy of each model
    accuracies = []
    fgt = open(experiment_folder + 'models.list')
    for model_name in fgt.readlines():
        print(model_name)
Example #16
def generate_urls(
    queries: Dict,
    label: Optional[str] = None,
    split_streams_by: Optional[Union[str, List]] = None,
    subset_streams: Optional[Union[str, Dict]] = None,
    nb_samples_per_stream: Optional[int] = None,
    nb_samples: Optional[int] = None,
    weighted_streams: bool = False,
    cache_requests: bool = False,
    mediatype: str = "StillImage",
    license_info: bool = True,
    one_media_per_occurrence: bool = True,
    verbose: bool = False,
):
    """Provides url generator from given query

    Args:
        queries (Dict): dictionary of queries supported by the GBIF api
        label (str, optional): Output label name.
            Defaults to `None` which yields all metadata.
        nb_samples (int, optional): Limit the total number of samples retrieved from the API.
            When set to -1 and `split_streams_by` is not `None`,
            a minimum number of samples is calculated
            from the number of available samples per stream.
            Defaults to `None`, which retrieves samples from all streams until
            all streams are exhausted.
        nb_samples_per_stream (int): Limit the maximum number of items to be retrieved per stream.
            Defaults to `None` which retrieves all samples from stream until
            stream generator is exhausted.
        split_streams_by (Optional[Union[str, List]], optional): Stream identifiers to be balanced.
            Defaults to None.
        subset_streams (Optional[Union[str, Dict]], optional): Map certain streams into
            separate subsets, by setting the `subset` metadata. Supports a remainder
            value of `"*"` which acts as a wildcard. Defaults to None.
            E.g. `subset_streams={"train": { "speciesKey": [5352251, 3190653]},
            "test": { "speciesKey": "*" }}` will move species of 5352251 and 3190653
            into `train` whereas all other species will go into test.
        weighted_streams (bool): Calculate sampling weights for all streams and apply them during
            sampling. To be combined with `nb_samples` set to a value other than `None`.
            Defaults to `False`.
        cache_requests (bool, optional): Enable GBIF API cache.
            Can significantly speed up repeated API requests. Defaults to False.
        mediatype (str): supported GBIF media type. Can be `StillImage`, `MovingImage`, `Sound`.
            Defaults to `StillImage`.
        license_info (bool): retrieve image license information. Defaults to True.
        one_media_per_occurrence (bool): only retrieve one media item for occurrences
            with multiple media. Defaults to True.


    Returns:
        Iterable: generator-like object that yields dictionaries
    """
    streams = []
    # set pygbif api caching
    pygbif.caching(cache_requests)

    # copy queries since we delete keys from the dict
    q = queries.copy()

    # if weighted_streams and nb_samples_per_stream is not None:
    #     raise RuntimeError("weights can only be applied when the number of samples are limited.")

    # Split queries into product of streamers
    if split_streams_by is not None:
        balance_queries = {}
        # if a single string is provided, convert it into a list
        if isinstance(split_streams_by, str):
            split_streams_by = [split_streams_by]

        # remove balance_by from query and move to balance_queries
        for key in split_streams_by:
            balance_queries[key] = q.pop(key)

        # for each b in balance_queries, create a separate stream;
        # later we control the sampling process of these streams to balance them
        for b in _dproduct(balance_queries):
            subset = None
            # for each stream we wrap into pescador Streamers for additional features
            for key, value in b.items():
                if subset_streams is not None:
                    for x, y in subset_streams.items():
                        result = y.get(key)
                        if result is not None:
                            if isinstance(result, list):
                                for item in result:
                                    if value == item:
                                        subset = x
                            else:
                                if value == result:
                                    subset = x

                            # assign remainder class
                            if result == "*" and subset is None:
                                subset = x

            streams.append(
                pescador.Streamer(
                    pescador.Streamer(
                        gbif_query_generator,
                        label=label,
                        mediatype=mediatype,
                        subset=subset,
                        license_info=license_info,
                        one_media_per_occurrence=one_media_per_occurrence,
                        **q,
                        **b,
                    ),
                    # this makes sure that we only obtain a maximum number
                    # of samples per stream
                    max_iter=nb_samples_per_stream,
                ))

        if verbose:
            nb_queries = [
                gbif_count(mediatype=mediatype, **q, **b)
                for b in _dproduct(balance_queries)
            ]
            print(sum(nb_queries))

        # count the available occurrences for each stream and select the minimum.
        # We yield at most that minimum per stream so the streams stay balanced
        if nb_samples == -1:
            # calculate the minimum number of samples available per stream
            nb_samples = min([
                gbif_count(mediatype=mediatype, **q, **b)
                for b in _dproduct(balance_queries)
            ]) * len(streams)

        if weighted_streams:
            weights = np.array([
                float(gbif_count(mediatype=mediatype, **q, **b))
                for b in _dproduct(balance_queries)
            ])
            weights /= np.max(weights)
        else:
            weights = None

        mux = pescador.StochasticMux(
            streams,
            n_active=len(streams),  # all streams are always active.
            rate=None,  # all streams are balanced
            weights=weights,  # weight streams
            mode="exhaustive",  # if one stream fails it is not revived
        )
        return mux(max_iter=nb_samples)

    # else there will be only one stream, hence no balancing or sampling
    else:
        if nb_samples and nb_samples_per_stream:
            nb_samples = min(nb_samples, nb_samples_per_stream)

        if verbose:
            print(nb_samples)
        return pescador.Streamer(
            gbif_query_generator,
            label=label,
            mediatype=mediatype,
            license_info=license_info,
            one_media_per_occurrence=one_media_per_occurrence,
            **q,
        ).iterate(max_iter=nb_samples)
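# A hedged usage sketch: the speciesKey values are the ones mentioned in the
# docstring above, but a real call queries the GBIF API and therefore needs
# network access; all other settings are illustrative.
urls = generate_urls(
    queries={"speciesKey": [5352251, 3190653]},
    split_streams_by="speciesKey",
    nb_samples_per_stream=100,
    nb_samples=-1,
    verbose=True,
)
for item in urls:
    print(item)
    break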