def test_mux_of_muxes_single():
    """Regression check for Issue #79 with nested, non-replacing muxes.

    Two muxes over string streamers are merged by a third mux; every
    character of the four underlying strings must appear in its output.
    """
    # All three muxes share exactly the same configuration.
    mux_options = dict(k=2, rate=None, revive=True, with_replacement=False,
                       prune_empty_streams=False)

    letters_mux = pescador.Mux(
        [pescador.Streamer('abc'), pescador.Streamer('xyz')], **mux_options)
    digits_mux = pescador.Mux(
        [pescador.Streamer('123'), pescador.Streamer('456')], **mux_options)
    merged_mux = pescador.Mux([letters_mux, digits_mux], **mux_options)

    drawn = list(merged_mux.iterate(max_iter=10000))
    tally = collections.Counter(drawn)
    print(drawn[:10], tally)
    assert set('abcxyz123456') == set(tally.keys())
def test_sampled_mux_of_muxes():
    """Check that a mux over two muxes covers all symbols roughly evenly.

    Each inner mux is first sampled on its own and checked for coverage,
    then both are merged and the relative spread between the most and the
    least frequent symbol is required to stay below 20%.
    """

    def _cycle(values):
        # Endless round-robin over `values`.
        while True:
            for item in values:
                yield item

    def _inner_mux(*chunks):
        # One cycling streamer per chunk, all active at once (k=3).
        streamers = [pescador.Streamer(_cycle, chunk) for chunk in chunks]
        return pescador.Mux(streamers, k=3, rate=None,
                            with_replacement=False, revive=False)

    # Build and inspect the first mux
    mux1 = _inner_mux('ab', 'cd', 'ef')
    count1 = collections.Counter(list(mux1(max_iter=6 * 10)))
    print(count1)
    assert set(count1.keys()) == set('abcdef')

    # Build and inspect the second mux
    mux2 = _inner_mux('gh', 'ij', 'kl')
    count2 = collections.Counter(list(mux2(max_iter=6 * 10)))
    print(count2)
    assert set(count2.keys()) == set('ghijkl')

    # Merge the muxes together.
    mux3 = pescador.Mux([mux1, mux2], k=2, rate=None,
                        with_replacement=False, revive=False)
    count3 = collections.Counter(list(mux3.iterate(max_iter=10000)))
    print(count3)
    assert set('abcdefghijkl') == set(count3.keys())

    frequencies = count3.values()
    max_count, min_count = max(frequencies), min(frequencies)
    assert (max_count - min_count) / max_count < 0.2
def data_generator(working, tracks, sampler, k, batch_size=32,
                   augmentation=False, weights=None, **kwargs):
    '''Generate a data stream from a collection of tracks and a sampler.

    One streamer is created for `<working>/pump/<track>.npz` and, when
    `augmentation` is set, one for every file matching
    `<working>/pump/<track>.*.npz` (in sorted order). If `weights` is given,
    `weights.loc[track]` is recorded once per streamer and passed to the mux
    as `pool_weights`; otherwise `pool_weights` is None.

    Returns the mux itself when `batch_size == 1`, else the mux wrapped in a
    `pescador.BufferedStreamer` of size `batch_size`. Extra keyword arguments
    are forwarded to `pescador.Mux`.
    '''
    pump_dir = os.path.join(working, 'pump')
    weighted = weights is not None

    seeds = []
    pool_weights = []
    for track in tracks:
        # Primary file first, then any augmented variants.
        paths = [os.path.join(pump_dir, os.path.extsep.join([track, 'npz']))]
        if augmentation:
            pattern = os.path.join(pump_dir, '{}.*.npz'.format(track))
            paths.extend(sorted(glob(pattern)))

        for path in paths:
            seeds.append(pescador.Streamer(data_sampler, path, sampler))
            if weighted:
                pool_weights.append(weights.loc[track])

    # Send it all to a mux; an empty weight list means "no weights".
    mux = pescador.Mux(seeds, k, pool_weights=pool_weights or None, **kwargs)

    if batch_size == 1:
        return mux
    return pescador.BufferedStreamer(mux, batch_size)
def data_generator(working, tracks, sampler, k, augment=True,
                   augment_drc=True, batch_size=32, **kwargs):
    '''Generate a data stream from a collection of tracks and a sampler.

    For each track a streamer is created for `<working>/<track>.h5`; when
    `augment` is set, four more are added for the names obtained by
    replacing '.h5' with '.0.h5' ... '.3.h5'. `augment_drc` is accepted but
    not used by this function.

    Returns the mux itself when `batch_size == 1`, else the mux wrapped in a
    `pescador.BufferedStreamer` of size `batch_size`. Extra keyword arguments
    are forwarded to `pescador.Mux`.
    '''
    seeds = []
    for track in tqdm(tracks):
        fname = os.path.join(working,
                             os.path.extsep.join([str(track), 'h5']))
        paths = [fname]
        if augment:
            paths += [fname.replace('.h5', '.{:d}.h5'.format(aug))
                      for aug in range(4)]
        seeds.extend(pescador.Streamer(data_sampler, path, sampler)
                     for path in paths)

    # Send it all to a mux
    mux = pescador.Mux(seeds, k, **kwargs)

    if batch_size == 1:
        return mux
    return pescador.BufferedStreamer(mux, batch_size)
def data_generator(working, tracks, sampler, k, threshold, augment=True,
                   batch_size=32, **kwargs):
    '''Generate a data stream from a collection of tracks and a sampler.

    For each track a streamer is created for `<working>/<track>.h5`; when
    `augment` is set, one more is added for every file matching
    `<working>/<track>.*.h5` (in sorted order). `sampler` and `threshold` are
    passed through to `data_sampler` by every streamer.

    Returns the mux itself when `batch_size == 1`, else the mux wrapped in a
    `pescador.BufferedStreamer` of size `batch_size`. Extra keyword arguments
    are forwarded to `pescador.Mux`.
    '''
    seeds = []
    for track in tqdm(tracks):
        paths = [os.path.join(working,
                              os.path.extsep.join([str(track), 'h5']))]
        if augment:
            pattern = os.path.join(working, '{}.*.h5'.format(track))
            paths.extend(sorted(glob(pattern)))

        seeds.extend(
            pescador.Streamer(data_sampler, path, sampler, threshold)
            for path in paths)

    # Send it all to a mux
    mux = pescador.Mux(seeds, k, **kwargs)

    if batch_size == 1:
        return mux
    return pescador.BufferedStreamer(mux, batch_size)
def test_mux_bad_weights():
    """An all-zeros weight vector must make `pescador.Mux` raise."""
    # Built outside the `raises` context so that only the Mux constructor
    # can satisfy the expectation, not an error during streamer setup.
    streamers = [
        pescador.Streamer(T.finite_generator, 10) for _ in range(5)
    ]

    # 5 streamers, all-zeros weight vector should trigger an error
    with pytest.raises(pescador.PescadorError):
        pescador.Mux(streamers, None, weights=np.zeros(5))
def test_mux_bad_streamers():
    """A weight vector longer than the streamer list must make Mux raise."""
    # Built outside the `raises` context so that only the Mux constructor
    # can satisfy the expectation, not an error during streamer setup.
    streamers = [
        pescador.Streamer(T.finite_generator, 10) for _ in range(5)
    ]

    # 5 streamers, 10 weights, should trigger an error
    with pytest.raises(pescador.PescadorError):
        pescador.Mux(streamers, None, weights=np.random.randn(10))
def test_restart_mux():
    """Iterating the same mux twice must produce equally many samples."""
    streamers = [pescador.Streamer('abc'), pescador.Streamer('def')]
    mux = pescador.Mux(streamers, k=2, rate=None, revive=True,
                       with_replacement=False, random_state=1234)

    first_pass = list(mux(max_iter=100))
    second_pass = list(mux(max_iter=100))
    assert len(first_pass) == len(second_pass)
def data_generator(subset_path, k=32, batch_size=64, random_state=20171021,
                   precompute=False, num_distractors=1, augment=False,
                   rate=32, max_videos=None, include_metadata=False,
                   cycle=True):
    """Build a stream of samples drawn from pairs of entries in a subset file.

    Every entry of the subset is paired `num_distractors` times with a
    randomly chosen, different entry; each pair becomes one streamer around
    `sampler`, and all streamers are multiplexed together.

    Args:
        subset_path: path to subset file
        k: number of concurrently open streamers in the mux
        batch_size: batch size; with 1 the stream is returned un-buffered
        random_state: value used to seed `random`, `np.random` and the mux
        precompute: forwarded to `sampler`
        num_distractors: number of pairs (streamers) to create per entry
        augment: forwarded to `sampler`
        rate: forwarded to `sampler` and also used as the mux `rate`
        max_videos: if not None and smaller than the subset, only a random
            selection of this many entries is used
        include_metadata: forwarded to `sampler`
        cycle: if true, the mux is replaced by `mux.cycle()`

    Returns:
        The (optionally cycled) mux when `batch_size == 1`, otherwise the
        result of `pescador.maps.buffer_stream` applied to it.

    Raises:
        ValueError: if pairs are requested but all entries of the subset
            compare equal, so no different partner can ever be drawn.
    """
    random.seed(random_state)
    np.random.seed(random_state)

    LOGGER.info("Loading subset list")
    file_list = read_csv_as_dicts(subset_path)

    LOGGER.info("Creating streamers...")
    if max_videos is not None and max_videos < len(file_list):
        LOGGER.info("Using a subset of {} videos".format(max_videos))
        random.shuffle(file_list)
        file_list = file_list[:max_videos]

    # The pairing below is rejection sampling; it can only terminate if at
    # least two entries differ. Check once up front instead of spinning
    # forever (this check draws no random numbers).
    if num_distractors > 0 and len(file_list) > 0 and all(
            entry == file_list[0] for entry in file_list):
        raise ValueError(
            "Need at least two distinct entries in '{}' to sample "
            "distractor pairs".format(subset_path))

    seeds = []
    for video_1 in tqdm(file_list):
        for _ in range(num_distractors):
            # Make sure we sample a different file
            video_2 = video_1
            while video_2 == video_1:
                video_2 = random.choice(file_list)

            seeds.append(
                pescador.Streamer(sampler, video_1, video_2,
                                  rate=rate,
                                  augment=augment,
                                  precompute=precompute,
                                  include_metadata=include_metadata))

    # Randomly shuffle the seeds
    random.shuffle(seeds)

    mux = pescador.Mux(seeds, k, rate=rate, random_state=random_state)
    if cycle:
        mux = mux.cycle()

    if batch_size == 1:
        return mux
    return pescador.maps.buffer_stream(mux, batch_size)
def test_mux_of_muxes_itered():
    """Regression check for Issue #79 where the inner muxes are iterated
    on their own before being combined into an outer mux."""

    def _make_mux(streams, seed):
        # Same configuration for all three muxes; only the seed differs.
        return pescador.Mux(streams, k=10, rate=None,
                            prune_empty_streams=False, revive=True,
                            random_state=seed)

    def _tally(mux):
        # Draw 1000 samples, print and return their histogram.
        counts = collections.Counter(mux.iterate(max_iter=1000))
        print(counts)
        return counts

    mux1 = _make_mux([pescador.Streamer('abc'), pescador.Streamer('xyz')],
                     135)
    assert set('abcxyz') == set(_tally(mux1).keys())

    mux2 = _make_mux([pescador.Streamer('123'), pescador.Streamer('456')],
                     246)
    assert set('123456') == set(_tally(mux2).keys())

    # Note that (random_state=987, k=2) fails.
    mux3 = _make_mux([mux1, mux2], 987)
    assert set('abcxyz123456') == set(_tally(mux3).keys())
def test_critical_mux():
    """Smoke check for Issue #80: as many active slots as streamers.

    Only prints the sample histogram; there is no assertion.
    """
    chars = 'abcde'
    streamers = []
    for symbol in chars:
        # Each streamer yields its symbol five times.
        streamers.append(pescador.Streamer(symbol * 5))

    mux = pescador.Mux(streamers, k=len(chars), rate=None,
                       with_replacement=False, revive=True,
                       prune_empty_streams=False, random_state=135)

    histogram = collections.Counter(mux.iterate(max_iter=1000))
    print(histogram)
def test_critical_mux_of_rate_limited_muxes():
    """Regression check for Issue #79 with rate-limited inner muxes.

    Two inner muxes (k=2, rate=2) over random-choice streamers are merged by
    an outer mux; the output must cover every symbol and the relative spread
    between the most and least frequent symbol must stay below 20%.
    """

    def _choice(vals):
        # Endless stream of uniformly chosen elements of `vals`.
        while True:
            yield random.choice(vals)

    def _rate_limited_mux(*chunks):
        streamers = [pescador.Streamer(_choice, chunk) for chunk in chunks]
        return pescador.Mux(streamers, k=2, rate=2,
                            with_replacement=False, revive=True)

    mux1 = _rate_limited_mux('ab', 'cd', 'ef')
    mux2 = _rate_limited_mux('gh', 'ij', 'kl')
    mux3 = pescador.Mux([mux1, mux2], k=2, rate=None,
                        with_replacement=False, revive=True)

    count = collections.Counter(list(mux3.iterate(max_iter=10000)))

    frequencies = count.values()
    max_count, min_count = max(frequencies), min(frequencies)
    assert (max_count - min_count) / max_count < 0.2
    print(count)
    assert set('abcdefghijkl') == set(count.keys())
def keras_generator(data_list, input_patch_size):
    """Generator to be passed to a keras model.

    `data_list` is iterated as (input path, output path) pairs; each pair
    gets its own `patch_generator` streamer. The streamers are multiplexed
    (10 active, with replacement), buffered into batches of 16, and yielded
    as ('X', 'Y') tuples.
    """
    streams = [
        pescador.Streamer(patch_generator, fpath_in, fpath_out,
                          input_patch_size=input_patch_size)
        for fpath_in, fpath_out in data_list
    ]

    stream_mux = pescador.Mux(streams, 10, with_replacement=True, lam=500,
                              random_state=RANDOM_STATE)
    batches = pescador.BufferedStreamer(stream_mux, 16)

    for xy_pair in batches.tuples('X', 'Y'):
        yield xy_pair
def multiplex_tfr(aug_kind_str, fold_units, n_hops, batch_size,
                  tfr_str="logmelspec"):
    """Multiplex one streamer per (augmentation instance, unit) HDF5 file.

    Parameters
    ----------
    aug_kind_str : one of "none", "pitch", "stretch", "all-but-noise",
        "noise", "all"; selects which augmentation folders are used.
    fold_units : iterable of unit name strings.
    n_hops : passed through to `yield_tfr`.
    batch_size : buffer size of the returned batches.
    tfr_str : name of the time-frequency representation; used in the folder
        name and passed through to `yield_tfr`.

    Returns
    -------
    The result of `BufferedStreamer.tuples("X", "y", cycle=True)` over the
    multiplexed streamers.

    Raises
    ------
    ValueError : if `aug_kind_str` is not one of the recognised kinds.
    """
    # Parse augmentation kind string (aug_kind_str).
    if aug_kind_str == "none":
        augs = ["original"]
    elif aug_kind_str == "pitch":
        augs = ["original", "pitch"]
    elif aug_kind_str == "stretch":
        augs = ["original", "stretch"]
    elif aug_kind_str == "all-but-noise":
        augs = ["original", "pitch", "stretch"]
    elif aug_kind_str in ("all", "noise"):
        noise_augs = ["noise-" + unit_str for unit_str in fold_units]
        if aug_kind_str == "all":
            augs = noise_augs + ["original", "pitch", "stretch"]
        else:
            augs = noise_augs + ["original"]
    else:
        # Previously this fell through and failed later with an unbound
        # local variable; fail early with a clear message instead.
        raise ValueError(
            "Unknown augmentation kind: {!r}".format(aug_kind_str))

    # Generate a Pescador streamer for every HDF5 container, that is,
    # every unit-augmentation-instance triplet.
    aug_dict = get_augmentations()
    data_dir = get_data_dir()
    dataset_name = get_dataset_name()
    tfr_name = "_".join([dataset_name, "clip-" + tfr_str])
    tfr_dir = os.path.join(data_dir, tfr_name)

    streams = []
    for aug_str in augs:
        aug_dir = os.path.join(tfr_dir, aug_str)
        if aug_str == "original":
            instances = [aug_str]
        else:
            n_instances = aug_dict[aug_str]
            instances = ["-".join([aug_str, str(instance_id)])
                         for instance_id in range(n_instances)]

        # Noise augmentations of log-mel spectrograms get a -17.0 bias.
        if aug_str[:5] == "noise" and tfr_str == "logmelspec":
            bias = np.float32(-17.0)
        else:
            bias = np.float32(0.0)

        for instanced_aug_str in instances:
            for unit_str in fold_units:
                lms_name = "_".join(
                    [dataset_name, instanced_aug_str, unit_str])
                lms_path = os.path.join(aug_dir, lms_name + ".hdf5")
                stream = pescador.Streamer(yield_tfr, lms_path, n_hops,
                                           bias, tfr_str)
                streams.append(stream)

    # Multiplex streamers together.
    mux = pescador.Mux(streams, k=len(streams), lam=None,
                       with_replacement=True, revive=True)

    # Create buffered streamer with specified batch size.
    buffered_streamer = pescador.BufferedStreamer(mux, batch_size)
    return buffered_streamer.tuples("X", "y", cycle=True)
def data_generator_balanced(working, tracks, sampler, k, augment=True,
                            augment_drc=True, batch_size=32, **kwargs):
    '''Generate a class-balanced data stream from tracks and a sampler.

    For every class in `DCASE_CLASSES`, the tracks whose label file entry
    carries that class get their own mux (with `k // len(DCASE_CLASSES)`
    active streamers); those per-class muxes are then combined by an outer
    mux with one slot per class.

    Per track, a streamer is created for `<working>/<track>.h5`. With
    `augment`, ten more are added for the '.0.h5' ... '.9.h5' variants; with
    `augment_drc`, four more for '.10.h5' ... '.13.h5'.

    Returns the outer mux itself when `batch_size == 1`, else the mux wrapped
    in a `pescador.BufferedStreamer` of size `batch_size`. Extra keyword
    arguments are forwarded to each per-class `pescador.Mux`.
    '''
    labelfile = (
        '/beegfs/js7561/datasets/dcase2017/task4_official/combined/metadata/'
        'labels/groundtruth_weak_label_training_set.csv')
    labels = pd.read_csv(labelfile, header=None, sep='\t')
    labels.columns = ['filename', 'start_time', 'end_time', 'label']

    # Stringify the requested tracks once: this does not depend on the class,
    # and doing it here also keeps one-shot iterables usable for every class.
    tracks_str = np.array([str(t) for t in tracks])

    muxes = []
    for l in DCASE_CLASSES:
        # Label file names look like '<id>.wav'; track ids are 'Y<id>'.
        class_files = labels[labels.label == l].filename.values
        class_ids = [('Y{}'.format(fn)).replace('.wav', '')
                     for fn in class_files]
        filenames = np.intersect1d(tracks_str, np.array(class_ids))
        print(l, len(filenames))

        seeds = []
        for track in tqdm(filenames):
            fname = os.path.join(working,
                                 os.path.extsep.join([str(track), 'h5']))
            seeds.append(pescador.Streamer(data_sampler, fname, sampler))

            if augment:
                for aug in range(10):
                    augname = fname.replace('.h5', '.{:d}.h5'.format(aug))
                    seeds.append(
                        pescador.Streamer(data_sampler, augname, sampler))

            # NOTE(review): treated as independent of `augment` - confirm
            # against the original layout.
            if augment_drc:
                for aug in range(10, 14):
                    augname = fname.replace('.h5', '.{:d}.h5'.format(aug))
                    seeds.append(
                        pescador.Streamer(data_sampler, augname, sampler))

        # Send it all to a mux
        n_active = k // len(DCASE_CLASSES)
        mux = pescador.Mux(seeds, n_active, **kwargs)

        # Add mux to list
        muxes.append(mux)

    # Create mux from muxes
    supermux = pescador.Mux(muxes, len(muxes), lam=None, pool_weights=None,
                            with_replacement=True)

    if batch_size == 1:
        return supermux
    return pescador.BufferedStreamer(supermux, batch_size)
def multiplex_lms_with_background(aug_kind_str, fold_units, n_input_hops,
                                  batch_size):
    """Multiplex log-mel-spectrogram streamers paired with background files.

    Parameters
    ----------
    aug_kind_str : one of "none", "pitch", "stretch", "all-but-noise",
        "noise", "all"; selects which augmentation folders are used.
    fold_units : iterable of unit name strings.
    n_input_hops : passed through to `yield_lms_and_background`.
    batch_size : buffer size of the returned batches.

    Returns
    -------
    The result of `pescador.maps.keras_tuples` over the buffered mux, with
    inputs ["X_spec", "X_bg"] and outputs ["y"].

    Raises
    ------
    ValueError : if `aug_kind_str` is not one of the recognised kinds.
    """
    # Parse augmentation kind string (aug_kind_str) first, so that an
    # invalid value fails before any other work is done.
    if aug_kind_str == "none":
        augs = ["original"]
    elif aug_kind_str == "pitch":
        augs = ["original", "pitch"]
    elif aug_kind_str == "stretch":
        augs = ["original", "stretch"]
    elif aug_kind_str == "all-but-noise":
        augs = ["original", "pitch", "stretch"]
    elif aug_kind_str in ("all", "noise"):
        noise_augs = ["noise-" + unit_str for unit_str in fold_units]
        if aug_kind_str == "all":
            augs = noise_augs + ["original", "pitch", "stretch"]
        else:
            augs = noise_augs + ["original"]
    else:
        # Previously this fell through and failed later with an unbound
        # local variable; fail early with a clear message instead.
        raise ValueError(
            "Unknown augmentation kind: {!r}".format(aug_kind_str))

    # Define constants.
    aug_dict = localmodule.get_augmentations()
    data_dir = localmodule.get_data_dir()
    dataset_name = localmodule.get_dataset_name()
    tfr_name = "_".join([dataset_name, "clip-logmelspec"])
    tfr_dir = os.path.join(data_dir, tfr_name)
    bg_name = "_".join([dataset_name, "clip-logmelspec-backgrounds"])
    bg_dir = os.path.join(data_dir, bg_name)
    # NOTE(review): `bg_duration` is not defined in this function; it is
    # presumably a module-level value - confirm.
    T_str = "T-" + str(bg_duration).zfill(4)
    T_dir = os.path.join(bg_dir, T_str)

    # Loop over augmentations.
    streams = []
    for aug_str in augs:
        # Define instances.
        aug_dir = os.path.join(tfr_dir, aug_str)
        if aug_str == "original":
            instances = [aug_str]
        else:
            n_instances = aug_dict[aug_str]
            instances = ["-".join([aug_str, str(instance_id)])
                         for instance_id in range(n_instances)]

        # Define bias.
        if aug_str[:5] == "noise":
            bias = np.float32(-17.0)
        else:
            bias = np.float32(0.0)

        # Loop over instances.
        for instanced_aug_str in instances:
            # Loop over units.
            for unit_str in fold_units:
                # Define path to time-frequency representation.
                lms_name = "_".join(
                    [dataset_name, instanced_aug_str, unit_str])
                lms_path = os.path.join(aug_dir, lms_name + ".hdf5")

                # Define path to background.
                unit_bg_name = "_".join([
                    dataset_name, "background_summaries", unit_str,
                    T_str + ".hdf5"
                ])
                bg_path = os.path.join(T_dir, unit_bg_name)

                # Define pescador streamer.
                stream = pescador.Streamer(yield_lms_and_background,
                                           lms_path, n_input_hops, bias,
                                           bg_path)
                streams.append(stream)

    # Multiplex streamers together.
    mux = pescador.Mux(streams, k=len(streams), lam=None,
                       with_replacement=True, revive=True)

    # Create buffered streamer with specified batch size.
    buffered_streamer = pescador.BufferedStreamer(mux, batch_size)
    return pescador.maps.keras_tuples(buffered_streamer,
                                      inputs=["X_spec", "X_bg"],
                                      outputs=["y"])
############################################## # Put it all together ############################################## input_shape, (X_train, Y_train), (X_test, Y_test) = setup_data() steps_per_epoch = len(X_train) // batch_size # Create two streams from the same data, where one of the streams # adds a small amount of Gaussian noise. You could easily perform # other data augmentations using the same 'map' strategy. stream = pescador.Streamer(sampler, X_train, Y_train) noisy_stream = pescador.Streamer(additive_noise, stream, 'X') # Multiplex the two streamers together. mux = pescador.Mux( [stream, noisy_stream], # Two streams, always active. k=2, # We want to sample from each stream infinitely. rate=None) # Buffer the stream into minibatches. batches = pescador.buffer_stream(mux, batch_size) model = build_model(input_shape) try: print("Start time: {}".format(datetime.datetime.now())) model.fit_generator(pescador.tuples(batches, 'X', 'y'), steps_per_epoch=steps_per_epoch, epochs=epochs, verbose=1, validation_data=(X_test, Y_test)) except KeyboardInterrupt:
i = np.random.randint(0, n) yield {'X': X[i], 'Y': y[i]} streams = [pescador.Streamer(npz_generator, x) for x in datasets] ############################################## # Option 1: Stream equally from each dataset ############################################## # If you can easily fit all the datasets in memory and you want to # sample from then equally, you would set up your Mux as follows: mux = pescador.Mux( streams, # Three streams, always active. k=len(streams), # We want to sample from each stream infinitely, # so we turn off the rate parameter, which # controls how long to sample from each stream. rate=None) ############################################## # Option 2: Sample from one at a time. ############################################## # Another approach might be to restrict sampling to one stream at a time. # Now, the rate parameter controls (statistically) how long to sample # from a stream before activating a new stream. mux = pescador.Mux( streams, # Only allow one active stream k=1,
def multitask_generator(mtrack_list, json_path=JSON_PATH,
                        data_types=DATA_TYPES, tasks=TASKS, mux_weights=None,
                        add_frequency=False, augment=True, n_harms=5):
    """Yield batches from `multitask_batch_generator` over per-task muxes.

    Streamers are built per (data type, task) from the task pairs of the
    grouped data, multiplexed per (data type, task), and then each task gets
    one mux over the data-type muxes that provide it. When `mux_weights` is
    None, every task weights its data types uniformly.
    """
    typed_data = get_grouped_data(json_path, mtrack_list)
    task_pairs = get_all_task_pairs(typed_data)

    # Tasks available for each data type; any type not listed here uses
    # `fallback_tasks`.
    tasks_by_type = {
        'XA': ('melody', 'bass', 'vocal'),
        'XB': ('multif0', 'melody', 'bass', 'vocal', 'piano', 'guitar'),
        'XC': ('multif0', 'melody', 'bass', 'vocal', 'piano'),
    }
    fallback_tasks = ('multif0', 'melody', 'bass', 'vocal')

    # make a streamer for each data type and task
    data_streamers = {}
    for data_type in data_types:
        per_task = {}
        for task_name in tasks_by_type.get(data_type, fallback_tasks):
            per_task[task_name] = [
                pescador.Streamer(multitask_patch_generator,
                                  pair[0], pair[1], tasks, 20, (360, 50),
                                  add_frequency, augment, n_harms)
                for pair in task_pairs[data_type][task_name]
            ]
        data_streamers[data_type] = per_task

    # for each data type make a mux
    n_active = 10
    data_muxes = {}
    for data_type in data_types:
        data_muxes[data_type] = {
            task_name: pescador.Mux(streamers, n_active,
                                    with_replacement=True, lam=250,
                                    random_state=42)
            for task_name, streamers in data_streamers[data_type].items()
        }

    # for each task make a mux that samples from the data muxes
    task_streams = {}
    for task in tasks:
        task_streams[task] = [
            data_muxes[data_type][task]
            for data_type in data_types
            if task in data_muxes[data_type]
        ]

    if mux_weights is None:
        # Uniform weights over the data types that provide each task.
        mux_weights = {}
        for task in tasks:
            n_data_types = len(task_streams[task])
            mux_weights[task] = (
                np.ones((n_data_types, )) / float(n_data_types))

    task_muxes = {
        task: pescador.Mux(task_streams[task], 1,
                           pool_weights=mux_weights[task])
        for task in tasks
    }

    for batch in multitask_batch_generator(task_muxes, tasks):
        yield batch