def data_generator(working, tracks, sampler, k, threshold, augment=True,
                   batch_size=32, **kwargs):
    '''Generate a data stream from a collection of tracks and a sampler.

    For every track a ``pescador.Streamer`` around ``data_sampler`` is built
    for ``<working>/<track>.h5``.  When `augment` is true, one more streamer
    is added for each file matching ``<working>/<track>.*.h5`` (in sorted
    order).  All streamers are then multiplexed with ``pescador.Mux``.

    Parameters
    ----------
    working
        Directory that holds the ``.h5`` files.
    tracks
        Iterable of track identifiers (wrapped in ``tqdm`` for progress).
    sampler
        Forwarded to ``data_sampler`` through each streamer.
    k
        Second positional argument of ``pescador.Mux``.
    threshold
        Forwarded to ``data_sampler`` through each streamer.
    augment
        If true, also stream from the ``<track>.*.h5`` files.
    batch_size
        If equal to 1 the mux is returned as is; otherwise it is wrapped in
        ``pescador.BufferedStreamer(mux, batch_size)``.
    kwargs
        Additional keyword arguments for ``pescador.Mux``.

    Returns
    -------
    The mux, or a buffered streamer around it.
    '''

    def _streamer(path):
        # Every seed uses the same sampler configuration; only the file varies.
        return pescador.Streamer(data_sampler, path, sampler, threshold)

    streamers = []
    for track in tqdm(tracks):
        primary = os.path.join(working,
                               os.path.extsep.join([str(track), 'h5']))
        streamers.append(_streamer(primary))

        if not augment:
            continue

        pattern = os.path.join(working, '{}.*.h5'.format(track))
        streamers.extend(_streamer(path) for path in sorted(glob(pattern)))

    # Hand the whole pool to a single mux
    mux = pescador.Mux(streamers, k, **kwargs)

    if batch_size != 1:
        return pescador.BufferedStreamer(mux, batch_size)
    return mux
def data_generator(working, tracks, sampler, k, augment=True,
                   augment_drc=True, batch_size=32, **kwargs):
    '''Generate a data stream from a collection of tracks and a sampler.

    For every track a ``pescador.Streamer`` around ``data_sampler`` is built
    for ``<working>/<track>.h5``.  When `augment` is true, four more
    streamers are added for ``<working>/<track>.<i>.h5`` with ``i`` in
    ``0..3``.  All streamers are then multiplexed with ``pescador.Mux``.

    Parameters
    ----------
    working
        Directory that holds the ``.h5`` files.
    tracks
        Iterable of track identifiers (wrapped in ``tqdm`` for progress).
    sampler
        Forwarded to ``data_sampler`` through each streamer.
    k
        Second positional argument of ``pescador.Mux``.
    augment
        If true, also stream from the numbered augmentation files.
    augment_drc
        Accepted for interface compatibility; not used by this function.
    batch_size
        If equal to 1 the mux is returned as is; otherwise it is wrapped in
        ``pescador.BufferedStreamer(mux, batch_size)``.
    kwargs
        Additional keyword arguments for ``pescador.Mux``.

    Returns
    -------
    The mux, or a buffered streamer around it.
    '''
    n_augmentations = 4

    seeds = []
    for track in tqdm(tracks):
        stem = str(track)
        fname = os.path.join(working, os.path.extsep.join([stem, 'h5']))
        seeds.append(pescador.Streamer(data_sampler, fname, sampler))

        if augment:
            for aug in range(n_augmentations):
                # Assemble the name from its parts.  Rewriting `fname` with
                # str.replace('.h5', ...) would also rewrite any '.h5' that
                # happens to occur in the directory or in the track name.
                augname = os.path.join(
                    working,
                    os.path.extsep.join([stem, '{:d}'.format(aug), 'h5']))
                seeds.append(
                    pescador.Streamer(data_sampler, augname, sampler))

    # Send it all to a mux
    mux = pescador.Mux(seeds, k, **kwargs)

    if batch_size == 1:
        return mux
    return pescador.BufferedStreamer(mux, batch_size)
def data_generator(working, tracks, sampler, k, batch_size=32,
                   augmentation=False, weights=None, **kwargs):
    '''Generate a data stream from a collection of tracks and a sampler.

    Streams are read from ``<working>/pump/<track>.npz`` and, when
    `augmentation` is true, from every ``<working>/pump/<track>.*.npz`` file
    (in sorted order).  When `weights` is given, ``weights.loc[track]`` is
    recorded once per streamer and passed to the mux as ``pool_weights``;
    otherwise ``pool_weights`` is ``None``.

    Parameters
    ----------
    working
        Base directory containing the ``pump`` sub-directory.
    tracks
        Iterable of track identifiers.
    sampler
        Forwarded to ``data_sampler`` through each streamer.
    k
        Second positional argument of ``pescador.Mux``.
    batch_size
        If equal to 1 the mux is returned as is; otherwise it is wrapped in
        ``pescador.BufferedStreamer(mux, batch_size)``.
    augmentation
        If true, also stream from the ``<track>.*.npz`` files.
    weights
        Optional object indexed with ``.loc[track]``.
    kwargs
        Additional keyword arguments for ``pescador.Mux``.

    Returns
    -------
    The mux, or a buffered streamer around it.
    '''
    seeds = []
    pool_weights = []

    def _register(path, track_id):
        # One streamer per file; the owning track's weight is repeated for
        # every file so that seeds and weights stay aligned.
        seeds.append(pescador.Streamer(data_sampler, path, sampler))
        if weights is not None:
            pool_weights.append(weights.loc[track_id])

    for track in tracks:
        _register(
            os.path.join(working, 'pump',
                         os.path.extsep.join([track, 'npz'])),
            track)

        if not augmentation:
            continue

        pattern = os.path.join(working, 'pump', '{}.*.npz'.format(track))
        for aug_path in sorted(glob(pattern)):
            _register(aug_path, track)

    # An empty weight list means "no weights" for the mux
    mux = pescador.Mux(seeds, k, pool_weights=pool_weights or None, **kwargs)

    if batch_size != 1:
        return pescador.BufferedStreamer(mux, batch_size)
    return mux
def test_multiple_copies(self):
    """Check that a single mux can back several live iterations at once."""
    sources = [pescador.Streamer(text) for text in ('ab', 'cde', 'fghi')]
    mux = pescador.mux.StochasticMux(sources, n_active=5, rate=2)

    short_run = mux.iterate(6)
    long_run = mux.iterate(8)

    # Merely creating the generators must not activate anything.
    assert mux.active == 0

    # Pull one sample from each so both generators have really started.
    next(short_run)
    next(long_run)
    assert mux.active == 2

    # One of the six samples is already consumed, so five remain.
    short_rest = list(short_run)
    assert len(short_rest) == 5
    assert mux.active == 1

    # One of the eight samples is already consumed, so seven remain.
    long_rest = list(long_run)
    assert len(long_rest) == 7
    assert mux.active == 0
def mux_streams(streams, n_samples, n_batch=64):
    '''Multiplex data source streams into batches.

    Parameters
    ----------
    streams : list of pescador.Streamer
        The streams to merge
    n_samples : int > 0 or None
        The total number of samples to draw
    n_batch : int > 0
        The size of each batch

    Returns
    -------
    mux : pescador.Streamer
        A stream that wraps ``pescador.buffer_streamer`` around the merged
        input streams with a buffer size of `n_batch`
    '''
    # Keep every input stream active: the working-set size passed to
    # ``pescador.mux`` equals the number of streams.
    n_active = len(streams)
    merged = pescador.Streamer(pescador.mux, streams, n_samples, n_active,
                               lam=None, with_replacement=False)

    batched = pescador.Streamer(pescador.buffer_streamer, merged, n_batch)
    return batched
def setup(self):
    """Build the streaming pipeline and store it on the instance.

    Sets ``self.master_stream`` (a streamer around the mux of all
    per-instrument muxes) and ``self.buffered_streamer`` (the batched view
    of it, ZMQ-backed when ``self.use_zmq`` is true).
    """
    instrument_names = list(self.features_df["instrument"].unique())

    # One mux per instrument; the helper may return None for an instrument.
    per_instrument = []
    for name in instrument_names:
        per_instrument.append(self._instrument_mux(name))

    # Wrap each usable mux in a streamer.
    wrapped = []
    for candidate in per_instrument:
        if candidate is not None:
            wrapped.append(pescador.Streamer(candidate))

    combined = pescador.mux(wrapped, **self.master_mux_params)

    # The mux is wrapped in a streamer so that the buffering layer knows
    # what to do with it.
    self.master_stream = pescador.Streamer(combined)

    make_buffer = zmq_buffered_stream if self.use_zmq else buffer_stream
    self.buffered_streamer = make_buffer(self.master_stream, self.batch_size)
def sample_chord_qualities(stash, output_dir, win_length=20, num_obs=10000):
    """Draw CQT observations per chord class and save them as ``.npy`` files.

    For each of the 13 qualities and each of the 12 roots, `num_obs` samples
    are drawn from a root-rotated stream and their ``cqt`` attributes are
    stacked and saved to ``<output_dir>/<chord_idx>.npy`` (zero-padded to
    three digits), where ``chord_idx = quality * 12 + root``.  Observations
    for partition label 13 are saved to ``156.npy``.

    Parameters
    ----------
    stash
        Data collection passed to ``util.partition`` and ``chord_sampler``.
    output_dir
        Directory for the output files; created via
        ``futil.create_directory``.
    win_length
        Window length forwarded to ``chord_sampler``.
    num_obs
        Number of observations drawn per output file.
    """
    quality_partition = util.partition(stash, quality_map)
    qual_indexes = [util.index_partition_arrays(quality_partition, [q])
                    for q in range(13)]

    # `win_length` is forwarded here; it used to be silently replaced by a
    # hard-coded 20 (which remains the default).
    qual_pools = [
        [pescador.Streamer(chord_sampler, key, stash, win_length, q_idx)
         for key in q_idx]
        for q_idx in qual_indexes
    ]

    futil.create_directory(output_dir)
    print("[%s] Starting loop" % time.asctime())

    for qual, pool in enumerate(qual_pools):
        base_stream = pescador.mux(pool, n_samples=None, k=50, lam=5)
        for root in range(12):
            stream = FX.rotate_chord_to_root(base_stream, root)
            # next() works for Python 2 and Python 3 iterators alike.
            x_obs = np.array([next(stream).cqt for _ in range(num_obs)])
            chord_idx = qual * 12 + root
            np.save(os.path.join(output_dir, "%03d.npy" % chord_idx), x_obs)
            print("[%s] %3d" % (time.asctime(), chord_idx))

    # Partition label 13 is handled separately and is not rotated.
    null_index = util.index_partition_arrays(quality_partition, [13])
    null_pool = [
        pescador.Streamer(chord_sampler, key, stash, win_length, null_index)
        for key in null_index
    ]
    stream = pescador.mux(null_pool, n_samples=None, k=50, lam=5)
    x_obs = np.array([next(stream).cqt for _ in range(num_obs)])
    np.save(os.path.join(output_dir, "156.npy"), x_obs)
def test_mux_of_muxes_single():
    """A mux over two muxes must surface every leaf symbol (Issue #79)."""
    # All three muxes are configured identically.
    mux_options = dict(k=2, rate=None, revive=True, with_replacement=False,
                       prune_empty_streams=False)

    abc = pescador.Streamer('abc')
    xyz = pescador.Streamer('xyz')
    letters_mux = pescador.Mux([abc, xyz], **mux_options)

    n123 = pescador.Streamer('123')
    n456 = pescador.Streamer('456')
    digits_mux = pescador.Mux([n123, n456], **mux_options)

    top_mux = pescador.Mux([letters_mux, digits_mux], **mux_options)

    samples3 = list(top_mux.iterate(max_iter=10000))
    count3 = collections.Counter(samples3)
    print(samples3[:10], count3)

    assert set('abcxyz123456') == set(count3.keys())
def create_uniform_factored_stream(stash, win_length, partition_labels=None,
                                   working_size=50, vocab_dim=157,
                                   pitch_shift=True):
    """Return a stream of chord samples, with uniform quality presentation.

    One mux is built per quality index (0..12) over that quality's
    ``chord_sampler`` streamers.  The thirteen quality muxes are then
    multiplexed together, optionally pitch shifted, and finally mapped to a
    joint index of size `vocab_dim`.

    Parameters
    ----------
    stash
        Data collection passed to ``util.partition`` and ``chord_sampler``.
    win_length
        Window length forwarded to ``chord_sampler``.
    partition_labels
        Precomputed partition; computed from `stash` when ``None``.
    working_size
        ``k`` for the outer mux.
    vocab_dim
        Forwarded to ``FX.map_to_joint_index``.
    pitch_shift
        If true, apply ``FX.pitch_shift_cqt`` to the merged stream.
    """
    if partition_labels is None:
        partition_labels = util.partition(stash, quality_map)

    def _quality_streamer(qual_idx):
        # Mux over every entity that carries this quality.
        subindex = util.index_partition_arrays(partition_labels, [qual_idx])
        members = [
            pescador.Streamer(chord_sampler, key, stash, win_length, subindex)
            for key in subindex.keys()
        ]
        per_quality = pescador.mux(members, n_samples=None, k=25, lam=20)
        return pescador.Streamer(per_quality)

    quality_pool = [_quality_streamer(qual_idx) for qual_idx in range(13)]

    merged = pescador.mux(quality_pool, n_samples=None, k=working_size,
                          lam=None, with_replacement=False)
    if pitch_shift:
        merged = FX.pitch_shift_cqt(merged)

    return FX.map_to_joint_index(merged, vocab_dim)
def test_mux_of_muxes_itered(self, mux_class):
    """A mux over previously iterated muxes still reaches all leaves.

    Check on Issue #79.
    """

    def _make_mux(children, seed):
        return mux_class(children, 10, rate=None,
                         prune_empty_streams=False, random_state=seed)

    def _seen_symbols(mux):
        tally = collections.Counter(mux.iterate(max_iter=1000))
        return set(tally.keys())

    abc = pescador.Streamer('abc')
    xyz = pescador.Streamer('xyz')
    letters_mux = _make_mux([abc, xyz], 135)
    assert set('abcxyz') == _seen_symbols(letters_mux)

    n123 = pescador.Streamer('123')
    n456 = pescador.Streamer('456')
    digits_mux = _make_mux([n123, n456], 246)
    assert set('123456') == _seen_symbols(digits_mux)

    # Note that (random_state=987, n_active=2) fails.
    top_mux = _make_mux([letters_mux, digits_mux], 987)
    assert set('abcxyz123456') == _seen_symbols(top_mux)
def test_rr_multiple_copies(self):
    """Two iterations of one round-robin mux advance independently."""
    sources = [pescador.Streamer(text) for text in ('ab', 'cde', 'fghi')]
    mux = pescador.mux.RoundRobinMux(sources, 'exhaustive')

    limited = mux.iterate(3)
    unlimited = mux.iterate()  # n == 9

    # Merely creating the generators must not activate anything.
    assert mux.active == 0

    # Pull one sample from each so both generators have really started.
    next(limited)
    next(unlimited)
    assert mux.active == 2

    # The limited run has two samples left out of its three.
    limited_rest = list(limited)
    assert "".join(limited_rest) == "cf"
    assert len(limited_rest) == 2
    assert mux.active == 1

    # The unlimited run yields the remaining eight of the nine samples.
    unlimited_rest = list(unlimited)
    assert "".join(unlimited_rest) == "cfbdgehi"
    assert len(unlimited_rest) == 8
    assert mux.active == 0
def test_modes(self, mode):
    """An exhaustive ChainMux over two streamers yields at least one item.

    NOTE(review): `mode` is accepted but not used; the mux is always built
    with ``mode="exhaustive"``.  Confirm whether that is intentional.
    """
    sources = [pescador.Streamer("abc"), pescador.Streamer("def")]
    mux = pescador.mux.ChainMux(sources, mode="exhaustive")

    collected = list(mux.iterate())

    assert len(collected) > 0
def compute_chord_averages(stash, win_length=20, num_obs=5000):
    """Compute the mean CQT observation for every chord class.

    For each of the 13 qualities and each of the 12 roots, `num_obs` samples
    are drawn from a root-rotated stream and the ``cqt`` attributes are
    averaged over the observation axis.  One further average is computed
    for partition label 13 and appended last.

    Parameters
    ----------
    stash
        Data collection passed to ``util.partition`` and ``chord_sampler``.
    win_length
        Window length forwarded to ``chord_sampler``.
    num_obs
        Number of observations averaged per chord class.

    Returns
    -------
    np.ndarray
        The stacked averages: 13 * 12 rotated classes followed by the one
        for partition label 13.
    """
    quality_partition = util.partition(stash, quality_map)
    qual_indexes = [util.index_partition_arrays(quality_partition, [q])
                    for q in range(13)]

    # `win_length` is forwarded here; it used to be silently replaced by a
    # hard-coded 20 (which remains the default).
    qual_pools = [
        [pescador.Streamer(chord_sampler, key, stash, win_length, q_idx)
         for key in q_idx]
        for q_idx in qual_indexes
    ]

    obs_aves = []
    for pool in qual_pools:
        base_stream = pescador.mux(pool, n_samples=None, k=50, lam=5)
        for root in range(12):
            stream = FX.rotate_chord_to_root(base_stream, root)
            # next() works for Python 2 and Python 3 iterators alike.
            x_obs = np.array([next(stream).cqt for _ in range(num_obs)])
            obs_aves.append(x_obs.mean(axis=0).squeeze())
            # Progress: number of chord classes finished so far.
            print(len(obs_aves))

    # Partition label 13 is handled separately and is not rotated.
    null_index = util.index_partition_arrays(quality_partition, [13])
    null_pool = [
        pescador.Streamer(chord_sampler, key, stash, win_length, null_index)
        for key in null_index
    ]
    stream = pescador.mux(null_pool, n_samples=None, k=50, lam=5)
    x_obs = np.array([next(stream).cqt for _ in range(num_obs)])
    obs_aves.append(x_obs.mean(axis=0).squeeze())

    return np.array(obs_aves)
def test_chain_mux_exhaustive(self):
    """An exhaustive ChainMux yields its streamers back to back."""
    first = pescador.Streamer("abc")
    second = pescador.Streamer("def")
    mux = pescador.mux.ChainMux([first, second], mode="exhaustive")

    joined = "".join(mux.iterate())
    assert joined == "abcdef"

    # A second pass must match what itertools.chain produces.
    from_mux = list(mux.iterate())
    from_chain = list(itertools.chain(first.iterate(), second.iterate()))
    assert from_mux == from_chain
def __init__(self, source_filepath, seq_len=512, hop=None, normalize=True,
             transform=None, restart_streams=False):
    """Index the ``.wav`` files under a folder and build a mux over them.

    Parameters
    ----------
    source_filepath
        Root folder; it is walked recursively for ``.wav`` files.
    seq_len
        Chunk length forwarded to the chunk generators.
    hop
        Hop forwarded to ``generate_chunk``; defaults to `seq_len`.
    normalize
        Forwarded to the chunk generators.
    transform
        Forwarded to the chunk generators.
    restart_streams
        If true, streams use ``generate_rnd_chunk`` and are combined with
        ``pescador.ShuffledMux``.  Otherwise they use ``generate_chunk`` and
        are combined with an exhaustive ``pescador.StochasticMux`` that
        keeps all streams active.
    """
    # The one-argument form ``super(MusicDataset)`` creates an unbound super
    # object, so the parent initializer was never actually run.
    super(MusicDataset, self).__init__()

    source_folder = Path(source_filepath)
    self.seq_len = seq_len
    if hop is None:
        hop = seq_len
    self.hop = hop
    self.normalize = normalize
    self.transform = transform

    # Collect the wav files only (this skips stray files such as .DS_Store).
    songs = []
    for root, _dirs, files in os.walk(source_folder):
        for name in files:
            if name.endswith('.wav'):
                songs.append(os.path.join(root, name))

    # Record each song's path and its length divided by the channel count.
    data = []
    for song in songs:
        song_info = torchaudio.info(song)
        data.append({
            "path": song,
            "len": int(song_info[0].length / song_info[0].channels)
        })
    self.data = data

    # Mux the per-track streams together.
    if restart_streams:
        streams = [
            pescador.Streamer(generate_rnd_chunk, track['path'],
                              track['len'], seq_len, normalize, transform)
            for track in data
        ]
        self.mux = pescador.ShuffledMux(streams)
    else:
        streams = [
            pescador.Streamer(generate_chunk, track['path'], track['len'],
                              seq_len, hop, normalize, transform)
            for track in data
        ]
        self.mux = pescador.StochasticMux(streams, len(streams), rate=None,
                                          mode='exhaustive')
def test_mux_rare(self, weight, mux_class):
    "This should give us all the reference before all the noise"
    reference = list(T.finite_generator(50))
    noise = list(T.finite_generator(50, size=1))

    sources = [pescador.Streamer(reference), pescador.Streamer(noise)]
    mux = mux_class(sources, 2, rate=256, weights=weight)

    estimate = list(mux)
    expected = reference + noise

    assert T._eq_list_of_dicts(expected, estimate)
def create_stream(sources, tasks, crema_input, n_per_track=128,
                  n_duration=16, n_alive=32, cache=None, thread=False,
                  keys=None):
    '''Create a crema data stream

    Parameters
    ----------
    sources : pd.DataFrame
        Must contain columns `audio` and `jams` (and `key` when `keys` is
        used)

    tasks : iterable of crema.task.BaseTaskTransformers
        Objects to transform jams annotations into crema targets

    crema_input : crema.pre.CremaInput
        The feature extraction object

    n_per_track : int > 0
        The number of example patches to generate from each source file

    n_duration : int > 0
        The duration (in frames) of each generated patch

    n_alive : int > 0
        The number of sources to keep active

    cache : Shove or None
        feature cache object

    thread : bool
        if true, launch this stream in a parallel thread

    keys : iterable or None
        If given (and non-empty), only elements of `sources` whose `key`
        belongs to `keys` will be processed

    Returns
    -------
    mux : pescador.Streamer
        A multiplexing stream object over the sources
    '''
    # If we have keys, filter down to the matching rows
    if keys:
        selected = sources['key'].isin(keys)
        sources = sources[selected]

    # Create the seed bank: one sampler stream per (audio, jams) pair
    seeds = []
    for audio_file, jams_file in zip(sources.audio, sources.jams):
        seeds.append(pescador.Streamer(sampler, audio_file, jams_file, tasks,
                                       crema_input, n_per_track, n_duration,
                                       cache=cache))

    # Multiplex these seeds together
    streamer = pescador.Streamer(pescador.mux, seeds, None, n_alive)

    if not thread:
        return streamer
    return pescador.Streamer(pescador.zmq_stream, streamer)
def test_restart_mux():
    """Calling the same mux twice yields runs of equal length."""
    sources = [pescador.Streamer('abc'), pescador.Streamer('def')]
    mux = pescador.Mux(sources, k=2, rate=None, revive=True,
                       with_replacement=False, random_state=1234)

    first_pass = list(mux(max_iter=100))
    second_pass = list(mux(max_iter=100))

    assert len(first_pass) == len(second_pass)
def test_mux_weighted(self, weight, mux_class):
    """A zero weight on the noise stream must keep it out of the output."""
    reference = list(T.finite_generator(50))
    noise = list(T.finite_generator(50, size=1))

    sources = [pescador.Streamer(reference), pescador.Streamer(noise)]
    mux = mux_class(sources, 2, rate=256, weights=[1.0, weight])

    estimate = list(mux)
    matches_reference = T._eq_list_of_dicts(reference, estimate)

    if weight == 0.0:
        assert matches_reference
    else:
        assert not matches_reference
def test_mux_rare(weight):
    """The mux output must be the reference followed by the noise."""
    reference = list(T.finite_generator(50))
    noise = list(T.finite_generator(50, size=1))

    sources = [pescador.Streamer(reference), pescador.Streamer(noise)]
    mux = pescador.mux.Mux(sources, 2, weights=weight,
                           with_replacement=False)

    estimate = list(mux)
    expected = reference + noise

    assert expected == estimate
def test_zmq_buffer():
    """A buffered stream served over ZMQ yields n_samples / buff_size items."""
    n_samples = 50
    buff_size = 10

    raw = pescador.Streamer(T.md_generator, dimension=2, n=n_samples,
                            size=64, items=['X', 'Y'])
    buffered = pescador.Streamer(pescador.buffer_stream, raw, buff_size)
    served = pescador.ZMQStreamer(buffered)

    outputs = list(served)

    assert len(outputs) == int(n_samples) / buff_size
def test_rr_permuted_cycle(self):
    """Check the symbol counts of a permuted-cycle round robin mux.

    The empty streamer must contribute nothing to the twelve samples.
    """
    a = pescador.Streamer('a')
    b = pescador.Streamer('bb')
    empty = pescador.Streamer([])
    c = pescador.Streamer('c')
    mux = pescador.mux.RoundRobinMux([a, b, empty, c], 'permuted_cycle')

    counts = collections.Counter(mux.iterate(12))

    expected = {'a': 3, 'b': 6, 'c': 3}
    assert len(counts) == 3
    for symbol in ('a', 'b', 'c'):
        assert counts[symbol] == expected[symbol]
def test_mux_of_mux():
    """Make sure that mux activation still works correctly when a mux is
    built on top of other muxes.
    """
    # One streamer per letter, each repeating its letter a few times.
    payloads = ['aaaaaaaaaa', 'bbbbbbbb', 'cccccc', 'dddd',
                'ee', 'fff', 'gggg', 'hhhhh']
    a, b, c, d, e, f, g, h = [pescador.Streamer(text) for text in payloads]

    base1 = pescador.mux.ShuffledMux([a, b], random_state=1)
    base2 = pescador.mux.ShuffledMux([c, d, e], random_state=10)
    base3 = pescador.mux.ShuffledMux([f, g, h], random_state=100)

    train_mux = pescador.mux.StochasticMux([base1, base2, base3],
                                           n_active=2, rate=3,
                                           mode="with_replacement",
                                           random_state=123)

    sample_counts = collections.Counter(train_mux.iterate(100))

    # Every letter must show up at least once.
    assert set(sample_counts.keys()) == set('abcdefgh')
def test_mux_of_muxes_single(self, mux_class):
    """A mux over two muxes must surface every leaf symbol (Issue #79)."""

    def _make_mux(children):
        return mux_class(children, 2, rate=None, prune_empty_streams=False)

    abc = pescador.Streamer('abc')
    xyz = pescador.Streamer('xyz')
    letters_mux = _make_mux([abc, xyz])

    n123 = pescador.Streamer('123')
    n456 = pescador.Streamer('456')
    digits_mux = _make_mux([n123, n456])

    top_mux = _make_mux([letters_mux, digits_mux])

    tally = collections.Counter(list(top_mux.iterate(max_iter=10000)))

    assert set('abcxyz123456') == set(tally.keys())
def test_mux_weighted(weight):
    """A zero weight on the noise stream must keep it out of the output."""
    reference = list(T.finite_generator(50))
    noise = list(T.finite_generator(50, size=1))

    sources = [pescador.Streamer(reference), pescador.Streamer(noise)]
    mux = pescador.mux.Mux(sources, 2, weights=[1.0, weight],
                           with_replacement=False)

    estimate = list(mux)

    if weight != 0.0:
        assert reference != estimate
    else:
        assert reference == estimate
def val_generator(working, tracks, sampler, augment=True):
    '''Validation generator: a round-robin mux over all track files.

    For every track a streamer is built for ``<working>/<track>.h5``; when
    `augment` is true, one more is added for each file matching
    ``<working>/<track>.*.h5`` (in sorted order).

    Parameters
    ----------
    working
        Directory that holds the ``.h5`` files.
    tracks
        Iterable of track identifiers.
    sampler
        Forwarded to ``data_sampler`` through each streamer.
    augment
        If true, also stream from the ``<track>.*.h5`` files.

    Returns
    -------
    pescador.RoundRobinMux
    '''

    def _streamer(path):
        return pescador.Streamer(data_sampler, path, sampler)

    streamers = []
    for track in tracks:
        primary = os.path.join(working, os.path.extsep.join([track, 'h5']))
        streamers.append(_streamer(primary))

        if not augment:
            continue

        pattern = os.path.join(working, '{}.*.h5'.format(track))
        streamers.extend(_streamer(path) for path in sorted(glob(pattern)))

    # Hand the whole pool to a single mux
    return pescador.RoundRobinMux(streamers)
def test_weighted_empty_streams(self, mux_class):
    """A heavily weighted empty stream must not affect the output."""

    def _yields_nothing():
        return
        yield 1  # unreachable; present only to make this a generator

    reference = pescador.Streamer(T.finite_generator, 10)
    empty = pescador.Streamer(_yields_nothing)

    # Nearly all of the weight sits on the stream that has no data.
    mux = mux_class([reference, empty], weights=[1e-10, 1e10])
    estimate = list(mux.iterate(10))

    expected = list(reference)
    assert len(expected) == len(estimate)
    for want, got in zip(expected, estimate):
        T._eq_batch(want, got)
def test_mux_k_greater_n(self, mux_class, n_samples, rate, random_state):
    """Test that replacement works when more streamers are active than exist.

    See #112: https://github.com/pescadores/pescador/issues/112

    When streamers are activated, they should make copies of their
    underlying streamers, and this should work.  Before the bug was fixed,
    this would fail.

    Note: only the number of samples is checked here, not the underlying
    state.
    """
    sources = [pescador.Streamer('a'), pescador.Streamer('b')]

    # Six active slots for only two distinct streamers.
    mux = mux_class(sources, 6, rate, random_state=random_state)
    drawn = list(mux.iterate(n_samples))

    assert len(drawn) == n_samples
def test_mux_bad_weights(mux_class):
    """An all-zero weight vector must be rejected with a PescadorError."""
    # Build the fixtures outside the `raises` block so that only the mux
    # construction itself can satisfy the expectation.
    streamers = [pescador.Streamer(T.finite_generator, 10) for _ in range(5)]
    zero_weights = np.zeros(5)

    # 5 streamers, all-zeros weight vector should trigger an error
    with pytest.raises(pescador.PescadorError):
        mux_class(streamers, weights=zero_weights)
def test_deepcopy__randomseed(self, mux_class, random_state):
    """Deep-copying a mux copies its streamers and its random state."""
    n_streams = 10

    # Each stream gets its own offset so that its values are unique; that
    # makes it possible to tell whether two muxes drew from the same
    # streamer.
    streamers = []
    for i in range(n_streams):
        streamers.append(
            pescador.Streamer(T.infinite_generator, offset=i * 10))

    mux = mux_class(streamers, random_state=random_state)
    copy_mux = copy.deepcopy(mux)

    assert mux.streamers is not copy_mux.streamers
    assert len(mux.streamers) == len(copy_mux.streamers)

    if random_state is None:
        # Both fall back to the global numpy random module.  With global
        # state the two sample sequences cannot be guaranteed to agree
        # without resetting the seed, so they are not compared.
        assert mux.rng == np.random
        assert copy_mux.rng == np.random
        return

    assert mux.rng is not copy_mux.rng

    # Only the second entry of the state tuple is useful to compare.
    original_state = mux.rng.get_state()
    copied_state = copy_mux.rng.get_state()
    assert np.allclose(original_state[1], copied_state[1])

    # With a local random state both muxes must produce the same samples.
    sample1 = list(mux.iterate(30))
    sample2 = list(copy_mux.iterate(30))
    assert T._eq_list_of_dicts(sample1, sample2)