def create_stream_reader(single_signal_file_list, batch_size):
    # One streamer per audio file.
    data_streams = []
    for audio_path in single_signal_file_list:
        stream = pescador.Streamer(wav_generator, audio_path)
        data_streams.append(stream)
    # Sample uniformly at random across all files, then collate into batches.
    mux = pescador.ShuffledMux(data_streams)
    batch_gen = pescador.buffer_stream(mux, batch_size)
    return batch_gen
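The pattern above (one Streamer per file, a ShuffledMux over the streamers, then buffer_stream to collate batches) recurs in several of the examples that follow. Here is a minimal, self-contained sketch of the same pipeline, with a hypothetical toy_generator standing in for wav_generator:

import numpy as np
import pescador

def toy_generator(seed):
    # Stand-in for wav_generator: yields dict-of-ndarray samples forever.
    rng = np.random.RandomState(seed)
    while True:
        yield {'X': rng.randn(16)}

streams = [pescador.Streamer(toy_generator, seed) for seed in range(4)]
mux = pescador.ShuffledMux(streams)
# buffer_stream stacks each key across buffer_size samples,
# so batch['X'] has shape (8, 16) here.
for batch, _ in zip(pescador.buffer_stream(mux, 8), range(2)):
    print(batch['X'].shape)

ShuffledMux keeps every input stream active and picks one at random for each sample, so every file contributes to the batches in proportion to its weight.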
def test_shuffled_mux_integer_weights(self):
    """Tests that integer-valued weights are supported (issue #143)."""
    a = pescador.Streamer(_cycle, 'a')
    b = pescador.Streamer(_cycle, 'b')
    c = pescador.Streamer(_cycle, 'c')

    int_weights = [6, 3, 1]
    int_mux = pescador.ShuffledMux([a, b, c], weights=int_weights,
                                   random_state=10)
    int_seq = "".join(list(int_mux.iterate(max_iter=20)))

    float_weights = [6.0, 3.0, 1.0]
    float_mux = pescador.ShuffledMux([a, b, c], weights=float_weights,
                                     random_state=10)
    float_seq = "".join(list(float_mux.iterate(max_iter=20)))

    assert int_seq == float_seq
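Integer and float weights yield identical sequences because ShuffledMux normalizes whatever weights it is given into a probability distribution; the statistics tests later in this section rely on the same property when they compare empirical frequencies against mux.weights. A quick check; the exact normalized values are inferred from these tests, not a documented guarantee:

a = pescador.Streamer(_cycle, 'a')
b = pescador.Streamer(_cycle, 'b')
c = pescador.Streamer(_cycle, 'c')
mux = pescador.ShuffledMux([a, b, c], weights=[6, 3, 1])
print(mux.weights)  # expected to be normalized, roughly [0.6, 0.3, 0.1]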
def __init__(self, source_filepath, seq_len=512, hop=None,
             normalize=True, transform=None, restart_streams=False):
    super().__init__()

    source_folder = Path(source_filepath)
    self.seq_len = seq_len
    if hop is None:
        hop = seq_len
    self.hop = hop
    self.normalize = normalize
    self.transform = transform

    # Collect song paths, restricted to wav files (damn .DS_Store).
    songs = []
    for root, dirs, files in os.walk(source_folder):
        for name in files:
            songs.append(os.path.join(root, name))
    songs = [song for song in songs if song.endswith('.wav')]

    # Record each song's per-channel length in samples (this uses the
    # legacy torchaudio.info API, which returns (signal_info, encoding_info)).
    data = []
    for song in songs:
        song_info = torchaudio.info(song)
        data.append({
            "path": song,
            "len": int(song_info[0].length / song_info[0].channels)
        })
    self.data = data

    # Mux the per-song streams.
    if restart_streams:
        # ShuffledMux keeps every stream active and samples across
        # all songs indefinitely.
        streams = [
            pescador.Streamer(generate_rnd_chunk, track['path'], track['len'],
                              seq_len, normalize, transform)
            for track in data
        ]
        self.mux = pescador.ShuffledMux(streams)
    else:
        # StochasticMux in 'exhaustive' mode draws from each stream
        # until it is depleted, without restarting it.
        streams = [
            pescador.Streamer(generate_chunk, track['path'], track['len'],
                              seq_len, hop, normalize, transform)
            for track in data
        ]
        self.mux = pescador.StochasticMux(streams, len(streams),
                                          rate=None, mode='exhaustive')
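MusicDataset stores the mux but its iteration method is not shown above; a hedged sketch of how a consumer might draw chunks, assuming the dataset exposes self.mux directly (the constructor arguments are illustrative):

dataset = MusicDataset('path/to/wavs', seq_len=512, restart_streams=True)
for chunk in dataset.mux.iterate(max_iter=4):
    ...  # each chunk is whatever generate_rnd_chunk yields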
def create_batch_generator(audio_filepath_list, batch_size):
    streamers = []
    for audio_filepath in audio_filepath_list:
        s = pescador.Streamer(audio_sample_generator, audio_filepath)
        streamers.append(s)
    mux = pescador.ShuffledMux(streamers)
    batch_gen = pescador.buffer_stream(mux, batch_size)
    return batch_gen
def create_batch_generator(audio_filepath_list, batch_size):
    streamers = []
    for audio_filepath in audio_filepath_list:
        # The class name is the filename prefix before the first '-'.
        class_name = os.path.basename(audio_filepath).split('-')[0]
        if class_name in class_labels:
            label = class_labels[class_name]
        else:
            label = None
        s = pescador.Streamer(file_sample_generator, [audio_filepath, label])
        streamers.append(s)
    mux = pescador.ShuffledMux(streamers)
    batch_gen = pescador.buffer_stream(mux, batch_size)
    return batch_gen
def test_shuffled_mux_simple(self):
    """Test that `ShuffledMux` samples from all provided streams."""
    to_generate = ['a', 'b', 'c', 'd', 'e']
    streams = [pescador.Streamer(_cycle, x) for x in to_generate]
    mux = pescador.ShuffledMux(streams, random_state=10)
    samples = list(mux.iterate(max_iter=1000))
    counter = collections.Counter(samples)

    # Every input stream should appear in the output.
    assert set(counter.keys()) == set(to_generate)

    # Test that the statistics line up with expected:
    # empirical frequencies should match the mux's (uniform) weights.
    for i, key in enumerate(to_generate):
        np.testing.assert_approx_equal(counter[key] / len(samples),
                                       mux.weights[i],
                                       significant=1)
def batch_generator(self, audio_path_list, audio_label_list, batch_size):
    """
    Generates batches to feed the algorithm (NN); a batch is a bunch
    of samples that is input to the algorithm one batch at a time.

    :param audio_path_list: list of all paths in the audio dataset
    :param audio_label_list: list of labels, parallel to audio_path_list
    :param batch_size: size (length) of a batch
    :return: generator yielding batches
    """
    # TODO: Make it closure form to get some speed
    streamers = []
    for (audio_path, audio_label) in zip(audio_path_list, audio_label_list):
        s = pescador.Streamer(self.sample_generator, audio_path, audio_label)
        streamers.append(s)
    mux = pescador.ShuffledMux(streamers)
    return pescador.buffer_stream(mux, batch_size)
def test_shuffled_mux_weights(self):
    """When sampling with weights, do the statistics line up?"""
    a = pescador.Streamer(_cycle, 'a')
    b = pescador.Streamer(_cycle, 'b')
    c = pescador.Streamer(_cycle, 'c')

    weights = [.6, .3, .1]
    mux = pescador.ShuffledMux([a, b, c], weights=weights, random_state=10)
    samples = list(mux.iterate(max_iter=1000))
    counter = collections.Counter(samples)

    # Test that [a, b, c] are all in the set.
    assert set(counter.keys()) == {'a', 'b', 'c'}

    # Test the statistics on the counts.
    # Does the sampling approximately match the weights?
    for i, key in enumerate(['a', 'b', 'c']):
        np.testing.assert_approx_equal(counter[key] / len(samples),
                                       weights[i],
                                       significant=1)
# Each streamer will generate, on average, 5 samples before being
# replaced.
mux1 = pescador.StochasticMux(pop1, 3, 5)

# Let's have 5 active streamers for population 2, and replace
# them after 2 examples on average.
mux2 = pescador.StochasticMux(pop2, 5, 2)

####################
# Mux composition
####################
# We multiplex the two populations using a ShuffledMux.
# The ShuffledMux keeps all of its input streamers active,
# and draws samples independently at random from each one.

# This should generate an approximately equal number of upper- and
# lower-case letters, with more diversity among the lower-case letters.
hier_mux = pescador.ShuffledMux([mux1, mux2])
print(''.join(hier_mux(max_iter=80)))

#####################
# Weighted sampling
#####################
# If you want to specify the sampling probability of mux1 and mux2,
# you can supply weights to the ShuffledMux.
# By default, each input is equally likely.

# This should generate three times as many upper-case as lower-case letters.
weight_mux = pescador.ShuffledMux([mux1, mux2], weights=[0.75, 0.25])
print(''.join(weight_mux(max_iter=80)))
def train(self, train_dir, kk=0, folds=10000, grid=False, grid_file=None):
    p = self.p
    net = self.build()
    net.to(device)
    net.train()

    # Loss and Optimizer
    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=p["learning_rate"],
                                 weight_decay=p["reg_lambda"])

    # Pescador streams: one streamer per training file, shuffled together.
    train_files = [os.path.join(train_dir, f) for f in os.listdir(train_dir)]
    streams_train = [
        pescador.Streamer(indexes_gen, ff, 1, p["utter_len"], p["char_len"],
                          k=kk, mode="train", folds=folds)
        for ff in train_files
    ]
    mux_stream_train = pescador.ShuffledMux(streams_train, random_state=33)

    word_idxs = np.empty(shape=(p["batch_size"],
                                p["utter_len"] * p["char_len"]),
                         dtype=int)
    labels = np.empty(shape=(p["batch_size"]), dtype=int)

    # Train the Model
    for epoch in range(p["num_epochs"]):
        print("Epoch " + str(epoch))
        for i, (word_idx, label, _, _) in enumerate(mux_stream_train):
            np.copyto(word_idxs[i % p["batch_size"]], word_idx)
            labels[i % p["batch_size"]] = label

            if i % p["batch_size"] == 0 and i != 0:
                answers = autograd.Variable(torch.LongTensor(labels))
                samples = torch.LongTensor(word_idxs)
                answers = answers.to(device)
                samples = samples.to(device)

                optimizer.zero_grad()
                outputs = net(samples)
                loss = criterion(outputs, answers)
                loss.backward()
                optimizer.step()

                if (i + 1) % 20 == 0:
                    print("Epoch [%d/%d], Batch [%d], Loss: %.4f" %
                          (epoch + 1, p["num_epochs"], i + 1, loss.item()))
                if i // p["batch_size"] > p["max_batch_epoch"]:
                    break

        # Estimate intermediate results every 10 epochs during grid search.
        if grid and (epoch + 1) % 10 == 0:
            net.eval()
            results_file = tempfile.NamedTemporaryFile(mode="w", delete=False)
            streams_test = [
                pescador.Streamer(indexes_gen, ff, 1, p["utter_len"],
                                  p["char_len"], k=kk, mode="test",
                                  folds=folds)
                for ff in train_files
            ]
            # ChainMux exhausts the test streams in order, one after another.
            mux_stream_test = pescador.ChainMux(streams_test)
            for i, (word_idx, label, character, _) in \
                    enumerate(mux_stream_test):
                samples = torch.LongTensor(
                    word_idx.reshape((1, p["utter_len"] * p["char_len"])))
                samples = samples.to(device)
                output = net(samples)
                entry = output.cpu().data.numpy()[0]
                results_file.write(
                    str(character[0]) + "\t" + str(label[0]) + "\t" +
                    "\t".join([str(y) for y in
                               sorted(enumerate(np.exp(entry)),
                                      key=lambda x: x[1],
                                      reverse=True)]) + "\n")
            results_file.close()

            mrr_character = compute_MRR_per_character(results_file.name)
            macro_mrr = compute_MRR_per_prof(results_file.name, 1)
            auroc = compute_auroc(results_file.name, 1)
            grid_file.write(str(epoch + 1) + "\t" + str(mrr_character) +
                            "\t" + str(macro_mrr) + "\t" + str(auroc[0]) +
                            "\n")
            grid_file.flush()
            os.remove(results_file.name)
            net.train()

    # Save the Model
    if not grid:
        self.save(p["model_path"])