def data_generator_val(directories, sampler, batch_size=16, slices=None, **kwargs):
    """Build a batched, cycling stream over the ``h5`` tracks of several folders.

    Every file reported by ``find_files(folder, ext='h5')`` is joined onto its
    folder and wrapped with ``data_sampler``. The resulting seeds are chained
    in order with ``pescador.ChainMux`` in ``'cycle'`` mode and then buffered.

    Parameters
    ----------
    directories : iterable
        Folders to scan for tracks.
    sampler
        Forwarded unchanged to ``data_sampler``.
    batch_size : int
        Buffer size handed to ``pescador.buffer_stream``.
    slices
        Forwarded unchanged to ``data_sampler``.
    **kwargs
        Extra keyword arguments for ``pescador.ChainMux``.

    Returns
    -------
    The object returned by ``pescador.buffer_stream(mux, batch_size, axis=0)``.
    """
    seeds = []
    for folder in directories:
        seeds.extend(
            data_sampler(os.path.join(folder, name), sampler, slices)
            for name in tqdm(find_files(folder, ext='h5'))
        )

    # Chain every per-track seed into a single repeating stream.
    chained = pescador.ChainMux(seeds, mode='cycle', **kwargs)
    return pescador.buffer_stream(chained, batch_size, axis=0)
for model in models:
    # Each experiment keeps its files under DATA_FOLDER/experiments/<model>/.
    experiment_folder = config_file.DATA_FOLDER + 'experiments/' + str(model) + '/'
    # Read the stored configuration; the context manager closes the file
    # instead of leaving the handle to the garbage collector.
    with open(experiment_folder + 'config.json') as config_fp:
        config = json.load(config_fp)
    print('Experiment: ' + str(model))
    print('\n' + str(config))

    # pescador: define (finite, batched & parallel) streamer
    pack = [config, 'overlap_sampling', config['n_frames'], False]
    streams = [
        pescador.Streamer(train.data_gen, id, id2audio_repr_path[id], id2gt[id], pack)
        for id in ids
    ]
    # 'exhaustive' mode ends once every per-track streamer has been consumed.
    mux_stream = pescador.ChainMux(streams, mode='exhaustive')
    # partial=True is passed so the final, smaller batch is requested as well.
    batch_streamer = pescador.Streamer(pescador.buffer_stream,
                                       mux_stream,
                                       buffer_size=TEST_BATCH_SIZE,
                                       partial=True)
    batch_streamer = pescador.ZMQStreamer(batch_streamer)

    # tensorflow: define model and cost
    fuckin_graph = tf.Graph()
    with fuckin_graph.as_default():
        sess = tf.Session()
        [x, y_, is_train, y, normalized_y, cost] = train.tf_define_model_and_cost(config)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        results_folder = experiment_folder
def eval(config, ids, id2audio_repr_path, support_set, id2gt, id2label, tf_vars, vis_vars):
    """Run the model over ``ids`` and return the aggregated accuracy.

    One pescador streamer is created per id, the streamers are chained
    exhaustively and batched with ``config['test_batch_size']``. Each batch is
    fed to the session together with ``support_set``; the per-batch outputs are
    collected and joined once after the loop.

    Parameters
    ----------
    config : dict
        Must provide ``'test_batch_size'``; also forwarded to the data
        generator inside ``pack``.
    ids : sequence
        Identifiers to evaluate; each is looked up in ``id2audio_repr_path``
        and ``id2gt``.
    id2audio_repr_path, id2gt : mapping
        Per-id arguments forwarded to the data generator.
    support_set
        Value fed to the ``x`` placeholder for every batch.
    id2label
        Forwarded to ``shared.accuracy_with_aggergated_predictions``.
    tf_vars : list
        ``[sess, x, q, log_p_y, emb_q, emb_prototypes]`` when transfer learning
        is enabled, otherwise
        ``[sess, x, q, is_train, log_p_y, emb_q, emb_prototypes]``.
    vis_vars : list
        ``[id_string, save_latents, track_accuracies, printing,
        transfer_learning, model_folder]``.

    Returns
    -------
    The value returned by ``shared.accuracy_with_aggergated_predictions``.

    Raises
    ------
    ValueError
        If the stream yields no batch at all, so there is nothing to score.
    """
    [
        id_string, save_latents, track_accuracies, printing, transfer_learning,
        model_folder
    ] = vis_vars

    if transfer_learning:
        [sess, x, q, log_p_y, emb_q, emb_prototypes] = tf_vars
        is_train = None  # this variant has no is_train placeholder to feed
        data_gen = transfer_train.data_gen
        pack = [config, 'overlap_sampling', 1]
    else:
        [sess, x, q, is_train, log_p_y, emb_q, emb_prototypes] = tf_vars
        data_gen = sl_train.data_gen
        pack = [config, 'overlap_sampling', 42]  # 42 being a second of audio

    eval_streams = [
        pescador.Streamer(data_gen, track_id, id2audio_repr_path[track_id],
                          id2gt[track_id], pack)
        for track_id in ids
    ]
    eval_mux_stream = pescador.ChainMux(eval_streams, mode='exhaustive')
    eval_batch_streamer = pescador.Streamer(
        pescador.buffer_stream,
        eval_mux_stream,
        buffer_size=config['test_batch_size'],
        partial=True)

    # Collect per-batch results and join them once after the loop; growing the
    # arrays with concatenate on every batch copies all previous data again.
    pred_batches = []
    id_batches = []
    embed_batches = []
    gt_batches = []
    prototypes = None
    for eval_batch in eval_batch_streamer:
        feed_dict = {
            x: support_set,
            q: np.expand_dims(eval_batch['X'], axis=-1)
        }
        if not transfer_learning:
            feed_dict[is_train] = False
        [probabilities, embeddings, prototypes] = sess.run(
            [log_p_y, emb_q, emb_prototypes], feed_dict=feed_dict)

        pred_batches.append(probabilities)
        id_batches.append(eval_batch['ID'])
        if save_latents:
            embed_batches.append(embeddings)
            gt_batches.append(eval_batch['Y'])

    if not pred_batches:
        raise ValueError('eval: the data stream produced no batches for the given ids')

    pred_array = np.concatenate(pred_batches, axis=0)
    if len(id_batches) == 1:
        id_array = id_batches[0]
    else:
        # Same result as chaining np.append: every batch is flattened first.
        id_array = np.concatenate([np.ravel(batch_ids) for batch_ids in id_batches])

    epoch_acc = shared.accuracy_with_aggergated_predictions(
        pred_array, id_array, ids, id2label)

    if printing:
        print(id_string + ' Number of audios: ' + str(len(ids)))
        print(id_string + ' Accuracy: ' + str(epoch_acc))
        print(id_string + ' Prototypes: ' + str(prototypes.shape))

    if track_accuracies:
        # Append mode keeps one accuracy value per call in the same file.
        with open(model_folder + 'epoch_accuracies.tsv', 'a') as fac:
            fac.write(str(epoch_acc) + '\n')

    if save_latents:
        embed_array = np.concatenate(embed_batches, axis=0)
        gt_array = np.concatenate(gt_batches, axis=0)
        print(id_string + ' Embed_array: ' + str(embed_array.shape))
        print(id_string + ' GT: ' + str(gt_array.shape))
        np.savez(model_folder + 'embeddings_' + id_string + '.npz', embed_array)
        # The prototypes saved are those returned for the last batch.
        np.savez(model_folder + 'prototypes.npz', prototypes)
        np.savez(model_folder + 'gt_' + id_string + '.npz', gt_array)
        print('Storing latents for visualization..')
        print('\nPrototypes: ')
        print(prototypes)

    return epoch_acc
def fetch_data(classes_vector,
               label2selectedIDs,
               id2audio_repr_path,
               id2gt,
               config,
               transfer_learning=False):
    """Stream and stack the data of every class in ``classes_vector``.

    For each class, one pescador streamer is built per id in
    ``label2selectedIDs[c]``; the streamers are chained exhaustively, batched,
    and the ``'X'``, ``'Y'`` and ``'ID'`` fields of all batches are joined
    along axis 0.

    Parameters
    ----------
    classes_vector : iterable
        Class labels to process; used as keys of the returned dictionaries.
    label2selectedIDs : mapping
        Class label -> sequence of ids selected for that class.
    id2audio_repr_path, id2gt : mapping
        Per-id arguments forwarded to the data generator.
    config : dict
        Must provide ``'preprocess_batch_size'``, ``'train_sampling'`` and
        ``'param_train_sampling'``.
    transfer_learning : bool
        Selects ``transfer_train.data_gen`` instead of ``sl_train.data_gen``.

    Returns
    -------
    list
        ``[set_dic, gt_dic, id_dic, minimum_number_of_patches,
        total_number_of_patches]``. The minimum stays ``np.inf`` when
        ``classes_vector`` is empty.

    Raises
    ------
    ValueError
        If a class yields no batch. Without this check the arrays of the
        previously processed class would be stored under the new label.
    """
    set_dic = {}
    gt_dic = {}
    id_dic = {}
    minimum_number_of_patches = np.inf
    total_number_of_patches = 0

    for c in classes_vector:
        selected_ids = label2selectedIDs[c]

        # pescador: to batch the computations
        preprocess_batch_size = np.min(
            [len(selected_ids), config['preprocess_batch_size']])
        print('Batch size: ' + str(preprocess_batch_size))
        print('IDs used for computing the category ' + str(c) +
              ' prototype: ' + str(selected_ids))

        pack = [
            config, config['train_sampling'], config['param_train_sampling']
        ]
        data_gen = transfer_train.data_gen if transfer_learning else sl_train.data_gen
        streams = [
            pescador.Streamer(data_gen, track_id, id2audio_repr_path[track_id],
                              id2gt[track_id], pack)
            for track_id in selected_ids
        ]
        mux_stream = pescador.ChainMux(streams, mode='exhaustive')
        batch_streamer = pescador.Streamer(pescador.buffer_stream,
                                           mux_stream,
                                           buffer_size=preprocess_batch_size,
                                           partial=True)

        # construct data vectors: gather the batches, then join them once
        x_batches = []
        y_batches = []
        id_batches = []
        for batch in batch_streamer:
            x_batches.append(batch['X'])
            y_batches.append(batch['Y'])
            id_batches.append(batch['ID'])

        if not x_batches:
            raise ValueError('fetch_data: no data was streamed for class ' + str(c))

        class_set = np.concatenate(x_batches, axis=0)
        class_gt = np.concatenate(y_batches, axis=0)
        class_id = np.concatenate(id_batches, axis=0)
        print(class_set.shape)
        print(class_gt.shape)
        print(class_id.shape)

        set_dic[c] = class_set
        gt_dic[c] = class_gt
        id_dic[c] = class_id
        minimum_number_of_patches = min(minimum_number_of_patches,
                                        class_set.shape[0])
        total_number_of_patches += class_set.shape[0]

    return [
        set_dic, gt_dic, id_dic, minimum_number_of_patches,
        total_number_of_patches
    ]
data = np.load(filename) X = data['X'] Y = data['Y'] for i in range(m): yield dict(X=X[i], y=Y[i]) ############################ # Constructing the streamers ############################ # First, we'll make a streamer for each validation example. # val_streams = [data_gen(fn, M) for fn in val_files] ############################ # Building the mux ############################ # The `ChainMux` can be used to combine data from all val_streams # in order. # We'll use `cycle` mode here, so that the chain restarts after # all of the streamers have been exhausted. # This produces an infinite stream of data from a finite sequence # that repeats every `N*M` steps. # This can be used in `keras`'s `fit_generator` function # with `validation_steps=N*M` to ensure that the validation set is # constant at each epoch. val_stream = pescador.ChainMux(val_streams, mode='cycle')
def train(self, train_dir, kk=0, folds=10000, grid=False, grid_file=None):
    """Train the network built by ``self.build()`` on the files in ``train_dir``.

    Samples are drawn from a ``pescador.ShuffledMux`` over one streamer per
    file and written into a fixed-size buffer; an optimiser step is taken each
    time the running sample index is a non-zero multiple of the batch size.

    Parameters
    ----------
    train_dir : str
        Directory whose entries are all used as training files.
    kk : int
        Forwarded to ``indexes_gen`` as ``k``.
    folds : int
        Forwarded to ``indexes_gen`` as ``folds``.
    grid : bool
        When truthy, every 10th epoch the ``mode="test"`` streams are scored
        and one line of metrics is written to ``grid_file``; the model is not
        saved. When falsy, the model is saved to ``p["model_path"]`` after
        training.
    grid_file : file-like, optional
        Destination of the metric lines; needs ``write`` and ``flush``.

    Notes
    -----
    Reads ``learning_rate``, ``reg_lambda``, ``utter_len``, ``char_len``,
    ``batch_size``, ``num_epochs``, ``max_batch_epoch`` and ``model_path``
    from ``self.p``.
    """
    p = self.p
    net = self.build()
    net.to(device)
    net.train()

    # Loss and Optimizer
    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=p["learning_rate"],
                                 weight_decay=p["reg_lambda"])

    # Pescador streams
    train_files = [os.path.join(train_dir, f) for f in os.listdir(train_dir)]
    streams_train = [
        pescador.Streamer(indexes_gen, ff, 1, p["utter_len"], p["char_len"],
                          k=kk, mode="train", folds=folds)
        for ff in train_files
    ]
    mux_stream_train = pescador.ShuffledMux(streams_train, random_state=33)

    batch_size = p["batch_size"]
    sample_width = p["utter_len"] * p["char_len"]
    word_idxs = np.empty(shape=(batch_size, sample_width), dtype=int)
    labels = np.empty(shape=(batch_size), dtype=int)

    # No optimiser step has run yet, so there is no loss to report.
    loss = None

    # Train the Model
    for epoch in range(p["num_epochs"]):
        print("Epoch " + str(epoch))
        for i, (word_idx, label, _, _) in enumerate(mux_stream_train):
            slot = i % batch_size
            np.copyto(word_idxs[slot], word_idx)
            labels[slot] = label

            if slot == 0 and i != 0:
                answers = autograd.Variable(torch.LongTensor(labels))
                samples = torch.LongTensor(word_idxs)
                answers = answers.to(device)
                samples = samples.to(device)

                optimizer.zero_grad()
                outputs = net(samples)
                loss = criterion(outputs, answers)
                loss.backward()
                optimizer.step()

            # NOTE(review): the nesting of this progress print was lost in the
            # source formatting; it is kept at sample level and skipped until a
            # loss exists so it cannot hit an unbound variable.
            if (i + 1) % 20 == 0 and loss is not None:
                print("Epoch [%d/%d], Batch [%d], Loss: %.4f" %
                      (epoch + 1, p["num_epochs"], i + 1, loss.item()))

            # The shuffled mux does not end by itself; cap the epoch length.
            if i // batch_size > p["max_batch_epoch"]:
                break

        # Estimate intermediate
        if grid and (epoch + 1) % 10 == 0:
            net.eval()
            # delete=False: the metric helpers reopen the file by name.
            results_file = tempfile.NamedTemporaryFile(mode="w", delete=False)
            try:
                streams_test = [
                    pescador.Streamer(indexes_gen, ff, 1, p["utter_len"],
                                      p["char_len"], k=kk, mode="test",
                                      folds=folds)
                    for ff in train_files
                ]
                mux_stream_test = pescador.ChainMux(streams_test)

                # Gradients are not needed while scoring.
                with torch.no_grad():
                    for word_idx, label, character, _ in mux_stream_test:
                        samples = torch.LongTensor(
                            word_idx.reshape((1, sample_width)))
                        samples = samples.to(device)
                        output = net(samples)
                        entry = output.cpu().data.numpy()[0]
                        # Class indices paired with exp(output), highest first.
                        ranked = sorted(enumerate(np.exp(entry)),
                                        key=lambda x: x[1],
                                        reverse=True)
                        results_file.write(
                            str(character[0]) + "\t" + str(label[0]) + "\t" +
                            "\t".join([str(y) for y in ranked]) + "\n")

                results_file.close()
                mrr_character = compute_MRR_per_character(results_file.name)
                macro_mrr = compute_MRR_per_prof(results_file.name, 1)
                auroc = compute_auroc(results_file.name, 1)
                grid_file.write(
                    str(epoch + 1) + "\t" + str(mrr_character) + "\t" +
                    str(macro_mrr) + "\t" + str(auroc[0]) + "\n")
                grid_file.flush()
            finally:
                # Always remove the temporary file, even if scoring failed.
                results_file.close()
                os.remove(results_file.name)
            net.train()

    # Save the Model
    if not grid:
        self.save(p["model_path"])
def multiplex_tfr(data_dir, n_hops, batch_size, mode="inference", aug_kind_str="none",
                  tfr_str="logmelspec", label_inputs=False, partial_labels=True,
                  structured=True, active_streamers=32, streamer_rate=1024,
                  num_cpus=1, multi_label=False, align_perturb=False,
                  single_output="fine"):
    """Build a Keras-ready batch stream over HDF5 containers.

    In ``"train"`` mode the containers are globbed per augmentation instance
    under ``<data_dir>/<tfr_str>/``, weighted by class and augmentation, and
    multiplexed stochastically. In any other mode every valid entry of
    ``data_dir`` is chained exhaustively.

    Parameters
    ----------
    data_dir : str
        Root directory of the data.
    n_hops, tfr_str, label_inputs, multi_label, align_perturb
        Forwarded to ``yield_tfr`` through each streamer.
    batch_size : int
        Batch size of the buffered stream.
    mode : str
        ``"train"`` selects the weighted branch; ``"train"`` and ``"valid"``
        may add label inputs.
    aug_kind_str : str
        One of ``"none"``, ``"pitch"``, ``"stretch"``, ``"all-but-noise"``,
        ``"all"``, ``"noise"``; read in train mode only.
    partial_labels
        Forwarded to ``is_valid_data_hdf5``.
    structured : bool
        Selects the three-level outputs and balancing on the fine index.
    active_streamers, streamer_rate
        Multiplexer settings (``n_active`` and ``rate``).
    num_cpus : int
        Values above 1 switch to ``create_zmq_mux``.
    single_output : str
        ``"fine"``, ``"medium"`` or ``"coarse"``; used when not structured.

    Returns
    -------
    The object returned by ``pescador.maps.keras_tuples``.

    Raises
    ------
    ValueError
        For an unknown ``aug_kind_str`` or ``single_output`` in train mode.
    """
    tfr_dir = os.path.join(data_dir, tfr_str)
    streams = []

    def _streamer_for(path, offset):
        # Every container is read by yield_tfr with the same shared settings.
        return pescador.Streamer(yield_tfr, path, n_hops, offset, tfr_str, mode,
                                 label_inputs, multi_label, align_perturb)

    if mode == "train":
        # Parse augmentation kind string (aug_kind_str).
        augs_by_kind = {
            "none": ["original"],
            "pitch": ["original", "pitch"],
            "stretch": ["original", "stretch"],
            "all-but-noise": ["original", "pitch", "stretch"],
            "all": ["original", "pitch", "stretch", "noise"],
            "noise": ["original", "noise"],
        }
        if aug_kind_str not in augs_by_kind:
            raise ValueError('Invalid augmentation kind: {}'.format(aug_kind_str))
        augs = augs_by_kind[aug_kind_str]

        # One streamer per HDF5 container, i.e. per
        # unit-augmentation-instance triplet.
        aug_dict = get_augmentations()
        aug_list = []
        class_list = []
        class_count = Counter()
        for aug_str in augs:
            if aug_str == "original":
                instances = [aug_str]
            else:
                instances = ["-".join([aug_str, str(number)])
                             for number in range(1, aug_dict[aug_str] + 1)]

            noisy_logmel = aug_str == "noise" and tfr_str == "logmelspec"
            bias = np.float32(-17.0) if noisy_logmel else np.float32(0.0)

            for instanced_aug_str in instances:
                aug_dir = os.path.join(tfr_dir, instanced_aug_str)
                lms_pattern = os.path.join(
                    aug_dir, "_".join(["*", instanced_aug_str]) + ".h5*")
                for lms_path in glob.glob(lms_pattern):
                    if not is_valid_data_hdf5(lms_path, partial_labels):
                        continue

                    # Second '_'-separated field of the file stem, with '-'
                    # replaced by '.'.
                    stem = os.path.splitext(os.path.basename(lms_path))[0]
                    taxonomy_code = stem.split('_')[1].replace('-', '.')
                    coarse_idx, medium_idx, fine_idx = \
                        annotations.get_taxonomy_code_idx_triplet(taxonomy_code)

                    idx_by_level = {"fine": fine_idx,
                                    "medium": medium_idx,
                                    "coarse": coarse_idx}
                    if structured:
                        bal_idx = fine_idx
                    elif single_output in idx_by_level:
                        bal_idx = idx_by_level[single_output]
                    else:
                        raise ValueError("Invalid single output mode:{}".format(single_output))

                    class_list.append(bal_idx)
                    class_count[bal_idx] += 1
                    aug_list.append(aug_str)
                    streams.append(_streamer_for(lms_path, bias))

        num_streamers = len(streams)
        num_fine_classes = len(class_count)
        num_aug = sum(1 for key in aug_dict.keys() if key != "original")

        class_weights = {}
        for cls, count in class_count.items():
            class_weights[cls] = num_streamers / float(num_fine_classes * count)

        aug_weights = {}
        for aug in aug_dict.keys():
            aug_weights[aug] = 1.0 if aug == "original" else 1.0 / num_aug

        # Each streamer's weight is its class weight times its augmentation
        # weight.
        weights = [class_weights[cls] * aug_weights[aug]
                   for cls, aug in zip(class_list, aug_list)]

        # Multiplex streamers together.
        if num_cpus > 1:
            mux = create_zmq_mux(streams, num_cpus, active_streamers,
                                 streamer_rate, weights=weights)
        else:
            mux = pescador.StochasticMux(streams,
                                         n_active=active_streamers,
                                         rate=streamer_rate,
                                         weights=weights)

        # Create buffered streamer with specified batch size.
        buffered_streamer = pescador.maps.buffer_stream(mux, batch_size)
    else:
        # If not dealing with augmentations, just go through all HDF5 files
        weights = None
        bias = np.float32(0.0)
        for fname in os.listdir(data_dir):
            lms_path = os.path.join(data_dir, fname)
            if is_valid_data_hdf5(lms_path, partial_labels):
                streams.append(_streamer_for(lms_path, bias))

        # Multiplex streamers together, but iterate exhaustively.
        mux = pescador.ChainMux(streams, mode='exhaustive')

        # Create buffered streamer with specified batch size.
        buffered_streamer = cycle_partial_buffer_stream(mux, batch_size)

    inputs = ["tfr_input"]
    if mode in ('train', 'valid') and structured and label_inputs:
        inputs += ["coarse_label_input", "medium_label_input"]

    outputs = ["y_coarse", "y_medium", "y_fine"] if structured else ["y_" + single_output]

    return pescador.maps.keras_tuples(buffered_streamer,
                                      inputs=inputs,
                                      outputs=outputs)