def test_load_dataset(self):
    """ Tests the load_dataset function """
    # if the dataset doesn't exist, load_dataset should raise an IOError
    with self.assertRaises(IOError):
        ds.load_dataset("this one definitely doesn't exist either", "")
def augment(dataset, format, shift_size):
    """ Augments the training set by rotating every angular feature (every
    fourth column, index % 4 == 2) through a full circle in steps of
    shift_size degrees, wrapping each shifted value with verify_angle.
    The test set is saved unchanged.

    Parameters
    ----------
    dataset <string> : the name of the dataset to load and augment
    format <string> : the coordinate format of the dataset
    shift_size <float> : the size of each angular shift, in degrees

    Returns
    -------
    None. Writes "augmented_<format>.npz" to the dataset's directory.
    """
    shift_size *= (math.pi/180.0)
    num_shifts = int(2*math.pi / shift_size)
    x_train, y_train, x_test, y_test = ds.load_dataset(dataset, format)
    augmented_x = np.zeros((x_train.shape[0]*num_shifts, x_train.shape[1]))
    augmented_y = np.zeros((y_train.shape[0]*num_shifts, y_train.shape[1]))
    for ix, line in enumerate(x_train):
        if (ix+1) % 1000 == 0:
            print ix+1
        for s in xrange(num_shifts):
            shift = s * shift_size
            augmented_x[ix*num_shifts+s] = [verify_angle(val+shift) if index % 4 == 2 else val
                                            for index, val in enumerate(line)]
            augmented_y[ix*num_shifts+s] = y_train[ix]
    tr.shuffle_in_unison(augmented_x, augmented_y)
    output_path = os.path.join(ds.get_path_to_dataset(dataset), "augmented_{}.npz".format(format))
    np.savez(output_path, x_train=augmented_x, x_test=x_test, y_train=augmented_y, y_test=y_test)
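# Example usage (a minimal sketch; "ttHLep" and "Unsorted" are the dataset and
# coordinate-format names used in the __main__ block further down, and the
# 15-degree step is only illustrative):
#
#   augment("ttHLep", "Unsorted", 15)
#   # writes augmented_Unsorted.npz with int(360/15) = 24 rotated copies of each
#   # training event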
def _save_by_jet_num(dataset, num_jets):
    data, format = dataset.split('/')
    x_train, y_train, x_test, y_test = ds.load_dataset(data, format)
    # Parse the jet-count selector: "N+" keeps >= N jets, "N-" keeps <= N jets,
    # and a plain "N" keeps exactly N jets.
    if num_jets.endswith("+"):
        val = lambda x: x >= int(num_jets[:-1])
    elif num_jets.endswith("-"):
        val = lambda x: x <= int(num_jets[:-1])
    else:
        val = lambda x: x == int(num_jets)
    all_x = np.concatenate((x_train, x_test), axis=0)
    all_y = np.concatenate((y_train, y_test), axis=0)
    # Each object occupies four consecutive columns; an object whose four values
    # are all non-positive is treated as a null (missing) object.
    nulls = np.zeros((all_x.shape[0], all_x.shape[1]/4), dtype=np.bool)
    for y in xrange(all_x.shape[1]/4):
        for ix, row in enumerate((all_x > 0)[:, y*4:(y+1)*4]):
            nulls[ix, y] = all(row == 0)
    # Count non-null objects per event, ignoring the last two object slots.
    events_with_x_jets = val(np.array([row[:-2].sum() for row in ~nulls]))
    all_x, all_y = all_x[events_with_x_jets], all_y[events_with_x_jets]
    tr.shuffle_in_unison(all_x, all_y)
    cutoff = int(all_x.shape[0] * 0.8)  # 80% training, 20% testing
    train_x = all_x[:cutoff]
    train_y = all_y[:cutoff]
    test_x = all_x[cutoff:]
    test_y = all_y[cutoff:]
    output_path = os.path.join(ds.get_path_to_dataset(data), "{}jets_{}.npz".format(num_jets, format))
    np.savez(output_path, x_train=train_x, x_test=test_x, y_train=train_y, y_test=test_y)
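# Example usage (a sketch; the "<data>/<format>" string follows the convention
# this function expects, with names borrowed from elsewhere in this repo):
#
#   _save_by_jet_num("ttHLep/Unsorted", "6+")   # keep events with at least 6 jets
#   _save_by_jet_num("ttHLep/Unsorted", "4-")   # keep events with at most 4 jets
#   # each call writes <num_jets>jets_Unsorted.npz with a fresh 80/20 train/test split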
def permutate_individual_sorted(dataset):
    """ Only use this for sorted data! Also, this takes up a significant amount of RAM """
    data, format = dataset.split('/')
    x_train, y_train, x_test, y_test = ds.load_dataset(data, format)

    # Generate permutations, transforms, and alter the dataset
    perms = list(gen_permutations(2, 7, 2))
    num_perms = len(perms)
    aperms = np.array(perms)
    labels = np.zeros(aperms.shape)
    r = np.arange(11)
    for i, p in enumerate(aperms):
        labels[i] = (p == r).astype('int32')
    transforms = np.zeros((44, 44 * num_perms))
    for i, p in enumerate(perms):
        transforms[:, i * 44:(i + 1) * 44] = E(p)

    # For the training data
    sorted_train_x = np.zeros((x_train.shape[0] * num_perms, x_train.shape[1]))
    sorted_train_y = np.zeros((sorted_train_x.shape[0], 2))
    for i, batch in enumerate(x_train):
        event = np.dot(batch, transforms).reshape((num_perms, x_train.shape[1]))
        arange = np.arange(num_perms)
        np.random.shuffle(arange)
        sorted_train_x[i * num_perms:(i + 1) * num_perms] = event[arange]
        sorted_train_y[i * num_perms:(i + 1) * num_perms] = labels[arange]

    # For the testing data
    sorted_test_x = np.zeros((x_test.shape[0] * num_perms, x_test.shape[1]))
    sorted_test_y = np.zeros((sorted_test_x.shape[0], 2))
    for i, batch in enumerate(x_test):
        event = np.dot(batch, transforms).reshape((num_perms, x_test.shape[1]))
        arange = np.arange(num_perms)
        np.random.shuffle(arange)
        sorted_test_x[i * num_perms:(i + 1) * num_perms] = event[arange]
        sorted_test_y[i * num_perms:(i + 1) * num_perms] = labels[arange]

    output_path = os.path.join(ds.get_path_to_dataset(data), "{}_{}.npz".format(format, "Permuted"))
    np.savez(output_path, x_train=sorted_train_x, x_test=sorted_test_x,
             y_train=sorted_train_y, y_test=sorted_test_y)
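# Example usage (a sketch; "Sorted" is a hypothetical coordinate-format name,
# the only requirement stated above being that the data are sorted):
#
#   permutate_individual_sorted("ttHLep/Sorted")
#   # writes Sorted_Permuted.npz with one transformed copy of every event per
#   # generated permutation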
        cutoff = 1 - i*(1.0/datapoints)
        e_b[i], e_s[i] = efficiencies(model, data, cutoff)[:, 1]
        if experiment_epoch:
            point = experiment_epoch.curve.add()
            point.signal = e_s[i]
            point.background = e_b[i]
            point.cutoff = cutoff
    if save:
        plt.plot(e_b, e_s)
        plt.title("Efficiency Curve")
        plt.ylabel("Signal Efficiency")
        plt.xlabel("Background Inefficiency")
        plt.savefig(save, format="png")
    return trapz(e_s, e_b)


def confusion_matrix(model, data, offset='', **kwargs):
    eff = efficiencies(model, data, **kwargs)
    return MATRIX.format(offset, *(eff*100).flatten())


if __name__ == "__main__":
    from deep_learning.trainNN import load_model

    model = load_model("ttHLep/U_Optimal")
    x_train, y_train, x_test, y_test = ds.load_dataset("ttHLep", "Unsorted")
    x_train, x_test = tr.transform(x_train, x_test)
    data = (x_train, y_train, x_test, y_test)
    print significance(model, data)
    print AUC(model, data)
    print confusion_matrix(model, data)
    print confusion_matrix(model, data, over_rows=False)
def save_ratios(dataset, ratios, buffer=1000):
    """ Divides a dataset into subsets with specified ratios of background to
    signal. Each ratio is given as a colon-separated string, e.g. "2:1" for two
    background events per signal event. Generates a temporary file to
    accomplish this.

    Parameters
    ----------
    dataset <string> : the name of the dataset (/-separated)
    ratios <list> : a list of colon-separated strings that define ratios of
                    background to signal
    buffer <int> : an integer defining the number of data points to load into
                   memory at a time
    """
    ratios = [ratios] if type(ratios) is str else ratios
    ratios = map(lambda x: map(float, x.split(':')), ratios)
    data = dataset.split('/')[0]
    format = '/'.join(dataset.split('/')[1:])
    main_file, (x_train, y_train, x_test, y_test) = ds.load_dataset(data, format, mode='a')

    bkg_test, sig_test = sum_cols(y_test)
    bkg_train, sig_train = sum_cols(y_train)
    TEST_UPPER_LIMIT = int(1.5 * bkg_test) if bkg_test < sig_test else int(1.5 * sig_test)
    TRAIN_UPPER_LIMIT = int(1.5 * bkg_train) if bkg_train < sig_train else int(1.5 * sig_train)

    temp_h_file, temp_h_data = add_group_hdf5(".deep_learning.temp.h5", "Temp",
                                              [(bkg_train, x_train.shape[1]),
                                               (bkg_train, y_train.shape[1]),
                                               (sig_train, x_train.shape[1]),
                                               (sig_train, y_train.shape[1]),
                                               (bkg_test, x_test.shape[1]),
                                               (bkg_test, y_test.shape[1]),
                                               (sig_test, x_test.shape[1]),
                                               (sig_test, y_test.shape[1])],
                                              names=["train_bkg_x", "train_bkg_y",
                                                     "train_sig_x", "train_sig_y",
                                                     "test_bkg_x", "test_bkg_y",
                                                     "test_sig_x", "test_sig_y"])

    print "Generating temporary files..."
    for i in xrange(int(math.ceil(x_train.shape[0] / buffer))):
        # The boolean index must have the same shape as the array being indexed,
        # so the result is reshaped after selection.
        train_bkg_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        train_sig_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        test_bkg_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])
        test_sig_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])
        for j in xrange(x_train.shape[1]):
            train_bkg_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 0] == 1
            train_sig_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 1] == 1
        for j in xrange(x_test.shape[1]):
            test_bkg_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 0] == 1
            test_sig_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 1] == 1

        selection = x_train[train_bkg_index]
        temp_h_data[0].append(selection.reshape((selection.size/x_train.shape[1], x_train.shape[1])))
        selection = y_train[train_bkg_index[:, :y_train.shape[1]]]
        temp_h_data[1].append(selection.reshape((selection.size/y_train.shape[1], y_train.shape[1])))
        selection = x_train[train_sig_index]
        temp_h_data[2].append(selection.reshape((selection.size/x_train.shape[1], x_train.shape[1])))
        selection = y_train[train_sig_index[:, :y_train.shape[1]]]
        temp_h_data[3].append(selection.reshape((selection.size/y_train.shape[1], y_train.shape[1])))
        selection = x_test[test_bkg_index]
        temp_h_data[4].append(selection.reshape((selection.size/x_test.shape[1], x_test.shape[1])))
        selection = y_test[test_bkg_index[:, :y_test.shape[1]]]
        temp_h_data[5].append(selection.reshape((selection.size/y_test.shape[1], y_test.shape[1])))
        selection = x_test[test_sig_index]
        temp_h_data[6].append(selection.reshape((selection.size/x_test.shape[1], x_test.shape[1])))
        selection = y_test[test_sig_index[:, :y_test.shape[1]]]
        temp_h_data[7].append(selection.reshape((selection.size/y_test.shape[1], y_test.shape[1])))

    # Perform all of this through the archive so that data is written to file every iteration
    buffer_reset = buffer
    for rat in ratios:
        print "Creating ratio {:d}/{:d} ...".format(*map(int, rat))
        h_file, h_data = add_group_hdf5(ds.get_path_to_dataset(data)+os.sep+data+".h5",
                                        "{}to{}".format(*map(int, rat)),
                                        [(TRAIN_UPPER_LIMIT, x_train.shape[1]),
                                         (TRAIN_UPPER_LIMIT, y_train.shape[1]),
                                         (TEST_UPPER_LIMIT, x_test.shape[1]),
                                         (TEST_UPPER_LIMIT, y_test.shape[1])],
                                        where='/{}'.format(format))
        test_bkg_indices = np.arange(bkg_test)
        test_sig_indices = np.arange(sig_test)
        train_bkg_indices = np.arange(bkg_train)
        train_sig_indices = np.arange(sig_train)

        train_count = 0
        buffer = buffer_reset
        while train_count < TRAIN_UPPER_LIMIT:
            if TRAIN_UPPER_LIMIT - train_count < buffer:
                buffer = TRAIN_UPPER_LIMIT - train_count
            # Indices to NOT include
            train_bkg_ix = np.random.choice(train_bkg_indices,
                                            train_bkg_indices.size - (rat[0] * buffer / sum(rat)),
                                            replace=False)
            train_sig_ix = np.random.choice(train_sig_indices,
                                            train_sig_indices.size - (rat[1] * buffer / sum(rat)),
                                            replace=False)
            # Indices to keep
            k_train_bkg = np.setdiff1d(train_bkg_indices, train_bkg_ix)
            k_train_sig = np.setdiff1d(train_sig_indices, train_sig_ix)

            train_small_x_sig = temp_h_data[2][k_train_sig]
            train_small_y_sig = temp_h_data[3][k_train_sig]
            train_small_x_bkg = temp_h_data[0][k_train_bkg]
            train_small_y_bkg = temp_h_data[1][k_train_bkg]

            train_x = np.concatenate((train_small_x_bkg, train_small_x_sig))
            train_y = np.concatenate((train_small_y_bkg, train_small_y_sig))

            tr.shuffle_in_unison(train_x, train_y)

            h_data[0].append(train_x)
            h_data[1].append(train_y)

            train_count += k_train_bkg.size + k_train_sig.size
            train_bkg_indices = train_bkg_ix
            train_sig_indices = train_sig_ix

        test_count = 0
        buffer = buffer_reset
        while test_count < TEST_UPPER_LIMIT:
            if TEST_UPPER_LIMIT - test_count < buffer:
                buffer = TEST_UPPER_LIMIT - test_count
            # Indices to NOT include
            test_bkg_ix = np.random.choice(test_bkg_indices,
                                           test_bkg_indices.size - (rat[0] * buffer / sum(rat)),
                                           replace=False)
            test_sig_ix = np.random.choice(test_sig_indices,
                                           test_sig_indices.size - (rat[1] * buffer / sum(rat)),
                                           replace=False)
            # Indices to keep
            k_test_bkg = np.setdiff1d(test_bkg_indices, test_bkg_ix)
            k_test_sig = np.setdiff1d(test_sig_indices, test_sig_ix)

            test_small_x_sig = temp_h_data[6][k_test_sig]
            test_small_y_sig = temp_h_data[7][k_test_sig]
            test_small_x_bkg = temp_h_data[4][k_test_bkg]
            test_small_y_bkg = temp_h_data[5][k_test_bkg]

            test_x = np.concatenate((test_small_x_bkg, test_small_x_sig))
            test_y = np.concatenate((test_small_y_bkg, test_small_y_sig))

            tr.shuffle_in_unison(test_x, test_y)

            h_data[2].append(test_x)
            h_data[3].append(test_y)

            test_count += k_test_bkg.size + k_test_sig.size
            test_bkg_indices = test_bkg_ix
            test_sig_indices = test_sig_ix

        print "Created Group: {}/{}to{}".format(format, *map(int, rat))
        h_file.flush()
        h_file.close()

    main_file.close()
    temp_h_file.close()
    os.remove(".deep_learning.temp.h5")
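# Example usage (a sketch; the ratio strings are illustrative, and any "<bkg>:<sig>"
# pair parses through the map(float, x.split(':')) step above):
#
#   save_ratios("ttHLep/Unsorted", ["1:1", "2:1"], buffer=1000)
#   # adds the groups Unsorted/1to1 and Unsorted/2to1 to ttHLep.h5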
def run(model, exp, terms, save_freq=5, data=None):
    exp_dir = ds.get_path_to_dataset(pb.Experiment.Dataset.Name(exp.dataset))
    save_dir = os.path.join(exp_dir, exp.description)

    ##
    # Load data from .npz archive created by invoking
    # deep_learning/utils/archive.py
    ##
    if data:
        x_train, y_train, x_test, y_test = data
        x_train, x_test = tr.transform(x_train, x_test)
    else:
        h_file, (x_train, y_train, x_test, y_test) = ds.load_dataset(
            pb.Experiment.Dataset.Name(exp.dataset),
            exp.coordinates + '/transformed')
        data = x_train, y_train, x_test, y_test

    exp_file_name = exp.description + '.exp'

    # Start training
    train_length = x_train.shape[0]
    num_batches = int(ceil(train_length / exp.batch_size))
    valid = Validator(exp, terms)
    eTimes = np.array([])
    valid._clock = clock()
    model.summary()
    while valid.check():
        t = clock()
        if valid._num_epochs:
            print("Epoch {}/{}".format(valid.epochs + 1, valid._num_epochs))
        else:
            print("Epoch {}".format(valid.epochs + 1))
        bETA = 0
        bTimes = np.array([])
        #print("\t Training: ")
        for b in xrange(num_batches):
            bt = clock()
            # Update progress bar
            progress(b, num_batches, exp.batch_size, bETA)
            # Train on a batch
            x_batch = x_train[b * exp.batch_size:b * exp.batch_size + exp.batch_size, :]
            y_batch = y_train[b * exp.batch_size:b * exp.batch_size + exp.batch_size, :]
            model.train_on_batch(x_batch, y_batch)
            bTimes = np.append(bTimes, clock() - bt)
            bETA = np.median(bTimes) * (num_batches - b - 1)
        # Finish progress bar
        progress(num_batches, num_batches, exp.batch_size, 0, end='\n', time=clock() - t)

        # Calculate stats and add the epoch results to the experiment object
        epoch = exp.results.add()

        timer = clock()
        print("Evaluating Train")
        epoch.train_loss, epoch.train_accuracy = model.evaluate_generator(
            ((x_train[i * exp.batch_size:(i + 1) * exp.batch_size],
              y_train[i * exp.batch_size:(i + 1) * exp.batch_size])
             for i in xrange(num_batches)),
            num_batches,
            max_q_size=min((num_batches // 2, 10)))
        #print("Finished {:.2f}s".format(clock()-timer))

        timer = clock()
        print("Evaluating Test")
        epoch.test_loss, epoch.test_accuracy = model.evaluate_generator(
            ((x_test[i * exp.batch_size:(i + 1) * exp.batch_size],
              y_test[i * exp.batch_size:(i + 1) * exp.batch_size])
             for i in xrange(int(ceil(x_test.shape[0] / exp.batch_size)))),
            int(ceil(x_test.shape[0] / exp.batch_size)),
            max_q_size=min((int(ceil(x_test.shape[0] / exp.batch_size)) // 2, 10)))
        #print("Finished {:.2f}s".format(clock() - timer))

        timer = clock()
        print("Calculating Sig")
        epoch.s_b = st.significance(model, data)
        #print("Finished {:.2f}".format(clock() - timer))
        #timer = clock()
        #print("Calculating AUC {:.2f}".format(clock()))
        #epoch.auc = st.AUC(model, data, experiment_epoch=epoch)
        #print("Finished {:.2f}".format(clock() - timer))

        timer = clock()
        for r in st.num_of_each_cell(model, data):
            epoch.matrix.add().columns.extend(r)
        print("Making CFM")
        matrix = st.confusion_matrix(model, data, offset='\t ')
        #print("Finished {:.2f}".format(clock() - timer))

        epoch.num_seconds = clock() - t

        timer = clock()
        print("Getting output")
        output = st.get_output_distro(model, data)
        epoch.output.background.extend(output["background"])
        epoch.output.signal.extend(output["signal"])
        #print("Finished {:.2f}".format(clock() - timer))

        # Print statistics
        print("\t Train Accuracy: {:.3f}\tTest Accuracy: {:.3f}".format(
            epoch.train_accuracy, epoch.test_accuracy))
        if valid.update_w():
            print("\t Slope: {:.5f} (test_accuracy / second)".format(valid.slope))
        print("\t Time this epoch: {:.2f}s".format(epoch.num_seconds), end='')
        if valid._num_epochs:
            eTimes = np.append(eTimes, epoch.num_seconds)
            print("\tFinal ETA: {}".format(
                convert_seconds(np.median(eTimes) * (valid._num_epochs - valid.epochs))))
        else:
            print()
        print("\t Significance (S/sqrt(B)): {:.2f}".format(epoch.s_b))
        print("\t Area Under the Curve (efficiency): {:.3f}".format(epoch.auc))
        print(matrix)

        # Saves the model
        if (len(exp.results) % save_freq) == 0:
            save(model, exp, save_dir, exp_file_name)
            print("\t ", end='')
        sys.stdout.flush()

    exp.end_date_time = str(datetime.datetime.now())
    exp.total_time = valid.time
    print("\n" + valid.failed)
    print("Total Time: {}".format(convert_seconds(valid.time)))
    save(model, exp, save_dir, exp_file_name)
    print("\t ", end='')
    h_file.close()
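# Example invocation (a sketch; assumes an Experiment protobuf `exp`, a compiled
# Keras model, and the `terms` termination criteria consumed by Validator, all of
# which are constructed elsewhere in this repo):
#
#   run(model, exp, terms)                # load the archived dataset and train
#   run(model, exp, terms, save_freq=1,
#       data=(x_train, y_train, x_test, y_test))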
def run(model, exp, terms, save_freq=5, data=None):
    exp_dir = ds.get_path_to_dataset(pb.Experiment.Dataset.Name(exp.dataset))
    save_dir = os.path.join(exp_dir, exp.description)

    ##
    # Load data from .npz archive created by invoking
    # deep_learning/utils/archive.py
    ##
    if data:
        x_train, y_train, x_test, y_test = data
        x_train, x_test = tr.transform(x_train, x_test)
    else:
        h_file, (x_train, y_train, x_test, y_test) = ds.load_dataset(pb.Experiment.Dataset.Name(exp.dataset),
                                                                     exp.coordinates)
        x_train, x_test = tr.transform(x_train, x_test)
        data = x_train, y_train, x_test, y_test

    exp_file_name = exp.description + '.exp'

    train_length = x_train.shape[0]
    num_batches = int(ceil(train_length / exp.batch_size))
    valid = Validator(exp, terms)
    eTimes = np.array([])
    valid._clock = clock()
    model.summary()
    while valid.check():
        t = clock()
        if valid._num_epochs:
            print("Epoch {}/{}".format(valid.epochs+1, valid._num_epochs))
        else:
            print("Epoch {}".format(valid.epochs+1))
        bETA = 0
        bTimes = np.array([])
        for b in xrange(num_batches):
            bt = clock()
            # Update progress bar
            progress(b, num_batches, exp.batch_size, bETA)
            # Train on a batch
            model.train_on_batch(x_train[b*exp.batch_size:b*exp.batch_size+exp.batch_size, :],
                                 y_train[b*exp.batch_size:b*exp.batch_size+exp.batch_size, :])
            bTimes = np.append(bTimes, clock()-bt)
            bETA = np.median(bTimes)*(num_batches-b-1)
        # Finish progress bar
        progress(num_batches, num_batches, exp.batch_size, 0, end='\n')

        # Calculate stats and add the epoch results to the experiment object
        epoch = exp.results.add()
        num_test_batches = int(ceil(x_test.shape[0]/exp.batch_size))
        epoch.train_loss, epoch.train_accuracy = model.evaluate_generator(
            ((x_train[i*exp.batch_size:(i+1)*exp.batch_size],
              y_train[i*exp.batch_size:(i+1)*exp.batch_size])
             for i in xrange(num_batches)), num_batches)
        epoch.test_loss, epoch.test_accuracy = model.evaluate_generator(
            ((x_test[i*exp.batch_size:(i+1)*exp.batch_size],
              y_test[i*exp.batch_size:(i+1)*exp.batch_size])
             for i in xrange(num_test_batches)), num_test_batches)
        epoch.s_b = st.significance(model, data)
        epoch.auc = st.AUC(model, data, experiment_epoch=epoch)
        for r in st.num_of_each_cell(model, data):
            epoch.matrix.add().columns.extend(r)
        matrix = st.confusion_matrix(model, data, offset='\t ')
        epoch.num_seconds = clock() - t

        # Print statistics
        print("\t Train Accuracy: {:.3f}\tTest Accuracy: {:.3f}".format(epoch.train_accuracy,
                                                                        epoch.test_accuracy))
        if valid.update_w():
            print("\t Slope: {:.5f} (test_accuracy / second)".format(valid.slope))
        print("\t Time this epoch: {:.2f}s".format(epoch.num_seconds), end='')
        if valid._num_epochs:
            eTimes = np.append(eTimes, epoch.num_seconds)
            print("\tFinal ETA: {}".format(convert_seconds(np.median(eTimes) * (valid._num_epochs - valid.epochs))))
        else:
            print()
        print("\t Significance (S/sqrt(B)): {:.2f}".format(epoch.s_b))
        print("\t Area Under the Curve (efficiency): {:.3f}".format(epoch.auc))
        print(matrix)

        if (len(exp.results) % save_freq) == 0:
            save(model, exp, save_dir, exp_file_name)
            print("\t Saved the model\n")
        sys.stdout.flush()

    exp.end_date_time = str(datetime.datetime.now())
    exp.total_time = valid.time
    print("\n"+valid.failed)
    print("Total Time: {}".format(convert_seconds(valid.time)))
    save(model, exp, save_dir, exp_file_name, graph=True)
    h_file.close()
def save_ratios(dataset, ratios, buffer=1000):
    ratios = [ratios] if type(ratios) is str else ratios
    ratios = map(lambda x: map(float, x.split(':')), ratios)
    data, format = dataset.split('/')
    main_file, (x_train, y_train, x_test, y_test) = ds.load_dataset(data, format, mode='a')

    bkg_test, sig_test = sum_cols(y_test)
    bkg_train, sig_train = sum_cols(y_train)
    TEST_UPPER_LIMIT = int(1.5 * bkg_test) if bkg_test < sig_test else int(1.5 * sig_test)
    TRAIN_UPPER_LIMIT = int(1.5 * bkg_train) if bkg_train < sig_train else int(1.5 * sig_train)

    temp_h_file, temp_h_data = add_group_hdf5(".deep_learning.temp.hdf5", "Temp",
                                              [(bkg_train, x_train.shape[1]),
                                               (bkg_train, y_train.shape[1]),
                                               (sig_train, x_train.shape[1]),
                                               (sig_train, y_train.shape[1]),
                                               (bkg_test, x_test.shape[1]),
                                               (bkg_test, y_test.shape[1]),
                                               (sig_test, x_test.shape[1]),
                                               (sig_test, y_test.shape[1])],
                                              names=["train_bkg_x", "train_bkg_y",
                                                     "train_sig_x", "train_sig_y",
                                                     "test_bkg_x", "test_bkg_y",
                                                     "test_sig_x", "test_sig_y"])

    print "Generating temporary files..."
    for i in xrange(int(math.ceil(x_train.shape[0] / buffer))):
        # The boolean index must have the same shape as the array being indexed,
        # so the result is reshaped after selection.
        train_bkg_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        train_sig_index = np.array([[False]*x_train.shape[1]]*x_train.shape[0])
        test_bkg_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])
        test_sig_index = np.array([[False]*x_test.shape[1]]*x_test.shape[0])
        for j in xrange(x_train.shape[1]):
            train_bkg_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 0] == 1
            train_sig_index[i * buffer:(i + 1) * buffer, j] = y_train[i * buffer:(i + 1) * buffer, 1] == 1
        for j in xrange(x_test.shape[1]):
            test_bkg_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 0] == 1
            test_sig_index[i * buffer:(i + 1) * buffer, j] = y_test[i * buffer:(i + 1) * buffer, 1] == 1

        selection = x_train[train_bkg_index]
        temp_h_data[0].append(selection.reshape((selection.size/x_train.shape[1], x_train.shape[1])))
        selection = y_train[train_bkg_index[:, :y_train.shape[1]]]
        temp_h_data[1].append(selection.reshape((selection.size/y_train.shape[1], y_train.shape[1])))
        selection = x_train[train_sig_index]
        temp_h_data[2].append(selection.reshape((selection.size/x_train.shape[1], x_train.shape[1])))
        selection = y_train[train_sig_index[:, :y_train.shape[1]]]
        temp_h_data[3].append(selection.reshape((selection.size/y_train.shape[1], y_train.shape[1])))
        selection = x_test[test_bkg_index]
        temp_h_data[4].append(selection.reshape((selection.size/x_test.shape[1], x_test.shape[1])))
        selection = y_test[test_bkg_index[:, :y_test.shape[1]]]
        temp_h_data[5].append(selection.reshape((selection.size/y_test.shape[1], y_test.shape[1])))
        selection = x_test[test_sig_index]
        temp_h_data[6].append(selection.reshape((selection.size/x_test.shape[1], x_test.shape[1])))
        selection = y_test[test_sig_index[:, :y_test.shape[1]]]
        temp_h_data[7].append(selection.reshape((selection.size/y_test.shape[1], y_test.shape[1])))

    # Perform all of this through the archive so that data is written to file every iteration
    buffer_reset = buffer
    for rat in ratios:
        print "Creating ratio {:d}/{:d} ...".format(*map(int, rat))
        h_file, h_data = add_group_hdf5(ds.get_path_to_dataset(data)+os.sep+data+".hdf5",
                                        "{}to{}".format(*map(int, rat)),
                                        [(TRAIN_UPPER_LIMIT, x_train.shape[1]),
                                         (TRAIN_UPPER_LIMIT, y_train.shape[1]),
                                         (TEST_UPPER_LIMIT, x_test.shape[1]),
                                         (TEST_UPPER_LIMIT, y_test.shape[1])],
                                        where='/{}'.format(format))
        test_bkg_indices = np.arange(bkg_test)
        test_sig_indices = np.arange(sig_test)
        train_bkg_indices = np.arange(bkg_train)
        train_sig_indices = np.arange(sig_train)

        train_count = 0
        buffer = buffer_reset
        while train_count < TRAIN_UPPER_LIMIT:
            if TRAIN_UPPER_LIMIT - train_count < buffer:
                buffer = TRAIN_UPPER_LIMIT - train_count
            # Indices to NOT include
            train_bkg_ix = np.random.choice(train_bkg_indices,
                                            train_bkg_indices.size - (rat[0] * buffer / sum(rat)),
                                            replace=False)
            train_sig_ix = np.random.choice(train_sig_indices,
                                            train_sig_indices.size - (rat[1] * buffer / sum(rat)),
                                            replace=False)
            # Indices to keep
            k_train_bkg = np.setdiff1d(train_bkg_indices, train_bkg_ix)
            k_train_sig = np.setdiff1d(train_sig_indices, train_sig_ix)

            train_small_x_sig = temp_h_data[2][k_train_sig]
            train_small_y_sig = temp_h_data[3][k_train_sig]
            train_small_x_bkg = temp_h_data[0][k_train_bkg]
            train_small_y_bkg = temp_h_data[1][k_train_bkg]

            train_x = np.concatenate((train_small_x_bkg, train_small_x_sig))
            train_y = np.concatenate((train_small_y_bkg, train_small_y_sig))

            tr.shuffle_in_unison(train_x, train_y)

            h_data[0].append(train_x)
            h_data[1].append(train_y)

            train_count += k_train_bkg.size + k_train_sig.size
            train_bkg_indices = train_bkg_ix
            train_sig_indices = train_sig_ix

        test_count = 0
        buffer = buffer_reset
        while test_count < TEST_UPPER_LIMIT:
            if TEST_UPPER_LIMIT - test_count < buffer:
                buffer = TEST_UPPER_LIMIT - test_count
            # Indices to NOT include
            test_bkg_ix = np.random.choice(test_bkg_indices,
                                           test_bkg_indices.size - (rat[0] * buffer / sum(rat)),
                                           replace=False)
            test_sig_ix = np.random.choice(test_sig_indices,
                                           test_sig_indices.size - (rat[1] * buffer / sum(rat)),
                                           replace=False)
            # Indices to keep
            k_test_bkg = np.setdiff1d(test_bkg_indices, test_bkg_ix)
            k_test_sig = np.setdiff1d(test_sig_indices, test_sig_ix)

            test_small_x_sig = temp_h_data[6][k_test_sig]
            test_small_y_sig = temp_h_data[7][k_test_sig]
            test_small_x_bkg = temp_h_data[4][k_test_bkg]
            test_small_y_bkg = temp_h_data[5][k_test_bkg]

            test_x = np.concatenate((test_small_x_bkg, test_small_x_sig))
            test_y = np.concatenate((test_small_y_bkg, test_small_y_sig))

            tr.shuffle_in_unison(test_x, test_y)

            h_data[2].append(test_x)
            h_data[3].append(test_y)

            test_count += k_test_bkg.size + k_test_sig.size
            test_bkg_indices = test_bkg_ix
            test_sig_indices = test_sig_ix

        print "Created Group: {}/{}to{}".format(format, *map(int, rat))
        h_file.flush()
        h_file.close()

    main_file.close()
    temp_h_file.close()
    os.remove(".deep_learning.temp.hdf5")