import argparse
import os
import sys
import pickle
import json

import h5py
import numpy as np
from numpy.lib import recfunctions
import matplotlib.pyplot as plt

# NOTE: project-local helpers (load_input_file, build_combined_input,
# build_and_train, chunk_generator, floatify, build_discriminant_array,
# build_discriminants, sample_with_label, valid_idx, mkdir_p) and the
# wwbb_models module are assumed to be imported from elsewhere in this package.


def main():

    parser = argparse.ArgumentParser(
        description="Train a Keras model over your pre-processed files")
    parser.add_argument(
        "-i",
        "--input",
        help="Provide input, pre-processed HDF5 file with training, validation, and scaling data",
        required=True)
    parser.add_argument(
        "-m",
        "--model-name",
        help="Provide the name of the model to build and train",
        required=True)
    parser.add_argument(
        "-o",
        "--outdir",
        help="Provide an output directory to dump files [default: ./]",
        default="./")
    parser.add_argument("-n",
                        "--name",
                        help="Provide output filename descriptor",
                        default="test")
    parser.add_argument("-v",
                        "--verbose",
                        action="store_true",
                        default=False,
                        help="Be loud about it")
    parser.add_argument("--regress",
                        help="Provide a variable to regress on",
                        default="")
    args = parser.parse_args()

    training_samples, data_scaler = load_input_file(args)
    if len(training_samples) < 2:
        print("ERROR there are not enough training samples loaded to perform a training")
        sys.exit(1)
    print("Pre-processed file contained {} samples: {}, {}".format(
        len(training_samples), [s.name() for s in training_samples],
        [s.class_label() for s in training_samples]))

    input_features, targets, regression_targets = build_combined_input(
        training_samples,
        data_scaler=data_scaler,
        scale=True,
        regress_var=args.regress)

    # TODO: the regression targets are built above but are not yet passed
    # into the training step
    n_inputs = len(data_scaler.feature_list())
    n_outputs = len(training_samples)
    my_model = wwbb_models.get_model(args.model_name)
    model, fit_history = build_and_train(my_model, n_inputs, n_outputs,
                                         input_features, targets)

    # dump the fit history to file for later use
    with open("fit_history_{}.pkl".format(args.name), 'wb') as pickle_history:
        pickle.dump(fit_history.history, pickle_history)

    # save
    job_suff = "_{}".format(args.name) if args.name else ""
    arch_name = "architecture{}.json".format(job_suff)
    weights_name = "weights{}.h5".format(job_suff)
    if args.outdir != "":
        mkdir_p(args.outdir)
        arch_name = "{}/{}".format(args.outdir, arch_name)
        weights_name = "{}/{}".format(args.outdir, weights_name)
    print("Saving architecture to: {}".format(os.path.abspath(arch_name)))
    print("Saving weights to     : {}".format(os.path.abspath(weights_name)))
    with open(arch_name, 'w') as arch_file:
        arch_file.write(model.to_json())
    model.save_weights(weights_name)
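
# Example invocation of the training entry point above (the file names and
# model name here are hypothetical, chosen only to illustrate the expected
# arguments):
#
#   python train.py --input wwbb_preprocessed.h5 --model-name my_model \
#       --outdir ./training_output/ --name test_run
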
def extract_scale_dataset(args, ignore_features=['eventweight', 'eventNumber']):
    """
    From the user-provided path to the input HDF5 file produced by
    'preprocess.py', extract the scaling dataset and store it in a new
    output file.

    Having the scaling data contained in the file made by 'preprocess.py'
    is useful as it keeps it near the data that produced it (which makes
    training and validation easier), but when expanding out and using the
    network on general datasets, it is nicer to have it separate.

    Currently this does not worry about handling "features to ignore",
    like event weights. The DataScaler which loads this should handle that.

    Args:
        args : command-line user-input
    """

    scaling_group_name = "scaling"
    scaling_dataset_name = "scaling_data"

    # keep everything inside the 'with' block so that the h5py dataset
    # handle remains valid while we read from it
    with h5py.File(args.input, 'r', libver='latest') as input_file:
        if scaling_group_name in input_file:
            scaling_group = input_file[scaling_group_name]
            scaling_dataset = scaling_group[scaling_dataset_name]
        else:
            raise Exception("Input file (={}) does not contain the expected "
                            "scaling dataset".format(args.input))

        output_name = args.input.split("/")[-1].replace(".h5", "").replace(".hdf5", "")
        output_name += "_scaling_data"
        if args.outdir != "":
            mkdir_p(args.outdir)
            output_name = "{}/{}".format(os.path.abspath(args.outdir), output_name)

        if args.to_json:
            output_json_name = output_name + ".json"
            # the fields are named in a specific way, if they are not there
            # as expected then exit
            for field in ['name', 'mean', 'scale']:
                if field not in scaling_dataset.dtype.fields.keys():
                    print("ERROR '{}' field not in scaling dataset".format(field))
                    sys.exit(1)
            variables = scaling_dataset['name']
            scales = scaling_dataset['scale']
            offsets = scaling_dataset['mean']
            jdata = {}
            jdata["variables"] = []
            for ivar, varname in enumerate(variables):
                # h5py returns string fields as bytes in Python 3, which
                # json.dump cannot serialize, so decode them first
                if isinstance(varname, bytes):
                    varname = varname.decode('utf-8')
                jdata["variables"].append({
                    "name": varname,
                    # cast to built-in float so json can serialize 32-bit
                    # numpy values
                    "offset": float(-1.0 * offsets[ivar]),
                    "scale": float(1.0 / scales[ivar])
                })
            with open(output_json_name, 'w') as jsonfile:
                json.dump(jdata, jsonfile)

        output_name += ".h5"
        with h5py.File(output_name, 'w') as output_file:
            scaling_group = output_file.create_group(scaling_group_name)
            # store the name of the original input file so that we can
            # (loosely) correlate it with this output file in case this
            # output file's name drastically differs
            scaling_group.attrs['original_input_file'] = args.input
            scaling_group.create_dataset(scaling_dataset_name,
                                         shape=scaling_dataset.shape,
                                         dtype=scaling_dataset.dtype,
                                         data=scaling_dataset,
                                         maxshape=(None, ))
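
# A minimal sketch (not part of the original workflow, function name is
# illustrative) of how a downstream consumer could apply the scaling JSON
# written by 'extract_scale_dataset'. Since offset = -mean and
# scale = 1/std, (x + offset) * scale reproduces (x - mean) / std.
def apply_json_scaling(raw_features, scaling_json_path):
    """Apply the JSON-serialized feature scaling to an array of raw features."""
    with open(scaling_json_path, 'r') as jf:
        scaling = json.load(jf)
    offsets = np.array([v["offset"] for v in scaling["variables"]])
    scales = np.array([v["scale"] for v in scaling["variables"]])
    # (x + (-mean)) * (1/std) == (x - mean) / std
    return (raw_features + offsets) * scales
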
def dump_scores(input_file, model, data_scaler, args):
    """
    From the input HDF5 file, go through it and get the NN output for the
    features, storing them to a single output file whose name is based on
    the input filename.

    Args:
        input_file : input filename for HDF5 file to be opened and processed
        model : loaded Keras model
        data_scaler : loaded DataScaler object used to scale the input
            features prior to network evaluation
        args : command line inputs
    """

    outname = input_file.split("/")[-1].replace(".h5", "").replace(".hdf5", "")
    outname += "_scores.h5"
    if args.outdir != "":
        mkdir_p(args.outdir)
        outname = "{}/{}".format(args.outdir, outname)

    dataset_id = 0
    with h5py.File(outname, 'w', libver='latest') as outfile:
        for chunk in chunk_generator(input_file, dataset_name=args.dataset):
            # apply the selection here
            chunk = chunk[(chunk['nBJets'] >= 1) & (chunk['HT2Ratio'] > 0.5)]
            if chunk.size == 0:
                continue
            weights = chunk['eventweight']
            input_features = chunk[data_scaler.feature_list()]
            input_features = floatify(input_features, data_scaler.feature_list())
            input_features = (input_features - data_scaler.mean()) / data_scaler.scale()
            scores = model.predict(input_features)
            n_outputs = scores.shape[1]
            discriminants = build_discriminant_array(scores, n_outputs)
            ds = np.array(list(weights), dtype=[('eventweight', float)])
            for io in range(n_outputs):
                ds = recfunctions.append_fields(ds,
                                                names='nn_score_{}'.format(io),
                                                data=scores[:, io],
                                                dtypes=np.float64)
            for io in range(n_outputs):
                ds = recfunctions.append_fields(ds,
                                                names='nn_disc_{}'.format(io),
                                                data=discriminants[io],
                                                dtypes=np.float64)
            maxshape = (None, ) + ds.shape[1:]
            dsname = "nn_scores_{}".format(dataset_id)
            out_ds = outfile.create_dataset(dsname,
                                            shape=ds.shape,
                                            maxshape=maxshape,
                                            chunks=ds.shape,
                                            dtype=ds.dtype)
            out_ds[:] = ds
            dataset_id += 1
    print(" > output saved : {}".format(os.path.abspath(outname)))
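
# A minimal sketch (file layout inferred from 'dump_scores' above, function
# name is illustrative) showing how to read back and concatenate the
# per-chunk 'nn_scores_*' datasets from the output file.
def read_dumped_scores(scores_file):
    """Concatenate the per-chunk score datasets written by dump_scores."""
    arrays = []
    with h5py.File(scores_file, 'r') as infile:
        names = [n for n in infile.keys() if n.startswith("nn_scores_")]
        # sort on the numeric suffix so that nn_scores_10 follows nn_scores_9
        for name in sorted(names, key=lambda n: int(n.split("_")[-1])):
            arrays.append(infile[name][:])
    return np.concatenate(arrays) if arrays else None
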
def make_nn_roc_curve(output_scores=None,
                      samples=[],
                      inputs=None,
                      targets=None,
                      signal_class=0,
                      args=None):

    class_labels = set(targets)
    targets_list = list(targets)
    nn_scores_dict = {}
    names = {}
    for sample in samples:
        names[sample.class_label()] = sample.name()
    for ilabel, label in enumerate(class_labels):
        # the targets for a given label are contiguous, so the slice between
        # the left-most and right-most appearance selects that label's scores
        left = targets_list.index(label)
        right = len(targets_list) - 1 - targets_list[::-1].index(label)
        nn_scores_dict[label] = output_scores[left:right + 1]

    lowbin = 0
    highbin = 1
    edges = np.concatenate([[-np.inf],
                            np.linspace(lowbin, highbin, 500), [np.inf]])

    # we want the sample efficiency to pass the signal eff
    sample_eff = {}
    h_total = []
    for label in nn_scores_dict:
        # select out the scores for class 'label' for NN output 'signal_class'
        scores = nn_scores_dict[label][:, signal_class]
        weights = sample_with_label(label, samples).eventweights
        h_nn, _ = np.histogram(scores,
                               bins=edges,
                               weights=weights.reshape((scores.shape[0], )))
        if label != signal_class:
            h_total.append(h_nn)
        # We want to integrate from the high end and then flip
        # to give the yield "to the right" of the value at
        # which the integration starts, since "to the right" is
        # signal like. We also normalize to give the value as
        # a relative fraction, or efficiency, of selecting that sample at
        # the given value where we integrate from.
        eff = np.cumsum(h_nn[::-1])[::-1] / h_nn.sum()
        sample_eff[label] = eff

    summed_bkg = np.sum(h_total, axis=0)
    eff_total_bkg = np.cumsum(summed_bkg[::-1])[::-1] / summed_bkg.sum()

    signal_eff = None
    bkg_eff = {}
    for e in sample_eff:
        if e == signal_class:
            signal_eff = sample_eff[e]
        else:
            bkg_eff[e] = sample_eff[e]

    fig, ax = plt.subplots(1, 1)
    for bkg_label in bkg_eff:
        bkg = bkg_eff[bkg_label]
        valid_rej = bkg > 0
        sig = np.array(signal_eff[:])
        valid_sig = (sig != 1.0)
        valid = valid_rej & valid_sig
        bkg = bkg[valid]
        sig = sig[valid]
        bkg_rej = 1 / bkg
        ax.plot(sig, bkg_rej, label=names[bkg_label])

    valid_rej_total = eff_total_bkg > 0
    sig = np.array(signal_eff[:])
    valid_sig_total = sig != 1.0
    valid_total = valid_rej_total & valid_sig_total
    bkg_total = eff_total_bkg[valid_total]
    sig_total = sig[valid_total]
    bkg_rej_total = 1 / bkg_total
    ax.plot(sig_total, bkg_rej_total, label="Total Bkg")

    ax.set_yscale('log')
    ax.set_xlabel('$hh$ efficiency', horizontalalignment='right', x=1)
    ax.set_ylabel('Background rejection, $1/\\epsilon_{bkg}$',
                  horizontalalignment='right',
                  y=1)
    ax.legend(loc='best', frameon=False)

    # save
    savename = "nn_output_ROC_{}.pdf".format(args.name)
    if args.outdir != "":
        mkdir_p(args.outdir)
        savename = "{}/{}".format(args.outdir, savename)
    fig.savefig(savename, bbox_inches='tight', dpi=200)
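
# A small worked example of the reversed-cumulative-sum trick used above to
# turn a histogram into a "to the right" integration efficiency (the numbers
# here are made up purely to illustrate the shape of the computation):
#
#   h = np.array([1., 2., 3., 4.])
#   np.cumsum(h[::-1])[::-1] / h.sum()  ->  [1.0, 0.9, 0.7, 0.4]
#
# i.e. entry i is the fraction of the total yield at or above bin i, which is
# the efficiency of a cut that keeps everything to the right of that bin.
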
def make_discriminant_plots(model=None,
                            inputs=None,
                            samples=None,
                            targets=None,
                            args=None):

    nn_scores = model.predict(inputs)
    class_labels = set(targets)
    targets_list = list(targets)

    # index the sample names by their class label
    names = {}
    for sample in samples:
        names[sample.class_label()] = sample.name()

    discriminants = build_discriminants(scores=nn_scores,
                                        labels=class_labels,
                                        targets_list=targets_list)

    idx_map = {}
    for ilabel, label in enumerate(class_labels):
        # left-most appearance of the label
        left = targets_list.index(label)
        # right-most appearance of the label
        right = len(targets_list) - 1 - targets_list[::-1].index(label)
        idx_map[label] = [left, right + 1]

    for label in class_labels:
        fig, ax = plt.subplots(1, 1)
        binning = np.arange(-40, 20, 1)
        centers = (binning[1:-2] + binning[2:-1]) / 2
        ax.set_xlim((centers[0] - 0.1, centers[-1] + 0.1))
        ax.set_yscale('log')
        for sample_label in discriminants:
            left, right = idx_map[sample_label][0], idx_map[sample_label][1]
            disc_scores_for_sample = discriminants[label][left:right]
            # since we took the log-ratio, clear out any invalid numbers
            ok_idx = valid_idx(disc_scores_for_sample)
            disc_scores_for_sample = disc_scores_for_sample[ok_idx]
            sample_weights = sample_with_label(sample_label, samples).eventweights
            sample_weights = sample_weights[ok_idx]
            yields, _ = np.histogram(
                disc_scores_for_sample,
                bins=binning,
                weights=sample_weights.reshape(
                    (disc_scores_for_sample.shape[0], )))
            yields = yields / yields.sum()
            ax.step(centers, yields[1:-1], label=names[sample_label], where='mid')
        ax.legend(loc='best', frameon=False)
        savename = "nn_discriminant_{}_class_{}.pdf".format(args.name, names[label])
        if args.outdir != "":
            mkdir_p(args.outdir)
            savename = "{}/{}".format(args.outdir, savename)
        fig.savefig(savename, bbox_inches='tight', dpi=200)
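
# The helper 'build_discriminants' is defined elsewhere in this package; a
# minimal sketch of the kind of quantity it is assumed to compute (given the
# roughly [-40, 20] log-scale binning above) is the standard log-ratio
# discriminant for output column i, d_i = log(p_i / sum_{j != i} p_j). This
# sketch is not necessarily identical to the project's implementation.
def log_ratio_discriminant(scores, idx):
    """Log-ratio discriminant for NN output column 'idx' (hedged sketch)."""
    p_signal = scores[:, idx]
    p_others = scores.sum(axis=1) - p_signal
    # suppress warnings for entries where p_others is zero; downstream code
    # is expected to filter invalid values (cf. valid_idx above)
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.log(p_signal / p_others)
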
def make_nn_output_plots(model=None,
                         inputs=None,
                         samples=None,
                         targets=None,
                         args=None):

    # set of scores for each label: shape = (n_samples, n_outputs)
    nn_scores = model.predict(inputs)
    class_labels = set(targets)
    targets_list = list(targets)
    nn_scores_dict = {}

    # index the sample names by their class label
    names = {}
    for sample in samples:
        names[sample.class_label()] = sample.name()

    # break up the predicted scores by the class label
    for ilabel, label in enumerate(class_labels):
        # left-most appearance of the label
        left = targets_list.index(label)
        # right-most appearance of the label
        right = len(targets_list) - 1 - targets_list[::-1].index(label)
        nn_scores_dict[label] = nn_scores[left:right + 1]

    # start plotting
    for label in class_labels:
        fig, ax = plt.subplots(1, 1)
        ax.grid(color='k',
                which='both',
                linestyle='--',
                lw=0.5,
                alpha=0.1,
                zorder=0)
        ax.set_xlabel("NN output for label {}".format(names[label]),
                      horizontalalignment='right',
                      x=1)
        ax.set_yscale('log')
        binning = np.arange(0, 1, 0.02)
        centers = (binning[1:-2] + binning[2:-1]) / 2
        ax.set_xlim((centers[0] - 0.1, centers[-1] + 0.1))
        for sample_label in nn_scores_dict:
            sample_scores_for_label = nn_scores_dict[sample_label][:, label]
            sample_weights = sample_with_label(sample_label, samples).eventweights
            yields, _ = np.histogram(
                sample_scores_for_label,
                bins=binning,
                weights=sample_weights.reshape(
                    (sample_scores_for_label.shape[0], )))
            yields = yields / yields.sum()
            ax.step(centers, yields[1:-1], label=names[sample_label], where='mid')
        ax.legend(loc='best', frameon=False)
        savename = "nn_outputs_{}_class_{}.pdf".format(args.name, names[label])
        if args.outdir != "":
            mkdir_p(args.outdir)
            savename = "{}/{}".format(args.outdir, savename)
        fig.savefig(savename, bbox_inches='tight', dpi=200)

    return nn_scores
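
# Standard entry-point guard so that the functions above can be imported from
# other scripts without triggering a training run (a small addition, assuming
# this module is also meant to be run directly).
if __name__ == "__main__":
    main()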