def NN_validate(filename, class_number=1, cut=0., original_n_events=20000):
    X = rootnp.root2array(args.ValidationDir + "/" + filename[0], "tree")
    X = rootnp.rec2array(X)
    for i in range(len(filename)):
        if i == 0:
            continue
        X_ = rootnp.root2array(args.ValidationDir + "/" + filename[i], "tree")
        X_ = rootnp.rec2array(X_)
        X = np.concatenate((X, X_))
    model = load_model(args.TrainingFile)
    scaler = pickle.load(open(args.ScalerFile, 'r'))
    X = scaler.transform(X)
    if class_number == -1:
        coupling_name = filename[0].split("_")[0]
        coupling_class = classes_dict[coupling_name]
        preds = model.predict(X)  # predict once, then slice per class
        discr_dict = {}
        for class_n in set(i for j, i in classes_dict.iteritems()):
            discr_dict[class_n] = preds[:, class_n]
        # discriminator: P(coupling) / (P(class 0) + P(coupling))
        discr = np.asarray([
            j / (discr_dict[0][jdx] + discr_dict[coupling_class][jdx])
            for jdx, j in enumerate(discr_dict[coupling_class])
        ])
    else:
        discr = model.predict(X)[:, class_number]
    nEvents = len(discr)
    print float(len(discr)), original_n_events, 100 * float(len(discr)) / float(original_n_events), "%"
    discr = discr[discr >= cut]
    print "selection efficiency NN cut: ", 100 * float(len(discr)) / float(nEvents)
    return float(len(discr)) / float(original_n_events)
def make_dataset(signals, backgrounds, category, region, fields, cuts=None):
    signal_arrs = []
    signal_weight_arrs = []
    background_arrs = []
    background_weight_arrs = []
    for signal in signals:
        rec = signal.merged_records(
            category=category, region=region, fields=fields, cuts=cuts)
        signal_weight_arrs.append(rec['weight'])
        signal_arrs.append(rec2array(rec, fields))
    for background in backgrounds:
        rec = background.merged_records(
            category=category, region=region, fields=fields, cuts=cuts)
        background_weight_arrs.append(rec['weight'])
        background_arrs.append(rec2array(rec, fields))
    signal_array = np.concatenate(signal_arrs)
    signal_weight_array = np.concatenate(signal_weight_arrs)
    background_array = np.concatenate(background_arrs)
    background_weight_array = np.concatenate(background_weight_arrs)
    return (signal_array, signal_weight_array,
            background_array, background_weight_array)
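# Usage sketch for make_dataset (hypothetical: `signals`, `backgrounds`,
# `category`, `region` and `fields` are assumed to come from the surrounding
# analysis framework, they are not defined here). The four returned arrays
# map directly onto a scikit-learn style training set.
sig, sig_w, bkg, bkg_w = make_dataset(signals, backgrounds,
                                      category, region, fields)
X = np.concatenate((sig, bkg))
y = np.concatenate((np.ones(len(sig)), np.zeros(len(bkg))))
w = np.concatenate((sig_w, bkg_w))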
def AddToROC(self, filename):
    """
    Info of the root file, the name of the probability branches
    and the target (0 or 1 or ...)
    """
    # Check whether this file is to be taken into account #
    valid_file = False
    for key, value in self.selector.items():
        if key in os.path.basename(filename):
            target = value
            valid_file = True
    if not valid_file:
        return False  # file not to be taken into account
    # Get the output prob #
    if self.weight_name and self.weight_name != '':
        probs = rec2array(root2array(filename, self.tree,
                                     branches=self.prob_branches + [self.weight_name],
                                     selection=self.cut))
        self.prob_per_class = np.concatenate((self.prob_per_class, probs[:, :-1]), axis=0)
        self.weight = np.concatenate((self.weight, probs[:, -1].reshape(-1, 1)), axis=0)
    else:
        probs = rec2array(root2array(filename, self.tree,
                                     branches=self.prob_branches,
                                     selection=self.cut))
        self.prob_per_class = np.concatenate((self.prob_per_class, probs), axis=0)
        self.weight = None
    # Binarize the targets #
    # e.g. target = 1 and classes = [0, 1, 2] => scores = [0, 1, 0]
    target_arr = self.lb.transform([target] * probs.shape[0])
    self.scores = np.concatenate((self.scores, target_arr), axis=0)
    return True
def test_rec2array():
    a = np.array([
        (12345, 2., 2.1, True),
        (3, 4., 4.2, False)],
        dtype=[('x', np.int32), ('y', np.float32),
               ('z', np.float64), ('w', np.bool)])
    arr = rnp.rec2array(a)
    assert_array_equal(arr,
        np.array([[12345, 2, 2.1, 1],
                  [3, 4, 4.2, 0]]))
    arr = rnp.rec2array(a, fields=['x', 'y'])
    assert_array_equal(arr,
        np.array([[12345, 2],
                  [3, 4]]))
    # single field
    arr = rnp.rec2array(a, fields=['x'])
    assert_equal(arr.ndim, 1)
    assert_equal(arr.shape, (a.shape[0],))
    # array fields
    a = np.array([
        ([1, 2, 3], [4.5, 6, 9.5]),
        ([4, 5, 6], [3.3, 7.5, 8.4])],
        dtype=[('x', np.int32, (3,)), ('y', np.float32, (3,))])
    arr = rnp.rec2array(a)
    assert_array_almost_equal(arr,
        np.array([[[1, 4.5], [2, 6], [3, 9.5]],
                  [[4, 3.3], [5, 7.5], [6, 8.4]]]))
def import_data():
    signal = root2array(BASE_PATH + "0nubb/sensitivity_0nubb_1E7_Pre_Cut.root",
                        "Sensitivity", BRANCH_NAMES_TRAIN)
    signal = rec2array(signal)

    bkg2nu = root2array(BASE_PATH + "2nubb/sensitivity_2nubb_2E8_Pre_Cut.root",
                        "Sensitivity", BRANCH_NAMES_TRAIN)
    bkg2nu = rec2array(bkg2nu)

    bkg214Bi = root2array(
        BASE_PATH + "Bi214/sensitivity_Bi214_Foils_2E8_Pre_Cut.root",
        "Sensitivity", BRANCH_NAMES_TRAIN)
    bkg214Bi = rec2array(bkg214Bi)

    bkg208Tl = root2array(
        BASE_PATH + "Tl208/sensitivity_Tl208_Foils_2E8_Pre_Cut.root",
        "Sensitivity", BRANCH_NAMES_TRAIN)
    bkg208Tl = rec2array(bkg208Tl)

    bkgRn = root2array(
        BASE_PATH + "Radon/sensitivity_Bi214_Wires_2E8_Pre_Cut.root",
        "Sensitivity", BRANCH_NAMES_TRAIN)
    bkgRn = rec2array(bkgRn)

    return signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn
def tvars(rootfile, first, last):
    stringa = ["seed_pt", "seed_eta", "seed_phi", "seed_mass", "seed_dz", "seed_dxy",
               "seed_3D_ip", "seed_3D_sip", "seed_2D_ip", "seed_2D_sip",
               "seed_3D_signedIp", "seed_3D_signedSip", "seed_2D_signedIp", "seed_2D_signedSip",
               "seed_chi2reduced", "seed_nPixelHits", "seed_nHits",
               "seed_jetAxisDistance", "seed_jetAxisDlength"]
    stringa2 = ["nearTracks_pt", "nearTracks_eta", "nearTracks_phi", "nearTracks_dz",
                "nearTracks_dxy", "nearTracks_mass", "nearTracks_3D_ip", "nearTracks_3D_sip",
                "nearTracks_2D_ip", "nearTracks_2D_sip", "nearTracks_PCAdist", "nearTracks_PCAdsig",
                "nearTracks_PCAonSeed_x", "nearTracks_PCAonSeed_y", "nearTracks_PCAonSeed_z",
                "nearTracks_PCAonSeed_xerr", "nearTracks_PCAonSeed_yerr", "nearTracks_PCAonSeed_zerr",
                "nearTracks_PCAonTrack_x", "nearTracks_PCAonTrack_y", "nearTracks_PCAonTrack_z",
                "nearTracks_PCAonTrack_xerr", "nearTracks_PCAonTrack_yerr", "nearTracks_PCAonTrack_zerr",
                "nearTracks_dotprodTrack", "nearTracks_dotprodSeed", "nearTracks_dotprodTrackSeed2D",
                "nearTracks_dotprodTrackSeed3D", "nearTracks_dotprodTrackSeedVectors2D",
                "nearTracks_dotprodTrackSeedVectors3D", "nearTracks_PCAonSeed_pvd",
                "nearTracks_PCAonTrack_pvd", "nearTracks_PCAjetAxis_dist",
                "nearTracks_PCAjetMomenta_dotprod", "nearTracks_PCAjetDirs_DEta",
                "nearTracks_PCAjetDirs_DPhi"]
    f = TFile(rootfile)
    # tree = f.Get("analyzer1/tree")
    tree = root_numpy.tree2array(f.Get('analyzer1/tree'), branches=stringa2,
                                 selection="(jet_pt>30)&&(abs(jet_eta)<2.4)",
                                 start=first, stop=last)
    print "loaded"
    tree2 = root_numpy.rec2array(tree)
    print tree2.shape
    print round(time.time() - starttime, 2), "reshape"
    tree3 = tree2.reshape((200, 36, len(tree)))
    tree3 = tree3.reshape((10, 720, len(tree)))
    print tree3.shape
    tree3 = tree3.swapaxes(0, 2)
    t2 = root_numpy.tree2array(f.Get('analyzer1/tree'), branches=stringa,
                               selection="(jet_pt>30)&&(abs(jet_eta)<2.4)",
                               start=first, stop=last)
    t2 = root_numpy.rec2array(t2)
    print t2.shape
    t2 = t2.reshape((10, len(stringa), len(tree)))
    t2 = t2.swapaxes(0, 2)
    tree5 = numpy.concatenate((t2, tree3), axis=1)
    print tree5.shape
    outname = "tvars_" + str(first) + "_" + str(last) + "_" + rootfile.split(".")[0] + ".npy"
    numpy.save(outname, tree5)
    print time.time() - starttime
    f.Close()
    os.system("mv " + outname + " /gpfs/ddn/users/lgiannini/NN/DataMiniAODNewValidation")
def import_data_small_2():
    signal = root2array(BASE_PATH + "sensitivity_0nubb_1E5_Pred_With_Cut.root",
                        "Sensitivity", BRANCH_NAMES_TEST)
    signal = rec2array(signal)

    bkg2nu = root2array(
        BASE_PATH + "sensitivity_2nubb_1E5_Small_Pred_With_Cut.root",
        "Sensitivity", BRANCH_NAMES_TEST)
    bkg2nu = rec2array(bkg2nu)

    bkg214Bi = root2array(
        BASE_PATH + "sensitivity_Bi214_Foils_Small_Pred_With_Cut.root",
        "Sensitivity", BRANCH_NAMES_TEST)
    bkg214Bi = rec2array(bkg214Bi)

    bkg208Tl = root2array(
        BASE_PATH + "sensitivity_Tl208_Foils_Small_Pred_With_Cut.root",
        "Sensitivity", BRANCH_NAMES_TEST)
    bkg208Tl = rec2array(bkg208Tl)

    bkgRn = root2array(
        BASE_PATH + "sensitivity_Bi214_Wires_Small_Pred_With_Cut.root",
        "Sensitivity", BRANCH_NAMES_TEST)
    bkgRn = rec2array(bkgRn)

    return signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn
def load_data(filename, use_mc=False, cut_data=False):
    '''Load ROOT TTrees, return numpy arrays'''
    # Get the number of branches (+1 for radius, 2 for each array entry)
    tf = TFile(filename, "read")
    n_branches = len(tf.Get("tree").GetListOfBranches())
    n_inputs = (n_branches - 1) / 2

    # Open the files and transform branches to numpy arrays
    if use_mc is True:
        branches_in = ["hitPatternMC_{0}".format(i) for i in range(n_inputs)]
    else:
        branches_in = ["hitPatternFit_{0}".format(i) for i in range(n_inputs)]
    print "MAX:", max(branches_in), branches_in[-1]
    branches_out = ["radius"]

    # root2array converts ROOT tree entries into a numpy record array
    # (branches still in lists)
    ann_inputs = root2array(filename, "tree", branches_in)
    ann_output = root2array(filename, "tree", branches_out)

    # rec2array converts the list entries to an array for each record
    data_in = rec2array(ann_inputs)
    data_out = rec2array(ann_output)

    if cut_data is True:
        # Remove samples at z > 7m
        upper_neck = data_out < 7000
        print upper_neck
        data_out = data_out[upper_neck]
        data_in = data_in[upper_neck]

    # Normalise radius (even though this is a regression problem,
    # feature scaling still makes a big difference for the ANN)
    data_out = data_out / _radius_scale  # on the order of the AV radius
    return data_in, data_out
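# Self-contained illustration of the boolean-mask cut used in load_data
# above (pure numpy, no ROOT file needed; the values are made up): a single
# mask built from the targets filters inputs and targets consistently.
import numpy as np

data_out = np.array([6500., 7200., 6900.])
data_in = np.array([[1., 2.], [3., 4.], [5., 6.]])
upper_neck = data_out < 7000  # boolean mask, one entry per event
print(data_in[upper_neck])    # rows 0 and 2 survive
print(data_out[upper_neck])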
def train_sum(sample_signal, sample_bkg, tree, branch_names, selection):
    csv1 = ['SubJet_csv_1']
    csv2 = ['SubJet_csv_2']
    sig_csv1 = root2array(sample_signal, "outTree", csv1, selection=selection)
    sig_csv2 = root2array(sample_signal, "outTree", csv2, selection=selection)
    bkg_csv1 = root2array(sample_bkg, "outTree", csv1, selection=selection)
    bkg_csv2 = root2array(sample_bkg, "outTree", csv2, selection=selection)
    sig_csv1 = rec2array(sig_csv1)
    sig_csv2 = rec2array(sig_csv2)
    bkg_csv1 = rec2array(bkg_csv1)
    bkg_csv2 = rec2array(bkg_csv2)

    # build the summed-CSV feature column
    sig_sum_csv = np.array([[x[0] + y[0]] for x, y in zip(sig_csv1, sig_csv2)])
    bkg_sum_csv = np.array([[x[0] + y[0]] for x, y in zip(bkg_csv1, bkg_csv2)])

    print(branch_names)
    signal = get_numpy_array(sample_signal, tree, branch_names, selection)
    backgr = get_numpy_array(sample_bkg, tree, branch_names, selection)
    signal = np.append(signal, sig_sum_csv, axis=1)
    backgr = np.append(backgr, bkg_sum_csv, axis=1)
    print("signal sample and bkg numpy array done")

    X, y = merge_addColoumn(signal, backgr)
    print("signal, bkg merging done")

    sig_weight = get_weight_coloumn(sample_signal, tree, ['weight'], selection)
    bkg_weight = np.ones((backgr.shape[0], 1))
    sig_weight = np.concatenate(sig_weight, axis=0)
    bkg_weight = np.concatenate(bkg_weight, axis=0)
    weight = np.concatenate((sig_weight, bkg_weight), axis=0)
    print("weight np array done")

    print("splitting start")
    X_train, X_test, y_train, y_test, weight_train, weight_test = train_test_split(
        X, y, weight, test_size=0.33, random_state=42)
    print("splitting done")

    print("start training")
    dt = DecisionTreeClassifier(max_depth=5)
    bdt = AdaBoostClassifier(dt, algorithm='SAMME', n_estimators=800,
                             learning_rate=0.5)
    bdt.fit(X_train, y_train, sample_weight=weight_train)
    print("bdt has done the fitting")

    print("start testing")
    decisions = bdt.decision_function(X_test)
    print(decisions)
    fpr, tpr, thresholds = roc_curve(y_test, decisions)
    print("training done")
    return fpr, tpr
def run(name, source, quick=False):
    print time.asctime(time.localtime()), "Filling BDT Branches"

    branch_names = joblib.load("pickle/variables.pkl")

    if quick:
        signal = joblib.load('pickle/all_signalq.pkl')
        clf = joblib.load("pickle/" + name + "quick.pkl")
    else:
        signal = joblib.load('pickle/all_signal.pkl')
        clf = joblib.load("pickle/" + name + ".pkl")

    # predict and write probability of each MC event being signal
    bdt_MC_predicted = clf.predict_proba(signal)
    bdt_MC_predicted.dtype = [('GradBoost_prob', np.float64)]
    array2root(np.hsplit(bdt_MC_predicted, 2)[1],
               "/net/storage03/data/users/dlafferty/NTuples/SignalMC/2012/combined/Bs2phiphi_MC_2012_combined_corrected_TupleA_BDT.root",
               "DecayTree")

    # predict and write probability of every data event being signal
    all_data = root2array(
        "/net/storage03/data/users/dlafferty/NTuples/data/2012/combined/Bs2phiphi_data_2012_corrected_TupleA_BDT.root",
        "DecayTree", branch_names)
    all_data = rec2array(all_data)

    bdt_data_predicted = clf.predict_proba(all_data)
    bdt_data_predicted.dtype = [('GradBoost_prob', np.float64)]
    array2root(np.hsplit(bdt_data_predicted, 2)[1],
               "/net/storage03/data/users/dlafferty/NTuples/data/2012/combined/Bs2phiphi_data_2012_corrected_TupleA_BDT.root",
               "DecayTree")

    print time.asctime(time.localtime()), "Branches Filled!"
def array(self, **kwargs):
    """Return this sample's records as a plain 2D numpy array."""
    from root_numpy import rec2array
    rec = self.records(**kwargs)
    arr = rec2array(rec)
    return arr
def LoadObjectVars(self, Objects):
    print "Load ", Objects
    # Vars = root2array(filenames=self.File, treename=self.TreeName,
    #                   branches=self.VarNamesDict[Objects], start=0, stop=1000)
    Vars = root2array(filenames=self.File,
                      treename=self.TreeName,
                      branches=self.VarNamesDict[Objects])
    Vars = rec2array(Vars)
    print "Make ", Objects
    if Objects == "Event":
        for col, var in enumerate(Vars[0]):
            if isinstance(var, np.ndarray):
                Vars[:, col] = np.array(map(lambda x: x[0], Vars[:, col]))
        self.Vars[Objects] = Vars
    else:
        VarList = []
        for n_jet in range(0, self.nObjects):
            for col, var in enumerate(Vars[0]):
                VarList.append(
                    np.expand_dims(
                        np.array(map(
                            lambda x: x[n_jet] if x.shape[0] > n_jet else 0,
                            Vars[:, col])),
                        axis=1))
        self.Vars[Objects] = np.concatenate(VarList, axis=1)
    print "Shape ", Objects, ":\t", self.Vars[Objects].shape
def corrections(self, rec):
    # posterior trigger correction
    if not self.posterior_trigger_correction:
        return
    arr = rec2array(rec[['tau1_pt', 'tau2_pt']])
    weights = evaluate(self.trigger_correct, arr)
    return [weights]
def __getitem__(self, index):
    # gets the batch for the supplied index
    # return a tuple (numpy array of image, numpy array of labels) or None at epoch end
    logging.debug("-" * 80)
    logging.debug("New batch importation")
    X = np.zeros((self.batch_size, len(self.inputs)))
    Y = np.zeros((self.batch_size, len(self.outputs)))
    pointer = 0
    for f, size in self.batch_sample.items():
        size = int(size)  # for python2
        # while True:  # retry loop, kept for reference
        #     try:
        #         data = rec2array(root2array(f, treename='tree',
        #                                     branches=self.inputs + self.outputs,
        #                                     start=index * size, stop=(index + 1) * size))
        #         break
        #     except OSError:
        #         logging.warning("Could not import tree in worker, will try again in 3 seconds")
        #         time.sleep(3)
        data = rec2array(root2array(f, treename='tree',
                                    branches=self.inputs + self.outputs,
                                    start=index * size,
                                    stop=(index + 1) * size))
        # the first len(inputs) columns are the inputs, the rest the outputs
        X[pointer:pointer + size, :] = data[:, :len(self.inputs)]
        Y[pointer:pointer + size, :] = data[:, len(self.inputs):]
        pointer += size
        logging.debug("%s - Added %d entries from file %s"
                      % (self.state_set, size, os.path.basename(f)))
    if self.weights_generator == '':
        return X, Y
    else:
        W = self.weightsGen.getWeights(Y)
        return X, Y, W
def get_numpy_array(sample, tree, branch_names, selection):
    branch_names = [c.strip() for c in branch_names]
    branch_names = [b.replace(" ", "_") for b in branch_names]
    branch_names = [b.replace("-", "_") for b in branch_names]
    output_arr = root2array(sample, tree, branch_names, selection=selection)
    output_arr = rec2array(output_arr)
    return output_arr
def AddToROC(self, filename):
    # Check for correct target and records #
    valid_file = False
    for key, value in self.selector.items():
        if key in os.path.basename(filename):
            target = value
            valid_file = True
    if not valid_file:
        return False  # file not to be taken into account
    # Recover output #
    if self.weight_name and self.weight_name != '':
        out = root2array(filename, self.tree,
                         branches=[self.variable, self.weight_name],
                         selection=self.cut)
    else:
        out = root2array(filename, self.tree,
                         branches=self.variable,
                         selection=self.cut)
    try:
        out = rec2array(out)  # if not a vector, need to remove dtype
    except Exception:
        pass
    if out.ndim == 1:
        out = out.reshape(-1, 1)  # vector -> array
    if out.shape[1] > 1:  # contains [discriminant, weight]
        weight = out[:, 1]
        out = out[:, 0].reshape(-1, 1)
    # Add to container #
    tar = np.ones((out.shape[0], 1)) * target
    self.output = np.concatenate((self.output, out), axis=0)
    self.target = np.concatenate((self.target, tar), axis=0)
    if self.weight_name and self.weight_name != '':
        self.weight = weight
    return True
def get_weight_coloumn(signal_sample, tree, weight_branch, selection):
    weight_arr = root2array(signal_sample, tree, weight_branch,
                            selection=selection)
    weight_arr = rec2array(weight_arr)
    return weight_arr
def selectBranches_Candidate(file_name, tree_name, branch_names, selection_cuts, isGen):
    file = root2array(filenames=file_name, treename=tree_name,
                      branches=branch_names, selection=selection_cuts)
    file = rec2array(file)
    if isGen:
        px_index = branch_names.index("GenCandPx")
        py_index = branch_names.index("GenCandPy")
    else:
        px_index = branch_names.index("CandPx")
        py_index = branch_names.index("CandPy")
    # print("len_file = " + str(len(file)))
    for x in range(0, len(file)):
        # pad/truncate every branch to ncand entries and stack into a
        # (n_branches, ncand) block for this jet
        for y in range(0, len(branch_names)):
            file[x, y].resize(ncand, refcheck=False)
            temp = file[x, y].reshape(1, ncand)
            temp = temp.astype(float)
            if y == 0:
                info = temp
            else:
                info = np.concatenate((info, temp))
        bubbleSort(info, ncand, px_index, py_index)
        temp_jet = info.reshape(1, len(branch_names), ncand)
        if x == 0:
            info_candidates = temp_jet
        else:
            info_candidates = np.concatenate((info_candidates, temp_jet))
        if (x - 1) % 1000 == 0:
            print("info_candidates.shape = " + str(info_candidates.shape))
    # print("done")
    return info_candidates
def svars(rootfile, first, last):
    stringa = ["seed_pt", "seed_eta", "seed_phi", "seed_mass", "seed_dz", "seed_dxy",
               "seed_3D_ip", "seed_3D_sip", "seed_2D_ip", "seed_2D_sip",
               "seed_3D_signedIp", "seed_3D_signedSip", "seed_2D_signedIp", "seed_2D_signedSip",
               "seed_chi2reduced", "seed_nPixelHits", "seed_nHits",
               "seed_jetAxisDistance", "seed_jetAxisDlength"]
    f = TFile(rootfile)
    tree = f.Get("analyzer1/tree")
    t2 = root_numpy.tree2array(tree, branches=stringa,
                               selection="(jet_pt>30)&&(abs(jet_eta)<2.4)",
                               start=first, stop=last)
    ll = len(t2)
    t2 = root_numpy.rec2array(t2)
    print t2.shape
    t2 = t2.reshape((10, len(stringa), ll))
    t2 = t2.swapaxes(0, 2)
    # t2 = numpy.reshape(t2, (len(t2), len(stringa), 10))
    # print t2.shape
    numpy.save("svars" + str(first) + "_" + str(last) + "_"
               + rootfile.split(".")[0] + ".npy", t2)
    print time.time() - starttime
    print t2.shape
    f.Close()
def concat_ttrees_to_array(ttrees, branches=None):
    """Concatenate multiple TTrees of different classes into one ndarray."""
    rec = []
    for i in range(len(ttrees)):
        rec.append(rnp.tree2rec(ttrees[i], branches))
    return rnp.rec2array(rnp.stack(rec, fields=branches), fields=branches)
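# Pure-numpy illustration of the stack-then-flatten step above (no ROOT
# trees needed; the two record arrays stand in for the per-tree records):
# rnp.stack concatenates record arrays on their common fields, and
# rec2array turns the result into a plain 2D array.
import numpy as np
import root_numpy as rnp

a = np.array([(1, 2.5)], dtype=[('x', np.int32), ('y', np.float64)])
b = np.array([(3, 4.5)], dtype=[('x', np.int32), ('y', np.float64)])
stacked = rnp.stack([a, b], fields=['x', 'y'])
print(rnp.rec2array(stacked, fields=['x', 'y']))  # [[1. 2.5], [3. 4.5]]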
def read_inputs(config, setup):
    from ttH.TauRoast.processing import Process

    fn = os.path.join(config.get("indir", config["outdir"]), "ntuple.root")

    signal = None
    signal_weights = None
    for proc, weight in sum([cfg.items() for cfg in setup['signals']], []):
        for p in Process.expand(proc):
            logging.debug('reading {}'.format(p))
            d = rec2array(root2array(fn, str(p), setup['variables']))
            if isinstance(weight, float) or isinstance(weight, int):
                w = np.array([weight] * len(d))
            else:
                w = rec2array(root2array(fn, str(p), [weight])).ravel()
            # scale each process by its cross section per generated event
            w *= p.cross_section / p.events
            if signal is not None:
                signal = np.concatenate((signal, d))
                signal_weights = np.concatenate((signal_weights, w))
            else:
                signal = d
                signal_weights = w

    background = None
    background_weights = None
    for proc, weight in sum([cfg.items() for cfg in setup['backgrounds']], []):
        for p in Process.expand(proc):
            logging.debug('reading {}'.format(p))
            d = rec2array(root2array(fn, str(p), setup['variables']))
            if isinstance(weight, float) or isinstance(weight, int):
                w = np.array([weight] * len(d))
            else:
                w = rec2array(root2array(fn, str(p), [weight])).ravel()
            w *= p.cross_section / p.events
            if background is not None:
                background = np.concatenate((background, d))
                background_weights = np.concatenate((background_weights, w))
            else:
                background = d
                background_weights = w

    factor = np.sum(signal_weights) / np.sum(background_weights)
    logging.info("renormalizing background events by factor {}".format(factor))
    background_weights *= factor

    return signal, signal_weights, background, background_weights
def get_inputs(sample_name, variables, filename=None, tree_name='mva',
               dir='', weight_name='event_weight', lumi=1.):
    x = None
    y = None
    w = None

    infiles = []
    xsections = []
    if filename is not None:
        infiles = [dir + filename]
    else:
        if 'ttH' in sample_name:
            infiles = [dir + "mvaVars_ttH_loose.root"]
            xsections = [0.215]
        elif 'ttV' in sample_name:
            infiles = [dir + "mvaVars_TTZ_loose.root",
                       dir + "mvaVars_TTW_loose.root"]
            xsections = [0.253, 0.204]  # [TTZ, TTW]
        elif 'ttbar' in sample_name:
            infiles = [dir + "mvaVars_TTSemilep_loose.root",
                       dir + "mvaVars_TTDilep_loose.root"]
            xsections = [182, 87.3]  # [semilep, dilep]
        else:
            print "Pick one sample name from 'ttH', 'ttV' or 'ttbar'"
            return x, y, w

    for fn, xs in zip(infiles, xsections):
        xi = rec2array(root2array(fn, tree_name, variables))
        wi = root2array(fn, tree_name, weight_name)
        # scale samples based on lumi and cross section
        wi *= lumi * xs / np.sum(wi)
        if x is not None:
            x = np.concatenate((x, xi))
            w = np.concatenate((w, wi))
        else:
            x = xi
            w = wi

    if 'ttH' in sample_name:
        y = np.ones(x.shape[0])
    else:
        y = np.zeros(x.shape[0])

    return x, y, w
def readout_to_numpy_arrays(infilename, treename, outpath, outname,
                            unwanted_tags, unwanted_exact_tags):
    infile = ROOT.TFile.Open(infilename)
    myoutpath = outpath
    create_path(myoutpath)
    print 'creating numpy arrays for input sample %s' % (outname)

    # Get AnalysisTree
    entries = infile.AnalysisTree.GetEntriesFast()
    tree = infile.Get(treename)
    leaves = tree.GetListOfLeaves()

    # Collect the branch names to read, skipping unwanted ones
    variables = []
    eventweights = ['eventweight']
    for leaf in leaves:
        write = True
        for tag in unwanted_tags:
            if tag in leaf.GetName():
                write = False
        for tag in unwanted_exact_tags:
            if tag == leaf.GetName():
                write = False
        if write:
            variables.append(leaf.GetName())
    print variables
    print "len(variables): ", len(variables)

    # Read the tree in chunks and dump each chunk to .npy
    chunksize = 200000
    maxidx = int(entries / float(chunksize)) + 1
    if entries % chunksize == 0:
        maxidx -= 1
    print entries, chunksize, maxidx

    for i in range(maxidx):
        mymatrix = root2array(filenames=infilename, treename=treename,
                              branches=variables,
                              start=i * chunksize, stop=(i + 1) * chunksize)
        mymatrix = rec2array(mymatrix)
        myweights = root2array(filenames=infilename, treename=treename,
                               branches=eventweights,
                               start=i * chunksize, stop=(i + 1) * chunksize)
        myweights = rec2array(myweights)
        thisoutname = myoutpath + outname + '_' + str(i) + '.npy'
        thisoutname_weights = myoutpath + 'Weights_' + outname + '_' + str(i) + '.npy'
        np.save(thisoutname, mymatrix)
        np.save(thisoutname_weights, myweights)
        percent = float(i + 1) / float(maxidx) * 100.
        sys.stdout.write('{0:d} of {1:d} ({2:4.2f} %) jobs done.\r'.format(i + 1, maxidx, percent))
        if not i == maxidx - 1:
            sys.stdout.flush()

    with open(myoutpath + 'variable_names.pkl', 'w') as f:
        pickle.dump(variables, f)
def arrays(self, category, region,
           cuts=None,
           fields=None,
           clf=None,
           clf_name='classifier',
           include_weight=True,
           systematic='NOMINAL'):
    bkg_recs, sig_recs = self.records(
        category, region,
        cuts=cuts,
        fields=fields,
        clf=clf,
        clf_name=clf_name,
        include_weight=include_weight,
        systematic=systematic)
    bkg_arrs = {}
    sig_arrs = {}
    for b, rec in bkg_recs.items():
        bkg_arrs[b] = rec2array(rec)
    for s, rec in sig_recs.items():
        sig_arrs[s] = rec2array(rec)
    return bkg_arrs, sig_arrs
def add_classifier(data, c_name, classifier, features):
    """Add a classifier column to the record array."""
    # compute discriminator and append to array
    data_disc = classifier.predict_proba(rec2array(data[features]))[:, 1]
    data = append_fields(data, [c_name], [data_disc],
                         asrecarray=True, usemask=False)
    return data
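# Toy usage sketch for add_classifier (self-contained under root_numpy-era
# numpy/sklearn; the record array, field names and tiny classifier below are
# illustrative stand-ins for the analysis ntuple and trained BDT):
import numpy as np
from numpy.lib.recfunctions import append_fields
from root_numpy import rec2array
from sklearn.ensemble import GradientBoostingClassifier

data = np.array([(1.0, 2.0), (3.0, 4.0), (5.0, 6.0), (7.0, 8.0)],
                dtype=[('pt', np.float64), ('eta', np.float64)])
clf = GradientBoostingClassifier(n_estimators=10).fit(
    rec2array(data[['pt', 'eta']]), [0, 0, 1, 1])
data = add_classifier(data, 'bdt', clf, ['pt', 'eta'])
print(data['bdt'])  # one discriminator value per record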
def Convert(self):
    if self.weights:
        self.variables.append(self.weights[0])
    print self.variables

    train_Signal = root2array(self.SPath, self.Streename, self.variables)
    train_Background = root2array(self.BPath, self.Btreename, self.variables)
    train_Signal = rec2array(train_Signal)
    self.train_Signal = train_Signal
    print '#Signalevents = ', len(train_Signal)
    train_Background = rec2array(train_Background)
    self.train_Background = train_Background
    print '#Backgroundevents = ', len(train_Background)

    X_train = np.concatenate((train_Signal, train_Background))
    y_train = np.concatenate((np.ones(train_Signal.shape[0]),
                              np.zeros(train_Background.shape[0])))

    if self.StestPath == '':
        X_train, X_test, y_train, y_test = train_test_split(
            X_train, y_train, test_size=0.33, random_state=42)
    else:
        test_Signal = root2array(self.StestPath, self.Streename, self.variables)
        test_Background = root2array(self.BtestPath, self.Btreename, self.variables)
        test_Signal = rec2array(test_Signal)
        test_Background = rec2array(test_Background)
        self.test_Signal = test_Signal
        self.test_Background = test_Background
        X_test = np.concatenate((test_Signal, test_Background))
        y_test = np.concatenate((np.ones(test_Signal.shape[0]),
                                 np.zeros(test_Background.shape[0])))

    # split off the weight column (it was appended as the last variable)
    for i in X_train:
        self.train_weights.append(i[-1])
    X_train = np.delete(X_train, np.s_[-1], 1)
    for i in X_test:
        self.test_weights.append(i[-1])
    X_test = np.delete(X_test, np.s_[-1], 1)
    del self.variables[-1]

    self.Var_Array = X_train
    self.ID_Array = y_train
    self.test_var = X_test
    self.test_ID = y_test

    #########################################
    # --- store stuff to compare afterwards ---
    IDfile = open("ID.pkl", "w")
    pickle.dump(self.test_ID, IDfile)
    IDfile.close()
def selectBranches_Event(file_name, tree_name, branch_names, selection_cuts):
    file = root2array(filenames=file_name, treename=tree_name,
                      branches=branch_names, selection=selection_cuts)
    # it needs 2 steps for a proper conversion into numpy.ndarray
    file = rec2array(file)
    file = file.astype(variable_type)
    # returns a numpy.ndarray whose shape is (n_events, n_branches)
    return file
def jvars(rootfile, first, last):
    jvars = ["jet_pt", "jet_eta", "jet_phi", "jet_mass", "jet_flavour"]
    f = TFile(rootfile)
    tree = f.Get("analyzer1/tree")
    t2 = root_numpy.tree2array(tree, branches=jvars,
                               selection="(jet_pt>30)&&(abs(jet_eta)<2.4)",
                               start=first, stop=last)
    t2 = root_numpy.rec2array(t2)
    numpy.save("jvars_" + str(first) + "_" + str(last) + "_"
               + rootfile.split(".")[0] + ".npy", t2)
    print t2.shape
    print time.time() - starttime
    f.Close()
def Run(self, batchsize=int(1e5)):
    r"""Fill all registered histograms.

    The histograms are filled using the :func:`root_numpy.root2array`
    method.

    :param batchsize: number of events to process at once (default: 100000)
    :type batchsize: ``int``
    """
    branchexprs = set()
    for histo, options in self._store:
        branchexprs.update(options["varexp"].split(":"))
        branchexprs.add("({})*({})".format(options["weight"], options["cuts"]))
        if not options["append"]:
            histo.Reset()
    for start in range(0, self._entries, batchsize):
        array = rnp.root2array(
            self._filepath,
            self._treename,
            branches=branchexprs,
            start=start,
            stop=start + batchsize,
        )
        for histo, options in self._store:
            if ":" not in options["varexp"]:
                varexp = array[options["varexp"]]
            else:
                varexp = rnp.rec2array(array[options["varexp"].split(":")])
            cuts = array["({})*({})".format(options["weight"], options["cuts"])]
            # entries failing the cut carry weight zero and are skipped
            mask = np.where(cuts != 0)
            rnp.fill_hist(histo, varexp[mask], weights=cuts[mask])
    zeroentriesoptions = []
    for histo, options in self._store:
        options = {
            k: v for k, v in options.items()
            if k not in ["varexp", "append"]
        }
        if histo.GetEntries() == 0 and options not in zeroentriesoptions:
            logger.warning(
                "No events have been extracted for tree '{}' in file '{}' "
                "using cuts='{}' and weight='{}'!".format(
                    self._treename,
                    self._filepath,
                    options["cuts"],
                    options["weight"],
                ))
            zeroentriesoptions.append(options)
    logger.info(
        "Filled {} histograms using tree '{}' in file '{}'.".format(
            len(self._store), self._treename, self._filepath))
def __getitem__(self, index):
    # gets the batch for the supplied index
    # return a tuple (numpy array of image, numpy array of labels) or None at epoch end
    logging.debug("-" * 80)
    logging.debug("New batch importation")
    X = np.zeros((self.batch_size, len(self.inputs)))
    Y = np.zeros((self.batch_size, len(self.outputs)))
    pointer = 0
    for f, size in self.batch_sample.items():
        size = int(size)  # for python2
        X[pointer:pointer + size, :] = rec2array(
            root2array(f, treename='tree', branches=self.inputs,
                       start=index * size, stop=(index + 1) * size))
        Y[pointer:pointer + size, :] = rec2array(
            root2array(f, treename='tree', branches=self.outputs,
                       start=index * size, stop=(index + 1) * size))
        pointer += size
        logging.debug("%s - Added %d entries from file %s"
                      % (self.state_set, size, os.path.basename(f)))
    if self.weights_generator == '':
        return X, Y
    else:
        W = self.weightsGen.getWeights(Y)
        return X, Y, W
def classify(self, sample, category, region, cuts=None, systematic='NOMINAL'):
    if self.clfs is None:
        raise RuntimeError("you must train the classifiers first")

    partitions = sample.partitioned_records(
        category=category,
        region=region,
        fields=self.fields,
        cuts=cuts,
        systematic=systematic,
        num_partitions=2,
        return_idx=True,
        key=self.partition_key)

    score_idx = [[], []]
    for i, partition in enumerate(partitions):
        for rec, idx in partition:
            weight = rec['weight']
            arr = rec2array(rec, self.fields)
            # each classifier is never used on the partition that trained it
            scores = self.clfs[i].decision_function(arr)
            score_idx[i].append((idx, scores, weight))

    # must preserve order of scores wrt the other fields!
    # merge the scores and weights according to the idx
    merged_scores = []
    merged_weight = []
    for left, right in zip(*score_idx):
        left_idx, left_scores, left_weight = left
        right_idx, right_scores, right_weight = right
        insert_idx = np.searchsorted(left_idx, right_idx)
        scores = np.insert(left_scores, insert_idx, right_scores)
        weight = np.insert(left_weight, insert_idx, right_weight)
        merged_scores.append(scores)
        merged_weight.append(weight)

    scores = np.concatenate(merged_scores)
    weight = np.concatenate(merged_weight)

    if self.transform:
        log.info("classifier scores are transformed")
        if isinstance(self.transform, types.FunctionType):
            # user-defined transformation
            scores = self.transform(scores)
        else:
            # logistic transformation used by TMVA (MethodBDT.cxx)
            scores = -1 + 2.0 / (1.0 + np.exp(
                -self.clfs[0].n_estimators *
                self.clfs[0].learning_rate *
                scores / 1.5))

    return scores, weight
def make_partitioned_dataset(signals, backgrounds,
                             category, region, fields,
                             partition_key, cuts=None):
    signal_arrs = []
    signal_weight_arrs = []
    background_arrs = []
    background_weight_arrs = []
    for signal in signals:
        left, right = signal.partitioned_records(
            category=category, region=region,
            fields=fields, cuts=cuts,
            key=partition_key)
        signal_weight_arrs.append((left['weight'], right['weight']))
        signal_arrs.append((rec2array(left, fields),
                            rec2array(right, fields)))
    for background in backgrounds:
        left, right = background.partitioned_records(
            category=category, region=region,
            fields=fields, cuts=cuts,
            key=partition_key)
        background_weight_arrs.append((left['weight'], right['weight']))
        background_arrs.append((rec2array(left, fields),
                                rec2array(right, fields)))
    return (signal_arrs, signal_weight_arrs,
            background_arrs, background_weight_arrs)
def importROOTdata(branch_names, fName, treeName="DecayTree"):
    """Import a ROOT file into a numpy array for the selected branches.

    :param branch_names: names of the branches to be imported
    :type branch_names: tuple
    :param fName: name of the root file to be imported
    :type fName: str
    :param treeName: tree name in the file
    :type treeName: str
    :rtype: ndarray
    """
    from root_numpy import root2array, rec2array
    data_array = root2array(fName, treeName, branch_names)
    data_array = rec2array(data_array)
    return data_array
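# Usage sketch for importROOTdata (the file name and branch names here are
# placeholders, not from the original analysis):
branches = ("B_MM", "B_PT")
data = importROOTdata(branches, "candidates.root", treeName="DecayTree")
print(data.shape)  # (n_events, len(branches))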
def array(self, category=None, region=None,
          fields=None, cuts=None,
          clf=None, clf_name='classifier',
          include_weight=True,
          systematic='NOMINAL'):
    return rec2array(self.merged_records(
        category=category, region=region,
        fields=fields, cuts=cuts,
        clf=clf, clf_name=clf_name,
        include_weight=include_weight,
        systematic=systematic))
def evaluate(config, tree, names, transform=None):
    output = []
    dtype = []
    for name in names:
        setup = load(config, name.split("_")[1])
        data = rec2array(tree2array(
            tree.raw(),
            list(transform(setup["variables"])) if transform else setup["variables"]))
        if name.startswith("sklearn"):
            fn = os.path.join(config["mvadir"], name + ".pkl")
            with open(fn, 'rb') as fd:
                bdt, label = pickle.load(fd)
            scores = []
            if len(data) > 0:
                scores = bdt.predict_proba(data)[:, 1]
            output += [scores]
            dtype += [(name, 'float64')]
        fn = os.path.join(config["mvadir"], name + ".xml")
        reader = r.TMVA.Reader("Silent")
        for var in setup['variables']:
            reader.AddVariable(var, array('f', [0.]))
        reader.BookMVA("BDT", fn)
        scores = evaluate_reader(reader, "BDT", data)
        output += [scores]
        dtype += [(name.replace("sklearn", "tmvalike"), 'float64')]

    f = r.TFile(os.path.join(
        config.get("mvadir", config.get("indir", config["outdir"])),
        "mapping.root"), "READ")
    if f.IsOpen():
        likelihood = f.Get("hTargetBinning")

        def lh(values):
            return likelihood.GetBinContent(likelihood.FindBin(*values))

        indices = dict((v, n) for n, (v, _) in enumerate(dtype))
        tt = output[indices['tmvalike_tt']]
        ttZ = output[indices['tmvalike_ttZ']]
        if len(tt) == 0:
            output += [[]]
        else:
            output += [np.apply_along_axis(lh, 1, np.array([tt, ttZ]).T)]
        dtype += [('tmvalike_likelihood', 'float64')]
        f.Close()

    data = np.array(zip(*output), dtype)
    tree.mva(array2tree(data))
def DrawCorrelationMatrixFromROOT(infile, intree, outfile, brancharray,
                                  selection="", pickEvery=None):
    # container to hold the combined trees in numpy array structure
    X = np.ndarray((0, len(brancharray)), float)
    treeArray = rootnp.root2array(infile, intree, brancharray, selection,
                                  0, None, pickEvery, False, 'weight')
    X = rootnp.rec2array(treeArray)

    df = pd.DataFrame(X, columns=brancharray)
    corrmat = df.corr(method='pearson', min_periods=1)  # or 'spearman'

    fig, ax1 = plt.subplots(ncols=1, figsize=(12, 10))
    opts = {'cmap': plt.get_cmap("RdBu"),
            'vmin': corrmat.min().min(),
            'vmax': corrmat.max().max()}
    heatmap1 = ax1.pcolor(corrmat, **opts)
    plt.colorbar(heatmap1, ax=ax1)
    ax1.set_title("Correlation Matrix {%s}" % selection)
    labels = corrmat.columns.values
    for ax in (ax1,):
        # shift location of ticks to center of the bins
        ax.set_xticks(np.arange(len(labels)) + 0.5, minor=False)
        ax.set_yticks(np.arange(len(labels)) + 0.5, minor=False)
        ax.set_xticklabels(labels, minor=False, ha='right', rotation=70)
        ax.set_yticklabels(labels, minor=False)
    fig.tight_layout()

    log.info("Dumping output in %s" % outfile)
    fig.savefig(outfile)
from root_numpy import root2array, rec2array, array2root

bdt_file = '/lustre/cmswork/hh/mvas/xgboost/train_3CSVM_0.5sig_0.7bkg_weighted.pkl'
branch_names = ["H1_pT", "H2_pT",
                "H1_dEta_abs", "H2_dEta_abs",
                "H1_dPhi_abs", "H2_dPhi_abs"]

# compute bdt values
bdt = joblib.load(bdt_file)

for root_file in args.root_files:
    print "processing {}".format(root_file)
    # load vars data from ROOT
    data = root2array(root_file, args.tree_name, branch_names)
    data_bdt = bdt.predict_proba(rec2array(data[branch_names]))[:, 1]
    # save to ROOT file (cast before relabelling the dtype, so the float64
    # scores are not silently reinterpreted as pairs of float32)
    data_bdt = data_bdt.astype(np.float32)
    data_bdt.dtype = [(args.bdt_name, np.float32)]
    array2root(data_bdt, root_file, "tree")
base = os.path.basename(fname)
match = fname_regex.match(base)
if not match:
    raise ValueError("Could not match the regex to the file %s" % fname)
flavor = match.group('flavor')
full_category = match.group('category')
category = [i for i in sv_categories if i in full_category][0]
if flavor != args.flavour:
    continue
log.info('processing file %s' % fname)
extfile = fileserver.serve(fname)
pool_files.append(extfile)
nfiles_per_sample = None
tree = rootnp.root2array(extfile.path, 'tree', variables, None, 0,
                         nfiles_per_sample, args.pickEvery, False, 'weight')
tree = rootnp.rec2array(tree)
X = np.concatenate((X, tree), 0)
y = np.concatenate((y, np.ones(tree.shape[0])))

# This is needed for pandas DataFrame structure
log.info('Converting data to pandas DataFrame structure')
# Create a pandas DataFrame for our data; this provides many convenience
# functions for exploring your dataset.
# See http://betatim.github.io/posts/sklearn-for-TMVA-users/ for more info.
# Need to reshape y so it is a 2D array with one column.
df = pd.DataFrame(np.hstack((X, y.reshape(y.shape[0], -1))),
                  columns=variables + ['y'])
corrmat = df.drop('y', 1).corr(method='pearson', min_periods=1)
fig, ax1 = plt.subplots(ncols=1, figsize=(12, 10))
def ReadData(path_to_file, sname, selection=""):
    # Make data object
    dataobj = Data()

    # Get the data
    indata = root2array(filenames=path_to_file,
                        treename="tree",
                        branches=dataobj.t_varnames + dataobj.w_varnames,
                        selection=selection)

    # Add an extra field for the weights
    emptydata = []
    for i in range(len(m_weightnames)):
        emptydata.append(np.zeros(len(indata), dtype=float))
    indata = append_fields(base=indata,
                           names=m_weightnames,
                           data=emptydata,
                           usemask=False,
                           dtypes=float)

    # Loop and calculate the weights
    weight_tool = WeightTool()
    for i in range(len(indata)):
        if sname == m_sname_corsika or sname == m_sname_corsikaLE:
            indata[i][m_weightnames[0]] = weight_tool.getWeight(indata[i], sname)
            indata[i][m_weightnames[1]] = 0
            indata[i][m_weightnames[2]] = 0
        elif sname == m_sname_data:
            indata[i][m_weightnames[0]] = 1
            indata[i][m_weightnames[1]] = 0
            indata[i][m_weightnames[2]] = 0
        else:
            indata[i][m_weightnames[0]] = weight_tool.getWeight(indata[i], m_sname_E2)
            indata[i][m_weightnames[1]] = weight_tool.getWeight(indata[i], m_sname_Conv)
            indata[i][m_weightnames[2]] = weight_tool.getWeight(indata[i], m_sname_Prompt)

    # Convert to plain array
    # indata = rec2array(indata, fields=dataobj.t_varnames + ['w'])
    indata = rec2array(indata)

    # Remove nan if it exists
    indata = indata[~np.isnan(indata).any(axis=1)]

    # Get entries
    nEntries = len(indata)

    # Set the targets: 1 -- signal, 0 -- background
    if sname == m_sname_E2:
        targets = np.ones(nEntries, dtype=int)
    else:
        targets = np.zeros(nEntries, dtype=int)

    # Set properties of data object
    dataobj.setData(indata)
    dataobj.setTargets(targets)
    dataobj.setName(sname)

    return dataobj
j_n = "pfjets[{}].{}" branch_names = [j_n.format(i, v) for i,v in it.product(range(4), j_v)] mix_data = np.genfromtxt(asc_file, names=branch_names) mix_data.dtype.names = branch_names # fix names (symbols were erased) mix_data = add_cartesian(mix_data) mix_data = add_dijet_vars(mix_data) bdt = joblib.load(bdt_file) features = ["dijet[0].Pt()","dijet[1].Pt()", "dijet[0].DEta()","dijet[1].DEta()", "dijet[0].DPhi()","dijet[1].DPhi()"] mix_data_bdt = bdt.decision_function(rec2array(mix_data[features])) bdt_name = "bdt_value" mix_data = append_fields(mix_data, [bdt_name], [mix_data_bdt] , asrecarray=True, usemask=False) to_write = branch_names + [bdt_name] np.savetxt(out_file, rec2array(mix_data[to_write]))
lep1tauSS_cosDeltaPhi
lep1_mt
lep1_mva
jet_deltaRavg
""".split()

with open(save, 'rb') as fd:
    bdt = pickle.load(fd)

sig = r.TH1F('sig', '', 40, 0, 1)
bkg = r.TH1F('bkg', '', 40, 0, 1)

infile = r.TFile(ntuple)
for s in signals:
    data = rec2array(root2array(ntuple, str(s), variables))
    for v in bdt.predict_proba(data)[:, 1]:
        sig.Fill(v)
for b in backgrounds:
    data = rec2array(root2array(ntuple, str(b), variables))
    for v in bdt.predict_proba(data)[:, 1]:
        bkg.Fill(v)

c = r.TCanvas()
sig.SetLineColor(r.kBlue)
sig.Scale(1. / sig.Integral())
sig.Draw()
bkg.SetLineColor(r.kRed)
bkg.Scale(1. / bkg.Integral())
bkg.Draw("same")
c.SaveAs('output.png')
# REVALIDATION OF ALL CLASSIFIERS --> dump Discriminators #
#******************************************************
dict_Discriminators = {}

#******************************************************
# All types, all classifiers
#******************************************************
log.info('Processing: %sall types, all classifiers (including the best for each type)%s'
         % (Fore.BLUE, Fore.WHITE))
for t in Types:
    variables = pickle.load(open(args.Typesdir + t + "/featurenames.pkl", "r"))
    variables = [x for x in variables if x != 'flavour']
    X = rootnp.root2array(args.InputFile, args.InputTree, variables, None, 0,
                          args.elements_per_sample, args.pickEvery, False, 'weight')
    X = rootnp.rec2array(X)
    for c in clf_names:
        log.info('Type: %s%s%s, Classifier: %s%s%s'
                 % (Fore.RED, t, Fore.WHITE, Fore.GREEN, c, Fore.WHITE))
        classifier = dict_clf[t + '_' + c]
        dict_Discriminators[t + '_' + c] = classifier.predict_proba(X)[:, 1]
    best_clf_name = dict_pickles["Best"][t][0]
    best_classifier = dict_clf[t + '_BEST_' + best_clf_name]
    log.info('Type: %s%s%s, Best Classifier is: %s%s%s'
             % (Fore.RED, t, Fore.WHITE, Fore.GREEN, best_clf_name, Fore.WHITE))
    dict_Discriminators[t + '_BEST_' + best_clf_name] = best_classifier.predict_proba(X)[:, 1]

#******************************************************
# CombinedMVA
#******************************************************
def test_rec2array():
    # scalar fields
    a = np.array([
        (12345, 2., 2.1, True),
        (3, 4., 4.2, False)],
        dtype=[
            ('x', np.int32),
            ('y', np.float32),
            ('z', np.float64),
            ('w', np.bool)])
    arr = rnp.rec2array(a)
    assert_array_equal(arr,
        np.array([
            [12345, 2, 2.1, 1],
            [3, 4, 4.2, 0]]))
    arr = rnp.rec2array(a, fields=['x', 'y'])
    assert_array_equal(arr,
        np.array([
            [12345, 2],
            [3, 4]]))
    # single scalar field
    arr = rnp.rec2array(a, fields=['x'])
    assert_array_equal(arr, np.array([[12345], [3]], dtype=np.int32))
    # single scalar field simplified
    arr = rnp.rec2array(a, fields='x')
    assert_array_equal(arr, np.array([12345, 3], dtype=np.int32))
    # case where array has single record
    assert_equal(rnp.rec2array(a[:1]).shape, (1, 4))
    assert_equal(rnp.rec2array(a[:1], fields=['x']).shape, (1, 1))
    assert_equal(rnp.rec2array(a[:1], fields='x').shape, (1,))

    # array fields
    a = np.array([
        ([1, 2, 3], [4.5, 6, 9.5]),
        ([4, 5, 6], [3.3, 7.5, 8.4])],
        dtype=[
            ('x', np.int32, (3,)),
            ('y', np.float32, (3,))])
    arr = rnp.rec2array(a)
    assert_array_almost_equal(arr,
        np.array([[[1, 4.5],
                   [2, 6],
                   [3, 9.5]],
                  [[4, 3.3],
                   [5, 7.5],
                   [6, 8.4]]]))
    # single array field
    arr = rnp.rec2array(a, fields=['y'])
    assert_array_almost_equal(arr,
        np.array([[[4.5], [6], [9.5]],
                  [[3.3], [7.5], [8.4]]]))
    # single array field simplified
    arr = rnp.rec2array(a, fields='y')
    assert_array_almost_equal(arr,
        np.array([[4.5, 6, 9.5],
                  [3.3, 7.5, 8.4]]))
    # case where array has single record
    assert_equal(rnp.rec2array(a[:1], fields=['y']).shape, (1, 3, 1))
    assert_equal(rnp.rec2array(a[:1], fields='y').shape, (1, 3))

    # lengths mismatch
    a = np.array([
        ([1, 2], [4.5, 6, 9.5]),
        ([4, 5], [3.3, 7.5, 8.4])],
        dtype=[
            ('x', np.int32, (2,)),
            ('y', np.float32, (3,))])
    assert_raises(ValueError, rnp.rec2array, a)

    # mix of scalar and array fields should fail
    a = np.array([
        (1, [4.5, 6, 9.5]),
        (4, [3.3, 7.5, 8.4])],
        dtype=[
            ('x', np.int32),
            ('y', np.float32, (3,))])
    assert_raises(ValueError, rnp.rec2array, a)
def train(self,
          signals,
          backgrounds,
          cuts=None,
          max_sig=None,
          max_bkg=None,
          norm_sig_to_bkg=True,
          same_size_sig_bkg=True,  # NOTE: if True this crops signal a lot!!
          remove_negative_weights=False,
          grid_search=True,
          cv_nfold=5,
          use_cache=True,
          **clf_params):
    """
    Determine best BDTs on left and right partitions.
    Each BDT will then be used on the other partition.
    """
    if use_cache and not self.clfs:
        if self.load():
            return

    signal_recs = []
    signal_arrs = []
    signal_weight_arrs = []
    for signal in signals:
        left, right = signal.partitioned_records(
            category=self.category,
            region=self.region,
            fields=self.all_fields,
            cuts=cuts,
            key=self.partition_key)
        signal_weight_arrs.append((left['weight'], right['weight']))
        signal_arrs.append((rec2array(left, self.fields),
                            rec2array(right, self.fields)))
        signal_recs.append((left, right))

    background_recs = []
    background_arrs = []
    background_weight_arrs = []
    for background in backgrounds:
        left, right = background.partitioned_records(
            category=self.category,
            region=self.region,
            fields=self.all_fields,
            cuts=cuts,
            key=self.partition_key)
        background_weight_arrs.append((left['weight'], right['weight']))
        background_arrs.append((rec2array(left, self.fields),
                                rec2array(right, self.fields)))
        background_recs.append((left, right))

    self.clfs = [None, None]
    for partition_idx in range(2):
        clf_filename = os.path.join(
            CACHE_DIR, 'classify',
            'clf_%s%s_%d' % (self.category.name,
                             self.clf_output_suffix,
                             partition_idx))

        # train a classifier:
        # merge arrays and create training samples
        signal_train = np.concatenate(
            map(itemgetter(partition_idx), signal_arrs))
        signal_weight_train = np.concatenate(
            map(itemgetter(partition_idx), signal_weight_arrs))
        background_train = np.concatenate(
            map(itemgetter(partition_idx), background_arrs))
        background_weight_train = np.concatenate(
            map(itemgetter(partition_idx), background_weight_arrs))

        if remove_negative_weights:
            # remove samples from the training sample with a negative weight
            signal_train = signal_train[signal_weight_train >= 0]
            background_train = background_train[background_weight_train >= 0]
            signal_weight_train = signal_weight_train[signal_weight_train >= 0]
            background_weight_train = background_weight_train[background_weight_train >= 0]

        if max_sig is not None and max_sig < len(signal_train):
            subsample = np.random.permutation(len(signal_train))[:max_sig]
            signal_train = signal_train[subsample]
            signal_weight_train = signal_weight_train[subsample]

        if max_bkg is not None and max_bkg < len(background_train):
            subsample = np.random.permutation(len(background_train))[:max_bkg]
            background_train = background_train[subsample]
            background_weight_train = background_weight_train[subsample]

        if same_size_sig_bkg:
            if len(background_train) > len(signal_train):
                # random subsample of background so it's the same size as signal
                subsample = np.random.permutation(
                    len(background_train))[:len(signal_train)]
                background_train = background_train[subsample]
                background_weight_train = background_weight_train[subsample]
            elif len(background_train) < len(signal_train):
                # random subsample of signal so it's the same size as background
                subsample = np.random.permutation(
                    len(signal_train))[:len(background_train)]
                signal_train = signal_train[subsample]
                signal_weight_train = signal_weight_train[subsample]

        if norm_sig_to_bkg:
            # normalize signal to background
            signal_weight_train *= (
                background_weight_train.sum() / signal_weight_train.sum())

        log.info("Training Samples:")
        log.info("Signal: %d events, %s features" % signal_train.shape)
        log.info("Sum(signal weights): %f" % signal_weight_train.sum())
        log.info("Background: %d events, %s features" % background_train.shape)
        log.info("Sum(background weight): %f" % background_weight_train.sum())
        log.info("Total: %d events" % (
            signal_train.shape[0] + background_train.shape[0]))

        sample_train = np.concatenate((background_train, signal_train))
        sample_weight_train = np.concatenate(
            (background_weight_train, signal_weight_train))
        labels_train = np.concatenate(
            (np.zeros(len(background_train)), np.ones(len(signal_train))))

        if self.standardize:  # TODO use same std for classification
            sample_train = std(sample_train)

        # random permutation of training sample
        perm = np.random.permutation(len(labels_train))
        sample_train = sample_train[perm]
        sample_weight_train = sample_weight_train[perm]
        labels_train = labels_train[perm]

        log.info("training a new classifier...")
        # log.info("plotting input variables as they are given to the BDT")
        # for i, branch in enumerate(self.fields):
        #     ... histogram branch_data for signal and background, weighted
        #     by sample_weight_train, and save to PLOTS_DIR ...
        # log.info("plotting sample weights ...")
        #     ... histogram sample_weight_train for signal and background ...

        if partition_idx == 0:
            # grid search params
            min_leaf_high = int((sample_train.shape[0] / 8) *
                                (cv_nfold - 1.) / cv_nfold)
            min_leaf_low = max(10, int(min_leaf_high / 100.))
            min_leaf_step = max((min_leaf_high - min_leaf_low) / 50, 1)
            max_n_estimators = 200
            min_n_estimators = 1
            n_estimators_step = 50
            min_samples_leaf = range(min_leaf_low, min_leaf_high, min_leaf_step)
            # n_estimators = range(min_n_estimators, max_n_estimators, n_estimators_step)
            n_estimators = np.power(2, np.arange(0, 8))
            grid_params = {
                'base_estimator__min_samples_leaf': min_samples_leaf,
                #'n_estimators': n_estimators
            }
            clf = AdaBoostClassifier(
                DecisionTreeClassifier(),
                learning_rate=.1,
                algorithm='SAMME.R',
                random_state=0)
            grid_clf = BoostGridSearchCV(
                clf, grid_params,
                max_n_estimators=max_n_estimators,
                min_n_estimators=min_n_estimators,
                # can use default ClassifierMixin score
                # score_func=precision_score,
                cv=StratifiedKFold(labels_train, cv_nfold),
                n_jobs=20)
            log.info("")
            log.info("using a %d-fold cross validation" % cv_nfold)
            log.info("performing a grid search over these parameter values:")
            for param, values in grid_params.items():
                log.info('{0} {1}'.format(param.split('__')[-1], values))
            log.info("Minimum number of classifiers: %d" % min_n_estimators)
            log.info("Maximum number of classifiers: %d" % max_n_estimators)
            log.info("")
            log.info("training new classifiers ...")
            grid_clf.fit(sample_train, labels_train,
                         sample_weight=sample_weight_train)
            clf = grid_clf.best_estimator_
            grid_scores = grid_clf.grid_scores_
            log.info("Best score: %f" % grid_clf.best_score_)
            log.info("Best Parameters:")
            log.info(grid_clf.best_params_)
            # plot a grid of the scores
            plot_grid_scores(
                grid_scores,
                best_point={
                    'base_estimator__min_samples_leaf':
                        clf.base_estimator.min_samples_leaf,
                    'n_estimators': clf.n_estimators},
                params={
                    'base_estimator__min_samples_leaf': 'min leaf',
                    'n_estimators': 'trees'},
                name=self.category.name + self.output_suffix + "_%d" % partition_idx)
            # scale up the min-leaf and retrain on the whole set
            min_samples_leaf = clf.base_estimator.min_samples_leaf
            clf = sklearn.clone(clf)
            clf.base_estimator.min_samples_leaf = int(
                min_samples_leaf * cv_nfold / float(cv_nfold - 1))
            clf.fit(sample_train, labels_train,
                    sample_weight=sample_weight_train)
            log.info("After scaling up min_leaf")
            out = StringIO()
            print >> out
            print >> out
            print >> out, clf
            log.info(out.getvalue())
        else:
            # training on the other partition
            log.info("training a new classifier ...")
            # use same params as in first partition
            clf = sklearn.clone(clf)
            out = StringIO()
            print >> out
            print >> out
            print >> out, clf
            log.info(out.getvalue())
            clf.fit(sample_train, labels_train,
                    sample_weight=sample_weight_train)

        if isinstance(clf, AdaBoostClassifier):
            # export to graphviz dot format
            if os.path.isdir(clf_filename):
                shutil.rmtree(clf_filename)
            os.mkdir(clf_filename)
            for itree, tree in enumerate(clf):
                export_graphviz(
                    tree,
                    out_file=os.path.join(
                        clf_filename, 'tree_{0:d}.dot'.format(itree)),
                    feature_names=self.all_fields)

        with open('{0}.pickle'.format(clf_filename), 'w') as f:
            pickle.dump(clf, f)

        print_feature_ranking(clf, self.fields)

        # each classifier is used on the partition opposite to the one it
        # was trained on
        self.clfs[(partition_idx + 1) % 2] = clf
def Optimize(name, X, y, features_array, signal_selection, bkg_selection,
             DumpDiscriminators=False, DumpFile="", Optimization_fraction=0.1,
             train_test_splitting=0.2, verbosity=False):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_test_splitting)
    # hyperparameter optimization runs on a skimmed fraction (default 10%) of the training set
    X_train_skimmed = np.asarray([X_train[i] for i in range(len(X_train)) if i % int(1. / Optimization_fraction) == 0])
    y_train_skimmed = np.asarray([y_train[i] for i in range(len(y_train)) if i % int(1. / Optimization_fraction) == 0])

    Classifiers = {}
    grid_verbose = 3 if verbosity else 0

    #
    # GBC
    #
    log.info('%s %s %s: Starting to process %s Gradient Boosting Classifier %s' % (Fore.GREEN, name, Fore.WHITE, Fore.BLUE, Fore.WHITE))

    gbc_parameters = {'n_estimators': [50, 100, 200],
                      'max_depth': [5, 10, 15],
                      'min_samples_split': [int(0.005 * len(X_train_skimmed)), int(0.01 * len(X_train_skimmed))],
                      'learning_rate': [0.05, 0.1]}
    gbc_clf = GridSearchCV(GradientBoostingClassifier(), gbc_parameters, n_jobs=-1, verbose=grid_verbose, cv=2)
    gbc_clf.fit(X_train_skimmed, y_train_skimmed)

    gbc_best_clf = gbc_clf.best_estimator_
    if verbosity:
        log.info('Parameters of the best classifier: %s' % str(gbc_best_clf.get_params()))
    gbc_best_clf.verbose = 2
    gbc_best_clf.fit(X_train, y_train)
    gbc_disc = gbc_best_clf.predict_proba(X_test)[:, 1]
    gbc_fpr, gbc_tpr, gbc_thresholds = roc_curve(y_test, gbc_disc)

    Classifiers["GBC"] = (gbc_best_clf, y_test, gbc_disc, gbc_fpr, gbc_tpr, gbc_thresholds)

    #
    # Random Forest
    #
    log.info('%s %s %s: Starting to process %s Random Forest Classifier %s' % (Fore.GREEN, name, Fore.WHITE, Fore.BLUE, Fore.WHITE))

    rf_parameters = {'n_estimators': [50, 100, 200],
                     'max_depth': [5, 10, 15],
                     'min_samples_split': [int(0.005 * len(X_train_skimmed)), int(0.01 * len(X_train_skimmed))],
                     'max_features': ["sqrt", "log2", 0.5]}
    rf_clf = GridSearchCV(RandomForestClassifier(n_jobs=5), rf_parameters, n_jobs=-1, verbose=grid_verbose, cv=2)
    rf_clf.fit(X_train_skimmed, y_train_skimmed)

    rf_best_clf = rf_clf.best_estimator_
    if verbosity:
        log.info('Parameters of the best classifier: %s' % str(rf_best_clf.get_params()))
    rf_best_clf.verbose = 2
    rf_best_clf.fit(X_train, y_train)
    rf_disc = rf_best_clf.predict_proba(X_test)[:, 1]
    rf_fpr, rf_tpr, rf_thresholds = roc_curve(y_test, rf_disc)

    Classifiers["RF"] = (rf_best_clf, y_test, rf_disc, rf_fpr, rf_tpr, rf_thresholds)

    #
    # Stochastic Gradient Descent
    #
    log.info('%s %s %s: Starting to process %s Stochastic Gradient Descent %s' % (Fore.GREEN, name, Fore.WHITE, Fore.BLUE, Fore.WHITE))

    # only losses that provide predict_proba ('log', 'modified_huber') are scanned
    sgd_parameters = {'loss': ['log', 'modified_huber'],
                      'penalty': ['l2', 'l1', 'elasticnet'],
                      'alpha': [0.0001, 0.00005, 0.001],
                      'n_iter': [10, 50, 100]}
    sgd_clf = GridSearchCV(SGDClassifier(learning_rate='optimal'), sgd_parameters, n_jobs=-1, verbose=grid_verbose, cv=2)
    sgd_clf.fit(X_train_skimmed, y_train_skimmed)

    sgd_best_clf = sgd_clf.best_estimator_
    if verbosity:
        log.info('Parameters of the best classifier: %s' % str(sgd_best_clf.get_params()))
    sgd_best_clf.verbose = 2
    sgd_best_clf.fit(X_train, y_train)
    sgd_disc = sgd_best_clf.predict_proba(X_test)[:, 1]
    sgd_fpr, sgd_tpr, sgd_thresholds = roc_curve(y_test, sgd_disc)

    Classifiers["SGD"] = (sgd_best_clf, y_test, sgd_disc, sgd_fpr, sgd_tpr, sgd_thresholds)

    #
    # Nearest Neighbors
    #
    log.info('%s %s %s: Starting to process %s Nearest Neighbors %s' % (Fore.GREEN, name, Fore.WHITE, Fore.BLUE, Fore.WHITE))

    knn_parameters = {'n_neighbors': [5, 10, 50, 100],
                      'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                      'leaf_size': [20, 30, 40],
                      'metric': ['euclidean', 'minkowski', 'manhattan', 'chebyshev']}
    knn_clf = GridSearchCV(KNeighborsClassifier(), knn_parameters, n_jobs=-1, verbose=grid_verbose, cv=2)
    knn_clf.fit(X_train_skimmed, y_train_skimmed)

    knn_best_clf = knn_clf.best_estimator_
    if verbosity:
        log.info('Parameters of the best classifier: %s' % str(knn_best_clf.get_params()))
    knn_best_clf.verbose = 2
    knn_best_clf.fit(X_train, y_train)
    knn_disc = knn_best_clf.predict_proba(X_test)[:, 1]
    knn_fpr, knn_tpr, knn_thresholds = roc_curve(y_test, knn_disc)

    Classifiers["kNN"] = (knn_best_clf, y_test, knn_disc, knn_fpr, knn_tpr, knn_thresholds)

    #
    # Naive Bayes (Likelihood Ratio)
    #
    log.info('%s %s %s: Starting to process %s Naive Bayes (Likelihood Ratio) %s' % (Fore.GREEN, name, Fore.WHITE, Fore.BLUE, Fore.WHITE))

    nb_best_clf = GaussianNB()  # there is nothing to tune for a simple likelihood ratio
    if verbosity:
        log.info('Parameters of the best classifier: A simple likelihood ratio has no parameters to be tuned!')
    nb_best_clf.verbose = 2
    nb_best_clf.fit(X_train, y_train)
    nb_disc = nb_best_clf.predict_proba(X_test)[:, 1]
    nb_fpr, nb_tpr, nb_thresholds = roc_curve(y_test, nb_disc)

    Classifiers["NB"] = (nb_best_clf, y_test, nb_disc, nb_fpr, nb_tpr, nb_thresholds)

    #
    # Multi-Layer Perceptron (Neural Network)
    #
    log.info('%s %s %s: Starting to process %s Multi-Layer Perceptron (Neural Network) %s' % (Fore.GREEN, name, Fore.WHITE, Fore.BLUE, Fore.WHITE))

    mlp_parameters = {'activation': ['tanh', 'relu'],
                      'hidden_layer_sizes': [10, (5, 10), (10, 15)],
                      'algorithm': ['adam'],
                      'alpha': [0.0001, 0.00005],
                      'tol': [0.00001, 0.00005, 0.0001],
                      'learning_rate_init': [0.001, 0.005, 0.0005]}
    mlp_clf = GridSearchCV(MLPClassifier(max_iter=500), mlp_parameters, n_jobs=-1, verbose=grid_verbose, cv=2)  # learning_rate='adaptive'
    mlp_clf.fit(X_train_skimmed, y_train_skimmed)

    mlp_best_clf = mlp_clf.best_estimator_
    if verbosity:
        log.info('Parameters of the best classifier: %s' % str(mlp_best_clf.get_params()))
    mlp_best_clf.verbose = 2
    mlp_best_clf.fit(X_train, y_train)
    mlp_disc = mlp_best_clf.predict_proba(X_test)[:, 1]
    mlp_fpr, mlp_tpr, mlp_thresholds = roc_curve(y_test, mlp_disc)

    Classifiers["MLP"] = (mlp_best_clf, y_test, mlp_disc, mlp_fpr, mlp_tpr, mlp_thresholds)

    #
    # Support Vector Machine
    #
    log.info('%s %s %s: Starting to process %s Support Vector Machine %s' % (Fore.GREEN, name, Fore.WHITE, Fore.BLUE, Fore.WHITE))

    svm_parameters = {'kernel': ['rbf'], 'gamma': ['auto', 0.05], 'C': [0.9, 1.0]}
    svm_clf = GridSearchCV(SVC(probability=True), svm_parameters, n_jobs=-1, verbose=grid_verbose, cv=2)
    svm_clf.fit(X_train_skimmed, y_train_skimmed)

    svm_best_clf = svm_clf.best_estimator_
    if verbosity:
        log.info('Parameters of the best classifier: %s' % str(svm_best_clf.get_params()))
    svm_best_clf.verbose = 2
    # the refit on the full training set was left commented out in the original,
    # presumably because SVC training scales poorly; the grid-search estimator
    # (trained on the skimmed set) is evaluated directly
    #svm_best_clf.fit(X_train, y_train)
    svm_disc = svm_best_clf.predict_proba(X_test)[:, 1]
    svm_fpr, svm_tpr, svm_thresholds = roc_curve(y_test, svm_disc)

    Classifiers["SVM"] = (svm_best_clf, y_test, svm_disc, svm_fpr, svm_tpr, svm_thresholds)

    if DumpDiscriminators:
        XX = rootnp.root2array(DumpFile, 'tree', features_array, None, 0, None, None, False, 'weight')
        XX = rootnp.rec2array(XX)

        ordered_MVAs = ['GBC', 'RF', 'SVM', 'SGD', 'kNN', 'NB', 'MLP']
        dict_Discriminators = {}
        for c in ordered_MVAs:
            classifier = Classifiers[c][0]
            dict_Discriminators[name + '_' + c] = classifier.predict_proba(XX)[:, 1]

        inputfile = ROOT.TFile(DumpFile)
        inputtree = inputfile.Get('tree')
        inputtree.SetBranchStatus("*", 1)
        branch_list = inputtree.GetListOfBranches()
        branch_name_list = [d.GetName() for d in branch_list]
        # deactivate discriminator branches that already exist so they are rewritten
        for mva in ordered_MVAs:
            branch_name = name + "_" + mva
            if branch_name in branch_name_list:
                inputtree.SetBranchStatus(branch_name, 0)

        newfile = ROOT.TFile(DumpFile.split('.root')[0] + '_tmp.root', 'RECREATE')
        newtree = inputtree.CloneTree(0)
        dict_Leaves = {}
        for mva in ordered_MVAs:
            branch_name = name + "_" + mva
            dict_Leaves[branch_name] = array('d', [0])
            newtree.Branch(branch_name, dict_Leaves[branch_name], branch_name + "/D")

        log.info('%s: Starting to process the output tree' % name)
        nEntries = inputtree.GetEntries()
        for i in range(nEntries):
            if i % 1000 == 0:
                log.info('Processing event %s/%s (%s%.2f%s%%)' % (i, nEntries, Fore.GREEN, 100 * float(i) / float(nEntries), Fore.WHITE))
            inputtree.GetEntry(i)
            for key, value in dict_Discriminators.iteritems():
                dict_Leaves[key][0] = value[i]
            newtree.Fill()

        newtree.Write()
        newfile.Close()
        inputfile.Close()
        os.system('cp %s %s' % (DumpFile.split('.root')[0] + '_tmp.root', DumpFile))
        os.system('rm %s' % (DumpFile.split('.root')[0] + '_tmp.root'))
        log.info('Done: output file dumped in %s' % DumpFile)

    return Classifiers
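A minimal usage sketch for Optimize(), assuming the module-level imports it relies on (sklearn, colorama's Fore, the log object) are in place; X_toy, y_toy and the feature names are hypothetical stand-ins, not taken from the original analysis, and no ROOT dumping is done.

# hypothetical invocation of Optimize() on a small synthetic dataset
import numpy as np
from sklearn.metrics import roc_auc_score

X_toy = np.random.normal(size=(2000, 4))
y_toy = (X_toy[:, 0] + 0.5 * X_toy[:, 1] > 0).astype(int)

classifiers = Optimize("toy", X_toy, y_toy,
                       features_array=["f0", "f1", "f2", "f3"],
                       signal_selection="", bkg_selection="",
                       DumpDiscriminators=False, verbosity=False)
# each entry holds (estimator, y_test, discriminator, fpr, tpr, thresholds)
for mva, (clf, y_test, disc, fpr, tpr, thresholds) in classifiers.items():
    print mva, "AUC =", roc_auc_score(y_test, disc)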
def run(source, quick=False):
    print time.asctime(time.localtime()), "Copying datasets"

    branch_names = []
    with open(source, 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for row in reader:
            branch_names.append(row[0])

    # in quick mode only read every 100th entry
    step_size = 100 if quick else 1

    # only the Bs2phiphi selection is configured; other sources would need their own inputs here
    if source == "BDTvarBs2phiphi.csv":
        myselection = "B_s0_MM > 5486.77"
        backgr = root2array("/net/storage03/data/users/dlafferty/NTuples/data/2012/combined/Bs2phiphi_data_2012_corrected_TupleA.root",
                            "DecayTree", branch_names, myselection, step=step_size)
        backgr = rec2array(backgr)
        signal = root2array("/net/storage03/data/users/dlafferty/NTuples/SignalMC/2012/combined/Bs2phiphi_MC_2012_combined_corrected_TupleA.root",
                            "DecayTree", branch_names, step=step_size)
        signal = rec2array(signal)

    # data contains every data point (later split into development and evaluation)
    data = np.concatenate((signal, backgr))
    # output contains the binary class of each data point
    output = np.concatenate((np.ones(signal.shape[0]), np.zeros(backgr.shape[0])))

    frac = 0.5
    data_dev, data_eval, output_dev, output_eval = train_test_split(data, output, test_size=0.33, random_state=492)
    data_train, data_test, output_train, output_test = train_test_split(data_dev, output_dev, test_size=frac, random_state=42)

    joblib.dump(branch_names, 'pickle/variables.pkl')

    print time.asctime(time.localtime()), "Data + MC contains", len(data), "entries. Training on", len(data_train), "entries"
    print time.asctime(time.localtime()), "Monte Carlo (signal) contains", len(signal), "entries"

    # in quick mode the pickles get a 'q' suffix so they do not overwrite the full datasets
    suffix = 'q' if quick else ''
    joblib.dump(signal, 'pickle/all_signal%s.pkl' % suffix)
    joblib.dump(data, 'pickle/all_data%s.pkl' % suffix)
    joblib.dump(data_dev, 'pickle/datadev%s.pkl' % suffix)
    joblib.dump(data_eval, 'pickle/dataev%s.pkl' % suffix)
    joblib.dump(output_dev, 'pickle/outputdev%s.pkl' % suffix)
    joblib.dump(output_eval, 'pickle/outputev%s.pkl' % suffix)
    joblib.dump(data_train, 'pickle/data%s.pkl' % suffix)
    joblib.dump(data_test, 'pickle/datatest%s.pkl' % suffix)
    joblib.dump(output_train, 'pickle/output%s.pkl' % suffix)
    joblib.dump(output_test, 'pickle/outputtest%s.pkl' % suffix)

    print time.asctime(time.localtime()), "Datasets produced!"
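A hedged sketch of how a downstream script might reload the splits written by run(); the file names mirror the non-quick dumps above, and the joblib import path is an assumption that depends on the local setup.

# hypothetical loader for the pickles produced by run() (non-quick mode)
import joblib  # or sklearn.externals.joblib, depending on which one run() used

branch_names = joblib.load('pickle/variables.pkl')
data_train = joblib.load('pickle/data.pkl')
output_train = joblib.load('pickle/output.pkl')
print "loaded", len(data_train), "training entries,", len(branch_names), "branches"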
parser.add_argument('--pickEvery', type=int, default=10, help='pick one element every ...')
args = parser.parse_args()

if args.batch:
    ROOT.gROOT.SetBatch(True)

features = general + vertex + leptons

filename = "./TTjets.root"
treename = "tree"
File = TFile(filename)
tree = File.Get(treename)

# read the feature branches into a numpy record array and flatten it to a 2D array
treeArray = rootnp.root2array(filename, treename, features, None, 0, args.element_per_sample, args.pickEvery, False, 'weight')
X = rootnp.rec2array(treeArray)

flavours = rootnp.root2array(filename, treename, "flavour", None, 0, args.element_per_sample, args.pickEvery, False, 'weight')
y = np.ones(len(flavours))

assert args.signal in ("C", "B", "DUSG"), "Invalid signal flavour: " + args.signal + ", must be C, B or DUSG"
signalselection = ""
bckgrselection = ""
if args.signal == "C":
    for i, fl in enumerate(flavours):
        y[i] = 1 if abs(fl) == 4 else 0
    signalselection = "flavour == 4"
    assert args.bkg in ("DUSG", "B"), "Invalid background flavour: " + args.bkg + ", must be either DUSG or B for signal flavour: " + args.signal
    if args.bkg == "DUSG":
        bckgrselection = "flavour != 4 && flavour != 5"
    elif args.bkg == "B":
        bckgrselection = "flavour == 5"
elif args.signal == "B":
    for i, fl in enumerate(flavours):
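The per-event label loop above can be collapsed into one vectorized numpy expression; a sketch for the signal == "C" case, assuming flavours is a 1D array of jet-flavour codes where charm is 4.

# vectorized equivalent of the label loop for signal == "C":
# charm jets (|flavour| == 4) are signal, everything else is background
y = (np.abs(flavours) == 4).astype(float)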
def BestClassifier(Classifiers, FoM, typ_name='', features_array=[], signal_selection='', bkg_selection='', DumpDiscriminators=False, DumpFile=""):
    """
    Goal: select from a set of classifier dictionaries (containing the name, object, discriminators, tpr, ...) the best one based on the Figure of Merit FoM
    returns: name_of_best_clf, best_clf_object
    """
    assert FoM in ('AUC', 'OOP', 'ACC', 'PUR'), "Invalid Figure of Merit: " + FoM

    FoM_tmp = {}
    for name, clf in Classifiers.items():
        y_true = clf[1]
        disc = clf[2]
        fpr = clf[3]
        tpr = clf[4]
        thres = clf[5]
        disc_s = disc[y_true == 1]
        disc_b = disc[y_true == 0]
        tp = [len(disc_s[disc_s >= t]) for t in thres]
        fp = [len(disc_b[disc_b >= t]) for t in thres]
        tn = [len(disc_b[disc_b < t]) for t in thres]
        fn = [len(disc_s[disc_s < t]) for t in thres]

        if FoM == 'AUC':
            # Area under the ROC curve
            FoM_tmp[name] = roc_auc_score(y_true, disc)
        elif FoM == 'OOP':
            # Optimal Operating Point: closest approach to (fpr, tpr) = (0, 1)
            dist = [math.sqrt((i - 1) ** 2 + (j - 0) ** 2) for i, j in zip(tpr, fpr)]
            FoM_tmp[name] = 1 - min(dist)
        elif FoM == 'PUR':
            # Purity at the working point with signal efficiency closest to atEff
            atEff = 0.5
            pur = [float(i) / float(i + j) if (i + j != 0) else 0 for i, j in zip(tp, fp)]
            val, dx = min((val, dx) for (dx, val) in enumerate([abs(atEff - i) for i in tpr]))
            FoM_tmp[name] = pur[dx]
        elif FoM == 'ACC':
            # Accuracy at the working point with signal efficiency closest to atEff
            # (the original looked up dx here without defining it in this branch)
            atEff = 0.5
            val, dx = min((val, dx) for (dx, val) in enumerate([abs(atEff - i) for i in tpr]))
            Acc = [float(i + j) / float(i + j + k + l) if (i + j + k + l != 0) else 0 for i, j, k, l in zip(tp, tn, fp, fn)]
            FoM_tmp[name] = Acc[dx]

    best_mva_name = max(FoM_tmp.iteritems(), key=itemgetter(1))[0]

    if DumpDiscriminators:
        # the original always ranked by the AUC dictionary here, which crashes for
        # any other FoM because only the chosen FoM dictionary is filled; rank by
        # FoM_tmp instead
        XX = rootnp.root2array(DumpFile, 'tree', features_array, None, 0, None, None, False, 'weight')
        XX = rootnp.rec2array(XX)

        dict_Discriminators = {}
        classifier = Classifiers[best_mva_name][0]
        dict_Discriminators[typ_name + '_BEST_' + best_mva_name] = classifier.predict_proba(XX)[:, 1]

        inputfile = ROOT.TFile(DumpFile)
        inputtree = inputfile.Get('tree')
        inputtree.SetBranchStatus("*", 1)
        branch_list = inputtree.GetListOfBranches()
        branch_name_list = [d.GetName() for d in branch_list]
        branch_name = typ_name + '_BEST_'
        if any([branch_name in s for s in branch_name_list]):
            inputtree.SetBranchStatus(branch_name + "*", 0)

        newfile = ROOT.TFile(DumpFile.split('.root')[0] + '_tmp.root', 'RECREATE')
        newtree = inputtree.CloneTree(0)

        dict_Leaves = {}
        branch_name = typ_name + '_BEST_' + best_mva_name
        dict_Leaves[branch_name] = array('d', [0])
        newtree.Branch(branch_name, dict_Leaves[branch_name], branch_name + "/D")

        log.info('%s: Starting to process the output tree' % typ_name)
        nEntries = inputtree.GetEntries()
        for i in range(nEntries):
            if i % 1000 == 0:
                log.info('Processing event %s/%s (%s%.2f%s%%)' % (i, nEntries, Fore.GREEN, 100 * float(i) / float(nEntries), Fore.WHITE))
            inputtree.GetEntry(i)
            for key, value in dict_Discriminators.iteritems():
                dict_Leaves[key][0] = value[i]
            newtree.Fill()

        newtree.Write()
        newfile.Close()
        inputfile.Close()
        os.system('cp %s %s' % (DumpFile.split('.root')[0] + '_tmp.root', DumpFile))
        os.system('rm %s' % (DumpFile.split('.root')[0] + '_tmp.root'))
        log.info('Done: output file dumped in %s' % DumpFile)

    return best_mva_name, Classifiers[best_mva_name][0]
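A hedged sketch of how Optimize() and BestClassifier() chain together, reusing the hypothetical X_toy and y_toy arrays from the Optimize sketch above; no tree dumping is performed.

# hypothetical: train all MVAs, then pick the one with the largest ROC AUC
classifiers = Optimize("toy", X_toy, y_toy, features_array=[],
                       signal_selection="", bkg_selection="")
best_name, best_clf = BestClassifier(classifiers, 'AUC')
print "best MVA by AUC:", best_name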
log.info("processing file %s for training" % fname) with io.root_open(fname) as tfile: match = fname_regex.match(fname) if not match: raise ValueError("Could not match the regex to the file %s" % fname) flavor = match.group("flavor") full_category = match.group("category") category = [i for i in sv_categories if i in full_category][0] # if flavor == 'C': # log.info('Jet_flavour %s is not considered signal or background in this training and is omitted' % flavor) # continue nfiles_per_sample = None skip_n_events = 2 # put this to 1 to include all the events tree = rootnp.root2array(fname, "ttree", variables, None, 0, nfiles_per_sample, skip_n_events, False, "weight") tree = rootnp.rec2array(tree) X = np.concatenate((X, tree), 0) if flavor == "B": y = np.concatenate((y, np.ones(tree.shape[0]))) weight_B = np.empty(tree.shape[0]) weight_B.fill(2) weights_flavour = np.concatenate((weights_flavour, weight_B)) elif flavor == "C": y = np.concatenate((y, np.zeros(tree.shape[0]))) weight_C = np.empty(tree.shape[0]) weight_C.fill(1) weights_flavour = np.concatenate((weights_flavour, weight_C)) else: y = np.concatenate((y, np.zeros(tree.shape[0]))) weight_DUSG = np.empty(tree.shape[0]) weight_DUSG.fill(3)
def draw_array_helper(self, field_hist, category, region, cuts=None,
                      weighted=True, field_scale=None, weight_hist=None,
                      scores=None, clf=None, min_score=None, max_score=None,
                      systematic='NOMINAL', bootstrap_data=False):
    from .data import Data, DataInfo

    all_fields = []
    classifiers = []
    for f in field_hist.iterkeys():
        if isinstance(f, basestring):
            all_fields.append(f)
        elif isinstance(f, Classifier):
            classifiers.append(f)
        else:
            all_fields.extend(list(f))
    if len(classifiers) > 1:
        raise RuntimeError("more than one classifier in fields is not supported")
    elif len(classifiers) == 1:
        classifier = classifiers[0]
    else:
        classifier = None

    if isinstance(self, Data) and bootstrap_data:
        log.info("using bootstrapped data")
        analysis = bootstrap_data
        recs = []
        scores = []
        for s in analysis.backgrounds:
            rec = s.merged_records(category, region, fields=all_fields,
                                   cuts=cuts, include_weight=True,
                                   clf=clf, systematic=systematic)
            recs.append(rec)
        b_rec = stack(recs, fields=all_fields + ['classifier', 'weight'])
        s_rec = analysis.higgs_125.merged_records(category, region,
                                                  fields=all_fields, cuts=cuts,
                                                  include_weight=True, clf=clf,
                                                  systematic=systematic)

        # handle negative weights separately
        b_neg = b_rec[b_rec['weight'] < 0]
        b_pos = b_rec[b_rec['weight'] >= 0]

        def bootstrap(rec):
            prob = np.abs(rec['weight'])
            prob = prob / prob.sum()
            # random sample without replacement
            log.warning(str(int(round(abs(rec['weight'].sum())))))
            sample_idx = np.random.choice(
                rec.shape[0],
                size=int(round(abs(rec['weight'].sum()))),
                replace=False, p=prob)
            return rec[sample_idx]

        rec = stack([bootstrap(b_neg), bootstrap(b_pos), bootstrap(s_rec)],
                    fields=all_fields + ['classifier', 'weight'])
        rec['weight'][:] = 1.
        scores = rec['classifier']

    else:
        # TODO: only get unblinded vars
        rec = self.merged_records(category, region, fields=all_fields,
                                  cuts=cuts, include_weight=True,
                                  clf=classifier, systematic=systematic)

    if isinstance(scores, tuple):
        # sanity
        #assert (scores[1] == rec['weight']).all()
        # ignore the score weights since they should be the same as the rec weights
        scores = scores[0]

    if weight_hist is not None and scores is not None:
        log.warning("applying a weight histogram")
        edges = np.array(list(weight_hist.xedges()))
        # handle strange cases
        edges[0] -= 1E10
        edges[-1] += 1E10
        weights = np.array(list(weight_hist.y())).take(edges.searchsorted(scores) - 1)
        weights = rec['weight'] * weights
    else:
        weights = rec['weight']

    if scores is not None:
        if min_score is not None:
            idx = scores > min_score
            rec = rec[idx]
            weights = weights[idx]
            scores = scores[idx]
        if max_score is not None:
            idx = scores < max_score
            rec = rec[idx]
            weights = weights[idx]
            scores = scores[idx]

    for fields, hist in field_hist.items():
        if isinstance(fields, Classifier):
            fields = ['classifier']
        # fields can be a single field or list of fields
        elif not isinstance(fields, (list, tuple)):
            fields = [fields]
        if hist is None:
            # this var might be blinded
            continue
        # defensive copy
        if isinstance(fields, tuple):
            # select columns in numpy recarray with a list
            fields = list(fields)
        arr = np.copy(rec[fields])
        if field_scale is not None:
            for field in fields:
                if field in field_scale:
                    arr[field] *= field_scale[field]
        # convert to array
        arr = rec2array(arr, fields=fields)
        # HACK HACK HACK
        _weights = weights
        if fields == ['dEta_jets']:
            log.warning("HACK HACK")
            nonzero = arr > 0
            arr = arr[nonzero]
            _weights = weights[nonzero]
        # include the scores if the histogram dimensionality allows
        if scores is not None and hist.GetDimension() == len(fields) + 1:
            arr = np.c_[arr, scores]
        elif hist.GetDimension() != len(fields):
            raise TypeError(
                'histogram dimensionality does not match '
                'number of fields: %s' % (', '.join(fields)))
        hist.fill_array(arr, weights=_weights)
        if isinstance(self, Data):
            if hasattr(hist, 'datainfo'):
                hist.datainfo += self.info
            else:
                hist.datainfo = DataInfo(self.info.lumi, self.info.energies)
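A standalone sketch of the weight-histogram lookup used above: each score is mapped to its bin in the edge array with searchsorted, and the bin content is picked up as a per-event weight. The numbers are toy values, not from the analysis.

# toy demonstration of the searchsorted bin-weight lookup
import numpy as np

edges = np.array([0.0, 0.25, 0.5, 0.75, 1.0])
bin_weights = np.array([1.1, 0.9, 1.0, 1.3])  # one weight per bin
scores = np.array([0.1, 0.6, 0.8])
per_event = bin_weights.take(edges.searchsorted(scores) - 1)
print per_event  # [ 1.1  1.   1.3]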
for bootstrap_idx in range(100):
    sys.stdout.write("bootstrap {0} ...\r".format(bootstrap_idx))
    sys.stdout.flush()

    # resample with replacement
    # http://docs.scipy.org/doc/numpy-dev/reference/generated/numpy.random.choice.html
    sample_idx = np.random.choice(len(array), size=len(array), replace=True)
    array_bootstrapped = array[sample_idx]

    # convert back to a TTree and write it out
    tree_bootstrapped = array2tree(
        array_bootstrapped,
        name='bootstrap_{0}'.format(bootstrap_idx))
    tree_bootstrapped.Write()
    tree_bootstrapped.Delete()

    # fill the ROOT histogram with the numpy array
    hist.Reset()
    fill_hist(hist, rec2array(array_bootstrapped))
    hist.Draw()
    hist.xaxis.title = 'x'
    hist.yaxis.title = 'y'
    hist.zaxis.title = 'Events'
    hist.xaxis.limits = (-2.5, 2.5)
    hist.yaxis.limits = (-2.5, 2.5)
    hist.zaxis.range_user = (0, 60)
    hist.xaxis.divisions = 5
    hist.yaxis.divisions = 5
    hist.zaxis.divisions = 5
    canvas.Print('bootstrap.gif+50')

# loop the gif
canvas.Print('bootstrap.gif++')
output.Close()
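A hedged, pure-numpy sketch of what the resampling loop above achieves: the spread of a statistic across bootstrap replicas estimates its statistical uncertainty. The dataset and replica count are toy choices.

# toy bootstrap: the std of the resampled means estimates the uncertainty on the mean
import numpy as np

rng = np.random.RandomState(0)
data = rng.normal(size=1000)
means = [data[rng.choice(len(data), size=len(data), replace=True)].mean()
         for _ in range(100)]
print "bootstrap mean = %.3f +/- %.3f" % (np.mean(means), np.std(means))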