def root2panda(files_path, tree_name, mask=False, **kwargs):
    '''
    Read a tree from every ROOT file matching a glob pattern into a DataFrame.

    Args:
    -----
        files_path: a glob pattern, e.g. './data/*.root'
        tree_name: name of the tree to read, e.g. 'Collection_Tree'; pass ''
                   to call root2rec without a tree name
        mask: if True, build the DataFrame from ``ss.data`` instead of ``ss``
        kwargs: forwarded unchanged to root2rec (e.g. branches)

    Returns:
    --------
        a pandas DataFrame built from the stacked per-file record arrays

    Note:
    -----
        if the .root files contain different branches you might have to
        mask the data; in that case pd.DataFrame(ss.data) is returned
    '''
    files = glob.glob(files_path)

    # -- an empty tree_name means "let root2rec pick the tree"
    if tree_name == '':
        records = [root2rec(fpath, **kwargs) for fpath in files]
    else:
        records = [root2rec(fpath, tree_name, **kwargs) for fpath in files]
    ss = stack_arrays(records)

    if mask:
        return pd.DataFrame(ss.data)

    # -- best effort: fall back to the raw data if the stacked array itself
    # -- cannot be converted ("except X as e" form works on Python 2.6+ and 3)
    try:
        return pd.DataFrame(ss)
    except Exception:
        return pd.DataFrame(ss.data)
def readFiles():
    """
    Load training inputs, per-event weights and class targets.

    Reads the 'full_weight' branch for the signal and background file lists,
    rescales the background weights so that their sum equals the signal sum,
    and reads the training variables for signal followed by background.

    Returns:
        (arrSB, fullWeight, targets): feature matrix with one row per event,
        concatenated weights, and targets (1 for signal, 0 for background).
    """
    print('Reading files...')

    def _full_weights(paths):
        # 'full_weight' column of the selected events in `paths`
        rec = root2rec(paths, treename='tree', branches=['full_weight'], selection=selection)
        return rec['full_weight']

    sig_weights = _full_weights(files_signal)
    bkg_weights = _full_weights(files_bg)

    sig_total = np.sum(sig_weights)
    bkg_total = np.sum(bkg_weights)
    # normalise the background to the same total weight as the signal
    bkg_weights = bkg_weights * sig_total/bkg_total

    n_sig = len(sig_weights)
    n_bkg = len(bkg_weights)
    fullWeight = np.concatenate((sig_weights, bkg_weights))

    structured = root2array(files_signal + files_bg, treename='tree', branches=trainVars(), selection=selection)
    # sklearn needs a 2-D matrix rather than a 1-D structured array
    arrSB = np.asarray([structured[var] for var in trainVars()]).T

    targets = np.concatenate((np.ones(n_sig), np.zeros(n_bkg)))

    print('Done reading files.')
    return arrSB, fullWeight, targets
def root2panda(files_path, tree_name, mask=False, **kwargs):
    '''
    Read a tree from every ROOT file matching a glob pattern into a DataFrame.

    Args:
    -----
        files_path: a glob pattern, e.g. './data/*.root'
        tree_name: name of the tree to read, e.g. 'Collection_Tree'; pass ''
                   to call root2rec without a tree name
        mask: if True, build the DataFrame from ``ss.data`` instead of ``ss``
        kwargs: forwarded unchanged to root2rec (e.g. branches)

    Returns:
    --------
        a pandas DataFrame built from the stacked per-file record arrays

    Note:
    -----
        if the .root files contain different branches you might have to
        mask the data; in that case pd.DataFrame(ss.data) is returned
    '''
    files = glob.glob(files_path)

    # -- an empty tree_name means "let root2rec pick the tree"
    if tree_name == '':
        records = [root2rec(fpath, **kwargs) for fpath in files]
    else:
        records = [root2rec(fpath, tree_name, **kwargs) for fpath in files]
    ss = stack_arrays(records)

    if mask:
        return pd.DataFrame(ss.data)

    # -- best effort: fall back to the raw data if the stacked array itself
    # -- cannot be converted (no Python-2-only "except X, e" syntax)
    try:
        return pd.DataFrame(ss)
    except Exception:
        return pd.DataFrame(ss.data)
def test_slice():
    """Check that start/stop select the expected window of single1.root."""
    # (root2rec keyword arguments, expected length, expected last n_int)
    cases = [
        (dict(stop=10), 10, 10),
        (dict(stop=11, start=1), 10, 11),
        (dict(stop=105, start=95), 5, 100),
    ]
    for window, expected_len, expected_last in cases:
        a = rnp.root2rec(load('single1.root'), **window)
        assert_equal(len(a), expected_len)
        assert_equal(a.n_int[-1], expected_last)
def test_selection_and_expression():
    """The number of selected rows must not depend on the requested branches."""
    def n_selected(branches):
        # row count of test.root after the 'z>0' selection
        return len(rnp.root2rec(
            load('test.root'), branches=branches, selection='z>0'))

    ref = n_selected(['x', 'y'])
    for branches in (['x', 'y', 'z'], ['x', 'x*y'], ['x', 'x*z']):
        assert_equal(ref, n_selected(branches))
def root2panda(file_paths, tree_name, **kwargs):
    '''
    Read a tree from one or more sets of ROOT files into a DataFrame.

    Args:
    -----
        file_paths: either a single glob pattern such as './data/*.root',
                    or an iterable of such patterns
        tree_name: name of the tree to read, e.g. 'Collection_Tree'
        kwargs: forwarded unchanged to root2rec (e.g. branches)

    Returns:
    --------
        a pandas DataFrame built from the stacked per-file record arrays

    Note:
    -----
        if the stacked array cannot be converted directly (for example when
        the files contain different branches), pd.DataFrame(ss.data) is
        returned instead
    '''
    try:
        string_types = basestring  # Python 2: matches both str and unicode
    except NameError:
        string_types = str  # Python 3 has no basestring

    if isinstance(file_paths, string_types):
        files = glob.glob(file_paths)
    else:
        # expand every pattern, preserving the order of file_paths
        files = [matched_f for f in file_paths for matched_f in glob.glob(f)]

    ss = stack_arrays([root2rec(fpath, tree_name, **kwargs) for fpath in files])
    try:
        return pd.DataFrame(ss)
    except Exception:
        return pd.DataFrame(ss.data)
def test_variable_length_arrays():
    """Check dtype, per-row lengths/shapes and a few values of vary*.root."""
    a = rnp.root2rec(load(['vary1.root', 'vary2.root']))

    length_fields = ['len_n', 'len_f', 'len_d']
    object_fields = [
        'n_char', 'n_uchar', 'n_short', 'n_ushort', 'n_int', 'n_uint',
        'n_long', 'n_ulong', 'f_float', 'd_double',
        'n2_int', 'f2_float', 'd2_double']
    expected_dtype = ([(name, '<i4') for name in length_fields] +
                      [(name, 'O') for name in object_fields])
    assert_equal(a.dtype, expected_dtype)

    # check lengths: (length column, 1-D column, 2-D column, 2-D width)
    columns = (
        (a.len_n, a.n_int, a.n2_int, 2),
        (a.len_f, a.f_float, a.f2_float, 3),
        (a.len_d, a.d_double, a.d2_double, 4),
    )
    for i in range(len(a)):
        for lengths, flat, _nested, _width in columns:
            assert_equal(lengths[i], len(flat[i]))
        for lengths, _flat, nested, width in columns:
            assert_equal((lengths[i], width), nested[i].shape)

    # check elements
    assert_equal(a.len_n[0], 0)
    assert_equal(a.len_f[0], 1)
    assert_equal(a.len_d[0], 2)

    last_n = a.n_int[-1]
    last_f = a.f_float[-1]
    last_d = a.d_double[-1]
    assert_equal(last_n[-1], 417)
    assert_equal(last_f[0], 380.5)
    assert_equal(last_f[-1], 456.5)
    assert_equal(last_d[0], 380.25)
    assert_equal(last_d[-1], 497.25)
def test_vector():
    """Check dtypes and a few values of the vector branches in hvector.root."""
    a = rnp.root2rec(load('hvector.root'))
    assert_equal(
        a.dtype,
        [('v_i', 'O'),
         ('v_f', 'O'),
         ('v_F', 'O'),
         ('v_d', 'O'),
         ('v_l', 'O'),
         ('v_c', 'O'),
         ('v_b', 'O')])

    assert_equal(a.v_i[1].dtype, np.int32)
    assert_equal(a.v_f[1].dtype, np.float32)
    assert_equal(a.v_F[1].dtype, np.float32)
    assert_equal(a.v_d[1].dtype, np.float64)
    assert_equal(a.v_l[1].dtype, np.int64)
    assert_equal(a.v_c[1].dtype, np.int8)
    # np.bool_ is the NumPy boolean scalar type; the bare `np.bool` alias was
    # deprecated in NumPy 1.20 and removed in 1.24
    assert_equal(a.v_b[1].dtype, np.bool_)

    # check a couple of values
    assert_equal(a.v_i[1][0], 1)
    assert_equal(a.v_i[2][1], 3)
    assert_equal(a.v_i[-1][0], 99)
    assert_equal(a.v_i[-1][-1], 107)

    # v_f and v_F are expected to hold the same values
    for column in (a.v_f, a.v_F):
        assert_equal(column[1][0], 2.0)
        assert_equal(column[2][1], 5.0)
        assert_equal(column[-1][0], 198.0)
        assert_equal(column[-1][-1], 206.0)
def root2pandas(files_path, tree_name, **kwargs):
    '''
    Load one tree from all ROOT files matching a glob pattern as a DataFrame.

    Args:
    -----
        files_path: a glob pattern, e.g. './data/*.root'
        tree_name: name of the tree to read, e.g. 'Collection_Tree'
        kwargs: passed straight through to root2rec
                (branches, start, stop, step, ...)

    Returns:
    --------
        a pandas DataFrame built from the stacked per-file record arrays

    Note:
    -----
        when the stacked array cannot be turned into a DataFrame directly
        (e.g. files with different branches), the DataFrame is built from
        ss.data instead
    '''
    # -- process every matching ntuple into a rec array, then stack them
    per_file = []
    for fpath in glob.glob(files_path):
        per_file.append(root2rec(fpath, tree_name, **kwargs))
    ss = stack_arrays(per_file)

    try:
        frame = pd.DataFrame(ss)
    except Exception:
        frame = pd.DataFrame(ss.data)
    return frame
def test_vary():
    """Check dtype, per-row lengths and a few values of vary*.root."""
    a = rnp.root2rec(load(['vary1.root', 'vary2.root']))

    length_fields = ['len_n', 'len_f', 'len_d']
    object_fields = [
        'n_char', 'n_uchar', 'n_short', 'n_ushort', 'n_int', 'n_uint',
        'n_long', 'n_ulong', 'f_float', 'd_double']
    expected_dtype = ([(name, '<i4') for name in length_fields] +
                      [(name, 'O') for name in object_fields])
    assert_equal(a.dtype, expected_dtype)

    # check length: each len_* entry must match its array column
    columns = ((a.len_n, a.n_int), (a.len_f, a.f_float), (a.len_d, a.d_double))
    for i in range(len(a)):
        for lengths, values in columns:
            assert_equal(lengths[i], len(values[i]))

    # check a couple of elements
    assert_equal(a.len_n[0], 0)
    assert_equal(a.len_f[0], 1)
    assert_equal(a.len_d[0], 2)

    last_n = a.n_int[-1]
    last_f = a.f_float[-1]
    last_d = a.d_double[-1]
    assert_equal(last_n[-1], 417)
    assert_equal(last_f[0], 380.5)
    assert_equal(last_f[-1], 456.5)
    assert_equal(last_d[0], 380.25)
    assert_equal(last_d[-1], 497.25)
def test_struct():
    """struct.root must read back as one record with four flattened leaves."""
    expected_dtype = [
        ('branch1_intleaf', '<i4'),
        ('branch1_floatleaf', '<f4'),
        ('branch2_intleaf', '<i4'),
        ('branch2_floatleaf', '<f4'),
    ]
    expected = np.array([(10, 15.5, 20, 781.2)], dtype=expected_dtype)
    actual = rnp.root2rec(load('struct.root'))
    assert_array_equal(actual, expected)
def test_object_expression():
    """A method-call expression ('vect.Pt()') can be read as a branch."""
    expression = 'vect.Pt()'
    rec = rnp.root2rec(load(['object1.root', 'object2.root']),
                       branches=[expression])
    # expected values: 1..10 followed by 2..11, as doubles
    expected = np.concatenate([
        np.arange(10, dtype='d') + 1,
        np.arange(10, dtype='d') + 2])
    assert_array_equal(rec[expression], expected)
def harvest(filenames,definitions,**kwargs):
    """
    Extract the variable data from the provided files

    For .h5 files the definitions are tried in order and the first one found
    in the file is used; for .root files every definition is read and the
    last one wins.

    Args:
        filenames (list): the files to extract from
                          currently supported: {0}
        definitions (list): candidate locations of the variable; for .h5
                            files each is indexed as (node, column), for
                            .root files each is unpacked into root2rec

    Keyword Args:
        transformation (func): will be applied to the read out data

    Returns:
        pd.Series or pd.DataFrame
    """.format(REGISTERED_FILEEXTENSIONS.__repr__())
    # NOTE: the string above is a formatted expression, not a real docstring

    data = pd.Series()
    for filename in filenames:
        filetype = f.strip_all_endings(filename)[1]
        assert filetype in REGISTERED_FILEEXTENSIONS, "Filetype {} not known!".format(filetype)
        # fixed: the message used to report the file type instead of the name
        assert os.path.exists(filename), "File {} does not exist!".format(filename)
        Logger.debug("Attempting to harvest {1} file {0}".format(filename,filetype))

        if filetype == ".h5" and not isinstance(filename, tables.table.Table):
            hdftable = tables.openFile(filename)
        else:
            hdftable = filename

        try:
            tmpdata = pd.Series()
            for definition in definitions:
                if filetype == ".h5":
                    try:
                        tmpdata = hdftable.getNode("/" + definition[0]).col(definition[1])
                        tmpdata = pd.Series(tmpdata, dtype=n.float64)
                        Logger.debug("Found {} entries in table for {}{}".format(len(tmpdata),definition[0],definition[1]))
                        break
                    except tables.NoSuchNodeError:
                        Logger.debug("Can not find definition {0} in {1}! ".format(definition, filename))
                        continue
                elif filetype == ".root":
                    tmpdata = rn.root2rec(filename, *definition)
                    # fixed: wrap what was just read, not the accumulator `data`
                    tmpdata = pd.Series(tmpdata)
        finally:
            # close the HDF5 handle even if reading raised
            if filetype == ".h5":
                hdftable.close()

        # concat should be much faster than repeated append
        if "transformation" in kwargs:
            transform = kwargs['transformation']
            data = pd.concat([data, tmpdata.map(transform)])
        else:
            data = pd.concat([data, tmpdata])
        del tmpdata
    return data
def getRootToRec(filename, treename):
    """
    Read a tree from a ROOT file as a numpy record array.

    Inputs: filename and treename
    Return: the rec array produced by root2rec
    """
    # root_numpy is imported lazily, only when this function is called
    from root_numpy import root2array, root2rec

    message = '(getRootToRec) building rec array from tree %s in file %s' % (treename, filename)
    info(message)

    rec_array = root2rec(filename, treename)
    return rec_array
def run_code(offline_jetpT_threshold=0., gTower_jetET_threshold=0., seed_ETthresh=0.):
    """
    Histogram the trigger-jet E_T of all events passing the offline-jet cut.

    Reads the tree one event at a time, skips events with no offline jet or
    whose first offline jet is below ``offline_jetpT_threshold``, collects the
    trigger jets built from the gTowers, and writes the histogram both as a
    pickle (.pkl) and as a figure (.png) in the current directory.

    Args:
        offline_jetpT_threshold: minimum pT of the first offline jet
        gTower_jetET_threshold: only used in the plot title and file names
        seed_ETthresh: ET threshold handed to gTowers.SeedFilter
    """
    # set seed cuts
    seed_filter = gTowers.SeedFilter(ETthresh=seed_ETthresh, numSeeds=1)

    leading_trigger_jets = []

    # column names to pull from the file, must be in this order to sync with
    # the predefined classes in atlas_jets package
    offline_column_names = ['jet_AntiKt10LCTopo_%s' % col for col in ['E', 'pt', 'm', 'eta', 'phi']]
    gTower_column_names = ['gTower%s' % col for col in ['E', 'NCells', 'EtaMin', 'EtaMax', 'PhiMin', 'PhiMax']]

    num_offlineEvents = 0
    # main loop that goes over the file
    for event_num in range(total_num_events):
        if event_num % 100 == 0:
            print("doing event_num=%d for (%d, %d, %d)" % (event_num, offline_jetpT_threshold, gTower_jetET_threshold, seed_ETthresh))
        # pull in data row by row
        data = rnp.root2rec(filename, treename='%s/%s' % (directory, tree), branches=offline_column_names + gTower_column_names, start=event_num, stop=event_num + 1)
        oEvent = OfflineJets.Event(event=[data[col][0] for col in offline_column_names])

        # if there are no offline jets, we skip it
        if len(oEvent.jets) == 0 or oEvent.jets[0].pT < offline_jetpT_threshold:
            continue
        num_offlineEvents += 1

        # (a per-event seed filter, e.g. numSeeds=len(oEvent.jets), could be
        # built here instead of the shared one)
        tEvent = gTowers.TowerEvent(event=[data[col][0] for col in gTower_column_names], seed_filter=seed_filter)
        tEvent.get_event()
        # extend in place: avoids copying the whole list for every event
        leading_trigger_jets.extend(tEvent.event.jets)

    # at this point, we've processed all the data and we just need to make plots
    bins_leading_trigger_jets = np.arange(0., 4000., 5.)
    hist_leading_trigger_jets = np.histogram([jet.E/np.cosh(jet.eta) for jet in leading_trigger_jets], bins=bins_leading_trigger_jets)[0]

    # widths of the bins, needed for the bar plot
    width_leading_trigger_jets = np.diff(bins_leading_trigger_jets)

    filename_ending = 'offline%d_gTower%d_seed%d_unweighted' % (offline_jetpT_threshold, gTower_jetET_threshold, seed_filter.ETthresh)

    # Leading Trigger Jets Histogram
    # (raw strings keep the LaTeX backslashes without invalid-escape warnings)
    pl.figure()
    pl.xlabel(r'$E_T^{\mathrm{jet}}$ [GeV]')
    pl.ylabel('Number of leading trigger jets')
    pl.title(r'$p_T^{\mathrm{offline jet}}$ > %d GeV, %d events, $E_T^{\mathrm{tower jet}}$ > %d GeV, $E_T^{\mathrm{seed}}$ > %d GeV' % (offline_jetpT_threshold, num_offlineEvents, gTower_jetET_threshold, seed_filter.ETthresh))
    pl.bar(bins_leading_trigger_jets[:-1], hist_leading_trigger_jets, width=width_leading_trigger_jets)
    pl_lJet = {'bins': bins_leading_trigger_jets,
               'values': hist_leading_trigger_jets,
               'width': width_leading_trigger_jets}
    # binary mode + context manager: the handle is closed deterministically
    with open('events_histogram_leading_trigger_jets_%s.pkl' % filename_ending, 'wb') as pkl_file:
        pickle.dump(pl_lJet, pkl_file)
    pl.savefig('events_histogram_leading_trigger_jets_%s.png' % filename_ending)
    pl.close()
def readFiles():
    """
    Load training inputs, weights and targets for three classes.

    Reads 'full_weight' for the signal, ZTT and background file lists. The
    ZTT and background weights are each rescaled so their sum equals the
    signal sum; the unscaled weights are returned as well.

    Returns:
        (arrSB, fullWeight, unNormFullWeight, targets), all ordered as
        signal, ZTT, background. Targets take the values vals[0], vals[1]
        and vals[2] respectively.
    """
    print('Reading files...')

    def _full_weights(paths):
        # 'full_weight' column of the selected events in `paths`
        rec = root2rec(paths, treename='tree', branches=['full_weight'], selection=selection)
        return rec['full_weight']

    weights_sig = _full_weights(files_sig)
    weights_ZTT = _full_weights(files_ZTT)
    weightsB = _full_weights(files_bg)

    total_sig = np.sum(weights_sig)
    total_ZTT = np.sum(weights_ZTT)
    total_B = np.sum(weightsB)

    # scale both non-signal samples to the signal's total weight
    normWeights_ZTT = weights_ZTT * total_sig/total_ZTT
    normWeightsB = weightsB * total_sig/total_B

    n_sig = len(weights_sig)
    n_ZTT = len(weights_ZTT)
    nB = len(weightsB)

    fullWeight = np.concatenate((weights_sig, normWeights_ZTT, normWeightsB))
    unNormFullWeight = np.concatenate((weights_sig, weights_ZTT, weightsB))

    structured = root2array(files_sig + files_ZTT + files_bg, treename='tree', branches=trainVars(), selection=selection)
    # sklearn needs a 2-D matrix rather than a 1-D structured array
    arrSB = np.asarray([structured[var] for var in trainVars()]).T

    targets = np.concatenate((
        np.ones(n_sig)*vals[0],
        np.ones(n_ZTT)*vals[1],
        np.ones(nB)*vals[2]))

    print('Done reading files.')
    return arrSB, fullWeight, unNormFullWeight, targets
def main():
    """
    Run the kNN signal/background study on '../Blatt7.root'.

    Builds a KD-tree from the first 5000 signal and 5000 background events,
    classifies the first `signal_size` signal and `background_size`
    background events with k = 10 and k = 20, then repeats the k = 10 study
    with log10(AnzahlHits) as the first feature.
    """
    filename = '../Blatt7.root'
    signal_size = 10000
    background_size = 20000
    n_train = 5000  # training events taken from each class
    branches = ['AnzahlHits', 'x', 'y']

    print("Reading Data from file " + filename)
    background = root2rec(filename, 'Untergrund_MC', branches=branches)
    signal = root2rec(filename, 'Signal_MC_Akzeptanz', branches=branches)
    # one row per event, columns (AnzahlHits, x, y)
    background = np.asarray([background['AnzahlHits'], background['x'], background['y']]).T
    signal = np.asarray([signal['AnzahlHits'], signal['x'], signal['y']]).T

    training = np.append(signal[:n_train], background[:n_train], axis=0)
    # builtin int replaces the removed `np.int` alias (same dtype)
    label = np.append(np.ones(n_train, dtype=int), np.zeros(n_train, dtype=int), axis=0)
    test_data = np.append(signal[:signal_size], background[:background_size], axis=0)
    test_label = np.append(np.ones(signal_size, dtype=int), np.zeros(background_size, dtype=int), axis=0)

    print("Creating KD-Tree")
    kd = KDTree(training, leafsize=20)
    print("Starting prediction with k = 10")
    prediction = knn(data=test_data, label=label, tree=kd, k=10)
    performance(label=test_label, prediction=prediction)

    print("Starting prediction with k = 20")
    prediction = knn(data=test_data, label=label, tree=kd, k=20)
    performance(label=test_label, prediction=prediction)

    print("Using log(AnzahlHits)")
    # new training and test data for log10(AnzahlHits)
    background[:, 0] = np.log10(background[:, 0])
    signal[:, 0] = np.log10(signal[:, 0])
    training = np.append(signal[:n_train], background[:n_train], axis=0)
    test_data = np.append(signal[:signal_size], background[:background_size], axis=0)
    print("Creating KD-Tree")
    kd = KDTree(training, leafsize=20)
    print("Starting prediction")
    prediction = knn(data=test_data, label=label, tree=kd, k=10)
    performance(label=test_label, prediction=prediction)
def test_string():
    """String branches of string.root come back as object columns."""
    a = rnp.root2rec(load('string.root'))

    expected_types = [(name, 'O') for name in ('message', 'vect', 'vect2d')]
    assert_equal(a.dtype, expected_types)

    # first record: plain string, vector of strings, vector of vectors
    assert_equal(a[0][0], 'Hello World!')
    assert_equal(a[0][1][0], 'Hello!')
    assert_equal(a[0][2][0][0], 'Hello!')
def test_stack():
    """rnp.stack concatenates records and honours/intersects field lists."""
    rec = rnp.root2rec(load('test.root'))
    doubled = 2 * rec.shape[0]

    # all fields kept by default
    stacked = rnp.stack([rec, rec])
    assert_equal(stacked.shape[0], doubled)
    assert_equal(stacked.dtype.names, rec.dtype.names)

    # explicit subset of fields
    stacked = rnp.stack([rec, rec], fields=['x', 'y'])
    assert_equal(stacked.shape[0], doubled)
    assert_equal(stacked.dtype.names, ('x', 'y'))

    # recs don't have identical fields
    reduced = recfunctions.drop_fields(rec, ['i', 'x'])
    stacked = rnp.stack([rec, reduced])
    assert_equal(set(stacked.dtype.names), {'y', 'z'})
def getvars(vars, flav, filename, ptmin=20, ptmax=200, etamin=0., etamax=2.1):
    """
    Return the feature rows of every second entry that pass the pt/|eta| cuts.

    Reads '<flav>_<var>' for each name in `vars`, plus '<flav>_pt' and
    '<flav>_eta', from the tree 'tree' of `filename`. Only entries at odd
    indices (slice 1::2) are kept, then rows with ptmin < pt < ptmax and
    etamin < |eta| < etamax are returned, one column per variable.
    """
    prefix = flav + '_'
    train_leaves = [prefix + name for name in vars]
    pt_leaf = prefix + 'pt'
    eta_leaf = prefix + 'eta'

    array = root2rec(filename, 'tree', train_leaves + [pt_leaf, eta_leaf])

    features = np.vstack([array[leaf] for leaf in train_leaves]).T[1::2]
    pt = array[pt_leaf][1::2]
    abs_eta = np.fabs(array[eta_leaf][1::2])

    in_pt_window = (pt > ptmin) & (pt < ptmax)
    in_eta_window = (abs_eta > etamin) & (abs_eta < etamax)
    return features[in_pt_window & in_eta_window]
def run_code(event_num):
    """
    Draw three grid images for a single event.

    Reads entry `event_num` from the tree and saves, in order: the gTowers,
    the offline jets and the trigger jets, each on a grid with cell
    resolution 0.02, as PNG files named after the event number.
    """
    data = rnp.root2rec(
        filename,
        treename='%s/%s' % (directory, tree),
        branches=offline_column_names + gTower_column_names,
        start=event_num,
        stop=event_num + 1)

    # the single entry that was read sits at index 0 of every column
    offline_values = [data[col][0] for col in offline_column_names]
    tower_values = [data[col][0] for col in gTower_column_names]
    oEvent = OfflineJets.Event(event=offline_values)
    tEvent = gTowers.TowerEvent(event=tower_values, seed_filter=seed_filter)
    tEvent.get_event()

    # raw strings below hold the same characters as the original literals
    grid_towers = gTowers.Grid(cell_resolution=0.02, domain=domain)
    grid_towers.add_tower_event(tEvent)
    grid_towers.save(
        title='Event %d, gTowers, cell resolution=0.02' % event_num,
        filename='event_%d_towers.png' % event_num,
        colzLabel=r'$E_T^{\mathrm{tower}}$')

    grid_offline = gTowers.Grid(cell_resolution=0.02, recon_algo='gaussian', domain=domain)
    grid_offline.add_event(oEvent)
    grid_offline.save(
        title='Event %d, offline jets, cell resolution=0.02' % event_num,
        filename='event_%d_offline_jets.png' % event_num,
        colzLabel=r'$p_T^{\mathrm{jet}}$')

    grid_trigger = gTowers.Grid(cell_resolution=0.02, recon_algo='gaussian', domain=domain)
    trigger_event = tEvent.get_event()
    grid_trigger.add_event(trigger_event)
    grid_trigger.save(
        title='Event %d, trigger jets, cell resolution=0.02' % event_num,
        filename='event_%d_trigger_jets.png' % event_num,
        colzLabel=r'$E_T^{\mathrm{jet}}$')
def harvest_from_rootfile(self,rootfile,definition):
    """
    Read data from a root file and wrap it in a pandas container.

    Args:
        rootfile (str): Name of the *.root file
        definition (tuple): unpacked into the positional arguments of
                            root2rec after the file name

    Returns:
        pd.Series when self.defsize is 2, pd.DataFrame when it is 1

    Raises:
        ValueError: for any other self.defsize (raised after the file
                    has been read)
    """
    #FIXME: What happens if it is not found in the rootfile
    records = rn.root2rec(rootfile, *definition)

    if self.defsize == 2:
        return pd.Series(records)
    if self.defsize == 1:
        return pd.DataFrame(records)
    raise ValueError
def getjetvar(jet, var, filename, ptmin=20, ptmax=200, etamin=0., etamax=2.1, nocut=True):
    """
    Read the leaf '<jet><var>' from the tree 'tree' of `filename`.

    With nocut=True (the default) all entries are returned. Otherwise only
    entries with ptmin < pt < ptmax and etamin < |eta| < etamax are kept,
    using the '<jet>pt' and '<jet>eta' leaves.
    """
    pt_leaf = jet + 'pt'
    eta_leaf = jet + 'eta'
    var_leaf = jet + var

    leaves = [pt_leaf, eta_leaf]
    # pt and eta are always requested; add the variable only if it is new
    if var not in ['pt', 'eta']:
        leaves += [var_leaf]

    array = root2rec(filename, 'tree', leaves)
    values = array[var_leaf]
    pt = array[pt_leaf]
    eta = array[eta_leaf]

    if nocut:
        return values

    abs_eta = np.fabs(eta)
    in_pt_window = (pt > ptmin) & (pt < ptmax)
    in_eta_window = (abs_eta > etamin) & (abs_eta < etamax)
    return values[in_pt_window & in_eta_window]
def test_vary():
    """Check dtype, per-row lengths and a few values of vary*.root."""
    a = rnp.root2rec(load(['vary1.root', 'vary2.root']))

    expected_dtype = (
        [(name, '<i4') for name in ('len_n', 'len_f', 'len_d')] +
        [(name, 'O') for name in ('n_int', 'f_float', 'd_double')])
    assert_equal(a.dtype, expected_dtype)

    # check length: each len_* entry must match its array column
    columns = ((a.len_n, a.n_int), (a.len_f, a.f_float), (a.len_d, a.d_double))
    for i in range(len(a)):
        for lengths, values in columns:
            assert_equal(lengths[i], len(values[i]))

    # check a couple of elements
    assert_equal(a.len_n[0], 0)
    assert_equal(a.len_f[0], 1)
    assert_equal(a.len_d[0], 2)

    last_n = a.n_int[-1]
    last_f = a.f_float[-1]
    last_d = a.d_double[-1]
    assert_equal(last_n[-1], 417)
    assert_equal(last_f[0], 380.5)
    assert_equal(last_f[-1], 456.5)
    assert_equal(last_d[0], 380.25)
    assert_equal(last_d[-1], 497.25)
def gettracks(vars, flav, filename, ptmin=20, ptmax=200, etamin=0., etamax=2.1, train=1):
    """
    Return zero-filled track variables for every second entry passing the cuts.

    Reads '<flav>_trk<var>' for each name in `vars`, plus '<flav>_pt' and
    '<flav>_eta', from the tree 'tree' of `filename`. Each track leaf is
    passed through zerofill() before stacking. Entries are taken with the
    slice train::2, then filtered with ptmin < pt < ptmax and
    etamin < |eta| < etamax.
    """
    track_leaves = [flav + '_trk' + name for name in vars]
    pt_leaf = flav + '_pt'
    eta_leaf = flav + '_eta'

    array = root2rec(filename, 'tree', track_leaves + [pt_leaf, eta_leaf])

    filled = [zerofill(array[leaf].tolist()).T for leaf in track_leaves]
    xx = np.array(filled).T[train::2]
    pt = array[pt_leaf][train::2]
    abs_eta = np.fabs(array[eta_leaf][train::2])

    in_pt_window = (pt > ptmin) & (pt < ptmax)
    in_eta_window = (abs_eta > etamin) & (abs_eta < etamax)
    return xx[in_pt_window & in_eta_window]
def getvar(var, flav, filename, reco=False, ptmin=80, ptmax=110, etamin=0., etamax=2.1, train=0):
    """
    Return one variable for every second entry that passes the pt/|eta| cuts.

    The variable is read from '<flav>_<var>', or from '<flav>reco_<var>' when
    reco is True; the cuts always use '<flav>_pt' and '<flav>_eta'. Entries
    are taken with the slice train::2 and kept when ptmin < pt < ptmax and
    etamin < |eta| < etamax.
    """
    varflav = flav + 'reco' if reco else flav
    var_leaf = varflav + '_' + var
    pt_leaf = flav + '_pt'
    eta_leaf = flav + '_eta'

    leaves = [pt_leaf, eta_leaf]
    if var not in ['pt', 'eta']:
        leaves += [var_leaf]

    array = root2rec(filename, 'tree', leaves)
    values = array[var_leaf][train::2]
    pt = array[pt_leaf][train::2]
    abs_eta = np.fabs(array[eta_leaf][train::2])

    in_pt_window = (pt > ptmin) & (pt < ptmax)
    in_eta_window = (abs_eta > etamin) & (abs_eta < etamax)
    return values[in_pt_window & in_eta_window]
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): sklearn.externals.joblib is believed to have been removed in
# later scikit-learn releases - confirm against the targeted version
from sklearn.externals import joblib

# Parse the command line: two optional plot flags, then two positional
# arguments (signal file, background file).
op = optparse.OptionParser(usage=__doc__)
op.add_option("--treeplot", dest="TREEPLOT", default=False, action="store_true", help="Make a decision tree classifier plot")
op.add_option("--bounds", dest="BOUNDS", default=False, action="store_true", help="Plot decision tree boundaries in two input variables")
opts, args = op.parse_args()
# raises IndexError when fewer than two positional arguments are given
infname_sig, infname_bkg = args[0], args[1]

# Set up DataFrames: the same four branches are read from both files.
df_sig = pandas.DataFrame(root_numpy.root2rec(infname_sig, branches=["tau2_sd","tau3_sd","softdropjet.M()","fatjet.M()"]))
df_bkg = pandas.DataFrame(root_numpy.root2rec(infname_bkg, branches=["tau2_sd","tau3_sd","softdropjet.M()","fatjet.M()"]))

# Label the samples, merge them and shuffle the rows.
df_sig["is_signal"] = 1
df_bkg["is_signal"] = 0
df = pandas.concat([df_sig, df_bkg], ignore_index=True)
df = df.iloc[np.random.permutation(len(df))]

# The first 150000 shuffled rows are the training set, the rest the test set.
df_train = df[0:150000]
df_test_orig = df[150000:]

# Convert to plain arrays; df_test_orig keeps the DataFrame form of the test set.
df_sig = np.asarray(df_sig)
df_bkg = np.asarray(df_bkg)
df_train = np.asarray(df_train)
df_test = np.asarray(df_test_orig)
from root_numpy import root2rec
import numpy as np
import pylab as pl
import pickle

# read the sample: 'label' is the target, 'a' and 'b' are the two features
sample = root2rec('sample.root')
y = sample['label']
X = np.vstack([sample[var] for var in ['a', 'b']]).T

# Pickles are binary data, so the file must be opened in 'rb' mode
# (text mode fails on Python 3 and can corrupt data on Windows).
# NOTE: pickle.load executes arbitrary code - only load trusted files.
with open('sklearn_bdt.pickle', 'rb') as f:
    bdt = pickle.load(f)

plot_colors = "br"
plot_step = 0.02
class_names = "AB"

pl.figure(figsize=(10, 5))

# Plot the decision boundaries: evaluate the classifier on a regular grid
# that extends one unit beyond the data range in both features
pl.subplot(121)
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                     np.arange(y_min, y_max, plot_step))

Z = bdt.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
cs = pl.contourf(xx, yy, Z, cmap=pl.cm.Paired)
pl.axis("tight")
# base name of the input file (without the '.root' suffix)
plik = str(sys.argv[1])

# Create the output directory for this file's figures. Only OSError is
# swallowed (e.g. the directory already exists); a bare except would also
# hide KeyboardInterrupt and programming errors.
try:
    os.mkdir("./rysunki/" + plik)
except OSError:
    print("Jedziemyyyy....")

# hard-coded alternative:
#pliki=["ot001", "ot002", "ot009", "ot019", "ot020" , "nt001"];
# from the command line:
#pliki = [];
#for i in xrange(1, len(sys.argv)):
#    pliki += [str(sys.argv[i])]

rec = root2rec(plik + '.root', "tvec")

# The two np.extract calls below replace this explicit loop:
#   for i in xrange(0, len(rec)):
#       if (math.fabs(rec.Teta[i]) < 1.61):
#           x.append(rec.Tbeta[i]);
#           if (rec.Tl1[i]):
#               z.append(rec.Tbeta[i])
# NOTE(review): in the loop, z was only filled when the |Teta| cut also
# passed; the vectorised z ignores that cut - confirm which is intended.
x = np.extract(np.absolute(rec.Teta) < 1.61, rec.Tbeta)
z = np.extract(rec.Tl1, rec.Tbeta)
def run_code(offline_jetpT_threshold=0., gTower_jetET_threshold=0., seed_ETthresh=0.):
    """
    Fill gTower and turn-on histograms over all events and save the plots.

    Reads the tree one event at a time, skips events with no offline jet or
    whose first offline jet is below ``offline_jetpT_threshold``, and
    accumulates: the tower multiplicity above threshold, the tower E_T
    spectrum, and numerator/denominator histograms of offline-jet pT for jets
    matched to trigger jets. Every plot is written to the current directory
    as a pickle (.pkl) and a figure (.png).

    Args:
        offline_jetpT_threshold: minimum pT for offline jets
        gTower_jetET_threshold: minimum trigger-jet E_T for the numerator
        seed_ETthresh: ET threshold handed to gTowers.SeedFilter
    """
    def dump_pickle(payload, path):
        # binary mode + context manager: the handle is always closed
        with open(path, 'wb') as handle:
            pickle.dump(payload, handle)

    def reverse_cumsum(values):
        # entry i holds the sum of values[i:]
        return np.cumsum(values[::-1])[::-1]

    def binomial_errors(hist_ratio, hist_one, hist_two):
        # root.cern.ch/root/html/src/TH1.cxx.html#l5.yxD
        # formula cited (for histograms [num, den] with no errors) is:
        #     w = num/den
        #     if w = 1:
        #         sigma = 0
        #     else:
        #         sigma = abs( (1 - 2*w + w**2) / den**2 )
        # NOTE(review): as coded this equals |1 - w| / den and never uses
        # `num`; it is not the n*p*q binomial variance - confirm intent.
        errors = []
        for w, num, den in zip(hist_ratio, hist_one, hist_two):
            if w == 1.0:
                errors.append(0.0)
            else:
                errors.append((np.abs((1.-2.*w + w**2.)/den**2.))**0.5)
        return errors

    # set seed cuts
    seed_filter = gTowers.SeedFilter(ETthresh=seed_ETthresh, numSeeds=1.0e5)

    # column names to pull from the file, must be in this order to sync with
    # the predefined classes in atlas_jets package
    offline_column_names = ['jet_AntiKt10LCTopo_%s' % col for col in ['E', 'pt', 'm', 'eta', 'phi']]
    gTower_column_names = ['gTower%s' % col for col in ['E', 'NCells', 'EtaMin', 'EtaMax', 'PhiMin', 'PhiMax']]

    # bins for all histograms
    bins_towerMultiplicity = np.arange(0, 1000, 5).astype(float)
    bins_towerHistogram = np.array([0,50,100,150,200,250,300,350,400,500,750,1000,4000]).astype(float)
    bins_efficiency = np.arange(0, 1240, 20).astype(float)

    hist_towerMultiplicity = np.zeros(len(bins_towerMultiplicity)-1).astype(float)
    hist_towerHistogram = np.zeros(len(bins_towerHistogram)-1).astype(float)
    hist_efficiency_num = np.zeros(len(bins_efficiency)-1).astype(float)
    hist_efficiency_den = np.zeros(len(bins_efficiency)-1).astype(float)

    num_offlineEvents = 0
    # main loop that goes over the file
    for event_num in range(total_num_events):
        if event_num % 100 == 0:
            print("doing event_num=%d for (%d, %d, %d)" % (event_num, offline_jetpT_threshold, gTower_jetET_threshold, seed_ETthresh))
        # pull in data row by row
        data = rnp.root2rec(filename, treename='%s/%s' % (directory, tree), branches=offline_column_names + gTower_column_names, start=event_num, stop=event_num + 1)
        oEvent = OfflineJets.Event(event=[data[col][0] for col in offline_column_names])

        # if there are no offline jets, we skip it
        if len(oEvent.jets) == 0 or oEvent.jets[0].pT < offline_jetpT_threshold:
            continue
        num_offlineEvents += 1

        # (a per-event seed filter, e.g. numSeeds=len(oEvent.jets), could be
        # built here instead of the shared one)
        tEvent = gTowers.TowerEvent(event=[data[col][0] for col in gTower_column_names], seed_filter=seed_filter)

        # build up the first two histograms using just the gTower data
        # note, we have np.histogram(...)[0] since we only need the hist data
        tower_ETs = [tower.E/np.cosh(tower.eta) for tower in tEvent.towers]
        hist_towerMultiplicity += reverse_cumsum(np.histogram(tower_ETs, bins=bins_towerMultiplicity)[0])
        hist_towerHistogram += np.histogram(tower_ETs, bins=bins_towerHistogram)[0]

        tEvent.get_event()
        paired_jets = match_jets(oJets=oEvent.jets, tJets=tEvent.event.jets)
        paired_data = np.array([[oJet.pT, tJet.E/np.cosh(tJet.eta)] for oJet, tJet in paired_jets if oJet.pT > offline_jetpT_threshold])
        if len(paired_data) == 0:
            # no matched pairs: the array is 1-D and empty, so the column
            # indexing below would raise IndexError
            continue

        # build up the turn on curve histograms
        passed = paired_data[:, 1] > gTower_jetET_threshold
        hist_efficiency_den += np.histogram(paired_data[:, 0], bins=bins_efficiency)[0]
        hist_efficiency_num += np.histogram(paired_data[passed, 0], bins=bins_efficiency)[0]

    # at this point, we've processed all the data and we just need to make plots

    # widths of the bins, needed for the bar plots
    width_towerMultiplicity = np.diff(bins_towerMultiplicity)
    width_towerHistogram = np.diff(bins_towerHistogram)
    width_efficiency = np.diff(bins_efficiency)

    # rescale tower data to define it per event
    hist_towerMultiplicity = 1.0*hist_towerMultiplicity/num_offlineEvents
    hist_towerHistogram = 1.0*hist_towerHistogram/num_offlineEvents

    # histogram y-range
    hist_ylim = (10.**-3., 10.**4.)

    filename_ending = 'offline%d_gTower%d_seed%d_unweighted' % (offline_jetpT_threshold, gTower_jetET_threshold, seed_filter.ETthresh)
    # every figure carries the same title; raw strings keep the LaTeX
    # backslashes without invalid-escape warnings
    title = r'$p_T^{\mathrm{offline jet}}$ > %d GeV, %d events, $E_T^{\mathrm{tower jet}}$ > %d GeV, $E_T^{\mathrm{seed}}$ > %d GeV' % (offline_jetpT_threshold, num_offlineEvents, gTower_jetET_threshold, seed_filter.ETthresh)
    offline_pt_label = r'offline $p_T^{\mathrm{jet}}$ [GeV]'

    # Tower Multiplicity
    pl.figure()
    pl.xlabel(r'$E_T^{\mathrm{threshold}}$ [GeV]')
    pl.ylabel('Number of gTowers per event')
    pl.title(title)
    pl.bar(bins_towerMultiplicity[:-1], hist_towerMultiplicity, width=width_towerMultiplicity, log=True)
    pl.ylim(hist_ylim)
    pl_tMult = {'bins': bins_towerMultiplicity,
                'values': hist_towerMultiplicity,
                'width': width_towerMultiplicity}
    dump_pickle(pl_tMult, 'events_threshold_histogram_multiplicity_%s.pkl' % filename_ending)
    pl.savefig('events_threshold_histogram_multiplicity_%s.png' % filename_ending)
    pl.close()

    # Tower Histogram per Event
    pl.figure()
    pl.xlabel(r'$p_T^{\mathrm{jet}}$ [GeV]')
    pl.ylabel('Number of gTowers per event')
    pl.title(title)
    pl.bar(bins_towerHistogram[:-1], hist_towerHistogram, width=width_towerHistogram, log=True)
    pl.ylim(hist_ylim)
    pl_tHist = {'bins': bins_towerHistogram,
                'values': hist_towerHistogram,
                'width': width_towerHistogram}
    dump_pickle(pl_tHist, 'events_threshold_histogram_towers_%s.pkl' % filename_ending)
    pl.savefig('events_threshold_histogram_towers_%s.png' % filename_ending)
    pl.close()

    # Turn on curves: denominator
    pl.figure()
    pl.xlabel(offline_pt_label)
    pl.ylabel('Turn-On Curve Denominator')
    pl.title(title)
    pl.bar(bins_efficiency[:-1], hist_efficiency_den, width=width_efficiency)
    # start both axes at zero and reuse these limits for the later plots
    xlim_efficiency = (0.0, pl.xlim()[1])
    pl.xlim(xlim_efficiency)
    ylim_efficiency = (0.0, pl.ylim()[1])
    pl.ylim(ylim_efficiency)
    pl_turnon_den = {'bins': bins_efficiency,
                     'values': hist_efficiency_den,
                     'width': width_efficiency}
    dump_pickle(pl_turnon_den, 'events_turnon_denominator_%s.pkl' % filename_ending)
    pl.savefig('events_turnon_denominator_%s.png' % filename_ending)
    pl.close()

    # Turn on curves: numerator
    pl.figure()
    pl.xlabel(offline_pt_label)
    pl.ylabel('Turn-On Curve Numerator')
    pl.title(title)
    pl.bar(bins_efficiency[:-1], hist_efficiency_num, width=width_efficiency)
    pl.xlim(xlim_efficiency)
    pl.ylim(ylim_efficiency)
    pl_turnon_num = {'bins': bins_efficiency,
                     'values': hist_efficiency_num,
                     'width': width_efficiency}
    dump_pickle(pl_turnon_num, 'events_turnon_numerator_%s.pkl' % filename_ending)
    pl.savefig('events_turnon_numerator_%s.png' % filename_ending)
    pl.close()

    # only bins with a non-zero denominator have a defined efficiency
    nonzero_bins = np.where(hist_efficiency_den != 0)
    num_nonzero = hist_efficiency_num[nonzero_bins]
    den_nonzero = hist_efficiency_den[nonzero_bins]
    num_integral = reverse_cumsum(num_nonzero)
    den_integral = reverse_cumsum(den_nonzero)

    # compute integral and differential curves
    hist_efficiency_curve_differential = np.true_divide(num_nonzero, den_nonzero)
    hist_efficiency_curve_integral = np.true_divide(num_integral, den_integral)

    # bin centres
    xpoints_efficiency = bins_efficiency[:-1] + width_efficiency/2.

    errors_efficiency_differential = binomial_errors(hist_efficiency_curve_differential, num_nonzero, den_nonzero)
    errors_efficiency_integral = binomial_errors(hist_efficiency_curve_integral, num_integral, den_integral)

    # Trigger efficiency: differential
    pl.figure()
    pl.xlabel(offline_pt_label)
    pl.ylabel('Trigger Efficiency - Differential')
    pl.title(title)
    pl.errorbar(xpoints_efficiency[nonzero_bins], hist_efficiency_curve_differential, yerr=errors_efficiency_differential, ecolor='black')
    pl.xlim(xlim_efficiency)
    pl.ylim((0.0, 1.2))
    pl.grid(True)
    pl_eff_diff = {'xdata': xpoints_efficiency,
                   'ydata': hist_efficiency_curve_differential,
                   'xerr': 1.0,
                   'yerr': errors_efficiency_differential,
                   'num': hist_efficiency_num,
                   'den': hist_efficiency_den,
                   'bins': bins_efficiency,
                   'nonzero_bins': nonzero_bins}
    dump_pickle(pl_eff_diff, 'events_turnon_curve_differential_%s.pkl' % filename_ending)
    pl.savefig('events_turnon_curve_differential_%s.png' % filename_ending)
    pl.close()

    # Trigger efficiency: integral
    pl.figure()
    pl.xlabel(offline_pt_label)
    pl.ylabel('Trigger Efficiency - Integral')
    pl.title(title)
    pl.errorbar(xpoints_efficiency[nonzero_bins], hist_efficiency_curve_integral, yerr=errors_efficiency_integral, ecolor='black')
    pl.xlim(xlim_efficiency)
    pl.ylim((0.0, 1.2))
    pl.grid(True)
    pl_eff_int = {'xdata': xpoints_efficiency,
                  'ydata': hist_efficiency_curve_integral,
                  'xerr': 1.0,
                  'yerr': errors_efficiency_integral,
                  'num': hist_efficiency_num,
                  'den': hist_efficiency_den,
                  'bins': bins_efficiency,
                  'nonzero_bins': nonzero_bins}
    dump_pickle(pl_eff_int, 'events_turnon_curve_integral_%s.pkl' % filename_ending)
    pl.savefig('events_turnon_curve_integral_%s.png' % filename_ending)
    pl.close()
# Make sure the output directory for figures exists.  os.mkdir raises OSError
# when the directory is already there (or cannot be created); in that case we
# just announce that we carry on, as the original script did.
try:
    os.mkdir("./rysunki/")
except OSError:
    print("Jedziemyyyy....")

# Hand-written pT bin edges and the width of each bin.
PtBins = [0., 0.1, 1.5, 2., 2.5, 3., 3.5, 4., 4.5, 5., 6., 7., 8., 10., 12.,
          14., 16., 18., 20., 25., 30., 35., 40., 45., 50., 60., 70., 80.,
          90., 100., 120., 140., 160.]
PtWidth = [(PtBins[j + 1] - PtBins[j]) for j in range(len(PtBins) - 1)]
bin_width = 0.01

# Read the "tvec" tree from the file given as first command-line argument.
rec = root2rec(sys.argv[1], "tvec")

# Selection applied to both axes: |Teta0| < 1.61, Tlbx_1 set, Tpt0 > 10.
# Computed once so that x and y are guaranteed to use the same entries.
selection = np.logical_and(
    np.logical_and(np.absolute(rec.Teta0) < 1.61, rec.Tlbx_1),
    rec.Tpt0 > 10)
x = np.extract(selection, rec.Teta0)
y = np.extract(selection, rec.Tphi0)
print("%d %d" % (len(x), len(y)))

# 2-D eta/phi occupancy histogram with a logarithmic colour scale.
pl.hist2d(x, y, bins=200, norm=LogNorm())
pl.colorbar()
plt.ylabel(r'$\phi$')
plt.xlabel(r'$\eta$')
plt.draw()
# Input ROOT file(s) to process.  Earlier runs used the jz5 (800-1200) and jz6
# (1300-1800) nTrk_v1 merged samples from the same directory, or every file in
# 'folds/' whose name contains 'train'.
training_files = [
    '/Disk/ds-sopa-group/PPE/atlas/users/tibristo/BosonTagging/csv/AntiKt10LCTopoTrimmedPtFrac5SmallR20_13tev_mc15_nonTrk_v3_400_1200_mw_merged.root',
]
cols = np.linspace(1, 42, 42, dtype=int)
# CSV counterpart of the ROOT file above.
filename = '/Disk/ds-sopa-group/PPE/atlas/users/tibristo/BosonTagging/csv/AntiKt10LCTopoTrimmedPtFrac5SmallR20_13tev_mc15_nonTrk_v3_400_1200_mw_merged.csv'

for f in training_files:
    # Load the tree and start from the full list of branch names.
    X = rn.root2rec(f)
    variables = list(X.dtype.names)
    variables.remove('label')

    # "Observers" are variables that are not used for training, or weights;
    # those must not be scaled, so they are dropped from the list.
    observers = [
        'mc_event_weight',
        'jet_antikt10truthtrimmedptfrac5smallr20_pt',
        'jet_antikt10truthtrimmedptfrac5smallr20_eta',
        'm', 'pt', 'eta', 'phi',
        'evt_xsec', 'evt_filtereff', 'evt_nevts', 'weight',
        'jet_camkt12truth_pt', 'jet_camkt12truth_eta',
        'jet_camkt12truth_phi', 'jet_camkt12truth_m',
        'jet_camkt12lctopo_pt', 'jet_camkt12lctopo_eta',
        'jet_camkt12lctopo_phi', 'jet_camkt12lctopo_m',
        'eff', 'averageintperxing',
    ]
    variables = [name for name in variables if name not in observers]

    # Per-variable statistics, initialised to "no shift, unit scale".
    n_variables = len(variables)
    curr_means = np.zeros(n_variables)
    curr_std = np.ones(n_variables)
    weighted_means = np.zeros(n_variables)
    weighted_std = np.ones(n_variables)

    for j, v in enumerate(variables):
        mean = np.mean(X[v])
q_eff = [0.69, 0.78, 0.82, 0.85, 0.85, 0.86, 0.85, 0.84, 0.81, 0.75, 0.65,
         0.50]

# Convert the photon energies in place from eV to MeV for quantum efficiency
for i, _ in enumerate(e_ph):
    e_ph[i] /= 1000000

# The lowest and highest photon energies bound the interpolation range.
interp_min, interp_max = e_ph[0], e_ph[-1]

# Spline giving quantum efficiency as a function of photon energy.
q_eff_fn = interpolate.UnivariateSpline(e_ph, q_eff)

# <codecell>

# Input events
data = root2rec(data_file, treename=treename_str)

# Output file for the processed data
f = open(proc_data_file, "w")

# <codecell>

iEvent = 0

# Arrays of processed number of hits and energy deposit
n_hits_proc, energy_proc = [], []
n_hits_proc_uncut, energy_proc_uncut = [], []

# Loop through the energy deposit arrays for each event
for event_energy in data.energy:
    # Number of hits and energy detected in a single event
    n_hits_registered = 0
def test_specific_branch():
    """Requesting a single branch yields a record dtype with only that field."""
    expected_dtype = [('f_float', '<f4')]
    rec = rnp.root2rec(load('single1.root'), branches=['f_float'])
    assert_equal(rec.dtype, expected_dtype)
def _dump_pickle(payload, path):
    """Pickle *payload* to *path*, closing the file handle afterwards."""
    with open(path, 'wb') as handle:
        pickle.dump(payload, handle)


def _save_jet_pair_scatter(pairs, xlabel, title, basename, xlim, ylim):
    """Scatter column 0 against column 1 of *pairs*; save <basename>.pkl/.png."""
    pl.figure()
    pl.xlabel(xlabel)
    pl.ylabel(r'trigger $E_T^{\mathrm{jet}}$ [GeV]')
    pl.title(title)
    pl.scatter(pairs[:, 0], pairs[:, 1])
    pl.grid(True, which='both')
    pl.xlim(xlim)
    pl.ylim(ylim)
    _dump_pickle({'xdata': pairs[:, 0], 'ydata': pairs[:, 1]},
                 '%s.pkl' % basename)
    pl.savefig('%s.png' % basename)
    pl.close()


def run_code(offline_jetpT_threshold=0., gTower_jetET_threshold=0., seed_ETthresh=0.):
    """Match offline jets to gTower jets event by event and plot the pairs.

    Loops over ``total_num_events`` entries of the module-level input tree,
    keeps events whose first offline jet passes ``offline_jetpT_threshold``,
    pairs offline jets with tower jets via ``match_jets`` and writes two
    scatter plots (all pairs, and the pairs with the highest trigger E_T per
    event) as ``.png`` plus a ``.pkl`` of the plotted data.

    Args:
        offline_jetpT_threshold: minimum offline jet pT for an event / jet to
            be considered.
        gTower_jetET_threshold: only used in the plot titles and output file
            names in this function.
        seed_ETthresh: E_T threshold passed to ``gTowers.SeedFilter``.

    Returns:
        None. Output is written to the current working directory.
    """
    # set seed cuts
    seed_filter = gTowers.SeedFilter(ETthresh=seed_ETthresh, numSeeds=1.0e5)

    # column names to pull from the file, must be in this order to sync with
    # the predefined classes in atlas_jets package
    offline_column_names = ['jet_AntiKt10LCTopo_%s' % col
                            for col in ['E', 'pt', 'm', 'eta', 'phi']]
    gTower_column_names = ['gTower%s' % col
                           for col in ['E', 'NCells', 'EtaMin', 'EtaMax', 'PhiMin', 'PhiMax']]
    all_column_names = offline_column_names + gTower_column_names
    tree_path = '%s/%s' % (directory, tree)

    matched_jet_pairs = []
    num_offlineEvents = 0

    # main loop that goes over the file
    for event_num in range(total_num_events):
        if event_num % 100 == 0 and event_num != 0:
            print("doing event_num=%d for (%d, %d, %d)" % (
                event_num, offline_jetpT_threshold, gTower_jetET_threshold,
                seed_ETthresh))

        # pull in data row by row
        data = rnp.root2rec(filename, treename=tree_path,
                            branches=all_column_names,
                            start=event_num, stop=event_num + 1)
        oEvent = OfflineJets.Event(
            event=[data[col][0] for col in offline_column_names])

        # if there are no offline jets, we skip it
        if len(oEvent.jets) == 0 or oEvent.jets[0].pT < offline_jetpT_threshold:
            continue
        num_offlineEvents += 1

        # the seed filter could also be rebuilt on an event by event basis
        tEvent = gTowers.TowerEvent(
            event=[data[col][0] for col in gTower_column_names],
            seed_filter=seed_filter)
        tEvent.get_event()

        paired_jets = match_jets(oJets=oEvent.jets, tJets=tEvent.event.jets)
        matched_jet_pairs.append(np.array(
            [[oJet.pT, tJet.E / np.cosh(tJet.eta)]
             for oJet, tJet in paired_jets
             if oJet.pT > offline_jetpT_threshold and tJet.E > 0.]))

    # at this point, we've processed all the data and we just need to make plots
    filename_ending = 'offline%d_gTower%d_seed%d_unweighted' % (
        offline_jetpT_threshold, gTower_jetET_threshold, seed_filter.ETthresh)

    # The per-event arrays have different lengths, so they are kept in a plain
    # list and flattened by hand instead of being wrapped in a ragged ndarray.
    all_pairs = []
    leading_pairs = []
    for item in matched_jet_pairs:
        if len(item) == 0:
            continue
        all_pairs.extend(item)
        # NOTE(review): "leading" is chosen by the largest value in column 1
        # (trigger E_T), not by offline pT -- confirm this is intended.
        max_trigger_et = np.amax(item[:, 1])
        leading_pairs.extend(l for l in item if l[1] == max_trigger_et)

    # reshape(-1, 2) keeps the [:, 0] / [:, 1] indexing valid when no pair
    # survived the selection.
    all_jet_pairs = np.array(all_pairs).reshape(-1, 2)
    leading_offline_jet_pairs = np.array(leading_pairs).reshape(-1, 2)

    xlim = (1e2, 5e2)
    ylim = (0., 1200.)
    title = (r'$p_T^{\mathrm{offline jet}}$ > %d GeV, %d events, $E_T^{\mathrm{tower jet}}$ > %d GeV, $E_T^{\mathrm{seed}}$ > %d GeV'
             % (offline_jetpT_threshold, num_offlineEvents,
                gTower_jetET_threshold, seed_filter.ETthresh))

    # All Jet Pairs
    _save_jet_pair_scatter(
        all_jet_pairs, r'offline $p_T^{\mathrm{jet}}$ [GeV]', title,
        'events_all_jet_pairs_%s' % filename_ending, xlim, ylim)

    # Leading Offline Jet Pairs
    _save_jet_pair_scatter(
        leading_offline_jet_pairs,
        r'leading offline $p_T^{\mathrm{jet}}$ [GeV]', title,
        'events_leading_offline_jet_pairs_%s' % filename_ending, xlim, ylim)
def loaddata(filename):
    """Read the ROOT file *filename* with ``root2rec`` and return the result."""
    return root2rec(filename)
q_eff = [0.69, 0.78, 0.82, 0.85, 0.85, 0.86, 0.85, 0.84, 0.81, 0.75, 0.65,
         0.50]

# Convert the photon energies in place from eV to MeV
for i, _ in enumerate(e_ph):
    e_ph[i] /= 1000000

# The lowest and highest photon energies bound the interpolation range.
interp_min, interp_max = e_ph[0], e_ph[-1]

# Spline giving quantum efficiency as a function of photon energy.
q_eff_fn = interpolate.UnivariateSpline(e_ph, q_eff)

# <codecell>

# Input events
data = root2rec(
    "/home/pythontutorial/mountpoint/output/length_study/lengthStudy1000.root",
    treename="ntp1")
print("got data")

# Output file for the processed data
f = open("processed.dat", "w")

# <codecell>

# Cuts for the plotting of the data
n_hits_plot_cut_low, n_hits_plot_cut_high = 0, 500

# Number of events processed
iEvent_proc = 0

# Arrays of processed number of hits and energy deposit
n_hits_proc, energy_proc = [], []
m = Minuit(x2reg, a=1, b=2, c=3)
m.migrad()
x2reg.show(m)

# <markdowncell>

# ###Let's do some physics
# Remember the D mass? Let's try to fit a relativistic Breit-Wigner to it.

# <codecell>

from root_numpy import root2rec

# <codecell>

# All samples, and the B* and cc* subsets, read by file-name pattern.
data = root2rec('data/*.root')
bb = root2rec('data/B*.root')
cc = root2rec('data/cc*.root')

# <codecell>

# Overlay the DMass distribution of the three samples, each passed through
# np.hstack first.
hs = np.hstack
hist([hs(sample.DMass) for sample in (data, bb, cc)],
     bins=50, histtype='step');

# <markdowncell>

# ###Simple fit
# First let's fit bb's DMass alone with a Breit-Wigner.

# <codecell>
# Bin edges for the tower histogram and the efficiency curves.
bins_towerHistogram = np.array(
    [0, 50, 100, 150, 200, 250, 300, 350, 400, 500, 750, 1000, 4000],
    dtype=float)
bins_efficiency = np.arange(0, 1240, 20).astype(float)

# One zero-initialised counter per bin (edges minus one).
hist_towerMultiplicity = np.zeros(len(bins_towerMultiplicity) - 1, dtype=float)
hist_towerHistogram = np.zeros(len(bins_towerHistogram) - 1, dtype=float)
hist_efficiency_num = np.zeros(len(bins_efficiency) - 1, dtype=float)
hist_efficiency_den = np.zeros(len(bins_efficiency) - 1, dtype=float)

num_offlineEvents = 0

# main loop that goes over the file
for event_num in range(total_num_events):
    if event_num % 100 == 0:
        print("doing event_num=%d for (%d, %d, %d)" % (
            event_num, offline_jetpT_threshold, gTower_jetET_threshold,
            seed_ETthresh))

    # pull in data row by row
    data = rnp.root2rec(
        filename,
        treename='%s/%s' % (directory, tree),
        branches=offline_column_names + gTower_column_names,
        start=event_num,
        stop=event_num + 1)
    offline_values = [data[col][0] for col in offline_column_names]
    oEvent = OfflineJets.Event(event=offline_values)

    # if there are no offline jets, we skip it
    if len(oEvent.jets) == 0 or oEvent.jets[0].pT < offline_jetpT_threshold:
        continue
    num_offlineEvents += 1

    # The seed filter could instead be rebuilt per event, e.g. with a maximum
    # number of seeds based on the number of offline jets.
    tower_values = [data[col][0] for col in gTower_column_names]
    tEvent = gTowers.TowerEvent(event=tower_values, seed_filter=seed_filter)

    # Match against the filtered towers (tEvent.event.jets is the alternative).
    paired_jets = match_jets(oJets=oEvent.jets, tJets=tEvent.filter_towers())
# <codecell>

for i in range(3):
    print('loop %s' % i)

# <markdowncell>

# ###Reading ROOT FILE

# <codecell>

from root_numpy import root2rec

# <codecell>

# Yes, reading is that simple: a glob pattern or a single file name.
bb = root2rec('data/B*.root')
cc = root2rec('data/cc-BtoDpi-all.root')

# <markdowncell>

# ###And plotting

# <codecell>

# Overlay R2All for the two samples as semi-transparent filled histograms.
hist([bb.R2All, cc.R2All],
     bins=100,
     histtype='stepfilled',
     color=['red', 'green'],
     alpha=0.5,
     label=[r'$B\bar{B}$', r'$c\bar{c}$'])
legend().get_frame().set_alpha(0.5)
# Convert the photon energies in place from eV to MeV
for i, _ in enumerate(e_ph):
    e_ph[i] /= 1000000

# The lowest and highest photon energies bound the interpolation range.
interp_min, interp_max = e_ph[0], e_ph[-1]

# Spline giving quantum efficiency as a function of photon energy.
q_eff_fn = interpolate.UnivariateSpline(e_ph, q_eff)

# <codecell>

# Input events
data = root2rec(
    "/home/pythontutorial/mountpoint/output/length_study/lengthStudy1000.root",
    treename="ntp1")
print("got data")

# Output file for the processed data
f = open("processed.dat", "w")

# <codecell>

# Cuts for the plotting of the data
n_hits_plot_cut_low, n_hits_plot_cut_high = 0, 500

# Number of events processed
iEvent_proc = 0

# Arrays of processed number of hits and energy deposit
n_hits_proc = []
def test_expression():
    """A branch expression gives the same values as computing it afterwards."""
    pattern = load('single*.root')
    expression = 'f_float*2'
    plain = rnp.root2rec(pattern)
    evaluated = rnp.root2rec(load('single*.root'), branches=[expression])
    assert_array_equal(plain['f_float'] * 2, evaluated[expression])
# ### ... and continue below the "Save dataframes" section

# <markdowncell>

# # Load data from ROOT trees

# <codecell>

# tree data information for import
branches = ['mcnpart', 'mcid', 'mcm', 'mcp', 'mctheta', 'mcphi']
rootfiles = ['genev000.root', 'genev100.root', 'genev200.root']
# data set names corresponding to each of rootfiles
dfnames = ['000', '100', '200']

# import tree data as 3 numpy structured records (first 10000 entries each)
treerecs = [root2rec('../testdata/%s' % fn, branches=branches)[0:10000]
            for fn in rootfiles]
# associate data sets with dfnames
dfdict = dict(zip(dfnames, [pd.DataFrame(treerec) for treerec in treerecs]))
# create single data frame with multi-index (data set name, row number).
# pd.Panel no longer exists in current pandas; concatenating with keys builds
# the same (major, minor)-indexed frame with one column per branch.
# NOTE(review): Panel.to_frame() also dropped rows containing NaN by default;
# add .dropna() here if the inputs can contain missing values.
df = pd.concat([dfdict[name] for name in dfnames],
               keys=dfnames, names=['major', 'minor'])

# <markdowncell>

# # Add lab-frame 4-momentum components

# <codecell>

# angles scaled by d2r, then element-wise cosine / sine
dfrad = df[['mctheta', 'mcphi']] * d2r
dfcos = dfrad.applymap(np.cos)
dfsin = dfrad.applymap(np.sin)
df['pz'] = df.mcp * dfcos.mctheta
def main():
    """Submit one ``refitter`` batch job per TPC ntuple found under the input tree.

    Walks the KEKCC input directory, derives the output path of the skim for
    each file, removes an already existing output, and submits the job with
    ``bsub``: files with more than 60000 entries go to the long queue (``l``),
    the rest to the short queue (``s``).  If no job was submitted at all,
    ``job_check.main()`` from the ``py`` directory is run instead.
    """
    ### Paths for KEKCC
    ifpath = '/ghi/fs01/belle2/bdata/group/detector/BEAST/data/NTP/TPC/'
    # Directory names whose files are never processed.
    excluded_dirs = ('badtime', 'old', 'ENV', 'tmp', 'ToRemove')

    ### Debug variables
    counter = 0

    for subdir, dirs, files in os.walk(ifpath):
        for f in files:
            ofpath = '/ghi/fs01/belle2/bdata/group/detector/BEAST/data/NTP/'
            test = subdir.split('/')

            # Never re-process the skim output directories themselves.
            if 'skims' in test:
                continue

            tpc_num = f.split('_')[0]
            date_dir = test[-1]
            if 'TPC4' in test:
                date_dir = '2016-05-10'
                print('Date dir is:', date_dir)
            print('Directory is:', subdir)

            # Output goes to <NTP>/<TPC3|TPC4>/<date>/ ; files with any other
            # prefix land directly under <NTP>/<date>/.
            if tpc_num == 'tpc3':
                ofpath += 'TPC3/'
            elif tpc_num == 'tpc4':
                ofpath += 'TPC4/'
            ofpath += str(date_dir) + '/'

            if any(name in test for name in excluded_dirs):
                continue

            ifile = os.path.join(subdir, f)
            infile_name = f.split('/')[-1].split('.')
            ofile = str(ofpath) + str(infile_name[0]) + '_skim' + '.root'

            ### Uncomment this line if only non-existing files are to be generated
            #if os.path.isfile(ofile): continue

            counter += 1

            ### All files are regenerated: an existing output is deleted first.
            if os.path.isfile(ofile):
                os.remove(ofile)

            print('Infile is:', ifile)
            print('Outfile is:', ofile)

            log = 'logs/' + str(f) + '.log'

            ### Send large files to long queue, small files to short queue
            df = root2rec(ifile, 'tree', branches='m_event')
            evts = len(df)
            queue = 'l' if evts > 60000 else 's'
            os.system('bsub -q %s -o %s "./refitter %s %s"'
                      % (queue, log, ifile, ofile))

    if counter == 0:
        sys.path.append('py')
        import job_check
        job_check.main()
def test_vector():
    """Check dtypes and a few values of the vector branches in vector.root.

    ``v_*`` branches hold one array per entry, ``vv_*`` branches hold an
    object array of arrays per entry.  ``np.bool_`` / ``np.object_`` are used
    instead of the ``np.bool`` / ``np.object`` aliases, which newer NumPy
    releases no longer provide.
    """
    a = rnp.root2rec(load('vector.root'))

    # branch suffix -> dtype of the innermost arrays
    element_dtypes = [
        ('i', np.int32),
        ('f', np.float32),
        ('F', np.float32),
        ('d', np.float64),
        ('l', np.int64),
        ('c', np.int8),
        ('b', np.bool_),
    ]

    # every branch is stored as an object column
    types = ([('v_' + suffix, 'O') for suffix, _ in element_dtypes] +
             [('vv_' + suffix, 'O') for suffix, _ in element_dtypes])
    assert_equal(a.dtype, types)

    for suffix, dtype in element_dtypes:
        assert_equal(getattr(a, 'v_' + suffix)[0].dtype, dtype)

    for suffix, dtype in element_dtypes:
        nested = getattr(a, 'vv_' + suffix)[0]
        # assert that wrapper array is np.object_
        assert_equal(nested.dtype, np.object_)
        assert_equal(nested[0].dtype, dtype)

    # check a few values
    assert_equal(a.v_i[0][0], 1)
    assert_equal(a.v_i[1][1], 3)
    assert_equal(a.v_i[-2][0], 9)
    assert_equal(a.v_i[-2][-1], 17)

    for name in ('v_f', 'v_F'):
        column = getattr(a, name)
        assert_equal(column[0][0], 2.0)
        assert_equal(column[1][1], 5.0)
        assert_equal(column[-2][0], 18.0)
        assert_equal(column[-2][-1], 26.0)

    # more strict conditioning for numpy arrays
    def assert_equal_array(arr1, arr2):
        return assert_equal((arr1 == arr2).all(), True,
                            "array mismatch: {0} != {1}".format(arr1, arr2))

    assert_equal_array(a.vv_i[0][0], np.array([1], dtype=np.int32))
    assert_equal_array(a.vv_i[1][1], np.array([2, 3], dtype=np.int32))
    assert_equal_array(a.vv_i[-2][0], np.array([9], dtype=np.int32))
    assert_equal_array(a.vv_i[-2][-1],
                       np.array([9, 10, 11, 12, 13, 14, 15, 16, 17],
                                dtype=np.int32))

    for name in ('vv_f', 'vv_F'):
        column = getattr(a, name)
        assert_equal_array(column[0][0], np.array([2.], dtype=np.float32))
        assert_equal_array(column[1][1], np.array([4., 5.], dtype=np.float32))
        assert_equal_array(column[-2][0], np.array([18.], dtype=np.float32))
        assert_equal_array(column[-2][-1],
                           np.array([18., 19., 20., 21., 22., 23., 24., 25.,
                                     26.], dtype=np.float32))
def get_initial_DataFrame(inFile, TTree_name_arr, eta_bins, pt_bins, pid_dict, classes_str):
    """Load the jet data as a pandas DataFrame.

    If ``inFile`` is given, the DataFrame is built from the listed trees of
    that ROOT file, the selection cuts are applied and the result is stored as
    HDF5 under ``inputFiles/`` so that a later run with the same cuts (but
    e.g. a different reweighting procedure) can skip the conversion.  Without
    ``inFile`` the previously stored HDF5 file is read back.

    Args:
        inFile: path to the ROOT input file, or a false value to load the
            stored HDF5 file instead.
        TTree_name_arr: names of the trees in ``inFile`` to concatenate.
        eta_bins: eta bin edges; the last edge is the maximum |eta| accepted.
        pt_bins: pT bin edges; the first edge is the minimum pT accepted and
            the last edge goes into the output file name.
        pid_dict: maps flavour names ("b", "c", "u", optionally "tau") to the
            label values to keep.
        classes_str: string inserted into the output file name.

    Returns:
        ``(df, file_info_str)`` on success, or ``False`` if no ROOT file was
        given and the expected HDF5 file does not exist.
    """
    import os
    from btag_nn_inputs import jet_eta_str, jet_pt_str, default_sample_info

    if inFile:
        from numpy.lib.recfunctions import stack_arrays
        from root_numpy import root2rec

        print('Convert ROOT file to pandas DataFrame...')
        # One frame per tree, concatenated with a fresh running index.
        df = pd.concat(
            [pd.DataFrame(stack_arrays([root2rec(inFile, tree_name)]))
             for tree_name in TTree_name_arr],
            ignore_index=True)
        print('conversion complete')

        # only interested in absolute values of eta and the label, so this
        # will speed the calculations up:
        df.update(df[jet_eta_str].abs(), join='left', overwrite=True)  # only use absolute value of eta
        df.update(df['label'].abs(), join='left', overwrite=True)  # only use absolute value of labels

        # dataset selection: pile-up removal and selection in eta, pT
        # acceptance region, limited to b-, c- and light jets (plus tau jets
        # if pid_dict defines them):
        flavour_mask = ((df['label'] == pid_dict.get("b")) |
                        (df['label'] == pid_dict.get("c")) |
                        (df['label'] == pid_dict.get("u")))
        if "tau" in pid_dict:
            flavour_mask = flavour_mask | (df['label'] == pid_dict.get("tau"))
        df = df[flavour_mask]  # jet flavor selection

        # jet min-pT and max-abs-eta cut
        df = df[(df[jet_pt_str] > pt_bins[0]) & (df[jet_eta_str] < eta_bins[-1])]

        # pile-up removal (use this when working in GeV)
        df = df[((df['JVT'] > 0.59) & (df[jet_eta_str] < 2.4) & (df[jet_pt_str] < 60.))
                | (df[jet_pt_str] >= 60.)
                | (df[jet_eta_str] >= 2.4)]

        # store as HDF5 file to speed up the progress for next iteration.
        # Floor division keeps the pT label an integer on every Python version.
        file_info_str = (inFile.split('/')[1].replace('.root', '') + '_' +
                         classes_str + 'jets_pTmax' +
                         str(int(pt_bins[-1]) // 1000) + 'GeV')
        df.to_hdf('inputFiles/' + file_info_str + '.h5', 'df')
        print('saved input data in HDF5 format for next run.')
        return df, file_info_str

    file_info_str = (default_sample_info + '_' + classes_str + 'jets_pTmax' +
                     str(int(pt_bins[-1])))
    try:
        if not os.path.isfile('inputFiles/' + file_info_str + '.h5'):
            print("File does not exist. Try running the path to the ROOT file as additional argument.")
            return False
    except IOError as ex:
        # The original handler printed an undefined name here.
        print('({})'.format(ex))
    return pd.read_hdf('inputFiles/' + file_info_str + '.h5', 'df'), file_info_str
def load_data(ipath):
    ''' Loads the dataset

    :type ipath: string
    :param ipath: directory containing the ROOT files to read; every entry of
        the directory is passed to ``root2rec``

    Each selected event becomes a flattened 336x80 hit map (1 where a pixel
    was hit) with a class label: 0 = alpha, 1 = x-ray, 2 = neutron.  Only
    every 50th alpha and every 50th x-ray event is kept.  The events are split
    80% / 10% / 10% into training, validation and test sets.

    :return: ``[(train_x, train_y), (valid_x, valid_y), (test_x, test_y)]``
        where every element is a Theano shared variable (labels cast to int32)
    '''

    #############
    # LOAD DATA #
    #############

    print('... loading data')

    alpha_counter = 1
    xray_counter = 1

    data_x = []
    data_y = []

    datapath = ipath
    for f in os.listdir(datapath):
        # os.path.join also works when ipath has no trailing separator
        ifile = os.path.join(str(datapath), str(f))
        data = root2rec(ifile)
        for event in data:
            e_type = 3  # 3 means "not selected"

            if event.proton == 1 or (event.other == 1 and event.xray == 0):
                continue

            # keep only every 50th alpha event
            if event.bottom_alpha == 1 or event.top_alpha == 1:
                alpha_counter += 1
                if alpha_counter % 50 == 0:
                    e_type = 0
                    alpha_counter = 1

            # keep only every 50th x-ray event
            if event.xray == 1:
                xray_counter += 1
                if xray_counter % 50 == 0:
                    e_type = 1
                    xray_counter = 1

            if event.neutron == 1:
                e_type = 2

            if e_type < 3:
                data_y.append(e_type)

                # binary hit map; row/col are used as 1-based indices
                pix = numpy.zeros([336, 80])
                rows = numpy.asarray(event.row)[:event.npoints]
                cols = numpy.asarray(event.col)[:event.npoints]
                pix[rows - 1, cols - 1] = 1
                data_x.append(numpy.reshape(pix, 336 * 80))

    data_x = numpy.asarray(data_x, dtype=numpy.float32)
    data_y = numpy.asarray(data_y, dtype=numpy.float32)

    # 80% training; the remainder is shared between validation and test.
    # Explicit boundaries are used so that no sample is dropped between the
    # validation and test sets, and so that an empty test set stays empty
    # (a slice like data_x[-0:] would return the whole data set).
    n_total = len(data_x)
    train = int(n_total * 0.8)
    test = int((n_total - train) / 2)
    test_start = n_total - test

    train_x = data_x[:train]
    train_y = data_y[:train]

    valid_x = data_x[train:test_start]
    valid_y = data_y[train:test_start]

    test_x = data_x[test_start:]
    test_y = data_y[test_start:]

    test_set = [test_x, test_y]
    valid_set = [valid_x, valid_y]
    train_set = [train_x, train_y]

    # train_set, valid_set, test_set format: [input, target]
    # input is a numpy.ndarray of 2 dimensions (a matrix)
    # where each row corresponds to an example. target is a
    # numpy.ndarray of 1 dimension (vector) that has the same length as
    # the number of rows in the input. It should give the target
    # to the example with the same index in the input.

    def shared_dataset(data_xy, borrow=True):
        """ Function that loads the dataset into shared variables

        The reason we store our dataset in shared variables is to allow
        Theano to copy it into the GPU memory (when code is run on GPU).
        Since copying data into the GPU is slow, copying a minibatch every
        time it is needed (the default behaviour if the data is not in a
        shared variable) would lead to a large decrease in performance.
        """
        data_x, data_y = data_xy
        shared_x = theano.shared(numpy.asarray(data_x,
                                               dtype=theano.config.floatX),
                                 borrow=borrow)
        shared_y = theano.shared(numpy.asarray(data_y,
                                               dtype=theano.config.floatX),
                                 borrow=borrow)
        # When storing data on the GPU it has to be stored as floats
        # therefore we will store the labels as ``floatX`` as well
        # (``shared_y`` does exactly that). But during our computations
        # we need them as ints (we use labels as index, and if they are
        # floats it doesn't make sense) therefore instead of returning
        # ``shared_y`` we will have to cast it to int. This little hack
        # lets us get around this issue
        return shared_x, T.cast(shared_y, 'int32')

    test_set_x, test_set_y = shared_dataset(test_set)
    valid_set_x, valid_set_y = shared_dataset(valid_set)
    train_set_x, train_set_y = shared_dataset(train_set)

    rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
            (test_set_x, test_set_y)]
    return rval
def getvar(var, filename):
    """Return the branch *var* read from the 'tree' tree of *filename*."""
    record = root2rec(filename, 'tree', [var])
    return record[var]
def load(self):
    """Read ``<directory>/<tree>`` from ``self.filename`` into ``self.data``.

    Prints a one-line confirmation naming the file (without its directory
    part), the directory and the tree.
    """
    tree_path = '%s/%s' % (self.directory, self.tree)
    self.data = rnp.root2rec(self.filename, tree_path)
    file_basename = self.filename.split(os.sep)[-1]
    print("Loaded %s:%s/%s" % (file_basename, self.directory, self.tree))