def createWeighterObjects(self, allsourcefiles):
    # Calculates the weights needed for flattening the pt/eta spectrum
    from DeepJetCore.Weighter import Weighter
    weighter = Weighter()
    weighter.undefTruth = self.undefTruth
    weighter.class_weights = self.class_weights
    branches = [self.weightbranchX, self.weightbranchY]
    branches.extend(self.truth_branches)

    if self.remove:
        weighter.setBinningAndClasses(
            [self.weight_binX, self.weight_binY],
            self.weightbranchX, self.weightbranchY,
            self.truth_branches, self.red_classes,
            self.truth_red_fusion, method=self.referenceclass)

    counter = 0
    if self.remove:
        for fname in allsourcefiles:
            fileTimeOut(fname, 120)
            nparray = uproot_root2array(fname,
                                        treename="ttree",
                                        stop=None,
                                        branches=branches)
            norm_hist = True
            if self.referenceclass == 'flatten':
                norm_hist = False
            weighter.addDistributions(nparray, norm_h=norm_hist)
            #del nparray
            counter = counter + 1
        weighter.createRemoveProbabilitiesAndWeights(self.referenceclass)

    print("calculate means")
    from DeepJetCore.preprocessing import meanNormProd
    nparray = self.readTreeFromRootToTuple(
        allsourcefiles,
        branches=self.vtx_branches + self.eta_rel_branches +
        self.track_branches + self.global_branches,
        limit=500000)
    print("read tree from sourcefiles")
    # zero out non-finite and absurdly large values before computing the means
    for a in (self.vtx_branches + self.eta_rel_branches +
              self.track_branches + self.global_branches):
        for b in range(len(nparray[a])):
            nparray[a][b] = np.where(
                np.logical_and(np.isfinite(nparray[a][b]),
                               np.abs(nparray[a][b]) < 100000.0),
                nparray[a][b], 0)
    means = np.array([], dtype='float32')
    if len(nparray):
        means = meanNormProd(nparray)
    print("weighter created")
    # note: the 'weigther' key spelling is kept as-is; it is what the
    # convertFromSourceFile implementations below look up
    return {'weigther': weighter, 'means': means}
def convertFromSourceFile(self, filename, weighterobjects, istraining, treename="Events"): fileTimeOut(filename, 10)#10 seconds for eos to recover tree = uproot.open(filename)[treename] rechitcoll = RecHitCollection(use_true_muon_momentum=self.include_tracks, cp_plus_pu_mode=self.cp_plus_pu_mode, tree=tree) #in a similar manner, we can also add tracks from conversions etc here if self.include_tracks: trackcoll = TrackCollection(tree=tree) rechitcoll.append(trackcoll) # adds t_is_unique rechitcoll.addUniqueIndices() # converts to DeepJetCore.SimpleArray farr = rechitcoll.getFinalFeaturesSA() t = rechitcoll.getFinalTruthDictSA() return [farr, t['t_idx'], t['t_energy'], t['t_pos'], t['t_time'], t['t_pid'], t['t_spectator'], t['t_fully_contained'], t['t_rec_energy'], t['t_is_unique'] ],[], []
def createWeighterObjects(self, allsourcefiles):
    # Calculates the weights needed for flattening the pt/eta spectrum
    from DeepJetCore.Weighter import Weighter
    weighter = Weighter()
    weighter.undefTruth = self.undefTruth
    branches = [self.weightbranchX, self.weightbranchY]
    branches.extend(self.truth_branches)

    if self.remove:
        weighter.setBinningAndClasses(
            [self.weight_binX, self.weight_binY],
            self.weightbranchX, self.weightbranchY,
            self.truth_branches)

    counter = 0
    import ROOT
    from root_numpy import tree2array, root2array
    if self.remove:
        for fname in allsourcefiles:
            fileTimeOut(fname, 120)
            nparray = root2array(fname,
                                 treename="deepntuplizer/tree",
                                 stop=None,
                                 branches=branches)
            weighter.addDistributions(nparray)
            #del nparray
            counter = counter + 1
        weighter.createRemoveProbabilitiesAndWeights(self.referenceclass)
    return {'weigther': weighter}
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    # this function defines how to convert the root ntuple to the training format
    # options are not yet described here
    import ROOT

    fileTimeOut(filename, 120)  # give eos a minute to recover
    rfile = ROOT.TFile(filename)
    tree = rfile.Get("tree")
    self.nsamples = tree.GetEntries()

    # user code, example works with the example 2D images in root format generated by make_example_data
    from DeepJetCore.preprocessing import read2DArray

    print(filename)
    feature_array = read2DArray(filename, "tree", "image2d",
                                self.nsamples, 32, 32)
    print('feature_array', feature_array.shape)

    truth = self.read_truthclasses(filename)

    #notremoves = weighter.createNotRemoveIndices(Tuple)
    # this removes parts of the dataset for weighting the events
    #feature_array = feature_array[notremoves > 0]

    # call this in the end
    self.nsamples = len(feature_array)
    self.x = [feature_array]  # list of feature numpy arrays
    self.y = [truth]          # list of target numpy arrays (truth)
    self.w = []               # list of weight arrays. One for each truth target, not used
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    # this function defines how to convert the root ntuple to the training format
    # options are not yet described here
    fileTimeOut(filename, 120)
    uproot_tree = uproot.open(filename)['clusters']

    cluster_pt = uproot_tree.array('cluster_pt')
    pt_filter = cluster_pt > 5.
    n_cell = uproot_tree.array('n_cell')

    def to_ndarray(*args):
        return numpy.squeeze(numpy.dstack(args))

    branches = [
        'cell_layer', 'cell_x', 'cell_y', 'cell_z', 'cell_r', 'cell_eta',
        'cell_theta', 'cell_phi', 'cell_dist', 'cell_energy', 'cell_wafer',
        'cell_wafertype'
    ]

    print("reading feature array")
    feature_array = uproot_tree.arrays(branches, outputtype=to_ndarray)
    print(feature_array.shape)
    print("reading truth")
    #truth = self.read_truthclasses(filename)
    truth = uproot_tree.arrays(self.truthclasses, outputtype=to_ndarray)

    print("creating remove indices")
    Tuple = self.readTreeFromRootToTuple(filename)
    notremoves = weighter.createNotRemoveIndices(Tuple)
    notremoves += pt_filter

    # this removes parts of the dataset for weighting the events
    if self.remove:
        feature_array = feature_array[notremoves > 0]
        n_cell = n_cell[notremoves > 0]
        truth = truth[notremoves > 0]

    # call this in the end
    self.nsamples = len(feature_array)
    self.x = [n_cell, feature_array]  # list of feature numpy arrays
    self.y = [truth]                  # list of target numpy arrays (truth)
    self.w = []                       # list of weight arrays. One for each truth target
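# Note on the `outputtype` pattern above: as the signature of to_ndarray(*args)
# shows, uproot3's TTree.arrays() invokes the given callable with one positional
# argument per requested branch; numpy.dstack then stacks the per-branch arrays
# along a trailing axis. A toy equivalent (shapes illustrative only):
#
#   a = numpy.ones((4, 3))
#   b = numpy.zeros((4, 3))
#   numpy.squeeze(numpy.dstack((a, b))).shape   # -> (4, 3, 2)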
def readPU(minbias_files, nevents=50, nfiles=5, nPU=200):
    from DeepJetCore.TrainData import fileTimeOut
    import ROOT

    select = np.array(range(len(minbias_files)))
    np.random.shuffle(select)
    if len(select) < nfiles:
        nfiles = len(select)
        print("mixing.readPU: warning: fewer PU files available than requested - falling back")
    print(nfiles)

    # open the files and take nPU random events
    inarrs = []
    i = 0
    while len(inarrs) < nfiles:
        file = minbias_files[select[i]]
        i += 1
        fileTimeOut(file, 10)  # check if file is valid
        try:
            f = ROOT.TFile.Open(file)
            f.Get("B4")
        except Exception:
            continue
        ramfile = file
        try:
            tree = uproot.open(ramfile)["B4"]
            arr = tonumpy(tree["rechit_energy"].array())
            inarrs.append(arr)
        except Exception:
            continue

    allarr = np.concatenate(inarrs, axis=0)  # nfiles*nev x rh

    # mix events: each output event is the sum of nPU randomly picked minbias events
    evtarrs = []
    for ev in range(nevents):
        idx = np.random.randint(allarr.shape[0], size=nPU)
        evt = allarr[idx]
        evt = np.sum(evt, axis=0, keepdims=True)
        evtarrs.append(evt)

    evts = np.concatenate(evtarrs, axis=0)
    return evts
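# readPU relies on a helper `tonumpy` that is not defined in this file. A
# minimal sketch under the assumption that every B4 minbias event has the same
# number of rechits (fixed calorimeter geometry), so the jagged array returned
# by uproot can be stacked into a regular (nevents, nrechits) numpy array:
def tonumpy(jagged_array):
    import numpy as np
    return np.asarray([np.asarray(ev, dtype='float32') for ev in jagged_array])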
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    # this function defines how to convert the root ntuple to the training format
    # options are not yet described here
    import ROOT

    fileTimeOut(filename, 120)  # give eos a minute to recover
    rfile = ROOT.TFile(filename)
    tree = rfile.Get(self.treename)
    self.nsamples = tree.GetEntries()

    # user code, example works with the example 2D images in root format generated by make_example_data
    from DeepJetCore.preprocessing import readListArray

    feature_array, n_rechits_cut = readListArray(
        filename,
        self.treename,
        self.feat_branch,
        self.nsamples,
        list_size=self.max_rechits,
        n_feat_per_element=self.n_features,
        zeropad=True,
        list_size_cut=True)

    energy_only = feature_array[:, :, 0:1]  # keep dimension

    fraction_array, _ = readListArray(
        filename,
        self.treename,
        self.truth_branch,
        self.nsamples,
        list_size=self.max_rechits,
        # nsimcluster, right now just one, but zero-padded here
        n_feat_per_element=self.n_simcluster,
        zeropad=True,
        list_size_cut=True)

    print('TrainData_hitlistX: ', filename,
          ';convert from root: fraction of hits cut ',
          100. * float(n_rechits_cut) / float(self.nsamples), '%')

    # needs the energy, too, to determine weights
    fraction_array = numpy.concatenate([fraction_array, energy_only], axis=-1)

    # in case something was removed here
    if n_rechits_cut > 0:
        feature_array = feature_array[0:self.nsamples - n_rechits_cut]
        fraction_array = fraction_array[0:self.nsamples - n_rechits_cut]

    self.nsamples = len(feature_array)
    self.x = [feature_array]
    self.y = [fraction_array]  # we need the features also in the truth part for weighting
    self.w = []  # no event weights
def fileIsValid(self, filename):
    try:
        fileTimeOut(filename, 2)
        tree = uproot.open(filename)["WindowNTupler/tree"]
        f = ROOT.TFile.Open(filename)
        t = f.Get("WindowNTupler/tree")
        if t.GetEntries() < 1:
            raise ValueError("file %s has no entries" % filename)
    except Exception as e:
        print(e)
        return False
    return True
def convertFromSourceFile(self, filename, weighterobjects, istraining):
    from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles
    import numpy
    from DeepJetCore.stopwatch import stopwatch
    sw = stopwatch()
    swall = stopwatch()

    import ROOT
    fileTimeOut(filename, 120)  # give eos a minute to recover
    rfile = ROOT.TFile(filename)
    tree = rfile.Get("deepntuplizer/tree")
    self.nsamples = tree.GetEntries()

    print('took ', sw.getAndReset(), ' seconds for getting tree entries')

    # split for convolutional network
    x_global = MeanNormZeroPad(filename, None,
                               [self.branches[0]],
                               [self.branchcutoffs[0]], self.nsamples)
    x_cpf = MeanNormZeroPadParticles(filename, None,
                                     self.branches[1],
                                     self.branchcutoffs[1], self.nsamples)
    x_npf = MeanNormZeroPadParticles(filename, None,
                                     self.branches[2],
                                     self.branchcutoffs[2], self.nsamples)
    x_sv = MeanNormZeroPadParticles(filename, None,
                                    self.branches[3],
                                    self.branchcutoffs[3], self.nsamples)

    print('took ', sw.getAndReset(),
          ' seconds for mean norm and zero padding (C module)')

    Tuple = self.readTreeFromRootToTuple(filename)
    truthtuple = Tuple[self.truthclasses]
    #print(self.truthclasses)
    alltruth = self.reduceTruth(truthtuple)

    print(x_global.shape, self.nsamples)
    return [x_global, x_cpf, x_npf, x_sv], [alltruth], []
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    # this function defines how to convert the root ntuple to the training format
    # options are not yet described here
    fileTimeOut(filename, 120)
    uproot_tree = uproot.open(filename)['clusters']

    cluster_pt = uproot_tree.array('cluster_pt')
    pt_filter = cluster_pt > 5.
    n_cell = uproot_tree.array('n_cell')

    def to_ndarray(*args):
        return np.stack(args, axis=-1)

    branches = ['cell_energy', 'cell_theta', 'cell_phi', 'cell_z']

    print("reading feature array")
    feature_array = uproot_tree.arrays(branches, outputtype=to_ndarray)
    print(feature_array.shape)
    print("reading truth")
    #truth = self.read_truthclasses(filename)
    truth = uproot_tree.arrays(self.truthclasses, outputtype=to_ndarray)

    # merge the raw truth classes into four categories
    egamma = truth[..., 0:1] + truth[..., 2:3]
    muon = truth[..., 1:2]
    pi0 = truth[..., 3:4]
    hadron = truth[..., 4:5] + truth[..., 5:6]
    truth = np.concatenate((egamma, muon, pi0, hadron), axis=-1)

    print("creating remove indices")
    Tuple = self.readTreeFromRootToTuple(filename)
    notremoves = weighter.createNotRemoveIndices(Tuple)
    notremoves += pt_filter

    # this removes parts of the dataset for weighting the events
    if self.remove:
        feature_array = feature_array[notremoves > 0]
        n_cell = n_cell[notremoves > 0]
        truth = truth[notremoves > 0]

    # call this in the end
    self.nsamples = len(feature_array)
    self.x = [feature_array, n_cell]  # list of feature numpy arrays
    self.y = [truth]                  # list of target numpy arrays (truth)
    self.w = []                       # list of weight arrays. One for each truth target
def convertFromSourceFile(self, filename, weighterobjects, istraining):
    # This is the only really mandatory function (unless writeFromSourceFile is defined).
    # It defines the conversion rule from an input source file to the lists of training
    # arrays self.x, self.y, self.w
    #  self.x is a list of input feature arrays
    #  self.y is a list of truth arrays
    #  self.w is optional and can contain a weight array
    #         (needs to have the same number of entries as the truth array)
    #         If no weights are needed, this can be left completely empty
    #
    # The conversion should convert finally to numpy arrays. In the future,
    # also tensorflow tensors will be supported.
    #
    # In this example, different ways of reading files are deliberately mixed

    print('reading ' + filename)

    import ROOT
    fileTimeOut(filename, 120)  # give eos a minute to recover
    rfile = ROOT.TFile(filename)
    tree = rfile.Get("tree")
    nsamples = tree.GetEntries()

    # user code, example works with the example 2D images in root format generated by make_example_data
    from DeepJetCore.preprocessing import read2DArray

    feature_array = read2DArray(filename, "tree", "image2d", nsamples, 32, 32)
    print('feature_array', feature_array.shape)

    import uproot
    urfile = uproot.open(filename)["tree"]
    truth = np.concatenate([
        np.expand_dims(urfile.array("isA"), axis=1),
        np.expand_dims(urfile.array("isB"), axis=1),
        np.expand_dims(urfile.array("isC"), axis=1)
    ], axis=1)
    truth = truth.astype(dtype='float32', order='C')  # important: float32 and C-contiguous!

    self.nsamples = len(feature_array)

    # returns a list of feature arrays, a list of truth arrays and a list of weight arrays
    return [feature_array], [truth], []
def fileIsValid(self, filename):
    # uproot does not raise exceptions early enough for testing
    import ROOT
    try:
        fileTimeOut(filename, 2)
        tree = uproot.open(filename)["Events"]
        f = ROOT.TFile.Open(filename)
        t = f.Get("Events")
        if t.GetEntries() < 1:
            raise ValueError("file %s has no entries" % filename)
    except Exception as e:
        print('problem with file', filename)
        print(e)
        return False
    return True
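# Typical use of the validity check above: pre-filter the source file list so
# corrupt or empty files never reach conversion. A small usage sketch (`td`
# stands for an instance of this TrainData class; the variable names are
# illustrative only):
#
#   good_files = [f for f in allsourcefiles if td.fileIsValid(f)]
#   print(len(allsourcefiles) - len(good_files), 'files skipped')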
def createWeighterObjects(self, allsourcefiles):
    # Calculates the weights needed for flattening the pt/eta spectrum
    from DeepJetCore.Weighter import Weighter
    weighter = Weighter()
    weighter.undefTruth = self.undefTruth
    branches = [self.weightbranchX, self.weightbranchY]
    branches.extend(self.truth_branches)

    if self.remove:
        weighter.setBinningAndClasses(
            [self.weight_binX, self.weight_binY],
            self.weightbranchX, self.weightbranchY,
            self.truth_branches)

    counter = 0
    import ROOT
    from root_numpy import tree2array, root2array
    if self.remove:
        for fname in allsourcefiles:
            fileTimeOut(fname, 120)
            nparray = root2array(fname,
                                 treename="deepntuplizer/tree",
                                 stop=None,
                                 branches=branches)
            weighter.addDistributions(nparray)
            #del nparray
            counter = counter + 1
        weighter.createRemoveProbabilitiesAndWeights(self.referenceclass)

    print("calculate means")
    from DeepJetCore.preprocessing import meanNormProd
    nparray = self.readTreeFromRootToTuple(
        allsourcefiles,
        branches=self.vtx_branches + self.eta_rel_branches +
        self.track_branches + self.global_branches,
        limit=500000)
    for a in (self.vtx_branches + self.eta_rel_branches +
              self.track_branches + self.global_branches):
        for b in range(len(nparray[a])):
            nparray[a][b] = np.where(nparray[a][b] < 100000.0,
                                     nparray[a][b], 0)
    means = np.array([], dtype='float32')
    if len(nparray):
        means = meanNormProd(nparray)
    return {'weigther': weighter, 'means': means}
def convertFromSourceFile(self, filename, weighterobjects, istraining, treename="SLCIOConverted"): fileTimeOut(filename, 10)#10 seconds for eos to recover tree = uproot.open(filename)[treename] nevents = tree.numentries selection=None hit_energy , rs = self.branchToFlatArray(tree["energy"], True,selection) hit_x = self.branchToFlatArray(tree["positionX"], False,selection) hit_y = self.branchToFlatArray(tree["positionY"], False,selection) hit_z = self.branchToFlatArray(tree["positionZ"], False,selection) hit_ass_truth_idx = self.branchToFlatArray(tree["maxE_particle_index"], False,selection) hit_ass_truth_energy = self.branchToFlatArray(tree["maxE_particle_energy"], False,selection) #not used right now hit_ass_truth_pX = self.branchToFlatArray(tree["maxE_particle_pX"], False,selection) hit_ass_truth_pY = self.branchToFlatArray(tree["maxE_particle_pY"], False,selection) hit_ass_truth_pZ = self.branchToFlatArray(tree["maxE_particle_pZ"], False,selection) features = np.concatenate([ hit_energy, hit_x , hit_y, hit_z ], axis=-1) farr = SimpleArray(features,rs,name="features") t_idxarr = SimpleArray(hit_ass_truth_idx,rs,name="t_idx") t_energyarr = SimpleArray(hit_ass_truth_energy,rs,name="t_energy") zeros = np.zeros_like(hit_ass_truth_energy) #just for compatibility t_posarr = SimpleArray(zeros,rs,name="t_pos") t_time = SimpleArray(zeros,rs,name="t_time") t_pid = SimpleArray(zeros,rs,name="t_pid") #this would need some massaging so we can't use the PID directly t_spectator = SimpleArray(zeros,rs,name="t_spectator") t_fully_contained = SimpleArray(zeros,rs,name="t_fully_contained") t_rest = SimpleArray(zeros,rs,name="t_rest") #breaks with old plotting but needs to be done at some point return [farr, t_idxarr, t_energyarr, t_posarr, t_time, t_pid, t_spectator, t_fully_contained],[t_rest], []
def readTreeFromRootToTuple(self, filenames, limit=None, branches=None):
    '''
    To be used to get the initial tuple for further processing in inheriting classes.
    Makes sure the number of entries is properly set.
    Can also read a list of files (e.g. to produce weights/removes
    from larger statistics) (not fully tested, yet)
    '''
    if branches is None or len(branches) == 0:
        return np.array([], dtype='float32')

    # remove duplicates and empty branch names
    usebranches = list(set(branches))
    tmpbb = []
    for b in usebranches:
        if len(b):
            tmpbb.append(b)
    usebranches = tmpbb

    if isinstance(filenames, list):
        for f in filenames:
            fileTimeOut(f, 120)
        print('add files')
        print("Branches:\n{}".format(usebranches))
        # substituted for the old root_numpy root2array function
        nparray = uproot_root2array(filenames,
                                    treename="ttree",
                                    stop=limit,
                                    branches=usebranches)
        print('done add files')
        return nparray
    else:
        fileTimeOut(filenames, 120)  # give eos a minute to recover
        import ROOT
        from root_numpy import tree2array
        rfile = ROOT.TFile(filenames)
        tree = rfile.Get(self.treename)
        if not self.nsamples:
            self.nsamples = tree.GetEntries()
        nparray = tree2array(tree, stop=limit, branches=usebranches)
        return nparray
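# `uproot_root2array` is referenced above (and in the weighter functions) as a
# drop-in replacement for root_numpy.root2array, but is not defined in this
# file. A minimal sketch using uproot4/awkward, valid for flat (non-jagged)
# branches such as the weighting and truth variables read here; the name and
# exact return conventions are assumptions:
def uproot_root2array(filenames, treename, branches, stop=None):
    import uproot
    import awkward as ak
    import numpy as np
    if not isinstance(filenames, list):
        filenames = [filenames]
    chunks = []
    for fname in filenames:
        with uproot.open(fname) as f:
            chunks.append(f[treename].arrays(branches, entry_stop=stop))
    merged = ak.concatenate(chunks)
    # pack into a structured array, mirroring what root2array used to return
    return np.rec.fromarrays([ak.to_numpy(merged[b]) for b in branches],
                             names=branches)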
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    # this function defines how to convert the root ntuple to the training format
    # options are not yet described here
    import ROOT

    fileTimeOut(filename, 120)  # give eos a minute to recover
    rfile = ROOT.TFile(filename)
    tree = rfile.Get(self.treename)
    self.nsamples = tree.GetEntries()

    max_rechits = 40000

    # user code, example works with the example 2D images in root format generated by make_example_data
    from DeepJetCore.preprocessing import readListArray

    feature_array = readListArray(filename,
                                  self.treename,
                                  "rechit_features",
                                  self.nsamples,
                                  list_size=max_rechits,
                                  n_feat_per_element=7,
                                  zeropad=True)

    energy_only = feature_array[:, :, 0:1]  # keep dimension

    fraction_array = readListArray(
        filename,
        self.treename,
        "simcluster_fractions",
        self.nsamples,
        list_size=max_rechits,
        # nsimcluster, right now just one, but zero-padded here
        n_feat_per_element=7,
        zeropad=True)

    # needs the energy, too, to determine weights
    fraction_array = numpy.concatenate([fraction_array, energy_only], axis=-1)

    # in case something was removed here
    self.nsamples = len(feature_array)
    self.x = [feature_array]
    self.y = [fraction_array]  # we need the features also in the truth part for weighting
    self.w = []  # no event weights
def convertFromSourceFile(self, filename, weighterobjects, istraining):
    from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles
    import numpy
    from DeepJetCore.stopwatch import stopwatch
    sw = stopwatch()
    swall = stopwatch()

    import ROOT
    fileTimeOut(filename, 120)  # give eos a minute to recover
    rfile = ROOT.TFile(filename)
    tree = rfile.Get("deepntuplizer/tree")
    self.nsamples = tree.GetEntries()

    print('took ', sw.getAndReset(), ' seconds for getting tree entries')

    # split for convolutional network
    x_global = MeanNormZeroPad(filename, None, ['x'], [1], self.nsamples)

    print('took ', sw.getAndReset(),
          ' seconds for mean norm and zero padding (C module)')

    Tuple = self.readTreeFromRootToTuple(filename,
                                         branches=['class1', 'class2', 'x'])
    truthtuple = Tuple[self.truthclasses]
    alltruth = self.reduceTruth(truthtuple)

    #print(x_global.shape, x_global[0:10])
    #print(alltruth.shape, alltruth[0:10])
    #print(alltruth.flags)

    newnsamp = x_global.shape[0]
    self.nsamples = newnsamp

    print(x_global.shape, alltruth.shape, self.nsamples)

    truth = SimpleArray(alltruth, name="truth")
    feat = SimpleArray(x_global, name="features0")

    return [feat], [truth], []
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    # this function defines how to convert the root ntuple to the training format
    # options are not yet described here
    import numpy as np
    import ROOT

    fileTimeOut(filename, 120)  # give eos a minute to recover
    rfile = ROOT.TFile(filename)
    tree = rfile.Get("tree")
    self.nsamples = tree.GetEntries()

    # user code, example works with the example 2D images in root format generated by make_example_data
    from DeepJetCore.preprocessing import read2DArray, readListArray

    print(filename)
    feature_image = read2DArray(filename, "tree", "image2d",
                                self.nsamples, 24, 24)

    npy_array = self.readTreeFromRootToTuple(filename)
    scale = np.expand_dims(npy_array['scale'], axis=1)
    xcenter = np.expand_dims(npy_array['xcenter'], axis=1)
    ycenter = np.expand_dims(npy_array['ycenter'], axis=1)
    ptype = np.expand_dims(npy_array['type'], axis=1)
    print('ycenter', ycenter.shape)

    add_features = np.concatenate([scale, xcenter, ycenter, ptype], axis=1)

    xcoords = numpy.expand_dims(
        numpy.array(list(npy_array['xcoords']), dtype='float32'), axis=2)
    ycoords = numpy.expand_dims(
        numpy.array(list(npy_array['ycoords']), dtype='float32'), axis=2)

    xcoords = numpy.reshape(xcoords, newshape=[xcoords.shape[0], 24, 24, 1])
    ycoords = numpy.reshape(ycoords, newshape=[xcoords.shape[0], 24, 24, 1])
    print('xcoords', xcoords.shape)

    all_coords = numpy.concatenate([xcoords, ycoords], axis=-1)

    #readListArray(filename,"tree","frac_at_idxs",self.nsamples,4,1)

    alltruth = numpy.zeros(self.nsamples) + 1.  # this is real data

    self.x = [feature_image, all_coords, add_features]
    self.y = [alltruth]
    self.w = []
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    # this function defines how to convert the root ntuple to the training format
    # options are not yet described here
    fileTimeOut(filename, 120)
    uproot_tree = uproot.open(filename)['clusters']

    def to_ndarray(*args):
        return numpy.squeeze(numpy.dstack(args))

    branches_template = [
        'bin_eta', 'bin_theta', 'bin_phi', 'bin_x', 'bin_y',
        'bin_eta_global', 'bin_theta_global', 'bin_phi_global',
        'bin_dist_global', 'bin_x_global', 'bin_y_global', 'bin_z_global',
        'bin_energy', 'bin_layer'
    ]
    branches = []
    for icell in range(2):
        branches.extend([b + ('_%d' % icell) for b in branches_template])

    feature_array = uproot_tree.arrays(branches, outputtype=to_ndarray)
    feature_array = numpy.reshape(feature_array, (-1, 5, 5, 38, 28))

    print("reading truth")
    truth = uproot_tree.arrays(self.truthclasses, outputtype=to_ndarray)

    Tuple = self.readTreeFromRootToTuple(filename)
    print("creating remove indices")
    notremoves = weighter.createNotRemoveIndices(Tuple)

    # this removes parts of the dataset for weighting the events
    if self.remove:
        feature_array = feature_array[notremoves > 0]
        truth = truth[notremoves > 0]

    # call this in the end
    self.nsamples = len(feature_array)
    self.x = [feature_array]  # list of feature numpy arrays
    self.y = [truth]          # list of target numpy arrays (truth)
    self.w = []               # list of weight arrays. One for each truth target
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    # this function defines how to convert the root ntuple to the training format
    # options are not yet described here
    from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles
    import ROOT

    fileTimeOut(filename, 120)  # give eos a minute to recover
    rfile = ROOT.TFile(filename)
    tree = rfile.Get("tree")
    self.nsamples = tree.GetEntries()

    npy_array = self.readTreeFromRootToTuple(filename)

    truthtuple = npy_array[self.truthclasses]
    alltruth = self.reduceTruth(truthtuple)
    alltruept = npy_array[self.regtruth]

    # user code
    x_global = MeanNormZeroPad(filename, None,
                               [self.branches[0]],
                               [self.branchcutoffs[0]], self.nsamples)
    x_cpf = MeanNormZeroPadParticles(filename, None,
                                     self.branches[1],
                                     self.branchcutoffs[1], self.nsamples)
    x_npf = MeanNormZeroPadParticles(filename, None,
                                     self.branches[2],
                                     self.branchcutoffs[2], self.nsamples)
    x_recopts = MeanNormZeroPad(filename, None,
                                [self.branches[3]],
                                [self.branchcutoffs[3]], self.nsamples)

    nold = self.nsamples

    self.x = [x_global, x_cpf, x_npf, x_recopts]  # list of feature numpy arrays
    self.y = [alltruth, alltruept]                # list of target numpy arrays (truth)
    self.w = []  # list of weight arrays. One for each truth target

    self._normalize_input_(weighter, npy_array)

    print('reduced to ', self.nsamples, 'of', nold)
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles
    import numpy
    from DeepJetCore.stopwatch import stopwatch
    sw = stopwatch()
    swall = stopwatch()

    import ROOT
    fileTimeOut(filename, 120)  # give eos a minute to recover
    rfile = ROOT.TFile(filename)
    tree = rfile.Get("deepntuplizer/tree")
    self.nsamples = tree.GetEntries()

    print('took ', sw.getAndReset(), ' seconds for getting tree entries')

    # split for convolutional network
    x_global = MeanNormZeroPad(filename, None,
                               [self.branches[0]],
                               [self.branchcutoffs[0]], self.nsamples)

    print('took ', sw.getAndReset(),
          ' seconds for mean norm and zero padding (C module)')

    Tuple = self.readTreeFromRootToTuple(filename)
    truthtuple = Tuple[self.truthclasses]
    alltruth = self.reduceTruth(truthtuple)

    newnsamp = x_global.shape[0]
    print('reduced content to ',
          int(float(newnsamp) / float(self.nsamples) * 100), '%')
    self.nsamples = newnsamp

    print(x_global.shape, self.nsamples)

    self.w = []
    self.x = [x_global]
    self.y = [alltruth]
def createWeighterObjects(self, allsourcefiles):
    # Calculates the weights needed for flattening the pt/eta spectrum
    from DeepJetCore.Weighter import Weighter
    weighter = Weighter()
    weighter.undefTruth = self.undefTruth
    weighter.class_weights = self.class_weights
    branches = [self.weightbranchX, self.weightbranchY]
    branches.extend(self.truth_branches)

    if self.remove:
        weighter.setBinningAndClasses(
            [self.weight_binX, self.weight_binY],
            self.weightbranchX, self.weightbranchY,
            self.truth_branches, self.red_classes,
            self.truth_red_fusion, method=self.referenceclass)

    counter = 0
    import ROOT
    from root_numpy import tree2array, root2array
    if self.remove:
        for fname in allsourcefiles:
            fileTimeOut(fname, 120)
            nparray = root2array(fname,
                                 treename="deepntuplizer/tree",
                                 stop=None,
                                 branches=branches)
            norm_hist = True
            if self.referenceclass == 'flatten':
                norm_hist = False
            weighter.addDistributions(nparray, norm_h=norm_hist)
            #del nparray
            counter = counter + 1
        weighter.createRemoveProbabilitiesAndWeights(self.referenceclass)

    #weighter.printHistos('/afs/cern.ch/user/a/ademoor/Flatten/')
    # If you need to print the 2D histo, choose your output dir
    return {'weigther': weighter}
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    # the first part is standard, no changes needed
    from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles, ZeroPadParticles
    import numpy
    import ROOT

    fileTimeOut(filename, 120)  # give eos 2 minutes to recover
    rfile = ROOT.TFile(filename)
    tree = rfile.Get("deepntuplizer/tree")
    self.nsamples = tree.GetEntries()

    # the definition of what to do with the branches

    # those are the global branches (jet pt etc)
    # they should be just glued to each other in one vector
    # and zero padded (and mean subtracted and normalised)
    #x_global = MeanNormZeroPad(filename, TupleMeanStd,
    #                           [self.branches[0]],
    #                           [self.branchcutoffs[0]], self.nsamples)

    # the second part (the pf candidates) should be treated particle wise
    # an array with (njets, nparticles, nproperties) is created
    x_glb = ZeroPadParticles(filename, TupleMeanStd,
                             self.branches[0],
                             self.branchcutoffs[0], self.nsamples)
    x_db = MeanNormZeroPadParticles(filename, TupleMeanStd,
                                    self.branches[1],
                                    self.branchcutoffs[1], self.nsamples)
    x_db_raw = ZeroPadParticles(filename, TupleMeanStd,
                                self.branches[1],
                                self.branchcutoffs[1], self.nsamples)
    x_cpf = MeanNormZeroPadParticles(filename, TupleMeanStd,
                                     self.branches[2],
                                     self.branchcutoffs[2], self.nsamples)
    x_sv = MeanNormZeroPadParticles(filename, TupleMeanStd,
                                    self.branches[3],
                                    self.branchcutoffs[3], self.nsamples)

    # now, some jets are removed to avoid pt and eta biases
    Tuple = self.readTreeFromRootToTuple(filename)
    if self.remove:
        # jets are removed until the shapes in eta and pt are the same as
        # the truth class 'fj_isNonBB'
        notremoves = weighter.createNotRemoveIndices(Tuple)
        #undef = Tuple[self.undefTruth]
        #notremoves -= undef

    if self.weight:
        weights = weighter.getJetWeights(Tuple)
    elif self.remove:
        weights = notremoves
    else:
        print('neither remove nor weight')
        weights = numpy.empty(self.nsamples)
        weights.fill(1.)

    truthtuple = Tuple[self.truthclasses]
    alltruth = self.reduceTruth(Tuple)
    undef = numpy.sum(alltruth, axis=1)

    if self.weight or self.remove:
        print('Training samples, remove undefined')
        weights = weights[undef > 0]
        x_glb = x_glb[undef > 0]
        x_db = x_db[undef > 0]
        x_db_raw = x_db_raw[undef > 0]
        x_sv = x_sv[undef > 0]
        x_cpf = x_cpf[undef > 0]
        alltruth = alltruth[undef > 0]

    if self.remove:
        print('Removing to match weighting')
        notremoves = notremoves[undef > 0]
        weights = weights[notremoves > 0]
        x_glb = x_glb[notremoves > 0]
        x_db = x_db[notremoves > 0]
        x_db_raw = x_db_raw[notremoves > 0]
        x_sv = x_sv[notremoves > 0]
        x_cpf = x_cpf[notremoves > 0]
        alltruth = alltruth[notremoves > 0]

    if self.weight:
        print('Adding weights, removing events with 0 weight')
        x_glb = x_glb[weights > 0]
        x_db = x_db[weights > 0]
        x_db_raw = x_db_raw[weights > 0]
        x_sv = x_sv[weights > 0]
        x_cpf = x_cpf[weights > 0]
        alltruth = alltruth[weights > 0]
        # Weights get adjusted last so they can be used as an index
        weights = weights[weights > 0]

    newnsamp = x_glb.shape[0]
    print('Keeping {}% of input events in the training dataCollection'.format(
        int(float(newnsamp) / float(self.nsamples) * 100)))
    self.nsamples = newnsamp

    #print("Subsample composition:")
    #for lab in ['fJ_isQCD', 'fj_isH', 'fj_isCC', 'fj_isBB']:
    #    print(numpy.sum((Tuple[lab].view(numpy.ndarray))), lab)
    #for lab, stat in zip(self.reducedtruthclasses, stats):
    #    print(lab, ': {}%'.format(stat))

    # fill everything
    self.w = [weights]
    self.x = [x_db, x_cpf, x_sv]
    self.z = [x_glb, x_db_raw]
    self.y = [alltruth]
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    # the first part is standard, no changes needed
    from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles, ZeroPadParticles
    import numpy
    import ROOT

    fileTimeOut(filename, 120)  # give eos 2 minutes to recover
    rfile = ROOT.TFile(filename)
    tree = rfile.Get("deepntuplizer/tree")
    self.nsamples = tree.GetEntries()

    x_glb = ZeroPadParticles(filename, TupleMeanStd,
                             self.branches[0],
                             self.branchcutoffs[0], self.nsamples)
    x_db = MeanNormZeroPadParticles(filename, TupleMeanStd,
                                    self.branches[1],
                                    self.branchcutoffs[1], self.nsamples)

    # now, some jets are removed to avoid pt and eta biases
    Tuple = self.readTreeFromRootToTuple(filename)

    #if self.remove:
    # jets are removed until the shapes in eta and pt are the same as
    # the truth class 'fj_isNonBB'
    notremoves = weighter.createNotRemoveIndices(Tuple)

    if self.weight:
        weights = weighter.getJetWeights(Tuple)
    elif self.remove:
        weights = notremoves
    else:
        print('neither remove nor weight')
        weights = numpy.empty(self.nsamples)
        weights.fill(1.)

    # create all collections:
    #truthtuple = Tuple[self.truthclasses]
    alltruth = self.reduceTruth(Tuple)
    undef = numpy.sum(alltruth, axis=1)

    #weights = weights[undef > 0]
    #x_glb = x_glb[undef > 0]
    #x_db = x_db[undef > 0]
    #alltruth = alltruth[undef > 0]
    notremoves = notremoves[undef > 0]

    undef = Tuple['fj_isNonCC'] * Tuple['sample_isQCD'] * Tuple['fj_isQCD'] \
            + Tuple['fj_isCC'] * Tuple['fj_isH']

    # remove the entries to get same jet shapes
    if self.remove:
        print('remove')
        weights = weights[notremoves > 0]
        x_glb = x_glb[notremoves > 0]
        x_db = x_db[notremoves > 0]
        alltruth = alltruth[notremoves > 0]

        newnsamp = x_glb.shape[0]
        print('reduced content to ',
              int(float(newnsamp) / float(self.nsamples) * 100), '%')
        self.nsamples = newnsamp

    # fill everything
    self.w = [weights]
    self.x = [x_db]
    self.z = [x_glb]
    self.y = [alltruth]
def convertFromSourceFile(self, filename, weighterobjects, istraining):
    # This is the only really mandatory function (unless writeFromSourceFile is defined).
    # It defines the conversion rule from an input source file to the lists of training
    # arrays self.x, self.y, self.w
    #  self.x is a list of input feature arrays
    #  self.y is a list of truth arrays
    #  self.w is optional and can contain a weight array
    #         (needs to have the same number of entries as the truth array)
    #         If no weights are needed, this can be left completely empty
    #
    # The conversion should convert finally to numpy arrays. In the future,
    # also tensorflow tensors will be supported.
    #
    # In this example, different ways of reading files are deliberately mixed

    print('reading ' + filename)

    import ROOT
    fileTimeOut(filename, 120)  # give eos a minute to recover
    rfile = ROOT.TFile(filename)
    tree = rfile.Get("tree")
    self.nsamples = tree.GetEntries()

    # user code, example works with the example 2D images in root format generated by make_example_data
    #from DeepJetCore.preprocessing import read2DArray
    #feature_array = read2DArray(filename,"tree","image2d",self.nsamples,32,32)
    #print('feature_array',feature_array.shape)

    import uproot3 as uproot
    urfile = uproot.open(filename)["tree"]
    truth = np.concatenate([
        np.expand_dims(urfile.array("lep_isPromptId_Training"), axis=1),
        np.expand_dims(urfile.array("lep_isNonPromptId_Training"), axis=1),
        np.expand_dims(urfile.array("lep_isFakeId_Training"), axis=1)
    ], axis=1)
    truth = truth.astype(dtype='float32', order='C')  # important: float32 and C-contiguous!

    # note: 'lep_pfRelIso03_all' appears twice in this list (kept as in the original)
    self.global_branches = [
        'lep_pt', 'lep_eta', 'lep_phi', 'lep_mediumId',
        'lep_miniPFRelIso_all', 'lep_pfRelIso03_all', 'lep_sip3d',
        'lep_dxy', 'lep_dz', 'lep_charge', 'lep_dxyErr', 'lep_dzErr',
        'lep_ip3d', 'lep_jetPtRelv2', 'lep_jetRelIso',
        'lep_miniPFRelIso_chg', 'lep_mvaLowPt', 'lep_nStations',
        'lep_nTrackerLayers', 'lep_pfRelIso03_all', 'lep_pfRelIso03_chg',
        'lep_pfRelIso04_all', 'lep_ptErr', 'lep_segmentComp',
        'lep_tkRelIso', 'lep_tunepRelPt',
    ]

    self.pfCand_neutral_branches = [
        'pfCand_neutral_eta', 'pfCand_neutral_phi', 'pfCand_neutral_pt',
        'pfCand_neutral_puppiWeight', 'pfCand_neutral_puppiWeightNoLep',
        'pfCand_neutral_ptRel', 'pfCand_neutral_deltaR',
    ]
    self.npfCand_neutral = 5

    ## also works:
    #x_global = np.concatenate([np.expand_dims(urfile.array(var), axis=1) for var in self.global_branches], axis=1)
    #x_global = x_global.astype(dtype='float32', order='C')  # important: float32 and C-contiguous!
    #self.nsamples = len(x_global)

    from DeepJetCore.preprocessing import MeanNormZeroPad, MeanNormZeroPadParticles
    x_global = MeanNormZeroPad(filename, None,
                               [self.global_branches],
                               [1], self.nsamples)
    x_pfCand_neutral = MeanNormZeroPadParticles(filename, None,
                                                self.pfCand_neutral_branches,
                                                self.npfCand_neutral,
                                                self.nsamples)

    x_global = x_global.astype(dtype='float32', order='C')
    x_pfCand_neutral = x_pfCand_neutral.astype(dtype='float32', order='C')

    # returns a list of feature arrays, a list of truth arrays and a list of weight arrays
    return [x_global, x_pfCand_neutral], [truth], []
def _convertFromSourceFile(self, filename, weighterobjects, istraining):
    fileTimeOut(filename, 10)  # 10 seconds for eos to recover
    tree = uproot.open(filename)["B4"]
    nevents = tree.numentries

    # truth
    isElectron = self.tonumpy(tree["isElectron"].array())
    isGamma = self.tonumpy(tree["isGamma"].array())
    isPositron = self.tonumpy(tree["isPositron"].array())
    true_energy = self.tonumpy(tree["true_energy"].array())
    true_x = self.tonumpy(tree["true_x"].array())
    true_y = self.tonumpy(tree["true_y"].array())

    rechit_energy = self.tonumpy(tree["rechit_energy"].array())
    rechit_x = self.tonumpy(tree["rechit_x"].array())
    rechit_y = self.tonumpy(tree["rechit_y"].array())
    rechit_z = self.tonumpy(tree["rechit_z"].array())
    rechit_layer = self.tonumpy(tree["rechit_layer"].array())
    rechit_detid = self.tonumpy(tree['rechit_detid'].array())

    feat, truth, layers, npart_arr, truth_en = self.mergeShowers(
        isElectron, isGamma, isPositron,
        true_energy, true_x, true_y,
        rechit_energy, rechit_x, rechit_y, rechit_z,
        rechit_layer, rechit_detid,
        maxpart=self.npart,
        istraining=istraining)

    print('feat', feat.shape)

    calo, track = self.separateLayers(feat, layers)
    calo = np.reshape(calo, [truth.shape[0], 16, 16, -1])
    track = np.reshape(track, [truth.shape[0], 64, 64, -1])

    # this would need to be rebinned in x and y:
    #calosort = np.argsort(calo[:,:,1]*100+calo[:,:,2], axis=-1)
    #calo = calo[calosort]
    #calo = np.reshape(calo, [truth.shape[0],16,16,-1])

    debug = False
    if debug:
        import matplotlib.pyplot as plt
        calotruth = truth[:, 0:16 * 16, :]
        calotruth = np.reshape(calotruth, [truth.shape[0], 16, 16, -1])
        tracktruth = truth[:, 16 * 16:, :]
        tracktruth = np.reshape(tracktruth, [truth.shape[0], 64, 64, -1])
        for event in range(10):
            # plot truth index and rec energy
            fig, ax = plt.subplots(1, 1)
            ax.imshow(calotruth[event, :, :, 6], aspect=1)
            fig.savefig("calo_idx" + str(event) + ".pdf")
            ax.imshow(calo[event, :, :, 0], aspect=1)
            fig.savefig("calo_en" + str(event) + ".pdf")
            ax.imshow(calotruth[event, :, :, 0], aspect=1)
            fig.savefig("calo_tmask" + str(event) + ".pdf")

            ax.imshow(tracktruth[event, :, :, 6], aspect=1)
            fig.savefig("tracktruth_idx" + str(event) + ".pdf")
            ax.imshow(track[event, :, :, 0], aspect=1)
            fig.savefig("track_en" + str(event) + ".pdf")
            ax.imshow(tracktruth[event, :, :, 0], aspect=1)
            fig.savefig("tracktruth_tmask" + str(event) + ".pdf")

    print('calo', calo.shape)
    print('track', track.shape)
    print('truth', truth.shape)

    if hasattr(self, "truth_en"):
        self.truth_en = truth_en

    return [calo, track], [truth], []
def base_convertFromSourceFile(self, filename, weighterobjects, istraining,
                               treename="Events", removeTracks=True):
    fileTimeOut(filename, 10)  # 10 seconds for eos to recover
    tree = uproot.open(filename)[treename]

    hits = "RecHitHGC"
    front_face_z = 323  # this needs to be more precise

    recHitZUnsplit = self.hitObservable(tree, hits, "z",
                                        split=False, flatten=False)
    self.setSplitIdx(recHitZUnsplit < 0)

    recHitZ = self.splitJaggedArray(recHitZUnsplit)
    offsets = recHitZ.offsets

    recHitX = self.hitObservable(tree, hits, "x", split=True, flatten=False)
    recHitY = self.hitObservable(tree, hits, "y", split=True, flatten=False)
    recHitSimClusIdx = self.hitObservable(tree, hits,
                                          "BestMergedSimClusterIdx",
                                          split=True, flatten=False)

    # Define spectators
    recHit_df_events = [
        pd.DataFrame({
            "recHitX": recHitX[i],
            "recHitY": recHitY[i],
            "recHitZ": recHitZ[i],
            "recHitSimClusIdx": recHitSimClusIdx[i]
        }) for i in range(recHitX.shape[0])
    ]
    for ievent in range(len(recHit_df_events)):
        df_event = recHit_df_events[ievent]
        unique_shower_idx = np.unique(df_event['recHitSimClusIdx'])
        df_event['spectator_distance'] = 0
        # number of rec hits that are associated to this truth cluster
        df_event['recHitSimClus_nHits'] = df_event.groupby(
            'recHitSimClusIdx').recHitX.transform(len)
        for idx in unique_shower_idx:
            df_shower = df_event[df_event['recHitSimClusIdx'] == idx]
            x_to_fit = df_shower[['recHitX', 'recHitY', 'recHitZ']].to_numpy()
            spectators_shower_dist = find_pcas(x_to_fit, PCA_n=2, min_hits=10)
            if spectators_shower_dist is not None:
                spectators_idx = df_shower.index.tolist()
                df_event.loc[spectators_idx,
                             'spectator_distance'] = spectators_shower_dist
            del df_shower
        del df_event

    # Expand back
    recHitX = np.expand_dims(recHitX.content, axis=1)
    recHitY = np.expand_dims(recHitY.content, axis=1)
    recHitZ = np.expand_dims(recHitZ.content, axis=1)
    recHitSpectatorFlag = np.concatenate(
        np.array([
            recHit_df_events[i]['spectator_distance'].to_numpy()
            for i in range(len(recHit_df_events))
        ], dtype=object)).reshape(-1, 1)
    recHitSimClusterNumHits = np.concatenate(
        np.array([
            recHit_df_events[i]['recHitSimClus_nHits'].to_numpy()
            for i in range(len(recHit_df_events))
        ], dtype=object)).reshape(-1, 1)  # number of rec hits
    del recHit_df_events

    recHitEnergy = self.hitObservable(tree, hits, "energy")
    recHitDetaId = self.hitObservable(tree, hits, "detId")
    recHitTime = self.hitObservable(tree, hits, "time")
    recHitR = np.sqrt(recHitX * recHitX + recHitY * recHitY +
                      recHitZ * recHitZ)
    recHitTheta = np.arccos(recHitZ / recHitR)
    recHitEta = -np.log(np.tan(recHitTheta / 2))

    # Don't split this until the end, so it can be used to index the truth arrays
    recHitSimClusIdx = self.hitObservable(tree, hits,
                                          "BestMergedSimClusterIdx",
                                          split=False, flatten=False)

    simClusterDepEnergy = tree["MergedSimCluster_recEnergy"].array()
    simClusterEnergy = tree["MergedSimCluster_boundaryEnergy"].array()
    simClusterEnergyNoMu = tree["MergedSimCluster_boundaryEnergyNoMu"].array()
    simClusterNumHits = tree["MergedSimCluster_nHits"].array()  # number of sim hits

    # Remove muon energy, add back muon deposited energy
    unmergedId = tree["SimCluster_pdgId"].array()
    unmergedDepE = tree["SimCluster_recEnergy"].array()
    unmergedMatchIdx = tree["MergedSimCluster_SimCluster_MatchIdx"].array()
    unmergedMatches = tree["MergedSimCluster_SimClusterNumMatch"].array()
    unmergedDepEMuOnly = unmergedDepE
    unmergedDepEMuOnly[np.abs(unmergedId) != 13] = 0
    # Add another layer of nesting, then sum over all unmerged associated to merged
    unmergedDepEMuOnly = ak.JaggedArray.fromcounts(
        unmergedMatches.counts,
        ak.JaggedArray.fromcounts(unmergedMatches.content,
                                  unmergedDepEMuOnly[unmergedMatchIdx].flatten()))
    depEMuOnly = unmergedDepEMuOnly.sum()
    # why wasn't it possible to just do, instead of all of the above:
    # simClusterEnergy[simClusterPdgId == 13] = simClusterDepEnergy ?
    simClusterEnergyMuCorr = simClusterEnergyNoMu + depEMuOnly

    simClusterX = tree["MergedSimCluster_impactPoint_x"].array()
    simClusterY = tree["MergedSimCluster_impactPoint_y"].array()
    simClusterZ = tree["MergedSimCluster_impactPoint_z"].array()
    simClusterTime = tree["MergedSimCluster_impactPoint_t"].array()
    simClusterEta = tree["MergedSimCluster_impactPoint_eta"].array()
    simClusterPhi = tree["MergedSimCluster_impactPoint_phi"].array()
    simClusterPdgId = tree["MergedSimCluster_pdgId"].array()

    # Mark simclusters outside of volume or with very few hits as noise
    # Maybe not a good idea if the merged SC pdgId is screwed up
    # Probably removing neutrons is a good idea though
    #noNeutrons = simClusterPdgId[recHitSimClusIdx] == 2112

    # filter non-boundary positions. Hopefully working?
    goodSimClus = tree["MergedSimCluster_isTrainable"].array()
    # Don't split by index here to keep same dimensions as SimClusIdx
    markNoise = self.truthObjects(~goodSimClus, recHitSimClusIdx, False,
                                  split=False, flatten=False).astype(np.bool_)

    nbefore = (recHitSimClusIdx < 0).sum().sum()
    recHitSimClusIdx[markNoise] = -1
    nafter = (recHitSimClusIdx < 0).sum().sum()
    print("Number of noise hits before", nbefore, "after", nafter)
    print('removed another factor of', nafter / nbefore, ' bad simclusters')

    recHitTruthPID = self.truthObjects(simClusterPdgId, recHitSimClusIdx, 0.)
    recHitTruthDepEnergy = self.truthObjects(simClusterDepEnergy,
                                             recHitSimClusIdx, 0)
    recHitTruthEnergy = self.truthObjects(simClusterEnergy,
                                          recHitSimClusIdx, 0)
    recHitTruthEnergyCorrMu = self.truthObjects(simClusterEnergyMuCorr,
                                                recHitSimClusIdx, 0)

    low_energy_shower_cutoff = 3
    # Uncorrected currently not used
    recHitTruthEnergy = np.where(
        recHitTruthEnergy > low_energy_shower_cutoff,
        recHitTruthEnergy, recHitTruthDepEnergy)
    recHitTruthEnergy = np.where(
        recHitTruthEnergyCorrMu > low_energy_shower_cutoff,
        recHitTruthEnergyCorrMu, recHitTruthDepEnergy)

    # very bad names, because these quantities are associated to merged clusters and not hits
    recHitTruthX = self.truthObjects(simClusterX, recHitSimClusIdx, 0)
    recHitTruthY = self.truthObjects(simClusterY, recHitSimClusIdx, 0)
    recHitTruthZ = self.truthObjects(simClusterZ, recHitSimClusIdx, 0)
    recHitTruthTime = self.truthObjects(simClusterTime, recHitSimClusIdx, 0)
    recHitTruthR = np.sqrt(recHitTruthX * recHitTruthX +
                           recHitTruthY * recHitTruthY +
                           recHitTruthZ * recHitTruthZ)
    recHitTruthTheta = np.arccos(
        np.divide(recHitTruthZ, recHitTruthR,
                  out=np.zeros_like(recHitTruthZ),
                  where=recHitTruthR != 0))
    recHitTruthPhi = self.truthObjects(simClusterPhi, recHitSimClusIdx, 0)
    recHitTruthEta = self.truthObjects(simClusterEta, recHitSimClusIdx, 0)

    # this is not technically very good because simClusterNumHits counts sim hits, not reco:
    #recHitAverageEnergy = self.truthObjects(simClusterDepEnergy / simClusterNumHits, recHitSimClusIdx, 0)
    recHitAverageEnergy = recHitTruthDepEnergy / recHitSimClusterNumHits

    # Placeholder
    zeroFeature = np.zeros(shape=(len(recHitEnergy), 1), dtype='float32')

    features = np.concatenate([
        recHitEnergy,
        recHitEta,
        zeroFeature,  # indicator if it is track or not
        recHitTheta,
        recHitR,
        recHitX,
        recHitY,
        recHitZ,
        recHitTime,
    ], axis=1)

    farr = SimpleArray(name="recHitFeatures")
    farr.createFromNumpy(features, offsets)
    del features

    recHitSimClusIdx = np.expand_dims(
        self.splitJaggedArray(recHitSimClusIdx).content.astype(np.int32),
        axis=1)

    print('noise', (100 * np.count_nonzero(recHitSimClusIdx < 0)) //
          recHitSimClusIdx.shape[0], '% of hits')
    print('truth eta min max',
          np.min(np.abs(recHitTruthEta[recHitSimClusIdx >= 0])),
          np.max(np.abs(recHitTruthEta[recHitSimClusIdx >= 0])))
    print('non-boundary truth positions',
          np.count_nonzero(
              np.abs(np.abs(recHitTruthZ[recHitSimClusIdx >= 0]) - 320) > 5) /
          recHitTruthZ[recHitSimClusIdx >= 0].shape[0])

    # now all numpy
    # Why do we want noise (-1) sim hits to be equal rec?
    recHitTruthX[recHitSimClusIdx < 0] = recHitX[recHitSimClusIdx < 0]
    recHitTruthY[recHitSimClusIdx < 0] = recHitY[recHitSimClusIdx < 0]
    recHitTruthZ[recHitSimClusIdx < 0] = recHitZ[recHitSimClusIdx < 0]
    recHitTruthEnergyCorrMu[recHitSimClusIdx < 0] = recHitEnergy[recHitSimClusIdx < 0]
    recHitTruthTime[recHitSimClusIdx < 0] = recHitTime[recHitSimClusIdx < 0]

    #import matplotlib.pyplot as plt
    #plt.hist(np.abs(recHitTruthEnergyCorrMu[recHitSimClusIdx>=0]/recHitTruthDepEnergy[recHitSimClusIdx>=0]))
    #plt.yscale('log')
    #plt.savefig("scat.pdf")

    truth = np.concatenate([
        np.array(recHitSimClusIdx, dtype='float32'),  # 0
        recHitTruthEnergyCorrMu,
        recHitTruthX,
        recHitTruthY,
        recHitTruthZ,  # 4
        zeroFeature,  # truthHitAssignedDirX
        zeroFeature,  # 6
        zeroFeature,
        recHitTruthEta,
        recHitTruthPhi,
        recHitTruthTime,  # 10
        zeroFeature,
        zeroFeature,
        recHitTruthDepEnergy,  # 13
        zeroFeature,  # 14
        zeroFeature,  # 15
        recHitTruthPID,  # 16 - 16+n_classes; won't be used anymore
        np.array(recHitSpectatorFlag, dtype='float32'),
        np.where(recHitTruthZ < front_face_z, 1., 0.).astype('float32')
    ], axis=1)

    t_idxarr = SimpleArray(recHitSimClusIdx, offsets,
                           name="recHitTruthClusterIdx")

    t_energyarr = SimpleArray(name="recHitTruthEnergy")
    t_energyarr.createFromNumpy(recHitTruthEnergyCorrMu, offsets)

    t_posarr = SimpleArray(name="recHitTruthPosition")
    t_posarr.createFromNumpy(
        np.concatenate([recHitTruthX, recHitTruthY], axis=-1), offsets)

    t_time = SimpleArray(name="recHitTruthTime")
    t_time.createFromNumpy(recHitTruthTime, offsets)

    t_pid = SimpleArray(name="recHitTruthID")
    t_pid.createFromNumpy(recHitTruthPID, offsets)

    # why do we have inconsistent namings (wrt. the truth array), where is it needed?
    t_spectator = SimpleArray(name="recHitSpectatorFlag")
    t_spectator.createFromNumpy(recHitSpectatorFlag.astype('float32'), offsets)

    t_fully_contained = SimpleArray(name="recHitFullyContainedFlag")
    t_fully_contained.createFromNumpy(
        np.where(recHitTruthZ < front_face_z, 1., 0.).astype('float32'),
        offsets)

    # remaining truth is mostly for consistency in the plotting tools
    t_rest = SimpleArray(name="recHitTruth")
    t_rest.createFromNumpy(truth, offsets)

    return [farr, t_idxarr, t_energyarr, t_posarr, t_time, t_pid,
            t_spectator, t_fully_contained], [t_rest], []
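# `truthObjects` (used heavily above) is defined elsewhere in this class. A
# schematic sketch of the assumed behaviour, written in plain numpy for a
# single event: broadcast a per-simcluster quantity onto the hits via the
# hit -> simcluster index, filling unassociated hits (index < 0) with a
# default value. The name with the _sketch suffix marks it as illustrative:
def truthObjects_sketch(per_cluster, hit_cluster_idx, default):
    import numpy as np
    per_cluster = np.asarray(per_cluster, dtype='float32')
    hit_cluster_idx = np.asarray(hit_cluster_idx)
    # clip negative indices to 0 for the lookup, then overwrite with the default
    out = per_cluster[np.clip(hit_cluster_idx, 0, None)]
    out[hit_cluster_idx < 0] = default
    return np.expand_dims(out, axis=1)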
def convertFromSourceFile(self, filename, weighterobjects, istraining):
    # Function to produce the numpy training arrays from root files
    from DeepJetCore.Weighter import Weighter
    from DeepJetCore.stopwatch import stopwatch
    sw = stopwatch()
    swall = stopwatch()

    if not istraining:
        self.remove = False

    def reduceTruth(uproot_arrays):
        b = uproot_arrays[b'isB']
        bb = uproot_arrays[b'isBB']
        gbb = uproot_arrays[b'isGBB']
        bl = uproot_arrays[b'isLeptonicB']
        blc = uproot_arrays[b'isLeptonicB_C']
        lepb = bl + blc
        c = uproot_arrays[b'isC']
        cc = uproot_arrays[b'isCC']
        gcc = uproot_arrays[b'isGCC']
        ud = uproot_arrays[b'isUD']
        s = uproot_arrays[b'isS']
        uds = ud + s
        g = uproot_arrays[b'isG']
        return np.vstack(
            (b + lepb, bb + gbb, c + cc + gcc, uds + g)).transpose()

    print('reading ' + filename)

    import ROOT
    from root_numpy import tree2array, root2array
    fileTimeOut(filename, 120)  # give eos a minute to recover
    rfile = ROOT.TFile(filename)
    tree = rfile.Get("deepntuplizer/tree")
    self.nsamples = tree.GetEntries()

    from DeepJetCore.preprocessing import MeanNormZeroPad, MeanNormZeroPadParticles
    x_global = MeanNormZeroPad(
        filename, weighterobjects['means'],
        [self.global_branches, self.track_branches,
         self.eta_rel_branches, self.vtx_branches],
        [1, self.n_track, self.n_eta_rel, self.n_vtx],
        self.nsamples)

    import uproot3 as uproot
    urfile = uproot.open(filename)["deepntuplizer/tree"]
    truth_arrays = urfile.arrays(self.truth_branches)
    truth = reduceTruth(truth_arrays)
    truth = truth.astype(dtype='float32', order='C')  # important: float32 and C-contiguous!

    x_global = x_global.astype(dtype='float32', order='C')

    if self.remove:
        b = [self.weightbranchX, self.weightbranchY]
        b.extend(self.truth_branches)
        b.extend(self.undefTruth)
        fileTimeOut(filename, 120)
        for_remove = root2array(filename,
                                treename="deepntuplizer/tree",
                                stop=None,
                                branches=b)
        notremoves = weighterobjects['weigther'].createNotRemoveIndices(for_remove)
        undef = for_remove['isUndefined']
        notremoves -= undef
        print('took ', sw.getAndReset(), ' to create remove indices')

    if self.remove:
        print('remove')
        x_global = x_global[notremoves > 0]
        truth = truth[notremoves > 0]
        newnsamp = x_global.shape[0]
        print('reduced content to ',
              int(float(newnsamp) / float(self.nsamples) * 100), '%')

    print('remove nans')
    x_global = np.where(
        np.logical_and(np.isfinite(x_global), (np.abs(x_global) < 100000.0)),
        x_global, 0)
    return [x_global], [truth], []
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles
    import numpy
    from DeepJetCore.stopwatch import stopwatch
    sw = stopwatch()
    swall = stopwatch()

    import ROOT
    fileTimeOut(filename, 120)  # give eos a minute to recover
    rfile = ROOT.TFile(filename)
    tree = rfile.Get("deepntuplizer/tree")
    self.nsamples = tree.GetEntries()

    print('took ', sw.getAndReset(), ' seconds for getting tree entries')

    # split for convolutional network
    x_global = MeanNormZeroPad(filename, TupleMeanStd,
                               [self.branches[0]],
                               [self.branchcutoffs[0]], self.nsamples)
    x_cpf = MeanNormZeroPadParticles(filename, TupleMeanStd,
                                     self.branches[1],
                                     self.branchcutoffs[1], self.nsamples)
    x_npf = MeanNormZeroPadParticles(filename, TupleMeanStd,
                                     self.branches[2],
                                     self.branchcutoffs[2], self.nsamples)
    x_sv = MeanNormZeroPadParticles(filename, TupleMeanStd,
                                    self.branches[3],
                                    self.branchcutoffs[3], self.nsamples)

    print('took ', sw.getAndReset(),
          ' seconds for mean norm and zero padding (C module)')

    Tuple = self.readTreeFromRootToTuple(filename)

    if self.remove:
        notremoves = weighter.createNotRemoveIndices(Tuple)
        undef = Tuple['isUndefined']
        notremoves -= undef
        print('took ', sw.getAndReset(), ' to create remove indices')

    if self.weight:
        weights = weighter.getJetWeights(Tuple)
    elif self.remove:
        weights = notremoves
    else:
        print('neither remove nor weight')
        weights = numpy.empty(self.nsamples)
        weights.fill(1.)

    truthtuple = Tuple[self.truthclasses]
    #print(self.truthclasses)
    alltruth = self.reduceTruth(truthtuple)
    #print(alltruth.shape)

    if self.remove:
        print('remove')
        weights = weights[notremoves > 0]
        x_global = x_global[notremoves > 0]
        x_cpf = x_cpf[notremoves > 0]
        x_npf = x_npf[notremoves > 0]
        x_sv = x_sv[notremoves > 0]
        alltruth = alltruth[notremoves > 0]

        newnsamp = x_global.shape[0]
        print('reduced content to ',
              int(float(newnsamp) / float(self.nsamples) * 100), '%')
        self.nsamples = newnsamp

    print(x_global.shape, self.nsamples)

    self.w = [weights]
    self.x = [x_global, x_cpf, x_npf, x_sv]
    self.y = [alltruth]
def convertFromSourceFile(self, filename, weighterobjects, istraining):
    # Function to produce the numpy training arrays from root files
    from DeepJetCore.Weighter import Weighter
    from DeepJetCore.stopwatch import stopwatch
    sw = stopwatch()
    swall = stopwatch()

    if not istraining:
        self.remove = False

    def reduceTruth(uproot_arrays):
        b = uproot_arrays[str.encode(map_prefix(b'Jet_isB'))]
        bb = uproot_arrays[str.encode(map_prefix(b'Jet_isBB'))]
        gbb = uproot_arrays[str.encode(map_prefix(b'Jet_isGBB'))]
        bl = uproot_arrays[str.encode(map_prefix(b'Jet_isLeptonicB'))]
        blc = uproot_arrays[str.encode(map_prefix(b'Jet_isLeptonicB_C'))]
        lepb = bl + blc
        c = uproot_arrays[str.encode(map_prefix(b'Jet_isC'))]
        cc = uproot_arrays[str.encode(map_prefix(b'Jet_isCC'))]
        gcc = uproot_arrays[str.encode(map_prefix(b'Jet_isGCC'))]
        ud = uproot_arrays[str.encode(map_prefix(b'Jet_isUD'))]
        s = uproot_arrays[str.encode(map_prefix(b'Jet_isS'))]
        uds = ud + s
        g = uproot_arrays[str.encode(map_prefix(b'Jet_isG'))]
        return np.vstack(
            (b + lepb, bb + gbb, c + cc + gcc, uds + g)).transpose()

    print('reading ' + filename)

    fileTimeOut(filename, 600)  # give eos ten minutes to recover
    tree = u3.open(filename)["ttree"]
    self.nsamples = tree.numentries
    print("Nsamples: {}".format(self.nsamples))

    print("reading in with the new uproot+awkward function")
    nparr = uproot_tree_to_numpy(
        filename, weighterobjects['means'],
        [self.global_branches, self.track_branches,
         self.eta_rel_branches, self.vtx_branches],
        [1, self.n_track, self.n_eta_rel, self.n_vtx],
        self.nsamples,
        treename="ttree")
    print("successfully created numpy array")
    x_global = nparr
    # the old root_numpy-based equivalent:
    #x_global = MeanNormZeroPad(filename, weighterobjects['means'],
    #                           [self.global_branches, self.track_branches,
    #                            self.eta_rel_branches, self.vtx_branches],
    #                           [1, self.n_track, self.n_eta_rel, self.n_vtx],
    #                           self.nsamples)

    print("opening file with uproot")
    import uproot3 as uproot
    urfile = uproot.open(filename)["ttree"]
    truth_arrays = urfile.arrays(self.truth_branches)
    truth = reduceTruth(truth_arrays)
    truth = truth.astype(dtype='float32', order='C')  # important: float32 and C-contiguous!

    x_global = x_global.astype(dtype='float32', order='C')

    if self.remove:
        b = [self.weightbranchX, self.weightbranchY]
        b.extend(self.truth_branches)
        b.extend(self.undefTruth)
        fileTimeOut(filename, 120)
        for_remove = uproot_root2array(filename,
                                       treename="ttree",
                                       stop=None,
                                       branches=b)
        notremoves = weighterobjects['weigther'].createNotRemoveIndices(for_remove)
        undef = for_remove['Jet_isUndefined']
        notremoves -= np.array(undef, dtype=np.float32)
        print('took ', sw.getAndReset(), ' to create remove indices')

    if self.remove:
        print('remove')
        x_global = x_global[notremoves > 0]
        truth = truth[notremoves > 0]
        newnsamp = x_global.shape[0]
        print('reduced content to ',
              int(float(newnsamp) / float(self.nsamples) * 100), '%')

    print('remove nans')
    x_global = np.where(
        np.logical_and(np.isfinite(x_global), (np.abs(x_global) < 100000.0)),
        x_global, 0)
    return [x_global], [truth], []
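# `map_prefix` (used in reduceTruth above) is not defined in this file. From
# the call pattern - it receives a bytes branch name and its result is passed
# through str.encode() - it presumably translates the 'Jet_'-prefixed branch
# names into whatever naming scheme the ntuple actually uses. A hypothetical
# sketch; the identity mapping is an assumption:
def map_prefix(name):
    if isinstance(name, bytes):
        name = name.decode()
    # identity mapping by default; adjust if the tree uses a different prefix
    return name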