def convertFromSourceFile(self, filename, weighterobjects, istraining):
    """Convert one ROOT file into feature and truth arrays.

    Reads the entry count of ``deepntuplizer/tree``, builds four zero-padded
    input arrays from ``self.branches`` / ``self.branchcutoffs`` and the
    reduced truth from the ``self.truthclasses`` columns.

    Args:
        filename: Path of the ROOT file to convert.
        weighterobjects: Not used by this implementation.
        istraining: Not used by this implementation.

    Returns:
        ``(features, truth, weights)``: a list with the four feature arrays,
        a one-element list with the truth array, and an empty weight list.
    """
    from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles
    import numpy
    from DeepJetCore.stopwatch import stopwatch
    step_timer = stopwatch()
    total_timer = stopwatch()
    import ROOT

    # wait for eos to recover (timeout argument: 120)
    fileTimeOut(filename, 120)
    root_file = ROOT.TFile(filename)
    root_tree = root_file.Get("deepntuplizer/tree")
    self.nsamples = root_tree.GetEntries()
    print('took ', step_timer.getAndReset(), ' seconds for getting tree entries')

    # Split for the convolutional network: the first branch group is passed
    # wrapped in a list, groups 1..3 are passed as they are.
    x_global = MeanNormZeroPad(filename, None,
                               [self.branches[0]],
                               [self.branchcutoffs[0]], self.nsamples)
    x_cpf, x_npf, x_sv = [
        MeanNormZeroPadParticles(filename, None,
                                 self.branches[group],
                                 self.branchcutoffs[group], self.nsamples)
        for group in (1, 2, 3)
    ]
    print('took ', step_timer.getAndReset(), ' seconds for mean norm and zero padding (C module)')

    record = self.readTreeFromRootToTuple(filename)
    alltruth = self.reduceTruth(record[self.truthclasses])

    print(x_global.shape, self.nsamples)
    return [x_global, x_cpf, x_npf, x_sv], [alltruth], []
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    """Read one ROOT file and fill ``self.x`` / ``self.y`` for pt regression.

    Builds four zero-padded input arrays from ``self.branches`` /
    ``self.branchcutoffs``, the reduced truth from ``self.truthclasses`` and a
    per-jet correction factor ``gen_pt_WithNu / jet_corr_pt``. The result is
    stored on the instance and then handed to ``self._normalize_input_``.

    Args:
        filename: Path of the ROOT file to read.
        TupleMeanStd: Not used by this implementation (``None`` is passed to
            the padding helpers instead).
        weighter: Forwarded unchanged to ``self._normalize_input_``.
    """
    from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles
    import numpy
    from DeepJetCore.stopwatch import stopwatch
    sw = stopwatch()
    swall = stopwatch()
    import ROOT

    # wait for eos to recover (timeout argument: 120)
    fileTimeOut(filename, 120)
    rfile = ROOT.TFile(filename)
    tree = rfile.Get("deepntuplizer/tree")
    self.nsamples = tree.GetEntries()
    print('took ', sw.getAndReset(), ' seconds for getting tree entries')

    # split for convolutional network
    x_global = MeanNormZeroPad(filename, None,
                               [self.branches[0]],
                               [self.branchcutoffs[0]], self.nsamples)
    x_cpf = MeanNormZeroPadParticles(filename, None,
                                     self.branches[1],
                                     self.branchcutoffs[1], self.nsamples)
    x_etarel = MeanNormZeroPadParticles(filename, None,
                                        self.branches[2],
                                        self.branchcutoffs[2], self.nsamples)
    x_sv = MeanNormZeroPadParticles(filename, None,
                                    self.branches[3],
                                    self.branchcutoffs[3], self.nsamples)
    print('took ', sw.getAndReset(), ' seconds for mean norm and zero padding (C module)')

    npy_array = self.readTreeFromRootToTuple(filename)
    reg_truth = npy_array['gen_pt_WithNu'].view(numpy.ndarray)
    reco_pt = npy_array['jet_corr_pt'].view(numpy.ndarray)

    # One vectorised division instead of a Python loop over every jet. The
    # result is written into a float64 buffer of length nsamples, exactly as
    # the element-wise loop did.
    correctionfactor = numpy.zeros(self.nsamples)
    numpy.divide(reg_truth[:self.nsamples], reco_pt[:self.nsamples],
                 out=correctionfactor)

    truthtuple = npy_array[self.truthclasses]
    alltruth = self.reduceTruth(truthtuple)

    self.x = [x_global, x_cpf, x_etarel, x_sv, reco_pt]
    self.y = [alltruth, correctionfactor]
    self._normalize_input_(weighter, npy_array)
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    """Define how a ROOT ntuple is converted to the training format.

    Fills ``self.x`` with four feature arrays, ``self.y`` with the reduced
    truth and the ``self.regtruth`` columns, and ``self.w`` with an empty
    list, then calls ``self._normalize_input_``.

    Args:
        filename: Path of the ROOT file to read (tree name ``"tree"``).
        TupleMeanStd: Not used by this implementation (``None`` is passed to
            the padding helpers instead).
        weighter: Forwarded unchanged to ``self._normalize_input_``.
    """
    from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles
    import ROOT

    # wait for eos to recover (timeout argument: 120)
    fileTimeOut(filename, 120)
    root_file = ROOT.TFile(filename)
    root_tree = root_file.Get("tree")
    self.nsamples = root_tree.GetEntries()

    record = self.readTreeFromRootToTuple(filename)
    class_truth = self.reduceTruth(record[self.truthclasses])
    pt_truth = record[self.regtruth]

    # user code
    def flat(group):
        # Branch list and cutoff are wrapped in a list for MeanNormZeroPad.
        return MeanNormZeroPad(filename, None,
                               [self.branches[group]],
                               [self.branchcutoffs[group]], self.nsamples)

    def per_particle(group):
        return MeanNormZeroPadParticles(filename, None,
                                        self.branches[group],
                                        self.branchcutoffs[group], self.nsamples)

    x_global = flat(0)
    x_cpf = per_particle(1)
    x_npf = per_particle(2)
    x_recopts = flat(3)

    entries_before = self.nsamples

    self.x = [x_global, x_cpf, x_npf, x_recopts]  # list of feature numpy arrays
    self.y = [class_truth, pt_truth]  # list of target numpy arrays (truth)
    self.w = []  # list of weight arrays. One for each truth target

    self._normalize_input_(weighter, record)

    print('reduced to ', self.nsamples, 'of', entries_before)
def convertFromSourceFile(self, filename, weighterobjects, istraining):
    """Produce the numpy training arrays from one ROOT file.

    Builds one global array and six per-candidate arrays, reads the three
    ``lep_is*Id_Training`` truth columns, optionally drops entries selected
    by the weighter and finally replaces non-finite feature values by 0.

    Args:
        filename: Path of the ROOT file to convert (tree name ``"tree"``).
        weighterobjects: Mapping whose ``'weigther'`` entry provides
            ``createNotRemoveIndices``; only read when ``self.remove`` is set.
        istraining: When false, ``self.remove`` is switched off first.

    Returns:
        ``(features, truth, weights)``: a list with the seven feature arrays,
        a one-element list with the truth array, and an empty weight list.
    """
    from DeepJetCore.Weighter import Weighter
    from DeepJetCore.stopwatch import stopwatch
    timer = stopwatch()
    overall_timer = stopwatch()

    if not istraining:
        self.remove = False

    print('reading '+filename)

    import ROOT
    from root_numpy import tree2array, root2array
    # wait for eos to recover (timeout argument: 120)
    fileTimeOut(filename, 120)
    root_file = ROOT.TFile(filename)
    root_tree = root_file.Get("tree")
    self.nsamples = root_tree.GetEntries()

    from DeepJetCore.preprocessing import MeanNormZeroPad, MeanNormZeroPadParticles

    # features[0] is the global array; the rest follow the order
    # neutral, charged, photon, electron, muon, SV.
    features = [MeanNormZeroPad(filename, None,
                                [self.global_branches], [1], self.nsamples)]
    candidate_groups = (
        (self.pfCand_neutral_branches, self.npfCand_neutral),
        (self.pfCand_charged_branches, self.npfCand_charged),
        (self.pfCand_photon_branches, self.npfCand_photon),
        (self.pfCand_electron_branches, self.npfCand_electron),
        (self.pfCand_muon_branches, self.npfCand_muon),
        (self.SV_branches, self.nSV),
    )
    for branch_names, max_entries in candidate_groups:
        features.append(MeanNormZeroPadParticles(
            filename, None, branch_names, max_entries, self.nsamples))

    import uproot3 as uproot
    events = uproot.open(filename)["tree"]
    truth_columns = ("lep_isPromptId_Training",
                     "lep_isNonPromptId_Training",
                     "lep_isFakeId_Training")
    truth = np.concatenate(
        [np.expand_dims(events.array(column), axis=1) for column in truth_columns],
        axis=1)

    # important, float32 and C-type!
    truth = truth.astype(dtype='float32', order='C')
    features = [array.astype(dtype='float32', order='C') for array in features]

    if self.remove:
        wanted = [self.weightbranchX, self.weightbranchY]
        wanted.extend(self.truth_branches)
        wanted.extend(self.undefTruth)

        fileTimeOut(filename, 120)
        for_remove = root2array(
            filename,
            treename="tree",
            stop=None,
            branches=wanted
        )
        notremoves = weighterobjects['weigther'].createNotRemoveIndices(for_remove)
        print('took ', timer.getAndReset(), ' to create remove indices')

        print("notremoves", notremoves, "<- notremoves")
        keep = notremoves > 0
        features = [array[keep] for array in features]
        truth = truth[keep]

    newnsamp = features[0].shape[0]
    print('reduced content to ', int(float(newnsamp)/float(self.nsamples)*100), '%')

    print('remove nans')
    features = [np.where(np.isfinite(array), array, 0) for array in features]

    return features, [truth], []
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    """Read one ROOT file and fill ``self.w``, ``self.x``, ``self.z``, ``self.y``.

    Five padded arrays are built from ``self.branches`` /
    ``self.branchcutoffs``. Depending on ``self.weight`` and ``self.remove``
    the entries are then filtered in up to three passes: entries whose
    reduced truth sums to zero, entries rejected by the weighter, and entries
    with non-positive weight.

    Args:
        filename: Path of the ROOT file to read.
        TupleMeanStd: Passed through to the padding helpers.
        weighter: Provides ``createNotRemoveIndices`` and ``getJetWeights``.
    """
    # the first part is standard, no changes needed
    from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles, ZeroPadParticles
    import numpy
    import ROOT

    # wait for eos to recover (timeout argument: 120)
    fileTimeOut(filename, 120)
    root_file = ROOT.TFile(filename)
    root_tree = root_file.Get("deepntuplizer/tree")
    self.nsamples = root_tree.GetEntries()

    def padded(reader, group):
        # All five inputs share the same call shape; only the reader and the
        # branch group differ.
        return reader(filename, TupleMeanStd,
                      self.branches[group],
                      self.branchcutoffs[group], self.nsamples)

    x_glb = padded(ZeroPadParticles, 0)
    x_db = padded(MeanNormZeroPadParticles, 1)
    x_db_raw = padded(ZeroPadParticles, 1)
    x_cpf = padded(MeanNormZeroPadParticles, 2)
    x_sv = padded(MeanNormZeroPadParticles, 3)

    # now, some jets are removed to avoid pt and eta biases
    Tuple = self.readTreeFromRootToTuple(filename)
    if self.remove:
        notremoves = weighter.createNotRemoveIndices(Tuple)

    if self.weight:
        weights = weighter.getJetWeights(Tuple)
    elif self.remove:
        weights = notremoves
    else:
        print('neither remove nor weight')
        weights = numpy.ones(self.nsamples)

    truthtuple = Tuple[self.truthclasses]  # looked up but not used below
    alltruth = self.reduceTruth(Tuple)
    undef = numpy.sum(alltruth, axis=1)

    # Every filter below is applied to all of these arrays in lock-step.
    # Slot 0 is the weight array.
    columns = [weights, x_glb, x_db, x_db_raw, x_sv, x_cpf, alltruth]

    if self.weight or self.remove:
        print('Training samples, remove undefined')
        defined = undef > 0
        columns = [column[defined] for column in columns]

    if self.remove:
        print('Removing to match weighting')
        # Bring the weighter mask to the already-filtered length first.
        notremoves = notremoves[undef > 0]
        kept = notremoves > 0
        columns = [column[kept] for column in columns]

    if self.weight:
        print('Adding weights, removing events with 0 weight')
        positive = columns[0] > 0
        columns = [column[positive] for column in columns]

    weights, x_glb, x_db, x_db_raw, x_sv, x_cpf, alltruth = columns

    newnsamp = x_glb.shape[0]
    print('Keeping {}% of input events in the training dataCollection'.
          format(int(float(newnsamp) / float(self.nsamples) * 100)))
    self.nsamples = newnsamp

    # fill everything
    self.w = [weights]
    self.x = [x_db, x_cpf, x_sv]
    self.z = [x_glb, x_db_raw]
    self.y = [alltruth]
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    """Read one ROOT file and fill ``self.w``, ``self.x``, ``self.z``, ``self.y``.

    Two padded arrays are built from the first two branch groups. When
    ``self.remove`` is set, entries rejected by the weighter are dropped from
    the weights, both arrays and the truth.

    Args:
        filename: Path of the ROOT file to read.
        TupleMeanStd: Passed through to the padding helpers.
        weighter: Provides ``createNotRemoveIndices`` and ``getJetWeights``.
    """
    # the first part is standard, no changes needed
    from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles, ZeroPadParticles
    import numpy
    import ROOT

    # wait for eos to recover (timeout argument: 120)
    fileTimeOut(filename, 120)
    rfile = ROOT.TFile(filename)
    tree = rfile.Get("deepntuplizer/tree")
    self.nsamples = tree.GetEntries()

    x_glb = ZeroPadParticles(filename, TupleMeanStd,
                             self.branches[0],
                             self.branchcutoffs[0], self.nsamples)
    x_db = MeanNormZeroPadParticles(filename, TupleMeanStd,
                                    self.branches[1],
                                    self.branchcutoffs[1], self.nsamples)

    # now, some jets are removed to avoid pt and eta biases
    Tuple = self.readTreeFromRootToTuple(filename)

    # The mask is computed unconditionally; it is only applied when
    # self.remove is set.
    notremoves = weighter.createNotRemoveIndices(Tuple)

    if self.weight:
        weights = weighter.getJetWeights(Tuple)
    elif self.remove:
        weights = notremoves
    else:
        print('neither remove nor weight')
        weights = numpy.empty(self.nsamples)
        weights.fill(1.)

    # create all collections:
    alltruth = self.reduceTruth(Tuple)

    # NOTE: earlier revisions shortened `notremoves` with an "undefined truth"
    # mask while leaving weights, x_glb, x_db and alltruth at full length.
    # The boolean index below then failed with a length mismatch as soon as
    # one undefined entry existed. Since the undefined-entry filter is not
    # applied to the data arrays here, it must not be applied to the mask
    # either, so `notremoves` stays aligned with the unfiltered arrays.

    # remove the entries to get same jet shapes
    if self.remove:
        print('remove')
        keep = notremoves > 0
        weights = weights[keep]
        x_glb = x_glb[keep]
        x_db = x_db[keep]
        alltruth = alltruth[keep]

    newnsamp = x_glb.shape[0]
    print('reduced content to ',
          int(float(newnsamp) / float(self.nsamples) * 100), '%')
    self.nsamples = newnsamp

    # fill everything
    self.w = [weights]
    self.x = [x_db]
    self.z = [x_glb]
    self.y = [alltruth]
def convertFromSourceFile(self, filename, weighterobjects, istraining):
    """Convert one input source file into lists of training arrays.

    This is the only really mandatory function (unless writeFromSourceFile
    is defined). It defines the conversion rule from an input source file to
    the lists of feature, truth and weight arrays. The weight list is
    optional; when used it needs the same number of entries as the truth
    array, and when no weights are needed it can be left empty. The
    conversion should finally produce numpy arrays.

    In this example, different ways of reading files are deliberately mixed:
    ROOT for the entry count, uproot3 for the truth, and the DeepJetCore
    padding helpers for the features.

    As a side effect this sets ``self.global_branches``,
    ``self.pfCand_neutral_branches`` and ``self.npfCand_neutral``.

    Args:
        filename: Path of the ROOT file to convert (tree name ``"tree"``).
        weighterobjects: Not used by this implementation.
        istraining: Not used by this implementation.

    Returns:
        ``(features, truth, weights)``: a list with the two feature arrays,
        a one-element list with the truth array, and an empty weight list.
    """
    print('reading ' + filename)

    import ROOT
    # wait for eos to recover (timeout argument: 120)
    fileTimeOut(filename, 120)
    root_file = ROOT.TFile(filename)
    root_tree = root_file.Get("tree")
    self.nsamples = root_tree.GetEntries()

    import uproot3 as uproot
    events = uproot.open(filename)["tree"]
    truth_columns = ("lep_isPromptId_Training",
                     "lep_isNonPromptId_Training",
                     "lep_isFakeId_Training")
    truth = np.concatenate(
        [np.expand_dims(events.array(column), axis=1) for column in truth_columns],
        axis=1)
    # important, float32 and C-type!
    truth = truth.astype(dtype='float32', order='C')

    # NOTE(review): 'lep_pfRelIso03_all' is listed twice. It is kept as is
    # because dropping it would change the width of the global feature array.
    self.global_branches = [
        'lep_pt',
        'lep_eta',
        'lep_phi',
        'lep_mediumId',
        'lep_miniPFRelIso_all',
        'lep_pfRelIso03_all',
        'lep_sip3d',
        'lep_dxy',
        'lep_dz',
        'lep_charge',
        'lep_dxyErr',
        'lep_dzErr',
        'lep_ip3d',
        'lep_jetPtRelv2',
        'lep_jetRelIso',
        'lep_miniPFRelIso_chg',
        'lep_mvaLowPt',
        'lep_nStations',
        'lep_nTrackerLayers',
        'lep_pfRelIso03_all',
        'lep_pfRelIso03_chg',
        'lep_pfRelIso04_all',
        'lep_ptErr',
        'lep_segmentComp',
        'lep_tkRelIso',
        'lep_tunepRelPt',
    ]

    self.pfCand_neutral_branches = [
        'pfCand_neutral_' + suffix
        for suffix in ('eta', 'phi', 'pt', 'puppiWeight',
                       'puppiWeightNoLep', 'ptRel', 'deltaR')
    ]
    self.npfCand_neutral = 5

    from DeepJetCore.preprocessing import MeanNormZeroPad, MeanNormZeroPadParticles
    global_features = MeanNormZeroPad(
        filename, None, [self.global_branches], [1], self.nsamples)
    neutral_features = MeanNormZeroPadParticles(
        filename, None,
        self.pfCand_neutral_branches,
        self.npfCand_neutral, self.nsamples)

    features = [
        array.astype(dtype='float32', order='C')
        for array in (global_features, neutral_features)
    ]

    # returns a list of feature arrays, a list of truth arrays and a list of
    # weight arrays
    return features, [truth], []
def convertFromSourceFile(self, filename, weighterobjects, istraining):
    """Produce the numpy training arrays from one ROOT file.

    Builds the global, cpf, npf and vtx feature arrays, reads
    ``self.truth_branches`` and merges them into six truth columns,
    optionally drops entries selected by the weighter (or flagged
    ``isUndefined``), and replaces non-finite feature values by 0.

    Args:
        filename: Path of the ROOT file (tree ``"deepntuplizer/tree"``).
        weighterobjects: Mapping whose ``'weigther'`` entry provides
            ``createNotRemoveIndices``; only read when ``self.remove`` is set.
        istraining: When false, ``self.remove`` is switched off first.

    Returns:
        ``(features, truth, weights)``: a list with the four feature arrays,
        a one-element list with the truth array, and an empty weight list.
    """
    from DeepJetCore.Weighter import Weighter
    from DeepJetCore.stopwatch import stopwatch
    timer = stopwatch()
    overall_timer = stopwatch()

    if not istraining:
        self.remove = False

    def reduceTruth(uproot_arrays):
        # Merge the individual flavour flags into the six output columns:
        # b, bb+gbb, leptonic b, c+cc+gcc, ud+s, g.
        single_b = uproot_arrays[b'isB']
        double_b = uproot_arrays[b'isBB']
        gluon_bb = uproot_arrays[b'isGBB']
        leptonic_b = uproot_arrays[b'isLeptonicB'] + uproot_arrays[b'isLeptonicB_C']
        single_c = uproot_arrays[b'isC']
        double_c = uproot_arrays[b'isCC']
        gluon_cc = uproot_arrays[b'isGCC']
        light = uproot_arrays[b'isUD'] + uproot_arrays[b'isS']
        gluon = uproot_arrays[b'isG']
        stacked = np.vstack((
            single_b,
            double_b + gluon_bb,
            leptonic_b,
            single_c + double_c + gluon_cc,
            light,
            gluon,
        ))
        return stacked.T

    print('reading ' + filename)

    import ROOT
    from root_numpy import tree2array, root2array
    # wait for eos to recover (timeout argument: 120)
    fileTimeOut(filename, 120)
    root_file = ROOT.TFile(filename)
    root_tree = root_file.Get("deepntuplizer/tree")
    self.nsamples = root_tree.GetEntries()

    from DeepJetCore.preprocessing import MeanNormZeroPad, MeanNormZeroPadParticles

    # features: global, cpf, npf, vtx (in that order)
    features = [MeanNormZeroPad(filename, None,
                                [self.global_branches], [1], self.nsamples)]
    for branch_names, max_entries in ((self.cpf_branches, self.n_cpf),
                                      (self.npf_branches, self.n_npf),
                                      (self.vtx_branches, self.n_vtx)):
        features.append(MeanNormZeroPadParticles(
            filename, None, branch_names, max_entries, self.nsamples))

    import uproot3 as uproot
    events = uproot.open(filename)["deepntuplizer/tree"]
    truth = reduceTruth(events.arrays(self.truth_branches))

    # important, float32 and C-type!
    truth = truth.astype(dtype='float32', order='C')
    features = [array.astype(dtype='float32', order='C') for array in features]

    if self.remove:
        wanted = [self.weightbranchX, self.weightbranchY]
        wanted.extend(self.truth_branches)
        wanted.extend(self.undefTruth)

        fileTimeOut(filename, 120)
        for_remove = root2array(filename,
                                treename="deepntuplizer/tree",
                                stop=None,
                                branches=wanted)
        notremoves = weighterobjects['weigther'].createNotRemoveIndices(
            for_remove)
        # Entries flagged as undefined are subtracted from the keep mask.
        notremoves -= for_remove['isUndefined']
        print('took ', timer.getAndReset(), ' to create remove indices')

        print('remove')
        keep = notremoves > 0
        features = [array[keep] for array in features]
        truth = truth[keep]

    newnsamp = features[0].shape[0]
    print('reduced content to ',
          int(float(newnsamp) / float(self.nsamples) * 100), '%')

    print('remove nans')
    features = [np.where(np.isfinite(array), array, 0) for array in features]

    return features, [truth], []
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    """Read one ROOT file and fill ``self.w``, ``self.x`` and ``self.y``.

    Four padded arrays are built from ``self.branches`` /
    ``self.branchcutoffs``. When ``self.remove`` is set, entries rejected by
    the weighter or flagged ``isUndefined`` are dropped from all arrays.

    Args:
        filename: Path of the ROOT file to read.
        TupleMeanStd: Passed through to the padding helpers.
        weighter: Provides ``createNotRemoveIndices`` and ``getJetWeights``.
    """
    from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles
    import numpy
    from DeepJetCore.stopwatch import stopwatch
    step_timer = stopwatch()
    total_timer = stopwatch()
    import ROOT

    # wait for eos to recover (timeout argument: 120)
    fileTimeOut(filename, 120)
    root_file = ROOT.TFile(filename)
    root_tree = root_file.Get("deepntuplizer/tree")
    self.nsamples = root_tree.GetEntries()
    print('took ', step_timer.getAndReset(), ' seconds for getting tree entries')

    # split for convolutional network
    x_global = MeanNormZeroPad(filename, TupleMeanStd,
                               [self.branches[0]],
                               [self.branchcutoffs[0]], self.nsamples)
    x_cpf, x_npf, x_sv = [
        MeanNormZeroPadParticles(filename, TupleMeanStd,
                                 self.branches[group],
                                 self.branchcutoffs[group], self.nsamples)
        for group in (1, 2, 3)
    ]
    print('took ', step_timer.getAndReset(), ' seconds for mean norm and zero padding (C module)')

    Tuple = self.readTreeFromRootToTuple(filename)

    if self.remove:
        notremoves = weighter.createNotRemoveIndices(Tuple)
        # Entries flagged as undefined are subtracted from the keep mask.
        notremoves -= Tuple['isUndefined']
        print('took ', step_timer.getAndReset(), ' to create remove indices')

    if self.weight:
        weights = weighter.getJetWeights(Tuple)
    elif self.remove:
        weights = notremoves
    else:
        print('neither remove nor weight')
        weights = numpy.ones(self.nsamples)

    alltruth = self.reduceTruth(Tuple[self.truthclasses])

    if self.remove:
        print('remove')
        keep = notremoves > 0
        weights, x_global, x_cpf, x_npf, x_sv, alltruth = [
            array[keep]
            for array in (weights, x_global, x_cpf, x_npf, x_sv, alltruth)
        ]

    newnsamp = x_global.shape[0]
    print('reduced content to ',
          int(float(newnsamp) / float(self.nsamples) * 100), '%')
    self.nsamples = newnsamp

    print(x_global.shape, self.nsamples)

    self.w = [weights]
    self.x = [x_global, x_cpf, x_npf, x_sv]
    self.y = [alltruth]
def convertFromSourceFile(self, filename, weighterobjects, istraining):
    """Produce the numpy training arrays from one ROOT file.

    Builds one global array and six per-candidate arrays, reads the
    ``self.truth_branches`` columns as truth, optionally drops entries
    selected by the weighter and replaces non-finite feature values by 0.

    Args:
        filename: Path of the ROOT file to convert (tree name ``"tree"``).
        weighterobjects: Mapping whose ``'weigther'`` entry provides
            ``createNotRemoveIndices``; only read when ``self.remove`` is set.
        istraining: When false, ``self.remove`` is switched off first.

    Returns:
        ``(features, truth, weights)``: a list with the seven feature arrays,
        a one-element list with the truth array, and an empty weight list.
    """
    from DeepJetCore.Weighter import Weighter
    from DeepJetCore.stopwatch import stopwatch
    timer = stopwatch()
    overall_timer = stopwatch()

    if not istraining:
        self.remove = False

    print('reading ' + filename)

    import ROOT
    from root_numpy import tree2array, root2array
    # wait for eos to recover (timeout argument: 120)
    fileTimeOut(filename, 120)
    root_file = ROOT.TFile(filename)
    root_tree = root_file.Get("tree")
    self.nsamples = root_tree.GetEntries()

    from DeepJetCore.preprocessing import MeanNormZeroPad, MeanNormZeroPadParticles

    print('padding ' + filename)

    # 2nd argument None: means no normalisation.
    # features[0] is the global array; the rest follow the order
    # neutral, charged, photon, electron, muon, SV.
    features = [MeanNormZeroPad(filename, None,
                                [self.global_branches], [1], self.nsamples)]
    candidate_groups = (
        (self.pfCand_neutral_branches, self.npfCand_neutral),
        (self.pfCand_charged_branches, self.npfCand_charged),
        (self.pfCand_photon_branches, self.npfCand_photon),
        (self.pfCand_electron_branches, self.npfCand_electron),
        (self.pfCand_muon_branches, self.npfCand_muon),
        (self.SV_branches, self.nSV),
    )
    for branch_names, max_entries in candidate_groups:
        features.append(MeanNormZeroPadParticles(
            filename, None, branch_names, max_entries, self.nsamples))

    import uproot3 as uproot
    events = uproot.open(filename)["tree"]
    truth = np.concatenate(
        [np.expand_dims(events.array(column), axis=1)
         for column in self.truth_branches],
        axis=1)

    # important, float32 and C-type!
    truth = truth.astype(dtype='float32', order='C')
    features = [array.astype(dtype='float32', order='C') for array in features]

    if self.remove:
        wanted = [self.weightbranchX, self.weightbranchY]
        wanted.extend(self.truth_branches)
        wanted.extend(self.undefTruth)

        fileTimeOut(filename, 120)
        # returns a structured np array
        for_remove = root2array(filename,
                                treename="tree",
                                stop=None,
                                branches=wanted)
        notremoves = weighterobjects['weigther'].createNotRemoveIndices(
            for_remove)
        print('took ', timer.getAndReset(), ' to create remove indices')

        keep = notremoves > 0
        features = [array[keep] for array in features]
        truth = truth[keep]

    newnsamp = features[0].shape[0]
    print('Weighter reduced content to ',
          int(float(newnsamp) / float(self.nsamples) * 100), '%')

    print('removing nans')
    features = [np.where(np.isfinite(array), array, 0) for array in features]

    return features, [truth], []
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    """Read one ROOT file and fill ``self.w``, ``self.x``, ``self.z``, ``self.y``.

    Three padded arrays are built from the first two branch groups (the
    second group both with ``ZeroPadParticles`` and with
    ``MeanNormZeroPadParticles``). Entries whose reduced truth sums to zero
    are always dropped; when ``self.remove`` is set, entries rejected by the
    weighter are dropped as well. Every filter is applied to all stored
    arrays so that ``self.w``, ``self.x``, ``self.z`` and ``self.y`` keep the
    same number of entries.

    Args:
        filename: Path of the ROOT file to read (tree name ``"tree"``).
        TupleMeanStd: Passed through to the padding helpers.
        weighter: Provides ``createNotRemoveIndices`` and ``getJetWeights``.
    """
    # the first part is standard, no changes needed
    from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles, ZeroPadParticles
    import numpy
    import ROOT

    # wait for eos to recover (timeout argument: 120)
    fileTimeOut(filename, 120)
    rfile = ROOT.TFile(filename)
    tree = rfile.Get("tree")
    self.nsamples = tree.GetEntries()

    x_glb = ZeroPadParticles(filename, TupleMeanStd,
                             self.branches[0],
                             self.branchcutoffs[0], self.nsamples)
    x_dbr = ZeroPadParticles(filename, TupleMeanStd,
                             self.branches[1],
                             self.branchcutoffs[1], self.nsamples)
    x_db = MeanNormZeroPadParticles(filename, TupleMeanStd,
                                    self.branches[1],
                                    self.branchcutoffs[1], self.nsamples)

    Tuple = self.readTreeFromRootToTuple(filename)

    # The mask is computed unconditionally; it is only applied when
    # self.remove is set.
    notremoves = weighter.createNotRemoveIndices(Tuple)

    if self.weight:
        weights = weighter.getJetWeights(Tuple)
    elif self.remove:
        weights = notremoves
    else:
        print('neither remove nor weight')
        weights = numpy.empty(self.nsamples)
        weights.fill(1.)

    # create all collections:
    truthtuple = Tuple[self.truthclasses]
    alltruth = self.reduceTruth(truthtuple)

    # Drop entries without any truth class. x_dbr is filtered together with
    # the other arrays: it is stored in self.z next to x_glb and would
    # otherwise end up with a different number of entries.
    defined = numpy.sum(alltruth, axis=1) > 0
    weights = weights[defined]
    x_glb = x_glb[defined]
    x_dbr = x_dbr[defined]
    x_db = x_db[defined]
    alltruth = alltruth[defined]

    # remove the entries to get same jet shapes
    if self.remove:
        print('remove')
        # Bring the weighter mask to the already-filtered length first.
        notremoves = notremoves[defined]
        keep = notremoves > 0
        weights = weights[keep]
        x_glb = x_glb[keep]
        x_dbr = x_dbr[keep]
        x_db = x_db[keep]
        alltruth = alltruth[keep]

    newnsamp = x_glb.shape[0]
    print('reduced content to ',
          int(float(newnsamp) / float(self.nsamples) * 100), '%')
    self.nsamples = newnsamp

    # fill everything
    self.w = [weights]
    self.x = [x_db]
    self.z = [x_glb, x_dbr]
    self.y = [alltruth]
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    """Read one ROOT file and fill ``self.w``, ``self.x`` and ``self.y``.

    In addition to the four padded arrays built from ``self.branches`` /
    ``self.branchcutoffs``, this builds density and count maps for the
    ``Cpfcan_*`` and ``Npfcan_*`` branches and concatenates them along
    axis 3 into one map input. ``jet_corr_pt`` is appended to the inputs and
    ``gen_pt_WithNu`` to the targets.

    Args:
        filename: Path of the ROOT file to read.
        TupleMeanStd: Passed through to the padding and count-map helpers.
        weighter: Provides ``createNotRemoveIndices`` and ``getJetWeights``.
    """
    from DeepJetCore.preprocessing import MeanNormApply, createCountMap, createDensity, MeanNormZeroPad, createDensityMap, MeanNormZeroPadParticles
    import numpy
    from DeepJetCore.stopwatch import stopwatch
    sw = stopwatch()
    swall = stopwatch()
    import ROOT

    # wait for eos to recover (timeout argument: 120)
    fileTimeOut(filename, 120)
    rfile = ROOT.TFile(filename)
    tree = rfile.Get("deepntuplizer/tree")
    self.nsamples = tree.GetEntries()
    print('took ', sw.getAndReset(), ' seconds for getting tree entries')

    # split for convolutional network
    x_global = MeanNormZeroPad(filename, TupleMeanStd,
                               [self.branches[0]],
                               [self.branchcutoffs[0]], self.nsamples)
    x_cpf = MeanNormZeroPadParticles(filename, TupleMeanStd,
                                     self.branches[1],
                                     self.branchcutoffs[1], self.nsamples)
    x_npf = MeanNormZeroPadParticles(filename, TupleMeanStd,
                                     self.branches[2],
                                     self.branchcutoffs[2], self.nsamples)
    x_sv = MeanNormZeroPadParticles(filename, TupleMeanStd,
                                    self.branches[3],
                                    self.branchcutoffs[3], self.nsamples)

    # here the difference starts
    nbins = 8

    x_chmap = createDensity(
        filename,
        inbranches=['Cpfcan_ptrel', 'Cpfcan_etarel', 'Cpfcan_phirel'],
        modes=['sum', 'average', 'average'],
        nevents=self.nsamples,
        dimension1=['Cpfcan_eta', 'jet_eta', nbins, 0.45],
        dimension2=['Cpfcan_phi', 'jet_phi', nbins, 0.45],
        counterbranch='nCpfcand',
        offsets=[-1, -0.5, -0.5])

    # The Npfcan_* branches are counted by 'nNpfcand', the same counter that
    # createCountMap uses for them below. This call previously passed
    # 'nCpfcand', the counter of the Cpfcan_* branches.
    x_neumap = createDensity(
        filename,
        inbranches=['Npfcan_ptrel', 'Npfcan_etarel', 'Npfcan_phirel'],
        modes=['sum', 'average', 'average'],
        nevents=self.nsamples,
        dimension1=['Npfcan_eta', 'jet_eta', nbins, 0.45],
        dimension2=['Npfcan_phi', 'jet_phi', nbins, 0.45],
        counterbranch='nNpfcand',
        offsets=[-1, -0.5, -0.5])

    x_chcount = createCountMap(filename, TupleMeanStd,
                               self.nsamples,
                               ['Cpfcan_eta', 'jet_eta', nbins, 0.45],
                               ['Cpfcan_phi', 'jet_phi', nbins, 0.45],
                               'nCpfcand')

    x_neucount = createCountMap(filename, TupleMeanStd,
                                self.nsamples,
                                ['Npfcan_eta', 'jet_eta', nbins, 0.45],
                                ['Npfcan_phi', 'jet_phi', nbins, 0.45],
                                'nNpfcand')

    print('took ', sw.getAndReset(), ' seconds for mean norm and zero padding (C module)')

    Tuple = self.readTreeFromRootToTuple(filename)

    if self.remove:
        notremoves = weighter.createNotRemoveIndices(Tuple)
        # Entries flagged as undefined are subtracted from the keep mask.
        undef = Tuple['isUndefined']
        notremoves -= undef
        print('took ', sw.getAndReset(), ' to create remove indices')

    if self.weight:
        weights = weighter.getJetWeights(Tuple)
    elif self.remove:
        weights = notremoves
    else:
        print('neither remove nor weight')
        weights = numpy.empty(self.nsamples)
        weights.fill(1.)

    truthtuple = Tuple[self.truthclasses]
    alltruth = self.reduceTruth(truthtuple)

    regtruth = Tuple['gen_pt_WithNu']
    regreco = Tuple['jet_corr_pt']

    if self.remove:
        print('remove')
        keep = notremoves > 0
        weights = weights[keep]
        x_global = x_global[keep]
        x_cpf = x_cpf[keep]
        x_npf = x_npf[keep]
        x_sv = x_sv[keep]

        x_chmap = x_chmap[keep]
        x_neumap = x_neumap[keep]

        x_chcount = x_chcount[keep]
        x_neucount = x_neucount[keep]

        alltruth = alltruth[keep]

        regreco = regreco[keep]
        regtruth = regtruth[keep]

    newnsamp = x_global.shape[0]
    print('reduced content to ',
          int(float(newnsamp) / float(self.nsamples) * 100), '%')
    self.nsamples = newnsamp

    x_map = numpy.concatenate((x_chmap, x_neumap, x_chcount, x_neucount), axis=3)

    # One weight array per truth target.
    self.w = [weights, weights]
    self.x = [x_global, x_cpf, x_npf, x_sv, x_map, regreco]
    self.y = [alltruth, regtruth]
def readFromRootFile(self, filename, TupleMeanStd, weighter):
    """Read one ROOT file and fill ``self.w``, ``self.x`` and ``self.y``.

    After building the four padded arrays this:

    * scales weights down by the size of the reduced class (when
      ``self.weight`` is set and ``self.reducedtruthmap`` exists),
    * randomly rejects entries so that classes approach the count of the
      rarest one (when ``self.remove`` is set),
    * drops entries rejected by the weighter and entries with non-positive
      weight,
    * drops entries with no truth class or with more than one truth class.

    Args:
        filename: Path of the ROOT file to read (tree ``self.treename``).
        TupleMeanStd: Passed through to the padding helpers.
        weighter: Provides ``createNotRemoveIndices``, ``getJetWeights`` and
            ``totalcounts``.
    """
    from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles
    import ROOT

    # wait for eos to recover (timeout argument: 60)
    fileTimeOut(filename, 60)
    root_file = ROOT.TFile(filename)
    root_tree = root_file.Get(self.treename)
    self.nsamples = root_tree.GetEntries()

    x_global = MeanNormZeroPad(filename, TupleMeanStd,
                               [self.branches[0]],
                               [self.branchcutoffs[0]], self.nsamples)
    x_cpf, x_npf, x_sv = [
        MeanNormZeroPadParticles(filename, TupleMeanStd,
                                 self.branches[group],
                                 self.branchcutoffs[group], self.nsamples)
        for group in (1, 2, 3)
    ]

    Tuple = self.readTreeFromRootToTuple(filename)

    undef = Tuple['isUndefined']

    if self.remove:
        notremoves = weighter.createNotRemoveIndices(Tuple)
        notremoves -= undef

    if self.weight:
        weights = weighter.getJetWeights(Tuple)
    elif self.remove:
        weights = notremoves
    else:
        weights = np.ones(self.nsamples)

    truthtuple = Tuple[self.truthclasses]
    alltruth = self.reduceTruth(truthtuple)

    # scale down by number of classes in a reduced class
    if self.weight and hasattr(self, 'reducedtruthmap'):
        for entry, flags in enumerate(alltruth):
            for column, reduced_name in enumerate(self.reducedtruthclasses):
                if flags[column] == 1:
                    weights[entry] = weights[entry] * 1. / len(
                        self.reducedtruthmap[reduced_name])

    # remove jets to have the same counts
    if self.remove:
        if hasattr(self, 'reducedtruthmap'):
            # Per reduced class: summed counts of its member truth classes.
            total = []
            for reduced_name in self.reducedtruthclasses:
                members = self.reducedtruthmap[reduced_name]
                total.append(sum([
                    weighter.totalcounts[position]
                    for position, name in enumerate(self.truthclasses)
                    if name in members
                ]))
            rows = alltruth
            class_names = self.reducedtruthclasses
        else:
            total = weighter.totalcounts
            rows = truthtuple
            class_names = self.truthclasses

        lowest = min(total)
        # One random number is drawn per (entry, set flag) pair, in row-major
        # order; the order of draws must not change.
        for entry, flags in enumerate(rows):
            for column, _ in enumerate(class_names):
                if not flags[column]:
                    continue
                keep_fraction = float(lowest) / total[column]
                if np.random.ranf() > keep_fraction:
                    notremoves[entry] = 0

    def select(mask, *arrays):
        # Apply one boolean mask to several arrays at once.
        return [array[mask] for array in arrays]

    if self.remove:
        weights, x_global, x_cpf, x_npf, x_sv, alltruth = select(
            notremoves > 0,
            weights, x_global, x_cpf, x_npf, x_sv, alltruth)

    if self.weight:
        weights, x_global, x_cpf, x_npf, x_sv, alltruth = select(
            weights > 0,
            weights, x_global, x_cpf, x_npf, x_sv, alltruth)

    # remove samples with no predicted class
    has_class = ~np.all(alltruth == 0, axis=1)
    weights, x_global, x_cpf, x_npf, x_sv, alltruth = select(
        has_class, weights, x_global, x_cpf, x_npf, x_sv, alltruth)

    # remove samples with multiple predicted classes
    single_class = ~(np.sum(alltruth, axis=1) > 1)
    weights, x_global, x_cpf, x_npf, x_sv, alltruth = select(
        single_class, weights, x_global, x_cpf, x_npf, x_sv, alltruth)

    newnsamp = x_global.shape[0]
    logging.info('reduced content to {}%'.format(
        int(float(newnsamp) / float(self.nsamples) * 100)))
    self.nsamples = newnsamp

    if weights.ndim > 1:
        weights = weights.reshape(weights.shape[0])

    self.w = [weights]
    self.x = [x_global, x_cpf, x_npf, x_sv]
    self.y = [alltruth]