Example #1
    def createWeighterObjects(self, allsourcefiles):
        #
        # Calculates the weights needed for flattening the pt/eta spectrum

        from DeepJetCore.Weighter import Weighter
        weighter = Weighter()
        weighter.undefTruth = self.undefTruth
        weighter.class_weights = self.class_weights
        branches = [self.weightbranchX, self.weightbranchY]
        branches.extend(self.truth_branches)

        if self.remove:
            weighter.setBinningAndClasses([self.weight_binX, self.weight_binY],
                                          self.weightbranchX,
                                          self.weightbranchY,
                                          self.truth_branches,
                                          self.red_classes,
                                          self.truth_red_fusion,
                                          method=self.referenceclass)

        counter = 0
        if self.remove:
            for fname in allsourcefiles:
                fileTimeOut(fname, 120)
                nparray = uproot_root2array(fname,
                                            treename="ttree",
                                            stop=None,
                                            branches=branches)
                norm_hist = True
                if self.referenceclass == 'flatten':
                    norm_hist = False
                weighter.addDistributions(nparray, norm_h=norm_hist)
                #del nparray
                counter = counter + 1
            weighter.createRemoveProbabilitiesAndWeights(self.referenceclass)

        print("calculate means")
        print("debugging this point here!")
        from DeepJetCore.preprocessing import meanNormProd
        nparray = self.readTreeFromRootToTuple(
            allsourcefiles,
            branches=self.vtx_branches + self.eta_rel_branches +
            self.track_branches + self.global_branches,
            limit=500000)
        print("read tree from sourcefiles")
        for a in (self.vtx_branches + self.eta_rel_branches +
                  self.track_branches + self.global_branches):
            for b in range(len(nparray[a])):
                nparray[a][b] = np.where(
                    np.logical_and(np.isfinite(nparray[a][b]),
                                   np.abs(nparray[a][b]) < 100000.0),
                    nparray[a][b], 0)
        means = np.array([], dtype='float32')
        if len(nparray):
            means = meanNormProd(nparray)
        print("weigheter created")
        return {'weigther': weighter, 'means': means}
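
The histogram-based flattening performed by the Weighter can be illustrated without the framework. Below is a standalone numpy sketch of the idea only (not the DeepJetCore.Weighter implementation): events in over-populated pt bins are kept with a probability scaled to the least-populated bin, so the surviving spectrum comes out roughly flat.

import numpy as np

# standalone sketch of histogram-based "remove" weighting; illustration only
rng = np.random.default_rng(0)
pt = rng.exponential(scale=30.0, size=100000)       # steeply falling spectrum
bins = np.linspace(0.0, 200.0, 41)
hist, _ = np.histogram(pt, bins=bins)
# keep probability per bin: scale every bin down to the least populated one
keep_prob = np.where(hist > 0, hist.min() / np.maximum(hist, 1), 0.0)
# overflow values are clipped into the last bin for simplicity
ibin = np.clip(np.digitize(pt, bins) - 1, 0, len(hist) - 1)
notremoves = rng.random(pt.size) < keep_prob[ibin]  # True = keep
flat_pt = pt[notremoves]                            # approximately flat in pt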
Example #2
 def convertFromSourceFile(self, filename, weighterobjects, istraining, treename="Events"):
     
     fileTimeOut(filename, 10)#10 seconds for eos to recover 
     tree = uproot.open(filename)[treename]
     
     
     rechitcoll = RecHitCollection(use_true_muon_momentum=self.include_tracks,
                                   cp_plus_pu_mode=self.cp_plus_pu_mode,
                                   tree=tree)
     
     #in a similar manner, we can also add tracks from conversions etc here
     if self.include_tracks:
         trackcoll = TrackCollection(tree=tree)
         rechitcoll.append(trackcoll)
     
     # adds t_is_unique
     rechitcoll.addUniqueIndices()
     
     # converts to DeepJetCore.SimpleArray
     farr = rechitcoll.getFinalFeaturesSA()
     t = rechitcoll.getFinalTruthDictSA()
     
     return [farr,
             t['t_idx'], t['t_energy'], t['t_pos'], t['t_time'],
             t['t_pid'], t['t_spectator'], t['t_fully_contained'],
             t['t_rec_energy'], t['t_is_unique']], [], []
Example #3
    def createWeighterObjects(self, allsourcefiles):
        #
        # Calculates the weights needed for flattening the pt/eta spectrum

        from DeepJetCore.Weighter import Weighter
        weighter = Weighter()
        weighter.undefTruth = self.undefTruth
        branches = [self.weightbranchX, self.weightbranchY]
        branches.extend(self.truth_branches)

        if self.remove:
            weighter.setBinningAndClasses([self.weight_binX, self.weight_binY],
                                          self.weightbranchX,
                                          self.weightbranchY,
                                          self.truth_branches)

        counter = 0
        import ROOT
        from root_numpy import tree2array, root2array
        if self.remove:
            for fname in allsourcefiles:
                fileTimeOut(fname, 120)
                nparray = root2array(fname,
                                     treename="deepntuplizer/tree",
                                     stop=None,
                                     branches=branches)
                weighter.addDistributions(nparray)
                #del nparray
                counter = counter + 1
            weighter.createRemoveProbabilitiesAndWeights(self.referenceclass)
        return {'weigther': weighter}
Example #4
 def readFromRootFile(self,filename,TupleMeanStd, weighter):
 
     # this function defines how to convert the root ntuple to the training format
     # options are not yet described here
     
     import ROOT
     fileTimeOut(filename,120) #give eos a minute to recover
     rfile = ROOT.TFile(filename)
     tree = rfile.Get("tree")
     self.nsamples=tree.GetEntries()
     
     
     # user code, example works with the example 2D images in root format generated by make_example_data
     from DeepJetCore.preprocessing import read2DArray
     print(filename)
     feature_array = read2DArray(filename,"tree","image2d",self.nsamples,32,32)
     
     print('feature_array',feature_array.shape)
     truth = self.read_truthclasses(filename)
     
     #notremoves=weighter.createNotRemoveIndices(Tuple)
     
     # this removes parts of the dataset for weighting the events
     #feature_array = feature_array[notremoves > 0]
             
     # call this in the end
     
     self.nsamples=len(feature_array)
     
     self.x=[feature_array] # list of feature numpy arrays
     self.y=[truth] # list of target numpy arrays (truth)
     self.w=[] # list of weight arrays. One for each truth target, not used
Example #5
    def readFromRootFile(self,filename,TupleMeanStd, weighter):
        # this function defines how to convert the root ntuple to the training format
        # options are not yet described here

        fileTimeOut(filename,120)

        uproot_tree = uproot.open(filename)['clusters']

        cluster_pt = uproot_tree.array('cluster_pt')
        pt_filter = cluster_pt > 5.

        n_cell = uproot_tree.array('n_cell')

        def to_ndarray(*args):
            return numpy.squeeze(numpy.dstack(args))

        branches = [
            'cell_layer',
            'cell_x',
            'cell_y',
            'cell_z',
            'cell_r',
            'cell_eta',
            'cell_theta',
            'cell_phi',
            'cell_dist',
            'cell_energy',
            'cell_wafer',
            'cell_wafertype'
        ]

        print("reading feature array")
        feature_array = uproot_tree.arrays(branches, outputtype=to_ndarray)
        print(feature_array.shape)

        print("reading truth")
        #truth = self.read_truthclasses(filename)
        truth = uproot_tree.arrays(self.truthclasses, outputtype=to_ndarray)

        print("creating remove indxs")
        Tuple = self.readTreeFromRootToTuple(filename)
        notremoves=weighter.createNotRemoveIndices(Tuple)

        notremoves += pt_filter

        # this removes parts of the dataset for weighting the events
        if self.remove:
            feature_array = feature_array[notremoves > 0]
            n_cell = n_cell[notremoves > 0]
            truth = truth[notremoves > 0]
        # call this in the end

        self.nsamples=len(feature_array)

        self.x=[n_cell, feature_array] # list of feature numpy arrays
        self.y=[truth] # list of target numpy arrays (truth)
        self.w=[] # list of weight arrays. One for each truth target
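
Note the notremoves += pt_filter line above: adding the boolean mask to the integer keep-flags promotes it to int, so an entry survives the notremoves > 0 selection if either the weighter kept it or it passes the pt cut (a logical OR, not an AND). A tiny standalone numpy illustration of that semantics:

import numpy as np

notremoves = np.array([1, 0, 1, 0])                 # weighter keep-flags (1 = keep)
pt_filter = np.array([False, True, False, False])   # additional pt > 5 cut
notremoves = notremoves + pt_filter                 # bool promoted to int: OR
data = np.arange(4)
print(data[notremoves > 0])                         # -> [0 1 2]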
Example #6
def readPU(minbias_files, nevents=50, nfiles=5, nPU=200):

    from DeepJetCore.TrainData import fileTimeOut
    import ROOT

    select = np.array(range(len(minbias_files)))
    np.random.shuffle(select)
    if len(select) < nfiles:
        nfiles = len(select)
        print("mixing.readPU: warning: fewer PU files available than requested"
              " - falling back to", nfiles)
    #print(select)
    #open them
    #take nPU random events
    inarrs = []
    i = 0
    while len(inarrs) < nfiles:
        if i >= len(select):
            break  # ran out of candidate files, avoid an index error
        file = minbias_files[select[i]]
        i += 1
        fileTimeOut(file, 10)
        #check if file is valid
        try:
            f = ROOT.TFile.Open(file)
            f.Get("B4")
        except Exception:
            continue

        ramfile = file
        try:
            tree = uproot.open(ramfile)["B4"]
            arr = tonumpy(tree["rechit_energy"].array())
            #print('arr',arr.shape)
            #arr = np.expand_dims(arr, axis=0)# 1 x nev x rh
            #print('arr2',arr.shape, ramfile)
            inarrs.append(arr)
        except Exception:
            continue

    allarr = np.concatenate(inarrs, axis=0)  # nfiles*nev x rh
    #allarr = np.reshape(allarr, [allarr.shape[0]*allarr.shape[1],allarr.shape[2]])

    #print(allarr.shape)
    #print('mixing events')
    evtarrs = []
    for ev in range(nevents):
        idx = np.random.randint(allarr.shape[0], size=nPU)
        evt = allarr[idx]
        evt = np.sum(evt, axis=0, keepdims=True)
        #print(evt.shape)
        evtarrs.append(evt)

    evts = np.concatenate(evtarrs, axis=0)
    #print('mixed '+str(evts.shape))
    return evts
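
A hypothetical call for context (the file names are placeholders): readPU returns one row per mixed event, each row being the elementwise sum of the rechit energies of nPU randomly drawn minimum-bias events.

# illustrative usage only; the file names are placeholders
minbias_files = ["minbias_0.root", "minbias_1.root", "minbias_2.root"]
pu_energy = readPU(minbias_files, nevents=50, nfiles=2, nPU=200)
print(pu_energy.shape)  # (nevents, n_rechits)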
Example #7
    def readFromRootFile(self, filename, TupleMeanStd, weighter):

        # this function defines how to convert the root ntuple to the training format
        # options are not yet described here

        import ROOT
        fileTimeOut(filename, 120)  #give eos a minute to recover
        rfile = ROOT.TFile(filename)
        tree = rfile.Get(self.treename)
        self.nsamples = tree.GetEntries()

        # user code, example works with the example 2D images in root format generated by make_example_data
        from DeepJetCore.preprocessing import readListArray

        feature_array, n_rechits_cut = readListArray(
            filename,
            self.treename,
            self.feat_branch,
            self.nsamples,
            list_size=self.max_rechits,
            n_feat_per_element=self.n_features,
            zeropad=True,
            list_size_cut=True)

        energy_only = feature_array[:, :, 0:1]  #keep dimension

        fraction_array, _ = readListArray(
            filename,
            self.treename,
            self.truth_branch,
            self.nsamples,
            list_size=self.max_rechits,
            n_feat_per_element=self.n_simcluster,  # nsimcluster, right now just one, but zero-padded here
            zeropad=True,
            list_size_cut=True)

        print('TrainData_hitlistX: ', filename,
              ';convert from root: fraction of hits cut ',
              100. * float(n_rechits_cut) / float(self.nsamples), '%')

        #needs the energy, too to determine weights
        fraction_array = numpy.concatenate([fraction_array, energy_only],
                                           axis=-1)
        #in case something was removed here
        if n_rechits_cut > 0:
            feature_array = feature_array[0:self.nsamples - n_rechits_cut]
            fraction_array = fraction_array[0:self.nsamples - n_rechits_cut]

        self.nsamples = len(feature_array)

        self.x = [feature_array]
        self.y = [
            fraction_array
        ]  # we need the features also in the truth part for weighting
        self.w = []  # no event weights
Example #8
 def fileIsValid(self, filename):
     try:
         fileTimeOut(filename, 2)
         tree = uproot.open(filename)["WindowNTupler/tree"]
         f = ROOT.TFile.Open(filename)
         t = f.Get("WindowNTupler/tree")
         if t.GetEntries() < 1:
             raise ValueError("")
     except Exception as e:
         print(e)
         return False
     return True
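
A hedged usage sketch: pre-filtering a source-file list before conversion. Here self is an instance of the TrainData subclass that defines fileIsValid above, and allsourcefiles is an assumed list of input paths.

# illustrative only: skip unreadable or empty files up front
good_files = [f for f in allsourcefiles if self.fileIsValid(f)]
print(len(good_files), "of", len(allsourcefiles), "files are usable")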
Example #9
    def convertFromSourceFile(self, filename, weighterobjects, istraining):
        from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles
        import numpy
        from DeepJetCore.stopwatch import stopwatch
        
        sw=stopwatch()
        swall=stopwatch()
        
        import ROOT
        
        fileTimeOut(filename,120) #give eos a minute to recover
        rfile = ROOT.TFile(filename)
        tree = rfile.Get("deepntuplizer/tree")
        self.nsamples=tree.GetEntries()
        
        print('took ', sw.getAndReset(), ' seconds for getting tree entries')
        
        
        # split for convolutional network
        
        x_global = MeanNormZeroPad(filename,None,
                                   [self.branches[0]],
                                   [self.branchcutoffs[0]],self.nsamples)
        
        x_cpf = MeanNormZeroPadParticles(filename,None,
                                   self.branches[1],
                                   self.branchcutoffs[1],self.nsamples)
        
        x_npf = MeanNormZeroPadParticles(filename,None,
                                   self.branches[2],
                                   self.branchcutoffs[2],self.nsamples)
        
        x_sv = MeanNormZeroPadParticles(filename,None,
                                   self.branches[3],
                                   self.branchcutoffs[3],self.nsamples)
        
        
        
        print('took ', sw.getAndReset(), ' seconds for mean norm and zero padding (C module)')
        
        Tuple = self.readTreeFromRootToTuple(filename)
        
        
        
        truthtuple =  Tuple[self.truthclasses]
        #print(self.truthclasses)
        alltruth=self.reduceTruth(truthtuple)
        
       
        print(x_global.shape,self.nsamples)

        return [x_global,x_cpf,x_npf,x_sv], [alltruth], []
Example #10
    def readFromRootFile(self, filename, TupleMeanStd, weighter):
        # this function defines how to convert the root ntuple to the training format
        # options are not yet described here

        fileTimeOut(filename, 120)

        uproot_tree = uproot.open(filename)['clusters']

        cluster_pt = uproot_tree.array('cluster_pt')
        pt_filter = cluster_pt > 5.

        n_cell = uproot_tree.array('n_cell')

        def to_ndarray(*args):
            return np.stack(args, axis=-1)

        branches = ['cell_energy', 'cell_theta', 'cell_phi', 'cell_z']

        print("reading feature array")
        feature_array = uproot_tree.arrays(branches, outputtype=to_ndarray)
        print(feature_array.shape)

        print("reading truth")
        #truth = self.read_truthclasses(filename)
        truth = uproot_tree.arrays(self.truthclasses, outputtype=to_ndarray)

        egamma = truth[..., 0:1] + truth[..., 2:3]
        muon = truth[..., 1:2]
        pi0 = truth[..., 3:4]
        hadron = truth[..., 4:5] + truth[..., 5:6]
        truth = np.concatenate((egamma, muon, pi0, hadron), axis=-1)

        print("creating remove indxs")
        Tuple = self.readTreeFromRootToTuple(filename)
        notremoves = weighter.createNotRemoveIndices(Tuple)

        notremoves += pt_filter

        # this removes parts of the dataset for weighting the events
        if self.remove:
            feature_array = feature_array[notremoves > 0]
            n_cell = n_cell[notremoves > 0]
            truth = truth[notremoves > 0]
        # call this in the end

        self.nsamples = len(feature_array)

        self.x = [feature_array, n_cell]  # list of feature numpy arrays
        self.y = [truth]  # list of target numpy arrays (truth)
        self.w = []  # list of weight arrays. One for each truth target
Example #11
    def convertFromSourceFile(self, filename, weighterobjects, istraining):
        # This is the only really mandatory function (unless writeFromSourceFile is defined).
        # It defines the conversion rule from an input source file to the lists of training
        # arrays self.x, self.y, self.w
        #  self.x is a list of input feature arrays
        #  self.y is a list of truth arrays
        #  self.w is optional and can contain a weight array
        #         (needs to have same number of entries as truth array)
        #         If no weights are needed, this can be left completely empty
        #
        # The conversion should convert finally to numpy arrays. In the future,
        # also tensorflow tensors will be supported.
        #
        # In this example, different ways of reading files are deliberately mixed
        #

        print('reading ' + filename)

        import ROOT
        fileTimeOut(filename, 120)  #give eos a minute to recover
        rfile = ROOT.TFile(filename)
        tree = rfile.Get("tree")
        nsamples = tree.GetEntries()

        # user code, example works with the example 2D images in root format generated by make_example_data
        from DeepJetCore.preprocessing import read2DArray

        feature_array = read2DArray(filename, "tree", "image2d", nsamples, 32,
                                    32)

        print('feature_array', feature_array.shape)

        import uproot

        urfile = uproot.open(filename)["tree"]
        truth = np.concatenate([
            np.expand_dims(urfile.array("isA"), axis=1),
            np.expand_dims(urfile.array("isB"), axis=1),
            np.expand_dims(urfile.array("isC"), axis=1)
        ],
                               axis=1)

        truth = truth.astype(dtype='float32',
                             order='C')  #important, float32 and C-type!

        self.nsamples = len(feature_array)

        #returns a list of feature arrays, a list of truth arrays and a list of weight arrays
        return [feature_array], [truth], []
Example #12
 def fileIsValid(self, filename):
     #uproot does not raise exceptions early enough for testing
     import ROOT
     try:
         fileTimeOut(filename, 2)
         tree = uproot.open(filename)["Events"]
         f=ROOT.TFile.Open(filename)
         t=f.Get("Events")
         if t.GetEntries() < 1:
             raise ValueError("")
     except Exception as e:
         print('problem with file',filename)
         print(e)
         return False
     return True
Example #13
    def createWeighterObjects(self, allsourcefiles):
        #
        # Calculates the weights needed for flattening the pt/eta spectrum

        from DeepJetCore.Weighter import Weighter
        weighter = Weighter()
        weighter.undefTruth = self.undefTruth
        branches = [self.weightbranchX, self.weightbranchY]
        branches.extend(self.truth_branches)

        if self.remove:
            weighter.setBinningAndClasses([self.weight_binX, self.weight_binY],
                                          self.weightbranchX,
                                          self.weightbranchY,
                                          self.truth_branches)

        counter = 0
        import ROOT
        from root_numpy import tree2array, root2array
        if self.remove:
            for fname in allsourcefiles:
                fileTimeOut(fname, 120)
                nparray = root2array(fname,
                                     treename="deepntuplizer/tree",
                                     stop=None,
                                     branches=branches)
                weighter.addDistributions(nparray)
                #del nparray
                counter = counter + 1
            weighter.createRemoveProbabilitiesAndWeights(self.referenceclass)

        print("calculate means")
        from DeepJetCore.preprocessing import meanNormProd
        nparray = self.readTreeFromRootToTuple(
            allsourcefiles,
            branches=self.vtx_branches + self.eta_rel_branches +
            self.track_branches + self.global_branches,
            limit=500000)
        for a in (self.vtx_branches + self.eta_rel_branches +
                  self.track_branches + self.global_branches):
            for b in range(len(nparray[a])):
                nparray[a][b] = np.where(nparray[a][b] < 100000.0,
                                         nparray[a][b], 0)
        means = np.array([], dtype='float32')
        if len(nparray):
            means = meanNormProd(nparray)
        return {'weigther': weighter, 'means': means}
Example #14
 def convertFromSourceFile(self, filename, weighterobjects, istraining, treename="SLCIOConverted"):
     
     fileTimeOut(filename, 10)#10 seconds for eos to recover 
     
     tree = uproot.open(filename)[treename]
     nevents = tree.numentries
     selection=None
     
     hit_energy, rs = self.branchToFlatArray(tree["energy"], True, selection)
     hit_x = self.branchToFlatArray(tree["positionX"], False, selection)
     hit_y = self.branchToFlatArray(tree["positionY"], False, selection)
     hit_z = self.branchToFlatArray(tree["positionZ"], False, selection)
     
     hit_ass_truth_idx = self.branchToFlatArray(tree["maxE_particle_index"], False, selection)
     hit_ass_truth_energy = self.branchToFlatArray(tree["maxE_particle_energy"], False, selection)
     
     #not used right now
     hit_ass_truth_pX = self.branchToFlatArray(tree["maxE_particle_pX"], False, selection)
     hit_ass_truth_pY = self.branchToFlatArray(tree["maxE_particle_pY"], False, selection)
     hit_ass_truth_pZ = self.branchToFlatArray(tree["maxE_particle_pZ"], False, selection)
     
     
     
     features = np.concatenate([
         hit_energy,
         hit_x,
         hit_y,
         hit_z
     ], axis=-1)
     
     farr = SimpleArray(features,rs,name="features")
     
     t_idxarr = SimpleArray(hit_ass_truth_idx,rs,name="t_idx")
     t_energyarr = SimpleArray(hit_ass_truth_energy,rs,name="t_energy")
     
     zeros = np.zeros_like(hit_ass_truth_energy)
     #just for compatibility
     t_posarr = SimpleArray(zeros,rs,name="t_pos")
     t_time = SimpleArray(zeros,rs,name="t_time")
     t_pid = SimpleArray(zeros,rs,name="t_pid") #this would need some massaging so we can't use the PID directly
     t_spectator = SimpleArray(zeros,rs,name="t_spectator")
     t_fully_contained = SimpleArray(zeros,rs,name="t_fully_contained")
     
     t_rest = SimpleArray(zeros,rs,name="t_rest") #breaks with old plotting but needs to be done at some point
             
     return [farr, t_idxarr, t_energyarr, t_posarr, t_time, t_pid, t_spectator, t_fully_contained], [t_rest], []
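
branchToFlatArray is a helper of the surrounding class and is not shown here. A minimal sketch of what it plausibly does, assuming uproot3-style jagged branches and DeepJetCore-style row splits; this is an illustration, not the original implementation:

import numpy as np

def branchToFlatArray(branch, returnRowSplits=False, selection=None):
    # flatten a jagged per-event branch to shape (nhits, 1); optionally also
    # return the row splits that mark the event boundaries in the flat array
    a = branch.array()
    if selection is not None:
        a = a[selection]
    counts = np.asarray(a.counts)
    rs = np.concatenate([[0], np.cumsum(counts)]).astype('int64')
    flat = np.asarray(a.content, dtype='float32').reshape(-1, 1)
    return (flat, rs) if returnRowSplits else flat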
Example #15
    def readTreeFromRootToTuple(self, filenames, limit=None, branches=None):
        '''
        To be used to get the initial tuple for further processing in inheriting classes.
        Makes sure the number of entries is properly set.
        
        Can also read a list of files (e.g. to produce weights/removes from larger statistics);
        not fully tested yet.
        '''

        if branches is None or len(branches) == 0:
            return np.array([], dtype='float32')

        #print(branches)
        #remove duplicates
        usebranches = list(set(branches))
        tmpbb = []
        for b in usebranches:
            if len(b):
                tmpbb.append(b)
        usebranches = tmpbb

        import ROOT
        from root_numpy import tree2array
        if isinstance(filenames, list):
            for f in filenames:
                fileTimeOut(f, 120)
            print('add files')
            print("Branches:\n{}".format(usebranches))

            # this was substituted from the old root2array function
            nparray = uproot_root2array(filenames,
                                        treename="ttree",
                                        stop=limit,
                                        branches=usebranches)
            print('done add files')
            return nparray
        else:
            fileTimeOut(filenames, 120)  #give eos a minute to recover
            rfile = ROOT.TFile(filenames)
            tree = rfile.Get(self.treename)
            if not self.nsamples:
                self.nsamples = tree.GetEntries()
            nparray = tree2array(tree, stop=limit, branches=usebranches)
            return nparray
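
uproot_root2array (also used in Example #1) is not part of root_numpy; per the comment above it substitutes the old root2array call with an uproot-based reader. A minimal sketch under the assumption of flat numeric branches (the real helper may do more):

import numpy as np
import uproot3 as uproot

def uproot_root2array(filenames, treename, stop=None, branches=None):
    # read the requested branches from one or more files and return a
    # root_numpy-style record array; flat numeric branches assumed
    if not isinstance(filenames, list):
        filenames = [filenames]
    chunks = []
    for fname in filenames:
        tree = uproot.open(fname)[treename]
        data = tree.arrays(branches, entrystop=stop, namedecode='utf-8')
        cols = [np.asarray(data[b]) for b in branches]
        chunks.append(np.rec.fromarrays(cols, names=branches))
    return np.concatenate(chunks)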
Example #16
 def readFromRootFile(self,filename,TupleMeanStd, weighter):
 
     # this function defines how to convert the root ntuple to the training format
     # options are not yet described here
     
     import ROOT
     fileTimeOut(filename,120) #give eos a minute to recover
     rfile = ROOT.TFile(filename)
     tree = rfile.Get(self.treename)
     self.nsamples=tree.GetEntries()
     
     max_rechits = 40000
     
     # user code, example works with the example 2D images in root format generated by make_example_data
     from DeepJetCore.preprocessing import readListArray
     
     feature_array = readListArray(filename,
                                   self.treename,
                                   "rechit_features",
                                   self.nsamples,
                                   list_size=max_rechits, 
                                   n_feat_per_element=7,
                                   zeropad=True)
     
     
     energy_only = feature_array[:, :, 0:1]  # keep dimension
     
     
     fraction_array = readListArray(filename,
                                    self.treename,
                                    "simcluster_fractions",
                                    self.nsamples,
                                    list_size=max_rechits,
                                    n_feat_per_element=7,  # nsimcluster, right now just one, but zero-padded here
                                    zeropad=True)
     
     
     #needs the energy, too to determine weights
     fraction_array = numpy.concatenate([fraction_array,energy_only],axis=-1)
     #in case something was removed here
     self.nsamples=len(feature_array)
     
     self.x=[feature_array] 
     self.y=[fraction_array] # we need the features also in the truth part for weighting
     self.w=[] # no event weights
Example #17
    def convertFromSourceFile(self, filename, weighterobjects, istraining):

        from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles
        import numpy
        from DeepJetCore.stopwatch import stopwatch

        sw = stopwatch()
        swall = stopwatch()

        import ROOT

        fileTimeOut(filename, 120)  #give eos a minute to recover
        rfile = ROOT.TFile(filename)
        tree = rfile.Get("deepntuplizer/tree")
        self.nsamples = tree.GetEntries()

        print('took ', sw.getAndReset(), ' seconds for getting tree entries')

        # split for convolutional network

        x_global = MeanNormZeroPad(filename, None, ['x'], [1], self.nsamples)

        print('took ', sw.getAndReset(),
              ' seconds for mean norm and zero padding (C module)')

        Tuple = self.readTreeFromRootToTuple(
            filename, branches=['class1', 'class2', 'x'])

        truthtuple = Tuple[self.truthclasses]

        alltruth = self.reduceTruth(truthtuple)

        #print(x_global.shape,x_global[0:10])
        #print(alltruth.shape,alltruth[0:10])
        #print(alltruth.flags)

        newnsamp = x_global.shape[0]
        self.nsamples = newnsamp

        print(x_global.shape, alltruth.shape, self.nsamples)

        truth = SimpleArray(alltruth, name="truth")
        feat = SimpleArray(x_global, name="features0")

        return [feat], [truth], []
Example #18
 def readFromRootFile(self,filename,TupleMeanStd, weighter):
 
     # this function defines how to convert the root ntuple to the training format
     # options are not yet described here
     import numpy as np
     import ROOT
     fileTimeOut(filename,120) #give eos a minute to recover
     rfile = ROOT.TFile(filename)
     tree = rfile.Get("tree")
     self.nsamples=tree.GetEntries()
     
     
     # user code, example works with the example 2D images in root format generated by make_example_data
     from DeepJetCore.preprocessing import read2DArray,readListArray
     print(filename)
     feature_image = read2DArray(filename,"tree","image2d",self.nsamples,24,24)
     
     npy_array = self.readTreeFromRootToTuple(filename)
     scale   = np.expand_dims(npy_array['scale'],axis=1)
     xcenter = np.expand_dims(npy_array['xcenter'],axis=1)
     ycenter = np.expand_dims(npy_array['ycenter'],axis=1)
     ptype   = np.expand_dims(npy_array['type'],axis=1)
     
     print('ycenter',ycenter.shape)
     
     add_features = np.concatenate([scale,xcenter,ycenter,ptype],axis=1)
     
     
     xcoords = np.expand_dims(np.array(list(npy_array['xcoords']), dtype='float32'), axis=2)
     ycoords = np.expand_dims(np.array(list(npy_array['ycoords']), dtype='float32'), axis=2)
     xcoords = np.reshape(xcoords, newshape=[xcoords.shape[0], 24, 24, 1])
     ycoords = np.reshape(ycoords, newshape=[ycoords.shape[0], 24, 24, 1])
     
     print('xcoords',xcoords.shape)
     
     all_coords = np.concatenate([xcoords, ycoords], axis=-1)
     
     #readListArray(filename,"tree","frac_at_idxs",self.nsamples,4,1)
     
     alltruth = np.zeros(self.nsamples) + 1.  # this is real data
     
     self.x = [feature_image,all_coords,add_features] 
     self.y = [alltruth]
     self.w=[]
Example #19
    def readFromRootFile(self, filename, TupleMeanStd, weighter):
        # this function defines how to convert the root ntuple to the training format
        # options are not yet described here

        fileTimeOut(filename, 120)

        uproot_tree = uproot.open(filename)['clusters']

        def to_ndarray(*args):
            return numpy.squeeze(numpy.dstack(args))

        branches_template = [
            'bin_eta', 'bin_theta', 'bin_phi', 'bin_x', 'bin_y',
            'bin_eta_global', 'bin_theta_global', 'bin_phi_global',
            'bin_dist_global', 'bin_x_global', 'bin_y_global', 'bin_z_global',
            'bin_energy', 'bin_layer'
        ]
        branches = []
        for icell in range(2):
            branches.extend([b + ('_%d' % icell) for b in branches_template])

        feature_array = uproot_tree.arrays(branches, outputtype=to_ndarray)
        feature_array = numpy.reshape(feature_array, (-1, 5, 5, 38, 28))

        print("reading truth")
        truth = uproot_tree.arrays(self.truthclasses, outputtype=to_ndarray)

        Tuple = self.readTreeFromRootToTuple(filename)

        print("creating remove indxs")
        notremoves = weighter.createNotRemoveIndices(Tuple)

        # this removes parts of the dataset for weighting the events
        if self.remove:
            feature_array = feature_array[notremoves > 0]
            truth = truth[notremoves > 0]
        # call this in the end

        self.nsamples = len(feature_array)

        self.x = [feature_array]  # list of feature numpy arrays
        self.y = [truth]  # list of target numpy arrays (truth)
        self.w = []  # list of weight arrays. One for each truth target
Example #20
    def readFromRootFile(self, filename, TupleMeanStd, weighter):

        # this function defines how to convert the root ntuple to the training format
        # options are not yet described here
        from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles

        import ROOT

        fileTimeOut(filename, 120)  #give eos a minute to recover
        rfile = ROOT.TFile(filename)
        tree = rfile.Get("tree")
        self.nsamples = tree.GetEntries()

        npy_array = self.readTreeFromRootToTuple(filename)

        truthtuple = npy_array[self.truthclasses]

        alltruth = self.reduceTruth(truthtuple)
        alltruept = npy_array[self.regtruth]

        # user code
        x_global = MeanNormZeroPad(filename, None, [self.branches[0]],
                                   [self.branchcutoffs[0]], self.nsamples)

        x_cpf = MeanNormZeroPadParticles(filename, None, self.branches[1],
                                         self.branchcutoffs[1], self.nsamples)

        x_npf = MeanNormZeroPadParticles(filename, None, self.branches[2],
                                         self.branchcutoffs[2], self.nsamples)

        x_recopts = MeanNormZeroPad(filename, None, [self.branches[3]],
                                    [self.branchcutoffs[3]], self.nsamples)

        nold = self.nsamples

        self.x = [x_global, x_cpf, x_npf,
                  x_recopts]  # list of feature numpy arrays
        self.y = [alltruth, alltruept]  # list of target numpy arrays (truth)
        self.w = []  # list of weight arrays. One for each truth target
        self._normalize_input_(weighter, npy_array)

        print('reduced to ', self.nsamples, 'of', nold)
Example #21
    def readFromRootFile(self, filename, TupleMeanStd, weighter):

        from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles
        import numpy
        from DeepJetCore.stopwatch import stopwatch

        sw = stopwatch()
        swall = stopwatch()

        import ROOT

        fileTimeOut(filename, 120)  #give eos a minute to recover
        rfile = ROOT.TFile(filename)
        tree = rfile.Get("deepntuplizer/tree")
        self.nsamples = tree.GetEntries()

        print('took ', sw.getAndReset(), ' seconds for getting tree entries')

        # split for convolutional network

        x_global = MeanNormZeroPad(filename, None, [self.branches[0]],
                                   [self.branchcutoffs[0]], self.nsamples)

        print('took ', sw.getAndReset(),
              ' seconds for mean norm and zero padding (C module)')

        Tuple = self.readTreeFromRootToTuple(filename)

        truthtuple = Tuple[self.truthclasses]
        alltruth = self.reduceTruth(truthtuple)

        newnsamp = x_global.shape[0]
        print('reduced content to ',
              int(float(newnsamp) / float(self.nsamples) * 100), '%')
        self.nsamples = newnsamp

        print(x_global.shape, self.nsamples)

        self.w = []
        self.x = [x_global]
        self.y = [alltruth]
Example #22
    def createWeighterObjects(self, allsourcefiles):
        #
        # Calculates the weights needed for flattening the pt/eta spectrum

        from DeepJetCore.Weighter import Weighter
        weighter = Weighter()
        weighter.undefTruth = self.undefTruth
        weighter.class_weights = self.class_weights
        branches = [self.weightbranchX, self.weightbranchY]
        branches.extend(self.truth_branches)

        if self.remove:
            weighter.setBinningAndClasses([self.weight_binX, self.weight_binY],
                                          self.weightbranchX,
                                          self.weightbranchY,
                                          self.truth_branches,
                                          self.red_classes,
                                          self.truth_red_fusion,
                                          method=self.referenceclass)

        counter = 0
        import ROOT
        from root_numpy import tree2array, root2array
        if self.remove:
            for fname in allsourcefiles:
                fileTimeOut(fname, 120)
                nparray = root2array(fname,
                                     treename="deepntuplizer/tree",
                                     stop=None,
                                     branches=branches)
                norm_hist = True
                if self.referenceclass == 'flatten':
                    norm_hist = False
                weighter.addDistributions(nparray, norm_h=norm_hist)
                #del nparray
                counter = counter + 1
            weighter.createRemoveProbabilitiesAndWeights(self.referenceclass)
            #weighter.printHistos('/afs/cern.ch/user/a/ademoor/Flatten/') #If you need to print the 2D histo, choose your output dir
        return {'weigther': weighter}
Example #23
    def readFromRootFile(self, filename, TupleMeanStd, weighter):

        #the first part is standard, no changes needed
        from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles, ZeroPadParticles
        import numpy
        import ROOT

        fileTimeOut(filename, 120)  #give eos 2 minutes to recover
        rfile = ROOT.TFile(filename)
        tree = rfile.Get("deepntuplizer/tree")
        self.nsamples = tree.GetEntries()

        #the definition of what to do with the branches

        # those are the global branches (jet pt etc)
        # they should be just glued to each other in one vector
        # and zero padded (and mean subtracted and normalised)
        #x_global = MeanNormZeroPad(filename,TupleMeanStd,
        #                           [self.branches[0]],
        #                           [self.branchcutoffs[0]],self.nsamples)

        # the second part (the pf candidates) should be treated particle wise
        # an array with (njets, nparticles, nproperties) is created

        x_glb = ZeroPadParticles(filename, TupleMeanStd, self.branches[0],
                                 self.branchcutoffs[0], self.nsamples)

        x_db = MeanNormZeroPadParticles(filename, TupleMeanStd,
                                        self.branches[1],
                                        self.branchcutoffs[1], self.nsamples)

        x_db_raw = ZeroPadParticles(filename, TupleMeanStd, self.branches[1],
                                    self.branchcutoffs[1], self.nsamples)

        x_cpf = MeanNormZeroPadParticles(filename, TupleMeanStd,
                                         self.branches[2],
                                         self.branchcutoffs[2], self.nsamples)

        x_sv = MeanNormZeroPadParticles(filename, TupleMeanStd,
                                        self.branches[3],
                                        self.branchcutoffs[3], self.nsamples)

        # now, some jets are removed to avoid pt and eta biases

        Tuple = self.readTreeFromRootToTuple(filename)
        if self.remove:
            # jets are removed until the shapes in eta and pt are the same as
            # the truth class 'fj_isNonBB'
            notremoves = weighter.createNotRemoveIndices(Tuple)
            #undef=Tuple[self.undefTruth]
            #notremoves-=undef

        if self.weight:
            weights = weighter.getJetWeights(Tuple)
        elif self.remove:
            weights = notremoves  #weighter.createNotRemoveIndices(Tuple)
        else:
            print('neither remove nor weight')
            weights = numpy.empty(self.nsamples)
            weights.fill(1.)

        truthtuple = Tuple[self.truthclasses]
        alltruth = self.reduceTruth(Tuple)
        undef = numpy.sum(alltruth, axis=1)

        if self.weight or self.remove:
            print('Training samples, remove undefined')
            weights = weights[undef > 0]
            x_glb = x_glb[undef > 0]
            x_db = x_db[undef > 0]
            x_db_raw = x_db_raw[undef > 0]
            x_sv = x_sv[undef > 0]
            x_cpf = x_cpf[undef > 0]
            alltruth = alltruth[undef > 0]

        if self.remove:
            print('Removing to match weighting')
            notremoves = notremoves[undef > 0]
            weights = weights[notremoves > 0]
            x_glb = x_glb[notremoves > 0]
            x_db = x_db[notremoves > 0]
            x_db_raw = x_db_raw[notremoves > 0]
            x_sv = x_sv[notremoves > 0]
            x_cpf = x_cpf[notremoves > 0]
            alltruth = alltruth[notremoves > 0]

        if self.weight:
            print('Adding weights, removing events with 0 weight')
            x_glb = x_glb[weights > 0]
            x_db = x_db[weights > 0]
            x_db_raw = x_db_raw[weights > 0]
            x_sv = x_sv[weights > 0]
            x_cpf = x_cpf[weights > 0]
            alltruth = alltruth[weights > 0]
            # Weights get adjusted last so they can be used as an index
            weights = weights[weights > 0]

        newnsamp = x_glb.shape[0]
        print('Keeping {}% of input events in the training dataCollection'.
              format(int(float(newnsamp) / float(self.nsamples) * 100)))
        self.nsamples = newnsamp

        #print("Subsample composition:")
        #for lab in ['fJ_isQCD', 'fj_isH', 'fj_isCC', 'fj_isBB']:
        #	print(numpy.sum((Tuple[lab].view(numpy.ndarray))), lab)
        #for lab, stat in zip(self.reducedtruthclasses, stats):
        #	print(lab, ': {}%'.format(stat))

        # fill everything
        self.w = [weights]
        self.x = [x_db, x_cpf, x_sv]
        self.z = [x_glb, x_db_raw]
        self.y = [alltruth]
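
The comment "Weights get adjusted last so they can be used as an index" deserves spelling out: every other array must be filtered with the weight mask before the weights themselves are shrunk, otherwise the mask no longer matches the array lengths. A standalone illustration:

import numpy as np

weights = np.array([0.0, 1.5, 0.0, 2.0])
x = np.arange(4)
x = x[weights > 0]               # filter the data first ...
weights = weights[weights > 0]   # ... and shrink the weights last
print(x, weights)                # -> [1 3] [1.5 2. ]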
Example #24
    def readFromRootFile(self, filename, TupleMeanStd, weighter):

        #the first part is standard, no changes needed
        from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles, ZeroPadParticles
        import numpy
        import ROOT

        fileTimeOut(filename, 120)  #give eos 2 minutes to recover
        rfile = ROOT.TFile(filename)
        tree = rfile.Get("deepntuplizer/tree")
        self.nsamples = tree.GetEntries()

        x_glb = ZeroPadParticles(filename, TupleMeanStd, self.branches[0],
                                 self.branchcutoffs[0], self.nsamples)

        x_db = MeanNormZeroPadParticles(filename, TupleMeanStd,
                                        self.branches[1],
                                        self.branchcutoffs[1], self.nsamples)

        # now, some jets are removed to avoid pt and eta biases

        Tuple = self.readTreeFromRootToTuple(filename)
        #if self.remove:
        # jets are removed until the shapes in eta and pt are the same as
        # the truth class 'fj_isNonBB'
        notremoves = weighter.createNotRemoveIndices(Tuple)
        if self.weight:
            weights = weighter.getJetWeights(Tuple)
        elif self.remove:
            weights = notremoves
        else:
            print('neither remove nor weight')
            weights = numpy.empty(self.nsamples)
            weights.fill(1.)

        # create all collections:
        #truthtuple =  Tuple[self.truthclasses]
        alltruth = self.reduceTruth(Tuple)
        undef = numpy.sum(alltruth, axis=1)
        #weights=weights[undef > 0]
        #x_glb=x_glb[undef > 0]
        #x_db=x_db[undef > 0]
        #alltruth=alltruth[undef > 0]
        # disabled like the rest of the undef filtering above: shrinking only
        # notremoves would break the boolean indexing of the unfiltered arrays below
        #notremoves = notremoves[undef > 0]

        undef = Tuple['fj_isNonCC'] * Tuple['sample_isQCD'] * Tuple[
            'fj_isQCD'] + Tuple['fj_isCC'] * Tuple['fj_isH']

        # remove the entries to get same jet shapes
        if self.remove:
            print('remove')
            weights = weights[notremoves > 0]
            x_glb = x_glb[notremoves > 0]
            x_db = x_db[notremoves > 0]
            alltruth = alltruth[notremoves > 0]

        newnsamp = x_glb.shape[0]
        print('reduced content to ',
              int(float(newnsamp) / float(self.nsamples) * 100), '%')
        self.nsamples = newnsamp

        # fill everything
        self.w = [weights]
        self.x = [x_db]
        self.z = [x_glb]
        self.y = [alltruth]
Example #25
    def convertFromSourceFile(self, filename, weighterobjects, istraining):
        # This is the only really mandatory function (unless writeFromSourceFile is defined).
        # It defines the conversion rule from an input source file to the lists of training
        # arrays self.x, self.y, self.w
        #  self.x is a list of input feature arrays
        #  self.y is a list of truth arrays
        #  self.w is optional and can contain a weight array
        #         (needs to have same number of entries as truth array)
        #         If no weights are needed, this can be left completely empty
        #
        # The conversion should convert finally to numpy arrays. In the future,
        # also tensorflow tensors will be supported.
        #
        # In this example, different ways of reading files are deliberately mixed
        #

        print('reading ' + filename)

        import ROOT
        fileTimeOut(filename, 120)  #give eos a minute to recover
        rfile = ROOT.TFile(filename)
        tree = rfile.Get("tree")
        self.nsamples = tree.GetEntries()

        # user code, example works with the example 2D images in root format generated by make_example_data
        #from DeepJetCore.preprocessing import read2DArray

        #feature_array = read2DArray(filename,"tree","image2d",self.nsamples,32,32)
        #print('feature_array',feature_array.shape)

        import uproot3 as uproot

        urfile = uproot.open(filename)["tree"]
        truth = np.concatenate([
            np.expand_dims(urfile.array("lep_isPromptId_Training"), axis=1),
            np.expand_dims(urfile.array("lep_isNonPromptId_Training"), axis=1),
            np.expand_dims(urfile.array("lep_isFakeId_Training"), axis=1)
        ],
                               axis=1)

        truth = truth.astype(dtype='float32',
                             order='C')  #important, float32 and C-type!

        self.global_branches = [
            'lep_pt',
            'lep_eta',
            'lep_phi',
            'lep_mediumId',
            'lep_miniPFRelIso_all',
            'lep_pfRelIso03_all',
            'lep_sip3d',
            'lep_dxy',
            'lep_dz',
            'lep_charge',
            'lep_dxyErr',
            'lep_dzErr',
            'lep_ip3d',
            'lep_jetPtRelv2',
            'lep_jetRelIso',
            'lep_miniPFRelIso_chg',
            'lep_mvaLowPt',
            'lep_nStations',
            'lep_nTrackerLayers',
            'lep_pfRelIso03_all',
            'lep_pfRelIso03_chg',
            'lep_pfRelIso04_all',
            'lep_ptErr',
            'lep_segmentComp',
            'lep_tkRelIso',
            'lep_tunepRelPt',
        ]

        self.pfCand_neutral_branches = [
            'pfCand_neutral_eta',
            'pfCand_neutral_phi',
            'pfCand_neutral_pt',
            'pfCand_neutral_puppiWeight',
            'pfCand_neutral_puppiWeightNoLep',
            'pfCand_neutral_ptRel',
            'pfCand_neutral_deltaR',
        ]
        self.npfCand_neutral = 5

        ## works:
        #x_global = np.concatenate([np.expand_dims(urfile.array(var), axis=1) for var in self.global_branches], axis=1)
        #x_global = x_global.astype(dtype='float32', order='C') #important, float32 and C-type!
        #self.nsamples=len(x_global)

        from DeepJetCore.preprocessing import MeanNormZeroPad, MeanNormZeroPadParticles
        x_global = MeanNormZeroPad(filename, None, [self.global_branches], [1],
                                   self.nsamples)

        x_pfCand_neutral = MeanNormZeroPadParticles(
            filename, None, self.pfCand_neutral_branches, self.npfCand_neutral,
            self.nsamples)

        x_global = x_global.astype(dtype='float32', order='C')
        x_pfCand_neutral = x_pfCand_neutral.astype(dtype='float32', order='C')

        #returns a list of feature arrays, a list of truth arrays and a list of weight arrays
        return [x_global, x_pfCand_neutral], [truth], []
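
Conceptually, MeanNormZeroPadParticles produces a fixed-size (nsamples, nparticles, nfeatures) block from variable-length per-event particle lists (with optional mean normalisation). A standalone sketch of just the truncate-and-zero-pad step, for illustration only:

import numpy as np

events = [np.array([[1.0, 0.5], [2.0, 0.1]]),  # 2 particles, 2 features
          np.array([[0.3, 0.9]])]              # 1 particle
nparticles, nfeatures = 5, 2
out = np.zeros((len(events), nparticles, nfeatures), dtype='float32')
for i, ev in enumerate(events):
    n = min(len(ev), nparticles)   # truncate long lists, zero-pad short ones
    out[i, :n] = ev[:n]
print(out.shape)  # (2, 5, 2)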
Example #26
    def _convertFromSourceFile(self, filename, weighterobjects, istraining):

        fileTimeOut(filename, 10)  #10 seconds for eos to recover

        tree = uproot.open(filename)["B4"]
        nevents = tree.numentries

        #truth
        isElectron = self.tonumpy(tree["isElectron"].array())
        isGamma = self.tonumpy(tree["isGamma"].array())
        isPositron = self.tonumpy(tree["isPositron"].array())
        true_energy = self.tonumpy(tree["true_energy"].array())
        true_x = self.tonumpy(tree["true_x"].array())
        true_y = self.tonumpy(tree["true_y"].array())

        rechit_energy = self.tonumpy(tree["rechit_energy"].array())
        rechit_x = self.tonumpy(tree["rechit_x"].array())
        rechit_y = self.tonumpy(tree["rechit_y"].array())
        rechit_z = self.tonumpy(tree["rechit_z"].array())
        rechit_layer = self.tonumpy(tree["rechit_layer"].array())
        rechit_detid = self.tonumpy(tree['rechit_detid'].array())

        #print('rechit_energy',rechit_energy,rechit_energy.shape)
        #print(rechit_detid)

        #for ...
        feat, truth, layers, npart_arr, truth_en = self.mergeShowers(
            isElectron,
            isGamma,
            isPositron,
            true_energy,
            true_x,
            true_y,
            rechit_energy,
            rechit_x,
            rechit_y,
            rechit_z,
            rechit_layer,
            rechit_detid,
            maxpart=self.npart,
            istraining=istraining)

        print('feat', feat.shape)

        calo, track = self.separateLayers(feat, layers)

        calo = np.reshape(calo, [truth.shape[0], 16, 16, -1])
        track = np.reshape(track, [truth.shape[0], 64, 64, -1])

        #print(calo[0,:,:,1:3])
        #this needs to be rebinned in x and y
        #calosort = np.argsort(calo[:,:,1]*100+calo[:,:,2], axis=-1)
        #calo = calo[calosort]
        #calo = np.reshape(calo, [truth.shape[0],16,16,-1])

        debug = False
        if debug:
            import matplotlib.pyplot as plt
            calotruth = truth[:, 0:16 * 16, :]
            calotruth = np.reshape(calotruth, [truth.shape[0], 16, 16, -1])
            tracktruth = truth[:, 16 * 16:, :]
            tracktruth = np.reshape(tracktruth, [truth.shape[0], 64, 64, -1])
            for event in range(10):
                #print truth index and rec energy
                fig, ax = plt.subplots(1, 1)
                ax.imshow(calotruth[event, :, :, 6], aspect=1)
                fig.savefig("calo_idx" + str(event) + ".pdf")
                ax.imshow(calo[event, :, :, 0], aspect=1)
                fig.savefig("calo_en" + str(event) + ".pdf")
                ax.imshow(calotruth[event, :, :, 0], aspect=1)
                fig.savefig("calo_tmask" + str(event) + ".pdf")

                ax.imshow(tracktruth[event, :, :, 6], aspect=1)
                fig.savefig("tracktruth_idx" + str(event) + ".pdf")
                ax.imshow(track[event, :, :, 0], aspect=1)
                fig.savefig("track_en" + str(event) + ".pdf")
                ax.imshow(tracktruth[event, :, :, 0], aspect=1)
                fig.savefig("tracktruth_tmask" + str(event) + ".pdf")

        print('calo', calo.shape)
        print('track', track.shape)
        print('truth', truth.shape)

        #np.set_printoptions(threshold=10*1280)

        #print('calo',calo[0])
        #print('track',track[0])

        if hasattr(self, "truth_en"):
            self.truth_en = truth_en

        return [calo, track], [truth], [
        ]  #[tracker0, tracker1, tracker2, tracker3, calo] , [trutharray], []
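
self.tonumpy here (and the free function tonumpy in Example #6) is assumed to turn an uproot3 jagged array whose rows all have the same length into a regular 2D numpy array. A minimal sketch under that assumption, not the original helper:

import numpy as np

def tonumpy(jagged):
    # convert a jagged array with equal-length rows into a plain
    # (nevents, nhits) float32 array
    counts = np.asarray(jagged.counts)
    assert counts.min() == counts.max(), "expects equal-length rows"
    return np.asarray(jagged.content, dtype='float32').reshape(len(counts), -1)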
Example #27
    def base_convertFromSourceFile(self,
                                   filename,
                                   weighterobjects,
                                   istraining,
                                   treename="Events",
                                   removeTracks=True):

        fileTimeOut(filename, 10)  #10 seconds for eos to recover
        tree = uproot.open(filename)[treename]

        hits = "RecHitHGC"
        front_face_z = 323  #this needs to be more precise

        recHitZUnsplit = self.hitObservable(tree,
                                            hits,
                                            "z",
                                            split=False,
                                            flatten=False)
        self.setSplitIdx(recHitZUnsplit < 0)
        recHitZ = self.splitJaggedArray(recHitZUnsplit)
        offsets = recHitZ.offsets

        recHitX = self.hitObservable(tree,
                                     hits,
                                     "x",
                                     split=True,
                                     flatten=False)
        recHitY = self.hitObservable(tree,
                                     hits,
                                     "y",
                                     split=True,
                                     flatten=False)
        recHitSimClusIdx = self.hitObservable(tree,
                                              hits,
                                              "BestMergedSimClusterIdx",
                                              split=True,
                                              flatten=False)

        #Define spectators
        recHit_df_events = [
            pd.DataFrame({
                "recHitX": recHitX[i],
                "recHitY": recHitY[i],
                "recHitZ": recHitZ[i],
                "recHitSimClusIdx": recHitSimClusIdx[i]
            }) for i in range(recHitX.shape[0])
        ]
        for ievent in range(len(recHit_df_events)):
            df_event = recHit_df_events[ievent]
            unique_shower_idx = np.unique(df_event['recHitSimClusIdx'])
            df_event['spectator_distance'] = 0  #
            df_event['recHitSimClus_nHits'] = df_event.groupby(
                'recHitSimClusIdx'
            ).recHitX.transform(
                len
            )  #adding number of rec hits that are associated to this truth cluster
            for idx in unique_shower_idx:
                df_shower = df_event[df_event['recHitSimClusIdx'] == idx]
                x_to_fit = df_shower[['recHitX', 'recHitY',
                                      'recHitZ']].to_numpy()
                spectators_shower_dist = find_pcas(x_to_fit,
                                                   PCA_n=2,
                                                   min_hits=10)
                if (spectators_shower_dist is not None):
                    spectators_idx = (df_shower.index.tolist())
                    df_event.loc[spectators_idx,
                                 'spectator_distance'] = spectators_shower_dist
                del df_shower
            del df_event

        #Expand back
        recHitX = np.expand_dims(recHitX.content, axis=1)
        recHitY = np.expand_dims(recHitY.content, axis=1)
        recHitZ = np.expand_dims(recHitZ.content, axis=1)
        recHitSpectatorFlag = np.concatenate(
            np.array([
                recHit_df_events[i]['spectator_distance'].to_numpy()
                for i in range(len(recHit_df_events))
            ],
                     dtype=object)).reshape(-1, 1)
        recHitSimClusterNumHits = np.concatenate(
            np.array([
                recHit_df_events[i]['recHitSimClus_nHits'].to_numpy()
                for i in range(len(recHit_df_events))
            ],
                     dtype=object)).reshape(-1, 1)  #number of rec hits
        del recHit_df_events

        recHitEnergy = self.hitObservable(tree, hits, "energy")
        recHitDetaId = self.hitObservable(tree, hits, "detId")
        recHitTime = self.hitObservable(tree, hits, "time")
        recHitR = np.sqrt(recHitX * recHitX + recHitY * recHitY +
                          recHitZ * recHitZ)
        recHitTheta = np.arccos(recHitZ / recHitR)
        recHitEta = -np.log(np.tan(recHitTheta / 2))

        # Don't split this until the end, so it can be used to index the truth arrays
        recHitSimClusIdx = self.hitObservable(tree,
                                              hits,
                                              "BestMergedSimClusterIdx",
                                              split=False,
                                              flatten=False)

        simClusterDepEnergy = tree["MergedSimCluster_recEnergy"].array()
        simClusterEnergy = tree["MergedSimCluster_boundaryEnergy"].array()
        simClusterEnergyNoMu = tree[
            "MergedSimCluster_boundaryEnergyNoMu"].array()
        simClusterNumHits = tree["MergedSimCluster_nHits"].array()  # number of sim hits

        # Remove muon energy, add back muon deposited energy
        unmergedId = tree["SimCluster_pdgId"].array()
        unmergedDepE = tree["SimCluster_recEnergy"].array()
        unmergedMatchIdx = tree["MergedSimCluster_SimCluster_MatchIdx"].array()
        unmergedMatches = tree["MergedSimCluster_SimClusterNumMatch"].array()
        unmergedDepEMuOnly = unmergedDepE
        unmergedDepEMuOnly[np.abs(unmergedId) != 13] = 0
        # Add another layer of nesting, then sum over all unmerged associated to merged
        unmergedDepEMuOnly = ak.JaggedArray.fromcounts(
            unmergedMatches.counts,
            ak.JaggedArray.fromcounts(
                unmergedMatches.content,
                unmergedDepEMuOnly[unmergedMatchIdx].flatten()))
        depEMuOnly = unmergedDepEMuOnly.sum()
        # Open question: could all of the above be replaced by
        # simClusterEnergy[simClusterPdgId == 13] = simClusterDepEnergy ?

        simClusterEnergyMuCorr = simClusterEnergyNoMu + depEMuOnly
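        # Toy illustration of the re-nesting + sum above (a sketch on made-up
        # numbers, using the same ak.JaggedArray API): three unmerged deposits
        # in one event, matched to two merged clusters with multiplicities [2, 1]
        _mult = np.array([2, 1])        # unmerged clusters per merged cluster
        _dep = np.array([5., 7., 11.])  # flat unmerged deposits
        _toy = ak.JaggedArray.fromcounts(
            np.array([2]), ak.JaggedArray.fromcounts(_mult, _dep))
        # _toy.sum() evaluates to [[12. 11.]]: one summed deposit per merged cluster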

        simClusterX = tree["MergedSimCluster_impactPoint_x"].array()
        simClusterY = tree["MergedSimCluster_impactPoint_y"].array()
        simClusterZ = tree["MergedSimCluster_impactPoint_z"].array()
        simClusterTime = tree["MergedSimCluster_impactPoint_t"].array()
        simClusterEta = tree["MergedSimCluster_impactPoint_eta"].array()
        simClusterPhi = tree["MergedSimCluster_impactPoint_phi"].array()
        simClusterPdgId = tree["MergedSimCluster_pdgId"].array()

        # Mark simclusters outside of volume or with very few hits as noise
        # Maybe not a good idea if the merged SC pdgId is screwed up
        # Probably removing neutrons is a good idea though
        #noNeutrons = simClusterPdgId[recHitSimClusIdx] == 2112

        # filter out clusters at non-boundary positions (flagged upstream via isTrainable)
        goodSimClus = tree["MergedSimCluster_isTrainable"].array()
        # Don't split by index here to keep same dimensions as SimClusIdx
        markNoise = self.truthObjects(~goodSimClus,
                                      recHitSimClusIdx,
                                      False,
                                      split=False,
                                      flatten=False).astype(np.bool_)

        nbefore = (recHitSimClusIdx < 0).sum().sum()
        recHitSimClusIdx[markNoise] = -1
        nafter = (recHitSimClusIdx < 0).sum().sum()

        print("Number of noise hits before", nbefore, "after", nafter)
        print('removed another factor of', nafter / nbefore,
              ' bad simclusters')

        recHitTruthPID = self.truthObjects(simClusterPdgId, recHitSimClusIdx,
                                           0.)
        recHitTruthDepEnergy = self.truthObjects(simClusterDepEnergy,
                                                 recHitSimClusIdx, 0)
        recHitTruthEnergy = self.truthObjects(simClusterEnergy,
                                              recHitSimClusIdx, 0)
        recHitTruthEnergyCorrMu = self.truthObjects(simClusterEnergyMuCorr,
                                                    recHitSimClusIdx, 0)

        low_energy_shower_cutoff = 3
        # For showers below the cutoff, fall back to the deposited energy
        # (the uncorrected truth energy is currently not used)
        recHitTruthEnergy = np.where(
            recHitTruthEnergyCorrMu > low_energy_shower_cutoff,
            recHitTruthEnergyCorrMu, recHitTruthDepEnergy)

        # Misleading names: these quantities are associated to merged clusters, not hits
        recHitTruthX = self.truthObjects(simClusterX, recHitSimClusIdx, 0)
        recHitTruthY = self.truthObjects(simClusterY, recHitSimClusIdx, 0)
        recHitTruthZ = self.truthObjects(simClusterZ, recHitSimClusIdx, 0)
        recHitTruthTime = self.truthObjects(simClusterTime, recHitSimClusIdx,
                                            0)
        recHitTruthR = np.sqrt(recHitTruthX * recHitTruthX +
                               recHitTruthY * recHitTruthY +
                               recHitTruthZ * recHitTruthZ)
        recHitTruthTheta = np.arccos(
            np.divide(recHitTruthZ,
                      recHitTruthR,
                      out=np.zeros_like(recHitTruthZ),
                      where=recHitTruthR != 0))
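        # np.divide with out=/where= (as above) avoids divide-by-zero warnings:
        # masked entries keep the value supplied via out=, e.g.
        # np.divide(np.array([1., 3.]), np.array([2., 0.]),
        #           out=np.zeros(2), where=np.array([True, False])) -> [0.5, 0.]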
        recHitTruthPhi = self.truthObjects(simClusterPhi, recHitSimClusIdx, 0)
        recHitTruthEta = self.truthObjects(simClusterEta, recHitSimClusIdx, 0)
        #recHitAverageEnergy = self.truthObjects(simClusterDepEnergy/simClusterNumHits, recHitSimClusIdx, 0) # not ideal: simClusterNumHits counts sim hits, not reco hits
        recHitAverageEnergy = recHitTruthDepEnergy / recHitSimClusterNumHits


        # Placeholder
        zeroFeature = np.zeros(shape=(len(recHitEnergy), 1), dtype='float32')

        features = np.concatenate(
            [
                recHitEnergy,
                recHitEta,
                zeroFeature,  #indicator if it is track or not
                recHitTheta,
                recHitR,
                recHitX,
                recHitY,
                recHitZ,
                recHitTime,
            ],
            axis=1)

        farr = SimpleArray(name="recHitFeatures")
        farr.createFromNumpy(features, offsets)
        del features

        recHitSimClusIdx = np.expand_dims(
            self.splitJaggedArray(recHitSimClusIdx).content.astype(np.int32),
            axis=1)

        print('noise', (100 * np.count_nonzero(recHitSimClusIdx < 0)) //
              recHitSimClusIdx.shape[0], '% of hits')
        print('truth eta min max',
              np.min(np.abs(recHitTruthEta[recHitSimClusIdx >= 0])),
              np.max(np.abs(recHitTruthEta[recHitSimClusIdx >= 0])))
        print(
            'non-boundary truth positions',
            np.count_nonzero(
                np.abs(np.abs(recHitTruthZ[recHitSimClusIdx >= 0]) - 320) > 5)
            / recHitTruthZ[recHitSimClusIdx >= 0].shape[0])

        #now all numpy
        # For noise hits (index -1) the truth is undefined, so fall back to the reco values
        recHitTruthX[recHitSimClusIdx < 0] = recHitX[recHitSimClusIdx < 0]
        recHitTruthY[recHitSimClusIdx < 0] = recHitY[recHitSimClusIdx < 0]
        recHitTruthZ[recHitSimClusIdx < 0] = recHitZ[recHitSimClusIdx < 0]
        recHitTruthEnergyCorrMu[recHitSimClusIdx < 0] = recHitEnergy[
            recHitSimClusIdx < 0]
        recHitTruthTime[recHitSimClusIdx < 0] = recHitTime[
            recHitSimClusIdx < 0]


        truth = np.concatenate(
            [
                np.array(recHitSimClusIdx, dtype='float32'),  # 0
                recHitTruthEnergyCorrMu,
                recHitTruthX,
                recHitTruthY,
                recHitTruthZ,  #4
                zeroFeature,  #truthHitAssignedDirX,
                zeroFeature,  #6
                zeroFeature,
                recHitTruthEta,
                recHitTruthPhi,
                recHitTruthTime,  #10
                zeroFeature,
                zeroFeature,
                recHitTruthDepEnergy,  #13
                zeroFeature,  #14
                zeroFeature,  #15
                recHitTruthPID,  #16 - 16+n_classes #won't be used anymore
                np.array(recHitSpectatorFlag, dtype='float32'),
                np.where(recHitTruthZ < front_face_z, 1., 0.).astype('float32')
            ],
            axis=1)

        t_idxarr = SimpleArray(recHitSimClusIdx,
                               offsets,
                               name="recHitTruthClusterIdx")

        t_energyarr = SimpleArray(name="recHitTruthEnergy")
        t_energyarr.createFromNumpy(recHitTruthEnergyCorrMu, offsets)

        t_posarr = SimpleArray(name="recHitTruthPosition")
        t_posarr.createFromNumpy(
            np.concatenate([recHitTruthX, recHitTruthY], axis=-1), offsets)

        t_time = SimpleArray(name="recHitTruthTime")
        t_time.createFromNumpy(recHitTruthTime, offsets)

        t_pid = SimpleArray(name="recHitTruthID")
        t_pid.createFromNumpy(recHitTruthPID, offsets)

        # note: this name is inconsistent with the corresponding truth-array entry
        t_spectator = SimpleArray(name="recHitSpectatorFlag")
        t_spectator.createFromNumpy(recHitSpectatorFlag.astype('float32'),
                                    offsets)

        t_fully_contained = SimpleArray(name="recHitFullyContainedFlag")
        t_fully_contained.createFromNumpy(
            np.where(recHitTruthZ < front_face_z, 1., 0.).astype('float32'),
            offsets)

        #remaining truth is mostly for consistency in the plotting tools
        t_rest = SimpleArray(name="recHitTruth")
        t_rest.createFromNumpy(truth, offsets)

        return [
            farr, t_idxarr, t_energyarr, t_posarr, t_time, t_pid, t_spectator,
            t_fully_contained
        ], [t_rest], []
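
The SimpleArray pattern used above packs a flat per-hit numpy array together with cumulative per-event row splits. A minimal sketch, assuming DeepJetCore is installed, exposes SimpleArray at package level, and uses the usual [0, n_0, n_0+n_1, ...] row-split convention (all assumptions here, not verified against a specific version):

    import numpy as np
    from DeepJetCore import SimpleArray

    feats = np.random.rand(7, 3).astype('float32')  # 7 hits, 3 features each
    offsets = np.array([0, 4, 7], dtype=np.int64)   # two events: 4 + 3 hits
    sa = SimpleArray(name="demoFeatures")
    sa.createFromNumpy(feats, offsets)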
Example No. 28
    def convertFromSourceFile(self, filename, weighterobjects, istraining):

        # Function to produce the numpy training arrays from root files

        from DeepJetCore.Weighter import Weighter
        from DeepJetCore.stopwatch import stopwatch
        sw = stopwatch()
        swall = stopwatch()
        if not istraining:
            self.remove = False

        def reduceTruth(uproot_arrays):

            b = uproot_arrays[b'isB']

            bb = uproot_arrays[b'isBB']
            gbb = uproot_arrays[b'isGBB']

            bl = uproot_arrays[b'isLeptonicB']
            blc = uproot_arrays[b'isLeptonicB_C']
            lepb = bl + blc

            c = uproot_arrays[b'isC']
            cc = uproot_arrays[b'isCC']
            gcc = uproot_arrays[b'isGCC']

            ud = uproot_arrays[b'isUD']
            s = uproot_arrays[b'isS']
            uds = ud + s

            g = uproot_arrays[b'isG']

            return np.vstack(
                (b + lepb, bb + gbb, c + cc + gcc, uds + g)).transpose()
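        # reduceTruth collapses the flavour flags into four columns:
        # (b incl. leptonic b, bb, c incl. cc, light quarks + gluon)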

        print('reading ' + filename)

        import ROOT
        from root_numpy import tree2array, root2array
        fileTimeOut(filename, 120)  #give eos two minutes to recover
        rfile = ROOT.TFile(filename)
        tree = rfile.Get("deepntuplizer/tree")
        self.nsamples = tree.GetEntries()
        from DeepJetCore.preprocessing import MeanNormZeroPad, MeanNormZeroPadParticles
        x_global = MeanNormZeroPad(filename, weighterobjects['means'], [
            self.global_branches, self.track_branches, self.eta_rel_branches,
            self.vtx_branches
        ], [1, self.n_track, self.n_eta_rel, self.n_vtx], self.nsamples)

        import uproot3 as uproot
        urfile = uproot.open(filename)["deepntuplizer/tree"]
        truth_arrays = urfile.arrays(self.truth_branches)
        truth = reduceTruth(truth_arrays)
        truth = truth.astype(dtype='float32',
                             order='C')  #important, float32 and C-type!

        x_global = x_global.astype(dtype='float32', order='C')

        if self.remove:
            b = [self.weightbranchX, self.weightbranchY]
            b.extend(self.truth_branches)
            b.extend(self.undefTruth)
            fileTimeOut(filename, 120)
            for_remove = root2array(filename,
                                    treename="deepntuplizer/tree",
                                    stop=None,
                                    branches=b)
        notremoves = weighterobjects['weigther'].createNotRemoveIndices(
            for_remove)  # 'weigther' (sic): key as filled by createWeighterObjects
            undef = for_remove['isUndefined']
            notremoves -= undef
            print('took ', sw.getAndReset(), ' to create remove indices')

        if self.remove:
            print('remove')
            x_global = x_global[notremoves > 0]
            truth = truth[notremoves > 0]

        newnsamp = x_global.shape[0]
        print('reduced content to ',
              int(float(newnsamp) / float(self.nsamples) * 100), '%')

        print('remove nans')
        x_global = np.where(
            np.logical_and(np.isfinite(x_global),
                           (np.abs(x_global) < 100000.0)), x_global, 0)
        return [x_global], [truth], []
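
The remove-index bookkeeping above appears to work on float masks: the weighter returns 1 for jets to keep, subtracting the undefined flag pushes those entries to 0 or below, and the > 0 comparison selects the survivors. A toy version of the idiom (names here are illustrative only):

    import numpy as np

    notremoves = np.array([1., 1., 0., 1.])  # 1 = keep, as from the weighter
    undef = np.array([0., 1., 0., 0.])       # 1 = undefined truth label
    notremoves -= undef                      # undefined entries drop to <= 0
    x = np.arange(4)
    print(x[notremoves > 0])                 # -> [0 3]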
Example No. 29
    def readFromRootFile(self, filename, TupleMeanStd, weighter):
        from DeepJetCore.preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles
        import numpy
        from DeepJetCore.stopwatch import stopwatch

        sw = stopwatch()
        swall = stopwatch()

        import ROOT

        fileTimeOut(filename, 120)  #give eos two minutes to recover
        rfile = ROOT.TFile(filename)
        tree = rfile.Get("deepntuplizer/tree")
        self.nsamples = tree.GetEntries()

        print('took ', sw.getAndReset(), ' seconds for getting tree entries')

        # split for convolutional network

        x_global = MeanNormZeroPad(filename, TupleMeanStd, [self.branches[0]],
                                   [self.branchcutoffs[0]], self.nsamples)

        x_cpf = MeanNormZeroPadParticles(filename, TupleMeanStd,
                                         self.branches[1],
                                         self.branchcutoffs[1], self.nsamples)

        x_npf = MeanNormZeroPadParticles(filename, TupleMeanStd,
                                         self.branches[2],
                                         self.branchcutoffs[2], self.nsamples)

        x_sv = MeanNormZeroPadParticles(filename, TupleMeanStd,
                                        self.branches[3],
                                        self.branchcutoffs[3], self.nsamples)

        print('took ', sw.getAndReset(),
              ' seconds for mean norm and zero padding (C module)')

        Tuple = self.readTreeFromRootToTuple(filename)

        if self.remove:
            notremoves = weighter.createNotRemoveIndices(Tuple)
            undef = Tuple['isUndefined']
            notremoves -= undef
            print('took ', sw.getAndReset(), ' to create remove indices')

        if self.weight:
            weights = weighter.getJetWeights(Tuple)
        elif self.remove:
            weights = notremoves
        else:
            print('neither remove nor weight')
            weights = numpy.empty(self.nsamples)
            weights.fill(1.)

        truthtuple = Tuple[self.truthclasses]
        #print(self.truthclasses)
        alltruth = self.reduceTruth(truthtuple)

        #print(alltruth.shape)
        if self.remove:
            print('remove')
            weights = weights[notremoves > 0]
            x_global = x_global[notremoves > 0]
            x_cpf = x_cpf[notremoves > 0]
            x_npf = x_npf[notremoves > 0]
            x_sv = x_sv[notremoves > 0]
            alltruth = alltruth[notremoves > 0]

        newnsamp = x_global.shape[0]
        print('reduced content to ',
              int(float(newnsamp) / float(self.nsamples) * 100), '%')
        self.nsamples = newnsamp

        print(x_global.shape, self.nsamples)

        self.w = [weights]
        self.x = [x_global, x_cpf, x_npf, x_sv]
        self.y = [alltruth]
Example No. 30
    def convertFromSourceFile(self, filename, weighterobjects, istraining):

        # Function to produce the numpy training arrays from root files

        from DeepJetCore.Weighter import Weighter
        from DeepJetCore.stopwatch import stopwatch
        sw = stopwatch()
        swall = stopwatch()
        if not istraining:
            self.remove = False

        def reduceTruth(uproot_arrays):

            # uproot3 returns byte-string keys; map_prefix is defined elsewhere
            # in this module and is assumed to take and return str, so the
            # branch names are passed as str and encoded afterwards
            def key(name):
                return str.encode(map_prefix(name))

            b = uproot_arrays[key('Jet_isB')]

            bb = uproot_arrays[key('Jet_isBB')]
            gbb = uproot_arrays[key('Jet_isGBB')]

            bl = uproot_arrays[key('Jet_isLeptonicB')]
            blc = uproot_arrays[key('Jet_isLeptonicB_C')]
            lepb = bl + blc

            c = uproot_arrays[key('Jet_isC')]
            cc = uproot_arrays[key('Jet_isCC')]
            gcc = uproot_arrays[key('Jet_isGCC')]

            ud = uproot_arrays[key('Jet_isUD')]
            s = uproot_arrays[key('Jet_isS')]
            uds = ud + s

            g = uproot_arrays[key('Jet_isG')]

            return np.vstack(
                (b + lepb, bb + gbb, c + cc + gcc, uds + g)).transpose()

        print('reading ' + filename)

        import ROOT
        fileTimeOut(filename, 600)  #give eos ten minutes to recover
        rfile = ROOT.TFile(filename)
        tree = u3.open(filename)["ttree"]
        self.nsamples = tree.numentries
        print("Nsamples: {}".format(self.nsamples))

        from DeepJetCore.preprocessing import MeanNormZeroPad, MeanNormZeroPadParticles
        print("reading in with new uproot+awkward function")
        nparr = uproot_tree_to_numpy(
            filename,
            weighterobjects['means'], [
                self.global_branches, self.track_branches,
                self.eta_rel_branches, self.vtx_branches
            ], [1, self.n_track, self.n_eta_rel, self.n_vtx],
            self.nsamples,
            treename="ttree")
        print("succesfully created numpy array")
        x_global = nparr

        # previous implementation, kept for reference:
        # x_global = MeanNormZeroPad(filename, weighterobjects['means'],
        #     [self.global_branches, self.track_branches, self.eta_rel_branches, self.vtx_branches],
        #     [1, self.n_track, self.n_eta_rel, self.n_vtx], self.nsamples)

        print("opening file with uproot")
        import uproot3 as uproot
        urfile = uproot.open(filename)["ttree"]
        truth_arrays = urfile.arrays(self.truth_branches)
        print("truth_branches:")
        print(self.truth_branches)
        print("truth_arrays:")
        print(truth_arrays)
        truth = reduceTruth(truth_arrays)
        truth = truth.astype(dtype='float32',
                             order='C')  #important, float32 and C-type!

        x_global = x_global.astype(dtype='float32', order='C')

        if self.remove:
            b = [self.weightbranchX, self.weightbranchY]
            b.extend(self.truth_branches)
            b.extend(self.undefTruth)
            fileTimeOut(filename, 120)
            for_remove = uproot_root2array(filename,
                                           treename="ttree",
                                           stop=None,
                                           branches=b)
            notremoves = weighterobjects['weigther'].createNotRemoveIndices(
                for_remove)
            undef = for_remove['Jet_isUndefined']
            notremoves -= np.array(undef, dtype=np.float32)
            print('took ', sw.getAndReset(), ' to create remove indices')

        if self.remove:
            print('remove')
            x_global = x_global[notremoves > 0]
            truth = truth[notremoves > 0]

        newnsamp = x_global.shape[0]
        print('reduced content to ',
              int(float(newnsamp) / float(self.nsamples) * 100), '%')

        print('remove nans')
        x_global = np.where(
            np.logical_and(np.isfinite(x_global),
                           (np.abs(x_global) < 100000.0)), x_global, 0)
        return [x_global], [truth], []
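
The final cleaning step replaces anything non-finite or implausibly large with zero. A quick standalone demo of the idiom:

    import numpy as np

    x = np.array([1.0, np.nan, np.inf, 2.0e6, -3.0])
    cleaned = np.where(
        np.logical_and(np.isfinite(x), np.abs(x) < 100000.0), x, 0)
    print(cleaned)  # -> [ 1.  0.  0.  0. -3.]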