def createSamples(channel, analysis_dir, total_weight, server, add_data_cut=None, dataset='2017'):
    sample_dict = {}
    # print "creating samples from %s"%(analysis_dir)
    samples_all, samples_singlefake, samples_doublefake, samples_nonprompt, samples_mc, samples_data = createSampleLists(
        analysis_dir=analysis_dir,
        server=server,
        channel=channel,
        add_data_cut=add_data_cut,
        dataset=dataset)

    # select here the samples you wish to use
    # working_samples = samples_data_dde
    working_samples = samples_all
    working_samples = setSumWeights(working_samples)
    sample_dict['working_samples'] = working_samples

    print('')
    print('###########################################################')
    print('# %d samples to be used:' % (len(working_samples)))
    print('###########################################################')
    for sample in working_samples:
        print('{:<20}{:<20}'.format(*[sample.name, ('path: ' + sample.ana_dir)]))
    # for w in working_samples: print('{:<20}{:<20}'.format(*[w.name,('path: '+w.ana_dir+w.dir_name+'/'+tree_prod_name)]))
    return sample_dict
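# Hypothetical usage sketch for createSamples (not part of the original module):
# the argument values below are placeholders and must match the local setup.
# Note that total_weight is accepted by the signature but not used inside the
# function body as written.
#
#   sample_dict = createSamples(channel='mmm',
#                               analysis_dir='/path/to/production',
#                               total_weight='weight * lhe_weight',
#                               server='lxplus',
#                               dataset='2017')
#   working_samples = sample_dict['working_samples']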
def makeDataFrame(self):
    sample_dict = {}
    samples_all, samples_singlefake, samples_doublefake = createSampleLists(
        analysis_dir=self.analysis_dir, server=self.server, channel=self.channel)
    working_samples = samples_doublefake
    working_samples = setSumWeights(working_samples)

    print('###########################################################')
    print('# measuring doublefakerate...')
    print('# %d samples to be used:' % (len(working_samples)))
    print('###########################################################')
    for w in working_samples:
        print('{:<20}{:<20}'.format(*[w.name, ('path: ' + w.ana_dir)]))

    chain = TChain('tree')  # TChain'ing all data samples together
    for i, s in enumerate(working_samples):
        sample = working_samples[i]
        file_name = '/'.join([sample.ana_dir, sample.dir_name, sample.tree_prod_name, 'tree.root'])
        chain.Add(file_name)

    dataframe = RDataFrame(chain)
    weight = 'weight * lhe_weight'
    dataframe = dataframe.Define('w', weight)\
        .Define('ptCone', self.ptCone())\
        .Define('abs_hnl_hn_vis_eta', 'abs(hnl_hn_vis_eta)')\
        .Define('abs_hnl_hn_eta', 'abs(hnl_hn_eta)')\
        .Define('abs_l1_eta', 'abs(l1_eta)')\
        .Define('abs_l2_eta', 'abs(l2_eta)')\
        .Define('abs_l1_jet_flavour_parton', 'abs(l1_jet_flavour_parton)')\
        .Define('abs_l2_jet_flavour_parton', 'abs(l2_jet_flavour_parton)')
    return dataframe
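# Minimal sketch of how the RDataFrame returned by makeDataFrame could be used,
# assuming an instance of the enclosing class called 'dde' (placeholder name);
# the filter string and binning below are illustrative only:
#
#   df = dde.makeDataFrame()
#   h_ptCone = df.Filter('l1_pt > 5')\
#                .Histo1D(('h_ptCone', 'h_ptCone', 40, 0., 200.), 'ptCone', 'w')
#   h_ptCone.Draw()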
def makeCfgs(signalDict, channel, dataset, ana_dir, signals):
    if ('mmm' in channel) or ('mem' in channel): ch = 'mu'
    if ('eee' in channel) or ('eem' in channel): ch = 'e'
    samples = []
    Vs = [
        '0p00001',                 # v2 = 1em10
        '0p00001414213562',        # v2 = 2em10
        '0p00001732050808',        # v2 = 3em10
        '0p00002',                 # v2 = 4em10
        '0p000022360679774997898', # v2 = 5em10
        '0p00002449489743',        # v2 = 6em10
        '0p00002645751311',        # v2 = 7em10
        '0p00002828427125',        # v2 = 8em10
        '0p00003',                 # v2 = 9em10
        '0p000031622776601683795', # v2 = 1em09
        '0p00004472135955',        # v2 = 2em09
        '0p00005477225575',        # v2 = 3em09
        '0p0000632455532',         # v2 = 4em09
        '0p00007071067811865475',  # v2 = 5em09
        '0p00007745966692',        # v2 = 6em09
        '0p00008366600265',        # v2 = 7em09
        '0p0000894427191',         # v2 = 8em09
        '0p00009486832981',        # v2 = 9em09
        '0p0001',                  # v2 = 1em08
        '0p0001414213562',         # v2 = 2em08
        '0p0001732050808',         # v2 = 3em08
        '0p0002',                  # v2 = 4em08
        '0p00022360679774997898',  # v2 = 5em08
        '0p0002449489743',         # v2 = 6em08
        '0p0002645751311',         # v2 = 7em08
        '0p0002828427125',         # v2 = 8em08
        '0p0003',                  # v2 = 9em08
        '0p00031622776601683795',  # v2 = 1em07
        '0p0004472135955',         # v2 = 2em07
        '0p0005477225575',         # v2 = 3em07
        '0p000632455532',          # v2 = 4em07
        '0p0007071067811865475',   # v2 = 5em07
        '0p0007745966692',         # v2 = 6em07
        '0p0008366600265',         # v2 = 7em07
        '0p000894427191',          # v2 = 8em07
        '0p0009486832981',         # v2 = 9em07
        '0p001',                   # v2 = 1em06
        '0p001414213562',          # v2 = 2em06
        '0p001732050808',          # v2 = 3em06
        '0p002',                   # v2 = 4em06
        '0p0022360679774997898',   # v2 = 5em06
        '0p002449489743',          # v2 = 6em06
        '0p002645751311',          # v2 = 7em06
        '0p002828427125',          # v2 = 8em06
        '0p003',                   # v2 = 9em06
        '0p0031622776601683795',   # v2 = 1em05
        '0p004472135955',          # v2 = 2em05
        '0p005477225575',          # v2 = 3em05
        '0p00632455532',           # v2 = 4em05
        '0p007071067811865475',    # v2 = 5em05
        '0p007745966692',          # v2 = 6em05
        '0p008366600265',          # v2 = 7em05
        '0p00894427191',           # v2 = 8em05
        '0p009486832981',          # v2 = 9em05
        '0p01',                    # v2 = 1em04
        '0p01414213562',           # v2 = 2em04
        '0p01732050808',           # v2 = 3em04
        '0p02',                    # v2 = 4em04
        '0p022360679774997898',    # v2 = 5em04
        '0p02449489743',           # v2 = 6em04
        '0p02645751311',           # v2 = 7em04
        '0p02828427125',           # v2 = 8em04
        '0p03',                    # v2 = 9em04
        '0p031622776601683795',    # v2 = 1em03
        '0p04472135955',           # v2 = 2em03
        '0p05477225575',           # v2 = 3em03
        '0p0632455532',            # v2 = 4em03
        '0p07071067811865475',     # v2 = 5em03
        '0p07745966692',           # v2 = 6em03
        '0p08366600265',           # v2 = 7em03
        '0p0894427191',            # v2 = 8em03
        '0p09486832981',           # v2 = 9em03
        '0p1',                     # v2 = 1em02
        '0p1414213562',            # v2 = 2em02
        '0p1732050808',            # v2 = 3em02
        '0p2',                     # v2 = 4em02
        '0p22360679774997898',     # v2 = 5em02
        '0p2449489743',            # v2 = 6em02
        '0p2645751311',            # v2 = 7em02
        '0p2828427125',            # v2 = 8em02
        '0p3',                     # v2 = 9em02
        '0p31622776601683795',     # v2 = 1em01
        '0p4472135955',            # v2 = 2em01
        '0p5477225575',            # v2 = 3em01
        '0p632455532',             # v2 = 4em01
        '0p7071067811865475',      # v2 = 5em01
        '0p7745966692',            # v2 = 6em01
        '0p8366600265',            # v2 = 7em01
        '0p894427191',             # v2 = 8em01
        '0p9486832981',            # v2 = 9em01
    ]

    for mass in signalDict:
        if signalDict[mass] == {}:
            continue
        maxEntries = 0
        maxEntriesSampleKey = None
        for v in signalDict[mass]:
            entries = signalDict[mass][v]['count']
            if entries >= maxEntries:
                maxEntries = entries
                maxEntriesSampleKey = v
        try:
            subdir = signalDict[mass][maxEntriesSampleKey]['name']
        except:
            set_trace()

        for newV in Vs:
            name = 'HN3L_M_%s_V_%s_%s_massiveAndCKM_LO_reweighted' % (mass, newV, ch)
            sample = makeSample(name, subdir=subdir, signals=signals, dataset=dataset, channel=channel, analysis_dir=ana_dir)
            samples.append(sample)

    samples = setSumWeights(samples)
    return samples
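# Assumption / sketch only: the hard-coded Vs grid above appears to follow
# V = sqrt(v2) for v2 = n * 10^-k with n = 1..9 and k = 10..1, written with 'p'
# replacing the decimal point (e.g. v2 = 1e-06 -> '0p001'). The helper below
# reproduces that pattern approximately; it is NOT used by makeCfgs, and the
# generated strings do not match the hand-written truncations digit for digit,
# which is why the explicit list above remains the reference.
def makeVGrid():
    from math import sqrt
    # v2 values: 1e-10, 2e-10, ..., 9e-10, 1e-09, ..., 9e-01
    v2_grid = [n * 10**(-k) for k in range(10, 0, -1) for n in range(1, 10)]
    # format V = sqrt(v2) as a fixed-point string and swap '.' for 'p'
    return ['{:.12f}'.format(sqrt(v2)).rstrip('0').replace('.', 'p') for v2 in v2_grid]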
def createArrays(features,
                 branches,
                 path_to_NeuralNet,
                 faketype='DoubleFake',
                 channel='mmm',
                 multiprocess=True,
                 dataset='2017',
                 analysis_dir='/home/dehuazhu/SESSD/4_production/'):
    # define basic environmental parameters
    hostname = gethostname()
    channel = channel
    sample_dict = {}

    # call samples
    samples_all, samples_singlefake, samples_doublefake, samples_nonprompt, samples_mc, samples_data = createSampleLists(
        analysis_dir=analysis_dir,
        server=hostname,
        channel=channel,
        dataset=dataset)
    working_samples = samples_data
    # working_samples = samples_nonprompt
    # working_samples = samples_mc
    # necessary if you want to compare data with MC
    working_samples = setSumWeights(working_samples)
    samples_mc = setSumWeights(samples_mc)

    # make a TChain object by combining all necessary data samples
    print('###########################################################')
    if faketype == 'DoubleFake':
        print('# measuring doublefakerate...')
    if faketype == 'SingleFake1':
        print('# measuring singlefakerate for lepton 1...')
    if faketype == 'SingleFake2':
        print('# measuring singlefakerate for lepton 2...')
    print('# %d samples to be used:' % (len(working_samples)))
    print('###########################################################')
    for w in working_samples:
        print('{:<20}{:<20}'.format(*[w.name, ('path: ' + w.ana_dir)]))

    chain = TChain('tree')  # TChain'ing all data samples together
    for i, s in enumerate(working_samples):
        # sample = working_samples[0] #super stupid mistake, I'm keeping it here as a painful reminder
        sample = working_samples[i]
        file_name = '/'.join([
            sample.ana_dir, sample.dir_name, sample.tree_prod_name, 'tree.root'
        ])
        chain.Add(file_name)

    # define the selections
    # (note: the MC contamination selections are only defined for the 'nonprompt' fake type)
    if faketype == 'SingleFake1':
        region = Selections.Region('MR_SF1', channel, 'MR_SF1')
        selection_passing = region.data
        selection_failing = region.SF_LT
    if faketype == 'SingleFake2':
        region = Selections.Region('MR_SF2', channel, 'MR_SF2')
        selection_passing = region.data
        selection_failing = region.SF_TL
    if faketype == 'DoubleFake':
        region = Selections.Region('MR_DF', channel, 'MR_DF')
        selection_passing = region.data
        selection_failing = region.DF
    if faketype == 'nonprompt':
        region = Selections.Region('AN_Feb', channel, 'AN_Feb')
        selection_passing = region.data
        selection_failing = region.nonprompt
        selection_passing_MC = region.MC_contamination_pass
        selection_failing_MC = region.MC_contamination_fail

    # convert TChain object into numpy arrays for the training
    start = time.time()
    if multiprocess == True:
        queue = multiprocessing.Queue()
        result = []
        processes = []
        for key in ['pass', 'fail']:
            if key == 'pass':
                selection = selection_passing
            if key == 'fail':
                selection = selection_failing
            processes.append(
                multiprocessing.Process(target=tree2array_process,
                                        args=(queue, chain, branches, selection, key)))
        for p in processes:
            p.start()
        for p in processes:
            result.append(queue.get())
            p.join()
        for r in result:
            if r[0] == 'pass':
                array_pass = r[1]
            if r[0] == 'fail':
                array_fail = r[1]

    if multiprocess == False:
        print('converting .root ntuples to numpy arrays... (passed events)')
        array_pass = tree2array(chain, branches=branches, selection=selection_passing)
        print('nevents from array_pass: ' + str(array_pass.size))
        print('converting .root ntuples to numpy arrays... (failed events)')
        array_fail = tree2array(chain, branches=branches, selection=selection_failing)
        print('nevents from array_fail: ' + str(array_fail.size))

    delta = time.time() - start
    print('It took %.2f seconds to create the arrays' % delta)

    df_pass = pd.DataFrame(array_pass)
    df_fail = pd.DataFrame(array_fail)

    # giving data the contamination weight '1' (i.e. ignore it)
    for array in [df_pass, df_fail]:
        array['contamination_weight'] = array.weight * array.lhe_weight
        # array['contamination_weight'] = array.weight * array.lhe_weight * lumi * xsec / sumweights

    # adding MC prompt contamination
    print('###########################################################')
    print('now adding MC prompt contamination to the training')
    print('# %d samples to be used:' % (len(samples_mc)))
    print('###########################################################')
    for w in samples_mc:
        print('{:<20}{:<20}'.format(*[w.name, ('path: ' + w.ana_dir)]))

    lumi = 41530  # all eras
    # lumi = 4792 # only era B

    if multiprocess == True:
        pool = multiprocessing.Pool(len(samples_mc))
        input_array = []
        for i, sample in enumerate(samples_mc):
            for key in ['pass', 'fail']:
                file_in = '/'.join([
                    sample.ana_dir, sample.dir_name, sample.tree_prod_name, 'tree.root'
                ])
                if key == 'pass':
                    selection = selection_passing_MC
                if key == 'fail':
                    selection = selection_failing_MC
                entry = [
                    file_in, branches, selection, sample.name, key, sample.xsec, sample.sumweights
                ]
                input_array.append(entry)
        result = pool.map(root2array_PoolProcess, input_array)

        for i, sample in enumerate(result):
            array = sample[1]
            xsec = sample[2]
            sumweights = sample[3]
            try:
                array['contamination_weight'] = array.weight * array.lhe_weight * lumi * (-1) * xsec / sumweights
                # array['contamination_weight'] = array.weight * array.lhe_weight * lumi * xsec / sumweights
                # array['contamination_weight'] = array.weight * array.lhe_weight
            except:
                set_trace()
            if sample[0] == 'pass':
                df_pass = pd.concat([df_pass, array])
                # df_fail = pd.concat([df_fail,array])
                # print ('added pass events to df_pass: %d'%len(array))
            if sample[0] == 'fail':
                # df_pass = pd.concat([df_pass,array])
                df_fail = pd.concat([df_fail, array])
                # print ('added fail events to df_pass: %d'%len(array))

    if multiprocess == False:
        for i, s in enumerate(samples_mc):
            sample = samples_mc[i]
            print('computing %s' % sample.name)
            file_in = '/'.join([
                sample.ana_dir, sample.dir_name, sample.tree_prod_name, 'tree.root'
            ])
            selection_pass = selection_passing_MC
            selection_fail = selection_failing_MC
            passing = pd.DataFrame(
                root2array(file_in, 'tree', branches=branches, selection=selection_passing_MC))
            failing = pd.DataFrame(
                root2array(file_in, 'tree', branches=branches, selection=selection_failing_MC))
            for array in [passing, failing]:
                array['contamination_weight'] = array.weight * array.lhe_weight * lumi * (-1) * sample.xsec / sample.sumweights
                # array['contamination_weight'] = array.weight * array.lhe_weight * lumi * sample.xsec / sample.sumweights
            df_pass = pd.concat([df_pass, passing])
            # df_pass = pd.concat([df_fail,failing])
            # df_fail = pd.concat([df_fail,passing])
            df_fail = pd.concat([df_fail, failing])

    print('array size after including MC: %d(pass); %d(fail)' % (len(df_pass), len(df_fail)))

    # add the target column
    df_pass['target'] = np.ones(df_pass.shape[0]).astype(np.int)
    df_fail['target'] = np.zeros(df_fail.shape[0]).astype(np.int)

    # concatenate the events and shuffle
    data = pd.concat([df_pass, df_fail])
    data = data.sample(frac=1, replace=False, random_state=1986)  # shuffle (and DON'T replace the sample)
    data.index = np.array(range(len(data)))
    data.to_pickle(path_to_NeuralNet + 'training_data.pkl')
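# Hypothetical usage sketch for createArrays (illustration only; the branch and
# feature names and the output path below are placeholders, not taken from the
# original code). faketype='nonprompt' is used because the MC contamination
# selections are only defined for that region:
#
#   branches = ['weight', 'lhe_weight', 'l1_pt', 'l2_pt', 'hnl_hn_vis_eta']
#   features = ['l1_pt', 'l2_pt', 'hnl_hn_vis_eta']
#   createArrays(features, branches, '/path/to/NeuralNet/', faketype='nonprompt',
#                channel='mmm', multiprocess=True, dataset='2017')
#   # the shuffled, weighted training table is then written to
#   # '/path/to/NeuralNet/training_data.pkl'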
def measureSFR(self, drawPlot=False):
    sample_dict = {}
    samples_all, samples_singlefake, samples_doublefake = createSampleLists(
        analysis_dir=self.analysis_dir, server=self.server, channel=self.channel)
    working_samples = samples_singlefake
    working_samples = setSumWeights(working_samples)

    print('###########################################################')
    print('# measuring singlefakerate...')
    print('# %d samples to be used:' % (len(working_samples)))
    print('###########################################################')
    for w in working_samples:
        print('{:<20}{:<20}'.format(*[w.name, ('path: ' + w.ana_dir)]))

    chain = TChain('tree')  # TChain'ing all data samples together
    for i, s in enumerate(working_samples):
        sample = working_samples[i]
        file_name = '/'.join([sample.ana_dir, sample.dir_name, sample.tree_prod_name, 'tree.root'])
        chain.Add(file_name)

    dataframe = RDataFrame(chain)
    weight = 'weight * lhe_weight'
    dataframe = dataframe.Define('w', weight)\
        .Define('ptCone', self.ptCone())\
        .Define('abs_hnl_hn_vis_eta', 'abs(hnl_hn_vis_eta)')\
        .Define('abs_hnl_hn_eta', 'abs(hnl_hn_eta)')\
        .Define('abs_l1_eta', 'abs(l1_eta)')\
        .Define('abs_l2_eta', 'abs(l2_eta)')\
        .Define('abs_l1_jet_flavour_parton', 'abs(l1_jet_flavour_parton)')\
        .Define('abs_l2_jet_flavour_parton', 'abs(l2_jet_flavour_parton)')

    # bins_ptCone = np.array([5.,10., 20., 30., 40.,70., 2000])
    # bins_eta = np.array([0., 0.8, 1.2, 2.4])
    bins_ptCone = np.array([5., 10., 20., 30., 40., 70.])
    bins_eta = np.array([0., 0.8, 1.2, 2.4])

    selection_baseline = getSelection(self.channel, 'MR_SF')
    selection_LL_uncorrelated = '(' + ' & '.join([
        selection_baseline,
        getSelection(self.channel, 'L_L_uncorrelated')
    ]) + ')'
    selection_TT_uncorrelated = '(' + ' & '.join([
        selection_baseline,
        getSelection(self.channel, 'L_L_uncorrelated'),
        getSelection(self.channel, 'T_T')
    ]) + ')'

    h_LL_uncorrelated = dataframe\
        .Filter(selection_LL_uncorrelated)\
        .Histo2D(('h_LL_uncorrelated', 'h_LL_uncorrelated', len(bins_ptCone) - 1, bins_ptCone, len(bins_eta) - 1, bins_eta),
                 'ptCone', 'abs_hnl_hn_vis_eta', 'w')
    # name the axes; this also triggers the dataframe event loop
    h_LL_uncorrelated.SetTitle(';ptCone [GeV]; dimuon #eta')

    h_TT_uncorrelated = dataframe\
        .Filter(selection_TT_uncorrelated)\
        .Histo2D(('h_TT_uncorrelated', 'h_TT_uncorrelated', len(bins_ptCone) - 1, bins_ptCone, len(bins_eta) - 1, bins_eta),
                 'ptCone', 'abs_hnl_hn_vis_eta', 'w')
    h_TT_uncorrelated.SetTitle(';ptCone [GeV]; dimuon #eta')

    # preparing the histo and save it into a .root file
    sfr_TH2_dir = '/home/dehuazhu/HNL/CMSSW_9_4_6_patch1/src/PlotFactory/DataBkgPlots/modules/DDE_singlefake.root'
    sfr_hist = h_TT_uncorrelated.Clone()
    # sfr_hist = h_LL_uncorrelated.Clone()
    # sfr_hist = h_baseline.Clone()
    # sfr_hist.Divide(h_LL_uncorrelated.Clone())
    # sfr_hist.SaveAs(sfr_TH2_dir) #uncomment this to save the TH2

    # draw the histo if required
    if drawPlot == True:
        can = TCanvas('can', '')
        # sfr_hist.Draw('colzTextE')
        # sfr_hist.Draw('colz')
        sfr_hist.Draw()
        pf.showlumi('%d entries' % (sfr_hist.GetEntries()))
        # pf.showlogopreliminary()
        can.Update()
        set_trace()
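# Sketch of the intended tight-to-loose ratio, based on the commented-out lines
# in measureSFR above (kept here as a note so the original flow is unchanged);
# the fake rate per (ptCone, eta) bin is the tight yield divided by the loose yield:
#
#   sfr_hist = h_TT_uncorrelated.Clone('sfr_hist')
#   sfr_hist.Divide(h_LL_uncorrelated.Clone())  # fake rate = N_TT / N_LL per bin
#   sfr_hist.SaveAs(sfr_TH2_dir)                # persist the TH2 for later use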