def makeDataFrame(self):
    sample_dict = {}
    samples_all, samples_singlefake, samples_doublefake = createSampleLists(analysis_dir=self.analysis_dir, server=self.server, channel=self.channel)
    working_samples = samples_doublefake
    working_samples = setSumWeights(working_samples)
    print('###########################################################')
    print('# measuring doublefake rate...')
    print('# %d samples to be used:' % (len(working_samples)))
    print('###########################################################')
    for w in working_samples:
        print('{:<20}{:<20}'.format(*[w.name, ('path: ' + w.ana_dir)]))

    chain = TChain('tree')  # TChain all data samples together
    for sample in working_samples:  # was `working_samples[0]`, which re-added only the first sample on every iteration
        file_name = '/'.join([sample.ana_dir, sample.dir_name, sample.tree_prod_name, 'tree.root'])
        chain.Add(file_name)

    dataframe = RDataFrame(chain)
    weight = 'weight * lhe_weight'
    dataframe = dataframe.Define('w', weight)\
        .Define('ptCone', self.ptCone())\
        .Define('abs_hnl_hn_vis_eta', 'abs(hnl_hn_vis_eta)')\
        .Define('abs_hnl_hn_eta', 'abs(hnl_hn_eta)')\
        .Define('abs_l1_eta', 'abs(l1_eta)')\
        .Define('abs_l2_eta', 'abs(l2_eta)')\
        .Define('abs_l1_jet_flavour_parton', 'abs(l1_jet_flavour_parton)')\
        .Define('abs_l2_jet_flavour_parton', 'abs(l2_jet_flavour_parton)')
    return dataframe
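# Hedged usage sketch (not part of the original module; `dde` is a hypothetical
# instance of the enclosing class): the returned RDataFrame is lazy, so filters
# and histograms can be booked cheaply and the event loop only runs on access:
#
#     df = dde.makeDataFrame()
#     h = df.Filter('l1_pt > 5')\
#           .Histo1D(('h_ptCone', ';p_{T}^{cone} [GeV];events', 40, 0., 200.), 'ptCone', 'w')
#     h.Draw()  # triggers the single event loop over the whole chain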
def createSamples(channel, analysis_dir, total_weight, server, add_data_cut=None, dataset='2017'):
    sample_dict = {}
    # print("creating samples from %s" % (analysis_dir))
    samples_all, samples_singlefake, samples_doublefake, samples_nonprompt, samples_mc, samples_data = createSampleLists(analysis_dir=analysis_dir, server=server, channel=channel, add_data_cut=add_data_cut, dataset=dataset)

    # select here the samples you wish to use
    # working_samples = samples_data_dde
    working_samples = samples_all
    working_samples = setSumWeights(working_samples)
    sample_dict['working_samples'] = working_samples

    print('')
    print('###########################################################')
    print('# %d samples to be used:' % (len(working_samples)))
    print('###########################################################')
    for sample in working_samples:
        print('{:<20}{:<20}'.format(*[sample.name, ('path: ' + sample.ana_dir)]))
    # for w in working_samples: print('{:<20}{:<20}'.format(*[w.name, ('path: ' + w.ana_dir + w.dir_name + '/' + tree_prod_name)]))
    return sample_dict
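# Hedged usage sketch (argument values are hypothetical, names taken from the
# signature above):
#
#     sample_dict = createSamples(channel='mmm',
#                                 analysis_dir='/path/to/production/',
#                                 total_weight='weight * lhe_weight',
#                                 server=gethostname(),
#                                 dataset='2017')
#     for sample in sample_dict['working_samples']:
#         print(sample.name, sample.ana_dir)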
def make_all_friendtrees(multiprocess, server, analysis_dir, channel, path_to_NeuralNet, overwrite, dataset='2017'):
    print('making friendtrees for all datasamples')
    start = time.time()

    # call samples
    samples_all, samples_singlefake, samples_doublefake, samples_nonprompt, samples_mc, samples_data = createSampleLists(
        analysis_dir=analysis_dir,
        server=server,
        channel=channel,
        dataset=dataset)
    working_samples = samples_nonprompt

    for w in working_samples:
        print('{:<20}{:<20}'.format(*[w.name, ('path: ' + w.ana_dir)]))

    if multiprocess:
        # one worker per sample: each worker evaluates the network on the
        # ntuple and writes a friend tree next to it
        pool = multiprocessing.Pool(len(working_samples))
        input_array = []
        for sample in working_samples:
            file_name = '/'.join([sample.ana_dir, sample.dir_name, sample.tree_prod_name, 'tree.root'])
            input_array.append([
                file_name,
                sample.name,
                path_to_NeuralNet + 'net.h5',
                path_to_NeuralNet,
                get_branches_nonprompt2(get_features_nonprompt2()),
                get_features_nonprompt2(),
                overwrite,
            ])
        pool.map(makeFriendtree_Process, input_array)
    else:
        for sample in working_samples:
            file_name = '/'.join([sample.ana_dir, sample.dir_name, sample.tree_prod_name, 'tree.root'])
            makeFriendtree(
                tree_file_name=file_name,
                sample_name=sample.name,
                net_name=path_to_NeuralNet + 'net.h5',
                path_to_NeuralNet=path_to_NeuralNet,
                branches=get_branches_nonprompt2(get_features_nonprompt2()),
                features=get_features_nonprompt2(),
                overwrite=overwrite,
            )

    duration = time.time() - start
    print('It took %.2f seconds to make all friendtrees.' % duration)
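# Hedged usage sketch (paths are hypothetical):
#
#     make_all_friendtrees(multiprocess=True,
#                          server=gethostname(),
#                          analysis_dir='/path/to/production/',
#                          channel='mmm',
#                          path_to_NeuralNet='/path/to/NN/',
#                          overwrite=False)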
def createArrays(features, branches, path_to_NeuralNet, faketype='DoubleFake', channel='mmm', multiprocess=True, dataset='2017', analysis_dir='/home/dehuazhu/SESSD/4_production/'):
    # define basic environmental parameters
    hostname = gethostname()
    sample_dict = {}

    # call samples
    samples_all, samples_singlefake, samples_doublefake, samples_nonprompt, samples_mc, samples_data = createSampleLists(
        analysis_dir=analysis_dir,
        server=hostname,
        channel=channel,
        dataset=dataset)
    working_samples = samples_data
    # working_samples = samples_nonprompt
    # working_samples = samples_mc

    # necessary if you want to compare data with MC
    working_samples = setSumWeights(working_samples)
    samples_mc = setSumWeights(samples_mc)

    # make a TChain object by combining all necessary data samples
    print('###########################################################')
    if faketype == 'DoubleFake':
        print('# measuring doublefake rate...')
    if faketype == 'SingleFake1':
        print('# measuring singlefake rate for lepton 1...')
    if faketype == 'SingleFake2':
        print('# measuring singlefake rate for lepton 2...')
    print('# %d samples to be used:' % (len(working_samples)))
    print('###########################################################')
    for w in working_samples:
        print('{:<20}{:<20}'.format(*[w.name, ('path: ' + w.ana_dir)]))

    chain = TChain('tree')  # TChain all data samples together
    for i, s in enumerate(working_samples):
        # sample = working_samples[0]  # super stupid mistake, I'm keeping it here as a painful reminder
        sample = working_samples[i]
        file_name = '/'.join([sample.ana_dir, sample.dir_name, sample.tree_prod_name, 'tree.root'])
        chain.Add(file_name)

    # define the selections
    if faketype == 'SingleFake1':
        region = Selections.Region('MR_SF1', channel, 'MR_SF1')
        selection_passing = region.data
        selection_failing = region.SF_LT
    if faketype == 'SingleFake2':
        region = Selections.Region('MR_SF2', channel, 'MR_SF2')
        selection_passing = region.data
        selection_failing = region.SF_TL
    if faketype == 'DoubleFake':
        region = Selections.Region('MR_DF', channel, 'MR_DF')
        selection_passing = region.data
        selection_failing = region.DF
    if faketype == 'nonprompt':
        region = Selections.Region('AN_Feb', channel, 'AN_Feb')
        selection_passing = region.data
        selection_failing = region.nonprompt
        selection_passing_MC = region.MC_contamination_pass
        selection_failing_MC = region.MC_contamination_fail

    # convert TChain object into numpy arrays for the training
    start = time.time()
    if multiprocess:
        queue = multiprocessing.Queue()
        result = []
        processes = []
        for key in ['pass', 'fail']:
            if key == 'pass':
                selection = selection_passing
            if key == 'fail':
                selection = selection_failing
            processes.append(
                multiprocessing.Process(target=tree2array_process,
                                        args=(queue, chain, branches, selection, key)))
        for p in processes:
            p.start()
        for p in processes:
            result.append(queue.get())
            p.join()
        for r in result:
            if r[0] == 'pass':
                array_pass = r[1]
            if r[0] == 'fail':
                array_fail = r[1]

    if not multiprocess:
        print('converting .root ntuples to numpy arrays... (passed events)')
        array_pass = tree2array(chain, branches=branches, selection=selection_passing)
        print('nevents from array_pass: ' + str(array_pass.size))
        print('converting .root ntuples to numpy arrays... (failed events)')
        array_fail = tree2array(chain, branches=branches, selection=selection_failing)
        print('nevents from array_fail: ' + str(array_fail.size))

    delta = time.time() - start
    print('It took %.2f seconds to create the arrays' % delta)

    df_pass = pd.DataFrame(array_pass)
    df_fail = pd.DataFrame(array_fail)

    # giving data the contamination weight '1' (i.e. ignore it)
    for array in [df_pass, df_fail]:
        array['contamination_weight'] = array.weight * array.lhe_weight
        # array['contamination_weight'] = array.weight * array.lhe_weight * lumi * xsec / sumweights

    # adding MC prompt contamination (subtracted via negative weights)
    print('###########################################################')
    print('now adding MC prompt contamination to the training')
    print('# %d samples to be used:' % (len(samples_mc)))
    print('###########################################################')
    for w in samples_mc:
        print('{:<20}{:<20}'.format(*[w.name, ('path: ' + w.ana_dir)]))

    lumi = 41530  # all eras
    # lumi = 4792  # only era B

    if multiprocess:
        pool = multiprocessing.Pool(len(samples_mc))
        input_array = []
        for i, sample in enumerate(samples_mc):
            for key in ['pass', 'fail']:
                file_in = '/'.join([sample.ana_dir, sample.dir_name, sample.tree_prod_name, 'tree.root'])
                if key == 'pass':
                    selection = selection_passing_MC
                if key == 'fail':
                    selection = selection_failing_MC
                entry = [file_in, branches, selection, sample.name, key, sample.xsec, sample.sumweights]
                input_array.append(entry)
        result = pool.map(root2array_PoolProcess, input_array)

        for i, sample in enumerate(result):
            array = sample[1]
            xsec = sample[2]
            sumweights = sample[3]
            try:
                array['contamination_weight'] = array.weight * array.lhe_weight * lumi * (-1) * xsec / sumweights
                # array['contamination_weight'] = array.weight * array.lhe_weight * lumi * xsec / sumweights
                # array['contamination_weight'] = array.weight * array.lhe_weight
            except:
                set_trace()
            if sample[0] == 'pass':
                df_pass = pd.concat([df_pass, array])
                # print('added pass events to df_pass: %d' % len(array))
            if sample[0] == 'fail':
                df_fail = pd.concat([df_fail, array])
                # print('added fail events to df_fail: %d' % len(array))

    if not multiprocess:
        for i, s in enumerate(samples_mc):
            sample = samples_mc[i]
            print('computing %s' % sample.name)
            file_in = '/'.join([sample.ana_dir, sample.dir_name, sample.tree_prod_name, 'tree.root'])
            passing = pd.DataFrame(root2array(file_in, 'tree', branches=branches, selection=selection_passing_MC))
            failing = pd.DataFrame(root2array(file_in, 'tree', branches=branches, selection=selection_failing_MC))
            for array in [passing, failing]:
                array['contamination_weight'] = array.weight * array.lhe_weight * lumi * (-1) * sample.xsec / sample.sumweights
                # array['contamination_weight'] = array.weight * array.lhe_weight * lumi * sample.xsec / sample.sumweights
            df_pass = pd.concat([df_pass, passing])
            df_fail = pd.concat([df_fail, failing])

    print('array size after including MC: %d(pass); %d(fail)' % (len(df_pass), len(df_fail)))

    # add the target column
    df_pass['target'] = np.ones(df_pass.shape[0]).astype(int)   # np.int is removed in recent numpy; plain int is equivalent
    df_fail['target'] = np.zeros(df_fail.shape[0]).astype(int)

    # concatenate the events and shuffle
    data = pd.concat([df_pass, df_fail])
    data = data.sample(frac=1, replace=False, random_state=1986)  # shuffle (and DON'T replace the sample)
    data.index = np.array(range(len(data)))
    data.to_pickle(path_to_NeuralNet + 'training_data.pkl')
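# Hedged follow-up sketch (not in the original module): the pickled frame can
# be read back for training, with `target` as the label and
# `contamination_weight` as the per-event weight; `features` is assumed to be
# the same list passed to createArrays:
#
#     import pandas as pd
#     data = pd.read_pickle(path_to_NeuralNet + 'training_data.pkl')
#     X = data[features]
#     y = data['target']
#     w = data['contamination_weight']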
def measureSFR(self, drawPlot=False):
    sample_dict = {}
    samples_all, samples_singlefake, samples_doublefake = createSampleLists(analysis_dir=self.analysis_dir, server=self.server, channel=self.channel)
    working_samples = samples_singlefake
    working_samples = setSumWeights(working_samples)
    print('###########################################################')
    print('# measuring singlefake rate...')
    print('# %d samples to be used:' % (len(working_samples)))
    print('###########################################################')
    for w in working_samples:
        print('{:<20}{:<20}'.format(*[w.name, ('path: ' + w.ana_dir)]))

    chain = TChain('tree')  # TChain all data samples together
    for sample in working_samples:  # was `working_samples[0]`, which re-added only the first sample on every iteration
        file_name = '/'.join([sample.ana_dir, sample.dir_name, sample.tree_prod_name, 'tree.root'])
        chain.Add(file_name)

    dataframe = RDataFrame(chain)
    weight = 'weight * lhe_weight'
    dataframe = dataframe.Define('w', weight)\
        .Define('ptCone', self.ptCone())\
        .Define('abs_hnl_hn_vis_eta', 'abs(hnl_hn_vis_eta)')\
        .Define('abs_hnl_hn_eta', 'abs(hnl_hn_eta)')\
        .Define('abs_l1_eta', 'abs(l1_eta)')\
        .Define('abs_l2_eta', 'abs(l2_eta)')\
        .Define('abs_l1_jet_flavour_parton', 'abs(l1_jet_flavour_parton)')\
        .Define('abs_l2_jet_flavour_parton', 'abs(l2_jet_flavour_parton)')

    # bins_ptCone = np.array([5., 10., 20., 30., 40., 70., 2000])
    # bins_eta = np.array([0., 0.8, 1.2, 2.4])
    bins_ptCone = np.array([5., 10., 20., 30., 40., 70.])
    bins_eta = np.array([0., 0.8, 1.2, 2.4])

    selection_baseline = getSelection(self.channel, 'MR_SF')
    selection_LL_uncorrelated = '(' + ' & '.join([
        selection_baseline,
        getSelection(self.channel, 'L_L_uncorrelated'),
    ]) + ')'
    selection_TT_uncorrelated = '(' + ' & '.join([
        selection_baseline,
        getSelection(self.channel, 'L_L_uncorrelated'),
        getSelection(self.channel, 'T_T'),
    ]) + ')'

    # booking the Histo2D also initiates the dataframe call; name the axes
    h_LL_uncorrelated = dataframe\
        .Filter(selection_LL_uncorrelated)\
        .Histo2D(('h_LL_uncorrelated', 'h_LL_uncorrelated', len(bins_ptCone) - 1, bins_ptCone, len(bins_eta) - 1, bins_eta), 'ptCone', 'abs_hnl_hn_vis_eta', 'w')
    h_LL_uncorrelated.SetTitle(';ptCone [GeV]; dimuon #eta')

    h_TT_uncorrelated = dataframe\
        .Filter(selection_TT_uncorrelated)\
        .Histo2D(('h_TT_uncorrelated', 'h_TT_uncorrelated', len(bins_ptCone) - 1, bins_ptCone, len(bins_eta) - 1, bins_eta), 'ptCone', 'abs_hnl_hn_vis_eta', 'w')
    h_TT_uncorrelated.SetTitle(';ptCone [GeV]; dimuon #eta')

    # preparing the histo and save it into a .root file
    sfr_TH2_dir = '/home/dehuazhu/HNL/CMSSW_9_4_6_patch1/src/PlotFactory/DataBkgPlots/modules/DDE_singlefake.root'
    sfr_hist = h_TT_uncorrelated.Clone()
    # sfr_hist = h_LL_uncorrelated.Clone()
    # sfr_hist.Divide(h_LL_uncorrelated.Clone())  # tight/loose ratio = single fake rate
    # sfr_hist.SaveAs(sfr_TH2_dir)  # uncomment this to save the TH2

    # draw the histo if required
    if drawPlot:
        can = TCanvas('can', '')
        # sfr_hist.Draw('colzTextE')
        # sfr_hist.Draw('colz')
        sfr_hist.Draw()
        pf.showlumi('%d entries' % (sfr_hist.GetEntries()))
        # pf.showlogopreliminary()
        can.Update()
        set_trace()
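# Hedged usage sketch (histogram name assumed to match the clone saved above;
# pt_cone and abs_eta are hypothetical per-lepton inputs): once the TH2 map is
# written, a fake rate can be looked up per bin and turned into the standard
# tight-to-loose transfer factor fr/(1-fr):
#
#     from ROOT import TFile
#     f = TFile.Open(sfr_TH2_dir)
#     h = f.Get('h_TT_uncorrelated')
#     fr = h.GetBinContent(h.GetXaxis().FindBin(pt_cone), h.GetYaxis().FindBin(abs_eta))
#     transfer_weight = fr / (1. - fr)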