def test_concatenate_awkward():
    awkward = pytest.importorskip("awkward")
    files = skhep_testdata.data_path(
        "uproot-sample-6.20.04-uncompressed.root").replace("6.20.04", "*")
    arrays = uproot.concatenate({files: "sample"}, ["i8", "f8"], library="ak")
    assert isinstance(arrays, awkward.Array)
    assert set(awkward.fields(arrays)) == set(["i8", "f8"])
    assert len(arrays) == 420
def test_concatenate_pandas():
    pandas = pytest.importorskip("pandas")
    files = skhep_testdata.data_path(
        "uproot-sample-6.20.04-uncompressed.root").replace("6.20.04", "*")
    arrays = uproot.concatenate({files: "sample"}, ["i8", "f8"], library="pd")
    assert isinstance(arrays, pandas.DataFrame)
    assert set(arrays.columns.tolist()) == set(["i8", "f8"])
    assert len(arrays) == 420
def pd_tree(self, path, tname, squery=None):
    try:
        tree = uproot.open(path)[tname]
    except Exception:
        pwarning('error getting', tname, 'from file:', path)
        return None
    if not tree:
        perror('Tree {} not found in file {}'.format(tname, path))
        return None
    df = uproot.concatenate(tree, library="pd")
    if squery:
        # df.query(squery, inplace=True)
        df = df.query(squery)
        df = df.reset_index(drop=True)
    return df
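# A minimal usage sketch for pd_tree (hypothetical: the enclosing object,
# file name, tree name, and query string are illustrative, not from the
# original source). pd_tree returns a pandas DataFrame, or None on failure.
# df = analysis.pd_tree('AnalysisResults.root', 'tree_Particle', squery='pt > 0.15')
# if df is not None:
#     print(df.head())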
def load_dataframe(self):

    # Load event tree into dataframe
    if not self.skip_event_tree:
        event_df = None
        event_tree_name = self.tree_dir + self.event_tree_name
        with uproot.open(self.input_file) as input_file:
            event_tree = input_file[event_tree_name]
            if not event_tree:
                raise ValueError("Tree %s not found in file %s" %
                                 (event_tree_name, self.input_file))
            self.event_df_orig = uproot.concatenate(event_tree,
                                                    self.event_columns,
                                                    library="pd")

        # Check if there are duplicated event ids
        n_duplicates = sum(self.event_df_orig.duplicated(self.unique_identifier))
        if n_duplicates > 0:
            raise ValueError(
                "There appear to be %i duplicate events in the event dataframe"
                % n_duplicates)

        # Apply event selection
        self.event_df_orig = self.event_df_orig.reset_index(drop=True)
        if self.is_pp:
            event_criteria = 'is_ev_rej == 0'
        else:
            event_criteria = ('is_ev_rej == 0'
                              ' and centrality > @self.min_centrality'
                              ' and centrality < @self.max_centrality')
        if self.event_plane_range:
            event_criteria += (' and event_plane_angle > @self.event_plane_range[0]'
                               ' and event_plane_angle < @self.event_plane_range[1]')
        event_df = self.event_df_orig.query(event_criteria)
        event_df = event_df.reset_index(drop=True)

    # Load track tree into dataframe
    track_df_orig = None
    track_tree_name = self.tree_dir + self.track_tree_name
    with uproot.open(self.input_file) as input_file:
        track_tree = input_file[track_tree_name]
        if not track_tree:
            raise ValueError("Tree %s not found in file %s" %
                             (track_tree_name, self.input_file))
        track_df_orig = uproot.concatenate(track_tree, self.track_columns,
                                           library="pd")

    # Apply hole selection, in case of jetscape
    if self.is_jetscape:
        if self.holes:
            track_criteria = 'status == -1'
        else:
            track_criteria = 'status == 0'
        track_df_orig = track_df_orig.query(track_criteria)
        track_df_orig = track_df_orig.reset_index(drop=True)

    # Check if there are duplicated tracks
    n_duplicates = sum(track_df_orig.duplicated(self.track_columns))
    if n_duplicates > 0:
        raise ValueError(
            "There appear to be %i duplicate particles in the track dataframe"
            % n_duplicates)

    # Merge event info into track tree
    if self.skip_event_tree:
        self.track_df = track_df_orig
    else:
        self.track_df = pandas.merge(track_df_orig, event_df,
                                     on=self.unique_identifier)

    # Check if there are duplicated tracks in the merged dataframe
    n_duplicates = sum(self.track_df.duplicated(self.track_columns))
    if n_duplicates > 0:
        sys.exit('ERROR: There appear to be {} duplicate particles in the merged dataframe'
                 .format(n_duplicates))

    return self.track_df
def main(args):
    logger = setup_logging()
    v0_input_dir = args.v0_input_dir
    vcustom_input_dir = args.vcustom_input_dir
    output_dir = args.output_dir
    channel = args.channel
    logger.info("Fit to Double Crystal Ball")
    tree_name = tree_name_tmpl.format(channel)
    final_plots_specs = {}

    # Needed names for files and trees
    file_dirs = {"Vertex 0th": v0_input_dir, "Vertex Reco": vcustom_input_dir}
    fit_colors = {"Vertex 0th": "kRed", "Vertex Reco": "kBlue"}

    # Create sigma_m_over_m categories
    logger.info("Creating categories of SigmaMOverM")
    file_format = {
        "Vertex 0th": v0_input_dir + "/" + file_names_tmpl[channel],
        "Vertex Reco": vcustom_input_dir + "/" + file_names_tmpl[channel]
    }
    categories = {}
    smom = "sigma_m"  # due to how we defined it in flashgg, it's already divided by M
    for vtx_name, direc in file_format.items():
        categories[vtx_name] = {}
        final_plots_specs[vtx_name] = {}
        arr = uproot.concatenate(["{}:{}".format(direc, tree_name)],
                                 expressions=[smom],
                                 library="ak")
        arr = np.asarray([ev[0] for ev in arr.to_numpy()])
        cut_format = "{var} > {min_edge} && {var} < {max_edge}"
        edge_min = 0.
        edge_max = 0.035
        n_bins = 5
        edges = get_edges(arr, edge_min, edge_max, n_bins)
        low = edges[0]
        for high in edges[1:]:
            cat_name = "SigmaMOverM_{:.5f}-{:.5f}".format(low, high)
            cat_string = cut_format.format(var=smom, min_edge=low, max_edge=high)
            categories[vtx_name][cat_name] = cat_string
            final_plots_specs[vtx_name][cat_name] = {}
            final_plots_specs[vtx_name][cat_name]["range"] = (low, high)
            low = high
    logger.info("Created categories {}".format(categories))

    for vtx_name, direc in file_dirs.items():
        logger.info("Working with vertex {}".format(vtx_name))
        for cat_name, cut in categories[vtx_name].items():
            logger.info("Working with category {}".format(cat_name))
            chain = ROOT.TChain()
            files = [
                fl for fl in os.listdir(direc)
                if fl.startswith(file_names_tmpl[channel][:20])
            ]
            for fl in files:
                chain.Add("{}/{}/{}".format(direc, fl, tree_name))
            rdf = ROOT.RDataFrame(chain)
            rdf_cut = rdf.Filter(cut)
            mass_arr = rdf_cut.Take[float]("mass").GetValue()
            weight_arr = rdf_cut.Take[float]("weight").GetValue()
            mass_fake_arr = array("d", [0.])
            weight_fake_arr = array("d", [0.])
            cut_tree = ROOT.TTree("cut_tree", "cut_tree")
            cut_tree.Branch("mass", mass_fake_arr, "mass/D")
            cut_tree.Branch("weight", weight_fake_arr, "weight/D")
            for ev_mass, ev_weight in zip(mass_arr, weight_arr):
                mass_fake_arr[0] = ev_mass
                weight_fake_arr[0] = ev_weight
                cut_tree.Fill()

            # RooFit objects
            mass = ROOT.RooRealVar("mass", "Invariant mass [GeV]", 125, 115, 135)
            weight = ROOT.RooRealVar("weight", "weight", -1, 1)
            mu = ROOT.RooRealVar("mu", "mu", 125, 120, 130)
            sigma1 = ROOT.RooRealVar("sigma1", "sigma1", 1, 0.1, 10)
            alpha1 = ROOT.RooRealVar("alpha1", "alpha1", 1, 0, 10)
            n1 = ROOT.RooRealVar("n1", "n1", 1, 0, 5)
            cb1 = ROOT.RooCBShape("cb1", "cb1", mass, mu, sigma1, alpha1, n1)
            sigma2 = ROOT.RooRealVar("sigma2", "sigma2", 4, 0.1, 10)
            alpha2 = ROOT.RooRealVar("alpha2", "alpha2", 1, 0, 10)
            n2 = ROOT.RooRealVar("n2", "n2", 1, 0, 5)
            frac = ROOT.RooRealVar("frac", "frac", 0.5, 0., 1.)
            cb2 = ROOT.RooCBShape("cb2", "cb2", mass, mu, sigma2, alpha2, n2)
            model = ROOT.RooAddPdf("model", "model", ROOT.RooArgList(cb1, cb2),
                                   ROOT.RooArgList(frac))

            # Create (weighted) dataset
            data = ROOT.RooDataSet("data_{}".format(cat_name),
                                   "data_{}".format(cat_name), cut_tree,
                                   ROOT.RooArgSet(mass, weight), "",
                                   weight.GetName())

            # Fit in subrange
            mass.setRange("higgs", 116, 134)
            logger.info("Performing fit")
            fit_result = model.fitTo(data, ROOT.RooFit.Range("higgs"),
                                     ROOT.RooFit.Save(1),
                                     ROOT.RooFit.AsymptoticError(1))

            # Plot decoration
            mass_frame = mass.frame(
                ROOT.RooFit.Title("Mass-{}-{}".format(vtx_name, cat_name)))
            mass_frame.GetYaxis().SetTitleOffset(1.6)
            data.plotOn(mass_frame, ROOT.RooFit.DataError(ROOT.RooAbsData.SumW2))
            model.plotOn(
                mass_frame,
                ROOT.RooFit.LineColor(getattr(ROOT, fit_colors[vtx_name])))
            chi_sq = mass_frame.chiSquare()
            model.paramOn(
                mass_frame, ROOT.RooFit.Layout(0.65),
                ROOT.RooFit.Label("chiSq / ndof = {:.5f}".format(chi_sq)))

            # Dump plots
            logger.info("Dumping plots")
            c = ROOT.TCanvas("", "")
            mass_frame.Draw()
            c.SaveAs("{}/mass_{}_{}.jpg".format(output_dir, vtx_name, cat_name))
            c.SaveAs("{}/mass_{}_{}.pdf".format(output_dir, vtx_name, cat_name))

            # Fill values for final plots
            parameters = {
                var.GetName(): var.getVal()
                for var in list(model.getParameters(data))
            }

            # See https://root-forum.cern.ch/t/how-to-calculate-effective-sigma/39472/3
            final_plots_specs[vtx_name][cat_name]["fitted_sigma"] = np.sqrt(
                (parameters["sigma1"]**2) * parameters["frac"]
                + (parameters["sigma2"]**2) * (1 - parameters["frac"]))

            # Propagate uncertainty on sigma effective
            # To get the covariances from the fit result, remember the indexes
            # (the covariance matrix rows follow the order of the floating
            # parameters in the fit result)
            cov_matrix = fit_result.covarianceMatrix()
            frac_index = 2
            sigma1_index = 6
            sigma2_index = 7
            var_frac = cov_matrix[frac_index][frac_index]
            var_v_1 = cov_matrix[sigma1_index][sigma1_index]
            var_v_2 = cov_matrix[sigma2_index][sigma2_index]
            cov_v_1_v_2 = cov_matrix[sigma1_index][sigma2_index]
            cov_v_1_frac = cov_matrix[sigma1_index][frac_index]
            cov_v_2_frac = cov_matrix[sigma2_index][frac_index]
            final_plots_specs[vtx_name][cat_name]["fitted_sigma_unc"] = eff_sigma_unc(
                parameters["frac"], 1 - parameters["frac"],
                parameters["sigma1"] - parameters["sigma2"], var_v_1, var_v_2,
                var_frac, cov_v_1_v_2, cov_v_1_frac, cov_v_2_frac)

    logger.info("Dumping final plots specifications: {}".format(final_plots_specs))
    with open("sigma_m_final_plots_specs_{}.pkl".format(channel), "wb") as fl:
        pickle.dump(final_plots_specs, fl)
    '../../ntuples/0.9.6-2016_production/JpsiK-mc-step2/JpsiK--22_03_10--mc--12143001--2016--md.root:tree',
    '../../ntuples/0.9.6-2016_production/JpsiK-mc-step2/JpsiK--22_03_10--mc--12143001--2016--mu.root:tree',
]

histoMcRawN = 'h_occupancy_mc_raw'
histoDataRawN = 'h_occupancy_data_raw'
histoRatioN = 'h_occupancy'

mcBrsN = ['b_ownpv_ndof', 'ntracks', 'wjk_occ', 'wpid', 'wtrk']

#########################################
# Rebuild histogram from step-2 ntuples #
#########################################

mcBrsRaw = concatenate(mcNtpsN, mcBrsN, library='np')
globalCut = (mcBrsRaw['ntracks'] < 450) & (mcBrsRaw['b_ownpv_ndof'] < 200)

# Apply a global cut
mcBrs = {k: v[globalCut] for k, v in mcBrsRaw.items()}

wtResult = np.prod([mcBrs[i] for i in ['wpid', 'wtrk', 'wjk_occ']], axis=0)
hResult, *hResultBins = np.histogram2d(
    mcBrs['b_ownpv_ndof'], mcBrs['ntracks'], (20, 20), ((1, 200), (0, 450)),
    weights=wtResult)
hMc, *hMcBins = np.histogram2d(
    mcBrs['b_ownpv_ndof'], mcBrs['ntracks'], (20, 20), ((1, 200), (0, 450)),
    weights=mcBrs['wpid']*mcBrs['wtrk'])

############################
# Load existing histograms #
############################
    def build_dataframe(
        self,
        data_path: str,
        TTree_name: str,
        tree_dict: Dict[str, Set[str]],
        is_truth: bool,
        is_reco: bool,
        chunksize: int = 1024,
        validate_missing_events: bool = True,
        validate_duplicated_events: bool = True,
        validate_sumofweights: bool = True,
    ) -> pd.DataFrame:
        """
        Builds a dataframe

        :param data_path: path to ROOT datafile(s)
        :param TTree_name: TTree in data_path to set as default tree
        :param tree_dict: dictionary of tree: variables to extract from data_path
        :param is_truth: whether dataset contains truth data
        :param is_reco: whether dataset contains reco data
        :param chunksize: chunk size for the uproot concatenate method
        :param validate_missing_events: whether to check for missing events
        :param validate_duplicated_events: whether to check for duplicated events
        :param validate_sumofweights: whether to check sum of weights against weight_mc
        :return: output dataframe containing columns corresponding to necessary variables
        """
        self.logger.info(f"Building DataFrame from {data_path} "
                         f"({file_utils.n_files(data_path)} file(s))...")

        # is the default tree a truth tree?
        default_tree_truth = 'truth' in TTree_name

        t1 = time.time()
        self.logger.debug(f"Extracting {tree_dict[TTree_name]} from {TTree_name} tree...")
        df = to_pandas(uproot.concatenate(data_path + ':' + TTree_name,
                                          tree_dict[TTree_name],
                                          num_workers=config.n_threads,
                                          begin_chunk_size=chunksize))
        self.logger.debug(f"Extracted {len(df)} events.")

        self.logger.debug("Extracting ['totalEventsWeighted', 'dsid'] from 'sumWeights' tree...")
        sumw = to_pandas(uproot.concatenate(data_path + ':sumWeights',
                                            ['totalEventsWeighted', 'dsid'],
                                            num_workers=config.n_threads,
                                            begin_chunk_size=chunksize))

        self.logger.debug("Calculating sum of weights and merging...")
        sumw = sumw.groupby('dsid').sum()
        df = pd.merge(df, sumw, left_on='mcChannelNumber', right_on='dsid',
                      sort=False, copy=False)

        df.set_index(['mcChannelNumber', 'eventNumber'], inplace=True)
        df.index.names = ['DSID', 'eventNumber']
        self.logger.debug("Set DSID/eventNumber as index")

        # merge TTrees
        if validate_duplicated_events:
            validation = '1:1'
            self.logger.info(f"Validating duplicated events in tree {TTree_name}...")
            self.__drop_duplicates(df)
            self.__drop_duplicate_event_numbers(df)
        else:
            validation = 'm:m'
            self.logger.info("Skipping duplicated events validation")

        # iterate over TTrees and merge
        for tree in tree_dict:
            if tree == TTree_name:
                continue

            self.logger.debug(f"Extracting {tree_dict[tree]} from {tree} tree...")
            alt_df = to_pandas(uproot.concatenate(data_path + ":" + tree,
                                                  tree_dict[tree],
                                                  num_workers=config.n_threads,
                                                  begin_chunk_size=chunksize))
            self.logger.debug(f"Extracted {len(alt_df)} events.")
            alt_df.set_index(['mcChannelNumber', 'eventNumber'], inplace=True)
            alt_df.index.names = ['DSID', 'eventNumber']
            self.logger.debug("Set DSID/eventNumber as index")

            if validate_missing_events:
                self.logger.info(f"Checking for missing events in tree '{tree}'..")
                tree_is_truth = 'truth' in tree

                if tree_is_truth and not default_tree_truth:
                    if n_missing := len(df.index.difference(alt_df.index)):
                        raise Exception(
                            f"Found {n_missing} events in '{TTree_name}' tree not found in '{tree}' tree")
                    else:
                        self.logger.debug(f"All events in {TTree_name} tree found in {tree} tree")
                elif default_tree_truth and not tree_is_truth:
                    if n_missing := len(alt_df.index.difference(df.index)):
                        raise Exception(
                            f"Found {n_missing} events in '{tree}' tree not found in '{TTree_name}' tree")
                    else:
                        self.logger.debug(f"All events in {tree} tree found in {TTree_name} tree")
                else:
                    self.logger.info("Skipping missing events check. Not truth/reco tree combination")
    print('Compute errors...')
    result.errors(method='minuit_minos')
    print(f'Fit & error computation took a total of {time.time() - time_start:.2f} sec.')

    return result, nll


if __name__ == '__main__':
    mplhep.style.use('LHCb2')
    args = parse_input()
    fit_params = load_params(args.params)

    ntp_brs = concatenate(args.input, [args.branch] + args.extraBranches,
                          library='np')
    fit_var = ntp_brs[args.branch]
    print(f'Total events in data: {fit_var.size}')

    print('Initialize fit model...')
    obs = zfit.Space('x', limits=MODEL_BDY)
    fit_model, fit_components, _ = fit_model_overall(obs, fit_var, fit_params)

    output_plot_init = args.output + '/fit_init.pdf' \
        if not args.outputPlotInit else args.outputPlotInit
    ensure_dir(output_plot_init)

    # Always plot the initial condition
    plot(fit_var, fit_components, output=output_plot_init,
         data_range=MODEL_BDY,
    bin_centres = np.zeros(len(bin_edges) - 1)
    for i in range(1, len(bin_edges)):
        bin_centres[i - 1] = bin_edges[i - 1] + (bin_edges[i] - bin_edges[i - 1]) / 2
    return bin_centres


if __name__ == "__main__":

    ntuple_dir = "E:\\NTuples\\TauClassifier"
    sig_files = glob.glob(f"{ntuple_dir}\\*Gammatautau*\\*.root")
    bkg_files = glob.glob(f"{ntuple_dir}\\*JZ*\\*.root")
    cuts = "(TauJets.jet_pt > 15000.0) & (TauJets.jet_pt < 10000000.0)"

    sig_pt = uproot.concatenate(sig_files, filter_name="TauJets.jet_pt",
                                cut=cuts, library='np')
    bkg_pt = uproot.concatenate(bkg_files, filter_name="TauJets.jet_pt",
                                cut=cuts, library='np')

    bkg_pt = bkg_pt["TauJets.jet_pt"]
    bkg_pt = np.sort(bkg_pt) / 1e6
    sig_pt = sig_pt["TauJets.jet_pt"]
    sig_pt = np.sort(sig_pt) / 1e6

    # Binning
    bin_edges = np.percentile(bkg_pt, np.linspace(0.0, 100.0, 50))
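# Equivalent vectorized form of the bin-centre loop above, as a sanity check
# (a minimal sketch; `edges` and `centres` are illustrative local names):
# each centre sits half a bin width above the lower edge, so edges
# [0, 1, 3] give centres [0.5, 2.0].
import numpy as np

edges = np.array([0.0, 1.0, 3.0])
centres = edges[:-1] + np.diff(edges) / 2
np.testing.assert_allclose(centres, [0.5, 2.0])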
dataNtps = '../../run2-JpsiK/fit/fit_results/JpsiK-22_02_26_23_52-std-fit-2016/fit.root:tree'

#########
# Plots #
#########

varsToComp = ['b_ownpv_ndof', 'ntracks', 'b_pt', 'b_eta']
weightBrs = ['wpid', 'wtrk', 'w', 'wjk_kin', 'wjk_occ']
sweightBrs = ['sw_sig']
varsLabels = [r'$B$ PV NDOF', r'nTracks', r'$B$ $p_T$ [MeV]', r'$B$ $\eta$']
dataRanges = [[1, 200], [0, 450], [0, 25e3], [2, 5]]
binnings = [20, 20, 20, 9]

dataBrs = uproot.concatenate(dataNtps, varsToComp + sweightBrs, library='np')
mcBrs = uproot.concatenate(mcNtps, varsToComp + weightBrs, library='np')

# Make numpy histogram consistent w/ ROOT's
globalCut = lambda brs: (brs['ntracks'] < 450) & (brs['b_ownpv_ndof'] < 200) & \
    (brs['b_pt'] < 25e3) & (brs['b_eta'] < 5)
dataCut = globalCut(dataBrs)
mcCut = globalCut(mcBrs)


def plot(output, br, xLabel, dataRange, bins, ratios=False):
    suf = ' MeV' if br == 'b_pt' else ''
    yLabel = f'Norm. / {(dataRange[1]-dataRange[0])/bins:.1f}{suf}'

    topPlotters = []
    botPlotters = []
#!/usr/bin/env python
import uproot
import numpy as np

# b_pt, b_eta
BINNING = ([20, 9], [[0, 25e3], [2, 5]])
NTPS = '../../ntuples/0.9.6-2016_production/Dst_D0-mc-tracker_only/Dst_D0--22_02_24--mc--tracker_only--MC_2016_Beam6500GeV-2016-MagDown-TrackerOnly-Nu1.6-25ns-Pythia8_Sim09k_Reco16_Filtered_12773410_D0TAUNU.SAFESTRIPTRIG.DST/*-dv.root:TupleBminus/DecayTree'

branches = uproot.concatenate(NTPS, ['b_PT', 'b_P', 'b_PZ'], library='np')
brPT, brP, brPZ = branches['b_PT'], branches['b_P'], branches['b_PZ']
brETA = 0.5 * np.log((brP + brPZ) / (brP - brPZ))

histo = np.histogram2d(brPT, brETA, *BINNING)
ntpOut = uproot.recreate('histo.root')
ntpOut['histo'] = histo
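# A minimal read-back sketch for the file written above (assumes the script
# has already produced histo.root): uproot's histogram models expose
# to_numpy(), which returns the counts and the two edge arrays.
with uproot.open('histo.root') as fin:
    counts, pt_edges, eta_edges = fin['histo'].to_numpy()
    print(counts.shape)  # (20, 9): b_PT bins x b_ETA bins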
    def __init__(self, data_type, files, class_label, nbatches, variables_dict,
                 dummy_var="truthProng", cuts=None, batch_size=None,
                 label="Dataloader"):
        """
        Class constructor - fills in meta-data for the data type
        :param data_type: The type of data file being loaded e.g. Gammatautau, JZ1, etc.
        :param files: A list of files of the same data type to be loaded
        :param class_label: 1 for signal, 0 for background
        :param nbatches: number of batches to *roughly* split the data into
        :param variables_dict: dictionary of variables to load
        :param dummy_var: A variable to be loaded from the file and iterated through
        to work out the number of events in the data files
        :param cuts: A string which can be parsed by uproot's cut option e.g.
        "(pt1 > 50) & ((E1 > 100) | (E1 < 90))"
        :param batch_size: Allows you to manually set the batch size for the data.
        This will override the automatically calculated batch size inferred
        from nbatches
        """
        self._data_type = data_type
        self.label = label
        self.files = files
        self.dummy_var = dummy_var
        self.cut = cuts
        self._nbatches = nbatches
        self.class_label = class_label
        self._variables_dict = variables_dict
        self._current_index = 0

        # Parse variables
        self._variables_list = []
        for _, variable_list in variables_dict.items():
            self._variables_list += variable_list

        # Work out how many events there are in the sample
        test_arr = uproot.concatenate(self.files,
                                      filter_name="TauJets." + self.dummy_var,
                                      cut=self.cut, library='np')
        self._num_events = len(test_arr["TauJets." + self.dummy_var])

        # Set the DataLoader's batch size
        if batch_size is None:
            self.specific_batch_size = math.ceil(self._num_events / nbatches)
        else:
            self.specific_batch_size = batch_size

        # Setup the iterator
        self._batches_generator = uproot.iterate(self.files,
                                                 filter_name=self._variables_list,
                                                 cut=self.cut,
                                                 step_size=self.specific_batch_size)

        # Work out the number of batches there are in the generator
        self._num_real_batches = 0
        for _ in uproot.iterate(self.files,
                                filter_name=self._variables_list[0],
                                cut=self.cut,
                                step_size=self.specific_batch_size):
            self._num_real_batches += 1

        logger.log(f"Found {len(files)} files for {data_type}", 'INFO')
        logger.log(f"Found these files: {files}", 'INFO')
        logger.log(f"Found {self._num_events} events for {data_type}", 'INFO')
        logger.log(f"Number of batches in {self.label} {self.data_type()} = "
                   f"{self._num_real_batches}", 'DEBUG')
        logger.log(f"DataLoader for {data_type} initialized", "INFO")
    'MC_WZneutrino_pt_born',
    'MC_WZmu_el_phi_born',
    'MC_WZneutrino_phi_born',
    'MC_WZ_dilep_m_born',
    'mcChannelNumber',
    'weight_mc',
    'KFactor_weight_truth',
    'weight_pileup',
    'eventNumber',
]

# pull root data
t = time.time()
nominal_df = to_pandas(
    uproot.concatenate(DATAFILE + ':nominal_Loose', BRANCHES_NOMINAL,
                       num_workers=N_THREADS))
print(f"Importing nominal from ROOT: {time.time() - t:.3f}s")

nominal_df.to_pickle(OUT_DIR + 'nominal_wtaunu.pkl')

before_len = len(nominal_df.index)
print(f"number of events in nominal: {before_len}")

t = time.time()
nominal_df.drop_duplicates(inplace=True)
print(f"Dropped {before_len - len(nominal_df)} duplicate events: {time.time() - t:.3f}s")

before_len = len(nominal_df.index)
t = time.time()
nominal_df.drop_duplicates(['eventNumber', 'mcChannelNumber'], inplace=True)
print(
def main(args):
    logger = setup_logging()
    v0_input_dir = args.v0_input_dir
    vcustom_input_dir = args.vcustom_input_dir
    output_dir = args.output_dir
    channel = args.channel
    tree_name = tree_name_tmpl.format(channel)
    plots_specs = {}

    # Needed names for files and trees
    file_dirs = {"Vertex 0th": v0_input_dir, "Vertex Reco": vcustom_input_dir}

    # Create sigma_m_over_m categories
    logger.info("Creating categories of SigmaMOverM")
    file_format = {
        "Vertex 0th": v0_input_dir + "/" + file_names_tmpl[channel],
        "Vertex Reco": vcustom_input_dir + "/" + file_names_tmpl[channel]
    }
    categories = {}
    smom = "sigma_m"  # due to how we defined it in flashgg, it's already divided by M
    for vtx_name, direc in file_format.items():
        categories[vtx_name] = {}
        plots_specs[vtx_name] = {}
        arr = uproot.concatenate(["{}:{}".format(direc, tree_name)],
                                 expressions=[smom],
                                 library="ak")
        arr = np.asarray([ev[0] for ev in arr.to_numpy()])
        cut_format = "({var} > {min_edge}) & ({var} < {max_edge})"
        edge_min = 0.
        edge_max = 0.035
        n_bins = 5
        edges = get_edges(arr, edge_min, edge_max, n_bins)
        low = edges[0]
        for high in edges[1:]:
            cat_name = "SigmaMOverM_{:.5f}-{:.5f}".format(low, high)
            cat_string = cut_format.format(var=smom, min_edge=low, max_edge=high)
            categories[vtx_name][cat_name] = cat_string
            plots_specs[vtx_name][cat_name] = {}
            plots_specs[vtx_name][cat_name]["range"] = (low, high)
            low = high
    logger.info("Created categories {}".format(categories))

    for vtx_name, direc in file_dirs.items():
        logger.info("Working with vertex {}".format(vtx_name))
        for cat_name, cut in categories[vtx_name].items():
            logger.info("Working with category {}".format(cat_name))
            files = [
                fl for fl in os.listdir(direc)
                if fl.startswith(file_names_tmpl[channel][:20])
            ]
            events = uproot.concatenate(
                [direc + "/" + fl + ":" + tree_name for fl in files],
                ["mass", "weight"], cut, library="np")
            mass = events["mass"]
            plots_specs[vtx_name][cat_name]["sigma_effective"] = sigma_effective(mass)

    # Plot
    x_v0 = [
        cat_spec["range"][0] + abs(cat_spec["range"][1] - cat_spec["range"][0]) / 2
        for cat_spec in plots_specs["Vertex 0th"].values()
    ]
    x_vcustom = [
        cat_spec["range"][0] + abs(cat_spec["range"][1] - cat_spec["range"][0]) / 2
        for cat_spec in plots_specs["Vertex Reco"].values()
    ]
    x_s = {"Vertex 0th": x_v0, "Vertex Reco": x_vcustom}
    fmts = {"Vertex 0th": "r^", "Vertex Reco": "sb"}

    fig, (ax, rax) = plt.subplots(nrows=2, ncols=1,
                                  gridspec_kw={"height_ratios": (3, 1)},
                                  sharex=True)
    fig.suptitle(channel)
    for vtx_name, cat_specs in plots_specs.items():
        ax.plot(x_s[vtx_name], [
            cat_spec["sigma_effective"]
            for cat_spec in plots_specs[vtx_name].values()
        ], fmts[vtx_name], label=vtx_name)

    rax_y = [
        rel_diff(s0, sc) for s0, sc in zip([
            plots_specs["Vertex 0th"][cat]["sigma_effective"]
            for cat in list(categories["Vertex 0th"].keys())
        ], [
            plots_specs["Vertex Reco"][cat]["sigma_effective"]
            for cat in list(categories["Vertex Reco"].keys())
        ])
    ]
    logger.info("Relative differences: {}".format(rax_y))
    rax.plot(x_s["Vertex 0th"], rax_y, "ko")

    for x in [ax, rax]:
        for cat in plots_specs["Vertex 0th"].values():
            low = cat["range"][0]
            x.axvline(low, color="black", alpha=0.4)

    rax.set_xlabel(r"$\sigma_M / M$")
    ax.set_ylabel(r"$\sigma_{effective}$")
    rax.set_ylabel(r"$rel\ diff$")
    ax.set_xlim(0.)
    ax.set_ylim(0.)
    rax.set_ylim(-0.01, 0.2)
    ax.legend(loc="upper left")
    ax.grid(which="both")
    rax.grid(which="both")

    logger.info("Dumping plot in {}".format(output_dir))
    hep.cms.label(loc=0, data=True, llabel="Work in Progress", rlabel="",
                  ax=ax, pad=.05)
    fig.savefig("{}/sigma_effective.png".format(output_dir), bbox_inches='tight')
    fig.savefig("{}/sigma_effective.pdf".format(output_dir), bbox_inches='tight')
wtJkOccRoot = mcRootBrs['wjk_occ']
wtJkOccAltRoot = mcRootBrs['wjk_alt']

histoRootMdl = TH2DModel('histoRoot', 'histoRoot', 20, 1, 200, 20, 0, 450)
histoRoot = df.Histo2D(histoRootMdl, 'b_ownpv_ndof', 'ntracks', 'wt')

##################
# Histo w/ numpy #
##################

mcNumpyBrsN = ['b_ownpv_ndof', 'ntracks', 'wpid', 'wtrk', 'wjk_occ']
mcNumpyBrs = uproot.concatenate(f'{mcNtpN}:{mcTreeN}', mcNumpyBrsN, library='np')


def getWeights(branches, histoRaw):
    histo, *binSpecs = histoRaw
    histoPadded = np.pad(histo, tuple((1, 1) for _ in range(histo.ndim)))
    binIdx = tuple(np.digitize(br, spec) for br, spec in zip(branches, binSpecs))
    return histoPadded[binIdx]


histoWtNp = uproot.open(histoNtpN)[histoN].to_numpy()
wtJkOccNp = getWeights(
    (mcNumpyBrs['b_ownpv_ndof'], mcNumpyBrs['ntracks']), histoWtNp)

histoNumpy, *_ = np.histogram2d(
def load_brs(ntp, tree, add_brs=None):
    br_names = [] if not add_brs else deepcopy(add_brs)
    for r in REWEIGHT_PROCEDURE.values():
        br_names += r.vars
    return concatenate([f'{i}:{tree}' for i in ntp], br_names, library='np')
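# A minimal usage sketch for load_brs (hypothetical ntuple paths and tree
# name; REWEIGHT_PROCEDURE and concatenate come from this module's imports).
# Each requested branch comes back as one flat numpy array spanning all files:
# brs = load_brs(['ntp1.root', 'ntp2.root'], 'DecayTree', add_brs=['b_pt'])
# print(brs['b_pt'].shape)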
def test_concatenate_numpy():
    files = skhep_testdata.data_path(
        "uproot-sample-6.20.04-uncompressed.root").replace("6.20.04", "*")
    arrays = uproot.concatenate({files: "sample"}, ["i8", "f8"], library="np")
    assert len(arrays["i8"]) == 420
    assert len(arrays["f8"]) == 420
def test_concatenate():
    with pytest.raises(ValueError):
        uproot.concatenate(skhep_testdata.data_path("uproot-issue63.root"))

    assert (len(
        uproot.concatenate(
            {skhep_testdata.data_path("uproot-issue63.root"): "blah"},
            allow_missing=True,
        )) == 0)

    files = skhep_testdata.data_path(
        "uproot-sample-6.16.00-uncompressed.root").replace("6.16.00", "*")

    uproot.concatenate(files, "Ai8")
    uproot.concatenate({files: "sample"}, "Ai8")
    uproot.concatenate([files], "Ai8")
    uproot.concatenate([{files: "sample"}], "Ai8")
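# For reference, the file-specification forms exercised by the test above
# (illustrative paths; uproot.concatenate accepts each of these):
#
#   uproot.concatenate("events*.root:tree", "branch")              # glob with ':tree' suffix
#   uproot.concatenate({"events*.root": "tree"}, "branch")         # dict mapping files to tree
#   uproot.concatenate(["a.root:tree", "b.root:tree"], "branch")   # explicit list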
def main(args):
    # Read nano, micro, EB or EE cuts
    nanoaod_arr = ak.from_parquet(args.nano_input_dir)
    print("Read nanoaod: {}".format(nanoaod_arr.type))
    microaod_arr = uproot.concatenate(
        ["{}/*.root:diphotonDumper/trees/ggH_125_13TeV_All_$SYST".format(args.micro_input_dir)]
    )
    print("Read microaod: {}".format(microaod_arr.type))

    # Stupid typo in flashgg
    if "lead_ch_iso_worst__uncorr" in microaod_arr.fields:
        microaod_arr["lead_ch_iso_worst_uncorr"] = microaod_arr["lead_ch_iso_worst__uncorr"]

    if args.sd == "EB":
        nanoaod_arr = nanoaod_arr[np.abs(nanoaod_arr.lead_eta) < 1.5]
        nanoaod_arr = nanoaod_arr[np.abs(nanoaod_arr.sublead_eta) < 1.5]
        microaod_arr = microaod_arr[np.abs(microaod_arr.lead_eta) < 1.5]
        microaod_arr = microaod_arr[np.abs(microaod_arr.sublead_eta) < 1.5]
    if args.sd == "EE":
        nanoaod_arr = nanoaod_arr[np.abs(nanoaod_arr.lead_eta) > 1.5]
        nanoaod_arr = nanoaod_arr[np.abs(nanoaod_arr.sublead_eta) > 1.5]
        microaod_arr = microaod_arr[np.abs(microaod_arr.lead_eta) > 1.5]
        microaod_arr = microaod_arr[np.abs(microaod_arr.sublead_eta) > 1.5]

    # Read catalogue of variables to be plotted
    with open("plots_specs.json", "r") as f:
        columns = json.load(f)

    # Create dict where keys are names of variables in nano and values are
    # names of variables in micro
    nano_micro_names = {var["nano_col"]: var["micro_col"] for var in columns}
    nano_micro_names["event"] = "event"
    nano_micro_names["lumi"] = "lumi"

    # Event by event
    nano_dict = {k: nanoaod_arr[k] for k in nano_micro_names.keys()}
    nano_dict["lead_fixedGridRhoAll"] = nanoaod_arr["lead_fixedGridRhoAll"]  # needed for XGBoost vs TMVA
    test_nano = ak.Array(nano_dict)
    test_micro = microaod_arr[list(nano_micro_names.values())]

    pd_nano = ak.to_pandas(test_nano)
    pd_micro = ak.to_pandas(test_micro)
    pd_nano = pd_nano.set_index(["event", "lumi"])
    pd_micro = pd_micro.set_index(["event", "lumi"])
    pd_joined = pd_nano.join(pd_micro, lsuffix="_nano", rsuffix="_micro")
    print("Joined dataframe:\n{}".format(pd_joined))

    # Remove NaN values
    for nano_name, micro_name in nano_micro_names.items():
        if nano_name in ["event", "lumi"]:
            continue
        if nano_name == micro_name:
            nano_name += "_nano"
            micro_name += "_micro"
        pd_joined = pd_joined[pd_joined[nano_name].notna()]
        pd_joined = pd_joined[pd_joined[micro_name].notna()]

    # Cut over delta R
    # Here https://github.com/CoffeaTeam/coffea/blob/3db3fab23064c70d0ca63b185d51c7fa3b7849dc/coffea/nanoevents/methods/vector.py#L74
    # useful info
    deltaR_threshold = 0.1
    four_lead_nano = vector.obj(
        pt=pd_joined["lead_pt"],
        phi=pd_joined["lead_phi_nano"],
        eta=pd_joined["lead_eta_nano"],
        E=pd_joined["lead_energyRaw"]
    )
    four_sublead_nano = vector.obj(
        pt=pd_joined["sublead_pt"],
        phi=pd_joined["sublead_phi_nano"],
        eta=pd_joined["sublead_eta_nano"],
        E=pd_joined["sublead_energyRaw"]
    )
    pd_joined["deltaR_nano"] = four_lead_nano.deltaR(four_sublead_nano)

    four_lead_micro = vector.obj(
        pt=pd_joined["leadPt"],
        phi=pd_joined["lead_phi_micro"],
        eta=pd_joined["lead_eta_micro"],
        E=pd_joined["lead_SCRawE"]
    )
    four_sublead_micro = vector.obj(
        pt=pd_joined["subleadPt"],
        phi=pd_joined["sublead_phi_micro"],
        eta=pd_joined["sublead_eta_micro"],
        E=pd_joined["sublead_SCRawE"]
    )
    pd_joined["lead_deltaR"] = four_lead_nano.deltaR(four_lead_micro)
    pd_joined["sublead_deltaR"] = four_sublead_nano.deltaR(four_sublead_micro)
    pd_joined = pd_joined[pd_joined["lead_deltaR"] < deltaR_threshold]
    pd_joined = pd_joined[pd_joined["sublead_deltaR"] < deltaR_threshold]
    print("Final joined dataframe:\n{}".format(pd_joined))

    # Plot
    print("Start plotting")
    for column in columns:
        fig, (up, middle, down) = plt.subplots(
            nrows=3, ncols=1, gridspec_kw={"height_ratios": (2, 1, 1)}
        )
        nano_name = column["nano_col"]
        micro_name = column["micro_col"]
        if nano_name == micro_name:
            nano_name += "_nano"
            micro_name += "_micro"
        range = column["range"]

        # Up
        n, n_, n__ = up.hist(pd_joined[nano_name], bins=column["bins"],
                             range=range, histtype="step", label="NanoAOD",
                             linewidth=2)
        m, m_, m__ = up.hist(pd_joined[micro_name], bins=column["bins"],
                             range=range, histtype="step", label="MicroAOD",
                             linewidth=2)
        up.legend(fontsize=18, loc="upper right")
        up.set_xlim(range)
        up.set_xlabel(column["var"])
        up.set_ylabel("Events")
        if "log" in column:
            up.set_yscale("log")

        # Middle
        ylim = [0, 2]
        middle.set_ylim(ylim)
        # middle.axhline(1, xmin=range[0], xmax=range[1], color="black", alpha=0.6)
        centers = (n_[:-1] + n_[1:]) / 2
        middle.plot(centers, n / m, "k.")
        middle.set_xlim(range)
        middle.set_xlabel(column["var"])
        middle.set_ylabel(r"$n/\mu$")
        middle.grid(which="both")

        # Down
        perc_range = (-300, 300)
        perc_bins = 500
        down.hist(100 * (pd_joined[nano_name] - pd_joined[micro_name]) / pd_joined[micro_name],
                  bins=perc_bins, range=perc_range, histtype="step",
                  density=True, color="black", linewidth=2)
        # down.set_yscale("log")
        down.set_xlabel(r"$(n_{ev} - \mu_{ev})/\mu_{ev}$ [%]")
        down.set_ylabel("Events / {}%".format((perc_range[1] - perc_range[0]) / perc_bins))

        print(column["nano_col"])
        print("nano: {}".format(np.sum(n)))
        print("micro: {}".format(np.sum(m)))
        print("diff = {}".format(abs(np.sum(n) - np.sum(m))))
        print("rel diff = {}%\n".format(
            100 * abs(np.sum(n) - np.sum(m)) / max(np.sum(n), np.sum(m))))

        fig.tight_layout()
        fig.savefig("{}/{}_{}.png".format(args.output_dir, column["nano_col"], args.sd),
                    bbox_inches='tight')
        fig.savefig("{}/{}_{}.pdf".format(args.output_dir, column["nano_col"], args.sd),
                    bbox_inches='tight')
        plt.close(fig)

    # Dump pandas dataframe to parquet file
    pd_joined.to_parquet("nano_micro_{}.parquet".format(args.sd), engine="fastparquet")
    print("Dumped dataframe to parquet file")

    # Redundant: dump separate dataframes for nano and micro with PhotonID inputs
    nano_vars = {
        "r9": "lead_r9_nano",
        "s4": "lead_s4_nano",
        "sieie": "lead_sieie_nano",
        "etaWidth": "lead_etaWidth",
        "phiWidth": "lead_phiWidth",
        "sieip": "lead_sieip_nano",
        "pfPhoIso03": "lead_pfPhoIso03",
        "pfChargedIsoPFPV": "lead_pfChargedIsoPFPV",
        "pfChargedIsoWorstVtx": "lead_pfChargedIsoWorstVtx",
        "mva_ID": "lead_mvaID_recomputed"
    }
    micro_vars = {
        "r9": "lead_r9_micro",
        "s4": "lead_s4_micro",
        "sieie": "lead_sieie_micro",
        "etaWidth": "lead_eta_width",
        "phiWidth": "lead_phi_width",
        "sieip": "lead_sieip_micro",
        "pfPhoIso03": "lead_pho_iso",
        "pfChargedIsoPFPV": "lead_ch_iso",
        "pfChargedIsoWorstVtx": "lead_ch_iso_worst",
        "mva_ID": "lead_mva"
    }
    nano_isos = {
        "pfPhoIso03": "lead_pfPhoIso03",
        "pfChargedIsoPFPV": "lead_pfChargedIsoPFPV",
        "pfChargedIsoWorstVtx": "lead_pfChargedIsoWorstVtx",
        "pfPhoIso03_uncorr": "lead_uncorr_pfPhoIso03",
        "pfChargedIsoPFPV_uncorr": "lead_uncorr_pfChargedIsoPFPV",
        "pfChargedIsoWorstVtx_uncorr": "lead_uncorr_pfChargedIsoWorstVtx",
    }
    micro_isos = {
        "pfPhoIso03": "lead_pho_iso",
        "pfChargedIsoPFPV": "lead_ch_iso",
        "pfChargedIsoWorstVtx": "lead_ch_iso_worst",
        "pfPhoIso03_uncorr": "lead_pho_iso_uncorr",
        "pfChargedIsoPFPV_uncorr": "lead_ch_iso_uncorr",
        "pfChargedIsoWorstVtx_uncorr": "lead_ch_iso_worst_uncorr",
    }

    nano_df = pd_joined[list(nano_vars.values())]
    nano_df.rename(columns=dict((v, k) for k, v in nano_vars.items()), inplace=True)
    nano_df.to_parquet("nano_{}.parquet".format(args.sd), engine="fastparquet")
    print("Dumped nano dataframe to parquet file")
parquet file") micro_df = pd_joined[list(micro_vars.values())] micro_df.rename(columns=dict((v, k) for k, v in micro_vars.items()), inplace=True) micro_df.to_parquet("micro_{}.parquet".format(args.sd), engine="fastparquet") print("Dumped micro dataframe to parquet file") nano_df = pd_joined[list(nano_isos.values())] nano_df.rename(columns=dict((v, k) for k, v in nano_isos.items()), inplace=True) nano_df.to_parquet("nano_{}_isos.parquet".format(args.sd), engine="fastparquet") print("Dumped nano dataframe for isos to parquet file") micro_df = pd_joined[list(micro_isos.values())] micro_df.rename(columns=dict((v, k) for k, v in micro_isos.items()), inplace=True) micro_df.to_parquet("micro_{}_isos.parquet".format(args.sd), engine="fastparquet") print("Dumped micro dataframe for isos to parquet file")
    'MC_WZmu_el_pt_born',
    'MC_WZneutrino_pt_born',
    'MC_WZmu_el_phi_born',
    'MC_WZneutrino_phi_born',
    'MC_WZ_dilep_m_born',
    'mcChannelNumber',
    'weight_mc',
    'KFactor_weight_truth',
    'weight_pileup',
    'eventNumber',
]

# pull root data
t = time.time()
truth_df = to_pandas(
    uproot.concatenate(DATAFILE + ':truth', BRANCHES, num_workers=N_THREADS))
print(f"Importing from ROOT: {time.time() - t:.3f}s")

# # delete duplicate events
# t = time.time()
# len_before = len(truth_df.index)
# truth_df.drop_duplicates('eventNumber', keep='first', inplace=True)
# print(f"Dropping duplicates: {time.time() - t:.3f}s ({len_before - len(truth_df.index)} duplicates found)")

# calculate sum of weights
t = time.time()
sumw = to_pandas(
    uproot.concatenate(DATAFILE + ':sumWeights', ['dsid', 'totalEventsWeighted'],
                       num_workers=N_THREADS))
sumw = sumw.groupby('dsid').sum()
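# A sketch of the merge step that typically follows (mirroring build_dataframe
# earlier in this section; `truth_df` would gain the per-DSID sum of weights):
# truth_df = pd.merge(truth_df, sumw, left_on='mcChannelNumber',
#                     right_on='dsid', sort=False, copy=False)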