def test_function_iterate_pandas_2():
    pandas = pytest.importorskip("pandas")
    files = (
        skhep_testdata.data_path("uproot-HZZ.root").replace(
            "HZZ", "HZZ-{uncompressed,zlib,lz4}"
        )
        + ":events"
    )
    expect = 0
    for arrays, report in uproot4.iterate(files, "Muon_Px", report=True, library="pd"):
        assert arrays["Muon_Px"].index.values[0] == (expect, 0)
        expect += report.tree.num_entries

def test_function_iterate():
    files = skhep_testdata.data_path("uproot-sample-6.20.04-uncompressed.root").replace(
        "6.20.04", "*"
    )
    expect = 0
    for arrays, report in uproot4.iterate(
        {files: "sample"}, "i8", report=True, library="np"
    ):
        assert arrays["i8"][:5].tolist() == [-15, -14, -13, -12, -11]
        assert report.global_entry_start == expect
        assert report.global_entry_stop == expect + len(arrays["i8"])
        expect += len(arrays["i8"])

def test_iterate():
    with pytest.raises(ValueError):
        for arrays in uproot4.iterate(skhep_testdata.data_path("uproot-issue63.root")):
            pass

    assert (
        len(
            list(
                uproot4.iterate(
                    {skhep_testdata.data_path("uproot-issue63.root"): "blah"},
                    allow_missing=True,
                )
            )
        )
        == 0
    )

    files = skhep_testdata.data_path("uproot-sample-6.16.00-uncompressed.root").replace(
        "6.16.00", "*"
    )

    for arrays in uproot4.iterate(files, "Ai8"):
        pass
    for arrays in uproot4.iterate({files: "sample"}, "Ai8"):
        pass
    for arrays in uproot4.iterate([files], "Ai8"):
        pass
    for arrays in uproot4.iterate([{files: "sample"}], "Ai8"):
        pass

def test_function_iterate_pandas():
    pandas = pytest.importorskip("pandas")
    files = skhep_testdata.data_path("uproot-sample-6.20.04-uncompressed.root").replace(
        "6.20.04", "*"
    )
    expect = 0
    for arrays, report in uproot4.iterate(
        {files: "sample"}, "i8", report=True, library="pd"
    ):
        assert arrays["i8"].values[:5].tolist() == [-15, -14, -13, -12, -11]
        assert arrays.index.values[0] == expect
        assert report.global_entry_start == expect
        assert report.global_entry_stop == expect + len(arrays["i8"])
        expect += len(arrays["i8"])

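# A minimal sketch of the report-based iteration pattern the tests above
# exercise; the file glob "data*.root", tree "events", and branch "some_branch"
# are hypothetical placeholders.
def sketch_iterate_with_report():
    expect = 0
    for arrays, report in uproot4.iterate(
        "data*.root:events", "some_branch", report=True, library="np"
    ):
        # each report records where this chunk sits in the entry range
        # concatenated across all matched files
        assert report.global_entry_start == expect
        expect = report.global_entry_stop
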
def iter_chunks(
    path,
    treename="t",
    progress=True,
    step_size="50MB",
    columns=None,
    nthreads=4,
):
    """
    Loop over the specified ROOT file(s) in `path` in chunks, yielding
    dataframes.

    The tree name is given by `treename` (used only if `path` does not already
    contain a ":tree" suffix). Files are read in chunks of `step_size` (as per
    `uproot4.iterate`).

    columns: list of columns ("branches") to read (default of `None` reads all)
    """
    if ":" not in path:
        path = f"{path}:{treename}"
    iterable = uproot4.iterate(
        path,
        filter_name=columns,
        step_size=step_size,
        decompression_executor=concurrent.futures.ThreadPoolExecutor(nthreads),
    )
    if progress:
        iterable = tqdm(iterable)
    nevents = 0
    t0 = time.time()
    for arrays in iterable:
        df = awkward1_arrays_to_dataframe(arrays)
        nevents += len(df)
        yield df
    t1 = time.time()
    if progress:
        print(
            f"Processed {nevents} events in {t1-t0:.2f}s ({1e-6*nevents/(t1-t0):.2f}MHz)"
        )

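# A usage sketch for iter_chunks, assuming a local file "ntuple.root" with a
# tree "t" and a numeric branch "pt"; each chunk arrives as a DataFrame, so a
# reduction can run incrementally without loading the whole tree into memory.
def sketch_iter_chunks_usage():
    total = 0.0
    for df in iter_chunks("ntuple.root", treename="t", columns=["pt"], progress=False):
        total += df["pt"].sum()
    return total
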
def test_branch_pluralization():
    awkward1 = pytest.importorskip("awkward1")
    with uproot4.open(skhep_testdata.data_path("uproot-Zmumu.root"))[
        "events/px1"
    ] as px1:
        assert px1.array(library="np")[:5].tolist() == [
            -41.1952876442,
            35.1180497674,
            35.1180497674,
            34.1444372454,
            22.7835819537,
        ]
        assert px1.arrays(library="np")["px1"][:5].tolist() == [
            -41.1952876442,
            35.1180497674,
            35.1180497674,
            34.1444372454,
            22.7835819537,
        ]
        for i, arrays in enumerate(px1.iterate(library="np", step_size=1000)):
            if i == 0:
                assert arrays["px1"][:5].tolist() == [
                    -41.1952876442,
                    35.1180497674,
                    35.1180497674,
                    34.1444372454,
                    22.7835819537,
                ]
            elif i == 1:
                assert arrays["px1"][:5].tolist() == [
                    26.043758785,
                    26.043758785,
                    25.9962042016,
                    -44.4626620943,
                    28.2794901505,
                ]
            elif i == 2:
                assert arrays["px1"][:5].tolist() == [
                    -43.3783782352,
                    -43.3783782352,
                    -43.2444221651,
                    -20.2126675303,
                    43.7131175076,
                ]
            else:
                assert False

    for i, arrays in enumerate(
        uproot4.iterate({skhep_testdata.data_path("uproot-Zmumu.root"): "events/px1"})
    ):
        if i == 0:
            assert arrays["px1"][:5].tolist() == [
                -41.1952876442,
                35.1180497674,
                35.1180497674,
                34.1444372454,
                22.7835819537,
            ]
        elif i == 1:
            assert arrays["px1"][:5].tolist() == [
                26.043758785,
                26.043758785,
                25.9962042016,
                -44.4626620943,
                28.2794901505,
            ]
        elif i == 2:
            assert arrays["px1"][:5].tolist() == [
                -43.3783782352,
                -43.3783782352,
                -43.2444221651,
                -20.2126675303,
                43.7131175076,
            ]
        else:
            assert False

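# A minimal sketch of the single-branch iteration used above: open one TBranch
# from the same scikit-hep-testdata file and walk it in fixed-size steps.
def sketch_branch_iterate():
    with uproot4.open(skhep_testdata.data_path("uproot-Zmumu.root"))[
        "events/px1"
    ] as px1:
        for arrays in px1.iterate(library="np", step_size=1000):
            print(len(arrays["px1"]))
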
def from_uproot(
    ntuple_paths: List[pathlib.Path],
    pos_in_file: str,
    variable: str,
    bins: np.ndarray,
    weight: Optional[str] = None,
    selection_filter: Optional[str] = None,
) -> Tuple[np.ndarray, np.ndarray]:
    """Reads an ntuple with uproot and fills a histogram with the observable.

    The paths may contain wildcards.

    Args:
        ntuple_paths (List[pathlib.Path]): list of paths to ntuples
        pos_in_file (str): name of tree within ntuple
        variable (str): variable to bin histogram in
        bins (np.ndarray): bin edges for histogram
        weight (Optional[str], optional): event weight to extract, defaults to
            None (no weights applied)
        selection_filter (Optional[str], optional): filter to be applied on
            events, defaults to None (no filter)

    Returns:
        Tuple[np.ndarray, np.ndarray]:
            - yield per bin
            - stat. uncertainty per bin
    """
    # concatenate the path to file and location within file with ":"
    paths_with_trees = [str(path) + ":" + pos_in_file for path in ntuple_paths]

    # determine whether the weight is a float or an expression
    # (for which a branch needs to be read)
    if weight is not None:
        try:
            float(weight)
            weight_is_expression = False
        except ValueError:
            # weight is not a float, need to evaluate the expression
            weight_is_expression = True
    else:
        # no weight specified, all weights are 1.0
        weight_is_expression = False
        weight = "1.0"

    if weight_is_expression:
        # need to read observables and weights
        array_generator = uproot.iterate(
            paths_with_trees,
            expressions=[variable, weight],
            cut=selection_filter,
        )
        obs_list = []
        weight_list = []
        for arr in array_generator:
            obs_list.append(ak.to_numpy(arr[variable]))
            weight_list.append(ak.to_numpy(arr[weight]))
        observables = np.concatenate(obs_list)
        weights = np.concatenate(weight_list)
    else:
        # only need to read the observables
        array_generator = uproot.iterate(
            paths_with_trees,
            expressions=[variable],
            cut=selection_filter,
        )
        obs_list = []
        for arr in array_generator:
            obs_list.append(ak.to_numpy(arr[variable]))
        observables = np.concatenate(obs_list)
        weights = np.ones_like(observables) * float(weight)

    yields, stdev = _bin_data(observables, weights, bins)
    return yields, stdev

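# A usage sketch for from_uproot; the ntuple names "sample_a.root" and
# "sample_b.root", tree "nominal", branch "mjj", weight "weight_mc", and the
# selection string are all hypothetical. The two returned arrays each have
# len(bins) - 1 entries: the yield and its statistical uncertainty per bin.
def sketch_from_uproot_usage():
    bins = np.linspace(0.0, 1000.0, 11)
    yields, stdev = from_uproot(
        [pathlib.Path("sample_a.root"), pathlib.Path("sample_b.root")],
        pos_in_file="nominal",
        variable="mjj",
        bins=bins,
        weight="weight_mc",
        selection_filter="njets >= 2",
    )
    return yields, stdev
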
def AddFFcorr(infname,
              intreename,
              outfname,
              outtreename,
              Lcstate,
              leptname,
              q2True_branchname,
              costhlTrue_branchname,
              nentries_to_read=1000000000,
              chunksize=10000):
    TH1.AddDirectory(kFALSE)

    perfname = None
    q2factor = None
    if Lcstate == 'Lc':
        perfname = './CorrectionTables/LcFFratios.root'
        q2factor = 1.
    elif (Lcstate == 'Lc2595' or Lcstate == 'Lc2625'):
        perfname = './CorrectionTables/LcstFFratios.root'
        q2factor = 1e-6
    else:
        raise Exception('Lc state not recognised', Lcstate)

    if leptname != 'mu' and leptname != 'tau':
        raise Exception('Lepton name not recognised', leptname)

    print('Using the histname', Lcstate + leptname + "_ratio")

    # variables to get from file
    varsdf = ['runNumber', 'eventNumber']
    varsdf += ['Lb_TRUEP_X', 'Lb_TRUEP_Y', 'Lb_TRUEP_Z', 'Lb_TRUEP_E']
    varsdf += ['Lc_TRUEP_X', 'Lc_TRUEP_Y', 'Lc_TRUEP_Z', 'Lc_TRUEP_E']
    varsdf += [
        'Lb_True' + leptname.capitalize() + '_PX',
        'Lb_True' + leptname.capitalize() + '_PY',
        'Lb_True' + leptname.capitalize() + '_PZ',
        'Lb_True' + leptname.capitalize() + '_PE'
    ]
    varsdf += [
        'Lb_TrueNeutrino_PX', 'Lb_TrueNeutrino_PY', 'Lb_TrueNeutrino_PZ',
        'Lb_TrueNeutrino_PE'
    ]

    File = TFile.Open(perfname, "read")
    Histg = File.Get(Lcstate + leptname + "_ratio")
    perfHist = Histg.Clone(Lcstate + leptname + "_rationew")
    File.Close()

    Xmin = perfHist.GetXaxis().GetXmin()
    Xmax = perfHist.GetXaxis().GetXmax()
    Ymin = perfHist.GetYaxis().GetXmin()
    Ymax = perfHist.GetYaxis().GetXmax()
    Limits = (Xmin, Xmax, Ymin, Ymax)
    print(Limits, perfHist.Integral())

    # variables to store in the new ttree
    varstoStore = {
        'runNumber': np.int64,
        'eventNumber': np.int64,
        'Event_FFcorr': np.float64,
        costhlTrue_branchname: np.float64,
        q2True_branchname: np.float64
    }
    aliases = {}

    # create a new rootfile
    with uproot3.recreate(outfname) as f:
        f[outtreename] = uproot3.newtree(varstoStore)

        # loop over the old rootfile chunkwise
        events_read = 0
        if chunksize >= nentries_to_read:
            chunksize = nentries_to_read

        for df_data in uproot4.iterate(infname + ':' + intreename,
                                       varsdf,
                                       aliases=aliases,
                                       cut=None,
                                       library="pd",
                                       step_size=chunksize):
            if events_read >= nentries_to_read:
                break

            # Compute q2 and cosThetaL
            pxl = df_data['Lb_True' + leptname.capitalize() + '_PX']
            pxnu = df_data['Lb_TrueNeutrino_PX']
            pyl = df_data['Lb_True' + leptname.capitalize() + '_PY']
            pynu = df_data['Lb_TrueNeutrino_PY']
            pzl = df_data['Lb_True' + leptname.capitalize() + '_PZ']
            pznu = df_data['Lb_TrueNeutrino_PZ']
            pel = df_data['Lb_True' + leptname.capitalize() + '_PE']
            penu = df_data['Lb_TrueNeutrino_PE']

            if (Lcstate == 'Lc2595' or Lcstate == 'Lc2625'):
                # this should be the Lcstar momentum: Lb minus lepton and neutrino
                pxlc = df_data['Lb_TRUEP_X'] - pxl - pxnu
                pylc = df_data['Lb_TRUEP_Y'] - pyl - pynu
                pzlc = df_data['Lb_TRUEP_Z'] - pzl - pznu
                pelc = df_data['Lb_TRUEP_E'] - pel - penu
            elif Lcstate == 'Lc':
                pxlc = df_data['Lc_TRUEP_X']
                pylc = df_data['Lc_TRUEP_Y']
                pzlc = df_data['Lc_TRUEP_Z']
                pelc = df_data['Lc_TRUEP_E']

            # Format of LorentzVector(Vector(X, Y, Z), E)
            PLc_lab = LorentzVector(Vector(pxlc, pylc, pzlc), pelc)
            Pl_lab = LorentzVector(Vector(pxl, pyl, pzl), pel)
            PNu_lab = LorentzVector(Vector(pxnu, pynu, pznu), penu)
            PLb_lab = PLc_lab + Pl_lab + PNu_lab
            qsq, cthl = return_phasespace(PLb_lab, PLc_lab, Pl_lab)
            #print(qsq, cthl)
            df_data[q2True_branchname] = qsq
            df_data[costhlTrue_branchname] = cthl

            # get the corrections
            # has to be in the correct order, like in the histogram
            applyvars = [q2True_branchname, costhlTrue_branchname]
            df_data['Event_FFcorr'] = df_data[applyvars].apply(
                storeeff2D, args=[perfHist, Limits, q2factor], axis=1)

            # get only the things that need to be stored and write them to the file
            branch_dict = {
                vartostore: df_data[vartostore].to_numpy()
                for vartostore in list(varstoStore.keys())
            }
            f[outtreename].extend(branch_dict)
            events_read += df_data.shape[0]
            print('Events read', events_read)

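# A call sketch for AddFFcorr; the input/output file and branch names here are
# hypothetical, and ./CorrectionTables/LcFFratios.root must exist as assumed
# by the function itself.
def sketch_addffcorr_usage():
    AddFFcorr("Lb2Lcmunu_truth.root", "DecayTree",
              "Lb2Lcmunu_truth_FFcorr.root", "DecayTree",
              Lcstate="Lc", leptname="mu",
              q2True_branchname="q2_True",
              costhlTrue_branchname="costhl_True",
              chunksize=100000)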