def test_lazy_colon():
    uproot.lazy(skhep_testdata.data_path("uproot-issue63.root") + ":WtLoop_nominal")
    uproot.lazy(
        [
            skhep_testdata.data_path("uproot-issue63.root") + ":WtLoop_nominal",
            skhep_testdata.data_path("uproot-issue63.root") + ":WtLoop_Fake_nominal",
        ]
    )
def getData(fnames="", treeName="Events", chunks=False): branchlist = [] for collection, attrs in branches.items(): branchlist += [collection + "_" + attr for attr in attrs] if chunks: ldmx_dict = uproot.iterate(fnames + ":" + treeName, branchlist) else: ldmx_dict = uproot.lazy(fnames + ":" + treeName, branchlist) return ldmx_dict
def do_the_work(file: Path) -> ak.Array:
    import uproot

    with uproot.open(file) as f_in:
        tree_name = f_in.keys()[0]

    return uproot.lazy(f'{file}:{tree_name}')
def test_awkward_pluralization():
    awkward = pytest.importorskip("awkward")
    files = skhep_testdata.data_path("uproot-sample-6.20.04-uncompressed.root").replace(
        "6.20.04", "*"
    )
    array = uproot.lazy({files: "sample"})
    assert awkward.to_list(array[:5, "i4"]) == [-15, -14, -13, -12, -11]
def test():
    array = uproot.lazy(skhep_testdata.data_path("uproot-HZZ-objects.root") + ":events")
    assert array.jetp4.fP.fX[:5].tolist() == [
        [],
        [-38.87471389770508],
        [],
        [-71.6952133178711, 36.60636901855469, -28.866418838500977],
        [3.880161762237549, 4.979579925537109],
    ]
def test_awkward():
    awkward = pytest.importorskip("awkward")
    files = skhep_testdata.data_path("uproot-sample-6.20.04-uncompressed.root").replace(
        "6.20.04", "*"
    )
    cache = {}
    array = uproot.lazy({files: "sample"}, array_cache=cache)
    assert len(cache) == 0

    assert awkward.to_list(array[:5, "i4"]) == [-15, -14, -13, -12, -11]
    assert len(cache) == 1

    assert awkward.to_list(array[:5, "ai4"]) == [
        [-14, -13, -12],
        [-13, -12, -11],
        [-12, -11, -10],
        [-11, -10, -9],
        [-10, -9, -8],
    ]
    assert len(cache) == 2

    assert awkward.to_list(array[:5, "Ai4"]) == [
        [],
        [-15],
        [-15, -13],
        [-15, -13, -11],
        [-15, -13, -11, -9],
    ]
    assert len(cache) == 3

    assert awkward.to_list(array[:5, "str"]) == [
        "hey-0",
        "hey-1",
        "hey-2",
        "hey-3",
        "hey-4",
    ]
    assert len(cache) == 4
def main(args):
    logger = setup_logging()

    v0_input_dir = args.v0_input_dir
    vcustom_input_dir = args.vcustom_input_dir
    output_dir = args.output_dir
    channel = args.channel
    tree_name = tree_name_tmpl.format(channel)

    # Needed names for files and trees
    v0_file = v0_input_dir + "/" + file_names_tmpl[channel]
    v_custom_file = vcustom_input_dir + "/" + file_names_tmpl[channel]

    ranges = {
        "pt": {
            "range": (0, 300),
            "label": "$p_T$"
        },
    }

    for var, specs in ranges.items():
        logger.info("Working with {}".format(var))

        # Read two trees lazily
        imp_variables = [var] + ["vtx_z", "gen_vtx_z", "weight"]
        arr_vtx0 = uproot.lazy(["{}:{}".format(v0_file, tree_name)], imp_variables)
        arr_vtxc = uproot.lazy(["{}:{}".format(v_custom_file, tree_name)], imp_variables)

        # Compute quantities
        n_ranges = 35
        var_range = np.linspace(specs["range"][0], specs["range"][1], n_ranges)
        var_ranges = []
        inf = var_range[0]
        for sup in var_range[1:]:
            var_ranges.append((inf, sup))
            inf = sup

        x_vtx0, x_vtxc, y_vtx0, y_vtxc = {}, {}, {}, {}
        xs = [np.mean(rng) for rng in var_ranges]
        x_vtx0["values"] = xs
        x_vtxc["values"] = xs
        x_vtx0["unc"] = [
            np.std(arr_vtx0[(arr_vtx0[var] > rng[0]) & (arr_vtx0[var] < rng[1])][var].to_numpy())
            for rng in var_ranges
        ]
        x_vtxc["unc"] = [
            np.std(arr_vtxc[(arr_vtxc[var] > rng[0]) & (arr_vtxc[var] < rng[1])][var].to_numpy())
            for rng in var_ranges
        ]
        y_vtx0["values"], y_vtx0["unc"] = count_fraction(arr_vtx0, var, var_ranges)
        y_vtxc["values"], y_vtxc["unc"] = count_fraction(arr_vtxc, var, var_ranges)

        # Plot
        fig, (ax, rax) = plt.subplots(nrows=2,
                                      ncols=1,
                                      gridspec_kw={"height_ratios": (3, 1)},
                                      sharex=True)
        ax.errorbar(x_vtx0["values"],
                    y_vtx0["values"],
                    xerr=x_vtx0["unc"],
                    yerr=np.array(y_vtx0["unc"]).T,
                    fmt='ro',
                    label="Vertex 0th")
        ax.errorbar(x_vtxc["values"],
                    y_vtxc["values"],
                    xerr=x_vtxc["unc"],
                    yerr=np.array(y_vtxc["unc"]).T,
                    fmt='bs',
                    label="Vertex Reco")
        rdiff = [
            rel_diff_asymm(v0, vc, v0_uncs, vc_uncs)
            for v0, vc, v0_uncs, vc_uncs in zip(y_vtx0["values"], y_vtxc["values"],
                                                y_vtx0["unc"], y_vtxc["unc"])
        ]
        rax.errorbar(x_vtx0["values"],
                     y=[rd[0] for rd in rdiff],
                     yerr=np.array([rd[1] for rd in rdiff]).T,
                     fmt='ko')
        ax.legend(fontsize=18, loc="lower right")
        rax.set_xlabel(specs["label"])
        ax.set_ylabel("Fraction of |$Z_{reco}$ - $Z_{true}$| < 10 mm")
        rax.set_ylabel(r"$rel\ diff$")
        ax.set_ylim(*y_lims[var][channel]["ax"])
        rax.set_ylim(*y_lims[var][channel]["rax"])
        ax.set_xlim(left=0.)
        rax.set_xlim(left=0.)
        ax.grid(which="both")
        rax.grid(which="both")

        output_name = "{}_id_efficiency".format(var)
        hep.cms.label(loc=0, data=True, llabel="Work in Progress", rlabel="", ax=ax, pad=.05)
        fig.savefig("{}/{}.png".format(output_dir, output_name), bbox_inches='tight')
        fig.savefig("{}/{}.pdf".format(output_dir, output_name), bbox_inches='tight')
        logger.info("Dumped plot in {}".format(output_dir))
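# NOTE: count_fraction() and rel_diff_asymm() are helpers defined elsewhere in this
# script. A minimal sketch of what count_fraction() might look like, inferred from the
# y-axis label above (fraction of events with |vtx_z - gen_vtx_z| below some threshold,
# per bin of `var`); the 10 mm threshold and the normal-approximation binomial error
# are assumptions, not the original implementation:
def count_fraction_sketch(arr, var, var_ranges, threshold=10.0):
    values, uncs = [], []
    for lo, hi in var_ranges:
        sel = arr[(arr[var] > lo) & (arr[var] < hi)]
        n_tot = len(sel)
        n_pass = np.sum(np.abs((sel["vtx_z"] - sel["gen_vtx_z"]).to_numpy()) < threshold)
        frac = n_pass / n_tot if n_tot > 0 else 0.0
        err = np.sqrt(frac * (1 - frac) / n_tot) if n_tot > 0 else 0.0
        values.append(frac)
        uncs.append((err, err))  # (lower, upper) pair, matching the yerr=np.array(...).T usage above
    return values, uncs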
def load_df():
    import uproot

    with uproot.open(good_root_file_path) as f_in:
        tree_name = f_in.keys()[0]

    return uproot.lazy(f'{good_root_file_path}:{tree_name}')
def get_gen_events(self):
    # tree_gen = uproot.open(self.fileName)[self.treeName_gen]
    # events = tree_gen.arrays()  # generator events
    tree_path = self.__define_tree_expression(is_gen=True)
    events = uproot.lazy(tree_path)
    return events
def get_events(self):
    # tree_in = uproot.open(self.fileName)[self.treeName]
    # events = tree_in.arrays()  # filtered events
    tree_path = self.__define_tree_expression(is_gen=False)
    events = uproot.lazy(tree_path)
    return events
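# NOTE: __define_tree_expression() is a private helper of the same class, not shown
# here. A minimal sketch of what it might do, assuming the class stores fileName,
# treeName and treeName_gen attributes (the "file:tree" string is the form that
# uproot.lazy accepts, as in the snippets above):
def __define_tree_expression(self, is_gen):
    tree_name = self.treeName_gen if is_gen else self.treeName
    return "{}:{}".format(self.fileName, tree_name)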
def test_lazy():
    with pytest.raises(ValueError):
        uproot.lazy(skhep_testdata.data_path("uproot-issue63.root"))

    with pytest.raises(ValueError):
        uproot.lazy(
            {skhep_testdata.data_path("uproot-issue63.root"): "blah"},
            allow_missing=True,
        )

    uproot.lazy({skhep_testdata.data_path("uproot-issue63.root"): "WtLoop_nominal"})

    # Note: both entries share the same file path, so this dict literal collapses to a
    # single key; only the "WtLoop_Fake_nominal" value survives.
    uproot.lazy({
        skhep_testdata.data_path("uproot-issue63.root"): "WtLoop_nominal",
        skhep_testdata.data_path("uproot-issue63.root"): "WtLoop_Fake_nominal",
    })

    uproot.lazy([{
        skhep_testdata.data_path("uproot-issue63.root"): "WtLoop_nominal"
    }])

    uproot.lazy({
        skhep_testdata.data_path("uproot-issue63.root") + "*": "WtLoop_nominal"
    })

    uproot.lazy([{
        skhep_testdata.data_path("uproot-issue63.root") + "*": "WtLoop_nominal"
    }])
# base + 'Autumn18.QCD_HT1500to2000_TuneCP5_13TeV-madgraphMLM-pythia8_RA2AnalysisTree.root': 'TreeMaker2/PreSelection',
# base + 'Autumn18.QCD_HT2000toInf_TuneCP5_13TeV-madgraphMLM-pythia8_RA2AnalysisTree.root': 'TreeMaker2/PreSelection',
# #base + 'PrivateSamples.SUEP_2018_mMed-400_mDark-2_temp-2_decay-darkPhoHad_13TeV-pythia8_n-100_0_RA2AnalysisTree.root': 'TreeMaker2/PreSelection',
#}
datasets = {
    # base + 'Autumn18.QCD_HT1000to1500_TuneCP5_13TeV-madgraphMLM-pythia8_0_RA2AnalysisTree.root': 'TreeMaker2/PreSelection',
    base + 'Autumn18.QCD_HT1500to2000_TuneCP5_13TeV-madgraphMLM-pythia8_0_RA2AnalysisTree.root': 'TreeMaker2/PreSelection',
    # base + 'Autumn18.QCD_HT2000toInf_TuneCP5_13TeV-madgraphMLM-pythia8_0_RA2AnalysisTree.root': 'TreeMaker2/PreSelection',
}
dataset2 = {'qcd_CUETP8M1.root': 'tree'}
#dataset3 = {'qcd_CUETP8M1_up.root': 'tree'}
#dataset4 = {'qcd_CUETP8M1_low.root': 'tree'}

events = uproot.lazy(datasets)
pythia = uproot.lazy(dataset2)
#pythia_up = uproot.lazy(dataset3)
#pythia_low = uproot.lazy(dataset4)

multiplicity_pythia = pythia['nTracks']
#multiplicity_pythia_up = pythia_up['nTracks']
#multiplicity_pythia_low = pythia_low['nTracks']

met = events['MET']
ht = events['HT']
pv_x = events['PrimaryVertices.fCoordinates.fX']
CrossSection = events['CrossSection']

"""tracks_x = events['Tracks.fCoordinates.fX']
tracks_y = events['Tracks.fCoordinates.fY']
tracks_z = events['Tracks.fCoordinates.fZ']
tracks_fromPV0 = events['Tracks_fromPV0']
def main():
    logger = setup_logging()

    # Needed names for files and trees
    v0_file = "/work/gallim/root_files/vertex_investigation/VertexInvestigation_vtx0/output_GluGluHToGG_M125_TuneCP5_13TeV-amcatnloFXFX-pythia8_storeWeights_alesauva-UL2018_0-10_6_4-v0-RunIISummer19UL18MiniAOD-106X_upgrade2018_realistic_v11_L1v1-v1-3f96409841a3cc85b911eb441562baae_USER_*.root"
    v_custom_file = "/work/gallim/root_files/vertex_investigation/VertexInvestigation/output_GluGluHToGG_M125_TuneCP5_13TeV-amcatnloFXFX-pythia8_storeWeights_alesauva-UL2018_0-10_6_4-v0-RunIISummer19UL18MiniAOD-106X_upgrade2018_realistic_v11_L1v1-v1-3f96409841a3cc85b911eb441562baae_USER_*.root"
    tree_name = "diphotonDumper/trees/ggH_125_13TeV_All_$SYST"
    output_dir = "/eos/home-g/gallim/www/plots/Hgg/VertexInvestigation/mass_fit"

    # Read two trees lazily
    imp_variables = ["weight", "lead_eta", "sublead_eta", "sigma_m", "mass"]
    arr_vtx0 = uproot.lazy(["{}:{}".format(v0_file, tree_name)], imp_variables)
    arr_vtxc = uproot.lazy(["{}:{}".format(v_custom_file, tree_name)], imp_variables)
    arrays = {"vtx0": arr_vtx0, "vtxc": arr_vtxc}

    # Define categories
    categories = {"EBEB": EBEB_mask, "EBEE": EBEE_mask, "EEEE": EEEE_mask}
    masked_arrays = {"EBEB": {}, "EBEE": {}, "EEEE": {}}
    histos = {}
    fits = {"EBEB": {}, "EBEE": {}, "EEEE": {}}

    # Define zfit objects for fits
    logger.info("Creating zfit objects")
    fit_range = [115, 135]
    obs = zfit.Space("M", limits=fit_range)
    mu1 = zfit.Parameter("mu1", 125, 120, 130)
    sigma1 = zfit.Parameter("sigma1", 1, 0.1, 10)
    mu2 = zfit.Parameter("mu2", 125, 120, 130)
    sigma2 = zfit.Parameter("sigma2", 1, 0.1, 10)
    n = zfit.Parameter("n", 1, 0, 10)
    alpha = zfit.Parameter("alpha", 1, 0, 10)
    frac = zfit.Parameter("frac", 0.5, 0, 1)
    gauss = zfit.pdf.Gauss(obs=obs, mu=mu1, sigma=sigma1)
    cb = zfit.pdf.CrystalBall(obs=obs, mu=mu2, sigma=sigma2, n=n, alpha=alpha)
    model = zfit.pdf.SumPDF(pdfs=[gauss, cb], fracs=frac)
    minimizer = zfit.minimize.Minuit()

    variables = [
        {"name": "mass", "bins": 100, "range": [115, 135]},
        {"name": "sigma_m", "bins": 80, "range": [0., 0.035]},
    ]

    # Loop over categories
    for cat_name, func in categories.items():
        logger.info("Working with category {}".format(cat_name))
        for vtx_name, arr in arrays.items():
            masked_arrays[cat_name][vtx_name] = arr[func(arr)]

        histos[cat_name] = hist.Hist(
            "Density",
            hist.Cat("vertex", "Vertex"),
            *[
                hist.Bin(spec["name"], spec["name"], spec["bins"], *spec["range"])
                for spec in variables
            ]
        )

        # fill histos
        for vtx_name, arr in masked_arrays[cat_name].items():
            histos[cat_name].fill(vertex=vtx_name,
                                  mass=arr["mass"],
                                  sigma_m=arr["sigma_m"],
                                  weight=arr["weight"])

        # Plot superimposed vertex values for mass and sigma_m (from flashgg)
        for var in variables:
            logger.info("Creating plot for variable {}".format(var["name"]))
            fig, ax = plt.subplots()
            loc_vars = [sp["name"] for sp in variables]
            loc_vars.remove(var["name"])
            hist.plot1d(histos[cat_name].sum(*loc_vars), density=True)
            output_name = "{}_{}".format(var["name"], cat_name)
            hep.cms.label(loc=0, data=True, llabel="Work in Progress", rlabel="", ax=ax, pad=.05)
            fig.savefig("{}/{}.png".format(output_dir, output_name), bbox_inches='tight')
            fig.savefig("{}/{}.pdf".format(output_dir, output_name), bbox_inches='tight')

        # Fits
        logger.info("Proceed with fits")
        x = np.linspace(115, 135, 1000)  # x values for model in plots
        for vtx_name, arr in masked_arrays[cat_name].items():
            data = zfit.Data.from_numpy(obs=obs, array=arr["mass"].to_numpy())
            nll = zfit.loss.UnbinnedNLL(model=model, data=data)
            fits[cat_name]["result"] = minimizer.minimize(nll)
            fits[cat_name]["param_errors"] = fits[cat_name]["result"].hesse()

            # Compute chi-square and p-value
            logger.info("Computing goodness of fit")
            parameters = [mu1, sigma1, mu2, sigma2, n, alpha, frac]
            observed_values, observed_edges = np.histogram(
                arr["mass"].to_numpy(), variables[0]["bins"], variables[0]["range"])
            observed_centers = .5 * (observed_edges[1:] + observed_edges[:-1])
            plot_scale = len(arr["mass"]) / variables[0]["bins"] * obs.area().numpy()
            expected_values = model.pdf(observed_centers).numpy() * plot_scale
            res = chisquare(observed_values, f_exp=expected_values)
            textstr = format_fit_info(arr, fits[cat_name]["result"], res, *parameters)
            logger.info(textstr)

            # Plot superimposed histogram and model
            logger.info(
                "Creating plot for category {}, vertex {} with model".format(
                    cat_name, vtx_name))
            fig, ax = plt.subplots()
            y = model.pdf(x, norm_range=fit_range).numpy()
            plt.plot(x, y, label="Model")
            err_opts = {
                'linestyle': 'none',
                'marker': '.',
                'markersize': 10.,
                'color': 'k',
                'elinewidth': 1,
            }
            hist.plot1d(histos[cat_name].sum("sigma_m")[vtx_name],
                        density=True,
                        error_opts=err_opts)

            # Stats box
            props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
            ax.text(0.05, 0.95, textstr, transform=ax.transAxes, fontsize=12,
                    verticalalignment='top', bbox=props)
            output_name = "mass_{}_{}_with_model".format(vtx_name, cat_name)
            hep.cms.label(loc=0, data=True, llabel="Work in Progress", rlabel="", ax=ax, pad=.05)
            fig.savefig("{}/{}.png".format(output_dir, output_name), bbox_inches='tight')
            fig.savefig("{}/{}.pdf".format(output_dir, output_name), bbox_inches='tight')
def setupPionData(root_file_dict, branches=[], layers=[], cluster_tree='ClusterTree',
                  balance_data=True, n_max=-1,
                  cut_distributions=[], cut_values=[], cut_types=[],
                  match_distribution='', match_binning=(), match_log=False,
                  verbose=False, load=False, save=False, filename='',
                  return_indices=False):
    pdata = {}
    pcells = {}
    keys = list(root_file_dict.keys())
    rng = np.random.default_rng()

    pdata_filename = filename + '_frame.h5'
    pcell_filename = filename + '_images.h5'
    selec_filename = filename + '_selections.h5'

    if(load and pathlib.Path(pdata_filename).exists() and pathlib.Path(pcell_filename).exists()):
        if(verbose): print('Loading pandas DataFrame and calo images from {} and {}.'.format(pdata_filename, pcell_filename))

        # Load the DataFrame and images from disk.
        pdata = {key: pd.read_hdf(pdata_filename, key=key) for key in keys}
        hf = h5.File(pcell_filename, 'r')
        for key in keys:
            pcells[key] = {}
            for layer in layers:
                pcells[key][layer] = hf['{}:{}'.format(key, layer)][:]
        hf.close()

        if(return_indices):
            # TODO: Rework this a little!
            hf = h5.File(selec_filename, 'r')
            indices = {key: hf[key][:] for key in keys}
            hf.close()

    else:
        # root_file_dict entries might be glob-style strings, or lists of files.
        # We should consider both possibilities.
        arrays = {}
        for key, root_files in root_file_dict.items():
            if(type(root_files) == list):
                arrays[key] = ur.lazy([':'.join((x, cluster_tree)) for x in root_files],
                                      filter_branch=lambda x: x.name in branches)
            else:
                arrays[key] = ur.lazy(':'.join((root_files, cluster_tree)),
                                      filter_branch=lambda x: x.name in branches)

        indices = ApplyCuts(arrays, cut_distributions, cut_values, cut_types, verbose)

        # Filter out clusters so that our data series match in their distribution
        # of a user-supplied variable.
        if(match_distribution != ''):
            if(match_distribution in branches and len(match_binning) == 3):
                if(verbose): print('Matching data series on distribution: {}.'.format(match_distribution))
                binning = np.linspace(match_binning[1], match_binning[2], match_binning[0] + 1)
                n_bins = len(binning) - 1
                distributions = {
                    key: np.histogram(arrays[key][match_distribution][indices[key]].to_numpy(), bins=binning)[0]  # only keep bin counts
                    for key in keys
                }

                # Now determine how many clusters we keep in each bin, for each key.
                n_keep = np.zeros(n_bins, dtype=np.dtype('i8'))
                for i in range(n_bins):
                    n_keep[i] = int(np.min([x[i] for x in distributions.values()]))

                # Now we need to throw out some clusters -- in other words, only keep some.
                # We will randomly choose which ones we keep, for each match_distribution bin,
                # for each data series (key).
                for key in keys:
                    sorted_indices = indices[key][np.argsort(arrays[key][match_distribution][indices[key]])]
                    keep_indices = []
                    bin_idx_edges = np.insert(np.cumsum(distributions[key]), 0, 0)
                    for i in range(n_bins):
                        index_block = sorted_indices[bin_idx_edges[i]:bin_idx_edges[i + 1]]  # all indices corresponding to the i'th bin of match_distribution, for this key
                        keep_indices.append(rng.choice(index_block, n_keep[i], replace=False))
                    n_before = len(indices[key])
                    indices[key] = np.hstack(keep_indices)
                    n_after = len(indices[key])
                    #if(verbose): print('\t{}, number of events: {} -> {}'.format(key, n_before, n_after))
            else:
                print('Warning: Requested matching of distribution "{}" but this variable is not among the branches you selected from the data. Skipping this step.'.format(match_distribution))

        # Balance data so we have equal amounts of each category.
        # Note that if we did the matching above, we can potentially skip this as
        # balancing was implicitly done. However, we might want to take the opportunity
        # to further slim down our dataset.
        if(balance_data):
            n_max_tmp = np.min([len(x) for x in indices.values()])
            if(n_max > 0):
                n_max = np.minimum(n_max_tmp, n_max)
            else:
                n_max = n_max_tmp
            if(verbose): print('Balancing data: {} events per category.'.format(n_max))
            indices = {key: rng.choice(val, n_max, replace=False) for key, val in indices.items()}

        # Make a boolean mask from the indices. This speeds things up below,
        # as opposed to passing (unsorted) lists of indices.
        for key in indices.keys():
            msk = np.zeros(len(arrays[key]), dtype=bool)
            msk[indices[key]] = True
            indices[key] = msk

        # Now, apply our selection indices to the arrays.
        arrays = {key: arrays[key][indices[key]] for key in keys}

        # Make the dataframes from the arrays.
        if(verbose): print('Preparing pandas DataFrame.')
        pdata = {key: ak.to_pandas(arrays[key][branches]) for key in keys}

        # Re-make the arrays with just our layer info (using our selection indices again).
        arrays = {}
        for key, root_files in root_file_dict.items():
            if(type(root_files) == list):
                arrays[key] = ur.lazy([':'.join((x, cluster_tree)) for x in root_files],
                                      filter_branch=lambda x: x.name in layers)[indices[key]]
            else:
                arrays[key] = ur.lazy(':'.join((root_files, cluster_tree)),
                                      filter_branch=lambda x: x.name in layers)[indices[key]]

        # Make our calorimeter images.
        nentries = len(keys) * len(layers)
        i = 0
        if(verbose): qu.printProgressBarColor(i, nentries, prefix='Preparing calorimeter images.', suffix='% Complete', length=40)
        pcells = {}
        for key in keys:
            pcells[key] = {}
            for layer in layers:
                pcells[key][layer] = setupCells_new(arrays[key], layer)
                i += 1
                if(verbose): qu.printProgressBarColor(i, nentries, prefix='Preparing calorimeter images.', suffix='% Complete', length=40)

        # Save the dataframes and calorimeter images in HDF5 format for easy access next time.
        if(filename != '' and save):
            if(verbose): print('Saving DataFrames to {}.'.format(pdata_filename))
            for key, frame in pdata.items():
                frame.to_hdf(pdata_filename, key=key, mode='a', complevel=6)

            if(verbose): print('Saving calorimeter images to {}.'.format(pcell_filename))
            hf = h5.File(pcell_filename, 'w')
            for key in pcells.keys():
                for layer in layers:
                    dset = hf.create_dataset('{}:{}'.format(key, layer), data=pcells[key][layer], chunks=True, compression='gzip', compression_opts=7)
            hf.close()

            # One may optionally also save the selected event indices. This can be useful
            # if referring back to the original data source.
            if(return_indices):
                # Save the indices to a file.
                hf = h5.File(selec_filename, 'w')
                for key in indices.keys():
                    dset = hf.create_dataset(key, data=indices[key], chunks=True, compression='gzip', compression_opts=7)
                hf.close()

    if(return_indices):
        return pdata, pcells, indices  # return indices
    return pdata, pcells  # don't return indices
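# NOTE: a minimal usage sketch for setupPionData(), assuming hypothetical file
# patterns, branch names and calorimeter layer names -- the actual keys, branches
# and layers depend on the ntuples being read:
pdata, pcells = setupPionData(
    {'pi0': 'pi0/*.root', 'piplus': 'piplus/*.root'},  # hypothetical glob patterns
    branches=['clusterE', 'clusterEta'],               # hypothetical scalar branches
    layers=['EMB1', 'EMB2'],                           # hypothetical calo layers
    balance_data=True,
    verbose=True,
)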
def test_lazy_called_on_nonexistent_file():
    awkward = pytest.importorskip("awkward")
    filename = "nonexistent_file.root"

    with pytest.raises(uproot._util._FileNotFoundError) as excinfo:
        uproot.lazy(filename)

    assert filename in str(excinfo.value)
import toml
import awkward as ak
import uproot

from coffea import processor
from coffea.processor.test_items import NanoEventsProcessor
from coffea.nanoevents import schemas

if __name__ == "__main__":
    config_dict = {
        "skyhook": {
            "ceph_config_path": "/tmp/testskyhookjob/ceph.conf",
            "ceph_data_pool": "cephfs_data",
        }
    }

    with open("/root/.coffea.toml", "w") as f:
        toml.dump(config_dict, f)

    ak.to_parquet(
        uproot.lazy("tests/samples/nano_dy.root:Events"),
        "nano_dy.parquet",
        list_to32=True,
        use_dictionary=False,
        compression="GZIP",
        compression_level=1,
    )
    ak.to_parquet(
        uproot.lazy("tests/samples/nano_dimuon.root:Events"),
        "nano_dimuon.parquet",
        list_to32=True,
        use_dictionary=False,
        compression="GZIP",
        compression_level=1,
    )
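# NOTE: a minimal sketch of reading the converted file back, assuming awkward v1;
# the round trip through Parquet preserves the Events record structure:
events = ak.from_parquet("nano_dy.parquet")
print(len(events))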
def __init__(self, root_files, tree_name, scalar_branches,
             matrix_branches=list(mu.cell_meta.keys()),
             target=None,
             batch_size=200,
             shuffle=True,  # TODO: Turning off shuffle caused some problems in simple tests, when retrieving data. How?
             step_size=None,
             flatten_images=False,
             key_map=None):

    # Deal with the case of a dictionary input -- this means that the targets will be the
    # categories specified by the dictionary keys. We will need to keep track of which target
    # value each individual file is associated with, so that we can ultimately determine the
    # target value for every index in our (unshuffled) list of events.
    if(type(root_files) == dict):
        self.external_classification = True
        if(target is not None):
            print('Warning: target is set to {}, but ROOT files have been passed as a dictionary -> target will be ignored, using dictionary keys as classification labels.'.format(target))
        self.root_files = []
        keys = list(root_files.keys())
        keys.sort()
        nlabels = len(keys)
        self.external_classification_nclasses = nlabels
        nentries_dict = {}
        classes_dict = {}
        for i, key in enumerate(keys):
            rfiles = root_files[key]
            if(type(rfiles) != list):
                rfiles = glob.glob(rfiles, recursive=True)
            for rfile in rfiles:
                with ur.open(rfile, cache=None, array_cache=None)[tree_name] as tree:
                    nentries_dict[rfile] = tree.num_entries
                classes_dict[rfile] = i
            self.root_files += rfiles
        self.root_files.sort()

        # At this point, we know how many events we have for every file, and which classification (number)
        # each file corresponds with. Thus we can determine the event index boundaries at which the classification
        # scores change -- and from this, we can determine the classification score of each event without explicitly saving
        # the score per event. In terms of memory usage, this will scale more nicely than explicitly saving all those scores.
        index_score_boundaries = {}  # key is upper bound of index range (inclusive!), value is classification value
        nentries = 0
        for rfile in self.root_files:
            nentries += nentries_dict[rfile]
            index_score_boundaries[nentries - 1] = classes_dict[rfile]
        self.index_score_boundaries = index_score_boundaries

    else:
        self.external_classification = False
        self.external_classification_nclasses = None
        self.index_score_boundaries = None
        if(type(root_files) == list):
            self.root_files = root_files
        else:
            self.root_files = glob.glob(root_files, recursive=True)
        self.root_files.sort()

    if(step_size is None):
        self.step_size = '{} MB'.format(batch_size)  # TODO: Is this reasonable?
    else:
        self.step_size = step_size

    self.tree_name = tree_name
    self.scalar_branches = scalar_branches  # We will create a lazy array for these, as it performs well.
    self.matrix_branches = matrix_branches  # These will only be handled when fetching data! Not using lazy array (too slow).
    self.target = target

    # Quick hack for the case of external classification, in which case the target is redundant
    if(self.external_classification):
        self.target = self.scalar_branches[0]
    if(self.target is not None):
        assert(self.target in self.scalar_branches)

    self.batch_size = batch_size
    self.shuffle = shuffle

    if(self.scalar_branches is None):
        filter_func = lambda x: x.name not in list(mu.cell_meta.keys())
    else:
        filter_func = lambda x: x.name in self.scalar_branches
    self.scalar_array = ur.lazy(files=[':'.join((x, self.tree_name)) for x in self.root_files],
                                filter_branch=filter_func,
                                step_size=self.step_size)

    # Now remove the target from scalar_branches, so that it is not included among features.
    self.scalar_branches = [x for x in self.scalar_branches if x != self.target]
    self.branches = self.scalar_branches + self.matrix_branches

    self.key_map = key_map  # optionally remap data keys (e.g. "EMB1" -> "input") for access -- this is useful if network assumes tensors have certain names that differ from actual branch names
    if(self.key_map is None):
        self.key_map = {x: x for x in self.branches}
    else:
        for x in self.branches:
            if(x not in self.key_map.keys()):
                self.key_map[x] = x

    self.image_array = ROOTImageArray(root_files=self.root_files,
                                      tree_name=self.tree_name,
                                      image_branches=self.matrix_branches,
                                      flatten=flatten_images)

    self.indices = np.arange(len(self.scalar_array))
    self.on_epoch_end()
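# NOTE: a minimal usage sketch, assuming this __init__ belongs to a Keras-style data
# generator; the class name, file globs and branch names below are hypothetical:
gen = DataGenerator(
    root_files={'pi0': 'data/pi0*.root', 'piplus': 'data/piplus*.root'},  # dict keys become class labels
    tree_name='ClusterTree',
    scalar_branches=['clusterE', 'clusterEta'],
    batch_size=200,
)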