Example #1
def test_lazy_colon():
    uproot.lazy(
        skhep_testdata.data_path("uproot-issue63.root") + ":WtLoop_nominal")
    uproot.lazy([
        skhep_testdata.data_path("uproot-issue63.root") + ":WtLoop_nominal",
        skhep_testdata.data_path("uproot-issue63.root") +
        ":WtLoop_Fake_nominal",
    ])
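The object returned by uproot.lazy is an awkward Array that is materialized on demand; a small sketch of inspecting it (nothing beyond the test file above is assumed):
import uproot
import skhep_testdata

array = uproot.lazy(
    skhep_testdata.data_path("uproot-issue63.root") + ":WtLoop_nominal")
print(array.fields)  # branch names exposed as record fields
print(len(array))    # total number of entries across all files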
Example #2
def getData(fnames="", treeName="Events", chunks=False):
    branchlist = []
    for collection, attrs in branches.items():
        branchlist += [collection + "_" + attr for attr in attrs]
    if chunks:
        ldmx_dict = uproot.iterate(fnames + ":" + treeName, branchlist)
    else:
        ldmx_dict = uproot.lazy(fnames + ":" + treeName, branchlist)
    return ldmx_dict
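getData relies on a module-level `branches` dict mapping collection names to attribute lists. A minimal, hypothetical driver; the file pattern and branch names below are placeholders, not values from the original project:
branches = {
    "Electron": ["pt", "eta"],
    "Muon": ["pt", "eta"],
}

# Lazy view over the whole dataset (wildcards are expanded by uproot):
ldmx_dict = getData(fnames="skim_*.root", treeName="Events")

# Or iterate in chunks when the files do not fit in memory:
for chunk in getData(fnames="skim_*.root", treeName="Events", chunks=True):
    print(len(chunk))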
Example #3
        def do_the_work(file: Path) -> ak.Array:
            import uproot

            with uproot.open(file) as f_in:
                tree_name = f_in.keys()[0]

            return uproot.lazy(f'{file}:{tree_name}')
Example #4
def test_awkward_pluralization():
    awkward = pytest.importorskip("awkward")
    files = skhep_testdata.data_path("uproot-sample-6.20.04-uncompressed.root").replace(
        "6.20.04", "*"
    )
    array = uproot.lazy({files: "sample"})
    assert awkward.to_list(array[:5, "i4"]) == [-15, -14, -13, -12, -11]
Example #5
def test():
    array = uproot.lazy(
        skhep_testdata.data_path("uproot-HZZ-objects.root") + ":events")
    assert array.jetp4.fP.fX[:5].tolist() == [
        [],
        [-38.87471389770508],
        [],
        [-71.6952133178711, 36.60636901855469, -28.866418838500977],
        [3.880161762237549, 4.979579925537109],
    ]
Example #6
def test_awkward():
    awkward = pytest.importorskip("awkward")
    files = skhep_testdata.data_path("uproot-sample-6.20.04-uncompressed.root").replace(
        "6.20.04", "*"
    )
    cache = {}
    array = uproot.lazy({files: "sample"}, array_cache=cache)
    assert len(cache) == 0

    assert awkward.to_list(array[:5, "i4"]) == [-15, -14, -13, -12, -11]
    assert len(cache) == 1

    assert awkward.to_list(array[:5, "ai4"]) == [
        [-14, -13, -12],
        [-13, -12, -11],
        [-12, -11, -10],
        [-11, -10, -9],
        [-10, -9, -8],
    ]
    assert len(cache) == 2

    assert awkward.to_list(array[:5, "Ai4"]) == [
        [],
        [-15],
        [-15, -13],
        [-15, -13, -11],
        [-15, -13, -11, -9],
    ]
    assert len(cache) == 3

    assert awkward.to_list(array[:5, "str"]) == [
        "hey-0",
        "hey-1",
        "hey-2",
        "hey-3",
        "hey-4",
    ]
    assert len(cache) == 4
Example #7
def main(args):
    logger = setup_logging()

    v0_input_dir = args.v0_input_dir
    vcustom_input_dir = args.vcustom_input_dir
    output_dir = args.output_dir
    channel = args.channel

    tree_name = tree_name_tmpl.format(channel)

    # Needed names for files and trees
    v0_file = v0_input_dir + "/" + file_names_tmpl[channel]
    v_custom_file = vcustom_input_dir + "/" + file_names_tmpl[channel]

    ranges = {
        "pt": {
            "range": (0, 300),
            "label": "$p_T$"
        },
    }

    for var, specs in ranges.items():
        logger.info("Working with {}".format(var))

        # Read two trees lazily
        imp_variables = [var] + ["vtx_z", "gen_vtx_z", "weight"]

        arr_vtx0 = uproot.lazy(["{}:{}".format(v0_file, tree_name)],
                               imp_variables)
        arr_vtxc = uproot.lazy(["{}:{}".format(v_custom_file, tree_name)],
                               imp_variables)

        # Compute quantities
        n_ranges = 35
        var_range = np.linspace(specs["range"][0], specs["range"][1], n_ranges)

        var_ranges = []
        inf = var_range[0]
        for sup in var_range[1:]:
            var_ranges.append((inf, sup))
            inf = sup

        x_vtx0, x_vtxc, y_vtx0, y_vtxc = {}, {}, {}, {}
        xs = [np.mean(rng) for rng in var_ranges]
        x_vtx0["values"] = xs
        x_vtxc["values"] = xs
        x_vtx0["unc"] = [
            np.std(arr_vtx0[(arr_vtx0[var] > rng[0])
                            & (arr_vtx0[var] < rng[1])][var].to_numpy())
            for rng in var_ranges
        ]
        x_vtxc["unc"] = [
            np.std(arr_vtxc[(arr_vtxc[var] > rng[0])
                            & (arr_vtxc[var] < rng[1])][var].to_numpy())
            for rng in var_ranges
        ]

        y_vtx0["values"], y_vtx0["unc"] = count_fraction(
            arr_vtx0, var, var_ranges)
        y_vtxc["values"], y_vtxc["unc"] = count_fraction(
            arr_vtxc, var, var_ranges)

        # Plot
        fig, (ax, rax) = plt.subplots(nrows=2,
                                      ncols=1,
                                      gridspec_kw={"height_ratios": (3, 1)},
                                      sharex=True)
        ax.errorbar(x_vtx0["values"],
                    y_vtx0["values"],
                    xerr=x_vtx0["unc"],
                    yerr=np.array(y_vtx0["unc"]).T,
                    fmt='ro',
                    label="Vertex 0th")
        ax.errorbar(x_vtxc["values"],
                    y_vtxc["values"],
                    xerr=x_vtxc["unc"],
                    yerr=np.array(y_vtxc["unc"]).T,
                    fmt='bs',
                    label="Vertex Reco")

        rdiff = [
            rel_diff_asymm(v0, vc, v0_uncs, vc_uncs) for v0, vc, v0_uncs,
            vc_uncs in zip(y_vtx0["values"], y_vtxc["values"], y_vtx0["unc"],
                           y_vtxc["unc"])
        ]

        rax.errorbar(x_vtx0["values"],
                     y=[rd[0] for rd in rdiff],
                     yerr=np.array([rd[1] for rd in rdiff]).T,
                     fmt='ko')
        ax.legend(fontsize=18, loc="lower right")
        rax.set_xlabel(specs["label"])
        ax.set_ylabel("Fraction of |$Z_{reco}$ - $Z_{true}$| < 10 mm")
        rax.set_ylabel("$rel\ diff$")
        ax.set_ylim(*y_lims[var][channel]["ax"])
        rax.set_ylim(*y_lims[var][channel]["rax"])
        ax.set_xlim(left=0.)
        rax.set_xlim(left=0.)
        ax.grid(which="both")
        rax.grid(which="both")

        output_name = "{}_id_efficiency".format(var)
        hep.cms.label(loc=0,
                      data=True,
                      llabel="Work in Progress",
                      rlabel="",
                      ax=ax,
                      pad=.05)
        fig.savefig("{}/{}.png".format(output_dir, output_name),
                    bbox_inches='tight')
        fig.savefig("{}/{}.pdf".format(output_dir, output_name),
                    bbox_inches='tight')

        logger.info("Dumped plot in {}".format(output_dir))
Example #8
def load_df():
    import uproot
    with uproot.open(good_root_file_path) as f_in:
        tree_name = f_in.keys()[0]
    return uproot.lazy(f'{good_root_file_path}:{tree_name}')
Example #9
def get_gen_events(self):
    # tree_gen = uproot.open(self.fileName)[self.treeName_gen]
    # events = tree_gen.arrays()  # generator events
    tree_path = self.__define_tree_expression(is_gen=True)
    events = uproot.lazy(tree_path)
    return events
Example #10
def get_events(self):
    # tree_in = uproot.open(self.fileName)[self.treeName]
    # events = tree_in.arrays()  # filtered events
    tree_path = self.__define_tree_expression(is_gen=False)
    events = uproot.lazy(tree_path)
    return events
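Examples #9 and #10 call a private helper, __define_tree_expression, whose body is not shown. A plausible reconstruction, assuming it simply joins the stored file name(s) and tree name into the "file.root:tree" form accepted by uproot.lazy (the attribute names are taken from the commented-out lines above):
def __define_tree_expression(self, is_gen=False):
    # Hypothetical sketch: pick the generator-level or reco-level tree name
    # and pair it with the stored file name(s).
    tree_name = self.treeName_gen if is_gen else self.treeName
    if isinstance(self.fileName, list):
        return ["{}:{}".format(f, tree_name) for f in self.fileName]
    return "{}:{}".format(self.fileName, tree_name)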
Example #11
def test_lazy():
    with pytest.raises(ValueError):
        uproot.lazy(skhep_testdata.data_path("uproot-issue63.root"))

    with pytest.raises(ValueError):
        uproot.lazy(
            {skhep_testdata.data_path("uproot-issue63.root"): "blah"},
            allow_missing=True,
        )

    uproot.lazy(
        {skhep_testdata.data_path("uproot-issue63.root"): "WtLoop_nominal"})
    uproot.lazy({
        skhep_testdata.data_path("uproot-issue63.root"):
        "WtLoop_nominal",
        skhep_testdata.data_path("uproot-issue63.root"):
        "WtLoop_Fake_nominal",
    })

    uproot.lazy([{
        skhep_testdata.data_path("uproot-issue63.root"):
        "WtLoop_nominal"
    }])
    uproot.lazy({
        skhep_testdata.data_path("uproot-issue63.root") + "*":
        "WtLoop_nominal"
    })
    uproot.lazy([{
        skhep_testdata.data_path("uproot-issue63.root") + "*":
        "WtLoop_nominal"
    }])
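Taken together with Example #1, this test covers the main file-spec forms that uproot.lazy accepts. A compact summary, using a placeholder file and tree name rather than the test data:
import uproot

array = uproot.lazy("data.root:Events")         # single "file:tree" string
array = uproot.lazy({"data.root": "Events"})    # dict of {file: tree}
array = uproot.lazy([{"data.root": "Events"}])  # list of such dicts
array = uproot.lazy({"data*.root": "Events"})   # glob patterns expand to many files
# Omitting the tree name entirely raises ValueError, as asserted above.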
Example #12
#    base + 'Autumn18.QCD_HT1500to2000_TuneCP5_13TeV-madgraphMLM-pythia8_RA2AnalysisTree.root': 'TreeMaker2/PreSelection',
#    base + 'Autumn18.QCD_HT2000toInf_TuneCP5_13TeV-madgraphMLM-pythia8_RA2AnalysisTree.root': 'TreeMaker2/PreSelection',
#    #base + 'PrivateSamples.SUEP_2018_mMed-400_mDark-2_temp-2_decay-darkPhoHad_13TeV-pythia8_n-100_0_RA2AnalysisTree.root': 'TreeMaker2/PreSelection',
#}

datasets = {
#    base + 'Autumn18.QCD_HT1000to1500_TuneCP5_13TeV-madgraphMLM-pythia8_0_RA2AnalysisTree.root': 'TreeMaker2/PreSelection',
    base + 'Autumn18.QCD_HT1500to2000_TuneCP5_13TeV-madgraphMLM-pythia8_0_RA2AnalysisTree.root': 'TreeMaker2/PreSelection',
#    base + 'Autumn18.QCD_HT2000toInf_TuneCP5_13TeV-madgraphMLM-pythia8_0_RA2AnalysisTree.root': 'TreeMaker2/PreSelection',
}

dataset2 = {'qcd_CUETP8M1.root': 'tree'}
#dataset3 = {'qcd_CUETP8M1_up.root': 'tree'}
#dataset4 = {'qcd_CUETP8M1_low.root': 'tree'}

events = uproot.lazy(datasets)
pythia = uproot.lazy(dataset2)
#pythia_up = uproot.lazy(dataset3)
#pythia_low = uproot.lazy(dataset4)
multiplicity_pythia = pythia['nTracks']
#multiplicity_pythia_up = pythia_up['nTracks']
#multiplicity_pythia_low = pythia_low['nTracks']

met = events['MET']
ht = events['HT']
pv_x = events['PrimaryVertices.fCoordinates.fX']
CrossSection = events['CrossSection']
"""tracks_x = events['Tracks.fCoordinates.fX']
tracks_y = events['Tracks.fCoordinates.fY']
tracks_z = events['Tracks.fCoordinates.fZ']
tracks_fromPV0 = events['Tracks_fromPV0']
Example #13
def main():
    logger = setup_logging()

    # Needed names for files and trees
    v0_file = "/work/gallim/root_files/vertex_investigation/VertexInvestigation_vtx0/output_GluGluHToGG_M125_TuneCP5_13TeV-amcatnloFXFX-pythia8_storeWeights_alesauva-UL2018_0-10_6_4-v0-RunIISummer19UL18MiniAOD-106X_upgrade2018_realistic_v11_L1v1-v1-3f96409841a3cc85b911eb441562baae_USER_*.root"
    v_custom_file = "/work/gallim/root_files/vertex_investigation/VertexInvestigation/output_GluGluHToGG_M125_TuneCP5_13TeV-amcatnloFXFX-pythia8_storeWeights_alesauva-UL2018_0-10_6_4-v0-RunIISummer19UL18MiniAOD-106X_upgrade2018_realistic_v11_L1v1-v1-3f96409841a3cc85b911eb441562baae_USER_*.root"

    tree_name = "diphotonDumper/trees/ggH_125_13TeV_All_$SYST"

    output_dir = "/eos/home-g/gallim/www/plots/Hgg/VertexInvestigation/mass_fit"

    # Read two trees lazily
    imp_variables = ["weight", "lead_eta", "sublead_eta", "sigma_m", "mass"]

    arr_vtx0 = uproot.lazy(["{}:{}".format(v0_file, tree_name)], imp_variables)
    arr_vtxc = uproot.lazy(["{}:{}".format(v_custom_file, tree_name)],
                           imp_variables)

    arrays = {"vtx0": arr_vtx0, "vtxc": arr_vtxc}

    # Define categories
    categories = {"EBEB": EBEB_mask, "EBEE": EBEE_mask, "EEEE": EEEE_mask}

    masked_arrays = {"EBEB": {}, "EBEE": {}, "EEEE": {}}

    histos = {}

    fits = {"EBEB": {}, "EBEE": {}, "EEEE": {}}

    # Define zfit objects for fits
    logger.info("Creating zfit objects")

    fit_range = [115, 135]
    obs = zfit.Space("M", limits=fit_range)
    mu1 = zfit.Parameter("mu1", 125, 120, 130)
    sigma1 = zfit.Parameter("sigma1", 1, 0.1, 10)
    mu2 = zfit.Parameter("mu2", 125, 120, 130)
    sigma2 = zfit.Parameter("sigma2", 1, 0.1, 10)
    n = zfit.Parameter("n", 1, 0, 10)
    alpha = zfit.Parameter("alpha", 1, 0, 10)
    frac = zfit.Parameter("frac", 0.5, 0, 1)

    gauss = zfit.pdf.Gauss(obs=obs, mu=mu1, sigma=sigma1)
    cb = zfit.pdf.CrystalBall(obs=obs, mu=mu2, sigma=sigma2, n=n, alpha=alpha)

    model = zfit.pdf.SumPDF(pdfs=[gauss, cb], fracs=frac)

    minimizer = zfit.minimize.Minuit()

    variables = [{
        "name": "mass",
        "bins": 100,
        "range": [115, 135]
    }, {
        "name": "sigma_m",
        "bins": 80,
        "range": [0., 0.035]
    }]

    # Loop over categories
    for cat_name, func in categories.items():
        logger.info("Working with category {}".format(cat_name))
        for vtx_name, arr in arrays.items():
            masked_arrays[cat_name][vtx_name] = arr[func(arr)]

        histos[cat_name] = hist.Hist(
            "Density", hist.Cat("vertex", "Vertex"), *[
                hist.Bin(spec["name"], spec["name"], spec["bins"],
                         *spec["range"]) for spec in variables
            ])
        # fill histos
        for vtx_name, arr in masked_arrays[cat_name].items():
            histos[cat_name].fill(vertex=vtx_name,
                                  mass=arr["mass"],
                                  sigma_m=arr["sigma_m"],
                                  weight=arr["weight"])

        # Plot superimposed vertex values for mass and sigma_m (from flashgg)
        for var in variables:
            logger.info("Creating plot for variable {}".format(var["name"]))

            fig, ax = plt.subplots()
            loc_vars = [sp["name"] for sp in variables]
            loc_vars.remove(var["name"])
            hist.plot1d(histos[cat_name].sum(*loc_vars), density=True)
            output_name = "{}_{}".format(var["name"], cat_name)
            hep.cms.label(loc=0,
                          data=True,
                          llabel="Work in Progress",
                          rlabel="",
                          ax=ax,
                          pad=.05)
            fig.savefig("{}/{}.png".format(output_dir, output_name),
                        bbox_inches='tight')
            fig.savefig("{}/{}.pdf".format(output_dir, output_name),
                        bbox_inches='tight')

        # Fits
        logger.info("Proceed with fits")
        x = np.linspace(115, 135, 1000)  # x values for model in plots
        for vtx_name, arr in masked_arrays[cat_name].items():
            data = zfit.Data.from_numpy(obs=obs, array=arr["mass"].to_numpy())
            nll = zfit.loss.UnbinnedNLL(model=model, data=data)
            fits[cat_name]["result"] = minimizer.minimize(nll)
            fits[cat_name]["param_errors"] = fits[cat_name]["result"].hesse()

            # Compute chi-square and p-value
            logger.info("Computing goodness of fit")
            parameters = [mu1, sigma1, mu2, sigma2, n, alpha, frac]
            observed_values, observed_edges = np.histogram(
                arr["mass"].to_numpy(), variables[0]["bins"],
                variables[0]["range"])
            observed_centers = .5 * (observed_edges[1:] + observed_edges[:-1])
            plot_scale = len(
                arr["mass"]) / variables[0]["bins"] * obs.area().numpy()
            expected_values = model.pdf(observed_centers).numpy() * plot_scale
            res = chisquare(observed_values, f_exp=expected_values)
            textstr = format_fit_info(arr, fits[cat_name]["result"], res,
                                      *parameters)
            logger.info(textstr)

            # Plot superimposed histogram and model
            logger.info(
                "Creating plot for category {}, vertex {} with model".format(
                    cat_name, vtx_name))
            fig, ax = plt.subplots()
            y = model.pdf(x, norm_range=fit_range).numpy()
            plt.plot(x, y, label="Model")
            err_opts = {
                'linestyle': 'none',
                'marker': '.',
                'markersize': 10.,
                'color': 'k',
                'elinewidth': 1,
            }
            hist.plot1d(histos[cat_name].sum("sigma_m")[vtx_name],
                        density=True,
                        error_opts=err_opts)

            # Stats box
            props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
            ax.text(0.05,
                    0.95,
                    textstr,
                    transform=ax.transAxes,
                    fontsize=12,
                    verticalalignment='top',
                    bbox=props)

            output_name = "mass_{}_{}_with_model".format(vtx_name, cat_name)
            hep.cms.label(loc=0,
                          data=True,
                          llabel="Work in Progress",
                          rlabel="",
                          ax=ax,
                          pad=.05)
            fig.savefig("{}/{}.png".format(output_dir, output_name),
                        bbox_inches='tight')
            fig.savefig("{}/{}.pdf".format(output_dir, output_name),
                        bbox_inches='tight')
Example #14
def setupPionData(root_file_dict,branches=[], layers=[], cluster_tree='ClusterTree', 
                  balance_data=True, n_max=-1, 
                  cut_distributions=[], cut_values=[], cut_types=[],
                  match_distribution='', match_binning=(), match_log=False,
                  verbose=False, load=False, save=False, filename='', return_indices=False):

    pdata = {}
    pcells = {}
    keys = list(root_file_dict.keys())
    rng = np.random.default_rng()

    pdata_filename = filename + '_frame.h5'
    pcell_filename = filename + '_images.h5'
    selec_filename = filename + '_selections.h5'

    if(load and pathlib.Path(pdata_filename).exists() and pathlib.Path(pcell_filename).exists()):
        
        if(verbose): print('Loading pandas DataFrame and calo images from {} and {}.'.format(pdata_filename,pcell_filename))
        # Load the DataFrame and images from disk.
        pdata = {
            key: pd.read_hdf(pdata_filename,key=key)
            for key in keys
        }
        
        hf = h5.File(pcell_filename,'r')
        for key in keys:
            pcells[key] = {}
            for layer in layers:
                pcells[key][layer] = hf['{}:{}'.format(key,layer)][:]
        hf.close()
        
        if(return_indices): # TODO: Rework this a little!
            hf = h5.File(selec_filename,'r')
            indices = {key: hf[key][:] for key in keys}
            hf.close()
            
    else:
        
        # root_file_dict entries might be glob-style strings, or lists of files. We should consider both possibilities.
        arrays = {}
        for key,root_files in root_file_dict.items():
            if(type(root_files) == list):
                arrays[key] = ur.lazy([':'.join((x,cluster_tree)) for x in root_files], filter_branch=lambda x: x.name in branches)
            else:
                arrays[key] = ur.lazy(':'.join((root_files, cluster_tree)), filter_branch=lambda x: x.name in branches)

        indices = ApplyCuts(arrays, cut_distributions, cut_values, cut_types, verbose)
                                
        # Filter out clusters so that our data series match in their distribution of a user-supplied variable.
        if(match_distribution != ''):
            if(match_distribution in branches and len(match_binning) == 3):
                if(verbose): print('Matching data series on distribution: {}.'.format(match_distribution))
                                                
                binning = np.linspace(match_binning[1],match_binning[2],match_binning[0]+1)
                n_bins = len(binning) - 1
                distributions = {
                    key: np.histogram(arrays[key][match_distribution][indices[key]].to_numpy(), bins=binning)[0] # only keep bin counts
                    for key in keys
                }
                
                # Now determine how many clusters we keep in each bin, for each key.
                n_keep = np.zeros(n_bins,dtype=np.dtype('i8'))
                for i in range(n_bins):
                    n_keep[i] = int(np.min([x[i] for x in distributions.values()]))
                    
                # Now we need to throw out some clusters -- in other words, only keep some.
                # We will randomly choose which ones we keep, for each match_distribution bin,
                # for each data series (key).
                for key in keys:
                    sorted_indices = indices[key][np.argsort(arrays[key][match_distribution][indices[key]])]
                    keep_indices = []
                    bin_idx_edges = np.insert(np.cumsum(distributions[key]),0,0)
                    for i in range(n_bins):
                        index_block = sorted_indices[bin_idx_edges[i]:bin_idx_edges[i+1]] # all indices corresponding to the i'th bin of match_distribution, for this key
                        keep_indices.append(rng.choice(index_block, n_keep[i], replace=False))
                    n_before = len(indices[key])
                    indices[key] = np.hstack(keep_indices)
                    n_after = len(indices[key])
                    #if(verbose): print('\t{}, number of events: {} -> {}'.format(key, n_before, n_after))
                                    
            else: print('Warning: Requested matching of distribution \"{}\" but this variable is not among the branches you selected from the data. Skipping this step.'.format(match_distribution))            
            
        # Balance data so we have equal amounts of each category.
        # Note that if we did the matching above, we can potentially skip this as
        # balancing was implicitly done. However, we might want to take the opportunity
        # to further slim down our dataset.
        if(balance_data):
            n_max_tmp = np.min([len(x) for x in indices.values()])
            if(n_max > 0): n_max = np.minimum(n_max_tmp, n_max)
            else: n_max = n_max_tmp
            
            if(verbose): print('Balancing data: {} events per category.'.format(n_max))
            indices = {key:rng.choice(val, n_max, replace=False) for key,val in indices.items()}

        # Make a boolean mask from the indices. This speeds things up below, as opposed to passing (unsorted) lists of indices.
        for key in indices.keys():
            msk = np.zeros(len(arrays[key]),dtype=bool)
            msk[indices[key]] = True
            indices[key] = msk
    
        # Now, apply our selection indices to the arrays.
        arrays = {
            key:arrays[key][indices[key]]
            for key in keys
        }
        
        # Make the dataframes from the arrays.
        if(verbose): print('Preparing pandas DataFrame.')
        pdata = {
            key: ak.to_pandas(arrays[key][branches])
            for key in keys
        }
    
        # Re-make the arrays with just our layer info (using our selection indices again).
        arrays = {}
        for key,root_files in root_file_dict.items():
            if(type(root_files) == list):
                arrays[key] = ur.lazy([':'.join((x,cluster_tree)) for x in root_files], filter_branch=lambda x: x.name in layers)[indices[key]]
            else:
                arrays[key] = ur.lazy(':'.join((root_files, cluster_tree)), filter_branch=lambda x: x.name in layers)[indices[key]]

        
        # Make our calorimeter images.
        nentries = len(keys) * len(layers)
        i = 0
        if(verbose): qu.printProgressBarColor (i, nentries, prefix='Preparing calorimeter images.', suffix='% Complete', length=40)

        pcells = {}
        for key in keys:
            pcells[key] = {}
            for layer in layers:
                pcells[key][layer] = setupCells_new(arrays[key],layer)
                i+=1
                if(verbose): qu.printProgressBarColor (i, nentries, prefix='Preparing calorimeter images.', suffix='% Complete', length=40)
        
        # Save the dataframes and calorimeter images in HDF5 format for easy access next time.
        if(filename != '' and save):
            if(verbose): print('Saving DataFrames to {}.'.format(pdata_filename))
            for key,frame in pdata.items():
                frame.to_hdf(pdata_filename, key=key, mode='a',complevel=6)   
                
            if(verbose): print('Saving calorimeter images to {}.'.format(pcell_filename))
                
            hf = h5.File(pcell_filename, 'w')
            for key in pcells.keys():
                for layer in layers:
                    dset = hf.create_dataset('{}:{}'.format(key,layer), data=pcells[key][layer], chunks=True, compression='gzip', compression_opts=7)
            hf.close()
            
    # One may optionally also save the selected event indices. This can be useful if referring back to the original data source.
    if(return_indices):
        # Save the indices to a file.
        hf = h5.File(selec_filename, 'w')
        for key in indices.keys():
            dset = hf.create_dataset(key, data=indices[key], chunks=True, compression='gzip', compression_opts=7)
        hf.close()
        return pdata, pcells, indices # return indices
    return pdata, pcells # don't return indices
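A hedged sketch of how setupPionData might be invoked; the file patterns, branch names, and layer names below are placeholders, not values from the original analysis:
root_file_dict = {
    "pi0": "data/pi0/*.root",                                      # glob-style pattern
    "piplus": ["data/piplus/run1.root", "data/piplus/run2.root"],  # explicit file list
}

pdata, pcells = setupPionData(
    root_file_dict,
    branches=["clusterE", "clusterEta", "clusterPhi"],  # hypothetical scalar branches
    layers=["EMB1", "EMB2", "EMB3"],                    # hypothetical calo-image branches
    cluster_tree="ClusterTree",
    balance_data=True,
    verbose=True,
    save=True,
    filename="pion_dataset",
)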
Example #15
def test_lazy_called_on_nonexistent_file():
    awkward = pytest.importorskip("awkward")
    filename = "nonexistent_file.root"
    with pytest.raises(uproot._util._FileNotFoundError) as excinfo:
        uproot.lazy(filename)
    assert filename in str(excinfo.value)
Example #16
import toml
import awkward as ak
import uproot

from coffea import processor
from coffea.processor.test_items import NanoEventsProcessor
from coffea.nanoevents import schemas

if __name__ == "__main__":
    config_dict = {
        "skyhook": {
            "ceph_config_path": "/tmp/testskyhookjob/ceph.conf",
            "ceph_data_pool": "cephfs_data",
        }
    }
    with open("/root/.coffea.toml", "w") as f:
        toml.dump(config_dict, f)

    ak.to_parquet(
        uproot.lazy("tests/samples/nano_dy.root:Events"),
        "nano_dy.parquet",
        list_to32=True,
        use_dictionary=False,
        compression="GZIP",
        compression_level=1,
    )

    ak.to_parquet(
        uproot.lazy("tests/samples/nano_dimuon.root:Events"),
        "nano_dimuon.parquet",
        list_to32=True,
        use_dictionary=False,
        compression="GZIP",
        compression_level=1,
    )
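For completeness, a minimal read-back of one of the files written above (assuming an awkward 1.x installation where ak.from_parquet is available):
events = ak.from_parquet("nano_dy.parquet")
print(len(events), events.fields[:5])  # number of events and first few field names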
Example #17
    def __init__(self,
                 root_files,
                 tree_name,
                 scalar_branches,
                 matrix_branches = list(mu.cell_meta.keys()),
                 target=None,
                 batch_size=200,
                 shuffle=True, # TODO: Turning off shuffle caused some problems in simple tests, when retrieving data. How?
                 step_size=None,
                 flatten_images=False,
                 key_map=None):
        
        # Deal with the case of a dictionary input -- this means that the targets will be the
        # categories specified by the dictionary keys. We will need to keep track of which target
        # value each individual file is associated with, so that we can ultimately determine the
        # target value for every index in our (unshuffled) list of events.
        if(type(root_files) == dict):
            self.external_classification = True
            if(target is not None):
                print('Warning: target is set to {}, but ROOT files have been passed as a dictionary -> target will be ignored, using dictionary keys as classification labels.'.format(target))
            
            self.root_files = []
            keys = list(root_files.keys())
            keys.sort()
            nlabels = len(keys)
            self.external_classification_nclasses = nlabels

            nentries_dict = {}
            classes_dict = {}
            
            for i,key in enumerate(keys):
                rfiles = root_files[key]
                if(type(rfiles) != list): rfiles = glob.glob(rfiles,recursive=True)                    
                for rfile in rfiles:
                    with ur.open(rfile, cache=None, array_cache=None)[tree_name] as tree:
                        nentries_dict[rfile] = tree.num_entries
                        classes_dict[rfile] = i  
                self.root_files += rfiles
            self.root_files.sort()
                
            # At this point, we know how many events we have for every file, and which classification (number)
            # each file corresponds with. Thus we can determine the event index boundaries at which the classification
            # scores change -- and from this, we can determine the classification score of each event without explicitly saving
            # the score per event. In terms of memory usage, this will scale more nicely than explicitly saving all those scores.
            index_score_boundaries = {} # key is upper bound of index range (inclusive!), value is classification value
            nentries = 0
            for rfile in self.root_files:
                nentries += nentries_dict[rfile]
                index_score_boundaries[nentries-1] = classes_dict[rfile]
            self.index_score_boundaries = index_score_boundaries
            
        else:
            self.external_classification = False
            self.external_classification_nclasses = None
            self.index_score_boundaries = None
            if(type(root_files) == list): 
                self.root_files = root_files
            else:
                self.root_files = glob.glob(root_files,recursive=True)
            self.root_files.sort()
        
        if(step_size is None):
            self.step_size = '{} MB'.format(batch_size) # TODO: Is this reasonable?
        else: 
            self.step_size = step_size
        
        self.tree_name = tree_name
        self.scalar_branches = scalar_branches # We will create a lazy array for these, as it performs well.
        self.matrix_branches = matrix_branches # These will only be handled when fetching data! Not using lazy array (too slow).
 
        self.target = target
    
        # Quick hack for the case of external classification, in which case the target is redundant
        if(self.external_classification): self.target = self.scalar_branches[0]
    
        if(self.target is not None): 
            assert(self.target in self.scalar_branches)
            
        self.batch_size = batch_size
        self.shuffle = shuffle
                
        if(self.scalar_branches is None): filter_func = lambda x: x.name not in list(mu.cell_meta.keys())
        else: filter_func = lambda x: x.name in self.scalar_branches
        self.scalar_array = ur.lazy(files=[':'.join((x,self.tree_name)) for x in self.root_files],
                            filter_branch = filter_func,
                            step_size = self.step_size
                           )
        
        # Now remove the target from scalar_branches, so that it is not included among features.
        self.scalar_branches = [x for x in self.scalar_branches if x != self.target]
        
        self.branches = self.scalar_branches + self.matrix_branches
        self.key_map = key_map # optionally remap data keys (e.g. "EMB1" -> "input") for access -- this is useful if network assumes tensors have certain names that differ from actual branch names
        if(self.key_map is None):
            self.key_map = {x:x for x in self.branches}
        else:
            for x in self.branches:
                if(x not in self.key_map.keys()): self.key_map[x] = x
        
        self.image_array = ROOTImageArray(root_files = self.root_files,
                                          tree_name = self.tree_name,
                                          image_branches = self.matrix_branches,
                                          flatten = flatten_images
                                         )
        
        self.indices = np.arange(len(self.scalar_array))
        self.on_epoch_end()