def test_bin_width(self):
    """Test getting the bin width of Bin and SparselyBin histograms.

    Two SparselyBin histograms (binWidth=1.0) and two Bin histograms
    (20 bins on [0, 10]) are built.  Only hist2 and hist4 are filled;
    hist3 and hist5 stay empty and are expected to report the same width.

    The test returns early, without asserting anything, when the
    ``Pandas`` or ``Numpy`` context manager yields ``None``.
    """
    with Pandas() as pd:
        if pd is None:
            return
    with Numpy() as np:  # noqa
        # Check the name actually bound by the ``with`` statement.  The
        # previous guard tested ``numpy``, which this function never binds.
        if np is None:
            return
        sys.stderr.write("\n")

        df1 = pd.DataFrame({'A': [0, 1, 2, 3, 4, 3, 2, 1, 1, 1]})

        # building test histograms
        hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist4 = hg.Bin(num=20, low=0.0, high=10., quantity=unit('A'))
        hist5 = hg.Bin(num=20, low=0.0, high=10., quantity=unit('A'))

        # fill them (hist3 and hist5 are left empty)
        hist2.fill.numpy(df1)
        hist4.fill.numpy(df1)

        assert hist2.bin_width() == 1.0
        assert hist3.bin_width() == 1.0
        assert hist4.bin_width() == 0.5
        assert hist5.bin_width() == 0.5
def test_bin_edges(self):
    """Test getting the bin edges for requested ranges.

    Checks ``bin_edges()`` without arguments and with explicit
    ``low``/``high`` limits, for SparselyBin and Bin histograms filled from
    two small single-column data frames.

    The test returns early, without asserting anything, when the
    ``Pandas`` or ``Numpy`` context manager yields ``None``.
    """
    with Pandas() as pd:
        if pd is None:
            return
    with Numpy() as np:  # noqa
        # Check the name actually bound by the ``with`` statement.  The
        # previous guard tested ``numpy``, which this function never binds.
        if np is None:
            return
        sys.stderr.write("\n")

        df1 = pd.DataFrame({'A': [0, 1, 2, 3, 4, 3, 2, 1, 1, 1]})
        df2 = pd.DataFrame({'A': [2, 3, 4, 5, 7, 4, 6, 5, 7, 8]})

        # building test histograms
        hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist4 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))
        hist5 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))

        # fill them
        hist2.fill.numpy(df1)
        hist3.fill.numpy(df2)
        hist4.fill.numpy(df1)
        hist5.fill.numpy(df2)

        import numpy as np

        # default range: edges of the bins the histogram reports by itself
        np.testing.assert_array_equal(
            hist2.bin_edges(), [0., 1., 2., 3., 4., 5.])
        np.testing.assert_array_equal(
            hist3.bin_edges(), [2., 3., 4., 5., 6., 7., 8., 9.])
        np.testing.assert_array_equal(
            hist4.bin_edges(),
            [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])
        np.testing.assert_array_equal(
            hist5.bin_edges(),
            [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])

        # explicit low/high limits
        np.testing.assert_array_equal(
            hist2.bin_edges(low=2.1, high=11.9),
            [2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.])
        np.testing.assert_array_equal(
            hist3.bin_edges(low=1.1, high=6), [1., 2., 3., 4., 5., 6.])
        np.testing.assert_array_equal(
            hist4.bin_edges(low=2.1, high=11.9),
            [2., 3., 4., 5., 6., 7., 8., 9., 10.])
        np.testing.assert_array_equal(
            hist5.bin_edges(low=1.1, high=5.4), [1., 2., 3., 4., 5., 6.])
def test_assert_similar_hists():
    """Test assert on similarity of list of histograms.

    Each histogram must be similar to itself according to
    ``check_similar_hists``.  Three pairs of differently structured
    histograms (1d, 2d and 3d) are then passed to ``assert_similar_hists``;
    each call is expected to raise a ``ValueError`` carrying the
    "not all similar" message.
    """
    # dummy dataset with mixed types
    # convert timestamp (col D) to nanosec since 1970-1-1
    df = pd.util.testing.makeMixedDataFrame()
    df["date"] = df["D"].apply(to_ns)

    # building 1d-, 2d-, and 3d-histogram (iteratively)
    hist0 = hg.Bin(5, 0, 5, unit("A"))
    hist1 = hg.Categorize(unit("C"))
    hist2 = hg.Bin(5, 0, 5, unit("A"), value=hist1)
    hist3 = hg.Categorize(unit("C"), value=hist0)
    hist4 = hg.SparselyBin(
        origin=pd.Timestamp("2009-01-01").value,
        binWidth=pd.Timedelta(days=1).value,
        quantity=unit("date"),
        value=hist2,
    )
    hist5 = hg.SparselyBin(
        origin=pd.Timestamp("2009-01-01").value,
        binWidth=pd.Timedelta(days=1).value,
        quantity=unit("date"),
        value=hist3,
    )
    all_hists = (hist0, hist1, hist2, hist3, hist4, hist5)

    # fill them
    for filled in all_hists:
        filled.fill.numpy(df)

    # a histogram is always similar to itself
    for candidate in all_hists:
        assert check_similar_hists([candidate, candidate])

    # collect the exception arguments of every mismatching pair first;
    # the [""] placeholder makes the final assert fail if nothing was raised
    captured = []
    for pair in ([hist0, hist1], [hist2, hist3], [hist4, hist5]):
        exc_args = [""]
        try:
            assert_similar_hists(pair)
        except ValueError as err:
            exc_args = err.args
        captured.append(exc_args)

    for exc_args in captured:
        assert exc_args[0] == "Input histograms are not all similar."
def test_bin_entries(self):
    """Test getting the number of entries per bin.

    Covers ``bin_entries()`` for Categorize, SparselyBin and Bin
    histograms: without arguments, for an explicit list of ``labels``
    (Categorize), for explicit ``xvalues`` (bin centers taken from another
    histogram) and for a ``low``/``high`` range.

    The test returns early, without asserting anything, when the
    ``Pandas`` or ``Numpy`` context manager yields ``None``.
    """
    with Pandas() as pd:
        if pd is None:
            return
    with Numpy() as np:  # noqa
        # Check the name actually bound by the ``with`` statement.  The
        # previous guard tested ``numpy``, which this function never binds.
        if np is None:
            return
        sys.stderr.write("\n")

        df1 = pd.DataFrame(
            {'A': [0, 1, 2, 3, 4, 3, 2, 1, 1, 1],
             'C': ['f1', 'f3', 'f4', 'f3', 'f4', 'f2', 'f2', 'f1', 'f3', 'f4']})
        df2 = pd.DataFrame(
            {'A': [2, 3, 4, 5, 7, 4, 6, 5, 7, 8],
             'C': ['f7', 'f3', 'f5', 'f8', 'f9', 'f2', 'f3', 'f6', 'f7', 'f7']})

        # building test histograms
        hist0 = hg.Categorize(unit('C'))
        hist1 = hg.Categorize(unit('C'))
        hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist4 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))
        hist5 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))

        # fill them
        hist0.fill.numpy(df1)
        hist1.fill.numpy(df2)
        hist2.fill.numpy(df1)
        hist3.fill.numpy(df2)
        hist4.fill.numpy(df1)
        hist5.fill.numpy(df2)

        # labels / centers used to query one histogram with another's bins
        labels0 = hist0.bin_labels()
        labels1 = hist1.bin_labels()
        centers2 = hist2.bin_centers()
        centers3 = hist3.bin_centers()
        centers = hist4.bin_centers()

        import numpy as np

        # categorical histograms
        np.testing.assert_array_equal(
            hist0.bin_entries(), [2., 2., 3., 3.])
        np.testing.assert_array_equal(
            hist1.bin_entries(), [1., 2., 1., 1., 3., 1., 1.])
        np.testing.assert_array_equal(
            hist0.bin_entries(labels=labels1),
            [2., 3., 0., 0., 0., 0., 0.])
        np.testing.assert_array_equal(
            hist1.bin_entries(labels=labels0), [0., 1., 2., 0.])

        # numeric histograms, default range
        np.testing.assert_array_equal(
            hist2.bin_entries(), [1., 4., 2., 2., 1.])
        np.testing.assert_array_equal(
            hist3.bin_entries(), [1., 1., 2., 2., 1., 2., 1.])
        np.testing.assert_array_equal(
            hist4.bin_entries(),
            [1., 4., 2., 2., 1., 0., 0., 0., 0., 0.])
        np.testing.assert_array_equal(
            hist5.bin_entries(),
            [0., 0., 1., 1., 2., 2., 1., 2., 1., 0.])

        # entries looked up at explicit x-values
        np.testing.assert_array_equal(
            hist2.bin_entries(xvalues=centers3),
            [2., 2., 1., 0., 0., 0., 0.])
        np.testing.assert_array_equal(
            hist3.bin_entries(xvalues=centers2), [0., 0., 1., 1., 2.])
        np.testing.assert_array_equal(
            hist2.bin_entries(xvalues=centers),
            [1., 4., 2., 2., 1., 0., 0., 0., 0., 0.])
        np.testing.assert_array_equal(
            hist3.bin_entries(xvalues=centers),
            [0., 0., 1., 1., 2., 2., 1., 2., 1., 0.])

        # entries for an explicit low/high range
        np.testing.assert_array_equal(
            hist2.bin_entries(low=2.1, high=11.9),
            [2., 2., 1., 0., 0., 0., 0., 0., 0., 0.])
        np.testing.assert_array_equal(
            hist3.bin_entries(low=1.1, high=5.4), [0., 1., 1., 2., 2.])
        np.testing.assert_array_equal(
            hist4.bin_entries(low=2.1, high=11.9),
            [2., 2., 1., 0., 0., 0., 0., 0.])
        np.testing.assert_array_equal(
            hist5.bin_entries(low=1.1, high=5.4), [0., 1., 1., 2., 2.])
def get_hist_bin(self, hist, features, quant, col, dt):
    """Wrap ``hist`` in one more histogram dimension for column ``col``.

    :param hist: histogram used as ``value`` (sub-histogram) of the new one
    :param list features: histogram features; must contain ``col``
    :param quant: quantity function handed to the new histogram
    :param col: name of the column the new dimension is built for
    :param dt: data type of ``col``, checked with ``np.issubdtype``
    :return: the new histogram (SparselyBin, Bin or Categorize)
    :raises RuntimeError: if the bin specifications of a number/timestamp
        column hold neither "bin_width" nor all of "num", "low" and "high"
    """
    looks_numeric = np.issubdtype(dt, np.number)
    looks_temporal = np.issubdtype(dt, np.datetime64)

    if not (looks_numeric or looks_temporal):
        # strings and booleans are treated as categories
        return hg.Categorize(quantity=quant, value=hist)

    # numbers and timestamps are put in a binned histogram
    specs = self.var_bin_specs(features, features.index(col))

    if "bin_width" in specs:
        return hg.SparselyBin(
            binWidth=specs["bin_width"],
            origin=specs.get("bin_offset", 0),
            quantity=quant,
            value=hist,
        )

    if all(key in specs for key in ("num", "low", "high")):
        return hg.Bin(
            num=specs["num"],
            low=specs["low"],
            high=specs["high"],
            quantity=quant,
            value=hist,
        )

    raise RuntimeError("Do not know how to interpret bin specifications.")
def test_prepare_2dgrid():
    """Test preparation of grid for extraction of number of entries for 2d hists.

    Only the data frame returned by ``get_test_histograms1`` is used; the
    1d, 2d and 3d histograms are rebuilt and refilled locally before being
    passed to ``prepare_2dgrid``.
    """
    df, _, _, _ = get_test_histograms1()

    # building 1d-, 2d-, and 3d-histogram (iteratively)
    hist1 = hg.Categorize(unit("C"))
    hist2 = hg.Bin(5, 0, 5, unit("A"), value=hist1)
    hist3 = hg.SparselyBin(
        origin=pd.Timestamp("2009-01-01").value,
        binWidth=pd.Timedelta(days=1).value,
        quantity=unit("date"),
        value=hist2,
    )
    hists = (hist1, hist2, hist3)

    # fill them
    for filled in hists:
        filled.fill.numpy(df)

    grids = [prepare_2dgrid(filled) for filled in hists]

    # expected (x-keys, y-keys) per histogram, in the same order as `hists`
    expected = [
        ([], []),
        ([0, 1, 2, 3, 4], ["foo1", "foo2", "foo3", "foo4", "foo5"]),
        ([0, 1, 4, 5, 6], [0, 1, 2, 3, 4]),
    ]
    for (xkeys, ykeys), (want_x, want_y) in zip(grids, expected):
        np.testing.assert_array_equal(xkeys, want_x)
        np.testing.assert_array_equal(ykeys, want_y)
def test_profile_hist1d():
    """Test profiling of a list of 1d histograms with ``HistProfiler``.

    Ten 1000-bin histograms, each filled with 10000 uniform random numbers
    (seeded), are profiled; the test checks the number of profiles, the
    presence of the "p95" key, the "max" value and the bin count of the
    histogram stored in the first profile.
    """
    num_bins = 1000
    num_entries = 10000
    hist_name = "histogram"
    split_len = 10

    np.random.seed(0)

    split = []
    for _ in range(split_len):
        filled = hg.Bin(num_bins, 0, 1, lambda x: x)
        filled.fill.numpy(np.random.uniform(0, 1, num_entries))
        record = {
            "date": pd.Timestamp("2019 - 1 - 1"),
            hist_name: HistogramContainer(filled),
        }
        split.append(record)

    profiler = HistProfiler(
        read_key="dummy_input",
        store_key="dummy_output",
        hist_col=hist_name,
        index_col="date",
    )
    profiles = profiler._profile_hist(split, hist_name="feature")

    assert len(profiles) == split_len
    assert "p95" in profiles[0]

    second_centers = split[1][hist_name].get_bin_centers()[0]
    assert profiles[1]["max"] == np.max(second_centers)

    first_hist = profiles[0][hist_name].hist
    assert len(first_hist.bin_entries()) == num_bins
def construct_empty_hist(self, features):
    """Create an (empty) histogram of right type.

    The multi-dimensional histogram is built inside-out: starting from a
    plain count, the features are visited in reverse order and every step
    wraps the histogram built so far as the ``value`` of a new dimension.

    :param list features: histogram features
    :return: created histogram
    :rtype: histogrammar.Count
    :raises RuntimeError: if the bin specifications of a number/timestamp
        feature hold neither "bin_width" nor all of "num", "low" and "high"
    """
    single_feature = len(features) == 1
    hist = hg.Count()

    for col in reversed(features):
        # histogram type depends on the data type
        dt = self.var_dtype[col]

        # processing function, e.g. only accept booleans during filling
        f = utils.QUANTITY[dt]
        if single_feature:
            # df[col] is a pd.Series
            quant = lambda x, fnc=f: fnc(x)  # noqa
        else:
            # df[features] is a pd.DataFrame: fix column to col
            quant = lambda x, fnc=f, clm=col: fnc(x[clm])  # noqa

        looks_numeric = np.issubdtype(dt, np.number)
        looks_temporal = np.issubdtype(dt, np.datetime64)
        if not (looks_numeric or looks_temporal):
            # strings and booleans are treated as categories
            hist = hg.Categorize(quantity=quant, value=hist)
            continue

        # numbers and timestamps are put in a binned histogram
        specs = self.var_bin_specs(features, features.index(col))
        if "bin_width" in specs:
            hist = hg.SparselyBin(
                binWidth=specs["bin_width"],
                origin=specs.get("bin_offset", 0),
                quantity=quant,
                value=hist,
            )
        elif all(key in specs for key in ("num", "low", "high")):
            hist = hg.Bin(
                num=specs["num"],
                low=specs["low"],
                high=specs["high"],
                quantity=quant,
                value=hist,
            )
        else:
            raise RuntimeError(
                "Do not know how to interpret bin specifications.")

    return hist
def get_test_histograms2():
    """Get set 2 of test histograms.

    :return: tuple of the data frame and four filled histograms:
        Categorize(C), Bin(A) x Categorize(C), Bin(A) and
        Categorize(C) x Bin(A)
    """
    # dummy dataset with mixed types
    df = pd.util.testing.makeMixedDataFrame()

    # building 1d-, 2d-histogram (iteratively)
    cat_1d = hg.Categorize(unit("C"))
    bin_of_cat = hg.Bin(5, 0, 5, unit("A"), value=cat_1d)
    bin_1d = hg.Bin(5, 0, 5, unit("A"))
    cat_of_bin = hg.Categorize(unit("C"), value=bin_1d)

    # fill them, in order of construction
    for filled in (cat_1d, bin_of_cat, bin_1d, cat_of_bin):
        filled.fill.numpy(df)

    return df, cat_1d, bin_of_cat, bin_1d, cat_of_bin
def test_bin_centers(self):
    """Test getting assigned bin-centers for Bin and SparselyBin histograms.

    Checks ``bin_centers()`` without arguments and with explicit
    ``low``/``high`` limits.

    The test returns early, without asserting anything, when the
    ``Pandas`` or ``Numpy`` context manager yields ``None``.
    """
    with Pandas() as pd:
        if pd is None:
            return
    with Numpy() as np:  # noqa
        # Check the name actually bound by the ``with`` statement.  The
        # previous guard tested ``numpy``, which this function never binds.
        if np is None:
            return
        sys.stderr.write("\n")

        df1 = pd.DataFrame(
            {'A': [0, 1, 2, 3, 4, 3, 2, 1, 1, 1]})
        df2 = pd.DataFrame(
            {'A': [2, 3, 4, 5, 7, 4, 6, 5, 7, 8]})

        # histograms
        hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist4 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))
        hist5 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))

        # fill them
        hist2.fill.numpy(df1)
        hist3.fill.numpy(df2)
        hist4.fill.numpy(df1)
        hist5.fill.numpy(df2)

        import numpy as np

        # sparse histograms
        np.testing.assert_array_equal(
            hist2.bin_centers(), [0.5, 1.5, 2.5, 3.5, 4.5])
        np.testing.assert_array_equal(
            hist3.bin_centers(), [2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5])
        np.testing.assert_array_equal(
            hist2.bin_centers(low=5, high=15),
            [5.5, 6.5, 7.5, 8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5])
        np.testing.assert_array_equal(
            hist3.bin_centers(), [2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5])
        np.testing.assert_array_equal(
            hist3.bin_centers(low=2.1, high=5.6), [2.5, 3.5, 4.5, 5.5])

        # regular histograms
        np.testing.assert_array_equal(
            hist4.bin_centers(),
            [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5])
        np.testing.assert_array_equal(
            hist5.bin_centers(),
            [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5])
        np.testing.assert_array_equal(
            hist4.bin_centers(low=5, high=15), [5.5, 6.5, 7.5, 8.5, 9.5])
        np.testing.assert_array_equal(
            hist5.bin_centers(low=2.1, high=9.1),
            [2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5])
def test_num_bins(self):
    """Test getting the number of bins from lowest to highest bin.

    Checks ``num_bins()`` without arguments and for two explicit
    ``low``/``high`` ranges, on SparselyBin and Bin histograms.

    The test returns early, without asserting anything, when the
    ``Pandas`` or ``Numpy`` context manager yields ``None``.
    """
    with Pandas() as pd:
        if pd is None:
            return
    with Numpy() as np:  # noqa
        # Check the name actually bound by the ``with`` statement.  The
        # previous guard tested ``numpy``, which this function never binds.
        if np is None:
            return
        sys.stderr.write("\n")

        df1 = pd.DataFrame({'A': [0, 2, 4, 5, 7, 9, 11, 13, 13, 15]})
        df2 = pd.DataFrame({'A': [2, 4, 4, 6, 8, 7, 10, 14, 17, 19]})

        # building test histograms
        hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist4 = hg.Bin(num=20, low=0.0, high=20., quantity=unit('A'))
        hist5 = hg.Bin(num=20, low=0.0, high=20., quantity=unit('A'))

        # fill them
        hist2.fill.numpy(df1)
        hist3.fill.numpy(df2)
        hist4.fill.numpy(df1)
        hist5.fill.numpy(df2)

        # default range
        assert hist2.num_bins() == 16
        assert hist3.num_bins() == 18
        assert hist4.num_bins() == 20
        assert hist5.num_bins() == 20

        # range reaching beyond the upper edge of the Bin histograms
        assert hist2.num_bins(low=10, high=25) == 15
        assert hist3.num_bins(low=10, high=25) == 15
        assert hist4.num_bins(low=10, high=25) == 10
        assert hist5.num_bins(low=10, high=25) == 10

        # range reaching beyond both edges of the Bin histograms
        assert hist2.num_bins(low=-10, high=28) == 38
        assert hist3.num_bins(low=-10, high=28) == 38
        assert hist4.num_bins(low=-10, high=28) == 20
        assert hist5.num_bins(low=-10, high=28) == 20
def project_on_x(hist):
    """Project n-dim histogram onto x-axis

    The input is returned unchanged when it reports ``n_dim <= 1``, when
    its ``bins`` / ``values`` container is empty, or when it has neither
    of those attributes.

    :param hist: input histogrammar histogram
    :return: on x-axis projected histogram (1d)
    :raises TypeError: if ``hist`` is not a Bin, SparselyBin or Categorize
    """
    # basic check: projecting on itself
    if hasattr(hist, "n_dim") and hist.n_dim <= 1:
        return hist

    # basic checks on contents; remember which container the histogram uses
    keyed_by_bins = hasattr(hist, "bins")
    if keyed_by_bins:
        if not len(hist.bins):
            return hist
    elif hasattr(hist, "values"):
        if not len(hist.values):
            return hist
    else:
        return hist

    # make empty clone
    # note: cannot do: h_x = hist.zero(), b/c it copies n-dim structure,
    # which screws up hist.toJsonString()
    if isinstance(hist, histogrammar.Bin):
        projected = histogrammar.Bin(
            num=hist.num,
            low=hist.low,
            high=hist.high,
            quantity=hist.quantity,
        )
    elif isinstance(hist, histogrammar.SparselyBin):
        projected = histogrammar.SparselyBin(
            binWidth=hist.binWidth,
            origin=hist.origin,
            quantity=hist.quantity,
        )
    elif isinstance(hist, histogrammar.Categorize):
        projected = histogrammar.Categorize(quantity=hist.quantity)
    else:
        raise TypeError("Unknown histogram type. cannot get zero copy.")

    # replace every sub-histogram by a plain count of its summed entries
    if keyed_by_bins:
        for key, sub_hist in hist.bins.items():
            projected.bins[key] = histogrammar.Count.ed(sum_entries(sub_hist))
    else:
        for idx, sub_hist in enumerate(hist.values):
            projected.values[idx] = histogrammar.Count.ed(sum_entries(sub_hist))

    return projected
def get_test_histograms1():
    """Get set 1 of test histograms.

    :return: tuple of the data frame and three filled histograms:
        Categorize(C), Bin(A) x Categorize(C) and
        SparselyBin(date) x Bin(A) x Categorize(C)
    """
    # dummy dataset with mixed types
    # convert timestamp (col D) to nanosec since 1970-1-1
    df = pd.util.testing.makeMixedDataFrame()
    df["date"] = df["D"].apply(to_ns)
    df["boolT"] = True
    df["boolF"] = False

    # building 1d-, 2d-, and 3d-histogram (iteratively)
    hist_1d = hg.Categorize(unit("C"))
    hist_2d = hg.Bin(5, 0, 5, unit("A"), value=hist_1d)
    hist_3d = hg.SparselyBin(
        origin=pd.Timestamp("2009-01-01").value,
        binWidth=pd.Timedelta(days=1).value,
        quantity=unit("date"),
        value=hist_2d,
    )

    # fill them, in order of construction
    for filled in (hist_1d, hist_2d, hist_3d):
        filled.fill.numpy(df)

    return df, hist_1d, hist_2d, hist_3d
def get_test_histograms1():
    """Get set 1 of test histograms.

    pandas and histogrammar are imported inside the function.

    :return: tuple of the data frame and three filled histograms:
        Categorize(C), Bin(A) x Categorize(C) and
        SparselyBin(date) x Bin(A) x Categorize(C)
    """
    import pandas as pd
    import histogrammar as hg

    # dummy dataset with mixed types
    # convert timestamp (col D) to nanosec since 1970-1-1
    df = pd.util.testing.makeMixedDataFrame()
    df['date'] = df['D'].apply(to_ns)
    df['boolT'] = True
    df['boolF'] = False

    # building 1d-, 2d-, and 3d-histogram (iteratively)
    hist_1d = hg.Categorize(unit('C'))
    hist_2d = hg.Bin(5, 0, 5, unit('A'), value=hist_1d)
    hist_3d = hg.SparselyBin(
        origin=pd.Timestamp('2009-01-01').value,
        binWidth=pd.Timedelta(days=1).value,
        quantity=unit('date'),
        value=hist_2d,
    )

    # fill them, in order of construction
    for filled in (hist_1d, hist_2d, hist_3d):
        filled.fill.numpy(df)

    return df, hist_1d, hist_2d, hist_3d
def __init__(self, fin, branch='Events', selected_branches=None,
             exclude_branches=None, identifier=None, label=None,
             chunk_size=1000, nevts=-1, specs=None, nan=np.nan, histograms=False,
             redirector='root://cms-xrd-global.cern.ch', verbose=0):
    """Open the input file, set up reader state and run ``self.init()``.

    :param fin: input file; resolved through ``xfile(fin, redirector)`` and
        opened with ``uproot.open``
    :param branch: key used to look up the tree in the opened file
    :param selected_branches: optional list of names; when given, their
        encoded form becomes ``self.out_branches`` (overriding exclusion)
    :param exclude_branches: optional list of names; every key of the tree
        that is not in this (encoded) list goes to ``self.out_branches``
    :param identifier: identifier names; defaults to
        ``['run', 'event', 'luminosityBlock']``
    :param label: stored as ``self.label``
    :param chunk_size: chunk size, capped at the number of tree entries
    :param nevts: number of events; ``-1`` means all entries of the tree
    :param specs: when truthy, passed to ``self.load_specs``; otherwise
        empty spec containers are created
    :param nan: value stored, as float, in ``self.nan``
    :param histograms: when truthy (and ``hg`` is truthy), declare
        histograms for every attribute
    :param redirector: passed to ``xfile`` together with ``fin``
    :param verbose: when truthy, print progress messages
    """
    self.type = self.__class__.__name__
    self.fin = xfile(fin, redirector)
    self.verbose = verbose
    if self.verbose:
        print("Reading {}".format(self.fin))
    self.istream = uproot.open(self.fin)
    self.branches = {}
    self.gen = None
    self.out_branches = []
    self.identifier = identifier or ['run', 'event', 'luminosityBlock']
    self.tree = self.istream[branch]
    self.nrows = self.tree.numentries

    # -1 is the "read everything" marker
    if nevts != -1:
        self.nevts = nevts
    else:
        self.nevts = self.nrows

    self.label = label
    self.idx = -1
    self.chunk_idx = 0

    # never use a chunk larger than the tree itself
    if chunk_size < self.nrows:
        self.chunk_size = chunk_size
    else:
        self.chunk_size = self.nrows

    self.nan = float(nan)
    self.attrs = []
    self.shape = None
    self.cache = {}
    self.hdict = {}
    self.hists = histograms
    self.idx_label = 0
    self.flat_keys_encoded = []
    self.jagged_keys_encoded = []
    self.keys = []
    self.min_list = []
    self.max_list = []
    self.jdimension = []
    self.dimension_list = []
    self.time_reading = []
    self.time_reading_and_specs = []

    if specs:
        self.load_specs(specs)
    else:
        self.jdim = {}
        self.minv = {}
        self.maxv = {}
        self.jkeys = []
        self.fkeys = []
        self.nans = {}

    time0 = time.time()

    if exclude_branches:
        print(f"Excluded branches: {exclude_branches}")
        all_branches = self.tree.keys()
        # names are encoded before being compared with the tree keys
        exclude_branches = [elem.encode() for elem in exclude_branches]
        kept = []
        for elem in all_branches:
            if elem not in exclude_branches:
                kept.append(elem)
        self.out_branches = kept

    if selected_branches:
        print(f"Selected branches: {selected_branches}")
        selected_branches = [elem.encode() for elem in selected_branches]
        self.out_branches = list(selected_branches)

    # perform initialization
    self.init()
    if self.verbose:
        print("{} init is complete in {} sec".format(
            self, time.time() - time0))

    # declare histograms for original and normalized values
    if hg and self.hists:
        for key in self.attrs:
            self.hdict['%s_orig' % key] = hg.Bin(
                num=100, low=self.minv[key], high=self.maxv[key],
                quantity=lambda x: x, value=hg.Count())
            self.hdict['%s_norm' % key] = hg.Bin(
                num=100, low=0, high=1,
                quantity=lambda x: x, value=hg.Count())
zip(output_fields, dimuonCandidate_aux(pt, eta, phi, mass, charge, mediumid))) try: import matplotlib matplotlib.use('agg') import matplotlib.pyplot as plt import histogrammar as hg import json import logging mpl_logger = logging.getLogger('matplotlib') mpl_logger.setLevel(logging.WARNING) invmass = hg.Bin(80, 70, 110, quantity=lambda x: x, value=hg.Count()) def histogram_fill_from_file(filename): with open(filename, 'r') as f_in: for row in f_in: js = json.loads(row) invmass.fill(js['mass']) def histogram(output='output.png'): #ax = invmass.plot.matplotlib(name="", color="green", edgecolor="white", lw=5) ax = invmass.plot.matplotlib(name="") ax.set_xlabel('Dimuon invariant mass m($\mu\mu$) (GeV)') ax.set_ylabel('Events / 0.5 GeV') plt.savefig(output) print(json.dumps(invmass.toJson())) #plt.show()
#!/usr/bin/env python
"""First histogram tutorial in the histogrammar package.

Fills a 20-bin histogram on [0, 1] with 2000 uniform random numbers and
saves a plot of it to ``histogrammar.png``.
"""
import random

import matplotlib
# select the non-interactive backend before pyplot is imported
matplotlib.use('Agg')
import matplotlib.pyplot as pyplot

import histogrammar as hg

# generate a stream of uniform random numbers
# (``range`` instead of ``xrange`` so the script also runs on Python 3)
data = [random.random() for _ in range(2000)]

# aggregation structure and fill rule
histogram = hg.Bin(num=20, low=0, high=1, quantity=lambda x: x, value=hg.Count())

# fill the histogram!
for d in data:
    histogram.fill(d)

# quick plotting convenience method using matplotlib (if the user has this
# installed). Use the ``histogram.plot.matplotlib`` interface when it is
# available and fall back to the older ``histogram.matplotlib`` otherwise.
plotter = getattr(histogram, "plot", None)
if plotter is not None:
    ax = plotter.matplotlib(name="hello world!")
else:
    ax = histogram.matplotlib(name="hello world!")
pyplot.savefig('histogrammar.png')
def __init__(self, fin, branch='Events', selected_branches=None,
             exclude_branches=None, identifier=None,
             chunk_size=1000, nevts=-1, specs=None, nan=np.nan, histograms=False,
             redirector='root://cms-xrd-global.cern.ch', verbose=0):
    """Open the input file, run ``self.init()`` and select output branches.

    Branch selection happens after ``self.init()`` and works on
    ``self.attrs``.  A name in ``selected_branches`` / ``exclude_branches``
    is either an exact attribute name or, when it contains ``*``, a
    shell-style wildcard pattern (e.g. ``"Jet_*"``).  If both lists are
    given, the exclusion result replaces the selection result.

    :param fin: input file; resolved through ``xfile(fin, redirector)`` and
        opened with ``uproot.open``
    :param branch: key used to look up the tree in the opened file
    :param selected_branches: optional list of names/patterns to keep
    :param exclude_branches: optional list of names/patterns to drop
    :param identifier: identifier names; defaults to
        ``['run', 'event', 'luminosityBlock']``
    :param chunk_size: chunk size, capped at the number of tree entries
    :param nevts: stored as ``self.nevts``
    :param specs: when truthy, passed to ``self.load_specs``; otherwise
        empty spec containers are created
    :param nan: value stored, as float, in ``self.nan``
    :param histograms: when truthy (and ``hg`` is truthy), declare
        histograms for every attribute
    :param redirector: passed to ``xfile`` together with ``fin``
    :param verbose: when truthy, print progress messages
    """
    from fnmatch import fnmatchcase

    def _matches(attr, patterns):
        """Tell whether ``attr`` equals, or wildcard-matches, any pattern."""
        for pattern in patterns:
            if '*' in pattern:
                # The former ``attr.startswith(pattern)`` kept the literal
                # '*' in the prefix, so a wildcard could never match.
                if fnmatchcase(attr, pattern):
                    return True
            elif attr == pattern:
                return True
        return False

    def _report_selection():
        """Print the selected branches when running verbosely."""
        if self.out_branches and self.verbose:
            print("Select branches ...")
            for name in sorted(self.out_branches):
                print(name)

    self.type = self.__class__.__name__
    self.fin = xfile(fin, redirector)
    self.verbose = verbose
    if self.verbose:
        print("Reading {}".format(self.fin))
    self.istream = uproot.open(self.fin)
    self.branches = {}
    self.gen = None
    self.out_branches = []
    self.identifier = identifier if identifier else ['run', 'event', 'luminosityBlock']
    self.tree = self.istream[branch]
    self.nrows = self.tree.numentries
    self.nevts = nevts
    self.idx = -1
    self.chunk_idx = 0
    # never use a chunk larger than the tree itself
    self.chunk_size = chunk_size if chunk_size < self.nrows else self.nrows
    self.nan = float(nan)
    self.attrs = []
    self.shape = None
    self.cache = {}
    self.hdict = {}
    self.hists = histograms
    if specs:
        self.load_specs(specs)
    else:
        self.jdim = {}
        self.minv = {}
        self.maxv = {}
        self.jkeys = []
        self.fkeys = []
        self.nans = {}

    # perform initialization
    time0 = time.time()
    self.init()
    if self.verbose:
        print("{} init is complete in {} sec".format(self, time.time()-time0))

    if selected_branches:
        # keep every attribute matched by at least one name/pattern
        self.out_branches = [
            attr for attr in self.attrs if _matches(attr, selected_branches)
        ]
        _report_selection()

    if exclude_branches:
        # keep every attribute matched by none of the names/patterns;
        # dict.fromkeys drops duplicates while keeping a stable order
        self.out_branches = list(dict.fromkeys(
            attr for attr in self.attrs
            if not _matches(attr, exclude_branches)
        ))
        _report_selection()

    # declare histograms for original and normalized values
    if hg and self.hists:
        for key in self.attrs:
            low = self.minv[key]
            high = self.maxv[key]
            self.hdict['%s_orig' % key] = \
                hg.Bin(num=100, low=low, high=high,
                       quantity=lambda x: x, value=hg.Count())
            self.hdict['%s_norm' % key] = \
                hg.Bin(num=100, low=0, high=1,
                       quantity=lambda x: x, value=hg.Count())