def _get_bin_specs(h):
    """Collect the bin specifications of every axis of a histogram.

    Starts at the outermost histogram, records one dict for it and then
    descends into its first sub-histogram until a ``Count`` is reached.

    :param h: input histogrammar histogram
    :return: list with bin_specs of all dimensions of the histogram
    :rtype: list
    """
    # A Count is the innermost node: it contributes no axis of its own.
    if isinstance(h, histogrammar.Count):
        return []

    specs = []
    if isinstance(h, histogrammar.Categorize):
        # categorical axes are recorded with an empty specification
        specs.append({})
    elif isinstance(h, histogrammar.Bin):
        specs.append({"num": h.num, "low": h.low, "high": h.high})
    elif isinstance(h, histogrammar.SparselyBin):
        specs.append({"bin_width": h.binWidth, "bin_offset": h.origin})

    # Locate the first sub-histogram; fall back to a Count when there is none.
    if hasattr(h, "bins"):
        if h.bins:
            sub_hist = list(h.bins.values())[0]
        else:
            sub_hist = histogrammar.Count()
    elif hasattr(h, "values"):
        if h.values:
            sub_hist = h.values[0]
        else:
            sub_hist = histogrammar.Count()
    else:
        sub_hist = histogrammar.Count()

    return specs + _get_bin_specs(sub_hist)
def get_n_dim(cls):
    """Return the number of dimensions of a histogram.

    Counts how many nested histogram levels sit above the innermost
    ``Count`` by repeatedly stepping into the first sub-histogram.

    :returns: dimension of the histogram
    :rtype: int
    """
    depth = 0
    node = cls
    while not isinstance(node, hg.Count):
        depth += 1
        # step into the first sub-histogram; a Count ends the walk
        if hasattr(node, 'values'):
            node = node.values[0] if node.values else hg.Count()
        elif hasattr(node, 'bins'):
            node = list(node.bins.values())[0] if node.bins else hg.Count()
        else:
            node = hg.Count()
    return depth
def construct_empty_hist(self, features):
    """Build an (empty) histogram of the right type for the given features.

    The nested histogram is assembled from the inside out: starting from a
    plain counter, the features are visited in reverse order and each one
    wraps the histogram built so far as its sub-histogram.

    :param list features: histogram features
    :return: created histogram
    :raises RuntimeError: if the bin specifications of a numeric or
        timestamp feature hold neither ``bin_width`` nor ``num``/``low``/``high``
    """
    single_feature = len(features) == 1
    hist = hg.Count()

    for col in reversed(features):
        # the histogram type is driven by the feature's data type
        dt = self.var_dtype[col]
        # per-dtype processing function applied to values during filling
        f = utils.QUANTITY[dt]
        if single_feature:
            # the quantity is applied to the input as-is
            quant = lambda x, fnc=f: fnc(x)  # noqa
        else:
            # the quantity first selects this feature's column from the input
            quant = lambda x, fnc=f, clm=col: fnc(x[clm])  # noqa

        is_number = np.issubdtype(dt, np.number)
        is_timestamp = np.issubdtype(dt, np.datetime64)
        if not (is_number or is_timestamp):
            # everything else (e.g. strings, booleans) is treated as categories
            hist = hg.Categorize(quantity=quant, value=hist)
            continue

        # numbers and timestamps are binned according to their specifications
        specs = self.var_bin_specs(features, features.index(col))
        if "bin_width" in specs:
            hist = hg.SparselyBin(
                binWidth=specs["bin_width"],
                origin=specs.get("bin_offset", 0),
                quantity=quant,
                value=hist,
            )
        elif "num" in specs and "low" in specs and "high" in specs:
            hist = hg.Bin(
                num=specs["num"],
                low=specs["low"],
                high=specs["high"],
                quantity=quant,
                value=hist,
            )
        else:
            raise RuntimeError(
                "Do not know how to interpret bin specifications.")

    return hist
def construct_empty_hist(self, df, features):
    """Build an (empty) histogram of the right type for the given features.

    Starting from a plain counter, the features are visited in reverse
    order; ``get_hist_bin`` wraps the histogram built so far for each one.
    The feature data types are attached to the result as ``datatype``.

    :param df: input dataframe
    :param list features: histogram features
    :return: created histogram
    """
    hist = hg.Count()
    for feature in reversed(features):
        # the histogram type is chosen from the feature's data type
        dtype = self.var_dtype[feature]
        column = df[feature]
        hist = self.get_hist_bin(hist, features, column, feature, dtype)

    # attach the data types: a scalar for one feature, a list otherwise
    dtypes = [self.var_dtype[name] for name in features]
    if len(features) == 1:
        hist.datatype = dtypes[0]
    else:
        hist.datatype = dtypes
    return hist
def construct_empty_hist(self, features):
    """Build an (empty) histogram of the right type for the given features.

    Starting from a plain counter, the features are visited in reverse
    order; for each one a fill quantity is prepared and ``get_hist_bin``
    wraps the histogram built so far.

    :param list features: histogram features
    :return: created histogram
    """
    single_feature = len(features) == 1
    hist = hg.Count()

    for col in reversed(features):
        # the histogram type is driven by the feature's data type
        dt = self.var_dtype[col]
        # per-dtype processing function applied to values during filling
        f = QUANTITY[dt]
        if single_feature:
            # the quantity is applied to the input as-is
            quant = lambda x, fnc=f: fnc(x)  # noqa
        else:
            # the quantity first selects this feature's column from the input
            quant = lambda x, fnc=f, clm=col: fnc(x[clm])  # noqa
        hist = self.get_hist_bin(hist, features, quant, col, dt)

    return hist
def construct_empty_hist(self, columns):
    """Build an (empty) histogram of the right type for the given columns.

    The nested histogram is assembled from the inside out: starting from a
    plain counter, the columns are visited in reverse order and each one
    wraps the histogram built so far as its sub-histogram. The column data
    types and the number of dimensions are attached to the result.

    :param list columns: histogram columns
    :returns: created histogram
    """
    single_column = len(columns) == 1
    hist = hg.Count()

    for col in reversed(columns):
        # the histogram type is driven by the column's data type
        dt = np.dtype(self.var_dtype[col])
        # a user-supplied quantity wins over the per-dtype default
        f = self.quantity.get(col, hf.QUANTITY[dt.type])
        if single_column:
            # the quantity is applied to the input as-is
            quant = lambda x, fnc=f: fnc(x)  # noqa
        else:
            # the quantity first selects this column from the input
            quant = lambda x, fnc=f, clm=col: fnc(x[clm])  # noqa

        is_number = isinstance(dt.type(), np.number)
        is_timestamp = isinstance(dt.type(), np.datetime64)
        if not (is_number or is_timestamp):
            # everything else (e.g. strings, booleans) is treated as categories
            hist = hg.Categorize(quantity=quant, value=hist)
            continue

        # numbers and timestamps go into a sparsely binned histogram,
        # falling back to unit bin specifications for unconfigured columns
        fallback = self._unit_bin_specs if is_number else self._unit_timestamp_specs
        bs = self.bin_specs.get(col, fallback)
        hist = hg.SparselyBin(
            binWidth=bs['bin_width'],
            origin=bs['bin_offset'],
            quantity=quant,
            value=hist,
        )
        # NOTE: decorating the intermediate histograms here (n_dim, datatype)
        # was tried and did not work, so only the outermost one is decorated.

    # FIXME stick data types and number of dimension to histogram
    dtypes = [self.var_dtype[name] for name in columns]
    hist.datatype = dtypes[0] if single_column else dtypes
    hist.n_dim = len(columns)
    return hist
def construct_empty_hist(self, df, columns):
    """Create an (empty) histogram of right type

    Create a multi-dim histogram by iterating through the columns in reverse
    order and passing a single-dim hist as input to the next column. The
    column data types, the number of dimensions and an ``n_bins`` property
    are attached to the result.

    :param df: input dataframe
    :param list columns: histogram columns
    :returns: created histogram
    """
    hist = histogrammar.Count()

    # create a multi-dim histogram by iterating through the columns in reverse order
    # and passing a single-dim hist as input to the next column
    for col in reversed(columns):
        # histogram type depends on the data type
        dt = np.dtype(self.var_dtype[col])

        is_number = isinstance(dt.type(), np.number)
        is_timestamp = isinstance(dt.type(), np.datetime64)

        if is_number or is_timestamp:
            # numbers and timestamps are put in a sparse binned histogram
            bs = self.bin_specs.get(
                col,
                self._unit_bin_specs if is_number else self._unit_timestamp_specs)
            hist = histogrammar.SparselyBin(binWidth=bs['bin_width'],
                                            origin=bs['bin_offset'],
                                            quantity=df[col],
                                            value=hist)
        else:
            # string and booleans are treated as categories
            hist = histogrammar.Categorize(quantity=df[col], value=hist)

    # FIXME stick data types and number of dimension to histogram
    dta = [self.var_dtype[col] for col in columns]
    hist.datatype = dta[0] if len(columns) == 1 else dta
    hist.n_dim = len(columns)

    def _n_bins(self):
        """Return the number of bins of the histogram this is bound to."""
        if hasattr(self, 'num'):
            return self.num
        if hasattr(self, 'size'):
            return self.size
        raise RuntimeError(
            'Cannot retrieve number of bins from hgr hist')

    # A property is only evaluated through the descriptor protocol when it
    # lives on the class. Stored on the instance, ``hist.n_bins`` would just
    # hand back the property object. Install it on the histogram's class
    # instead, unless that class already provides ``n_bins`` itself.
    hist_cls = type(hist)
    if not hasattr(hist_cls, 'n_bins'):
        hist_cls.n_bins = property(_n_bins)

    return hist
def get_datatype(cls):
    """Get histogrammar histogram datatype(s) of its axes

    Return the data type of the variable represented by each axis of the
    histogram, determined from the histogram's bin keys / bin centers.

    :returns: list with datatypes of all dimensions of the histogram
    :rtype: list
    """
    datatype = []
    # a Count is the innermost node and has no axis of its own
    if isinstance(cls, histogrammar.Count):
        return datatype

    if isinstance(cls, histogrammar.Categorize):
        if len(cls.bins) > 0:
            # derive the type from the first category key
            dt = type(list(cls.bins.keys())[0])
            dt = np.dtype(dt).type
            # np.bytes_ is the canonical name of the former np.string_ alias,
            # which no longer exists in NumPy 2.0
            if (dt is np.str_) or (dt is np.bytes_) or (dt is np.object_):
                dt = str
            datatype = [dt]
    elif isinstance(cls, (histogrammar.Bin, histogrammar.SparselyBin)):
        # generic numeric type until a filled bin tells us more
        datatype = [np.number]
        bin_centers = cls.bin_centers()
        if len(bin_centers) > 0:
            dt = type(bin_centers[-1])
            dt = np.dtype(dt).type
            datatype = [dt]
            # HACK: making an educated guess for timestamp
            # timestamp is in ns since 1970, so a huge number.
            is_ts = DATE_LOW < bin_centers[-1] < DATE_HIGH
            if is_ts:
                datatype = [np.datetime64]

    # histogram may have a subhistogram. Extract it and recurse
    if hasattr(cls, "bins"):
        hist = list(cls.bins.values())[0] if cls.bins else histogrammar.Count()
    elif hasattr(cls, "values"):
        hist = cls.values[0] if cls.values else histogrammar.Count()
    else:
        hist = histogrammar.Count()

    return datatype + get_datatype(hist)
def test_n_dim(self):
    """Test dimension assigned to a histogram.

    The test is skipped (returns early) when the ``Pandas`` or ``Numpy``
    context manager yields ``None``.
    """
    with Pandas() as pd:
        if pd is None:
            return
    with Numpy() as np:
        # check the object bound by this context manager, mirroring the
        # pandas guard above
        if np is None:
            return
        sys.stderr.write("\n")

        # the dataframe returned alongside the histograms is not needed here
        _, hist1, hist2, hist3 = get_test_histograms1()
        hist0 = hg.Count()

        assert hist0.n_dim == 0
        assert hist1.n_dim == 1
        assert hist2.n_dim == 2
        assert hist3.n_dim == 3
def construct_empty_hist(self, df, columns):
    """Build an (empty) histogram of the right type for the given columns.

    The nested histogram is assembled from the inside out: starting from a
    plain counter, the columns are visited in reverse order and each one
    wraps the histogram built so far as its sub-histogram. The column data
    types and the number of dimensions are attached to the result.

    :param df: input dataframe
    :param list columns: histogram columns
    :returns: created histogram
    """
    hist = hg.Count()

    for col in reversed(columns):
        # the histogram type is driven by the column's data type
        dt = np.dtype(self.var_dtype[col])
        is_number = isinstance(dt.type(), np.number)
        is_timestamp = isinstance(dt.type(), np.datetime64)

        if not (is_number or is_timestamp):
            # everything else (e.g. strings, booleans) is treated as categories
            hist = hg.Categorize(quantity=df[col], value=hist)
            continue

        # numbers and timestamps go into a sparsely binned histogram
        specs = self.var_bin_specs(columns, columns.index(col))
        hist = hg.SparselyBin(
            binWidth=specs['bin_width'],
            origin=specs['bin_offset'],
            quantity=df[col],
            value=hist,
        )
        # NOTE: attaching the datatype to the intermediate histograms here
        # was tried and did not work, so only the outermost one is decorated.

    # FIXME stick data types and number of dimension to histogram
    dtypes = [self.var_dtype[name] for name in columns]
    if len(columns) == 1:
        hist.datatype = dtypes[0]
    else:
        hist.datatype = dtypes
    hist.n_dim = len(columns)
    return hist
def construct_empty_hist(self, df, features):
    """Build an (empty) histogram of the right type for the given features.

    The nested histogram is assembled from the inside out: starting from a
    plain counter, the features are visited in reverse order and each one
    wraps the histogram built so far as its sub-histogram. The feature data
    types are attached to the result as ``datatype``.

    :param df: input dataframe
    :param list features: histogram features
    :return: created histogram
    """
    hist = hg.Count()

    for col in reversed(features):
        # the histogram type is driven by the feature's data type
        dt = np.dtype(self.var_dtype[col])
        is_number = isinstance(dt.type(), np.number)
        is_timestamp = isinstance(dt.type(), np.datetime64)

        if not (is_number or is_timestamp):
            # everything else (e.g. strings, booleans) is treated as categories
            hist = hg.Categorize(quantity=df[col], value=hist)
            continue

        # numbers and timestamps go into a sparsely binned histogram
        specs = self.var_bin_specs(features, features.index(col))
        hist = hg.SparselyBin(
            binWidth=specs['bin_width'],
            origin=specs['bin_offset'],
            quantity=df[col],
            value=hist,
        )

    # attach the data types: a scalar for one feature, a list otherwise
    dtypes = [self.var_dtype[name] for name in features]
    if len(features) == 1:
        hist.datatype = dtypes[0]
    else:
        hist.datatype = dtypes
    return hist
#!/usr/bin/env python
"""First histogram tutorial for the histogrammar package.

Fills a 20-bin histogram over [0, 1) with 2000 uniform random numbers and
saves a plot of it to ``histogrammar.png``.
"""
import matplotlib
# select the non-interactive Agg backend before pyplot is imported
matplotlib.use('Agg')
import matplotlib.pyplot as pyplot

import histogrammar as hg

# generate a stream of uniform random numbers
import random
# range (not the Python-2-only xrange) so the script also runs on Python 3
data = [random.random() for _ in range(2000)]

# aggregation structure and fill rule
histogram = hg.Bin(num=20, low=0, high=1, quantity=lambda x: x, value=hg.Count())

# fill the histogram!
for d in data:
    histogram.fill(d)

# quick plotting convenience method using matplotlib (if the user has this installed)
# NOTE(review): this interface was expected to move to
# histogram.plot.matplotlib -- confirm against the installed histogrammar version
ax = histogram.matplotlib(name="hello world!")

pyplot.savefig('histogrammar.png')
def _construct_empty_hist(self, columns):
    """Create an (empty) histogram of right type

    Create a multi-dim histogram by iterating through the columns in reverse
    order and passing a single-dim hist as input to the next column. The
    column data types, the number of dimensions and an ``n_bins`` property
    are attached to the result.

    :param columns: histogram columns
    :returns: created histogram
    """
    hist = hg.Count()

    # create a multi-dim histogram by iterating through the columns in reverse order
    # and passing a single-dim hist as input to the next column
    for col in reversed(columns):
        # histogram type depends on the data type
        dt = np.dtype(self.datatype[col])

        # processing function, e.g. only accept booleans during filling;
        # a user-supplied quantity wins over the per-dtype default
        f = self.quantity[col] if col in self.quantity else QUANTITY[
            dt.type]
        if len(columns) == 1:
            # the quantity is applied to the input as-is
            q = lambda x, fnc=f: fnc(x)  # noqa
        else:
            # the quantity first selects this column from the input
            q = lambda x, fnc=f, clm=col: fnc(x[clm])  # noqa

        is_number = isinstance(dt.type(), np.number)
        is_timestamp = isinstance(dt.type(), np.datetime64)

        if is_number or is_timestamp:
            # numbers and timestamps are put in a sparse binned histogram
            bs = self.bin_specs.get(
                col,
                self._unit_bin_specs if is_number else self._unit_timestamp_specs)
            hist = hg.SparselyBin(binWidth=bs['bin_width'],
                                  origin=bs['bin_offset'],
                                  quantity=q,
                                  value=hist)
        else:
            # string and booleans are treated as categories
            hist = hg.Categorize(quantity=q, value=hist)

    # FIXME stick data types and number of dimension to histogram
    dta = [self.datatype[col] for col in columns]
    hist.datatype = dta[0] if len(columns) == 1 else dta
    hist.n_dim = len(columns)

    def _n_bins(self):
        """Return the number of bins of the histogram this is bound to."""
        # attribute names must be passed to hasattr as strings
        if hasattr(self, 'num'):
            return self.num
        if hasattr(self, 'size'):
            return self.size
        raise Exception(
            'Cannot retrieve number of bins from hgr hist.')

    # A property is only evaluated through the descriptor protocol when it
    # lives on the class. Stored on the instance, ``hist.n_bins`` would just
    # hand back the property object. Install it on the histogram's class
    # instead, unless that class already provides ``n_bins`` itself.
    hist_cls = type(hist)
    if not hasattr(hist_cls, 'n_bins'):
        hist_cls.n_bins = property(_n_bins)

    return hist
def __init__(self, fin, branch='Events', selected_branches=None,
             exclude_branches=None, identifier=None,
             chunk_size=1000, nevts=-1, specs=None, nan=np.nan, histograms=False,
             redirector='root://cms-xrd-global.cern.ch', verbose=0):
    """Open a ROOT file and prepare the reader state.

    :param fin: input file name; resolved through ``xfile`` with ``redirector``
    :param branch: name of the tree to read from the opened file
    :param selected_branches: names of attributes to keep; a name holding
        ``*`` is matched as a prefix (the ``*`` itself is ignored)
    :param exclude_branches: names of attributes to drop, with the same
        ``*`` prefix rule; applied after ``selected_branches`` and, when
        given, replaces its result
    :param identifier: identifier branch names; defaults to
        ``['run', 'event', 'luminosityBlock']``
    :param chunk_size: chunk size, capped at the number of rows in the tree
    :param nevts: stored as ``self.nevts``
    :param specs: passed to ``load_specs`` when given; otherwise the specs
        containers are initialised empty
    :param nan: value stored (as float) in ``self.nan``
    :param histograms: when true (and ``hg`` is available) declare one
        original-value and one normalised-value histogram per attribute
    :param redirector: passed to ``xfile`` together with ``fin``
    :param verbose: verbosity level; any true value enables progress prints
    """
    self.type = self.__class__.__name__
    self.fin = xfile(fin, redirector)
    self.verbose = verbose
    if self.verbose:
        print("Reading {}".format(self.fin))
    self.istream = uproot.open(self.fin)
    self.branches = {}
    self.gen = None
    self.out_branches = []
    self.identifier = identifier if identifier else ['run', 'event', 'luminosityBlock']
    self.tree = self.istream[branch]
    self.nrows = self.tree.numentries
    self.nevts = nevts
    self.idx = -1
    self.chunk_idx = 0
    self.chunk_size = chunk_size if chunk_size < self.nrows else self.nrows
    self.nan = float(nan)
    self.attrs = []
    self.shape = None
    self.cache = {}
    self.hdict = {}
    self.hists = histograms
    if specs:
        self.load_specs(specs)
    else:
        self.jdim = {}
        self.minv = {}
        self.maxv = {}
        self.jkeys = []
        self.fkeys = []
        self.nans = {}

    # perform initialization
    time0 = time.time()
    self.init()
    if self.verbose:
        print("{} init is complete in {} sec".format(self, time.time()-time0))

    def _matches(attr, pattern):
        """Tell whether ``attr`` is selected by ``pattern``."""
        if '*' in pattern:
            # the '*' only marks the pattern as a prefix; it must be removed
            # before the comparison, otherwise no real branch name can match
            return attr.startswith(pattern.replace('*', ''))
        return attr == pattern

    if selected_branches:
        # keep each attribute once, even when several patterns select it
        self.out_branches = [
            attr for attr in self.attrs
            if any(_matches(attr, name) for name in selected_branches)
        ]
        if self.out_branches:
            if self.verbose:
                print("Select branches ...")
                for name in sorted(self.out_branches):
                    print(name)

    if exclude_branches:
        # keep every attribute that no exclusion pattern matches
        out_branches = set()
        for attr in self.attrs:
            if not any(_matches(attr, name) for name in exclude_branches):
                out_branches.add(attr)
        self.out_branches = list(out_branches)
        if self.out_branches:
            if self.verbose:
                print("Select branches ...")
                for name in sorted(self.out_branches):
                    print(name)

    # declare histograms for original and normalized values
    if hg and self.hists:
        for key in self.attrs:
            low = self.minv[key]
            high = self.maxv[key]
            self.hdict['%s_orig' % key] = \
                hg.Bin(num=100, low=low, high=high, quantity=lambda x: x, value=hg.Count())
            self.hdict['%s_norm' % key] = \
                hg.Bin(num=100, low=0, high=1, quantity=lambda x: x, value=hg.Count())
def __init__(self, fin, branch='Events', selected_branches=None,
             exclude_branches=None, identifier=None, label=None,
             chunk_size=1000, nevts=-1, specs=None, nan=np.nan, histograms=False,
             redirector='root://cms-xrd-global.cern.ch', verbose=0):
    """Open a ROOT file and prepare the reader state.

    :param fin: input file name; resolved through ``xfile`` with ``redirector``
    :param branch: name of the tree to read from the opened file
    :param selected_branches: branch names to read; encoded with
        ``str.encode`` and, when given, override ``exclude_branches``
    :param exclude_branches: branch names to leave out of the tree's keys;
        encoded with ``str.encode`` before comparison
    :param identifier: identifier branch names; defaults to
        ``['run', 'event', 'luminosityBlock']``
    :param label: stored as ``self.label``
    :param chunk_size: chunk size, capped at the number of rows in the tree
    :param nevts: number of events; ``-1`` means all rows of the tree
    :param specs: passed to ``load_specs`` when given; otherwise the specs
        containers are initialised empty
    :param nan: value stored (as float) in ``self.nan``
    :param histograms: when true (and ``hg`` is available) declare one
        original-value and one normalised-value histogram per attribute
    :param redirector: passed to ``xfile`` together with ``fin``
    :param verbose: verbosity level; any true value enables progress prints
    """
    self.type = self.__class__.__name__
    self.fin = xfile(fin, redirector)
    self.verbose = verbose
    if self.verbose:
        print("Reading {}".format(self.fin))
    self.istream = uproot.open(self.fin)
    self.branches = {}
    self.gen = None
    self.out_branches = []
    self.identifier = identifier or ['run', 'event', 'luminosityBlock']
    self.tree = self.istream[branch]
    self.nrows = self.tree.numentries
    # -1 is the "read everything" marker
    if nevts != -1:
        self.nevts = nevts
    else:
        self.nevts = self.nrows
    self.label = label
    self.idx = -1
    self.chunk_idx = 0
    # never ask for more rows per chunk than the tree holds
    if chunk_size < self.nrows:
        self.chunk_size = chunk_size
    else:
        self.chunk_size = self.nrows
    self.nan = float(nan)
    self.attrs = []
    self.shape = None
    self.cache = {}
    self.hdict = {}
    self.hists = histograms
    self.idx_label = 0
    self.flat_keys_encoded = []
    self.jagged_keys_encoded = []
    self.keys = []
    self.min_list = []
    self.max_list = []
    self.jdimension = []
    self.dimension_list = []
    self.time_reading = []
    self.time_reading_and_specs = []

    if specs:
        self.load_specs(specs)
    else:
        self.jdim = {}
        self.minv = {}
        self.maxv = {}
        self.jkeys = []
        self.fkeys = []
        self.nans = {}

    time0 = time.time()

    if exclude_branches:
        print(f"Excluded branches: {exclude_branches}")
        all_branches = self.tree.keys()
        # branch names are encoded before being compared to the tree keys
        excluded = [name.encode() for name in exclude_branches]
        kept = []
        for elem in all_branches:
            if elem not in excluded:
                kept.append(elem)
        self.out_branches = kept

    if selected_branches:
        print(f"Selected branches: {selected_branches}")
        # an explicit selection replaces whatever the exclusion produced
        self.out_branches = [name.encode() for name in selected_branches]

    # perform initialization
    self.init()
    if self.verbose:
        elapsed = time.time() - time0
        print("{} init is complete in {} sec".format(self, elapsed))

    # declare histograms for original and normalized values
    if hg and self.hists:
        for key in self.attrs:
            lower = self.minv[key]
            upper = self.maxv[key]
            self.hdict['%s_orig' % key] = hg.Bin(
                num=100, low=lower, high=upper,
                quantity=lambda x: x, value=hg.Count())
            self.hdict['%s_norm' % key] = hg.Bin(
                num=100, low=0, high=1,
                quantity=lambda x: x, value=hg.Count())
zip(output_fields, dimuonCandidate_aux(pt, eta, phi, mass, charge, mediumid))) try: import matplotlib matplotlib.use('agg') import matplotlib.pyplot as plt import histogrammar as hg import json import logging mpl_logger = logging.getLogger('matplotlib') mpl_logger.setLevel(logging.WARNING) invmass = hg.Bin(80, 70, 110, quantity=lambda x: x, value=hg.Count()) def histogram_fill_from_file(filename): with open(filename, 'r') as f_in: for row in f_in: js = json.loads(row) invmass.fill(js['mass']) def histogram(output='output.png'): #ax = invmass.plot.matplotlib(name="", color="green", edgecolor="white", lw=5) ax = invmass.plot.matplotlib(name="") ax.set_xlabel('Dimuon invariant mass m($\mu\mu$) (GeV)') ax.set_ylabel('Events / 0.5 GeV') plt.savefig(output) print(json.dumps(invmass.toJson())) #plt.show()