Ejemplo n.º 1
0
def _get_bin_specs(h):
    """Collect the bin specifications of a (nested) histogram.

    Starting at ``h``, the histogram chain is followed through the first
    sub-histogram of every level until a ``histogrammar.Count`` is reached.

    :param h: input histogrammar histogram
    :return: bin specifications, outermost axis first; ``Categorize`` axes
        contribute an empty dict
    :rtype: list
    """
    collected = []
    node = h
    while not isinstance(node, histogrammar.Count):
        if isinstance(node, histogrammar.Categorize):
            # categories have no numeric binning to report
            collected.append({})
        elif isinstance(node, histogrammar.Bin):
            collected.append({"num": node.num, "low": node.low, "high": node.high})
        elif isinstance(node, histogrammar.SparselyBin):
            collected.append(
                {"bin_width": node.binWidth, "bin_offset": node.origin}
            )

        # step into the first sub-histogram; an empty container ends the walk
        if hasattr(node, "bins"):
            node = next(iter(node.bins.values())) if node.bins else histogrammar.Count()
        elif hasattr(node, "values"):
            node = node.values[0] if node.values else histogrammar.Count()
        else:
            node = histogrammar.Count()
    return collected
Ejemplo n.º 2
0
def get_n_dim(cls):
    """Count the nesting depth of a histogram.

    Every level above the terminating ``hg.Count`` counts as one dimension;
    only the first sub-histogram of each level is followed.

    :returns: dimension of the histogram
    :rtype: int
    """
    depth = 0
    node = cls
    while not isinstance(node, hg.Count):
        depth += 1
        # step into the first sub-histogram; an empty container ends the walk
        if hasattr(node, 'values'):
            node = node.values[0] if node.values else hg.Count()
        elif hasattr(node, 'bins'):
            node = next(iter(node.bins.values())) if node.bins else hg.Count()
        else:
            node = hg.Count()
    return depth
Ejemplo n.º 3
0
    def construct_empty_hist(self, features):
        """Build an empty, possibly multi-dimensional, histogram.

        The features are walked in reverse order; the histogram built so far
        becomes the ``value`` (sub-histogram) of the histogram created for the
        next feature, so the first feature ends up outermost.

        :param list features: histogram features
        :return: created histogram
        :rtype: histogrammar.Count
        :raises RuntimeError: if the bin specifications of a numeric or
            timestamp feature hold neither ``bin_width`` nor the complete
            ``num``/``low``/``high`` set
        """
        single_feature = len(features) == 1
        hist = hg.Count()

        for col in tuple(reversed(features)):
            # the data type of the feature decides the histogram type
            dt = self.var_dtype[col]

            # processing function registered for this data type
            f = utils.QUANTITY[dt]
            if single_feature:
                # filled from a pd.Series: the value is used directly
                quant = lambda x, fnc=f: fnc(x)  # noqa
            else:
                # filled from a pd.DataFrame: bind the column name now
                quant = lambda x, fnc=f, clm=col: fnc(x[clm])  # noqa

            binned = np.issubdtype(dt, np.number) or np.issubdtype(dt, np.datetime64)
            if not binned:
                # everything else (e.g. strings, booleans) becomes categories
                hist = hg.Categorize(quantity=quant, value=hist)
                continue

            # numbers and timestamps get a binned histogram
            specs = self.var_bin_specs(features, features.index(col))
            if "bin_width" in specs:
                hist = hg.SparselyBin(
                    binWidth=specs["bin_width"],
                    origin=specs.get("bin_offset", 0),
                    quantity=quant,
                    value=hist,
                )
            elif all(key in specs for key in ("num", "low", "high")):
                hist = hg.Bin(
                    num=specs["num"],
                    low=specs["low"],
                    high=specs["high"],
                    quantity=quant,
                    value=hist,
                )
            else:
                raise RuntimeError(
                    "Do not know how to interpret bin specifications.")

        return hist
Ejemplo n.º 4
0
    def construct_empty_hist(self, df, features):
        """Build an empty, possibly multi-dimensional, histogram.

        The features are walked in reverse order; for each one
        ``self.get_hist_bin`` wraps the histogram built so far, so the first
        feature ends up outermost.  The data types of the features are
        attached to the result as ``datatype``.

        :param df: input dataframe
        :param list features: histogram features
        :return: created histogram
        :rtype: histogrammar.Count
        """
        hist = hg.Count()

        for col in tuple(reversed(features)):
            # data type and column of the feature drive the histogram type
            dt = self.var_dtype[col]
            column = df[col]
            hist = self.get_hist_bin(hist, features, column, col, dt)

        # a single feature stores its bare data type, several store a list
        datatypes = [self.var_dtype[name] for name in features]
        if len(features) == 1:
            hist.datatype = datatypes[0]
        else:
            hist.datatype = datatypes
        return hist
Ejemplo n.º 5
0
    def construct_empty_hist(self, features):
        """Build an empty, possibly multi-dimensional, histogram.

        The features are walked in reverse order; for each one
        ``self.get_hist_bin`` wraps the histogram built so far, so the first
        feature ends up outermost.

        :param list features: histogram features
        :return: created histogram
        :rtype: histogrammar.Count
        """
        single_feature = len(features) == 1
        hist = hg.Count()

        for col in tuple(reversed(features)):
            # the data type of the feature decides the histogram type
            dt = self.var_dtype[col]

            # processing function registered for this data type
            f = QUANTITY[dt]
            if single_feature:
                # filled from a pd.Series: the value is used directly
                quant = lambda x, fnc=f: fnc(x)  # noqa
            else:
                # filled from a pd.DataFrame: bind the column name now
                quant = lambda x, fnc=f, clm=col: fnc(x[clm])  # noqa

            hist = self.get_hist_bin(hist, features, quant, col, dt)

        return hist
Ejemplo n.º 6
0
    def construct_empty_hist(self, columns):
        """Build an empty, possibly multi-dimensional, histogram.

        The columns are walked in reverse order; the histogram built so far
        becomes the ``value`` (sub-histogram) of the histogram created for the
        next column, so the first column ends up outermost.  The result is
        tagged with ``datatype`` and ``n_dim``.

        :param list columns: histogram columns
        :returns: created histogram
        :rtype: histogrammar.Count
        """
        single_column = len(columns) == 1
        hist = hg.Count()

        for col in tuple(reversed(columns)):
            # the data type of the column decides the histogram type
            dt = np.dtype(self.var_dtype[col])

            # processing function: a per-column override wins over the
            # per-type default (the default is looked up in either case)
            f = self.quantity.get(col, hf.QUANTITY[dt.type])
            if single_column:
                # filled from a pd.Series: the value is used directly
                quant = lambda x, fnc=f: fnc(x)  # noqa
            else:
                # filled from a pd.DataFrame: bind the column name now
                quant = lambda x, fnc=f, clm=col: fnc(x[clm])  # noqa

            sample = dt.type()
            is_number = isinstance(sample, np.number)
            if not (is_number or isinstance(sample, np.datetime64)):
                # everything else (e.g. strings, booleans) becomes categories
                hist = hg.Categorize(quantity=quant, value=hist)
                continue

            # numbers and timestamps go into a sparsely binned histogram,
            # falling back to the unit specs matching the kind of column
            if is_number:
                bs = self.bin_specs.get(col, self._unit_bin_specs)
            else:
                bs = self.bin_specs.get(col, self._unit_timestamp_specs)
            hist = hg.SparselyBin(binWidth=bs['bin_width'],
                                  origin=bs['bin_offset'],
                                  quantity=quant,
                                  value=hist)

        # NOTE: tagging the intermediate histograms inside the loop was tried
        # by the original author and reported not to work, hence only the
        # final histogram is tagged here.
        # FIXME stick data types and number of dimension to histogram
        datatypes = [self.var_dtype[name] for name in columns]
        if single_column:
            hist.datatype = datatypes[0]
        else:
            hist.datatype = datatypes
        hist.n_dim = len(columns)

        return hist
Ejemplo n.º 7
0
    def construct_empty_hist(self, df, columns):
        """Create an (empty) histogram of right type

        Create a multi-dim histogram by iterating through the columns in
        reverse order and passing a single-dim hist as input to the next
        column.  The result is tagged with ``datatype`` and ``n_dim``, and its
        class receives an ``n_bins`` property.

        :param df: input dataframe
        :param list columns: histogram columns
        :returns: created histogram
        :rtype: histogrammar.Count
        """

        hist = histogrammar.Count()

        # create a multi-dim histogram by iterating through the columns in reverse order
        # and passing a single-dim hist as input to the next column
        for col in reversed(columns):
            # histogram type depends on the data type
            dt = np.dtype(self.var_dtype[col])

            is_number = isinstance(dt.type(), np.number)
            is_timestamp = isinstance(dt.type(), np.datetime64)

            if is_number or is_timestamp:
                # numbers and timestamps are put in a sparse binned histogram;
                # columns without explicit specs fall back to the unit specs
                bs = self.bin_specs.get(
                    col, self._unit_bin_specs
                    if is_number else self._unit_timestamp_specs)
                hist = histogrammar.SparselyBin(binWidth=bs['bin_width'],
                                                origin=bs['bin_offset'],
                                                quantity=df[col],
                                                value=hist)
            else:
                # string and booleans are treated as categories
                hist = histogrammar.Categorize(quantity=df[col], value=hist)

        # FIXME stick data types and number of dimension to histogram
        dta = [self.var_dtype[col] for col in columns]
        hist.datatype = dta[0] if len(columns) == 1 else dta
        hist.n_dim = len(columns)

        @property
        def n_bins(self):
            """Number of bins, read from ``num`` or else from ``size``."""
            if hasattr(self, 'num'):
                return self.num
            elif hasattr(self, 'size'):
                return self.size
            else:
                raise RuntimeError(
                    'Cannot retrieve number of bins from hgr hist')

        # A property is only evaluated when it is found on the class; stored
        # on the instance, ``hist.n_bins`` would merely return the property
        # object.  Attach it to the histogram's class so that it yields the
        # actual number of bins.  Note that this makes ``n_bins`` available
        # on every instance of that class.
        type(hist).n_bins = n_bins

        return hist
Ejemplo n.º 8
0
def get_datatype(cls):
    """Get histogrammar histogram datatype(s) of its axes

    The data type of every axis is inferred from the histogram content and
    the nested sub-histograms are followed recursively (first sub-histogram
    of each level) until a ``histogrammar.Count`` is reached:

    * ``Categorize``: numpy scalar type of the first bin key, with string,
      bytes and object keys reported as ``str``; an empty ``Categorize``
      contributes nothing.
    * ``Bin`` / ``SparselyBin``: ``np.number`` when there are no bin centers,
      otherwise the numpy scalar type of the last bin center, or
      ``np.datetime64`` when that center lies strictly between the
      module-level ``DATE_LOW`` and ``DATE_HIGH``.

    :returns: list with datatypes of all dimensions of the histogram
    :rtype: list
    """
    datatype = []
    if isinstance(cls, histogrammar.Count):
        return datatype
    if isinstance(cls, histogrammar.Categorize):
        if cls.bins:
            dt = type(next(iter(cls.bins)))
            dt = np.dtype(dt).type
            # np.bytes_ is used instead of the np.string_ alias, which no
            # longer exists in NumPy 2.0; np.bytes_ is available in NumPy 1.x
            # as well.
            if dt is np.str_ or dt is np.bytes_ or dt is np.object_:
                dt = str
            datatype = [dt]
    elif isinstance(cls, (histogrammar.Bin, histogrammar.SparselyBin)):
        datatype = [np.number]
        bin_centers = cls.bin_centers()
        if len(bin_centers) > 0:
            dt = type(bin_centers[-1])
            dt = np.dtype(dt).type
            datatype = [dt]
            # HACK: making an educated guess for timestamp
            # timestamp is in ns since 1970, so a huge number.
            is_ts = DATE_LOW < bin_centers[-1] < DATE_HIGH
            if is_ts:
                datatype = [np.datetime64]
    # histogram may have a subhistogram. Extract it and recurse
    if hasattr(cls, "bins"):
        hist = list(cls.bins.values())[0] if cls.bins else histogrammar.Count()
    elif hasattr(cls, "values"):
        hist = cls.values[0] if cls.values else histogrammar.Count()
    else:
        hist = histogrammar.Count()
    return datatype + get_datatype(hist)
Ejemplo n.º 9
0
    def test_n_dim(self):
        """Test dimension assigned to a histogram

        The test returns early, without asserting anything, when the
        ``Pandas()`` or ``Numpy()`` context manager yields ``None``.
        """
        with Pandas() as pd:
            if pd is None:
                return
            with Numpy() as np:
                # check the name bound by this ``with`` statement; the
                # previous check looked at ``numpy``, which is not the name
                # bound here
                if np is None:
                    return
                sys.stderr.write("\n")

                df, hist1, hist2, hist3 = get_test_histograms1()
                hist0 = hg.Count()

                assert hist0.n_dim == 0
                assert hist1.n_dim == 1
                assert hist2.n_dim == 2
                assert hist3.n_dim == 3
Ejemplo n.º 10
0
    def construct_empty_hist(self, df, columns):
        """Build an empty, possibly multi-dimensional, histogram.

        The columns are walked in reverse order; the histogram built so far
        becomes the ``value`` (sub-histogram) of the histogram created for the
        next column, so the first column ends up outermost.  The result is
        tagged with ``datatype`` and ``n_dim``.

        :param df: input dataframe
        :param list columns: histogram columns
        :returns: created histogram
        :rtype: histogrammar.Count
        """
        hist = hg.Count()

        for col in tuple(reversed(columns)):
            # the data type of the column decides the histogram type
            sample = np.dtype(self.var_dtype[col]).type()

            if not isinstance(sample, (np.number, np.datetime64)):
                # everything else (e.g. strings, booleans) becomes categories
                hist = hg.Categorize(quantity=df[col], value=hist)
                continue

            # numbers and timestamps go into a sparsely binned histogram
            specs = self.var_bin_specs(columns, columns.index(col))
            hist = hg.SparselyBin(binWidth=specs['bin_width'],
                                  origin=specs['bin_offset'],
                                  quantity=df[col],
                                  value=hist)

        # NOTE: tagging the intermediate histograms inside the loop was tried
        # by the original author and reported not to work, hence only the
        # final histogram is tagged here.
        # FIXME stick data types and number of dimension to histogram
        datatypes = [self.var_dtype[name] for name in columns]
        if len(columns) == 1:
            hist.datatype = datatypes[0]
        else:
            hist.datatype = datatypes
        hist.n_dim = len(columns)

        return hist
Ejemplo n.º 11
0
    def construct_empty_hist(self, df, features):
        """Build an empty, possibly multi-dimensional, histogram.

        The features are walked in reverse order; the histogram built so far
        becomes the ``value`` (sub-histogram) of the histogram created for the
        next feature, so the first feature ends up outermost.  The data types
        of the features are attached to the result as ``datatype``.

        :param df: input dataframe
        :param list features: histogram features
        :return: created histogram
        :rtype: histogrammar.Count
        """
        hist = hg.Count()

        for col in tuple(reversed(features)):
            # the data type of the feature decides the histogram type
            sample = np.dtype(self.var_dtype[col]).type()

            if not isinstance(sample, (np.number, np.datetime64)):
                # everything else (e.g. strings, booleans) becomes categories
                hist = hg.Categorize(quantity=df[col], value=hist)
                continue

            # numbers and timestamps go into a sparsely binned histogram
            specs = self.var_bin_specs(features, features.index(col))
            hist = hg.SparselyBin(
                binWidth=specs['bin_width'],
                origin=specs['bin_offset'],
                quantity=df[col],
                value=hist
            )

        # a single feature stores its bare data type, several store a list
        datatypes = [self.var_dtype[name] for name in features]
        if len(features) == 1:
            hist.datatype = datatypes[0]
        else:
            hist.datatype = datatypes
        return hist
Ejemplo n.º 12
0
#!/usr/bin/env python

# Select the 'Agg' backend before pyplot is imported.
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as pyplot

# First histogram tutorial in histogrammar package
import histogrammar as hg

# generate a stream of uniform random numbers
# (``range`` instead of ``xrange`` so the script also runs on Python 3)
import random
data = [random.random() for _ in range(2000)]

# aggregation structure and fill rule: 20 bins on [0, 1), each bin counting
# the values that fall into it
histogram = hg.Bin(num=20,
                   low=0,
                   high=1,
                   quantity=lambda x: x,
                   value=hg.Count())

# fill the histogram!
for d in data:
    histogram.fill(d)

# quick plotting convenience method using matplotlib (if the user has this installed)
# NOTE: it looks like this interface will soon change to histogram.plot.matplotlib
ax = histogram.matplotlib(name="hello world!")

pyplot.savefig('histogrammar.png')
Ejemplo n.º 13
0
    def _construct_empty_hist(self, columns):
        """Create an (empty) histogram of right type

        Create a multi-dim histogram by iterating through the columns in
        reverse order and passing a single-dim hist as input to the next
        column.  The result is tagged with ``datatype`` and ``n_dim``, and its
        class receives an ``n_bins`` property.

        :param columns: histogram columns
        :returns: created histogram
        :rtype: histogrammar.Count
        """

        hist = hg.Count()

        # create a multi-dim histogram by iterating through the columns in reverse order
        # and passing a single-dim hist as input to the next column
        for col in reversed(columns):
            # histogram type depends on the data type
            dt = np.dtype(self.datatype[col])

            # processing function: a per-column override wins over the
            # per-type default
            f = self.quantity[col] if col in self.quantity else QUANTITY[
                dt.type]
            if len(columns) == 1:
                # df[col] is a pd.series
                q = lambda x, fnc=f: fnc(x)
            else:
                # df[columns] is a pd.Dataframe
                # fix column to col
                q = lambda x, fnc=f, clm=col: fnc(x[clm])

            is_number = isinstance(dt.type(), np.number)
            is_timestamp = isinstance(dt.type(), np.datetime64)

            if is_number or is_timestamp:
                # numbers and timestamps are put in a sparse binned histogram;
                # columns without explicit specs fall back to the unit specs
                bs = self.bin_specs.get(
                    col, self._unit_bin_specs
                    if is_number else self._unit_timestamp_specs)
                hist = hg.SparselyBin(binWidth=bs['bin_width'],
                                      origin=bs['bin_offset'],
                                      quantity=q,
                                      value=hist)
            else:
                # string and booleans are treated as categories
                hist = hg.Categorize(quantity=q, value=hist)

        # FIXME stick data types and number of dimension to histogram
        dta = [self.datatype[col] for col in columns]
        hist.datatype = dta[0] if len(columns) == 1 else dta
        hist.n_dim = len(columns)

        @property
        def n_bins(self):
            """Number of bins, read from ``num`` or else from ``size``."""
            # attribute names must be passed to hasattr as strings; the bare
            # names used before raised NameError
            if hasattr(self, 'num'):
                return self.num
            elif hasattr(self, 'size'):
                return self.size
            else:
                raise Exception(
                    'Cannot retrieve number of bins from hgr hist.')

        # A property is only evaluated when it is found on the class; stored
        # on the instance, ``hist.n_bins`` would merely return the property
        # object.  Attach it to the histogram's class so that it yields the
        # actual number of bins.  Note that this makes ``n_bins`` available
        # on every instance of that class.
        type(hist).n_bins = n_bins

        return hist
Ejemplo n.º 14
0
    def __init__(self, fin, branch='Events', selected_branches=None,
                 exclude_branches=None, identifier=None,
                 chunk_size=1000, nevts=-1, specs=None, nan=np.nan, histograms=False,
                 redirector='root://cms-xrd-global.cern.ch', verbose=0):
        """Open the input file, run ``self.init()`` and select output branches.

        :param fin: input file; resolved through ``xfile(fin, redirector)``
            and opened with ``uproot.open``
        :param str branch: key of the tree to read from the opened file
        :param list selected_branches: exact names or ``*`` wildcard patterns
            of the attributes to put into ``self.out_branches``
        :param list exclude_branches: exact names or ``*`` wildcard patterns
            of the attributes to leave out of ``self.out_branches``; when
            given, its result replaces the one of ``selected_branches``
        :param list identifier: stored as ``self.identifier``; defaults to
            ``['run', 'event', 'luminosityBlock']``
        :param int chunk_size: stored as ``self.chunk_size``, capped at the
            number of entries of the tree
        :param int nevts: stored as ``self.nevts``
        :param specs: when set, passed to ``self.load_specs``; otherwise the
            spec containers are initialised empty
        :param nan: converted with ``float`` and stored as ``self.nan``
        :param bool histograms: when true (and ``hg`` is truthy), two 100-bin
            histograms are declared per attribute in ``self.hdict``
        :param str redirector: passed to ``xfile`` together with ``fin``
        :param int verbose: when true, progress messages are printed
        """
        # local import: only needed to resolve wildcard branch patterns
        from fnmatch import fnmatchcase

        def matches(attr, pattern):
            """Tell whether attribute ``attr`` is selected by ``pattern``."""
            if '*' in pattern:
                # The pattern used to be compared with ``attr.startswith``
                # while still containing the '*', so wildcards never matched.
                return fnmatchcase(attr, pattern)
            return attr == pattern

        self.type = self.__class__.__name__
        self.fin = xfile(fin, redirector)
        self.verbose = verbose
        if self.verbose:
            print("Reading {}".format(self.fin))
        self.istream = uproot.open(self.fin)
        self.branches = {}
        self.gen = None
        self.out_branches = []
        self.identifier = identifier if identifier else ['run', 'event', 'luminosityBlock']
        self.tree = self.istream[branch]
        self.nrows = self.tree.numentries
        self.nevts = nevts
        self.idx = -1
        self.chunk_idx = 0
        self.chunk_size = chunk_size if chunk_size < self.nrows else self.nrows
        self.nan = float(nan)
        self.attrs = []
        self.shape = None
        self.cache = {}
        self.hdict = {}
        self.hists = histograms
        if specs:
            self.load_specs(specs)
        else:
            self.jdim = {}
            self.minv = {}
            self.maxv = {}
            self.jkeys = []
            self.fkeys = []
            self.nans = {}

        # perform initialization
        time0 = time.time()
        self.init()
        if self.verbose:
            print("{} init is complete in {} sec".format(self, time.time()-time0))

        if selected_branches:
            # keep the order of self.attrs; an attribute matched by several
            # names is listed only once
            self.out_branches = [
                attr for attr in self.attrs
                if any(matches(attr, name) for name in selected_branches)
            ]
            if self.out_branches and self.verbose:
                print("Select branches ...")
                for name in sorted(self.out_branches):
                    print(name)

        if exclude_branches:
            # keep every attribute that none of the exclusion names matches
            kept = {
                attr for attr in self.attrs
                if not any(matches(attr, name) for name in exclude_branches)
            }
            self.out_branches = list(kept)
            if self.out_branches and self.verbose:
                print("Select branches ...")
                for name in sorted(self.out_branches):
                    print(name)

        # declare histograms for original and normalized values
        if hg and self.hists:
            for key in self.attrs:
                low = self.minv[key]
                high = self.maxv[key]
                self.hdict['%s_orig' % key] = \
                        hg.Bin(num=100, low=low, high=high, quantity=lambda x: x, value=hg.Count())
                self.hdict['%s_norm' % key] = \
                        hg.Bin(num=100, low=0, high=1, quantity=lambda x: x, value=hg.Count())
Ejemplo n.º 15
0
    def __init__(self, fin, branch='Events', selected_branches=None,
                 exclude_branches=None, identifier=None, label=None,
                 chunk_size=1000, nevts=-1, specs=None, nan=np.nan, histograms=False,
                 redirector='root://cms-xrd-global.cern.ch', verbose=0):
        """Open the input file, choose the output branches and run ``self.init()``.

        :param fin: input file; resolved through ``xfile(fin, redirector)``
            and opened with ``uproot.open``
        :param str branch: key of the tree to read from the opened file
        :param list selected_branches: branch names; when given, their encoded
            form becomes ``self.out_branches`` (replacing any result of
            ``exclude_branches``)
        :param list exclude_branches: branch names; when given,
            ``self.out_branches`` is set to the tree keys not in their
            encoded form
        :param list identifier: stored as ``self.identifier``; defaults to
            ``['run', 'event', 'luminosityBlock']``
        :param label: stored as ``self.label``
        :param int chunk_size: stored as ``self.chunk_size``, capped at the
            number of entries of the tree
        :param int nevts: stored as ``self.nevts``; ``-1`` selects the number
            of entries of the tree
        :param specs: when set, passed to ``self.load_specs``; otherwise the
            spec containers are initialised empty
        :param nan: converted with ``float`` and stored as ``self.nan``
        :param bool histograms: when true (and ``hg`` is truthy), two 100-bin
            histograms are declared per attribute in ``self.hdict``
        :param str redirector: passed to ``xfile`` together with ``fin``
        :param int verbose: when true, progress messages are printed
        """
        self.type = self.__class__.__name__
        self.fin = xfile(fin, redirector)
        self.verbose = verbose
        if self.verbose:
            print("Reading {}".format(self.fin))

        # open the file and locate the tree
        self.istream = uproot.open(self.fin)
        self.branches = {}
        self.gen = None
        self.out_branches = []
        self.identifier = identifier or ['run', 'event', 'luminosityBlock']
        self.tree = self.istream[branch]
        self.nrows = self.tree.numentries

        # number of events to read; -1 stands for the whole tree
        if nevts != -1:
            self.nevts = nevts
        else:
            self.nevts = self.nrows
        self.label = label
        self.idx = -1
        self.chunk_idx = 0

        # never use chunks larger than the tree itself
        if chunk_size < self.nrows:
            self.chunk_size = chunk_size
        else:
            self.chunk_size = self.nrows
        self.nan = float(nan)

        # bookkeeping containers
        self.attrs = []
        self.shape = None
        self.cache = {}
        self.hdict = {}
        self.hists = histograms
        self.idx_label = 0
        self.flat_keys_encoded = []
        self.jagged_keys_encoded = []
        self.keys = []
        self.min_list = []
        self.max_list = []
        self.jdimension = []
        self.dimension_list = []
        self.time_reading = []
        self.time_reading_and_specs = []

        if specs:
            self.load_specs(specs)
        else:
            self.jdim = {}
            self.minv = {}
            self.maxv = {}
            self.jkeys = []
            self.fkeys = []
            self.nans = {}

        started = time.time()

        if exclude_branches:
            print(f"Excluded branches: {exclude_branches}")
            tree_keys = self.tree.keys()
            # the names are encoded before being compared with the tree keys
            exclude_branches = [name.encode() for name in exclude_branches]
            self.out_branches = [
                key for key in tree_keys if key not in exclude_branches
            ]
        if selected_branches:
            # an explicit selection replaces whatever the exclusion produced
            print(f"Selected branches: {selected_branches}")
            selected_branches = [name.encode() for name in selected_branches]
            self.out_branches = list(selected_branches)

        # perform initialization
        self.init()
        if self.verbose:
            elapsed = time.time() - started
            print("{} init is complete in {} sec".format(self, elapsed))

        # declare histograms for original and normalized values
        if hg and self.hists:
            for key in self.attrs:
                low, high = self.minv[key], self.maxv[key]
                self.hdict['%s_orig' % key] = hg.Bin(
                    num=100, low=low, high=high,
                    quantity=lambda x: x, value=hg.Count())
                self.hdict['%s_norm' % key] = hg.Bin(
                    num=100, low=0, high=1,
                    quantity=lambda x: x, value=hg.Count())
Ejemplo n.º 16
0
        zip(output_fields,
            dimuonCandidate_aux(pt, eta, phi, mass, charge, mediumid)))


try:
    import matplotlib
    matplotlib.use('agg')
    import matplotlib.pyplot as plt
    import histogrammar as hg
    import json

    import logging
    mpl_logger = logging.getLogger('matplotlib')
    mpl_logger.setLevel(logging.WARNING)

    invmass = hg.Bin(80, 70, 110, quantity=lambda x: x, value=hg.Count())

    def histogram_fill_from_file(filename):
        """Fill the ``invmass`` histogram from a file of JSON records.

        Every line of ``filename`` is parsed as a JSON document and its
        ``'mass'`` entry is passed to ``invmass.fill``.

        :param filename: path of the file to read
        """
        with open(filename, 'r') as stream:
            # parse and fill line by line, in file order
            for mass in (json.loads(line)['mass'] for line in stream):
                invmass.fill(mass)

    def histogram(output='output.png'):
        """Plot the ``invmass`` histogram and print it as JSON.

        The plot is saved with ``plt.savefig`` and the JSON representation of
        the histogram (``invmass.toJson()``) is printed to standard output.

        :param output: file name the plot is saved to
        """
        #ax = invmass.plot.matplotlib(name="", color="green", edgecolor="white", lw=5)
        ax = invmass.plot.matplotlib(name="")
        # raw string: '\m' is not a valid escape sequence in a normal string
        ax.set_xlabel(r'Dimuon invariant mass m($\mu\mu$) (GeV)')
        ax.set_ylabel('Events / 0.5 GeV')
        plt.savefig(output)
        print(json.dumps(invmass.toJson()))
        #plt.show()