def test_bin_width(self):
        """Test getting the bin width of Bin and SparselyBin histograms.

        The width is asserted both on histograms that were filled (hist2,
        hist4) and on histograms that were left empty (hist3, hist5).
        """
        with Pandas() as pd:
            if pd is None:
                return
            with Numpy() as np:
                # Guard on the name actually bound by the context manager; the
                # previous check tested ``numpy``, which this test never binds.
                if np is None:
                    return
                sys.stderr.write("\n")

                df1 = pd.DataFrame({'A': [0, 1, 2, 3, 4, 3, 2, 1, 1, 1]})

                # building test histograms
                hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist4 = hg.Bin(num=20, low=0.0, high=10., quantity=unit('A'))
                hist5 = hg.Bin(num=20, low=0.0, high=10., quantity=unit('A'))

                # fill only one histogram of each type
                hist2.fill.numpy(df1)
                hist4.fill.numpy(df1)

                assert hist2.bin_width() == 1.0
                assert hist3.bin_width() == 1.0
                assert hist4.bin_width() == 0.5
                assert hist5.bin_width() == 0.5
    def test_bin_edges(self):
        """Test getting the bin edges, by default and for requested ranges."""
        with Pandas() as pd:
            if pd is None:
                return
            with Numpy() as np:
                # Guard on the name actually bound by the context manager; the
                # previous check tested ``numpy``, which this test never binds.
                if np is None:
                    return
                sys.stderr.write("\n")

                df1 = pd.DataFrame({'A': [0, 1, 2, 3, 4, 3, 2, 1, 1, 1]})
                df2 = pd.DataFrame({'A': [2, 3, 4, 5, 7, 4, 6, 5, 7, 8]})

                # building test histograms
                hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist4 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))
                hist5 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))

                # fill them
                hist2.fill.numpy(df1)
                hist3.fill.numpy(df2)
                hist4.fill.numpy(df1)
                hist5.fill.numpy(df2)

                # NOTE(review): Numpy() presumably already yields the numpy
                # module; the explicit import is kept from the original.
                import numpy as np

                # default edges: sparse histograms span only their filled bins,
                # fixed-range histograms always report their full range
                np.testing.assert_array_equal(hist2.bin_edges(), [0., 1., 2., 3., 4., 5.])
                np.testing.assert_array_equal(hist3.bin_edges(), [2., 3., 4., 5., 6., 7., 8., 9.])
                np.testing.assert_array_equal(hist4.bin_edges(), [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])
                np.testing.assert_array_equal(hist5.bin_edges(), [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])

                # edges for an explicitly requested [low, high] range
                np.testing.assert_array_equal(hist2.bin_edges(low=2.1, high=11.9), [2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.])
                np.testing.assert_array_equal(hist3.bin_edges(low=1.1, high=6), [1., 2., 3., 4., 5., 6.])
                np.testing.assert_array_equal(hist4.bin_edges(low=2.1, high=11.9), [2., 3., 4., 5., 6., 7., 8., 9., 10.])
                np.testing.assert_array_equal(hist5.bin_edges(low=1.1, high=5.4), [1., 2., 3., 4., 5., 6.])
Exemple #3
0
def test_assert_similar_hists():
    """Test assert on similarity of list of histograms

    Check similarity of: type, n-dim, sub-hists, specific type attributes
    """
    # Dummy dataset with mixed types. It is built explicitly because
    # pd.util.testing.makeMixedDataFrame() was deprecated in pandas 1.0 and
    # removed in pandas 2.0; the content below is what that helper produced.
    df = pd.DataFrame(
        {
            "A": [0.0, 1.0, 2.0, 3.0, 4.0],
            "B": [0.0, 1.0, 0.0, 1.0, 0.0],
            "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
            "D": pd.bdate_range("1/1/2009", periods=5),
        }
    )
    # convert timestamp (col D) to nanosec since 1970-1-1
    df["date"] = df["D"].apply(to_ns)

    # building 1d-, 2d-, and 3d-histogram (iteratively)
    hist0 = hg.Bin(5, 0, 5, unit("A"))
    hist1 = hg.Categorize(unit("C"))
    hist2 = hg.Bin(5, 0, 5, unit("A"), value=hist1)
    hist3 = hg.Categorize(unit("C"), value=hist0)

    hist4 = hg.SparselyBin(
        origin=pd.Timestamp("2009-01-01").value,
        binWidth=pd.Timedelta(days=1).value,
        quantity=unit("date"),
        value=hist2,
    )
    hist5 = hg.SparselyBin(
        origin=pd.Timestamp("2009-01-01").value,
        binWidth=pd.Timedelta(days=1).value,
        quantity=unit("date"),
        value=hist3,
    )
    hists = [hist0, hist1, hist2, hist3, hist4, hist5]

    # fill them
    for hist in hists:
        hist.fill.numpy(df)

    # every histogram is similar to itself
    for hist in hists:
        assert check_similar_hists([hist, hist])

    # each pair differs in structure, so the assert helper must raise
    expected_message = "Input histograms are not all similar."
    for pair in ([hist0, hist1], [hist2, hist3], [hist4, hist5]):
        message = ""  # stays empty (and fails below) when nothing is raised
        try:
            assert_similar_hists(pair)
        except ValueError as e:
            message = e.args[0]
        assert message == expected_message
    def test_bin_entries(self):
        """Test getting the number of entries per bin.

        Entries are requested for the assigned bins, for explicit labels,
        for explicit x-values and for a [low, high] range.
        """
        with Pandas() as pd:
            if pd is None:
                return
            with Numpy() as np:
                # Guard on the name actually bound by the context manager; the
                # previous check tested ``numpy``, which this test never binds.
                if np is None:
                    return
                sys.stderr.write("\n")

                df1 = pd.DataFrame(
                    {'A': [0, 1, 2, 3, 4, 3, 2, 1, 1, 1], 'C': ['f1', 'f3', 'f4', 'f3', 'f4', 'f2', 'f2', 'f1', 'f3', 'f4']})
                df2 = pd.DataFrame(
                    {'A': [2, 3, 4, 5, 7, 4, 6, 5, 7, 8], 'C': ['f7', 'f3', 'f5', 'f8', 'f9', 'f2', 'f3', 'f6', 'f7', 'f7']})

                # building 1d histograms of each supported type
                hist0 = hg.Categorize(unit('C'))
                hist1 = hg.Categorize(unit('C'))
                hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist4 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))
                hist5 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))

                # fill them
                hist0.fill.numpy(df1)
                hist1.fill.numpy(df2)
                hist2.fill.numpy(df1)
                hist3.fill.numpy(df2)
                hist4.fill.numpy(df1)
                hist5.fill.numpy(df2)

                labels0 = hist0.bin_labels()
                labels1 = hist1.bin_labels()
                centers2 = hist2.bin_centers()
                centers3 = hist3.bin_centers()
                centers = hist4.bin_centers()

                # NOTE(review): Numpy() presumably already yields the numpy
                # module; the explicit import is kept from the original.
                import numpy as np

                # categorical histograms: own labels, then the other one's labels
                np.testing.assert_array_equal(hist0.bin_entries(), [2., 2., 3., 3.])
                np.testing.assert_array_equal(hist1.bin_entries(), [1., 2., 1., 1., 3., 1., 1.])
                np.testing.assert_array_equal(hist0.bin_entries(labels=labels1), [2., 3., 0., 0., 0., 0., 0.])
                np.testing.assert_array_equal(hist1.bin_entries(labels=labels0), [0., 1., 2., 0.])

                # binned histograms: assigned bins
                np.testing.assert_array_equal(hist2.bin_entries(), [1., 4., 2., 2., 1.])
                np.testing.assert_array_equal(hist3.bin_entries(), [1., 1., 2., 2., 1., 2., 1.])
                np.testing.assert_array_equal(hist4.bin_entries(), [1., 4., 2., 2., 1., 0., 0., 0., 0., 0.])
                np.testing.assert_array_equal(hist5.bin_entries(), [0., 0., 1., 1., 2., 2., 1., 2., 1., 0.])

                # sparse histograms evaluated at explicit x-values
                np.testing.assert_array_equal(hist2.bin_entries(xvalues=centers3), [2., 2., 1., 0., 0., 0., 0.])
                np.testing.assert_array_equal(hist3.bin_entries(xvalues=centers2), [0., 0., 1., 1., 2.])
                np.testing.assert_array_equal(hist2.bin_entries(xvalues=centers), [1., 4., 2., 2., 1., 0., 0., 0., 0., 0.])
                np.testing.assert_array_equal(hist3.bin_entries(xvalues=centers), [0., 0., 1., 1., 2., 2., 1., 2., 1., 0.])

                # entries within a requested [low, high] range
                np.testing.assert_array_equal(hist2.bin_entries(low=2.1, high=11.9), [2., 2., 1., 0., 0., 0., 0., 0., 0., 0.])
                np.testing.assert_array_equal(hist3.bin_entries(low=1.1, high=5.4), [0., 1., 1., 2., 2.])
                np.testing.assert_array_equal(hist4.bin_entries(low=2.1, high=11.9), [2., 2., 1., 0., 0., 0., 0., 0.])
                np.testing.assert_array_equal(hist5.bin_entries(low=1.1, high=5.4), [0., 1., 1., 2., 2.])
Exemple #5
0
    def get_hist_bin(self, hist, features, quant, col, dt):
        """Wrap ``hist`` in one more histogram dimension for column ``col``.

        :param hist: histogram used as ``value`` of the newly created histogram
        :param list features: histogram features; ``col`` must be one of them
        :param quant: quantity function handed to the new histogram
        :param col: name of the column the new dimension describes
        :param dt: data type of the column, decides the histogram type
        :return: the newly created histogram wrapping ``hist``
        :raises RuntimeError: if the bin specifications of a numeric or
            timestamp column hold neither a bin width nor num/low/high
        """
        numeric_like = np.issubdtype(dt, np.number) or np.issubdtype(dt, np.datetime64)

        # everything that is not a number or a timestamp becomes a category
        if not numeric_like:
            return hg.Categorize(quantity=quant, value=hist)

        # numbers and timestamps: the bin specifications pick the histogram type
        specs = self.var_bin_specs(features, features.index(col))

        if "bin_width" in specs:
            return hg.SparselyBin(
                binWidth=specs["bin_width"],
                origin=specs.get("bin_offset", 0),
                quantity=quant,
                value=hist,
            )

        if all(key in specs for key in ("num", "low", "high")):
            return hg.Bin(
                num=specs["num"],
                low=specs["low"],
                high=specs["high"],
                quantity=quant,
                value=hist,
            )

        raise RuntimeError(
            "Do not know how to interpret bin specifications.")
Exemple #6
0
def test_prepare_2dgrid():
    """Test preparation of grid for extraction of number of entries for 2d hists"""
    # only the dataframe of the shared fixture is used; the histograms are
    # rebuilt below
    df, _, _, _ = get_test_histograms1()

    # 1d categorical, 2d (A x C) and 3d (date x A x C) histograms
    cat_hist = hg.Categorize(unit("C"))
    bin_hist = hg.Bin(5, 0, 5, unit("A"), value=cat_hist)
    sparse_hist = hg.SparselyBin(
        origin=pd.Timestamp("2009-01-01").value,
        binWidth=pd.Timedelta(days=1).value,
        quantity=unit("date"),
        value=bin_hist,
    )
    for hist in (cat_hist, bin_hist, sparse_hist):
        hist.fill.numpy(df)

    grids = [prepare_2dgrid(hist) for hist in (cat_hist, bin_hist, sparse_hist)]
    expected = [
        ([], []),
        ([0, 1, 2, 3, 4], ["foo1", "foo2", "foo3", "foo4", "foo5"]),
        ([0, 1, 4, 5, 6], [0, 1, 2, 3, 4]),
    ]

    for (xkeys, ykeys), (xkeys_expected, ykeys_expected) in zip(grids, expected):
        np.testing.assert_array_equal(xkeys, xkeys_expected)
        np.testing.assert_array_equal(ykeys, ykeys_expected)
Exemple #7
0
def test_profile_hist1d():
    """Profile a list of uniformly filled 1d histograms and check the result."""
    hist_name = "histogram"
    n_bins, n_entries, n_splits = 1000, 10000, 10

    # fixed seed so the random fill is reproducible
    np.random.seed(0)

    split = []
    for _ in range(n_splits):
        filled = hg.Bin(n_bins, 0, 1, lambda x: x)
        filled.fill.numpy(np.random.uniform(0, 1, n_entries))
        record = {
            "date": pd.Timestamp("2019 - 1 - 1"),
            hist_name: HistogramContainer(filled),
        }
        split.append(record)

    profiler = HistProfiler(
        read_key="dummy_input",
        store_key="dummy_output",
        hist_col=hist_name,
        index_col="date",
    )
    profiles = profiler._profile_hist(split, hist_name="feature")

    # one profile per input record
    assert len(profiles) == n_splits
    assert "p95" in profiles[0]
    assert profiles[1]["max"] == np.max(
        split[1][hist_name].get_bin_centers()[0])
    assert len(profiles[0][hist_name].hist.bin_entries()) == n_bins
Exemple #8
0
    def construct_empty_hist(self, features):
        """Create an (empty) histogram of right type.

        The features are walked in reverse order; the histogram built for
        one column is passed as ``value`` to the histogram of the next
        column, which yields a multi-dimensional histogram.

        :param list features: histogram features
        :return: created histogram
        :raises RuntimeError: if the bin specifications of a numeric or
            timestamp column hold neither a bin width nor num/low/high
        """
        single_feature = len(features) == 1
        hist = hg.Count()

        for col in list(reversed(features)):
            # the data type decides both the quantity function and the
            # histogram type
            dt = self.var_dtype[col]
            f = utils.QUANTITY[dt]

            if single_feature:
                # a single column is processed directly
                quant = lambda x, fnc=f: fnc(x)  # noqa
            else:
                # several columns: pick column ``col``; bound as a default
                # argument so every lambda keeps its own column
                quant = lambda x, fnc=f, clm=col: fnc(x[clm])  # noqa

            numeric_like = np.issubdtype(dt, np.number) or np.issubdtype(dt, np.datetime64)
            if not numeric_like:
                # everything else is treated as categories
                hist = hg.Categorize(quantity=quant, value=hist)
                continue

            # numbers and timestamps: bin specifications pick the type
            specs = self.var_bin_specs(features, features.index(col))
            if "bin_width" in specs:
                hist = hg.SparselyBin(
                    binWidth=specs["bin_width"],
                    origin=specs.get("bin_offset", 0),
                    quantity=quant,
                    value=hist,
                )
            elif all(key in specs for key in ("num", "low", "high")):
                hist = hg.Bin(
                    num=specs["num"],
                    low=specs["low"],
                    high=specs["high"],
                    quantity=quant,
                    value=hist,
                )
            else:
                raise RuntimeError(
                    "Do not know how to interpret bin specifications.")

        return hist
Exemple #9
0
def get_test_histograms2():
    """Get set 2 of test histograms

    :return: tuple of the dummy dataframe and four filled histograms:
        Categorize(C), Bin(A) x Categorize(C), Bin(A), Categorize(C) x Bin(A)
    """
    # Dummy dataset with mixed types. It is built explicitly because
    # pd.util.testing.makeMixedDataFrame() was deprecated in pandas 1.0 and
    # removed in pandas 2.0; the content below is what that helper produced.
    df = pd.DataFrame(
        {
            "A": [0.0, 1.0, 2.0, 3.0, 4.0],
            "B": [0.0, 1.0, 0.0, 1.0, 0.0],
            "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
            "D": pd.bdate_range("1/1/2009", periods=5),
        }
    )

    # building 1d-, 2d-histogram (iteratively)
    hist1 = hg.Categorize(unit("C"))
    hist2 = hg.Bin(5, 0, 5, unit("A"), value=hist1)
    hist3 = hg.Bin(5, 0, 5, unit("A"))
    hist4 = hg.Categorize(unit("C"), value=hist3)

    # fill them
    for hist in (hist1, hist2, hist3, hist4):
        hist.fill.numpy(df)

    return df, hist1, hist2, hist3, hist4
Exemple #10
0
    def test_bin_centers(self):
        """Test getting assigned bin-centers for Bin and SparselyBin histograms.

        Centers are requested both for the assigned bins and for an explicit
        [low, high] range.
        """
        with Pandas() as pd:
            if pd is None:
                return
            with Numpy() as np:
                # Guard on the name actually bound by the context manager; the
                # previous check tested ``numpy``, which this test never binds.
                if np is None:
                    return
                sys.stderr.write("\n")

                df1 = pd.DataFrame(
                    {'A': [0, 1, 2, 3, 4, 3, 2, 1, 1, 1]})
                df2 = pd.DataFrame(
                    {'A': [2, 3, 4, 5, 7, 4, 6, 5, 7, 8]})

                # histograms
                hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist4 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))
                hist5 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))

                # fill them
                hist2.fill.numpy(df1)
                hist3.fill.numpy(df2)
                hist4.fill.numpy(df1)
                hist5.fill.numpy(df2)

                # NOTE(review): Numpy() presumably already yields the numpy
                # module; the explicit import is kept from the original.
                import numpy as np

                # sparse histograms
                np.testing.assert_array_equal(hist2.bin_centers(), [0.5, 1.5, 2.5, 3.5, 4.5])
                np.testing.assert_array_equal(hist3.bin_centers(), [2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5])
                np.testing.assert_array_equal(hist2.bin_centers(low=5, high=15),
                                              [5.5, 6.5, 7.5, 8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5])
                np.testing.assert_array_equal(hist3.bin_centers(low=2.1, high=5.6), [2.5, 3.5, 4.5, 5.5])

                # fixed-range histograms
                np.testing.assert_array_equal(hist4.bin_centers(), [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5])
                np.testing.assert_array_equal(hist5.bin_centers(), [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5])
                np.testing.assert_array_equal(hist4.bin_centers(low=5, high=15), [5.5, 6.5, 7.5, 8.5, 9.5])
                np.testing.assert_array_equal(hist5.bin_centers(low=2.1, high=9.1), [
                                              2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5])
Exemple #11
0
    def test_num_bins(self):
        """Test getting the number of bins from lowest to highest bin.

        The count is requested for the assigned bins and for two explicit
        [low, high] ranges.
        """
        with Pandas() as pd:
            if pd is None:
                return
            with Numpy() as np:
                # Guard on the name actually bound by the context manager; the
                # previous check tested ``numpy``, which this test never binds.
                if np is None:
                    return
                sys.stderr.write("\n")

                df1 = pd.DataFrame({'A': [0, 2, 4, 5, 7, 9, 11, 13, 13, 15]})
                df2 = pd.DataFrame({'A': [2, 4, 4, 6, 8, 7, 10, 14, 17, 19]})

                # building 1d histograms of both binned types
                hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
                hist4 = hg.Bin(num=20, low=0.0, high=20., quantity=unit('A'))
                hist5 = hg.Bin(num=20, low=0.0, high=20., quantity=unit('A'))

                # fill them
                hist2.fill.numpy(df1)
                hist3.fill.numpy(df2)
                hist4.fill.numpy(df1)
                hist5.fill.numpy(df2)

                # assigned bins
                assert hist2.num_bins() == 16
                assert hist3.num_bins() == 18
                assert hist4.num_bins() == 20
                assert hist5.num_bins() == 20

                # requested range reaching beyond the upper edge
                assert hist2.num_bins(low=10, high=25) == 15
                assert hist3.num_bins(low=10, high=25) == 15
                assert hist4.num_bins(low=10, high=25) == 10
                assert hist5.num_bins(low=10, high=25) == 10

                # requested range reaching beyond both edges
                assert hist2.num_bins(low=-10, high=28) == 38
                assert hist3.num_bins(low=-10, high=28) == 38
                assert hist4.num_bins(low=-10, high=28) == 20
                assert hist5.num_bins(low=-10, high=28) == 20
Exemple #12
0
def project_on_x(hist):
    """Project n-dim histogram onto x-axis

    The input is returned unchanged when it reports ``n_dim <= 1``, when it
    has neither ``bins`` nor ``values``, or when that container is empty.

    :param hist: input histogrammar histogram
    :return: on x-axis projected histogram (1d)
    :raises TypeError: if the histogram is not a Bin, SparselyBin or Categorize
    """
    # nothing to project for a histogram that is already (at most) 1d
    if hasattr(hist, "n_dim") and hist.n_dim <= 1:
        return hist

    # locate the container with the sub-histograms; ``bins`` takes precedence
    keyed = hasattr(hist, "bins")
    if keyed:
        contents = hist.bins
    elif hasattr(hist, "values"):
        contents = hist.values
    else:
        return hist
    if len(contents) == 0:
        return hist

    # make empty clone
    # note: cannot do: h_x = hist.zero(), b/c it copies n-dim structure, which screws up hist.toJsonString()
    if isinstance(hist, histogrammar.Bin):
        h_x = histogrammar.Bin(
            num=hist.num,
            low=hist.low,
            high=hist.high,
            quantity=hist.quantity,
        )
    elif isinstance(hist, histogrammar.SparselyBin):
        h_x = histogrammar.SparselyBin(
            binWidth=hist.binWidth,
            origin=hist.origin,
            quantity=hist.quantity,
        )
    elif isinstance(hist, histogrammar.Categorize):
        h_x = histogrammar.Categorize(quantity=hist.quantity)
    else:
        raise TypeError("Unknown histogram type. cannot get zero copy.")

    # replace every sub-histogram by a plain count of its summed entries
    if keyed:
        for key, sub_hist in hist.bins.items():
            h_x.bins[key] = histogrammar.Count.ed(sum_entries(sub_hist))
    else:
        for idx, sub_hist in enumerate(hist.values):
            h_x.values[idx] = histogrammar.Count.ed(sum_entries(sub_hist))

    return h_x
Exemple #13
0
def get_test_histograms1():
    """Get set 1 of test histograms

    :return: tuple of the dummy dataframe and three filled histograms:
        Categorize(C), Bin(A) x Categorize(C) and
        SparselyBin(date) x Bin(A) x Categorize(C)
    """
    # Dummy dataset with mixed types. It is built explicitly because
    # pd.util.testing.makeMixedDataFrame() was deprecated in pandas 1.0 and
    # removed in pandas 2.0; the content below is what that helper produced.
    df = pd.DataFrame(
        {
            "A": [0.0, 1.0, 2.0, 3.0, 4.0],
            "B": [0.0, 1.0, 0.0, 1.0, 0.0],
            "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
            "D": pd.bdate_range("1/1/2009", periods=5),
        }
    )
    # convert timestamp (col D) to nanosec since 1970-1-1
    df["date"] = df["D"].apply(to_ns)
    df["boolT"] = True
    df["boolF"] = False

    # building 1d-, 2d-, and 3d-histogram (iteratively)
    hist1 = hg.Categorize(unit("C"))
    hist2 = hg.Bin(5, 0, 5, unit("A"), value=hist1)
    hist3 = hg.SparselyBin(
        origin=pd.Timestamp("2009-01-01").value,
        binWidth=pd.Timedelta(days=1).value,
        quantity=unit("date"),
        value=hist2,
    )
    # fill them
    for hist in (hist1, hist2, hist3):
        hist.fill.numpy(df)

    return df, hist1, hist2, hist3
def get_test_histograms1():
    """ Get set 1 of test histograms

    :return: tuple of the dummy dataframe and three filled histograms:
        Categorize(C), Bin(A) x Categorize(C) and
        SparselyBin(date) x Bin(A) x Categorize(C)
    """
    import pandas as pd
    import histogrammar as hg

    # Dummy dataset with mixed types. It is built explicitly because
    # pd.util.testing.makeMixedDataFrame() was deprecated in pandas 1.0 and
    # removed in pandas 2.0; the content below is what that helper produced.
    df = pd.DataFrame({
        'A': [0.0, 1.0, 2.0, 3.0, 4.0],
        'B': [0.0, 1.0, 0.0, 1.0, 0.0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': pd.bdate_range('1/1/2009', periods=5),
    })
    # convert timestamp (col D) to nanosec since 1970-1-1
    df['date'] = df['D'].apply(to_ns)
    df['boolT'] = True
    df['boolF'] = False

    # building 1d-, 2d-, and 3d-histogram (iteratively)
    hist1 = hg.Categorize(unit('C'))
    hist2 = hg.Bin(5, 0, 5, unit('A'), value=hist1)
    hist3 = hg.SparselyBin(origin=pd.Timestamp('2009-01-01').value, binWidth=pd.Timedelta(days=1).value,
                           quantity=unit('date'), value=hist2)
    # fill them
    for hist in (hist1, hist2, hist3):
        hist.fill.numpy(df)

    return df, hist1, hist2, hist3
Exemple #15
0
    def __init__(self, fin, branch='Events', selected_branches=None, \
            exclude_branches=None, identifier=None, label=None, \
            chunk_size=1000, nevts=-1, specs=None, nan=np.nan, histograms=False, \
            redirector='root://cms-xrd-global.cern.ch', verbose=0):
        """Open the input with uproot and set up the reader state.

        :param fin: input file reference; resolved through ``xfile(fin, redirector)``
            and then opened with ``uproot.open``
        :param branch: key used to fetch the tree from the opened file
        :param selected_branches: optional list of branch names; when given they
            are encoded and stored as ``out_branches`` (this overrides the
            result of ``exclude_branches``)
        :param exclude_branches: optional list of branch names; ``out_branches``
            becomes every key of the tree that is not in this list
        :param identifier: identifier names; defaults to
            ``['run', 'event', 'luminosityBlock']`` when falsy
        :param label: stored as ``self.label``
        :param chunk_size: chunk size, capped at the number of rows of the tree
        :param nevts: number of events; ``-1`` means all rows of the tree
        :param specs: when truthy it is handed to ``self.load_specs``; otherwise
            the spec containers are initialized empty
        :param nan: value stored as ``self.nan`` after conversion with ``float``
        :param histograms: when truthy (and ``hg`` is truthy) two histograms are
            declared for every entry of ``self.attrs``
        :param redirector: redirector handed to ``xfile``
        :param verbose: when truthy, progress messages are printed
        """
        self.type = self.__class__.__name__
        self.fin = xfile(fin, redirector)
        self.verbose = verbose
        if self.verbose:
            print("Reading {}".format(self.fin))
        self.istream = uproot.open(self.fin)
        self.branches = {}
        self.gen = None
        self.out_branches = []
        self.identifier = identifier if identifier else [
            'run', 'event', 'luminosityBlock'
        ]
        self.tree = self.istream[branch]
        self.nrows = self.tree.numentries
        # -1 is the sentinel for "use every row of the tree"
        self.nevts = nevts if nevts != -1 else self.nrows
        self.label = label
        self.idx = -1
        self.chunk_idx = 0
        # a chunk can never be larger than the tree itself
        self.chunk_size = chunk_size if chunk_size < self.nrows else self.nrows
        self.nan = float(nan)
        self.attrs = []
        self.shape = None
        self.cache = {}
        self.hdict = {}
        self.hists = histograms
        self.idx_label = 0
        self.flat_keys_encoded = []
        self.jagged_keys_encoded = []
        self.keys = []
        self.min_list = []
        self.max_list = []
        self.jdimension = []
        self.dimension_list = []
        self.time_reading = []
        self.time_reading_and_specs = []
        if specs:
            self.load_specs(specs)
        else:
            # no specs given: start from empty spec containers
            self.jdim = {}
            self.minv = {}
            self.maxv = {}
            self.jkeys = []
            self.fkeys = []
            self.nans = {}

        time0 = time.time()
        if exclude_branches:
            print(f"Excluded branches: {exclude_branches}")
            all_branches = self.tree.keys()
            # names are encoded before being compared with the tree keys
            # NOTE(review): presumably the tree keys are bytes here -- confirm
            exclude_branches = [elem.encode() for elem in exclude_branches]
            self.out_branches = [
                elem for elem in all_branches if (elem not in exclude_branches)
            ]
        if selected_branches:
            # an explicit selection replaces whatever the exclusion produced
            print(f"Selected branches: {selected_branches}")
            selected_branches = [elem.encode() for elem in selected_branches]
            self.out_branches = [elem for elem in selected_branches]

        # perform initialization
        self.init()
        if self.verbose:
            print("{} init is complete in {} sec".format(
                self,
                time.time() - time0))

        # declare histograms for original and normalized values
        if hg and self.hists:
            for key in self.attrs:
                low = self.minv[key]
                high = self.maxv[key]
                # original values span [low, high], normalized values [0, 1]
                self.hdict['%s_orig' % key] = \
                        hg.Bin(num=100, low=low, high=high, quantity=lambda x: x, value=hg.Count())
                self.hdict['%s_norm' % key] = \
                        hg.Bin(num=100, low=0, high=1, quantity=lambda x: x, value=hg.Count())
Exemple #16
0
        zip(output_fields,
            dimuonCandidate_aux(pt, eta, phi, mass, charge, mediumid)))


try:
    import matplotlib
    matplotlib.use('agg')
    import matplotlib.pyplot as plt
    import histogrammar as hg
    import json

    import logging
    mpl_logger = logging.getLogger('matplotlib')
    mpl_logger.setLevel(logging.WARNING)

    invmass = hg.Bin(80, 70, 110, quantity=lambda x: x, value=hg.Count())

    def histogram_fill_from_file(filename):
        """Fill ``invmass`` with the 'mass' field of every JSON line of ``filename``."""
        with open(filename, 'r') as f_in:
            # one JSON document per line, decoded lazily
            records = (json.loads(line) for line in f_in)
            for record in records:
                invmass.fill(record['mass'])

    def histogram(output='output.png'):
        """Plot ``invmass``, save the figure and print the histogram as JSON.

        :param output: path the figure is saved to via ``plt.savefig``
        """
        ax = invmass.plot.matplotlib(name="")
        # Raw string: a backslash followed by "m" is not a valid escape
        # sequence in a normal string literal and triggers a warning on
        # modern Python; the resulting text is unchanged.
        ax.set_xlabel(r'Dimuon invariant mass m($\mu\mu$) (GeV)')
        ax.set_ylabel('Events / 0.5 GeV')
        plt.savefig(output)
        print(json.dumps(invmass.toJson()))
Exemple #17
0
#!/usr/bin/env python

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as pyplot

#First histogram tutorial in histogrammar package
import histogrammar as hg

# generate a stream of uniform random numbers
import random
# range() exists on both Python 2 and 3; xrange() is Python 2 only
data = [random.random() for _ in range(2000)]

# aggregation structure and fill rule
histogram = hg.Bin(num=20,
                   low=0,
                   high=1,
                   quantity=lambda x: x,
                   value=hg.Count())

# fill the histogram!
for d in data:
    histogram.fill(d)

# quick plotting convenience method using matplotlib (if the user has this installed)
# Prefer the histogram.plot.matplotlib interface and fall back to the legacy
# histogram.matplotlib method when the "plot" attribute is not available.
plot_api = getattr(histogram, "plot", None)
if plot_api is not None:
    ax = plot_api.matplotlib(name="hello world!")
else:
    ax = histogram.matplotlib(name="hello world!")

pyplot.savefig('histogrammar.png')
Exemple #18
0
    def __init__(self, fin, branch='Events', selected_branches=None,
            exclude_branches=None, identifier=None,
            chunk_size=1000, nevts=-1, specs=None, nan=np.nan, histograms=False,
            redirector='root://cms-xrd-global.cern.ch', verbose=0):
        """Open the input with uproot and set up the reader state.

        :param fin: input file reference; resolved through ``xfile(fin, redirector)``
            and then opened with ``uproot.open``
        :param branch: key used to fetch the tree from the opened file
        :param selected_branches: optional list of names; only attributes
            matching one of them are kept in ``out_branches``
        :param exclude_branches: optional list of names; attributes matching
            one of them are left out of ``out_branches`` (applied after, and
            overriding, ``selected_branches``)
        :param identifier: identifier names; defaults to
            ``['run', 'event', 'luminosityBlock']`` when falsy
        :param chunk_size: chunk size, capped at the number of rows of the tree
        :param nevts: number of events, stored as given
        :param specs: when truthy it is handed to ``self.load_specs``; otherwise
            the spec containers are initialized empty
        :param nan: value stored as ``self.nan`` after conversion with ``float``
        :param histograms: when truthy (and ``hg`` is truthy) two histograms are
            declared for every entry of ``self.attrs``
        :param redirector: redirector handed to ``xfile``
        :param verbose: when truthy, progress messages are printed

        A name containing ``*`` is a wildcard: it matches every attribute that
        starts with the name once the ``*`` characters are removed. Any other
        name must match the attribute exactly.
        """
        self.type = self.__class__.__name__
        self.fin = xfile(fin, redirector)
        self.verbose = verbose
        if self.verbose:
            print("Reading {}".format(self.fin))
        self.istream = uproot.open(self.fin)
        self.branches = {}
        self.gen = None
        self.out_branches = []
        self.identifier = identifier if identifier else ['run', 'event', 'luminosityBlock']
        self.tree = self.istream[branch]
        self.nrows = self.tree.numentries
        self.nevts = nevts
        self.idx = -1
        self.chunk_idx = 0
        # a chunk can never be larger than the tree itself
        self.chunk_size = chunk_size if chunk_size < self.nrows else self.nrows
        self.nan = float(nan)
        self.attrs = []
        self.shape = None
        self.cache = {}
        self.hdict = {}
        self.hists = histograms
        if specs:
            self.load_specs(specs)
        else:
            # no specs given: start from empty spec containers
            self.jdim = {}
            self.minv = {}
            self.maxv = {}
            self.jkeys = []
            self.fkeys = []
            self.nans = {}

        # perform initialization
        time0 = time.time()
        self.init()
        if self.verbose:
            print("{} init is complete in {} sec".format(self, time.time()-time0))

        def matches(attr, name):
            """Tell whether attribute ``attr`` is addressed by ``name``."""
            if name.find('*') != -1:
                # Wildcard: compare against the text without the '*'. The
                # previous code compared against the raw pattern, so a
                # pattern such as 'Jet*' could never match an attribute.
                return attr.startswith(name.replace('*', ''))
            return attr == name

        if selected_branches:
            self.out_branches = [
                attr for attr in self.attrs
                if any(matches(attr, name) for name in selected_branches)
            ]
            if self.out_branches:
                if self.verbose:
                    print("Select branches ...")
                    for name in sorted(self.out_branches):
                        print(name)
        if exclude_branches:
            # collected through a set, as before, so each attribute appears once
            self.out_branches = list({
                attr for attr in self.attrs
                if not any(matches(attr, name) for name in exclude_branches)
            })
            if self.out_branches:
                if self.verbose:
                    print("Select branches ...")
                    for name in sorted(self.out_branches):
                        print(name)

        # declare histograms for original and normalized values
        if hg and self.hists:
            for key in self.attrs:
                low = self.minv[key]
                high = self.maxv[key]
                # original values span [low, high], normalized values [0, 1]
                self.hdict['%s_orig' % key] = \
                        hg.Bin(num=100, low=low, high=high, quantity=lambda x: x, value=hg.Count())
                self.hdict['%s_norm' % key] = \
                        hg.Bin(num=100, low=0, high=1, quantity=lambda x: x, value=hg.Count())