def test_bin_width(self):
    """Test getting the bin width of Bin and SparselyBin histograms.

    Returns early (without asserting anything) when the ``Pandas()`` or
    ``Numpy()`` context manager yields ``None``.
    """
    with Pandas() as pd:
        if pd is None:
            return
    with Numpy() as np:
        # Guard on the name bound by this ``with`` statement; the previous
        # check looked at the unrelated name ``numpy``.
        if np is None:
            return
        sys.stderr.write("\n")

        df1 = pd.DataFrame({'A': [0, 1, 2, 3, 4, 3, 2, 1, 1, 1]})

        # building test histograms
        hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist4 = hg.Bin(num=20, low=0.0, high=10., quantity=unit('A'))
        hist5 = hg.Bin(num=20, low=0.0, high=10., quantity=unit('A'))

        # fill only hist2 and hist4; hist3 and hist5 stay empty, and the
        # assertions below expect the same width for filled and empty ones
        hist2.fill.numpy(df1)
        hist4.fill.numpy(df1)

        assert hist2.bin_width() == 1.0
        assert hist3.bin_width() == 1.0
        assert hist4.bin_width() == 0.5
        assert hist5.bin_width() == 0.5
def test_most_probable_value(self):
    """Test getting the most probable value or label from a histogram.

    Returns early (without asserting anything) when the ``Pandas()`` or
    ``Numpy()`` context manager yields ``None``.
    """
    with Pandas() as pd:
        if pd is None:
            return
    with Numpy() as np:
        # Guard on the name bound by this ``with`` statement; the previous
        # check looked at the unrelated name ``numpy``.
        if np is None:
            return
        sys.stderr.write("\n")

        df1 = pd.DataFrame({
            'A': [0, 1, 2, 3, 4, 3, 2, 1, 1, 1],
            'C': ['f1', 'f3', 'f4', 'f3', 'f4', 'f2', 'f2', 'f1', 'f3', 'f4'],
        })
        df2 = pd.DataFrame({
            'A': [2, 3, 4, 5, 7, 4, 6, 5, 7, 8],
            'C': ['f7', 'f3', 'f5', 'f8', 'f9', 'f2', 'f3', 'f6', 'f7', 'f7'],
        })

        # building 1d histograms: two categorical, two sparsely binned
        hist0 = hg.Categorize(unit('C'))
        hist1 = hg.Categorize(unit('C'))
        hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))

        # fill them
        hist0.fill.numpy(df1)
        hist1.fill.numpy(df2)
        hist2.fill.numpy(df1)
        hist3.fill.numpy(df2)

        # categorical histograms report a label, binned ones a numeric value
        assert hist0.mpv == 'f3'
        assert hist1.mpv == 'f7'
        assert hist2.mpv == 1.5
        assert hist3.mpv == 4.5
def test_bin_edges(self):
    """Test getting the bin edges, with and without a requested range.

    Returns early (without asserting anything) when the ``Pandas()`` or
    ``Numpy()`` context manager yields ``None``.
    """
    with Pandas() as pd:
        if pd is None:
            return
    with Numpy() as np:
        # Guard on the name bound by this ``with`` statement; the previous
        # check looked at the unrelated name ``numpy``.
        if np is None:
            return
        sys.stderr.write("\n")

        df1 = pd.DataFrame({'A': [0, 1, 2, 3, 4, 3, 2, 1, 1, 1]})
        df2 = pd.DataFrame({'A': [2, 3, 4, 5, 7, 4, 6, 5, 7, 8]})

        # building test histograms
        hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist4 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))
        hist5 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))

        # fill them
        hist2.fill.numpy(df1)
        hist3.fill.numpy(df2)
        hist4.fill.numpy(df1)
        hist5.fill.numpy(df2)

        # the assertions use numpy's own testing helpers
        import numpy as np

        # default range
        np.testing.assert_array_equal(hist2.bin_edges(), [0., 1., 2., 3., 4., 5.])
        np.testing.assert_array_equal(hist3.bin_edges(), [2., 3., 4., 5., 6., 7., 8., 9.])
        np.testing.assert_array_equal(
            hist4.bin_edges(), [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])
        np.testing.assert_array_equal(
            hist5.bin_edges(), [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])

        # explicitly requested range
        np.testing.assert_array_equal(
            hist2.bin_edges(low=2.1, high=11.9),
            [2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12.])
        np.testing.assert_array_equal(
            hist3.bin_edges(low=1.1, high=6), [1., 2., 3., 4., 5., 6.])
        np.testing.assert_array_equal(
            hist4.bin_edges(low=2.1, high=11.9), [2., 3., 4., 5., 6., 7., 8., 9., 10.])
        np.testing.assert_array_equal(
            hist5.bin_edges(low=1.1, high=5.4), [1., 2., 3., 4., 5., 6.])
def test_assert_similar_hists():
    """Test the assertion on similarity of a list of histograms.

    Every histogram must be similar to itself, while each pair built with a
    swapped type/sub-histogram layout must make ``assert_similar_hists``
    raise a ``ValueError`` with the expected message.
    """
    # dummy dataset with mixed types;
    # timestamp column D is converted to nanoseconds since 1970-1-1
    frame = pd.util.testing.makeMixedDataFrame()
    frame["date"] = frame["D"].apply(to_ns)

    # building 1d-, 2d-, and 3d-histograms (iteratively)
    num_1d = hg.Bin(5, 0, 5, unit("A"))
    cat_1d = hg.Categorize(unit("C"))
    num_cat_2d = hg.Bin(5, 0, 5, unit("A"), value=cat_1d)
    cat_num_2d = hg.Categorize(unit("C"), value=num_1d)
    date_num_cat_3d = hg.SparselyBin(
        origin=pd.Timestamp("2009-01-01").value,
        binWidth=pd.Timedelta(days=1).value,
        quantity=unit("date"),
        value=num_cat_2d,
    )
    date_cat_num_3d = hg.SparselyBin(
        origin=pd.Timestamp("2009-01-01").value,
        binWidth=pd.Timedelta(days=1).value,
        quantity=unit("date"),
        value=cat_num_2d,
    )
    every_hist = [num_1d, cat_1d, num_cat_2d, cat_num_2d, date_num_cat_3d, date_cat_num_3d]

    # fill them
    for h in every_hist:
        h.fill.numpy(frame)

    # a histogram is always similar to itself
    for h in every_hist:
        assert check_similar_hists([h, h])

    def error_args(pair):
        """Return the args of the ValueError raised for ``pair``, or ``[""]`` if none."""
        try:
            assert_similar_hists(pair)
        except ValueError as err:
            return err.args
        return [""]

    mismatched_pairs = (
        [num_1d, cat_1d],
        [num_cat_2d, cat_num_2d],
        [date_num_cat_3d, date_cat_num_3d],
    )
    collected = [error_args(pair) for pair in mismatched_pairs]

    for args in collected:
        assert args[0] == "Input histograms are not all similar."
def test_bin_entries(self):
    """Test getting the number of entries per bin.

    Covers the default call as well as calls restricted by ``labels``,
    ``xvalues`` or a ``low``/``high`` range. Returns early (without asserting
    anything) when the ``Pandas()`` or ``Numpy()`` context manager yields
    ``None``.
    """
    with Pandas() as pd:
        if pd is None:
            return
    with Numpy() as np:
        # Guard on the name bound by this ``with`` statement; the previous
        # check looked at the unrelated name ``numpy``.
        if np is None:
            return
        sys.stderr.write("\n")

        df1 = pd.DataFrame({
            'A': [0, 1, 2, 3, 4, 3, 2, 1, 1, 1],
            'C': ['f1', 'f3', 'f4', 'f3', 'f4', 'f2', 'f2', 'f1', 'f3', 'f4'],
        })
        df2 = pd.DataFrame({
            'A': [2, 3, 4, 5, 7, 4, 6, 5, 7, 8],
            'C': ['f7', 'f3', 'f5', 'f8', 'f9', 'f2', 'f3', 'f6', 'f7', 'f7'],
        })

        # building 1d histograms: categorical, sparsely binned and fixed binned
        hist0 = hg.Categorize(unit('C'))
        hist1 = hg.Categorize(unit('C'))
        hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist4 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))
        hist5 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))

        # fill them
        hist0.fill.numpy(df1)
        hist1.fill.numpy(df2)
        hist2.fill.numpy(df1)
        hist3.fill.numpy(df2)
        hist4.fill.numpy(df1)
        hist5.fill.numpy(df2)

        # labels / centers of one histogram are used to query another one
        labels0 = hist0.bin_labels()
        labels1 = hist1.bin_labels()
        centers2 = hist2.bin_centers()
        centers3 = hist3.bin_centers()
        centers = hist4.bin_centers()

        # the assertions use numpy's own testing helpers
        import numpy as np

        # categorical histograms
        np.testing.assert_array_equal(hist0.bin_entries(), [2., 2., 3., 3.])
        np.testing.assert_array_equal(hist1.bin_entries(), [1., 2., 1., 1., 3., 1., 1.])
        np.testing.assert_array_equal(
            hist0.bin_entries(labels=labels1), [2., 3., 0., 0., 0., 0., 0.])
        np.testing.assert_array_equal(hist1.bin_entries(labels=labels0), [0., 1., 2., 0.])

        # binned histograms, default range
        np.testing.assert_array_equal(hist2.bin_entries(), [1., 4., 2., 2., 1.])
        np.testing.assert_array_equal(hist3.bin_entries(), [1., 1., 2., 2., 1., 2., 1.])
        np.testing.assert_array_equal(
            hist4.bin_entries(), [1., 4., 2., 2., 1., 0., 0., 0., 0., 0.])
        np.testing.assert_array_equal(
            hist5.bin_entries(), [0., 0., 1., 1., 2., 2., 1., 2., 1., 0.])

        # binned histograms, queried at explicit x-values
        np.testing.assert_array_equal(
            hist2.bin_entries(xvalues=centers3), [2., 2., 1., 0., 0., 0., 0.])
        np.testing.assert_array_equal(
            hist3.bin_entries(xvalues=centers2), [0., 0., 1., 1., 2.])
        np.testing.assert_array_equal(
            hist2.bin_entries(xvalues=centers), [1., 4., 2., 2., 1., 0., 0., 0., 0., 0.])
        np.testing.assert_array_equal(
            hist3.bin_entries(xvalues=centers), [0., 0., 1., 1., 2., 2., 1., 2., 1., 0.])

        # binned histograms, queried for a low/high range
        np.testing.assert_array_equal(
            hist2.bin_entries(low=2.1, high=11.9),
            [2., 2., 1., 0., 0., 0., 0., 0., 0., 0.])
        np.testing.assert_array_equal(
            hist3.bin_entries(low=1.1, high=5.4), [0., 1., 1., 2., 2.])
        np.testing.assert_array_equal(
            hist4.bin_entries(low=2.1, high=11.9), [2., 2., 1., 0., 0., 0., 0., 0.])
        np.testing.assert_array_equal(
            hist5.bin_entries(low=1.1, high=5.4), [0., 1., 1., 2., 2.])
def get_hist_bin(self, hist, features, quant, col, dt):
    """Wrap ``hist`` in one more histogram layer for column ``col``.

    The layer type is chosen from the data type ``dt`` and, for numbers and
    timestamps, from the bin specifications returned by ``self.var_bin_specs``.

    :param hist: histogram built so far; becomes the ``value`` of the new layer
    :param list features: histogram features; ``col`` must be one of them
    :param quant: quantity (filling function) of the new layer
    :param col: name of the column the new layer belongs to
    :param dt: data type of the column
    :return: the newly created histogram wrapping ``hist``
    :raises RuntimeError: if the bin specifications hold neither ``bin_width``
        nor all of ``num``, ``low`` and ``high``
    """
    numeric = np.issubdtype(dt, np.number)
    temporal = np.issubdtype(dt, np.datetime64)

    # strings and booleans are treated as categories
    if not (numeric or temporal):
        return hg.Categorize(quantity=quant, value=hist)

    # numbers and timestamps are put in a binned histogram
    specs = self.var_bin_specs(features, features.index(col))

    if "bin_width" in specs:
        # bin width (and optional offset) given: sparse binning
        return hg.SparselyBin(
            binWidth=specs["bin_width"],
            origin=specs.get("bin_offset", 0),
            quantity=quant,
            value=hist,
        )

    if "num" in specs and "low" in specs and "high" in specs:
        # fixed number of bins on a fixed range
        return hg.Bin(
            num=specs["num"],
            low=specs["low"],
            high=specs["high"],
            quantity=quant,
            value=hist,
        )

    raise RuntimeError("Do not know how to interpret bin specifications.")
def test_prepare_2dgrid():
    """Test preparation of the grid used to extract the number of entries of 2d hists."""
    # only the dataframe of the shared fixture is used here
    df, _, _, _ = get_test_histograms1()

    # building 1d-, 2d-, and 3d-histogram (iteratively)
    cat_1d = hg.Categorize(unit("C"))
    num_2d = hg.Bin(5, 0, 5, unit("A"), value=cat_1d)
    date_3d = hg.SparselyBin(
        origin=pd.Timestamp("2009-01-01").value,
        binWidth=pd.Timedelta(days=1).value,
        quantity=unit("date"),
        value=num_2d,
    )
    hists = (cat_1d, num_2d, date_3d)

    # fill them
    for h in hists:
        h.fill.numpy(df)

    grids = [prepare_2dgrid(h) for h in hists]

    # expected (x-keys, y-keys) per histogram; the 1d histogram has no grid
    expected = [
        ([], []),
        ([0, 1, 2, 3, 4], ["foo1", "foo2", "foo3", "foo4", "foo5"]),
        ([0, 1, 4, 5, 6], [0, 1, 2, 3, 4]),
    ]
    for grid, (want_x, want_y) in zip(grids, expected):
        xkeys, ykeys = grid
        np.testing.assert_array_equal(xkeys, want_x)
        np.testing.assert_array_equal(ykeys, want_y)
def construct_empty_hist(self, features):
    """Create an (empty) histogram of the right type.

    Starting from a ``hg.Count``, the features are walked in reverse order
    and every feature wraps the histogram built so far in a new layer, which
    yields a multi-dimensional histogram.

    :param list features: histogram features
    :return: created histogram
    :rtype: histogrammar.Count
    :raises RuntimeError: if the bin specifications of a numeric or timestamp
        feature hold neither ``bin_width`` nor all of ``num``, ``low``, ``high``
    """
    hist = hg.Count()

    # innermost layer first: last feature is wrapped first
    for col in reversed(features):
        # histogram type depends on the data type
        dt = self.var_dtype[col]

        # processing function, e.g. only accept booleans during filling
        f = utils.QUANTITY[dt]
        if len(features) == 1:
            # df[col] is a pd.Series
            quant = lambda x, fnc=f: fnc(x)  # noqa
        else:
            # df[features] is a pd.DataFrame; pin the column to col
            quant = lambda x, fnc=f, clm=col: fnc(x[clm])  # noqa

        numeric = np.issubdtype(dt, np.number)
        temporal = np.issubdtype(dt, np.datetime64)

        if not (numeric or temporal):
            # strings and booleans are treated as categories
            hist = hg.Categorize(quantity=quant, value=hist)
            continue

        # numbers and timestamps are put in a binned histogram
        specs = self.var_bin_specs(features, features.index(col))
        if "bin_width" in specs:
            hist = hg.SparselyBin(
                binWidth=specs["bin_width"],
                origin=specs.get("bin_offset", 0),
                quantity=quant,
                value=hist,
            )
        elif "num" in specs and "low" in specs and "high" in specs:
            hist = hg.Bin(
                num=specs["num"],
                low=specs["low"],
                high=specs["high"],
                quantity=quant,
                value=hist,
            )
        else:
            raise RuntimeError(
                "Do not know how to interpret bin specifications.")

    return hist
def test_get_consistent_numpy_entries():
    """Test extraction of the number of entries.

    The bin labels / bin centers of the input histograms are first made
    consistent with each other.
    """
    frame_a = pd.DataFrame({
        "A": [0, 1, 2, 3, 4, 3, 2, 1, 1, 1],
        "C": ["f1", "f3", "f4", "f3", "f4", "f2", "f2", "f1", "f3", "f4"],
    })
    frame_b = pd.DataFrame({
        "A": [2, 3, 4, 5, 7, 4, 6, 5, 7, 8],
        "C": ["f7", "f3", "f5", "f8", "f9", "f2", "f3", "f6", "f7", "f7"],
    })

    # building 1d histograms: two categorical, two sparsely binned
    cat_a = hg.Categorize(unit("C"))
    cat_b = hg.Categorize(unit("C"))
    num_a = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit("A"))
    num_b = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit("A"))

    # fill them
    cat_a.fill.numpy(frame_a)
    cat_b.fill.numpy(frame_b)
    num_a.fill.numpy(frame_a)
    num_b.fill.numpy(frame_b)

    cat_entries_a, cat_entries_b = get_consistent_numpy_entries(
        [cat_a, cat_b], get_bin_labels=False)
    _, cat_labels = get_consistent_numpy_entries([cat_a, cat_b], get_bin_labels=True)
    num_entries_a, num_entries_b = get_consistent_numpy_entries(
        [num_a, num_b], get_bin_labels=False)
    _, num_centers = get_consistent_numpy_entries([num_a, num_b], get_bin_labels=True)

    # (actual, expected) pairs, checked in order
    checks = [
        (cat_entries_a, [2.0, 2.0, 3.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0]),
        (cat_entries_b, [0.0, 1.0, 2.0, 0.0, 1.0, 1.0, 3.0, 1.0, 1.0]),
        (cat_labels, ["f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9"]),
        (num_entries_a, [1.0, 4.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 0.0]),
        (num_entries_b, [0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0]),
        (num_centers, [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5]),
    ]
    for actual, wanted in checks:
        np.testing.assert_array_equal(actual, wanted)
def construct_empty_hist(self, columns):
    """Create an (empty) histogram of the right type.

    Starting from a ``hg.Count``, the columns are walked in reverse order and
    every column wraps the histogram built so far in a new layer, which
    yields a multi-dimensional histogram. The data type(s) and the number of
    dimensions are attached to the outermost histogram afterwards.

    :param list columns: histogram columns
    :returns: created histogram
    :rtype: histogrammar.Count
    """
    hist = hg.Count()

    # innermost layer first: last column is wrapped first
    for col in reversed(columns):
        # histogram type depends on the data type
        dt = np.dtype(self.var_dtype[col])

        # processing function, e.g. only accept booleans during filling;
        # a user-supplied quantity for this column takes precedence
        f = self.quantity.get(col, hf.QUANTITY[dt.type])
        if len(columns) == 1:
            # df[col] is a pd.Series
            quant = lambda x, fnc=f: fnc(x)  # noqa
        else:
            # df[columns] is a pd.DataFrame; pin the column to col
            quant = lambda x, fnc=f, clm=col: fnc(x[clm])  # noqa

        # classify by an instance of the dtype's scalar type
        is_number = isinstance(dt.type(), np.number)
        is_timestamp = isinstance(dt.type(), np.datetime64)

        if not (is_number or is_timestamp):
            # strings and booleans are treated as categories
            hist = hg.Categorize(quantity=quant, value=hist)
            continue

        # numbers and timestamps are put in a sparse binned histogram;
        # fall back to the unit specs when the column has none of its own
        bs = self.bin_specs.get(
            col,
            self._unit_bin_specs if is_number else self._unit_timestamp_specs,
        )
        hist = hg.SparselyBin(
            binWidth=bs['bin_width'],
            origin=bs['bin_offset'],
            quantity=quant,
            value=hist,
        )
        # decorating the intermediate histograms inside this loop
        # doesn't seem to work, so it is done once below

    # FIXME stick data types and number of dimension to histogram
    dta = [self.var_dtype[col] for col in columns]
    hist.datatype = dta[0] if len(columns) == 1 else dta
    hist.n_dim = len(columns)
    return hist
def construct_empty_hist(self, df, columns):
    """Create an (empty) histogram of the right type.

    Starting from a ``histogrammar.Count``, the columns are walked in reverse
    order and every column wraps the histogram built so far in a new layer,
    which yields a multi-dimensional histogram. The data type(s), the number
    of dimensions and an ``n_bins`` property object are attached to the
    outermost histogram afterwards.

    :param df: input dataframe; ``df[col]`` is passed as quantity of each layer
    :param list columns: histogram columns
    :returns: created histogram
    :rtype: histogrammar.Count
    """
    hist = histogrammar.Count()

    # innermost layer first: last column is wrapped first
    for col in reversed(columns):
        # histogram type depends on the data type
        dt = np.dtype(self.var_dtype[col])
        is_number = isinstance(dt.type(), np.number)
        is_timestamp = isinstance(dt.type(), np.datetime64)

        if not (is_number or is_timestamp):
            # strings and booleans are treated as categories
            hist = histogrammar.Categorize(quantity=df[col], value=hist)
            continue

        # numbers and timestamps are put in a sparse binned histogram;
        # fall back to the unit specs when the column has none of its own
        bs = self.bin_specs.get(
            col,
            self._unit_bin_specs if is_number else self._unit_timestamp_specs,
        )
        hist = histogrammar.SparselyBin(
            binWidth=bs['bin_width'],
            origin=bs['bin_offset'],
            quantity=df[col],
            value=hist,
        )

    # FIXME stick data types and number of dimension to histogram
    dta = [self.var_dtype[col] for col in columns]
    hist.datatype = dta[0] if len(columns) == 1 else dta
    hist.n_dim = len(columns)

    def n_bins(self):
        if hasattr(self, 'num'):
            return self.num
        elif hasattr(self, 'size'):
            return self.size
        else:
            raise RuntimeError(
                'Cannot retrieve number of bins from hgr hist')

    # NOTE(review): a property stored on an instance is not invoked on
    # attribute access (descriptors only act when set on the class), so
    # ``hist.n_bins`` evaluates to the property object itself -- confirm
    # whether callers rely on this.
    hist.n_bins = property(n_bins)
    return hist
def test_bin_centers(self):
    """Test getting the bin centers of Bin and SparselyBin histograms.

    Covers the default call and calls with a requested ``low``/``high``
    range. Returns early (without asserting anything) when the ``Pandas()``
    or ``Numpy()`` context manager yields ``None``.
    """
    with Pandas() as pd:
        if pd is None:
            return
    with Numpy() as np:
        # Guard on the name bound by this ``with`` statement; the previous
        # check looked at the unrelated name ``numpy``.
        if np is None:
            return
        sys.stderr.write("\n")

        df1 = pd.DataFrame({'A': [0, 1, 2, 3, 4, 3, 2, 1, 1, 1]})
        df2 = pd.DataFrame({'A': [2, 3, 4, 5, 7, 4, 6, 5, 7, 8]})

        # histograms
        hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist4 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))
        hist5 = hg.Bin(num=10, low=0.0, high=10., quantity=unit('A'))

        # fill them
        hist2.fill.numpy(df1)
        hist3.fill.numpy(df2)
        hist4.fill.numpy(df1)
        hist5.fill.numpy(df2)

        # the assertions use numpy's own testing helpers
        import numpy as np

        # sparsely binned histograms
        np.testing.assert_array_equal(hist2.bin_centers(), [0.5, 1.5, 2.5, 3.5, 4.5])
        np.testing.assert_array_equal(
            hist3.bin_centers(), [2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5])
        np.testing.assert_array_equal(
            hist2.bin_centers(low=5, high=15),
            [5.5, 6.5, 7.5, 8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5])
        np.testing.assert_array_equal(
            hist3.bin_centers(), [2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5])
        np.testing.assert_array_equal(
            hist3.bin_centers(low=2.1, high=5.6), [2.5, 3.5, 4.5, 5.5])

        # fixed binned histograms
        np.testing.assert_array_equal(
            hist4.bin_centers(), [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5])
        np.testing.assert_array_equal(
            hist5.bin_centers(), [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5])
        np.testing.assert_array_equal(
            hist4.bin_centers(low=5, high=15), [5.5, 6.5, 7.5, 8.5, 9.5])
        np.testing.assert_array_equal(
            hist5.bin_centers(low=2.1, high=9.1),
            [2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5])
def test_num_bins(self):
    """Test getting the number of bins from lowest to highest bin.

    Covers the default call and calls with a requested ``low``/``high``
    range. Returns early (without asserting anything) when the ``Pandas()``
    or ``Numpy()`` context manager yields ``None``.
    """
    with Pandas() as pd:
        if pd is None:
            return
    with Numpy() as np:
        # Guard on the name bound by this ``with`` statement; the previous
        # check looked at the unrelated name ``numpy``.
        if np is None:
            return
        sys.stderr.write("\n")

        df1 = pd.DataFrame({'A': [0, 2, 4, 5, 7, 9, 11, 13, 13, 15]})
        df2 = pd.DataFrame({'A': [2, 4, 4, 6, 8, 7, 10, 14, 17, 19]})

        # building test histograms
        hist2 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist3 = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit('A'))
        hist4 = hg.Bin(num=20, low=0.0, high=20., quantity=unit('A'))
        hist5 = hg.Bin(num=20, low=0.0, high=20., quantity=unit('A'))

        # fill them
        hist2.fill.numpy(df1)
        hist3.fill.numpy(df2)
        hist4.fill.numpy(df1)
        hist5.fill.numpy(df2)

        # default range
        assert hist2.num_bins() == 16
        assert hist3.num_bins() == 18
        assert hist4.num_bins() == 20
        assert hist5.num_bins() == 20

        # requested range partly above the filled / defined range
        assert hist2.num_bins(low=10, high=25) == 15
        assert hist3.num_bins(low=10, high=25) == 15
        assert hist4.num_bins(low=10, high=25) == 10
        assert hist5.num_bins(low=10, high=25) == 10

        # requested range wider than the filled / defined range on both sides
        assert hist2.num_bins(low=-10, high=28) == 38
        assert hist3.num_bins(low=-10, high=28) == 38
        assert hist4.num_bins(low=-10, high=28) == 20
        assert hist5.num_bins(low=-10, high=28) == 20
def test_get_consistent_numpy_1dhists():
    """Test extraction of the number of entries and bin edges / labels.

    The bin edges / bin labels of the input histograms are first made
    consistent with each other.
    """
    frame_a = pd.DataFrame({"A": [0, 1, 2, 3, 4, 3, 2, 1, 1, 1]})
    frame_b = pd.DataFrame({"A": [2, 3, 4, 5, 7, 4, 6, 5, 7, 8]})

    # building two sparsely binned 1d histograms
    sparse_a = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit("A"))
    sparse_b = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit("A"))

    # fill them
    sparse_a.fill.numpy(frame_a)
    sparse_b.fill.numpy(frame_b)

    np_a, np_b = get_consistent_numpy_1dhists([sparse_a, sparse_b], get_bin_labels=False)
    np_both, centers = get_consistent_numpy_1dhists([sparse_a, sparse_b], get_bin_labels=True)

    want_edges = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]
    want_centers = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5]
    want_entries = [
        [1.0, 4.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0],
    ]

    # each numpy-style histogram is an (entries, bin_edges) pair
    for np_hist, entries in zip((np_a, np_b), want_entries):
        np.testing.assert_array_equal(np_hist[0], entries)
        np.testing.assert_array_equal(np_hist[1], want_edges)

    # same content when the bin centers are requested as well
    for idx, entries in enumerate(want_entries):
        np.testing.assert_array_equal(np_both[idx][0], entries)
        np.testing.assert_array_equal(np_both[idx][1], want_edges)

    np.testing.assert_array_equal(centers, want_centers)
def project_on_x(hist):
    """Project an n-dim histogram onto its x-axis.

    The input is returned unchanged when it reports ``n_dim <= 1``, when it
    has neither ``bins`` nor ``values``, or when that container is empty.
    Otherwise a fresh 1d histogram of the same type and binning is created,
    in which every bin holds a ``Count`` of the summed entries of the
    corresponding input bin.

    :param hist: input histogrammar histogram
    :return: on x-axis projected histogram (1d)
    :raises TypeError: if ``hist`` is not a Bin, SparselyBin or Categorize
    """
    # basic check: projecting on itself
    if hasattr(hist, "n_dim") and hist.n_dim <= 1:
        return hist

    # basic checks on contents: pick the bin container, bail out if none or empty
    keyed = hasattr(hist, "bins")
    if keyed:
        content = hist.bins
    elif hasattr(hist, "values"):
        content = hist.values
    else:
        return hist
    if len(content) == 0:
        return hist

    # make empty clone
    # note: cannot do: h_x = hist.zero(), b/c it copies n-dim structure,
    # which screws up hist.toJsonString()
    if isinstance(hist, histogrammar.Bin):
        h_x = histogrammar.Bin(
            num=hist.num,
            low=hist.low,
            high=hist.high,
            quantity=hist.quantity,
        )
    elif isinstance(hist, histogrammar.SparselyBin):
        h_x = histogrammar.SparselyBin(
            binWidth=hist.binWidth,
            origin=hist.origin,
            quantity=hist.quantity,
        )
    elif isinstance(hist, histogrammar.Categorize):
        h_x = histogrammar.Categorize(quantity=hist.quantity)
    else:
        raise TypeError("Unknown histogram type. cannot get zero copy.")

    # collapse every sub-histogram into a plain count of its entries
    if keyed:
        for key, sub in content.items():
            h_x.bins[key] = histogrammar.Count.ed(sum_entries(sub))
    else:
        for pos, sub in enumerate(content):
            h_x.values[pos] = histogrammar.Count.ed(sum_entries(sub))

    return h_x
def construct_empty_hist(self, df, columns):
    """Create an (empty) histogram of the right type.

    Starting from a ``hg.Count``, the columns are walked in reverse order and
    every column wraps the histogram built so far in a new layer, which
    yields a multi-dimensional histogram. The data type(s) and the number of
    dimensions are attached to the outermost histogram afterwards.

    :param df: input dataframe; ``df[col]`` is passed as quantity of each layer
    :param list columns: histogram columns
    :returns: created histogram
    :rtype: histogrammar.Count
    """
    hist = hg.Count()

    # innermost layer first: last column is wrapped first
    for col in reversed(columns):
        # histogram type depends on the data type
        dt = np.dtype(self.var_dtype[col])
        is_number = isinstance(dt.type(), np.number)
        is_timestamp = isinstance(dt.type(), np.datetime64)

        if not (is_number or is_timestamp):
            # strings and booleans are treated as categories
            hist = hg.Categorize(quantity=df[col], value=hist)
            continue

        # numbers and timestamps are put in a sparse binned histogram
        specs = self.var_bin_specs(columns, columns.index(col))
        hist = hg.SparselyBin(
            binWidth=specs['bin_width'],
            origin=specs['bin_offset'],
            quantity=df[col],
            value=hist,
        )
        # decorating the intermediate histograms inside this loop
        # doesn't seem to work, so it is done once below

    # FIXME stick data types and number of dimension to histogram
    dta = [self.var_dtype[col] for col in columns]
    hist.datatype = dta[0] if len(columns) == 1 else dta
    hist.n_dim = len(columns)
    return hist
def _create_hist_with_time_axis(self, hist, time_bin_idx):
    """Create a histogram with a time-axis and place ``hist`` into it.

    An integer time-value gives a ``SparselyBin`` time-axis (bin width 1,
    origin 0), a string time-value a ``Categorize`` time-axis. ``hist``
    becomes the content of bin ``time_bin_idx`` and its number of entries is
    copied to the new histogram.

    :param hist: input histogram to insert into histogram with time-axis
    :param time_bin_idx: time-value (str or int) at which to insert histogram
    :return: histogram with time-axis
    :raises TypeError: if ``time_bin_idx`` is not a string or an integer
    """
    # basic checks (None is rejected as well: it is neither str nor int)
    if not isinstance(time_bin_idx, (str, int)):
        raise TypeError(
            "time_bin_idx not set. should be an (ordered) string or integer."
        )

    if isinstance(time_bin_idx, int):
        ht = hg.SparselyBin(binWidth=1.0, origin=0.0, quantity=lambda x: x)  # noqa
    else:
        ht = hg.Categorize(quantity=lambda x: x)  # noqa

    ht.bins[time_bin_idx] = hist
    ht.entries = hist.entries
    return ht
def construct_empty_hist(self, df, features):
    """Create an (empty) histogram of the right type.

    Starting from a ``hg.Count``, the features are walked in reverse order
    and every feature wraps the histogram built so far in a new layer, which
    yields a multi-dimensional histogram. The data type(s) are attached to
    the outermost histogram afterwards.

    :param df: input dataframe; ``df[col]`` is passed as quantity of each layer
    :param list features: histogram features
    :return: created histogram
    :rtype: histogrammar.Count
    """
    hist = hg.Count()

    # innermost layer first: last feature is wrapped first
    for col in reversed(features):
        # histogram type depends on the data type
        dt = np.dtype(self.var_dtype[col])
        is_number = isinstance(dt.type(), np.number)
        is_timestamp = isinstance(dt.type(), np.datetime64)

        if not (is_number or is_timestamp):
            # strings and booleans are treated as categories
            hist = hg.Categorize(quantity=df[col], value=hist)
            continue

        # numbers and timestamps are put in a sparse binned histogram
        specs = self.var_bin_specs(features, features.index(col))
        hist = hg.SparselyBin(
            binWidth=specs['bin_width'],
            origin=specs['bin_offset'],
            quantity=df[col],
            value=hist,
        )

    # set data types in histogram
    dta = [self.var_dtype[col] for col in features]
    hist.datatype = dta[0] if len(features) == 1 else dta
    return hist
def get_test_histograms1():
    """Get set 1 of test histograms.

    :return: tuple of the dataframe and the filled 1d-, 2d- and 3d-histogram
    """
    import pandas as pd
    import histogrammar as hg

    # dummy dataset with mixed types;
    # timestamp column D is converted to nanoseconds since 1970-1-1
    frame = pd.util.testing.makeMixedDataFrame()
    frame['date'] = frame['D'].apply(to_ns)
    frame['boolT'] = True
    frame['boolF'] = False

    # building 1d-, 2d-, and 3d-histogram (iteratively):
    # each one uses the previous one as its sub-histogram
    cat_1d = hg.Categorize(unit('C'))
    num_2d = hg.Bin(5, 0, 5, unit('A'), value=cat_1d)
    date_3d = hg.SparselyBin(
        origin=pd.Timestamp('2009-01-01').value,
        binWidth=pd.Timedelta(days=1).value,
        quantity=unit('date'),
        value=num_2d,
    )

    # fill them
    for h in (cat_1d, num_2d, date_3d):
        h.fill.numpy(frame)

    return frame, cat_1d, num_2d, date_3d
def get_test_histograms1():
    """Get set 1 of test histograms.

    :return: tuple of the dataframe and the filled 1d-, 2d- and 3d-histogram
    """
    # dummy dataset with mixed types;
    # timestamp column D is converted to nanoseconds since 1970-1-1
    frame = pd.util.testing.makeMixedDataFrame()
    frame["date"] = frame["D"].apply(to_ns)
    frame["boolT"] = True
    frame["boolF"] = False

    # building 1d-, 2d-, and 3d-histogram (iteratively):
    # each one uses the previous one as its sub-histogram
    cat_1d = hg.Categorize(unit("C"))
    num_2d = hg.Bin(5, 0, 5, unit("A"), value=cat_1d)
    date_3d = hg.SparselyBin(
        origin=pd.Timestamp("2009-01-01").value,
        binWidth=pd.Timedelta(days=1).value,
        quantity=unit("date"),
        value=num_2d,
    )

    # fill them
    for h in (cat_1d, num_2d, date_3d):
        h.fill.numpy(frame)

    return frame, cat_1d, num_2d, date_3d
def _construct_empty_hist(self, columns):
    """Create an (empty) histogram of the right type.

    Starting from a ``hg.Count``, the columns are walked in reverse order and
    every column wraps the histogram built so far in a new layer, which
    yields a multi-dimensional histogram. The data type(s), the number of
    dimensions and an ``n_bins`` property object are attached to the
    outermost histogram afterwards.

    :param columns: histogram columns
    :returns: created histogram
    :rtype: histogrammar.Count
    """
    hist = hg.Count()

    # create a multi-dim histogram by iterating through the columns in reverse order
    # and passing a single-dim hist as input to the next column
    for col in reversed(columns):
        # histogram type depends on the data type
        dt = np.dtype(self.datatype[col])

        # processing function, e.g. only accept booleans during filling;
        # a user-supplied quantity for this column takes precedence
        f = self.quantity[col] if col in self.quantity else QUANTITY[dt.type]
        if len(columns) == 1:
            # df[col] is a pd.Series
            q = lambda x, fnc=f: fnc(x)  # noqa
        else:
            # df[columns] is a pd.DataFrame; pin the column to col
            q = lambda x, fnc=f, clm=col: fnc(x[clm])  # noqa

        is_number = isinstance(dt.type(), np.number)
        is_timestamp = isinstance(dt.type(), np.datetime64)

        if is_number or is_timestamp:
            # numbers and timestamps are put in a sparse binned histogram;
            # fall back to the unit specs when the column has none of its own
            bs = self.bin_specs.get(
                col,
                self._unit_bin_specs if is_number else self._unit_timestamp_specs)
            hist = hg.SparselyBin(binWidth=bs['bin_width'],
                                  origin=bs['bin_offset'],
                                  quantity=q,
                                  value=hist)
        else:
            # strings and booleans are treated as categories
            hist = hg.Categorize(quantity=q, value=hist)

    # FIXME stick data types and number of dimension to histogram
    dta = [self.datatype[col] for col in columns]
    hist.datatype = dta[0] if len(columns) == 1 else dta
    hist.n_dim = len(columns)

    @property
    def n_bins(self):
        # attribute names must be passed to hasattr as strings; the bare
        # names ``num`` / ``size`` used before were undefined (NameError)
        if hasattr(self, 'num'):
            return self.num
        elif hasattr(self, 'size'):
            return self.size
        else:
            raise Exception(
                'Cannot retrieve number of bins from hgr hist.')

    # NOTE(review): a property stored on an instance is not invoked on
    # attribute access (descriptors only act when set on the class), so
    # ``hist.n_bins`` evaluates to the property object itself -- confirm
    # whether callers rely on this.
    hist.n_bins = n_bins
    return hist
def test_get_consistent_numpy_2dgrids():
    """Test extraction of the number of entries for 2d hists.

    The bin edges of the input histograms are first made consistent with each
    other. A 1d input is expected to raise a ``ValueError``.
    """
    frame_a = pd.DataFrame({
        "A": [0, 1, 2, 3, 4, 3, 2, 1, 1, 1],
        "C": ["f1", "f3", "f4", "f3", "f4", "f2", "f2", "f1", "f3", "f4"],
    })
    frame_b = pd.DataFrame({
        "A": [2, 3, 4, 5, 7, 4, 6, 5, 7, 8],
        "C": ["f7", "f3", "f5", "f8", "f9", "f2", "f3", "f6", "f7", "f7"],
    })

    # building one 1d histogram and two 2d histograms on top of it
    cat_1d = hg.Categorize(unit("C"))
    grid_a = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit("A"), value=cat_1d)
    grid_b = hg.SparselyBin(origin=0.0, binWidth=1.0, quantity=unit("A"), value=cat_1d)

    # fill them
    cat_1d.fill.numpy(frame_a)
    grid_a.fill.numpy(frame_a)
    grid_b.fill.numpy(frame_b)

    # 1d input: keep the error arguments for the check further below
    error_args = [""]
    try:
        get_consistent_numpy_2dgrids([cat_1d, cat_1d])
    except ValueError as err:
        error_args = err.args

    grids = get_consistent_numpy_2dgrids([grid_a, grid_b])

    empty_row = [0.0] * 9
    want_a = np.asarray([
        [1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
        empty_row,
        empty_row,
        empty_row,
        empty_row,
        empty_row,
    ])
    want_b = np.asarray([
        empty_row,
        [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
        empty_row,
        [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0],
        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
    ])

    # MB 20190828: not sure if this is the right way to test for exceptions.
    expected_message = "Input histogram only has 1 dimensions (<2). Cannot compute 2d-grid."
    assert error_args[0] == expected_message

    for idx, wanted in enumerate([want_a, want_b]):
        assert (grids[idx] == wanted).all()