def test_hub_if_else(self) -> None:
    """Check the hub output when the switch condition is always false.

    ``Max`` is wired to the switch's ``result`` output and ``Min`` to its
    ``result_else`` output; both are plugged into the hub's ``table`` input.
    After the run, the hub's result is compared with the minimum computed
    directly on the stirrer's result.
    """
    # The explicit ``-> None`` matches the sibling tests and lets type
    # checkers analyse the body instead of skipping an untyped def.
    s = Scheduler()
    random = RandomTable(2, rows=100000, scheduler=s)
    stirrer = Stirrer(
        update_column="_1",
        delete_rows=5,
        update_rows=5,
        fixed_step_size=100,
        scheduler=s,
    )
    stirrer.input[0] = random.output.result
    # Constant-false condition: presumably only ``result_else`` carries data.
    switch = Switch(condition=lambda x: False, scheduler=s)
    switch.input[0] = stirrer.output.result
    max_ = Max(name="max_" + str(hash(random)), scheduler=s)
    max_.input[0] = switch.output.result
    min_ = Min(name="min_" + str(hash(random)), scheduler=s)
    min_.input[0] = switch.output.result_else
    hub = Hub(scheduler=s)
    # Both branches are assigned to the same ``table`` input of the hub.
    hub.input.table = min_.output.result
    hub.input.table = max_.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = hub.output.result
    aio.run(s.start())
    expected = stirrer.result.min()
    actual = hub.result
    self.compare(expected, actual)
def t_histogram1d_impl(self, **kw: Any) -> None:
    """Compare ``Histogram1D`` on column ``_2`` with ``np.histogram``.

    Args:
        **kw: extra keyword arguments forwarded to the ``Stirrer`` module.
    """
    sched = self.scheduler()
    loader = CSVLoader(
        get_dataset("bigfile"),
        index_col=False,
        header=None,
        scheduler=sched,
    )
    stirrer = Stirrer(
        update_column="_2", fixed_step_size=1000, scheduler=sched, **kw
    )
    stirrer.input[0] = loader.output.result
    lower = Min(scheduler=sched)
    lower.input[0] = stirrer.output.result
    upper = Max(scheduler=sched)
    upper.input[0] = stirrer.output.result
    histogram1d = Histogram1D("_2", scheduler=sched)
    histogram1d.input[0] = stirrer.output.result
    histogram1d.input.min = lower.output.result
    histogram1d.input.max = upper.output.result
    sink = Every(proc=self.terse, scheduler=sched)
    sink.input[0] = histogram1d.output.result
    aio.run(sched.start())
    # The returned statistics are not inspected; the call itself is kept.
    histogram1d.trace_stats()
    final_row = notNone(histogram1d.table.last()).to_dict()
    actual = final_row["array"]
    value_range = (final_row["min"], final_row["max"])
    column = stirrer.table.loc[:, ["_2"]]
    assert column is not None
    flat_values = column.to_array().reshape(-1)
    # Reference histogram over the same bin count and bounds.
    expected, _ = np.histogram(  # type: ignore
        flat_values,
        bins=histogram1d.params.bins,
        density=False,
        range=value_range,
    )
    self.assertEqual(np.sum(actual), np.sum(expected))
    self.assertListEqual(actual.tolist(), expected.tolist())
def test_bisect2(self) -> None:
    """Check ``Bisect`` (``_1 > limit``) on a table stirred with deletions.

    The index of the bisect output is compared with the index obtained by
    evaluating ``_1>0.5`` directly on the stirrer's table.
    """
    sched = self.scheduler()
    source = RandomTable(2, rows=100_000, scheduler=sched)
    stirrer = Stirrer(
        update_column="_1",
        delete_rows=100,
        scheduler=sched,
    )
    stirrer.input[0] = source.output.result
    # NOTE(review): dshape declares a string column but the datum is the
    # float 0.5 -- confirm this is intended.
    limit_table = Table(
        name=None, dshape="{value: string}", data={"value": [0.5]}
    )
    limit = Constant(table=limit_table, scheduler=sched)
    hist_index = HistogramIndex(column="_1", scheduler=sched)
    hist_index.create_dependent_modules(stirrer, "result")
    bisect_ = Bisect(
        column="_1", op=">", hist_index=hist_index, scheduler=sched
    )
    bisect_.input[0] = hist_index.output.result
    bisect_.input.limit = limit.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = bisect_.output.result
    aio.run(sched.start())
    expected_index = stirrer.table.eval("_1>0.5", result_object="index")
    self.assertEqual(bisect_.table.index, bitmap(expected_index))
def test_idxmax2(self) -> None:
    """Check that ``IdxMax`` and ``Max`` agree on a stirred random table.

    The ``psdict`` of the ``Max`` module is compared with the last row of the
    table returned by ``IdxMax.max()``.
    """
    s = self.scheduler()
    random = RandomTable(10, rows=10000, throttle=1000, scheduler=s)
    stirrer = Stirrer(
        update_column="_1", delete_rows=5, fixed_step_size=100, scheduler=s
    )
    stirrer.input[0] = random.output.result
    idxmax = IdxMax(scheduler=s)
    idxmax.input[0] = stirrer.output.result
    max_ = Max(scheduler=s)
    max_.input[0] = stirrer.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = idxmax.output.result
    pr2 = Print(proc=self.terse, scheduler=s)
    pr2.input[0] = max_.output.result
    aio.run(s.start())
    expected = max_.psdict
    # Named ``max_table`` rather than ``max`` so the builtin is not shadowed.
    max_table = idxmax.max()
    assert max_table is not None
    actual = notNone(max_table.last()).to_dict()
    self.compare(expected, actual)
def t_histogram2d_impl(self, **kw: Any) -> None:
    """Compare ``Histogram2D`` output with ``fh.histogram2d`` on stirred data.

    Args:
        **kw: extra keyword arguments forwarded to the ``Stirrer`` module.
    """
    sched = self.scheduler()
    source = RandomTable(3, rows=100000, scheduler=sched)
    stirrer = Stirrer(
        update_column="_2", fixed_step_size=1000, scheduler=sched, **kw
    )
    stirrer.input[0] = source.output.result
    lower = Min(scheduler=sched)
    lower.input[0] = stirrer.output.result
    upper = Max(scheduler=sched)
    upper.input[0] = stirrer.output.result
    histogram2d = Histogram2D(0, 1, xbins=100, ybins=100, scheduler=sched)
    histogram2d.input[0] = stirrer.output.result
    histogram2d.input.min = lower.output.result
    histogram2d.input.max = upper.output.result
    heatmap = Heatmap(filename="histo_%03d.png", scheduler=sched)
    heatmap.input.array = histogram2d.output.result
    sink = Every(proc=self.terse, scheduler=sched)
    sink.input[0] = heatmap.output.result
    aio.run(sched.start())
    final_row = notNone(histogram2d.table.last()).to_dict()
    actual = final_row["array"]
    # Reference histogram is computed y-first, then flipped along axis 0.
    y_range = [final_row["ymin"], final_row["ymax"]]
    x_range = [final_row["xmin"], final_row["xmax"]]
    bounds = [y_range, x_range]
    xy_table = stirrer.table.loc[:, ["_1", "_2"]]
    assert xy_table is not None
    points = xy_table.to_array()
    bins = [histogram2d.params.ybins, histogram2d.params.xbins]
    expected = fh.histogram2d(
        points[:, 1], points[:, 0], bins=bins, range=bounds
    )
    expected = np.flip(expected, axis=0)  # type: ignore
    self.assertEqual(np.sum(actual), np.sum(expected))
    self.assertListEqual(
        actual.reshape(-1).tolist(), expected.reshape(-1).tolist()
    )
def _impl_stirred_tst_percentiles_rq(self, accuracy: float, **kw: Any) -> None:
    """Check ``Percentiles`` computed on the output of a ``RangeQuery``.

    The 25th, 50th and 75th percentiles reported by the module are compared
    (``delta=0.01``) with ``np.percentile`` applied to column ``_1`` of the
    range query's table.

    Args:
        accuracy: value passed to ``Percentiles(accuracy=...)``.
        **kw: extra keyword arguments forwarded to the ``Stirrer`` module.
    """
    sched = self.scheduler()
    with sched:
        source = RandomTable(2, rows=10000, scheduler=sched)
        stirrer = Stirrer(
            update_column="_2", fixed_step_size=1000, scheduler=sched, **kw
        )
        stirrer.input[0] = source.output.result
        lower_bound = Constant(table=PsDict({"_1": 0.3}), scheduler=sched)
        upper_bound = Constant(table=PsDict({"_1": 0.8}), scheduler=sched)
        range_qry = RangeQuery(column="_1", scheduler=sched)
        range_qry.create_dependent_modules(
            stirrer,
            "result",
            min_value=lower_bound,
            max_value=upper_bound,
        )
        hist_index = range_qry.hist_index
        assert hist_index
        requested = PsDict({"_25": 25.0, "_50": 50.0, "_75": 75.0})
        which_percentiles = Constant(table=requested, scheduler=sched)
        percentiles = Percentiles(accuracy=accuracy, scheduler=sched)
        percentiles.input[0] = range_qry.output.result
        percentiles.input.percentiles = which_percentiles.output.result
        percentiles.input.hist = hist_index.output.result
        printer = Print(proc=self.terse, scheduler=sched)
        printer.input[0] = percentiles.output.result
    aio.run(sched.start())
    pdict = notNone(percentiles.table.last()).to_dict()
    values = range_qry.table["_1"].values
    p25, p50, p75 = (np.percentile(values, q) for q in (25.0, 50.0, 75.0))  # type: ignore
    print(
        "TSV=> accuracy: ",
        accuracy,
        " 25:",
        p25,
        pdict["_25"],
        " 50:",
        p50,
        pdict["_50"],
        " 75:",
        p75,
        pdict["_75"],
    )
    for reference, key in ((p25, "_25"), (p50, "_50"), (p75, "_75")):
        self.assertAlmostEqual(reference, pdict[key], delta=0.01)
def _t_stirred_unary(self, **kw: Any) -> None:
    """Check ``Unary(np.log)`` on columns ``_3``, ``_5``, ``_7`` of stirred data.

    Args:
        **kw: extra keyword arguments forwarded to the ``Stirrer`` module.
    """
    sched = self.scheduler()
    source = RandomTable(10, rows=100_000, scheduler=sched)
    stirrer = Stirrer(
        update_column="_3", fixed_step_size=1000, scheduler=sched, **kw
    )
    stirrer.input[0] = source.output.result
    module = Unary(np.log, columns=["_3", "_5", "_7"], scheduler=sched)
    module.input[0] = stirrer.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = module.output.result
    aio.run(sched.start())
    # Positions 2, 4 and 6 of the full array are compared with the module's
    # output for columns _3, _5 and _7.
    stirred = stirrer.table.to_array()
    expected = np.log(stirred[:, [2, 4, 6]])
    actual = module.table.to_array()
    self.assertTrue(module.name.startswith("unary_"))
    self.assertTrue(np.allclose(expected, actual, equal_nan=True))
def _impl_stirred_tst_percentiles(self, accuracy: float, **kw: Any) -> None:
    """Check ``Percentiles`` computed directly on a stirred random table.

    The 25th, 50th and 75th percentiles reported by the module are compared
    (``delta=0.01``) with ``np.percentile`` applied to column ``_1`` of the
    stirrer's table.

    Args:
        accuracy: value passed to ``Percentiles(accuracy=...)``.
        **kw: extra keyword arguments forwarded to the ``Stirrer`` module.
    """
    sched = self.scheduler()
    with sched:
        source = RandomTable(2, rows=10000, scheduler=sched)
        stirrer = Stirrer(
            update_column="_2", fixed_step_size=1000, scheduler=sched, **kw
        )
        stirrer.input[0] = source.output.result
        hist_index = HistogramIndex(column="_1", scheduler=sched)
        hist_index.input[0] = stirrer.output.result
        requested = PsDict({"_25": 25.0, "_50": 50.0, "_75": 75.0})
        which_percentiles = Constant(table=requested, scheduler=sched)
        percentiles = Percentiles(accuracy=accuracy, scheduler=sched)
        percentiles.input[0] = stirrer.output.result
        percentiles.input.percentiles = which_percentiles.output.result
        percentiles.input.hist = hist_index.output.result
        printer = Print(proc=self.terse, scheduler=sched)
        printer.input[0] = percentiles.output.result
    aio.run(sched.start())
    pdict = notNone(percentiles.table.last()).to_dict()
    values = stirrer.table.to_array(columns=["_1"]).reshape(-1)
    p25, p50, p75 = (np.percentile(values, q) for q in (25.0, 50.0, 75.0))  # type: ignore
    print(
        "Table=> accuracy: ",
        accuracy,
        " 25:",
        p25,
        pdict["_25"],
        " 50:",
        p50,
        pdict["_50"],
        " 75:",
        p75,
        pdict["_75"],
    )
    for reference, key in ((p25, "_25"), (p50, "_50"), (p75, "_75")):
        self.assertAlmostEqual(reference, pdict[key], delta=0.01)
def test_stirrer(self) -> None:
    """Check ``Max`` computed downstream of a ``Stirrer``.

    The result of the ``Max`` module is compared with ``max()`` taken
    directly on the stirrer's table.
    """
    sched = Scheduler()
    source = RandomTable(2, rows=100000, scheduler=sched)
    stirrer = Stirrer(
        update_column="_1",
        delete_rows=5,
        update_rows=5,
        fixed_step_size=100,
        scheduler=sched,
    )
    stirrer.input[0] = source.output.result
    max_ = Max(name=f"max_{hash(source)}", scheduler=sched)
    max_.input[0] = stirrer.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = max_.output.result
    aio.run(sched.start())
    expected = stirrer.table.max()
    actual = max_.result
    self.compare(expected, actual)
def test_filter3(self) -> None:
    """Check ``FilterMod`` with ``_1 > 0.5`` on a table stirred with updates.

    The filter's output index is verified twice: against the input table's
    own ``eval`` and against the same expression evaluated with pandas.
    """
    sched = Scheduler()
    source = RandomTable(2, rows=100000, scheduler=sched)
    stirrer = Stirrer(
        update_column="_1",
        update_rows=5,
        fixed_step_size=100,
        scheduler=sched,
    )
    stirrer.input[0] = source.output.result
    filter_ = FilterMod(expr="_1 > 0.5", scheduler=sched)
    filter_.input[0] = stirrer.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = filter_.output.result
    aio.run(sched.start())
    input_table = filter_.get_input_slot("table").data()
    # First reference: the table's own expression evaluation.
    table_index = input_table.eval("_1>0.5", result_object="index")
    self.assertEqual(filter_.table.index, bitmap(table_index))
    # Second reference: the same predicate evaluated on a DataFrame copy.
    frame = pd.DataFrame(
        input_table.to_dict(), index=input_table.index.to_array()
    )
    mask = frame.eval("_1>0.5")
    self.assertEqual(filter_.table.index, bitmap(frame.index[mask]))
def _impl_stirred_tst_intersection(self, **kw: Any) -> None:
    """Check ``Intersection`` of two ``Bisect`` modules on stirred data.

    One bisect keeps ``_1 > 0.3`` and the other ``_1 < 0.8``; the index of
    their intersection is compared with ``(_1>0.3)&(_1<0.8)`` evaluated on
    the data feeding the histogram index.

    Args:
        **kw: extra keyword arguments forwarded to the ``Stirrer`` module.
    """
    sched = self.scheduler()
    source = RandomTable(2, rows=100000, scheduler=sched)
    stirrer = Stirrer(
        update_column="_2", fixed_step_size=1000, scheduler=sched, **kw
    )
    stirrer.input[0] = source.output.result
    lower_table = Table(name=None, dshape="{_1: float64}", data={"_1": [0.3]})
    lower_limit = Constant(table=lower_table, scheduler=sched)
    upper_table = Table(name=None, dshape="{_1: float64}", data={"_1": [0.8]})
    upper_limit = Constant(table=upper_table, scheduler=sched)
    hist_index = HistogramIndex(column="_1", scheduler=sched)
    hist_index.create_dependent_modules(stirrer, "result")
    above_min = Bisect(
        column="_1", op=">", hist_index=hist_index, scheduler=sched
    )
    above_min.input[0] = hist_index.output.result
    above_min.input.limit = lower_limit.output.result
    below_max = Bisect(
        column="_1", op="<", hist_index=hist_index, scheduler=sched
    )
    below_max.input[0] = hist_index.output.result
    below_max.input.limit = upper_limit.output.result
    inter = Intersection(scheduler=sched)
    # Both bisect outputs are assigned to the same input of the intersection.
    inter.input[0] = above_min.output.result
    inter.input[0] = below_max.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = inter.output.result
    aio.run(sched.start())
    upstream = hist_index.input_module
    assert upstream is not None
    indexed_data = upstream.output["result"].data()
    expected_index = indexed_data.eval(
        "(_1>0.3)&(_1<0.8)", result_object="index"
    )
    self.assertEqual(inter.table.index, bitmap(expected_index))
def t_stirred_cols_binary(self, **kw: Any) -> None:
    """Check ``ColsBinary(np.add)`` pairing ``_3/_5/_7`` with ``_4/_6/_8``.

    Args:
        **kw: extra keyword arguments forwarded to the ``Stirrer`` module.
    """
    sched = self.scheduler()
    n_cols = 10
    source = RandomTable(n_cols, rows=10_000, scheduler=sched)
    stirrer = Stirrer(
        update_column="_3", fixed_step_size=1000, scheduler=sched, **kw
    )
    stirrer.input[0] = source.output.result
    module = ColsBinary(
        np.add,
        first=["_3", "_5", "_7"],
        second=["_4", "_6", "_8"],
        scheduler=sched,
    )
    module.input[0] = stirrer.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = module.output.result
    aio.run(sched.start())
    self.assertListEqual(module.table.columns, ["_3", "_5", "_7"])
    stirred = stirrer.table.to_array()
    # Positions 2/4/6 and 3/5/7 of the array are added pairwise.
    left = stirred[:, [2, 4, 6]]
    right = stirred[:, [3, 5, 7]]
    expected = np.add(left, right)
    actual = module.table.to_array()
    self.assertTrue(module.name.startswith("cols_binary_"))
    self.assertTrue(np.allclose(expected, actual, equal_nan=True))
def test_switch_if_then(self) -> None:
    """Check the ``result`` branch of ``Switch`` when the condition is true.

    ``Max`` is wired to the switch's ``result`` output while a ``Print``
    consumes ``result_else``. After the run, the ``Max`` result is compared
    with the maximum computed directly on the stirrer's result.
    """
    # The explicit ``-> None`` matches the sibling tests and lets type
    # checkers analyse the body instead of skipping an untyped def.
    s = Scheduler()
    random = RandomTable(2, rows=100000, scheduler=s)
    stirrer = Stirrer(
        update_column="_1",
        delete_rows=5,
        update_rows=5,
        fixed_step_size=100,
        scheduler=s,
    )
    stirrer.input[0] = random.output.result
    # Constant-true condition: presumably only ``result`` carries data.
    switch = Switch(condition=lambda x: True, scheduler=s)
    switch.input[0] = stirrer.output.result
    max_ = Max(name="max_" + str(hash(random)), scheduler=s)
    max_.input[0] = switch.output.result
    pr_else = Print(proc=self.terse, scheduler=s)
    pr_else.input[0] = switch.output.result_else
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = max_.output.result
    aio.run(s.start())
    expected = stirrer.result.max()
    actual = max_.result
    self.compare(expected, actual)
def test_idxmin2(self) -> None:
    """Check that ``IdxMin`` and ``Min`` agree on a stirred random table.

    The ``psdict`` of the ``Min`` module is compared with the last row of the
    table returned by ``IdxMin.min()``.
    """
    s = self.scheduler()
    random = RandomTable(10, rows=10000, throttle=1000, scheduler=s)
    stirrer = Stirrer(
        update_column="_1", delete_rows=5, fixed_step_size=100, scheduler=s
    )
    stirrer.input[0] = random.output.result
    idxmin = IdxMin(scheduler=s)
    idxmin.input[0] = stirrer.output.result
    min_ = Min(scheduler=s)
    min_.input[0] = stirrer.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = idxmin.output.result
    pr2 = Print(proc=self.terse, scheduler=s)
    pr2.input[0] = min_.output.result
    aio.run(s.start())
    expected = min_.psdict
    # Named ``min_table`` rather than ``min`` so the builtin is not shadowed.
    min_table = idxmin.min()
    assert min_table is not None
    actual = notNone(min_table.last()).to_dict()
    self.compare(expected, actual)
def _t_stirred_binary(self, **kw: Any) -> None:
    """Check ``Binary(np.add)`` fed by two independently stirred tables.

    The reference result is computed with ``np.add`` on the rows whose index
    is present in both stirrers' tables.

    Args:
        **kw: extra keyword arguments forwarded to both ``Stirrer`` modules.
    """
    sched = self.scheduler()
    source_a = RandomTable(10, rows=100000, scheduler=sched)
    source_b = RandomTable(10, rows=100000, scheduler=sched)
    stirrer_a = Stirrer(
        update_column="_3", fixed_step_size=1000, scheduler=sched, **kw
    )
    stirrer_a.input[0] = source_a.output.result
    stirrer_b = Stirrer(
        update_column="_3", fixed_step_size=1000, scheduler=sched, **kw
    )
    stirrer_b.input[0] = source_b.output.result
    operand_columns = {
        "first": ["_3", "_5", "_7"],
        "second": ["_4", "_6", "_8"],
    }
    module = Binary(np.add, columns=operand_columns, scheduler=sched)
    module.input.first = stirrer_a.output.result
    module.input.second = stirrer_b.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = module.output.result
    aio.run(sched.start())
    index_a = stirrer_a.table.index.to_array()
    index_b = stirrer_b.table.index.to_array()
    # Only rows present in both tables take part in the reference result.
    shared = bitmap(index_a) & bitmap(index_b)
    rows_a = stirrer_a.table.loc[shared, :]
    rows_b = stirrer_b.table.loc[shared, :]
    assert rows_a is not None and rows_b is not None
    left = rows_a.to_array()[:, [2, 4, 6]]
    right = rows_b.to_array()[:, [3, 5, 7]]
    expected = np.add(left, right)
    actual = module.table.to_array()
    self.assertTrue(module.name.startswith("binary_"))
    self.assertTrue(np.allclose(expected, actual, equal_nan=True))