def test_pattern(self) -> None:
    s = self.scheduler()
    n_samples = 1_000
    centers = [(0.1, 0.3, 0.5), (0.7, 0.5, 3.3), (-0.4, -0.3, -11.1)]
    cols = ["A", "B", "C"]
    with s:
        data = BlobsTable(
            columns=cols,
            centers=centers,
            cluster_std=0.2,
            rows=n_samples,
            scheduler=s,
        )
        # ds = DataShape(scheduler=s)
        # ds.input.table = data.output.result
        factory = StatsFactory(input_module=data, scheduler=s)
        factory.create_dependent_modules(var_name="my_dyn_var")
        factory.input.table = data.output.result
        sink = Sink(scheduler=s)
        # sink.input.inp = ds.output.result
        sink.input.inp = factory.output.result

        async def fake_input_1(scheduler: Scheduler, rn: int) -> None:
            module = scheduler["my_dyn_var"]
            print("from input my_dyn_var")
            await module.from_input({"matrix": matrix_hist})

        s.on_loop(my_stop, 4)
        s.on_loop(fake_input_1, 3)
    aio.run(s.start())
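# `my_stop` and `matrix_hist` above are defined elsewhere in the test module.
# A minimal sketch of what such a stop callback could look like (hypothetical
# name; it assumes Scheduler.stop() is awaitable, as on_loop callbacks are
# coroutines):
async def my_stop_sketch(scheduler: Scheduler, run_number: int) -> None:
    # Halt the scheduler loop once enough runs have elapsed.
    await scheduler.stop()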
def test_hist_index_min_max(self) -> None:
    "Test min_out and max_out on HistogramIndex"
    s = self.scheduler()
    with s:
        random = RandomTable(2, rows=100000, scheduler=s)
        t_min = PsDict({"_1": 0.3})
        min_value = Constant(table=t_min, scheduler=s)
        t_max = PsDict({"_1": 0.8})
        max_value = Constant(table=t_max, scheduler=s)
        range_qry = RangeQuery(column="_1", scheduler=s)
        range_qry.create_dependent_modules(
            random, "result", min_value=min_value, max_value=max_value
        )
        prt = Print(proc=self.terse, scheduler=s)
        prt.input[0] = range_qry.output.result
        hist_index = range_qry.hist_index
        assert hist_index is not None
        min_ = Min(name="min_" + str(hash(hist_index)), scheduler=s)
        min_.input[0] = hist_index.output.min_out
        prt2 = Print(proc=self.terse, scheduler=s)
        prt2.input[0] = min_.output.result
        max_ = Max(name="max_" + str(hash(hist_index)), scheduler=s)
        max_.input[0] = hist_index.output.max_out
        pr3 = Print(proc=self.terse, scheduler=s)
        pr3.input[0] = max_.output.result
    aio.run(s.start())
    res1 = cast(float, random.table.min()["_1"])
    res2 = cast(float, min_.psdict["_1"])
    self.assertAlmostEqual(res1, res2)
    res1 = cast(float, random.table.max()["_1"])
    res2 = cast(float, max_.psdict["_1"])
    self.assertAlmostEqual(res1, res2)
def test_binary3(self) -> None:
    s = self.scheduler()
    cols = 10
    random1 = RandomTable(cols, rows=100_000, scheduler=s)
    random2 = RandomDict(cols, scheduler=s)
    module = Binary(
        np.add,
        columns={"first": ["_3", "_5", "_7"], "second": ["_4", "_6", "_8"]},
        scheduler=s,
    )
    module.input.first = random1.output.result
    module.input.second = random2.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = module.output.result
    aio.run(s.start())
    res1 = np.add(
        random1.table.to_array()[:, [2, 4, 6]],
        np.array(list(random2.psdict.values()))[[3, 5, 7]],
    )
    res2 = module.table.to_array()
    self.assertTrue(module.name.startswith("binary_"))
    self.assertTrue(np.allclose(res1, res2, equal_nan=True))
def _tst_10_read_multi_csv_file_compress_with_crash(
    self, file_list: List[str], tag: str
) -> None:
    s = self.scheduler()
    module = CSVLoader(
        file_list, index_col=False, recovery_tag=tag, header=None, scheduler=s
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    sts = sleep_then_stop(s, 4)
    aio.run_gather(s.start(), sts)
    _close(module)
    s = self.scheduler(clean=True)
    module = CSVLoader(
        file_list,
        recovery=True,
        recovery_tag=tag,
        index_col=False,
        header=None,
        scheduler=s,
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    aio.run(s.start())
    self.assertEqual(len(module.table), 2000000)
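# A typical caller of the helper above (a hedged sketch: the tag value is
# illustrative, and get_dataset_bz2 is assumed to provide a bz2-compressed
# copy of the dataset):
def test_10_read_multi_csv_file_bz2_with_crash(self) -> None:
    file_list = [get_dataset_bz2("bigfile")] * 2
    self._tst_10_read_multi_csv_file_compress_with_crash(file_list, tag="t10_bz2")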
def test_load_csv(self) -> None:
    """Connecting modules via function calls"""
    with Scheduler.default:
        csv = pv.load_csv(get_dataset("bigfile"), index_col=False, header=None)
        m = pv.min(csv)
        pv.echo(m, proc=prtm)
        M = pv.max(csv)
        pv.echo(M, proc=prtM)
        trace = M["_trace"]
        pv.echo(trace, proc=prtT)
        module = csv.module
        assert module is not None
        self.assertEqual(csv.scheduler(), module.scheduler())
    aio.run(csv.scheduler().start())
    table = csv.table
    lastm = m.table
    lastM = M.table
    self.assertEqual(len(table), 1000000)
    for col in table.columns:
        c = table[col]
        v = c.min()
        self.assertEqual(v, lastm[col])
        v = c.max()
        self.assertEqual(v, lastM[col])
def test_intersection(self) -> None:
    s = self.scheduler()
    random = RandomTable(2, rows=100000, scheduler=s)
    t_min = Table(name=None, dshape="{_1: float64}", data={"_1": [0.3]})
    min_value = Constant(table=t_min, scheduler=s)
    t_max = Table(name=None, dshape="{_1: float64}", data={"_1": [0.8]})
    max_value = Constant(table=t_max, scheduler=s)
    hist_index = HistogramIndex(column="_1", scheduler=s)
    hist_index.create_dependent_modules(random, "result")
    bisect_min = Bisect(column="_1", op=">", hist_index=hist_index, scheduler=s)
    bisect_min.input[0] = hist_index.output.result
    # bisect_.input[0] = random.output.result
    bisect_min.input.limit = min_value.output.result
    bisect_max = Bisect(column="_1", op="<", hist_index=hist_index, scheduler=s)
    bisect_max.input[0] = hist_index.output.result
    # bisect_.input[0] = random.output.result
    bisect_max.input.limit = max_value.output.result
    inter = Intersection(scheduler=s)
    inter.input[0] = bisect_min.output.result
    inter.input[0] = bisect_max.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = inter.output.result
    aio.run(s.start())
    assert hist_index.input_module is not None
    idx = hist_index.input_module.output["result"].data().eval(
        "(_1>0.3)&(_1<0.8)", result_object="index"
    )
    self.assertEqual(inter.table.index, bitmap(idx))
def test_06_read_http_multi_csv_bz2_with_crash(self) -> None:
    self._http_srv = _HttpSrv()
    tag = self.get_tag()
    s = self.scheduler()
    url_list = [make_url("bigfile", ext=BZ2)] * 2
    module = CSVLoader(
        url_list, index_col=False, recovery_tag=tag, header=None, scheduler=s
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    sts = sleep_then_stop(s, 3)
    aio.run_gather(s.start(), sts)
    self._http_srv.restart()
    s = self.scheduler(clean=True)
    module = CSVLoader(
        url_list,
        recovery=True,
        recovery_tag=tag,
        index_col=False,
        header=None,
        scheduler=s,
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    aio.run(s.start())
    self.assertEqual(len(module.table), 2000000)
def t_num_expr_impl(self, cls: Type[NumExprABC]) -> Tuple[Any, ...]:
    s = self.scheduler()
    random1 = RandomTable(10, rows=100000, scheduler=s)
    random2 = RandomTable(10, rows=100000, scheduler=s)
    module = cls(
        columns={"first": ["_1", "_2", "_3"], "second": ["_1", "_2", "_3"]},
        scheduler=s,
    )
    module.input.first = random1.output.result
    module.input.second = random2.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = module.output.result
    aio.run(s.start())
    first = random1.table.to_array()
    first_2 = first[:, 1]
    first_3 = first[:, 2]
    second = random2.table.to_array()
    second_2 = second[:, 1]
    second_3 = second[:, 2]
    ne_1 = ne.evaluate("first_2+2*second_3")
    ne_2 = ne.evaluate("first_3-5*second_2")
    res = module.table.to_array()
    self.assertTrue(np.allclose(res[:, 0], ne_1, equal_nan=True))
    self.assertTrue(np.allclose(res[:, 1], ne_2, equal_nan=True))
    return first_2, first_3, second_2, second_3
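# t_num_expr_impl expects a concrete NumExprABC subclass implementing the two
# expressions checked above. A hedged usage sketch (MyNumExpr is a
# hypothetical subclass name, not taken from the original tests):
def test_num_expr(self) -> None:
    self.t_num_expr_impl(MyNumExpr)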
def test_bisect2(self) -> None:
    s = self.scheduler()
    random = RandomTable(2, rows=100_000, scheduler=s)
    stirrer = Stirrer(
        update_column="_1",
        delete_rows=100,
        # update_rows=5,
        # fixed_step_size=100,
        scheduler=s,
    )
    stirrer.input[0] = random.output.result
    t = Table(name=None, dshape="{value: string}", data={"value": [0.5]})
    min_value = Constant(table=t, scheduler=s)
    hist_index = HistogramIndex(column="_1", scheduler=s)
    hist_index.create_dependent_modules(stirrer, "result")
    bisect_ = Bisect(column="_1", op=">", hist_index=hist_index, scheduler=s)
    bisect_.input[0] = hist_index.output.result
    # bisect_.input[0] = random.output.result
    bisect_.input.limit = min_value.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = bisect_.output.result
    aio.run(s.start())
    idx = stirrer.table.eval("_1>0.5", result_object="index")
    self.assertEqual(bisect_.table.index, bitmap(idx))
def t_mix_ufunc_impl(
    self,
    cls: Type[MixUfuncABC],
    ufunc1: np.ufunc = np.log,
    ufunc2: np.ufunc = np.add,
) -> None:
    s = self.scheduler()
    random1 = RandomTable(10, rows=100000, scheduler=s)
    random2 = RandomTable(10, rows=100000, scheduler=s)
    module = cls(
        columns={"first": ["_1", "_2", "_3"], "second": ["_1", "_2", "_3"]},
        scheduler=s,
    )
    module.input.first = random1.output.result
    module.input.second = random2.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = module.output.result
    aio.run(s.start())
    first = random1.table.to_array()
    first_2 = first[:, 1]
    _ = first[:, 2]
    second = random2.table.to_array()
    _ = second[:, 1]
    second_3 = second[:, 2]
    ne_1 = ufunc2(first_2, second_3).astype("float64")
    ne_2 = ufunc1(second_3).astype("float64")
    res = module.table.to_array()
    self.assertTrue(np.allclose(res[:, 0], ne_1, equal_nan=True))
    self.assertTrue(np.allclose(res[:, 1], ne_2, equal_nan=True))
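# A hedged usage sketch for t_mix_ufunc_impl; the defaults exercise
# np.log/np.add, and other ufunc pairs can be passed explicitly as long as
# the subclass computes matching columns (MyMixUfunc is a hypothetical
# MixUfuncABC subclass):
def test_mix_ufunc(self) -> None:
    self.t_mix_ufunc_impl(MyMixUfunc)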
def test_merge_simple(self) -> None:
    s = self.scheduler()
    cst1 = Constant(
        Table(name=None, data=pd.DataFrame({"xmin": [1], "xmax": [2]})),
        scheduler=s,
    )
    cst2 = Constant(
        Table(name=None, data=pd.DataFrame({"ymin": [3], "ymax": [4]})),
        scheduler=s,
    )
    merge = Merge(left_index=True, right_index=True, scheduler=s)
    merge.input[0] = cst1.output.result
    merge.input[0] = cst2.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = merge.output.result
    aio.run(s.start())
    _ = merge.trace_stats(max_runs=1)
    # pd.set_option('display.expand_frame_repr', False)
    # print(res)
    df = merge.table
    last = df.loc[df.index[-1]]
    assert last is not None
    self.assertTrue(
        last["xmin"] == 1
        and last["xmax"] == 2
        and last["ymin"] == 3
        and last["ymax"] == 4
    )
def t_mix_ufunc_table_dict_impl(self, cls: Type[MixUfuncABC]) -> None:
    s = self.scheduler()
    random1 = RandomDict(10, scheduler=s)
    random2 = RandomTable(10, rows=100000, scheduler=s)
    module = cls(
        columns={"first": ["_1", "_2", "_3"], "second": ["_1", "_2", "_3"]},
        scheduler=s,
    )
    module.input.first = random1.output.result
    module.input.second = random2.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = module.output.result
    aio.run(s.start())
    first = list(random1.psdict.values())
    first_2 = first[1]
    _ = first[2]
    second = random2.table.to_array()
    _ = second[:, 1]
    second_3 = second[:, 2]
    ne_1 = np.add(first_2, second_3)
    ne_2 = np.log(second_3)
    res = module.table.to_array()
    self.assertTrue(np.allclose(res[:, 0], ne_1, equal_nan=True))
    self.assertTrue(np.allclose(res[:, 1], ne_2, equal_nan=True))
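# As above, a hedged usage sketch for the table/dict variant (MyMixUfunc is
# a hypothetical subclass name; this variant hard-codes np.add/np.log):
def test_mix_ufunc_table_dict(self) -> None:
    self.t_mix_ufunc_table_dict_impl(MyMixUfunc)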
def test_06_read_multiple_csv_bz2_crash_recovery(self) -> None:
    p = Process(target=run_throttled_server, args=(8000, 10**6))
    p.start()
    self._http_proc = p
    time.sleep(SLEEP)
    s = self.scheduler()
    filenames = Table(
        name="file_names",
        dshape="{filename: string}",
        data={
            "filename": [
                make_url("smallfile", ext=BZ2),
                make_url("smallfile", ext=BZ2),
            ]
        },
    )
    cst = Constant(table=filenames, scheduler=s)
    csv = CSVLoader(index_col=False, header=None, scheduler=s, timeout=0.01)
    csv.input.filenames = cst.output.result
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = csv.output.result
    aio.run(csv.start())
    _close(csv)
    self.assertEqual(len(csv.table), 60000)
def test_idxmax2(self) -> None:
    s = self.scheduler()
    random = RandomTable(10, rows=10000, throttle=1000, scheduler=s)
    stirrer = Stirrer(
        update_column="_1", delete_rows=5, fixed_step_size=100, scheduler=s
    )
    stirrer.input[0] = random.output.result
    idxmax = IdxMax(scheduler=s)
    idxmax.input[0] = stirrer.output.result
    max_ = Max(scheduler=s)
    max_.input[0] = stirrer.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = idxmax.output.result
    pr2 = Print(proc=self.terse, scheduler=s)
    pr2.input[0] = max_.output.result
    aio.run(s.start())
    max1 = max_.psdict
    # print('max1', max1)
    max_table = idxmax.max()  # renamed from `max` to avoid shadowing the builtin
    assert max_table is not None
    max2 = notNone(max_table.last()).to_dict()
    # print('max2', max2)
    self.compare(max1, max2)
def test_mv_blobs_table2(self) -> None:
    s = self.scheduler()
    sz = 100000
    blob1 = MVBlobsTable(["a", "b"], means=means, covs=covs, rows=sz, scheduler=s)
    blob1.default_step_size = 1500
    blob2 = MVBlobsTable(["a", "b"], means=means, covs=covs, rows=sz, scheduler=s)
    blob2.default_step_size = 200
    add = Add(scheduler=s)
    add.input.first = blob1.output.result
    add.input.second = blob2.output.result
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input[0] = add.output.result
    aio.run(s.start())
    # s.join()
    self.assertEqual(len(blob1.table), sz)
    self.assertEqual(len(blob2.table), sz)
    arr1 = blob1.table.to_array()
    arr2 = blob2.table.to_array()
    self.assertTrue(np.allclose(arr1, arr2))
def test_histogram1d1(self) -> None:
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=s
    )
    min_ = Min(scheduler=s)
    min_.input[0] = csv.output.result
    max_ = Max(scheduler=s)
    max_.input[0] = csv.output.result
    histogram1d = Histogram1D("_2", scheduler=s)  # columns are called 1..30
    histogram1d.input[0] = csv.output.result
    histogram1d.input.min = min_.output.result
    histogram1d.input.max = max_.output.result
    pr = Every(proc=self.terse, scheduler=s)
    pr.input[0] = histogram1d.output.result
    aio.run(s.start())
    _ = histogram1d.trace_stats()
    last = notNone(histogram1d.table.last()).to_dict()
    h1 = last["array"]
    bounds = (last["min"], last["max"])
    df = pd.read_csv(
        get_dataset("bigfile"), header=None, usecols=[2]  # type: ignore
    )
    v = df.to_numpy().reshape(-1)
    h2, _ = np.histogram(  # type: ignore
        v, bins=histogram1d.params.bins, density=False, range=bounds
    )
    self.assertListEqual(h1.tolist(), h2.tolist())
def test_hub_if_else(self) -> None:
    s = Scheduler()
    random = RandomTable(2, rows=100000, scheduler=s)
    stirrer = Stirrer(
        update_column="_1",
        delete_rows=5,
        update_rows=5,
        fixed_step_size=100,
        scheduler=s,
    )
    stirrer.input[0] = random.output.result
    switch = Switch(condition=lambda x: False, scheduler=s)
    switch.input[0] = stirrer.output.result
    max_ = Max(name="max_" + str(hash(random)), scheduler=s)
    max_.input[0] = switch.output.result
    min_ = Min(name="min_" + str(hash(random)), scheduler=s)
    min_.input[0] = switch.output.result_else
    hub = Hub(scheduler=s)
    hub.input.table = min_.output.result
    hub.input.table = max_.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = hub.output.result
    aio.run(s.start())
    res1 = stirrer.result.min()
    res2 = hub.result
    self.compare(res1, res2)
def t_histogram1d_impl(self, **kw: Any) -> None:
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=s
    )
    stirrer = Stirrer(update_column="_2", fixed_step_size=1000, scheduler=s, **kw)
    stirrer.input[0] = csv.output.result
    min_ = Min(scheduler=s)
    min_.input[0] = stirrer.output.result
    max_ = Max(scheduler=s)
    max_.input[0] = stirrer.output.result
    histogram1d = Histogram1D("_2", scheduler=s)  # columns are called 1..30
    histogram1d.input[0] = stirrer.output.result
    histogram1d.input.min = min_.output.result
    histogram1d.input.max = max_.output.result
    # pr = Print(scheduler=s)
    pr = Every(proc=self.terse, scheduler=s)
    pr.input[0] = histogram1d.output.result
    aio.run(s.start())
    _ = histogram1d.trace_stats()
    last = notNone(histogram1d.table.last()).to_dict()
    h1 = last["array"]
    bounds = (last["min"], last["max"])
    tab = stirrer.table.loc[:, ["_2"]]
    assert tab is not None
    v = tab.to_array().reshape(-1)
    h2, _ = np.histogram(  # type: ignore
        v, bins=histogram1d.params.bins, density=False, range=bounds
    )
    self.assertEqual(np.sum(h1), np.sum(h2))
    self.assertListEqual(h1.tolist(), h2.tolist())
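# The **kw of t_histogram1d_impl are forwarded to Stirrer, so each caller can
# exercise a different update pattern. A hedged sketch of two callers (the
# parameter values are illustrative, not taken from the original tests):
def test_histogram1d_with_deletes(self) -> None:
    self.t_histogram1d_impl(delete_rows=5)

def test_histogram1d_with_updates(self) -> None:
    self.t_histogram1d_impl(update_rows=5)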
def test_01_read_http_csv_with_crash_and_counter(self) -> None:
    self._http_srv = _HttpSrv()
    tag = self.get_tag()
    s = self.scheduler()
    url = make_url("bigfile")
    module = CSVLoader(
        url, index_col=False, recovery_tag=tag, header=None, scheduler=s
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    sts = sleep_then_stop(s, 2)
    aio.run_gather(s.start(), sts)
    self._http_srv.restart()
    s = self.scheduler(clean=True)
    csv = CSVLoader(
        url,
        recovery=True,
        index_col=False,
        recovery_tag=tag,
        header=None,
        scheduler=s,
    )
    counter = Counter(scheduler=s)
    counter.input[0] = csv.output.result
    self.assertTrue(csv.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = counter.output.result
    aio.run(s.start())
    self.assertEqual(len(csv.table), 1000000)
    self.assertEqual(counter.table["counter"].loc[0], 1000000)
def test_mb_k_means(self) -> None:
    s = self.scheduler()
    n_clusters = 3
    try:
        dataset = (get_dataset("cluster:s3"),)
    except TimeoutError:
        print("Cannot download cluster:s3")
        return
    with s:
        csv = CSVLoader(
            dataset,
            sep=" ",
            skipinitialspace=True,
            header=None,
            index_col=False,
            scheduler=s,
        )
        km = MBKMeans(
            n_clusters=n_clusters,
            random_state=42,
            is_input=False,
            is_greedy=False,
            scheduler=s,
        )
        # km.input.table = csv.output.result
        km.create_dependent_modules(csv)
        pr = Print(proc=self.terse, scheduler=s)
        pr.input[0] = km.output.result
        e = Every(proc=self.terse, scheduler=s)
        e.input[0] = km.output.labels
    aio.run(s.start())
    labels = km.labels()
    assert labels is not None
    self.assertEqual(len(csv.table), len(labels))
def test_09_read_multi_csv_file_with_crash(self) -> None:
    s = self.scheduler()
    tag = "t9"
    file_list = [get_dataset("bigfile"), get_dataset("bigfile")]
    module = CSVLoader(
        file_list, index_col=False, recovery_tag=tag, header=None, scheduler=s
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    sts = sleep_then_stop(s, 3)
    aio.run_gather(s.start(), sts)
    _close(module)
    s = self.scheduler(clean=True)
    module = CSVLoader(
        file_list,
        recovery=True,
        recovery_tag=tag,
        index_col=False,
        header=None,
        scheduler=s,
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    aio.run(s.start())
    self.assertEqual(len(module.table), 2000000)
def test_paste(self) -> None:
    s = self.scheduler()
    random = RandomTable(10, rows=10000, scheduler=s)
    min_1 = Min(name="min_1" + str(hash(random)), scheduler=s, columns=["_1"])
    min_1.input[0] = random.output.result
    d2t_1 = Dict2Table(scheduler=s)
    d2t_1.input.dict_ = min_1.output.result
    min_2 = Min(name="min_2" + str(hash(random)), scheduler=s, columns=["_2"])
    min_2.input[0] = random.output.result
    d2t_2 = Dict2Table(scheduler=s)
    d2t_2.input.dict_ = min_2.output.result
    bj = Paste(scheduler=s)
    bj.input.first = d2t_1.output.result
    bj.input.second = d2t_2.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = bj.output.result
    aio.run(s.start())
    res1 = random.table.min()
    res2 = notNone(bj.table.last()).to_dict()
    self.assertAlmostEqual(res1["_1"], res2["_1"])
    self.assertAlmostEqual(res1["_2"], res2["_2"])
def test_join(self) -> None:
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=s
    )
    stat1 = Stats(1, reset_index=True, scheduler=s)
    stat1.input[0] = csv.output.result
    stat2 = Stats(2, reset_index=True, scheduler=s)
    stat2.input[0] = csv.output.result
    # join = Join(scheduler=s)
    # reduce_ = Reduce(BinJoin, "first", "second", "table", scheduler=s)
    # reduce_.input[0] = stat1.output.stats
    # reduce_.input[0] = stat2.output.stats
    # join = reduce_.expand()
    join = Reduce.expand(
        BinJoin,
        "first",
        "second",
        "table",
        [stat1.output.stats, stat2.output.stats],
        scheduler=s,
    )
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = join.output.result
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input[0] = csv.output.result
    aio.run(s.start())
    res = join.trace_stats(max_runs=1)
    print(res)
def test_dataflow_1_dynamic(self) -> None:
    scheduler = self.scheduler(clean=True)
    table = RandomTable(
        name="table", columns=["a"], throttle=1000, scheduler=scheduler
    )
    m = Min(name="min", scheduler=scheduler)
    prt = Print(proc=self.terse, name="print_min", scheduler=scheduler)
    m.input.table = table.output.result
    prt.input.df = m.output.result
    started = False

    def proc(x: Any) -> None:
        nonlocal started
        print("proc max called")
        started = True

    async def _add_max(scheduler: Scheduler, run_number: int) -> None:
        with scheduler:
            print("adding new modules")
            m = Max(name="max", scheduler=scheduler)
            prt = Print(name="print_max", proc=proc, scheduler=scheduler)
            m.input.table = table.output.result
            prt.input.df = m.output.result

    scheduler.on_loop(_add_max, 5)  # run the function after 5 loops
    scheduler.on_loop(self._stop, 10)
    aio.run(scheduler.start())
    self.assertTrue(started)
def test_piped_load_csv2(self) -> None:
    """Connecting modules via the pipe operator (only one pipe)"""
    with Scheduler.default:
        ret = (
            PipedInput(get_dataset("bigfile"))
            | pv.load_csv(index_col=False, header=None)
            | pv.min()
            | pv.echo(proc=prtm).repipe("csv_loader_1")
            | pv.max()
            | pv.echo(proc=prtM).repipe("max_1", out="_trace")
            | pv.echo(proc=prtT)
        )
        m = ret.fetch("min_1")
        M = ret.fetch("max_1")
        csv = ret.fetch("csv_loader_1")
        self.assertEqual(csv.scheduler(), csv.module.scheduler())
    aio.run(csv.scheduler().start())
    table = csv.table
    lastm = m.table
    lastM = M.table
    self.assertEqual(len(table), 1000000)
    for col in table.columns:
        # print('testing column %s'%col)
        c = table[col]
        v = c.min()
        self.assertEqual(v, lastm[col])
        v = c.max()
        self.assertEqual(v, lastM[col])
def test_dataflow_2_add_remove(self) -> None:
    scheduler = self.scheduler(clean=True)
    table = RandomTable(
        name="table", columns=["a"], throttle=1000, scheduler=scheduler
    )
    m = Min(name="min", scheduler=scheduler)
    prt = Print(proc=self.terse, name="print_min", scheduler=scheduler)
    m.input.table = table.output.result
    prt.input.df = m.output.result
    started = False

    def proc(x: Any) -> None:
        nonlocal started
        print("proc max called")
        started = True

    async def _add_max_remove_min(scheduler: Scheduler, run_number: int) -> None:
        with scheduler as dataflow:
            print("adding new modules")
            m = Max(name="max", scheduler=scheduler)
            prt = Print(name="print_max", proc=proc, scheduler=scheduler)
            m.input.table = table.output.result
            prt.input.df = m.output.result
            print("removing min module")
            dataflow.delete_modules("min", "print_min")

    # t = _add_max_remove_min(csv, scheduler, proc=proc)
    scheduler.on_loop(_add_max_remove_min, 5)
    scheduler.on_loop(self._stop, 10)
    aio.run(scheduler.start())
    self.assertTrue(started)
def test_ldexp(self) -> None:
    cls, ufunc, mod_name = ColsLdexp, np.ldexp, "cols_ldexp_"
    print("Testing", mod_name)
    s = self.scheduler()
    cols = 10
    random = RandomTable(
        cols,
        rows=10_000,
        scheduler=s,
        random=lambda x: np.random.randint(10, size=x),  # type: ignore
        dtype="int64",
    )
    module = cls(
        first=["_3", "_5", "_7"],
        second=["_4", "_6", "_8"],
        cols_out=["x", "y", "z"],
        scheduler=s,
    )
    module.input[0] = random.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = module.output.result
    aio.run(s.start())
    self.assertListEqual(module.table.columns, ["x", "y", "z"])
    arr = random.table.to_array()
    res1 = ufunc(arr[:, [2, 4, 6]], arr[:, [3, 5, 7]])
    res2 = module.table.to_array()
    self.assertTrue(module.name.startswith(mod_name))
    self.assertTrue(np.allclose(res1, res2, equal_nan=True))
def test_blobs_table2(self) -> None:
    s = self.scheduler()
    sz = 100000
    centers = [(0.1, 0.3), (0.7, 0.5), (-0.4, -0.3)]
    blob1 = BlobsTable(
        ["a", "b"], centers=centers, cluster_std=0.2, rows=sz, scheduler=s
    )
    blob1.default_step_size = 1500
    blob2 = BlobsTable(
        ["a", "b"], centers=centers, cluster_std=0.2, rows=sz, scheduler=s
    )
    blob2.default_step_size = 200
    add = Add(scheduler=s)
    add.input.first = blob1.output.result
    add.input.second = blob2.output.result
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input[0] = add.output.result
    aio.run(s.start())
    # s.join()
    self.assertEqual(len(blob1.table), sz)
    self.assertEqual(len(blob2.table), sz)
    arr1 = blob1.table.to_array()
    arr2 = blob2.table.to_array()
    self.assertTrue(np.allclose(arr1, arr2))
def _t_impl(self, cls: Type[TableModule], ufunc: np.ufunc, mod_name: str) -> None:
    print("Testing", mod_name)
    s = self.scheduler()
    random1 = RandomTable(
        3,
        rows=100_000,
        scheduler=s,
        random=lambda x: np.random.randint(10, size=x),  # type: ignore
        dtype="int64",
    )
    random2 = RandomTable(
        3,
        rows=100_000,
        scheduler=s,
        random=lambda x: np.random.randint(10, size=x),  # type: ignore
        dtype="int64",
    )
    module = cls(scheduler=s)
    module.input.first = random1.output.result
    module.input.second = random2.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = module.output.result
    aio.run(s.start())
    res1 = ufunc(random1.table.to_array(), random2.table.to_array())
    res2 = module.table.to_array()
    self.assertTrue(module.name.startswith(mod_name))
    self.assertTrue(np.allclose(res1, res2, equal_nan=True))
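# _t_impl parametrizes one test per element-wise binary module. A hedged
# sketch of a caller (it assumes the Add module's generated names start with
# "add_", mirroring the "binary_" check earlier in this section):
def test_add_impl(self) -> None:
    self._t_impl(Add, np.add, "add_")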
def t_histogram2d_impl(self, **kw: Any) -> None:
    s = self.scheduler()
    random = RandomTable(3, rows=100000, scheduler=s)
    stirrer = Stirrer(update_column="_2", fixed_step_size=1000, scheduler=s, **kw)
    stirrer.input[0] = random.output.result
    min_ = Min(scheduler=s)
    min_.input[0] = stirrer.output.result
    max_ = Max(scheduler=s)
    max_.input[0] = stirrer.output.result
    histogram2d = Histogram2D(
        0, 1, xbins=100, ybins=100, scheduler=s
    )  # columns are called 1..30
    histogram2d.input[0] = stirrer.output.result
    histogram2d.input.min = min_.output.result
    histogram2d.input.max = max_.output.result
    heatmap = Heatmap(filename="histo_%03d.png", scheduler=s)
    heatmap.input.array = histogram2d.output.result
    pr = Every(proc=self.terse, scheduler=s)
    pr.input[0] = heatmap.output.result
    aio.run(s.start())
    last = notNone(histogram2d.table.last()).to_dict()
    h1 = last["array"]
    bounds = [[last["ymin"], last["ymax"]], [last["xmin"], last["xmax"]]]
    t = stirrer.table.loc[:, ["_1", "_2"]]
    assert t is not None
    v = t.to_array()
    bins = [histogram2d.params.ybins, histogram2d.params.xbins]
    h2 = fh.histogram2d(v[:, 1], v[:, 0], bins=bins, range=bounds)
    h2 = np.flip(h2, axis=0)  # type: ignore
    self.assertEqual(np.sum(h1), np.sum(h2))
    self.assertListEqual(h1.reshape(-1).tolist(), h2.reshape(-1).tolist())
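# Like the 1-D variant, t_histogram2d_impl forwards **kw to Stirrer. A hedged
# sketch of a caller (the parameter value is illustrative):
def test_histogram2d_with_deletes(self) -> None:
    self.t_histogram2d_impl(delete_rows=5)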