def t_histogram2d_impl(self, **kw: Any) -> None:
    s = self.scheduler()
    random = RandomTable(3, rows=100000, scheduler=s)
    stirrer = Stirrer(update_column="_2", fixed_step_size=1000, scheduler=s, **kw)
    stirrer.input[0] = random.output.result
    min_ = Min(scheduler=s)
    min_.input[0] = stirrer.output.result
    max_ = Max(scheduler=s)
    max_.input[0] = stirrer.output.result
    histogram2d = Histogram2D(
        0, 1, xbins=100, ybins=100, scheduler=s
    )  # RandomTable columns are named _1.._3
    histogram2d.input[0] = stirrer.output.result
    histogram2d.input.min = min_.output.result
    histogram2d.input.max = max_.output.result
    heatmap = Heatmap(filename="histo_%03d.png", scheduler=s)
    heatmap.input.array = histogram2d.output.result
    pr = Every(proc=self.terse, scheduler=s)
    pr.input[0] = heatmap.output.result
    aio.run(s.start())
    last = notNone(histogram2d.table.last()).to_dict()
    h1 = last["array"]
    bounds = [[last["ymin"], last["ymax"]], [last["xmin"], last["xmax"]]]
    t = stirrer.table.loc[:, ["_1", "_2"]]
    assert t is not None
    v = t.to_array()
    bins = [histogram2d.params.ybins, histogram2d.params.xbins]
    h2 = fh.histogram2d(v[:, 1], v[:, 0], bins=bins, range=bounds)
    h2 = np.flip(h2, axis=0)  # type: ignore
    self.assertEqual(np.sum(h1), np.sum(h2))
    self.assertListEqual(h1.reshape(-1).tolist(), h2.reshape(-1).tolist())

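# The helper above takes **kw so that concrete tests can exercise the same
# pipeline while the Stirrer deletes or updates rows. The wrappers below are
# a sketch of such callers; the `delete_rows` and `update_rows` keyword names
# are assumptions about Stirrer's API, not confirmed by this excerpt.
def test_histogram2d_delete(self) -> None:
    self.t_histogram2d_impl(delete_rows=5)

def test_histogram2d_update(self) -> None:
    self.t_histogram2d_impl(update_rows=5)
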
def test_histogram1d1(self) -> None:
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=s
    )
    min_ = Min(scheduler=s)
    min_.input[0] = csv.output.result
    max_ = Max(scheduler=s)
    max_.input[0] = csv.output.result
    histogram1d = Histogram1D("_2", scheduler=s)  # columns are called 1..30
    histogram1d.input[0] = csv.output.result
    histogram1d.input.min = min_.output.result
    histogram1d.input.max = max_.output.result
    pr = Every(proc=self.terse, scheduler=s)
    pr.input[0] = histogram1d.output.result
    aio.run(s.start())
    _ = histogram1d.trace_stats()
    last = notNone(histogram1d.table.last()).to_dict()
    h1 = last["array"]
    bounds = (last["min"], last["max"])
    df = pd.read_csv(
        get_dataset("bigfile"), header=None, usecols=[2]  # type: ignore
    )
    v = df.to_numpy().reshape(-1)
    h2, _ = np.histogram(  # type: ignore
        v, bins=histogram1d.params.bins, density=False, range=bounds
    )
    self.assertListEqual(h1.tolist(), h2.tolist())

def test_mb_k_means(self) -> None:
    s = self.scheduler()
    n_clusters = 3
    try:
        dataset = get_dataset("cluster:s3")
    except TimeoutError:
        print("Cannot download cluster:s3")
        return
    with s:
        csv = CSVLoader(
            dataset,
            sep=" ",
            skipinitialspace=True,
            header=None,
            index_col=False,
            scheduler=s,
        )
        km = MBKMeans(
            n_clusters=n_clusters,
            random_state=42,
            is_input=False,
            is_greedy=False,
            scheduler=s,
        )
        # km.input.table = csv.output.result
        km.create_dependent_modules(csv)
        pr = Print(proc=self.terse, scheduler=s)
        pr.input[0] = km.output.result
        e = Every(proc=self.terse, scheduler=s)
        e.input[0] = km.output.labels
    aio.run(s.start())
    labels = km.labels()
    assert labels is not None
    self.assertEqual(len(csv.table), len(labels))

def test_join(self) -> None:
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=s
    )
    stat1 = Stats(1, reset_index=True, scheduler=s)
    stat1.input[0] = csv.output.result
    stat2 = Stats(2, reset_index=True, scheduler=s)
    stat2.input[0] = csv.output.result
    # join = Join(scheduler=s)
    # reduce_ = Reduce(BinJoin, "first", "second", "table", scheduler=s)
    # reduce_.input[0] = stat1.output.stats
    # reduce_.input[0] = stat2.output.stats
    # join = reduce_.expand()
    join = Reduce.expand(
        BinJoin,
        "first",
        "second",
        "table",
        [stat1.output.stats, stat2.output.stats],
        scheduler=s,
    )
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = join.output.result
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input[0] = csv.output.result
    aio.run(s.start())
    res = join.trace_stats(max_runs=1)
    print(res)

def test_histogram2d1(self) -> None:
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=s
    )
    min_ = Min(scheduler=s)
    min_.input[0] = csv.output.result
    max_ = Max(scheduler=s)
    max_.input[0] = csv.output.result
    histogram2d = Histogram2D(
        1, 2, xbins=100, ybins=100, scheduler=s
    )  # columns are called 1..30
    histogram2d.input[0] = csv.output.result
    histogram2d.input.min = min_.output.result
    histogram2d.input.max = max_.output.result
    heatmap = Heatmap(filename="histo_%03d.png", scheduler=s)
    heatmap.input.array = histogram2d.output.result
    pr = Every(proc=self.terse, scheduler=s)
    pr.input[0] = heatmap.output.result
    aio.run(csv.scheduler().start())
    last = notNone(histogram2d.table.last()).to_dict()
    h1 = last["array"]
    bounds = [[last["ymin"], last["ymax"]], [last["xmin"], last["xmax"]]]
    df = pd.read_csv(
        get_dataset("bigfile"), header=None, usecols=[1, 2]  # type: ignore
    )
    v = df.to_numpy()  # .reshape(-1, 2)
    bins = [histogram2d.params.ybins, histogram2d.params.xbins]
    h2 = fh.histogram2d(v[:, 1], v[:, 0], bins=bins, range=bounds)
    h2 = np.flip(h2, axis=0)  # type: ignore
    self.assertTrue(np.allclose(h1, h2))

def test_blobs_table2(self) -> None:
    s = self.scheduler()
    sz = 100000
    centers = [(0.1, 0.3), (0.7, 0.5), (-0.4, -0.3)]
    blob1 = BlobsTable(
        ["a", "b"], centers=centers, cluster_std=0.2, rows=sz, scheduler=s
    )
    blob1.default_step_size = 1500
    blob2 = BlobsTable(
        ["a", "b"], centers=centers, cluster_std=0.2, rows=sz, scheduler=s
    )
    blob2.default_step_size = 200
    add = Add(scheduler=s)
    add.input.first = blob1.output.result
    add.input.second = blob2.output.result
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input[0] = add.output.result
    aio.run(s.start())
    # s.join()
    self.assertEqual(len(blob1.table), sz)
    self.assertEqual(len(blob2.table), sz)
    arr1 = blob1.table.to_array()
    arr2 = blob2.table.to_array()
    self.assertTrue(np.allclose(arr1, arr2))

def test_mv_blobs_table2(self) -> None:
    s = self.scheduler()
    sz = 100000
    blob1 = MVBlobsTable(["a", "b"], means=means, covs=covs, rows=sz, scheduler=s)
    blob1.default_step_size = 1500
    blob2 = MVBlobsTable(["a", "b"], means=means, covs=covs, rows=sz, scheduler=s)
    blob2.default_step_size = 200
    add = Add(scheduler=s)
    add.input.first = blob1.output.result
    add.input.second = blob2.output.result
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input[0] = add.output.result
    aio.run(s.start())
    # s.join()
    self.assertEqual(len(blob1.table), sz)
    self.assertEqual(len(blob2.table), sz)
    arr1 = blob1.table.to_array()
    arr2 = blob2.table.to_array()
    self.assertTrue(np.allclose(arr1, arr2))

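# NB: `means` and `covs` are module-level fixtures defined outside this
# excerpt: per-cluster mean vectors and covariance matrices fed to
# MVBlobsTable. Purely illustrative values for three 2-D Gaussian components
# might look like:
#   means = [[0.1, 0.3], [0.7, 0.5], [-0.4, -0.3]]
#   covs = [[[0.01, 0.0], [0.0, 0.09]]] * 3
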
def t_histogram1d_impl(self, **kw: Any) -> None:
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=s
    )
    stirrer = Stirrer(update_column="_2", fixed_step_size=1000, scheduler=s, **kw)
    stirrer.input[0] = csv.output.result
    min_ = Min(scheduler=s)
    min_.input[0] = stirrer.output.result
    max_ = Max(scheduler=s)
    max_.input[0] = stirrer.output.result
    histogram1d = Histogram1D("_2", scheduler=s)  # columns are called 1..30
    histogram1d.input[0] = stirrer.output.result
    histogram1d.input.min = min_.output.result
    histogram1d.input.max = max_.output.result
    # pr = Print(scheduler=s)
    pr = Every(proc=self.terse, scheduler=s)
    pr.input[0] = histogram1d.output.result
    aio.run(s.start())
    _ = histogram1d.trace_stats()
    last = notNone(histogram1d.table.last()).to_dict()
    h1 = last["array"]
    bounds = (last["min"], last["max"])
    tab = stirrer.table.loc[:, ["_2"]]
    assert tab is not None
    v = tab.to_array().reshape(-1)
    h2, _ = np.histogram(  # type: ignore
        v, bins=histogram1d.params.bins, density=False, range=bounds
    )
    self.assertEqual(np.sum(h1), np.sum(h2))
    self.assertListEqual(h1.tolist(), h2.tolist())

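# As with the 2-D variant above, this helper is meant to be driven by concrete
# tests that pass Stirrer kwargs; a sketch of such callers (same caveat: the
# keyword names are assumed, not confirmed by this excerpt):
def test_histogram1d_delete(self) -> None:
    self.t_histogram1d_impl(delete_rows=5)

def test_histogram1d_update(self) -> None:
    self.t_histogram1d_impl(update_rows=5)
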
def test_random_table(self) -> None:
    s = self.scheduler()
    module = RandomTable(["a", "b"], rows=10000, scheduler=s)
    self.assertEqual(module.table.columns[0], "a")
    self.assertEqual(module.table.columns[1], "b")
    self.assertEqual(len(module.table.columns), 2)  # add the UPDATE_COLUMN
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input[0] = module.output.result
    aio.run(s.start())
    # s.join()
    self.assertEqual(len(module.table), 10000)

def NOtest_vec_distances(self) -> None:
    s = self.scheduler()
    vec = VECLoader(get_dataset("warlogs"), scheduler=s)
    # dis = PairwiseDistances(metric="cosine", scheduler=s)
    # dis.input[0] = vec.output.df
    # dis.input.array = vec.output.array
    cnt = Every(proc=self.terse, constant_time=True, scheduler=s)
    # cnt.input[0] = dis.output.dist
    cnt.input[0] = vec.output.result
    global times
    times = 0
    aio.run(s.start())
    _ = vec.result

def test_csv_distances(self) -> None:
    s = self.scheduler()
    vec = CSVLoader(
        get_dataset("smallfile"), index_col=False, header=None, scheduler=s
    )
    # dis = PairwiseDistances(metric="euclidean", scheduler=s)
    # dis.input[0] = vec.output.df
    cnt = Every(proc=self.terse, constant_time=True, scheduler=s)
    # cnt.input[0] = dis.output.dist
    cnt.input[0] = vec.output.result
    global times
    times = 0
    aio.run(s.start(ten_times))
    _ = vec.result

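# `ten_times` (passed to s.start above) and the global `times` counter are
# defined at module level, outside this excerpt. A minimal sketch, assuming
# s.start() accepts an async per-run callback taking the scheduler and the
# run number, and that Scheduler.stop() is awaitable:
async def ten_times(scheduler: Scheduler, run_number: int) -> None:
    global times
    times += 1
    if times >= 10:
        await scheduler.stop()
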
def test_blobs_table(self) -> None:
    s = self.scheduler()
    module = BlobsTable(["a", "b"], centers=centers, rows=10000, scheduler=s)
    self.assertEqual(module.table.columns[0], "a")
    self.assertEqual(module.table.columns[1], "b")
    self.assertEqual(len(module.table.columns), 2)
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input[0] = module.output.result
    aio.run(s.start())
    # s.join()
    self.assertEqual(len(module.table), 10000)

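# `centers` here is a module-level fixture (the list of cluster centers for
# BlobsTable) defined outside this excerpt; test_blobs_table2 above builds the
# same kind of list locally, e.g. [(0.1, 0.3), (0.7, 0.5), (-0.4, -0.3)].
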
def test_last_row(self) -> None:
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset("smallfile"), index_col=False, header=None, scheduler=s
    )
    lr1 = LastRow(scheduler=s)
    lr1.input[0] = csv.output.result
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input[0] = lr1.output.result
    aio.run(s.start())
    df = csv.table
    res = lr1.table
    assert res is not None
    self.assertEqual(res.at[0, "_1"], notNone(df.last())["_1"])

def test_join(self):
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset('bigfile'), index_col=False, header=None, scheduler=s
    )
    stat1 = Stats(1, reset_index=True, scheduler=s)
    stat1.input.table = csv.output.table
    stat2 = Stats(2, reset_index=True, scheduler=s)
    stat2.input.table = csv.output.table
    stat3 = Stats(3, reset_index=True, scheduler=s)
    stat3.input.table = csv.output.table
    # join = Join(scheduler=s)
    reduce_ = Reduce(BinJoin, "first", "second", "table", scheduler=s)
    reduce_.input.table = stat1.output.stats
    reduce_.input.table = stat2.output.stats
    join = reduce_.expand()
    pr = Print(proc=self.terse, scheduler=s)
    pr.input.df = join.output.table
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input.df = csv.output.table
    s.start()
    res = join.trace_stats(max_runs=1)
    print(res)

def test_histogram2d(self):
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset('bigfile'), index_col=False, header=None, scheduler=s
    )
    min_ = Min(scheduler=s)
    min_.input.table = csv.output.table
    max_ = Max(scheduler=s)
    max_.input.table = csv.output.table
    histogram2d = Histogram2D(
        1, 2, xbins=100, ybins=100, scheduler=s
    )  # columns are called 1..30
    histogram2d.input.table = csv.output.table
    histogram2d.input.min = min_.output.table
    histogram2d.input.max = max_.output.table
    heatmap = Heatmap(filename='histo_%03d.png', scheduler=s)
    heatmap.input.array = histogram2d.output.table
    # pr = Print(scheduler=s)
    pr = Every(proc=self.terse, scheduler=s)
    # pr.input.df = heatmap.output.heatmap
    # pr.input.df = histogram2d.output.df
    pr.input.df = csv.output.table
    csv.scheduler().start()
    s.join()
    # self.scheduler.thread.join()
    _ = histogram2d.trace_stats()

def test_module(self) -> None:
    # pylint: disable=broad-except
    s = self.scheduler()
    with self.assertRaises(TypeError):  # abstract base class
        module = Module(name="a", scheduler=s)  # type: ignore
    with s:
        module = Every(proc=self.terse, name="a", scheduler=s)
        self.assertEqual(module.name, "a")
        self.assertEqual(s.exists("a"), True)
        self.assertEqual(module.get_progress(), (0, 0))
        # creating a second module with a duplicate name must fail
        with self.assertRaises(ProgressiveError):
            SimpleModule(name="a", scheduler=s)
        mod2 = SimpleModule(name="b", scheduler=s)
        self.assertEqual(mod2.get_progress(), (0, 0))
        self.assertTrue(module.is_valid())
        self.assertFalse(module.is_visualization())
        self.assertIsNone(module.get_visualization())
        self.assertIsNone(module.get_data("error"))
        self.assertEqual(module.last_time(), 0)
        module.debug = True
        self.assertEqual(module.params.debug, True)
        module.set_current_params({"quantum": 2.0})
        self.assertEqual(module.params.quantum, 2.0)
        params = module.get_data("_params")
        self.assertIsInstance(params, Table)
        del s["a"]
        self.assertEqual(s.exists("a"), False)

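# `SimpleModule` (used above) is a small test helper defined elsewhere in the
# file. A minimal sketch, assuming a Module subclass only needs to override
# run_step and that the tests never actually step it; the body below is
# illustrative, not the actual helper:
class SimpleModule(Module):
    def run_step(self, run_number: int, step_size: int, howlong: float) -> Any:
        raise NotImplementedError
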
def test_scatterplot2(self):
    s = self.scheduler()
    random = RandomTable(2, rows=2000000, scheduler=s)
    sp = MCScatterPlot(
        scheduler=s, classes=[('Scatterplot', '_1', '_2')], approximate=True
    )
    sp.create_dependent_modules(random, 'table', with_sampling=False)
    cnt = Every(proc=self.terse, constant_time=True, scheduler=s)
    cnt.input.df = random.output.table
    prt = Print(proc=self.terse, scheduler=s)
    prt.input.df = sp.output.table
    decorate(s, VariablePatch1("variable_1"))
    decorate(s, VariablePatch2("variable_2"))
    decorate(s, ScatterPlotPatch("mc_scatter_plot_1"))
    sp.scheduler().start(idle_proc=idle_proc)
    s.join()
    js = sp.to_json()
    x, y, _ = zip(*js['sample']['data'])
    min_x = min(x)
    max_x = max(x)
    min_y = min(y)
    max_y = max(y)
    self.assertGreaterEqual(min_x, LOWER_X)
    self.assertGreaterEqual(min_y, LOWER_Y)
    self.assertLessEqual(max_x, UPPER_X)
    self.assertLessEqual(max_y, UPPER_Y)

def test_random_table2(self) -> None:
    s = self.scheduler()
    # produces more than 4M rows per second on my laptop
    module = RandomTable(10, rows=1000000, force_valid_ids=True, scheduler=s)
    self.assertEqual(len(module.table.columns), 10)  # add the UPDATE_COLUMN
    self.assertEqual(module.table.columns[0], "_1")
    self.assertEqual(module.table.columns[1], "_2")
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input[0] = module.output.result
    aio.run(s.start())
    # s.join()
    self.assertEqual(len(module.table), 1000000)

def test_module(self):
    # pylint: disable=broad-except
    s = self.scheduler()
    with self.assertRaises(TypeError):  # abstract base class
        module = Module(name='a', scheduler=s)
    module = Every(proc=self.terse, name='a', scheduler=s)
    self.assertEqual(module.name, 'a')
    self.assertEqual(s.exists('a'), True)
    self.assertEqual(module.get_progress(), (0, 0))
    with self.assertRaises(ProgressiveError):
        module = SimpleModule(name='a', scheduler=s)
    mod2 = SimpleModule(name='b', scheduler=s)
    self.assertEqual(mod2.get_progress(), (0, 0))
    module.debug = True
    self.assertEqual(module.params.debug, True)
    module.set_current_params({'quantum': 2.0})
    self.assertEqual(module.params.quantum, 2.0)
    params = module.get_data("_params")
    self.assertIsInstance(params, Table)
    module.destroy()
    self.assertEqual(s.exists('a'), False)
    module.describe()
    json = module.to_json(short=True)
    self.assertEqual(json.get('is_running'), False)
    self.assertEqual(json.get('is_terminated'), False)
    json = module.to_json(short=False)
    self.assertEqual(json.get('start_time', 0), None)
    # maybe check others
    self.assertFalse(module.has_any_output())

def test_histogram1d(self) -> None:
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=s
    )
    min_ = Min(scheduler=s)
    min_.input[0] = csv.output.result
    max_ = Max(scheduler=s)
    max_.input[0] = csv.output.result
    histogram1d = Histogram1D("_2", scheduler=s)  # columns are called 1..30
    histogram1d.input[0] = csv.output.result
    histogram1d.input.min = min_.output.result
    histogram1d.input.max = max_.output.result
    pr = Every(proc=self.terse, scheduler=s)
    pr.input[0] = histogram1d.output.result
    aio.run(s.start())
    _ = histogram1d.trace_stats()

def test_percentile(self) -> None:
    s = self.scheduler()
    csv_module = CSVLoader(
        get_dataset("smallfile"), index_col=False, header=None, scheduler=s
    )
    module = Percentiles(
        "_1",
        name="test_percentile",
        percentiles=[0.1, 0.25, 0.5, 0.75, 0.9],
        scheduler=s,
    )
    module.input[0] = csv_module.output.result
    prt = Every(proc=self.terse, name="print", scheduler=s)
    prt.input[0] = module.output.result
    aio.run(s.start())

def test_select(self) -> None:
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"),
        index_col=False,
        header=None,
        force_valid_ids=True,
        scheduler=s,
    )
    cst = Constant(PsDict({"query": ["_1 < 0.5"]}), scheduler=s)
    q = Select(scheduler=s)
    q.input[0] = csv.output.df
    q.input.query = cst.output.df
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input[0] = q.output.df
    aio.run(s.start())
    self.assertTrue(len(q.table) < 1000000)

def test_merge(self) -> None:
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=s
    )
    stat1 = Stats(1, scheduler=s)
    stat1.input[0] = csv.output.result
    stat2 = Stats(2, scheduler=s)
    stat2.input[0] = csv.output.result
    merge = Merge(left_index=True, right_index=True, scheduler=s)
    merge.input[0] = stat1.output.result
    merge.input[0] = stat2.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = merge.output.result
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input[0] = csv.output.result
    aio.run(s.start())
    _ = merge.trace_stats(max_runs=1)

def test_scatterplot2(self) -> None:
    s = self.scheduler(clean=True)
    with s:
        random = RandomTable(2, rows=2000000, throttle=1000, scheduler=s)
        sp = MCScatterPlot(
            scheduler=s, classes=[("Scatterplot", "_1", "_2")], approximate=True
        )
        sp.create_dependent_modules(random, "result", with_sampling=False)
        cnt = Every(proc=self.terse, constant_time=True, scheduler=s)
        cnt.input[0] = random.output.result
        prt = Print(proc=self.terse, scheduler=s)
        prt.input[0] = sp.output.result

    async def fake_input_1(scheduler: Scheduler, rn: int) -> None:
        module = scheduler["dyn_var_1"]
        print("from input dyn_var_1")
        await module.from_input({"x": LOWER_X, "y": LOWER_Y})

    async def fake_input_2(scheduler: Scheduler, rn: int) -> None:
        module = scheduler["dyn_var_2"]
        print("from input dyn_var_2")
        await module.from_input({"x": UPPER_X, "y": UPPER_Y})

    # finp1 = fake_input(s, "dyn_var_1", 6, {"x": LOWER_X, "y": LOWER_Y})
    # finp2 = fake_input(s, "dyn_var_2", 6, {"x": UPPER_X, "y": UPPER_Y})
    # sts = sleep_then_stop(s, 10)
    s.on_loop(self._stop, 10)
    # s.on_loop(prt)
    s.on_loop(fake_input_1, 3)
    s.on_loop(fake_input_2, 3)
    # aio.run_gather(sp.scheduler().start(), sts)
    aio.run(s.start())
    js = sp.to_json()
    x, y, _ = zip(*js["sample"]["data"])
    min_x = min(x)
    max_x = max(x)
    min_y = min(y)
    max_y = max(y)
    self.assertGreaterEqual(min_x, LOWER_X)
    self.assertGreaterEqual(min_y, LOWER_Y)
    self.assertLessEqual(max_x, UPPER_X)
    self.assertLessEqual(max_y, UPPER_Y)

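# `self._stop` (scheduled above with s.on_loop) is not shown in this excerpt.
# A minimal sketch, assuming on_loop callbacks receive the scheduler and the
# run number (like fake_input_1/2 above) and that Scheduler.stop() is
# awaitable:
async def _stop(self, scheduler: Scheduler, run_number: int) -> None:
    await scheduler.stop()
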
def test_join(self) -> None:
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=s
    )
    stat1 = Stats(1, reset_index=True, scheduler=s)
    stat1.input[0] = csv.output.result
    stat2 = Stats(2, reset_index=True, scheduler=s)
    stat2.input[0] = csv.output.result
    join = Join(scheduler=s)
    join.input[0] = stat1.output.result
    join.input[0] = stat2.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = join.output.result
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input[0] = csv.output.result
    aio.run(s.start())
    res = join.trace_stats(max_runs=1)
    print(res)

def test_random_table(self):
    s = self.scheduler()
    module = RandomTable(['a', 'b'], rows=10000, scheduler=s)
    self.assertEqual(module.table().columns[0], 'a')
    self.assertEqual(module.table().columns[1], 'b')
    self.assertEqual(len(module.table().columns), 2)  # add the UPDATE_COLUMN
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input.df = module.output.table
    s.start()
    s.join()
    self.assertEqual(len(module.table()), 10000)

def test_percentile(self):
    s = self.scheduler()
    csv_module = CSVLoader(
        get_dataset('smallfile'), index_col=False, header=None, scheduler=s
    )
    module = Percentiles(
        '_1',
        name='test_percentile',
        percentiles=[0.1, 0.25, 0.5, 0.75, 0.9],
        scheduler=s,
    )
    module.input.table = csv_module.output.table
    prt = Every(proc=self.terse, name='print', scheduler=s)
    prt.input.df = module.output.percentiles
    s.start()
    s.join()

def test_random_table2(self):
    s = self.scheduler()
    # produces more than 4M rows per second on my laptop
    module = RandomTable(10, rows=1000000, force_valid_ids=True, scheduler=s)
    self.assertEqual(len(module.table().columns), 10)  # add the UPDATE_COLUMN
    self.assertEqual(module.table().columns[0], '_1')
    self.assertEqual(module.table().columns[1], '_2')
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input.df = module.output.table
    s.start()
    s.join()
    self.assertEqual(len(module.table()), 1000000)

def test_histogram2d(self) -> None:
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=s
    )
    min_ = Min(scheduler=s)
    min_.input[0] = csv.output.result
    max_ = Max(scheduler=s)
    max_.input[0] = csv.output.result
    histogram2d = Histogram2D(
        1, 2, xbins=100, ybins=100, scheduler=s
    )  # columns are called 1..30
    histogram2d.input[0] = csv.output.result
    histogram2d.input.min = min_.output.result
    histogram2d.input.max = max_.output.result
    heatmap = Heatmap(filename="histo_%03d.png", scheduler=s)
    heatmap.input.array = histogram2d.output.result
    pr = Every(proc=self.terse, scheduler=s)
    pr.input[0] = heatmap.output.result
    aio.run(csv.scheduler().start())
    _ = histogram2d.trace_stats()

def test_csv_distances(self):
    s = self.scheduler()
    vec = CSVLoader(
        get_dataset('smallfile'), index_col=False, header=None, scheduler=s
    )
    # dis = PairwiseDistances(metric='euclidean', scheduler=s)
    # dis.input.df = vec.output.df
    cnt = Every(proc=self.terse, constant_time=True, scheduler=s)
    # cnt.input.df = dis.output.dist
    cnt.input.df = vec.output.table
    global times
    times = 0
    s.start(ten_times)
    s.join()
    table = vec.table()