def test_histogram1d1(self) -> None:
    """Compare Histogram1D on column "_2" of "bigfile" with numpy.histogram.

    Min and Max modules feed the histogram bounds.  After the scheduler has
    run, the last histogram row is compared bin-for-bin with a NumPy
    histogram built from the same CSV column read through pandas, using the
    bounds and bin count reported by the module.
    """
    sched = self.scheduler()
    loader = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=sched
    )
    lower = Min(scheduler=sched)
    lower.input[0] = loader.output.result
    upper = Max(scheduler=sched)
    upper.input[0] = loader.output.result
    # Original note: "columns are called 1..30" (TODO confirm the naming).
    hist = Histogram1D("_2", scheduler=sched)
    hist.input[0] = loader.output.result
    hist.input.min = lower.output.result
    hist.input.max = upper.output.result
    printer = Every(proc=self.terse, scheduler=sched)
    printer.input[0] = hist.output.result
    aio.run(sched.start())
    _ = hist.trace_stats()
    final_row = notNone(hist.table.last()).to_dict()
    progressive_counts = final_row["array"]
    value_range = (final_row["min"], final_row["max"])
    frame = pd.read_csv(
        get_dataset("bigfile"), header=None, usecols=[2]  # type: ignore
    )
    values = frame.to_numpy().reshape(-1)
    expected_counts, _ = np.histogram(  # type: ignore
        values, bins=hist.params.bins, density=False, range=value_range
    )
    self.assertListEqual(progressive_counts.tolist(), expected_counts.tolist())
def t_histogram1d_impl(self, **kw: Any) -> None:
    """Run the 1D-histogram check on a table perturbed by a Stirrer.

    ``kw`` is forwarded to the Stirrer constructor.  The progressive
    histogram of column "_2" must match a NumPy histogram computed over the
    Stirrer's final table, both in total count and bin by bin.
    """
    sched = self.scheduler()
    loader = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=sched
    )
    mixer = Stirrer(
        update_column="_2", fixed_step_size=1000, scheduler=sched, **kw
    )
    mixer.input[0] = loader.output.result
    lower = Min(scheduler=sched)
    lower.input[0] = mixer.output.result
    upper = Max(scheduler=sched)
    upper.input[0] = mixer.output.result
    # Original note: "columns are called 1..30" (TODO confirm the naming).
    hist = Histogram1D("_2", scheduler=sched)
    hist.input[0] = mixer.output.result
    hist.input.min = lower.output.result
    hist.input.max = upper.output.result
    printer = Every(proc=self.terse, scheduler=sched)
    printer.input[0] = hist.output.result
    aio.run(sched.start())
    _ = hist.trace_stats()
    final_row = notNone(hist.table.last()).to_dict()
    progressive_counts = final_row["array"]
    value_range = (final_row["min"], final_row["max"])
    column = mixer.table.loc[:, ["_2"]]
    assert column is not None
    values = column.to_array().reshape(-1)
    expected_counts, _ = np.histogram(  # type: ignore
        values, bins=hist.params.bins, density=False, range=value_range
    )
    self.assertEqual(np.sum(progressive_counts), np.sum(expected_counts))
    self.assertListEqual(progressive_counts.tolist(), expected_counts.tolist())
def test_hub_if_else(self):
    """Check a Hub fed by both branches of an always-false Switch.

    Max is wired to the Switch's ``result`` output and Min to its
    ``result_else`` output; both are plugged into the same Hub.  The Hub's
    result is compared with the minimum of the Stirrer's result.
    """
    sched = Scheduler()
    source = RandomTable(2, rows=100000, scheduler=sched)
    mixer = Stirrer(
        update_column="_1",
        delete_rows=5,
        update_rows=5,
        fixed_step_size=100,
        scheduler=sched,
    )
    mixer.input[0] = source.output.result
    router = Switch(condition=lambda x: False, scheduler=sched)
    router.input[0] = mixer.output.result
    upper = Max(name="max_" + str(hash(source)), scheduler=sched)
    upper.input[0] = router.output.result
    lower = Min(name="min_" + str(hash(source)), scheduler=sched)
    lower.input[0] = router.output.result_else
    merger = Hub(scheduler=sched)
    # Two assignments to the same slot, Min first and then Max.
    merger.input.table = lower.output.result
    merger.input.table = upper.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = merger.output.result
    aio.run(sched.start())
    expected = mixer.result.min()
    actual = merger.result
    self.compare(expected, actual)
def test_idxmax2(self) -> None:
    """Check that IdxMax and Max agree on a table perturbed by a Stirrer.

    Both modules read the Stirrer's output.  After the run, the last row of
    the table returned by ``idxmax.max()`` is compared with ``max_.psdict``.
    """
    s = self.scheduler()
    random = RandomTable(10, rows=10000, throttle=1000, scheduler=s)
    stirrer = Stirrer(
        update_column="_1", delete_rows=5, fixed_step_size=100, scheduler=s
    )
    stirrer.input[0] = random.output.result
    idxmax = IdxMax(scheduler=s)
    idxmax.input[0] = stirrer.output.result
    max_ = Max(scheduler=s)
    max_.input[0] = stirrer.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = idxmax.output.result
    pr2 = Print(proc=self.terse, scheduler=s)
    pr2.input[0] = max_.output.result
    aio.run(s.start())
    max1 = max_.psdict
    # Named so that it does not shadow the builtin max().
    idxmax_table = idxmax.max()
    assert idxmax_table is not None
    max2 = notNone(idxmax_table.last()).to_dict()
    self.compare(max1, max2)
def test_hist_index_min_max(self):
    """Test min_out and max_out on HistogramIndex.

    A RangeQuery on column "_1" is built with constant bounds 0.3 and 0.8.
    Min and Max modules are attached to the ``min_out`` and ``max_out``
    outputs of its histogram index, and their last rows are compared with
    the minimum and maximum of "_1" in the random table.
    """
    sched = self.scheduler()
    source = RandomTable(2, rows=100000, scheduler=sched)
    low_table = Table(name=None, dshape='{_1: float64}', data={'_1': [0.3]})
    low_const = Constant(table=low_table, scheduler=sched)
    high_table = Table(name=None, dshape='{_1: float64}', data={'_1': [0.8]})
    high_const = Constant(table=high_table, scheduler=sched)
    query = RangeQuery(column='_1', scheduler=sched)
    query.create_dependent_modules(
        source, 'table', min_value=low_const, max_value=high_const
    )
    query_printer = Print(proc=self.terse, scheduler=sched)
    query_printer.input.df = query.output.table
    index = query.hist_index
    lower = Min(name='min_' + str(hash(index)), scheduler=sched)
    lower.input.table = index.output.min_out
    lower_printer = Print(proc=self.terse, scheduler=sched)
    lower_printer.input.df = lower.output.table
    upper = Max(name='max_' + str(hash(index)), scheduler=sched)
    upper.input.table = index.output.max_out
    upper_printer = Print(proc=self.terse, scheduler=sched)
    upper_printer.input.df = upper.output.table
    sched.start()
    sched.join()
    expected_min = source.table().min()['_1']
    actual_min = lower.table().last().to_dict()['_1']
    self.assertAlmostEqual(expected_min, actual_min)
    expected_max = source.table().max()['_1']
    actual_max = upper.table().last().to_dict()['_1']
    self.assertAlmostEqual(expected_max, actual_max)
def test_hist_index_min_max(self) -> None:
    """Test min_out and max_out on HistogramIndex.

    A RangeQuery on column "_1" is built with constant bounds 0.3 and 0.8.
    Min and Max modules are attached to the ``min_out`` and ``max_out``
    outputs of its histogram index, and their results are compared with the
    minimum and maximum of "_1" in the random table.
    """
    sched = self.scheduler()
    # NOTE(review): the extent of this ``with`` block was reconstructed
    # (build inside, run outside) -- confirm against the original layout.
    with sched:
        source = RandomTable(2, rows=100000, scheduler=sched)
        low_bound = PsDict({"_1": 0.3})
        low_const = Constant(table=low_bound, scheduler=sched)
        high_bound = PsDict({"_1": 0.8})
        high_const = Constant(table=high_bound, scheduler=sched)
        query = RangeQuery(column="_1", scheduler=sched)
        query.create_dependent_modules(
            source, "result", min_value=low_const, max_value=high_const
        )
        query_printer = Print(proc=self.terse, scheduler=sched)
        query_printer.input[0] = query.output.result
        index = query.hist_index
        assert index is not None
        lower = Min(name="min_" + str(hash(index)), scheduler=sched)
        lower.input[0] = index.output.min_out
        lower_printer = Print(proc=self.terse, scheduler=sched)
        lower_printer.input[0] = lower.output.result
        upper = Max(name="max_" + str(hash(index)), scheduler=sched)
        upper.input[0] = index.output.max_out
        upper_printer = Print(proc=self.terse, scheduler=sched)
        upper_printer.input[0] = upper.output.result
    aio.run(sched.start())
    expected_min = cast(float, source.table.min()["_1"])
    actual_min = cast(float, lower.psdict["_1"])
    self.assertAlmostEqual(expected_min, actual_min)
    expected_max = cast(float, source.table.max()["_1"])
    actual_max = cast(float, upper.psdict["_1"])
    self.assertAlmostEqual(expected_max, actual_max)
def t_histogram2d_impl(self, **kw: Any) -> None:
    """Run the 2D-histogram check on a random table perturbed by a Stirrer.

    ``kw`` is forwarded to the Stirrer constructor.  The last array produced
    by Histogram2D is compared with ``fh.histogram2d`` computed on columns
    "_1" and "_2" of the Stirrer's table and flipped along axis 0.
    """
    sched = self.scheduler()
    source = RandomTable(3, rows=100000, scheduler=sched)
    mixer = Stirrer(
        update_column="_2", fixed_step_size=1000, scheduler=sched, **kw
    )
    mixer.input[0] = source.output.result
    lower = Min(scheduler=sched)
    lower.input[0] = mixer.output.result
    upper = Max(scheduler=sched)
    upper.input[0] = mixer.output.result
    # Original note: "columns are called 1..30" (TODO confirm the naming).
    hist = Histogram2D(0, 1, xbins=100, ybins=100, scheduler=sched)
    hist.input[0] = mixer.output.result
    hist.input.min = lower.output.result
    hist.input.max = upper.output.result
    image = Heatmap(filename="histo_%03d.png", scheduler=sched)
    image.input.array = hist.output.result
    printer = Every(proc=self.terse, scheduler=sched)
    printer.input[0] = image.output.result
    aio.run(sched.start())
    final_row = notNone(hist.table.last()).to_dict()
    progressive_counts = final_row["array"]
    y_range = [final_row["ymin"], final_row["ymax"]]
    x_range = [final_row["xmin"], final_row["xmax"]]
    value_range = [y_range, x_range]
    columns = mixer.table.loc[:, ["_1", "_2"]]
    assert columns is not None
    values = columns.to_array()
    bin_counts = [hist.params.ybins, hist.params.xbins]
    raw_counts = fh.histogram2d(
        values[:, 1], values[:, 0], bins=bin_counts, range=value_range
    )
    expected_counts = np.flip(raw_counts, axis=0)  # type: ignore
    self.assertEqual(np.sum(progressive_counts), np.sum(expected_counts))
    self.assertListEqual(
        progressive_counts.reshape(-1).tolist(),
        expected_counts.reshape(-1).tolist(),
    )
def test_histogram2d1(self) -> None:
    """Compare Histogram2D on "bigfile" with ``fh.histogram2d``.

    The last array produced by the module is checked with ``np.allclose``
    against a histogram of CSV columns 1 and 2 read through pandas, using
    the bounds and bin counts reported by the module and flipped on axis 0.
    """
    sched = self.scheduler()
    loader = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=sched
    )
    lower = Min(scheduler=sched)
    lower.input[0] = loader.output.result
    upper = Max(scheduler=sched)
    upper.input[0] = loader.output.result
    # Original note: "columns are called 1..30" (TODO confirm the naming).
    hist = Histogram2D(1, 2, xbins=100, ybins=100, scheduler=sched)
    hist.input[0] = loader.output.result
    hist.input.min = lower.output.result
    hist.input.max = upper.output.result
    image = Heatmap(filename="histo_%03d.png", scheduler=sched)
    image.input.array = hist.output.result
    printer = Every(proc=self.terse, scheduler=sched)
    printer.input[0] = image.output.result
    # The scheduler is started through the loader's own accessor.
    aio.run(loader.scheduler().start())
    final_row = notNone(hist.table.last()).to_dict()
    progressive_counts = final_row["array"]
    y_range = [final_row["ymin"], final_row["ymax"]]
    x_range = [final_row["xmin"], final_row["xmax"]]
    value_range = [y_range, x_range]
    frame = pd.read_csv(
        get_dataset("bigfile"), header=None, usecols=[1, 2]  # type: ignore
    )
    values = frame.to_numpy()
    bin_counts = [hist.params.ybins, hist.params.xbins]
    raw_counts = fh.histogram2d(
        values[:, 1], values[:, 0], bins=bin_counts, range=value_range
    )
    expected_counts = np.flip(raw_counts, axis=0)  # type: ignore
    self.assertTrue(np.allclose(progressive_counts, expected_counts))
def test_max(self):
    """Check that Max matches the column-wise max of the random frame.

    The expected value is the max over every column of ``random.df()``
    except ``random.UPDATE_COLUMN``; the actual value is the last row of the
    Max module's frame with the update column removed.
    """
    s = Scheduler()
    random = RandomTable(10, rows=10000, scheduler=s)
    # Trailing underscore keeps the builtin max() usable in this scope.
    max_ = Max(scheduler=s)
    max_.input.df = random.output.df
    pr = Print(scheduler=s)
    pr.input.df = max_.output.df
    s.start()
    res1 = random.df()[random.columns.difference([random.UPDATE_COLUMN])].max()
    res2 = last_row(max_.df(), remove_update=True)
    self.assertTrue(np.allclose(res1, res2))
def test_max(self) -> None:
    """Check that Max's ``psdict`` equals the max of the random table."""
    sched = self.scheduler()
    source = RandomTable(10, rows=10000, scheduler=sched)
    upper = Max(name="max_" + str(hash(source)), scheduler=sched)
    upper.input[0] = source.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = upper.output.result
    aio.run(sched.start())
    expected = source.table.max()
    actual = upper.psdict
    self.compare(expected, actual)
def test_max(self):
    """Check that the last row of Max's table equals the table's max."""
    sched = self.scheduler()
    source = RandomTable(10, rows=10000, scheduler=sched)
    upper = Max(name='max_' + str(hash(source)), scheduler=sched)
    upper.input.table = source.output.table
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input.df = upper.output.table
    sched.start()
    sched.join()
    expected = source.table().max()
    actual = upper.table().last()
    self.compare(expected, actual)
def test_max(self):
    """Check Max on the first 10,000 rows of "bigfile_multiscale"."""
    sched = self.scheduler()
    loader = SimpleCSVLoader(
        get_dataset("bigfile_multiscale"), nrows=10_000, scheduler=sched
    )
    upper = Max(name="max_" + str(hash(loader)), scheduler=sched)
    upper.input[0] = loader.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = upper.output.result
    aio.run(sched.start())
    expected = loader.result.max()
    actual = upper.result
    self.compare(expected, actual)
def test_idxmax(self):
    """Check that IdxMax's max frame ends with the same row as Max's frame.

    Both last rows are taken with the update column removed and must be
    equal element by element.
    """
    s = Scheduler()
    random = RandomTable(10, rows=10000, throttle=1000, scheduler=s)
    idxmax = IdxMax(scheduler=s)
    idxmax.input.df = random.output.df
    # Trailing underscore keeps the builtin max() usable in this scope.
    max_ = Max(scheduler=s)
    max_.input.df = random.output.df
    pr = Print(scheduler=s)
    pr.input.df = idxmax.output.max
    s.start()
    max1 = last_row(max_.df(), remove_update=True)
    max2 = last_row(idxmax.max(), remove_update=True)
    self.assertTrue((max1 == max2).all())
async def _add_max(scheduler: Scheduler, run_number: int) -> None:
    """Create a "max" module and its "print_max" printer inside ``scheduler``.

    ``table`` and ``proc`` are taken from the enclosing scope.
    ``run_number`` is accepted but not used.
    """
    with scheduler:
        print("adding new modules")
        max_module = Max(name="max", scheduler=scheduler)
        max_printer = Print(name="print_max", proc=proc, scheduler=scheduler)
        max_module.input.table = table.output.result
        max_printer.input.df = max_module.output.result
def test_histogram2d(self):
    """Smoke-test a CSV -> Min/Max -> Histogram2D -> Heatmap pipeline.

    The pipeline is run to completion and the histogram's trace statistics
    are fetched; nothing is asserted about the output.
    """
    sched = self.scheduler()
    loader = CSVLoader(
        get_dataset('bigfile'), index_col=False, header=None, scheduler=sched
    )
    lower = Min(scheduler=sched)
    lower.input.table = loader.output.table
    upper = Max(scheduler=sched)
    upper.input.table = loader.output.table
    # Original note: "columns are called 1..30" (TODO confirm the naming).
    hist = Histogram2D(1, 2, xbins=100, ybins=100, scheduler=sched)
    hist.input.table = loader.output.table
    hist.input.min = lower.output.table
    hist.input.max = upper.output.table
    image = Heatmap(filename='histo_%03d.png', scheduler=sched)
    image.input.array = hist.output.table
    printer = Every(proc=self.terse, scheduler=sched)
    # The printer watches the loader's table, not the heatmap.
    printer.input.df = loader.output.table
    loader.scheduler().start()
    sched.join()
    _ = hist.trace_stats()
def test_idxmax(self):
    """Check that IdxMax's max table ends with the same values as Max.

    The last rows of both tables are converted to dicts and compared column
    by column with ``assertAlmostEqual``.
    """
    s = self.scheduler()
    random = RandomTable(10, rows=10000, throttle=1000, scheduler=s)
    idxmax = IdxMax(scheduler=s)
    idxmax.input.table = random.output.table
    max_ = Max(scheduler=s)
    max_.input.table = random.output.table
    pr = Print(proc=self.terse, scheduler=s)
    pr.input.df = idxmax.output.max
    s.start()
    s.join()
    max1 = max_.table().last().to_dict()
    max2 = idxmax.max().last().to_dict()
    # assertAlmostEqual(dict, dict) only passes when the dicts are exactly
    # equal; otherwise it evaluates ``first - second`` and raises TypeError
    # instead of reporting a failure.  Compare the key sets, then each value.
    self.assertEqual(set(max1), set(max2))
    for column, expected in max1.items():
        self.assertAlmostEqual(expected, max2[column])
def test_histogram1d(self) -> None:
    """Smoke-test a CSV -> Min/Max -> Histogram1D pipeline on column "_2".

    The pipeline is run to completion and the histogram's trace statistics
    are fetched; nothing is asserted about the output.
    """
    sched = self.scheduler()
    loader = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=sched
    )
    lower = Min(scheduler=sched)
    lower.input[0] = loader.output.result
    upper = Max(scheduler=sched)
    upper.input[0] = loader.output.result
    # Original note: "columns are called 1..30" (TODO confirm the naming).
    hist = Histogram1D("_2", scheduler=sched)
    hist.input[0] = loader.output.result
    hist.input.min = lower.output.result
    hist.input.max = upper.output.result
    printer = Every(proc=self.terse, scheduler=sched)
    printer.input[0] = hist.output.result
    aio.run(sched.start())
    _ = hist.trace_stats()
def p10s_random_min_max(n):
    """Run Min and Max over a 10-column random table of ``n * L`` rows.

    Sets ``StorageEngine.default`` to "hdf5" before building the pipeline.
    ``L`` is read from the enclosing module.
    """
    StorageEngine.default = "hdf5"
    sched = Scheduler()
    source = RandomTable(10, rows=n * L, scheduler=sched)
    lower = Min(name='min_' + str(hash(source)), scheduler=sched)
    lower.input.table = source.output.table
    upper = Max(name='max_' + str(hash(source)), scheduler=sched)
    upper.input.table = source.output.table
    sched.start()
def p10s_random_min_max(self):
    """Run Min and Max over a random table of ``self.current_step * L`` rows.

    Sets ``StorageEngine.default`` to "hdf5" before building the pipeline.
    ``L`` is read from the enclosing module.
    """
    step = self.current_step
    StorageEngine.default = "hdf5"
    sched = Scheduler()
    source = RandomTable(10, rows=step * L, scheduler=sched)
    # NOTE(review): Min is given ``mid=`` while Max is given ``id=``; one of
    # the two keywords is presumably a leftover from a rename -- confirm
    # against the module constructor before changing either.
    lower = Min(mid='min_' + str(hash(source)), scheduler=sched)
    lower.input.table = source.output.table
    upper = Max(id='max_' + str(hash(source)), scheduler=sched)
    upper.input.table = source.output.table
    sched.start()
def test_stirrer(self) -> None:
    """Check that Max tracks a table perturbed by a Stirrer.

    Max's result is compared with the max of the Stirrer's table once the
    scheduler has finished.
    """
    sched = Scheduler()
    source = RandomTable(2, rows=100000, scheduler=sched)
    mixer = Stirrer(
        update_column="_1",
        delete_rows=5,
        update_rows=5,
        fixed_step_size=100,
        scheduler=sched,
    )
    mixer.input[0] = source.output.result
    upper = Max(name="max_" + str(hash(source)), scheduler=sched)
    upper.input[0] = mixer.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = upper.output.result
    aio.run(sched.start())
    expected = mixer.table.max()
    actual = upper.result
    self.compare(expected, actual)
def test_idxmax(self) -> None:
    """Check that IdxMax and Max agree on a throttled random table.

    After the run, the last row of the table returned by ``idxmax.max()`` is
    compared with ``max_.psdict``.
    """
    s = self.scheduler()
    random = RandomTable(10, rows=10000, throttle=1000, scheduler=s)
    idxmax = IdxMax(scheduler=s)
    idxmax.input[0] = random.output.result
    max_ = Max(scheduler=s)
    max_.input[0] = random.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = idxmax.output.result
    pr2 = Print(proc=self.terse, scheduler=s)
    pr2.input[0] = max_.output.result
    aio.run(s.start())
    max1 = max_.psdict
    # Named so that it does not shadow the builtin max().
    idxmax_table = idxmax.max()
    assert idxmax_table is not None
    max2 = notNone(idxmax_table.last()).to_dict()
    self.compare(max1, max2)
async def _add_max_remove_min(scheduler: Scheduler, run_number: int) -> None:
    """Add "max"/"print_max" and delete "min"/"print_min" in one dataflow.

    ``table`` and ``proc`` are taken from the enclosing scope.
    ``run_number`` is accepted but not used.
    """
    with scheduler as dataflow:
        print("adding new modules")
        max_module = Max(name="max", scheduler=scheduler)
        max_printer = Print(name="print_max", proc=proc, scheduler=scheduler)
        max_module.input.table = table.output.result
        max_printer.input.df = max_module.output.result
        print("removing min module")
        dataflow.delete_modules("min", "print_min")
def test_histogram2d(self) -> None:
    """Smoke-test a CSV -> Min/Max -> Histogram2D -> Heatmap pipeline.

    The pipeline is run to completion and the histogram's trace statistics
    are fetched; nothing is asserted about the output.
    """
    sched = self.scheduler()
    loader = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=sched
    )
    lower = Min(scheduler=sched)
    lower.input[0] = loader.output.result
    upper = Max(scheduler=sched)
    upper.input[0] = loader.output.result
    # Original note: "columns are called 1..30" (TODO confirm the naming).
    hist = Histogram2D(1, 2, xbins=100, ybins=100, scheduler=sched)
    hist.input[0] = loader.output.result
    hist.input.min = lower.output.result
    hist.input.max = upper.output.result
    image = Heatmap(filename="histo_%03d.png", scheduler=sched)
    image.input.array = hist.output.result
    printer = Every(proc=self.terse, scheduler=sched)
    printer.input[0] = image.output.result
    # The scheduler is started through the loader's own accessor.
    aio.run(loader.scheduler().start())
    _ = hist.trace_stats()
def test_switch_if_then(self):
    """Check Max behind the ``result`` output of an always-true Switch.

    The Switch's ``result_else`` output is only printed.  Max's result is
    compared with the max of the Stirrer's result.
    """
    sched = Scheduler()
    source = RandomTable(2, rows=100000, scheduler=sched)
    mixer = Stirrer(
        update_column="_1",
        delete_rows=5,
        update_rows=5,
        fixed_step_size=100,
        scheduler=sched,
    )
    mixer.input[0] = source.output.result
    router = Switch(condition=lambda x: True, scheduler=sched)
    router.input[0] = mixer.output.result
    upper = Max(name="max_" + str(hash(source)), scheduler=sched)
    upper.input[0] = router.output.result
    else_printer = Print(proc=self.terse, scheduler=sched)
    else_printer.input[0] = router.output.result_else
    max_printer = Print(proc=self.terse, scheduler=sched)
    max_printer.input[0] = upper.output.result
    aio.run(sched.start())
    expected = mixer.result.max()
    actual = upper.result
    self.compare(expected, actual)
def test_dummy(self):
    """Smoke-test a RandomTable -> DummyMod -> Max -> Print pipeline.

    The scheduler is started and joined; nothing is asserted.
    """
    sched = Scheduler()
    source = RandomTable(2, rows=100000, scheduler=sched)
    dummy = DummyMod(
        update_column='_1',
        delete_rows=5,
        update_rows=5,
        fixed_step_size=100,
        scheduler=sched,
    )
    dummy.input.table = source.output.table
    upper = Max(name='max_' + str(hash(source)), scheduler=sched)
    upper.input.table = dummy.output.table
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input.df = upper.output.table
    sched.start()
    sched.join()
def main():
    """Build a random-table -> Min/Max -> Histograms pipeline.

    All modules are registered on the module-level scheduler ``s``.  Returns
    the RandomTable module that feeds the pipeline.
    """
    source = RandomTable(
        columns=['a', 'b', 'c'],
        rows=1000000,
        random=np.random.randn,
        throttle=1000,
        scheduler=s,
    )
    lower = Min(scheduler=s)
    lower.input.table = source.output.table
    upper = Max(scheduler=s)
    upper.input.table = source.output.table
    hists = Histograms(scheduler=s)
    hists.input.table = source.output.table
    hists.input.min = lower.output.table
    hists.input.max = upper.output.table
    reporter = Every(scheduler=s)
    reporter.input.df = hists.output.table
    return source
def test_histogram1d(self):
    """Smoke-test a CSV -> Min/Max -> Histogram1D pipeline on column '_2'.

    The scheduler runs with a tick callback that calls ``stop()`` once the
    loader reports it is terminated.  Trace statistics are then fetched;
    nothing is asserted about the output.
    """
    sched = self.scheduler()
    loader = CSVLoader(
        get_dataset('bigfile'), index_col=False, header=None, scheduler=sched
    )
    lower = Min(scheduler=sched)
    lower.input.table = loader.output.table
    upper = Max(scheduler=sched)
    upper.input.table = loader.output.table
    # Original note: "columns are called 1..30" (TODO confirm the naming).
    hist = Histogram1D('_2', scheduler=sched)
    hist.input.table = loader.output.table
    hist.input.min = lower.output.table
    hist.input.max = upper.output.table
    printer = Every(proc=self.terse, scheduler=sched)
    # The printer watches the loader's table, not the histogram.
    printer.input.df = loader.output.table
    sched.start(tick_proc=lambda s, r: loader.is_terminated() and s.stop())
    sched.join()
    _ = hist.trace_stats()
from progressivis.io import CSVLoader
from progressivis.stats import Histogram2D, Min, Max
from progressivis.datasets import get_dataset
from progressivis.vis import Heatmap

# Demo script: builds a CSV -> Min/Max -> Histogram2D -> Heatmap pipeline
# without passing an explicit scheduler to any module.
# NOTE(review): Scheduler, Every and Print are used below but are not
# imported in the visible part of the file -- confirm they are imported above.
print("Loading test_histogram2d")
print("Type of default_scheduler is %s" % type(Scheduler.default))

csv = CSVLoader(get_dataset('bigfile'), index_col=False, header=None, engine='c')
pr = Every()
pr.input.df = csv.output.table
# Min and Max of the loaded table provide the histogram bounds.
min_ = Min()
min_.input.table = csv.output.table
max_ = Max()
max_.input.table = csv.output.table
histogram2d = Histogram2D('_1', '_2', xbins=128, ybins=128)
histogram2d.input.table = csv.output.table
histogram2d.input.min = min_.output.table
histogram2d.input.max = max_.output.table
# The heatmap renders the 2D histogram output.
heatmap = Heatmap(filename='histo_%03d.png')
heatmap.input.array = histogram2d.output.table
# `pr` is rebound here: the Every module created above is no longer
# reachable through this name.
pr = Print(name='print')
pr.input.df = csv.output.table

if __name__ == '__main__':
    csv.start()
#SUFFIX= '' PREFIX= '../nyc-taxi/' SUFFIX= '.bz2' URLS = [ PREFIX+'yellow_tripdata_2015-01.csv'+SUFFIX, PREFIX+'yellow_tripdata_2015-02.csv'+SUFFIX, PREFIX+'yellow_tripdata_2015-03.csv'+SUFFIX, PREFIX+'yellow_tripdata_2015-04.csv'+SUFFIX, PREFIX+'yellow_tripdata_2015-05.csv'+SUFFIX, PREFIX+'yellow_tripdata_2015-06.csv'+SUFFIX, ] filenames = pd.DataFrame({'filename': URLS}) cst = Constant(Table('filenames', data=filenames), scheduler=s) csv = CSVLoader(index_col=False,skipinitialspace=True,usecols=['dropoff_longitude', 'dropoff_latitude'], filter_=filter_, scheduler=s) csv.input.filenames = cst.output.table min = Min(scheduler=s) min.input.table = csv.output.table max = Max(scheduler=s) max.input.table = csv.output.table histogram2d = Histogram2D('dropoff_longitude', 'dropoff_latitude', xbins=RESOLUTION, ybins=RESOLUTION, scheduler=s) histogram2d.input.table = csv.output.table histogram2d.input.min = min.output.table histogram2d.input.max = max.output.table heatmap = Heatmap(filename='nyc_dropoff_yellow%d.png', history=5, scheduler=s) heatmap.input.array = histogram2d.output.table if __name__=='__main__': s.start()