def test_binary2(self) -> None:
     """Expect ``AssertionError`` when ``Binary`` gets a list for ``columns``."""
     sched = self.scheduler()
     n_cols = 10
     # Two random tables are created on the scheduler but never connected.
     _ = RandomTable(n_cols, rows=100_000, scheduler=sched)
     _ = RandomTable(n_cols, rows=100_000, scheduler=sched)
     list_columns = ["_3", "_5", "_7"]
     with self.assertRaises(AssertionError):
         _ = Binary(np.add, columns=list_columns, scheduler=sched)
Exemple #2
0
    def t_num_expr_impl(self, cls: Type[NumExprABC]) -> Tuple[Any, ...]:
        """Run a ``cls`` module on two random tables and check its output.

        Two 10-column random tables feed the ``first`` and ``second`` inputs
        of ``cls``. After the scheduler has run, output column 0 is compared
        with ``first_2+2*second_3`` and output column 1 with
        ``first_3-5*second_2``, both evaluated through ``ne.evaluate``.

        Returns:
            The four column arrays ``(first_2, first_3, second_2, second_3)``
            used to build the expected values.
        """
        s = self.scheduler()
        random1 = RandomTable(10, rows=100000, scheduler=s)
        random2 = RandomTable(10, rows=100000, scheduler=s)
        module = cls(
            columns={
                "first": ["_1", "_2", "_3"],
                "second": ["_1", "_2", "_3"]
            },
            scheduler=s,
        )

        module.input.first = random1.output.result
        module.input.second = random2.output.result
        pr = Print(proc=self.terse, scheduler=s)
        pr.input[0] = module.output.result
        aio.run(s.start())
        # NOTE(review): the expression strings passed to ne.evaluate below use
        # the names first_2, first_3, second_2 and second_3; presumably `ne`
        # is numexpr and resolves them from these local variables, so do not
        # rename them without updating the strings.
        first = random1.table.to_array()
        first_2 = first[:, 1]
        first_3 = first[:, 2]
        second = random2.table.to_array()
        second_2 = second[:, 1]
        second_3 = second[:, 2]
        ne_1 = ne.evaluate("first_2+2*second_3")
        ne_2 = ne.evaluate("first_3-5*second_2")
        res = module.table.to_array()
        # NaNs at the same positions are treated as equal.
        self.assertTrue(np.allclose(res[:, 0], ne_1, equal_nan=True))
        self.assertTrue(np.allclose(res[:, 1], ne_2, equal_nan=True))
        return first_2, first_3, second_2, second_3
 def _t_impl(self, cls: Type[TableModule], ufunc: np.ufunc,
             mod_name: str) -> None:
     """Check a binary table module ``cls`` against the numpy ``ufunc``.

     Two 3-column int64 random tables (values drawn with
     ``np.random.randint(10, ...)``) are wired to the ``first`` and
     ``second`` inputs. The module name must start with ``mod_name`` and its
     output must match ``ufunc`` applied to the two input arrays.
     """
     print("Testing", mod_name)
     sched = self.scheduler()
     left = RandomTable(
         3,
         rows=100_000,
         scheduler=sched,
         random=lambda x: np.random.randint(10, size=x),  # type: ignore
         dtype="int64",
     )
     right = RandomTable(
         3,
         rows=100_000,
         scheduler=sched,
         random=lambda x: np.random.randint(10, size=x),  # type: ignore
         dtype="int64",
     )
     binary_mod = cls(scheduler=sched)
     binary_mod.input.first = left.output.result
     binary_mod.input.second = right.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = binary_mod.output.result
     aio.run(sched.start())
     expected = ufunc(left.table.to_array(), right.table.to_array())
     actual = binary_mod.table.to_array()
     self.assertTrue(binary_mod.name.startswith(mod_name))
     self.assertTrue(np.allclose(expected, actual, equal_nan=True))
Exemple #4
0
 def test_hist_index_min_max(self):
     """Test min_out and max_out on HistogramIndex.

     A RangeQuery on column ``_1`` is built over a random table. Min and Max
     modules fed by the index's ``min_out`` / ``max_out`` slots must match
     the minimum and maximum of ``_1`` in the random table.
     """
     sched = self.scheduler()
     source = RandomTable(2, rows=100000, scheduler=sched)
     lower_tbl = Table(name=None, dshape='{_1: float64}', data={'_1': [0.3]})
     lower = Constant(table=lower_tbl, scheduler=sched)
     upper_tbl = Table(name=None, dshape='{_1: float64}', data={'_1': [0.8]})
     upper = Constant(table=upper_tbl, scheduler=sched)
     query = RangeQuery(column='_1', scheduler=sched)
     query.create_dependent_modules(
         source, 'table', min_value=lower, max_value=upper)
     query_printer = Print(proc=self.terse, scheduler=sched)
     query_printer.input.df = query.output.table
     index = query.hist_index
     min_mod = Min(name='min_' + str(hash(index)), scheduler=sched)
     min_mod.input.table = index.output.min_out
     min_printer = Print(proc=self.terse, scheduler=sched)
     min_printer.input.df = min_mod.output.table
     max_mod = Max(name='max_' + str(hash(index)), scheduler=sched)
     max_mod.input.table = index.output.max_out
     max_printer = Print(proc=self.terse, scheduler=sched)
     max_printer.input.df = max_mod.output.table
     sched.start()
     sched.join()
     # Minimum first, then maximum, each compared on column '_1'.
     expected_min = source.table().min()['_1']
     actual_min = min_mod.table().last().to_dict()['_1']
     self.assertAlmostEqual(expected_min, actual_min)
     expected_max = source.table().max()['_1']
     actual_max = max_mod.table().last().to_dict()['_1']
     self.assertAlmostEqual(expected_max, actual_max)
Exemple #5
0
 def _impl_tst_percentiles(self, accuracy):
     """Compare the Percentiles module with ``np.percentile`` on column ``_1``.

     The 25th, 50th and 75th percentiles are requested through a constant
     table. ``accuracy`` is forwarded to ``Percentiles``. Each result must be
     within 0.01 of the value numpy computes on the same column.
     """
     sched = self.scheduler()
     source = RandomTable(2, rows=10000, scheduler=sched)
     index = HistogramIndex(column='_1', scheduler=sched)
     index.input.table = source.output.table
     requested = Table(
         name=None,
         dshape='{_25: float64, _50: float64, _75: float64}',
         data={'_25': [25.0], '_50': [50.0], '_75': [75.0]})
     requested_const = Constant(table=requested, scheduler=sched)
     pct_mod = Percentiles(index, accuracy=accuracy, scheduler=sched)
     pct_mod.input.table = source.output.table
     pct_mod.input.percentiles = requested_const.output.table
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input.df = pct_mod.output.table
     sched.start()
     sched.join()
     actual = pct_mod.table().last().to_dict()
     values = source.table()['_1'].values
     p25 = np.percentile(values, 25.0)
     p50 = np.percentile(values, 50.0)
     p75 = np.percentile(values, 75.0)
     print("Table=> accuracy: ", accuracy, " 25:", p25, actual['_25'],
           " 50:", p50, actual['_50'], " 75:", p75, actual['_75'])
     # Same three checks, in the same order: 25th, 50th, 75th.
     for expected, key in ((p25, '_25'), (p50, '_50'), (p75, '_75')):
         self.assertAlmostEqual(expected, actual[key], delta=0.01)
    def t_mix_ufunc_impl(
        self,
        cls: Type[MixUfuncABC],
        ufunc1: np.ufunc = np.log,
        ufunc2: np.ufunc = np.add,
    ) -> None:
        """Check a mixed-ufunc module ``cls`` fed by two random tables.

        Output column 0 must equal ``ufunc2`` applied to column index 1 of
        the first table and column index 2 of the second table. Output
        column 1 must equal ``ufunc1`` applied to column index 2 of the
        second table. Both expected arrays are cast to float64.
        """
        sched = self.scheduler()
        left = RandomTable(10, rows=100000, scheduler=sched)
        right = RandomTable(10, rows=100000, scheduler=sched)
        selected = {
            "first": ["_1", "_2", "_3"],
            "second": ["_1", "_2", "_3"],
        }
        mixer = cls(columns=selected, scheduler=sched)

        mixer.input.first = left.output.result
        mixer.input.second = right.output.result
        printer = Print(proc=self.terse, scheduler=sched)
        printer.input[0] = mixer.output.result
        aio.run(sched.start())
        left_arr = left.table.to_array()
        left_col = left_arr[:, 1]
        _ = left_arr[:, 2]
        right_arr = right.table.to_array()
        _ = right_arr[:, 1]
        right_col = right_arr[:, 2]
        expected_0 = ufunc2(left_col, right_col).astype("float64")
        expected_1 = ufunc1(right_col).astype("float64")
        actual = mixer.table.to_array()
        self.assertTrue(np.allclose(actual[:, 0], expected_0, equal_nan=True))
        self.assertTrue(np.allclose(actual[:, 1], expected_1, equal_nan=True))
 def test_max(self):
     """Check the Max module against the DataFrame ``max()`` of its input.

     The update column is excluded from both sides of the comparison.
     """
     sched = Scheduler()
     source = RandomTable(10, rows=10000, scheduler=sched)
     # Named max_mod so the builtin max() is not shadowed.
     max_mod = Max(scheduler=sched)
     max_mod.input.df = source.output.df
     printer = Print(scheduler=sched)
     printer.input.df = max_mod.output.df
     sched.start()
     frame = source.df()
     data_columns = source.columns.difference([source.UPDATE_COLUMN])
     expected = frame[data_columns].max()
     actual = last_row(max_mod.df(), remove_update=True)
     self.assertTrue(np.allclose(expected, actual))
 def test_random_table(self):
     """Check column names, row count and absence of nulls for named columns."""
     sched = Scheduler()
     table_mod = RandomTable(['a', 'b'], rows=10000, scheduler=sched)
     self.assertEqual(table_mod.df().columns[0], 'a')
     self.assertEqual(table_mod.df().columns[1], 'b')
     # Two data columns plus the UPDATE_COLUMN.
     self.assertEqual(len(table_mod.df().columns), 3)
     length_printer = Every(proc=print_len, constant_time=True, scheduler=sched)
     length_printer.input.df = table_mod.output.df
     sched.start()
     self.assertEqual(len(table_mod.df()), 10000)
     for column in ('a', 'b'):
         self.assertFalse(table_mod.df()[column].isnull().any())
 def test_max(self):
     """Compare the Max module's last output row with the table's ``max()``.

     The module result is read through ``cxx_module.get_output_table()``.
     """
     sched = self.scheduler()
     source = RandomTable(10, rows=10000, scheduler=sched)
     max_mod = Max(name='max_' + str(hash(source)), scheduler=sched)
     max_mod.input.table = source.output.table
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input.df = max_mod.output.table
     sched.start()
     sched.join()
     expected = source.table().max()
     actual = max_mod.cxx_module.get_output_table().last()
     self.compare(expected, actual)
Exemple #10
0
 def test_min(self):
     """Compare the Min module's last output row with the table's ``min()``."""
     sched = self.scheduler()
     source = RandomTable(10, rows=10000, scheduler=sched)
     min_mod = Min(name='min_' + str(hash(source)), scheduler=sched)
     min_mod.input.table = source.output.table
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input.df = min_mod.output.table
     sched.start()
     sched.join()
     expected = source.table().min()
     actual = min_mod.table().last()
     self.compare(expected, actual)
Exemple #11
0
 def test_var(self):
     """Check the Var module against the DataFrame ``var()`` of column ``1``.

     The update column is removed from the module's last row before the
     comparison.
     """
     sched = Scheduler()
     source = RandomTable(1, rows=1000, scheduler=sched)
     var_mod = Var(scheduler=sched)
     var_mod.input.df = source.output.df
     printer = Print(scheduler=sched)
     printer.input.df = var_mod.output.df
     sched.start()
     expected = source.df()[1].var()
     actual = last_row(var_mod.df(), remove_update=True)
     self.assertTrue(np.allclose(expected, actual))
 def test_hadamard(self) -> None:
     """Check Hadamard output against ``np.multiply`` of its two inputs."""
     sched = self.scheduler()
     left = RandomTable(3, rows=100000, scheduler=sched)
     right = RandomTable(3, rows=100000, scheduler=sched)
     hadamard = Hadamard(scheduler=sched)
     hadamard.input.x1 = left.output.result
     hadamard.input.x2 = right.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = hadamard.output.result
     aio.run(sched.start())
     expected = np.multiply(left.table.to_array(), right.table.to_array())
     actual = hadamard.table.to_array()
     self.assertTrue(np.allclose(expected, actual, equal_nan=True))
 def test_random_table2(self):
     """Check a 10-column, 10M-row random table built with ``force_valid_ids``."""
     sched = Scheduler()
     # Original author's note: produces more than 4M rows per second on
     # their laptop.
     table_mod = RandomTable(10, rows=10000000, force_valid_ids=True, scheduler=sched)
     # Ten data columns plus the UPDATE_COLUMN.
     self.assertEqual(len(table_mod.df().columns), 11)
     self.assertEqual(table_mod.df().columns[0], '_1')
     self.assertEqual(table_mod.df().columns[1], '_2')
     length_printer = Every(proc=print_len, constant_time=True, scheduler=sched)
     length_printer.input.df = table_mod.output.df
     sched.start()
     self.assertEqual(len(table_mod.df()), 10000000)
     for column in ('_1', '_2'):
         self.assertFalse(table_mod.df()[column].isnull().any())
 def test_binary(self) -> None:
     """Check ``Binary(np.add)`` on two 3-column tables against ``np.add``.

     The module name must also start with ``"binary_"``.
     """
     sched = self.scheduler()
     left = RandomTable(3, rows=100_000, scheduler=sched)
     right = RandomTable(3, rows=100_000, scheduler=sched)
     adder = Binary(np.add, scheduler=sched)
     adder.input.first = left.output.result
     adder.input.second = right.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = adder.output.result
     aio.run(sched.start())
     expected = np.add(left.table.to_array(), right.table.to_array())
     actual = adder.table.to_array()
     self.assertTrue(adder.name.startswith("binary_"))
     self.assertTrue(np.allclose(expected, actual, equal_nan=True))
Exemple #15
0
 def test_range_query_min_max3(self):
     """Test min and max on RangeQuery output.

     The requested lower bound is 0.3 and the requested upper bound is
     15000. The query's ``min`` output must be 0.3 and its ``max`` output
     must equal the maximum of ``_1`` in the random table.
     """
     sched = self.scheduler()
     source = RandomTable(2, rows=100000, scheduler=sched)
     lower_tbl = Table(name=None, dshape='{_1: float64}', data={'_1': [0.3]})
     upper_tbl = Table(name=None, dshape='{_1: float64}', data={'_1': [15000.]})
     query = self._query_min_max_impl(source, lower_tbl, upper_tbl, sched)
     sched.start()
     sched.join()
     reported_min = query.output.min.data()
     reported_max = query.output.max.data()
     table_max = source.table().max()['_1']
     self.assertAlmostEqual(reported_min['_1'].loc[0], 0.3)
     self.assertAlmostEqual(reported_max['_1'].loc[0], table_max)
Exemple #16
0
    def test_intersection(self):
        """Intersect ``_1 > 0.3`` and ``_1 < 0.8`` and check the selection.

        Two Bisect modules share one HistogramIndex. The Intersection of
        their outputs must select the same ids as evaluating
        ``(_1>0.3)&(_1<0.8)`` on the index's input table.
        """
        sched = self.scheduler()
        source = RandomTable(2, rows=100000, scheduler=sched)
        lower_tbl = Table(name=None, dshape='{_1: float64}', data={'_1': [0.3]})
        lower = Constant(table=lower_tbl, scheduler=sched)
        upper_tbl = Table(name=None, dshape='{_1: float64}', data={'_1': [0.8]})
        upper = Constant(table=upper_tbl, scheduler=sched)
        index = HistogramIndex(column='_1', scheduler=sched)
        index.create_dependent_modules(source, 'table')

        above = Bisect(
            column='_1', op='>', hist_index=index, scheduler=sched)
        above.input.table = index.output.table
        above.input.limit = lower.output.table

        below = Bisect(
            column='_1', op='<', hist_index=index, scheduler=sched)
        below.input.table = index.output.table
        below.input.limit = upper.output.table

        inter = Intersection(scheduler=sched)
        # NOTE(review): the same input slot is assigned twice; presumably it
        # accepts several connections. Confirm against Intersection.
        inter.input.table = above.output.table
        inter.input.table = below.output.table
        printer = Print(proc=self.terse, scheduler=sched)
        printer.input.df = inter.output.table
        sched.start()
        sched.join()
        indexed_data = index.input_module.output['table'].data()
        expected_ids = indexed_data.eval(
            '(_1>0.3)&(_1<0.8)', result_object='index')
        self.assertEqual(inter.table().selection, bitmap(expected_ids))
Exemple #17
0
    def test_dataflow_2_add_remove(self) -> None:
        """Add Max/Print modules and delete Min/Print while the scheduler runs.

        The initial dataflow is table -> min -> print_min. A callback
        registered with ``scheduler.on_loop`` adds "max" and "print_max" and
        deletes "min" and "print_min". The test passes if the new Print
        module's ``proc`` has been called by the time the scheduler stops.
        """
        scheduler = self.scheduler(clean=True)

        table = RandomTable(name="table",
                            columns=["a"],
                            throttle=1000,
                            scheduler=scheduler)
        m = Min(name="min", scheduler=scheduler)
        prt = Print(proc=self.terse, name="print_min", scheduler=scheduler)
        m.input.table = table.output.result
        prt.input.df = m.output.result
        # Set to True by `proc` once the dynamically added Print module runs.
        started = False

        def proc(x: Any) -> None:
            """Record that the "print_max" module was invoked."""
            nonlocal started
            print("proc max called")
            started = True

        async def _add_max_remove_min(scheduler: Scheduler,
                                      run_number: int) -> None:
            """Add "max"/"print_max" and delete "min"/"print_min"."""
            # `m` and `prt` here are new locals that shadow the outer
            # Min/Print variables; `table` is the outer RandomTable.
            with scheduler as dataflow:
                print("adding new modules")
                m = Max(name="max", scheduler=scheduler)
                prt = Print(name="print_max", proc=proc, scheduler=scheduler)
                m.input.table = table.output.result
                prt.input.df = m.output.result
                print("removing min module")
                dataflow.delete_modules("min", "print_min")

        # t = _add_max_remove_min(csv, scheduler, proc=proc)
        # Registered with 5 and 10; presumably these are scheduler loop
        # counts, as in the sibling dynamic-dataflow test. TODO confirm.
        scheduler.on_loop(_add_max_remove_min, 5)
        scheduler.on_loop(self._stop, 10)
        aio.run(scheduler.start())
        self.assertTrue(started)
Exemple #18
0
    def test_dataflow_1_dynamic(self) -> None:
        """Add Max/Print modules to a running dataflow.

        The initial dataflow is table -> min -> print_min. A callback
        registered with ``scheduler.on_loop`` adds "max" and "print_max".
        The test passes if the new Print module's ``proc`` has been called
        by the time the scheduler stops.
        """
        scheduler = self.scheduler(clean=True)

        table = RandomTable(name="table",
                            columns=["a"],
                            throttle=1000,
                            scheduler=scheduler)
        m = Min(name="min", scheduler=scheduler)
        prt = Print(proc=self.terse, name="print_min", scheduler=scheduler)
        m.input.table = table.output.result
        prt.input.df = m.output.result
        # Set to True by `proc` once the dynamically added Print module runs.
        started = False

        def proc(x: Any) -> None:
            """Record that the "print_max" module was invoked."""
            nonlocal started
            print("proc max called")
            started = True

        async def _add_max(scheduler: Scheduler, run_number: int) -> None:
            """Add "max"/"print_max", fed by the outer ``table``."""
            # `m` and `prt` here are new locals that shadow the outer
            # Min/Print variables.
            with scheduler:
                print("adding new modules")
                m = Max(name="max", scheduler=scheduler)
                prt = Print(name="print_max", proc=proc, scheduler=scheduler)
                m.input.table = table.output.result
                prt.input.df = m.output.result

        scheduler.on_loop(_add_max, 5)  # run the function after 5 loops
        scheduler.on_loop(self._stop, 10)

        # from nose.tools import set_trace; set_trace()
        aio.run(scheduler.start())
        self.assertTrue(started)
 def test_idxmax2(self) -> None:
     """Compare IdxMax's max table with the Max module on stirred data.

     Both modules read the Stirrer output. The last row of
     ``idxmax.max()`` must match ``Max.psdict``.
     """
     sched = self.scheduler()
     source = RandomTable(10, rows=10000, throttle=1000, scheduler=sched)
     stirrer = Stirrer(
         update_column="_1",
         delete_rows=5,
         fixed_step_size=100,
         scheduler=sched,
     )
     stirrer.input[0] = source.output.result
     idxmax = IdxMax(scheduler=sched)
     idxmax.input[0] = stirrer.output.result
     max_mod = Max(scheduler=sched)
     max_mod.input[0] = stirrer.output.result
     idxmax_printer = Print(proc=self.terse, scheduler=sched)
     idxmax_printer.input[0] = idxmax.output.result
     max_printer = Print(proc=self.terse, scheduler=sched)
     max_printer.input[0] = max_mod.output.result
     aio.run(sched.start())
     expected = max_mod.psdict
     # Named max_table so the builtin max() is not shadowed.
     max_table = idxmax.max()
     assert max_table is not None
     actual = notNone(max_table.last()).to_dict()
     self.compare(expected, actual)
 def test_bisect2(self) -> None:
     """Check ``Bisect(op=">")`` with limit 0.5 on stirred data.

     The Bisect output index must equal the ids for which ``_1>0.5`` holds
     in the Stirrer's table.
     """
     sched = self.scheduler()
     source = RandomTable(2, rows=100_000, scheduler=sched)
     stirrer = Stirrer(
         update_column="_1",
         delete_rows=100,
         # update_rows=5,
         # fixed_step_size=100,
         scheduler=sched,
     )
     stirrer.input[0] = source.output.result
     # NOTE(review): the dshape declares "value" as string while the data is
     # the float 0.5. Confirm this is intended.
     limit_tbl = Table(name=None, dshape="{value: string}", data={"value": [0.5]})
     limit = Constant(table=limit_tbl, scheduler=sched)
     index = HistogramIndex(column="_1", scheduler=sched)
     index.create_dependent_modules(stirrer, "result")
     bisect_mod = Bisect(
         column="_1", op=">", hist_index=index, scheduler=sched)
     bisect_mod.input[0] = index.output.result
     bisect_mod.input.limit = limit.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = bisect_mod.output.result
     aio.run(sched.start())
     expected_ids = stirrer.table.eval("_1>0.5", result_object="index")
     self.assertEqual(bisect_mod.table.index, bitmap(expected_ids))
 def test_scatterplot2(self):
     """Check that the scatter plot's sampled points stay inside the bounds.

     After the run, every x in the JSON ``sample`` data must lie in
     [LOWER_X, UPPER_X] and every y in [LOWER_Y, UPPER_Y].
     """
     sched = self.scheduler()
     source = RandomTable(2, rows=2000000, scheduler=sched)
     plot = MCScatterPlot(scheduler=sched,
                          classes=[('Scatterplot', '_1', '_2')],
                          approximate=True)
     plot.create_dependent_modules(source, 'table', with_sampling=False)
     counter = Every(proc=self.terse, constant_time=True, scheduler=sched)
     counter.input.df = source.output.table
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input.df = plot.output.table
     for patch in (VariablePatch1("variable_1"),
                   VariablePatch2("variable_2"),
                   ScatterPlotPatch("mc_scatter_plot_1")):
         decorate(sched, patch)
     plot.scheduler().start(idle_proc=idle_proc)
     sched.join()
     payload = plot.to_json()
     xs, ys, _ = zip(*payload['sample']['data'])
     lowest_x, highest_x = min(xs), max(xs)
     lowest_y, highest_y = min(ys), max(ys)
     self.assertGreaterEqual(lowest_x, LOWER_X)
     self.assertGreaterEqual(lowest_y, LOWER_Y)
     self.assertLessEqual(highest_x, UPPER_X)
     self.assertLessEqual(highest_y, UPPER_Y)
    def t_mix_ufunc_table_dict_impl(self, cls: Type[MixUfuncABC]) -> None:
        """Check a mixed-ufunc module ``cls`` fed by a dict and a table.

        The ``first`` input is a RandomDict and the ``second`` a RandomTable.
        Output column 0 must equal ``np.add`` of the dict's value at index 1
        and table column index 2. Output column 1 must equal ``np.log`` of
        table column index 2.
        """
        sched = self.scheduler()
        dict_src = RandomDict(10, scheduler=sched)
        table_src = RandomTable(10, rows=100000, scheduler=sched)
        selected = {
            "first": ["_1", "_2", "_3"],
            "second": ["_1", "_2", "_3"],
        }
        mixer = cls(columns=selected, scheduler=sched)

        mixer.input.first = dict_src.output.result
        mixer.input.second = table_src.output.result
        printer = Print(proc=self.terse, scheduler=sched)
        printer.input[0] = mixer.output.result
        aio.run(sched.start())
        dict_values = list(dict_src.psdict.values())
        dict_val = dict_values[1]
        _ = dict_values[2]
        table_arr = table_src.table.to_array()
        _ = table_arr[:, 1]
        table_col = table_arr[:, 2]
        expected_0 = np.add(dict_val, table_col)
        expected_1 = np.log(table_col)
        actual = mixer.table.to_array()
        self.assertTrue(np.allclose(actual[:, 0], expected_0, equal_nan=True))
        self.assertTrue(np.allclose(actual[:, 1], expected_1, equal_nan=True))
Exemple #23
0
 def test_paste(self) -> None:
     """Paste the minima of ``_1`` and ``_2`` and compare with the table min.

     Each Min output goes through a Dict2Table before reaching Paste. The
     last pasted row must match ``random.table.min()`` on both columns.
     """
     sched = self.scheduler()
     source = RandomTable(10, rows=10000, scheduler=sched)
     first_min = Min(
         name="min_1" + str(hash(source)), scheduler=sched, columns=["_1"])
     first_min.input[0] = source.output.result
     first_as_table = Dict2Table(scheduler=sched)
     first_as_table.input.dict_ = first_min.output.result
     second_min = Min(
         name="min_2" + str(hash(source)), scheduler=sched, columns=["_2"])
     second_min.input[0] = source.output.result
     second_as_table = Dict2Table(scheduler=sched)
     second_as_table.input.dict_ = second_min.output.result
     paste = Paste(scheduler=sched)
     paste.input.first = first_as_table.output.result
     paste.input.second = second_as_table.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = paste.output.result
     aio.run(sched.start())
     expected = source.table.min()
     actual = notNone(paste.table.last()).to_dict()
     for column in ("_1", "_2"):
         self.assertAlmostEqual(expected[column], actual[column])
 def test_binary3(self) -> None:
     """Check ``Binary(np.add)`` with a table and a dict on chosen columns.

     Columns ``_3, _5, _7`` of the table are added to entries
     ``_4, _6, _8`` of the dict. The expected value uses positional indices
     ``[2, 4, 6]`` and ``[3, 5, 7]``.
     """
     sched = self.scheduler()
     n_cols = 10
     table_src = RandomTable(n_cols, rows=100_000, scheduler=sched)
     dict_src = RandomDict(n_cols, scheduler=sched)
     selected = {
         "first": ["_3", "_5", "_7"],
         "second": ["_4", "_6", "_8"],
     }
     adder = Binary(np.add, columns=selected, scheduler=sched)
     adder.input.first = table_src.output.result
     adder.input.second = dict_src.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = adder.output.result
     aio.run(sched.start())
     table_part = table_src.table.to_array()[:, [2, 4, 6]]
     dict_part = np.array(list(dict_src.psdict.values()))[[3, 5, 7]]
     expected = np.add(table_part, dict_part)
     actual = adder.table.to_array()
     self.assertTrue(adder.name.startswith("binary_"))
     self.assertTrue(np.allclose(expected, actual, equal_nan=True))
 def t_histogram2d_impl(self, **kw: Any) -> None:
     """Check Histogram2D on stirred data against ``fh.histogram2d``.

     ``kw`` is forwarded to the Stirrer constructor. The last histogram
     produced by the module must have the same total and the same cells as
     a reference histogram computed on the Stirrer's ``_1``/``_2`` columns
     with the module's own bounds and bin counts.
     """
     s = self.scheduler()
     random = RandomTable(3, rows=100000, scheduler=s)
     stirrer = Stirrer(update_column="_2", fixed_step_size=1000, scheduler=s, **kw)
     stirrer.input[0] = random.output.result
     min_ = Min(scheduler=s)
     min_.input[0] = stirrer.output.result
     max_ = Max(scheduler=s)
     max_.input[0] = stirrer.output.result
     # NOTE(review): the original comment here read "columns are called
     # 1..30", but the table above has 3 columns. It looks stale; the first
     # two positional arguments are presumably the x and y column indices.
     histogram2d = Histogram2D(
         0, 1, xbins=100, ybins=100, scheduler=s
     )
     histogram2d.input[0] = stirrer.output.result
     histogram2d.input.min = min_.output.result
     histogram2d.input.max = max_.output.result
     heatmap = Heatmap(filename="histo_%03d.png", scheduler=s)
     heatmap.input.array = histogram2d.output.result
     pr = Every(proc=self.terse, scheduler=s)
     pr.input[0] = heatmap.output.result
     aio.run(s.start())
     last = notNone(histogram2d.table.last()).to_dict()
     h1 = last["array"]
     # Reference bounds and bins are given y first, then x, matching the
     # (v[:, 1], v[:, 0]) argument order used below.
     bounds = [[last["ymin"], last["ymax"]], [last["xmin"], last["xmax"]]]
     t = stirrer.table.loc[:, ["_1", "_2"]]
     assert t is not None
     v = t.to_array()
     bins = [histogram2d.params.ybins, histogram2d.params.xbins]
     h2 = fh.histogram2d(v[:, 1], v[:, 0], bins=bins, range=bounds)
     # The reference is flipped along axis 0 before comparison; presumably
     # the module stores rows in the opposite vertical order. TODO confirm.
     h2 = np.flip(h2, axis=0)  # type: ignore
     self.assertEqual(np.sum(h1), np.sum(h2))
     self.assertListEqual(h1.reshape(-1).tolist(), h2.reshape(-1).tolist())
 def test_hist_index_min_max(self) -> None:
     """Test min_out and max_out on HistogramIndex.

     The whole dataflow is built inside ``with s:``. Min and Max modules
     fed by the index's ``min_out`` / ``max_out`` slots must match the
     minimum and maximum of ``_1`` in the random table.
     """
     sched = self.scheduler()
     with sched:
         source = RandomTable(2, rows=100000, scheduler=sched)
         lower = Constant(table=PsDict({"_1": 0.3}), scheduler=sched)
         upper = Constant(table=PsDict({"_1": 0.8}), scheduler=sched)
         query = RangeQuery(column="_1", scheduler=sched)
         query.create_dependent_modules(
             source, "result", min_value=lower, max_value=upper
         )
         query_printer = Print(proc=self.terse, scheduler=sched)
         query_printer.input[0] = query.output.result
         index = query.hist_index
         assert index is not None
         min_mod = Min(name="min_" + str(hash(index)), scheduler=sched)
         min_mod.input[0] = index.output.min_out
         min_printer = Print(proc=self.terse, scheduler=sched)
         min_printer.input[0] = min_mod.output.result
         max_mod = Max(name="max_" + str(hash(index)), scheduler=sched)
         max_mod.input[0] = index.output.max_out
         max_printer = Print(proc=self.terse, scheduler=sched)
         max_printer.input[0] = max_mod.output.result
     aio.run(sched.start())
     # Minimum first, then maximum, each compared on column "_1".
     expected_min = cast(float, source.table.min()["_1"])
     actual_min = cast(float, min_mod.psdict["_1"])
     self.assertAlmostEqual(expected_min, actual_min)
     expected_max = cast(float, source.table.max()["_1"])
     actual_max = cast(float, max_mod.psdict["_1"])
     self.assertAlmostEqual(expected_max, actual_max)
 def test_ldexp(self) -> None:
     """Check ColsLdexp against ``np.ldexp`` on selected columns.

     Columns ``_3, _5, _7`` are combined with ``_4, _6, _8`` of one int64
     random table. The output columns must be named ``x, y, z``.
     """
     cls, ufunc, mod_name = ColsLdexp, np.ldexp, "cols_ldexp_"
     print("Testing", mod_name)
     sched = self.scheduler()
     n_cols = 10
     source = RandomTable(
         n_cols,
         rows=10_000,
         scheduler=sched,
         random=lambda x: np.random.randint(10, size=x),  # type: ignore
         dtype="int64",
     )
     ldexp_mod = cls(
         first=["_3", "_5", "_7"],
         second=["_4", "_6", "_8"],
         cols_out=["x", "y", "z"],
         scheduler=sched,
     )
     ldexp_mod.input[0] = source.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = ldexp_mod.output.result
     aio.run(sched.start())
     self.assertListEqual(ldexp_mod.table.columns, ["x", "y", "z"])
     data = source.table.to_array()
     expected = ufunc(data[:, [2, 4, 6]], data[:, [3, 5, 7]])
     actual = ldexp_mod.table.to_array()
     self.assertTrue(ldexp_mod.name.startswith(mod_name))
     self.assertTrue(np.allclose(expected, actual, equal_nan=True))
Exemple #28
0
 def test_intersection(self) -> None:
     """Intersect ``_1 > 0.3`` and ``_1 < 0.8`` and check the resulting index.

     Two Bisect modules share one HistogramIndex. The Intersection of their
     outputs must hold the same ids as evaluating ``(_1>0.3)&(_1<0.8)`` on
     the index's input data.
     """
     sched = self.scheduler()
     source = RandomTable(2, rows=100000, scheduler=sched)
     lower_tbl = Table(name=None, dshape="{_1: float64}", data={"_1": [0.3]})
     lower = Constant(table=lower_tbl, scheduler=sched)
     upper_tbl = Table(name=None, dshape="{_1: float64}", data={"_1": [0.8]})
     upper = Constant(table=upper_tbl, scheduler=sched)
     index = HistogramIndex(column="_1", scheduler=sched)
     index.create_dependent_modules(source, "result")
     above = Bisect(
         column="_1", op=">", hist_index=index, scheduler=sched)
     above.input[0] = index.output.result
     above.input.limit = lower.output.result
     below = Bisect(
         column="_1", op="<", hist_index=index, scheduler=sched)
     below.input[0] = index.output.result
     below.input.limit = upper.output.result
     inter = Intersection(scheduler=sched)
     # NOTE(review): input[0] is assigned twice; presumably the slot accepts
     # several connections. Confirm against Intersection.
     inter.input[0] = above.output.result
     inter.input[0] = below.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = inter.output.result
     aio.run(sched.start())
     assert index.input_module is not None
     indexed_data = index.input_module.output["result"].data()
     expected_ids = indexed_data.eval(
         "(_1>0.3)&(_1<0.8)", result_object="index")
     self.assertEqual(inter.table.index, bitmap(expected_ids))
Exemple #29
0
 def test_hub_if_else(self):
     """Route stirred data through a Switch whose condition is always False.

     Max reads the Switch's ``result`` output and Min its ``result_else``
     output. Both feed a Hub, whose result must match the Stirrer's
     ``min()``.
     """
     sched = Scheduler()
     source = RandomTable(2, rows=100000, scheduler=sched)
     stirrer = Stirrer(
         update_column="_1",
         delete_rows=5,
         update_rows=5,
         fixed_step_size=100,
         scheduler=sched,
     )
     stirrer.input[0] = source.output.result
     switch = Switch(condition=lambda x: False, scheduler=sched)
     switch.input[0] = stirrer.output.result
     max_mod = Max(name="max_" + str(hash(source)), scheduler=sched)
     max_mod.input[0] = switch.output.result
     min_mod = Min(name="min_" + str(hash(source)), scheduler=sched)
     min_mod.input[0] = switch.output.result_else
     hub = Hub(scheduler=sched)
     # Min is connected first, then Max, on the same input slot.
     hub.input.table = min_mod.output.result
     hub.input.table = max_mod.output.result
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input[0] = hub.output.result
     aio.run(sched.start())
     expected = stirrer.result.min()
     actual = hub.result
     self.compare(expected, actual)
 def setUpStep(self, step):
     """Build a 10-column random table of ``step * L`` rows as a DataFrame.

     The result is stored in ``self.random_table``.
     """
     self.set_step_info("{} rows".format(step * L))
     sched = Scheduler()
     source = RandomTable(10, rows=step * L, scheduler=sched)
     sched.start()
     produced = source.output.table.output_module.table()
     self.random_table = pd.DataFrame(produced.to_dict())
Exemple #31
0
 def test_bin_join(self):
     """Join the minima of ``_1`` and ``_2`` and compare with the table min."""
     sched = self.scheduler()
     source = RandomTable(10, rows=10000, scheduler=sched)
     first_min = Min(
         name='min_1' + str(hash(source)), scheduler=sched, columns=['_1'])
     first_min.input.table = source.output.table
     second_min = Min(
         name='min_2' + str(hash(source)), scheduler=sched, columns=['_2'])
     second_min.input.table = source.output.table
     join = BinJoin(scheduler=sched)
     join.input.first = first_min.output.table
     join.input.second = second_min.output.table
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input.df = join.output.table
     sched.start()
     sched.join()
     expected = source.table().min()
     actual = join.table().last().to_dict()
     for column in ('_1', '_2'):
         self.assertAlmostEqual(expected[column], actual[column])
 def test_var(self):
     """Check the Var module against the table's ``var(ddof=1)``.

     Both sides are converted to float arrays before comparison.
     """
     sched = self.scheduler()
     source = RandomTable(1, rows=1000, scheduler=sched)
     var_mod = Var(scheduler=sched)
     var_mod.input.table = source.output.table
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input.df = var_mod.output.table
     sched.start()
     sched.join()
     table_variance = source.table().var(ddof=1)
     res1 = np.array(list(map(float, table_variance.values())))
     module_row = var_mod.table().last().to_dict(ordered=True)
     res2 = np.array(list(map(float, module_row.values())))
     print('res1:', res1)
     print('res2:', res2)
     self.assertTrue(np.allclose(res1, res2))
 def p10s_random_min_max(n):
     """Run Min and Max over a 10-column random table of ``n * L`` rows.

     Sets ``StorageEngine.default`` to ``"hdf5"`` before building the
     dataflow.
     """
     StorageEngine.default = "hdf5"
     sched = Scheduler()
     source = RandomTable(10, rows=n * L, scheduler=sched)
     suffix = str(hash(source))
     min_mod = Min(name='min_' + suffix, scheduler=sched)
     min_mod.input.table = source.output.table
     max_mod = Max(name='max_' + suffix, scheduler=sched)
     max_mod.input.table = source.output.table
     sched.start()
 def p10s_random_min_max(self):
     """Run Min and Max over a random table of ``self.current_step * L`` rows.

     Sets ``StorageEngine.default`` to ``"hdf5"`` before building the
     dataflow.
     """
     step = self.current_step
     StorageEngine.default = "hdf5"
     sched = Scheduler()
     source = RandomTable(10, rows=step * L, scheduler=sched)
     suffix = str(hash(source))
     # NOTE(review): Min is given `mid=` while Max is given `id=`. One of the
     # two keywords may be a typo; confirm against the Module constructor.
     min_mod = Min(mid='min_' + suffix, scheduler=sched)
     min_mod.input.table = source.output.table
     max_mod = Max(id='max_' + suffix, scheduler=sched)
     max_mod.input.table = source.output.table
     sched.start()
from progressivis import Scheduler, Every
from progressivis.stats import RandomTable, Min, Max
from progressivis.vis import Histograms

#log_level()

# Reuse a `scheduler` already defined in this namespace (presumably injected
# by whatever runs this example); otherwise create a standard one.
# The fallback message belongs in the NameError branch: it used to be
# printed when a scheduler WAS defined.
try:
    s = scheduler
except NameError:
    print('No scheduler defined, using the standard one')
    s = Scheduler()

# Dataflow: random table -> (min, max) -> histograms -> every.
# NOTE: `min` and `max` shadow the builtins at module level. The names are
# kept because code importing this example may refer to them.
csv = RandomTable(columns=['a', 'b', 'c'], rows=1000000, throttle=1000, scheduler=s)
min = Min(scheduler=s)
min.input.df = csv.output.df
max = Max(scheduler=s)
max.input.df = csv.output.df
histograms = Histograms(scheduler=s)
histograms.input.df = csv.output.df
histograms.input.min = min.output.df
histograms.input.max = max.output.df
prlen = Every(scheduler=s)
prlen.input.df = histograms.output.df

if __name__ == '__main__':
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Starting")
    csv.start()
from progressivis import Scheduler, Print
from progressivis.cluster import MBKMeans
from progressivis.stats import RandomTable
from progressivis.vis import ScatterPlot

# Reuse a `scheduler` already defined in this namespace (presumably injected
# by whatever runs this example); otherwise create a standard one.
# Only NameError is caught: a bare `except:` would also swallow
# KeyboardInterrupt and SystemExit.
try:
    s = scheduler
except NameError:
    s = Scheduler()

# Dataflow: random two-column table -> mini-batch k-means -> print.
table = RandomTable(columns=['a', 'b'], rows=50000, throttle=500, scheduler=s)
mbkmeans = MBKMeans(columns=['a', 'b'], n_clusters=8, batch_size=100, is_input=False, scheduler=s)
mbkmeans.input.df = table.output.df
prn = Print(scheduler=s)
prn.input.df = mbkmeans.output.df
# Scatter plot of the clustering output, currently disabled:
#sp = ScatterPlot('a', 'b')
#sp.create_dependent_modules(mbkmeans,'df')

if __name__ == '__main__':
    table.start()