def test_combine_first_nan(self):
     s = self.scheduler()
     cst1 = Constant(Table(name='tcf_xmin_xmax_nan',
                           data=pd.DataFrame({
                               'xmin': [1],
                               'xmax': [2]
                           }),
                           create=True),
                     scheduler=s)
     cst2 = Constant(Table(name='tcf_ymin_ymax_nan',
                           data=pd.DataFrame({
                               'ymin': [np.nan],
                               'ymax': [np.nan]
                           }),
                           create=True),
                     scheduler=s)
     cst3 = Constant(Table(name='tcf_ymin_ymax2_nan',
                           data=pd.DataFrame({
                               'ymin': [3],
                               'ymax': [4]
                           }),
                           create=True),
                     scheduler=s)
     cf = CombineFirst(scheduler=s)
     cf.input.table = cst1.output.table
     cf.input.table = cst2.output.table
     cf.input.table = cst3.output.table
     pr = Print(proc=self.terse, scheduler=s)
     pr.input.df = cf.output.table
     s.start()
     s.join()
     df = cf.table()
     last = df.last().to_dict()
     self.assertTrue(last['xmin']==1 and last['xmax']==2 and \
                     last['ymin']==3 and last['ymax']==4)
Example #2
 def test_merge_simple(self):
     s = self.scheduler()
     cst1 = Constant(Table(name=None,
                           data=pd.DataFrame({
                               'xmin': [1],
                               'xmax': [2]
                           })),
                     scheduler=s)
     cst2 = Constant(Table(name=None,
                           data=pd.DataFrame({
                               'ymin': [3],
                               'ymax': [4]
                           })),
                     scheduler=s)
     merge = Merge(left_index=True, right_index=True, scheduler=s)
     merge.input.table = cst1.output.table
     merge.input.table = cst2.output.table
     pr = Print(proc=self.terse, scheduler=s)
     pr.input.df = merge.output.table
     s.start()
     s.join()
     res = merge.trace_stats(max_runs=1)
     #pd.set_option('display.expand_frame_repr', False)
     #print(res)
     df = merge.table()
     last = df.loc[df.index[-1]]
     self.assertTrue(last['xmin']==1 and last['xmax']==2 and \
                     last['ymin']==3 and last['ymax']==4)
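# Side note (not part of the test above): the index-aligned merge asserted here
# mirrors pandas.merge with left_index=True / right_index=True; a minimal
# sketch of the expected single-row result, using the same toy data:
import pandas as pd

left = pd.DataFrame({'xmin': [1], 'xmax': [2]})
right = pd.DataFrame({'ymin': [3], 'ymax': [4]})

# Rows are matched on the shared index (0), yielding one row carrying all four
# columns: xmin=1, xmax=2, ymin=3, ymax=4.
merged = pd.merge(left, right, left_index=True, right_index=True)
print(merged.loc[merged.index[-1]].to_dict())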
Example #3
 def test_hist_index_min_max(self):
     "Test min_out and max_out on HistogramIndex"
     s = self.scheduler()
     random = RandomTable(2, rows=100000, scheduler=s)
     t_min = Table(name=None, dshape='{_1: float64}', data={'_1': [0.3]})
     min_value = Constant(table=t_min, scheduler=s)
     t_max = Table(name=None, dshape='{_1: float64}', data={'_1': [0.8]})
     max_value = Constant(table=t_max, scheduler=s)
     range_qry = RangeQuery(column='_1', scheduler=s)
     range_qry.create_dependent_modules(random,
                                        'table',
                                        min_value=min_value,
                                        max_value=max_value)
     prt = Print(proc=self.terse, scheduler=s)
     prt.input.df = range_qry.output.table
     hist_index = range_qry.hist_index
     min_ = Min(name='min_' + str(hash(hist_index)), scheduler=s)
     min_.input.table = hist_index.output.min_out
     prt2 = Print(proc=self.terse, scheduler=s)
     prt2.input.df = min_.output.table
     max_ = Max(name='max_' + str(hash(hist_index)), scheduler=s)
     max_.input.table = hist_index.output.max_out
     pr3 = Print(proc=self.terse, scheduler=s)
     pr3.input.df = max_.output.table
     s.start()
     s.join()
     res1 = random.table().min()['_1']
     res2 = min_.table().last().to_dict()['_1']
     self.assertAlmostEqual(res1, res2)
     res1 = random.table().max()['_1']
     res2 = max_.table().last().to_dict()['_1']
     self.assertAlmostEqual(res1, res2)
 def test_join_simple(self):
     s = self.scheduler()
     cst1 = Constant(Table(name='test_join_simple_cst1',
                           data=pd.DataFrame({
                               'xmin': [1],
                               'xmax': [2]
                           }),
                           create=True),
                     scheduler=s)
     cst2 = Constant(Table(name='test_join_simple_cst2',
                           data=pd.DataFrame({
                               'ymin': [3],
                               'ymax': [4]
                           }),
                           create=True),
                     scheduler=s)
     reduce_ = Reduce(BinJoin, "first", "second", "table", scheduler=s)
     reduce_.input.table = cst1.output.table
     reduce_.input.table = cst2.output.table
     join = reduce_.expand()
     # join = BinJoin(scheduler=s)
     # join.input.first = cst1.output.table
     # join.input.second = cst2.output.table
     pr = Print(proc=self.terse, scheduler=s)
     pr.input.df = join.output.table
     s.start()
     s.join()
     res = join.trace_stats(max_runs=1)
     print(res)
     df = join.table()
     last = df.loc[df.index[-1]]
     self.assertTrue(last['xmin'] == 1 and last['xmax'] == 2 and \
                     last['ymin'] == 3 and last['ymax'] == 4)
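# Side note (not part of the test above): the expected outcome of the binary
# join chain can be sketched with pandas' index-based DataFrame.join; this is
# an analogy only, not how BinJoin/Reduce work internally.
import pandas as pd

first = pd.DataFrame({'xmin': [1], 'xmax': [2]})
second = pd.DataFrame({'ymin': [3], 'ymax': [4]})

# join() aligns on the index, so the single shared row 0 ends up with all
# columns: xmin=1, xmax=2, ymin=3, ymax=4, as the assertion above expects.
joined = first.join(second)
print(joined.loc[joined.index[-1]].to_dict())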
def _2_csv_2_const_scenario(module: Module,
                            s: Scheduler) -> Callable[[Scheduler, int], None]:
    csv_a = CSVLoader(get_dataset("smallfile"),
                      index_col=False,
                      header=None,
                      scheduler=s)
    csv_b = CSVLoader(get_dataset("smallfile"),
                      index_col=False,
                      header=None,
                      scheduler=s)
    table_c = Table("const_c_2_csv_2_const_scenario",
                    dshape="{a: int}",
                    create=True)
    const_c = Constant(table=table_c, scheduler=s)
    table_d = Table("const_d_2_csv_2_const_scenario",
                    dshape="{a: int}",
                    create=True)
    const_d = Constant(table=table_d, scheduler=s)
    module.input.a = csv_a.output.result
    module.input.b = csv_b.output.result
    module.input.c = const_c.output.result
    module.input.d = const_d.output.result

    def _fun(s: Scheduler, r: int) -> None:
        if r > 10:
            s.task_stop()

    return _fun
Example #6
 def test_intersection(self) -> None:
     s = self.scheduler()
     random = RandomTable(2, rows=100000, scheduler=s)
     t_min = Table(name=None, dshape="{_1: float64}", data={"_1": [0.3]})
     min_value = Constant(table=t_min, scheduler=s)
     t_max = Table(name=None, dshape="{_1: float64}", data={"_1": [0.8]})
     max_value = Constant(table=t_max, scheduler=s)
     hist_index = HistogramIndex(column="_1", scheduler=s)
     hist_index.create_dependent_modules(random, "result")
     bisect_min = Bisect(column="_1",
                         op=">",
                         hist_index=hist_index,
                         scheduler=s)
     bisect_min.input[0] = hist_index.output.result
     # bisect_.input[0] = random.output.result
     bisect_min.input.limit = min_value.output.result
     bisect_max = Bisect(column="_1",
                         op="<",
                         hist_index=hist_index,
                         scheduler=s)
     bisect_max.input[0] = hist_index.output.result
     # bisect_.input[0] = random.output.result
     bisect_max.input.limit = max_value.output.result
     inter = Intersection(scheduler=s)
     inter.input[0] = bisect_min.output.result
     inter.input[0] = bisect_max.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = inter.output.result
     aio.run(s.start())
     assert hist_index.input_module is not None
     idx = (hist_index.input_module.output["result"].data().eval(
         "(_1>0.3)&(_1<0.8)", result_object="index"))
     self.assertEqual(inter.table.index, bitmap(idx))
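# Side note (not part of the test above): the reference expression
# "(_1>0.3)&(_1<0.8)" is an ordinary element-wise range filter. A minimal numpy
# sketch of the indices it keeps (the sample values below are made up):
import numpy as np

values = np.array([0.1, 0.35, 0.5, 0.79, 0.9])
mask = (values > 0.3) & (values < 0.8)
# np.nonzero returns the positions retained by the intersection of the two
# one-sided bisections, here rows 1, 2 and 3.
print(np.nonzero(mask)[0])  # -> [1 2 3]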
Example #7
    def test_intersection(self):
        s = self.scheduler()
        random = RandomTable(2, rows=100000, scheduler=s)
        t_min = Table(name=None, dshape='{_1: float64}', data={'_1': [0.3]})
        min_value = Constant(table=t_min, scheduler=s)
        t_max = Table(name=None, dshape='{_1: float64}', data={'_1': [0.8]})
        max_value = Constant(table=t_max, scheduler=s)
        hist_index = HistogramIndex(column='_1', scheduler=s)
        hist_index.create_dependent_modules(random, 'table')
        bisect_min = Bisect(column='_1',
                            op='>',
                            hist_index=hist_index,
                            scheduler=s)
        bisect_min.input.table = hist_index.output.table
        #bisect_.input.table = random.output.table
        bisect_min.input.limit = min_value.output.table

        bisect_max = Bisect(column='_1',
                            op='<',
                            hist_index=hist_index,
                            scheduler=s)
        bisect_max.input.table = hist_index.output.table
        #bisect_.input.table = random.output.table
        bisect_max.input.limit = max_value.output.table
        inter = Intersection(scheduler=s)
        inter.input.table = bisect_min.output.table
        inter.input.table = bisect_max.output.table
        pr = Print(proc=self.terse, scheduler=s)
        pr.input.df = inter.output.table
        s.start()
        s.join()
        idx = hist_index.input_module.output['table']\
          .data().eval('(_1>0.3)&(_1<0.8)', result_object='index')
        self.assertEqual(inter.table().selection, bitmap(idx))
Example #8
 def test_last_row_simple(self):
     s = self.scheduler()
     t1 = Table(name=get_random_name("cst1"),
                data={
                    'xmin': [1],
                    'xmax': [2]
                })
     t2 = Table(name=get_random_name("cst2"),
                data={
                    'ymin': [3],
                    'ymax': [4]
                })
     cst1 = Constant(t1, scheduler=s)
     cst2 = Constant(t2, scheduler=s)
     join = Join(scheduler=s)
     join.input.table = cst1.output.table
     join.input.table = cst2.output.table
     pr = Print(proc=self.terse, scheduler=s)
     pr.input.df = join.output.table
     s.start()
     s.join()
     #res = join.trace_stats(max_runs=1)
     #pd.set_option('display.expand_frame_repr', False)
     #print(res)
     df = join.table()
     last = df.last()
     self.assertTrue(last['xmin']==1 and last['xmax']==2 and \
                     last['ymin']==3 and last['ymax']==4)
 def test_hist_index_min_max(self) -> None:
     "Test min_out and max_out on HistogramIndex"
     s = self.scheduler()
     with s:
         random = RandomTable(2, rows=100000, scheduler=s)
         t_min = PsDict({"_1": 0.3})
         min_value = Constant(table=t_min, scheduler=s)
         t_max = PsDict({"_1": 0.8})
         max_value = Constant(table=t_max, scheduler=s)
         range_qry = RangeQuery(column="_1", scheduler=s)
         range_qry.create_dependent_modules(
             random, "result", min_value=min_value, max_value=max_value
         )
         prt = Print(proc=self.terse, scheduler=s)
         prt.input[0] = range_qry.output.result
         hist_index = range_qry.hist_index
         assert hist_index is not None
         min_ = Min(name="min_" + str(hash(hist_index)), scheduler=s)
         min_.input[0] = hist_index.output.min_out
         prt2 = Print(proc=self.terse, scheduler=s)
         prt2.input[0] = min_.output.result
         max_ = Max(name="max_" + str(hash(hist_index)), scheduler=s)
         max_.input[0] = hist_index.output.max_out
         pr3 = Print(proc=self.terse, scheduler=s)
         pr3.input[0] = max_.output.result
     aio.run(s.start())
     res1 = cast(float, random.table.min()["_1"])
     res2 = cast(float, min_.psdict["_1"])
     self.assertAlmostEqual(res1, res2)
     res1 = cast(float, random.table.max()["_1"])
     res2 = cast(float, max_.psdict["_1"])
     self.assertAlmostEqual(res1, res2)
Example #10
 def test_merge_simple(self) -> None:
     s = self.scheduler()
     cst1 = Constant(Table(name=None,
                           data=pd.DataFrame({
                               "xmin": [1],
                               "xmax": [2]
                           })),
                     scheduler=s)
     cst2 = Constant(Table(name=None,
                           data=pd.DataFrame({
                               "ymin": [3],
                               "ymax": [4]
                           })),
                     scheduler=s)
     merge = Merge(left_index=True, right_index=True, scheduler=s)
     merge.input[0] = cst1.output.result
     merge.input[0] = cst2.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = merge.output.result
     aio.run(s.start())
     _ = merge.trace_stats(max_runs=1)
     # pd.set_option('display.expand_frame_repr', False)
     # print(res)
     df = merge.table
     last = df.loc[df.index[-1]]
     assert last is not None
     self.assertTrue(last["xmin"] == 1 and last["xmax"] == 2
                     and last["ymin"] == 3 and last["ymax"] == 4)
 def test_join_simple(self) -> None:
     s = self.scheduler()
     cst1 = Constant(
         Table(
             name="test_join_simple_cst1",
             data=pd.DataFrame({"xmin": [1], "xmax": [2]}),
             create=True,
         ),
         scheduler=s,
     )
     cst2 = Constant(
         Table(
             name="test_join_simple_cst2",
             data=pd.DataFrame({"ymin": [3], "ymax": [4]}),
             create=True,
         ),
         scheduler=s,
     )
     cst3 = Constant(
         Table(
             name="test_join_simple_cst3",
             data=pd.DataFrame({"zmin": [5], "zmax": [6]}),
             create=True,
         ),
         scheduler=s,
     )
     # join=Join(scheduler=s)
     # reduce_ = Reduce(BinJoin, "first", "second", "table", scheduler=s)
     # reduce_.input[0] = cst1.output.result
     # reduce_.input[0] = cst2.output.result
     # reduce_.input[0] = cst3.output.result
     # join = reduce_.expand()
     join = Reduce.expand(
         BinJoin,
         "first",
         "second",
         "result",
         [cst1.output.result, cst2.output.result, cst3.output.result],
         scheduler=s,
     )
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = join.output.result
     aio.run(s.start())
     res = join.trace_stats(max_runs=1)
     print(res)
     df = join.table
     last = df.loc[df.index[-1]]
     assert last is not None
     self.assertTrue(
         last["xmin"] == 1
         and last["xmax"] == 2
         and last["ymin"] == 3
         and last["ymax"] == 4
         and last["zmin"] == 5
         and last["zmax"] == 6
     )
    def _impl_stirred_tst_percentiles_rq(self, accuracy: float,
                                         **kw: Any) -> None:
        """ """
        s = self.scheduler()
        with s:
            random = RandomTable(2, rows=10000, scheduler=s)
            stirrer = Stirrer(update_column="_2",
                              fixed_step_size=1000,
                              scheduler=s,
                              **kw)
            stirrer.input[0] = random.output.result
            t_min = PsDict({"_1": 0.3})
            min_value = Constant(table=t_min, scheduler=s)
            t_max = PsDict({"_1": 0.8})
            max_value = Constant(table=t_max, scheduler=s)
            range_qry = RangeQuery(column="_1", scheduler=s)
            range_qry.create_dependent_modules(stirrer,
                                               "result",
                                               min_value=min_value,
                                               max_value=max_value)

            hist_index = range_qry.hist_index
            assert hist_index
            t_percentiles = PsDict({"_25": 25.0, "_50": 50.0, "_75": 75.0})
            which_percentiles = Constant(table=t_percentiles, scheduler=s)
            percentiles = Percentiles(accuracy=accuracy, scheduler=s)
            percentiles.input[0] = range_qry.output.result
            percentiles.input.percentiles = which_percentiles.output.result
            percentiles.input.hist = hist_index.output.result
            prt = Print(proc=self.terse, scheduler=s)
            prt.input[0] = percentiles.output.result
        aio.run(s.start())
        pdict = notNone(percentiles.table.last()).to_dict()
        v = range_qry.table["_1"].values
        p25 = np.percentile(v, 25.0)  # type: ignore
        p50 = np.percentile(v, 50.0)  # type: ignore
        p75 = np.percentile(v, 75.0)  # type: ignore
        print(
            "TSV=> accuracy: ",
            accuracy,
            " 25:",
            p25,
            pdict["_25"],
            " 50:",
            p50,
            pdict["_50"],
            " 75:",
            p75,
            pdict["_75"],
        )
        self.assertAlmostEqual(p25, pdict["_25"], delta=0.01)
        self.assertAlmostEqual(p50, pdict["_50"], delta=0.01)
        self.assertAlmostEqual(p75, pdict["_75"], delta=0.01)
Example #13
 def _query_min_max_impl(self, random, t_min, t_max, s):
     min_value = Constant(table=t_min, scheduler=s)
     max_value = Constant(table=t_max, scheduler=s)
     range_qry = RangeQuery(column='_1', scheduler=s)
     range_qry.create_dependent_modules(random,
                                        'table',
                                        min_value=min_value,
                                        max_value=max_value)
     prt = Print(proc=self.terse, scheduler=s)
     prt.input.df = range_qry.output.table
     prt2 = Print(proc=self.terse, scheduler=s)
     prt2.input.df = range_qry.output.min
     pr3 = Print(proc=self.terse, scheduler=s)
     pr3.input.df = range_qry.output.max
     return range_qry
 def test_bisect2(self) -> None:
     s = self.scheduler()
     random = RandomTable(2, rows=100_000, scheduler=s)
     stirrer = Stirrer(
         update_column="_1",
         delete_rows=100,
         # update_rows=5,
         # fixed_step_size=100,
         scheduler=s,
     )
     stirrer.input[0] = random.output.result
     t = Table(name=None, dshape="{value: string}", data={"value": [0.5]})
     min_value = Constant(table=t, scheduler=s)
     hist_index = HistogramIndex(column="_1", scheduler=s)
     hist_index.create_dependent_modules(stirrer, "result")
     bisect_ = Bisect(column="_1",
                      op=">",
                      hist_index=hist_index,
                      scheduler=s)
     bisect_.input[0] = hist_index.output.result
     # bisect_.input[0] = random.output.result
     bisect_.input.limit = min_value.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = bisect_.output.result
     aio.run(s.start())
     idx = stirrer.table.eval("_1>0.5", result_object="index")
     self.assertEqual(bisect_.table.index, bitmap(idx))
Example #15
 def _impl_tst_percentiles(self, accuracy):
     """
     """
     s = self.scheduler()
     random = RandomTable(2, rows=10000, scheduler=s)
     hist_index = HistogramIndex(column='_1', scheduler=s)
     hist_index.input.table = random.output.table
     t_percentiles = Table(
         name=None,
         dshape='{_25: float64, _50: float64, _75: float64}',
         data={
             '_25': [25.0],
             '_50': [50.0],
             '_75': [75.0]
         })
     which_percentiles = Constant(table=t_percentiles, scheduler=s)
     percentiles = Percentiles(hist_index, accuracy=accuracy, scheduler=s)
     percentiles.input.table = random.output.table
     percentiles.input.percentiles = which_percentiles.output.table
     prt = Print(proc=self.terse, scheduler=s)
     prt.input.df = percentiles.output.table
     s.start()
     s.join()
     pdict = percentiles.table().last().to_dict()
     v = random.table()['_1'].values
     p25 = np.percentile(v, 25.0)
     p50 = np.percentile(v, 50.0)
     p75 = np.percentile(v, 75.0)
     print("Table=> accuracy: ", accuracy, " 25:", p25, pdict['_25'],
           " 50:", p50, pdict['_50'], " 75:", p75, pdict['_75'])
     self.assertAlmostEqual(p25, pdict['_25'], delta=0.01)
     self.assertAlmostEqual(p50, pdict['_50'], delta=0.01)
     self.assertAlmostEqual(p75, pdict['_75'], delta=0.01)
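# Side note (not part of the test above): np.percentile is the exact reference
# the approximate Percentiles module is compared against. A minimal sketch on
# synthetic data (the array below is made up):
import numpy as np

v = np.arange(1, 101, dtype=float)  # 1.0 .. 100.0
# With the default linear interpolation these evaluate to 25.75, 50.5 and 75.25.
print(np.percentile(v, 25.0), np.percentile(v, 50.0), np.percentile(v, 75.0))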
 def _query_min_max_impl(
     self, random: RandomTable, t_min: PsDict, t_max: PsDict, s: Scheduler
 ) -> RangeQuery:
     min_value = Constant(table=t_min, scheduler=s)
     max_value = Constant(table=t_max, scheduler=s)
     range_qry = RangeQuery(column="_1", scheduler=s)
     range_qry.create_dependent_modules(
         random, "result", min_value=min_value, max_value=max_value
     )
     prt = Print(proc=self.terse, scheduler=s)
     prt.input[0] = range_qry.output.result
     prt2 = Print(proc=self.terse, scheduler=s)
     prt2.input[0] = range_qry.output.min
     pr3 = Print(proc=self.terse, scheduler=s)
     pr3.input[0] = range_qry.output.max
     return range_qry
Example #17
 def test_06_read_multiple_csv_bz2_crash_recovery(self) -> None:
     p = Process(target=run_throttled_server, args=(8000, 10**6))
     p.start()
     self._http_proc = p
     time.sleep(SLEEP)
     s = self.scheduler()
     filenames = Table(
         name="file_names",
         dshape="{filename: string}",
         data={
             "filename": [
                 make_url("smallfile", ext=BZ2),
                 make_url("smallfile", ext=BZ2),
             ]
         },
     )
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False,
                     header=None,
                     scheduler=s,
                     timeout=0.01)
     csv.input.filenames = cst.output.result
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = csv.output.result
     aio.run(csv.start())
     _close(csv)
     self.assertEqual(len(csv.table), 60000)
def _4_const_scenario(module: Module,
                      s: Scheduler) -> Callable[[Scheduler, int], None]:
    table_ = Table("const_4_scenario", dshape="{a: int}", create=True)
    const_a = Constant(table=table_, scheduler=s)
    const_b = Constant(table=table_, scheduler=s)
    const_c = Constant(table=table_, scheduler=s)
    const_d = Constant(table=table_, scheduler=s)
    module.input.a = const_a.output.result
    module.input.b = const_b.output.result
    module.input.c = const_c.output.result
    module.input.d = const_d.output.result

    def _fun(s: Scheduler, r: int) -> None:
        if r > 10:
            s.task_stop()

    return _fun
Example #19
    def _impl_tst_percentiles_rq(self, accuracy):
        """
        """
        s = self.scheduler()
        random = RandomTable(2, rows=10000, scheduler=s)
        t_min = Table(name=None, dshape='{_1: float64}', data={'_1': [0.3]})
        min_value = Constant(table=t_min, scheduler=s)
        t_max = Table(name=None, dshape='{_1: float64}', data={'_1': [0.8]})
        max_value = Constant(table=t_max, scheduler=s)
        range_qry = RangeQuery(column='_1', scheduler=s)
        range_qry.create_dependent_modules(random,
                                           'table',
                                           min_value=min_value,
                                           max_value=max_value)

        hist_index = range_qry.hist_index
        t_percentiles = Table(
            name=None,
            dshape='{_25: float64, _50: float64, _75: float64}',
            data={
                '_25': [25.0],
                '_50': [50.0],
                '_75': [75.0]
            })
        which_percentiles = Constant(table=t_percentiles, scheduler=s)
        percentiles = Percentiles(hist_index, accuracy=accuracy, scheduler=s)
        percentiles.input.table = range_qry.output.table
        percentiles.input.percentiles = which_percentiles.output.table
        prt = Print(proc=self.terse, scheduler=s)
        prt.input.df = percentiles.output.table
        s.start()
        s.join()
        pdict = percentiles.table().last().to_dict()
        v = range_qry.table()['_1'].values
        p25 = np.percentile(v, 25.0)
        p50 = np.percentile(v, 50.0)
        p75 = np.percentile(v, 75.0)
        print("TSV=> accuracy: ", accuracy, " 25:", p25, pdict['_25'], " 50:",
              p50, pdict['_50'], " 75:", p75, pdict['_75'])
        self.assertAlmostEqual(p25, pdict['_25'], delta=0.01)
        self.assertAlmostEqual(p50, pdict['_50'], delta=0.01)
        self.assertAlmostEqual(p75, pdict['_75'], delta=0.01)
Example #20
 def test_range_query(self):
     "Run tests of the RangeQuery module"
     s = self.scheduler()
     random = RandomTable(2, rows=1000, scheduler=s)
     t_min = Table(name=None, dshape='{_1: float64}', data={'_1': [0.3]})
     min_value = Constant(table=t_min, scheduler=s)
     t_max = Table(name=None, dshape='{_1: float64}', data={'_1': [0.8]})
     max_value = Constant(table=t_max, scheduler=s)
     range_qry = RangeQuery(column='_1', scheduler=s)
     range_qry.create_dependent_modules(random,
                                        'table',
                                        min_value=min_value,
                                        max_value=max_value)
     prt = Print(proc=self.terse, scheduler=s)
     prt.input.df = range_qry.output.table
     s.start()
     s.join()
     idx = range_qry.input_module.output['table']\
       .data().eval('(_1>0.3)&(_1<0.8)', result_object='index')
     self.assertEqual(range_qry.table().selection, bitmap(idx))
 def test_read_multiple_csv(self):
     s = self.scheduler()
     filenames = Table(name='file_names',
                       dshape='{filename: string}',
                       data={'filename': [get_dataset('smallfile'), get_dataset('smallfile')]})
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s)
     csv.input.filenames = cst.output.table
     csv.start()
     s.join()
     self.assertEqual(len(csv.table()), 60000)
 def test_combine_first_dup(self) -> None:
     s = self.scheduler(True)
     cst1 = Constant(
         Table(
             name="tcf_xmin_xmax",
             data=pd.DataFrame({"xmin": [1], "xmax": [2]}),
             create=True,
         ),
         scheduler=s,
     )
     cst2 = Constant(
         Table(
             name="tcf_ymin_ymax",
             data=pd.DataFrame({"ymin": [5], "ymax": [6]}),
             create=True,
         ),
         scheduler=s,
     )
     cst3 = Constant(
         Table(
             name="tcf_ymin_ymax2",
             data=pd.DataFrame({"ymin": [3], "ymax": [4]}),
             create=True,
         ),
         scheduler=s,
     )
     cf = CombineFirst(scheduler=s)
     cf.input[0] = cst1.output.result
     cf.input[0] = cst2.output.result
     cf.input[0] = cst3.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = cf.output.result
     aio.run(s.start())
     # res = cf.trace_stats(max_runs=1)
     row = cf.table.last()
     assert row is not None
     last = row.to_dict()
     self.assertEqual(last["xmin"], 1)
     self.assertEqual(last["xmax"], 2)
     self.assertEqual(last["ymin"], 5)
     self.assertEqual(last["ymax"], 6)
 def test_cmp_query(self):
     s = self.scheduler()
     random = RandomTable(10, rows=10000, scheduler=s)
     cmp_ = CmpQueryLast(scheduler=s)
     cst = Table("cmp_table", data={'_1': [0.5]})
     value = Constant(cst, scheduler=s)
     cmp_.input.cmp = value.output.table
     cmp_.input.table = random.output.table
     pr = Print(proc=self.terse, scheduler=s)
     pr.input.df = cmp_.output.select
     s.start()
     s.join()
 def test_combine_first_nan(self) -> None:
     s = self.scheduler(True)
     cst1 = Constant(
         Table(
             name="tcf_xmin_xmax_nan",
             data=pd.DataFrame({"xmin": [1], "xmax": [2]}),
             create=True,
         ),
         scheduler=s,
     )
     cst2 = Constant(
         Table(
             name="tcf_ymin_ymax_nan",
             data=pd.DataFrame({"ymin": [np.nan], "ymax": [np.nan]}),
             create=True,
         ),
         scheduler=s,
     )
     cst3 = Constant(
         Table(
             name="tcf_ymin_ymax2_nan",
             data=pd.DataFrame({"ymin": [3], "ymax": [4]}),
             create=True,
         ),
         scheduler=s,
     )
     cf = CombineFirst(scheduler=s)
     cf.input[0] = cst1.output.result
     cf.input[0] = cst2.output.result
     cf.input[0] = cst3.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = cf.output.result
     aio.run(s.start())
     last = notNone(cf.table.last()).to_dict()
     self.assertTrue(
         last["xmin"] == 1
         and last["xmax"] == 2
         and last["ymin"] == 3
         and last["ymax"] == 4
     )
Example #25
 def test_last_row_simple(self) -> None:
     s = self.scheduler()
     t1 = Table(name=get_random_name("cst1"), data={"xmin": [1], "xmax": [2]})
     t2 = Table(name=get_random_name("cst2"), data={"ymin": [3], "ymax": [4]})
     cst1 = Constant(t1, scheduler=s)
     cst2 = Constant(t2, scheduler=s)
     join = Join(scheduler=s)
     join.input[0] = cst1.output.result
     join.input[0] = cst2.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = join.output.result
     aio.run(s.start())
     # res = join.trace_stats(max_runs=1)
     # pd.set_option('display.expand_frame_repr', False)
     # print(res)
     last = notNone(join.table.last())
     self.assertTrue(
         last["xmin"] == 1
         and last["xmax"] == 2
         and last["ymin"] == 3
         and last["ymax"] == 4
     )
 def test_read_multiple_fake_csv(self):
     s = self.scheduler()
     filenames = Table(name='file_names',
                       dshape='{filename: string}',
                       data={'filename': [
                           'buffer://fake1?cols=10&rows=30000',
                           'buffer://fake2?cols=10&rows=30000']})
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s)
     csv.input.filenames = cst.output.table
     csv.start()
     s.join()
     self.assertEqual(len(csv.table()), 60000)
 def _range_query_impl(self, lo, up) -> None:
     "Run tests of the RangeQuery module"
     s = self.scheduler()
     with s:
         random = RandomTable(2, rows=1000, scheduler=s)
         t_min = PsDict({"_1": lo})
         min_value = Constant(table=t_min, scheduler=s)
         t_max = PsDict({"_1": up})
         max_value = Constant(table=t_max, scheduler=s)
         range_qry = RangeQuery(column="_1", scheduler=s)
         range_qry.create_dependent_modules(
             random, "result", min_value=min_value, max_value=max_value
         )
         prt = Print(proc=self.terse, scheduler=s)
         prt.input[0] = range_qry.output.result
     aio.run(s.start())
     assert range_qry.input_module is not None
     idx = (
         range_qry.input_module.output["result"]
         .data()
         .eval(f"(_1>{lo})&(_1<{up})", result_object="index")
     )
     self.assertEqual(range_qry.table.index, bitmap(idx))
Example #28
 def te_st_join_simple(self) -> None:
     s = self.scheduler()
     cst1 = Constant(
         Table(
             name="test_join_simple_cst1",
             data=pd.DataFrame({
                 "xmin": [1],
                 "xmax": [2]
             }),
             create=True,
         ),
         scheduler=s,
     )
     cst2 = Constant(
         Table(
             name="test_join_simple_cst2",
             data=pd.DataFrame({
                 "ymin": [3],
                 "ymax": [4]
             }),
             create=True,
         ),
         scheduler=s,
     )
     join = Join(scheduler=s)
     join.input[0] = cst1.output.result
     join.input[0] = cst2.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = join.output.result
     aio.run(s.start())
     res = join.trace_stats(max_runs=1)
     print(res)
     df = join.table
     last = df.loc[df.index[-1]]
     assert last is not None
     self.assertTrue(last["xmin"] == 1 and last["xmax"] == 2
                     and last["ymin"] == 3 and last["ymax"] == 4)
 def _impl_stirred_tst_percentiles(self, accuracy: float,
                                   **kw: Any) -> None:
     """ """
     s = self.scheduler()
     with s:
         random = RandomTable(2, rows=10000, scheduler=s)
         stirrer = Stirrer(update_column="_2",
                           fixed_step_size=1000,
                           scheduler=s,
                           **kw)
         stirrer.input[0] = random.output.result
         hist_index = HistogramIndex(column="_1", scheduler=s)
         hist_index.input[0] = stirrer.output.result
         t_percentiles = PsDict({"_25": 25.0, "_50": 50.0, "_75": 75.0})
         which_percentiles = Constant(table=t_percentiles, scheduler=s)
         percentiles = Percentiles(accuracy=accuracy, scheduler=s)
         percentiles.input[0] = stirrer.output.result
         percentiles.input.percentiles = which_percentiles.output.result
         percentiles.input.hist = hist_index.output.result
         prt = Print(proc=self.terse, scheduler=s)
         prt.input[0] = percentiles.output.result
     aio.run(s.start())
     pdict = notNone(percentiles.table.last()).to_dict()
     # v = random.table()['_1'].values
     # from nose.tools import set_trace; set_trace()
     v = stirrer.table.to_array(columns=["_1"]).reshape(-1)
     p25 = np.percentile(v, 25.0)  # type: ignore
     p50 = np.percentile(v, 50.0)  # type: ignore
     p75 = np.percentile(v, 75.0)  # type: ignore
     print(
         "Table=> accuracy: ",
         accuracy,
         " 25:",
         p25,
         pdict["_25"],
         " 50:",
         p50,
         pdict["_50"],
         " 75:",
         p75,
         pdict["_75"],
     )
     # from nose.tools import set_trace; set_trace()
     self.assertAlmostEqual(p25, pdict["_25"], delta=0.01)
     self.assertAlmostEqual(p50, pdict["_50"], delta=0.01)
     self.assertAlmostEqual(p75, pdict["_75"], delta=0.01)
 def test_03_read_multiple_csv_crash_recovery(self):
     # if TRAVIS: return
     p = Process(target=run_throttled_server, args=(8000, 10**6))
     p.start()
     self._http_proc = p
     time.sleep(SLEEP)
     s = self.scheduler()
     filenames = Table(name='file_names',
                       dshape='{filename: string}',
                       data={'filename': [make_url('smallfile'), make_url('smallfile')]})
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s, timeout=0.01)
     csv.input.filenames = cst.output.table
     csv.start()
     s.join()
     _close(csv)
     self.assertEqual(len(csv.table()), 60000)