Example #1
 def test_09_read_multi_csv_file_with_crash(self) -> None:
     s = self.scheduler()
     tag = "t9"
     file_list = [get_dataset("bigfile"), get_dataset("bigfile")]
     module = CSVLoader(file_list,
                        index_col=False,
                        recovery_tag=tag,
                        header=None,
                        scheduler=s)
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     sts = sleep_then_stop(s, 3)
     aio.run_gather(s.start(), sts)
     _close(module)
     s = self.scheduler(clean=True)
     module = CSVLoader(
         file_list,
         recovery=True,
         recovery_tag=tag,
         index_col=False,
         header=None,
         scheduler=s,
     )
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     aio.run(s.start())
     self.assertEqual(len(module.table), 2000000)
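Pattern note: Examples #1, #3 and #10 all exercise the same recovery protocol: run a CSVLoader tagged with recovery_tag, force a stop mid-stream (via sleep_then_stop or an HTTP server restart), then rebuild the identical pipeline on a fresh scheduler (scheduler(clean=True)) with recovery=True and the same tag. The final length assertion checks that the resumed load neither drops nor duplicates rows.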
Example #2
 def test_read_csv(self):
     s=self.scheduler()
     module=CSVLoader(get_dataset('bigfile'), index_col=False, header=None, scheduler=s)
     self.assertTrue(module.table() is None)
     s.start()
     s.join()
     self.assertEqual(len(module.table()), 1000000)
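API note: the snippets in this listing span several generations of the progressivis API. The oldest (Examples #7, #11, #12, #41) read results with module.df() and drive the run with blocking s.start()/s.join(); an intermediate generation, like this one, reads module.table() through named slots such as output.table; the newest (Examples #1, #3, #8) expose module.result and module.table as properties, connect via output.result and input[0], and run the scheduler with aio.run(s.start()). Condensed from the examples themselves:

 # old:  prt.input.df = csv.output.df;   s.start(); s.join()
 # new:  prt.input[0] = csv.output.result;  aio.run(s.start())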
Example #3
 def test_01_read_http_csv_with_crash_and_counter(self) -> None:
     self._http_srv = _HttpSrv()
     tag = self.get_tag()
     s = self.scheduler()
     url = make_url("bigfile")
     module = CSVLoader(url,
                        index_col=False,
                        recovery_tag=tag,
                        header=None,
                        scheduler=s)
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     sts = sleep_then_stop(s, 2)
     aio.run_gather(s.start(), sts)
     self._http_srv.restart()
     s = self.scheduler(clean=True)
     csv = CSVLoader(
         url,
         recovery=True,
         index_col=False,
         recovery_tag=tag,
         header=None,
         scheduler=s,
     )
     counter = Counter(scheduler=s)
     counter.input[0] = csv.output.result
     self.assertTrue(csv.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = counter.output.result
     aio.run(s.start())
     self.assertEqual(len(csv.table), 1000000)
     self.assertEqual(counter.table["counter"].loc[0], 1000000)
Example #4
 def test_read_fake_csv(self):
     s=self.scheduler()
     module=CSVLoader(RandomBytesIO(cols=30, rows=1000000), index_col=False, header=None, scheduler=s)
     self.assertTrue(module.table() is None)
     s.start()
     s.join()
     self.assertEqual(len(module.table()), 1000000)
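RandomBytesIO here acts as a synthetic CSV source: it streams randomly generated rows of the requested shape (30 columns by 1,000,000 rows) so the loader can be tested without touching the filesystem. Example #13 sizes the stream in bytes instead, via size=.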
Example #5
 def test_histogram2d(self):
     s = self.scheduler()
     csv = CSVLoader(get_dataset('bigfile'),
                     index_col=False,
                     header=None,
                     scheduler=s)
     min_ = Min(scheduler=s)
     min_.input.table = csv.output.table
     max_ = Max(scheduler=s)
     max_.input.table = csv.output.table
     histogram2d = Histogram2D(1, 2, xbins=100, ybins=100,
                               scheduler=s)  # columns are called 1..30
     histogram2d.input.table = csv.output.table
     histogram2d.input.min = min_.output.table
     histogram2d.input.max = max_.output.table
     heatmap = Heatmap(filename='histo_%03d.png', scheduler=s)
     heatmap.input.array = histogram2d.output.table
     #pr = Print(scheduler=s)
     pr = Every(proc=self.terse, scheduler=s)
     #pr.input.df = heatmap.output.heatmap
     #pr.input.df = histogram2d.output.df
     pr.input.df = csv.output.table
     csv.scheduler().start()
     s.join()
     #self.scheduler.thread.join()
     s = histogram2d.trace_stats()
Example #6
    def test_scheduler(self):
        s = MTScheduler()
        csv = CSVLoader(get_dataset('bigfile'),index_col=False,header=None,scheduler=s)

        smp = Sample(n=10,scheduler=s)
        smp.input.df = csv.output.df

        csv.scheduler().start()

        sleep(1)
        self.assertTrue(csv.scheduler().is_running())

        smp2 = Sample(n=15, scheduler=s)
        smp2.input.df = csv.output.df

        def add_min():
            m = Min(scheduler=s)
            # Of course, sleeping here is a bad idea; it only illustrates
            # that add_min is executed atomically by the scheduler. A sleep
            # outside of add_oneshot_tick_proc would leave the scheduler in
            # an inconsistent state.
            #sleep(1)
            m.input.df = smp2.output.df
            prt = Print(scheduler=s)
            prt.input.df = m.output.df

        s.add_oneshot_tick_proc(add_min)

        sleep(1)
        self.assertTrue(s._runorder.index(smp.id) > s._runorder.index(csv.id))
        self.assertTrue(s._runorder.index(smp2.id) > s._runorder.index(csv.id))
        #self.assertTrue(s._runorder.index(m.id) > s._runorder.index(smp2.id))
        s.stop()
        s.join()
Example #7
    def test_join(self):
        s=Scheduler()
        csv = CSVLoader(get_dataset('bigfile'), index_col=False,header=None,scheduler=s)
        stat1=Stats(1, scheduler=s)
        stat1.input.df = csv.output.df
        stat2=Stats(2, scheduler=s)
        stat2.input.df = csv.output.df
        lr1 = LastRow(scheduler=s)
        lr1.input.df = stat1.output.stats
        lr2 = LastRow(scheduler=s)
        lr2.input.df = stat2.output.stats
        join=Join(scheduler=s)
        join.input.df = lr1.output.df
        join.input.df = lr2.output.df
        pr=Print(scheduler=s)
        pr.input.df = join.output.df
        prlen = Every(proc=print_len, constant_time=True, scheduler=s)
        prlen.input.df = csv.output.df
        s.start()
        res = join.trace_stats(max_runs=1)
        pd.set_option('display.expand_frame_repr', False)
        last = join.df()
        df = csv.df()
        self.assertTrue(last.at[0,'1.min']==df[1].min() and last.at[0,'1.max']==df[1].max() and \
                        last.at[0,'2.min']==df[2].min() and last.at[0,'2.max']==df[2].max())

        print(res)
Example #8
 def test_histogram2d1(self) -> None:
     s = self.scheduler()
     csv = CSVLoader(
         get_dataset("bigfile"), index_col=False, header=None, scheduler=s
     )
     min_ = Min(scheduler=s)
     min_.input[0] = csv.output.result
     max_ = Max(scheduler=s)
     max_.input[0] = csv.output.result
     histogram2d = Histogram2D(
         1, 2, xbins=100, ybins=100, scheduler=s
     )  # columns are called 1..30
     histogram2d.input[0] = csv.output.result
     histogram2d.input.min = min_.output.result
     histogram2d.input.max = max_.output.result
     heatmap = Heatmap(filename="histo_%03d.png", scheduler=s)
     heatmap.input.array = histogram2d.output.result
     pr = Every(proc=self.terse, scheduler=s)
     pr.input[0] = heatmap.output.result
     aio.run(csv.scheduler().start())
     last = notNone(histogram2d.table.last()).to_dict()
     h1 = last["array"]
     bounds = [[last["ymin"], last["ymax"]], [last["xmin"], last["xmax"]]]
     df = pd.read_csv(
         get_dataset("bigfile"), header=None, usecols=[1, 2]  # type: ignore
     )
     v = df.to_numpy()  # .reshape(-1, 2)
     bins = [histogram2d.params.ybins, histogram2d.params.xbins]
     h2 = fh.histogram2d(v[:, 1], v[:, 0], bins=bins, range=bounds)
     h2 = np.flip(h2, axis=0)  # type: ignore
     self.assertTrue(np.allclose(h1, h2))
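The tail of this test cross-checks the progressive result offline: it reloads the two columns with pandas, rebins them (fh is presumably the fast-histogram package) over the min/max bounds reported by the module, flips the array to match Histogram2D's row order, and compares with np.allclose.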
Example #9
 def test_06_read_multiple_csv_bz2_crash_recovery(self) -> None:
     p = Process(target=run_throttled_server, args=(8000, 10**6))
     p.start()
     self._http_proc = p
     time.sleep(SLEEP)
     s = self.scheduler()
     filenames = Table(
         name="file_names",
         dshape="{filename: string}",
         data={
             "filename": [
                 make_url("smallfile", ext=BZ2),
                 make_url("smallfile", ext=BZ2),
             ]
         },
     )
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False,
                     header=None,
                     scheduler=s,
                     timeout=0.01)
     csv.input.filenames = cst.output.result
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = csv.output.result
     aio.run(csv.start())
     _close(csv)
     self.assertEqual(len(csv.table), 60000)
Example #10
 def test_06_read_http_multi_csv_bz2_with_crash(self) -> None:
     self._http_srv = _HttpSrv()
     tag = self.get_tag()
     s = self.scheduler()
     url_list = [make_url("bigfile", ext=BZ2)] * 2
     module = CSVLoader(url_list,
                        index_col=False,
                        recovery_tag=tag,
                        header=None,
                        scheduler=s)
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     sts = sleep_then_stop(s, 3)
     aio.run_gather(s.start(), sts)
     self._http_srv.restart()
     s = self.scheduler(clean=True)
     module = CSVLoader(
         url_list,
         recovery=True,
         recovery_tag=tag,
         index_col=False,
         header=None,
         scheduler=s,
     )
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     aio.run(s.start())
     self.assertEqual(len(module.table), 2000000)
Example #11
 def test_sample(self):
     s = Scheduler()
     csv = CSVLoader(get_dataset('bigfile'),index_col=False,header=None,scheduler=s)
     smp = Sample(n=10,scheduler=s)
     smp.input.df = csv.output.df
     prt = Print(scheduler=s)
     prt.input.df = smp.output.df
     csv.scheduler().start()
Example #12
 def test_read_multiple_csv(self):
     s=Scheduler()
     filenames = pd.DataFrame({'filename': [get_dataset('smallfile'), get_dataset('smallfile')]})
     cst = Constant(df=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s)
     csv.input.filenames = cst.output.df
     csv.start()
     self.assertEqual(len(csv.df()), 60000)
Example #13
 def p10s_read_csv(self):
     s = Scheduler()
     module = CSVLoader(RandomBytesIO(cols=30,
                                      size=self.current_step * GIGA),
                        index_col=False,
                        header=None,
                        scheduler=s)
     module.start()
Example #14
 def _tst_08_read_multi_csv_file_compress_no_crash(self, files):
     s = self.scheduler()
     module = CSVLoader(files, index_col=False, header=None,
                        scheduler=s)  #, save_context=False)
     self.assertTrue(module.table() is None)
     #decorate(s, Patch1("csv_loader_1"))
     s.start()
     s.join()
     self.assertEqual(len(module.table()), 60000)
Example #15
 def test_read_multiple_csv(self):
     s=self.scheduler()
     filenames = Table(name='file_names',
                       dshape='{filename: string}',
                       data={'filename': [get_dataset('smallfile'), get_dataset('smallfile')]})
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s)
     csv.input.filenames = cst.output.table
     csv.start()
     s.join()
     self.assertEqual(len(csv.table()), 60000)
Example #16
 def test_scatterplot(self):
     s=Scheduler()
     csv = CSVLoader(get_dataset('bigfile'),index_col=False,header=None,force_valid_ids=True,scheduler=s)
     sp = ScatterPlot(x_column='_1', y_column='_2', scheduler=s)
     sp.create_dependent_modules(csv,'df')
     cnt = Every(proc=print_len,constant_time=True,scheduler=s)
     cnt.input.df = csv.output.df
     prt = Print(scheduler=s)
     prt.input.df = sp.histogram2d.output.df
     csv.scheduler().start(None,idle_proc)
     self.assertEqual(len(csv.df()), 1000000)
Example #17
 def test_mb_k_means(self):
     #log_level()
     s=Scheduler()
     n_clusters = 3
     csv = CSVLoader(get_dataset('cluster:s3'),sep=' ',skipinitialspace=True,header=None,index_col=False,scheduler=s)
     km = MBKMeans(n_clusters=n_clusters, random_state=42, is_input=False, scheduler=s)
     km.input.df = csv.output.df
     pr = Print(scheduler=s)
     pr.input.df = km.output.df
     e = Every(scheduler=s)
     e.input.df = km.output.labels
     s.start()
     self.assertEqual(len(csv.df()), len(km.labels()))
Example #18
 def test_sample(self) -> None:
     s = self.scheduler()
     csv = CSVLoader(get_dataset("bigfile"),
                     index_col=False,
                     header=None,
                     scheduler=s)
     smp = Sample(samples=10, scheduler=s)
     smp.input[0] = csv.output.result
     prt = Print(proc=self.terse, scheduler=s)
     prt.input[0] = smp.output.result
     aio.run(csv.scheduler().start())
     # print(repr(smp.result))
     self.assertEqual(len(smp.table), 10)
Example #19
    def test_csv_distances(self):
        s = self.scheduler()
        vec=CSVLoader(get_dataset('smallfile'),index_col=False,header=None,scheduler=s)
#        dis=PairwiseDistances(metric='euclidean',scheduler=s)
#        dis.input.df = vec.output.df
        cnt = Every(proc=self.terse,constant_time=True,scheduler=s)
#        cnt.input.df = dis.output.dist
        cnt.input.df = vec.output.table
        global times
        times = 0
        s.start(ten_times)
        s.join()
        table = vec.table()
Example #20
 def test_04_read_http_multi_csv_bz2_no_crash(self):
     #if TRAVIS: return
     self._http_srv = _HttpSrv()
     s = self.scheduler()
     module = CSVLoader([make_url('smallfile', ext=BZ2)] * 2,
                        index_col=False,
                        header=None,
                        scheduler=s)
     self.assertTrue(module.table() is None)
     #decorate(s, Patch1("csv_loader_1"))
     s.start()
     s.join()
     self.assertEqual(len(module.table()), 60000)
Example #21
 def test_04_read_http_csv_bz2_no_crash(self):
     #if TRAVIS: return
     p = Process(target=run_simple_server, args=())
     p.start()
     self._http_proc = p
     time.sleep(SLEEP)
     s=self.scheduler()
     module=CSVLoader(make_url('bigfile', ext=BZ2), index_col=False, header=None, scheduler=s)
     self.assertTrue(module.table() is None)
     s.start()
     s.join()
     _close(module)
     self.assertEqual(len(module.table()), 1000000)
Example #22
 def test_07_read_multi_csv_file_no_crash(self):
     s = self.scheduler()
     module = CSVLoader(
         [get_dataset('smallfile'),
          get_dataset('smallfile')],
         index_col=False,
         header=None,
         scheduler=s)
     self.assertTrue(module.table() is None)
     #decorate(s, Patch1("csv_loader_1"))
     s.start()
     s.join()
     self.assertEqual(len(module.table()), 60000)
Example #23
 def test_read_multiple_fake_csv(self):
     s=self.scheduler()
     filenames = Table(name='file_names',
                       dshape='{filename: string}',
                       data={'filename': [
                           'buffer://fake1?cols=10&rows=30000',
                           'buffer://fake2?cols=10&rows=30000']})
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s)
     csv.input.filenames = cst.output.table
     csv.start()
     s.join()        
     self.assertEqual(len(csv.table()), 60000)
Example #24
 def test_sample(self):
     s = self.scheduler()
     csv = CSVLoader(get_dataset('bigfile'),
                     index_col=False,
                     header=None,
                     scheduler=s)
     smp = Sample(samples=10, scheduler=s)
     smp.input.table = csv.output.table
     prt = Print(proc=self.terse, scheduler=s)
     prt.input.df = smp.output.table
     csv.scheduler().start()
     s.join()
     #print(repr(smp.table()))
     self.assertEqual(len(smp.table()), 10)
Example #25
 def test_05_read_http_csv_bz2_crash_recovery(self):
     #if TRAVIS: return        
     p = Process(target=run_throttled_server, args=(8000, 10**7))
     p.start()
     self._http_proc = p
     time.sleep(SLEEP)
     s=self.scheduler()
     module=CSVLoader(make_url('bigfile', ext=BZ2), index_col=False, header=None, scheduler=s, timeout=0.01)
     self.assertTrue(module.table() is None)
     s.start()
     s.join()
     _close(module)
     #self.assertGreater(module.parser._recovery_cnt, 0)
     self.assertEqual(len(module.table()), 1000000)
Example #26
 def test_histogram1d1(self) -> None:
     s = self.scheduler()
     csv = CSVLoader(
         get_dataset("bigfile"), index_col=False, header=None, scheduler=s
     )
     min_ = Min(scheduler=s)
     min_.input[0] = csv.output.result
     max_ = Max(scheduler=s)
     max_.input[0] = csv.output.result
     histogram1d = Histogram1D("_2", scheduler=s)  # columns are called 1..30
     histogram1d.input[0] = csv.output.result
     histogram1d.input.min = min_.output.result
     histogram1d.input.max = max_.output.result
     pr = Every(proc=self.terse, scheduler=s)
     pr.input[0] = histogram1d.output.result
     aio.run(s.start())
     _ = histogram1d.trace_stats()
     last = notNone(histogram1d.table.last()).to_dict()
     h1 = last["array"]
     bounds = (last["min"], last["max"])
     df = pd.read_csv(
         get_dataset("bigfile"), header=None, usecols=[2]  # type: ignore
     )
     v = df.to_numpy().reshape(-1)
     h2, _ = np.histogram(  # type: ignore
         v, bins=histogram1d.params.bins, density=False, range=bounds
     )
     self.assertListEqual(h1.tolist(), h2.tolist())
Example #27
 def test_last_row(self):
     s = self.scheduler()
     csv = CSVLoader(get_dataset('smallfile'),
                     index_col=False,
                     header=None,
                     scheduler=s)
     lr1 = LastRow(scheduler=s)
     lr1.input.table = csv.output.table
     prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
     prlen.input.df = lr1.output.table
     s.start()
     s.join()
     df = csv.table()
     last = df.last()
     res = lr1.table()
     self.assertEqual(res.at[0, '_1'], last['_1'])
Example #28
    def test_mb_k_means(self) -> None:
        s = self.scheduler()
        n_clusters = 3
        try:
            dataset = (get_dataset("cluster:s3"), )
        except TimeoutError:
            print("Cannot download cluster:s3")
            return

        with s:
            csv = CSVLoader(
                dataset,
                sep=" ",
                skipinitialspace=True,
                header=None,
                index_col=False,
                scheduler=s,
            )
            km = MBKMeans(
                n_clusters=n_clusters,
                random_state=42,
                is_input=False,
                is_greedy=False,
                scheduler=s,
            )
            # km.input.table = csv.output.result
            km.create_dependent_modules(csv)
            pr = Print(proc=self.terse, scheduler=s)
            pr.input[0] = km.output.result
            e = Every(proc=self.terse, scheduler=s)
            e.input[0] = km.output.labels
        aio.run(s.start())
        labels = km.labels()
        assert labels is not None
        self.assertEqual(len(csv.table), len(labels))
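Two idioms are worth noting here: the with s: block, which appears to batch the module additions so the scheduler validates the new dataflow once on exit, and km.create_dependent_modules(csv), which wires up the auxiliary modules MBKMeans needs, replacing the direct connection shown in the commented-out line.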
Example #29
 def test_percentile(self):
     s=Scheduler()
     csv_module = CSVLoader(get_dataset('smallfile'), index_col=False,header=None, scheduler=s)
     module=Percentiles(1,id='test_percentile',
                        percentiles=[0.1, 0.25, 0.5, 0.75, 0.9],
                        scheduler=s)
     module.describe()
     csv_module.describe()
     connect(csv_module, 'df', module, 'df')
     connect(module, 'percentiles',
             Print(id='print', scheduler=s), 'df')
     s.start()
     ret = module.trace_stats(max_runs=1)
     #print "Done. Run time: %gs, loaded %d rows" % (s['duration'].irow(-1), len(module.df()))
     pd.set_option('display.expand_frame_repr', False)
     print(ret)
Example #30
    def t_histogram1d_impl(self, **kw: Any) -> None:
        s = self.scheduler()
        csv = CSVLoader(
            get_dataset("bigfile"), index_col=False, header=None, scheduler=s
        )
        stirrer = Stirrer(update_column="_2", fixed_step_size=1000, scheduler=s, **kw)
        stirrer.input[0] = csv.output.result
        min_ = Min(scheduler=s)
        min_.input[0] = stirrer.output.result
        max_ = Max(scheduler=s)
        max_.input[0] = stirrer.output.result
        histogram1d = Histogram1D("_2", scheduler=s)  # columns are called 1..30
        histogram1d.input[0] = stirrer.output.result
        histogram1d.input.min = min_.output.result
        histogram1d.input.max = max_.output.result

        # pr = Print(scheduler=s)
        pr = Every(proc=self.terse, scheduler=s)
        pr.input[0] = histogram1d.output.result
        aio.run(s.start())
        _ = histogram1d.trace_stats()
        last = notNone(histogram1d.table.last()).to_dict()
        h1 = last["array"]
        bounds = (last["min"], last["max"])
        tab = stirrer.table.loc[:, ["_2"]]
        assert tab is not None
        v = tab.to_array().reshape(-1)
        h2, _ = np.histogram(  # type: ignore
            v, bins=histogram1d.params.bins, density=False, range=bounds
        )
        self.assertEqual(np.sum(h1), np.sum(h2))
        self.assertListEqual(h1.tolist(), h2.tolist())
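Stirrer is a test utility inserted between the loader and the histogram to mutate the table as it flows through; the **kw arguments presumably select update or delete behavior applied to update_column in fixed_step_size batches. This variant therefore checks that Histogram1D stays exact under a changing input, not just under appends.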
Example #31
 def test_join(self):
     s = self.scheduler()
     csv = CSVLoader(get_dataset('bigfile'),
                     index_col=False,
                     header=None,
                     scheduler=s)
     stat1 = Stats(1, reset_index=True, scheduler=s)
     stat1.input.table = csv.output.table
     stat2 = Stats(2, reset_index=True, scheduler=s)
     stat2.input.table = csv.output.table
     stat3 = Stats(3, reset_index=True, scheduler=s)
     stat3.input.table = csv.output.table
     #join=Join(scheduler=s)
     #import pdb;pdb.set_trace()
     reduce_ = Reduce(BinJoin, "first", "second", "table", scheduler=s)
     reduce_.input.table = stat1.output.stats
     reduce_.input.table = stat2.output.stats
     join = reduce_.expand()
     pr = Print(proc=self.terse, scheduler=s)
     pr.input.df = join.output.table
     prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
     prlen.input.df = csv.output.table
     s.start()
     res = join.trace_stats(max_runs=1)
     print(res)
Example #32
 def test_join(self) -> None:
     s = self.scheduler()
     csv = CSVLoader(
         get_dataset("bigfile"), index_col=False, header=None, scheduler=s
     )
     stat1 = Stats(1, reset_index=True, scheduler=s)
     stat1.input[0] = csv.output.result
     stat2 = Stats(2, reset_index=True, scheduler=s)
     stat2.input[0] = csv.output.result
     # join=Join(scheduler=s)
     # reduce_ = Reduce(BinJoin, "first", "second", "table", scheduler=s)
     # reduce_.input[0] = stat1.output.stats
     # reduce_.input[0] = stat2.output.stats
     # join = reduce_.expand()
     join = Reduce.expand(
         BinJoin,
         "first",
         "second",
         "table",
         [stat1.output.stats, stat2.output.stats],
         scheduler=s,
     )
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = join.output.result
     prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
     prlen.input[0] = csv.output.result
     aio.run(s.start())
     res = join.trace_stats(max_runs=1)
     print(res)
Example #33
 def test_03_read_multiple_csv_crash_recovery(self):
     #if TRAVIS: return        
     p = Process(target=run_throttled_server, args=(8000, 10**6))
     p.start()
     self._http_proc = p
     time.sleep(SLEEP) 
     s=self.scheduler()
     filenames = Table(name='file_names',
                       dshape='{filename: string}',
                       data={'filename': [make_url('smallfile'), make_url('smallfile')]})
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s, timeout=0.01)
     csv.input.filenames = cst.output.table
     csv.start()
     s.join()
     _close(csv)        
     self.assertEqual(len(csv.table()), 60000)
Example #34
 def test_01_read_http_csv_with_crash_and_counter(self):
     #if TRAVIS: return
     self._http_srv = _HttpSrv()
     s = self.scheduler()
     url = make_url('bigfile')
     module = CSVLoader(url, index_col=False, header=None, scheduler=s)
     self.assertTrue(module.table() is None)
     Patch1.max_steps = 200000
     decorate(s, Patch1("csv_loader_1"))
     s.start()
     s.join()
     self._http_srv.restart()
     s = self.scheduler()
     csv = CSVLoader(url,
                     recovery=True,
                     index_col=False,
                     header=None,
                     scheduler=s)
     counter = Counter(scheduler=s)
     counter.input.table = csv.output.table
     self.assertTrue(csv.table() is None)
     s.start()
     s.join()
     self.assertEqual(len(csv.table()), 1000000)
     self.assertEqual(counter.table()['counter'].loc[0], 1000000)
Example #35
 def test_read_multiple_csv(self) -> None:
     s = self.scheduler()
     filenames = Table(
         name="file_names",
         dshape="{filename: string}",
         data={
             "filename":
             [get_dataset("smallfile"),
              get_dataset("smallfile")]
         },
     )
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s)
     csv.input.filenames = cst.output.result
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = csv.output.result
     aio.run(csv.start())
     self.assertEqual(len(csv.table), 60000)
Example #36
 def test_scatterplot(self):
     s = self.scheduler()
     csv = CSVLoader(get_dataset('smallfile'),
                     index_col=False,
                     header=None,
                     force_valid_ids=True,
                     scheduler=s)
     sp = MCScatterPlot(scheduler=s,
                        classes=[('Scatterplot', '_1', '_2')],
                        approximate=True)
     sp.create_dependent_modules(csv, 'table')
     cnt = Every(proc=self.terse, constant_time=True, scheduler=s)
     cnt.input.df = csv.output.table
     prt = Print(proc=self.terse, scheduler=s)
     prt.input.df = sp.output.table
     csv.scheduler().start(idle_proc=idle_proc)
     s.join()
     self.assertEqual(len(csv.table()), 30000)
Example #37
 def test_read_multiple_fake_csv(self) -> None:
     s = self.scheduler()
     filenames = Table(
         name="file_names2",
         dshape="{filename: string}",
         data={
             "filename": [
                 "buffer://fake1?cols=10&rows=30000",
                 "buffer://fake2?cols=10&rows=30000",
             ]
         },
     )
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s)
     csv.input.filenames = cst.output.result
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = csv.output.result
     aio.run(csv.start())
     self.assertEqual(len(csv.table), 60000)
Example #38
    def test_csv_distances(self):
        s=Scheduler()
        vec=CSVLoader(get_dataset('smallfile'),index_col=False,header=None,scheduler=s)
        dis=PairwiseDistances(metric='euclidean',scheduler=s)
        dis.input.df = vec.output.df
        cnt = Every(proc=print_len,constant_time=True,scheduler=s)
        cnt.input.df = dis.output.dist
        global times
        times = 0
        s.start(ten_times)
        df = vec.df()
        computed = dis.dist()
        #self.assertEquals(computed.shape[0], len(df))

        del df[CSVLoader.UPDATE_COLUMN]
        offset=0
        size=offset+5000
        truth = pairwise_distances(df.iloc[offset:size], metric=dis._metric)
        dist = computed[offset:size,offset:size]
        self.assertTrue(np.allclose(truth, dist,atol=1e-7)) # reduce tolerance
Example #39
#SUFFIX= ''
PREFIX= '../nyc-taxi/'
SUFFIX= '.bz2'

URLS = [
    PREFIX+'yellow_tripdata_2015-01.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-02.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-03.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-04.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-05.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-06.csv'+SUFFIX,
]

filenames = pd.DataFrame({'filename': URLS})
cst = Constant(df=filenames, scheduler=s)
csv = CSVLoader(index_col=False,skipinitialspace=True,usecols=['pickup_longitude', 'pickup_latitude'], filter=filter, scheduler=s)
#csv = CSVLoader(index_col=False,skipinitialspace=True,usecols=['pickup_longitude', 'pickup_latitude'], scheduler=s)
csv.input.filenames = cst.output.df
pr = Every(scheduler=s)
pr.input.df = csv.output.df
scatterplot = ScatterPlot('pickup_longitude', 'pickup_latitude', scheduler=s)
scatterplot.create_dependent_modules(csv,'df')

if __name__=='__main__':
    s.start()
    while True:
        time.sleep(2)
        s.to_json()
        scatterplot.to_json() # simulate a web query
        scatterplot.get_image()
    s.join()
Example #40
from progressivis import *
from progressivis.vis import ScatterPlot
from progressivis.io import CSVLoader
from progressivis.datasets import get_dataset

def filter(df):
    l = df['pickup_longitude']
    return df[(l < -70) & (l > -80) ]

def print_len(x):
    if x is not None:
        print(len(x))

#log_level()

try:
    s = scheduler
except NameError:
    s = Scheduler()

csv = CSVLoader(get_dataset('bigfile'),header=None,index_col=False,force_valid_ids=True,scheduler=s)
pr = Every(scheduler=s)
pr.input.df = csv.output.df
scatterplot = ScatterPlot('_1', '_2', scheduler=s)
scatterplot.create_dependent_modules(csv,'df')

if __name__=='__main__':
    csv.start()
    s.join()
    print(len(csv.df()))
Example #41
 def test_read_csv(self):
     s=Scheduler()
     module=CSVLoader(get_dataset('bigfile'), index_col=False, header=None, scheduler=s)
     self.assertTrue(module.df() is None)
     s.start()
     self.assertEqual(len(module.df()), 1000000)
Example #42
"""Datasets from https://cs.joensuu.fi/sipu/datasets/"""
from progressivis import Scheduler, Every#, log_level
from progressivis.cluster import MBKMeans
from progressivis.io import CSVLoader
from progressivis.stats import Min, Max, Histogram2D
from progressivis.vis import Heatmap, ScatterPlot
from progressivis.datasets import get_dataset

try:
    s = scheduler
except NameError:
    s = Scheduler()
    #log_level(package="progressivis.cluster")

data = CSVLoader(get_dataset('cluster:s3'),sep='    ',skipinitialspace=True,header=None,index_col=False,scheduler=s)
mbkmeans = MBKMeans(columns=[0, 1], n_clusters=15, batch_size=100, scheduler=s)
mbkmeans.input.df = data.output.df
prn = Every(scheduler=s)
prn.input.df = mbkmeans.output.df
sp = ScatterPlot(0,1, scheduler=s)
sp.move_point = mbkmeans # for input management
#sp.create_dependent_modules(mbkmeans,'centroids')
# Create modules by hand rather than with the utility.
# We show the cluster centroids on the scatterplot and the
# data as a heatmap

# histogram2d
histogram2d = Histogram2D(0, 1, scheduler=s)
histogram2d.input.df = data.output.df
min_mod = Min([0,1], scheduler=s)
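Example #42 breaks off after min_mod. Judging from the Min/Max/Histogram2D/Heatmap pattern of Examples #5 and #43, the missing tail plausibly continues along these lines (a sketch, not the original code; the heatmap filename is illustrative):

min_mod.input.df = data.output.df
max_mod = Max([0, 1], scheduler=s)
max_mod.input.df = data.output.df
histogram2d.input.min = min_mod.output.df
histogram2d.input.max = max_mod.output.df
# render the 2-D histogram as an image, as in Example #5
heatmap = Heatmap(filename='heatmap_%03d.png', scheduler=s)
heatmap.input.array = histogram2d.output.df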
Example #43
from progressivis import Scheduler, Every, Print
from progressivis.io import CSVLoader
from progressivis.stats import Histogram2D, Min, Max
from progressivis.datasets import get_dataset

print "Loading test_histogram2d"
print "Type of default_scheduler is %s" % type(Scheduler.default)

csv = CSVLoader(get_dataset("bigfile"), index_col=False, header=None, engine="c")
pr = Every()
pr.input.df = csv.output.df
min = Min()
min.input.df = csv.output.df
max = Max()
max.input.df = csv.output.df
histogram2d = Histogram2D(1, 2, xbins=128, ybins=128)
histogram2d.input.df = csv.output.df
histogram2d.input.min = min.output.df
histogram2d.input.max = max.output.df
pr = Print(id="print")
pr.input.df = histogram2d.output.df

if __name__ == "__main__":
    csv.start()
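Unlike the other scripts, this one relies on the implicit default scheduler (Scheduler.default, printed at the top) rather than passing scheduler=s to each module, which is why none of the constructors here take a scheduler argument.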