def test_histogram1d1(self) -> None:
     """Compare the Histogram1D result on column ``_2`` with ``np.histogram``."""
     sched = self.scheduler()
     loader = CSVLoader(
         get_dataset("bigfile"), index_col=False, header=None, scheduler=sched
     )
     lower = Min(scheduler=sched)
     lower.input[0] = loader.output.result
     upper = Max(scheduler=sched)
     upper.input[0] = loader.output.result
     hist = Histogram1D("_2", scheduler=sched)  # columns are called 1..30
     hist.input[0] = loader.output.result
     hist.input.min = lower.output.result
     hist.input.max = upper.output.result
     printer = Every(proc=self.terse, scheduler=sched)
     printer.input[0] = hist.output.result
     aio.run(sched.start())
     _ = hist.trace_stats()
     last_row = notNone(hist.table.last()).to_dict()
     progressive_counts = last_row["array"]
     value_range = (last_row["min"], last_row["max"])
     # Reference histogram computed directly from the CSV file with numpy,
     # using the same number of bins and the same bounds.
     frame = pd.read_csv(
         get_dataset("bigfile"), header=None, usecols=[2]  # type: ignore
     )
     values = frame.to_numpy().reshape(-1)
     expected_counts, _ = np.histogram(  # type: ignore
         values, bins=hist.params.bins, density=False, range=value_range
     )
     self.assertListEqual(progressive_counts.tolist(), expected_counts.tolist())
 def test_histogram2d1(self) -> None:
     """Compare the Histogram2D result on columns 1 and 2 with ``fh.histogram2d``."""
     sched = self.scheduler()
     loader = CSVLoader(
         get_dataset("bigfile"), index_col=False, header=None, scheduler=sched
     )
     lower = Min(scheduler=sched)
     lower.input[0] = loader.output.result
     upper = Max(scheduler=sched)
     upper.input[0] = loader.output.result
     hist2d = Histogram2D(
         1, 2, xbins=100, ybins=100, scheduler=sched
     )  # columns are called 1..30
     hist2d.input[0] = loader.output.result
     hist2d.input.min = lower.output.result
     hist2d.input.max = upper.output.result
     heat = Heatmap(filename="histo_%03d.png", scheduler=sched)
     heat.input.array = hist2d.output.result
     printer = Every(proc=self.terse, scheduler=sched)
     printer.input[0] = heat.output.result
     aio.run(loader.scheduler().start())
     last_row = notNone(hist2d.table.last()).to_dict()
     progressive = last_row["array"]
     y_range = [last_row["ymin"], last_row["ymax"]]
     x_range = [last_row["xmin"], last_row["xmax"]]
     frame = pd.read_csv(
         get_dataset("bigfile"), header=None, usecols=[1, 2]  # type: ignore
     )
     values = frame.to_numpy()
     bin_counts = [hist2d.params.ybins, hist2d.params.xbins]
     # The reference is computed with y first, then x, and flipped along
     # axis 0 before being compared with the progressive result.
     reference = fh.histogram2d(
         values[:, 1], values[:, 0], bins=bin_counts, range=[y_range, x_range]
     )
     reference = np.flip(reference, axis=0)  # type: ignore
     self.assertTrue(np.allclose(progressive, reference))
Exemple #3
0
 def test_09_read_multi_csv_file_with_crash(self) -> None:
     """Load "bigfile" twice, stop the run early, then resume with ``recovery=True``.

     The second loader reuses the same ``recovery_tag`` and the final table
     is expected to hold 2000000 rows.
     """
     sched = self.scheduler()
     tag = "t9"
     file_list = [get_dataset("bigfile"), get_dataset("bigfile")]

     # First run: interrupted by sleep_then_stop(sched, 3).
     loader = CSVLoader(
         file_list,
         index_col=False,
         recovery_tag=tag,
         header=None,
         scheduler=sched,
     )
     self.assertTrue(loader.result is None)
     sink = Sink(name="sink", scheduler=sched)
     sink.input.inp = loader.output.result
     stopper = sleep_then_stop(sched, 3)
     aio.run_gather(sched.start(), stopper)
     _close(loader)

     # Second run: fresh scheduler, recovery enabled with the same tag.
     sched = self.scheduler(clean=True)
     loader = CSVLoader(
         file_list,
         recovery=True,
         recovery_tag=tag,
         index_col=False,
         header=None,
         scheduler=sched,
     )
     self.assertTrue(loader.result is None)
     sink = Sink(name="sink", scheduler=sched)
     sink.input.inp = loader.output.result
     aio.run(sched.start())
     self.assertEqual(len(loader.table), 2000000)
Exemple #4
0
def run_throttled_server(port: int = 8000, threshold: int = 10**6) -> None:
    """Serve ``DATA_DIR`` over HTTP using ``ThrottledReqHandler``.

    The plain and bz2 variants of "smallfile" and "bigfile" are requested
    through ``get_dataset`` / ``get_dataset_bz2`` before the server starts.

    Args:
        port: port passed to ``http_srv.test``.
        threshold: value stored on ``ThrottledReqHandler.threshold``.
    """
    for fetch in (get_dataset, get_dataset_bz2):
        for name in ("smallfile", "bigfile"):
            fetch(name)
    os.chdir(DATA_DIR)
    ThrottledReqHandler.threshold = threshold
    http_srv.test(HandlerClass=ThrottledReqHandler, port=port)  # type: ignore
Exemple #5
0
 def test_read_multiple_csv(self):
     """Feed two "smallfile" paths to a CSVLoader through a Constant; expect 60000 rows."""
     sched = Scheduler()
     paths = [get_dataset('smallfile'), get_dataset('smallfile')]
     filenames = pd.DataFrame({'filename': paths})
     names_module = Constant(df=filenames, scheduler=sched)
     loader = CSVLoader(index_col=False, header=None, scheduler=sched)
     loader.input.filenames = names_module.output.df
     loader.start()
     self.assertEqual(len(loader.df()), 60000)
Exemple #6
0
def run_simple_server() -> None:
    """Request the test datasets, move to ``DATA_DIR`` and import RangeHTTPServer's entry module.

    NOTE(review): importing ``RangeHTTPServer.__main__`` presumably starts the
    server as an import side effect -- confirm against that package.
    """
    for fetch in (get_dataset, get_dataset_bz2):
        for name in ("smallfile", "bigfile"):
            fetch(name)
    os.chdir(DATA_DIR)
    import RangeHTTPServer.__main__  # type: ignore

    # Bare reference so the import above is not flagged as unused.
    RangeHTTPServer.__main__
 def test_read_multiple_csv(self):
     """Feed two "smallfile" paths to a CSVLoader through a Table constant; expect 60000 rows."""
     sched = self.scheduler()
     paths = [get_dataset('smallfile'), get_dataset('smallfile')]
     filenames = Table(
         name='file_names',
         dshape='{filename: string}',
         data={'filename': paths},
     )
     names_module = Constant(table=filenames, scheduler=sched)
     loader = CSVLoader(index_col=False, header=None, scheduler=sched)
     loader.input.filenames = names_module.output.table
     loader.start()
     sched.join()
     self.assertEqual(len(loader.table()), 60000)
Exemple #8
0
 def test_07_read_multi_csv_file_no_crash(self):
     """Load "smallfile" twice from a list of paths and expect 60000 rows."""
     sched = self.scheduler()
     paths = [get_dataset('smallfile'), get_dataset('smallfile')]
     loader = CSVLoader(paths, index_col=False, header=None, scheduler=sched)
     # Nothing is loaded before the scheduler has run.
     self.assertTrue(loader.table() is None)
     sched.start()
     sched.join()
     self.assertEqual(len(loader.table()), 60000)
def run_throttled_server(port=8000, threshold=10**6):
    """Serve ``DATA_DIR`` over HTTP using ``ThrottledReqHandler``.

    The plain and bz2 variants of "smallfile" and "bigfile" are requested
    through ``get_dataset`` / ``get_dataset_bz2`` before the server starts.

    Args:
        port: port the server should listen on.
        threshold: value stored on ``ThrottledReqHandler.threshold``.
    """
    _ = get_dataset('smallfile')
    _ = get_dataset('bigfile')
    _ = get_dataset_bz2('smallfile')
    _ = get_dataset_bz2('bigfile')
    os.chdir(DATA_DIR)
    ThrottledReqHandler.threshold = threshold
    if six.PY2:
        import sys
        # On this branch test() is called without a port, so the port is
        # handed over through sys.argv[1] (presumably the Python 2 test()
        # helper reads it from there -- confirm).  Slice assignment works
        # even when argv has no second element, honours ``port`` and keeps
        # argv made of strings.
        sys.argv[1:2] = [str(port)]
        http_srv.test(HandlerClass=ThrottledReqHandler)
    else:
        http_srv.test(HandlerClass=ThrottledReqHandler, port=port)
Exemple #10
0
def run_simple_server() -> None:
    """Request the test datasets, move to ``DATA_DIR`` and import RangeHTTPServer's entry module.

    The plain, bz2 and gz variants of "smallfile" and "bigfile" are
    requested; the lzma variants are currently disabled.

    NOTE(review): importing ``RangeHTTPServer.__main__`` presumably starts the
    server as an import side effect -- confirm against that package.
    """
    for fetch in (get_dataset, get_dataset_bz2, get_dataset_gz):
        for name in ("smallfile", "bigfile"):
            fetch(name)
    os.chdir(DATA_DIR)
    import RangeHTTPServer.__main__  # type: ignore

    # Reference the module so the import is not reported as unused.
    assert RangeHTTPServer.__main__
Exemple #11
0
 def test_07_read_multi_csv_file_no_crash(self) -> None:
     """Load "smallfile" twice from a list of paths into a Sink; expect 60000 rows."""
     sched = self.scheduler()
     paths = [get_dataset("smallfile"), get_dataset("smallfile")]
     loader = CSVLoader(
         paths,
         index_col=False,
         header=None,
         scheduler=sched,
     )
     # No result exists before the scheduler has run.
     self.assertTrue(loader.result is None)
     sink = Sink(name="sink", scheduler=sched)
     sink.input.inp = loader.output.result
     aio.run(sched.start())
     self.assertEqual(len(loader.table), 60000)
def run_simple_server():
    """Request the test datasets, move to ``DATA_DIR`` and start a range-capable HTTP server.

    On Python 2 the server is started through ``SimpleHTTPServer.test`` with
    ``RangeRequestHandler``; otherwise ``RangeHTTPServer.__main__`` is imported.
    """
    _ = get_dataset('smallfile')
    _ = get_dataset('bigfile')
    _ = get_dataset_bz2('smallfile')
    _ = get_dataset_bz2('bigfile')
    os.chdir(DATA_DIR)
    if six.PY2:
        import SimpleHTTPServer
        import RangeHTTPServer
        from RangeHTTPServer import RangeRequestHandler
        import sys
        # The port is handed over through sys.argv[1].  Slice assignment
        # avoids an IndexError when the process was started without
        # arguments and keeps argv made of strings.
        sys.argv[1:2] = ["8000"]
        SimpleHTTPServer.test(HandlerClass=RangeRequestHandler)
    else:
        import RangeHTTPServer.__main__
 def test_histogram2d(self):
     """Run a CSVLoader -> Min/Max -> Histogram2D -> Heatmap pipeline to completion."""
     sched = self.scheduler()
     loader = CSVLoader(get_dataset('bigfile'),
                        index_col=False,
                        header=None,
                        scheduler=sched)
     lower = Min(scheduler=sched)
     lower.input.table = loader.output.table
     upper = Max(scheduler=sched)
     upper.input.table = loader.output.table
     # columns are called 1..30
     hist2d = Histogram2D(1, 2, xbins=100, ybins=100, scheduler=sched)
     hist2d.input.table = loader.output.table
     hist2d.input.min = lower.output.table
     hist2d.input.max = upper.output.table
     heat = Heatmap(filename='histo_%03d.png', scheduler=sched)
     heat.input.array = hist2d.output.table
     # The printer is attached to the loader output, not to the heatmap.
     printer = Every(proc=self.terse, scheduler=sched)
     printer.input.df = loader.output.table
     loader.scheduler().start()
     sched.join()
     _ = hist2d.trace_stats()
Exemple #14
0
 def test_join(self):
     """Join per-column Stats outputs through a Reduce of BinJoin and print the trace."""
     sched = self.scheduler()
     loader = CSVLoader(get_dataset('bigfile'),
                        index_col=False,
                        header=None,
                        scheduler=sched)
     stats_col1 = Stats(1, reset_index=True, scheduler=sched)
     stats_col1.input.table = loader.output.table
     stats_col2 = Stats(2, reset_index=True, scheduler=sched)
     stats_col2.input.table = loader.output.table
     # NOTE(review): this third Stats module is wired to the loader but is
     # never connected to the reducer below -- confirm whether intended.
     stats_col3 = Stats(3, reset_index=True, scheduler=sched)
     stats_col3.input.table = loader.output.table
     reducer = Reduce(BinJoin, "first", "second", "table", scheduler=sched)
     reducer.input.table = stats_col1.output.stats
     reducer.input.table = stats_col2.output.stats
     joined = reducer.expand()
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input.df = joined.output.table
     length_printer = Every(proc=self.terse, constant_time=True, scheduler=sched)
     length_printer.input.df = loader.output.table
     sched.start()
     trace = joined.trace_stats(max_runs=1)
     print(trace)
Exemple #15
0
 def test_read_vec(self):
     """Run a VECLoader one step at a time and check the rows tagged by each run.

     After every run, the number of rows whose ``UPDATE_COLUMN`` equals
     ``last_update()`` must match the number of rows added by that run.
     At the end, the number of distinct update groups must equal the
     number of runs.
     """
     module = VECLoader(get_dataset('warlogs'),
                        id='test_read_vec')
     self.assertTrue(module.df() is None)
     module.run(0)
     stats = module.trace_stats(max_runs=1)
     df = module.df()
     self.assertFalse(df is None)
     prev_len = len(df)
     self.assertEqual(prev_len, len(df[df[module.UPDATE_COLUMN] == module.last_update()]))
     cnt = 1

     while not module.is_zombie():
         module.run(cnt)
         cnt += 1
         stats = module.trace_stats(max_runs=1)
         df = module.df()
         cur_len = len(df)
         # print() with a single parenthesised argument is valid on both
         # Python 2 and 3; .iloc[-1] replaces the removed pandas .irow(-1).
         print("Run time: %gs, loaded %d rows" % (stats['duration'].iloc[-1], cur_len))
         self.assertEqual(cur_len - prev_len,
                          len(df[df[module.UPDATE_COLUMN] == module.last_update()]))
         prev_len = cur_len
     stats = module.trace_stats(max_runs=1)
     print("Done. Run time: %gs, loaded %d rows" % (stats['duration'].iloc[-1], len(module.df())))
     groups = module.df().groupby([Module.UPDATE_COLUMN])
     self.assertEqual(cnt, len(groups))
    def test_join(self):
        """Join the last rows of two Stats modules and compare them with the loaded data.

        The joined row must hold the min and max of columns 1 and 2 of the
        data frame produced by the CSVLoader.
        """
        s = Scheduler()
        csv = CSVLoader(get_dataset('bigfile'), index_col=False, header=None, scheduler=s)
        stat1 = Stats(1, scheduler=s)
        stat1.input.df = csv.output.df
        stat2 = Stats(2, scheduler=s)
        stat2.input.df = csv.output.df
        lr1 = LastRow(scheduler=s)
        lr1.input.df = stat1.output.stats
        lr2 = LastRow(scheduler=s)
        lr2.input.df = stat2.output.stats
        join = Join(scheduler=s)
        join.input.df = lr1.output.df
        join.input.df = lr2.output.df
        pr = Print(scheduler=s)
        pr.input.df = join.output.df
        prlen = Every(proc=print_len, constant_time=True, scheduler=s)
        prlen.input.df = csv.output.df
        s.start()
        res = join.trace_stats(max_runs=1)
        pd.set_option('display.expand_frame_repr', False)
        last = join.df()
        df = csv.df()
        # One assertion per statistic so a failure names the offending value.
        self.assertEqual(last.at[0, '1.min'], df[1].min())
        self.assertEqual(last.at[0, '1.max'], df[1].max())
        self.assertEqual(last.at[0, '2.min'], df[2].min())
        self.assertEqual(last.at[0, '2.max'], df[2].max())

        # Function-call form of print is valid on both Python 2 and 3.
        print(res)
Exemple #17
0
    def test_mb_k_means(self) -> None:
        """Cluster the "cluster:s3" dataset with MBKMeans and check one label per row.

        The test returns early when the dataset download raises TimeoutError.
        """
        sched = self.scheduler()
        n_clusters = 3
        try:
            dataset = (get_dataset("cluster:s3"), )
        except TimeoutError:
            print("Cannot download cluster:s3")
            return

        with sched:
            loader = CSVLoader(
                dataset,
                sep=" ",
                skipinitialspace=True,
                header=None,
                index_col=False,
                scheduler=sched,
            )
            kmeans = MBKMeans(
                n_clusters=n_clusters,
                random_state=42,
                is_input=False,
                is_greedy=False,
                scheduler=sched,
            )
            # The loader is attached through the helper rather than by
            # assigning km.input.table directly.
            kmeans.create_dependent_modules(loader)
            printer = Print(proc=self.terse, scheduler=sched)
            printer.input[0] = kmeans.output.result
            label_watcher = Every(proc=self.terse, scheduler=sched)
            label_watcher.input[0] = kmeans.output.labels
        aio.run(sched.start())
        labels = kmeans.labels()
        assert labels is not None
        self.assertEqual(len(loader.table), len(labels))
 def test_load_csv(self):
     """
     Connecting modules via function calls
     """
     loaded = pv.load_csv(get_dataset('bigfile'), index_col=False, header=None)
     min_mod = pv.min(loaded)
     pv.echo(min_mod, proc=prtm)
     max_mod = pv.max(loaded)
     pv.echo(max_mod, proc=prtM)
     trace = max_mod["_trace"]
     pv.echo(trace, proc=prtT)
     self.assertEqual(loaded.scheduler(), loaded.module.scheduler())
     loaded.scheduler().start()
     loaded.scheduler().join()
     table = loaded.table
     last_min = min_mod.table.last()
     last_max = max_mod.table.last()
     self.assertEqual(len(table), 1000000)
     # Each column's min/max must match the last row of the Min/Max tables.
     for col in table.columns:
         column = table[col]
         self.assertEqual(column.min(), last_min[col])
         self.assertEqual(column.max(), last_max[col])
 def test_piped_load_csv2(self):
     """
     Connecting modules via the pipe operator (only one pipe)
     """
     pipeline = (PipedInput(get_dataset('bigfile'))
                 | pv.load_csv(index_col=False, header=None) | pv.min()
                 | pv.echo(proc=prtm).repipe('csv_loader_1') | pv.max()
                 | pv.echo(proc=prtM).repipe('max_1', out='_trace')
                 | pv.echo(proc=prtT))
     min_mod = pipeline.fetch('min_1')
     max_mod = pipeline.fetch('max_1')
     loaded = pipeline.fetch('csv_loader_1')
     self.assertEqual(loaded.scheduler(), loaded.module.scheduler())
     loaded.scheduler().start()
     loaded.scheduler().join()
     table = loaded.table
     last_min = min_mod.table.last()
     last_max = max_mod.table.last()
     self.assertEqual(len(table), 1000000)
     # Each column's min/max must match the last row of the Min/Max tables.
     for col in table.columns:
         column = table[col]
         self.assertEqual(column.min(), last_min[col])
         self.assertEqual(column.max(), last_max[col])
 def test_read_csv(self):
     """Load "bigfile" with a CSVLoader and expect 1000000 rows."""
     sched = self.scheduler()
     loader = CSVLoader(
         get_dataset('bigfile'), index_col=False, header=None, scheduler=sched
     )
     # No table exists before the scheduler has run.
     self.assertTrue(loader.table() is None)
     sched.start()
     sched.join()
     self.assertEqual(len(loader.table()), 1000000)
    def test_scheduler(self):
        """Add modules to a running MTScheduler and check the resulting run order."""
        sched = MTScheduler()
        loader = CSVLoader(
            get_dataset('bigfile'), index_col=False, header=None, scheduler=sched
        )

        sample1 = Sample(n=10, scheduler=sched)
        sample1.input.df = loader.output.df

        loader.scheduler().start()

        sleep(1)
        self.assertTrue(loader.scheduler().is_running())

        # This second sampler is created while the scheduler is running.
        sample2 = Sample(n=15, scheduler=sched)
        sample2.input.df = loader.output.df

        def add_min():
            # Sleeping here would be a bad idea; the callback is registered
            # through add_oneshot_tick_proc so that the scheduler executes it
            # atomically.  Sleeping outside of add_oneshot_tick_proc would
            # lead to an inconsistent state.
            min_mod = Min(scheduler=sched)
            min_mod.input.df = sample2.output.df
            printer = Print(scheduler=sched)
            printer.input.df = min_mod.output.df

        sched.add_oneshot_tick_proc(add_min)

        sleep(1)
        run_order = sched._runorder
        self.assertTrue(run_order.index(sample1.id) > run_order.index(loader.id))
        self.assertTrue(run_order.index(sample2.id) > run_order.index(loader.id))
        sched.stop()
        sched.join()
    def t_histogram1d_impl(self, **kw: Any) -> None:
        """Check Histogram1D on column ``_2`` downstream of a Stirrer.

        Args:
            **kw: extra keyword arguments forwarded to the ``Stirrer`` constructor.
        """
        sched = self.scheduler()
        loader = CSVLoader(
            get_dataset("bigfile"), index_col=False, header=None, scheduler=sched
        )
        stirrer = Stirrer(
            update_column="_2", fixed_step_size=1000, scheduler=sched, **kw
        )
        stirrer.input[0] = loader.output.result
        lower = Min(scheduler=sched)
        lower.input[0] = stirrer.output.result
        upper = Max(scheduler=sched)
        upper.input[0] = stirrer.output.result
        hist = Histogram1D("_2", scheduler=sched)  # columns are called 1..30
        hist.input[0] = stirrer.output.result
        hist.input.min = lower.output.result
        hist.input.max = upper.output.result

        printer = Every(proc=self.terse, scheduler=sched)
        printer.input[0] = hist.output.result
        aio.run(sched.start())
        _ = hist.trace_stats()
        last_row = notNone(hist.table.last()).to_dict()
        progressive_counts = last_row["array"]
        value_range = (last_row["min"], last_row["max"])
        # The reference is computed from the stirrer's own table, with the
        # same bins and bounds as the progressive histogram.
        column_table = stirrer.table.loc[:, ["_2"]]
        assert column_table is not None
        values = column_table.to_array().reshape(-1)
        expected_counts, _ = np.histogram(  # type: ignore
            values, bins=hist.params.bins, density=False, range=value_range
        )
        self.assertEqual(np.sum(progressive_counts), np.sum(expected_counts))
        self.assertListEqual(progressive_counts.tolist(), expected_counts.tolist())
Exemple #23
0
 def test_sample(self):
     """Run a CSVLoader -> Sample(n=10) -> Print pipeline."""
     sched = Scheduler()
     loader = CSVLoader(
         get_dataset('bigfile'), index_col=False, header=None, scheduler=sched
     )
     sampler = Sample(n=10, scheduler=sched)
     sampler.input.df = loader.output.df
     printer = Print(scheduler=sched)
     printer.input.df = sampler.output.df
     loader.scheduler().start()
Exemple #24
0
 def test_query_simple(self):
     """A Select without a query input must pass through all 1000000 rows."""
     sched = Scheduler()
     loader = CSVLoader(
         get_dataset('bigfile'),
         index_col=False,
         header=None,
         force_valid_ids=True,
         scheduler=sched,
     )
     select = Select(scheduler=sched)
     select.input.df = loader.output.df
     length_printer = Every(constant_time=True, scheduler=sched)
     length_printer.input.df = select.output.df
     sched.start()
     self.assertEqual(len(select.df()), 1000000)
Exemple #25
0
 def load_csv(self):
     """Load "smallfile" with a CSVLoader and compare every column with ``pd.read_csv``."""
     loader = CSVLoader(
         filepath_or_buffer=get_dataset('smallfile'),
         force_valid_ids=True,
         index_col=False,
         header=None,
         scheduler=self.scheduler,
     )
     self.assertTrue(loader.table() is None)
     self.scheduler.start()
     self.scheduler.join()
     table = loader.table()
     self.assertFalse(table is None)
     self.assertEqual(len(table), 30000)
     frame = pd.read_csv(
         filepath_or_buffer=get_dataset('smallfile'),
         index_col=False,
         header=None,
     )
     # Column-by-column comparison against the pandas reference.
     for col in range(table.ncol):
         self.assertTrue(np.all(frame[col] == table[col].values))
Exemple #26
0
 def test_MDS_csv(self):
     """Run CSVLoader -> PairwiseDistances with ``ten_times`` passed to the scheduler start."""
     global times
     sched = Scheduler()
     loader = CSVLoader(
         get_dataset('smallfile'), index_col=False, header=None, scheduler=sched
     )
     distances = PairwiseDistances(metric='euclidean', scheduler=sched)
     distances.input.df = loader.output.df
     counter = Every(proc=print_len, constant_time=True, scheduler=sched)
     counter.input.df = distances.output.dist
     # Reset the module-level counter before starting the scheduler.
     times = 0
     sched.start(ten_times)
 def test_scatterplot(self):
     """Build a ScatterPlot on columns ``_1``/``_2`` of "bigfile" and expect 1000000 loaded rows."""
     s = Scheduler()
     csv = CSVLoader(get_dataset('bigfile'), index_col=False, header=None,
                     force_valid_ids=True, scheduler=s)
     sp = ScatterPlot(x_column='_1', y_column='_2', scheduler=s)
     sp.create_dependent_modules(csv, 'df')
     cnt = Every(proc=print_len, constant_time=True, scheduler=s)
     cnt.input.df = csv.output.df
     prt = Print(scheduler=s)
     prt.input.df = sp.histogram2d.output.df
     csv.scheduler().start(None, idle_proc)
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(len(csv.df()), 1000000)
Exemple #28
0
 def test_query(self):
     """A Select driven by the query ``_1 < 0.5`` must return fewer than 1000000 rows."""
     sched = Scheduler()
     loader = CSVLoader(
         get_dataset('bigfile'),
         index_col=False,
         header=None,
         force_valid_ids=True,
         scheduler=sched,
     )
     query_frame = pd.DataFrame({'query': ['_1 < 0.5']})
     query_const = Constant(query_frame, scheduler=sched)
     select = Select(scheduler=sched)
     select.input.df = loader.output.df
     select.input.query = query_const.output.df
     length_printer = Every(constant_time=True, scheduler=sched)
     length_printer.input.df = select.output.df
     sched.start()
     self.assertTrue(len(select.df()) < 1000000)
 def test_percentile(self):
     """Run Percentiles on column ``_1`` of "smallfile" until the scheduler finishes."""
     sched = self.scheduler()
     loader = CSVLoader(
         get_dataset('smallfile'), index_col=False, header=None, scheduler=sched
     )
     percentiles = Percentiles(
         '_1',
         name='test_percentile',
         percentiles=[0.1, 0.25, 0.5, 0.75, 0.9],
         scheduler=sched,
     )
     percentiles.input.table = loader.output.table
     printer = Every(proc=self.terse, name='print', scheduler=sched)
     printer.input.df = percentiles.output.percentiles

     sched.start()
     sched.join()
    def test_csv_distances(self):
        """Run a CSVLoader feeding an Every counter, with ``ten_times`` passed to start()."""
        global times
        sched = self.scheduler()
        loader = CSVLoader(
            get_dataset('smallfile'), index_col=False, header=None, scheduler=sched
        )
        # The PairwiseDistances stage is disabled in this version of the
        # test; the counter is fed directly by the loader output.
        counter = Every(proc=self.terse, constant_time=True, scheduler=sched)
        counter.input.df = loader.output.table
        # Reset the module-level counter before starting the scheduler.
        times = 0
        sched.start(ten_times)
        sched.join()
        _ = loader.table()
 def test_mb_k_means(self):
     """Cluster the "cluster:s3" dataset with MBKMeans and check one label per loaded row."""
     s = Scheduler()
     n_clusters = 3
     csv = CSVLoader(get_dataset('cluster:s3'), sep=' ', skipinitialspace=True,
                     header=None, index_col=False, scheduler=s)
     km = MBKMeans(n_clusters=n_clusters, random_state=42, is_input=False, scheduler=s)
     km.input.df = csv.output.df
     pr = Print(scheduler=s)
     pr.input.df = km.output.df
     e = Every(scheduler=s)
     e.input.df = km.output.labels
     s.start()
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(len(csv.df()), len(km.labels()))
 def NOtest_vec_distances(self):
     """Compare PairwiseDistances (cosine) on the "warlogs" data with ``pairwise_distances``.

     The ``NOtest_`` prefix keeps this method out of unittest's default
     ``test*`` discovery.
     """
     global times
     s = Scheduler()
     vec = VECLoader(get_dataset('warlogs'), scheduler=s)
     dis = PairwiseDistances(metric='cosine', scheduler=s)
     dis.input.df = vec.output.df
     dis.input.array = vec.output.array
     cnt = Every(proc=print_len, constant_time=True, scheduler=s)
     cnt.input.df = dis.output.dist
     times = 0
     s.start()
     df = vec.df()
     computed = dis.dist()
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(computed.shape[0], len(df))
     truth = pairwise_distances(vec.toarray(), metric=dis._metric)
     self.assertTrue(np.allclose(truth, computed))
 def test_percentile(self):
     """Run Percentiles on column 1 of "smallfile" and print the module's trace stats."""
     s = Scheduler()
     csv_module = CSVLoader(get_dataset('smallfile'), index_col=False, header=None, scheduler=s)
     module = Percentiles(1, id='test_percentile',
                          percentiles=[0.1, 0.25, 0.5, 0.75, 0.9],
                          scheduler=s)
     module.describe()
     csv_module.describe()
     connect(csv_module, 'df', module, 'df')
     connect(module, 'percentiles',
             Print(id='print', scheduler=s), 'df')
     s.start()
     ret = module.trace_stats(max_runs=1)
     pd.set_option('display.expand_frame_repr', False)
     # Function-call form of print is valid on both Python 2 and 3.
     print(ret)
Exemple #34
0
 def test_stats(self):
     """Run Stats on column 1 of "smallfile", with a Wait module feeding its ``_params`` slot."""
     s = Scheduler()
     csv_module = CSVLoader(get_dataset('smallfile'), index_col=False, header=None,
                            scheduler=s)
     stats = Stats(1, id='test_stats', scheduler=s)
     wait = Wait(id='wait', delay=3, scheduler=s)
     wait.input.df = csv_module.output.df
     stats.input._params = wait.output.df
     stats.input.df = csv_module.output.df
     pr = Print(id='print', scheduler=s)
     pr.input.df = stats.output.stats
     s.start()
     # Keep the scheduler and the trace in separate variables instead of
     # rebinding ``s``.
     trace = stats.trace_stats(max_runs=1)
     pd.set_option('display.expand_frame_repr', False)
     # Function-call form of print is valid on both Python 2 and 3.
     print(trace)
    def test_histogram1d(self):
        """Run CSVLoader -> Min/Max -> Histogram1D on column 2 and print the trace stats.

        The scheduler is stopped from its tick callback once the loader
        reports ``is_terminated()``.
        """
        s = Scheduler()
        csv = CSVLoader(get_dataset("bigfile"), index_col=False, header=None, scheduler=s)
        # Named min_/max_ so the builtins min() and max() are not shadowed.
        min_ = Min(scheduler=s)
        min_.input.df = csv.output.df
        max_ = Max(scheduler=s)
        max_.input.df = csv.output.df
        histogram1d = Histogram1D(2, scheduler=s)  # columns are called 1..30
        histogram1d.input.df = csv.output.df
        histogram1d.input.min = min_.output.df
        histogram1d.input.max = max_.output.df

        pr = Every(scheduler=s)
        pr.input.df = csv.output.df
        s.start(tick_proc=lambda s, r: csv.is_terminated() and s.stop())
        trace = histogram1d.trace_stats()
        pd.set_option("display.expand_frame_repr", False)
        # Function-call form of print is valid on both Python 2 and 3.
        print(trace)
    def test_csv_distances(self):
        """Compare PairwiseDistances (euclidean) with ``pairwise_distances`` on the first 5000 rows."""
        global times
        sched = Scheduler()
        loader = CSVLoader(
            get_dataset('smallfile'), index_col=False, header=None, scheduler=sched
        )
        distances = PairwiseDistances(metric='euclidean', scheduler=sched)
        distances.input.df = loader.output.df
        counter = Every(proc=print_len, constant_time=True, scheduler=sched)
        counter.input.df = distances.output.dist
        times = 0
        sched.start(ten_times)
        frame = loader.df()
        computed = distances.dist()

        # Drop the update column so only the data columns enter the reference.
        del frame[CSVLoader.UPDATE_COLUMN]
        start = 0
        stop = start + 5000
        truth = pairwise_distances(frame.iloc[start:stop], metric=distances._metric)
        window = computed[start:stop, start:stop]
        self.assertTrue(np.allclose(truth, window, atol=1e-7))  # reduce tolerance
 def test_histogram2d(self):
     """Run CSVLoader -> Min/Max -> Histogram2D -> Heatmap and print the trace stats."""
     s = Scheduler()
     csv = CSVLoader(get_dataset('bigfile'), index_col=False, header=None, scheduler=s)
     # Named min_/max_ so the builtins min() and max() are not shadowed.
     min_ = Min(scheduler=s)
     min_.input.df = csv.output.df
     max_ = Max(scheduler=s)
     max_.input.df = csv.output.df
     histogram2d = Histogram2D(1, 2, xbins=100, ybins=100, scheduler=s)  # columns are called 1..30
     histogram2d.input.df = csv.output.df
     histogram2d.input.min = min_.output.df
     histogram2d.input.max = max_.output.df
     heatmap = Heatmap(filename='histo_%03d.png', scheduler=s)
     heatmap.input.array = histogram2d.output.df
     # The printer is attached to the loader output, not to the heatmap.
     pr = Every(scheduler=s)
     pr.input.df = csv.output.df
     csv.scheduler().start()
     trace = histogram2d.trace_stats()
     pd.set_option('display.expand_frame_repr', False)
     # Function-call form of print is valid on both Python 2 and 3.
     print(trace)
from progressivis import *
from progressivis.vis import ScatterPlot
from progressivis.io import CSVLoader
from progressivis.datasets import get_dataset

def filter(df):
    """Return the rows of *df* whose ``pickup_longitude`` lies strictly between -80 and -70."""
    longitude = df['pickup_longitude']
    in_range = (longitude < -70) & (longitude > -80)
    return df[in_range]

def print_len(x):
    """Print ``len(x)``; do nothing when *x* is None."""
    if x is not None:
        # Function-call form of print is valid on both Python 2 and 3.
        print(len(x))

#log_level()

# Reuse a ``scheduler`` already defined in the enclosing namespace when there
# is one; otherwise create a fresh Scheduler.  Only NameError is caught, since
# that is the only failure a bare name lookup can raise.
try:
    s = scheduler
except NameError:
    s = Scheduler()

# Pipeline: CSV loader -> Every, plus a scatter plot on columns _1 and _2.
csv = CSVLoader(get_dataset('bigfile'),header=None,index_col=False,force_valid_ids=True,scheduler=s)
pr = Every(scheduler=s)
pr.input.df = csv.output.df
scatterplot = ScatterPlot('_1', '_2', scheduler=s)
scatterplot.create_dependent_modules(csv,'df')

if __name__=='__main__':
    csv.start()
    s.join()
    # Function-call form of print is valid on both Python 2 and 3.
    print(len(csv.df()))
from progressivis import Scheduler, Every, Print
from progressivis.io import CSVLoader
from progressivis.stats import Histogram2D, Min, Max
from progressivis.datasets import get_dataset

# Function-call form of print is valid on both Python 2 and 3.
print("Loading test_histogram2d")
print("Type of default_scheduler is %s" % type(Scheduler.default))

# No scheduler is passed to any module constructor in this script.
csv = CSVLoader(get_dataset("bigfile"), index_col=False, header=None, engine="c")
pr = Every()
pr.input.df = csv.output.df
min = Min()
min.input.df = csv.output.df
max = Max()
max.input.df = csv.output.df
histogram2d = Histogram2D(1, 2, xbins=128, ybins=128)
histogram2d.input.df = csv.output.df
histogram2d.input.min = min.output.df
histogram2d.input.max = max.output.df
# ``pr`` is rebound here: the Every module above stays connected to the
# loader, and this Print module shows the histogram output.
pr = Print(id="print")
pr.input.df = histogram2d.output.df

if __name__ == "__main__":
    csv.start()
https://cs.joensuu.fi/sipu/datasets/
"""
from progressivis import Scheduler, Every#, log_level
from progressivis.cluster import MBKMeans
from progressivis.io import CSVLoader
from progressivis.stats import Min, Max, Histogram2D
from progressivis.vis import Heatmap, ScatterPlot
from progressivis.datasets import get_dataset

# Reuse a ``scheduler`` already defined in the enclosing namespace when there
# is one; otherwise create a fresh Scheduler.
try:
    s = scheduler
except NameError:
    s = Scheduler()
    #log_level(package="progressivis.cluster")

# Load the 'cluster:s3' dataset; fields are separated by a run of four spaces
# and the file has no header row.
data = CSVLoader(get_dataset('cluster:s3'),sep='    ',skipinitialspace=True,header=None,index_col=False,scheduler=s)
# Mini-batch k-means over columns 0 and 1, with 15 clusters.
mbkmeans = MBKMeans(columns=[0, 1], n_clusters=15, batch_size=100, scheduler=s)
mbkmeans.input.df = data.output.df
prn = Every(scheduler=s)
prn.input.df = mbkmeans.output.df
sp = ScatterPlot(0,1, scheduler=s)
sp.move_point = mbkmeans # for input management
#sp.create_dependent_modules(mbkmeans,'centroids')
# Create modules by hand rather than with the utility.
# We show the cluster centroids on the scatterplot and the
# data as a heatmap

# histogram2d
histogram2d = Histogram2D(0, 1, scheduler=s)
histogram2d.input.df = data.output.df
# Min over columns 0 and 1 (the rest of the wiring is outside this excerpt).
min_mod = Min([0,1], scheduler=s)
Exemple #41
0
 def test_read_csv(self):
     """Load "bigfile" with a CSVLoader and expect 1000000 rows."""
     sched = Scheduler()
     loader = CSVLoader(
         get_dataset('bigfile'), index_col=False, header=None, scheduler=sched
     )
     # No data frame exists before the scheduler has run.
     self.assertTrue(loader.df() is None)
     sched.start()
     self.assertEqual(len(loader.df()), 1000000)