def test_09_read_multi_csv_file_with_crash(self) -> None:
    s = self.scheduler()
    tag = "t9"
    file_list = [get_dataset("bigfile"), get_dataset("bigfile")]
    module = CSVLoader(
        file_list, index_col=False, recovery_tag=tag, header=None, scheduler=s
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    sts = sleep_then_stop(s, 3)
    aio.run_gather(s.start(), sts)
    _close(module)
    s = self.scheduler(clean=True)
    module = CSVLoader(
        file_list,
        recovery=True,
        recovery_tag=tag,
        index_col=False,
        header=None,
        scheduler=s,
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    aio.run(s.start())
    self.assertEqual(len(module.table), 2000000)
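# Hypothetical reconstruction of the sleep_then_stop helper used by the crash
# tests in this collection (its definition is not shown here): wait `delay`
# seconds, then stop the scheduler so the CSVLoader is interrupted mid-load.
# Assumes aio wraps asyncio; in some progressivis versions Scheduler.stop()
# is synchronous, in which case the await on it should be dropped.
async def sleep_then_stop(s, delay):
    await aio.sleep(delay)
    await s.stop()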
def test_read_csv(self):
    s = self.scheduler()
    module = CSVLoader(
        get_dataset('bigfile'), index_col=False, header=None, scheduler=s
    )
    self.assertTrue(module.table() is None)
    s.start()
    s.join()
    self.assertEqual(len(module.table()), 1000000)
def test_01_read_http_csv_with_crash_and_counter(self) -> None:
    self._http_srv = _HttpSrv()
    tag = self.get_tag()
    s = self.scheduler()
    url = make_url("bigfile")
    module = CSVLoader(
        url, index_col=False, recovery_tag=tag, header=None, scheduler=s
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    sts = sleep_then_stop(s, 2)
    aio.run_gather(s.start(), sts)
    self._http_srv.restart()
    s = self.scheduler(clean=True)
    csv = CSVLoader(
        url,
        recovery=True,
        index_col=False,
        recovery_tag=tag,
        header=None,
        scheduler=s,
    )
    counter = Counter(scheduler=s)
    counter.input[0] = csv.output.result
    self.assertTrue(csv.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = counter.output.result
    aio.run(s.start())
    self.assertEqual(len(csv.table), 1000000)
    self.assertEqual(counter.table["counter"].loc[0], 1000000)
def test_read_fake_csv(self):
    s = self.scheduler()
    module = CSVLoader(
        RandomBytesIO(cols=30, rows=1000000),
        index_col=False,
        header=None,
        scheduler=s,
    )
    self.assertTrue(module.table() is None)
    s.start()
    s.join()
    self.assertEqual(len(module.table()), 1000000)
def test_histogram2d(self):
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset('bigfile'), index_col=False, header=None, scheduler=s
    )
    min_ = Min(scheduler=s)
    min_.input.table = csv.output.table
    max_ = Max(scheduler=s)
    max_.input.table = csv.output.table
    histogram2d = Histogram2D(
        1, 2, xbins=100, ybins=100, scheduler=s
    )  # columns are called 1..30
    histogram2d.input.table = csv.output.table
    histogram2d.input.min = min_.output.table
    histogram2d.input.max = max_.output.table
    heatmap = Heatmap(filename='histo_%03d.png', scheduler=s)
    heatmap.input.array = histogram2d.output.table
    # pr = Print(scheduler=s)
    pr = Every(proc=self.terse, scheduler=s)
    # pr.input.df = heatmap.output.heatmap
    # pr.input.df = histogram2d.output.df
    pr.input.df = csv.output.table
    csv.scheduler().start()
    s.join()
    # self.scheduler.thread.join()
    s = histogram2d.trace_stats()
def test_scheduler(self):
    s = MTScheduler()
    csv = CSVLoader(
        get_dataset('bigfile'), index_col=False, header=None, scheduler=s
    )
    smp = Sample(n=10, scheduler=s)
    smp.input.df = csv.output.df
    csv.scheduler().start()
    sleep(1)
    self.assertTrue(csv.scheduler().is_running())
    smp2 = Sample(n=15, scheduler=s)
    smp2.input.df = csv.output.df

    def add_min():
        m = Min(scheduler=s)
        # Of course, sleeping here is a bad idea; it is only meant to
        # illustrate that add_min is executed atomically by the scheduler.
        # Sleeping outside of add_oneshot_tick_proc would leave the
        # scheduler in an inconsistent state.
        # sleep(1)
        m.input.df = smp2.output.df
        prt = Print(scheduler=s)
        prt.input.df = m.output.df

    s.add_oneshot_tick_proc(add_min)
    sleep(1)
    self.assertTrue(s._runorder.index(smp.id) > s._runorder.index(csv.id))
    self.assertTrue(s._runorder.index(smp2.id) > s._runorder.index(csv.id))
    # self.assertTrue(s._runorder.index(m.id) > s._runorder.index(smp2.id))
    s.stop()
    s.join()
def test_join(self):
    s = Scheduler()
    csv = CSVLoader(
        get_dataset('bigfile'), index_col=False, header=None, scheduler=s
    )
    stat1 = Stats(1, scheduler=s)
    stat1.input.df = csv.output.df
    stat2 = Stats(2, scheduler=s)
    stat2.input.df = csv.output.df
    lr1 = LastRow(scheduler=s)
    lr1.input.df = stat1.output.stats
    lr2 = LastRow(scheduler=s)
    lr2.input.df = stat2.output.stats
    join = Join(scheduler=s)
    join.input.df = lr1.output.df
    join.input.df = lr2.output.df
    pr = Print(scheduler=s)
    pr.input.df = join.output.df
    prlen = Every(proc=print_len, constant_time=True, scheduler=s)
    prlen.input.df = csv.output.df
    s.start()
    res = join.trace_stats(max_runs=1)
    pd.set_option('display.expand_frame_repr', False)
    last = join.df()
    df = csv.df()
    self.assertTrue(
        last.at[0, '1.min'] == df[1].min()
        and last.at[0, '1.max'] == df[1].max()
        and last.at[0, '2.min'] == df[2].min()
        and last.at[0, '2.max'] == df[2].max()
    )
    print(res)
def test_histogram2d1(self) -> None:
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=s
    )
    min_ = Min(scheduler=s)
    min_.input[0] = csv.output.result
    max_ = Max(scheduler=s)
    max_.input[0] = csv.output.result
    histogram2d = Histogram2D(
        1, 2, xbins=100, ybins=100, scheduler=s
    )  # columns are called 1..30
    histogram2d.input[0] = csv.output.result
    histogram2d.input.min = min_.output.result
    histogram2d.input.max = max_.output.result
    heatmap = Heatmap(filename="histo_%03d.png", scheduler=s)
    heatmap.input.array = histogram2d.output.result
    pr = Every(proc=self.terse, scheduler=s)
    pr.input[0] = heatmap.output.result
    aio.run(csv.scheduler().start())
    last = notNone(histogram2d.table.last()).to_dict()
    h1 = last["array"]
    bounds = [[last["ymin"], last["ymax"]], [last["xmin"], last["xmax"]]]
    df = pd.read_csv(
        get_dataset("bigfile"), header=None, usecols=[1, 2]  # type: ignore
    )
    v = df.to_numpy()  # .reshape(-1, 2)
    bins = [histogram2d.params.ybins, histogram2d.params.xbins]
    h2 = fh.histogram2d(v[:, 1], v[:, 0], bins=bins, range=bounds)
    h2 = np.flip(h2, axis=0)  # type: ignore
    self.assertTrue(np.allclose(h1, h2))
def test_06_read_multiple_csv_bz2_crash_recovery(self) -> None:
    p = Process(target=run_throttled_server, args=(8000, 10**6))
    p.start()
    self._http_proc = p
    time.sleep(SLEEP)
    s = self.scheduler()
    filenames = Table(
        name="file_names",
        dshape="{filename: string}",
        data={
            "filename": [
                make_url("smallfile", ext=BZ2),
                make_url("smallfile", ext=BZ2),
            ]
        },
    )
    cst = Constant(table=filenames, scheduler=s)
    csv = CSVLoader(index_col=False, header=None, scheduler=s, timeout=0.01)
    csv.input.filenames = cst.output.result
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = csv.output.result
    aio.run(csv.start())
    _close(csv)
    self.assertEqual(len(csv.table), 60000)
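# Hypothetical stand-in for run_throttled_server(port, limit), which is not
# defined in this collection. The crash-recovery tests suggest it serves the
# dataset files over HTTP but cuts each response off after a byte budget, so
# the loader sees a broken stream and must exercise its recovery path. A
# minimal sketch under that assumption:
import http.server
import socketserver

def run_throttled_server(port=8000, limit=10**6):
    class ThrottledHandler(http.server.SimpleHTTPRequestHandler):
        def copyfile(self, source, outputfile):
            # Send at most `limit` bytes, then let the connection drop.
            outputfile.write(source.read(limit))

    with socketserver.TCPServer(("", port), ThrottledHandler) as httpd:
        httpd.serve_forever()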
def test_06_read_http_multi_csv_bz2_with_crash(self) -> None:
    self._http_srv = _HttpSrv()
    tag = self.get_tag()
    s = self.scheduler()
    url_list = [make_url("bigfile", ext=BZ2)] * 2
    module = CSVLoader(
        url_list, index_col=False, recovery_tag=tag, header=None, scheduler=s
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    sts = sleep_then_stop(s, 3)
    aio.run_gather(s.start(), sts)
    self._http_srv.restart()
    s = self.scheduler(clean=True)
    module = CSVLoader(
        url_list,
        recovery=True,
        recovery_tag=tag,
        index_col=False,
        header=None,
        scheduler=s,
    )
    self.assertTrue(module.result is None)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = module.output.result
    aio.run(s.start())
    self.assertEqual(len(module.table), 2000000)
def test_sample(self):
    s = Scheduler()
    csv = CSVLoader(
        get_dataset('bigfile'), index_col=False, header=None, scheduler=s
    )
    smp = Sample(n=10, scheduler=s)
    smp.input.df = csv.output.df
    prt = Print(scheduler=s)
    prt.input.df = smp.output.df
    csv.scheduler().start()
def test_read_multiple_csv(self):
    s = Scheduler()
    filenames = pd.DataFrame(
        {'filename': [get_dataset('smallfile'), get_dataset('smallfile')]}
    )
    cst = Constant(df=filenames, scheduler=s)
    csv = CSVLoader(index_col=False, header=None, scheduler=s)
    csv.input.filenames = cst.output.df
    csv.start()
    self.assertEqual(len(csv.df()), 60000)
def p10s_read_csv(self):
    s = Scheduler()
    module = CSVLoader(
        RandomBytesIO(cols=30, size=self.current_step * GIGA),
        index_col=False,
        header=None,
        scheduler=s,
    )
    module.start()
def _tst_08_read_multi_csv_file_compress_no_crash(self, files):
    s = self.scheduler()
    module = CSVLoader(
        files, index_col=False, header=None, scheduler=s
    )  # , save_context=False
    self.assertTrue(module.table() is None)
    # decorate(s, Patch1("csv_loader_1"))
    s.start()
    s.join()
    self.assertEqual(len(module.table()), 60000)
def test_read_multiple_csv(self):
    s = self.scheduler()
    filenames = Table(
        name='file_names',
        dshape='{filename: string}',
        data={'filename': [get_dataset('smallfile'), get_dataset('smallfile')]},
    )
    cst = Constant(table=filenames, scheduler=s)
    csv = CSVLoader(index_col=False, header=None, scheduler=s)
    csv.input.filenames = cst.output.table
    csv.start()
    s.join()
    self.assertEqual(len(csv.table()), 60000)
def test_scatterplot(self):
    s = Scheduler()
    csv = CSVLoader(
        get_dataset('bigfile'),
        index_col=False,
        header=None,
        force_valid_ids=True,
        scheduler=s,
    )
    sp = ScatterPlot(x_column='_1', y_column='_2', scheduler=s)
    sp.create_dependent_modules(csv, 'df')
    cnt = Every(proc=print_len, constant_time=True, scheduler=s)
    cnt.input.df = csv.output.df
    prt = Print(scheduler=s)
    prt.input.df = sp.histogram2d.output.df
    csv.scheduler().start(None, idle_proc)
    self.assertEqual(len(csv.df()), 1000000)
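# Hypothetical sketch of the idle_proc callback passed to start() above and in
# the MCScatterPlot test later on; it is not defined in this collection. The
# assumed contract, including the (scheduler, run_number) signature, is that
# the scheduler invokes it when no module has work left, and that stopping the
# scheduler is the desired reaction so the open-ended run loop terminates.
def idle_proc(scheduler, run_number):
    scheduler.stop()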
def test_mb_k_means(self):
    # log_level()
    s = Scheduler()
    n_clusters = 3
    csv = CSVLoader(
        get_dataset('cluster:s3'),
        sep=' ',
        skipinitialspace=True,
        header=None,
        index_col=False,
        scheduler=s,
    )
    km = MBKMeans(
        n_clusters=n_clusters, random_state=42, is_input=False, scheduler=s
    )
    km.input.df = csv.output.df
    pr = Print(scheduler=s)
    pr.input.df = km.output.df
    e = Every(scheduler=s)
    e.input.df = km.output.labels
    s.start()
    self.assertEqual(len(csv.df()), len(km.labels()))
def test_sample(self) -> None:
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=s
    )
    smp = Sample(samples=10, scheduler=s)
    smp.input[0] = csv.output.result
    prt = Print(proc=self.terse, scheduler=s)
    prt.input[0] = smp.output.result
    aio.run(csv.scheduler().start())
    # print(repr(smp.result))
    self.assertEqual(len(smp.table), 10)
def test_csv_distances(self):
    s = self.scheduler()
    vec = CSVLoader(
        get_dataset('smallfile'), index_col=False, header=None, scheduler=s
    )
    # dis = PairwiseDistances(metric='euclidean', scheduler=s)
    # dis.input.df = vec.output.df
    cnt = Every(proc=self.terse, constant_time=True, scheduler=s)
    # cnt.input.df = dis.output.dist
    cnt.input.df = vec.output.table
    global times
    times = 0
    s.start(ten_times)
    s.join()
    table = vec.table()
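# Hypothetical sketch of the ten_times tick procedure passed to s.start() in
# both distance tests; its definition is not shown in this collection. The
# "global times; times = 0" reset above suggests it counts scheduler runs and
# stops after ten of them, which bounds the progressive computation for the
# test. The (scheduler, run_number) signature is an assumption.
times = 0

def ten_times(scheduler, run_number):
    global times
    times += 1
    if times >= 10:
        scheduler.stop()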
def test_04_read_http_multi_csv_bz2_no_crash(self):
    # if TRAVIS: return
    self._http_srv = _HttpSrv()
    s = self.scheduler()
    module = CSVLoader(
        [make_url('smallfile', ext=BZ2)] * 2,
        index_col=False,
        header=None,
        scheduler=s,
    )
    self.assertTrue(module.table() is None)
    # decorate(s, Patch1("csv_loader_1"))
    s.start()
    s.join()
    self.assertEqual(len(module.table()), 60000)
def test_04_read_http_csv_bz2_no_crash(self):
    # if TRAVIS: return
    p = Process(target=run_simple_server, args=())
    p.start()
    self._http_proc = p
    time.sleep(SLEEP)
    s = self.scheduler()
    module = CSVLoader(
        make_url('bigfile', ext=BZ2), index_col=False, header=None, scheduler=s
    )
    self.assertTrue(module.table() is None)
    s.start()
    s.join()
    _close(module)
    self.assertEqual(len(module.table()), 1000000)
def test_07_read_multi_csv_file_no_crash(self):
    s = self.scheduler()
    module = CSVLoader(
        [get_dataset('smallfile'), get_dataset('smallfile')],
        index_col=False,
        header=None,
        scheduler=s,
    )
    self.assertTrue(module.table() is None)
    # decorate(s, Patch1("csv_loader_1"))
    s.start()
    s.join()
    self.assertEqual(len(module.table()), 60000)
def test_read_multiple_fake_csv(self):
    s = self.scheduler()
    filenames = Table(
        name='file_names',
        dshape='{filename: string}',
        data={
            'filename': [
                'buffer://fake1?cols=10&rows=30000',
                'buffer://fake2?cols=10&rows=30000',
            ]
        },
    )
    cst = Constant(table=filenames, scheduler=s)
    csv = CSVLoader(index_col=False, header=None, scheduler=s)
    csv.input.filenames = cst.output.table
    csv.start()
    s.join()
    self.assertEqual(len(csv.table()), 60000)
def test_sample(self):
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset('bigfile'), index_col=False, header=None, scheduler=s
    )
    smp = Sample(samples=10, scheduler=s)
    smp.input.table = csv.output.table
    prt = Print(proc=self.terse, scheduler=s)
    prt.input.df = smp.output.table
    csv.scheduler().start()
    s.join()
    # print(repr(smp.table()))
    self.assertEqual(len(smp.table()), 10)
def test_05_read_http_csv_bz2_crash_recovery(self):
    # if TRAVIS: return
    p = Process(target=run_throttled_server, args=(8000, 10**7))
    p.start()
    self._http_proc = p
    time.sleep(SLEEP)
    s = self.scheduler()
    module = CSVLoader(
        make_url('bigfile', ext=BZ2),
        index_col=False,
        header=None,
        scheduler=s,
        timeout=0.01,
    )
    self.assertTrue(module.table() is None)
    s.start()
    s.join()
    _close(module)
    # self.assertGreater(module.parser._recovery_cnt, 0)
    self.assertEqual(len(module.table()), 1000000)
def test_histogram1d1(self) -> None:
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=s
    )
    min_ = Min(scheduler=s)
    min_.input[0] = csv.output.result
    max_ = Max(scheduler=s)
    max_.input[0] = csv.output.result
    histogram1d = Histogram1D("_2", scheduler=s)  # columns are called 1..30
    histogram1d.input[0] = csv.output.result
    histogram1d.input.min = min_.output.result
    histogram1d.input.max = max_.output.result
    pr = Every(proc=self.terse, scheduler=s)
    pr.input[0] = histogram1d.output.result
    aio.run(s.start())
    _ = histogram1d.trace_stats()
    last = notNone(histogram1d.table.last()).to_dict()
    h1 = last["array"]
    bounds = (last["min"], last["max"])
    df = pd.read_csv(
        get_dataset("bigfile"), header=None, usecols=[2]  # type: ignore
    )
    v = df.to_numpy().reshape(-1)
    h2, _ = np.histogram(  # type: ignore
        v, bins=histogram1d.params.bins, density=False, range=bounds
    )
    self.assertListEqual(h1.tolist(), h2.tolist())
def test_last_row(self):
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset('smallfile'), index_col=False, header=None, scheduler=s
    )
    lr1 = LastRow(scheduler=s)
    lr1.input.table = csv.output.table
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input.df = lr1.output.table
    s.start()
    s.join()
    df = csv.table()
    last = df.last()
    res = lr1.table()
    self.assertEqual(res.at[0, '_1'], last['_1'])
def test_mb_k_means(self) -> None:
    s = self.scheduler()
    n_clusters = 3
    try:
        dataset = (get_dataset("cluster:s3"),)
    except TimeoutError:
        print("Cannot download cluster:s3")
        return
    with s:
        csv = CSVLoader(
            dataset,
            sep=" ",
            skipinitialspace=True,
            header=None,
            index_col=False,
            scheduler=s,
        )
        km = MBKMeans(
            n_clusters=n_clusters,
            random_state=42,
            is_input=False,
            is_greedy=False,
            scheduler=s,
        )
        # km.input.table = csv.output.result
        km.create_dependent_modules(csv)
        pr = Print(proc=self.terse, scheduler=s)
        pr.input[0] = km.output.result
        e = Every(proc=self.terse, scheduler=s)
        e.input[0] = km.output.labels
    aio.run(s.start())
    labels = km.labels()
    assert labels is not None
    self.assertEqual(len(csv.table), len(labels))
def test_percentile(self):
    s = Scheduler()
    csv_module = CSVLoader(
        get_dataset('smallfile'), index_col=False, header=None, scheduler=s
    )
    module = Percentiles(
        1,
        id='test_percentile',
        percentiles=[0.1, 0.25, 0.5, 0.75, 0.9],
        scheduler=s,
    )
    module.describe()
    csv_module.describe()
    connect(csv_module, 'df', module, 'df')
    connect(module, 'percentiles', Print(id='print', scheduler=s), 'df')
    s.start()
    ret = module.trace_stats(max_runs=1)
    # print("Done. Run time: %gs, loaded %d rows"
    #       % (s['duration'].irow(-1), len(module.df())))
    pd.set_option('display.expand_frame_repr', False)
    print(ret)
def t_histogram1d_impl(self, **kw: Any) -> None:
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=s
    )
    stirrer = Stirrer(update_column="_2", fixed_step_size=1000, scheduler=s, **kw)
    stirrer.input[0] = csv.output.result
    min_ = Min(scheduler=s)
    min_.input[0] = stirrer.output.result
    max_ = Max(scheduler=s)
    max_.input[0] = stirrer.output.result
    histogram1d = Histogram1D("_2", scheduler=s)  # columns are called 1..30
    histogram1d.input[0] = stirrer.output.result
    histogram1d.input.min = min_.output.result
    histogram1d.input.max = max_.output.result
    # pr = Print(scheduler=s)
    pr = Every(proc=self.terse, scheduler=s)
    pr.input[0] = histogram1d.output.result
    aio.run(s.start())
    _ = histogram1d.trace_stats()
    last = notNone(histogram1d.table.last()).to_dict()
    h1 = last["array"]
    bounds = (last["min"], last["max"])
    tab = stirrer.table.loc[:, ["_2"]]
    assert tab is not None
    v = tab.to_array().reshape(-1)
    h2, _ = np.histogram(  # type: ignore
        v, bins=histogram1d.params.bins, density=False, range=bounds
    )
    self.assertEqual(np.sum(h1), np.sum(h2))
    self.assertListEqual(h1.tolist(), h2.tolist())
def test_join(self):
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset('bigfile'), index_col=False, header=None, scheduler=s
    )
    stat1 = Stats(1, reset_index=True, scheduler=s)
    stat1.input.table = csv.output.table
    stat2 = Stats(2, reset_index=True, scheduler=s)
    stat2.input.table = csv.output.table
    stat3 = Stats(3, reset_index=True, scheduler=s)
    stat3.input.table = csv.output.table
    # join = Join(scheduler=s)
    # import pdb; pdb.set_trace()
    reduce_ = Reduce(BinJoin, "first", "second", "table", scheduler=s)
    reduce_.input.table = stat1.output.stats
    reduce_.input.table = stat2.output.stats
    join = reduce_.expand()
    pr = Print(proc=self.terse, scheduler=s)
    pr.input.df = join.output.table
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input.df = csv.output.table
    s.start()
    res = join.trace_stats(max_runs=1)
    print(res)
def test_join(self) -> None:
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"), index_col=False, header=None, scheduler=s
    )
    stat1 = Stats(1, reset_index=True, scheduler=s)
    stat1.input[0] = csv.output.result
    stat2 = Stats(2, reset_index=True, scheduler=s)
    stat2.input[0] = csv.output.result
    # join = Join(scheduler=s)
    # reduce_ = Reduce(BinJoin, "first", "second", "table", scheduler=s)
    # reduce_.input[0] = stat1.output.stats
    # reduce_.input[0] = stat2.output.stats
    # join = reduce_.expand()
    join = Reduce.expand(
        BinJoin,
        "first",
        "second",
        "table",
        [stat1.output.stats, stat2.output.stats],
        scheduler=s,
    )
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = join.output.result
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input[0] = csv.output.result
    aio.run(s.start())
    res = join.trace_stats(max_runs=1)
    print(res)
def test_03_read_multiple_csv_crash_recovery(self):
    # if TRAVIS: return
    p = Process(target=run_throttled_server, args=(8000, 10**6))
    p.start()
    self._http_proc = p
    time.sleep(SLEEP)
    s = self.scheduler()
    filenames = Table(
        name='file_names',
        dshape='{filename: string}',
        data={'filename': [make_url('smallfile'), make_url('smallfile')]},
    )
    cst = Constant(table=filenames, scheduler=s)
    csv = CSVLoader(index_col=False, header=None, scheduler=s, timeout=0.01)
    csv.input.filenames = cst.output.table
    csv.start()
    s.join()
    _close(csv)
    self.assertEqual(len(csv.table()), 60000)
def test_01_read_http_csv_with_crash_and_counter(self):
    # if TRAVIS: return
    self._http_srv = _HttpSrv()
    s = self.scheduler()
    url = make_url('bigfile')
    module = CSVLoader(url, index_col=False, header=None, scheduler=s)
    self.assertTrue(module.table() is None)
    Patch1.max_steps = 200000
    decorate(s, Patch1("csv_loader_1"))
    s.start()
    s.join()
    self._http_srv.restart()
    s = self.scheduler()
    csv = CSVLoader(url, recovery=True, index_col=False, header=None, scheduler=s)
    counter = Counter(scheduler=s)
    counter.input.table = csv.output.table
    self.assertTrue(csv.table() is None)
    s.start()
    s.join()
    self.assertEqual(len(csv.table()), 1000000)
    self.assertEqual(counter.table()['counter'].loc[0], 1000000)
def test_read_multiple_csv(self) -> None:
    s = self.scheduler()
    filenames = Table(
        name="file_names",
        dshape="{filename: string}",
        data={"filename": [get_dataset("smallfile"), get_dataset("smallfile")]},
    )
    cst = Constant(table=filenames, scheduler=s)
    csv = CSVLoader(index_col=False, header=None, scheduler=s)
    csv.input.filenames = cst.output.result
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = csv.output.result
    aio.run(csv.start())
    self.assertEqual(len(csv.table), 60000)
def test_scatterplot(self):
    s = self.scheduler()
    csv = CSVLoader(
        get_dataset('smallfile'),
        index_col=False,
        header=None,
        force_valid_ids=True,
        scheduler=s,
    )
    sp = MCScatterPlot(
        scheduler=s, classes=[('Scatterplot', '_1', '_2')], approximate=True
    )
    sp.create_dependent_modules(csv, 'table')
    cnt = Every(proc=self.terse, constant_time=True, scheduler=s)
    cnt.input.df = csv.output.table
    prt = Print(proc=self.terse, scheduler=s)
    prt.input.df = sp.output.table
    csv.scheduler().start(idle_proc=idle_proc)
    s.join()
    self.assertEqual(len(csv.table()), 30000)
def test_read_multiple_fake_csv(self) -> None:
    s = self.scheduler()
    filenames = Table(
        name="file_names2",
        dshape="{filename: string}",
        data={
            "filename": [
                "buffer://fake1?cols=10&rows=30000",
                "buffer://fake2?cols=10&rows=30000",
            ]
        },
    )
    cst = Constant(table=filenames, scheduler=s)
    csv = CSVLoader(index_col=False, header=None, scheduler=s)
    csv.input.filenames = cst.output.result
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = csv.output.result
    aio.run(csv.start())
    self.assertEqual(len(csv.table), 60000)
def test_csv_distances(self):
    s = Scheduler()
    vec = CSVLoader(
        get_dataset('smallfile'), index_col=False, header=None, scheduler=s
    )
    dis = PairwiseDistances(metric='euclidean', scheduler=s)
    dis.input.df = vec.output.df
    cnt = Every(proc=print_len, constant_time=True, scheduler=s)
    cnt.input.df = dis.output.dist
    global times
    times = 0
    s.start(ten_times)
    df = vec.df()
    computed = dis.dist()
    # self.assertEqual(computed.shape[0], len(df))
    del df[CSVLoader.UPDATE_COLUMN]
    offset = 0
    size = offset + 5000
    truth = pairwise_distances(df.iloc[offset:size], metric=dis._metric)
    dist = computed[offset:size, offset:size]
    self.assertTrue(np.allclose(truth, dist, atol=1e-7))  # reduced tolerance
# SUFFIX = ''
PREFIX = '../nyc-taxi/'
SUFFIX = '.bz2'

URLS = [
    PREFIX + 'yellow_tripdata_2015-01.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-02.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-03.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-04.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-05.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-06.csv' + SUFFIX,
]

filenames = pd.DataFrame({'filename': URLS})
cst = Constant(df=filenames, scheduler=s)
csv = CSVLoader(
    index_col=False,
    skipinitialspace=True,
    usecols=['pickup_longitude', 'pickup_latitude'],
    filter=filter,
    scheduler=s,
)
# csv = CSVLoader(index_col=False, skipinitialspace=True,
#                 usecols=['pickup_longitude', 'pickup_latitude'], scheduler=s)
csv.input.filenames = cst.output.df
pr = Every(scheduler=s)
pr.input.df = csv.output.df
scatterplot = ScatterPlot('pickup_longitude', 'pickup_latitude', scheduler=s)
scatterplot.create_dependent_modules(csv, 'df')

if __name__ == '__main__':
    s.start()
    while True:
        time.sleep(2)
        s.to_json()
        scatterplot.to_json()  # simulate a web query
        scatterplot.get_image()
    s.join()
from progressivis import *
from progressivis.vis import ScatterPlot
from progressivis.io import CSVLoader
from progressivis.datasets import get_dataset


def filter(df):
    lon = df['pickup_longitude']
    return df[(lon < -70) & (lon > -80)]


def print_len(x):
    if x is not None:
        print(len(x))


# log_level()

try:
    s = scheduler
except NameError:
    s = Scheduler()

csv = CSVLoader(
    get_dataset('bigfile'),
    header=None,
    index_col=False,
    force_valid_ids=True,
    scheduler=s,
)
pr = Every(scheduler=s)
pr.input.df = csv.output.df
scatterplot = ScatterPlot('_1', '_2', scheduler=s)
scatterplot.create_dependent_modules(csv, 'df')

if __name__ == '__main__':
    csv.start()
    s.join()
    print(len(csv.df()))
def test_read_csv(self):
    s = Scheduler()
    module = CSVLoader(
        get_dataset('bigfile'), index_col=False, header=None, scheduler=s
    )
    self.assertTrue(module.df() is None)
    s.start()
    self.assertEqual(len(module.df()), 1000000)
https://cs.joensuu.fi/sipu/datasets/
"""
from progressivis import Scheduler, Every  # , log_level
from progressivis.cluster import MBKMeans
from progressivis.io import CSVLoader
from progressivis.stats import Min, Max, Histogram2D
from progressivis.vis import Heatmap, ScatterPlot
from progressivis.datasets import get_dataset

try:
    s = scheduler
except NameError:
    s = Scheduler()

# log_level(package="progressivis.cluster")

data = CSVLoader(
    get_dataset('cluster:s3'),
    sep=' ',
    skipinitialspace=True,
    header=None,
    index_col=False,
    scheduler=s,
)
mbkmeans = MBKMeans(columns=[0, 1], n_clusters=15, batch_size=100, scheduler=s)
mbkmeans.input.df = data.output.df
prn = Every(scheduler=s)
prn.input.df = mbkmeans.output.df
sp = ScatterPlot(0, 1, scheduler=s)
sp.move_point = mbkmeans  # for input management
# sp.create_dependent_modules(mbkmeans, 'centroids')

# Create the modules by hand rather than with the utility. We show the
# cluster centroids on the scatterplot and the data as a heatmap.

# histogram2d
histogram2d = Histogram2D(0, 1, scheduler=s)
histogram2d.input.df = data.output.df
min_mod = Min([0, 1], scheduler=s)
from progressivis import Scheduler, Every, Print
from progressivis.io import CSVLoader
from progressivis.stats import Histogram2D, Min, Max
from progressivis.datasets import get_dataset

print("Loading test_histogram2d")
print("Type of default_scheduler is %s" % type(Scheduler.default))

csv = CSVLoader(get_dataset("bigfile"), index_col=False, header=None, engine="c")
pr = Every()
pr.input.df = csv.output.df
min_ = Min()  # renamed from `min` to avoid shadowing the builtin
min_.input.df = csv.output.df
max_ = Max()  # renamed from `max` to avoid shadowing the builtin
max_.input.df = csv.output.df
histogram2d = Histogram2D(1, 2, xbins=128, ybins=128)
histogram2d.input.df = csv.output.df
histogram2d.input.min = min_.output.df
histogram2d.input.max = max_.output.df
pr = Print(id="print")
pr.input.df = histogram2d.output.df

if __name__ == "__main__":
    csv.start()
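# For comparison, the typed tests earlier in this collection drive equivalent
# pipelines through the newer async API: numbered inputs, .output.result, and
# aio.run(s.start()). A minimal sketch of this same histogram2d pipeline in
# that style; the progressivis.core.aio import path is an assumption inferred
# from how aio is used in those tests.
from progressivis import Scheduler, Print
from progressivis.core import aio
from progressivis.io import CSVLoader
from progressivis.stats import Histogram2D, Min, Max
from progressivis.datasets import get_dataset

s = Scheduler()
csv = CSVLoader(get_dataset("bigfile"), index_col=False, header=None, scheduler=s)
min_ = Min(scheduler=s)
min_.input[0] = csv.output.result
max_ = Max(scheduler=s)
max_.input[0] = csv.output.result
histogram2d = Histogram2D(1, 2, xbins=128, ybins=128, scheduler=s)
histogram2d.input[0] = csv.output.result
histogram2d.input.min = min_.output.result
histogram2d.input.max = max_.output.result
pr = Print(scheduler=s)
pr.input[0] = histogram2d.output.result
aio.run(s.start())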