def test_01_read_http_csv_with_crash_and_counter(self):
    #if TRAVIS: return
    self._http_srv = _HttpSrv()
    s = self.scheduler()
    url = make_url('bigfile')
    module = CSVLoader(url, index_col=False, header=None, scheduler=s)
    self.assertTrue(module.table() is None)
    # First run: inject a simulated crash after 200000 steps.
    Patch1.max_steps = 200000
    decorate(s, Patch1("csv_loader_1"))
    s.start()
    s.join()
    # Second run: restart the server and reload with recovery=True;
    # the loader must resume and end up with the complete table.
    self._http_srv.restart()
    s = self.scheduler()
    csv = CSVLoader(url, recovery=True, index_col=False, header=None,
                    scheduler=s)
    counter = Counter(scheduler=s)
    counter.input.table = csv.output.table
    self.assertTrue(csv.table() is None)
    s.start()
    s.join()
    self.assertEqual(len(csv.table()), 1000000)
    self.assertEqual(counter.table()['counter'].loc[0], 1000000)
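# Patch1/decorate come from the shared test helpers and are not shown here.
# Below is a minimal, hypothetical sketch of the crash-injection idea they
# are assumed to implement: wrap a module's step function and raise once a
# step budget is exhausted, so the recovery=True path above gets exercised.
# The names crash_after/run_step are illustrative, not the real API.
def crash_after(run_step, max_steps=200000):
    state = {"steps": 0}
    def wrapper(*args, **kwargs):
        state["steps"] += 1
        if state["steps"] > max_steps:
            raise RuntimeError("simulated crash")  # triggers the recovery run
        return run_step(*args, **kwargs)
    return wrapper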
def test_read_fake_csv(self):
    s = self.scheduler()
    module = CSVLoader(RandomBytesIO(cols=30, rows=1000000),
                       index_col=False, header=None, scheduler=s)
    self.assertTrue(module.table() is None)
    s.start()
    s.join()
    self.assertEqual(len(module.table()), 1000000)
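# RandomBytesIO is provided by the test utilities. A rough, self-contained
# sketch of the idea (a readable file-like object that synthesizes CSV rows
# on the fly instead of reading them from disk); the FakeCSV name and its
# row format are assumptions for illustration only:
import io
import random

class FakeCSV(io.RawIOBase):
    """Hypothetical stand-in yielding `rows` lines of `cols` random floats."""
    def __init__(self, cols, rows):
        self._lines = (
            (",".join(str(random.random()) for _ in range(cols)) + "\n").encode()
            for _ in range(rows)
        )
        self._buf = b""

    def readable(self):
        return True

    def readinto(self, b):
        # Refill the internal buffer until it can satisfy the request.
        while len(self._buf) < len(b):
            try:
                self._buf += next(self._lines)
            except StopIteration:
                break
        n = min(len(b), len(self._buf))
        b[:n], self._buf = self._buf[:n], self._buf[n:]
        return n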
def test_read_csv(self):
    s = self.scheduler()
    module = CSVLoader(get_dataset('bigfile'), index_col=False, header=None,
                       scheduler=s)
    self.assertTrue(module.table() is None)
    s.start()
    s.join()
    self.assertEqual(len(module.table()), 1000000)
def _tst_08_read_multi_csv_file_compress_no_crash(self, files):
    s = self.scheduler()
    module = CSVLoader(files, index_col=False, header=None,
                       scheduler=s)  #, save_context=False
    self.assertTrue(module.table() is None)
    #decorate(s, Patch1("csv_loader_1"))
    s.start()
    s.join()
    self.assertEqual(len(module.table()), 60000)
def test_07_read_multi_csv_file_no_crash(self):
    s = self.scheduler()
    module = CSVLoader([get_dataset('smallfile'), get_dataset('smallfile')],
                       index_col=False, header=None, scheduler=s)
    self.assertTrue(module.table() is None)
    #decorate(s, Patch1("csv_loader_1"))
    s.start()
    s.join()
    self.assertEqual(len(module.table()), 60000)
def test_04_read_http_multi_csv_bz2_no_crash(self):
    #if TRAVIS: return
    self._http_srv = _HttpSrv()
    s = self.scheduler()
    module = CSVLoader([make_url('smallfile', ext=BZ2)] * 2,
                       index_col=False, header=None, scheduler=s)
    self.assertTrue(module.table() is None)
    #decorate(s, Patch1("csv_loader_1"))
    s.start()
    s.join()
    self.assertEqual(len(module.table()), 60000)
def test_04_read_http_csv_bz2_no_crash(self):
    #if TRAVIS: return
    p = Process(target=run_simple_server, args=())
    p.start()
    self._http_proc = p
    time.sleep(SLEEP)
    s = self.scheduler()
    module = CSVLoader(make_url('bigfile', ext=BZ2),
                       index_col=False, header=None, scheduler=s)
    self.assertTrue(module.table() is None)
    s.start()
    s.join()
    _close(module)
    self.assertEqual(len(module.table()), 1000000)
def test_05_read_http_csv_bz2_crash_recovery(self):
    #if TRAVIS: return
    p = Process(target=run_throttled_server, args=(8000, 10**7))
    p.start()
    self._http_proc = p
    time.sleep(SLEEP)
    s = self.scheduler()
    # A very short timeout against a throttled server forces repeated read
    # failures, exercising the loader's internal recovery.
    module = CSVLoader(make_url('bigfile', ext=BZ2), index_col=False,
                       header=None, scheduler=s, timeout=0.01)
    self.assertTrue(module.table() is None)
    s.start()
    s.join()
    _close(module)
    #self.assertGreater(module.parser._recovery_cnt, 0)
    self.assertEqual(len(module.table()), 1000000)
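# run_throttled_server is a test helper whose implementation is not shown.
# A minimal stdlib sketch of the assumed behavior (serve files, but cap the
# throughput so a client with a very short timeout keeps timing out and
# retrying); the handler name and chunking policy are illustrative only:
import http.server
import time

def make_throttled_handler(max_bps):
    class ThrottledHandler(http.server.SimpleHTTPRequestHandler):
        def copyfile(self, source, outputfile):
            chunk = 8 * 1024
            while True:
                data = source.read(chunk)
                if not data:
                    break
                outputfile.write(data)
                time.sleep(len(data) / max_bps)  # cap bytes per second
    return ThrottledHandler

# e.g.: http.server.HTTPServer(('', 8000),
#                              make_throttled_handler(10**7)).serve_forever()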
def test_read_multiple_csv(self):
    s = self.scheduler()
    filenames = Table(name='file_names',
                      dshape='{filename: string}',
                      data={'filename': [get_dataset('smallfile'),
                                         get_dataset('smallfile')]})
    cst = Constant(table=filenames, scheduler=s)
    csv = CSVLoader(index_col=False, header=None, scheduler=s)
    csv.input.filenames = cst.output.table
    csv.start()
    s.join()
    self.assertEqual(len(csv.table()), 60000)
def test_06_read_http_multi_csv_bz2_with_crash(self):
    #if TRAVIS: return
    self._http_srv = _HttpSrv()
    s = self.scheduler()
    url_list = [make_url('bigfile', ext=BZ2)] * 2
    module = CSVLoader(url_list, index_col=False, header=None, scheduler=s)
    self.assertTrue(module.table() is None)
    # First run: crash after 1200000 steps.
    Patch1.max_steps = 1200000
    decorate(s, Patch1("csv_loader_1"))
    s.start()
    s.join()
    # Second run: restart the server and resume with recovery=True.
    self._http_srv.restart()
    s = self.scheduler()
    module = CSVLoader(url_list, recovery=True, index_col=False, header=None,
                       scheduler=s)
    self.assertTrue(module.table() is None)
    s.start()
    s.join()
    self.assertEqual(len(module.table()), 2000000)
def test_read_multiple_fake_csv(self):
    s = self.scheduler()
    filenames = Table(name='file_names',
                      dshape='{filename: string}',
                      data={'filename': ['buffer://fake1?cols=10&rows=30000',
                                         'buffer://fake2?cols=10&rows=30000']})
    cst = Constant(table=filenames, scheduler=s)
    csv = CSVLoader(index_col=False, header=None, scheduler=s)
    csv.input.filenames = cst.output.table
    csv.start()
    s.join()
    self.assertEqual(len(csv.table()), 60000)
def _tst_10_read_multi_csv_file_compress_with_crash(self, file_list):
    s = self.scheduler()
    module = CSVLoader(file_list, index_col=False, header=None, scheduler=s)
    self.assertTrue(module.table() is None)
    Patch1.max_steps = 1200000
    decorate(s, Patch1("csv_loader_1"))
    s.start()
    s.join()
    _close(module)
    s = self.scheduler()
    module = CSVLoader(file_list, recovery=True, index_col=False, header=None,
                       scheduler=s)
    self.assertTrue(module.table() is None)
    s.start()
    s.join()
    self.assertEqual(len(module.table()), 2000000)
def test_csv_distances(self):
    s = self.scheduler()
    vec = CSVLoader(get_dataset('smallfile'), index_col=False, header=None,
                    scheduler=s)
    #dis = PairwiseDistances(metric='euclidean', scheduler=s)
    #dis.input.df = vec.output.df
    cnt = Every(proc=self.terse, constant_time=True, scheduler=s)
    #cnt.input.df = dis.output.dist
    cnt.input.df = vec.output.table
    global times
    times = 0
    s.start(ten_times)
    s.join()
    table = vec.table()  # only checks that the output table is reachable
def test_09_read_multi_csv_file_with_crash(self):
    s = self.scheduler()
    file_list = [get_dataset('bigfile'), get_dataset('bigfile')]
    module = CSVLoader(file_list, index_col=False, header=None, scheduler=s)
    self.assertTrue(module.table() is None)
    Patch1.max_steps = 1200000
    decorate(s, Patch1("csv_loader_1"))
    s.start()
    s.join()
    _close(module)
    s = self.scheduler()
    module = CSVLoader(file_list, recovery=True, index_col=False, header=None,
                       scheduler=s)
    self.assertTrue(module.table() is None)
    s.start()
    s.join()
    self.assertEqual(len(module.table()), 2000000)
def test_last_row(self):
    s = self.scheduler()
    csv = CSVLoader(get_dataset('smallfile'), index_col=False, header=None,
                    scheduler=s)
    lr1 = LastRow(scheduler=s)
    lr1.input.table = csv.output.table
    prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
    prlen.input.df = lr1.output.table
    s.start()
    s.join()
    df = csv.table()
    last = df.last()
    res = lr1.table()
    self.assertEqual(res.at[0, '_1'], last['_1'])
def test_03_read_multiple_csv_crash_recovery(self):
    #if TRAVIS: return
    p = Process(target=run_throttled_server, args=(8000, 10**6))
    p.start()
    self._http_proc = p
    time.sleep(SLEEP)
    s = self.scheduler()
    filenames = Table(name='file_names',
                      dshape='{filename: string}',
                      data={'filename': [make_url('smallfile'),
                                         make_url('smallfile')]})
    cst = Constant(table=filenames, scheduler=s)
    csv = CSVLoader(index_col=False, header=None, scheduler=s, timeout=0.01)
    csv.input.filenames = cst.output.table
    csv.start()
    s.join()
    _close(csv)
    self.assertEqual(len(csv.table()), 60000)
def test_scatterplot(self):
    s = self.scheduler()
    csv = CSVLoader(get_dataset('smallfile'), index_col=False, header=None,
                    force_valid_ids=True, scheduler=s)
    sp = MCScatterPlot(scheduler=s, classes=[('Scatterplot', '_1', '_2')],
                       approximate=True)
    sp.create_dependent_modules(csv, 'table')
    cnt = Every(proc=self.terse, constant_time=True, scheduler=s)
    cnt.input.df = csv.output.table
    prt = Print(proc=self.terse, scheduler=s)
    prt.input.df = sp.output.table
    csv.scheduler().start(idle_proc=idle_proc)
    s.join()
    self.assertEqual(len(csv.table()), 30000)
def test_stats(self):
    s = self.scheduler()
    csv_module = CSVLoader(get_dataset('smallfile'), index_col=False,
                           header=None, scheduler=s)
    stats = Stats('_1', name='test_stats', scheduler=s)
    wait = Wait(name='wait', delay=3, scheduler=s)
    wait.input.inp = csv_module.output.table
    stats.input._params = wait.output.out
    stats.input.table = csv_module.output.table
    pr = Print(proc=self.terse, name='print', scheduler=s)
    pr.input.df = stats.output.stats
    s.start()
    s.join()
    table = csv_module.table()
    stable = stats.table()
    last = stable.last()
    tmin = table['_1'].min()
    self.assertTrue(np.isclose(tmin, last['__1_min']))
    tmax = table['_1'].max()
    self.assertTrue(np.isclose(tmax, last['__1_max']))
def test_mb_k_means(self):
    #log_level()
    s = self.scheduler()
    n_clusters = 3
    csv = CSVLoader(get_dataset('cluster:s3'), sep=' ', skipinitialspace=True,
                    header=None, index_col=False, scheduler=s)
    km = MBKMeans(n_clusters=n_clusters, random_state=42, is_input=False,
                  scheduler=s)
    km.input.table = csv.output.table
    pr = Print(proc=self.terse, scheduler=s)
    pr.input.df = km.output.table
    e = Every(proc=self.terse, scheduler=s)
    e.input.df = km.output.labels
    s.start()
    s.join()
    self.assertEqual(len(csv.table()), len(km.labels()))
def _print_len(x):
    if x is not None:
        print(len(x))

#log_level(package='progressivis.stats.histogram2d')

try:
    s = scheduler
except NameError:
    s = Scheduler()

CSV = CSVLoader(get_dataset('bigfile_mvn'), index_col=False, header=None,
                scheduler=s)
#PR = Every(scheduler=s)
#PR.input.df = CSV.output.table
KNNKDE = KernelDensity(scheduler=s, samples=samples, bins=sampleN)
KNNKDE.input.table = CSV.output.table

if __name__ == '__main__':
    s.start()
    while True:
        time.sleep(2)
        s.to_json()
        KNNKDE.to_json()  # simulate a web query
        #SCATTERPLOT.get_image()
    # NB: the polling loop above never exits; the lines below only run if it
    # is replaced by a bounded loop or interrupted.
    s.join()
    print(len(CSV.table()))