Example #1
 def test_01_read_http_csv_with_crash_and_counter(self):
     #if TRAVIS: return
     self._http_srv = _HttpSrv()
     s = self.scheduler()
     url = make_url('bigfile')
     module = CSVLoader(url, index_col=False, header=None, scheduler=s)
     self.assertTrue(module.table() is None)
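     # Patch1 is a test helper that interrupts the scheduler after
     # max_steps, simulating a crash partway through the download.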
     Patch1.max_steps = 200000
     decorate(s, Patch1("csv_loader_1"))
     s.start()
     s.join()
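     # Restart the HTTP server and reload with recovery=True to resume
     # from where the simulated crash left off.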
     self._http_srv.restart()
     s = self.scheduler()
     csv = CSVLoader(url,
                     recovery=True,
                     index_col=False,
                     header=None,
                     scheduler=s)
     counter = Counter(scheduler=s)
     counter.input.table = csv.output.table
     self.assertTrue(csv.table() is None)
     s.start()
     s.join()
     self.assertEqual(len(csv.table()), 1000000)
     self.assertEqual(counter.table()['counter'].loc[0], 1000000)
 def test_read_fake_csv(self):
     s = self.scheduler()
     module = CSVLoader(RandomBytesIO(cols=30, rows=1000000),
                        index_col=False, header=None, scheduler=s)
     self.assertTrue(module.table() is None)
     s.start()
     s.join()
     self.assertEqual(len(module.table()), 1000000)
 def test_read_csv(self):
     s = self.scheduler()
     module = CSVLoader(get_dataset('bigfile'), index_col=False,
                        header=None, scheduler=s)
     self.assertTrue(module.table() is None)
     s.start()
     s.join()
     self.assertEqual(len(module.table()), 1000000)
Example #4
 def _tst_08_read_multi_csv_file_compress_no_crash(self, files):
     s = self.scheduler()
     module = CSVLoader(files, index_col=False, header=None,
                        scheduler=s)  #, save_context=False)
     self.assertTrue(module.table() is None)
     #decorate(s, Patch1("csv_loader_1"))
     s.start()
     s.join()
     self.assertEqual(len(module.table()), 60000)
Example #5
 def test_07_read_multi_csv_file_no_crash(self):
     s = self.scheduler()
     module = CSVLoader(
         [get_dataset('smallfile'),
          get_dataset('smallfile')],
         index_col=False,
         header=None,
         scheduler=s)
     self.assertTrue(module.table() is None)
     #decorate(s, Patch1("csv_loader_1"))
     s.start()
     s.join()
     self.assertEqual(len(module.table()), 60000)
Example #6
 def test_04_read_http_multi_csv_bz2_no_crash(self):
     #if TRAVIS: return
     self._http_srv = _HttpSrv()
     s = self.scheduler()
     module = CSVLoader([make_url('smallfile', ext=BZ2)] * 2,
                        index_col=False,
                        header=None,
                        scheduler=s)
     self.assertTrue(module.table() is None)
     #decorate(s, Patch1("csv_loader_1"))
     s.start()
     s.join()
     self.assertEqual(len(module.table()), 60000)
 def test_04_read_http_csv_bz2_no_crash(self):
     #if TRAVIS: return
     p = Process(target=run_simple_server, args=())
     p.start()
     self._http_proc = p
     time.sleep(SLEEP)
     s = self.scheduler()
     module = CSVLoader(make_url('bigfile', ext=BZ2), index_col=False,
                        header=None, scheduler=s)
     self.assertTrue(module.table() is None)
     s.start()
     s.join()
     _close(module)
     self.assertEqual(len(module.table()), 1000000)
 def test_05_read_http_csv_bz2_crash_recovery(self):
     #if TRAVIS: return        
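     # The throttled server caps bandwidth, so the very short timeout
     # below forces the loader into its recovery path.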
     p = Process(target=run_throttled_server, args=(8000, 10**7))
     p.start()
     self._http_proc = p
     time.sleep(SLEEP)
     s = self.scheduler()
     module = CSVLoader(make_url('bigfile', ext=BZ2), index_col=False,
                        header=None, scheduler=s, timeout=0.01)
     self.assertTrue(module.table() is None)
     s.start()
     s.join()
     _close(module)
     #self.assertGreater(module.parser._recovery_cnt, 0)
     self.assertEqual(len(module.table()), 1000000)
 def test_read_multiple_csv(self):
     s = self.scheduler()
     filenames = Table(name='file_names',
                       dshape='{filename: string}',
                       data={'filename': [get_dataset('smallfile'),
                                          get_dataset('smallfile')]})
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s)
     csv.input.filenames = cst.output.table
     csv.start()
     s.join()
     self.assertEqual(len(csv.table()), 60000)
Example #10
 def test_06_read_http_multi_csv_bz2_with_crash(self):
     #if TRAVIS: return
     self._http_srv = _HttpSrv()
     s = self.scheduler()
     url_list = [make_url('bigfile', ext=BZ2)] * 2
     module = CSVLoader(url_list, index_col=False, header=None, scheduler=s)
     self.assertTrue(module.table() is None)
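     # Patch1 interrupts the scheduler after max_steps, simulating a
     # crash partway through the two downloads.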
     Patch1.max_steps = 1200000
     decorate(s, Patch1("csv_loader_1"))
     s.start()
     s.join()
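     # Restart the server and resume loading with recovery=True.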
     self._http_srv.restart()
     s = self.scheduler()
     module = CSVLoader(url_list,
                        recovery=True,
                        index_col=False,
                        header=None,
                        scheduler=s)
     self.assertTrue(module.table() is None)
     s.start()
     s.join()
     self.assertEqual(len(module.table()), 2000000)
Example #11
 def test_read_multiple_fake_csv(self):
     s = self.scheduler()
     filenames = Table(name='file_names',
                       dshape='{filename: string}',
                       data={'filename': [
                           'buffer://fake1?cols=10&rows=30000',
                           'buffer://fake2?cols=10&rows=30000']})
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s)
     csv.input.filenames = cst.output.table
     csv.start()
     s.join()        
     self.assertEqual(len(csv.table()), 60000)
Example #12
 def _tst_10_read_multi_csv_file_compress_with_crash(self, file_list):
     s = self.scheduler()
     module = CSVLoader(file_list,
                        index_col=False,
                        header=None,
                        scheduler=s)
     self.assertTrue(module.table() is None)
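     # Simulate a crash after max_steps; the reload below resumes with
     # recovery=True.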
     Patch1.max_steps = 1200000
     decorate(s, Patch1("csv_loader_1"))
     s.start()
     s.join()
     _close(module)
     s = self.scheduler()
     module = CSVLoader(file_list,
                        recovery=True,
                        index_col=False,
                        header=None,
                        scheduler=s)
     self.assertTrue(module.table() is None)
     s.start()
     s.join()
     self.assertEqual(len(module.table()), 2000000)
 def test_csv_distances(self):
     s = self.scheduler()
     vec = CSVLoader(get_dataset('smallfile'), index_col=False,
                     header=None, scheduler=s)
     # dis = PairwiseDistances(metric='euclidean', scheduler=s)
     # dis.input.df = vec.output.df
     cnt = Every(proc=self.terse, constant_time=True, scheduler=s)
     # cnt.input.df = dis.output.dist
     cnt.input.df = vec.output.table
     global times
     times = 0
     s.start(ten_times)
     s.join()
     table = vec.table()
Example #14
 def test_09_read_multi_csv_file_with_crash(self):
     s = self.scheduler()
     file_list = [get_dataset('bigfile'), get_dataset('bigfile')]
     module = CSVLoader(file_list,
                        index_col=False,
                        header=None,
                        scheduler=s)
     self.assertTrue(module.table() is None)
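     # Simulate a crash after max_steps; the reload below resumes with
     # recovery=True.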
     Patch1.max_steps = 1200000
     decorate(s, Patch1("csv_loader_1"))
     s.start()
     s.join()
     _close(module)
     s = self.scheduler()
     module = CSVLoader(file_list,
                        recovery=True,
                        index_col=False,
                        header=None,
                        scheduler=s)
     self.assertTrue(module.table() is None)
     s.start()
     s.join()
     self.assertEqual(len(module.table()), 2000000)
Example #15
 def test_last_row(self):
     s = self.scheduler()
     csv = CSVLoader(get_dataset('smallfile'),
                     index_col=False,
                     header=None,
                     scheduler=s)
     lr1 = LastRow(scheduler=s)
     lr1.input.table = csv.output.table
     prlen = Every(proc=self.terse, constant_time=True, scheduler=s)
     prlen.input.df = lr1.output.table
     s.start()
     s.join()
     df = csv.table()
     last = df.last()
     res = lr1.table()
     self.assertEqual(res.at[0, '_1'], last['_1'])
 def test_03_read_multiple_csv_crash_recovery(self):
     #if TRAVIS: return        
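     # The throttled server limits bandwidth so that the short timeout
     # below exercises crash recovery during the multi-file load.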
     p = Process(target=run_throttled_server, args=(8000, 10**6))
     p.start()
     self._http_proc = p
     time.sleep(SLEEP) 
     s = self.scheduler()
     filenames = Table(name='file_names',
                       dshape='{filename: string}',
                       data={'filename': [make_url('smallfile'),
                                          make_url('smallfile')]})
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s, timeout=0.01)
     csv.input.filenames = cst.output.table
     csv.start()
     s.join()
     _close(csv)        
     self.assertEqual(len(csv.table()), 60000)
 def test_scatterplot(self):
     s = self.scheduler()
     csv = CSVLoader(get_dataset('smallfile'),
                     index_col=False,
                     header=None,
                     force_valid_ids=True,
                     scheduler=s)
     sp = MCScatterPlot(scheduler=s,
                        classes=[('Scatterplot', '_1', '_2')],
                        approximate=True)
     sp.create_dependent_modules(csv, 'table')
     cnt = Every(proc=self.terse, constant_time=True, scheduler=s)
     cnt.input.df = csv.output.table
     prt = Print(proc=self.terse, scheduler=s)
     prt.input.df = sp.output.table
     csv.scheduler().start(idle_proc=idle_proc)
     s.join()
     self.assertEqual(len(csv.table()), 30000)
Example #18
 def test_stats(self):
     s = self.scheduler()
     csv_module = CSVLoader(get_dataset('smallfile'),
                            index_col=False,
                            header=None,
                            scheduler=s)
     stats = Stats('_1', name='test_stats', scheduler=s)
     wait = Wait(name='wait', delay=3, scheduler=s)
     wait.input.inp = csv_module.output.table
     stats.input._params = wait.output.out
     stats.input.table = csv_module.output.table
     pr = Print(proc=self.terse, name='print', scheduler=s)
     pr.input.df = stats.output.stats
     s.start()
     s.join()
     table = csv_module.table()
     stable = stats.table()
     last = stable.last()
     tmin = table['_1'].min()
     self.assertTrue(np.isclose(tmin, last['__1_min']))
     tmax = table['_1'].max()
     self.assertTrue(np.isclose(tmax, last['__1_max']))
 def test_mb_k_means(self):
     #log_level()
     s = self.scheduler()
     n_clusters = 3
     csv = CSVLoader(get_dataset('cluster:s3'),
                     sep=' ',
                     skipinitialspace=True,
                     header=None,
                     index_col=False,
                     scheduler=s)
     km = MBKMeans(n_clusters=n_clusters,
                   random_state=42,
                   is_input=False,
                   scheduler=s)
     km.input.table = csv.output.table
     pr = Print(proc=self.terse, scheduler=s)
     pr.input.df = km.output.table
     e = Every(proc=self.terse, scheduler=s)
     e.input.df = km.output.labels
     s.start()
     s.join()
     self.assertEqual(len(csv.table()), len(km.labels()))
Example #20
def _print_len(x):
    if x is not None:
        print(len(x))


#log_level() #package='progressivis.stats.histogram2d')

try:
    s = scheduler  # reuse a scheduler injected by the embedding environment, if any
except NameError:
    s = Scheduler()

CSV = CSVLoader(get_dataset('bigfile_mvn'),
                index_col=False,
                header=None,
                scheduler=s)

#PR = Every(scheduler=s)
#PR.input.df = CSV.output.table
# 'samples' and 'sampleN' are assumed to be defined earlier in the full script.
KNNKDE = KernelDensity(scheduler=s, samples=samples, bins=sampleN)
KNNKDE.input.table = CSV.output.table
if __name__ == '__main__':
    s.start()
    try:
        while True:
            time.sleep(2)
            s.to_json()
            KNNKDE.to_json()  # simulate a web query
            #SCATTERPLOT.get_image()
    except KeyboardInterrupt:
        pass  # stop polling on Ctrl-C so the scheduler can be joined below
    s.join()
    print(len(CSV.table()))
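All of the examples above share the same wiring pattern: build a scheduler, create a CSVLoader, connect its output.table slot to a consumer module's input slot, then start() and join(). Here is a minimal self-contained sketch of that pattern; the import paths are assumptions (they may differ across progressivis versions) and 'data.csv' is a hypothetical local file.

# Minimal sketch of the dataflow pattern used in the examples above.
# Assumed import paths; adjust for your progressivis version.
from progressivis import Scheduler, Print
from progressivis.io import CSVLoader

s = Scheduler()
# Load the CSV progressively, chunk by chunk, instead of all at once.
csv = CSVLoader('data.csv', index_col=False, header=None, scheduler=s)
prt = Print(scheduler=s)            # consumer: reports each chunk's changes
prt.input.df = csv.output.table     # wire the output slot to the input slot
s.start()
s.join()
print(len(csv.table()))             # total rows once loading completes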