def setUpStep(self, step):
    """Prepare one benchmark step: generate step * L random rows and keep a snapshot."""
    self.set_step_info("{} rows".format(step * L))
    s = Scheduler()
    random = RandomTable(10, rows=step * L, scheduler=s)
    s.start()
    # Snapshot the produced table as a pandas DataFrame for later checks.
    self.random_table = pd.DataFrame(
        random.output.table.output_module.table().to_dict())
def p10s_random_min_max(n):
    """Benchmark Min/Max over a RandomTable of n * L rows using hdf5 storage."""
    StorageEngine.default = "hdf5"
    s = Scheduler()
    random = RandomTable(10, rows=n * L, scheduler=s)
    min_ = Min(name='min_' + str(hash(random)), scheduler=s)
    min_.input.table = random.output.table
    max_ = Max(name='max_' + str(hash(random)), scheduler=s)
    max_.input.table = random.output.table
    s.start()
def test_input(self):
    """Feed an Input module from a background thread; expect 10 rows at the end."""
    s = Scheduler()
    inp = Input(scheduler=s)
    pr = Print(scheduler=s)
    pr.input.df = inp.output.df
    # do_line pushes lines into `inp` while the scheduler runs.
    t = threading.Thread(target=do_line, args=(inp, s))
    t.start()
    s.start()
    self.assertEqual(len(inp.df()), 10)
def p10s_random_min_max(self):
    """Benchmark Min/Max over a RandomTable of current_step * L rows (hdf5 storage).

    Fixed: the two module constructors used inconsistent keywords
    (``mid=`` and ``id=``); the sibling free function uses ``name=``,
    so both now use ``name=`` for consistency.
    """
    n = self.current_step
    StorageEngine.default = "hdf5"
    s = Scheduler()
    random = RandomTable(10, rows=n * L, scheduler=s)
    min_ = Min(name='min_' + str(hash(random)), scheduler=s)
    min_.input.table = random.output.table
    max_ = Max(name='max_' + str(hash(random)), scheduler=s)
    max_.input.table = random.output.table
    s.start()
def test_max(self):
    """Max module must agree with pandas' column-wise max.

    Fixed: the local variable was named ``max``, shadowing the builtin;
    renamed to ``max_``.
    """
    s = Scheduler()
    random = RandomTable(10, rows=10000, scheduler=s)
    max_ = Max(scheduler=s)
    max_.input.df = random.output.df
    pr = Print(scheduler=s)
    pr.input.df = max_.output.df
    s.start()
    res1 = random.df()[random.columns.difference([random.UPDATE_COLUMN])].max()
    res2 = last_row(max_.df(), remove_update=True)
    self.assertTrue(np.allclose(res1, res2))
def test_random_table(self):
    """RandomTable with explicit column names yields the expected columns and rows."""
    s = Scheduler()
    module = RandomTable(['a', 'b'], rows=10000, scheduler=s)
    self.assertEqual(module.df().columns[0], 'a')
    self.assertEqual(module.df().columns[1], 'b')
    # Two data columns plus the UPDATE_COLUMN.
    self.assertEqual(len(module.df().columns), 3)
    prlen = Every(proc=print_len, constant_time=True, scheduler=s)
    prlen.input.df = module.output.df
    s.start()
    self.assertEqual(len(module.df()), 10000)
    self.assertFalse(module.df()['a'].isnull().any())
    self.assertFalse(module.df()['b'].isnull().any())
def test_filter(self):
    """FilterMod must select exactly the rows where _1 > 0.5."""
    s = Scheduler()
    random = RandomTable(2, rows=100000, scheduler=s)
    filter_ = FilterMod(expr='_1 > 0.5', scheduler=s)
    filter_.input.table = random.output.table
    pr = Print(proc=self.terse, scheduler=s)
    pr.input.df = filter_.output.table
    s.start()
    s.join()
    # Recompute the expected selection directly on the input data.
    idx = filter_.get_input_slot('table').data().eval(
        '_1>0.5', result_object='index')
    self.assertEqual(filter_._table.selection, bitmap(idx))
def test_mb_k_means(self):
    """MBKMeans must produce one label per row loaded by the CSV loader.

    Fixed: replaced the deprecated ``assertEquals`` alias with ``assertEqual``.
    """
    # log_level()
    s = Scheduler()
    n_clusters = 3
    csv = CSVLoader(get_dataset('cluster:s3'), sep=' ', skipinitialspace=True,
                    header=None, index_col=False, scheduler=s)
    km = MBKMeans(n_clusters=n_clusters, random_state=42, is_input=False,
                  scheduler=s)
    km.input.df = csv.output.df
    pr = Print(scheduler=s)
    pr.input.df = km.output.df
    e = Every(scheduler=s)
    e.input.df = km.output.labels
    s.start()
    self.assertEqual(len(csv.df()), len(km.labels()))
def test_random_table2(self):
    """Large RandomTable with force_valid_ids: columns are renamed _1, _2, ..."""
    s = Scheduler()
    # Produces more than 4M rows per second on my laptop.
    module = RandomTable(10, rows=10000000, force_valid_ids=True, scheduler=s)
    # Ten data columns plus the UPDATE_COLUMN.
    self.assertEqual(len(module.df().columns), 11)
    self.assertEqual(module.df().columns[0], '_1')
    self.assertEqual(module.df().columns[1], '_2')
    prlen = Every(proc=print_len, constant_time=True, scheduler=s)
    prlen.input.df = module.output.df
    s.start()
    self.assertEqual(len(module.df()), 10000000)
    self.assertFalse(module.df()['_1'].isnull().any())
    self.assertFalse(module.df()['_2'].isnull().any())
def test_var(self):
    """Var module must agree with pandas' variance on the single column."""
    s = Scheduler()
    random = RandomTable(1, rows=1000, scheduler=s)
    var = Var(scheduler=s)
    var.input.df = random.output.df
    pr = Print(scheduler=s)
    pr.input.df = var.output.df
    s.start()
    res1 = random.df()[1].var()
    res2 = last_row(var.df(), remove_update=True)
    self.assertTrue(np.allclose(res1, res2))
def test_idxmin(self):
    """IdxMin's min output must agree with the Min module.

    Fixed: the local variable was named ``min``, shadowing the builtin;
    renamed to ``min_``.
    """
    s = Scheduler()
    random = RandomTable(10, rows=10000, throttle=1000, scheduler=s)
    idxmin = IdxMin(scheduler=s)
    idxmin.input.df = random.output.df
    min_ = Min(scheduler=s)
    min_.input.df = random.output.df
    pr = Print(scheduler=s)
    pr.input.df = idxmin.output.min
    s.start()
    min1 = last_row(min_.df(), remove_update=True)
    min2 = last_row(idxmin.min(), remove_update=True)
    self.assertTrue((min1 == min2).all())
def test_idxmax(self):
    """IdxMax's max output must agree with the Max module.

    Fixed: the local variable was named ``max``, shadowing the builtin;
    renamed to ``max_``.
    """
    s = Scheduler()
    random = RandomTable(10, rows=10000, throttle=1000, scheduler=s)
    idxmax = IdxMax(scheduler=s)
    idxmax.input.df = random.output.df
    max_ = Max(scheduler=s)
    max_.input.df = random.output.df
    pr = Print(scheduler=s)
    pr.input.df = idxmax.output.max
    s.start()
    max1 = last_row(max_.df(), remove_update=True)
    max2 = last_row(idxmax.max(), remove_update=True)
    self.assertTrue((max1 == max2).all())
def test_dummy(self):
    """Smoke test: DummyMod feeding Max runs to completion."""
    s = Scheduler()
    random = RandomTable(2, rows=100000, scheduler=s)
    dummy_ = DummyMod(update_column='_1',
                      delete_rows=5,
                      update_rows=5,
                      fixed_step_size=100,
                      scheduler=s)
    dummy_.input.table = random.output.table
    max_ = Max(name='max_' + str(hash(random)), scheduler=s)
    max_.input.table = dummy_.output.table
    pr = Print(proc=self.terse, scheduler=s)
    pr.input.df = max_.output.table
    s.start()
    s.join()
def NOtest_vec_distances(self):
    """(Disabled) PairwiseDistances on VEC data must match sklearn.

    Fixed: replaced the deprecated ``assertEquals`` alias with ``assertEqual``.
    """
    s = Scheduler()
    vec = VECLoader(get_dataset('warlogs'), scheduler=s)
    dis = PairwiseDistances(metric='cosine', scheduler=s)
    dis.input.df = vec.output.df
    dis.input.array = vec.output.array
    cnt = Every(proc=print_len, constant_time=True, scheduler=s)
    cnt.input.df = dis.output.dist
    global times
    times = 0
    s.start()
    df = vec.df()
    computed = dis.dist()
    self.assertEqual(computed.shape[0], len(df))
    truth = pairwise_distances(vec.toarray(), metric=dis._metric)
    self.assertTrue(np.allclose(truth, computed))
def test_hub_if_else(self):
    """Hub merging the two Switch branches must expose the Min result."""
    s = Scheduler()
    random = RandomTable(2, rows=100000, scheduler=s)
    stirrer = Stirrer(
        update_column="_1",
        delete_rows=5,
        update_rows=5,
        fixed_step_size=100,
        scheduler=s,
    )
    stirrer.input[0] = random.output.result
    # Condition is always False, so data flows through result_else.
    switch = Switch(condition=lambda x: False, scheduler=s)
    switch.input[0] = stirrer.output.result
    max_ = Max(name="max_" + str(hash(random)), scheduler=s)
    max_.input[0] = switch.output.result
    min_ = Min(name="min_" + str(hash(random)), scheduler=s)
    min_.input[0] = switch.output.result_else
    hub = Hub(scheduler=s)
    hub.input.table = min_.output.result
    hub.input.table = max_.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = hub.output.result
    aio.run(s.start())
    res1 = stirrer.result.min()
    res2 = hub.result
    self.compare(res1, res2)
def test_filter(self) -> None:
    """FilterMod must keep exactly the index of rows where _1 > 0.5."""
    s = Scheduler()
    random = RandomTable(2, rows=100000, scheduler=s)
    filter_ = FilterMod(expr="_1 > 0.5", scheduler=s)
    filter_.input[0] = random.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = filter_.output.result
    aio.run(s.start())
    # Recompute the expected index directly on the input slot's data.
    idx = (filter_.get_input_slot("table").data().eval(
        "_1>0.5", result_object="index"))
    self.assertEqual(filter_.table.index, bitmap(idx))
def test_csv_distances(self):
    """Progressive pairwise distances must match sklearn on a 5000-row prefix."""
    s = Scheduler()
    vec = CSVLoader(get_dataset('smallfile'), index_col=False, header=None,
                    scheduler=s)
    dis = PairwiseDistances(metric='euclidean', scheduler=s)
    dis.input.df = vec.output.df
    cnt = Every(proc=print_len, constant_time=True, scheduler=s)
    cnt.input.df = dis.output.dist
    global times
    times = 0
    s.start(ten_times)
    df = vec.df()
    computed = dis.dist()
    # self.assertEqual(computed.shape[0], len(df))
    del df[CSVLoader.UPDATE_COLUMN]
    offset = 0
    size = offset + 5000
    truth = pairwise_distances(df.iloc[offset:size], metric=dis._metric)
    dist = computed[offset:size, offset:size]
    # Reduced tolerance for the progressive computation.
    self.assertTrue(np.allclose(truth, dist, atol=1e-7))
def test_select_delta(self):
    """SelectDelta keeps all 3 points while AddToRow nudges row 1.

    Fixed: the error report used a Python 2 print statement
    (``print 'Error: %s'%e``), a syntax error under Python 3; now a
    print() call.
    """
    # log_level()
    delta = np.array([0, 0.05])
    points = [[0.0, 0.0], [1.0, 1.0], [1.0, 0.0]]
    s = Scheduler()
    df = pd.DataFrame(points)
    add_to_row = AddToRow(df, scheduler=s)

    def tick_proc(s, run_number):
        # Stop after 100 runs; until then keep nudging row 1 by delta.
        if run_number > 100:
            s.stop()
        try:
            add_to_row.from_input({1: delta})
        except Exception as e:
            print('Error: %s' % e)

    q = SelectDelta(delta=0.5, scheduler=s)
    q.input.df = add_to_row.output.df
    prlen = Every(scheduler=s)
    prlen.input.df = q.output.df
    s.start(tick_proc=tick_proc)
    self.assertEqual(len(q.df()), 3)
def test_repair_min(self) -> None:
    """
    test_repair_min()
    min without deletes/updates
    """
    s = Scheduler()
    random = RandomTable(2, rows=100000, scheduler=s)
    min_ = ScalarMin(name="min_" + str(hash(random)), scheduler=s)
    min_.input[0] = random.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = min_.output.result
    aio.run(s.start())
    res1 = random.table.min()
    res2 = min_.psdict
    self.compare(res1, res2)
def test_repair_min2(self) -> None:
    """
    test_repair_min2()
    runs with sensitive ids deletion
    """
    s = Scheduler()
    ScalarMin._reset_calls_counter = 0  # type: ignore
    random = RandomTable(2, rows=100000, scheduler=s)
    min_ = ScalarMin(name="min_repair_test2", scheduler=s)
    stirrer = MyStirrer(watched="min_repair_test2", scheduler=s)
    stirrer.input[0] = random.output.result
    min_.input[0] = stirrer.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = min_.output.result
    aio.run(s.start())
    # Deleting a sensitive id must trigger exactly one reset.
    self.assertEqual(ScalarMin._reset_calls_counter, 1)  # type: ignore
    res1 = stirrer.table.min()
    res2 = min_.psdict
    self.compare(res1, res2)
def test_stirrer(self) -> None:
    """Max over a stirred table must equal the table's max."""
    s = Scheduler()
    random = RandomTable(2, rows=100000, scheduler=s)
    stirrer = Stirrer(
        update_column="_1",
        delete_rows=5,
        update_rows=5,
        fixed_step_size=100,
        scheduler=s,
    )
    stirrer.input[0] = random.output.result
    max_ = Max(name="max_" + str(hash(random)), scheduler=s)
    max_.input[0] = stirrer.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = max_.output.result
    aio.run(s.start())
    res1 = stirrer.table.max()
    res2 = max_.result
    self.compare(res1, res2)
def test_filter3(self) -> None:
    """FilterMod over a stirred (updated) table: index checked two ways."""
    s = Scheduler()
    random = RandomTable(2, rows=100000, scheduler=s)
    stirrer = Stirrer(update_column="_1",
                      update_rows=5,
                      fixed_step_size=100,
                      scheduler=s)
    stirrer.input[0] = random.output.result
    filter_ = FilterMod(expr="_1 > 0.5", scheduler=s)
    filter_.input[0] = stirrer.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = filter_.output.result
    aio.run(s.start())
    tbl = filter_.get_input_slot("table").data()
    # Check against the table's own eval ...
    idx = tbl.eval("_1>0.5", result_object="index")
    self.assertEqual(filter_.table.index, bitmap(idx))
    # ... and against a pandas round-trip of the same data.
    df = pd.DataFrame(tbl.to_dict(), index=tbl.index.to_array())
    dfe = df.eval("_1>0.5")
    self.assertEqual(filter_.table.index, bitmap(df.index[dfe]))
def test_repair_max3(self) -> None:
    """
    test_repair_max3()
    runs with NON-sensitive ids deletion
    """
    s = Scheduler()
    ScalarMax._reset_calls_counter = 0  # type: ignore
    random = RandomTable(2, rows=100000, scheduler=s)
    max_ = ScalarMax(name="max_repair_test3", scheduler=s)
    stirrer = MyStirrer(watched="max_repair_test3",
                        proc_sensitive=False,
                        scheduler=s)
    stirrer.input[0] = random.output.result
    max_.input[0] = stirrer.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = max_.output.result
    aio.run(s.start())
    # Non-sensitive deletions must not trigger any reset.
    self.assertEqual(ScalarMax._reset_calls_counter, 0)  # type: ignore
    res1 = stirrer.table.max()
    res2 = max_.psdict
    self.compare(res1, res2)
def test_scheduler(self) -> None:
    """End-to-end scheduler test: named lookup, dynamic module addition,
    running-state check, and JSON/HTML export.

    NOTE(review): block structure reconstructed from a whitespace-mangled
    source; statement order preserved as written.
    """
    with self.assertRaises(ProgressiveError):
        s = Scheduler(0)
    s = Scheduler()
    csv = CSVLoader(
        get_dataset("bigfile"),
        name="csv",
        index_col=False,
        header=None,
        scheduler=s,
    )
    self.assertIs(s["csv"], csv)
    sink = Sink(name="sink", scheduler=s)
    sink.input.inp = csv.output.result  # allow csv to start
    check_running = False

    async def _is_running() -> None:
        nonlocal check_running
        check_running = csv.scheduler().is_running()

    aio.run_gather(s.start(), _is_running())
    self.assertTrue(check_running)

    def add_min(s: Scheduler, r: int) -> None:
        # Dynamically extend the dataflow while the scheduler is live.
        with s:
            m = Min(scheduler=s)
            m.input.table = csv.output.result
            prt = Print(proc=self.terse, scheduler=s)
            prt.input.df = m.output.result

    s.on_loop(add_min, 10)
    s.on_loop(self._stop, 20)
    self.assertIs(s["csv"], csv)
    json = s.to_json(short=False)
    self.assertFalse(json["is_running"])
    self.assertTrue(json["is_terminated"])
    html = s._repr_html_()
    self.assertTrue(len(html) != 0)
def test_repair_max5(self) -> None:
    """
    test_repair_max5()
    runs with sensitive ids update (critical)
    """
    s = Scheduler()
    ScalarMax._reset_calls_counter = 0  # type: ignore
    random = RandomTable(2, rows=100000, scheduler=s)
    max_ = ScalarMax(name="max_repair_test4", scheduler=s)
    stirrer = MyStirrer(watched="max_repair_test4",
                        mode="update",
                        value=-9999.0,
                        scheduler=s)
    stirrer.input[0] = random.output.result
    max_.input[0] = stirrer.output.result
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = max_.output.result
    aio.run(s.start())
    # Updating a sensitive id downward must trigger exactly one reset.
    self.assertEqual(ScalarMax._reset_calls_counter, 1)  # type: ignore
    res1 = stirrer.table.max()
    res2 = max_.psdict
    self.compare(res1, res2)
def test_switch_if_then(self):
    """Switch with an always-True condition routes data to the then-branch Max."""
    s = Scheduler()
    random = RandomTable(2, rows=100000, scheduler=s)
    stirrer = Stirrer(
        update_column="_1",
        delete_rows=5,
        update_rows=5,
        fixed_step_size=100,
        scheduler=s,
    )
    stirrer.input[0] = random.output.result
    switch = Switch(condition=lambda x: True, scheduler=s)
    switch.input[0] = stirrer.output.result
    max_ = Max(name="max_" + str(hash(random)), scheduler=s)
    max_.input[0] = switch.output.result
    # The else branch still needs a consumer even though it stays empty.
    pr_else = Print(proc=self.terse, scheduler=s)
    pr_else.input[0] = switch.output.result_else
    pr = Print(proc=self.terse, scheduler=s)
    pr.input[0] = max_.output.result
    aio.run(s.start())
    res1 = stirrer.result.max()
    res2 = max_.result
    self.compare(res1, res2)
'sample': mbkmeans if i == 0 else None, 'input_module': filt, 'input_slot': 'table' }) sp = MCScatterPlot(scheduler=s, classes=classes) sp.create_dependent_modules() for i in range(n_clusters): cname = f"k{i}" sp[cname].min_value._table = PsDict({'_0': -np.inf, '_1': -np.inf}) sp[cname].max_value._table = PsDict({'_0': np.inf, '_1': np.inf}) mbkmeans.input.table = data.output.table mbkmeans.create_dependent_modules() sp.move_point = mbkmeans.moved_center # for input management def myprint(d): if d['convergence'] != 'unknown': print(d) else: print('.', end='') prn = Every(scheduler=s, proc=print) prn.input.df = mbkmeans.output.conv if __name__ == '__main__': #data.start() #s.join() aio.run(s.start())
def _common(
    self,
    rtol: float,
    threshold: Optional[int] = None,
    resetter: Optional[MyResetter] = None,
    resetter_func: Optional[Callable[[Slot], Any]] = None,
    scheduler: Optional[Scheduler] = None,
) -> float:
    """Run PPCA on mnist_784 and score a KNN classifier on the reconstruction.

    Returns 0 when the dataset cannot be downloaded.

    NOTE(review): reconstructed from a whitespace-mangled source; the extent
    of the ``if KNN is None:`` initialization block (through ``KNN.fit``) is
    an assumption — confirm against the original file.
    """
    global KNN, LABELS, INDICES
    s = Scheduler() if scheduler is None else scheduler
    try:
        dataset = get_dataset("mnist_784")
    except TimeoutError:
        print("Cannot download mnist")
        return 0
    data = CSVLoader(
        dataset,
        index_col=False,
        as_array="array",
        usecols=lambda x: x != "class",
        scheduler=s,
    )
    ppca = PPCA(scheduler=s)
    ppca.input[0] = data.output.result
    ppca.params.n_components = N_COMPONENTS
    if resetter:
        assert callable(resetter_func)
        resetter.input[0] = ppca.output.result
    ppca.create_dependent_modules(
        rtol=rtol,
        trace=TRACE,
        threshold=threshold,
        resetter=resetter,
        resetter_func=resetter_func,
    )
    prn = Every(scheduler=s, proc=_print)
    prn.input[0] = ppca.reduced.output.result
    aio.run(s.start())
    pca_ = ppca._transformer["inc_pca"]
    recovered = pca_.inverse_transform(_array(ppca.reduced.table))
    if KNN is None:
        print("Init KNN")
        KNN = KNeighborsClassifier(NNEIGHBOURS)
        arr = _array(data.table)
        df: pd.DataFrame = pd.read_csv(
            dataset, usecols=["class"]  # type: ignore
        )
        LABELS = df.values.reshape((-1,))
        indices_t = sample_without_replacement(
            n_population=len(data.table),
            n_samples=TRAIN_SAMPLE_SIZE,
            random_state=RANDOM_STATE,
        )
        KNN.fit(arr[indices_t], LABELS[indices_t])
    indices_p = sample_without_replacement(
        n_population=len(data.table),
        n_samples=PREDICT_SAMPLE_SIZE,
        random_state=RANDOM_STATE * 2 + 1,
    )
    return KNN.score(recovered[indices_p], LABELS[indices_p])  # type: ignore
def p10s_read_csv(f):
    """Benchmark: load CSV file *f* to completion under a fresh scheduler."""
    s = Scheduler()
    module = CSVLoader(f, index_col=False, header=None, scheduler=s)
    s.start()
# NYC yellow-taxi dropoff heatmap pipeline.
# Fixed: the Min/Max module variables shadowed the builtins `min` and `max`;
# renamed to `min_` and `max_`.
# SUFFIX = ''
PREFIX = '../nyc-taxi/'
SUFFIX = '.bz2'

URLS = [
    PREFIX + 'yellow_tripdata_2015-01.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-02.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-03.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-04.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-05.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-06.csv' + SUFFIX,
]

filenames = pd.DataFrame({'filename': URLS})
cst = Constant(Table('filenames', data=filenames), scheduler=s)
csv = CSVLoader(index_col=False, skipinitialspace=True,
                usecols=['dropoff_longitude', 'dropoff_latitude'],
                filter_=filter_, scheduler=s)
csv.input.filenames = cst.output.table
min_ = Min(scheduler=s)
min_.input.table = csv.output.table
max_ = Max(scheduler=s)
max_.input.table = csv.output.table
histogram2d = Histogram2D('dropoff_longitude', 'dropoff_latitude',
                          xbins=RESOLUTION, ybins=RESOLUTION, scheduler=s)
histogram2d.input.table = csv.output.table
histogram2d.input.min = min_.output.table
histogram2d.input.max = max_.output.table
heatmap = Heatmap(filename='nyc_dropoff_yellow%d.png', history=5, scheduler=s)
heatmap.input.array = histogram2d.output.table

if __name__ == '__main__':
    s.start()
"""Progressive MBKMeans visualized with an MCScatterPlot.

Fixed: the scheduler lookup used a bare ``except:``, which would also
swallow KeyboardInterrupt/SystemExit; narrowed to ``except NameError``,
the only exception the ``s = scheduler`` probe can legitimately raise.
"""
from progressivis import Scheduler, Print
from progressivis.cluster import MBKMeans
from progressivis.stats import RandomTable
from progressivis.vis import MCScatterPlot
import asyncio as aio

try:
    s = scheduler
except NameError:
    s = Scheduler()

table = RandomTable(columns=['a', 'b'], rows=50000, throttle=500, scheduler=s)
mbkmeans = MBKMeans(columns=['a', 'b'], n_clusters=8, batch_size=100,
                    is_input=False, scheduler=s)
mbkmeans.input.table = table.output.table
prn = Print(scheduler=s)
prn.input.df = mbkmeans.output.table
sp = MCScatterPlot(scheduler=s, classes=[('Scatterplot', 'a', 'b')],
                   approximate=True)
sp.create_dependent_modules(mbkmeans, 'table')
# Start refining the 2D range queries immediately.
sp['Scatterplot'].range_query_2d.hist_index_x.params.init_threshold = 1
sp['Scatterplot'].range_query_2d.hist_index_y.params.init_threshold = 1

if __name__ == '__main__':
    # table.start()
    aio.run(s.start(coros=[aio.sleep(3600)]))
def make_df(n, L):
    """Generate n * L random rows (10 columns) and return them as a DataFrame."""
    s = Scheduler()
    random = RandomTable(10, rows=n * L, scheduler=s)
    s.start()
    return pd.DataFrame(random.output.table.output_module.table().to_dict())
def p10s_zarr_random(n):
    """Benchmark RandomTable generation of n * L rows with zarr storage."""
    StorageEngine.default = "zarr"
    s = Scheduler()
    random = RandomTable(10, rows=n * L, scheduler=s)
    s.start()
def p10s_random(self):
    """Benchmark RandomTable generation of current_step * L rows with hdf5 storage."""
    n = self.current_step
    StorageEngine.default = "hdf5"
    s = Scheduler()
    random = RandomTable(10, rows=n * L, scheduler=s)
    s.start()
def test_read_csv(self):
    """CSVLoader starts with no data and loads all 1M rows of 'bigfile'.

    Fixed: replaced ``assertTrue(x is None)`` with the idiomatic
    ``assertIsNone``, which gives a clearer failure message.
    """
    s = Scheduler()
    module = CSVLoader(get_dataset('bigfile'), index_col=False, header=None,
                       scheduler=s)
    self.assertIsNone(module.df())
    s.start()
    self.assertEqual(len(module.df()), 1000000)
# NYC yellow-taxi pickup heatmap pipeline (fixed bounds via Constant modules).
# Fixed: the Constant module variables shadowed the builtins `min` and `max`;
# renamed to `min_` and `max_`.
SUFFIX = '.bz2'

URLS = [
    PREFIX + 'yellow_tripdata_2015-01.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-02.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-03.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-04.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-05.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-06.csv' + SUFFIX,
]

filenames = pd.DataFrame({'filename': URLS})
cst = Constant(df=filenames, scheduler=s)
csv = CSVLoader(index_col=False, skipinitialspace=True,
                usecols=['pickup_longitude', 'pickup_latitude'],
                filter=filter, scheduler=s)
csv.input.filenames = cst.output.df
# min = Min(scheduler=s)
# min.input.df = csv.output.df
# max = Max(scheduler=s)
# max.input.df = csv.output.df
min_ = Constant(df=pd.DataFrame([bounds_min]), scheduler=s)
max_ = Constant(df=pd.DataFrame([bounds_max]), scheduler=s)
histogram2d = Histogram2D('pickup_longitude', 'pickup_latitude',
                          xbins=RESOLUTION, ybins=RESOLUTION, scheduler=s)
histogram2d.input.df = csv.output.df
histogram2d.input.min = min_.output.df
histogram2d.input.max = max_.output.df
heatmap = Heatmap(filename='nyc_pickup_yellow%d.png', history=5, scheduler=s)
heatmap.input.array = histogram2d.output.df

if __name__ == '__main__':
    s.start()