def test_scatterplot(self):
    """Load the 'bigfile' CSV progressively, wire it into a ScatterPlot
    pipeline, run the scheduler to completion, and check that every row
    (1,000,000) was loaded."""
    s = Scheduler()
    csv = CSVLoader(get_dataset('bigfile'), index_col=False, header=None,
                    force_valid_ids=True, scheduler=s)
    sp = ScatterPlot(x_column='_1', y_column='_2', scheduler=s)
    sp.create_dependent_modules(csv, 'df')
    # Side-channel consumer that prints progress as chunks arrive.
    cnt = Every(proc=print_len, constant_time=True, scheduler=s)
    cnt.input.df = csv.output.df
    prt = Print(scheduler=s)
    prt.input.df = sp.histogram2d.output.df
    csv.scheduler().start(None, idle_proc)
    # assertEquals is a deprecated alias, removed from modern unittest;
    # use the canonical assertEqual instead.
    self.assertEqual(len(csv.df()), 1000000)
""" Clustering datasets may be found at https://cs.joensuu.fi/sipu/datasets/ """ from progressivis import Scheduler, Every#, log_level from progressivis.cluster import MBKMeans from progressivis.io import CSVLoader from progressivis.vis import ScatterPlot from progressivis.datasets import get_dataset try: s = scheduler except NameError: s = Scheduler() #log_level(package="progressivis.cluster") data = CSVLoader(get_dataset('cluster:s1'),sep='\\s+',skipinitialspace=True,header=None,index_col=False,scheduler=s) mbkmeans = MBKMeans(columns=['_0', '_1'], n_clusters=15, batch_size=100, is_input=False, scheduler=s) mbkmeans.input.table = data.output.table prn = Every(scheduler=s) prn.input.df = mbkmeans.output.table sp = ScatterPlot('_0','_1', scheduler=s) sp.move_point = mbkmeans # for input management sp.create_dependent_modules(data,'table', sample=None, select=mbkmeans) if __name__ == '__main__': data.start() s.join()
def filter(df):
    """Keep only rows whose pickup longitude lies in (-80, -70).

    NOTE(review): the `def` header of this function was garbled in the
    source (the body appeared as orphan statements); it is reconstructed
    from the byte-identical helper in the sibling script in this file.
    """
    l = df['pickup_longitude']
    return df[(l < -70) & (l > -80)]


def print_len(x):
    """Print the length of *x* unless it is None."""
    if x is not None:
        print(len(x))


# log_level()

# Reuse an ambient scheduler when one exists; otherwise build one.
try:
    s = scheduler
except NameError:  # narrowed from a bare `except:` — only NameError is expected here
    s = Scheduler()

csv = CSVLoader(get_dataset('bigfile'), header=None, index_col=False,
                force_valid_ids=True, scheduler=s)
pr = Every(scheduler=s)
pr.input.df = csv.output.table
scatterplot = ScatterPlot(x_column='_1', y_column='_2', scheduler=s)
scatterplot.create_dependent_modules(csv, 'table')

if __name__ == '__main__':
    csv.start()
    s.join()
    print(len(csv.df()))
from progressivis.stats import Min, Max, Histogram2D
from progressivis.vis import Heatmap, ScatterPlot
from progressivis.datasets import get_dataset

# Pick up a scheduler supplied by the environment when available.
try:
    s = scheduler
except NameError:
    s = Scheduler()

# log_level(package="progressivis.cluster")

# Progressive load of the S3 clustering benchmark (space-separated).
data = CSVLoader(get_dataset('cluster:s3'), sep=' ', skipinitialspace=True,
                 header=None, index_col=False, scheduler=s)

mbkmeans = MBKMeans(columns=[0, 1], n_clusters=15, batch_size=100,
                    scheduler=s)
mbkmeans.input.df = data.output.df

prn = Every(scheduler=s)
prn.input.df = mbkmeans.output.df

sp = ScatterPlot(0, 1, scheduler=s)
sp.move_point = mbkmeans  # for input management
# sp.create_dependent_modules(mbkmeans,'centroids')

# Create modules by hand rather than with the utility.
# We show the cluster centroids on the scatterplot and the
# data as a heatmap.

# histogram2d fed by the data stream, bounded by progressive min/max.
histogram2d = Histogram2D(0, 1, scheduler=s)
histogram2d.input.df = data.output.df
min_mod = Min([0, 1], scheduler=s)
max_mod = Max([0, 1], scheduler=s)
min_mod.input.df = data.output.df
max_mod.input.df = data.output.df
histogram2d.input.min = min_mod.output.df
histogram2d.input.max = max_mod.output.df
from progressivis import *
from progressivis.vis import ScatterPlot
from progressivis.io import CSVLoader
from progressivis.datasets import get_dataset


def filter(df):
    """Keep only rows whose pickup longitude lies in (-80, -70)."""
    l = df['pickup_longitude']
    return df[(l < -70) & (l > -80)]


def print_len(x):
    """Print the length of *x* unless it is None."""
    if x is not None:
        # Parenthesized print: valid under both Python 2 and Python 3,
        # consistent with the sibling scripts in this file.
        print(len(x))


# log_level()

# Reuse an ambient scheduler when one exists; otherwise build one.
try:
    s = scheduler
except NameError:  # narrowed from a bare `except:` — only NameError is expected here
    s = Scheduler()

csv = CSVLoader(get_dataset('bigfile'), header=None, index_col=False,
                force_valid_ids=True, scheduler=s)
pr = Every(scheduler=s)
pr.input.df = csv.output.df
scatterplot = ScatterPlot('_1', '_2', scheduler=s)
scatterplot.create_dependent_modules(csv, 'df')

if __name__ == '__main__':
    csv.start()
    s.join()
    print(len(csv.df()))
SUFFIX = '.bz2'
# Six months of NYC yellow-taxi trip data, fetched as bzip2-compressed CSV.
# PREFIX is defined earlier in the file (outside this chunk).
URLS = [
    PREFIX + 'yellow_tripdata_2015-01.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-02.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-03.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-04.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-05.csv' + SUFFIX,
    PREFIX + 'yellow_tripdata_2015-06.csv' + SUFFIX,
]

# Feed the loader its file list through a Constant module.
filenames = pd.DataFrame({'filename': URLS})
cst = Constant(df=filenames, scheduler=s)
csv = CSVLoader(index_col=False, skipinitialspace=True,
                usecols=['pickup_longitude', 'pickup_latitude'],
                filter=filter, scheduler=s)
# csv = CSVLoader(index_col=False,skipinitialspace=True,usecols=['pickup_longitude', 'pickup_latitude'], scheduler=s)
csv.input.filenames = cst.output.df
pr = Every(scheduler=s)
pr.input.df = csv.output.df
scatterplot = ScatterPlot('pickup_longitude', 'pickup_latitude', scheduler=s)
scatterplot.create_dependent_modules(csv, 'df')

if __name__ == '__main__':
    s.start()
    while True:
        time.sleep(2)
        # Fixed typo: was `scheluder.to_json()`, which raised NameError;
        # the scheduler variable in scope is `s`.
        s.to_json()
        scatterplot.to_json()  # simulate a web query
        scatterplot.get_image()
    # NOTE(review): unreachable — the loop above never terminates.
    s.join()
    print(len(csv.df()))
from progressivis import Scheduler, Print
from progressivis.cluster import MBKMeans
from progressivis.stats import RandomTable
from progressivis.vis import ScatterPlot

# Reuse an ambient scheduler when one exists; otherwise build one.
try:
    s = scheduler
except NameError:  # narrowed from a bare `except:`, consistent with the other demos in this file
    s = Scheduler()

# 50k random 2-D points, throttled so the pipeline stays progressive.
table = RandomTable(columns=['a', 'b'], rows=50000, throttle=500,
                    scheduler=s)

mbkmeans = MBKMeans(columns=['a', 'b'], n_clusters=8, batch_size=100,
                    is_input=False, scheduler=s)
mbkmeans.input.table = table.output.table

prn = Print(scheduler=s)
prn.input.df = mbkmeans.output.table

sp = ScatterPlot('a', 'b', scheduler=s)
sp.create_dependent_modules(mbkmeans, 'table')

if __name__ == '__main__':
    table.start()