def test_scatterplot(self):
    """Build a CSV -> scatterplot pipeline on 'bigfile' and check the row count.

    A CSVLoader reads the 'bigfile' dataset; a ScatterPlot on columns
    '_1'/'_2' creates its dependent modules from the loader's 'df' output.
    An Every module (calling print_len) is attached to the loader and a
    Print module to the scatterplot's histogram2d output. The scheduler is
    then started with idle_proc and the loaded frame is expected to hold
    1,000,000 rows.
    """
    s = Scheduler()
    csv = CSVLoader(get_dataset('bigfile'), index_col=False, header=None,
                    force_valid_ids=True, scheduler=s)
    sp = ScatterPlot(x_column='_1', y_column='_2', scheduler=s)
    sp.create_dependent_modules(csv, 'df')
    cnt = Every(proc=print_len, constant_time=True, scheduler=s)
    cnt.input.df = csv.output.df
    prt = Print(scheduler=s)
    prt.input.df = sp.histogram2d.output.df
    csv.scheduler().start(None, idle_proc)
    # assertEqual replaces the deprecated assertEquals alias, which no
    # longer exists in unittest as of Python 3.12.
    self.assertEqual(len(csv.df()), 1000000)
"""
Clustering datasets may be found at
https://cs.joensuu.fi/sipu/datasets/
"""
from progressivis import Scheduler, Every#, log_level
from progressivis.cluster import MBKMeans
from progressivis.io import CSVLoader
from progressivis.vis import ScatterPlot
from progressivis.datasets import get_dataset

# Reuse `scheduler` when that name already exists in this namespace;
# otherwise fall back to a fresh Scheduler.
try:
    s = scheduler
except NameError:
    s = Scheduler()
    #log_level(package="progressivis.cluster")

# Loader for the 'cluster:s1' dataset.
data = CSVLoader(
    get_dataset('cluster:s1'),
    sep='\\s+',
    skipinitialspace=True,
    header=None,
    index_col=False,
    scheduler=s,
)

# Clustering module on columns '_0' and '_1', fed by the loader's table.
mbkmeans = MBKMeans(
    columns=['_0', '_1'],
    n_clusters=15,
    batch_size=100,
    is_input=False,
    scheduler=s,
)
mbkmeans.input.table = data.output.table

# Every module attached to the clustering output.
prn = Every(scheduler=s)
prn.input.df = mbkmeans.output.table

# Scatterplot over the same two columns.
sp = ScatterPlot('_0', '_1', scheduler=s)

sp.move_point = mbkmeans  # for input management
sp.create_dependent_modules(
    data,
    'table',
    sample=None,
    select=mbkmeans,
)

if __name__ == '__main__':
    data.start()
    s.join()
    l = df['pickup_longitude']
    return df[(l < -70) & (l > -80)]


def print_len(x):
    """Print ``len(x)``; do nothing when ``x`` is None."""
    if x is None:
        return
    print(len(x))


#log_level()

# Reuse `scheduler` when that name already exists in this namespace;
# otherwise create a new Scheduler. Only NameError is caught: a bare
# `except:` would also swallow KeyboardInterrupt/SystemExit.
try:
    s = scheduler
except NameError:
    s = Scheduler()

# Loader for the 'bigfile' dataset.
csv = CSVLoader(get_dataset('bigfile'),
                header=None,
                index_col=False,
                force_valid_ids=True,
                scheduler=s)
# Every module attached to the loader's table output.
pr = Every(scheduler=s)
pr.input.df = csv.output.table
# Scatterplot of column '_1' against '_2', with its dependent modules
# created from the loader's 'table' output.
scatterplot = ScatterPlot(x_column='_1', y_column='_2', scheduler=s)
scatterplot.create_dependent_modules(csv, 'table')

if __name__ == '__main__':
    csv.start()
    s.join()
    print(len(csv.df()))
Example #4
0
from progressivis.stats import Min, Max, Histogram2D
from progressivis.vis import Heatmap, ScatterPlot
from progressivis.datasets import get_dataset

# Reuse `scheduler` when that name is already defined in this namespace;
# otherwise create a new Scheduler.
try:
    s = scheduler
except NameError:
    s = Scheduler()
    #log_level(package="progressivis.cluster")

# Loader for the 'cluster:s3' dataset (header=None, index_col=False).
data = CSVLoader(get_dataset('cluster:s3'),sep='    ',skipinitialspace=True,header=None,index_col=False,scheduler=s)
# MBKMeans clustering module (15 clusters, batch size 100) on columns 0 and 1,
# fed by the loader's 'df' output.
mbkmeans = MBKMeans(columns=[0, 1], n_clusters=15, batch_size=100, scheduler=s)
mbkmeans.input.df = data.output.df
# Every module attached to the clustering output.
prn = Every(scheduler=s)
prn.input.df = mbkmeans.output.df
# Scatterplot over columns 0 and 1; its dependent modules are wired manually below.
sp = ScatterPlot(0,1, scheduler=s)
sp.move_point = mbkmeans # for input management
#sp.create_dependent_modules(mbkmeans,'centroids')
# Create modules by hand rather than with the utility.
# We show the cluster centroids on the scatterplot and the
# data as a heatmap.

# histogram2d: 2D histogram over columns 0 and 1 of the loaded data.
histogram2d = Histogram2D(0, 1, scheduler=s)
histogram2d.input.df = data.output.df
# Min and Max modules over the same two columns, both reading the loaded data.
min_mod = Min([0,1], scheduler=s)
max_mod = Max([0,1], scheduler=s)
min_mod.input.df = data.output.df
max_mod.input.df = data.output.df
# Their outputs feed the histogram's 'min' and 'max' inputs
# (presumably the histogram bounds -- TODO confirm).
histogram2d.input.min = min_mod.output.df
histogram2d.input.max = max_mod.output.df
from progressivis import *
from progressivis.vis import ScatterPlot
from progressivis.io import CSVLoader
from progressivis.datasets import get_dataset

def filter(df):
    """Return the rows of ``df`` whose 'pickup_longitude' is strictly between -80 and -70."""
    longitude = df['pickup_longitude']
    in_range = (longitude > -80) & (longitude < -70)
    return df[in_range]

def print_len(x):
    """Print ``len(x)``; do nothing when ``x`` is None."""
    if x is not None:
        # Function-call form: the Python 2 print statement is a
        # SyntaxError on Python 3, while print(...) with a single
        # argument behaves the same on both.
        print(len(x))

#log_level()

# Reuse `scheduler` when that name already exists in this namespace;
# otherwise create a new Scheduler. Only NameError is caught: a bare
# `except:` would also swallow KeyboardInterrupt/SystemExit.
try:
    s = scheduler
except NameError:
    s = Scheduler()

# Loader for the 'bigfile' dataset.
csv = CSVLoader(get_dataset('bigfile'), header=None, index_col=False,
                force_valid_ids=True, scheduler=s)
# Every module attached to the loader's 'df' output.
pr = Every(scheduler=s)
pr.input.df = csv.output.df
# Scatterplot of column '_1' against '_2', with its dependent modules
# created from the loader's 'df' output.
scatterplot = ScatterPlot('_1', '_2', scheduler=s)
scatterplot.create_dependent_modules(csv, 'df')

if __name__ == '__main__':
    csv.start()
    s.join()
    # print(...) instead of the Python 2 print statement, which does not
    # parse on Python 3.
    print(len(csv.df()))
# Suffix appended to every file name below (PREFIX is not defined in this section).
SUFFIX= '.bz2'

# yellow_tripdata files 2015-01 through 2015-06, each as PREFIX + name + SUFFIX.
URLS = [
    PREFIX+'yellow_tripdata_2015-01.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-02.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-03.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-04.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-05.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-06.csv'+SUFFIX,
]

# One-column DataFrame ('filename') holding the URLs, wrapped in a Constant module.
filenames = pd.DataFrame({'filename': URLS})
cst = Constant(df=filenames, scheduler=s)
# Loader restricted to the two pickup coordinate columns; `filter` is passed
# as the loader's filter argument.
csv = CSVLoader(index_col=False,skipinitialspace=True,usecols=['pickup_longitude', 'pickup_latitude'], filter=filter, scheduler=s)
# Alternative kept for reference: the same loader without the filter.
#csv = CSVLoader(index_col=False,skipinitialspace=True,usecols=['pickup_longitude', 'pickup_latitude'], scheduler=s)
# The loader takes its file names from the Constant module's output.
csv.input.filenames = cst.output.df
# Every module attached to the loader's 'df' output.
pr = Every(scheduler=s)
pr.input.df = csv.output.df
# Scatterplot of pickup longitude against latitude, with its dependent
# modules created from the loader's 'df' output.
scatterplot = ScatterPlot('pickup_longitude', 'pickup_latitude', scheduler=s)
scatterplot.create_dependent_modules(csv,'df')

if __name__=='__main__':
    s.start()
    # Every two seconds, query the scheduler and the scatterplot the way
    # a web client would.
    # NOTE(review): this loop has no exit condition, so the join/print
    # after it are never reached as written -- confirm whether a stop
    # condition is intended.
    while True:
        time.sleep(2)
        # Was `scheluder.to_json()`: an undefined name that raised
        # NameError on the first iteration. `s` is the scheduler that was
        # started above.
        s.to_json()
        scatterplot.to_json() # simulate a web query
        scatterplot.get_image()
    s.join()
    # print(...) instead of the Python 2 print statement, which does not
    # parse on Python 3.
    print(len(csv.df()))
from progressivis import Scheduler, Print
from progressivis.cluster import MBKMeans
from progressivis.stats import RandomTable
from progressivis.vis import ScatterPlot

# Reuse `scheduler` when that name already exists in this namespace;
# otherwise create a new Scheduler. Only NameError is caught: a bare
# `except:` would also swallow KeyboardInterrupt/SystemExit.
try:
    s = scheduler
except NameError:
    s = Scheduler()

# RandomTable source with columns 'a' and 'b' (rows=50000, throttle=500).
table = RandomTable(columns=['a', 'b'], rows=50000, throttle=500, scheduler=s)
# Clustering module with 8 clusters on both columns, fed by the random table.
mbkmeans = MBKMeans(columns=['a', 'b'], n_clusters=8, batch_size=100, is_input=False, scheduler=s)
mbkmeans.input.table = table.output.table
# Print module attached to the clustering output.
prn = Print(scheduler=s)
prn.input.df = mbkmeans.output.table
# Scatterplot of 'a' against 'b', with its dependent modules created from
# the clustering module's 'table' output.
sp = ScatterPlot('a', 'b', scheduler=s)
sp.create_dependent_modules(mbkmeans,'table')

if __name__ == '__main__':
    table.start()