Example no. 1
 async def modify_3(scheduler: Scheduler, run_number: int) -> None:
     """Add a second scatterplot to the live dataflow, then chain modify_4."""
     print("Adding scatterplot_2")
     # The first scatterplot is expected to be gone by now.
     still_present = "scatterplot_1" in scheduler
     self.assertFalse(still_present)
     with scheduler:
         scatterplot = MCScatterPlot(
             name="scatterplot_2",
             classes=[("Scatterplot", "a", "c")],
             approximate=True,
             scheduler=scheduler,
         )
         scatterplot.create_dependent_modules(table, "result")
     # Schedule the next activity
     scheduler.on_loop(modify_4, 10)
Example no. 2
 async def modify_1(scheduler: Scheduler, run_number: int) -> None:
     """Install the first scatterplot into the dataflow, then chain modify_2."""
     print("Adding scatterplot_1")
     # from nose.tools import set_trace; set_trace()
     with scheduler as dataflow:
         scatterplot = MCScatterPlot(
             name="scatterplot_1",
             classes=[("Scatterplot", "a", "b")],
             approximate=True,
             scheduler=scheduler,
         )
         scatterplot.create_dependent_modules(table, "result")
         print(f"Created scatterplot_1, groups: {dataflow.groups()}")
     # Schedule the next activity
     scheduler.on_loop(modify_2, 10)
 def test_scatterplot2(self):
     """Run an approximate multiclass scatterplot over a large random table
     and check the sampled points stay inside the interactive bounds."""
     sched = self.scheduler()
     rnd = RandomTable(2, rows=2000000, scheduler=sched)
     plot = MCScatterPlot(
         scheduler=sched,
         classes=[('Scatterplot', '_1', '_2')],
         approximate=True,
     )
     plot.create_dependent_modules(rnd, 'table', with_sampling=False)
     sink = Every(proc=self.terse, constant_time=True, scheduler=sched)
     sink.input.df = rnd.output.table
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input.df = plot.output.table
     # Patch the two variable modules and the plot so the run is steered.
     for patch in (VariablePatch1("variable_1"),
                   VariablePatch2("variable_2"),
                   ScatterPlotPatch("mc_scatter_plot_1")):
         decorate(sched, patch)
     plot.scheduler().start(idle_proc=idle_proc)
     sched.join()
     js = plot.to_json()
     xs, ys, _ = zip(*js['sample']['data'])
     # Every sampled point must fall within the injected bounds.
     self.assertGreaterEqual(min(xs), LOWER_X)
     self.assertGreaterEqual(min(ys), LOWER_Y)
     self.assertLessEqual(max(xs), UPPER_X)
     self.assertLessEqual(max(ys), UPPER_Y)
 def test_scatterplot(self):
     """Load a small CSV and check every row flows through the scatterplot."""
     sched = self.scheduler()
     loader = CSVLoader(
         get_dataset('smallfile'),
         index_col=False,
         header=None,
         force_valid_ids=True,
         scheduler=sched,
     )
     plot = MCScatterPlot(
         scheduler=sched,
         classes=[('Scatterplot', '_1', '_2')],
         approximate=True,
     )
     plot.create_dependent_modules(loader, 'table')
     sink = Every(proc=self.terse, constant_time=True, scheduler=sched)
     sink.input.df = loader.output.table
     printer = Print(proc=self.terse, scheduler=sched)
     printer.input.df = plot.output.table
     loader.scheduler().start(idle_proc=idle_proc)
     sched.join()
     # The 'smallfile' dataset is expected to contain exactly 30000 rows.
     self.assertEqual(len(loader.table()), 30000)
    def test_scatterplot2(self) -> None:
        """Drive an approximate scatterplot over a large random table.

        Two fake-input callbacks feed the dynamic variable modules with the
        lower/upper corner coordinates; after the run, the sampled points
        are checked to lie inside those bounds.
        """
        s = self.scheduler(clean=True)
        with s:
            # Build the whole dataflow atomically inside the scheduler context.
            random = RandomTable(2, rows=2000000, throttle=1000, scheduler=s)
            sp = MCScatterPlot(scheduler=s,
                               classes=[("Scatterplot", "_1", "_2")],
                               approximate=True)
            sp.create_dependent_modules(random, "result", with_sampling=False)
            cnt = Every(proc=self.terse, constant_time=True, scheduler=s)
            cnt.input[0] = random.output.result
            prt = Print(proc=self.terse, scheduler=s)
            prt.input[0] = sp.output.result

        async def fake_input_1(scheduler: Scheduler, rn: int) -> None:
            # Simulate a user setting the lower corner of the selection.
            module = scheduler["dyn_var_1"]
            print("from input dyn_var_1")
            await module.from_input({"x": LOWER_X, "y": LOWER_Y})

        async def fake_input_2(scheduler: Scheduler, rn: int) -> None:
            # Simulate a user setting the upper corner of the selection.
            module = scheduler["dyn_var_2"]
            print("from input dyn_var_2")
            await module.from_input({"x": UPPER_X, "y": UPPER_Y})

        # finp1 = fake_input(s, "dyn_var_1", 6, {"x": LOWER_X, "y": LOWER_Y})
        # finp2 = fake_input(s, "dyn_var_2", 6, {"x": UPPER_X, "y": UPPER_Y})
        # sts = sleep_then_stop(s, 10)
        # Stop after 10 loops; inject both corners after 3 loops.
        s.on_loop(self._stop, 10)
        # s.on_loop(prt)
        s.on_loop(fake_input_1, 3)
        s.on_loop(fake_input_2, 3)
        # aio.run_gather(sp.scheduler().start(), sts)
        aio.run(s.start())
        js = sp.to_json()
        # Unzip the sampled triples -- TODO confirm the meaning of the
        # third component (discarded here).
        x, y, _ = zip(*js["sample"]["data"])
        min_x = min(x)
        max_x = max(x)
        min_y = min(y)
        max_y = max(y)
        # All sampled points must fall within the injected bounds.
        self.assertGreaterEqual(min_x, LOWER_X)
        self.assertGreaterEqual(min_y, LOWER_Y)
        self.assertLessEqual(max_x, UPPER_X)
        self.assertLessEqual(max_y, UPPER_Y)
Example no. 6
        async def modify_1(scheduler: Scheduler, run_number: int) -> None:
            """Add scatterplot_1 to the dataflow, verify that an invalid
            second dataflow edit is rejected, then chain modify_2."""
            print("Adding scatterplot_1")
            with scheduler as dataflow:
                # Keep a handle on the dataflow to check identity below.
                dataflow1 = dataflow
                sp = MCScatterPlot(
                    name="scatterplot_1",
                    classes=[("Scatterplot", "a", "b")],
                    approximate=True,
                    scheduler=scheduler,
                )
                sp.create_dependent_modules(table, "result")
                print(f"Created scatterplot_1, groups: {dataflow.groups()}")

            # NOTE(review): this second edit is expected to raise -- presumably
            # because the Print module is left without any input connection
            # (its wiring line is commented out). TODO confirm against the
            # progressivis dataflow validation rules.
            with self.assertRaises(ProgressiveError):
                with scheduler as dataflow:
                    # Re-entering the scheduler must hand back the same
                    # dataflow object as before.
                    self.assertIs(dataflow, dataflow1)
                    prt = Print(name="print",
                                proc=self.terse,
                                scheduler=scheduler)
                    # prt.input.df = table.output.result
                    _ = prt
            scheduler.on_loop(modify_2, 3)  # Schedule the next activity
 def test_scatterplot(self) -> None:
     """Load a small CSV and check every row reaches the scatterplot."""
     sched = self.scheduler(clean=True)
     with sched:
         loader = CSVLoader(
             get_dataset("smallfile"),
             index_col=False,
             header=None,
             force_valid_ids=True,
             scheduler=sched,
         )
         plot = MCScatterPlot(
             scheduler=sched,
             classes=[("Scatterplot", "_1", "_2")],
             approximate=True,
         )
         plot.create_dependent_modules(loader, "result")
         sink = Every(proc=self.terse, constant_time=True, scheduler=sched)
         sink.input[0] = loader.output.result
         printer = Print(proc=self.terse, scheduler=sched)
         printer.input[0] = plot.output.result
         # sts = sleep_then_stop(s, 5)
     # Stop the scheduler after 5 loop iterations.
     sched.on_loop(self._stop, 5)
     aio.run(loader.scheduler().start())
     self.assertEqual(len(loader.table), 30000)
Example no. 8
                    scheduler=s)
# Build one scatterplot class per k-means cluster, each fed by a filter
# module that keeps only the rows assigned to that cluster.
classes = []
for idx in range(n_clusters):
    cluster_filter = MBKMeansFilter(idx)
    cluster_filter.create_dependent_modules(mbkmeans, data, 'table')
    classes.append({
        'name': f"k{idx}",
        'x_column': '_0',
        'y_column': '_1',
        # Only the first class carries the sample module.
        'sample': mbkmeans if idx == 0 else None,
        'input_module': cluster_filter,
        'input_slot': 'table',
    })

sp = MCScatterPlot(scheduler=s, classes=classes)
sp.create_dependent_modules()
# Infinite bounds disable range filtering for every cluster class.
for idx in range(n_clusters):
    scatter_class = sp[f"k{idx}"]
    scatter_class.min_value._table = PsDict({'_0': -np.inf, '_1': -np.inf})
    scatter_class.max_value._table = PsDict({'_0': np.inf, '_1': np.inf})
mbkmeans.input.table = data.output.table
mbkmeans.create_dependent_modules()
sp.move_point = mbkmeans.moved_center  # for input management


def myprint(d):
    """Print *d* when its convergence status is known, else a progress dot."""
    if d['convergence'] == 'unknown':
        print('.', end='')
    else:
        print(d)
Example no. 9
from progressivis import Scheduler, Print
from progressivis.cluster import MBKMeans
from progressivis.stats import RandomTable
from progressivis.vis import MCScatterPlot
import asyncio as aio

try:
    # Reuse an ambient `scheduler` when one was injected (e.g. by a
    # progressivis shell); otherwise fall back to a fresh Scheduler.
    s = scheduler
except NameError:
    # Narrowed from a bare `except:` (which would also hide KeyboardInterrupt
    # and real bugs) -- only a missing `scheduler` name triggers the fallback,
    # matching the idiom used by the other examples in this file.
    s = Scheduler()

# Generate a 2-column random table, cluster it with mini-batch k-means and
# visualise the result through an approximate scatterplot.
table = RandomTable(columns=['a', 'b'], rows=50000, throttle=500, scheduler=s)
mbkmeans = MBKMeans(columns=['a', 'b'],
                    n_clusters=8,
                    batch_size=100,
                    is_input=False,
                    scheduler=s)
mbkmeans.input.table = table.output.table
prn = Print(scheduler=s)
prn.input.df = mbkmeans.output.table
sp = MCScatterPlot(scheduler=s,
                   classes=[('Scatterplot', 'a', 'b')],
                   approximate=True)
sp.create_dependent_modules(mbkmeans, 'table')
# NOTE(review): init_threshold=1 presumably makes the 2D range-query
# histogram indexes start splitting immediately -- confirm against the
# progressivis documentation.
sp['Scatterplot'].range_query_2d.hist_index_x.params.init_threshold = 1
sp['Scatterplot'].range_query_2d.hist_index_y.params.init_threshold = 1

if __name__ == '__main__':
    #table.start()
    # Run the scheduler; the extra sleep coroutine keeps the event loop
    # alive for up to an hour.
    aio.run(s.start(coros=[aio.sleep(3600)]))
    #log_level(package="progressivis.cluster")

# Stream a generated CSV file, cluster it into 3 groups with mini-batch
# k-means, and feed the clusters to a scatterplot with unbounded bounds.
file_name = "/tmp/foobar.csv"
gen_csv(file_name, rows=999999, reset=True)  #, header='_0,_1', reset=False)
data = CSVLoader(file_name,
                 skipinitialspace=True,
                 header=None,
                 index_col=False,
                 scheduler=s)
mbkmeans = MBKMeans(columns=['_0', '_1'],
                    n_clusters=3,
                    batch_size=100,
                    tol=0.01,
                    is_input=False,
                    scheduler=s)
sp = MCScatterPlot(scheduler=s,
                   classes=[('Scatterplot', '_0', '_1', mbkmeans)])
sp.create_dependent_modules(data, 'table')
# Infinite bounds disable range filtering on both axes.
sp['Scatterplot'].min_value._table = PsDict({'_0': -np.inf, '_1': -np.inf})
sp['Scatterplot'].max_value._table = PsDict({'_0': np.inf, '_1': np.inf})
# k-means is fed from the range-query output rather than the raw loader
# (the direct wiring is kept below, commented out).
mbkmeans.input.table = sp['Scatterplot'].range_query_2d.output.table
#mbkmeans.input.table = data.output.table
mbkmeans.create_dependent_modules()
sp.move_point = mbkmeans.moved_center  # for input management


def myprint(d):
    """Emit *d* once convergence is determined; otherwise print a dot."""
    if d['convergence'] == 'unknown':
        # Still converging: show progress without flooding the console.
        print('.', end='')
        return
    print(d)
    PREFIX + 'yellow_tripdata_2015-06.csv' + SUFFIX,
]

# Wrap the source URLs in a constant table of filenames feeding a
# progressive CSV loader restricted to the pickup coordinates.
FILENAMES = pd.DataFrame({'filename': URLS})
CST = Constant(Table('filenames', data=FILENAMES), scheduler=s)
CSV = CSVLoader(index_col=False,
                skipinitialspace=True,
                usecols=['pickup_longitude', 'pickup_latitude'],
                filter_=_filter,
                scheduler=s)

CSV.input.filenames = CST.output.table
PR = Every(scheduler=s)
PR.input.df = CSV.output.table
SCATTERPLOT = MCScatterPlot(scheduler=s,
                            classes=[('Scatterplot', 'pickup_longitude',
                                      'pickup_latitude')],
                            approximate=True)
SCATTERPLOT.create_dependent_modules(CSV, 'table')
# NOTE(review): presumably tunes scheduler responsiveness during user
# interaction (caps iterations/time when the listed modules starve) --
# confirm against the progressivis Scheduler API.
s.set_interaction_opts(starving_mods=SCATTERPLOT.get_starving_mods(),
                       max_iter=3,
                       max_time=1.5)
if __name__ == '__main__':
    s.start()
    # Serialise the scheduler and plot state every 2 seconds, forever.
    while True:
        time.sleep(2)
        s.to_json()
        SCATTERPLOT.to_json()  # simulate a web query
        #SCATTERPLOT.get_image()
    # NOTE(review): the two lines below are unreachable -- the `while True`
    # loop above never exits. The loop likely needs a termination condition
    # (e.g. stop once the scheduler has finished).
    s.join()
    print(len(CSV.table()))
Example no. 12
# Load trip pickup/dropoff coordinates progressively and visualise the two
# point classes through a single approximate multiclass scatterplot.
CST = Constant(Table('filenames', data=FILENAMES), scheduler=s)
CSV = CSVLoader(index_col=False,
                skipinitialspace=True,
                usecols=[
                    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
                    'dropoff_latitude'
                ],
                filter_=_filter,
                scheduler=s)  # TODO: reimplement filter in read_csv.py

CSV.input.filenames = CST.output.result
PR = Every(scheduler=s)
PR.input.df = CSV.output.result
MULTICLASS = MCScatterPlot(scheduler=s,
                           classes=[('pickup', 'pickup_longitude',
                                     'pickup_latitude'),
                                    ('dropoff', 'dropoff_longitude',
                                     'dropoff_latitude')],
                           approximate=True)
MULTICLASS.create_dependent_modules(CSV, 'result')


async def coro(s):
    """Wait two seconds, announce wake-up, then snapshot the scheduler."""
    pause_seconds = 2
    await aio.sleep(pause_seconds)
    print("awake after 2 sec.")
    s.to_json()


if __name__ == '__main__':
    # Run the dataflow alongside the snapshot coroutine; the extra sleep
    # keeps the event loop alive for up to an hour.
    aio.run(s.start(coros=[coro(s), aio.sleep(3600)]))
    print(len(CSV.table()))
# Older-API variant of the multiclass example: slots are named 'table'
# instead of 'result'.
CST = Constant(Table('filenames', data=FILENAMES), scheduler=s)
CSV = CSVLoader(index_col=False,
                skipinitialspace=True,
                usecols=[
                    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
                    'dropoff_latitude'
                ],
                filter_=_filter,
                scheduler=s)  # TODO: reimplement filter in read_csv.py

CSV.input.filenames = CST.output.table
PR = Every(scheduler=s)
PR.input.df = CSV.output.table
MULTICLASS = MCScatterPlot(scheduler=s,
                           classes=[('pickup', 'pickup_longitude',
                                     'pickup_latitude'),
                                    ('dropoff', 'dropoff_longitude',
                                     'dropoff_latitude')],
                           approximate=True)
MULTICLASS.create_dependent_modules(CSV, 'table')
# NOTE(review): presumably tunes scheduler responsiveness during user
# interaction -- confirm against the progressivis Scheduler API.
s.set_interaction_opts(starving_mods=MULTICLASS.get_starving_mods(),
                       max_iter=3,
                       max_time=1.5)
if __name__ == '__main__':
    s.start()
    # Serialise the scheduler and plot state every 2 seconds, forever.
    while True:
        time.sleep(2)
        s.to_json()
        MULTICLASS.to_json()  # simulate a web query
    # NOTE(review): unreachable -- the `while True` loop above never exits,
    # so the scheduler is never joined and the row count never printed.
    s.join()
    print(len(CSV.table()))
Example no. 14
Clustering datasets may be found at
http://cs.joensuu.fi/sipu/datasets/
"""
from progressivis import Scheduler, Every#, log_level
from progressivis.cluster import MBKMeans
from progressivis.io import CSVLoader
from progressivis.vis import MCScatterPlot
from progressivis.datasets import get_dataset
from progressivis.stats import RandomTable

try:
    # Reuse an ambient `scheduler` when one was injected (e.g. by a
    # progressivis shell); otherwise create a fresh one.
    s = scheduler
except NameError:
    s = Scheduler()
    #log_level(package="progressivis.cluster")

# Load the s1 clustering benchmark, run 15-cluster mini-batch k-means on it
# and plot the points together with the cluster centroids.
data = CSVLoader(
    get_dataset('cluster:s1'),
    sep='\\s+',
    skipinitialspace=True,
    header=None,
    index_col=False,
    scheduler=s,
)
mbkmeans = MBKMeans(
    columns=['_0', '_1'],
    n_clusters=15,
    batch_size=100,
    is_input=False,
    scheduler=s,
)
sp = MCScatterPlot(scheduler=s,
                   classes=[('Scatterplot', '_0', '_1', mbkmeans)])
sp.create_dependent_modules(data, 'table')

mbkmeans.input.table = data.output.table

prn = Every(scheduler=s)
prn.input.df = mbkmeans.output.table

if __name__ == '__main__':
    # `aio` is never imported by this example's header (lines above import
    # only progressivis names), so the original entry point raised
    # NameError. Import asyncio locally under the alias the code expects.
    import asyncio as aio

    #data.start()
    #s.join()
    aio.run(s.start())