Example #1
 def test_hub_if_else(self):
     s = Scheduler()
     random = RandomTable(2, rows=100000, scheduler=s)
     stirrer = Stirrer(
         update_column="_1",
         delete_rows=5,
         update_rows=5,
         fixed_step_size=100,
         scheduler=s,
     )
     stirrer.input[0] = random.output.result
     switch = Switch(condition=lambda x: False, scheduler=s)
     switch.input[0] = stirrer.output.result
     max_ = Max(name="max_" + str(hash(random)), scheduler=s)
     max_.input[0] = switch.output.result
     min_ = Min(name="min_" + str(hash(random)), scheduler=s)
     min_.input[0] = switch.output.result_else
     hub = Hub(scheduler=s)
     hub.input.table = min_.output.result
     hub.input.table = max_.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = hub.output.result
     aio.run(s.start())
     res1 = stirrer.result.min()
     res2 = hub.result
     self.compare(res1, res2)
 def setUpStep(self, step):
     self.set_step_info("{} rows".format(step * L))
     s = Scheduler()
     random = RandomTable(10, rows=step * L, scheduler=s)
     s.start()
     #return random
     self.random_table = pd.DataFrame(
         random.output.table.output_module.table().to_dict())
Example #3
 def test_input(self):
     s = Scheduler()
     inp = Input(scheduler=s)
     pr = Print(scheduler=s)
     pr.input.df = inp.output.df
     t = threading.Thread(target=do_line, args=(inp, s))
     t.start()
     s.start()
     self.assertEqual(len(inp.df()), 10)
 def p10s_random_min_max(n):
     StorageEngine.default = "hdf5"
     s = Scheduler()
     random = RandomTable(10, rows=n * L, scheduler=s)
     min_ = Min(name='min_' + str(hash(random)), scheduler=s)
     min_.input.table = random.output.table
     max_ = Max(name='max_' + str(hash(random)), scheduler=s)
     max_.input.table = random.output.table
     s.start()
 def p10s_random_min_max(self):
     n = self.current_step
     StorageEngine.default = "hdf5"
     s = Scheduler()
     random = RandomTable(10, rows=n * L, scheduler=s)
     min_ = Min(name='min_' + str(hash(random)), scheduler=s)
     min_.input.table = random.output.table
     max_ = Max(name='max_' + str(hash(random)), scheduler=s)
     max_.input.table = random.output.table
     s.start()
Example #6
 def test_filter(self) -> None:
     s = Scheduler()
     random = RandomTable(2, rows=100000, scheduler=s)
     filter_ = FilterMod(expr="_1 > 0.5", scheduler=s)
     filter_.input[0] = random.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = filter_.output.result
     aio.run(s.start())
     idx = (filter_.get_input_slot("table").data().eval(
         "_1>0.5", result_object="index"))
     self.assertEqual(filter_.table.index, bitmap(idx))
Example #7
 def test_max(self):
     s = Scheduler()
     random = RandomTable(10, rows=10000, scheduler=s)
     max_ = Max(scheduler=s)  # avoid shadowing the builtin max
     max_.input.df = random.output.df
     pr = Print(scheduler=s)
     pr.input.df = max_.output.df
     s.start()
     res1 = random.df()[random.columns.difference([random.UPDATE_COLUMN])].max()
     res2 = last_row(max_.df(), remove_update=True)
     self.assertTrue(np.allclose(res1, res2))
 def test_random_table(self):
     s = Scheduler()
     module = RandomTable(['a', 'b'], rows=10000, scheduler=s)
     self.assertEqual(module.df().columns[0], 'a')
     self.assertEqual(module.df().columns[1], 'b')
     self.assertEqual(len(module.df().columns), 3)  # includes the UPDATE_COLUMN
     prlen = Every(proc=print_len, constant_time=True, scheduler=s)
     prlen.input.df = module.output.df
     s.start()
     self.assertEqual(len(module.df()), 10000)
     self.assertFalse(module.df()['a'].isnull().any())
     self.assertFalse(module.df()['b'].isnull().any())
Example #9
 def test_var(self):
     s = Scheduler()
     random = RandomTable(1, rows=1000, scheduler=s)
     var = Var(scheduler=s)
     var.input.df = random.output.df
     pr = Print(scheduler=s)
     pr.input.df = var.output.df
     s.start()
     res1 = random.df()[1].var()
     res2 = last_row(var.df(), remove_update=True)
     self.assertTrue(np.allclose(res1, res2))
 def test_random_table2(self):
     s = Scheduler()
     # produces more than 4M rows per second on my laptop
     module = RandomTable(10, rows=10000000, force_valid_ids=True, scheduler=s)
     self.assertEqual(len(module.df().columns), 11)  # includes the UPDATE_COLUMN
     self.assertEqual(module.df().columns[0], '_1')
     self.assertEqual(module.df().columns[1], '_2')
     prlen = Every(proc=print_len, constant_time=True, scheduler=s)
     prlen.input.df = module.output.df
     s.start()
     self.assertEqual(len(module.df()), 10000000)
     self.assertFalse(module.df()['_1'].isnull().any())
     self.assertFalse(module.df()['_2'].isnull().any())
 def test_mb_k_means(self):
     #log_level()
     s = Scheduler()
     n_clusters = 3
     csv = CSVLoader(get_dataset('cluster:s3'), sep=' ', skipinitialspace=True,
                     header=None, index_col=False, scheduler=s)
     km = MBKMeans(n_clusters=n_clusters, random_state=42, is_input=False, scheduler=s)
     km.input.df = csv.output.df
     pr = Print(scheduler=s)
     pr.input.df = km.output.df
     e = Every(scheduler=s)
     e.input.df = km.output.labels
     s.start()
     self.assertEqual(len(csv.df()), len(km.labels()))
Example #12
 def test_idxmax(self):
     s = Scheduler()
     random = RandomTable(10, rows=10000, throttle=1000, scheduler=s)
     idxmax = IdxMax(scheduler=s)
     idxmax.input.df = random.output.df
     max_ = Max(scheduler=s)  # avoid shadowing the builtin max
     max_.input.df = random.output.df
     pr = Print(scheduler=s)
     pr.input.df = idxmax.output.max
     s.start()
     max1 = last_row(max_.df(), remove_update=True)
     max2 = last_row(idxmax.max(), remove_update=True)
     self.assertTrue((max1 == max2).all())
Example #13
 def test_idxmin(self):
     s = Scheduler()
     random = RandomTable(10, rows=10000, throttle=1000, scheduler=s)
     idxmin = IdxMin(scheduler=s)
     idxmin.input.df = random.output.df
     min_ = Min(scheduler=s)  # avoid shadowing the builtin min
     min_.input.df = random.output.df
     pr = Print(scheduler=s)
     pr.input.df = idxmin.output.min
     s.start()
     min1 = last_row(min_.df(), remove_update=True)
     min2 = last_row(idxmin.min(), remove_update=True)
     self.assertTrue((min1 == min2).all())
Example #14
 def test_repair_min(self) -> None:
     """
     test_repair_min()
     min without deletes/updates
     """
     s = Scheduler()
     random = RandomTable(2, rows=100000, scheduler=s)
     min_ = ScalarMin(name="min_" + str(hash(random)), scheduler=s)
     min_.input[0] = random.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = min_.output.result
     aio.run(s.start())
     res1 = random.table.min()
     res2 = min_.psdict
     self.compare(res1, res2)
Example #15
 def NOtest_vec_distances(self):
     s = Scheduler()
     vec = VECLoader(get_dataset('warlogs'), scheduler=s)
     dis = PairwiseDistances(metric='cosine', scheduler=s)
     dis.input.df = vec.output.df
     dis.input.array = vec.output.array
     cnt = Every(proc=print_len, constant_time=True, scheduler=s)
     cnt.input.df = dis.output.dist
     global times
     times = 0
     s.start()
     df = vec.df()
     computed = dis.dist()
     self.assertEqual(computed.shape[0], len(df))
     truth = pairwise_distances(vec.toarray(), metric=dis._metric)
     self.assertTrue(np.allclose(truth, computed))
Example #16
 def p10s_read_csv(self):
     s = Scheduler()
     module = CSVLoader(RandomBytesIO(cols=30,
                                      size=self.current_step * GIGA),
                        index_col=False,
                        header=None,
                        scheduler=s)
     module.start()
Example #17
 def test_repair_min2(self) -> None:
     """
     test_repair_min2()
     runs with sensitive ids deletion
     """
     s = Scheduler()
     ScalarMin._reset_calls_counter = 0  # type: ignore
     random = RandomTable(2, rows=100000, scheduler=s)
     min_ = ScalarMin(name="min_repair_test2", scheduler=s)
     stirrer = MyStirrer(watched="min_repair_test2", scheduler=s)
     stirrer.input[0] = random.output.result
     min_.input[0] = stirrer.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = min_.output.result
     aio.run(s.start())
     self.assertEqual(ScalarMin._reset_calls_counter, 1)  # type: ignore
     res1 = stirrer.table.min()
     res2 = min_.psdict
     self.compare(res1, res2)
Example #18
 def test_filter3(self) -> None:
     s = Scheduler()
     random = RandomTable(2, rows=100000, scheduler=s)
     stirrer = Stirrer(update_column="_1",
                       update_rows=5,
                       fixed_step_size=100,
                       scheduler=s)
     stirrer.input[0] = random.output.result
     filter_ = FilterMod(expr="_1 > 0.5", scheduler=s)
     filter_.input[0] = stirrer.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = filter_.output.result
     aio.run(s.start())
     tbl = filter_.get_input_slot("table").data()
     idx = tbl.eval("_1>0.5", result_object="index")
     self.assertEqual(filter_.table.index, bitmap(idx))
     df = pd.DataFrame(tbl.to_dict(), index=tbl.index.to_array())
     dfe = df.eval("_1>0.5")
     self.assertEqual(filter_.table.index, bitmap(df.index[dfe]))
 def test_stirrer(self) -> None:
     s = Scheduler()
     random = RandomTable(2, rows=100000, scheduler=s)
     stirrer = Stirrer(
         update_column="_1",
         delete_rows=5,
         update_rows=5,
         fixed_step_size=100,
         scheduler=s,
     )
     stirrer.input[0] = random.output.result
     max_ = Max(name="max_" + str(hash(random)), scheduler=s)
     max_.input[0] = stirrer.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = max_.output.result
     aio.run(s.start())
     res1 = stirrer.table.max()
     res2 = max_.result
     self.compare(res1, res2)
Example #20
    def test_csv_distances(self):
        s = Scheduler()
        vec = CSVLoader(get_dataset('smallfile'), index_col=False, header=None, scheduler=s)
        dis = PairwiseDistances(metric='euclidean', scheduler=s)
        dis.input.df = vec.output.df
        cnt = Every(proc=print_len, constant_time=True, scheduler=s)
        cnt.input.df = dis.output.dist
        global times
        times = 0
        s.start(ten_times)
        df = vec.df()
        computed = dis.dist()
        #self.assertEqual(computed.shape[0], len(df))

        del df[CSVLoader.UPDATE_COLUMN]
        offset = 0
        size = offset + 5000
        truth = pairwise_distances(df.iloc[offset:size], metric=dis._metric)
        dist = computed[offset:size, offset:size]
        self.assertTrue(np.allclose(truth, dist, atol=1e-7))  # reduced tolerance
Example #21
 def test_repair_max3(self) -> None:
     """
     test_repair_max3()
     runs with NON-sensitive ids deletion
     """
     s = Scheduler()
     ScalarMax._reset_calls_counter = 0  # type: ignore
     random = RandomTable(2, rows=100000, scheduler=s)
     max_ = ScalarMax(name="max_repair_test3", scheduler=s)
     stirrer = MyStirrer(watched="max_repair_test3",
                         proc_sensitive=False,
                         scheduler=s)
     stirrer.input[0] = random.output.result
     max_.input[0] = stirrer.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = max_.output.result
     aio.run(s.start())
     self.assertEqual(ScalarMax._reset_calls_counter, 0)  # type: ignore
     res1 = stirrer.table.max()
     res2 = max_.psdict
     self.compare(res1, res2)
 def test_select_delta(self):
     #log_level()
     delta = np.array([0, 0.05])
     points = [[0.0, 0.0], [1.0, 1.0], [1.0, 0.0]]
     s = Scheduler()
     df = pd.DataFrame(points)
     add_to_row = AddToRow(df, scheduler=s)
     def tick_proc(s, run_number):
         if run_number > 100:
             s.stop()
         try:
             add_to_row.from_input({1: delta})
         except Exception as e:
             print('Error: %s' % e)
     q = SelectDelta(delta=0.5, scheduler=s)
     q.input.df = add_to_row.output.df
     prlen = Every(scheduler=s)
     prlen.input.df = q.output.df
     s.start(tick_proc=tick_proc)
     self.assertEqual(len(q.df()), 3)
Example #23
 def test_repair_max5(self) -> None:
     """
     test_repair_max5()
     runs with sensitive ids update (critical)
     """
     s = Scheduler()
     ScalarMax._reset_calls_counter = 0  # type: ignore
     random = RandomTable(2, rows=100000, scheduler=s)
     max_ = ScalarMax(name="max_repair_test4", scheduler=s)
     stirrer = MyStirrer(watched="max_repair_test4",
                         mode="update",
                         value=-9999.0,
                         scheduler=s)
     stirrer.input[0] = random.output.result
     max_.input[0] = stirrer.output.result
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = max_.output.result
     aio.run(s.start())
     self.assertEqual(ScalarMax._reset_calls_counter, 1)  # type: ignore
     res1 = stirrer.table.max()
     res2 = max_.psdict
     self.compare(res1, res2)
Example #24
 def scheduler(self):
     sched = None
     if getenv("NOTHREAD"):
         if not self._output:
             print('[Using non-threaded scheduler]',
                   end=' ',
                   file=sys.stderr)
             self._output = True
         sched = BaseScheduler()
     else:
         sched = Scheduler()
     self._schedulers.append(sched)
     return sched
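A hedged usage sketch for the fixture above: the NOTHREAD variable comes straight from the code, while the way of setting it below is only one illustrative option, not part of the original.

import os

# Hypothetical driver code: forcing the single-threaded BaseScheduler branch
# of the fixture above makes scheduling-dependent failures reproducible.
os.environ["NOTHREAD"] = "1"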
Example #25
    def test_resetter(self) -> None:
        """
        test_resetter()
        """
        s = Scheduler()
        resetter = MyResetter(threshold=30000, scheduler=s)

        def _func(slot: Slot) -> bool:
            return slot.data().get("reset") is True

        score = self._common(0.1, resetter=resetter, resetter_func=_func, scheduler=s)
        print("resetter 30K=>score", score)
        self.assertGreater(score, 0.77)
Example #26
 def test_switch_if_then(self):
     s = Scheduler()
     random = RandomTable(2, rows=100000, scheduler=s)
     stirrer = Stirrer(
         update_column="_1",
         delete_rows=5,
         update_rows=5,
         fixed_step_size=100,
         scheduler=s,
     )
     stirrer.input[0] = random.output.result
     switch = Switch(condition=lambda x: True, scheduler=s)
     switch.input[0] = stirrer.output.result
     max_ = Max(name="max_" + str(hash(random)), scheduler=s)
     max_.input[0] = switch.output.result
     pr_else = Print(proc=self.terse, scheduler=s)
     pr_else.input[0] = switch.output.result_else
     pr = Print(proc=self.terse, scheduler=s)
     pr.input[0] = max_.output.result
     aio.run(s.start())
     res1 = stirrer.result.max()
     res2 = max_.result
     self.compare(res1, res2)
 def test_filter(self):
     s = Scheduler()
     random = RandomTable(2, rows=100000, scheduler=s)
     filter_ = FilterMod(expr='_1 > 0.5', scheduler=s)
     filter_.input.table = random.output.table
     pr = Print(proc=self.terse, scheduler=s)
     pr.input.df = filter_.output.table
     s.start()
     s.join()
     idx = filter_.get_input_slot('table').data().eval(
         '_1>0.5', result_object='index')
     self.assertEqual(filter_._table.selection, bitmap(idx))
    def test_scheduler(self):
        s = Scheduler()
        csv = CSVLoader(get_dataset('bigfile'),
                        name="csv",
                        index_col=False,
                        header=None,
                        scheduler=s)

        #smp = Sample(n=10,scheduler=s)
        #smp.input.df = csv.output.table

        self.assertIs(s["csv"], csv)
        csv.scheduler().start()

        sleep(1)
        self.assertTrue(csv.scheduler().is_running())

        #smp2 = Sample(n=15, scheduler=s)
        #smp2.input.df = csv.output.df

        def add_min():
            m = Min(scheduler=s)
            # Of course, sleeping here is a bad idea; it is only meant to
            # illustrate that add_min is executed atomically by the scheduler.
            # Sleeping outside of the on_tick_once callback would leave the
            # dataflow in an inconsistent state.
            #sleep(1)
            m.input.table = csv.output.table
            prt = Print(proc=self.terse, scheduler=s)
            prt.input.df = m.output.table

        s.on_tick_once(add_min)

        sleep(1)
        #self.assertTrue(s._runorder.index(smp.id) > s._runorder.index(csv.id))
        #self.assertTrue(s._runorder.index(smp2.id) > s._runorder.index(csv.id))
        #self.assertTrue(s._runorder.index(m.id) > s._runorder.index(smp2.id))
        s.stop()
        s.join()
 def test_dummy(self):
     s = Scheduler()
     random = RandomTable(2, rows=100000, scheduler=s)
     dummy_ = DummyMod(update_column='_1',
                       delete_rows=5,
                       update_rows=5,
                       fixed_step_size=100,
                       scheduler=s)
     dummy_.input.table = random.output.table
     max_ = Max(name='max_' + str(hash(random)), scheduler=s)
     max_.input.table = dummy_.output.table
     pr = Print(proc=self.terse, scheduler=s)
     pr.input.df = max_.output.table
     s.start()
     s.join()
    def test_dataflow(self):
        s = Scheduler()
        with Dataflow(s):
            csv = CSVLoader(get_dataset('bigfile'),
                            name="csv",
                            index_col=False,
                            header=None)
            m = Min()
            m.input.table = csv.output.table
            prt = Print(proc=self.terse)
            prt.input.df = m.output.table

        self.assertIs(s["csv"], csv)
        csv.scheduler().start()

        sleep(1)
        self.assertTrue(csv.scheduler().is_running())

        s.stop()
        s.join()
 def p10s_random(self):
     n = self.current_step
     StorageEngine.default = "hdf5"
     s = Scheduler()
     random = RandomTable(10, rows=n * L, scheduler=s)
     s.start()
"""
Clustering datasets may be found at
https://cs.joensuu.fi/sipu/datasets/
"""
from progressivis import Scheduler, Every  # , log_level
from progressivis.cluster import MBKMeans
from progressivis.io import CSVLoader
from progressivis.stats import Min, Max, Histogram2D
from progressivis.vis import Heatmap, ScatterPlot
from progressivis.datasets import get_dataset

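# The try/except below is a recurring idiom in these scripts: inside an
# interactive progressivis session a global 'scheduler' is presumably already
# defined, while standalone runs fall back to a fresh Scheduler.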
try:
    s = scheduler
except NameError:
    s = Scheduler()
    #log_level(package="progressivis.cluster")

data = CSVLoader(get_dataset('cluster:s3'), sep='    ', skipinitialspace=True,
                 header=None, index_col=False, scheduler=s)
mbkmeans = MBKMeans(columns=[0, 1], n_clusters=15, batch_size=100, scheduler=s)
mbkmeans.input.df = data.output.df
prn = Every(scheduler=s)
prn.input.df = mbkmeans.output.df
sp = ScatterPlot(0,1, scheduler=s)
sp.move_point = mbkmeans # for input management
#sp.create_dependent_modules(mbkmeans,'centroids')
# Create modules by hand rather than with the utility.
# We show the cluster centroids on the scatterplot and the
# data as a heatmap

# histogram2d
histogram2d = Histogram2D(0, 1, scheduler=s)
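The excerpt stops right after creating the Histogram2D module. Below is a minimal sketch of the wiring the comment above announces, following the DataFrame-era slot conventions of the other snippets in this collection; the Histogram2D min/max slots, the Heatmap array slot, and the filename pattern are assumptions, not confirmed by this excerpt.

histogram2d.input.df = data.output.df
min_ = Min(scheduler=s)
min_.input.df = data.output.df
max_ = Max(scheduler=s)
max_.input.df = data.output.df
histogram2d.input.min = min_.output.df  # assumed slot name
histogram2d.input.max = max_.output.df  # assumed slot name
# heatmap rendered from the 2D histogram; the centroids stay on the scatterplot
heatmap = Heatmap(filename='heatmap_%d.png', scheduler=s)  # filename is illustrative
heatmap.input.array = histogram2d.output.array  # assumed slot name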
Example #33
def filter_(df):
    lon = df['dropoff_longitude']
    lat = df['dropoff_latitude']
    return df[(lon>-74.10)&(lon<-73.7)&(lat>40.60)&(lat<41)]

def print_len(x):
    if x is not None:
        print(len(x))

#log_level() #package='progressivis.stats.histogram2d')

try:
    s = scheduler
except NameError:
    s = Scheduler()

#PREFIX= 'https://storage.googleapis.com/tlc-trip-data/2015/'
#SUFFIX= ''
PREFIX = '../nyc-taxi/'
SUFFIX = '.bz2'

URLS = [
    PREFIX+'yellow_tripdata_2015-01.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-02.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-03.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-04.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-05.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-06.csv'+SUFFIX,
]
from progressivis import Scheduler, Print
from progressivis.cluster import MBKMeans
from progressivis.stats import RandomTable
from progressivis.vis import MCScatterPlot
import asyncio as aio

try:
    s = scheduler
except NameError:
    s = Scheduler()

table = RandomTable(columns=['a', 'b'], rows=50000, throttle=500, scheduler=s)
mbkmeans = MBKMeans(columns=['a', 'b'],
                    n_clusters=8,
                    batch_size=100,
                    is_input=False,
                    scheduler=s)
mbkmeans.input.table = table.output.table
prn = Print(scheduler=s)
prn.input.df = mbkmeans.output.table
sp = MCScatterPlot(scheduler=s,
                   classes=[('Scatterplot', 'a', 'b')],
                   approximate=True)
sp.create_dependent_modules(mbkmeans, 'table')
sp['Scatterplot'].range_query_2d.hist_index_x.params.init_threshold = 1
sp['Scatterplot'].range_query_2d.hist_index_y.params.init_threshold = 1

if __name__ == '__main__':
    #table.start()
    aio.run(s.start(coros=[aio.sleep(3600)]))
"Test the Histograms visualization module"
from progressivis import Scheduler, Every
from progressivis.stats import RandomTable, Min, Max
from progressivis.vis import Histograms

import numpy as np

#log_level()

try:
    s = scheduler
except NameError:
    print('No scheduler defined, using the standard one')
    s = Scheduler()


def main():
    "Main function"
    csvmod = RandomTable(columns=['a', 'b', 'c'],
                         rows=1000000,
                         random=np.random.randn,
                         throttle=1000,
                         scheduler=s)
    minmod = Min(scheduler=s)
    minmod.input.table = csvmod.output.table
    maxmod = Max(scheduler=s)
    maxmod.input.table = csvmod.output.table
    histograms = Histograms(scheduler=s)
    histograms.input.table = csvmod.output.table
    histograms.input.min = minmod.output.table
    histograms.input.max = maxmod.output.table
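    # The excerpt ends here without running anything. A minimal driver sketch,
    # assuming the same blocking s.start() entry point used by the other
    # scripts in this collection (the return value is hypothetical):
    return histograms


if __name__ == '__main__':
    main()
    s.start()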
bounds_max = {'pickup_latitude': 41.00, 'pickup_longitude': -73.70}

def filter_(df):  # renamed to avoid shadowing the builtin filter
    lon = df['pickup_longitude']
    lat = df['pickup_latitude']
    return df[(lon>-74.10)&(lon<-73.7)&(lat>40.60)&(lat<41)]

def print_len(x):
    if x is not None:
        print(len(x))

#log_level() #package='progressivis.stats.histogram2d')

try:
    s = scheduler
except NameError:
    s = Scheduler()

#PREFIX= 'https://storage.googleapis.com/tlc-trip-data/2015/'
#SUFFIX= ''
PREFIX = '../nyc-taxi/'
SUFFIX = '.bz2'

URLS = [
    PREFIX+'yellow_tripdata_2015-01.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-02.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-03.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-04.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-05.csv'+SUFFIX,
    PREFIX+'yellow_tripdata_2015-06.csv'+SUFFIX,
]
Example #37
 def scheduler(self, clean: bool = False) -> Scheduler:
     if self._scheduler is None or clean:
         self._scheduler = Scheduler()
     return self._scheduler
 def _fun(s: Scheduler, r: int) -> None:
     if r > 10:
         s.task_stop()
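_fun matches the (scheduler, run_number) callback signature that Example #43 below registers with on_loop; a minimal sketch of how it would presumably be hooked up, assuming the aio helper used by the other tests here:

from progressivis.core import aio

s = Scheduler()
# ... wire some modules here; an empty dataflow would terminate immediately ...
s.on_loop(_fun)  # invoked each scheduler loop; stops after run number 10
aio.run(s.start())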
Example #39
 def test_read_csv(self):
     s = Scheduler()
     module = CSVLoader(get_dataset('bigfile'), index_col=False, header=None, scheduler=s)
     self.assertIsNone(module.df())
     s.start()
     self.assertEqual(len(module.df()), 1000000)
"""
Clustering datasets may be found at
https://cs.joensuu.fi/sipu/datasets/
"""
from progressivis import Scheduler, Every  # , log_level
from progressivis.cluster import MBKMeans
from progressivis.io import CSVLoader
from progressivis.vis import ScatterPlot
from progressivis.datasets import get_dataset

try:
    s = scheduler
except NameError:
    s = Scheduler()
    #log_level(package="progressivis.cluster")

data = CSVLoader(get_dataset('cluster:s1'), sep='\\s+', skipinitialspace=True,
                 header=None, index_col=False, scheduler=s)
mbkmeans = MBKMeans(columns=['_0', '_1'], n_clusters=15, batch_size=100, is_input=False, scheduler=s)
mbkmeans.input.table = data.output.table
prn = Every(scheduler=s)
prn.input.df = mbkmeans.output.table
sp = ScatterPlot('_0','_1', scheduler=s)

sp.move_point = mbkmeans # for input management
sp.create_dependent_modules(data,'table', sample=None, select=mbkmeans)

if __name__ == '__main__':
    data.start()
    s.join()
Example #41
from progressivis import Scheduler, Every  #, log_level
from progressivis.cluster import MBKMeans, MBKMeansFilter
from progressivis.io import CSVLoader
from progressivis.vis import MCScatterPlot
from progressivis.datasets import get_dataset
from progressivis.stats import RandomTable
from progressivis.utils.psdict import PsDict
import pandas as pd
import numpy as np
import os.path
import tempfile
from progressivis.datasets.random import generate_random_multivariate_normal_csv as gen_csv
try:
    s = scheduler
except NameError:
    s = Scheduler()
    #log_level(package="progressivis.cluster")

#dir_name = tempfile.mkdtemp(prefix='progressivis_tmp_')
dir_name = os.path.join(tempfile.gettempdir(), 'progressivis_tmp_')
os.makedirs(dir_name, exist_ok=True)
file_name = os.path.join(dir_name, "foobar.csv")
gen_csv(file_name, rows=99999, reset=True)  #, header='_0,_1', reset=False)
data = CSVLoader(file_name,
                 skipinitialspace=True,
                 header=None,
                 index_col=False,
                 scheduler=s)
n_clusters = 3
mbkmeans = MBKMeans(columns=['_0', '_1'],
                    n_clusters=n_clusters,
 def p10s_zarr_random(n):
     StorageEngine.default = "zarr"
     s = Scheduler()
     random = RandomTable(10, rows=n * L, scheduler=s)
     s.start()
Example #43
    def test_scheduler(self) -> None:
        with self.assertRaises(ProgressiveError):
            s = Scheduler(0)
        s = Scheduler()
        csv = CSVLoader(
            get_dataset("bigfile"),
            name="csv",
            index_col=False,
            header=None,
            scheduler=s,
        )
        self.assertIs(s["csv"], csv)
        sink = Sink(name="sink", scheduler=s)
        sink.input.inp = csv.output.result  # allow csv to start
        check_running = False

        async def _is_running() -> None:
            nonlocal check_running
            check_running = csv.scheduler().is_running()

        aio.run_gather(s.start(), _is_running())

        self.assertTrue(check_running)

        def add_min(s: Scheduler, r: int) -> None:
            with s:
                m = Min(scheduler=s)
                m.input.table = csv.output.result
                prt = Print(proc=self.terse, scheduler=s)
                prt.input.df = m.output.result

        s.on_loop(add_min, 10)
        s.on_loop(self._stop, 20)

        self.assertIs(s["csv"], csv)
        json = s.to_json(short=False)
        self.assertFalse(json["is_running"])
        self.assertTrue(json["is_terminated"])
        html = s._repr_html_()
        self.assertTrue(len(html) != 0)
def make_df(n, L):
    s = Scheduler()
    random = RandomTable(10, rows=n * L, scheduler=s)
    s.start()
    #return random
    return pd.DataFrame(random.output.table.output_module.table().to_dict())