Ejemplo n.º 1
0
    def test_mb_k_means(self) -> None:
        """Run MBKMeans on the ``cluster:s3`` dataset and check its labels.

        A CSVLoader feeds an MBKMeans module on a fresh scheduler; a Print
        module is attached to the ``result`` output and an Every module to
        the ``labels`` output. After the scheduler has run, the labels must
        have the same length as the loaded table.

        The test is reported as skipped when fetching the dataset times out.
        """
        s = self.scheduler()
        n_clusters = 3
        try:
            dataset = (get_dataset("cluster:s3"),)
        except TimeoutError:
            # Report a skip: a plain ``return`` would be recorded as a pass
            # even though nothing was actually checked.
            self.skipTest("Cannot download cluster:s3")

        with s:
            csv = CSVLoader(
                dataset,
                sep=" ",
                skipinitialspace=True,
                header=None,
                index_col=False,
                scheduler=s,
            )
            km = MBKMeans(
                n_clusters=n_clusters,
                random_state=42,
                is_input=False,
                is_greedy=False,
                scheduler=s,
            )
            # Let MBKMeans connect itself to the loader instead of wiring
            # ``km.input.table = csv.output.result`` by hand.
            km.create_dependent_modules(csv)
            pr = Print(proc=self.terse, scheduler=s)
            pr.input[0] = km.output.result
            e = Every(proc=self.terse, scheduler=s)
            e.input[0] = km.output.labels
        aio.run(s.start())
        labels = km.labels()
        # Plain assert kept so static type checkers narrow ``labels``.
        assert labels is not None
        self.assertEqual(len(csv.table), len(labels))
Ejemplo n.º 2
0
 def test_mb_k_means(self):
     """Run MBKMeans on the ``cluster:s3`` dataset and check its labels.

     A CSVLoader feeds an MBKMeans module; a Print module is attached to
     the ``df`` output and an Every module to the ``labels`` output. After
     the scheduler has been started, the labels must have the same length
     as the loaded data frame.
     """
     # log_level()  # debugging aid, left disabled
     s = Scheduler()
     n_clusters = 3
     csv = CSVLoader(
         get_dataset('cluster:s3'),
         sep=' ',
         skipinitialspace=True,
         header=None,
         index_col=False,
         scheduler=s,
     )
     km = MBKMeans(n_clusters=n_clusters, random_state=42, is_input=False, scheduler=s)
     km.input.df = csv.output.df
     pr = Print(scheduler=s)
     pr.input.df = km.output.df
     e = Every(scheduler=s)
     e.input.df = km.output.labels
     s.start()
     # assertEqual, not the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(len(csv.df()), len(km.labels()))
Ejemplo n.º 3
0
 def test_mb_k_means(self):
     """Check that MBKMeans labels match the loaded table in length.

     Builds CSVLoader -> MBKMeans on the test scheduler, attaches a Print
     module to the ``table`` output and an Every module to the ``labels``
     output, runs the scheduler to completion and compares both lengths.
     """
     sched = self.scheduler()
     cluster_count = 3

     loader = CSVLoader(
         get_dataset('cluster:s3'),
         sep=' ', skipinitialspace=True, header=None, index_col=False,
         scheduler=sched,
     )
     kmeans = MBKMeans(
         n_clusters=cluster_count, random_state=42, is_input=False,
         scheduler=sched,
     )
     kmeans.input.table = loader.output.table

     printer = Print(proc=self.terse, scheduler=sched)
     printer.input.df = kmeans.output.table
     every = Every(proc=self.terse, scheduler=sched)
     every.input.df = kmeans.output.labels

     sched.start()
     sched.join()

     table_len = len(loader.table())
     labels_len = len(kmeans.labels())
     self.assertEqual(table_len, labels_len)
Ejemplo n.º 4
0
    #log_level(package="progressivis.cluster")

# Work in a fixed directory under the system temp dir rather than a fresh
# tempfile.mkdtemp() one; the directory is created only if it is missing.
dir_name = os.path.join(tempfile.gettempdir(), "progressivis_tmp_")
os.makedirs(dir_name, exist_ok=True)
file_name = os.path.join(dir_name, "foobar.csv")
gen_csv(file_name, rows=99999, reset=True)

# Loader for the generated CSV file, registered on the scheduler `s`.
data = CSVLoader(
    file_name,
    skipinitialspace=True,
    header=None,
    index_col=False,
    scheduler=s,
)

n_clusters = 3
mbkmeans = MBKMeans(
    columns=["_0", "_1"],
    n_clusters=n_clusters,
    batch_size=100,
    tol=0.01,
    is_input=False,
    scheduler=s,
)

# One class description per cluster index, each fed by its own MBKMeansFilter.
classes = []
for i in range(n_clusters):
    cname = f"k{i}"
    filt = MBKMeansFilter(i)
    filt.create_dependent_modules(mbkmeans, data, "table")
    classes.append(
        dict(
            name=cname,
            x_column="_0",
            y_column="_1",
            # Only the first class carries the MBKMeans module as its sample.
            sample=mbkmeans if i == 0 else None,
            input_module=filt,
            input_slot="table",
        )
    )
Ejemplo n.º 5
0
from progressivis import Scheduler, Print
from progressivis.cluster import MBKMeans
from progressivis.stats import RandomTable
from progressivis.vis import MCScatterPlot
import asyncio as aio

# Reuse a `scheduler` already present in the executing namespace; otherwise
# create a private one. Only NameError is caught so that unrelated errors
# (including KeyboardInterrupt) are no longer swallowed by a bare `except`.
try:
    s = scheduler
except NameError:
    s = Scheduler()

# Random two-column table feeding the clustering module.
table = RandomTable(columns=['a', 'b'], rows=50000, throttle=500, scheduler=s)
mbkmeans = MBKMeans(columns=['a', 'b'],
                    n_clusters=8,
                    batch_size=100,
                    is_input=False,
                    scheduler=s)
mbkmeans.input.table = table.output.table

# Print module attached to the MBKMeans `table` output.
prn = Print(scheduler=s)
prn.input.df = mbkmeans.output.table

# Scatter plot with a single class named 'Scatterplot' over columns a and b.
sp = MCScatterPlot(scheduler=s,
                   classes=[('Scatterplot', 'a', 'b')],
                   approximate=True)
sp.create_dependent_modules(mbkmeans, 'table')
sp['Scatterplot'].range_query_2d.hist_index_x.params.init_threshold = 1
sp['Scatterplot'].range_query_2d.hist_index_y.params.init_threshold = 1

if __name__ == '__main__':
    # NOTE(review): the extra one-hour sleep coroutine presumably keeps the
    # scheduler running once the table is exhausted — confirm.
    aio.run(s.start(coros=[aio.sleep(3600)]))
Ejemplo n.º 6
0
"""
Clustering datasets may be found at
https://cs.joensuu.fi/sipu/datasets/
"""
from progressivis import Scheduler, Every#, log_level
from progressivis.cluster import MBKMeans
from progressivis.io import CSVLoader
from progressivis.vis import ScatterPlot
from progressivis.datasets import get_dataset

# Pick up a `scheduler` defined by the executing namespace, or fall back to
# a new Scheduler when the name does not exist.
try:
    s = scheduler
except NameError:
    s = Scheduler()
    # log_level(package="progressivis.cluster")

# Loader for the 'cluster:s1' dataset: whitespace-separated, no header row.
data = CSVLoader(
    get_dataset("cluster:s1"),
    sep="\\s+",
    skipinitialspace=True,
    header=None,
    index_col=False,
    scheduler=s,
)

# Mini-batch k-means with 15 clusters on the two columns '_0' and '_1'.
mbkmeans = MBKMeans(
    columns=["_0", "_1"],
    n_clusters=15,
    batch_size=100,
    is_input=False,
    scheduler=s,
)
mbkmeans.input.table = data.output.table

# Every module attached to the MBKMeans `table` output.
prn = Every(scheduler=s)
prn.input.df = mbkmeans.output.table

sp = ScatterPlot("_0", "_1", scheduler=s)
sp.move_point = mbkmeans  # for input management
sp.create_dependent_modules(data, "table", sample=None, select=mbkmeans)

if __name__ == "__main__":
    data.start()
    s.join()