Example #1
 def _tst_10_read_multi_csv_file_compress_with_crash(
         self, file_list: List[str], tag: str) -> None:
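     # Stop the first run after 4 s to simulate a crash, then resume on a clean scheduler with recovery=True and expect both files in full.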
     s = self.scheduler()
     module = CSVLoader(file_list,
                        index_col=False,
                        recovery_tag=tag,
                        header=None,
                        scheduler=s)
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     sts = sleep_then_stop(s, 4)
     aio.run_gather(s.start(), sts)
     _close(module)
     s = self.scheduler(clean=True)
     module = CSVLoader(
         file_list,
         recovery=True,
         recovery_tag=tag,
         index_col=False,
         header=None,
         scheduler=s,
     )
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     aio.run(s.start())
     self.assertEqual(len(module.table), 2000000)
Example #2
 def test_06_read_http_multi_csv_bz2_with_crash(self) -> None:
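     # Stream two bz2 CSVs over HTTP, stop after 3 s, restart the server, then resume with recovery=True.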
     self._http_srv = _HttpSrv()
     tag = self.get_tag()
     s = self.scheduler()
     url_list = [make_url("bigfile", ext=BZ2)] * 2
     module = CSVLoader(url_list,
                        index_col=False,
                        recovery_tag=tag,
                        header=None,
                        scheduler=s)
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     sts = sleep_then_stop(s, 3)
     aio.run_gather(s.start(), sts)
     self._http_srv.restart()
     s = self.scheduler(clean=True)
     module = CSVLoader(
         url_list,
         recovery=True,
         recovery_tag=tag,
         index_col=False,
         header=None,
         scheduler=s,
     )
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     aio.run(s.start())
     self.assertEqual(len(module.table), 2000000)
Example #3
 def test_01_read_http_csv_with_crash_and_counter(self) -> None:
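     # Crash an HTTP load after 2 s, then resume with recovery=True through a Counter and check both the table and the counter value.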
     self._http_srv = _HttpSrv()
     tag = self.get_tag()
     s = self.scheduler()
     url = make_url("bigfile")
     module = CSVLoader(url,
                        index_col=False,
                        recovery_tag=tag,
                        header=None,
                        scheduler=s)
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     sts = sleep_then_stop(s, 2)
     aio.run_gather(s.start(), sts)
     self._http_srv.restart()
     s = self.scheduler(clean=True)
     csv = CSVLoader(
         url,
         recovery=True,
         index_col=False,
         recovery_tag=tag,
         header=None,
         scheduler=s,
     )
     counter = Counter(scheduler=s)
     counter.input[0] = csv.output.result
     self.assertTrue(csv.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = counter.output.result
     aio.run(s.start())
     self.assertEqual(len(csv.table), 1000000)
     self.assertEqual(counter.table["counter"].loc[0], 1000000)
Example #4
 def test_09_read_multi_csv_file_with_crash(self) -> None:
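     # Interrupt a two-file load after 3 s, then resume it on a clean scheduler with recovery=True.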
     s = self.scheduler()
     tag = "t9"
     file_list = [get_dataset("bigfile"), get_dataset("bigfile")]
     module = CSVLoader(file_list,
                        index_col=False,
                        recovery_tag=tag,
                        header=None,
                        scheduler=s)
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     sts = sleep_then_stop(s, 3)
     aio.run_gather(s.start(), sts)
     _close(module)
     s = self.scheduler(clean=True)
     module = CSVLoader(
         file_list,
         recovery=True,
         recovery_tag=tag,
         index_col=False,
         header=None,
         scheduler=s,
     )
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     aio.run(s.start())
     self.assertEqual(len(module.table), 2000000)
Example #5
    def test_pattern(self) -> None:
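        # Feed a BlobsTable into a StatsFactory and drive its dynamic variable via from_input (matrix_hist and my_stop are defined elsewhere in the test module).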
        s = self.scheduler()
        n_samples = 1_000
        centers = [(0.1, 0.3, 0.5), (0.7, 0.5, 3.3), (-0.4, -0.3, -11.1)]
        cols = ["A", "B", "C"]
        with s:
            data = BlobsTable(
                columns=cols,
                centers=centers,
                cluster_std=0.2,
                rows=n_samples,
                scheduler=s,
            )
            # ds = DataShape(scheduler=s)
            # ds.input.table = data.output.result
            factory = StatsFactory(input_module=data, scheduler=s)
            factory.create_dependent_modules(var_name="my_dyn_var")
            factory.input.table = data.output.result
            sink = Sink(scheduler=s)
            # sink.input.inp = ds.output.result
            sink.input.inp = factory.output.result

        async def fake_input_1(scheduler: Scheduler, rn: int) -> None:
            module = scheduler["my_dyn_var"]
            print("from input my_dyn_var")
            await module.from_input({"matrix": matrix_hist})

        s.on_loop(my_stop, 4)
        s.on_loop(fake_input_1, 3)
        aio.run(s.start())
Example #6
 def test_06_read_multiple_csv_bz2_crash_recovery(self) -> None:
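     # Load two bz2 CSVs from a throttled local HTTP server with a very short timeout to exercise crash recovery.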
     p = Process(target=run_throttled_server, args=(8000, 10**6))
     p.start()
     self._http_proc = p
     time.sleep(SLEEP)
     s = self.scheduler()
     filenames = Table(
         name="file_names",
         dshape="{filename: string}",
         data={
             "filename": [
                 make_url("smallfile", ext=BZ2),
                 make_url("smallfile", ext=BZ2),
             ]
         },
     )
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False,
                     header=None,
                     scheduler=s,
                     timeout=0.01)
     csv.input.filenames = cst.output.result
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = csv.output.result
     aio.run(csv.start())
     _close(csv)
     self.assertEqual(len(csv.table), 60000)
Example #7
 def _tst_08_read_multi_csv_file_compress_no_crash(
         self, files: List[str]) -> None:
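     # Straight multi-file load of compressed CSVs, no crash: the two files together yield 60,000 rows.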
     s = self.scheduler()
     module = CSVLoader(files, index_col=False, header=None,
                        scheduler=s)  # , save_context=False)
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     aio.run(s.start())
     self.assertEqual(len(module.table), 60000)
Example #8
 def test_read_csv(self) -> None:
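     # Baseline: load a single large CSV to completion and check the row count.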
     s = self.scheduler()
     module = CSVLoader(get_dataset("bigfile"),
                        index_col=False,
                        header=None,
                        scheduler=s)
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     aio.run(s.start())
     self.assertEqual(len(module.table), 1000000)
Example #9
    def test_dataflow_6_dynamic(self) -> None:
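        # Delete modules from a live dataflow step by step, checking collateral_damage() before each deletion.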
        s = self.scheduler()
        table = RandomTable(name="table",
                            columns=["a"],
                            throttle=1000,
                            scheduler=s)
        sink = Sink(name="sink", scheduler=s)
        sink.input.inp = table.output.result
        prt = Print(name="prt", proc=self.terse, scheduler=s)
        prt.input.df = table.output.result
        prt2 = Print(name="prt2", proc=self.terse, scheduler=s)
        prt2.input.df = table.output.result
        s.commit()

        async def modify_1(scheduler: Scheduler, run_number: int) -> None:
            with s as dataflow:
                print("Checking module deletion")
                self.assertTrue(isinstance(dataflow, Dataflow))
                deps = dataflow.collateral_damage("prt2")
                self.assertEqual(deps, set(["prt2"]))
                deps = dataflow.collateral_damage("prt")
                self.assertEqual(deps, set(["prt"]))
                deps = dataflow.collateral_damage("prt", "prt2")
                self.assertEqual(deps, set(["prt", "prt2"]))
                dataflow.delete_modules("prt2")
            s.on_loop(modify_2, 5)

        async def modify_2(scheduler: Scheduler, run_number: Any) -> None:
            self.assertFalse("prt2" in scheduler)
            with s as dataflow:
                print("Checking more module deletion")
                deps = dataflow.collateral_damage("prt")
                self.assertEqual(deps, {"prt"})
                deps = dataflow.collateral_damage("prt", "sink")
                self.assertEqual(deps, {"prt", "sink", "table"})
                dataflow.delete_modules("prt")
            s.on_loop(modify_3, 5)

        async def modify_3(scheduler: Scheduler, run_number: int) -> None:
            self.assertFalse("prt" in scheduler)
            with s as dataflow:
                print("Checking even more module deletion")
                deps = dataflow.collateral_damage("sink")
                self.assertEqual(deps, {"sink", "table"})
                dataflow.delete_modules("sink", "table")

        async def stop_error(scheduler: Scheduler, run_number: int) -> None:
            self.assertFalse("Scheduler should have stopped")
            await scheduler.stop()

        s.on_loop(modify_1, 5)
        s.on_loop(stop_error, 100)
        aio.run(s.start())
Example #10
    def test_dataflow_9_errors(self) -> None:
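        # Leaving the dataflow context with an unconnected Print module must raise ProgressiveError and discard the module.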
        s = self.scheduler()
        table = RandomTable(name="table",
                            columns=["a", "b", "c"],
                            throttle=1000,
                            scheduler=s)
        sink = Sink(name="sink", scheduler=s)
        sink.input.inp = table.output.result
        s.commit()

        # Start loading a dataset, then visualize it, then change the visualizations
        async def modify_1(scheduler: Scheduler, run_number: int) -> None:
            print("Adding scatterplot_1")
            with scheduler as dataflow:
                dataflow1 = dataflow
                sp = MCScatterPlot(
                    name="scatterplot_1",
                    classes=[("Scatterplot", "a", "b")],
                    approximate=True,
                    scheduler=scheduler,
                )
                sp.create_dependent_modules(table, "result")
                print(f"Created scatterplot_1, groups: {dataflow.groups()}")

            with self.assertRaises(ProgressiveError):
                with scheduler as dataflow:
                    self.assertIs(dataflow, dataflow1)
                    prt = Print(name="print",
                                proc=self.terse,
                                scheduler=scheduler)
                    # prt.input.df = table.output.result
                    _ = prt
            scheduler.on_loop(modify_2, 3)  # Schedule the next activity

        async def modify_2(scheduler: Scheduler, run_number: int) -> None:
            print("Removing table")
            self.assertFalse("scatterplot_1" in scheduler)
            with scheduler as dataflow:
                print("Checking sink+table modules deletion")
                deps = dataflow.collateral_damage("sink", "print")
                print(f"collateral_damage('sink') = '{sorted(deps)}'")
                dataflow.delete_modules(*deps)

        async def stop_error(scheduler: Scheduler, run_number: int) -> None:
            self.assertFalse("Scheduler should have stopped")
            await scheduler.stop()

        s.on_loop(modify_1, 3)
        s.on_loop(stop_error, 10)
        aio.run(s.start())
        self.assertFalse("scatterplot_1" in s)
        self.assertFalse("print" in s)
Example #11
 def test_read_fake_csv(self) -> None:
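     # Load synthetic CSV data from an in-memory RandomBytesIO buffer instead of a file.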
     s = self.scheduler()
     module = CSVLoader(
         RandomBytesIO(cols=30, rows=1000000),
         index_col=False,
         header=None,
         scheduler=s,
     )
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     aio.run(s.start())
     self.assertEqual(len(module.table), 1000000)
Example #12
 def test_04_read_http_multi_csv_bz2_no_crash(self) -> None:
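     # Load the same bz2 URL twice over HTTP, end to end, with no crash.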
     self._http_srv = _HttpSrv()
     s = self.scheduler()
     module = CSVLoader(
         [make_url("smallfile", ext=BZ2)] * 2,
         index_col=False,
         header=None,
         scheduler=s,
     )
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     aio.run(s.start())
     self.assertEqual(len(module.table), 60000)
Example #13
 def test_07_read_multi_csv_file_no_crash(self) -> None:
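     # Load the same on-disk file twice with no crash.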
     s = self.scheduler()
     module = CSVLoader(
         [get_dataset("smallfile"),
          get_dataset("smallfile")],
         index_col=False,
         header=None,
         scheduler=s,
     )
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     aio.run(s.start())
     self.assertEqual(len(module.table), 60000)
Example #14
 def test_01_read_http_csv_with_crash(self) -> None:
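     # Crash an HTTP load after 2 s, resume with recovery=True, then compare the recovered column against the reference BIGFILE_DF.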
     self._http_srv = _HttpSrv()
     tag = self.get_tag()
     s = self.scheduler()
     url = make_url("bigfile")
     module = CSVLoader(url,
                        index_col=False,
                        recovery_tag=tag,
                        header=None,
                        scheduler=s)
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     sts = sleep_then_stop(s, 2)
     aio.run_gather(s.start(), sts)
     self._http_srv.restart()
     s = self.scheduler(clean=True)
     module = CSVLoader(
         url,
         recovery=True,
         recovery_tag=tag,
         index_col=False,
         header=None,
         scheduler=s,
     )
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     aio.run(s.start())
     self.assertEqual(len(module.table), 1000000)
     col = module.table.loc[:, 0]
     assert col is not None
     arr1 = col.to_array().reshape(-1)
     arr2 = BIGFILE_DF.loc[:, 0].values
     self.assertTrue(np.allclose(arr1, arr2))
Example #15
 def test_01_read_http_csv_no_crash(self) -> None:
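     # Serve the file from a local HTTP server subprocess and load it end to end without interruption.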
     p = Process(target=run_simple_server, args=())
     p.start()
     self._http_proc = p
     time.sleep(SLEEP)
     s = self.scheduler()
     module = CSVLoader(make_url("bigfile"),
                        index_col=False,
                        header=None,
                        scheduler=s)
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     aio.run(s.start())
     _close(module)
     self.assertEqual(len(module.table), 1000000)
Example #16
 def test_as_array(self) -> None:
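     # as_array="array" packs all 30 input columns into a single array column.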
     s = self.scheduler()
     module = CSVLoader(
         get_dataset("bigfile"),
         index_col=False,
         as_array="array",
         header=None,
         scheduler=s,
     )
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     self.assertTrue(module.result is None)
     aio.run(s.start())
     table = module.table
     self.assertEqual(len(table), 1000000)
     self.assertEqual(table.columns, ["array"])
     self.assertEqual(table["array"].shape, (1000000, 30))
Example #17
 def test_read_multiple_csv(self) -> None:
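     # Feed the file names through a Constant table wired to the loader's "filenames" input slot.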
     s = self.scheduler()
     filenames = Table(
         name="file_names",
         dshape="{filename: string}",
         data={
             "filename":
             [get_dataset("smallfile"),
              get_dataset("smallfile")]
         },
     )
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s)
     csv.input.filenames = cst.output.result
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = csv.output.result
     aio.run(csv.start())
     self.assertEqual(len(csv.table), 60000)
Example #18
 def test_02_read_http_csv_crash_recovery(self) -> None:
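     # A throttled server and a very short timeout make the loader hit timeouts and recover while still reading every row.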
     p = Process(target=run_throttled_server, args=(8000, 10**7))
     p.start()
     self._http_proc = p
     time.sleep(SLEEP)
     s = self.scheduler()
     module = CSVLoader(make_url("bigfile"),
                        index_col=False,
                        header=None,
                        scheduler=s,
                        timeout=0.01)
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     aio.run(s.start())
     _close(module)
     # self.assertGreater(module.parser._recovery_cnt, 0)
     self.assertEqual(len(module.table), 1000000)
Example #19
 def test_read_multiple_fake_csv(self) -> None:
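     # Same "filenames" slot wiring as above, but with two synthetic buffer:// sources.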
     s = self.scheduler()
     filenames = Table(
         name="file_names2",
         dshape="{filename: string}",
         data={
             "filename": [
                 "buffer://fake1?cols=10&rows=30000",
                 "buffer://fake2?cols=10&rows=30000",
             ]
         },
     )
     cst = Constant(table=filenames, scheduler=s)
     csv = CSVLoader(index_col=False, header=None, scheduler=s)
     csv.input.filenames = cst.output.result
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = csv.output.result
     aio.run(csv.start())
     self.assertEqual(len(csv.table), 60000)
Example #20
    def test_sf(self) -> None:
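        # Wire a StatsFactory to a RandomTable and drive its dynamic variable via from_input (matrix_max and my_stop are defined elsewhere in the test module).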
        np.random.seed(42)
        s = self.scheduler()
        random = RandomTable(3, rows=10_000, scheduler=s)
        sf = StatsFactory(input_module=random, scheduler=s)
        sf.create_dependent_modules(var_name="my_dyn_var")
        sf.input.table = random.output.result
        sink = Sink(scheduler=s)
        # sink.input.inp = random.output.result
        sink.input.inp = sf.output.result

        async def fake_input_1(scheduler: Scheduler, rn: int) -> None:
            module = scheduler["my_dyn_var"]
            print("from input my_dyn_var", "test_sf")
            await module.from_input({"matrix": matrix_max})

        s.on_loop(my_stop, 4)
        s.on_loop(fake_input_1, 3)
        aio.run(s.start())
        print(s.modules())
Example #21
 def test_as_array2(self) -> None:
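     # A dict-valued as_array splits the 30 columns into two array columns, 13 and 17 wide.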
     s = self.scheduler()
     module = CSVLoader(
         get_dataset("bigfile"),
         index_col=False,
         as_array={
             "firsthalf": ["_" + str(r) for r in range(13)],
             "secondhalf": ["_" + str(r) for r in range(13, 30)],
         },
         header=None,
         scheduler=s,
     )
     sink = Sink(name="sink", scheduler=s)
     sink.input.inp = module.output.result
     self.assertTrue(module.result is None)
     aio.run(s.start())
     table = module.table
     self.assertEqual(len(table), 1000000)
     self.assertEqual(table.columns, ["firsthalf", "secondhalf"])
     self.assertEqual(table["firsthalf"].shape, (1000000, 13))
     self.assertEqual(table["secondhalf"].shape, (1000000, 17))
Example #22
    def test_scheduler(self) -> None:
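        # Scheduler(0) must raise; then run a CSV load, check is_running() concurrently, and inspect the JSON/HTML reports after termination.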
        with self.assertRaises(ProgressiveError):
            s = Scheduler(0)
        s = Scheduler()
        csv = CSVLoader(
            get_dataset("bigfile"),
            name="csv",
            index_col=False,
            header=None,
            scheduler=s,
        )
        self.assertIs(s["csv"], csv)
        sink = Sink(name="sink", scheduler=s)
        sink.input.inp = csv.output.result  # allow csv to start
        check_running = False

        async def _is_running() -> None:
            nonlocal check_running
            check_running = csv.scheduler().is_running()

        aio.run_gather(s.start(), _is_running())

        self.assertTrue(check_running)

        def add_min(s: Scheduler, r: int) -> None:
            with s:
                m = Min(scheduler=s)
                m.input.table = csv.output.result
                prt = Print(proc=self.terse, scheduler=s)
                prt.input.df = m.output.result

        s.on_loop(add_min, 10)
        s.on_loop(self._stop, 20)

        self.assertIs(s["csv"], csv)
        json = s.to_json(short=False)
        self.assertFalse(json["is_running"])
        self.assertTrue(json["is_terminated"])
        html = s._repr_html_()
        self.assertTrue(len(html) != 0)
Example #23
 def test_as_array3(self) -> None:
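     # A callable as_array computes the column grouping from the column names; the test bails out if mnist_784 cannot be downloaded.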
     s = self.scheduler()
     try:
         module = CSVLoader(
             get_dataset("mnist_784"),
             index_col=False,
             as_array=lambda cols: {"array": [c for c in cols if c != "class"]},
             scheduler=s,
         )
         sink = Sink(name="sink", scheduler=s)
         sink.input.inp = module.output.result
         self.assertTrue(module.result is None)
         aio.run(s.start())
         table = module.table
         self.assertEqual(len(table), 70000)
         self.assertEqual(table.columns, ["array", "class"])
         self.assertEqual(table["array"].shape, (70000, 784))
         self.assertEqual(table["class"].shape, (70000, ))
     except TimeoutError:
         print("Cannot download mnist")
Example #24
 def load_csv(self) -> None:
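     # Load a small CSV, then cross-check every column against pandas.read_csv on the same file.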
     module = CSVLoader(
         filepath_or_buffer=get_dataset("smallfile"),
         force_valid_ids=True,
         index_col=False,
         header=None,
         scheduler=self.scheduler_,
     )
     self.assertTrue(module.result is None)
     sink = Sink(name="sink", scheduler=self.scheduler_)
     sink.input.inp = module.output.result
     aio.run(self.scheduler_.start(persist=True))
     t = module.table
     self.assertFalse(t is None)
     self.assertEqual(len(t), 30000)
     df = pd.read_csv(filepath_or_buffer=get_dataset("smallfile"),
                      index_col=False,
                      header=None)
     for col in range(t.ncol):
         coldf = df[col]
         colt = t[col]
         self.assertTrue(np.all(coldf == colt.values))
Example #25
    def test_dataflow_0(self) -> None:
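        # Exercise the dataflow context manager: validate partially wired modules, check module ordering, then delete and re-add the Print module.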
        scheduler = self.scheduler()
        saved_inputs = None
        saved_outputs = None
        with scheduler as dataflow:
            csv = CSVLoader(
                get_dataset("smallfile"),
                name="csv",
                index_col=False,
                header=None,
                scheduler=scheduler,
            )
            self.assertIs(scheduler["csv"], csv)
            self.assertEqual(
                dataflow.validate_module(csv),
                ['Output slot "result" missing in module "csv"'],
            )

            m = Min(name="min", scheduler=scheduler)
            self.assertIs(dataflow[m.name], m)
            self.assertEqual(
                dataflow.validate_module(m),
                [
                    'Input slot "table" missing in module "min"',
                    'Output slot "result" missing in module "min"',
                ],
            )

            prt = Print(proc=self.terse, name="print", scheduler=scheduler)
            self.assertIs(dataflow[prt.name], prt)
            self.assertEqual(
                dataflow.validate_module(prt),
                ['Input slot "df" missing in module "print"'],
            )

            m.input.table = csv.output.result
            prt.input.df = m.output.result

            self.assertEqual(len(dataflow), 3)
            self.assertEqual(dataflow.dir(), ["csv", "min", "print"])
            errors = dataflow.validate()
            self.assertEqual(errors, [])
            deps = dataflow.order_modules()
            self.assertEqual(deps, ["csv", m.name, prt.name])
            saved_inputs = dataflow.inputs
            saved_outputs = dataflow.outputs
            # dataflow.__exit__() is called here
        # print('Old modules:', end=' ')
        # pprint(scheduler._modules)
        # scheduler._update_modules()  # force modules in the main loop
        # print('New modules:', end=' ')
        # pprint(scheduler.modules())

        with scheduler as dataflow:
            # nothing should change when nothing is modified in dataflow
            self.assertEqual(len(dataflow), 3)
            deps = dataflow.order_modules()
            self.assertEqual(deps, ["csv", m.name, prt.name])
            self.assertEqual(dataflow.inputs, saved_inputs)
            self.assertEqual(dataflow.outputs, saved_outputs)
        # scheduler._update_modules()  # force modules in the main loop

        with scheduler as dataflow:
            sink = Sink(name="sink", scheduler=scheduler)
            sink.input.inp = m.output.result
            dataflow.delete_modules(prt)
            self.assertEqual(len(dataflow), 3)
            deps = dataflow.order_modules()
            self.assertEqual(deps, ["csv", m.name, "sink"])
            # pprint(dataflow.inputs)
            # pprint(dataflow.outputs)
        # print('Old modules:')
        # pprint(scheduler._new_modules)
        # scheduler._update_modules()  # force modules in the main loop
        # print('New modules:')
        # pprint(scheduler.modules())
        with scheduler as dataflow:
            self.assertEqual(len(dataflow), 3)
            deps = dataflow.order_modules()
            self.assertEqual(deps, ["csv", m.name, "sink"])
            prt = Print(proc=self.terse, name="print", scheduler=scheduler)
            self.assertIs(dataflow[prt.name], prt)
            self.assertEqual(
                dataflow.validate_module(prt),
                ['Input slot "df" missing in module "print"'],
            )

            prt.input.df = m.output.result
Example #26
    def test_dataflow_7_dynamic(self) -> None:
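        # Add and remove MCScatterPlot visualizations on a live scheduler, using collateral_damage() to collect each one's dependents.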
        s = self.scheduler()
        table = RandomTable(name="table",
                            columns=["a", "b", "c"],
                            throttle=1000,
                            scheduler=s)
        sink = Sink(name="sink", scheduler=s)
        sink.input.inp = table.output.result
        s.commit()

        # Start loading a dataset, then visualize it, then change the visualizations
        async def modify_1(scheduler: Scheduler, run_number: int) -> None:
            print("Adding scatterplot_1")
            with scheduler as dataflow:
                sp = MCScatterPlot(
                    name="scatterplot_1",
                    classes=[("Scatterplot", "a", "b")],
                    approximate=True,
                    scheduler=scheduler,
                )
                sp.create_dependent_modules(table, "result")
                print(f"Created scatterplot_1, groups: {dataflow.groups()}")
            scheduler.on_loop(modify_2, 10)  # Schedule the next activity

        async def modify_2(scheduler: Scheduler, run_number: int) -> None:
            print("Removing scatterplot_1")
            self.assertTrue("scatterplot_1" in scheduler)
            with scheduler as dataflow:
                print("Checking scatterplot_1 module deletion")
                deps = dataflow.collateral_damage("scatterplot_1")
                print(f"collateral_damage('scatterplot_1') = '{sorted(deps)}'")
                dataflow.delete_modules(*deps)
            scheduler.on_loop(modify_3, 10)

        async def modify_3(scheduler: Scheduler, run_number: int) -> None:
            print("Adding scatterplot_2")
            self.assertFalse("scatterplot_1" in scheduler)
            with scheduler:
                sp = MCScatterPlot(
                    name="scatterplot_2",
                    classes=[("Scatterplot", "a", "c")],
                    approximate=True,
                    scheduler=scheduler,
                )
                sp.create_dependent_modules(table, "result")
            scheduler.on_loop(modify_4, 10)  # Schedule the next activity

        async def modify_4(scheduler: Scheduler, run_number: int) -> None:
            print("Removing scatterplot_2")
            self.assertFalse("scatterplot_1" in scheduler)
            self.assertTrue("scatterplot_2" in scheduler)
            with scheduler as dataflow:
                print("Checking scatterplot module deletion")
                print("Checking scatterplot_2 module addition")
                deps = dataflow.collateral_damage("scatterplot_2")
                print(f"collateral_damage('scatterplot_2') = '{sorted(deps)}'")
                dataflow.delete_modules(*deps)
            s.on_loop(modify_5, 5)

        async def modify_5(scheduler: Scheduler, run_number: int) -> None:
            print("Removing table")
            self.assertFalse("scatterplot_1" in scheduler)
            self.assertFalse("scatterplot_2" in scheduler)
            with scheduler as dataflow:
                print("Checking sink+table modules deletion")
                deps = dataflow.collateral_damage("sink")
                print(f"collateral_damage('sink') = '{sorted(deps)}'")
                dataflow.delete_modules(*deps)

        async def stop_error(scheduler: Scheduler, run_number: int) -> None:
            self.assertFalse("Scheduler should have stopped")
            await scheduler.stop()

        s.on_loop(modify_1, 10)
        s.on_loop(stop_error, 100)
        aio.run(s.start())
        self.assertFalse("scatterplot_1" in s)
        self.assertFalse("scatterplot_2" in s)