Example #1
    def test_deferred_chain(self):
        """
        one command computes a string that gets passed to another one

        this is analogous to

            run1=$(ssh localhost echo from-first-run)
            final=$(ssh localhost echo ${run1})

        the 'final' variable is only needed
        to check that everything went well
        """

        s = Scheduler()
        env = Variables()

        n = SshNode(localhostname(), username=localuser())
        Sequence(SshJob(n,
                        commands=Run("echo from-first-run",
                                     capture=Capture('run1', env))),
                 SshJob(n,
                        commands=Run(Deferred("echo {{run1}}", env),
                                     capture=Capture('final', env))),
                 scheduler=s)

        s.run()

        #print(f"env={env}")
        obtained = env.final
        expected = "from-first-run"
        self.assertEqual(obtained, expected)
Example #2
    def test_deferred_service(self):
        """
        a service can be defined from a deferred instance
        rather than a plain string
        """
        s = Scheduler()
        env = Variables()
        echo_service = Service(Deferred("echo {{run1}}", env),
                               service_id='echo',
                               verbose=True)

        n = SshNode(localhostname(), username=localuser())
        Sequence(SshJob(n,
                        commands=Run("echo from-first-run",
                                     capture=Capture('run1', env))),
                 SshJob(n, commands=Run(echo_service.start_command())),
                 SshJob(n,
                        commands=Run(echo_service.journal_command(
                            since="10 second ago"),
                                     capture=Capture('journal', env))),
                 scheduler=s)
        print('STARTING', 20 * '-', echo_service.start_command())
        s.run()
        print('DONE', 20 * '-', echo_service.start_command())

        #print(f"env={env}")
        obtained = env.journal
        expected = "from-first-run"
        found = expected in obtained
        self.assertTrue(found)
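For reference, a Service with service_id='echo' essentially wraps its command into systemd-run / journalctl invocations. The sketch below shows hand-written equivalents of the two commands used above; it only illustrates the idea and is not the exact strings that apssh's Service produces.

# hand-rolled equivalents of start_command() / journal_command() - illustrative only
service_id = "echo"
command = "echo from-first-run"
start_command = f"systemd-run --unit={service_id} {command}"
journal_command = f"journalctl --unit={service_id} --since '10 second ago'"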
Example #3
    def check_expansion(self, *deferred_expected_s):
        s = Scheduler()
        formatters = {}
        for deferred, _ in deferred_expected_s:
            formatters[deferred] = f = CaptureFormatter()
            f.start_capture()
            n = SshNode(localhostname(), username=localuser(), formatter=f)
            s.add(SshJob(node=n, commands=Run(deferred)))
        s.run()
        for deferred, expected in deferred_expected_s:
            captured = formatters[deferred].get_capture()
            self.assertEqual(captured, expected)
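A possible call site for this helper, assuming a Variables instance can be populated by direct attribute assignment (the values below are hypothetical):

env = Variables()
env.run1 = "from-first-run"     # assumption: plain attribute assignment works
self.check_expansion(
    # each positional argument is a (deferred_command, expected_output) pair
    (Deferred("echo {{run1}}", env), "from-first-run\n"),
)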
Example #4
    def hop1(self, hostname='localhost', username=None,
             *, c1, commands, s_command='echo hop1-{}-{}',
             nested_sched=(0, 1)):
        """
        create
          * <c1> connections to one node 1 hop away
          * and on each <commands> commands

        check current number of connections
        """
        if username is None:
            username = localuser()

        verbose(f"creating {c1} hop1-connections - "
                f"{commands} commands per conn - "
                f" to {username}@{hostname}")
        scheduler = Scheduler()
        nodes = []
        jobs = []
        for n in range(c1):
            node1 = SshNode(hostname, username=username,
                            formatter=ColonFormatter(verbose=False))
            nodes.append(node1)
            for c in range(commands):
                jobs.append(SshJob(node=node1,
                                   command=s_command.format(n, c),
                                   ))
        scheduler = self.populate_sched(scheduler, jobs,
                                        nested=nested_sched[0],
                                        pack_job=nested_sched[1])
        expected = c1
        # record base status
        in0, out0 = in_out_connections()
        verbose(f"INITIAL count in={in0} out={out0}")
        scheduler.export_as_pngfile("debug")
        topology_as_pngfile(scheduler, "topology")
        scheduler.run()

        in1, out1 = in_out_connections()
        verbose(f"AFTER RUN in={in1} out={out1}")
        self.assertEqual(in1-in0, expected)
        self.assertEqual(out1-out0, expected)
        arg = nodes

        # cleanup
        close_ssh_in_scheduler(scheduler)

        in1, out1 = in_out_connections()
        verbose(f"AFTER CLEANUP in={in1} out={out1}")
        self.assertEqual(in1-in0, 0)
        self.assertEqual(out1-out0, 0)
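The connection-counting tests rely on an in_out_connections() helper whose code is not shown here. A minimal sketch, assuming it simply counts established TCP connections to and from the ssh port (e.g. with psutil), could look like this:

import psutil

def in_out_connections(ssh_port=22):
    # count established connections that terminate on (in) or point to (out)
    # the ssh port - a sketch, not the helper's actual code
    ins = outs = 0
    for conn in psutil.net_connections(kind='tcp'):
        if conn.status != psutil.CONN_ESTABLISHED:
            continue
        if conn.laddr and conn.laddr.port == ssh_port:
            ins += 1
        elif conn.raddr and conn.raddr.port == ssh_port:
            outs += 1
    return ins, outs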
Example #5
    def test_hop_depth(self, hostname='localhost', username=None,
                       depth=4, commands=1):
        # Do not use close_nodes manually on this test, it does keep the
        # order of the declared nodes.

        if username is None:
            username = localuser()

        verbose(f"creating hop{depth}-connections - "
                f"{commands} commands per conn "
                f" to {username}@{hostname}")
        scheduler = Scheduler(timeout=7)
        nodes = []
        jobs = []
        gateway = None
        for n in range(depth):
            node = SshNode(hostname, gateway=gateway, username=username,
                           formatter=ColonFormatter(verbose=False))
            nodes.append(node)
            gateway = node
            for c in range(commands):
                jobs.append(SshJob(node=node,
                                   command=f"echo hop{n}-{c}",
                                   scheduler=scheduler))

        expected = depth

        # record base status
        in0, out0 = in_out_connections()

        verbose(f"INITIAL count in={in0} out={out0}")
        scheduler.run()
        in1, out1 = in_out_connections()
        verbose(f"AFTER RUN in={in1} out={out1}")
        self.assertEqual(in1-in0, expected)
        self.assertEqual(out1-out0, expected)

        # cleanup
        close_ssh_in_scheduler(scheduler)

        # let's wait a little bit before counting
        time.sleep(1)
        in1, out1 = in_out_connections()

        verbose(f"AFTER CLEANUP in={in1} out={out1}")
        self.assertEqual(in1-in0, 0)
        self.assertEqual(out1-out0, 0)
Example #6
    def hop1(self, hostname='localhost', username=None, *, c1, commands):
        """
        create
          * <c1> connections to one node 1 hop away
          * and on each <commands> commands

        check current number of connections
        """
        if username is None:
            username = localuser()

        print(f"creating {c1} hop1-connections - "
              f"{commands} commands per conn - "
              f" to {username}@{hostname}")
        scheduler = Scheduler()
        nodes = []
        jobs = []
        for n in range(c1):
            node1 = SshNode(hostname,
                            username=username,
                            formatter=ColonFormatter(verbose=False))
            nodes.append(node1)
            for c in range(commands):
                jobs.append(
                    SshJob(node=node1,
                           command=f'echo hop1-{n}-{c}',
                           scheduler=scheduler))

        expected = c1

        # record base status
        in0, out0 = in_out_connections()
        print(f"INITIAL count in={in0} out={out0}")

        scheduler.run()

        in1, out1 = in_out_connections()
        print(f"AFTER RUN in={in1} out={out1}")
        self.assertEqual(in1 - in0, expected)
        self.assertEqual(out1 - out0, expected)

        # cleanup
        gathered = asyncio.get_event_loop().run_until_complete(
            asyncio.gather(*(node.close() for node in nodes)))
        in1, out1 = in_out_connections()
        print(f"AFTER CLEANUP in={in1} out={out1}")
        self.assertEqual(in1 - in0, 0)
        self.assertEqual(out1 - out0, 0)
Example #7
    def run(self, message_bus, timeout):
        """
        send verb to all nodes, waits for max timeout
        returns True if all nodes behaved as expected
        and False otherwise - including in case of KeyboardInterrupt
        """

        nodes = [
            Node(cmc_name, message_bus)
            for cmc_name in self.selector.cmc_names()
        ]
        jobs = [
            Job(self.get_and_show_verb(node, self.verb), critical=True)
            for node in nodes
        ]
        display = Display(nodes, message_bus)
        scheduler = Scheduler(Job(display.run(), forever=True, critical=True),
                              *jobs,
                              timeout=timeout,
                              critical=False)
        try:
            if scheduler.run():
                return True
            else:
                scheduler.debrief()
                print(f"rhubarbe-{self.verb} failed: {scheduler.why()}")
                return False
        except KeyboardInterrupt:
            print(f"rhubarbe-{self.verb} : keyboard interrupt - exiting")
            return False
Example #8
    def test_capture(self):

        s = Scheduler()
        f = CaptureFormatter()
        n = SshNode(localhostname(), username=localuser(), formatter=f)
        s.add(SshJob(node=n, commands=[
            Run("echo LINE1"),
            Run("echo LINE2"),
        ]))

        f.start_capture()
        s.run()
        captured = f.get_capture()

        expected = "LINE1\nLINE2\n"
        self.assertEqual(captured, expected)
Example #9
    def test_environment(self):

        needle_foo = 'xxx-foo-xxx'
        needle_bar = 'xxx-bar-xxx'

        scheduler = Scheduler()
        node = SshNode("localhost")

        env = Variables()
        service = Service("env",
                          service_id='echo-environ',
                          environ={
                              'FOO': needle_foo,
                              'BAR': needle_bar,
                          })
        SshJob(scheduler=scheduler,
               node=node,
               commands=[
                   Run(service.start_command()),
                   Run(service.journal_command(since='5s ago'),
                       capture=Capture('journal', env))
               ])

        self.assertEqual(scheduler.run(), True)
        self.assertTrue(f"FOO={needle_foo}" in env.journal)
        self.assertTrue(f"BAR={needle_bar}" in env.journal)
Example #10
    def test_nesting_sequence(self):

        expected_duration = 1.

        watch = Watch('test_nesting_sequence')

        subjob = Scheduler(
            Sequence(
                Job(co_print_sleep(watch, .2, "one")),
                Job(co_print_sleep(watch, .2, "two")),
                Job(co_print_sleep(watch, .2, "three")),
            ),
            watch=watch,
            label="sub-scheduler\non several lines",
            critical=True,
            forever=True,
        )

        main = Scheduler(Sequence(
            Job(co_print_sleep(watch, .2, "BEGIN"), label="job-label"),
            subjob,
            Job(co_print_sleep(watch, .2, "END")),
        ),
                         watch=watch)

        print("===== test_nesting_sequence", "LIST with details")
        main.list(details=True)

        self.assertTrue(main.run())
        self.assertAlmostEqual(watch.seconds(), expected_duration, delta=.05)

        produce_png(main, "test_nesting_sequence")
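The expected one-second duration is simply BEGIN (0.2 s), then the three inner jobs run in sequence (0.6 s), then END (0.2 s). co_print_sleep itself is a small test helper; a plausible sketch, assuming it just prints a message stamped with the watch's elapsed time and then sleeps, is:

import asyncio

async def co_print_sleep(watch, duration, message):
    # print the message with the watch's elapsed time, then sleep
    # a sketch - the real helper may differ
    print(f"{watch.seconds():.2f}s: {message}")
    await asyncio.sleep(duration)
    return message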
Example #11
    def _allowed_retcod(self, allowed_exits,
                       host="localhost", username=None):

        print(f"Testing allowed retcod allowed_exits={allowed_exits}")

        # global timeout
        total = 4
        # scheduled duration
        long = 1
        # the remote command always exits with this code
        retcod = 1000

        if username is None:
            username = util.localuser()
        node = SshNode(host, username=username)

        scheduler = Scheduler(timeout=total, critical=False)
        SshJob(node=node, scheduler=scheduler,
               command=Run(f"sleep {long}; exit {retcod}",
                           allowed_exits=allowed_exits))

        expected = retcod in allowed_exits

        run = scheduler.run()
        scheduler.list()
        self.assertEqual(run, expected)
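Typical call sites for this helper would exercise both outcomes; for instance (hypothetical test methods):

def test_allowed_retcod_ok(self):
    # 1000 is in the allowed list, so the job - and the run - succeed
    self._allowed_retcod([1000])

def test_allowed_retcod_ko(self):
    # 1000 is not allowed, so the run is expected to fail
    self._allowed_retcod([1])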
Example #12
    def _allowed_signal(self, allowed_exits,
                       host="localhost", username=None):

        print(f"Testing allowed signal allowed_exits={allowed_exits}")

        # global timeout
        total = 4
        # scheduled duration
        long = 2
        # send signal after that amount
        short = 1
        # we always kill with TERM
        signal = "TERM"

        if username is None:
            username = util.localuser()
        node = SshNode(host, username=username)

        scheduler = Scheduler(timeout=total, critical=False)
        SshJob(node=node, scheduler=scheduler,
               command=Run(f"sleep {long}",
                           allowed_exits=allowed_exits))
        SshJob(node=node, scheduler=scheduler,
               command=f"sleep {short}; pkill -{signal} sleep")

        expected = signal in allowed_exits

        run = scheduler.run()
        scheduler.list()
        self.assertEqual(run, expected)
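And similarly for the signal variant (hypothetical test methods):

def test_allowed_signal_ok(self):
    # the sleep gets killed with TERM, which is explicitly allowed
    self._allowed_signal(["TERM"])

def test_allowed_signal_ko(self):
    # TERM is not in the allowed list, so the run is expected to fail
    self._allowed_signal(["QUIT"])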
Example #13
    def run_one_job(self, job, *, details=False, expected=True):
        print(job)
        scheduler = Scheduler(job, verbose=True)
        orchestration = scheduler.run()
        scheduler.list(details=details)
        if not orchestration:
            scheduler.debrief()
        self.assertTrue(orchestration)
        if expected:
            self.assertEqual(job.result(), 0)
        else:
            self.assertNotEqual(job.result(), 0)
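A couple of hypothetical call sites, one job expected to succeed and one expected to fail (assuming a failing command leaves the orchestration intact and only yields a non-zero result):

node = SshNode(localhostname(), username=localuser())
self.run_one_job(SshJob(node=node, command="true"))
self.run_one_job(SshJob(node=node, command="false"), expected=False)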
Example #14
    def main(self, reset, timeout):
        mainjob = Job(self.run(reset), critical=True)
        displayjob = Job(self.display.run(), forever=True, critical=True)

        scheduler = Scheduler(mainjob,
                              displayjob,
                              timeout=timeout,
                              critical=False)

        try:
            is_ok = scheduler.run()
            if not is_ok:
                scheduler.debrief()
                self.display.set_goodbye(
                    f"rhubarbe-save failed: {scheduler.why()}")
                return 1
            return 0 if mainjob.result() else 1
        except KeyboardInterrupt:
            self.display.set_goodbye("rhubarbe-save : keyboard interrupt, bye")
            return 1
        finally:
            self.cleanup()
Example #15
    def test_commands_verbose(self):
        dummy_path = "tests/dummy-10"
        dummy_file = Path(dummy_path).name
        scheduler = Scheduler()
        Sequence(
            SshJob(
                node=self.gateway(),
                verbose=True,
                commands=[
                    Run("hostname"),
                    RunScript("tests/script-with-args.sh", "arg1", "arg2"),
                    RunString("for i in $(seq 3); do echo verbose$i; done"),
                    Push(localpaths=dummy_path, remotepath="."),
                    Pull(remotepaths=dummy_file, localpath=dummy_path + ".loop"),
                ]),
            SshJob(node=LocalNode(),
                   critical=True,
                   commands=Run("diff {x} {x}.loop".format(x=dummy_path),
                                verbose=True)),
            scheduler=scheduler)
        ok = scheduler.run()
        ok or scheduler.debrief()
        self.assertTrue(ok)
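This test pushes the local file tests/dummy-10 and pulls it back, so that file must exist beforehand. If it does not ship with the test tree, a small setup sketch could create it first (an assumption, not part of the original suite):

from pathlib import Path

dummy = Path("tests/dummy-10")
if not dummy.exists():
    dummy.parent.mkdir(parents=True, exist_ok=True)
    dummy.write_text("dummy payload\n")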
Example #16
    def global_check_image(self, _image, check_strings):
        # on the remaining nodes: check image marker
        self.print(f"checking {len(self.nodes)} nodes"
                   f" against {check_strings} in /etc/rhubarbe-image")

        grep_pattern = "|".join(check_strings)
        check_command = (
            f"tail -1 /etc/rhubarbe-image | egrep -q '{grep_pattern}'")
        jobs = [
            SshJob(node=silent_sshnode(node, verbose=self.verbose),
                   command=check_command,
                   critical=False) for node in self.nodes
        ]

        scheduler = Scheduler(Job(self.display.run(), forever=True),
                              *jobs,
                              critical=False,
                              timeout=self.wait_timeout)
        if not scheduler.run():
            self.verbose and scheduler.debrief()  # pylint: disable=w0106
        # exclude nodes that have not behaved
        for node, job in zip(self.nodes, jobs):
            if not job.is_done() or job.raised_exception():
                self.verbose_msg(
                    f"checking {grep_pattern}: something went badly wrong with {node}"
                )
                message = None
                if exc := job.raised_exception():
                    message = f"OOPS {type(exc)} {exc}"
                self.mark_and_exclude(node, Reason.CANT_CHECK_IMAGE, message)
                continue
            if job.result() != 0:
                explanation = f"wrong image found on {node} - looking for {grep_pattern}"
                self.verbose_msg(explanation)
                self.mark_and_exclude(node, Reason.DID_NOT_LOAD, explanation)
                continue
            self.print(f"node {node} checked out OK")
Example #17
    def global_wait_ssh(self):
        # wait for nodes to be ssh-reachable
        self.print(f"waiting for {len(self.nodes)} nodes"
                   f" (timeout={self.wait_timeout})")
        sshs = [SshWaiter(node, verbose=self.verbose) for node in self.nodes]
        jobs = [
            Job(ssh.wait_for(self.backoff), critical=False) for ssh in sshs
        ]

        scheduler = Scheduler(Job(self.display.run(), forever=True),
                              *jobs,
                              critical=False,
                              timeout=self.wait_timeout)
        if not scheduler.run():
            self.verbose and scheduler.debrief()  # pylint: disable=w0106
        # exclude nodes that have not behaved
        for node, job in zip(self.nodes, jobs):
            self.verbose_msg(
                f"node {node.id} wait_ssh_job -> done={job.is_done()}",
                f"exc={job.raised_exception()}")

            if exc := job.raised_exception():
                message = f"OOPS {type(exc)} {exc}"
                self.mark_and_exclude(node, Reason.WONT_SSH, message)
Example #18
    def test_graphics1(self):

        scheduler = Scheduler(critical=False)

        gateway = SshNode(hostname=localhostname(), username=localuser())

        Sequence(
            SshJob(
                node=gateway,
                command='hostname',
            ),
            SshJob(node=gateway,
                   command=[
                       Run('ls /etc/passwd'),
                       Run('wc -l /etc/passwd'),
                   ]),
            SshJob(node=gateway,
                   commands=[
                       RunString(
                           "#!/usr/bin/env bash\n"
                           "echo with RunString on $(hostname) at $(date)"),
                   ]),
            SshJob(node=gateway, commands=[
                RunScript("tests/testbasic.sh"),
            ]),
            SshJob(node=gateway,
                   commands=[
                       Run('wc -l /etc/passwd'),
                       RunString(
                           "#!/usr/bin/env bash\n"
                           "echo with RunsString on $(hostname) at $(date)",
                           remote_name="show-host-date"),
                       RunScript("tests/testbasic.sh"),
                   ]),
            SshJob(node=gateway,
                   commands=[
                       RunString(
                           "#!/usr/bin/env bash\n"
                           "echo first arg is $1\n", 10)
                   ]),
            SshJob(node=gateway,
                   commands=[
                       RunString(
                           "#!/usr/bin/env bash\n"
                           "echo first arg is $1\n",
                           10,
                           remote_name='short-show-args')
                   ]),
            SshJob(node=gateway,
                   commands=[
                       RunString(
                           "#!/usr/bin/env bash\n"
                           "echo first arg is $1\n"
                           "echo second arg is $2\n"
                           "echo third arg is $3\n"
                           "echo fourth arg is $4\n", 100, 200, 300, 400)
                   ]),
            SshJob(node=gateway,
                   commands=[
                       RunString(
                           "#!/usr/bin/env bash\n"
                           "echo first arg is $1\n"
                           "echo second arg is $2\n"
                           "echo third arg is $3\n"
                           "echo fourth arg is $4\n",
                           1000,
                           2000,
                           3000,
                           4000,
                           remote_name='long-show-args')
                   ]),
            SshJob(node=gateway,
                   commands=[
                       RunString(
                           "#!/usr/bin/env bash\n"
                           "echo first arg is $1\n"
                           "echo second arg is $2\n"
                           "echo third arg is $3\n"
                           "echo fourth arg is $4\n",
                           1000,
                           2000,
                           3000,
                           4000,
                           remote_name='long-show-args',
                           label='snip')
                   ]),
            SshJob(node=gateway,
                   commands=[
                       Run("hostname", label="Run()"),
                       RunScript("foobar", label="RunScript()"),
                       RunString("foobar", label="RunString()"),
                       Push("foobar", remotepath="remote", label="Push()"),
                       Pull("remote", localpath="foobar", label="Pull()"),
                       Run("hostname", label=None),
                       RunScript("foobar", label=[]),
                       RunString("foobar", label=0),
                       Push("foobar", remotepath="remote", label={}),
                       Pull("remote", localpath="foobar", label=""),
                   ]),
            scheduler=scheduler,
        )

        print("NO DETAILS")
        scheduler.list()
        print("WITH DETAILS")
        scheduler.list(details=True)
        produce_png(scheduler, "test_graphics1")

        ok = scheduler.run()

        self.assertFalse(ok)
Example #19
    def _simple(self, forever):

        storage = f"/root/TCPDUMP-{forever}.pcap"
        status = f"/root/TCPDUMP-{forever}.status"

        tcpdump = Service(f"tcpdump -i lo -w {storage}",
                          service_id='tcpdump',
                          verbose=True)
        monitor = ProcessMonitor()

        scheduler = Scheduler()
        node = SshNode("localhost")

        SshJob(node,
               scheduler=scheduler,
               command=tcpdump.start_command(),
               forever=forever)

        Sequence(
            SshJob(node, command="sleep 1"),
            SshJob(node, command=tcpdump.status_command(output=status)),
            SshJob(node, command="sleep 1"),
            SshJob(node, command=tcpdump.stop_command()),
            # could use a pull to retrieve both files but that's not required
            # since we run on localhost, so keep tests simple
            scheduler=scheduler,
        )

        # cleanup before we run
        # use a tuple - not a generator - so that paths can be iterated twice
        paths = tuple(Path(x) for x in (storage, status))
        for path in paths:
            if path.exists():
                path.unlink()
            self.assertFalse(path.exists())
        produce_png(scheduler, f"service-{forever}")

        self.assertTrue(scheduler.run())
        scheduler.list()
        for path in paths:
            self.assertTrue(path.exists())

        with Path(status).open() as feed:
            contents = feed.read()
            for needle in ('Loaded: loaded', 'Active: active'):
                self.assertTrue(contents.find(needle) >= 0)

        close_ssh_in_scheduler(scheduler)

        # let it settle for a short while, and check the process space
        import time
        time.sleep(0.5)
        monitor.difference()

        news = monitor.news
        if news:
            print(f"we have {len(news)} new processes, {news}")
            ps_command = "ps " + "".join(str(pid) for pid in news)
            import os
            os.system(ps_command)

        self.assertEqual(len(news), 0)
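ProcessMonitor is another helper whose code is not shown; from its usage here, a plausible sketch records the set of pids at creation time, and after difference() exposes the newcomers in .news (an assumption about the helper, Linux-only since it reads /proc):

import os

class ProcessMonitor:
    # remember the current set of pids, and report newcomers later - a sketch
    def __init__(self):
        self.initial = self._pids()
        self.news = set()

    @staticmethod
    def _pids():
        return {int(entry) for entry in os.listdir('/proc') if entry.isdigit()}

    def difference(self):
        self.news = self._pids() - self.initial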
Example #20
    def hop2(self,
             hostname='localhost',
             username=None,
             *,
             c1=1,
             c2=1,
             commands=1):
        """
        create
          * <c1> connections to one node 1 hop away
          * on each one, <c2> connections one hop behind
          * and on each <commands> commands

        check current number of connections
        """
        if username is None:
            username = localuser()

        print(f"creating {c1}x{c2} hop2-connections - "
              f"{commands} commands per conn "
              f" to {username}@{hostname}")
        scheduler = Scheduler()
        nodes1 = []
        nodes2 = []
        jobs = []
        for n in range(c1):
            node1 = SshNode(hostname,
                            username=username,
                            formatter=ColonFormatter(verbose=False))
            nodes1.append(node1)
            for m in range(c2):
                node2 = SshNode(hostname,
                                username=username,
                                gateway=node1,
                                formatter=ColonFormatter(verbose=False))
                nodes2.append(node2)
                for c in range(commands):
                    jobs.append(
                        SshJob(node=node2,
                               command=f'echo hop2-{n}-{m}-{c}',
                               scheduler=scheduler))

        # for each hop1 conn, there are 1 hop1 + c2 hop2 connections alive
        expected = c1 * (c2 + 1)

        # record base status
        in0, out0 = in_out_connections()
        print(f"INITIAL count in={in0} out={out0}")

        scheduler.run()

        in1, out1 = in_out_connections()
        print(f"AFTER RUN in={in1} out={out1}")
        self.assertEqual(in1 - in0, expected)
        self.assertEqual(out1 - out0, expected)

        # cleanup
        # would be nice to find a way to check that the result
        # holds no matter in what order the cleanup is done
        for nodeset in nodes1, nodes2:
            gathered = asyncio.get_event_loop().run_until_complete(
                asyncio.gather(*(node.close() for node in nodeset)))
        in1, out1 = in_out_connections()
        print(f"AFTER CLEANUP in={in1} out={out1}")
        self.assertEqual(in1 - in0, 0)
        self.assertEqual(out1 - out0, 0)
Example #21
def main(argv):
    if len(argv) == 3:
        print("!! Unfinished routines !!")
    else:
        print("++ Using default settings ++")
        ###########################
        ## Local Variables
        # platform='multiGPU'
        platform='distributed'
        gateway_user='******'
        gateway_host='gw_host'
        node_username='******'

        #########################################################
        ## Distributed Requirements
        num_ps = 1
        num_workers = 2

        #########################################################
        gateway = SshNode(
                        gateway_host,
                        username=gateway_user
                        )

        ##########################################################
        if platform == 'distributed':

            ## Jetson-TX2 Cluster
            hosts = [cluster_ip_host]

            #########################################################
            ## Use the Server node for processing the first stage Data-mining
            server = ResourceManager._set_Node(master_host, master_user, gateway,)

            ############################
            # Push the launch file (run_splitpoint)
            # With the Parameters Configuration on the server
            # To execute the First Stage on this host
            job_launch_S1 = SshJob(
                        node = server,
                        commands = [
                                ## Run the script located on the laptop
                                RunScript("run_dataspworkers_mlp.sh", platform, num_ps, num_workers),
                                Run("echo Split Data DONE"),
                                ],
                        )

            #############################
            ## A collection of the PS nodes
            ps = []
            [ps.append(ResourceManager._set_Node(hosts[i],
                                                node_username, gateway,))
                                                for i in range(num_ps)]

            #############################
            ## A collection of the worker nodes
            workers = []
            [workers.append(ResourceManager._set_Node(hosts[num_ps+i],
                                                    node_username, gateway,))
                                                    for i in range(num_workers)]

            #########################################################
            ## Setting Parameters for the First Stage
            FEATURES_NAME = "FULL-W1_x1_x2_x3_x4_x5_x7_x8_Y1"
            SANDBOX=str("/data_B/datasets/drg-PACA/healthData/sandbox-"+FEATURES_NAME)
            YEAR=str(2008)

            ## Stage 1
            # localdir = "/1_Mining-Stage/"
            # SP_Dir_X = str(SANDBOX+localdir+"BPPR-"+FEATURES_NAME+"-"YEAR)

            #############################
            ## Setting parameters for the Second Stage
            S_PLOINT = str(3072)    # 1536
            #SP_ARGV = str(S_PLOINT+"-"+platform)
            SP_ARGV = platform+"-"+str(num_workers)
            SP2=str(SANDBOX+"/2_Split-Point-"+SP_ARGV+"/")

            #############################
            ## BPPR Directories
            dir_train = "/data_training/"
            dir_valid = "/data_valid/"
            dir_test = "/data_test/"

            ############################
            ## Worker data management
            worker_healthData = "/opt/diagnosenet/healthData/"
            worker_sandbox = str(worker_healthData+"/sandbox-"+FEATURES_NAME)
            worker_splitpoint = str(worker_sandbox+"/2_Split-Point-"+SP_ARGV+"/")
            worker_train = str(worker_splitpoint+dir_train)
            worker_valid = str(worker_splitpoint+dir_valid)
            worker_test = str(worker_splitpoint+dir_test)

            ############################
            ## Worker commands
            mkd_worker_sandbox = str("mkdir"+" "+worker_sandbox)
            mkd_worker_splitpoint = str("mkdir"+" "+worker_splitpoint)
            mkd_worker_train = str("mkdir"+" "+worker_train)
            mkd_worker_valid = str("mkdir"+" "+worker_valid)
            mkd_worker_test = str("mkdir"+" "+worker_test)

            #############################
            ## Create a JOB to build the sandbox for each Worker
            job_build_sandbox = []

            [ job_build_sandbox.append(SshJob(
                            node = workers[i],
                            commands = [
                                RunString(mkd_worker_sandbox),
                                RunString(mkd_worker_splitpoint),
                                RunString(mkd_worker_train),
                                RunString(mkd_worker_valid),
                                RunString(mkd_worker_test),
                                Run("echo SANDBOX ON WORKER DONE"), ],
                                )) for i in range(len(workers)) ]


            #############################
            ## Create commands to transfer data
            scp = "scp"
            cmd_X_train_transfer = []
            cmd_y_train_transfer = []
            cmd_X_valid_transfer = []
            cmd_y_valid_transfer = []
            cmd_X_test_transfer = []
            cmd_y_test_transfer = []

            for i in range(num_workers):
                worker_host = str(node_username+"@"+ hosts[num_ps+i] +":")
                num_file = str(i+1)
                ## Commands to transfer Training dataset
                X_train_splitted = str(SP2+dir_train+"X_training-"+FEATURES_NAME+"-"+YEAR+"-"+num_file+".txt")
                cmd_X_train_transfer.append(str(scp+" "+X_train_splitted+" "+worker_host+worker_train))
                y_train_splitted = str(SP2+dir_train+"y_training-"+FEATURES_NAME+"-"+YEAR+"-"+num_file+".txt")
                cmd_y_train_transfer.append(str(scp+" "+y_train_splitted+" "+worker_host+worker_train))

                ## Commands to transfer Validation dataset
                X_valid_splitted = str(SP2+dir_valid+"X_valid-"+FEATURES_NAME+"-"+YEAR+"-"+num_file+".txt")
                cmd_X_valid_transfer.append(str(scp+" "+X_valid_splitted+" "+worker_host+worker_valid))
                y_valid_splitted = str(SP2+dir_valid+"y_valid-"+FEATURES_NAME+"-"+YEAR+"-"+num_file+".txt")
                cmd_y_valid_transfer.append(str(scp+" "+y_valid_splitted+" "+worker_host+worker_valid))

                ## Commands to transfer Test dataset
                X_test_splitted = str(SP2+dir_test+"X_test-"+FEATURES_NAME+"-"+YEAR+"-"+num_file+".txt")
                cmd_X_test_transfer.append(str(scp+" "+X_test_splitted+" "+worker_host+worker_test))
                y_test_splitted = str(SP2+dir_test+"y_test-"+FEATURES_NAME+"-"+YEAR+"-"+num_file+".txt")
                cmd_y_test_transfer.append(str(scp+" "+y_test_splitted+" "+worker_host+worker_test))


            ############################
            ## Build a JOB for transferring data to each worker sandbox
            job_data_transfer = []
            [job_data_transfer.append(SshJob(
                        node = server,
                        commands = [
                                    RunString(cmd_X_train_transfer[i]),
                                    RunString(cmd_y_train_transfer[i]),
                                    Run("echo SENDER TRAINING DATA DONE"),
                                    RunString(cmd_X_valid_transfer[i]),
                                    RunString(cmd_y_valid_transfer[i]),
                                    Run("echo SENDER VALID DATA DONE"),
                                    RunString(cmd_X_test_transfer[i]),
                                    RunString(cmd_y_test_transfer[i]),
                                    Run("echo SENDER TEST DATA DONE"),
                                    ],)
                                    ) for i in range(len(workers))]

            #########################################################
            ## Create a sequence orchestration scheduler instance upfront
            worker_seq = []

            ## Add the Stage-1 JOB into Scheduler
            worker_seq.append(Scheduler(Sequence(
                                job_launch_S1)))

            ## Add the worker JOBs into Scheduler
            [worker_seq.append(Scheduler(Sequence(
                                job_build_sandbox[i],
                                job_data_transfer[i], ))
                                ) for i in range(len(workers))]

            #############################
            ## Old method
            ## Add the JOB PS Replicas into Scheduler
            # worker_seq.append(Scheduler(Sequence(
            #                     job_PS_replicas)))
            #
            # ## Add the JOB WORKER Replicas into Scheduler
            # worker_seq.append(Scheduler(Sequence(
            #                     job_WORKER_replicas)))


            #############################
            ## Run the Sequence JOBS
            # [seq.orchestrate() for seq in worker_seq]


            #########################################################
            #########################################################
            ## Push the launch file (run_secondstage_distributed)
            ## With the Distributed Parameters for each worker replicas
            ## To distributed training of Unsupervised Embedding

            #############################
            ## Build a collection of TensorFlow Hosts for PS
            tf_ps = []
            [tf_ps.append(str(hosts[i]+":2222")) for i in range(num_ps)]
            # print("+++ tf_ps: {}".format(tf_ps))
            tf_ps=','.join(tf_ps)

            #############################
            ## Build a collection of TensorFlow Hosts for workers
            tf_workers = []
            [tf_workers.append(str(hosts[num_ps+i]+":2222")) for i in range(num_workers)]
            # print("+++ tf_workers: {}".format(tf_workers))
            tf_workers=','.join(tf_workers)

            job_PS_replicas = []
            [job_PS_replicas.append(SshJob(
                        node = ps[i],
                        commands = [
                                ## Launches local script to execute on cluster
                                # RunScript("run_secondstage_distributed.sh",
                                #             platform, tf_ps, tf_workers,
                                #             num_ps, num_workers, "ps", i),
                                RunScript("run_thirdstage_distributed_mlp.sh",
                                            platform, tf_ps, tf_workers,
                                            num_ps, num_workers, "ps", i),
                                Run("echo PS REPLICA DONE"),
                                ],)
                                ) for i in range(len(ps))]


            job_WORKER_replicas = []
            [job_WORKER_replicas.append(SshJob(
                        node = workers[i],
                        commands = [
                                ## Launches local script to execute on cluster
                                # RunScript("run_secondstage_distributed.sh",
                                #             platform, tf_ps, tf_workers,
                                #             num_ps, num_workers, "worker", i),
                                RunScript("run_thirdstage_distributed_mlp.sh",
                                            platform, tf_ps, tf_workers,
                                            num_ps, num_workers, "worker", i),
                                Run("echo WORKER REPLICA DONE"),
                                ], )
                                ) for i in range(len(workers))]

            #############################
            ### Simultaneous jobs
            s_distraining = Scheduler()
            [s_distraining.add(job_PS_replicas[i]) for i in range(len(ps))]
            [s_distraining.add(job_WORKER_replicas[i]) for i in range(len(workers))]

            s_distraining.run(jobs_window=int(num_ps + num_workers + 1))
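As a side note, the list comprehensions used above purely for their side effects (e.g. the sandbox-creation block) read better as ordinary comprehensions that build the list directly; an equivalent sketch for that block:

job_build_sandbox = [
    SshJob(node=worker,
           commands=[
               RunString(mkd_worker_sandbox),
               RunString(mkd_worker_splitpoint),
               RunString(mkd_worker_train),
               RunString(mkd_worker_valid),
               RunString(mkd_worker_test),
               Run("echo SANDBOX ON WORKER DONE"),
           ])
    for worker in workers
]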
Example #22
    def hop2(self, hostname='localhost', username=None,
             *, c1=1, c2=1, commands=1, s_command='echo hop2-{}-{}-{}',
             nested_sched=(0, 1)):
        """
        create
          * <c1> connections to one node 1 hop away
          * on each one, <c2> connections one hop behind
          * and on each <commands> commands

        check current number of connections
        """
        if username is None:
            username = localuser()

        verbose(f"creating {c1}x{c2} hop2-connections - "
                f"{commands} commands per conn "
                f" to {username}@{hostname}")
        scheduler = Scheduler(timeout=7)
        nodes = []
        #nodes2 = []
        jobs = []
        for n in range(c1):
            node1 = SshNode(hostname, username=username,
                            formatter=ColonFormatter(verbose=False))
            nodes.append(node1)
            for m in range(c2):
                node2 = SshNode(hostname, username=username,
                                gateway=node1,
                                formatter=ColonFormatter(verbose=False))
                nodes.append(node2)
                for c in range(commands):
                    jobs.append(SshJob(node=node2,
                                       command=s_command.format(n, m, c),
                                       ))

        scheduler = self.populate_sched(scheduler, jobs,
                                        nested=nested_sched[0],
                                        pack_job=nested_sched[1])
        # for each hop1 conn, there are 1 hop1 + c2 hop2 connections alive
        expected = c1 * (c2+1)
        scheduler.export_as_pngfile("debug")
        topology_as_pngfile(scheduler, "topology")

        # record base status
        in0, out0 = in_out_connections()

        verbose(f"INITIAL count in={in0} out={out0}")
        scheduler.run()
        in1, out1 = in_out_connections()
        verbose(f"AFTER RUN in={in1} out={out1}")
        self.assertEqual(in1-in0, expected)
        self.assertEqual(out1-out0, expected)

        # cleanup
        close_ssh_in_scheduler(scheduler)

        # let's wait a little bit before counting
        time.sleep(1)
        in1, out1 = in_out_connections()

        verbose(f"AFTER CLEANUP in={in1} out={out1}")
        self.assertEqual(in1-in0, 0)
        self.assertEqual(out1-out0, 0)
Example #23
def one_run(*, protocol, interference,
            run_name=default_run_name, slicename=default_slicename,
            tx_power, phy_rate, antenna_mask, channel,
            load_images=False,
            node_ids=DEFAULT_NODE_IDS,
            src_ids=DEFAULT_SRC_IDS, dest_ids=DEFAULT_DEST_IDS,
            scrambler_id=DEFAULT_SCRAMBLER_ID,
            tshark=False, map=False, warmup=False,
            route_sampling=False, iperf=False,
            verbose_ssh=False, verbose_jobs=False, dry_run=False,
            run_number=None):
    """
    Performs data acquisition on all nodes with the following settings

    Arguments:
        tx_power: in dBm, a string like 5, 10 or 14.
          Corresponds to the transmission power.
        phy_rate: a string among 1, 54. Correspond to the wifi rate.
        antenna_mask: a string among 1, 3, 7.
        channel: a string like e.g. 1 or 40. Correspond to the channel.
        protocol: a string among batman , olsr. Correspond to the protocol
        interference : in amplitude percentage, a string like 15 or 20.
          Correspond to the power of the noise generated in the spectrum.
          Can be either None or "None" to mean no interference.
        run_name: the name for a subdirectory where all data will be kept
          successive runs should use the same name for further visualization
        slicename: the Unix login name (slice name) to enter the gateway
        load_images: a boolean specifying whether nodes should be re-imaged first
        node_ids: a list of node ids to run the scenario against;
          strings or ints are OK;
        tshark: a boolean specifying wether we should format/parse the .pcap.
        map: a boolean specifying wether we should fetch/parse
          the route tables of the nodes.
        warmup: a boolean specifying whether we should run a ping before
          the experiment to be certain of the stabilisation on the network.
        src_ids: a list of nodes from which we will launch the ping from.
          strings or ints are OK.
        ping_messages : the number of ping packets that will be generated

    """
    # set default for the nodes parameter
    node_ids = ([int(id) for id in node_ids]
                if node_ids is not None else DEFAULT_NODE_IDS)
    src_ids = ([int(id) for id in src_ids]
               if src_ids is not None else DEFAULT_SRC_IDS)
    dest_ids = ([int(id) for id in dest_ids]
                if dest_ids is not None else DEFAULT_DEST_IDS)

    # all nodes - i.e. including sources and destinations -
    # need to run the protocol
    node_ids = list(set(node_ids).union(set(src_ids).union(set(dest_ids))))

    if interference == "None":
        interference = None

    # open result dir no matter what
    run_root = naming_scheme(
        run_name=run_name, protocol=protocol,
        interference=interference, autocreate=True)

# fix me    trace = run_root / f"trace-{%m-%d-%H-%M}"
    ref_time = apssh_time()
    trace = run_root / f"trace-{ref_time}"

    try:
        with trace.open('w') as feed:
            def log_line(line):
                time_line(line, file=feed)
            load_msg = f"{'WITH' if load_images else 'NO'} image loading"
            interference_msg = (f"interference={interference} "
                                f"from scrambler={scrambler_id}")
            nodes = " ".join(str(n) for n in node_ids)
            srcs = " ".join(str(n) for n in src_ids)
            dests = " ".join(str(n) for n in dest_ids)
            ping_labels = [
                f"PING {s} ➡︎ {d}"
                for s in src_ids
                # and on the destination
                for d in dest_ids
                if d != s
            ]

            log_line(f"output in {run_root}")
            log_line(f"trace in {trace}")
            log_line(f"protocol={protocol}")
            log_line(f"{load_msg}")
            log_line(f"{interference_msg}")
            log_line("----")
            log_line(f"Selected nodes : {nodes}")
            log_line(f"Sources : {srcs}")
            log_line(f"Destinations : {dests}")
            for label in ping_labels:
                log_line(f"{label}")
            log_line("----")
            for feature in ('warmup', 'tshark', 'map',
                            'route_sampling', 'iperf'):
                log_line(f"Feature {feature}: {locals()[feature]}")

    except Exception as exc:
        print(f"Cannot write into {trace} - aborting this run")
        print(f"Found exception {type(exc)} - {exc}")
        return False
    #
    # dry-run mode
    # just display a one-liner with parameters
    #
    prelude = "" if not dry_run else "dry_run:"
    with trace.open() as feed:
        print(f"**************** {ref_time} one_run #{run_number}:")
        for line in feed:
            print(prelude, line, sep='', end='')
    if dry_run:
        return True

    # the nodes involved
    faraday = SshNode(hostname=default_gateway, username=slicename,
                      formatter=TimeColonFormatter(), verbose=verbose_ssh)

    # this is a python dictionary that allows to retrieve a node object
    # from an id
    node_index = {
        id: SshNode(gateway=faraday, hostname=fitname(id), username="******",
                    formatter=TimeColonFormatter(), verbose=verbose_ssh)
        for id in node_ids
    }
    # extracts for sources and destinations
    src_index = {id:node for (id, node) in node_index.items()
                 if id in src_ids}
    dest_index = {id:node for (id, node) in node_index.items()
                  if id in dest_ids}

    if interference:
        node_scrambler = SshNode(
            gateway=faraday, hostname=fitname(scrambler_id), username="******",
            formatter=TimeColonFormatter(), verbose=verbose_ssh)
    # the global scheduler
    scheduler = Scheduler(verbose=verbose_jobs)

    ##########
    check_lease = SshJob(
        scheduler=scheduler,
        node=faraday,
        verbose=verbose_jobs,
        label="rhubarbe check lease",
        command=Run("rhubarbe leases --check", label="rlease"),
    )

    # load images if requested

    green_light = check_lease

    # at some point we did not load the scrambler if interference was None
    # and that was a way to run faster loads with no interference
    # but now we always load the scrambler node with gnuradio
    # this is because when we do runs.py -i None 15 30 ...
    # then the first call to one_run is with interference being None
    # but it is still important to load the scrambler
    if load_images:
        # copy node_ids
        load_ids = node_ids[:]
        load_ids.append(scrambler_id)
        # the nodes that we **do not** use should be turned off
        # so if we have selected e.g. nodes 10 12 and 15, we will do
        # rhubarbe off -a ~10 ~12 ~15, meaning all nodes except 10, 12 and 15
        negated_node_ids = [f"~{id}" for id in load_ids]

        # we can do these three things in parallel
        ready_jobs = [
            SshJob(node=faraday, required=green_light,
                   scheduler=scheduler, verbose=verbose_jobs,
                   command=Run("rhubarbe", "off", "-a", *negated_node_ids,
                               label="turn off unused nodes")),
            SshJob(node=faraday, required=green_light,
                   scheduler=scheduler, verbose=verbose_jobs,
                   label="load batman image",
                   command=Run("rhubarbe", "load", "-i",
                               "batman-olsr",
                               *node_ids,
                               label=f"load ubuntu on {node_ids}")),
            SshJob(
                node=faraday, required=green_light,
                scheduler=scheduler, verbose=verbose_jobs,
                label="load gnuradio image",
                command=Run("rhubarbe", "load", "-i",
                            "batman-olsr-gnuradio",
                            scrambler_id,
                            label=f"load gnuradio on {scrambler_id}")),
        ]

        # replace green_light in this case
        green_light = SshJob(
            node=faraday, required=ready_jobs,
            scheduler=scheduler, verbose=verbose_jobs,
            label="wait for nodes to come up",
            command=Run("rhubarbe", "wait", *load_ids))

    ##########
    # setting up the wireless interface on all nodes
    #
    # provide node-utilities with the ranges/units it expects
    frequency = channel_frequency[int(channel)]
    # the driver expects tx power in mBm, not in dBm
    tx_power_driver = tx_power * 100

    # just in case some services failed in the previous experiment
    reset_failed_services_job = [
        SshJob(
            node=node,
            verbose=verbose_jobs,
            label="reset failed services",
            command=Run("systemctl reset-failed",
                        label="reset-failed services"))
        for id, node in node_index.items()
    ]
    reset_failed_services = Scheduler(
        *reset_failed_services_job,
        scheduler=scheduler,
        required=green_light,
        verbose=verbose_jobs,
        label="Reset failed services")
    init_wireless_sshjobs = [
        SshJob(
            node=node,
            verbose=verbose_jobs,
            label=f"init {id}",
            command=RunScript(
                "node-utilities.sh",
                f"init-ad-hoc-network-{WIRELESS_DRIVER}",
                WIRELESS_DRIVER, "foobar", frequency, phy_rate,
                antenna_mask, tx_power_driver,
                label="init add-hoc network"),
        )
        for id, node in node_index.items()]
    init_wireless_jobs = Scheduler(
        *init_wireless_sshjobs,
        scheduler=scheduler,
        required=green_light,
        verbose=verbose_jobs,
        label="Initialisation of wireless chips")

    if interference:
        # Run uhd_siggen with the chosen power
        init_scrambler_job = SshJob(
            scheduler=scheduler,
            required=green_light,
            forever=True,
            node=node_scrambler,
            verbose=verbose_jobs,
            #TODO : If exit-signal patch is done add exit-signal=["TERM"]
            #       to this run object and call uhd_siggen directly
            commands=[RunScript("node-utilities.sh",
                                "init-scrambler",
                                label="init scrambler"),
                      Run(f"systemd-run --unit=uhd_siggen -t ",
                          f"uhd_siggen -a usrp -f {frequency}M",
                          f"--sine --amplitude 0.{interference}",
                          label="systemctl start uhd_siggen")
                      ]
        )

    green_light = [init_wireless_jobs, reset_failed_services]
    # then install and run batman on fit nodes
    run_protocol_job = [
        SshJob(
            # scheduler=scheduler,
            node=node,
            label=f"init and run {protocol} on fit node {id}",
            verbose=verbose_jobs,
            # CAREFUL : These ones use systemd-run
            #            with the --service-type=forking option!
            command=RunScript("node-utilities.sh",
                              f"run-{protocol}",
                              label=f"run {protocol}"),
        )
        for id, node in node_index.items()]

    run_protocol = Scheduler(
        *run_protocol_job,
        scheduler=scheduler,
        required=green_light,
        verbose=verbose_jobs,
        label="init and run routing protocols")

    green_light = run_protocol

    # after that, run tcpdump on fit nodes, this job never ends...
    if tshark:

        run_tcpdump_job = [
            SshJob(
                # scheduler=scheduler_monitoring,
                node=node,
                forever=True,
                label=f"run tcpdump on fit node {id}",
                verbose=verbose_jobs,
                command=[
                    Run("systemd-run -t  --unit=tcpdump",
                        f"tcpdump -U -i moni-{WIRELESS_DRIVER}",
                        f"-y ieee802_11_radio -w /tmp/fit{id}.pcap",
                        label=f"tcpdump {id}")
                    ]
            )
            for id, node in node_index.items()
        ]

        run_tcpdump = Scheduler(
            *run_tcpdump_job,
            scheduler=scheduler,
            required=green_light,
            forever=True,
            verbose=verbose_jobs,
            label="Monitoring - tcpdumps")

    # let the wireless network settle
    settle_scheduler = Scheduler(
        scheduler=scheduler,
        required=green_light,
    )

    if warmup:
        # warmup pings don't need to be sequential, so let's
        # do all the nodes at the same time
        # on a given node though, we'll ping the other ends sequentially
        # see the graph for more
        warmup_jobs = [
            SshJob(
                node=node_s,
                verbose=verbose_jobs,
                commands=[
                    RunScript("node-utilities.sh",
                              "my-ping", f"10.0.0.{d}",
                              warmup_ping_timeout,
                              warmup_ping_interval,
                              warmup_ping_size,
                              warmup_ping_messages,
                              f"warmup {s} ➡︎ {d}",
                              label=f"warmup {s} ➡︎ {d}")
                    for d in dest_index.keys()
                    if s != d
                ]
            )
            # for each selected experiment nodes
            for s, node_s in src_index.items()
        ]
        warmup_scheduler = Scheduler(
            *warmup_jobs,
            scheduler=settle_scheduler,
            verbose=verbose_jobs,
            label="Warmup pings")
        settle_wireless_job2 = PrintJob(
            "Let the wireless network settle after warmup",
            sleep=settle_delay_shorter,
            scheduler=settle_scheduler,
            required=warmup_scheduler,
            label=f"settling-warmup for {settle_delay_shorter} sec")

    # this is a little cheating; could have gone before the block above
    # but produces a nicer graphical output
    # we might want to help asynciojobs if it offered a means
    # to specify entry and exit jobs in a scheduler
    settle_wireless_job = PrintJob(
        "Let the wireless network settle",
        sleep=settle_delay_long,
        scheduler=settle_scheduler,
        label=f"settling for {settle_delay_long} sec")

    green_light = settle_scheduler

    if iperf:
        iperf_service_jobs = [
            SshJob(
                node=node_d,
                verbose=verbose_jobs,
                forever=True,
                commands=[
                    Run("systemd-run -t --unit=iperf",
                        "iperf -s -p 1234 -u",
                        label=f"iperf serv on {d}"),
                ],
            )
            for d, node_d in dest_index.items()
        ]
        iperf_serv_sched = Scheduler(
            *iperf_service_jobs,
            verbose=verbose_jobs,
            label="Iperf Servers",
            # for a nicer graphical output
            # otherwise the exit arrow
            # from scheduler 'iperf mode'
            # to job 'settling for 60s'
            # gets to start from this box
            forever=True,
            )

        iperf_cli = [
            SshJob(
                node=node_s,
                verbose=verbose_jobs,
                commands=[
                    Run("sleep 7", label=""),
                    Run(f"iperf",
                        f"-c 10.0.0.{d} -p 1234",
                        f"-u -b {phy_rate}M -t 60",
                        f"-l 1024 > IPERF-{s:02d}-{d:02d}",
                        label=f"run iperf {s} ➡︎ {d}")
                ]
            )

            for s, node_s in src_index.items()
            for d, node_d in dest_index.items()
            if s != d
        ]
        iperf_cli_sched = Scheduler(
            Sequence(*iperf_cli),
            verbose=verbose_jobs,
            label="Iperf Clients")

        iperf_stop = [
            SshJob(node=node_d,
                   verbose=verbose_jobs,
                   label=f"Stop iperf on {d}",
                   command=Run("systemctl stop iperf"))
            for d, node_d in dest_index.items()
        ]
        iperf_stop_sched = Scheduler(
            *iperf_stop,
            required=iperf_cli_sched,
            verbose=verbose_jobs,
            label="Iperf server stop")
        iperf_fetch = [
            SshJob(node=node_s,
                   verbose=verbose_jobs,
                   command=Pull(
                       remotepaths=[f"IPERF-{s:02d}-{d:02d}"],
                       localpath=str(run_root),
                       label="fetch iperf {s} ➡︎ {d}")
                   )
            for s, node_s in src_index.items()
            for d, node_d in dest_index.items()
            if s != d
        ]
        iperf_fetch_sched = Scheduler(
            *iperf_fetch,
            required=iperf_stop_sched,
            verbose=verbose_jobs,
            label="Iperf fetch report")
        iperf_jobs = [iperf_serv_sched, iperf_cli_sched,
                      iperf_stop_sched, iperf_fetch_sched]
        iperf_sched = Scheduler(
            *iperf_jobs,
            scheduler=scheduler,
            required=green_light,
            verbose=verbose_jobs,
            label="Iperf Module")
        settle_wireless_job_iperf = PrintJob(
            "Let the wireless network settle",
            sleep=settle_delay_shorter,
            scheduler=scheduler,
            required=iperf_sched,
            label=f"settling-iperf for {settle_delay_shorter} sec")

        green_light = settle_wireless_job_iperf


    # snapshot the routing table of every node
    if map:
        map_jobs = [
            SshJob(
                node=node,
                label=f"Generating ROUTE file for proto {protocol} on node {id}",
                verbose=verbose_jobs,
                commands=[
                    RunScript(f"node-utilities.sh",
                              f"route-{protocol}",
                              f"> ROUTE-TABLE-{id:02d}",
                              label="get route table"),
                    Pull(remotepaths=[f"ROUTE-TABLE-{id:02d}"],
                         localpath=str(run_root),
                         label="")
                ],
            )
            for id, node in node_index.items()
        ]
        map_scheduler = Scheduler(
            *map_jobs,
            scheduler=scheduler,
            required=green_light,
            verbose=verbose_jobs,
            label="Snapshoting route files")
        green_light = map_scheduler

    if route_sampling:
        route_sampling_jobs = [
            SshJob(
                node=node,
                label=f"Route sampling service for proto {protocol} on node {id}",
                verbose=False,
                forever=True,
                commands=[
                    Push(localpaths=["route-sample-service.sh"],
                         remotepath=".", label=""),
                    Run("chmod +x route-sample-service.sh", label=""),
                    Run("systemd-run -t --unit=route-sample",
                        "/root/route-sample-service.sh",
                        "route-sample",
                        f"ROUTE-TABLE-{id:02d}-SAMPLED",
                        protocol,
                        label="start route-sampling"),
                ],
            )
            for id, node in node_index.items()
        ]
        route_sampling_scheduler = Scheduler(
            *route_sampling_jobs,
            scheduler=scheduler,
            verbose=False,
            forever=True,
            label="Route Sampling services launch",
            required=green_light)

    ##########
    # create all the ping jobs, one per (source, destination) pair
    # this again is a python list comprehension
    # see the two 'for' clauses at the bottom
    #
    # notice that these SshJob instances are not yet added
    # to the scheduler, we will add them later on
    # depending on the sequential/parallel strategy

    pings_job = [
        SshJob(
            node=node_s,
            verbose=verbose_jobs,
            commands=[
                Run(f"echo actual ping {s} ➡︎ {d} using {protocol}",
                    label=f"ping {s} ➡︎ {d}"),
                RunScript("node-utilities.sh", "my-ping",
                          f"10.0.0.{d}",
                          ping_timeout, ping_interval,
                          ping_size, ping_messages,
                          f"actual {s} ➡︎ {d}",
                          ">", f"PING-{s:02d}-{d:02d}",
                          label=""),
                Pull(remotepaths=[f"PING-{s:02d}-{d:02d}"],
                     localpath=str(run_root),
                     label=""),
            ],
        )
        # for each selected experiment node
        for s, node_s in src_index.items()
        for d, node_d in dest_index.items()
        if s != d
    ]
    pings = Scheduler(
        scheduler=scheduler,
        label="PINGS",
        verbose=verbose_jobs,
        required=green_light)

    # stop the routing protocol on every node
    stop_protocol_job = [
        SshJob(
            # scheduler=scheduler,
            node=node,
            # required=pings,
            label=f"kill routing protocol on {id}",
            verbose=verbose_jobs,
            command=RunScript("node-utilities.sh",
                              f"kill-{protocol}",
                              label=f"kill-{protocol}"),
        )
        for id, node in node_index.items()
    ]
    stop_protocol = Scheduler(
        *stop_protocol_job,
        scheduler=scheduler,
        required=pings,
        label="Stop routing protocols",
    )

    if tshark:
        retrieve_tcpdump_job = [
            SshJob(
                # scheduler=scheduler,
                node=nodei,
                # required=pings,
                label=f"retrieve pcap trace from fit{i:02d}",
                verbose=verbose_jobs,
                commands=[
                    Run("systemctl stop tcpdump",
                        label="stop tcpdump"),
                    #Run("systemctl reset-failed tcpdump"),
                    #RunScript("node-utilities.sh", "kill-tcpdump",
                    #          label="kill-tcpdump"),
                    Run(
                        f"echo retrieving pcap trace and result-{i}.txt from fit{i:02d}",
                        label=""),
                    Pull(remotepaths=[f"/tmp/fit{i}.pcap"],
                         localpath=str(run_root), label=""),
                ],
            )
            for i, nodei in node_index.items()
        ]
        retrieve_tcpdump = Scheduler(
            *retrieve_tcpdump_job,
            scheduler=scheduler,
            required=pings,
            label="Retrieve tcpdump",
        )
    if route_sampling:
        retrieve_sampling_job = [
            SshJob(
                # scheduler=scheduler,
                node=nodei,
                # required=pings,
                label=f"retrieve sampling trace from fit{i:02d}",
                verbose=verbose_jobs,
                commands=[
                    # RunScript("node-utilities.sh", "kill-route-sample", protocol,
                    #          label = "kill route sample"),
                    #RunScript("route-sample-service.sh", "kill-route-sample",
                    #          label="kill route sample"),
                    Run("systemctl stop route-sample",
                        label="stop route-sample"),
                    Run(
                        f"echo retrieving sampling trace from fit{i:02d}",
                        label=""),
                    Pull(remotepaths=[f"ROUTE-TABLE-{i:02d}-SAMPLED"],
                         localpath=str(run_root), label=""),
                ],
            )
            for i, nodei in node_index.items()
        ]
        retrieve_sampling = Scheduler(
            *retrieve_sampling_job,
            scheduler=scheduler,
            required=pings,
            verbose=verbose_jobs,
            label="Stop & retrieve route sampling",
            )
    if tshark:
        parse_pcaps_job = [
            SshJob(
                # scheduler=scheduler,
                node=LocalNode(),
                # required=retrieve_tcpdump,
                label=f"parse pcap trace {run_root}/fit{i}.pcap",
                verbose=verbose_jobs,
                #commands = [RunScript("parsepcap.sh", run_root, i)]
                command=Run("tshark", "-2", "-r",
                            f"{run_root}/fit{i}.pcap",
                            "-R",
                            f"'(ip.dst==10.0.0.{i} && icmp) && radiotap.dbm_antsignal'",
                            "-Tfields",
                            "-e", "'ip.src'",
                            "-e" "'ip.dst'",
                            "-e", "'radiotap.dbm_antsignal'",
                            ">", f"{run_root}/result-{i}.txt",
                            label=f"parsing pcap from {i}"),
            )
            for i in node_ids
        ]
        parse_pcaps = Scheduler(
            *parse_pcaps_job,
            scheduler=scheduler,
            required=retrieve_tcpdump,
            label="Parse pcap",
        )

    if interference:
        kill_uhd_siggen = SshJob(
            scheduler=scheduler,
            node=node_scrambler,
            required=pings,
            label=f"killing uhd_siggen on the scrambler node {scrambler_id}",
            verbose=verbose_jobs,
            commands=[Run("systemctl", "stop", "uhd_siggen"),
                      #Run("systemctl reset-failed tcpdump"),
                      ],
        )
        kill_2_uhd_siggen = SshJob(
            scheduler=scheduler,
            node=faraday,
            required=kill_uhd_siggen,
            label=f"turning off usrp on the scrambler node {scrambler_id}",
            verbose=verbose_jobs,
            command=Run("rhubarbe", "usrpoff", scrambler_id),
        )

    pings.add(Sequence(*pings_job))
    # for running sequentially we impose no limit on the scheduler
    # which will be limited anyway by the very structure
    # of the requirements graph
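    # added sketch, not part of the original script: for the parallel
    # strategy one could instead add the ping jobs unordered and cap the
    # concurrency with a jobs window - assuming, as the commented-out call
    # below hints, that the scheduler run accepts a jobs_window argument:
    #     pings.update(pings_job)
    #     ok = scheduler.run(jobs_window=jobs_window)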

    # safety check

    scheduler.export_as_pngfile(run_root / "experiment-graph")
    if dry_run:
        scheduler.list()
        return True

    # if not in dry-run mode, let's proceed to the actual experiment
    ok = scheduler.run()  # jobs_window=jobs_window

    # close all ssh connections
    close_ssh_in_scheduler(scheduler)


    # give details if it failed
    if not ok:
        scheduler.debrief()
        scheduler.export_as_pngfile("debug")
    if ok and map:
        time_line("Creation of MAP files")
        post_processor = ProcessRoutes(run_root, src_ids, node_ids)
        post_processor.run()
    if ok and route_sampling:
        time_line("Creation of ROUTE SAMPLING files")
        post_processor = ProcessRoutes(run_root, src_ids, node_ids)
        post_processor.run_sampled()
    # data acquisition is done, let's aggregate results
    # i.e. compute averages
    #if ok and tshark:
        #post_processor = Aggregator(run_root, node_ids, antenna_mask)
        #post_processor.run()

    time_line("one_run done")
    return ok
Example #24
0
 def test_format(self):
     s = Scheduler()
     f = TerminalFormatter("%Y:%H:%S - @host@:@line@", verbose=True)
     n = SshNode(localhostname(), username=localuser(), formatter=f)
     s.add(SshJob(node=n, commands=[Run("echo LINE1"), Run("echo LINE2")]))
     s.run()
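     # a plausible output line under this formatter (illustrative only):
     #   2024:14:35 - localhost:LINE1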
Example #25
0
def wait(*argv):  # pylint: disable=r0914
    usage = """
    Wait for selected nodes to be reachable by ssh
    Returns 0 if all nodes indeed are reachable
    """
    # suppress info log messages from asyncssh
    asyncssh_set_log_level(logging.WARNING)

    config = Config()
    parser = ArgumentParser(usage=usage,
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument("-c",
                        "--curses",
                        action='store_true',
                        default=False,
                        help="Use curses to provide term-based animation")
    parser.add_argument("-t",
                        "--timeout",
                        action='store',
                        default=config.value('nodes', 'wait_default_timeout'),
                        type=float,
                        help="Specify global timeout for the whole process")
    parser.add_argument("-b",
                        "--backoff",
                        action='store',
                        default=config.value('networking', 'ssh_backoff'),
                        type=float,
                        help="Specify backoff average between "
                        "attempts to ssh connect")
    parser.add_argument("-u",
                        "--user",
                        default="root",
                        help="select other username")
    # really don't write anything
    parser.add_argument("-s", "--silent", action='store_true', default=False)
    parser.add_argument("-v", "--verbose", action='store_true', default=False)

    add_selector_arguments(parser)
    args = parser.parse_args(argv)

    # --curses implies --verbose otherwise nothing shows up
    if args.curses:
        args.verbose = True

    selector = selected_selector(args)
    message_bus = asyncio.Queue()

    if args.verbose:
        message_bus.put_nowait({'selected_nodes': selector})
    from rhubarbe.logger import logger
    logger.info(f"wait: backoff is {args.backoff} "
                f"and global timeout is {args.timeout}")

    nodes = [
        Node(cmc_name, message_bus)  # pylint: disable=w0621
        for cmc_name in selector.cmc_names()
    ]
    sshs = [
        SshProxy(node, username=args.user, verbose=args.verbose)
        for node in nodes
    ]
    jobs = [Job(ssh.wait_for(args.backoff), critical=True) for ssh in sshs]

    display_class = Display if not args.curses else DisplayCurses
    display = display_class(nodes, message_bus)

    # have the display class run forever until the other ones are done
    scheduler = Scheduler(Job(display.run(), forever=True, critical=True),
                          *jobs,
                          timeout=args.timeout,
                          critical=False)
    try:
        orchestration = scheduler.run()
        if orchestration:
            return 0
        else:
            if args.verbose:
                scheduler.debrief()
            return 1
    except KeyboardInterrupt:
        print("rhubarbe-wait : keyboard interrupt - exiting")
        # xxx
        return 1
    finally:
        display.epilogue()
        if not args.silent:
            for ssh in sshs:
                print(f"{ssh.node}:ssh {'OK' if ssh.status else 'KO'}")
Example #26
0
def collect(run_name, slicename, cn, ran, oai_ues, verbose, dry_run):
    """
    retrieves all relevant logs under a common name
    otherwise, same signature as run() for convenience

    retrieved stuff will be made of
    * one pcap file for the CN
    * compressed tgz files, one per node, gathering logs, configs and data
    * for convenience the tgz files are unwrapped in run_name/id0
    """

    # the local dir to store incoming raw files. mostly tar files
    local_path = Path(f"{run_name}")
    if not local_path.exists():
        print(f"Creating directory {local_path}")
        local_path.mkdir()

    gwuser, gwhost = r2lab_parse_slice(slicename)
    gwnode = SshNode(hostname=gwhost, username=gwuser,
                     formatter=TimeColonFormatter(verbose=verbose),
                     debug=verbose)

    functions = ["cn", "ran"]
    hostnames = [r2lab_hostname(x) for x in (cn, ran)]
    node_cn, node_ran = nodes = [
        SshNode(gateway=gwnode, hostname=hostname, username='******',
                formatter=TimeColonFormatter(verbose=verbose), debug=verbose)
        for hostname in hostnames
    ]
    # make sure nodes_ue is defined even when no UE node is selected,
    # so the chain() over nodes_ue below keeps working
    nodes_ue = []
    if oai_ues:
        hostnames_ue = [r2lab_hostname(x) for x in oai_ues]
        nodes_ue = [
            SshNode(gateway=gwnode, hostname=hostname, username='******',
                    formatter=TimeColonFormatter(verbose=verbose), debug=verbose)
            for hostname in hostnames_ue]


    # all nodes involved are managed in the same way
    # node: a SshNode instance
    # id: the fit number
    # function: a string like 'cn' or 'ran' or 'oai-ue'

    local_nodedirs_tars = []

    scheduler = Scheduler(verbose=verbose)
    for (node, id, function) in zip(
            chain(nodes, nodes_ue),
            chain( [cn, ran], oai_ues),
            chain(functions, cycle(["oai-ue"]))):
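        # worked example with hypothetical values: with cn=1, ran=2 and
        # oai_ues=[3, 4], this zip yields
        #   (node_cn, 1, 'cn'), (node_ran, 2, 'ran'),
        #   (<ue node 3>, 3, 'oai-ue'), (<ue node 4>, 4, 'oai-ue')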
        # nodes on 2 digits
        id0 = f"{id:02d}"
        # node-dep collect dir
        node_dir = local_path / id0
        node_dir.exists() or node_dir.mkdir()
        local_tar = f"{local_path}/{function}-{id0}.tgz"
        SshJob(
            node=node,
            commands=[
                # first run a 'capture-all' function remotely
                # to gather all the relevant files and commands remotely
                RunScript(
                    find_local_embedded_script(f"mosaic-{function}.sh"),
                    f"capture-all", f"{run_name}-{function}",
                    includes=INCLUDES),
                # and retrieve it locally
                Pull(
                    remotepaths=f"{run_name}-{function}.tgz",
                    localpath=local_tar),
                ],
            scheduler=scheduler)
        local_nodedirs_tars.append((node_dir, local_tar))


    # retrieve tcpdump on CN
    SshJob(
        node=node_cn,
        commands=[
            tcpdump_cn_service.stop_command(),
            Pull(remotepaths=[tcpdump_cn_pcap],
                 localpath=local_path),
            ],
        scheduler=scheduler
        )

    print(10*'*', 'See collect scheduler in',
          scheduler.export_as_pngfile("cefore-collect"))

    if verbose:
        scheduler.list()

    if dry_run:
        return

    if not scheduler.run():
        print("KO")
        scheduler.debrief()
        return

    # unwrap
    for node_dir, tar in local_nodedirs_tars:
        print(f"Untaring {tar} in {node_dir}/")
        os.system(f"tar -C {node_dir} -xzf {tar}")
Example #27
0
    def main(self, *test_argv):  # pylint: disable=r0915,r0912,r0914,c0111
        self.parser = parser = argparse.ArgumentParser()
        # scope - on what hosts
        parser.add_argument(
            "-s",
            "--script",
            action='store_true',
            default=False,
            help=f"""If this flag is present, the first element of the remote
            command is assumed to be either the name of a local script, or,
            if this is not found, the body of a local script, that will be
            copied over before being executed remotely.
            In this case it should be executable.

            On the remote boxes it will be installed
            and run in the {default_remote_workdir} directory.
            """)
        parser.add_argument(
            "-i",
            "--includes",
            dest='includes',
            default=[],
            action='append',
            help="""for script mode only : a list of local files that are
            pushed remotely together with the local script,
            and in the same location; useful when you want to
            to run remotely a shell script that sources other files;
            remember that on the remote end all files (scripts and includes)
            end up in the same location""")
        parser.add_argument("-t",
                            "--target",
                            dest='targets',
                            action='append',
                            default=[],
                            help="""
            specify targets (additive); at least one is required;
            each target can be either
            * a space-separated list of hostnames
            * the name of a file containing hostnames
            * the name of a directory containing files named after hostnames;
            see e.g. the --mark option
            """)
        parser.add_argument("-x",
                            "--exclude",
                            dest='excludes',
                            action='append',
                            default=[],
                            help="""
            like --target, but for specifying exclusions;
            for now, no wildcard mechanism is supported here;
            also the order in which --target and --exclude options
            are mentioned does not matter;
            use --dry-run to only check for the list of applicable hosts
            """)
        # global settings
        parser.add_argument("-w",
                            "--window",
                            type=int,
                            default=0,
                            help="""
            specify how many connections can run simultaneously;
            default is no limit
            """)
        parser.add_argument(
            "-c",
            "--connect-timeout",
            dest='timeout',
            type=float,
            default=default_timeout,
            help=f"specify connection timeout, default is {default_timeout}s")
        # ssh settings
        parser.add_argument(
            "-l",
            "--login",
            default=default_username,
            help=f"remote user name - default is {default_username}")
        parser.add_argument("-k",
                            "--key",
                            dest='keys',
                            default=None,
                            action='append',
                            type=str,
                            help="""
            The default is for apssh to locate an ssh-agent
            through the SSH_AUTH_SOCK environment variable.
            If this cannot be found, or has an empty set of keys,
            then the user should specify private key file(s) - additive
            """)
        parser.add_argument("-K",
                            "--ok-if-no-key",
                            default=False,
                            action='store_true',
                            help="""
            When no key can be found, apssh won't even bother
            to try and connect. With this option it proceeds
            even with no key available.
            """)
        parser.add_argument("-g",
                            "--gateway",
                            default=None,
                            help="""
            specify a gateway for 2-hops ssh
            - either hostname or username@hostname
            """)
        # how to store results
        # terminal
        parser.add_argument("-r",
                            "--raw-format",
                            default=False,
                            action='store_true',
                            help="""
            produce raw result, incoming lines are shown as-is without hostname
            """)
        parser.add_argument(
            "-tc",
            "--time-colon-format",
            default=False,
            action='store_true',
            help="equivalent to --format '@time@:@host@:@line@")
        parser.add_argument("-f",
                            "--format",
                            default=None,
                            action='store',
                            help="""specify output format, which may include
* `strftime` formats like e.g. %%H-%%M, and one of the following:
* @user@ for the remote username,
* @host@ for the target hostname,
* @line@ for the actual line output (which contains the actual newline)
* @time@ is a shorthand for %%H-%%M-%%S""")
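        # an illustrative, hypothetical invocation using this option:
        #   apssh -t "host1 host2" -f '%H-%M-%S:@host@:@line@' hostname
        # would prefix each output line with a time stamp and the node name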

        # filesystem
        parser.add_argument("-o",
                            "--out-dir",
                            default=None,
                            help="specify directory where to store results")
        parser.add_argument("-d",
                            "--date-time",
                            default=None,
                            action='store_true',
                            help="use date-based directory to store results")
        parser.add_argument("-m",
                            "--mark",
                            default=False,
                            action='store_true',
                            help="""
            available with the -d and -o options only.

            When specified, then for all nodes there will be a file created
            in the output subdir, named either
            0ok/<hostname> for successful nodes,
            or 1failed/<hostname> for the other ones.

            This mark file will contain a single line with the returned code,
            or 'None' if the node was not reachable at all
            """)

        # usual stuff
        parser.add_argument("-n",
                            "--dry-run",
                            default=False,
                            action='store_true',
                            help="Only show details on selected hostnames")
        parser.add_argument("-v",
                            "--verbose",
                            action='store_true',
                            default=False)
        parser.add_argument("-D",
                            "--debug",
                            action='store_true',
                            default=False)
        parser.add_argument("-V",
                            "--version",
                            action='store_true',
                            default=False)

        # the commands to run
        parser.add_argument("commands",
                            nargs=argparse.REMAINDER,
                            type=str,
                            help="""
            command to run remotely.

            If the -s or --script option is provided, the first argument
            here should denote a (typically script) file **that must exist**
            on the local filesystem. This script is then copied over
            to the remote system and serves as the command for remote execution
            """)

        if test_argv:
            args = self.parsed_args = parser.parse_args(test_argv)
        else:
            args = self.parsed_args = parser.parse_args()

        # helpers
        if args.version:
            print(f"apssh version {apssh_version}")
            exit(0)

        # manual check for REMAINDER
        if not args.commands:
            print("You must provide a command to be run remotely")
            parser.print_help()
            exit(1)

        # load keys
        self.loaded_private_keys = load_private_keys(
            self.parsed_args.keys, args.verbose or args.debug)
        if not self.loaded_private_keys and not args.ok_if_no_key:
            print("Could not find any usable key - exiting")
            exit(1)

        # initialize a gateway proxy if --gateway is specified
        gateway = None
        if args.gateway:
            gwuser, gwhost = self.user_host(args.gateway)
            gateway = SshProxy(hostname=gwhost,
                               username=gwuser,
                               keys=self.loaded_private_keys,
                               formatter=self.get_formatter(),
                               timeout=self.parsed_args.timeout,
                               debug=self.parsed_args.debug)

        proxies = self.create_proxies(gateway)
        if args.verbose:
            print_stderr(f"apssh is working on {len(proxies)} nodes")

        window = self.parsed_args.window

        # populate scheduler
        scheduler = Scheduler(verbose=args.verbose)
        if not args.script:
            command_class = Run
            extra_kwds_args = {}
        else:
            # try RunScript
            command_class = RunScript
            extra_kwds_args = {'includes': args.includes}
            # but if the filename is not found then use RunString
            script = args.commands[0]
            if not Path(script).exists():
                if args.verbose:
                    print("Warning: file not found '{}'\n"
                          "=> Using RunString instead".format(script))
                command_class = RunString

        for proxy in proxies:
            scheduler.add(
                SshJob(node=proxy,
                       critical=False,
                       command=command_class(*args.commands,
                                             **extra_kwds_args)))

        # pylint: disable=w0106
        scheduler.jobs_window = window
        if not scheduler.run():
            scheduler.debrief()
        results = [job.result() for job in scheduler.jobs]

        ##########
        # print on stdout the name of the output directory
        # useful mostly with -d :
        subdir = self.get_formatter().run_name \
            if isinstance(self.get_formatter(), SubdirFormatter) \
            else None
        if subdir:
            print(subdir)

        # details on the individual retcodes - a bit hacky
        if self.parsed_args.debug:
            for proxy, result in zip(proxies, results):
                print(f"PROXY {proxy.hostname} -> {result}")
        # marks
        names = {0: '0ok', None: '1failed'}
        if subdir and self.parsed_args.mark:
            # do we need to create the subdirs
            need_ok = [s for s in results if s == 0]
            if need_ok:
                os.makedirs(f"{subdir}/{names[0]}", exist_ok=True)
            need_fail = [s for s in results if s != 0]
            if need_fail:
                os.makedirs(f"{subdir}/{names[None]}", exist_ok=True)

            for proxy, result in zip(proxies, results):
                prefix = names[0] if result == 0 else names[None]
                mark_path = Path(subdir) / prefix / proxy.hostname
                with mark_path.open("w") as mark:
                    mark.write(f"{result}\n")

        # xxx - when in gateway mode, the gateway proxy never gets disconnected
        # which probably is just fine

        # return 0 only if all hosts have returned 0
        # otherwise, return 1
        failures = [r for r in results if r != 0]
        overall = 0 if not failures else 1
        return overall
Example #28
0
 def test_logic2(self):
     todo = SshJob(node=self.gateway(),
                   commands=[Run("true"), Run("false")],
                   label="should fail")
     sched = Scheduler(todo, critical=False, verbose=True)
     self.assertFalse(sched.run())