def test_deferred_chain(self):
    """
    one command computes a string that gets passed to another one

    this is analogous to
        run1=$(ssh localhost echo from-first-run)
        final=$(ssh localhost echo ${run1})

    the 'final' variable is only needed for checking everything went well
    """
    s = Scheduler()
    env = Variables()
    n = SshNode(localhostname(), username=localuser())

    Sequence(
        SshJob(n,
               commands=Run("echo from-first-run",
                            capture=Capture('run1', env))),
        SshJob(n,
               commands=Run(Deferred("echo {{run1}}", env),
                            capture=Capture('final', env))),
        scheduler=s)

    s.run()
    # print(f"env={env}")

    obtained = env.final
    expected = "from-first-run"
    self.assertEqual(obtained, expected)
def test_deferred_service(self):
    """
    a service can be defined from a deferred instance
    rather than a plain string
    """
    s = Scheduler()
    env = Variables()
    echo_service = Service(
        Deferred("echo {{run1}}", env),
        service_id='echo',
        verbose=True)
    n = SshNode(localhostname(), username=localuser())

    Sequence(
        SshJob(n,
               commands=Run("echo from-first-run",
                            capture=Capture('run1', env))),
        SshJob(n,
               commands=Run(echo_service.start_command())),
        SshJob(n,
               commands=Run(echo_service.journal_command(since="10 second ago"),
                            capture=Capture('journal', env))),
        scheduler=s)

    print('STARTING', 20 * '-', echo_service.start_command())
    s.run()
    print('DONE', 20 * '-', echo_service.start_command())
    # print(f"env={env}")

    obtained = env.journal
    expected = "from-first-run"
    found = expected in obtained
    self.assertTrue(found)
def check_expansion(self, *deferred_expected_s):
    s = Scheduler()
    formatters = {}
    for deferred, _ in deferred_expected_s:
        formatters[deferred] = f = CaptureFormatter()
        f.start_capture()
        n = SshNode(localhostname(), username=localuser(), formatter=f)
        s.add(SshJob(node=n, commands=Run(deferred)))

    s.run()

    for deferred, expected in deferred_expected_s:
        captured = formatters[deferred].get_capture()
        self.assertEqual(captured, expected)
def hop1(self, hostname='localhost', username=None, *,
         c1, commands, s_command='echo hop1-{}-{}',
         nested_sched=(0, 1)):
    """
    create
    * <c1> connections to one node 1 hop away
    * and on each <commands> commands

    check current number of connections
    """
    if username is None:
        username = localuser()
    verbose(f"creating {c1} hop1-connections - "
            f"{commands} commands per conn - "
            f" to {username}@{hostname}")
    scheduler = Scheduler()
    nodes = []
    jobs = []
    for n in range(c1):
        node1 = SshNode(hostname, username=username,
                        formatter=ColonFormatter(verbose=False))
        nodes.append(node1)
        for c in range(commands):
            jobs.append(SshJob(node=node1,
                               command=s_command.format(n, c),
                               ))

    scheduler = self.populate_sched(scheduler, jobs,
                                    nested=nested_sched[0],
                                    pack_job=nested_sched[1])
    expected = c1

    # record base status
    in0, out0 = in_out_connections()
    verbose(f"INITIAL count in={in0} out={out0}")

    scheduler.export_as_pngfile("debug")
    topology_as_pngfile(scheduler, "topology")

    scheduler.run()

    in1, out1 = in_out_connections()
    verbose(f"AFTER RUN in={in1} out={out1}")
    self.assertEqual(in1-in0, expected)
    self.assertEqual(out1-out0, expected)

    arg = nodes

    # cleanup
    close_ssh_in_scheduler(scheduler)

    in1, out1 = in_out_connections()
    verbose(f"AFTER CLEANUP in={in1} out={out1}")
    self.assertEqual(in1-in0, 0)
    self.assertEqual(out1-out0, 0)
def test_hop_depth(self, hostname='localhost', username=None,
                   depth=4, commands=1):
    # Do not use close_nodes manually on this test, it does not keep
    # the order of the declared nodes.
    if username is None:
        username = localuser()
    verbose(f"creating hop{depth}-connections - "
            f"{commands} commands per conn "
            f" to {username}@{hostname}")
    scheduler = Scheduler(timeout=7)
    nodes = []
    jobs = []
    gateway = None
    for n in range(depth):
        node = SshNode(hostname, gateway=gateway, username=username,
                       formatter=ColonFormatter(verbose=False))
        nodes.append(node)
        gateway = node
        for c in range(commands):
            jobs.append(SshJob(node=node,
                               command=f"echo hop{n}-{c}",
                               scheduler=scheduler))

    expected = depth

    # record base status
    in0, out0 = in_out_connections()
    verbose(f"INITIAL count in={in0} out={out0}")

    # try:
    scheduler.run()
    # except Exception:
    #     pass

    in1, out1 = in_out_connections()
    verbose(f"AFTER RUN in={in1} out={out1}")
    self.assertEqual(in1-in0, expected)
    self.assertEqual(out1-out0, expected)

    # cleanup
    close_ssh_in_scheduler(scheduler)
    # let's wait a little bit to count
    time.sleep(1)

    in1, out1 = in_out_connections()
    verbose(f"AFTER CLEANUP in={in1} out={out1}")
    self.assertEqual(in1-in0, 0)
    self.assertEqual(out1-out0, 0)
def hop1(self, hostname='localhost', username=None, *, c1, commands):
    """
    create
    * <c1> connections to one node 1 hop away
    * and on each <commands> commands

    check current number of connections
    """
    if username is None:
        username = localuser()
    print(f"creating {c1} hop1-connections - "
          f"{commands} commands per conn - "
          f" to {username}@{hostname}")
    scheduler = Scheduler()
    nodes = []
    jobs = []
    for n in range(c1):
        node1 = SshNode(hostname, username=username,
                        formatter=ColonFormatter(verbose=False))
        nodes.append(node1)
        for c in range(commands):
            jobs.append(
                SshJob(node=node1,
                       command=f'echo hop1-{n}-{c}',
                       scheduler=scheduler))

    expected = c1

    # record base status
    in0, out0 = in_out_connections()
    print(f"INITIAL count in={in0} out={out0}")

    scheduler.run()

    in1, out1 = in_out_connections()
    print(f"AFTER RUN in={in1} out={out1}")
    self.assertEqual(in1 - in0, expected)
    self.assertEqual(out1 - out0, expected)

    # cleanup
    gathered = asyncio.get_event_loop().run_until_complete(
        asyncio.gather(*(node.close() for node in nodes)))

    in1, out1 = in_out_connections()
    print(f"AFTER CLEANUP in={in1} out={out1}")
    self.assertEqual(in1 - in0, 0)
    self.assertEqual(out1 - out0, 0)
def run(self, message_bus, timeout):
    """
    send verb to all nodes, waits for max timeout
    returns True if all nodes behaved as expected
    and False otherwise - including in case of KeyboardInterrupt
    """
    nodes = [
        Node(cmc_name, message_bus)
        for cmc_name in self.selector.cmc_names()
    ]
    jobs = [
        Job(self.get_and_show_verb(node, self.verb), critical=True)
        for node in nodes
    ]
    display = Display(nodes, message_bus)
    scheduler = Scheduler(Job(display.run(), forever=True, critical=True),
                          *jobs,
                          timeout=timeout,
                          critical=False)
    try:
        if scheduler.run():
            return True
        else:
            scheduler.debrief()
            print(f"rhubarbe-{self.verb} failed: {scheduler.why()}")
            return False
    except KeyboardInterrupt:
        print(f"rhubarbe-{self.verb} : keyboard interrupt - exiting")
        return False
def test_capture(self):
    s = Scheduler()
    f = CaptureFormatter()
    n = SshNode(localhostname(), username=localuser(), formatter=f)
    s.add(SshJob(node=n,
                 commands=[
                     Run("echo LINE1"),
                     Run("echo LINE2"),
                 ]))

    f.start_capture()
    s.run()
    captured = f.get_capture()

    expected = "LINE1\nLINE2\n"
    self.assertEqual(captured, expected)
def test_environment(self):
    needle_foo = 'xxx-foo-xxx'
    needle_bar = 'xxx-bar-xxx'

    scheduler = Scheduler()
    node = SshNode("localhost")
    env = Variables()
    service = Service("env",
                      service_id='echo-environ',
                      environ={
                          'FOO': needle_foo,
                          'BAR': needle_bar,
                      })
    SshJob(scheduler=scheduler,
           node=node,
           commands=[
               Run(service.start_command()),
               Run(service.journal_command(since='5s ago'),
                   capture=Capture('journal', env))
           ])

    self.assertEqual(scheduler.run(), True)
    self.assertTrue(f"FOO={needle_foo}" in env.journal)
    self.assertTrue(f"BAR={needle_bar}" in env.journal)
def test_nesting_sequence(self):
    expected_duration = 1.

    watch = Watch('test_nesting_sequence')
    subjob = Scheduler(
        Sequence(
            Job(co_print_sleep(watch, .2, "one")),
            Job(co_print_sleep(watch, .2, "two")),
            Job(co_print_sleep(watch, .2, "three")),
        ),
        watch=watch,
        label="sub-scheduler\non several lines",
        critical=True,
        forever=True,
    )
    main = Scheduler(
        Sequence(
            Job(co_print_sleep(watch, .2, "BEGIN"),
                label="job-label"),
            subjob,
            Job(co_print_sleep(watch, .2, "END")),
        ),
        watch=watch)

    print("===== test_nesting_sequence", "LIST with details")
    main.list(details=True)

    self.assertTrue(main.run())
    self.assertAlmostEqual(watch.seconds(), expected_duration, delta=.05)

    produce_png(main, "test_nesting_sequence")
def _allowed_retcod(self, allowed_exits, host="localhost", username=None):

    print(f"Testing allowed retcod allowed_exits={allowed_exits}")
    # global timeout
    total = 4
    # scheduled duration
    long = 1
    # the command always exits with this return code
    retcod = 1000

    if username is None:
        username = util.localuser()
    node = SshNode(host, username=username)

    scheduler = Scheduler(timeout=total, critical=False)
    SshJob(node=node, scheduler=scheduler,
           command=Run(f"sleep {long}; exit {retcod}",
                       allowed_exits=allowed_exits))

    expected = retcod in allowed_exits

    run = scheduler.run()
    scheduler.list()
    self.assertEqual(run, expected)
def _allowed_signal(self, allowed_exits, host="localhost", username=None):

    print(f"Testing allowed signal allowed_exits={allowed_exits}")
    # global timeout
    total = 4
    # scheduled duration
    long = 2
    # send signal after that amount
    short = 1
    # we always kill with TERM
    signal = "TERM"

    if username is None:
        username = util.localuser()
    node = SshNode(host, username=username)

    scheduler = Scheduler(timeout=total, critical=False)
    SshJob(node=node, scheduler=scheduler,
           command=Run(f"sleep {long}",
                       allowed_exits=allowed_exits))
    SshJob(node=node, scheduler=scheduler,
           command=f"sleep {short}; pkill -{signal} sleep")

    expected = signal in allowed_exits

    run = scheduler.run()
    scheduler.list()
    self.assertEqual(run, expected)
def run_one_job(self, job, *, details=False, expected=True):
    print(job)
    scheduler = Scheduler(job, verbose=True)
    orchestration = scheduler.run()
    scheduler.list(details=details)
    if not orchestration:
        scheduler.debrief()
    self.assertTrue(orchestration)
    if expected:
        self.assertEqual(job.result(), 0)
    else:
        self.assertNotEqual(job.result(), 0)
def main(self, reset, timeout):
    mainjob = Job(self.run(reset), critical=True)
    displayjob = Job(self.display.run(), forever=True, critical=True)
    scheduler = Scheduler(mainjob, displayjob,
                          timeout=timeout,
                          critical=False)

    try:
        is_ok = scheduler.run()
        if not is_ok:
            scheduler.debrief()
            self.display.set_goodbye(
                f"rhubarbe-save failed: {scheduler.why()}")
            return 1
        return 0 if mainjob.result() else 1
    except KeyboardInterrupt:
        self.display.set_goodbye("rhubarbe-save : keyboard interrupt, bye")
        return 1
    finally:
        self.cleanup()
def test_commands_verbose(self):
    dummy_path = "tests/dummy-10"
    dummy_file = Path(dummy_path).name
    scheduler = Scheduler()
    Sequence(
        SshJob(
            node=self.gateway(),
            verbose=True,
            commands=[
                Run("hostname"),
                RunScript("tests/script-with-args.sh", "arg1", "arg2"),
                RunString("for i in $(seq 3); do echo verbose$i; done"),
                Push(localpaths=dummy_path, remotepath="."),
                Pull(remotepaths=dummy_file, localpath=dummy_path + ".loop"),
            ]),
        SshJob(node=LocalNode(),
               critical=True,
               commands=Run("diff {x} {x}.loop".format(x=dummy_path),
                            verbose=True)),
        scheduler=scheduler)

    ok = scheduler.run()
    ok or scheduler.debrief()

    self.assertTrue(ok)
def global_check_image(self, _image, check_strings):
    # on the remaining nodes: check image marker
    self.print(f"checking {len(self.nodes)} nodes"
               f" against {check_strings} in /etc/rhubarbe-image")

    grep_pattern = "|".join(check_strings)
    check_command = (
        f"tail -1 /etc/rhubarbe-image | egrep -q '{grep_pattern}'")
    jobs = [
        SshJob(node=silent_sshnode(node, verbose=self.verbose),
               command=check_command,
               critical=False)
        for node in self.nodes
    ]

    scheduler = Scheduler(Job(self.display.run(), forever=True),
                          *jobs,
                          critical=False,
                          timeout=self.wait_timeout)
    if not scheduler.run():
        self.verbose and scheduler.debrief()    # pylint: disable=w0106

    # exclude nodes that have not behaved
    for node, job in zip(self.nodes, jobs):
        if not job.is_done() or job.raised_exception():
            self.verbose_msg(
                f"checking {grep_pattern}: something went badly wrong with {node}")
            message = None
            if exc := job.raised_exception():
                message = f"OOPS {type(exc)} {exc}"
            self.mark_and_exclude(node, Reason.CANT_CHECK_IMAGE, message)
            continue
        if not job.result() == 0:
            explanation = f"wrong image found on {node} - looking for {grep_pattern}"
            self.verbose_msg(explanation)
            self.mark_and_exclude(node, Reason.DID_NOT_LOAD, explanation)
            continue
        self.print(f"node {node} checked out OK")
def global_wait_ssh(self):
    # wait for nodes to be ssh-reachable
    self.print(f"waiting for {len(self.nodes)} nodes"
               f" (timeout={self.wait_timeout})")
    sshs = [SshWaiter(node, verbose=self.verbose) for node in self.nodes]
    jobs = [
        Job(ssh.wait_for(self.backoff), critical=False)
        for ssh in sshs
    ]

    scheduler = Scheduler(Job(self.display.run(), forever=True),
                          *jobs,
                          critical=False,
                          timeout=self.wait_timeout)
    if not scheduler.run():
        self.verbose and scheduler.debrief()    # pylint: disable=w0106

    # exclude nodes that have not behaved
    for node, job in zip(self.nodes, jobs):
        self.verbose_msg(
            f"node {node.id} wait_ssh_job -> done={job.is_done()}",
            f"exc={job.raised_exception()}")
        if exc := job.raised_exception():
            message = f"OOPS {type(exc)} {exc}"
            self.mark_and_exclude(node, Reason.WONT_SSH, message)
def test_graphics1(self):

    scheduler = Scheduler(critical=False)
    gateway = SshNode(hostname=localhostname(), username=localuser())

    Sequence(
        SshJob(
            node=gateway,
            command='hostname',
        ),
        SshJob(node=gateway,
               command=[
                   Run('ls /etc/passwd'),
                   Run('wc -l /etc/passwd'),
               ]),
        SshJob(node=gateway,
               commands=[
                   RunString(
                       "#!/usr/bin/env bash\n"
                       "echo with RunString on $(hostname) at $(date)"),
               ]),
        SshJob(node=gateway,
               commands=[
                   RunScript("tests/testbasic.sh"),
               ]),
        SshJob(node=gateway,
               commands=[
                   Run('wc -l /etc/passwd'),
                   RunString(
                       "#!/usr/bin/env bash\n"
                       "echo with RunString on $(hostname) at $(date)",
                       remote_name="show-host-date"),
                   RunScript("tests/testbasic.sh"),
               ]),
        SshJob(node=gateway,
               commands=[
                   RunString(
                       "#!/usr/bin/env bash\n"
                       "echo first arg is $1\n",
                       10)
               ]),
        SshJob(node=gateway,
               commands=[
                   RunString(
                       "#!/usr/bin/env bash\n"
                       "echo first arg is $1\n",
                       10,
                       remote_name='short-show-args')
               ]),
        SshJob(node=gateway,
               commands=[
                   RunString(
                       "#!/usr/bin/env bash\n"
                       "echo first arg is $1\n"
                       "echo second arg is $2\n"
                       "echo third arg is $3\n"
                       "echo fourth arg is $4\n",
                       100, 200, 300, 400)
               ]),
        SshJob(node=gateway,
               commands=[
                   RunString(
                       "#!/usr/bin/env bash\n"
                       "echo first arg is $1\n"
                       "echo second arg is $2\n"
                       "echo third arg is $3\n"
                       "echo fourth arg is $4\n",
                       1000, 2000, 3000, 4000,
                       remote_name='long-show-args')
               ]),
        SshJob(node=gateway,
               commands=[
                   RunString(
                       "#!/usr/bin/env bash\n"
                       "echo first arg is $1\n"
                       "echo second arg is $2\n"
                       "echo third arg is $3\n"
                       "echo fourth arg is $4\n",
                       1000, 2000, 3000, 4000,
                       remote_name='long-show-args',
                       label='snip')
               ]),
        SshJob(node=gateway,
               commands=[
                   Run("hostname", label="Run()"),
                   RunScript("foobar", label="RunScript()"),
                   RunString("foobar", label="RunString()"),
                   Push("foobar", remotepath="remote", label="Push()"),
                   Pull("remote", localpath="foobar", label="Pull()"),
                   Run("hostname", label=None),
                   RunScript("foobar", label=[]),
                   RunString("foobar", label=0),
                   Push("foobar", remotepath="remote", label={}),
                   Pull("remote", localpath="foobar", label=""),
               ]),
        scheduler=scheduler,
    )

    print("NO DETAILS")
    scheduler.list()
    print("WITH DETAILS")
    scheduler.list(details=True)
    produce_png(scheduler, "test_graphics1")

    ok = scheduler.run()
    self.assertFalse(ok)
def _simple(self, forever):
    storage = f"/root/TCPDUMP-{forever}.pcap"
    status = f"/root/TCPDUMP-{forever}.status"

    tcpdump = Service(f"tcpdump -i lo -w {storage}",
                      service_id='tcpdump',
                      verbose=True)

    monitor = ProcessMonitor()

    scheduler = Scheduler()
    node = SshNode("localhost")

    SshJob(node, scheduler=scheduler,
           command=tcpdump.start_command(),
           forever=forever)

    Sequence(
        SshJob(node, command="sleep 1"),
        SshJob(node, command=tcpdump.status_command(output=status)),
        SshJob(node, command="sleep 1"),
        SshJob(node, command=tcpdump.stop_command()),
        # could use a pull to retrieve both files but that's not required
        # since we run on localhost, so keep tests simple
        scheduler=scheduler,
    )

    # cleanup before we run
    # use a list, not a generator, since we iterate over it twice
    paths = [Path(x) for x in (storage, status)]
    for path in paths:
        if path.exists():
            path.unlink()
        self.assertFalse(path.exists())

    produce_png(scheduler, f"service-{forever}")

    self.assertTrue(scheduler.run())
    scheduler.list()

    for path in paths:
        self.assertTrue(path.exists())

    with Path(status).open() as feed:
        contents = feed.read()
        for needle in ('Loaded: loaded', 'Active: active'):
            self.assertTrue(contents.find(needle) >= 0)

    close_ssh_in_scheduler(scheduler)

    # let it settle for a short while, and check the process space
    import time
    time.sleep(0.5)
    monitor.difference()
    news = monitor.news
    if news:
        print(f"we have {len(news)} new processes, {news}")
        ps_command = "ps " + " ".join(str(pid) for pid in news)
        import os
        os.system(ps_command)
    self.assertEqual(len(news), 0)
def hop2(self, hostname='localhost', username=None, *, c1=1, c2=1, commands=1):
    """
    create
    * <c1> connections to one node 1 hop away
    * on each one, <c2> connections one hop behind
    * and on each <commands> commands

    check current number of connections
    """
    if username is None:
        username = localuser()
    print(f"creating {c1}x{c2} hop2-connections - "
          f"{commands} commands per conn "
          f" to {username}@{hostname}")
    scheduler = Scheduler()
    nodes1 = []
    nodes2 = []
    jobs = []
    for n in range(c1):
        node1 = SshNode(hostname, username=username,
                        formatter=ColonFormatter(verbose=False))
        nodes1.append(node1)
        for m in range(c2):
            node2 = SshNode(hostname, username=username, gateway=node1,
                            formatter=ColonFormatter(verbose=False))
            nodes2.append(node2)
            for c in range(commands):
                jobs.append(
                    SshJob(node=node2,
                           command=f'echo hop1-{n}-{m}-{c}',
                           scheduler=scheduler))

    # for each hop1 conn, there are 1 hop1 + c2 hop2 connections alive
    expected = c1 * (c2 + 1)

    # record base status
    in0, out0 = in_out_connections()
    print(f"INITIAL count in={in0} out={out0}")

    scheduler.run()

    in1, out1 = in_out_connections()
    print(f"AFTER RUN in={in1} out={out1}")
    self.assertEqual(in1 - in0, expected)
    self.assertEqual(out1 - out0, expected)

    # cleanup
    # would be nice to find a way to check that the result
    # holds no matter in what order the cleanup is done
    for nodeset in nodes1, nodes2:
        gathered = asyncio.get_event_loop().run_until_complete(
            asyncio.gather(*(node.close() for node in nodeset)))

    in1, out1 = in_out_connections()
    print(f"AFTER CLEANUP in={in1} out={out1}")
    self.assertEqual(in1 - in0, 0)
    self.assertEqual(out1 - out0, 0)
def main(argv): if len(argv) == 3: print("!! Unfinished routines !!") else: print("++ Using default settings ++") ########################### ## Local Variables # platform='multiGPU' platform='distributed' gateway_user='******' gateway_host='gw_host' node_username='******' ######################################################### ## Distributed Requirements num_ps = 1 num_workers = 2 ######################################################### gateway = SshNode( gateway_host, username=gateway_user ) ########################################################## elif platform == 'distributed': ## Jetson-TX2 Cluster hosts = [cluster_ip_host] ######################################################### ## Use the Server node for processing the first satge Data-mining server = ResourceManager._set_Node(master_host, master_user, gateway,) ############################ # Push the launch file (run_splitpoint) # With the Parameters Connfiguration on the server # To execute the First Satege in this host job_launch_S1 = SshJob( node = server, commands = [ ## Run the script locate in the laptop RunScript("run_dataspworkers_mlp.sh", platform, num_ps, num_workers), Run("echo Split Data DONE"), ], ) ############################# ## A collection of the PS node ps = [] [ps.append(ResourceManager._set_Node(hosts[i], node_username, gateway,)) for i in range(num_ps)] ############################# ## A collection of the workers node workers = [] [workers.append(ResourceManager._set_Node(hosts[num_ps+i], node_username, gateway,)) for i in range(num_workers)] ######################################################### ## Setting Parameters for the First Stage FEATURES_NAME = "FULL-W1_x1_x2_x3_x4_x5_x7_x8_Y1" SANDBOX=str("/data_B/datasets/drg-PACA/healthData/sandbox-"+FEATURES_NAME) YEAR=str(2008) ## Stage 1 # localdir = "/1_Mining-Stage/" # SP_Dir_X = str(SANDBOX+localdir+"BPPR-"+FEATURES_NAME+"-"YEAR) ############################# ## Setting parameters for the Second Stage S_PLOINT = str(3072) #1536) #SP_ARGV = str(S_PLOINT+"-"+platform) SP_ARGV = platform+"-"+str(num_workers) SP2=str(SANDBOX+"/2_Split-Point-"+SP_ARGV+"/") ############################# ## BPPR Directories dir_train = "/data_training/" dir_valid = "/data_valid/" dir_test = "/data_test/" ############################ ## Worker data management worker_healthData = "/opt/diagnosenet/healthData/" worker_sandbox = str(worker_healthData+"/sandbox-"+FEATURES_NAME) worker_splitpoint = str(worker_sandbox+"/2_Split-Point-"+SP_ARGV+"/") worker_train = str(worker_splitpoint+dir_train) worker_valid = str(worker_splitpoint+dir_valid) worker_test = str(worker_splitpoint+dir_test) ############################ ## Worker commands mkd_worker_sandbox = str("mkdir"+" "+worker_sandbox) mkd_worker_splitpoint = str("mkdir"+" "+worker_splitpoint) mkd_worker_train = str("mkdir"+" "+worker_train) mkd_worker_valid = str("mkdir"+" "+worker_valid) mkd_worker_test = str("mkdir"+" "+worker_test) ############################# ## Create a JOB to build the sandbox for each Worker job_build_sandbox = [] [ job_build_sandbox.append(SshJob( node = workers[i], commands = [ RunString(mkd_worker_sandbox), RunString(mkd_worker_splitpoint), RunString(mkd_worker_train), RunString(mkd_worker_valid), RunString(mkd_worker_test), Run("echo SANDBOX ON WORKER DONE"), ], )) for i in range(len(workers)) ] ############################# ## Create a command for transfer data scp = "scp" cmd_X_train_transfer = [] cmd_y_train_transfer = [] cmd_X_valid_transfer = [] cmd_y_valid_transfer = [] cmd_X_test_transfer = [] 
cmd_y_test_transfer = [] for i in range(num_workers): worker_host = str(node_user+"@"+ hosts[num_ps+i] +":") num_file = str(i+1) ## Commands to transfer Training dataset X_train_splitted = str(SP2+dir_train+"X_training-"+FEATURES_NAME+"-"+YEAR+"-"+num_file+".txt") cmd_X_train_transfer.append(str(scp+" "+X_train_splitted+" "+worker_host+worker_train)) y_train_splitted = str(SP2+dir_train+"y_training-"+FEATURES_NAME+"-"+YEAR+"-"+num_file+".txt") cmd_y_train_transfer.append(str(scp+" "+y_train_splitted+" "+worker_host+worker_train)) ## Commands to transfer Validation dataset X_valid_splitted = str(SP2+dir_valid+"X_valid-"+FEATURES_NAME+"-"+YEAR+"-"+num_file+".txt") cmd_X_valid_transfer.append(str(scp+" "+X_valid_splitted+" "+worker_host+worker_valid)) y_valid_splitted = str(SP2+dir_valid+"y_valid-"+FEATURES_NAME+"-"+YEAR+"-"+num_file+".txt") cmd_y_valid_transfer.append(str(scp+" "+y_valid_splitted+" "+worker_host+worker_valid)) ## Commands to transfer Test dataset X_test_splitted = str(SP2+dir_test+"X_test-"+FEATURES_NAME+"-"+YEAR+"-"+num_file+".txt") cmd_X_test_transfer.append(str(scp+" "+X_test_splitted+" "+worker_host+worker_test)) y_test_splitted = str(SP2+dir_test+"y_test-"+FEATURES_NAME+"-"+YEAR+"-"+num_file+".txt") cmd_y_test_transfer.append(str(scp+" "+y_test_splitted+" "+worker_host+worker_test)) ############################ ## Build a JOB for transfering data to each worker sandbox job_data_transfer = [] [job_data_transfer.append(SshJob( node = server, commands = [ RunString(cmd_X_train_transfer[i]), RunString(cmd_y_train_transfer[i]), Run("echo SENDER TRAINING DATA DONE"), RunString(cmd_X_valid_transfer[i]), RunString(cmd_y_valid_transfer[i]), Run("echo SENDER VALID DATA DONE"), RunString(cmd_X_test_transfer[i]), RunString(cmd_y_test_transfer[i]), Run("echo SENDER TEST DATA DONE"), ],) ) for i in range(len(workers))] ######################################################### ## Create a sequence orchestration scheduler instance upfront worker_seq = [] ## Add the Stage-1 JOB into Scheduler worker_seq.append(Scheduler(Sequence( job_launch_S1))) ## Add the worker JOBs into Scheduler [worker_seq.append(Scheduler(Sequence( job_build_sandbox[i], job_data_transfer[i], )) ) for i in range(len(workers))] ############################# ## Old method ## Add the JOB PS Replicas into Scheduler # worker_seq.append(Scheduler(Sequence( # job_PS_replicas))) # # ## Add the JOB WORKER Replicas into Scheduler # worker_seq.append(Scheduler(Sequence( # job_WORKER_replicas))) ############################# ## Run the Sequence JOBS # [seq.orchestrate() for seq in worker_seq] ######################################################### ######################################################### ## Push the launch file (run_secondstage_distributed) ## With the Distributed Parameters for each worker replicas ## To distributed training of Unsupervised Embedding ############################# ## Build a collection of TensorFlow Hosts for PS tf_ps = [] [tf_ps.append(str(hosts[i]+":2222")) for i in range(num_ps)] # print("+++ tf_ps: {}".format(tf_ps)) tf_ps=','.join(tf_ps) ############################# ## Build a collection of TensorFlow Hosts for workers tf_workers = [] [tf_workers.append(str(hosts[num_ps+i]+":2222")) for i in range(num_workers)] # print("+++ tf_workers: {}".format(tf_workers)) tf_workers=','.join(tf_workers) job_PS_replicas = [] [job_PS_replicas.append(SshJob( node = ps[i], commands = [ ## Launches local script to execute on cluster # RunScript("run_secondstage_distributed.sh", # platform, tf_ps, 
tf_workers, # num_ps, num_workers, "ps", i), RunScript("run_thirdstage_distributed_mlp.sh", platform, tf_ps, tf_workers, num_ps, num_workers, "ps", i), Run("echo PS REPLICA DONE"), ],) ) for i in range(len(ps))] job_WORKER_replicas = [] [job_WORKER_replicas.append(SshJob( node = workers[i], commands = [ ## Launches local script to execute on cluster # RunScript("run_secondstage_distributed.sh", # platform, tf_ps, tf_workers, # num_ps, num_workers, "worker", i), RunScript("run_thirdstage_distributed_mlp.sh", platform, tf_ps, tf_workers, num_ps, num_workers, "worker", i), Run("echo WORKER REPLICA DONE"), ], ) ) for i in range(len(workers))] ############################# ### Simultaneous jobs s_distraining = Scheduler() [s_distraining.add(job_PS_replicas[i]) for i in range(len(ps))] [s_distraining.add(job_WORKER_replicas[i]) for i in range(len(workers))] s_distraining.run(jobs_window = int(num_ps+num_workers+1))
def hop2(self, hostname='localhost', username=None, *, c1=1, c2=1, commands=1,
         s_command='echo hop2-{}-{}-{}', nested_sched=(0, 1)):
    """
    create
    * <c1> connections to one node 1 hop away
    * on each one, <c2> connections one hop behind
    * and on each <commands> commands

    check current number of connections
    """
    if username is None:
        username = localuser()
    verbose(f"creating {c1}x{c2} hop2-connections - "
            f"{commands} commands per conn "
            f" to {username}@{hostname}")
    scheduler = Scheduler(timeout=7)
    nodes = []
    # nodes2 = []
    jobs = []
    for n in range(c1):
        node1 = SshNode(hostname, username=username,
                        formatter=ColonFormatter(verbose=False))
        nodes.append(node1)
        for m in range(c2):
            node2 = SshNode(hostname, username=username, gateway=node1,
                            formatter=ColonFormatter(verbose=False))
            nodes.append(node2)
            for c in range(commands):
                jobs.append(SshJob(node=node2,
                                   command=s_command.format(n, m, c),
                                   ))

    scheduler = self.populate_sched(scheduler, jobs,
                                    nested=nested_sched[0],
                                    pack_job=nested_sched[1])
    # for each hop1 conn, there are 1 hop1 + c2 hop2 connections alive
    expected = c1 * (c2+1)

    scheduler.export_as_pngfile("debug")
    topology_as_pngfile(scheduler, "topology")

    # record base status
    in0, out0 = in_out_connections()
    verbose(f"INITIAL count in={in0} out={out0}")

    # try:
    scheduler.run()
    # except Exception:
    #     pass

    in1, out1 = in_out_connections()
    verbose(f"AFTER RUN in={in1} out={out1}")
    self.assertEqual(in1-in0, expected)
    self.assertEqual(out1-out0, expected)

    # cleanup
    close_ssh_in_scheduler(scheduler)
    # let's wait a little bit to count
    time.sleep(1)

    in1, out1 = in_out_connections()
    verbose(f"AFTER CLEANUP in={in1} out={out1}")
    self.assertEqual(in1-in0, 0)
    self.assertEqual(out1-out0, 0)
def one_run(*, protocol, interference, run_name=default_run_name, slicename=default_slicename, tx_power, phy_rate, antenna_mask, channel, load_images=False, node_ids=DEFAULT_NODE_IDS, src_ids=DEFAULT_SRC_IDS, dest_ids=DEFAULT_DEST_IDS, scrambler_id=DEFAULT_SCRAMBLER_ID, tshark=False, map=False, warmup=False, route_sampling=False, iperf=False, verbose_ssh=False, verbose_jobs=False, dry_run=False, run_number=None): """ Performs data acquisition on all nodes with the following settings Arguments: tx_power: in dBm, a string like 5, 10 or 14. Corresponds to the transmission power. phy_rate: a string among 1, 54. Correspond to the wifi rate. antenna_mask: a string among 1, 3, 7. channel: a string like e.g. 1 or 40. Correspond to the channel. protocol: a string among batman , olsr. Correspond to the protocol interference : in amplitude percentage, a string like 15 or 20. Correspond to the power of the noise generated in the spectrum. Can be either None or "None" to mean no interference. run_name: the name for a subdirectory where all data will be kept successive runs should use the same name for further visualization slicename: the Unix login name (slice name) to enter the gateway load_images: a boolean specifying whether nodes should be re-imaged first node_ids: a list of node ids to run the scenario against; strings or ints are OK; tshark: a boolean specifying wether we should format/parse the .pcap. map: a boolean specifying wether we should fetch/parse the route tables of the nodes. warmup: a boolean specifying whether we should run a ping before the experiment to be certain of the stabilisation on the network. src_ids: a list of nodes from which we will launch the ping from. strings or ints are OK. ping_messages : the number of ping packets that will be generated """ # set default for the nodes parameter node_ids = ([int(id) for id in node_ids] if node_ids is not None else DEFAULT_NODE_IDS) src_ids = ([int(id) for id in src_ids] if src_ids is not None else DEFAULT_SRC_IDS) dest_ids = ([int(id) for id in dest_ids] if dest_ids is not None else DEFAULT_NODE_IDS) # all nodes - i.e. 
including sources and destinations - # need to run the protocol node_ids = list(set(node_ids).union(set(src_ids).union(set(dest_ids)))) if interference == "None": interference = None # open result dir no matter what run_root = naming_scheme( run_name=run_name, protocol=protocol, interference=interference, autocreate=True) # fix me trace = run_root / f"trace-{%m-%d-%H-%M}" ref_time = apssh_time() trace = run_root / f"trace-{ref_time}" try: with trace.open('w') as feed: def log_line(line): time_line(line, file=feed) load_msg = f"{'WITH' if load_images else 'NO'} image loading" interference_msg = (f"interference={interference} " f"from scrambler={scrambler_id}") nodes = " ".join(str(n) for n in node_ids) srcs = " ".join(str(n) for n in src_ids) dests = " ".join(str(n) for n in dest_ids) ping_labels = [ f"PING {s} ➡︎ {d}" for s in src_ids # and on the destination for d in dest_ids if d != s ] log_line(f"output in {run_root}") log_line(f"trace in {trace}") log_line(f"protocol={protocol}") log_line(f"{load_msg}") log_line(f"{interference_msg}") log_line("----") log_line(f"Selected nodes : {nodes}") log_line(f"Sources : {srcs}") log_line(f"Destinations : {dests}") for label in ping_labels: log_line(f"{label}") log_line("----") for feature in ('warmup', 'tshark', 'map', 'route_sampling', 'iperf'): log_line(f"Feature {feature}: {locals()[feature]}") except Exception as exc: print(f"Cannot write into {trace} - aborting this run") print(f"Found exception {type(exc)} - {exc}") return False # # dry-run mode # just display a one-liner with parameters # prelude = "" if not dry_run else "dry_run:" with trace.open() as feed: print(f"**************** {ref_time} one_run #{run_number}:") for line in feed: print(prelude, line, sep='', end='') if dry_run: return True # the nodes involved faraday = SshNode(hostname=default_gateway, username=slicename, formatter=TimeColonFormatter(), verbose=verbose_ssh) # this is a python dictionary that allows to retrieve a node object # from an id node_index = { id: SshNode(gateway=faraday, hostname=fitname(id), username="******", formatter=TimeColonFormatter(), verbose=verbose_ssh) for id in node_ids } # extracts for sources and destinations src_index = {id:node for (id, node) in node_index.items() if id in src_ids} dest_index = {id:node for (id, node) in node_index.items() if id in dest_ids} if interference: node_scrambler = SshNode( gateway=faraday, hostname=fitname(scrambler_id), username="******", formatter=TimeColonFormatter(), verbose=verbose_ssh) # the global scheduler scheduler = Scheduler(verbose=verbose_jobs) ########## check_lease = SshJob( scheduler=scheduler, node=faraday, verbose=verbose_jobs, label="rhubarbe check lease", command=Run("rhubarbe leases --check", label="rlease"), ) # load images if requested green_light = check_lease # at some point we did not load the scrambler if interference was None # and that was a way to run faster loads with no interference # but now we always load the scrambler node with gnuradio # this is because when we do runs.py -i None 15 30 ... # then the first call to one_run is with interference being None # but it is still important to load the scrambler if load_images: # copy node_ids load_ids = node_ids[:] load_ids.append(scrambler_id) # the nodes that we **do not** use should be turned off # so if we have selected e.g. 
nodes 10 12 and 15, we will do # rhubarbe off -a ~10 ~12 ~15, meaning all nodes except 10, 12 and 15 negated_node_ids = [f"~{id}" for id in load_ids] # we can do these three things in parallel ready_jobs = [ SshJob(node=faraday, required=green_light, scheduler=scheduler, verbose=verbose_jobs, command=Run("rhubarbe", "off", "-a", *negated_node_ids, label="turn off unused nodes")), SshJob(node=faraday, required=green_light, scheduler=scheduler, verbose=verbose_jobs, label="load batman image", command=Run("rhubarbe", "load", "-i", "batman-olsr", *node_ids, label=f"load ubuntu on {node_ids}")), SshJob( node=faraday, required=green_light, scheduler=scheduler, verbose=verbose_jobs, label="load gnuradio image", command=Run("rhubarbe", "load", "-i", "batman-olsr-gnuradio", scrambler_id, label=f"load gnuradio on {scrambler_id}")), ] # replace green_light in this case green_light = SshJob( node=faraday, required=ready_jobs, scheduler=scheduler, verbose=verbose_jobs, label="wait for nodes to come up", command=Run("rhubarbe", "wait", *load_ids)) ########## # setting up the wireless interface on all nodes # # provide node-utilities with the ranges/units it expects frequency = channel_frequency[int(channel)] # tx_power_in_mBm not in dBm tx_power_driver = tx_power * 100 #just in case somme services failed in the previous experiment reset_failed_services_job = [ SshJob( node=node, verbose=verbose_jobs, label="reset failed services", command=Run("systemctl reset-failed", label="reset-failed services")) for id, node in node_index.items() ] reset_failed_services = Scheduler( *reset_failed_services_job, scheduler=scheduler, required=green_light, verbose=verbose_jobs, label="Reset failed services") init_wireless_sshjobs = [ SshJob( node=node, verbose=verbose_jobs, label=f"init {id}", command=RunScript( "node-utilities.sh", f"init-ad-hoc-network-{WIRELESS_DRIVER}", WIRELESS_DRIVER, "foobar", frequency, phy_rate, antenna_mask, tx_power_driver, label="init add-hoc network"), ) for id, node in node_index.items()] init_wireless_jobs = Scheduler( *init_wireless_sshjobs, scheduler=scheduler, required=green_light, verbose=verbose_jobs, label="Initialisation of wireless chips") if interference: # Run uhd_siggen with the chosen power init_scrambler_job = SshJob( scheduler=scheduler, required=green_light, forever=True, node=node_scrambler, verbose=verbose_jobs, #TODO : If exit-signal patch is done add exit-signal=["TERM"] # to this run object and call uhd_siggen directly commands=[RunScript("node-utilities.sh", "init-scrambler", label="init scrambler"), Run(f"systemd-run --unit=uhd_siggen -t ", f"uhd_siggen -a usrp -f {frequency}M", f"--sine --amplitude 0.{interference}", label="systemctl start uhd_siggen") ] ) green_light = [init_wireless_jobs, reset_failed_services] # then install and run batman on fit nodes run_protocol_job = [ SshJob( # scheduler=scheduler, node=node, label=f"init and run {protocol} on fit node {id}", verbose=verbose_jobs, # CAREFUL : These ones use sytemd-run # with the ----service-type=forking option! command=RunScript("node-utilities.sh", f"run-{protocol}", label=f"run {protocol}"), ) for id, node in node_index.items()] run_protocol = Scheduler( *run_protocol_job, scheduler=scheduler, required=green_light, verbose=verbose_jobs, label="init and run routing protocols") green_light = run_protocol # after that, run tcpdump on fit nodes, this job never ends... 
if tshark: run_tcpdump_job = [ SshJob( # scheduler=scheduler_monitoring, node=node, forever=True, label=f"run tcpdump on fit node {id}", verbose=verbose_jobs, command=[ Run("systemd-run -t --unit=tcpdump", f"tcpdump -U -i moni-{WIRELESS_DRIVER}", f"-y ieee802_11_radio -w /tmp/fit{id}.pcap", label=f"tcpdump {id}") ] ) for id, node in node_index.items() ] run_tcpdump = Scheduler( *run_tcpdump_job, scheduler=scheduler, required=green_light, forever=True, verbose=verbose_jobs, label="Monitoring - tcpdumps") # let the wireless network settle settle_scheduler = Scheduler( scheduler=scheduler, required=green_light, ) if warmup: # warmup pings don't need to be sequential, so let's # do all the nodes at the same time # on a given node though, we'll ping the other ends sequentially # see the graph for more warmup_jobs = [ SshJob( node=node_s, verbose=verbose_jobs, commands=[ RunScript("node-utilities.sh", "my-ping", f"10.0.0.{d}", warmup_ping_timeout, warmup_ping_interval, warmup_ping_size, warmup_ping_messages, f"warmup {s} ➡︎ {d}", label=f"warmup {s} ➡︎ {d}") for d in dest_index.keys() if s != d ] ) # for each selected experiment nodes for s, node_s in src_index.items() ] warmup_scheduler = Scheduler( *warmup_jobs, scheduler=settle_scheduler, verbose=verbose_jobs, label="Warmup pings") settle_wireless_job2 = PrintJob( "Let the wireless network settle after warmup", sleep=settle_delay_shorter, scheduler=settle_scheduler, required=warmup_scheduler, label=f"settling-warmup for {settle_delay_shorter} sec") # this is a little cheating; could have gone before the bloc above # but produces a nicer graphical output # we might want to help asynciojobs if it offered a means # to specify entry and exit jobs in a scheduler settle_wireless_job = PrintJob( "Let the wireless network settle", sleep=settle_delay_long, scheduler=settle_scheduler, label=f"settling for {settle_delay_long} sec") green_light = settle_scheduler if iperf: iperf_service_jobs = [ SshJob( node=node_d, verbose=verbose_jobs, forever=True, commands=[ Run("systemd-run -t --unit=iperf", "iperf -s -p 1234 -u", label=f"iperf serv on {d}"), ], ) for d, node_d in dest_index.items() ] iperf_serv_sched = Scheduler( *iperf_service_jobs, verbose=verbose_jobs, label="Iperf Servers", # for a nicer graphical output # otherwise the exit arrow # from scheduler 'iperf mode' # to job 'settling for 60s' # gets to start from this box forever=True, ) iperf_cli = [ SshJob( node=node_s, verbose=verbose_jobs, commands=[ Run("sleep 7", label=""), Run(f"iperf", f"-c 10.0.0.{d} -p 1234", f"-u -b {phy_rate}M -t 60", f"-l 1024 > IPERF-{s:02d}-{d:02d}", label=f"run iperf {s} ➡︎ {d}") ] ) for s, node_s in src_index.items() for d, node_d in dest_index.items() if s != d ] iperf_cli_sched = Scheduler( Sequence(*iperf_cli), verbose=verbose_jobs, label="Iperf Clients") iperf_stop = [ SshJob(node=node_d, verbose=verbose_jobs, label=f"Stop iperf on {d}", command=Run("systemctl stop iperf")) for d, node_d in dest_index.items() ] iperf_stop_sched = Scheduler( *iperf_stop, required=iperf_cli_sched, verbose=verbose_jobs, label="Iperf server stop") iperf_fetch = [ SshJob(node=node_s, verbose=verbose_jobs, command=Pull( remotepaths=[f"IPERF-{s:02d}-{d:02d}"], localpath=str(run_root), label="fetch iperf {s} ➡︎ {d}") ) for s, node_s in src_index.items() for d, node_d in dest_index.items() if s != d ] iperf_fetch_sched = Scheduler( *iperf_fetch, required=iperf_stop_sched, verbose=verbose_jobs, label="Iperf fetch report") iperf_jobs = [iperf_serv_sched, iperf_cli_sched, iperf_stop_sched, 
iperf_fetch_sched] iperf_sched = Scheduler( *iperf_jobs, scheduler=scheduler, required=green_light, verbose=verbose_jobs, label="Iperf Module") settle_wireless_job_iperf = PrintJob( "Let the wireless network settle", sleep=settle_delay_shorter, scheduler=scheduler, required=iperf_sched, label=f"settling-iperf for {settle_delay_shorter} sec") green_light = settle_wireless_job_iperf # create all the tracepath jobs from the first node in the list if map: map_jobs = [ SshJob( node=node, label=f"Generating ROUTE file for proto {protocol} on node {id}", verbose=verbose_jobs, commands=[ RunScript(f"node-utilities.sh", f"route-{protocol}", f"> ROUTE-TABLE-{id:02d}", label="get route table"), Pull(remotepaths=[f"ROUTE-TABLE-{id:02d}"], localpath=str(run_root), label="") ], ) for id, node in node_index.items() ] map_scheduler = Scheduler( *map_jobs, scheduler=scheduler, required=green_light, verbose=verbose_jobs, label="Snapshoting route files") green_light = map_scheduler if route_sampling: route_sampling_jobs = [ SshJob( node=node, label=f"Route sampling service for proto {protocol} on node {id}", verbose=False, forever=True, commands=[ Push(localpaths=["route-sample-service.sh"], remotepath=".", label=""), Run("chmod +x route-sample-service.sh", label=""), Run("systemd-run -t --unit=route-sample", "/root/route-sample-service.sh", "route-sample", f"ROUTE-TABLE-{id:02d}-SAMPLED", protocol, label="start route-sampling"), ], ) for id, node in node_index.items() ] route_sampling_scheduler = Scheduler( *route_sampling_jobs, scheduler=scheduler, verbose=False, forever=True, label="Route Sampling services launch", required=green_light) ########## # create all the ping jobs, i.e. max*(max-1)/2 # this again is a python list comprehension # see the 2 for instructions at the bottom # # notice that these SshJob instances are not yet added # to the scheduler, we will add them later on # depending on the sequential/parallel strategy pings_job = [ SshJob( node=node_s, verbose=verbose_jobs, commands=[ Run(f"echo actual ping {s} ➡︎ {d} using {protocol}", label=f"ping {s} ➡︎ {d}"), RunScript("node-utilities.sh", "my-ping", f"10.0.0.{d}", ping_timeout, ping_interval, ping_size, ping_messages, f"actual {s} ➡︎ {d}", ">", f"PING-{s:02d}-{d:02d}", label=""), Pull(remotepaths=[f"PING-{s:02d}-{d:02d}"], localpath=str(run_root), label=""), ], ) # for each selected experiment nodes for s, node_s in src_index.items() for d, node_d in dest_index.items() if s != d ] pings = Scheduler( scheduler=scheduler, label="PINGS", verbose=verbose_jobs, required=green_light) # retrieve all pcap files from fit nodes stop_protocol_job = [ SshJob( # scheduler=scheduler, node=node, # required=pings, label=f"kill routing protocol on {id}", verbose=verbose_jobs, command=RunScript(f"node-utilities.sh", f"kill-{protocol}", label=f"kill-{protocol}"), ) for id, node in node_index.items() ] stop_protocol = Scheduler( *stop_protocol_job, scheduler=scheduler, required=pings, label="Stop routing protocols", ) if tshark: retrieve_tcpdump_job = [ SshJob( # scheduler=scheduler, node=nodei, # required=pings, label=f"retrieve pcap trace from fit{i:02d}", verbose=verbose_jobs, commands=[ Run("systemctl stop tcpdump", label="stop tcpdump"), #Run("systemctl reset-failed tcpdump"), #RunScript("node-utilities.sh", "kill-tcpdump", # label="kill-tcpdump"), Run( f"echo retrieving pcap trace and result-{i}.txt from fit{i:02d}", label=""), Pull(remotepaths=[f"/tmp/fit{i}.pcap"], localpath=str(run_root), label=""), ], ) for i, nodei in node_index.items() ] 
retrieve_tcpdump = Scheduler( *retrieve_tcpdump_job, scheduler=scheduler, required=pings, label="Retrieve tcpdump", ) if route_sampling: retrieve_sampling_job = [ SshJob( # scheduler=scheduler, node=nodei, # required=pings, label=f"retrieve sampling trace from fit{i:02d}", verbose=verbose_jobs, commands=[ # RunScript("node-utilities.sh", "kill-route-sample", protocol, # label = "kill route sample"), #RunScript("route-sample-service.sh", "kill-route-sample", # label="kill route sample"), Run("systemctl stop route-sample", label="stop route-sample"), Run( f"echo retrieving sampling trace from fit{i:02d}", label=""), Pull(remotepaths=[f"ROUTE-TABLE-{i:02d}-SAMPLED"], localpath=str(run_root), label=""), ], ) for i, nodei in node_index.items() ] retrieve_sampling = Scheduler( *retrieve_sampling_job, scheduler=scheduler, required=pings, verbose=verbose_jobs, label="Stop & retrieve route sampling", ) if tshark: parse_pcaps_job = [ SshJob( # scheduler=scheduler, node=LocalNode(), # required=retrieve_tcpdump, label=f"parse pcap trace {run_root}/fit{i}.pcap", verbose=verbose_jobs, #commands = [RunScript("parsepcap.sh", run_root, i)] command=Run("tshark", "-2", "-r", f"{run_root}/fit{i}.pcap", "-R", f"'(ip.dst==10.0.0.{i} && icmp) && radiotap.dbm_antsignal'", "-Tfields", "-e", "'ip.src'", "-e" "'ip.dst'", "-e", "'radiotap.dbm_antsignal'", ">", f"{run_root}/result-{i}.txt", label=f"parsing pcap from {i}"), ) for i in node_ids ] parse_pcaps = Scheduler( *parse_pcaps_job, scheduler=scheduler, required=retrieve_tcpdump, label="Parse pcap", ) if interference: kill_uhd_siggen = SshJob( scheduler=scheduler, node=node_scrambler, required=pings, label=f"killing uhd_siggen on the scrambler node {scrambler_id}", verbose=verbose_jobs, commands=[Run("systemctl", "stop", "uhd_siggen"), #Run("systemctl reset-failed tcpdump"), ], ) kill_2_uhd_siggen = SshJob( scheduler=scheduler, node=faraday, required=kill_uhd_siggen, label=f"turning off usrp on the scrambler node {scrambler_id}", verbose=verbose_jobs, command=Run("rhubarbe", "usrpoff", scrambler_id), ) pings.add(Sequence(*pings_job)) # for running sequentially we impose no limit on the scheduler # that will be limitied anyways by the very structure # of the required graph # safety check scheduler.export_as_pngfile(run_root / "experiment-graph") if dry_run: scheduler.list() return True # if not in dry-run mode, let's proceed to the actual experiment ok = scheduler.run() # jobs_window=jobs_window) # close all ssh connections close_ssh_in_scheduler(scheduler) # give details if it failed if not ok: scheduler.debrief() scheduler.export_as_pngfile("debug") if ok and map: time_line("Creation of MAP files") post_processor = ProcessRoutes(run_root, src_ids, node_ids) post_processor.run() if ok and route_sampling: time_line("Creation of ROUTE SAMPLING files") post_processor = ProcessRoutes(run_root, src_ids, node_ids) post_processor.run_sampled() # data acquisition is done, let's aggregate results # i.e. compute averages #if ok and tshark: #post_processor = Aggregator(run_root, node_ids, antenna_mask) #post_processor.run() time_line("one_run done") return ok
def test_format(self):
    s = Scheduler()
    f = TerminalFormatter("%Y:%H:%S - @host@:@line@", verbose=True)
    n = SshNode(localhostname(), username=localuser(), formatter=f)
    s.add(SshJob(node=n, commands=[Run("echo LINE1"), Run("echo LINE2")]))
    s.run()
def wait(*argv):                                # pylint: disable=r0914
    usage = """
    Wait for selected nodes to be reachable by ssh

    Returns 0 if all nodes indeed are reachable
    """
    # suppress info log messages from asyncssh
    asyncssh_set_log_level(logging.WARNING)
    config = Config()
    parser = ArgumentParser(usage=usage,
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument("-c", "--curses", action='store_true', default=False,
                        help="Use curses to provide term-based animation")
    parser.add_argument("-t", "--timeout", action='store',
                        default=config.value('nodes', 'wait_default_timeout'),
                        type=float,
                        help="Specify global timeout for the whole process")
    parser.add_argument("-b", "--backoff", action='store',
                        default=config.value('networking', 'ssh_backoff'),
                        type=float,
                        help="Specify backoff average between "
                             "attempts to ssh connect")
    parser.add_argument("-u", "--user", default="root",
                        help="select other username")
    # really don't write anything
    parser.add_argument("-s", "--silent", action='store_true', default=False)
    parser.add_argument("-v", "--verbose", action='store_true', default=False)

    add_selector_arguments(parser)
    args = parser.parse_args(argv)

    # --curses implies --verbose otherwise nothing shows up
    if args.curses:
        args.verbose = True

    selector = selected_selector(args)
    message_bus = asyncio.Queue()

    if args.verbose:
        message_bus.put_nowait({'selected_nodes': selector})
    from rhubarbe.logger import logger
    logger.info(f"wait: backoff is {args.backoff} "
                f"and global timeout is {args.timeout}")

    nodes = [
        Node(cmc_name, message_bus)             # pylint: disable=w0621
        for cmc_name in selector.cmc_names()
    ]
    sshs = [
        SshProxy(node, username=args.user, verbose=args.verbose)
        for node in nodes
    ]
    jobs = [Job(ssh.wait_for(args.backoff), critical=True) for ssh in sshs]

    display_class = Display if not args.curses else DisplayCurses
    display = display_class(nodes, message_bus)

    # have the display class run forever until the other ones are done
    scheduler = Scheduler(Job(display.run(), forever=True, critical=True),
                          *jobs,
                          timeout=args.timeout,
                          critical=False)
    try:
        orchestration = scheduler.run()
        if orchestration:
            return 0
        else:
            if args.verbose:
                scheduler.debrief()
            return 1
    except KeyboardInterrupt:
        print("rhubarbe-wait : keyboard interrupt - exiting")
        # xxx
        return 1
    finally:
        display.epilogue()
        if not args.silent:
            for ssh in sshs:
                print(f"{ssh.node}:ssh {'OK' if ssh.status else 'KO'}")
def collect(run_name, slicename, cn, ran, oai_ues, verbose, dry_run):
    """
    retrieves all relevant logs under a common name
    otherwise, same signature as run() for convenience

    retrieved stuff will be made of
    * one pcap file for the CN
    * compressed tgz files, one per node, gathering logs and configs and datas
    * for convenience the tgz files are unwrapped in run_name/id0
    """

    # the local dir to store incoming raw files. mostly tar files
    local_path = Path(f"{run_name}")
    if not local_path.exists():
        print(f"Creating directory {local_path}")
        local_path.mkdir()

    gwuser, gwhost = r2lab_parse_slice(slicename)
    gwnode = SshNode(hostname=gwhost, username=gwuser,
                     formatter=TimeColonFormatter(verbose=verbose),
                     debug=verbose)

    functions = ["cn", "ran"]

    hostnames = [r2lab_hostname(x) for x in (cn, ran)]

    node_cn, node_ran = nodes = [
        SshNode(gateway=gwnode, hostname=hostname, username='******',
                formatter=TimeColonFormatter(verbose=verbose),
                debug=verbose)
        for hostname in hostnames
    ]
    if oai_ues:
        hostnames_ue = [r2lab_hostname(x) for x in oai_ues]
        nodes_ue = [
            SshNode(gateway=gwnode, hostname=hostname, username='******',
                    formatter=TimeColonFormatter(verbose=verbose),
                    debug=verbose)
            for hostname in hostnames_ue
        ]

    # all nodes involved are managed in the same way
    # node: a SshNode instance
    # id: the fit number
    # function, a string like 'cn' or 'ran' or 'oai-ue'
    local_nodedirs_tars = []

    scheduler = Scheduler(verbose=verbose)
    for (node, id, function) in zip(
            chain(nodes, nodes_ue),
            chain([cn, ran], oai_ues),
            chain(functions, cycle(["oai-ue"]))):
        # nodes on 2 digits
        id0 = f"{id:02d}"
        # node-dep collect dir
        node_dir = local_path / id0
        node_dir.exists() or node_dir.mkdir()
        local_tar = f"{local_path}/{function}-{id0}.tgz"
        SshJob(
            node=node,
            commands=[
                # first run a 'capture-all' function remotely
                # to gather all the relevant files and commands remotely
                RunScript(
                    find_local_embedded_script(f"mosaic-{function}.sh"),
                    f"capture-all", f"{run_name}-{function}",
                    includes=INCLUDES),
                # and retrieve it locally
                Pull(
                    remotepaths=f"{run_name}-{function}.tgz",
                    localpath=local_tar),
            ],
            scheduler=scheduler)
        local_nodedirs_tars.append((node_dir, local_tar))

    # retrieve tcpdump on CN
    SshJob(
        node=node_cn,
        commands=[
            tcpdump_cn_service.stop_command(),
            Pull(remotepaths=[tcpdump_cn_pcap], localpath=local_path),
        ],
        scheduler=scheduler
    )

    print(10*'*', 'See collect scheduler in',
          scheduler.export_as_pngfile("cefore-collect"))
    if verbose:
        scheduler.list()
    if dry_run:
        return

    if not scheduler.run():
        print("KO")
        scheduler.debrief()
        return

    # unwrap
    for node_dir, tar in local_nodedirs_tars:
        print(f"Untaring {tar} in {node_dir}/")
        os.system(f"tar -C {node_dir} -xzf {tar}")
def main(self, *test_argv): # pylint: disable=r0915,r0912,r0914,c0111 self.parser = parser = argparse.ArgumentParser() # scope - on what hosts parser.add_argument( "-s", "--script", action='store_true', default=False, help=f"""If this flag is present, the first element of the remote command is assumed to be either the name of a local script, or, if this is not found, the body of a local script, that will be copied over before being executed remotely. In this case it should be executable. On the remote boxes it will be installed and run in the {default_remote_workdir} directory. """) parser.add_argument( "-i", "--includes", dest='includes', default=[], action='append', help="""for script mode only : a list of local files that are pushed remotely together with the local script, and in the same location; useful when you want to to run remotely a shell script that sources other files; remember that on the remote end all files (scripts and includes) end up in the same location""") parser.add_argument("-t", "--target", dest='targets', action='append', default=[], help=""" specify targets (additive); at least one is required; each target can be either * a space-separated list of hostnames * the name of a file containing hostnames * the name of a directory containing files named after hostnames; see e.g. the --mark option """) parser.add_argument("-x", "--exclude", dest='excludes', action='append', default=[], help=""" like --target, but for specifying exclusions; for now there no wildcard mechanism is supported here; also the order in which --target and --exclude options are mentioned does not matter; use --dry-run to only check for the list of applicable hosts """) # global settings parser.add_argument("-w", "--window", type=int, default=0, help=""" specify how many connections can run simultaneously; default is no limit """) parser.add_argument( "-c", "--connect-timeout", dest='timeout', type=float, default=default_timeout, help=f"specify connection timeout, default is {default_timeout}s") # ssh settings parser.add_argument( "-l", "--login", default=default_username, help=f"remote user name - default is {default_username}") parser.add_argument("-k", "--key", dest='keys', default=None, action='append', type=str, help=""" The default is for apssh to locate an ssh-agent through the SSH_AUTH_SOCK environment variable. If this cannot be found, or has an empty set of keys, then the user should specify private key file(s) - additive """) parser.add_argument("-K", "--ok-if-no-key", default=False, action='store_true', help=""" When no key can be found, apssh won't even bother to try and connect. With this option it proceeds even with no key available. """) parser.add_argument("-g", "--gateway", default=None, help=""" specify a gateway for 2-hops ssh - either hostname or username@hostname """) # how to store results # terminal parser.add_argument("-r", "--raw-format", default=False, action='store_true', help=""" produce raw result, incoming lines are shown as-is without hostname """) parser.add_argument( "-tc", "--time-colon-format", default=False, action='store_true', help="equivalent to --format '@time@:@host@:@line@") parser.add_argument("-f", "--format", default=None, action='store', help="""specify output format, which may include * `strftime` formats like e.g. 
%%H-%%M, and one of the following: * @user@ for the remote username, * @host@ for the target hostname, * @line@ for the actual line output (which contains the actual newline) * @time@ is a shorthand for %%H-%%M-%%S""") # filesystem parser.add_argument("-o", "--out-dir", default=None, help="specify directory where to store results") parser.add_argument("-d", "--date-time", default=None, action='store_true', help="use date-based directory to store results") parser.add_argument("-m", "--mark", default=False, action='store_true', help=""" available with the -d and -o options only. When specified, then for all nodes there will be a file created in the output subdir, named either 0ok/<hostname> for successful nodes, or 1failed/<hostname> for the other ones. This mark file will contain a single line with the returned code, or 'None' if the node was not reachable at all """) # usual stuff parser.add_argument("-n", "--dry-run", default=False, action='store_true', help="Only show details on selected hostnames") parser.add_argument("-v", "--verbose", action='store_true', default=False) parser.add_argument("-D", "--debug", action='store_true', default=False) parser.add_argument("-V", "--version", action='store_true', default=False) # the commands to run parser.add_argument("commands", nargs=argparse.REMAINDER, type=str, help=""" command to run remotely. If the -s or --script option is provided, the first argument here should denote a (typically script) file **that must exist** on the local filesystem. This script is then copied over to the remote system and serves as the command for remote execution """) if test_argv: args = self.parsed_args = parser.parse_args(test_argv) else: args = self.parsed_args = parser.parse_args() # helpers if args.version: print(f"apssh version {apssh_version}") exit(0) # manual check for REMAINDER if not args.commands: print("You must provide a command to be run remotely") parser.print_help() exit(1) # load keys self.loaded_private_keys = load_private_keys( self.parsed_args.keys, args.verbose or args.debug) if not self.loaded_private_keys and not args.ok_if_no_key: print("Could not find any usable key - exiting") exit(1) # initialize a gateway proxy if --gateway is specified gateway = None if args.gateway: gwuser, gwhost = self.user_host(args.gateway) gateway = SshProxy(hostname=gwhost, username=gwuser, keys=self.loaded_private_keys, formatter=self.get_formatter(), timeout=self.parsed_args.timeout, debug=self.parsed_args.debug) proxies = self.create_proxies(gateway) if args.verbose: print_stderr(f"apssh is working on {len(proxies)} nodes") window = self.parsed_args.window # populate scheduler scheduler = Scheduler(verbose=args.verbose) if not args.script: command_class = Run extra_kwds_args = {} else: # try RunScript command_class = RunScript extra_kwds_args = {'includes': args.includes} # but if the filename is not found then use RunString script = args.commands[0] if not Path(script).exists(): if args.verbose: print("Warning: file not found '{}'\n" "=> Using RunString instead".format(script)) command_class = RunString for proxy in proxies: scheduler.add( SshJob(node=proxy, critical=False, command=command_class(*args.commands, **extra_kwds_args))) # pylint: disable=w0106 scheduler.jobs_window = window if not scheduler.run(): scheduler.debrief() results = [job.result() for job in scheduler.jobs] ########## # print on stdout the name of the output directory # useful mostly with -d : subdir = self.get_formatter().run_name \ if isinstance(self.get_formatter(), SubdirFormatter) 
\ else None if subdir: print(subdir) # details on the individual retcods - a bit hacky if self.parsed_args.debug: for proxy, result in zip(proxies, results): print(f"PROXY {proxy.hostname} -> {result}") # marks names = {0: '0ok', None: '1failed'} if subdir and self.parsed_args.mark: # do we need to create the subdirs need_ok = [s for s in results if s == 0] if need_ok: os.makedirs(f"{subdir}/{names[0]}", exist_ok=True) need_fail = [s for s in results if s != 0] if need_fail: os.makedirs(f"{subdir}/{names[None]}", exist_ok=True) for proxy, result in zip(proxies, results): prefix = names[0] if result == 0 else names[None] mark_path = Path(subdir) / prefix / proxy.hostname with mark_path.open("w") as mark: mark.write(f"{result}\n") # xxx - when in gateway mode, the gateway proxy never gets disconnected # which probably is just fine # return 0 only if all hosts have returned 0 # otherwise, return 1 failures = [r for r in results if r != 0] overall = 0 if not failures else 1 return overall
def test_logic2(self):
    todo = SshJob(node=self.gateway(),
                  commands=[Run("true"), Run("false")],
                  label="should fail")
    sched = Scheduler(todo, critical=False, verbose=True)
    self.assertFalse(sched.run())