def run(self, message_bus, timeout):
    """
    Broadcast the configured verb to every selected node.

    Waits at most *timeout* seconds; returns True when every node
    behaved as expected, and False otherwise - including in case
    of KeyboardInterrupt.
    """
    all_nodes = [Node(name, message_bus)
                 for name in self.selector.cmc_names()]
    verb_jobs = [Job(self.get_and_show_verb(one_node, self.verb),
                     critical=True)
                 for one_node in all_nodes]
    display = Display(all_nodes, message_bus)
    # the display job spins forever; the scheduler itself is declared
    # non-critical so a failing verb job does not bubble up as an exception
    scheduler = Scheduler(
        Job(display.run(), forever=True, critical=True),
        *verb_jobs,
        timeout=timeout,
        critical=False)
    try:
        if not scheduler.run():
            scheduler.debrief()
            print(f"rhubarbe-{self.verb} failed: {scheduler.why()}")
            return False
        return True
    except KeyboardInterrupt:
        print(f"rhubarbe-{self.verb} : keyboard interrupt - exiting")
        return False
def test_deferred_service(self):
    """
    a service can be defined from a deferred instance
    rather than a plain string
    """
    scheduler = Scheduler()
    env = Variables()
    echo_service = Service(Deferred("echo {{run1}}", env),
                           service_id='echo', verbose=True)
    node = SshNode(localhostname(), username=localuser())
    # stage 1 computes run1, stage 2 starts the (deferred) service,
    # stage 3 captures the service journal for inspection
    Sequence(
        SshJob(node,
               commands=Run("echo from-first-run",
                            capture=Capture('run1', env))),
        SshJob(node,
               commands=Run(echo_service.start_command())),
        SshJob(node,
               commands=Run(
                   echo_service.journal_command(since="10 second ago"),
                   capture=Capture('journal', env))),
        scheduler=scheduler)
    print('STARTING', 20 * '-', echo_service.start_command())
    scheduler.run()
    print('DONE', 20 * '-', echo_service.start_command())
    # the journal must contain the string computed in stage 1
    self.assertTrue("from-first-run" in env.journal)
def test_environment(self):
    """environ entries passed to a Service must show up in its journal"""
    foo_needle = 'xxx-foo-xxx'
    bar_needle = 'xxx-bar-xxx'
    scheduler = Scheduler()
    localnode = SshNode("localhost")
    env = Variables()
    service = Service("env", service_id='echo-environ',
                      environ={
                          'FOO': foo_needle,
                          'BAR': bar_needle,
                      })
    # start the service then grab its recent journal output
    SshJob(scheduler=scheduler, node=localnode,
           commands=[
               Run(service.start_command()),
               Run(service.journal_command(since='5s ago'),
                   capture=Capture('journal', env)),
           ])
    self.assertEqual(scheduler.run(), True)
    for needle in (f"FOO={foo_needle}", f"BAR={bar_needle}"):
        self.assertTrue(needle in env.journal)
def _allowed_retcod(self, allowed_exits, host="localhost", username=None):
    """
    Run a command that exits with a known return code and check that
    the scheduler outcome matches whether that code is listed in
    *allowed_exits*.
    """
    print(f"Testing allowed retcod allowed_exits={allowed_exits}")
    # global timeout for the whole scheduler
    total = 4
    # scheduled duration of the command
    long = 1
    # we always exit code 100
    # BUGFIX: this used to be 1000, but shell/ssh exit statuses are
    # reported modulo 256, so the remote side would actually report 232
    # and `expected` below would never match the observed behavior
    retcod = 100
    if username is None:
        username = util.localuser()
    node = SshNode(host, username=username)
    scheduler = Scheduler(timeout=total, critical=False)
    SshJob(node=node, scheduler=scheduler,
           command=Run(f"sleep {long}; exit {retcod}",
                       allowed_exits=allowed_exits))
    # the run succeeds iff the exit code was declared allowed
    expected = retcod in allowed_exits

    run = scheduler.run()
    scheduler.list()
    self.assertEqual(run, expected)
def _test_window(self, total, window):
    """
    schedule `total` small jobs under a jobs window and check that
    the measured overall duration matches what the window predicts
    """
    atom = .1
    tolerance = 8  # accepted deviation, in % of the expected time
    scheduler = Scheduler()
    jobs = [PrintJob("{}-th {}s job".format(rank, atom),
                     sleep=atom, scheduler=scheduler)
            for rank in range(1, total + 1)]
    import time
    start = time.time()
    ok = scheduler.orchestrate(jobs_window=window)
    if not ok:
        scheduler.debrief(details=True)
    duration = time.time() - start
    # estimate global time
    # unwindowed: everything runs in parallel, one atom overall;
    # otherwise total/window rounds of one atom each
    # (assuming total = k*window)
    expected = atom if not window else (total / window) * atom
    print('overall expected {} - measured {}'
          .format(expected, duration))
    distortion = duration / expected
    low = 1 - tolerance / 100
    high = 1 + tolerance / 100
    time_ok = low <= distortion <= high
    if not time_ok:
        print("_test_window - window = {} :"
              "wrong execution time {} - not within {}% of {}"
              .format(window, duration, tolerance, expected))
    self.assertTrue(time_ok)
    self.assertTrue(ok)
def test_shutdown_nested_timeout(self):
    """
    16 jobs whose shutdown durations range over
    0.0 0.1 0.2 0.3 - 1.0 1.1 1.2 1.3
    2.0 2.1 2.2 2.3 - 3.0 3.1 3.2 3.3
    with shutdown_timeout = 0.9s only 12 shutdowns complete,
    so the counter must come back down by exactly 12
    """
    cardinal = 4  # the grid is cardinal x cardinal
    top = CounterScheduler(label="TOP", shutdown_timeout=0.9)
    subs = []
    for outer in range(cardinal):
        sub = Scheduler(label=f"SUB {outer}")
        subs.append(sub)
        counters = [CounterJob(top, 10 * outer + inner, aprint('ok'),
                               label=10 * outer + inner)
                    for inner in range(cardinal)]
        sub.add(Sequence(*counters))
    top.add(Sequence(*subs))

    self.assertEqual(top.counter, 0)
    self.assertTrue(top.run())
    self.assertEqual(top.counter, cardinal * cardinal)
    # shutdown times out on one job per sub-scheduler
    self.assertFalse(top.shutdown())
    self.assertEqual(top.counter, cardinal * (cardinal - 1))
def all_off(slice, verbose, debug):
    """
    expects a slice name, and turns off faraday completely
    """
    # what argparse knows as a slice actually is a gateway (user + host)
    try:
        gwuser, gwhost = slice.split('@')
    except ValueError:
        # no '@' in the slice: default to the faraday gateway
        gwuser, gwhost = slice, "faraday.inria.fr"

    gwnode = SshNode(hostname=gwhost, username=gwuser,
                     formatter=ColonFormatter(verbose=verbose),
                     debug=debug)

    scheduler = Scheduler(
        SshJob(
            node=gwnode,
            command=Run("rhubarbe", "bye"),
            label="turn off",
        ))

    result = scheduler.orchestrate()
    if not result:
        print("RUN KO : {}".format(scheduler.why()))
        # BUGFIX: this used to read sched.debrief() - 'sched' is not
        # defined in this function, which raised a NameError on failure
        scheduler.debrief()
    else:
        print("faraday turned off OK")

    return 0 if result else 1
def test_deferred_chain(self):
    """
    one command computes a string that gets passed to another one

    this is analogous to
        run1=$(ssh localhost echo from-first-run)
        final=$(ssh localhost echo ${run1})

    the 'final' variable is only needed for checking
    everything went well
    """
    scheduler = Scheduler()
    env = Variables()
    node = SshNode(localhostname(), username=localuser())
    # stage 1 fills run1, stage 2 expands it through a Deferred
    Sequence(
        SshJob(node,
               commands=Run("echo from-first-run",
                            capture=Capture('run1', env))),
        SshJob(node,
               commands=Run(Deferred("echo {{run1}}", env),
                            capture=Capture('final', env))),
        scheduler=scheduler)
    scheduler.run()
    self.assertEqual(env.final, "from-first-run")
def _allowed_signal(self, allowed_exits, host="localhost", username=None):
    """
    one job sleeps while a second one kills it with TERM;
    the run must succeed iff 'TERM' is among allowed_exits
    """
    print(f"Testing allowed signal allowed_exits={allowed_exits}")
    total = 4      # global scheduler timeout
    long = 2       # how long the victim sleeps
    short = 1      # after how long the killer fires
    signal = "TERM"  # we always kill with TERM
    if username is None:
        username = util.localuser()
    node = SshNode(host, username=username)
    scheduler = Scheduler(timeout=total, critical=False)
    # the victim
    SshJob(node=node, scheduler=scheduler,
           command=Run(f"sleep {long}", allowed_exits=allowed_exits))
    # the killer
    SshJob(node=node, scheduler=scheduler,
           command=f"sleep {short}; pkill -{signal} sleep")
    expected = signal in allowed_exits

    outcome = scheduler.run()
    scheduler.list()
    self.assertEqual(outcome, expected)
def _test_exc_non_critical(self, verbose):
    """a non-critical exception must not abort the orchestration"""
    print("verbose = {}".format(verbose))
    regular = SLJ(1)
    boomer = J(co_exception(0.5), label='non critical boom')
    sched = Scheduler(regular, boomer, verbose=verbose)
    # the exploding job is non-critical, so orchestrate still succeeds
    self.assertTrue(sched.orchestrate())
    print(sep + 'debrief()')
    sched.debrief()
def sched_sched_boom(s1_crit, s2_crit, j_crit):
    """build a two-level scheduler nesting around a single boom job,
    with criticality controlled at each of the three levels"""
    inner_job = Job(boom("ok"), critical=j_crit,
                    label=f"boom {j_crit}")
    inner_sched = Scheduler(inner_job, critical=s2_crit,
                            label=f"internal {s2_crit}")
    return Scheduler(inner_sched, critical=s1_crit,
                     label=f"external {s1_crit}")
def test_timeout(self):
    """a 3-stage pipeline that cannot complete within the timeout"""
    a1, a2, a3 = (SLJ(duration) for duration in (0.5, 0.6, 0.7))
    a2.requires(a1)
    a3.requires(a2)
    sched = Scheduler(a1, a2, a3)
    # should timeout in the middle of stage 2
    self.assertFalse(sched.orchestrate(timeout=1))
    sched.list()
def test_sequence6(self):
    "adding a sequence"
    sched = Scheduler()
    jobs = [J(sl(0.1), label=tag) for tag in (1, 2, 3)]
    sched.add(Seq(*jobs))
    self.assertTrue(sched.orchestrate())
def simple():
    """sanitize() must drop requirements that point outside the scheduler"""
    j1, j2, j3, j4, j5 = [job(i) for i in range(1, 6)]
    s1 = Scheduler(j1, j2, j3, label='top simple')
    # j4 and j5 are deliberately NOT part of the scheduler
    j2.requires(j4)
    j3.requires(j5)
    for dangling in (j2, j3):
        self.assertEqual(len(dangling.required), 1)
    s1.sanitize()
    # the foreign requirements must be gone
    for dangling in (j2, j3):
        self.assertEqual(len(dangling.required), 0)
def test_creation_scheduler(self):
    """jobs appended to a sequence after creation join its scheduler"""
    sched = Scheduler()
    seq = Seq(J(sl(1)), J(sl(2)), scheduler=sched)
    J(sl(3), required=seq, scheduler=sched)
    # make sure that jobs appended in the sequence
    # even later on are also added to the scheduler
    seq.append(J(sl(.5)))
    self.assertEqual(len(sched.jobs), 4)
    self.assertTrue(sched.rain_check())
    self.assertTrue(sched.orchestrate())
def test_cycle(self):
    """a simple loop with 3 jobs - cannot handle that"""
    a1, a2, a3 = J(sl(1.1)), J(sl(1.2)), J(sl(1.3))
    # close the loop: a1 -> a2 -> a3 -> a1
    a1.requires(a2)
    a2.requires(a3)
    a3.requires(a1)
    sched = Scheduler(a1, a2, a3)
    # these lines seem to trigger a nasty message about a coro not being
    # waited
    self.assertFalse(sched.rain_check())
def test_sequence2(self):
    "a job and a sequence"
    lead = J(sl(0.1), label=1)
    a2 = J(sl(0.1), label=2)
    a3 = J(sl(0.1), label=3)
    seq = Seq(a2, a3, required=lead)
    sched = Scheduler(lead, seq)
    list_sep(sched, sep + "sequence2")
    # lead has no requirement, each sequence member has exactly one
    for entity, count in ((lead, 0), (a2, 1), (a3, 1)):
        self.assertEqual(len(entity.required), count)
    self.assertTrue(check_required_types(sched, "test_sequence2"))
    self.assertTrue(sched.orchestrate())
def test_forever(self):
    """a forever job must not block completion, and stays not-done"""
    async def tick(n):
        while True:
            print('tick {}'.format(n))
            await asyncio.sleep(n)

    finite = J(sl(0.5), label="finite")
    ticker = J(tick(0.1), forever=True, label="forever")
    sched = Scheduler(finite, ticker)
    self.assertEqual(sched.orchestrate(), True)
    self.assertEqual(finite.is_done(), True)
    # the forever job never "finishes"
    self.assertEqual(ticker.is_done(), False)
def test_sequence3(self):
    "a sequence and a job"
    a1 = J(sl(0.1), label=1)
    a2 = J(sl(0.1), label=2)
    seq = Seq(a1, a2)
    tail = J(sl(0.1), label=3, required=seq)
    sched = Scheduler()
    sched.update((seq, tail))
    list_sep(sched, sep + "sequence3")
    for entity, count in ((a1, 0), (a2, 1), (tail, 1)):
        self.assertEqual(len(entity.required), count)
    self.assertTrue(check_required_types(sched, "test_sequence3"))
    self.assertTrue(sched.orchestrate())
def nested():
    """sanitize() on nested schedulers scrubs only the foreign requirements"""
    j11, j12, j13, j14, j15 = [job(i) for i in range(11, 16)]
    s2 = Scheduler(Sequence(j11, j12, j13), label="nested internal")
    # j14 / j15 do not belong to s2
    j12.requires(j14)
    j13.requires(j15)
    j1, j2, j3, j4, j5 = [job(i) for i in range(1, 6)]
    s1 = Scheduler(Sequence(j1, s2, j3), label="nested top")
    j1.requires(j4)
    j1.requires(j11)
    s2.requires(j13)
    # j2 not included in sched, untouched
    j2.requires(j1)

    # requirement counts before sanitization
    for entity, count in ((j12, 2), (j13, 2), (j1, 2), (s2, 2), (j3, 1)):
        self.assertEqual(len(entity.required), count)
    s1.sanitize()
    # foreign requirements are gone, legitimate ones remain
    for entity, count in ((j12, 1), (j13, 1), (j1, 0), (s2, 1), (j3, 1)):
        self.assertEqual(len(entity.required), count)
def test_order1(self):
    """render a png for a sequence that mixes jobs and sub-schedulers"""
    async def aprint(x):
        print(x)

    def job(n):
        return Job(aprint(n), label=n)

    subs = [Scheduler() for _ in range(4)]
    sub1, sub2, sub3, sub4 = subs
    sched = Scheduler(
        Sequence(
            job('top'),
            sub1,
            job('middle'),
            sub2,
            sub3,
            sub4))
    # fill each sub-scheduler with 3 numbered jobs
    for i in range(3):
        sub1.add(job(i + 1))
        sub2.add(job(i + 4))
        sub3.add(job(i + 7))
        sub4.add(job(i + 10))
    # one extra in the last one
    sub4.add(job(13))
    produce_png(sched, "test_png_order1")
def test_hop_depth(self, hostname='localhost', username=None,
                   depth=4, commands=1):
    """
    chain `depth` ssh hops - each node gatewayed through the previous
    one - run `commands` commands per node, and check the connection
    counters both after the run and after cleanup
    """
    # Do not use the close_nodes manually on this test, it does keep the
    # Order of the declared nodes.
    if username is None:
        username = localuser()
    verbose(f"creating hop{depth}-connections - "
            f"{commands} commands per conn "
            f" to {username}@{hostname}")
    scheduler = Scheduler(timeout=7)
    nodes = []
    gateway = None
    for n in range(depth):
        node = SshNode(hostname, gateway=gateway, username=username,
                       formatter=ColonFormatter(verbose=False))
        nodes.append(node)
        # each new node hops through the previous one
        gateway = node
        for c in range(commands):
            SshJob(node=node,
                   command=f"echo hop{n}-{c}",
                   scheduler=scheduler)
    expected = depth

    # record base status
    in0, out0 = in_out_connections()
    verbose(f"INITIAL count in={in0} out={out0}")

    scheduler.run()

    in1, out1 = in_out_connections()
    verbose(f"AFTER RUN in={in1} out={out1}")
    self.assertEqual(in1 - in0, expected)
    self.assertEqual(out1 - out0, expected)

    # cleanup
    close_ssh_in_scheduler(scheduler)
    # wait a little bit before counting again
    time.sleep(1)

    in1, out1 = in_out_connections()
    verbose(f"AFTER CLEANUP in={in1} out={out1}")
    self.assertEqual(in1 - in0, 0)
    self.assertEqual(out1 - out0, 0)
def hop1(self, hostname='localhost', username=None, *, c1, commands):
    """
    create
      * <c1> connections to one node 1 hop away
      * and on each <commands> commands

    check current number of connections
    """
    if username is None:
        username = localuser()
    print(f"creating {c1} hop1-connections - "
          f"{commands} commands per conn - "
          f" to {username}@{hostname}")
    scheduler = Scheduler()
    nodes = []
    for n in range(c1):
        node1 = SshNode(hostname, username=username,
                        formatter=ColonFormatter(verbose=False))
        nodes.append(node1)
        for c in range(commands):
            SshJob(node=node1,
                   command=f'echo hop1-{n}-{c}',
                   scheduler=scheduler)
    expected = c1

    # record base status
    in0, out0 = in_out_connections()
    print(f"INITIAL count in={in0} out={out0}")

    scheduler.run()

    in1, out1 = in_out_connections()
    print(f"AFTER RUN in={in1} out={out1}")
    self.assertEqual(in1 - in0, expected)
    self.assertEqual(out1 - out0, expected)

    # cleanup: close every connection, then recount
    asyncio.get_event_loop().run_until_complete(
        asyncio.gather(*(node.close() for node in nodes)))

    in1, out1 = in_out_connections()
    print(f"AFTER CLEANUP in={in1} out={out1}")
    self.assertEqual(in1 - in0, 0)
    self.assertEqual(out1 - out0, 0)
def check_lease(experiment_scheduler, sshnode):
    """
    re-usable function that acts a bit like a python
    decorator on schedulers.

    Given an experiment described as a scheduler, this function
    returns a higher-level scheduler that first checks for the
    lease, and then proceeds with the experiment.
    """
    # checking the lease is done on the gateway;
    # critical=True means a failure of this command makes
    # the whole scheduler bail out immediately
    check_lease_job = SshJob(
        node=faraday,
        critical=True,
        command=Run("rhubarbe leases --check"),
    )
    # the experiment scheduler is inserted as a nested scheduler,
    # i.e. it behaves as a regular job in the enclosing one
    return Scheduler(Sequence(check_lease_job, experiment_scheduler))
def test_png_styles2(self):
    """
    trying the rendering of critical and forever jobs
    """
    watch = Watch()
    j1 = pipes(watch, .5, "none", nb_pipes=6)
    j2 = diamond_scheduler(watch, .5, "critical")
    j3 = diamond_scheduler(watch, .5, "forever")
    j4 = diamond_scheduler(watch, .5, "both")
    # cover all four combinations of the critical / forever flags
    for sub, crit, fore, tag in (
            (j1, False, False, "none"),
            (j2, True, False, "critical"),
            (j3, False, True, "forever"),
            (j4, True, True, "both")):
        sub.critical = crit
        sub.forever = fore
        sub.label = f"label-{tag}"
    sched = Scheduler(
        Sequence(j1, j2, j3, j4),
        watch=watch,
    )
    produce_png(sched, "test_png_styles2")
def test_sequence4(self):
    "a sequence of 2 sequences"
    a1, a2, a3, a4 = (J(sl(0.1), label=tag) for tag in (1, 2, 3, 4))
    sched = Scheduler(Seq(Seq(a1, a2), Seq(a3, a4)))
    list_sep(sched, sep + "sequence4")
    # only the very first job has no requirement
    self.assertEqual(len(a1.required), 0)
    for later in (a2, a3, a4):
        self.assertEqual(len(later.required), 1)
    self.assertTrue(check_required_types(sched, "test_sequence4"))
    self.assertTrue(sched.orchestrate())
def test_forever(self):
    """a forever (tick) job running alongside a 2-stage pipeline"""
    first, second, ticker = SLJ(1), SLJ(1.5), TJ(.6)
    second.requires(first)
    sched = Scheduler(first, second, ticker)
    sched.list()
    self.assertTrue(sched.orchestrate())
    sched.list()
def test_topology(self):
    """render the gateway / nodes topology as a png file"""
    gateway = SshNode("faraday", username="******")
    fit01 = SshNode(gateway=gateway, hostname="fit01", username="******")
    fit02 = SshNode(gateway=gateway, hostname="fit02", username="******")
    sched = Scheduler()
    for node in (fit01, fit02):
        SshJob(node, command='hostname', scheduler=sched)
    topology_as_pngfile(sched, "topology")
def main(self, reset, timeout):
    """
    run the load logic next to the display job;
    returns a 0/1 exit status
    """
    mainjob = Job(self.run(reset))
    displayjob = Job(self.display.run(), forever=True)
    scheduler = Scheduler(mainjob, displayjob)
    try:
        if not scheduler.orchestrate(timeout=timeout):
            self.display.set_goodbye(
                "rhubarbe-load failed: {}".format(scheduler.why()))
            return 1
        return 0 if mainjob.result() else 1
    except KeyboardInterrupt:
        self.display.set_goodbye(
            "rhubarbe-load : keyboard interrupt - exiting")
        return 1
    finally:
        # stop the frisbee daemon if one was started
        self.frisbeed and self.frisbeed.stop_nowait()
        self.nextboot_cleanup()
        self.display.epilogue()
def test_timeout(self):
    """with a 3s timeout, the 10s job cannot complete"""
    quick = J(sl(1), label="a1")
    medium = J(sl(2), label="a2")
    slow = J(sl(10), label="a3")
    outcome = Scheduler(quick, medium, slow).orchestrate(timeout=3)
    self.assertEqual(outcome, False)
    # the two short jobs finished and kept their results
    for done_job, value in ((quick, 1), (medium, 2)):
        self.assertEqual(done_job.is_done(), True)
        self.assertEqual(done_job.result(), value)
    self.assertEqual(slow.is_done(), False)
def all_off(slice, verbose, debug):
    """
    expects a slice name, and turns off faraday completely
    """
    # what argparse knows as a slice actually is a gateway (user + host)
    try:
        gwuser, gwhost = slice.split('@')
    except ValueError:
        # no '@' in the slice: default to the faraday gateway
        gwuser, gwhost = slice, "faraday.inria.fr"

    gwnode = SshNode(hostname=gwhost, username=gwuser,
                     formatter=ColonFormatter(verbose=verbose),
                     debug=debug)

    # BUGFIX: the lease-checking job must be bound to a name,
    # it is inspected after the run through raised_exception();
    # it used to be created anonymously, making the reference
    # to check_for_lease below a NameError
    check_for_lease = SshJob(
        node=gwnode,
        command=Run("rhubarbe", "leases", "--check"),
        label="check we have a current lease",
    )
    scheduler = Scheduler(
        Sequence(
            check_for_lease,
            SshJob(
                node=gwnode,
                command=Run("rhubarbe", "bye"),
                label="turn off",
            )))

    result = scheduler.orchestrate()
    if not result:
        if check_for_lease.raised_exception():
            print(
                "slice {} does not appear to hold a valid lease".format(slice))
        else:
            print("RUN KO : {}".format(scheduler.why()))
            # BUGFIX: this used to read sched.debrief() - 'sched' is
            # not defined in this function
            scheduler.debrief()
    else:
        print("faraday turned off OK")

    return 0 if result else 1
def wait(*argv):
    # Command-line entry point: wait until every selected node answers ssh.
    # Returns 0 when all nodes become reachable within the global timeout,
    # 1 on failure or keyboard interrupt.
    usage = """
    Wait for selected nodes to be reachable by ssh

    Returns 0 if all nodes indeed are reachable
    """
    # pull defaults from the rhubarbe configuration
    the_config = Config()
    default_timeout = the_config.value('nodes', 'wait_default_timeout')
    default_backoff = the_config.value('networking', 'ssh_backoff')

    parser = ArgumentParser(usage=usage)
    parser.add_argument("-c", "--curses", action='store_true', default=False,
                        help="Use curses to provide term-based animation")
    parser.add_argument("-t", "--timeout", action='store',
                        default=default_timeout, type=float,
                        help="Specify global timeout for the whole process, default={}"
                        .format(default_timeout))
    parser.add_argument("-b", "--backoff", action='store',
                        default=default_backoff, type=float,
                        help="Specify backoff average between "
                        "attempts to ssh connect, default={}"
                        .format(default_backoff))
    # really dont' write anything
    parser.add_argument("-s", "--silent", action='store_true', default=False)
    parser.add_argument("-v", "--verbose", action='store_true', default=False)

    add_selector_arguments(parser)
    args = parser.parse_args(argv)

    # --curses implies --verbose otherwise nothing shows up
    if args.curses:
        args.verbose = True

    selector = selected_selector(args)
    message_bus = asyncio.Queue()

    if args.verbose:
        message_bus.put_nowait({'selected_nodes': selector})
    from rhubarbe.logger import logger
    logger.info("wait: backoff is {} and global timeout is {}"
                .format(args.backoff, args.timeout))

    # one Node / one SshProxy / one Job per selected node
    nodes = [Node(cmc_name, message_bus)
             for cmc_name in selector.cmc_names()]
    sshs = [SshProxy(node, verbose=args.verbose) for node in nodes]
    jobs = [Job(ssh.wait_for(args.backoff)) for ssh in sshs]

    display_class = Display if not args.curses else DisplayCurses
    display = display_class(nodes, message_bus)

    # have the display class run forever until the other ones are done
    scheduler = Scheduler(Job(display.run(), forever=True), *jobs)
    try:
        orchestration = scheduler.orchestrate(timeout=args.timeout)
        if orchestration:
            return 0
        else:
            if args.verbose:
                scheduler.debrief()
            return 1
    except KeyboardInterrupt as e:
        print("rhubarbe-wait : keyboard interrupt - exiting")
        # xxx
        return 1
    finally:
        # always restore the terminal and, unless --silent,
        # print a one-line OK/KO summary per node
        display.epilogue()
        if not args.silent:
            for ssh in sshs:
                print("{}:ssh {}".format(ssh.node,
                                         "OK" if ssh.status else "KO"))
def cmc_verb(verb, check_resa, *argv):
    """
    check_resa can be either
    (*) enforce: refuse to send the message if the lease is not there
    (*) warn: issue a warning when the lease is not there
    (*) none: does not check the leases
    """
    usage = """
    Send verb '{verb}' to the CMC interface of selected nodes""".format(verb=verb)
    if check_resa == 'enforce':
        usage += "\n    {resa}".format(resa=reservation_required)

    the_config = Config()
    default_timeout = the_config.value('nodes', 'cmc_default_timeout')

    parser = ArgumentParser(usage=usage)
    parser.add_argument("-t", "--timeout", action='store',
                        default=default_timeout, type=float,
                        help="Specify global timeout for the whole process, default={}"
                        .format(default_timeout))
    add_selector_arguments(parser)
    args = parser.parse_args(argv)

    selector = selected_selector(args)
    message_bus = asyncio.Queue()

    # optionally verify the reservation before sending anything
    if check_resa in ('warn', 'enforce'):
        reserved = check_reservation(verbose=False)
        if not reserved:
            if check_resa == 'enforce':
                return 1
            # with 'warn', check_reservation has issued its warning
            # and we proceed anyway

    # maps each CLI verb onto the Node method that implements it
    verb_to_method = {'status': 'get_status',
                      'on': 'turn_on',
                      'off': 'turn_off',
                      'reset': 'do_reset',
                      'info': 'get_info',
                      'usrpstatus': 'get_usrpstatus',
                      'usrpon': 'turn_usrpon',
                      'usrpoff': 'turn_usrpoff',
                      }

    async def get_and_show_verb(node, verb):
        # invoke the verb's method on the node, then print the outcome
        # that the method stored on the node, one prefixed line at a time
        assert verb in verb_to_method
        # send the 'verb' method on node
        method = getattr(node, verb_to_method[verb])
        # bound methods must not be passed the subject !
        await method()
        result = getattr(node, verb)
        result = result if result is not None else "{} N/A".format(verb)
        for line in result.split("\n"):
            if line:
                print("{}:{}".format(node.cmc_name, line))

    nodes = [Node(cmc_name, message_bus)
             for cmc_name in selector.cmc_names()]
    jobs = [Job(get_and_show_verb(node, verb)) for node in nodes]
    display = Display(nodes, message_bus)
    # the display job runs forever; orchestrate returns when
    # all the finite jobs are done (or the timeout strikes)
    scheduler = Scheduler(Job(display.run(), forever=True), *jobs)
    try:
        if scheduler.orchestrate(timeout=args.timeout):
            return 0
        else:
            print("rhubarbe-{} failed: {}".format(verb, scheduler.why()))
            return 1
    except KeyboardInterrupt as e:
        print("rhubarbe-{} : keyboard interrupt - exiting".format(verb))
        return 1