def create_test_jobspec(args):
    """Build a JobspecV1 for a test job from parsed command-line arguments.

    Falls back to running ``true`` when no command was given.  Applies any
    ``--setopt``/``--setattr`` key=value pairs (values parsed as JSON when
    possible) and, unless ``--exec`` was requested, records a simulated run
    duration on the jobspec.
    """
    # Default command when none was supplied on the command line.
    if not args.command:
        args.command = ["true"]
    jobspec = JobspecV1.from_command(args.command)

    # Apply any requested shell options (--setopt key[=val]).
    for keyval in args.setopt or []:
        # Split into key, val; default val to 1 when "=val" is absent.
        key, val = (keyval.split("=", 1) + [1])[:2]
        try:
            val = json.loads(val)
        except (json.JSONDecodeError, TypeError):
            # Not valid JSON (or the int default): keep the value as-is.
            pass
        jobspec.setattr_shell_option(key, val)

    # Apply any requested jobspec attributes (--setattr key=val).
    for keyval in args.setattr or []:
        key, sep, raw = keyval.partition("=")
        if not sep:
            raise ValueError("--setattr: Missing value for attr " + keyval)
        try:
            val = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            val = raw
        jobspec.setattr(key, val)

    if not args.exec:
        # Simulated execution: tell the test exec system how long to "run".
        jobspec.setattr("system.exec.test.run_duration", args.runtime)
    return jobspec
def submitJob(self):
    """Submit a small two-task ``sleep 0`` job, marked waitable, via self.fh."""
    jobspec = JobspecV1.from_command(
        command=["sleep", "0"], num_tasks=2, num_nodes=1, cores_per_task=1
    )
    # Run in the current directory with the caller's environment.
    jobspec.cwd = os.getcwd()
    jobspec.environment = dict(os.environ)
    flux.job.submit(self.fh, jobspec, waitable=True)
def test_as_completed(self):
    """Executor futures should interoperate with ``cf.as_completed``."""
    with FluxExecutor() as executor:
        spec = JobspecV1.from_command(["true"])
        submitted = [executor.submit(spec) for _ in range(3)]
        for completed in cf.as_completed(submitted):
            # ``true`` exits 0 and raises no job exception.
            self.assertEqual(completed.result(timeout=0), 0)
            self.assertIsNone(completed.exception())
def test_exception_event(self):
    """An unrunnable command should trigger the "exception" event callback."""
    with FluxExecutor() as executor:
        triggered = threading.Event()
        fut = executor.submit(JobspecV1.from_command(["/not/a/real/app"]))
        fut.add_event_callback("exception", lambda f, entry: triggered.set())
        # exception() blocks until the job fails; callback has fired by then.
        self.assertIsInstance(fut.exception(), JobException)
        self.assertTrue(triggered.is_set())
def init_jobspec(self, args):
    """Construct the jobspec for a batch job from parsed arguments.

    Raises ValueError when ``--nslots`` was not given.  The batch script is
    embedded in the jobspec along with any broker options, and stdout
    defaults to a per-job file unless ``--output`` was specified.
    """
    # If no script (reading from stdin), then use "flux" as arg[0].
    command = args.SCRIPT or ["flux"]
    if not args.nslots:
        raise ValueError("Number of slots to allocate must be specified")
    jobspec = JobspecV1.from_command(
        command=command,
        num_tasks=args.nslots,
        cores_per_task=args.cores_per_slot,
        gpus_per_task=args.gpus_per_slot,
        num_nodes=args.nodes,
    )
    # Start one flux-broker per node:
    jobspec.setattr_shell_option("per-resource.type", "node")
    # Embed the batch script and broker options in the jobspec.
    jobspec.setattr("system.batch.script", self.read_script(args))
    jobspec.setattr("system.batch.broker-opts", list_split(args.broker_opts))
    # Default output is flux-{{jobid}}.out,
    # overridden by either --output=none or --output=kvs.
    if not args.output:
        jobspec.setattr_shell_option("output.stdout.type", "file")
        jobspec.setattr_shell_option("output.stdout.path", "flux-{{id}}.out")
    return jobspec
def test_15_job_cancel(self):
    """Canceling a waitable job makes job.wait report failure."""
    self.sleep_jobspec = JobspecV1.from_command(["sleep", "1000"])
    jobid = job.submit(self.fh, self.sleep_jobspec, waitable=True)
    job.cancel(self.fh, jobid)
    # Wait (up to 5s) for the canceled job to be reaped.
    status_fut = job.wait_async(self.fh, jobid=jobid).wait_for(5.0)
    waited_id, success, errmsg = status_fut.get_status()
    self.assertEqual(waited_id, jobid)
    self.assertFalse(success)
def test_executor_event_callbacks(self):
    """Every expected eventlog event should reach a registered callback."""
    with FluxExecutor() as executor:
        pending = set(["start", "finish", "depend", "priority", "free"])
        fut = executor.submit(JobspecV1.from_command(["false"]))
        for name in executor.EVENTS:
            # Each delivered event removes its own name from the pending set.
            fut.add_event_callback(
                name, lambda f, entry: pending.discard(entry.name)
            )
        self.assertFalse(pending)  # no more expected events
def test_20_003_job_event_watch_sync(self):
    """An event-watch future delivers events synchronously via get_event()."""
    jobid = job.submit(self.fh, JobspecV1.from_command(["sleep", "0"]))
    self.assertTrue(jobid > 0)
    watcher = job.event_watch_async(self.fh, jobid)
    self.assertIsInstance(watcher, job.JobEventWatchFuture)
    first = watcher.get_event()
    self.assertIsInstance(first, job.EventLogEvent)
    # The first entry in any job eventlog is "submit".
    self.assertEqual(first.name, "submit")
    watcher.cancel()
def main():
    """Submit N job bundles (compute + io-forwarding) and wait for them all."""
    # set up command-line parser
    parser = argparse.ArgumentParser(
        description="submit and wait for the completion of "
        "N bundles, each consisting of compute "
        "and io-forwarding jobs"
    )
    parser.add_argument(
        "njobs",
        metavar="N",
        type=int,
        help="the number of bundles to submit and wait",
    )
    args = parser.parse_args()

    # Build the two jobspecs; both run from the current directory with the
    # caller's environment.
    compute_jobreq = JobspecV1.from_command(
        command=["./compute.py", "10"], num_tasks=6, num_nodes=3, cores_per_task=2
    )
    compute_jobreq.cwd = os.getcwd()
    compute_jobreq.environment = dict(os.environ)

    io_jobreq = JobspecV1.from_command(
        command=["./io-forwarding.py", "10"],
        num_tasks=3,
        num_nodes=3,
        cores_per_task=1,
    )
    io_jobreq.cwd = os.getcwd()
    io_jobreq.environment = dict(os.environ)

    # Submit the first half as compute bundles, the rest as io-forwarding,
    # then register a callback for every event on every future.
    with FluxExecutor() as executor:
        futures = [executor.submit(compute_jobreq) for _ in range(args.njobs // 2)]
        futures.extend(
            executor.submit(io_jobreq) for _ in range(args.njobs // 2, args.njobs)
        )
        print("bookkeeper: all jobs submitted")
        for fut in futures:
            # each event can have a different callback
            for event in executor.EVENTS:
                fut.add_event_callback(event, event_callback)
        print("bookkeeper: waiting until all jobs complete")
    # exiting the context manager waits for the executor to complete all futures
    print("bookkeeper: all jobs completed")
def test_wait(self):
    """``cf.wait`` should work with executor futures in every return_when mode."""
    with FluxExecutor(threads=3) as executor:
        spec = JobspecV1.from_command(["false"])
        submitted = [executor.submit(spec) for _ in range(3)]
        # First completion, first exception, then everything.
        finished, pending = cf.wait(submitted, return_when=cf.FIRST_COMPLETED)
        self._check_done(finished)
        finished, pending = cf.wait(submitted, return_when=cf.FIRST_EXCEPTION)
        self._check_done(finished)
        finished, pending = cf.wait(submitted)
        self._check_done(finished)
        self.assertEqual(len(pending), 0)
def init_jobspec(self, args):
    """Build a jobspec from parsed args; raise ValueError if no command given."""
    if not args.command:
        raise ValueError("job command and arguments are missing")
    # Resource shape comes straight from the command-line options.
    resources = dict(
        num_tasks=args.ntasks,
        cores_per_task=args.cores_per_task,
        gpus_per_task=args.gpus_per_task,
        num_nodes=args.nodes,
    )
    return JobspecV1.from_command(args.command, **resources)
def test_failed_submit(self):
    """A job running ``false`` completes with exit status 1, not an exception.

    Also checks that a jobid callback registered on the future fires once
    the jobid is available.
    """
    with FluxExecutor(thread_name_prefix="foobar") as executor:
        jobspec = JobspecV1.from_command(["false"])
        # Create the Event BEFORE registering the callback: the callback can
        # run on the executor thread as soon as the jobid arrives, and the
        # previous code referenced ``event`` before it was assigned.
        event = threading.Event()
        future = executor.submit(jobspec).add_jobid_callback(
            lambda future: event.set()
        )
        # jobid() blocks until the jobid is available; by then the callback
        # must have fired.
        jobid = future.jobid()
        self.assertGreater(jobid, 0)
        self.assertTrue(event.is_set())
        # A nonzero exit status is a result, not an exception.
        self.assertEqual(future.result(), 1)
        self.assertIsNone(future.exception())
def test_submit_after_shutdown(self):
    """submit()/attach() after shutdown must raise RuntimeError."""
    executor = FluxExecutor()
    executor.shutdown(wait=True)
    # Every entry point should refuse new work once shut down, regardless
    # of whether the argument itself is valid.
    for method, arg in (
        (executor.submit, JobspecV1.from_command(["true"])),
        (executor.submit, None),
        (executor.attach, 5),
        (executor.attach, None),
    ):
        with self.assertRaises(RuntimeError):
            method(arg)
    self.assertFalse(executor._broken_event.is_set())
def test_20_004_job_event_watch(self):
    """event_watch yields the full eventlog (10 events) for a trivial job."""
    jobid = job.submit(self.fh, JobspecV1.from_command(["sleep", "0"]))
    self.assertTrue(jobid > 0)
    seen = []
    for entry in job.event_watch(self.fh, jobid):
        self.assertIsInstance(entry, job.EventLogEvent)
        # Every event exposes timestamp/name/context with fixed types.
        self.assertTrue(hasattr(entry, "timestamp"))
        self.assertTrue(hasattr(entry, "name"))
        self.assertTrue(hasattr(entry, "context"))
        self.assertIs(type(entry.timestamp), float)
        self.assertIs(type(entry.name), str)
        self.assertIs(type(entry.context), dict)
        seen.append(entry.name)
    self.assertEqual(len(seen), 10)
def test_16_job_kill(self):
    """SIGKILLing a running job makes job.wait report failure."""
    self.sleep_jobspec = JobspecV1.from_command(["sleep", "1000"])
    jobid = job.submit(self.fh, self.sleep_jobspec, waitable=True)
    # Wait for shell to fully start to avoid delay in signal
    job.event_wait(self.fh, jobid, name="start")
    job.event_wait(
        self.fh, jobid, name="shell.start", eventlog="guest.exec.eventlog"
    )
    job.kill(self.fh, jobid, signum=signal.SIGKILL)
    # Reap the killed job (up to 5s) and check it reports failure.
    status_fut = job.wait_async(self.fh, jobid=jobid).wait_for(5.0)
    waited_id, success, errmsg = status_fut.get_status()
    self.assertEqual(waited_id, jobid)
    self.assertFalse(success)
def test_bad_submit_arguments(self):
    """send bad arguments to ``flux.job.submit``"""
    work_queue = collections.deque()
    stop_event = threading.Event()
    worker = _FluxExecutorThread(stop_event, work_queue, 0.01, (), {})
    futures = [FluxExecutorFuture(threading.get_ident()) for _ in range(5)]
    spec = JobspecV1.from_command(["false"])
    # Every queued submission carries an unknown keyword argument.
    work_queue.extend(((spec,), {"not_an_arg": 42}, f) for f in futures)
    stop_event.set()
    worker.run()
    # The worker drained the queue and has no outstanding flux futures.
    self.assertFalse(work_queue)
    self.assertEqual(0, worker._FluxExecutorThread__remaining_flux_futures)
    for fut in futures:
        # Bad kwargs surface as a TypeError on each user-facing future.
        self.assertIsInstance(fut.exception(), TypeError)
def test_as_completed(self):
    """attach() futures for already-finished jobs behave like the originals."""
    with FluxExecutor() as executor:
        spec = JobspecV1.from_command(["true"])
        submitted = [executor.submit(spec) for _ in range(3)]
        attached = []
        for fut in cf.as_completed(submitted):
            self.assertEqual(fut.result(timeout=0), 0)
            self.assertIsNone(fut.exception())
            # Re-attach to the completed job by jobid.
            twin = executor.attach(fut.jobid())
            self.assertEqual(fut.jobid(), twin.jobid())
            attached.append(twin)
        for twin in cf.as_completed(attached):
            self.assertEqual(twin.result(timeout=0), 0)
            self.assertIsNone(twin.exception())
        self.assertFalse(executor._broken_event.is_set())
def test_cancel(self):
    """A successfully cancelled future raises CancelledError; otherwise exit 1."""
    with FluxExecutor() as executor:
        spec = JobspecV1.from_command(["false"])
        for _ in range(3):
            fut = executor.submit(spec)
            if not fut.cancel():
                # Too late to cancel: the job ran; ``false`` exits 1.
                self.assertEqual(fut.result(), 1)
                self.assertIsNone(fut.exception())
                continue
            self.assertFalse(fut.running())
            self.assertTrue(fut.cancelled())
            # A cancelled submission never acquires a jobid or result.
            with self.assertRaises(cf.CancelledError):
                fut.jobid()
            with self.assertRaises(cf.CancelledError):
                fut.exception()
def test_20_006_job_event_wait(self):
    """event_wait returns the matching event from main and guest eventlogs."""
    jobid = job.submit(self.fh, JobspecV1.from_command(["sleep", "0"]))
    self.assertTrue(jobid > 0)
    # (event name, eventlog) pairs to wait for, in order.
    for name, eventlog in (
        ("start", None),
        ("shell.init", "guest.exec.eventlog"),
        ("clean", None),
    ):
        if eventlog is None:
            entry = job.event_wait(self.fh, jobid, name)
        else:
            entry = job.event_wait(self.fh, jobid, name, eventlog=eventlog)
        self.assertIsInstance(entry, job.EventLogEvent)
        self.assertEqual(entry.name, name)
    # Waiting on an event that never occurs raises OSError.
    with self.assertRaises(OSError):
        job.event_wait(self.fh, jobid, "foo")
def test_cancel(self):
    """Futures cancelled before the worker runs never get jobids or results."""
    work_queue = collections.deque()
    start_event = threading.Event()
    spec = JobspecV1.from_command(["false"])
    worker = _FluxExecutorThread(start_event, work_queue, 0.01, (), {})
    futures = [FluxExecutorFuture(threading.get_ident()) for _ in range(5)]
    for fut in futures:
        # Queue the submission, then cancel it before the worker starts.
        work_queue.append(((spec,), {}, fut))
        fut.cancel()
    start_event.set()
    worker.run()
    for fut in futures:
        with self.assertRaises(cf.CancelledError):
            fut.result()
        with self.assertRaises(cf.CancelledError):
            fut.jobid()
def test_cancel_attach(self):
    """Cancelling an attach() future still leaves its jobid available."""
    with FluxExecutor() as executor:
        spec = JobspecV1.from_command(["true"])
        jobid = executor.submit(spec).jobid()
        for _ in range(3):
            fut = executor.attach(jobid)
            if not fut.cancel():
                # Not cancelled: attach resolves normally; ``true`` exits 0.
                self.assertEqual(fut.result(), 0)
                self.assertIsNone(fut.exception())
                continue
            self.assertFalse(fut.running())
            self.assertTrue(fut.cancelled())
            # Unlike submit futures, attach futures know their jobid up front.
            self.assertEqual(fut.jobid(), jobid)
            with self.assertRaises(cf.CancelledError):
                fut.exception()
        self.assertFalse(executor._broken_event.is_set())
def init_jobspec(self, args):
    """Build an allocation jobspec: flux brokers wrapping the user command.

    Raises ValueError when ``--nslots`` was not given.
    """
    if not args.nslots:
        raise ValueError("Number of slots to allocate must be specified")
    # The job runs a flux broker (one per node) around the user's command.
    argv = ["flux", "broker"]
    argv.extend(list_split(args.broker_opts))
    argv.extend(args.COMMAND)
    jobspec = JobspecV1.from_command(
        command=argv,
        num_tasks=args.nslots,
        cores_per_task=args.cores_per_slot,
        gpus_per_task=args.gpus_per_slot,
        num_nodes=args.nodes,
    )
    jobspec.setattr_shell_option("per-resource.type", "node")
    # Interactive sessions get a pty.
    if sys.stdin.isatty():
        jobspec.setattr_shell_option("pty", True)
    return jobspec
def test_20_007_job_event_wait_exception(self):
    """An unsatisfiable job raises JobException (or OSError when disabled)."""
    event = None
    # 128 tasks cannot be satisfied, so the job receives an alloc exception.
    jobid = job.submit(
        self.fh, JobspecV1.from_command(["sleep", "0"], num_tasks=128)
    )
    self.assertTrue(jobid > 0)
    try:
        event = job.event_wait(self.fh, jobid, "start")
    except job.JobException as err:
        self.assertEqual(err.severity, 0)
        self.assertEqual(err.type, "alloc")
        self.assertGreater(err.timestamp, 0.0)
    # "start" never arrived, so event must still be None.
    self.assertIs(event, None)
    try:
        # With raiseJobException=False the failure surfaces as ENODATA.
        event = job.event_wait(self.fh, jobid, "start", raiseJobException=False)
    except OSError as err:
        self.assertEqual(err.errno, errno.ENODATA)
    self.assertIs(event, None)
def test_20_005_job_event_watch_with_cancel(self):
    """Cancelling an event watch ends the stream early with a None event.

    The watch is cancelled at "start", so fewer than the full set of
    eventlog entries should be delivered.
    """
    jobid = job.submit(
        self.fh, JobspecV1.from_command(["sleep", "3"]), waitable=True
    )
    # assertGreater/assertIsNone give clearer failure messages than
    # assertTrue(x > 0)/assertEqual(x, None).
    self.assertGreater(jobid, 0)
    events = []
    future = job.event_watch_async(self.fh, jobid)
    while True:
        event = future.get_event()
        if event is None:
            # Stream ended (here: because the watch was cancelled).
            break
        if event.name == "start":
            future.cancel()
        events.append(event.name)
    self.assertIsNone(event)
    # Should have less than the expected number of events due to cancel
    self.assertLess(len(events), 8)
    # Clean up: cancel the job itself and reap it.
    job.cancel(self.fh, jobid)
    job.wait(self.fh, jobid)
def test_exception_event(self):
    """Both submit and attach futures deliver the "exception" event."""
    with FluxExecutor() as executor:
        flag = threading.Event()
        submit_fut = executor.submit(
            JobspecV1.from_command(["/not/a/real/app"])
        )
        submit_fut.add_event_callback("exception", lambda fut, event: flag.set())
        self.assertIsInstance(submit_fut.exception(), JobException)
        self.assertTrue(flag.is_set())
        # repeat the test, attaching to the same job
        jobid = submit_fut.jobid()
        flag = threading.Event()
        attach_fut = executor.attach(jobid)
        self.assertEqual(jobid, attach_fut.jobid())
        attach_fut.add_event_callback("exception", lambda fut, event: flag.set())
        self.assertIsInstance(attach_fut.exception(), JobException)
        self.assertTrue(flag.is_set())
        self.assertFalse(executor._broken_event.is_set())
def main():
    """Submit one command --njobs times via FluxExecutor and report timings."""
    parser = argparse.ArgumentParser(
        description="Submit a command repeatedly using FluxExecutor"
    )
    parser.add_argument(
        "-n",
        "--njobs",
        type=int,
        metavar="N",
        help="Set the total number of jobs to run",
        default=100,
    )
    parser.add_argument("command", nargs=argparse.REMAINDER)
    args = parser.parse_args()
    if not args.command:
        args.command = ["true"]
    t0 = time.perf_counter()
    label = "bulksubmit_executor"
    with FluxExecutor() as executor:
        jobspec = JobspecV1.from_command(args.command)
        futures = [executor.submit(jobspec) for _ in range(args.njobs)]
        # wait for the jobid for each job, as a proxy for the job being submitted
        for fut in futures:
            fut.jobid()
        # all jobs submitted - print timings
        dt = time.perf_counter() - t0
        jps = args.njobs / dt
        log(label, f"submitted {args.njobs} jobs in {dt:.2f}s. {jps:.2f}job/s")
        # wait for jobs to complete, updating a progress bar as they finish
        n_done = 0
        for fut in cf.as_completed(futures):
            if n_done == 0:
                log(
                    label,
                    f"First job finished in about {time.perf_counter() - t0:.3f}s",
                )
            n_done += 1
            jps = n_done / (time.perf_counter() - t0)
            progress(n_done / args.njobs, length=58, suffix=f"({jps:.1f} job/s)")
    # print time summary
    dt = time.perf_counter() - t0
    log(label, f"Ran {args.njobs} jobs in {dt:.1f}s. {args.njobs / dt:.1f} job/s")
def test_20_005_1_job_event_watch_with_cancel_stop_true(self):
    """cancel(stop=True) stops the reactor; last delivered event is "start".

    Fixes: the callback previously dereferenced ``event.name`` without
    guarding against ``get_event()`` returning None (end of stream), and
    the reactor_run() return value was bound to an unused local.
    """
    jobid = job.submit(
        self.fh, JobspecV1.from_command(["sleep", "3"]), waitable=True
    )
    self.assertTrue(jobid > 0)
    events = []
    future = job.event_watch_async(self.fh, jobid)

    def cb(future, events):
        event = future.get_event()
        if event is None:
            # Watch ended (e.g. after cancellation); nothing to record.
            return
        if event.name == "start":
            # stop=True also stops the reactor once the watch is cancelled.
            future.cancel(stop=True)
        events.append(event.name)

    future.then(cb, events)
    self.fh.reactor_run()
    # Last event should be "start"
    self.assertEqual(events[-1], "start")
    # Clean up: cancel the job itself and reap it.
    job.cancel(self.fh, jobid)
    job.wait(self.fh, jobid)
def main():
    """Submit njobs copies of compute.py, keeping at most window_size active."""
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("njobs", nargs="?", type=int, default=10)
    parser.add_argument("window_size", nargs="?", type=int, default=2)
    args = parser.parse_args()
    print(args)

    # create jobspec for compute.py
    spec = JobspecV1.from_command(
        command=["./compute.py", "5"], num_tasks=4, num_nodes=2, cores_per_task=2
    )
    spec.cwd = os.getcwd()
    spec.environment = dict(os.environ)

    # Queue of jobspecs still to submit; ``active`` holds incomplete futures.
    pending = collections.deque(spec for _ in range(args.njobs))
    active = []
    with FluxExecutor() as executor:
        while pending or active:
            if len(active) < args.window_size and pending:
                # Room in the window: submit the next jobspec.
                fut = executor.submit(pending.popleft())
                print(f"submit: {id(fut)}")
                active.append(fut)
            else:
                # Window full (or nothing left to submit): wait for one to end.
                done, not_done = cf.wait(active, return_when=cf.FIRST_COMPLETED)
                active = list(not_done)
                for fut in done:
                    if fut.exception() is not None:
                        print(
                            f"wait: {id(fut)} Error: job raised error "
                            f"{fut.exception()}"
                        )
                    elif fut.result() == 0:
                        print(f"wait: {id(fut)} Success")
                    else:
                        print(
                            f"wait: {id(fut)} Error: job returned "
                            f"exit code {fut.result()}"
                        )
def test_20_001_job_event_watch_async(self):
    """Async event watch delivers every eventlog entry, then a None sentinel."""
    myarg = dict(a=1, b=2)
    events = []

    def cb(future, arg):
        # The user argument registered with then() is passed through unchanged.
        self.assertEqual(arg, myarg)
        event = future.get_event()
        if event is None:
            # End of the eventlog: stop the reactor so reactor_run() returns.
            future.get_flux().reactor_stop()
            return
        self.assertIsInstance(event, job.EventLogEvent)
        events.append(event.name)

    jobid = job.submit(self.fh, JobspecV1.from_command(["sleep", "0"]))
    self.assertTrue(jobid > 0)
    future = job.event_watch_async(self.fh, jobid)
    self.assertIsInstance(future, job.JobEventWatchFuture)
    future.then(cb, myarg)
    # Run the reactor until cb() stops it; a nonnegative rc means no error.
    rc = self.fh.reactor_run()
    self.assertGreaterEqual(rc, 0)
    # A trivial job's eventlog here has 10 entries, bracketed by
    # "submit" ... "clean".
    self.assertEqual(len(events), 10)
    self.assertEqual(events[0], "submit")
    self.assertEqual(events[-1], "clean")
def test_20_002_job_event_watch_no_autoreset(self):
    """get_event(autoreset=False) re-delivers the same event until reset()."""
    jobid = job.submit(self.fh, JobspecV1.from_command(["sleep", "0"]))
    self.assertTrue(jobid > 0)
    future = job.event_watch_async(self.fh, jobid)
    self.assertIsInstance(future, job.JobEventWatchFuture)
    # First event should be "submit"
    event = future.get_event(autoreset=False)
    self.assertIsInstance(event, job.EventLogEvent)
    self.assertEqual(event.name, "submit")
    # get_event() again with no reset returns same event:
    event = future.get_event(autoreset=False)
    self.assertIsInstance(event, job.EventLogEvent)
    self.assertEqual(event.name, "submit")
    # reset, then get_event() should get next event
    future.reset()
    event = future.get_event(autoreset=False)
    self.assertIsInstance(event, job.EventLogEvent)
    self.assertEqual(event.name, "validate")
    # Stop the watch so the test does not leave an active stream behind.
    future.cancel()