def drain(args):
    """
    Send a drain request to the resource module for args.targets.
    If args.targets is not specified, list currently drained targets.
    """
    if args.targets is None:
        drain_list()
        return
    payload = {
        "targets": args.targets,
    }
    if args.update and args.force:
        LOGGER.error("Only one of --force and --update may be specified")
        sys.exit(1)
    if args.update:
        payload["mode"] = "update"
    elif args.force:
        payload["mode"] = "overwrite"
    if args.reason:
        payload["reason"] = " ".join(args.reason)
    RPC(
        flux.Flux(),
        "resource.drain",
        payload,
        nodeid=0,
    ).get()

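# Hedged usage sketch for the handler above; the subcommand name and flag
# spellings are assumptions inferred from the args attributes:
#
#   flux resource drain                          # no targets: list drained ranks
#   flux resource drain 2-3 node went bad        # drain ranks 2-3 with a reason
#   flux resource drain --force 2-3 new reason   # overwrite an existing reason
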
def __init__(self, name, npMaxH):
    self.submitted = dict()
    self.fh = flux.Flux()
    jsc.notify_status(self.fh, update_test_status, self)
    # self.broker_thread = thread.start_new_thread(run_broker, (self.fh,))
    self.cores = 0
    max_cores = 0
    self.numNodes = 0
    self.numberCoresInUse = 0
    with kvs.get_dir(self.fh, 'resource.hwloc.by_rank') as d:
        # use a throwaway loop variable so the 'name' argument passed to
        # the superclass below is not clobbered
        for _rank, rankdir in d.items():
            max_cores = max(max_cores, rankdir['Core'])
            self.cores += rankdir['Core']
            self.numNodes += 1
    self.npMax = max_cores
    # initialize the superclass with the real core count
    super(FluxDirect, self).__init__(name, self.cores)
    # self.numberTestsRunningMax = 1  # TODO: REMOVE THIS DEBUG VALUE
    # for flux, this is the number allowed into the scheduling queue
    self.numberTestsRunningMax = 1000
    self.scheduler = FluxScheduler()
    self.timer = self.fh.timer_watcher_create(
        after=self.naptime,
        repeat=self.naptime,
        callback=lambda fh, y, z, w: fh.reactor_stop(fh.get_reactor()))
    self.timer.start()

def jobtap_load(args):
    """Load a jobtap plugin into the job manager"""
    if args.plugin == "none" or args.plugin.startswith("builtin."):
        path = args.plugin
    else:
        path = os.path.abspath(args.plugin)
    try:
        resp = (
            flux.Flux()
            .rpc("job-manager.jobtap", {"load": path, "conf": args.conf})
            .get()
        )
    except FileNotFoundError:
        LOGGER.error("%s not found", args.plugin)
        sys.exit(1)
    if not args.quiet:
        print("Loaded:")
        for name in resp["plugins"]:
            print(name)
        print("Previously loaded:")
        for name in resp["previous"]:
            print(name)

def test_no_topic_invalid(self):
    """flux_request_encode returns EINVAL with no topic string"""
    f = flux.Flux("loop://")
    with self.assertRaises(EnvironmentError) as err:
        f.request_encode(None, json_str)
    err = err.exception
    self.assertEqual(err.errno, errno.EINVAL)

def undrain(args):
    """
    Send an "undrain" request to the resource module for args.targets
    """
    RPC(
        flux.Flux(),
        "resource.undrain",
        {"targets": args.targets},
        nodeid=0,
    ).get()

def submit_bundles(f, N):
    # use the handle passed in rather than opening a second one
    for i in range(0, N):
        print(flux.job.submit(f, compute_jobreq))
        print(flux.job.submit(f, io_jobreq))
    print("bookkeeper: all jobs submitted")

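# A hedged sketch of the two jobspecs assumed by submit_bundles() above;
# the commands and resource shapes are illustrative placeholders, not the
# original definitions.
import flux.job

compute_jobreq = flux.job.JobspecV1.from_command(
    command=["./compute.py", "120"], num_tasks=4, cores_per_task=2
)
io_jobreq = flux.job.JobspecV1.from_command(
    command=["./io-forwarding.py", "120"], num_tasks=1, cores_per_task=1
)
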
def main():
    nthreads = 2
    threads = []
    queue = Queue()
    for i in range(0, nthreads):
        thread = threading.Thread(
            target=get_events,
            args=(
                i,
                queue,
            ),
        )
        thread.start()
        threads.append(thread)
    print(f"starting {nthreads} threads", file=sys.stderr)

    # Ensure threads have subscribed to 'test-event'
    for thread in threads:
        queue.get()
        print(f"got response from {thread}", file=sys.stderr)

    print(f"{nthreads} threads started", file=sys.stderr)
    flux.Flux().event_send("test-event", "hello")
    print("published test-event", file=sys.stderr)

    for thread in threads:
        thread.join()
    print("Done", file=sys.stderr)

def main():
    args = parse_args()
    time0 = time.time()
    jobspec = create_test_jobspec(args)
    bulk = BulkRun(flux.Flux(), args.njobs, jobspec).run(args)
    jobs = bulk.jobs

    # Get the job with the earliest 'submit' event:
    first = jobs[min(jobs.keys(), key=lambda x: jobs[x]["submit"].timestamp)]

    # Get the job with the latest 'clean' event:
    last = jobs[max(jobs.keys(), key=lambda x: jobs[x]["clean"].timestamp)]

    # Get the job with the latest 't_submit' time:
    lastsubmit = jobs[max(jobs.keys(), key=lambda x: jobs[x]["t_submit"])]

    submit_time = lastsubmit["t_submit"] - time0
    sjps = args.njobs / submit_time

    script_runtime = time.time() - time0
    job_runtime = last["clean"].timestamp - first["submit"].timestamp
    jps = args.njobs / job_runtime
    jpsb = args.njobs / script_runtime

    print(f"number of jobs: {args.njobs}")
    print(f"submit time:    {submit_time:<6.3f}s ({sjps:5.1f} job/s)")
    print(f"script runtime: {script_runtime:<6.3f}s")
    print(f"job runtime:    {job_runtime:<6.3f}s")
    print(f"throughput:     {jps:<.1f} job/s (script: {jpsb:5.1f} job/s)")

def main():
    implementation = "bulksubmit"
    start_time = time.perf_counter()
    args = setup_parser().parse_args()

    # open connection to broker
    h = flux.Flux()

    # create a jobspec for a trivial command ("true")
    compute_jobspec = job.JobspecV1.from_command(
        command=["true"], num_tasks=1, num_nodes=1, cores_per_task=1
    )
    compute_jobspec.cwd = os.getcwd()

    done = 0
    for _ in range(args.jobcount):
        job.submit_async(h, compute_jobspec, waitable=True).then(submit_cb)
    if h.reactor_run(h.get_reactor(), 0) < 0:
        h.fatal_error("reactor start failed")
    while done < args.jobcount:
        jobid, success, errstr = job.wait(h)
        if not success:
            print("wait: {} Error: {}".format(jobid, errstr))
        done += 1

    total_time = time.perf_counter() - start_time
    print("Total seconds: {}".format(total_time))
    utils.save_timing_data(args.jobcount, total_time, implementation)

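# A hedged sketch of the submit_cb continuation referenced above: it reaps
# each completed submit future via flux.job.submit_get_id so that submit
# errors surface inside the reactor loop. The body is illustrative.
def submit_cb(future):
    job.submit_get_id(future)
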
def list_handler(args):
    valid_states = ["up", "down", "allocated", "free", "all"]
    headings = {
        "state": "STATE",
        "nnodes": "NNODES",
        "ncores": "NCORES",
        "ngpus": "NGPUS",
        "ranks": "RANKS",
        "rlist": "LIST",
    }
    states = args.states.split(",")
    for state in states:
        if state not in valid_states:
            LOGGER.error("Invalid resource state %s specified", state)
            sys.exit(1)

    fmt = "{state:>10} {nnodes:>6} {ncores:>8} {ngpus:>8}"
    if args.verbose:
        fmt += " {rlist}"
    if args.format:
        fmt = args.format
    formatter = flux.util.OutputFormat(headings, fmt, prepend="0.")

    if args.from_stdin:
        resp = json.load(sys.stdin)
    else:
        resp = RPC(flux.Flux(), "sched.resource-status").get()
    resources = SchedResourceList(resp)

    if not args.no_header:
        print(formatter.header())
    for state in states:
        print(formatter.format(resources[state]))

def drain_list():
    headings = {
        "timestamp": "TIMESTAMP",
        "ranks": "RANK",
        "reason": "REASON",
        "nodelist": "NODELIST",
    }
    resp = RPC(flux.Flux(), "resource.status").get()
    rset = ResourceSet(resp["R"])
    nodelist = rset.nodelist
    lines = []
    for ranks, entry in resp["drain"].items():
        ranks = IDset(ranks)
        line = StatusLine(
            "drain",
            ranks,
            Hostlist([nodelist[i] for i in ranks]),
            entry["reason"],
            entry["timestamp"],
        )
        lines.append(line)
    fmt = "{timestamp:<20} {ranks:<8} {reason:<30} {nodelist}"
    formatter = flux.util.OutputFormat(headings, fmt, prepend="0.")
    print(formatter.header())
    for line in lines:
        print(formatter.format(line))

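# A hedged sketch of the StatusLine record consumed by the formatter above;
# the attribute names mirror the format-string keys, and the timestamp
# rendering is an assumption, not the original implementation.
class StatusLine:
    def __init__(self, state, ranks, nodelist, reason, timestamp):
        self.state = state
        self.ranks = ranks
        self.nodelist = nodelist
        self.reason = reason
        # render the epoch timestamp as an ISO-like string for display
        self.timestamp = time.strftime(
            "%Y-%m-%dT%H:%M:%S", time.localtime(timestamp)
        )
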
def jobtap_remove(args):
    """Remove the jobtap plugin matching args.plugin"""
    try:
        flux.Flux().rpc("job-manager.jobtap", {"remove": args.plugin}).get()
    except FileNotFoundError:
        LOGGER.error("%s not found", args.plugin)
        sys.exit(1)

def get_root_jobinfo():
    """Fetch a mock JobInfo object for the current enclosing instance"""
    handle = flux.Flux()
    size = handle.attr_get("size")
    try:
        # If the enclosing instance has a jobid and a parent-uri, then
        # fill in data from job-list in the parent:
        #
        jobid = JobID(handle.attr_get("jobid"))
        parent = flux.Flux(handle.attr_get("parent-uri"))
        info = JobList(parent, ids=[jobid]).fetch_jobs().get_jobs()[0]
    except OSError:
        # Make a best-effort attempt to create a mock job info dictionary
        uri = handle.attr_get("local-uri")
        nodelist = handle.attr_get("hostlist")
        userid = handle.attr_get("security.owner")
        info = dict(
            id=0,
            userid=int(userid),
            state=flux.constants.FLUX_JOB_STATE_RUN,
            name=".",
            ntasks=int(size),
            nnodes=int(size),
            nodelist=nodelist,
            annotations={"user": {"uri": uri}},
        )
        try:
            info["t_run"] = float(handle.attr_get("broker.starttime"))
        except OSError:
            pass

    # If the 'ranks' idset came from the parent, it could be confusing,
    # so rewrite ranks to be relative to the current instance, i.e.
    # 0-(size-1):
    #
    info["ranks"] = "0-{}".format(int(size) - 1)

    # Fetch instance-specific information for the current instance:
    job = JobInfo(info).get_instance_info()

    # If no jobid was discovered for the root instance, use RootJobID()
    if job.id == 0:
        job.id = RootJobID()
    return job

def get(args):
    """
    Get the current membership of a group. This normally targets rank 0,
    but --rank allows another rank to be queried for testing.
    """
    h = flux.Flux()
    resp = h.rpc("groups.get", {"name": args.name}, nodeid=args.rank).get()
    print(resp["members"])

def __init__(self):
    self.event_router = aurcore.event.EventRouter(name="roombot")
    self.flux = flux.Flux(
        "roombot", admin_id=TOKENS.ADMIN_ID, parent_router=self.event_router
    )
    print("init!")

    @self.flux.router.endpoint(":ready")
    def rdy(event: aurcore.event.Event):
        asyncio.get_running_loop().create_task(self.clock())

def get_events(i, queue):
    f = flux.Flux()
    f.event_subscribe("test-event")
    queue.put(True)
    w = f.msg_watcher_create(cb, topic_glob="test-event", args=i)
    w.start()
    f.reactor_run()
    w.destroy()

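# A hedged sketch of the `cb` message callback referenced above; the
# (handle, watcher, msg, arg) signature and the msg.topic attribute follow
# the flux message-watcher interface, but the body is illustrative.
def cb(handle, watcher, msg, i):
    print(f"thread {i}: received {msg.topic}", file=sys.stderr)
    # one event is enough; stop the reactor so get_events() can clean up
    handle.reactor_stop()
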
def setUpClass(self):
    self.f = flux.Flux()
    self.job_spec = json.dumps(
        {
            "nnodes": 1,
            "ntasks": 1,
            "cmdline": ["sleep", "0"],
            "walltime": 15,
        }
    )

def add_service_and_disconnect():
    import sys

    h = flux.Flux()
    try:
        h.service_register("baz").get()
    except Exception:
        sys.exit(-1)
    sys.exit(0)

def status(args):
    valid_states = [
        "all",
        "online",
        "avail",
        "offline",
        "exclude",
        "drain",
        "draining",
        "drained",
    ]
    default_states = "avail,offline,exclude,draining,drained"
    headings = {
        "state": "STATUS",
        "nnodes": "NNODES",
        "ranks": "RANKS",
        "nodelist": "NODELIST",
        "reason": "REASON",
    }

    # Emit a list of valid states or formats if requested
    if "help" in [args.states, args.format]:
        status_help(args, valid_states, headings)

    # Get state list from args or defaults:
    states = status_get_state_list(args, valid_states, default_states)

    # Include the reason field only with -vv
    if args.verbose >= 2:
        fmt = "{state:>10} {nnodes:>6} {reason:<25} {nodelist}"
    else:
        fmt = "{state:>10} {nnodes:>6} {nodelist}"
    if args.format:
        fmt = args.format

    # Get payload from stdin or from the resource.status RPC:
    if args.from_stdin:
        resp = sys.stdin.read()
        allocated = IDset()
    else:
        rpc = ListStatusRPC(flux.Flux())
        resp = rpc.get_status()
        allocated = rpc.get_allocated_ranks()

    rstat = ResourceStatus.from_status_response(resp, fmt, allocated)

    formatter = flux.util.OutputFormat(headings, fmt, prepend="0.")
    if not args.no_header:
        print(formatter.header())
    for line in sorted(rstat, key=lambda x: valid_states.index(x.state)):
        if line.state not in states:
            continue
        # Skip empty lines unless --verbose or --states
        if line.nnodes == 0 and args.states is None and not args.verbose:
            continue
        print(formatter.format(line))

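# A hedged sketch of the status_get_state_list() helper used above,
# mirroring the validation loop in list_handler(): split the
# comma-separated args.states (or the defaults) and reject unknown names.
def status_get_state_list(args, valid_states, default_states):
    states = (args.states or default_states).split(",")
    for state in states:
        if state not in valid_states:
            LOGGER.error("Invalid resource state %s specified", state)
            sys.exit(1)
    return states
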
def job_exec_start(args):
    """Start a testexec job under manual override"""
    try:
        flux.Flux().rpc(
            "job-exec.override", {"event": "start", "jobid": args.jobid}
        ).get()
    except OSError as exc:
        LOGGER.error("%s", exc.strerror)
        sys.exit(1)

def __init__(self):
    self.event_router = aurcore.event.EventRouter(name="roombot")
    self.flux = flux.Flux(
        "pinbot", admin_id=TOKENS.ADMIN_ID, parent_router=self.event_router
    )
    print("init!")

    @self.flux.router.endpoint(":ready")
    def rdy(event: aurcore.event.Event):
        print("Ready!")

def kill(args):
    h = flux.Flux(os.environ.get("FLUX_START_URI"))
    try:
        h.rpc(
            "start.kill", {"rank": int(args.rank), "signum": int(args.signum)}
        ).get()
    except ProcessLookupError:
        LOGGER.error("rank %s broker process not found", args.rank)
        sys.exit(1)

def test_null_handle_exception(self):
    f = flux.Flux()
    payload = {"seq": 1, "pad": "stuff"}
    future = f.rpc("cmb.ping", payload)
    resp = future.get()
    future.pimpl.handle = None
    with six.assertRaisesRegex(
        self, ValueError, r"Attempting to call a cached, bound method.*NULL handle"
    ):
        resp = future.get()

def reload(args):
    """
    Send a "reload" request to the resource module
    """
    RPC(
        flux.Flux(),
        "resource.reload",
        {
            "path": os.path.realpath(args.path),
            "xml": args.xml,
            "force": args.force,
        },
        nodeid=0,
    ).get()

def __getattr__(self, attr):
    if attr == "flux":
        # Allow one flux handle per thread, created on demand:
        try:
            return self.tls.flux
        except AttributeError:
            self.tls.flux = flux.Flux()
            return self.tls.flux
    else:
        # Return components of the validate request as attrs
        return self.jobinfo[attr]

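# A hedged sketch of the surrounding class state that the __getattr__ above
# relies on: a threading.local() slot caching one handle per thread (flux
# handles should not be shared across threads) plus the validate request's
# jobinfo dict. The class name and layout are assumptions for illustration.
import threading

class _ValidateJobSketch:
    def __init__(self, jobinfo):
        self.tls = threading.local()  # lazily holds one flux.Flux() per thread
        self.jobinfo = jobinfo
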
def setUpClass(self):
    self.fh = flux.Flux()
    self.jobspec_dir = os.path.abspath(
        os.path.join(os.environ["FLUX_SOURCE_DIR"], "t", "jobspec")
    )
    # get a valid jobspec
    basic_jobspec_fname = os.path.join(
        self.jobspec_dir, "valid", "basic_v1.yaml"
    )
    with open(basic_jobspec_fname, "rb") as infile:
        basic_yaml = infile.read()
    self.basic_jobspec = yaml_to_json(basic_yaml)

def drain(args):
    """
    Send a drain request to the resource module for args.idset
    """
    RPC(
        flux.Flux(),
        "resource.drain",
        {"idset": args.idset, "reason": " ".join(args.reason)},
    ).get()

def main():
    h = flux.Flux()
    alloc = h.rpc("sched.alloc", json.dumps({"id": 0}))
    free = h.rpc("sched.free", json.dumps({"id": 0}))
    print("Sent alloc and free requests")
    h.rpc("cmb.rmmod", json.dumps({"name": args.sched_module})).get()
    print("Removed {}".format(args.sched_module))
    expect_enosys(alloc)
    expect_enosys(free)

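# A hedged sketch of the expect_enosys() helper used above: once the
# scheduler module is unloaded, the pending alloc/free RPCs should fail
# with ENOSYS. The exact error handling here is an assumption.
def expect_enosys(future):
    try:
        future.get()
        sys.exit("expected ENOSYS, but RPC succeeded")
    except EnvironmentError as exc:
        if exc.errno != errno.ENOSYS:
            raise
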
def run_per_rank(name, jobid, args):
    """Run args.exec_per_rank on every rank of jobid

    If the command fails on any rank, then drain that rank.
    """
    returncode = 0

    if args.exec_per_rank is None:
        return 0

    per_rank_cmd = args.exec_per_rank.split(",")

    processes = {}
    fail_ids = IDset()

    handle = flux.Flux()
    hostlist = flux.hostlist.Hostlist(handle.attr_get("hostlist"))

    ranks = fetch_job_ranks(handle, jobid)
    if ranks is None:
        return 1

    if args.verbose:
        LOGGER.info(
            "%s: %s: executing %s on ranks %s", jobid, name, per_rank_cmd, ranks
        )

    for rank in ranks:
        cmd = ["flux", "exec", "-qn", f"-r{rank}"] + per_rank_cmd
        processes[rank] = process_create(cmd, stderr=subprocess.PIPE)

    for rank in ranks:
        rc = processes[rank].wait()
        for line in processes[rank].stderr:
            errline = line.decode("utf-8").rstrip()
            LOGGER.error("%s (rank %d): %s", hostlist[rank], rank, errline)
        if rc != 0:
            fail_ids.set(rank)
            if rc > returncode:
                returncode = rc

    if len(fail_ids) > 0:
        LOGGER.error("%s: rank %s failed %s, draining", jobid, fail_ids, name)
        drain(handle, fail_ids, f"{name} failed for jobid {jobid}")

    if args.verbose:
        ranks.subtract(fail_ids)
        if len(ranks) > 0:
            LOGGER.info("%s: %s: completed successfully on %s", jobid, name, ranks)

    return returncode

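# Hedged sketches of two helpers assumed by run_per_rank() above; the names
# come from the calls in that function, and the bodies are illustrative,
# not the original implementations.
def process_create(cmd, **kwargs):
    # thin wrapper over subprocess.Popen
    return subprocess.Popen(cmd, **kwargs)

def drain(handle, ids, reason):
    # drain the failed ranks, recording why they were drained
    handle.rpc(
        "resource.drain", {"targets": str(ids), "reason": reason}, nodeid=0
    ).get()
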
def waitfor(args):
    """
    Wait for a group to have zero (or --count) members.
    """
    h = flux.Flux()
    rpc = h.rpc(
        "groups.get",
        {"name": args.name},
        nodeid=0,
        flags=flux.constants.FLUX_RPC_STREAMING,
    )
    rpc.then(waitfor_continuation, args.count)
    h.reactor_run()

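# A hedged sketch of the waitfor_continuation() callback registered above:
# each streaming response is checked against the target member count, and
# the reactor is stopped once it matches. Resetting the future to re-arm
# the streaming RPC is an assumption about the intended flow.
def waitfor_continuation(rpc, count):
    resp = rpc.get()
    if len(resp["members"]) == count:
        rpc.get_flux().reactor_stop()
    else:
        rpc.reset()  # wait for the next streaming response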