def celery_status():
    from celery import Celery

    app = Celery()
    appc = app.control.inspect()

    celery_d = celery_driver()
    celery_d.jobs = []
    master_name = socket.gethostname()
    nodes = set()
    i = 0
    # first pass is active jobs, second pass is reserved (queued) jobs
    for arr in (arr for arr in [appc.active(), appc.reserved()] if arr is not None):
        i += 1
        for k, v in arr.items():
            on_master = c_strip(k) == master_name
            nodes.add(c_strip(k))
            for _job in v:
                print(_job)
                if i == 1 and not on_master:
                    # active job on a worker node: pin it to its current host
                    job = Job(
                        name=_job["id"],
                        constraints={"ncpus": 1},
                        executing_hostnames=[c_strip(_job["hostname"])],
                    )
                else:
                    job = Job(name=_job["id"], constraints={"ncpus": 1})
                celery_d.jobs.append(job)

    celery_d.scheduler_nodes = [SchedulerNode(hostname=x) for x in list(nodes)]
    return celery_d
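# celery_status relies on a c_strip helper defined elsewhere in this module.
# Below is a minimal sketch of what it is assumed to do: celery's inspect()
# keys workers as "celery@<hostname>", so stripping everything up to and
# including the "@" yields the bare hostname. This is a hypothetical
# reconstruction for illustration; the real helper may differ.
def c_strip_sketch(worker_name: str) -> str:
    # "celery@host01" -> "host01"; names without "@" pass through unchanged
    return worker_name.split("@", 1)[-1]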
def target_counts_demand() -> None:
    """
    Handle a mixture of 'target count' style allocation of ncpus and nodes
    via the DemandCalculator.
    """
    dcalc = new_demand_calculator(CONFIG)

    # 10 slots of ncpus=1, non-exclusive
    dcalc.add_job(
        Job(
            "tc-10",
            {"node.nodearray": "htc", "ncpus": 1, "exclusive": False},
            iterations=10,
        )
    )

    # 10 exclusive nodes with 4 slots (ncpus) each
    dcalc.add_job(
        Job(
            "tn-10",
            {"node.nodearray": "htc", "ncpus": 4, "exclusive": True},
            node_count=10,
        )
    )

    # 5 exclusive nodes with 2 slots (ncpus) each
    dcalc.add_job(
        Job(
            "tn-2x5",
            {"node.nodearray": "htc", "ncpus": 2, "exclusive": True},
            node_count=5,
        )
    )

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    print_demand(["name", "job_ids", "nodearray", "ncpus", "*ncpus"], demand_result)

    # 18 = 3 packed nodes for tc-10 + 10 exclusive for tn-10 + 5 exclusive for tn-2x5
    assert len(demand_result.new_nodes) == 18
def test_clone() -> None:
    orig = SchedulerNode("lnx0", {"ncpus": 4})
    orig.metadata["exists_in_both"] = True
    new = orig.clone()
    assert new.available["ncpus"] == 4
    assert new.resources["ncpus"] == 4
    new.available["ncpus"] -= 1
    assert new.available["ncpus"] == 3
    assert orig.available["ncpus"] == 4

    job = Job("1", {"ncpus": 2})
    new.decrement(job._constraints, assignment_id=job.name)
    assert new.available["ncpus"] == 1
    assert orig.available["ncpus"] == 4
    assert new.assignments == set(["1"])
    assert orig.assignments == set()

    orig.metadata["exists_in_orig"] = True
    new.metadata["exists_in_new"] = True

    assert orig.metadata["exists_in_both"] is True
    assert "exists_in_new" not in orig.metadata
    assert orig.metadata["exists_in_orig"] is True

    assert new.metadata["exists_in_both"] is True
    assert new.metadata["exists_in_new"] is True
    assert "exists_in_orig" not in new.metadata
def _xjob(jobid, constraints=None):
    constraints = constraints or [{"slots": 1}]
    if not isinstance(constraints, list):
        constraints = [constraints]
    constraints += [{"exclusive": True}]
    return Job(jobid, constraints=constraints)
def test_bug100(mixedbindings) -> None:
    dcalc = _new_dc(mixedbindings)

    # 10 slots of ncpus=1
    dcalc.add_job(Job("tc-10", {"node.nodearray": "htc", "ncpus": 1}, iterations=10))

    demand = dcalc.finish()
    assert len(demand.new_nodes) == 3
def test_no_buckets():
    node_mgr = NodeManager(MockClusterBinding(), [])
    dc = DemandCalculator(
        node_mgr, NullNodeHistory(), singleton_lock=util.NullSingletonLock()
    )
    result = dc._add_job(Job("1", {"ncpus": 2}))
    assert not result
    assert "NoBucketsDefined" == result.status
def target_counts_demand() -> None:
    """
    Handle a mixture of 'target count' style allocation of ncpus and nodes
    via the DemandCalculator.
    """
    dcalc = new_demand_calculator(CONFIG)

    # job requires 10 cores (ncpus)
    dcalc.add_job(
        Job(
            name="tc-10",
            constraints={"node.nodearray": "htc", "ncpus": 1, "exclusive": False},
            iterations=10,
        )
    )

    # job requires 10 nodes with 4 cores (ncpus) each
    dcalc.add_job(
        Job(
            name="tn-10",
            constraints={"node.nodearray": "htc", "ncpus": 4, "exclusive": True},
            node_count=10,
        )
    )

    # 2 ncpus x 5 nodes
    dcalc.add_job(
        Job(
            name="tn-2x5",
            constraints={"node.nodearray": "htc", "ncpus": 2, "exclusive": True},
            node_count=5,
        )
    )

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    # note that /ncpus will display available/total, ncpus will display the
    # total, and *ncpus will display available.
    print_demand(["name", "job_ids", "nodearray", "/ncpus"], demand_result)
def _mpi_job(job_name="1", nodes=1, placeby="pg", resources=None):
    resources = resources or {"ncpus": 2}
    constraints = get_constraints([resources])
    constraints.append(InAPlacementGroup())
    constraints.append(ExclusiveNode())
    return Job(
        job_name,
        constraints=constraints,
        node_count=nodes,
        colocated=True,
    )
def preprocess_jobs_stdin(stdin=sys.stdin, stdout=sys.stdout) -> None:
    # load the json from stdin
    job_dicts = json.load(stdin)

    # parse the job dictionaries into hpc Job objects
    jobs = [Job.from_dict(n) for n in job_dicts]

    # run our preprocessing
    modified_jobs = preprocess_jobs(jobs)

    # finally dump the modified jobs out to stdout
    json.dump(modified_jobs, stdout, default=lambda x: x.to_dict())
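# A self-contained way to exercise preprocess_jobs_stdin without touching
# real pipes is to feed it StringIO objects. This is hypothetical wiring for
# illustration: preprocess_jobs is defined elsewhere in this module, and the
# demo only assumes Job.to_dict/from_dict round-trip as used above.
def _demo_preprocess_jobs_stdin() -> str:
    import io

    # serialize a single sample job the same way a submitter would
    fake_stdin = io.StringIO(json.dumps([Job("1", {"ncpus": 2}).to_dict()]))
    fake_stdout = io.StringIO()
    preprocess_jobs_stdin(stdin=fake_stdin, stdout=fake_stdout)
    return fake_stdout.getvalue()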
def do_draw(self, data: Any) -> Job:
    import hypothesis.internal.conjecture.utils as d

    idx = d.integer_range(data, 0, 1_000_000_000)
    r = random.Random(idx)

    def draw_value(rtype_draw: Optional[int] = None) -> Optional[Any]:
        if rtype_draw is None:
            rtype_draw = r.randint(0, 4)

        if rtype_draw == 0:
            return r.randint(0, 100)
        elif rtype_draw == 1:
            return r.random() * 100
        elif rtype_draw == 2:

            def draw_letter():
                return r.choice(string.ascii_letters)

            return "".join([draw_letter() for n in range(r.randint(0, 100))])
        elif rtype_draw == 3:
            return r.random() < 0.5
        else:
            list_length = r.randint(0, 10)
            list_type = r.randint(0, 3)  # exclude lists, to avoid nesting
            return [draw_value(list_type) for _ in range(list_length)]

    job_id = "j-o-b_-{}".format(r.randint(1, 1000000))

    constraints: Dict[str, Optional[Any]] = {}
    num_resources = r.randint(0, 10)
    for n in range(num_resources):
        cname = "cons-{}".format(n)
        constraints[cname] = draw_value()

    job = Job(
        job_id,
        constraints,
        iterations=r.randint(0, 100),
        node_count=r.randint(0, 100),
        colocated=r.random() < 0.5,
        packing_strategy=r.choice(["pack", "scatter", None]),
        # draw_value(2) always draws a string, i.e. a hostname-like value
        executing_hostnames=None
        if r.random() < 0.5
        else [draw_value(2) for _ in range(r.randint(0, 5))],
    )
    job.iterations_remaining -= r.randint(0, job.iterations)

    for n in range(r.randint(0, 5)):
        job.metadata["meta-{}".format(n)] = draw_value()

    return job
def scale_up() -> DemandCalculator:
    dcalc = new_demand_calculator(CONFIG)
    dcalc.add_job(
        Job("tc-100", {"node.nodearray": "htc", "ncpus": 1}, iterations=50)
    )

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    print_demand(columns, demand_result)

    dcalc.node_history.conn.close()

    return dcalc
def scale_down(dcalc: typing.Optional[DemandCalculator]) -> None:
    dcalc = dcalc or new_demand_calculator(CONFIG)
    dcalc.add_job(
        Job("tc-50", {"node.nodearray": "htc", "ncpus": 1}, iterations=25)
    )

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    print_demand(columns, demand_result)

    print(
        "The following nodes can be shutdown: {}".format(
            ",".join([n.name for n in demand_result.unmatched_nodes])
        )
    )
def _pack_job(self, job: Job) -> Result:
    """
    1) will it ever fit? - check num nodes with any capacity
    2) does it have the proper resources? bucket.match(job.resources)
    3) order them
    4) tell the bucket to allocate X nodes - let the bucket figure out
       what is new and what is not.
    """
    # TODO break non-exclusive
    allocated_nodes: List[Node] = []
    slots_to_allocate = job.iterations_remaining
    assert job.iterations_remaining > 0

    available_buckets = self.node_mgr.get_buckets()
    # I don't want to fill up the log with rejecting placement groups
    # so just filter them here
    filter_by_colocated = [
        b for b in available_buckets if bool(b.placement_group) == job.colocated
    ]
    candidates_result = job.bucket_candidates(filter_by_colocated)

    if not candidates_result:
        logging.warning("There are no resources to scale up for job %s", job)
        logging.warning("See below:")
        for child_result in candidates_result.child_results or []:
            logging.warning("    %s", child_result.message)
        return candidates_result

    failure_reasons = self._handle_allocate(
        job, allocated_nodes, all_or_nothing=False
    )

    # we have allocated at least some tasks
    if allocated_nodes:
        return AllocationResult(
            "success", nodes=allocated_nodes, slots_allocated=slots_to_allocate
        )

    return AllocationResult("Failed", reasons=failure_reasons)
def onprem_burst_demand() -> None:
    onprem001 = SchedulerNode(
        "onprem001", resources={"onprem": True, "nodetype": "A", "ncpus": 16}
    )
    onprem002 = SchedulerNode(
        "onprem002", resources={"onprem": True, "nodetype": "A", "ncpus": 32}
    )

    # onprem002 already has 10 cores occupied
    onprem002.available["ncpus"] -= 10

    dcalc = new_demand_calculator(CONFIG, existing_nodes=[onprem001, onprem002])
    dcalc.node_mgr.add_default_resource(
        {"node.nodearray": ["htc", "htcspot"]}, "nodetype", "A"
    )
    assert (
        [b for b in dcalc.node_mgr.get_buckets() if b.nodearray == "htc"][0]
        .resources["nodetype"]
        == "A"
    )
    # defaults are first-match-wins, so this does not override the above
    dcalc.node_mgr.add_default_resource({}, "nodetype", "B")
    assert (
        [b for b in dcalc.node_mgr.get_buckets() if b.nodearray == "htc"][0]
        .resources["nodetype"]
        == "A"
    )

    # we want 50 ncpus, but there are only 38 available on-premise
    # (16 + (32 - 10)), so we need to burst 12 more cores.
    dcalc.add_job(Job("tc-100", {"nodetype": "A", "ncpus": 1}, iterations=50))

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    # also note we can add a default to a column with a ":", like
    # onprem:False, as onprem is only defined on the on-premise nodes and
    # not on the Azure nodes.
    print_demand(
        ["name", "job_ids", "nodetype", "onprem:False", "*ncpus"], demand_result
    )
def _handle_allocate(
    self,
    job: Job,
    allocated_nodes_out: List[Node],
    all_or_nothing: bool,
) -> Optional[List[str]]:
    result = job.do_allocate(
        self.node_mgr,
        all_or_nothing=all_or_nothing,
        allow_existing=True,
    )

    if not result:
        return result.reasons

    for node in result.nodes:
        if not node.exists and node.metadata.get("__demand_allocated") is None:
            self.__scheduler_nodes_queue.push(node)
            node.metadata["__demand_allocated"] = True

    allocated_nodes_out.extend(result.nodes)
    return None
def _htc_job(job_name="1", slot_count=1, resources=None):
    resources = resources or {"ncpus": 2}
    return Job(job_name, resources, iterations=slot_count)
def jobs_list() -> List[Job]:
    jobs = []
    for n, job_i in enumerate(job_iters):
        jobs.append(Job(str(n), {"ncpus": ncpus_per_job[n]}, iterations=job_i))
    return jobs
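# jobs_list reads the module-level inputs job_iters and ncpus_per_job, which
# are defined elsewhere in the real script. Hypothetical example values, for
# illustration only:
#
# job_iters = [10, 5]      # iterations per job
# ncpus_per_job = [1, 4]   # ncpus constraint per job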
def _job(jobid, constraints=None, t=1):
    constraints = constraints or [{"slots": 1}]
    if not isinstance(constraints, list):
        constraints = [constraints]
    return Job(jobid, constraints=constraints, iterations=t)
def parse_jobs(
    pbscmd: PBSCMD,
    resource_definitions: Dict[str, PBSProResourceDefinition],
    queues: Dict[str, PBSProQueue],
    resources_for_scheduling: Set[str],
) -> List[Job]:
    """
    Parses PBS qstat output and creates relevant hpc.autoscale.job.job.Job objects
    """
    parser = get_pbspro_parser()
    # alternate format triggered by
    # -a, -i, -G, -H, -M, -n, -r, -s, -T, or -u
    ret: List[Job] = []

    response: Dict = pbscmd.qstat_json("-f", "-t")

    for job_id, jdict in response.get("Jobs", {}).items():
        job_id = job_id.split(".")[0]

        job_state = jdict.get("job_state")
        if not job_state:
            logging.warning("No job_state defined for job %s. Skipping", job_id)
            continue

        if job_state != PBSProJobStates.Queued:
            continue

        # ensure we don't autoscale jobs from disabled or non-started queues
        qname = jdict.get("queue")
        if not qname or qname not in queues:
            logging.warning("queue was not defined for job %s: ignoring", job_id)
            continue

        queue: PBSProQueue = queues[qname]
        if not queue.enabled:
            logging.fine("Skipping job %s from disabled queue %s", job_id, qname)
            continue

        if not queue.started:
            logging.fine("Skipping job %s from non-started queue %s", job_id, qname)
            continue

        # handle array vs individual jobs
        if jdict.get("array"):
            iterations = parser.parse_range_size(jdict["array_indices_submitted"])
            remaining = parser.parse_range_size(jdict["array_indices_remaining"])
        elif "[" in job_id:
            continue
        else:
            iterations = 1
            remaining = 1

        res_list = jdict["Resource_List"]
        res_list["schedselect"] = jdict["schedselect"]
        rdict = parser.convert_resource_list(res_list)

        pack = (
            PackingStrategy.PACK
            if rdict["place"]["arrangement"] in ["free", "pack"]
            else PackingStrategy.SCATTER
        )

        # SMP style jobs
        is_smp = (
            rdict["place"].get("grouping") == "host"
            or rdict["place"]["arrangement"] == "pack"
        )

        # pack jobs do not need to define node_count
        node_count = int(rdict.get("nodect", "0"))

        smp_multiplier = 1

        if is_smp:
            smp_multiplier = max(1, iterations) * max(1, node_count)
            # for key, value in list(rdict.items()):
            #     if isinstance(value, (float, int)):
            #         value = value * smp_multiplier
            iterations = node_count = 1

        effective_node_count = max(node_count, 1)

        # htc jobs set ungrouped=true. see our default htcq
        colocated = (
            not is_smp
            and queue.uses_placement
            and rdict.get("ungrouped", "false").lower() == "false"
        )

        sharing = rdict["place"].get("sharing")

        for n, chunk_base in enumerate(rdict["schedselect"]):

            chunk: Dict[str, Any] = {}

            chunk.update(rdict)

            if "ncpus" not in chunk_base:
                chunk["ncpus"] = chunk["ncpus"] // effective_node_count

            if smp_multiplier > 1:
                for key, value in list(chunk_base.items()):
                    if isinstance(value, (int, float)):
                        chunk_base[key] = value * smp_multiplier

            # do this _after_ rdict, since the chunks
            # will override the top level resources
            # e.g. notice that ncpus=4. This will be the rdict value
            # but the chunks have ncpus=2
            # Resource_List.ncpus = 4
            # Resource_List.nodect = 2
            # Resource_List.select = 2:ncpus=2
            chunk.update(chunk_base)

            working_constraint: Dict[str, Any] = {}
            constraints = [working_constraint]

            if colocated:
                working_constraint["in-a-placement-group"] = True

            my_job_id = job_id
            if len(rdict["schedselect"]) > 1:
                if "." in job_id:
                    job_index, host = job_id.split(".", 1)
                    my_job_id = "{}+{}.{}".format(job_index, n, host)
                else:
                    my_job_id = "{}+{}".format(job_id, n)

            if sharing == "excl":
                working_constraint["exclusive-task"] = True
            elif sharing == "exclhost":
                working_constraint["exclusive"] = True

            job_resources = {}

            for rname, rvalue in chunk.items():
                if rname in ["select", "schedselect", "place", "nodect"]:
                    continue

                if rname not in resources_for_scheduling:
                    if rname == "skipcyclesubhook":
                        continue
                    logging.warning(
                        "Ignoring resource %s as it was not defined in sched_config",
                        rname,
                    )
                    continue

                # add all resource requests here. By that, I mean
                # non resource requests, like exclusive, should be ignored
                # required for get_non_host_constraints
                job_resources[rname] = rvalue

                resource_def = resource_definitions.get(rname)

                # constraints are for the node/host
                # queue/scheduler level ones will be added using
                # > queue.get_non_host_constraints(job_resource)
                if not resource_def or not resource_def.is_host:
                    continue

                if rname not in working_constraint:
                    working_constraint[rname] = rvalue
                else:
                    # hit a conflict, so start a new working cons
                    # so we maintain precedence
                    working_constraint = {rname: rvalue}
                    constraints.append(working_constraint)

            queue_constraints = queue.get_non_host_constraints(job_resources)
            constraints.extend(queue_constraints)

            job = Job(
                name=my_job_id,
                constraints=constraints,
                iterations=iterations,
                node_count=node_count,
                colocated=colocated,
                packing_strategy=pack,
            )
            job.iterations_remaining = remaining
            ret.append(job)

    return ret
def job_buffer(n=1):
    jobs = []
    for i in range(n):
        jobs.append(Job(name="pad-%s" % i, constraints={"ncpus": 1}))
    return jobs
def test_job_json(a: Job):
    b = Job.from_dict(a.to_dict())
    assert _cmp(a, b)
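# test_job_json is presumably driven by the Job-drawing SearchStrategy whose
# do_draw is shown above. A sketch of the usual hypothesis wiring, assuming
# that strategy class is named JobsStrategy (its real name is not shown in
# this excerpt):
#
# from hypothesis import given
#
# @given(a=JobsStrategy())
# def test_job_json(a: Job) -> None:
#     b = Job.from_dict(a.to_dict())
#     assert _cmp(a, b)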