Example #1
def celery_status():
    from celery import Celery
    app = Celery()
    appc = app.control.inspect()
    celery_d = celery_driver()
    celery_d.jobs = []
    master_name = socket.gethostname()
    nodes = set()
    i = 0
    for arr in (arr for arr in [appc.active(), appc.reserved()]
                if arr is not None):
        i += 1
        for k, v in arr.items():
            on_master = False
            if c_strip(k) == master_name:
                on_master = True
            nodes.add(c_strip(k))
            for _job in v:
                print(_job)
                if i == 1 and not on_master:
                    job = Job(name=_job['id'],
                              constraints={"ncpus": 1},
                              executing_hostnames=[c_strip(_job['hostname'])])
                else:
                    job = Job(name=_job['id'], constraints={"ncpus": 1})
                celery_d.jobs.append(job)

    celery_d.scheduler_nodes = [SchedulerNode(hostname=x) for x in list(nodes)]
    return celery_d
Example #2
def target_counts_demand() -> None:
    """
    TODO
    """
    dcalc = new_demand_calculator(CONFIG)

    # job requires 10 cores (ncpus=1 x 10 iterations)
    dcalc.add_job(
        Job(
            "tc-10",
            {
                "node.nodearray": "htc",
                "ncpus": 1,
                "exclusive": False
            },
            iterations=10,
        ))

    # 10 nodes
    dcalc.add_job(
        Job(
            "tn-10",
            {
                "node.nodearray": "htc",
                "ncpus": 4,
                "exclusive": True
            },
            node_count=10,
        ))

    # 2 x 5 nodes (exclusive, so these cannot share nodes with tc-10)
    dcalc.add_job(
        Job(
            "tn-2x5",
            {
                "node.nodearray": "htc",
                "ncpus": 2,
                "exclusive": True
            },
            node_count=5,
        ))

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    print_demand(["name", "job_ids", "nodearray", "ncpus", "*ncpus"],
                 demand_result)

    assert len(demand_result.new_nodes) == 18
Example #3
def test_clone() -> None:
    orig = SchedulerNode("lnx0", {"ncpus": 4})
    orig.metadata["exists_in_both"] = True
    new = orig.clone()
    assert new.available["ncpus"] == 4
    assert new.resources["ncpus"] == 4
    new.available["ncpus"] -= 1
    assert new.available["ncpus"] == 3
    assert orig.available["ncpus"] == 4

    job = Job("1", {"ncpus": 2})
    new.decrement(job._constraints, assignment_id=job.name)
    assert new.available["ncpus"] == 1
    assert orig.available["ncpus"] == 4
    assert new.assignments == set(["1"])
    assert orig.assignments == set()

    orig.metadata["exists_in_orig"] = True
    new.metadata["exists_in_new"] = True

    assert orig.metadata["exists_in_both"] is True
    assert "exists_in_new" not in orig.metadata
    assert orig.metadata["exists_in_orig"] is True

    assert new.metadata["exists_in_both"] is True
    assert new.metadata["exists_in_new"] is True
    assert "exists_in_orig" not in new.metadata
Example #4
    def _xjob(jobid, constraints=None):
        constraints = constraints or [{"slots": 1}]
        if not isinstance(constraints, list):
            constraints = [constraints]
        constraints += [{"exclusive": True}]

        return Job(jobid, constraints=constraints)
Example #5
def test_bug100(mixedbindings) -> None:
    dcalc = _new_dc(mixedbindings)

    # job requires 10 cores (ncpus=1 x 10 iterations)
    dcalc.add_job(Job("tc-10", {"node.nodearray": "htc", "ncpus": 1}, iterations=10,))
    demand = dcalc.finish()

    assert len(demand.new_nodes) == 3
Example #6
def test_no_buckets():
    node_mgr = NodeManager(MockClusterBinding(), [])
    dc = DemandCalculator(
        node_mgr, NullNodeHistory(), singleton_lock=util.NullSingletonLock()
    )
    result = dc._add_job(Job("1", {"ncpus": 2}))
    assert not result
    assert "NoBucketsDefined" == result.status
Example #7
def target_counts_demand() -> None:
    """
    Handle a mixture of 'target count' style allocation of ncpus and nodes via the
    DemandCalculator.
    """
    dcalc = new_demand_calculator(CONFIG)

    # job requires 10 cores (ncpus)
    dcalc.add_job(
        Job(
            name="tc-10",
            constraints={"node.nodearray": "htc", "ncpus": 1, "exclusive": False},
            iterations=10,
        )
    )

    # job requires 10 nodes with 4 cores (ncpus)
    dcalc.add_job(
        Job(
            name="tn-10",
            constraints={"node.nodearray": "htc", "ncpus": 4, "exclusive": True},
            node_count=10,
        )
    )

    # 2 x 5 nodes
    dcalc.add_job(
        Job(
            name="tn-2x5",
            constraints={"node.nodearray": "htc", "ncpus": 2, "exclusive": True},
            node_count=5,
        ),
    )

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    # note that /ncpus will display available/total. ncpus will display the total, and
    # *ncpus will display available.
    print_demand(["name", "job_ids", "nodearray", "/ncpus"], demand_result)
Example #8
def _mpi_job(job_name="1", nodes=1, placeby="pg", resources=None):
    resources = resources or {"ncpus": 2}
    constraints = get_constraints([resources])
    constraints.append(InAPlacementGroup())
    constraints.append(ExclusiveNode())
    return Job(
        job_name,
        constraints=constraints,
        node_count=nodes,
        colocated=True,
    )
Example #9
def preprocess_jobs_stdin(stdin=sys.stdin, stdout=sys.stdout) -> None:
    # load the json from stdin
    job_dicts = json.load(stdin)

    # parse the job dictionaries into hpc Job objects
    jobs = [Job.from_dict(n) for n in job_dicts]

    # run our preprocessing
    modified_jobs = preprocess_jobs(jobs)

    # finally dump the modified jobs out to stdout
    json.dump(modified_jobs, stdout, default=lambda x: x.to_dict())
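A minimal usage sketch for preprocess_jobs_stdin (not part of the original example; it assumes a preprocess_jobs implementation is in scope, since it is referenced above but not shown). Round-tripping a Job through its own to_dict() avoids guessing the exact JSON schema:

import io
import json

# hypothetical input: one queued job requesting 2 ncpus, serialized via to_dict()
payload = json.dumps([Job("1", {"ncpus": 2}).to_dict()])
out = io.StringIO()
preprocess_jobs_stdin(stdin=io.StringIO(payload), stdout=out)
print(out.getvalue())  # the (possibly modified) jobs as JSON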
Example #10
        def do_draw(self, data: Any) -> ht.VMSize:
            import hypothesis.internal.conjecture.utils as d

            idx = d.integer_range(data, 0, 1_000_000_000)
            r = random.Random(idx)

            def draw_value(rtype_draw: Optional[int] = None) -> Optional[Any]:
                if rtype_draw is None:
                    rtype_draw = r.randint(0, 4)

                if rtype_draw == 0:
                    return r.randint(0, 100)
                elif rtype_draw == 1:
                    return r.random() * 100
                elif rtype_draw == 2:

                    def draw_letter():
                        return r.choice(string.ascii_letters)

                    return "".join(
                        [draw_letter() for n in range(r.randint(0, 100))])
                elif rtype_draw == 3:
                    return r.random() < 0.5
                else:
                    list_length = r.randint(0, 10)
                    list_type = r.randint(0, 3)  # exclude lists
                    return [draw_value(list_type) for _ in range(list_length)]

            job_id = "j-o-b_-{}".format(r.randint(1, 1000000))
            constraints: Dict[str, Optional[Any]] = {}
            num_resources = r.randint(0, 10)
            for n in range(num_resources):
                cname = "cons-{}".format(n)
                constraints[cname] = draw_value()

            job = Job(
                job_id,
                constraints,
                iterations=r.randint(0, 100),
                node_count=r.randint(0, 100),
                colocated=r.random() < 0.5,
                packing_strategy=r.choice(["pack", "scatter", None]),
                executing_hostnames=None if r.random() < 0.5 else
                [draw_value(2) for _ in range(r.randint(0, 5))],
            )

            job.iterations_remaining -= r.randint(0, job.iterations)
            for n in range(r.randint(0, 5)):
                job.metadata["meta-{}".format(n)] = draw_value()

            return job
Example #11
    def scale_up() -> DemandCalculator:
        dcalc = new_demand_calculator(CONFIG)

        dcalc.add_job(
            Job("tc-100", {"node.nodearray": "htc", "ncpus": 1}, iterations=50)
        )

        demand_result = dcalc.finish()

        if not DRY_RUN:
            dcalc.bootup()

        print_demand(columns, demand_result)

        dcalc.node_history.conn.close()

        return dcalc
Example #12
    def scale_down(dcalc: typing.Optional[DemandCalculator]) -> None:
        dcalc = dcalc or new_demand_calculator(CONFIG)
        dcalc.add_job(
            Job("tc-50", {
                "node.nodearray": "htc",
                "ncpus": 1
            }, iterations=25))

        demand_result = dcalc.finish()

        if not DRY_RUN:
            dcalc.bootup()

        print_demand(columns, demand_result)

        print("The following nodes can be shutdown: {}".format(",".join(
            [n.name for n in demand_result.unmatched_nodes])))
Example #13
    def _pack_job(self, job: Job) -> Result:
        """
        1) will it ever fit? - check num nodes with any capacity
        2) does it have the proper resources? bucket.match(job.resources)
        3) order them
        4) tell the bucket to allocate X nodes - let the bucket figure out what is new and what is not.
        """
        # TODO break non-exclusive
        allocated_nodes: List[Node] = []
        slots_to_allocate = job.iterations_remaining
        assert job.iterations_remaining > 0

        available_buckets = self.node_mgr.get_buckets()
        # I don't want to fill up the log with rejecting placement groups
        # so just filter them here
        filter_by_colocated = [
            b for b in available_buckets
            if bool(b.placement_group) == job.colocated
        ]
        candidates_result = job.bucket_candidates(filter_by_colocated)

        if not candidates_result:
            # TODO log or something
            logging.warning("There are no resources to scale up for job %s",
                            job)
            logging.warning("See below:")
            for child_result in candidates_result.child_results or []:
                logging.warning("    %s", child_result.message)
            return candidates_result

        failure_reasons = self._handle_allocate(job,
                                                allocated_nodes,
                                                all_or_nothing=False)

        # we have allocated at least some tasks
        if allocated_nodes:
            assert allocated_nodes
            return AllocationResult("success",
                                    nodes=allocated_nodes,
                                    slots_allocated=slots_to_allocate)

        return AllocationResult("Failed", reasons=failure_reasons)
Example #14
def onprem_burst_demand() -> None:
    onprem001 = SchedulerNode("onprem001",
                              resources={
                                  "onprem": True,
                                  "nodetype": "A",
                                  "ncpus": 16
                              })
    onprem002 = SchedulerNode("onprem002",
                              resources={
                                  "onprem": True,
                                  "nodetype": "A",
                                  "ncpus": 32
                              })

    # onprem002 already has 10 cores occupied
    onprem002.available["ncpus"] -= 10

    dcalc = new_demand_calculator(CONFIG,
                                  existing_nodes=[onprem001, onprem002])
    dcalc.node_mgr.add_default_resource({"node.nodearray": ["htc", "htcspot"]},
                                        "nodetype", "A")
    assert [b for b in dcalc.node_mgr.get_buckets()
            if b.nodearray == "htc"][0].resources["nodetype"] == "A"
    dcalc.node_mgr.add_default_resource({}, "nodetype", "B")

    assert [b for b in dcalc.node_mgr.get_buckets()
            if b.nodearray == "htc"][0].resources["nodetype"] == "A"
    # we want 50 ncpus, but there are only 38 on-premises, so we need to burst
    # 12 more cores.
    dcalc.add_job(Job("tc-100", {"nodetype": "A", "ncpus": 1}, iterations=50))

    demand_result = dcalc.finish()

    if not DRY_RUN:
        dcalc.bootup()

    # also note we can add defaults to the column by adding a :, like
    # onprem:False, as this is only defined on the onprem nodes and not
    # on the Azure nodes.
    print_demand(["name", "job_ids", "nodetype", "onprem:False", "*ncpus"],
                 demand_result)
Example #15
    def _handle_allocate(
        self,
        job: Job,
        allocated_nodes_out: List[Node],
        all_or_nothing: bool,
    ) -> Optional[List[str]]:
        result = job.do_allocate(
            self.node_mgr,
            all_or_nothing=all_or_nothing,
            allow_existing=True,
        )

        if not result:
            return result.reasons

        for node in result.nodes:
            if not node.exists and node.metadata.get(
                    "__demand_allocated") is None:
                self.__scheduler_nodes_queue.push(node)
                node.metadata["__demand_allocated"] = True
        allocated_nodes_out.extend(result.nodes)

        return None
Example #16
def _htc_job(job_name="1", slot_count=1, resources=None):
    resources = resources or {"ncpus": 2}
    return Job(job_name, resources, iterations=slot_count)
Example #17
def jobs_list() -> List[Job]:
    jobs = []
    for n, job_i in enumerate(job_iters):
        jobs.append(Job(str(n), {"ncpus": ncpus_per_job[n]}, iterations=job_i))
    return jobs
Example #18
def _job(jobid, constraints=None, t=1):
    constraints = constraints or [{"slots": 1}]
    if not isinstance(constraints, list):
        constraints = [constraints]
    return Job(jobid, constraints=constraints, iterations=t)
Example #19
def parse_jobs(
    pbscmd: PBSCMD,
    resource_definitions: Dict[str, PBSProResourceDefinition],
    queues: Dict[str, PBSProQueue],
    resources_for_scheduling: Set[str],
) -> List[Job]:
    """
    Parses PBS qstat output and creates relevant hpc.autoscale.job.job.Job objects
    """
    parser = get_pbspro_parser()
    # alternate format triggered by
    # -a, -i, -G, -H, -M, -n, -r, -s, -T, or -u
    ret: List[Job] = []

    response: Dict = pbscmd.qstat_json("-f", "-t")

    for job_id, jdict in response.get("Jobs", {}).items():
        job_id = job_id.split(".")[0]

        job_state = jdict.get("job_state")
        if not job_state:
            logging.warning("No job_state defined for job %s. Skipping",
                            job_id)
            continue

        if job_state != PBSProJobStates.Queued:
            continue

        # ensure we don't autoscale jobs from disabled or non-started queues
        qname = jdict.get("queue")
        if not qname or qname not in queues:
            logging.warning("queue was not defined for job %s: ignoring",
                            job_id)
            continue

        queue: PBSProQueue = queues[qname]
        if not queue.enabled:
            logging.fine("Skipping job %s from disabled queue %s", job_id,
                         qname)
            continue

        if not queue.started:
            logging.fine("Skipping job %s from non-started queue %s", job_id,
                         qname)
            continue

        # handle array vs individual jobs
        if jdict.get("array"):
            iterations = parser.parse_range_size(
                jdict["array_indices_submitted"])
            remaining = parser.parse_range_size(
                jdict["array_indices_remaining"])
        elif "[" in job_id:
            continue
        else:
            iterations = 1
            remaining = 1

        res_list = jdict["Resource_List"]
        res_list["schedselect"] = jdict["schedselect"]
        rdict = parser.convert_resource_list(res_list)

        pack = (PackingStrategy.PACK if rdict["place"]["arrangement"]
                in ["free", "pack"] else PackingStrategy.SCATTER)

        # SMP style jobs
        is_smp = (rdict["place"].get("grouping") == "host"
                  or rdict["place"]["arrangement"] == "pack")

        # pack jobs do not need to define node_count

        node_count = int(rdict.get("nodect", "0"))

        smp_multiplier = 1

        if is_smp:
            smp_multiplier = max(1, iterations) * max(1, node_count)
            # for key, value in list(rdict.items()):
            #     if isinstance(value, (float, int)):
            #         value = value * smp_multiplier
            iterations = node_count = 1

        effective_node_count = max(node_count, 1)

        # htc jobs set ungrouped=true. see our default htcq
        colocated = (not is_smp and queue.uses_placement
                     and rdict.get("ungrouped", "false").lower() == "false")

        sharing = rdict["place"].get("sharing")

        for n, chunk_base in enumerate(rdict["schedselect"]):

            chunk: Dict[str, Any] = {}

            chunk.update(rdict)

            if "ncpus" not in chunk_base:
                chunk["ncpus"] = chunk["ncpus"] // effective_node_count

            if smp_multiplier > 1:
                for key, value in list(chunk_base.items()):
                    if isinstance(value, (int, float)):
                        chunk_base[key] = value * smp_multiplier
            # do this _after_ rdict, since the chunks
            # will override the top level resources
            # e.g. notice that ncpus=4. This will be the rdict value
            # but the chunks have ncpus=2
            # Resource_List.ncpus = 4
            # Resource_List.nodect = 2
            # Resource_List.select = 2:ncpus=2

            chunk.update(chunk_base)
            working_constraint: Dict[str, Any] = {}
            constraints = [working_constraint]

            if colocated:
                working_constraint["in-a-placement-group"] = True

            my_job_id = job_id
            if len(rdict["schedselect"]) > 1:
                if "." in job_id:
                    job_index, host = job_id.split(".", 1)
                    my_job_id = "{}+{}.{}".format(job_index, n, host)
                else:
                    my_job_id = "{}+{}".format(job_id, n)

            if sharing == "excl":
                working_constraint["exclusive-task"] = True
            elif sharing == "exclhost":
                working_constraint["exclusive"] = True

            job_resources = {}

            for rname, rvalue in chunk.items():
                if rname in ["select", "schedselect", "place", "nodect"]:
                    continue

                if rname not in resources_for_scheduling:
                    if rname == "skipcyclesubhook":
                        continue
                    logging.warning(
                        "Ignoring resource %s as it was not defined in sched_config",
                        rname,
                    )
                    continue

                # add all resource requests here. By that, I mean
                # non resource requests, like exclusive, should be ignored
                # required for get_non_host_constraints
                job_resources[rname] = rvalue

                resource_def = resource_definitions.get(rname)

                # constraints are for the node/host
                # queue/scheduler level ones will be added using
                # > queue.get_non_host_constraints(job_resource)
                if not resource_def or not resource_def.is_host:
                    continue

                if rname not in working_constraint:
                    working_constraint[rname] = rvalue
                else:
                    # hit a conflict, so start a new working cons
                    # so we maintain precedence
                    working_constraint = {rname: rvalue}
                    constraints.append(working_constraint)

            queue_constraints = queue.get_non_host_constraints(job_resources)
            constraints.extend(queue_constraints)

            job = Job(
                name=my_job_id,
                constraints=constraints,
                iterations=iterations,
                node_count=node_count,
                colocated=colocated,
                packing_strategy=pack,
            )
            job.iterations_remaining = remaining
            ret.append(job)

    return ret
Example #20
def job_buffer(n=1):
    jobs = []
    for i in range(n):
        jobs.append(Job(name="pad-%s" % i, constraints={"ncpus": 1}))
    return jobs
Example #21
def test_job_json(a: Job):
    b = Job.from_dict(a.to_dict())
    assert _cmp(a, b)
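Note: _cmp is referenced but not defined in this example. A plausible stand-in (an assumption, not the library's own helper) is a comparison of the serialized forms:

def _cmp(a: Job, b: Job) -> bool:
    # compare the JSON-serializable representations of the two jobs
    return a.to_dict() == b.to_dict()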