def parse_queue_complex_values(
    expr: str, complexes: Dict[str, Complex], qname: str
) -> Dict[str, Dict[str, Dict]]:
    """Parse a queue's ``complex_values`` expression into per-hostgroup values.

    ``expr`` is first split by ``parse_hostgroup_mapping`` into a mapping of
    hostgroup -> list of ``name=value`` sub-expressions. Each value is then
    parsed by the matching entry of ``complexes``.

    Sub-expressions without an ``=`` (including ``None`` entries, which are
    treated as the literal ``"NONE"``) are skipped. Complexes that are not
    present (or falsy) in ``complexes`` are skipped with a debug message
    that mentions ``qname``.

    Returns a dict keyed by hostgroup, whose values map complex name to the
    result of ``Complex.parse``.
    """
    parsed: Dict[str, Dict] = {}
    mapping: Dict[str, List[str]] = parse_hostgroup_mapping(expr)

    for hostgroup, entries in mapping.items():
        per_hostgroup = parsed.setdefault(hostgroup, {})

        for entry in entries:
            text = "NONE" if entry is None else entry
            if "=" not in text:
                continue

            # only the outer whitespace is removed; the name and value are
            # passed along exactly as they appear around the first "="
            complex_name, value_expr = text.strip().split("=", 1)

            definition = complexes.get(complex_name)
            if definition:
                per_hostgroup[complex_name] = definition.parse(value_expr)
            else:
                logging.debug(
                    "Could not find complex %s defined in queue %s", complex_name, qname
                )
    return parsed
Esempio n. 2
0
    def __init__(
        self,
        node_mgr: NodeManager,
        node_history: NodeHistory = NullNodeHistory(),
        node_queue: Optional[NodeQueue] = None,
        singleton_lock: Optional[SingletonLock] = None,
    ) -> None:
        """Set up the calculator around ``node_mgr``.

        Every non-failed node reported by ``node_mgr`` is pushed onto the
        scheduler node queue (a fresh ``NodeQueue`` unless one is supplied),
        the queued nodes are handed to ``node_history.decorate`` and a
        singleton lock is created via ``new_singleton_lock({})`` when none
        (or a falsy one) is given.

        NOTE(review): the ``NullNodeHistory()`` default is instantiated once,
        when the function is defined, and shared by every call that relies on
        the default.
        """
        assert isinstance(node_mgr, NodeManager)
        self.node_mgr = node_mgr
        self.node_history = node_history

        self.__scheduler_nodes_queue: NodeQueue = (
            NodeQueue() if node_queue is None else node_queue
        )
        for existing_node in self.node_mgr.get_non_failed_nodes():
            self.__scheduler_nodes_queue.push(existing_node)

        self.__set_buffer_delayed_invocations: List[Tuple[Any, ...]] = []

        # decorate a snapshot (list copy) of whatever is queued right now
        queued_snapshot = list(self.__scheduler_nodes_queue)
        self.node_history.decorate(queued_snapshot)

        self.__singleton_lock = (
            singleton_lock if singleton_lock else new_singleton_lock({})
        )

        preexisting_names = [n.name for n in self.__scheduler_nodes_queue]
        logging.debug(
            "Calculating demand using the following pre-existing nodes: %s",
            preexisting_names,
        )
Esempio n. 3
0
    def autoscale(
        self,
        config: Dict,
        output_columns: Optional[List[str]],
        output_format: OutputFormat,
        dry_run: bool = False,
        long: bool = False,
    ) -> None:
        """Run the end-to-end autoscale process (node creation, deletion and joining).

        Registers this CLI's context handler as a result handler, initializes
        the driver, lets the driver preprocess ``config`` and then delegates
        to ``autoscale_hpcpack``, whose result is returned as-is.

        ``output_format`` and ``long`` are accepted but not used here; the
        resolved ``output_columns`` are not passed on either.
        """
        if not output_columns:
            output_columns = self._get_default_output_columns(config)

        handler = self._ctx_handler(config)
        register_result_handler(handler)

        hpc_driver = self._driver(config)
        hpc_driver.initialize()

        processed_config = hpc_driver.preprocess_config(config)

        logging.debug("Driver = %s", hpc_driver)

        return autoscale_hpcpack(
            processed_config, ctx_handler=handler, dry_run=dry_run
        )
    def update_scheduler_nodes(self,
                               scheduler_nodes: List[SchedulerNode]) -> None:
        """Merge nodes reported by the scheduler into the known node queue.

        Queued nodes are indexed by ``hostname_or_uuid``. A scheduler node
        whose ``hostname`` is not in that index is pushed onto the queue and
        registered with the node manager as unmanaged; otherwise the already
        known node is updated in place from the scheduler node.
        """
        known: Dict[str, Node] = partition_single(
            self.__scheduler_nodes_queue,
            lambda n: n.hostname_or_uuid  # type: ignore
        )

        for snode in scheduler_nodes:
            if snode.hostname in known:
                existing = known[snode.hostname_or_uuid]
                logging.fine(
                    "Found existing CycleCloud node[hostname=%s]",
                    snode.hostname,
                )
                existing.update(snode)
                continue

            logging.debug(
                "Found new node[hostname=%s] that does not exist in CycleCloud",
                snode.hostname,
            )
            known[snode.hostname] = snode
            self.__scheduler_nodes_queue.push(snode)
            self.node_mgr.add_unmanaged_nodes([snode])

            # TODO inform bucket catalog?
Esempio n. 5
0
def read_queues(
    autoscale_config: Dict,
    scheduler: "GridEngineScheduler",
    pes: Dict[str, "ParallelEnvironment"],
    hostgroups: List[Hostgroup],
    complexes: Dict[str, "Complex"],
    qbin: QBin,
) -> Dict[str, GridEngineQueue]:
    """Build a ``GridEngineQueue`` for every queue listed by ``qconf -sql``.

    For each queue name the configuration is read with ``qconf -sq <name>``
    and parsed, its ``complex_values`` expression (default ``"NONE"``) is
    parsed per hostgroup, and ``autoscale_enabled`` is looked up under
    ``gridengine.queues.<qname>`` in ``autoscale_config`` (default ``True``).

    Returns a dict keyed by the queue name as listed by ``qconf -sql``.
    """
    qnames = qbin.qconf(["-sql"]).split()
    logging.debug("Found %d queues: %s", len(qnames), " ".join(qnames))

    per_queue_settings = autoscale_config.get("gridengine", {}).get("queues", {})
    unbound_hostgroups = partition_single(hostgroups, lambda h: h.name)

    queues = {}
    for qname in qnames:
        raw_lines = qbin.qconf(["-sq", qname]).splitlines()
        queue_config = parse_ge_config(raw_lines)

        # settings are keyed by the qname found inside the parsed config
        settings = per_queue_settings.get(queue_config["qname"], {})
        autoscale_enabled = settings.get("autoscale_enabled", True)

        complex_values = parse_queue_complex_values(
            queue_config.get("complex_values", "NONE"), complexes, qname
        )

        queues[qname] = GridEngineQueue(
            queue_config,
            scheduler,
            pes,
            unbound_hostgroups,
            complex_values,
            autoscale_enabled,
        )

    return queues
Esempio n. 6
0
    def bootup(self, nodes: Optional[List[Node]] = None) -> BootupResult:
        """Boot the given nodes, or the new nodes of the current demand.

        When ``nodes`` is ``None`` the list is taken from
        ``self.get_demand().new_nodes``. An empty list short-circuits with a
        "success" ``BootupResult`` and nothing is sent to the node manager.
        """
        if nodes is None:
            nodes = self.get_demand().new_nodes

        if nodes:
            logging.debug("booting up %s", [n.name for n in nodes])
            return self.node_mgr.bootup(nodes)

        logging.info("No nodes to bootup.")
        return BootupResult("success", OperationId(""), None)
Esempio n. 7
0
    def delete(self, nodes: Optional[List[Node]] = None) -> DeleteResult:
        """Delete the given nodes, or the unmatched nodes of the current demand.

        When ``nodes`` is ``None`` the list is taken from
        ``self.get_demand().unmatched_nodes``. An empty list short-circuits
        with a "success" ``DeleteResult`` and nothing is sent to the node
        manager.
        """
        if nodes is None:
            nodes = self.get_demand().unmatched_nodes

        if nodes:
            logging.debug("deleting %s", [n.name for n in nodes])
            return self.node_mgr.delete(nodes)

        logging.info("No nodes to delete.")
        return DeleteResult("success", OperationId(""), None)
 def add_node_to_node_group(self, group_name: str,
                            node_names: Iterable[str]) -> List[str]:
     """Add nodes to a node group via the REST API.

     ``node_names`` may be any iterable of names. Anything that is not
     already a list, tuple or str (e.g. a generator or a set) is copied into
     a list first, because both the emptiness check and ``json.dumps``
     require a sized, JSON-serializable sequence.

     Returns the JSON-decoded content of the response.

     Raises AssertionError when ``node_names`` is empty or ``group_name`` is
     falsy.
     """
     if not isinstance(node_names, (list, tuple, str)):
         # generators / sets have no len() or no JSON encoding
         node_names = list(node_names)

     assert len(node_names) > 0 and group_name
     logging.debug("Adding nodes %s to nodegroup %s", node_names, group_name)
     res = self._post(
         self.add_node_to_node_group.__name__,
         self.ADD_NODES_TO_NODE_GROUP_ROUTE.format(group_name=group_name),
         json.dumps(node_names))
     return json.loads(res.content)
Esempio n. 9
0
    def __call__(self, result: Result) -> None:
        """Record ``result`` under the current context.

        The result is appended to ``self.by_context[self.ctx]``. If the
        result exposes a non-empty ``nodes`` attribute, the current context is
        also added to each node's ``metadata["contexts"]`` set, creating the
        set when it is missing.
        """
        logging.debug("%s: %s", self.ctx, result)

        self.by_context[self.ctx].append(result)

        touched_nodes = getattr(result, "nodes", None)
        if not touched_nodes:
            return

        for touched in touched_nodes:
            touched.metadata.setdefault("contexts", set()).add(self.ctx)
Esempio n. 10
0
def autoscale(
    config: Dict,
    output_columns: Optional[List[str]] = None,
    output_format: Optional[str] = None,
) -> None:
    """Runs actual autoscale process"""
    logging.debug("Begin autoscale")
    ctx_handler = register_result_handler(DefaultContextHandler("[initialization]"))

    # command line overrides win over the config file, but only when given
    cli_overrides = {
        "output_columns": output_columns,
        "output_format": output_format,
    }
    for key, value in cli_overrides.items():
        if value:
            config[key] = value

    autoscaler.autoscale_grid_engine(config, ctx_handler=ctx_handler)
    logging.debug("End autoscale")
Esempio n. 11
0
    def handle_draining(self, nodes: List[Node]) -> List[Node]:
        """Put nodes into the PBS 'offline' state and report which are drained.

        Only nodes that were *already* marked offline (per
        ``metadata["pbs_state"]``) and have no assignments are returned.
        Nodes that are switched to offline by this call via ``pbsnodes -o``
        get ``metadata["pbs_state"] = "offline"`` but are not part of the
        return value.

        Nodes without a hostname, and nodes that are neither managed nor
        carry a ``ccnodeid`` resource, are skipped. A failing ``pbsnodes -o``
        is logged as an error only when the node has a private ip.
        """
        # TODO batch these up, but keep it underneath the
        # max arg limit
        drained: List[Node] = []

        for candidate in nodes:
            if not candidate.hostname:
                logging.info("Node %s has no hostname.", candidate)
                continue

            # TODO implement after we have resources added back in
            # what about deleting partially initialized nodes? I think we
            # just need to skip non-managed nodes
            is_unmanaged = not candidate.managed and not candidate.resources.get(
                "ccnodeid"
            )
            if is_unmanaged:
                logging.debug("Ignoring attempt to drain unmanaged %s", candidate)
                continue

            already_offline = "offline" in candidate.metadata.get("pbs_state", "")
            if already_offline:
                if candidate.assignments:
                    # node is already 'offline' i.e. draining, but a job is still running
                    logging.info("Node %s has jobs still running on it.", candidate)
                else:
                    # ok - it is offline _and_ no jobs are running on it.
                    drained.append(candidate)
                continue

            try:
                self.pbscmd.pbsnodes("-o", candidate.hostname)
            except CalledProcessError as err:
                if candidate.private_ip:
                    logging.error(
                        "'pbsnodes -o %s' failed and this node will not be scaled down: %s",
                        candidate.hostname,
                        err,
                    )
            else:
                # Due to a delay between 'pbsnodes -o' exiting and 'pbsnodes -a'
                # reporting the offline state, optimistically record it here
                # instead of querying the (possibly stale) state again.
                candidate.metadata["pbs_state"] = "offline"

        return drained
Esempio n. 12
0
    def _down_long_enough(self, now: datetime.datetime, node: Node) -> bool:
        """Return True when ``node`` has been in its current state longer than the down timeout.

        The time of the last state change is read from
        ``node.metadata["last_state_change_time"]`` and parsed with the format
        ``"%a %b %d %H:%M:%S %Y"``. Nodes without that metadata entry are
        never considered down long enough.

        When the timeout (``self.down_timeout_td``) has not elapsed yet, the
        number of whole seconds still remaining is logged at debug level.
        """
        last_state_change_time_str = node.metadata.get(
            "last_state_change_time")
        if not last_state_change_time_str:
            return False

        last_state_change_time = datetime.datetime.strptime(
            last_state_change_time_str, "%a %b %d %H:%M:%S %Y")
        delta = now - last_state_change_time
        if delta > self.down_timeout_td:
            return True

        # timeout - elapsed is non-negative here. The reverse subtraction
        # yields a negative timedelta whose normalized ``.seconds`` field is
        # a meaningless value close to 86400.
        seconds_remaining = int(
            (self.down_timeout_td - delta).total_seconds())
        logging.debug(
            "Down node %s still has %s seconds before setting to offline",
            node,
            seconds_remaining,
        )
        return False
Esempio n. 13
0
def demand(
    config: Dict,
    jobs: Optional[str] = None,
    scheduler_nodes: Optional[str] = None,
    output_columns: Optional[List[str]] = None,
    output_format: Optional[str] = None,
) -> None:
    """Runs autoscale in dry run mode to see the demand for new nodes"""
    logging.debug("Begin demand")

    handler = DefaultContextHandler("[demand-cli]")
    register_result_handler(handler)

    # NOTE: ``jobs`` and ``scheduler_nodes`` are accepted but not consulted;
    # the environment always comes from qconf.
    environment_from_qconf = environment.from_qconf(config)
    driver = autoscaler.new_driver(config, environment_from_qconf)
    processed_config = driver.preprocess_config(config)

    calculator = autoscaler.calculate_demand(
        processed_config, environment_from_qconf, driver, handler
    )
    result = calculator.finish()

    autoscaler.print_demand(processed_config, result, output_columns, output_format)
    logging.debug("End demand")
Esempio n. 14
0
    def update_scheduler_nodes(self,
                               scheduler_nodes: List[SchedulerNode]) -> None:
        """Merge nodes reported by the scheduler into the known node queue.

        Queued nodes are indexed by ``hostname_or_uuid``. For each scheduler
        node:

        * hostname not in the index: the node is queued and registered with
          the node manager as unmanaged. A warning is logged when it carries
          a ``ccnodeid`` resource, a debug message otherwise.
        * hostname known and ``metadata["override_resources"]`` truthy
          (default ``True``): the known node is fully updated from it.
        * hostname known and override disabled: only the metadata is copied.
        """
        known: Dict[str, Node] = partition_single(
            self.__scheduler_nodes_queue,
            lambda n: n.hostname_or_uuid  # type: ignore
        )

        for snode in scheduler_nodes:
            if snode.hostname not in known:
                known[snode.hostname] = snode
                self.__scheduler_nodes_queue.push(snode)
                self.node_mgr.add_unmanaged_nodes([snode])

                if not snode.resources.get("ccnodeid"):
                    logging.debug(
                        "Found new node[hostname=%s] that does not exist in CycleCloud",
                        snode.hostname,
                    )
                else:
                    logging.warning(
                        "%s has ccnodeid defined, but no longer exists in CycleCloud",
                        snode,
                    )

                # TODO inform bucket catalog?
                continue

            if snode.metadata.get("override_resources", True):
                existing = known[snode.hostname_or_uuid]
                logging.fine(
                    "Found existing CycleCloud node[hostname=%s]",
                    snode.hostname,
                )
                existing.update(snode)
                continue

            logging.fine(
                "Found existing CycleCloud node[hostname=%s], but node.metadata.override_resources=false"
                " so ignoring the reported resources and only copying metadata",
                snode.hostname,
            )
            existing = known[snode.hostname_or_uuid]
            existing.metadata.update(snode.metadata)
Esempio n. 15
0
    def _log_response(self, s: ResponseStatus, r: Any) -> None:
        """Log a response at debug level, and in full at the finer level.

        Does nothing unless the root logger's effective level is DEBUG or
        lower. The debug message carries the calling function's name, the
        status code and the first 100 characters of the JSON-encoded
        ``r.to_dict()``; the complete JSON is logged via ``logging.fine``
        when the effective level is ``logging.FINE`` or lower.
        """
        if logging.getLogger().getEffectiveLevel() > logging.DEBUG:
            return

        import inspect

        # index 0 is this method's own frame, index 1 is whoever called it
        stack = inspect.getouterframes(inspect.currentframe(), 2)
        caller = "[{}]".format(stack[1].function)

        payload = json.dumps(r.to_dict())

        logging.debug(
            "[%s] Response: Status=%s -> %s", caller, s.status_code, payload[:100],
        )

        if logging.getLogger().getEffectiveLevel() <= logging.FINE:
            logging.fine(
                "[%s] Full response: Status=%s -> %s", caller, s.status_code, payload,
            )
Esempio n. 16
0
def add_default_placement_groups(config: Dict, node_mgr: NodeManager) -> None:
    """Pre-create a buffer of generated placement groups per nodearray / vm size.

    Nothing is done when any nodearray in ``config["nodearrays"]`` defines
    ``placement_groups``. Otherwise, for every (nodearray, vm_size) pair that
    has a bucket without a placement group and that supports colocation,
    placement groups named ``<vm_size>_pg<N>`` are added until
    ``generated_placement_group_buffer`` (default 2) new ones have been
    created, skipping names that already exist for the nodearray.
    """
    nodearrays = config.get("nodearrays", {})
    if any(na.get("placement_groups") for na in nodearrays.values()):
        return

    by_pg = partition(node_mgr.get_buckets(), lambda b:
                      (b.nodearray, b.placement_group))
    by_na_vm = partition(node_mgr.get_buckets(), lambda b:
                         (b.nodearray, b.vm_size))

    for (nodearray, vm_size), buckets in by_na_vm.items():
        ungrouped = [b for b in buckets if not b.placement_group]
        if not ungrouped:
            # hardcoded PlacementGroupId
            logging.debug(
                "Nodearray %s defines PlacementGroupId, so no additional " +
                "placement groups will be created automatically.",
                nodearray,
            )
            continue

        template_bucket = ungrouped[0]
        if not template_bucket.supports_colocation:
            continue

        still_to_create = int(
            nodearrays.get(nodearray, {}).get(
                "generated_placement_group_buffer", 2))

        index = 0
        while still_to_create > 0:
            pg_name = ht.PlacementGroup("{}_pg{}".format(vm_size, index))
            index += 1
            if (nodearray, pg_name) in by_pg:
                # the name is taken; try the next index without using up the buffer
                continue
            logging.fine("Adding placement group %s", pg_name)
            node_mgr.add_placement_group(pg_name, template_bucket)
            still_to_create -= 1
def autoscale_grid_engine(
    config: Dict[str, Any],
    ge_env: Optional[GridEngineEnvironment] = None,
    ge_driver: Optional["GridEngineDriver"] = None,
    ctx_handler: Optional[DefaultContextHandler] = None,
    node_history: Optional[NodeHistory] = None,
    dry_run: bool = False,
) -> DemandResult:
    """Run one autoscale iteration for Grid Engine and return the demand result.

    Steps, in order: clean up hosts whose state contains both "a" and "u",
    calculate demand, let the driver handle failed nodes, join existing
    compute nodes to the cluster, boot new nodes, then drain and delete
    nodes that were unmatched for ``idle_timeout`` seconds or that exceeded
    ``boot_timeout`` seconds while booting.

    Args:
        config: autoscale configuration; must not already be read-only.
            In dry run mode ``lock_file`` and ``read_only`` are overwritten
            on this dict.
        ge_env: environment to use; built with ``envlib.from_qconf`` if None.
        ge_driver: driver to use; built with ``new_driver`` if None.
        ctx_handler: optional context handler, switched to "[joining]" and
            "[scaling]" as the run progresses.
        node_history: passed through to ``calculate_demand``.
        dry_run: when True, bootup and history updates are skipped.

    Returns:
        The result of ``demand_calculator.finish()``.

    Side effects:
        Sets the module-level ``_exit_code`` to 1 when deleting nodes raises;
        the exception itself is logged and not propagated.
    """
    global _exit_code

    assert not config.get("read_only", False)
    if dry_run:
        logging.warning("Running gridengine autoscaler in dry run mode")
        # allow multiple instances
        config["lock_file"] = None
        # put in read only mode
        config["read_only"] = True

    if ge_env is None:
        ge_env = envlib.from_qconf(config)

    # interface to GE, generally by cli
    if ge_driver is None:
        # allow tests to pass in a mock
        ge_driver = new_driver(config, ge_env)

    ge_driver.initialize_environment()

    config = ge_driver.preprocess_config(config)

    logging.debug("Driver = %s", ge_driver)

    invalid_nodes = []

    for node in ge_env.nodes:
        # many combinations of a u and other states. However,
        # as long as a and u are in there it is down
        state = node.metadata.get("state", "")
        if "a" in state and "u" in state:
            invalid_nodes.append(node)

    # nodes in error state must also be deleted
    nodes_to_delete = ge_driver.clean_hosts(invalid_nodes)
    for node in nodes_to_delete:
        ge_env.delete_node(node)

    demand_calculator = calculate_demand(config, ge_env, ge_driver,
                                         ctx_handler, node_history)

    ge_driver.handle_failed_nodes(
        demand_calculator.node_mgr.get_failed_nodes())

    demand_result = demand_calculator.finish()

    if ctx_handler:
        ctx_handler.set_context("[joining]")

    # details here are that we pass in nodes that matter (matched) and the driver figures out
    # which ones are new and need to be added via qconf
    joined = ge_driver.handle_join_cluster(
        [x for x in demand_result.compute_nodes if x.exists])

    ge_driver.handle_post_join_cluster(joined)

    if ctx_handler:
        ctx_handler.set_context("[scaling]")

    # bootup all nodes. Optionally pass in a filtered list
    if demand_result.new_nodes:
        if not dry_run:
            demand_calculator.bootup()

    if not dry_run:
        demand_calculator.update_history()

    # we also tell the driver about nodes that are unmatched. It filters them out
    # and returns a list of ones we can delete.
    idle_timeout = int(config.get("idle_timeout", 300))
    boot_timeout = int(config.get("boot_timeout", 3600))
    logging.fine("Idle timeout is %s", idle_timeout)

    # despite the name, the threshold is idle_timeout seconds (300 by default)
    unmatched_for_5_mins = demand_calculator.find_unmatched_for(
        at_least=idle_timeout)
    timed_out_booting = demand_calculator.find_booting(at_least=boot_timeout)

    # I don't care about nodes that have keep_alive=true
    timed_out_booting = [n for n in timed_out_booting if not n.keep_alive]

    timed_out_to_deleted = []
    unmatched_nodes_to_delete = []

    if timed_out_booting:
        logging.info("The following nodes have timed out while booting: %s",
                     timed_out_booting)
        # "or []" guards against a driver hook that returns None
        timed_out_to_deleted = ge_driver.handle_boot_timeout(
            timed_out_booting) or []

    if unmatched_for_5_mins:
        node_expr = ", ".join([str(x) for x in unmatched_for_5_mins])
        logging.info("Unmatched for at least %s seconds: %s", idle_timeout,
                     node_expr)
        unmatched_nodes_to_delete = (
            ge_driver.handle_draining(unmatched_for_5_mins) or [])

    # final safety net: never delete a node that still has jobs assigned
    nodes_to_delete = []
    for node in timed_out_to_deleted + unmatched_nodes_to_delete:
        if node.assignments:
            logging.warning(
                "%s has jobs assigned to it so we will take no action.", node)
            continue
        nodes_to_delete.append(node)

    # NOTE(review): this deletion path is not guarded by dry_run here;
    # presumably the read_only config set above prevents real deletions - confirm.
    if nodes_to_delete:
        try:
            logging.info("Deleting %s", [str(n) for n in nodes_to_delete])
            delete_result = demand_calculator.delete(nodes_to_delete)

            if delete_result:
                # in case it has anything to do after a node is deleted (usually just remove it from the cluster)
                ge_driver.handle_post_delete(delete_result.nodes)
        except Exception as e:
            # record the failure for the process exit code, but keep going so
            # the demand is still printed and returned
            _exit_code = 1
            logging.warning(
                "Deletion failed, will retry on next iteration: %s", e)
            logging.exception(str(e))

    print_demand(config, demand_result, log=not dry_run)

    return demand_result
Esempio n. 18
0
def main(argv: Iterable[str] = None) -> None:
    """Command line entry point: build the sub-command parsers and dispatch.

    Args:
        argv: command line arguments *without* the program name. When None,
            argparse falls back to ``sys.argv[1:]``.

    Every sub-command except ``initconfig`` accepts ``--config/-c``, which
    may be given more than once; all given files are passed to
    ``load_config``. When ``<install dir>/autoscale.json`` exists it is used
    if no ``--config`` is given, otherwise ``--config`` is required.

    Sub-commands registered as read-only get ``read_only=True`` and
    ``lock_file=None`` forced into their config. The remaining parsed
    arguments are passed to the sub-command function as keyword arguments.

    Exits with status 1 when no sub-command is given or when the sub-command
    raises.
    """
    default_install_dir = os.path.join("/", "opt", "cycle", "gridengine")

    # Resolved once and shared by every sub-parser.
    default_config: Optional[str]
    default_config = os.path.join(default_install_dir, "autoscale.json")
    if not os.path.exists(default_config):
        default_config = None

    parser = ArgumentParser()
    sub_parsers = parser.add_subparsers()

    def csv_list(x: str) -> List[str]:
        return [x.strip() for x in x.split(",")]

    help_msg = io.StringIO()

    def add_parser(
        name: str, func: Callable, read_only: bool = True, skip_config: bool = False
    ) -> ArgumentParser:
        # the usage text is assembled from each sub-command's docstring
        doc_str = (func.__doc__ or "").strip()
        doc_str = " ".join([x.strip() for x in doc_str.splitlines()])
        help_msg.write("\n    {:20} - {}".format(name, doc_str))

        new_parser = sub_parsers.add_parser(name)
        new_parser.set_defaults(func=func, read_only=read_only)

        if skip_config:
            return new_parser

        # The default must stay None: with action="append", argparse appends
        # to whatever the default is, so a plain string default raises
        # AttributeError as soon as -c is passed, and would be unpacked
        # character by character when it is not. The fallback to
        # default_config is applied after parsing instead.
        new_parser.add_argument(
            "--config",
            "-c",
            default=None,
            required=not bool(default_config),
            action="append",
        )
        return new_parser

    def str_list(c: str) -> List[str]:
        return c.split(",")

    def add_parser_with_columns(
        name: str, func: Callable, read_only: bool = True
    ) -> ArgumentParser:
        columns_parser = add_parser(name, func, read_only)

        def parse_format(c: str) -> str:
            c = c.lower()
            if c in ["json", "table", "table_headerless"]:
                return c
            print("Expected json, table or table_headerless - got", c, file=sys.stderr)
            sys.exit(1)

        columns_parser.add_argument("--output-columns", "-o", type=str_list)
        columns_parser.add_argument("--output-format", "-F", type=parse_format)
        return columns_parser

    add_parser_with_columns("autoscale", autoscale, read_only=False)

    add_parser_with_columns("buckets", buckets).add_argument(
        "--constraint-expr", "-C", default="[]"
    )

    add_parser("complexes", complexes).add_argument(
        "-a", "--include-irrelevant", action="store_true", default=False
    )

    delete_parser = add_parser("delete_nodes", delete_nodes, read_only=False)
    delete_parser.add_argument("-H", "--hostnames", type=str_list, default=[])
    delete_parser.add_argument("-N", "--node-names", type=str_list, default=[])
    delete_parser.add_argument("--force", action="store_true", default=False)

    remove_parser = add_parser("remove_nodes", remove_nodes, read_only=False)
    remove_parser.add_argument("-H", "--hostnames", type=str_list, default=[])
    remove_parser.add_argument("-N", "--node-names", type=str_list, default=[])
    remove_parser.add_argument("--force", action="store_true", default=False)

    add_parser_with_columns("demand", demand).add_argument(
        "--jobs", "-j", default=None, required=False
    )

    add_parser("drain_node", drain_node, read_only=False).add_argument(
        "-H", "--hostname", required=True
    )

    initconfig_parser = add_parser(
        "initconfig", initconfig, read_only=False, skip_config=True
    )

    initconfig_parser.add_argument("--cluster-name", required=True)
    initconfig_parser.add_argument("--username", required=True)
    initconfig_parser.add_argument("--password")
    initconfig_parser.add_argument("--url", required=True)
    initconfig_parser.add_argument(
        "--log-config",
        default=os.path.join(default_install_dir, "logging.conf"),
        dest="logging__config_file",
    )
    initconfig_parser.add_argument(
        "--lock-file", default=os.path.join(default_install_dir, "scalelib.lock")
    )
    initconfig_parser.add_argument(
        "--default-resource",
        type=json.loads,
        action="append",
        default=[],
        dest="default_resources",
    )
    initconfig_parser.add_argument(
        "--relevant-complexes",
        default=["slots", "slot_type", "exclusive"],
        type=csv_list,
        dest="gridengine__relevant_complexes",
    )

    initconfig_parser.add_argument(
        "--idle-timeout", default=300, type=int, dest="idle_timeout"
    )
    initconfig_parser.add_argument(
        "--boot-timeout", default=1800, type=int, dest="boot_timeout"
    )
    initconfig_parser.add_argument(
        "--disable-pgs-for-pe",
        default=[],
        type=str,
        action="append",
        help="Disable creation of placement groups for a parallel environment. "
        + "This can be invoked more than once.",
        dest="disable_pgs_for_pe",
    )
    initconfig_parser.add_argument(
        "--hostgroup-constraint",
        default=[],
        action="append",
        dest="hostgroup_constraints",
    )

    add_parser("jobs", jobs)
    add_parser("jobs_and_nodes", jobs_and_nodes)

    add_parser("join_cluster", join_cluster).add_argument(
        "-H", "--hostname", type=str_list, required=True
    )

    add_parser_with_columns("nodes", nodes).add_argument(
        "--constraint-expr", "-C", default="[]"
    )

    add_parser("scheduler_nodes", scheduler_nodes)

    help_msg.write("\nadvanced usage:")
    add_parser("validate", validate_func, read_only=True)
    add_parser("queues", queues, read_only=True)
    add_parser("shell", shell)
    analyze_parser = add_parser("analyze", analyze)
    analyze_parser.add_argument("--job-id", "-j", required=True)
    analyze_parser.add_argument("--wide", "-w", action="store_true", default=False)

    parser.usage = help_msg.getvalue()
    # honor an explicitly passed argv; None keeps the sys.argv[1:] behavior
    args = parser.parse_args(list(argv) if argv is not None else None)
    if not hasattr(args, "func"):
        parser.print_help()
        sys.exit(1)

    # parse list of config paths to a single config
    if hasattr(args, "config"):
        # no -c given is only possible when the default config file exists
        config_paths = args.config or [default_config]
        args.config = load_config(*config_paths)
        logging.initialize_logging(args.config)

    if args.read_only:
        args.config["read_only"] = True
        args.config["lock_file"] = None

    kwargs = {}
    for k in dir(args):
        if k[0].islower() and k not in ["read_only", "func"]:
            kwargs[k] = getattr(args, k)

    try:
        args.func(**kwargs)
    except Exception as e:
        print(str(e), file=sys.stderr)
        if hasattr(e, "message"):
            print(getattr(e, "message"), file=sys.stderr)
        logging.debug("Full stacktrace", exc_info=sys.exc_info())
        sys.exit(1)
Esempio n. 19
0
def autoscale_pbspro(
    config: Dict[str, Any],
    pbs_env: Optional[PBSProEnvironment] = None,
    pbs_driver: Optional[PBSProDriver] = None,
    ctx_handler: Optional[DefaultContextHandler] = None,
    node_history: Optional[NodeHistory] = None,
    dry_run: bool = False,
) -> DemandResult:
    """Run one autoscale iteration for PBSPro and return the demand result.

    Steps, in order: calculate demand, let the driver handle failed nodes
    (including scheduler nodes whose ``pbs_state`` contains "down"), add
    existing compute nodes to the cluster, boot new nodes, then drain and
    delete nodes that were unmatched for ``idle_timeout`` seconds or that
    exceeded ``boot_timeout`` seconds while booting.

    Args:
        config: autoscale configuration; must not already be read-only.
            In dry run mode ``lock_file`` and ``read_only`` are overwritten
            on this dict.
        pbs_env: environment to use; built with ``envlib.from_driver`` if None.
        pbs_driver: driver to use; a ``PBSProDriver(config)`` is created if None.
        ctx_handler: optional context handler, switched to "[joining]" and
            "[scaling]" as the run progresses.
        node_history: passed through to ``calculate_demand``.
        dry_run: when True, bootup and history updates are skipped.

    Returns:
        The result of ``demand_calculator.finish()``.

    Side effects:
        Sets the module-level ``_exit_code`` to 1 when deleting nodes raises;
        the exception itself is logged and not propagated.
    """
    global _exit_code

    assert not config.get("read_only", False)
    if dry_run:
        logging.warning("Running pbs autoscaler in dry run mode")
        # allow multiple instances
        config["lock_file"] = None
        # put in read only mode
        config["read_only"] = True

    # interface to PBSPro, generally by cli
    if pbs_driver is None:
        # allow tests to pass in a mock
        pbs_driver = PBSProDriver(config)

    if pbs_env is None:
        pbs_env = envlib.from_driver(config, pbs_driver)

    pbs_driver.initialize()

    config = pbs_driver.preprocess_config(config)

    logging.debug("Driver = %s", pbs_driver)

    demand_calculator = calculate_demand(config, pbs_env, ctx_handler,
                                         node_history)

    # nodes the scheduler reports as "down" are handled like failed nodes
    failed_nodes = demand_calculator.node_mgr.get_failed_nodes()
    for node in pbs_env.scheduler_nodes:
        if "down" in node.metadata.get("pbs_state", ""):
            failed_nodes.append(node)
    pbs_driver.handle_failed_nodes(failed_nodes)

    demand_result = demand_calculator.finish()

    if ctx_handler:
        ctx_handler.set_context("[joining]")

    # details here are that we pass in nodes that matter (matched) and the driver figures out
    # which ones are new and need to be added
    joined = pbs_driver.add_nodes_to_cluster(
        [x for x in demand_result.compute_nodes if x.exists])

    pbs_driver.handle_post_join_cluster(joined)

    if ctx_handler:
        ctx_handler.set_context("[scaling]")

    # bootup all nodes. Optionally pass in a filtered list
    if demand_result.new_nodes:
        if not dry_run:
            demand_calculator.bootup()

    if not dry_run:
        demand_calculator.update_history()

    # we also tell the driver about nodes that are unmatched. It filters them out
    # and returns a list of ones we can delete.
    idle_timeout = int(config.get("idle_timeout", 300))
    boot_timeout = int(config.get("boot_timeout", 3600))
    logging.fine("Idle timeout is %s", idle_timeout)

    # despite the name, the threshold is idle_timeout seconds (300 by default)
    unmatched_for_5_mins = demand_calculator.find_unmatched_for(
        at_least=idle_timeout)
    timed_out_booting = demand_calculator.find_booting(at_least=boot_timeout)

    # I don't care about nodes that have keep_alive=true
    timed_out_booting = [n for n in timed_out_booting if not n.keep_alive]

    timed_out_to_deleted = []
    unmatched_nodes_to_delete = []

    if timed_out_booting:
        logging.info("The following nodes have timed out while booting: %s",
                     timed_out_booting)
        # "or []" guards against a driver hook that returns None
        timed_out_to_deleted = pbs_driver.handle_boot_timeout(
            timed_out_booting) or []

    if unmatched_for_5_mins:
        logging.info("unmatched_for_5_mins %s", unmatched_for_5_mins)
        unmatched_nodes_to_delete = (
            pbs_driver.handle_draining(unmatched_for_5_mins) or [])

    # final safety net: never delete a node that still has jobs assigned
    nodes_to_delete = []
    for node in timed_out_to_deleted + unmatched_nodes_to_delete:
        if node.assignments:
            logging.warning(
                "%s has jobs assigned to it so we will take no action.", node)
            continue
        nodes_to_delete.append(node)

    # NOTE(review): this deletion path is not guarded by dry_run here;
    # presumably the read_only config set above prevents real deletions - confirm.
    if nodes_to_delete:
        try:
            logging.info("Deleting %s", [str(n) for n in nodes_to_delete])
            delete_result = demand_calculator.delete(nodes_to_delete)

            if delete_result:
                # in case it has anything to do after a node is deleted (usually just remove it from the cluster)
                pbs_driver.handle_post_delete(delete_result.nodes)
        except Exception as e:
            # record the failure for the process exit code, but keep going so
            # the demand is still printed and returned
            _exit_code = 1
            logging.warning(
                "Deletion failed, will retry on next iteration: %s", e)
            logging.exception(str(e))

    print_demand(config, demand_result, log=not dry_run)

    return demand_result
Esempio n. 20
0
 def _execute(self, stmt: str) -> sqlite3.Cursor:
     """Log ``stmt`` at debug level, run it on ``self.conn`` and return the cursor."""
     logging.debug(stmt)
     cursor = self.conn.execute(stmt)
     return cursor