Example 1
    def handle_post_delete(self, nodes: List[Node]) -> List[Node]:
        ret = []
        for node in nodes:
            if not node.hostname:
                continue
            try:
                self.pbscmd.qmgr("list", "node", node.hostname)
            except CalledProcessError as e:
                if "Server has no node list" in str(e):
                    ret.append(node)
                    continue
                logging.error("Could not list node with hostname %s - %s",
                              node.hostname, e)
                continue

            try:
                self.pbscmd.qmgr("delete", "node", node.hostname)
                node.metadata["pbs_state"] = "deleted"
                ret.append(node)
            except CalledProcessError as e:
                logging.error(
                    "Could not remove %s from cluster: %s. Will retry next cycle.",
                    node,
                    e,
                )
        return ret
Example 2
def call(cmd: List[str]) -> None:
    shlexed = " ".join([shlex.quote(x) for x in cmd])
    logging.trace("Running '%s'", shlexed)
    _QCMD_LOGGER.info(shlexed)
    stderr = ""
    completed_process = None
    try:
        # capture_output was added in 3.7 and we support as far back as 3.6
        if sys.version_info < (3, 7):
            completed_process = subprocess.run(cmd, stderr=subprocess.PIPE)
        else:
            completed_process = subprocess.run(cmd, capture_output=True)

        if completed_process.returncode != 0:
            if completed_process.stderr:
                stderr = completed_process.stderr.decode()
            logging.warning(
                "'%s' failed with exit %d: Stderr '%s'",
                shlexed,
                completed_process.returncode,
                stderr,
            )
    except Exception as e:
        logging.error("'%s' failed: %s.", shlexed, str(e))
        _QCMD_LOGGER.error(">> %s", str(e))
        raise
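
A minimal usage sketch (the command and its arguments are illustrative, not from the original source):

# Unlike check_call, 'call' only warns on a non-zero exit instead of raising.
call(["pbsnodes", "-a"])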
Example 3
    def __init__(
        self,
        sched_dict: Dict[str, str],
        resource_state: ResourceState,
    ) -> None:
        btype = BooleanType()
        self.do_not_span_psets = btype.parse(
            sched_dict.get("do_not_span_psets", "false"))
        self.scheduling = btype.parse(sched_dict["scheduling"])
        self.only_explicit_psets = btype.parse(
            sched_dict.get("only_explicit_psets", "false"))
        self.node_group_enable = btype.parse(
            sched_dict.get("node_group_enable", "false"))
        self.node_group_key = sched_dict.get("node_group_key")

        self.sched_log = sched_dict["sched_log"]
        self.sched_priv = sched_dict["sched_priv"]
        priv_config_path = os.path.join(self.sched_priv, "sched_config")
        pbspro_parser = get_pbspro_parser()
        self.resources_for_scheduling = pbspro_parser.parse_resources_from_sched_priv(
            priv_config_path)
        self.state = sched_dict["state"]
        self.hostname = sched_dict["sched_host"].split(".")[0]
        self.resource_state = resource_state

        try:
            self.pbs_version: Tuple = tuple(
                [int(x) for x in sched_dict["pbs_version"].split(".")])
        except ValueError:
            self.pbs_version = tuple(sched_dict["pbs_version"].split("."))
        self.sched_dict = sched_dict

        if not self.only_explicit_psets:
            logging.error(
                "only_explicit_psets must be set to true. You can change this by running:"
                + ' qmgr -c "set sched default only_explicit_psets = true"')
Example 4
def try_parse(k: str, default: float) -> float:
    try:
        return float(config.get(k, default))
    except ValueError:
        logging.error(
            "Could not parse %s as a float", config.get(k),
        )
        return default
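
A hypothetical usage sketch, assuming "config" is a plain dict visible from the enclosing scope (the keys and values here are invented):

config = {"boot_timeout": "1800", "idle_timeout": "not-a-number"}
try_parse("boot_timeout", 3600.0)   # returns 1800.0
try_parse("idle_timeout", 300.0)    # logs an error and returns 300.0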
Example 5
def check_call(cmd: List[str], *args: Any, **kwargs: Any) -> None:
    shlexed = " ".join([shlex.quote(x) for x in cmd])
    logging.trace("Running '%s'", shlexed)
    _QCMD_LOGGER.info(shlexed)
    try:
        _check_call(cmd, *args, **kwargs)
    except Exception as e:
        logging.error("'%s' failed: %s", shlexed, str(e))
        _QCMD_LOGGER.error(">> %s", str(e))
        raise
Example 6
    def __init__(self, attr: str,
                 *values: typing.Union[None, ht.ResourceTypeAtom]) -> None:
        self.attr = attr
        from hpc.autoscale.node.node import QUERYABLE_PROPERTIES

        if attr not in QUERYABLE_PROPERTIES:
            msg = "Property[name={}] not defined for Node".format(self.attr)
            logging.error(msg)
            raise ValueError("UndefinedNodeProperty: " + msg)

        if len(values) == 1 and isinstance(values[0], list):
            self.values: List[Optional[ht.ResourceTypeAtom]] = values[0]
        else:
            self.values = list(values)
Example 7
def check_output(cmd: List[str], *args: Any, **kwargs: Any) -> Any:
    if not cmd or not cmd[0]:
        raise RuntimeError(
            "Could not run the following command {}. Please check your PATH".
            format(cmd))
    kwargs["stderr"] = kwargs.pop("stderr", STDOUT)
    shlexed = " ".join([shlex.quote(x) for x in cmd])
    logging.trace("Running '%s'", shlexed)
    _QCMD_LOGGER.info(shlexed)
    try:
        return _check_output(cmd, *args, **kwargs).decode()
    except Exception as e:
        logging.error("'%s' failed: %s", shlexed, str(e))
        _QCMD_LOGGER.error(">> %s", str(e))
        raise
Example 8
    def _post(self, function_name: str, function_route: str, data) -> Response:
        headers = {"Content-Type": "application/json"}
        url = function_route.format(self.hostname)
        res = requests.post(url,
                            data=data,
                            headers=headers,
                            verify=False,
                            cert=self._pem)
        try:
            res.raise_for_status()
            logging.info("{} resp: {}".format(function_name, str(res.content)))
            return res
        except HTTPError:
            logging.error("{}: status_code:{} content:{}".format(
                function_name, res.status_code, res.content))
            raise
Example 9
    def handle_draining(self, nodes: List[Node]) -> List[Node]:
        # TODO batch these up, but keep it underneath the
        # max arg limit
        ret = []
        for node in nodes:
            if not node.hostname:
                logging.info("Node %s has no hostname.", node)
                continue

            # TODO implement after we have resources added back in
            # what about deleting partially initialized nodes? I think we
            # just need to skip non-managed nodes
            # if not node.resources.get("ccnodeid"):
            #     continue

            if not node.managed and not node.resources.get("ccnodeid"):
                logging.debug("Ignoring attempt to drain unmanaged %s", node)
                continue

            if "offline" in node.metadata.get("pbs_state", ""):
                if node.assignments:
                    logging.info("Node %s has jobs still running on it.", node)
                    # node is already 'offline' i.e. draining, but a job is still running
                    continue
                else:
                    # ok - it is offline _and_ no jobs are running on it.
                    ret.append(node)
            else:
                try:
                    self.pbscmd.pbsnodes("-o", node.hostname)

                    # Due to a delay between when 'pbsnodes -o' exits and when
                    # 'pbsnodes -a' actually reports an offline state, we will
                    # just optimistically set it to offline - otherwise ~50% of
                    # the time you get the old state (free).
                    # response = self.pbscmd.pbsnodes_parsed("-a", node.hostname)
                    # if response:
                    #     node.metadata["pbs_state"] = response[0]["state"]
                    node.metadata["pbs_state"] = "offline"

                except CalledProcessError as e:
                    if node.private_ip:
                        logging.error(
                            "'pbsnodes -o %s' failed and this node will not be scaled down: %s",
                            node.hostname,
                            e,
                        )
        return ret
Example 10
    def qstat_json(self, *args: str) -> Dict:
        if "-F" not in args:
            args = ("-F", "json") + args

        response = self.qstat(*args)
        # For some reason both json and regular format are printed...
        expr = response
        # fix invalid json output like the following
        # "pset":"group_id=""",
        expr = expr.replace('"""', '"')
        attempts = 1000
        while "{" in expr and attempts > 0:
            attempts -= 1
            expr = expr[expr.index("{"):]
            try:
                return json.loads(expr)
            except JSONDecodeError as e:
                logging.error(e)
                # advance past this brace; otherwise index("{") finds the
                # same one again and we retry an identical prefix
                expr = expr[1:]
        raise RuntimeError(
            "Could not parse qstat json output: '{}'".format(response))
Example 11
    def _validate_reverse_dns(self, node: Node) -> bool:
        # let's make sure the hostname is valid and reverse
        # dns compatible before adding to GE

        # if there is no private ip, then the hostname was removed, most likely
        # by azure DNS
        if not node.private_ip:
            return True

        try:
            addr_info = socket.gethostbyaddr(node.private_ip)
        except Exception as e:
            logging.error(
                "Could not convert private_ip(%s) to hostname using gethostbyaddr() for %s: %s",
                node.private_ip,
                node,
                str(e),
            )
            return False

        addr_info_ips = addr_info[-1]
        if isinstance(addr_info_ips, str):
            addr_info_ips = [addr_info_ips]

        if node.private_ip not in addr_info_ips:
            logging.warning(
                "%s has a hostname whose reverse dns addresses (%s) do not"
                + " include the private_ip reported by cyclecloud (%s)! Skipping",
                node,
                addr_info_ips,
                node.private_ip,
            )
            return False

        expect_multiple_entries = (node.software_configuration.get(
            "cyclecloud", {}).get("hosts", {}).get("standalone_dns",
                                                   {}).get("enabled", True))

        addr_info_hostname = addr_info[0].split(".")[0]
        if addr_info_hostname.lower() != node.hostname.lower():
            if expect_multiple_entries:
                logging.warning(
                    "%s has a hostname that cannot be queried via reverse" +
                    " dns (private_ip=%s cyclecloud hostname=%s reverse dns hostname=%s)."
                    + " This is common and usually repairs itself. Skipping",
                    node,
                    node.private_ip,
                    node.hostname,
                    addr_info_hostname,
                )
            else:
                logging.error(
                    "%s has a hostname that cannot be queried via reverse" +
                    " dns (private_ip=%s cyclecloud hostname=%s reverse dns hostname=%s)."
                    +
                    " If you have an entry for this address in your /etc/hosts file, please remove it.",
                    node,
                    node.private_ip,
                    node.hostname,
                    addr_info_hostname,
                )
            return False
        return True
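
For reference, socket.gethostbyaddr returns a (hostname, aliaslist, ipaddrlist) triple, which is what the addr_info[0] and addr_info[-1] indexing above relies on. A standalone sketch:

import socket

# Resolution depends on the local resolver; 127.0.0.1 is used only because
# it reverse-resolves on most hosts.
hostname, aliases, ips = socket.gethostbyaddr("127.0.0.1")
short_hostname = hostname.split(".")[0]  # addr_info[0].split(".")[0] above
print(short_hostname, ips)               # ips plays the role of addr_info[-1]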
Example 12
    def add_nodes_to_cluster(self, nodes: List[Node]) -> List[Node]:
        self.initialize()

        all_nodes = self.pbscmd.pbsnodes_parsed("-a")
        by_ccnodeid = partition(
            all_nodes, lambda x: x.get("resources_available.ccnodeid"))

        ret = []
        for node in nodes:
            if not node.hostname:
                continue

            if not node.private_ip:
                continue

            node_id = node.delayed_node_id.node_id
            if not node_id:
                logging.error("%s does not have a nodeid! Skipping", node)
                continue

            if node_id in by_ccnodeid:
                skip_node = False
                for ndict in by_ccnodeid[node_id]:
                    if ndict["name"].lower() != node.hostname.lower():
                        logging.error(
                            "Duplicate hostname found for the same node id! %s and %s. See 'valid_hostnames' in autoscale as a possible workaround.",
                            node,
                            ndict["name"],
                        )
                        skip_node = True
                        break
                if skip_node:
                    continue

            if not is_valid_hostname(self.config, node):
                continue

            if not self._validate_reverse_dns(node):
                logging.fine(
                    "%s still has a hostname that cannot be looked up via"
                    " reverse dns. This should repair itself.",
                    node,
                )
                continue

            if not node.resources.get("ccnodeid"):
                logging.info(
                    "%s is not managed by CycleCloud, or at least 'ccnodeid' is not defined. Ignoring",
                    node,
                )
                continue
            try:
                try:
                    ndicts = self.pbscmd.qmgr_parsed("list", "node",
                                                     node.hostname)
                    if ndicts and ndicts[0].get(
                            "resources_available.ccnodeid"):
                        logging.info(
                            "ccnodeid is already defined on %s. Skipping",
                            node)
                        continue
                    # TODO RDH should we just delete it instead?
                    logging.info(
                        "%s already exists in this cluster. Setting resources.",
                        node)
                except CalledProcessError:
                    logging.info(
                        "%s does not exist in this cluster yet. Creating.",
                        node)
                    self.pbscmd.qmgr("create", "node", node.hostname)

                for res_name, res_value in node.resources.items():
                    # we set ccnodeid last, so that we can see that we have completely joined a node
                    # if and only if ccnodeid has been set
                    if res_name == "ccnodeid":
                        continue

                    if res_value is None:
                        continue

                    # TODO RDH track down
                    if res_name == "group_id" and res_value == "None":
                        continue

                    # skip things like host which are useful to set default resources on non-existent
                    # nodes for autoscale packing, but not on actual nodes
                    if res_name in self.read_only_resources:
                        continue

                    if res_name not in self.resource_definitions:
                        # TODO bump to a warning?
                        logging.fine(
                            "%s is an unknown PBS resource for node %s. Skipping this resource",
                            res_name,
                            node,
                        )
                        continue
                    res_value_str: str

                    # pbs size does not support decimals
                    if isinstance(res_value, ht.Size):
                        res_value_str = "{}{}".format(int(res_value.value),
                                                      res_value.magnitude)
                    elif isinstance(res_value, bool):
                        res_value_str = "1" if bool else "0"
                    else:
                        res_value_str = str(res_value)

                    self.pbscmd.qmgr(
                        "set",
                        "node",
                        node.hostname,
                        "resources_available.{}={}".format(
                            res_name, res_value_str),
                    )

                self.pbscmd.qmgr(
                    "set",
                    "node",
                    node.hostname,
                    "resources_available.{}={}".format(
                        "ccnodeid", node.resources["ccnodeid"]),
                )
                self.pbscmd.pbsnodes("-r", node.hostname)
                ret.append(node)
            except SubprocessError as e:
                logging.error(
                    "Could not fully add %s to cluster: %s. Will attempt next cycle",
                    node,
                    e,
                )

        return ret
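
The ht.Size branch above truncates to an integer because PBS size strings do not accept decimals. A minimal sketch of that formatting, using an invented stand-in for ht.Size:

from collections import namedtuple

# Stand-in for hpc.autoscale's ht.Size, assumed to expose .value and
# .magnitude as the code above does.
Size = namedtuple("Size", ["value", "magnitude"])

res_value = Size(value=3.5, magnitude="gb")
print("{}{}".format(int(res_value.value), res_value.magnitude))  # prints 3gb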
Example 13
import os
import shlex
from shutil import which
from subprocess import check_call as _check_call
from typing import Any, List

from hpc.autoscale import hpclogging as logging

_QCMD_LOGGER = logging.getLogger("gridengine.driver")
_QCONF_PATH = which("qconf") or ""
_QMOD_PATH = which("qmod") or ""
_QSELECT_PATH = which("qselect") or ""
_QSTAT_PATH = which("qstat") or ""

__VALIDATED = False
if not __VALIDATED:
    for key, value in list(globals().items()):
        if key.startswith("_Q") and key.endswith("_PATH"):
            if not value:
                executable = key.split("_")[1].lower()
                logging.error("Could not find %s in PATH: %s", executable,
                              os.environ.get("PATH"))
__VALIDATED = True


def check_call(cmd: List[str], *args: Any, **kwargs: Any) -> None:
    shlexed = " ".join([shlex.quote(x) for x in cmd])
    logging.trace("Running '%s'", shlexed)
    _QCMD_LOGGER.info(shlexed)
    try:
        _check_call(cmd, *args, **kwargs)
    except Exception as e:
        logging.error("'%s' failed: %s", shlexed, str(e))
        _QCMD_LOGGER.error(">> %s", str(e))
        raise
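
A hypothetical invocation (the qmod arguments are illustrative); on failure, check_call logs to both loggers and re-raises, so the caller still sees the original exception:

check_call([_QMOD_PATH, "-d", "all.q@execute-1"])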

Example 14
def _parse_complexes(
    autoscale_config: Dict, complex_lines: List[str]
) -> Dict[str, "Complex"]:
    relevant_complexes = None
    if autoscale_config:
        relevant_complexes = autoscale_config.get("gridengine", {}).get(
            "relevant_complexes"
        )
        if relevant_complexes:
            # special handling of ccnodeid, since it is something we
            # create for the user
            relevant_complexes = relevant_complexes + ["ccnodeid"]

        if relevant_complexes:
            logging.info(
                "Restricting complexes for autoscaling to %s", relevant_complexes
            )

    complexes: List[Complex] = []
    headers = complex_lines[0].lower().replace("#", "").split()

    required = set(["name", "type", "consumable"])
    missing = required - set(headers)
    if missing:
        logging.error(
            "Could not parse complex file as it is missing expected columns: %s."
            + " Autoscale likely will not work.",
            list(missing),
        )
        return {}

    # start=2 so that n matches the 1-based line number in the file
    for n, line in enumerate(complex_lines[1:], 2):
        if line.startswith("#"):
            continue
        toks = line.split()
        if len(toks) != len(headers):
            logging.warning(
                "Could not parse complex at line {} - ignoring: '{}'".format(n, line)
            )
            continue
        c = dict(zip(headers, toks))
        try:
            if (
                relevant_complexes
                and c["name"] not in relevant_complexes
                and c["shortcut"] not in relevant_complexes
            ):
                logging.trace(
                    "Ignoring complex %s because it was not defined in gridengine.relevant_complexes",
                    c["name"],
                )
                continue

            complex = Complex(
                name=c["name"],
                shortcut=c.get("shortcut", c["name"]),
                complex_type=c["type"],
                relop=c.get("relop", "=="),
                requestable=c.get("requestable", "YES").lower() == "yes",
                consumable=c.get("consumable", "YES").lower() == "yes",
                default=c.get("default"),
                urgency=int(c.get("urgency", 0)),
            )

            complexes.append(complex)

        except Exception:
            logging.exception("Could not parse complex %s - %s", line, c)

    # TODO test RDH
    ret = partition_single(complexes, lambda x: x.name)
    shortcut_dict = partition_single(complexes, lambda x: x.shortcut)
    ret.update(shortcut_dict)
    return ret
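
A standalone sketch of the header-driven row parsing above, on a contrived two-line complexes file (the column values are illustrative):

lines = [
    "#name     shortcut  type  relop  requestable  consumable  default  urgency",
    "slots     s         INT   <=     YES          YES         1        1000",
]
headers = lines[0].lower().replace("#", "").split()
c = dict(zip(headers, lines[1].split()))
print(c["name"], c["shortcut"], c["consumable"].lower() == "yes")  # slots s True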
Example 15
def main(argv: Optional[Iterable[str]] = None) -> None:
    default_install_dir = os.path.join("/", "opt", "cycle", "gridengine")

    parser = ArgumentParser()
    sub_parsers = parser.add_subparsers()

    def csv_list(x: str) -> List[str]:
        return [tok.strip() for tok in x.split(",")]

    help_msg = io.StringIO()

    def add_parser(name: str,
                   func: Callable,
                   read_only: bool = True,
                   skip_config: bool = False) -> ArgumentParser:
        doc_str = (func.__doc__ or "").strip()
        doc_str = " ".join([x.strip() for x in doc_str.splitlines()])
        help_msg.write("\n    {:20} - {}".format(name, doc_str))

        default_config: Optional[str]
        default_config = os.path.join(default_install_dir, "autoscale.json")
        if not os.path.exists(default_config):
            default_config = None

        new_parser = sub_parsers.add_parser(name)
        new_parser.set_defaults(func=func, read_only=read_only)

        if skip_config:
            return new_parser

        new_parser.add_argument("--config",
                                "-c",
                                default=default_config,
                                required=not bool(default_config))
        return new_parser

    def str_list(c: str) -> List[str]:
        return c.split(",")

    def add_parser_with_columns(name: str,
                                func: Callable,
                                read_only: bool = True) -> ArgumentParser:
        parser = add_parser(name, func, read_only)

        def parse_format(c: str) -> str:
            c = c.lower()
            if c in ["json", "table", "table_headerless"]:
                return c
            print("Expected json, table or table_headerless - got",
                  c,
                  file=sys.stderr)
            sys.exit(1)

        parser.add_argument("--output-columns", "-o", type=str_list)
        parser.add_argument("--output-format", "-F", type=parse_format)
        return parser

    add_parser_with_columns("autoscale", autoscale, read_only=False)

    add_parser_with_columns("buckets",
                            buckets).add_argument("--constraint-expr",
                                                  "-C",
                                                  default="[]")

    add_parser("complexes", complexes).add_argument("-a",
                                                    "--include-irrelevant",
                                                    action="store_true",
                                                    default=False)

    delete_parser = add_parser("delete_nodes", delete_nodes, read_only=False)
    delete_parser.add_argument("-H", "--hostnames", type=str_list, default=[])
    delete_parser.add_argument("-N", "--node-names", type=str_list, default=[])
    delete_parser.add_argument("--force", action="store_true", default=False)

    remove_parser = add_parser("remove_nodes", remove_nodes, read_only=False)
    remove_parser.add_argument("-H", "--hostnames", type=str_list, default=[])
    remove_parser.add_argument("-N", "--node-names", type=str_list, default=[])
    remove_parser.add_argument("--force", action="store_true", default=False)

    add_parser_with_columns("demand", demand).add_argument("--jobs",
                                                           "-j",
                                                           default=None,
                                                           required=False)

    add_parser("drain_node", drain_node,
               read_only=False).add_argument("-H", "--hostname", required=True)

    initconfig_parser = add_parser("initconfig",
                                   initconfig,
                                   read_only=False,
                                   skip_config=True)

    initconfig_parser.add_argument("--cluster-name", required=True)
    initconfig_parser.add_argument("--username", required=True)
    initconfig_parser.add_argument("--password")
    initconfig_parser.add_argument("--url", required=True)
    initconfig_parser.add_argument(
        "--log-config",
        default=os.path.join(default_install_dir, "logging.conf"),
        dest="logging__config_file",
    )
    initconfig_parser.add_argument("--lock-file",
                                   default=os.path.join(
                                       default_install_dir, "scalelib.lock"))
    initconfig_parser.add_argument(
        "--default-resource",
        type=json.loads,
        action="append",
        default=[],
        dest="default_resources",
    )
    initconfig_parser.add_argument(
        "--default-hostgroups",
        type=json.loads,
        action="append",
        default=[],
        dest="default_hostgroups",
    )
    initconfig_parser.add_argument(
        "--relevant-complexes",
        default=["slots", "slot_type", "exclusive"],
        type=csv_list,
        dest="gridengine__relevant_complexes",
    )

    initconfig_parser.add_argument("--idle-timeout",
                                   default=300,
                                   type=int,
                                   dest="idle_timeout")
    initconfig_parser.add_argument("--boot-timeout",
                                   default=1800,
                                   type=int,
                                   dest="boot_timeout")
    initconfig_parser.add_argument(
        "--disable-pgs-for-pe",
        default=[],
        type=str,
        action="append",
        help="Disable creation of placement groups for a parallel environment. "
        + "This can be invoked more than once.",
        dest="disable_pgs_for_pe",
    )
    initconfig_parser.add_argument(
        "--hostgroup-constraint",
        default=[],
        action="append",
        dest="hostgroup_constraints",
    )

    add_parser("jobs", jobs)
    add_parser("jobs_and_nodes", jobs_and_nodes)

    support_archive_parser = add_parser("support_archive",
                                        create_support_archive)
    support_archive_parser.add_argument(
        "--archive",
        "-a",
        default="gridengine_support-{}.tar.gz".format(time.time()))

    join_cluster_parser = add_parser("join_cluster", join_cluster)
    join_cluster_parser.add_argument("-H", "--hostnames", type=str_list)
    join_cluster_parser.add_argument("-N", "--nodenames", type=str_list)

    add_parser_with_columns("nodes", nodes).add_argument("--constraint-expr",
                                                         "-C",
                                                         default="[]")

    add_parser("scheduler_nodes", scheduler_nodes)

    help_msg.write("\nadvanced usage:")
    add_parser("validate", validate_func, read_only=True)
    add_parser("queues", queues, read_only=True)
    add_parser("shell", shell)
    analyze_parser = add_parser("analyze", analyze)
    analyze_parser.add_argument("--job-id", "-j", required=True)
    analyze_parser.add_argument("--wide",
                                "-w",
                                action="store_true",
                                default=False)

    parser.usage = help_msg.getvalue()
    args = parser.parse_args(list(argv) if argv is not None else None)
    if not hasattr(args, "func"):
        parser.print_help()
        sys.exit(1)

    # load the config file path into a config dict
    if hasattr(args, "config"):
        try:
            with open(args.config) as fr:
                args.config = json.load(fr)
        except Exception as e:
            logging.error("Could not load config file %s: %s", args.config, e)
            sys.exit(1)
        logging.initialize_logging(args.config)

    if args.read_only:
        args.config["read_only"] = True
        args.config["lock_file"] = None

    kwargs = {}
    for k in dir(args):
        if k[0].islower() and k not in ["read_only", "func"]:
            kwargs[k] = getattr(args, k)

    try:
        args.func(**kwargs)
    except Exception as e:
        print(str(e), file=sys.stderr)
        if hasattr(e, "message"):
            print(getattr(e, "message"), file=sys.stderr)
        logging.debug("Full stacktrace", exc_info=sys.exc_info())
        sys.exit(1)
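
Hypothetical invocations of the CLI defined above; the config path and hostnames are illustrative and assume a configured installation:

main(["nodes", "-c", "/opt/cycle/gridengine/autoscale.json"])
main(["delete_nodes", "-c", "autoscale.json", "-H", "execute-1,execute-2"])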