Example 1
def analyze(config: Dict, job_id: str, wide: bool = False) -> None:
    if not wide:
        try:
            _, columns_str = os.popen("stty size", "r").read().split()
        except Exception:
            columns_str = "120"
        columns = int(columns_str)
    else:
        columns = 120

    ctx = DefaultContextHandler("[demand-cli]")

    register_result_handler(ctx)
    ge_env = environment.from_qconf(config)
    ge_driver = autoscaler.new_driver(config, ge_env)
    config = ge_driver.preprocess_config(config)
    autoscaler.calculate_demand(config, ge_env, ge_driver, ctx)

    key = "[job {}]".format(job_id)
    results = ctx.by_context[key]
    for result in results:
        if isinstance(result, (EarlyBailoutResult, MatchResult)) and result:
            continue

        if isinstance(result, HostgroupConstraint) and not result:
            continue
        if wide:
            print(result.message)
        else:
            print(result.message[:columns])
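
Note: "stty size" only reports a width when stdin is attached to a real TTY,
which is why the fallback above exists. A minimal stdlib alternative (not
what this module uses, just a sketch) that degrades gracefully:

import shutil

# consults COLUMNS/LINES, then the controlling terminal, then the fallback
columns = shutil.get_terminal_size(fallback=(120, 25)).columns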
Example 2
def resources(config: Dict, constraint_expr: str) -> None:
    ge_env = environment.from_qconf(config)
    ge_driver = autoscaler.new_driver(config, ge_env)
    node_mgr = new_node_manager(config, existing_nodes=ge_driver)

    filtered = _query_with_constraints(config, constraint_expr, node_mgr.get_buckets())

    columns = set()
    for node in filtered:
        columns.update(set(node.resources.keys()))
    # the dangling expression here was a no-op; presumably the intent was to
    # record the discovered resource names as the printed output columns
    config["output_columns"] = sorted(columns)
Example 3
def nodes(
    config: Dict,
    constraint_expr: str,
    output_columns: Optional[List[str]] = None,
    output_format: Optional[str] = None,
) -> None:
    """Query nodes"""
    ge_env = environment.from_qconf(config)
    ge_driver = autoscaler.new_driver(config, ge_env)
    dcalc = autoscaler.new_demand_calculator(config, ge_env, ge_driver)
    filtered = _query_with_constraints(config, constraint_expr,
                                       dcalc.node_mgr.get_nodes())

    demand_result = DemandResult([], filtered, [], [])
    autoscaler.print_demand(config, demand_result, output_columns,
                            output_format)
Example 4
def validate_func(config: Dict) -> None:
    ge_env = environment.from_qconf(config)
    dcalc = autoscaler.new_demand_calculator(config, ge_env=ge_env)
    queue: GridEngineQueue
    failure = False
    failure = (
        validate.validate_hg_intersections(ge_env, dcalc.node_mgr, warn) or failure
    )
    failure = validate.validate_nodes(config, dcalc, warn) or failure
    for qname, queue in ge_env.queues.items():
        failure = validate.validate_queue_has_hosts(queue, ge_env.qbin, warn) or failure
        failure = validate.validate_ht_hostgroup(queue, ge_env, warn) or failure
        failure = validate.validate_pe_hostgroups(queue, warn) or failure

    if failure:
        sys.exit(1)
Example 5
def complexes(config: Dict, include_irrelevant: bool = False) -> None:
    """Prints out, by default, only relevant complexes"""
    relevant: typing.Optional[typing.Set[str]]
    if include_irrelevant:
        ge_config = config.get("gridengine", {})
        if "relevant_complexes" in ge_config:
            ge_config.pop("relevant_complexes")

    relevant = set(config.get("gridengine", {}).get("relevant_complexes", []))
    ge_env = from_qconf(config)
    already_printed: typing.Set[str] = set()
    for complex in ge_env.complexes.values():
        if ((include_irrelevant or complex.name in relevant)
                and complex.name not in already_printed):
            print(repr(complex))
            already_printed.add(complex.name)
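
The parentheses above matter: in Python "and" binds tighter than "or", so
the unparenthesized condition would skip the already_printed check whenever
include_irrelevant is true and print duplicates. A two-line demonstration:

# "x or y and z" groups as "x or (y and z)", not "(x or y) and z"
assert (True or False and False) is True
assert ((True or False) and False) is False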
Example 6
def buckets(
    config: Dict,
    constraint_expr: str,
    output_columns: Optional[List[str]] = None,
    output_format: Optional[str] = None,
) -> None:
    """Prints out autoscale bucket information, like limits etc"""
    ge_env = environment.from_qconf(config)
    ge_driver = autoscaler.new_driver(config, ge_env)
    config = ge_driver.preprocess_config(config)
    node_mgr = new_node_manager(config)
    specified_output_columns = output_columns
    output_columns = output_columns or [
        "nodearray",
        "placement_group",
        "vm_size",
        "vcpu_count",
        "pcpu_count",
        "memory",
        "available_count",
    ]

    if specified_output_columns is None:
        for bucket in node_mgr.get_buckets():
            for resource_name in bucket.resources:
                if resource_name not in output_columns:
                    output_columns.append(resource_name)

            # expose each bucket's integer "*count*" limit attributes
            # (e.g. available_count) as pseudo-resources; this must run
            # per bucket, so it belongs inside the loop above
            for attr in dir(bucket.limits):
                if attr[0].isalpha() and "count" in attr:
                    value = getattr(bucket.limits, attr)
                    if isinstance(value, int):
                        bucket.resources[attr] = value
                        bucket.example_node._resources[attr] = value

    filtered = _query_with_constraints(config, constraint_expr,
                                       node_mgr.get_buckets())

    demand_result = DemandResult([], [f.example_node for f in filtered], [],
                                 [])

    if "all" in output_columns:
        output_columns = ["all"]
    config["output_columns"] = output_columns

    autoscaler.print_demand(config, demand_result, output_columns,
                            output_format)
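
The dir()-based scan above is plain attribute reflection; a self-contained
illustration of the same filter, using a made-up class:

class FakeLimits:
    active_count = 4
    available_count = 10
    _hidden_count = 99  # skipped: first character is not alphabetic

# collect integer attributes whose names contain "count"
counts = {
    attr: getattr(FakeLimits, attr)
    for attr in dir(FakeLimits)
    if attr[0].isalpha() and "count" in attr
    and isinstance(getattr(FakeLimits, attr), int)
}
print(counts)  # {'active_count': 4, 'available_count': 10}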
Example 7
def demand(
    config: Dict,
    jobs: Optional[str] = None,
    scheduler_nodes: Optional[str] = None,
    output_columns: Optional[List[str]] = None,
    output_format: Optional[str] = None,
) -> None:
    """Runs autoscale in dry run mode to see the demand for new nodes"""
    logging.debug("Begin demand")
    ctx = DefaultContextHandler("[demand-cli]")
    register_result_handler(ctx)
    ge_env = environment.from_qconf(config)
    ge_driver = autoscaler.new_driver(config, ge_env)
    config = ge_driver.preprocess_config(config)
    demand_calc = autoscaler.calculate_demand(config, ge_env, ge_driver, ctx)
    demand_result = demand_calc.finish()

    autoscaler.print_demand(config, demand_result, output_columns, output_format)
    logging.debug("End demand")
Example 8
def _find_nodes(
    config: Dict, hostnames: List[str], node_names: List[str]
) -> Tuple[GridEngineDriver, DemandCalculator, List[Node]]:
    hostnames = hostnames or []
    node_names = node_names or []
    ge_env = environment.from_qconf(config)
    ge_driver = autoscaler.new_driver(config, ge_env)

    demand_calc = autoscaler.calculate_demand(config, ge_env, ge_driver)
    demand_result = demand_calc.finish()
    by_hostname = partition_single(
        demand_result.compute_nodes, lambda n: n.hostname_or_uuid.lower()
    )
    by_node_name = partition_single(
        demand_result.compute_nodes, lambda n: n.name.lower()
    )
    found_nodes = []
    for hostname in hostnames:
        if not hostname:
            error("Please specify a hostname")

        if hostname.lower() not in by_hostname:
            # it doesn't exist in CC, but we still want to delete it
            # from the cluster
            by_hostname[hostname.lower()] = SchedulerNode(hostname, {})

        found_nodes.append(by_hostname[hostname.lower()])

    for node_name in node_names:
        if not node_name:
            error("Please specify a node_name")

        if node_name.lower() not in by_node_name:
            error(
                "Could not find a CycleCloud node that has node_name %s."
                + " Run 'nodes' to see available nodes.",
                node_name,
            )
        found_nodes.append(by_node_name[node_name.lower()])

    return ge_driver, demand_calc, found_nodes
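
partition_single is imported from the hpc.autoscale utilities; a rough
sketch (name and error behavior assumed) of the contract the hostname and
node-name lookups above depend on:

from typing import Callable, Dict, List, TypeVar

T = TypeVar("T")
K = TypeVar("K")

def partition_single_sketch(items: List[T], key: Callable[[T], K]) -> Dict[K, T]:
    # map each computed key to exactly one item, so callers can
    # index by hostname or node name directly
    ret: Dict[K, T] = {}
    for item in items:
        k = key(item)
        if k in ret:
            raise RuntimeError("Duplicate key {}".format(k))
        ret[k] = item
    return ret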
Example 9
def queues(config: Dict) -> None:
    ge_env = environment.from_qconf(config)
    schedulers = ge_env.qbin.qconf(["-sss"]).split()
    rows: List[List[str]] = []

    for qname, ge_queue in ge_env.queues.items():

        for hgrp in ge_queue.hostlist_groups:
            fqdns = ge_env.qbin.qconf(["-shgrp", hgrp]).splitlines()
            for line in fqdns:
                line = line.strip()
                if not line:
                    continue

                if line.startswith("group_name"):
                    continue

                # strip the "hostlist " prefix, keeping only the host expressions
                if line.startswith("hostlist "):
                    line = line[len("hostlist ") :]  # noqa: E203

                for fqdn_expr in line.split():
                    fqdn_expr = fqdn_expr.strip()
                    if not fqdn_expr or fqdn_expr == "\\":
                        continue
                    host = fqdn_expr.split(".")[0]

                    if host in schedulers:
                        continue

                    rows.append([qname, hgrp, host])

    demandprinter.print_rows(
        columns=["QNAME", "HOSTGROUP", "HOSTNAME"],
        rows=rows,
        stream=sys.stdout,
        output_format="table",
    )
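
For reference, the "qconf -shgrp" output the inner loop parses looks roughly
like the sample below (hostnames invented). It shows why the code skips the
group_name line, strips the "hostlist " prefix, and ignores bare backslash
continuation tokens:

# illustrative "qconf -shgrp @allhosts" output
sample = (
    "group_name @allhosts\n"
    "hostlist ip-0a000004.example.com ip-0a000005.example.com \\\n"
    "         ip-0a000006.example.com\n"
)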
Example 10
def validate_func(config: Dict) -> None:
    ge_env = environment.from_qconf(config)
    dcalc = autoscaler.new_demand_calculator(config, ge_env=ge_env)
    queue: GridEngineQueue
    success = True
    success = (validate.validate_hg_intersections(ge_env, dcalc.node_mgr, warn)
               and success)
    success = validate.validate_nodes(config, dcalc, warn) and success

    for qname, queue in ge_env.queues.items():
        success = (validate.validate_queue_has_hosts(queue, ge_env.qbin, warn)
                   and success)
        success = validate.validate_ht_hostgroup(queue, ge_env,
                                                 warn) and success
        success = validate.validate_pe_hostgroups(queue, warn) and success

    success = validate.validate_default_hostgroups(config, ge_env,
                                                   warn) and success
    success = validate.validate_scheduler_has_no_slots(config, ge_env,
                                                       warn) and success

    if not success:
        sys.exit(1)
Example 11
def new_demand_calculator(
    config: Dict,
    ge_env: Optional[GridEngineEnvironment] = None,
    ge_driver: Optional["GridEngineDriver"] = None,
    ctx_handler: Optional[DefaultContextHandler] = None,
    node_history: Optional[NodeHistory] = None,
    singleton_lock: Optional[SingletonLock] = None,
) -> DemandCalculator:
    if ge_env is None:
        ge_env = envlib.from_qconf(config)

    if ge_driver is None:
        ge_driver = new_driver(config, ge_env)

    if node_history is None:
        db_path = config.get("nodehistorydb")
        if not db_path:
            db_dir = "/opt/cycle/jetpack/system/bootstrap/gridengine"
            if not os.path.exists(db_dir):
                db_dir = os.getcwd()
            db_path = os.path.join(db_dir, "nodehistory.db")

        read_only = config.get("read_only", False)
        node_history = SQLiteNodeHistory(db_path, read_only)

        node_history.create_timeout = config.get("boot_timeout", 3600)
        node_history.last_match_timeout = config.get("idle_timeout", 300)

    demand_calculator = dcalclib.new_demand_calculator(
        config,
        existing_nodes=ge_env.nodes,
        node_history=node_history,
        node_queue=ge_driver.new_node_queue(),
        singleton_lock=singleton_lock,  # it will handle the None case
    )

    for name, default_complex in ge_env.complexes.items():
        if name == "slots":
            continue

        if default_complex.default is None:
            continue

        if not default_complex.requestable:
            continue

        logging.trace("Adding default resource %s=%s", name,
                      default_complex.default)
        demand_calculator.node_mgr.add_default_resource(
            {}, name, default_complex.default)

    ccnode_id_added = False
    slots_added: Set[str] = set()

    for bucket in demand_calculator.node_mgr.get_buckets():
        if "slots" not in bucket.resources and bucket.nodearray not in slots_added:
            default = (
                '"default_resources": [{"select": {"node.nodearray": "%s"}, "name": "slots", "value": "node.vcpu_count"}]'
                % (bucket.nodearray))
            demand_calculator.node_mgr.add_default_resource(
                selection={"node.nodearray": bucket.nodearray},
                resource_name="slots",
                default_value="node.vcpu_count",
            )

            logging.warning(
                """slots is not defined for bucket {}. Using the default, which you can add to your config: {}"""
                .format(bucket, default))
            slots_added.add(bucket.nodearray)

        # ccnodeid will almost certainly not be defined. It just needs
        # to be defined once, so we will add a default for all nodes
        # the first time we see it is missing
        if "ccnodeid" not in bucket.resources and not ccnode_id_added:
            demand_calculator.node_mgr.add_default_resource(
                selection={},  # applies to all nodes
                resource_name="ccnodeid",
                default_value=lambda n: n.delayed_node_id.node_id,
            )
            ccnode_id_added = True

    return demand_calculator
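
The slots warning above prints the exact stanza it falls back to. As a
config example, assuming a nodearray named "hpc", the equivalent entry in
the config dict (or autoscale.json) would be:

config = {
    "default_resources": [
        {
            "select": {"node.nodearray": "hpc"},
            "name": "slots",
            "value": "node.vcpu_count",
        }
    ],
}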
Example 12
def autoscale_grid_engine(
    config: Dict[str, Any],
    ge_env: Optional[GridEngineEnvironment] = None,
    ge_driver: Optional["GridEngineDriver"] = None,
    ctx_handler: Optional[DefaultContextHandler] = None,
    node_history: Optional[NodeHistory] = None,
    dry_run: bool = False,
) -> DemandResult:
    global _exit_code

    assert not config.get("read_only", False)
    if dry_run:
        logging.warning("Running gridengine autoscaler in dry run mode")
        # allow multiple instances
        config["lock_file"] = None
        # put in read only mode
        config["read_only"] = True

    if ge_env is None:
        ge_env = envlib.from_qconf(config)

    # interface to GE, generally by cli
    if ge_driver is None:
        # allow tests to pass in a mock
        ge_driver = new_driver(config, ge_env)

    ge_driver.initialize_environment()

    config = ge_driver.preprocess_config(config)

    logging.fine("Driver = %s", ge_driver)

    invalid_nodes = []

    # we need an instance without any scheduler nodes, so don't
    # pass in the existing nodes.
    tmp_node_mgr = new_node_manager(config)

    by_hostname = partition_single(tmp_node_mgr.get_nodes(),
                                   lambda n: n.hostname_or_uuid)

    for node in ge_env.nodes:
        # there are many combinations of 'a', 'u', and other state letters;
        # as long as both 'a' and 'u' are present, the node is down
        state = node.metadata.get("state", "")
        cc_node = by_hostname.get(node.hostname)
        ccnodeid = node.resources.get("ccnodeid")
        if cc_node:
            if not ccnodeid or ccnodeid == cc_node.delayed_node_id.node_id:
                if cc_node.state in ["Preparing", "Acquiring"]:
                    continue
        if "a" in state and "u" in state:
            invalid_nodes.append(node)

    # nodes in error state must also be deleted
    nodes_to_delete = ge_driver.clean_hosts(invalid_nodes)
    for node in nodes_to_delete:
        ge_env.delete_node(node)

    demand_calculator = calculate_demand(config, ge_env, ge_driver,
                                         ctx_handler, node_history)

    ge_driver.handle_failed_nodes(
        demand_calculator.node_mgr.get_failed_nodes())

    demand_result = demand_calculator.finish()

    if ctx_handler:
        ctx_handler.set_context("[joining]")

    # details here are that we pass in nodes that matter (matched) and the driver figures out
    # which ones are new and need to be added via qconf
    joined = ge_driver.handle_join_cluster(
        [x for x in demand_result.compute_nodes if x.exists])

    ge_driver.handle_post_join_cluster(joined)

    if ctx_handler:
        ctx_handler.set_context("[scaling]")

    # bootup all nodes. Optionally pass in a filtered list
    if demand_result.new_nodes:
        if not dry_run:
            demand_calculator.bootup()

    if not dry_run:
        demand_calculator.update_history()

    # we also tell the driver about nodes that are unmatched. It filters them out
    # and returns a list of ones we can delete.
    idle_timeout = int(config.get("idle_timeout", 300))
    boot_timeout = int(config.get("boot_timeout", 3600))
    logging.fine("Idle timeout is %s", idle_timeout)

    idle_nodes = demand_calculator.find_unmatched_for(at_least=idle_timeout)
    timed_out_booting = demand_calculator.find_booting(at_least=boot_timeout)

    # I don't care about nodes that have keep_alive=true
    timed_out_booting = [n for n in timed_out_booting if not n.keep_alive]

    timed_out_to_deleted = []
    unmatched_nodes_to_delete = []

    if timed_out_booting:
        logging.info("The following nodes have timed out while booting: %s",
                     timed_out_booting)
        timed_out_to_deleted = ge_driver.handle_boot_timeout(
            timed_out_booting) or []

    if idle_nodes:
        node_expr = ", ".join([str(x) for x in idle_nodes])
        logging.info("Unmatched for at least %s seconds: %s", idle_timeout,
                     node_expr)
        unmatched_nodes_to_delete = (
            ge_driver.handle_draining(idle_nodes) or [])

    nodes_to_delete = []
    for node in timed_out_to_deleted + unmatched_nodes_to_delete:
        if node.assignments:
            logging.warning(
                "%s has jobs assigned to it so we will take no action.", node)
            continue
        nodes_to_delete.append(node)

    if nodes_to_delete:
        try:
            logging.info("Deleting %s", [str(n) for n in nodes_to_delete])
            delete_result = demand_calculator.delete(nodes_to_delete)

            if delete_result:
                # in case it has anything to do after a node is deleted (usually just remove it from the cluster)
                ge_driver.handle_post_delete(delete_result.nodes)
        except Exception as e:
            _exit_code = 1
            logging.warning(
                "Deletion failed, will retry on next iteration: %s", e)
            logging.exception(str(e))

    print_demand(config, demand_result, log=not dry_run)

    return demand_result
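
Both timeouts read above are ordinary config keys; a sketch showing them set
to the defaults the code falls back to:

config = {
    "idle_timeout": 300,   # seconds a node may stay unmatched before draining
    "boot_timeout": 3600,  # seconds a node may spend booting before deletion
}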
Example 13
def shell(config: Dict) -> None:
    """
        Provides read only interactive shell. type gehelp()
        in the shell for more information
    """
    ctx = DefaultContextHandler("[interactive-readonly]")

    ge_env = environment.from_qconf(config)
    ge_driver = autoscaler.new_driver(config, ge_env)
    config = ge_driver.preprocess_config(config)
    demand_calc = autoscaler.new_demand_calculator(config, ge_env, ge_driver, ctx)

    queues = ge_env.queues

    def gehelp() -> None:
        print("config       - dict representing autoscale configuration.")
        print("dbconn       - Read-only SQLite conn to node history")
        print("demand_calc  - DemandCalculator")
        print("ge_driver    - GEDriver object.")
        print("jobs         - List[Job] from ge_driver")
        print("node_mgr     - NodeManager")
        print("logging      - HPCLogging module")
        print("queues      - GridEngineQueue objects")

    shell_locals = {
        "config": config,
        "ctx": ctx,
        "ge_driver": ge_driver,
        "demand_calc": demand_calc,
        "node_mgr": demand_calc.node_mgr,
        "jobs": ge_env.jobs,
        "dbconn": demand_calc.node_history.conn,
        "gehelp": gehelp,
        "queues": queues,
        "ge_env": ge_env,
    }
    banner = "\nCycleCloud GE Autoscale Shell"
    interpreter = ReraiseAssertionInterpreter(locals=shell_locals)
    try:
        __import__("readline")
        # some magic - create a completer that is bound to the locals in this interpreter and not
        # the __main__ interpreter.
        interpreter.push("import readline, rlcompleter")
        interpreter.push('readline.parse_and_bind("tab: complete")')
        interpreter.push("_completer = rlcompleter.Completer(locals())")
        interpreter.push("def _complete_helper(text, state):")
        interpreter.push("    ret = _completer.complete(text, state)")
        interpreter.push('    ret = ret + ")" if ret and ret[-1] == "(" else ret')
        interpreter.push("    return ret")
        interpreter.push("")
        interpreter.push("readline.set_completer(_complete_helper)")
        for item in interpreter.history_lines:
            try:
                if '"""' in item:
                    interpreter.push(
                        "readline.add_history('''%s''')" % item.rstrip("\n")
                    )
                else:
                    interpreter.push(
                        'readline.add_history("""%s""")' % item.rstrip("\n")
                    )
            except Exception:
                pass

        interpreter.push("from hpc.autoscale.job.job import Job\n")
        interpreter.push("from hpc.autoscale import hpclogging as logging\n")

    except ImportError:
        banner += (
            "\nWARNING: `readline` is not installed, so autocomplete will not work."
        )

    interpreter.interact(banner=banner)
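
The readline block above exists to bind tab completion to the shell's own
locals rather than __main__'s globals; the same technique in a
self-contained sketch:

import code

local_vars = {"config": {"idle_timeout": 300}}
try:
    import readline
    import rlcompleter

    # bind the completer to local_vars, not __main__'s globals
    readline.set_completer(rlcompleter.Completer(local_vars).complete)
    readline.parse_and_bind("tab: complete")
except ImportError:
    pass  # no readline: the shell still works, without tab completion

code.InteractiveConsole(locals=local_vars).interact(banner="demo shell")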
Example 14
def create_support_archive(config: Dict, archive: str) -> None:
    """
    Creates an archive with most logs and configurations required when requesting support.
    """
    ge_env = environment.from_qconf(config)

    # for some reason mypy doesn't see gzopen
    tf = tarfile.TarFile.gzopen(archive, "w")  # type: ignore

    def _add(cmd: List[str], name: str) -> None:
        contents = ge_env.qbin.qconf(cmd)
        _add_contents(contents, name)

    def _add_contents(contents: str, name: str) -> None:
        payload = contents.encode()
        tarinfo = tarfile.TarInfo("gridengine-support/" + name)
        # size must be the encoded byte length, not the character count
        tarinfo.size = len(payload)
        tarinfo.mtime = int(time.time())
        tf.addfile(tarinfo, io.BytesIO(payload))

    # get our queue definitions
    for qname in ge_env.queues:
        _add(["-sq", qname], "queue_{}".format(qname))

    # get our parallel env definitions
    for pe_name in ge_env.pes:
        _add(["-sp", pe_name], "pe_{}".format(pe_name))

    # get the list of hostgroup names; the actual definitions are immaterial
    _add(["-shgrpl"], "hostgroups")
    # dump out the complexes
    _add(["-sc"], "complexes")

    config_no_creds = dict(config)
    config_no_creds["password"] = ""
    config_no_creds["cluster_name"] = ""
    config_no_creds["username"] = ""
    config_no_creds["url"] = ""
    _add_contents(json.dumps(config_no_creds, indent=2), "autoscale.json")

    install_logs = os.path.join(os.getenv("SGE_ROOT", ""),
                                os.getenv("SGE_CELL", ""),
                                "common/install_logs")
    if os.path.exists(install_logs):
        for fil in os.listdir(install_logs):
            path = os.path.join(install_logs, fil)
            with open(path) as fr:
                _add_contents(fr.read(), fil)

    # e.g. /sched/sge/sge-2011.11/default/spool/qmaster/messages
    spool_dir = os.path.join(os.getenv("SGE_ROOT", ""),
                             os.getenv("SGE_CELL", ""), "spool")
    if os.path.exists(spool_dir):
        for hostname in os.listdir(spool_dir):
            messages_path = os.path.join(spool_dir, hostname, "messages")
            if os.path.exists(messages_path):
                with open(messages_path) as fr:
                    _add_contents(fr.read(), "messages_{}".format(hostname))

    # may not exist on self-installs
    chef_client_log = "/opt/cycle/jetpack/logs/chef-client.log"
    if os.path.exists(chef_client_log):
        with open(chef_client_log) as fr:
            _add_contents(fr.read(), "chef-client.log")

    # find autoscale.log and autoscale.log.1-5
    for handler in logging.getLogger().handlers:
        if hasattr(handler, "baseFilename"):
            base_filename = getattr(handler, "baseFilename")
            file_names = [base_filename] + [
                base_filename + ".{}".format(n) for n in range(1, 6)
            ]
            for fname in file_names:
                if os.path.exists(fname):
                    with open(fname) as fr:
                        _add_contents(fr.read(), os.path.basename(fname))
    tf.close()
    print("Wrote archive to", archive)
Example 15
def jobs(config: Dict) -> None:
    """Writes out Job objects as json"""
    ge_env = environment.from_qconf(config)
    util.json_dump(ge_env.jobs)
Example 16
def jobs_and_nodes(config: Dict) -> None:
    """Writes out SchedulerNode and Job objects as json - simultaneously to avoid race"""
    ge_env = environment.from_qconf(config)
    to_dump = {"jobs": ge_env.jobs, "nodes": ge_env.nodes}
    util.json_dump(to_dump)
Example 17
def scheduler_nodes(config: Dict) -> None:
    """Writes out SchedulerNode objects as json"""
    ge_env = environment.from_qconf(config)
    util.json_dump(ge_env.nodes)