Code Example #1
File: cli.py Project: Azure/cyclecloud-scalelib
    def _setup_shell_locals(self, config: Dict) -> Dict:
        ctx = DefaultContextHandler("[interactive-readonly]")

        driver = self._driver(config)
        dcalc, jobs_list = self._demand_calc(config, driver)
        nodes_list = dcalc.node_mgr.get_nodes()
        for node in nodes_list:
            node.shellify()
        nodes = partition_single(nodes_list, lambda n: n.name)
        nodes.update(
            partition_single([x for x in nodes_list if x.hostname],
                             lambda n: n.hostname))
        jobs: Dict[str, Any]
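        # job names are usually unique; if they are not, fall back to grouping
        # duplicates into lists instead of failing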
        try:
            jobs = partition_single(jobs_list, lambda j: j.name)
        except Exception:
            jobs = partition(jobs_list, lambda j: j.name)

        return {
            "config": config,
            "cli": self,
            "ctx": ctx,
            "demand_calc": dcalc,
            "node_mgr": dcalc.node_mgr,
            "jobs": ShellDict(jobs),
            "nodes": ShellDict(nodes),
        }
Code Example #2
    def _setup_shell_locals(self, config: Dict) -> Dict:
        """
        Provides a read-only interactive shell. Type hpcpackhelp()
        in the shell for more information.
        """
        ctx = DefaultContextHandler("[interactive-readonly]")

        def hpcpackhelp() -> None:
            print(
                "config               - dict representing autoscale configuration."
            )
            print(
                "cli                  - object representing the CLI commands")
            print(
                "node_mgr             - ScaleLib NodeManager - interacts with CycleCloud for all node related\n"
                "                    activities - creation, deletion, limits, buckets etc."
            )
            print("hpcpackhelp            - This help function")

        shell_locals = {
            "config": config,
            "cli": self,
            "ctx": ctx,
            "node_mgr": new_node_manager(config),
            "hpcpackhelp": hpcpackhelp,
        }

        return shell_locals
Code Example #3
def test_choice_ordering() -> None:
    bindings = MockClusterBinding()
    bindings.add_nodearray("array-a", {"nodetype": "A"})
    bindings.add_bucket("array-a", "Standard_F4", 10, 10)
    bindings.add_nodearray("array-b", {"nodetype": "B"})
    bindings.add_bucket("array-b", "Standard_F4s", 10, 10)

    register_result_handler(DefaultContextHandler("[test_or_ordering]"))
    for ordering in [["A", "B"], ["B", "A"]]:
        node_mgr = _node_mgr(bindings)
        hi, lo = node_mgr.get_buckets()

        if hi.resources["nodetype"] != ordering[0]:
            hi, lo = lo, hi

        assert hi.available_count == 10
        assert lo.available_count == 10
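        # request 15 exclusive nodes; the bucket whose nodetype matches the first
        # entry in `ordering` should be drained (10 nodes) before the second (5)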
        result = node_mgr.allocate(
            {
                "nodetype": ordering,
                "exclusive": True,
            },
            node_count=15,  # noqa: E231
        )
        assert hi.available_count == 0
        assert lo.available_count == 5
        assert result

        by_array = partition(result.nodes, lambda n: n.resources["nodetype"])
        assert len(by_array[ordering[0]]) == 10
        assert len(by_array[ordering[1]]) == 5
Code Example #4
def analyze(config: Dict, job_id: str, wide: bool = False) -> None:
    if not wide:
        try:
            _, columns_str = os.popen("stty size", "r").read().split()
        except Exception:
            columns_str = "120"
        columns = int(columns_str)
    else:
        columns = 120

    ctx = DefaultContextHandler("[demand-cli]")

    register_result_handler(ctx)
    ge_env = environment.from_qconf(config)
    ge_driver = autoscaler.new_driver(config, ge_env)
    config = ge_driver.preprocess_config(config)
    autoscaler.calculate_demand(config, ge_env, ge_driver, ctx)

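    # the demand calculation above recorded its results in ctx, keyed by context;
    # pull back only the results filed under this job's context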
    key = "[job {}]".format(job_id)
    results = ctx.by_context[key]
    for result in results:
        if isinstance(result, (EarlyBailoutResult, MatchResult)) and result:
            continue

        if isinstance(result, HostgroupConstraint) and not result:
            continue
        if wide:
            print(result.message)
        else:
            print(result.message[:columns])
Code Example #5
    def invoke(*args: Any, **kwargs: Any) -> Any:
        handler = register_result_handler(
            DefaultContextHandler("[{}]".format(func.__name__)))
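        # if the wrapped function declares a `handler` parameter, pass the freshly
        # registered context handler through to it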
        if "handler" in inspect.signature(func).parameters:
            kwargs["handler"] = handler

        ret = func(*args, **kwargs)

        unregister_result_handler(handler)
        return ret
Code Example #6
def autoscale(
    config: Dict,
    output_columns: Optional[List[str]] = None,
    output_format: Optional[str] = None,
) -> None:
    """Runs actual autoscale process"""
    logging.debug("Begin autoscale")
    ctx_handler = register_result_handler(DefaultContextHandler("[initialization]"))
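    # results emitted during this run are captured by the handler, starting in the
    # "[initialization]" context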
    if output_columns:
        config["output_columns"] = output_columns

    if output_format:
        config["output_format"] = output_format

    autoscaler.autoscale_grid_engine(config, ctx_handler=ctx_handler)
    logging.debug("End autoscale")
Code Example #7
def test_excl_colocated_packing_bug() -> None:
    def n() -> NodeManager:
        return _node_mgr(_bindings())

    # assert [] == node_mgr.get_nodes()
    # result = node_mgr.allocate({"node.nodearray": "htc", "ncpus": 1, "exclusive": True}, slot_count=2_000_000, all_or_nothing=True)
    # assert not result, str(result)
    # assert [] == node_mgr.get_nodes()
    # assert [] == node_mgr.new_nodes
    register_result_handler(DefaultContextHandler("[ttt]"))
    result = n().allocate(
        {
            "node.nodearray": "htc",
            "ncpus": 1,
            "exclusive": True
        },
        slot_count=10,
        all_or_nothing=True,
    )
    assert len(result.nodes) == 3, len(result.nodes)

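    # 19 single-cpu exclusive slots on Standard_F4s (4 vCPUs) should pack onto
    # ceil(19/4) = 5 nodes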
    result = n().allocate(
        {
            "node.nodearray": "htc",
            "ncpus": 1,
            "exclusive": True,
            "node.vm_size": "Standard_F4s",
        },
        slot_count=19,
        all_or_nothing=True,
    )
    assert len(result.nodes) == 5, len(result.nodes)

    result = n().allocate(
        {
            "node.nodearray": "htc",
            "ncpus": 1,
            "exclusive": True,
            "node.vm_size": "Standard_F4s",
        },
        node_count=101,
        all_or_nothing=True,
    )
    assert not result, result
Code Example #8
def demand(
    config: Dict,
    jobs: Optional[str] = None,
    scheduler_nodes: Optional[str] = None,
    output_columns: Optional[List[str]] = None,
    output_format: Optional[str] = None,
) -> None:
    """Runs autoscale in dry run mode to see the demand for new nodes"""
    logging.debug("Begin demand")
    ctx = DefaultContextHandler("[demand-cli]")
    register_result_handler(ctx)
    ge_env = environment.from_qconf(config)
    ge_driver = autoscaler.new_driver(config, ge_env)
    config = ge_driver.preprocess_config(config)
    demand_calc = autoscaler.calculate_demand(config, ge_env, ge_driver, ctx)
    demand_result = demand_calc.finish()

    autoscaler.print_demand(config, demand_result, output_columns, output_format)
    logging.debug("End demand")
Code Example #9
File: autoscaler.py Project: Azure/cyclecloud-pbspro
def main() -> int:
    ctx_handler = register_result_handler(
        DefaultContextHandler("[initialization]"))

    parser = ArgumentParser()
    parser.add_argument("-c",
                        "--config",
                        help="Path to autoscale config.",
                        required=True)
    args = parser.parse_args()
    config_path = os.path.expanduser(args.config)

    if not os.path.exists(config_path):
        print("{} does not exist.".format(config_path), file=sys.stderr)
        return 1

    config = json_load(config_path)

    autoscale_pbspro(config, ctx_handler=ctx_handler)

    return _exit_code
Code Example #10
from gridengine.allocation_rules import FillUp, FixedProcesses, RoundRobin
from gridengine.complex import Complex
from gridengine.environment import GridEngineEnvironment
from gridengine.hostgroup import Hostgroup
from gridengine.parallel_environments import new_parallel_environment as new_pe
from gridengine.qbin import QBinImpl
from gridengine.queue import new_gequeue
from gridengine.scheduler import GridEngineScheduler
from gridengine_test import mock_driver

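# Complex(name, shortcut, type, relop, requestable, consumable, default, urgency);
# the argument order is assumed to mirror the qconf -sc complex attribute columns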
SLOTS_COMPLEX = Complex("slots", "s", "INT", "<=", True, True, "1", 1000)
MFREE_COMPLEX = Complex("m_mem_free", "mfree", "MEMORY", "<=", True, True, "0",
                        0)
EXCL_COMPLEX = Complex("exclusive", "excl", "BOOL", "EXCL", True, True, "0",
                       1000)
CONTEXT = DefaultContextHandler("[default]")


def setup_module() -> None:
    SchedulerNode.ignore_hostnames = True
    hpclogging.initialize_logging(mock_config(None))
    register_result_handler(CONTEXT)


def test_non_exclusive_htc_arrays() -> None:
    # ask for exactly the available count 10
    common_cluster_test(["-l nodearray=htc -t 1-40  -q htc.q sleep.sh"],
                        htc=10)

    # ask for more than 10, hit limit
    common_cluster_test(["-t 1-44  -q htc.q sleep.sh"], htc=10)
Code Example #11
def test_mock_bindings(bindings: MockClusterBinding) -> None:
    ctx = register_result_handler(DefaultContextHandler("[test]"))
    hpc, htc = _node_mgr(bindings).get_buckets()
    if hpc.nodearray != "hpc":
        hpc, htc = htc, hpc
    assert hpc.nodearray == "hpc"
    assert htc.nodearray == "htc"

    assert hpc.family_available_count == 10
    assert hpc.available_count == 10

    assert hpc.family_available_count == 10
    assert htc.family_available_count == 20

    hpc.decrement(1)
    assert hpc.family_available_count == 9
    assert htc.family_available_count == 20
    hpc.commit()
    assert hpc.family_available_count == 9
    assert htc.family_available_count == 18

    hpc.increment(1)
    hpc.commit()
    assert hpc.family_available_count == 10
    assert htc.family_available_count == 20

    ctx.set_context("[failure]")
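    # results recorded from this point on are grouped under the "[failure]" context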
    nm = _node_mgr(bindings)

    b = MockClusterBinding()
    b.add_nodearray("haspgs", {}, max_placement_group_size=20)
    b.add_bucket(
        "haspgs",
        "Standard_F4",
        100,
        100,
        placement_groups=["pg0", "pg1"],
    )
    # make sure we take the max_placement_group_size (20) into account
    # and that we have the non-pg and 2 pg buckets.
    nm = _node_mgr(b)
    no_pg, pg0, pg1 = sorted(nm.get_buckets(),
                             key=lambda b: b.placement_group or "")
    assert no_pg.available_count == 100
    assert pg0.available_count == 20
    assert pg1.available_count == 20

    # let's add a node to pg0 (100 - 1, 20 - 1, 20)
    b.add_node("haspgs-pg0-1", "haspgs", "Standard_F4", placement_group="pg0")

    nm = _node_mgr(b)
    no_pg, pg0, pg1 = sorted(nm.get_buckets(),
                             key=lambda b: b.placement_group or "")
    assert no_pg.available_count == 99
    assert pg0.available_count == 19
    assert pg1.available_count == 20

    # let's add a node to pg1 (100 - 2, 20 - 1, 20 - 1)
    b.add_node("haspgs-pg1-1", "haspgs", "Standard_F4", placement_group="pg1")

    nm = _node_mgr(b)
    no_pg, pg0, pg1 = sorted(nm.get_buckets(),
                             key=lambda b: b.placement_group or "")
    assert no_pg.available_count == 98
    assert pg0.available_count == 19
    assert pg1.available_count == 19

    # let's add 90 non-placement-group nodes so that our pg available counts are
    # floored by the overall available_count
    for i in range(90):
        b.add_node("haspgs-{}".format(i + 1), "haspgs", "Standard_F4")

    nm = _node_mgr(b)
    no_pg, pg0, pg1 = sorted(nm.get_buckets(),
                             key=lambda b: b.placement_group or "")
    assert no_pg.available_count == 8
    assert pg0.available_count == 8
    assert pg1.available_count == 8

    # lastly, add another node to a pg and see that all of the available counts go down
    b.add_node("haspgs-pg1-2", "haspgs", "Standard_F4", placement_group="pg1")
    nm = _node_mgr(b)
    no_pg, pg0, pg1 = sorted(nm.get_buckets(),
                             key=lambda b: b.placement_group or "")
    assert no_pg.available_count == 7
    assert pg0.available_count == 7
    assert pg1.available_count == 7
Code Example #12
def use_result_handler() -> None:
    # use request_id in context
    request_id = str(uuid4())
    handler = DefaultContextHandler("")
    handler.set_context("[{}]".format(request_id))
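
The snippet above shows only context switching; combined with the registration calls in the other examples, the common pattern is: register a DefaultContextHandler, switch contexts with set_context(), and read captured results back from by_context (as in Code Example #4). A minimal sketch of that flow, assuming DefaultContextHandler and register_result_handler are importable from hpc.autoscale.results (import path assumed):

from hpc.autoscale.results import DefaultContextHandler, register_result_handler  # path assumed

handler = register_result_handler(DefaultContextHandler("[initialization]"))
handler.set_context("[scale-up]")
# ... run node allocations / demand calculation here; any Result they emit is
# captured under the currently active context ...
for result in handler.by_context.get("[scale-up]", []):
    print(result.message)
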
Code Example #13
def autoscale_hpcpack(
    config: Dict[str, Any],
    ctx_handler: Optional[DefaultContextHandler] = None,
    hpcpack_rest_client: Optional[HpcRestClient] = None,
    dry_run: bool = False,
) -> None:

    if not hpcpack_rest_client:
        hpcpack_rest_client = new_rest_client(config)

    if ctx_handler:
        ctx_handler.set_context("[Sync-Status]")
    autoscale_config = config.get("autoscale") or {}
    # Load history info
    idle_timeout_seconds: int = autoscale_config.get("idle_timeout") or 600
    provisioning_timeout_seconds = autoscale_config.get("boot_timeout") or 1500
    statefile = autoscale_config.get(
        "statefile") or "C:\\cycle\\jetpack\\config\\autoscaler_state.txt"
    archivefile = autoscale_config.get(
        "archivefile") or "C:\\cycle\\jetpack\\config\\autoscaler_archive.txt"
    node_history = HpcNodeHistory(
        statefile=statefile,
        archivefile=archivefile,
        provisioning_timeout=provisioning_timeout_seconds,
        idle_timeout=idle_timeout_seconds)

    logging.info("Synchronizing the nodes between Cycle cloud and HPC Pack")

    # Initialize data of History info, cc nodes, HPC Pack nodes, HPC grow decisions
    # Get node list from Cycle Cloud
    def nodes_state_key(n: Node) -> Tuple[int, str, int]:
        try:
            state_pri = 1
            if n.state == 'Deallocated':
                state_pri = 2
            elif n.state == 'Stopping':
                state_pri = 3
            elif n.state == 'Terminating':
                state_pri = 4
            name, index = n.name.rsplit("-", 1)
            return (state_pri, name, int(index))
        except Exception:
            return (state_pri, n.name, 0)

    node_mgr: NodeManager = new_node_manager(config)
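    # order each bucket's nodes so that running nodes sort ahead of
    # Deallocated/Stopping/Terminating ones (see nodes_state_key above)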
    for b in node_mgr.get_buckets():
        b.nodes.sort(key=nodes_state_key)
    cc_nodes: List[Node] = node_mgr.get_nodes()
    cc_nodes_by_id = partition_single(cc_nodes,
                                      func=lambda n: n.delayed_node_id.node_id)
    # Get compute node list and grow decision from HPC Pack
    hpc_node_groups = hpcpack_rest_client.list_node_groups()
    grow_decisions = hpcpack_rest_client.get_grow_decision()
    logging.info("grow decision: {}".format(grow_decisions))
    hpc_cn_nodes: List[HpcNode] = hpcpack_rest_client.list_computenodes()
    hpc_cn_nodes = [n for n in hpc_cn_nodes if n.active]

    # This function will link node history items, cc nodes and hpc nodes
    node_history.synchronize(cc_nodes, hpc_cn_nodes)

    cc_nodearrays = set([b.nodearray for b in node_mgr.get_buckets()])
    logging.info("Current node arrays in cyclecloud: {}".format(cc_nodearrays))

    # Create HPC node groups for CC node arrays
    cc_map_hpc_groups = ["CycleCloudNodes"] + list(cc_nodearrays)
    for cc_grp in cc_map_hpc_groups:
        if ci_notin(cc_grp, hpc_node_groups):
            logging.info("Create HPC node group: {}".format(cc_grp))
            hpcpack_rest_client.add_node_group(cc_grp,
                                               "Cycle Cloud Node group")

    # Add HPC nodes into corresponding node groups
    add_cc_tag_nodes = [
        n.name for n in hpc_cn_nodes if n.shall_addcyclecloudtag
    ]
    if len(add_cc_tag_nodes) > 0:
        logging.info(
            "Adding HPC nodes to node group CycleCloudNodes: {}".format(
                add_cc_tag_nodes))
        hpcpack_rest_client.add_node_to_node_group("CycleCloudNodes",
                                                   add_cc_tag_nodes)
    for cc_grp in list(cc_nodearrays):
        add_array_tag_nodes = [
            n.name for n in hpc_cn_nodes
            if n.shall_addnodearraytag and ci_equals(n.cc_nodearray, cc_grp)
        ]
        if len(add_array_tag_nodes) > 0:
            logging.info("Adding HPC nodes to node group {}: {}".format(
                cc_grp, add_array_tag_nodes))
            hpcpack_rest_client.add_node_to_node_group(cc_grp,
                                                       add_array_tag_nodes)

    # Possible values for HPC NodeState (states marked with * shall not occur for CC nodes):
    #   Unknown, Provisioning, Offline, Starting, Online, Draining, Rejected(*), Removing, NotDeployed(*), Stopping(*)
    # Remove the following HPC Pack nodes:
    #   1. The corresponding CC node already removed
    #   2. The corresponding CC node is stopped and HPC node is not assigned a node template
    # Take offline the following HPC Pack nodes:
    #   1. The corresponding CC node is stopped or is going to stop
    hpc_nodes_to_remove = [
        n.name for n in hpc_cn_nodes
        if n.removed_cc_node or (n.stopped_cc_node and not n.template_assigned)
    ]
    hpc_nodes_to_take_offline = [
        n.name for n in hpc_cn_nodes
        if n.stopped_cc_node and ci_equals(n.state, "Online")
    ]
    if len(hpc_nodes_to_remove) > 0:
        logging.info("Removing the HPC nodes: {}".format(hpc_nodes_to_remove))
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.remove_nodes(hpc_nodes_to_remove)
    hpc_cn_nodes = [
        n for n in hpc_cn_nodes if not (n.stopped_cc_node or n.removed_cc_node)
    ]

    # Assign default node template for unapproved CC node
    hpc_nodes_to_assign_template = [
        n.name for n in hpc_cn_nodes
        if n.bound_cc_node and not n.template_assigned
    ]
    if len(hpc_nodes_to_assign_template) > 0:
        logging.info(
            "Assigning default node template for the HPC nodes: {}".format(
                hpc_nodes_to_assign_template))
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.assign_default_compute_node_template(
                hpc_nodes_to_assign_template)

    ### Start scale up checking:
    logging.info("Start scale up checking ...")
    if ctx_handler:
        ctx_handler.set_context("[scale-up]")

    hpc_nodes_with_active_cc = [
        n for n in hpc_cn_nodes if n.template_assigned and n.bound_cc_node
    ]
    # Exclude the already online healthy HPC nodes before calling node_mgr.allocate
    for hpc_node in hpc_nodes_with_active_cc:
        if hpc_node.ready_for_job:
            hpc_node.bound_cc_node.closed = True

    # Terminate the provisioning timeout CC nodes
    cc_node_to_terminate: List[Node] = []
    for cc_node in cc_nodes:
        if ci_equals(cc_node.target_state, 'Deallocated') or ci_equals(
                cc_node.target_state,
                'Terminated') or cc_node.create_time_remaining:
            continue
        nhi = node_history.find(cc_id=cc_node.delayed_node_id.node_id)
        if not nhi.hpc_id:
            cc_node.closed = True
            cc_node_to_terminate.append(cc_node)
        else:
            hpc_node = ci_find_one(hpc_nodes_with_active_cc, nhi.hpc_id,
                                   lambda n: n.id)
            if hpc_node and hpc_node.error:
                cc_node.closed = True
                cc_node_to_terminate.append(cc_node)

    # "ComputeNodes", "CycleCloudNodes", "AzureIaaSNodes" are all treated as default
    # grow_by_socket not supported yet, treat as grow_by_node
    defaultGroups = [
        "Default", "ComputeNodes", "AzureIaaSNodes", "CycleCloudNodes"
    ]
    default_cores_to_grow = default_nodes_to_grow = 0.0

    # If the current CC nodes in the node array cannot satisfy the grow decision, the group is hungry
    # For a hungry group, no idle check is required if the node health is OK
    group_hungry: Dict[str, bool] = {}
    nbrNewNodes: int = 0
    grow_groups = list(grow_decisions.keys())
    for grp in grow_groups:
        tmp = grow_decisions.pop(grp)
        if not (tmp.cores_to_grow + tmp.nodes_to_grow + tmp.sockets_to_grow):
            continue
        if ci_in(grp, defaultGroups):
            default_cores_to_grow += tmp.cores_to_grow
            default_nodes_to_grow += tmp.nodes_to_grow + tmp.sockets_to_grow
            continue
        if ci_notin(grp, cc_nodearrays):
            # tmp already holds the grow decision popped for this group above
            logging.warning(
                "No mapping node array for the grow requirement {}:{}".format(
                    grp, tmp))
            continue
        group_hungry[grp] = False
        array = ci_lookup(grp, cc_nodearrays)
        selector = {'ncpus': 1, 'node.nodearray': [array]}
        target_cores = math.ceil(tmp.cores_to_grow)
        target_nodes = math.ceil(tmp.nodes_to_grow + tmp.sockets_to_grow)
        if target_nodes:
            logging.info("Allocate: {}  Target Nodes: {}".format(
                selector, target_nodes))
            result = node_mgr.allocate(selector, node_count=target_nodes)
            logging.info(result)
            if not result or result.total_slots < target_nodes:
                group_hungry[grp] = True
        if target_cores:
            logging.info("Allocate: {}  Target Cores: {}".format(
                selector, target_cores))
            result = node_mgr.allocate(selector, slot_count=target_cores)
            logging.info(result)
            if not result or result.total_slots < target_cores:
                group_hungry[grp] = True
        if len(node_mgr.new_nodes) > nbrNewNodes:
            group_hungry[grp] = True
        nbrNewNodes = len(node_mgr.new_nodes)

    # We then check the grow decision for the default node groups:
    checkShrinkNeeded = True
    growForDefaultGroup = bool(default_nodes_to_grow or default_cores_to_grow)
    if growForDefaultGroup:
        selector = {'ncpus': 1}
        if default_nodes_to_grow:
            target_nodes = math.ceil(default_nodes_to_grow)
            logging.info("Allocate: {}  Target Nodes: {}".format(
                selector, target_nodes))
            result = node_mgr.allocate({'ncpus': 1}, node_count=target_nodes)
            if not result or result.total_slots < target_nodes:
                checkShrinkNeeded = False
        if default_cores_to_grow:
            target_cores = math.ceil(default_cores_to_grow)
            logging.info("Allocate: {}  Target Cores: {}".format(
                selector, target_cores))
            result = node_mgr.allocate({'ncpus': 1}, slot_count=target_cores)
            if not result or result.total_slots < target_cores:
                checkShrinkNeeded = False
        if len(node_mgr.new_nodes) > nbrNewNodes:
            checkShrinkNeeded = False
        nbrNewNodes = len(node_mgr.new_nodes)

    if nbrNewNodes > 0:
        logging.info("Need to Allocate {} nodes in total".format(nbrNewNodes))
        if dry_run:
            logging.info("Dry-run: skipping node bootup...")
        else:
            logging.info("Allocating {} nodes in total".format(
                len(node_mgr.new_nodes)))
            bootup_result: BootupResult = node_mgr.bootup()
            logging.info(bootup_result)
            if bootup_result and bootup_result.nodes:
                for cc_node in bootup_result.nodes:
                    nhi = node_history.find(
                        cc_id=cc_node.delayed_node_id.node_id)
                    if nhi is None:
                        nhi = node_history.insert(
                            NodeHistoryItem(cc_node.delayed_node_id.node_id))
                    else:
                        nhi.restart()
    else:
        logging.info("No need to allocate new nodes ...")

    ### Start the shrink checking
    if ctx_handler:
        ctx_handler.set_context("[scale-down]")

    cc_node_to_shutdown: List[Node] = []
    if not checkShrinkNeeded:
        logging.info("No shrink check at this round ...")
        if not dry_run:
            for nhi in node_history.items:
                if not nhi.stopped and nhi.hpc_id:
                    nhi.idle_from = None
    else:
        logging.info("Start scale down checking ...")
        # By default, we check idle for active CC nodes in HPC Pack with 'Offline', 'Starting', 'Online', 'Draining' state
        candidate_idle_check_nodes = [
            n for n in hpc_nodes_with_active_cc
            if (not n.bound_cc_node.keep_alive)
            and ci_in(n.state, ["Offline", "Starting", "Online", "Draining"])
        ]

        # We can exclude some nodes from idle checking:
        # 1. If HPC Pack ask for grow in default node group(s), all healthy ONLINE nodes are considered as busy
        # 2. If HPC Pack ask for grow in certain node group, all healthy ONLINE nodes in that node group are considered as busy
        # 3. If a node group is hungry (new CC required or grow request not satisfied), no idle check needed for all nodes in that node array
        if growForDefaultGroup:
            candidate_idle_check_nodes = [
                n for n in candidate_idle_check_nodes if not n.ready_for_job
            ]
        for grp, hungry in group_hungry.items():
            if hungry:
                candidate_idle_check_nodes = [
                    n for n in candidate_idle_check_nodes
                    if not ci_equals(grp, n.cc_nodearray)
                ]
            elif not growForDefaultGroup:
                candidate_idle_check_nodes = [
                    n for n in candidate_idle_check_nodes
                    if not (ci_equals(grp, n.cc_nodearray) and n.ready_for_job)
                ]

        curtime = datetime.utcnow()
        # Offline node must be idle
        idle_node_names = [
            n.name for n in candidate_idle_check_nodes
            if ci_equals(n.state, 'Offline')
        ]
        if len(candidate_idle_check_nodes) > len(idle_node_names):
            idle_nodes = hpcpack_rest_client.check_nodes_idle([
                n.name for n in candidate_idle_check_nodes
                if not ci_equals(n.state, 'Offline')
            ])
            if len(idle_nodes) > 0:
                idle_node_names.extend([n.node_name for n in idle_nodes])

        if len(idle_node_names) > 0:
            logging.info(
                "The following node is idle: {}".format(idle_node_names))
        else:
            logging.info("No idle node found in this round.")

        retention_days = autoscale_config.get("vm_retention_days") or 7
        for nhi in node_history.items:
            if nhi.stopped:
                if nhi.stop_time + timedelta(
                        days=retention_days) < datetime.utcnow():
                    cc_node = cc_nodes_by_id.get(nhi.cc_id)
                    if cc_node is not None:
                        cc_node_to_terminate.append(cc_node)
                continue
            if ci_in(nhi.hostname, idle_node_names):
                if nhi.idle_from is None:
                    nhi.idle_from = curtime
                elif nhi.idle_timeout(idle_timeout_seconds):
                    nhi.stop_time = curtime
                    cc_node = cc_nodes_by_id.get(nhi.cc_id)
                    if cc_node is not None:
                        cc_node_to_shutdown.append(cc_node)
            else:
                nhi.idle_from = None

    shrinking_cc_node_ids = [
        n.delayed_node_id.node_id for n in cc_node_to_terminate
    ]
    shrinking_cc_node_ids.extend(
        [n.delayed_node_id.node_id for n in cc_node_to_shutdown])
    hpc_nodes_to_bring_online = [
        n.name for n in hpc_nodes_with_active_cc
        if ci_equals(n.state, 'Offline') and not n.error
        and ci_notin(n.cc_node_id, shrinking_cc_node_ids)
    ]
    hpc_nodes_to_take_offline.extend([
        n.name for n in hpc_nodes_with_active_cc
        if ci_equals(n.state, 'Online')
        and ci_in(n.cc_node_id, shrinking_cc_node_ids)
    ])
    if len(hpc_nodes_to_bring_online) > 0:
        logging.info("Bringing the HPC nodes online: {}".format(
            hpc_nodes_to_bring_online))
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.bring_nodes_online(hpc_nodes_to_bring_online)

    if len(hpc_nodes_to_take_offline) > 0:
        logging.info("Taking the HPC nodes offline: {}".format(
            hpc_nodes_to_take_offline))
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.take_nodes_offline(hpc_nodes_to_take_offline)

    if len(cc_node_to_shutdown) > 0:
        logging.info("Shut down the following Cycle cloud node: {}".format(
            [cn.name for cn in cc_node_to_shutdown]))
        if dry_run:
            logging.info("Dry-run: skip ...")
        else:
            node_mgr.shutdown_nodes(cc_node_to_shutdown)

    if len(cc_node_to_terminate) > 0:
        logging.info(
            "Terminating the following provisioning-timeout Cycle cloud nodes: {}"
            .format([cn.name for cn in cc_node_to_terminate]))
        if dry_run:
            logging.info("Dry-run: skip ...")
        else:
            node_mgr.terminate_nodes(cc_node_to_terminate)

    if not dry_run:
        logging.info("Save node history: {}".format(node_history))
        node_history.save()
Code Example #14

def new_rest_client(config: Dict[str, Any]) -> HpcRestClient:

    hpcpack_config = config.get('hpcpack') or {}
    hpc_pem_file = hpcpack_config.get('pem')
    hn_hostname = hpcpack_config.get('hn_hostname')
    return HpcRestClient(config, pem=hpc_pem_file, hostname=hn_hostname)


if __name__ == "__main__":

    config_file = ""
    if len(sys.argv) > 1:
        config_file = sys.argv[1]

    dry_run = False
    if len(sys.argv) > 2:
        dry_run = ci_in(sys.argv[2], ['true', 'dryrun'])

    ctx_handler = register_result_handler(
        DefaultContextHandler("[initialization]"))
    config = load_config(config_file)
    logging.initialize_logging(config)
    logging.info(
        "------------------------------------------------------------------------"
    )
    if config["autoscale"]["start_enabled"]:
        autoscale_hpcpack(config, ctx_handler=ctx_handler, dry_run=dry_run)
    else:
        logging.info("Autoscaler is not enabled")
Code Example #15
File: cli.py Project: Azure/cyclecloud-pbspro
    def _setup_shell_locals(self, config: Dict) -> Dict:
        """
        Provides a read-only interactive shell. Type pbsprohelp()
        in the shell for more information.
        """
        ctx = DefaultContextHandler("[interactive-readonly]")

        pbs_driver = PBSProDriver(config)
        pbs_env = self._pbs_env(pbs_driver)

        def pbsprohelp() -> None:
            print(
                "config               - dict representing autoscale configuration."
            )
            print(
                "cli                  - object representing the CLI commands")
            print(
                "pbs_env              - object that contains data structures for queues, resources etc"
            )
            print(
                "queues               - dict of queue name -> PBSProQueue object"
            )

            print("jobs                 - dict of job id -> Autoscale Job")
            print(
                "scheduler_nodes      - dict of hostname -> node objects. These represent purely what\n"
                "                  the scheduler sees without additional booting nodes / information from CycleCloud"
            )
            print(
                "resource_definitions - dict of resource name -> PBSProResourceDefinition objects."
            )
            print(
                "default_scheduler    - PBSProScheduler object representing the default scheduler."
            )
            print(
                "pbs_driver           - PBSProDriver object that interacts directly with PBS and implements\n"
                "                    PBS specific behavior for scalelib.")
            print(
                "demand_calc          - ScaleLib DemandCalculator - pseudo-scheduler that determines what nodes are unnecessary"
            )
            print(
                "node_mgr             - ScaleLib NodeManager - interacts with CycleCloud for all node related\n"
                "                    activities - creation, deletion, limits, buckets etc."
            )
            print("pbsprohelp            - This help function")

        # try to make the key "15" instead of "15.hostname" if only
        # a single submitter was in use
        num_scheds = len(set([x.name.split(".", 1)[-1] for x in pbs_env.jobs]))
        if num_scheds == 1:
            jobs_dict = partition_single(pbs_env.jobs,
                                         lambda j: j.name.split(".")[0])
        else:
            jobs_dict = partition_single(pbs_env.jobs, lambda j: j.name)

        sched_nodes_dict = partition_single(pbs_env.scheduler_nodes,
                                            lambda n: n.hostname)

        pbs_env.queues = clilib.ShellDict(pbs_env.queues)

        for snode in pbs_env.scheduler_nodes:
            snode.shellify()

        pbs_env.resource_definitions = clilib.ShellDict(
            pbs_env.resource_definitions)

        demand_calc, _ = self._demand_calc(config, pbs_driver)

        shell_locals = {
            "config": config,
            "cli": self,
            "ctx": ctx,
            "pbs_env": pbs_env,
            "queues": pbs_env.queues,
            "jobs": clilib.ShellDict(jobs_dict, "j"),
            "scheduler_nodes": clilib.ShellDict(sched_nodes_dict),
            "resource_definitions": pbs_env.resource_definitions,
            "default_scheduler": pbs_env.default_scheduler,
            "pbs_driver": pbs_driver,
            "demand_calc": demand_calc,
            "node_mgr": demand_calc.node_mgr,
            "pbsprohelp": pbsprohelp,
        }

        return shell_locals
Code Example #16
def shell(config: Dict) -> None:
    """
        Provides a read-only interactive shell. Type gehelp()
        in the shell for more information.
    """
    ctx = DefaultContextHandler("[interactive-readonly]")

    ge_env = environment.from_qconf(config)
    ge_driver = autoscaler.new_driver(config, ge_env)
    config = ge_driver.preprocess_config(config)
    demand_calc = autoscaler.new_demand_calculator(config, ge_env, ge_driver, ctx)

    queues = ge_env.queues

    def gehelp() -> None:
        print("config       - dict representing autoscale configuration.")
        print("dbconn       - Read-only SQLite conn to node history")
        print("demand_calc  - DemandCalculator")
        print("ge_driver    - GEDriver object.")
        print("jobs         - List[Job] from ge_driver")
        print("node_mgr     - NodeManager")
        print("logging      - HPCLogging module")
        print("queues       - GridEngineQueue objects")

    shell_locals = {
        "config": config,
        "ctx": ctx,
        "ge_driver": ge_driver,
        "demand_calc": demand_calc,
        "node_mgr": demand_calc.node_mgr,
        "jobs": ge_env.jobs,
        "dbconn": demand_calc.node_history.conn,
        "gehelp": gehelp,
        "queues": queues,
        "ge_env": ge_env,
    }
    banner = "\nCycleCloud GE Autoscale Shell"
    interpreter = ReraiseAssertionInterpreter(locals=shell_locals)
    try:
        __import__("readline")
        # some magic - create a completer that is bound to the locals in this interpreter and not
        # the __main__ interpreter.
        interpreter.push("import readline, rlcompleter")
        interpreter.push('readline.parse_and_bind("tab: complete")')
        interpreter.push("_completer = rlcompleter.Completer(locals())")
        interpreter.push("def _complete_helper(text, state):")
        interpreter.push("    ret = _completer.complete(text, state)")
        interpreter.push('    ret = ret + ")" if ret and ret[-1] == "(" else ret')
        interpreter.push("    return ret")
        interpreter.push("")
        interpreter.push("readline.set_completer(_complete_helper)")
        for item in interpreter.history_lines:
            try:
                if '"""' in item:
                    interpreter.push(
                        "readline.add_history('''%s''')" % item.rstrip("\n")
                    )
                else:
                    interpreter.push(
                        'readline.add_history("""%s""")' % item.rstrip("\n")
                    )
            except Exception:
                pass

        interpreter.push("from hpc.autoscale.job.job import Job\n")
        interpreter.push("from hpc.autoscale import hpclogging as logging\n")

    except ImportError:
        banner += (
            "\nWARNING: `readline` is not installed, so autocomplete will not work."
        )

    interpreter.interact(banner=banner)
Code Example #17
def setup_function(function: Any) -> None:
    SchedulerNode.ignore_hostnames = True
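    # give each test its own result context, labeled with the test function's name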
    register_result_handler(
        DefaultContextHandler("[{}]".format(function.__name__)))