Example #1
    def _setup_shell_locals(self, config: Dict) -> Dict:
        ctx = DefaultContextHandler("[interactive-readonly]")

        driver = self._driver(config)
        dcalc, jobs_list = self._demand_calc(config, driver)
        nodes_list = dcalc.node_mgr.get_nodes()
        for node in nodes_list:
            node.shellify()
        nodes = partition_single(nodes_list, lambda n: n.name)
        nodes.update(
            partition_single([x for x in nodes_list if x.hostname],
                             lambda n: n.hostname))
        jobs: Dict[str, Any]
        try:
            jobs = partition_single(jobs_list, lambda j: j.name)
        except Exception:
            jobs = partition(jobs_list, lambda j: j.name)

        return {
            "config": config,
            "cli": self,
            "ctx": ctx,
            "demand_calc": dcalc,
            "node_mgr": dcalc.node_mgr,
            "jobs": ShellDict(jobs),
            "nodes": ShellDict(nodes),
        }
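
The try/except above falls back from partition_single to partition because job names are not guaranteed to be unique. A reduced illustration of the two return shapes, using hypothetical job dicts and assuming the helpers behave as exercised by the test in Example #3:

jobs_list = [{"name": "1"}, {"name": "1"}, {"name": "2"}]  # hypothetical jobs

partition(jobs_list, lambda j: j["name"])
# -> {"1": [{"name": "1"}, {"name": "1"}], "2": [{"name": "2"}]}

partition_single(jobs_list, lambda j: j["name"])
# -> raises RuntimeError because key "1" maps to more than one value
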
Example #2
    def update(self, nodes: typing.Iterable[Node]) -> None:
        if self.read_only:
            return

        now = self.now()

        rows = list(
            self._execute("""SELECT node_id, hostname, last_match_time,
                          create_time from nodes where delete_time IS NULL"""))

        rows_by_id = partition_single(rows, lambda r: r[0])
        nodes_with_ids = [n for n in nodes if n.delayed_node_id.node_id]

        nodes_by_id: typing.Dict[typing.Optional[NodeId],
                                 Node] = partition_single(
                                     nodes_with_ids,
                                     lambda n: n.delayed_node_id.node_id,
                                 )

        to_delete = set(rows_by_id.keys()) - set(nodes_by_id.keys())

        for node in nodes:
            node_id = node.delayed_node_id.node_id

            if node_id not in rows_by_id:
                # first time we see it, just put an entry
                rows_by_id[node_id] = tuple([node_id, node.hostname, now, now])

            if node.required:
                rec = list(rows_by_id[node_id])
                rec[-2] = now
                rows_by_id[node_id] = tuple(rec)

        if rows_by_id:
            exprs = []
            for row in rows_by_id.values():
                node_id, hostname, match_time, create_time = row
                expr = "('{}', '{}', {}, {}, NULL)".format(
                    node_id, hostname, match_time, create_time)
                exprs.append(expr)

            values_expr = ",".join(exprs)

            stmt = "INSERT OR REPLACE INTO nodes (node_id, hostname, last_match_time, create_time, delete_time) VALUES {}".format(
                values_expr)
            self._execute(stmt)

        if to_delete:
            to_delete_expr = " OR ".join(
                ['node_id="{}"'.format(node_id) for node_id in to_delete])
            now = datetime.datetime.utcnow().timestamp()
            self._execute("UPDATE nodes set delete_time={} where {}".format(
                now, to_delete_expr))

        self.retire_records(commit=True)
Example #3
def test_partition_single() -> None:
    objs = [{"id": 1}, {"id": 2}, {"id": 3}]
    by_id = partition_single(objs, lambda x: x["id"])
    assert set([1, 2, 3]) == set(by_id.keys())

    for k, v in by_id.items():
        assert by_id[k] == {"id": k}

    try:
        partition_single(objs, lambda x: None)
        assert False
    except RuntimeError as e:
        expected = "Could not partition list into single values - key=None values=[{'id': 1}, {'id': 2}, {'id': 3}]"
        assert str(e) == expected
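
For reference, here is a minimal sketch of what partition and partition_single presumably look like, consistent with the usages in this file and the test above: partition groups items into lists by key, while partition_single requires every key to map to exactly one item and otherwise raises a RuntimeError naming the offending key and values. This is an illustrative sketch, not the library's actual implementation.

from typing import Callable, Dict, Hashable, Iterable, List, TypeVar

T = TypeVar("T")
K = TypeVar("K", bound=Hashable)


def partition(items: Iterable[T], key: Callable[[T], K]) -> Dict[K, List[T]]:
    # group each item under its computed key
    ret: Dict[K, List[T]] = {}
    for item in items:
        ret.setdefault(key(item), []).append(item)
    return ret


def partition_single(items: Iterable[T], key: Callable[[T], K]) -> Dict[K, T]:
    # like partition, but each key must map to exactly one item
    grouped = partition(items, key)
    ret: Dict[K, T] = {}
    for k, values in grouped.items():
        if len(values) != 1:
            raise RuntimeError(
                "Could not partition list into single values - key={} values={}".format(
                    k, values))
        ret[k] = values[0]
    return ret
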
Example #4
    def find_booting(
        self,
        at_least: float = 1800,
        booting_nodes: Optional[List[Node]] = None,
    ) -> List[Node]:
        if not booting_nodes:
            booting_nodes = self.node_mgr.get_nodes()

        # filter out nodes that have converged.
        booting_nodes = [
            n for n in booting_nodes if n.target_state == "Started" and n.state
            not in ["Ready", "Started"] and n.delayed_node_id.node_id
        ]

        by_id = partition_single(booting_nodes,
                                 lambda n: n.delayed_node_id.node_id)

        ret = []
        for node_id, hostname, create_time in self.node_history.find_booting(
                for_at_least=at_least):
            if not node_id:
                continue

            if node_id in by_id:
                ret.append(by_id[node_id])

        return ret
Example #5
    def update_scheduler_nodes(self,
                               scheduler_nodes: List[SchedulerNode]) -> None:

        by_hostname: Dict[str, Node] = partition_single(
            self.__scheduler_nodes_queue,
            lambda n: n.hostname_or_uuid  # type: ignore
        )
        for new_snode in scheduler_nodes:
            if new_snode.hostname not in by_hostname:
                logging.debug(
                    "Found new node[hostname=%s] that does not exist in CycleCloud",
                    new_snode.hostname,
                )
                by_hostname[new_snode.hostname] = new_snode
                self.__scheduler_nodes_queue.push(new_snode)
                self.node_mgr.add_unmanaged_nodes([new_snode])

                # TODO inform bucket catalog?
            else:
                old_snode = by_hostname[new_snode.hostname_or_uuid]
                logging.fine(
                    "Found existing CycleCloud node[hostname=%s]",
                    new_snode.hostname,
                )
                old_snode.update(new_snode)
Example #6
def read_queues(
    autoscale_config: Dict,
    scheduler: "GridEngineScheduler",
    pes: Dict[str, "ParallelEnvironment"],
    hostgroups: List[Hostgroup],
    complexes: Dict[str, "Complex"],
    qbin: QBin,
) -> Dict[str, GridEngineQueue]:
    queues = {}
    qnames = qbin.qconf(["-sql"]).split()

    logging.debug("Found %d queues: %s", len(qnames), " ".join(qnames))
    autoscale_queues_config = autoscale_config.get("gridengine", {}).get("queues", {})

    unbound_hostgroups = partition_single(hostgroups, lambda h: h.name)

    for qname in qnames:

        lines = qbin.qconf(["-sq", qname]).splitlines()
        queue_config = parse_ge_config(lines)
        autoscale_enabled = autoscale_queues_config.get(queue_config["qname"], {}).get(
            "autoscale_enabled", True
        )
        expr = queue_config.get("complex_values", "NONE")
        complex_values = parse_queue_complex_values(expr, complexes, qname)
        queues[qname] = GridEngineQueue(
            queue_config,
            scheduler,
            pes,
            unbound_hostgroups,
            complex_values,
            autoscale_enabled,
        )

    return queues
Example #7
def read_schedulers(
    pbscmd: PBSCMD, resource_definitions: Dict[str, PBSProResourceDefinition]
) -> Dict[Hostname, PBSProScheduler]:
    parser = get_pbspro_parser()
    sched_dicts = pbscmd.qmgr_parsed("list", "sched")
    server_dicts = pbscmd.qmgr_parsed("list", "server")

    server_dicts_by_host = partition_single(server_dicts,
                                            lambda s: s["server_host"])

    ret: Dict[str, PBSProScheduler] = {}

    for sched_dict in sched_dicts:
        hostname = sched_dict["sched_host"]
        server_dict = server_dicts_by_host[hostname]

        for key, value in server_dict.items():
            if key not in sched_dict:
                sched_dict[key] = value

        # this is a scheduler, so it has no parent shared resources
        resource_state = parser.parse_resource_state(
            sched_dict, parent_shared_resources=None)
        scheduler = PBSProScheduler(sched_dict, resource_state)
        ret[scheduler.hostname] = scheduler

    return ret
Example #8
def common_cluster_test(qsub_commands: List[str],
                        pg_counts: Optional[Dict[str, int]] = None,
                        previous_dcalc: Optional[DemandCalculator] = None,
                        **array_counts: int) -> DemandCalculator:
    pg_counts = pg_counts or {}
    dcalc = common_cluster(qsub_commands, previous_dcalc)
    demand = dcalc.get_demand()
    demandprinter.print_demand(["name", "job_ids", "placement_group"], demand)

    # sanity check that we don't recreate the same node
    partition_single(demand.new_nodes, lambda n: n.name)
    by_array = partition(demand.new_nodes, lambda n: n.nodearray)
    by_pg = partition(demand.new_nodes, lambda n: n.placement_group)
    if set(by_pg.keys()) != set([None]):
        if set(by_pg.keys()) != set(pg_counts.keys()):
            assert False, "\n%s\n%s" % (
                [(x, len(y)) for x, y in by_pg.items()],
                pg_counts,
            )
        assert set(by_pg.keys()) == set(pg_counts.keys())
        assert not (bool(by_pg) ^ bool(pg_counts))

    if pg_counts:
        for pg_name, count in pg_counts.items():
            assert pg_name in by_pg
            assert (
                len(by_pg[pg_name]) == count
            ), "Expected pg {} to have {} nodes. Found {}. Full {}".format(
                pg_name,
                count,
                len(by_pg[pg_name]),
                [(x, len(y)) for x, y in by_pg.items()],
            )

        for pg_name in by_pg:
            assert pg_name in pg_counts

    for nodearray_name, count in array_counts.items():
        assert nodearray_name in by_array
        assert len(by_array[nodearray_name]) == count, [
            n.name for n in by_array[nodearray_name]
        ]

    for nodearray_name, node_list in by_array.items():
        assert nodearray_name in array_counts

    return dcalc
Example #9
def test_mock_bindings2() -> None:
    bindings = MockClusterBinding()
    bindings.add_nodearray("w", {}, location="westus2", max_count=8)
    bindings.add_bucket(
        "w",
        "Standard_E2_v3",
        max_count=80,
        available_count=8,
        family_consumed_core_count=72 * 2,
        family_quota_core_count=160,
        family_quota_count=80,
    )
    bindings.add_bucket(
        "w",
        "Standard_E4_v3",
        max_count=40,
        available_count=4,
        family_consumed_core_count=72 * 2,
        family_quota_core_count=160,
        family_quota_count=80,
    )
    bindings.add_bucket("w",
                        "Standard_D8s_v3",
                        max_count=80,
                        available_count=8)

    bindings.add_nodearray("e", {}, location="eastus")
    bindings.add_bucket("e", "Standard_E2_v3", max_count=20, available_count=4)
    node_mgr = _node_mgr(bindings)
    by_size = partition_single(node_mgr.get_buckets(), lambda b:
                               (b.location, b.vm_size))

    assert by_size[("westus2", "Standard_E2_v3")].available_count == 8
    assert by_size[("westus2",
                    "Standard_E2_v3")].limits.nodearray_available_count == 8
    assert by_size[("westus2", "Standard_E2_v3")].limits.family_max_count == 80
    assert by_size[("westus2", "Standard_E4_v3")].available_count == 4
    assert by_size[("westus2", "Standard_D8s_v3")].available_count == 8
    assert by_size[("eastus", "Standard_E2_v3")].available_count == 4

    result = node_mgr.allocate(
        {
            "node.vm_size": "Standard_E4_v3",
            "exclusive": True,
            "node.location": "westus2",
        },
        node_count=1,
    )

    assert result, "\n".join(result.reasons)

    assert by_size[("westus2",
                    "Standard_E2_v3")].limits.nodearray_available_count == 7
    assert by_size[("westus2", "Standard_E2_v3")].available_count == 6
    assert by_size[("westus2", "Standard_E4_v3")].available_count == 3
    assert by_size[("westus2", "Standard_D8s_v3")].available_count == 7
    assert by_size[("eastus", "Standard_E2_v3")].available_count == 4
Example #10
    def decorate(self, nodes: typing.List[Node]) -> None:
        if not nodes:
            nodes = []

        nodes = [n for n in nodes if n.exists]
        equalities = [
            " (node_id == '{}') ".format(n.delayed_node_id.node_id)
            for n in nodes
        ]

        if not equalities:
            return

        stmt = "select node_id, last_match_time, create_time, delete_time from nodes where {}".format(
            "{}".format(" OR ".join(equalities)))

        rows = self._execute(stmt)
        rows_by_id = partition_single(list(rows), lambda r: r[0])

        now = self.now()

        for node in nodes:
            node_id = node.delayed_node_id.node_id

            # should be impossible because we already filtered by exists
            if not node_id:
                logging.warning(
                    "Null node_id for %s. Leaving create/last_match/delete times as null.",
                    node,
                )
                continue

            if node_id in rows_by_id:

                node_id, last_match_time, create_time, delete_time = rows_by_id[
                    node_id]
                node.create_time_unix = create_time
                node.last_match_time_unix = last_match_time
                node.delete_time_unix = delete_time

                if self.create_timeout:
                    create_elapsed = max(0, now - create_time)
                    create_remaining = max(
                        0, self.create_timeout - create_elapsed)
                    node.create_time_remaining = create_remaining

                if self.last_match_timeout:
                    if node.keep_alive:
                        node.idle_time_remaining = -1
                    else:
                        match_elapsed = max(0, now - last_match_time)
                        match_remaining = max(
                            0, self.last_match_timeout - match_elapsed)
                        node.idle_time_remaining = match_remaining
Example #11
def _find_nodes(
    config: Dict, hostnames: List[str], node_names: List[str]
) -> Tuple[GridEngineDriver, DemandCalculator, List[Node]]:
    hostnames = hostnames or []
    node_names = node_names or []
    ge_env = environment.from_qconf(config)
    ge_driver = autoscaler.new_driver(config, ge_env)

    demand_calc = autoscaler.calculate_demand(config, ge_env, ge_driver)
    demand_result = demand_calc.finish()
    by_hostname = partition_single(
        demand_result.compute_nodes, lambda n: n.hostname_or_uuid.lower()
    )
    by_node_name = partition_single(
        demand_result.compute_nodes, lambda n: n.name.lower()
    )
    found_nodes = []
    for hostname in hostnames:
        if not hostname:
            error("Please specify a hostname")

        if hostname.lower() not in by_hostname:
            # it doesn't exist in CC, but we still want to delete it
            # from the cluster
            by_hostname[hostname.lower()] = SchedulerNode(hostname, {})

        found_nodes.append(by_hostname[hostname.lower()])

    for node_name in node_names:
        if not node_name:
            error("Please specify a node_name")

        if node_name.lower() not in by_node_name:
            error(
                "Could not find a CycleCloud node that has node_name %s."
                + " Run 'nodes' to see available nodes.",
                node_name,
            )
        found_nodes.append(by_node_name[node_name.lower()])

    return ge_driver, demand_calc, found_nodes
Example #12
    def __init__(
        self,
        scheduler: GridEngineScheduler,
        jobs: Optional[List[Job]] = None,
        nodes: Optional[List[Node]] = None,
        queues: Optional[Dict[str, GridEngineQueue]] = None,
        hostgroups: Optional[List[hglib.Hostgroup]] = None,
        pes: Optional[Dict[str, ParallelEnvironment]] = None,
        complexes: Optional[Dict[str, Complex]] = None,
        unfiltered_complexes: Optional[Dict[str, Complex]] = None,
        qbin: Optional[QBin] = None,
    ) -> None:
        self.__scheduler = scheduler
        self.__jobs: List[Job] = jobs or []
        self.__nodes: Dict[str, Node] = partition_single(
            nodes or [], lambda n: n.hostname_or_uuid.lower())
        self.__queues: Dict[str, GridEngineQueue] = queues or {}
        self.__pes: Dict[str, ParallelEnvironment] = pes or {}
        self.__complexes: Dict[str, Complex] = complexes or {}

        if unfiltered_complexes:
            self.__unfiltered_complexes = unfiltered_complexes
        else:
            self.__unfiltered_complexes = deepcopy(self.__complexes)

        self.__qbin = qbin or QBinImpl()

        self.__hostgroups = partition_single(hostgroups or [],
                                             lambda h: h.name)
        self.__host_memberships: Dict[str, List[str]] = {}

        for hostgroup in self.__hostgroups.values():
            for host in hostgroup.members:
                if host not in self.__host_memberships:
                    self.__host_memberships[host] = []
                self.__host_memberships[host].append(hostgroup.name)
Example #13
def validate_hg_intersections(ge_env: GridEngineEnvironment,
                              node_mgr: NodeManager,
                              warn_function: WarnFunction) -> bool:
    bucket_to_hgs: Dict[str, Set[str]] = {}
    for bucket in node_mgr.get_buckets():
        if str(bucket) not in bucket_to_hgs:
            bucket_to_hgs[str(bucket)] = set()

    by_str = partition_single(node_mgr.get_buckets(), str)

    for queue in ge_env.queues.values():
        if not queue.autoscale_enabled:
            continue

        for hostgroup in queue.bound_hostgroups.values():
            for bucket in node_mgr.get_buckets():
                is_satisfied = True
                for constraint in hostgroup.constraints:
                    result = constraint.satisfied_by_bucket(bucket)
                    if not result:
                        is_satisfied = False
                        break
                if is_satisfied:
                    bucket_to_hgs[str(bucket)].add(hostgroup.name)

    failure = False
    for bkey, matches in bucket_to_hgs.items():
        bucket = by_str[bkey]
        if not matches:
            warn_function(
                "%s is not matched by any hostgroup. This is not an error.",
                bucket,
            )
        elif len(matches) > 1:
            # seq_no will be used to determine ties
            if not ge_env.scheduler.sort_by_seqno:
                warn_function(
                    "%s is matched by more than one hostgroup %s. This is not an error.",
                    bucket,
                    ",".join(matches),
                )
    return failure
Example #14
    def update_scheduler_nodes(self,
                               scheduler_nodes: List[SchedulerNode]) -> None:

        by_hostname: Dict[str, Node] = partition_single(
            self.__scheduler_nodes_queue,
            lambda n: n.hostname_or_uuid  # type: ignore
        )

        for new_snode in scheduler_nodes:
            if new_snode.hostname not in by_hostname:
                by_hostname[new_snode.hostname] = new_snode
                self.__scheduler_nodes_queue.push(new_snode)
                self.node_mgr.add_unmanaged_nodes([new_snode])
                if new_snode.resources.get("ccnodeid"):
                    logging.warning(
                        "%s has ccnodeid defined, but no longer exists in CycleCloud",
                        new_snode,
                    )
                else:
                    logging.debug(
                        "Found new node[hostname=%s] that does not exist in CycleCloud",
                        new_snode.hostname,
                    )

                # TODO inform bucket catalog?
            elif new_snode.metadata.get("override_resources", True):

                old_snode = by_hostname[new_snode.hostname_or_uuid]
                logging.fine(
                    "Found existing CycleCloud node[hostname=%s]",
                    new_snode.hostname,
                )
                old_snode.update(new_snode)
            else:
                logging.fine(
                    "Found existing CycleCloud node[hostname=%s], but node.metadata.override_resources=false"
                    +
                    " so ignoring the reported resources and only copying metadata",
                    new_snode.hostname,
                )
                old_snode = by_hostname[new_snode.hostname_or_uuid]
                old_snode.metadata.update(new_snode.metadata)
Example #15
def test_complex_shortcut() -> None:
    # make sure that if a user mixes the shortcut and long form
    # we still handle that.
    dcalc = common_cluster_test(
        [
            "-l m_mem_free=2g -q htc.q sleep.sh",
            "-l m_mem_free=2g -q htc.q sleep.sh",
            "-l m_mem_free=2g -q htc.q sleep.sh",
            "-l mfree=2g      -q htc.q sleep.sh",
            "-l mfree=2g      -q htc.q sleep.sh",
            "-l mfree=2g      -q htc.q sleep.sh",
            # "-l m_mem_free=2g -q htc.q sleep.sh",
            # "-l m_mem_free=2g -q htc.q sleep.sh",
            # "-l m_mem_free=2g -q htc.q sleep.sh",
        ],
        htc=2,
    )
    eg = dcalc.node_mgr.example_node("westus", "Standard_F4")

    new_nodes = dcalc.get_demand().new_nodes
    by_name = partition_single(new_nodes, lambda n: n.name)

    def m(expr: str) -> Memory:
        return Memory.value_of(expr)

    assert eg.memory == m("8g")

    assert by_name["htc-1"].memory == m("8g")
    assert by_name["htc-1"].resources["m_mem_free"] == m("8g")
    assert by_name["htc-1"].resources["mfree"] == m("8g")
    assert by_name["htc-1"].available["m_mem_free"] == m("0g")
    assert by_name["htc-1"].available["mfree"] == m("0g")

    assert by_name["htc-2"].resources["m_mem_free"] == m("8g")
    assert by_name["htc-2"].resources["mfree"] == m("8g")
    assert by_name["htc-2"].available["m_mem_free"] == m("4g")
    assert by_name["htc-2"].available["mfree"] == m("4g")
Example #16
def create_vm_sizes(cache_path: Optional[str] = None) -> None:

    if cache_path and os.path.exists(cache_path):
        raw = open(cache_path).read()
    else:
        az_path = which("az")
        if az_path:
            raw = check_output([
                az_path,
                "vm",
                "list-skus",
                "--all",
            ]).decode()
        else:
            print("You need az cli installed.", file=sys.stderr)
            sys.exit(1)

        if cache_path:
            with open(cache_path, "w") as fw:
                fw.write(raw)

    print("Parsing list-skus...")
    try:
        skus = json.loads(raw)
    except Exception as e:

        toks = str(e).split()
        line_no = int(toks[toks.index("line") + 1])
        print("{}: '{}'".format(e, raw.splitlines()[line_no - 1]))
        return

    print("done")

    skus = [
        s for s in skus
        if s.get("family") and s.get("resourceType") == "virtualMachines"
    ]

    min_skus = []
    for sku in skus:
        min_sku = {}
        for key in ["name", "family", "size", "tier"]:
            min_sku[key] = sku[key]

        assert min_sku["family"], sku
        if not sku["locationInfo"]:
            print("WARNING: Missing location info. See", min_sku)
            continue
        min_sku["location"] = sku["locationInfo"][0]["location"]

        cap_list = sku["capabilities"]
        cap_dict = {}
        for entry in cap_list:
            value = entry["value"]
            if value.isdigit():
                value = int(value)
            elif value in ["True", "False"]:
                value = value == "True"
            elif "," in value:
                value = value.split(",")
            else:
                try:
                    value = float(value)
                except ValueError:
                    pass
            cap_dict[entry["name"]] = value
        min_sku["capabilities"] = cap_dict
        min_skus.append(min_sku)

    by_location = partition(min_skus, lambda s: s["location"])
    if os.path.exists("src/hpc/autoscale/node/vm_sizes.json"):
        print("reload")
        vm_sizes = json.load(open("src/hpc/autoscale/node/vm_sizes.json"))
    else:
        vm_sizes = {}
    locs = list(by_location.keys())
    a = sorted(by_location.items(),
               key=lambda x: locs.index(x[0]) if x[0] in locs else -1)
    for loc, loc_skus in a:
        vm_sizes[loc] = partition_single(loc_skus, lambda s: s["name"])

    if which("cycle_server"):
        cs_mts = json.loads(
            check_output([
                "cycle_server",
                "execute",
                "--format",
                "json",
                "select * from Azure.MachineType",
            ]).decode())
    else:
        print(
            "Warning: cycle_server found! Skipping validation",
            file=sys.stderr,
        )
        cs_mts = []

    for row in cs_mts:
        try:
            aux_info = AuxVMSizeInfo(vm_sizes[row["Location"]][row["Name"]])
            if aux_info.vcpu_count != row["CoreCount"]:

                print(
                    row,
                    aux_info.vcpu_count,
                    json.dumps(getattr(aux_info, "_AuxVMSizeInfo__record"),
                               indent=2),
                )
                if row["Location"] not in vm_sizes:
                    vm_sizes[row["Location"]] = {}

                rec = {
                    "name": row.pop("Name"),
                    "family": row.pop("Family"),
                    "size": row.pop("SKU"),
                    "tier": row.pop("Tier"),
                    "location": row.pop("Location"),
                    "linux_price": row.get("Linux", {}).get("Regular", 0.0),
                    "windows_price": row.get("Linux", {}).get("Regular", 0.0),
                    "capabilities": row,
                }
                vm_sizes[row["Location"]][row["Name"]] = rec
                sys.exit(1)
            continue
        except KeyError:
            pass

        if row["Location"] not in vm_sizes:
            vm_sizes[row["Location"]] = {}

    final_vm_sizes: Dict = {}
    for loc in sorted(vm_sizes):
        final_vm_sizes[loc] = loc_dict = {}
        for vm_size in sorted(vm_sizes[loc]):
            loc_dict[vm_size] = vm_sizes[loc][vm_size]

    with open("new_vm_sizes.json", "w") as fw:
        json.dump(final_vm_sizes, fw, indent=2)

    with open("../src/hpc/autoscale/node/vm_sizes.json") as fr:
        old_data = json.load(fr)

    missing_locations = set(old_data.keys()) - set(final_vm_sizes.keys())
    new_locations = set(final_vm_sizes.keys()) - set(old_data.keys())
    if missing_locations:
        print("WARNING: Missing locations:", ",".join(missing_locations))
    if new_locations:
        print("INFO: New locations:", ",".join(new_locations))

    all_locations = list(old_data.keys()) + list(new_locations)

    for location in all_locations:
        old_loc_data = old_data.get(location, {})
        new_loc_data = final_vm_sizes.get(location, {})
        missing_skus = set(old_loc_data.keys()) - set(new_loc_data.keys())
        new_skus = set(new_loc_data.keys()) - set(old_loc_data.keys())
        if missing_skus and location not in missing_locations:
            print(
                "WARNING: Missing SKUs for location",
                location,
                ":",
                ",".join(missing_skus),
            )
        if new_skus and location not in new_locations:
            print("INFO: New SKUs for location", location, ":",
                  ",".join(new_skus))

    print(
        "Copy ./new_vm_sizes.json to ./src/hpc/autoscale/node/vm_sizes.json to complete the creation."
    )
Example #17
def _parse_complexes(
    autoscale_config: Dict, complex_lines: List[str]
) -> Dict[str, "Complex"]:
    relevant_complexes = None
    if autoscale_config:
        relevant_complexes = autoscale_config.get("gridengine", {}).get(
            "relevant_complexes"
        )
        if relevant_complexes:
            # special handling of ccnodeid, since it is something we
            # create for the user
            relevant_complexes = relevant_complexes + ["ccnodeid"]

        if relevant_complexes:
            logging.info(
                "Restricting complexes for autoscaling to %s", relevant_complexes
            )

    complexes: List[Complex] = []
    headers = complex_lines[0].lower().replace("#", "").split()

    required = set(["name", "type", "consumable"])
    missing = required - set(headers)
    if missing:
        logging.error(
            "Could not parse complex file as it is missing expected columns: %s."
            + " Autoscale likely will not work.",
            list(missing),
        )
        return {}

    for n, line in enumerate(complex_lines[1:]):
        if line.startswith("#"):
            continue
        toks = line.split()
        if len(toks) != len(headers):
            logging.warning(
                "Could not parse complex at line {} - ignoring: '{}'".format(n, line)
            )
            continue
        c = dict(zip(headers, toks))
        try:

            if (
                relevant_complexes
                and c["name"] not in relevant_complexes
                and c["shortcut"] not in relevant_complexes
            ):
                logging.trace(
                    "Ignoring complex %s because it was not defined in gridengine.relevant_complexes",
                    c["name"],
                )
                continue

            complex = Complex(
                name=c["name"],
                shortcut=c.get("shortcut", c["name"]),
                complex_type=c["type"],
                relop=c.get("relop", "=="),
                requestable=c.get("requestable", "YES").lower() == "yes",
                consumable=c.get("consumable", "YES").lower() == "yes",
                default=c.get("default"),
                urgency=int(c.get("urgency", 0)),
            )

            complexes.append(complex)

        except Exception:
            logging.exception("Could not parse complex %s - %s", line, c)

    # TODO test RDH
    ret = partition_single(complexes, lambda x: x.name)
    shortcut_dict = partition_single(complexes, lambda x: x.shortcut)
    ret.update(shortcut_dict)
    return ret
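
Because the name-keyed dict is updated with the shortcut-keyed one, the same Complex instance ends up reachable under either identifier. A small usage sketch under that assumption, with a hypothetical two-line complex file (the m_mem_free/mfree pair mirrors the alias exercised in test_complex_shortcut above):

complex_lines = [
    "#name        shortcut  type    relop requestable consumable default urgency",
    "m_mem_free   mfree     MEMORY  <=    YES         YES        0       0",
]
complexes = _parse_complexes({}, complex_lines)
mem_complex = complexes["m_mem_free"]
assert complexes["mfree"] is mem_complex  # shortcut and full name resolve to the same object
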
Example #18
    def _setup_shell_locals(self, config: Dict) -> Dict:
        """
        Provides read only interactive shell. type pbsprohelp()
        in the shell for more information
        """
        ctx = DefaultContextHandler("[interactive-readonly]")

        pbs_driver = PBSProDriver(config)
        pbs_env = self._pbs_env(pbs_driver)

        def pbsprohelp() -> None:
            print(
                "config               - dict representing autoscale configuration."
            )
            print(
                "cli                  - object representing the CLI commands")
            print(
                "pbs_env              - object that contains data structures for queues, resources etc"
            )
            print(
                "queues               - dict of queue name -> PBSProQueue object"
            )

            print("jobs                 - dict of job id -> Autoscale Job")
            print(
                "scheduler_nodes      - dict of hostname -> node objects. These represent purely what"
                "                  the scheduler sees without additional booting nodes / information from CycleCloud"
            )
            print(
                "resource_definitions - dict of resource name -> PBSProResourceDefinition objects."
            )
            print(
                "default_scheduler    - PBSProScheduler object representing the default scheduler."
            )
            print(
                "pbs_driver           - PBSProDriver object that interacts directly with PBS and implements"
                "                    PBS specific behavior for scalelib.")
            print(
                "demand_calc          - ScaleLib DemandCalculator - pseudo-scheduler that determines the what nodes are unnecessary"
            )
            print(
                "node_mgr             - ScaleLib NodeManager - interacts with CycleCloud for all node related"
                +
                "                    activities - creation, deletion, limits, buckets etc."
            )
            print("pbsprohelp            - This help function")

        # try to make the key "15" instead of "15.hostname" if only
        # a single submitter was in use
        num_scheds = len(set([x.name.split(".", 1)[-1] for x in pbs_env.jobs]))
        if num_scheds == 1:
            jobs_dict = partition_single(pbs_env.jobs,
                                         lambda j: j.name.split(".")[0])
        else:
            jobs_dict = partition_single(pbs_env.jobs, lambda j: j.name)

        sched_nodes_dict = partition_single(pbs_env.scheduler_nodes,
                                            lambda n: n.hostname)

        pbs_env.queues = clilib.ShellDict(pbs_env.queues)

        for snode in pbs_env.scheduler_nodes:
            snode.shellify()

        pbs_env.resource_definitions = clilib.ShellDict(
            pbs_env.resource_definitions)

        demand_calc, _ = self._demand_calc(config, pbs_driver)

        shell_locals = {
            "config": config,
            "cli": self,
            "ctx": ctx,
            "pbs_env": pbs_env,
            "queues": pbs_env.queues,
            "jobs": clilib.ShellDict(jobs_dict, "j"),
            "scheduler_nodes": clilib.ShellDict(sched_nodes_dict),
            "resource_definitions": pbs_env.resource_definitions,
            "default_scheduler": pbs_env.default_scheduler,
            "pbs_driver": pbs_driver,
            "demand_calc": demand_calc,
            "node_mgr": demand_calc.node_mgr,
            "pbsprohelp": pbsprohelp,
        }

        return shell_locals
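
The job-key handling above only shortens keys such as "15.hostname" to "15" when every job name carries the same scheduler suffix; a reduced illustration with hypothetical job names:

names = ["15.sched1", "16.sched1", "17.sched1"]  # hypothetical PBS job names
num_scheds = len(set(n.split(".", 1)[-1] for n in names))
assert num_scheds == 1  # a single suffix, so keys can safely be shortened to "15", "16", ...
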
Example #19
def autoscale_grid_engine(
    config: Dict[str, Any],
    ge_env: Optional[GridEngineEnvironment] = None,
    ge_driver: Optional["GridEngineDriver"] = None,
    ctx_handler: Optional[DefaultContextHandler] = None,
    node_history: Optional[NodeHistory] = None,
    dry_run: bool = False,
) -> DemandResult:
    global _exit_code

    assert not config.get("read_only", False)
    if dry_run:
        logging.warning("Running gridengine autoscaler in dry run mode")
        # allow multiple instances
        config["lock_file"] = None
        # put in read only mode
        config["read_only"] = True

    if ge_env is None:
        ge_env = envlib.from_qconf(config)

    # interface to GE, generally by cli
    if ge_driver is None:
        # allow tests to pass in a mock
        ge_driver = new_driver(config, ge_env)

    ge_driver.initialize_environment()

    config = ge_driver.preprocess_config(config)

    logging.fine("Driver = %s", ge_driver)

    invalid_nodes = []

    # we need an instance without any scheduler nodes, so don't
    # pass in the existing nodes.
    tmp_node_mgr = new_node_manager(config)

    by_hostname = partition_single(tmp_node_mgr.get_nodes(),
                                   lambda n: n.hostname_or_uuid)

    for node in ge_env.nodes:
        # many combinations of a u and other states. However,
        # as long as a and u are in there it is down
        state = node.metadata.get("state", "")
        cc_node = by_hostname.get(node.hostname)
        ccnodeid = node.resources.get("ccnodeid")
        if cc_node:
            if not ccnodeid or ccnodeid == cc_node.delayed_node_id.node_id:
                if cc_node.state in ["Preparing", "Acquiring"]:
                    continue
        if "a" in state and "u" in state:
            invalid_nodes.append(node)

    # nodes in error state must also be deleted
    nodes_to_delete = ge_driver.clean_hosts(invalid_nodes)
    for node in nodes_to_delete:
        ge_env.delete_node(node)

    demand_calculator = calculate_demand(config, ge_env, ge_driver,
                                         ctx_handler, node_history)

    ge_driver.handle_failed_nodes(
        demand_calculator.node_mgr.get_failed_nodes())

    demand_result = demand_calculator.finish()

    if ctx_handler:
        ctx_handler.set_context("[joining]")

    # details here are that we pass in nodes that matter (matched) and the driver figures out
    # which ones are new and need to be added via qconf
    joined = ge_driver.handle_join_cluster(
        [x for x in demand_result.compute_nodes if x.exists])

    ge_driver.handle_post_join_cluster(joined)

    if ctx_handler:
        ctx_handler.set_context("[scaling]")

    # bootup all nodes. Optionally pass in a filtered list
    if demand_result.new_nodes:
        if not dry_run:
            demand_calculator.bootup()

    if not dry_run:
        demand_calculator.update_history()

    # we also tell the driver about nodes that are unmatched. It filters them out
    # and returns a list of ones we can delete.
    idle_timeout = int(config.get("idle_timeout", 300))
    boot_timeout = int(config.get("boot_timeout", 3600))
    logging.fine("Idle timeout is %s", idle_timeout)

    unmatched_for_5_mins = demand_calculator.find_unmatched_for(
        at_least=idle_timeout)
    timed_out_booting = demand_calculator.find_booting(at_least=boot_timeout)

    # I don't care about nodes that have keep_alive=true
    timed_out_booting = [n for n in timed_out_booting if not n.keep_alive]

    timed_out_to_deleted = []
    unmatched_nodes_to_delete = []

    if timed_out_booting:
        logging.info("The following nodes have timed out while booting: %s",
                     timed_out_booting)
        timed_out_to_deleted = ge_driver.handle_boot_timeout(
            timed_out_booting) or []

    if unmatched_for_5_mins:
        node_expr = ", ".join([str(x) for x in unmatched_for_5_mins])
        logging.info("Unmatched for at least %s seconds: %s", idle_timeout,
                     node_expr)
        unmatched_nodes_to_delete = (
            ge_driver.handle_draining(unmatched_for_5_mins) or [])

    nodes_to_delete = []
    for node in timed_out_to_deleted + unmatched_nodes_to_delete:
        if node.assignments:
            logging.warning(
                "%s has jobs assigned to it so we will take no action.", node)
            continue
        nodes_to_delete.append(node)

    if nodes_to_delete:
        try:
            logging.info("Deleting %s", [str(n) for n in nodes_to_delete])
            delete_result = demand_calculator.delete(nodes_to_delete)

            if delete_result:
                # in case it has anything to do after a node is deleted (usually just remove it from the cluster)
                ge_driver.handle_post_delete(delete_result.nodes)
        except Exception as e:
            _exit_code = 1
            logging.warning(
                "Deletion failed, will retry on next iteration: %s", e)
            logging.exception(str(e))

    print_demand(config, demand_result, log=not dry_run)

    return demand_result
Example #20
def autoscale_hpcpack(
    config: Dict[str, Any],
    ctx_handler: Optional[DefaultContextHandler] = None,
    hpcpack_rest_client: Optional[HpcRestClient] = None,
    dry_run: bool = False,
) -> None:

    if not hpcpack_rest_client:
        hpcpack_rest_client = new_rest_client(config)

    if ctx_handler:
        ctx_handler.set_context("[Sync-Status]")
    autoscale_config = config.get("autoscale") or {}
    # Load history info
    idle_timeout_seconds: int = autoscale_config.get("idle_timeout") or 600
    provisioning_timeout_seconds = autoscale_config.get("boot_timeout") or 1500
    statefile = autoscale_config.get(
        "statefile") or "C:\\cycle\\jetpack\\config\\autoscaler_state.txt"
    archivefile = autoscale_config.get(
        "archivefile") or "C:\\cycle\\jetpack\\config\\autoscaler_archive.txt"
    node_history = HpcNodeHistory(
        statefile=statefile,
        archivefile=archivefile,
        provisioning_timeout=provisioning_timeout_seconds,
        idle_timeout=idle_timeout_seconds)

    logging.info("Synchronizing the nodes between Cycle cloud and HPC Pack")

    # Initialize data of History info, cc nodes, HPC Pack nodes, HPC grow decisions
    # Get node list from Cycle Cloud
    def nodes_state_key(n: Node) -> Tuple[int, str, int]:
        try:
            state_pri = 1
            if n.state == 'Deallocated':
                state_pri = 2
            elif n.state == 'Stopping':
                state_pri = 3
            elif n.state == 'Terminating':
                state_pri = 4
            name, index = n.name.rsplit("-", 1)
            return (state_pri, name, int(index))
        except Exception:
            return (state_pri, n.name, 0)

    node_mgr: NodeManager = new_node_manager(config)
    for b in node_mgr.get_buckets():
        b.nodes.sort(key=nodes_state_key)
    cc_nodes: List[Node] = node_mgr.get_nodes()
    cc_nodes_by_id = partition_single(cc_nodes,
                                      func=lambda n: n.delayed_node_id.node_id)
    # Get compute node list and grow decision from HPC Pack
    hpc_node_groups = hpcpack_rest_client.list_node_groups()
    grow_decisions = hpcpack_rest_client.get_grow_decision()
    logging.info("grow decision: {}".format(grow_decisions))
    hpc_cn_nodes: List[HpcNode] = hpcpack_rest_client.list_computenodes()
    hpc_cn_nodes = [n for n in hpc_cn_nodes if n.active]

    # This function will link node history items, cc nodes and hpc nodes
    node_history.synchronize(cc_nodes, hpc_cn_nodes)

    cc_nodearrays = set([b.nodearray for b in node_mgr.get_buckets()])
    logging.info("Current node arrays in cyclecloud: {}".format(cc_nodearrays))

    # Create HPC node groups for CC node arrays
    cc_map_hpc_groups = ["CycleCloudNodes"] + list(cc_nodearrays)
    for cc_grp in cc_map_hpc_groups:
        if ci_notin(cc_grp, hpc_node_groups):
            logging.info("Create HPC node group: {}".format(cc_grp))
            hpcpack_rest_client.add_node_group(cc_grp,
                                               "Cycle Cloud Node group")

    # Add HPC nodes into corresponding node groups
    add_cc_tag_nodes = [
        n.name for n in hpc_cn_nodes if n.shall_addcyclecloudtag
    ]
    if len(add_cc_tag_nodes) > 0:
        logging.info(
            "Adding HPC nodes to node group CycleCloudNodes: {}".format(
                add_cc_tag_nodes))
        hpcpack_rest_client.add_node_to_node_group("CycleCloudNodes",
                                                   add_cc_tag_nodes)
    for cc_grp in list(cc_nodearrays):
        add_array_tag_nodes = [
            n.name for n in hpc_cn_nodes
            if n.shall_addnodearraytag and ci_equals(n.cc_nodearray, cc_grp)
        ]
        if len(add_array_tag_nodes) > 0:
            logging.info("Adding HPC nodes to node group {}: {}".format(
                cc_grp, add_array_tag_nodes))
            hpcpack_rest_client.add_node_to_node_group(cc_grp,
                                                       add_array_tag_nodes)

    # Possible values for HPC NodeState (states marked with * shall not occur for CC nodes):
    #   Unknown, Provisioning, Offline, Starting, Online, Draining, Rejected(*), Removing, NotDeployed(*), Stopping(*)
    # Remove the following HPC Pack nodes:
    #   1. The corresponding CC node already removed
    #   2. The corresponding CC node is stopped and HPC node is not assigned a node template
    # Take offline the following HPC Pack nodes:
    #   1. The corresponding CC node is stopped or is going to stop
    hpc_nodes_to_remove = [
        n.name for n in hpc_cn_nodes
        if n.removed_cc_node or (n.stopped_cc_node and not n.template_assigned)
    ]
    hpc_nodes_to_take_offline = [
        n.name for n in hpc_cn_nodes
        if n.stopped_cc_node and ci_equals(n.state, "Online")
    ]
    if len(hpc_nodes_to_remove) > 0:
        logging.info("Removing the HPC nodes: {}".format(hpc_nodes_to_remove))
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.remove_nodes(hpc_nodes_to_remove)
    hpc_cn_nodes = [
        n for n in hpc_cn_nodes if not (n.stopped_cc_node or n.removed_cc_node)
    ]

    # Assign default node template for unapproved CC node
    hpc_nodes_to_assign_template = [
        n.name for n in hpc_cn_nodes
        if n.bound_cc_node and not n.template_assigned
    ]
    if len(hpc_nodes_to_assign_template) > 0:
        logging.info(
            "Assigning default node template for the HPC nodes: {}".format(
                hpc_nodes_to_assign_template))
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.assign_default_compute_node_template(
                hpc_nodes_to_assign_template)

    ### Start scale up checking:
    logging.info("Start scale up checking ...")
    if ctx_handler:
        ctx_handler.set_context("[scale-up]")

    hpc_nodes_with_active_cc = [
        n for n in hpc_cn_nodes if n.template_assigned and n.bound_cc_node
    ]
    # Exclude the already online healthy HPC nodes before calling node_mgr.allocate
    for hpc_node in hpc_nodes_with_active_cc:
        if hpc_node.ready_for_job:
            hpc_node.bound_cc_node.closed = True

    # Terminate the provisioning timeout CC nodes
    cc_node_to_terminate: List[Node] = []
    for cc_node in cc_nodes:
        if ci_equals(cc_node.target_state, 'Deallocated') or ci_equals(
                cc_node.target_state,
                'Terminated') or cc_node.create_time_remaining:
            continue
        nhi = node_history.find(cc_id=cc_node.delayed_node_id.node_id)
        if not nhi.hpc_id:
            cc_node.closed = True
            cc_node_to_terminate.append(cc_node)
        else:
            hpc_node = ci_find_one(hpc_nodes_with_active_cc, nhi.hpc_id,
                                   lambda n: n.id)
            if hpc_node and hpc_node.error:
                cc_node.closed = True
                cc_node_to_terminate.append(cc_node)

    # "ComputeNodes", "CycleCloudNodes", "AzureIaaSNodes" are all treated as default
    # grow_by_socket not supported yet, treat as grow_by_node
    defaultGroups = [
        "Default", "ComputeNodes", "AzureIaaSNodes", "CycleCloudNodes"
    ]
    default_cores_to_grow = default_nodes_to_grow = 0.0

    # If the current CC nodes in the node array cannot satisfy the grow decision, the group is hungry
    # For a hungry group, no idle check is required if the node health is OK
    group_hungry: Dict[str, bool] = {}
    nbrNewNodes: int = 0
    grow_groups = list(grow_decisions.keys())
    for grp in grow_groups:
        tmp = grow_decisions.pop(grp)
        if not (tmp.cores_to_grow + tmp.nodes_to_grow + tmp.sockets_to_grow):
            continue
        if ci_in(grp, defaultGroups):
            default_cores_to_grow += tmp.cores_to_grow
            default_nodes_to_grow += tmp.nodes_to_grow + tmp.sockets_to_grow
            continue
        if ci_notin(grp, cc_nodearrays):
            logging.warning(
                "No mapping node array for the grow requirement {}:{}".format(
                    grp, grow_decisions[grp]))
            grow_decisions.pop(grp)
            continue
        group_hungry[grp] = False
        array = ci_lookup(grp, cc_nodearrays)
        selector = {'ncpus': 1, 'node.nodearray': [array]}
        target_cores = math.ceil(tmp.cores_to_grow)
        target_nodes = math.ceil(tmp.nodes_to_grow + tmp.sockets_to_grow)
        if target_nodes:
            logging.info("Allocate: {}  Target Nodes: {}".format(
                selector, target_nodes))
            result = node_mgr.allocate(selector, node_count=target_nodes)
            logging.info(result)
            if not result or result.total_slots < target_nodes:
                group_hungry[grp] = True
        if target_cores:
            logging.info("Allocate: {}  Target Cores: {}".format(
                selector, target_cores))
            result = node_mgr.allocate(selector, slot_count=target_cores)
            logging.info(result)
            if not result or result.total_slots < target_cores:
                group_hungry[grp] = True
        if len(node_mgr.new_nodes) > nbrNewNodes:
            group_hungry[grp] = True
        nbrNewNodes = len(node_mgr.new_nodes)

    # We then check the grow decision for the default node groups:
    checkShrinkNeeded = True
    growForDefaultGroup = True if default_nodes_to_grow or default_cores_to_grow else False
    if growForDefaultGroup:
        selector = {'ncpus': 1}
        if default_nodes_to_grow:
            target_nodes = math.ceil(default_nodes_to_grow)
            logging.info("Allocate: {}  Target Nodes: {}".format(
                selector, target_nodes))
            result = node_mgr.allocate({'ncpus': 1}, node_count=target_nodes)
            if not result or result.total_slots < target_nodes:
                checkShrinkNeeded = False
        if default_cores_to_grow:
            target_cores = math.ceil(default_cores_to_grow)
            logging.info("Allocate: {}  Target Cores: {}".format(
                selector, target_cores))
            result = node_mgr.allocate({'ncpus': 1}, slot_count=target_cores)
            if not result or result.total_slots < target_cores:
                checkShrinkNeeded = False
        if len(node_mgr.new_nodes) > nbrNewNodes:
            checkShrinkNeeded = False
        nbrNewNodes = len(node_mgr.new_nodes)

    if nbrNewNodes > 0:
        logging.info("Need to Allocate {} nodes in total".format(nbrNewNodes))
        if dry_run:
            logging.info("Dry-run: skipping node bootup...")
        else:
            logging.info("Allocating {} nodes in total".format(
                len(node_mgr.new_nodes)))
            bootup_result: BootupResult = node_mgr.bootup()
            logging.info(bootup_result)
            if bootup_result and bootup_result.nodes:
                for cc_node in bootup_result.nodes:
                    nhi = node_history.find(
                        cc_id=cc_node.delayed_node_id.node_id)
                    if nhi is None:
                        nhi = node_history.insert(
                            NodeHistoryItem(cc_node.delayed_node_id.node_id))
                    else:
                        nhi.restart()
    else:
        logging.info("No need to allocate new nodes ...")

    ### Start the shrink checking
    if ctx_handler:
        ctx_handler.set_context("[scale-down]")

    cc_node_to_shutdown: List[Node] = []
    if not checkShrinkNeeded:
        logging.info("No shrink check at this round ...")
        if not dry_run:
            for nhi in node_history.items:
                if not nhi.stopped and nhi.hpc_id:
                    nhi.idle_from = None
    else:
        logging.info("Start scale down checking ...")
        # By default, we check idle for active CC nodes in HPC Pack with 'Offline', 'Starting', 'Online', 'Draining' state
        candidate_idle_check_nodes = [
            n for n in hpc_nodes_with_active_cc
            if (not n.bound_cc_node.keep_alive)
            and ci_in(n.state, ["Offline", "Starting", "Online", "Draining"])
        ]

        # We can exclude some nodes from idle checking:
        # 1. If HPC Pack ask for grow in default node group(s), all healthy ONLINE nodes are considered as busy
        # 2. If HPC Pack ask for grow in certain node group, all healthy ONLINE nodes in that node group are considered as busy
        # 3. If a node group is hungry (new CC required or grow request not satisfied), no idle check needed for all nodes in that node array
        if growForDefaultGroup:
            candidate_idle_check_nodes = [
                n for n in candidate_idle_check_nodes if not n.ready_for_job
            ]
        for grp, hungry in group_hungry.items():
            if hungry:
                candidate_idle_check_nodes = [
                    n for n in candidate_idle_check_nodes
                    if not ci_equals(grp, n.cc_nodearray)
                ]
            elif not growForDefaultGroup:
                candidate_idle_check_nodes = [
                    n for n in candidate_idle_check_nodes
                    if not (ci_equals(grp, n.cc_nodearray) and n.ready_for_job)
                ]

        curtime = datetime.utcnow()
        # Offline node must be idle
        idle_node_names = [
            n.name for n in candidate_idle_check_nodes
            if ci_equals(n.state, 'Offline')
        ]
        if len(candidate_idle_check_nodes) > len(idle_node_names):
            idle_nodes = hpcpack_rest_client.check_nodes_idle([
                n.name for n in candidate_idle_check_nodes
                if not ci_equals(n.state, 'Offline')
            ])
            if len(idle_nodes) > 0:
                idle_node_names.extend([n.node_name for n in idle_nodes])

        if len(idle_node_names) > 0:
            logging.info(
                "The following node is idle: {}".format(idle_node_names))
        else:
            logging.info("No idle node found in this round.")

        retention_days = autoscale_config.get("vm_retention_days") or 7
        for nhi in node_history.items:
            if nhi.stopped:
                if nhi.stop_time + timedelta(
                        days=retention_days) < datetime.utcnow():
                    cc_node = cc_nodes_by_id.get(nhi.cc_id)
                    if cc_node is not None:
                        cc_node_to_terminate.append(cc_node)
                continue
            if ci_in(nhi.hostname, idle_node_names):
                if nhi.idle_from is None:
                    nhi.idle_from = curtime
                elif nhi.idle_timeout(idle_timeout_seconds):
                    nhi.stop_time = curtime
                    cc_node = cc_nodes_by_id.get(nhi.cc_id)
                    if cc_node is not None:
                        cc_node_to_shutdown.append(cc_node)
            else:
                nhi.idle_from = None

    shrinking_cc_node_ids = [
        n.delayed_node_id.node_id for n in cc_node_to_terminate
    ]
    shrinking_cc_node_ids.extend(
        [n.delayed_node_id.node_id for n in cc_node_to_shutdown])
    hpc_nodes_to_bring_online = [
        n.name for n in hpc_nodes_with_active_cc
        if ci_equals(n.state, 'Offline') and not n.error
        and ci_notin(n.cc_node_id, shrinking_cc_node_ids)
    ]
    hpc_nodes_to_take_offline.extend([
        n.name for n in hpc_nodes_with_active_cc
        if ci_equals(n.state, 'Online')
        and ci_in(n.cc_node_id, shrinking_cc_node_ids)
    ])
    if len(hpc_nodes_to_bring_online) > 0:
        logging.info("Bringing the HPC nodes online: {}".format(
            hpc_nodes_to_bring_online))
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.bring_nodes_online(hpc_nodes_to_bring_online)

    if len(hpc_nodes_to_take_offline) > 0:
        logging.info("Taking the HPC nodes offline: {}".format(
            hpc_nodes_to_take_offline))
        if dry_run:
            logging.info("Dry-run: no real action")
        else:
            hpcpack_rest_client.take_nodes_offline(hpc_nodes_to_take_offline)

    if len(cc_node_to_shutdown) > 0:
        logging.info("Shut down the following Cycle cloud node: {}".format(
            [cn.name for cn in cc_node_to_shutdown]))
        if dry_run:
            logging.info("Dry-run: skip ...")
        else:
            node_mgr.shutdown_nodes(cc_node_to_shutdown)

    if len(cc_node_to_terminate) > 0:
        logging.info(
            "Terminating the following provisioning-timeout Cycle cloud nodes: {}"
            .format([cn.name for cn in cc_node_to_terminate]))
        if dry_run:
            logging.info("Dry-run: skip ...")
        else:
            node_mgr.terminate_nodes(cc_node_to_terminate)

    if not dry_run:
        logging.info("Save node history: {}".format(node_history))
        node_history.save()