コード例 #1
0
def refresh_yarn_nodes(args):
    """Repeat the YARN decommission call until no node is reported unready.

    Every pass calls ``decommission_nodes()`` on the YARN operator, reads
    the per-node ``state`` from the resource manager, fetches the node list
    from the Kubernetes operator and hands both to ``get_unready_nodes``.
    The function returns once that helper yields an empty result; there is
    no timeout, so it polls every 30 seconds for as long as needed.

    Args:
        args: Parsed arguments providing ``api_server_ip`` and
            ``resource_manager_ip``.
    """
    kube = KubernetesOperator(args.api_server_ip)
    yarn = YarnOperator(args.resource_manager_ip)
    while True:
        yarn.decommission_nodes()
        # Map node name -> state as currently reported by YARN.
        yarn_states = {
            name: info["state"]
            for name, info in yarn.get_nodes_info().items()
        }
        pending = get_unready_nodes(kube.get_nodes(), yarn_states)
        if not pending:
            break
        summary = ','.join(
            node_name + " in " + status
            for node_name, status in pending.items())
        logger.info("Unready nodes: {}. Waiting...".format(summary))
        # Give the cluster time to converge before the next pass.
        time.sleep(30)
    logger.info("Successfully refresh nodes.")
コード例 #2
0
def get_dedicate_vc(args):
    """Print the nodes and resources of every dedicated virtual cluster.

    Queues are selected with ``is_dedicated_vc``.  For each selected queue
    the resource total comes from the entry of the same name returned by
    ``get_resource_by_label`` (zero if there is none), and the node list
    holds every node whose ``nodeLabel`` equals the queue name.

    Args:
        args: Parsed arguments providing ``resource_manager_ip``.

    Returns:
        None.  The report is written to stdout; when no dedicated queue
        exists only an info message is logged.
    """
    yarn = YarnOperator(args.resource_manager_ip)
    queues_info = yarn.get_queues_info()
    nodes_info = yarn.get_nodes_info()

    # Start every dedicated queue with zero resources and no nodes.
    dedicated = {}
    for name, info in queues_info.items():
        if is_dedicated_vc(name, info):
            dedicated[name] = {
                "resource": Resource(cpus=0, memory=0, gpus=0),
                "nodes": [],
            }
    if not dedicated:
        logger.info("No dedicated vc found")
        return

    # A partition named like a dedicated queue supplies that queue's total.
    for partition, partition_info in get_resource_by_label(nodes_info).items():
        if partition in dedicated:
            dedicated[partition]["resource"] = partition_info["resource"]

    for node, info in nodes_info.items():
        label = info["nodeLabel"]
        if label in dedicated:
            dedicated[label]["nodes"].append(node)

    for name, attrs in dedicated.items():
        resource = attrs["resource"]
        print(name + ":")
        print("\tNodes: " + ",".join(attrs["nodes"]))
        print("\tResource: <CPUs:{}, Memory:{}MB, GPUs:{}>".format(
            resource.cpus, resource.memory, resource.gpus))
コード例 #3
0
def remove_dedicate_vc(args):
    """Unlabel nodes of a dedicated virtual cluster and optionally delete it.

    Nodes carrying the label ``args.vc_name`` are relabeled to the empty
    label and their resources are added back to the default queue before
    the queue capacities are recomputed and pushed to YARN.  When
    ``args.nodes`` is None every node is considered and, afterwards, the
    queue and the cluster label named ``args.vc_name`` are removed as well.

    Args:
        args: Parsed arguments providing ``resource_manager_ip``,
            ``vc_name`` and ``nodes`` (a collection of node names or None).

    Raises:
        KeyError: If a name in ``args.nodes`` is absent from the node
            information returned by YARN.
    """
    yarn = YarnOperator(args.resource_manager_ip)
    vc_name = args.vc_name
    requested_nodes = args.nodes
    # No explicit node list means the whole vc is torn down at the end.
    drop_whole_vc = requested_nodes is None

    logger.info("Unlabeling node...")
    nodes_info = yarn.get_nodes_info()
    queues_info = yarn.get_queues_info()
    candidates = set(nodes_info) if drop_whole_vc else requested_nodes
    labeled_nodes = [
        name for name in candidates
        if nodes_info[name]["nodeLabel"] == vc_name
    ]
    if labeled_nodes:
        default_queue = queues_info["default"]
        # Reset the default queue's ceiling to 100% when it is already 100
        # or sits above the queue's own capacity.
        if (default_queue["maxCapacity"] == 100
                or default_queue["maxCapacity"] > default_queue["capacity"]):
            default_queue["maxCapacity"] = 100.0

        # Sum the resources of the nodes about to be unlabeled.
        freed = Resource(cpus=0, memory=0, gpus=0)
        for name, info in nodes_info.items():
            if name not in candidates or info["nodeLabel"] != vc_name:
                continue
            freed += info["resource"]

        default_partition = get_resource_by_label(nodes_info)[""]["resource"]
        default_share = default_queue["capacity"] / 100.0
        default_vc = default_partition * default_share

        # Both the default partition and the default vc grow by what is
        # being handed back.
        grown_partition = default_partition + freed
        grown_default_vc = default_vc + freed

        # Express the queues in GPUs against the old partition, patch the
        # default queue, then convert back against the enlarged partition.
        queues_in_gpus = convert_percentage_to_gpus(queues_info,
                                                    default_partition)
        queues_in_gpus["default"]["gpus"] = grown_default_vc.gpus
        percentages = normalize_percentage(
            convert_gpus_to_percentage(queues_in_gpus, grown_partition))
        capacity_update = {
            queue: {
                "capacity": queue_info["capacity"],
                "maximum-capacity": queue_info["maxCapacity"],
            }
            for queue, queue_info in percentages.items()
        }

        yarn.label_nodes(labeled_nodes, "")
        yarn.update_queue_capacity(capacity_update)

    if not drop_whole_vc:
        return

    logger.info("Removing dedicated vc...")
    if vc_name in queues_info:
        yarn.remove_dedicated_queue(vc_name)
    else:
        logger.warning("Virtual cluster not found: {}.".format(vc_name))

    logger.info("Removing cluster label...")
    if vc_name in yarn.get_cluster_labels():
        yarn.remove_cluster_label(vc_name)
    else:
        logger.warning("Cluster label not found: {}".format(vc_name))
コード例 #4
0
def add_dedicate_vc(args):
    """Create a dedicated virtual cluster and move nodes into it.

    The cluster label and the dedicated queue named ``args.vc_name`` are
    created unless they already exist.  If any nodes are given, the
    resources of those that currently have the empty label are subtracted
    from the default queue, the queue capacities are recomputed, the nodes
    are labeled with ``args.vc_name`` and the new capacities are pushed to
    YARN.

    Args:
        args: Parsed arguments providing ``resource_manager_ip``,
            ``vc_name`` and ``nodes`` (a collection of node names; None or
            an empty collection means "create the vc without nodes").

    Raises:
        SystemExit: With status 1 when the default vc has fewer CPUs, GPUs
            or memory than the nodes being moved would take away.
    """
    yarn_operator = YarnOperator(args.resource_manager_ip)
    vc_name = args.vc_name
    nodes = args.nodes

    logger.info("Adding cluster label...")
    existing_labels = yarn_operator.get_cluster_labels()
    if vc_name in existing_labels:
        logger.warning("Label already exists: {}".format(vc_name))
    else:
        yarn_operator.add_cluster_label(vc_name)

    logger.info("Adding dedicated vc...")
    queues_info = yarn_operator.get_queues_info()
    if vc_name in queues_info:
        logger.warning(
            "Virtual cluster already exists: {}. Adding node to it".format(
                vc_name))
    else:
        yarn_operator.add_dedicated_queue(vc_name)

    nodes_info = yarn_operator.get_nodes_info()
    # ``nodes`` may be None as well as empty; ``len(None)`` would raise a
    # TypeError here after the label and queue have already been created.
    if not nodes:
        return

    logger.info("Labeling node...")

    default_queue = queues_info["default"]
    # Reset the default queue's ceiling to 100% when it is already 100 or
    # sits above the queue's own capacity.
    if (default_queue["maxCapacity"] == 100
            or default_queue["maxCapacity"] > default_queue["capacity"]):
        default_queue["maxCapacity"] = 100.0

    # Build the lookup once instead of scanning ``nodes`` for every node.
    requested = set(nodes)
    added_resource = Resource(**{"cpus": 0, "memory": 0, "gpus": 0})
    # NOTE(review): only nodes that currently have the empty label are
    # counted, yet every requested node is relabeled below - confirm that
    # already-labeled nodes are meant to be moved without accounting.
    for node, info in nodes_info.items():
        if node in requested and info["nodeLabel"] == "":
            added_resource += info["resource"]

    default_partition_resource = get_resource_by_label(
        nodes_info)[""]["resource"]
    default_vc_percentage = default_queue["capacity"] / 100.0
    default_vc_resource = default_partition_resource * default_vc_percentage

    # Refuse to proceed if the default vc cannot give up that much.
    if (default_vc_resource.cpus < added_resource.cpus
            or default_vc_resource.gpus < added_resource.gpus
            or default_vc_resource.memory < added_resource.memory):
        logger.error(
            "Default vc resource isn't enough for the dedicated vc, please free some resource"
        )
        sys.exit(1)

    new_default_partition_resource = default_partition_resource - added_resource
    new_default_vc_resource = default_vc_resource - added_resource

    # Express the queues in GPUs against the old partition, patch the
    # default queue, then convert back against the shrunken partition.
    queues_info_with_gpus = convert_percentage_to_gpus(
        queues_info, default_partition_resource)
    queues_info_with_gpus["default"]["gpus"] = new_default_vc_resource.gpus
    new_queues_percentage = convert_gpus_to_percentage(
        queues_info_with_gpus, new_default_partition_resource)
    new_queues_percentage = normalize_percentage(new_queues_percentage)

    updated_dict = {}
    for queue, info in new_queues_percentage.items():
        updated_dict[queue] = {
            "capacity": info["capacity"],
            "maximum-capacity": info["maxCapacity"]
        }
        # Every queue other than "default" is sent with preemption disabled.
        if queue != "default":
            updated_dict[queue]["disable_preemption"] = True

    yarn_operator.label_nodes(nodes, vc_name)
    yarn_operator.update_queue_capacity(updated_dict)