def refresh_yarn_nodes(args):
    k8s_operator = KubernetesOperator(args.api_server_ip)
    yarn_operator = YarnOperator(args.resource_manager_ip)
    while True:
        # Keep decommissioning and polling until every node has reached
        # its expected state.
        yarn_operator.decommission_nodes()
        node_info = yarn_operator.get_nodes_info()
        current_status = {k: v["state"] for k, v in node_info.items()}
        decommissioned_nodes = k8s_operator.get_nodes()
        unready_nodes = get_unready_nodes(decommissioned_nodes, current_status)
        if len(unready_nodes) == 0:
            break
        unready_info = ','.join([
            node_name + " in " + status
            for node_name, status in unready_nodes.items()
        ])
        logger.info("Unready nodes: {}. Waiting...".format(unready_info))
        time.sleep(30)
    logger.info("Successfully refreshed nodes.")
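
# A minimal sketch of the get_unready_nodes helper used above, under the
# assumption that a node decommissioned on the Kubernetes side should end up
# DECOMMISSIONED in YARN while every other node should be RUNNING; the real
# helper may recognize additional intermediate states.
def get_unready_nodes(decommissioned_nodes, current_status):
    unready_nodes = {}
    for node_name, state in current_status.items():
        if node_name in decommissioned_nodes and state != "DECOMMISSIONED":
            unready_nodes[node_name] = state
        elif node_name not in decommissioned_nodes and state != "RUNNING":
            unready_nodes[node_name] = state
    return unready_nodes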

def get_dedicate_vc(args):
    yarn_operator = YarnOperator(args.resource_manager_ip)
    queues_info = yarn_operator.get_queues_info()
    nodes_info = yarn_operator.get_nodes_info()
    dedicate_queues = {
        queue_name: {
            "resource": Resource(**{"cpus": 0, "memory": 0, "gpus": 0}),
            "nodes": []
        }
        for queue_name, queue_info in queues_info.items()
        if is_dedicated_vc(queue_name, queue_info)
    }
    if len(dedicate_queues) == 0:
        logger.info("No dedicated vc found")
        return

    labeled_resources = get_resource_by_label(nodes_info)
    for partition in labeled_resources:
        if partition in dedicate_queues:
            dedicate_queues[partition]["resource"] = \
                labeled_resources[partition]["resource"]
    for node in nodes_info:
        if nodes_info[node]["nodeLabel"] in dedicate_queues:
            dedicate_queues[nodes_info[node]["nodeLabel"]]["nodes"].append(node)

    for queue_name, queue_attr in dedicate_queues.items():
        print(queue_name + ":")
        print("\tNodes: " + ",".join(queue_attr["nodes"]))
        print("\tResource: <CPUs:{}, Memory:{}MB, GPUs:{}>".format(
            queue_attr["resource"].cpus, queue_attr["resource"].memory,
            queue_attr["resource"].gpus))
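
# A minimal sketch of the get_resource_by_label helper used above, assuming
# every entry of nodes_info carries a "nodeLabel" partition name ("" for the
# default partition) and a "resource" Resource; it simply totals resources
# per partition. The real helper may return additional fields.
def get_resource_by_label(nodes_info):
    labeled_resources = {}
    for node, info in nodes_info.items():
        label = info["nodeLabel"]
        if label not in labeled_resources:
            labeled_resources[label] = {
                "resource": Resource(**{"cpus": 0, "memory": 0, "gpus": 0})
            }
        labeled_resources[label]["resource"] += info["resource"]
    return labeled_resources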

def remove_dedicate_vc(args):
    yarn_operator = YarnOperator(args.resource_manager_ip)
    vc_name = args.vc_name
    nodes = args.nodes
    # With no explicit node list, unlabel every node in the vc, then remove
    # the queue and the cluster label as well.
    remove_queue_flag = nodes is None

    logger.info("Unlabeling node...")
    nodes_info = yarn_operator.get_nodes_info()
    queues_info = yarn_operator.get_queues_info()
    if nodes is None:
        nodes = set(nodes_info.keys())
    t_nodes = [
        node for node in nodes if nodes_info[node]["nodeLabel"] == vc_name
    ]
    if len(t_nodes) > 0:
        # Unless the default queue's max capacity is pinned to its capacity,
        # reset it to 100% before rebalancing.
        if queues_info["default"]["maxCapacity"] == 100 or \
                queues_info["default"]["maxCapacity"] > queues_info["default"]["capacity"]:
            queues_info["default"]["maxCapacity"] = 100.0

        # Sum up the resources returned to the default partition.
        removed_resource = Resource(**{"cpus": 0, "memory": 0, "gpus": 0})
        for node, info in nodes_info.items():
            if node in nodes and info["nodeLabel"] == vc_name:
                removed_resource += info["resource"]

        default_partition_resource = get_resource_by_label(nodes_info)[""]["resource"]
        default_vc_percentage = queues_info["default"]["capacity"] / 100.0
        default_vc_resource = default_partition_resource * default_vc_percentage

        new_default_partition_resource = default_partition_resource + removed_resource
        new_default_vc_resource = default_vc_resource + removed_resource

        # Recompute every queue's percentage against the enlarged partition.
        queues_info_with_gpus = convert_percentage_to_gpus(
            queues_info, default_partition_resource)
        queues_info_with_gpus["default"]["gpus"] = new_default_vc_resource.gpus
        new_queues_percentage = convert_gpus_to_percentage(
            queues_info_with_gpus, new_default_partition_resource)
        new_queues_percentage = normalize_percentage(new_queues_percentage)

        updated_dict = {}
        for queue, info in new_queues_percentage.items():
            updated_dict[queue] = {
                "capacity": info["capacity"],
                "maximum-capacity": info["maxCapacity"]
            }
        yarn_operator.label_nodes(t_nodes, "")
        yarn_operator.update_queue_capacity(updated_dict)

    if remove_queue_flag:
        logger.info("Removing dedicated vc...")
        if vc_name not in queues_info:
            logger.warning("Virtual cluster not found: {}.".format(vc_name))
        else:
            yarn_operator.remove_dedicated_queue(vc_name)

        logger.info("Removing cluster label...")
        if vc_name not in yarn_operator.get_cluster_labels():
            logger.warning("Cluster label not found: {}".format(vc_name))
        else:
            yarn_operator.remove_cluster_label(vc_name)
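
# The capacity math in add_dedicate_vc/remove_dedicate_vc relies on Resource
# supporting elementwise arithmetic. A minimal sketch, assuming only the
# cpus/memory/gpus fields seen above; the real class may track more
# dimensions.
class Resource(object):
    def __init__(self, cpus=0, memory=0, gpus=0):
        self.cpus = cpus
        self.memory = memory
        self.gpus = gpus

    def __add__(self, other):
        return Resource(self.cpus + other.cpus, self.memory + other.memory,
                        self.gpus + other.gpus)

    def __sub__(self, other):
        return Resource(self.cpus - other.cpus, self.memory - other.memory,
                        self.gpus - other.gpus)

    def __mul__(self, scalar):
        return Resource(self.cpus * scalar, self.memory * scalar,
                        self.gpus * scalar)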

def add_dedicate_vc(args):
    yarn_operator = YarnOperator(args.resource_manager_ip)
    vc_name = args.vc_name
    nodes = args.nodes

    logger.info("Adding cluster label...")
    existing_labels = yarn_operator.get_cluster_labels()
    if vc_name in existing_labels:
        logger.warning("Label already exists: {}".format(vc_name))
    else:
        yarn_operator.add_cluster_label(vc_name)

    logger.info("Adding dedicated vc...")
    queues_info = yarn_operator.get_queues_info()
    if vc_name in queues_info:
        logger.warning(
            "Virtual cluster already exists: {}. Adding nodes to it.".format(
                vc_name))
    else:
        yarn_operator.add_dedicated_queue(vc_name)

    nodes_info = yarn_operator.get_nodes_info()
    if len(nodes) > 0:
        logger.info("Labeling node...")
        # Unless the default queue's max capacity is pinned to its capacity,
        # reset it to 100% before rebalancing.
        if queues_info["default"]["maxCapacity"] == 100 or \
                queues_info["default"]["maxCapacity"] > queues_info["default"]["capacity"]:
            queues_info["default"]["maxCapacity"] = 100.0

        # Sum up the resources moved out of the default partition.
        added_resource = Resource(**{"cpus": 0, "memory": 0, "gpus": 0})
        for node, info in nodes_info.items():
            if node in nodes and info["nodeLabel"] == "":
                added_resource += info["resource"]

        default_partition_resource = get_resource_by_label(nodes_info)[""]["resource"]
        default_vc_percentage = queues_info["default"]["capacity"] / 100.0
        default_vc_resource = default_partition_resource * default_vc_percentage
        if default_vc_resource.cpus < added_resource.cpus \
                or default_vc_resource.gpus < added_resource.gpus \
                or default_vc_resource.memory < added_resource.memory:
            logger.error(
                "The default vc does not have enough resources for the "
                "dedicated vc, please free some resources first")
            sys.exit(1)

        new_default_partition_resource = default_partition_resource - added_resource
        new_default_vc_resource = default_vc_resource - added_resource

        # Recompute every queue's percentage against the reduced partition.
        queues_info_with_gpus = convert_percentage_to_gpus(
            queues_info, default_partition_resource)
        queues_info_with_gpus["default"]["gpus"] = new_default_vc_resource.gpus
        new_queues_percentage = convert_gpus_to_percentage(
            queues_info_with_gpus, new_default_partition_resource)
        new_queues_percentage = normalize_percentage(new_queues_percentage)

        updated_dict = {}
        for queue, info in new_queues_percentage.items():
            updated_dict[queue] = {
                "capacity": info["capacity"],
                "maximum-capacity": info["maxCapacity"]
            }
            # Preemption is disabled on every queue except default.
            if queue != "default":
                updated_dict[queue]["disable_preemption"] = True
        yarn_operator.label_nodes(nodes, vc_name)
        yarn_operator.update_queue_capacity(updated_dict)
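
# A hedged sketch of the percentage <-> GPU conversion used above: queue
# capacities are percentages of the default partition, so resizing that
# partition means mapping every queue to absolute GPUs against the old
# partition total, overriding the default queue, and mapping back against
# the new total. Assumes rebalancing is driven by GPU count alone and that
# the partition holds at least one GPU; the real helpers may differ.
import copy

def convert_percentage_to_gpus(queues_info, partition_resource):
    queues_with_gpus = copy.deepcopy(queues_info)
    for queue in queues_with_gpus.values():
        queue["gpus"] = queue["capacity"] / 100.0 * partition_resource.gpus
    return queues_with_gpus

def convert_gpus_to_percentage(queues_with_gpus, partition_resource):
    queues_percentage = copy.deepcopy(queues_with_gpus)
    for queue in queues_percentage.values():
        queue["capacity"] = queue["gpus"] / partition_resource.gpus * 100.0
    return queues_percentage

def normalize_percentage(queues_percentage):
    # Rounding drift can leave capacities summing to slightly off 100;
    # absorb the residual into the default queue.
    normalized = copy.deepcopy(queues_percentage)
    total = sum(q["capacity"] for q in normalized.values())
    normalized["default"]["capacity"] += 100.0 - total
    return normalized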