def remove_dedicate_vc(args):
    """Unlabel nodes of a dedicated virtual cluster and optionally delete it.

    Every targeted node whose current label equals ``args.vc_name`` is
    relabeled with the empty label, its resource is added back to the
    "default" queue, and the recomputed queue capacities are submitted
    through ``yarn_operator.update_queue_capacity``.

    When ``args.nodes`` is None, all nodes reported by the resource manager
    are considered, and afterwards the dedicated queue and the cluster label
    named ``args.vc_name`` are removed (a warning is logged for either one
    that does not exist).

    Args:
        args: Parsed arguments providing ``resource_manager_ip``,
            ``vc_name`` and ``nodes`` (an iterable of node names, or None
            to target the whole virtual cluster).
    """
    yarn_operator = YarnOperator(args.resource_manager_ip)
    vc_name = args.vc_name
    nodes = args.nodes
    # No explicit node list means the queue and label are removed as well.
    remove_queue_flag = nodes is None

    logger.info("Unlabeling node...")
    nodes_info = yarn_operator.get_nodes_info()
    queues_info = yarn_operator.get_queues_info()
    if nodes is None:
        nodes = set(nodes_info)

    # A requested node that the resource manager does not report would raise
    # KeyError on the lookup below; warn and skip it instead, in line with
    # the other "not found" warnings in this function.
    t_nodes = []
    for node in nodes:
        if node not in nodes_info:
            logger.warning("Node not found: {}".format(node))
        elif nodes_info[node]["nodeLabel"] == vc_name:
            t_nodes.append(node)

    if t_nodes:
        default_queue = queues_info["default"]
        # Pin the default queue's maxCapacity to 100.0 when it already is 100
        # or exceeds the queue's capacity.
        if default_queue["maxCapacity"] == 100 \
                or default_queue["maxCapacity"] > default_queue["capacity"]:
            default_queue["maxCapacity"] = 100.0

        # Sum in nodes_info order; each node is counted once even if it was
        # listed several times in the request.
        target_nodes = set(t_nodes)
        removed_resource = Resource(**{"cpus": 0, "memory": 0, "gpus": 0})
        for node, info in nodes_info.items():
            if node in target_nodes:
                removed_resource += info["resource"]

        default_partition_resource = get_resource_by_label(
            nodes_info)[""]["resource"]
        default_vc_percentage = default_queue["capacity"] / 100.0
        default_vc_resource = default_partition_resource * default_vc_percentage

        # The released resource goes both to the default partition and to
        # the default queue's share of it.
        new_default_partition_resource = default_partition_resource + removed_resource
        new_default_vc_resource = default_vc_resource + removed_resource

        # Percentages are converted against the old partition size, the
        # default queue's gpus value is replaced, then converted back against
        # the new partition size and normalized.
        queues_info_with_gpus = convert_percentage_to_gpus(
            queues_info, default_partition_resource)
        queues_info_with_gpus["default"]["gpus"] = new_default_vc_resource.gpus
        new_queues_percentage = convert_gpus_to_percentage(
            queues_info_with_gpus, new_default_partition_resource)
        new_queues_percentage = normalize_percentage(new_queues_percentage)

        updated_dict = {
            queue: {
                "capacity": info["capacity"],
                "maximum-capacity": info["maxCapacity"],
            }
            for queue, info in new_queues_percentage.items()
        }

        yarn_operator.label_nodes(t_nodes, "")
        yarn_operator.update_queue_capacity(updated_dict)

    if remove_queue_flag:
        logger.info("Removing dedicated vc...")
        if vc_name not in queues_info:
            logger.warning("Virtual cluster not found: {}.".format(vc_name))
        else:
            yarn_operator.remove_dedicated_queue(vc_name)

        logger.info("Removing cluster label...")
        if vc_name not in yarn_operator.get_cluster_labels():
            logger.warning("Cluster label not found: {}".format(vc_name))
        else:
            yarn_operator.remove_cluster_label(vc_name)
def add_dedicate_vc(args):
    """Ensure a dedicated virtual cluster exists and label nodes for it.

    The cluster label and the dedicated queue named ``args.vc_name`` are
    created when missing; a warning is logged for either one that already
    exists.  If ``args.nodes`` is non-empty, the resource of the requested
    nodes that currently carry the empty label is subtracted from the
    "default" queue, queue capacities are recomputed, all requested nodes
    are labeled with ``args.vc_name`` and the capacities are submitted
    through ``yarn_operator.update_queue_capacity``.

    The process exits with status 1 when the default queue's share of the
    default partition has fewer cpus, gpus or memory than the resource being
    moved.  The label and queue have already been created at that point.

    Args:
        args: Parsed arguments providing ``resource_manager_ip``,
            ``vc_name`` and ``nodes`` (a collection of node names; None or
            an empty collection means no node is labeled).
    """
    yarn_operator = YarnOperator(args.resource_manager_ip)
    vc_name = args.vc_name
    nodes = args.nodes

    logger.info("Adding cluster label...")
    existing_labels = yarn_operator.get_cluster_labels()
    if vc_name in existing_labels:
        logger.warning("Label already exists: {}".format(vc_name))
    else:
        yarn_operator.add_cluster_label(vc_name)

    logger.info("Adding dedicated vc...")
    queues_info = yarn_operator.get_queues_info()
    if vc_name in queues_info:
        logger.warning(
            "Virtual cluster already exists: {}. Adding node to it".format(
                vc_name))
    else:
        yarn_operator.add_dedicated_queue(vc_name)

    nodes_info = yarn_operator.get_nodes_info()

    # Truthiness instead of len(): a None node list (which the remove command
    # treats as valid) would otherwise raise TypeError after the label and
    # queue were already created.
    if not nodes:
        return

    logger.info("Labeling node...")

    default_queue = queues_info["default"]
    # Pin the default queue's maxCapacity to 100.0 when it already is 100 or
    # exceeds the queue's capacity.
    if default_queue["maxCapacity"] == 100 \
            or default_queue["maxCapacity"] > default_queue["capacity"]:
        default_queue["maxCapacity"] = 100.0

    # Only nodes that are still in the default (empty-label) partition take
    # resource away from it.
    added_resource = Resource(**{"cpus": 0, "memory": 0, "gpus": 0})
    for node, info in nodes_info.items():
        if node in nodes and info["nodeLabel"] == "":
            added_resource += info["resource"]

    default_partition_resource = get_resource_by_label(
        nodes_info)[""]["resource"]
    default_vc_percentage = default_queue["capacity"] / 100.0
    default_vc_resource = default_partition_resource * default_vc_percentage

    if default_vc_resource.cpus < added_resource.cpus \
            or default_vc_resource.gpus < added_resource.gpus \
            or default_vc_resource.memory < added_resource.memory:
        logger.error(
            "Default vc resource isn't enough for the dedicated vc, please free some resource"
        )
        sys.exit(1)

    new_default_partition_resource = default_partition_resource - added_resource
    new_default_vc_resource = default_vc_resource - added_resource

    # Percentages are converted against the old partition size, the default
    # queue's gpus value is replaced, then converted back against the new
    # partition size and normalized.
    queues_info_with_gpus = convert_percentage_to_gpus(
        queues_info, default_partition_resource)
    queues_info_with_gpus["default"]["gpus"] = new_default_vc_resource.gpus
    new_queues_percentage = convert_gpus_to_percentage(
        queues_info_with_gpus, new_default_partition_resource)
    new_queues_percentage = normalize_percentage(new_queues_percentage)

    updated_dict = {}
    for queue, info in new_queues_percentage.items():
        updated_dict[queue] = {
            "capacity": info["capacity"],
            "maximum-capacity": info["maxCapacity"],
        }
        # Every queue except "default" is submitted with preemption disabled.
        if queue != "default":
            updated_dict[queue]["disable_preemption"] = True

    yarn_operator.label_nodes(nodes, vc_name)
    yarn_operator.update_queue_capacity(updated_dict)