Code Example #1
    def testRequestBundlesAccountsForHeadNode(self):
        config = MULTI_WORKER_CLUSTER.copy()
        config["head_node_type"] = "p2.8xlarge"
        config["min_workers"] = 0
        config["max_workers"] = 50
        config_path = self.write_config(config)
        self.provider = MockProvider()
        self.provider.create_node({}, {
            TAG_RAY_USER_NODE_TYPE: "p2.8xlarge",
            TAG_RAY_NODE_KIND: "head"
        }, 1)
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(config_path,
                                        LoadMetrics(),
                                        max_failures=0,
                                        process_runner=runner,
                                        update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 1

        # These requests fit on the head node.
        autoscaler.update()
        self.waitForNodes(1)
        autoscaler.request_resources([{"CPU": 1}])
        autoscaler.update()
        self.waitForNodes(1)
        assert len(self.provider.mock_nodes) == 1
        autoscaler.request_resources([{"GPU": 8}])
        autoscaler.update()
        self.waitForNodes(1)

        # This request requires an additional worker node.
        autoscaler.request_resources([{"GPU": 8}] * 2)
        autoscaler.update()
        self.waitForNodes(2)
        assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"
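The tests above drive scaling by calling request_resources directly on a StandardAutoscaler instance. From application code, the same demand is normally expressed through the public SDK helper; a minimal sketch, assuming a running cluster that was started with the autoscaler:

    import ray
    from ray.autoscaler.sdk import request_resources

    ray.init(address="auto")  # assumes an existing autoscaling cluster
    # Ask the autoscaler to keep capacity for two 8-GPU bundles available.
    request_resources(bundles=[{"GPU": 8}] * 2)
    # Raw CPU capacity can be requested the same way.
    request_resources(num_cpus=32)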
Code Example #2
 def __init__(self, redis_address, autoscaling_config, redis_password=None):
     # Initialize the Redis clients.
     ray.state.state._initialize_global_state(redis_address,
                                              redis_password=redis_password)
     self.redis = ray._private.services.create_redis_client(
         redis_address, password=redis_password)
     self.global_state_accessor = GlobalStateAccessor(
         redis_address, redis_password, False)
     self.global_state_accessor.connect()
     # Set the redis client and mode so _internal_kv works for autoscaler.
     worker = ray.worker.global_worker
     worker.redis_client = self.redis
     worker.mode = 0
     # Setup subscriptions to the primary Redis server and the Redis shards.
     self.primary_subscribe_client = self.redis.pubsub(
         ignore_subscribe_messages=True)
     # Keep a mapping from raylet client ID to IP address to use
     # for updating the load metrics.
     self.raylet_id_to_ip_map = {}
     head_node_ip = redis_address.split(":")[0]
     self.load_metrics = LoadMetrics(local_ip=head_node_ip)
     if autoscaling_config:
         self.autoscaler = StandardAutoscaler(autoscaling_config,
                                              self.load_metrics)
         self.autoscaling_config = autoscaling_config
     else:
         self.autoscaler = None
         self.autoscaling_config = None
Code Example #3
 def testScaleUpLoadMetrics(self):
     config = MULTI_WORKER_CLUSTER.copy()
     config["min_workers"] = 0
     config["max_workers"] = 50
     config_path = self.write_config(config)
     self.provider = MockProvider()
     runner = MockProcessRunner()
     lm = LoadMetrics()
     autoscaler = StandardAutoscaler(config_path,
                                     lm,
                                     max_failures=0,
                                     process_runner=runner,
                                     update_interval_s=0)
     assert len(self.provider.non_terminated_nodes({})) == 0
     autoscaler.update()
     self.waitForNodes(0)
     autoscaler.update()
     lm.update("1.2.3.4", {},
               True, {},
               True, {},
               waiting_bundles=[{
                   "GPU": 1
               }],
               infeasible_bundles=[{
                   "CPU": 16
               }])
     autoscaler.update()
     self.waitForNodes(2)
     nodes = {
         self.provider.mock_nodes[0].node_type,
         self.provider.mock_nodes[1].node_type
     }
     assert nodes == {"p2.xlarge", "m4.4xlarge"}
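On the application side, the waiting and infeasible bundles passed to lm.update above correspond to queued tasks and actors whose resource requests cannot be satisfied yet. A hedged sketch of how such demand would arise, mirroring the {"GPU": 1} and {"CPU": 16} bundles in the test:

    import ray

    ray.init(address="auto")  # assumes a running autoscaling cluster

    @ray.remote(num_gpus=1)
    def gpu_task():
        return "ran on a GPU node"

    @ray.remote(num_cpus=16)
    class BigActor:
        def ping(self):
            return "ok"

    ref = gpu_task.remote()    # reported as a waiting {"GPU": 1} bundle
    actor = BigActor.remote()  # reported as a {"CPU": 16} bundle until a large node exists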
Code Example #4
 def __init__(self,
              redis_address,
              autoscaling_config,
              redis_password=None,
              prefix_cluster_info=False):
     # Initialize the Redis clients.
     ray.state.state._initialize_global_state(redis_address,
                                              redis_password=redis_password)
     self.redis = ray._private.services.create_redis_client(
         redis_address, password=redis_password)
     self.global_state_accessor = GlobalStateAccessor(
         redis_address, redis_password, False)
     self.global_state_accessor.connect()
     # Set the redis client and mode so _internal_kv works for autoscaler.
     worker = ray.worker.global_worker
     worker.redis_client = self.redis
     worker.mode = 0
     # Keep a mapping from raylet client ID to IP address to use
     # for updating the load metrics.
     self.raylet_id_to_ip_map = {}
     head_node_ip = redis_address.split(":")[0]
     self.load_metrics = LoadMetrics(local_ip=head_node_ip)
     if autoscaling_config:
         self.autoscaler = StandardAutoscaler(
             autoscaling_config,
             self.load_metrics,
             prefix_cluster_info=prefix_cluster_info)
         self.autoscaling_config = autoscaling_config
     else:
         self.autoscaler = None
         self.autoscaling_config = None
Code Example #5
    def __init__(self,
                 redis_address,
                 autoscaling_config,
                 redis_password=None,
                 prefix_cluster_info=False):
        # Initialize the Redis clients.
        ray.state.state._initialize_global_state(redis_address,
                                                 redis_password=redis_password)
        self.redis = ray._private.services.create_redis_client(
            redis_address, password=redis_password)

        (ip, port) = redis_address.split(":")
        self.gcs_client = connect_to_gcs(ip, int(port), redis_password)
        # Initialize the gcs stub for getting all node resource usage.
        gcs_address = self.redis.get("GcsServerAddress").decode("utf-8")

        options = (("grpc.enable_http_proxy", 0), )
        gcs_channel = grpc.insecure_channel(gcs_address, options=options)
        self.gcs_node_resources_stub = \
            gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel)

        # Set the redis client and mode so _internal_kv works for autoscaler.
        worker = ray.worker.global_worker
        worker.redis_client = self.redis
        worker.gcs_client = self.gcs_client
        worker.mode = 0
        head_node_ip = redis_address.split(":")[0]
        self.load_metrics = LoadMetrics(local_ip=head_node_ip)
        self.last_avail_resources = None
        self.event_summarizer = EventSummarizer()
        self.prefix_cluster_info = prefix_cluster_info
        self.autoscaling_config = autoscaling_config
        self.autoscaler = None

        logger.info("Monitor: Started")
Code Example #6
    def testScaleUpIgnoreUsed(self):
        config = MULTI_WORKER_CLUSTER.copy()
        # Commenting out this line causes the test case to fail?!?!
        config["min_workers"] = 0
        config["target_utilization_fraction"] = 1.0
        config_path = self.write_config(config)
        self.provider = MockProvider()
        self.provider.create_node({}, {
            TAG_RAY_NODE_KIND: "head",
            TAG_RAY_USER_NODE_TYPE: "p2.xlarge"
        }, 1)
        head_ip = self.provider.non_terminated_node_ips({})[0]
        self.provider.finish_starting_nodes()
        runner = MockProcessRunner()
        lm = LoadMetrics(local_ip=head_ip)
        autoscaler = StandardAutoscaler(config_path,
                                        lm,
                                        max_failures=0,
                                        process_runner=runner,
                                        update_interval_s=0)
        autoscaler.update()
        self.waitForNodes(1)
        lm.update(head_ip, {"CPU": 4, "GPU": 1}, {}, {})
        self.waitForNodes(1)

        lm.update(head_ip, {
            "CPU": 4,
            "GPU": 1
        }, {"GPU": 0}, {},
                  waiting_bundles=[{
                      "GPU": 1
                  }])
        autoscaler.update()
        self.waitForNodes(2)
        assert self.provider.mock_nodes[1].node_type == "p2.xlarge"
Code Example #7
File: test_autoscaling_policy.py Project: rlan/ray
    def _setup_autoscaler(self):
        self.runner = MockProcessRunner()
        self.config = yaml.safe_load(open(self.config_path).read())

        self.provider.create_node(
            {},
            {
                TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
                TAG_RAY_USER_NODE_TYPE: self.config["head_node_type"],
            },
            1,
        )
        self.head_ip = self.provider.non_terminated_node_ips({})[0]

        self.load_metrics = LoadMetrics(local_ip=self.head_ip)
        self.autoscaler = StandardAutoscaler(
            self.config_path,
            self.load_metrics,
            # Don't let the autoscaler start any node launchers. Instead, we
            # will launch nodes ourselves after every update call.
            max_concurrent_launches=0,
            max_failures=0,
            process_runner=self.runner,
            update_interval_s=0,
        )

        # Manually create a node launcher. Note that we won't start it as a
        # separate thread.
        self.node_launcher = NodeLauncher(
            provider=self.autoscaler.provider,
            queue=self.autoscaler.launch_queue,
            index=0,
            pending=self.autoscaler.pending_launches,
            node_types=self.autoscaler.available_node_types,
        )
Code Example #8
 def testRequestBundles(self):
     config = MULTI_WORKER_CLUSTER.copy()
     config["min_workers"] = 0
     config["max_workers"] = 50
     config_path = self.write_config(config)
     self.provider = MockProvider()
     runner = MockProcessRunner()
     autoscaler = StandardAutoscaler(config_path,
                                     LoadMetrics(),
                                     max_failures=0,
                                     process_runner=runner,
                                     update_interval_s=0)
     assert len(self.provider.non_terminated_nodes({})) == 0
     autoscaler.update()
     self.waitForNodes(0)
     autoscaler.request_resources([{"CPU": 1}])
     autoscaler.update()
     self.waitForNodes(1)
     assert self.provider.mock_nodes[0].node_type == "m4.large"
     autoscaler.request_resources([{"GPU": 8}])
     autoscaler.update()
     self.waitForNodes(2)
     assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"
     autoscaler.request_resources([{"CPU": 32}] * 4)
     autoscaler.update()
     self.waitForNodes(4)
     assert self.provider.mock_nodes[2].node_type == "m4.16xlarge"
     assert self.provider.mock_nodes[3].node_type == "m4.16xlarge"
Code Example #9
File: monitor.py Project: yncxcw/ray
    def __init__(self,
                 redis_address,
                 autoscaling_config,
                 redis_password=None,
                 prefix_cluster_info=False,
                 monitor_ip=None,
                 stop_event: Optional[Event] = None):
        # Initialize the Redis clients.
        ray.state.state._initialize_global_state(redis_address,
                                                 redis_password=redis_password)
        self.redis = ray._private.services.create_redis_client(
            redis_address, password=redis_password)
        if monitor_ip:
            self.redis.set("AutoscalerMetricsAddress",
                           f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}")
        (ip, port) = redis_address.split(":")
        self.gcs_client = connect_to_gcs(ip, int(port), redis_password)
        # Initialize the gcs stub for getting all node resource usage.
        gcs_address = self.redis.get("GcsServerAddress").decode("utf-8")

        options = (("grpc.enable_http_proxy", 0), )
        gcs_channel = grpc.insecure_channel(gcs_address, options=options)
        self.gcs_node_resources_stub = \
            gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel)

        # Set the redis client and mode so _internal_kv works for autoscaler.
        worker = ray.worker.global_worker
        worker.redis_client = self.redis
        worker.gcs_client = self.gcs_client
        worker.mode = 0
        head_node_ip = redis_address.split(":")[0]
        self.redis_address = redis_address
        self.redis_password = redis_password
        self.load_metrics = LoadMetrics(local_ip=head_node_ip)
        self.last_avail_resources = None
        self.event_summarizer = EventSummarizer()
        self.prefix_cluster_info = prefix_cluster_info
        # Can be used to signal graceful exit from monitor loop.
        self.stop_event = stop_event  # type: Optional[Event]
        self.autoscaling_config = autoscaling_config
        self.autoscaler = None

        self.prom_metrics = AutoscalerPrometheusMetrics()
        if monitor_ip:
            # If monitor_ip wasn't passed in, then don't attempt to start the
            # metric server to keep behavior identical to before metrics were
            # introduced
            try:
                logger.info(
                    "Starting autoscaler metrics server on port {}".format(
                        AUTOSCALER_METRIC_PORT))
                prometheus_client.start_http_server(
                    AUTOSCALER_METRIC_PORT,
                    registry=self.prom_metrics.registry)
            except Exception:
                logger.exception(
                    "An exception occurred while starting the metrics server.")

        logger.info("Monitor: Started")
Code Example #10
    def testScaleUpMinWorkers(self):
        config = copy.deepcopy(MULTI_WORKER_CLUSTER)
        config["min_workers"] = 2
        config["max_workers"] = 50
        config["idle_timeout_minutes"] = 1
        # Since config["min_workers"] > 1, the remaining worker is started
        # with the default worker node type.
        config["available_node_types"]["p2.8xlarge"]["min_workers"] = 1
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        lm = LoadMetrics()
        autoscaler = StandardAutoscaler(
            config_path,
            lm,
            max_failures=0,
            process_runner=runner,
            update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(2)
        assert len(self.provider.mock_nodes) == 2
        assert {
            self.provider.mock_nodes[0].node_type,
            self.provider.mock_nodes[1].node_type
        } == {"p2.8xlarge", "m4.large"}
        self.provider.create_node({}, {
            TAG_RAY_USER_NODE_TYPE: "p2.8xlarge",
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER
        }, 2)
        self.provider.create_node({}, {
            TAG_RAY_USER_NODE_TYPE: "m4.16xlarge",
            TAG_RAY_NODE_KIND: NODE_KIND_WORKER
        }, 2)
        assert len(self.provider.non_terminated_nodes({})) == 6
        # Make sure that after idle_timeout_minutes we don't kill idle
        # min workers.
        for node_id in self.provider.non_terminated_nodes({}):
            lm.last_used_time_by_ip[self.provider.internal_ip(node_id)] = -60
        autoscaler.update()
        self.waitForNodes(2)

        cnt = 0
        for id in self.provider.mock_nodes:
            if self.provider.mock_nodes[id].state == "running" or \
                    self.provider.mock_nodes[id].state == "pending":
                assert self.provider.mock_nodes[id].node_type in {
                    "p2.8xlarge", "m4.large"
                }
                cnt += 1
        assert cnt == 2
Code Example #11
 def testScaleUpMinSanity(self):
     config_path = self.write_config(MULTI_WORKER_CLUSTER)
     self.provider = MockProvider()
     runner = MockProcessRunner()
     autoscaler = StandardAutoscaler(config_path,
                                     LoadMetrics(),
                                     max_failures=0,
                                     process_runner=runner,
                                     update_interval_s=0)
     assert len(self.provider.non_terminated_nodes({})) == 0
     autoscaler.update()
     self.waitForNodes(2)
     autoscaler.update()
     self.waitForNodes(2)
Code Example #12
 def testResourceDemandVector(self):
     lm = LoadMetrics()
     lm.update("1.1.1.1", {"CPU": 2}, {"CPU": 1}, {},
               waiting_bundles=[{
                   "GPU": 1
               }],
               infeasible_bundles=[{
                   "CPU": 16
               }])
     assert same_elements(lm.get_resource_demand_vector(), [{
         "CPU": 16
     }, {
         "GPU": 1
     }])
Code Example #13
 def testPlacementGroupLoad(self):
     lm = LoadMetrics()
     pending_placement_groups = [
         PlacementGroupTableData(
             state=PlacementGroupTableData.RESCHEDULING,
             strategy=PlacementStrategy.PACK,
             bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
         PlacementGroupTableData(
             state=PlacementGroupTableData.RESCHEDULING,
             strategy=PlacementStrategy.SPREAD,
             bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
     ]
     lm.update("1.1.1.1", {}, {}, {},
               pending_placement_groups=pending_placement_groups)
     assert lm.get_pending_placement_groups() == pending_placement_groups
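The PlacementGroupTableData entries fed to LoadMetrics here describe placement groups that are still being (re)scheduled. A hedged sketch of how equivalent demand is created from the application side via the public placement group API, mirroring the two 2-GPU bundles used in the test:

    import ray
    from ray.util.placement_group import placement_group

    ray.init(address="auto")  # assumes a running autoscaling cluster
    # Two 2-GPU bundles packed onto as few nodes as possible; while no GPU nodes
    # exist, the group stays pending and is reported to the autoscaler as demand.
    pg = placement_group([{"GPU": 2}] * 2, strategy="PACK")
    ray.get(pg.ready())  # blocks until the required capacity has been provisioned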
Code Example #14
 def testCommandPassing(self):
     t = "custom"
     config = MULTI_WORKER_CLUSTER.copy()
     config["available_node_types"]["p2.8xlarge"][
         "worker_setup_commands"] = ["new_worker_setup_command"]
     config["available_node_types"]["p2.xlarge"][
         "initialization_commands"] = ["new_worker_initialization_cmd"]
     config["available_node_types"]["p2.xlarge"]["resources"][t] = 1
     # Commenting out this line causes the test case to fail?!?!
     config["min_workers"] = 0
     config["max_workers"] = 10
     config_path = self.write_config(config)
     self.provider = MockProvider()
     runner = MockProcessRunner()
     autoscaler = StandardAutoscaler(
         config_path,
         LoadMetrics(),
         max_failures=0,
         process_runner=runner,
         update_interval_s=0)
     assert len(self.provider.non_terminated_nodes({})) == 0
     autoscaler.update()
     self.waitForNodes(0)
     autoscaler.request_resources([{"CPU": 1}])
     autoscaler.update()
     self.waitForNodes(1)
     assert self.provider.mock_nodes[0].node_type == "m4.large"
     autoscaler.request_resources([{"GPU": 8}])
     autoscaler.update()
     self.waitForNodes(2)
     assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"
     autoscaler.request_resources([{"GPU": 1}] * 9)
     autoscaler.update()
     self.waitForNodes(3)
     assert self.provider.mock_nodes[2].node_type == "p2.xlarge"
     autoscaler.update()
     sleep(0.1)
     runner.assert_has_call(self.provider.mock_nodes[1].internal_ip,
                            "new_worker_setup_command")
     runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                                "setup_cmd")
     runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                                "worker_setup_cmd")
     runner.assert_has_call(self.provider.mock_nodes[2].internal_ip,
                            "new_worker_initialization_cmd")
     runner.assert_not_has_call(self.provider.mock_nodes[2].internal_ip,
                                "init_cmd")
Code Example #15
 def testUpdateConfig(self):
     config = MULTI_WORKER_CLUSTER.copy()
     config_path = self.write_config(config)
     self.provider = MockProvider()
     runner = MockProcessRunner()
     autoscaler = StandardAutoscaler(config_path,
                                     LoadMetrics(),
                                     max_failures=0,
                                     process_runner=runner,
                                     update_interval_s=0)
     assert len(self.provider.non_terminated_nodes({})) == 0
     autoscaler.update()
     self.waitForNodes(2)
     config["min_workers"] = 0
     config["available_node_types"]["m4.large"]["node_config"][
         "field_changed"] = 1
     config_path = self.write_config(config)
     autoscaler.update()
     self.waitForNodes(0)
Code Example #16
    def testResourcePassing(self):
        config = MULTI_WORKER_CLUSTER.copy()
        config["min_workers"] = 0
        config["max_workers"] = 50
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(
            config_path,
            LoadMetrics(),
            max_failures=0,
            process_runner=runner,
            update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(0)
        autoscaler.request_resources([{"CPU": 1}])
        autoscaler.update()
        self.waitForNodes(1)
        assert self.provider.mock_nodes[0].node_type == "m4.large"
        autoscaler.request_resources([{"GPU": 8}])
        autoscaler.update()
        self.waitForNodes(2)
        assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"

        # TODO (Alex): Autoscaler creates the node during one update then
        # starts the updater in the next update. The sleep is largely
        # unavoidable because the updater runs in its own thread and we have no
        # good way of ensuring that the commands are sent in time.
        autoscaler.update()
        sleep(0.1)

        # These checks are done separately because we have no guarantees on the
        # order the dict is serialized in.
        runner.assert_has_call("172.0.0.0", "RAY_OVERRIDE_RESOURCES=")
        runner.assert_has_call("172.0.0.0", "\"CPU\":2")
        runner.assert_has_call("172.0.0.1", "RAY_OVERRIDE_RESOURCES=")
        runner.assert_has_call("172.0.0.1", "\"CPU\":32")
        runner.assert_has_call("172.0.0.1", "\"GPU\":8")
Code Example #17
File: monitor.py Project: kaushikb11/ray
    def __init__(self,
                 redis_address,
                 autoscaling_config,
                 redis_password=None,
                 prefix_cluster_info=False,
                 monitor_ip=None,
                 stop_event: Optional[Event] = None):
        # Initialize the Redis clients.
        ray.state.state._initialize_global_state(
            redis_address, redis_password=redis_password)
        self.redis = ray._private.services.create_redis_client(
            redis_address, password=redis_password)
        if monitor_ip:
            self.redis.set("AutoscalerMetricsAddress",
                           f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}")
        (ip, port) = redis_address.split(":")
        # Initialize the gcs stub for getting all node resource usage.
        gcs_address = self.redis.get("GcsServerAddress").decode("utf-8")
        options = (("grpc.enable_http_proxy", 0), )
        gcs_channel = ray._private.utils.init_grpc_channel(
            gcs_address, options)
        # TODO: Use gcs client for this
        self.gcs_node_resources_stub = \
            gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel)
        self.gcs_node_info_stub = \
            gcs_service_pb2_grpc.NodeInfoGcsServiceStub(gcs_channel)

        # Set the redis client and mode so _internal_kv works for autoscaler.
        worker = ray.worker.global_worker
        worker.redis_client = self.redis
        gcs_client = GcsClient.create_from_redis(self.redis)
        _initialize_internal_kv(gcs_client)
        worker.mode = 0
        head_node_ip = redis_address.split(":")[0]
        self.redis_address = redis_address
        self.redis_password = redis_password
        if os.environ.get("RAY_FAKE_CLUSTER"):
            self.load_metrics = LoadMetrics(local_ip=FAKE_HEAD_NODE_ID)
        else:
            self.load_metrics = LoadMetrics(local_ip=head_node_ip)
        self.last_avail_resources = None
        self.event_summarizer = EventSummarizer()
        self.prefix_cluster_info = prefix_cluster_info
        # Can be used to signal graceful exit from monitor loop.
        self.stop_event = stop_event  # type: Optional[Event]
        self.autoscaling_config = autoscaling_config
        self.autoscaler = None
        # If set, we are in a manually created cluster (non-autoscaling) and
        # simply mirroring what the GCS tells us the cluster node types are.
        self.readonly_config = None

        self.prom_metrics = AutoscalerPrometheusMetrics()
        if monitor_ip and prometheus_client:
            # If monitor_ip wasn't passed in, then don't attempt to start the
            # metric server to keep behavior identical to before metrics were
            # introduced
            try:
                logger.info(
                    "Starting autoscaler metrics server on port {}".format(
                        AUTOSCALER_METRIC_PORT))
                prometheus_client.start_http_server(
                    port=AUTOSCALER_METRIC_PORT,
                    addr="127.0.0.1" if head_node_ip == "127.0.0.1" else "",
                    registry=self.prom_metrics.registry)
            except Exception:
                logger.exception(
                    "An exception occurred while starting the metrics server.")
        elif not prometheus_client:
            logger.warning("`prometheus_client` not found, so metrics will "
                           "not be exported.")

        logger.info("Monitor: Started")
Code Example #18
    def testDockerWorkers(self):
        config = MULTI_WORKER_CLUSTER.copy()
        config["available_node_types"]["p2.8xlarge"]["docker"] = {
            "worker_image": "p2.8x_image:latest",
            "worker_run_options": ["p2.8x-run-options"]
        }
        config["available_node_types"]["p2.xlarge"]["docker"] = {
            "worker_image": "p2x_image:nightly"
        }
        config["docker"]["worker_run_options"] = ["standard-run-options"]
        config["docker"]["image"] = "default-image:nightly"
        config["docker"]["worker_image"] = "default-image:nightly"
        # Commenting out this line causes the test case to fail?!?!
        config["min_workers"] = 0
        config["max_workers"] = 10
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        autoscaler = StandardAutoscaler(config_path,
                                        LoadMetrics(),
                                        max_failures=0,
                                        process_runner=runner,
                                        update_interval_s=0)
        assert len(self.provider.non_terminated_nodes({})) == 0
        autoscaler.update()
        self.waitForNodes(0)
        autoscaler.request_resources([{"CPU": 1}])
        autoscaler.update()
        self.waitForNodes(1)
        assert self.provider.mock_nodes[0].node_type == "m4.large"
        autoscaler.request_resources([{"GPU": 8}])
        autoscaler.update()
        self.waitForNodes(2)
        assert self.provider.mock_nodes[1].node_type == "p2.8xlarge"
        autoscaler.request_resources([{"GPU": 1}] * 9)
        autoscaler.update()
        self.waitForNodes(3)
        assert self.provider.mock_nodes[2].node_type == "p2.xlarge"
        autoscaler.update()
        # Fill up m4, p2.8, p2 and request 2 more CPUs
        autoscaler.request_resources([{
            "CPU": 2
        }, {
            "CPU": 16
        }, {
            "CPU": 32
        }, {
            "CPU": 2
        }])
        autoscaler.update()
        self.waitForNodes(4)
        assert self.provider.mock_nodes[3].node_type == "m4.16xlarge"
        autoscaler.update()
        sleep(0.1)
        runner.assert_has_call(self.provider.mock_nodes[1].internal_ip,
                               "p2.8x-run-options")
        runner.assert_has_call(self.provider.mock_nodes[1].internal_ip,
                               "p2.8x_image:latest")
        runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                                   "default-image:nightly")
        runner.assert_not_has_call(self.provider.mock_nodes[1].internal_ip,
                                   "standard-run-options")

        runner.assert_has_call(self.provider.mock_nodes[2].internal_ip,
                               "p2x_image:nightly")
        runner.assert_has_call(self.provider.mock_nodes[2].internal_ip,
                               "standard-run-options")
        runner.assert_not_has_call(self.provider.mock_nodes[2].internal_ip,
                                   "p2.8x-run-options")

        runner.assert_has_call(self.provider.mock_nodes[3].internal_ip,
                               "default-image:nightly")
        runner.assert_has_call(self.provider.mock_nodes[3].internal_ip,
                               "standard-run-options")
        runner.assert_not_has_call(self.provider.mock_nodes[3].internal_ip,
                                   "p2.8x-run-options")
        runner.assert_not_has_call(self.provider.mock_nodes[3].internal_ip,
                                   "p2x_image:nightly")
Code Example #19
    def __init__(
        self,
        address: str,
        autoscaling_config: Union[str, Callable[[], Dict[str, Any]]],
        redis_password: Optional[str] = None,
        prefix_cluster_info: bool = False,
        monitor_ip: Optional[str] = None,
        stop_event: Optional[Event] = None,
        retry_on_failure: bool = True,
    ):
        gcs_address = address
        options = (("grpc.enable_http_proxy", 0), )
        gcs_channel = ray._private.utils.init_grpc_channel(
            gcs_address, options)
        # TODO: Use gcs client for this
        self.gcs_node_resources_stub = (
            gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel))
        self.gcs_node_info_stub = gcs_service_pb2_grpc.NodeInfoGcsServiceStub(
            gcs_channel)
        if redis_password is not None:
            logger.warning("redis_password has been deprecated.")
        # Set the redis client and mode so _internal_kv works for autoscaler.
        worker = ray.worker.global_worker
        gcs_client = GcsClient(address=gcs_address)

        if monitor_ip:
            monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
            gcs_client.internal_kv_put(b"AutoscalerMetricsAddress",
                                       monitor_addr.encode(), True, None)
        _initialize_internal_kv(gcs_client)
        worker.mode = 0
        head_node_ip = gcs_address.split(":")[0]

        self.load_metrics = LoadMetrics()
        self.last_avail_resources = None
        self.event_summarizer = EventSummarizer()
        self.prefix_cluster_info = prefix_cluster_info
        # Can be used to signal graceful exit from monitor loop.
        self.stop_event = stop_event  # type: Optional[Event]
        self.retry_on_failure = retry_on_failure
        self.autoscaling_config = autoscaling_config
        self.autoscaler = None
        # If set, we are in a manually created cluster (non-autoscaling) and
        # simply mirroring what the GCS tells us the cluster node types are.
        self.readonly_config = None

        self.prom_metrics = AutoscalerPrometheusMetrics()
        if monitor_ip and prometheus_client:
            # If monitor_ip wasn't passed in, then don't attempt to start the
            # metric server to keep behavior identical to before metrics were
            # introduced
            try:
                logger.info(
                    "Starting autoscaler metrics server on port {}".format(
                        AUTOSCALER_METRIC_PORT))
                prometheus_client.start_http_server(
                    port=AUTOSCALER_METRIC_PORT,
                    addr="127.0.0.1" if head_node_ip == "127.0.0.1" else "",
                    registry=self.prom_metrics.registry,
                )
            except Exception:
                logger.exception(
                    "An exception occurred while starting the metrics server.")
        elif not prometheus_client:
            logger.warning(
                "`prometheus_client` not found, so metrics will not be exported."
            )

        logger.info("Monitor: Started")
Code Example #20
    def testPlacementGroup(self):
        # Note this is mostly an integration test. See
        # testPlacementGroupScaling for more comprehensive tests.
        config = copy.deepcopy(MULTI_WORKER_CLUSTER)
        config["min_workers"] = 0
        config["max_workers"] = 999
        config_path = self.write_config(config)
        self.provider = MockProvider()
        runner = MockProcessRunner()
        lm = LoadMetrics()
        autoscaler = StandardAutoscaler(config_path,
                                        lm,
                                        max_failures=0,
                                        process_runner=runner,
                                        update_interval_s=0)
        self.provider.create_node({}, {
            TAG_RAY_NODE_KIND: "head",
            TAG_RAY_USER_NODE_TYPE: "m4.4xlarge"
        }, 1)
        head_ip = self.provider.non_terminated_node_ips({})[0]
        assert len(self.provider.non_terminated_nodes({})) == 1
        autoscaler.update()
        self.waitForNodes(1)

        pending_placement_groups = [
            PlacementGroupTableData(
                state=PlacementGroupTableData.RESCHEDULING,
                strategy=PlacementStrategy.STRICT_SPREAD,
                bundles=[Bundle(unit_resources={"GPU": 2})] * 3),
            PlacementGroupTableData(
                state=PlacementGroupTableData.RESCHEDULING,
                strategy=PlacementStrategy.PACK,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 5)),
        ]
        # Since placement groups are implemented with custom resources, this is
        # an example of the accompanying resource demands. Note the resource
        # demand autoscaler will be unable to fulfill these demands, but we
        # should still handle the other infeasible/waiting bundles.
        placement_group_resource_demands = [{
            "GPU_group_0_6c2506ac733bc37496295b02c4fad446":
            0.0101,
            "GPU_group_6c2506ac733bc37496295b02c4fad446":
            0.0101
        }]
        lm.update(head_ip, {"CPU": 16},
                  True, {"CPU": 16},
                  False, {},
                  infeasible_bundles=placement_group_resource_demands,
                  waiting_bundles=[{
                      "GPU": 8
                  }],
                  pending_placement_groups=pending_placement_groups)
        autoscaler.update()
        self.waitForNodes(5)

        for i in range(1, 5):
            assert self.provider.mock_nodes[i].node_type == "p2.8xlarge"

        pending_placement_groups = [
            PlacementGroupTableData(
                state=PlacementGroupTableData.RESCHEDULING,
                strategy=PlacementStrategy.STRICT_PACK,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 4)),
            PlacementGroupTableData(
                state=PlacementGroupTableData.RESCHEDULING,
                strategy=PlacementStrategy.SPREAD,
                bundles=([Bundle(unit_resources={"GPU": 2})] * 2)),
        ]