Code Example #1
    def _update(self):
        now = time.time()
        # Throttle autoscaling updates to this interval to avoid exceeding
        # rate limits on API calls.
        if now - self.last_update_time < self.update_interval_s:
            return

        self.last_update_time = now
        self.update_worker_list()

        self.load_metrics.prune_active_ips([
            self.provider.internal_ip(node_id) for node_id in self.all_workers
        ])

        self.terminate_nodes_to_enforce_config_constraints(now)

        self.launch_required_nodes()

        if self.disable_node_updaters:
            self.terminate_unhealthy_nodes(now)
        else:
            self.process_completed_updates()
            self.update_nodes()
            self.attempt_to_recover_unhealthy_nodes(now)
            self.set_prometheus_updater_data()

        logger.info(self.info_string())
        legacy_log_info_string(self, self.workers)
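
All four variants below begin with the same throttling guard: `_update` returns immediately if it is called again within `update_interval_s` seconds of the previous run, so a caller may invoke it as often as it likes without hammering the cloud provider's API. The following is a minimal, self-contained sketch of just that pattern; `ThrottledUpdater` and its print statement are illustrative stand-ins, not part of Ray.

import time

class ThrottledUpdater:
    """Illustrative stand-in showing the throttle guard used by _update()."""

    def __init__(self, update_interval_s: float = 5.0):
        self.update_interval_s = update_interval_s
        self.last_update_time = 0.0

    def _update(self):
        now = time.time()
        # Return early if the previous round ran too recently.
        if now - self.last_update_time < self.update_interval_s:
            return
        self.last_update_time = now
        print("running one autoscaling round")

updater = ThrottledUpdater(update_interval_s=5.0)
for _ in range(3):
    updater._update()  # only the first call does work inside a 5 s window
    time.sleep(1)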
Code Example #2
File: autoscaler.py  Project: rlan/ray
    def _update(self):
        now = time.time()
        # Throttle autoscaling updates to this interval to avoid exceeding
        # rate limits on API calls.
        if now - self.last_update_time < self.update_interval_s:
            return

        self.last_update_time = now
        self.update_worker_list()

        self.load_metrics.prune_active_ips([
            self.provider.internal_ip(node_id) for node_id in self.all_workers
        ])

        if not self.provider.is_readonly():
            self.terminate_nodes_to_enforce_config_constraints(now)

        # Dict[NodeType, int], List[ResourceDict]
        to_launch, unfulfilled = (
            self.resource_demand_scheduler.get_nodes_to_launch(
                self.provider.non_terminated_nodes(tag_filters={}),
                self.pending_launches.breakdown(),
                self.load_metrics.get_resource_demand_vector(),
                self.load_metrics.get_resource_utilization(),
                self.load_metrics.get_pending_placement_groups(),
                self.load_metrics.get_static_node_resources_by_ip(),
                ensure_min_cluster_size=self.load_metrics.
                get_resource_requests()))
        self._report_pending_infeasible(unfulfilled)

        if not self.provider.is_readonly():
            self.launch_required_nodes(to_launch)

            if self.disable_node_updaters:
                self.terminate_unhealthy_nodes(now)
            else:
                self.process_completed_updates()
                self.update_nodes()
                self.attempt_to_recover_unhealthy_nodes(now)
                self.set_prometheus_updater_data()

        logger.info(self.info_string())
        legacy_log_info_string(self, self.workers)
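
Compared with example #1, this variant checks `self.provider.is_readonly()` so that a read-only node provider never launches or terminates anything, and it calls the resource demand scheduler directly. As the inline comment notes, `get_nodes_to_launch` returns a pair: a Dict[NodeType, int] of nodes to start and a List[ResourceDict] of demands that cannot be fulfilled. The sketch below only illustrates those two shapes; `launch` and `report_infeasible` are hypothetical placeholders, not Ray APIs.

from typing import Dict, List

NodeType = str
ResourceDict = Dict[str, float]

# Example shapes of the scheduler's return values (values made up).
to_launch: Dict[NodeType, int] = {"cpu_worker": 2, "gpu_worker": 1}
unfulfilled: List[ResourceDict] = [{"GPU": 8.0}]  # no node type can satisfy this

def launch(node_type: NodeType, count: int) -> None:
    print(f"launching {count} x {node_type}")

def report_infeasible(demands: List[ResourceDict]) -> None:
    for demand in demands:
        print(f"infeasible resource demand: {demand}")

for node_type, count in to_launch.items():
    launch(node_type, count)
report_infeasible(unfulfilled)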
Code Example #3
File: autoscaler.py  Project: zhe-thoughts/ray
    def _update(self):
        now = time.time()
        # Throttle autoscaling updates to this interval to avoid exceeding
        # rate limits on API calls.
        if now - self.last_update_time < self.update_interval_s:
            return

        self.last_update_time = now
        nodes = self.workers()

        self.load_metrics.prune_active_ips([
            self.provider.internal_ip(node_id)
            for node_id in self.all_workers()
        ])

        # Terminate any idle or out of date nodes
        last_used = self.load_metrics.last_used_time_by_ip
        horizon = now - (60 * self.config["idle_timeout_minutes"])

        nodes_to_terminate: List[NodeID] = []
        node_type_counts = collections.defaultdict(int)
        # Sort based on last used to make sure to keep min_workers that
        # were most recently used. Otherwise, _keep_min_worker_of_node_type
        # might keep a node that should be terminated.
        sorted_node_ids = self._sort_based_on_last_used(nodes, last_used)
        # Don't terminate nodes needed by request_resources()
        nodes_allowed_to_terminate: Dict[NodeID, bool] = {}
        if self.load_metrics.get_resource_requests():
            nodes_allowed_to_terminate = self._get_nodes_allowed_to_terminate(
                sorted_node_ids)

        for node_id in sorted_node_ids:
            # Make sure to not kill idle node types if the number of workers
            # of that type is lower/equal to the min_workers of that type
            # or it is needed for request_resources().
            if (self._keep_min_worker_of_node_type(node_id, node_type_counts)
                    or not nodes_allowed_to_terminate.get(
                        node_id, True)) and self.launch_config_ok(node_id):
                continue

            node_ip = self.provider.internal_ip(node_id)
            if node_ip in last_used and last_used[node_ip] < horizon:
                logger.info("StandardAutoscaler: "
                            "{}: Terminating idle node.".format(node_id))
                self.event_summarizer.add("Removing {} nodes of type " +
                                          self._get_node_type(node_id) +
                                          " (idle).",
                                          quantity=1,
                                          aggregate=operator.add)
                nodes_to_terminate.append(node_id)
            elif not self.launch_config_ok(node_id):
                logger.info("StandardAutoscaler: "
                            "{}: Terminating outdated node.".format(node_id))
                self.event_summarizer.add("Removing {} nodes of type " +
                                          self._get_node_type(node_id) +
                                          " (outdated).",
                                          quantity=1,
                                          aggregate=operator.add)
                nodes_to_terminate.append(node_id)

        if nodes_to_terminate:
            self.provider.terminate_nodes(nodes_to_terminate)
            nodes = self.workers()

        # Terminate nodes if there are too many
        nodes_to_terminate = []
        while (len(nodes) -
               len(nodes_to_terminate)) > self.config["max_workers"] and nodes:
            to_terminate = nodes.pop()
            logger.info("StandardAutoscaler: "
                        "{}: Terminating unneeded node.".format(to_terminate))
            self.event_summarizer.add("Removing {} nodes of type " +
                                      self._get_node_type(to_terminate) +
                                      " (max workers).",
                                      quantity=1,
                                      aggregate=operator.add)
            nodes_to_terminate.append(to_terminate)

        if nodes_to_terminate:
            self.provider.terminate_nodes(nodes_to_terminate)
            nodes = self.workers()

        to_launch = self.resource_demand_scheduler.get_nodes_to_launch(
            self.provider.non_terminated_nodes(tag_filters={}),
            self.pending_launches.breakdown(),
            self.load_metrics.get_resource_demand_vector(),
            self.load_metrics.get_resource_utilization(),
            self.load_metrics.get_pending_placement_groups(),
            self.load_metrics.get_static_node_resources_by_ip(),
            ensure_min_cluster_size=self.load_metrics.get_resource_requests())
        for node_type, count in to_launch.items():
            self.launch_new_node(count, node_type=node_type)

        nodes = self.workers()

        # Process any completed updates
        completed = []
        for node_id, updater in self.updaters.items():
            if not updater.is_alive():
                completed.append(node_id)
        if completed:
            nodes_to_terminate: List[NodeID] = []
            for node_id in completed:
                if self.updaters[node_id].exitcode == 0:
                    self.num_successful_updates[node_id] += 1
                    # Mark the node as active to prevent the node recovery
                    # logic immediately trying to restart Ray on the new node.
                    self.load_metrics.mark_active(
                        self.provider.internal_ip(node_id))
                else:
                    logger.error(f"StandardAutoscaler: {node_id}: Terminating "
                                 "failed to setup/initialize node.")
                    self.event_summarizer.add("Removing {} nodes of type " +
                                              self._get_node_type(node_id) +
                                              " (launch failed).",
                                              quantity=1,
                                              aggregate=operator.add)
                    nodes_to_terminate.append(node_id)
                    self.num_failed_updates[node_id] += 1
                del self.updaters[node_id]
            if nodes_to_terminate:
                self.provider.terminate_nodes(nodes_to_terminate)

            nodes = self.workers()

        # Update nodes with out-of-date files.
        # TODO(edoakes): Spawning these threads directly seems to cause
        # problems. They should at a minimum be spawned as daemon threads.
        # See https://github.com/ray-project/ray/pull/5903 for more info.
        T = []
        for node_id, commands, ray_start, docker_config in (
                self.should_update(node_id) for node_id in nodes):
            if node_id is not None:
                resources = self._node_resources(node_id)
                logger.debug(f"{node_id}: Starting new thread runner.")
                T.append(
                    threading.Thread(target=self.spawn_updater,
                                     args=(node_id, commands, ray_start,
                                           resources, docker_config)))
        for t in T:
            t.start()
        for t in T:
            t.join()

        # Attempt to recover unhealthy nodes
        for node_id in nodes:
            self.recover_if_needed(node_id, now)

        logger.info(self.info_string())
        legacy_log_info_string(self, nodes)
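
The idle check in this variant is a plain timestamp comparison: `horizon = now - 60 * idle_timeout_minutes`, and a node whose last-used time is older than that horizon is scheduled for termination. A small, self-contained sketch of just that comparison (with made-up IPs and timestamps) follows.

import time

idle_timeout_minutes = 5
now = time.time()
# Any node last used before this point in time counts as idle.
horizon = now - 60 * idle_timeout_minutes

last_used = {"10.0.0.1": now - 30, "10.0.0.2": now - 600}
for ip, timestamp in last_used.items():
    status = "idle" if timestamp < horizon else "active"
    print(ip, status)  # 10.0.0.1 -> active, 10.0.0.2 -> idle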
Code Example #4
File: autoscaler.py  Project: haochihlin/ray
    def _update(self):
        now = time.time()
        # Throttle autoscaling updates to this interval to avoid exceeding
        # rate limits on API calls.
        if now - self.last_update_time < self.update_interval_s:
            return

        self.last_update_time = now
        nodes = self.workers()

        self.load_metrics.prune_active_ips([
            self.provider.internal_ip(node_id)
            for node_id in self.all_workers()
        ])

        # Terminate any idle or out of date nodes
        last_used = self.load_metrics.last_used_time_by_ip
        horizon = now - (60 * self.config["idle_timeout_minutes"])

        nodes_to_terminate: List[NodeID] = []
        node_type_counts = defaultdict(int)
        # Sort based on last used to make sure to keep min_workers that
        # were most recently used. Otherwise, _keep_worker_of_node_type
        # might keep a node that should be terminated.
        sorted_node_ids = self._sort_based_on_last_used(nodes, last_used)

        # Don't terminate nodes needed by request_resources()
        nodes_not_allowed_to_terminate: FrozenSet[NodeID] = frozenset()
        if self.load_metrics.get_resource_requests():
            nodes_not_allowed_to_terminate = \
                self._get_nodes_needed_for_request_resources(sorted_node_ids)

        def keep_node(node_id: NodeID) -> None:
            # Update the per-type count for a node we decide to keep.
            tags = self.provider.node_tags(node_id)
            if TAG_RAY_USER_NODE_TYPE in tags:
                node_type = tags[TAG_RAY_USER_NODE_TYPE]
                node_type_counts[node_type] += 1

        def schedule_node_termination(node_id: NodeID,
                                      reason_opt: Optional[str]) -> None:
            if reason_opt is None:
                raise Exception("reason should be not None.")
            reason: str = reason_opt
            # Log, record an event, and add node_id to nodes_to_terminate.
            logger.info("StandardAutoscaler: "
                        "{}: Terminating {} node.".format(node_id, reason))
            self.event_summarizer.add("Removing {} nodes of type " +
                                      self._get_node_type(node_id) +
                                      " ({}).".format(reason),
                                      quantity=1,
                                      aggregate=operator.add)
            nodes_to_terminate.append(node_id)

        # Nodes that we could terminate, if needed.
        nodes_we_could_terminate: List[NodeID] = []

        for node_id in sorted_node_ids:
            # Make sure to not kill idle node types if the number of workers
            # of that type is lower/equal to the min_workers of that type
            # or it is needed for request_resources().
            should_keep_or_terminate, reason = self._keep_worker_of_node_type(
                node_id, node_type_counts)
            if should_keep_or_terminate == KeepOrTerminate.terminate:
                schedule_node_termination(node_id, reason)
                continue
            if ((should_keep_or_terminate == KeepOrTerminate.keep
                 or node_id in nodes_not_allowed_to_terminate)
                    and self.launch_config_ok(node_id)):
                keep_node(node_id)
                continue

            node_ip = self.provider.internal_ip(node_id)
            if node_ip in last_used and last_used[node_ip] < horizon:
                schedule_node_termination(node_id, "idle")
            elif not self.launch_config_ok(node_id):
                schedule_node_termination(node_id, "outdated")
            else:
                keep_node(node_id)
                nodes_we_could_terminate.append(node_id)

        # Terminate nodes if there are too many
        num_extra_nodes_to_terminate = (len(nodes) - len(nodes_to_terminate) -
                                        self.config["max_workers"])

        if num_extra_nodes_to_terminate > len(nodes_we_could_terminate):
            logger.warning(
                "StandardAutoscaler: trying to terminate "
                f"{num_extra_nodes_to_terminate} nodes, while only "
                f"{len(nodes_we_could_terminate)} are safe to terminate."
                " Inconsistent config is likely.")
            num_extra_nodes_to_terminate = len(nodes_we_could_terminate)

        # If num_extra_nodes_to_terminate is negative or zero,
        # we would have less than max_workers nodes after terminating
        # nodes_to_terminate and we do not need to terminate anything else.
        if num_extra_nodes_to_terminate > 0:
            extra_nodes_to_terminate = nodes_we_could_terminate[
                -num_extra_nodes_to_terminate:]
            for node_id in extra_nodes_to_terminate:
                schedule_node_termination(node_id, "max workers")

        if nodes_to_terminate:
            self._terminate_nodes_and_cleanup(nodes_to_terminate)
            nodes = self.workers()

        to_launch = self.resource_demand_scheduler.get_nodes_to_launch(
            self.provider.non_terminated_nodes(tag_filters={}),
            self.pending_launches.breakdown(),
            self.load_metrics.get_resource_demand_vector(),
            self.load_metrics.get_resource_utilization(),
            self.load_metrics.get_pending_placement_groups(),
            self.load_metrics.get_static_node_resources_by_ip(),
            ensure_min_cluster_size=self.load_metrics.get_resource_requests())
        for node_type, count in to_launch.items():
            self.launch_new_node(count, node_type=node_type)

        if to_launch:
            nodes = self.workers()

        # Process any completed updates
        completed_nodes = []
        for node_id, updater in self.updaters.items():
            if not updater.is_alive():
                completed_nodes.append(node_id)
        if completed_nodes:
            failed_nodes = []
            for node_id in completed_nodes:
                updater = self.updaters[node_id]
                if updater.exitcode == 0:
                    self.num_successful_updates[node_id] += 1
                    self.prom_metrics.successful_updates.inc()
                    if updater.for_recovery:
                        self.prom_metrics.successful_recoveries.inc()
                    if updater.update_time:
                        self.prom_metrics.worker_update_time.observe(
                            updater.update_time)
                    # Mark the node as active to prevent the node recovery
                    # logic immediately trying to restart Ray on the new node.
                    self.load_metrics.mark_active(
                        self.provider.internal_ip(node_id))
                else:
                    failed_nodes.append(node_id)
                    self.num_failed_updates[node_id] += 1
                    self.prom_metrics.failed_updates.inc()
                    if updater.for_recovery:
                        self.prom_metrics.failed_recoveries.inc()
                    self.node_tracker.untrack(node_id)
                del self.updaters[node_id]

            if failed_nodes:
                # Some nodes in failed_nodes may have been terminated
                # during an update (for being idle after missing a heartbeat).
                # Only terminate currently non terminated nodes.
                non_terminated_nodes = self.workers()
                nodes_to_terminate: List[NodeID] = []
                for node_id in failed_nodes:
                    if node_id in non_terminated_nodes:
                        nodes_to_terminate.append(node_id)
                        logger.error(f"StandardAutoscaler: {node_id}:"
                                     " Terminating. Failed to setup/initialize"
                                     " node.")
                        self.event_summarizer.add(
                            "Removing {} nodes of type " +
                            self._get_node_type(node_id) + " (launch failed).",
                            quantity=1,
                            aggregate=operator.add)
                    else:
                        logger.warning(f"StandardAutoscaler: {node_id}:"
                                       " Failed to update node."
                                       " Node has already been terminated.")
                if nodes_to_terminate:
                    self._terminate_nodes_and_cleanup(nodes_to_terminate)
                    nodes = self.workers()

        # Update nodes with out-of-date files.
        # TODO(edoakes): Spawning these threads directly seems to cause
        # problems. They should at a minimum be spawned as daemon threads.
        # See https://github.com/ray-project/ray/pull/5903 for more info.
        T = []
        for node_id, setup_commands, ray_start_commands, docker_config in (
                self.should_update(node_id) for node_id in nodes):
            if node_id is not None:
                resources = self._node_resources(node_id)
                logger.debug(f"{node_id}: Starting new thread runner.")
                T.append(
                    threading.Thread(target=self.spawn_updater,
                                     args=(node_id, setup_commands,
                                           ray_start_commands, resources,
                                           docker_config)))
        for t in T:
            t.start()
        for t in T:
            t.join()

        if self.disable_node_updaters:
            # If updaters are unavailable, terminate unhealthy nodes.
            nodes_to_terminate = self.get_unhealthy_nodes(nodes, now)
            if nodes_to_terminate:
                self._terminate_nodes_and_cleanup(nodes_to_terminate)
                nodes = self.workers()
        else:
            # Attempt to recover unhealthy nodes
            for node_id in nodes:
                self.recover_if_needed(node_id, now)

        self.prom_metrics.updating_nodes.set(len(self.updaters))
        num_recovering = 0
        for updater in self.updaters.values():
            if updater.for_recovery:
                num_recovering += 1
        self.prom_metrics.recovering_nodes.set(num_recovering)
        logger.info(self.info_string())
        legacy_log_info_string(self, nodes)
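
Throughout examples #3 and #4, node removals are recorded via `event_summarizer.add(...)` with `quantity=1` and `aggregate=operator.add`, so repeated removals of the same node type collapse into a single summary line. The class below is a simplified stand-in (not Ray's EventSummarizer) that reproduces that aggregation behavior: events are keyed by their message template, quantities are merged with the supplied aggregate function, and the `{}` placeholder is filled with the total when the summary is produced.

import operator
from typing import Callable, Dict, List

class TinyEventSummarizer:
    """Simplified stand-in for the aggregation pattern used above."""

    def __init__(self):
        self._counts: Dict[str, int] = {}

    def add(self, template: str, *, quantity: int,
            aggregate: Callable[[int, int], int]) -> None:
        # Merge quantities for events that share the same message template.
        if template in self._counts:
            self._counts[template] = aggregate(self._counts[template], quantity)
        else:
            self._counts[template] = quantity

    def summary(self) -> List[str]:
        return [template.format(count) for template, count in self._counts.items()]

summarizer = TinyEventSummarizer()
summarizer.add("Removing {} nodes of type cpu_worker (idle).",
               quantity=1, aggregate=operator.add)
summarizer.add("Removing {} nodes of type cpu_worker (idle).",
               quantity=1, aggregate=operator.add)
print(summarizer.summary())  # ['Removing 2 nodes of type cpu_worker (idle).']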