Example #1
    def _setup_autoscaler(self):
        self.runner = MockProcessRunner()
        with open(self.config_path) as f:
            self.config = yaml.safe_load(f.read())

        self.provider.create_node(
            {},
            {
                TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
                TAG_RAY_USER_NODE_TYPE: self.config["head_node_type"],
            },
            1,
        )
        self.head_ip = self.provider.non_terminated_node_ips({})[0]

        self.load_metrics = LoadMetrics(local_ip=self.head_ip)
        self.autoscaler = StandardAutoscaler(
            self.config_path,
            self.load_metrics,
            # Don't let the autoscaler start any node launchers. Instead, we
            # will launch nodes ourselves after every update call.
            max_concurrent_launches=0,
            max_failures=0,
            process_runner=self.runner,
            update_interval_s=0,
        )

        # Manually create a node launcher. Note that we won't start it as a
        # separate thread.
        self.node_launcher = NodeLauncher(
            provider=self.autoscaler.provider,
            queue=self.autoscaler.launch_queue,
            index=0,
            pending=self.autoscaler.pending_launches,
            node_types=self.autoscaler.available_node_types,
        )
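Because the launcher above is never started as a thread, its queue has to be drained by hand after each autoscaler.update() call. A minimal sketch of that manual drain (it mirrors the simulator's _launch_nodes helper shown in Example #4; the name drain_launch_queue is illustrative, not part of the autoscaler API):

def drain_launch_queue(node_launcher):
    # Drain every launch request the autoscaler queued during update().
    # This runs serially after the update, so checking queue.empty() is safe.
    while not node_launcher.queue.empty():
        config, count, node_type = node_launcher.queue.get()
        try:
            node_launcher._launch_node(config, count, node_type)
        except Exception:
            # Failed launches are dropped; the pending counter is still reset.
            pass
        finally:
            node_launcher.pending.dec(node_type, count)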
Example #2
    def __init__(self,
                 config_path,
                 load_metrics,
                 max_launch_batch=AUTOSCALER_MAX_LAUNCH_BATCH,
                 max_concurrent_launches=AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
                 max_failures=AUTOSCALER_MAX_NUM_FAILURES,
                 process_runner=subprocess,
                 update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S):
        self.config_path = config_path
        # Keep this before self.reset (self.provider needs to be created
        # exactly once).
        self.provider = None
        self.resource_demand_scheduler = None
        self.reset(errors_fatal=True)
        self.load_metrics = load_metrics

        self.max_failures = max_failures
        self.max_launch_batch = max_launch_batch
        self.max_concurrent_launches = max_concurrent_launches
        self.process_runner = process_runner

        # Map from node_id to NodeUpdater threads
        self.updaters = {}
        self.num_failed_updates = defaultdict(int)
        self.num_successful_updates = defaultdict(int)
        self.num_failures = 0
        self.last_update_time = 0.0
        self.update_interval_s = update_interval_s

        # Node launchers
        self.launch_queue = queue.Queue()
        self.pending_launches = ConcurrentCounter()
        max_batches = math.ceil(max_concurrent_launches /
                                float(max_launch_batch))
        for i in range(int(max_batches)):
            node_launcher = NodeLauncher(
                provider=self.provider,
                queue=self.launch_queue,
                index=i,
                pending=self.pending_launches,
                node_types=self.available_node_types,
            )
            node_launcher.daemon = True
            node_launcher.start()

        # Expand local file_mounts to allow ~ in the paths. This can't be done
        # earlier when the config is written since we might be on a different
        # platform and the expansion would result in a wrong path.
        self.config["file_mounts"] = {
            remote: os.path.expanduser(local)
            for remote, local in self.config["file_mounts"].items()
        }

        for local_path in self.config["file_mounts"].values():
            assert os.path.exists(local_path)

        # List of resource bundles the user is requesting of the cluster.
        self.resource_demand_vector = []

        logger.info("StandardAutoscaler: {}".format(self.config))
Example #3
    def __init__(
            self,
            config_path: str,
            load_metrics: LoadMetrics,
            max_launch_batch: int = AUTOSCALER_MAX_LAUNCH_BATCH,
            max_concurrent_launches: int = AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
            max_failures: int = AUTOSCALER_MAX_NUM_FAILURES,
            process_runner: Any = subprocess,
            update_interval_s: int = AUTOSCALER_UPDATE_INTERVAL_S,
            prefix_cluster_info: bool = False,
            event_summarizer: Optional[EventSummarizer] = None,
            prom_metrics: Optional[AutoscalerPrometheusMetrics] = None):
        """Create a StandardAutoscaler.

        Args:
            config_path: Path to a Ray Autoscaler YAML.
            load_metrics: Provides metrics for the Ray cluster.
            max_launch_batch: Max number of nodes to launch in one request.
            max_concurrent_launches: Max number of nodes that can be
                concurrently launched. This value and `max_launch_batch`
                determine the number of batches that are used to launch nodes.
            max_failures: Number of failures that the autoscaler will tolerate
                before exiting.
            process_runner: Subprocess-like interface used by the
                CommandRunner.
            update_interval_s: Seconds between running the autoscaling loop.
            prefix_cluster_info: Whether to add the cluster name to info
                strings.
            event_summarizer: Utility to consolidate duplicated messages.
            prom_metrics: Prometheus metrics for autoscaler-related operations.
        """

        self.config_path = config_path
        # Prefix each line of info string with cluster name if True
        self.prefix_cluster_info = prefix_cluster_info
        # Keep this before self.reset (self.provider needs to be created
        # exactly once).
        self.provider = None
        # Keep this before self.reset (if an exception occurs in reset
        # then prom_metrics must be instantiated to increment the
        # exception counter)
        self.prom_metrics = prom_metrics or \
            AutoscalerPrometheusMetrics()
        self.resource_demand_scheduler = None
        self.reset(errors_fatal=True)
        self.head_node_ip = load_metrics.local_ip
        self.load_metrics = load_metrics

        self.max_failures = max_failures
        self.max_launch_batch = max_launch_batch
        self.max_concurrent_launches = max_concurrent_launches
        self.process_runner = process_runner
        self.event_summarizer = event_summarizer or EventSummarizer()

        # Map from node_id to NodeUpdater threads
        self.updaters = {}
        self.num_failed_updates = defaultdict(int)
        self.num_successful_updates = defaultdict(int)
        self.num_failures = 0
        self.last_update_time = 0.0
        self.update_interval_s = update_interval_s

        # Tracks active worker nodes
        self.workers = []
        # Tracks nodes scheduled for termination
        self.nodes_to_terminate = []

        # Disable NodeUpdater threads if true.
        # Should be set to true in situations where another component, such as
        # a Kubernetes operator, is responsible for Ray setup on nodes.
        self.disable_node_updaters = self.config["provider"].get(
            "disable_node_updaters", False)

        # Node launchers
        self.launch_queue = queue.Queue()
        self.pending_launches = ConcurrentCounter()
        max_batches = math.ceil(max_concurrent_launches /
                                float(max_launch_batch))
        for i in range(int(max_batches)):
            node_launcher = NodeLauncher(provider=self.provider,
                                         queue=self.launch_queue,
                                         index=i,
                                         pending=self.pending_launches,
                                         node_types=self.available_node_types,
                                         prom_metrics=self.prom_metrics)
            node_launcher.daemon = True
            node_launcher.start()

        # NodeTracker maintains soft state to track the number of recently
        # failed nodes. It is best effort only.
        self.node_tracker = NodeTracker()

        # Expand local file_mounts to allow ~ in the paths. This can't be done
        # earlier when the config is written since we might be on a different
        # platform and the expansion would result in a wrong path.
        self.config["file_mounts"] = {
            remote: os.path.expanduser(local)
            for remote, local in self.config["file_mounts"].items()
        }

        for local_path in self.config["file_mounts"].values():
            assert os.path.exists(local_path)
        logger.info("StandardAutoscaler: {}".format(self.config))
Example #4
class Simulator:
    """This autoscaler simulator consists of a few components.

    State is stored in 3 main data structures:
        * Resource management state is stored in self.ip_to_nodes
        * The scheduler's work queue is stored in self.work_queue
        * An event queue which acts as the simulation's "timeline" in
          self.event_queue


    The logic is organized into 3 functions (and their helpers):
        * self.run_autoscaler plays the role of `monitor.py` and translates
          resource management state for load_metrics to consume.
        * self.schedule is the only consumer of the work queue. It dispatches
          work to the appropriate schedulers, which mutate cluster state and
          produce events for the event queue.
        * self.process_event is the sole consumer of the event queue. It
          dispatches work to the appropriate event handlers.

    There are 3 main ways of interacting with the simulator:
        * simulator.submit: To submit tasks
        * simulator.step: To go to the next "event"
        * task/actor/placement group start/done callbacks

    """
    def __init__(
        self,
        config_path,
        provider,
        autoscaler_update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S,
        node_startup_delay_s=120,
    ):
        self.config_path = config_path
        self.provider = provider
        self.autoscaler_update_interval_s = autoscaler_update_interval_s
        self.node_startup_delay_s = node_startup_delay_s

        self._setup_autoscaler()
        self._setup_simulator()

    def _setup_autoscaler(self):
        self.runner = MockProcessRunner()
        with open(self.config_path) as f:
            self.config = yaml.safe_load(f.read())

        self.provider.create_node(
            {},
            {
                TAG_RAY_NODE_KIND: NODE_KIND_HEAD,
                TAG_RAY_USER_NODE_TYPE: self.config["head_node_type"],
            },
            1,
        )
        self.head_ip = self.provider.non_terminated_node_ips({})[0]

        self.load_metrics = LoadMetrics(local_ip=self.head_ip)
        self.autoscaler = StandardAutoscaler(
            self.config_path,
            self.load_metrics,
            # Don't let the autoscaler start any node launchers. Instead, we
            # will launch nodes ourselves after every update call.
            max_concurrent_launches=0,
            max_failures=0,
            process_runner=self.runner,
            update_interval_s=0,
        )

        # Manually create a node launcher. Note that we won't start it as a
        # separate thread.
        self.node_launcher = NodeLauncher(
            provider=self.autoscaler.provider,
            queue=self.autoscaler.launch_queue,
            index=0,
            pending=self.autoscaler.pending_launches,
            node_types=self.autoscaler.available_node_types,
        )

    def _setup_simulator(self):
        self.virtual_time = 0
        self.ip_to_nodes = {}
        self._update_cluster_state(join_immediately=True)

        self.work_queue = []
        self.event_queue = PriorityQueue()
        self.event_queue.put(Event(0, SIMULATOR_EVENT_AUTOSCALER_UPDATE))

    def _update_cluster_state(self, join_immediately=False):
        nodes = self.provider.non_terminated_nodes(tag_filters={})
        for node_id in nodes:
            ip = self.provider.internal_ip(node_id)
            if ip in self.ip_to_nodes:
                continue
            node_tags = self.provider.node_tags(node_id)
            if TAG_RAY_USER_NODE_TYPE in node_tags:
                node_type = node_tags[TAG_RAY_USER_NODE_TYPE]
                resources = self.config["available_node_types"][node_type].get(
                    "resources", {})
                node = Node(resources, join_immediately, node_type,
                            self.virtual_time)
                self.ip_to_nodes[ip] = node
                if not join_immediately:
                    join_time = self.virtual_time + self.node_startup_delay_s
                    self.event_queue.put(
                        Event(join_time, SIMULATOR_EVENT_NODE_JOINED, node))

    def submit(self, work):
        if isinstance(work, list):
            self.work_queue.extend(work)
        else:
            self.work_queue.append(work)

    def _get_node_to_run(self, bundle, nodes):
        for ip, node in nodes.items():
            if node.bundle_fits(bundle):
                return ip, node
        return None, None

    def _schedule_placement_group(self, pg, nodes):
        # This scheduling algorithm is bad, but it is approximately as bad as
        # the real placement group scheduler.
        to_allocate = []
        if (pg.strategy == PlacementStrategy.STRICT_PACK
                or pg.strategy == PlacementStrategy.PACK):
            combined = collections.defaultdict(float)
            for bundle in pg.bundles:
                for k, v in bundle.items():
                    combined[k] += v
            ip, node_to_run = self._get_node_to_run(combined, nodes)
            if node_to_run is None:
                return False
            to_allocate.append((combined, ip))
        elif (pg.strategy == PlacementStrategy.STRICT_SPREAD
              or pg.strategy == PlacementStrategy.SPREAD):
            # TODO (Alex): More accurate handling of non-STRICT_PACK groups.
            remaining_nodes = nodes.copy()
            for bundle in pg.bundles:
                ip, node_to_run = self._get_node_to_run(
                    bundle, remaining_nodes)
                if node_to_run is None:
                    return False
                del remaining_nodes[ip]
                to_allocate.append((bundle, ip))

        for bundle, ip in to_allocate:
            node = self.ip_to_nodes[ip]
            node.allocate(bundle)
        pg.start_time = self.virtual_time
        end_time = self.virtual_time + pg.duration
        self.event_queue.put(
            Event(end_time, SIMULATOR_EVENT_PG_DONE, (pg, to_allocate)))
        if pg.start_callback:
            pg.start_callback()
        return True

    def _schedule_task(self, task, nodes):
        ip, node = self._get_node_to_run(task.resources, nodes)
        if node is None:
            return False

        node.allocate(task.resources)
        task.node = node
        task.start_time = self.virtual_time
        end_time = self.virtual_time + task.duration
        self.event_queue.put(Event(end_time, SIMULATOR_EVENT_TASK_DONE, task))
        if task.start_callback:
            task.start_callback()
        return True

    def schedule(self):
        # TODO (Alex): Implement a more realistic scheduling algorithm.
        new_work_queue = []
        for work in self.work_queue:
            if isinstance(work, Task):
                scheduled = self._schedule_task(work, self.ip_to_nodes)
            elif isinstance(work, PlacementGroup):
                scheduled = self._schedule_placement_group(
                    work, self.ip_to_nodes)
            else:
                assert False, "Unknown work object!"

            if scheduled is False:
                new_work_queue.append(work)
        self.work_queue = new_work_queue

    def _launch_nodes(self):
        """Launch all queued nodes. Since this will be run serially after
        `autoscaler.update` there are no race conditions in checking if the
        queue is empty.
        """
        while not self.node_launcher.queue.empty():
            config, count, node_type = self.node_launcher.queue.get()
            try:
                self.node_launcher._launch_node(config, count, node_type)
            except Exception:
                pass
            finally:
                self.node_launcher.pending.dec(node_type, count)

    def _infeasible(self, bundle):
        for node in self.ip_to_nodes.values():
            if node.feasible(bundle):
                return False
        return True

    def run_autoscaler(self):
        waiting_bundles = []
        infeasible_bundles = []
        placement_groups = []
        for work in self.work_queue:
            if isinstance(work, Task):
                shape = work.resources
                if self._infeasible(shape):
                    infeasible_bundles.append(shape)
                else:
                    waiting_bundles.append(shape)
            if isinstance(work, PlacementGroup):
                placement_groups.append(
                    PlacementGroupTableData(
                        state=PlacementGroupTableData.PENDING,
                        strategy=work.strategy,
                        bundles=[
                            Bundle(unit_resources=bundle)
                            for bundle in work.bundles
                        ],
                    ))

        for ip, node in self.ip_to_nodes.items():
            if not node.in_cluster:
                continue
            self.load_metrics.update(
                ip=ip,
                static_resources=node.total_resources,
                dynamic_resources=node.available_resources,
                resource_load={},
                waiting_bundles=waiting_bundles,
                infeasible_bundles=infeasible_bundles,
                pending_placement_groups=placement_groups,
            )

        self.autoscaler.update()
        self._launch_nodes()
        self._update_cluster_state()

    def process_event(self, event):
        if event.event_type == SIMULATOR_EVENT_AUTOSCALER_UPDATE:
            self.run_autoscaler()
            next_update = self.virtual_time + self.autoscaler_update_interval_s
            self.event_queue.put(
                Event(next_update, SIMULATOR_EVENT_AUTOSCALER_UPDATE))
        elif event.event_type == SIMULATOR_EVENT_TASK_DONE:
            task = event.data
            task.node.free(task.resources)
            if task.done_callback:
                task.done_callback()
        elif event.event_type == SIMULATOR_EVENT_NODE_JOINED:
            node = event.data
            node.in_cluster = True
        elif event.event_type == SIMULATOR_EVENT_PG_DONE:
            pg, allocated = event.data
            for bundle, ip in allocated:
                self.ip_to_nodes[ip].free(bundle)
            if pg.done_callback:
                pg.done_callback()
        else:
            assert False, "Unknown event!"

    def step(self):
        self.virtual_time = self.event_queue.queue[0].time
        while self.event_queue.queue[0].time == self.virtual_time:
            event = self.event_queue.get()
            self.process_event(event)
        self.schedule()
        print(self.info_string())
        return self.virtual_time

    def node_costs(self):
        """Returns the cost of nodes. Cost is measured in terms of cumulative hours of
        runtime per node type.

        """
        costs = collections.defaultdict(float)

        for node in self.ip_to_nodes.values():
            if not node.in_cluster:
                continue
            runtime = self.virtual_time - node.start_time
            costs[node.node_type] += runtime
        return costs

    def info_string(self):
        num_connected_nodes = len(
            [node for node in self.ip_to_nodes.values() if node.in_cluster])
        num_pending_nodes = len(self.ip_to_nodes) - num_connected_nodes
        return (f"[t={self.virtual_time}] "
                f"Connected: {num_connected_nodes}, "
                f"Pending: {num_pending_nodes}, "
                f"Remaining: {len(self.work_queue)}")
Example #5
    def __init__(self,
                 config_path,
                 load_metrics,
                 max_launch_batch=AUTOSCALER_MAX_LAUNCH_BATCH,
                 max_concurrent_launches=AUTOSCALER_MAX_CONCURRENT_LAUNCHES,
                 max_failures=AUTOSCALER_MAX_NUM_FAILURES,
                 process_runner=subprocess,
                 update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S,
                 prefix_cluster_info=False,
                 event_summarizer=None,
                 prom_metrics=None):
        self.config_path = config_path
        # Prefix each line of info string with cluster name if True
        self.prefix_cluster_info = prefix_cluster_info
        # Keep this before self.reset (self.provider needs to be created
        # exactly once).
        self.provider = None
        # Keep this before self.reset (if an exception occurs in reset
        # then prom_metrics must be instantiated to increment the
        # exception counter)
        self.prom_metrics = prom_metrics or \
            AutoscalerPrometheusMetrics()
        self.resource_demand_scheduler = None
        self.reset(errors_fatal=True)
        self.head_node_ip = load_metrics.local_ip
        self.load_metrics = load_metrics

        self.max_failures = max_failures
        self.max_launch_batch = max_launch_batch
        self.max_concurrent_launches = max_concurrent_launches
        self.process_runner = process_runner
        self.event_summarizer = event_summarizer or EventSummarizer()

        # Map from node_id to NodeUpdater threads
        self.updaters = {}
        self.num_failed_updates = defaultdict(int)
        self.num_successful_updates = defaultdict(int)
        self.num_failures = 0
        self.last_update_time = 0.0
        self.update_interval_s = update_interval_s

        # Node launchers
        self.launch_queue = queue.Queue()
        self.pending_launches = ConcurrentCounter()
        max_batches = math.ceil(max_concurrent_launches /
                                float(max_launch_batch))
        for i in range(int(max_batches)):
            node_launcher = NodeLauncher(provider=self.provider,
                                         queue=self.launch_queue,
                                         index=i,
                                         pending=self.pending_launches,
                                         node_types=self.available_node_types,
                                         prom_metrics=self.prom_metrics)
            node_launcher.daemon = True
            node_launcher.start()

        # NodeTracker maintains soft state to track the number of recently
        # failed nodes. It is best effort only.
        self.node_tracker = NodeTracker()

        # Expand local file_mounts to allow ~ in the paths. This can't be done
        # earlier when the config is written since we might be on a different
        # platform and the expansion would result in a wrong path.
        self.config["file_mounts"] = {
            remote: os.path.expanduser(local)
            for remote, local in self.config["file_mounts"].items()
        }

        for local_path in self.config["file_mounts"].values():
            assert os.path.exists(local_path)
        logger.info("StandardAutoscaler: {}".format(self.config))
Example #6
class Simulator:
    def __init__(
        self,
        config_path,
        provider,
        autoscaler_update_interval_s=AUTOSCALER_UPDATE_INTERVAL_S,
        node_startup_delay_s=120,
    ):
        self.config_path = config_path
        self.provider = provider
        self.autoscaler_update_interval_s = autoscaler_update_interval_s
        self.node_startup_delay_s = node_startup_delay_s

        self._setup_autoscaler()
        self._setup_simulator()

    def _setup_autoscaler(self):
        self.runner = MockProcessRunner()
        with open(self.config_path) as f:
            self.config = yaml.safe_load(f.read())

        self.provider.create_node(
            {},
            {
                TAG_RAY_NODE_KIND: "head",
                TAG_RAY_USER_NODE_TYPE: self.config["head_node_type"],
            },
            1,
        )
        self.head_ip = self.provider.non_terminated_node_ips({})[0]

        self.load_metrics = LoadMetrics(local_ip=self.head_ip)
        self.autoscaler = StandardAutoscaler(
            self.config_path,
            self.load_metrics,
            # Don't let the autoscaler start any node launchers. Instead, we
            # will launch nodes ourselves after every update call.
            max_concurrent_launches=0,
            max_failures=0,
            process_runner=self.runner,
            update_interval_s=0,
        )

        # Manually create a node launcher. Note that we won't start it as a
        # separate thread.
        self.node_launcher = NodeLauncher(
            provider=self.autoscaler.provider,
            queue=self.autoscaler.launch_queue,
            index=0,
            pending=self.autoscaler.pending_launches,
            node_types=self.autoscaler.available_node_types,
        )

    def _setup_simulator(self):
        self.virtual_time = 0
        self.ip_to_nodes = {}
        self._update_cluster_state(join_immediately=True)

        self.work_queue = []
        self.event_queue = PriorityQueue()
        self.event_queue.put(Event(0, SIMULATOR_EVENT_AUTOSCALER_UPDATE))

    def _update_cluster_state(self, join_immediately=False):
        nodes = self.provider.non_terminated_nodes(tag_filters={})
        for node_id in nodes:
            ip = self.provider.internal_ip(node_id)
            if ip in self.ip_to_nodes:
                continue
            node_tags = self.provider.node_tags(node_id)
            if TAG_RAY_USER_NODE_TYPE in node_tags:
                node_type = node_tags[TAG_RAY_USER_NODE_TYPE]
                resources = self.config["available_node_types"][node_type].get(
                    "resources", {})
                node = Node(resources, join_immediately)
                self.ip_to_nodes[ip] = node
                if not join_immediately:
                    join_time = self.virtual_time + self.node_startup_delay_s
                    self.event_queue.put(
                        Event(join_time, SIMULATOR_EVENT_NODE_JOINED, node))

    def submit(self, work):
        if isinstance(work, list):
            self.work_queue.extend(work)
        else:
            self.work_queue.append(work)

    def _schedule_task(self, task):
        for ip, node in self.ip_to_nodes.items():
            if node.bundle_fits(task.resources):
                node.allocate(task.resources)
                task.node = node
                task.start_time = self.virtual_time
                end_time = self.virtual_time + task.duration
                self.event_queue.put(
                    Event(end_time, SIMULATOR_EVENT_TASK_DONE, task))
                if task.start_callback:
                    task.start_callback()
                return True

        return False

    def schedule(self):
        # TODO (Alex): Implement a more realistic scheduling algorithm.
        new_work_queue = []
        for work in self.work_queue:
            scheduled = False
            if isinstance(work, Task):
                scheduled = self._schedule_task(work)

            if scheduled is False:
                new_work_queue.append(work)
        self.work_queue = new_work_queue

    def _launch_nodes(self):
        """Launch all queued nodes. Since this will be run serially after
        `autoscaler.update` there are no race conditions in checking if the
        queue is empty.
        """
        while not self.node_launcher.queue.empty():
            config, count, node_type = self.node_launcher.queue.get()
            try:
                self.node_launcher._launch_node(config, count, node_type)
            except Exception:
                pass
            finally:
                self.node_launcher.pending.dec(node_type, count)

    def _infeasible(self, bundle):
        for node in self.ip_to_nodes.values():
            if node.feasible(bundle):
                return False
        return True

    def run_autoscaler(self):
        waiting_bundles = []
        infeasible_bundles = []
        for work in self.work_queue:
            if isinstance(work, Task):
                shape = work.resources
                if self._infeasible(shape):
                    infeasible_bundles.append(shape)
                else:
                    waiting_bundles.append(shape)

        for ip, node in self.ip_to_nodes.items():
            if not node.in_cluster:
                continue
            self.load_metrics.update(
                ip=ip,
                static_resources=node.total_resources,
                update_dynamic_resources=True,
                dynamic_resources=node.available_resources,
                update_resource_load=False,
                resource_load={},
                waiting_bundles=waiting_bundles,
                infeasible_bundles=infeasible_bundles,
                pending_placement_groups=[],
            )

        self.autoscaler.update()
        self._launch_nodes()
        self._update_cluster_state()

    def process_event(self, event):
        if event.event_type == SIMULATOR_EVENT_AUTOSCALER_UPDATE:
            self.run_autoscaler()
            next_update = self.virtual_time + self.autoscaler_update_interval_s
            self.event_queue.put(
                Event(next_update, SIMULATOR_EVENT_AUTOSCALER_UPDATE))
        elif event.event_type == SIMULATOR_EVENT_TASK_DONE:
            task = event.data
            task.node.free(task.resources)
            if task.done_callback:
                task.done_callback()
        elif event.event_type == SIMULATOR_EVENT_NODE_JOINED:
            node = event.data
            node.in_cluster = True

    def step(self):
        self.virtual_time = self.event_queue.queue[0].time
        while self.event_queue.queue[0].time == self.virtual_time:
            event = self.event_queue.get()
            self.process_event(event)
        self.schedule()
        print(self.info_string())
        return self.virtual_time

    def info_string(self):
        return (f"[t={self.virtual_time}] Nodes: {len(self.ip_to_nodes)} " +
                f"Remaining requests: {len(self.work_queue)} ")