Esempio n. 1
0
    def destroy_autoscaler_workers(self):
        """Tear down the autoscaler's worker nodes after run() has failed.

        We kill the worker nodes, but retain the head node in order to keep
        logs around, keeping costs minimal. This monitor process runs on the
        head node anyway, so this is more reliable.

        Does nothing when there is no autoscaler, and only logs an error when
        the autoscaling config is missing. Otherwise the teardown is retried
        every two seconds until it succeeds; the number of attempts is not
        bounded.
        """
        if self.autoscaler is None:
            return  # Nothing to clean up.

        if self.autoscaling_config is None:
            # This is a logic error in the program. Can't do anything.
            logger.error(
                "Monitor: Cleanup failed due to lack of autoscaler config.")
            return

        logger.info("Monitor: Exception caught. Taking down workers...")
        while True:
            try:
                teardown_cluster(
                    config_file=self.autoscaling_config,
                    yes=True,  # Non-interactive.
                    workers_only=True,  # Retain head node for logs.
                    override_cluster_name=None,
                    keep_min_workers=True,  # Retain minimal amount of workers.
                )
            except Exception:
                # Log at ERROR level *with* the traceback: a cleanup that
                # keeps failing is otherwise impossible to diagnose, since
                # this loop never gives up and never re-raises.
                logger.exception(
                    "Monitor: Cleanup exception. Trying again...")
                time.sleep(2)
            else:
                logger.info("Monitor: Workers taken down.")
                return
Esempio n. 2
0
def down(
    cluster_config_file,
    yes,
    workers_only,
    cluster_name,
    keep_min_workers,
    log_style,
    log_color,
    verbose,
):
    """Tear down a Ray cluster."""
    # The three logging options only affect how the CLI logger renders
    # output; they are consumed here and not forwarded any further.
    cli_logger.configure(log_style, log_color, verbose)

    # Everything else is handed to teardown_cluster positionally, in exactly
    # this order.
    teardown_args = (
        cluster_config_file,
        yes,
        workers_only,
        cluster_name,
        keep_min_workers,
    )
    teardown_cluster(*teardown_args)
Esempio n. 3
0
 def _teardown(self) -> None:
     """Tear down the cluster described by ``self.config_path``.

     The call is made with confirmation pre-answered (``yes=True``), without
     restricting the teardown to workers (``workers_only=False``), without a
     cluster-name override and without keeping ``min_workers`` alive.
     """
     teardown_options = dict(
         yes=True,
         workers_only=False,
         override_cluster_name=None,
         keep_min_workers=False,
     )
     commands.teardown_cluster(self.config_path, **teardown_options)
Esempio n. 4
0
def teardown_cluster(cluster_config: Union[dict, str]) -> None:
    """Destroy every node of the Ray cluster described by ``cluster_config``.

    The config is first converted with ``_as_config_file`` and the result is
    passed to ``commands.teardown_cluster`` with confirmation pre-answered,
    no cluster-name override, and neither the head node nor ``min_workers``
    exempted from the teardown.

    Args:
        cluster_config (Union[str, dict]): Either the config dict of the
            cluster, or a path pointing to a file containing the config.

    Returns:
        Whatever ``commands.teardown_cluster`` returns (annotated as None).
    """
    result = commands.teardown_cluster(
        config_file=_as_config_file(cluster_config),
        yes=True,
        workers_only=False,
        override_cluster_name=None,
        keep_min_workers=False,
    )
    return result
Esempio n. 5
0
def teardown_cluster(cluster_config: Union[dict, str],
                     workers_only: bool = False,
                     keep_min_workers: bool = False) -> None:
    """Destroy the nodes of the Ray cluster described by ``cluster_config``.

    ``_as_config_file`` is used as a context manager that yields the config
    file handed to ``commands.teardown_cluster``; the teardown runs while
    that context is still open.

    Args:
        cluster_config (Union[str, dict]): Either the config dict of the
            cluster, or a path pointing to a file containing the config.
        workers_only (bool): Whether to keep the head node running and only
            teardown worker nodes.
        keep_min_workers (bool): Whether to keep min_workers (as specified
            in the YAML) still running.

    Returns:
        Whatever ``commands.teardown_cluster`` returns (annotated as None).
    """
    # Options that do not depend on the config file; confirmation is always
    # pre-answered and the cluster name is never overridden.
    teardown_kwargs = {
        "yes": True,
        "workers_only": workers_only,
        "override_cluster_name": None,
        "keep_min_workers": keep_min_workers,
    }
    with _as_config_file(cluster_config) as config_path:
        # Return from inside the ``with`` so the context is exited only after
        # the teardown call has finished.
        return commands.teardown_cluster(config_file=config_path,
                                         **teardown_kwargs)