Beispiel #1
0
    def destroy_autoscaler_workers(self):
        """Cleanup the autoscaler, in case of an exception in the run() method.

        We kill the worker nodes, but retain the head node in order to keep
        logs around, keeping costs minimal. This monitor process runs on the
        head node anyway, so this is more reliable.

        The teardown is retried every two seconds until it succeeds. Every
        failed attempt is logged together with its traceback so the cause of
        a repeatedly failing cleanup can be diagnosed from the logs.
        """
        if self.autoscaler is None:
            return  # Nothing to clean up.

        if self.autoscaling_config is None:
            # This is a logic error in the program. Can't do anything.
            logger.error(
                "Monitor: Cleanup failed due to lack of autoscaler config.")
            return

        logger.info("Monitor: Exception caught. Taking down workers...")
        while True:
            try:
                teardown_cluster(
                    config_file=self.autoscaling_config,
                    yes=True,  # Non-interactive.
                    workers_only=True,  # Retain head node for logs.
                    override_cluster_name=None,
                    keep_min_workers=True,  # Retain minimal amount of workers.
                )
            except Exception:
                # Keep retrying (best effort), but record *why* the attempt
                # failed: logger.exception appends the active traceback.
                logger.exception(
                    "Monitor: Cleanup exception. Trying again...")
                time.sleep(2)
            else:
                logger.info("Monitor: Workers taken down.")
                return
Beispiel #2
0
def stop():
    """Tear down the cluster referenced by the loaded project definition.

    The project definition comes from load_project_or_throw(); its
    "cluster" entry is handed to teardown_cluster with yes=True,
    workers_only=False and no cluster-name override.
    """
    cluster_config = load_project_or_throw()["cluster"]
    teardown_cluster(
        cluster_config,
        yes=True,
        workers_only=False,
        override_cluster_name=None,
    )
Beispiel #3
0
def stop(name):
    """Tear down the cluster referenced by the loaded project definition.

    The project definition comes from load_project_or_throw(); the result
    of its cluster_yaml() method is handed to teardown_cluster with
    yes=True and workers_only=False.

    Args:
        name: Forwarded unchanged as override_cluster_name.
    """
    cluster_yaml = load_project_or_throw().cluster_yaml()
    teardown_cluster(
        cluster_yaml,
        yes=True,
        workers_only=False,
        override_cluster_name=name,
    )
Beispiel #4
0
 def __do_destroy(self):
     """Tear down the cluster and record the outcome on this object.

     On success, ``ready`` is set to False and ``config`` to None. Any
     failure (BaseException included) is not re-raised: it is wrapped in
     CannotDestroyCluster and stored on ``self.destroyer.exc``, and the
     traceback is written to stderr unless ``self.destroyer.silent`` is set.
     """
     teardown_options = dict(
         yes=True,
         workers_only=False,
         override_cluster_name=None,
         keep_min_workers=0,
     )
     try:
         teardown_cluster(self.config_file, **teardown_options)
         self.ready, self.config = False, None
     except BaseException as ex:
         # Format once; the same text goes into the stored error and stderr.
         tb_text = traceback.format_exc()
         self.destroyer.exc = CannotDestroyCluster(
             "Cannot destroy cluster", cause=ex, traceback=tb_text
         )
         if self.destroyer.silent:
             return
         sys.stderr.write(f"Cannot destroy cluster:\n{tb_text}\n")
Beispiel #5
0
def teardown(cluster_config_file, yes, workers_only, cluster_name,
             keep_min_workers):
    """Tear down a Ray cluster.

    Thin wrapper: all five arguments are forwarded positionally, in the
    same order, to teardown_cluster. Nothing is returned.
    """
    teardown_cluster(
        cluster_config_file,
        yes,
        workers_only,
        cluster_name,
        keep_min_workers,
    )
Beispiel #6
0
def teardown(cluster_config_file, yes, workers_only, cluster_name):
    """Tear down the Ray cluster.

    Thin wrapper: the four arguments are forwarded positionally, in the
    same order, to teardown_cluster. Nothing is returned.
    """
    teardown_cluster(
        cluster_config_file,
        yes,
        workers_only,
        cluster_name,
    )
Beispiel #7
0
def teardown(cluster_config_file, yes, workers_only, cluster_name):
    """Forward a teardown request to teardown_cluster.

    The four arguments are passed through positionally and unchanged;
    nothing is returned.
    """
    teardown_cluster(
        cluster_config_file,
        yes,
        workers_only,
        cluster_name,
    )
Beispiel #8
0
def teardown(cluster_config_file, yes):
    """Forward a teardown request to teardown_cluster.

    Both arguments are passed through positionally and unchanged; nothing
    is returned.
    """
    teardown_cluster(
        cluster_config_file,
        yes,
    )
Beispiel #9
0
def teardown(cluster_config_file, yes):
    """Hand the given config file and ``yes`` flag to teardown_cluster.

    Arguments are forwarded positionally in the order received; the
    function itself returns nothing.
    """
    teardown_cluster(
        cluster_config_file,
        yes,
    )
Beispiel #10
0
def down(cluster_config_file, yes, workers_only, cluster_name,
         keep_min_workers, log_old_style, log_color, verbose):
    """Tear down a Ray cluster.

    Thin wrapper: all eight arguments, including the logging-related ones,
    are forwarded positionally and in the same order to teardown_cluster.
    Nothing is returned.
    """
    teardown_cluster(
        cluster_config_file,
        yes,
        workers_only,
        cluster_name,
        keep_min_workers,
        log_old_style,
        log_color,
        verbose,
    )
Beispiel #11
0
def teardown(cluster_config_file, yes, workers_only, cluster_name):
    """Tear down the Ray cluster.

    Delegates directly to teardown_cluster, passing the four arguments
    positionally in the order they were received. Nothing is returned.
    """
    teardown_cluster(
        cluster_config_file,
        yes,
        workers_only,
        cluster_name,
    )