def on_step_begin(self, **info):
    """Possibly kill one cluster node before the step runs.

    Does nothing when ``self.config_path`` does not exist on disk.
    Otherwise, with probability ``self.probability`` (and only when
    ``self.disable`` is falsy), ``kill_node`` is called for the cluster
    described by ``self.config_path``. A second draw against the same
    probability decides whether the kill is requested as ``hard``.

    A kill attempt that raises ``click.exceptions.ClickException`` is
    logged and retried, for at most three attempts in total. The loop
    stops after the first successful kill, so a single call kills at
    most one node. If every attempt fails the method returns quietly;
    the failures have already been logged.

    Args:
        **info: Accepted for interface compatibility; not used here.
    """
    if not os.path.exists(self.config_path):
        return

    # Imported lazily so the method stays a no-op (and import-free) when
    # there is no cluster config to act on.
    import click
    from ray.autoscaler._private.commands import kill_node

    max_failures = 3

    # Inject a failure with probability ``self.probability``.
    if random.random() < self.probability and not self.disable:
        # Request a hard kill with that same probability.
        should_terminate = random.random() < self.probability
        failures = 0
        while failures < max_failures:
            try:
                kill_node(
                    self.config_path,
                    yes=True,
                    hard=should_terminate,
                    override_cluster_name=None)
            except click.exceptions.ClickException:
                failures += 1
                logger.exception("Killing random node failed in attempt "
                                 "{}. "
                                 "Retrying {} more times".format(
                                     str(failures),
                                     str(max_failures - failures)))
            else:
                # ``failures`` only grows on error, so without leaving the
                # loop here a successful kill would be repeated forever.
                break
def on_step_begin(self, **info):
    """Randomly kill a cluster node at the start of a step.

    One random draw is compared against ``self.probability``; when it
    falls below that threshold and ``self.disable`` is falsy, a second
    draw against the same threshold picks between a hard and a soft
    kill, and ``kill_node`` is invoked for ``self.config_path``.

    Args:
        **info: Ignored by this method.
    """
    from ray.autoscaler._private.commands import kill_node

    roll = random.random()
    if not (roll < self.probability) or self.disable:
        # No failure injected on this step.
        return

    # Second draw: whether the kill is requested as ``hard``.
    hard_kill = random.random() < self.probability
    kill_node(
        self.config_path,
        yes=True,
        hard=hard_kill,
        override_cluster_name=None,
    )
def on_step_begin(self, trial_runner):
    """Refresh available resources, then maybe kill a cluster node.

    The resource refresh always runs. Afterwards a node kill is injected
    with a 10% chance, unless ``args.smoke_test`` is set; a further 10%
    draw decides whether that kill is requested as ``hard``.

    Args:
        trial_runner: Not used by this method.
    """
    self._update_avail_resources()

    inject_failure = random.random() < 0.1 and not args.smoke_test
    if not inject_failure:
        return

    # Second draw: whether the kill is requested as ``hard``.
    hard_kill = random.random() < 0.1
    kill_node(
        "/home/ubuntu/ray_bootstrap_config.yaml",
        yes=True,
        hard=hard_kill,
        override_cluster_name=None,
    )
def remove_host(self, hosts):
    """Remove one host that has not been removed yet and record it.

    Candidates are the entries of ``hosts`` that are not already in
    ``self._removed_hosts``. When there are none, the method returns
    without doing anything.

    In graceful mode (``self._graceful`` truthy) one candidate is chosen
    at random and only recorded. Otherwise ``kill_node`` is called for
    ``~/ray_bootstrap_config.yaml`` and its return value is recorded as
    the removed host.

    Args:
        hosts: Iterable of host identifiers to choose from.
    """
    good_hosts = [k for k in hosts if k not in self._removed_hosts]
    if not good_hosts:
        # Nothing left to remove; returning here also guarantees ``host``
        # is always bound before it is recorded below.
        return

    if self._graceful:
        host = random.choice(good_hosts)
    else:
        # Imported only on the path that needs it, so graceful removal
        # does not depend on the autoscaler module being importable.
        from ray.autoscaler._private.commands import kill_node

        # NOTE(review): ``good_hosts`` is not passed to ``kill_node``, so
        # the killed node is whatever ``kill_node`` selects - confirm it
        # cannot return a host that was already removed.
        host = kill_node(
            os.path.expanduser("~/ray_bootstrap_config.yaml"),
            yes=True,
            hard=False,
            override_cluster_name=None)
    self._removed_hosts.add(host)
def kill_random_node(cluster_config_file, yes, hard, cluster_name):
    """Kills a random Ray node. For testing purposes only."""
    # The docstring above is left untouched on purpose: if this function
    # is registered as a CLI command, that text is what users see as help.
    killed_ip = kill_node(cluster_config_file, yes, hard, cluster_name)
    message = "Killed node with IP " + killed_ip
    click.echo(message)