def __init__(self,
             settings: ElasticSettings,
             use_gpu: bool = False,
             cpus_per_slot: int = 1,
             gpus_per_slot: Optional[int] = None,
             env_vars: dict = None,
             override_discovery=True):
    """Store the elastic settings and the per-slot resource configuration.

    Args:
        settings (ElasticSettings): Settings object kept on the instance.
            Its ``discovery`` attribute is overwritten in place when
            ``override_discovery`` is truthy.
        use_gpu (bool): Whether slots use GPUs. Must be True whenever
            ``gpus_per_slot`` is set to a truthy value.
        cpus_per_slot (int): CPU count per slot; stored on the instance
            and forwarded to ``RayHostDiscovery``.
        gpus_per_slot (Optional[int]): GPU count per slot. A falsy value
            (``None`` or ``0``) is replaced by ``int(use_gpu)``, i.e. 1
            when GPUs are enabled and 0 otherwise.
        env_vars (dict): Environment variables kept on the instance. A
            falsy value is replaced by a new empty dict.
        override_discovery (bool): When truthy, replace
            ``settings.discovery`` with a ``RayHostDiscovery`` built from
            the resource arguments above.

    Raises:
        ValueError: If ``gpus_per_slot`` is set while ``use_gpu`` is
            False, or if GPUs are enabled and the resolved
            ``gpus_per_slot`` is below 1.
    """
    if gpus_per_slot:
        # An explicit GPU count only makes sense when GPUs are enabled.
        if not use_gpu:
            raise ValueError("gpus_per_slot is set, but use_gpu is False. "
                             "use_gpu must be True if gpus_per_slot is set. ")
        slot_gpus = gpus_per_slot
    else:
        # Nothing requested explicitly: one GPU per slot if GPUs are on,
        # none otherwise.
        slot_gpus = int(use_gpu)

    # Only reachable with a truthy value below 1 (e.g. a negative count),
    # since the default path above always yields 1 when use_gpu is True.
    if use_gpu and slot_gpus < 1:
        raise ValueError(
            f"gpus_per_slot must be >= 1: Got {slot_gpus}.")

    if override_discovery:
        # Note: this mutates the caller's settings object.
        discovery_options = {
            "use_gpu": use_gpu,
            "cpus_per_slot": cpus_per_slot,
            "gpus_per_slot": slot_gpus,
        }
        settings.discovery = RayHostDiscovery(**discovery_options)

    # Resource configuration.
    self.cpus_per_slot = cpus_per_slot
    self.gpus_per_slot = slot_gpus
    self.use_gpu = use_gpu

    # Runtime state; driver and rendezvous start out unset.
    self.settings = settings
    self.driver = None
    self.rendezvous = None
    self.env_vars = env_vars if env_vars else {}
def create_settings(min_np: int = 1,
                    max_np: int = None,
                    reset_limit: int = None,
                    elastic_timeout: int = 600,
                    timeout_s: int = 30,
                    ssh_identity_file: str = None,
                    nics: str = None,
                    **kwargs):
    """Build the ``ElasticSettings`` object used by ElasticRayExecutor.

    The ``discovery`` property is left as ``None`` here and is expected to
    be filled in at runtime.

    Args:
        min_np (int): Minimum number of processes that must be running for
            training to continue. If the number of available processes
            dips below this threshold, training waits for more instances
            to become available. Also used as ``num_proc``.
        max_np (int): Maximum number of training processes, beyond which
            no additional processes are created. Unbounded if not
            specified.
        reset_limit (int): Maximum number of times the training job may
            scale the number of workers up or down before the job is
            terminated.
        elastic_timeout (int): Timeout, in seconds, for elastic
            initialisation after re-scaling the cluster. Defaults to 600.
            Alternatively, the environment variable
            HOROVOD_ELASTIC_TIMEOUT can be used.
        timeout_s (int): Start timeout in seconds; Horovod performs all
            the checks and starts the processes before this timeout
            expires. Defaults to 30.
        ssh_identity_file (str): File on the driver from which the
            identity (private key) is read. A falsy value falls back to
            ``~/ray_bootstrap_key.pem`` with the home directory expanded.
        nics: Network interfaces that can be used for communication.
            NOTE(review): annotated as ``str`` but previously documented
            as a set; the value is forwarded unchanged, so confirm the
            type ``ElasticSettings`` expects.
        **kwargs: Extra keyword arguments forwarded to
            ``ElasticSettings``.

    Returns:
        The newly constructed ``ElasticSettings`` instance.
    """
    timeout_message = ("Timed out waiting for {activity}. Please "
                       "check connectivity between servers. You "
                       "may need to increase the --start-timeout "
                       "parameter if you have too many servers.")
    # Construct the timeout before anything else, as the original did.
    launch_timeout = timeout.Timeout(timeout_s, message=timeout_message)

    if not ssh_identity_file:
        ssh_identity_file = os.path.expanduser("~/ray_bootstrap_key.pem")

    # A key is only generated when the `secret` name is truthy.
    secret_key = secret.make_secret_key() if secret else None

    return ElasticSettings(
        discovery=None,
        min_np=min_np,
        max_np=max_np,
        elastic_timeout=elastic_timeout,
        reset_limit=reset_limit,
        num_proc=min_np,
        ssh_identity_file=ssh_identity_file,
        nics=nics,
        start_timeout=launch_timeout,
        key=secret_key,
        **kwargs)