Exemple #1
0
    def __init__(self,
                 settings: ElasticSettings,
                 use_gpu: bool = False,
                 cpus_per_slot: int = 1,
                 gpus_per_slot: Optional[int] = None,
                 env_vars: Optional[dict] = None,
                 override_discovery=True):
        """Validate the GPU options and store the executor configuration.

        Args:
            settings (ElasticSettings): Settings object kept on the instance.
                Note that it is modified in place: when
                ``override_discovery`` is true, its ``discovery`` attribute
                is replaced.
            use_gpu (bool): Whether slots use GPUs. Must be True whenever a
                non-zero ``gpus_per_slot`` is given.
            cpus_per_slot (int): CPU count per slot; stored on the instance
                and passed to the discovery object.
            gpus_per_slot (Optional[int]): GPU count per slot. When omitted
                (None) it defaults to 1 if ``use_gpu`` is true, else 0.
            env_vars (Optional[dict]): Environment variables stored on the
                instance. None (or an empty dict) results in a new empty
                dict.
            override_discovery (bool): If true, ``settings.discovery`` is
                overwritten with a ``RayHostDiscovery`` built from the
                CPU/GPU options above.

        Raises:
            ValueError: If ``gpus_per_slot`` is non-zero while ``use_gpu``
                is False, or if ``use_gpu`` is True and ``gpus_per_slot``
                is less than 1 (including an explicit 0).
        """
        if gpus_per_slot and not use_gpu:
            raise ValueError("gpus_per_slot is set, but use_gpu is False. "
                             "use_gpu must be True if gpus_per_slot is set. ")

        # Apply the default only when the caller gave no value. Using
        # `gpus_per_slot or int(use_gpu)` here would also replace an explicit
        # 0 with 1, letting it slip past the range check below.
        if gpus_per_slot is None:
            gpus_per_slot = int(use_gpu)

        if use_gpu and gpus_per_slot < 1:
            raise ValueError(
                f"gpus_per_slot must be >= 1: Got {gpus_per_slot}.")

        if override_discovery:
            # Mutates the caller's settings object.
            settings.discovery = RayHostDiscovery(use_gpu=use_gpu,
                                                  cpus_per_slot=cpus_per_slot,
                                                  gpus_per_slot=gpus_per_slot)
        self.cpus_per_slot = cpus_per_slot
        self.gpus_per_slot = gpus_per_slot
        self.use_gpu = use_gpu
        self.settings = settings
        # Populated later; nothing is started in the constructor.
        self.driver = None
        self.rendezvous = None
        self.env_vars = env_vars or {}
Exemple #2
0
    def create_settings(min_np: int = 1,
                        max_np: int = None,
                        reset_limit: int = None,
                        elastic_timeout: int = 600,
                        timeout_s: int = 30,
                        ssh_identity_file: str = None,
                        nics: str = None,
                        **kwargs):
        """Build the ElasticSettings object used by ElasticRayExecutor.

        The returned settings have ``discovery`` set to None; it is expected
        to be filled in at runtime. ``num_proc`` is initialised to
        ``min_np``, and a secret key is generated whenever the ``secret``
        module is available.

        Args:
            min_np (int): Minimum number of processes that must be running
                for training to continue. If fewer are available, training
                waits for more instances to become available.
            max_np (int): Maximum number of training processes; no further
                processes are created beyond it. Unbounded when not given.
            reset_limit (int): Maximum number of times the job may scale
                the number of workers up or down before it is terminated.
            elastic_timeout (int): Timeout for elastic initialisation after
                the cluster is re-scaled. Defaults to 600 seconds. The
                environment variable HOROVOD_ELASTIC_TIMEOUT can be used
                as an alternative.
            timeout_s (int): Number of seconds within which Horovod must
                finish all checks and start the processes. Defaults to 30.
            ssh_identity_file (str): File on the driver from which the
                identity (private key) is read. When not given, the
                expanded path ``~/ray_bootstrap_key.pem`` is used.
            nics (str): Network interfaces that can be used for
                communication. NOTE(review): annotated as ``str`` but
                previously documented as a set -- confirm the expected type.
            **kwargs: Additional keyword arguments forwarded unchanged to
                ``ElasticSettings``.

        Returns:
            The newly constructed ``ElasticSettings`` instance.
        """
        timeout_message = ("Timed out waiting for {activity}. Please "
                           "check connectivity between servers. You "
                           "may need to increase the --start-timeout "
                           "parameter if you have too many servers.")
        launch_timeout = timeout.Timeout(timeout_s, message=timeout_message)

        # Fall back to the default key location when no identity file is given.
        if not ssh_identity_file:
            ssh_identity_file = os.path.expanduser("~/ray_bootstrap_key.pem")

        # `secret` may be falsy (e.g. unavailable); then no key is generated.
        if secret:
            secret_key = secret.make_secret_key()
        else:
            secret_key = None

        return ElasticSettings(discovery=None,
                               min_np=min_np,
                               max_np=max_np,
                               elastic_timeout=elastic_timeout,
                               reset_limit=reset_limit,
                               num_proc=min_np,
                               ssh_identity_file=ssh_identity_file,
                               nics=nics,
                               start_timeout=launch_timeout,
                               key=secret_key,
                               **kwargs)