Example #1
def wrap_function(train_func, warn=True):
    if hasattr(train_func, "__mixins__"):
        inherit_from = train_func.__mixins__ + (FunctionRunner, )
    else:
        inherit_from = (FunctionRunner, )

    func_args = inspect.getfullargspec(train_func).args
    use_checkpoint = detect_checkpoint_function(train_func)
    use_config_single = detect_config_single(train_func)
    use_reporter = detect_reporter(train_func)

    if not any([use_checkpoint, use_config_single, use_reporter]):
        # use_reporter is hidden
        raise ValueError(
            "Unknown argument found in the Trainable function. "
            "The function args must include a 'config' positional "
            "parameter. Any other args must be 'checkpoint_dir'. "
            "Found: {}".format(func_args))

    if use_config_single and not use_checkpoint:
        if log_once("tune_function_checkpoint") and warn:
            logger.warning(
                "Function checkpointing is disabled. This may result in "
                "unexpected behavior when using checkpointing features or "
                "certain schedulers. To enable, set the train function "
                "arguments to be `func(config, checkpoint_dir=None)`.")

    class ImplicitFunc(*inherit_from):
        _name = train_func.__name__ if hasattr(train_func, "__name__") \
            else "func"

        def _trainable_func(self, config, reporter, checkpoint_dir):
            if not use_checkpoint and not use_reporter:
                output = train_func(config)
            elif use_checkpoint:
                output = train_func(config, checkpoint_dir=checkpoint_dir)
            else:
                output = train_func(config, reporter)

            # If train_func returns, we need to notify the main event loop
            # of the last result while avoiding double logging. This is done
            # with the keyword RESULT_DUPLICATE -- see tune/trial_runner.py.
            reporter(**{RESULT_DUPLICATE: True})
            return output

    return ImplicitFunc
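For reference, a rough sketch of the three call signatures that the detection helpers above accept (hedged; `tune.report` is the standard function-API reporting call, and the training bodies are only placeholders):

from ray import tune

def train_config_only(config):  # matched by detect_config_single
    tune.report(mean_loss=config["lr"])

def train_with_checkpoint(config, checkpoint_dir=None):  # matched by detect_checkpoint_function
    tune.report(mean_loss=0.0)

def train_with_reporter(config, reporter):  # matched by detect_reporter
    reporter(mean_loss=0.0)

# Any of these can be wrapped by the `wrap_function` defined above:
trainable_cls = wrap_function(train_with_checkpoint)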
Example #2
def DistributedTrainableCreator(func: Callable,
                                num_workers: int = 1,
                                num_cpus_per_worker: int = 1,
                                num_gpus_per_worker: int = 0,
                                num_workers_per_host: Optional[int] = None,
                                backend: str = "gloo",
                                timeout_s: int = NCCL_TIMEOUT_S,
                                use_gpu=None) -> Type[_TorchTrainable]:
    """Creates a class that executes distributed training.

    Similar to running `torch.distributed.launch`.

    Note that you typically should not instantiate the object
    created.

    Args:
        func (callable): This function is a Tune trainable function.
            This function must have 2 args in the signature, and the
            latter arg must contain `checkpoint_dir`. For example:
            `func(config, checkpoint_dir=None)`.
        num_workers (int): Number of training workers to include in
            world.
        num_cpus_per_worker (int): Number of CPU resources to reserve
            per training worker.
        num_gpus_per_worker (int): Number of GPU resources to reserve
            per training worker.
        num_workers_per_host (Optional[int]): Number of workers to
            colocate per host.
        backend (str): One of "gloo", "nccl".
        timeout_s (float): Seconds before the torch process group
            times out. Useful when machines are unreliable. Defaults
            to 60 seconds. This value is also reused for triggering
            placement timeouts if forcing colocation.

    Returns:
        type(Trainable): A trainable class object that can be passed
        to Tune. Resources are automatically set within the object, so
        users do not need to set `resources_per_trial`.

    Example:

    .. code-block:: python

        trainable_cls = DistributedTrainableCreator(
            train_func, num_workers=2)
        analysis = tune.run(trainable_cls)
    """
    if use_gpu:
        raise ValueError(
            "use_gpu is deprecated. Use 'num_gpus_per_worker' instead.")
    detect_checkpoint_function(func, abort=True)
    if num_workers_per_host:
        if num_workers % num_workers_per_host:
            raise ValueError("`num_workers` must be an integer multiple "
                             "of `num_workers_per_host`.")

    class WrappedDistributedTorchTrainable(_TorchTrainable):
        _function = func
        _num_workers = num_workers
        _num_cpus_per_worker = num_cpus_per_worker
        _num_gpus_per_worker = num_gpus_per_worker
        _num_workers_per_host = num_workers_per_host
        _timeout_s = timeout_s

        @classmethod
        def default_process_group_parameters(cls) -> Dict:
            # `timedelta`'s first positional argument is days, so the timeout
            # must be passed explicitly as seconds.
            return dict(timeout=timedelta(seconds=timeout_s), backend=backend)

        @classmethod
        def default_resource_request(cls, config: Dict) -> Resources:

            return Resources(cpu=0,
                             gpu=0,
                             extra_cpu=num_cpus_per_worker * num_workers,
                             extra_gpu=num_gpus_per_worker * num_workers)

    return WrappedDistributedTorchTrainable
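A hedged usage sketch: the wrapped function must take `(config, checkpoint_dir=None)`, and the process group is set up per worker by `_TorchTrainable`, so the body only runs the training loop (the loop below is a placeholder):

from ray import tune

def train_func(config, checkpoint_dir=None):
    # Placeholder loop; real code would build a DistributedDataParallel model here.
    for step in range(config.get("steps", 10)):
        tune.report(mean_loss=1.0 / (step + 1))

trainable_cls = DistributedTrainableCreator(
    train_func, num_workers=2, num_cpus_per_worker=1)
analysis = tune.run(trainable_cls, config={"steps": 10})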
Example #3
def with_parameters(trainable, **kwargs):
    """Wrapper for trainables to pass arbitrary large data objects.

    This wrapper function will store all passed parameters in the Ray
    object store and retrieve them when calling the function. It can thus
    be used to pass arbitrary data, even datasets, to Tune trainables.

    This can also be used as an alternative to ``functools.partial`` to pass
    default arguments to trainables.

    When used with the function API, the trainable function is called with
    the passed parameters as keyword arguments. When used with the class API,
    the ``Trainable.setup()`` method is called with the respective kwargs.

    If the data is already stored in the object store (i.e., the values
    are instances of ``ObjectRef``), ``tune.with_parameters()`` is not
    necessary. You can
    instead pass the object refs to the training function via the ``config``
    or use Python partials.

    Args:
        trainable: Trainable to wrap.
        **kwargs: parameters to store in object store.

    Function API example:

    .. code-block:: python

        from ray import tune

        def train(config, data=None):
            for sample in data:
                loss = update_model(sample)
                tune.report(loss=loss)

        data = HugeDataset(download=True)

        tune.run(
            tune.with_parameters(train, data=data),
            # ...
        )

    Class API example:

    .. code-block:: python

        from ray import tune

        class MyTrainable(tune.Trainable):
            def setup(self, config, data=None):
                self.data = data
                self.iter = iter(self.data)
                self.next_sample = next(self.iter)

            def step(self):
                loss = update_model(self.next_sample)
                try:
                    self.next_sample = next(self.iter)
                except StopIteration:
                    return {"loss": loss, "done": True}
                return {"loss": loss}

        data = HugeDataset(download=True)

        tune.run(
            tune.with_parameters(MyTrainable, data=data),
            # ...
        )

    """
    from ray.tune.trainable import Trainable

    if not callable(trainable) or (inspect.isclass(trainable)
                                   and not issubclass(trainable, Trainable)):
        raise ValueError(
            f"`tune.with_parameters()` only works with function trainables "
            f"or classes that inherit from `tune.Trainable()`. Got type: "
            f"{type(trainable)}.")

    parameter_registry = _ParameterRegistry()
    ray.worker._post_init_hooks.append(parameter_registry.flush)

    # Objects are moved into the object store
    prefix = f"{str(trainable)}_"
    for k, v in kwargs.items():
        parameter_registry.put(prefix + k, v)

    trainable_name = getattr(trainable, "__name__", "tune_with_parameters")

    if inspect.isclass(trainable):
        # Class trainable
        keys = list(kwargs.keys())

        class _Inner(trainable):
            def setup(self, config):
                setup_kwargs = {}
                for k in keys:
                    setup_kwargs[k] = parameter_registry.get(prefix + k)
                super(_Inner, self).setup(config, **setup_kwargs)

        _Inner.__name__ = trainable_name
        return _Inner
    else:
        # Function trainable
        use_checkpoint = detect_checkpoint_function(trainable, partial=True)
        keys = list(kwargs.keys())

        def inner(config, checkpoint_dir=None):
            fn_kwargs = {}
            if use_checkpoint:
                default = checkpoint_dir
                sig = inspect.signature(trainable)
                if "checkpoint_dir" in sig.parameters:
                    default = sig.parameters[
                        "checkpoint_dir"].default or default
                fn_kwargs["checkpoint_dir"] = default

            for k in keys:
                fn_kwargs[k] = parameter_registry.get(prefix + k)
            trainable(config, **fn_kwargs)

        inner.__name__ = trainable_name

        # Use correct function signature if no `checkpoint_dir` parameter
        # is set
        if not use_checkpoint:

            def _inner(config):
                inner(config, checkpoint_dir=None)

            _inner.__name__ = trainable_name

            if hasattr(trainable, "__mixins__"):
                _inner.__mixins__ = trainable.__mixins__
            return _inner

        if hasattr(trainable, "__mixins__"):
            inner.__mixins__ = trainable.__mixins__

        return inner
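A minimal sketch of the alternative mentioned in the docstring: when the values are already ``ObjectRef`` instances, they can be passed through ``config`` instead of wrapping the trainable (``load_huge_dataset`` and ``update_model`` are hypothetical helpers):

import ray
from ray import tune

data_ref = ray.put(load_huge_dataset())  # hypothetical loader, stored once in the object store

def train(config):
    data = ray.get(config["data_ref"])  # resolve the reference inside the trial
    for sample in data:
        tune.report(loss=update_model(sample))

tune.run(train, config={"data_ref": data_ref})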
Example #4
    def __init__(self,
                 name,
                 run,
                 stop=None,
                 time_budget_s=None,
                 config=None,
                 resources_per_trial=None,
                 num_samples=1,
                 local_dir=None,
                 upload_dir=None,
                 trial_name_creator=None,
                 trial_dirname_creator=None,
                 loggers=None,
                 log_to_file=False,
                 sync_to_driver=None,
                 checkpoint_freq=0,
                 checkpoint_at_end=False,
                 sync_on_checkpoint=True,
                 keep_checkpoints_num=None,
                 checkpoint_score_attr=None,
                 export_formats=None,
                 max_failures=0,
                 restore=None):

        if loggers is not None:
            # Most users won't run into this as `tune.run()` does not pass
            # the argument anymore. However, we will want to inform users
            # if they instantiate their `Experiment` objects themselves.
            raise ValueError(
                "Passing `loggers` to an `Experiment` is deprecated. Use "
                "an `ExperimentLogger` callback instead, e.g. by passing the "
                "`Logger` classes to `tune.logger.LegacyExperimentLogger` and "
                "passing this as part of the `callbacks` parameter to "
                "`tune.run()`.")

        config = config or {}
        if callable(run) and detect_checkpoint_function(run):
            if checkpoint_at_end:
                raise ValueError("'checkpoint_at_end' cannot be used with a "
                                 "checkpointable function. You can specify "
                                 "and register checkpoints within "
                                 "your trainable function.")
            if checkpoint_freq:
                raise ValueError(
                    "'checkpoint_freq' cannot be used with a "
                    "checkpointable function. You can specify checkpoints "
                    "within your trainable function.")
        self._run_identifier = Experiment.register_if_needed(run)
        self.name = name or self._run_identifier

        # If the name has been set explicitly, we don't want to create
        # dated directories. The same is true for string run identifiers.
        if int(os.environ.get("TUNE_DISABLE_DATED_SUBDIR", 0)) == 1 or name \
           or isinstance(run, str):
            self.dir_name = self.name
        else:
            self.dir_name = "{}_{}".format(self.name, date_str())

        if upload_dir:
            self.remote_checkpoint_dir = os.path.join(upload_dir,
                                                      self.dir_name)
        else:
            self.remote_checkpoint_dir = None

        self._stopper = None
        stopping_criteria = {}
        if not stop:
            pass
        elif isinstance(stop, dict):
            stopping_criteria = stop
        elif callable(stop):
            if FunctionStopper.is_valid_function(stop):
                self._stopper = FunctionStopper(stop)
            elif issubclass(type(stop), Stopper):
                self._stopper = stop
            else:
                raise ValueError("Provided stop object must be either a dict, "
                                 "a function, or a subclass of "
                                 "`ray.tune.Stopper`.")
        else:
            raise ValueError("Invalid stop criteria: {}. Must be a "
                             "callable or dict".format(stop))

        if time_budget_s:
            if self._stopper:
                self._stopper = CombinedStopper(self._stopper,
                                                TimeoutStopper(time_budget_s))
            else:
                self._stopper = TimeoutStopper(time_budget_s)

        _raise_on_durable(self._run_identifier, sync_to_driver, upload_dir)

        stdout_file, stderr_file = _validate_log_to_file(log_to_file)

        spec = {
            "run": self._run_identifier,
            "stop": stopping_criteria,
            "config": config,
            "resources_per_trial": resources_per_trial,
            "num_samples": num_samples,
            "local_dir": os.path.abspath(
                os.path.expanduser(local_dir or DEFAULT_RESULTS_DIR)),
            "upload_dir": upload_dir,
            "remote_checkpoint_dir": self.remote_checkpoint_dir,
            "trial_name_creator": trial_name_creator,
            "trial_dirname_creator": trial_dirname_creator,
            "loggers": loggers,
            "log_to_file": (stdout_file, stderr_file),
            "sync_to_driver": sync_to_driver,
            "checkpoint_freq": checkpoint_freq,
            "checkpoint_at_end": checkpoint_at_end,
            "sync_on_checkpoint": sync_on_checkpoint,
            "keep_checkpoints_num": keep_checkpoints_num,
            "checkpoint_score_attr": checkpoint_score_attr,
            "export_formats": export_formats or [],
            "max_failures": max_failures,
            "restore": os.path.abspath(os.path.expanduser(restore))
            if restore else None
        }
        self.spec = spec
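To illustrate the `stop` handling above, the argument may be a dict of stopping criteria, a callable taking `(trial_id, result)`, or a `Stopper` instance; a hedged sketch via `tune.run` (`my_trainable` is hypothetical):

from ray import tune
from ray.tune.stopper import TimeoutStopper

# Dict: a trial stops once any named metric reaches its threshold.
tune.run(my_trainable, stop={"training_iteration": 100, "mean_loss": 0.1})

# Callable: return True to stop the trial that produced `result`.
tune.run(my_trainable, stop=lambda trial_id, result: result["mean_loss"] < 0.1)

# Stopper instance: e.g. the TimeoutStopper that time_budget_s uses internally.
tune.run(my_trainable, stop=TimeoutStopper(600))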
Example #5
def with_parameters(fn, **kwargs):
    """Wrapper for function trainables to pass arbitrary large data objects.

    This wrapper function will store all passed parameters in the Ray
    object store and retrieve them when calling the function. It can thus
    be used to pass arbitrary data, even datasets, to Tune trainable functions.

    This can also be used as an alternative to `functools.partial` to pass
    default arguments to trainables.

    Args:
        fn: function to wrap
        **kwargs: parameters to store in object store.


    .. code-block:: python

        from ray import tune

        def train(config, data=None):
            for sample in data:
                # ...
                tune.report(loss=loss)

        data = HugeDataset(download=True)

        tune.run(
            tune.with_parameters(train, data=data),
            #...
        )

    """
    if not callable(fn):
        raise ValueError(
            "`tune.with_parameters()` only works with the function API. "
            "If you want to pass parameters to Trainable _classes_, consider "
            "passing them via the `config` parameter.")

    prefix = f"{str(fn)}_"
    for k, v in kwargs.items():
        parameter_registry.put(prefix + k, v)

    use_checkpoint = detect_checkpoint_function(fn)
    keys = list(kwargs.keys())

    def inner(config, checkpoint_dir=None):
        fn_kwargs = {}
        if use_checkpoint:
            default = checkpoint_dir
            sig = inspect.signature(fn)
            if "checkpoint_dir" in sig.parameters:
                default = sig.parameters["checkpoint_dir"].default \
                          or default
            fn_kwargs["checkpoint_dir"] = default

        for k in keys:
            fn_kwargs[k] = parameter_registry.get(prefix + k)
        fn(config, **fn_kwargs)

    # Use correct function signature if no `checkpoint_dir` parameter is set
    if not use_checkpoint:

        def _inner(config):
            inner(config, checkpoint_dir=None)

        if hasattr(fn, "__mixins__"):
            _inner.__mixins__ = fn.__mixins__
        return _inner

    if hasattr(fn, "__mixins__"):
        inner.__mixins__ = fn.__mixins__
    return inner
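A hedged sketch of the `functools.partial` alternative mentioned above, which is fine for small defaults that do not need the object store (`update_model` is a placeholder):

import functools
from ray import tune

def train(config, data=None):
    for sample in data:
        tune.report(loss=update_model(sample))

small_data = [1, 2, 3]  # small enough that the object store brings no benefit
tune.run(functools.partial(train, data=small_data))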
Example #6
def DistributedTrainableCreator(
        func: Callable,
        use_gpu: bool = False,
        num_workers: int = 1,
        num_cpus_per_worker: int = 1,
        backend: str = "gloo",
        timeout_s: int = NCCL_TIMEOUT_S) -> Type[_TorchTrainable]:
    """Creates a class that executes distributed training.

    Similar to running `torch.distributed.launch`.

    Note that you typically should not instantiate the object
    created.

    Args:
        func (callable): This function is a Tune trainable function.
            This function must have 2 args in the signature, and the
            latter arg must contain `checkpoint_dir`. For example:
            `func(config, checkpoint_dir=None)`.
        use_gpu (bool): Sets resource allocation for workers to 1 GPU
            if true. Also automatically sets CUDA_VISIBLE_DEVICES
            for each training worker.
        num_workers (int): Number of training workers to include in
            world.
        num_cpus_per_worker (int): Number of CPU resources to reserve
            per training worker.
        backend (str): One of "gloo", "nccl".
        timeout_s (float): Seconds before the torch process group
            times out. Useful when machines are unreliable. Defaults
            to 60 seconds.

    Returns:
        type(Trainable): A trainable class object that can be passed
        to Tune. Resources are automatically set within the object, so
        users do not need to set `resources_per_trial`.

    Example:

    .. code-block:: python

        trainable_cls = DistributedTrainableCreator(
            train_func, num_workers=2)
        analysis = tune.run(trainable_cls)
    """
    detect_checkpoint_function(func, abort=True)

    class WrappedDistributedTorchTrainable(_TorchTrainable):
        _function = func
        _num_workers = num_workers
        _use_gpu = use_gpu
        _num_cpus_per_worker = num_cpus_per_worker

        @classmethod
        def default_process_group_parameters(cls) -> Dict:
            # `timedelta`'s first positional argument is days, so pass the
            # timeout explicitly as seconds.
            return dict(timeout=timedelta(seconds=timeout_s), backend=backend)

        @classmethod
        def default_resource_request(cls, config: Dict) -> Resources:
            num_workers_ = int(config.get("num_workers", num_workers))
            num_cpus = int(
                config.get("num_cpus_per_worker", num_cpus_per_worker))
            use_gpu_ = config.get("use_gpu", use_gpu)

            return Resources(
                cpu=0,
                gpu=0,
                extra_cpu=num_cpus * num_workers_,
                extra_gpu=num_workers_ if use_gpu_ else 0)

    return WrappedDistributedTorchTrainable
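Since `default_resource_request` above reads `num_workers`, `num_cpus_per_worker` and `use_gpu` back out of `config`, the resources reserved for a trial can be adjusted from the search config; a hedged sketch (`train_func` as in the docstring example):

trainable_cls = DistributedTrainableCreator(train_func, num_workers=2)
analysis = tune.run(
    trainable_cls,
    # Only the reserved resources are recomputed from these keys; the number of
    # launched workers still comes from the creator arguments.
    config={"num_cpus_per_worker": 2, "use_gpu": True},
)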
Example #7
    def __init__(self,
                 name,
                 run,
                 stop=None,
                 time_budget_s=None,
                 config=None,
                 resources_per_trial=None,
                 num_samples=1,
                 local_dir=None,
                 upload_dir=None,
                 trial_name_creator=None,
                 trial_dirname_creator=None,
                 loggers=None,
                 log_to_file=False,
                 sync_to_driver=None,
                 checkpoint_freq=0,
                 checkpoint_at_end=False,
                 sync_on_checkpoint=True,
                 keep_checkpoints_num=None,
                 checkpoint_score_attr=None,
                 export_formats=None,
                 max_failures=0,
                 restore=None):

        config = config or {}
        if callable(run) and detect_checkpoint_function(run):
            if checkpoint_at_end:
                raise ValueError("'checkpoint_at_end' cannot be used with a "
                                 "checkpointable function. You can specify "
                                 "and register checkpoints within "
                                 "your trainable function.")
            if checkpoint_freq:
                raise ValueError(
                    "'checkpoint_freq' cannot be used with a "
                    "checkpointable function. You can specify checkpoints "
                    "within your trainable function.")
        self._run_identifier = Experiment.register_if_needed(run)
        self.name = name or self._run_identifier
        if upload_dir:
            self.remote_checkpoint_dir = os.path.join(upload_dir, self.name)
        else:
            self.remote_checkpoint_dir = None

        self._stopper = None
        stopping_criteria = {}
        if not stop:
            pass
        elif isinstance(stop, dict):
            stopping_criteria = stop
        elif callable(stop):
            if FunctionStopper.is_valid_function(stop):
                self._stopper = FunctionStopper(stop)
            elif issubclass(type(stop), Stopper):
                self._stopper = stop
            else:
                raise ValueError("Provided stop object must be either a dict, "
                                 "a function, or a subclass of "
                                 "`ray.tune.Stopper`.")
        else:
            raise ValueError("Invalid stop criteria: {}. Must be a "
                             "callable or dict".format(stop))

        if time_budget_s:
            if self._stopper:
                self._stopper = CombinedStopper(self._stopper,
                                                TimeoutStopper(time_budget_s))
            else:
                self._stopper = TimeoutStopper(time_budget_s)

        _raise_on_durable(self._run_identifier, sync_to_driver, upload_dir)

        stdout_file, stderr_file = _validate_log_to_file(log_to_file)

        spec = {
            "run": self._run_identifier,
            "stop": stopping_criteria,
            "config": config,
            "resources_per_trial": resources_per_trial,
            "num_samples": num_samples,
            "local_dir": os.path.abspath(
                os.path.expanduser(local_dir or DEFAULT_RESULTS_DIR)),
            "upload_dir": upload_dir,
            "remote_checkpoint_dir": self.remote_checkpoint_dir,
            "trial_name_creator": trial_name_creator,
            "trial_dirname_creator": trial_dirname_creator,
            "loggers": loggers,
            "log_to_file": (stdout_file, stderr_file),
            "sync_to_driver": sync_to_driver,
            "checkpoint_freq": checkpoint_freq,
            "checkpoint_at_end": checkpoint_at_end,
            "sync_on_checkpoint": sync_on_checkpoint,
            "keep_checkpoints_num": keep_checkpoints_num,
            "checkpoint_score_attr": checkpoint_score_attr,
            "export_formats": export_formats or [],
            "max_failures": max_failures,
            "restore": os.path.abspath(os.path.expanduser(restore))
            if restore else None
        }
        self.spec = spec
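For context on the `checkpoint_at_end`/`checkpoint_freq` checks above: a checkpointable function saves its own checkpoints, e.g. with the function-API helper `tune.checkpoint_dir` from this generation of Tune (a hedged sketch, not taken from this file):

import os
from ray import tune

def train(config, checkpoint_dir=None):
    start = 0
    if checkpoint_dir:  # restore state if Tune hands us a previous checkpoint
        with open(os.path.join(checkpoint_dir, "state.txt")) as f:
            start = int(f.read())
    for step in range(start, 100):
        with tune.checkpoint_dir(step=step) as new_dir:  # write a new checkpoint
            with open(os.path.join(new_dir, "state.txt"), "w") as f:
                f.write(str(step))
        tune.report(mean_loss=1.0 / (step + 1))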
Example #8
def wrap_function(
    train_func: Callable[[Any], Any], warn: bool = True, name: Optional[str] = None
):
    inherit_from = (FunctionTrainable,)

    if hasattr(train_func, "__mixins__"):
        inherit_from = train_func.__mixins__ + inherit_from

    func_args = inspect.getfullargspec(train_func).args
    use_checkpoint = detect_checkpoint_function(train_func)
    use_config_single = detect_config_single(train_func)
    use_reporter = detect_reporter(train_func)

    if not any([use_checkpoint, use_config_single, use_reporter]):
        # use_reporter is hidden
        raise ValueError(
            "Unknown argument found in the Trainable function. "
            "The function args must include a 'config' positional "
            "parameter. Any other args must be 'checkpoint_dir'. "
            "Found: {}".format(func_args)
        )

    if use_config_single and not use_checkpoint:
        if log_once("tune_function_checkpoint") and warn:
            logger.warning(
                "Function checkpointing is disabled. This may result in "
                "unexpected behavior when using checkpointing features or "
                "certain schedulers. To enable, set the train function "
                "arguments to be `func(config, checkpoint_dir=None)`."
            )

    class ImplicitFunc(*inherit_from):
        _name = name or (
            train_func.__name__ if hasattr(train_func, "__name__") else "func"
        )

        def __repr__(self):
            return self._name

        def _trainable_func(self, config, reporter, checkpoint_dir):
            if not use_checkpoint and not use_reporter:
                fn = partial(train_func, config)
            elif use_checkpoint:
                fn = partial(train_func, config, checkpoint_dir=checkpoint_dir)
            else:
                fn = partial(train_func, config, reporter)

            def handle_output(output):
                if not output:
                    return
                elif isinstance(output, dict):
                    reporter(**output)
                elif isinstance(output, Number):
                    reporter(_metric=output)
                else:
                    raise ValueError(
                        "Invalid return or yield value. Either return/yield "
                        "a single number or a dictionary object in your "
                        "trainable function."
                    )

            output = None
            if inspect.isgeneratorfunction(train_func):
                for output in fn():
                    handle_output(output)
            else:
                output = fn()
                handle_output(output)

            # If train_func returns, we need to notify the main event loop
            # of the last result while avoiding double logging. This is done
            # with the keyword RESULT_DUPLICATE -- see tune/trial_runner.py.
            reporter(**{RESULT_DUPLICATE: True})
            return output

    return ImplicitFunc
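Because `_trainable_func` checks `inspect.isgeneratorfunction`, the train function may also be a generator that yields results; a short sketch:

def train_generator(config):
    for step in range(10):
        # Each yielded dict (or single number) is forwarded to the reporter.
        yield {"mean_loss": 1.0 / (step + 1)}

trainable_cls = wrap_function(train_generator)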
Example #9
def DistributedTrainableCreator(
    func: Callable[[Dict, Optional[str]], Any],
    num_workers: int = 1,
    num_cpus_per_worker: int = 1,
    num_gpus_per_worker: int = 0,
    num_workers_per_host: Optional[int] = None,
    backend: str = "gloo",
    timeout_s: int = NCCL_TIMEOUT_S,
) -> Type[_TorchTrainable]:
    """Creates a class that executes distributed training.

    Similar to running `torch.distributed.launch`.

    Note that you typically should not instantiate the object
    created.

    Args:
        func: This function is a Tune trainable function.
            This function must have 2 args in the signature, and the
            latter arg must contain `checkpoint_dir`. For example:
            `func(config, checkpoint_dir=None)`.
        num_workers: Number of training workers to include in
            world.
        num_cpus_per_worker: Number of CPU resources to reserve
            per training worker.
        num_gpus_per_worker: Number of GPU resources to reserve
            per training worker.
        num_workers_per_host: Number of workers to
            colocate per host.
        backend: One of "gloo", "nccl".
        timeout_s: Seconds before the torch process group
            times out. Useful when machines are unreliable. Defaults
            to 1800 seconds. This value is also reused for triggering
            placement timeouts if forcing colocation.

    Returns:
        type(Trainable): A trainable class object that can be passed
        to Tune. Resources are automatically set within the object, so
        users do not need to set `resources_per_trial`.

    Example:

    .. code-block:: python

        trainable_cls = DistributedTrainableCreator(
            train_func, num_workers=2)
        analysis = tune.run(trainable_cls)
    """

    warnings.warn(
        "Ray Tune's `DistributedTrainableCreator` will be deprecated in Ray "
        "2.0, and will be replaced by Ray AI Runtime (Ray AIR). Ray AIR ("
        "https://docs.ray.io/en/latest/ray-air/getting-started.html) will "
        "provide greater functionality than `DistributedTrainableCreator`, "
        "and with a more flexible and easy-to-use API.",
        PendingDeprecationWarning,
        stacklevel=2,
    )

    detect_checkpoint_function(func, abort=True)
    if num_workers_per_host:
        if num_workers % num_workers_per_host:
            raise ValueError(
                "`num_workers` must be an integer multiple of "
                "`num_workers_per_host`."
            )

    class WrappedDistributedTorchTrainable(_TorchTrainable):
        _function = func
        _num_workers = num_workers
        _num_cpus_per_worker = num_cpus_per_worker
        _num_gpus_per_worker = num_gpus_per_worker
        _num_workers_per_host = num_workers_per_host
        _timeout_s = timeout_s

        @classmethod
        def default_process_group_parameters(cls) -> Dict:
            return dict(timeout=timedelta(seconds=timeout_s), backend=backend)

        @classmethod
        def default_resource_request(cls, config: Dict) -> PlacementGroupFactory:
            return PlacementGroupFactory(
                [{}]
                + [{"CPU": cls._num_cpus_per_worker, "GPU": cls._num_gpus_per_worker}]
                * num_workers
            )

    return WrappedDistributedTorchTrainable
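The placement group request above is one empty head bundle plus one bundle per worker; for example, with `num_workers=2`, `num_cpus_per_worker=1` and `num_gpus_per_worker=1` it is equivalent to:

PlacementGroupFactory([
    {},                    # head bundle for the coordinating trainable
    {"CPU": 1, "GPU": 1},  # worker 0
    {"CPU": 1, "GPU": 1},  # worker 1
])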
Example #10
    def __init__(self,
                 name,
                 run,
                 stop=None,
                 time_budget_s=None,
                 config=None,
                 resources_per_trial=None,
                 num_samples=1,
                 local_dir=None,
                 upload_dir=None,
                 trial_name_creator=None,
                 trial_dirname_creator=None,
                 log_to_file=False,
                 sync_to_driver=None,
                 sync_to_cloud=None,
                 checkpoint_freq=0,
                 checkpoint_at_end=False,
                 sync_on_checkpoint=True,
                 keep_checkpoints_num=None,
                 checkpoint_score_attr=None,
                 export_formats=None,
                 max_failures=0,
                 restore=None):

        config = config or {}
        if callable(run) and not inspect.isclass(run) and \
                detect_checkpoint_function(run):
            if checkpoint_at_end:
                raise ValueError("'checkpoint_at_end' cannot be used with a "
                                 "checkpointable function. You can specify "
                                 "and register checkpoints within "
                                 "your trainable function.")
            if checkpoint_freq:
                raise ValueError(
                    "'checkpoint_freq' cannot be used with a "
                    "checkpointable function. You can specify checkpoints "
                    "within your trainable function.")
        self._run_identifier = Experiment.register_if_needed(run)
        self.name = name or self._run_identifier

        # If the name has been set explicitly, we don't want to create
        # dated directories. The same is true for string run identifiers.
        if int(os.environ.get("TUNE_DISABLE_DATED_SUBDIR", 0)) == 1 or name \
           or isinstance(run, str):
            self.dir_name = self.name
        else:
            self.dir_name = "{}_{}".format(self.name, date_str())

        if upload_dir:
            self.remote_checkpoint_dir = os.path.join(upload_dir,
                                                      self.dir_name)
        else:
            self.remote_checkpoint_dir = None

        self._stopper = None
        stopping_criteria = {}
        if not stop:
            pass
        elif isinstance(stop, list):
            if any(not isinstance(s, Stopper) for s in stop):
                raise ValueError(
                    "If you pass a list as the `stop` argument to "
                    "`tune.run()`, each element must be an instance of "
                    "`tune.stopper.Stopper`.")
            self._stopper = CombinedStopper(*stop)
        elif isinstance(stop, dict):
            stopping_criteria = stop
        elif callable(stop):
            if FunctionStopper.is_valid_function(stop):
                self._stopper = FunctionStopper(stop)
            elif issubclass(type(stop), Stopper):
                self._stopper = stop
            else:
                raise ValueError("Provided stop object must be either a dict, "
                                 "a function, or a subclass of "
                                 "`ray.tune.Stopper`.")
        else:
            raise ValueError("Invalid stop criteria: {}. Must be a "
                             "callable or dict".format(stop))

        if time_budget_s:
            if self._stopper:
                self._stopper = CombinedStopper(self._stopper,
                                                TimeoutStopper(time_budget_s))
            else:
                self._stopper = TimeoutStopper(time_budget_s)

        _raise_on_durable(self.is_durable_trainable, sync_to_driver,
                          upload_dir)

        stdout_file, stderr_file = _validate_log_to_file(log_to_file)

        spec = {
            "run": self._run_identifier,
            "stop": stopping_criteria,
            "config": config,
            "resources_per_trial": resources_per_trial,
            "num_samples": num_samples,
            "local_dir": os.path.abspath(
                os.path.expanduser(local_dir or DEFAULT_RESULTS_DIR)),
            "upload_dir": upload_dir,
            "remote_checkpoint_dir": self.remote_checkpoint_dir,
            "trial_name_creator": trial_name_creator,
            "trial_dirname_creator": trial_dirname_creator,
            "log_to_file": (stdout_file, stderr_file),
            "sync_to_driver": sync_to_driver,
            "sync_to_cloud": sync_to_cloud,
            "checkpoint_freq": checkpoint_freq,
            "checkpoint_at_end": checkpoint_at_end,
            "sync_on_checkpoint": sync_on_checkpoint,
            "keep_checkpoints_num": keep_checkpoints_num,
            "checkpoint_score_attr": checkpoint_score_attr,
            "export_formats": export_formats or [],
            "max_failures": max_failures,
            "restore": os.path.abspath(os.path.expanduser(restore))
            if restore else None
        }
        self.spec = spec
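The list branch above wraps several stoppers in a `CombinedStopper`; a hedged sketch with a custom `Stopper` subclass (`my_trainable` is hypothetical, and the `__call__`/`stop_all` interface is assumed from `ray.tune.Stopper`):

from ray import tune
from ray.tune import Stopper
from ray.tune.stopper import TimeoutStopper

class LossThresholdStopper(Stopper):
    def __call__(self, trial_id, result):
        return result["mean_loss"] < 0.05  # stop this trial once its loss is low enough

    def stop_all(self):
        return False  # never stop the whole experiment early

tune.run(my_trainable, stop=[LossThresholdStopper(), TimeoutStopper(3600)])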
Example #11
    def __init__(
        self,
        name,
        run,
        stop=None,
        time_budget_s=None,
        config=None,
        resources_per_trial=None,
        num_samples=1,
        local_dir=None,
        _experiment_checkpoint_dir: Optional[str] = None,
        sync_config=None,
        trial_name_creator=None,
        trial_dirname_creator=None,
        log_to_file=False,
        checkpoint_freq=0,
        checkpoint_at_end=False,
        keep_checkpoints_num=None,
        checkpoint_score_attr=None,
        export_formats=None,
        max_failures=0,
        restore=None,
    ):

        local_dir = _get_local_dir_with_expand_user(local_dir)
        # `_experiment_checkpoint_dir` is for internal use only for better
        # support of Tuner API.
        # If set, it should be a subpath under `local_dir`. Also deduce `dir_name`.
        self._experiment_checkpoint_dir = _experiment_checkpoint_dir
        if _experiment_checkpoint_dir:
            experiment_checkpoint_dir_path = Path(_experiment_checkpoint_dir)
            local_dir_path = Path(local_dir)
            assert local_dir_path in experiment_checkpoint_dir_path.parents
            # `dir_name` is set by `_experiment_checkpoint_dir` indirectly.
            self.dir_name = os.path.relpath(_experiment_checkpoint_dir, local_dir)

        config = config or {}
        sync_config = sync_config or SyncConfig()
        if (
            callable(run)
            and not inspect.isclass(run)
            and detect_checkpoint_function(run)
        ):
            if checkpoint_at_end:
                raise ValueError(
                    "'checkpoint_at_end' cannot be used with a "
                    "checkpointable function. You can specify "
                    "and register checkpoints within "
                    "your trainable function."
                )
            if checkpoint_freq:
                raise ValueError(
                    "'checkpoint_freq' cannot be used with a "
                    "checkpointable function. You can specify checkpoints "
                    "within your trainable function."
                )
        try:
            self._run_identifier = Experiment.register_if_needed(run)
        except grpc.RpcError as e:
            if e.code() == grpc.StatusCode.RESOURCE_EXHAUSTED:
                raise TuneError(
                    f"The Trainable/training function is too large for grpc resource "
                    f"limit. Check that its definition is not implicitly capturing a "
                    f"large array or other object in scope. "
                    f"Tip: use tune.with_parameters() to put large objects "
                    f"in the Ray object store. \n"
                    f"Original exception: {traceback.format_exc()}"
                )
            else:
                raise e

        self.name = name or self._run_identifier

        if not _experiment_checkpoint_dir:
            self.dir_name = _get_dir_name(run, name, self.name)

        assert self.dir_name

        if sync_config.upload_dir:
            self.remote_checkpoint_dir = os.path.join(
                sync_config.upload_dir, self.dir_name
            )
        else:
            self.remote_checkpoint_dir = None

        self._stopper = None
        stopping_criteria = {}
        if not stop:
            pass
        elif isinstance(stop, list):
            bad_stoppers = [s for s in stop if not isinstance(s, Stopper)]
            if bad_stoppers:
                stopper_types = [type(s) for s in stop]
                raise ValueError(
                    "If you pass a list as the `stop` argument to "
                    "`tune.run()`, each element must be an instance of "
                    f"`tune.stopper.Stopper`. Got {stopper_types}."
                )
            self._stopper = CombinedStopper(*stop)
        elif isinstance(stop, dict):
            stopping_criteria = stop
        elif callable(stop):
            if FunctionStopper.is_valid_function(stop):
                self._stopper = FunctionStopper(stop)
            elif isinstance(stop, Stopper):
                self._stopper = stop
            else:
                raise ValueError(
                    "Provided stop object must be either a dict, "
                    "a function, or a subclass of "
                    f"`ray.tune.Stopper`. Got {type(stop)}."
                )
        else:
            raise ValueError(
                f"Invalid stop criteria: {stop}. Must be a "
                f"callable or dict. Got {type(stop)}."
            )

        if time_budget_s:
            if self._stopper:
                self._stopper = CombinedStopper(
                    self._stopper, TimeoutStopper(time_budget_s)
                )
            else:
                self._stopper = TimeoutStopper(time_budget_s)

        stdout_file, stderr_file = _validate_log_to_file(log_to_file)

        spec = {
            "run": self._run_identifier,
            "stop": stopping_criteria,
            "time_budget_s": time_budget_s,
            "config": config,
            "resources_per_trial": resources_per_trial,
            "num_samples": num_samples,
            "local_dir": local_dir,
            "sync_config": sync_config,
            "remote_checkpoint_dir": self.remote_checkpoint_dir,
            "trial_name_creator": trial_name_creator,
            "trial_dirname_creator": trial_dirname_creator,
            "log_to_file": (stdout_file, stderr_file),
            "checkpoint_freq": checkpoint_freq,
            "checkpoint_at_end": checkpoint_at_end,
            "keep_checkpoints_num": keep_checkpoints_num,
            "checkpoint_score_attr": checkpoint_score_attr,
            "export_formats": export_formats or [],
            "max_failures": max_failures,
            "restore": os.path.abspath(os.path.expanduser(restore))
            if restore
            else None,
        }
        self.spec = spec
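A small sketch of how `time_budget_s` combines with a regular criterion when passed through `tune.run` (hedged; `my_trainable` is hypothetical):

from ray import tune

analysis = tune.run(
    my_trainable,
    stop={"training_iteration": 1000},  # per-trial criterion
    time_budget_s=3600,                 # additionally stop the whole run after an hour
)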
Example #12
def DistributedTrainableCreator(
    func: Callable,
    num_workers: int = 2,
    num_gpus_per_worker: int = 0,
    num_cpus_per_worker: int = 1,
    num_workers_per_host: Optional[int] = None,
    timeout_s: int = 15 * 60,
) -> Type[_TensorFlowTrainable]:
    """Converts TensorFlow MultiWorkerMirror training to be executable by Tune.

    Requires TensorFlow > 2.0 to work; TensorFlow > 2.2 is recommended.

    This function wraps and sets resources for a TF distributed training
    function to be used with Tune. It generates a TensorFlow Trainable
    which can be a distributed training job.

    Note: there is no fault tolerance at the moment.

    Args:
        func (Callable[[dict], None]): A training function that takes in
            a config dict for hyperparameters.
        num_gpus_per_worker (int): Number of GPUs to request
            from Ray per worker.
        num_cpus_per_worker (int): Number of CPUs to request
            from Ray per worker.
        num_workers (int): Number of hosts that each trial is expected
            to use.
        num_workers_per_host (Optional[int]): Number of workers to
            colocate per host. None if not specified.
        timeout_s (float): Seconds before triggering placement timeouts
            if forcing colocation. Defaults to 15 minutes.


    Returns:
        Trainable class that can be passed into `tune.run`.

    .. versionadded:: 1.1.0

    Example:

    .. code-block:: python

        # Please refer to full example in tf_distributed_keras_example.py
        tf_trainable = DistributedTrainableCreator(
            train_mnist,
            num_workers=2)
        tune.run(tf_trainable,
                 num_samples=1)
    """
    detect_checkpoint_function(func, abort=True)
    if num_workers_per_host:
        if num_workers % num_workers_per_host:
            raise ValueError("`num_workers` must be an integer multiple "
                             f"of num_workers_per_host. Got: "
                             f"num_workers: {num_workers}, "
                             f"num_workers_per_host: {num_workers_per_host}")

    class WrappedDistributedTensorFlowTrainable(_TensorFlowTrainable):
        _function = func
        _num_workers = num_workers
        _num_cpus_per_worker = num_cpus_per_worker
        _num_workers_per_host = num_workers_per_host
        _num_gpus_per_worker = num_gpus_per_worker
        _timeout_s = timeout_s

        @classmethod
        def default_resource_request(cls, config: Dict) -> PlacementGroupFactory:
            return PlacementGroupFactory([{}] +
                                         [{
                                             "CPU": cls._num_cpus_per_worker,
                                             "GPU": cls._num_gpus_per_worker
                                         }] * num_workers)

    return WrappedDistributedTensorFlowTrainable
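When colocation is requested, `num_workers` has to be a multiple of `num_workers_per_host`; a hedged sketch extending the docstring example (`train_mnist` as referenced there):

tf_trainable = DistributedTrainableCreator(
    train_mnist,
    num_workers=4,
    num_workers_per_host=2,  # two workers colocated on each of two hosts
)
tune.run(tf_trainable, num_samples=1)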
Example #13
def wrap_function(train_func: Callable[[Any], Any],
                  warn: bool = True,
                  name: Optional[str] = None) -> Type["FunctionTrainable"]:
    inherit_from = (FunctionTrainable, )

    if hasattr(train_func, "__mixins__"):
        inherit_from = train_func.__mixins__ + inherit_from

    func_args = inspect.getfullargspec(train_func).args
    use_checkpoint = detect_checkpoint_function(train_func)
    use_config_single = detect_config_single(train_func)
    use_reporter = detect_reporter(train_func)

    if not any([use_checkpoint, use_config_single, use_reporter]):
        # use_reporter is hidden
        raise ValueError(
            "Unknown argument found in the Trainable function. "
            "The function args must include a 'config' positional "
            "parameter. Any other args must be 'checkpoint_dir'. "
            "Found: {}".format(func_args))

    if use_config_single and not use_checkpoint:
        if log_once("tune_function_checkpoint") and warn:
            logger.warning(
                "Function checkpointing is disabled. This may result in "
                "unexpected behavior when using checkpointing features or "
                "certain schedulers. To enable, set the train function "
                "arguments to be `func(config, checkpoint_dir=None)`.")

    if use_checkpoint:
        if log_once("tune_checkpoint_dir_deprecation") and warn:
            with warnings.catch_warnings():
                warnings.simplefilter("always")
                warning_msg = (
                    "`checkpoint_dir` in `func(config, checkpoint_dir)` is "
                    "being deprecated. "
                    "To save and load checkpoint in trainable functions, "
                    "please use the `ray.air.session` API:\n\n"
                    "from ray.air import session\n\n"
                    "def train(config):\n"
                    "    # ...\n"
                    '    session.report({"metric": metric}, checkpoint=checkpoint)\n\n'
                    "For more information please see "
                    "https://docs.ray.io/en/master/ray-air/key-concepts.html#session\n"
                )
                warnings.warn(
                    warning_msg,
                    DeprecationWarning,
                )

    class ImplicitFunc(*inherit_from):
        _name = name or (train_func.__name__ if hasattr(
            train_func, "__name__") else "func")

        def __repr__(self):
            return self._name

        def _trainable_func(self, config, reporter, checkpoint_dir):
            if not use_checkpoint and not use_reporter:
                fn = partial(train_func, config)
            elif use_checkpoint:
                fn = partial(train_func, config, checkpoint_dir=checkpoint_dir)
            else:
                fn = partial(train_func, config, reporter)

            def handle_output(output):
                if not output:
                    return
                elif isinstance(output, dict):
                    reporter(**output)
                elif isinstance(output, Number):
                    reporter(_metric=output)
                else:
                    raise ValueError(
                        "Invalid return or yield value. Either return/yield "
                        "a single number or a dictionary object in your "
                        "trainable function.")

            output = None
            if inspect.isgeneratorfunction(train_func):
                for output in fn():
                    handle_output(output)
            else:
                output = fn()
                handle_output(output)

            # If train_func returns, we need to notify the main event loop
            # of the last result while avoiding double logging. This is done
            # with the keyword RESULT_DUPLICATE -- see tune/trial_runner.py.
            reporter(**{RESULT_DUPLICATE: True})
            return output

    return ImplicitFunc
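For reference, a slightly expanded sketch of the `ray.air.session` pattern that the deprecation warning above recommends (assuming the AIR `session.report`/`Checkpoint` API as quoted in the warning):

from ray.air import session
from ray.air.checkpoint import Checkpoint

def train(config):
    for step in range(10):
        metric = 1.0 / (step + 1)
        checkpoint = Checkpoint.from_dict({"step": step})
        session.report({"metric": metric}, checkpoint=checkpoint)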
Example #14
def DistributedTrainableCreator(
    func: Callable[[Dict], None],
    num_workers: int = 2,
    num_gpus_per_worker: int = 0,
    num_cpus_per_worker: int = 1,
    num_workers_per_host: Optional[int] = None,
    timeout_s: int = 15 * 60,
) -> Type[_TensorFlowTrainable]:
    """Converts TensorFlow MultiWorkerMirror training to be executable by Tune.

    Requires TensorFlow > 2.0 to work; TensorFlow > 2.2 is recommended.

    This function wraps and sets resources for a TF distributed training
    function to be used with Tune. It generates a TensorFlow Trainable
    which can be a distributed training job.

    Note: there is no fault tolerance at the moment.

    Args:
        func: A training function that takes in
            a config dict for hyperparameters.
        num_gpus_per_worker: Number of GPUs to request
            from Ray per worker.
        num_cpus_per_worker: Number of CPUs to request
            from Ray per worker.
        num_workers: Number of hosts that each trial is expected
            to use.
        num_workers_per_host: Number of workers to colocate per host.
            None if not specified.
        timeout_s: Seconds before triggering placement timeouts
            if forcing colocation. Defaults to 15 minutes.


    Returns:
        Trainable class that can be passed into `tune.run`.

    .. versionadded:: 1.1.0

    Example:

    .. code-block:: python

        # Please refer to full example in tf_distributed_keras_example.py
        tf_trainable = DistributedTrainableCreator(
            train_mnist,
            num_workers=2)
        tune.run(tf_trainable,
                 num_samples=1)
    """
    warnings.warn(
        "Ray Tune's `DistributedTrainableCreator` will be deprecated in Ray "
        "2.0, and will be replaced by Ray AI Runtime (Ray AIR). Ray AIR ("
        "https://docs.ray.io/en/latest/ray-air/getting-started.html) will "
        "provide greater functionality than `DistributedTrainableCreator`, "
        "and with a more flexible and easy-to-use API.",
        PendingDeprecationWarning,
        stacklevel=2,
    )

    detect_checkpoint_function(func, abort=True)
    if num_workers_per_host:
        if num_workers % num_workers_per_host:
            raise ValueError("`num_workers` must be an integer multiple "
                             f"of num_workers_per_host. Got: "
                             f"num_workers: {num_workers}, "
                             f"num_workers_per_host: {num_workers_per_host}")

    class WrappedDistributedTensorFlowTrainable(_TensorFlowTrainable):
        _function = func
        _num_workers = num_workers
        _num_cpus_per_worker = num_cpus_per_worker
        _num_workers_per_host = num_workers_per_host
        _num_gpus_per_worker = num_gpus_per_worker
        _timeout_s = timeout_s

        @classmethod
        def default_resource_request(cls, config: Dict) -> PlacementGroupFactory:
            return PlacementGroupFactory([{}] +
                                         [{
                                             "CPU": cls._num_cpus_per_worker,
                                             "GPU": cls._num_gpus_per_worker
                                         }] * num_workers)

    return WrappedDistributedTensorFlowTrainable