Example #1
    def run(self, func: Callable, *args, **kwargs):
        """Execute ``func`` with provided arguments in distributed context.

        Example

        .. code-block:: python

            def training(local_rank, config, **kwargs):
                # ...
                print(idist.get_rank(), ": run with config:", config, "- backend=", idist.backend())
                # ...

        Args:
            func (Callable): function to execute. First argument of the function should be `local_rank` - local process
                index.
            *args: positional arguments of ``func`` (without `local_rank`).
            **kwargs: keyword arguments of ``func``.

        """
        if self._spawn_params is not None:
            self.logger.info("Spawn function '{}' in {} processes".format(
                func, self._spawn_params["nproc_per_node"]))
            idist.spawn(self.backend,
                        func,
                        args=args,
                        kwargs_dict=kwargs,
                        **self._spawn_params)
        else:
            self.logger.info("- Run '{}' in {} processes".format(
                func, idist.get_world_size()))
            local_rank = idist.get_local_rank()
            func(local_rank, *args, **kwargs)

        self.logger.info("End of run")
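A minimal usage sketch for the method above, assuming it is the ``run`` method of ``idist.Parallel`` as in Example #3 below (with ``backend=None`` no distributed context is set up and the function runs in the current process):

# Usage sketch (assumes the method above belongs to idist.Parallel, as in Example #3).
import ignite.distributed as idist

def training(local_rank, config, **kwargs):
    # local_rank is always passed as the first positional argument by run()
    print(idist.get_rank(), ": run with config:", config, "- backend=", idist.backend())

config = {"lr": 0.01, "max_epochs": 2}

with idist.Parallel(backend=None) as parallel:
    parallel.run(training, config, a=1, b=2)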
Example #2
def auto_model(model: nn.Module) -> nn.Module:
    """Helper method to adapt provided model for non-distributed and distributed configurations (supporting
    all available backends from :meth:`~ignite.distributed.utils.available_backends()`).

    Internally, we perform the following:

    - send model to current :meth:`~ignite.distributed.utils.device()`.
    - wrap the model with `torch DistributedDataParallel`_ for native torch distributed if world size is larger than 1.
    - wrap the model with `torch DataParallel`_ if no distributed context is found and more than one CUDA device is available.

    Examples:

    .. code-block:: python

        import ignite.distributed as idist

        model = idist.auto_model(model)

    Args:
        model (torch.nn.Module): model to adapt.

    Returns:
        torch.nn.Module

    .. _torch DistributedDataParallel: https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel
    .. _torch DataParallel: https://pytorch.org/docs/stable/nn.html#torch.nn.DataParallel
    """
    logger = setup_logger(__name__ + ".auto_model")

    model.to(idist.device())

    # distributed data parallel model
    if idist.get_world_size() > 1:
        if idist.backend() == idist_native.NCCL:
            lrank = idist.get_local_rank()
            logger.info("Apply torch DistributedDataParallel on model, device id: {}".format(lrank))
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[lrank])
        elif idist.backend() == idist_native.GLOO:
            logger.info("Apply torch DistributedDataParallel on model")
            model = torch.nn.parallel.DistributedDataParallel(model)

    # not distributed but multiple GPUs reachable so data parallel model
    elif torch.cuda.device_count() > 1 and "cuda" in idist.device().type:
        logger.info("Apply torch DataParallel on model")
        model = torch.nn.parallel.DataParallel(model)

    return model
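A short usage sketch for the helper above; the toy model is illustrative and not part of the snippet:

# Usage sketch: the toy model below is illustrative, only idist.auto_model
# comes from the snippet above.
import torch.nn as nn
import ignite.distributed as idist

model = nn.Sequential(nn.Linear(10, 20), nn.ReLU(), nn.Linear(20, 2))

# Outside any distributed context the model is moved to idist.device() and is
# wrapped with DataParallel only if more than one CUDA device is visible.
model = idist.auto_model(model)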
Example #3
    def run(self, func: Callable, *args: Any, **kwargs: Any) -> None:
        """Execute ``func`` with provided arguments in distributed context.

        Example

        .. code-block:: python

            def training(local_rank, config, **kwargs):
                # ...
                print(idist.get_rank(), ": run with config:", config, "- backend=", idist.backend())
                # ...

            with idist.Parallel(backend=backend) as parallel:
                parallel.run(training, config, a=1, b=2)

        Args:
            func: function to execute. First argument of the function should be `local_rank` - local process
                index.
            args: positional arguments of ``func`` (without `local_rank`).
            kwargs: keyword arguments of ``func``.

        """
        if self._spawn_params is not None and self.backend is not None:
            self._logger.info(  # type: ignore[attr-defined]
                f"Spawn function '{func}' in {self._spawn_params['nproc_per_node']} processes"
            )
            idist.spawn(self.backend,
                        func,
                        args=args,
                        kwargs_dict=kwargs,
                        **self._spawn_params)
        else:
            self._logger.info(  # type: ignore[attr-defined]
                f"- Run '{func}' in {idist.get_world_size()} processes"
            )
            local_rank = idist.get_local_rank()
            func(local_rank, *args, **kwargs)

        self._logger.info("End of run")  # type: ignore[attr-defined]
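A sketch of the spawning branch, assuming the surrounding class is ``idist.Parallel`` and that it accepts ``nproc_per_node`` (true for ignite's launcher, though neither is visible in this snippet): passing ``nproc_per_node`` makes ``_spawn_params`` non-``None``, so ``run`` dispatches to ``idist.spawn``.

# Sketch of the spawn path (assumes the surrounding class is idist.Parallel
# and that it accepts nproc_per_node, neither of which is shown above).
import ignite.distributed as idist

def training(local_rank, config, **kwargs):
    print(idist.get_rank(), ": run with config:", config, "- backend=", idist.backend())

# With nproc_per_node set, _spawn_params is not None, so run() goes through
# idist.spawn() and starts 4 worker processes on this node.
with idist.Parallel(backend="gloo", nproc_per_node=4) as parallel:
    parallel.run(training, {"lr": 0.01})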
Example #4
def auto_model(model: nn.Module,
               sync_bn: bool = False,
               **kwargs: Any) -> nn.Module:
    """Helper method to adapt provided model for non-distributed and distributed configurations (supporting
    all available backends from :meth:`~ignite.distributed.utils.available_backends()`).

    Internally, we perform the following:

    - send model to current :meth:`~ignite.distributed.utils.device()` if model's parameters are not on the device.
    - wrap the model with `torch DistributedDataParallel`_ for native torch distributed if world size is larger than 1.
    - wrap the model with `torch DataParallel`_ if no distributed context is found and more than one CUDA device is available.
    - broadcast the initial variable states from rank 0 to all other processes if Horovod distributed framework is used.

    Examples:

    .. code-block:: python

        import ignite.distributed as idist

        model = idist.auto_model(model)

    In addition, with Nvidia/Apex it can be used in the following way:

    .. code-block:: python

        import ignite.distributed as idist

        model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
        model = idist.auto_model(model)

    Args:
        model: model to adapt.
        sync_bn: if True, applies `torch convert_sync_batchnorm`_ to the model for native torch
            distributed only. Default: False. Note that, if using Nvidia/Apex, batchnorm conversion
            should be applied before calling ``amp.initialize``.
        kwargs: keyword arguments forwarded to the model's wrapping class: `torch DistributedDataParallel`_
            or `torch DataParallel`_, if applicable. Please make sure the kwargs are valid for the given backend.

    Returns:
        torch.nn.Module

    .. _torch DistributedDataParallel: https://pytorch.org/docs/stable/generated/torch.nn.parallel.
        DistributedDataParallel.html
    .. _torch DataParallel: https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html
    .. _torch convert_sync_batchnorm: https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html#
        torch.nn.SyncBatchNorm.convert_sync_batchnorm

    .. versionchanged:: 0.4.2

        - Added Horovod distributed framework.
        - Added ``sync_bn`` argument.

    .. versionchanged:: 0.4.3
        Added kwargs to ``idist.auto_model``.
    """
    logger = setup_logger(__name__ + ".auto_model")

    # Put model's parameters to device if its parameters are not on the device
    device = idist.device()
    if not all([p.device == device for p in model.parameters()]):
        model.to(device)

    # distributed data parallel model
    if idist.get_world_size() > 1:
        bnd = idist.backend()
        if idist.has_native_dist_support and bnd in (idist_native.NCCL,
                                                     idist_native.GLOO,
                                                     idist_native.MPI):
            if sync_bn:
                logger.info("Convert batch norm to sync batch norm")
                model = nn.SyncBatchNorm.convert_sync_batchnorm(model)

            if torch.cuda.is_available():
                if "device_ids" in kwargs:
                    raise ValueError(
                        f"Argument kwargs should not contain 'device_ids', but got {kwargs}"
                    )

                lrank = idist.get_local_rank()
                logger.info(
                    f"Apply torch DistributedDataParallel on model, device id: {lrank}"
                )
                kwargs["device_ids"] = [lrank]
            else:
                logger.info("Apply torch DistributedDataParallel on model")

            model = torch.nn.parallel.DistributedDataParallel(model, **kwargs)
        elif idist.has_hvd_support and bnd == idist_hvd.HOROVOD:
            import horovod.torch as hvd

            logger.info(
                "Broadcast the initial variable states from rank 0 to all other processes"
            )
            hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    # not distributed but multiple GPUs reachable so data parallel model
    elif torch.cuda.device_count() > 1 and "cuda" in idist.device().type:
        logger.info("Apply torch DataParallel on model")
        model = torch.nn.parallel.DataParallel(model, **kwargs)

    return model
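A hedged sketch of the ``sync_bn`` and ``kwargs`` paths of the version above; ``find_unused_parameters`` is a standard ``DistributedDataParallel`` argument, used here only to illustrate what ``kwargs`` forwards:

# Sketch of sync_bn and kwargs forwarding; find_unused_parameters is a standard
# DistributedDataParallel argument, used here purely as an illustration.
import torch.nn as nn
import ignite.distributed as idist

model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())

# Under a native torch distributed backend (nccl/gloo/mpi) with world size > 1:
# - BatchNorm layers are converted to SyncBatchNorm because sync_bn=True,
# - find_unused_parameters=True is forwarded to DistributedDataParallel.
model = idist.auto_model(model, sync_bn=True, find_unused_parameters=True)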
Example #5
def auto_model(model: nn.Module, sync_bn: bool = False) -> nn.Module:
    """Helper method to adapt provided model for non-distributed and distributed configurations (supporting
    all available backends from :meth:`~ignite.distributed.utils.available_backends()`).

    Internally, we perform the following:

    - send model to current :meth:`~ignite.distributed.utils.device()` if model's parameters are not on the device.
    - wrap the model with `torch DistributedDataParallel`_ for native torch distributed if world size is larger than 1.
    - wrap the model with `torch DataParallel`_ if no distributed context is found and more than one CUDA device is available.
    - broadcast the initial variable states from rank 0 to all other processes if Horovod distributed framework is used.

    Examples:

    .. code-block:: python

        import ignite.distributed as idist

        model = idist.auto_model(model)

    In addition, with Nvidia/Apex it can be used in the following way:

    .. code-block:: python

        import ignite.distributed as idist

        model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
        model = idist.auto_model(model)

    Args:
        model (torch.nn.Module): model to adapt.
        sync_bn (bool): if True, applies `torch convert_sync_batchnorm`_ to the model for native torch
            distributed only. Default: False. Note that, if using Nvidia/Apex, batchnorm conversion
            should be applied before calling ``amp.initialize``.

    Returns:
        torch.nn.Module

    .. _torch DistributedDataParallel: https://pytorch.org/docs/stable/generated/torch.nn.parallel.
        DistributedDataParallel.html
    .. _torch DataParallel: https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html
    .. _torch convert_sync_batchnorm: https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html#
        torch.nn.SyncBatchNorm.convert_sync_batchnorm

    """
    logger = setup_logger(__name__ + ".auto_model")

    # Put model's parameters to device if its parameters are not on the device
    device = idist.device()
    if not all([p.device == device for p in model.parameters()]):
        model.to(device)

    # distributed data parallel model
    if idist.get_world_size() > 1:
        bnd = idist.backend()
        if idist.has_native_dist_support and bnd == idist_native.NCCL:
            if sync_bn:
                logger.info("Convert batch norm to sync batch norm")
                model = nn.SyncBatchNorm.convert_sync_batchnorm(model)

            lrank = idist.get_local_rank()
            logger.info("Apply torch DistributedDataParallel on model, device id: {}".format(lrank))
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[lrank])
        elif idist.has_native_dist_support and bnd == idist_native.GLOO:
            if sync_bn:
                logger.info("Convert batch norm to sync batch norm")
                model = nn.SyncBatchNorm.convert_sync_batchnorm(model)

            logger.info("Apply torch DistributedDataParallel on model")
            model = torch.nn.parallel.DistributedDataParallel(model)
        elif idist.has_hvd_support and bnd == idist_hvd.HOROVOD:
            import horovod.torch as hvd

            logger.info(
                "Broadcast the initial variable states from rank 0 to all other processes"
            )
            hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    # not distributed but multiple GPUs reachable so data parallel model
    elif torch.cuda.device_count() > 1 and "cuda" in idist.device().type:
        logger.info("Apply torch DataParallel on model")
        model = torch.nn.parallel.DataParallel(model)

    return model
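A small sketch of inspecting what this version returns; unwrapping via ``.module`` is standard torch behavior for DDP/DataParallel wrappers, not something defined by the snippet:

# Sketch: inspect what auto_model returned; .module is the standard torch way
# to reach the underlying model inside a DDP/DataParallel wrapper.
import torch.nn as nn
import ignite.distributed as idist

model = idist.auto_model(nn.Linear(16, 4))

if isinstance(model, (nn.parallel.DistributedDataParallel, nn.parallel.DataParallel)):
    original = model.module  # the underlying, unwrapped model
else:
    original = model  # returned unchanged (e.g. single process, at most one GPU, or Horovod)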