def auto_model(model: nn.Module) -> nn.Module:
    """Helper method to adapt provided model for non-distributed and distributed configurations (supporting
    all available backends from :meth:`~ignite.distributed.utils.available_backends()`).

    Internally, we perform the following:

    - send model to current :meth:`~ignite.distributed.utils.device()`.
    - wrap the model with `torch DistributedDataParallel`_ for native torch distributed if world size is larger than 1.
    - wrap the model with `torch DataParallel`_ if no distributed context is found and more than one CUDA device is
      available.

    Examples:

    .. code-block:: python

        import ignite.distributed as idist

        model = idist.auto_model(model)

    Args:
        model (torch.nn.Module): model to adapt.

    Returns:
        torch.nn.Module

    .. _torch DistributedDataParallel: https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel
    .. _torch DataParallel: https://pytorch.org/docs/stable/nn.html#torch.nn.DataParallel
    """
    logger = setup_logger(__name__ + ".auto_model")

    model.to(idist.device())

    # distributed data parallel model
    if idist.get_world_size() > 1:
        if idist.backend() == idist_native.NCCL:
            lrank = idist.get_local_rank()
            logger.info("Apply torch DistributedDataParallel on model, device id: {}".format(lrank))
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[lrank])
        elif idist.backend() == idist_native.GLOO:
            logger.info("Apply torch DistributedDataParallel on model")
            model = torch.nn.parallel.DistributedDataParallel(model)

    # not distributed but multiple GPUs reachable so data parallel model
    elif torch.cuda.device_count() > 1 and "cuda" in idist.device().type:
        logger.info("Apply torch DataParallel on model")
        model = torch.nn.parallel.DataParallel(model)

    return model
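
# Illustrative sketch (not part of the library): after ``auto_model`` wraps the model in
# DistributedDataParallel or DataParallel, the original module stays reachable through the standard
# ``.module`` attribute of those wrappers. The ``nn.Linear`` below is only a placeholder model.
def _unwrap_model_example() -> None:
    import torch.nn as nn

    model = nn.Linear(10, 2)  # stands in for a user-defined model
    wrapped = auto_model(model)

    # ``wrapped`` is the plain module when no wrapping was applied,
    # otherwise the original module sits under ``wrapped.module``.
    original = wrapped.module if hasattr(wrapped, "module") else wrapped
    assert original is model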
def auto_optim(optimizer: Optimizer) -> Optimizer:
    """Helper method to adapt optimizer for non-distributed and distributed configurations (supporting
    all available backends from :meth:`~ignite.distributed.utils.available_backends()`).

    Internally, this method is a no-op for non-distributed and torch native distributed configurations.
    For XLA distributed configuration, we create a new class that inherits from the provided optimizer.
    The goal is to override the `step()` method with specific `xm.optimizer_step`_ implementation.

    Examples:

    .. code-block:: python

        import ignite.distributed as idist

        optimizer = idist.auto_optim(optimizer)

    Args:
        optimizer (Optimizer): input torch optimizer

    Returns:
        Optimizer

    .. _xm.optimizer_step: http://pytorch.org/xla/release/1.5/index.html#torch_xla.core.xla_model.optimizer_step
    """
    if not (idist.has_xla_support and idist.backend() == idist_xla.XLA_TPU):
        return optimizer

    cls = type(optimizer.__class__.__name__, (optimizer.__class__,), dict(_XLADistributedOptimizer.__dict__))
    return cls(optimizer)
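
# Illustrative sketch (not part of the library): the XLA branch above builds a new class at runtime with
# ``type(name, bases, namespace)`` so the returned object keeps the original optimizer's class name while
# its methods (notably ``step``) come from ``_XLADistributedOptimizer``. The toy classes below only
# demonstrate that pattern; they are hypothetical.
def _dynamic_subclass_example() -> None:
    class Base:
        def step(self) -> str:
            return "base step"

    class _Override(Base):  # plays the role of ``_XLADistributedOptimizer``
        def step(self) -> str:
            return "overridden step"

    # Build a class named like ``Base`` whose namespace is taken from ``_Override``.
    cls = type(Base.__name__, (Base,), dict(_Override.__dict__))
    obj = cls()

    assert obj.__class__.__name__ == "Base"
    assert obj.step() == "overridden step"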
def auto_optim(optimizer: Optimizer, **kwargs: Any) -> Optimizer:
    """Helper method to adapt optimizer for non-distributed and distributed configurations (supporting
    all available backends from :meth:`~ignite.distributed.utils.available_backends()`).

    Internally, this method is a no-op for non-distributed and torch native distributed configurations.

    For XLA distributed configuration, we create a new class that inherits from the provided optimizer.
    The goal is to override the `step()` method with specific `xm.optimizer_step`_ implementation.

    For Horovod distributed configuration, optimizer is wrapped with Horovod Distributed Optimizer and
    its state is broadcast from rank 0 to all other processes.

    Args:
        optimizer: input torch optimizer
        kwargs: kwargs to Horovod backend's DistributedOptimizer.

    Returns:
        Optimizer

    Examples:

    .. code-block:: python

        import ignite.distributed as idist

        optimizer = idist.auto_optim(optimizer)

    .. _xm.optimizer_step: http://pytorch.org/xla/release/1.5/index.html#torch_xla.core.xla_model.optimizer_step

    .. versionchanged:: 0.4.2
        Added Horovod distributed optimizer.

    .. versionchanged:: 0.5.0
        Added kwargs to ``idist.auto_optim``.
    """
    bnd = idist.backend()
    if idist.has_xla_support and bnd == idist_xla.XLA_TPU:
        cls = type(optimizer.__class__.__name__, (optimizer.__class__,), dict(_XLADistributedOptimizer.__dict__))
        return cls(optimizer)

    if idist.has_hvd_support and bnd == idist_hvd.HOROVOD:
        import horovod.torch as hvd

        optimizer = hvd.DistributedOptimizer(optimizer, **kwargs)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
        return optimizer

    return optimizer
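
# Illustrative sketch (not part of the library): with the Horovod backend, extra kwargs are passed straight
# to ``hvd.DistributedOptimizer``. ``compression=hvd.Compression.fp16`` is a standard Horovod option, but
# whether it suits a given setup is an assumption of this example.
def _auto_optim_hvd_example() -> None:
    import torch.nn as nn
    import torch.optim as optim
    import ignite.distributed as idist

    model = idist.auto_model(nn.Linear(10, 2))
    optimizer = optim.SGD(model.parameters(), lr=0.01)

    if idist.backend() == "horovod":
        import horovod.torch as hvd

        # kwargs are forwarded to hvd.DistributedOptimizer(...)
        optimizer = idist.auto_optim(optimizer, compression=hvd.Compression.fp16)
    else:
        optimizer = idist.auto_optim(optimizer)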
def auto_dataloader(dataset: Dataset, **kwargs: Any) -> Union[DataLoader, "_MpDeviceLoader"]:
    """Helper method to create a dataloader adapted for non-distributed and distributed configurations (supporting
    all available backends from :meth:`~ignite.distributed.utils.available_backends()`).

    Internally, we create a dataloader with provided kwargs while applying the following updates:

    - batch size is scaled by world size: ``batch_size / world_size`` if larger or equal to the world size.
    - number of workers is scaled by number of local processes: ``num_workers / nproc_per_node`` if larger or
      equal to the number of processes per node.
    - if no sampler is provided by the user, a `torch DistributedSampler`_ is set up.
    - if a `torch DistributedSampler`_ is provided by the user, it is used without wrapping it.
    - if another sampler is provided, it is wrapped by :class:`~ignite.distributed.auto.DistributedProxySampler`.
    - if the default device is 'cuda', `pin_memory` is automatically set to `True`.

    .. warning::

        Custom batch sampler is not adapted for distributed configuration. Please, make sure that provided batch
        sampler is compatible with distributed configuration.

    Examples:

    .. code-block:: python

        import ignite.distributed as idist

        train_loader = idist.auto_dataloader(
            train_dataset,
            batch_size=32,
            num_workers=4,
            shuffle=True,
            pin_memory="cuda" in idist.device().type,
            drop_last=True,
        )

    Args:
        dataset: input torch dataset. If input dataset is `torch IterableDataset`_ then dataloader will be
            created without any distributed sampling. Please, make sure that the dataset itself produces
            different data on different ranks.
        kwargs: keyword arguments for `torch DataLoader`_.

    Returns:
        `torch DataLoader`_ or `XLA MpDeviceLoader`_ for XLA devices

    .. _torch DataLoader: https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader
    .. _XLA MpDeviceLoader: https://github.com/pytorch/xla/blob/master/torch_xla/distributed/parallel_loader.py#L178
    .. _torch DistributedSampler:
        https://pytorch.org/docs/stable/data.html#torch.utils.data.distributed.DistributedSampler
    .. _torch IterableDataset: https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset
    """
    rank = idist.get_rank()
    world_size = idist.get_world_size()

    logger = setup_logger(__name__ + ".auto_dataloader")

    if world_size > 1:
        if "batch_size" in kwargs and kwargs["batch_size"] >= world_size:
            kwargs["batch_size"] //= world_size

        nproc = idist.get_nproc_per_node()
        if "num_workers" in kwargs and kwargs["num_workers"] >= nproc:
            kwargs["num_workers"] = (kwargs["num_workers"] + nproc - 1) // nproc

        if "batch_sampler" not in kwargs:
            if isinstance(dataset, IterableDataset):
                logger.info(
                    "Found iterable dataset, dataloader will be created without any distributed sampling. "
                    "Please, make sure that the dataset itself produces different data on different ranks."
                )
            else:
                sampler: Optional[Union[DistributedProxySampler, DistributedSampler, Sampler]]
                sampler = kwargs.get("sampler", None)
                if isinstance(sampler, DistributedSampler):
                    if sampler.rank != rank:
                        warnings.warn(f"Found distributed sampler with rank={sampler.rank}, but process rank is {rank}")
                    if sampler.num_replicas != world_size:
                        warnings.warn(
                            f"Found distributed sampler with num_replicas={sampler.num_replicas}, "
                            f"but world size is {world_size}"
                        )
                elif sampler is None:
                    # removes "shuffle" from kwargs if sampler is used
                    shuffle = kwargs.pop("shuffle", True)
                    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=shuffle)
                else:
                    sampler = DistributedProxySampler(sampler, num_replicas=world_size, rank=rank)
                kwargs["sampler"] = sampler
        else:
            warnings.warn(
                "Found batch_sampler in provided kwargs. Please, make sure that it is compatible "
                "with distributed configuration"
            )

    if idist.has_xla_support and idist.backend() == idist_xla.XLA_TPU and kwargs.get("pin_memory", False):
        # TODO: How about XLA GPU ?
        warnings.warn(
            "Found incompatible options: xla support and pin_memory args equal True. "
            "Argument `pin_memory=False` will be used to construct data loader."
        )
        kwargs["pin_memory"] = False
    else:
        kwargs["pin_memory"] = kwargs.get("pin_memory", "cuda" in idist.device().type)

    logger.info(f"Use data loader kwargs for dataset '{repr(dataset)[:20].strip()}': \n\t{kwargs}")

    dataloader = DataLoader(dataset, **kwargs)

    if idist.has_xla_support and idist.backend() == idist_xla.XLA_TPU and world_size > 1:
        logger.info("DataLoader is wrapped by `MpDeviceLoader` on XLA")

        mp_device_loader_cls = _MpDeviceLoader
        try:
            from torch_xla.distributed.parallel_loader import MpDeviceLoader

            mp_device_loader_cls = MpDeviceLoader
        except ImportError:
            pass

        mp_dataloader = mp_device_loader_cls(dataloader, idist.device())
        mp_dataloader.sampler = dataloader.sampler  # type: ignore[attr-defined]
        return mp_dataloader

    return dataloader
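
# Illustrative sketch (not part of the library): typical training-loop usage of ``auto_dataloader``.
# When a ``DistributedSampler`` is set up under the hood, its epoch should be advanced every epoch so that
# shuffling differs between epochs. ``train_dataset`` and ``num_epochs`` are placeholders.
def _auto_dataloader_loop_example(train_dataset: Dataset, num_epochs: int = 2) -> None:
    import ignite.distributed as idist
    from torch.utils.data.distributed import DistributedSampler

    train_loader = idist.auto_dataloader(train_dataset, batch_size=32, num_workers=4, shuffle=True, drop_last=True)

    for epoch in range(num_epochs):
        # reshuffle per-rank data partitions each epoch
        if isinstance(train_loader.sampler, DistributedSampler):
            train_loader.sampler.set_epoch(epoch)
        for batch in train_loader:
            pass  # training step goes here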
def auto_model(model: nn.Module, sync_bn: bool = False, **kwargs: Any) -> nn.Module:
    """Helper method to adapt provided model for non-distributed and distributed configurations (supporting
    all available backends from :meth:`~ignite.distributed.utils.available_backends()`).

    Internally, we perform the following:

    - send model to current :meth:`~ignite.distributed.utils.device()` if model's parameters are not on the device.
    - wrap the model with `torch DistributedDataParallel`_ for native torch distributed if world size is larger than 1.
    - wrap the model with `torch DataParallel`_ if no distributed context is found and more than one CUDA device is
      available.
    - broadcast the initial variable states from rank 0 to all other processes if Horovod distributed framework is used.

    Examples:

    .. code-block:: python

        import ignite.distributed as idist

        model = idist.auto_model(model)

    In addition, with NVidia/Apex, it can be used in the following way:

    .. code-block:: python

        import ignite.distributed as idist

        model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
        model = idist.auto_model(model)

    Args:
        model: model to adapt.
        sync_bn: if True, applies `torch convert_sync_batchnorm`_ to the model for native torch
            distributed only. Default, False. Note, if using Nvidia/Apex, batchnorm conversion should be
            applied before calling ``amp.initialize``.
        kwargs: kwargs to model's wrapping class: `torch DistributedDataParallel`_ or `torch DataParallel`_
            if applicable. Please, make sure to use acceptable kwargs for given backend.

    Returns:
        torch.nn.Module

    .. _torch DistributedDataParallel: https://pytorch.org/docs/stable/generated/torch.nn.parallel.
        DistributedDataParallel.html
    .. _torch DataParallel: https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html
    .. _torch convert_sync_batchnorm: https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html#
        torch.nn.SyncBatchNorm.convert_sync_batchnorm

    .. versionchanged:: 0.4.2

        - Added Horovod distributed framework.
        - Added ``sync_bn`` argument.

    .. versionchanged:: 0.4.3
        Added kwargs to ``idist.auto_model``.
    """
    logger = setup_logger(__name__ + ".auto_model")

    # Put model's parameters to device if its parameters are not on the device
    device = idist.device()
    if not all([p.device == device for p in model.parameters()]):
        model.to(device)

    # distributed data parallel model
    if idist.get_world_size() > 1:
        bnd = idist.backend()
        if idist.has_native_dist_support and bnd in (idist_native.NCCL, idist_native.GLOO, idist_native.MPI):
            if sync_bn:
                logger.info("Convert batch norm to sync batch norm")
                model = nn.SyncBatchNorm.convert_sync_batchnorm(model)

            if torch.cuda.is_available():
                if "device_ids" in kwargs:
                    raise ValueError(f"Argument kwargs should not contain 'device_ids', but got {kwargs}")

                lrank = idist.get_local_rank()
                logger.info(f"Apply torch DistributedDataParallel on model, device id: {lrank}")
                kwargs["device_ids"] = [lrank]
            else:
                logger.info("Apply torch DistributedDataParallel on model")

            model = torch.nn.parallel.DistributedDataParallel(model, **kwargs)
        elif idist.has_hvd_support and bnd == idist_hvd.HOROVOD:
            import horovod.torch as hvd

            logger.info("Broadcast the initial variable states from rank 0 to all other processes")
            hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    # not distributed but multiple GPUs reachable so data parallel model
    elif torch.cuda.device_count() > 1 and "cuda" in idist.device().type:
        logger.info("Apply torch DataParallel on model")
        model = torch.nn.parallel.DataParallel(model, **kwargs)

    return model
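
# Illustrative sketch (not part of the library): the three ``auto_*`` helpers are typically combined inside
# a distributed launcher. ``idist.Parallel`` is part of ignite.distributed; the backend value, the model and
# the dataset below are placeholder assumptions.
def _training(local_rank: int) -> None:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    import ignite.distributed as idist
    from torch.utils.data import TensorDataset

    device = idist.device()
    dataset = TensorDataset(torch.rand(128, 10), torch.rand(128, 1))  # placeholder data
    loader = auto_dataloader(dataset, batch_size=16, shuffle=True)

    model = auto_model(nn.Linear(10, 1))
    optimizer = auto_optim(optim.SGD(model.parameters(), lr=0.01))

    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        loss = nn.functional.mse_loss(model(x), y)
        loss.backward()
        optimizer.step()


# Hypothetical launch configuration:
# with idist.Parallel(backend="gloo", nproc_per_node=2) as parallel:
#     parallel.run(_training)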
def auto_model(model: nn.Module, sync_bn: bool = False) -> nn.Module:
    """Helper method to adapt provided model for non-distributed and distributed configurations (supporting
    all available backends from :meth:`~ignite.distributed.utils.available_backends()`).

    Internally, we perform the following:

    - send model to current :meth:`~ignite.distributed.utils.device()` if model's parameters are not on the device.
    - wrap the model with `torch DistributedDataParallel`_ for native torch distributed if world size is larger than 1.
    - wrap the model with `torch DataParallel`_ if no distributed context is found and more than one CUDA device is
      available.
    - broadcast the initial variable states from rank 0 to all other processes if Horovod distributed framework is used.

    Examples:

    .. code-block:: python

        import ignite.distributed as idist

        model = idist.auto_model(model)

    In addition, with NVidia/Apex, it can be used in the following way:

    .. code-block:: python

        import ignite.distributed as idist

        model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
        model = idist.auto_model(model)

    Args:
        model (torch.nn.Module): model to adapt.
        sync_bn (bool): if True, applies `torch convert_sync_batchnorm`_ to the model for native torch
            distributed only. Default, False. Note, if using Nvidia/Apex, batchnorm conversion should be
            applied before calling ``amp.initialize``.

    Returns:
        torch.nn.Module

    .. _torch DistributedDataParallel: https://pytorch.org/docs/stable/generated/torch.nn.parallel.
        DistributedDataParallel.html
    .. _torch DataParallel: https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html
    .. _torch convert_sync_batchnorm: https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html#
        torch.nn.SyncBatchNorm.convert_sync_batchnorm
    """
    logger = setup_logger(__name__ + ".auto_model")

    # Put model's parameters to device if its parameters are not on the device
    device = idist.device()
    if not all([p.device == device for p in model.parameters()]):
        model.to(device)

    # distributed data parallel model
    if idist.get_world_size() > 1:
        bnd = idist.backend()
        if idist.has_native_dist_support and bnd == idist_native.NCCL:
            if sync_bn:
                logger.info("Convert batch norm to sync batch norm")
                model = nn.SyncBatchNorm.convert_sync_batchnorm(model)

            lrank = idist.get_local_rank()
            logger.info("Apply torch DistributedDataParallel on model, device id: {}".format(lrank))
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[lrank])
        elif idist.has_native_dist_support and bnd == idist_native.GLOO:
            if sync_bn:
                logger.info("Convert batch norm to sync batch norm")
                model = nn.SyncBatchNorm.convert_sync_batchnorm(model)

            logger.info("Apply torch DistributedDataParallel on model")
            model = torch.nn.parallel.DistributedDataParallel(model)
        elif idist.has_hvd_support and bnd == idist_hvd.HOROVOD:
            import horovod.torch as hvd

            logger.info("Broadcast the initial variable states from rank 0 to all other processes")
            hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    # not distributed but multiple GPUs reachable so data parallel model
    elif torch.cuda.device_count() > 1 and "cuda" in idist.device().type:
        logger.info("Apply torch DataParallel on model")
        model = torch.nn.parallel.DataParallel(model)

    return model
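
# Illustrative sketch (not part of the library): ``sync_bn=True`` converts BatchNorm layers to
# ``SyncBatchNorm`` for native torch distributed. When mixing with NVidia/Apex amp, the docstring above
# advises converting before ``amp.initialize``; the commented Apex part assumes apex is installed.
def _sync_bn_example() -> None:
    import torch.nn as nn

    model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())

    # plain native-distributed case: let auto_model convert batch norms
    model = auto_model(model, sync_bn=True)

    # Apex case (hypothetical): convert manually first, then initialize amp, then wrap
    # model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
    # model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    # model = auto_model(model)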
def auto_dataloader(dataset, **kwargs):
    """Helper method to create a dataloader adapted for non-distributed and distributed configurations (supporting
    all available backends from :meth:`~ignite.distributed.utils.available_backends()`).

    Internally, we create a dataloader with provided kwargs while applying the following updates:

    - batch size is scaled by world size: ``batch_size / world_size``.
    - number of workers is scaled by number of local processes: ``num_workers / nproc_per_node``.
    - if no sampler is provided by the user, a `torch DistributedSampler`_ is set up.
    - if a sampler is provided by the user, it is wrapped by :class:`~ignite.distributed.auto.DistributedProxySampler`.

    .. warning::

        Custom batch sampler is not adapted for distributed configuration. Please, make sure that provided batch
        sampler is compatible with distributed configuration.

    Examples:

    .. code-block:: python

        import ignite.distributed as idist

        train_loader = idist.auto_dataloader(
            train_dataset,
            batch_size=32,
            num_workers=4,
            shuffle=True,
            pin_memory="cuda" in idist.device().type,
            drop_last=True,
        )

    Args:
        dataset (Dataset): input torch dataset
        **kwargs: keyword arguments for `torch DataLoader`_.

    Returns:
        `torch DataLoader`_ or `XLA MpDeviceLoader`_ for XLA devices

    .. _torch DataLoader: https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader
    .. _XLA MpDeviceLoader: https://github.com/pytorch/xla/blob/master/torch_xla/distributed/parallel_loader.py#L178
    .. _torch DistributedSampler:
        https://pytorch.org/docs/stable/data.html#torch.utils.data.distributed.DistributedSampler
    """
    rank = idist.get_rank()
    world_size = idist.get_world_size()

    logger = setup_logger(__name__ + ".auto_dataloader")

    if world_size > 1:
        if "batch_size" in kwargs:
            kwargs["batch_size"] //= world_size

        if "num_workers" in kwargs:
            nproc = idist.get_nproc_per_node()
            kwargs["num_workers"] = (kwargs["num_workers"] + nproc - 1) // nproc

        if "batch_sampler" not in kwargs:
            if kwargs.get("sampler", None) is not None:
                sampler = DistributedProxySampler(kwargs["sampler"], num_replicas=world_size, rank=rank)
            else:
                sampler = DistributedSampler(
                    dataset, num_replicas=world_size, rank=rank, shuffle=kwargs.get("shuffle", True)
                )
                # we need to remove "shuffle" from kwargs if sampler is used
                if "shuffle" in kwargs:
                    del kwargs["shuffle"]

            kwargs["sampler"] = sampler
        else:
            warnings.warn(
                "Found batch_sampler in provided kwargs. Please, make sure that it is compatible "
                "with distributed configuration"
            )

    if idist.has_xla_support and idist.backend() == idist_xla.XLA_TPU and kwargs.get("pin_memory", False):
        # TODO: How about XLA GPU ?
        warnings.warn(
            "Found incompatible options: xla support and pin_memory args equal True. "
            "Argument `pin_memory=False` will be used to construct data loader."
        )
        kwargs["pin_memory"] = False

    logger.info("Use data loader kwargs for dataset '{}': \n\t{}".format(repr(dataset)[:20].strip(), kwargs))
    dataloader = DataLoader(dataset, **kwargs)

    if idist.has_xla_support and idist.backend() == idist_xla.XLA_TPU and world_size > 1:
        logger.info("DataLoader is wrapped by `MpDeviceLoader` on XLA")

        mp_device_loader_cls = _MpDeviceLoader
        try:
            from torch_xla.distributed.parallel_loader import MpDeviceLoader

            mp_device_loader_cls = MpDeviceLoader
        except ImportError:
            pass

        sampler = dataloader.sampler
        dataloader = mp_device_loader_cls(dataloader, idist.device())
        dataloader.sampler = sampler

    return dataloader
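
# Illustrative sketch (not part of the library): when a custom sampler is passed, this helper wraps it with
# ``DistributedProxySampler`` so each rank sees its own shard of the sampled indices. The uniform ``weights``
# and the map-style ``train_dataset`` below are placeholder assumptions.
def _custom_sampler_example(train_dataset: Dataset) -> None:
    import torch
    from torch.utils.data import WeightedRandomSampler

    # hypothetical per-sample weights
    weights = torch.ones(len(train_dataset))
    sampler = WeightedRandomSampler(weights, num_samples=len(train_dataset))

    # in a distributed context, ``sampler`` ends up wrapped by DistributedProxySampler
    train_loader = auto_dataloader(train_dataset, batch_size=32, sampler=sampler, num_workers=2)
    for batch in train_loader:
        pass  # training step goes here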