Example #1
    def _ddp_init_helper(self):
        """
        Initialization helper function that does the following:

        (1) replicating the module from device[0] to the other devices
        (2) bucketing the parameters for reductions
        (3) resetting the bucketing states
        (4) registering the grad hooks
        (5) passing a handle of DDP to SyncBatchNorm Layer
        """
        if len(self.device_ids) > 1:
            # TODO: we don't need to replicate params in here. they're always going to
            # be broadcasted using larger blocks in broadcast_coalesced, so it might be
            # better to not pollute the caches with these small blocks
            self._module_copies = replicate(self.module,
                                            self.device_ids,
                                            detach=True)
            self._module_copies[0] = self.module

            for module_copy in self._module_copies[1:]:
                for param, copy_param in zip(self.module.parameters(),
                                             module_copy.parameters()):
                    copy_param.requires_grad = param.requires_grad

        else:
            self._module_copies = [self.module]

        self.modules_params_data = [[] for _ in range(len(self.device_ids))]
        self.modules_buffers_data = [[] for _ in range(len(self.device_ids))]

        for dev_idx, module in enumerate(self._module_copies):
            self.modules_params_data[dev_idx] = [
                p.data for p in module.parameters()
            ]
            self.modules_buffers_data[dev_idx] = [
                b.data for b in module.buffers()
            ]

        param_list = [
            list(filter(lambda p: p.requires_grad, module.parameters()))
            for module in self._module_copies
        ]

        # The bucket size limit is specified in the constructor.
        # Additionally, we allow for a single small bucket for parameters
        # that are defined first, such that their gradients don't spill into
        # a much larger bucket, adding unnecessary latency after gradient
        # computation finishes. Experiments showed 1MB is a reasonable value.
        bucket_indices = dist._compute_bucket_assignment_by_size(
            param_list[0], [1024 * 1024, self.bucket_bytes_cap])

        # Note: reverse list of buckets because we want to approximate the
        # order in which their gradients are produced, and assume they
        # are used in the forward pass in the order they are defined.
        self.reducer = dist.Reducer(param_list, list(reversed(bucket_indices)),
                                    self.process_group)

        # passing a handle to torch.nn.SyncBatchNorm layer
        self._passing_sync_batchnorm_handle(self._module_copies)
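For a standalone feel for the call above, the sketch below builds a few CPU tensors and requests the same two-tier layout: a small 1 MiB first bucket followed by bucket_bytes_cap-sized buckets. This is a minimal sketch, assuming a PyTorch build that ships torch.distributed; note that the return shape differs across versions (a bare list of bucket indices here, a (bucket_indices, per_bucket_size_limits) tuple in later releases, as Examples #4 and #5 show).

import torch
import torch.distributed as dist

# Minimal sketch, not the DDP implementation: ask the bucketing helper for a
# layout with a small 1 MiB first bucket followed by larger buckets.
bucket_bytes_cap = 25 * 1024 * 1024                    # stand-in for self.bucket_bytes_cap (DDP's bucket_cap_mb defaults to 25)
params = [torch.empty(1000 * 1000) for _ in range(4)]  # four ~4 MB float32 tensors

result = dist._compute_bucket_assignment_by_size(
    params, [1024 * 1024, bucket_bytes_cap])

# Depending on the PyTorch version, `result` is either the list of bucket
# index lists or a (bucket_indices, per_bucket_size_limits) tuple.
bucket_indices = result[0] if isinstance(result, tuple) else result
print(bucket_indices)  # e.g. [[0], [1, 2, 3]]: the 1 MiB limit isolates the first parameter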
Example #2
 def test_multi_limit_single_dtype(self):
     tensors = [
         torch.empty([10], dtype=torch.float),
         torch.empty([10], dtype=torch.float),
         torch.empty([10], dtype=torch.float),
         torch.empty([10], dtype=torch.float),
     ]
     result = dist._compute_bucket_assignment_by_size(tensors, [40, 80])
     self.assertEqual([[0], [1, 2], [3]], result)
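The expected grouping follows from plain byte arithmetic: each torch.empty([10], dtype=torch.float) occupies 10 * 4 = 40 bytes, so the first limit (40) admits one tensor, the second limit (80) admits the next two, and the last limit is reused for whatever remains. The sketch below reproduces that arithmetic in plain Python; it is only an approximation of the real C++ logic, which additionally groups tensors by dtype and device.

# Rough sketch of the size arithmetic behind the expected [[0], [1, 2], [3]].
def sketch_bucket_by_size(byte_sizes, limits):
    buckets, current, filled, limit_idx = [], [], 0, 0
    for i, nbytes in enumerate(byte_sizes):
        limit = limits[min(limit_idx, len(limits) - 1)]
        if current and filled + nbytes > limit:
            # Current bucket is full: flush it and move on to the next limit.
            buckets.append(current)
            current, filled = [], 0
            limit_idx += 1
        current.append(i)
        filled += nbytes
    if current:
        buckets.append(current)
    return buckets

# Four float32 tensors of 10 elements -> 40 bytes each.
print(sketch_bucket_by_size([40, 40, 40, 40], [40, 80]))  # [[0], [1, 2], [3]]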
Example #3
 def test_single_limit_multi_dtype(self):
     tensors = [
         torch.empty([50], dtype=torch.float),
         torch.empty([25], dtype=torch.double),
         torch.empty([50], dtype=torch.float),
         torch.empty([25], dtype=torch.double),
         torch.empty([50], dtype=torch.float),
         torch.empty([25], dtype=torch.double),
     ]
     result = dist._compute_bucket_assignment_by_size(tensors, [400])
     self.assertEqual([[0, 2], [1, 3], [4], [5]], result)
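Note that buckets never mix dtypes: the float32 tensors (50 * 4 = 200 bytes each) and float64 tensors (25 * 8 = 200 bytes each) are assigned to separate bucket chains, so with a 400-byte limit two same-dtype tensors fill a bucket exactly and the trailing float and double each end up alone.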
Example #4
 def test_single_limit_single_dtype(self):
     tensors = [
         torch.empty([100], dtype=torch.float),
         torch.empty([200], dtype=torch.float),
         torch.empty([100], dtype=torch.float),
         torch.empty([50], dtype=torch.float),
     ]
     result, per_bucket_size_limits = dist._compute_bucket_assignment_by_size(
         tensors, [400])
     self.assertTrue(
         all(size_lim == 400 for size_lim in per_bucket_size_limits))
     self.assertEqual([[0], [1], [2], [3]], result)
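This test exercises the newer two-value return: with a single 400-byte limit, that limit is reused for every bucket, and the byte sizes force one tensor per bucket. A quick sanity check of those sizes (plain PyTorch, nothing DDP-specific):

import torch

# Byte sizes behind the one-tensor-per-bucket layout: a 100-element float32
# tensor already fills the 400-byte cap, and the 200-element tensor exceeds it.
sizes = [t.numel() * t.element_size()
         for t in (torch.empty([100]), torch.empty([200]),
                   torch.empty([100]), torch.empty([50]))]
print(sizes)  # [400, 800, 400, 200]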
Example #5
 def test_multi_limit_multi_dtype(self):
     tensors = [
         torch.empty([50], dtype=torch.float),
         torch.empty([25], dtype=torch.double),
         torch.empty([50], dtype=torch.float),
         torch.empty([25], dtype=torch.double),
         torch.empty([50], dtype=torch.float),
         torch.empty([25], dtype=torch.double),
     ]
     result, per_bucket_size_limits = dist._compute_bucket_assignment_by_size(
         tensors, [200, 400])
     self.assertEqual([[0], [1], [2, 4], [3, 5]], result)
     self.assertEqual(per_bucket_size_limits, [200, 200, 400, 400])
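With two limits and two dtypes, each dtype chain gets its own small 200-byte first bucket (one tensor) before switching to the 400-byte cap (two tensors), which is why per_bucket_size_limits reports the 200-byte limit twice before the 400-byte entries.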
Example #6
    def _ddp_init_helper(self):
        """
        Initialization helper function that does the following:

        (1) replicating the module from device[0] to the other devices
        (2) bucketing the parameters for reductions
        (3) resetting the bucketing states
        (4) registering the grad hooks
        (5) passing a handle of DDP to SyncBatchNorm Layer
        """
        def parameters(m, recurse=True):
            def model_parameters(m):
                ps = m._former_parameters.values() \
                    if hasattr(m, "_former_parameters") \
                    else m.parameters(recurse=False)
                for p in ps:
                    yield p

            for m in m.modules() if recurse else [m]:
                for p in model_parameters(m):
                    yield p

        if self.device_ids and len(self.device_ids) > 1:

            import warnings
            warnings.warn(
                "Single-Process Multi-GPU is not the recommended mode for "
                "DDP. In this mode, each DDP instance operates on multiple "
                "devices and creates multiple module replicas within one "
                "process. The overhead of scatter/gather and GIL contention "
                "in every forward pass can slow down training. "
                "Please consider using one DDP instance per device or per "
                "module replica by explicitly setting device_ids or "
                "CUDA_VISIBLE_DEVICES. ")

            # only create replicas for single-device CUDA modules
            #
            # TODO: we don't need to replicate params in here. they're always going to
            # be broadcasted using larger blocks in broadcast_coalesced, so it might be
            # better to not pollute the caches with these small blocks
            self._module_copies = replicate(self.module,
                                            self.device_ids,
                                            detach=True)
            self._module_copies[0] = self.module

            for module_copy in self._module_copies[1:]:
                for param, copy_param in zip(self.module.parameters(),
                                             parameters(module_copy)):
                    # Reducer requires param copies have the same strides across replicas.
                    # Fixes up copy_param strides in case replicate didn't match param strides.
                    if param.layout is torch.strided and param.stride(
                    ) != copy_param.stride():
                        with torch.no_grad():
                            copy_param.set_(copy_param.clone().as_strided(
                                param.size(),
                                param.stride()).copy_(copy_param))
                    copy_param.requires_grad = param.requires_grad

        else:
            self._module_copies = [self.module]

        self.modules_params = [
            list(parameters(m)) for m in self._module_copies
        ]
        self.modules_buffers = [list(m.buffers()) for m in self._module_copies]

        # Build tuple of (module, parameter) for all parameters that require grads.
        modules_and_parameters = [[
            (module, parameter) for module in replica.modules()
            for parameter in filter(lambda parameter: parameter.requires_grad,
                                    parameters(module, recurse=False))
        ] for replica in self._module_copies]

        # Build list of parameters.
        parameters = [
            list(parameter for _, parameter in replica)
            for replica in modules_and_parameters
        ]

        # Checks if a module will produce a sparse gradient.
        def produces_sparse_gradient(module):
            if isinstance(module, torch.nn.Embedding):
                return module.sparse
            if isinstance(module, torch.nn.EmbeddingBag):
                return module.sparse
            return False

        # Build list of booleans indicating whether or not to expect sparse
        # gradients for the corresponding parameters.
        expect_sparse_gradient = [
            list(produces_sparse_gradient(module) for module, _ in replica)
            for replica in modules_and_parameters
        ]

        # The bucket size limit is specified in the constructor.
        # Additionally, we allow for a single small bucket for parameters
        # that are defined first, such that their gradients don't spill into
        # a much larger bucket, adding unnecessary latency after gradient
        # computation finishes. Experiments showed 1MB is a reasonable value.
        bucket_indices = dist._compute_bucket_assignment_by_size(
            parameters[0],
            [dist._DEFAULT_FIRST_BUCKET_BYTES, self.bucket_bytes_cap],
            expect_sparse_gradient[0])

        # Note: reverse list of buckets because we want to approximate the
        # order in which their gradients are produced, and assume they
        # are used in the forward pass in the order they are defined.
        self.reducer = dist.Reducer(parameters, list(reversed(bucket_indices)),
                                    self.process_group, expect_sparse_gradient,
                                    self.bucket_bytes_cap,
                                    self.find_unused_parameters)

        # passing a handle to torch.nn.SyncBatchNorm layer
        self._passing_sync_batchnorm_handle(self._module_copies)
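The expect_sparse_gradient flags above come straight from the sparse attribute of torch.nn.Embedding / torch.nn.EmbeddingBag. A minimal standalone sketch (a hypothetical toy model, not DDP itself) of how those per-parameter flags line up:

import torch

# Hypothetical toy model: one sparse embedding bag plus one dense linear layer.
model = torch.nn.Sequential(
    torch.nn.EmbeddingBag(100, 16, sparse=True),
    torch.nn.Linear(16, 4),
)

def produces_sparse_gradient(module):
    # Same check as in _ddp_init_helper above.
    if isinstance(module, (torch.nn.Embedding, torch.nn.EmbeddingBag)):
        return module.sparse
    return False

# Mirror the (module, parameter) pairing used by the helper.
modules_and_parameters = [
    (module, parameter)
    for module in model.modules()
    for parameter in module.parameters(recurse=False)
    if parameter.requires_grad
]
expect_sparse_gradient = [
    produces_sparse_gradient(module) for module, _ in modules_and_parameters
]
print(expect_sparse_gradient)  # [True, False, False]: embedding weight, linear weight, linear bias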
Example #7
    def _ddp_init_helper(self):
        """
        Initialization helper function that does the following:

        (1) replicating the module from device[0] to the other devices
        (2) bucketing the parameters for reductions
        (3) resetting the bucketing states
        (4) registering the grad hooks
        (5) passing a handle of DDP to SyncBatchNorm Layer
        """
        if self.device_ids and len(self.device_ids) > 1:
            # only create replicas for single-device CUDA modules
            #
            # TODO: we don't need to replicate params in here. they're always going to
            # be broadcasted using larger blocks in broadcast_coalesced, so it might be
            # better to not pollute the caches with these small blocks
            self._module_copies = replicate(self.module,
                                            self.device_ids,
                                            detach=True)
            self._module_copies[0] = self.module

            for module_copy in self._module_copies[1:]:
                for param, copy_param in zip(self.module.parameters(),
                                             module_copy.parameters()):
                    copy_param.requires_grad = param.requires_grad

        else:
            self._module_copies = [self.module]

        self.modules_params = [
            list(m.parameters()) for m in self._module_copies
        ]
        self.modules_buffers = [list(m.buffers()) for m in self._module_copies]

        # Build tuple of (module, parameter) for all parameters that require grads.
        modules_and_parameters = [[
            (module, parameter) for module in replica.modules()
            for parameter in filter(lambda parameter: parameter.requires_grad,
                                    module.parameters(recurse=False))
        ] for replica in self._module_copies]

        # Build list of parameters.
        parameters = [
            list(parameter for _, parameter in replica)
            for replica in modules_and_parameters
        ]

        # Checks if a module will produce a sparse gradient.
        def produces_sparse_gradient(module):
            if isinstance(module, torch.nn.Embedding):
                return module.sparse
            if isinstance(module, torch.nn.EmbeddingBag):
                return module.sparse
            return False

        # Build list of booleans indicating whether or not to expect sparse
        # gradients for the corresponding parameters.
        expect_sparse_gradient = [
            list(produces_sparse_gradient(module) for module, _ in replica)
            for replica in modules_and_parameters
        ]

        # The bucket size limit is specified in the constructor.
        # Additionally, we allow for a single small bucket for parameters
        # that are defined first, such that their gradients don't spill into
        # a much larger bucket, adding unnecessary latency after gradient
        # computation finishes. Experiments showed 1MB is a reasonable value.
        bucket_indices = dist._compute_bucket_assignment_by_size(
            parameters[0], [1024 * 1024, self.bucket_bytes_cap],
            expect_sparse_gradient[0])

        # Note: reverse list of buckets because we want to approximate the
        # order in which their gradients are produced, and assume they
        # are used in the forward pass in the order they are defined.
        self.reducer = dist.Reducer(parameters, list(reversed(bucket_indices)),
                                    self.process_group, expect_sparse_gradient)

        # passing a handle to torch.nn.SyncBatchNorm layer
        self._passing_sync_batchnorm_handle(self._module_copies)
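The bucket reversal mentioned in the comment is just a Python-level reorder before handing the layout to the Reducer; a tiny illustration with hypothetical indices:

# Hypothetical layout: parameters 0-3 in forward (definition) order.
bucket_indices = [[0, 1], [2, 3]]

# Backward produces gradients roughly in reverse definition order, so the
# Reducer receives the buckets reversed to finish the last-defined bucket first.
print(list(reversed(bucket_indices)))  # [[2, 3], [0, 1]]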