Example #1
    def train(self, mode=True):
        # Clear the NCCL communicator and CUDA event cache of the default group ID.
        # These caches will be recreated by a later call. This is currently a
        # workaround for a potential NCCL deadlock.
        if dist._backend == dist.dist_backend.NCCL:
            dist._clear_group_cache()
        super(DistributedDataParallel, self).train(mode)
        for module in self._module_copies[1:]:
            module.train(mode)
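For context, here is a minimal usage sketch (not taken from the original source) showing when this overridden train() runs. It assumes the default process group has already been initialized with the NCCL backend and that a CUDA device is set for this rank:

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel

# Hypothetical setup: assumes dist.init_process_group(backend='nccl', ...) has
# already been called and torch.cuda.set_device(...) points at this rank's GPU.
model = torch.nn.Linear(10, 10).cuda()
ddp_model = DistributedDataParallel(model)

ddp_model.train()   # invokes the overridden train() above; clears the NCCL group cache
ddp_model.eval()    # nn.Module.eval() calls train(False), so it hits the same code path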
Example #2
    def __init__(self, module, device_ids=None, output_device=None, dim=0):
        super(DistributedDataParallel, self).__init__()
        if device_ids is None:
            device_ids = list(range(torch.cuda.device_count()))
        if output_device is None:
            output_device = device_ids[0]
        self.dim = dim
        self.module = module
        self.device_ids = device_ids
        self.output_device = output_device

        # Sync params and buffers
        for p in self.module.state_dict().values():
            dist.broadcast(p, 0)

        # Clear the NCCL communicator and CUDA event cache of the default group ID.
        # These caches will be recreated by a later call. This is currently a
        # workaround for a potential NCCL deadlock.
        if dist._backend == dist.dist_backend.NCCL:
            dist._clear_group_cache()

        if len(device_ids) > 1:
            # TODO: we don't need to replicate params here. They're always going to
            # be broadcast using larger blocks in broadcast_coalesced, so it might be
            # better not to pollute the caches with these small blocks.
            self._module_copies = replicate(self.module, self.device_ids)
            self._module_copies[0] = self.module
            for module_copy in self._module_copies[1:]:
                for param, copy_param in zip(self.module.parameters(),
                                             module_copy.parameters()):
                    copy_param.detach_()
                    copy_param.requires_grad = param.requires_grad
        else:
            self._module_copies = [self.module]

        # Split parameters into buckets that will coalesce reductions
        # TODO: different types need different buckets
        t = None
        for p in self.module.parameters():
            tp = type(p.data)
            if t is not None and t is not tp:
                raise ValueError(
                    "DistributedDataParallel requires all parameters' data to be of the same type"
                )
            t = tp

        self.bucket_sizes = []
        self.bucket_map = {}
        MB = 1024 * 1024
        self.broadcast_bucket_size = 10 * MB  # used for param sync before forward
        # Currently the NCCL backend only supports a single reduction thread/bucket
        if dist._backend == dist.dist_backend.NCCL:
            bucket_bytes_cap = float('inf')
        else:
            bucket_bytes_cap = 1 * MB
        bucket_bytes = bucket_bytes_cap  # to init the first bucket immediately
        for param_tuple in zip(
                *map(lambda m: m.parameters(), self._module_copies)):
            if param_tuple[0].requires_grad:
                if bucket_bytes >= bucket_bytes_cap:
                    self.bucket_sizes.append(0)
                    bucket_bytes = 0
                for p in param_tuple:
                    self.bucket_map[p] = len(self.bucket_sizes) - 1
                # `p` here is the last replica's copy of the parameter; every
                # replica's copy has the same size, so any of them works.
                bucket_bytes += p.numel() * p.element_size()
                self.bucket_sizes[-1] += 1

        self.buckets = [[[] for _ in range(len(self.device_ids))]
                        for _ in range(len(self.bucket_sizes))]
        self.bucket_events = [[None] * len(self.device_ids)
                              for _ in range(len(self.bucket_sizes))]
        self.reduced = [False] * len(self.bucket_sizes)

        self._register_grad_hooks()

        self.dispatch_lock = threading.Lock()
        self._start_reduction_threads()
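The bucketing loop in the __init__ above is easier to follow in isolation. Below is a small, self-contained sketch (not part of the original class; assign_buckets and its signature are made up) of the same rule: parameters are appended to the current bucket until its byte size reaches bucket_bytes_cap, at which point a new bucket is started:

import torch

def assign_buckets(params, bucket_bytes_cap=1024 * 1024):
    # Mirrors the bucketing loop above for a single module replica.
    bucket_sizes, bucket_map = [], {}
    bucket_bytes = bucket_bytes_cap  # forces a new bucket for the first parameter
    for p in params:
        if not p.requires_grad:
            continue
        if bucket_bytes >= bucket_bytes_cap:
            bucket_sizes.append(0)
            bucket_bytes = 0
        bucket_map[p] = len(bucket_sizes) - 1
        bucket_bytes += p.numel() * p.element_size()
        bucket_sizes[-1] += 1
    return bucket_sizes, bucket_map

# Four float32 parameters of exactly 1 MB each -> one parameter per 1 MB bucket.
params = [torch.nn.Parameter(torch.zeros(512, 512)) for _ in range(4)]
print(assign_buckets(params)[0])  # [1, 1, 1, 1]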
Example #3
    def __init__(self, module):
        super(DistModule, self).__init__()
        self.module = module
        # Sync parameters across processes, then clear cached NCCL communicators
        broadcast_params(self.module)
        dist._clear_group_cache()
Example #4
    def train(self, mode=True):
        dist._clear_group_cache()
        super(DistModule, self).train(mode)
        self.module.train(mode)
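broadcast_params is a helper defined elsewhere in that project and is not shown here. A minimal sketch of what such a helper typically does, modeled on the state_dict broadcast loop in Example #2 (the actual implementation may differ):

import torch.distributed as dist

def broadcast_params(module):
    # Hypothetical version: push rank 0's parameters and buffers to every
    # other process so all replicas start from identical weights.
    for p in module.state_dict().values():
        dist.broadcast(p, 0)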