def _ddp_init_helper(self):
    self.modules_params_data = [[p.data for p in self.params], ]

    param_buckets = []

    # Split the parameters into buckets and by types as well
    # We only need to bucket and reduce parameters that require grad and
    # this is also true for backward since only the backward hooks for
    # parameters that require grad will be registered with gradient
    # reduction functions
    params_to_bucket = [[], ]
    for p in self.params:
        if p.requires_grad:
            params_to_bucket[0].append(p)

    param_buckets = [dist._dist_bucket_tensors(dev_params_to_bucket,
                                               int(self.bucket_bytes_cap),
                                               fine_grained=False)
                     for dev_params_to_bucket in params_to_bucket]

    self.bucket_sizes = []
    self.bucket_map = {}
    for bucket_idx, param_buckets_tuple in enumerate(zip(*param_buckets)):
        self.bucket_sizes.append(0)
        # Now, we transpose again, so we iterate over bucket_elems, but getting
        # tuples of params from each device.
        for param_tuple in zip(*param_buckets_tuple):
            if not param_tuple[0].requires_grad:
                continue
            for p in param_tuple:
                self.bucket_map[p] = (bucket_idx, self.bucket_sizes[bucket_idx])
            self.bucket_sizes[bucket_idx] += 1

    self.buckets = [[[None for _ in range(self.bucket_sizes[i])]
                     for _ in range(len(self.device_ids))]
                    for i in range(len(self.bucket_sizes))]
    self.buckets_ready_size = [[0 for _ in range(len(self.device_ids))]
                               for i in range(len(self.bucket_sizes))]

    # coalesced bucket for only device 0
    self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))]

    # We will always reduce the buckets in reverse order,
    # that is, following the order n - 1, n - 2, ..., 0
    self.next_bucket = len(self.bucket_sizes) - 1

    # When all buckets are reduced, this will be set to True. This flag is
    # useful for sanity checks to ensure that each iteration's backward has
    # always reduced all buckets
    self.all_buckets_reduced = False
    self.check_previous_reduction = False
    self.ready_buckets_not_reduced = set()
    self.reduction_works = [None for _ in range(len(self.bucket_sizes))]
    self.devs_ready = [0 for _ in range(len(self.bucket_sizes))]

    # default stream tracking to launch nccl reduce kernels
    self.default_streams = []
    for dev_id in self.device_ids:
        with torch.cuda.device(dev_id):
            self.default_streams.append(torch.cuda.current_stream())
def _ddp_init_helper(self): """ Initialization helper function that does the following: (1) replicating the module from device[0] to the other devices (2) bucketing the parameters for reductions (3) resetting the bucketing states (4) registering the grad hooks """ if len(self.device_ids) > 1: # TODO: we don't need to replicate params in here. they're always going to # be broadcasted using larger blocks in broadcast_coalesced, so it might be # better to not pollute the caches with these small blocks self._module_copies = replicate(self.module, self.device_ids, detach=True) self._module_copies[0] = self.module for module_copy in self._module_copies[1:]: for param, copy_param in zip(self.module.parameters(), module_copy.parameters()): copy_param.requires_grad = param.requires_grad else: self._module_copies = [self.module] self.modules_params_data = [[] for _ in range(len(self.device_ids))] self.modules_buffers_data = [[] for _ in range(len(self.device_ids))] for dev_idx, module in enumerate(self._module_copies): self.modules_params_data[dev_idx] = [ p.data for p in module.parameters() ] self.modules_buffers_data[dev_idx] = [ b.data for b in module.buffers() ] # This is a triply-nested list where the "dimensions" are: devices, buckets, bucket_elems param_buckets = [] # Split the parameters into buckets and by types as well # We only need to bucket and reduce parameters that require grad and # this is also true for backward since only the backward hooks for # parameters that require grad will be registered with gradient # reduction functions params_to_bucket = [[] for _ in self._module_copies] for dev_idx, m in enumerate(self._module_copies): for p in m.parameters(): if p.requires_grad: params_to_bucket[dev_idx].append(p) param_buckets = [ dist._dist_bucket_tensors(dev_params_to_bucket, int(self.bucket_bytes_cap), fine_grained=False) for dev_params_to_bucket in params_to_bucket ] self.bucket_sizes = [] self.bucket_map = {} # We transpose param_buckets, so the loop is over buckets. # param_buckets_tuple is a doubly-nested list with "dims": devices, bucket_elems for bucket_idx, param_buckets_tuple in enumerate(zip(*param_buckets)): self.bucket_sizes.append(0) # Now, we transpose again, so we iterate over bucket_elems, but getting tuples # of params from each device. for param_tuple in zip(*param_buckets_tuple): if not param_tuple[0].requires_grad: continue for p in param_tuple: self.bucket_map[p] = (bucket_idx, self.bucket_sizes[bucket_idx]) self.bucket_sizes[bucket_idx] += 1 self.buckets = [[[None for _ in range(self.bucket_sizes[i])] for _ in range(len(self.device_ids))] for i in range(len(self.bucket_sizes))] # The number of params ready in each bucket self.buckets_ready_size = [[0 for _ in range(len(self.device_ids))] for i in range(len(self.bucket_sizes))] # coalesced bucket for only device 0 self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))] # We will always reduce the bucket following the reverse order # that is, alway reduces following the order of: n - 1, n - 2, ..., 0 self.next_bucket = len(self.bucket_sizes) - 1 # When all buckets are reduced, this will be set to True. This flag is # useful for sanity checks to ensure that each iteration's backward has # always reduced all buckets self.all_buckets_reduced = False self.check_previous_reduction = False self.ready_buckets_not_reduced = set() self.reduction_works = [None for _ in range(len(self.bucket_sizes))] self.devs_ready = [0 for _ in range(len(self.bucket_sizes))] self._register_grad_hooks()
def __init__(self, module, device_ids=None, output_device=None, dim=0,
             broadcast_buffers=True, process_group=None, bucket_cap_mb=25):
    super(DistributedDataParallel, self).__init__()

    # Use all devices by default
    if device_ids is None:
        device_ids = list(range(torch.cuda.device_count()))

    if output_device is None:
        output_device = device_ids[0]

    if process_group is None:
        self.process_group = dist.get_default_group()
    else:
        self.process_group = process_group

    self.dim = dim
    self.module = module
    self.device_ids = list(map(lambda x: _get_device_index(x, True), device_ids))
    self.output_device = _get_device_index(output_device, True)
    self.broadcast_buffers = broadcast_buffers

    MB = 1024 * 1024

    # used for intra-node param sync and inter-node sync as well
    self.broadcast_bucket_size = 250 * MB

    # Sync params and buffers
    module_states = list(self.module.state_dict().values())
    if len(module_states) > 0:
        self._dist_broadcast_coalesced(module_states, self.broadcast_bucket_size)

    if len(device_ids) > 1:
        # TODO: we don't need to replicate params in here. they're always going to
        # be broadcasted using larger blocks in broadcast_coalesced, so it might be
        # better to not pollute the caches with these small blocks
        self._module_copies = replicate(self.module, self.device_ids, detach=True)
        self._module_copies[0] = self.module

        for module_copy in self._module_copies[1:]:
            for param, copy_param in zip(self.module.parameters(), module_copy.parameters()):
                copy_param.requires_grad = param.requires_grad
    else:
        self._module_copies = [self.module]

    self.modules_params_data = [[] for _ in range(len(self.device_ids))]
    self.modules_buffers_data = [[] for _ in range(len(self.device_ids))]

    for dev_idx, module in enumerate(self._module_copies):
        self.modules_params_data[dev_idx] = [p.data for p in module.parameters()]
        self.modules_buffers_data[dev_idx] = [b.data for b in module.buffers()]

    bucket_bytes_cap = bucket_cap_mb * MB

    # This is a triply-nested list where the "dimensions" are: devices, buckets, bucket_elems
    param_buckets = []
    # Split the parameters into buckets and by types as well
    param_buckets = [
        dist._dist_bucket_tensors(list(m.parameters()),
                                  int(bucket_bytes_cap),
                                  fine_grained=False)
        for m in self._module_copies
    ]

    self.bucket_sizes = []
    self.bucket_map = {}

    # We transpose param_buckets, so the loop is over buckets.
    # param_buckets_tuple is a doubly-nested list with "dims": devices, bucket_elems
    for bucket_idx, param_buckets_tuple in enumerate(zip(*param_buckets)):
        self.bucket_sizes.append(0)
        # Now, we transpose again, so we iterate over bucket_elems, but getting
        # tuples of params from each device.
        for param_tuple in zip(*param_buckets_tuple):
            if not param_tuple[0].requires_grad:
                continue
            for p in param_tuple:
                self.bucket_map[p] = (bucket_idx, self.bucket_sizes[bucket_idx])
            self.bucket_sizes[bucket_idx] += 1

    self.buckets = [[[None for _ in range(self.bucket_sizes[i])]
                     for _ in range(len(self.device_ids))]
                    for i in range(len(self.bucket_sizes))]
    # The number of params ready in each bucket
    self.buckets_ready_size = [[0 for _ in range(len(self.device_ids))]
                               for i in range(len(self.bucket_sizes))]

    # coalesced bucket for only device 0
    self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))]

    # We will always reduce the buckets in reverse order,
    # that is, following the order n - 1, n - 2, ..., 0
    self.next_bucket = len(self.bucket_sizes) - 1

    self.ready_buckets_not_reduced = set()
    self.reduction_works = [None for _ in range(len(self.bucket_sizes))]
    self.devs_ready = [0 for _ in range(len(self.bucket_sizes))]

    self._register_grad_hooks()
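# For context, a minimal usage sketch of the constructor above. It assumes a
# launcher has set the usual RANK / WORLD_SIZE / MASTER_ADDR / MASTER_PORT
# environment variables; the backend, model, and device choices are placeholders.
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel

dist.init_process_group(backend="nccl")

device = torch.device("cuda", 0)  # placeholder: one visible GPU per process
model = nn.Linear(128, 10).to(device)

# device_ids / output_device default as in __init__ above; bucket_cap_mb=25
# is what feeds bucket_bytes_cap for the gradient reduction buckets.
ddp_model = DistributedDataParallel(model, device_ids=[0], output_device=0)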
def _set_buckets(self):
    self.num_elements = [{
        layer: ceil(layer.numel() * self.randk)
        for layer in m.parameters()
    } for m in self._module_copies]

    # This is a triply-nested list where the "dimensions" are: devices, buckets, bucket_elems
    param_buckets = []

    # Split the parameters into buckets and by types as well
    # We only need to bucket and reduce parameters that require grad and
    # this is also true for backward since only the backward hooks for
    # parameters that require grad will be registered with gradient
    # reduction functions
    params_to_bucket = [[] for _ in self._module_copies]
    layer_map = {}
    for dev_idx, m in enumerate(self._module_copies):
        for layer in m.parameters():
            if layer.requires_grad:
                # only the correct number of elements is needed for calculating bucket sizes
                sparsified = layer.view(-1)[0:self.num_elements[dev_idx][layer]]
                layer_map[sparsified] = layer
                params_to_bucket[dev_idx].append(sparsified)

    param_buckets = [
        dist._dist_bucket_tensors(dev_params_to_bucket,
                                  int(self.bucket_bytes_cap),
                                  fine_grained=False)
        for dev_params_to_bucket in params_to_bucket
    ]

    self.bucket_sizes = []
    self.bucket_map = {}

    # We transpose param_buckets, so the loop is over buckets.
    # param_buckets_tuple is a doubly-nested list with "dims": devices, bucket_elems
    for bucket_idx, param_buckets_tuple in enumerate(zip(*param_buckets)):
        self.bucket_sizes.append(0)
        # Now, we transpose again, so we iterate over bucket_elems, but getting
        # tuples of params from each device.
        for param_tuple in zip(*param_buckets_tuple):
            if not param_tuple[0].requires_grad:
                continue
            for sparsified in param_tuple:
                self.bucket_map[layer_map[sparsified]] = (
                    bucket_idx, self.bucket_sizes[bucket_idx])
            self.bucket_sizes[bucket_idx] += 1

    self.buckets = [[[None for _ in range(self.bucket_sizes[i])]
                     for _ in range(len(self.device_ids))]
                    for i in range(len(self.bucket_sizes))]
    # The number of params ready in each bucket
    self.buckets_ready_size = [[0 for _ in range(len(self.device_ids))]
                               for i in range(len(self.bucket_sizes))]

    # coalesced bucket for only device 0
    self.buckets_coalesced = [[] for _ in range(len(self.bucket_sizes))]

    # We will always reduce the buckets in reverse order,
    # that is, following the order n - 1, n - 2, ..., 0
    self.next_bucket = len(self.bucket_sizes) - 1

    # When all buckets are reduced, this will be set to True. This flag is
    # useful for sanity checks to ensure that each iteration's backward has
    # always reduced all buckets
    self.all_buckets_reduced = False
    self.ready_buckets_not_reduced = set()
    self.reduction_works = [None for _ in range(len(self.bucket_sizes))]
    self.devs_ready = [0 for _ in range(len(self.bucket_sizes))]
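# A quick illustration of the element counts _set_buckets works with: for each
# parameter, a fraction randk of its elements is kept, and a flat prefix view of
# that length stands in for the sparsified gradient when sizing the buckets.
# The model shapes and randk value below are arbitrary examples.
from math import ceil

import torch.nn as nn

randk = 0.01  # keep roughly 1% of each layer's elements (example value)
model = nn.Sequential(nn.Linear(1024, 256), nn.Linear(256, 10))

for layer in model.parameters():
    k = ceil(layer.numel() * randk)
    # Only the element count matters for bucket sizing, so slice a length-k view.
    sparsified = layer.view(-1)[0:k]
    print(tuple(layer.shape), "->", k, "elements")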