def flat_dist_call(tensors, call, extra_args=None):
    flat_dist_call.warn_on_half = True
    buckets = {}
    for tensor in tensors:
        tp = tensor.type()
        if tp not in buckets:
            buckets[tp] = []
        buckets[tp].append(tensor)

    if flat_dist_call.warn_on_half:
        # buckets are keyed by type() strings, so compare against the string name
        if 'torch.cuda.HalfTensor' in buckets:
            print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                  " It is recommended to use the NCCL backend in this case.")
            flat_dist_call.warn_on_half = False

    for tp in buckets:
        bucket = buckets[tp]
        coalesced = _flatten_dense_tensors(bucket)
        if extra_args is not None:
            call(coalesced, *extra_args)
        else:
            call(coalesced)
        coalesced /= dist.get_world_size()
        for buf, synced in zip(bucket, _unflatten_dense_tensors(coalesced, bucket)):
            buf.copy_(synced)
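# --- Illustration (not from the original sources): a minimal CPU-only sketch of
# --- the coalesce/uncoalesce round trip that every snippet in this section uses.
# --- torch._utils is a private API, so treat this as illustrative only.
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

bucket = [torch.ones(3), torch.full((2, 2), 4.0)]  # stands in for a gradient bucket

coalesced = _flatten_dense_tensors(bucket)  # one contiguous 1-D buffer
coalesced /= 2.0  # stands in for the post-all_reduce world-size division
for buf, synced in zip(bucket, _unflatten_dense_tensors(coalesced, bucket)):
    buf.copy_(synced)

print(bucket[0])  # tensor([0.5000, 0.5000, 0.5000])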
def _process_batch():
    dev_grad_batch, dev_events, job_event = queue.get()
    dev_coalesced = []
    # Coalesce the tensors on all devices and start a local reduction
    for dev_id, grad_batch, event, stream in zip(device_ids, dev_grad_batch,
                                                 dev_events, reduction_streams):
        with torch.cuda.device(dev_id), torch.cuda.stream(stream):
            stream.wait_event(event)
            coalesced = _flatten_dense_tensors(grad_batch)
            dev_coalesced.append(coalesced)
    # Wait for all copies to complete before starting the NCCL kernel
    for stream in reduction_streams:
        stream.synchronize()
    nccl.reduce(dev_coalesced, root=0, streams=nccl_streams)

    # From now on we're only going to work on the first device (from device_ids)
    grad_batch = dev_grad_batch[0]
    coalesced = dev_coalesced[0]
    reduce_stream = reduction_streams[0]
    with torch.cuda.stream(reduce_stream):
        reduce_stream.wait_stream(nccl_streams[0])
        coalesced /= dist.get_world_size()
        dist.all_reduce(coalesced, group=group_id)
        for grad, reduced in zip(grad_batch,
                                 _unflatten_dense_tensors(coalesced, grad_batch)):
            grad.copy_(reduced)
    job_event.set()
def reduction_fn_nccl():
    # This function only needs to be called once
    if not self.need_reduction:
        return

    self.need_reduction = False
    all_grads = [[] for _ in range(len(self._module_copies))]
    all_grads_buckets_iters = []

    # Bucketing all the gradients
    for dev_idx, module in enumerate(self._module_copies):
        for param in module.parameters():
            if not param.requires_grad or param.grad is None:
                continue
            if param.grad.requires_grad:
                raise RuntimeError("DistributedDataParallel only works "
                                   "with gradients that don't require "
                                   "grad")
            # Adding the gradients for reduction
            all_grads[dev_idx].append(param.grad.data)

        # Now bucketing the parameters
        dev_grads_buckets = _take_tensors(all_grads[dev_idx],
                                          self.nccl_reduce_bucket_size)
        all_grads_buckets_iters.append(dev_grads_buckets)

    # Now reduce each bucket one after another
    for grads_batch in zip(*all_grads_buckets_iters):
        grads_batch_coalesced = []
        # Coalesce each bucket
        for dev_idx, dev_grads_batch in enumerate(grads_batch):
            dev_id = self.device_ids[dev_idx]
            with torch.cuda.device(dev_id):
                dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch)
                grads_batch_coalesced.append(dev_grads_batch_coalesced)

        # We will only use device 0's results, but this single op should be
        # faster than doing the following two operation sequentially:
        # (1) intra-node reduce to lead GPU, followed by
        # (2) inter-node allreduce for all the first lead GPUs in all nodes
        dist.all_reduce_multigpu(grads_batch_coalesced,
                                 group=self.nccl_reduction_group_id)

        # Now only work on the first device of self.device_ids, uncoalesce
        # the gradients for each bucket
        grads_batch_coalesced[0] /= dist.get_world_size()
        grads_batch_reduced = _unflatten_dense_tensors(grads_batch_coalesced[0],
                                                       grads_batch[0])
        for grad, reduced in zip(grads_batch[0], grads_batch_reduced):
            grad.copy_(reduced)

    # clear the gradients and save memory for replicas
    for module in self._module_copies[1:]:
        for param in module.parameters():
            if param.requires_grad:
                param.grad = None
                param.data.set_()
def all_gather_multigpu(output_tensor_lists, input_tensor_list, group=group.WORLD):
    """Gathers tensors from the whole group in a list.
    Each tensor in ``input_tensor_list`` should reside on a separate GPU.

    Only the NCCL backend is currently supported; the tensors must be GPU tensors.

    Arguments:
        output_tensor_lists (List[List[Tensor]]): Output lists. Each list should
            contain correctly-sized tensors on each GPU to be used for the output
            of the collective, e.g. output_tensor_lists[i] contains the
            all_gather result that resides on the GPU of input_tensor_list[i].
            Note that each element of output_tensor_lists has length
            world_size * len(input_tensor_list), since the function all-gathers
            the result from every single GPU in the group. To interpret each
            element of output_tensor_lists[i], note that input_tensor_list[j]
            of rank k will appear in output_tensor_lists[i][k * world_size + j].
            Also note that len(output_tensor_lists), and the size of each
            element in output_tensor_lists (each element is a list, therefore
            len(output_tensor_lists[i])), need to be the same for all the
            distributed processes calling this function.
        input_tensor_list (List[Tensor]): List of tensors (on different GPUs) to
            be broadcast from the current process. Note that
            len(input_tensor_list) needs to be the same for all the distributed
            processes calling this function.
        group (optional): Group of the collective.
    """
    assert torch.distributed._initialized == _INITIALIZED_PG, \
        "collective only supported in process-group mode"

    flatten_tensor_list = []
    for output_tensor_list in output_tensor_lists:
        flatten_tensor_list.append(_flatten_dense_tensors(output_tensor_list))

    ret = torch._C._dist_all_gather_multigpu(flatten_tensor_list,
                                             input_tensor_list,
                                             group)

    for output_tensor_list, flatten_tensor in zip(output_tensor_lists,
                                                  flatten_tensor_list):
        for tensor, value in zip(output_tensor_list,
                                 _unflatten_dense_tensors(flatten_tensor,
                                                          output_tensor_list)):
            tensor.copy_(value)

    return ret
def _dist_broadcast_coalesced(self, tensors, buffer_size):
    """
    Broadcast a sequence of tensors to the default group from rank 0.
    Small tensors are first coalesced into a buffer to reduce the number
    of broadcasts.

    tensors (sequence): tensors to broadcast. Each tensor needs to be on the
                        same GPU.
    buffer_size (int): maximum size of the buffer for coalescing
    """
    for tensors in _take_tensors(tensors, buffer_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.broadcast(flat_tensors, 0)
        for tensor, synced in zip(tensors,
                                  _unflatten_dense_tensors(flat_tensors, tensors)):
            tensor.copy_(synced)
def allreduce_params():
    if self.needs_reduction:
        self.needs_reduction = False
        buckets = defaultdict(list)
        for param in self.module.parameters():
            if param.requires_grad and param.grad is not None:
                tp = type(param.data)
                buckets[tp].append(param)

        for bucket in buckets.values():
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced)
            coalesced /= dist.get_world_size()
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)
def all_gather_multigpu(output_tensor_lists, input_tensor_list, group=group.WORLD):
    """Gathers tensors from the whole group in a list.
    Each tensor in ``input_tensor_list`` should reside on a separate GPU.

    Only the NCCL backend is currently supported; the tensors must be GPU tensors.

    Arguments:
        output_tensor_lists (List[List[Tensor]]): Output lists. Each list should
            contain correctly-sized tensors on each GPU to be used for the
            output of the collective.
        input_tensor_list (List[Tensor]): List of tensors (on different GPUs) to
            be broadcast from the current process.
        group (optional): Group of the collective.
    """
    assert torch.distributed._initialized == _INITIALIZED_PG, \
        "collective only supported in process-group mode"

    warnings.warn("""
    ================================================================================
                                        WARNING
    ================================================================================
    all_gather_multigpu is still experimental. The API will change without
    notice and we can't guarantee full correctness and expected performance yet.
    We'll announce it once it's ready.
    """)

    flatten_tensor_list = []
    for output_tensor_list in output_tensor_lists:
        flatten_tensor_list.append(_flatten_dense_tensors(output_tensor_list))

    ret = torch._C._dist_all_gather_multigpu(flatten_tensor_list,
                                             input_tensor_list,
                                             group)

    for output_tensor_list, flatten_tensor in zip(output_tensor_lists,
                                                  flatten_tensor_list):
        for tensor, value in zip(output_tensor_list,
                                 _unflatten_dense_tensors(flatten_tensor,
                                                          output_tensor_list)):
            tensor.copy_(value)

    return ret
def _sync_params(self):
    params = [p.data for p in self.module.parameters()]
    result = broadcast_coalesced(params, self.device_ids, self.broadcast_bucket_size)
    for tensors, module in zip(result[1:], self._module_copies[1:]):
        for tensor, param in zip(tensors, module.parameters()):
            param.data.set_(tensor)

    buffers = list(self.module._all_buffers())
    if len(buffers) > 0:
        # cross-node buffer sync
        flat_buffers = _flatten_dense_tensors(buffers)
        dist.broadcast(flat_buffers, 0)
        for buf, synced in zip(buffers,
                               _unflatten_dense_tensors(flat_buffers, buffers)):
            buf.copy_(synced)

        # intra-node buffer sync
        result = broadcast_coalesced(buffers, self.device_ids,
                                     self.broadcast_bucket_size)
        for tensors, module in zip(result[1:], self._module_copies[1:]):
            for tensor, buf in zip(tensors, module._all_buffers()):
                buf.set_(tensor)
def allreduce_params():
    if module.needs_reduction:
        module.needs_reduction = False
        # bucketing params based on value types
        buckets = {}
        for param in module.parameters():
            if param.requires_grad and param.grad is not None:
                tp = type(param.data)
                if tp not in buckets:
                    buckets[tp] = []
                buckets[tp].append(param)

        for tp in buckets:
            bucket = buckets[tp]
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced, op=dist.reduce_op.SUM)
            coalesced /= dist.get_world_size()
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)
def model_grads_to_master_grads(model_params, master_params, flat_master=False):
    """
    Copy model gradients to master gradients.

    Args:
        model_params: List of model parameters created by :func:`prep_param_lists`.
        master_params: List of FP32 master parameters created by
            :func:`prep_param_lists`. If ``master_params`` was created with
            ``flat_master=True``, ``flat_master=True`` should also be supplied
            to :func:`model_grads_to_master_grads`.
    """
    if flat_master:
        # The flattening may incur one more deep copy than is necessary.
        master_params[0].grad.data.copy_(
            _flatten_dense_tensors([p.grad.data for p in model_params]))
    else:
        for model, master in zip(model_params, master_params):
            if model.grad is not None:
                if master.grad is None:
                    master.grad = Variable(master.data.new(*master.data.size()))
                master.grad.data.copy_(model.grad.data)
            else:
                master.grad = None
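# --- Illustration (not from the original sources): a minimal CPU sketch of the
# --- fp16/fp32 master-weights loop that model_grads_to_master_grads supports.
# --- Hand-made half tensors stand in for torch.cuda.HalfTensor parameters.
import torch

model_params = [torch.nn.Parameter(torch.randn(4).half())]
model_params[0].grad = torch.ones(4).half()
master_params = [p.detach().clone().float().requires_grad_() for p in model_params]
opt = torch.optim.SGD(master_params, lr=0.1)

# fp16 model grads -> fp32 master grads, step in fp32, copy weights back.
for model_p, master_p in zip(model_params, master_params):
    master_p.grad = model_p.grad.detach().float()
opt.step()
for model_p, master_p in zip(model_params, master_params):
    model_p.data.copy_(master_p.data)  # fp32 -> fp16 cast happens in copy_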
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(bucket,
                                  _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced)
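# --- Illustration (not from the original sources): how the private helper
# --- _take_tensors chops a tensor list into size-bounded buckets (grouped by
# --- type, total bytes under the limit). CPU-only, no process group needed.
import torch
from torch._utils import _take_tensors

tensors = [torch.zeros(1024), torch.zeros(1024), torch.zeros(4096)]
# 10 KB limit: the two 4 KB fp32 tensors share a bucket; the 16 KB one is alone.
for bucket in _take_tensors(tensors, 10 * 1024):
    print([t.numel() for t in bucket])  # [1024, 1024] then [4096]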
def step_fused_adam(self, closure=None):
    """
    Not supporting closure.
    """
    # First compute norm for all group so we know if there is overflow
    grads_groups_flat = []
    norm_groups = []
    for i, group in enumerate(self.fp16_groups):
        grads_groups_flat.append(
            _flatten_dense_tensors([
                torch.zeros(p.size(), dtype=p.dtype, device=p.device)
                if p.grad is None else p.grad for p in group
            ]))
        norm_groups.append(get_weight_norm(grads_groups_flat[i], mpu=self.mpu))

    self.overflow = self.overflow_checker.check_using_norm(norm_groups)
    prev_scale = self.cur_scale
    self._update_scale(self.overflow)

    if self.overflow:
        if self.verbose:
            print("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
                  "scale: {}, reducing to {}".format(prev_scale, self.cur_scale))
        return self.overflow

    combined_scale = self.unscale_and_clip_grads(grads_groups_flat,
                                                 norm_groups,
                                                 apply_scale=False)
    # norm is in fact norm*cur_scale
    self.optimizer.step(grads=[[g] for g in grads_groups_flat],
                        output_params=[[p] for p in self.fp16_groups_flat],
                        scale=combined_scale,
                        grad_norms=norm_groups)
    # TODO: we probably don't need this? just to be safe
    for i in range(len(norm_groups)):
        updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i],
                                                  self.fp16_groups[i])
        for p, q in zip(self.fp16_groups[i], updated_params):
            p.data = q.data
    return self.overflow
def broadcast_coalesced(tensors, devices, buffer_size=10485760):
    """Broadcasts a sequence of tensors to the specified GPUs.
    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        tensors (sequence): tensors to broadcast.
        devices (Iterable): an iterable of devices among which to broadcast.
            Note that it should be like (src, dst1, dst2, ...), the first
            element of which is the source device to broadcast from.
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple containing copies of the ``tensors``, placed on devices
        corresponding to indices from ``devices``.
    """
    for tensor in tensors:
        if tensor.get_device() != devices[0]:
            raise RuntimeError('all tensors must be on devices[0]')

    outputs = [[] for _ in devices]
    # use the original tensors for the first device
    outputs[0].extend(tensors)
    for chunk in _take_tensors(tensors, buffer_size):
        if chunk[0].is_sparse:
            flat_indices, flat_values = _flatten_sparse_tensors(chunk)
            result_indices = broadcast(flat_indices, devices)
            result_values = broadcast(flat_values, devices)
            unflat_results = tuple(_unflatten_sparse_tensors(iv, chunk)
                                   for iv in zip(result_indices, result_values))
        else:
            flat = _flatten_dense_tensors(chunk)
            results = broadcast(flat, devices)
            unflat_results = tuple(_unflatten_dense_tensors(tensor, chunk)
                                   for tensor in results)
        # use the broadcasted tensors for the remaining devices
        for dst, unflat_res in zip(outputs[1:], unflat_results[1:]):
            dst.extend(unflat_res)
    for i, output in enumerate(outputs):
        outputs[i] = _reorder_tensors_as(output, tensors)
    return tuple(outputs)
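# --- Illustration (not from the original sources): calling the public wrapper
# --- torch.cuda.comm.broadcast_coalesced. Needs at least two visible GPUs,
# --- hence the device_count guard.
import torch
from torch.cuda import comm

if torch.cuda.device_count() >= 2:
    tensors = [torch.ones(3, device='cuda:0'), torch.arange(4., device='cuda:0')]
    outputs = comm.broadcast_coalesced(tensors, devices=[0, 1])
    # outputs[0] are the originals; outputs[1] are the copies on cuda:1.
    print(outputs[1][0].device)  # cuda:1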
def flatten_dense_tensors_aligned(tensor_list, alignment, pg):
    num_elements = 0
    for tensor in tensor_list:
        num_elements = num_elements + tensor.numel()

    remaining = num_elements % alignment

    if remaining:
        elements_to_add = alignment - remaining
        pad_tensor = torch.zeros(elements_to_add,
                                 device=tensor_list[0].device,
                                 dtype=tensor_list[0].dtype)
        padded_tensor_list = tensor_list + [pad_tensor]
        num_elements = num_elements + elements_to_add
    else:
        padded_tensor_list = tensor_list

    if dist.get_rank(group=pg) == 0:
        print("Number of Elements is ", num_elements)

    return _flatten_dense_tensors(padded_tensor_list)
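# --- Illustration (not from the original sources): the alignment padding that
# --- flatten_dense_tensors_aligned performs, shown without the process group.
import torch
from torch._utils import _flatten_dense_tensors

tensor_list = [torch.ones(5), torch.ones(6)]  # 11 elements total
alignment = 8
remaining = sum(t.numel() for t in tensor_list) % alignment  # 3
if remaining:
    pad = torch.zeros(alignment - remaining)
    tensor_list = tensor_list + [pad]
print(_flatten_dense_tensors(tensor_list).numel())  # 16, the next multiple of 8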
def _queue_reduction(self, bucket_idx):
    grads_batch = self.buckets[bucket_idx]
    grads_batch_coalesced = []

    # coalesce the bucket
    for dev_id, dev_grads_batch in zip(self.device_ids, grads_batch):
        with torch.cuda.device(dev_id):
            dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch)
            grads_batch_coalesced.append(dev_grads_batch_coalesced)

    # reduce to the first GPU in self.device_ids
    if len(self.device_ids) > 1:
        nccl.reduce(grads_batch_coalesced, root=0, streams=self.default_streams)

    # now work on the first gpu
    reduction_work = self.process_group.allreduce([grads_batch_coalesced[0]],
                                                  self.allreduce_opts)
    self.reduction_works[bucket_idx] = reduction_work
    self.buckets_coalesced[bucket_idx] = grads_batch_coalesced[0]
def prep_param_lists(model, flat_master=False):
    """
    Creates a list of FP32 master parameters for a given model, as in
    `Training Neural Networks with Mixed Precision: Real Examples`_.

    Args:
        model (torch.nn.Module): Existing Pytorch model
        flat_master (bool, optional, default=False): Flatten the master
            parameters into a single tensor, as a performance optimization.

    Returns:
        A tuple (``model_params``, ``master_params``). ``model_params`` is a
        list of the model's parameters for later use with
        :func:`model_grads_to_master_grads` and
        :func:`master_params_to_model_params`. ``master_params`` is a list of
        FP32 master parameters. If ``flat_master=True``, ``master_params`` will
        be a list with one element.

    Example::

        model_params, master_params = prep_param_lists(model)

    .. warning::
        Currently, if ``flat_master=True``, all the model's parameters must be
        the same type. If the model has parameters of different types, use
        ``flat_master=False``, or use :class:`FP16_Optimizer`.

    .. _`Training Neural Networks with Mixed Precision: Real Examples`:
        http://on-demand.gputechconf.com/gtc/2018/video/S81012/
    """
    model_params = [param for param in model.parameters() if param.requires_grad]

    if flat_master:
        # Give the user some more useful error messages
        try:
            # flatten_dense_tensors returns a contiguous flat array.
            # http://pytorch.org/docs/master/_modules/torch/_utils.html
            master_params = _flatten_dense_tensors(
                [param.data for param in model_params]).float()
        except:
            print("Error in prep_param_lists: model may contain a mixture of "
                  "parameters of different types. Use flat_master=False, or "
                  "use FP16_Optimizer.")
            raise
        master_params = torch.nn.Parameter(master_params)
        master_params.requires_grad = True
        # master_params.register_hook(backwards_debug_hook)
        if master_params.grad is None:
            master_params.grad = master_params.new(*master_params.size())
        return model_params, [master_params]
    else:
        master_params = [param.clone().float().detach() for param in model_params]
        for param in master_params:
            param.requires_grad = True
        return model_params, master_params
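# --- Illustration (not from the original sources): the flat_master=True layout.
# --- With one flat fp32 master tensor, slices are copied back into the fp16
# --- model parameters after the optimizer step. CPU-only sketch.
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

model_params = [torch.nn.Parameter(torch.randn(3).half()),
                torch.nn.Parameter(torch.randn(2, 2).half())]
flat_master = _flatten_dense_tensors([p.data for p in model_params]).float()

flat_master += 0.1  # stands in for an fp32 optimizer update

for model_p, master_view in zip(model_params,
                                _unflatten_dense_tensors(flat_master, model_params)):
    model_p.data.copy_(master_view)  # fp32 -> fp16 cast happens in copy_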
def reduction_fn():
    # This function only needs to be called once
    if not self.need_reduction:
        return

    self.need_reduction = False
    all_grads = []

    # Bucketing all the gradients
    for param in self.module.parameters():
        if not param.requires_grad:
            continue
        if param.grad is not None and param.grad.requires_grad:
            raise RuntimeError("DistributedDataParallel only works "
                               "with gradients that don't require "
                               "grad")
        if param.grad is not None:
            # Adding the gradients for reduction
            all_grads.append(param.grad.data)
        else:
            all_grads.append(torch.zeros_like(param))

    # Now bucketing the parameters
    dev_grads_buckets = _take_tensors(all_grads, self.reduce_bucket_size)

    # Now reduce each bucket one after another
    for grads_batch in dev_grads_buckets:
        grads_batch_coalesced = _flatten_dense_tensors(grads_batch)
        grads_batch_coalesced /= self.world_size
        distributed_utils.all_reduce(grads_batch_coalesced, self.process_group)
        grads_batch_reduced = _unflatten_dense_tensors(grads_batch_coalesced,
                                                       grads_batch)
        for grad, reduced in zip(grads_batch, grads_batch_reduced):
            grad.copy_(reduced)
def step(self, closure=None):
    """
    Not supporting closure.
    """
    # First compute norm for all group so we know if there is overflow
    grads_groups_flat = []
    norm_groups = []
    skip = False
    for i, group in enumerate(self.fp16_groups):
        # grads_groups_flat.append(_flatten_dense_tensors([p.grad for p in group]))
        grads_groups_flat.append(
            _flatten_dense_tensors([
                p.grad if p.grad is not None else p.new_zeros(p.size())
                for p in group
            ]))
        norm_groups.append(self._compute_grad_norm(grads_groups_flat[i]))
        if norm_groups[i] == -1:  # TODO: early break
            skip = True

    if skip:
        self._update_scale(skip)
        return

    # norm is in fact norm*cur_scale
    self.optimizer.step(grads=[[g] for g in grads_groups_flat],
                        output_params=[[p] for p in self.fp16_groups_flat],
                        scale=self.cur_scale,
                        grad_norms=norm_groups)

    # TODO: we probably don't need this? just to be safe
    for i in range(len(norm_groups)):
        updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i],
                                                  self.fp16_groups[i])
        for p, q in zip(self.fp16_groups[i], updated_params):
            p.data = q.data

    self._update_scale(False)
    return
def _test_CheckOverflow(check_using_norm: bool):
    groups.initialize_model_parallel(1)
    groups.initialize_expert_parallel(2)

    param1 = torch.nn.Parameter(torch.Tensor([0]))
    param1.grad = torch.Tensor([1])
    param2 = torch.nn.Parameter(torch.Tensor([0]))
    if dist.get_rank() == 0:
        param2.grad = torch.Tensor([1])
    else:
        param2.grad = torch.Tensor([float("inf")])
    param2.allreduce = False
    # param2 is now MoE parameter
    parameters = [param1, param2]
    if check_using_norm:
        grads_group_flat = [_flatten_dense_tensors([p.grad for p in parameters])]
        norm = ds_utils.get_weight_norm(grads_group_flat)
        overflow_checker = ds_utils.CheckOverflow([parameters])
        overflow = overflow_checker.check_using_norm([norm], reduce_overflow=False)
    else:
        overflow_checker = ds_utils.CheckOverflow([parameters])
        overflow = overflow_checker.check()
    assert overflow
def reduce_gradients(module):
    # assuming warn_on_half is a module-level flag; without this declaration the
    # assignment below would make it local and the read would raise UnboundLocalError
    global warn_on_half

    buckets = {}
    for name, param in module.named_parameters():
        if param.requires_grad and param.grad is not None:
            tp = type(param.data)
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(param)

    if warn_on_half:
        if torch.cuda.HalfTensor in buckets:
            print("WARNING: gloo dist backend for half parameters may be slow." +
                  " It is recommended to use the NCCL backend in this case.")
            warn_on_half = False

    for tp in buckets:
        bucket = buckets[tp]
        grads = [param.grad.data for param in bucket]
        coalesced = _flatten_dense_tensors(grads)
        dist.all_reduce(coalesced)
        torch.cuda.synchronize()
        coalesced /= dist.get_world_size()
        for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
            buf.copy_(synced)
def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.
    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    dense_tensors = [[] for _ in inputs]  # shape (num_gpus, num_tensors)
    output = []
    ref_order = []
    # process sparse ones first since they may have different sizes on different gpus
    for tensor_at_gpus in zip(*inputs):
        if all(t.is_sparse for t in tensor_at_gpus):
            result = reduce_add(tensor_at_gpus, destination)
            output.append(result)
            ref_order.append(tensor_at_gpus[0])
        else:
            for coll, t in zip(dense_tensors, tensor_at_gpus):
                coll.append(t.to_dense() if t.is_sparse else t)
            ref_order.append(dense_tensors[0][-1])
    itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors]
    # now the dense ones, which have consistent sizes
    for chunks in zip(*itrs):
        flat_tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]
        flat_result = reduce_add(flat_tensors, destination)
        output.extend(_unflatten_dense_tensors(flat_result, chunks[0]))
    return tuple(_reorder_tensors_as(output, ref_order))
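# --- Illustration (not from the original sources): summing per-GPU tensor
# --- groups with the public wrapper torch.cuda.comm.reduce_add_coalesced.
# --- Needs at least two visible GPUs, hence the guard.
import torch
from torch.cuda import comm

if torch.cuda.device_count() >= 2:
    gpu0 = [torch.ones(3, device='cuda:0'), torch.ones(2, device='cuda:0')]
    gpu1 = [torch.ones(3, device='cuda:1'), torch.ones(2, device='cuda:1')]
    sums = comm.reduce_add_coalesced([gpu0, gpu1], destination=0)
    print(sums[0])  # tensor([2., 2., 2.], device='cuda:0')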
def step_fused_lamb(self, closure=None):
    """
    Not supporting closure.
    """
    # First compute norm for all group so we know if there is overflow
    grads_groups_flat = []
    grads_groups = []
    norm_groups = []
    for i, group in enumerate(self.fp16_groups):
        grads = [
            torch.zeros(p.size(), dtype=p.dtype, device=p.device)
            if p.grad is None else p.grad for p in group
        ]
        grads_groups.append(grads)
        grads_groups_flat.append(_flatten_dense_tensors(grads))
        norm_groups.append(get_weight_norm(grads_groups_flat[i], mpu=self.mpu))

    self.overflow = self.overflow_checker.check_using_norm(norm_groups)
    prev_scale = self.cur_scale
    self._update_scale(self.overflow)

    if self.overflow:
        if self.verbose:
            logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
                        "scale: {}, reducing to {}".format(prev_scale,
                                                           self.cur_scale))
        return self.overflow

    combined_scale = self.unscale_and_clip_grads(norm_groups, apply_scale=False)
    self.optimizer.step(grads=grads_groups,
                        output_params=self.fp16_groups,
                        scale=combined_scale)
    return self.overflow
def allreduce_params(reduce_after=True, no_scale=False, fp32_allreduce=False):
    if self.needs_reduction:
        self.needs_reduction = False
        buckets = {}
        for name, param in self.module.named_parameters():
            if param.requires_grad and param.grad is not None:
                tp = param.data.type()
                if tp not in buckets:
                    buckets[tp] = []
                buckets[tp].append(param)

        if self.warn_on_half:
            # buckets are keyed by type() strings here, so compare against the string name
            if 'torch.cuda.HalfTensor' in buckets:
                print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                      " It is recommended to use the NCCL backend in this case.")
                self.warn_on_half = False

        for tp in buckets:
            bucket = buckets[tp]
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            if fp32_allreduce:
                coalesced = coalesced.float()
            if not no_scale and not reduce_after:
                coalesced /= dist.get_world_size(group=self.data_parallel_group)
            dist.all_reduce(coalesced, group=self.data_parallel_group)
            torch.cuda.synchronize()
            if not no_scale and reduce_after:
                coalesced /= dist.get_world_size(group=self.data_parallel_group)
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)
def model_grads_to_master_grads(model_params,
                                master_params,
                                flat_master=False,
                                loss_scale=1.0,
                                params_have_main_grad=False):
    """
    Copy model gradients to master gradients.

    Args:
        model_params: List of model parameters created by :func:`prep_param_lists`.
        master_params: List of FP32 master parameters created by
            :func:`prep_param_lists`. If ``master_params`` was created with
            ``flat_master=True``, ``flat_master=True`` should also be supplied
            to :func:`model_grads_to_master_grads`.
    """
    if flat_master:
        # The flattening may incur one more deep copy than is necessary.
        master_params[0].grad.data.copy_(
            _flatten_dense_tensors([p.grad.data for p in model_params]))
    else:
        for model, master in zip(model_params, master_params):
            if model.device.type == "cpu":
                continue
            if model.grad is not None:
                if master.grad is None:
                    if params_have_main_grad:
                        # If gradient_as_bucket_view is False, this will be a copy
                        master.grad = model.grad.float()
                    else:
                        master.grad = Variable(master.data.new(*master.data.size()))
            else:
                master.grad = None

        model_grads = [p.grad for p in model_params if p.grad is not None]
        master_grads = [p.grad for p in master_params if p.grad is not None]
        if len(model_grads) == 0 or len(master_grads) == 0:
            return
        _overflow_buf = torch.cuda.IntTensor([0])
        multi_tensor_applier(amp_C.multi_tensor_scale,
                             _overflow_buf,
                             [model_grads, master_grads],
                             1.0 / loss_scale)
def allreduce_params():
    if self.needs_reduction:
        self.needs_reduction = False
        buckets = {}
        for param in self.module.parameters():
            if param.requires_grad and param.grad is not None:
                tp = type(param.data)
                if tp not in buckets:
                    buckets[tp] = []
                buckets[tp].append(param)

        if self.warn_on_half:
            if torch.cuda.HalfTensor in buckets:
                print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                      " It is recommended to use the NCCL backend in this case.")
                self.warn_on_half = False

        for tp in buckets:
            bucket = buckets[tp]
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced)
            coalesced /= dist.get_world_size()
            for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)
def __init__(self,
             init_optimizer,
             deepspeed=None,
             static_loss_scale=1.0,
             dynamic_loss_scale=False,
             initial_dynamic_scale=2**32,
             dynamic_loss_args=None,
             verbose=True,
             mpu=None,
             clip_grad=0.0,
             fused_adam_legacy=False,
             timers=None):
    self.fused_adam_legacy = fused_adam_legacy
    self.timers = timers

    if not torch.cuda.is_available():
        raise SystemError("Cannot use fp16 without CUDA.")
    self.optimizer = init_optimizer

    # param flattened by groups
    self.fp16_groups = []
    self.fp16_groups_flat = []
    self.fp32_groups_flat = []

    # loop to deal with groups
    for i, param_group in enumerate(self.optimizer.param_groups):
        # push this group to list before modify
        self.fp16_groups.append(param_group['params'])
        # init fp16 weight buffer, flattened
        self.fp16_groups_flat.append(
            _flatten_dense_tensors([p.clone().detach() for p in self.fp16_groups[i]]))
        # set model fp16 weight to slices of flattened buffer
        updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i],
                                                  self.fp16_groups[i])
        for p, q in zip(self.fp16_groups[i], updated_params):
            p.data = q.data
        # init master weight, flattened
        self.fp32_groups_flat.append(
            self.fp16_groups_flat[i].clone().float().detach())
        # modify optimizer to have flat master weight
        self.fp32_groups_flat[i].requires_grad = True  # keep this in case internal optimizer uses it
        param_group['params'] = [self.fp32_groups_flat[i]]

    # we may have a way of fusing dynamic scale. Do not support for now
    if dynamic_loss_scale:
        self.dynamic_loss_scale = True
        self.cur_iter = 0
        self.last_overflow_iter = -1
        self.scale_factor = 2

        if dynamic_loss_args is None:
            self.cur_scale = initial_dynamic_scale
            self.scale_window = 1000
            self.min_loss_scale = 1
        else:
            self.cur_scale = dynamic_loss_args[INITIAL_LOSS_SCALE]
            self.scale_window = dynamic_loss_args[SCALE_WINDOW]
            self.min_loss_scale = dynamic_loss_args[MIN_LOSS_SCALE]
    else:
        self.dynamic_loss_scale = False
        self.cur_iter = 0
        self.cur_scale = static_loss_scale

    self.verbose = verbose

    self.clip_grad = clip_grad
    self.norm_type = 2

    TORCH_MAJOR = int(torch.__version__.split('.')[0])
    TORCH_MINOR = int(torch.__version__.split('.')[1])
    if TORCH_MAJOR == 0 and TORCH_MINOR <= 4:
        self.clip_grad_norm = torch.nn.utils.clip_grad_norm
    else:
        self.clip_grad_norm = torch.nn.utils.clip_grad_norm_

    # model parallel object
    self.mpu = mpu

    self.overflow = False
    self.overflow_checker = CheckOverflow(self.fp16_groups,
                                          mpu=self.mpu,
                                          deepspeed=deepspeed)
    self.initialize_optimizer_states()
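# --- Illustration (not from the original sources): the aliasing trick used in
# --- the constructor above, where each p.data is re-pointed at a slice of the
# --- flat buffer so that writes to the buffer show through the parameters.
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

params = [torch.nn.Parameter(torch.ones(3)), torch.nn.Parameter(torch.ones(2))]
flat = _flatten_dense_tensors([p.clone().detach() for p in params])

for p, q in zip(params, _unflatten_dense_tensors(flat, params)):
    p.data = q.data  # p.data is now a view into flat

flat.zero_()
print(params[0].data)  # tensor([0., 0., 0.])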
def step(self, closure=None):
    """
    Not supporting closure.
    """
    if self.fused_adam_legacy:
        return self.step_fused_adam()

    COMPUTE_NORM = "compute_norm"
    OVERFLOW_CHECK = 'overflow_check'
    OVERFLOW_TIMERS = [COMPUTE_NORM, OVERFLOW_CHECK]
    UNSCALE_AND_CLIP = 'unscale_and_clip'
    BASIC_STEP = 'basic_step'
    UPDATE_FP16 = 'update_fp16'
    STEP_TIMERS = OVERFLOW_TIMERS + [UNSCALE_AND_CLIP, BASIC_STEP, UPDATE_FP16]

    # First determine if there is overflow.
    self.start_timers([OVERFLOW_CHECK])
    fp16_params = []
    for i, group in enumerate(self.fp16_groups):
        fp16_params.extend([p for p in group if p.grad is not None])
    self.overflow = self.overflow_checker.has_overflow(fp16_params)
    self.stop_timers([OVERFLOW_CHECK])
    prev_scale = self.cur_scale
    self._update_scale(self.overflow)
    if self.overflow:
        if self.verbose:
            log_dist("Overflow detected. Skipping step. Attempted loss "
                     f"scale: {prev_scale}, reducing to {self.cur_scale}",
                     ranks=[0])
        # Clear gradients
        for i, group in enumerate(self.fp16_groups):
            for p in group:
                p.grad = None

        self.log_timers(OVERFLOW_TIMERS)
        return self.overflow

    grads_groups_flat = []
    for i, group in enumerate(self.fp16_groups):
        data_type = self.fp32_groups_flat[i].dtype

        grads_groups_flat.append(
            _flatten_dense_tensors([
                torch.zeros(p.size(), dtype=data_type, device=p.device)
                if p.grad is None else p.grad.to(data_type) for p in group
            ]))

        for p in group:
            p.grad = None

        self.fp32_groups_flat[i].grad = grads_groups_flat[i]

    self.start_timers([COMPUTE_NORM])
    all_groups_norm = get_grad_norm(self.fp32_groups_flat, mpu=self.mpu)
    self.stop_timers([COMPUTE_NORM])

    self.start_timers([UNSCALE_AND_CLIP])
    self.unscale_and_clip_grads(grads_groups_flat, [all_groups_norm])
    self.stop_timers([UNSCALE_AND_CLIP])

    self.start_timers([BASIC_STEP])
    self.optimizer.step()
    self.stop_timers([BASIC_STEP])

    # get rid of the fp32 gradients. Not needed anymore
    for group in self.fp32_groups_flat:
        group.grad = None

    self.start_timers([UPDATE_FP16])
    for i in range(len(self.fp16_groups)):
        updated_params = _unflatten_dense_tensors(self.fp32_groups_flat[i],
                                                  self.fp16_groups[i])
        for p, q in zip(self.fp16_groups[i], updated_params):
            p.data.copy_(q.data)
    self.stop_timers([UPDATE_FP16])

    self.log_timers(STEP_TIMERS)

    return self.overflow
def __init__(self, optimizer, static_loss_scale=1.0, dynamic_loss_scale=False):
    if not torch.cuda.is_available():
        raise SystemError('Cannot use fp16 without CUDA')

    self.fp16_param_groups = []
    self.fp32_param_groups = []
    self.fp32_flattened_groups = []
    for i, param_group in enumerate(optimizer.param_groups):
        print("FP16_Optimizer processing param group {}:".format(i))
        fp16_params_this_group = []
        fp32_params_this_group = []
        for param in param_group['params']:
            if param.requires_grad:
                if param.type() == 'torch.cuda.HalfTensor':
                    print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
                          .format(param.size()))
                    fp16_params_this_group.append(param)
                elif param.type() == 'torch.cuda.FloatTensor':
                    print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
                          .format(param.size()))
                    fp32_params_this_group.append(param)
                else:
                    raise TypeError("Wrapped parameters must be either "
                                    "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                    "Received {}".format(param.type()))

        fp32_flattened_this_group = None
        if len(fp16_params_this_group) > 0:
            fp32_flattened_this_group = _flatten_dense_tensors([
                param.detach().data.clone().float()
                for param in fp16_params_this_group
            ])
            fp32_flattened_this_group = Variable(fp32_flattened_this_group,
                                                 requires_grad=True)
            fp32_flattened_this_group.grad = fp32_flattened_this_group.new(
                *fp32_flattened_this_group.size())

        # python's lovely list concatenation via +
        if fp32_flattened_this_group is not None:
            param_group['params'] = [fp32_flattened_this_group] + fp32_params_this_group
        else:
            param_group['params'] = fp32_params_this_group

        self.fp16_param_groups.append(fp16_params_this_group)
        self.fp32_param_groups.append(fp32_params_this_group)
        self.fp32_flattened_groups.append(fp32_flattened_this_group)

    # print("self.fp32_flattened_groups = ", self.fp32_flattened_groups)
    # print("self.fp16_param_groups = ", self.fp16_param_groups)

    self.optimizer = optimizer.__class__(optimizer.param_groups)
    # self.optimizer.load_state_dict(optimizer.state_dict())

    self.param_groups = self.optimizer.param_groups

    if dynamic_loss_scale:
        self.dynamic_loss_scale = True
        self.loss_scaler = DynamicLossScaler()
    else:
        self.dynamic_loss_scale = False
        self.loss_scaler = LossScaler(static_loss_scale)

    self.overflow = False
    self.first_closure_call_this_step = True
def get_flat_sub_partitions(comm_tensor_list,
                            comm_param_offsets,
                            sub_partition_size,
                            dtype,
                            num_comm_intervals=None,
                            default_device=None,
                            return_partition_params=False):
    partition_params = []
    final_param_offsets = []
    flat_sub_partitions = []
    for tensor_list, param_offsets in zip(comm_tensor_list, comm_param_offsets):
        flat_tensor_list = []
        current_size = 0
        my_offsets = []
        my_params = []

        if dtype is None:
            dtype = tensor_list[0].dtype

        for i, tensor in enumerate(tensor_list):
            if tensor.grad is None:
                tensor.grad = torch.zeros(tensor.size(),
                                          dtype=tensor.dtype,
                                          device=tensor.device)
            param = tensor
            tensor = tensor.grad
            num_elements = tensor.numel()
            tensor_offset = 0

            # we need to offset to get to the right element
            if i == 0 and param_offsets[i] > 0:
                tensor_offset = param_offsets[i]
                num_elements = num_elements - tensor_offset

            # We don't need all elements of the tensor if this tensor is
            # larger than we have space for in our curr sub-partition
            if num_elements > (sub_partition_size - current_size):
                num_elements = sub_partition_size - current_size

            # we need a narrow view of the tensor based on the tensor offset
            # and number of elements that we need from this tensor
            if tensor_offset > 0 or num_elements < tensor.numel():
                flat_tensor_list.append(tensor.contiguous().view(-1).narrow(
                    0, int(tensor_offset), int(num_elements)).to(dtype))
            else:
                flat_tensor_list.append(tensor.to(dtype))
            my_params.append(param)

            # remember offset into partition and #elems for this tensor
            my_offsets.append((current_size, num_elements))

            current_size = current_size + num_elements

        # this means it's the last partition and does not align with the dp
        # boundary. We need to pad before flattening
        if current_size < sub_partition_size:
            my_offsets.append((None, None))
            my_params.append(None)
            if len(tensor_list) == 0:
                assert default_device is not None
                flat_tensor_list.append(
                    torch.zeros(int(sub_partition_size - current_size),
                                dtype=dtype,
                                device=default_device))
            else:
                flat_tensor_list.append(
                    torch.zeros(int(sub_partition_size - current_size),
                                dtype=dtype,
                                device=tensor_list[0].device))
        partition_params.append(my_params)  # flat_tensor_list)
        final_param_offsets.append(my_offsets)
        assert len(flat_tensor_list) == len(my_offsets), "{} {}".format(
            len(flat_tensor_list), len(my_offsets))
        flat_sub_partitions.append(_flatten_dense_tensors(flat_tensor_list))

    if num_comm_intervals is not None and len(flat_sub_partitions) < num_comm_intervals:
        # print("padding w. sub partitions to ensure uniform communication")
        device = flat_sub_partitions[0].device
        for _ in range(num_comm_intervals - len(flat_sub_partitions)):
            flat_sub_partitions.append(
                torch.zeros(int(sub_partition_size), dtype=dtype, device=device))
            partition_params.append([None])
            final_param_offsets.append([(None, None)])

    if return_partition_params:
        assert len(flat_sub_partitions) == len(partition_params)
        assert len(partition_params) == len(final_param_offsets), \
            "{} {}".format(len(partition_params), len(final_param_offsets))
        return flat_sub_partitions, partition_params, final_param_offsets
    return flat_sub_partitions
def flatten_dense_tensors_sub_partition_aligned(tensor_list,
                                                dp,
                                                max_elements_per_comm,
                                                pg):
    num_elements = 0
    for tensor in tensor_list:
        num_elements = num_elements + tensor.numel()

    pprint("Total number of elements in model: {}, max elements per com: {}".format(
        num_elements, max_elements_per_comm))

    max_elements_per_comm = min(max_elements_per_comm, num_elements)
    sub_partition_size = int(max_elements_per_comm // dp)

    alignment = sub_partition_size

    # if alignment == 0:
    #     # number of elements not divisible by dp, outside range and small
    #     # model must pad with zeroes
    #     pad_tensor = torch.zeros(max_elements_per_comm,
    #                              device=tensor_list[0].device,
    #                              dtype=tensor_list[0].dtype)
    #     return _flatten_dense_tensors(pad_tensor)

    remaining = int(num_elements % alignment)

    # ensure we have equal sized sub-partitions
    elements_to_add = 0
    if remaining:
        elements_to_add = alignment - remaining
        # adding padded tensor later after we check comm alignment
        pprint("adding pad tensor for alignment, {} + {}->{}".format(
            num_elements, elements_to_add, num_elements + elements_to_add))
        # num_elements = num_elements + elements_to_add
    else:
        padded_tensor_list = tensor_list

    num_partitions = int((num_elements + elements_to_add) // sub_partition_size)
    assert (num_elements + elements_to_add) % sub_partition_size == 0, \
        "num elements should be aligned by sub partition size"
    num_comm_intervals = int(num_partitions // dp)
    partition_remaining = int(num_partitions % dp)
    pprint("num_comm_intervals={}, partition_remaining={}".format(
        num_comm_intervals, partition_remaining))
    if partition_remaining != 0:
        pprint("adding pad tensor and/or extra sub partition")
        # add pad tensor for alignment of comm interval, this overrules
        # previous possibly sub-partition alignment
        num_comm_intervals += 1
        aligned_comm_elements = num_comm_intervals * sub_partition_size * dp
        elements_to_add = aligned_comm_elements - num_elements

        pad_tensor = torch.zeros(elements_to_add,
                                 device=tensor_list[0].device,
                                 dtype=tensor_list[0].dtype)
        padded_tensor_list = tensor_list + [pad_tensor]
        pprint("adding pad tensor and/or extra sub partition, {} + {}->{}".format(
            num_elements, elements_to_add, num_elements + elements_to_add))
        num_elements += elements_to_add
    elif elements_to_add > 0:
        # add pad tensor for just alignment of sub-partition
        pad_tensor = torch.zeros(elements_to_add,
                                 device=tensor_list[0].device,
                                 dtype=tensor_list[0].dtype)
        padded_tensor_list = tensor_list + [pad_tensor]
        num_elements += elements_to_add

    if pg is None or dist.get_rank(group=pg) == 0:
        print("Number of Elements (w. padding) is ", num_elements)

    padded_num_elems = 0
    for p in padded_tensor_list:
        padded_num_elems += p.numel()
    assert num_elements == padded_num_elems, "{} != {}, rank={}".format(
        num_elements, padded_num_elems, dist.get_rank())

    return _flatten_dense_tensors(padded_tensor_list)
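# --- Illustration (not from the original sources): the padding arithmetic
# --- above, worked with small hypothetical numbers.
num_elements = 10          # model elements
dp = 2                     # data-parallel ranks
max_elements_per_comm = min(8, num_elements)       # 8
sub_partition_size = max_elements_per_comm // dp   # 4

remaining = num_elements % sub_partition_size      # 2 -> pad 10 up to 12
elements_to_add = sub_partition_size - remaining   # 2
num_partitions = (num_elements + elements_to_add) // sub_partition_size  # 3

# 3 partitions don't split evenly over dp=2 ranks, so a whole extra comm
# interval is added: 2 intervals * 4 elements * 2 ranks = 16 padded elements.
if num_partitions % dp != 0:
    num_comm_intervals = num_partitions // dp + 1
    print(num_comm_intervals * sub_partition_size * dp)  # 16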
def step(self, closure=None):
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        weight_decay = group['weight_decay']
        momentum = group['momentum']

        all_grads = []

        for p in group['params']:
            if p.grad is None:
                continue
            d_p = p.grad.data
            if self.compression_buffer == False:
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    # signum
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                    else:
                        buf = param_state['momentum_buffer']
                    buf.mul_(momentum).add_((1 - momentum), d_p)
                    d_p.copy_(buf)
            all_grads.append(d_p)

        dev_grads_buckets = _take_tensors(all_grads, self.bucket_size)
        for dev_grads in dev_grads_buckets:
            d_p_new = _flatten_dense_tensors(dev_grads)

            if self.all_reduce:
                dist.all_reduce(d_p_new, group=0)  # self.all_gpu
            else:
                if self.nodes > 1:
                    if self.compression_buffer:
                        coded, data_time = QSGD_gpu.encode(d_p_new)
                        # specific coded dic just on CPU
                        tensor_signs = coded['signs'].float().to(self.device)
                        tensor_selected = coded['selected'].float().to(self.device)
                        tensor_norm = coded['norm']
                        # size
                        tensor_signs_size = self.pack_len_tensor_into_tensor(tensor_signs)
                        tensor_selected_size = self.pack_len_tensor_into_tensor(tensor_selected)
                        # tensor_norm_size = self.pack_len_tensor_into_tensor(tensor_norm)  # norm doesn't need size

                        # custom
                        '''
                        print(tensor_signs.type())
                        print(tensor_selected.type())
                        print(tensor_norm.type())
                        '''
                    else:
                        d_p_new = torch.sign(d_p_new)

                    if self.local_rank == 0:
                        if self.all_gather_commu:
                            # This version only for instances each with one GPU
                            for node_index in self.inter_node_list:
                                if node_index != self.nodes_rank:
                                    d.set()
                                    f.set()
                                    coded_temp = coded.copy()
                                    f.record()

                                    b.set()
                                    tensor_signs_size_temp = tensor_signs_size.clone()
                                    dist.broadcast(tensor_signs_size_temp, node_index,
                                                   group=self.all_inter_node_group)
                                    b.record()
                                    c.set()
                                    tensor_signs_temp = torch.zeros(
                                        [int(tensor_signs_size_temp[0])],
                                        device=self.device, dtype=torch.float)
                                    c.record()
                                    a.set()
                                    dist.broadcast(tensor_signs_temp, node_index,
                                                   group=self.all_inter_node_group)
                                    a.record()
                                    d.record()

                                    e.set()
                                    tensor_selected_size_temp = tensor_selected_size.clone()
                                    dist.broadcast(tensor_selected_size_temp, node_index,
                                                   group=self.all_inter_node_group)
                                    tensor_selected_temp = torch.zeros(
                                        [int(tensor_selected_size_temp[0])],
                                        device=self.device, dtype=torch.float)
                                    dist.broadcast(tensor_selected_temp, node_index,
                                                   group=self.all_inter_node_group)
                                    e.record()

                                    tensor_norm_temp = tensor_norm.clone()
                                    dist.broadcast(tensor_norm_temp, node_index,
                                                   group=self.all_inter_node_group)

                                    coded_temp['signs'] = tensor_signs_temp.int()
                                    coded_temp['selected'] = tensor_selected_temp.long()
                                    coded_temp['norm'] = tensor_norm_temp

                                    tensor_decoded = QSGD_gpu.decode(coded_temp, cuda=True)
                                    d_p_new = d_p_new + tensor_decoded
                                    '''
                                    print('a', a.get_time())
                                    print('b', b.get_time())
                                    print('c', c.get_time())
                                    print('d', d.get_time())
                                    print('e', e.get_time())
                                    print('f', f.get_time())
                                    '''
                                else:
                                    dist.broadcast(tensor_signs_size, node_index,
                                                   group=self.all_inter_node_group)
                                    dist.broadcast(tensor_signs, node_index,
                                                   group=self.all_inter_node_group)
                                    dist.broadcast(tensor_selected_size, node_index,
                                                   group=self.all_inter_node_group)
                                    dist.broadcast(tensor_selected, node_index,
                                                   group=self.all_inter_node_group)
                                    dist.broadcast(tensor_norm, node_index,
                                                   group=self.all_inter_node_group)
                            d_p_new = d_p_new / dist.get_world_size()
                        else:
                            if dist.get_rank() == 0:
                                for index, inter_node_group in enumerate(self.inter_node_group_list):
                                    coded_temp = coded.copy()

                                    tensor_signs_size_temp = tensor_signs_size.clone()
                                    dist.broadcast(tensor_signs_size_temp,
                                                   self.inter_node_list[index + 1],
                                                   group=inter_node_group)
                                    tensor_signs_temp = torch.zeros(
                                        [int(tensor_signs_size_temp[0])],
                                        device=self.device, dtype=torch.float)
                                    dist.broadcast(tensor_signs_temp,
                                                   self.inter_node_list[index + 1],
                                                   group=inter_node_group)

                                    tensor_selected_size_temp = tensor_selected_size.clone()
                                    dist.broadcast(tensor_selected_size_temp,
                                                   self.inter_node_list[index + 1],
                                                   group=inter_node_group)
                                    tensor_selected_temp = torch.zeros(
                                        [int(tensor_selected_size_temp[0])],
                                        device=self.device, dtype=torch.float)
                                    dist.broadcast(tensor_selected_temp,
                                                   self.inter_node_list[index + 1],
                                                   group=inter_node_group)

                                    tensor_norm_temp = tensor_norm.clone()
                                    dist.broadcast(tensor_norm_temp,
                                                   self.inter_node_list[index + 1],
                                                   group=inter_node_group)

                                    coded_temp['signs'] = tensor_signs_temp.int()
                                    coded_temp['selected'] = tensor_selected_temp.long()
                                    coded_temp['norm'] = tensor_norm_temp

                                    tensor_decoded = QSGD_gpu.decode(coded_temp, cuda=True)
                                    d_p_new = d_p_new + tensor_decoded
                                    '''
                                    #temp
                                    print(tensor_decoded)
                                    tensor_decoded_temp = tensor_decoded.clone()
                                    dist.broadcast(tensor_decoded_temp, self.inter_node_list[index + 1], group = inter_node_group)
                                    if tensor_decoded == tensor_decoded_temp:
                                        print('success')
                                    print(tensor_signs_size_temp)
                                    print(tensor_selected_size_temp)
                                    '''
                                d_p_new = d_p_new / dist.get_world_size()
                            else:
                                dist.broadcast(tensor_signs_size, dist.get_rank(),
                                               group=self.inter_node_group_list[self.nodes_rank - 1])
                                dist.broadcast(tensor_signs, dist.get_rank(),
                                               group=self.inter_node_group_list[self.nodes_rank - 1])
                                dist.broadcast(tensor_selected_size, dist.get_rank(),
                                               group=self.inter_node_group_list[self.nodes_rank - 1])
                                dist.broadcast(tensor_selected, dist.get_rank(),
                                               group=self.inter_node_group_list[self.nodes_rank - 1])
                                dist.broadcast(tensor_norm, dist.get_rank(),
                                               group=self.inter_node_group_list[self.nodes_rank - 1])
                                '''
                                #temp
                                tensor_decoded = QSGD_gpu.decode(coded, cuda = True)
                                print(tensor_decoded)
                                dist.broadcast(tensor_decoded, dist.get_rank(), group = self.inter_node_group_list[self.nodes_rank - 1])
                                print(tensor_signs_size)
                                print(tensor_selected_size)
                                '''
                            dist.barrier(group=self.all_inter_node_group)
                            # os._exit()

                            if self.bidirection_compress:
                                if dist.get_rank() == 0:
                                    coded, data_time = QSGD_gpu.encode(d_p_new)
                                    tensor_signs = coded['signs']
                                    tensor_selected = coded['selected']
                                    tensor_norm = coded['norm']
                                    tensor_signs_size = self.pack_len_tensor_into_tensor(tensor_signs)
                                    tensor_selected_size = self.pack_len_tensor_into_tensor(tensor_selected)

                                dist.barrier(group=self.all_inter_node_group)
                                dist.broadcast(tensor_signs_size, 0, group=self.all_inter_node_group)
                                dist.broadcast(tensor_selected_size, 0, group=self.all_inter_node_group)

                                if dist.get_rank() != 0:
                                    tensor_signs = torch.randn(
                                        [int(tensor_signs_size[0])]).type_as(tensor_signs)
                                    tensor_selected = torch.randn(
                                        [int(tensor_selected_size[0])]).type_as(tensor_selected)

                                dist.barrier(group=self.all_inter_node_group)
                                dist.broadcast(tensor_signs, 0, group=self.all_inter_node_group)
                                dist.broadcast(tensor_selected, 0, group=self.all_inter_node_group)
                                dist.broadcast(tensor_norm, 0, group=self.all_inter_node_group)

                                coded['signs'] = tensor_signs
                                coded['selected'] = tensor_selected
                                coded['norm'] = tensor_norm

                                tensor_decoded = QSGD_gpu.decode(coded, cuda=True)
                                d_p_new = tensor_decoded
                            else:
                                if dist.get_rank() == 0:
                                    dist.barrier(group=self.all_inter_node_group)
                                dist.broadcast(d_p_new, 0, group=self.all_inter_node_group)
                else:
                    # test for one
                    coded, data_time = QSGD_gpu.encode(d_p_new)
                    tensor_decoded = QSGD_gpu.decode(coded, cuda=True)
                    d_p_new = tensor_decoded

            # unflatten
            dev_grads_new = _unflatten_dense_tensors(d_p_new, dev_grads)
            for grad, reduced in zip(dev_grads, dev_grads_new):
                grad.copy_(reduced)

        for p in group['params']:
            if self.compression_buffer:
                if weight_decay != 0:
                    p.grad.data.add_(weight_decay, p.data)
            p.data.add_(-group['lr'], p.grad.data)

    return loss
def sync_buffers_bucket(self):
    buffers = [p.data for p in list(self.model._all_buffers())]
    for tensors in _take_tensors(buffers, self.mpi_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        # sum the coalesced buffers onto rank 0; the result lands in flat_tensors there
        dist.reduce(flat_tensors, dst=0, op=reduce_op.SUM)
def get_model_state(self, group):
    params = self.params[group]
    return _flatten_dense_tensors([p.data.float() for p in params])
def step(self, closure=None):
    args = self.args

    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        weight_decay = group['weight_decay']
        momentum = group['momentum']
        cur_lr = group['lr']

        all_grads = []

        for p in group['params']:
            if p.grad is None:
                continue
            d_p = p.grad.data
            if self.compression_buffer == False:
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    # signum
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                    else:
                        buf = param_state['momentum_buffer']
                    buf.mul_(momentum).add_(d_p)
                    d_p.add_(momentum, buf)
            all_grads.append(d_p)

        length = 0
        for _ in _take_tensors(all_grads, self.bucket_size):
            length += 1

        dev_grads_buckets = _take_tensors(all_grads, self.bucket_size)
        for i, dev_grads in enumerate(dev_grads_buckets):
            d_p_new = _flatten_dense_tensors(dev_grads)

            if len(self.err_buf) < length:
                self.err_buf.append(torch.zeros_like(d_p_new))
                self.server_err_buf.append(torch.zeros_like(d_p_new))

            err_buf = self.err_buf[i]
            server_err_buf = self.server_err_buf[i]

            d_p_new.add_(self.prev_lr / cur_lr, err_buf)

            p_buf = d_p_new

            if self.all_reduce:
                dist.all_reduce(d_p_new)  # self.all_gpu, group = 0
                if self.signum:
                    d_p_new = torch.sign(d_p_new)
            elif self.signum:
                if self.nodes > 1:
                    if self.compression_buffer:
                        d_p_new_scale = torch.ones(1)
                        d_p_new_scale[0] = d_p_new.abs().sum().cpu().item() / d_p_new.numel()
                        d_p_new, tensor_size = self.compressor.compress(d_p_new)
                        tmp = self.compressor.uncompress(d_p_new.clone(), tensor_size)
                        tmp.mul_(d_p_new_scale.item())
                        err_buf.copy_(p_buf).sub_(tmp)
                    else:
                        d_p_new = torch.sign(d_p_new)

                    if dist.get_rank() == 0:
                        d_p_new_list = []
                        d_p_new_scale_list = []
                        for index, inter_node_group in enumerate(self.inter_node_group_list):
                            d_p_temp = d_p_new.clone()
                            d_p_scale_temp = d_p_new_scale.clone()
                            dist.broadcast(d_p_scale_temp,
                                           self.inter_node_list[index + 1],
                                           group=inter_node_group)
                            dist.broadcast(d_p_temp,
                                           self.inter_node_list[index + 1],
                                           group=inter_node_group)
                            d_p_new_list.append(d_p_temp)
                            d_p_new_scale_list.append(d_p_scale_temp)
                    else:
                        dist.broadcast(d_p_new_scale, dist.get_rank(),
                                       group=self.inter_node_group_list[self.nodes_rank - 1])
                        dist.broadcast(d_p_new, dist.get_rank(),
                                       group=self.inter_node_group_list[self.nodes_rank - 1])

                    dist.barrier(group=self.all_inter_node_group)

                    if dist.get_rank() == 0:
                        if self.compression_buffer:
                            d_p_new_list.append(d_p_new)  # count itself
                            d_p_new_scale_list.append(d_p_new_scale)  # count itself
                            # d_p_new = self.compressor.majority_vote(d_p_new_list)
                            d_p_new = torch.zeros(tensor_size).cuda()
                            for d_p, d_p_scale in zip(d_p_new_list, d_p_new_scale_list):
                                tmp = self.compressor.uncompress(d_p, tensor_size)
                                d_p_new.add_(d_p_scale.item(), tmp)
                            d_p_new /= self.nodes

                            d_p_new.add_(self.prev_lr / cur_lr, server_err_buf)
                            un_compr = d_p_new

                            d_p_new_scale = torch.ones(1)
                            d_p_new_scale[0] = d_p_new.abs().sum().cpu().item() / d_p_new.numel()

                            d_p_new, _ = self.compressor.compress(d_p_new)

                            tmp = self.compressor.uncompress(d_p_new.clone(), tensor_size)
                            tmp.mul_(d_p_new_scale.item())
                            server_err_buf.copy_(un_compr).sub_(tmp)
                        else:
                            for d_p_temp in d_p_new_list:
                                d_p_new.add_(d_p_temp)
                            d_p_new = d_p_new / self.nodes

                    dist.barrier(group=self.all_inter_node_group)
                    dist.broadcast(d_p_new, 0, group=self.all_inter_node_group)
                    if self.compression_buffer:
                        dist.broadcast(d_p_new_scale, 0, group=self.all_inter_node_group)

                    if self.compression_buffer:
                        d_p_new = self.compressor.uncompress(d_p_new, tensor_size)
                        d_p_new.mul_(d_p_new_scale.item())
            else:
                print('You can not run without signum or all_reduce')

            # unflatten
            dev_grads_new = _unflatten_dense_tensors(d_p_new, dev_grads)
            for grad, reduced in zip(dev_grads, dev_grads_new):
                grad.copy_(reduced)

        for p in group['params']:
            if self.compression_buffer:
                # This part of code is temporary
                if weight_decay != 0:
                    if momentum != 0:
                        param_state = self.state[p]
                        if 'wd_mom' not in param_state:
                            buf = param_state['wd_mom'] = torch.zeros_like(p.data)
                        else:
                            buf = param_state['wd_mom']
                        buf.mul_(momentum).add_(weight_decay, p.data)
                        p.grad.data.add_(momentum, buf)
                    p.grad.data.add_(weight_decay, p.data)
            p.data.add_(-group['lr'], p.grad.data)

        self.prev_lr = group['lr']

    return loss
def step(self, closure=None):
    args = self.args

    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        weight_decay = group['weight_decay']
        momentum = group['momentum']

        all_grads = []

        for p in group['params']:
            if p.grad is None:
                continue
            d_p = p.grad.data
            if self.compression_buffer == False:
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    # signum
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                    else:
                        buf = param_state['momentum_buffer']
                    buf.mul_(momentum).add_((1 - momentum), d_p)
                    d_p.copy_(buf)
            all_grads.append(d_p)

        dev_grads_buckets = _take_tensors(all_grads, self.bucket_size)
        for dev_grads in dev_grads_buckets:
            d_p_new = _flatten_dense_tensors(dev_grads)

            if self.all_reduce:
                dist.all_reduce(d_p_new)  # self.all_gpu, group = 0
                if self.signum:
                    d_p_new = torch.sign(d_p_new)
            elif self.signum:
                if self.nodes > 1:
                    if self.compression_buffer:
                        d_p_new, tensor_size = self.compressor.compress(d_p_new)
                    else:
                        d_p_new = torch.sign(d_p_new)

                    if self.local_rank == 0:
                        if dist.get_rank() == 0:
                            d_p_new_list = []
                            for index, inter_node_group in enumerate(self.inter_node_group_list):
                                d_p_temp = d_p_new.clone()
                                dist.broadcast(d_p_temp,
                                               self.inter_node_list[index + 1],
                                               group=inter_node_group)
                                d_p_new_list.append(d_p_temp)
                        else:
                            dist.broadcast(d_p_new, dist.get_rank(),
                                           group=self.inter_node_group_list[self.nodes_rank - 1])

                        dist.barrier(group=self.all_inter_node_group)

                        if dist.get_rank() == 0:
                            if self.compression_buffer:
                                d_p_new_list.append(d_p_new)  # count itself
                                d_p_new = self.compressor.majority_vote(d_p_new_list)
                            else:
                                for d_p_temp in d_p_new_list:
                                    d_p_new.add_(d_p_temp)
                                d_p_new = d_p_new / self.nodes

                        dist.barrier(group=self.all_inter_node_group)
                        dist.broadcast(d_p_new, 0, group=self.all_inter_node_group)

                        if self.compression_buffer:
                            d_p_new = self.compressor.uncompress(d_p_new, tensor_size)
            else:
                print('You can not run without signum or all_reduce')

            # unflatten
            dev_grads_new = _unflatten_dense_tensors(d_p_new, dev_grads)
            for grad, reduced in zip(dev_grads, dev_grads_new):
                grad.copy_(reduced)

        # LARC saving
        self.layer_adaptive_lr = []
        layer_index = 0
        laryer_saving = [1, 2, 3, 23, 49, 87]
        # conv1.weight (no bias), bn1.weight, layer1.1.conv1.weight,
        # layer2.1.conv1.weight, layer3.1.conv1.weight, layer4.1.conv1.weight
        ###
        for p in group['params']:
            layer_index += 1
            ###
            '''
            LARC
            This part of code was originally forked from
            (https://github.com/NVIDIA/apex/blob/master/apex/parallel/LARC.py)
            '''
            if args.larc_enable:
                trust_coefficient = args.larc_trust_coefficient
                clip = args.larc_clip
                eps = args.larc_eps
                param_norm = torch.norm(p.data)
                grad_norm = torch.norm(p.grad.data)
                if param_norm != 0 and grad_norm != 0:
                    # calculate adaptive lr + weight decay
                    adaptive_lr = trust_coefficient * (param_norm) / (
                        grad_norm + param_norm * weight_decay + eps)
                    # add adaptive lr saving
                    if layer_index in laryer_saving:
                        self.layer_adaptive_lr.append(adaptive_lr)
                    # clip learning rate for LARC
                    if clip:
                        # calculation of adaptive_lr so that when multiplied
                        # by lr it equals `min(adaptive_lr, lr)`
                        adaptive_lr = min(adaptive_lr / group['lr'], 1)
                    else:
                        adaptive_lr = adaptive_lr / group['lr']
                    p.grad.data *= adaptive_lr
            ###

            if self.compression_buffer:
                # This part of code is temporary
                if weight_decay != 0:
                    p.grad.data.add_(weight_decay, p.data)
            p.data.add_(-group['lr'], p.grad.data)

    return loss
def step_fused_lamb(self, closure=None):
    """
    Not supporting closure.
    """
    # First compute the norm for all groups so we know if there is an overflow
    grads_groups_flat = []
    grads_groups = []
    norm_groups = []
    expert_norm_groups = []
    for i, group in enumerate(self.fp16_groups):
        grads = [
            torch.zeros(p.size(), dtype=p.dtype, device=p.device)
            if p.grad is None else p.grad for p in group
        ]
        grads_groups.append(grads)
        grads_groups_flat.append(_flatten_dense_tensors(grads))
        grads_for_norm, expert_grads_for_norm = \
            split_params_grads_into_shared_and_expert_params(group)
        norm_group_value = 0.0
        if len(grads_for_norm) > 0:
            norm_group_value = get_weight_norm(
                _flatten_dense_tensors(grads_for_norm), mpu=self.mpu)
        norm_groups.append(norm_group_value)
        expert_norm_group_value = 0.0
        if len(expert_grads_for_norm) > 0:
            expert_norm_group_value = get_weight_norm(
                _flatten_dense_tensors(expert_grads_for_norm), mpu=self.mpu)
        expert_norm_groups.append(expert_norm_group_value)

    self.overflow = self.overflow_checker.check_using_norm(norm_groups +
                                                           expert_norm_groups)
    prev_scale = self.cur_scale
    self._update_scale(self.overflow)
    if self.overflow:
        if self.verbose:
            logger.info(
                "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. "
                "Attempted loss scale: {}, reducing to {}".format(
                    prev_scale, self.cur_scale))
        return self.overflow

    self._global_grad_norm = get_global_norm(norm_list=norm_groups)
    combined_scale = self.unscale_and_clip_grads(self._global_grad_norm,
                                                 apply_scale=False)
    self.optimizer.step(grads=grads_groups,
                        output_params=self.fp16_groups,
                        scale=combined_scale)

    for fp32_group, fp16_group in zip(self.fp32_groups, self.fp16_groups):
        for fp32_param, fp16_param in zip(fp32_group, fp16_group):
            # remove the fp32 grad
            fp32_param.grad = None
            # copy data from fp32 to fp16
            fp16_param.data.copy_(fp32_param.data)

    return self.overflow
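With apply_scale=False, `unscale_and_clip_grads` is called only for its return value: the combined scale handed to the fused kernel. Because the gradients still carry the loss scale, norm-based clipping can be folded into the same division that removes it. A sketch of that arithmetic under assumed names (the function name and the `clip_grad` threshold, the maximum allowed unscaled norm, are illustrative, not necessarily the exact implementation):

def unscale_and_clip_grads_sketch(total_norm, cur_scale, clip_grad, eps=1e-6):
    # total_norm is computed on still-scaled gradients, so it is really
    # (true norm) * cur_scale. Dividing every gradient by combined_scale
    # therefore both unscales and, when clip > 1, clips in a single pass.
    combined_scale = cur_scale
    if clip_grad > 0.0:
        clip = ((total_norm / cur_scale) + eps) / clip_grad
        if clip > 1.0:
            combined_scale = clip * cur_scale
    return combined_scale

Dividing by combined_scale leaves the update's global norm at clip_grad whenever the unscaled norm would have exceeded it, and at the true norm otherwise.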
def __init__(self,
             init_optimizer,
             static_loss_scale=1.0,
             dynamic_loss_scale=False,
             dynamic_loss_args=None,
             verbose=True):
    # The fused optimizer does all the work. We need this layer for two reasons:
    # 1. maintain the same user API as apex.fp16_utils
    # 2. keep the common logic here in case we need to add a new fused optimizer later
    # Differences from apex.fp16_utils:
    # - assumes all model params are fp16
    # - assumes all params require grad
    # - flattens params by group, keeping no state. TODO: remove state explicitly?
    # - the master grad and unflattened master weights never exist.
    #   TODO: a way to save out unflattened master weights?
    if not torch.cuda.is_available():
        raise SystemError("Cannot use fp16 without CUDA.")
    self.optimizer = init_optimizer

    # params flattened by group
    self.fp16_groups = []
    self.fp16_groups_flat = []
    self.fp32_groups_flat = []

    # loop to deal with groups
    for i, param_group in enumerate(self.optimizer.param_groups):
        # push this group to the list before modifying it
        self.fp16_groups.append(param_group['params'])
        # init fp16 weight buffer, flattened
        self.fp16_groups_flat.append(
            _flatten_dense_tensors(
                [p.clone().detach() for p in self.fp16_groups[i]]))
        # set the model fp16 weights to slices of the flattened buffer
        updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i],
                                                  self.fp16_groups[i])
        for p, q in zip(self.fp16_groups[i], updated_params):
            p.data = q.data
        # init master weights, flattened
        self.fp32_groups_flat.append(
            self.fp16_groups_flat[i].clone().float().detach())
        # modify the optimizer to use the flat master weight
        self.fp32_groups_flat[i].requires_grad = True  # keep this in case the internal optimizer uses it
        param_group['params'] = [self.fp32_groups_flat[i]]

    # we may have a way of fusing dynamic scale; not supported for now
    if dynamic_loss_scale:
        if dynamic_loss_args is not None:
            raise SystemError("Do not support dynamic loss scale args for now.")
        self.dynamic_loss_scale = True
        self.cur_scale = 2**16
        self.cur_iter = 0
        self.last_overflow_iter = -1
        self.scale_factor = 2
        self.scale_window = 1000
    else:
        self.dynamic_loss_scale = False
        self.cur_iter = 0
        self.cur_scale = static_loss_scale

    self.verbose = verbose
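`_update_scale` is called from `step_fused_lamb` but not shown in this excerpt. Given the fields initialized above (cur_scale, scale_factor, scale_window, last_overflow_iter, cur_iter), the standard dynamic loss-scaling policy is: shrink the scale on overflow, and grow it again only after scale_window consecutive overflow-free iterations. The sketch below is a plausible reconstruction consistent with those fields, not necessarily the exact implementation:

def _update_scale(self, skip):
    if self.dynamic_loss_scale:
        if skip:
            # overflow: shrink the scale and remember when it happened
            self.cur_scale = max(self.cur_scale / self.scale_factor, 1)
            self.last_overflow_iter = self.cur_iter
        else:
            # grow again after scale_window consecutive clean steps
            if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
                self.cur_scale *= self.scale_factor
    self.cur_iter += 1

Starting at 2**16 and halving on each overflow converges quickly to the largest scale the current gradients tolerate, while the 1000-step window keeps the scale from oscillating.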