Code Example #1
File: distributed.py Project: Henley13/imagenet-fast
def flat_dist_call(tensors, call, extra_args=None):
    flat_dist_call.warn_on_half = True
    buckets = {}
    for tensor in tensors:
        tp = tensor.type()
        if tp not in buckets:
            buckets[tp] = []
        buckets[tp].append(tensor)
                    
    if flat_dist_call.warn_on_half:
        if torch.cuda.HalfTensor in buckets:
            print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                  " It is recommended to use the NCCL backend in this case.")
            flat_dist_call.warn_on_half = False

    for tp in buckets:
        bucket = buckets[tp]
        coalesced = _flatten_dense_tensors(bucket)
        if extra_args is not None:
            call(coalesced, *extra_args)
        else:
            call(coalesced)
        coalesced /= dist.get_world_size()
        for buf, synced in zip(bucket, _unflatten_dense_tensors(coalesced, bucket)):
            buf.copy_(synced)
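A minimal usage sketch (not from the original project; the process-group setup and `model` are assumptions) showing how flat_dist_call is typically driven with a torch.distributed collective:

import torch.distributed as dist

# Assumes dist.init_process_group(...) has already run and `model` is an
# nn.Module whose gradients should be averaged across ranks.
grads = [p.grad.data for p in model.parameters() if p.grad is not None]
flat_dist_call(grads, dist.all_reduce)  # all-reduce per dtype bucket; results are copied back in place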
Code Example #2
File: distributed.py Project: lxlhh/pytorch
        def _process_batch():
            dev_grad_batch, dev_events, job_event = queue.get()
            dev_coalesced = []
            # Coalesce the tensors on all devices and start a local reduction
            for dev_id, grad_batch, event, stream in zip(device_ids, dev_grad_batch, dev_events, reduction_streams):
                with torch.cuda.device(dev_id), torch.cuda.stream(stream):
                    stream.wait_event(event)
                    coalesced = _flatten_dense_tensors(grad_batch)
                    dev_coalesced.append(coalesced)
            # Wait for all copies to complete before starting the NCCL kernel
            for stream in reduction_streams:
                stream.synchronize()
            nccl.reduce(dev_coalesced, root=0, streams=nccl_streams)

            # From now on we're only going to work on the first device (from device_ids)
            grad_batch = dev_grad_batch[0]
            coalesced = dev_coalesced[0]
            reduce_stream = reduction_streams[0]
            with torch.cuda.stream(reduce_stream):
                reduce_stream.wait_stream(nccl_streams[0])
                coalesced /= dist.get_world_size()
                dist.all_reduce(coalesced, group=group_id)
                for grad, reduced in zip(grad_batch, _unflatten_dense_tensors(coalesced, grad_batch)):
                    grad.copy_(reduced)
            job_event.set()
Code Example #3
File: distributed.py Project: inkawhich/pytorch
        def reduction_fn_nccl():
            # This function only needs to be called once
            if not self.need_reduction:
                return

            self.need_reduction = False
            all_grads = [[] for _ in range(len(self._module_copies))]
            all_grads_buckets_iters = []

            # Bucketing all the gradients
            for dev_idx, module in enumerate(self._module_copies):
                for param in module.parameters():
                    if not param.requires_grad or param.grad is None:
                        continue
                    if param.grad.requires_grad:
                        raise RuntimeError("DistributedDataParallel only works "
                                           "with gradients that don't require "
                                           "grad")
                    # Adding the gradients for reduction
                    all_grads[dev_idx].append(param.grad.data)

                # Now bucketing the parameters
                dev_grads_buckets = _take_tensors(all_grads[dev_idx],
                                                  self.nccl_reduce_bucket_size)

                all_grads_buckets_iters.append(dev_grads_buckets)

            # Now reduce each bucket one after another
            for grads_batch in zip(*all_grads_buckets_iters):
                grads_batch_coalesced = []
                # Coalesce each bucket
                for dev_idx, dev_grads_batch in enumerate(grads_batch):
                    dev_id = self.device_ids[dev_idx]
                    with torch.cuda.device(dev_id):
                        dev_grads_batch_coalesced = _flatten_dense_tensors(dev_grads_batch)
                        grads_batch_coalesced.append(dev_grads_batch_coalesced)

                # We will only use device 0's results, but this single op should be
                # faster than doing the following two operation sequentially:
                # (1) intra-node reduce to lead GPU, followed by
                # (2) inter-node allreduce for all the first lead GPUs in all nodes
                dist.all_reduce_multigpu(grads_batch_coalesced,
                                         group=self.nccl_reduction_group_id)

                # Now only work on the first device of self.device_ids, uncoalesce
                # the gradients for each bucket
                grads_batch_coalesced[0] /= dist.get_world_size()
                grads_batch_reduced = _unflatten_dense_tensors(grads_batch_coalesced[0], grads_batch[0])
                for grad, reduced in zip(grads_batch[0], grads_batch_reduced):
                    grad.copy_(reduced)

            # clear the gradients and save memory for replicas
            for module in self._module_copies[1:]:
                for param in module.parameters():
                    if param.requires_grad:
                        param.grad = None
                        param.data.set_()
Code Example #4
File: __init__.py Project: Jsmilemsj/pytorch
def all_gather_multigpu(output_tensor_lists,
                        input_tensor_list,
                        group=group.WORLD):
    """Gathers tensors from the whole group in a list.
    Each tensor in input_tensor_list should reside on a separate GPU.

    Only the NCCL backend is currently supported, and the tensors must be
    GPU tensors.

    Arguments:
        output_tensor_lists (List[List[Tensor]]): Output lists. It should
            contain correctly-sized tensors on each GPU to be used for output of
            the collective.

            e.g. output_tensor_lists[i] contains the all_gather
            result that resides on the GPU of input_tensor_list[i].

            Note that each element of output_tensor_lists[i] has the size of
            world_size * len(input_tensor_list), since the function all gathers
            the result from every single GPU in the group. To interpret each
            element of output_tensor_list[i], note that input_tensor_list[j] of
            rank k will appear in
            output_tensor_list[i][rank * world_size + j]

            Also note that len(output_tensor_lists), and the size of each
            element in output_tensor_lists (each element is a list,
            therefore len(output_tensor_lists[i])), need to be the same
            for all the distributed processes calling this function.

        input_tensor_list (List[Tensor]): List of tensors (on different GPUs) to
            be broadcast from current process.
            Note that len(input_tensor_list) needs to be the same for
            all the distributed processes calling this function.
        group (optional): Group of the collective.
    """
    assert torch.distributed._initialized == _INITIALIZED_PG, \
        "collective only supported in process-group mode"

    flatten_tensor_list = []
    for output_tensor_list in output_tensor_lists:
        flatten_tensor_list.append(_flatten_dense_tensors(output_tensor_list))

    ret = torch._C._dist_all_gather_multigpu(flatten_tensor_list,
                                             input_tensor_list,
                                             group)

    for output_tensor_list, flatten_tensor in zip(output_tensor_lists,
                                                  flatten_tensor_list):
        for tensor, value in zip(output_tensor_list,
                                 _unflatten_dense_tensors(flatten_tensor,
                                                          output_tensor_list)):
            tensor.copy_(value)

    return ret
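A hypothetical sketch (not part of the listing) that makes the documented output layout concrete, assuming a two-process NCCL group with one GPU and one input tensor per process:

# Assumes dist.init_process_group('nccl', ...) with world_size == 2.
rank = dist.get_rank()
world_size = dist.get_world_size()
input_tensor_list = [torch.full((8,), float(rank), device='cuda:0')]
output_tensor_lists = [[torch.empty(8, device='cuda:0')
                        for _ in range(world_size * len(input_tensor_list))]]
all_gather_multigpu(output_tensor_lists, input_tensor_list)
# output_tensor_lists[0][0] now holds rank 0's tensor and
# output_tensor_lists[0][1] holds rank 1's tensor.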
Code Example #5
File: distributed.py Project: inkawhich/pytorch
    def _dist_broadcast_coalesced(self, tensors, buffer_size):
        """
        Broadcast a sequence of tensors to the default group from rank 0.
        Small tensors are first coalesced into a buffer to reduce the number of
        broadcasts.

        tensors (sequence): tensors to broadcast. Each tensor needs to be on the
                            same GPU.
        buffer_size (int): maximum size of the buffer for coalescing
        """
        for tensors in _take_tensors(tensors, buffer_size):
            flat_tensors = _flatten_dense_tensors(tensors)
            dist.broadcast(flat_tensors, 0)
            for tensor, synced in zip(tensors,
                                      _unflatten_dense_tensors(flat_tensors, tensors)):
                tensor.copy_(synced)
Code Example #6
File: distributed_cpu.py Project: RichieMay/pytorch
        def allreduce_params():
            if self.needs_reduction:
                self.needs_reduction = False
                buckets = defaultdict(list)
                for param in self.module.parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = type(param.data)
                        buckets[tp].append(param)

                for bucket in buckets.values():
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    dist.all_reduce(coalesced)
                    coalesced /= dist.get_world_size()
                    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)
Code Example #7
File: __init__.py Project: lxlhh/pytorch
def all_gather_multigpu(output_tensor_lists,
                        input_tensor_list,
                        group=group.WORLD):
    """Gathers tensors from the whole group in a list.
    Each tensor in input_tensor_list should reside on a separate GPU.

    Only the NCCL backend is currently supported, and the tensors must be
    GPU tensors.

    Arguments:
        output_tensor_lists (List[List[Tensor]]): Output lists. It should
            contain correctly-sized tensors on each GPU to be used for output of
            the collective.
        input_tensor_list (List[Tensor]): List of tensors (on different GPUs) to
            be broadcast from current process.
        group (optional): Group of the collective.
    """
    assert torch.distributed._initialized == _INITIALIZED_PG, \
        "collective only supported in process-group mode"

    warnings.warn("""
    ================================================================================
                                        WARNING
    ================================================================================
    all_gather_multigpu is still experimental. The API will change without
    notice and we can't guarantee full correctness and expected performance yet.
    We'll announce it once it's ready.
    """)

    flatten_tensor_list = []
    for output_tensor_list in output_tensor_lists:
        flatten_tensor_list.append(_flatten_dense_tensors(output_tensor_list))

    ret = torch._C._dist_all_gather_multigpu(flatten_tensor_list,
                                             input_tensor_list,
                                             group)

    for output_tensor_list, flatten_tensor in zip(output_tensor_lists,
                                                  flatten_tensor_list):
        for tensor, value in zip(output_tensor_list,
                                 _unflatten_dense_tensors(flatten_tensor,
                                                          output_tensor_list)):
            tensor.copy_(value)

    return ret
Code Example #8
    def _sync_params(self):
        params = [p.data for p in self.module.parameters()]
        result = broadcast_coalesced(params, self.device_ids, self.broadcast_bucket_size)
        for tensors, module in zip(result[1:], self._module_copies[1:]):
            for tensor, param in zip(tensors, module.parameters()):
                param.data.set_(tensor)

        buffers = list(self.module._all_buffers())
        if len(buffers) > 0:
            # cross-node buffer sync
            flat_buffers = _flatten_dense_tensors(buffers)
            dist.broadcast(flat_buffers, 0)
            for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
                buf.copy_(synced)

            # intra-node buffer sync
            result = broadcast_coalesced(buffers, self.device_ids, self.broadcast_bucket_size)
            for tensors, module in zip(result[1:], self._module_copies[1:]):
                for tensor, buf in zip(tensors, module._all_buffers()):
                    buf.set_(tensor)
Code Example #9
File: distribute.py Project: cs50victor/riri
 def allreduce_params():
     if module.needs_reduction:
         module.needs_reduction = False
         # bucketing params based on value types
         buckets = {}
         for param in module.parameters():
             if param.requires_grad and param.grad is not None:
                 tp = type(param.data)
                 if tp not in buckets:
                     buckets[tp] = []
                 buckets[tp].append(param)
         for tp in buckets:
             bucket = buckets[tp]
             grads = [param.grad.data for param in bucket]
             coalesced = _flatten_dense_tensors(grads)
             dist.all_reduce(coalesced, op=dist.reduce_op.SUM)
             coalesced /= dist.get_world_size()
             for buf, synced in zip(
                     grads, _unflatten_dense_tensors(coalesced, grads)):
                 buf.copy_(synced)
Code Example #10
def model_grads_to_master_grads(model_params, master_params, flat_master=False):
    """
    Copy model gradients to master gradients.  

    Args:
        model_params:  List of model parameters created by :func:`prep_param_lists`.
        master_params:  List of FP32 master parameters created by :func:`prep_param_lists`.  If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`.
    """
    if flat_master:
        # The flattening may incur one more deep copy than is necessary.
        master_params[0].grad.data.copy_(
            _flatten_dense_tensors([p.grad.data for p in model_params]))
    else:
        for model, master in zip(model_params, master_params):
            if model.grad is not None:
                if master.grad is None:
                    master.grad = Variable(master.data.new(*master.data.size()))
                master.grad.data.copy_(model.grad.data)
            else:
                master.grad = None
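For context, a hypothetical training-step sketch of the master-weight pattern this helper belongs to; `compute_loss`, `loss_scale`, `optimizer` (built over the master parameters) and the `master_params_to_model_params` counterpart are assumptions, not shown in this listing:

model_params, master_params = prep_param_lists(model)      # see a later example
loss = compute_loss(model)                                  # user-defined forward pass
(loss * loss_scale).backward()                              # scaled FP16 grads land on model_params
model_grads_to_master_grads(model_params, master_params)    # copy grads to FP32 master grads
for p in master_params:
    p.grad.data.mul_(1.0 / loss_scale)                      # unscale in FP32
optimizer.step()                                            # update FP32 master weights
master_params_to_model_params(model_params, master_params)  # copy updated weights back to FP16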
Code Example #11
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
    if bucket_size_mb > 0:
        bucket_size_bytes = bucket_size_mb * 1024 * 1024
        buckets = _take_tensors(tensors, bucket_size_bytes)
    else:
        buckets = OrderedDict()
        for tensor in tensors:
            tp = tensor.type()
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(tensor)
        buckets = buckets.values()

    for bucket in buckets:
        flat_tensors = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat_tensors)
        flat_tensors.div_(world_size)
        for tensor, synced in zip(
                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
            tensor.copy_(synced)
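A short hypothetical call site (the process group and `model` are assumed to exist):

grads = [p.grad for p in model.parameters() if p.grad is not None]
_allreduce_coalesced(grads, dist.get_world_size(), bucket_size_mb=32)  # average grads in 32 MB buckets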
Code Example #12
File: distributed.py Project: lxlhh/pytorch
    def _sync_params(self):
        params = [p.data for p in self.module.parameters()]
        result = broadcast_coalesced(params, self.device_ids, self.broadcast_bucket_size)
        for tensors, module in zip(result[1:], self._module_copies[1:]):
            for tensor, param in zip(tensors, module.parameters()):
                param.data.set_(tensor)

        buffers = list(self.module._all_buffers())
        if len(buffers) > 0:
            # cross-node buffer sync
            flat_buffers = _flatten_dense_tensors(buffers)
            dist.broadcast(flat_buffers, 0)
            for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
                buf.copy_(synced)

            # intra-node buffer sync
            result = broadcast_coalesced(buffers, self.device_ids, self.broadcast_bucket_size)
            for tensors, module in zip(result[1:], self._module_copies[1:]):
                for tensor, buf in zip(tensors, module._all_buffers()):
                    buf.set_(tensor)
Code Example #13
File: fp16_optimizer.py Project: vidushv/DeepSpeed
    def step_fused_adam(self, closure=None):
        """
        Not supporting closure.
        """
        # First compute norm for all group so we know if there is overflow
        grads_groups_flat = []
        norm_groups = []
        for i, group in enumerate(self.fp16_groups):
            grads_groups_flat.append(
                _flatten_dense_tensors([
                    torch.zeros(p.size(), dtype=p.dtype, device=p.device)
                    if p.grad is None else p.grad for p in group
                ]))
            norm_groups.append(
                get_weight_norm(grads_groups_flat[i], mpu=self.mpu))

        self.overflow = self.overflow_checker.check_using_norm(norm_groups)
        prev_scale = self.cur_scale
        self._update_scale(self.overflow)

        if self.overflow:
            if self.verbose:
                print("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
                      "scale: {}, reducing to {}".format(
                          prev_scale, self.cur_scale))
            return self.overflow
        combined_scale = self.unscale_and_clip_grads(grads_groups_flat,
                                                     norm_groups,
                                                     apply_scale=False)
        # norm is in fact norm*cur_scale
        self.optimizer.step(grads=[[g] for g in grads_groups_flat],
                            output_params=[[p] for p in self.fp16_groups_flat],
                            scale=combined_scale,
                            grad_norms=norm_groups)
        # TODO: we probably don't need this? just to be safe
        for i in range(len(norm_groups)):
            updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i],
                                                      self.fp16_groups[i])
            for p, q in zip(self.fp16_groups[i], updated_params):
                p.data = q.data
        return self.overflow
Code Example #14
def broadcast_coalesced(tensors, devices, buffer_size=10485760):
    """Broadcasts a sequence tensors to the specified GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        tensors (sequence): tensors to broadcast.
        devices (Iterable): an iterable of devices among which to broadcast.
          Note that it should be like (src, dst1, dst2, ...), the first element
          of which is the source device to broadcast from.
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple containing copies of ``tensors``, placed on devices
        corresponding to indices from ``devices``.
    """
    for tensor in tensors:
        if tensor.get_device() != devices[0]:
            raise RuntimeError('all tensors must be on devices[0]')
    outputs = [[] for _ in devices]
    # use the original tensors for the first device
    outputs[0].extend(tensors)
    for chunk in _take_tensors(tensors, buffer_size):
        if chunk[0].is_sparse:
            flat_indices, flat_values = _flatten_sparse_tensors(chunk)
            result_indices = broadcast(flat_indices, devices)
            result_values = broadcast(flat_values, devices)
            unflat_results = tuple(_unflatten_sparse_tensors(iv, chunk) for iv in zip(result_indices, result_values))
        else:
            flat = _flatten_dense_tensors(chunk)
            results = broadcast(flat, devices)
            unflat_results = tuple(_unflatten_dense_tensors(tensor, chunk) for tensor in results)
        # use the broadcasted tensors for the remaining devices
        for dst, unflat_res in zip(outputs[1:], unflat_results[1:]):
            dst.extend(unflat_res)
    for i, output in enumerate(outputs):
        outputs[i] = _reorder_tensors_as(output, tensors)
    return tuple(outputs)
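A hypothetical usage sketch, assuming at least two visible GPUs:

params = [torch.randn(1024, device='cuda:0'), torch.randn(16, device='cuda:0')]
copies = broadcast_coalesced(params, devices=[0, 1])
# copies[0] holds the original tensors on cuda:0, copies[1] their replicas on cuda:1.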
Code Example #15
File: comm.py Project: Northrend/pytorch
def broadcast_coalesced(tensors, devices, buffer_size=10485760):
    """Broadcasts a sequence tensors to the specified GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        tensors (sequence): tensors to broadcast.
        devices (Iterable): an iterable of devices among which to broadcast.
          Note that it should be like (src, dst1, dst2, ...), the first element
          of which is the source device to broadcast from.
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple containing copies of ``tensors``, placed on devices
        corresponding to indices from ``devices``.
    """
    for tensor in tensors:
        if tensor.get_device() != devices[0]:
            raise RuntimeError('all tensors must be on devices[0]')
    outputs = [[] for _ in devices]
    # use the original tensors for the first device
    outputs[0].extend(tensors)
    for chunk in _take_tensors(tensors, buffer_size):
        if chunk[0].is_sparse:
            flat_indices, flat_values = _flatten_sparse_tensors(chunk)
            result_indices = broadcast(flat_indices, devices)
            result_values = broadcast(flat_values, devices)
            unflat_results = tuple(_unflatten_sparse_tensors(iv, chunk) for iv in zip(result_indices, result_values))
        else:
            flat = _flatten_dense_tensors(chunk)
            results = broadcast(flat, devices)
            unflat_results = tuple(_unflatten_dense_tensors(tensor, chunk) for tensor in results)
        # use the broadcasted tensors for the remaining devices
        for dst, unflat_res in zip(outputs[1:], unflat_results[1:]):
            dst.extend(unflat_res)
    for i, output in enumerate(outputs):
        outputs[i] = _reorder_tensors_as(output, tensors)
    return tuple(outputs)
Code Example #16
def flatten_dense_tensors_aligned(tensor_list, alignment, pg):
    num_elements = 0
    for tensor in tensor_list:
        num_elements = num_elements + tensor.numel()

    remaining = num_elements % alignment

    if remaining:
        elements_to_add = alignment - remaining
        pad_tensor = torch.zeros(elements_to_add,
                                 device=tensor_list[0].device,
                                 dtype=tensor_list[0].dtype)
        padded_tensor_list = tensor_list + [pad_tensor]

        num_elements = num_elements + elements_to_add
    else:
        padded_tensor_list = tensor_list

    if dist.get_rank(group=pg) == 0:
        print("Number of Elements is ", num_elements)

    return _flatten_dense_tensors(padded_tensor_list)
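For example, with tensors totalling 1000 elements and alignment=256, remaining is 232, so a 24-element zero pad is appended and the returned flat buffer has 1024 elements, a multiple of the alignment.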
Code Example #17
File: distributed_c10d.py Project: xiching/pytorch
    def _queue_reduction(self, bucket_idx):
        grads_batch = self.buckets[bucket_idx]
        grads_batch_coalesced = []

        # coalesce the bucket
        for dev_id, dev_grads_batch in zip(self.device_ids, grads_batch):
            with torch.cuda.device(dev_id):
                dev_grads_batch_coalesced = _flatten_dense_tensors(
                    dev_grads_batch)
                grads_batch_coalesced.append(dev_grads_batch_coalesced)

        # reduce to the first GPU in self.device_ids
        if len(self.device_ids) > 1:
            nccl.reduce(grads_batch_coalesced,
                        root=0,
                        streams=self.default_streams)

        # now work on the first gpu
        reduction_work = self.process_group.allreduce(
            [grads_batch_coalesced[0]], self.allreduce_opts)
        self.reduction_works[bucket_idx] = reduction_work
        self.buckets_coalesced[bucket_idx] = grads_batch_coalesced[0]
Code Example #18
def prep_param_lists(model, flat_master=False):
    """
    Creates a list of FP32 master parameters for a given model, as in
    `Training Neural Networks with Mixed Precision:  Real Examples`_.
    Args:
        model (torch.nn.Module): Existing PyTorch model
        flat_master (bool, optional, default=False):  Flatten the master parameters into a single tensor, as a performance optimization.
    Returns:
        A tuple (``model_params``, ``master_params``). ``model_params`` is a list of the model's parameters for later use with :func:`model_grads_to_master_grads` and :func:`master_params_to_model_params`.  ``master_params`` is a list of FP32 master parameters.  If ``flat_master=True``, ``master_params`` will be a list with one element.
    Example::
        model_params, master_params = prep_param_lists(model)
    .. warning::
        Currently, if ``flat_master=True``, all the model's parameters must be the same type.  If the model has parameters of different types, use ``flat_master=False``, or use :class:`FP16_Optimizer`.
    .. _`Training Neural Networks with Mixed Precision:  Real Examples`:
        http://on-demand.gputechconf.com/gtc/2018/video/S81012/
    """
    model_params = [param for param in model.parameters() if param.requires_grad]

    if flat_master:
        # Give the user some more useful error messages
        try:
            # flatten_dense_tensors returns a contiguous flat array.
            # http://pytorch.org/docs/master/_modules/torch/_utils.html
            master_params = _flatten_dense_tensors([param.data for param in model_params]).float()
        except:
            print("Error in prep_param_lists:  model may contain a mixture of parameters "
                      "of different types.  Use flat_master=False, or use F16_Optimizer.")
            raise
        master_params = torch.nn.Parameter(master_params)
        master_params.requires_grad = True
        # master_params.register_hook(backwards_debug_hook)
        if master_params.grad is None:
            master_params.grad = master_params.new(*master_params.size())
        return model_params, [master_params]
    else:
        master_params = [param.clone().float().detach() for param in model_params]
        for param in master_params:
            param.requires_grad = True
        return model_params, master_params
Code Example #19
        def reduction_fn():
            # This function only needs to be called once
            if not self.need_reduction:
                return

            self.need_reduction = False
            all_grads = []

            # Bucketing all the gradients
            for param in self.module.parameters():
                if not param.requires_grad:
                    continue
                if param.grad is not None and param.grad.requires_grad:
                    raise RuntimeError("DistributedDataParallel only works "
                                       "with gradients that don't require "
                                       "grad")
                if param.grad is not None:
                    # Adding the gradients for reduction
                    all_grads.append(param.grad.data)
                else:
                    all_grads.append(torch.zeros_like(param))

            # Now bucketing the parameters
            dev_grads_buckets = _take_tensors(all_grads,
                                              self.reduce_bucket_size)

            # Now reduce each bucket one after another
            for grads_batch in dev_grads_buckets:
                grads_batch_coalesced = _flatten_dense_tensors(grads_batch)

                grads_batch_coalesced /= self.world_size

                distributed_utils.all_reduce(grads_batch_coalesced,
                                             self.process_group)

                grads_batch_reduced = _unflatten_dense_tensors(
                    grads_batch_coalesced, grads_batch)
                for grad, reduced in zip(grads_batch, grads_batch_reduced):
                    grad.copy_(reduced)
Code Example #20
    def step(self, closure=None):
        """
        Not supporting closure.
        """
        # First compute norm for all group so we know if there is overflow
        grads_groups_flat = []
        norm_groups = []
        skip = False
        for i, group in enumerate(self.fp16_groups):
            #grads_groups_flat.append(_flatten_dense_tensors([p.grad for p in group]))
            grads_groups_flat.append(
                _flatten_dense_tensors([
                    p.grad if p.grad is not None else p.new_zeros(p.size())
                    for p in group
                ]))
            norm_groups.append(self._compute_grad_norm(grads_groups_flat[i]))
            if norm_groups[i] == -1:  #TODO: early break
                skip = True

        if skip:
            self._update_scale(skip)
            return

        # norm is in fact norm*cur_scale
        self.optimizer.step(grads=[[g] for g in grads_groups_flat],
                            output_params=[[p] for p in self.fp16_groups_flat],
                            scale=self.cur_scale,
                            grad_norms=norm_groups)

        # TODO: we probably don't need this? just to be safe
        for i in range(len(norm_groups)):
            updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i],
                                                      self.fp16_groups[i])
            for p, q in zip(self.fp16_groups[i], updated_params):
                p.data = q.data

        self._update_scale(False)
        return
Code Example #21
    def _test_CheckOverflow(check_using_norm: bool):
        groups.initialize_model_parallel(1)
        groups.initialize_expert_parallel(2)

        param1 = torch.nn.Parameter(torch.Tensor([0]))
        param1.grad = torch.Tensor([1])
        param2 = torch.nn.Parameter(torch.Tensor([0]))
        if dist.get_rank() == 0:
            param2.grad = torch.Tensor([1])
        else:
            param2.grad = torch.Tensor([float("inf")])
        param2.allreduce = False
        # param2 is now MoE parameter
        parameters = [param1, param2]
        if check_using_norm:
            grads_group_flat = [_flatten_dense_tensors([p.grad for p in parameters])]
            norm = ds_utils.get_weight_norm(grads_group_flat)
            overflow_checker = ds_utils.CheckOverflow([parameters])
            overflow = overflow_checker.check_using_norm([norm], reduce_overflow=False)
        else:
            overflow_checker = ds_utils.CheckOverflow([parameters])
            overflow = overflow_checker.check()
        assert overflow
Code Example #22
File: distributed.py Project: raulpuric/PyMAML
def reduce_gradients(module):
    # warn_on_half is assumed to be a module-level flag; declare it global so the
    # assignment below does not make it function-local (which would make the
    # earlier read raise UnboundLocalError).
    global warn_on_half
    buckets = {}
    for name, param in module.named_parameters():
        if param.requires_grad and param.grad is not None:
            tp = type(param.data)
            if tp not in buckets:
                buckets[tp] = []
            buckets[tp].append(param)
    if warn_on_half:
        if torch.cuda.HalfTensor in buckets:
            print("WARNING: gloo dist backend for half parameters may be slow." +
                  " It is recommended to use the NCCL backend in this case.")
            warn_on_half = False

    for tp in buckets:
        bucket = buckets[tp]
        grads = [param.grad.data for param in bucket]
        coalesced = _flatten_dense_tensors(grads)
        dist.all_reduce(coalesced)
        torch.cuda.synchronize()
        coalesced /= dist.get_world_size()
        for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
            buf.copy_(synced)
Code Example #23
File: comm.py Project: Jsmilemsj/pytorch
def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    dense_tensors = [[] for _ in inputs]  # shape (num_gpus, num_tensors)
    output = []
    ref_order = []
    # process sparse ones first since they may have different sizes on different gpus
    for tensor_at_gpus in zip(*inputs):
        if all(t.is_sparse for t in tensor_at_gpus):
            result = reduce_add(tensor_at_gpus, destination)
            output.append(result)
            ref_order.append(tensor_at_gpus[0])
        else:
            for coll, t in zip(dense_tensors, tensor_at_gpus):
                coll.append(t.to_dense() if t.is_sparse else t)
            ref_order.append(dense_tensors[0][-1])
    itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors]
    # now the dense ones, which have consistent sizes
    for chunks in zip(*itrs):
        flat_tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]
        flat_result = reduce_add(flat_tensors, destination)
        output.extend(_unflatten_dense_tensors(flat_result, chunks[0]))
    return tuple(_reorder_tensors_as(output, ref_order))
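A hypothetical sketch, assuming two visible GPUs, of summing per-GPU tensor lists onto GPU 0:

# inputs[i] is the list of tensors living on GPU i.
inputs = [[torch.ones(4, device='cuda:0')], [torch.ones(4, device='cuda:1')]]
summed = reduce_add_coalesced(inputs, destination=0)
# summed[0] is a tensor of 2s on cuda:0.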
Code Example #24
    def step_fused_lamb(self, closure=None):
        """
        Not supporting closure.
        """
        # First compute norm for all group so we know if there is overflow
        grads_groups_flat = []
        grads_groups = []
        norm_groups = []
        for i, group in enumerate(self.fp16_groups):
            grads = [
                torch.zeros(p.size(), dtype=p.dtype, device=p.device)
                if p.grad is None else p.grad for p in group
            ]
            grads_groups.append(grads)
            grads_groups_flat.append(_flatten_dense_tensors(grads))
            norm_groups.append(
                get_weight_norm(grads_groups_flat[i], mpu=self.mpu))

        self.overflow = self.overflow_checker.check_using_norm(norm_groups)
        prev_scale = self.cur_scale

        self._update_scale(self.overflow)
        if self.overflow:
            if self.verbose:
                logger.info(
                    "[deepspeed] OVERFLOW! Skipping step. Attempted loss "
                    "scale: {}, reducing to {}".format(prev_scale,
                                                       self.cur_scale))
            return self.overflow

        combined_scale = self.unscale_and_clip_grads(norm_groups,
                                                     apply_scale=False)
        self.optimizer.step(grads=grads_groups,
                            output_params=self.fp16_groups,
                            scale=combined_scale)

        return self.overflow
Code Example #25
 def allreduce_params(reduce_after=True,
                      no_scale=False,
                      fp32_allreduce=False):
     if (self.needs_reduction):
         self.needs_reduction = False
         buckets = {}
         for name, param in self.module.named_parameters():
             if param.requires_grad and param.grad is not None:
                 tp = (param.data.type())
                 if tp not in buckets:
                     buckets[tp] = []
                 buckets[tp].append(param)
         if self.warn_on_half:
             if torch.cuda.HalfTensor in buckets:
                 print(
                     "WARNING: gloo dist backend for half parameters may be extremely slow."
                     +
                     " It is recommended to use the NCCL backend in this case."
                 )
                 self.warn_on_half = False
         for tp in buckets:
             bucket = buckets[tp]
             grads = [param.grad.data for param in bucket]
             coalesced = _flatten_dense_tensors(grads)
             if fp32_allreduce:
                 coalesced = coalesced.float()
             if not no_scale and not reduce_after:
                 coalesced /= dist.get_world_size(
                     group=self.data_parallel_group)
             dist.all_reduce(coalesced, group=self.data_parallel_group)
             torch.cuda.synchronize()
             if not no_scale and reduce_after:
                 coalesced /= dist.get_world_size(
                     group=self.data_parallel_group)
             for buf, synced in zip(
                     grads, _unflatten_dense_tensors(coalesced, grads)):
                 buf.copy_(synced)
Code Example #26
def model_grads_to_master_grads(model_params,
                                master_params,
                                flat_master=False,
                                loss_scale=1.0,
                                params_have_main_grad=False):
    """
    Copy model gradients to master gradients.

    Args:
        model_params:  List of model parameters created by :func:`prep_param_lists`.
        master_params:  List of FP32 master parameters created by :func:`prep_param_lists`.  If ``master_params`` was created with ``flat_master=True``, ``flat_master=True`` should also be supplied to :func:`model_grads_to_master_grads`.
    """
    if flat_master:
        # The flattening may incur one more deep copy than is necessary.
        master_params[0].grad.data.copy_(
            _flatten_dense_tensors([p.grad.data for p in model_params]))
    else:
        for model, master in zip(model_params, master_params):
            if model.device.type == "cpu":
                continue
            if model.grad is not None:
                if master.grad is None:
                    if params_have_main_grad:
                        # If gradient_as_bucket_view is False, this will be a copy
                        master.grad = model.grad.float()
                    else:
                        master.grad = Variable(
                            master.data.new(*master.data.size()))
            else:
                master.grad = None
        model_grads = [p.grad for p in model_params if p.grad is not None]
        master_grads = [p.grad for p in master_params if p.grad is not None]
        if len(model_grads) == 0 or len(master_grads) == 0:
            return
        _overflow_buf = torch.cuda.IntTensor([0])
        multi_tensor_applier(amp_C.multi_tensor_scale, _overflow_buf,
                             [model_grads, master_grads], 1.0 / loss_scale)
Code Example #27
def reduce_add_coalesced(inputs, destination=None, buffer_size=10485760):
    """Sums tensors from multiple GPUs.

    Small tensors are first coalesced into a buffer to reduce the number
    of synchronizations.

    Arguments:
        inputs (Iterable[Iterable[Tensor]]): iterable of iterables that
            contain tensors from a single device.
        destination (int, optional): a device on which the output will be
            placed (default: current device).
        buffer_size (int): maximum size of the buffer used for coalescing

    Returns:
        A tuple of tensors containing an elementwise sum of each group of
        inputs, placed on the ``destination`` device.
    """
    dense_tensors = [[] for _ in inputs]  # shape (num_gpus, num_tensors)
    output = []
    ref_order = []
    # process sparse ones first since they may have different sizes on different gpus
    for tensor_at_gpus in zip(*inputs):
        if all(t.is_sparse for t in tensor_at_gpus):
            result = reduce_add(tensor_at_gpus, destination)
            output.append(result)
            ref_order.append(tensor_at_gpus[0])
        else:
            for coll, t in zip(dense_tensors, tensor_at_gpus):
                coll.append(t.to_dense() if t.is_sparse else t)
            ref_order.append(dense_tensors[0][-1])
    itrs = [_take_tensors(tensors, buffer_size) for tensors in dense_tensors]
    # now the dense ones, which have consistent sizes
    for chunks in zip(*itrs):
        flat_tensors = [_flatten_dense_tensors(chunk) for chunk in chunks]
        flat_result = reduce_add(flat_tensors, destination)
        output.extend(_unflatten_dense_tensors(flat_result, chunks[0]))
    return tuple(_reorder_tensors_as(output, ref_order))
Code Example #28
        def allreduce_params():
            if (self.needs_reduction):
                self.needs_reduction = False
                buckets = {}
                for param in self.module.parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = type(param.data)
                        if tp not in buckets:
                            buckets[tp] = []
                        buckets[tp].append(param)
                if self.warn_on_half:
                    if torch.cuda.HalfTensor in buckets:
                        print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                              " It is recommended to use the NCCL backend in this case.")
                        self.warn_on_half = False

                for tp in buckets:
                    bucket = buckets[tp]
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    dist.all_reduce(coalesced)
                    coalesced /= dist.get_world_size()
                    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)
Code Example #29
        def allreduce_params():
            if(self.needs_reduction):
                self.needs_reduction = False
                buckets = {}
                for param in self.module.parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = type(param.data)
                        if tp not in buckets:
                            buckets[tp] = []
                        buckets[tp].append(param)
                if self.warn_on_half:
                    if torch.cuda.HalfTensor in buckets:
                        print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                              " It is recommended to use the NCCL backend in this case.")
                        self.warn_on_half = False

                for tp in buckets:
                    bucket = buckets[tp]
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    dist.all_reduce(coalesced)
                    coalesced /= dist.get_world_size()
                    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)
Code Example #30
    def __init__(self,
                 init_optimizer,
                 deepspeed=None,
                 static_loss_scale=1.0,
                 dynamic_loss_scale=False,
                 initial_dynamic_scale=2**32,
                 dynamic_loss_args=None,
                 verbose=True,
                 mpu=None,
                 clip_grad=0.0,
                 fused_adam_legacy=False,
                 timers=None):

        self.fused_adam_legacy = fused_adam_legacy
        self.timers = timers

        if not torch.cuda.is_available():  # is_available is a function and must be called
            raise SystemError("Cannot use fp16 without CUDA.")
        self.optimizer = init_optimizer

        # param flattened by groups
        self.fp16_groups = []
        self.fp16_groups_flat = []
        self.fp32_groups_flat = []

        # loop to deal with groups
        for i, param_group in enumerate(self.optimizer.param_groups):
            # push this group to list before modify
            self.fp16_groups.append(param_group['params'])
            # init fp16 weight buffer, flattened
            self.fp16_groups_flat.append(
                _flatten_dense_tensors([p.clone().detach()
                                        for p in self.fp16_groups[i]]))
            # set model fp16 weight to slices of flattened buffer
            updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i],
                                                      self.fp16_groups[i])
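            # _unflatten_dense_tensors returns views into the flat buffer, so the
            # assignment below makes each fp16 param share storage with
            # self.fp16_groups_flat[i].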
            for p, q in zip(self.fp16_groups[i], updated_params):
                p.data = q.data
            # init master weight, flattened
            self.fp32_groups_flat.append(
                self.fp16_groups_flat[i].clone().float().detach())
            # modify optimizer of have flat master weight
            self.fp32_groups_flat[
                i].requires_grad = True  # keep this in case internal optimizer uses it
            param_group['params'] = [self.fp32_groups_flat[i]]

        # we may have a way of fusing dynamic scale. Do not support for now
        if dynamic_loss_scale:
            self.dynamic_loss_scale = True
            self.cur_iter = 0
            self.last_overflow_iter = -1
            self.scale_factor = 2

            if dynamic_loss_args is None:
                self.cur_scale = initial_dynamic_scale
                self.scale_window = 1000
                self.min_loss_scale = 1
            else:
                self.cur_scale = dynamic_loss_args[INITIAL_LOSS_SCALE]
                self.scale_window = dynamic_loss_args[SCALE_WINDOW]
                self.min_loss_scale = dynamic_loss_args[MIN_LOSS_SCALE]
        else:
            self.dynamic_loss_scale = False
            self.cur_iter = 0
            self.cur_scale = static_loss_scale
        self.verbose = verbose

        self.clip_grad = clip_grad
        self.norm_type = 2

        TORCH_MAJOR = int(torch.__version__.split('.')[0])
        TORCH_MINOR = int(torch.__version__.split('.')[1])
        if TORCH_MAJOR == 0 and TORCH_MINOR <= 4:
            self.clip_grad_norm = torch.nn.utils.clip_grad_norm
        else:
            self.clip_grad_norm = torch.nn.utils.clip_grad_norm_

        #model parallel object
        self.mpu = mpu

        self.overflow = False
        self.overflow_checker = CheckOverflow(self.fp16_groups,
                                              mpu=self.mpu,
                                              deepspeed=deepspeed)
        self.initialize_optimizer_states()
Code Example #31
    def step(self, closure=None):
        """
        Not supporting closure.
        """

        if self.fused_adam_legacy:
            return self.step_fused_adam()

        COMPUTE_NORM = "compute_norm"
        OVERFLOW_CHECK = 'overflow_check'
        OVERFLOW_TIMERS = [COMPUTE_NORM, OVERFLOW_CHECK]
        UNSCALE_AND_CLIP = 'unscale_and_clip'
        BASIC_STEP = 'basic_step'
        UPDATE_FP16 = 'update_fp16'
        STEP_TIMERS = OVERFLOW_TIMERS + [UNSCALE_AND_CLIP, BASIC_STEP, UPDATE_FP16]

        # First determine if there is overflow.
        self.start_timers([OVERFLOW_CHECK])
        fp16_params = []
        for i, group in enumerate(self.fp16_groups):
            fp16_params.extend([p for p in group if p.grad is not None])
        self.overflow = self.overflow_checker.has_overflow(fp16_params)
        self.stop_timers([OVERFLOW_CHECK])
        prev_scale = self.cur_scale
        self._update_scale(self.overflow)
        if self.overflow:
            if self.verbose:
                log_dist(
                    "Overflow detected. Skipping step. Attempted loss "
                    f"scale: {prev_scale}, reducing to {self.cur_scale}",
                    ranks=[0])
            # Clear gradients
            for i, group in enumerate(self.fp16_groups):
                for p in group:
                    p.grad = None

            self.log_timers(OVERFLOW_TIMERS)
            return self.overflow

        grads_groups_flat = []
        for i, group in enumerate(self.fp16_groups):
            data_type = self.fp32_groups_flat[i].dtype

            grads_groups_flat.append(
                _flatten_dense_tensors([
                    torch.zeros(p.size(),
                                dtype=data_type,
                                device=p.device)
                    if p.grad is None else p.grad.to(data_type) for p in group
                ]))

            for p in group:
                p.grad = None

            self.fp32_groups_flat[i].grad = grads_groups_flat[i]

        self.start_timers([COMPUTE_NORM])
        all_groups_norm = get_grad_norm(self.fp32_groups_flat, mpu=self.mpu)
        self.stop_timers([COMPUTE_NORM])

        self.start_timers([UNSCALE_AND_CLIP])
        self.unscale_and_clip_grads(grads_groups_flat, [all_groups_norm])
        self.stop_timers([UNSCALE_AND_CLIP])

        self.start_timers([BASIC_STEP])
        self.optimizer.step()
        self.stop_timers([BASIC_STEP])

        #get rid of the fp32 gradients. Not needed anymore
        for group in self.fp32_groups_flat:
            group.grad = None

        self.start_timers([UPDATE_FP16])
        for i in range(len(self.fp16_groups)):
            updated_params = _unflatten_dense_tensors(self.fp32_groups_flat[i],
                                                      self.fp16_groups[i])
            for p, q in zip(self.fp16_groups[i], updated_params):
                p.data.copy_(q.data)
        self.stop_timers([UPDATE_FP16])

        self.log_timers(STEP_TIMERS)

        return self.overflow
Code Example #32
    def __init__(self,
                 optimizer,
                 static_loss_scale=1.0,
                 dynamic_loss_scale=False):
        if not torch.cuda.is_available():  # is_available is a function and must be called
            raise SystemError('Cannot use fp16 without CUDA')

        self.fp16_param_groups = []
        self.fp32_param_groups = []
        self.fp32_flattened_groups = []
        for i, param_group in enumerate(optimizer.param_groups):
            print("FP16_Optimizer processing param group {}:".format(i))
            fp16_params_this_group = []
            fp32_params_this_group = []
            for param in param_group['params']:
                if param.requires_grad:
                    if param.type() == 'torch.cuda.HalfTensor':
                        print(
                            "FP16_Optimizer received torch.cuda.HalfTensor with {}"
                            .format(param.size()))
                        fp16_params_this_group.append(param)
                    elif param.type() == 'torch.cuda.FloatTensor':
                        print(
                            "FP16_Optimizer received torch.cuda.FloatTensor with {}"
                            .format(param.size()))
                        fp32_params_this_group.append(param)
                    else:
                        raise TypeError(
                            "Wrapped parameters must be either "
                            "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                            "Received {}".format(param.type()))

            fp32_flattened_this_group = None
            if len(fp16_params_this_group) > 0:
                fp32_flattened_this_group = _flatten_dense_tensors([
                    param.detach().data.clone().float()
                    for param in fp16_params_this_group
                ])

                fp32_flattened_this_group = Variable(fp32_flattened_this_group,
                                                     requires_grad=True)

                fp32_flattened_this_group.grad = fp32_flattened_this_group.new(
                    *fp32_flattened_this_group.size())

            # python's lovely list concatenation via +
            if fp32_flattened_this_group is not None:
                param_group['params'] = [fp32_flattened_this_group
                                         ] + fp32_params_this_group
            else:
                param_group['params'] = fp32_params_this_group

            self.fp16_param_groups.append(fp16_params_this_group)
            self.fp32_param_groups.append(fp32_params_this_group)
            self.fp32_flattened_groups.append(fp32_flattened_this_group)

        # print("self.fp32_flattened_groups = ", self.fp32_flattened_groups)
        # print("self.fp16_param_groups = ", self.fp16_param_groups)

        self.optimizer = optimizer.__class__(optimizer.param_groups)

        # self.optimizer.load_state_dict(optimizer.state_dict())

        self.param_groups = self.optimizer.param_groups

        if dynamic_loss_scale:
            self.dynamic_loss_scale = True
            self.loss_scaler = DynamicLossScaler()
        else:
            self.dynamic_loss_scale = False
            self.loss_scaler = LossScaler(static_loss_scale)

        self.overflow = False
        self.first_closure_call_this_step = True
Code Example #33
    def get_flat_sub_partitions(comm_tensor_list,
                                comm_param_offsets,
                                sub_partition_size,
                                dtype,
                                num_comm_intervals=None,
                                default_device=None,
                                return_partition_params=False):
        partition_params = []
        final_param_offsets = []
        flat_sub_partitions = []
        for tensor_list, param_offsets in zip(comm_tensor_list,
                                              comm_param_offsets):
            flat_tensor_list = []
            current_size = 0
            my_offsets = []
            my_params = []

            if dtype is None:
                dtype = tensor_list[0].dtype

            for i, tensor in enumerate(tensor_list):
                if tensor.grad is None:
                    tensor.grad = torch.zeros(tensor.size(),
                                              dtype=tensor.dtype,
                                              device=tensor.device)
                param = tensor
                tensor = tensor.grad
                num_elements = tensor.numel()
                tensor_offset = 0

                #we need to offset to get to the right element
                if i == 0 and param_offsets[i] > 0:
                    tensor_offset = param_offsets[i]
                    num_elements = num_elements - tensor_offset

                # We don't need all elements of the tensor if this tensor is
                # larger than we have space for in our curr sub-partition
                if num_elements > (sub_partition_size - current_size):
                    num_elements = sub_partition_size - current_size

                #we need a narrow view of the tensor based on the tensor offset and number of elements that
                #we need from this tensor
                if tensor_offset > 0 or num_elements < tensor.numel():
                    flat_tensor_list.append(
                        tensor.contiguous().view(-1).narrow(
                            0, int(tensor_offset),
                            int(num_elements)).to(dtype))
                else:
                    flat_tensor_list.append(tensor.to(dtype))
                my_params.append(param)

                #remember offset into partition and #elems for this tensor
                my_offsets.append((current_size, num_elements))

                current_size = current_size + num_elements

            #this means its the last partition and does not align with the dp boundary. We need to pad before flattening
            if current_size < sub_partition_size:
                my_offsets.append((None, None))
                my_params.append(None)
                if len(tensor_list) == 0:
                    assert default_device != None
                    flat_tensor_list.append(
                        torch.zeros(int(sub_partition_size - current_size),
                                    dtype=dtype,
                                    device=default_device))
                else:
                    flat_tensor_list.append(
                        torch.zeros(int(sub_partition_size - current_size),
                                    dtype=dtype,
                                    device=tensor_list[0].device))
            partition_params.append(my_params)  #flat_tensor_list)
            final_param_offsets.append(my_offsets)
            assert len(flat_tensor_list) == len(my_offsets), "{} {}".format(
                len(flat_tensor_list), len(my_offsets))
            flat_sub_partitions.append(
                _flatten_dense_tensors(flat_tensor_list))
        if num_comm_intervals is not None and len(
                flat_sub_partitions) < num_comm_intervals:
            #print("padding w. sub partitions to ensure uniform communication")
            device = flat_sub_partitions[0].device
            for _ in range(num_comm_intervals - len(flat_sub_partitions)):
                flat_sub_partitions.append(
                    torch.zeros(int(sub_partition_size),
                                dtype=dtype,
                                device=device))
                partition_params.append([None])
                final_param_offsets.append([(None, None)])

        if return_partition_params:
            assert len(flat_sub_partitions) == len(partition_params)
            assert len(partition_params) == len(
                final_param_offsets), "{} {}".format(len(partition_params),
                                                     len(final_param_offsets))
            return flat_sub_partitions, partition_params, final_param_offsets
        return flat_sub_partitions
Code example #34
def flatten_dense_tensors_sub_partition_aligned(tensor_list, dp,
                                                max_elements_per_comm, pg):
    num_elements = 0
    for tensor in tensor_list:
        num_elements = num_elements + tensor.numel()

    pprint("Total number of elements in model: {}, max elements per com: {}".
           format(num_elements, max_elements_per_comm))

    max_elements_per_comm = min(max_elements_per_comm, num_elements)
    sub_partition_size = int(max_elements_per_comm // dp)

    alignment = sub_partition_size

    # if alignment == 0:
    #     # number of elements not divisible by dp, outside range and small model must pad with zeroes
    #     pad_tensor = torch.zeros(max_elements_per_comm,
    #                              device=tensor_list[0].device,
    #                              dtype=tensor_list[0].dtype)
    #     return _flatten_dense_tensors(pad_tensor)

    remaining = int(num_elements % alignment)

    # ensure we have equal sized sub-partitions
    elements_to_add = 0
    if remaining:
        elements_to_add = alignment - remaining
        # adding padded tensor later after we check comm alignment
        pprint("adding pad tensor for alignment, {} + {}->{}".format(
            num_elements, elements_to_add, num_elements + elements_to_add))
        #num_elements = num_elements + elements_to_add
    else:
        padded_tensor_list = tensor_list

    num_partitions = int(
        (num_elements + elements_to_add) // sub_partition_size)
    assert (num_elements + elements_to_add) % sub_partition_size == 0, "num elements should be " \
                                                                       "aligned by sub partition " \
                                                                       "size"
    num_comm_intervals = int(num_partitions // dp)
    partition_remaining = int(num_partitions % dp)
    pprint("num_comm_intervals={}, partition_remaining={}".format(
        num_comm_intervals, partition_remaining))
    if partition_remaining != 0:
        pprint("adding pad tensor and/or extra sub partition")
        # add pad tensor for alignment of comm interval, this overrules previous possibly sub-partition alignment
        num_comm_intervals += 1
        aligned_comm_elements = num_comm_intervals * sub_partition_size * dp
        elements_to_add = aligned_comm_elements - num_elements

        pad_tensor = torch.zeros(elements_to_add,
                                 device=tensor_list[0].device,
                                 dtype=tensor_list[0].dtype)
        padded_tensor_list = tensor_list + [pad_tensor]
        pprint(
            "adding pad tensor and/or extra sub partition, {} + {}->{}".format(
                num_elements, elements_to_add, num_elements + elements_to_add))
        num_elements += elements_to_add
    elif elements_to_add > 0:
        # add pad tensor for just alignment of sub-partition
        pad_tensor = torch.zeros(elements_to_add,
                                 device=tensor_list[0].device,
                                 dtype=tensor_list[0].dtype)
        padded_tensor_list = tensor_list + [pad_tensor]
        num_elements += elements_to_add

    if pg is None or dist.get_rank(group=pg) == 0:
        print("Number of Elements (w. padding) is ", num_elements)

    padded_num_elems = 0
    for p in padded_tensor_list:
        padded_num_elems += p.numel()
    assert num_elements == padded_num_elems, "{} != {}, rank={}".format(
        num_elements, padded_num_elems, dist.get_rank())

    return _flatten_dense_tensors(padded_tensor_list)
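
For reference, the sizing arithmetic above is easier to follow with concrete numbers. The sketch below mirrors only the padding logic of flatten_dense_tensors_sub_partition_aligned; the helper name and the sizes are made up for illustration:

def aligned_padding(num_elements, dp, max_elements_per_comm):
    # Mirrors the sizing logic above: pick a sub-partition size, then pad the
    # element count until it is divisible by dp * sub_partition_size.
    max_elements_per_comm = min(max_elements_per_comm, num_elements)
    sub_partition_size = max_elements_per_comm // dp
    remaining = num_elements % sub_partition_size
    elements_to_add = sub_partition_size - remaining if remaining else 0
    num_partitions = (num_elements + elements_to_add) // sub_partition_size
    num_comm_intervals = num_partitions // dp
    if num_partitions % dp != 0:
        num_comm_intervals += 1
        elements_to_add = num_comm_intervals * sub_partition_size * dp - num_elements
    return sub_partition_size, num_comm_intervals, elements_to_add

# 1000 elements across 4 data-parallel ranks, at most 450 elements per comm:
# sub-partitions of 112 elements, 3 comm intervals, 344 zeros of padding.
assert aligned_padding(1000, 4, 450) == (112, 3, 344)
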
Code example #35
    def step(self, closure=None):

        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']

            all_grads = []

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if not self.compression_buffer:
                    if weight_decay != 0:
                        d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    # signum
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state[
                            'momentum_buffer'] = torch.zeros_like(p.data)
                    else:
                        buf = param_state['momentum_buffer']

                    buf.mul_(momentum).add_((1 - momentum), d_p)
                    d_p.copy_(buf)
                all_grads.append(d_p)

            dev_grads_buckets = _take_tensors(all_grads, self.bucket_size)
            for dev_grads in dev_grads_buckets:
                d_p_new = _flatten_dense_tensors(dev_grads)

                if self.all_reduce:
                    dist.all_reduce(d_p_new, group=0)  #self.all_gpu
                else:
                    if self.nodes > 1:
                        if self.compression_buffer:
                            coded, data_time = QSGD_gpu.encode(d_p_new)
                            #specific coded dic just on CPU
                            tensor_signs = coded['signs'].float().to(
                                self.device)
                            tensor_selected = coded['selected'].float().to(
                                self.device)
                            tensor_norm = coded['norm']
                            #size
                            tensor_signs_size = self.pack_len_tensor_into_tensor(
                                tensor_signs)
                            tensor_selected_size = self.pack_len_tensor_into_tensor(
                                tensor_selected)
                            #tensor_norm_size = self.pack_len_tensor_into_tensor(tensor_norm) norm doesn't need size

                            #custom
                            '''
                            print(tensor_signs.type())
                            print(tensor_selected.type())
                            print(tensor_norm.type())
                            '''

                        else:
                            d_p_new = torch.sign(d_p_new)

                        if self.local_rank == 0:

                            if self.all_gather_commu:
                                # This version is only for instances with one GPU each
                                for node_index in self.inter_node_list:
                                    if node_index != self.nodes_rank:

                                        d.set()
                                        f.set()
                                        coded_temp = coded.copy()
                                        f.record()
                                        b.set()
                                        tensor_signs_size_temp = tensor_signs_size.clone(
                                        )
                                        dist.broadcast(
                                            tensor_signs_size_temp,
                                            node_index,
                                            group=self.all_inter_node_group)
                                        b.record()
                                        c.set()
                                        tensor_signs_temp = torch.zeros(
                                            [int(tensor_signs_size_temp[0])],
                                            device=self.device,
                                            dtype=torch.float)
                                        c.record()
                                        a.set()
                                        dist.broadcast(
                                            tensor_signs_temp,
                                            node_index,
                                            group=self.all_inter_node_group)
                                        a.record()
                                        d.record()

                                        e.set()
                                        tensor_selected_size_temp = tensor_selected_size.clone(
                                        )
                                        dist.broadcast(
                                            tensor_selected_size_temp,
                                            node_index,
                                            group=self.all_inter_node_group)
                                        tensor_selected_temp = torch.zeros(
                                            [
                                                int(tensor_selected_size_temp[
                                                    0])
                                            ],
                                            device=self.device,
                                            dtype=torch.float)
                                        dist.broadcast(
                                            tensor_selected_temp,
                                            node_index,
                                            group=self.all_inter_node_group)
                                        e.record()

                                        tensor_norm_temp = tensor_norm.clone()
                                        dist.broadcast(
                                            tensor_norm_temp,
                                            node_index,
                                            group=self.all_inter_node_group)

                                        coded_temp[
                                            'signs'] = tensor_signs_temp.int()
                                        coded_temp[
                                            'selected'] = tensor_selected_temp.long(
                                            )
                                        coded_temp['norm'] = tensor_norm_temp

                                        tensor_decoded = QSGD_gpu.decode(
                                            coded_temp, cuda=True)
                                        d_p_new = d_p_new + tensor_decoded
                                        '''
                                        print('a', a.get_time())
                                        print('b', b.get_time())
                                        print('c', c.get_time())
                                        print('d', d.get_time())
                                        print('e', e.get_time())
                                        print('f', f.get_time())
                                        '''

                                    else:
                                        dist.broadcast(
                                            tensor_signs_size,
                                            node_index,
                                            group=self.all_inter_node_group)
                                        dist.broadcast(
                                            tensor_signs,
                                            node_index,
                                            group=self.all_inter_node_group)
                                        dist.broadcast(
                                            tensor_selected_size,
                                            node_index,
                                            group=self.all_inter_node_group)
                                        dist.broadcast(
                                            tensor_selected,
                                            node_index,
                                            group=self.all_inter_node_group)
                                        dist.broadcast(
                                            tensor_norm,
                                            node_index,
                                            group=self.all_inter_node_group)
                                d_p_new = d_p_new / dist.get_world_size()

                            else:
                                if dist.get_rank() == 0:
                                    for index, inter_node_group in enumerate(
                                            self.inter_node_group_list):
                                        coded_temp = coded.copy()

                                        tensor_signs_size_temp = tensor_signs_size.clone(
                                        )
                                        dist.broadcast(
                                            tensor_signs_size_temp,
                                            self.inter_node_list[index + 1],
                                            group=inter_node_group)
                                        tensor_signs_temp = torch.zeros(
                                            [int(tensor_signs_size_temp[0])],
                                            device=self.device,
                                            dtype=torch.float)
                                        dist.broadcast(
                                            tensor_signs_temp,
                                            self.inter_node_list[index + 1],
                                            group=inter_node_group)

                                        tensor_selected_size_temp = tensor_selected_size.clone(
                                        )
                                        dist.broadcast(
                                            tensor_selected_size_temp,
                                            self.inter_node_list[index + 1],
                                            group=inter_node_group)
                                        tensor_selected_temp = torch.zeros(
                                            [
                                                int(tensor_selected_size_temp[
                                                    0])
                                            ],
                                            device=self.device,
                                            dtype=torch.float)
                                        dist.broadcast(
                                            tensor_selected_temp,
                                            self.inter_node_list[index + 1],
                                            group=inter_node_group)

                                        tensor_norm_temp = tensor_norm.clone()
                                        dist.broadcast(
                                            tensor_norm_temp,
                                            self.inter_node_list[index + 1],
                                            group=inter_node_group)

                                        coded_temp[
                                            'signs'] = tensor_signs_temp.int()
                                        coded_temp[
                                            'selected'] = tensor_selected_temp.long(
                                            )
                                        coded_temp['norm'] = tensor_norm_temp

                                        tensor_decoded = QSGD_gpu.decode(
                                            coded_temp, cuda=True)
                                        d_p_new = d_p_new + tensor_decoded
                                        '''
                                        #temp
                                        print(tensor_decoded)
                                        tensor_decoded_temp = tensor_decoded.clone()
                                        dist.broadcast(tensor_decoded_temp, self.inter_node_list[index + 1], group = inter_node_group)
                                        if tensor_decoded == tensor_decoded_temp:
                                            print('success')
                                        print(tensor_signs_size_temp)
                                        print(tensor_selected_size_temp)
                                        '''

                                    d_p_new = d_p_new / dist.get_world_size()

                                else:
                                    dist.broadcast(
                                        tensor_signs_size,
                                        dist.get_rank(),
                                        group=self.inter_node_group_list[
                                            self.nodes_rank - 1])
                                    dist.broadcast(
                                        tensor_signs,
                                        dist.get_rank(),
                                        group=self.inter_node_group_list[
                                            self.nodes_rank - 1])
                                    dist.broadcast(
                                        tensor_selected_size,
                                        dist.get_rank(),
                                        group=self.inter_node_group_list[
                                            self.nodes_rank - 1])
                                    dist.broadcast(
                                        tensor_selected,
                                        dist.get_rank(),
                                        group=self.inter_node_group_list[
                                            self.nodes_rank - 1])
                                    dist.broadcast(
                                        tensor_norm,
                                        dist.get_rank(),
                                        group=self.inter_node_group_list[
                                            self.nodes_rank - 1])
                                    '''
                                    #temp
                                    tensor_decoded = QSGD_gpu.decode(coded, cuda = True)
                                    print(tensor_decoded)
                                    dist.broadcast(tensor_decoded, dist.get_rank(), group = self.inter_node_group_list[self.nodes_rank - 1]) 
                                    print(tensor_signs_size)
                                    print(tensor_selected_size)
                                    '''

                                    dist.barrier(
                                        group=self.all_inter_node_group)

                                #os._exit()

                                if self.bidirection_compress:
                                    if dist.get_rank() == 0:

                                        coded, data_time = QSGD_gpu.encode(
                                            d_p_new)
                                        tensor_signs = coded['signs']
                                        tensor_selected = coded['selected']
                                        tensor_norm = coded['norm']

                                        tensor_signs_size = self.pack_len_tensor_into_tensor(
                                            tensor_signs)
                                        tensor_selected_size = self.pack_len_tensor_into_tensor(
                                            tensor_selected)

                                        dist.barrier(
                                            group=self.all_inter_node_group)

                                    dist.broadcast(
                                        tensor_signs_size,
                                        0,
                                        group=self.all_inter_node_group)
                                    dist.broadcast(
                                        tensor_selected_size,
                                        0,
                                        group=self.all_inter_node_group)
                                    if dist.get_rank() != 0:
                                        tensor_signs = torch.randn([
                                            int(tensor_signs_size[0])
                                        ]).type_as(tensor_signs)
                                        tensor_selected = torch.randn([
                                            int(tensor_selected_size[0])
                                        ]).type_as(tensor_selected)

                                    dist.barrier(
                                        group=self.all_inter_node_group)

                                    dist.broadcast(
                                        tensor_signs,
                                        0,
                                        group=self.all_inter_node_group)
                                    dist.broadcast(
                                        tensor_selected,
                                        0,
                                        group=self.all_inter_node_group)
                                    dist.broadcast(
                                        tensor_norm,
                                        0,
                                        group=self.all_inter_node_group)

                                    coded['signs'] = tensor_signs
                                    coded['selected'] = tensor_selected
                                    coded['norm'] = tensor_norm

                                    tensor_decoded = QSGD_gpu.decode(coded,
                                                                     cuda=True)
                                    d_p_new = tensor_decoded

                                else:
                                    if dist.get_rank() == 0:
                                        dist.barrier(
                                            group=self.all_inter_node_group)
                                    dist.broadcast(
                                        d_p_new,
                                        0,
                                        group=self.all_inter_node_group)

                    else:
                        # test for one
                        coded, data_time = QSGD_gpu.encode(d_p_new)
                        tensor_decoded = QSGD_gpu.decode(coded, cuda=True)
                        d_p_new = tensor_decoded

                #unflatten
                dev_grads_new = _unflatten_dense_tensors(d_p_new, dev_grads)
                for grad, reduced in zip(dev_grads, dev_grads_new):
                    grad.copy_(reduced)
            for p in group['params']:
                if self.compression_buffer:
                    if weight_decay != 0:
                        p.grad.data.add_(weight_decay, p.data)
                p.data.add_(-group['lr'], p.grad.data)

        return loss
Code example #36
File: worker.py Project: srQ-cpc/LIBBLE-DL
    def sync_buffers_bucket(self):
        buffers = [p.data for p in list(self.model._all_buffers())]
        for tensors in _take_tensors(buffers, self.mpi_size):
            flat_tensors = _flatten_dense_tensors(tensors)
            dist.reduce(flat_tensors, dst=0, op=reduce_op.SUM)
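
Note that the snippet above only reduces the flattened buffers toward rank 0. The full flatten, reduce, unflatten, copy-back round trip that most of the optimizer examples in this listing rely on looks roughly like the sketch below; it assumes an initialized torch.distributed process group, and allreduce_average and bucket_size_bytes are illustrative names:

import torch.distributed as dist
from torch._utils import (_flatten_dense_tensors, _take_tensors,
                          _unflatten_dense_tensors)

def allreduce_average(tensors, bucket_size_bytes):
    # Bucket the tensors, flatten each bucket, average it across ranks,
    # and copy the reduced values back into the original tensors.
    for bucket in _take_tensors(tensors, bucket_size_bytes):
        flat = _flatten_dense_tensors(bucket)
        dist.all_reduce(flat)
        flat /= dist.get_world_size()
        for t, synced in zip(bucket, _unflatten_dense_tensors(flat, bucket)):
            t.copy_(synced)
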
Code example #37
File: ema.py Project: pallekc91/jukebox
    def get_model_state(self, group):
        params = self.params[group]
        return _flatten_dense_tensors([p.data.float() for p in params])
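
Restoring parameters from such a flattened state is the mirror operation. A minimal sketch, assuming the same params list; set_model_state is a hypothetical helper, not part of the jukebox code:

from torch._utils import _unflatten_dense_tensors

def set_model_state(params, flat_state):
    # Copy slices of the flattened fp32 state back into each parameter.
    for p, saved in zip(params, _unflatten_dense_tensors(flat_state, params)):
        p.data.copy_(saved)
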
Code example #38
    def step(self, closure=None):

        args = self.args

        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']

            cur_lr = group['lr']

            all_grads = []

            for p in group['params']:
                if p.grad is None:
                    continue

                d_p = p.grad.data
                if not self.compression_buffer:
                    if weight_decay != 0:
                        d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    # signum
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state[
                            'momentum_buffer'] = torch.zeros_like(p.data)
                    else:
                        buf = param_state['momentum_buffer']

                    buf.mul_(momentum).add_(d_p)
                    d_p.add_(momentum, buf)

                all_grads.append(d_p)

            length = 0
            for _ in _take_tensors(all_grads, self.bucket_size):
                length += 1

            dev_grads_buckets = _take_tensors(all_grads, self.bucket_size)
            for i, dev_grads in enumerate(dev_grads_buckets):
                d_p_new = _flatten_dense_tensors(dev_grads)

                if len(self.err_buf) < length:
                    self.err_buf.append(torch.zeros_like(d_p_new))
                    self.server_err_buf.append(torch.zeros_like(d_p_new))

                err_buf = self.err_buf[i]
                server_err_buf = self.server_err_buf[i]

                d_p_new.add_(self.prev_lr / cur_lr, err_buf)

                p_buf = d_p_new

                if self.all_reduce:
                    dist.all_reduce(d_p_new)  #self.all_gpu, group = 0
                    if self.signum:
                        d_p_new = torch.sign(d_p_new)
                elif self.signum:
                    if self.nodes > 1:
                        if self.compression_buffer:
                            d_p_new_scale = torch.ones(1)
                            d_p_new_scale[0] = d_p_new.abs().sum().cpu().item(
                            ) / d_p_new.numel()
                            d_p_new, tensor_size = self.compressor.compress(
                                d_p_new)

                            tmp = self.compressor.uncompress(
                                d_p_new.clone(), tensor_size)
                            tmp.mul_(d_p_new_scale.item())

                            err_buf.copy_(p_buf).sub_(tmp)
                        else:
                            d_p_new = torch.sign(d_p_new)

                        if dist.get_rank() == 0:
                            d_p_new_list = []
                            d_p_new_scale_list = []
                            for index, inter_node_group in enumerate(
                                    self.inter_node_group_list):
                                d_p_temp = d_p_new.clone()
                                d_p_scale_temp = d_p_new_scale.clone()
                                dist.broadcast(d_p_scale_temp,
                                               self.inter_node_list[index + 1],
                                               group=inter_node_group)
                                dist.broadcast(d_p_temp,
                                               self.inter_node_list[index + 1],
                                               group=inter_node_group)
                                d_p_new_list.append(d_p_temp)
                                d_p_new_scale_list.append(d_p_scale_temp)
                        else:
                            dist.broadcast(d_p_new_scale,
                                           dist.get_rank(),
                                           group=self.inter_node_group_list[
                                               self.nodes_rank - 1])
                            dist.broadcast(d_p_new,
                                           dist.get_rank(),
                                           group=self.inter_node_group_list[
                                               self.nodes_rank - 1])
                            dist.barrier(group=self.all_inter_node_group)

                        if dist.get_rank() == 0:
                            if self.compression_buffer:
                                d_p_new_list.append(d_p_new)  #count itself
                                d_p_new_scale_list.append(
                                    d_p_new_scale)  #count itself
                                #d_p_new = self.compressor.majority_vote(d_p_new_list)
                                d_p_new = torch.zeros(tensor_size).cuda()
                                for d_p, d_p_scale in zip(
                                        d_p_new_list, d_p_new_scale_list):
                                    tmp = self.compressor.uncompress(
                                        d_p, tensor_size)
                                    d_p_new.add_(d_p_scale.item(), tmp)
                                d_p_new /= self.nodes

                                d_p_new.add_(self.prev_lr / cur_lr,
                                             server_err_buf)

                                un_compr = d_p_new

                                d_p_new_scale = torch.ones(1)
                                d_p_new_scale[0] = d_p_new.abs().sum().cpu(
                                ).item() / d_p_new.numel()

                                d_p_new, _ = self.compressor.compress(d_p_new)

                                tmp = self.compressor.uncompress(
                                    d_p_new.clone(), tensor_size)
                                tmp.mul_(d_p_new_scale.item())

                                server_err_buf.copy_(un_compr).sub_(tmp)
                            else:
                                for d_p_temp in d_p_new_list:
                                    d_p_new.add_(d_p_temp)
                                d_p_new = d_p_new / self.nodes

                            dist.barrier(group=self.all_inter_node_group)

                        dist.broadcast(d_p_new,
                                       0,
                                       group=self.all_inter_node_group)
                        if self.compression_buffer:
                            dist.broadcast(d_p_new_scale,
                                           0,
                                           group=self.all_inter_node_group)

                        if self.compression_buffer:
                            d_p_new = self.compressor.uncompress(
                                d_p_new, tensor_size)
                            d_p_new.mul_(d_p_new_scale.item())
                else:
                    print('You cannot run without signum or all_reduce')

                #unflatten
                dev_grads_new = _unflatten_dense_tensors(d_p_new, dev_grads)
                for grad, reduced in zip(dev_grads, dev_grads_new):
                    grad.copy_(reduced)

            for p in group['params']:
                if self.compression_buffer:  #This part of code is temporary
                    if weight_decay != 0:
                        if momentum != 0:
                            param_state = self.state[p]
                            if 'wd_mom' not in param_state:
                                buf = param_state['wd_mom'] = torch.zeros_like(
                                    p.data)
                            else:
                                buf = param_state['wd_mom']

                            buf.mul_(momentum).add_(weight_decay, p.data)
                            p.grad.data.add_(momentum, buf)

                        p.grad.data.add_(weight_decay, p.data)

                p.data.add_(-group['lr'], p.grad.data)

            self.prev_lr = group['lr']

        return loss
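
The err_buf / server_err_buf bookkeeping above is an instance of error feedback around a lossy compressor. A minimal sketch of the idea, with plain sign compression standing in for self.compressor and the learning-rate rescaling of the buffer omitted (illustrative only):

import torch

def sign_with_error_feedback(grad, err_buf):
    # Add the error carried over from the previous step, compress,
    # then store what the compressor lost for the next step.
    corrected = grad + err_buf
    scale = corrected.abs().mean()          # matches abs().sum() / numel() above
    compressed = torch.sign(corrected)
    err_buf.copy_(corrected - scale * compressed)
    return compressed, scale
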
Code example #39
    def step(self, closure=None):

        args = self.args

        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']

            all_grads = []

            for p in group['params']:
                if p.grad is None:
                    continue

                d_p = p.grad.data
                if not self.compression_buffer:
                    if weight_decay != 0:
                        d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    # signum
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state[
                            'momentum_buffer'] = torch.zeros_like(p.data)
                    else:
                        buf = param_state['momentum_buffer']

                    buf.mul_(momentum).add_((1 - momentum), d_p)
                    d_p.copy_(buf)
                all_grads.append(d_p)

            dev_grads_buckets = _take_tensors(all_grads, self.bucket_size)
            for dev_grads in dev_grads_buckets:
                d_p_new = _flatten_dense_tensors(dev_grads)

                if self.all_reduce:
                    dist.all_reduce(d_p_new)  #self.all_gpu, group = 0
                    if self.signum:
                        d_p_new = torch.sign(d_p_new)
                elif self.signum:
                    if self.nodes > 1:
                        if self.compression_buffer:
                            d_p_new, tensor_size = self.compressor.compress(
                                d_p_new)
                        else:
                            d_p_new = torch.sign(d_p_new)

                        if self.local_rank == 0:
                            if dist.get_rank() == 0:
                                d_p_new_list = []
                                for index, inter_node_group in enumerate(
                                        self.inter_node_group_list):
                                    d_p_temp = d_p_new.clone()
                                    dist.broadcast(d_p_temp,
                                                   self.inter_node_list[index +
                                                                        1],
                                                   group=inter_node_group)
                                    d_p_new_list.append(d_p_temp)
                            else:
                                dist.broadcast(
                                    d_p_new,
                                    dist.get_rank(),
                                    group=self.inter_node_group_list[
                                        self.nodes_rank - 1])
                                dist.barrier(group=self.all_inter_node_group)

                            if dist.get_rank() == 0:
                                if self.compression_buffer:
                                    d_p_new_list.append(d_p_new)  #count itself
                                    d_p_new = self.compressor.majority_vote(
                                        d_p_new_list)
                                else:
                                    for d_p_temp in d_p_new_list:
                                        d_p_new.add_(d_p_temp)
                                    d_p_new = d_p_new / self.nodes
                                dist.barrier(group=self.all_inter_node_group)
                            dist.broadcast(d_p_new,
                                           0,
                                           group=self.all_inter_node_group)

                        if self.compression_buffer:
                            d_p_new = self.compressor.uncompress(
                                d_p_new, tensor_size)
                else:
                    print('You cannot run without signum or all_reduce')

                #unflatten
                dev_grads_new = _unflatten_dense_tensors(d_p_new, dev_grads)
                for grad, reduced in zip(dev_grads, dev_grads_new):
                    grad.copy_(reduced)
            #LARC saving
            self.layer_adaptive_lr = []
            layer_index = 0
            layer_saving = [
                1, 2, 3, 23, 49, 87
            ]  #conv1.weight(no bias), bn1.weight, layer1.1.conv1.weight, layer2.1.conv1.weight, layer3.1.conv1.weight, layer4.1.conv1.weight
            ###
            for p in group['params']:
                layer_index += 1
                ###
                '''
                LARC
                This part of code was originally forked from (https://github.com/NVIDIA/apex/blob/master/apex/parallel/LARC.py)
                '''
                if args.larc_enable:
                    trust_coefficient = args.larc_trust_coefficient
                    clip = args.larc_clip
                    eps = args.larc_eps
                    param_norm = torch.norm(p.data)
                    grad_norm = torch.norm(p.grad.data)
                    if param_norm != 0 and grad_norm != 0:
                        # calculate adaptive lr + weight decay
                        adaptive_lr = trust_coefficient * (param_norm) / (
                            grad_norm + param_norm * weight_decay + eps)

                        #add adaptive lr saving
                        if layer_index in layer_saving:
                            self.layer_adaptive_lr.append(adaptive_lr)

                        # clip learning rate for LARC
                        if clip:
                            # calculation of adaptive_lr so that when multiplied by lr it equals `min(adaptive_lr, lr)`
                            adaptive_lr = min(adaptive_lr / group['lr'], 1)

                        else:
                            adaptive_lr = adaptive_lr / group['lr']

                        p.grad.data *= adaptive_lr
                ###

                if self.compression_buffer:  #This part of code is temporary
                    if weight_decay != 0:
                        p.grad.data.add_(weight_decay, p.data)
                p.data.add_(-group['lr'], p.grad.data)

        return loss
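
The LARC block above computes a per-layer trust ratio before applying the learning rate. The core formula, pulled out of the loop as a standalone helper for clarity (larc_trust_ratio is an illustrative name, not part of the project):

import torch

def larc_trust_ratio(param, trust_coefficient, weight_decay, eps):
    # eta * ||w|| / (||g|| + wd * ||w|| + eps); the caller skips scaling when
    # either norm is zero and then clips the ratio against the base lr.
    param_norm = torch.norm(param.data)
    grad_norm = torch.norm(param.grad.data)
    if param_norm == 0 or grad_norm == 0:
        return None
    return trust_coefficient * param_norm / (grad_norm + weight_decay * param_norm + eps)
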
Code example #40
    def step_fused_lamb(self, closure=None):
        """
        Not supporting closure.
        """
        # First compute norm for all group so we know if there is overflow
        grads_groups_flat = []
        grads_groups = []
        norm_groups = []
        expert_norm_groups = []
        for i, group in enumerate(self.fp16_groups):
            grads = [
                torch.zeros(p.size(), dtype=p.dtype, device=p.device)
                if p.grad is None else p.grad for p in group
            ]
            grads_groups.append(grads)
            grads_groups_flat.append(_flatten_dense_tensors(grads))
            grads_for_norm, expert_grads_for_norm = split_params_grads_into_shared_and_expert_params(
                group)
            norm_group_value = 0.0
            if len(grads_for_norm) > 0:
                norm_group_value = get_weight_norm(
                    _flatten_dense_tensors(grads_for_norm), mpu=self.mpu)
            norm_groups.append(norm_group_value)
            expert_norm_group_value = 0.0
            if len(expert_grads_for_norm) > 0:
                expert_norm_group_value = get_weight_norm(
                    _flatten_dense_tensors(expert_grads_for_norm),
                    mpu=self.mpu)
            expert_norm_groups.append(expert_norm_group_value)

        self.overflow = self.overflow_checker.check_using_norm(
            norm_groups + expert_norm_groups)
        prev_scale = self.cur_scale

        self._update_scale(self.overflow)
        if self.overflow:
            if self.verbose:
                logger.info(
                    "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
                    "scale: {}, reducing to {}".format(prev_scale,
                                                       self.cur_scale))
            return self.overflow

        self._global_grad_norm = get_global_norm(norm_list=norm_groups)
        combined_scale = self.unscale_and_clip_grads(self._global_grad_norm,
                                                     apply_scale=False)
        self.optimizer.step(grads=grads_groups,
                            output_params=self.fp16_groups,
                            scale=combined_scale)

        for fp32_group, fp16_group in zip(self.fp32_groups, self.fp16_groups):
            for idx, (fp32_param,
                      fp16_param) in enumerate(zip(fp32_group, fp16_group)):

                #remove the fp32 grad
                fp32_param.grad = None

                #copy data from fp32 to fp16
                fp16_param.data.copy_(fp32_param.data)

        return self.overflow
Code example #41
File: fp16_optimizer.py Project: felixwzh/DialoGPT
    def __init__(self,
                 init_optimizer,
                 static_loss_scale=1.0,
                 dynamic_loss_scale=False,
                 dynamic_loss_args=None,
                 verbose=True):

        # The fused optimizer does all the work. We need this layer for two reasons:
        # 1. maintain same user API from apex.fp16_utils
        # 2. keep common stuff here in case we need to add new fused optimizer later

        # differences from apex.fp16_utils:
        # - assume all model params in fp16
        # - assume all params requires grad
        # - flat by groups, not keeping state. TODO: remove state explicitly?
        # - master grad and unflat master weight never exist. TODO: a way to save out unflat master?
        if not torch.cuda.is_available():
            raise SystemError("Cannot use fp16 without CUDA.")
        self.optimizer = init_optimizer

        # param flattened by groups
        self.fp16_groups = []
        self.fp16_groups_flat = []
        self.fp32_groups_flat = []

        # loop to deal with groups
        for i, param_group in enumerate(self.optimizer.param_groups):
            # push this group to list before modify
            self.fp16_groups.append(param_group['params'])
            # init fp16 weight buffer, flattened
            self.fp16_groups_flat.append(
                _flatten_dense_tensors(
                    [p.clone().detach() for p in self.fp16_groups[i]]))
            # set model fp16 weight to slices of flattened buffer
            updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i],
                                                      self.fp16_groups[i])
            for p, q in zip(self.fp16_groups[i], updated_params):
                p.data = q.data
            # init master weight, flattened
            self.fp32_groups_flat.append(
                self.fp16_groups_flat[i].clone().float().detach())
            # modify optimizer to have flat master weights
            self.fp32_groups_flat[
                i].requires_grad = True  # keep this in case internal optimizer uses it
            param_group['params'] = [self.fp32_groups_flat[i]]

        # we may have a way of fusing dynamic scale; not supported for now
        if dynamic_loss_scale:
            if dynamic_loss_args is not None:
                raise SystemError(
                    "Do not support dynamic loss scale args for now.")
            self.dynamic_loss_scale = True
            self.cur_scale = 2**16
            self.cur_iter = 0
            self.last_overflow_iter = -1
            self.scale_factor = 2
            self.scale_window = 1000
        else:
            self.dynamic_loss_scale = False
            self.cur_iter = 0
            self.cur_scale = static_loss_scale
        self.verbose = verbose
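
For context, a compressed sketch of how such flattened fp16/fp32 groups are typically used at step time; this illustrates the common pattern rather than the DialoGPT implementation, and loss scaling is simplified:

from torch._utils import _flatten_dense_tensors

def sketch_step(optimizer, fp16_groups, fp16_groups_flat, fp32_groups_flat,
                loss_scale):
    # Flatten and unscale the fp16 grads into grads for the flat fp32 masters.
    for i, group in enumerate(fp16_groups):
        grads = [p.grad for p in group]
        fp32_groups_flat[i].grad = _flatten_dense_tensors(grads).float() / loss_scale
    optimizer.step()  # updates the flat fp32 master weights
    # The model's fp16 params are views into fp16_groups_flat, so copying the
    # master weights back into the flat buffer updates every parameter at once.
    for fp16_flat, fp32_flat in zip(fp16_groups_flat, fp32_groups_flat):
        fp16_flat.data.copy_(fp32_flat.data)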