Example #1
def _tensors_allreduce_with_sparse_ps(degree, mean, allgather, allreduce,
                                      allreduce_filter, grad, ps_parameter):
    """
    Apply allgather on the gradient instead of allreduce for sparse features.
    Allgather is a communication operation used in distributed deep learning.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is True, the mean coefficient (degree) is applied to the gradients.
        allgather (Primitive): The communication operator for sparse gradients.
        allreduce (Primitive): The communication operator for gradients.
        allreduce_filter (bool): When it is True, allgather is applied.
        grad (RowTensor): The RowTensor gradient (indices, values and dense_shape) before the operation.
        ps_parameter (bool): Whether the parameter is hosted on a parameter server; if True, the
            gradient is returned as-is.

    Returns:
        RowTensor, the gradient after operation.
    """
    if ps_parameter:
        return grad

    if allreduce_filter:
        indices = allgather(grad.indices)
        dout = allgather(grad.values)
        if mean:
            degree = F.scalar_cast(degree, F.dtype(grad.values))
            cast_op = P.Cast()
            mul_op = P.Mul()
            dout = mul_op(
                dout, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(dout)))
        grad = RowTensor(indices, dout, grad.dense_shape)
    return grad
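
A rough NumPy sketch of what the allgather-plus-mean path computes (an illustration, not MindSpore code; it assumes each device holds an (indices, values) slice and that allgather simply concatenates those slices across devices):

import numpy as np

# Hypothetical per-device sparse gradients (indices, values) with dense_shape (5, 2).
per_device = [
    (np.array([0, 2]), np.array([[1.0, 1.0], [2.0, 2.0]])),
    (np.array([2, 4]), np.array([[4.0, 4.0], [6.0, 6.0]])),
]
degree = len(per_device)  # device count, i.e. the mean coefficient

# "Allgather": concatenate indices and values from every device, then apply 1 / degree.
indices = np.concatenate([idx for idx, _ in per_device])
values = np.concatenate([val for _, val in per_device]) * (1.0 / degree)

# Reference: a mean allreduce of the equivalent dense gradients gives the same tensor.
def densify(idx, val, shape=(5, 2)):
    out = np.zeros(shape)
    np.add.at(out, idx, val)
    return out

mean_allreduce = sum(densify(idx, val) for idx, val in per_device) / degree
assert np.allclose(densify(indices, values), mean_allreduce)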
Example #2
def _tensor_apply_decay_with_sparse(weight_decay, if_apply, weight, gradient):
    """Get grad with weight_decay."""
    if if_apply:
        indices = gradient.indices
        values = op_add((op_gather(weight, indices, 0) * F.cast(weight_decay, F.dtype(weight)), gradient.values))
        shape = gradient.dense_shape
        return RowTensor(indices, values, shape)
    return gradient
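
The dense counterpart of this decay is grad + weight_decay * weight; with a RowTensor only the parameter rows that appear in the gradient are decayed. A minimal NumPy sketch of the same arithmetic (illustrative names, not MindSpore API):

import numpy as np

weight = np.array([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]])   # full parameter, shape (3, 2)
indices = np.array([0, 2])                                 # rows that have a gradient
grad_values = np.array([[0.1, 0.1], [0.3, 0.3]])
weight_decay = 0.01

# Equivalent of op_add((op_gather(weight, indices, 0) * weight_decay, gradient.values))
decayed_values = weight[indices] * weight_decay + grad_values
print(decayed_values)   # rows without a gradient entry are left untouched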
Example #3
def _tensor_apply_grad_centralization_with_sparse(if_apply, gradient):
    """Get grad with grad_centralization."""
    if if_apply:
        indices = gradient.indices
        values = op_gc(gradient.values, -1)
        shape = gradient.dense_shape
        return RowTensor(indices, values, shape)
    return gradient
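
Gradient centralization subtracts the mean of each gradient row. Assuming op_gc(values, -1) centralizes along the last axis (the usual definition of the operation), a NumPy sketch:

import numpy as np

values = np.array([[1.0, 2.0, 3.0],
                   [4.0, 4.0, 4.0]])

# Assumed behaviour of op_gc(values, -1): subtract the mean taken along the last axis.
centralized = values - values.mean(axis=-1, keepdims=True)
print(centralized)   # [[-1. 0. 1.], [0. 0. 0.]] -- every row now has zero mean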
Example #4
def rowtensor_deduplicate_indices_slices(grad):
    """Unique the indices and sums the 'values' corresponding to the duplicate indices."""
    indices = grad.indices
    values = grad.values

    unique_indices, index_position = P.Unique()(indices)
    summed_values = P.UnsortedSegmentSum()(values, index_position, P.DynamicShape()(unique_indices)[0])

    return RowTensor(unique_indices, summed_values, grad.dense_shape)
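
The same deduplication written with NumPy, to show what Unique plus UnsortedSegmentSum accomplish: rows that share an index are summed into a single row of the result. (np.unique sorts the unique indices, which the MindSpore primitive need not do; the summing behaviour is the point here.)

import numpy as np

indices = np.array([3, 1, 3, 0])
values = np.array([[1.0, 1.0], [2.0, 2.0], [5.0, 5.0], [7.0, 7.0]])

# return_inverse plays the role of index_position: for every original row,
# the position of its index within the deduplicated index list.
unique_indices, index_position = np.unique(indices, return_inverse=True)

# Segment sum: accumulate each row of values into its segment.
summed_values = np.zeros((unique_indices.shape[0], values.shape[1]))
np.add.at(summed_values, index_position, values)

print(unique_indices)   # [0 1 3]
print(summed_values)    # [[7. 7.] [2. 2.] [6. 6.]] -- the two rows for index 3 were merged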
Example #5
def _tensors_cast_datatype_with_sparse(datatype, grad):
    """
    Cast gradient to datatype.

    Args:
        datatype (mstype): The destination datatype of the gradient.
        grad (RowTensor): The gradient before operation.

    Returns:
        RowTensor, the gradient after operation.
    """
    dout = F.cast(grad.values, datatype)
    return RowTensor(grad.indices, dout, grad.dense_shape)
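
Only the values carry floating-point data, so only they are cast; the integer indices and the dense_shape are left as they are. A small illustration with NumPy dtypes standing in for mstype:

import numpy as np

indices = np.array([0, 2], dtype=np.int32)
values = np.array([[1.5, 2.5], [3.5, 4.5]], dtype=np.float16)

cast_values = values.astype(np.float32)    # the analogue of F.cast(grad.values, datatype)
print(indices.dtype, cast_values.dtype)    # int32 float32 -- indices keep their integer type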
Example #6
def _tensor_apply_grad_centralization_with_sparse(if_apply, gradient):
    """Get grad with grad_centralization."""
    if if_apply:
        indices = gradient.indices
        shape = gradient.dense_shape
        grad_shape = F.shape(gradient)
        axis = []
        for i in range(1, len(grad_shape)):
            axis.append(i)
        if len(axis) >= 1:
            if grad_shape[1] % 16 != 0:
                return gradient
            values = op_gc(gradient.values, axis)
            return RowTensor(indices, values, shape)
    return gradient
Example #7
def _tensors_allreduce_with_sparse(degree, mean, allgather, allreduce,
                                   allreduce_filter, grad):
    """
    Apply allgather on the gradient instead of allreduce for sparse features.
    Allgather is a communication operation used in distributed deep learning.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is True, the mean coefficient (degree) is applied to the gradients.
        allgather (Primitive): The communication operator for sparse gradients.
        allreduce (Primitive): The communication operator for gradients.
        allreduce_filter (bool): When it is True, allgather is applied.
        grad (RowTensor): The RowTensor gradient (indices, values and dense_shape) before the operation.

    Returns:
        RowTensor, the gradient after operation.
    """
    if allreduce_filter:
        indices = allgather(grad.indices)
        dout = allgather(grad.values)
        if mean:
            dout = F.tensor_mul(dout, F.cast(degree, F.dtype(dout)))
        grad = RowTensor(indices, dout, grad.dense_shape)
    return grad
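
Note that, unlike Example #1, the mean step here multiplies by degree cast to the gradient dtype instead of dividing by it, which suggests the caller passes the reciprocal of the device count (for example a float Tensor holding 1.0 / device_num) rather than the raw count. A one-line NumPy check of that reading:

import numpy as np

device_num = 4
gathered = np.ones((8, 3), dtype=np.float32)   # values after allgather
assert np.allclose(gathered * np.float32(1.0 / device_num), gathered / device_num)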
Example #8
def tensor_grad_scale_with_sparse(scale, grad):
    """Get grad with scale."""
    return RowTensor(grad.indices,
                     grad.values * F.cast(scale, F.dtype(grad.values)),
                     grad.dense_shape)
Example #9
def tensor_grad_scale_with_sparse(scale, grad):
    """Get grad with scale."""
    if scale == 1.0:
        return grad
    return RowTensor(grad.indices, grad.values * scale, grad.dense_shape)