# NOTE: the imports and module-level primitives below are assumed for this
# snippet; in the MindSpore source tree they are defined once in the
# surrounding files, and exact import paths may differ across versions.
from mindspore import RowTensor
from mindspore.ops import functional as F
from mindspore.ops import operations as P
from mindspore.ops.operations import _inner_ops as inner

op_add = P.AddN()
op_gather = P.Gather()
op_gc = inner.Centralization()


def _tensors_allreduce_with_sparse_ps(degree, mean, allgather, allreduce, allreduce_filter, grad, ps_parameter):
    """
    Apply allgather on gradient instead of allreduce for sparse feature.
    Allgather is a communication operation used for distributed deep learning.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is True, the mean coefficient (degree) is applied to the gradients.
        allgather (Primitive): The communication operator for sparse gradients.
        allreduce (Primitive): The communication operator for gradients.
        allreduce_filter (bool): When it is True, allgather is applied.
        grad (RowTensor): The indices, gradient tensor and tensor_shape before the operation.
        ps_parameter (bool): Whether the parameter is updated through the parameter server.

    Returns:
        RowTensor, the gradient after the operation.
    """
    # Parameter-server parameters are updated on the server side, so the
    # gradient is passed through untouched.
    if ps_parameter:
        return grad
    if allreduce_filter:
        indices = allgather(grad.indices)
        dout = allgather(grad.values)
        if mean:
            # Scale the gathered values by 1 / degree so the eventual dense
            # sum becomes an average over devices.
            degree = F.scalar_cast(degree, F.dtype(grad.values))
            cast_op = P.Cast()
            mul_op = P.Mul()
            dout = mul_op(dout, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(dout)))
        grad = RowTensor(indices, dout, grad.dense_shape)
    return grad

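# A minimal sketch (not part of the original source) of why AllGather can
# replace AllReduce for row-sparse gradients: concatenating each device's
# (indices, values) pairs represents the same tensor as summing the dense
# gradients, because duplicate indices are summed when the RowTensor is
# densified. Written with NumPy for clarity; the numbers are made up.
def _example_allgather_vs_allreduce():
    import numpy as np

    dense_shape = (4, 2)
    dev0 = (np.array([0]), np.array([[1.0, 1.0]]))                  # device 0 grad
    dev1 = (np.array([0, 3]), np.array([[2.0, 2.0], [3.0, 3.0]]))  # device 1 grad

    # AllGather amounts to concatenation across devices.
    indices = np.concatenate([dev0[0], dev1[0]])
    values = np.concatenate([dev0[1], dev1[1]])

    # Densifying (duplicate rows summed) matches an AllReduce of dense grads.
    dense = np.zeros(dense_shape)
    np.add.at(dense, indices, values)
    return indices, values, dense  # dense[0] == [3., 3.], dense[3] == [3., 3.]
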
def _tensor_apply_decay_with_sparse(weight_decay, if_apply, weight, gradient):
    """Get grad with weight_decay."""
    if if_apply:
        indices = gradient.indices
        # Gather only the weight rows addressed by the sparse indices and add
        # weight_decay * weight to the gradient values.
        values = op_add((op_gather(weight, indices, 0) * F.cast(weight_decay, F.dtype(weight)),
                         gradient.values))
        shape = gradient.dense_shape
        return RowTensor(indices, values, shape)
    return gradient

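# A minimal sketch (not part of the original source) of the sparse weight
# decay above, written with dense NumPy for clarity: only the weight rows
# addressed by `indices` contribute, mirroring op_gather(weight, indices, 0).
def _example_sparse_weight_decay():
    import numpy as np

    weight = np.array([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]])
    indices = np.array([0, 2])
    grad_values = np.array([[0.1, 0.1], [0.3, 0.3]])
    weight_decay = 0.01

    # Equivalent of op_add((op_gather(weight, indices, 0) * weight_decay, values)).
    values = weight[indices] * weight_decay + grad_values
    return values  # [[0.11, 0.11], [0.33, 0.33]]
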
def _tensor_apply_grad_centralization_with_sparse(if_apply, gradient):
    """Get grad with grad_centralization."""
    if if_apply:
        indices = gradient.indices
        # Centralize the sparse values along their last axis.
        values = op_gc(gradient.values, -1)
        shape = gradient.dense_shape
        return RowTensor(indices, values, shape)
    return gradient

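# A minimal sketch (not part of the original source) of gradient
# centralization along the last axis, the effect of op_gc(values, -1) above:
# each row of the sparse values has its mean subtracted. NumPy for clarity.
def _example_centralize_last_axis():
    import numpy as np

    values = np.array([[1.0, 2.0, 3.0], [4.0, 4.0, 4.0]])
    centralized = values - values.mean(axis=-1, keepdims=True)
    return centralized  # [[-1., 0., 1.], [0., 0., 0.]]
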
def rowtensor_deduplicate_indices_slices(grad):
    """Deduplicate the indices and sum the 'values' corresponding to duplicate indices."""
    indices = grad.indices
    values = grad.values

    # Unique returns the deduplicated indices plus, for each original entry,
    # its position among the unique indices; UnsortedSegmentSum then sums the
    # value rows that share an index.
    unique_indices, index_position = P.Unique()(indices)
    summed_values = P.UnsortedSegmentSum()(values, index_position,
                                           P.DynamicShape()(unique_indices)[0])

    return RowTensor(unique_indices, summed_values, grad.dense_shape)

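# A minimal usage sketch (not part of the original source), assuming a working
# MindSpore 1.x install where RowTensor takes (indices, values, dense_shape);
# shapes and numbers are made up for illustration.
def _example_deduplicate():
    import numpy as np
    import mindspore as ms
    from mindspore import Tensor

    indices = Tensor(np.array([0, 2, 0]), ms.int32)  # index 0 appears twice
    values = Tensor(np.array([[1.0], [2.0], [3.0]]), ms.float32)
    grad = RowTensor(indices, values, (4, 1))
    deduped = rowtensor_deduplicate_indices_slices(grad)
    # deduped.indices -> [0, 2]; deduped.values -> [[4.0], [2.0]]
    return deduped
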
def _tensors_cast_datatype_with_sparse(datatype, grad):
    """
    Cast gradient to datatype.

    Args:
        datatype (mstype): The destination datatype of the gradient.
        grad (RowTensor): The gradient before the operation.

    Returns:
        RowTensor, the gradient after the operation.
    """
    dout = F.cast(grad.values, datatype)
    return RowTensor(grad.indices, dout, grad.dense_shape)

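# A minimal usage sketch (not part of the original source), assuming
# MindSpore 1.x: casting the values of a sparse gradient to float16, e.g.
# for mixed-precision training. Numbers are made up for illustration.
def _example_cast_sparse():
    import numpy as np
    import mindspore as ms
    from mindspore import Tensor

    grad = RowTensor(Tensor(np.array([0, 2]), ms.int32),
                     Tensor(np.array([[1.0], [2.0]]), ms.float32),
                     (4, 1))
    return _tensors_cast_datatype_with_sparse(ms.float16, grad)  # float16 values
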
def _tensor_apply_grad_centralization_with_sparse(if_apply, gradient):
    """Get grad with grad_centralization."""
    if if_apply:
        indices = gradient.indices
        shape = gradient.dense_shape
        grad_shape = F.shape(gradient)
        # Centralize over every axis except the first (the row axis).
        axis = []
        for i in range(1, len(grad_shape)):
            axis.append(i)
        if len(axis) >= 1:
            # Skip centralization when the second dimension is not a multiple
            # of 16 (a constraint of the underlying centralization primitive).
            if grad_shape[1] % 16 != 0:
                return gradient
            values = op_gc(gradient.values, axis)
            return RowTensor(indices, values, shape)
    return gradient

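# A minimal sketch (not part of the original source) of the axis construction
# above: centralization runs over every axis except 0, so a (8, 16, 3, 3)
# gradient is centralized over axes (1, 2, 3). NumPy used for clarity.
def _example_centralize_non_first_axes():
    import numpy as np

    grad = np.random.rand(8, 16, 3, 3)
    axis = tuple(range(1, grad.ndim))  # (1, 2, 3), mirroring the loop above
    centralized = grad - grad.mean(axis=axis, keepdims=True)
    return centralized  # per-row means are now (approximately) zero
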
def _tensors_allreduce_with_sparse(degree, mean, allgather, allreduce, allreduce_filter, grad):
    """
    Apply allgather on gradient instead of allreduce for sparse feature.
    Allgather is a communication operation used for distributed deep learning.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is True, the mean coefficient (degree) is applied to the gradients.
        allgather (Primitive): The communication operator for sparse gradients.
        allreduce (Primitive): The communication operator for gradients.
        allreduce_filter (bool): When it is True, allgather is applied.
        grad (RowTensor): The indices, gradient tensor and tensor_shape before the operation.

    Returns:
        RowTensor, the gradient after the operation.
    """
    if allreduce_filter:
        indices = allgather(grad.indices)
        dout = allgather(grad.values)
        if mean:
            # Scale by the mean coefficient to average over devices.
            dout = F.tensor_mul(dout, F.cast(degree, F.dtype(dout)))
        grad = RowTensor(indices, dout, grad.dense_shape)
    return grad

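# A minimal sketch (not part of the original source). Unlike the ps variant
# above, which computes 1.0 / degree inline, this variant multiplies by
# `degree` directly, so the caller is assumed to pass the coefficient already
# in reciprocal form (e.g. 1.0 / device_num). The numbers are made up.
def _example_mean_coefficient():
    device_num = 4
    degree = 1.0 / device_num  # assumed reciprocal form
    gathered_row = [4.0, 8.0, 12.0]  # a row of gathered values
    return [v * degree for v in gathered_row]  # [1.0, 2.0, 3.0]
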
def tensor_grad_scale_with_sparse(scale, grad):
    """Get grad with scale."""
    return RowTensor(grad.indices,
                     grad.values * F.cast(scale, F.dtype(grad.values)),
                     grad.dense_shape)

def tensor_grad_scale_with_sparse(scale, grad):
    """Get grad with scale."""
    # Short-circuit: a scale of 1.0 leaves the gradient unchanged.
    if scale == 1.0:
        return grad
    return RowTensor(grad.indices, grad.values * scale, grad.dense_shape)

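# A minimal sketch (not part of the original source) of how the scale is used
# to undo loss scaling: the optimizer typically receives the reciprocal of the
# loss scale, returning the scaled-up gradient to its true magnitude.
def _example_grad_unscale():
    loss_scale = 1024.0
    scaled_values = [10.24, 20.48]
    true_values = [v * (1.0 / loss_scale) for v in scaled_values]
    return true_values  # [0.01, 0.02]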