def _tensors_allreduce_ps(degree, mean, allgather, allreduce, allreduce_filter, grad, ps_parameter):
    """
    Apply allreduce on gradient.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is true, the mean coefficient (degree) is applied to the gradients.
        allgather (Primitive): The communication operator for sparse gradients.
        allreduce (Primitive): The communication operator for gradients.
        allreduce_filter (bool): When it is true, allreduce is applied.
        grad (Tensor): The gradient tensor before operation.
        ps_parameter (bool): Use parameter server or not.

    Returns:
        Tensor, the gradient tensor after operation.
    """
    if ps_parameter:
        return grad

    if allreduce_filter:
        grad = allreduce(grad)
        if mean:
            grad = F.tensor_mul(grad, F.cast(degree, F.dtype(grad)))
        return grad
    return grad
def _tensors_allreduce_with_sparse_ps(degree, mean, allgather, allreduce, allreduce_filter, grad, ps_parameter):
    """
    Apply allgather on gradient instead of allreduce for sparse feature.
    Allgather is a communication operation used for distributed deep learning.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is true, the mean coefficient (degree) is applied to the gradients.
        allgather (Primitive): The communication operator for sparse gradients.
        allreduce (Primitive): The communication operator for gradients.
        allreduce_filter (bool): When it is true, allgather is applied.
        grad (RowTensor): The sparse gradient (indices, values and dense_shape) before operation.
        ps_parameter (bool): Use parameter server or not.

    Returns:
        RowTensor, the gradient after operation.
    """
    if ps_parameter:
        return grad

    if allreduce_filter:
        indices = allgather(grad.indices)
        dout = allgather(grad.values)
        if mean:
            dout = F.tensor_mul(dout, F.cast(degree, F.dtype(dout)))
        grad = RowTensor(indices, dout, grad.dense_shape)
    return grad
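# For reference, a minimal sketch of the sparse-gradient container the helper
# above rebuilds. This assumes the MindSpore 1.x `RowTensor` API; the shapes
# and values below are illustrative only.
import numpy as np
from mindspore import Tensor, RowTensor
from mindspore import dtype as mstype

# A sparse gradient for a (4, 2) embedding table where only rows 0 and 2 changed.
indices = Tensor(np.array([0, 2]), mstype.int32)
values = Tensor(np.array([[1.0, 2.0], [3.0, 4.0]]), mstype.float32)
sparse_grad = RowTensor(indices, values, (4, 2))
# _tensors_allreduce_with_sparse_ps allgathers indices and values across
# devices and rebuilds a RowTensor with the same dense_shape.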
def tensor_grad_scale(scale, grad, accu_grad):
    # Scale the accumulated gradient by 1/scale, then zero the accumulator.
    new_grad = accu_grad * reciprocal(scale)
    zeros = F.tensor_mul(accu_grad, 0.0)
    clear = F.assign(accu_grad, zeros)
    # Order the side effects: read accu_grad into new_grad before clearing it,
    # and compute the incoming grad before the scaled result is produced.
    F.control_depend(new_grad, clear)
    F.control_depend(grad, new_grad)
    return new_grad
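# A usage sketch, not the surrounding source: per-tensor helpers such as
# tensor_grad_scale are typically registered on a MultitypeFuncGraph and mapped
# over gradient tuples with HyperMap. The simplified registration below (no
# accumulator argument) and the names `grads`/`scaled` are illustrative.
import numpy as np
from mindspore import Tensor
from mindspore import dtype as mstype
from mindspore.ops import composite as C
from mindspore.ops import functional as F
from mindspore.ops import operations as P

reciprocal = P.Reciprocal()
grad_scale = C.MultitypeFuncGraph("grad_scale")

@grad_scale.register("Tensor", "Tensor")
def _tensor_grad_scale(scale, grad):
    # Simplified variant: scale only, no accumulator clearing.
    return F.tensor_mul(grad, reciprocal(scale))

hyper_map = C.HyperMap()
scale = Tensor(2.0, mstype.float32)
grads = (Tensor(np.ones((2, 2)), mstype.float32),
         Tensor(np.ones((3,)), mstype.float32))
scaled = hyper_map(F.partial(grad_scale, scale), grads)  # each grad * 0.5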
def bprop(x, out, dout):
    if fusion == 0:
        dx = reduce_scatter(dout)
    else:
        grad = all_reduce(dout)
        dx = split(grad)[rank]
    if mean_flag:
        dx = F.tensor_mul(dx, scale)
    return (dx,)
def bprop(x, z, out, dout):
    if do_mirror:
        if mean_flag:
            z = F.depend(z, F.assign_add(z, dout))
            grad = all_reduce(z)
            dx = split(grad)[rank]
            dx = F.tensor_mul(dx, scale)
        else:
            z = F.depend(z, F.assign_add(z, dout))
            grad = all_reduce(z)
            dx = split(grad)[rank]
    else:
        dx = dout
    return (dx, zeros_like(z))
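# The two bprop bodies above rely on free variables (all_reduce, split, rank,
# mean_flag, scale, fusion, do_mirror) bound by an enclosing scope. A hedged
# sketch of such a closure for the first bprop; the factory name make_bprop
# and the default group are illustrative, not the original source.
from mindspore.communication import get_rank, get_group_size
from mindspore.ops import functional as F
from mindspore.ops import operations as P

def make_bprop(mean_flag, fusion, group="hccl_world_group"):
    all_reduce = P.AllReduce(group=group)
    reduce_scatter = P.ReduceScatter(group=group)
    rank = get_rank(group)
    dev_num = get_group_size(group)
    split = P.Split(axis=0, output_num=dev_num)
    scale = 1.0 / dev_num  # assumed: mean over the device group

    def bprop(x, out, dout):
        if fusion == 0:
            dx = reduce_scatter(dout)
        else:
            grad = all_reduce(dout)
            dx = split(grad)[rank]
        if mean_flag:
            dx = F.tensor_mul(dx, scale)
        return (dx,)

    return bprop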
def _tensors_allreduce(degree, mean, allgather, allreduce, allreduce_filter, grad):
    """
    Apply allreduce on gradient.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is true, the mean coefficient (degree) is applied to the gradients.
        allgather (Primitive): The communication operator for sparse gradients.
        allreduce (Primitive): The communication operator for gradients.
        allreduce_filter (bool): When it is true, allreduce is applied.
        grad (Tensor): The gradient tensor before operation.

    Returns:
        Tensor, the gradient tensor after operation.
    """
    if allreduce_filter:
        grad = allreduce(grad)
        if mean:
            degree = F.scalar_cast(degree, F.dtype(grad))
            grad = F.tensor_mul(grad, F.cast(F.scalar_to_array(1.0 / degree), F.dtype(grad)))
        return grad
    return grad
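# In practice the reduce helpers above are not called directly: they back
# nn.DistributedGradReducer, which maps them over the gradient tuple. A
# minimal train-step sketch assuming an initialized communication backend;
# the class name TrainOneStep is made up here.
import mindspore.nn as nn
from mindspore.communication import get_group_size
from mindspore.ops import composite as C
from mindspore.ops import functional as F

class TrainOneStep(nn.Cell):
    def __init__(self, network, optimizer):
        super(TrainOneStep, self).__init__()
        self.network = network
        self.optimizer = optimizer
        self.weights = optimizer.parameters
        self.grad = C.GradOperation(get_by_list=True)
        # mean=True exercises the `mean` branch of _tensors_allreduce.
        self.grad_reducer = nn.DistributedGradReducer(self.weights, mean=True,
                                                      degree=get_group_size())

    def construct(self, data, label):
        loss = self.network(data, label)
        grads = self.grad(self.network, self.weights)(data, label)
        grads = self.grad_reducer(grads)  # allreduce dense, allgather sparse
        return F.depend(loss, self.optimizer(grads))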
def construct(self, t1, t2):
    z = F.tensor_mul(t1, t2)
    return z
def construct(self, t):
    z = F.tensor_mul(t, self.f)
    return z
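# To round out the two construct snippets, a self-contained sketch of the
# enclosing cells; the class names MulTwoInputs and MulByFactor are made up.
import numpy as np
import mindspore.nn as nn
from mindspore import Tensor
from mindspore import dtype as mstype
from mindspore.ops import functional as F

class MulTwoInputs(nn.Cell):
    """Element-wise product of two inputs (first construct above)."""
    def construct(self, t1, t2):
        return F.tensor_mul(t1, t2)

class MulByFactor(nn.Cell):
    """Multiply the input by a factor stored on the cell (second construct above)."""
    def __init__(self, factor):
        super(MulByFactor, self).__init__()
        self.f = factor

    def construct(self, t):
        return F.tensor_mul(t, self.f)

x = Tensor(np.array([1.0, 2.0, 3.0]), mstype.float32)
print(MulTwoInputs()(x, x))   # [1. 4. 9.]
print(MulByFactor(2.0)(x))    # [2. 4. 6.]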