def construct(self, box1, box2):
    """
    box1: pred_box [batch, gx, gy, anchors, 1, 4] -> 4: [x_center, y_center, w, h]
    box2: gt_box [batch, 1, 1, 1, maxbox, 4]
    Convert both to topLeft and rightDown corners before computing IoU.
    """
    box1_xy = box1[:, :, :, :, :, :2]
    box1_wh = box1[:, :, :, :, :, 2:4]
    box1_mins = box1_xy - box1_wh / F.scalar_to_array(2.0)  # topLeft
    box1_maxs = box1_xy + box1_wh / F.scalar_to_array(2.0)  # rightDown

    box2_xy = box2[:, :, :, :, :, :2]
    box2_wh = box2[:, :, :, :, :, 2:4]
    box2_mins = box2_xy - box2_wh / F.scalar_to_array(2.0)
    box2_maxs = box2_xy + box2_wh / F.scalar_to_array(2.0)

    intersect_mins = self.max(box1_mins, box2_mins)
    intersect_maxs = self.min(box1_maxs, box2_maxs)
    intersect_wh = self.max(intersect_maxs - intersect_mins, F.scalar_to_array(0.0))

    # P.Squeeze: for efficient slice
    intersect_area = P.Squeeze(-1)(intersect_wh[:, :, :, :, :, 0:1]) * \
                     P.Squeeze(-1)(intersect_wh[:, :, :, :, :, 1:2])
    box1_area = P.Squeeze(-1)(box1_wh[:, :, :, :, :, 0:1]) * P.Squeeze(-1)(box1_wh[:, :, :, :, :, 1:2])
    box2_area = P.Squeeze(-1)(box2_wh[:, :, :, :, :, 0:1]) * P.Squeeze(-1)(box2_wh[:, :, :, :, :, 1:2])

    iou = intersect_area / (box1_area + box2_area - intersect_area)
    # iou: [batch, gx, gy, anchors, maxboxes]
    return iou
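# Illustrative only: a minimal NumPy sketch of the same IoU arithmetic on a single
# pair of [x_center, y_center, w, h] boxes, to make the corner conversion and
# intersection/union steps explicit. The names below are ours, not part of the cell above.
import numpy as np

def iou_xywh(b1, b2):
    b1, b2 = np.asarray(b1, np.float32), np.asarray(b2, np.float32)
    b1_min, b1_max = b1[:2] - b1[2:] / 2.0, b1[:2] + b1[2:] / 2.0  # topLeft / rightDown
    b2_min, b2_max = b2[:2] - b2[2:] / 2.0, b2[:2] + b2[2:] / 2.0
    inter_wh = np.maximum(np.minimum(b1_max, b2_max) - np.maximum(b1_min, b2_min), 0.0)
    inter = inter_wh[0] * inter_wh[1]
    union = b1[2] * b1[3] + b2[2] * b2[3] - inter
    return inter / union

# iou_xywh([0.5, 0.5, 1.0, 1.0], [1.0, 0.5, 1.0, 1.0]) -> 1/3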
def construct(self, logits, label):
    label_one_hot = self.one_hot(label, F.shape(logits)[-1], F.scalar_to_array(1.0),
                                 F.scalar_to_array(0.0))
    loss = self.reduce_sum(-1.0 * logits * label_one_hot, (1,))
    return self.get_loss(loss)
def construct(self, logit, label):
    '''construct'''
    mask = self.reshape(label, (F.shape(label)[0], F.shape(label)[1], 1))
    mask = self.cast(mask, mstype.float32)
    mask = mask + F.scalar_to_array(0.00001)
    mask = self.relu(mask) / (mask)
    logit = logit * mask

    exp = self.exp(logit)
    exp_sum = self.sum(exp, -1)
    exp_sum = self.reshape(exp_sum, (F.shape(exp_sum)[0], F.shape(exp_sum)[1], 1))
    softmax_result = self.log(exp / exp_sum + self.eps_const)
    one_hot_label = self.onehot(self.cast(label, mstype.int32), F.shape(logit)[2],
                                self.on_value, self.off_value)
    loss = (softmax_result * self.cast(one_hot_label, mstype.float32) *
            self.cast(F.scalar_to_array(-1), mstype.float32))
    loss = self.sum(loss, -1)
    loss = self.sum(loss, -1)
    loss = self.sum(loss, 0)
    return loss
def construct(self, box1, box2):
    box1_xy = box1[:, :, :, :, :, :2]
    box1_wh = box1[:, :, :, :, :, 2:4]
    box1_mins = box1_xy - box1_wh / F.scalar_to_array(2.0)
    box1_maxs = box1_xy + box1_wh / F.scalar_to_array(2.0)

    box2_xy = box2[:, :, :, :, :, :2]
    box2_wh = box2[:, :, :, :, :, 2:4]
    box2_mins = box2_xy - box2_wh / F.scalar_to_array(2.0)
    box2_maxs = box2_xy + box2_wh / F.scalar_to_array(2.0)

    intersect_mins = self.max(box1_mins, box2_mins)
    intersect_maxs = self.min(box1_maxs, box2_maxs)
    intersect_wh = self.max(intersect_maxs - intersect_mins, F.scalar_to_array(0.0))

    intersect_area = P.Squeeze(-1)(intersect_wh[:, :, :, :, :, 0:1]) * \
                     P.Squeeze(-1)(intersect_wh[:, :, :, :, :, 1:2])
    box1_area = P.Squeeze(-1)(box1_wh[:, :, :, :, :, 0:1]) * P.Squeeze(-1)(
        box1_wh[:, :, :, :, :, 1:2])
    box2_area = P.Squeeze(-1)(box2_wh[:, :, :, :, :, 0:1]) * P.Squeeze(-1)(
        box2_wh[:, :, :, :, :, 1:2])

    iou = intersect_area / (box1_area + box2_area - intersect_area)
    return iou
def construct(self, input_, label):
    input_n = self.normalize(input_)
    w = self.normalize2(self.weight)
    fc_o = self.fc(input_n, w)
    fc_o_shape = F.shape(fc_o)
    one_hot_float = self.onehot(label, fc_o_shape[1], self.on_value, self.off_value)
    local_label = self.cast(one_hot_float, mstype.int32)

    exp_o = self.exp(fc_o)
    mul_const_o = self.mul_const(self.a_const, exp_o)
    mul_const2_o = self.mul_const2(self.b_const, mul_const_o)
    exp2_o = self.exp2(mul_const2_o)
    mul_const3_o = self.mul_const3(exp2_o, self.c_const)
    mul_const4_o = self.mul_const4(F.scalar_to_array(1), local_label)
    mul6_o = self.mul6(self.mul(mul_const3_o, one_hot_float),
                       self.mul2(fc_o, self.cast2(mul_const4_o, mstype.float32)))
    mul_const5_o = self.mul_const5(mul6_o, self.d_const)

    max_o = self.reduce_max(mul_const5_o, -1)
    mul4_o = self.mul4(mul_const5_o, max_o)
    exp3_o = self.exp3(mul4_o)
    sum_o = self.reduce_sum(exp3_o, -1)
    reshape_o = self.reshape(sum_o, (F.shape(sum_o)[0], 1))
    mul5_o = self.mul5(exp3_o, reshape_o)
    log_o = self.log(self.mul9(mul5_o, self.e_const))
    mul3_o = self.mul3(log_o, one_hot_float)
    mul7_o = self.mul7(mul3_o, self.cast3(F.scalar_to_array(-1), mstype.float32))
    sum2_o = self.reduce_sum_2(mul7_o, -1)
    loss = self.mul8(self.reduce_sum_3(sum2_o, -1),
                     self.cast4(F.scalar_to_array(F.shape(mul_const5_o)[0]), mstype.float32))
    return loss
def bprop(x, out, dout):
    if mean_flag:
        if F.issubclass_(F.typeof(dout), mstype.tensor):
            dx = all_reduce(dout)
            float_one = F.scalar_cast(1.0, F.dtype(dx))
            num = F.scalar_cast(dev_num, F.dtype(dx))
            dx = mul(dx, cast(F.scalar_to_array(float_one / num), F.dtype(dx)))
        else:
            indices = all_gather(dout.indices)
            grad = all_gather(dout.values)
            float_one = F.scalar_cast(1.0, F.dtype(grad))
            num = F.scalar_cast(dev_num, F.dtype(grad))
            grad = mul(grad, cast(F.scalar_to_array(float_one / num), F.dtype(grad)))
            dx = RowTensor(indices, grad, dout.dense_shape)
    else:
        if F.issubclass_(F.typeof(dout), mstype.tensor):
            dx = all_reduce(dout)
        else:
            indices = all_gather(dout.indices)
            grad = all_gather(dout.values)
            dx = RowTensor(indices, grad, dout.dense_shape)
    return (dx,)
def construct(self, logits, label):
    label_one_hot = self.one_hot(label, F.shape(logits)[-1], F.scalar_to_array(1.0),
                                 F.scalar_to_array(0.0))
    # print('NLLLoss label_one_hot:', label_one_hot, label_one_hot.shape)
    # print('NLLLoss logits:', logits, logits.shape)
    # print('xxx:', logits * label_one_hot)
    loss = self.reduce_sum(-1.0 * logits * label_one_hot, (1,))
    return self.get_loss(loss)
def bprop(x, out, dout):
    if F.issubclass_(F.typeof(dout), mstype.tensor):
        if F.issubclass_(F.dtype(dout), mstype.bool_):
            return (dout,)
        dx = op(dout, cast(F.scalar_to_array(divisor), dtype(dout)))
        return (dx,)

    dx = ()
    input_nums = F.tuple_len(dout)
    for i in range(input_nums):
        ele_grad = op(dout[i], cast(F.scalar_to_array(divisor), dtype(dout[i])))
        dx = dx + (ele_grad,)
    return (dx,)
def bprop(x, z, out, dout):
    if mean_flag:
        if F.issubclass_(F.typeof(dout), mstype.tensor):
            if do_mirror:
                z = F.depend(z, F.assign_add(z, dout))
                real_grad = all_reduce(z)
                dx = real_grad
            else:
                dx = dout
            float_one = F.scalar_cast(1.0, F.dtype(dx))
            num = F.scalar_cast(dev_num, F.dtype(dx))
            dx = mul(dx, cast(F.scalar_to_array(float_one / num), F.dtype(dx)))
        else:
            dx = zeros_like(x)  # Gradient accumulation does not support row tensors yet.
    else:
        if F.issubclass_(F.typeof(dout), mstype.tensor):
            if do_mirror:
                z = F.depend(z, F.assign_add(z, dout))
                real_grad = all_reduce(z)
                dx = real_grad
            else:
                dx = dout
        else:
            dx = zeros_like(x)  # Gradient accumulation does not support row tensors yet.
    return (dx, zeros_like(z))
def construct(self, x):
    alpha_array = P.Cast()(F.scalar_to_array(self.alpha), P.DType()(x))
    if self.alpha <= 1:
        out = P.Maximum()(alpha_array * x, x)
    else:
        out = P.Minimum()(alpha_array * x, x)
    return out
def _tensors_allreduce_ps(degree, mean, allgather, allreduce, allreduce_filter, grad, ps_parameter):
    """
    Apply allreduce on gradient.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is true, the mean coefficient (degree) would apply on gradients.
        allgather (Primitive): The communication operator for sparse gradients.
        allreduce (Primitive): The communication operator for gradients.
        allreduce_filter (bool): When it is true, allreduce would apply.
        grad (Tensor): The gradient tensor before operation.
        ps_parameter (bool): Use parameter server or not.

    Returns:
        Tensor, the gradient tensor after operation.
    """
    if ps_parameter:
        return grad

    if allreduce_filter:
        grad = allreduce(grad)
        if mean:
            degree = F.scalar_cast(degree, F.dtype(grad))
            cast_op = P.Cast()
            mul_op = P.Mul()
            grad = mul_op(grad, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(grad)))
        return grad
    return grad
def construct(self, x, label):
    '''construct'''
    x = self.normalize(x)
    w = self.normalize(self.weight)
    cosine = self.fc(x, w)
    cosine_shape = F.shape(cosine)

    one_hot_float = self.onehot(self.cast(label, mstype.int32), cosine_shape[1],
                                self.on_value, self.off_value)
    one_hot_float = self.cast(one_hot_float, mstype.float16)
    if self.m == 0 and self.a == 1:
        one_hot_float = one_hot_float * self.b_const
        output = cosine - one_hot_float
        output = output * self.s_const
    else:
        theta = self.acos(cosine)
        theta = self.a_const * theta
        theta = self.m_const + theta
        body = self.cos(theta)
        body = body - self.b_const
        cos_mask = self.cast(F.scalar_to_array(1.0), mstype.float16) - one_hot_float
        output = body * one_hot_float + cosine * cos_mask
        output = output * self.s_const
    return output
def tensor_grad_scale(scale, grad):
    """Get grad with scale."""
    if scale == 1.0:
        return grad
    cast_op = P.Cast()
    type_op = P.DType()
    return grad * cast_op(F.scalar_to_array(scale), type_op(grad))
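# Usage sketch (an assumption, not taken from this source): a scaler like this is
# typically mapped over the whole gradient tuple with HyperMap and F.partial, the
# same pattern the training cells below use. `reciprocal_scale` and `grads` stand
# in for whatever the surrounding cell defines:
#
#     hyper_map = C.HyperMap()
#     grads = hyper_map(F.partial(tensor_grad_scale, reciprocal_scale), grads)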
def _tensors_allreduce_with_sparse(degree, mean, allgather, allreduce_filter, grad, allreduce):
    """
    Apply allgather on gradient instead of allreduce for sparse feature.
    Allgather is a communication operation used for distributed deep learning.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is true, the mean coefficient (degree) would apply on gradients.
        allgather (Primitive): The communication operator for sparse gradients.
        allreduce_filter (bool): When it is true, allgather would apply.
        grad (IndexedSlices): The gradient before operation.
        allreduce (Primitive): The communication operator for gradients.

    Returns:
        IndexedSlices, the gradient after operation.
    """
    if allreduce_filter:
        indices = allgather(grad.indices())
        dout = allgather(grad.values())
        if mean:
            degree = F.scalar_cast(degree, F.dtype(grad.values()))
            cast_op = P.Cast()
            mul_op = P.Mul()
            dout = mul_op(dout, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(dout)))
        grad = IndexedSlices(indices, dout, grad.dense_shape())
    return grad
def _tensors_allreduce_with_sparse_ps(degree, mean, allgather, allreduce, allreduce_filter, grad, ps_parameter):
    """
    Apply allgather on gradient instead of allreduce for sparse feature.
    Allgather is a communication operation used for distributed deep learning.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is true, the mean coefficient (degree) would apply on gradients.
        allgather (Primitive): The communication operator for sparse gradients.
        allreduce (Primitive): The communication operator for gradients.
        allreduce_filter (bool): When it is true, allgather would apply.
        grad (RowTensor): The indices, gradient tensor and tensor_shape before operation.
        ps_parameter (bool): Use parameter server or not.

    Returns:
        RowTensor, the gradient after operation.
    """
    if ps_parameter:
        return grad

    if allreduce_filter:
        indices = allgather(grad.indices)
        dout = allgather(grad.values)
        if mean:
            degree = F.scalar_cast(degree, F.dtype(grad.values))
            cast_op = P.Cast()
            mul_op = P.Mul()
            dout = mul_op(dout, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(dout)))
        grad = RowTensor(indices, dout, grad.dense_shape)
    return grad
def construct(self, logit, label):
    '''construct'''
    exp = self.exp(logit)
    exp_sum = self.sum(exp, -1)
    exp_sum = self.reshape(exp_sum, (F.shape(exp_sum)[0], 1))
    softmax_result = exp / exp_sum
    one_hot_label = self.onehot(self.cast(label, mstype.int32), F.shape(logit)[1],
                                self.on_value, self.off_value)
    loss = self.sum((self.log(softmax_result + self.eps_const) *
                     self.cast(one_hot_label, mstype.float32) *
                     self.cast(F.scalar_to_array(-1), mstype.float32)), -1)
    batch_size = F.shape(logit)[0]
    batch_size_tensor = self.cast(F.scalar_to_array(batch_size), mstype.float32)
    loss = self.sum(loss, -1) / batch_size_tensor
    return loss
def bprop(x, out, dout):
    if mean_flag:
        dx = all_reduce(dout)
        float_one = F.scalar_cast(1.0, F.dtype(dx))
        num = F.scalar_cast(dev_num, F.dtype(dx))
        dx = mul(dx, cast(F.scalar_to_array(float_one / num), F.dtype(dx)))
    else:
        dx = all_reduce(dout)
    return (dx,)
def bprop(x, y, z, out, dout):
    do_mirror = equal(y, grad_accumulation_step)
    do_mirror = reshape(do_mirror, (()))
    if mean_flag:
        if F.issubclass_(F.typeof(dout), mstype.tensor):
            if do_mirror:
                tmp = z + dout
                real_grad = all_reduce(tmp)
                dx = real_grad - z
            else:
                dx = dout
            float_one = F.scalar_cast(1.0, F.dtype(dx))
            num = F.scalar_cast(dev_num, F.dtype(dx))
            dx = mul(dx, cast(F.scalar_to_array(float_one / num), F.dtype(dx)))
        else:
            if do_mirror:
                indices = all_gather(dout.indices)
                grad = all_gather(dout.values)
            else:
                indices = dout.indices
                grad = dout.values
            float_one = F.scalar_cast(1.0, F.dtype(grad))
            num = F.scalar_cast(dev_num, F.dtype(grad))
            grad = mul(grad, cast(F.scalar_to_array(float_one / num), F.dtype(grad)))
            dx = RowTensor(indices, grad, dout.dense_shape)
    else:
        if F.issubclass_(F.typeof(dout), mstype.tensor):
            if do_mirror:
                tmp = z + dout
                real_grad = all_reduce(tmp)
                dx = real_grad - z
            else:
                dx = dout
        else:
            if do_mirror:
                indices = all_gather(dout.indices)
                grad = all_gather(dout.values)
            else:
                indices = dout.indices
                grad = dout.values
            dx = RowTensor(indices, grad, dout.dense_shape)
    return (dx, zeros_like(y), zeros_like(z))
def construct(self, x, label):
    mask = self.reshape(label, (F.shape(label)[0], 1))
    mask = self.cast(mask, mstype.float32)
    mask = mask + F.scalar_to_array(0.00001)
    mask = self.relu(mask) / (mask)
    x = x * mask
    one_hot_label = self.onehot(self.cast(label, mstype.int32), F.shape(x)[1],
                                self.on_value, self.off_value)
    loss = self.ce(x, one_hot_label)
    loss = self.sum(loss, 0)
    return loss
def construct(self, *args):
    weights = self.weights
    loss = self.network(*args)
    sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
    grads = self.grad(self.network, weights)(*args, sens)
    if self.reducer_flag:
        # apply grad reducer on grads
        grads = self.grad_reducer(grads)
    if self.use_global_norm:
        grads = self.hyper_map(F.partial(grad_scale, F.scalar_to_array(self.sens)), grads)
        grads = C.clip_by_global_norm(grads)
    return F.depend(loss, self.optimizer(grads))
def construct(self, logit, label):
    logit_max = self.reduce_max(logit, -1)
    exp = self.exp(self.sub(logit, logit_max))
    exp_sum = self.reduce_sum(exp, -1)
    softmax_result = self.div(exp, exp_sum)
    if self.sparse:
        label = self.onehot(label, F.shape(logit)[1], self.on_value, self.off_value)
    softmax_result_log = self.log(softmax_result)
    loss = self.sum_cross_entropy((self.mul(softmax_result_log, label)), -1)
    loss = self.mul2(F.scalar_to_array(-1.0), loss)
    loss = self.reduce_mean(loss, -1)
    return loss
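# Illustrative only: a NumPy sketch of the same numerically stable softmax cross-entropy
# (subtract the row max before exponentiating), assuming a dense one-hot label.
# The function and variable names below are ours, not part of the cell above.
import numpy as np

def softmax_cross_entropy(logits, one_hot):
    shifted = logits - logits.max(axis=-1, keepdims=True)          # stabilise exp
    softmax = np.exp(shifted) / np.exp(shifted).sum(axis=-1, keepdims=True)
    per_sample = -(np.log(softmax) * one_hot).sum(axis=-1)         # cross-entropy per row
    return per_sample.mean()                                       # mean over the batch

# softmax_cross_entropy(np.array([[2.0, 1.0, 0.1]]), np.array([[1.0, 0.0, 0.0]])) -> ~0.417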
def construct(self, gradients):
    params = self.parameters
    if self.dynamic_lr:
        lr = self.gather(self.learning_rate, self.global_step, self.axis)
        F.control_depend(lr, self.assignadd(self.global_step, 1))
    else:
        lr = F.scalar_to_array(self.learning_rate)
    if self.reciprocal_scale != 1.0:
        gradients = self.hyper_map(F.partial(grad_scale, self.reciprocal_scale), gradients)

    grad_t = self.hyper_map(F.partial(lars_opt, self.lars, self.weight_decay, lr),
                            gradients, params, self.decay_flag, self.lars_flag)
    success = self.opt(grad_t)
    return success
def construct(self, x, label):
    '''Construct function.'''
    w = self.normalize(self.weight)
    cosine = self.fc(self.cast(x, mstype.float16), self.cast(w, mstype.float16))
    cosine = self.cast(cosine, mstype.float32)
    cosine_shape = F.shape(cosine)

    one_hot_float = self.onehot(self.cast(label, mstype.int32), cosine_shape[1],
                                self.on_value, self.off_value)
    theta = self.acos(cosine)
    theta = self.a_const * theta
    theta = self.m_const + theta
    body = self.cos(theta)
    body = body - self.b_const
    cos_mask = F.scalar_to_array(1.0) - one_hot_float
    output = body * one_hot_float + cosine * cos_mask
    output = output * self.s_const
    return output, cosine
def _tensors_allreduce_mean(mul, degree, allreduce, parameters):
    """
    Apply allreduce on parameters.

    Args:
        mul (Primitive): The mul operator for parameters.
        degree (int): The mean coefficient.
        allreduce (Primitive): The communication operator for parameters.
        parameters (Tensor): The parameters before operation.

    Returns:
        Tensor, the parameters after operation.
    """
    degree = F.scalar_cast(degree, F.dtype(parameters))
    parameters = allreduce(parameters)
    cast_op = P.Cast()
    return mul(parameters, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(parameters)))
def _tensors_allreduce_mean(mul, degree, allreduce_filter, grad):
    """
    Apply mean and allreduce on gradient.
    Allreduce is a communication operation used for distributed deep learning.

    Args:
        mul (Primitive): The mul operator for gradients.
        degree (int): The mean coefficient.
        allreduce_filter (bool): When it is true, allreduce would apply.
        grad (Tensor): The gradient tensor before operation.

    Returns:
        Tensor, the gradient tensor after operation.
    """
    if allreduce_filter:
        degree = F.scalar_cast(degree, F.dtype(grad))
        grad = _all_reduce(grad)
        cast_op = P.Cast()
        return mul(grad, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(grad)))
    return grad
def _tensors_allreduce_mean_with_sparse(mul, degree, allreduce_filter, grad):
    """
    Apply mean and allgather on gradient instead of allreduce for sparse feature.
    Allgather is a communication operation used for distributed deep learning.

    Args:
        mul (Primitive): The mul operator for gradients.
        degree (int): The mean coefficient.
        allreduce_filter (bool): When it is true, allgather would apply.
        grad (Tuple): The indices, gradient tensor and tensor_shape before operation.

    Returns:
        Tuple, include indices, the gradient tensor and tensor_shape after operation.
    """
    if allreduce_filter:
        indices = _all_gather(grad[0])
        degree = F.scalar_cast(degree, F.dtype(grad[1]))
        dout = _all_gather(grad[1])
        cast_op = P.Cast()
        dout = mul(dout, cast_op(F.scalar_to_array(1.0 / degree), F.dtype(dout)))
        grad = (indices, dout, grad[2])
    return grad
def _tensors_allreduce(degree, mean, allgather, allreduce, allreduce_filter, grad):
    """
    Apply allreduce on gradient.

    Args:
        degree (int): The mean coefficient.
        mean (bool): When mean is true, the mean coefficient (degree) would apply on gradients.
        allgather (Primitive): The communication operator for sparse gradients.
        allreduce (Primitive): The communication operator for gradients.
        allreduce_filter (bool): When it is true, allreduce would apply.
        grad (Tensor): The gradient tensor before operation.

    Returns:
        Tensor, the gradient tensor after operation.
    """
    if allreduce_filter:
        grad = allreduce(grad)
        if mean:
            degree = F.scalar_cast(degree, F.dtype(grad))
            grad = F.tensor_mul(grad, F.cast(F.scalar_to_array(1.0 / degree), F.dtype(grad)))
        return grad
    return grad
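# Illustrative only: in NumPy terms, the mean branch above is "allreduce-sum the
# per-device gradients, then multiply by 1/degree", i.e. the element-wise mean
# across devices. The per-device values below are made up for the demonstration.
import numpy as np

per_device_grads = [np.array([1.0, 2.0]), np.array([3.0, 4.0])]   # degree == 2
summed = np.sum(per_device_grads, axis=0)                         # what AllReduce(sum) returns
mean_grad = summed * (1.0 / len(per_device_grads))                # mean_grad == array([2., 3.])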
def construct(self, grads):
    square_sum = self.hyper_map(get_square_sum, grads)
    global_norms = F.sqrt(F.addn(square_sum) / F.scalar_to_array(len(square_sum)))
    return global_norms
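# Illustrative only: a NumPy transcription of the quantity computed above, assuming
# get_square_sum (not shown here) returns the sum of squared elements of each gradient
# tensor. Note this divides by the number of gradient tensors, not the number of elements.
import numpy as np

def global_norm_like(grads):
    square_sums = [np.sum(np.square(g)) for g in grads]     # get_square_sum per tensor
    return np.sqrt(np.sum(square_sums) / len(square_sums))  # sqrt(addn(...) / len(...))

# global_norm_like([np.array([3.0, 4.0]), np.array([0.0])]) -> sqrt(25 / 2) ~ 3.54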
def construct(self, x):
    one = self.cast(F.scalar_to_array(1.0), mstype.float32)
    out = x * one
    ret = self.reduce(out)
    return ret
def construct(self, x, y, bias):
    out = self.fc_nobias(x, y)
    out = self.reduce_sum(out, (0, 1))
    out = self.mul(out, F.scalar_to_array(2.0))
    return out