def construct(self, grads, clip_type, clip_value):
        """Clip every gradient in ``grads`` by value or by norm.

        Args:
            grads: Iterable of gradients; each one is clipped independently.
            clip_type: 0 selects ``C.clip_by_value``, 1 selects
                ``self.clip_by_norm``. Any other value disables clipping.
            clip_value: Clipping threshold. For type 0 the allowed range is
                ``[-clip_value, clip_value]``; for type 1 it is handed to
                ``self.clip_by_norm``.

        Returns:
            ``grads`` itself when ``clip_type`` is neither 0 nor 1, otherwise
            a new tuple with the clipped gradients in their original order.
        """
        if not (clip_type == 0 or clip_type == 1):
            return grads

        clipped = ()
        for g in grads:
            # Bounds are cast to the dtype of the gradient they are applied to.
            g_dtype = self.dtype(g)
            if clip_type == 0:
                lower = self.cast(F.tuple_to_array((-clip_value, )), g_dtype)
                upper = self.cast(F.tuple_to_array((clip_value, )), g_dtype)
                g_clipped = C.clip_by_value(g, lower, upper)
            else:
                limit = self.cast(F.tuple_to_array((clip_value, )), g_dtype)
                g_clipped = self.clip_by_norm(g, limit)
            clipped = clipped + (g_clipped, )

        return clipped
Esempio n. 2
0
    def construct(self):
        """Build the clipped, shifted matrix of pairwise relative positions.

        Returns:
            The result of subtracting the column-index grid from the row-index
            grid (both of shape ``(self._length, self._length)``), clipped to
            ``[self._min_relative_position, self._max_relative_position]`` and
            then offset by ``self._max_relative_position``.
        """
        length = self._length
        square = (length, length)

        # 0 .. length-1 as an int32 vector, plus the same values passed through
        # self.range_mat with target shape (length, -1).
        positions = self.cast(F.tuple_to_array(F.make_range(length)), mstype.int32)
        positions_col = self.range_mat(positions, (length, -1))

        # Tile both forms, then bring each to a (length, length) grid.
        tiled_rows = self.tile(positions, (length,))
        tiled_cols = self.tile(positions_col, (1, length))
        row_grid = self.range_mat(tiled_rows, square)
        col_grid = self.range_mat(tiled_cols, square)

        offsets = self.sub(row_grid, col_grid)
        clipped_offsets = C.clip_by_value(offsets,
                                          self._min_relative_position,
                                          self._max_relative_position)

        # Shift so the smallest value is >= 0 (assumes the minimum is
        # -self._max_relative_position -- TODO confirm). Each integer still
        # uniquely identifies a relative position difference.
        return clipped_offsets + self._max_relative_position
Esempio n. 3
0
    def construct(self, grads, clip_type, clip_value):
        """Clip gradients by value (``clip_type`` 0) or by norm (``clip_type`` 1).

        Any other ``clip_type`` returns ``grads`` untouched. Otherwise a new
        tuple is returned whose entries are the clipped gradients, with the
        clipping bound cast to each gradient's dtype.
        """
        if clip_type not in (0, 1):
            return grads

        result = ()
        if clip_type == 0:
            # Element-wise clamp into [-clip_value, clip_value].
            for grad in grads:
                grad_dtype = self.dtype(grad)
                low = self.cast(F.tuple_to_array((-clip_value, )), grad_dtype)
                high = self.cast(F.tuple_to_array((clip_value, )), grad_dtype)
                result = result + (C.clip_by_value(grad, low, high), )
        else:
            # clip_value is handed to self.clip_by_norm as its threshold.
            for grad in grads:
                grad_dtype = self.dtype(grad)
                threshold = self.cast(F.tuple_to_array((clip_value, )), grad_dtype)
                result = result + (self.clip_by_norm(grad, threshold), )

        return result
Esempio n. 4
0
    def construct(self, grads, clip_type, clip_value):
        """
        Clip each gradient by norm (type 1) or by value (type 0).

        ``grads`` is returned unchanged for any other ``clip_type``. Unlike
        a dtype-aware variant, the bounds built from ``clip_value`` are used
        as produced by ``F.tuple_to_array`` without any cast.
        """
        # pylint: disable=consider-using-in
        if clip_type != 0 and clip_type != 1:
            return grads

        clipped = ()
        for grad in grads:
            # After the guard above clip_type is either 0 or 1.
            if clip_type == 1:
                piece = self.clip_by_norm(grad, F.tuple_to_array((clip_value,)))
            else:
                lower = F.tuple_to_array((-clip_value,))
                upper = F.tuple_to_array((clip_value,))
                piece = C.clip_by_value(grad, lower, upper)
            clipped = clipped + (piece,)

        return clipped
def _clip_grad(clip_type, clip_value, grad):
    """
    Clip gradients.

    Inputs:
        clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
        clip_value (float): Specifies how much to clip.
        grad (tuple[Tensor]): Gradients.

    Outputs:
        tuple[Tensor], clipped gradients.
    """
    if clip_type not in (0, 1):
        return grad
    dt = F.dtype(grad)
    if clip_type == 0:
        new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt),
                                   F.cast(F.tuple_to_array((clip_value,)), dt))
    else:
        new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
    return new_grad
Esempio n. 6
0
 def construct(self, box_p, box_gt):
     """Compute the Generalized IoU (GIoU) of predicted and ground-truth boxes.

     Args:
         box_p: Predicted boxes; the last axis is indexed as
             ``(x_min, y_min, x_max, y_max)`` at positions 0..3.
         box_gt: Ground-truth boxes with the same last-axis layout.

     Returns:
         ``iou - (c_area - union) / c_area`` clipped to ``[-1.0, 1.0]``,
         where ``c_area`` is the area of the smallest box enclosing both
         inputs. The last axis has size 1 because every coordinate is taken
         with a length-1 slice.
     """
     box_p_area = (box_p[..., 2:3] - box_p[..., 0:1]) * (box_p[..., 3:4] - box_p[..., 1:2])
     box_gt_area = (box_gt[..., 2:3] - box_gt[..., 0:1]) * (box_gt[..., 3:4] - box_gt[..., 1:2])

     # Corners of the overlap rectangle.
     x_1 = self.max(box_p[..., 0:1], box_gt[..., 0:1])
     x_2 = self.min(box_p[..., 2:3], box_gt[..., 2:3])
     y_1 = self.max(box_p[..., 1:2], box_gt[..., 1:2])
     y_2 = self.min(box_p[..., 3:4], box_gt[..., 3:4])

     # Clamp the overlap extents at zero. Without this, boxes that are
     # disjoint on both axes give two negative extents whose product is a
     # spurious positive intersection, and boxes disjoint on a single axis
     # give a negative intersection.
     # Assumes self.max is an element-wise maximum that accepts a scalar
     # operand (e.g. P.Maximum) -- TODO confirm in __init__.
     inter_w = self.max(x_2 - x_1, 0.0)
     inter_h = self.max(y_2 - y_1, 0.0)
     intersection = inter_h * inter_w

     # Smallest axis-aligned box enclosing both inputs.
     xc_1 = self.min(box_p[..., 0:1], box_gt[..., 0:1])
     xc_2 = self.max(box_p[..., 2:3], box_gt[..., 2:3])
     yc_1 = self.min(box_p[..., 1:2], box_gt[..., 1:2])
     yc_2 = self.max(box_p[..., 3:4], box_gt[..., 3:4])
     c_area = (xc_2 - xc_1) * (yc_2 - yc_1)

     # self.eps keeps both divisions away from a zero denominator.
     union = box_p_area + box_gt_area - intersection
     union = union + self.eps
     c_area = c_area + self.eps

     iou = self.div(self.cast(intersection, ms.float32), self.cast(union, ms.float32))
     res_mid0 = c_area - union
     res_mid1 = self.div(self.cast(res_mid0, ms.float32), self.cast(c_area, ms.float32))
     giou = iou - res_mid1
     giou = C.clip_by_value(giou, -1.0, 1.0)
     return giou
Esempio n. 7
0
def _update_run_op(beta1, beta2, eps, global_step, lr, weight_decay, param, m,
                   v, gradient, decay_flag, optim_filter):
    """
    Update one parameter and its two moment buffers in place.

    The step computes bias-corrected first/second moment estimates, scales the
    resulting update by a clipped ``w_norm / g_norm_hat`` trust ratio and the
    learning rate, and writes the new ``param``, ``m`` and ``v`` back with
    ``F.assign``. (This looks like a LAMB-style optimizer step; confirm
    against the enclosing optimizer.) ``num_one`` is a module-level name
    defined outside this function.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimations. Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimations. Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0.
        global_step (Tensor): Global step; ``global_step + num_one`` is the exponent used for bias correction.
        lr (Tensor): Learning rate.
        weight_decay (Number): Weight decay. Should be equal to or greater than 0.
        param (Tensor): Parameters.
        m (Tensor): m value of parameters.
        v (Tensor): v value of parameters.
        gradient (Tensor): Gradient of parameters.
        decay_flag (bool): Specifies whether param update with weight decay.
        optim_filter(bool): Applies parameter update or not.

    Returns:
        Tensor, the updated parameter cast to the dtype of ``param`` when
        ``optim_filter`` is true. When it is false, ``gradient`` is returned
        unchanged and nothing is assigned.
    """
    if optim_filter:
        op_mul = P.Mul()
        op_sqrt = P.Sqrt()
        op_rsqrt = P.Rsqrt()
        op_square = P.Square()
        op_cast = P.Cast()
        op_reshape = P.Reshape()
        op_shape = P.Shape()
        op_pow = P.Pow()
        op_norm = layer.Norm()
        op_select = P.Select()
        op_greater = P.Greater()
        op_fill = P.Fill()
        op_dtype = P.DType()

        # All arithmetic below is done in float32, whatever the stored dtypes.
        param_fp32 = op_cast(param, mstype.float32)
        m_fp32 = op_cast(m, mstype.float32)
        v_fp32 = op_cast(v, mstype.float32)
        gradient_fp32 = op_cast(gradient, mstype.float32)

        # Exponential moving averages of the gradient and the squared gradient.
        next_m = op_mul(beta1, m_fp32) + op_mul(
            op_cast(num_one, mstype.float32) - beta1, gradient_fp32)

        next_v = op_mul(beta2, v_fp32) + op_mul(
            op_cast(num_one, mstype.float32) - beta2, op_square(gradient_fp32))

        # Bias correction: divide by (1 - beta ** (global_step + 1)).
        next_mm = next_m / (op_cast(num_one, mstype.float32) - op_pow(
            beta1, op_cast(global_step + num_one, mstype.float32)))
        next_vv = next_v / (op_cast(num_one, mstype.float32) - op_pow(
            beta2, op_cast(global_step + num_one, mstype.float32)))
        w_norm = op_norm(param_fp32)
        g_norm = op_norm(gradient_fp32)

        # NOTE(review): g_norm_hat uses rsqrt(next_vv + eps) and always adds the
        # weight-decay term, whereas `update` below uses 1 / (sqrt(next_vv) + eps)
        # and adds weight decay only when decay_flag is set. Confirm this
        # difference is intended.
        g_norm_hat = op_norm(
            op_mul(next_mm, op_rsqrt(next_vv + eps)) +
            weight_decay * param_fp32)
        zeros = F.zeros_like(w_norm)
        ones = op_fill(op_dtype(w_norm), op_shape(w_norm), 1.0)
        # Trust ratio is w_norm / g_norm_hat only when both w_norm and g_norm
        # are greater than zero; otherwise it falls back to 1.
        trust_ratio = op_select(
            op_greater(w_norm, zeros),
            op_select(op_greater(g_norm, zeros), w_norm / g_norm_hat, ones),
            ones)
        # Clamp the trust ratio into [0, 10].
        tens = op_fill(op_dtype(trust_ratio), op_shape(trust_ratio), 10.0)
        trust_ratio = C.clip_by_value(trust_ratio, zeros, tens)
        update = next_mm / (op_sqrt(next_vv) + eps)

        if decay_flag:
            update = update + op_mul(weight_decay, param_fp32)

        update_with_lr = op_mul(op_mul(trust_ratio, lr), update)

        next_param = param_fp32 - op_reshape(update_with_lr,
                                             op_shape(param_fp32))

        # Chain the three in-place assignments onto next_param with F.depend
        # so they are tied to the returned value. Note that the uncorrected
        # next_m / next_v are stored, not the bias-corrected next_mm / next_vv.
        next_param = F.depend(
            next_param, F.assign(param, op_cast(next_param, F.dtype(param))))
        next_param = F.depend(next_param,
                              F.assign(m, op_cast(next_m, F.dtype(m))))
        next_param = F.depend(next_param,
                              F.assign(v, op_cast(next_v, F.dtype(v))))

        return op_cast(next_param, F.dtype(param))
    # Filtered out: no update is applied and the gradient is passed through.
    return gradient
Esempio n. 8
0
def hard_swish(x):
    """Return ``x * clip(x + 3, 0, 6) / 6`` with ``x`` first cast to float32."""
    x_fp32 = P.Cast()(x, ms.float32)
    # Piecewise-linear gate: (x + 3) clamped into [0, 6], then scaled by 1/6.
    gate = clip_by_value(x_fp32 + 3.0, 0.0, 6.0) / 6.0
    return x_fp32 * gate
Esempio n. 9
0
    def construct(self, enc_states, enc_attention_mask):
        """
        Run beam-search decoding over the encoded source sentence.

        Inputs:
            enc_states (Tensor): Output of transformer encoder with shape (batch_size * beam_width, T, D).
            enc_attention_mask (Tensor): encoder attention mask with shape (batch_size * beam_width, T).

        Returns:
            Tensor, predicted ids: the first entry (slice ``0:1`` on axis 1)
            after re-ordering the decoded sequences by ``self.topk`` of the
            penalised scores, cut to at most ``max_decode_length`` positions
            on the last axis.
        """
        # initial beam-search state
        cur_input_ids = self.start_ids
        state_log_probs = self.init_scores
        state_seq = self.init_seq
        state_finished = self.init_finished
        state_length = self.init_length
        decoder_hidden_state = self.decoder_hidden_state
        accu_attn_scores = self.accu_attn_scores

        if self.is_using_while:
            # At present, only ascend910 supports while operation.
            idx = self.start + 1
            ends = self.start + self.max_decode_length + 1
            while idx < ends:
                (cur_input_ids, state_log_probs, state_seq, state_length,
                 decoder_hidden_state, accu_attn_scores, state_finished) = self.one_step(
                     cur_input_ids, enc_states, enc_attention_mask, state_log_probs,
                     state_seq, state_length, idx, decoder_hidden_state, accu_attn_scores,
                     state_finished)
                idx = idx + 1
        else:
            # Unrolled variant: max_decode_length + 1 steps, no step index passed.
            for _ in range(self.max_decode_length + 1):
                (cur_input_ids, state_log_probs, state_seq, state_length,
                 decoder_hidden_state, accu_attn_scores, state_finished) = self.one_step(
                     cur_input_ids, enc_states, enc_attention_mask, state_log_probs,
                     state_seq, state_length, None, decoder_hidden_state, accu_attn_scores,
                     state_finished)

        # length penalty: divide the accumulated log-probs by the penalty term
        length_norm = self.length_penalty(state_length)
        log_probs = self.real_div(state_log_probs, length_norm)

        # coverage penalty: log of the attention mass clipped into [0, 1];
        # entries below self.neg_inf_3d are replaced by self.zeros_3d before
        # summing over axis 2
        clipped_attn = C.clip_by_value(accu_attn_scores, 0.0, 1.0)
        log_attn = self.log(clipped_attn)
        below_floor = self.less(log_attn, self.neg_inf_3d)
        coverage = self.select(below_floor, self.zeros_3d, log_attn)
        coverage = self.reducesum(coverage, 2)
        log_probs = log_probs + coverage * self.cov_penalty_factor

        # rank beams by score and build (batch, beam) gather indices
        _, top_beam_indices = self.topk(log_probs, self.beam_width)
        batch_part = self.expand(self.batch_ids, -1)
        beam_part = self.expand(top_beam_indices, -1)
        gather_indices = self.concat((batch_part, beam_part))

        # reorder the decoded sequences accordingly
        predicted_ids = self.gather_nd(state_seq, gather_indices)
        if self.is_using_while:
            predicted_ids = predicted_ids[:, 0:1, :self.max_decode_length]
        else:
            # the unrolled variant skips position 0 of the sequence
            predicted_ids = predicted_ids[:, 0:1,
                                          1:(self.max_decode_length + 1)]

        return predicted_ids
Esempio n. 10
0
    def bilinear_sampler(self, img, x, y):
        """
        Bilinearly sample ``img`` at the normalized coordinates ``x`` / ``y``.

        The same sampling weights are applied to every channel of the input.

        Input
        -----
        - img: batch of images in (B, H, W, C) layout.
        - x, y: sampling grid coordinates (the output of
          affine_grid_generator), expected to be normalized.

        Returns
        -------
        - out: interpolated images according to the grid.
        """
        get_shape = P.Shape()
        to_dtype = P.Cast()
        floor_op = P.Floor()
        add_axis = P.ExpandDims()
        sum_all = P.AddN()

        img_shape = get_shape(img)
        height = img_shape[1]
        width = img_shape[2]
        max_y = to_dtype(height - 1, mindspore.float32)
        max_x = to_dtype(width - 1, mindspore.float32)
        zero = self.zero

        # Rescale the coordinates. The factor is (max - 1), so inputs in
        # [-1, 1] map to [0, W-2] / [0, H-2] rather than [0, W-1] / [0, H-1].
        x = 0.5 * ((x + 1.0) * (max_x - 1))
        y = 0.5 * ((y + 1.0) * (max_y - 1))

        # the 4 nearest corner points for each (x_i, y_i)
        x_lo = floor_op(x)
        x_hi = x_lo + 1
        y_lo = floor_op(y)
        y_hi = y_lo + 1

        # keep the corners inside the image: [0, W-1] / [0, H-1]
        x_lo = C.clip_by_value(x_lo, zero, max_x)
        x_hi = C.clip_by_value(x_hi, zero, max_x)
        y_lo = C.clip_by_value(y_lo, zero, max_y)
        y_hi = C.clip_by_value(y_hi, zero, max_y)

        # pixel values at the four corners
        pix_ll = self.get_pixel_value(img, x_lo, y_lo)
        pix_lh = self.get_pixel_value(img, x_lo, y_hi)
        pix_hl = self.get_pixel_value(img, x_hi, y_lo)
        pix_hh = self.get_pixel_value(img, x_hi, y_hi)

        # corners as float32 for the weight computation
        x_lo = to_dtype(x_lo, mindspore.float32)
        x_hi = to_dtype(x_hi, mindspore.float32)
        y_lo = to_dtype(y_lo, mindspore.float32)
        y_hi = to_dtype(y_hi, mindspore.float32)

        # each corner is weighted by the area of the opposite sub-rectangle;
        # axis 3 is added so the weights broadcast against the pixel values
        w_ll = add_axis((x_hi - x) * (y_hi - y), 3)
        w_lh = add_axis((x_hi - x) * (y - y_lo), 3)
        w_hl = add_axis((x - x_lo) * (y_hi - y), 3)
        w_hh = add_axis((x - x_lo) * (y - y_lo), 3)

        # weighted sum of the four corners
        return sum_all([w_ll * pix_ll, w_lh * pix_lh, w_hl * pix_hl, w_hh * pix_hh])