def construct(self, grads, clip_type, clip_value):
    """
    Clip every gradient in ``grads``.

    Args:
        grads: Iterable of gradients; each one is clipped independently.
        clip_type: 0 clips each gradient by value to
            ``[-clip_value, clip_value]``; 1 passes each gradient to
            ``self.clip_by_norm``. Any other value disables clipping.
        clip_value: Clipping threshold. It is converted to an array and cast
            to the dtype of each gradient before use.

    Returns:
        ``grads`` itself when ``clip_type`` is neither 0 nor 1, otherwise a
        tuple holding the clipped gradients in their original order.
    """
    if clip_type not in (0, 1):
        return grads

    clipped_grads = ()
    for grad in grads:
        grad_dtype = self.dtype(grad)
        if clip_type == 0:
            lower_bound = self.cast(F.tuple_to_array((-clip_value, )), grad_dtype)
            upper_bound = self.cast(F.tuple_to_array((clip_value, )), grad_dtype)
            clipped = C.clip_by_value(grad, lower_bound, upper_bound)
        else:
            norm_bound = self.cast(F.tuple_to_array((clip_value, )), grad_dtype)
            clipped = self.clip_by_norm(grad, norm_bound)
        # Tuples are rebuilt by concatenation rather than mutated in place.
        clipped_grads = clipped_grads + (clipped, )
    return clipped_grads
def construct(self):
    """
    Build the matrix of clipped, shifted relative positions.

    Entry ``[i, j]`` of the intermediate distance matrix is ``j - i``. The
    distances are clipped to
    ``[self._min_relative_position, self._max_relative_position]`` and then
    shifted by ``self._max_relative_position``.

    Returns:
        The shifted ``self._length`` x ``self._length`` matrix of relative
        position indices.
    """
    length = self._length
    positions = self.cast(F.tuple_to_array(F.make_range(length)), mstype.int32)

    # Position vector repeated `length` times, laid out as a square:
    # every row reads 0 .. length - 1 (self.range_mat is presumably a
    # reshape -- it is always called with a target shape).
    col_index = self.range_mat(self.tile(positions, (length,)), (length, length))

    # Position vector as a column, repeated across columns:
    # row i holds the value i everywhere.
    positions_as_column = self.range_mat(positions, (length, -1))
    row_index = self.range_mat(self.tile(positions_as_column, (1, length)), (length, length))

    distance = self.sub(col_index, row_index)
    clipped_distance = C.clip_by_value(distance,
                                       self._min_relative_position,
                                       self._max_relative_position)
    # Shift so each clipped distance maps to its own index; the result is
    # non-negative provided _min_relative_position >= -_max_relative_position.
    return clipped_distance + self._max_relative_position
def construct(self, grads, clip_type, clip_value):
    """
    Apply gradient clipping to each element of ``grads``.

    Args:
        grads: Iterable of gradients.
        clip_type: 0 selects clipping by value, 1 selects clipping by norm
            (``self.clip_by_norm``); anything else returns ``grads`` as is.
        clip_value: Threshold, cast to each gradient's dtype before use.

    Returns:
        The untouched ``grads`` when clipping is disabled, otherwise a tuple
        of clipped gradients.
    """
    if clip_type not in (0, 1):
        return grads

    result = ()
    for grad in grads:
        grad_dtype = self.dtype(grad)
        # The positive bound is needed by both clipping modes.
        upper = self.cast(F.tuple_to_array((clip_value, )), grad_dtype)
        if clip_type == 0:
            lower = self.cast(F.tuple_to_array((-clip_value, )), grad_dtype)
            clipped = C.clip_by_value(grad, lower, upper)
        else:
            clipped = self.clip_by_norm(grad, upper)
        result = result + (clipped, )
    return result
def construct(self, grads, clip_type, clip_value):
    """
    Clip every gradient in ``grads`` by value or by norm.

    Args:
        grads: Iterable of gradients; each one is clipped independently.
        clip_type: 0 clips by value to ``[-clip_value, clip_value]``; 1 clips
            with ``self.clip_by_norm``. Any other value disables clipping.
        clip_value: Clipping threshold.

    Returns:
        ``grads`` itself when ``clip_type`` is neither 0 nor 1, otherwise a
        tuple holding the clipped gradients in their original order.
    """
    # pylint: disable=consider-using-in
    if clip_type != 0 and clip_type != 1:
        return grads

    new_grads = ()
    for grad in grads:
        # Cast the bounds to the gradient's own dtype so that clipping does
        # not mix dtypes (e.g. a float16 gradient with a float32 bound).
        dt = F.dtype(grad)
        upper = F.cast(F.tuple_to_array((clip_value,)), dt)
        if clip_type == 0:
            lower = F.cast(F.tuple_to_array((-clip_value,)), dt)
            t = C.clip_by_value(grad, lower, upper)
        else:
            t = self.clip_by_norm(grad, upper)
        new_grads = new_grads + (t,)
    return new_grads
def _clip_grad(clip_type, clip_value, grad): """ Clip gradients. Inputs: clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'. clip_value (float): Specifies how much to clip. grad (tuple[Tensor]): Gradients. Outputs: tuple[Tensor], clipped gradients. """ if clip_type not in (0, 1): return grad dt = F.dtype(grad) if clip_type == 0: new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt), F.cast(F.tuple_to_array((clip_value,)), dt)) else: new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt)) return new_grad
def construct(self, box_p, box_gt):
    """
    Compute the Generalized IoU between predicted and ground-truth boxes.

    The last axis of both inputs is read as ``(x1, y1, x2, y2)``: index 0/2
    are used as the horizontal extent and index 1/3 as the vertical extent.

    Args:
        box_p: Predicted boxes.
        box_gt: Ground-truth boxes, combined element-wise with ``box_p``.

    Returns:
        GIoU values clipped to ``[-1.0, 1.0]``. Coordinates are taken with
        width-1 slices, so the last axis of the result has size 1.
    """
    box_p_area = (box_p[..., 2:3] - box_p[..., 0:1]) * (box_p[..., 3:4] - box_p[..., 1:2])
    box_gt_area = (box_gt[..., 2:3] - box_gt[..., 0:1]) * (box_gt[..., 3:4] - box_gt[..., 1:2])

    # Corners of the overlap rectangle.
    x_1 = self.max(box_p[..., 0:1], box_gt[..., 0:1])
    x_2 = self.min(box_p[..., 2:3], box_gt[..., 2:3])
    y_1 = self.max(box_p[..., 1:2], box_gt[..., 1:2])
    y_2 = self.min(box_p[..., 3:4], box_gt[..., 3:4])

    # Clamp each overlap extent at zero so disjoint boxes yield an empty
    # intersection instead of a negative (one axis disjoint) or spuriously
    # positive (both axes disjoint) area.
    # max(a, b) - b == max(a - b, 0), and is unchanged when a >= b.
    inter_w = self.max(x_2, x_1) - x_1
    inter_h = self.max(y_2, y_1) - y_1
    intersection = inter_h * inter_w

    # Smallest box enclosing both inputs.
    xc_1 = self.min(box_p[..., 0:1], box_gt[..., 0:1])
    xc_2 = self.max(box_p[..., 2:3], box_gt[..., 2:3])
    yc_1 = self.min(box_p[..., 1:2], box_gt[..., 1:2])
    yc_2 = self.max(box_p[..., 3:4], box_gt[..., 3:4])
    c_area = (xc_2 - xc_1) * (yc_2 - yc_1)

    union = box_p_area + box_gt_area - intersection
    # self.eps is added to both denominators before dividing.
    union = union + self.eps
    c_area = c_area + self.eps

    iou = self.div(self.cast(intersection, ms.float32), self.cast(union, ms.float32))
    res_mid0 = c_area - union
    res_mid1 = self.div(self.cast(res_mid0, ms.float32), self.cast(c_area, ms.float32))
    giou = iou - res_mid1
    giou = C.clip_by_value(giou, -1.0, 1.0)
    return giou
def _update_run_op(beta1, beta2, eps, global_step, lr, weight_decay, param, m, v, gradient, decay_flag,
                   optim_filter):
    """
    Update parameters.

    Computes bias-corrected first/second moment estimates in float32, scales
    the resulting update by a trust ratio clipped to [0, 10], and writes the
    new values back into ``param``, ``m`` and ``v``.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimations. Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimations. Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0.
        global_step (Tensor): Global step.
        lr (Tensor): Learning rate.
        weight_decay (Number): Weight decay. Should be equal to or greater than 0.
        param (Tensor): Parameters. Assigned in place.
        m (Tensor): m value of parameters. Assigned in place.
        v (Tensor): v value of parameters. Assigned in place.
        gradient (Tensor): Gradient of parameters.
        decay_flag (bool): Specifies whether param update with weight decay.
        optim_filter(bool): Applies parameter update or not.

    Returns:
        Tensor. When ``optim_filter`` is truthy, the updated parameter value
        cast to the dtype of ``param``; otherwise ``gradient`` is returned
        unchanged and nothing is assigned.
    """
    if optim_filter:
        op_mul = P.Mul()
        op_sqrt = P.Sqrt()
        op_rsqrt = P.Rsqrt()
        op_square = P.Square()
        op_cast = P.Cast()
        op_reshape = P.Reshape()
        op_shape = P.Shape()
        op_pow = P.Pow()
        op_norm = layer.Norm()
        op_select = P.Select()
        op_greater = P.Greater()
        op_fill = P.Fill()
        op_dtype = P.DType()

        # All arithmetic below is carried out on float32 copies.
        param_fp32 = op_cast(param, mstype.float32)
        m_fp32 = op_cast(m, mstype.float32)
        v_fp32 = op_cast(v, mstype.float32)
        gradient_fp32 = op_cast(gradient, mstype.float32)

        # `num_one` is not defined in this function; it is read from the
        # enclosing module scope.
        # Moment updates: next_m = beta1 * m + (1 - beta1) * g,
        #                 next_v = beta2 * v + (1 - beta2) * g^2.
        next_m = op_mul(beta1, m_fp32) + op_mul(
            op_cast(num_one, mstype.float32) - beta1, gradient_fp32)
        next_v = op_mul(beta2, v_fp32) + op_mul(
            op_cast(num_one, mstype.float32) - beta2, op_square(gradient_fp32))

        # Bias correction: divide by (1 - beta^(global_step + 1)).
        next_mm = next_m / (op_cast(num_one, mstype.float32) - op_pow(
            beta1, op_cast(global_step + num_one, mstype.float32)))
        next_vv = next_v / (op_cast(num_one, mstype.float32) - op_pow(
            beta2, op_cast(global_step + num_one, mstype.float32)))

        w_norm = op_norm(param_fp32)
        g_norm = op_norm(gradient_fp32)
        # NOTE(review): this norm is taken over next_mm * rsqrt(next_vv + eps)
        # plus the weight-decay term regardless of decay_flag, whereas the
        # applied `update` below uses next_mm / (sqrt(next_vv) + eps) and adds
        # weight decay only when decay_flag is set -- confirm this is intended.
        g_norm_hat = op_norm(
            op_mul(next_mm, op_rsqrt(next_vv + eps)) + weight_decay * param_fp32)
        zeros = F.zeros_like(w_norm)
        ones = op_fill(op_dtype(w_norm), op_shape(w_norm), 1.0)
        # Trust ratio is w_norm / g_norm_hat only when both w_norm and g_norm
        # are greater than zero; otherwise it falls back to 1.
        trust_ratio = op_select(
            op_greater(w_norm, zeros),
            op_select(op_greater(g_norm, zeros), w_norm / g_norm_hat, ones),
            ones)
        tens = op_fill(op_dtype(trust_ratio), op_shape(trust_ratio), 10.0)
        trust_ratio = C.clip_by_value(trust_ratio, zeros, tens)

        update = next_mm / (op_sqrt(next_vv) + eps)

        if decay_flag:
            update = update + op_mul(weight_decay, param_fp32)

        update_with_lr = op_mul(op_mul(trust_ratio, lr), update)

        next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32))

        # Each F.depend ties an in-place assignment to the returned value so
        # the writes to param, m and v are part of the result's dependencies.
        # The uncorrected moments (next_m / next_v) are what get stored.
        next_param = F.depend(
            next_param, F.assign(param, op_cast(next_param, F.dtype(param))))
        next_param = F.depend(next_param, F.assign(m, op_cast(next_m, F.dtype(m))))
        next_param = F.depend(next_param, F.assign(v, op_cast(next_v, F.dtype(v))))

        return op_cast(next_param, F.dtype(param))
    # Update skipped: hand the gradient back untouched.
    return gradient
def hard_swish(x):
    """
    Compute ``x * clip(x + 3, 0, 6) / 6`` on a float32 copy of ``x``.

    Args:
        x: Input; it is cast to ``ms.float32`` before any arithmetic.

    Returns:
        The input (as float32) multiplied by its clipped, rescaled gate.
    """
    x_fp32 = P.Cast()(x, ms.float32)
    # Gate: shift by 3, clip to [0, 6], then rescale by 1/6.
    gate = clip_by_value(x_fp32 + 3.0, 0.0, 6.0) / 6.0
    return x_fp32 * gate
def construct(self, enc_states, enc_attention_mask):
    """
    Process source sentence

    Runs ``self.one_step`` repeatedly from the initial beam state, rescores
    the beams with a length penalty and a coverage penalty, and returns the
    token ids of the best-scoring beam.

    Inputs:
        enc_states (Tensor): Output of transformer encoder with shape (batch_size * beam_width, T, D).
        enc_attention_mask (Tensor): encoder attention mask with shape (batch_size * beam_width, T).

    Returns:
        Tensor, predictions output. Only the first beam after top-k sorting
        is kept (slice ``0:1`` on axis 1).

    Note:
        The two decoding paths differ: the unrolled ``for`` path calls
        ``one_step`` ``max_decode_length + 1`` times with ``None`` as the
        step index, while the ``while`` path runs with ``idx`` from
        ``self.start + 1`` up to (excluding)
        ``self.start + self.max_decode_length + 1``, i.e.
        ``max_decode_length`` iterations. The final slice of the sequence
        differs accordingly.
    """
    # beam search start: load the initial decoding state held on the cell
    cur_input_ids = self.start_ids
    state_log_probs = self.init_scores
    state_seq = self.init_seq
    state_finished = self.init_finished
    state_length = self.init_length
    decoder_hidden_state = self.decoder_hidden_state
    accu_attn_scores = self.accu_attn_scores

    if not self.is_using_while:
        # Unrolled decoding: the step index argument is passed as None.
        for _ in range(self.max_decode_length + 1):
            cur_input_ids, state_log_probs, state_seq, state_length, decoder_hidden_state, accu_attn_scores, \
            state_finished = self.one_step(cur_input_ids, enc_states, enc_attention_mask, state_log_probs,
                                           state_seq, state_length, None, decoder_hidden_state, accu_attn_scores,
                                           state_finished)
    else:
        # At present, only ascend910 supports while operation.
        idx = self.start + 1
        ends = self.start + self.max_decode_length + 1
        while idx < ends:
            cur_input_ids, state_log_probs, state_seq, state_length, decoder_hidden_state, accu_attn_scores, \
            state_finished = self.one_step(cur_input_ids, enc_states, enc_attention_mask, state_log_probs,
                                           state_seq, state_length, idx, decoder_hidden_state, accu_attn_scores,
                                           state_finished)
            idx = idx + 1

    # add length penalty scores: beam log-probs are divided by the penalty
    penalty_len = self.length_penalty(state_length)
    log_probs = self.real_div(state_log_probs, penalty_len)

    # coverage penalty: clip accumulated attention to [0, 1] and take the log;
    # entries whose log falls below self.neg_inf_3d are replaced by
    # self.zeros_3d before summing over axis 2
    penalty_cov = C.clip_by_value(accu_attn_scores, 0.0, 1.0)
    penalty_cov = self.log(penalty_cov)
    penalty_less = self.less(penalty_cov, self.neg_inf_3d)
    penalty = self.select(penalty_less, self.zeros_3d, penalty_cov)
    penalty = self.reducesum(penalty, 2)
    log_probs = log_probs + penalty * self.cov_penalty_factor

    # sort according to scores
    _, top_beam_indices = self.topk(log_probs, self.beam_width)
    gather_indices = self.concat(
        (self.expand(self.batch_ids, -1), self.expand(top_beam_indices, -1)))

    # sort sequence and attention scores
    predicted_ids = self.gather_nd(state_seq, gather_indices)
    if not self.is_using_while:
        # for-loop path: drop position 0 and keep positions 1..max_decode_length
        predicted_ids = predicted_ids[:, 0:1, 1:(self.max_decode_length + 1)]
    else:
        # while path: keep the first max_decode_length positions
        predicted_ids = predicted_ids[:, 0:1, :self.max_decode_length]

    return predicted_ids
def bilinear_sampler(self, img, x, y):
    """
    Bilinearly sample ``img`` at the coordinates given by ``x`` and ``y``.

    The same interpolation weights are applied to every channel. For each
    coordinate pair the four surrounding integer positions are looked up with
    ``self.get_pixel_value`` and blended.

    Input
    -----
    - img: batch of images in (B, H, W, C) layout; axis 1 is read as the
      height and axis 2 as the width.
    - grid: x, y which is the output of affine_grid_generator (normalized
      coordinates).

    Returns
    -------
    - out: interpolated images according to grids.
    """
    get_shape = P.Shape()
    to_type = P.Cast()
    round_down = P.Floor()
    add_axis = P.ExpandDims()
    sum_terms = P.AddN()

    img_shape = get_shape(img)
    height = img_shape[1]
    width = img_shape[2]
    max_y = to_type(height - 1, mindspore.float32)
    max_x = to_type(width - 1, mindspore.float32)
    lower = self.zero

    # Rescale the normalized coordinates by (max - 1), so an input of -1 maps
    # to 0 and +1 maps to max_x - 1 / max_y - 1.
    # NOTE(review): that upper end is W - 2 / H - 2, not W - 1 / H - 1 --
    # confirm this is intended.
    x = 0.5 * ((x + 1.0) * (max_x - 1))
    y = 0.5 * ((y + 1.0) * (max_y - 1))

    # Four nearest integer corners; the "+ 1" neighbours are derived from the
    # unclipped floor values.
    left = round_down(x)
    right = left + 1
    top = round_down(y)
    bottom = top + 1

    # Keep every corner inside the image boundaries.
    left = C.clip_by_value(left, lower, max_x)
    right = C.clip_by_value(right, lower, max_x)
    top = C.clip_by_value(top, lower, max_y)
    bottom = C.clip_by_value(bottom, lower, max_y)

    # Pixel values at the four corners.
    pix_lt = self.get_pixel_value(img, left, top)
    pix_lb = self.get_pixel_value(img, left, bottom)
    pix_rt = self.get_pixel_value(img, right, top)
    pix_rb = self.get_pixel_value(img, right, bottom)

    # Corner coordinates as float32 for the weight computation.
    left = to_type(left, mindspore.float32)
    right = to_type(right, mindspore.float32)
    top = to_type(top, mindspore.float32)
    bottom = to_type(bottom, mindspore.float32)

    # Each corner is weighted by the area of the rectangle opposite to it.
    dist_to_right = right - x
    dist_to_left = x - left
    dist_to_bottom = bottom - y
    dist_to_top = y - top
    weight_lt = add_axis(dist_to_right * dist_to_bottom, 3)
    weight_lb = add_axis(dist_to_right * dist_to_top, 3)
    weight_rt = add_axis(dist_to_left * dist_to_bottom, 3)
    weight_rb = add_axis(dist_to_left * dist_to_top, 3)

    return sum_terms([weight_lt * pix_lt,
                      weight_lb * pix_lb,
                      weight_rt * pix_rt,
                      weight_rb * pix_rb])