def _taylor_compute(data_x, x_square=None):
    """
    compute arcsin(x) with a 15th-order Taylor expansion when 0 <= x <= BOUNDARY_1
    asin(x) = x + 1/6*x^3 + 3/40*x^5 + 5/112*x^7 + ... + 13!!/(14!!*15)*x^15

    Parameters:
    ----------
    data_x : the placeholder of data input

    x_square : the placeholder of the square of data_x

    Returns
    -------
    A Tensor. Has the same type as data_x.
    """
    if x_square is None:
        x_square = te.lang.cce.vmul(data_x, data_x)

    res = te.lang.cce.vmuls(x_square,
                            tvm.const(COEF[TAYLOR_COUNT], x_square.dtype))
    # Horner's scheme over the even powers, with a final multiply by data_x
    for temp in reversed(range(TAYLOR_COUNT)):
        res = te.lang.cce.vadds(res, tvm.const(COEF[temp], x_square.dtype))
        if temp == 0:
            res = te.lang.cce.vmul(res, data_x)
        else:
            res = te.lang.cce.vmul(x_square, res)

    return res
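# Illustrative cross-check (a minimal NumPy sketch, not part of the operator):
# the same 15th-order expansion evaluated with Horner's scheme, assuming COEF
# holds the odd-power asin coefficients 1, 1/6, 3/40, ... and TAYLOR_COUNT is 7.
# The helper name _ref_asin_taylor is hypothetical.
def _ref_asin_taylor(x):
    import numpy as np
    coef = [1.0]
    for n in range(7):
        # next asin series coefficient: a_{n+1} = a_n * (2n+1)^2 / (2(n+1)(2n+3))
        coef.append(coef[-1] * (2 * n + 1) ** 2 / (2.0 * (n + 1) * (2 * n + 3)))
    x = np.asarray(x, dtype=np.float64)
    res = np.full_like(x, coef[-1])
    for c in reversed(coef[:-1]):
        res = res * x * x + c
    return res * x
# e.g. abs(_ref_asin_taylor(0.5) - np.arcsin(0.5)) < 2e-7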
def _newton_taylor_iter(input_x, input_y, input_z):
    """
    do element-wise Newton iteration for solving e^(z/x) = y:
    z(n+1) = z(n) - (e^(z(n)/x(n)) - y(n)) / (x(n)^-1 * e^(z(n)/x(n)))

    Parameters:
    ----------
    input_x: TVM tensor, the placeholder of input_x

    input_y: TVM tensor, the placeholder of input_y

    input_z: start value of Newton iteration

    Returns
    -------
    A Tensor. Has the same type as input_z.
    """
    # the Newton step simplifies to: z(n+1) = z(n) - x(n) + x(n)*y(n)*e^(-z(n)/x(n))
    input_x_mul = te.lang.cce.vmuls(input_x,
                                    tvm.const(SCALAR_NEG_ONE, "float32"))
    newton_taylor = te.lang.cce.vadd(input_x_mul, input_z)
    input_xy = te.lang.cce.vmul(input_x, input_y)
    input_x_rec = te.lang.cce.vrec(input_x)
    input_x_res = te.lang.cce.vmuls(input_x_rec,
                                    tvm.const(SCALAR_NEG_ONE, "float32"))
    input_z_mul = te.lang.cce.vmul(input_x_res, input_z)
    input_z_taylor = _exp_taylor_compute(input_z_mul)
    input_z_res = te.lang.cce.vmul(input_z_taylor, input_xy)
    newton_taylor = te.lang.cce.vadd(newton_taylor, input_z_res)

    return newton_taylor
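# Numeric sanity check (illustrative, hypothetical helper): the step above is
# Newton's method for f(z) = e^(z/x) - y, whose exact root is z = x*ln(y).
def _ref_newton_step(x, y, z):
    import numpy as np
    return z - x + x * y * np.exp(-z / x)
# e.g. with x=2.0, y=3.0, iterating from z=0.0 gives
# 4.0, 2.812..., 2.283..., 2.199..., ... -> 2*ln(3) = 2.19722...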
def _get_pd_x_front_nz(data, param_nz, cast_dtype): """ compute front part of pd_x according to data, params and shape_x """ pd_xl = _get_pd_xl_nz(data, param_nz) pd_var, var_elta_2, sub_x_mean = _get_pd_var_nz(data, param_nz, pd_xl, cast_dtype) pd_mean = _get_pd_mean_nz(param_nz, pd_xl, pd_var, var_elta_2, sub_x_mean, cast_dtype) var_elta_2_cast = _broadcast_nz(var_elta_2, param_nz.get("shape_x_nz")) pd_x_1 = te.lang.cce.vmul(var_elta_2_cast, pd_xl) pdx2_broad = _broadcast_nz(pd_var, param_nz.get("shape_x_nz")) pdx2_mul = te.lang.cce.vmul(pdx2_broad, sub_x_mean) pd_x_2 = \ te.lang.cce.vmuls(pdx2_mul, tvm.const((2*(param_nz.get("mean_nz_num")**(-1))), dtype=cast_dtype)) pd_x_3 = \ te.lang.cce.vmuls(pd_mean, tvm.const((param_nz.get("mean_nz_num")**(-1)), dtype=cast_dtype)) return pd_x_1, pd_x_2, pd_x_3
def l1_loss_grad_compute(grad_out, predict, target, y, reduction="mean", kernel_name="l1_loss_grad"):
    """
    compute the gradient of L1 loss: sign(predict - target) * grad_out,
    divided by the element count when reduction is "mean"
    """
    predict_dtype = predict.dtype.lower()
    zero_tensor = te.lang.cce.vmuls(predict, tvm.const(0, dtype=predict_dtype))
    one_tensor = te.lang.cce.vadds(zero_tensor, tvm.const(1, dtype=predict_dtype))
    neg_one_tensor = te.lang.cce.vadds(zero_tensor, tvm.const(-1, dtype=predict_dtype))
    # if predict is strictly greater than target, the sign is 1; else -1
    sign = te.lang.cce.vcmpsel(predict, target, "gt", one_tensor, neg_one_tensor)
    # rectify sign to 0 when predict is equal to target
    sign = te.lang.cce.vcmpsel(predict, target, "eq", zero_tensor, sign)
    grad_shape = te.lang.cce.util.shape_to_list(grad_out.shape)
    n = reduce(lambda x, y: x * y, grad_shape)
    norm = grad_out
    # if reduction is "mean", grad_out is divided by the element count n
    if reduction == "mean":
        norm = te.lang.cce.vmuls(norm, tvm.const(1 / n, dtype=predict_dtype))
    # chain rule: multiply the sign by the incoming gradient (grad_out)
    res = te.lang.cce.vmul(sign, norm)
    return res
def fake_quant_perchannel_compute(x, min_val, max_val, y, quant_min, quant_max, kernel_name="fake_quant_perchannel"): """FakeQuantPerChannel""" x_shape = te.lang.cce.util.shape_to_list(x.shape) minmax_shape = te.lang.cce.util.shape_to_list(min_val.shape) quant_min = tvm.const(quant_min, x.dtype) quant_max = tvm.const(quant_max, x.dtype) quant_min = te.lang.cce.broadcast(quant_min, minmax_shape, x.dtype) quant_max = te.lang.cce.broadcast(quant_max, minmax_shape, x.dtype) # CalNudge(NudgeMinMax) scale = te.lang.cce.vdiv(te.lang.cce.vsub( max_val, min_val), te.lang.cce.vsub(quant_max, quant_min)) zp_from_min = te.lang.cce.vsub(quant_min, te.lang.cce.vdiv(min_val, scale)) # Nudge zero point nudge_zp_ = te.lang.cce.vmin( quant_max, te.lang.cce.vmax(quant_min, zp_from_min)) nudge_zp = te.lang.cce.floor(te.lang.cce.vadds(nudge_zp_, 0.5)) nudge_min = te.lang.cce.vmul(te.lang.cce.vsub(quant_min, nudge_zp), scale) nudge_max = te.lang.cce.vmul(te.lang.cce.vsub(quant_max, nudge_zp), scale) # FakeQuant nudge_min_b = te.lang.cce.broadcast(nudge_min, x_shape) nudge_max_b = te.lang.cce.broadcast(nudge_max, x_shape) scale_b = te.lang.cce.broadcast(scale, x_shape) input_x = te.lang.cce.vmin(nudge_max_b, te.lang.cce.vmax(nudge_min_b, x)) nudge_input_ = te.lang.cce.vdiv( te.lang.cce.vsub(input_x, nudge_min_b), scale_b) nudge_input = te.lang.cce.floor(te.lang.cce.vadds(nudge_input_, 0.5)) res = te.lang.cce.vadd(te.lang.cce.vmul(nudge_input, scale_b), nudge_min_b) return res
def _trans_input_shape(self, axis):
    """
    trans the input shape into three dimensions (left, mid, right)
    and get the range of different dims of the input shape.

    Returns:
    -------
    x_reshape: new input shape of format with (left, mid, right)
    left_range: left dim range
    right_range: right dim range
    """
    real_axis = axis + len(self.dim_vars) if axis < 0 else axis
    left_dim = tvm.const(1)
    left_upper = 1
    for idx in range(real_axis):
        left_dim *= self.dim_vars[idx]
        left_upper *= self.dim_bounds[idx][1]
    self.left_range = (1, left_upper)
    right_dim = tvm.const(1)
    right_upper = 1
    for idx in range(real_axis + 1, len(self.dim_vars)):
        right_dim *= self.dim_vars[idx]
        right_upper *= self.dim_bounds[idx][1]
    self.right_range = (1, right_upper)
    self.x_reshape = (left_dim, self.dim_vars[real_axis], right_dim)
def _get_pd_var_front(data, cast_dtype): """ compute front part of pd_var according to data_variance Parameters ---------- data: dict placeholders after cast Returns ------- pd_var_1: tvm.tensor np.power((data_variance + EPSLON), (-1.5)) var_elta_2: tvm.tensor np.power((data_variance + EPSLON), (-0.5)) """ var_elta = te.lang.cce.vadds(data.get("data_variance"), tvm.const(EPSLON, dtype=cast_dtype)) var_elta_log = te.lang.cce.vlog(var_elta) var_elta_mul = te.lang.cce.vmuls(var_elta_log, tvm.const(-0.5, dtype=cast_dtype)) var_elta_2 = te.lang.cce.vexp(var_elta_mul) pdvar1_mul = te.lang.cce.vmul(var_elta_2, var_elta_2) pd_var_1 = te.lang.cce.vmul(pdvar1_mul, var_elta_2) return pd_var_1, var_elta_2
def _less_compare_float32(data_x, data_y):
    """
    Compare data_x and data_y to determine whether data_x is less than data_y.
    If an element in data_x is less than the corresponding element in data_y,
    return 1 at that position, else return 0.

    The required float32 scale factor is 2**126, but cce only supports scaling
    by up to 2**62 at a time, so the factor is split as 62 + 62 + 2:
    (2**(-126)) * (2**62) * (2**62) * (2**2) = 1
    so min_value * max_value * max_value * factor_value = 1
    """
    shape_inputs = te.lang.cce.util.shape_to_list(data_x.shape)
    min_value = tvm.const(2 ** (-126), dtype=D_TYPE)
    max_value = tvm.const(2 ** 62, dtype=D_TYPE)
    factor_value = tvm.const(2 ** 2, dtype=D_TYPE)
    if api_check_support("te.lang.cce.vmaxs", data_x.dtype):
        res_sub = te.lang.cce.vsub(data_y, data_x)
        res_min = te.lang.cce.vmins(res_sub, min_value)
        res_max = te.lang.cce.vmaxs(res_min, tvm.const(0, dtype=D_TYPE))
    else:
        data_zero = te.lang.cce.vmuls(data_x, 0)
        min_value_tensor = te.lang.cce.vadds(data_zero, min_value)
        res_sub = te.lang.cce.vsub(data_y, data_x)
        res_min = te.lang.cce.vmin(res_sub, min_value_tensor)
        res_max = te.lang.cce.vmax(res_min, data_zero)
    res_max_mul = te.lang.cce.vmuls(res_max, max_value)
    res_max_mul_max = te.lang.cce.vmuls(res_max_mul, max_value)
    res = te.lang.cce.vmuls(res_max_mul_max, factor_value)
    return res
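# Illustrative NumPy model of the rescaling trick above (hypothetical helper):
# a positive difference is clamped into (0, 2**(-126)] and then scaled back up
# to exactly 1.0, while a non-positive difference stays 0.0.
def _ref_less_float32(data_x, data_y):
    import numpy as np
    diff = np.asarray(data_y, dtype=np.float32) - np.asarray(data_x, dtype=np.float32)
    clipped = np.maximum(np.minimum(diff, np.float32(2.0 ** -126)), np.float32(0.0))
    return clipped * np.float32(2.0 ** 62) * np.float32(2.0 ** 62) * np.float32(2.0 ** 2)
# e.g. _ref_less_float32(1.0, 2.0) -> 1.0 and _ref_less_float32(2.0, 1.0) -> 0.0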
def rsqrt_grad_compute(input_y, input_dy, output_z, kernel_name="rsqrt_grad"): """ compute for rsqrt_grad Parameters ---------- input_y: TVM tensor the placeholder of input_y input_dy: TVM tensor the placeholder of input_dy output_z: dict dict info of output_z kernel_name: str cce kernel name, default value is "rsqrt_grad" Returns ------- res: TVM tensor the result of compute """ dtype_input_y = input_y.dtype rsqrt_const = tvm.const(SCALAR, dtype=dtype_input_y) if dtype_input_y in ("int8", "float16"): rsqrt_const = tvm.const(SCALAR, dtype="float32") input_y = te.lang.cce.cast_to(input_y, "float32") input_dy = te.lang.cce.cast_to(input_dy, "float32") res_vmul = te.lang.cce.vmul(input_y, input_y) res_vmul1 = te.lang.cce.vmul(res_vmul, input_y) res_vmul2 = te.lang.cce.vmul(res_vmul1, input_dy) res = te.lang.cce.vmuls(res_vmul2, rsqrt_const) if dtype_input_y in ("int8", "int32", "float16"): res = te.lang.cce.cast_to(res, dtype_input_y, f1628IntegerFlag=True) return res
def sigmoid_compute(input_x):
    """
    calculating sigmoid: 1 / (1 + exp(-x))
    """
    data_input = input_x
    dtype = input_x.dtype
    exp_support = cce.cce_conf.api_check_support(
        "te.lang.cce.vexp", "float32")
    mul_support = cce.cce_conf.api_check_support(
        "te.lang.cce.vmuls", "float32")
    if dtype == "float32" and not mul_support:
        error_manager_vector.raise_err_specific_reson(
            "DynamicLSTM",
            "Input dtype only supports float16 while input dtype is float32")
    const_num_neg_one = tvm.const(-1, dtype=dtype)
    const_num_one = tvm.const(1, dtype=dtype)
    tmp_negative = te.lang.cce.vmuls(data_input, const_num_neg_one)
    if dtype == "float32" and not exp_support:
        tmp_negative = te.lang.cce.cast_to(tmp_negative, "float16")
    tmp_exp = te.lang.cce.vexp(tmp_negative)
    if dtype == "float32" and not exp_support:
        tmp_exp = te.lang.cce.cast_to(tmp_exp, "float32")
    tmp_sum = te.lang.cce.vadds(tmp_exp, const_num_one)
    if dtype == "float32":
        inp_shape = tmp_sum.shape
        tensor_one = te.lang.cce.broadcast(tvm.const(1, dtype), inp_shape)
        res = te.lang.cce.vdiv(tensor_one, tmp_sum)
    else:
        res = te.lang.cce.vrec(tmp_sum)
    return res
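# Numeric footnote (illustrative): the cast_to calls above run vexp in float16
# when the platform cannot exponentiate float32 directly. Worth noting that
# exp saturates early in half precision, e.g. in NumPy:
# import numpy as np
# np.exp(np.float16(12.0))   # inf, since e**12 ~ 162754.8 > 65504 (float16 max)
# np.exp(np.float32(12.0))   # 162754.8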
def _negative_compute(input_x, input_y): """ compute result of pow when data_x is less than 0, use [-2 * (|y| % 2) + 1] * exp(y * ln|x|) """ dtype = input_x.dtype shape = input_x.shape abs_value = te.lang.cce.vabs(input_y) if not tbe_platform.cce_conf.api_check_support("te.lang.cce.vmod", "float32"): dtype = "float16" abs_value = te.lang.cce.cast_to(abs_value, "float16") data_two = te.lang.cce.broadcast(tvm.const(2, dtype), shape, dtype) mod_value = te.lang.cce.vmod(abs_value, data_two) mul_value = te.lang.cce.vmuls(mod_value, tvm.const(-2, dtype)) add_value = te.lang.cce.vadds(mul_value, tvm.const(1, dtype)) if tbe_platform.cce_conf.api_check_support("te.lang.cce.vexp", "float32"): add_value = te.lang.cce.cast_to(add_value, "float32") abs_data_x = te.lang.cce.vabs(input_x) log_value = te.lang.cce.vlog(abs_data_x) mul_value = te.lang.cce.vmul(input_y, log_value) exp_value = te.lang.cce.vexp(mul_value) res = te.lang.cce.vmul(add_value, exp_value) return res
def _compute(data_input, shape):
    """
    Algorithm: atanh(x) = 0.5*log((1+x)/(1-x))

    Parameters
    ----------
    data_input: the placeholder of data input

    shape: the shape of data_input

    Returns
    -------
    data_res : return of atanh
    """
    data_1_sum_x = te.lang.cce.vadds(data_input, tvm.const(CONST_ONE, data_input.dtype))
    data_sub_x = te.lang.cce.vmuls(data_input, tvm.const(CONST_NEG_ONE, data_input.dtype))
    data_1_sub_x = te.lang.cce.vadds(data_sub_x, tvm.const(CONST_ONE, data_input.dtype))
    data_x_mul = te.lang.cce.vdiv(data_1_sum_x, data_1_sub_x)
    data_x_log = te.lang.cce.vlog(data_x_mul, 1)
    data_res = te.lang.cce.vmuls(data_x_log, tvm.const(CONST_HALF, data_input.dtype))

    return data_res
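# Quick identity check (illustrative): the formula above matches NumPy's
# arctanh on the open interval (-1, 1):
# import numpy as np
# x = np.linspace(-0.9, 0.9, 7)
# np.allclose(0.5 * np.log((1 + x) / (1 - x)), np.arctanh(x))  # -> True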
def _compute_process(var, m, lr_broad, alpha_broad, sign_decay_broad, beta_broad, grad):
    """
    calculate
    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
    update <- (alpha + sign_decay * sign(g) * sign(m)) * g
    variable <- variable - lr_t * update

    Parameters:
    ----------
    var: tensor of var, support float16, float32
    m: tensor of m, support float16, float32
    lr_broad: broadcast tensor of lr, support float16, float32
    alpha_broad: broadcast tensor of alpha, support float16, float32
    sign_decay_broad: broadcast tensor of sign_decay, support float16, float32
    beta_broad: broadcast tensor of beta, support float16, float32
    grad: tensor of grad, support float16, float32

    Returns
    -------
    the new values of m and var, and copies of them for the outputs
    """
    m_out = _update_m(m, beta_broad, grad)
    sign_gm = te.lang.cce.vmul(_sign_compute(grad), _sign_compute(m_out))
    decay_gm = te.lang.cce.vmul(sign_gm, sign_decay_broad)
    var_out = _update_var(decay_gm, alpha_broad, lr_broad, grad, var)
    output_data = te.lang.cce.vadds(var_out, tvm.const(CONST_ZERO, "float32"))
    m_output_data = te.lang.cce.vadds(m_out, tvm.const(CONST_ZERO, "float32"))
    return m_out, var_out, output_data, m_output_data
def _sigmoid_compute(input_x): """ calculating sigmoid """ data_input = input_x dtype = input_x.dtype exp_support = tbe_platform.cce_conf.api_check_support( "te.lang.cce.vexp", "float32") mul_support = tbe_platform.cce_conf.api_check_support( "te.lang.cce.vmuls", "float32") if dtype == "float32" and not mul_support: error_manager_vector.raise_err_check_params_rules( "DynamicGRU", 'vmuls should support float32', 'mul_support', str(mul_support)) const_num_neg_one = tvm.const(-1, dtype=dtype) const_num_one = tvm.const(1, dtype=dtype) tmp_negative = tbe.vmuls(data_input, const_num_neg_one) if dtype == "float32" and not exp_support: tmp_negative = tbe.cast_to(tmp_negative, "float16") tmp_exp = tbe.vexp(tmp_negative) if dtype == "float32" and not exp_support: tmp_exp = tbe.cast_to(tmp_exp, "float32") tmp_sum = tbe.vadds(tmp_exp, const_num_one) if dtype == "float32": inp_shape = tmp_sum.shape tensor_one = tbe.broadcast(tvm.const(1, dtype), inp_shape) res = tbe.vdiv(tensor_one, tmp_sum) else: res = tbe.vrec(tmp_sum) return res
def less_compute(input_x, input_y, output_z, kernel_name="less"):
    """
    if x is less than y, then return 1, else return 0.

    Parameters:
    ----------
    input_x: TVM tensor
        the placeholder of first input data
    input_y: TVM tensor
        the placeholder of second input data
    output_z: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name: str
        cce kernel name, default value is less

    Returns
    -------
    the result
    """
    shape_x = te.lang.cce.util.shape_to_list(input_x.shape)
    shape_y = te.lang.cce.util.shape_to_list(input_y.shape)
    shape_x, shape_y, shape = broadcast_shapes(shape_x, shape_y, param_name_input1="input_x",
                                               param_name_input2="input_y")
    cce_product = tbe_platform.cce_conf.get_soc_spec("SOC_VERSION")
    dtype = input_x.dtype
    if dtype in ("uint8", "int8"):
        input_x = te.lang.cce.cast_to(input_x, "float16")
        input_y = te.lang.cce.cast_to(input_y, "float16")
        dtype = "float16"

    if dtype == "float32":
        # minimum positive normal float32 is 2**(-126)
        data_min = te.lang.cce.broadcast(tvm.const(2**(-126), dtype=dtype), shape, dtype)
    elif dtype == "float16" and cce_product not in ("Ascend910", "Ascend710"):
        # minimum positive float16 is 2**(-24)
        data_min = te.lang.cce.broadcast(tvm.const(2**(-24), dtype=dtype), shape, dtype)
    elif dtype == "float16" and cce_product in ("Ascend910", "Ascend710"):
        input_x = te.lang.cce.cast_to(input_x, "float32")
        input_y = te.lang.cce.cast_to(input_y, "float32")
        dtype = "float32"
        data_min = te.lang.cce.broadcast(tvm.const(2**(-126), dtype=dtype), shape, dtype)
    elif dtype == "int32" and cce_product not in ("Ascend910", "Ascend710"):
        data_min = te.lang.cce.broadcast(tvm.const(1, dtype=dtype), shape, dtype)
    else:
        input_x = te.lang.cce.cast_to(input_x, "float32")
        input_y = te.lang.cce.cast_to(input_y, "float32")
        dtype = "float32"
        data_min = te.lang.cce.broadcast(tvm.const(2**(-126), dtype=dtype), shape, dtype)

    input_x = te.lang.cce.broadcast(input_x, shape)
    input_y = te.lang.cce.broadcast(input_y, shape)

    return _less_compare((input_x, input_y), shape, dtype, data_min)
def _taylor_compute(data):
    """
    algorithm: taylor = ((((0.2x - 0.25)x + 0.33333)x - 0.5)x + 1)x,
    the fifth-order Taylor polynomial of log(1 + x)

    Parameters
    ----------
    data: input tensor, the x in the polynomial above

    Returns
    -------
    taylor: the evaluated Taylor polynomial
    """
    # 0.2x - 0.25
    taylor_five = te.lang.cce.vmuls(data, tvm.const(CONST_ONE_FIVE, "float32"))
    taylor_four_1 = te.lang.cce.vadds(taylor_five, tvm.const(CONST_ONE_FOUR_NEG, "float32"))
    # (0.2x - 0.25)x + 0.33333
    taylor_four_2 = te.lang.cce.vmul(taylor_four_1, data)
    taylor_three_1 = te.lang.cce.vadds(taylor_four_2, tvm.const(CONST_ONE_THREE, "float32"))
    # ((0.2x - 0.25)x + 0.33333)x - 0.5
    taylor_three_2 = te.lang.cce.vmul(taylor_three_1, data)
    taylor_two_1 = te.lang.cce.vadds(
        taylor_three_2, tvm.const(CONST_NEWTON_FACTOR_NEG, "float32"))
    # (((0.2x - 0.25)x + 0.33333)x - 0.5)x + 1
    taylor_two_2 = te.lang.cce.vmul(taylor_two_1, data)
    taylor_one = te.lang.cce.vadds(taylor_two_2, tvm.const(CONST_ONE, "float32"))
    # ((((0.2x - 0.25)x + 0.33333)x - 0.5)x + 1)x
    taylor = te.lang.cce.vmul(taylor_one, data)

    return taylor
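# Illustrative cross-check (hypothetical helper, coefficients assumed to be
# 1/5, -1/4, 1/3, -1/2, 1 as the docstring indicates): the same Horner
# polynomial in NumPy, compared against log(1 + d) for a small argument.
def _ref_log_taylor(d):
    import numpy as np
    d = np.asarray(d, dtype=np.float64)
    return ((((0.2 * d - 0.25) * d + 1.0 / 3.0) * d - 0.5) * d + 1.0) * d
# e.g. abs(_ref_log_taylor(0.1) - np.log1p(0.1)) < 2e-7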
def _cosh_taylor_compute(data): """ Calculate cosh = 1 + x^2( 1/2! + x^2( 1/4! + x^2/6!)) Parameters: ---------- data : the placeholder of data input Returns ------- A Tensor represents cosh(data). Has the same type as data. """ # x^2 / 6! pow_2 = te.lang.cce.vmul(data, data) pow_2_div = te.lang.cce.vmuls(pow_2, tvm.const(TAYLOR_SIXTH, data.dtype)) # 1/4! + x^2 / 6! pow_2_plus = te.lang.cce.vadds(pow_2_div, tvm.const(TAYLOR_FOURTH, data.dtype)) # 1/2! + x^2( 1/4! + x^2/6!) pow_4 = te.lang.cce.vmul(pow_2_plus, pow_2) pow_4_plus = te.lang.cce.vadds(pow_4, tvm.const(TAYLOR_SECOND, data.dtype)) # 1 + x^2( 1/2! + x^2( 1/4! + x^2/6!)) pow_6 = te.lang.cce.vmul(pow_4_plus, pow_2) res = te.lang.cce.vadds(pow_6, tvm.const(NUM_ONE, data.dtype)) return res
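# Illustrative cross-check (hypothetical helper), assuming TAYLOR_SECOND,
# TAYLOR_FOURTH and TAYLOR_SIXTH are 1/2!, 1/4! and 1/6! as the docstring states:
def _ref_cosh_taylor(x):
    import numpy as np
    x2 = np.asarray(x, dtype=np.float64) ** 2
    return 1.0 + x2 * (0.5 + x2 * (1.0 / 24.0 + x2 / 720.0))
# e.g. abs(_ref_cosh_taylor(0.5) - np.cosh(0.5)) < 1e-6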
def _compare_value_float(x_data, y_data):
    """
    The input data type of the function only supports float;
    The return value of the function: if x_data >= y_data return 1; else return 0.
    """
    # The smallest positive normal number of float32 is 2**(-126)
    min_value = tvm.const(2**(-126), dtype="float32")
    # (2**(-126))*(2**(62))*(2**(62))*(2**(2)) = 1
    # so min_value*max_value*max_value*max_value_1 = 1
    max_value = tvm.const(2**(62), dtype="float32")
    max_value_1 = tvm.const(2**(2), dtype="float32")
    data_zero = te.lang.cce.vmuls(x_data, 0)
    min_value_tensor = te.lang.cce.vadds(data_zero, min_value)
    max_value_tensor = te.lang.cce.vadds(data_zero, max_value)
    max_value_1_tensor = te.lang.cce.vadds(data_zero, max_value_1)
    sub_xy = te.lang.cce.vsub(x_data, y_data)
    add_min_value = te.lang.cce.vadds(sub_xy, min_value)
    vmax_zero = te.lang.cce.vmax(add_min_value, data_zero)
    vmin_min_value = te.lang.cce.vmin(vmax_zero, min_value_tensor)
    vmul_max_value = te.lang.cce.vmul(vmin_min_value, max_value_tensor)
    vmul_max_value_1 = te.lang.cce.vmul(vmul_max_value, max_value_tensor)
    result = te.lang.cce.vmul(vmul_max_value_1, max_value_1_tensor)
    return result
def _log1p_mini_compute(mini_res, input_x, shape):
    """
    do element-wise log(x + 1) compute on "mini" devices, refining the initial
    estimate y = mini_res with one Newton step:
    use the e^y based step when y(n) <= TAYLOR_NEGATIVE_THRESHOLD or
    y(n) >= TAYLOR_POSITIVE_THRESHOLD;
    use the seventh-order Taylor based step when
    TAYLOR_NEGATIVE_THRESHOLD < y(n) < TAYLOR_POSITIVE_THRESHOLD

    Parameters:
    ----------
    mini_res: TVM tensor, the tensor of log(x + 1)

    input_x : TVM tensor, the placeholder of input_x

    shape : tuple, the shape of input_x

    Returns
    -------
    A Tensor. Has the same type as mini_res.
    """
    input_y = mini_res
    newton_taylor_res = _newton_taylor_log1p(input_x, input_y)
    newton_exp_res = _newton_exp_log1p(input_x, input_y)
    input_left_border = tvm.const(TAYLOR_NEGATIVE_THRESHOLD, input_y.dtype)
    tensor_input_left_border = te.lang.dynamic.broadcast(
        input_left_border, shape)
    input_right_border = tvm.const(TAYLOR_POSITIVE_THRESHOLD, input_y.dtype)
    tensor_input_right_border = te.lang.dynamic.broadcast(
        input_right_border, shape)
    exp_taylor_neg = te.lang.dynamic.vcmpsel(input_y,
                                             tensor_input_left_border,
                                             'gt',
                                             newton_taylor_res,
                                             newton_exp_res)
    mini_res = te.lang.dynamic.vcmpsel(input_y,
                                       tensor_input_right_border,
                                       'lt',
                                       exp_taylor_neg,
                                       newton_exp_res)
    return mini_res
def prelu_compute(input_x, weight_input, output_y, kernel_name="prelu"):
    """
    calculating data

    Parameters
    ----------
    input_x : TVM tensor
        the placeholder of input_x
    weight_input : TVM tensor
        the placeholder of weight_input
    output_y : dict
        dict of output_y, include keys(shape and dtype)
    kernel_name : str
        kernel name, default value is "prelu"

    Returns
    -------
    output tensor
    """
    shape_x = te.lang.cce.util.shape_to_list(input_x.shape)
    if input_x.dtype == "float16":
        scalar_zero = tvm.const(0, dtype="float16")
    else:
        scalar_zero = tvm.const(0, dtype="float32")
    # prelu(x) = max(x, 0) + weight * min(x, 0)
    val_max = te.lang.cce.vmaxs(input_x, scalar_zero)
    val_min = te.lang.cce.vmins(input_x, scalar_zero)
    weight_input = te.lang.cce.broadcast(weight_input, shape_x)
    val_prod = te.lang.cce.vmul(val_min, weight_input)
    res = te.lang.cce.vadd(val_max, val_prod)
    return res
def abs_grad_compute(y, dy, z, kernel_name="abs_grad"): """ do abs_grad compute Parameters: ---------------- y: input tensor y dy: input tensor dy z: output dict kernel_name: cce kernel name, default value is "abs_grad" return: data_dy * sign(data_y) ---------------- """ dtype = dy.dtype if dtype == "float16": fp_max = tvm.const(2**15, dtype) fp_min = tvm.const(2**(-15), dtype) else: fp_max = tvm.const(2**62, dtype) fp_min = tvm.const(2**(-127), dtype) new_data = te.lang.cce.vmuls(y, fp_max) abs_data = te.lang.cce.vabs(new_data) denominator = te.lang.cce.vadds(abs_data, fp_min) res = te.lang.cce.vdiv(new_data, denominator) res = te.lang.cce.round(res) data1_res = te.lang.cce.vmul(res, dy) return data1_res
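# Illustrative NumPy model of the saturating-sign trick above (hypothetical
# helper, float32 constants from the else branch): scaling y by 2**62 makes
# any nonzero |y * fp_max| dwarf the fp_min added to the denominator, so the
# rounded ratio is exactly -1, 0 or +1.
def _ref_sign(y):
    import numpy as np
    scaled = np.asarray(y, dtype=np.float64) * 2.0 ** 62
    return np.round(scaled / (np.abs(scaled) + 2.0 ** -127))
# e.g. _ref_sign([-3.0, 0.0, 1e-20]) -> [-1., 0., 1.]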
def tanh_grad_compute(y, dy, z, kernel_name="tanh_grad"):
    """
    do element-wise tanh_grad operation between two input tensors

    Parameters
    ----------
    y: TVM tensor
        the placeholder of y input data
    dy: TVM tensor
        the placeholder of dy input data
    z: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name: str
        cce kernel name, default value is tanh_grad

    Returns
    -------
    res : tvm.tensor
        the result of tanh_grad
    """
    dtype = y.dtype
    compute_dtype = dtype
    if dtype == "float16":
        y = te.lang.cce.cast_to(y, "float32")
        dy = te.lang.cce.cast_to(dy, "float32")
        compute_dtype = "float32"

    # dtanh/dx = 1 - tanh(x)^2, so the gradient is (1 - y*y) * dy
    data1_square = te.lang.cce.vmul(y, y)
    data_mul = te.lang.cce.vmuls(data1_square, tvm.const(-1, dtype=compute_dtype))
    one_minus_square = te.lang.cce.vadds(data_mul, tvm.const(1, dtype=compute_dtype))

    res = te.lang.cce.vmul(one_minus_square, dy)
    if dtype == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res
def mish_compute(input_x, output_y, kernel_name="mish"):
    """
    algorithm: mish
    calculating mish: y = x*(1 - 2/(1 + (1 + exp(x))^2))

    Parameters
    ----------
    input_x: TVM tensor
        the placeholder of input data
    output_y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name: str
        cce kernel name, default value is mish

    Returns
    -------
    res : tvm.tensor
        the result of mish
    """
    dtype = input_x.dtype
    exp_val = te.lang.cce.vexp(input_x)
    add_exp_val = te.lang.cce.vadds(exp_val, tvm.const(1, dtype))
    pow_var = te.lang.cce.vmul(add_exp_val, add_exp_val)
    add_val = te.lang.cce.vadds(pow_var, tvm.const(1, dtype))
    rec_val = te.lang.cce.vrec(add_val)
    mul_val = te.lang.cce.vmuls(rec_val, tvm.const(-2, dtype=dtype))
    add_val2 = te.lang.cce.vadds(mul_val, tvm.const(1, dtype=dtype))
    res = te.lang.cce.vmul(input_x, add_val2)
    return res
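# Quick identity check (illustrative): with u = 1 + exp(x),
# tanh(log(u)) = (u*u - 1)/(u*u + 1) = 1 - 2/(u*u + 1), so the expression
# above equals the usual mish(x) = x * tanh(softplus(x)):
# import numpy as np
# x = np.linspace(-4.0, 4.0, 9)
# u = 1.0 + np.exp(x)
# np.allclose(x * (1.0 - 2.0 / (1.0 + u * u)),
#             x * np.tanh(np.log1p(np.exp(x))))  # -> True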
def acos_grad_compute(y, dy, z, kernel_name="acos_grad"):
    """
    do acos_grad compute with sqrt and div

    Parameters:
    ----------------
    y: input tensor y
    dy: input tensor dy
    z: output dict
    kernel_name: cce kernel name, default value is "acos_grad"

    return: dy * (-1 / (1 - y^2)^(1/2))
    ----------------
    """
    dtype = y.dtype
    dtype_1 = dtype
    if dtype == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        y = te.lang.cce.cast_to(y, "float32")
        dy = te.lang.cce.cast_to(dy, "float32")
        dtype = "float32"
    data1_square = te.lang.cce.vmul(y, y)
    data1_square = te.lang.cce.vmuls(data1_square,
                                     tvm.const(NUM_MINUS_ONE, dtype=dtype))
    data1_square = te.lang.cce.vadds(data1_square,
                                     tvm.const(NUM_ONE, dtype=dtype))

    data1_reciprocal = te.lang.cce.vsqrt(data1_square, 1)
    data1_reciprocal = te.lang.cce.vdiv(dy, data1_reciprocal)
    res = te.lang.cce.vmuls(data1_reciprocal,
                            tvm.const(NUM_MINUS_ONE, dtype=dtype))

    if dtype_1 == "float16":
        res = te.lang.cce.cast_to(res, "float16")
    return res
def _compute_positive(prox_v, alpha_broad, l1_broad, l2_broad): """ the operator's compute var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} Parameters: ---------- prox_v: the value of prox_v alpha_broad: the value of alpha_broad l1_broad: the value of l1_broad l2_broad: the value of l2_broad Returns the value of var_res """ prox_v_abs = te.lang.cce.vabs(prox_v) prox_v_sign = sign(prox_v) # 1+alpha*l2 alpha_l2 = te.lang.cce.vmul(alpha_broad, l2_broad) alpha_l2_1 = te.lang.cce.vadds(alpha_l2, tvm.const(CONST_ONE, "float32")) # max{|prox_v|-alpha*l1,0} alpha_l1 = te.lang.cce.vmul(alpha_broad, l1_broad) alpha_l1_neg = te.lang.cce.vmuls(alpha_l1, tvm.const(CONST_ONE_NEG, "float32")) prox_v_l1 = te.lang.cce.vadd(prox_v_abs, alpha_l1_neg) max_value = te.lang.cce.vmax( prox_v_l1, te.lang.cce.broadcast(tvm.const(CONST_ZERO, "float32"), prox_v.shape)) # sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} res = te.lang.cce.vdiv(prox_v_sign, alpha_l2_1) var_res = te.lang.cce.vmul(res, max_value) return var_res
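# Illustrative NumPy sketch of the same closed form (hypothetical helper):
# this is the standard soft-thresholding/proximal step for l1/l2 regularization.
def _ref_prox(prox_v, alpha, l1, l2):
    import numpy as np
    prox_v = np.asarray(prox_v, dtype=np.float64)
    return np.sign(prox_v) / (1.0 + alpha * l2) * np.maximum(np.abs(prox_v) - alpha * l1, 0.0)
# e.g. _ref_prox([-2.0, 0.05, 2.0], 1.0, 0.1, 0.5) -> [-1.2666..., 0.0, 1.2666...]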
def fake_quant_with_min_max_grad_compute(dout, x, min_val, max_val, quant_min, quant_max, kernel_name="fake_quant_with_min_max_grad"): """FakeQuantWithMinMaxGrad""" shape = te.lang.cce.util.shape_to_list(x.shape) shape_min = te.lang.cce.util.shape_to_list(min_val.shape) quant_min = tvm.const(quant_min, x.dtype) quant_max = tvm.const(quant_max, x.dtype) quant_min = te.lang.cce.broadcast(quant_min, shape_min) quant_max = te.lang.cce.broadcast(quant_max, shape_min) # CalNudge(NudgeMinMax) scale = te.lang.cce.vdiv(te.lang.cce.vsub(max_val, min_val), te.lang.cce.vsub(quant_max, quant_min)) zp_from_min = te.lang.cce.vsub(quant_min, te.lang.cce.vdiv(min_val, scale)) # Nudge zero point nudge_zp = te.lang.cce.round(te.lang.cce.vmin(quant_max, te.lang.cce.vmax(quant_min, zp_from_min))) nudge_min = te.lang.cce.vmul(te.lang.cce.vsub(quant_min, nudge_zp), scale) nudge_max = te.lang.cce.vmul(te.lang.cce.vsub(quant_max, nudge_zp), scale) nudge_min = te.lang.cce.broadcast(nudge_min, shape) nudge_max = te.lang.cce.broadcast(nudge_max, shape) bool_over_min = _less_compare_float32(nudge_min, x) bool_less_max = _less_compare_float32(x, nudge_max) bool_between = te.lang.cce.vmul(bool_over_min, bool_less_max) res = te.lang.cce.vmul(dout, bool_between) return res
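# Illustrative NumPy model of the gating above (hypothetical helper): the
# straight-through gradient estimator passes dout only where x falls strictly
# inside the nudged quantization range.
def _ref_fake_quant_grad(dout, x, nudge_min, nudge_max):
    import numpy as np
    x = np.asarray(x)
    inside = (np.asarray(nudge_min) < x) & (x < np.asarray(nudge_max))
    return np.asarray(dout) * inside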
def eltwise_compute(x, y, mode=1, coeff=None, kernel_name="eltwise"):
    '''
    Compute elementwise operation over the input tensors:
    mode 0 is product, mode 1 is sum (optionally weighted by coeff), mode 2 is max
    '''
    coeff = [] if coeff is None else coeff
    tensor_num = len(x)
    inp_dtype = x[0].dtype
    data0_tmp = x[0]

    tmp_y = {}
    tmp_y["addr_type"] = 0
    tmp_y["valid_shape"] = []
    tmp_y["slice_offset"] = []
    fuse_y = tmp_y if y is None else y
    fusion_params = get_fusion_params(x, fuse_y, tensor_num)

    if mode == 1:
        if len(coeff) != 0 and len(coeff) != tensor_num:
            errorInfo = {}
            errorInfo['errCode'] = "E81002"
            errorInfo['op_name'] = 'eltwise'
            errorInfo['coeff_length'] = str(len(coeff))
            errorInfo['input_num'] = str(tensor_num)
            raise RuntimeError(errorInfo,
                               "In op[%s], the parameter[coeff]'s length[%s] should be "
                               "equal to inputs'num[%s]." %
                               (errorInfo['op_name'], errorInfo['coeff_length'],
                                errorInfo['input_num']))
        if len(coeff) == tensor_num:
            if not isinstance(coeff[0], (int, float)):
                raise RuntimeError("ele of coeff must be a number.")
            if coeff[0] != 1:
                coeff1 = tvm.const(coeff[0], dtype=inp_dtype)
                data0_tmp = te.lang.cce.vmuls(data0_tmp, coeff1)

    res = None
    if tensor_num == 1:
        const_val_0 = tvm.const(0, dtype=inp_dtype)
        data0_tmp = te.lang.cce.vadds(data0_tmp, const_val_0)
        res = data0_tmp
    elif tensor_num > 1:
        for i in range(1, tensor_num):
            datan_tmp = x[i]
            if mode == 0:
                data0_tmp = te.lang.cce.vmul(data0_tmp, datan_tmp)
            elif mode == 2:
                data0_tmp = te.lang.cce.vmax(data0_tmp, datan_tmp)
            else:
                if len(coeff) == 0 or coeff[i] == 1:
                    data0_tmp = te.lang.cce.vadd(data0_tmp, datan_tmp)
                else:
                    coeff2 = tvm.const(coeff[i], dtype=inp_dtype)
                    datan_tmp = te.lang.cce.vmuls(datan_tmp, coeff2)
                    data0_tmp = te.lang.cce.vadd(data0_tmp, datan_tmp)
        res = data0_tmp

    res.op.attrs["ele_fusion_params"] = fusion_params
    return res
def newton_iteration(shape, tensor_x_rec, tensor_x, symbol, iter_num):
    """
    refine a reciprocal estimate with Newton-Raphson iteration:
    tmp(n+1) = tmp(n) * (2 - x * tmp(n)), which converges to 1/x

    Parameters
    ----------
    shape: tensor shape
    tensor_x_rec: tensor, initial estimate of 1/x
    tensor_x: tensor
    symbol: tensor symbol
    iter_num: number of Newton iterations to unroll

    Returns
    -------
    tensor_list: dict
    scope_list: dict
    emit_list: dict
    """
    dtype_c = tensor_x_rec.dtype
    num_two = tvm.const(2, dtype=dtype_c)
    neg_one = tvm.const(-1, dtype=dtype_c)
    tmp = tensor_x_rec
    tensor_list = {}
    scope_list = {}
    emit_list = {}
    tmp_mul = None
    tmp_neg = None
    tmp_add = None
    for index in range(0, iter_num):
        key = "tmp_mul_" + symbol + str(index)
        tmp_mul = tvm.compute(shape, lambda *i: tensor_x(*i) * tmp(*i), name=key)
        tensor_list[key] = tmp_mul
        scope_list[key] = cce.scope_ubuf
        emit_list[key] = "vector_mul"
        key = "tmp_neg_" + symbol + str(index)
        tmp_neg = tvm.compute(shape, lambda *i: tmp_mul(*i) * neg_one, name=key)
        tensor_list[key] = tmp_neg
        scope_list[key] = cce.scope_ubuf
        emit_list[key] = "vector_muls"
        key = "tmp_add_" + symbol + str(index)
        tmp_add = tvm.compute(shape, lambda *i: tmp_neg(*i) + num_two, name=key)
        tensor_list[key] = tmp_add
        scope_list[key] = cce.scope_ubuf
        emit_list[key] = "vector_adds"
        key = "tmp_" + symbol + str(index)
        tmp = tvm.compute(shape, lambda *i: tmp_add(*i) * tmp(*i), name=key)
        tensor_list[key] = tmp
        scope_list[key] = cce.scope_ubuf
        emit_list[key] = "vector_mul"
    return tensor_list, scope_list, emit_list
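# Numeric sanity check (illustrative, hypothetical helper): each unrolled stage
# above is one Newton-Raphson refinement of a reciprocal estimate.
def _ref_recip_newton(x, rec, iter_num):
    for _ in range(iter_num):
        rec = rec * (2.0 - x * rec)  # quadratic convergence towards 1/x
    return rec
# e.g. _ref_recip_newton(3.0, 0.3, 3) -> 0.33333333...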
def select_compute(condition, x1, x2, kernel_name="select"):
    """
    compute for select

    Parameters
    ----------
    condition: TVM tensor
        the placeholder of input condition
    x1: TVM tensor
        the placeholder of first input data
    x2: TVM tensor
        the placeholder of second input data
    kernel_name: str
        cce kernel name, default value is "select"

    Returns
    -------
    res : output of the result of select compute
    """
    shape = te.lang.cce.util.shape_to_list(x1.shape)
    org_dtype = x1.dtype
    x1_dtype = org_dtype
    con_shape = te.lang.cce.util.shape_to_list(condition.shape)
    bool_dtype = condition.dtype

    if x1_dtype in ("int8", "uint8"):
        x1_dtype = "float32"
        ones = te.lang.cce.broadcast(tvm.const(1, dtype=x1_dtype),
                                     shape, output_dtype=x1_dtype)
        x1 = te.lang.cce.cast_to(x1, "float32")
        x2 = te.lang.cce.cast_to(x2, "float32")
    else:
        ones = te.lang.cce.broadcast(tvm.const(1, dtype=x1_dtype),
                                     shape, output_dtype=x1_dtype)

    if bool_dtype == "int8":
        if x1_dtype == "int32":
            condition_dtype = te.lang.cce.ceil(condition)
        else:
            condition_dtype = te.lang.cce.cast_to(condition, x1_dtype)
    else:
        if x1_dtype == "int32":
            condition_dtype = condition
        else:
            condition_dtype = te.lang.cce.cast_to(condition, x1_dtype)

    if list(con_shape) != list(shape):
        condition_dtype = te.lang.cce.broadcast(condition_dtype, shape)

    condition_opp = te.lang.cce.vsub(ones, condition_dtype)

    temp_x = te.lang.cce.vmul(x1, condition_dtype)
    temp_y = te.lang.cce.vmul(x2, condition_opp)
    res = te.lang.cce.vadd(temp_x, temp_y)
    if org_dtype in ("int8", "uint8"):
        res = te.lang.cce.cast_to(res, org_dtype)
    return res
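# Illustrative NumPy model of the arithmetic select above: with a 0/1
# condition, cond*x1 + (1 - cond)*x2 reproduces np.where(cond, x1, x2):
# import numpy as np
# cond = np.array([1.0, 0.0, 1.0])
# x1 = np.array([1.0, 2.0, 3.0]); x2 = np.array([9.0, 8.0, 7.0])
# np.allclose(cond * x1 + (1.0 - cond) * x2, np.where(cond > 0, x1, x2))  # -> True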
def xlogy_grad_compute(placeholders, shape_max, dtype, rx, ry):
    """
    do element-wise xlogy_grad compute

    Parameters
    ----------
    placeholders : the placeholder of data input
    shape_max : the shape of broadcast
    dtype : the type of data input
    rx : the reduction indices of data input with broadcast
    ry : the reduction indices for data input with broadcast

    Returns
    -------
    output_y1 : gradient with respect to x1
    output_y2 : gradient with respect to x2
    """
    x1_ori = placeholders[0]
    x2_ori = placeholders[1]
    grad_ori = placeholders[2]

    fp32_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.vdiv", "float32")
    if dtype == "float32" and not fp32_support:
        raise RuntimeError("Don't support float32 in the platform.")

    if dtype == "float16" and fp32_support:
        x1 = te.lang.cce.cast_to(x1_ori, "float32")
        x2 = te.lang.cce.cast_to(x2_ori, "float32")
        grad = te.lang.cce.cast_to(grad_ori, "float32")
        x1 = te.lang.cce.broadcast(x1, shape_max)
        x2 = te.lang.cce.broadcast(x2, shape_max)
        grad = te.lang.cce.broadcast(grad, shape_max)
    else:
        x1 = te.lang.cce.broadcast(x1_ori, shape_max)
        x2 = te.lang.cce.broadcast(x2_ori, shape_max)
        grad = te.lang.cce.broadcast(grad_ori, shape_max)

    if dtype == "float16" and not fp32_support:
        esp_min = tvm.const(1.18e-7, dtype="float16")
    else:
        esp_min = tvm.const(1.18e-38, dtype="float32")
    # x1/(x1 + eps) is ~1 where x1 != 0 and exactly 0 where x1 == 0
    x1_addespmin = te.lang.cce.vadds(x1, esp_min)
    not_zero_x1 = te.lang.cce.vdiv(x1, x1_addespmin)
    log_x2 = te.lang.cce.vlog(x2)
    partial_x1 = te.lang.cce.vmul(not_zero_x1, log_x2)
    partial_x1g = te.lang.cce.vmul(partial_x1, grad)

    partial_x2 = te.lang.cce.vdiv(x1, x2)
    partial_x2g = te.lang.cce.vmul(partial_x2, grad)

    output_y1 = te.lang.cce.sum(partial_x1g, rx, keepdims=True)
    output_y2 = te.lang.cce.sum(partial_x2g, ry, keepdims=True)

    if dtype == "float16" and fp32_support:
        output_y1 = te.lang.cce.cast_to(output_y1, "float16")
        output_y2 = te.lang.cce.cast_to(output_y2, "float16")
    return output_y1, output_y2
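# Quick calculus check (illustrative): for f(x1, x2) = x1 * log(x2), the
# partials implemented above are df/dx1 = log(x2) (forced to 0 where x1 == 0
# by the x1/(x1 + eps) mask) and df/dx2 = x1/x2, each multiplied by the
# incoming gradient and sum-reduced over the broadcast axes:
# import numpy as np
# x1, x2, g = 2.0, 5.0, 1.0
# (x1 / (x1 + 1.18e-38)) * np.log(x2) * g   # ~ log(5) = 1.6094...
# (x1 / x2) * g                             # 0.4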