def _atan_compute(data):
    """Compute for atan."""
    dtype = data.dtype

    if dtype == "float16":
        data = topi.cast(data, "float32")

    abs_data = topi.abs(data)
    tensor_one = dc.one_const(abs_data.dtype)

    abs_data_sub_one = topi.subtract(abs_data, tensor_one)
    abs_data_add_one = topi.add(abs_data, tensor_one)
    abs_data2 = topi.abs(topi.divide(abs_data_sub_one, abs_data_add_one))

    # calculate data less than one
    res = _do_atan_taylor(abs_data)
    # calculate data more than one
    res_mt_one = topi.add(_do_atan_taylor(abs_data2),
                          tvm.const(CONST_PI_BY_FOUR, abs_data2.dtype))
    res = topi.minimum(res, res_mt_one)

    if utils.product_is_mini() and data.dtype == "float32":
        sign_mask = topi.cast(topi.sign(topi.cast(data, "float16")), "float32")
    else:
        sign_mask = topi.sign(data)

    res = topi.multiply(res, sign_mask)

    if dtype == "float16":
        res = topi.cast(res, "float16")

    return res
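
# A minimal NumPy sanity check (not part of AKG) for the range-reduction
# identity used above: for x > 0, atan(x) = pi/4 + atan((x - 1)/(x + 1)),
# which keeps the Taylor-series argument |(x - 1)/(x + 1)| small.
import numpy as np

x = np.linspace(0.1, 10.0, 100)
lhs = np.arctan(x)
rhs = np.pi / 4 + np.arctan((x - 1) / (x + 1))
assert np.allclose(lhs, rhs)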
def tan_compute(input_x):
    """tan compute implementation"""
    dtype = input_x.dtype

    # cast to float32 when dtype is float16 (cloud and mini) or int32 (cloud)
    if dtype == FLOAT_16 or dtype == FLOAT_32 or (dtype == INT_32 and not product_is_mini()):
        input_x = topi.cast(input_x, FLOAT_32)
        # adjust x to [-pi/2, pi/2] using x = x - round(x/pi)*pi
        round_pi_div = akg.lang.ascend.round(
            topi.multiply(input_x, tvm.const(1.0 / PI, FLOAT_32)))
        round_pi_div = akg.lang.ascend.cast_to(round_pi_div, FLOAT_32)
        input_x = topi.subtract(
            input_x, topi.multiply(round_pi_div, tvm.const(PI, FLOAT_32)))
    # cast to float16 when dtype is int32 on mini
    elif dtype == INT_32 and product_is_mini():
        input_x = topi.cast(input_x, FLOAT_16)
        # adjust x to [-pi/2, pi/2] using x = x - round(x/pi)*pi
        round_pi_div = akg.lang.ascend.round(
            topi.multiply(input_x, tvm.const(1.0 / PI, FLOAT_16)))
        round_pi_div = akg.lang.ascend.cast_to(round_pi_div, FLOAT_16)
        input_x = topi.subtract(
            input_x, topi.multiply(round_pi_div, tvm.const(PI, FLOAT_16)))

    res = _tan_2x_multi(input_x, TAN_2X_TIMES)
    # cast back to the original dtype
    res = topi.cast(res, dtype)

    return res
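
# A minimal NumPy sanity check (not part of AKG) for the range reduction
# above: tan has period pi, so tan(x) == tan(x - round(x/pi)*pi), and the
# reduced argument always lies in [-pi/2, pi/2].
import numpy as np

x = np.linspace(-10.0, 10.0, 1001)
x_reduced = x - np.round(x / np.pi) * np.pi
assert np.all(np.abs(x_reduced) <= np.pi / 2)
assert np.allclose(np.tan(x), np.tan(x_reduced))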
def matrix_diag_part_compute(input_diagonal, input_help):
    """matrix_diag_part compute implementation"""
    shape_input_diagonal = get_shape(input_diagonal)
    dtype_input_diagonal = input_diagonal.dtype

    if dtype_input_diagonal == "int8" or dtype_input_diagonal == "uint8":
        input_diagonal = topi.cast(input_diagonal, "float16")
        input_help = topi.cast(input_help, "float16")
    if dtype_input_diagonal == "int32" and product_is_mini():
        input_diagonal = topi.cast(input_diagonal, "float16")
        input_help = topi.cast(input_help, "float16")
        input_diagonal = topi.cast(input_diagonal, "float32")
        input_help = topi.cast(input_help, "float32")
    if dtype_input_diagonal == "int32" and not product_is_mini():
        input_diagonal = topi.cast(input_diagonal, "float32")
        input_help = topi.cast(input_help, "float32")

    res_vmul = topi.multiply(input_help, input_diagonal)

    if shape_input_diagonal[-2] < shape_input_diagonal[-1]:
        res = topi.sum(res_vmul, -1)
    else:
        res = topi.sum(res_vmul, -2)

    if dtype_input_diagonal == "int32" and product_is_mini():
        res = topi.cast(res, "float16")

    res = topi.cast(res, dtype_input_diagonal)

    return res
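
# A minimal NumPy illustration (not part of AKG) of the trick above:
# `input_help` is assumed to be a 0/1 mask with ones on the diagonal, so an
# elementwise multiply followed by a sum over the longer of the last two
# axes extracts the diagonal.
import numpy as np

a = np.arange(12.0).reshape(3, 4)   # rows (3) < cols (4), so reduce the last axis
help_mask = np.eye(3, 4)
assert np.array_equal((a * help_mask).sum(axis=-1), np.diagonal(a))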
def fused_relu_grad_bn_double_update_grad(data_1, data_2, data_3, data_4, data_5, data_6, data_7, layout='NHWC'):
    if layout == "NCHW":
        # reassigning a loop variable would leave the tensors untouched,
        # so transpose each one explicitly
        data_2, data_4, data_5, data_6, data_7 = (
            topi.transpose(t, axes=(0, 2, 3, 1))
            for t in (data_2, data_4, data_5, data_6, data_7))
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {} '.format(layout))

    data_tmp1 = topi.full_like(data_7, 0.0)
    data_tmp2 = topi.greater(data_7, data_tmp1)
    data_tmp3 = topi.add(data_5, data_6)
    data_tmp4 = topi.where(data_tmp2, data_tmp3, data_tmp1)
    data_tmp5 = topi.cast(data_tmp4, 'float32')
    data_tmp7 = topi.sum(data_tmp5, axis=(0, 1, 2))

    n, h, w, c = data_7.shape
    data_tmp8 = topi.cast(data_2, 'float32')
    data_tmp9 = topi.full_like(data_tmp7, 1.0 / (n * h * w))
    data_tmp10 = topi.multiply(data_1, data_tmp9)
    data_tmp11 = topi.broadcast_to(data_tmp10, data_tmp8.shape)
    data_tmp12 = topi.subtract(data_tmp8, data_tmp11)
    data_tmp13 = topi.multiply(data_tmp5, data_tmp12)
    data_tmp15 = topi.sum(data_tmp13, axis=(0, 1, 2))

    data_tmp16 = topi.cast(data_4, 'float32')
    data_tmp17 = topi.multiply(data_3, data_tmp9)
    data_tmp18 = topi.broadcast_to(data_tmp17, data_tmp16.shape)
    data_tmp19 = topi.subtract(data_tmp16, data_tmp18)
    data_tmp20 = topi.multiply(data_tmp5, data_tmp19)
    data_tmp22 = topi.sum(data_tmp20, axis=(0, 1, 2))

    return [data_tmp7, data_tmp15, data_tmp22]
def fused_l2loss_grad(data_f16, data_f32, layout='NHWC', fill_data=4e-05, target=utils.CUDA):
    """
    fused_l2loss_grad.

    Args:
        data_f16 (tvm.tensor.Tensor): float16 input tensor.
        data_f32 (tvm.tensor.Tensor): float32 input tensor.
        layout (str): input layout, only 'NCHW', 'NHWC' supported.
        fill_data (float): scale coefficient applied to data_f32.

    Returns:
        tvm.tensor.Tensor: data_f16 + fill_data * data_f32.
    """
    if layout == "NCHW":
        data_f16 = topi.transpose(data_f16, axes=(0, 2, 3, 1))
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {} '.format(layout))

    data_f16 = topi.cast(data_f16, 'float32')
    constant_tmp = topi.cast(fill_data, 'float32')
    data_constant = topi.full_like(data_f32, constant_tmp)
    data_out = topi.multiply(data_constant, data_f32)
    data_out = topi.add(data_f16, data_out)

    return data_out
def selu_compute(input_data):
    """selu compute implementation"""
    # if input dtype is float16 or float32, compute in float32;
    # otherwise compute in float16
    dtype = input_data.dtype
    if dtype == "float16" or dtype == "float32":
        input_data = topi.cast(input_data, "float32")
        type_tmp = "float32"
    else:
        input_data = topi.cast(input_data, "float16")
        type_tmp = "float16"

    # generate tensor_zero to be compared against
    tensor_zero = topi.multiply(input_data, tvm.const(0, dtype=type_tmp))
    # generate negative_res and positive_res for the cases where the
    # element value is less than 0 and greater than 0, respectively
    negative_res = topi.minimum(input_data, tensor_zero)
    positive_res = topi.maximum(input_data, tensor_zero)
    exp_res = exp(negative_res)
    sub_res = topi.add(exp_res, tvm.const(SCALAR_NEGATIVE_ONE, dtype=type_tmp))
    negative_muls_res = topi.multiply(sub_res, tvm.const(SCALE_ALPHA_PRODUCT, dtype=type_tmp))
    if dtype == "int8":
        negative_muls_res = akg.lang.cce.ceil(negative_muls_res)

    positive_muls_res = topi.multiply(positive_res, tvm.const(SCALE, dtype=type_tmp))
    res = topi.add(negative_muls_res, positive_muls_res)
    # cast back to the original dtype
    if dtype == "float16" or dtype == "int8" or dtype == "int32":
        res = topi.cast(res, dtype)

    return res
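
# A minimal NumPy reference (not part of AKG) for the SELU formula computed
# above: selu(x) = scale * (max(x, 0) + alpha * (exp(min(x, 0)) - 1)),
# where SCALE_ALPHA_PRODUCT above corresponds to scale * alpha. The scale
# and alpha values below are the standard SELU constants.
import numpy as np

scale, alpha = 1.0507009873554805, 1.6732632423543772
x = np.linspace(-5.0, 5.0, 101)
selu_ref = scale * np.maximum(x, 0) + scale * alpha * (np.exp(np.minimum(x, 0)) - 1)
# spot-check against the defining piecewise form
assert np.allclose(selu_ref[x > 0], scale * x[x > 0])
assert np.allclose(selu_ref[x <= 0], scale * alpha * (np.exp(x[x <= 0]) - 1))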
def _compute_mini(data_input, shape):
    """
    Use log and taylor to compute arctanh.

    arctanh has the feature: arctanh(-abs(x)) = -arctanh(abs(x))
    """
    data_abs = topi.abs(data_input)
    result_ln = _compute_log(data_abs)
    result_taylor = _compute_taylor(data_abs)

    data_abs = topi.cast(data_abs, "float16")
    data_input = topi.cast(data_input, "float16")
    result_taylor = topi.cast(result_taylor, "float16")
    result_ln = topi.cast(result_ln, "float16")
    # when |x| < 0.5 use the taylor result, and when 0.5 <= |x| < 1 use log()
    data_res = tvm.compute(shape,
                           lambda *i: akg.tvm.expr.Select(data_abs(*i) < dc.half_const("float16"),
                                                          result_taylor(*i),
                                                          result_ln(*i)),
                           name="le")

    # arctanh has the feature: arctanh(-abs(x)) = -arctanh(abs(x))
    data_res_neg = topi.multiply(data_res, dc.neg_one_const("float16"))
    data_res = tvm.compute(shape,
                           lambda *i: akg.tvm.expr.Select(data_input(*i) < dc.zero_const("float16"),
                                                          data_res_neg(*i),
                                                          data_res(*i)),
                           name="neg")

    return data_res
def _asin_grad_compute(x, dy):
    """Compute asin_grad."""
    dtype = x.dtype
    if dtype == "float16":
        x = topi.cast(x, "float32")
        dy = topi.cast(dy, "float32")

    # step 1: calculate num_to_vrsqrt = 1 - x^2
    data = topi.multiply(x, x)
    data = topi.multiply(data, tvm.const(-1, "float32"))
    num_to_vrsqrt = topi.add(data, tvm.const(1, "float32"))

    # step 2: calculate dy * (1 / sqrt(1 - x^2))
    if utils.product_is_mini():
        # mini: use newton's method for a high-accuracy result
        res = _vrsqrt_newton(num_to_vrsqrt)
        res = topi.multiply(res, dy)
    else:
        # cloud: use vdiv for high-efficiency computation
        vsqrt_res = topi.sqrt(num_to_vrsqrt)
        res = topi.divide(dy, vsqrt_res)

    if dtype == "float16":
        res = topi.cast(res, "float16")

    return res
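
# A minimal NumPy check (not part of AKG) of the derivative used above,
# d/dx asin(x) = 1 / sqrt(1 - x^2), against a central finite difference.
import numpy as np

x = np.linspace(-0.9, 0.9, 19)
eps = 1e-6
numeric = (np.arcsin(x + eps) - np.arcsin(x - eps)) / (2 * eps)
analytic = 1.0 / np.sqrt(1.0 - x * x)
assert np.allclose(numeric, analytic, rtol=1e-4)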
def fused_bn_follow_relu_avgpool(data0, data1, data2, data3, data4, data5, layout='NHWC', out_dtype='float16', target=utils.CUDA):
    """
    input:
        data: length is 6
        data0: tensor1 after bn_double_relu
        data1-5: bn parameters for conv2d tensor2
        layout: only (N, H, W, C), (N, C, H, W) supported
        out_dtype: float16

    output:
        avg-pooling( max(batch-normalized tensor1 + batch-normalized tensor2, 0) )
    """
    if layout == 'NCHW':
        data0 = topi.transpose(data0, (0, 2, 3, 1))
        data5 = topi.transpose(data5, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError('Layout not supported {} '.format(layout))

    n, h, w, c = data0.shape
    inter_dtype = 'float32'
    add0 = fused_bn_follow(data1, data2, data3, data4, data5)
    add0 = topi.cast(add0, data0.dtype)
    add1 = topi.add(data0, add0)
    output = topi.maximum(add1, 0)

    output = topi.cast(output, inter_dtype)
    output = topi.sum(output, axis=(1, 2))
    output = topi.divide(output, h * w)
    output = topi.cast(output, out_dtype)

    return output
def atanh(input_data):
    """
    Return atanh(x) = 0.5 * ln((1+x)/(1-x)) if abs(x) < 1.

    Args:
        input_data (tvm.tensor.Tensor): Input tensor, only support float16, float32.

    Returns:
        A tvm.tensor.Tensor as result of atanh.

    Supported Platforms:
        'Ascend'
    """
    shape = get_shape(input_data)
    utils.check_shape(shape)

    inp_dtype = input_data.dtype
    utils.ops_dtype_check(inp_dtype, utils.DtypeForDavinci.ALL_FLOAT)

    if inp_dtype == "float16":
        input_data = topi.cast(input_data, "float32")

    if product_is_mini():
        res = _compute_mini(input_data, shape)
    else:
        res = _compute_cloud(input_data)

    res = topi.cast(res, inp_dtype)

    return res
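
# A minimal NumPy check (not part of AKG) of the closed form in the
# docstring: atanh(x) = 0.5 * ln((1 + x)/(1 - x)) for |x| < 1.
import numpy as np

x = np.linspace(-0.99, 0.99, 199)
assert np.allclose(np.arctanh(x), 0.5 * np.log((1 + x) / (1 - x)))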
def fused_bn_reduce(data, layout, out_dtype):
    """
    input:
        data: 4-D Tensor
        layout: input layout, only 'NCHW', 'NHWC' supported
        out_dtype: "float16" or "float32"

    output:
        out1_sum: 1-D tensor (C), per-channel sum over the N, H, W axes
        out2_squared_sum: 1-D tensor (C), per-channel sum of squares over the N, H, W axes
    """
    if layout == "NCHW":
        data = topi.transpose(data, axes=(0, 2, 3, 1))
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {} '.format(layout))

    inter_dtype = 'float32'
    data_cast = topi.cast(data, inter_dtype)

    out1_sum = topi.sum(data_cast, axis=(0, 1, 2))
    out1_sum = topi.cast(out1_sum, out_dtype)

    squared = topi.multiply(data_cast, data_cast)
    out2_squared_sum = topi.sum(squared, axis=(0, 1, 2))
    out2_squared_sum = topi.cast(out2_squared_sum, out_dtype)

    return [out1_sum, out2_squared_sum]
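
# A minimal NumPy reference (not part of AKG) for the two reductions above:
# both outputs reduce an NHWC tensor over axes (N, H, W), leaving one value
# per channel, as batch-norm statistics require.
import numpy as np

x = np.random.rand(2, 4, 4, 3).astype(np.float32)
out1 = x.sum(axis=(0, 1, 2))         # per-channel sum, shape (3,)
out2 = (x * x).sum(axis=(0, 1, 2))   # per-channel sum of squares, shape (3,)
assert out1.shape == out2.shape == (3,)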
def fused_mul_div_rsqrt_mul_isfinite_red(input1, input2, out_dtype):
    """
    fused operator.

    Args:
        input1: tvm.tensor.Tensor.
        input2: tvm.tensor.Tensor.
        out_dtype: dtype of the output Tensors.

    Returns:
        list of tvm.tensor.Tensor.
    """
    mul_param1 = topi.multiply(input2, input2)
    divide_val = topi.divide(1, mul_param1)
    rsqrt_val = topi.rsqrt(divide_val)
    mul_param0 = topi.multiply(input1, rsqrt_val)
    isfinite = topi.isfinite(mul_param0)
    reduce_and = topi.all(isfinite)

    if mul_param0.dtype != out_dtype:
        mul_param0 = topi.cast(mul_param0, out_dtype)
        rsqrt_val = topi.cast(rsqrt_val, out_dtype)
        divide_val = topi.cast(divide_val, out_dtype)

    return [reduce_and, mul_param0, rsqrt_val, divide_val]
def sigmoid_cross_entropy_with_logits_grad_compute(predict, target, dout):
    """sigmoid_cross_entropy_with_logits_grad compute implementation"""
    dtype = predict.dtype
    if dtype == "float16":
        predict = topi.cast(predict, "float32")
        target = topi.cast(target, "float32")
        dout = topi.cast(dout, "float32")

    # e^x
    val1 = exp(predict)
    # 1 + e^x
    val2 = topi.add(val1, tvm.const(SCALAR_ONE, dtype="float32"))
    # e^x / (1 + e^x), i.e. sigmoid(x)
    val3 = topi.divide(val1, val2)
    # -target
    val4 = topi.multiply(target, tvm.const(SCALAR_NEGTIVE_ONE, dtype="float32"))
    # e^x / (1 + e^x) - y
    val5 = topi.add(val3, val4)
    result = topi.multiply(val5, dout)

    if dtype == "float16":
        result = topi.cast(result, dtype)
    return result
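
# A minimal NumPy check (not part of AKG) that the gradient computed above,
# sigmoid(x) - y, matches a finite difference of the numerically stable
# sigmoid cross entropy loss L(x) = max(x, 0) - x*y + log(1 + exp(-|x|)).
import numpy as np

def loss(x, y):
    return np.maximum(x, 0) - x * y + np.log1p(np.exp(-np.abs(x)))

x = np.linspace(-4.0, 4.0, 17)
y = np.random.rand(17)
eps = 1e-6
numeric = (loss(x + eps, y) - loss(x - eps, y)) / (2 * eps)
analytic = 1.0 / (1.0 + np.exp(-x)) - y
assert np.allclose(numeric, analytic, rtol=1e-4, atol=1e-6)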
def asinh(x, target=utils.CCE):
    r"""
    Compute asinh function.

    .. math:: asinh(x) = log(x+\sqrt{x*x+1})

    Args:
        x (tvm.tensor.Tensor): Tensor of type float16, float32.

    Returns:
        tvm.tensor.Tensor, has the same type and shape as x.

    Supported Platforms:
        'Ascend'
    """
    # check shape
    utils.check_shape(x)

    # check input tensor data_type
    utils.ops_dtype_check(x.dtype, utils.DtypeForDavinci.ALL_FLOAT)

    dtype = x.dtype

    # Known that asinh(x) = log(x + sqrt(x*x+1)), and asinh(-x) = -asinh(x).
    # If x is a large negative number, (x + sqrt(x*x+1)) will be close to zero.
    # So use asinh(x) = sign(x) * log(|x| + sqrt(|x|*|x| + 1)) instead.
    compute_dtype = dtype
    if dtype == "float16":
        # to avoid overflow and keep accuracy, x is cast to float32
        compute_dtype = "float32"
        x = topi.cast(x, compute_dtype)

    x_abs = topi.abs(x)

    if product_is_mini():
        # sqrt(|x|*|x| + 1) = |x| * sqrt(1 + 1/(|x|*|x|))
        vsquare_add_one = topi.add(1, topi.divide(1, topi.multiply(x_abs, x_abs)))
        sqrt_compute_value = sqrt_mini_newton_iter_impl(vsquare_add_one)
        sqrt_value = topi.multiply(x_abs, sqrt_compute_value)
    else:
        x_abs_square_add_one = topi.add(topi.multiply(x_abs, x_abs), 1)
        sqrt_value = topi.sqrt(x_abs_square_add_one)

    x_add_sqrt = topi.add(x_abs, sqrt_value)

    if product_is_mini():
        log_value = log_compute_mini_impl(x_add_sqrt, target)
    else:
        log_value = topi.log(x_add_sqrt)

    res = topi.multiply(Sign(x, target), log_value)

    if res.dtype != dtype:
        res = topi.cast(res, dtype)

    if product_is_mini():
        attrs = {"enable_auto_inline": False}
        return res, attrs
    return res
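
# A minimal NumPy check (not part of AKG) of the identities in the comments
# above: asinh(x) = log(x + sqrt(x*x + 1)), and the numerically safer
# odd-symmetry form asinh(x) = sign(x) * log(|x| + sqrt(|x|*|x| + 1)).
import numpy as np

x = np.linspace(-20.0, 20.0, 401)
direct = np.log(x + np.sqrt(x * x + 1))
safe = np.sign(x) * np.log(np.abs(x) + np.sqrt(x * x + 1))
assert np.allclose(np.arcsinh(x), direct)
assert np.allclose(np.arcsinh(x), safe)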
def select(l1, tmp_val, gradient_accum):
    """Returns tmp_val if l1 > 0 else gradient_accum."""
    if product_is_mini():
        l1 = topi.cast(l1, "float16")
        tmp_val = topi.cast(tmp_val, "float16")
        gradient_accum = topi.cast(gradient_accum, "float16")
    tmp_val = akg.tvm.compute(
        tmp_val.shape,
        lambda *i: tvm.expr.Select(l1[0] > 0, tmp_val(*i), gradient_accum(*i)))
    return topi.cast(tmp_val, "float32") if product_is_mini() else tmp_val
def fused_relu_grad_bn_double_reduce_grad(data0, data1, data2, data3, data4, data5, data6, data7,
                                          data8, data9, data10, data11, data12, data13, data14, data15,
                                          layout="NHWC", out_dtype="float16", target=utils.CUDA):
    if layout == 'NCHW':
        data5 = topi.transpose(data5, (0, 2, 3, 1))
        data9 = topi.transpose(data9, (0, 2, 3, 1))
        data13 = topi.transpose(data13, (0, 2, 3, 1))
        data14 = topi.transpose(data14, (0, 2, 3, 1))
        data15 = topi.transpose(data15, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError('Layout not supported {} '.format(layout))

    inter_dtype = "float32"
    n, h, w, c = data5.shape
    scale = n * h * w

    mul = topi.multiply(data2, data3)
    mul1221 = topi.divide(mul, scale)

    # ReluGrad
    zero = tvm.const(0, data15.dtype)
    add = topi.add(data13, data14)
    addgrad = tvm.compute(add.shape,
                          lambda *i: tvm.if_then_else(data15(*i) >= zero, add(*i), zero),
                          tag=tag.INJECTIVE)
    addgrad = topi.cast(addgrad, inter_dtype)
    mul3283 = topi.multiply(scale, addgrad)
    sub1159 = topi.subtract(mul3283, data6)

    data5_cast = topi.cast(data5, inter_dtype)
    mul2372 = topi.divide(data4, scale)
    sub631 = topi.subtract(data5_cast, mul2372)
    mul1220 = topi.multiply(sub631, data1)
    div = topi.divide(mul1220, data0)
    sub271 = topi.subtract(sub1159, div)
    mul1218 = topi.multiply(mul1221, sub271)
    mul1218_cast = topi.cast(mul1218, out_dtype)

    mul1231 = topi.multiply(data11, data12)
    mul1230 = topi.divide(mul1231, scale)
    data9_cast = topi.cast(data9, inter_dtype)
    mul2364 = topi.divide(data8, scale)
    sub625 = topi.subtract(data9_cast, mul2364)
    mul1229 = topi.multiply(data10, sub625)
    div272 = topi.divide(mul1229, data7)
    sub272 = topi.subtract(sub1159, div272)
    mul1228 = topi.multiply(mul1230, sub272)
    mul1228_cast = topi.cast(mul1228, out_dtype)

    if layout == "NCHW":
        mul1218_cast = topi.transpose(mul1218_cast, (0, 3, 1, 2))
        mul1228_cast = topi.transpose(mul1228_cast, (0, 3, 1, 2))
    return [mul1218_cast, mul1228_cast]
def _apply_rms_prop_compute(var, ms, mom, grad, lr, momentum, rho, epsilon):
    """Compute apply_rms_prop"""
    compute_dtype = "float32"
    dtype = var.dtype
    if dtype != compute_dtype:
        var, ms, mom, grad, lr, momentum, rho = [
            topi.cast(t, compute_dtype) for t in [var, ms, mom, grad, lr, momentum, rho]]

    shape = get_shape(var)
    cons_eps = akg.tvm.const(epsilon, dtype=compute_dtype)
    one_minus_rho = akg.tvm.compute(
        (1,), lambda *indice: akg.tvm.const(1.0, compute_dtype) - rho[0], name="one_minus_rho")

    # ms_update  = rho * ms + (1 - rho) * grad * grad
    # mom_update = momentum * mom + lr * grad / sqrt(ms_update + epsilon)
    # var_update = var - mom_update
    mom_1 = akg.tvm.compute(shape, lambda *indice: momentum[0] * mom(*indice), name="mom_1")
    lr_grad = akg.tvm.compute(shape, lambda *indice: grad(*indice) * lr[0], name="lr_grad")
    rho_ms = akg.tvm.compute(shape, lambda *indice: ms(*indice) * rho[0], name="rho_ms")
    rho_grad2 = akg.tvm.compute(
        shape, lambda *indice: grad(*indice) * grad(*indice) * one_minus_rho[0], name="rho_grad2")
    ms_update = akg.tvm.compute(
        shape, lambda *indice: rho_ms(*indice) + rho_grad2(*indice), name="ms_update")
    ms_eps = akg.tvm.compute(shape, lambda *indice: ms_update(*indice) + cons_eps, name="ms_eps")
    rsq = rsqrt(ms_eps, target="cce")
    mom_2 = akg.tvm.compute(shape, lambda *indice: lr_grad(*indice) * rsq(*indice), name="mom_2")
    mom_update = akg.tvm.compute(
        shape, lambda *indice: mom_1(*indice) + mom_2(*indice), name="mom_update")
    var_update = akg.tvm.compute(
        shape, lambda *indice: var(*indice) - mom_update(*indice), name="var_update")

    if var_update.dtype != dtype:
        var_update, ms_update, mom_update = [
            topi.cast(t, dtype) for t in [var_update, ms_update, mom_update]]

    return var_update, ms_update, mom_update
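
# A minimal NumPy reference (not part of AKG) of the update rule the
# schedule above builds, for a single step:
#   ms  <- rho * ms + (1 - rho) * grad^2
#   mom <- momentum * mom + lr * grad / sqrt(ms + epsilon)
#   var <- var - mom
import numpy as np

var, ms, mom = np.array([1.0]), np.array([0.5]), np.array([0.1])
grad, lr, momentum, rho, epsilon = np.array([0.2]), 0.01, 0.9, 0.99, 1e-8

ms_new = rho * ms + (1 - rho) * grad * grad
mom_new = momentum * mom + lr * grad / np.sqrt(ms_new + epsilon)
var_new = var - mom_new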
def fused_relu_grad_bn_reduce_grad(data_1, data_2, data_3, data_4, data_5, data_6, data_7,
                                   data_8, data_9, layout='NHWC', target=utils.CUDA):
    """
    fused_relu_grad_bn_reduce_grad.

    Args:
        data_1~data_9: tvm.tensor.Tensor.
        layout: input layout, only 'NCHW', 'NHWC' supported

    Returns:
        tvm.tensor.Tensor.
    """
    if layout == "NCHW":
        # reassigning a loop variable would leave the tensors untouched,
        # so transpose each one explicitly
        data_7, data_8, data_9 = (
            topi.transpose(t, axes=(0, 2, 3, 1)) for t in (data_7, data_8, data_9))
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {} '.format(layout))

    data_tmp1 = topi.multiply(data_4, data_5)
    n, h, w, c = data_9.shape
    data_tmp2 = topi.full_like(data_tmp1, 1.0 / (n * h * w))
    data_tmp3 = topi.multiply(data_tmp1, data_tmp2)

    data_tmp5 = topi.full_like(data_9, 0.0)
    data_tmp6 = topi.greater(data_9, data_tmp5)
    data_tmp7 = topi.where(data_tmp6, data_8, data_tmp5)
    data_tmp8 = topi.cast(data_tmp7, 'float32')
    data_tmp9 = topi.full_like(data_tmp8, n * h * w)
    data_tmp10 = topi.multiply(data_tmp8, data_tmp9)
    data_tmp12 = topi.subtract(data_tmp10, data_3)

    data_tmp14 = topi.cast(data_7, 'float32')
    data_tmp15 = topi.multiply(data_6, data_tmp2)
    data_tmp17 = topi.subtract(data_tmp14, data_tmp15)
    data_tmp18 = topi.multiply(data_2, data_tmp17)
    data_tmp20 = topi.divide(data_tmp18, data_1)
    data_tmp21 = topi.subtract(data_tmp12, data_tmp20)
    data_tmp22 = topi.multiply(data_tmp3, data_tmp21)

    data_out = topi.cast(data_tmp22, 'float16')
    return data_out
def _asin_compute(data_input):
    """Compute asin"""
    dtype = data_input.dtype
    boundary = tvm.const(BOUNDARY, "float32")

    # change dtype to float32
    if dtype == "float16":
        data_input = topi.cast(data_input, "float32")

    # sign mask
    data_sign = sign(data_input)

    # make all values positive
    data1 = topi.multiply(data_input, data_sign)

    # x belongs to (0, 2^(-0.5))
    choice_1 = topi.minimum(data1, boundary)
    choice_1 = topi.subtract(choice_1, boundary)
    choice_1_floor = akg.lang.cce.floor(choice_1)
    # the dtype of choice_1_floor is int32, so it needs to be cast to fp32
    if utils.product_is_mini():
        choice_1_floor = topi.cast(choice_1_floor, "float16")
        choice_1_floor = topi.cast(choice_1_floor, "float32")
    else:
        choice_1_floor = topi.cast(choice_1_floor, "float32")
    choice_1 = topi.multiply(choice_1_floor, neg_one_const("float32"))

    taylor1 = _taylor_compute(data1)
    res_1 = topi.multiply(taylor1, choice_1)

    # x belongs to (2^(-0.5), 1)
    choice_2 = topi.subtract(one_const("float32"), choice_1)
    data2 = topi.subtract(one_const("float32"), topi.multiply(data1, data1))
    data2_sqrt = _sqrt(data2)

    taylor2 = _taylor_compute(data2_sqrt, data2)
    res_2 = topi.multiply(taylor2, neg_one_const("float32"))
    res_2 = topi.add(res_2, tvm.const(HALF_PI, "float32"))
    res_2 = topi.multiply(res_2, choice_2)

    # restore sign
    res_1 = topi.add(res_1, res_2)
    res_1 = topi.multiply(res_1, data_sign)

    # restore dtype
    if dtype == "float16":
        res_1 = topi.cast(res_1, "float16")

    return res_1
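
# A minimal NumPy check (not part of AKG) of the reduction used above for
# x in (2^(-0.5), 1): asin(x) = pi/2 - asin(sqrt(1 - x^2)), which maps the
# Taylor-series argument back into (0, 2^(-0.5)).
import numpy as np

x = np.linspace(2 ** -0.5 + 1e-6, 1.0 - 1e-6, 100)
lhs = np.arcsin(x)
rhs = np.pi / 2 - np.arcsin(np.sqrt(1 - x * x))
assert np.allclose(lhs, rhs)
assert np.all(np.sqrt(1 - x * x) < 2 ** -0.5)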
def _reduce_any_d_compute(x, axis=None, keepdims=None):
    """reduce_any_d compute implementation"""
    dtype = x.dtype
    data_fp16 = topi.cast(x, "float16")
    data_abs = topi.abs(data_fp16)
    res_tmp = akg.lang.ascend.reduce_max(data_abs, axis=axis, keepdims=keepdims)

    shape_len = len(x.shape)
    if axis[-1] == shape_len - 1 and not keepdims:
        res_shape = [dim.value for dim in res_tmp.shape]
        res_shape.pop()
        res_tmp = tvm.compute(res_shape, lambda *indice: res_tmp(*indice, 0), name="reduce_res")

    res_s8 = topi.cast(res_tmp, dtype)
    return res_s8
def fused_bn_reduce_grad(data0, data1, data2, data3, data4, data5, data6, data7,
                         layout='NHWC', out_dtype='float16', target=utils.CUDA):
    if layout == 'NCHW':
        data3 = topi.transpose(data3, (0, 2, 3, 1))
        data7 = topi.transpose(data7, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError('Layout not supported {} '.format(layout))

    n, h, w, c = data3.shape
    const = n * h * w
    inter_dtype = 'float32'

    out1 = topi.multiply(data4, data5)
    out1 = topi.divide(out1, const)
    out1 = topi.expand_dims(out1, axis=0, num_newaxis=3)
    out1 = topi.broadcast_to(out1, (n, h, w, c))

    data3 = topi.cast(data3, inter_dtype)
    data2 = topi.expand_dims(data2, axis=0, num_newaxis=3)
    data2 = topi.broadcast_to(data2, (n, h, w, c))
    out2 = topi.multiply(data3, const)
    out2 = topi.subtract(out2, data2)

    data1 = topi.expand_dims(data1, axis=0, num_newaxis=3)
    data1 = topi.broadcast_to(data1, (n, h, w, c))
    data7 = topi.cast(data7, inter_dtype)
    out3 = topi.divide(data6, const)
    out3 = topi.subtract(data7, out3)
    out3 = topi.multiply(data1, out3)
    out3 = topi.divide(out3, data0)

    output = topi.subtract(out2, out3)
    output = topi.multiply(output, out1)
    output = topi.cast(output, out_dtype)
    if layout == "NCHW":
        output = topi.transpose(output, (0, 3, 1, 2))
    return output
def sgd_compute(parameters, gradient, learning_rate, accum, momentum, stat,
                dampening=0.0, weight_decay=0.0, nesterov=False):
    """sgd compute implementation"""
    dtype = parameters.dtype
    if dtype == "float16":
        parameters = topi.cast(parameters, "float32")
        accum = topi.cast(accum, "float32")
        learning_rate = topi.cast(learning_rate, "float32")
        gradient = topi.cast(gradient, "float32")
        momentum = topi.cast(momentum, "float32")
        stat = topi.cast(stat, "float32")

    # when weight_decay != 0.0, compute grad_delta to update the gradient
    if weight_decay != 0.0:
        parameters = topi.multiply(parameters, tvm.const(1.0, 'float32'))
        grad_delta = topi.multiply(parameters, weight_decay)
        gradient = topi.add(gradient, grad_delta)

    stat_mid = topi.multiply(stat, tvm.const(-1, "float32"))
    stat_act = topi.add(stat_mid, tvm.const(1, "float32"))

    dampening_t = topi.multiply(stat_act, dampening)

    # update accum
    accum_delta = tvm.compute(accum.shape, lambda *indice: accum(*indice) * momentum[0])
    gradient_damp = topi.multiply(gradient, dampening_t)
    accum_t = topi.add(accum_delta, gradient)
    if dampening != 0.0:
        accum_t = topi.subtract(accum_t, gradient_damp)

    # update parameters
    if nesterov:
        parameters_delta = tvm.compute(gradient.shape,
                                       lambda *indice: gradient(*indice) * learning_rate[0])
        parameters_delta_2 = tvm.compute(accum_t.shape,
                                         lambda *indice: accum_t(*indice) * momentum[0])
        parameters_delta_2 = tvm.compute(parameters_delta_2.shape,
                                         lambda *indice: parameters_delta_2(*indice) * learning_rate[0])
        parameters_delta = topi.add(parameters_delta, parameters_delta_2)
        parameters_t = topi.subtract(parameters, parameters_delta)
    else:
        parameters_delta = tvm.compute(accum_t.shape,
                                       lambda *indice: accum_t(*indice) * learning_rate[0])
        parameters_t = topi.subtract(parameters, parameters_delta)

    # update stat
    stat_t = topi.multiply(stat_act, tvm.const(NUM_ZERO, 'float32'))

    if dtype == "float16":
        parameters_t = topi.cast(parameters_t, "float16")
        accum_t = topi.cast(accum_t, "float16")
        stat_t = topi.cast(stat_t, "float16")

    return parameters_t, accum_t, stat_t
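
# A minimal NumPy reference (not part of AKG) for one step of the update
# above, with stat_act = 1 - stat gating the dampening term:
#   grad'  = grad + weight_decay * param
#   accum' = momentum * accum + grad' - dampening * stat_act * grad'
#   param' = param - lr * (grad' + momentum * accum')   if nesterov
#   param' = param - lr * accum'                        otherwise
import numpy as np

param, accum, stat = np.array([1.0]), np.array([0.0]), np.array([1.0])
grad, lr, momentum, dampening, weight_decay = np.array([0.5]), 0.1, 0.9, 0.1, 1e-4

g = grad + weight_decay * param
stat_act = 1.0 - stat
accum_new = momentum * accum + g - dampening * stat_act * g
param_new = param - lr * accum_new      # non-nesterov branch
stat_new = np.zeros_like(stat)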
def fused_pad(input, pad_before, pad_after, layout='NHWC', pad_value=0.0, target=utils.CUDA):
    """
    fused_pad.

    Args:
        input : tvm.Tensor or Expr
        pad_before : list / tuple of n ints. Pad width on each dimension before the axis begins.
        pad_after : list / tuple of n ints. Pad width on each dimension after the axis ends.
        pad_value : float. The value to be padded.

    Returns:
        tvm.Tensor
    """
    if layout == "NCHW":
        input = topi.transpose(input, axes=(0, 2, 3, 1))
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {} '.format(layout))

    cast_after = topi.cast(input, 'float16')
    output = topi.nn.pad(cast_after, pad_before, pad_after, pad_value)
    return output
def _apply_rms_prop_mixed_precision_compute(var, ms, mom, grad, lr, momentum, rho, epsilon):
    """Compute apply_rms_prop_mixed_precision"""
    out_var, out_ms, out_mom = _apply_rms_prop_compute(var, ms, mom, grad, lr, momentum, rho, epsilon)
    out_var_fp16 = topi.cast(out_var, "float16")
    return out_var, out_var_fp16, out_ms, out_mom
def fused_bn_follow(data0, data1, data2, data3, data4, target=utils.CUDA):
    """
    input:
        data: length is 5
        data0: param0 beta
        data1: param1 gamma
        data2: param2 BNupdate: xi_variance
        data3: param6 BNreduce: xi_mean
        data4: param7 xi_conv2d
        layout: (N, H, W, C)

    output:
        beta + gamma * xi_variance * ( xi - xi_mean/(N*H*W) )
    """
    n, h, w, c = data4.shape
    const = n * h * w
    inter_dtype = 'float32'
    data4 = topi.cast(data4, inter_dtype)

    multiply0 = topi.divide(data3, const)
    multiply0 = topi.expand_dims(multiply0, axis=0, num_newaxis=3)
    multiply0 = topi.broadcast_to(multiply0, (n, h, w, c))

    subtract0 = topi.subtract(data4, multiply0)
    multiply1 = topi.multiply(subtract0, data2)
    multiply2 = topi.multiply(multiply1, data1)
    add0 = topi.add(multiply2, data0)

    return add0
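
# A minimal NumPy reference (not part of AKG) of the affine form computed
# above on an NHWC tensor x, with per-channel beta, gamma, variance term,
# and mean-sum (data3 holds a sum that is divided by N*H*W inside):
#   out = beta + gamma * variance_term * (x - mean_sum / (N*H*W))
import numpy as np

n, h, w, c = 2, 4, 4, 3
x = np.random.rand(n, h, w, c).astype(np.float32)
beta, gamma = np.random.rand(c), np.random.rand(c)
variance_term = np.random.rand(c)
mean_sum = x.sum(axis=(0, 1, 2))

out = beta + gamma * variance_term * (x - mean_sum / (n * h * w))
assert out.shape == (n, h, w, c)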
def fused_bn_follow_relu(data0, data1, data2, data3, data4, layout='NHWC', out_dtype='float16', target=utils.CUDA):
    """
    input:
        data0-4: bn parameters for conv2d tensor, length is 5
        data0: param0 beta
        data1: param1 gamma
        data2: param2 BNupdate: xi_variance
        data3: param6 BNreduce: xi_mean
        data4: param7 xi_conv2d, float16
        layout: only (N, H, W, C), (N, C, H, W) supported
        out_dtype: float16

    output:
        ReLU: max(batch-normalized tensor, 0)
    """
    if layout == 'NCHW':
        data4 = topi.transpose(data4, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError('Layout not supported {} '.format(layout))

    add0 = fused_bn_follow(data0, data1, data2, data3, data4)
    add0 = topi.cast(add0, out_dtype)
    output = topi.maximum(add0, 0)
    if layout == "NCHW":
        output = topi.transpose(output, (0, 3, 1, 2))

    return output
def sinh_compute(x):
    """Compute sinh."""
    dtype = x.dtype
    # compute in float32 for a more precise result
    if dtype == "float16":
        x = topi.cast(x, "float32")

    data_exp = Exp(x, utils.CCE)
    negative_data = topi.multiply(x, -1)
    negative_data_exp = Exp(negative_data, utils.CCE)
    data_exp_sub = topi.subtract(data_exp, negative_data_exp)

    res = topi.multiply(data_exp_sub, tvm.const(0.5, "float32"))
    if dtype == "float16":
        res = topi.cast(res, "float16")

    return res
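
# A minimal NumPy check (not part of AKG) of the formula used above:
# sinh(x) = (exp(x) - exp(-x)) / 2.
import numpy as np

x = np.linspace(-5.0, 5.0, 101)
assert np.allclose(np.sinh(x), (np.exp(x) - np.exp(-x)) * 0.5)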
def reduction_layer(data, axis, op, coeff):
    """
    Reduce data on axis and scale by coeff.

    Args:
        data (tvm.tensor.Tensor): tensor with type float16 or float32, int8, uint8.
        axis (int): the beginning axis to reduce, -1 means the last axis. If 0, reduce to a scalar.
        op (str): one of "SUM", "ASUM" (abs and sum), "SUMSQ" (square and sum), "MEAN".
        coeff ([int, float]): scale
    Returns:
        tvm.tensor.Tensor.
    """
    dtype = data.dtype
    vc_util.ops_dtype_check(data.dtype, [vc_util.DtypeForDavinci.ALL_FLOAT,
                                         vc_util.DtypeForDavinci.INT8,
                                         vc_util.DtypeForDavinci.UINT8])
    vc_util.check_shape(data.shape)

    if op not in ["SUM", "ASUM", "SUMSQ", "MEAN"]:
        raise RuntimeError("op can only be one of SUM, ASUM, SUMSQ, MEAN")

    shape = get_shape(data)
    vc_util.reduce_axis_check(shape, axis)
    axis = _get_axis_list(axis, shape)

    if dtype in ["int8", "uint8"]:
        data = topi.cast(data, "float16")
    data = topi.cast(data, "float32")
    cof = tvm.const(coeff, "float32")

    if op == "ASUM":
        tmp = _asum(data, axis, cof)
    elif op == "SUMSQ":
        tmp = _sumsq(data, axis, cof)
    elif op == "MEAN":
        tmp = _mean(data, axis, cof, shape)
    elif op == "SUM":
        tmp = _sum(data, axis, cof)

    if dtype in ["int8", "uint8"]:
        tmp = topi.cast(tmp, "float16")
    res = topi.cast(tmp, dtype)

    return res
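
# A minimal NumPy illustration (not part of AKG) of the four reduction
# modes above, here reducing a single axis and scaling by `coeff`.
import numpy as np

x = np.arange(6.0).reshape(2, 3) - 2.0   # contains negative values
coeff = 0.5
asum = coeff * np.abs(x).sum(axis=1)     # "ASUM": abs, then sum
sumsq = coeff * (x * x).sum(axis=1)      # "SUMSQ": square, then sum
mean = coeff * x.mean(axis=1)            # "MEAN"
total = coeff * x.sum(axis=1)            # "SUM"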
def _init_atan2_mask(data_y_, data_x_):
    """
    Compute mask for atan2.

    Args:
        data_y_ (tvm.tensor.Tensor): The y of atan2(y, x).
        data_x_ (tvm.tensor.Tensor): The x of atan2(y, x).

    Returns:
        mask (tvm.tensor.Tensor): The mask of x's and y's value.
    """
    # on mini, Select only supports float16
    is_cast_for_mini = utils.product_is_mini() and data_y_.dtype == "float32"
    if is_cast_for_mini:
        data_x = topi.cast(data_x_, "float16")
        data_y = topi.cast(data_y_, "float16")
    else:
        data_x = data_x_
        data_y = data_y_

    dtype_input = data_y.dtype
    tensor_one = dc.one_const(dtype_input)
    tensor_zero = dc.zero_const(dtype_input)
    tensor_neg_one = dc.neg_one_const(dtype_input)

    y_ge_zero = tvm.compute(
        data_y.shape,
        lambda *i: tvm.expr.Select(
            data_y(*i) >= tensor_zero, tensor_one, tensor_neg_one),
        name="y_ge_zero")

    x_lt_zero_y_mask = tvm.compute(
        data_y.shape,
        lambda *i: tvm.expr.Select(
            data_x(*i) < tensor_zero, y_ge_zero(*i), tensor_zero),
        name="xlt0_y_mask")

    if is_cast_for_mini:
        x_lt_zero_y_mask = topi.cast(x_lt_zero_y_mask, "float32")
        y_ge_zero = topi.cast(y_ge_zero, "float32")

    return (x_lt_zero_y_mask, y_ge_zero)
def _apply_adadelta_compute(var, accum, accum_update, grad, lr, rho, epsilon):
    """Compute apply_adadelta"""
    dtype = var.dtype
    if dtype == "float16":
        var = topi.cast(var, "float32")
        accum = topi.cast(accum, "float32")
        accum_update = topi.cast(accum_update, "float32")
        lr = topi.cast(lr, "float32")
        rho = topi.cast(rho, "float32")
        grad = topi.cast(grad, "float32")

    epsilon = tvm.const(epsilon, "float32")
    tensor_one = akg.lang.ascend.broadcast(tvm.const(1, "float32"), var.shape)
    tensor_rho = topi.broadcast_to(rho, var.shape)
    tensor_rho_gs = topi.subtract(tensor_one, tensor_rho)
    tensor_epsilon = akg.lang.ascend.broadcast(epsilon, var.shape)

    # accum = accum * rho + grad ** 2 * (1 - rho)
    rhs = topi.multiply(accum, tensor_rho)
    lhs = topi.multiply(grad, grad)
    lhs = topi.multiply(lhs, tensor_rho_gs)
    accum_res = akg.lang.ascend.vadd(lhs, rhs)

    # update = (accum_update + epsilon).sqrt * (accum + epsilon).rsqrt * grad
    rhs = topi.add(accum_update, tensor_epsilon)
    rhs = sqrt(rhs, target=utils.CCE)
    lhs = topi.add(accum_res, tensor_epsilon)
    lhs = rsqrt(lhs, target=utils.CCE)
    lhs = topi.multiply(grad, lhs)
    update = topi.multiply(lhs, rhs)

    # var -= update * lr
    var_res = topi.broadcast_to(lr, var.shape)
    var_res = topi.multiply(update, var_res)
    var_res = topi.subtract(var, var_res)

    # accum_update = rho * accum_update + (1 - rho) * update.square
    rhs = topi.multiply(accum_update, tensor_rho)
    lhs = topi.multiply(update, update)
    lhs = topi.multiply(lhs, tensor_rho_gs)
    accum_update_res = akg.lang.ascend.vadd(lhs, rhs)

    if dtype == "float16":
        var_res = topi.cast(var_res, "float16")
        accum_res = topi.cast(accum_res, "float16")
        accum_update_res = topi.cast(accum_update_res, "float16")

    return var_res, accum_res, accum_update_res
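
# A minimal NumPy reference (not part of AKG) of the Adadelta step built
# above:
#   accum        <- rho * accum + (1 - rho) * grad^2
#   update       =  sqrt(accum_update + eps) / sqrt(accum + eps) * grad
#   var          <- var - lr * update
#   accum_update <- rho * accum_update + (1 - rho) * update^2
import numpy as np

var, accum, accum_update = np.array([1.0]), np.array([0.1]), np.array([0.1])
grad, lr, rho, eps = np.array([0.2]), 1.0, 0.95, 1e-6

accum_new = rho * accum + (1 - rho) * grad * grad
update = np.sqrt(accum_update + eps) / np.sqrt(accum_new + eps) * grad
var_new = var - lr * update
accum_update_new = rho * accum_update + (1 - rho) * update * update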