def _atan_compute(data):
    """Compute atan."""
    dtype = data.dtype

    if dtype == "float16":
        data = topi.cast(data, "float32")

    abs_data = topi.abs(data)
    tensor_one = dc.one_const(abs_data.dtype)

    abs_data_sub_one = topi.subtract(abs_data, tensor_one)
    abs_data_add_one = topi.add(abs_data, tensor_one)
    abs_data2 = topi.abs(topi.divide(abs_data_sub_one, abs_data_add_one))

    # calculate atan for data whose absolute value is less than one
    res = _do_atan_taylor(abs_data)

    # calculate atan for data whose absolute value is more than one
    res_mt_one = topi.add(_do_atan_taylor(abs_data2),
                          tvm.const(CONST_PI_BY_FOUR, abs_data2.dtype))
    res = topi.minimum(res, res_mt_one)

    if utils.product_is_mini() and data.dtype == "float32":
        sign_mask = topi.cast(topi.sign(topi.cast(data, "float16")), "float32")
    else:
        sign_mask = topi.sign(data)

    res = topi.multiply(res, sign_mask)

    if dtype == "float16":
        res = topi.cast(res, "float16")

    return res

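# A hypothetical sanity check, not part of the kernel: the range reduction in
# _atan_compute relies on atan(x) = pi/4 + atan((x - 1)/(x + 1)) for x > 0,
# which maps any positive x into a region where the Taylor series converges.
def _check_atan_range_reduction():
    import numpy as np
    x = np.linspace(0.1, 50.0, 7)
    reduced = np.arctan((x - 1.0) / (x + 1.0)) + np.pi / 4.0
    assert np.allclose(reduced, np.arctan(x))
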
def _tan_2x_multi(input_x, times):
    """Compute tan(x) from tan(x / 2^times) by applying the double-angle formula `times` times."""
    # Calculate tan(x / 2^times)
    if input_x.dtype == FLOAT_16 and utils.product_is_mini():
        input_x_divide = topi.multiply(input_x, tvm.const(1.0 / (2.0 ** times), FLOAT_16))
        res = _tan_expand(input_x_divide)
    else:
        input_x_divide = topi.multiply(input_x, 1.0 / (2.0 ** times))
        res = _tan_expand(input_x_divide)

    while times != 0:
        # double-angle formula: tan(2x) = 2*tan(x) / (1 - tan(x)*tan(x))
        if input_x.dtype == FLOAT_16 and utils.product_is_mini():
            res_numerator = topi.multiply(res, tvm.const(2.0, FLOAT_16))
            tanx_square = topi.multiply(res, res)
            res_denominator = topi.add(topi.multiply(tanx_square, tvm.const(-1.0, FLOAT_16)),
                                       tvm.const(1.0, FLOAT_16))
        else:
            res_numerator = topi.multiply(res, 2.0)
            tanx_square = topi.multiply(res, res)
            res_denominator = topi.add(topi.multiply(tanx_square, -1.0), 1.0)

        if utils.product_is_mini():
            res = mul(res_numerator, reciprocal(res_denominator))
        else:
            res = div(res_numerator, res_denominator)
        times = times - 1

    return res

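# A minimal scalar sketch (assumed, for intuition only) of the same strategy:
# shrink the argument by 2^times so the Taylor series converges quickly, then
# undo the shrinking with tan(2a) = 2*tan(a) / (1 - tan(a)^2).
def _check_tan_by_doubling(x=1.2, times=4, order=5):
    import math
    factors = [1 / 3, 2 / 15, 17 / 315, 62 / 2835, 1382 / 155925]
    a = x / (2.0 ** times)
    res, term = a, a
    for i in range(order):  # same series as _tan_expand
        term *= a * a
        res += factors[i] * term
    for _ in range(times):  # same recurrence as _tan_2x_multi
        res = 2.0 * res / (1.0 - res * res)
    assert abs(res - math.tan(x)) < 1e-9
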
def sigmoid_cross_entropy_with_logits_grad_compute(predict, target, dout):
    """sigmoid_cross_entropy_with_logits_grad compute implementation"""
    dtype = predict.dtype
    if dtype == "float16":
        predict = topi.cast(predict, "float32")
        target = topi.cast(target, "float32")
        dout = topi.cast(dout, "float32")

    # e^x
    val1 = exp(predict)
    # 1 + e^x
    val2 = topi.add(val1, tvm.const(SCALAR_ONE, dtype="float32"))
    # e^x / (1 + e^x), i.e. sigmoid(x)
    val3 = topi.divide(val1, val2)
    # -target
    val4 = topi.multiply(target, tvm.const(SCALAR_NEGTIVE_ONE, dtype="float32"))
    # e^x / (1 + e^x) - y
    val5 = topi.add(val3, val4)
    result = topi.multiply(val5, dout)

    if dtype == "float16":
        result = topi.cast(result, dtype)
    return result

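# A hypothetical finite-difference check (assuming the standard loss
# L = -[y*log(s) + (1-y)*log(1-s)] with s = sigmoid(x)): its gradient with
# respect to x is exactly sigmoid(x) - y, which is what val5 computes above.
def _check_sigmoid_ce_grad(x=0.7, y=1.0, eps=1e-6):
    import numpy as np

    def loss(v):
        s = 1.0 / (1.0 + np.exp(-v))
        return -(y * np.log(s) + (1.0 - y) * np.log(1.0 - s))

    numeric = (loss(x + eps) - loss(x - eps)) / (2.0 * eps)
    analytic = 1.0 / (1.0 + np.exp(-x)) - y
    assert abs(numeric - analytic) < 1e-8
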
def selu_compute(input_data):
    """selu compute implementation"""
    # if input dtype is float16 or float32, compute in float32; otherwise compute in float16
    dtype = input_data.dtype
    if dtype == "float16" or dtype == "float32":
        input_data = topi.cast(input_data, "float32")
        type_tmp = "float32"
    else:
        input_data = topi.cast(input_data, "float16")
        type_tmp = "float16"

    # generate tensor_zero to be compared against
    tensor_zero = topi.multiply(input_data, tvm.const(0, dtype=type_tmp))
    # split the input into its negative part (x < 0) and positive part (x > 0)
    negative_res = topi.minimum(input_data, tensor_zero)
    positive_res = topi.maximum(input_data, tensor_zero)
    exp_res = exp(negative_res)
    sub_res = topi.add(exp_res, tvm.const(SCALAR_NEGATIVE_ONE, dtype=type_tmp))
    negative_muls_res = topi.multiply(sub_res, tvm.const(SCALE_ALPHA_PRODUCT, dtype=type_tmp))
    if dtype == "int8":
        negative_muls_res = akg.lang.cce.ceil(negative_muls_res)

    positive_muls_res = topi.multiply(positive_res, tvm.const(SCALE, dtype=type_tmp))
    res = topi.add(negative_muls_res, positive_muls_res)

    # cast back to the original dtype
    if dtype == "float16" or dtype == "int8" or dtype == "int32":
        res = topi.cast(res, dtype)

    return res

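# A plain NumPy reference for the same split-and-recombine trick. The
# constants are the standard SELU scale and alpha; the kernel's SCALE and
# SCALE_ALPHA_PRODUCT are assumed to equal scale and scale * alpha.
def _selu_numpy_ref(x):
    import numpy as np
    scale, alpha = 1.0507009873554805, 1.6732632423543772
    neg = np.minimum(x, 0.0)  # negative part, 0 where x >= 0
    pos = np.maximum(x, 0.0)  # positive part, 0 where x <= 0
    return scale * alpha * (np.exp(neg) - 1.0) + scale * pos
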
def matrix_set_diag_compute(input_matrix, input_diagonal, input_help):
    """matrix_set_diag compute implementation"""
    shape_input = get_shape(input_matrix)
    input_dtype = input_matrix.dtype

    if input_dtype == "int8" or input_dtype == "uint8":
        input_matrix = topi.cast(input_matrix, "float16")
        input_diagonal = topi.cast(input_diagonal, "float16")
        input_help = topi.cast(input_help, "float16")
    if input_dtype == "int32" and product_is_mini():
        # on mini, cast int32 through float16 first
        input_matrix = topi.cast(input_matrix, "float16")
        input_diagonal = topi.cast(input_diagonal, "float16")
        input_help = topi.cast(input_help, "float16")
        input_matrix = topi.cast(input_matrix, "float32")
        input_diagonal = topi.cast(input_diagonal, "float32")
        input_help = topi.cast(input_help, "float32")
    if input_dtype == "int32" and not product_is_mini():
        input_matrix = topi.cast(input_matrix, "float32")
        input_diagonal = topi.cast(input_diagonal, "float32")
        input_help = topi.cast(input_help, "float32")

    diag_tmp = topi.broadcast_to(input_diagonal, shape_input)
    # help_y = |input_help - 1| is 1 off the diagonal and 0 on it
    help_tmp = topi.add(input_help, -1)
    help_y = topi.abs(help_tmp)

    res_vmul_x = topi.multiply(input_matrix, help_y)
    res_vmul_y = topi.multiply(diag_tmp, input_help)
    res = topi.add(res_vmul_x, res_vmul_y)

    if input_dtype == "int32" and product_is_mini():
        res = topi.cast(res, "float16")

    res = topi.cast(res, input_dtype)

    return res

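# The blend above is a select implemented with arithmetic: given a 0/1 help
# mask that is 1 on the diagonal, out = matrix*(1 - mask) + diag*mask. A
# hypothetical NumPy illustration:
def _check_diag_mask_blend():
    import numpy as np
    matrix = np.arange(9.0).reshape(3, 3)
    diag_vals = np.array([10.0, 20.0, 30.0])
    mask = np.eye(3)  # plays the role of input_help
    out = matrix * (1.0 - mask) + diag_vals * mask
    assert (np.diag(out) == diag_vals).all()
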
def asinh(x, target=utils.CCE):
    r"""
    Compute asinh function.

    .. math:: asinh(x) = log(x+\sqrt{x^2+1})

    Args:
        x (tvm.tensor.Tensor): Tensor of type float16, float32.

    Returns:
        tvm.tensor.Tensor, has the same type and shape as x.

    Supported Platforms:
        'Ascend'
    """
    # check shape
    utils.check_shape(x)

    # check input tensor data_type
    utils.ops_dtype_check(x.dtype, utils.DtypeForDavinci.ALL_FLOAT)

    dtype = x.dtype

    # Known that asinh(x) = log(x + sqrt(x*x+1)) and asinh(-x) = -asinh(x).
    # If x is a large negative number, (x + sqrt(x*x+1)) will be close to zero,
    # so compute asinh(x) = sign(x) * log(|x| + sqrt(|x|*|x| + 1)) instead.
    compute_dtype = dtype
    if dtype == "float16":
        # cast x to float32 to avoid overflow and improve accuracy
        compute_dtype = "float32"
        x = topi.cast(x, compute_dtype)

    x_abs = topi.abs(x)

    if product_is_mini():
        # sqrt(|x|*|x| + 1) = |x| * sqrt(1 + 1/(|x|*|x|))
        vsquare_add_one = topi.add(1, topi.divide(1, topi.multiply(x_abs, x_abs)))
        sqrt_compute_value = sqrt_mini_newton_iter_impl(vsquare_add_one)
        sqrt_value = topi.multiply(x_abs, sqrt_compute_value)
    else:
        x_abs_square_add_one = topi.add(topi.multiply(x_abs, x_abs), 1)
        sqrt_value = topi.sqrt(x_abs_square_add_one)

    x_add_sqrt = topi.add(x_abs, sqrt_value)

    if product_is_mini():
        log_value = log_compute_mini_impl(x_add_sqrt, target)
    else:
        log_value = topi.log(x_add_sqrt)

    res = topi.multiply(Sign(x, target), log_value)

    if res.dtype != dtype:
        res = topi.cast(res, dtype)

    if product_is_mini():
        attrs = {"enable_auto_inline": False}
        return res, attrs
    return res

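# Quick check of the odd-symmetry rewrite used above: computing
# sign(x) * log(|x| + sqrt(x*x + 1)) avoids the cancellation in
# log(x + sqrt(x*x + 1)) when x is a large negative number.
def _check_asinh_identity():
    import numpy as np
    x = np.array([-1e4, -2.5, -1e-3, 0.5, 300.0])
    safe = np.sign(x) * np.log(np.abs(x) + np.sqrt(x * x + 1.0))
    assert np.allclose(safe, np.arcsinh(x))
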
def _update_m(m, beta, grad):
    """Update m_out = m * beta + grad * (1 - beta)"""
    m_beta = topi.multiply(m, beta)
    beta_neg = topi.multiply(beta, tvm.const(-1, beta.dtype))
    beta_1 = topi.add(beta_neg, tvm.const(1, beta_neg.dtype))
    grad_beta_gs = topi.multiply(grad, beta_1)
    m_out = topi.add(m_beta, grad_beta_gs)
    return m_out

def _update_var(decay_gm, alpha, lr, grad, var):
    """Update var_out = var - lr * (alpha + decay_gm) * grad"""
    decay_gm_alpha = topi.add(decay_gm, alpha)
    res = topi.multiply(decay_gm_alpha, lr)
    res = topi.multiply(res, grad)
    res_neg = topi.multiply(res, tvm.const(-1, res.dtype))
    var_out = topi.add(var, res_neg)
    return var_out

def fake_quant_with_min_max_args(input_data, min_=-6, max_=6, num_bits=8,
                                 narrow_range=False):
    """
    Computes the fake-quantized 'output_data' tensor from the float32 'input_data' tensor, of the same type:

    output_data = floor(clamped_shifted * inv_nudged_scale + 0.5) * scale + nudged_min
    scale = (max - min) / (quant_max - quant_min)

    Args:
        input_data (tvm.tensor.Tensor): Tensor of dtype "float32".
        min_ ([float, int]): scalar, defaults to -6.
        max_ ([float, int]): scalar, defaults to 6. [min_; max_] define the clamping range for the input_data.
        num_bits ([float, int]): Defaults to 8. num_bits is the bitwidth of the quantization, between 2 and 16.
        narrow_range ([bool]): True, quantized into the quantization range [1; 2^num_bits - 1];
                               False, quantized into the quantization range [0; 2^num_bits - 1].

    Returns:
        tvm.tensor.Tensor
    """
    shape = get_shape(input_data)
    utils.check_shape(shape)

    dtype = input_data.dtype
    utils.ops_dtype_check(dtype, utils.DtypeForDavinci.FLOAT32)

    nudged_min, nudged_max, scale = nudge_min_max(min_, max_, num_bits, narrow_range)

    zero_tensor = tvm.compute(input_data.shape,
                              lambda *i: tvm.const(0, dtype="float32"),
                              name="zero_tensor")
    nudged_max_tensor = topi.add(zero_tensor, nudged_max)
    nudged_min_tensor = topi.add(zero_tensor, nudged_min)
    inv_nudged_scale = 1.00 / scale

    # Clamp the input between nudged_min and nudged_max
    clamped_vmin = topi.minimum(input_data, nudged_max_tensor)
    clamped = topi.maximum(clamped_vmin, nudged_min_tensor)

    # Calculate the quantized and dequantized results
    clamped_shifted = topi.subtract(clamped, nudged_min_tensor)
    vmul_shifted = topi.multiply(clamped_shifted, inv_nudged_scale)
    vadds_shifted = topi.add(vmul_shifted, 0.5)
    floor_vadds_shifted = floor(vadds_shifted)
    floor_cast = akg.lang.ascend.cast_to(floor_vadds_shifted, dtype)
    res_scale = topi.multiply(floor_cast, scale)
    res = topi.add(res_scale, nudged_min_tensor)

    return res

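# A NumPy reference of the whole fake-quant pipeline (assumption: nudge_min_max
# follows the TensorFlow FakeQuantWithMinMaxArgs nudging convention).
def _fake_quant_numpy_ref(x, min_=-6.0, max_=6.0, num_bits=8, narrow_range=False):
    import math
    import numpy as np
    quant_min = 1.0 if narrow_range else 0.0
    quant_max = 2.0 ** num_bits - 1.0
    scale = (max_ - min_) / (quant_max - quant_min)
    # nudge the range so that zero is exactly representable
    zero_point = min(max(quant_min - min_ / scale, quant_min), quant_max)
    zero_point = math.floor(zero_point + 0.5)
    nudged_min = (quant_min - zero_point) * scale
    nudged_max = (quant_max - zero_point) * scale
    clamped = np.clip(x, nudged_min, nudged_max)
    # same round-half-up quantize/dequantize as the kernel above
    return np.floor((clamped - nudged_min) / scale + 0.5) * scale + nudged_min
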
def _compute_log(data_input): """Atanh(x) = 0.5*log((1+x)/(1-x))""" data_1_sum_x = topi.add(data_input, dc.one_const(data_input.dtype)) data_sub_x = topi.multiply(data_input, dc.neg_one_const(data_input.dtype)) data_1_sub_x = topi.add(data_sub_x, dc.one_const(data_input.dtype)) data_x_mul = data_1_sum_x / data_1_sub_x data_x_log = log.log(data_x_mul) data_res = topi.multiply(data_x_log, dc.half_const(data_input.dtype)) return data_res
def _compute_log(data_input, target=utils.CCE):
    """atanh(x) value is 0.5*log((1+x)/(1-x))"""
    data_1_sum_x = topi.add(data_input, dc.one_const(data_input.dtype))
    data_sub_x = topi.multiply(data_input, dc.neg_one_const(data_input.dtype))
    data_1_sub_x = topi.add(data_sub_x, dc.one_const(data_input.dtype))
    data_x_mul = data_1_sum_x / data_1_sub_x
    data_x_log = log(data_x_mul, target)
    data_res = topi.multiply(data_x_log, dc.half_const(data_input.dtype))
    return data_res

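# Both helpers above use the closed form atanh(x) = 0.5 * log((1 + x)/(1 - x))
# on (-1, 1); a quick NumPy check:
def _check_atanh_log_form():
    import numpy as np
    x = np.array([-0.9, -0.3, 0.0, 0.42, 0.99])
    assert np.allclose(0.5 * np.log((1.0 + x) / (1.0 - x)), np.arctanh(x))
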
def sgd_compute(parameters, gradient, learning_rate, accum, momentum, stat,
                dampening=0.0, weight_decay=0.0, nesterov=False):
    """sgd compute implementation"""
    dtype = parameters.dtype
    if dtype == "float16":
        parameters = topi.cast(parameters, "float32")
        accum = topi.cast(accum, "float32")
        learning_rate = topi.cast(learning_rate, "float32")
        gradient = topi.cast(gradient, "float32")
        momentum = topi.cast(momentum, "float32")
        stat = topi.cast(stat, "float32")

    # if weight_decay != 0.0, compute grad_delta to update the gradient
    if weight_decay != 0.0:
        parameters = topi.multiply(parameters, tvm.const(1.0, 'float32'))
        grad_delta = topi.multiply(parameters, weight_decay)
        gradient = topi.add(gradient, grad_delta)

    # stat_act = 1 - stat; stat flags the first update step, where dampening is skipped
    stat_mid = topi.multiply(stat, tvm.const(-1, "float32"))
    stat_act = topi.add(stat_mid, tvm.const(1, "float32"))

    dampening_t = topi.multiply(stat_act, dampening)

    # update accum
    accum_delta = tvm.compute(accum.shape, lambda *indice: accum(*indice) * momentum[0])

    gradient_damp = topi.multiply(gradient, dampening_t)
    accum_t = topi.add(accum_delta, gradient)
    if dampening != 0.0:
        accum_t = topi.subtract(accum_t, gradient_damp)

    # update parameters
    if nesterov:
        parameters_delta = tvm.compute(gradient.shape,
                                       lambda *indice: gradient(*indice) * learning_rate[0])
        parameters_delta_2 = tvm.compute(accum_t.shape,
                                         lambda *indice: accum_t(*indice) * momentum[0])
        parameters_delta_2 = tvm.compute(parameters_delta_2.shape,
                                         lambda *indice: parameters_delta_2(*indice) * learning_rate[0])
        parameters_delta = topi.add(parameters_delta, parameters_delta_2)
        parameters_t = topi.subtract(parameters, parameters_delta)
    else:
        parameters_delta = tvm.compute(accum_t.shape,
                                       lambda *indice: accum_t(*indice) * learning_rate[0])
        parameters_t = topi.subtract(parameters, parameters_delta)

    # update stat
    stat_t = topi.multiply(stat_act, tvm.const(NUM_ZERO, 'float32'))

    if dtype == "float16":
        parameters_t = topi.cast(parameters_t, "float16")
        accum_t = topi.cast(accum_t, "float16")
        stat_t = topi.cast(stat_t, "float16")

    return parameters_t, accum_t, stat_t

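# A scalar sketch of the same update rule (assumed to mirror PyTorch-style SGD
# with momentum; the stat tensor above is read as marking the first step, where
# dampening is not applied):
def _sgd_scalar_ref(param, grad, accum, lr, momentum, dampening=0.0,
                    weight_decay=0.0, nesterov=False, first_step=False):
    if weight_decay != 0.0:
        grad = grad + weight_decay * param
    damp = 0.0 if first_step else dampening  # role of stat/stat_act above
    accum = momentum * accum + (1.0 - damp) * grad
    if nesterov:
        param = param - lr * (grad + momentum * accum)
    else:
        param = param - lr * accum
    return param, accum
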
def _asin_compute(data_input):
    """Compute asin"""
    dtype = data_input.dtype
    boundary = tvm.const(BOUNDARY, "float32")

    # Change dtype to float32
    if dtype == "float16":
        data_input = topi.cast(data_input, "float32")

    # Sign mask
    data_sign = sign(data_input)

    # All positive
    data1 = topi.multiply(data_input, data_sign)

    # x belongs to (0, 2^(-0.5))
    choice_1 = topi.minimum(data1, boundary)
    choice_1 = topi.subtract(choice_1, boundary)
    choice_1_floor = akg.lang.cce.floor(choice_1)
    # the dtype of choice_1_floor is int32, need to be cast to fp32.
    if utils.product_is_mini():
        choice_1_floor = topi.cast(choice_1_floor, "float16")
        choice_1_floor = topi.cast(choice_1_floor, "float32")
    else:
        choice_1_floor = topi.cast(choice_1_floor, "float32")
    choice_1 = topi.multiply(choice_1_floor, neg_one_const("float32"))

    taylor1 = _taylor_compute(data1)
    res_1 = topi.multiply(taylor1, choice_1)

    # x belongs to (2^(-0.5), 1)
    choice_2 = topi.subtract(one_const("float32"), choice_1)
    data2 = topi.subtract(one_const("float32"), topi.multiply(data1, data1))
    data2_sqrt = _sqrt(data2)

    taylor2 = _taylor_compute(data2_sqrt, data2)
    res_2 = topi.multiply(taylor2, neg_one_const("float32"))
    res_2 = topi.add(res_2, tvm.const(HALF_PI, "float32"))
    res_2 = topi.multiply(res_2, choice_2)

    # Restore sign
    res_1 = topi.add(res_1, res_2)
    res_1 = topi.multiply(res_1, data_sign)

    # Restore dtype
    if dtype == "float16":
        res_1 = topi.cast(res_1, "float16")

    return res_1

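# The second branch relies on asin(x) = pi/2 - asin(sqrt(1 - x^2)) for
# x in (2^-0.5, 1), which maps the argument back into the region where the
# Taylor expansion stays accurate. A quick check:
def _check_asin_branch_identity():
    import numpy as np
    x = np.linspace(0.72, 0.99, 5)
    assert np.allclose(np.pi / 2 - np.arcsin(np.sqrt(1.0 - x * x)), np.arcsin(x))
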
def _log_taylor(data):
    """log algorithm: log(x) = log(1+u) = ((((0.2u - 0.25)u + 0.33333)u - 0.5)u + 1)u with u = x - 1"""
    data = topi.subtract(data, 1)
    # Horner coefficients of u - u^2/2 + u^3/3 - u^4/4 + u^5/5
    taylor_params = [0.2, -0.25, 1 / 3, -0.5, 1]
    taylor_five = topi.multiply(data, taylor_params[0])
    taylor_four_1 = topi.add(taylor_five, taylor_params[1])
    taylor_four_2 = topi.multiply(taylor_four_1, data)
    taylor_three_1 = topi.add(taylor_four_2, taylor_params[2])
    taylor_three_2 = topi.multiply(taylor_three_1, data)
    taylor_two_1 = topi.add(taylor_three_2, taylor_params[3])
    taylor_two_2 = topi.multiply(taylor_two_1, data)
    taylor_one = topi.add(taylor_two_2, taylor_params[4])
    taylor = topi.multiply(taylor_one, data)
    return taylor

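# A scalar Horner evaluation of the same five-term series; it is only accurate
# near x = 1, which is why callers reduce the argument first.
def _check_log_taylor(x=1.1):
    import math
    d = x - 1.0
    approx = ((((0.2 * d - 0.25) * d + 1.0 / 3) * d - 0.5) * d + 1.0) * d
    assert abs(approx - math.log(x)) < 2e-7
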
def fake_quant_with_min_max_args_gradient(input_gradients, input_data, min=-6, max=6, num_bits=8,
                                          narrow_range=False):
    """
    Computes gradients of the fake-quantize operation on the 'input_data' tensor:

    output_backprops = input_gradients * (1 if nudged_min <= input_data <= nudged_max else 0)

    Args:
        input_gradients (tvm.tensor.Tensor): gradients from the downstream operation.
        input_data (tvm.tensor.Tensor): input of fake-quantize, only supports "float32".
        min ([float, int]): scalar, defaults to -6.
        max ([float, int]): scalar, defaults to 6. [min; max] define the clamping range for the input_data.
        num_bits ([float, int]): Defaults to 8. num_bits is the bitwidth of the quantization, between 2 and 16.
        narrow_range ([bool]): True, quantized into the quantization range [1; 2^num_bits - 1];
                               False, quantized into the quantization range [0; 2^num_bits - 1].

    Returns:
        tvm.tensor.Tensor
    """
    shape = get_shape(input_data)
    utils.check_shape(shape)
    utils.elemwise_shape_check(input_gradients.shape, input_data.shape)
    utils.ops_dtype_check(input_data.dtype, utils.DtypeForDavinci.FLOAT32)
    utils.ops_dtype_check(input_gradients.dtype, utils.DtypeForDavinci.FLOAT32)

    nudged_min, nudged_max, scale = nudge_min_max(min, max, num_bits, narrow_range)

    zero_tensor = tvm.compute(input_data.shape,
                              lambda *i: tvm.const(0, dtype="float32"),
                              name="zero_tensor")
    nudged_max_tensor = topi.add(zero_tensor, nudged_max)
    nudged_min_tensor = topi.add(zero_tensor, nudged_min)

    # where((input_data <= nudged_max) & (input_data >= nudged_min), 1, 0):
    # convert the input to a 0/1 mask tensor
    between_nudged_min_max = _cmpare_value(input_data, nudged_min_tensor, nudged_max_tensor)

    res = topi.multiply(input_gradients, between_nudged_min_max)

    return res

def _tan_expand(input_x):
    """calculate tan x = x + x^3/3 + 2*x^5/15 + 17*x^7/315 + 62*x^9/2835 + 1382*x^11/155925 ... (|x| < pi/2)"""
    # Taylor expansion coefficients
    factors = [1/3, 2/15, 17/315, 62/2835, 1382/155925]

    input_x_power = topi.multiply(input_x, input_x)

    iter_value = input_x
    res = input_x
    for i in range(TAN_EXPANSION_ORDER):
        if input_x.dtype == FLOAT_16 and utils.product_is_mini():
            iter_value = topi.multiply(input_x_power, iter_value)
            res = topi.add(res, topi.multiply(iter_value, tvm.const(factors[i], FLOAT_16)))
        else:
            iter_value = topi.multiply(input_x_power, iter_value)
            res = topi.add(res, topi.multiply(iter_value, factors[i]))
    return res

def _apply_adadelta_compute(var, accum, accum_update, grad, lr, rho, epsilon):
    """Compute apply_adadelta"""
    dtype = var.dtype
    if dtype == "float16":
        var = topi.cast(var, "float32")
        accum = topi.cast(accum, "float32")
        accum_update = topi.cast(accum_update, "float32")
        lr = topi.cast(lr, "float32")
        rho = topi.cast(rho, "float32")
        grad = topi.cast(grad, "float32")

    epsilon = tvm.const(epsilon, "float32")
    tensor_one = akg.lang.ascend.broadcast(tvm.const(1, "float32"), var.shape)
    tensor_rho = topi.broadcast_to(rho, var.shape)
    tensor_rho_gs = topi.subtract(tensor_one, tensor_rho)
    tensor_epsilon = akg.lang.ascend.broadcast(epsilon, var.shape)

    # accum = accum * rho + grad ** 2 * (1 - rho)
    rhs = topi.multiply(accum, tensor_rho)
    lhs = topi.multiply(grad, grad)
    lhs = topi.multiply(lhs, tensor_rho_gs)
    accum_res = akg.lang.ascend.vadd(lhs, rhs)

    # update = (accum_update + epsilon).sqrt * (accum + epsilon).rsqrt * grad
    rhs = topi.add(accum_update, tensor_epsilon)
    rhs = sqrt(rhs, target=utils.CCE)
    lhs = topi.add(accum_res, tensor_epsilon)
    lhs = rsqrt(lhs, target=utils.CCE)
    lhs = topi.multiply(grad, lhs)
    update = topi.multiply(lhs, rhs)

    # var -= update * lr
    var_res = topi.broadcast_to(lr, var.shape)
    var_res = topi.multiply(update, var_res)
    var_res = topi.subtract(var, var_res)

    # accum_update = rho * accum_update + (1 - rho) * update.square
    rhs = topi.multiply(accum_update, tensor_rho)
    lhs = topi.multiply(update, update)
    lhs = topi.multiply(lhs, tensor_rho_gs)
    accum_update_res = akg.lang.ascend.vadd(lhs, rhs)

    if dtype == "float16":
        var_res = topi.cast(var_res, "float16")
        accum_res = topi.cast(accum_res, "float16")
        accum_update_res = topi.cast(accum_update_res, "float16")

    return var_res, accum_res, accum_update_res

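# A scalar NumPy sketch of the Adadelta recurrences implemented above (a
# reference under the usual definitions, not the kernel itself):
def _adadelta_scalar_ref(var, accum, accum_update, grad, lr, rho, epsilon):
    import numpy as np
    accum = rho * accum + (1.0 - rho) * grad * grad
    update = np.sqrt(accum_update + epsilon) / np.sqrt(accum + epsilon) * grad
    var = var - lr * update
    accum_update = rho * accum_update + (1.0 - rho) * update * update
    return var, accum, accum_update
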
def _bessel_i0e_compute(input_data):
    """bessel i0e compute"""
    shape_input = input_data.shape
    dtype_input = input_data.dtype

    # cast to float32 at the beginning if needed
    if dtype_input == "float16":
        input_data = Cast(input_data, "float32", target=utils.CCE)

    abs_data = Abs(input_data, target=utils.CCE)

    # compute bessel_i0e for data in (-3.75, 3.75)
    # t = |x| / 3.75
    # I0e = e^-|x|(1 + 3.5156229t^2 + 3.0899424t^4 + 1.2067492t^6 + 0.2659732t^8
    #       + 0.0360768t^10 + 0.0045813t^12), |x| <= 3.75
    broad_const_limit = akg.lang.ascend.broadcast(
        akg.tvm.const(CONST_LIMIT, "float32"), shape_input)
    before_abs_data = minimum(abs_data, broad_const_limit)
    data = topi.multiply(before_abs_data, 1.0 / CONST_LIMIT)
    square_data = mul(data, data, target=utils.CCE)

    before_res = topi.multiply(square_data, ITR_BEFORE[LEN_BEFORE - 1])
    before_res = topi.add(before_res, ITR_BEFORE[LEN_BEFORE - 2])
    for iter_number in ITR_BEFORE[LEN_BEFORE - 3::-1]:
        before_res = mul(before_res, square_data, target=utils.CCE)
        before_res = topi.add(before_res, iter_number)

    exp_data = Exp(neg(before_abs_data, target=utils.CCE), target=utils.CCE)
    before_res = mul(before_res, exp_data, target=utils.CCE)

    # compute bessel_i0e for data in the other domain
    # t = |x| / 3.75
    # I0e(x) = (1 / sqrt(|x|))*(0.39894228 + 0.01328592t^-1 + 0.00225319t^-2 - 0.00157565t^-3
    #          + 0.00916281t^-4 - 0.02057706t^-5 + 0.02635537t^-6 - 0.01647633t^-7
    #          + 0.00392377t^-8), |x| >= 3.75
    data = Divide(broad_const_limit, abs_data, target=utils.CCE)
    after_res = topi.multiply(data, ITR_AFTER[LEN_AFTER - 1])
    after_res = topi.add(after_res, ITR_AFTER[LEN_AFTER - 2])
    for iter_number in ITR_AFTER[LEN_AFTER - 3::-1]:
        after_res = mul(after_res, data, target=utils.CCE)
        after_res = topi.add(after_res, iter_number)

    rsqrt_data = rsqrt(abs_data, target=utils.CCE)
    after_res = mul(after_res, rsqrt_data, target=utils.CCE)
    after_res = minimum(before_res, after_res, target=utils.CCE)

    # cast back to float16 at the end if needed
    if dtype_input == "float16":
        after_res = Cast(after_res, "float16", target=utils.CCE)

    return after_res

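# The two branches follow the classic Abramowitz & Stegun polynomial fits for
# I0 (9.8.1 for |x| <= 3.75, 9.8.2 beyond). A NumPy transcription for
# reference, assuming ITR_BEFORE/ITR_AFTER hold those coefficients; it reuses
# the kernel's trick that the mismatched branch overshoots, so taking the
# minimum selects the right one.
def _i0e_numpy_ref(x):
    import numpy as np
    itr_before = [1.0, 3.5156229, 3.0899424, 1.2067492,
                  0.2659732, 0.0360768, 0.0045813]
    itr_after = [0.39894228, 0.01328592, 0.00225319, -0.00157565, 0.00916281,
                 -0.02057706, 0.02635537, -0.01647633, 0.00392377]
    ax = np.abs(x)
    t = np.minimum(ax, 3.75) / 3.75
    before = np.polyval(itr_before[::-1], t * t) * np.exp(-np.minimum(ax, 3.75))
    u = 3.75 / ax  # valid for x != 0
    after = np.polyval(itr_after[::-1], u) / np.sqrt(ax)
    return np.minimum(before, after)
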
def bool_both_zero_compute(juduged_min, juduged_max):
    """If input min and max are both zero, output_data will be all zero, so a judgment tensor is needed."""
    dtype = juduged_min.dtype
    tensor_zero = topi.full(juduged_min.shape, dtype, dc.zero_const(dtype))
    min_abs = topi.abs(juduged_min)
    max_abs = topi.abs(juduged_max)
    min_max_replace = topi.add(min_abs, max_abs)
    # just check whether min and max are both zero; if so, the result is 0
    bool_min_max_product_less_zero = less_compare_float32(min_max_replace, tensor_zero)
    bool_min_max_product_more_zero = less_compare_float32(tensor_zero, min_max_replace)
    bool_both_zero = topi.add(bool_min_max_product_less_zero, bool_min_max_product_more_zero)

    return bool_both_zero

def fused_l2loss_grad(data_f16, data_f32, layout='NHWC', fill_data=4e-05, target=utils.CUDA):
    """
    fused_l2loss_grad.

    Args:
        data_f16 (tvm.tensor.Tensor): float16 input tensor, NHWC or NCHW layout.
        data_f32 (tvm.tensor.Tensor): float32 input tensor.
        layout (str): 'NHWC' (default) or 'NCHW'.
        fill_data (float): scalar coefficient multiplied with data_f32.

    Returns:
        tvm.tensor.Tensor.
    """
    if layout == "NCHW":
        data_f16 = topi.transpose(data_f16, axes=(0, 2, 3, 1))
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {} '.format(layout))

    data_f16 = topi.cast(data_f16, 'float32')
    constant_tmp = topi.cast(fill_data, 'float32')
    data_constant = topi.full_like(data_f32, constant_tmp)
    data_out = topi.multiply(data_constant, data_f32)
    data_out = topi.add(data_f16, data_out)

    return data_out

def fused_bn_update(input1, input2, input3, input4, dtype, c1, c2, c3, c4):
    """
    fused operator.

    Args:
        input1 ~ input4: tvm.tensor.Tensor.
        dtype: dtype of Tensor.
        c1 ~ c4: const.

    Returns:
        Three outputs (list of tvm.tensor.Tensor).
    """
    const1 = tvm.const(c1, dtype)
    mul0 = topi.multiply(input2, const1)
    mul1 = topi.multiply(input1, const1)
    mul2 = topi.multiply(mul1, mul1)
    sigma2 = topi.subtract(mul0, mul2)
    const2 = tvm.const(c2, dtype)
    rsqrt_val = topi.rsqrt(topi.add(sigma2, const2))
    const3 = tvm.const(c3, dtype)
    mul3 = topi.multiply(sigma2, const3)
    sub1 = topi.subtract(input3, mul3)
    const4 = tvm.const(c4, dtype)
    data1 = topi.multiply(const4, sub1)
    sub2 = topi.subtract(input4, mul1)
    data2 = topi.multiply(const4, sub2)

    return (rsqrt_val, data1, data2)

def batch_matmul_4D(data1, data2, bias=None, out_dtype="float32", layout1="NHDT",
                    layout2="NHDT", layout_out="NHDT"):
    """4D batch matmul: map the layout letters onto batch (B, b), output (m, n) and reduction (k) axes, then contract over k."""
    layout1_dict = {}
    layout2_dict = {}
    layout1_str = layout1.replace('N', 'B').replace('H', 'b').replace('D', 'm').replace('T', 'k')
    layout2_str = layout2.replace('N', 'B').replace('H', 'b').replace('D', 'n').replace('T', 'k')
    layout1_list = list(layout1_str)
    layout2_list = list(layout2_str)

    for i in range(len(layout1)):
        layout1_dict[layout1_list[i]] = data1.shape[i]
        layout2_dict[layout2_list[i]] = data2.shape[i]

    reduce_axis = tvm.reduce_axis((0, layout1_dict['k']), name='reduce_axis')

    if out_dtype == "float32":
        res = tvm.compute((layout1_dict['B'], layout1_dict['b'], layout1_dict['m'], layout2_dict['n']),
                          lambda B, b, i, j: tvm.sum(
                              data1[B, b, i if layout1_list[2] == 'm' else reduce_axis,
                                    reduce_axis if layout1_list[3] == 'k' else i].astype("float") *
                              data2[B, b, j if layout2_list[2] == 'n' else reduce_axis,
                                    reduce_axis if layout2_list[3] == 'k' else j].astype("float"),
                              axis=reduce_axis))
    else:
        res = tvm.compute((layout1_dict['B'], layout1_dict['b'], layout1_dict['m'], layout2_dict['n']),
                          lambda B, b, i, j: tvm.sum(
                              data1[B, b, i if layout1_list[2] == 'm' else reduce_axis,
                                    reduce_axis if layout1_list[3] == 'k' else i] *
                              data2[B, b, j if layout2_list[2] == 'n' else reduce_axis,
                                    reduce_axis if layout2_list[3] == 'k' else j],
                              axis=reduce_axis))

    if bias is not None:
        res = topi.add(res, bias)

    if layout_out != "NHDT":
        res = auto_out_transpose(res, layout_out)

    return res

def _asin_grad_compute(x, dy):
    """Compute asin_grad."""
    dtype = x.dtype
    if dtype == "float16":
        x = topi.cast(x, "float32")
        dy = topi.cast(dy, "float32")

    # step 1: calculate num_to_vrsqrt = 1 - x^2
    data = topi.multiply(x, x)
    data = topi.multiply(data, tvm.const(-1, "float32"))
    num_to_vrsqrt = topi.add(data, tvm.const(1, "float32"))

    # step 2: calculate dy * (1 / sqrt(1 - x^2))
    if utils.product_is_mini():
        # mini: use Newton's method for a high-accuracy result
        res = _vrsqrt_newton(num_to_vrsqrt)
        res = topi.multiply(res, dy)
    else:
        # cloud: use vdiv for high-efficiency computation
        vsqrt_res = topi.sqrt(num_to_vrsqrt)
        res = topi.divide(dy, vsqrt_res)

    if dtype == "float16":
        res = topi.cast(res, "float16")

    return res

def fused_relu_grad_bn_double_update_grad(data_1, data_2, data_3, data_4, data_5, data_6, data_7,
                                          layout='NHWC'):
    if layout == "NCHW":
        # transpose the NCHW inputs to NHWC
        data_2, data_4, data_5, data_6, data_7 = [
            topi.transpose(t, axes=(0, 2, 3, 1))
            for t in (data_2, data_4, data_5, data_6, data_7)]
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {} '.format(layout))

    data_tmp1 = topi.full_like(data_7, 0.0)
    data_tmp2 = topi.greater(data_7, data_tmp1)
    data_tmp3 = topi.add(data_5, data_6)
    data_tmp4 = topi.where(data_tmp2, data_tmp3, data_tmp1)
    data_tmp5 = topi.cast(data_tmp4, 'float32')
    data_tmp7 = topi.sum(data_tmp5, axis=(0, 1, 2))

    n, h, w, c = data_7.shape
    data_tmp8 = topi.cast(data_2, 'float32')
    data_tmp9 = topi.full_like(data_tmp7, 1.0 / (n * h * w))
    data_tmp10 = topi.multiply(data_1, data_tmp9)
    data_tmp11 = topi.broadcast_to(data_tmp10, data_tmp8.shape)
    data_tmp12 = topi.subtract(data_tmp8, data_tmp11)
    data_tmp13 = topi.multiply(data_tmp5, data_tmp12)
    data_tmp15 = topi.sum(data_tmp13, axis=(0, 1, 2))

    data_tmp16 = topi.cast(data_4, 'float32')
    data_tmp17 = topi.multiply(data_3, data_tmp9)
    data_tmp18 = topi.broadcast_to(data_tmp17, data_tmp16.shape)
    data_tmp19 = topi.subtract(data_tmp16, data_tmp18)
    data_tmp20 = topi.multiply(data_tmp5, data_tmp19)
    data_tmp22 = topi.sum(data_tmp20, axis=(0, 1, 2))

    return [data_tmp7, data_tmp15, data_tmp22]

def fake_quant_with_min_max_vars_per_channel_gradient_compute(input_gradients, inputs_data,
                                                              min_broadcast, max_broadcast,
                                                              num_bits=8, narrow_range=False):
    """Compute gradients for a FakeQuantWithMinMaxVarsPerChannel operation."""
    shape = get_shape(inputs_data)
    sum_axis = [x for x in range(0, len(shape) - 1)]
    dtype = inputs_data.dtype

    nudged_min, nudged_max, _ = nudged_min_max_compute(min_broadcast, max_broadcast, num_bits,
                                                       narrow_range)
    # both zero yields zero
    bool_both_zero_value = bool_both_zero_compute(min_broadcast, max_broadcast)
    bool_both_zero_negate = _bool_negate(bool_both_zero_value)

    bool_less_equal_nudged_max = _less_equal_compare_float32(inputs_data, nudged_max)
    bool_more_equal_nudged_min = _less_equal_compare_float32(nudged_min, inputs_data)
    bool_between_nudged_min_max = topi.multiply(bool_less_equal_nudged_max,
                                                bool_more_equal_nudged_min)
    # gradient is 1 if input in [min, max] else 0
    backprops_input_tmp = topi.multiply(bool_between_nudged_min_max, input_gradients)
    backprops_bool_both_zero = topi.multiply(backprops_input_tmp, bool_both_zero_value)
    # if min and max are both zero, the gradient is input_gradients
    input_gradients_both_zero = topi.multiply(input_gradients, bool_both_zero_negate)
    backprops_input = topi.add(backprops_bool_both_zero, input_gradients_both_zero)

    # gradient for min is input_gradients if inputs_data < nudged_min else 0
    bool_less_nudged_min = _bool_negate(bool_more_equal_nudged_min)
    output_backprop_min_tmp = topi.multiply(bool_less_nudged_min, input_gradients)
    # gradient for min is 0 if min and max are both 0
    output_backprop_min_bool = topi.multiply(output_backprop_min_tmp, bool_both_zero_value)
    if sum_axis == []:
        output_backprop_min = output_backprop_min_bool
    else:
        output_backprop_min = topi.sum(output_backprop_min_bool, sum_axis)

    # gradient for max is input_gradients if inputs_data > nudged_max else 0
    bool_more_nudged_max = _bool_negate(bool_less_equal_nudged_max)
    output_backprop_max_tmp = topi.multiply(bool_more_nudged_max, input_gradients)
    # gradient for max is 0 if min and max are both 0
    output_backprop_max_bool = topi.multiply(output_backprop_max_tmp, bool_both_zero_value)
    if sum_axis == []:
        output_backprop_max = output_backprop_max_bool
    else:
        output_backprop_max = topi.sum(output_backprop_max_bool, sum_axis)

    return backprops_input, output_backprop_min, output_backprop_max

def fused_bn_follow(data0, data1, data2, data3, data4, target=utils.CUDA):
    """
    input:
        data: length is 5
        data0: param0 beta
        data1: param1 gamma
        data2: param2 BNupdate: xi_variance
        data3: param6 BNreduce: xi_mean
        data4: param7 xi_conv2d
        layout: (N, H, W, C)

    output:
        beta + gamma * xi_variance * (xi - xi_mean/(N*H*W))
    """
    n, h, w, c = data4.shape
    const = n * h * w
    inter_dtype = 'float32'
    data4 = topi.cast(data4, inter_dtype)

    multiply0 = topi.divide(data3, const)
    multiply0 = topi.expand_dims(multiply0, axis=0, num_newaxis=3)
    multiply0 = topi.broadcast_to(multiply0, (n, h, w, c))

    subtract0 = topi.subtract(data4, multiply0)
    multiply1 = topi.multiply(subtract0, data2)
    multiply2 = topi.multiply(multiply1, data1)
    add0 = topi.add(multiply2, data0)

    return add0

def fused_bn_follow_relu_avgpool(data0, data1, data2, data3, data4, data5, layout='NHWC',
                                 out_dtype='float16', target=utils.CUDA):
    """
    input:
        data: length is 6
        data0: tensor1 after bn_double_relu
        data1-data5: bn parameters for conv2d tensor2
        layout: only (N, H, W, C), (N, C, H, W) supported
        out_dtype: float16

    output:
        avg-pooling(max(batch-normalized tensor1 + batch-normalized tensor2, 0))
    """
    if layout == 'NCHW':
        data0 = topi.transpose(data0, (0, 2, 3, 1))
        data5 = topi.transpose(data5, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError('Layout not supported {} '.format(layout))

    n, h, w, c = data0.shape
    inter_dtype = 'float32'
    add0 = fused_bn_follow(data1, data2, data3, data4, data5)
    add0 = topi.cast(add0, data0.dtype)
    add1 = topi.add(data0, add0)
    output = topi.maximum(add1, 0)

    output = topi.cast(output, inter_dtype)
    output = topi.sum(output, axis=(1, 2))
    output = topi.divide(output, h * w)
    output = topi.cast(output, out_dtype)

    return output

def _sinh_2x(sinh_x):
    """sinh(2x) = 2*sinh(x)*sqrt(sinh(x)^2 + 1)"""
    sinh_x_square = topi.multiply(sinh_x, sinh_x)
    sinh_x_square_add_one = topi.add(sinh_x_square, 1)
    sqrt_value = topi.sqrt(sinh_x_square_add_one)
    sinh_x_mul_sqrt_value = topi.multiply(sinh_x, sqrt_value)
    sinh_2x = topi.multiply(2, sinh_x_mul_sqrt_value)
    return sinh_2x

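# Identity check: sinh(2x) = 2*sinh(x)*cosh(x) with cosh(x) = sqrt(sinh(x)^2 + 1),
# which is exactly the form used above.
def _check_sinh_double_angle():
    import numpy as np
    x = np.linspace(-3.0, 3.0, 7)
    s = np.sinh(x)
    assert np.allclose(2.0 * s * np.sqrt(s * s + 1.0), np.sinh(2.0 * x))
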
def _compute_taylor(data_input):
    """Algorithm: atanh(x) = x + x^3/3 + x^5/5 + x^7/7"""
    taylor_para = [0, 1.0, 0, 1 / 3.0, 0, 1.0 / 5, 0, 1.0 / 7]
    # x^2
    data_mul_2 = topi.multiply(data_input, data_input)
    # 1/5 + x^2/7
    data_mul_2_7 = topi.multiply(data_mul_2, tvm.const(taylor_para[7], "float32"))
    result = topi.add(data_mul_2_7, tvm.const(taylor_para[5], "float32"))
    # 1/3 + x^2*(1/5 + x^2/7)
    result = topi.multiply(data_mul_2, result)
    result = topi.add(result, tvm.const(taylor_para[3], "float32"))
    # 1 + x^2*(1/3 + x^2*(1/5 + x^2/7))
    result = topi.multiply(data_mul_2, result)
    result = topi.add(result, tvm.const(taylor_para[1], "float32"))
    # x*(1 + x^2*(1/3 + x^2*(1/5 + x^2/7)))
    return topi.multiply(data_input, result)

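# A scalar Horner check of the odd series atanh(x) ~ x + x^3/3 + x^5/5 + x^7/7,
# accurate for small |x| (the truncation error is about x^9/9):
def _check_atanh_taylor(x=0.3):
    import math
    x2 = x * x
    approx = x * (1.0 + x2 * (1.0 / 3 + x2 * (1.0 / 5 + x2 / 7)))
    assert abs(approx - math.atanh(x)) < 3e-6
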
def _newton_iter(data, init_x):
    """Do element-wise Newton compute."""
    # Newton iteration: x(n+1) = x(n)*(3 - a*x(n)^2)/2
    init_square = topi.multiply(init_x, init_x)
    newton_res = topi.multiply(init_square, data)
    newton_res = topi.multiply(newton_res, neg_one_const("float32"))
    newton_res = topi.add(newton_res, tvm.const(3, "float32"))
    newton_res = topi.multiply(newton_res, init_x)
    newton_res = topi.multiply(newton_res, tvm.const(0.5, "float32"))
    return newton_res

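# The recurrence above is Newton's method for 1/sqrt(a): starting from a rough
# seed, x(n+1) = x(n)*(3 - a*x(n)^2)/2 converges quadratically. A minimal
# scalar demonstration:
def _check_rsqrt_newton(a=2.0, x=0.5, iters=6):
    import math
    for _ in range(iters):
        x = x * (3.0 - a * x * x) / 2.0
    assert abs(x - 1.0 / math.sqrt(a)) < 1e-12
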