def tan_compute(input_x):
    """Build the tan(x) computation for ``input_x``.

    The input is cast to a floating-point working dtype, reduced with
    ``x - round(x / pi) * pi``, passed to ``_tan_2x_multi`` and finally cast
    back to the dtype of the original input.
    """
    dtype = input_x.dtype

    # Working dtype selection:
    #   float16 / float32 on any product, int32 when not on mini -> float32
    #   int32 on mini                                            -> float16
    #   any other dtype                                          -> no cast and no range reduction
    if dtype == FLOAT_16 or dtype == FLOAT_32 or (dtype == INT_32 and not product_is_mini()):
        work_dtype = FLOAT_32
    elif dtype == INT_32 and product_is_mini():
        work_dtype = FLOAT_16
    else:
        work_dtype = None

    if work_dtype is not None:
        input_x = topi.cast(input_x, work_dtype)
        # adjust x to [-pi/2, pi/2] using x = x - round(x / pi) * pi
        inv_pi = tvm.const(1.0 / PI, work_dtype)
        periods = akg.lang.ascend.round(topi.multiply(input_x, inv_pi))
        periods = akg.lang.ascend.cast_to(periods, work_dtype)
        input_x = topi.subtract(input_x, topi.multiply(periods, tvm.const(PI, work_dtype)))

    tan_val = _tan_2x_multi(input_x, TAN_2X_TIMES)
    # cast the result back to the original dtype
    return topi.cast(tan_val, dtype)
def fused_bn_update(input1, input2, input3, input4, dtype, c1, c2, c3, c4):
    """
    fused operator.

    With sigma2 = c1 * input2 - (c1 * input1) ** 2 the three outputs are
        rsqrt(sigma2 + c2),
        c4 * (input3 - c3 * sigma2),
        c4 * (input4 - c1 * input1).

    Args:
        input1 ~ input4: tvm.tensor.Tensor.
        dtype: dtype used to build the constants c1 ~ c4.
        c1 ~ c4: const.

    Returns:
        Tuple of three tvm.tensor.Tensor.
    """
    k1 = tvm.const(c1, dtype)
    k2 = tvm.const(c2, dtype)
    k3 = tvm.const(c3, dtype)
    k4 = tvm.const(c4, dtype)

    scaled_in2 = topi.multiply(input2, k1)
    scaled_in1 = topi.multiply(input1, k1)
    sigma2 = topi.subtract(scaled_in2, topi.multiply(scaled_in1, scaled_in1))

    rsqrt_val = topi.rsqrt(topi.add(sigma2, k2))
    data1 = topi.multiply(k4, topi.subtract(input3, topi.multiply(sigma2, k3)))
    data2 = topi.multiply(k4, topi.subtract(input4, scaled_in1))
    return (rsqrt_val, data1, data2)
def fused_relu_grad_bn_double_update_grad(data_1, data_2, data_3, data_4, data_5, data_6, data_7, layout='NHWC'):
    """
    fused_relu_grad_bn_double_update_grad.

    Args:
        data_1 ~ data_7: tvm.tensor.Tensor. data_2, data_4, data_5, data_6 and
            data_7 are the tensors given in `layout`; data_1 and data_3 are
            scaled by 1 / (N * H * W) and broadcast against data_2 / data_4.
        layout: input layout, only 'NCHW' and 'NHWC' are supported.

    Returns:
        List of three tvm.tensor.Tensor, each summed over axes (0, 1, 2) of
        the NHWC data.

    Raises:
        NotImplementedError: if `layout` is neither 'NCHW' nor 'NHWC'.
    """
    if layout not in ("NCHW", "NHWC"):
        raise NotImplementedError(
            'Layout not supported {} '.format(layout))
    if layout == "NCHW":
        # Rebinding a loop variable does not touch the tensors themselves, so
        # the transposed results must be assigned back to the real names.
        data_2, data_4, data_5, data_6, data_7 = [
            topi.transpose(tensor, axes=(0, 2, 3, 1))
            for tensor in (data_2, data_4, data_5, data_6, data_7)
        ]

    # ReLU gradient: (data_5 + data_6) where data_7 > 0, else 0.
    data_tmp1 = topi.full_like(data_7, 0.0)
    data_tmp2 = topi.greater(data_7, data_tmp1)
    data_tmp3 = topi.add(data_5, data_6)
    data_tmp4 = topi.where(data_tmp2, data_tmp3, data_tmp1)
    data_tmp5 = topi.cast(data_tmp4, 'float32')
    data_tmp7 = topi.sum(data_tmp5, axis=(0, 1, 2))

    n, h, w, _ = data_7.shape
    data_tmp9 = topi.full_like(data_tmp7, 1.0/(n*h*w))

    # sum(relu_grad * (data_2 - data_1 / (N*H*W)))
    data_tmp8 = topi.cast(data_2, 'float32')
    data_tmp10 = topi.multiply(data_1, data_tmp9)
    data_tmp11 = topi.broadcast_to(data_tmp10, data_tmp8.shape)
    data_tmp12 = topi.subtract(data_tmp8, data_tmp11)
    data_tmp13 = topi.multiply(data_tmp5, data_tmp12)
    data_tmp15 = topi.sum(data_tmp13, axis=(0, 1, 2))

    # sum(relu_grad * (data_4 - data_3 / (N*H*W)))
    data_tmp16 = topi.cast(data_4, 'float32')
    data_tmp17 = topi.multiply(data_3, data_tmp9)
    data_tmp18 = topi.broadcast_to(data_tmp17, data_tmp16.shape)
    data_tmp19 = topi.subtract(data_tmp16, data_tmp18)
    data_tmp20 = topi.multiply(data_tmp5, data_tmp19)
    data_tmp22 = topi.sum(data_tmp20, axis=(0, 1, 2))

    return [data_tmp7, data_tmp15, data_tmp22]
def fused_relu_grad_bn_double_reduce_grad(data0, data1, data2, data3, data4, data5, data6, data7, data8, data9,
                                          data10, data11, data12, data13, data14, data15, layout="NHWC",
                                          out_dtype="float16", target=utils.CUDA):
    """
    fused_relu_grad_bn_double_reduce_grad.

    Args:
        data0 ~ data15: tvm.tensor.Tensor. data5, data9, data13, data14 and
            data15 are the tensors given in `layout`.
        layout: input layout, only 'NCHW' and 'NHWC' are supported.
        out_dtype: dtype both outputs are cast to.
        target: accepted but not read by this function.

    Returns:
        List of two tvm.tensor.Tensor; they are transposed back to NCHW when
        `layout` is 'NCHW'.

    Raises:
        NotImplementedError: if `layout` is neither 'NCHW' nor 'NHWC'.
    """
    if layout == 'NCHW':
        nchw_to_nhwc = (0, 2, 3, 1)
        data5 = topi.transpose(data5, nchw_to_nhwc)
        data9 = topi.transpose(data9, nchw_to_nhwc)
        data13 = topi.transpose(data13, nchw_to_nhwc)
        data14 = topi.transpose(data14, nchw_to_nhwc)
        data15 = topi.transpose(data15, nchw_to_nhwc)
    elif layout != 'NHWC':
        raise NotImplementedError(
            'Layout not supported {} '.format(layout))

    inter_dtype = "float32"
    n, h, w, _ = data5.shape
    scale = n * h * w

    # Coefficient of the first output: data2 * data3 / scale
    coef_a = topi.divide(topi.multiply(data2, data3), scale)

    # ReluGrad: pass (data13 + data14) through where data15 >= 0, else 0.
    zero = tvm.const(0, data15.dtype)
    grad_sum = topi.add(data13, data14)
    relu_grad = tvm.compute(
        grad_sum.shape,
        lambda *i: tvm.if_then_else(data15(*i) >= zero, grad_sum(*i), zero),
        tag=tag.INJECTIVE)
    relu_grad = topi.cast(relu_grad, inter_dtype)

    # Term shared by both outputs: scale * relu_grad - data6
    shared = topi.subtract(topi.multiply(scale, relu_grad), data6)

    # First output: coef_a * (shared - (data5 - data4 / scale) * data1 / data0)
    centered_a = topi.subtract(topi.cast(data5, inter_dtype), topi.divide(data4, scale))
    corr_a = topi.divide(topi.multiply(centered_a, data1), data0)
    out_a = topi.cast(topi.multiply(coef_a, topi.subtract(shared, corr_a)), out_dtype)

    # Second output: (data11 * data12 / scale) * (shared - data10 * (data9 - data8 / scale) / data7)
    coef_b = topi.divide(topi.multiply(data11, data12), scale)
    centered_b = topi.subtract(topi.cast(data9, inter_dtype), topi.divide(data8, scale))
    corr_b = topi.divide(topi.multiply(data10, centered_b), data7)
    out_b = topi.cast(topi.multiply(coef_b, topi.subtract(shared, corr_b)), out_dtype)

    if layout == "NCHW":
        nhwc_to_nchw = (0, 3, 1, 2)
        out_a = topi.transpose(out_a, nhwc_to_nchw)
        out_b = topi.transpose(out_b, nhwc_to_nchw)

    return [out_a, out_b]
def fused_relu_grad_bn_reduce_grad(data_1, data_2, data_3, data_4, data_5, data_6, data_7, data_8, data_9,
                                   layout='NHWC', target=utils.CUDA):
    """
    fused_relu_grad_bn_reduce_grad.

    Args:
        data_1~data_9: tvm.tensor.Tensor. data_7, data_8 and data_9 are the
            tensors given in `layout`.
        layout: input layout, only 'NCHW', 'NHWC' supported
        target: accepted but not read by this function.

    Returns:
        tvm.tensor.Tensor of dtype float16, in the same layout as the inputs.

    Raises:
        NotImplementedError: if `layout` is neither 'NCHW' nor 'NHWC'.
    """
    if layout not in ("NCHW", "NHWC"):
        raise NotImplementedError(
            'Layout not supported {} '.format(layout))
    if layout == "NCHW":
        # Rebinding a loop variable does not touch the tensors themselves, so
        # the transposed results must be assigned back to the real names.
        data_7, data_8, data_9 = [
            topi.transpose(tensor, axes=(0, 2, 3, 1))
            for tensor in (data_7, data_8, data_9)
        ]

    # data_4 * data_5 / (N*H*W)
    data_tmp1 = topi.multiply(data_4, data_5)
    n, h, w, _ = data_9.shape
    data_tmp2 = topi.full_like(data_tmp1, 1.0 / (n * h * w))
    data_tmp3 = topi.multiply(data_tmp1, data_tmp2)

    # ReLU gradient: data_8 where data_9 > 0, else 0.
    data_tmp5 = topi.full_like(data_9, 0.0)
    data_tmp6 = topi.greater(data_9, data_tmp5)
    data_tmp7 = topi.where(data_tmp6, data_8, data_tmp5)
    data_tmp8 = topi.cast(data_tmp7, 'float32')

    # (N*H*W) * relu_grad - data_3
    data_tmp9 = topi.full_like(data_tmp8, n * h * w)
    data_tmp10 = topi.multiply(data_tmp8, data_tmp9)
    data_tmp12 = topi.subtract(data_tmp10, data_3)

    # data_2 * (data_7 - data_6 / (N*H*W)) / data_1
    data_tmp14 = topi.cast(data_7, 'float32')
    data_tmp15 = topi.multiply(data_6, data_tmp2)
    data_tmp17 = topi.subtract(data_tmp14, data_tmp15)
    data_tmp18 = topi.multiply(data_2, data_tmp17)
    data_tmp20 = topi.divide(data_tmp18, data_1)

    data_tmp21 = topi.subtract(data_tmp12, data_tmp20)
    data_tmp22 = topi.multiply(data_tmp3, data_tmp21)
    data_out = topi.cast(data_tmp22, 'float16')

    if layout == "NCHW":
        # Return the result in the caller's layout, as the sibling
        # *_reduce_grad operators in this file do.
        data_out = topi.transpose(data_out, (0, 3, 1, 2))

    return data_out
def sgd_compute(parameters, gradient, learning_rate, accum, momentum, stat,
                dampening=0.0, weight_decay=0.0, nesterov=False):
    """Build the SGD update.

    float16 inputs are computed in float32 and the three results are cast
    back to float16 at the end.

    Returns:
        Tuple (parameters_t, accum_t, stat_t) of updated tensors.
    """
    def _times_first(tensor, scalar):
        """Multiply every element of ``tensor`` by ``scalar[0]``."""
        return tvm.compute(tensor.shape, lambda *indice: tensor(*indice) * scalar[0])

    dtype = parameters.dtype
    is_fp16 = dtype == "float16"
    if is_fp16:
        parameters = topi.cast(parameters, "float32")
        accum = topi.cast(accum, "float32")
        learning_rate = topi.cast(learning_rate, "float32")
        gradient = topi.cast(gradient, "float32")
        momentum = topi.cast(momentum, "float32")
        stat = topi.cast(stat, "float32")

    # With weight decay the gradient becomes gradient + weight_decay * parameters.
    if weight_decay != 0.0:
        parameters = topi.multiply(parameters, tvm.const(1.0, 'float32'))
        decay_term = topi.multiply(parameters, weight_decay)
        gradient = topi.add(gradient, decay_term)

    # stat_act = 1 - stat
    stat_act = topi.add(topi.multiply(stat, tvm.const(-1, "float32")), tvm.const(1, "float32"))
    dampening_t = topi.multiply(stat_act, dampening)

    # update accum: accum * momentum + gradient [- gradient * dampening_t]
    accum_scaled = _times_first(accum, momentum)
    # Built unconditionally, exactly as before, even when dampening == 0.0.
    gradient_damp = topi.multiply(gradient, dampening_t)
    accum_t = topi.add(accum_scaled, gradient)
    if dampening != 0.0:
        accum_t = topi.subtract(accum_t, gradient_damp)

    # update parameters
    if nesterov:
        # step = gradient * lr + accum_t * momentum * lr
        grad_step = _times_first(gradient, learning_rate)
        accum_step = _times_first(_times_first(accum_t, momentum), learning_rate)
        step = topi.add(grad_step, accum_step)
    else:
        # step = accum_t * lr
        step = _times_first(accum_t, learning_rate)
    parameters_t = topi.subtract(parameters, step)

    # update stat
    stat_t = topi.multiply(stat_act, tvm.const(NUM_ZERO, 'float32'))

    if is_fp16:
        parameters_t = topi.cast(parameters_t, "float16")
        accum_t = topi.cast(accum_t, "float16")
        stat_t = topi.cast(stat_t, "float16")

    return parameters_t, accum_t, stat_t
def _apply_ada_max_compute(var, m, v, grad, lr, beta1, beta1_power, beta2, epsilon):
    """Compute ada_max.

    Update rules built here:
        m   += (grad - m) * (1 - beta1)
        v    = max(beta2 * v, abs(grad))
        var -= (lr * m) / ((1 - beta1_power) * (v + epsilon))

    Returns:
        Tuple (var, m, v); cast back to float16 when `var` was float16.
    """
    inp_dtype = var.dtype
    needs_upcast = inp_dtype == 'float16'

    # cast to float32 for improved accuracy
    if needs_upcast:
        var = topi.cast(var, 'float32')
        m = topi.cast(m, 'float32')
        v = topi.cast(v, 'float32')
        lr = topi.cast(lr, 'float32')
        beta1_power = topi.cast(beta1_power, 'float32')
        beta1 = topi.cast(beta1, 'float32')
        beta2 = topi.cast(beta2, 'float32')
        grad = topi.cast(grad, 'float32')
    epsilon = tvm.const(epsilon, 'float32')

    # m += (grad - m) * (1 - beta1)
    neg_beta1 = tvm.compute(beta1.shape, lambda *i: beta1(*i) * neg_one_const("float32"))
    one_minus_beta1 = tvm.compute(neg_beta1.shape, lambda *i: neg_beta1(*i) + one_const("float32"))
    grad_minus_m = topi.subtract(grad, m)
    m_delta = tvm.compute(grad_minus_m.shape, lambda *i: grad_minus_m(*i) * one_minus_beta1[0])
    m = topi.add(m, m_delta)

    # v = max(beta2 * v, abs(grad))
    v_decayed = tvm.compute(v.shape, lambda *i: v(*i) * beta2[0])
    v = topi.maximum(v_decayed, topi.abs(grad))

    # var -= (lr * m) / ((1 - beta1_power) * (v + epsilon))
    # v + epsilon
    v_eps = tvm.compute(v.shape, lambda *i: v(*i) + epsilon)
    # 1 - beta1_power
    neg_power = tvm.compute(beta1_power.shape, lambda *i: beta1_power(*i) * neg_one_const("float32"))
    one_minus_power = tvm.compute(neg_power.shape, lambda *i: neg_power(*i) + one_const("float32"))
    # (1 - beta1_power) * (v + epsilon)
    denominator = tvm.compute(v_eps.shape, lambda *i: v_eps(*i) * one_minus_power[0])
    # lr * m
    numerator = tvm.compute(m.shape, lambda *i: m(*i) * lr[0])
    # the division is expressed as a multiplication by the reciprocal
    step = topi.multiply(numerator, reciprocal(denominator))
    var = topi.subtract(var, step)

    if needs_upcast:
        var = topi.cast(var, inp_dtype)
        m = topi.cast(m, inp_dtype)
        v = topi.cast(v, inp_dtype)

    return var, m, v
def _asin_compute(data_input):
    """Compute asin.

    The computation runs on |x| in float32 and blends two branches with the
    complementary weights ``choice_1`` and ``choice_2 = 1 - choice_1``:
        branch 1: _taylor_compute(|x|)
        branch 2: HALF_PI - _taylor_compute(sqrt(1 - x^2), 1 - x^2)
    The sign of the input is re-applied at the end.
    """
    dtype = data_input.dtype
    boundary = tvm.const(BOUNDARY, "float32")

    # Change dtype to float32
    if dtype == "float16":
        data_input = topi.cast(data_input, "float32")

    # Sign mask, and |x| = x * sign(x)
    data_sign = sign(data_input)
    abs_x = topi.multiply(data_input, data_sign)

    # choice_1 = -floor(min(|x|, BOUNDARY) - BOUNDARY)
    # NOTE(review): the original comments describe this as selecting
    # x in (0, 2^(-0.5)) - presumably BOUNDARY is 2^(-0.5); confirm.
    below_boundary = topi.subtract(topi.minimum(abs_x, boundary), boundary)
    floor_val = akg.lang.cce.floor(below_boundary)
    # the dtype of the floor result is int32, need to be cast to fp32
    # (through float16 first on mini).
    if utils.product_is_mini():
        floor_val = topi.cast(floor_val, "float16")
    floor_val = topi.cast(floor_val, "float32")
    choice_1 = topi.multiply(floor_val, neg_one_const("float32"))

    branch_1 = topi.multiply(_taylor_compute(abs_x), choice_1)

    # Complementary branch (original comment: x belongs to (2^(-0.5), 1))
    choice_2 = topi.subtract(one_const("float32"), choice_1)
    one_minus_sq = topi.subtract(one_const("float32"), topi.multiply(abs_x, abs_x))
    sqrt_val = _sqrt(one_minus_sq)
    branch_2 = topi.multiply(_taylor_compute(sqrt_val, one_minus_sq), neg_one_const("float32"))
    branch_2 = topi.add(branch_2, tvm.const(HALF_PI, "float32"))
    branch_2 = topi.multiply(branch_2, choice_2)

    # Combine the branches and restore the sign
    result = topi.multiply(topi.add(branch_1, branch_2), data_sign)

    # Restore dtype
    if dtype == "float16":
        result = topi.cast(result, "float16")

    return result
def fused_bn_reduce_grad(data0, data1, data2, data3, data4, data5, data6, data7,
                         layout='NHWC', out_dtype='float16', target=utils.CUDA):
    """
    fused_bn_reduce_grad.

    With S = N * H * W the output is
        (data3 * S - data2 - data1 * (data7 - data6 / S) / data0) * (data4 * data5 / S)

    Args:
        data0 ~ data7: tvm.tensor.Tensor. data3 and data7 are the tensors
            given in `layout`.
        layout: input layout, only 'NCHW' and 'NHWC' are supported.
        out_dtype: dtype the output is cast to.
        target: accepted but not read by this function.

    Returns:
        tvm.tensor.Tensor, transposed back to NCHW when `layout` is 'NCHW'.

    Raises:
        NotImplementedError: if `layout` is neither 'NCHW' nor 'NHWC'.
    """
    if layout == 'NCHW':
        data3 = topi.transpose(data3, (0, 2, 3, 1))
        data7 = topi.transpose(data7, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError('Layout not supported {} '.format(layout))

    n, h, w, c = data3.shape
    const = n * h * w
    inter_dtype = 'float32'
    full_shape = (n, h, w, c)

    def _expand(tensor):
        """Prepend three axes to ``tensor`` and broadcast it to (n, h, w, c)."""
        return topi.broadcast_to(topi.expand_dims(tensor, axis=0, num_newaxis=3), full_shape)

    # data4 * data5 / S
    coef = _expand(topi.divide(topi.multiply(data4, data5), const))

    # data3 * S - data2
    data3 = topi.cast(data3, inter_dtype)
    data2 = _expand(data2)
    lhs = topi.subtract(topi.multiply(data3, const), data2)

    # data1 * (data7 - data6 / S) / data0
    data1 = _expand(data1)
    data7 = topi.cast(data7, inter_dtype)
    rhs = topi.subtract(data7, topi.divide(data6, const))
    rhs = topi.divide(topi.multiply(data1, rhs), data0)

    output = topi.multiply(topi.subtract(lhs, rhs), coef)
    output = topi.cast(output, out_dtype)

    if layout == "NCHW":
        output = topi.transpose(output, (0, 3, 1, 2))

    return output
def _apply_adadelta_compute(var, accum, accum_update, grad, lr, rho, epsilon):
    """Compute apply_adadelta.

    Update rules built here:
        accum        = accum * rho + grad ** 2 * (1 - rho)
        update       = sqrt(accum_update + epsilon) * rsqrt(accum + epsilon) * grad
        var         -= update * lr
        accum_update = rho * accum_update + (1 - rho) * update ** 2

    Returns:
        Tuple (var, accum, accum_update); cast back to float16 when `var` was
        float16.
    """
    dtype = var.dtype
    is_fp16 = dtype == "float16"
    if is_fp16:
        var = topi.cast(var, "float32")
        accum = topi.cast(accum, "float32")
        accum_update = topi.cast(accum_update, "float32")
        lr = topi.cast(lr, "float32")
        rho = topi.cast(rho, "float32")
        grad = topi.cast(grad, "float32")

    epsilon = tvm.const(epsilon, "float32")
    ones = akg.lang.ascend.broadcast(tvm.const(1, "float32"), var.shape)
    rho_t = topi.broadcast_to(rho, var.shape)
    one_minus_rho = topi.subtract(ones, rho_t)
    eps_t = akg.lang.ascend.broadcast(epsilon, var.shape)

    # accum = accum * rho + grad ** 2 * (1 - rho)
    accum_decayed = topi.multiply(accum, rho_t)
    grad_sq = topi.multiply(topi.multiply(grad, grad), one_minus_rho)
    accum_res = akg.lang.ascend.vadd(grad_sq, accum_decayed)

    # update = (accum_update + epsilon).sqrt * (accum + epsilon).rsqrt * grad
    update_rms = sqrt(topi.add(accum_update, eps_t), target=utils.CCE)
    accum_inv_rms = rsqrt(topi.add(accum_res, eps_t), target=utils.CCE)
    update = topi.multiply(topi.multiply(grad, accum_inv_rms), update_rms)

    # var -= update * lr
    lr_t = topi.broadcast_to(lr, var.shape)
    var_res = topi.subtract(var, topi.multiply(update, lr_t))

    # accum_update = rho * accum_update + (1 - rho) * update.square
    update_decayed = topi.multiply(accum_update, rho_t)
    update_sq = topi.multiply(topi.multiply(update, update), one_minus_rho)
    accum_update_res = akg.lang.ascend.vadd(update_sq, update_decayed)

    if is_fp16:
        var_res = topi.cast(var_res, "float16")
        accum_res = topi.cast(accum_res, "float16")
        accum_update_res = topi.cast(accum_update_res, "float16")

    return var_res, accum_res, accum_update_res
def _atan_compute(data):
    """Compute atan.

    Two candidates are built from |x| and the smaller one is kept:
        _do_atan_taylor(|x|)
        _do_atan_taylor(|(|x| - 1) / (|x| + 1)|) + CONST_PI_BY_FOUR
    The sign of the input is then re-applied. float16 input is computed in
    float32 and cast back.
    """
    dtype = data.dtype
    if dtype == "float16":
        data = topi.cast(data, "float32")

    abs_data = topi.abs(data)
    one = dc.one_const(abs_data.dtype)
    # |(|x| - 1) / (|x| + 1)|
    shifted = topi.abs(topi.divide(topi.subtract(abs_data, one), topi.add(abs_data, one)))

    # candidate for data less than one
    small_res = _do_atan_taylor(abs_data)
    # candidate for data more than one
    large_res = topi.add(_do_atan_taylor(shifted), tvm.const(CONST_PI_BY_FOUR, shifted.dtype))
    res = topi.minimum(small_res, large_res)

    # On mini the sign of float32 data is taken through float16.
    if utils.product_is_mini() and data.dtype == "float32":
        sign_mask = topi.cast(topi.sign(topi.cast(data, "float16")), "float32")
    else:
        sign_mask = topi.sign(data)
    res = topi.multiply(res, sign_mask)

    if dtype == "float16":
        res = topi.cast(res, "float16")
    return res
def fused_bn_follow(data0, data1, data2, data3, data4, target=utils.CUDA):
    """
    input:
        data0: param0 beta
        data1: param1 gamma
        data2: param2 BNupdate: xi_variance
        data3: param6 BNreduce: xi_mean
        data4: param7 xi_conv2d; its shape is unpacked as (N, H, W, C)
        target: accepted but not read by this function.

    output:
        beta + gamma * xi_variance * ( xi - xi_mean/(N*H*W) ), computed in float32
    """
    n, h, w, c = data4.shape
    const = n * h * w
    inter_dtype = 'float32'
    xi = topi.cast(data4, inter_dtype)

    # xi_mean / (N*H*W), expanded and broadcast to the shape of xi
    mean = topi.divide(data3, const)
    mean = topi.broadcast_to(topi.expand_dims(mean, axis=0, num_newaxis=3), (n, h, w, c))

    centered = topi.subtract(xi, mean)
    scaled = topi.multiply(topi.multiply(centered, data2), data1)
    return topi.add(scaled, data0)
def _apply_gradient_descent_compute(var, alpha, delta):
    """Compute gradient_descent: var - delta * alpha[0]."""
    # step 1: scale every element of delta by the first element of alpha
    scaled_delta = tvm.compute(delta.shape, lambda *idx: delta(*idx) * alpha[0])
    # step 2: subtract the scaled delta from var
    return topi.subtract(var, scaled_delta)
def fake_quant_with_min_max_args(input_data, min_=-6, max_=6, num_bits=8,
                                 narrow_range=False):
    """
    Fake-quantize the 'input_data' tensor, type float32, to a tensor of the same type.

    With (nudged_min, nudged_max, scale) returned by nudge_min_max:
        clamped = max(min(input_data, nudged_max), nudged_min)
        output  = floor((clamped - nudged_min) * (1 / scale) + 0.5) * scale + nudged_min

    Args:
        input_data (tvm.tensor.Tensor): Tensor of dtype "float32"
        min_ ([float, int]): scalar, defaults to -6
        max_ ([float, int]): scalar, defaults to 6. [min_; max_] define the
                             clamping range for input_data
        num_bits ([float, int]): Defaults to 8. Bitwidth of the quantization;
                                 forwarded to nudge_min_max
        narrow_range ([bool]): Defaults to False; forwarded to nudge_min_max

    Returns:
        tvm.tensor.Tensor
    """
    shape = get_shape(input_data)
    utils.check_shape(shape)

    dtype = input_data.dtype
    utils.ops_dtype_check(dtype, utils.DtypeForDavinci.FLOAT32)

    nudged_min, nudged_max, scale = nudge_min_max(min_, max_, num_bits, narrow_range)

    # Materialise the scalar bounds as tensors shaped like the input.
    zeros = tvm.compute(input_data.shape,
                        lambda *i: tvm.const(0, dtype="float32"),
                        name="zero_tensor")
    upper = topi.add(zeros, nudged_max)
    lower = topi.add(zeros, nudged_min)
    inv_nudged_scale = 1.00 / scale

    # Clamp the input into [nudged_min, nudged_max]
    clamped = topi.maximum(topi.minimum(input_data, upper), lower)

    # Quantize: floor((clamped - nudged_min) / scale + 0.5)
    shifted = topi.subtract(clamped, lower)
    scaled = topi.multiply(shifted, inv_nudged_scale)
    rounded = floor(topi.add(scaled, 0.5))
    rounded = akg.lang.ascend.cast_to(rounded, dtype)

    # Dequantize: rounded * scale + nudged_min
    return topi.add(topi.multiply(rounded, scale), lower)
def my_dsl(dtype, kernel_name, attrs):
    """Build a single-instruction elementwise kernel over 1-D inputs of length M.

    The instruction is chosen by the names ``insn`` and ``insnType``, which
    this function reads from the enclosing scope (they are not parameters).

    Args:
        dtype: dtype of the placeholders A and B.
        kernel_name: name passed to akg.build.
        attrs: attrs passed to akg.build.

    Returns:
        The module returned by akg.build. When ``insnType`` is "binary" the
        kernel arguments are [A, B, C], otherwise [A, C].

    Raises:
        ValueError: if ``insn`` is not one of the supported instructions.
    """
    m = tvm.var("M")
    A = tvm.placeholder((m, ), name="A", dtype=dtype)
    B = tvm.placeholder((m, ), name="B", dtype=dtype)

    if insn == "add":
        C = topi.add(A, B)
    elif insn == "sub":
        C = topi.subtract(A, B)
    elif insn == "mul":
        C = topi.multiply(A, B)
    elif insn == "div":
        C = topi.divide(A, B)
    elif insn == "max":
        C = topi.maximum(A, B)
    elif insn == "min":
        C = topi.minimum(A, B)
    elif insn == "abs":
        C = tvm.compute(A.shape, lambda *index: tvm.abs(A(*index)), name='C')
    elif insn == "exp":
        C = topi.exp(A)
    elif insn == "log":
        C = topi.log(A)
    elif insn == "sqrt":
        # Previously this result was immediately overwritten with topi.log(A).
        C = topi.sqrt(A)
    elif insn == "adds":
        C = A + tvm.const(2, dtype)
    elif insn == "muls":
        C = A * tvm.const(2, dtype)
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError("unsupported insn: {}".format(insn))

    s = tvm.create_schedule([C.op])
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        if insnType == "binary":
            mod = akg.build(s, [A, B, C], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
        else:
            mod = akg.build(s, [A, C], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
    return mod
def _cmpare_value(input_data, nudged_min, nudged_max):
    """
    where((input_data<=nudged_max)&(input_data>=nudged_min),1,0)

    The mask is built arithmetically: a product of two non-negative margins is
    capped at 2**(-126) and then scaled by 2**62 * 2**62 * 2**2.

    Args:
        input_data (tvm.tensor.Tensor): Input data
        nudged_min (tvm.tensor.Tensor): Minimum value of comparison
        nudged_max (tvm.tensor.Tensor): Maximum value of comparison

    Returns:
        tvm.tensor.Tensor
    """
    # (2**(-126))*(2**(62))*(2**(62))*(2**(2)) = 1
    # so tiny * big * big * four = 1
    tiny = tvm.const(2**(-126), dtype="float32")
    big = tvm.const(2**(62), dtype="float32")
    four = tvm.const(2**(2), dtype="float32")

    zeros = topi.multiply(input_data, 0)
    big_tensor = topi.add(zeros, big)
    tiny_tensor = topi.add(zeros, tiny)
    four_tensor = topi.add(zeros, four)

    # max(input_data - nudged_min + tiny, 0)
    lower_margin = topi.add(topi.subtract(input_data, nudged_min), tiny)
    lower_margin = topi.maximum(lower_margin, zeros)

    # max(nudged_max - input_data + tiny, 0)
    upper_margin = topi.add(topi.subtract(nudged_max, input_data), tiny)
    upper_margin = topi.maximum(upper_margin, zeros)

    # Cap the product at tiny, then scale it up in three steps.
    capped = topi.minimum(topi.multiply(lower_margin, upper_margin), tiny_tensor)
    scaled = topi.multiply(capped, big_tensor)
    scaled = topi.multiply(scaled, big_tensor)
    return topi.multiply(scaled, four_tensor)
def _log_taylor(data):
    """Polynomial for log: with x = data - 1,
    log(1+x) = ((((0.2x - 0.25)x + 0.33333)x - 0.5)x + 1)x
    """
    x = topi.subtract(data, 1)
    coefficients = [0.2, -0.25, 1 / 3, -0.5, 1]

    # Horner evaluation: start from the leading term, then repeatedly add the
    # next coefficient and multiply by x.
    acc = topi.multiply(x, coefficients[0])
    for coef in coefficients[1:]:
        acc = topi.add(acc, coef)
        acc = topi.multiply(acc, x)
    return acc
def bn_gamma_grad(head, in_data, data_sum, layout="NHWC"):
    """
    Compute sum((in_data - data_sum / (N*H*W)) * head) over axes (0, 1, 2).

    Args:
        head: tvm.tensor.Tensor in `layout`; its N, H and W extents must be
            constants because their ``.value`` is read.
        in_data: tvm.tensor.Tensor.
        data_sum: tvm.tensor.Tensor, divided by N*H*W to form the mean.
        layout: 'NCHW' transposes `head` to NHWC first; any other value leaves
            `head` as it is.

    Returns:
        tvm.tensor.Tensor.
    """
    if layout == "NCHW":
        # Was `topi.tranpose`, which does not exist and raised AttributeError.
        head = topi.transpose(head, (0, 2, 3, 1))
        # NOTE(review): in_data is not transposed here - presumably the caller
        # already supplies it in a shape compatible with NHWC `head`; confirm.

    n, h, w, _ = head.shape
    scale = tvm.const(n.value * h.value * w.value, head.dtype)

    mean = topi.divide(data_sum, scale)
    x_hat = topi.subtract(in_data, mean)
    x_hat_mul = topi.multiply(x_hat, head)
    # Use a local name that does not shadow this function.
    gamma_grad = topi.sum(x_hat_mul, axis=(0, 1, 2))
    return gamma_grad
def sinh_compute(x):
    """Compute sinh as (exp(x) - exp(-x)) * 0.5."""
    dtype = x.dtype
    is_fp16 = dtype == "float16"

    # compute in float32 to get a more precise result
    if is_fp16:
        x = topi.cast(x, "float32")

    exp_pos = Exp(x, utils.CCE)
    exp_neg = Exp(topi.multiply(x, -1), utils.CCE)
    res = topi.multiply(topi.subtract(exp_pos, exp_neg), tvm.const(0.5, "float32"))

    if is_fp16:
        res = topi.cast(res, "float16")
    return res
def less_compare_float32(data_x, data_y):
    """if x is less than y, then return 1, else return 0"""
    shape_inputs = get_shape(data_x)

    # 2**(-126) is used as the smallest float32 step
    tiny = akg.lang.ascend.broadcast(tvm.const(2**(-126), dtype="float32"), shape_inputs, "float32")
    zeros = akg.lang.ascend.broadcast(dc.zero_const("float32"), shape_inputs, "float32")

    # clamp (y - x) into [0, 2**(-126)]
    clamped = topi.subtract(data_y, data_x)
    clamped = topi.minimum(clamped, tiny)
    clamped = topi.maximum(clamped, zeros)

    # Scale by 2**126 in total. Per the original note cce only supports
    # factors up to 2**62, so the scaling is split into 2**62 * 2**62 * 2**2.
    scaled = topi.multiply(clamped, tvm.const(2**62, dtype="float32"))
    scaled = topi.multiply(scaled, tvm.const(2**62, dtype="float32"))
    return topi.multiply(scaled, tvm.const(2**2, dtype="float32"))
def fake_quant_with_min_max_vars_per_channel_compute(input_data, input_min, input_max,
                                                     num_bits=8, narrow_range=False):
    """fake_quant_with_min_max_vars_per_channel compute implementation.

    With (nudged_min, nudged_max, scale) from nudged_min_max_compute:
        clamped = max(min(input_data, nudged_max), nudged_min)
        result  = floor((clamped - nudged_min) / scale + 0.5) * scale + nudged_min
    The result is finally multiplied by the value returned by
    bool_both_zero_compute(min, max).
    """
    shape = get_shape(input_data.shape)
    dtype = input_data.dtype
    min_broadcast = akg.lang.ascend.broadcast(input_min, shape, dtype)
    max_broadcast = akg.lang.ascend.broadcast(input_max, shape, dtype)

    # get nudged_min, nudged_max and scale by nudged_min_max_compute function
    nudged = nudged_min_max_compute(min_broadcast, max_broadcast, num_bits, narrow_range)
    nudged_min = nudged[0]
    nudged_max = nudged[1]
    scale = nudged[2]

    # transform the input between nudged_max and nudged_min
    clamped = topi.maximum(topi.minimum(input_data, nudged_max), nudged_min)

    # calculate the quantized and dequantized results
    shifted = topi.subtract(clamped, nudged_min)
    if product_is_mini():
        # on mini the division is a multiplication by the reciprocal
        quotient = mul(shifted, reciprocal(scale), target=utils.CCE)
    else:
        quotient = Divide(shifted, scale, target=utils.CCE)

    rounded = akg.lang.ascend.floor(topi.add(quotient, dc.half_const(dtype)))
    if product_is_mini():
        rounded = topi.cast(rounded, "float16")
    rounded = topi.cast(rounded, "float32")

    dequantized = topi.add(topi.multiply(rounded, scale), nudged_min)

    # get bool_both_zero_value by bool_both_zero_compute function
    bool_both_zero_value = bool_both_zero_compute(min_broadcast, max_broadcast)
    return topi.multiply(dequantized, bool_both_zero_value)
def _do_atan_taylor(data):
    """
    Taylor algorithm for atan.

        if x > 0 and x < tan(pi/8):
            atan(x) = x - x^3/3 + x^5/5 - x^7/7 ...
        elif x > tan(pi/8) and x < tan(pi/4):
            atan(x) = atan(y) + atan((x-y)/(1+xy))

    Both candidates are always built and the smaller one is returned.

    Args:
        data (tvm.tensor.Tensor): Input data.

    Returns:
        A tvm.tensor.Tensor of atan(x).
    """
    dtype = data.dtype

    def _horner(order, squared):
        """Evaluate ATAN_TAYLOR_COEF[0..order] as a polynomial in ``squared``."""
        acc = tvm.const(ATAN_TAYLOR_COEF[order], dtype)
        for idx in range(order - 1, -1, -1):
            acc = topi.multiply(acc, squared)
            acc = topi.add(acc, tvm.const(ATAN_TAYLOR_COEF[idx], dtype))
        return acc

    # Shifted argument: |(x - y) / (1 + x * y)| with y = TAN_PI_BY_EIGHT
    offset = tvm.const(TAN_PI_BY_EIGHT, dtype)
    denominator = topi.multiply(data, tvm.const(TAN_PI_BY_EIGHT, dtype))
    denominator = topi.add(denominator, dc.one_const(dtype))
    numerator = topi.subtract(data, offset)
    shifted = topi.abs(topi.divide(numerator, denominator))

    # Candidate 1: series in the shifted argument plus CONST_PI_BY_EIGHT
    shifted_res = _horner(CONST_ITERTOR, topi.multiply(shifted, shifted))
    shifted_res = topi.multiply(shifted_res, shifted)
    shifted_res = topi.add(shifted_res, tvm.const(CONST_PI_BY_EIGHT, dtype))

    # Candidate 2: series in the original argument
    direct_res = _horner(CONST_ITERTOR2, topi.multiply(data, data))

    return topi.minimum(shifted_res, topi.multiply(direct_res, data))
def fused_minimum_or_maximum_grad(dz, x, y, grad_x, grad_y, op_type):
    """
    Gradient for minimum or maximum operation between two input tensors `x` and `y`.

    Args:
        dz (tvm.tensor.Tensor): Type float16, float32, int32.
        x (tvm.tensor.Tensor): Type float16, float32, int32.
        y (tvm.tensor.Tensor): Type float16, float32, int32.
        grad_x (bool): Whether calculate dx.
        grad_y (bool): Whether calculate dy.
        op_type (str): The type of the op, "GE" for MaximumGrad or "LE" for MinimumGrad.

    Note:
        At least one of grad_x and grad_y is True.

    Returns:
        (dx, dy, attrs) if both grad_x and grad_y are True, (dx, attrs) if only
        grad_x is True, otherwise (dy, attrs). dx and dy have the dtype of dz;
        attrs is the value returned by get_default_attrs().

    Raises:
        ValueError: if op_type is not "GE"/"LE", or neither gradient is requested.
    """
    for tensor in (x, y, dz):
        vc_util.check_shape(tensor)
    vc_util.ops_dtype_check([x.dtype, y.dtype, dz.dtype],
                            [vc_util.DtypeForDavinci.ALL_FLOAT, vc_util.DtypeForDavinci.INT32])
    vc_util.broadcast_check(x, dz)
    vc_util.broadcast_check(y, dz)

    # check op types
    check_list = ["GE", "LE"]
    if op_type not in check_list:
        raise ValueError("FusedMinimumOrMaximumGrad only support %s while op type is %s" %
                         (",".join(check_list), op_type))
    if not (grad_x or grad_y):
        raise ValueError("At least one of grad_x and grad_y is True.")

    x_shape = get_shape(x)
    y_shape = get_shape(y)
    dz_shape = get_shape(dz)
    ori_dtype = dz.dtype

    # broadcast both operands to the shape of dz before comparing them
    x = akg.lang.cce.broadcast(x, dz_shape)
    y = akg.lang.cce.broadcast(y, dz_shape)

    # Compute dtype: float16 on mini, float32 for int32 input elsewhere.
    if utils.product_is_mini() and ori_dtype != "float16":
        compute_dtype = "float16"
    elif ori_dtype == "int32":
        compute_dtype = "float32"
    else:
        compute_dtype = None
    if compute_dtype is not None:
        x = cast(x, compute_dtype)
        y = cast(y, compute_dtype)
        dz = cast(dz, compute_dtype)

    zero = zero_const(dz.dtype)

    # dx takes dz wherever x wins the comparison; dy takes the remainder.
    if op_type == "LE":
        dx = tvm.compute(dz_shape,
                         lambda *i: tvm.expr.Select((x(*i) <= y(*i)), dz(*i), zero),
                         name='dx')
    else:
        # op_type was validated above, so this is the "GE" case
        dx = tvm.compute(dz_shape,
                         lambda *i: tvm.expr.Select((x(*i) >= y(*i)), dz(*i), zero),
                         name='dx')
    dy = topi.subtract(dz, dx)

    if dx.dtype == "float16":
        # cast to fp32 for higher precision of reduce_sum.
        if get_shape(dx) != x_shape:
            dx = cast(dx, "float32")
        if get_shape(dy) != y_shape:
            dy = cast(dy, "float32")

    # reduce the gradients back to the original operand shapes
    dx = sum.sum_by_shape(dx, x_shape)
    dy = sum.sum_by_shape(dy, y_shape)

    if ori_dtype != dx.dtype:
        dx = cast(dx, ori_dtype)
    if ori_dtype != dy.dtype:
        dy = cast(dy, ori_dtype)

    attrs = get_default_attrs()
    if grad_x and grad_y:
        return dx, dy, attrs
    if grad_x:
        return dx, attrs
    return dy, attrs
def _bool_negate(input_bool):
    """Negate every value by computing 1 - input_bool."""
    one = dc.one_const(input_bool.dtype)
    return topi.subtract(one, input_bool)
def nudged_min_max_compute(min_broadcast, max_broadcast, num_bits, narrow_range):
    """
    Calculate the maximum and minimum values of the quantization.

    Notes:
        quant_min is 1 if narrow_range else 0, quant_max is 2^num_bits - 1.
        For every element:
            scale = (max_broadcast - min_broadcast) / (quant_max - quant_min)
            zero_point_from_min = quant_min - min_broadcast / scale
        nudged_zero_point is
            quant_min                        if zero_point_from_min < quant_min,
            quant_max                        if zero_point_from_min > quant_max,
            floor(zero_point_from_min + 0.5) otherwise,
        assembled as a sum of three masked terms. Finally
            nudged_min = (quant_min - nudged_zero_point) * scale
            nudged_max = (quant_max - nudged_zero_point) * scale

    Args:
        min_broadcast (tvm.tensor.Tensor): minimum value to be quantified for each channel.
        max_broadcast (tvm.tensor.Tensor): maximum value to be quantified for each channel.
        num_bits (int): num_bits is the bitwidth of the quantization.
        narrow_range (bool): if True the quantization range is [1, 2^num_bits - 1],
                             else it is [0, 2^num_bits - 1].

    Returns:
        List [nudged_min, nudged_max, scale] of tvm.tensor.Tensor.
    """
    dtype = min_broadcast.dtype
    quant_min = 1 if narrow_range else 0
    quant_max = (2**num_bits) - 1
    full_shape = min_broadcast.shape

    # quant_min and quant_max are broadcast because every channel is computed.
    quant_min_float = topi.full(full_shape, dtype, tvm.const(quant_min, dtype))
    quant_max_float = topi.full(full_shape, dtype, tvm.const(quant_max, dtype))

    # per-channel value range and quantization range
    value_range = topi.subtract(max_broadcast, min_broadcast)
    quant_range = topi.subtract(quant_max_float, quant_min_float)

    # scale = value_range / quant_range and min_div_scale = min_broadcast / scale
    # NOTE(review): the mini branch uses both `mul` and `Mul`; kept exactly as
    # found - confirm that both names are in scope.
    if product_is_mini():
        scale = mul(value_range, reciprocal(quant_range), target=utils.CCE)
        min_div_scale = Mul(min_broadcast, reciprocal(scale), target=utils.CCE)
    else:
        scale = Divide(value_range, quant_range, target=utils.CCE)
        min_div_scale = Divide(min_broadcast, scale, target=utils.CCE)

    # zero_point_from_min = quant_min_float - min_broadcast / scale
    zero_point_from_min = topi.subtract(quant_min_float, min_div_scale)

    # 1 where zero_point_from_min < quant_min_float, else 0
    is_below = less_compare_float32(zero_point_from_min, quant_min_float)
    # 1 where quant_max_float < zero_point_from_min, else 0
    is_above = less_compare_float32(quant_max_float, zero_point_from_min)

    # masked contributions of the two clipped cases
    below_term = topi.multiply(quant_min_float, is_below)
    above_term = topi.multiply(quant_max_float, is_above)

    # mask of the values lying inside [quant_min, quant_max]
    ones = topi.full(full_shape, dtype, dc.one_const(dtype))
    not_below = topi.subtract(ones, is_below)
    not_above = topi.subtract(ones, is_above)
    is_inside = topi.multiply(not_below, not_above)
    inside_value = topi.multiply(zero_point_from_min, is_inside)

    # round the in-range values: floor(value + 0.5)
    inside_round = akg.lang.ascend.floor(topi.add(inside_value, dc.half_const(dtype)))
    if product_is_mini():
        inside_round = topi.cast(inside_round, "float16")
    inside_round = topi.cast(inside_round, "float32")

    # calculate the maximum and minimum values of the quantization
    nudged_zero_point = topi.add(topi.add(below_term, above_term), inside_round)
    nudged_min = topi.multiply(topi.subtract(quant_min_float, nudged_zero_point), scale)
    nudged_max = topi.multiply(topi.subtract(quant_max_float, nudged_zero_point), scale)

    return [nudged_min, nudged_max, scale]
def _apply_adagrad_da_compute(var, gradient_accum, gradient_squared_accum,
                              grad, lr, l1, l2, global_step):
    """
    Build the AdagradDA update expressions.

    The arithmetic is carried out in float32. When ``var.dtype`` is
    "float16" the accumulators, gradient and hyper-parameters are promoted
    on entry and the three results are cast back to float16 on exit.

    Args:
        var: tensor whose dtype decides whether promotion is needed; only
            its ``dtype`` attribute is read here.
        gradient_accum: running sum of gradients.
        gradient_squared_accum: running sum of squared gradients.
        grad: current gradient.
        lr: learning rate tensor.
        l1: L1 regularization tensor; element 0 is compared against 0.
        l2: L2 regularization tensor.
        global_step: training step tensor, always cast to float32.

    Returns:
        Tuple ``(var_out, gradient_accum, gradient_squared_accum)`` holding
        the new variable value and the two updated accumulators.
    """
    origin_dtype = var.dtype

    # Promote half-precision operands so the math below runs in float32.
    if origin_dtype == "float16":
        gradient_accum, gradient_squared_accum, grad, lr, l1, l2 = (
            topi.cast(tensor, "float32")
            for tensor in (gradient_accum, gradient_squared_accum, grad, lr, l1, l2))

    # On mini the step goes through float16 before reaching float32
    # (presumably the direct cast is unavailable there -- TODO confirm).
    if product_is_mini():
        global_step = topi.cast(global_step, "float16")
    step_fp32 = topi.cast(global_step, "float32")

    # Step 1: grad_accum += grad
    accum = topi.add(gradient_accum, grad)

    # Step 2: grad_squared_accum += grad * grad
    squared_accum = topi.add(gradient_squared_accum, topi.multiply(grad, grad))

    # Step 3: l1 > 0  -> Sign(grad_accum) * max(|grad_accum| - l1 * global_step, 0)
    #         l1 <= 0 -> grad_accum
    accum_sign = Sign(accum)
    accum_abs = topi.abs(accum)
    l1_penalty = topi.multiply(step_fp32, l1)
    margin = topi.subtract(accum_abs, l1_penalty)
    clipped = topi.maximum(margin, tvm.const(0, margin.dtype))
    shrunk = topi.multiply(accum_sign, clipped)

    # The selection runs on float16 copies on mini and is cast back after.
    # Fresh names keep the float32 `accum` intact for the return value.
    cond_l1, on_true, on_false = l1, shrunk, accum
    if product_is_mini():
        cond_l1 = topi.cast(l1, "float16")
        on_true = topi.cast(shrunk, "float16")
        on_false = topi.cast(accum, "float16")
    chosen = akg.tvm.compute(
        on_true.shape,
        lambda *idx: tvm.expr.Select(cond_l1[0] > 0, on_true(*idx), on_false(*idx)))
    if product_is_mini():
        chosen = topi.cast(chosen, "float32")

    # Step 4: x_value = -1 * lr * tmp_val
    numerator = topi.multiply(topi.multiply(lr, tvm.const(-1, "float32")), chosen)

    # Step 5: y_value = l2 * global_step * lr + sqrt(grad_squared_accum)
    l2_term = topi.multiply(topi.multiply(l2, step_fp32), lr)
    root_term = sqrt(squared_accum, target=utils.CCE)
    denominator = topi.add(l2_term, root_term)

    # Step 6: var = x_value / y_value (mini multiplies by the reciprocal instead)
    if product_is_mini():
        var_out = topi.multiply(numerator, reciprocal(denominator, target=utils.CCE))
    else:
        var_out = topi.divide(numerator, denominator)

    # Hand results back in the caller's original precision.
    if origin_dtype == "float16":
        var_out, accum, squared_accum = (
            akg.lang.ascend.cast_to(tensor, "float16")
            for tensor in (var_out, accum, squared_accum))

    return var_out, accum, squared_accum
def _compute_var(var, lr, update):
    """
    Return ``var - update * lr[0]``.

    Args:
        var: tensor to be updated.
        lr: learning rate tensor; only element 0 is read.
        update: tensor scaled by the learning rate; its shape defines the
            shape of the scaled intermediate.

    Returns:
        Tensor produced by subtracting the scaled update from ``var``.
    """
    def _scaled_update(*idx):
        # Multiply every element of `update` by the scalar learning rate.
        return update(*idx) * lr[0]

    scaled = tvm.compute(update.shape, _scaled_update)
    return topi.subtract(var, scaled)