def fused_bn_reduce_grad(data0, data1, data2, data3, data4, data5, data6, data7,
                         layout='NHWC', out_dtype='float16', target=utils.CUDA):
    if layout == 'NCHW':
        data3 = topi.transpose(data3, (0, 2, 3, 1))
        data7 = topi.transpose(data7, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError('Layout not supported {} '.format(layout))

    n, h, w, c = data3.shape
    const = n * h * w
    inter_dtype = 'float32'

    out1 = topi.multiply(data4, data5)
    out1 = topi.divide(out1, const)
    out1 = topi.expand_dims(out1, axis=0, num_newaxis=3)
    out1 = topi.broadcast_to(out1, (n, h, w, c))

    data3 = topi.cast(data3, inter_dtype)
    data2 = topi.expand_dims(data2, axis=0, num_newaxis=3)
    data2 = topi.broadcast_to(data2, (n, h, w, c))
    out2 = topi.multiply(data3, const)
    out2 = topi.subtract(out2, data2)

    data1 = topi.expand_dims(data1, axis=0, num_newaxis=3)
    data1 = topi.broadcast_to(data1, (n, h, w, c))
    data7 = topi.cast(data7, inter_dtype)
    out3 = topi.divide(data6, const)
    out3 = topi.subtract(data7, out3)
    out3 = topi.multiply(data1, out3)
    out3 = topi.divide(out3, data0)

    output = topi.subtract(out2, out3)
    output = topi.multiply(output, out1)
    output = topi.cast(output, out_dtype)

    if layout == "NCHW":
        output = topi.transpose(output, (0, 3, 1, 2))

    return output

def _asin_grad_compute(x, dy):
    """Compute asin_grad."""
    dtype = x.dtype
    if dtype == "float16":
        x = topi.cast(x, "float32")
        dy = topi.cast(dy, "float32")

    # step 1: calculate num_to_vrsqrt = 1 - x^2
    data = topi.multiply(x, x)
    data = topi.multiply(data, tvm.const(-1, "float32"))
    num_to_vrsqrt = topi.add(data, tvm.const(1, "float32"))

    # step 2: calculate dy * (1 / sqrt(1 - x^2))
    if utils.product_is_mini():
        # mini: use newton's method for high accuracy result
        res = _vrsqrt_newton(num_to_vrsqrt)
        res = topi.multiply(res, dy)
    else:
        # cloud: use vdiv for high efficiency computation
        vsqrt_res = topi.sqrt(num_to_vrsqrt)
        res = topi.divide(dy, vsqrt_res)

    if dtype == "float16":
        res = topi.cast(res, "float16")

    return res

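# A minimal usage sketch for the compute above (hypothetical shapes and names,
# assuming tvm placeholders are in scope): the formula follows from
# d/dx asin(x) = 1 / sqrt(1 - x^2), so the gradient is dy * rsqrt(1 - x^2).
#
#   x = tvm.placeholder((32, 32), name="x", dtype="float16")
#   dy = tvm.placeholder((32, 32), name="dy", dtype="float16")
#   dx = _asin_grad_compute(x, dy)   # dx = dy / sqrt(1 - x^2)
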
def fused_bn_follow(data0, data1, data2, data3, data4, target=utils.CUDA):
    """
    input:
        data: length is 5
        data0: param0 beta
        data1: param1 gamma
        data2: param2 BNupdate: xi_variance
        data3: param6 BNreduce: xi_mean
        data4: param7 xi_conv2d
        layout: (N, H, W, C)

    output:
        beta + gamma * xi_variance * (xi - xi_mean/(N*H*W))
    """
    n, h, w, c = data4.shape
    const = n * h * w
    inter_dtype = 'float32'
    data4 = topi.cast(data4, inter_dtype)

    multiply0 = topi.divide(data3, const)
    multiply0 = topi.expand_dims(multiply0, axis=0, num_newaxis=3)
    multiply0 = topi.broadcast_to(multiply0, (n, h, w, c))

    subtract0 = topi.subtract(data4, multiply0)
    multiply1 = topi.multiply(subtract0, data2)
    multiply2 = topi.multiply(multiply1, data1)
    add0 = topi.add(multiply2, data0)

    return add0

def fused_bn_follow_relu_avgpool(data0, data1, data2, data3, data4, data5,
                                 layout='NHWC', out_dtype='float16', target=utils.CUDA):
    """
    input:
        data: length is 6
        data0: tensor1 after bn_double_relu
        data1-5: bn parameters for conv2d tensor2
        layout: only (N, H, W, C), (N, C, H, W) supported
        out_dtype: float16

    output:
        avg-pooling( max(batch-normalized tensor1 + batch-normalized tensor2, 0) )
    """
    if layout == 'NCHW':
        data0 = topi.transpose(data0, (0, 2, 3, 1))
        data5 = topi.transpose(data5, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError('Layout not supported {} '.format(layout))

    n, h, w, c = data0.shape
    inter_dtype = 'float32'
    add0 = fused_bn_follow(data1, data2, data3, data4, data5)
    add0 = topi.cast(add0, data0.dtype)
    add1 = topi.add(data0, add0)
    output = topi.maximum(add1, 0)
    output = topi.cast(output, inter_dtype)
    output = topi.sum(output, axis=(1, 2))
    output = topi.divide(output, h * w)
    output = topi.cast(output, out_dtype)

    return output

def _atan_compute(data):
    """compute for atan"""
    dtype = data.dtype

    if dtype == "float16":
        data = topi.cast(data, "float32")

    abs_data = topi.abs(data)
    tensor_one = dc.one_const(abs_data.dtype)

    abs_data_sub_one = topi.subtract(abs_data, tensor_one)
    abs_data_add_one = topi.add(abs_data, tensor_one)
    abs_data2 = topi.abs(topi.divide(abs_data_sub_one, abs_data_add_one))

    # calculate atan for data with magnitude less than one
    res = _do_atan_taylor(abs_data)
    # calculate atan for data with magnitude greater than one
    res_mt_one = topi.add(_do_atan_taylor(abs_data2),
                          tvm.const(CONST_PI_BY_FOUR, abs_data2.dtype))
    res = topi.minimum(res, res_mt_one)

    if utils.product_is_mini() and data.dtype == "float32":
        sign_mask = topi.cast(topi.sign(topi.cast(data, "float16")), "float32")
    else:
        sign_mask = topi.sign(data)

    res = topi.multiply(res, sign_mask)

    if dtype == "float16":
        res = topi.cast(res, "float16")

    return res

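# Range-reduction note for _atan_compute: for |x| > 1 the code evaluates the
# Taylor branch on |(|x| - 1) / (|x| + 1)|, which lies in [0, 1), and adds
# pi/4, using the identity atan(t) = pi/4 + atan((t - 1) / (t + 1)). The two
# candidates are combined with topi.minimum, relying on the out-of-range
# branch evaluating to the larger value, and the sign mask restores the odd
# symmetry atan(-x) = -atan(x).
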
def fused_mul_div_rsqrt_mul_isfinite_red(input1, input2, out_dtype):
    """
    fused operator.

    Args:
        input1: tvm.tensor.Tensor.
        input2: tvm.tensor.Tensor.
        out_dtype: dtype of the output Tensors.

    Returns:
        list of tvm.tensor.Tensor.
    """
    mul_param1 = topi.multiply(input2, input2)
    divide_val = topi.divide(1, mul_param1)
    rsqrt_val = topi.rsqrt(divide_val)
    mul_param0 = topi.multiply(input1, rsqrt_val)
    isfinite = topi.isfinite(mul_param0)
    reduce_and = topi.all(isfinite)

    if mul_param0.dtype != out_dtype:
        mul_param0 = topi.cast(mul_param0, out_dtype)
        rsqrt_val = topi.cast(rsqrt_val, out_dtype)
        divide_val = topi.cast(divide_val, out_dtype)

    return [reduce_and, mul_param0, rsqrt_val, divide_val]

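# Note: algebraically rsqrt(1 / (input2 * input2)) equals |input2|, so
# mul_param0 is input1 * |input2|; the chain is kept as explicit divide and
# rsqrt stages presumably because the intermediates are returned as outputs.
# The topi.all reduction over isfinite flags whether any element overflowed to
# inf or nan along the way (for example when input2 contains zeros).
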
def sigmoid_cross_entropy_with_logits_grad_compute(predict, target, dout):
    """sigmoid_cross_entropy_with_logits_grad compute implementation"""
    dtype = predict.dtype
    if dtype == "float16":
        predict = topi.cast(predict, "float32")
        target = topi.cast(target, "float32")
        dout = topi.cast(dout, "float32")

    # e^x
    val1 = exp(predict)
    # 1 + e^x
    val2 = topi.add(val1, tvm.const(SCALAR_ONE, dtype="float32"))
    # e^x / (1 + e^x)
    val3 = topi.divide(val1, val2)
    # -target
    val4 = topi.multiply(target, tvm.const(SCALAR_NEGTIVE_ONE, dtype="float32"))
    # e^x / (1 + e^x) - y
    val5 = topi.add(val3, val4)
    result = topi.multiply(val5, dout)

    if dtype == "float16":
        result = topi.cast(result, dtype)
    return result

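# Worked derivation for the compute above: with the logits form of sigmoid
# cross entropy, loss(x, y) = -y*x + log(1 + e^x), the derivative is
# d(loss)/dx = e^x / (1 + e^x) - y = sigmoid(x) - y, which is exactly val5;
# the incoming gradient dout is then applied by the chain rule.
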
def asinh(x, target=utils.CCE):
    r"""
    Compute asinh function.

    .. math:: asinh(x) = \log(x + \sqrt{x^2 + 1})

    Args:
        x (tvm.tensor.Tensor): Tensor of type float16, float32.

    Returns:
        tvm.tensor.Tensor, has the same type and shape as x.

    Supported Platforms:
        'Ascend'
    """
    # check shape
    utils.check_shape(x)

    # check input tensor data_type
    utils.ops_dtype_check(x.dtype, utils.DtypeForDavinci.ALL_FLOAT)
    dtype = x.dtype

    # It is known that asinh(x) = log(x + sqrt(x*x+1)) and asinh(-x) = -asinh(x).
    # If x is a large negative number, (x + sqrt(x*x+1)) will be close to zero.
    # So, asinh(x) = sign(x) * log(|x| + sqrt(|x|*|x| + 1))
    compute_dtype = dtype
    if dtype == "float16":
        # To avoid overflow and improve accuracy, x is cast to float32
        compute_dtype = "float32"
        x = topi.cast(x, compute_dtype)

    x_abs = topi.abs(x)

    if product_is_mini():
        # sqrt(|x|*|x| + 1) = |x| * sqrt(1 + 1/(|x|*|x|))
        vsquare_add_one = topi.add(1, topi.divide(1, topi.multiply(x_abs, x_abs)))
        sqrt_compute_value = sqrt_mini_newton_iter_impl(vsquare_add_one)
        sqrt_value = topi.multiply(x_abs, sqrt_compute_value)
    else:
        x_abs_square_add_one = topi.add(topi.multiply(x_abs, x_abs), 1)
        sqrt_value = topi.sqrt(x_abs_square_add_one)

    x_add_sqrt = topi.add(x_abs, sqrt_value)

    if product_is_mini():
        log_value = log_compute_mini_impl(x_add_sqrt, target)
    else:
        log_value = topi.log(x_add_sqrt)

    res = topi.multiply(Sign(x, target), log_value)

    if res.dtype != dtype:
        res = topi.cast(res, dtype)

    if product_is_mini():
        attrs = {"enable_auto_inline": False}
        return res, attrs
    return res

def truncate_div_compute(input_x1, input_x2):
    """compute for truncate_div"""
    int_list = ("int32", "int8", "uint8")

    if input_x1.dtype in int_list:
        data_zero = dc.zero_const("float32")
        data_x_broad = cast(input_x1, "float32")
        data_y_broad = cast(input_x2, "float32")
        res_div = topi.divide(data_x_broad, data_y_broad)
        res_min_int = ceil(topi.minimum(res_div, data_zero))
        res_max_int = floor(topi.maximum(res_div, data_zero))
        res_trunc = topi.add(res_min_int, res_max_int)
        res_trunc = cast(res_trunc, "float32")
    else:
        res_trunc = topi.divide(input_x1, input_x2)

    return cast(res_trunc, input_x1.dtype)

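# Worked example of the truncation trick above: for res_div = -3.5,
# ceil(min(-3.5, 0)) = ceil(-3.5) = -3 and floor(max(-3.5, 0)) = floor(0) = 0,
# so the sum is -3, i.e. the quotient rounded toward zero; for res_div = 3.5
# the roles swap (0 + 3 = 3). This is why ceil is applied to the negative part
# and floor to the positive part before the two are added.
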
def fused_relu_grad_bn_reduce_grad(data_1, data_2, data_3, data_4, data_5, data_6,
                                   data_7, data_8, data_9, layout='NHWC', target=utils.CUDA):
    """
    fused_relu_grad_bn_reduce_grad.

    Args:
        data_1~data_9: tvm.tensor.Tensor.
        layout: input layout, only 'NCHW', 'NHWC' supported

    Returns:
        tvm.tensor.Tensor.
    """
    if layout == "NCHW":
        data_7 = topi.transpose(data_7, axes=(0, 2, 3, 1))
        data_8 = topi.transpose(data_8, axes=(0, 2, 3, 1))
        data_9 = topi.transpose(data_9, axes=(0, 2, 3, 1))
    elif layout != "NHWC":
        raise NotImplementedError('Layout not supported {} '.format(layout))

    data_tmp1 = topi.multiply(data_4, data_5)
    n, h, w, c = data_9.shape
    data_tmp2 = topi.full_like(data_tmp1, 1.0 / (n * h * w))
    data_tmp3 = topi.multiply(data_tmp1, data_tmp2)

    data_tmp5 = topi.full_like(data_9, 0.0)
    data_tmp6 = topi.greater(data_9, data_tmp5)
    data_tmp7 = topi.where(data_tmp6, data_8, data_tmp5)
    data_tmp8 = topi.cast(data_tmp7, 'float32')
    data_tmp9 = topi.full_like(data_tmp8, n * h * w)
    data_tmp10 = topi.multiply(data_tmp8, data_tmp9)

    data_tmp12 = topi.subtract(data_tmp10, data_3)
    data_tmp14 = topi.cast(data_7, 'float32')
    data_tmp15 = topi.multiply(data_6, data_tmp2)
    data_tmp17 = topi.subtract(data_tmp14, data_tmp15)
    data_tmp18 = topi.multiply(data_2, data_tmp17)
    data_tmp20 = topi.divide(data_tmp18, data_1)
    data_tmp21 = topi.subtract(data_tmp12, data_tmp20)
    data_tmp22 = topi.multiply(data_tmp3, data_tmp21)
    data_out = topi.cast(data_tmp22, 'float16')

    return data_out

def my_dsl(dtype, kernel_name, attrs):
    # NOTE: `insn` and `insnType` are expected to be defined at module level;
    # they select the operation to build and whether it takes one or two inputs.
    m = tvm.var("M")
    n = tvm.var("N")
    A = tvm.placeholder((m,), name="A", dtype=dtype)
    B = tvm.placeholder((m,), name="B", dtype=dtype)
    if insn == "add":
        C = topi.add(A, B)
    elif insn == "sub":
        C = topi.subtract(A, B)
    elif insn == "mul":
        C = topi.multiply(A, B)
    elif insn == "div":
        C = topi.divide(A, B)
    elif insn == "max":
        C = topi.maximum(A, B)
    elif insn == "min":
        C = topi.minimum(A, B)
    elif insn == "abs":
        C = tvm.compute(A.shape, lambda *index: tvm.abs(A(*index)), name='C')
    elif insn == "exp":
        C = topi.exp(A)
    elif insn == "log":
        C = topi.log(A)
    elif insn == "sqrt":
        C = topi.sqrt(A)
    elif insn == "adds":
        C = A + tvm.const(2, dtype)
    elif insn == "muls":
        C = A * tvm.const(2, dtype)
    # C = tvm.compute((m, ), lambda i: A[i] + B[i], name="C")
    s = tvm.create_schedule([C.op])
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        if insnType == "binary":
            mod = akg.build(s, [A, B, C], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
        else:
            mod = akg.build(s, [A, C], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
    return mod

def bn_gamma_grad(head, in_data, data_sum, layout="NHWC"):
    if layout == "NCHW":
        head = topi.transpose(head, (0, 2, 3, 1))
    n, h, w, c = head.shape
    n = n.value
    h = h.value
    w = w.value
    c = c.value
    scale = tvm.const(n * h * w, head.dtype)
    mean = topi.divide(data_sum, scale)
    x_hat = topi.subtract(in_data, mean)
    x_hat_mul = topi.multiply(x_hat, head)
    bn_gamma_grad = topi.sum(x_hat_mul, axis=(0, 1, 2))

    return bn_gamma_grad

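# The reduction above accumulates, per channel, sum over (N, H, W) of
# head * (in_data - mean), with mean recovered from the precomputed per-channel
# data_sum / (N*H*W); any remaining scaling by the inverse standard deviation
# would have to be applied by the caller. A minimal usage sketch (hypothetical
# shapes, assuming tvm placeholders are in scope):
#
#   head = tvm.placeholder((8, 14, 14, 64), name="head", dtype="float32")
#   x = tvm.placeholder((8, 14, 14, 64), name="x", dtype="float32")
#   x_sum = tvm.placeholder((64,), name="x_sum", dtype="float32")
#   dgamma = bn_gamma_grad(head, x, x_sum)   # per-channel result of shape (64,)
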
def fused_relu_grad_bn_double_reduce_grad(data0, data1, data2, data3, data4, data5, data6, data7,
                                          data8, data9, data10, data11, data12, data13, data14, data15,
                                          layout="NHWC", out_dtype="float16", target=utils.CUDA):
    if layout == 'NCHW':
        data5 = topi.transpose(data5, (0, 2, 3, 1))
        data9 = topi.transpose(data9, (0, 2, 3, 1))
        data13 = topi.transpose(data13, (0, 2, 3, 1))
        data14 = topi.transpose(data14, (0, 2, 3, 1))
        data15 = topi.transpose(data15, (0, 2, 3, 1))
    elif layout != 'NHWC':
        raise NotImplementedError('Layout not supported {} '.format(layout))

    inter_dtype = "float32"
    n, h, w, c = data5.shape
    scale = n * h * w

    mul = topi.multiply(data2, data3)
    mul1221 = topi.divide(mul, scale)

    # ReluGrad
    zero = tvm.const(0, data15.dtype)
    add = topi.add(data13, data14)
    addgrad = tvm.compute(add.shape,
                          lambda *i: tvm.if_then_else(data15(*i) >= zero, add(*i), zero),
                          tag=tag.INJECTIVE)
    addgrad = topi.cast(addgrad, inter_dtype)
    mul3283 = topi.multiply(scale, addgrad)
    sub1159 = topi.subtract(mul3283, data6)

    data5_cast = topi.cast(data5, inter_dtype)
    mul2372 = topi.divide(data4, scale)
    sub631 = topi.subtract(data5_cast, mul2372)
    mul1220 = topi.multiply(sub631, data1)
    div = topi.divide(mul1220, data0)
    sub271 = topi.subtract(sub1159, div)
    mul1218 = topi.multiply(mul1221, sub271)
    mul1218_cast = topi.cast(mul1218, out_dtype)

    mul1231 = topi.multiply(data11, data12)
    mul1230 = topi.divide(mul1231, scale)
    data9_cast = topi.cast(data9, inter_dtype)
    mul2364 = topi.divide(data8, scale)
    sub625 = topi.subtract(data9_cast, mul2364)
    mul1229 = topi.multiply(data10, sub625)
    div272 = topi.divide(mul1229, data7)
    sub272 = topi.subtract(sub1159, div272)
    mul1228 = topi.multiply(mul1230, sub272)
    mul1228_cast = topi.cast(mul1228, out_dtype)

    if layout == "NCHW":
        mul1218_cast = topi.transpose(mul1218_cast, (0, 3, 1, 2))
        mul1228_cast = topi.transpose(mul1228_cast, (0, 3, 1, 2))
    return [mul1218_cast, mul1228_cast]

def sqrt_mini_newton_iter_impl(x):
    """sqrt compute on mini with the Newton's Iteration"""

    # mini supports the rsqrt instruction, but not the sqrt instruction
    x_rsqrt = topi.rsqrt(x)
    x_sqrt = topi.divide(1, x_rsqrt)

    # newton_iter: x(n+1) = 1/2 * (x(n) + a/x(n))
    steps = 3
    half = tvm.const(0.5, x.dtype)
    shape = x.shape
    for i in range(steps):
        x_sqrt = tvm.compute(shape,
                             lambda *indice: half * (x_sqrt(*indice) + x(*indice) / x_sqrt(*indice)),
                             name="x_sqrt_%s" % i)
    return x_sqrt

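# Worked example of the iteration x(n+1) = 0.5 * (x(n) + a / x(n)) for a = 2,
# starting from an initial estimate of 1.5 (the actual rsqrt-based seed will
# differ slightly):
#   x(1) = 0.5 * (1.5 + 2/1.5)         ~= 1.41667
#   x(2) = 0.5 * (1.41667 + 2/1.41667) ~= 1.414216
#   x(3)                               ~= 1.4142136, i.e. sqrt(2) to float precision,
# which illustrates why three refinement steps on top of the rsqrt seed suffice.
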
def acosh_grad(y, dy):
    """
    Gradient for acosh.

    Note:
        dx = dy * 1/sinh(y)

    Args:
        y (tvm.tensor.Tensor): tensor of type float16, float32.
        dy (tvm.tensor.Tensor): same type and shape as y.

    Returns:
        tvm.tensor.Tensor, same type and shape as y.

    Supported Platforms:
        'Ascend'
    """
    # the mini product is used for inference only and does not support this gradient op
    if product_is_mini():
        raise RuntimeError("The mini product does not support the acosh_grad operator")

    dtype = y.dtype
    utils.ops_dtype_check(y.dtype, utils.DtypeForDavinci.ALL_FLOAT)
    utils.elemwise_dtype_check(dtype, dy.dtype)
    utils.check_shape(y.shape)
    utils.elemwise_shape_check(y.shape, dy.shape)

    if dtype == "float16":
        y = topi.cast(y, "float32")
        dy = topi.cast(dy, "float32")

    # If we use sinh(y) = (exp(y) - exp(-y))/2 directly, there will be precision problems.
    # For example, as dx = dy/sinh(y), when exp(y) and exp(-y) are close, the small precision
    # error of the exp calculation will be greatly enlarged in the final result.
    sinh_y = _sinh_taylor(y)
    dx = topi.divide(dy, sinh_y)

    if dx.dtype != dtype:
        dx = topi.cast(dx, dtype)
    attrs = {"enable_auto_inline": False}
    return dx, attrs

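# Derivation note: with x = cosh(y), dx/dy = sinh(y), so the gradient pushed
# back through y = acosh(x) is dy * 1/sinh(y), which is the dx computed above;
# _sinh_taylor is used instead of (exp(y) - exp(-y)) / 2 to avoid the
# cancellation described in the comment above when y is near zero.
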
def _atan2_compute(y, x):
    """compute for atan2"""
    const_pi_by_two = 1.5707963267948966192313216916398
    dtype = y.dtype
    if dtype == "float16":
        y = topi.cast(y, "float32")
        x = topi.cast(x, "float32")

    x_lt_zero_y_mask, y_ge_zero_mask = _init_atan2_mask(y, x)
    y_cmp_zero = topi.multiply(y_ge_zero_mask, tvm.const(const_pi_by_two, "float32"))
    res_x_lt_zero = topi.multiply(x_lt_zero_y_mask, dc.pi_const("float32"))

    # calculate the atan(y/x) when x > 0
    if product_is_mini():
        x_rec = reciprocal(x, target=utils.CCE)
        res = topi.multiply(y, x_rec)
    else:
        res = topi.divide(y, x)
    res, _ = atan(res)

    if product_is_mini():
        tensor_zero = dc.zero_const("float16")
        x = topi.cast(x, "float16")
        y_cmp_zero = topi.cast(y_cmp_zero, "float16")
        res = topi.cast(res, "float16")
    else:
        tensor_zero = dc.zero_const("float32")

    res = tvm.compute(res.shape,
                      lambda *i: tvm.expr.Select(x(*i) == tensor_zero, y_cmp_zero(*i), res(*i)),
                      name="res")

    if product_is_mini():
        res = topi.cast(res, "float32")

    res = topi.add(res, res_x_lt_zero)
    return topi.cast(res, dtype)

def asinh_grad(y, dy):
    """
    Gradient for asinh.

    Note:
        dx = dy * 1/cosh(y)

    Args:
        y (tvm.tensor.Tensor): tensor of type float16, float32.
        dy (tvm.tensor.Tensor): same type and shape as y.

    Returns:
        tvm.tensor.Tensor, same type and shape as y.

    Supported Platforms:
        'Ascend'
    """
    # the mini product is used for inference only and does not support this gradient op
    if product_is_mini():
        raise RuntimeError("The mini product does not support the asinh_grad operator")

    dtype = y.dtype
    utils.ops_dtype_check(y.dtype, utils.DtypeForDavinci.ALL_FLOAT)
    utils.elemwise_dtype_check(dtype, dy.dtype)
    utils.check_shape(y.shape)
    utils.elemwise_shape_check(y.shape, dy.shape)

    if dtype == "float16":
        y = topi.cast(y, "float32")
        dy = topi.cast(dy, "float32")

    dx = topi.divide(dy, cosh(y))

    if dx.dtype != dtype:
        dx = topi.cast(dx, dtype)
    return dx

def _do_atan_taylor(data):
    """
    Taylor algorithm for atan.

        if x > 0 and x < tan(pi/8):
            atan(x) = x - x^3/3 + x^5/5 - x^7/7 ...
        elif x > tan(pi/8) and x < tan(pi/4):
            atan(x) = atan(y) + atan((x-y)/(1+xy))

    Args:
        data (tvm.tensor.Tensor): Input data.

    Returns:
        A tvm.tensor.Tensor of atan(x).
    """
    dtype = data.dtype

    tensor_offset = tvm.const(TAN_PI_BY_EIGHT, dtype)
    deno = topi.multiply(data, tvm.const(TAN_PI_BY_EIGHT, dtype))
    deno = topi.add(deno, dc.one_const(dtype))
    molecule = topi.subtract(data, tensor_offset)
    ddata = topi.divide(molecule, deno)
    ddata = topi.abs(ddata)

    square_ddata = topi.multiply(ddata, ddata)
    res = tvm.const(ATAN_TAYLOR_COEF[CONST_ITERTOR], dtype)
    for i in reversed(range(CONST_ITERTOR)):
        res = topi.multiply(res, square_ddata)
        res = topi.add(res, tvm.const(ATAN_TAYLOR_COEF[i], dtype))
    res = topi.multiply(res, ddata)
    res = topi.add(res, tvm.const(CONST_PI_BY_EIGHT, dtype))

    square_data = topi.multiply(data, data)
    res2 = tvm.const(ATAN_TAYLOR_COEF[CONST_ITERTOR2], dtype)
    for i in reversed(range(CONST_ITERTOR2)):
        res2 = topi.multiply(res2, square_data)
        res2 = topi.add(res2, tvm.const(ATAN_TAYLOR_COEF[i], dtype))
    return topi.minimum(res, topi.multiply(res2, data))

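# Note on the range reduction above: with y = tan(pi/8), the identity
# atan(x) = atan(y) + atan((x - y) / (1 + x*y)) maps inputs in
# (tan(pi/8), tan(pi/4)) back into the small-argument range where the Taylor
# polynomial (evaluated by Horner's rule in the loops) is accurate. `molecule`
# and `deno` are the numerator x - tan(pi/8) and denominator 1 + x*tan(pi/8)
# of that identity, and CONST_PI_BY_EIGHT is atan(tan(pi/8)).
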
def _apply_adagrad_da_compute(var, gradient_accum, gradient_squared_accum,
                              grad, lr, l1, l2, global_step):
    """Compute adagrad_da."""
    dtype = var.dtype
    # cast to float32 for higher precision
    if dtype == "float16":
        gradient_accum = topi.cast(gradient_accum, "float32")
        gradient_squared_accum = topi.cast(gradient_squared_accum, "float32")
        grad = topi.cast(grad, "float32")
        lr = topi.cast(lr, "float32")
        l1 = topi.cast(l1, "float32")
        l2 = topi.cast(l2, "float32")
    if product_is_mini():
        global_step = topi.cast(global_step, "float16")
        global_step = topi.cast(global_step, "float32")
    else:
        global_step = topi.cast(global_step, "float32")

    # 1.grad_accum += grad
    gradient_accum = topi.add(gradient_accum, grad)

    # 2.grad_squared_accum += grad * grad
    gs = topi.multiply(grad, grad)
    gradient_squared_accum = topi.add(gradient_squared_accum, gs)

    # 3.if l1 > 0: tmp_val = Sign(grad_accum) * max(|grad_accum|-l1*global_step, 0)
    #   else: tmp_val = grad_accum
    sign_val = Sign(gradient_accum)
    abs_val = topi.abs(gradient_accum)
    mul_val = topi.multiply(global_step, l1)
    sub_val = topi.subtract(abs_val, mul_val)
    max_val = topi.maximum(sub_val, tvm.const(0, sub_val.dtype))
    tmp_val = topi.multiply(sign_val, max_val)

    def select(l1, tmp_val, gradient_accum):
        """Returns tmp_val if l1 > 0 else gradient_accum."""
        if product_is_mini():
            l1 = topi.cast(l1, "float16")
            tmp_val = topi.cast(tmp_val, "float16")
            gradient_accum = topi.cast(gradient_accum, "float16")
        tmp_val = akg.tvm.compute(
            tmp_val.shape,
            lambda *i: tvm.expr.Select(l1[0] > 0, tmp_val(*i), gradient_accum(*i)))
        return topi.cast(tmp_val, "float32") if product_is_mini() else tmp_val

    tmp_val = select(l1, tmp_val, gradient_accum)

    # 4.x_value = -1 * lr * tmp_val
    x_value = topi.multiply(lr, tvm.const(-1, "float32"))
    x_value = topi.multiply(x_value, tmp_val)

    # 5.y_value = l2 * global_step * lr + sqrt(grad_squared_accum)
    pro_val = topi.multiply(l2, global_step)
    pro_val = topi.multiply(pro_val, lr)
    sqrt_val = sqrt(gradient_squared_accum, target=utils.CCE)
    y_value = topi.add(pro_val, sqrt_val)

    # 6.var = x_value / y_value
    if product_is_mini():
        y_rec = reciprocal(y_value, target=utils.CCE)
        var_out = topi.multiply(x_value, y_rec)
    else:
        var_out = topi.divide(x_value, y_value)

    if dtype == "float16":
        var_out = akg.lang.ascend.cast_to(var_out, "float16")
        gradient_accum = akg.lang.ascend.cast_to(gradient_accum, "float16")
        gradient_squared_accum = akg.lang.ascend.cast_to(gradient_squared_accum, "float16")

    return var_out, gradient_accum, gradient_squared_accum
