def _bessel_i1e_compute(input_data):
    """bessel i1e compute"""
    shape = utils.get_shape(input_data)
    dtype = input_data.dtype

    # choose the compute type at the beginning
    if dtype == "float16":
        input_data = Cast(input_data, "float32", target=utils.CCE)

    abs_data = Abs(input_data, utils.CCE)
    # compute bessel_i1e for data in (-3.75, 3.75)
    before_res = _before_res_compute(abs_data)
    # compute bessel_i1e for data in the other domain
    after_res = _after_res_compute(abs_data)

    # The vcmp_lt and vsel instructions do not support fp32 on the mini device,
    # so the select is done in fp16 there. This could be simplified by methods
    # such as "auto cast".
    if product_is_mini():
        res = akg.tvm.compute(
            shape,
            lambda *indice: akg.tvm.expr.Select(
                abs_data[indice].astype("float16") < akg.tvm.const(CONST_LIMIT, "float16"),
                before_res[indice].astype("float16"),
                after_res[indice].astype("float16")))
        res = Cast(res, "float32", target=utils.CCE)
    else:
        res = akg.tvm.compute(
            shape,
            lambda *indice: akg.tvm.expr.Select(
                abs_data[indice] < CONST_LIMIT,
                before_res[indice],
                after_res[indice]))

    data_sign = Sign(input_data, target=utils.CCE)
    res = mul(res, data_sign, target=utils.CCE)

    # cast back to the original type at the end
    if dtype == "float16":
        res = Cast(res, "float16", target=utils.CCE)
    return res
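
# The final multiplication by Sign(input_data) restores the sign for negative
# inputs: I1 is an odd function, so i1e(x) = e^-|x| * I1(x) is odd as well, and
# it is enough to approximate i1e(|x|) and flip the sign afterwards.
# Illustrative sketch only (hypothetical helper, not part of the op), assuming
# SciPy is available for reference values.
def _example_i1e_odd_symmetry():
    import numpy as np
    from scipy.special import i1e

    x = np.array([-5.0, -1.25, 0.5, 3.75, 10.0])
    # i1e(-x) == -i1e(x), so sign(x) * i1e(|x|) reproduces i1e(x)
    assert np.allclose(np.sign(x) * i1e(np.abs(x)), i1e(x))
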
def pad(data, paddings, padtype, target="cce"):
    """
    Add paddings to the tensor.

    :shape: The shape of the tensor; currently only two-dimensional tensors are supported.
    :paddings: The paddings, of shape [N, 2] where N is the rank of the tensor.
        For each dimension D of the input, paddings[D, 0] indicates how many values to add
        before the contents of the tensor in that dimension, and paddings[D, 1] indicates
        how many values to add after the contents of the tensor in that dimension.
    :dtype: The type of the input, float16 or float32.
    :padtype: One of "CONSTANT", "REFLECT", or "SYMMETRIC".
    """
    # check shape
    utils.check_shape(data.shape)
    # check dtype
    utils.ops_dtype_check(data.dtype, utils.DtypeForDavinci.ALL_TYPES)
    # check padding type
    ptype_checklist = ['constant']
    if padtype not in ptype_checklist:
        raise RuntimeError("pad_cce only supports %s while padtype is %s" % (",".join(ptype_checklist), padtype))

    dtype = data.dtype
    if dtype == 'int8' or dtype == 'uint8':
        data = Cast(data, "float16", target=target)

    rank = len(data.shape)
    pad_before = []
    pad_after = []
    for i in range(rank):
        pad_before.append(paddings[i][0])
        pad_after.append(paddings[i][1])
    B = tvm_pad(data, pad_before, pad_after=pad_after, name='B')

    if dtype == 'int8' or dtype == 'uint8':
        B = Cast(B, dtype, target=target)
    return B
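
# For the supported 'constant' mode, the paddings layout matches numpy.pad's
# pad_width argument: paddings[D] = [before, after] per dimension. Illustrative
# sketch only (hypothetical helper, not part of the op), assuming a 2-D input
# and zero constant padding.
def _example_pad_numpy():
    import numpy as np

    data = np.arange(6, dtype=np.float32).reshape(2, 3)
    paddings = [[1, 0], [0, 2]]  # one row before, two columns after
    padded = np.pad(data, pad_width=paddings, mode='constant', constant_values=0)
    assert padded.shape == (3, 5)
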
def scale(input_data, scale_data, target="cce"):
    """
    Computes scaled input_data, res = input_data * scale_data

    Args:
        input_data(akg.tvm.Tensor): Tensor of type float16, float32, int8, uint8, int32.
        scale_data(akg.tvm.Tensor): Tensor of same type as input_data. If shape(scale_data) != shape(input_data),
            the shape of scale_data will be broadcast to shape(input_data).

    Returns:
        akg.tvm.Tensor of same type and shape as input_data.
    """
    # check shape
    input_data_shape = [x.value for x in input_data.shape]
    scale_shape = [x.value for x in scale_data.shape]
    utils.check_shape(input_data_shape)
    utils.check_shape(scale_shape)

    # check type
    check_list = ["float16", "float32", "int8", "uint8", "int32"]
    dtype = input_data.dtype
    if dtype not in check_list:
        raise TypeError(
            "scale operator only supports %s while dtype is %s" % (",".join(check_list), dtype))
    if scale_data.dtype != dtype:
        raise TypeError(
            "type(input_data) is %s, type(scale_data) is %s, which is inconsistent" % (dtype, scale_data.dtype))

    orig_dtype = dtype
    if dtype == "int8" or dtype == "uint8":
        dtype = "float16"
    if dtype == "int32":
        dtype = "float32"
    if dtype != orig_dtype:
        input_data = Cast(input_data, dtype, target=utils.CCE)
        scale_data = Cast(scale_data, dtype, target=utils.CCE)

    if scale_shape != input_data_shape:
        scale_data = akg.topi.broadcast_to(scale_data, input_data_shape)

    res = akg.tvm.compute(
        input_data_shape,
        lambda *indice: input_data(*indice) * scale_data(*indice),
        name="res")

    if res.dtype != orig_dtype:
        res = Cast(res, orig_dtype, target=utils.CCE)

    return res
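
# The scale is a plain elementwise multiply, with scale_data broadcast to the
# input shape when the shapes differ (e.g. a per-channel scale). Illustrative
# sketch only (hypothetical helper, not part of the op), using NumPy
# broadcasting as a stand-in for akg.topi.broadcast_to.
def _example_scale_numpy():
    import numpy as np

    input_data = np.ones((2, 3), dtype=np.float32)
    scale_data = np.array([1.0, 2.0, 3.0], dtype=np.float32)  # broadcast over rows
    res = input_data * np.broadcast_to(scale_data, input_data.shape)
    assert np.array_equal(res, np.array([[1., 2., 3.], [1., 2., 3.]], dtype=np.float32))
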
def scale_bias(input_data, scale_data, bias_data, target="cce"):
    """
    Adds bias_data to scaled input_data, res = input_data * scale_data + bias_data

    Args:
        input_data(akg.tvm.Tensor): Tensor of type float16, float32, int8, uint8, int32.
        scale_data(akg.tvm.Tensor): Tensor of same type as input_data. If shape(scale_data) != shape(input_data),
            the shape of scale_data will be broadcast to shape(input_data).
        bias_data(akg.tvm.Tensor): Tensor of same type as input_data. If shape(bias_data) != shape(input_data),
            the shape of bias_data will be broadcast to shape(input_data).

    Returns:
        akg.tvm.Tensor of same type and shape as input_data.
    """
    # check shape
    input_data_shape = [x.value for x in input_data.shape]
    bias_shape = [x.value for x in bias_data.shape]
    utils.check_shape(bias_shape)

    # check type
    if bias_data.dtype != input_data.dtype:
        raise RuntimeError(
            "type(input_data) is %s, type(bias_data) is %s, which is inconsistent" % (input_data.dtype, bias_data.dtype))

    scale_input_data = scale(input_data, scale_data)

    dtype = bias_data.dtype
    orig_dtype = dtype
    if dtype == "int8" or dtype == "uint8":
        dtype = "float16"
    if dtype == "int32":
        dtype = "float32"
    if dtype != orig_dtype:
        scale_input_data = Cast(scale_input_data, dtype, target=utils.CCE)
        bias_data = Cast(bias_data, dtype, target=utils.CCE)

    if bias_shape != input_data_shape:
        bias_data = akg.topi.broadcast_to(bias_data, input_data_shape)

    res = akg.tvm.compute(
        input_data_shape,
        lambda *indice: scale_input_data(*indice) + bias_data(*indice),
        name="res_bias")

    if res.dtype != orig_dtype:
        res = Cast(res, orig_dtype, target=utils.CCE)

    return res
def _bessel_i0e_compute(input_data):
    """bessel i0e compute"""
    shape_input = input_data.shape
    dtype_input = input_data.dtype

    # choose the compute type at the beginning
    if dtype_input == "float16":
        input_data = Cast(input_data, "float32", target=utils.CCE)

    abs_data = Abs(input_data, target=utils.CCE)

    # compute bessel_i0e for data in (-3.75, 3.75)
    # t = |x| / 3.75
    # I0e = e^-|x| * (1 + 3.5156229t^2 + 3.0899424t^4 + 1.2067492t^6 + 0.2659732t^8
    #       + 0.0360768t^10 + 0.0045813t^12), |x| <= 3.75
    broad_const_limit = akg.lang.ascend.broadcast(
        akg.tvm.const(CONST_LIMIT, "float32"), shape_input)
    before_abs_data = minimum(abs_data, broad_const_limit)
    data = topi.multiply(before_abs_data, 1.0 / CONST_LIMIT)
    square_data = mul(data, data, target=utils.CCE)

    before_res = topi.multiply(square_data, ITR_BEFORE[LEN_BEFORE - 1])
    before_res = topi.add(before_res, ITR_BEFORE[LEN_BEFORE - 2])
    for iter_number in ITR_BEFORE[LEN_BEFORE - 3::-1]:
        before_res = mul(before_res, square_data, target=utils.CCE)
        before_res = topi.add(before_res, iter_number)
    exp_data = Exp(neg(before_abs_data, target=utils.CCE), target=utils.CCE)
    before_res = mul(before_res, exp_data, target=utils.CCE)

    # compute bessel_i0e for data in the other domain
    # t = |x| / 3.75
    # I0e(x) = (1 / sqrt(|x|)) * (0.39894228 + 0.01328592t^-1 + 0.00225319t^-2 - 0.00157565t^-3
    #          + 0.00916281t^-4 - 0.02057706t^-5 + 0.02635537t^-6 - 0.01647633t^-7
    #          + 0.00392377t^-8), |x| >= 3.75
    data = Divide(broad_const_limit, abs_data, target=utils.CCE)
    after_res = topi.multiply(data, ITR_AFTER[LEN_AFTER - 1])
    after_res = topi.add(after_res, ITR_AFTER[LEN_AFTER - 2])
    for iter_number in ITR_AFTER[LEN_AFTER - 3::-1]:
        after_res = mul(after_res, data, target=utils.CCE)
        after_res = topi.add(after_res, iter_number)
    rsqrt_data = rsqrt(abs_data, target=utils.CCE)
    after_res = mul(after_res, rsqrt_data, target=utils.CCE)
    after_res = minimum(before_res, after_res, target=utils.CCE)

    # cast back to the original type at the end
    if dtype_input == "float16":
        after_res = Cast(after_res, "float16", target=utils.CCE)
    return after_res
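
# The two branches above follow the classic Abramowitz & Stegun polynomial
# approximations of I0, rescaled by e^-|x|, evaluated with Horner's scheme.
# Illustrative sketch only (hypothetical helper, not part of the op), assuming
# SciPy is available for reference values; the coefficient lists are copied
# from the comments above.
def _example_i0e_polynomials():
    import numpy as np
    from scipy.special import i0e

    small = [1.0, 3.5156229, 3.0899424, 1.2067492, 0.2659732, 0.0360768, 0.0045813]
    large = [0.39894228, 0.01328592, 0.00225319, -0.00157565, 0.00916281,
             -0.02057706, 0.02635537, -0.01647633, 0.00392377]

    x = np.array([0.5, 2.0, 3.75, 6.0, 20.0])
    t = np.abs(x) / 3.75
    # |x| <= 3.75: e^-|x| times a polynomial in t^2
    before = np.exp(-np.abs(x)) * np.polyval(small[::-1], t * t)
    # |x| >= 3.75: (1 / sqrt(|x|)) times a polynomial in 1/t
    after = np.polyval(large[::-1], 1.0 / t) / np.sqrt(np.abs(x))
    approx = np.where(np.abs(x) <= 3.75, before, after)
    assert np.allclose(approx, i0e(x), atol=1e-6)
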
def truncate_div_compute(input_x1, input_x2):
    """compute for truncate_div"""
    int_list = ("int32", "int8", "uint8")

    if input_x1.dtype in int_list:
        data_zero = dc.zero_const("float32")
        data_x_broad = Cast(input_x1, "float32", target=utils.CCE)
        data_y_broad = Cast(input_x2, "float32", target=utils.CCE)
        res_div = topi.divide(data_x_broad, data_y_broad)
        # round toward zero: trunc(x) == ceil(min(x, 0)) + floor(max(x, 0))
        res_min_int = ceil(topi.minimum(res_div, data_zero))
        res_max_int = floor(topi.maximum(res_div, data_zero))
        res_trunc = topi.add(res_min_int, res_max_int)
        res_trunc = Cast(res_trunc, "float32", target=utils.CCE)
    else:
        res_trunc = topi.divide(input_x1, input_x2)

    return Cast(res_trunc, input_x1.dtype, target=utils.CCE)
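
# The integer branch rounds the quotient toward zero by applying ceil to the
# negative part and floor to the positive part, then summing. Illustrative
# sketch only (hypothetical helper, not part of the op), checking the identity
# with NumPy.
def _example_truncate_div_numpy():
    import numpy as np

    x = np.array([7, -7, 7, -7], dtype=np.float32)
    y = np.array([2, 2, -2, -2], dtype=np.float32)
    q = x / y
    trunc = np.ceil(np.minimum(q, 0)) + np.floor(np.maximum(q, 0))
    assert np.array_equal(trunc, np.trunc(q))  # [3, -3, -3, 3]
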
def truncatemod_func(a, b):
    """function for truncatemod formula"""
    # For positive numbers, floor and trunc are equivalent
    return akg.topi.subtract(
        a,
        akg.topi.multiply(
            b,
            Cast(floor(Divide(a, b, utils.CCE)), b.dtype, target=utils.CCE)))
def truncatemod(x, y, target=utils.CCE):
    """
    Computes the remainder of division (x / y).

    Note:
        res = x - y * trunc(x / y)

    Args:
        x(tvm.tensor.Tensor): Input tensor. Supports float16 on the mini device, and int32, int8,
            uint8, float16, float32 on cloud devices.
        y(tvm.tensor.Tensor): Tensor with same type as input tensor x.

    Returns:
        tvm.tensor.Tensor of same type as input tensors.
    """
    utils.check_shape(x)
    utils.check_shape(y)
    utils.elemwise_dtype_check(x.dtype, y.dtype)
    dtype = x.dtype
    support_dtype = [
        utils.DtypeForDavinci.ALL_FLOAT, utils.DtypeForDavinci.INT32,
        utils.DtypeForDavinci.INT8, utils.DtypeForDavinci.UINT8
    ]
    if product_is_mini():
        support_dtype = [utils.DtypeForDavinci.FLOAT16]

    utils.ops_dtype_check(dtype, support_dtype)

    if not product_is_mini():
        # High-precision compute is required here.
        # For example, let x = 132.05 and y = 131.95; x and y are very close, but the difference
        # between trunc(x) = 132 and trunc(y) = 131 is 1.
        if dtype != "float32":
            x = Cast(x, "float32", target=target)
            y = Cast(y, "float32", target=target)
        res = akg.topi.mod(x, y)
    else:
        res = _truncatemod_compute_mini(x, y)

    if res.dtype != dtype:
        res = Cast(res, dtype, target=target)
    return res
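
# The note above, res = x - y * trunc(x / y), is the C-style remainder whose
# sign follows the dividend. Illustrative sketch only (hypothetical helper, not
# part of the op), comparing the formula with numpy.fmod, which uses the same
# convention.
def _example_truncatemod_numpy():
    import numpy as np

    x = np.array([5.0, -5.0, 5.0, -5.0])
    y = np.array([3.0, 3.0, -3.0, -3.0])
    res = x - y * np.trunc(x / y)
    assert np.allclose(res, np.fmod(x, y))  # [2, -2, 2, -2]
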
def bitwise_not(data, target=utils.CCE):
    """
    Bitwise-not.

    Args:
        data (tvm.tensor.Tensor): Input data of type int8 or int32.

    Returns:
        tvm.tensor.Tensor, Bitwise-not result.
    """
    utils.ops_dtype_check(data.dtype, utils.DtypeForDavinci.ALL_INT)
    utils.check_shape(data.shape)

    one = akg.tvm.const(1, dtype=data.dtype)
    minus_one = akg.tvm.const(-1, dtype=data.dtype)
    # two's complement identity: ~x == -(x + 1)
    add_one = akg.lang.ascend.vadds(data, one)
    multiply_one = akg.lang.ascend.vmuls(add_one, minus_one)
    res = Cast(multiply_one, data.dtype, target=target)
    return res
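
# The vadds/vmuls sequence relies on the two's complement identity ~x == -(x + 1),
# which avoids needing a dedicated bitwise instruction. Illustrative sketch only
# (hypothetical helper, not part of the op), verified with NumPy.
def _example_bitwise_not_numpy():
    import numpy as np

    x = np.array([0, 1, -1, 127, -128], dtype=np.int32)
    assert np.array_equal(-(x + 1), np.bitwise_not(x))
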
def ones_like(input):
    """
    Generate an array of ones.

    Args:
        input (tvm.tensor.Tensor): Tensor. Should be of type float16, float32, int32, uint8, int8.

    Returns:
        tvm.tensor.Tensor with the same type and shape as input.
    """
    dtype = input.dtype
    shape = get_shape(input)
    utils.ops_dtype_check(dtype, [utils.DtypeForDavinci.ALL_TYPES])
    utils.check_shape(shape)
    res = akg.tvm.compute(shape,
                          lambda *i: akg.tvm.const(1, "float16"),
                          name="res",
                          attrs={'no_inline': 1})
    res = Cast(res, dtype, target=utils.CCE)
    return res
def cast_conv(data, fmap_shape, filter_shape, pad_, stride_, dilation_,
              use_bias=False, block_size=16, attrs=None):
    a = data[0]
    data[1].dtype = 'float32'
    b = Cast(data[1], 'float16', target='cce')
    if use_bias:
        conv_data = [a, b, data[2]]
    else:
        conv_data = [a, b]

    # mmad fp32 failed in post_fusing
    res, _ = conv_core(conv_data, fmap_shape, filter_shape, pad_, stride_,
                       dilation_, use_bias, block_size, attrs)
    return res, {}
def custom_reduce_min_fdiff(out, inputs, grad, ad_attrs, new_pld_array):
    # dtype is captured from the enclosing scope when this is used as an
    # override for akg.differentiate
    data = inputs[0]
    shape = get_shape(data)
    if len(get_shape(data)) == 2:
        # add an extra stage to avoid alignment problem
        min_input = akg.tvm.compute(data.shape, lambda *i: data(*i), name="min_input")
        min_ = akg.lang.ascend.reduce_min(min_input, axis=-1, keepdims=True)
        min_broadcast = akg.lang.ascend.broadcast(min_, shape)
        if dtype != "float16":
            data = Cast(data, "float16", target=utils.CCE)
        return [
            akg.tvm.compute(shape,
                            lambda i, j: akg.tvm.expr.Select(
                                data[i, j] == min_broadcast[i, j],
                                grad[i], akg.tvm.const(0, dtype="float16")),
                            name="reduce_min_ad2")
        ]
def custom_reduce_max_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
    # axis, keepdims and dtype are captured from the enclosing scope when this
    # is used as an override for akg.differentiate
    data_ = inputs[0]
    shape = data_.shape
    # reduce the maximum value for each column
    max_ = akg.lang.ascend.reduce_max(data_, axis=axis, keepdims=True)
    # copy the reduced values to get back the original shape
    max_broadcast = akg.lang.ascend.broadcast(max_, shape)
    # head broadcast is needed to generate correct cce code for the selection operation
    head_broadcast = akg.tvm.compute(
        shape,
        lambda *indices: head_(*get_reduced_indices(*indices, axis=axis, keepdims=keepdims)))
    # zero all values that are not max values; what remains equals the adjoint of the output
    max_values_and_zeros = akg.tvm.compute(
        shape,
        lambda *indices: akg.tvm.expr.Select(
            data_(*indices) == max_broadcast(*indices),
            head_broadcast(*indices),
            akg.tvm.const(0, dtype='float16')),
        name="reduce_max_ad2")
    # cast data back to the original dtype
    if dtype != 'float16':
        return [Cast(max_values_and_zeros, dtype, target=utils.CCE)]
    else:
        return [max_values_and_zeros]
def fused_minimum_or_maximum_grad(dz, x, y, grad_x, grad_y, op_type):
    """
    Gradient for the minimum or maximum operation between two input tensors `x` and `y`.

    Args:
        dz (tvm.tensor.Tensor): Type float16, float32, int32.
        x (tvm.tensor.Tensor): Type float16, float32, int32.
        y (tvm.tensor.Tensor): Type float16, float32, int32.
        grad_x (bool): Whether to calculate dx.
        grad_y (bool): Whether to calculate dy.
        op_type (str): The type of the op, "GE" for MaximumGrad or "LE" for MinimumGrad.

    Note:
        At least one of grad_x and grad_y must be True.

    Returns:
        dx, tvm.tensor.Tensor of the same type as the inputs; returned if grad_x is True.
        dy, tvm.tensor.Tensor of the same type as the inputs; returned if grad_y is True.
    """
    utils.check_shape(x)
    utils.check_shape(y)
    utils.check_shape(dz)
    utils.ops_dtype_check(
        [x.dtype, y.dtype, dz.dtype],
        [utils.DtypeForDavinci.ALL_FLOAT, utils.DtypeForDavinci.INT32])

    utils.broadcast_check(x, dz)
    utils.broadcast_check(y, dz)

    # check op types
    check_list = ["GE", "LE"]
    if op_type not in check_list:
        raise ValueError(
            "FusedMinimumOrMaximumGrad only supports %s while op type is %s" %
            (",".join(check_list), op_type))

    if not grad_x and not grad_y:
        raise ValueError("At least one of grad_x and grad_y must be True.")

    x_shape = get_shape(x)
    y_shape = get_shape(y)
    dz_shape = get_shape(dz)
    ori_dtype = dz.dtype

    # get greater compute
    x = akg.lang.ascend.broadcast(x, dz_shape)
    y = akg.lang.ascend.broadcast(y, dz_shape)

    if product_is_mini() and ori_dtype != "float16":
        x = Cast(x, "float16", "cce")
        y = Cast(y, "float16", "cce")
        dz = Cast(dz, "float16", "cce")
    elif ori_dtype == "int32":
        x = Cast(x, "float32", "cce")
        y = Cast(y, "float32", "cce")
        dz = Cast(dz, "float32", "cce")
    zero = zero_const(dz.dtype)

    if op_type == "LE":
        dx = tvm.compute(dz_shape,
                         lambda *i: tvm.expr.Select(x(*i) <= y(*i), dz(*i), zero),
                         name='dx')
        dy = topi.subtract(dz, dx)
    elif op_type == "GE":
        dx = tvm.compute(dz_shape,
                         lambda *i: tvm.expr.Select(x(*i) >= y(*i), dz(*i), zero),
                         name='dx')
        dy = topi.subtract(dz, dx)

    if dx.dtype == "float16":
        # cast to fp32 for higher precision of reduce_sum.
        if get_shape(dx) != x_shape:
            dx = Cast(dx, "float32", "cce")
        if get_shape(dy) != y_shape:
            dy = Cast(dy, "float32", "cce")

    dx = sum_by_shape(dx, x_shape)
    dy = sum_by_shape(dy, y_shape)

    if ori_dtype != dx.dtype:
        dx = Cast(dx, ori_dtype, "cce")
    if ori_dtype != dy.dtype:
        dy = Cast(dy, ori_dtype, "cce")

    attrs = get_default_attrs()
    if grad_x and grad_y:
        return dx, dy, attrs
    if grad_x:
        return dx, attrs
    return dy, attrs
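
# The gradient is routed with a select: the winning input receives dz and the
# other receives the rest (dy = dz - dx), with gradients later summed back to
# the original shapes when the inputs were broadcast. Illustrative sketch only
# (hypothetical helper, not part of the op), for MinimumGrad ("LE") without
# broadcasting.
def _example_minimum_grad_numpy():
    import numpy as np

    x = np.array([1.0, 5.0, 2.0])
    y = np.array([3.0, 4.0, 2.0])
    dz = np.array([0.1, 0.2, 0.3])
    dx = np.where(x <= y, dz, 0.0)  # x wins ties, matching the "LE" select
    dy = dz - dx
    assert np.allclose(dx, [0.1, 0.0, 0.3]) and np.allclose(dy, [0.0, 0.2, 0.0])
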
def cross(x, y, target=utils.CCE):
    """
    Compute the cross product of x and y.

    Note:
        The first dim of x or y must be 3. With two dims, for example, it is calculated as

        .. math::
            res = x \\times y = \\left[ \\begin{matrix}
            l, & \\cdots \\\\ m, & \\cdots \\\\ n, & \\cdots
            \\end{matrix} \\right] \\times \\left[ \\begin{matrix}
            o, & \\cdots \\\\ p, & \\cdots \\\\ q, & \\cdots
            \\end{matrix} \\right] = \\left[ \\begin{matrix}
            mq-np, & \\cdots \\\\ no-lq, & \\cdots \\\\ lp-mo, & \\cdots \\\\
            \\end{matrix} \\right]

    Args:
        x (tvm.tensor.Tensor): Input tensor, only supports float16, float32, int32, int8, uint8.
        y (tvm.tensor.Tensor): Input tensor, must have the same shape and dtype as x.

    Returns:
        A tvm.tensor.Tensor with the same shape and dtype as x.
    """
    utils.elemwise_shape_check(get_shape(y), get_shape(x))
    utils.elemwise_dtype_check(
        y.dtype, x.dtype,
        (utils.DtypeForDavinci.ALL_FLOAT) if product_is_mini()
        else (utils.DtypeForDavinci.FLOAT16, utils.DtypeForDavinci.FLOAT32,
              utils.DtypeForDavinci.INT32, utils.DtypeForDavinci.INT8,
              utils.DtypeForDavinci.UINT8))

    shape = get_shape(x)

    if shape[0] != 3:
        raise RuntimeError(
            "The first axis of input must be 3, actual input is %d" % shape[0])

    inp_dtype = x.dtype
    need_type_convert = inp_dtype in ("int8", "uint8")

    shape = get_shape(x)
    shp = shape[1:]

    if need_type_convert:
        x = Cast(x, "float16", target=utils.CCE)
        y = Cast(y, "float16", target=utils.CCE)

    a0b1 = tvm.compute(shp, lambda *i: x(0, *i) * y(1, *i), name="a0b1")
    a0b2 = tvm.compute(shp, lambda *i: x(0, *i) * y(2, *i), name="a0b2")
    a1b0 = tvm.compute(shp, lambda *i: x(1, *i) * y(0, *i), name="a1b0")
    a1b2 = tvm.compute(shp, lambda *i: x(1, *i) * y(2, *i), name="a1b2")
    a2b0 = tvm.compute(shp, lambda *i: x(2, *i) * y(0, *i), name="a2b0")
    a2b1 = tvm.compute(shp, lambda *i: x(2, *i) * y(1, *i), name="a2b1")

    res0 = tvm.compute(shp, lambda *i: a1b2(*i) - a2b1(*i), name="res0")
    res1 = tvm.compute(shp, lambda *i: a2b0(*i) - a0b2(*i), name="res1")
    res2 = tvm.compute(shp, lambda *i: a0b1(*i) - a1b0(*i), name="res2")

    res = tvm.compute(
        shape,
        lambda *i: tvm.expr.Select(
            i[0] == 0, res0(*i[1:]),
            tvm.expr.Select(i[0] == 1, res1(*i[1:]), res2(*i[1:]))),
        name='res')

    if need_type_convert:
        res = Cast(res, inp_dtype, target=utils.CCE)

    return res
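
# The op treats axis 0 as the 3-component axis, so the component formulas above
# match numpy.cross with axisa/axisb/axisc set to 0. Illustrative sketch only
# (hypothetical helper, not part of the op).
def _example_cross_numpy():
    import numpy as np

    x = np.array([[1.0], [2.0], [3.0]])  # shape (3, 1): components along axis 0
    y = np.array([[4.0], [5.0], [6.0]])
    res = np.cross(x, y, axisa=0, axisb=0, axisc=0)
    assert np.allclose(res[:, 0], [2*6 - 3*5, 3*4 - 1*6, 1*5 - 2*4])  # [-3, 6, -3]
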
def reduce_min_ad_optimized_manual_schedule(input_shape, dtype, axis, keepdims,
                                            polyhedral=True, attrs=None):
    def get_shape(pld):
        return [d.value for d in pld.shape]

    data = akg.tvm.placeholder(input_shape, dtype, name="input_data")

    # only works for the last axis and 2D; needs to be extended to multiple dimensions and axes.
    def custom_reduce_min_fdiff(out, inputs, grad, ad_attrs, new_pld_array):
        data = inputs[0]
        shape = get_shape(data)
        if len(get_shape(data)) == 2:
            # add an extra stage to avoid alignment problem
            min_input = akg.tvm.compute(data.shape, lambda *i: data(*i), name="min_input")
            min_ = akg.lang.ascend.reduce_min(min_input, axis=-1, keepdims=True)
            min_broadcast = akg.lang.ascend.broadcast(min_, shape)
            if dtype != "float16":
                data = Cast(data, "float16", target=utils.CCE)
            return [
                akg.tvm.compute(shape,
                                lambda i, j: akg.tvm.expr.Select(
                                    data[i, j] == min_broadcast[i, j],
                                    grad[i], akg.tvm.const(0, dtype="float16")),
                                name="reduce_min_ad2")
            ]

    l = reduce_min(data, axis, target=utils.CCE)
    head = akg.tvm.placeholder(l.shape, name="head", dtype=l.dtype)
    head_cast = Cast(head, "float16", target=utils.CCE)

    [dl_ddata] = akg.differentiate(l, [data], head_cast, None, None,
                                   override={l: ([data], custom_reduce_min_fdiff)})

    s = akg.tvm.create_schedule([dl_ddata.op])

    head_ub = s.cache_read(head, "local.UB", [head_cast])
    if dtype == "float16":
        data_ub = s.cache_read(data, "local.UB", [dl_ddata])
    else:
        data_ub = s.cache_read(data, "local.UB", [dl_ddata.op.input_tensors[0]])

    min_input_ub = s.cache_read(
        dl_ddata.op.input_tensors[1].op.input_tensors[0].op.input_tensors[0]
        .op.input_tensors[0].op.input_tensors[0],
        "local.UB",
        [dl_ddata.op.input_tensors[1].op.input_tensors[0].op.input_tensors[0]
         .op.input_tensors[0]])
    s[dl_ddata.op.input_tensors[1].op.input_tensors[0].op.input_tensors[0]
      .op.input_tensors[0]].set_scope("local.UB")

    dl_ddata_ub = s.cache_write(dl_ddata, "local.UB")

    # tiling
    split_axis = {}
    for i in range(len(attrs['tile'])):
        split_axis["axis" + str(i)] = s[dl_ddata].split(dl_ddata.op.axis[i], attrs["tile"][i])

    split_axis_sorted = sorted(split_axis.items())

    if dtype == "float16":
        s[data_ub].compute_at(s[dl_ddata], split_axis_sorted[-1][1][0])
    else:
        s[data_ub].compute_at(s[dl_ddata], split_axis_sorted[-1][1][0])
        s[dl_ddata.op.input_tensors[0]].compute_at(s[dl_ddata], split_axis_sorted[-1][1][0])
        s[dl_ddata.op.input_tensors[0]].set_scope("local.UB")

    s[min_input_ub].compute_at(s[dl_ddata], split_axis_sorted[0][1][1])

    s[head_ub].compute_at(s[dl_ddata], split_axis_sorted[-1][1][0])
    s[head_cast].compute_at(s[dl_ddata], split_axis_sorted[-1][1][0])
    s[head_cast].set_scope("local.UB")

    s[dl_ddata.op.input_tensors[1]].compute_at(s[dl_ddata], split_axis_sorted[-1][1][0])
    s[dl_ddata.op.input_tensors[1]].set_scope("local.UB")

    s[dl_ddata.op.input_tensors[1].op.input_tensors[0]].compute_at(
        s[dl_ddata], split_axis_sorted[0][1][1])
    s[dl_ddata.op.input_tensors[1].op.input_tensors[0]].set_scope("local.UB")

    s[dl_ddata.op.input_tensors[1].op.input_tensors[0].op.input_tensors[0]].compute_at(
        s[dl_ddata], split_axis_sorted[0][1][1])
    s[dl_ddata.op.input_tensors[1].op.input_tensors[0].op.input_tensors[0]].set_scope("local.UB")

    # L is not being used for computation
    # s[L].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    # s[L].set_scope("local.UB")

    s[dl_ddata_ub].compute_at(s[dl_ddata], split_axis_sorted[-1][1][0])

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [data, head, dl_ddata], "cce",
                        name="reduce_min_ad_manual_schedule",
                        attrs=attrs, polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "reduce_min_ad_manual_schedule"
        create_code(kernel_name, './', source_code)
    return mod
def smooth_l1_loss(prediction, targets, anchor_samples, anchor_sample_correct=0, delta=1.0):
    """
    Smooth L1 loss.

    For each value x in `error = predictions - targets`, the following is calculated:

    .. math::
        y = \\left\\{
        \\begin{array}{rcl}
            0.5 x^2, & & if \\left| x \\right| <= d \\\\
            0.5 d^2 + d \\cdot (\\left| x \\right| - d), & & if \\left| x \\right| > d
        \\end{array}
        \\right.

    `anchor_samples` acts as a condition on the loss: where anchor_samples == anchor_sample_correct,
    the loss is set to 0; elsewhere the computed loss is kept.

    Args:
        prediction (tvm.tensor.Tensor): A float tensor of shape [batch_size, num_anchors, code_size]
            representing the (encoded) predicted locations of objects.
        targets (tvm.tensor.Tensor): A float tensor of shape [batch_size, num_anchors, code_size]
            representing the regression targets.
        anchor_samples (tvm.tensor.Tensor): An int tensor of shape [batch_size, num_anchors].
        anchor_sample_correct (int): The threshold of anchor_samples.
        delta (float): The point where the loss function changes from quadratic to linear.

    Returns:
        loss (tvm.tensor.Tensor): A float tensor of shape [batch_size, num_anchors] representing the
            value of the loss function.
    """
    dim_info, _ = smooth_l1_loss_set_dim_func(prediction, targets, anchor_samples,
                                              anchor_sample_correct, delta)
    attrs = {DIM: dim_info}

    prediction_dtype = prediction.dtype
    target_dtype = targets.dtype
    anchor_samples_dtype = anchor_samples.dtype
    utils.elemwise_dtype_check(prediction_dtype, target_dtype, utils.DtypeForDavinci.ALL_FLOAT)
    utils.ops_dtype_check(anchor_samples_dtype,
                          [utils.DtypeForDavinci.INT8, utils.DtypeForDavinci.INT32])
    if anchor_sample_correct > 5 or anchor_sample_correct < 0:
        raise ValueError("anchor_sample_correct attr only supports [0, 5]")

    # check shape dim
    prediction_shape = get_shape(prediction)
    if len(prediction_shape) != 3:
        raise RuntimeError("Prediction shape only supports 3-dim!")

    target_shape = get_shape(targets)
    if len(target_shape) != 3:
        raise RuntimeError("Target shape only supports 3-dim!")

    anchor_samples_shape = get_shape(anchor_samples)
    if len(anchor_samples_shape) != 2:
        raise RuntimeError("anchor_samples shape only supports 2-dim!")

    prediction_dtype_old = prediction_dtype

    if product_is_mini() and prediction_dtype == 'float32':
        prediction = akg.topi.cast(prediction, "float16")
        targets = akg.topi.cast(targets, "float16")
        prediction_dtype = "float16"

    # cast anchor_samples to float type in order to use the vcmp instruction
    if anchor_samples.dtype.lower() != prediction_dtype.lower():
        anchor_samples = Cast(anchor_samples, prediction_dtype, target=utils.CCE)
        anchor_samples_dtype = anchor_samples.dtype.lower()

    coefficient = akg.tvm.const(0.5, dtype=prediction_dtype)
    delta = akg.tvm.const(delta, dtype=prediction_dtype)

    error = akg.topi.subtract(prediction, targets)
    abs_error = akg.topi.abs(error)
    quadratic = akg.topi.minimum(abs_error, delta)
    linear = akg.topi.subtract(abs_error, quadratic)
    loss = akg.topi.add(
        akg.topi.multiply(coefficient, akg.topi.multiply(quadratic, quadratic)),
        akg.topi.multiply(delta, linear))
    loss = akg.topi.sum(loss, axis=-1)
    loss = akg.tvm.compute(loss.shape,
                           lambda *i: akg.tvm.expr.Select(
                               anchor_samples(*i) == anchor_sample_correct,
                               akg.tvm.const(0, loss.dtype), loss(*i)),
                           name="loss")

    if product_is_mini() and prediction_dtype_old == 'float32':
        loss = akg.topi.cast(loss, prediction_dtype_old)

    return loss, attrs
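
# The branch-free form used above, 0.5 * min(|e|, d)^2 + d * (|e| - min(|e|, d)),
# reproduces the piecewise Huber definition in the docstring. Illustrative
# sketch only (hypothetical helper, not part of the op), checked with NumPy for
# delta = 1.0.
def _example_smooth_l1_numpy():
    import numpy as np

    d = 1.0
    e = np.array([-2.5, -0.5, 0.0, 0.3, 1.0, 4.0])
    quadratic = np.minimum(np.abs(e), d)
    loss = 0.5 * quadratic ** 2 + d * (np.abs(e) - quadratic)
    piecewise = np.where(np.abs(e) <= d, 0.5 * e ** 2, 0.5 * d ** 2 + d * (np.abs(e) - d))
    assert np.allclose(loss, piecewise)
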
def reduce_max_ad_optimized_manual_schedule(input_shape, dtype, axis, keepdims,
                                            polyhedral=True, attrs=None):
    def custom_reduce_max_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        data_ = inputs[0]
        shape = data_.shape
        # reduce the maximum value for each column
        max_ = akg.lang.ascend.reduce_max(data_, axis=axis, keepdims=True)
        # copy the reduced values to get back the original shape
        max_broadcast = akg.lang.ascend.broadcast(max_, shape)
        # head broadcast is needed to generate correct cce code for the selection operation
        head_broadcast = akg.tvm.compute(
            shape,
            lambda *indices: head_(*get_reduced_indices(*indices, axis=axis, keepdims=keepdims)))
        # zero all values that are not max values; what remains equals the adjoint of the output
        max_values_and_zeros = akg.tvm.compute(
            shape,
            lambda *indices: akg.tvm.expr.Select(
                data_(*indices) == max_broadcast(*indices),
                head_broadcast(*indices),
                akg.tvm.const(0, dtype='float16')),
            name="reduce_max_ad2")
        # cast data back to the original dtype
        if dtype != 'float16':
            return [Cast(max_values_and_zeros, dtype, target=utils.CCE)]
        else:
            return [max_values_and_zeros]

    # tensor for the input data
    data = akg.tvm.placeholder(input_shape, dtype, name="input_data")

    # computation of reduce max
    # not used in the schedule because it is the op being differentiated
    l = reduce_max(data, axis, keepdims, target=utils.CCE)

    # adjoint tensor for the differentiation
    head = akg.tvm.placeholder(l.shape, name="head", dtype=l.dtype)

    # cast input data
    if dtype != 'float16':
        data_cast = Cast(data, "float16", target=utils.CCE)
        head_cast = Cast(head, "float16", target=utils.CCE)
    else:
        data_cast = data
        head_cast = head

    # override differentiation computation with custom function
    [dl_ddata] = akg.differentiate(l, [data_cast], head_cast, None, None,
                                   override={l: ([data_cast], custom_reduce_max_fdiff)})

    # get tensors from custom function
    if dtype != 'float16':
        max_values_and_zeros = dl_ddata.op.input_tensors[0]
        max_broadcast = max_values_and_zeros.op.input_tensors[1]
        max_ = max_broadcast.op.input_tensors[0]
        head_broadcast = max_values_and_zeros.op.input_tensors[2]
    else:
        max_broadcast = dl_ddata.op.input_tensors[1]
        max_ = max_broadcast.op.input_tensors[0]
        head_broadcast = dl_ddata.op.input_tensors[2]

    # schedule for the differentiation operation
    # inputs: data and head
    s = akg.tvm.create_schedule([dl_ddata.op])

    # cache reads of inputs
    if dtype != 'float16':
        head_ub = s.cache_read(head, "local.UB", [head_cast])
        data_ub = s.cache_read(data, "local.UB", [data_cast])
    else:
        # no cast operation
        head_ub = s.cache_read(head_cast, "local.UB", [head_broadcast])
        data_ub = s.cache_read(data_cast, "local.UB", [max_, dl_ddata])

    # cache write for the output
    dl_ddata_ub = s.cache_write(dl_ddata, "local.UB")

    # get tiling attributes
    if attrs is None:
        raise Exception('attrs is None')
    tiling_factors = attrs['tile']
    split_iterators = []
    assert len(tiling_factors) == len(dl_ddata.shape)
    # split the final compute and save the iterators
    for index, factor in enumerate(tiling_factors):
        split_iterators.append(s[dl_ddata].split(dl_ddata.op.axis[index], factor))

    # get iterators
    iterator1 = split_iterators[0][0]

    # move computations when there is a cast
    if dtype != "float16":
        s[data_cast].compute_at(s[dl_ddata], iterator1)
        s[data_cast].set_scope("local.UB")
        s[head_cast].compute_at(s[dl_ddata], iterator1)
        s[head_cast].set_scope("local.UB")
        s[max_values_and_zeros].compute_at(s[dl_ddata], iterator1)
        s[max_values_and_zeros].set_scope("local.UB")

    # move cache reads and writes
    s[data_ub].compute_at(s[dl_ddata], iterator1)
    s[head_ub].compute_at(s[dl_ddata], iterator1)
    s[dl_ddata_ub].compute_at(s[dl_ddata], iterator1)

    # move computation of the differentiation
    s[max_].compute_at(s[dl_ddata], iterator1)
    s[max_].set_scope("local.UB")
    s[max_broadcast].compute_at(s[dl_ddata], iterator1)
    s[max_broadcast].set_scope("local.UB")
    s[head_broadcast].compute_at(s[dl_ddata], iterator1)
    s[head_broadcast].set_scope("local.UB")

    with akg.build_config(add_lower_pass=debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [head, data, dl_ddata], "cce",
                        name="reduce_max_ad_manual_schedule",
                        attrs=attrs, polyhedral=polyhedral)
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "reduce_max_ad_manual_schedule"
        create_code(kernel_name, './', source_code)
    return mod