def _bessel_i1e_compute(input_data):
    """
    Assemble the bessel_i1e computation for one input tensor.

    The magnitude of the input is fed to both approximation helpers
    (``_before_res_compute`` and ``_after_res_compute``). Each element takes
    the first result when its magnitude is below ``CONST_LIMIT`` and the
    second one otherwise; the sign of the input is multiplied back in last.

    Args:
        input_data (tvm.tensor.Tensor): Input tensor. A float16 input is
            promoted to float32 for the computation.

    Returns:
        tvm.tensor.Tensor, cast back to float16 when the input was float16.
    """
    out_shape = vc_util.get_shape(input_data)
    origin_dtype = input_data.dtype
    is_fp16_input = origin_dtype == "float16"

    # Work in float32 from the start when the caller handed us float16.
    if is_fp16_input:
        input_data = cast(input_data, "float32")

    magnitude = abs_value(input_data)
    below_limit_res = _before_res_compute(magnitude)
    other_domain_res = _after_res_compute(magnitude)

    if utils.product_is_mini():
        # vcmp_lt and vsel do not support fp32 on mini, so the comparison and
        # the selection are carried out in float16 and promoted afterwards.
        def _select_fp16(*idx):
            return akg.tvm.expr.Select(
                magnitude[idx].astype("float16") < akg.tvm.const(CONST_LIMIT, "float16"),
                below_limit_res[idx].astype("float16"),
                other_domain_res[idx].astype("float16"))

        selected = akg.tvm.compute(out_shape, _select_fp16)
        selected = cast(selected, "float32")
    else:
        def _select(*idx):
            return akg.tvm.expr.Select(
                magnitude[idx] < CONST_LIMIT,
                below_limit_res[idx],
                other_domain_res[idx])

        selected = akg.tvm.compute(out_shape, _select)

    # The helpers only saw |x|; restore the sign of the input.
    signed = mul(selected, sign(input_data))

    if is_fp16_input:
        return cast(signed, "float16")
    return signed
def pad(data, paddings, padtype):
    """
    Add paddings around a tensor.

    Args:
        data (tvm.tensor.Tensor): Tensor to pad. int8 and uint8 data is
            padded as float16 and cast back afterwards.
        paddings: Indexable of ``[before, after]`` pairs, one per dimension
            of ``data``. ``paddings[D][0]`` is the number of values added
            before the contents of dimension ``D`` and ``paddings[D][1]``
            the number added after them.
        padtype (str): Padding mode. Only ``'constant'`` is accepted.

    Returns:
        The padded tensor, with the same dtype as ``data``.

    Raises:
        RuntimeError: If ``padtype`` is not a supported mode.
    """
    vc_util.check_shape(data.shape)
    vc_util.ops_dtype_check(data.dtype, vc_util.DtypeForDavinci.ALL_TYPES)

    supported_padtypes = ['constant']
    if padtype not in supported_padtypes:
        raise RuntimeError("pad_cce only support %s while padtype is %s" %
                           (",".join(supported_padtypes), padtype))

    origin_dtype = data.dtype
    is_byte_type = origin_dtype == 'int8' or origin_dtype == 'uint8'
    if is_byte_type:
        data = cast(data, "float16")

    # Split the per-dimension pairs into the two lists tvm_pad expects.
    pad_before = []
    pad_after = []
    for axis in range(len(data.shape)):
        pair = paddings[axis]
        pad_before.append(pair[0])
        pad_after.append(pair[1])

    padded = tvm_pad(data, pad_before, pad_after=pad_after, name='B')
    if is_byte_type:
        padded = cast(padded, origin_dtype)
    return padded
def broadcast_to(x, shape):
    """
    Broadcast a tensor to a compatible shape.

    Args:
        x (tvm.tensor.Tensor): Tensor of type float32, float16, int8, uint8, int32.
        shape (list, tuple): The shape of the output tensor.

    Returns:
        A tvm.tensor.Tensor with the same type as x.
    """
    vc_util.check_shape(x)
    vc_util.check_shape(shape)

    origin_dtype = x.dtype
    vc_util.ops_dtype_check(origin_dtype, vc_util.DtypeForDavinci.ALL_TYPES)

    # vector_dup does not support int8 and uint8, so a one-element 1-D tensor
    # of those types takes a detour through float16.
    in_shape = get_shape(x)
    is_single_element = len(in_shape) == 1 and in_shape[0] == 1
    if is_single_element and origin_dtype in ["int8", "uint8"]:
        x = cast(x, "float16")

    broadcasted = topi.broadcast_to(x, shape)
    if broadcasted.dtype == origin_dtype:
        return broadcasted
    return cast(broadcasted, origin_dtype)
def minimum(input1, input2):
    """
    Return the element-wise minimum of two tensors.

    Note:
        minimum supports broadcasting.

    Args:
        input1: Tensor.
        input2: Tensor. Has the same type as input1.

    Returns:
        Tensor, has the same type as inputs.
    """
    vc_util.ops_dtype_check([input1.dtype, input2.dtype], vc_util.DtypeForDavinci.ALL_TYPES)
    vc_util.elemwise_dtype_check(input1.dtype, input2.dtype)
    origin_dtype = input1.dtype

    lhs_shape = [dim.value for dim in input1.shape]
    rhs_shape = [dim.value for dim in input2.shape]
    vc_util.check_shape(lhs_shape)
    vc_util.check_shape(rhs_shape)
    vc_util.auto_broadcast_check(lhs_shape, rhs_shape)

    # int8 / uint8 inputs are compared as float16 and cast back afterwards.
    via_fp16 = origin_dtype in ("int8", "uint8")
    if via_fp16:
        input1 = cast(input1, "float16")
        input2 = cast(input2, "float16")

    smaller = akg.topi.minimum(input1, input2)
    return cast(smaller, origin_dtype) if via_fp16 else smaller
def div(data1, data2):
    """
    Calculate x/y element-wise.

    Integer inputs (int32, int8, uint8) are divided as float16 and cast back
    to the input type; int8 and uint8 results are floored before the cast.
    Float inputs use normal floating point division.

    Note:
        div supports broadcasting.

    Args:
        data1 (tvm.tensor.Tensor): Tensor of type float16, float32, int32, int8 and uint8.
        data2 (tvm.tensor.Tensor): Tensor of type float16, float32, int32, int8 and uint8.

    Returns:
        tvm.tensor.Tensor, has the same type as data1 and data2.
    """
    vc_util.ops_dtype_check([data1.dtype, data2.dtype], vc_util.DtypeForDavinci.ALL_TYPES)
    vc_util.elemwise_dtype_check(data1.dtype, data2.dtype)
    origin_dtype = data1.dtype

    lhs_shape = [dim.value for dim in data1.shape]
    rhs_shape = [dim.value for dim in data2.shape]
    vc_util.check_shape(lhs_shape)
    vc_util.check_shape(rhs_shape)
    vc_util.auto_broadcast_check(lhs_shape, rhs_shape)

    # Bring both operands to the common output shape before dividing.
    lhs_norm, rhs_norm, out_shape = produce_shapes(lhs_shape, rhs_shape)
    numerator = data1 if lhs_norm == out_shape else akg.topi.broadcast_to(data1, out_shape)
    denominator = data2 if rhs_norm == out_shape else akg.topi.broadcast_to(data2, out_shape)

    is_integer = origin_dtype in ("int32", "int8", "uint8")
    if is_integer:
        numerator = cast(numerator, "float16")
        denominator = cast(denominator, "float16")

    if utils.product_is_mini():
        # On mini the quotient is formed as numerator * (1 / denominator).
        quotient = akg.topi.multiply(numerator, reciprocal(denominator))
    else:
        quotient = akg.topi.divide(numerator, denominator)

    if origin_dtype in ("int8", "uint8"):
        quotient = cast(floor(quotient), "float16")
    if is_integer:
        quotient = cast(quotient, origin_dtype)
    return quotient
def approximate_equal(x, y, tolerance=1e-5):
    """
    Check element-wise whether abs(x-y) is less than or equal to the tolerance.

    Args:
        x (tvm.tensor.Tensor): Tensor of type float16, float32.
        y (tvm.tensor.Tensor): Tensor of type float16, float32. Must have the
            same shape and dtype as x.
        tolerance (float): Non-negative threshold, default is 1e-5.

    Returns:
        tvm.tensor.Tensor of bool type. An element is True if abs(x-y) is
        less than or equal to the tolerance, else False.

    Raises:
        RuntimeError: If tolerance is negative, or the shapes or dtypes of
            x and y differ.
    """
    if tolerance < 0:
        raise RuntimeError("tolerance should >= 0")

    # check shape
    vc_util.check_shape(x)
    vc_util.check_shape(y)
    shape = get_shape(x)
    y_shape = get_shape(y)
    if shape != y_shape:
        # The values must be interpolated with %; passing them as extra
        # exception arguments leaves the placeholders unformatted.
        raise RuntimeError("input shape must be same, but got %s vs %s" % (shape, y_shape))

    # check input tensor data_type
    vc_util.ops_dtype_check(x.dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    vc_util.ops_dtype_check(y.dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    dtype = x.dtype
    if dtype != y.dtype:
        raise RuntimeError("input type must be same, but got %s vs %s" % (dtype, y.dtype))

    res_vsub = sub(x, y)
    res_vabs = abs_value(res_vsub)

    # As vcmp_lt and vsel instruction don't support fp32 on mini
    # It can be simplified by some methods, such as , "auto cast"
    if utils.product_is_mini():
        dtype = "float16"
        res_vabs = cast(res_vabs, dtype)

    # Constant tensors holding the "true" (1) and "false" (0) values to select from.
    t = akg.tvm.compute(shape, lambda *indice: akg.tvm.const(1, dtype), "t")
    f = akg.tvm.compute(shape, lambda *indice: akg.tvm.const(0, dtype), "f")
    res = akg.tvm.compute(
        shape,
        lambda *indice: akg.tvm.expr.Select(
            res_vabs[indice] <= akg.tvm.const(tolerance, dtype),
            t[indice], f[indice]))

    # It can be simplified once the cast op supports fp16 and fp32 to bool type
    res_fp16 = cast(res, "float16")
    res_bool = akg.tvm.compute(
        shape, lambda *indice: res_fp16(*indice).astype("bool"))
    return res_bool
def reduce_min_max(data, axis=None, keepdims=False, method="min"):
    """
    Compute the maximum or minimum of elements over one or more axes of a tensor.

    Args:
        data (tvm.tensor.Tensor): The input tensor to reduce. Should be of type
            float16, float32, int8, uint8, int32.
        axis (Union[list, tuple, int, None]): The dimensions to reduce. If None,
            all dimensions will be reduced. If int or list, must be in the
            range [-len(data.shape), len(data.shape) - 1].
        keepdims (bool): If True, retains reduced dimensions with length 1,
            default value is False.
        method (str): "min" or "max", selects the reduction, default value is min.

    Returns:
        tvm.tensor.Tensor of same type as input tensor data.

    Raises:
        ValueError: If all axes are reduced while keepdims is False, or if
            method is neither "min" nor "max".
    """
    vc_util.check_shape(data.shape)

    origin_dtype = data.dtype
    vc_util.ops_dtype_check(origin_dtype, vc_util.DtypeForDavinci.ALL_TYPES)

    # Normalize axis to a list: None means every axis, any sequence-like
    # value (it exposes ``index``) is copied, a bare int is wrapped.
    rank = len(data.shape)
    if axis is None:
        axis = list(range(rank))
    elif hasattr(axis, 'index'):
        axis = list(axis)
    elif isinstance(axis, int):
        axis = [axis]
    vc_util.is_valid_reduce_axis(data, axis)

    refined_axis = refine_reduce_axis(data, axis)
    reduces_everything = len(set(refined_axis)) == len(data.shape)
    if reduces_everything and not keepdims:
        raise ValueError("When reducing on all axes of input, keepdim should be set to True.")

    method_list = ["min", "max"]
    if method not in method_list:
        raise ValueError("supported method %s while given method is %s" %
                         (",".join(method_list), method))

    # In the emit_insn pass, for vmin and vmax, reduce_last_axis only support float16.
    if origin_dtype != "float16":
        data = cast(data, "float16")

    reducer = akg.topi.min if method == "min" else akg.topi.max
    reduced = reducer(data, axis=axis, keepdims=keepdims)

    if reduced.dtype != origin_dtype:
        reduced = cast(reduced, origin_dtype)
    return reduced
def equal_count(x, y):
    """
    Count the positions at which x and y hold equal values.

    Args:
        x (tvm.tensor.Tensor): One-dimensional tensor of type int32.
        y (tvm.tensor.Tensor): One-dimensional tensor of type int32.

    Returns:
        tvm.tensor.Tensor, the number of equal elements, type is int32.

    Raises:
        RuntimeError: If either input is not one-dimensional.
    """
    x_shape = get_shape(x)
    y_shape = get_shape(y)
    for candidate in (x_shape, y_shape):
        vc_util.check_shape(candidate)
    if len(x_shape) != 1 or len(y_shape) != 1:
        raise RuntimeError("Two inputs should all be one dim!")

    origin_dtype = x.dtype
    vc_util.ops_dtype_check([x.dtype, y.dtype], vc_util.DtypeForDavinci.INT32)

    # Due to instruction limitations the int32 data has to be compared as a
    # float type. float16 may overflow, so float32 is used wherever possible.
    compute_dtype = "float16" if product_is_mini() else "float32"
    x = cast(x, compute_dtype)
    y = cast(y, compute_dtype)

    _, _, out_shape = produce_shapes(x_shape, y_shape)

    def _ones(*idx):
        return akg.tvm.const(1, compute_dtype)

    def _zeros(*idx):
        return akg.tvm.const(0, compute_dtype)

    t = akg.tvm.compute(out_shape, _ones, "t")
    f = akg.tvm.compute(out_shape, _zeros, "f")
    x = akg.topi.broadcast_to(x, out_shape)
    y = akg.topi.broadcast_to(y, out_shape)

    # 1 where the elements match, 0 elsewhere; summing gives the count.
    def _match_flag(*idx):
        return akg.tvm.expr.Select(x[idx] == y[idx], t[idx], f[idx])

    z = akg.tvm.compute(out_shape, _match_flag, name="z")
    count, _ = sum_value(z)
    if count.dtype != origin_dtype:
        count = cast(count, origin_dtype)
    return count
def truncate_div_compute(input_x1, input_x2):
    """
    Compute input_x1 / input_x2 for truncate_div.

    For integer inputs (int32, int8, uint8) the quotient is formed in float32
    and rounded toward zero: the negative part is rounded up with ``ceil``,
    the positive part is rounded down with ``floor``, and the two are added.
    Other inputs are divided directly.

    Returns:
        The quotient cast to the dtype of input_x1.
    """
    result_dtype = input_x1.dtype

    if result_dtype not in ("int32", "int8", "uint8"):
        return cast(topi.divide(input_x1, input_x2), result_dtype)

    zero = dc.zero_const("float32")
    dividend = cast(input_x1, "float32")
    divisor = cast(input_x2, "float32")
    quotient = topi.divide(dividend, divisor)

    # min(q, 0) keeps only the negative part, max(q, 0) only the positive one.
    negative_part = ceil(topi.minimum(quotient, zero))
    positive_part = floor(topi.maximum(quotient, zero))
    truncated = cast(topi.add(negative_part, positive_part), "float32")
    return cast(truncated, result_dtype)
def truncatemod(x, y):
    """
    Compute the remainder of the division x / y.

    Note:
        res = x - y*trunc(x/y)

    Args:
        x(tvm.tensor.Tensor): Input tensor, support float16 on mini device, while support
                           int32, int8, uint8, float16, float32 on cloud ones.
        y(tvm.tensor.Tensor): Tensor with same type as input tensor x.

    Returns:
        tvm.tensor.Tensor of same type as input tensors.
    """
    vc_util.check_shape(x)
    vc_util.check_shape(y)
    vc_util.elemwise_dtype_check(x.dtype, y.dtype)
    origin_dtype = x.dtype

    on_mini = utils.product_is_mini()
    if on_mini:
        support_dtype = [vc_util.DtypeForDavinci.FLOAT16]
    else:
        support_dtype = [
            vc_util.DtypeForDavinci.ALL_FLOAT,
            vc_util.DtypeForDavinci.INT32,
            vc_util.DtypeForDavinci.INT8,
            vc_util.DtypeForDavinci.UINT8,
        ]
    vc_util.ops_dtype_check(origin_dtype, support_dtype)

    if on_mini:
        remainder = _truncatemod_compute_mini(x, y)
    else:
        # High precision is required here. For example, let x = 132.05 and
        # y = 131.95: the values are very close, yet trunc(x) = 132 and
        # trunc(y) = 131 differ by 1.
        if origin_dtype != "float32":
            x = cast(x, "float32")
            y = cast(y, "float32")
        remainder = akg.topi.mod(x, y)

    if remainder.dtype == origin_dtype:
        return remainder
    return cast(remainder, origin_dtype)
def equal(input1, input2):
    """
    Check element-wise whether input1 equals input2.

    Args:
        input1 (tvm.tensor.Tensor): Tensor of type float16, float32, int32,
            int8 or uint8.
        input2 (tvm.tensor.Tensor): Tensor of type float16, float32, int32,
            int8 or uint8.

    Returns:
        tvm.tensor.Tensor. If input1 equal to input2 return True, else return False.
    """
    for tensor_shape in ([dim.value for dim in input1.shape],
                         [dim.value for dim in input2.shape]):
        vc_util.check_shape(tensor_shape)

    vc_util.ops_dtype_check([input1.dtype, input2.dtype], [
        vc_util.DtypeForDavinci.ALL_FLOAT,
        vc_util.DtypeForDavinci.INT32,
        vc_util.DtypeForDavinci.INT8,
        vc_util.DtypeForDavinci.UINT8,
    ])

    origin_dtype = input1.dtype
    if utils.product_is_mini():
        # On mini the comparison is always carried out in float16.
        compare_dtype = "float16"
    elif origin_dtype not in ("float16", "float32"):
        # for int32, if cast to float16, there may be overflow
        compare_dtype = "float32"
    else:
        compare_dtype = origin_dtype

    if origin_dtype == "float32" and compare_dtype == "float16":
        # float32 inputs that must be compared in float16: subtract first in
        # float32, then test the narrowed difference against zero.
        difference = cast(sub(input1, input2), compare_dtype)
        return akg.topi.equal(difference, akg.tvm.const(0.0, compare_dtype))

    lhs = cast(input1, compare_dtype)
    rhs = cast(input2, compare_dtype)
    return akg.topi.equal(lhs, rhs)
def sum_v2(inputs, axis=None, keepdims=True):
    """
    Sum a float tensor over the given axes using the topi api.

    float16 inputs are accumulated in float32 and cast back to float16. When
    the refined axis is empty the input is passed through ``identity``.

    Returns:
        A pair ``(output, attr_map)`` where ``attr_map`` comes from ``get_attrs()``.
    """
    origin_dtype = inputs.dtype
    vc_util.ops_dtype_check(origin_dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    axis = ft_util.refine_reduce_axis(inputs, axis)
    vc_util.check_shape(inputs.shape)

    if axis:
        promote = origin_dtype == "float16"
        operand = cast(inputs, "float32") if promote else inputs
        summed = akg.topi.sum(operand, axis=axis, keepdims=keepdims)
        output = cast(summed, "float16") if promote else summed
    else:
        output = akg.topi.identity(inputs)

    return output, get_attrs()
def pow_value(data, scale):
    """
    Raise data to the power of scale element-wise.

    The computation runs in float32, or in float16 on mini products, and the
    result is cast back to the dtype of data.

    Args:
        data (tvm.tensor.Tensor): Base tensor of type float16, float32,
            int32, int8 or uint8.
        scale (tvm.tensor.Tensor): Exponent tensor; its shape must be
            broadcast-compatible with the shape of data.

    Returns:
        tvm.tensor.Tensor with the same dtype as data.

    Raises:
        RuntimeError: If the dtype of data is not supported.
    """
    shape1 = [x.value for x in data.shape]
    shape2 = [x.value for x in scale.shape]

    check_list = ["float16", "float32", "int32", "int8", "uint8"]
    dtype = data.dtype
    if dtype.lower() not in check_list:
        # The message used to name "tile_cce", a leftover from another operator.
        raise RuntimeError("pow_value only support %s while dtype is %s" %
                           (",".join(check_list), dtype))

    vc_util.check_shape(shape1)
    vc_util.auto_broadcast_check(shape1, shape2)

    compute_dtype = "float32"
    if utils.product_is_mini():
        compute_dtype = "float16"
    data = cast(data, compute_dtype)
    scale = cast(scale, compute_dtype)

    C = akg.topi.power(data, scale)
    C = cast(C, dtype)
    return C
def bitwise_not(data):
    """
    Bitwise-not, computed arithmetically as ``(data + 1) * -1``.

    Args:
        data (tvm.tensor.Tensor): Input data; the dtype must pass the
            ``ALL_INT`` check.

    Returns:
        tvm.tensor.Tensor, Bitwise-not result with the dtype of data.
    """
    origin_dtype = data.dtype
    vc_util.ops_dtype_check(origin_dtype, vc_util.DtypeForDavinci.ALL_INT)
    vc_util.check_shape(data.shape)

    plus_one = akg.tvm.const(1, dtype=data.dtype)
    negative_one = akg.tvm.const(-1, dtype=data.dtype)

    incremented = akg.lang.cce.vadds(data, plus_one)
    negated = akg.lang.cce.vmuls(incremented, negative_one)
    return cast(negated, data.dtype)
def custom_reduce_min_fdiff(out, inputs, grad, ad_attrs, new_pld_array):
    """
    Custom gradient of a last-axis reduce_min for a 2-D input.

    Each element that equals the minimum of its row receives ``grad[row]``;
    every other element receives a float16 zero. Only rank-2 inputs are
    handled; for any other rank nothing is built and None is returned.

    ``dtype`` is not a parameter of this function; it is read from the
    surrounding scope.

    Args:
        out: Unused.
        inputs: Sequence whose first item is the tensor that was reduced.
        grad: Adjoint of the reduce output, indexed by row.
        ad_attrs: Unused.
        new_pld_array: Unused.

    Returns:
        A one-element list with the gradient tensor, or None.
    """
    source = inputs[0]
    dims = get_shape(source)
    if len(dims) != 2:
        return None

    # add an extra stage to avoid alignment problem
    staged = akg.tvm.compute(source.shape, lambda *idx: source(*idx), name="min_input")
    row_min = akg.lang.cce.reduce_min(staged, axis=-1, keepdims=True)
    row_min_full = akg.lang.cce.broadcast(row_min, dims)

    # The comparison below is done on float16 data.
    compared = source if dtype == "float16" else cast(source, "float16")

    # Parameter names i, j are kept as in the lambda they replace, in case
    # tvm.compute derives the iteration variable names from them.
    def _route_grad(i, j):
        return akg.tvm.expr.Select(
            compared[i, j] == row_min_full[i, j],
            grad[i],
            akg.tvm.const(0, dtype="float16"))

    return [akg.tvm.compute(dims, _route_grad, name="reduce_min_ad2")]
def ones_like(input):
    """
    Generate a tensor of ones.

    The ones are produced as float16 and then cast to the dtype of input.

    Args:
        input (tvm.tensor.Tensor): Tensor, should be of type float16, float32,
            int32, uint8, int8.

    Returns:
        tvm.tensor.Tensor with the same type and shape as input.
    """
    target_dtype = input.dtype
    target_shape = get_shape(input)
    vc_util.ops_dtype_check(target_dtype, [vc_util.DtypeForDavinci.ALL_TYPES])
    vc_util.check_shape(target_shape)

    def _one(*i):
        return akg.tvm.const(1, "float16")

    ones_fp16 = akg.tvm.compute(target_shape, _one, name="res", attrs={'no_inline': 1})
    return cast(ones_fp16, target_dtype)
def custom_reduce_max_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
    """
    Custom gradient of reduce_max.

    Elements equal to the maximum along the reduced axes receive the matching
    value of the adjoint ``head_``; all other elements receive a float16
    zero. The result is cast to ``dtype`` unless that is float16.

    ``axis``, ``keepdims`` and ``dtype`` are not parameters of this function;
    they are read from the surrounding scope.

    Args:
        out: Unused.
        inputs: Sequence whose first item is the tensor that was reduced.
        head_: Adjoint of the reduce output.
        ad_attrs: Unused.
        new_pld_array: Unused.

    Returns:
        A one-element list with the gradient tensor.
    """
    source = inputs[0]
    full_shape = source.shape

    # Reduce to the maximum, then expand it back to the input shape.
    reduced_max = akg.lang.cce.reduce_max(source, axis=axis, keepdims=True)
    max_full = akg.lang.cce.broadcast(reduced_max, full_shape)

    # head broadcast is needed to generate correct cce code for the selection operation
    def _head_at(*pos):
        return head_(*get_reduced_indices(*pos, axis=axis, keepdims=keepdims))

    head_full = akg.tvm.compute(full_shape, _head_at)

    # Zero everything that is not a maximum; what remains is the adjoint.
    def _keep_where_max(*pos):
        return akg.tvm.expr.Select(
            source(*pos) == max_full(*pos),
            head_full(*pos),
            akg.tvm.const(0, dtype='float16'))

    masked = akg.tvm.compute(full_shape, _keep_where_max, name="reduce_max_ad2")

    if dtype == 'float16':
        return [masked]
    # cast data back to the original dtype
    return [cast(masked, dtype)]
def reduce_min_ad_optimized_manual_schedule(input_shape, dtype, axis, keepdims, polyhedral=True, attrs=None):
    """
    Build the reduce_min gradient kernel with a hand-written schedule.

    The gradient is obtained from ``akg.differentiate`` with a custom
    override, every intermediate stage is placed in "local.UB", the output is
    tiled with the factors in ``attrs['tile']`` and the module is built for
    "cce". The generated source is written out through ``utils.create_code``.

    Args:
        input_shape: Shape of the input placeholder.
        dtype: Dtype of the input placeholder.
        axis: Axis passed to ``reduce_min.reduce_min``.
        keepdims: Not used by this function.
        polyhedral: Forwarded to ``akg.build``.
        attrs: Build attributes; ``attrs['tile']`` is read unconditionally, so
            the default of None cannot be used.

    Returns:
        The built module.
    """
    def get_shape(pld):
        """Return the shape of ``pld`` as a list of plain values."""
        return [d.value for d in pld.shape]

    data = akg.tvm.placeholder(input_shape, dtype, name="input_data")

    # Only works for the last axis and 2-D input. Needs to be extended to
    # multiple dimensions and axes.
    def custom_reduce_min_fdiff(out, inputs, grad, ad_attrs, new_pld_array):
        """Route grad[i] to the elements of row i that equal the row minimum."""
        data = inputs[0]
        shape = get_shape(data)
        if len(get_shape(data)) == 2:
            # add an extra stage to avoid alignment problem
            min_input = akg.tvm.compute(data.shape, lambda *i: data(*i), name="min_input")
            min_ = akg.lang.cce.reduce_min(min_input, axis=-1, keepdims=True)
            min_broadcast = akg.lang.cce.broadcast(min_, shape)
            if dtype != "float16":
                data = cast(data, "float16")
            return [
                akg.tvm.compute(shape,
                                lambda i, j: akg.tvm.expr.Select(
                                    data[i, j] == min_broadcast[i, j],
                                    grad[i], akg.tvm.const(0, dtype="float16")),
                                name="reduce_min_ad2")
            ]

    # Forward op; it only serves as the differentiation target.
    L = reduce_min.reduce_min(data, axis)
    head = akg.tvm.placeholder(L.shape, name="head", dtype=L.dtype)
    head_cast = cast(head, "float16")

    # Differentiate L w.r.t. data, replacing the derived gradient by the custom one.
    [dL_ddata
     ] = akg.differentiate(L, [data], head_cast, None, None, override={L: ([data], custom_reduce_min_fdiff)})

    s = akg.tvm.create_schedule([dL_ddata.op])

    # NOTE(review): the positional input_tensors chains below presumably
    # resolve to the stages created in custom_reduce_min_fdiff
    # (input_tensors[0] -> data or its float16 cast, input_tensors[1] ->
    # min_broadcast and its producers) - confirm against the generated IR.
    head_ub = s.cache_read(head, "local.UB", [head_cast])
    if dtype == "float16":
        data_ub = s.cache_read(data, "local.UB", [dL_ddata])
    else:
        data_ub = s.cache_read(data, "local.UB", [dL_ddata.op.input_tensors[0]])
    min_input_ub = s.cache_read(
        dL_ddata.op.input_tensors[1].op.input_tensors[0].op.
        input_tensors[0].op.input_tensors[0].op.input_tensors[0],
        "local.UB", [
            dL_ddata.op.input_tensors[1].op.input_tensors[0].op.
            input_tensors[0].op.input_tensors[0]
        ])
    s[dL_ddata.op.input_tensors[1].op.input_tensors[0].op.input_tensors[0].
      op.input_tensors[0]].set_scope("local.UB")

    dL_ddata_ub = s.cache_write(dL_ddata, "local.UB")

    # tiling: split every output axis by its factor; the dict keys
    # "axis0", "axis1", ... give a stable order once sorted.
    split_axis = {}
    for i in range(len(attrs['tile'])):
        split_axis["axis" + str(i)] = s[dL_ddata].split(
            dL_ddata.op.axis[i], attrs["tile"][i])

    split_axis_sorted = sorted(split_axis.items())

    # split_axis_sorted[k][1] is the split result for axis k; index [0] and
    # [1] pick one of the two iterators returned by split().
    if dtype == "float16":
        s[data_ub].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    else:
        s[data_ub].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
        s[dL_ddata.op.input_tensors[0]].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
        s[dL_ddata.op.input_tensors[0]].set_scope("local.UB")

    s[min_input_ub].compute_at(s[dL_ddata], split_axis_sorted[0][1][1])
    s[head_ub].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    s[head_cast].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    s[head_cast].set_scope("local.UB")
    s[dL_ddata.op.input_tensors[1]].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    s[dL_ddata.op.input_tensors[1]].set_scope("local.UB")
    s[dL_ddata.op.input_tensors[1].op.input_tensors[0]].compute_at(
        s[dL_ddata], split_axis_sorted[0][1][1])
    s[dL_ddata.op.input_tensors[1].op.input_tensors[0]].set_scope("local.UB")
    s[dL_ddata.op.input_tensors[1].op.input_tensors[0].op.
      input_tensors[0]].compute_at(s[dL_ddata], split_axis_sorted[0][1][1])
    s[dL_ddata.op.input_tensors[1].op.input_tensors[0].op.
      input_tensors[0]].set_scope("local.UB")

    # L is not being used for computation, so it is left unscheduled:
    # s[L].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])
    # s[L].set_scope("local.UB")

    s[dL_ddata_ub].compute_at(s[dL_ddata], split_axis_sorted[-1][1][0])

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [data, head, dL_ddata],
                        "cce",
                        name="reduce_min_ad_manual_schedule",
                        attrs=attrs,
                        polyhedral=polyhedral)
        # Dump the generated kernel source into the current directory.
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "reduce_min_ad_manual_schedule"
        utils.create_code(kernel_name, './', source_code)
    return mod
def reduce_max_ad_optimized_manual_schedule(input_shape, dtype, axis, keepdims, polyhedral=True, attrs=None):
    """
    Build the reduce_max gradient kernel with a hand-written schedule.

    The gradient is obtained from ``akg.differentiate`` with a custom
    override, all intermediate stages are placed in "local.UB", the output is
    tiled with the factors in ``attrs['tile']`` and the module is built for
    "cce". The generated source is written out through ``utils.create_cce``.

    Args:
        input_shape: Shape of the input placeholder.
        dtype: Dtype of the input placeholder.
        axis: Axis (or axes) to reduce.
        keepdims: Whether the forward reduce keeps the reduced dimensions.
        polyhedral: Forwarded to ``akg.build``.
        attrs: Build attributes; must contain a 'tile' entry with one factor
            per output dimension.

    Returns:
        The built module.

    Raises:
        Exception: If attrs is None.
    """
    def custom_reduce_max_fdiff(out, inputs, head_, ad_attrs, new_pld_array):
        """Pass the adjoint to the elements that equal the maximum, zero elsewhere."""
        data_ = inputs[0]
        shape = data_.shape
        # reduces maximum value for each column
        max_ = akg.lang.cce.reduce_max(data_, axis=axis, keepdims=True)

        # copies reduced values to get the original shape
        max_broadcast = akg.lang.cce.broadcast(max_, shape)

        # head broadcast is needed to generate correct cce code for the selection operation
        head_broadcast = akg.tvm.compute(
            shape,
            lambda *indices: head_(*get_reduced_indices(
                *indices, axis=axis, keepdims=keepdims)))

        # zero all the values that are not max values on the result, remaining is equal to the adjoint of the output
        max_values_and_zeros = akg.tvm.compute(
            shape,
            lambda *indices: akg.tvm.expr.Select(
                data_(*indices) == max_broadcast(*indices),
                head_broadcast(*indices),
                akg.tvm.const(0, dtype='float16')),
            name="reduce_max_ad2")

        # cast data back to the original dtype
        if dtype != 'float16':
            return [cast(max_values_and_zeros, dtype)]
        else:
            return [max_values_and_zeros]

    # tensor for the input data
    data = akg.tvm.placeholder(input_shape, dtype, name="input_data")

    # computation of reduce max
    # not used on the schedule because this is the differentiation op
    l = reduce_max.reduce_max(data, axis, keepdims)

    # adjoint tensor for the differentiation
    head = akg.tvm.placeholder(l.shape, name="head", dtype=l.dtype)

    # cast input data: the custom gradient works on float16
    if dtype != 'float16':
        data_cast = cast(data, "float16")
        head_cast = cast(head, "float16")
    else:
        data_cast = data
        head_cast = head

    # override differentiation computation with custom function
    [dl_ddata] = akg.differentiate(
        l, [data_cast], head_cast, None, None,
        override={l: ([data_cast], custom_reduce_max_fdiff)})

    # get tensors from custom function
    # NOTE(review): the positional input_tensors lookups presumably follow the
    # order in which the Select in custom_reduce_max_fdiff reads its operands
    # (data, max_broadcast, head_broadcast) - confirm against the generated IR.
    if dtype != 'float16':
        # dl_ddata is the final cast; the Select stage is its only input.
        max_values_and_zeros = dl_ddata.op.input_tensors[0]
        max_broadcast = max_values_and_zeros.op.input_tensors[1]
        max_ = max_broadcast.op.input_tensors[0]
        head_broadcast = max_values_and_zeros.op.input_tensors[2]
    else:
        # No trailing cast: dl_ddata is the Select stage itself.
        max_broadcast = dl_ddata.op.input_tensors[1]
        max_ = max_broadcast.op.input_tensors[0]
        head_broadcast = dl_ddata.op.input_tensors[2]

    # schedule for differentiation operation
    # inputs: data and head
    s = akg.tvm.create_schedule([dl_ddata.op])

    # cache reads of inputs
    if dtype != 'float16':
        head_ub = s.cache_read(head, "local.UB", [head_cast])
        data_ub = s.cache_read(data, "local.UB", [data_cast])
    else:
        # no cast operation
        head_ub = s.cache_read(head_cast, "local.UB", [head_broadcast])
        data_ub = s.cache_read(data_cast, "local.UB", [max_, dl_ddata])

    # cache write for the output
    dl_ddata_ub = s.cache_write(dl_ddata, "local.UB")

    # get tiling attributes
    if attrs is None:
        raise Exception('attrs is None')
    tiling_factors = attrs['tile']
    split_iterators = []
    assert len(tiling_factors) == len(dl_ddata.shape)
    # split the final compute and save the iterators
    for index, factor in enumerate(tiling_factors):
        split_iterators.append(s[dl_ddata].split(dl_ddata.op.axis[index], factor))

    # get iterators: every stage below is attached at the first iterator
    # returned by the split of axis 0
    iterator1 = split_iterators[0][0]

    # move the cast stages when there is a cast
    if dtype != "float16":
        s[data_cast].compute_at(s[dl_ddata], iterator1)
        s[data_cast].set_scope("local.UB")
        s[head_cast].compute_at(s[dl_ddata], iterator1)
        s[head_cast].set_scope("local.UB")
        s[max_values_and_zeros].compute_at(s[dl_ddata], iterator1)
        s[max_values_and_zeros].set_scope("local.UB")

    # move cache reads and writes
    s[data_ub].compute_at(s[dl_ddata], iterator1)
    s[head_ub].compute_at(s[dl_ddata], iterator1)
    s[dl_ddata_ub].compute_at(s[dl_ddata], iterator1)

    # move computation of the differentiation
    s[max_].compute_at(s[dl_ddata], iterator1)
    s[max_].set_scope("local.UB")
    s[max_broadcast].compute_at(s[dl_ddata], iterator1)
    s[max_broadcast].set_scope("local.UB")
    s[head_broadcast].compute_at(s[dl_ddata], iterator1)
    s[head_broadcast].set_scope("local.UB")

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [head, data, dl_ddata],
                        "cce",
                        name="reduce_max_ad_manual_schedule",
                        attrs=attrs,
                        polyhedral=polyhedral)
        # Dump the generated kernel source into the current directory.
        source_code = mod.imported_modules[0].get_source()
        kernel_name = "reduce_max_ad_manual_schedule"
        utils.create_cce(kernel_name, './', source_code)
    return mod
def fused_minimum_or_maximum_grad(dz, x, y, grad_x, grad_y, op_type):
    """
    Gradient for the minimum or maximum operation between two tensors `x` and `y`.

    `dz` flows to `x` wherever the comparison selected by `op_type` holds
    (`x <= y` for "LE", `x >= y` for "GE") and to `y` everywhere else. Each
    gradient is then summed back to the shape of its input.

    Args:
        dz (tvm.tensor.Tensor): Type float16, float32, int32.
        x (tvm.tensor.Tensor): Type float16, float32, int32.
        y (tvm.tensor.Tensor): Type float16, float32, int32.
        grad_x (bool): Whether calculate dx.
        grad_y (bool): Whether calculate dy.
        op_type (str): The type of the op, "GE" for MaximumGrad or "LE" for MinimumGrad.

    Note:
        At least one of grad_x and grad_y is True.

    Returns:
        dx, tvm.tensor.Tensor of the same type as inputs, it will be returned if grad_x is True.
        dy, tvm.tensor.Tensor of the same type as inputs, it will be returned if grad_y is True.
        The default attrs are always returned as the last item.

    Raises:
        ValueError: If op_type is not supported or neither gradient is requested.
    """
    for tensor in (x, y, dz):
        vc_util.check_shape(tensor)
    vc_util.ops_dtype_check([x.dtype, y.dtype, dz.dtype],
                            [vc_util.DtypeForDavinci.ALL_FLOAT, vc_util.DtypeForDavinci.INT32])
    vc_util.broadcast_check(x, dz)
    vc_util.broadcast_check(y, dz)

    supported_ops = ["GE", "LE"]
    if op_type not in supported_ops:
        raise ValueError("FusedMinimumOrMaximumGrad only support %s while op type is %s" %
                         (",".join(supported_ops), op_type))
    if not (grad_x or grad_y):
        raise ValueError("At least one of grad_x and grad_y is True.")

    x_shape = get_shape(x)
    y_shape = get_shape(y)
    dz_shape = get_shape(dz)
    ori_dtype = dz.dtype

    # Both operands are compared at the shape of dz.
    x = akg.lang.cce.broadcast(x, dz_shape)
    y = akg.lang.cce.broadcast(y, dz_shape)

    # Pick the dtype the comparison runs in: float16 on mini, float32 for
    # int32 inputs elsewhere, otherwise the original dtype is kept.
    if utils.product_is_mini() and ori_dtype != "float16":
        work_dtype = "float16"
    elif ori_dtype == "int32":
        work_dtype = "float32"
    else:
        work_dtype = None
    if work_dtype is not None:
        x = cast(x, work_dtype)
        y = cast(y, work_dtype)
        dz = cast(dz, work_dtype)

    zero = zero_const(dz.dtype)

    # op_type was validated above, so anything that is not "LE" is "GE".
    if op_type == "LE":
        def _dx_at(*i):
            return tvm.expr.Select((x(*i) <= y(*i)), dz(*i), zero)
    else:
        def _dx_at(*i):
            return tvm.expr.Select((x(*i) >= y(*i)), dz(*i), zero)

    dx = tvm.compute(dz_shape, _dx_at, name='dx')
    dy = topi.subtract(dz, dx)

    if dx.dtype == "float16":
        # cast to fp32 for higher precision of reduce_sum.
        if get_shape(dx) != x_shape:
            dx = cast(dx, "float32")
        if get_shape(dy) != y_shape:
            dy = cast(dy, "float32")

    dx = sum.sum_by_shape(dx, x_shape)
    dy = sum.sum_by_shape(dy, y_shape)

    if dx.dtype != ori_dtype:
        dx = cast(dx, ori_dtype)
    if dy.dtype != ori_dtype:
        dy = cast(dy, ori_dtype)

    attrs = get_default_attrs()
    if grad_x:
        if grad_y:
            return dx, dy, attrs
        return dx, attrs
    return dy, attrs
def Cast(x, dst_type):
    """
    Cast x to dst_type.

    Thin wrapper that forwards both arguments to ``cast.cast`` and returns
    its result unchanged.
    """
    converted = cast.cast(x, dst_type)
    return converted
def matmul4D_compute(x, y, bias_value, out_dtype, left_format, right_format, out_format, transpose_x=False,
                     transpose_y=False, attrs=None):
    """
    Build the blocked (fractal) matmul computation, optionally adding a bias.

    The product is always computed in "zN" output layout with float32
    accumulation through ``akg.lang.cce.mmad``. It is re-indexed to "zZ" when
    ``out_format`` asks for it, and cast to float16 when requested.

    Args:
        x (tvm.tensor.Tensor): Left matrix; dtype must be int8, uint8,
            float16, float32 or int32.
        y (tvm.tensor.Tensor): Right matrix.
        bias_value (tvm.tensor.Tensor or None): Optional bias.
        out_dtype (str): Requested output dtype.
        left_format (str): Layout of x, "zZ" or "zN".
        right_format (str): Layout of y, "nZ", "zZ" or "zN".
        out_format (str): Layout of the output, "zN" or "zZ".
        transpose_x (bool): Whether x is transposed.
        transpose_y (bool): Whether y is transposed.
        attrs (dict or None): Optional attributes; a 'bypass' entry selects
            the L1 bypass mode (0, 1 or 2).

    Returns:
        tvm.tensor.Tensor, the matmul result (with bias added when given).

    Raises:
        RuntimeError: If the dtype of x or the bypass mode is not supported.
    """
    # for gemv use transpose of AB --> gevm trans(trans(B) * trans(A))

    data_dtype = x.dtype.lower()
    check_list = ["int8", "uint8", "float16", "float32", "int32"]
    if data_dtype not in check_list:
        raise RuntimeError("matmul_cce ony supports %s while dtype is %s" % (",".join(check_list), x.dtype))

    # bias is a 0/1 flag; it is also part of the lookup key below.
    if bias_value is None:
        bias_name = ''
        bias = 0
    else:
        bias_name = bias_value.name
        bias = 1

    output_shape_zN, k = output_shape_compute(x.shape, y.shape, left_format, right_format, "zN", transpose_x,
                                              transpose_y)
    output_shape_zZ, k = output_shape_compute(x.shape, y.shape, left_format, right_format, "zZ", transpose_x,
                                              transpose_y)

    shape_A = x.shape
    shape_B = y.shape
    # The stringified configuration tuple is the key into matmul_set_dim_map.
    key = (tuple(shape_A), tuple(shape_B), bias, left_format, right_format, out_format, transpose_x,
           transpose_y, x.dtype)
    hash_key = str(key)
    # bypass 2 left matrix ddr -> l0
    # bypass 1 right matrix ddr -> l0
    bypass_list = [0, 1, 2]
    bypass = 0
    if attrs is not None and 'bypass' in attrs:
        bypass = attrs['bypass']
    elif hash_key in matmul_set_dim_map:
        configs = matmul_set_dim_map[hash_key]
        if isinstance(configs, tuple):
            if len(configs) > 1 and "bypass" in configs[1]:
                bypass = configs[1]["bypass"]

    if bypass not in bypass_list:
        # Join the individual values (joining str(list) would split the repr
        # into characters) and use %s so that a non-integer bypass still
        # produces this RuntimeError instead of a formatting TypeError.
        raise RuntimeError("matmul_cce ony supports %s while bypass is %s" %
                           (",".join(str(item) for item in bypass_list), bypass))

    def matmul_compute(output_shape, adj_x, adj_y, left_format, right_format, output_format, x, y, k, *indices):
        """Return the mmad reduction for one output element of the zN result."""
        N = len(output_shape)
        # reduce axis
        ko = akg.tvm.reduce_axis((0, k // cce.BLOCK_REDUCE), name='ko')
        ki = akg.tvm.reduce_axis((0, cce.BLOCK_REDUCE), name='ki')
        if output_format == "zN":
            # Positions of the outer/inner reduce axes inside the operand
            # index depend on the operand layout and on transposition.
            if left_format == "zZ":
                x_indices = indices[:(N - 4)] + indices[(N - 3):(N - 2)] + (
                    ko, ) + indices[(N - 2):(N - 1)] + (ki, )
                if adj_x:
                    x_indices = indices[:(N - 4)] + (ko, ) + indices[
                        (N - 3):(N - 2)] + (ki, ) + indices[(N - 2):(N - 1)]
            elif left_format == "zN":
                x_indices = indices[:(N - 4)] + (ko, ) + indices[
                    (N - 3):(N - 2)] + indices[(N - 2):(N - 1)] + (ki, )
                if adj_x:
                    x_indices = indices[:(N - 4)] + indices[(N - 3):(
                        N - 2)] + (ko, ) + (ki, ) + indices[(N - 2):(N - 1)]

            if right_format == "nZ":
                y_indices = indices[:(N - 4)] + (ko, ) + indices[
                    (N - 4):(N - 3)] + indices[(N - 1):] + (ki, )
                if adj_y:
                    y_indices = indices[:(N - 4)] + indices[
                        (N - 4):(N - 3)] + (ko, ki) + indices[(N - 1):]
            elif right_format == "zZ":
                y_indices = indices[:(N - 4)] + (ko, ) + indices[
                    (N - 4):(N - 3)] + (ki, ) + indices[(N - 1):]
                if adj_y:
                    y_indices = indices[:(N - 4)] + indices[
                        (N - 4):(N - 3)] + (ko, ) + indices[(N - 1):] + (ki, )
            elif right_format == "zN":
                y_indices = indices[:(N - 4)] + indices[
                    (N - 4):(N - 3)] + (ko, ) + (ki, ) + indices[(N - 1):]
                if adj_y:
                    y_indices = indices[:(N - 4)] + (ko, ) + indices[
                        (N - 4):(N - 3)] + indices[(N - 1):] + (ki, )
            return akg.lang.cce.mmad(
                (x(*x_indices) * y(*y_indices)).astype("float32"),
                axis=[ko, ki])

    # Transpose pragmas for the left matrix ("Y"/"N" flags).
    if left_format == "zZ":
        data_trans = "N"
        data_trans_block = "N"
        data_trans_block_in = "N"
        if transpose_x:
            data_trans = "Y"
    elif left_format == "zN":
        data_trans = "Y"
        data_trans_block = "Y"
        data_trans_block_in = "N"
        if transpose_x:
            data_trans = "Y"
            data_trans_block = "N"
            data_trans_block_in = "Y"

    # Transpose pragmas for the right matrix ("Y"/"N" flags).
    if right_format == "nZ":
        weight_trans = "N"
        weight_trans_block = "N"
        weight_trans_block_in = "N"
        if transpose_y:
            weight_trans = "Y"
    elif right_format == "zZ":
        if not transpose_y:
            weight_trans_block_in = "Y"
            weight_trans_block = "N"
            weight_trans = "Y"
        elif transpose_y:
            weight_trans = "Y"
            weight_trans_block = "Y"
            weight_trans_block_in = "N"
    elif right_format == "zN":
        weight_trans = "Y"
        weight_trans_block = "N"
        weight_trans_block_in = "N"
        if transpose_y:
            weight_trans = "N"
            weight_trans_block = "N"
            weight_trans_block_in = "N"

    result_matmul = akg.tvm.compute(
        output_shape_zN,
        lambda *indices: matmul_compute(output_shape_zN, transpose_x,
                                        transpose_y, left_format, right_format,
                                        "zN", x, y, k, *indices),
        name="resMatmul",
        attrs={
            "pragma_gemm_data": x.name,
            "pragma_data_transpose": data_trans,
            "pragma_data_transpose_block": data_trans_block,
            "pragma_data_transpose_block_inner": data_trans_block_in,
            "pragma_gemm_weight": y.name,
            "pragma_weight_transpose": weight_trans,
            "pragma_weight_transpose_block": weight_trans_block,
            "pragma_weight_transpose_block_inner": weight_trans_block_in,
            "pragma_conv_bypass_l1": bypass,
            "bias": bias_name,
        })

    # Narrow to float16 right away unless a float32 bias still has to be
    # added; in that case the cast happens after the bias addition below.
    if out_dtype == "float16" and (bias_value is None or bias_value.dtype == "float16"):
        result_matmul = cast.cast(result_matmul, out_dtype)

    def matmul_reshape(shape, result_matmul, *indices):
        """Read the zN result at the position matching a zZ output index."""
        N = len(shape)
        # Swap the two block indices (positions N-4 and N-3).
        new_indices = indices[:(N - 4)] + indices[(N - 3):(N - 2)] + indices[
            (N - 4):(N - 3)] + indices[(N - 2):]
        return result_matmul(*new_indices)

    if out_format == "zZ":
        result = akg.tvm.compute(output_shape_zZ,
                                 lambda *indices: matmul_reshape(
                                     output_shape_zZ, result_matmul, *indices),
                                 name="result")
    else:
        result = result_matmul

    def bias_compute(output_shape, result, bias, output_format, *indices):
        """Add the bias element that belongs to the output column of indices."""
        N = len(output_shape)
        # The column block index sits at N-4 for zN and at N-3 for zZ.
        if output_format == "zN":
            bias_indices = indices[N - 4] * cce.BLOCK_OUT + indices[N - 1]
        elif output_format == "zZ":
            bias_indices = indices[N - 3] * cce.BLOCK_OUT + indices[N - 1]
        return result(*indices) + bias(bias_indices)

    if bias == 1:
        if out_format == "zN":
            out = akg.tvm.compute(
                output_shape_zN,
                lambda *indices: bias_compute(
                    output_shape_zN, result, bias_value, out_format, *indices),
                name="output")
        elif out_format == "zZ":
            out = akg.tvm.compute(
                output_shape_zZ,
                lambda *indices: bias_compute(
                    output_shape_zZ, result, bias_value, out_format, *indices),
                name="output")
        if out_dtype == "float16" and bias_value.dtype == "float32":
            out = cast.cast(out, out_dtype)
    else:
        out = result

    return out
def cross(x, y):
    """
    Compute the cross product of x and y along the first axis.

    Note:
        The first dim of x and y must be 3. Every slice ``x[:, ...]`` is
        treated as one 3-vector (two dims for example)

    .. math::
        res = x \\times y = \\left[ \\begin{matrix}
        l, & \\cdots \\\\ m, & \\cdots \\\\ n, & \\cdots
        \\end{matrix} \\right] \\times \\left[ \\begin{matrix}
        o, & \\cdots \\\\ p, & \\cdots \\\\ q, & \\cdots
        \\end{matrix} \\right] = \\left[ \\begin{matrix}
        mq-np, & \\cdots \\\\ no-lq, & \\cdots \\\\ lp-mo, & \\cdots \\\\
        \\end{matrix} \\right]

    Args:
        x (tvm.tensor.Tensor): Input tensor. On the "mini" product only the
            float types are accepted; otherwise float16, float32, int32,
            int8 and uint8 are accepted.
        y (tvm.tensor.Tensor): Input tensor, must have the same shape and
            dtype as x.

    Returns:
        A tvm.tensor.Tensor with the same shape and dtype as x.

    Raises:
        RuntimeError: If the first axis of the input is not 3.
    """
    vc_util.elemwise_shape_check(get_shape(y), get_shape(x))

    if utils.product_is_mini():
        supported_dtypes = vc_util.DtypeForDavinci.ALL_FLOAT
    else:
        supported_dtypes = (vc_util.DtypeForDavinci.FLOAT16,
                            vc_util.DtypeForDavinci.FLOAT32,
                            vc_util.DtypeForDavinci.INT32,
                            vc_util.DtypeForDavinci.INT8,
                            vc_util.DtypeForDavinci.UINT8)
    vc_util.elemwise_dtype_check(y.dtype, x.dtype, supported_dtypes)

    full_shape = get_shape(x)
    if full_shape[0] != 3:
        raise RuntimeError(
            "The first axis of input must be 3, actual input is %d" % full_shape[0])

    # 8-bit integer inputs are computed in float16 and cast back at the end.
    src_dtype = x.dtype
    widen = src_dtype in ("int8", "uint8")
    column_shape = full_shape[1:]
    if widen:
        x = cast(x, "float16")
        y = cast(y, "float16")

    def _product(row_x, row_y, label):
        """Element-wise x[row_x] * y[row_y] over the trailing axes."""
        return tvm.compute(column_shape,
                           lambda *i: x(row_x, *i) * y(row_y, *i),
                           name=label)

    def _difference(lhs, rhs, label):
        """Element-wise lhs - rhs over the trailing axes."""
        return tvm.compute(column_shape,
                           lambda *i: lhs(*i) - rhs(*i),
                           name=label)

    a0b1 = _product(0, 1, "a0b1")
    a0b2 = _product(0, 2, "a0b2")
    a1b0 = _product(1, 0, "a1b0")
    a1b2 = _product(1, 2, "a1b2")
    a2b0 = _product(2, 0, "a2b0")
    a2b1 = _product(2, 1, "a2b1")

    res0 = _difference(a1b2, a2b1, "res0")
    res1 = _difference(a2b0, a0b2, "res1")
    res2 = _difference(a0b1, a1b0, "res2")

    def _pick_row(*i):
        """Route output row 0/1/2 to res0/res1/res2 respectively."""
        tail = i[1:]
        return tvm.expr.Select(
            i[0] == 0,
            res0(*tail),
            tvm.expr.Select(i[0] == 1, res1(*tail), res2(*tail)))

    res = tvm.compute(full_shape, _pick_row, name='res')

    if widen:
        res = cast(res, src_dtype)
    return res
def truncatemod_func(a, b):
    """
    Formula used for truncatemod: ``a - b * floor(a / b)``.

    The quotient is floored and cast to ``b.dtype`` before being multiplied
    back by ``b``.

    Args:
        a: Dividend tensor.
        b: Divisor tensor; its dtype is used for the floored quotient.

    Returns:
        The result of ``akg.topi.subtract(a, b * floor(a / b))``.
    """
    # For positive numbers, floor and trunc are equivalent
    floored_quotient = cast(floor(div(a, b)), b.dtype)
    scaled = akg.topi.multiply(b, floored_quotient)
    return akg.topi.subtract(a, scaled)
def smooth_l1_loss(prediction, target, anchor_samples, anchor_sample_correct=0, delta=1.0):
    """
    Smooth l1 loss.

    For each value x in `error=predictions-target`, the following is calculated:

    .. math::
        y = \\left\\{
            \\begin{array}{rcl}
                0.5 x^2, & & if \\left| x \\right| <= d \\\\
                0.5 d^2 + d \\cdot (\\left| x \\right| - d), & & if \\left| x \\right| > d
            \\end{array}
        \\right.

    The per-element values are summed over the last axis. `anchor_samples`
    then masks the result: where anchor_samples == anchor_sample_correct the
    loss is set to 0, elsewhere the summed loss is kept.

    Args:
        prediction (tvm.tensor.Tensor): A float tensor of shape
            [batch_size, num_anchors, code_size] representing the (encoded)
            predicted locations of objects.
        target (tvm.tensor.Tensor): A float tensor of shape
            [batch_size, num_anchors, code_size] representing the regression targets
        anchor_samples (tvm.tensor.Tensor): A int tensor of shape [batch_size, num_anchors]
        anchor_sample_correct (int): int, the threshold of anchor_samples, must be in [0, 5]
        delta (float): float, the point where the loss function changes from a quadratic to linear.

    Returns:
        tuple: (loss, attrs). loss is a float tensor of shape
        [batch_size, num_anchors] holding the value of the loss function;
        attrs is a dict holding the dim info under the ``DIM`` key.

    Raises:
        ValueError: If anchor_sample_correct is outside [0, 5].
        RuntimeError: If prediction/target are not 3-dim or anchor_samples is not 2-dim.
    """
    dim_info, _ = smooth_l1_loss_set_dim_func(
        prediction, target, anchor_samples, anchor_sample_correct, delta)
    attrs = {DIM: dim_info}

    # dtype validation
    origin_dtype = prediction.dtype
    vc_util.elemwise_dtype_check(origin_dtype, target.dtype,
                                 vc_util.DtypeForDavinci.ALL_FLOAT)
    vc_util.ops_dtype_check(anchor_samples.dtype,
                            [vc_util.DtypeForDavinci.INT8,
                             vc_util.DtypeForDavinci.INT32])

    if anchor_sample_correct < 0 or anchor_sample_correct > 5:
        raise ValueError("anchor_sample_correct attr only support [0,5]")

    # rank validation, checked in the order prediction -> target -> anchor_samples
    rank_rules = (
        (prediction, 3, "Prediction shape only support 3-dim!"),
        (target, 3, "Target shape only support 3-dim!"),
        (anchor_samples, 2, "weights shape only support 2-dim!"),
    )
    for tensor, expected_rank, message in rank_rules:
        if len(get_shape(tensor)) != expected_rank:
            raise RuntimeError(message)

    # On the mini product float32 inputs are computed in float16 and the
    # result is cast back to float32 at the end.
    compute_dtype = origin_dtype
    if utils.product_is_mini() and origin_dtype == 'float32':
        prediction = akg.topi.cast(prediction, "float16")
        target = akg.topi.cast(target, "float16")
        compute_dtype = "float16"

    # cast anchor_samples to float type in order to use the vcmp instruction
    if anchor_samples.dtype.lower() != compute_dtype.lower():
        anchor_samples = cast(anchor_samples, compute_dtype)

    half = akg.tvm.const(0.5, dtype=compute_dtype)
    threshold = akg.tvm.const(delta, dtype=compute_dtype)

    abs_error = akg.topi.abs(akg.topi.subtract(prediction, target))
    # clipped = min(|x|, d); excess = |x| - clipped
    clipped = akg.topi.minimum(abs_error, threshold)
    excess = akg.topi.subtract(abs_error, clipped)
    quadratic_term = akg.topi.multiply(half, akg.topi.multiply(clipped, clipped))
    linear_term = akg.topi.multiply(threshold, excess)
    per_anchor = akg.topi.sum(akg.topi.add(quadratic_term, linear_term), axis=-1)

    def _mask_matching_anchors(*i):
        """Zero the loss where anchor_samples equals anchor_sample_correct."""
        return akg.tvm.expr.Select(
            anchor_samples(*i) == anchor_sample_correct,
            akg.tvm.const(0, per_anchor.dtype),
            per_anchor(*i))

    loss = akg.tvm.compute(per_anchor.shape, _mask_matching_anchors, name="loss")

    if utils.product_is_mini() and origin_dtype == 'float32':
        loss = akg.topi.cast(loss, origin_dtype)

    return loss, attrs
def conv(data, fmap_shape, filter_shape, pad, stride, dilation, use_bias=False, attrs=None, params=None):
    """
    Computes sums of 5-D convolutions.

    Args:
        data (list[tvm.tensor.Tensor]): the size is 3 if use_bias else the size is 2;
              data[0] Tensor of type float16 ,shape 5D (fN, fC // C0, C0, fH, fW)
              data[1] Tensor of type float16 ,shape 4D (wC // C0 * wH * wW, wN // C0, C0, C0)
              data[2] Tensor of type float16 ,shape 5D (1, wN // C0, 1, 1, 16)
        fmap_shape (list[int]): [fN, fC, fH, fW]
        filter_shape (list[int]): [wN, wC, wH, wW]
        pad (list[int]): [pad_top, pad_bottom, pad_left, pad_right]
        stride (list[int]): [stride_h, stride_w]
        dilation (list[int]): [dilation_h, dilation_w]
        use_bias (bool): bool var.
        attrs (dict): dict with keys for example: conv_tile,bypass. The
            "dynamic" and "partial_dynamic" entries select the dynamic-shape
            build attributes.
        params: Not used by this function.

    Returns:
        tuple: (cube, attrs). cube is a tvm.tensor.Tensor cast to float16,
        shape is 5D(oN, oC // C0, oH, oW, C0); attrs is a newly built dict
        of build attributes (the dict passed in is not returned).
    """
    conv_fp16 = cast.cast(
        conv_core(data, fmap_shape, filter_shape, pad, stride, dilation, use_bias, attrs),
        "float16")

    if not use_bias:
        cube = conv_fp16
    else:
        bias_value = data[2]
        output_bias_name = "output1"
        # bias is broadcast over n, h and w
        cube = akg.tvm.compute(
            conv_fp16.shape,
            lambda n, c1, h, w, c0: conv_fp16[n, c1, h, w, c0] + bias_value[0, c1, 0, 0, c0],
            name=output_bias_name)

    block_size = 16
    dim_info, _, _, dynamic_ci_c1 = conv_set_dim_func(
        fmap_shape, filter_shape, pad, stride, dilation, use_bias,
        block_size, attrs, conv_set_dim_map)

    user_attrs = {} if attrs is None else attrs
    all_dynamic = 1 if user_attrs.get("dynamic") else 0  # kh kw pad stride
    partial_dynamic = 1 if user_attrs.get("partial_dynamic") else 0  # fn fc1 fh fw wN wC
    # kh, kw, pad, stride are parameters if dynamic_tiling is enabled
    dynamic_tiling_full_dynamic = 1
    dynamic_tiling = all_dynamic

    build_attrs = {
        "dim": dim_info,
        "pragma_reschedule": 1,
        "pragma_rmselfdep": 0,
    }
    if partial_dynamic or all_dynamic:
        build_attrs.update({
            "enable_fix_loop_extent": 0,
            "enable_post_poly_loop_partition": 0,
            "enable_isolate_loop": 0,
            "enable_isolate_min_max": 1,
            "enable_conv_analyze_align": 0,
            "enable_double_buffer": 1,
            "enable_multicore": 1,
            "enable_invariant_hoist": 1,
            "pragma_keep_outer_band_order": 1,
            "enable_algebra_simplify": 1,
            "dynamic_shape_conv_full_parametric": dynamic_tiling and dynamic_tiling_full_dynamic,
        })

    build_attrs["pragma_outerband_need_split"] = 1
    build_attrs["pragma_is_conv"] = 1

    if dynamic_tiling:
        # pos 1 of data[0] is CI1 axis
        build_attrs["dynamic_shape"] = set_poly_upper_bound_for_tensor(data[0], 129, 1)
        build_attrs["pragma_tilesize_is_var"] = 1
        build_attrs["enable_stride_kernel_op"] = 0
    else:
        # pos 1 of data[0] is CI1 axis
        build_attrs["dynamic_shape"] = set_poly_upper_bound_for_tensor(
            data[0], dynamic_ci_c1 + 1, 1)

    return cube, build_attrs
def conv_bn1(data, fmap_shape, filter_shape, pad, stride, dilation, use_bias=False, attrs=None):
    """
    Computes sums of 5-D convolutions and use convolution's fp32 result to
    compute first part of Fused_batch_norm.

    Fused_batch_norm's first part, with the sums taken over axes 0, 2 and 3
    of the convolution result:

    .. math::
        m = N \\times H \\times W \\\\
        \\mu_{tmp} = \\sum_{n, h, w}{\\frac{x}{m}} \\\\
        \\sigma^2_{tmp} = \\sum_{n, h, w}{\\frac{x^2}{m}}

    Args:
        data (list[tvm.tensor.Tensor]): the size is 3 if use_bias else the size is 2;
              data[0] Tensor of type float16 ,shape 5D (fN, fC // C0, C0, fH, fW)
              data[1] Tensor of type float16 ,shape 4D (wC // C0 * wH * wW, wN // C0, C0, C0)
              data[2] Tensor of type float16 ,shape 5D (1, wN // C0, 1, 1, 16)
        fmap_shape (list[int]): [fN, fC, fH, fW]
        filter_shape (list[int]): [wN, wC, wH, wW]
        pad (list[int]): [pad_top, pad_bottom, pad_left, pad_right]
        stride (list[int]): [stride_h, stride_w]
        dilation (list[int]): [dilation_h, dilation_w]
        use_bias (bool): bool var; must be False.
        attrs (dict): dict with keys for example: conv_tile,bypass. When a
            dict is given, its 'conv_tile' and 'bypass' entries are
            overwritten in place with the values from conv_set_dim_func.

    Returns:
        tuple: (conv_res_16, var_part, mean, attrs). conv_res_16 is the
        convolution result cast to float16, shape is
        5D(oN, oC // C0, oH, oW, C0); var_part and mean are the reduced
        (keepdims) statistics of the fp32 result; attrs is a newly built
        dict of build attributes.

    Raises:
        ValueError: If use_bias is set.
    """
    if use_bias:
        raise ValueError("do not support bias yet !!!")

    block_size = 16
    dim_info, conv_tile, bypass, _ = conv_set_dim_func(
        fmap_shape, filter_shape, pad, stride, dilation, use_bias,
        block_size, attrs, conv_bn1_set_dim_map)

    core_attrs = {} if attrs is None else attrs
    core_attrs['conv_tile'] = conv_tile
    core_attrs['bypass'] = bypass

    conv_res_32 = conv_core(data, fmap_shape, filter_shape, pad, stride,
                            dilation, use_bias, core_attrs)
    conv_res_16 = cast.cast(conv_res_32, "float16")

    # reduce over W, H and N
    axes = [3, 2, 0]
    dims = [extent.value for extent in conv_res_32.shape]
    num = 1
    for axis in axes:
        num *= dims[axis]
    avg_num = round(1.0 / float(num), 12)

    res_sum = akg.topi.sum(conv_res_32, axes, keepdims=True)
    mean = akg.lang.cce.vmuls(res_sum, avg_num)

    def _square(*i):
        """Element-wise square of the fp32 convolution result."""
        return conv_res_32[i] * conv_res_32[i]

    res_square = akg.tvm.compute(conv_res_32.shape, _square, name="res_square")
    square_sum = akg.topi.sum(res_square, axes, keepdims=True)
    var_part = akg.lang.cce.vmuls(square_sum, avg_num)

    # need pragma_force_rmselfdep to enable multicore using atomic add
    # because default pragma_rmselfdep=1 will disable multicore of reduce axes
    build_attrs = {
        "dim": dim_info,
        "pragma_reschedule": 1,
        "enable_bisect_optimize": 0,
        "pragma_rmselfdep": 0,
        "pragma_force_rmselfdep": 1,
    }

    return conv_res_16, var_part, mean, build_attrs
def conv_backprop_input_compute(data, output_shape, filter_shape, input_shape, pad_, stride_, block_size=16, attrs=None, key=None):
    """
    Core computation of conv_backprop_input.

    The result is built as a unit-stride convolution of the feature tensor
    ``data[0]`` (zero-inserted first when the stride is larger than 1) with
    a re-indexed view of the weights ``data[1]``, using recomputed paddings.

    Args:
        data: Indexable of tensors. data[0] is used as the feature tensor;
            data[1] is the weight tensor and is cast to float16 when its
            dtype is 'float32'.
        output_shape: Four values, unpacked as (n, c, h, w).
        filter_shape: Four values, unpacked as (n, c, h, w). The second
            value must be a multiple of block_size.
        input_shape: Four values; only the last two (height, width) are used.
        pad_: Indexable with at least four entries, read as
            (top, bottom, left, right).
        stride_: Pair (stride_h, stride_w); both values must be equal.
        block_size (int): Block size used for the C0 / fractal split.
        attrs (dict): Optional. When it has a 'conv_tile' entry with at
            least 5 items, that entry supplies the tile sizes.
        key: Lookup key into the predefined tiling table, used when attrs
            does not supply 'conv_tile'. If neither source has tile sizes,
            no dim info or cut pragmas are emitted (auto tiling).

    Returns:
        tuple: (res_c, attrs). res_c is the result tensor cast to float16;
        attrs is a dict with the keys "dim", "pragma_reschedule" and
        "pragma_rmselfdep".

    Raises:
        ValueError: If stride_h != stride_w, or if the channel count taken
            from filter_shape is not a multiple of block_size.
    """
    _, in_c, w_h, w_w = filter_shape

    # stride (stride_h, stride_w)
    stride_h, stride_w = stride_
    if stride_h != stride_w:
        raise ValueError("stride_h must be equal to stride_w.")

    # output shape (NCHW -> NC1HWC0)
    in_nn, in_cc, in_hh, in_ww = output_shape
    # NOTE(review): in_c comes from filter_shape[1], the same value checked
    # again as k_c below; in_cc (output_shape[1]) is floor-divided by
    # block_size without its own check -- confirm this is intended.
    if in_c % block_size != 0:
        raise ValueError("in_c must be divided by block_size.")
    input_shape_nc1hwc0 = (in_nn, in_cc // block_size, in_hh, in_ww, block_size)
    in_nn, _, in_hh, in_ww, _ = input_shape_nc1hwc0
    # Shape of the feature map after spreading it out by the stride.
    input_trans_shape_nc1hwc0 = (in_nn, in_cc // block_size, in_hh * stride_h, in_ww * stride_w, block_size)
    in_n, in_c1, in_h, in_w, _ = input_trans_shape_nc1hwc0

    # kernel shape (NCHW -> NC1HWC0 -> Fractal)
    k_n, k_c, k_h, k_w = filter_shape
    if k_c % block_size != 0:
        raise ValueError("k_c must be divided by block_size.")
    kernel_shape_nc1hwc0 = (k_n, k_c // block_size, k_h, k_w, block_size)
    k_n, k_c1, k_h, k_w, k_c0 = kernel_shape_nc1hwc0
    kernel_shape_trans = (k_n // block_size * k_h * k_w, k_c // block_size, block_size, block_size)
    # From here on k_c1 is derived from the filter's first dim and k_n takes
    # the value of its second dim (the two channel roles are exchanged).
    k_c1 = k_n // block_size
    k_n = k_c

    _, _, input_h, input_w = input_shape

    # padding ((padding_h, padding_w) -> (padding_top, padding_bottom, padding_left, padding_right))
    padding = (pad_[0], pad_[1], pad_[2], pad_[3])
    pad_t, pad_b, pad_l, pad_r = padding

    # padHT -> padHT'
    p_top = k_h - pad_t - 1
    # padHB -> padHB'
    p_bottom = input_h + pad_t - stride_h * ((input_h + pad_t + pad_b - k_h) // stride_h + 1)
    # padWL -> padWL'
    p_left = k_w - pad_l - 1
    # padWR -> padWR'
    p_right = input_w + pad_l - stride_w * ((input_w + pad_l + pad_r - k_w) // stride_w + 1)

    # The convolution below always runs with unit stride; a stride larger
    # than 1 is handled by the zero-insertion step (data_trans_hybrid).
    s_h = 1
    s_w = 1

    # NC1HWCO
    a_value = data[0]

    # float32 weights are cast to float16 and use a separate tiling table.
    if data[1].dtype == 'float32':
        b_value = cast.cast(data[1], 'float16')
        tiling_args = cast_tiling_args
    else:
        b_value = data[1]
        tiling_args = conv_backprop_input_tiling_args

    # Create reduction variables
    kc1 = akg.tvm.reduce_axis((0, k_c1), name='kc1')
    kh = akg.tvm.reduce_axis((0, k_h), name='kh')
    kw = akg.tvm.reduce_axis((0, k_w), name='kw')
    kc0 = akg.tvm.reduce_axis((0, k_c0), name='kc0')

    # Tile sizes: explicit attrs first, then the predefined table, else auto tiling.
    use_auto_tiling = False
    if attrs is not None and 'conv_tile' in attrs and len(attrs['conv_tile']) >= 5:
        tile_value = attrs['conv_tile']
    elif key in tiling_args:
        tile_value = tiling_args[key]
    else:
        use_auto_tiling = True

    out_h = (in_h + p_top + p_bottom - k_h) // (s_h) + 1
    out_w = (in_w + p_left + p_right - k_w) // (s_w) + 1
    out_shape_nc1hwc0 = (in_n, k_n // block_size, out_h, out_w, block_size)
    out_n, out_c1, out_h, out_w, out_c0 = out_shape_nc1hwc0

    # set dim
    info = dim.Dim()
    index_ = 0

    if not use_auto_tiling:
        # tile_value layout: [h, co, m, k, n] plus an optional sixth entry for w.
        tile_hh = tile_value[0]
        if tile_hh == input_h:
            tile_hh += pad_t + pad_b

        # co, m and n tiles are rounded up to a multiple of block_size.
        tile_coco = tile_value[1]
        tile_coco = (tile_coco + block_size - 1) // block_size * block_size

        tile_mm = tile_value[2]
        tile_mm = (tile_mm + block_size - 1) // block_size * block_size

        # The k tile is rounded up to a multiple of block_size * w_h * w_w.
        tile_kk = tile_value[3]
        if not tile_kk % (block_size * w_h * w_w) == 0:
            logging.warning("Warning: tile_k must be a multiple of (block_size * w_h * w_w)")
            tile_kk = (tile_kk + block_size * w_h * w_w - 1) // (block_size * w_h * w_w) * (block_size * w_h * w_w)

        tile_nn = tile_value[4]
        tile_nn = (tile_nn + block_size - 1) // block_size * block_size

        # The w tile defaults to the full width unless a positive sixth entry is given.
        tile_ww = input_w
        if len(tile_value) >= 6 and tile_value[5] > 0:
            tile_ww = tile_value[5]
        if tile_ww == input_w:
            tile_ww += pad_l + pad_r

        # A tile covering the whole (stride-expanded) extent also takes the
        # recomputed paddings before the output tile size is derived.
        if tile_hh == in_h:
            tile_hh += p_top + p_bottom
        tile_out_h = (tile_hh - k_h) // s_h + 1

        if tile_ww == in_w:
            tile_ww += p_left + p_right
        tile_out_w = (tile_ww - k_w) // s_w + 1

        if tile_coco > 0:
            c1_cut = tile_coco // block_size
        else:
            c1_cut = out_c1

        # Axes with extent 1 are not registered.
        # NOTE(review): kc1, kh and kw below all pass axis=5 -- confirm this is intended.
        if out_n > 1:
            info.setdim(index=index_, axis=0, tilel1=1, tilel0=0)  # n
        if out_c1 > 1:
            info.setdim(index=index_, axis=1, tilel1=c1_cut, tilel0=0)  # c1
        if out_h > 1:
            info.setdim(index=index_, axis="H", tilel1=tile_out_h, tilel0=0)  # h
        if out_w > 1:
            info.setdim(index=index_, axis="W", tilel1=tile_out_w, tilel0=0)  # w
        if out_c0 > 1:
            info.setdim(index=index_, axis=4, tilel1=out_c0, tilel0=0)  # c0
        if in_c1 > 1:
            info.setdim(index=index_, axis=5, tilel1=in_c1, tilel0=0)  # kc1
        if k_h > 1:
            info.setdim(index=index_, axis=5, tilel1=k_h, tilel0=0)  # kh
        if k_w > 1:
            info.setdim(index=index_, axis=5, tilel1=k_w, tilel0=0)  # kw
        info = str(info)
    else:
        # Auto tiling: an empty dim string is returned.
        info = ""

    # Compute the convolution below
    output_name = "output0"

    # weight_trans [ ko, no, ni, ki ]
    # weight_trans [ co_1, kh, kw, ci_1, ci_0, co_0 ]
    #   kw = ko % k_w
    #   kh = ko // k_w % k_h
    #   co_1 = ko // k_w // k_h
    #   ci_1 = no
    # -->
    # weight [ ci_1, kh', kw', co_1, co_0, ci_0 ]
    # weight [ no, k_h - ko // k_w % k_h - 1, k_w - ko % k_w - 1, ko // k_w // k_h, co_0, ci_0 ]
    # i.e. the kernel is read flipped in h and w with its last two axes swapped.
    b_trans = akg.tvm.compute(kernel_shape_trans,
                              lambda ko, no, ni, ki: b_value[((no * k_h + k_h - 1 - ko // k_w % k_h) * k_w + k_w - 1 - ko % k_w),
                                                             ko // (k_h * k_w), ki, ni],
                              name='B_trans')

    if ((stride_h > 1) or (stride_w > 1)):
        # Zero-insertion: inputs land on positions that are multiples of the
        # stride, every other position is filled with const_zero.
        @akg.tvm.hybrid.script
        def data_trans_hybrid(output, inputs, const_zero):
            """Implements data_trans ( B[n, c1, h * strideH, w * strideW, c0] = A[n, c1, h, w, c0] )."""
            stride_h = output.shape[2] // inputs.shape[2]
            stride_w = output.shape[3] // inputs.shape[3]
            b = allocate(output.shape, output.dtype, 'local')
            for n in range(output.shape[0]):
                for c1 in range(output.shape[1]):
                    for h in range(output.shape[2]):
                        for w in range(output.shape[3]):
                            for c0 in range(output.shape[4]):
                                b[n, c1, h, w, c0] = const_zero
                                if h % stride_h == 0 and w % stride_w == 0:
                                    b[n, c1, h, w, c0] = inputs[n, c1, h // stride_h, w // stride_w, c0]
            return b

        # The placeholder only provides the output shape and dtype to the hybrid function.
        a_trans_init = akg.tvm.placeholder(input_trans_shape_nc1hwc0, dtype="float16", name='a_trans')
        const_zero = akg.tvm.const(0, 'float16')
        a_trans = data_trans_hybrid(a_trans_init, a_value, const_zero)
    else:
        a_trans = a_value

    conv_attrs = {
        "pragma_conv_kernel_n": k_n,
        "pragma_conv_kernel_h": k_h,
        "pragma_conv_kernel_w": k_w,
        "pragma_conv_padding_top": p_top,
        "pragma_conv_padding_bottom": p_bottom,
        "pragma_conv_padding_left": p_left,
        "pragma_conv_padding_right": p_right,
        "pragma_conv_bypass_l1": 0,
        "pragma_conv_backprop_input": 1,
        "pragma_conv_stride_h": s_h,
        "pragma_conv_stride_w": s_w,
        "pragma_conv_dilation_h": 1,
        "pragma_conv_dilation_w": 1,
        "pragma_conv_fm_n": in_n,
        "pragma_conv_fm_c": in_c,
        "pragma_conv_fm_h": in_h,
        "pragma_conv_fm_w": in_w,
        "feature": a_trans.op.name,
        "filter": b_value.op.name,
        "bias": 'None',
        "res": output_name
    }

    # Cut pragmas are only emitted when tile sizes were supplied.
    if not use_auto_tiling:
        conv_attrs["pragma_conv_h_cut"] = (tile_out_h - 1) * s_h + k_h
        conv_attrs["pragma_conv_w_cut"] = (tile_out_w - 1) * s_w + k_w
        conv_attrs["pragma_conv_co_cut"] = c1_cut * k_c0
        conv_attrs["pragma_conv_m_cut"] = tile_mm
        conv_attrs["pragma_conv_k_cut"] = tile_kk
        conv_attrs["pragma_conv_n_cut"] = tile_nn

    # Positions that fall into the padding contribute zero; products are
    # cast to float32 for the mmad reduction over (kc1, kh, kw, kc0).
    res_c = akg.tvm.compute(
        out_shape_nc1hwc0,
        lambda n, c1, h, w, c0: akg.lang.cce.mmad((akg.tvm.if_then_else(
            akg.tvm.any((h * s_h + kh) < p_top,
                        (h * s_h + kh) > (in_h + p_top - 1),
                        (w * s_w + kw) < p_left,
                        (w * s_w + kw) > (in_w + p_left - 1)),
            akg.tvm.const(0.0, 'float16'),
            a_trans[n, kc1, (h * s_h + kh - p_top), (w * s_w + kw - p_left), kc0])
            * b_trans[(kc1 * k_h + kh) * k_w + kw, c1, c0, kc0]).astype("float32"),
            axis=[kc1, kh, kw, kc0]),
        name=output_name,
        attrs=conv_attrs)

    res_c = cast.cast(res_c, "float16")

    return res_c, {"dim": info, "pragma_reschedule": 1, "pragma_rmselfdep": 0}