def bias_add_ad(head, input_shape, data_format):
    """
    Compute gradient for the bias_add operator using automatic differentiation.

    Args:
        head (tvm.tensor.Tensor): Input tensor.
        input_shape (Union[list, tuple]): Input shape of head.
        data_format (str): Data format of input tensors.

    Returns:
        tvm.tensor.Tensor, gradient with respect to the bias (same shape and type as the bias),
        and a dict of attrs.
    """
    check_list = ["NHWC", "NC1HWC0", "DefaultFormat"]
    if data_format not in check_list:
        raise RuntimeError("bias_add_grad only supports %s while data format is %s" %
                           (",".join(check_list), data_format))
    vc_util.check_shape(head.shape)
    shape1 = [x.value for x in head.shape]
    vc_util.davinci_format_check(shape1, data_format)
    a = akg.tvm.placeholder(head.shape, head.dtype, "A")
    if data_format == "NC1HWC0":
        bias_shape = (1, head.shape[1], 1, 1, head.shape[4])
        b = akg.tvm.placeholder(bias_shape, head.dtype, "B")
    elif data_format == "NHWC":
        bias_shape = (input_shape[-1],)
        b = akg.tvm.placeholder(bias_shape, head.dtype, "B")
    else:
        bias_shape = (input_shape[1],)
        b = akg.tvm.placeholder(bias_shape, head.dtype, "B")
    c = bias_add.bias_add(a, b, data_format)
    jacs = list(akg.differentiate(c, [b], head))
    attrs = {}
    return jacs[0], attrs
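# Usage sketch (added for illustration, not part of the original source). It assumes the
# module-level imports used above (akg, vc_util, bias_add) are in scope; the shapes are made up.
def _example_bias_add_ad():
    """Illustrative sketch only: gradient of bias_add w.r.t. the bias for a 2D DefaultFormat input."""
    head = akg.tvm.placeholder((32, 1000), "float16", name="head")
    # d_bias is the adjoint of `head` propagated through bias_add to the bias input,
    # so it has the bias shape (1000,) rather than the shape of `head`.
    d_bias, attrs = bias_add_ad(head, [32, 1000], "DefaultFormat")
    return d_bias, attrs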
def four2five(data, format_, dst_dtype='float16', need_custom_tiling=True):
    """
    Convert 4-dims "data" to 5-dims; the format of "data" is defined in "format_".

    Args:
        data (tvm.tensor.Tensor): 4-dims tensor of type float16, float32.
        format_ (str): a str defining the format of "data".
        dst_dtype (str): a str defining the type of output, could be float16 or float32.
        need_custom_tiling (bool): whether to attach a custom tiling strategy to the returned attrs.

    Returns:
        5-dims tvm.tensor.Tensor of type dst_dtype, whose shape is [N, ceil(C / 16), H, W, 16],
        together with attrs describing the tiling args.

    Raises:
        ValueError: If the type of format_ is invalid.
    """
    # Check dtype
    vc_util.ops_dtype_check(data.dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    # Check shape
    shape = get_shape(data)
    vc_util.davinci_format_check(shape, format_, dim=4)
    # Check format
    if format_ not in ['NCHW', 'NHWC']:
        raise ValueError("{} format is not supported, four2five only supports NCHW and NHWC format input"
                         .format(format_))

    last_channel = 16
    if format_ == "NCHW":
        bs, c, h, w = get_shape(data)
    else:
        bs, h, w, c = get_shape(data)
    pad_c = c
    if c % last_channel != 0:
        pad_c = (c + 15) // last_channel * last_channel
    c1 = pad_c // last_channel
    c0 = last_channel
    is_dynamic = ds.shape_is_dynamic(data)
    if not is_dynamic:
        attrs = get_attrs()
    else:
        attrs = get_dynamic_attrs()
    # Check size c when casting happens
    if data.dtype != dst_dtype and c0 * c1 >= C_LIMIT_FOR_CAST:
        raise ValueError("When input and output data type is not matched, shape of 'c' axis should not exceed {}, "
                         "while currently set is {}".format(C_LIMIT_FOR_CAST, c0 * c1))

    @script(capture=locals())
    def nchw_to_nc1hwc0_step(inputs, bs, c1, h, w, c0):
        output = allocate((bs, c1, h, c0, w), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            output[n_i, c_i, h_i, c_i0, w_i] = inputs[n_i, c_i * last_channel + c_i0, h_i, w_i]
        output1 = allocate((bs, c1, h, w, c0), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            output1[n_i, c_i, h_i, w_i, c_i0] = output[n_i, c_i, h_i, c_i0, w_i]
        return output1

    @script(capture=locals())
    def nchw_to_nc1hwc0(inputs, bs, c1, h, w, c0):
        output = allocate((bs, c1, h, w, c0), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            output[n_i, c_i, h_i, w_i, c_i0] = inputs[n_i, c_i * last_channel + c_i0, h_i, w_i]
        return output

    @script(capture=locals())
    def nhwc_to_nc1hwc0(inputs, zero, bs, c1, h, w, c0):
        output = allocate((bs, c1, h, w, c0), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            if c_i * last_channel + c_i0 < c:
                                output[n_i, c_i, h_i, w_i, c_i0] = inputs[n_i, h_i, w_i, c_i * last_channel + c_i0]
                            else:
                                output[n_i, c_i, h_i, w_i, c_i0] = zero
        return output

    cast_data = data
    need_cast = data.dtype == 'float32' and dst_dtype == 'float16'
    if c % last_channel != 0 or need_cast:
        expansion = int(ct_util.BLOCK_SIZE / get_bytes(data.dtype))
    else:
        expansion = None
    # float32 -> float16, need to cast before transform
    if need_cast:
        cast_data = akg.lang.cce.cast_to(data, dst_dtype)

    zero_ = akg.tvm.const(0.0, cast_data.dtype)
    if format_ == "NCHW":
        if c % last_channel != 0:
            pad_shape = [bs, pad_c, h, w]
            if h == 1 and w == 1:
                # if h and w both are 1, it is the pad-last-dim case
                output_shape = [bs, pad_c // last_channel, h, w, last_channel]
                output = akg.tvm.compute(
                    output_shape,
                    lambda i, c1, k, l, c0: akg.tvm.expr.Select(
                        c0 < c - c1 * last_channel,
                        cast_data[i, c1 * last_channel + c0, k, l],
                        akg.tvm.const(0, cast_data.dtype)),
                    name="output")
            else:
                # if need to pad c dim, separate the transpose into two steps:
                # first is nchw -> nc1hc0w, second is nc1hc0w -> nc1hwc0
                pad_data = akg.tvm.compute(
                    pad_shape,
                    lambda i, j, k, l: akg.tvm.expr.Select(j < c, cast_data[i, j, k, l], zero_),
                    name="pad_data")
                output = nchw_to_nc1hwc0_step(
                    pad_data, to_tvm_const(bs), to_tvm_const(c1), to_tvm_const(h),
                    to_tvm_const(w), to_tvm_const(c0))
        else:
            if not is_dynamic and data.dtype == "float16" and h * w % last_channel == 0 and h * w < 3600:
                output_shape = [bs, c1, h, w, c0]
                output = akg.tvm.compute(
                    output_shape,
                    lambda n, c1, h, w, c0: akg.lang.cce.four2five_nchw(cast_data[n, c1 * last_channel + c0, h, w]),
                    name="output")
            else:
                output = nchw_to_nc1hwc0(
                    cast_data, to_tvm_const(bs), to_tvm_const(c1), to_tvm_const(h),
                    to_tvm_const(w), to_tvm_const(c0))
    else:
        if not is_dynamic and c < last_channel:
            rank = 5  # (n, c1, h, w, c0)
            pad_before = []
            pad_after = []
            for _ in range(rank):
                pad_before.append(0)
                pad_after.append(0)
            pad_after[-1] = last_channel - c
            # As c < last_channel, c1 is 1
            output = akg.tvm.compute(
                (bs, c1, h, w, c),
                lambda bs_i, _, h_i, w_i, c_i: cast_data[bs_i, h_i, w_i, c_i],
                name="output")
            output = tvm_pad(output, pad_before, pad_after=pad_after, name='pad_output')
        else:
            output = nhwc_to_nc1hwc0(
                cast_data, zero_, to_tvm_const(bs), to_tvm_const(c1), to_tvm_const(h),
                to_tvm_const(w), to_tvm_const(c0))
    # float16 -> float32, need to cast after transform
    if data.dtype == 'float16' and dst_dtype == 'float32':
        output = akg.lang.cce.cast_to(output, dst_dtype)

    vc_util.davinci_format_check(output.shape, "NC1HWC0", dim=5)

    if not is_dynamic:
        dim_info, _ = four2five_set_dim_func(data, format_, dst_dtype)
        if dim_info != "":
            attrs["dim"] = dim_info
        if need_custom_tiling:
            attrs["custom_tiling"] = four2five_tiling_strategy(output, format_, expansion)
    elif need_custom_tiling:
        attrs["custom_tiling"] = four2five_tiling_strategy_dynamic(output, format_)

    if is_dynamic:
        attrs["enable_feature_library_pre_poly"] = True
    return output, attrs
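# Usage sketch (added for illustration, not part of the original source). It shows the shape
# relationship documented above: an NCHW tensor (N, C, H, W) becomes (N, ceil(C / 16), H, W, 16),
# with the C axis zero-padded when C is not a multiple of 16. Shapes are made up.
def _example_four2five():
    """Illustrative sketch only: NCHW -> NC1HWC0 with channel padding."""
    data = akg.tvm.placeholder((2, 20, 7, 7), "float16", name="data")
    out, attrs = four2five(data, "NCHW", dst_dtype="float16")
    # With C = 20, pad_c = 32 and c1 = 2, so the expected output shape is (2, 2, 7, 7, 16).
    return out, attrs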
def bias_add(data1, data2, data_format):
    """
    Adds bias data2 to input tensor data1.

    Args:
        data1 (tvm.tensor.Tensor): Tensor of type float16, float32.
        data2 (tvm.tensor.Tensor): The bias tensor, should be of same type as data1.
            If shape(data2) != shape(data1), broadcast will happen.
        data_format (str): Data format of input tensors, could be NC1HWC0, NHWC or DefaultFormat.

    Returns:
        tvm.tensor.Tensor of same shape and type as data1.
    """
    vc_util.check_shape(data1.shape)
    vc_util.check_shape(data2.shape)
    shape1 = get_shape(data1)
    shape2 = get_shape(data2)
    vc_util.davinci_format_check(shape1, data_format)
    vc_util.ops_dtype_check([data1.dtype, data2.dtype], vc_util.DtypeForDavinci.ALL_FLOAT)

    if data_format == 'NC1HWC0':
        data2_new = akg.lang.cce.broadcast(data2, shape1)
        res = akg.lang.cce.vadd(data1, data2_new)
    else:
        if len(shape2) != 1:
            raise RuntimeError("data2 should be a 1D Tensor!")

        if data_format == "NHWC":
            if len(shape1) != 4:
                raise RuntimeError("bias_add only supports 4D shape when data format is NHWC!")
            c_dim_len = shape1[3]
            if c_dim_len != shape2[0]:
                raise ValueError("The size of bias should be equal to the channel dimension, "
                                 "while the size of bias is {0} and the channel dimension is "
                                 "{1}".format(shape2[0], c_dim_len))
            data2_reshaped, _ = reshape(data2, [1, 1, 1, shape2[0]])
        elif data_format == "DefaultFormat":
            if len(shape1) != 2 and len(shape1) != 4:
                raise RuntimeError("bias_add only supports 2D and 4D shape when data format is DefaultFormat!")
            c_dim_len = shape1[1]
            if c_dim_len != shape2[0]:
                raise ValueError("The size of bias should be equal to the channel dimension, "
                                 "while the size of bias is {0} and the channel dimension is "
                                 "{1}".format(shape2[0], c_dim_len))
            if len(shape1) == 2:
                data2_reshaped, _ = reshape(data2, [1, shape2[0]])
            else:
                # NCHW
                data2_reshaped, _ = reshape(data2, [1, shape2[0], 1, 1])

        data2_new = akg.lang.cce.broadcast(data2_reshaped, shape1)
        res = akg.lang.cce.vadd(data1, data2_new)

        akg.register_variables("reshape_diff", [data2], data2_reshaped)

    return res
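# Usage sketch (added for illustration, not part of the original source). It assumes the
# module-level imports used above are in scope; the bias length must match the last (channel)
# dimension for NHWC, as enforced above.
def _example_bias_add():
    """Illustrative sketch only: add a 1D bias to an NHWC tensor."""
    data1 = akg.tvm.placeholder((1, 7, 7, 64), "float16", name="data1")
    data2 = akg.tvm.placeholder((64,), "float16", name="data2")
    # The bias is reshaped to (1, 1, 1, 64) and then broadcast-added over data1.
    return bias_add(data1, data2, "NHWC")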
def five2four(data, shape4d, dst_type, format_):
    """
    Convert 5-dims "data" to 4-dims; the format of the result is defined in "format_".

    Args:
        data (tvm.tensor.Tensor): 5-dims tensor of type float16, float32.
        shape4d (Union[list, tuple]): a list of 4 numbers, the shape of the output Tensor.
        dst_type (str): data type of output Tensor.
        format_ (str): a str defining the format of the result, supports NCHW and NHWC.

    Returns:
        4-dims tvm.tensor.Tensor.
    """
    vc_util.ops_dtype_check([data.dtype, dst_type], vc_util.DtypeForDavinci.ALL_FLOAT)
    shape5d = get_shape(data)
    if not shape_is_dynamic(data):
        if len(shape5d) != 5 or shape5d[-1] != 16:
            raise ValueError("five2four_cce only supports 5-dim data and the last dim should be 16")

    bs, c1, h, w, c0 = shape5d

    if not shape_is_dynamic(data):
        vc_util.davinci_format_check(shape5d, "NC1HWC0", dim=5)
    # Check format
    if format_ not in ['NCHW', 'NHWC']:
        raise ValueError("{} format is not supported, five2four only supports NCHW and NHWC format"
                         .format(format_))
    if format_ == "NCHW":
        if shape_is_dynamic(data):
            shape4d = [bs, c1 * c0, h, w]
        _, c, h_4d, w_4d = shape4d
    else:
        if shape_is_dynamic(data):
            shape4d = [bs, h, w, c1 * c0]
        _, h_4d, w_4d, c = shape4d
    vc_util.davinci_format_check(shape4d, format_, dim=4)

    # Check whether shape4d and shape5d match
    if False not in [isinstance(s, (int, akg.tvm.expr.IntImm)) for s in shape5d]:
        if h_4d != h or w_4d != w:
            raise ValueError("five2four_cce's shape4d h and w should be equal to data shape's h and w")
        if c > c1 * c0 or c <= (c1 - 1) * c0:
            raise ValueError("five2four_cce's shape4d c should be in the range ((c1 - 1) * c0, c1 * c0]")

    # Check size c when casting happens
    if not shape_is_dynamic(data):
        if data.dtype != dst_type and c >= C_LIMIT_FOR_CAST:
            raise ValueError("When input and output data type is not matched, shape of 'c' axis should not exceed {}, "
                             "while currently set is {}".format(C_LIMIT_FOR_CAST, c))

    @script(capture=locals())
    def nc1hwc0_to_nhwc(inputs, bs, h, w, c, c1, c0):
        output = allocate((bs, h, w, c), inputs.dtype, "local")
        for n_i in range(bs):
            for h_i in range(h):
                for w_i in range(w):
                    for c_i in range(c1):
                        for c_i0 in range(c0):
                            output[n_i, h_i, w_i, c_i * c0 + c_i0] = inputs[n_i, c_i, h_i, w_i, c_i0]
        return output

    @script(capture=locals())
    def nc1hwc0_to_nchw(inputs, bs, h, w, c, c1, c0):
        output = allocate((bs, c, h, w), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            output[n_i, c_i * c0 + c_i0, h_i, w_i] = inputs[n_i, c_i, h_i, w_i, c_i0]
        return output

    # if c % 16 == 0 and both h and w are 1, five2four is a reshape operation
    if shape_is_dynamic(data):
        call_reshape = isinstance(h, int) and isinstance(w, int) and h == 1 and w == 1
    else:
        call_reshape = h == 1 and w == 1 and c % 16 == 0

    c_value = None
    expansion = None
    if format_ == "NHWC":
        if call_reshape:
            output = akg.topi.reshape(data, (bs, h, w, c))
            if shape_is_dynamic(data):
                output = akg.tvm.compute((bs, h, w, c), lambda *indice: output(*indice), name="reshape")
        elif c < c0:
            reshape_output = akg.topi.reshape(data, (bs, h, w, c0))
            output = akg.tvm.compute((bs, h, w, c), lambda *i: reshape_output(*i), name='slice_output')
        else:
            output = nc1hwc0_to_nhwc(
                data, to_tvm_const(bs), to_tvm_const(h), to_tvm_const(w),
                to_tvm_const(c), to_tvm_const(c1), to_tvm_const(c0))
    else:
        if call_reshape:
            output = akg.topi.reshape(data, (bs, c, h, w))
            if shape_is_dynamic(data):
                output = akg.tvm.compute((bs, c, h, w), lambda *indice: output(*indice), name="reshape")
        else:
            output = nc1hwc0_to_nchw(
                data, to_tvm_const(bs), to_tvm_const(h), to_tvm_const(w),
                to_tvm_const(c), to_tvm_const(c1), to_tvm_const(c0))

    # two special cases for tiling strategy
    if not shape_is_dynamic(data):
        if c < c0 or output.dtype != dst_type:
            c_value = c
        if c % c0 != 0 and output.dtype != dst_type:
            expansion = int(ct_util.BLOCK_SIZE / get_bytes(data.dtype))
    attrs = get_attrs()
    if not call_reshape:
        attrs["custom_tiling"] = five2four_tiling_strategy(data, c_value, expansion)

    if output.dtype != dst_type:
        output = akg.topi.cast(output, dst_type)
    return output, attrs
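# Usage sketch (added for illustration, not part of the original source). five2four is the
# inverse of four2five: a (N, C1, H, W, 16) tensor is packed back to 4D, and the requested
# channel size c must satisfy (C1 - 1) * 16 < c <= C1 * 16, as checked above.
def _example_five2four():
    """Illustrative sketch only: NC1HWC0 -> NCHW, recovering 20 channels out of the padded 32."""
    data = akg.tvm.placeholder((2, 2, 7, 7, 16), "float16", name="data")
    out, attrs = five2four(data, [2, 20, 7, 7], "float16", "NCHW")
    return out, attrs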
def maxpool_with_argmax_dynamic(data, kernel, stride, strategy):
    """
    Performs max pooling on the input data (dynamic-shape variant).

    Note:
        Only supports 5D format (NC1HWC0), and pooling works on H and W.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16.
        kernel (Union[list, tuple]): two int numbers for pooling window's size.
        stride (Union[list, tuple]): two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding, should be 'VALID', 'SAME' or
            instance of list (four int numbers, as 'CONSTANTS' strategy).
            Supported **Strategies** are the same as for avgpool.

    Returns:
        tvm.tensor.Tensor, the max pooling result, together with a first-max mask tensor and attrs.
    """
    attrs = get_dynamic_attrs()
    dim_info = maxpool_with_argmax_set_dim_func(data, kernel, stride, strategy)[0]
    for k, v in attr_map_v2.items():
        attrs[k] = v
    if dim_info != "":
        attrs['dim'] = dim_info
    # attrs["custom_tiling"] = maxpool_with_argmax_custom_tiling_strategy(data)
    attrs["enable_feature_library"] = True
    shape = get_shape(data)
    dtype = data.dtype

    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.FLOAT16)
    vc_util.check_shape(kernel, 2, 'Kernel')
    vc_util.check_shape(stride, 2, 'Stride')
    pad_strategy_check(strategy)

    kernel_h, kernel_w = kernel
    in_n, in_c1, _, _, in_c0 = shape

    [ph_h, ph_t, pw_h, pw_t], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)
    pad = [ph_h, ph_t, pw_h, pw_t]
    zero = akg.tvm.const(0.0, dtype=dtype)
    min_value = akg.tvm.const(-65504.0 if dtype == 'float16'
                              else -340282346638528859811704183484516925440.0,
                              dtype=dtype)
    # fmap img2col l1 -> ub in zZ format by fractal
    fmap_img2col_shape_ub = (in_n, in_c1, kernel_h, kernel_w, out_h, out_w, in_c0)
    fmap_img2col_ub = img2col(data, fmap_img2col_shape_ub, kernel_h, kernel_w,
                              pad, stride, min_value, tag='')

    out_shape = (in_n, in_c1, out_h, out_w, in_c0)
    reduce_axis_h = akg.tvm.reduce_axis((0, kernel_h), name="reduce_h")
    reduce_axis_w = akg.tvm.reduce_axis((0, kernel_w), name="reduce_w")
    output = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: akg.tvm.max(
            fmap_img2col_ub[n, c1, reduce_axis_h, reduce_axis_w, oh, ow, c0],
            axis=[reduce_axis_h, reduce_axis_w]),
        name="pooling_max")

    zero = akg.tvm.const(0.0, dtype=dtype)
    mask_first_max_shape = (in_n, in_c1, kernel_h, kernel_w, out_h, out_w, in_c0)
    mask_first_max = akg.tvm.compute(mask_first_max_shape, lambda *indice: zero, name="mask_first_max")
    attrs["custom_tiling"] = maxpool_with_argmax_dynamic_tensor_strategy(
        data, fmap_img2col_ub, mask_first_max)
    attrs["dynamic_shape"] = ds.set_dynamic_shape_limit_for_tensor(output, [64, 64], [2, 3])
    return output, mask_first_max, attrs
def maxpool_with_argmax(data, kernel, stride, strategy):
    """
    Performs max pooling on the input data and also produces a first-max mask.

    Note:
        Only supports 5D format (NC1HWC0), and pooling works on H and W.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16.
        kernel (Union[list, tuple]): two int numbers for pooling window's size.
        stride (Union[list, tuple]): two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding, should be 'VALID', 'SAME' or
            instance of list (four int numbers, as 'CONSTANTS' strategy).
            Supported **Strategies** are the same as for avgpool.

    Returns:
        tvm.tensor.Tensor, the max pooling result, together with a first-max mask tensor and attrs.
    """
    attrs = get_attrs()
    dim_info = maxpool_with_argmax_set_dim_func(data, kernel, stride, strategy)[0]
    for k, v in attr_map_v2.items():
        attrs[k] = v
    if dim_info != "":
        attrs['dim'] = dim_info
    attrs["custom_tiling"] = maxpool_with_argmax_tiling_strategy(data, kernel, stride, strategy)
    shape = get_shape(data)
    dtype = data.dtype

    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.FLOAT16)
    vc_util.check_shape(kernel, 2, 'Kernel')
    vc_util.check_shape(stride, 2, 'Stride')
    pad_strategy_check(strategy)

    kernel_h, kernel_w = kernel
    in_n, in_c1, _, _, in_c0 = shape

    [ph_h, ph_t, pw_h, pw_t], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)
    pad = [ph_h, ph_t, pw_h, pw_t]
    zero = akg.tvm.const(0.0, dtype=dtype)
    one = akg.tvm.const(1.0, dtype=dtype)
    min_value = akg.tvm.const(-65504.0 if dtype == 'float16'
                              else -340282346638528859811704183484516925440.0,
                              dtype=dtype)
    # fmap img2col l1 -> ub in zZ format by fractal
    fmap_img2col_shape_ub = (in_n, in_c1, kernel_h, kernel_w, out_h, out_w, in_c0)
    fmap_img2col_ub = img2col(data, fmap_img2col_shape_ub, kernel_h, kernel_w,
                              pad, stride, min_value, tag='')

    out_shape = (in_n, in_c1, out_h, out_w, in_c0)
    reduce_axis_h = akg.tvm.reduce_axis((0, kernel_h), name="reduce_h")
    reduce_axis_w = akg.tvm.reduce_axis((0, kernel_w), name="reduce_w")
    output = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: akg.tvm.max(
            fmap_img2col_ub[n, c1, reduce_axis_h, reduce_axis_w, oh, ow, c0],
            axis=[reduce_axis_h, reduce_axis_w]),
        name="pooling_max")

    pooling_mask = akg.tvm.compute(
        fmap_img2col_shape_ub,
        lambda n, c1, kh, kw, oh, ow, c0: akg.tvm.if_then_else(
            fmap_img2col_ub[n, c1, kh, kw, oh, ow, c0] < output[n, c1, oh, ow, c0],
            zero, one),
        name="pooling_mask")

    mask_flag = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: pooling_mask[n, c1, 0, 0, oh, ow, c0],
        name="mask_flag")

    mask_init = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: pooling_mask[n, c1, 0, 0, oh, ow, c0],
        name="mask_init")

    # spec 2
    @script(capture=locals())
    def hybrid_first_max(mask_, flag_, flag2_, zero_, one_):
        output_ = allocate((in_n, in_c1, kernel_h, kernel_w, out_h, out_w, in_c0), mask_.dtype, 'local')
        for n_i in range(in_n):
            for c1_i in range(in_c1):
                for oh_i in range(out_h):
                    for ow_i in range(out_w):
                        for c0_i in range(in_c0):
                            output_[n_i, c1_i, 0, 0, oh_i, ow_i, c0_i] = flag2_[n_i, c1_i, oh_i, ow_i, c0_i]
                for kh_i in range(kernel_h):
                    for kw_i in range(kernel_w):
                        for oh_i in range(out_h):
                            for ow_i in range(out_w):
                                for c0_i in range(in_c0):
                                    output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i] = \
                                        mask_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i] - \
                                        flag_[n_i, c1_i, oh_i, ow_i, c0_i]
                                    output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i] = \
                                        max(output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i], zero_)
                                    flag_[n_i, c1_i, oh_i, ow_i, c0_i] = \
                                        flag_[n_i, c1_i, oh_i, ow_i, c0_i] + \
                                        output_[n_i, c1_i, kh_i, kw_i, oh_i, ow_i, c0_i]
        return output_

    mask_first_max = hybrid_first_max(pooling_mask, mask_flag, mask_init, zero, one)
    return output, mask_first_max, attrs
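# Usage sketch (added for illustration, not part of the original source). maxpool_with_argmax
# returns three values: the pooled tensor of shape (N, C1, out_h, out_w, C0), a first-max mask
# of shape (N, C1, kernel_h, kernel_w, out_h, out_w, C0), and the attrs dict. float16 only.
def _example_maxpool_with_argmax():
    """Illustrative sketch only: max pooling with a first-max mask."""
    data = akg.tvm.placeholder((1, 1, 16, 16, 16), "float16", name="data")
    output, mask_first_max, attrs = maxpool_with_argmax(data, (2, 2), (2, 2), "VALID")
    return output, mask_first_max, attrs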
def maxpool(data, kernel, stride, strategy):
    """
    Performs max pooling on the input data.

    Note:
        Only supports 5D format (NC1HWC0), and pooling works on H and W.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16, float32.
        kernel (Union[list, tuple]): two int numbers for pooling window's size.
        stride (Union[list, tuple]): two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding, should be 'VALID', 'SAME' or
            instance of list (four int numbers for 'CONSTANTS' strategy).
            Supported **Strategies** are the same as for avgpool.

    Returns:
        tvm.tensor.Tensor, the max pooling result.
    """
    attrs = attr_map
    attrs['dim'] = maxpool_set_dim_func(data, kernel, stride, strategy)[0]

    shape = get_shape(data)
    dtype = data.dtype

    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    vc_util.check_shape(kernel, 2, "Kernel")
    vc_util.check_shape(stride, 2, "Stride")
    pad_strategy_check(strategy)

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    in_n, in_c1, in_h, in_w, in_c0 = shape

    [ph_h, _, pw_h, _], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)

    if attrs.get("dynamic") is True:
        # dynamic shape: although out_h and out_w can be expressed with the input shapes,
        # the expressions are too complicated, so use fresh variables instead
        out_h = akg.tvm.var("OUT_H")
        out_w = akg.tvm.var("OUT_W")

    @script(capture=locals())
    def dynamic_max_pool_hybrid_0(zero_, one_, min_value_, x_, in_n, in_c1, in_h, in_w, in_c0, out_h, out_w):
        output = output_tensor((in_n, in_c1, out_h, out_w, in_c0), x_.dtype)
        for n in range(in_n):
            for c1 in range(in_c1):
                # Head
                for ow in range(out_w):
                    for c0 in range(in_c0):
                        output[n, c1, 0, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for ow in range(out_w):
                            for c0 in range(in_c0):
                                if ph_h <= kh <= in_h + ph_h - 1 and 0 <= ow * stride_w + kw - pw_h <= in_w - 1:
                                    output[n, c1, 0, ow, c0] = \
                                        max(output[n, c1, 0, ow, c0],
                                            x_[n, c1, kh - ph_h, ow * stride_w + kw - pw_h, c0])
                # Tail
                for oh in range(out_h - 1):
                    for ow in range(out_w):
                        for c0 in range(in_c0):
                            output[n, c1, oh + 1, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for oh in range(out_h - 1):
                            for ow in range(out_w):
                                for c0 in range(in_c0):
                                    if ph_h <= (oh + 1) * stride_h + kh <= in_h + ph_h - 1 \
                                            and pw_h <= ow * stride_w + kw <= in_w + pw_h - 1:
                                        output[n, c1, oh + 1, ow, c0] = max(
                                            output[n, c1, oh + 1, ow, c0],
                                            x_[n, c1, (oh + 1) * stride_h + kh - ph_h,
                                               ow * stride_w + kw - pw_h, c0])
        return output

    # static shape's hybrid
    @script(capture=locals())
    def static_max_pool_hybrid_0(zero_, one_, min_value_, x_):
        output = output_tensor((in_n, in_c1, out_h, out_w, in_c0), x_.dtype)
        for n in range(in_n):
            for c1 in range(in_c1):
                # Head
                for ow in range(out_w):
                    for c0 in range(in_c0):
                        output[n, c1, 0, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for ow in range(out_w):
                            for c0 in range(in_c0):
                                if ph_h <= kh <= in_h + ph_h - 1 and 0 <= ow * stride_w + kw - pw_h <= in_w - 1:
                                    output[n, c1, 0, ow, c0] = \
                                        max(output[n, c1, 0, ow, c0],
                                            x_[n, c1, kh - ph_h, ow * stride_w + kw - pw_h, c0])
                # Tail
                for oh in range(out_h - 1):
                    for ow in range(out_w):
                        for c0 in range(in_c0):
                            output[n, c1, oh + 1, ow, c0] = min_value_
                for kh in range(kernel_h):
                    for kw in range(kernel_w):
                        for oh in range(out_h - 1):
                            for ow in range(out_w):
                                for c0 in range(in_c0):
                                    if ph_h <= (oh + 1) * stride_h + kh <= in_h + ph_h - 1 \
                                            and pw_h <= ow * stride_w + kw <= in_w + pw_h - 1:
                                        output[n, c1, oh + 1, ow, c0] = max(
                                            output[n, c1, oh + 1, ow, c0],
                                            x_[n, c1, (oh + 1) * stride_h + kh - ph_h,
                                               ow * stride_w + kw - pw_h, c0])
        return output

    zero = akg.tvm.const(0.0, dtype=dtype)
    one = akg.tvm.const(1.0, dtype=dtype)
    min_value = akg.tvm.const(-65504.0 if dtype == 'float16'
                              else -340282346638528859811704183484516925440.0,
                              dtype=dtype)
    if attrs.get("dynamic") is True:
        output = dynamic_max_pool_hybrid_0(zero, one, min_value, data,
                                           in_n, in_c1, in_h, in_w, in_c0, out_h, out_w)
    else:
        output = static_max_pool_hybrid_0(zero, one, min_value, data)

    return output, attrs
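# Reference helper (added for illustration, not part of the original source). It reproduces, in
# plain Python, the per-axis output size for the padding strategies described in avgpool's
# docstring later in this file: 'VALID' -> ceil((in - (k - 1)) / s), 'SAME' -> ceil(in / s).
# It is only an aid for reading the hybrid scripts above; the real sizes come from
# cal_pad_shapes_by_strategy.
def _example_pool_output_size(in_size, kernel, stride, strategy):
    """Illustrative helper only: output size along one spatial axis."""
    import math
    if strategy == "VALID":
        return int(math.ceil((in_size - (kernel - 1)) / stride))
    if strategy == "SAME":
        return int(math.ceil(in_size / stride))
    raise ValueError("only 'VALID' and 'SAME' are handled in this sketch")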
def maxpool_manual_schedule(shape, kernel, stride, padding, dtype, attrs=None, polyhedral=False):
    """maxpool with manual schedule"""
    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    maxpool_param_check(kernel, stride, padding)

    data = akg.tvm.placeholder(shape, dtype, name="input_data")
    batch_size, in_c1, input_h, input_w, in_c0 = data.shape
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    if len(padding) == 2:
        pad_h, pad_w = padding
    elif len(padding) == 4:
        pad_h, pad_w = padding[0], padding[2]

    out_size_h = (input_h + 2 * pad_h - kernel_h) // stride_h + 1
    out_size_w = (input_w + 2 * pad_w - kernel_w) // stride_w + 1

    # padding operation
    if pad_h != 0 or pad_w != 0:
        pad_shape = (batch_size, in_c1, input_h + 2 * pad_h, input_w + 2 * pad_w, in_c0)
        padded_input = akg.tvm.compute(
            pad_shape,
            lambda n, c1, h, w, c0: akg.tvm.if_then_else(
                akg.tvm.any(
                    h > input_h + pad_h - 1,
                    h < pad_h,
                    w > input_w + pad_w - 1,
                    w < pad_w,
                ),
                akg.tvm.const(0.0, dtype=dtype),
                data[n, c1, h - pad_h, w - pad_w, c0],
            ),
            name="padded_input")
    else:
        padded_input = data

    # reduce iterators
    it_kernel_h = akg.tvm.reduce_axis((0, kernel_h), name="iterator_reduction_height")
    it_kernel_w = akg.tvm.reduce_axis((0, kernel_w), name="iterator_reduction_width")

    out_shape = (batch_size, in_c1, out_size_h, out_size_w, in_c0)
    res = akg.tvm.compute(
        out_shape,
        lambda n, c1, h, w, c0: akg.tvm.max(
            padded_input[n, c1, (h * stride_h + it_kernel_h), (w * stride_w + it_kernel_w), c0],
            axis=[it_kernel_h, it_kernel_w]),
        name="maxpool_not_hybrid")

    s = akg.tvm.create_schedule([res.op])

    if pad_w != 0 or pad_h != 0:
        padded_input = res.op.input_tensors[0]
    else:
        padded_input = res

    # cache reads and writes
    # after this cache write: reference res_ub to change the reduction axis
    res_ub = s.cache_write(res, "local.UB")
    if pad_w != 0 or pad_h != 0:
        data_ub = s.cache_read(data, "local.UB", [padded_input])
    else:
        data_ub = s.cache_read(data, "local.UB", [res_ub])

    # get tiling attributes
    if attrs is None:
        raise Exception('attrs is None')
    tiling_factors = attrs['tile']
    split_iterators = []
    if len(tiling_factors) != len(res.shape):
        raise RuntimeError("tiling factors mismatch out shape")

    # split the final compute and save the iterators
    for index, factor in enumerate(tiling_factors):
        split_iterators.append(s[res_ub].split(res_ub.op.axis[index], factor))

    # get iterators
    iterator_b_outer = split_iterators[0][0]
    iterator_b_inner = split_iterators[0][1]
    iterator_c1_outer = split_iterators[1][0]
    iterator_c1_inner = split_iterators[1][1]
    iterator_h_outer = split_iterators[2][0]
    iterator_h_inner = split_iterators[2][1]
    iterator_w_outer = split_iterators[3][0]
    iterator_w_inner = split_iterators[3][1]
    iterator_c0_outer = split_iterators[4][0]
    iterator_c0_inner = split_iterators[4][1]

    # reduction axis
    iterator_reduce_h = res_ub.op.reduce_axis[0]
    iterator_reduce_w = res_ub.op.reduce_axis[1]

    # move caches
    s[res_ub].compute_at(s[res], res.op.axis[0])
    s[data_ub].compute_at(s[res_ub], iterator_c1_outer)
    if pad_w != 0 or pad_h != 0:
        s[padded_input].compute_at(s[res_ub], iterator_c1_outer)
        s[padded_input].set_scope("local.UB")

    # reorder computation
    s[res_ub].reorder(iterator_b_outer, iterator_b_inner, iterator_c1_outer,
                      iterator_c1_inner, iterator_h_outer, iterator_h_inner,
                      iterator_w_outer, iterator_w_inner, iterator_reduce_h,
                      iterator_reduce_w, iterator_c0_outer, iterator_c0_inner)

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, [data, res], "cce",
                        name="maxpool_manual_schedule",
                        attrs=attrs, polyhedral=polyhedral)
    source_code = mod.imported_modules[0].get_source()
    kernel_name = "maxpool_ad_manual_schedule"
    utils.create_cce(kernel_name, './', source_code)
    return mod
def old_maxpool(data, kernel, stride, pad):
    """
    Old implementation of maxpool.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16 or float32, in "NC1HWC0" format
            (N: batch, C1: channel, H: height, W: width, C0: block size).
        kernel (Union[list, tuple]): List or tuple with two int numbers as window sizes of H and W.
        stride (Union[list, tuple]): List or tuple with two int numbers as stride sizes of H and W.
        pad (Union[list, tuple]): List or tuple with two (or four) int numbers as pad sizes of H and W.

    Returns:
        tvm.tensor.Tensor, result of the maxpool operator.
    """
    shape = get_shape(data)
    dtype = data.dtype

    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    maxpool_param_check(kernel, stride, pad)

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    if len(pad) == 2:
        pad_height, pad_width = pad
    else:
        pad_height, pad_width = pad[0], pad[2]

    in_n, in_c1, in_h, in_w, in_c0 = shape

    out_h = int(math.floor((in_h + 2 * pad_height - kernel_h) / float(stride_h)) + 1)
    out_w = int(math.floor((in_w + 2 * pad_width - kernel_w) / float(stride_w)) + 1)

    if pad[0] != 0 or pad[1] != 0:
        pad_shape = (in_n, in_c1, in_h + 2 * pad_height, in_w + 2 * pad_width, in_c0)

        pad2d = akg.tvm.compute(
            pad_shape,
            lambda n, c1, h, w, c0: akg.tvm.const(0.0, dtype=dtype),
            name="pad2d")
        pad2d = akg.tvm.compute(
            pad_shape,
            lambda n, c1, h, w, c0: akg.tvm.if_then_else(
                akg.tvm.any(h < pad_height, h > in_h + pad_height - 1,
                            w < pad_width, w > in_w + pad_width - 1),
                pad2d[n, c1, h, w, c0],
                data[n, c1, h - pad_height, w - pad_width, c0],
            ),
            name="pad2d")
    else:
        pad2d = data

    axis_kernel_h = akg.tvm.reduce_axis((0, kernel_h), name="ah")
    axis_kernel_w = akg.tvm.reduce_axis((0, kernel_w), name="aw")

    out_shape = (in_n, in_c1, out_h, out_w, in_c0)

    res_value = akg.tvm.compute(
        out_shape,
        lambda n, c1, h, w, c0: akg.tvm.max(
            pad2d[n, c1, h * stride_h + axis_kernel_h, w * stride_w + axis_kernel_w, c0],
            axis=[axis_kernel_h, axis_kernel_w]),
        name="res_value")
    return res_value
def avgpool_with_img2col(data, kernel, stride, strategy):
    """
    Performs average pooling using img2col.

    Note:
        Only supports 5D format (NC1HWC0), and pooling works on H and W.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16.
        kernel (Union[list, tuple]): two int numbers for pooling window's size.
        stride (Union[list, tuple]): two int numbers for window's stride.
        strategy (Union[str, list, tuple]): padding, should be 'VALID', 'SAME' or
            instance of list (four int numbers, as 'CONSTANTS' strategy).
            Supported **Strategies** are the same as for avgpool.

    Returns:
        tvm.tensor.Tensor, the average pooling result.
    """
    shape = get_shape(data)
    dtype = data.dtype

    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.ops_dtype_check(dtype, vc_util.DtypeForDavinci.FLOAT16)
    vc_util.check_shape(kernel, 2, "Kernel")
    vc_util.check_shape(stride, 2, "Stride")

    kernel_h, kernel_w = kernel
    in_n, in_c1, _, _, in_c0 = shape

    [ph_h, ph_t, pw_h, pw_t], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)
    pad = [ph_h, ph_t, pw_h, pw_t]
    pad_value = zero_const(dtype)

    # fmap img2col l1 -> ub in zZ format by fractal
    fmap_img2col_shp_ub = (in_n, in_c1, kernel_h, kernel_w, out_h, out_w, in_c0)
    fmap_img2col_ub = img2col(data, fmap_img2col_shp_ub, kernel_h, kernel_w,
                              pad, stride, pad_value, tag="")

    out_shape = (in_n, in_c1, out_h, out_w, in_c0)
    reduce_axis_h = akg.tvm.reduce_axis((0, kernel_h), name="reduce_h")
    reduce_axis_w = akg.tvm.reduce_axis((0, kernel_w), name="reduce_w")
    res_sum = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: akg.tvm.sum(
            fmap_img2col_ub[n, c1, reduce_axis_h, reduce_axis_w, oh, ow, c0],
            axis=[reduce_axis_h, reduce_axis_w]),
        name="pooling_avg")

    dividor = akg.tvm.const(kernel_h * kernel_w, dtype)
    output = akg.tvm.compute(out_shape, lambda *i: res_sum(*i) / dividor, name="res_value")
    return output
def avgpool(data, kernel, stride, strategy):
    """
    Performs average pooling on the input data.

    Note:
        Only supports 5D format (NC1HWC0), and pooling works on H and W.

    Supported **Strategies**:

    .. hlist::
        * VALID: will not pad, and drops tailed elements when pooling.
            Output shape will be `ceil((pool_shapes[i] - (kernel[i] - 1)) / stride[i])`

            > **example**:
            > params: inputs => 11, kernel width => 5, stride => 4
            > inputs: 1 2 3 4 5 6 7 8 9 10 11
            > 1st window contains: 1 2 3 4 5
            > 2nd window contains: 5 6 7 8 9
            > dropped: 10 11

        * SAME: will pad with zero evenly on each side, but adds the extra padding to the tail
            if the total padding amount is odd.
            Output shape will be `ceil(pool_shapes[i] / stride[i])`

            > **example**:
            > params: inputs => 10, kernel width => 5, stride => 4
            > inputs: 1 2 3 4 5 6 7 8 9 10
            > padded: 0(pad1) | 1 2 3 4 5 6 7 8 9 10 | 0(pad2) 0(pad3)
            > 1st window contains: 0(pad1) 1 2 3 4
            > 2nd window contains: 4 5 6 7 8
            > 3rd window contains: 8 9 10 0(pad2) 0(pad3)
            > dropped: None

        * CONSTANTS: will pad with zero according to the given constants
            (and also drops tailed elements when pooling).

            > **example**:
            > params: inputs => 10, kernel width => 5, stride => 4, pad => (2, 2)
            > inputs: 1 2 3 4 5 6 7 8 9 10
            > padded: 0(pad1) 0(pad2) | 1 2 3 4 5 6 7 8 9 10 | 0(pad3) 0(pad4)
            > 1st window contains: 0(pad1) 0(pad2) 1 2 3
            > 2nd window contains: 3 4 5 6 7
            > 3rd window contains: 7 8 9 10 0(pad3)
            > dropped: 0(pad4)

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16, float32.
        kernel (Union[list, tuple]): List or tuple of two int numbers for pooling window's size.
        stride (Union[list, tuple]): List or tuple of two int numbers for window's stride.
        strategy (Union[str, list, tuple]): A string or list or tuple for padding strategy,
            should be 'VALID', 'SAME' or instance of list (including four int numbers,
            as 'CONSTANTS' strategy).

    Returns:
        Tensor as result for average pooling.
    """
    dim_info, _ = avgpool_set_dim_func(data, kernel, stride, strategy)
    attrs = {DIM: dim_info}
    attrs['disable_half_to_float_sum_opt'] = True
    shape = [x.value for x in data.shape]
    dtype = data.dtype
    vc_util.davinci_format_check(shape, "NC1HWC0", dim=5)
    vc_util.check_shape(kernel, 2, 'Kernel')
    vc_util.check_shape(stride, 2, 'Stride')

    if shape[2] > 60 and shape[3] > 60:
        return avg_pool_5d_hybrid(data, kernel, stride, strategy)

    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
    batch_size, c1, in_size_h, in_size_w, c0 = shape

    [pad_height_head, pad_height_tail, pad_width_head, pad_width_tail], [out_size_h, out_size_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)

    pad_shape = (batch_size, c1, in_size_h + pad_height_head + pad_height_tail,
                 in_size_w + pad_width_head + pad_width_tail, c0)

    pad2d = akg.tvm.compute(
        pad_shape,
        lambda n, c1, h, w, c0: akg.tvm.if_then_else(
            akg.tvm.any(h < pad_height_head, h > in_size_h + pad_height_head - 1,
                        w < pad_width_head, w > in_size_w + pad_width_head - 1),
            akg.tvm.const(0.0, dtype=dtype),
            data[n, c1, h - pad_height_head, w - pad_width_head, c0],
        ),
        name="pad2d")

    axis_kernel_h = akg.tvm.reduce_axis((0, kernel_h), name="axis_kernel_h")
    axis_kernel_w = akg.tvm.reduce_axis((0, kernel_w), name="axis_kernel_w")

    out_shape = (batch_size, c1, out_size_h, out_size_w, c0)
    dividor = akg.tvm.const(kernel_h * kernel_w, dtype)
    res = akg.tvm.compute(
        out_shape,
        lambda n, c1, h, w, c0: akg.tvm.sum(
            pad2d[n, c1, h * stride_h + axis_kernel_h, w * stride_w + axis_kernel_w, c0],
            axis=[axis_kernel_h, axis_kernel_w]),
        name="res")
    res_value = akg.tvm.compute(
        out_shape,
        lambda n, c1, h, w, c0: res[n, c1, h, w, c0] / dividor,
        name="res_value")
    return res_value, attrs
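# Reference helper (added for illustration, not part of the original source). It reproduces the
# 'SAME' padding arithmetic from the docstring example above in plain Python:
# total pad = (ceil(in / s) - 1) * s + k - in, split evenly with the extra element on the tail.
# For in=10, k=5, s=4 this yields (1, 2), matching "0(pad1) | ... | 0(pad2) 0(pad3)".
# The real values come from cal_pad_shapes_by_strategy.
def _example_avgpool_same_padding(in_size, kernel, stride):
    """Illustrative helper only: head/tail pad amounts for the 'SAME' strategy along one axis."""
    import math
    out_size = int(math.ceil(in_size / stride))
    total_pad = max((out_size - 1) * stride + kernel - in_size, 0)
    pad_head = total_pad // 2
    pad_tail = total_pad - pad_head
    return pad_head, pad_tail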