def compute_blockdim(shape, axis, dtype):
    # strategy: all the shape except reduce axis can be used for multicore
    blockdim_limit = 2 if utils.product_is_mini() else 32
    blockdim = 1
    if isinstance(shape, int):
        shape = [shape]
    if not isinstance(axis, list):
        axis = list(axis)
    # normalize negative axes into the [0, len(shape)) range
    axis = [a + len(shape) if a < 0 else a for a in axis]
    axis = sorted(axis)
    red_sh = 1
    if isinstance(shape, (list, tuple)):
        for i, sh in enumerate(shape):
            if not isinstance(sh, int):
                raise TypeError("Shape to compute blockdim must be a list/tuple of integers")
            if i in axis:
                red_sh *= sh
            else:
                blockdim = blockdim * sh
    else:
        raise TypeError("Shape to compute blockdim must be a list/tuple of integers")
    if red_sh < 32 / get_bytes(dtype):
        # when the reduce axis is too small, multicore may not always increase performance
        blockdim = 1
    return min(blockdim_limit, blockdim)
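# Usage sketch (illustrative values; the actual cap depends on utils.product_is_mini()):
#   compute_blockdim([8, 1024, 256], [1], "float16")
#   -> the non-reduce dims give 8 * 256 = 2048 candidate blocks; the reduce axis has 1024
#      elements, which is >= 32 / get_bytes("float16") = 16, so multicore stays enabled and
#      the result is min(blockdim_limit, 2048), i.e. 32 on cloud or 2 on mini.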
def shape_dtype_max_size_check(shape, dtype):
    """Check that a tensor of the given shape and dtype does not exceed the max data size."""
    if shape:
        for x in shape:
            if not isinstance(x, int):
                return
        mul = get_bytes(dtype) * int(reduce(lambda x, y: int(x) * int(y), shape))
        if mul > MAX_DATA_SIZE:
            error_msg = "*".join([str(sh) for sh in shape])
            raise RuntimeError("Invalid shape, data is {} bytes ({}), which "
                               "exceeds max data size {} bytes"
                               .format(mul, error_msg, MAX_DATA_SIZE))
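# Worked example (sketch; assumes MAX_DATA_SIZE is the 2G limit referenced in conv_shape_check below):
#   shape_dtype_max_size_check([1024, 1024, 1024], "float32")
#   -> 4 bytes * 1024 * 1024 * 1024 = 4 GiB exceeds the limit, so a RuntimeError is raised;
#      shapes containing symbolic (non-int) dims skip the check and the function simply returns.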
def conv_shape_check(shape):
    if (not isinstance(shape, (tuple, list))) or (len(shape) != 4):
        raise RuntimeError("conv tensor shape should be a 4d list or tuple")
    conv_dtype = "float16"
    size = get_bytes(conv_dtype)
    for i in shape:
        if (not isinstance(i, int)) or (i <= 0):
            raise RuntimeError("conv tensor shape should be a 4d list or "
                               "tuple of positive integers")
        size *= i
    if size > MAX_DATA_SIZE:
        raise RuntimeError("runtime can not support tensors of more than 2G size")
def get_input_pad_shape(shape, dtype):
    """Function for getting the input pad shape."""
    pad_unit = ft_util.get_bytes(dtype, allow_none=True)
    if pad_unit is None:
        logging.warning("%s is not supported in TensorAddPad, the result is undefined.", dtype)
        return shape
    lastdim = int(math.ceil(shape[-1] / pad_unit) * pad_unit)
    pad_shape = [*shape[:-1], '{},{}'.format(shape[-1], lastdim)] if lastdim != shape[-1] else shape
    return pad_shape
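# Worked example (sketch; pad_unit comes from ft_util.get_bytes, e.g. 2 for float16):
#   get_input_pad_shape([16, 31], "float16")
#   -> the last dim 31 is rounded up to ceil(31 / 2) * 2 = 32, so the returned pad shape is
#      [16, '31,32'], encoding both the original and the padded last dim; an already aligned
#      shape such as [16, 32] is returned unchanged.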
def compute_blockdim(shape, axis, dtype):
    # strategy: all the shape before reduce axis can be used for multicore
    blockdim_limit = 2 if utils.product_is_mini() else 32
    blockdim = 1
    if isinstance(shape, int):
        shape = [shape]
    if axis < 0:
        axis += len(shape)
    if isinstance(shape, (list, tuple)):
        for i, sh in enumerate(shape):
            if not isinstance(sh, int):
                raise TypeError("Shape to compute blockdim must be a list/tuple of integers")
            if i == axis:
                if sh < 32 / get_bytes(dtype):
                    # when the reduce axis is too small, multicore may not always increase performance
                    blockdim = 1
                break
            blockdim = blockdim * sh
    else:
        raise TypeError("Shape to compute blockdim must be a list/tuple of integers")
    return min(blockdim_limit, blockdim)
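# Usage sketch for this variant (only dims before the reduce axis contribute):
#   compute_blockdim([64, 16, 1024], 1, "float32")  -> 64 candidate blocks, capped at blockdim_limit
#   compute_blockdim([64, 4, 1024], 1, "float32")   -> the reduce axis has 4 < 32 / 4 elements,
#                                                      so multicore is disabled and 1 is returned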
def four2five(data, format_, dst_dtype='float16', need_custom_tiling=True):
    """
    Convert 4-dims "data" to 5-dims, the format of "data" is defined in "format_".

    Args:
        data (tvm.tensor.Tensor): 4-dims tensor of type float16, float32
        format_ (str): a str defined the format of "data"
        dst_dtype (str): a str defined the type of output, could be float16 or float32

    Returns:
        5-dims tvm.tensor.Tensor, whose type is defined by dst_dtype and whose shape is
        [N, ceil(C / 16), H, W, 16], together with attrs about tiling args.

    Raises:
        ValueError: If the type of format_ is invalid.
    """
    # Check dtype
    vc_util.ops_dtype_check(data.dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    # Check shape
    shape = get_shape(data)
    vc_util.davinci_format_check(shape, format_, dim=4)
    # Check format
    if format_ not in ['NCHW', 'NHWC']:
        raise ValueError("{} format is not supported, four2five only supports NCHW and NHWC format input"
                         .format(format_))

    last_channel = 16
    if format_ == "NCHW":
        bs, c, h, w = get_shape(data)
    else:
        bs, h, w, c = get_shape(data)
    pad_c = c
    if c % last_channel != 0:
        pad_c = (c + 15) // last_channel * last_channel
    c1 = pad_c // last_channel
    c0 = last_channel
    is_dynamic = ds.shape_is_dynamic(data)
    if not is_dynamic:
        attrs = get_attrs()
    else:
        attrs = get_dynamic_attrs()
    # Check size of c when casting happens
    if data.dtype != dst_dtype and c0 * c1 >= C_LIMIT_FOR_CAST:
        raise ValueError("When input and output data type is not matched, shape of 'c' axis should not exceed {}, "
                         "while currently set is {}".format(C_LIMIT_FOR_CAST, c0 * c1))

    @script(capture=locals())
    def nchw_to_nc1hwc0_step(inputs, bs, c1, h, w, c0):
        output = allocate((bs, c1, h, c0, w), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            output[n_i, c_i, h_i, c_i0, w_i] = inputs[n_i, c_i * last_channel + c_i0, h_i, w_i]
        output1 = allocate((bs, c1, h, w, c0), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            output1[n_i, c_i, h_i, w_i, c_i0] = output[n_i, c_i, h_i, c_i0, w_i]
        return output1

    @script(capture=locals())
    def nchw_to_nc1hwc0(inputs, bs, c1, h, w, c0):
        output = allocate((bs, c1, h, w, c0), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            output[n_i, c_i, h_i, w_i, c_i0] = inputs[n_i, c_i * last_channel + c_i0, h_i, w_i]
        return output

    @script(capture=locals())
    def nhwc_to_nc1hwc0(inputs, zero, bs, c1, h, w, c0):
        output = allocate((bs, c1, h, w, c0), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            if c_i * last_channel + c_i0 < c:
                                output[n_i, c_i, h_i, w_i, c_i0] = inputs[n_i, h_i, w_i, c_i * last_channel + c_i0]
                            else:
                                output[n_i, c_i, h_i, w_i, c_i0] = zero
        return output

    cast_data = data
    need_cast = data.dtype == 'float32' and dst_dtype == 'float16'
    if c % last_channel != 0 or need_cast:
        expansion = int(ct_util.BLOCK_SIZE / get_bytes(data.dtype))
    else:
        expansion = None
    # float32 -> float16, need to cast before transform
    if need_cast:
        cast_data = akg.lang.cce.cast_to(data, dst_dtype)

    zero_ = akg.tvm.const(0.0, cast_data.dtype)
    if format_ == "NCHW":
        if c % last_channel != 0:
            pad_shape = [bs, pad_c, h, w]
            if h == 1 and w == 1:
                # if h and w both are 1, it is a pad-last-dim case
                output_shape = [bs, pad_c // last_channel, h, w, last_channel]
                output = akg.tvm.compute(output_shape,
                                         lambda i, c1, k, l, c0: akg.tvm.expr.Select(
                                             c0 < c - c1 * last_channel,
                                             cast_data[i, c1 * last_channel + c0, k, l],
                                             akg.tvm.const(0, cast_data.dtype)),
                                         name="output")
            else:
                # if need to pad c dim, separate the transpose into two steps:
                # first is nchw -> nc1hc0w, second is nc1hc0w -> nc1hwc0
                pad_data = akg.tvm.compute(pad_shape,
                                           lambda i, j, k, l: akg.tvm.expr.Select(j < c, cast_data[i, j, k, l], zero_),
                                           name="pad_data")
                output = nchw_to_nc1hwc0_step(pad_data, to_tvm_const(bs), to_tvm_const(c1),
                                              to_tvm_const(h), to_tvm_const(w), to_tvm_const(c0))
        else:
            if not is_dynamic and data.dtype == "float16" and h * w % last_channel == 0 and h * w < 3600:
                output_shape = [bs, c1, h, w, c0]
                output = akg.tvm.compute(output_shape,
                                         lambda n, c1, h, w, c0: akg.lang.cce.four2five_nchw(
                                             cast_data[n, c1 * last_channel + c0, h, w]),
                                         name="output")
            else:
                output = nchw_to_nc1hwc0(cast_data, to_tvm_const(bs), to_tvm_const(c1),
                                         to_tvm_const(h), to_tvm_const(w), to_tvm_const(c0))
    else:
        if not is_dynamic and c < last_channel:
            rank = 5  # (n, c1, h, w, c0)
            pad_before = []
            pad_after = []
            for _ in range(rank):
                pad_before.append(0)
                pad_after.append(0)
            pad_after[-1] = last_channel - c
            # As c < last_channel, c1 is 1
            output = akg.tvm.compute((bs, c1, h, w, c),
                                     lambda bs_i, _, h_i, w_i, c_i: cast_data[bs_i, h_i, w_i, c_i],
                                     name="output")
            output = tvm_pad(output, pad_before, pad_after=pad_after, name='pad_output')
        else:
            output = nhwc_to_nc1hwc0(cast_data, zero_, to_tvm_const(bs), to_tvm_const(c1),
                                     to_tvm_const(h), to_tvm_const(w), to_tvm_const(c0))

    # float16 -> float32, need to cast after transform
    if data.dtype == 'float16' and dst_dtype == 'float32':
        output = akg.lang.cce.cast_to(output, dst_dtype)

    vc_util.davinci_format_check(output.shape, "NC1HWC0", dim=5)

    if not is_dynamic:
        dim_info, _ = four2five_set_dim_func(data, format_, dst_dtype)
        if dim_info != "":
            attrs["dim"] = dim_info
        if need_custom_tiling:
            attrs["custom_tiling"] = four2five_tiling_strategy(output, format_, expansion)
    elif need_custom_tiling:
        attrs["custom_tiling"] = four2five_tiling_strategy_dynamic(output, format_)

    if is_dynamic:
        attrs["enable_feature_library_pre_poly"] = True
    return output, attrs
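# Usage sketch (hypothetical input; assumes the surrounding akg/tvm imports of this module):
#   x = akg.tvm.placeholder((2, 35, 7, 7), dtype="float16", name="x")
#   out, attrs = four2five(x, "NCHW")
#   # out has shape (2, ceil(35 / 16), 7, 7, 16) = (2, 3, 7, 7, 16); the padded channel tail
#   # is filled with zeros, and attrs carries the dim/custom_tiling info used by the build step.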
def gen_random_shape(shape_dim, slope=0, min_value=None, max_value=None):
    """
    Generate a list of random integers with length shape_dim, within range [min_value, max_value].

    Args:
        shape_dim : length of the output random shape
        slope : only represents the tendency of the random shape's values, not the mathematical
                slope of the random shape;
                slope = -1 tends to generate a random shape list with the largest value at the
                beginning and the smallest value at the end
                slope = 0 tends to generate a random shape list with nearly average values
                slope = 1 tends to generate a random shape list with the smallest value at the
                beginning and the largest value at the end
    """
    if shape_dim <= 0:
        raise ValueError("Shape dim should be positive.")

    def _build_limit(limit, default):
        if limit is None:
            limit = default
        res = list()
        nonlocal shape_dim
        if isinstance(limit, (tuple, list)):
            if len(limit) != shape_dim:
                raise ValueError("Min/Max value should have same length with shape_dim")
            res = limit
        elif isinstance(limit, int):
            res = [limit] * shape_dim
        else:
            raise TypeError("Min/Max value should be int or list of int with same length of shape_dim")
        return res

    device_limit = MAX_DATA_SIZE // get_bytes("float32")
    if max_value is None and shape_dim > 1:
        limit_avg = int(math.pow(device_limit, 1 / shape_dim))
        if slope == 0:
            max_value = [limit_avg] * shape_dim
        else:
            # keep ratio as a plain list so that reverse() below is valid
            ratio = np.arange(-1 / 2, 1 / 2 + 1 / shape_dim, 1 / shape_dim).tolist()
            if len(ratio) > shape_dim:
                new_ratio = list()
                for i, r in enumerate(ratio):
                    if i == len(ratio) // 2 - 1:
                        new_ratio.append(0)
                    elif i == len(ratio) // 2:
                        continue
                    else:
                        new_ratio.append(r)
                ratio = new_ratio
            if slope == -1:
                ratio.reverse()
            max_value = list()
            for r in ratio:
                max_value.append(int((1 + r) * limit_avg))
    shape_min = _build_limit(min_value, 1)
    shape_extent = _build_limit(max_value, device_limit)
    random_shape = list()
    for mn, mx in zip(shape_min, shape_extent):
        random_shape.append(random.randint(mn, mx))
    return random_shape
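# Usage sketch (results are random; values shown are only illustrative):
#   gen_random_shape(3, slope=1)                            -> e.g. [7, 583, 1024], values tending to grow
#   gen_random_shape(2, min_value=16, max_value=[64, 512])  -> dims drawn from [16, 64] and [16, 512]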
def five2four(data, shape4d, dst_type, format_):
    """
    Convert 5-dims "data" to 4-dims, the format of "data" is defined in "format_".

    Args:
        data (tvm.tensor.Tensor): 5-dims tensor of type float16, float32
        shape4d (Union[list, tuple]): a list of 4 nums, shape of the output Tensor
        dst_type (str): data type of the output Tensor
        format_ (str): a str defined the format of returns, supports NCHW and NHWC

    Returns:
        4-dims tvm.tensor.Tensor.
    """
    vc_util.ops_dtype_check([data.dtype, dst_type], vc_util.DtypeForDavinci.ALL_FLOAT)
    shape5d = get_shape(data)
    if not shape_is_dynamic(data):
        if len(shape5d) != 5 or shape5d[-1] != 16:
            raise ValueError("five2four_cce only supports 5-dim data whose last dim is 16")

    bs, c1, h, w, c0 = shape5d
    if not shape_is_dynamic(data):
        vc_util.davinci_format_check(shape5d, "NC1HWC0", dim=5)
    # Check format
    if format_ not in ['NCHW', 'NHWC']:
        raise ValueError("{} format is not supported, five2four only supports NCHW and NHWC format input"
                         .format(format_))
    if format_ == "NCHW":
        if shape_is_dynamic(data):
            shape4d = [bs, c1 * c0, h, w]
        _, c, h_4d, w_4d = shape4d
    else:
        if shape_is_dynamic(data):
            shape4d = [bs, h, w, c1 * c0]
        _, h_4d, w_4d, c = shape4d
    vc_util.davinci_format_check(shape4d, format_, dim=4)

    # Check whether shape4d and shape5d match
    if False not in [isinstance(s, (int, akg.tvm.expr.IntImm)) for s in shape5d]:
        if h_4d != h or w_4d != w:
            raise ValueError("five2four_cce's shape4d h and w should equal to data shape's h and w")
        if c > c1 * c0 or c <= (c1 - 1) * c0:
            raise ValueError("five2four_cce's shape4d c should be in range ((c1 - 1) * c0, c1 * c0]")

    # Check size of c when casting happens
    if not shape_is_dynamic(data):
        if data.dtype != dst_type and c >= C_LIMIT_FOR_CAST:
            raise ValueError("When input and output data type is not matched, shape of 'c' axis should not exceed {}, "
                             "while currently set is {}".format(C_LIMIT_FOR_CAST, c))

    @script(capture=locals())
    def nc1hwc0_to_nhwc(inputs, bs, h, w, c, c1, c0):
        output = allocate((bs, h, w, c), inputs.dtype, "local")
        for n_i in range(bs):
            for h_i in range(h):
                for w_i in range(w):
                    for c_i in range(c1):
                        for c_i0 in range(c0):
                            output[n_i, h_i, w_i, c_i * c0 + c_i0] = inputs[n_i, c_i, h_i, w_i, c_i0]
        return output

    @script(capture=locals())
    def nc1hwc0_to_nchw(inputs, bs, h, w, c, c1, c0):
        output = allocate((bs, c, h, w), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            output[n_i, c_i * c0 + c_i0, h_i, w_i] = inputs[n_i, c_i, h_i, w_i, c_i0]
        return output

    # if c % 16 == 0 and both h and w == 1, five2four is a reshape operation
    if shape_is_dynamic(data):
        call_reshape = isinstance(h, int) and isinstance(w, int) and h == 1 and w == 1
    else:
        call_reshape = h == 1 and w == 1 and c % 16 == 0

    c_value = None
    expansion = None
    if format_ == "NHWC":
        if call_reshape:
            output = akg.topi.reshape(data, (bs, h, w, c))
            if shape_is_dynamic(data):
                output = akg.tvm.compute((bs, h, w, c), lambda *indice: output(*indice), name="reshape")
        elif c < c0:
            reshape_output = akg.topi.reshape(data, (bs, h, w, c0))
            output = akg.tvm.compute((bs, h, w, c), lambda *i: reshape_output(*i), name='slice_output')
        else:
            output = nc1hwc0_to_nhwc(data, to_tvm_const(bs), to_tvm_const(h), to_tvm_const(w),
                                     to_tvm_const(c), to_tvm_const(c1), to_tvm_const(c0))
    else:
        if call_reshape:
            output = akg.topi.reshape(data, (bs, c, h, w))
            if shape_is_dynamic(data):
                output = akg.tvm.compute((bs, c, h, w), lambda *indice: output(*indice), name="reshape")
        else:
            output = nc1hwc0_to_nchw(data, to_tvm_const(bs), to_tvm_const(h), to_tvm_const(w),
                                     to_tvm_const(c), to_tvm_const(c1), to_tvm_const(c0))

    # two special cases for tiling strategy
    if not shape_is_dynamic(data):
        if c < c0 or output.dtype != dst_type:
            c_value = c
        if c % c0 != 0 and output.dtype != dst_type:
            expansion = int(ct_util.BLOCK_SIZE / get_bytes(data.dtype))

    attrs = get_attrs()
    if not call_reshape:
        attrs["custom_tiling"] = five2four_tiling_strategy(data, c_value, expansion)

    if output.dtype != dst_type:
        output = akg.topi.cast(output, dst_type)
    return output, attrs
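# Usage sketch (the counterpart of the four2five example above; hypothetical input):
#   y = akg.tvm.placeholder((2, 3, 7, 7, 16), dtype="float16", name="y")
#   out, attrs = five2four(y, [2, 35, 7, 7], "float16", "NCHW")
#   # out recovers a (2, 35, 7, 7) NCHW tensor; the shape check requires c = 35 to lie in
#   # ((c1 - 1) * c0, c1 * c0] = (32, 48].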