def mean_v2(data, axis=None, keepdims=False, target=utils.CCE):
    """
    Compute the mean of `data` over `axis` as a sum scaled by 1 / count.

    The number of reduced elements is computed on the host from the shape
    (every dimension is read through its `.value` attribute), and the
    result of `sum_v2` is multiplied by the constant reciprocal of it.

    Args:
        data (tvm.tensor.Tensor): Input tensor; its dtype is checked against
            utils.DtypeForDavinci.ALL_FLOAT.
        axis (Union[list, tuple, int, None]): Axes to reduce. Validated by
            utils.reduce_axis_check and normalized by
            ft_util.refine_reduce_axis.
        keepdims (bool): Forwarded to sum_v2.
        target (str): Forwarded to sum_v2, default is utils.CCE.

    Returns:
        Tuple of (mean tensor, attrs dict). The attrs dict additionally
        holds a "custom_tiling" entry when the shape of data is dynamic.

    Supported Platforms:
        'Ascend'
    """
    # Validate the input dtype.
    utils.ops_dtype_check(data.dtype, utils.DtypeForDavinci.ALL_FLOAT)

    # Validate the shape / axis combination and normalize the axis.
    dims = [dim.value for dim in data.shape]
    utils.reduce_axis_check(dims, axis)
    axis = ft_util.refine_reduce_axis(data, axis)

    # Number of elements folded into every output element.
    reduced_size = 1
    for ax in axis:
        reduced_size = reduced_size * dims[ax]
    reciprocal = 1 / reduced_size

    total = sum_v2(data, axis, keepdims, target=target)
    res = total * akg.tvm.const(reciprocal, data.dtype)

    attrs = get_attrs(data)
    if shape_is_dynamic(data):
        attrs["custom_tiling"] = mean_dynamic_tiling_strategy(data, axis)
    return res, attrs
def mean(data, axis=None, keepdims=False, target=utils.CCE):
    """
    Computes the mean of the values of a Tensor over the given axes.

    The input is summed over `axis` and the sum is divided by the number of
    reduced elements. For dynamic shapes the division is emitted through
    akg.lang.ascend.divide_var, otherwise through akg.topi.divide.

    Note:
        The axis is validated by utils.reduce_axis_check and normalized by
        ft_util.refine_reduce_axis before use.
        NOTE(review): earlier documentation mentioned a special pre-division
        for float16 inputs with 65536 or more reduced elements; no such
        handling is visible in this function - confirm whether it lives in
        the callee.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16, float32.
        axis (Union[list, tuple, int, None]): Axes to reduce.
        keepdims (bool): Forwarded to the sum operator.
        target (str): Forwarded to the sum operator, default is utils.CCE.

    Returns:
        Tuple of (mean tensor, attrs dict). The attrs dict additionally
        holds a "custom_tiling" entry when the shape of data is dynamic.

    Supported Platforms:
        'Ascend'
    """
    # Validate the input dtype.
    utils.ops_dtype_check(data.dtype, utils.DtypeForDavinci.ALL_FLOAT)

    # Validate the shape / axis combination and normalize the axis.
    dims = ft_util.get_shape(data)
    utils.reduce_axis_check(dims, axis)
    axis = ft_util.refine_reduce_axis(data, axis)

    # Number of elements folded into every output element.
    reduced_size = 1
    for ax in axis:
        reduced_size = reduced_size * dims[ax]

    total = sum(data, axis, keepdims, target=target)

    if not shape_is_dynamic(data):
        res = akg.topi.divide(total, reduced_size)
    else:
        res = akg.tvm.compute(
            total.shape,
            lambda *idx: akg.lang.ascend.divide_var(total(*idx), reduced_size),
            name="res")

    attrs = get_attrs(data)
    if shape_is_dynamic(data):
        attrs["custom_tiling"] = mean_dynamic_tiling_strategy(data, axis)
    return res, attrs
def Softmax(data, axis, target=utils.CCE):
    """
    Map all element of data to (0,1) and sum to 1 along one axis.

    Args:
        data (tvm.tensor.Tensor): input, dtype checked against
            utils.DtypeForDavinci.ALL_FLOAT.
        axis (int): along which normalization is applied. After
            normalization by ft_util.refine_reduce_axis it must describe
            exactly one axis.
        target (str): default is utils.CCE; not read by this function body.

    Return:
        Tuple of (output tensor, attrs dict). The attrs dict is empty for
        static shapes and carries dynamic-shape pragmas, bounds and a
        custom tiling otherwise.

    Raises:
        RuntimeError: If the normalized axis is a list/tuple whose length
            is not 1.

    Supported Platforms:
        'Ascend'
    """
    utils.check_shape(data.shape)
    shape = data.shape
    utils.ops_dtype_check(data.dtype, utils.DtypeForDavinci.ALL_FLOAT)
    utils.reduce_axis_check(shape, axis)
    axis = ft_util.refine_reduce_axis(data, axis)

    # Softmax reduces over a single axis only: unwrap a one-element sequence.
    if isinstance(axis, (list, tuple)):
        axis_num = len(axis)
        if axis_num != 1:
            raise RuntimeError(
                "Reduce axis for softmax op must be 1-dimension, while current is %d-dimension"
                % (axis_num))
        axis = axis[0]

    output = softmax_op(data, axis, shape)

    # Static shapes need no extra attributes.
    if not ds.shape_is_dynamic(data):
        return output, {}

    # For shifted loops, should have:
    #     dynamic_shape_bound mod tile_size_prime == 2
    # This aims to ensure that the shift constant is a multiple of tile_size_prime.
    # So the generated IR will not have complicated head and tail for shifted blocks.
    dynamic_shape = (ds.set_dynamic_shape_limit_for_tensor(output, 2048, axis)
                     + ds.set_poly_upper_bound_for_tensor(output, 2048, axis))

    # Every axis except the reduced one gets a tiling factor of 1.
    free_dims = [pos for pos, _ in enumerate(shape) if pos != axis]
    tiling = ct.create_constraint_on_tensor(
        tensor=output,
        values=[1] * len(free_dims),
        constraints=ct.TileConstraint.FACTOR,
        tensor_pos=free_dims)

    attr_map = {
        "pragma_modshift": 1,
        "pragma_outerband_need_split": 1,
        "enable_post_poly_loop_partition": False,
        "pragma_disable_whole_component": False,
        "dynamic_shape": dynamic_shape,
        "custom_tiling": tiling,
    }
    return output, attr_map
def get_attrs(tensor):
    """
    Build the default attrs config.

    Args:
        tensor: Object handed to shape_is_dynamic to decide whether the
            dynamic-shape-only entry is added.

    Returns:
        dict, a fresh attribute map. "pragma_analyze_reuse_buffer" is set to
        True only when shape_is_dynamic(tensor) is truthy.
    """
    config = {
        "pragma_checkcoincident": 0,
        "pragma_modshift": 1,
        "disable_cse": 1,
        "enable_bisect_optimize": 0,
        "enable_remove_broadcast_copy": True,
    }
    if not shape_is_dynamic(tensor):
        return config
    config["pragma_analyze_reuse_buffer"] = True
    return config
def five2four_tiling_strategy(tensor, c_value=None, expansion=None):
    """
    Custom tiling strategy for five2four op.

    Args:
        tensor: Tensor the constraints are created on.
        c_value: When None, the NC1HWC0 tile template is used. Otherwise
            (static shapes only) it is raised to at least 16 and used as the
            tiling factor of position 4.
        expansion: When truthy, a SET_EXPANSION constraint with this value
            is appended.

    Returns:
        list of tiling constraints.
    """
    if c_value is None:
        strategy = ct_util.create_template(
            tensor=tensor, template=ct_util.TileTemplate.NC1HWC0)
    elif shape_is_dynamic(tensor):
        # No per-axis constraints for dynamic shapes with an explicit c_value.
        strategy = []
    else:
        if c_value < 16:
            c_value = 16
        batch_node = ct_util.create_constraint_on_tensor(
            tensor=tensor,
            values=1,
            constraints=ct_util.TileConstraint.FACTOR,
            tensor_pos=0)
        c1_node = ct_util.create_constraint_on_tensor(
            tensor=tensor,
            values="FULL",
            constraints=ct_util.TileConstraint.MAX,
            tensor_pos=1)
        c0_node = ct_util.create_constraint_on_tensor(
            tensor=tensor,
            values=c_value,
            constraints=ct_util.TileConstraint.FACTOR,
            tensor_pos=4)
        strategy = batch_node + c1_node + c0_node

    if expansion:
        expansion_nodes = ct_util.create_constraint_on_tensor(
            tensor=tensor,
            values=expansion,
            constraints=ct_util.TileConstraint.SET_EXPANSION)
        strategy.append(expansion_nodes[0])

    if shape_is_dynamic(tensor):
        # axis should be full tiled due to cast operator
        mem_ratio = ct_util.modify_common_constraints(
            value=0.85, constraint=ct_util.TileConstraint.SET_MEM_RATIO)
        strategy.append(mem_ratio)
    return strategy
def _reshape_ascend(data, out_shape):
    """
    Rearranges input tensor data to new shape out_shape.

    Args:
        data (tvm.tensor.Tensor): The tensor to be reshaped, dtype int32 or
            one of the float types accepted by utils.DtypeForDavinci.
        out_shape (list, tuple): The new shape applied on the input tensor
            data. If it contains -1 the final shape is resolved by
            get_out_shape; otherwise, for static inputs, its element count
            must equal that of data.

    Returns:
        The reshaped akg.tvm.tensor of same type as input tensor data.

    Raises:
        ValueError: If the input is static, out_shape contains no -1 and the
            element counts differ.

    Supported Platforms:
        'Ascend'
    """
    def _volume(dims):
        # Product of all entries of a shape.
        return reduce(lambda lhs, rhs: lhs * rhs, dims)

    supported_dtypes = (utils.DtypeForDavinci.INT32.value
                        + utils.DtypeForDavinci.ALL_FLOAT.value)
    utils.ops_dtype_check(data.dtype, supported_dtypes)
    utils.check_shape(data.shape)

    in_shape = get_shape(data)
    out_shape = list(out_shape)
    is_dynamic = ds.shape_is_dynamic(data)

    if -1 in out_shape:
        out_shape = get_out_shape(in_shape, out_shape)
    elif not is_dynamic and _volume(in_shape) != _volume(out_shape):
        raise ValueError(
            "the total length of out_shape is not equal to the in_shape")

    # Copy in, reshape, copy out.
    inputs = akg.tvm.compute(in_shape, lambda *idx: data(*idx), name="inputs")
    reshaped = akg.topi.reshape(inputs, out_shape)
    return akg.tvm.compute(out_shape, lambda *idx: reshaped(*idx), name="reshape")
def reshape(data, out_shape):
    """
    Rearranges input tensor data to new shape out_shape.

    Args:
        data (tvm.tensor.Tensor): The tensor to be reshaped, dtype int32 or
            one of the float types accepted by DtypeForDavinci.
        out_shape (list, tuple): The new shape applied on the input tensor
            data, should be compatible with the original shape of data. At
            most one entry may be -1; it is replaced by the size that keeps
            the total number of elements unchanged.

    Returns:
        Tuple of (reshaped akg.tvm.tensor of same type as input tensor data,
        empty attrs dict).

    Raises:
        ValueError: If out_shape holds more than one -1, if the known
            dimensions do not evenly divide the number of input elements
            (including a zero-sized known dimension), or if - for static
            inputs without -1 - the element counts differ.
    """
    ops_dtype_check(data.dtype, DtypeForDavinci.INT32.value + DtypeForDavinci.ALL_FLOAT.value)
    data_shape = data.shape
    check_shape(data_shape)

    in_shape = get_shape(data)
    out_shape = list(out_shape)
    is_dynamic = ds.shape_is_dynamic(data)

    if -1 in out_shape:
        # Only a single dimension can be inferred; previously extra -1 entries
        # were silently left in the resulting shape.
        if out_shape.count(-1) > 1:
            raise ValueError(("Invalid out_shape ({})".format(out_shape)))
        hit_idx = out_shape.index(-1)

        # Product of all explicitly given dimensions.
        access_size = 1
        for i, o_shape in enumerate(out_shape):
            if i != hit_idx:
                access_size *= o_shape

        ori_size = reduce(lambda x, y: x * y, in_shape)
        # A zero-sized known dimension cannot be used as a divisor; report it
        # as an invalid shape instead of leaking a ZeroDivisionError.
        if access_size == 0 or ori_size % access_size != 0:
            raise ValueError(("Invalid out_shape ({})".format(out_shape)))
        # Floor division keeps the result exact; true division goes through
        # float and loses precision for very large element counts.
        out_shape[hit_idx] = int(ori_size // access_size)
    elif not is_dynamic:
        if reduce(lambda x, y: x * y, in_shape) != reduce(lambda x, y: x * y, out_shape):
            raise ValueError("the total length of out_shape is not equal to the in_shape")

    # Copy in, reshape, copy out.
    inputs = akg.tvm.compute(in_shape, lambda *indice: data(*indice), name="inputs")
    res = akg.topi.reshape(inputs, out_shape)
    output = akg.tvm.compute(out_shape, lambda *indice: res(*indice), name="reshape")
    attr_map = {}
    return output, attr_map
def mean_v2(data, axis=None, keepdims=False):
    """
    Simple implementation of mean: sum over `axis` scaled by 1 / count.

    Args:
        data (tvm.tensor.Tensor): Input tensor; its dtype is checked against
            vc_util.DtypeForDavinci.ALL_FLOAT. Every dimension is read
            through its `.value` attribute.
        axis (Union[list, tuple, int, None]): Axes to reduce. Validated by
            vc_util.reduce_axis_check and normalized by
            ft_util.refine_reduce_axis.
        keepdims (bool): Forwarded to sum.sum_v2.

    Returns:
        Tuple of (mean tensor, attrs dict). The attrs dict additionally
        holds a "custom_tiling" entry when the shape of data is dynamic.
    """
    # Validate the input dtype.
    vc_util.ops_dtype_check(data.dtype, vc_util.DtypeForDavinci.ALL_FLOAT)

    # Validate the shape / axis combination and normalize the axis.
    dims = [dim.value for dim in data.shape]
    vc_util.reduce_axis_check(dims, axis)
    axis = ft_util.refine_reduce_axis(data, axis)

    # Number of elements folded into every output element.
    reduced_size = 1
    for ax in axis:
        reduced_size = reduced_size * dims[ax]
    reciprocal = 1 / reduced_size

    # sum_v2 returns a pair here; only the tensor is needed.
    total, _ = sum.sum_v2(data, axis, keepdims)
    res = total * akg.tvm.const(reciprocal, data.dtype)

    attrs = get_attrs(data)
    if shape_is_dynamic(data):
        attrs["custom_tiling"] = mean_dynamic_tiling_strategy(data, axis)
    return res, attrs
def four2five(data, format_, dst_dtype='float16', need_custom_tiling=True):
    """
    Convert 4-dims "data" to 5-dims, the format of "data" is defined in "format_".

    The channel axis is padded up to a multiple of 16 and split into
    (C1, C0 = 16); padded positions are filled with zero.

    Args:
        data (tvm.tensor.Tensor): 4-dims tensor of type float16, float32
        format_ (str): a str defined the format of "data", "NCHW" or "NHWC"
        dst_dtype (str): a str defined the type of output, could be float16 or float32
        need_custom_tiling (bool): when True, a "custom_tiling" entry is
            added to the returned attrs

    Returns:
        Tuple of (5-dims tvm.tensor.Tensor, attrs dict). The tensor type is
        defined by dst_dtype and its shape is [N, ceil(C / 16), H, W, 16];
        the dict holds the attrs about tiling args.

    Raises:
        ValueError: If format_ is neither "NCHW" nor "NHWC", or if input and
            output dtypes differ while the padded channel size reaches
            C_LIMIT_FOR_CAST.
    """
    # Check dtype
    vc_util.ops_dtype_check(data.dtype, vc_util.DtypeForDavinci.ALL_FLOAT)
    # Check shape
    shape = get_shape(data)
    vc_util.davinci_format_check(shape, format_, dim=4)

    # Check format
    if format_ not in ['NCHW', 'NHWC']:
        raise ValueError(
            "{} format is not support, four2five only support NCHW and NHWC format input"
            .format(format_))
    last_channel = 16
    if format_ == "NCHW":
        bs, c, h, w = get_shape(data)
    else:
        bs, h, w, c = get_shape(data)
    # Round the channel count up to the next multiple of last_channel.
    pad_c = c
    if c % last_channel != 0:
        pad_c = (c + 15) // last_channel * last_channel
    c1 = pad_c // last_channel
    c0 = last_channel
    is_dynamic = ds.shape_is_dynamic(data)
    if not is_dynamic:
        attrs = get_attrs()
    else:
        attrs = get_dynamic_attrs()
    # Check size c when casting happens
    if data.dtype != dst_dtype and c0 * c1 >= C_LIMIT_FOR_CAST:
        raise ValueError(
            "When input and output data type is not matched, shape of 'c' axis should not exceed {}, "
            "while currently set is {}".format(C_LIMIT_FOR_CAST, c0 * c1))

    # Two-step transpose: NCHW -> NC1HC0W -> NC1HWC0.
    @script(capture=locals())
    def nchw_to_nc1hwc0_step(inputs, bs, c1, h, w, c0):
        output = allocate((bs, c1, h, c0, w), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            output[n_i, c_i, h_i, c_i0, w_i] = inputs[n_i, c_i * last_channel + c_i0, h_i, w_i]
        output1 = allocate((bs, c1, h, w, c0), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            output1[n_i, c_i, h_i, w_i, c_i0] = output[n_i, c_i, h_i, c_i0, w_i]
        return output1

    # Single-step transpose: NCHW -> NC1HWC0 (no channel padding involved).
    @script(capture=locals())
    def nchw_to_nc1hwc0(inputs, bs, c1, h, w, c0):
        output = allocate((bs, c1, h, w, c0), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            output[n_i, c_i, h_i, w_i, c_i0] = inputs[n_i, c_i * last_channel + c_i0, h_i, w_i]
        return output

    # NHWC -> NC1HWC0; channel positions at or beyond c are filled with `zero`.
    @script(capture=locals())
    def nhwc_to_nc1hwc0(inputs, zero, bs, c1, h, w, c0):
        output = allocate((bs, c1, h, w, c0), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            if c_i * last_channel + c_i0 < c:
                                output[n_i, c_i, h_i, w_i, c_i0] = inputs[n_i, h_i, w_i, c_i * last_channel + c_i0]
                            else:
                                output[n_i, c_i, h_i, w_i, c_i0] = zero

        return output

    cast_data = data
    need_cast = data.dtype == 'float32' and dst_dtype == 'float16'
    # The expansion value is only passed on to the static tiling strategy
    # below; it is set when the channel needs padding or a down-cast happens.
    if c % last_channel != 0 or need_cast:
        expansion = int(ct_util.BLOCK_SIZE / get_bytes(data.dtype))
    else:
        expansion = None
    # float32 -> float16, need to cast before transform
    if need_cast:
        cast_data = akg.lang.cce.cast_to(data, dst_dtype)

    zero_ = akg.tvm.const(0.0, cast_data.dtype)
    if format_ == "NCHW":
        if c % last_channel != 0:
            pad_shape = [bs, pad_c, h, w]
            if h == 1 and w == 1:
                # if h and w both are 1, it is pad last dim case
                output_shape = [bs, pad_c // last_channel, h, w, last_channel]
                output = akg.tvm.compute(
                    output_shape,
                    lambda i, c1, k, l, c0: akg.tvm.expr.Select(
                        c0 < c - c1 * last_channel,
                        cast_data[i, c1 * last_channel + c0, k, l],
                        akg.tvm.const(0, cast_data.dtype)),
                    name="output")
            else:
                # if need to pad c dim, separate transpose to two steps
                # first is nchw -> nc1hc0w, second is nc1hc0w -> nc1hwc0
                pad_data = akg.tvm.compute(
                    pad_shape,
                    lambda i, j, k, l: akg.tvm.expr.Select(
                        j < c, cast_data[i, j, k, l], zero_),
                    name="pad_data")
                output = nchw_to_nc1hwc0_step(pad_data, to_tvm_const(bs), to_tvm_const(c1), to_tvm_const(h),
                                              to_tvm_const(w), to_tvm_const(c0))
        else:
            # Channel already a multiple of 16: static float16 inputs with a
            # small, 16-aligned h * w go through the four2five_nchw call,
            # everything else through the generic hybrid kernel.
            if not is_dynamic and data.dtype == "float16" and h * w % last_channel == 0 and h * w < 3600:
                output_shape = [bs, c1, h, w, c0]
                output = akg.tvm.compute(
                    output_shape,
                    lambda n, c1, h, w, c0: akg.lang.cce.four2five_nchw(
                        cast_data[n, c1 * last_channel + c0, h, w]),
                    name="output")
            else:
                output = nchw_to_nc1hwc0(cast_data, to_tvm_const(bs), to_tvm_const(c1), to_tvm_const(h),
                                         to_tvm_const(w), to_tvm_const(c0))
    else:
        if not is_dynamic and c < last_channel:
            rank = 5  # (n, c1, h, w, c0)
            pad_before = []
            pad_after = []
            for _ in range(rank):
                pad_before.append(0)
                pad_after.append(0)
            # Only the last axis is padded, up to last_channel entries.
            pad_after[-1] = last_channel - c
            # As c < last_channel, c1 is 1
            output = akg.tvm.compute(
                (bs, c1, h, w, c),
                lambda bs_i, _, h_i, w_i, c_i: cast_data[bs_i, h_i, w_i, c_i],
                name="output")
            output = tvm_pad(output, pad_before, pad_after=pad_after, name='pad_output')
        else:
            output = nhwc_to_nc1hwc0(cast_data, zero_, to_tvm_const(bs), to_tvm_const(c1), to_tvm_const(h),
                                     to_tvm_const(w), to_tvm_const(c0))

    # float16 -> float32, need to cast after transform
    if data.dtype == 'float16' and dst_dtype == 'float32':
        output = akg.lang.cce.cast_to(output, dst_dtype)

    vc_util.davinci_format_check(output.shape, "NC1HWC0", dim=5)

    # Static shapes may get a precomputed "dim" setting in addition to the
    # optional custom tiling; dynamic shapes only get the dynamic tiling.
    if not is_dynamic:
        dim_info, _ = four2five_set_dim_func(data, format_, dst_dtype)
        if dim_info != "":
            attrs["dim"] = dim_info
        if need_custom_tiling:
            attrs["custom_tiling"] = four2five_tiling_strategy(
                output, format_, expansion)
    elif need_custom_tiling:
        attrs["custom_tiling"] = four2five_tiling_strategy_dynamic(
            output, format_)

    if is_dynamic:
        attrs["enable_feature_library_pre_poly"] = True
    return output, attrs
def add(first_input, second_input, scale=1.0, polyhedral=True, attrs=None):
    """
    Computes first_input + second_input * scale elementwise.

    Args:
        first_input (tvm.tensor.Tensor): Tensor of type float16, float32, int32.
        second_input (tvm.tensor.Tensor): Tensor with same type as first_input.
                                          Broadcast will happen if shapes of input tensors are different
                                          (static shapes only).
        scale (float): scale factor applied on second_input, default value is 1.0.
        polyhedral (bool): If True, use auto-schedule, else use manual-schedule, default value is True.
        attrs (dict): Specifies parameters used in manual-schedule; the
                      schedule function reads attrs["tile"].

    Returns:
        When polyhedral is True: tuple of (result tensor, attrs dict).
        Otherwise: tuple of (result tensor, schedule function, attrs dict).
        The result tensor has the dtype of the inputs and the broadcast
        shape of the input tensors.

    Raises:
        RuntimeError: If shapes are dynamic and the input shapes differ.
        TypeError: If the input tensors have different data types.
    """
    vc_util.check_shape(first_input.shape)
    vc_util.check_shape(second_input.shape)
    attr_map = {}

    lhs_shape = get_shape(first_input)
    rhs_shape = get_shape(second_input)

    if not shape_is_dynamic([first_input, second_input]):
        # Static shapes: bring both operands to the common broadcast shape.
        if lhs_shape != rhs_shape:
            _, _, out_shape = produce_shapes(lhs_shape, rhs_shape)
        else:
            out_shape = lhs_shape
        first_broadcast = akg.topi.broadcast_to(first_input, out_shape)
        second_broadcast = akg.topi.broadcast_to(second_input, out_shape)
    else:
        # Dynamic shapes: operands are used as they are.
        if lhs_shape != rhs_shape:
            raise RuntimeError(
                "Input tensors have different shapes, broadcast is not supported for dynamic.")
        first_broadcast = first_input
        second_broadcast = second_input

    in_dtype = first_input.dtype
    if in_dtype != second_input.dtype:
        raise TypeError("Input tensors have different data types.")
    vc_util.ops_dtype_check(in_dtype, vc_util.DtypeForDavinci.ALL_TYPES)

    temp = vmuls(second_broadcast, scale)
    res = vadd(first_broadcast, temp)
    res_cast = res.astype(in_dtype)

    if polyhedral:
        return res_cast, attr_map

    def comp_func(s):
        """Manual schedule for the add computation on schedule `s`."""
        ub_scope = "local.UB"
        first_ub = s.cache_read(first_input, ub_scope, [first_broadcast])
        second_ub = s.cache_read(second_input, ub_scope, [second_broadcast])
        res_cast_ub = s.cache_write(res_cast, ub_scope)

        for stage in (first_broadcast, second_broadcast, temp, res):
            s[stage].set_scope(ub_scope)

        # Split every tiled axis of the output with the requested factor.
        split_axis = []
        for pos, factor in enumerate(attrs["tile"]):
            outer, inner = s[res_cast].split(res_cast.op.axis[pos], factor)
            split_axis.append({"outer": outer, "inner": inner})

        for stage in (first_ub, second_ub, first_broadcast, second_broadcast, temp):
            s[stage].compute_at(s[res], res.op.axis[0])
        s[res].compute_at(s[res_cast_ub], res_cast_ub.op.axis[0])
        s[res_cast_ub].compute_at(s[res_cast], split_axis[-1]['outer'])

        # no scaling needed
        if scale == 1:
            s[temp].compute_inline()

        # no broadcast needed
        if first_input_shape_equal():
            s[first_broadcast].compute_inline()
            s[second_broadcast].compute_inline()

    def first_input_shape_equal():
        # Evaluated when the schedule function runs, like the inline check it replaces.
        return lhs_shape == rhs_shape

    return res_cast, comp_func, attr_map
def get_attrs(tensor):
    """
    Generate default attrs.

    Args:
        tensor: Object handed to shape_is_dynamic.

    Returns:
        dict, empty for static shapes; for dynamic shapes it disables double
        buffering ("enable_double_buffer": 0) and enables
        "enable_divide_var".
    """
    attrs = {}
    if shape_is_dynamic(tensor):
        attrs["enable_double_buffer"] = 0
        attrs["enable_divide_var"] = 1
    return attrs
def fused_batch_norm(inputs, attrs):
    r"""
    Batch normalization.

    See Source:
    <a href="https://arxiv.org/abs/1502.03167">
        Batch Normalization: Accelerating Deep Network Training by Reducing
        Internal Covariate Shift; S. Ioffe, C. Szegedy.
    </a>

    .. math::
        \begin{array}{ll} \\
            \mu = \frac{1}{m} \sum^m_{i=1}{x_i} \\
            \sigma^2 = \frac{1}{m} \sum^m_{i=1}{(x_i-\mu)^2} \\
            \hat{x_i} = \frac{x_i - \mu}{ \sqrt{\sigma^2 + \epsilon} } \\
            y_i = \gamma \hat{x_i} + \beta \equiv BN_{\gamma, \beta}(x_i)
        \end{array}

    This momentum argument is different from one used in optimizer classes
    and the conventional notion of momentum. Mathematically, the update rule
    for running statistics here is

    .. math::
        \hat{z_{new}} = momentum \cdot \hat{z} + (1-momentum) \cdot z_t

    where :math:`\hat{z}` is the estimated statistic and :math:`z_t` is the
    new observed value.

    Note:
        When data_format is "NC1HWC0", the `gamma`, `beta`, `moving_mean`
        and `moving_variance` should be 5D tensors of shape
        `(1, C1, 1, 1, C0)`, otherwise, they should be 1D tensors
        of shape `(C,)`.

    Args:
        inputs: sequence of exactly five tensors, in this order:
            data (tvm.tensor.Tensor): Tensor of type float16, float32. (:math:`x_i`)
            gamma (tvm.tensor.Tensor): Tensor for scaling (:math:`\gamma`).
            beta (tvm.tensor.Tensor): Tensor for bias (:math:`\beta`).
            moving_mean (tvm.tensor.Tensor): Tensor for population mean used for
                                             inference.
            moving_variance (tvm.tensor.Tensor): Tensor for population variance used
                                                 for inference.
        attrs: dict read with the following keys:
            momentum (float): A float number used for the moving_mean and
                              moving_variance computation, default 0.99.
            eps (float): A small float added to variance to avoid dividing by zero,
                         default 1e-3.
            is_training (bool): A bool value to specify if the operation is used for
                                training or inference, default True.
            data_format (str): Support format, "DefaultFormat", "NCHW", "NHWC"
                               or "NC1HWC0".
            axis (Union[int, list, tuple]): Integer to specify the channel axis when
                                            data_format is "DefaultFormat". List or
                                            tuple for "NC1HWC0". When format is "NCHW"
                                            or "NHWC", it's not work.
                                            Must be in the range [-rank(data), rank(data)).
                                            Default 1.
            single_sum (bool): whether use "mul_axis_sum", default False.

    Returns:
        list. In training mode it holds, in order:
        outs, fake_moving_mean, fake_moving_var, sample_mean, sample_var,
        new_moving_mean, new_moving_variance, followed by the attrs dict.
        In inference mode it holds outs followed by the attrs dict.

        outs (tvm.tensor.Tensor): Tensor for normalized, scaled, shifted data.
        new_moving_mean (tvm.tensor.Tensor): Tensor of same type and shape as
                                             `moving_mean`. The `moving_mean`
                                             updated by data.
        new_moving_variance (tvm.tensor.Tensor): Tensor of same type and shape as
                                                 `moving_variance`. The
                                                 `moving_variance` updated by data.
        sample_mean (tvm.tensor.Tensor): Tensor of same type and shape as
                                         `moving_mean`. The mean of `data`.
        sample_var (tvm.tensor.Tensor): Tensor of same type and shape as
                                        `moving_variance`. The variance of `data`.

    Raises:
        ValueError: If the number of input tensors is not 5.
    """
    if len(inputs) != 5:
        raise ValueError(
            "Input tensors number should be 5, but get %s." % len(inputs))
    data_format = attrs.get("data_format", "DefaultFormat")
    params = check_inputs(inputs, data_format, attrs.get("axis", 1))

    data = inputs[0]
    gamma = inputs[1]
    beta = inputs[2]
    moving_mean = inputs[3]
    moving_variance = inputs[4]
    ori_dtype = data.dtype
    shape = get_shape(data)
    # Reduction axes, 5D flag and intermediate parameter shape come from the
    # input check; the fallbacks apply when a key is missing.
    axes = params.get("axes", (0,))
    keepdims = params.get("is_special5d", False)
    mid_shape = params.get("mid_shape", [1, ])
    data = akg.tvm.compute(data.shape, lambda *i: data(*i), "batchnorm_" + data_format)
    # Keep the original moving statistics for the in-place update below.
    ori_moving_mean = moving_mean
    ori_moving_variance = moving_variance
    # All arithmetic is carried out in float32.
    if ori_dtype != DTYPE_FLOAT32:
        data = akg.topi.cast(data, DTYPE_FLOAT32)
        gamma = akg.topi.cast(gamma, DTYPE_FLOAT32)
        beta = akg.topi.cast(beta, DTYPE_FLOAT32)
        moving_mean = akg.topi.cast(moving_mean, DTYPE_FLOAT32)
        moving_variance = akg.topi.cast(moving_variance, DTYPE_FLOAT32)

    ######## following is dsl ########
    is_training = attrs.get("is_training", True)
    if is_training:
        # Number of elements reduced per channel.
        value_num = 1
        for index in axes:
            value_num *= shape[index]

        avg_num = round(float(1) / float(value_num), 12)

        data_square = akg.tvm.compute(data.shape,
                                      lambda *i: data(*i) * data(*i),
                                      name="data_square")
        # cal mean
        data_mean = akg.lang.ascend.vmuls(
            sum_data(data, axes, keepdims, attrs.get("single_sum", False)), avg_num)
        data_square_mean = akg.lang.ascend.vmuls(sum_data(data_square, axes, keepdims, attrs.get("single_sum", False)),
                                                 avg_num)
        data_mean_square = akg.tvm.compute(data_mean.shape,
                                           lambda *i: data_mean(*i) * data_mean(*i),
                                           name="data_mean_square")

        # variance = E[x^2] - (E[x])^2
        data_variance = akg.tvm.compute(data_mean.shape,
                                        lambda *i: data_square_mean(
                                            *i) - data_mean_square(*i),
                                        name="data_variance")

        mean_new = update_by_moving_average(
            moving_mean, data_mean, attrs.get("momentum", 0.99))
        variance_new = update_by_moving_average(moving_variance,
                                                data_variance, attrs.get("momentum", 0.99))
    else:
        # no_bc version
        data_variance = moving_variance
        data_mean = moving_mean

    # rsveps = 1 / sqrt(variance + eps), broadcast to the data shape.
    rsveps = akg.lang.ascend.vadds(data_variance, akg.tvm.const(
        attrs.get("eps", 1e-3), dtype=DTYPE_FLOAT32))
    rsveps = rsqrt(rsveps, utils.CCE)
    rsveps = akg.lang.ascend.broadcast(rsveps, shape)

    # mean2 = -mean, broadcast to the data shape, so that dmean = x - mean.
    mean2 = akg.lang.ascend.vmuls(data_mean, akg.tvm.const(-1, data.dtype))
    mean2 = akg.lang.ascend.broadcast(mean2, shape)

    dmean = akg.tvm.compute(
        shape, lambda *i: data(*i) + mean2(*i), name="dmean")
    dmsve = akg.tvm.compute(shape, lambda *i: dmean(*i)
                            * rsveps(*i), name="dmsve")

    if not keepdims:
        gamma = akg.topi.reshape(gamma, mid_shape)
        beta = akg.topi.reshape(beta, mid_shape)
    gamma_bc = akg.lang.ascend.broadcast(gamma, shape)
    beta_bc = akg.lang.ascend.broadcast(beta, shape)
    dmsveg = akg.tvm.compute(shape, lambda *i: dmsve(*i) * gamma_bc(*i),
                             name="dmsveg")
    outs = akg.tvm.compute(shape, lambda *i: dmsveg(*i) + beta_bc(*i),
                           name="output")
    out_attrs = get_attrs(outs)

    if is_training:
        # Cast every result back to the dtype of the incoming data.
        if ori_dtype != DTYPE_FLOAT32:
            outs = akg.topi.cast(outs, ori_dtype)
            mean_new = akg.topi.cast(mean_new, ori_dtype)
            variance_new = akg.topi.cast(variance_new, ori_dtype)
            data_mean = akg.topi.cast(data_mean, ori_dtype)
            data_variance = akg.topi.cast(data_variance, ori_dtype)

        mean_new, binds_info_mean = TensorUtils.inplace_set(
            ori_moving_mean, mean_new, buffer_name="mean_buf")
        variance_new, binds_info_var = TensorUtils.inplace_set(
            ori_moving_variance, variance_new, buffer_name="var_buf")
        # binds_info_all aliases binds_info_mean, which is extended in place.
        binds_info_all = binds_info_mean
        binds_info_all.update(binds_info_var)
        out_attrs[BINDS] = binds_info_all

        # the new moving_mean and moving_var are updated inplace in
        # inputs(moving_mean and moving_var). But Mindspore needs
        # These two fake outputs though it never uses them
        fake_moving_mean = akg.tvm.compute(mean_new.shape,
                                           lambda *indices: mean_new(*indices),
                                           "fake_moving_mean")
        # NOTE(review): uses mean_new.shape for the variance copy - presumably
        # both statistics share one shape; confirm.
        fake_moving_var = akg.tvm.compute(mean_new.shape,
                                          lambda *indices: variance_new(
                                              *indices),
                                          "fake_moving_var")
        out_tensors = (outs, fake_moving_mean, fake_moving_var, data_mean,
                       data_variance, mean_new, variance_new,)
    else:
        if ori_dtype != DTYPE_FLOAT32:
            outs = akg.topi.cast(outs, ori_dtype)
        out_tensors = (outs,)
    out_tensors = list(out_tensors) if isinstance(
        out_tensors, tuple) else out_tensors
    if shape_is_dynamic(out_tensors):
        out_attrs["custom_tiling"] = batch_norm_tiling_strategy_dynamic(outs)
    else:
        out_attrs["custom_tiling"] = batch_norm_tiling_strategy(
            outs, data_format)
    # The attrs dict travels as the last element of the returned list.
    out_tensors.append(out_attrs)

    return out_tensors
def five2four(data, shape4d, dst_type, format_):
    """
    Convert 5-dims "data" to 4-dims, the format of "data" is defined in "format_".

    Args:
        data (tvm.tensor.Tensor): 5-dims tensor of type float16, float32
        shape4d (Union[list, tuple]): a list has 4 nums, shape of output Tensor.
            For dynamic shapes it is replaced by a shape derived from data.
        dst_type (str): data type of output Tensor
        format_ (str): a str defined the format of returns, support NCHW and NHWC

    Returns:
        Tuple of (4-dims tvm.tensor.Tensor, attrs dict).

    Raises:
        ValueError: If static data is not 5-dim with a last dim of 16, if
            format_ is neither "NCHW" nor "NHWC", if shape4d does not match
            the 5D shape, or if input and output dtypes differ while the
            channel size reaches C_LIMIT_FOR_CAST.
    """
    vc_util.ops_dtype_check([data.dtype, dst_type], vc_util.DtypeForDavinci.ALL_FLOAT)
    shape5d = get_shape(data)
    if not shape_is_dynamic(data):
        if len(shape5d) != 5 or shape5d[-1] != 16:
            raise ValueError(
                "five2four_cce only support 5-dim data and last dim should be 16"
            )

    bs, c1, h, w, c0 = shape5d
    if not shape_is_dynamic(data):
        vc_util.davinci_format_check(shape5d, "NC1HWC0", dim=5)
    # Check format
    if format_ not in ['NCHW', 'NHWC']:
        raise ValueError(
            "{} format is not support, five2four only support NCHW and NHWC format input"
            .format(format_))
    # For dynamic shapes the caller-provided shape4d is ignored and the full
    # c1 * c0 channel extent is used.
    if format_ == "NCHW":
        if shape_is_dynamic(data):
            shape4d = [bs, c1 * c0, h, w]
        _, c, h_4d, w_4d = shape4d
    else:
        if shape_is_dynamic(data):
            shape4d = [bs, h, w, c1 * c0]
        _, h_4d, w_4d, c = shape4d
    vc_util.davinci_format_check(shape4d, format_, dim=4)

    # Check is shape4d and shape5d match
    # (only when every 5D dimension is a plain int or an IntImm)
    if False not in [
            isinstance(s, (int, akg.tvm.expr.IntImm)) for s in shape5d
    ]:
        if h_4d != h or w_4d != w:
            raise ValueError(
                "five2four_cce's shape4d h and w should equal to data shape's h and w"
            )
        if c > c1 * c0 or c <= (c1 - 1) * c0:
            raise ValueError(
                "five2four_cce's shape4d c should in set ((c1 - 1) * c0, c1 * c0]"
            )

    # Check size c when casting happens
    if not shape_is_dynamic(data):
        if data.dtype != dst_type and c >= C_LIMIT_FOR_CAST:
            raise ValueError(
                "When input and output data type is not matched, shape of 'c' axis should not exceed {}, "
                "while currently set is {}".format(C_LIMIT_FOR_CAST, c))

    # NC1HWC0 -> NHWC
    @script(capture=locals())
    def nc1hwc0_to_nhwc(inputs, bs, h, w, c, c1, c0):
        output = allocate((bs, h, w, c), inputs.dtype, "local")
        for n_i in range(bs):
            for h_i in range(h):
                for w_i in range(w):
                    for c_i in range(c1):
                        for c_i0 in range(c0):
                            output[n_i, h_i, w_i, c_i * c0 + c_i0] = inputs[n_i, c_i, h_i, w_i, c_i0]
        return output

    # NC1HWC0 -> NCHW
    @script(capture=locals())
    def nc1hwc0_to_nchw(inputs, bs, h, w, c, c1, c0):
        output = allocate((bs, c, h, w), inputs.dtype, "local")
        for n_i in range(bs):
            for c_i in range(c1):
                for h_i in range(h):
                    for w_i in range(w):
                        for c_i0 in range(c0):
                            output[n_i, c_i * c0 + c_i0, h_i, w_i] = inputs[n_i, c_i, h_i, w_i, c_i0]
        return output

    # if c % 16 == 0, h and w == 1, five2four is a reshape operation
    if shape_is_dynamic(data):
        call_reshape = isinstance(h, int) and isinstance(
            w, int) and h == 1 and w == 1
    else:
        call_reshape = h == 1 and w == 1 and c % 16 == 0
    c_value = None
    expansion = None
    if format_ == "NHWC":
        if call_reshape:
            output = akg.topi.reshape(data, (bs, h, w, c))
            if shape_is_dynamic(data):
                output = akg.tvm.compute((bs, h, w, c), lambda *indice: output(*indice), name="reshape")
        elif c < c0:
            # Fewer channels than one C0 block: reshape, then keep the first c.
            reshape_output = akg.topi.reshape(data, (bs, h, w, c0))
            output = akg.tvm.compute((bs, h, w, c), lambda *i: reshape_output(*i), name='slice_output')
        else:
            output = nc1hwc0_to_nhwc(data, to_tvm_const(bs), to_tvm_const(h), to_tvm_const(w), to_tvm_const(c),
                                     to_tvm_const(c1), to_tvm_const(c0))

    else:
        if call_reshape:
            output = akg.topi.reshape(data, (bs, c, h, w))
            if shape_is_dynamic(data):
                output = akg.tvm.compute((bs, c, h, w), lambda *indice: output(*indice), name="reshape")
        else:
            output = nc1hwc0_to_nchw(data, to_tvm_const(bs), to_tvm_const(h), to_tvm_const(w), to_tvm_const(c),
                                     to_tvm_const(c1), to_tvm_const(c0))

    # two special cases for tiling strategy
    if not shape_is_dynamic(data):
        if c < c0 or output.dtype != dst_type:
            c_value = c
        if c % c0 != 0 and output.dtype != dst_type:
            expansion = int(ct_util.BLOCK_SIZE / get_bytes(data.dtype))
    attrs = get_attrs()
    # The pure-reshape path gets no custom tiling.
    if not call_reshape:
        attrs["custom_tiling"] = five2four_tiling_strategy(
            data, c_value, expansion)

    # Cast last, after the layout transform.
    if output.dtype != dst_type:
        output = akg.topi.cast(output, dst_type)
    return output, attrs
def common(data, axis, method="min"):
    """
    Returns the index with the max or min value across axes of a tensor.

    Note:
        method can be "max" or "min" to get argmax or argmin.
        Only the first and the last axis are supported. Non-float16 inputs
        are cast to float16 before the comparison.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16, float32, int8, int32.
        axis (int): Describe the axis of input tensor.
        method (str): Can be "max" or "min".

    Returns:
        Tuple of (tvm.tensor.Tensor of type int32, attrs dict).

    Raises:
        ValueError: If axis is neither the first nor the last axis, or if
            method is not supported. On the first-axis path method is only
            checked inside the per-row loop.
    """
    shape = get_shape(data)
    dtype = data.dtype

    utils.ops_dtype_check(
        data.dtype, [utils.DtypeForDavinci.ALL_FLOAT, utils.DtypeForDavinci.ALL_INT])
    utils.reduce_axis_check(shape, axis)
    real_axis = refine_reduce_axis(shape, axis)[0]
    out_shape = get_reduce_out_shape(shape, axis=axis)
    attr_map = {}
    if shape_is_dynamic(data):
        attr_map["dynamic_shape"] = set_dynamic_shape_limit_for_tensor(
            data, 4096, real_axis)
    if dtype != "float16":
        data = akg.topi.cast(data, "float16")
    k = akg.tvm.reduce_axis((0, data.shape[real_axis]), "k")
    if axis in (len(shape) - 1, -1):
        # Last-axis reduction: use the fargmin / fargmax reducers.
        if method == "min":
            reducer = akg.tvm.comm_reducer(lambda x, y: dav.fargmin(x, y),
                                           lambda t: akg.tvm.max_value(t))
        elif method == "max":
            reducer = akg.tvm.comm_reducer(lambda x, y: dav.fargmax(x, y),
                                           lambda t: akg.tvm.min_value(t))
        else:
            raise ValueError("not support {}".format(method))

        if len(data.shape) == 1:
            res = akg.tvm.compute((1, ), lambda i: reducer(data[k], axis=k))
        else:
            res = akg.tvm.compute(
                out_shape, lambda *indice: reducer(data(*indice, k), axis=k))

        res = akg.tvm.compute(out_shape,
                              lambda *indice: res(*indice).astype("int32"),
                              "argred_output")
    elif axis in (0, -len(shape)):
        # First-axis reduction: walk the rows one by one, keeping the running
        # best value (local_data) and its row index (tmp_idx).
        # NOTE(review): indices are carried as float16 constants until the
        # final int32 cast - confirm this is precise enough for long axes.
        tmp_idx = akg.tvm.compute(
            shape[1:], lambda *indice: akg.tvm.const(0.0, "float16"), name='tmp_index')
        local_data = akg.tvm.compute(shape[1:],
                                     lambda *indice: data(0, *indice),
                                     name="tmp_data")
        for idx in range(shape[axis] - 1):
            if method == 'min':
                tmp_idx = akg.tvm.compute(
                    shape[1:],
                    lambda *indice, ite_idx=idx: akg.tvm.expr.Select(
                        local_data(*indice) > data(ite_idx + 1, *indice),
                        akg.tvm.const(ite_idx + 1, "float16"),
                        tmp_idx(*indice)
                    ))
                local_data = akg.tvm.compute(
                    shape[1:],
                    lambda *indice, ite_idx=idx: akg.tvm.expr.Select(
                        local_data(*indice) > data(ite_idx + 1, *indice),
                        data(ite_idx + 1, *indice),
                        local_data(*indice)))
            elif method == "max":
                tmp_idx = akg.tvm.compute(
                    shape[1:],
                    lambda *indice, ite_idx=idx: akg.tvm.expr.Select(
                        local_data(*indice) < data(ite_idx + 1, *indice),
                        akg.tvm.const(ite_idx + 1, "float16"),
                        tmp_idx(*indice)
                    ))
                local_data = akg.tvm.compute(
                    shape[1:],
                    lambda *indice, ite_idx=idx: akg.tvm.expr.Select(
                        local_data(*indice) < data(ite_idx + 1, *indice),
                        data(ite_idx + 1, *indice),
                        local_data(*indice)))
            else:
                raise ValueError("not support " + method)

        res = akg.tvm.compute(out_shape,
                              lambda *indice: tmp_idx(*indice).astype("int32"),
                              "cast1")
    else:
        raise ValueError(
            "Argmax only support first axis and is last axis now!")

    # Tile against whichever of the input / output shapes has more dimensions.
    lager = out_shape if len(out_shape) > len(shape) else shape
    strategy = argminmax_tiling_strategy(lager, real_axis)
    if strategy:
        attr_map["custom_tiling"] = strategy
    return res, attr_map