def _check_params(ref_shape, value_shape, dtype, kernel_name):
    """
    check the parameters including ref_shape, value_shape, dtype and kernel_name

    Parameters
    ----------
    ref_shape: list or tuple
        shape of ref_tensor
    value_shape: list or tuple
        shape of value_tensor
    dtype: str
        the data type
    kernel_name: str
        cce kernel name, default value is "cce_assign"

    Returns
    -------
    None
    """
    check_list = ("int8", "int16", "int32", "int64", "uint8", "uint16",
                  "uint32", "uint64", "float16", "float32")
    check_dtype(dtype, check_list, param_name="ref")
    _check_shape(ref_shape, value_shape)
def _check_para_and_getplaceholder(scalar_input, tensor_input, input_dict):
    check_list = ("float32", )
    var_shape = input_dict["var"].get("shape")
    var_dtype = input_dict["var"].get("dtype")
    list_placeholder = []
    for key, value in input_dict.items():
        shape = util.scalar2tensor_one(value.get("shape"))
        op_utils.check_shape(shape)
        if value in scalar_input:
            if not util.is_scalar(shape):
                raise RuntimeError("The shape of %s must be scalar" % key)
        if value in tensor_input:
            if shape != var_shape:
                raise RuntimeError(
                    "The shape of %s must be the same as the var" % key)
        dtype = value.get("dtype").lower()
        op_utils.check_dtype(dtype, check_list, param_name="var")
        if dtype != var_dtype:
            raise RuntimeError(
                "The dtype of %s must be the same as the var" % key)
        shape_refine = (functools_reduce(operator.mul, shape), )
        list_placeholder.append(
            tvm.placeholder(shape=shape_refine, name=key, dtype=dtype))
    return list_placeholder
def _check_dtype(dtype_x, dtype_sum, dtype_square_sum, dtype_scale,
                 dtype_offset):
    check_dtype(dtype_x, ("float16", "float32"))
    check_dtype(dtype_sum, ("float32",))
    check_dtype(dtype_square_sum, ("float32",))
    check_dtype(dtype_scale, ("float32",))
    check_dtype(dtype_offset, ("float32",))
def optional_weight(tensor_list, predict_shape, dtype_list, weight,
                    pos_weight):
    weight_data = None
    pos_weight_data = None
    if weight is not None:
        weight_shape = weight.get("shape")
        weight_dtype = weight.get("dtype").lower()
        op_utils.check_dtype(weight_dtype, dtype_list)
        _broadcast_shape_check(weight_shape, predict_shape)
        # left-pad the weight shape with 1s so it broadcasts against predict
        weight_shape = tuple(
            [1] * (len(predict_shape) - len(weight_shape))) + tuple(weight_shape)
        weight_data = tvm.placeholder(weight_shape,
                                      weight_dtype,
                                      name="weight_data")
        tensor_list.append(weight_data)
    if pos_weight is not None:
        pos_weight_shape = pos_weight.get("shape")
        pos_weight_dtype = pos_weight.get("dtype").lower()
        op_utils.check_dtype(pos_weight_dtype, dtype_list)
        _broadcast_shape_check(pos_weight_shape, predict_shape)
        pos_weight_shape = tuple(
            [1] * (len(predict_shape) - len(pos_weight_shape))) + tuple(pos_weight_shape)
        pos_weight_data = tvm.placeholder(pos_weight_shape,
                                          pos_weight_dtype,
                                          name="pos_weight_data")
        tensor_list.append(pos_weight_data)
    return weight_data, pos_weight_data
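# Shape-alignment sketch for the left-padding done in optional_weight above.
# The shapes here are hypothetical and only illustrate the expression
# tuple([1] * (len(predict_shape) - len(weight_shape))) + tuple(weight_shape).
def _weight_broadcast_padding_example():
    predict_shape = (8, 16, 32)
    weight_shape = (32,)
    aligned = tuple([1] * (len(predict_shape) - len(weight_shape))) \
        + tuple(weight_shape)
    # (1, 1, 32) broadcasts against (8, 16, 32)
    assert aligned == (1, 1, 32)
    return aligned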
def check_param(self):
    """
    Check parameters

    Parameters
    ----------
    None

    Returns
    -------
    None
    """
    op_utils.check_shape(self.input_x_shape, param_name="input_x")
    op_utils.check_shape(self.input_y_shape, param_name="input_y")
    op_utils.check_dtype(self.input_x_dtype, ("float32", ),
                         param_name="input_x")
    op_utils.check_dtype(self.input_y_dtype, ("float32", ),
                         param_name="input_y")

    add_support = tbe_platform.cce_conf.api_check_support(
        "tik.vadd", "float32")

    if self.input_x_dtype != self.input_y_dtype:
        raise RuntimeError("input_x and input_y do not have the same dtype")
    if self.input_x_dtype == "float32" and not add_support:
        raise RuntimeError(
            "Input dtype is float32, but the platform does not support it")
def check_supported(x,
                    segment_ids,
                    y,
                    num_segments,
                    kernel_name="unsorted_segment_max_d"):
    """
    fusion pass test if num_segments is int32
    """
    shape = x.get("shape")
    dtype = x.get("dtype").lower()
    segment_ids_shape = segment_ids.get("shape")
    segment_ids_dtype = segment_ids.get("dtype").lower()
    check_list = ("float16", "float32", "int32", "int16")
    op_utils.check_dtype(dtype, check_list, param_name="x")
    op_utils.check_shape(shape, param_name="x")
    check_list_ids = ("int32", )
    op_utils.check_dtype(segment_ids_dtype,
                         check_list_ids,
                         param_name="segment_ids")
    if num_segments <= 0:
        return False

    first_shape = int(shape[0])
    ids_length = int(segment_ids_shape[0])
    if first_shape != ids_length:
        return False

    total_ub_size = (num_segments + first_shape) * BLOCK_LENGTH + (
        (BLOCK_LENGTH // 2 - first_shape % (BLOCK_LENGTH // 4)) +
        first_shape) * (BLOCK_LENGTH // 8)
    if total_ub_size > UB_SIZE_MAX // 2:
        return False
    return True
def _check_parameter(input_x, input_target):
    """
    Parameters
    ----------
    input_x : dict
        shape and dtype of input_x
    input_target : dict
        shape and dtype of input_target. Shape and dtype must be the same
        as input_x

    Returns
    -------
    None
    """
    shape_x = input_x.get("shape")
    shape_target = input_target.get("shape")
    op_utils.check_shape(shape_x, param_name="input_x")
    if list(shape_x) != list(shape_target):
        raise RuntimeError("input_x and input_target must "
                           "have the same shape.")

    # check input tensor data type
    dtype_x = input_x.get("dtype").lower()
    dtype_target = input_target.get("dtype").lower()
    check_list = ("float16", "float32")
    op_utils.check_dtype(dtype_x, check_list, param_name="input_x")
    if dtype_x != dtype_target:
        raise RuntimeError("input_x and input_target must "
                           "have the same dtype.")
    if dtype_x == "float32" and not tbe_platform.cce_conf.api_check_support(
            "te.lang.cce.vmul", "float32"):
        raise RuntimeError(
            "The intrinsic only supports float16 while the input dtype "
            "is float32")
def reduce_max_d(x, y, axes=None, keepdims=None, kernel_name="reduce_max_d"):
    """
    reduce a tensor on certain axes based on max.

    Parameters
    ----------
    x : dict
        shape and dtype of input
    y : dict
        shape and dtype of output, should be same shape and type as input
    axes : int or list
        the axes to reduce; may be negative to index from the end
        (e.g., -1 for the last axis), or a list (e.g. [1, 2])
    keepdims : bool
        if true, retains reduced dimensions with length 1,
        default value is None
    kernel_name : str
        kernel name, default value is "reduce_max_d"

    Returns
    -------
    None
    """
    dtype = x["dtype"]
    dtype_lower = dtype.lower()
    check_list = ("float16", "float32", "int8", "uint8", "int32")
    check_dtype(dtype_lower, check_list)

    with te.op.compute():
        shape = x["shape"]
        shape_range = x["range"]
        shape_len = len(shape)
        if not axes:
            axes = range(shape_len)
        if hasattr(axes, 'index'):
            axes = list(axes)
        axes = cce_util.axis_check(shape_len, axes)

        shape_new, shape_range_new, axes_new, fused_rel_dic = \
            fused_reduce_axis(shape, shape_range, axes)
        add_compile_info("fused_rel_dic", fused_rel_dic)
        x["shape"] = shape_new
        x["range"] = shape_range_new
        shape_var_new = variable_shape([x])[0]

        data_input = tvm.placeholder(shape_var_new,
                                     name="data_input",
                                     dtype=dtype_lower)
        res = reduce_max_d_compute(data_input, y, axes_new, keepdims)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    # build
    config = {"name": kernel_name, "tensor_list": [data_input, res]}
    te.lang.dynamic.build(sch, config)
def log(input_x, output_y, base=-1.0, scale=1.0, shift=0.0, kernel_name="log"):
    """
    calculating data

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    output_y : dict
        shape and dtype of output, should be same shape and type as input
    base : float
        base of the logarithm; must be strictly positive, or the default -1.0
    scale : float
        scale applied to the input, default value is 1.0
    shift : float
        shift applied to the input, default value is 0.0
    kernel_name : str
        kernel name, default value is "log"

    Returns
    -------
    None
    """
    shape = input_x.get("shape")
    dtype = input_x.get("dtype")
    input_dtype = dtype.lower()

    # check the shape of input_x
    op_utils.check_shape(shape, param_name="input_x")

    # check the dtype of input_x, only float16 and float32 are supported
    check_list = ("float16", "float32")
    op_utils.check_dtype(input_dtype, check_list, param_name="input_x")

    if base <= 0 and (not isclose(base, -1.0)):
        error_info = {}
        error_info['errCode'] = 'E80000'
        error_info['param_name'] = 'base'
        error_info['op_name'] = 'log'
        error_info['expect_value'] = "strictly positive or -1"
        error_info['real_value'] = base
        raise RuntimeError(
            "In op[%s], the parameter[%s] should be [%s], but actually is [%s]."
            % (error_info['op_name'], error_info['param_name'],
               error_info['expect_value'], error_info['real_value']))

    fused_shape = [reduceIns(lambda x, y: x * y, shape[:])]
    data_input = tvm.placeholder(fused_shape,
                                 name="data_input",
                                 dtype=input_dtype)

    res = log_compute(data_input, output_y, base, scale, shift, kernel_name)

    # auto schedule
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    # operator build
    config = {
        "name": kernel_name,
        "need_build": True,
        "tensor_list": (data_input, res)
    }
    te.lang.cce.cce_build_code(sch, config)
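# Reference sketch only: assuming the Caffe-style convention
# log_base(scale * x + shift), where base == -1.0 selects the natural
# logarithm (consistent with the "strictly positive or -1" check above).
def _log_reference(x, base=-1.0, scale=1.0, shift=0.0):
    import numpy as np
    y = np.log(scale * np.asarray(x, dtype=np.float32) + shift)
    if base > 0:
        # change of base: log_b(v) = ln(v) / ln(b)
        y = y / np.log(base)
    return y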
def mul(x, y, output, kernel_name="mul"):
    """
    do element-wise mul operation between two input tensors

    Parameters:
    ----------
    x : dict
        shape, dtype of input x
    y : dict
        shape, dtype of input y
    output : dict
        shape, dtype of output
    kernel_name : str
        cce kernel name, default value is "mul"

    Returns
    -------
    None
    """
    # format_pattern = 1: Nz and vector
    # format_pattern = 2: vector and Nz
    # format_pattern = 0: Nz scalar, Nz Nz, ND ND
    format_pattern = _mul_check_format(x, y)
    shape_x, shape_y = _infer_shape(format_pattern, x, y)

    shape_x = util.scalar2tensor_one(shape_x)
    dtype_x = x.get("dtype").lower()
    shape_y = util.scalar2tensor_one(shape_y)
    dtype_y = y.get("dtype").lower()

    op_utils.check_shape(shape_x, param_name="x")
    op_utils.check_shape(shape_y, param_name="y")

    if dtype_x != dtype_y:
        raise RuntimeError("dtype of inputs should be consistent")
    dtype = dtype_x
    check_list = ("int32", "float16", "float32", "int16")
    op_utils.check_dtype(dtype, check_list, param_name="x")

    vmul_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.vmul", "float32")
    if dtype_x == "float32" and not vmul_support:
        raise RuntimeError(
            "Input dtype is float32, but the platform does not support it")

    shape_x, shape_y, shape_max = op_utils.broadcast_shapes(
        shape_x, shape_y, param_name_input1="x", param_name_input2="y")

    shape_x, shape_y = op_utils.refine_shapes_for_broadcast(shape_x, shape_y)
    input_x = tvm.placeholder(shape_x, dtype=dtype, name="x")
    input_y = tvm.placeholder(shape_y, dtype=dtype, name="y")

    res = _mul_compute(input_x, input_y, output, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": (input_x, input_y, res)}
    te.lang.cce.cce_build_code(sch, config)
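# Illustrative call only (never invoked): the entry points above take plain
# dicts describing each tensor. The shapes, formats and kernel name here are
# hypothetical and simply show the calling convention expected by mul().
def _mul_usage_example():
    x = {"shape": (16, 32), "dtype": "float16",
         "format": "ND", "ori_format": "ND"}
    y = {"shape": (1, 32), "dtype": "float16",
         "format": "ND", "ori_format": "ND"}
    output = {"shape": (16, 32), "dtype": "float16",
              "format": "ND", "ori_format": "ND"}
    mul(x, y, output, kernel_name="mul_fp16_demo")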
def logical_or(x1, x2, y, kernel_name="logical_or"):
    """
    algorithm : logical_or
    calculating the value of x1 OR x2 element-wise

    Parameters
    ----------
    x1 : the dict of x1, include shape and dtype; dtype supports int8,
         the value only supports 0, 1
    x2 : the dict of x2, include shape and dtype; dtype supports int8,
         the value only supports 0, 1
    y : the dict of y, include shape and dtype
    kernel_name : string, cce kernel name, default value is "logical_or"

    Returns
    -------
    None
    """
    shape_x1 = x1.get("shape")
    shape_x2 = x2.get("shape")
    dtype_x1 = x1.get("dtype")
    dtype_x2 = x2.get("dtype")
    if dtype_x1 == "bool" or dtype_x2 == "bool":
        dtype_x1 = "int8"
        dtype_x2 = "int8"

    check_shape(shape_x1, param_name="x1")
    check_shape(shape_x2, param_name="x2")

    check_tuple = ("int8", )
    check_dtype(dtype_x1, check_tuple, param_name="x1")
    check_dtype(dtype_x2, check_tuple, param_name="x2")

    shape_x1, shape_x2, shape_max = broadcast_shapes(
        shape_x1, shape_x2, param_name_input1="x1", param_name_input2="x2")
    dtype = dtype_x1.lower()
    data_x1 = tvm.placeholder(shape_x1, name="data_x1", dtype=dtype)
    data_x2 = tvm.placeholder(shape_x2, name="data_x2", dtype=dtype)

    res = logical_or_compute(data_x1, data_x2, y, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "need_build": False,
        "name": kernel_name,
        "tensor_list": (data_x1, data_x2, res)
    }
    te.lang.cce.cce_build_code(schedule, config)
def atan_grad(y, dy, z, kernel_name="atan_grad"):
    """
    Gradient calculation for atan(x)

    Parameters:
    ----------
    y : dict of y, include shape and dtype, dtype support float16, float32
    dy : dict of dy, include shape and dtype, dtype support float16, float32
    z : dict of output, include shape and dtype
    kernel_name : cce kernel name, default value is atan_grad

    Algorithm :
    ----------
    forward :
        y = atan(x)
    backward gradient :
        de/dx = dy/dx * de/dy = 1/(1 + x^2) * grad

    Returns
    ----------
    None
    """
    # get the shape and dtype
    shape = y.get("shape")
    shape_grad = dy.get("shape")
    dtype = y.get("dtype")
    dtype_grad = dy.get("dtype")

    # check whether the shapes are valid and equal
    check_shape(shape, param_name="y")
    check_shape(shape_grad, param_name="dy")
    if not operator.eq(shape, shape_grad):
        raise RuntimeError("all input shapes must be the same")
    shape, _ = refine_shape_axes(shape, [])

    # check whether dtypes are fp16, fp32 and whether they are the same
    check_list = ("float16", "float32")
    check_dtype(dtype, check_list, param_name="y")
    check_dtype(dtype_grad, check_list, param_name="dy")
    dtype = dtype.lower()
    if dtype != dtype_grad.lower():
        raise RuntimeError("all input dtypes must be the same")

    # get 2 input placeholders: data_input, grad
    data_input = tvm.placeholder(shape, name="input_data", dtype=dtype)
    grad = tvm.placeholder(shape, name="input_grad", dtype=dtype)

    # compute the backward gradient
    res = atan_grad_compute(data_input, grad, z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_input, grad, res]}
    te.lang.cce.cce_build_code(sch, config)
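# NumPy transcription of the docstring formula de/dx = 1/(1 + x^2) * grad,
# assuming the first input carries the forward input x (reference only).
def _atan_grad_reference(x, grad):
    import numpy as np
    x = np.asarray(x, dtype=np.float32)
    return grad / (1.0 + x * x)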
def floor_div(input_x, input_y, output_z, kernel_name="floor_div"):
    """
    algorithm: floor_div
    calculating data's floordiv, res = floor(x / y)

    Parameters
    ----------
    input_x : dict
    input_y : dict
    output_z : dict
    kernel_name : str, default value is "floor_div"

    Returns
    -------
    None
    """
    # check dtype of input_x/input_y
    input_dtype_x = input_x.get("dtype").lower()
    input_dtype_y = input_y.get("dtype").lower()
    check_list = ('int8', 'uint8', 'int32', 'float16', 'float32')
    check_dtype(input_dtype_x, check_list, param_name="input_x")
    check_dtype(input_dtype_y, check_list, param_name="input_y")
    check_elewise_shape_range([input_x, input_y], support_broadcast=True)
    if input_dtype_x != input_dtype_y:
        error_info = {}
        error_info['errCode'] = OP_ERROR_CODE_018
        error_info['op_name'] = 'floor_div'
        error_info['param_name1'] = 'input_dtype_x'
        error_info['param_name2'] = 'input_dtype_y'
        error_info['param1_dtype'] = str(input_dtype_x)
        error_info['param2_dtype'] = str(input_dtype_y)
        raise RuntimeError(
            error_info,
            "In op[%s], the parameters[%s][%s] are not equal in dtype "
            "with dtype[%s][%s]." %
            (error_info['op_name'], error_info['param_name1'],
             error_info['param_name2'], error_info['param1_dtype'],
             error_info['param2_dtype']))

    ins = classify([input_x, input_y], Mode.ELEWISE_WITH_BROADCAST)
    schedules, tensors = [], []
    for (input_x, input_y) in ins:
        with te.op.compute():
            x_shape, y_shape = variable_shape([input_x, input_y],
                                              support_broadcast=True)
            x_shape, y_shape = refine_shapes_for_broadcast(x_shape, y_shape)
            tensor_x = tvm.placeholder(x_shape, input_dtype_x, "tensor_x")
            tensor_y = tvm.placeholder(y_shape, input_dtype_y, "tensor_y")
            res = floor_div_compute(tensor_x, tensor_y, output_z, kernel_name)
            tensors.append([tensor_x, tensor_y, res])
        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        schedules.append(sch)

    config = {"name": kernel_name, "tensor_list": tensors}
    te.lang.dynamic.build(schedules, config)
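# The docstring semantics res = floor(x / y) match NumPy floor division;
# a small reference sketch (values chosen only to show the rounding).
def _floor_div_reference():
    import numpy as np
    x = np.array([7.0, -7.0, 7.0], dtype=np.float32)
    y = np.array([2.0, 2.0, -2.0], dtype=np.float32)
    out = np.floor(x / y)                 # [ 3., -4., -4.]
    assert (out == np.floor_divide(x, y)).all()
    return out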
def data_format_dim_map(x,
                        y,
                        src_format="NHWC",
                        dst_format="NCHW",
                        kernel_name="data_format_dim_map"):
    """
    Returns the dimension index in the destination data format given the one
    in the source data format.

    Parameters
    ----------
    x : A Tensor with each element as a dimension index in the source data
        format. Must be of the following types: `int32`.
        Must be in the range [-4, 4).
    y : Shape and dtype of y, reserved parameter, not used now.
    src_format : An optional `string`. Defaults to `"NHWC"`. Source data format.
    dst_format : An optional `string`. Defaults to `"NCHW"`.
        Destination data format.
    kernel_name : CCE kernel name, default value is "data_format_dim_map"
        (optional).

    Returns
    -------
    None
    """
    shape_input = x.get("shape")
    dtype_input = x.get("dtype")

    # check kernel name, shape, size, dtype
    check_shape(shape_input, param_name="x")
    shape_input, _ = refine_shape_axes(shape_input, [])
    check_list = ("int32", )
    dtype_input = dtype_input.lower()
    check_dtype(dtype_input, check_list, param_name="x")

    # check length of format
    if len(src_format) != 4:
        raise ValueError(
            "source format must be of length 4, received src_format = %s" %
            src_format)

    if len(dst_format) != 4:
        raise ValueError(
            "destination format must be of length 4, received dst_format = %s"
            % dst_format)

    # get data and compute
    data_input = tvm.placeholder(shape_input,
                                 dtype=dtype_input,
                                 name="data_input")
    res = _data_format_dim_map_compute(data_input, y, src_format, dst_format,
                                       kernel_name)

    with tvm.target.cce():
        sch = topi.generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "print_ir": False,
        "tensor_list": (data_input, res),
        "bool_storage_as_1bit": False
    }
    te.lang.cce.cce_build_code(sch, config)
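# Pure-Python sketch of the per-element mapping, assuming the usual
# DataFormatDimMap semantics: an index into src_format maps to the index of
# the same axis letter in dst_format (reference only, not used by the kernel).
def _dim_map_reference(idx, src_format="NHWC", dst_format="NCHW"):
    idx = idx % len(src_format)          # normalize range [-4, 4) to [0, 4)
    return dst_format.index(src_format[idx])
# e.g. _dim_map_reference(1) == 2   ('H' is index 1 in NHWC, index 2 in NCHW)
#      _dim_map_reference(-1) == 1  ('C' is last in NHWC, index 1 in NCHW)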
def reduce_sum_d(x, y, axis=None, keepdims=None, kernel_name="reduce_sum_d"):
    """reduce a tensor on a certain axis based on sum.

    Parameters:
    ----------
    x: dict
        the dict of input tensor.
    y: dict
        the dict of output tensor.
    axis: int, list, tuple or NoneType
        the axis for reduce.
    keepdims: bool or NoneType
        if true, retains reduced dimensions with length 1.
    kernel_name: str
        cce kernel name, default value is "reduce_sum_d".

    Returns
    -------
    None
    """
    dtype = x["dtype"]
    dtype_lower = dtype.lower()
    check_list = ("float16", "float32")
    check_dtype(dtype_lower, check_list, param_name="x")

    with te.op.compute():
        shape = x["shape"]
        shape_range = x["range"]

        axes = []
        shape_len = len(shape)
        if not axis:
            for i, _ in enumerate(shape):
                axes.append(i)
        else:
            axes = list(axis)
        axes = cce_util.axis_check(shape_len, axes)

        shape_new, shape_range_new, axes_new, fused_rel_dic = \
            fused_reduce_axis(shape, shape_range, axes)
        add_compile_info("fused_rel_dic", fused_rel_dic)
        x["shape"] = shape_new
        x["range"] = shape_range_new
        shape_var_new = variable_shape([x])[0]

        data_input = tvm.placeholder(shape_var_new,
                                     name="data_input",
                                     dtype=dtype_lower)
        res = reduce_sum_d_compute(data_input, y, axes_new, keepdims)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    # build
    config = {"name": kernel_name, "tensor_list": [data_input, res]}
    te.lang.dynamic.build(sch, config)
def assign_sub(var, value, out, kernel_name='assign_sub'):
    """
    Update var by subtracting value from it.

    Parameters:
    ----------
    var : dict
        dict of input_var, include shape and dtype,
        dtype support int8, uint8, int32, float16, float32
    value : dict
        dict of input_value, include shape and dtype,
        dtype support int8, uint8, int32, float16, float32.
        Must have the same shape and dtype as input_var
    out : dict
        dict of out
    kernel_name : str
        cce kernel name, default value is "assign_sub"

    Returns
    -------
    None
    """
    # get the shape and dtype
    shape_var = var.get("shape")
    shape_value = value.get("shape")
    dtype_var = var.get("dtype")
    dtype_value = value.get("dtype")

    # check whether the shapes are valid and equal
    check_shape(shape_var, param_name="var")
    check_shape(shape_value, param_name="value")
    if not operator.eq(shape_var, shape_value):
        raise RuntimeError("all input shapes must be equal")

    # check whether dtypes are fp16, fp32, int8, uint8, int32
    # and whether they are the same
    check_list = ("float16", "float32", "int8", "uint8", "int32")
    check_dtype(dtype_var, check_list, param_name="var")
    check_dtype(dtype_value, check_list, param_name="value")
    dtype_var = dtype_var.lower()
    dtype_value = dtype_value.lower()
    if dtype_var != dtype_value:
        raise RuntimeError("all input dtypes must be the same")

    shape, _ = refine_shape_axes(shape_var, [])
    data_var = tvm.placeholder(shape, dtype=dtype_var, name='data_var')
    data_value = tvm.placeholder(shape, dtype=dtype_value, name='data_value')
    sch, res = _assign_sub_compute(data_var, data_value, out, kernel_name)

    with set_bool_storage_config():
        tvm.build(sch, [data_var, data_value, res], "cce", name=kernel_name)
def __init__(self, var, indices, updates, var_out, use_locking, kernel_name):
    self.tik_instance = tik.Tik(tik.Dprofile())
    self.var_dtype = var.get("dtype").lower()
    self.indices_dtype = indices.get("dtype").lower()
    self.updates_dtype = updates.get("dtype").lower()
    self.out_dtype = var_out.get("dtype").lower()

    # dtype checks: indices must be int32, var/updates/out must be float32
    indices_support_dtype_list = ("int32", )
    var_support_dtype_list = ("float32", )
    check_dtype(self.indices_dtype, indices_support_dtype_list,
                param_name="indices")
    check_dtype(self.var_dtype, var_support_dtype_list, param_name="var")
    if self.var_dtype != self.updates_dtype:
        error_manager_vector.raise_err_inputs_dtype_not_equal(
            kernel_name, "updates", "var", self.updates_dtype, self.var_dtype)
    if self.var_dtype != self.out_dtype:
        error_manager_vector.raise_err_inputs_dtype_not_equal(
            kernel_name, "out", "var", self.out_dtype, self.var_dtype)
    self.kernel_name = kernel_name

    # soc parameters: core count, usable UB size and block sizes
    self.ai_core_num = tbe_platform.cce_conf.get_soc_spec(
        tbe_platform.cce_conf.CORE_NUM)
    self.ub_size_bytes = (
        tbe_platform.cce_conf.get_soc_spec(tbe_platform.cce_conf.UB_SIZE) -
        RESERVED_UB_SIZE)
    self.var_dtype_bytes_size = tbe_platform.cce_intrin.get_bit_len(
        self.var_dtype) // 8
    self.indices_dtype_bytes_size = tbe_platform.cce_intrin.get_bit_len(
        self.indices_dtype) // 8
    self.var_data_each_block = 32 // self.var_dtype_bytes_size
    self.indices_data_each_block = 32 // self.indices_dtype_bytes_size

    # global-memory tensors
    self.tiling_gm = self.tik_instance.Tensor("int32", (TILING_ARG_NUM, ),
                                              name="tiling_gm",
                                              scope=tik.scope_gm)
    self.var_gm = self.tik_instance.Tensor(self.var_dtype, (MAX_INT32, ),
                                           name="var_gm",
                                           scope=tik.scope_gm)
    self.indices_gm = self.tik_instance.Tensor(self.indices_dtype,
                                               (MAX_INT32, ),
                                               name="indices_gm",
                                               scope=tik.scope_gm)
    self.updates_gm = self.tik_instance.Tensor(self.updates_dtype,
                                               (MAX_INT32, ),
                                               name="updates_gm",
                                               scope=tik.scope_gm)
    self.out_gm = self.tik_instance.Tensor(self.var_dtype, (MAX_INT32, ),
                                           name="out_gm",
                                           scope=tik.scope_gm)

    self.updates_ub = None
    self.indices_ub = None
    self.var_read_index = None
    self.updates_read_index = None
    self.indices_loop_index = None
def bn_training_reduce(x, sum, square_sum, kernel_name="bn_training_reduce"):
    """
    algorithm: part of fused_batch_norm_v2
    The first step of batch_norm, which calculates the sum and square sum of x.
    The major component of this operator is the reduce operation.

    Parameters
    ----------
    x: dict
        dict of input, a 5HD Tensor for input data.
    sum: dict
        dict of sum, a `Tensor`. Sum of x.
    square_sum: dict
        dict of square_sum, a `Tensor`. Square sum of x.
    kernel_name: str
        kernel name, default value is "bn_training_reduce"

    Returns
    -------
    None
    """
    data_format = x.get("format").upper()
    origin_format = x.get("ori_format").upper()
    dtype = x.get("dtype").lower()

    # check format
    check_list = ("NC1HWC0", "NCHW")
    check_format(data_format, check_list, param_name="x")
    if data_format == "NCHW" and origin_format not in ("NCHW", ):
        raise RuntimeError("The origin format only supports "
                           "NCHW when format is NCHW")

    # check dtype
    check_list = ("float16", "float32")
    check_dtype(dtype, check_list, param_name="x")

    # get dynamic shape from x.get("shape") and x.get("range")
    shape_x = variable_shape([x])[0]

    # compute
    with te.op.compute():
        data_input = tvm.placeholder(shape_x, name="data_input", dtype=dtype)
        res = bn_training_reduce_compute(data_input,
                                         sum,
                                         square_sum,
                                         kernel_name=kernel_name)

    # schedule
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    # build
    tensor_list = [data_input] + list(res)
    config = {"name": kernel_name, "tensor_list": tensor_list}
    te.lang.dynamic.build(sch, config)
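# NumPy sketch of the two outputs, assuming the reduction runs over the
# N, H and W axes of the NC1HWC0 input (that axis choice is an assumption,
# not spelled out above).
def _bn_training_reduce_reference(x_5hd):
    import numpy as np
    axes = (0, 2, 3)                                   # N, H, W of (N, C1, H, W, C0)
    sum_ = np.sum(x_5hd, axis=axes, keepdims=True)     # shape (1, C1, 1, 1, C0)
    square_sum = np.sum(x_5hd * x_5hd, axis=axes, keepdims=True)
    return sum_, square_sum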
def _float32_process(data, dst_type):
    """
    deal with the case where the src dtype is float32
    """
    check_list_value = ("int32", "float16")
    check_dtype(dst_type, check_list_value, param_name="from_fp32_to_dsttype")
    if dst_type == "int32":
        return te.lang.dynamic.cast_to(data, "int32")
    if dst_type == "float16":
        return te.lang.dynamic.cast_to(data, "float16")
def lp_loss(predict, label, y, p, reduction="mean", kernel_name="lp_loss"):
    """
    :param predict: dict
        shape and dtype of input
    :param label: dict
        shape and dtype of label, should be same shape and type as predict
    :param y: dict
        shape and dtype of y, should be same shape and type as predict
    :param p: int
        decides which loss to compute; currently p can only be 1,
        which computes l1_loss
    :param reduction: str
        reduce mode, can be 'mean', 'sum' or 'none'
    :param kernel_name: kernel name, default value is "lp_loss"
    :return: None
    """
    predict_shape = predict.get("shape")
    predict_dtype = predict.get("dtype").lower()
    label_shape = label.get("shape")
    label_dtype = label.get("dtype").lower()

    dtype_list = ["float16", "float32"]
    reduction_list = ["none", "mean", "sum"]

    op_utils.check_dtype(predict_dtype, dtype_list)
    op_utils.check_dtype(label_dtype, dtype_list)
    op_utils.check_shape(predict_shape)
    op_utils.check_shape(label_shape)

    util.compare_tensor_dict_key(predict, label, "shape")
    util.compare_tensor_dict_key(predict, label, "dtype")

    if p != 1:
        raise RuntimeError("lp_loss only supports l1_loss")
    if reduction not in reduction_list:
        raise RuntimeError("reduction should be one of ['none', 'mean', 'sum']")

    predict_data = tvm.placeholder(predict_shape,
                                   dtype=predict_dtype,
                                   name="predict_data")
    label_data = tvm.placeholder(label_shape,
                                 dtype=label_dtype,
                                 name="label_data")

    res = lp_loss_compute(predict_data, label_data, p, reduction, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": [predict_data, label_data, res]
    }
    te.lang.cce.cce_build_code(schedule, config)
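# Since only p == 1 is accepted, the computation reduces to an L1 loss with
# the usual reduction modes; a NumPy reference sketch.
def _l1_loss_reference(predict, label, reduction="mean"):
    import numpy as np
    loss = np.abs(np.asarray(predict) - np.asarray(label))
    if reduction == "mean":
        return loss.mean()
    if reduction == "sum":
        return loss.sum()
    return loss                          # reduction == "none"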
def elu_grad(grads, activations, y, kernel_name="elu_grad"):
    """
    do element-wise elu_grad operation

    Parameters:
    ----------
    grads : the dict of gradient input, only support float16, float32
    activations : the dict of activation input, only support float16, float32
    y : the dict of output
    kernel_name : cce kernel name, default value is "elu_grad"

    Returns
    -------
    None
    """
    shape_gradient = grads.get("shape")
    shape_activation = activations.get("shape")
    dtype_gradient = grads.get("dtype")
    dtype_activation = activations.get("dtype")

    check_shape(shape_gradient, param_name="grads")
    check_shape(shape_activation, param_name="activations")
    if not operator.eq(shape_gradient, shape_activation):
        raise RuntimeError("all input shapes must be equal")
    shape_gradient, _ = refine_shape_axes(shape_gradient, [])
    shape_activation, _ = refine_shape_axes(shape_activation, [])

    check_list = ("float16", "float32")
    check_dtype(dtype_gradient, check_list, param_name="grads")
    check_dtype(dtype_activation, check_list, param_name="activations")
    if dtype_gradient.lower() != dtype_activation.lower():
        raise RuntimeError("all input dtypes must be the same")
    dtype = dtype_gradient.lower()

    data_gradient = tvm.placeholder(shape_gradient,
                                    dtype=dtype,
                                    name="data_gradient")
    data_activation = tvm.placeholder(shape_activation,
                                      dtype=dtype,
                                      name="data_activation")
    res = elu_grad_compute(data_gradient, data_activation, y, kernel_name)

    with tvm.target.cce():
        auto_sch = topi.generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "print_ir": False,
        "tensor_list": [data_gradient, data_activation, res]
    }
    te.lang.cce.cce_build_code(auto_sch, config)
def __init__(self, var, indices, updates, var_out, use_locking, kernel_name):
    self.tik_instance = tik.Tik(tik.Dprofile())
    self.indicesdtype = indices.get("dtype").lower()
    self.updatesdtype = updates.get("dtype").lower()
    self.vardtype = var.get("dtype").lower()
    self.var_out_dtype = var_out.get("dtype").lower()

    # dtype checks: indices must be int32, updates must be float32
    indices_support_dtype_list = ("int32", )
    check_dtype(self.indicesdtype, indices_support_dtype_list,
                param_name="indices")
    updates_support_dtype_list = ("float32", )
    check_dtype(self.updatesdtype, updates_support_dtype_list,
                param_name="updates")
    self.tiling_dtype = "int32"
    if self.updatesdtype != self.vardtype:
        error_manager_vector.raise_err_inputs_dtype_not_equal(
            kernel_name, "updates", "var", self.updatesdtype, self.vardtype)
    if self.vardtype != self.var_out_dtype:
        error_manager_vector.raise_err_inputs_dtype_not_equal(
            kernel_name, "var_out", "var", self.var_out_dtype, self.vardtype)
    self.kernel_name = kernel_name

    # scalars used while iterating over indices and updates
    self.var_read_index = self.tik_instance.Scalar("int32")
    self.updates_read_index = self.tik_instance.Scalar("int32")
    self.indices_loop_index = self.tik_instance.Scalar("int32")
    self.zero_var = self.tik_instance.Scalar(dtype=self.updatesdtype,
                                             name="zero_var")
    self.zero_var.set_as(0)
    self.indices_ub = None
    self.updates_ub = None
    self.core_num = self._tik_get_core_num()
    self.ub_size = self._tik_get_ub_size()

    # global-memory tensors
    self.tiling_gm = self.tik_instance.Tensor(self.tiling_dtype, (32, ),
                                              name="tiling_gm",
                                              scope=tik.scope_gm)
    self.input_var = self.tik_instance.Tensor(self.updatesdtype,
                                              (MAX_ZERO_DIM_VAR, ),
                                              name="input_var",
                                              scope=tik.scope_gm)
    self.input_indices = self.tik_instance.Tensor(self.indicesdtype,
                                                  (MAX_ZERO_DIM_INDICE, ),
                                                  name="input_indices",
                                                  scope=tik.scope_gm)
    self.input_updates = self.tik_instance.Tensor(self.updatesdtype,
                                                  (MAX_ZERO_DIM_INDICE, ),
                                                  name="input_updates",
                                                  scope=tik.scope_gm)
    self.output_var = self.tik_instance.Tensor(self.updatesdtype,
                                               (MAX_ZERO_DIM_VAR, ),
                                               name="output_var",
                                               scope=tik.scope_gm)
def relu6_grad(input_grad, input_x, output_y, kernel_name="relu6_grad"):
    """
    Parameters
    ----------
    input_grad : dict
        shape and dtype of input_grad
    input_x : dict
        shape and dtype of input_x
    output_y : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is "relu6_grad"

    Returns
    -------
    None
    """
    # check input shape
    shape_x = input_x.get("shape")
    shape_grad = input_grad.get("shape")
    op_utils.check_shape(shape_x, param_name="input_x")
    op_utils.check_shape(shape_grad, param_name="input_grad")
    if list(shape_x) != list(shape_grad):
        raise RuntimeError("input_grad and input_x must have the same shape.")

    # check input tensor data type and kernel_name
    check_list = ("float16", "float32")
    input_dtype = input_x.get("dtype").lower()
    grad_dtype = input_grad.get("dtype").lower()
    op_utils.check_dtype(input_dtype, check_list, param_name="input_x")
    op_utils.check_dtype(grad_dtype, check_list, param_name="input_grad")
    if input_dtype == "float32" and not tbe_platform.cce_conf.api_check_support(
            "te.lang.cce.vmuls", "float32"):
        raise RuntimeError(
            "The platform only supports float16, but the input dtype is float32")

    shape_x = [reduce_ins(lambda x, y: x * y, shape_x[:])]
    input_data_original = tvm.placeholder(shape_x,
                                          name="input_data",
                                          dtype=input_dtype)
    input_grad = tvm.placeholder(shape_x, name="input_grad", dtype=grad_dtype)

    final_res = relu6_grad_compute(input_grad,
                                   input_data_original,
                                   output_y,
                                   kernel_name="relu6_grad")
    with tvm.target.cce():
        auto_sch = generic.auto_schedule(final_res)

    config = {
        "name": kernel_name,
        "tensor_list": (input_grad, input_data_original, final_res)
    }
    te.lang.cce.cce_build_code(auto_sch, config)
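# For context, the usual relu6 gradient passes the incoming gradient only
# where the forward input lies inside (0, 6); the exact boundary handling is
# an assumption, sketched here with NumPy for reference only.
def _relu6_grad_reference(grad, x):
    import numpy as np
    x = np.asarray(x, dtype=np.float32)
    return np.where((x > 0) & (x < 6), grad, 0.0)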
def acos_grad(y, dy, z, kernel_name="acos_grad"):
    """
    do element-wise acos_grad operation between two input tensors

    Parameters:
    ----------
    y : dict of y, include shape and dtype, dtype support float16, float32
    dy : dict of dy, include shape and dtype, dtype support float16, float32
    z : dict of z, include shape and dtype, dtype support float16, float32
    kernel_name : cce kernel name, default value is "acos_grad"

    Returns
    -------
    None
    """
    # get the shape and dtype for input_1, input_2
    shape_y = y.get("shape")
    shape_dy = dy.get("shape")
    dtype = y.get("dtype")
    dtype1 = dy.get("dtype")

    check_shape(shape_y, param_name="y")
    check_shape(shape_dy, param_name="dy")
    shape_y, _ = refine_shape_axes(shape_y, [])
    shape_dy, _ = refine_shape_axes(shape_dy, [])

    # raise RuntimeError if the input parameters are invalid
    check_list = ("float16", "float32")
    check_dtype(dtype, check_list, param_name="y")
    check_dtype(dtype1, check_list, param_name="dy")
    dtype = dtype.lower()
    dtype1 = dtype1.lower()
    if not operator.eq(shape_y, shape_dy):
        raise RuntimeError(
            "acos_grad only supports inputs whose shapes are equal")
    if dtype != dtype1:
        raise RuntimeError(
            "acos_grad only supports inputs whose dtypes are equal")

    data_y = tvm.placeholder(shape_y, dtype=dtype, name="data1")
    data_dy = tvm.placeholder(shape_dy, dtype=dtype, name="data2")

    res = acos_grad_compute(data_y, data_dy, z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": (data_y, data_dy, res)}
    te.lang.cce.cce_build_code(sch, config)
def apply_power_sign_d(var,
                       m,
                       lr,
                       logbase,
                       sign_decay,
                       beta,
                       grad,
                       var_out,
                       m_out,
                       kernel_name="apply_power_sign_d"):
    """
    Update '*var' according to the PowerSign update

    Parameters:
    ----------
    var : dict of Variable, only support float16, float32
    m : dict of input_grad, only support float16, float32
    lr : dict of lr, only support float16, float32
    logbase : dict of logbase, only support float16, float32
    sign_decay : dict of sign_decay, only support float16, float32
    grad : dict of grad, only support float16, float32
    beta : dict of beta, only support float16, float32
    var_out : dict of output, only support float16, float32
    m_out : dict of output, only support float16, float32
    kernel_name : cce kernel name, default value is apply_power_sign_d

    Algorithm :
    ----------
    m_t <- beta * m_{t-1} + (1 - beta) * grad
    update <- exp(logbase * sign_decay * sign(grad) * sign(m_t)) * grad
    variable <- variable - lr_t * update

    Returns
    ----------
    None
    """
    input_dict = (var, m, lr, logbase, sign_decay, beta, grad)

    check_list = ('float16', 'float32')
    dtype = var.get('dtype')
    check_dtype(dtype, check_list, param_name="var")
    dtype = dtype.lower()

    args = ApplyOpConfig.TensorArgs(input_dict, apply_power_sign_d_compute,
                                    [var_out, m_out],
                                    6 if dtype == 'float32' else 10)
    name = ApplyOpConfig.TensorName(all=('var', 'm', 'lr', 'logbase',
                                         'sign_decay', 'beta', 'grad'),
                                    scalar=('lr', 'logbase', 'sign_decay',
                                            'beta'),
                                    reuse=('m', 'var'))

    common_apply_op_process(ApplyOpConfig(args, name), kernel_name)
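# NumPy transcription of the algorithm in the docstring (reference sketch
# only; the kernel itself is generated through common_apply_op_process).
def _power_sign_reference(var, m, lr, logbase, sign_decay, beta, grad):
    import numpy as np
    m_t = beta * m + (1.0 - beta) * grad
    update = np.exp(logbase * sign_decay * np.sign(grad) * np.sign(m_t)) * grad
    var_t = var - lr * update
    return var_t, m_t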
def asin_grad(y, dy, z, kernel_name="asin_grad"):
    """
    do element-wise asin_grad operation between two input tensors

    Parameters:
    ----------
    y : dict of y, include shape and dtype, dtype support float16, float32
    dy : dict of dy, include shape and dtype, dtype support float16, float32
    z : dict of output
    kernel_name : cce kernel name, default value is "asin_grad"

    Returns
    -------
    None
    """
    # get the shape and dtype
    shape_y = y.get("shape")
    shape_dy = dy.get("shape")
    dtype_y = y.get("dtype")
    dtype_dy = dy.get("dtype")

    # check whether the shapes are valid and equal
    check_shape(shape_y, param_name="y")
    check_shape(shape_dy, param_name="dy")
    if not operator.eq(shape_y, shape_dy):
        raise RuntimeError("all input shapes must be the same")
    shape_y, _ = refine_shape_axes(shape_y, [])
    shape_dy, _ = refine_shape_axes(shape_dy, [])

    # check whether dtypes are fp16, fp32 and whether they are the same
    check_list = ("float16", "float32")
    check_dtype(dtype_y, check_list, param_name="y")
    check_dtype(dtype_dy, check_list, param_name="dy")
    dtype_y = dtype_y.lower()
    if dtype_y != dtype_dy.lower():
        raise RuntimeError("all input dtypes must be the same")

    # get 2 input tensors: data_y, data_dy
    data_y = tvm.placeholder(shape_y, name="data_y", dtype=dtype_y)
    data_dy = tvm.placeholder(shape_y, name="data_dy", dtype=dtype_y)
    res = asin_grad_compute(data_y, data_dy, z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_y, data_dy, res]}
    te.lang.cce.cce_build_code(sch, config)
def atan2(x1, x2, y, kernel_name="atan2"):
    """
    Algorithm: arctan2
        arctan2(y, x) = arctan(y / x)

    Parameters
    ----------
    x1 : the dict of input data x1, only support float16, float32.
    x2 : the dict of input data x2, only support float16, float32.
    y : the dict of output
    kernel_name : default value is "atan2".

    Returns
    -------
    None
    """
    y_shape = x1.get("shape")
    x_shape = x2.get("shape")
    y_dtype = x1.get("dtype")
    x_dtype = x2.get("dtype")

    check_shape(y_shape, param_name="x1")
    check_shape(x_shape, param_name="x2")

    shape_y, shape_x, shape_max = broadcast_shapes(
        y_shape, x_shape, param_name_input1="x1", param_name_input2="x2")

    check_list = ("float16", "float32")
    check_dtype(y_dtype, check_list, param_name="x1")
    check_dtype(x_dtype, check_list, param_name="x2")
    if y_dtype.lower() != x_dtype.lower():
        raise RuntimeError("The input tensors must have identical dtype!")

    shape_y, shape_x = refine_shapes_for_broadcast(shape_y, shape_x)
    input_y = tvm.placeholder(shape_y, dtype=y_dtype.lower(), name="input_y")
    input_x = tvm.placeholder(shape_x, dtype=x_dtype.lower(), name="input_x")

    res = atan2_compute(input_y, input_x, y, kernel_name)
    res = te.lang.cce.cast_to(res, x_dtype.lower())

    with tvm.target.cce():
        auto_sch = topi.generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": (input_y, input_x, res),
        "print_ir": False,
        "bool_storage_as_1bit": False
    }
    te.lang.cce.cce_build_code(auto_sch, config)
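# The identity arctan2(y, x) = arctan(y / x) only holds for x > 0; full atan2
# also resolves the quadrant from the signs of both inputs, as numpy.arctan2
# does. A reference comparison (values illustrative only).
def _atan2_reference_check():
    import numpy as np
    y = np.array([1.0, 1.0, -1.0])
    x = np.array([1.0, -1.0, -1.0])
    full = np.arctan2(y, x)              # [ 0.7854,  2.3562, -2.3562] quadrant-aware
    naive = np.arctan(y / x)             # [ 0.7854, -0.7854,  0.7854] x > 0 only
    return full, naive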
def check_input_params(self):
    """
    check whether the input parameters are valid or not
    """
    if self.input_dtype != self.output_dtype:
        error_manager_vector.raise_err_inputs_dtype_not_equal(
            "split_d", "self.input_dtype", "self.output_dtype",
            self.input_dtype, self.output_dtype)

    dtype_list = ("float16", "float32", "int32", "int8", "int16", "int64",
                  "uint8", "uint16", "uint32", "uint64")
    check_dtype(self.input_dtype, dtype_list, param_name="x")
def depthwise_weight_6d_2_4d(x,
                             y,
                             src_format,
                             dst_format,
                             kernel_name="depthwise_weight_6d_2_4d"):
    """Operation and Schedule for depthwise_weight_6d_2_4d.

    Parameters
    ----------
    x : shape and dtype of input; the dtype supports float16, float32,
        int32, uint16.
    y : the shape and dtype of output; the dtype is the same as the input.
    src_format : the source data_format
    dst_format : the target data_format
    kernel_name : cce kernel name, default value is "depthwise_weight_6d_2_4d"

    Returns
    -------
    convert C1HWNCoC0 to HWCN
    """
    _check_parameters(x, y, src_format, dst_format)
    output_shape = y.get("shape")
    channel_size = output_shape[2]
    input_shape = x.get("shape")
    dtype = x.get("dtype")
    channel_4d = channel_size
    op_utils.check_shape(input_shape, param_name="x")

    check_list = ("float16", "float32", "int32", "uint16")
    dtype = dtype.lower()
    op_utils.check_dtype(dtype, check_list, param_name="x")

    input_data = tvm.placeholder(input_shape, name="input_data", dtype=dtype)

    six2four = _Six2FourParam(input_shape, channel_4d)

    res = tvm.extern(
        [six2four.get_out_shape()], [input_data],
        lambda ins, outs: _intrin_factor(six2four, dtype, ins, outs),
        name="res",
        dtype=dtype)

    sch = tvm.create_schedule(res.op)
    build_list = [input_data, res]

    with build_config:
        tvm.build(sch, build_list, "cce", name=kernel_name)
def depthwise_weight_4d_2_6d(x,
                             y,
                             src_format,
                             dst_format,
                             kernel_name="depthwise_weight_4d_2_6d"):
    """Operation and Schedule for depthwise_weight_4d_2_6d.

    Parameters
    ----------
    x : shape and dtype of input; the dtype supports float16, float32,
        int32, uint16.
    y : the shape and dtype of output; the dtype is the same as the input.
    src_format : the source data_format
    dst_format : the target data_format
    kernel_name : cce kernel name, default value is "depthwise_weight_4d_2_6d"

    Returns
    -------
    convert HWCN to C1HWNCoC0
    """
    if src_format.lower() != "hwcn":
        raise RuntimeError("src_format must be HWCN!")

    if dst_format.lower() != "c1hwncoc0":
        raise RuntimeError("dst_format must be C1HWNCoC0!")

    input_shape = x.get("shape")
    dtype = x.get("dtype")
    op_utils.check_shape(input_shape, param_name="x")

    check_list = ("float16", "float32", "int32", "uint16")
    dtype = dtype.lower()
    op_utils.check_dtype(dtype, check_list, param_name="x")

    input_data = tvm.placeholder(input_shape, name="input_data", dtype=dtype)

    four2six = _Four2SixParam(input_shape)

    res = tvm.extern(
        [four2six.get_out_shape()], [input_data],
        lambda ins, outs: _intrin_factor(four2six, dtype, ins, outs),
        name="res",
        dtype=dtype)

    sch = tvm.create_schedule(res.op)
    build_list = [input_data, res]

    with build_config:
        tvm.build(sch, build_list, "cce", name=kernel_name)