def _infer_shape(format_pattern, x, y):
    """Infer usable shapes for mixed FRACTAL_NZ / ND inputs of mul.

    format_pattern: 1 means x is FRACTAL_NZ and y is an ND vector;
    2 means x is an ND vector and y is FRACTAL_NZ (set by _mul_check_format).
    x, y: dicts carrying "shape" (storage shape) and "ori_shape"
    (original ND shape).
    Returns the (possibly padded) shape lists for x and y.
    """
    shape_x = x.get("shape")
    shape_y = y.get("shape")
    ori_shape_x = x.get("ori_shape")
    ori_shape_y = y.get("ori_shape")
    # Promote scalars to one-element tensor shapes.
    shape_x = util.scalar2tensor_one(shape_x)
    shape_y = util.scalar2tensor_one(shape_y)
    if format_pattern == 1:
        # Align y's ND shape against x's original (pre-fractal) ND shape.
        ori_shape_x, shape_y, shape_max = op_utils.broadcast_shapes(
            ori_shape_x, shape_y, param_name_input1="x", param_name_input2="y")
        # NOTE(review): a full (M, N) match means y is a whole matrix, not a
        # vector, which contradicts the Nz-with-vector classification above,
        # hence it is rejected here — confirm against _mul_check_format.
        if shape_y[-2] == ori_shape_x[-2] and shape_y[-1] == ori_shape_x[-1]:
            raise RuntimeError("the inputshape of y is illegal")
        if shape_y[-2] == 1 and shape_y[-1] == ori_shape_x[-1]:
            # y is a (1, N) row vector: pad to rank 4 and copy the axes
            # that must line up with x's fractal storage shape.
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-3] = 1
            shape_y[-1] = shape_x[-1]
            shape_y[-4] = shape_x[-4]
        elif shape_y[-2] == ori_shape_x[-2] and shape_y[-1] == 1:
            # y is an (M, 1) column vector.
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-4] = 1
            shape_y[-2] = shape_x[-2]
            shape_y[-3] = shape_x[-3]
        elif shape_y[-2] == shape_y[-1] == 1:
            # y is (1, 1): only the two trailing axes need padding.
            shape_y.append(1)
            shape_y.append(1)
    elif format_pattern == 2:
        # Mirror case: x is the ND vector, y is FRACTAL_NZ.
        shape_x, ori_shape_y, shape_max = op_utils.broadcast_shapes(
            shape_x, ori_shape_y, param_name_input1="x", param_name_input2="y")
        if shape_x[-2] == ori_shape_y[-2] and shape_x[-1] == ori_shape_y[-1]:
            raise RuntimeError("the inputshape of x is illegal")
        if shape_x[-2] == 1 and shape_x[-1] == ori_shape_y[-1]:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-3] = 1
            shape_x[-1] = shape_y[-1]
            shape_x[-4] = shape_y[-4]
        elif shape_x[-2] == ori_shape_y[-2] and shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-4] = 1
            shape_x[-2] = shape_y[-2]
            shape_x[-3] = shape_y[-3]
        elif shape_x[-2] == shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)
    return shape_x, shape_y
def _infer_shape(format_pattern, x, y):
    """Pad the vector operand's shape for mixed NZ/ND broadcasting.

    ``format_pattern`` 1 treats y as the vector operand, 2 treats x as
    the vector operand; any other value leaves both shapes untouched.
    Returns the two (possibly padded) shape lists.
    """
    shape_x = scalar2tensor_one(x.get("shape"))
    shape_y = scalar2tensor_one(y.get("shape"))

    if format_pattern == 1:
        shape_x, shape_y, _ = broadcast_shapes(
            shape_x, shape_y,
            param_name_input1="input_x",
            param_name_input2="input_y")
        if shape_y[-2] == 1 and shape_y[-1] == shape_x[-1]:
            # (1, N) row vector: extend to rank 4 and copy fractal axes.
            shape_y.extend([1, 1])
            shape_y[-3] = 1
            shape_y[-1] = shape_x[-1]
            shape_y[-4] = shape_x[-4]
        elif shape_y[-2] == shape_x[-2] and shape_y[-1] == 1:
            # (M, 1) column vector.
            shape_y.extend([1, 1])
            shape_y[-4] = 1
            shape_y[-2] = shape_x[-2]
            shape_y[-3] = shape_x[-3]
        elif shape_y[-2] == 1 and shape_y[-1] == 1:
            # (1, 1): only trailing axes need padding.
            shape_y.extend([1, 1])
    elif format_pattern == 2:
        shape_x, shape_y, _ = broadcast_shapes(
            shape_x, shape_y,
            param_name_input1="input_x",
            param_name_input2="input_y")
        if shape_x[-2] == 1 and shape_x[-1] == shape_y[-1]:
            shape_x.extend([1, 1])
            shape_x[-3] = 1
            shape_x[-1] = shape_y[-1]
            shape_x[-4] = shape_y[-4]
        elif shape_x[-2] == shape_y[-2] and shape_x[-1] == 1:
            shape_x.extend([1, 1])
            shape_x[-4] = 1
            shape_x[-2] = shape_y[-2]
            shape_x[-3] = shape_y[-3]
        elif shape_x[-2] == 1 and shape_x[-1] == 1:
            shape_x.extend([1, 1])

    return shape_x, shape_y
def addcdiv(x1, x2, x3, y=None, alpha=1.0, kernel_name="addcdiv"):
    """addcdiv op entry (cf. torch.addcdiv; presumably
    y = x1 + alpha * x2 / x3 — see addcdiv_compute for the exact formula).

    x1, x2, x3: dicts with "shape" and "dtype" (float16/float32 only);
    all three dtypes must match.
    y: dict describing the output tensor (unused here beyond the prototype).
    alpha: scaling coefficient applied to the quotient.
    kernel_name: CCE kernel name.
    """
    check_list = ("float16", "float32")

    shape_x1 = x1.get("shape")
    dtype_x1 = x1.get("dtype").lower()
    shape_x2 = x2.get("shape")
    dtype_x2 = x2.get("dtype").lower()
    shape_x3 = x3.get("shape")
    dtype_x3 = x3.get("dtype").lower()

    util.check_shape_rule(shape_x1)  # validate shape rank (must be in [1, 8])
    util.check_shape_size(shape_x1, SHAPE_SIZE_LIMIT)  # validate first input's shape size
    util.check_dtype_rule(dtype_x1, check_list)  # validate input dtype
    util.check_shape_rule(shape_x2)
    util.check_shape_size(shape_x2, SHAPE_SIZE_LIMIT)
    util.check_dtype_rule(dtype_x2, check_list)
    util.check_shape_rule(shape_x3)
    util.check_shape_size(shape_x3, SHAPE_SIZE_LIMIT)
    util.check_dtype_rule(dtype_x3, check_list)

    if dtype_x1 != dtype_x2 or dtype_x1 != dtype_x3:
        raise RuntimeError("the type of x1, x2, x3 must be the same!")

    util.check_kernel_name(kernel_name)  # validate the kernel name

    # Take the per-dimension maximum of shape_x1/x2/x3 as shape_max.
    shape_x2, shape_x3, shape_max = broadcast_shapes(shape_x2, shape_x3)
    util.check_tensor_shape_size(shape_max)  # validate shape_max
    shape_x1, _, shape_max = broadcast_shapes(shape_x1, shape_max)
    util.check_tensor_shape_size(shape_max)  # validate shape_max
    shape_x2, _, _ = broadcast_shapes(shape_x2, shape_max)  # align shape_x2 to shape_max
    shape_x3, _, _ = broadcast_shapes(shape_x3, shape_max)  # align shape_x3 to shape_max

    data_x1 = tvm.placeholder(shape_x1, name="data_x1", dtype=dtype_x1)
    data_x2 = tvm.placeholder(shape_x2, name="data_x2", dtype=dtype_x2)
    data_x3 = tvm.placeholder(shape_x3, name="data_x3", dtype=dtype_x3)

    res = addcdiv_compute(data_x1, data_x2, data_x3, shape_max, alpha,
                          kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_x1, data_x2, data_x3, res]}
    te.lang.cce.cce_build_code(schedule, config)
def maximum_compute(x1, x2, y, kernel_name="maximum"):
    """Element-wise maximum of two tensors (dynamic shape).

    Parameters
    ----------
    x1, x2: TVM tensors, the two inputs.
    y: dict, shape and dtype of the output (unused in the computation).
    kernel_name: str, cce kernel name, default "maximum".

    Returns
    -------
    TVM tensor with the broadcast shape of the inputs.
    """
    # Only the broadcast target shape is needed; the aligned input
    # shapes are discarded.
    _, _, common_shape = broadcast_shapes(
        te.lang.dynamic.shape_to_list(x1.shape),
        te.lang.dynamic.shape_to_list(x2.shape),
        param_name_input1="x1",
        param_name_input2="x2")
    lhs = te.lang.dynamic.broadcast(x1, common_shape)
    rhs = te.lang.dynamic.broadcast(x2, common_shape)
    return te.lang.dynamic.vmax(lhs, rhs)
def mul_compute(input1, input2, output, kernel_name="mul"):
    """Element-wise multiply, c = a * b (dynamic shape).

    Parameters
    ----------
    input1, input2: TVM tensors, the two operands.
    output: dict, shape and dtype of the output (broadcast of the inputs).
    kernel_name: str, cce kernel name, default "mul".

    Returns
    -------
    TVM tensor holding the product, in the broadcast shape.
    """
    shape_a = te.lang.dynamic.shape_to_list(input1.shape)
    shape_b = te.lang.dynamic.shape_to_list(input2.shape)
    _, _, out_shape = broadcast_shapes(shape_a, shape_b,
                                       param_name_input1="input1",
                                       param_name_input2="input2")
    lhs = te.lang.dynamic.broadcast(input1, out_shape)
    rhs = te.lang.dynamic.broadcast(input2, out_shape)
    return te.lang.dynamic.vmul(lhs, rhs)
def _mul_compute(input_x, input_y, output_data, kernel_name="mul"):
    """Element-wise multiply for the static (cce) build.

    Tries the specialised 2-D path (_mul_compute_ex) first when both
    operands are rank-2 with differing shapes; that helper may decline
    by returning None, in which case the generic broadcast path runs.

    Parameters
    ----------
    input_x, input_y: TVM tensors, the two operands.
    output_data: dict, output description (unused in the computation).
    kernel_name: str, cce kernel name, default "mul".

    Returns
    -------
    TVM tensor holding the element-wise product.
    """
    lhs_shape, rhs_shape, out_shape = op_utils.broadcast_shapes(
        te.lang.cce.util.shape_to_list(input_x.shape),
        te.lang.cce.util.shape_to_list(input_y.shape),
        param_name_input1="x",
        param_name_input2="y")

    if len(lhs_shape) == 2 and len(rhs_shape) == 2 and lhs_shape != rhs_shape:
        special = _mul_compute_ex(input_x, input_y, lhs_shape, rhs_shape,
                                  out_shape)
        if special is not None:
            return special

    lhs = te.lang.cce.broadcast(input_x, out_shape)
    rhs = te.lang.cce.broadcast(input_y, out_shape)
    return te.lang.cce.vmul(lhs, rhs)
def sub_compute(input_x, input_y, output_z, kernel_name="sub"):
    """Element-wise subtraction, c = a - b (dynamic shape).

    Parameters
    ----------
    input_x, input_y: TVM tensors, minuend and subtrahend.
    output_z: dict, shape and dtype of the output (unused here).
    kernel_name: str, cce kernel name, default "sub".

    Returns
    -------
    TVM tensor holding the difference, in the broadcast shape.
    """
    _, _, shape_max = broadcast_shapes(
        te.lang.dynamic.shape_to_list(input_x.shape),
        te.lang.dynamic.shape_to_list(input_y.shape),
        param_name_input1="input_x",
        param_name_input2="input_y")
    lhs = te.lang.dynamic.broadcast(input_x, shape_max)
    rhs = te.lang.dynamic.broadcast(input_y, shape_max)
    return te.lang.dynamic.vsub(lhs, rhs)
def _check_shape_compatibility(shape_in, shape_out):
    """
    Check if the shape of input tensor is compatible with output tensor.

    Parameters:
    ----------
    shape_in : shape of input tensor.

    shape_out : shape of output tensor.

    Returns:
    -------
    comp_shape_in : new shape_in compatible with shape_out.

    Raises:
    -------
    ValueError : if the shapes cannot be broadcast together, or if
        broadcasting would change shape_out (i.e. shape_out is not
        already the broadcast result).
    """
    try:
        comp_shape_in, comp_shape_out, shape_max = broadcast_shapes(
            shape_in, shape_out,
            param_name_input1="value",
            param_name_input2="dims")
    except RuntimeError as err:
        # Preserve the broadcast failure as the cause instead of
        # silently discarding it.
        raise ValueError('shape_in is not compatible with shape_out.') from err

    # shape_out must equal the broadcast result: shape_in may only
    # broadcast "up to" shape_out, never enlarge it.
    if comp_shape_out != shape_max:
        raise ValueError('shape_in is not compatible with shape_out.')

    return comp_shape_in
def add_compute(input_x, input_y, output_z, kernel_name="add"):
    """Element-wise addition, c = a + b (dynamic shape).

    Parameters
    ----------
    input_x, input_y: TVM tensors, the two addends.
    output_z: dict, shape and dtype of the output (unused here).
    kernel_name: str, cce kernel name, default "add".

    Returns
    -------
    TVM tensor holding the sum, in the broadcast shape.
    """
    _, _, shape_max = broadcast_shapes(
        te.lang.dynamic.shape_to_list(input_x.shape),
        te.lang.dynamic.shape_to_list(input_y.shape),
        param_name_input1="input_x",
        param_name_input2="input_y")
    lhs = te.lang.dynamic.broadcast(input_x, shape_max)
    rhs = te.lang.dynamic.broadcast(input_y, shape_max)
    return te.lang.dynamic.vadd(lhs, rhs)
def logical_or_compute(x1, x2, y, kernel_name="logical_or"):
    """Element-wise logical OR of two 0/1 int8 tensors.

    OR is emulated arithmetically: over values {0, 1},
    max(a, b) == a OR b, so the inputs are cast to float16, broadcast,
    combined with vmax, and cast back to int8.

    Parameters
    ----------
    x1, x2: TVM tensor placeholders of the inputs.
    y: dict of the output (unused in the computation).
    kernel_name: str, cce kernel name, default "logical_or".

    Returns
    -------
    int8 TVM tensor with the broadcast shape of the inputs.
    """
    _, _, out_shape = broadcast_shapes(
        te.lang.cce.util.shape_to_list(x1.shape),
        te.lang.cce.util.shape_to_list(x2.shape),
        param_name_input1="x1",
        param_name_input2="x2")
    lhs = te.lang.cce.broadcast(te.lang.cce.cast_to(x1, "float16"), out_shape)
    rhs = te.lang.cce.broadcast(te.lang.cce.cast_to(x2, "float16"), out_shape)
    combined = te.lang.cce.vmax(lhs, rhs)
    return te.lang.cce.cast_to(combined, "int8")
def real_div_compute(x1, x2, y, kernel_name="real_div"):
    """
    calculating data's realdiv, c = a / b

    Parameters
    ----------
    x1: TVM tensor
        the placeholder of first input data
    x2: TVM tensor
        the placeholder of second input data
    y: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name: str
        cce kernel name, default value is real_div

    Returns
    -------
    res : output of the data's divide
    """
    shape_x = te.lang.dynamic.shape_to_list(x1.shape)
    shape_y = te.lang.dynamic.shape_to_list(x2.shape)
    # Pass the operator's real input names so broadcast error messages
    # point at "x1"/"x2" — consistent with the other element-wise
    # computes in this file (the original omitted them).
    shape_x, shape_y, shape_max = broadcast_shapes(shape_x, shape_y,
                                                   param_name_input1="x1",
                                                   param_name_input2="x2")
    data_x = te.lang.dynamic.broadcast(x1, shape_max)
    data_y = te.lang.dynamic.broadcast(x2, shape_max)
    res = te.lang.dynamic.vdiv(data_x, data_y)

    return res
def logical_or(x1, x2, y, kernel_name="logical_or"):
    """Op entry for logical OR of two int8 0/1 tensors.

    Parameters
    ----------
    x1, x2: dicts with "shape" and "dtype"; dtype must be int8 (bool is
        mapped to int8), values restricted to 0/1.
    y: dict of the output, with shape and dtype.
    kernel_name: str, cce kernel name, default "logical_or".

    Returns
    -------
    None — builds and emits the kernel.
    """
    shape_x1 = x1.get("shape")
    shape_x2 = x2.get("shape")
    dtype_x1 = x1.get("dtype")
    dtype_x2 = x2.get("dtype")

    # bool inputs are processed as int8.
    if "bool" in (dtype_x1, dtype_x2):
        dtype_x1 = "int8"
        dtype_x2 = "int8"

    check_shape(shape_x1, param_name="x1")
    check_shape(shape_x2, param_name="x2")
    check_dtype(dtype_x1, ("int8", ), param_name="x1")
    check_dtype(dtype_x2, ("int8", ), param_name="x2")

    shape_x1, shape_x2, _ = broadcast_shapes(shape_x1, shape_x2,
                                             param_name_input1="x1",
                                             param_name_input2="x2")

    dtype = dtype_x1.lower()
    data_x1 = tvm.placeholder(shape_x1, name="data_x1", dtype=dtype)
    data_x2 = tvm.placeholder(shape_x2, name="data_x2", dtype=dtype)

    res = logical_or_compute(data_x1, data_x2, y, kernel_name)
    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "need_build": False,
        "name": kernel_name,
        "tensor_list": (data_x1, data_x2, res)
    }
    te.lang.cce.cce_build_code(schedule, config)
def mul(x, y, output, kernel_name="mul"): """ do element-wise mul operation between two input tensors Parameters: ---------- x : dict. shape, dtype of input x y : dict. shape, dtype of input y output : dict. shape, dtype of ouput kernel_name : str. cce kernel name, default value is "mul" Returns ------- None """ # format_pattern = 1 Nz and vector # format_pattern = 2 vector and Nz # format_pattern = 0 Nz scalar Nz Nz ND ND format_pattern = _mul_check_format(x, y) shape_x, shape_y = _infer_shape(format_pattern, x, y) shape_x = util.scalar2tensor_one(shape_x) dtype_x = x.get("dtype").lower() shape_y = util.scalar2tensor_one(shape_y) dtype_y = y.get("dtype").lower() op_utils.check_shape(shape_x, param_name="x") op_utils.check_shape(shape_y, param_name="y") if dtype_x != dtype_y: raise RuntimeError("dtype of inputs should be consistent") dtype = dtype_x check_list = ("int32", "float16", "float32", "int16") op_utils.check_dtype(dtype, check_list, param_name="x") vmul_support = tbe_platform.cce_conf.api_check_support( "te.lang.cce.vmul", "float32") if dtype_x == "float32" and not vmul_support: raise RuntimeError( "Input dtype is float32, but do not support on the platform") shape_x, shape_y, shape_max = op_utils.broadcast_shapes( shape_x, shape_y, param_name_input1="x", param_name_input2="y") shape_x, shape_y = op_utils.refine_shapes_for_broadcast(shape_x, shape_y) input_x = tvm.placeholder(shape_x, dtype=dtype, name="x") input_y = tvm.placeholder(shape_y, dtype=dtype, name="y") res = _mul_compute(input_x, input_y, output, kernel_name) with tvm.target.cce(): sch = generic.auto_schedule(res) config = {"name": kernel_name, "tensor_list": (input_x, input_y, res)} te.lang.cce.cce_build_code(sch, config)
def dequantize_compute(x, min_range, max_range, y, mode="MIN_COMBINED",
                       kernel_name="dequantize"):
    """
    Computation for dequantize the 'input' tensor into a float tensor.

    Parameters:
    ----------
    x: input data, dtype must be one of the following:
        only support `int8`, `uint8`, `int32`,
    min_range: input min_range, dtype must be `float32`.
        The minimum scalar value possibly produced for the input.
    max_range: input max_range, dtype must be `float32`.
        The maximum scalar value possibly produced for the input.
    y: the dict of output_data, dtype must be `float32`.
    mode: An optional `string` from: `"MIN_COMBINED", "MIN_FIRST", "SCALED"`.
        Defaults to `"MIN_COMBINED"`.
    kernel_name : cce kernel name, default value is "dequantize".

    Returns
    -------
    res : output of the dequantization's computation.

    Raises
    ------
    RuntimeError : if mode is not one of the three supported strings.
    """
    input_tensor = x
    shape_x = te.lang.cce.util.shape_to_list(x.shape)
    shape_range = te.lang.cce.util.shape_to_list(max_range.shape)
    # Only the broadcast target shape is needed here; the aligned input
    # shapes are unused.
    _, _, shape_max = op_utils.broadcast_shapes(
        shape_x,
        shape_range,
        param_name_input1="x",
        param_name_input2="max_range")
    broadcast_min_range = te.lang.cce.broadcast(min_range, shape_max)
    broadcast_max_range = te.lang.cce.broadcast(max_range, shape_max)

    if mode == "MIN_COMBINED":
        res = _min_combined_mode_compute(input_tensor, broadcast_min_range,
                                         broadcast_max_range)
    elif mode == "MIN_FIRST":
        res = _min_first_mode_compute(input_tensor, broadcast_min_range,
                                      broadcast_max_range)
    elif mode == "SCALED":
        res = _scaled_mode_compute(input_tensor, broadcast_max_range)
    else:
        # Previously an unknown mode fell through and crashed with a
        # confusing NameError on the unbound `res`; fail explicitly.
        raise RuntimeError(
            "mode only support MIN_COMBINED, MIN_FIRST, SCALED.")
    return res
def atan2_compute(y, x, output_dict, kernel_name="atan2"):
    """
    Algorithm: atan2(y, x) = arctan(y / x), corrected by quadrant
    ----------------------------------
    Parameters:

        y: Input data y.

        x: Input data x.

        output_dict: dict of the output (not used by the computation).

        kernel_name: cce kernel name, default value is "atan2"
    ----------------------------------
    Returns:

        A Tensor of atan2(x).

    """
    shape_y = y.shape
    dtype_y = y.dtype
    shape_x = x.shape

    shape_y = te.lang.cce.util.shape_to_list(shape_y)
    shape_x = te.lang.cce.util.shape_to_list(shape_x)
    shape_y, shape_x, shape_broadcast = broadcast_shapes(
        shape_y, shape_x, param_name_input1="x1", param_name_input2="x2")
    y = te.lang.cce.broadcast(y, shape_broadcast)
    x = te.lang.cce.broadcast(x, shape_broadcast)

    # Compute in float32 when the hardware supports it, for precision.
    if dtype_y == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        y = te.lang.cce.cast_to(y, "float32")
        x = te.lang.cce.cast_to(x, "float32")

    # mask holds the quadrant-correction factors; presumably
    # mask[CONST_ZERO] flags x < 0 and mask[CONST_ONE] flags x == 0
    # (signed by y) — confirm against _init_atan2_mask.
    mask = _init_atan2_mask(y, x)

    # calculate the atan(y/x) when x > 0
    res = te.lang.cce.vdiv(y, x)
    res = _atan_compute(res)

    # +/- pi/2 term used where x == 0; +/- pi term added where x < 0.
    y_cmp_zero = te.lang.cce.vmuls(mask[CONST_ONE],
                                   tvm.const(CONST_PI_BY_TWO, y.dtype))
    res_x_lt_zero = te.lang.cce.vmuls(mask[CONST_ZERO],
                                      tvm.const(CONST_PI, y.dtype))

    if x.dtype == res.dtype and api_check_support("te.lang.cce.vcmpsel",
                                                  x.dtype):
        # Fast path: fused compare-and-select.
        res = te.lang.cce.vcmpsel(x, tvm.const(CONST_ZERO, x.dtype), 'eq',
                                  y_cmp_zero, res)
    else:
        # Fallback: explicit compare mask followed by select.
        tensor_zero = te.lang.cce.broadcast(tvm.const(CONST_ZERO, x.dtype),
                                            shape_broadcast)
        x_equal_zero = te.lang.cce.vcmp(x, tensor_zero, 'eq')
        res = te.lang.cce.vsel(x_equal_zero, y_cmp_zero, res)

    res = te.lang.cce.vadd(res, res_x_lt_zero)

    if dtype_y == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res
def atan2(x1, x2, y, kernel_name="atan2"):
    """Op entry for arctan2: atan2(y, x) = arctan(y / x).

    Parameters
    ----------
    x1: dict of the numerator input (y), float16 or float32.
    x2: dict of the denominator input (x), float16 or float32;
        must share x1's dtype.
    y: dict of the output.
    kernel_name: str, default "atan2".

    Returns
    -------
    None — builds and emits the kernel.
    """
    shape_num = x1.get("shape")
    shape_den = x2.get("shape")
    dtype_num = x1.get("dtype")
    dtype_den = x2.get("dtype")

    check_shape(shape_num, param_name="x1")
    check_shape(shape_den, param_name="x2")

    shape_num, shape_den, _ = broadcast_shapes(shape_num, shape_den,
                                               param_name_input1="x1",
                                               param_name_input2="x2")

    check_dtype(dtype_num, ("float16", "float32"), param_name="x1")
    check_dtype(dtype_den, ("float16", "float32"), param_name="x2")
    if dtype_num.lower() != dtype_den.lower():
        raise RuntimeError("The input tensor must have identical dtype!")

    shape_num, shape_den = refine_shapes_for_broadcast(shape_num, shape_den)

    input_y = tvm.placeholder(shape_num, dtype=dtype_num.lower(),
                              name="input_y")
    input_x = tvm.placeholder(shape_den, dtype=dtype_den.lower(),
                              name="input_x")

    res = atan2_compute(input_y, input_x, y, kernel_name)
    res = te.lang.cce.cast_to(res, dtype_den.lower())
    with tvm.target.cce():
        auto_sch = topi.generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": (input_y, input_x, res),
        "print_ir": False,
        "bool_storage_as_1bit": False
    }
    te.lang.cce.cce_build_code(auto_sch, config)
def floor_div_compute(input_x, input_y, output_z, kernel_name='floor_div'):
    """Floor division: res = floor(x / y) (dynamic shape).

    Parameters
    ----------
    input_x, input_y: TVM tensors, dividend and divisor.
    output_z: dict of the output (unused in the computation).
    kernel_name: str, default "floor_div".

    Returns
    -------
    TVM tensor with floor(x / y) in the input dtype.
    """
    src_dtype = input_x.dtype
    _, _, shape_broad = broadcast_shapes(
        te.lang.dynamic.shape_to_list(input_x.shape),
        te.lang.dynamic.shape_to_list(input_y.shape),
        param_name_input1="input_x",
        param_name_input2="input_y")

    # Divide in float32 for non-fp16 sources when the platform allows.
    if src_dtype != "float16" and tbe_platform.cce_conf.api_check_support(
            "te.lang.dynamic.vdiv", "float32"):
        input_x = te.lang.dynamic.cast_to(input_x, 'float32')
        input_y = te.lang.dynamic.cast_to(input_y, 'float32')
    input_x = te.lang.dynamic.broadcast(input_x, shape_broad)
    input_y = te.lang.dynamic.broadcast(input_y, shape_broad)

    res = te.lang.dynamic.vdiv(input_x, input_y)

    # On Ascend310 the quotient is taken through float16 before floor
    # (presumably a floor-dtype limitation on that SoC — confirm).
    if src_dtype != "float16" and tbe_platform.cce_conf.get_soc_spec(
            "SOC_VERSION") == "Ascend310":
        res = te.lang.dynamic.cast_to(res, "float16")
    res = te.lang.dynamic.floor(res)
    return te.lang.dynamic.cast_to(res, src_dtype)
def greater_equal_compute(input_x, input_y, output_z,
                          kernel_name="greater_equal"):
    """Element-wise x >= y, returning 1 where true and 0 otherwise.

    Parameters
    ----------
    input_x, input_y: TVM tensors with shape, dtype and range attributes.
    output_z: dict info of the output (unused in the computation).
    kernel_name: str, default "greater_equal".

    Returns
    -------
    TVM tensor produced by _greater_equal_compare.
    """
    _, _, shape_max = broadcast_shapes(
        te.lang.dynamic.shape_to_list(input_x.shape),
        te.lang.dynamic.shape_to_list(input_y.shape),
        param_name_input1="input_x",
        param_name_input2="input_y")

    cmp_dtype = input_x.dtype
    if cmp_dtype in ("int8", "uint8"):
        # Small integer types are compared in float16.
        input_x = te.lang.dynamic.cast_to(input_x, "float16")
        input_y = te.lang.dynamic.cast_to(input_y, "float16")
        cmp_dtype = "float16"

    input_x = te.lang.dynamic.broadcast(input_x, shape_max)
    input_y = te.lang.dynamic.broadcast(input_y, shape_max)

    # Smallest positive step representable in the comparison dtype.
    if cmp_dtype == "float32":
        # minimum positive normal of float32, 2**(-126)
        data_min = tvm.const(SCALAR_MIN_FP32, dtype=cmp_dtype)
    elif cmp_dtype == "float16":
        # minimum positive value of float16, 2**(-24)
        data_min = tvm.const(SCALAR_MIN_FP16, dtype=cmp_dtype)
    else:
        # minimum step of int32: 1
        data_min = tvm.const(SCALAR_ONE, dtype=cmp_dtype)

    return _greater_equal_compare((input_x, input_y), shape_max, cmp_dtype,
                                  data_min)
def div_compute(input_x, input_y, output_z, kernel_name="div"):
    """Element-wise division, res = x / y (dynamic shape).

    Integer inputs produce floor(x / y) cast back to the source dtype.

    Parameters
    ----------
    input_x, input_y: TVM tensors, dividend and divisor.
    output_z: dict with keys(shape and dtype) of output (unused here).
    kernel_name: str, default "div".

    Returns
    -------
    TVM tensor holding the quotient.
    """
    src_dtype = input_x.dtype
    _, _, out_shape = broadcast_shapes(
        te.lang.dynamic.shape_to_list(input_x.shape),
        te.lang.dynamic.shape_to_list(input_y.shape),
        param_name_input1="input_x",
        param_name_input2="input_y")

    # Divide in float32 when the platform supports it.
    if tbe_platform.cce_conf.api_check_support("te.lang.dynamic.vdiv",
                                               "float32"):
        input_x = te.lang.dynamic.cast_to(input_x, "float32")
        input_y = te.lang.dynamic.cast_to(input_y, "float32")
    input_x = te.lang.dynamic.broadcast(input_x, out_shape)
    input_y = te.lang.dynamic.broadcast(input_y, out_shape)
    res = te.lang.dynamic.vdiv(input_x, input_y)

    if src_dtype in ("int8", "uint8", "int32"):
        # Integer division: floor the quotient, then restore the dtype.
        if tbe_platform.cce_conf.get_soc_spec("SOC_VERSION") == "Ascend310":
            res = te.lang.dynamic.cast_to(res, "float16")
        res = te.lang.dynamic.floor(res)
        res = te.lang.dynamic.cast_to(res, src_dtype)

    return res
def less_compute(input_x, input_y, output_z, kernel_name="less"):
    """Element-wise x < y, returning 1 where true and 0 otherwise.

    Parameters
    ----------
    input_x, input_y: TVM tensors, the two operands.
    output_z: dict of the output (unused in the computation).
    kernel_name: str, default "less".

    Returns
    -------
    TVM tensor produced by _less_compare.
    """
    _, _, shape_max = broadcast_shapes(
        te.lang.dynamic.shape_to_list(input_x.shape),
        te.lang.dynamic.shape_to_list(input_y.shape),
        param_name_input1="input_x",
        param_name_input2="input_y")

    cmp_dtype = input_x.dtype
    if cmp_dtype in ("uint8", "int8"):
        # Small integer types are compared in float16.
        input_x = te.lang.dynamic.cast_to(input_x, "float16")
        input_y = te.lang.dynamic.cast_to(input_y, "float16")
        cmp_dtype = "float16"

    input_x = te.lang.dynamic.broadcast(input_x, shape_max)
    input_y = te.lang.dynamic.broadcast(input_y, shape_max)

    # Smallest positive step representable in the comparison dtype.
    if cmp_dtype == "float32":
        # minimum positive normal of float32, 2**(-126)
        data_min = tvm.const(SCALAR_MIN_FP32, dtype=cmp_dtype)
    elif cmp_dtype == "float16":
        # minimum positive value of float16, 2**(-24)
        data_min = tvm.const(SCALAR_MIN_FP16, dtype=cmp_dtype)
    else:
        # minimum step of int32: 1
        data_min = tvm.const(SCALAR_ONE, dtype=cmp_dtype)

    return _less_compare((input_x, input_y), shape_max, cmp_dtype, data_min)
def masked_fill_compute(x, mask, value, y, kernel_name="masked_fill"):
    """
    calculating masked_fill: where mask == 1, replace x with value.

    :param x: TVM tensor
        the output of previous layer
    :param mask: TVM tensor
        mask dtype is bool
    :param value: scalar or TVM tensor
        the value to fill in with
    :param y: dict
        the dict of the output
    :param kernel_name: str
        kernel name, default value is "masked_fill"
    :return: y
        TVM tensor
    """
    ori_dtype = x.dtype
    # int8/int32 inputs are processed in float16 and cast back at the end.
    if x.dtype in ('int8', 'int32'):
        x = te.lang.cce.cast_to(x, 'float16')

    x_shape = te.lang.cce.util.shape_to_list(x.shape)
    mask_shape = te.lang.cce.util.shape_to_list(mask.shape)

    # compute output shape; only the broadcast target is needed
    # (fixes the misspelt, unused "mask_shpae" local and adds the input
    # names for clearer broadcast error messages).
    _, _, target_shape = op_utils.broadcast_shapes(
        x_shape, mask_shape, param_name_input1="x", param_name_input2="mask")

    target_dtype = x.dtype

    mask = te.lang.cce.cast_to(mask, x.dtype)
    value = te.lang.cce.cast_to(value, x.dtype)

    mask = te.lang.cce.broadcast(mask, target_shape)
    tensor_ones = te.lang.cce.broadcast(tvm.const(1, target_dtype),
                                        target_shape)
    value = te.lang.cce.broadcast(value, target_shape)
    x = te.lang.cce.broadcast(x, target_shape)

    # select `value` where mask == 1, otherwise keep x
    y = te.lang.cce.vcmpsel(mask, tensor_ones, 'eq', value, x)

    if y.dtype != ori_dtype:
        y = te.lang.cce.cast_to(y, ori_dtype)

    return y
def floor_mod_compute(x1, x2, y, kernel_name="floor_mod"):
    """
    Compute remainder of division:
    res = x1 - floor(input_data_x / input_data_y) * input_data_y

    Parameters
    ----------
    x1: TVM tensor
        input tensor has shape, dtype and range attributes
    x2: TVM tensor
        input tensor has shape, dtype and range attributes
    y: dict
        dict with keys(shape, dtype and range) of output
    kernel_name : str
        cce kernel name, default value is "floor_mod"

    Returns
    ------
    res: TVM tensor
        the calculation results
    """
    dtype = x1.dtype
    shape_x = te.lang.dynamic.shape_to_list(x1.shape)
    shape_y = te.lang.dynamic.shape_to_list(x2.shape)
    shape_x, shape_y, shape = broadcast_shapes(shape_x, shape_y,
                                               param_name_input1="x1",
                                               param_name_input2="x2")

    # calculate result, using float32 for better precision
    has_improve_precision = False
    input_x_fp32 = x1
    input_y_fp32 = x2
    if tbe_platform.cce_conf.api_check_support("te.lang.dynamic.vdiv",
                                               "float32"):
        input_x_fp32 = te.lang.dynamic.cast_to(x1, "float32")
        input_y_fp32 = te.lang.dynamic.cast_to(x2, "float32")
        has_improve_precision = True

    input_x_fp32 = te.lang.dynamic.broadcast(input_x_fp32, shape)
    input_y_fp32 = te.lang.dynamic.broadcast(input_y_fp32, shape)

    res = te.lang.dynamic.vdiv(input_x_fp32, input_y_fp32)

    # floor() may not support the quotient's dtype directly; fall back
    # to a float16 round-trip in that case.
    if tbe_platform.cce_conf.api_check_support("te.lang.dynamic.floor",
                                               res.dtype):
        res = te.lang.dynamic.floor(res)
    else:
        res = te.lang.dynamic.cast_to(res, "float16")
        res = te.lang.dynamic.floor(res)

    if dtype != "int32":
        # Float path: x - floor(x/y) * y, in the precision chosen above.
        if has_improve_precision:
            res = te.lang.dynamic.cast_to(res, "float32")
        else:
            res = te.lang.dynamic.cast_to(res, "float16")
        res = te.lang.dynamic.vmul(res, input_y_fp32)
        res = te.lang.dynamic.vsub(input_x_fp32, res)
        if has_improve_precision:
            res = te.lang.dynamic.cast_to(res, dtype)
    else:
        # int32 path: multiply/subtract against freshly-broadcast integer
        # inputs so the arithmetic stays in the original dtype.
        x2_broad = te.lang.dynamic.broadcast(x2, shape)
        x1_broad = te.lang.dynamic.broadcast(x1, shape)
        res = te.lang.dynamic.vmul(res, x2_broad)
        res = te.lang.dynamic.vsub(x1_broad, res)

    return res
def leaky_relu_grad_compute(g, x, y, negative_slope=0,
                            kernel_name="leaky_relu_grad"):
    """
    calculate the backpropagation of leaky_relu operation
    y = gradients(x>0) or negative_slope*gradients(x<=0).

    Parameters
    ----------
    g : TVM tensor
        the placeholder of input g
    x : TVM tensor
        the placeholder of input x
    y : dict
        dict of output y, include keys(shape and dtype)
    negative_slope : float or int
        allow non-zero slope for negative inputs to speed up optimization
    kernel_name : str
        kernel name, default value is "leaky_relu_grad"

    Returns
    -------
    res: TVM tensor
        the result of leaky_relu_grad_compute
    """
    shape_list = broadcast_shapes(te.lang.dynamic.shape_to_list(g.shape),
                                  te.lang.dynamic.shape_to_list(x.shape))
    dtype = g.dtype
    # shape_list[2] is the broadcast target shape.
    g = te.lang.dynamic.broadcast(g, shape_list[2])
    x = te.lang.dynamic.broadcast(x, shape_list[2])

    # Constants used to build a 0/1 indicator of x > 0 without a compare
    # instruction: clamp x into [0, help_min], then multiply by scale
    # factors whose product is 1 / help_min, so any positive x becomes
    # exactly 1 and non-positive x stays 0.
    # NOTE(review): a dtype other than float16/float32 leaves these names
    # unbound (NameError) — presumably the op entry validates dtype first;
    # confirm.
    if dtype == "float32":
        help_min = tvm.const(2**(-126), "float32")
        help_rec_one = tvm.const(2**38, "float32")
        help_rec_sec = tvm.const(2**44, "float32")
    elif dtype == "float16":
        help_min = tvm.const(2**(-24), "float16")
        help_rec_one = tvm.const(2**12, "float16")
        help_rec_sec = help_rec_one

    tmp_min_x = te.lang.dynamic.vmins(x, help_min)
    tmp_max_x = te.lang.dynamic.vmaxs(tmp_min_x,
                                      tvm.const(SCALAR_ZERO, "float32"))
    tmp_mul_x = te.lang.dynamic.vmuls(tmp_max_x, help_rec_one)
    if dtype == "float32":
        # float32 needs one extra scale step to reach a product of 1.
        tmp_mul_x = te.lang.dynamic.vmuls(tmp_mul_x, help_rec_sec)

    # result_tmp_right == 1 where x > 0, else 0;
    # result_tmp_left == negative_slope where x <= 0, else 0.
    result_tmp_right = te.lang.dynamic.vmuls(tmp_mul_x, help_rec_sec)
    result_sub = te.lang.dynamic.vadds(result_tmp_right,
                                       tvm.const(NEGATIVE_ONE, "float32"))
    result_abs = te.lang.dynamic.vabs(result_sub)
    result_tmp_left = te.lang.dynamic.vmuls(result_abs, negative_slope)

    result_tmp = te.lang.dynamic.vadd(result_tmp_left, result_tmp_right)

    res = te.lang.dynamic.vmul(g, result_tmp)

    return res
def dequantize(x, min_range, max_range, y, mode="MIN_COMBINED",
               kernel_name="dequantize"):
    """
    Dequantize the 'input' tensor into a float tensor.

    [min_range, max_range] are scalar floats that specify the range
    for the 'input' data.
    The 'mode' attribute controls exactly which calculations are used
    to convert the float to their quantized equivalents.

    In 'MIN_COMBINED' mode, each value of the tensor will undergo the
    following:
    ```
    if T == int8 or T == int32: in[i] += (range(T) + 1) / 2.0
    out[i] = min_range + (in[i] * (max_range - min_range) / range(T))
    ```
    here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`

    Note that if quantizedtype is int8, the operation will additionally add
    each value by 128 prior to casting.

    If the mode is 'MIN_FIRST', then this approach is used:
    ```
    num_discrete_values = 1 << (# of bits in T)
    range_adjust = num_discrete_values / (num_discrete_values - 1)
    range = (range_max - range_min) * range_adjust
    range_scale = range / num_discrete_values
    if T == int32:
        result = range_min + ((input - numeric_limits<T>::min()) *
                              range_scale)
    else if T == int8 or T == uint8:
        least_quantitize = -round(min_range * ((1 << num_bits) - 1) /
                                  (max_range - min_range))
        offset = min_range + least_quantitize * 1.0 *
                 (max_range - min_range) / ((1 << num_bits) - 1)
        res_tmp = range_min + ((input - numeric_limits<T>::min()) *
                               range_scale)
        result = res_tmp - offset
    ```

    In `SCALED` mode,
    ```
    m = input_max
    num_bits = sizeof(T) * 8
    if T == int8 or T == int32:
        [min_fixed, max_fixed] = [-(1 << (num_bits - 1) - 1),
                                  (1 << (num_bits - 1)) - 1]
        s = (2.0 * m) / (max_fixed - min_fixed)
    if T == uint8:
        [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
        s = 1.0 * m / (max_fixed - min_fixed)
    result = input * s
    ```

    Parameters:
    ----------
    x: the dict of x, dtype must be one of the following:
        cloud version only supports `int8`, `uint8`, `int32`,
        mini version only supports `int8`, `uint8`.
    min_range: the dict of input_min_range, dtype must be `float32`.
        The minimum scalar value possibly produced for the input.
    max_range: the dict of input_max_range, dtype must be `float32`.
        The maximum scalar value possibly produced for the input.
    y: the dict of output_data, dtype must be `float32`.
    mode: An optional `string` from: `"MIN_COMBINED", "MIN_FIRST", "SCALED"`.
        Defaults to `"MIN_COMBINED"`.
    kernel_name : cce kernel name, default value is "dequantize"

    Returns
    -------
    None
    """
    shape_x = x.get("shape")
    shape_input_min_range = min_range.get("shape")
    shape_input_max_range = max_range.get("shape")
    shape_output_data = y.get("shape")

    # min_range and max_range must agree in rank; the output must match
    # the input's shape exactly.
    if len(shape_input_min_range) != len(shape_input_max_range):
        raise RuntimeError("shape_input_min_range and shape_input_max_range"
                           " must be equal")
    if shape_output_data != shape_x:
        raise RuntimeError("shape_output_data and shape_x must be equal.")

    shape_range = shape_input_min_range
    op_utils.check_shape(shape_x, param_name="x")
    op_utils.check_shape(shape_range, param_name="min_range")

    dtype_x = x.get("dtype")
    dtype_input_min_range = min_range.get("dtype")
    dtype_input_max_range = max_range.get("dtype")
    dtype_output_data = y.get("dtype")
    dtype_x = dtype_x.lower()
    dtype_input_min_range = dtype_input_min_range.lower()
    dtype_input_max_range = dtype_input_max_range.lower()
    dtype_output_data = dtype_output_data.lower()

    check_list = ("int8", "uint8", "int32")
    # int32 input requires the platform to support the s32 -> f32 cast.
    s322f32_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.cast_to", "s322f32")
    if dtype_x == "int32" and not s322f32_support:
        raise RuntimeError("not support on the platform")

    # float32 vmul is mandatory for the dequantize arithmetic.
    vmul_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.vmul", "float32")
    if not vmul_support:
        raise RuntimeError("not support on the platform")

    op_utils.check_dtype(dtype_x, check_list, param_name="x")
    op_utils.check_dtype(dtype_input_min_range, ("float32", ),
                         param_name="min_range")
    op_utils.check_dtype(dtype_input_max_range, ("float32", ),
                         param_name="max_range")
    op_utils.check_dtype(dtype_output_data, ("float32", ), param_name="y")

    if mode not in ("MIN_COMBINED", "MIN_FIRST", "SCALED"):
        raise RuntimeError(
            "mode only support MIN_COMBINED, MIN_FIRST, SCALED.")

    shape_x, shape_range, _ = op_utils.broadcast_shapes(
        shape_x,
        shape_range,
        param_name_input1="x",
        param_name_input2="min_range")
    shape_x, shape_range = op_utils.refine_shapes_for_broadcast(
        shape_x, shape_range)

    input_tensor = tvm.placeholder(shape_x, dtype=dtype_x, name="x")
    min_range = tvm.placeholder(shape_range,
                                dtype="float32",
                                name="input_min_range")
    max_range = tvm.placeholder(shape_range,
                                dtype="float32",
                                name="input_max_range")

    res = dequantize_compute(input_tensor, min_range, max_range, y, mode,
                             kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": [input_tensor, min_range, max_range, res],
        "dummy_placeholder": True
    }
    te.lang.cce.cce_build_code(sch, config)
def addcmul(input_data, x1, x2, y, value=1.0, kernel_name="addcmul"):
    """
    algorithm: addcmul
    calculating data's addcmul, y = input_data + value * (x1 * x2)

    Parameters
    ----------
    input_data : dict
        shape and dtype of first input, only support float16, float32,
        int32, int8, uint8
    x1 : dict
        shape and dtype of second input, same supported dtypes as input_data
    x2 : dict
        shape and dtype of third input, same supported dtypes as input_data
    y: dict
        shape and dtype of output, should be broadcast shape and type as input
    value: float
        scaling coefficient, default value is 1.0
    kernel_name : str
        cce kernel name, default value is addcmul

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if the three input dtypes are not identical, or any shape/dtype
        check in util fails.
    """
    shape_input = input_data.get("shape")
    shape_x1 = x1.get("shape")
    shape_x2 = x2.get("shape")
    dtype_input = input_data.get("dtype").lower()
    dtype_x1 = x1.get("dtype").lower()
    dtype_x2 = x2.get("dtype").lower()

    util.check_kernel_name(kernel_name)

    # Same rule/size/dtype validation for all three inputs — deduplicated
    # from three copy-pasted check sequences.
    check_list = ("float16", "float32", "int32", "int8", "uint8")
    for shape, dtype in ((shape_input, dtype_input),
                        (shape_x1, dtype_x1),
                        (shape_x2, dtype_x2)):
        util.check_shape_rule(shape)
        util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
        util.check_dtype_rule(dtype, check_list)

    if dtype_input != dtype_x1 or dtype_input != dtype_x2:
        raise RuntimeError("the type of input_data, x1, x2 must be same")

    # Broadcast x1 against x2 first, then the result against input_data:
    # shape_max is the final output shape. Re-broadcast x1/x2 against
    # shape_max so every placeholder shape has the same rank.
    shape_x1, shape_x2, shape_max1 = broadcast_shapes(shape_x1, shape_x2)
    util.check_tensor_shape_size(shape_max1)
    shape_input, _, shape_max = broadcast_shapes(shape_input, shape_max1)
    util.check_tensor_shape_size(shape_max)
    shape_x1, _, _ = broadcast_shapes(shape_x1, shape_max)
    shape_x2, _, _ = broadcast_shapes(shape_x2, shape_max)

    data_input = tvm.placeholder(shape_input, dtype=dtype_input,
                                 name="data_input")
    data_x1 = tvm.placeholder(shape_x1, dtype=dtype_x1, name="data_x1")
    data_x2 = tvm.placeholder(shape_x2, dtype=dtype_x2, name="data_x2")

    # Fix: forward the caller-supplied kernel_name (it was hard-coded to
    # the default "addcmul" while config below already used kernel_name).
    res = addcmul_compute(data_input, data_x1, data_x2, shape_max, y, value,
                          kernel_name=kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    tensor_list = [data_input, data_x1, data_x2, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(schedule, config)
def less_equal_compute(input_x, input_y, output_z, kernel_name="less_equal"):
    """
    compute for less_equal

    Elementwise x <= y, emitted as an int8 tensor of 0/1. The comparison is
    built from max/sub/abs/min/mul primitives instead of a native compare:
    |y - max(x, y)| is 0 exactly when x <= y, and that zero/non-zero flag is
    scaled up to exactly 1 via the SCALAR_* constants.

    Parameters
    ----------
    input_x: TVM tensor
        the placeholder of input_x
    input_y: TVM tensor
        the placeholder of input_y
    output_z: dict
        dict info of output_z
    kernel_name: str
        cce kernel name, default value is "less_equal"

    Returns
    -------
    res: TVM tensor
        the result of compute (int8, 1 where x <= y, else 0)
    """
    dtype_x = input_x.dtype
    shape_x = te.lang.dynamic.shape_to_list(input_x.shape)
    shape_y = te.lang.dynamic.shape_to_list(input_y.shape)
    # Compute the common broadcast shape for the two inputs.
    shape_x, shape_y, shape_broadcast = broadcast_shapes(
        shape_x, shape_y, param_name_input1="input_x",
        param_name_input2="input_y")

    # Scaling constants are declared elsewhere in this file.
    # NOTE(review): presumably scalar_min is the smallest positive normal of
    # the dtype and the scalar_mul factors multiply it back up to exactly
    # 1.0 (fp32 needs an extra factor, hence scalar_mul1) — confirm against
    # the SCALAR_* definitions.
    if dtype_x == "float32":
        scalar_min = tvm.const(SCALAR_MIN_FP32, dtype="float32")
        scalar_mul = tvm.const(SCALAR_MUL_FP32, dtype="float32")
        scalar_mul1 = tvm.const(SCALAR_MUL2_FP32, dtype="float32")
        scalar_neg_one = tvm.const(SCALAR_NEG_ONE, dtype="float32")
    else:
        scalar_min = tvm.const(SCALAR_MIN_FP16, dtype="float16")
        scalar_mul = tvm.const(SCALAR_MUL_FP16, dtype="float16")
        scalar_neg_one = tvm.const(SCALAR_NEG_ONE, dtype="float16")

    # int8/uint8 inputs are compared in float16.
    if dtype_x in ("int8", "uint8"):
        input_x = te.lang.dynamic.cast_to(input_x, "float16")
        input_y = te.lang.dynamic.cast_to(input_y, "float16")

    input_x = te.lang.dynamic.broadcast(input_x, shape_broadcast)
    input_y = te.lang.dynamic.broadcast(input_y, shape_broadcast)

    # y - max(x, y): 0 when x <= y, strictly negative otherwise.
    res_max = te.lang.dynamic.vmax(input_x, input_y)
    res_vsub = te.lang.dynamic.vsub(input_y, res_max)
    # vabs may not support the current dtype; fall back to float32.
    if tbe_platform.cce_conf.api_check_support("te.lang.dynamic.vabs",
                                               res_vsub.dtype):
        res_vabs = te.lang.dynamic.vabs(res_vsub)
    else:
        res_vsub = te.lang.dynamic.cast_to(res_vsub, "float32")
        res_vabs = te.lang.dynamic.vabs(res_vsub)

    # Clamp to {0, scalar_min}, then scale: 0 stays 0, scalar_min becomes 1.
    res_min = te.lang.dynamic.vmins(res_vabs, scalar_min)
    res_vmul = te.lang.dynamic.vmuls(res_min, scalar_mul)
    res_vmul1 = te.lang.dynamic.vmuls(res_vmul, scalar_mul)
    if dtype_x == "float32":
        # fp32 needs one more multiply to reach exactly 1.0.
        res_vmul2 = te.lang.dynamic.vmuls(res_vmul1, scalar_mul1)
        # Map {0, 1} -> {1, 0}: x <= y yields |0 - 1| = 1.
        res_vsub1 = te.lang.dynamic.vadds(res_vmul2, scalar_neg_one)
        res_vabs1 = te.lang.dynamic.vabs(res_vsub1)
    else:
        res_vsub1 = te.lang.dynamic.vadds(res_vmul1, scalar_neg_one)
        res_vabs1 = te.lang.dynamic.vabs(res_vsub1)
    # Final 0/1 mask as int8.
    res = te.lang.dynamic.cast_to(res_vabs1, "int8", True)

    return res
def masked_fill(x, mask, value, y, kernel_name="masked_fill"):
    """
    Fill elements of tensor x with `value` where `mask` is true.

    :param x: dict
               shape and dtype of tensor x input
    :param mask: dict
                shape and dtype of tensor mask, can be broadcast to the
                same shape as x
    :param value: dict
                 shape and dtype of value
    :param y: dict
             the output of masked_fill
    :param kernel_name: str
                      kernel name, default value is "masked_fill"
    :return: none
    """
    x_shape = x.get("shape")
    x_dtype = x.get("dtype")
    x_dtype_lower = x_dtype.lower()
    mask_shape = mask.get("shape")
    mask_dtype = mask.get("dtype")
    value_shape = value.get("shape")
    value_dtype = value.get("dtype")
    value_dtype_lower = value_dtype.lower()

    # check dtype
    # NOTE(review): the raw (non-lowered) dtype strings are validated here,
    # so an upper-case dtype like "Float16" would be rejected even though
    # the lowered form is used for the placeholders — confirm intended.
    x_dtype_list = ("float16", "float32", "int8", "int32")
    op_utils.check_dtype(x_dtype, x_dtype_list)
    mask_dtype_list = ("bool", "int8")
    op_utils.check_dtype(mask_dtype, mask_dtype_list)

    # bool masks are handled as int8 at placeholder level.
    if mask_dtype == "bool":
        mask_dtype = "int8"

    value_dtype_list = ("float16", "float32", "int8", "int32")
    op_utils.check_dtype(value_dtype, value_dtype_list)

    # check shape
    op_utils.check_shape(x_shape)
    op_utils.check_shape(mask_shape)
    op_utils.check_shape(value_shape)

    # check broadcast shape
    x_shape, mask_shape, out_shape = op_utils.broadcast_shapes(
        x_shape, mask_shape)
    op_utils.check_shape(out_shape)

    # check kernel_name
    util.check_kernel_name(kernel_name)

    # Left-pad mask_shape with 1s up to x's rank.
    # NOTE(review): presumably broadcast_shapes already pads both shapes to
    # equal rank, which would make this padding a no-op — verify against
    # op_utils.broadcast_shapes.
    pos_mask_shape = tuple(
        [1] * (len(x_shape) - len(mask_shape))) + tuple(mask_shape)

    data_x = tvm.placeholder(x_shape, dtype=x_dtype_lower, name="data_x")
    data_mask = tvm.placeholder(pos_mask_shape,
                                dtype=mask_dtype,
                                name="data_mask")
    # NOTE(review): the value placeholder is built with pos_mask_shape, not
    # value_shape (which was only shape-checked above) — confirm
    # masked_fill_compute expects value broadcast to the mask's shape.
    data_value = tvm.placeholder(pos_mask_shape,
                                 dtype=value_dtype_lower,
                                 name="data_value")

    # The `y` output-dict parameter is rebound here to the compute result.
    y = masked_fill_compute(data_x, data_mask, data_value, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(y)

    config = {
        "name": kernel_name,
        "tensor_list": [data_x, data_mask, data_value, y],
    }

    te.lang.cce.cce_build_code(schedule, config)