def sin(x, y, kernel_name="sin"):
    """
    algorithm: sin
    calculating data's sin by Taylor expansion:
    sin(x) = x - x^3/3! + x^5/5! + ... + (-1)^k*x^(2k+1)/(2k+1)!

    Parameters
    ----------
    x : dict
        shape and dtype of input, only support float16, float32
    y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is "sin"

    Returns
    -------
    None
    """
    shape_input = x.get("shape")
    dtype_input = x.get("dtype").lower()

    check_shape(shape_input, param_name="x")
    check_list = (FLOAT_16, FLOAT_32)
    check_dtype(dtype_input, check_list, param_name="x")

    # fuse all axes into one before building the placeholder
    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape_input)
    data_input = tvm.placeholder(fuseshape, name="data_input",
                                 dtype=dtype_input)
    res = sin_compute(data_input, y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": (data_input, res)}
    te.lang.cce.cce_build_code(sch, config)

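# A minimal NumPy sketch of the Taylor expansion quoted in the docstring
# above (illustrative only; this is not the TBE sin_compute kernel, and
# _sin_taylor_reference is a hypothetical helper name):
import numpy as np

def _sin_taylor_reference(x, terms=8):
    """sum_{k=0}^{terms-1} (-1)^k * x^(2k+1) / (2k+1)!"""
    x = np.asarray(x, dtype=np.float64)
    result = np.zeros_like(x)
    term = x.copy()  # k = 0 term is x / 1!
    for k in range(terms):
        result += term
        # next odd-power term: multiply by -x^2 / ((2k+2)*(2k+3))
        term = term * -(x * x) / ((2 * k + 2) * (2 * k + 3))
    return result

# e.g. np.allclose(_sin_taylor_reference(np.linspace(-1, 1, 5)),
#                  np.sin(np.linspace(-1, 1, 5)))  -> True
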
def check_supported(input_x, output_y, dst_type, kernel_name="cast"):
    """
    verify the types of cast supported by tbe
    """
    src_type = input_x.get("dtype").lower()
    check_result = False
    if src_type == "bool":
        src_type = "int8"
    dst_type = _cast_dsttype_conversion(dst_type)

    check_list = []
    if src_type == "float16":
        check_list = ["float32", "int32", "uint8"]
    elif src_type == "float32":
        check_list = ["float16", "int32"]
    elif src_type == "int8":
        check_list = ["float32", "float16", "int32", "uint8"]
    elif src_type == "uint8":
        check_list = ["float32", "float16", "int32"]
    elif src_type == "int32":
        check_list = ["bool", "uint8", "int8", "float32", "float16"]

    src_shape = input_x.get("shape")
    shape_size = reduceIns(lambda x, y: x * y, src_shape)
    if shape_size == 1 and src_type == "int64":
        check_list = ["int32", "float32"]

    if dst_type in check_list:
        check_result = True
    return check_result

def log(input_x, output_y, base=-1.0, scale=1.0, shift=0.0,
        kernel_name="log"):
    """
    calculating data's log

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    output_y : dict
        shape and dtype of output, should be same shape and type as input
    base : float
        base of the log, default value is -1.0 (which means base e)
    scale : float
        scale applied to the input, default value is 1.0
    shift : float
        shift applied to the input, default value is 0.0
    kernel_name : str
        kernel name, default value is "log"

    Returns
    -------
    None
    """
    shape = input_x.get("shape")
    dtype = input_x.get("dtype")
    input_dtype = dtype.lower()

    # input_x's shape check
    op_utils.check_shape(shape, param_name="input_x")

    # input_x's dtype check, only supports fp16 and fp32
    check_list = ("float16", "float32")
    op_utils.check_dtype(input_dtype, check_list, param_name="input_x")

    # base must be strictly positive or the sentinel value -1 (base e)
    if base <= 0 and (not isclose(base, -1.0)):
        error_info = {}
        error_info['errCode'] = 'E80000'
        error_info['param_name'] = 'base'
        error_info['op_name'] = 'log'
        error_info['expect_value'] = "strictly positive or -1"
        error_info['real_value'] = base
        raise RuntimeError("In op[%s], the parameter[%s] should be [%s],"
                           " but actually is [%s]."
                           % (error_info['op_name'],
                              error_info['param_name'],
                              error_info['expect_value'],
                              error_info['real_value']))

    fused_shape = [reduceIns(lambda x, y: x * y, shape[:])]
    data_input = tvm.placeholder(fused_shape, name="data_input",
                                 dtype=input_dtype)

    res = log_compute(data_input, output_y, base, scale, shift, kernel_name)

    # auto schedule
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    # operator build
    config = {"name": kernel_name,
              "need_build": True,
              "tensor_list": (data_input, res)}
    te.lang.cce.cce_build_code(sch, config)

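# A NumPy sketch of the Caffe-style Log layer semantics this op appears to
# mirror (an assumption inferred from the base/scale/shift parameters; this
# is not the actual log_compute implementation):
import numpy as np

def _log_reference(x, base=-1.0, scale=1.0, shift=0.0):
    inner = shift + scale * np.asarray(x, dtype=np.float64)
    if base == -1.0:
        return np.log(inner)              # natural log when base is -1
    return np.log(inner) / np.log(base)   # log_base(inner) otherwise
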
def gelu_grad(input_dy, input_x, input_y, output_z,
              kernel_name="gelu_grad"):
    """
    algorithm: gelu_grad
    calculating: dy*res'
    res' = res/x +
           x*0.5*(1 - tanh(math_four)*tanh(math_four))*
           np.sqrt(2 / np.pi)*(1 + 3*0.044715*x^2)
    math_four = (np.sqrt(2 / np.pi)*(x + 0.044715*tf.pow(x, 3)))

    Parameters
    ----------
    input_dy : dict
        shape and dtype of dy input, only support float16, float32
    input_x : dict
        shape and dtype of x input, only support float16, float32
    input_y : dict
        shape and dtype of y input, only support float16, float32
    output_z: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is gelu_grad

    Returns
    -------
    None
    """
    shape_dy = input_dy.get("shape")
    shape_x = input_x.get("shape")
    shape_y = input_y.get("shape")

    check_shape(shape_dy, param_name="input_dy")
    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")

    input_dtype = input_dy.get("dtype").lower()
    check_list = ("float16", "float32")
    check_dtype(input_dtype, check_list, param_name="input_dy")

    shape_dy = list(shape_dy)
    shape_x = list(shape_x)
    shape_y = list(shape_y)
    if not (operator.eq(shape_dy, shape_x)
            and operator.eq(shape_dy, shape_y)):
        raise RuntimeError("all input shapes must be equal")

    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape_dy)
    data_dy = tvm.placeholder(fuseshape, name="data_dy", dtype=input_dtype)
    data_x = tvm.placeholder(fuseshape, name="data_x", dtype=input_dtype)
    data_gelu = tvm.placeholder(fuseshape, name="data_gelu",
                                dtype=input_dtype)
    res = gelu_grad_compute(data_dy, data_x, data_gelu, output_z,
                            kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data_dy, data_x, data_gelu, res]}
    te.lang.cce.cce_build_code(sch, config)

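# A NumPy transcription of the docstring formula above (illustrative only;
# not the fused TBE gelu_grad_compute kernel, and _gelu_grad_reference is a
# hypothetical helper name):
import numpy as np

def _gelu_grad_reference(dy, x, res):
    """dy * res', with res = gelu(x) passed in as the third input."""
    math_four = np.sqrt(2.0 / np.pi) * (x + 0.044715 * x ** 3)
    tanh_val = np.tanh(math_four)
    # note: res/x is singular at x == 0, mirroring the docstring formula
    res_grad = res / x + x * 0.5 * (1.0 - tanh_val * tanh_val) \
        * np.sqrt(2.0 / np.pi) * (1.0 + 3.0 * 0.044715 * x ** 2)
    return dy * res_grad
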
def elu(x, y, alpha=1.0, kernel_name="elu"):
    """
    do element-wise elu operation

    Parameters:
    ----------
    x : the dict of input, only support float16, float32
    y : the dict of output
    alpha : float, coefficient when input tensor is less than zero
    kernel_name : cce kernel name, default value is "elu"

    Returns
    -------
    None
    """
    shape_input = x.get("shape")
    dtype_input = x.get("dtype")
    input_dtype = dtype_input.lower()

    check_shape(shape_input, param_name="x")
    check_list = ("float16", "float32")
    check_dtype(input_dtype, check_list, param_name="x")

    # float32 is only allowed when the platform supports it
    if not tbe_platform.cce_conf.api_check_support("te.lang.cce.sum",
                                                   "float32") \
            and input_dtype == "float32":
        error_info = {}
        error_info['errCode'] = 'E80008'
        error_info['param_name'] = 'x'
        error_info['op_name'] = 'elu'
        error_info['expect_value'] = "float16"
        error_info['real_value'] = input_dtype
        raise RuntimeError(error_info,
                           "In op[%s], the parameter[%s]'s dtype "
                           "should be [%s], but actually is [%s]."
                           % (error_info['op_name'],
                              error_info['param_name'],
                              error_info['expect_value'],
                              error_info['real_value']))

    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape_input)
    data_input = tvm.placeholder(fuseshape, name="data_input",
                                 dtype=input_dtype)
    res = elu_compute(data_input, y, alpha, kernel_name)

    with tvm.target.cce():
        auto_sch = topi.generic.auto_schedule(res)

    config = {"name": kernel_name,
              "print_ir": False,
              "tensor_list": [data_input, res],
              "bool_storage_as_1bit": False}
    te.lang.cce.cce_build_code(auto_sch, config)

def exp(input_x, output_y, base=-1.0, scale=1.0, shift=0.0,
        kernel_name="exp"):
    """
    algorithm: exp
    calculating data's exp
    if base == -1:
        y = exp(shift + scale * x)
    if base > 0:
        y = exp((shift + scale * x) * ln(base))

    Parameters
    ----------
    input_x : dict, shape and dtype of input, only support float16, float32
    output_y : dict, shape and dtype of output, should be same shape and
        type as input
    base : float, optional (default -1, which stands for base e), the base gamma
    scale : float, optional (default 1), the scale alpha
    shift : float, optional (default 0), the shift beta
    kernel_name : str, kernel name, default value is "exp"

    Returns
    -------
    None
    """
    shape = input_x.get("shape")
    dtype = input_x.get("dtype")
    check_shape(shape, param_name="input_x")

    # input_x's dtype check, only supports fp16 and fp32
    check_list = ("float16", "float32")
    input_dtype = dtype.lower()
    check_dtype(input_dtype, check_list, param_name="input_x")

    # base must be strictly positive or the sentinel value -1 (base e)
    if base <= 0 and (not isclose(base, -1.0)):
        error_info = {}
        error_info['errCode'] = 'E80000'
        error_info['param_name'] = 'base'
        error_info['op_name'] = 'exp'
        error_info['expect_value'] = "strictly positive or -1"
        error_info['real_value'] = base
        raise RuntimeError(
            "In op[%s], the parameter[%s] should be [%s],"
            " but actually is [%s]."
            % (error_info['op_name'], error_info['param_name'],
               error_info['expect_value'], error_info['real_value']))

    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)
    data_input = tvm.placeholder(fuseshape, name="data_input",
                                 dtype=input_dtype)
    res = exp_compute(data_input, output_y, base, scale, shift, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_input, res]}
    te.lang.cce.cce_build_code(sch, config)

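# A NumPy sketch of the two docstring cases (illustrative only; not the TBE
# exp_compute kernel, and _exp_reference is a hypothetical helper name):
import numpy as np

def _exp_reference(x, base=-1.0, scale=1.0, shift=0.0):
    inner = shift + scale * np.asarray(x, dtype=np.float64)
    if base == -1.0:
        return np.exp(inner)               # y = exp(shift + scale*x)
    return np.exp(inner * np.log(base))    # y = base**(shift + scale*x)
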
def bnll(input_x, output_y, kernel_name="bnll"):
    """
    calculating data
    algorithm: y = x + log(1 + exp(-x)) if x > 0;
               y = log(1 + exp(x)) otherwise

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    output_y : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "bnll"

    Returns
    -------
    None
    """
    shape = input_x.get("shape")
    dtype = input_x.get("dtype")
    input_dtype = dtype.lower()

    check_shape(shape, param_name="input_x")
    check_list = ("float16", "float32")
    check_dtype(input_dtype, check_list, param_name="input_x")

    # float32 is not supported on these SoC versions
    product = tbe_platform.cce_conf.get_soc_spec("SOC_VERSION")
    if product in ["Ascend310", "Hi3796CV300ES", "Hi3796CV300CS"] \
            and input_dtype == "float32":
        error_info = {}
        error_info['errCode'] = 'E80008'
        error_info['param_name'] = 'input_x'
        error_info['op_name'] = 'bnll'
        error_info['expect_value'] = "float16"
        error_info['real_value'] = input_dtype
        raise RuntimeError(error_info,
                           "In op[%s], the parameter[%s]'s dtype "
                           "should be [%s], but actually is [%s]."
                           % (error_info['op_name'],
                              error_info['param_name'],
                              error_info['expect_value'],
                              error_info['real_value']))

    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)
    data_input = tvm.placeholder(fuseshape, name="data_input",
                                 dtype=input_dtype)
    res = _bnll_computer(data_input, product)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "print_ir": False,
              "bool_storage_as_1bit": False,
              "tensor_list": [data_input, res]}
    te.lang.cce.cce_build_code(schedule, config)

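# A NumPy sketch of the piecewise docstring formula; both branches equal
# log(1 + exp(x)), split this way to avoid overflow for large |x|
# (illustrative only; not the TBE _bnll_computer kernel):
import numpy as np

def _bnll_reference(x):
    x = np.asarray(x, dtype=np.float64)
    # np.where evaluates both branches, so a runtime warning may appear
    # for extreme inputs even though the selected value is correct
    return np.where(x > 0,
                    x + np.log1p(np.exp(-x)),   # x > 0 branch
                    np.log1p(np.exp(x)))        # x <= 0 branch
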
def exp(input_x, output_y, base=-1.0, scale=1.0, shift=0.0,
        kernel_name="exp"):
    """
    algorithm: exp
    calculating data's exp
    if base == -1:
        y = exp(shift + scale * x)
    if base > 0:
        y = exp((shift + scale * x) * ln(base))

    Parameters
    ----------
    input_x : dict, shape and dtype of input, only support float16, float32
    output_y : dict, shape and dtype of output, should be same shape and
        type as input
    base : float, optional (default -1, which stands for base e), the base gamma
    scale : float, optional (default 1), the scale alpha
    shift : float, optional (default 0), the shift beta
    kernel_name : str, kernel name, default value is "exp"

    Returns
    -------
    None
    """
    dtype = input_x.get("dtype")

    # input_x's dtype check, only supports fp16 and fp32
    check_list = ("float16", "float32")
    input_dtype = dtype.lower()
    check_dtype(input_dtype, check_list, param_name="input_x")

    # base must be strictly positive or the sentinel value -1 (base e)
    if base <= 0 and (not isclose(base, -1.0)):
        expect_value = "strictly positive or -1"
        real_value = "base < 0 or base not equal to -1"
        error_manager_vector.raise_err_input_value_invalid(
            kernel_name, "base", expect_value, real_value)

    ins = classify([input_x], Mode.ELEWISE)
    schedules, tensors = [], []
    for (input_x,) in ins:
        with te.op.compute():
            shape_x = variable_shape([input_x])
            fuseshape = [1]
            fuseshape[0] = reduceIns(lambda x, y: x * y, shape_x[0])
            data_input = tvm.placeholder(fuseshape, name="data_input",
                                         dtype=input_dtype)
            res = exp_compute(data_input, output_y, base, scale, shift,
                              kernel_name)
            tensors.append([data_input, res])
        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        schedules.append(sch)

    config = {"name": kernel_name, "tensor_list": tensors}
    te.lang.dynamic.build(schedules, config)

def relu(x, y, kernel_name="relu"):
    """
    algorithm: relu
    relu(x) = max(x, 0)

    Parameters
    ----------
    x : dict
        dynamic input, include shape, dtype and range
    y : dict
        the dict of output
    kernel_name : str
        kernel name, must be string, default value is "relu"

    Returns
    -------
    None
    """
    # check input tensor data type
    dtype_x = x.get("dtype").lower()
    check_list = ("float16", "float32", "int8", "int32")
    check_dtype(dtype_x, check_list, param_name="x")

    ins = classify([x], Mode.ELEWISE)
    schedules, tensors = [], []
    for (x, ) in ins:
        with te.op.compute():
            shape_x = variable_shape([x])
            fuse_shape = [1]
            fuse_shape[0] = reduceIns(lambda x, y: x * y, shape_x[0])
            input_data = tvm.placeholder(fuse_shape, name="input_data",
                                         dtype=dtype_x)
            res = relu_compute(input_data, y, kernel_name)
            tensors.append([input_data, res])
        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        schedules.append(sch)

    config = {"name": kernel_name, "tensor_list": tensors}
    te.lang.dynamic.build(schedules, config)

def sqrt(input_x, output_y, kernel_name="sqrt"):
    """
    algorithm: sqrt
    calculating data sqrt, y = x**0.5; mini does not support vsqrt,
    so exp(0.5*log(x)) is used instead

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32
    output_y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is sqrt

    Returns
    -------
    None
    """
    # check dtype
    x_dtype = input_x.get("dtype").lower()
    check_list = ("float16", "float32")
    check_dtype(x_dtype, check_list, param_name="input_x")

    ins = classify([input_x], Mode.ELEWISE)
    schedules, tensors = [], []
    for (input_x, ) in ins:
        with te.op.compute():
            # shape
            x_shape = variable_shape([input_x])
            fuseshape = [1]
            fuseshape[0] = reduceIns(lambda x, y: x * y, x_shape[0])
            # sqrt_compute
            input_data = tvm.placeholder(fuseshape, name="input_data",
                                         dtype=x_dtype)
            res = sqrt_compute(input_data, output_y, kernel_name)
            tensors.append([input_data, res])
        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        schedules.append(sch)

    # build
    config = {"name": kernel_name, "tensor_list": tensors}
    te.lang.dynamic.build(schedules, config)

def leaky_relu(x, y, negative_slope=0, kernel_name="leaky_relu"):
    """leaky_relu op for input tensor

    f(x) = x                  (x >= 0)
    f(x) = negative_slope * x (x < 0)

    Parameters
    ----------
    x : dict
        dict with keys(shape and dtype) of input
    y : dict
        dict with keys(shape and dtype) of output
    negative_slope : float or int
        allow non-zero slope for negative inputs to speed up optimization
    kernel_name : str
        cce kernel name, default value is "leaky_relu"

    Returns
    ------
    None
    """
    # check input tensor shape
    shape = x.get("shape")
    dtype = x.get("dtype")
    check_shape(shape, param_name="x")

    # check input tensor data type
    check_list = ["float16", "float32", "int32", "int8"]
    check_dtype(dtype.lower(), check_list, param_name="x")

    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)

    inp_dtype = dtype.lower()
    input_data_x = tvm.placeholder(fuseshape, name="input_data_x",
                                   dtype=inp_dtype)

    with tvm.target.cce():
        res = leaky_relu_compute(input_data_x, y, negative_slope,
                                 kernel_name)
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [input_data_x, res]}
    te.lang.cce.cce_build_code(sch, config)

def threshold(input_x, output_y, threshold=0.0, kernel_name="threshold"):
    """
    algorithm: threshold
    compare data with threshold: y = x > threshold ? 1 : 0

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32
    output_y: dict
        shape and dtype of output, should be broadcast shape and type
        as input
    threshold: scalar
        parameter of the operator
    kernel_name : str
        kernel name, default value is "threshold"

    Returns
    -------
    None
    """
    # check shape
    shape = input_x.get("shape")
    op_utils.check_shape(shape, param_name="input_x")

    # check data type
    input_data_type = input_x.get("dtype").lower()
    op_utils.check_dtype(input_data_type, ["float16", "float32"],
                         param_name="input_x")

    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)
    data_x = tvm.placeholder(fuseshape, name="data_x",
                             dtype=input_data_type)

    res = threshold_compute(data_x, threshold, output_y, kernel_name)

    with tvm.target.cce():
        schedule = cce.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "need_build": False,
              "tensor_list": (data_x, res)}
    te.lang.cce.cce_build_code(schedule, config)

def tanh_grad(y, dy, z, kernel_name="tanh_grad"):
    """
    do element-wise tanh_grad operation between two input tensors

    Parameters
    ----------
    y : dict
        shape and dtype of y input, only support float16, float32
    dy : dict
        shape and dtype of dy input, only support float16, float32
    z: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is tanh_grad

    Returns
    -------
    None
    """
    shape_y = y.get("shape")
    shape_dy = dy.get("shape")
    check_shape(shape_y, param_name="y")
    check_shape(shape_dy, param_name="dy")

    check_list = ("float16", "float32")
    dtype = y.get("dtype").lower()
    check_dtype(dtype, check_list, param_name="y")

    if list(shape_y) != list(shape_dy):
        raise RuntimeError("tanh_grad only supports inputs "
                           "with the same shape")

    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape_y)
    data_y = tvm.placeholder(fuseshape, dtype=dtype, name="data1")
    data_dy = tvm.placeholder(fuseshape, dtype=dtype, name="data2")

    res = tanh_grad_compute(data_y, data_dy, z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data_y, data_dy, res]}
    te.lang.cce.cce_build_code(sch, config)

def square(input_x, output, kernel_name="square"):
    """
    algorithm: square
    calculating data's square, y = x*x

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32, int32
    output : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "square"

    Returns
    -------
    None
    """
    # check dtype
    x_dtype = input_x.get("dtype").lower()
    check_list = ("float16", "float32", "int32")
    check_dtype(x_dtype, check_list, param_name="input_x")

    ins = classify([input_x], Mode.ELEWISE)
    schedules, tensors = [], []
    for (input_x, ) in ins:
        with te.op.compute():
            # shape
            x_shape = variable_shape([input_x])
            fuseshape = [1]
            fuseshape[0] = reduceIns(lambda x, y: x * y, x_shape[0])
            # square_compute
            data_x = tvm.placeholder(fuseshape, x_dtype, name="data_x")
            res = square_compute(data_x, output, kernel_name)
            tensors.append((data_x, res))
        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        schedules.append(sch)

    # build
    config = {"name": kernel_name, "tensor_list": tensors}
    te.lang.dynamic.build(schedules, config)

def mish(input_x, output_y, kernel_name="mish"):
    """
    algorithm: mish
    calculating data's mish, y = x*(1 - 2/(1+(1+exp(x))^2))

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32
    output_y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is mish

    Returns
    -------
    None
    """
    input_shape = input_x.get("shape")
    input_format = input_x.get("format")
    input_dtype = input_x.get("dtype").lower()

    check_shape(input_shape, param_name="input_x")
    check_list = ("float16", "float32")
    check_dtype(input_dtype, check_list, param_name="input_x")
    check_format(input_format)

    # fuse single axis
    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, input_shape)
    data_x = tvm.placeholder(fuseshape, dtype=input_dtype, name="data_x")

    res = mish_compute(data_x, output_y, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data_x, res]}
    te.lang.cce.cce_build_code(schedule, config)

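# A NumPy check that the docstring's rational form equals the usual
# mish(x) = x * tanh(softplus(x)) (illustrative only; not the TBE
# mish_compute kernel):
import numpy as np

def _mish_reference(x):
    x = np.asarray(x, dtype=np.float64)
    return x * (1.0 - 2.0 / (1.0 + (1.0 + np.exp(x)) ** 2))

# x = np.linspace(-4, 4, 9)
# np.allclose(_mish_reference(x), x * np.tanh(np.log1p(np.exp(x))))  # True
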
def log1p(input_x, output_y, kernel_name="log1p"):
    """
    algorithm: log1p
    calculating data's log1p, y = log(x + 1)

    Parameters
    ----------
    input_x: dict
        shape and dtype of input, only support float16, float32
    output_y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name: str
        kernel name, default value is "log1p"

    Returns
    -------
    None
    """
    dtype = input_x.get("dtype")
    check_list = ("float16", "float32")
    input_dtype = dtype.lower()
    check_dtype(input_dtype, check_list, param_name="input_x")

    schedules, tensors = [], []
    ins = classify([input_x], Mode.ELEWISE)
    for (input_x, ) in ins:
        with te.op.compute():
            x_shape = variable_shape([input_x])
            fuseshape = [1]
            fuseshape[0] = reduceIns(lambda x, y: x * y, x_shape[0])
            data_input = tvm.placeholder(fuseshape, dtype=input_dtype,
                                         name="data_input")
            res = log1p_compute(data_input, output_y, kernel_name)
            tensors.append([data_input, res])
        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        schedules.append(sch)

    config = {"name": kernel_name,
              "tensor_list": tensors,
              "bool_storage_as_1bit": False}
    te.lang.dynamic.build(schedules, config)

def abs(x, y, kernel_name="abs"):
    """
    algorithm: abs
    calculating data's abs, y = |x|

    Parameters
    ----------
    x : dict
        shape and dtype of input, only support float16, float32, int32
    y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is abs

    Returns
    -------
    None
    """
    shape = x.get("shape")
    check_shape(shape, param_name="x")

    check_list = ["float16", "float32", "int32"]
    inp_dtype = x.get("dtype").lower()
    check_dtype(inp_dtype, check_list, param_name="x")

    shape = util.shape_refine(shape)
    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)
    data = tvm.placeholder(fuseshape, name="data", dtype=inp_dtype)

    res = abs_compute(data, y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data, res]}
    te.lang.cce.cce_build_code(sch, config)

def rint(input_x, output_y, kernel_name="rint"):
    """
    algorithm: rint
    calculating rint(x): returns the integer nearest to x by element-wise;
    if the result is between two representable values, the even number
    should be used.
    For example:
        x   : [0.9, 2.5, 2.3, 1.5, -4.5]
        res : [1.0, 2.0, 2.0, 2.0, -4.0]

    Parameters
    ----------
    input_x: dict
        dict with keys(shape and dtype) of input_x
    output_y: dict
        dict with keys(shape and dtype) of output_y
    kernel_name: str
        kernel name, default value is "rint"

    Returns
    -------
    None
    """
    shape_x = input_x.get("shape")
    dtype = input_x.get("dtype")
    check_shape(shape_x, param_name="input_x")

    check_list = ("float16", "float32")
    check_dtype(dtype.lower(), check_list, param_name="input_x")

    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape_x)
    data_x = tvm.placeholder(fuseshape, dtype=dtype.lower(), name="data")
    res = rint_compute(data_x, output_y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_x, res]}
    te.lang.cce.cce_build_code(sch, config)

def square(input_x, output_y, kernel_name="square"):
    """
    algorithm: square
    calculating data's square, y = x*x

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32, int32
    output_y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "square"

    Returns
    -------
    None
    """
    shape = input_x.get("shape")
    dtype = input_x.get("dtype").lower()

    check_shape(shape, param_name="input_x")
    check_list = ["float16", "float32", "int32"]
    if dtype not in check_list:
        raise RuntimeError("square only supports float16, float32, int32")

    shape = util.shape_refine(shape)
    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)
    data = tvm.placeholder(fuseshape, name="data", dtype=dtype)

    with tvm.target.cce():
        res = square_compute(data, output_y, kernel_name)
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data, res]}
    te.lang.cce.cce_build_code(sch, config)

def sign(input_x, output_y, kernel_name="sign"):
    """
    algorithm: sign
                              x * 32768
        sign(x) = round(-----------------------)
                        2**(-15) + |x * 32768|

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32, int32
    output_y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is sign

    Returns
    -------
    None
    """
    shape = input_x.get("shape")
    check_shape(shape, param_name="input_x")

    check_list = ["float16", "float32", "int32"]
    inp_dtype = input_x.get("dtype").lower()
    if inp_dtype not in check_list:
        raise RuntimeError("sign only supports float16, float32, int32")

    shape = util.shape_refine(shape)
    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)
    data = tvm.placeholder(fuseshape, name="data", dtype=inp_dtype)

    res = sign_compute(data, output_y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data, res]}
    te.lang.cce.cce_build_code(sch, config)

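# A NumPy sketch of the docstring's rounding trick (illustrative only; not
# the TBE sign_compute kernel): scaling by 32768 and adding the tiny
# 2**(-15) to the denominator drives the quotient to -1, 0 or +1.
import numpy as np

def _sign_reference(x):
    x = np.asarray(x, dtype=np.float64)
    scaled = x * 32768.0
    return np.round(scaled / (2.0 ** -15 + np.abs(scaled)))

# _sign_reference([-3.5, 0.0, 1e-4])  -> array([-1., 0., 1.])
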
def neg(input_x, output_y, kernel_name="neg"):
    """
    Computes numerical negative value element-wise, y = -x.

    Parameters
    ----------
    input_x: dict
        shape and dtype of input, only support float16, float32, int32, int8
    output_y: dict
        shape and dtype of output, should be same type as input
    kernel_name: str
        kernel name, default value is "neg"

    Returns
    -------
    None
    """
    dtype_input = input_x.get("dtype").lower()
    check_list = ("float16", "float32", "int32", "int8")
    check_dtype(dtype_input, check_list, param_name="input_x")

    ins = classify([input_x], Mode.ELEWISE)
    schedules, tensors = [], []
    for (input_x, ) in ins:
        with te.op.compute():
            x_shape = variable_shape([input_x])
            fuse_shape = [1]
            fuse_shape[0] = reduceIns(lambda x, y: x * y, x_shape[0])
            data_input = tvm.placeholder(fuse_shape, name="data_input",
                                         dtype=dtype_input)
            res = neg_compute(data_input, output_y, kernel_name)
            tensors.append([data_input, res])
        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        schedules.append(sch)

    config = {"name": kernel_name, "tensor_list": tensors}
    te.lang.dynamic.build(schedules, config)

def gelu(input_x, output_y, kernel_name="gelu"):
    """
    mathematical formula of gelu(x):
    gelu(x) = 0.5*x*(1.0+tanh(np.sqrt(2/np.pi)*(x+0.044715*tf.pow(x,3))))
    tanh(y) = 2/(1+exp(-2y)) - 1
    convert gelu to:
    result(x) = x/(1+exp(-2*(np.sqrt(2/np.pi)*(x+0.044715*tf.pow(x,3)))))

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32
    output_y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is gelu

    Returns
    -------
    None
    """
    shape = input_x.get("shape")
    check_shape(shape, param_name="input_x")

    check_list = ("float16", "float32")
    input_dtype = input_x.get("dtype").lower()
    check_dtype(input_dtype, check_list, param_name="input_x")

    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)
    data = tvm.placeholder(fuseshape, name="data", dtype=input_dtype)

    result = gelu_compute(data, output_y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(result)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data, result]}
    te.lang.cce.cce_build_code(sch, config)

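# A NumPy check of the docstring's conversion: the tanh form of gelu equals
# the sigmoid form used here (illustrative only; not the TBE gelu_compute
# kernel):
import numpy as np

def _gelu_reference(x):
    x = np.asarray(x, dtype=np.float64)
    inner = np.sqrt(2.0 / np.pi) * (x + 0.044715 * x ** 3)
    return x / (1.0 + np.exp(-2.0 * inner))

# tanh form: 0.5 * x * (1 + np.tanh(inner)) yields the same values
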
def threshold_v2_d(x, y, threshold, value,
                   kernel_name="threshold_v2_d_cce"):
    """
    Thresholds each element of the input Tensor
    y = (x > threshold) ? x : value

    Parameters
    ----------
    x : dict
        shape and dtype of input
    y : dict
        shape and dtype of output, should be same shape and type as input
    threshold : float
        scale value to threshold at
    value : float
        scale value to replace with
    kernel_name : str
        kernel name, default value is "threshold_v2_d_cce"

    Returns
    -------
    None
    """
    # get the shape and dtype
    shape_x = x.get("shape")
    dtype_x = x.get("dtype").lower()

    # check whether dtypes are right
    check_list = ("float16", "float32", "int8", "uint8", "int32")
    check_dtype(dtype_x, check_list)

    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape_x)
    data_x = tvm.placeholder(shape=fuseshape, name="data_x", dtype=dtype_x)

    res = threshold_v2_d_compute(data_x, y, threshold, value, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_x, res]}
    te.lang.cce.cce_build_code(schedule, config)

def sqrt(input_x, output_y, kernel_name="sqrt"):
    """
    algorithm: sqrt
    calculating data sqrt, y = x**0.5; mini does not support vsqrt,
    so exp(0.5*log(x)) is used instead

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32
    output_y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is sqrt

    Returns
    -------
    None
    """
    input_shape = input_x.get("shape")
    input_dtype = input_x.get("dtype").lower()

    check_shape(input_shape, param_name="input_x")
    check_dtype(input_dtype, ("float16", "float32"), param_name="input_x")

    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, input_shape)
    input_data = tvm.placeholder(fuseshape, name="input_data",
                                 dtype=input_dtype)

    result = sqrt_compute(input_data, output_y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(result)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [input_data, result]}
    te.lang.cce.cce_build_code(sch, config)

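# A NumPy sketch of the mini-platform fallback mentioned in the docstring:
# sqrt(x) computed as exp(0.5 * log(x)) (illustrative only; not the TBE
# sqrt_compute kernel):
import numpy as np

def _sqrt_reference(x):
    x = np.asarray(x, dtype=np.float64)
    return np.exp(0.5 * np.log(x))  # valid for x > 0

# np.allclose(_sqrt_reference([0.25, 1.0, 9.0]), [0.5, 1.0, 3.0])  # True
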
def tanh(input_x, output_y, kernel_name="tanh"):
    """
    algorithm: tanh
    calculating data's tanh, y = (e^(2x) - 1) / (e^(2x) + 1)

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32
    output_y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is tanh

    Returns
    -------
    None
    """
    input_shape = input_x.get("shape")
    input_dtype = input_x.get("dtype").lower()

    check_shape(input_shape, param_name="input_x")
    check_list = ("float16", "float32")
    check_dtype(input_dtype, check_list, param_name="input_x")

    # fuse single axis
    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, input_shape)
    data = tvm.placeholder(fuseshape, name="data", dtype=input_dtype)

    res = tanh_compute(data, output_y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data, res]}
    te.lang.cce.cce_build_code(sch, config)

def fills(x, y, value, kernel_name="fills"):
    """
    do fill operation

    Parameters:
    ----------
    x : the dict of input
    y : the dict of output
    value : scalar, value to fill the tensor with
    kernel_name : cce kernel name, default value is "fills"

    Returns
    -------
    None
    """
    # get the shape and dtype
    shape = x.get("shape")
    dtype = x.get("dtype").lower()

    # check whether dtypes are right
    check_list = ("int32", "float16", "float32")
    check_dtype(dtype, check_list)

    # fuse shapes
    shape = util.shape_refine(shape)
    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)
    data_x = tvm.placeholder(fuseshape, name="data_x", dtype=dtype)

    res = fills_compute(data_x, value, dtype)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": (data_x, res),
              "print_ir": False}
    te.lang.cce.cce_build_code(sch, config)

def tan(x, y, kernel_name="tan"):
    """
    algorithm: tan
    calculating tan(x) = x + x^3/3 + 2*x^5/15 + 17*x^7/315
                         + 62*x^9/2835 + 1382*x^11/155925 + ... (|x| < pi/2)

    Parameters
    ----------
    x: dict
        dict with keys(shape and dtype) of input
    y: dict
        dict with keys(shape and dtype) of output
    kernel_name: str
        kernel name, default value is "tan"

    Returns
    -------
    None
    """
    shape_input = x.get("shape")
    dtype_input = x.get("dtype").lower()

    check_shape(shape_input, param_name="x")
    check_list = (FLOAT_16, FLOAT_32, INT_32)
    check_dtype(dtype_input, check_list, param_name="x")

    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape_input)
    data_input = tvm.placeholder(fuseshape, name="data_input",
                                 dtype=dtype_input)

    res = tan_compute(data_input, y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": (data_input, res)}
    te.lang.cce.cce_build_code(sch, config)

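# A NumPy transcription of the docstring series, valid for |x| < pi/2
# (illustrative only; not the TBE tan_compute kernel):
import numpy as np

def _tan_series_reference(x):
    x = np.asarray(x, dtype=np.float64)
    return (x + x ** 3 / 3 + 2 * x ** 5 / 15 + 17 * x ** 7 / 315
            + 62 * x ** 9 / 2835 + 1382 * x ** 11 / 155925)

# np.allclose(_tan_series_reference(0.3), np.tan(0.3), atol=1e-8)  # True
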
def ceil(input_x, output_y, kernel_name="ceil"):
    """
    algorithm: ceil
    calculating element-wise smallest integer not less than input_x,
    the type of input_x is float16 or float32

    Parameters
    ----------
    input_x: dict
        dict with keys(shape and dtype) of input
    output_y: dict
        dict with keys(shape and dtype) of output
    kernel_name: str
        kernel name, default value is "ceil"

    Returns
    -------
    None
    """
    shape = input_x.get("shape")
    dtype = input_x.get("dtype").lower()

    check_shape(shape, param_name="input_x")
    check_list = ("float16", "float32")
    check_dtype(dtype, check_list, param_name="input_x")

    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)
    data = tvm.placeholder(fuseshape, dtype=dtype, name="data")

    res = ceil_compute(data, output_y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data, res]}
    te.lang.cce.cce_build_code(sch, config)

def sigmoid(x, y, kernel_name="sigmoid"):
    """
    calculating data's sigmoid, y = 1 / (1 + exp(-x))

    Parameters
    ----------
    x : dict
        dict of x, include keys(shape and dtype)
    y : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "sigmoid"

    Returns
    -------
    None
    """
    shape = x.get("shape")
    dtype = x.get("dtype")
    op_utils.check_shape(shape, param_name="x")

    input_dtype = dtype.lower()
    check_list = ("float16", "float32")
    op_utils.check_dtype(input_dtype, check_list, param_name="x")

    fused_shape = [reduceIns(lambda a, b: a * b, shape[:])]
    data_input = tvm.placeholder(fused_shape, name="data_input",
                                 dtype=input_dtype)

    res = sigmoid_compute(data_input, y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_input, res]}
    te.lang.cce.cce_build_code(sch, config)

def cast(input_x, output_y, dst_type, kernel_name="cast"):
    """
    cast a tensor/scalar with input shape from src data type to dst data
    type. restrictions of input algorithms are as follows:
    only the type groups below are supported

    tensor process:
        float16 -> float32
        float16 -> int32
        float32 -> float16
        float32 -> int32
        int8    -> float32
        uint8   -> float32
        int8    -> float16
        uint8   -> float16
        int8    -> int32
        uint8   -> int32
        int32   -> uint8    // numbers out of [0,255] can get unexpected results
        int32   -> int8     // numbers out of [-128,127] can get unexpected results
        int32   -> float32  // when converted via fp16, only numbers in
                            // [-1023,1023] are guaranteed correct
        int32   -> float16  // only guarantees numbers in [-1023,1023]
                            // get correct results

    scalar conversion support (only shape [1,] is supported):
        int64 -> int32
        int64 -> float32

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    output_y: dict
        shape and dtype of output, should be same shape as input, and the
        dtype is the dst dtype need to cast
    kernel_name : str
        cce kernel name, default value is cast

    Returns
    -------
    None
    """
    shape = util.scalar2tensor_one(input_x.get("shape"))
    src_type = input_x.get("dtype").lower()
    check_shape(shape, param_name="input_x")

    if src_type == "bool":
        src_type = "int8"
    dst_type = _cast_dsttype_conversion(dst_type)

    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)
    data = tvm.placeholder(fuseshape, name="data", dtype=src_type)

    if src_type == "int64":
        # the int64 scalar path is handled by a hand-written IR kernel
        check_dtype(dst_type, ("float32", "int32"), param_name="dst_type")
        res = tvm.extern(
            [fuseshape], [data],
            lambda ins, outs: _kernel_ir(outs, ins, dst_type, "int64"),
            name="res", dtype=dst_type)
        tensor_list = [data, res]
        schedule = tvm.create_schedule(res.op)
        with build_config:
            tvm.build(schedule, tensor_list, "cce", name=kernel_name)
    else:
        with tvm.target.cce():
            res = cast_compute(data, output_y, dst_type, kernel_name)
            sch = generic.auto_schedule(res)
        config = {"print_ir": False,
                  "name": kernel_name,
                  "tensor_list": [data, res]}
        te.lang.cce.cce_build_code(sch, config)
