def cosh(input_x, output_cosh, kernel_name="cosh"):
    """
    Build the CCE kernel for the element-wise cosh operator.

    The input shape and dtype are validated, the shape is flattened to a
    single dimension, and a placeholder of that size is passed to
    `cosh_compute`. The result is auto-scheduled and built.
    The operator is expected to evaluate y = (e^x + e^(-x)) / 2; the
    arithmetic itself lives in `cosh_compute` (not visible here).

    Parameters
    ----------
    input_x: dict
        shape and dtype of input, only support float16, float32
    output_cosh: dict
        description of the output, forwarded to `cosh_compute`
    kernel_name: str
        kernel name, default value is "cosh"

    Returns
    -------
    None
    """
    in_shape = input_x.get("shape")
    in_dtype = input_x.get("dtype")

    check_shape(in_shape, param_name="input_x")
    supported_dtypes = ("float16", "float32")
    lowered_dtype = in_dtype.lower()
    check_dtype(lowered_dtype, supported_dtypes, param_name="input_x")

    # Element-wise op: collapse every axis into one flat dimension.
    element_count = functools_reduce(lambda a, b: a * b, in_shape[:])
    flat_shape = (element_count, )
    data_input = tvm.placeholder(flat_shape, name="data_input",
                                 dtype=lowered_dtype)
    result = cosh_compute(data_input, output_cosh, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    build_config = {"name": kernel_name,
                    "tensor_list": [data_input, result]}
    te.lang.cce.cce_build_code(schedule, build_config)
def custom_Concat(shapes, dtype, axis, kernel_name="concat",
                  need_build=False, need_print=False):
    """
    Build a CCE kernel that concatenates the inputs along `axis`.

    Parameters
    ----------
    shapes : sequence of input shapes, one entry per input tensor
    dtype : the data type shared by inputs and output, support uint8,
        int8, int32, float16, float32
    axis : concat axis
    kernel_name : cce kernel name, default value is "concat"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if the summed element count of all inputs exceeds 2**31 - 1, or
        if `dtype` is not one of the supported types.
    """
    util.check_kernel_name(kernel_name)

    for one_shape in shapes:
        util.check_shape_rule(one_shape)

    # Total number of elements over all inputs must fit in 32 bits.
    total_elements = sum(
        functools_reduce(lambda a, b: a * b, one_shape)
        for one_shape in shapes)
    if total_elements > 2**31 - 1:
        raise RuntimeError("shape exceed 32bit limitation")

    check_list = ["uint8", "int8", "float16", "float32", "int32"]
    inp_dtype = dtype.lower()
    if inp_dtype not in check_list:
        raise RuntimeError(
            "concat_cce only support %s while dtype is %s"
            % (",".join(check_list), dtype))

    data = [
        tvm.placeholder(one_shape, name="data_%d" % idx, dtype=inp_dtype)
        for idx, one_shape in enumerate(shapes)
    ]

    with tvm.target.cce():
        res = te.lang.cce.concat(data, axis)
        sch = generic.auto_schedule(res)

    # The build expects the inputs followed by the output tensor.
    data.append(res)

    config = {
        "print_ir": need_print,
        "need_build": need_build,
        "name": kernel_name,
        "tensor_list": data,
    }
    te.lang.cce.cce_build_code(sch, config)
def correction_mul(x, batch_std, running_std, y, channel,
                   kernel_name="correction_mul"):
    """
    Build the CCE kernel for the CorrectionMul op.

    Parameters
    ----------
    x : dict
        input description; "shape", "format" and "dtype" are read.
        dtype must be float16 or float32.
    batch_std : dict
        its "ori_shape"[0] gives the size of the channel axis; its "shape"
        is used directly when the format is NC1HWC0 and channel == 1.
    running_std : dict
        not read here; its placeholder reuses the batch_std shape.
    y : dict
        output description (unused in this function).
    channel : int
        index of the channel axis in `x`'s shape.
    kernel_name : str
        kernel name, default value is "correction_mul"

    Returns
    -------
    None
    """
    x_shape = x.get("shape")
    x_format = x.get("format")

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(x_shape)
    util.check_shape_size(x_shape, SHAPE_SIZE_LIMIT)

    inp_dtype = x.get("dtype").lower()
    if inp_dtype not in ["float16", "float32"]:
        raise RuntimeError("Dtype of input only support float16, float32")

    x_t = tvm.placeholder(x_shape, name="x", dtype=inp_dtype)

    # Broadcastable per-channel shape: 1 everywhere except the channel axis.
    std_shape = [1] * len(x_shape)
    std_shape[channel] = batch_std.get("ori_shape")[0]
    if x_format == "NC1HWC0" and channel == 1:
        std_shape = batch_std.get("shape")

    batch_std_t = tvm.placeholder(std_shape, name="batch_std",
                                  dtype=inp_dtype)
    running_std_t = tvm.placeholder(std_shape, name="running_std",
                                    dtype=inp_dtype)
    res = correction_mul_compute(x_t, batch_std_t, running_std_t,
                                 kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [x_t, batch_std_t, running_std_t, res],
    }
    te.lang.cce.cce_build_code(sch, config)
def invert(input_x, output_y, kernel_name="invert"):
    """
    Build the CCE kernel for the element-wise bit-flip (invert) operator.

    Parameters
    ----------
    input_x: dict
        the dict of input tensor.
        Must be one of the following types: `int16`, `uint16`.
    output_y: dict
        the dict of output tensor, forwarded to `invert_compute`.
    kernel_name: str
        cce kernel name, default value is "invert".

    Returns
    -------
    None.
    """
    in_shape = input_x.get("shape")
    in_dtype = input_x.get("dtype").lower()
    supported_dtypes = ("int16", "uint16")

    check_shape(in_shape, param_name="input_x")
    check_dtype(in_dtype, supported_dtypes, param_name="input_x")

    # Element-wise op: work on a flattened one-dimensional view.
    flat_shape = (functools_reduce(lambda a, b: a * b, in_shape[:]), )
    data_x = tvm.placeholder(flat_shape, name="data_x", dtype=in_dtype)
    result = invert_compute(data_x, output_y, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    build_config = {"name": kernel_name, "tensor_list": [data_x, result]}
    te.lang.cce.cce_build_code(schedule, build_config)
def strided_write(x, y, axis, stride, kernel_name='strided_write'):
    """
    Set up the compute and schedule for writing data to a tensor by stride.

    The input shape must unpack into exactly five dimensions; the third and
    fourth are merged into one before the placeholder is created. This
    function only creates the schedule: nothing is built or returned here.

    Parameters:
    ----------
    x: dict of input ("shape" and "dtype" are read).
    y: dict of output.
    axis: which axis to write data by stride.
    stride: data write stride.
    kernel_name: cce kernel name, default value is "strided_write".

    Returns:
    -------
    None
    """
    check_params(x, y, axis)

    in_dtype = x.get("dtype")
    # Names suggest an (N, C1, H, W, C0) layout -- TODO confirm with callers.
    dim_n, dim_c1, dim_h, dim_w, dim_c0 = x.get("shape")
    merged_shape = (dim_n, dim_c1, dim_h * dim_w, dim_c0)

    input_x = tvm.placeholder(merged_shape, name="input_x", dtype=in_dtype)
    result = strided_write_compute(input_x, y, axis, stride, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(result)
def reduce_std(x, y1, y2, dim=None, unbiased=True, keepdim=False,
               kernel_name="reduce_std"):
    """
    Build the CCE kernel for the reduce_std operator.

    Parameters
    ----------
    x : dict
        input description; "shape" and "dtype" are read. dtype must be
        float16 or float32 (case-insensitive).
    y1 : dict
        first output description (unused in this function).
    y2 : dict
        second output description (unused in this function).
    dim : forwarded to `reduce_std_compute`, default None
    unbiased : bool
        forwarded to `reduce_std_compute`, default True
    keepdim : bool
        forwarded to `reduce_std_compute`, default False
    kernel_name : str
        kernel name, default value is "reduce_std"

    Returns
    -------
    None
    """
    check_list = ("float16", "float32")
    shape_x = x.get("shape")
    dtype_x = x.get("dtype").lower()
    util.check_dtype_rule(dtype_x, check_list)
    util.check_shape_rule(shape_x)

    # Use the validated, lower-cased dtype (and the checked shape) for the
    # placeholder; previously the raw dtype string was passed, so a value
    # such as "Float16" passed validation but reached TVM unnormalized.
    data_x = tvm.placeholder(shape_x, dtype=dtype_x, name="data_x")

    # `reduce_std_compute` returns an iterable of output tensors.
    res = reduce_std_compute(data_x, dim, unbiased, keepdim, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_x] + list(res)}
    te.lang.cce.cce_build_code(schedule, config)
def minmax_update_perchannel(x, min_val, max_val, min_up, max_up, ema,
                             ema_decay, channel_axis,
                             kernel_name="minmax_update_perchannel"):
    """
    Build the CCE kernel for the MinMaxUpdatePerChannel op.

    Parameters
    ----------
    x : dict
        input description ("ori_shape", "shape", "format", "dtype").
    min_val, max_val : dict
        per-channel min / max descriptions; must be 1-D with length equal
        to the size of the channel axis of `x`.
    min_up, max_up : dict
        output descriptions (unused in this function).
    ema, ema_decay :
        forwarded to `minmax_update_perchannel_compute`.
    channel_axis : int
        channel axis of `x`; remapped to 1 for the dense-weight case below.
    kernel_name : str
        kernel name, default value is "minmax_update_perchannel"

    Returns
    -------
    None
    """
    x_shape = x.get("ori_shape")
    x_format = x.get("format")
    x_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    # for Dense weight quant, 2d[co,ci] -> 4d[1,co,ci,1],
    # channel_axis_ need change to 1.
    dense_weight_case = (channel_axis == 0
                         and x_shape[0] != min_shape[0]
                         and x_shape[1] == min_shape[0])
    channel_axis_ = 1 if dense_weight_case else channel_axis

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(x_shape)
    channel_size = x_shape[channel_axis_]
    util.check_shape_rule(min_shape, 1, 1, channel_size)
    util.check_shape_rule(max_shape, 1, 1, channel_size)
    util.check_tensor_shape_size(x_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = x_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    for one_dtype in (x_dtype, min_dtype, max_dtype):
        util.check_dtype_rule(one_dtype, check_list)

    if channel_axis_ == 0:
        shape_c = min_val.get("ori_shape")
    else:
        shape_c = [min_val.get("shape")[1], min_val.get("shape")[-1]]

    # All three placeholders are created with the dtype of `x`.
    input_data = tvm.placeholder(x.get("shape"), name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
    max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype)
    res_list = minmax_update_perchannel_compute(
        input_data, min_data, max_data, ema, ema_decay, channel_axis_)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [input_data, min_data, max_data] + list(res_list),
    }
    te.lang.cce.cce_build_code(sch, config)
def logical_not(x, y, kernel_name="logical_not"):
    """
    Build the CCE kernel for the element-wise logical_not operator.

    Parameters
    ----------
    x : dict
        shape and dtype of input; the dtype check below accepts int8 only
    y : dict
        description of the output, forwarded to `logical_not_compute`
    kernel_name : str
        kernel name, default value is "logical_not"

    Returns
    -------
    None
    """
    in_shape = x.get("shape")
    in_dtype = x.get("dtype").lower()

    check_shape(in_shape, param_name="x")
    check_dtype(in_dtype, ("int8", ), param_name="x")

    # Element-wise op: flatten all axes into one dimension.
    flat_shape = (functools_reduce(lambda a, b: a * b, in_shape[:]), )
    data = tvm.placeholder(flat_shape, name="data", dtype=in_dtype)
    result = logical_not_compute(data, y, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    build_config = {"name": kernel_name, "tensor_list": [data, result]}
    te.lang.cce.cce_build_code(schedule, build_config)
def fake_learned_scale_quant_perchannel_grad_d_reduce(
        dout_alpha, dalpha, channel_axis,
        kernel_name="fake_learned_scale_quant_perchannel_grad_d_reduce"):
    """
    Build the CCE kernel for FakeLearnedScaleQuantPerChannelGradDReduce.

    Parameters
    ----------
    dout_alpha : dict
        input description; "shape" and "dtype" are read. dtype must be
        float32 or float16.
    dalpha : dict
        output description (unused in this function).
    channel_axis :
        forwarded to the compute function.
    kernel_name : str
        kernel name.

    Returns
    -------
    None
    """
    in_shape = dout_alpha.get("shape")
    in_dtype = dout_alpha.get("dtype")

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(in_shape)
    util.check_tensor_shape_size(in_shape)

    in_dtype = in_dtype.lower()
    util.check_dtype_rule(in_dtype, ["float32", 'float16'])

    dout_alpha_data = tvm.placeholder(in_shape, name="dout_alpha",
                                      dtype=in_dtype)
    # NOTE(review): the input dict `dout_alpha` (not `dalpha`) is passed as
    # the second argument -- confirm this is what the compute expects.
    res = fake_learned_scale_quant_perchannel_grad_d_reduce_compute(
        dout_alpha_data, dout_alpha, channel_axis, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [dout_alpha_data, res],
    }
    te.lang.cce.cce_build_code(sch, config)
def zeros_like(x, y, kernel_name="zeros_like"):
    """
    Build the CCE kernel for the zeros_like operator.

    Parameters
    ----------
    x: dict
        shape and dtype of input, only support float16, float32,
        int32, int8, uint8
    y: dict
        shape and dtype of output data, forwarded to `zeros_like_compute`
    kernel_name: str
        cce kernel name, default value is "zeros_like"

    Returns
    ------
    None
    """
    in_shape = x.get("shape")
    in_dtype = x.get("dtype")

    check_shape(in_shape, param_name="x")
    supported_dtypes = ("float16", "float32", "int32", "int8", "uint8")
    src_dtype = in_dtype.lower()
    check_dtype(src_dtype, supported_dtypes, param_name="x")

    # Work on a flattened one-dimensional view of the input.
    flat_shape = (functools_reduce(lambda a, b: a * b, in_shape[:]),)
    x_input = tvm.placeholder(flat_shape, name="x_input", dtype=src_dtype)
    result = zeros_like_compute(x_input, y, kernel_name=kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    build_config = {"name": kernel_name, "tensor_list": [x_input, result]}
    te.lang.cce.cce_build_code(schedule, build_config)
def squared_difference(x1, x2, y, kernel_name="squared_difference"):
    """
    Build the CCE kernel computing y = (x1 - x2) * (x1 - x2) with
    broadcasting between the two inputs.

    Parameters
    ----------
    x1 : dict
        shape and dtype of the first input; dtype must be float16,
        float32 or int32 and is used for both placeholders
    x2 : dict
        shape of the second input (its dtype is not read here)
    y : dict
        description of the output (unused in this function)
    kernel_name : str
        cce kernel name, default value is squared_difference

    Returns
    -------
    None
    """
    shape_x = x1.get("shape")
    shape_y = x2.get("shape")
    check_shape(shape_x, param_name="x1")
    check_shape(shape_y, param_name="x2")

    dtype = x1.get("dtype").lower()
    if dtype not in ["float16", "float32", "int32"]:
        raise RuntimeError(
            "tf_squared_difference_cce only support float16, float32, int32")

    shape_x, shape_y, shape_max = broadcast_shapes(
        shape_x, shape_y,
        param_name_input1="x1", param_name_input2="x2")
    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)

    data_x = tvm.placeholder(shape_x, dtype=dtype, name="data_x")
    data_y = tvm.placeholder(shape_y, dtype=dtype, name="data_y")

    with tvm.target.cce():
        # Recompute the broadcast target for the refined shapes.
        shape_x, shape_y, shape_max = broadcast_shapes(
            shape_x, shape_y,
            param_name_input1="x1", param_name_input2="x2")
        lhs = te.lang.cce.broadcast(data_x, shape_max)
        rhs = te.lang.cce.broadcast(data_y, shape_max)
        diff = te.lang.cce.vsub(lhs, rhs)
        res = te.lang.cce.vmul(diff, diff)
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data_x, data_y, res],
    }
    te.lang.cce.cce_build_code(sch, config)
def sin(x, y, kernel_name="sin"):
    """
    Build the CCE kernel for the element-wise sin operator.

    The original header documents a Taylor-series evaluation,
    sin x = x - x^3/3! + x^5/5! + ... + (-1)^k * x^(2k+1)/(2k+1)!;
    the arithmetic itself lives in `sin_compute` (not visible here).

    Parameters
    ----------
    x : dict
        shape and dtype of input, only support float16, float32
    y : dict
        description of the output, forwarded to `sin_compute`
    kernel_name : str
        cce kernel name, default value is "sin"

    Returns
    -------
    None
    """
    in_shape = x.get("shape")
    in_dtype = x.get("dtype").lower()

    check_shape(in_shape, param_name="x")
    check_dtype(in_dtype, (FLOAT_16, FLOAT_32), param_name="x")

    # Element-wise op: fuse every axis into a single dimension.
    fused_shape = [reduceIns(lambda a, b: a * b, in_shape)]
    data_input = tvm.placeholder(fused_shape, name="data_input",
                                 dtype=in_dtype)
    result = sin_compute(data_input, y, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    build_config = {"name": kernel_name,
                    "tensor_list": (data_input, result)}
    te.lang.cce.cce_build_code(schedule, build_config)
def JDEcoder(shape, dtype, num_classes, num_boxes, conf_thresh, iou_thresh,
             biases, masks, strides, kernel_name="JDEcoder",
             need_build=True, need_print=False):
    """
    Build the CCE kernel for the JDEcoder operator.

    The compute body is currently a pass-through: the result tensor is the
    input placeholder itself. `num_boxes`, `conf_thresh`, `iou_thresh`,
    `biases`, `masks` and `strides` are accepted but not used yet.

    Parameters
    ----------
    shape : shape of the input tensor
    dtype : data type, support float16, float32 (case-insensitive)
    num_classes : number of classes, must be >= 1
    num_boxes, conf_thresh, iou_thresh, biases, masks, strides :
        reserved, currently unused
    kernel_name : kernel name, default value is "JDEcoder"
    need_build : if need to build CCEC kernel, default value is True
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if `dtype` is unsupported or `num_classes` is smaller than 1.
    """
    # TODO: Please refer to the TE DSL Manual, And code here with TE DSL.
    check_list = ["float16", "float32"]
    inp_dtype = dtype.lower()
    if inp_dtype not in check_list:
        raise RuntimeError("JDEcoder only support %s while dtype is %s"
                           % (",".join(check_list), dtype))

    # The message now matches the condition actually enforced.
    if num_classes < 1:
        raise RuntimeError("num_classes must be >= 1")

    inp_tensor = tvm.placeholder(shape, name='inp_tensor', dtype=inp_dtype)

    with tvm.target.cce():
        res = inp_tensor
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": need_print,
        # Was misspelled "need_buid", so `need_build` was never honoured.
        "need_build": need_build,
        "name": kernel_name,
        "tensor_list": [inp_tensor, res],
    }
    te.lang.cce.cce_build_code(sch, config)


# if __name__ == "__main__":
#     JDEcoder((1, 536, 10, 18), "float16", 1, 4, 0.5, 0.45,
#              (6, 16, 8, 23, 11, 32, 16, 45, 21, 64, 30, 90, 43, 128, 60,
#               180, 85, 255, 120, 360, 170, 420, 340, 320),
#              (8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3), (32, 16, 8))
def reduce_max_d(x, y, axes=None, keepdims=None, kernel_name="reduce_max_d"):
    """
    Build the dynamic-shape kernel that reduces a tensor over the given
    axes using max.

    Note that this function mutates `x`: its "shape" and "range" entries
    are overwritten with the fused values returned by `fused_reduce_axis`.

    Parameters
    ----------
    x : dict
        input description; "dtype", "shape" and "range" are read by key
        (a missing key raises KeyError). dtype must be one of float16,
        float32, int8, uint8, int32.
    y : dict
        description of the output, forwarded to `reduce_max_d_compute`
    axes: list
        the axes to reduce; a falsy value (None, empty list) selects every
        axis. May be negative to index from the end (e.g., -1 for the last
        axes) -- presumably normalized by `cce_util.axis_check`.
        axes may be int or list (e.g. [1,2])
    keepdims: bool
        forwarded to `reduce_max_d_compute`, default value is None
    kernel_name : str
        kernel name, default value is "reduce_max_d"

    Returns
    -------
    None
    """
    dtype = x["dtype"]
    dtype_lower = dtype.lower()
    check_list = ("float16", "float32", "int8", "uint8", "int32")
    check_dtype(dtype_lower, check_list)

    with te.op.compute():
        shape = x["shape"]
        shape_range = x["range"]

        shape_len = len(shape)
        # NOTE(review): `not axes` is also true for the integer 0, so
        # axes=0 is treated as "reduce every axis" -- confirm intended.
        if not axes:
            axes = range(shape_len)
        # Anything exposing an `index` attribute (list, tuple, range) is
        # normalized to a list; a plain int is passed through unchanged.
        if hasattr(axes, 'index'):
            axes = list(axes)
        axes = cce_util.axis_check(shape_len, axes)

        shape_new, shape_range_new, axes_new, fused_rel_dic = \
            fused_reduce_axis(shape, shape_range, axes)
        # Record the fusion relation as compile info under "fused_rel_dic".
        add_compile_info("fused_rel_dic", fused_rel_dic)

        # Overwrite the caller's dict so variable_shape sees the fused shape.
        x["shape"] = shape_new
        x["range"] = shape_range_new
        shape_var_new = variable_shape([x])[0]

        data_input = tvm.placeholder(shape_var_new, name="data_input",
                                     dtype=dtype_lower)
        res = reduce_max_d_compute(data_input, y, axes_new, keepdims)

        with tvm.target.cce():
            sch = generic.auto_schedule(res)

    # build
    config = {"name": kernel_name,
              "tensor_list": [data_input, res]}
    te.lang.dynamic.build(sch, config)
def diag_part_d(x, assist, y, kernel_name="diag_part_d"):
    """
    Build the CCE kernel returning the diagonal part of a tensor.

    Parameters
    ----------
    x: dict
        dict of x, include keys(shape and dtype); rank must be 2, 4, 6 or 8
        and the first half of the shape must equal the second half
    assist: dict
        dict of help Matrix, Its Diagonal Line value is 1 else value is 0;
        must have the same shape and dtype as x
    y: dict
        dict of output; its shape repeated twice must equal the shape of x
    kernel_name: str
        cce kernel name, default value is "diag_part_d"

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if any of the rank, shape or dtype constraints above is violated.
    """
    shape_x = x.get("shape")
    dtype_x = x.get("dtype")
    shape_assist = assist.get("shape")
    dtype_assist = assist.get("dtype")
    shape_y = y.get("shape")

    check_shape(shape_x, param_name="x")
    check_shape(shape_assist, param_name="assist")

    if len(shape_x) not in (2, 4, 6, 8):
        raise RuntimeError("Input tensors of rank 2,4,6,8 are supported!")
    if list(shape_x) != list(shape_assist):
        raise RuntimeError("the shape of data must be equal!")

    # The input must look like (d0..dk, d0..dk): both halves identical.
    len_shape_out = len(shape_x) // VALUE_TWO
    for i in range(len_shape_out):
        if shape_x[i] != shape_x[i + len_shape_out]:
            raise RuntimeError("the shape of input is not supported!")
    if list(shape_x) != list(shape_y + shape_y):
        raise RuntimeError("the shape of output is not supported!")
    # A second, identical shape_x/shape_assist comparison used to follow
    # here; it could never fire after the check above and was removed.

    check_list = ("float16", "float32", "int32")
    dtype_x = dtype_x.lower()
    check_dtype(dtype_x, check_list, param_name="x")
    dtype_assist = dtype_assist.lower()
    check_dtype(dtype_assist, check_list, param_name="assist")
    if dtype_assist != dtype_x:
        raise RuntimeError("the dtype of data must be equal!")

    data_x = tvm.placeholder(shape_x, name="data_x", dtype=dtype_x)
    data_assist = tvm.placeholder(shape_assist, name="data_assist",
                                  dtype=dtype_assist)
    res = diag_part_d_compute(data_x, data_assist, y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_x, data_assist, res]}
    te.lang.cce.cce_build_code(sch, config)
def bn_infer_grad(grads, scale, batch_variance, x_backprop,
                  epsilon=0.0001, kernel_name="bn_infer_grad"):
    """
    Build the CCE kernel for bn_infer_grad
    (algorithm: fused_batch_norm_grad_v2).

    Parameters
    ----------
    grads: dict
        dict of grads, A 5D Tensor for input grads (float32 or float16).
    scale: dict
        dict of scale, A 5D Tensor for input scale (float32).
    batch_variance: dict
        dict of batch_variance, A 5D Tensor for input batch_variance
        (float32); must share its "shape" with scale.
    x_backprop: dict
        dict of x_backprop, A 5D Tensor for output x_backprop.
    epsilon: float
        A small float number added to the variance of x.
        Defaults to `0.0001`.
    kernel_name: str
        kernel name, default value is "bn_infer_grad"

    Returns
    -------
    None
    """
    grads_shape = grads.get("shape")
    scale_shape = scale.get("shape")
    variance_shape = batch_variance.get("shape")

    grads_dtype = grads.get("dtype").lower()
    scale_dtype = scale.get("dtype").lower()
    variance_dtype = batch_variance.get("dtype").lower()

    check_dtype(grads_dtype, ("float32", "float16"), param_name="grads")
    check_dtype(scale_dtype, ("float32",), param_name="scale")
    check_dtype(variance_dtype, ("float32",), param_name="batch_variance")

    _check_shape(grads_shape, variance_shape)
    util.compare_tensor_dict_key(scale, batch_variance, "shape")

    grads_input = tvm.placeholder(grads_shape, name="grads_input",
                                  dtype=grads_dtype)
    # The scale placeholder is registered under the name "x_input".
    scale_input = tvm.placeholder(scale_shape, name="x_input",
                                  dtype=scale_dtype)
    batch_variance_input = tvm.placeholder(variance_shape,
                                           name="batch_variance_input",
                                           dtype=variance_dtype)

    res = bn_infer_grad_compute(grads_input, scale_input,
                                batch_variance_input, x_backprop,
                                epsilon, kernel_name=kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": [grads_input, scale_input,
                        batch_variance_input, res],
    }
    te.lang.cce.cce_build_code(sch, config)
def _conv3dbp_input_achieve_with_tvm():
    """
    Create the placeholders, compute, schedule and build for the conv3d
    backprop-input kernel.

    Takes no arguments: every value it uses (shape_dedy, shape_filter_frac,
    the filter_* sizes, input_sizes, strides, pads, dilations, res_dtype,
    the dtypes and kernel_name) is read from a scope outside this function
    that is not visible here.
    """
    out_backprop = tvm.placeholder(shape_dedy, name="dedy",
                                   dtype=out_backprop_dtype)
    filter_sizes_ncdhw = [
        filter_batch, filter_channel, filter_depth, filter_h, filter_w
    ]
    filter_tensor = tvm.placeholder(shape_filter_frac, name="filter",
                                    dtype=filter_dtype)

    input_grad = te.lang.cce.conv3d_backprop_input_compute(
        filters=filter_tensor,
        out_backprop=out_backprop,
        filter_sizes=filter_sizes_ncdhw,
        input_sizes=input_sizes,
        strides=strides,
        padding=pads,
        dilations=dilations,
        res_dtype=res_dtype,
        kernel_name=kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(input_grad)

    build_config = {
        "name": kernel_name,
        "tensor_list": [filter_tensor, out_backprop, input_grad],
    }
    te.lang.cce.cce_build_code(schedule, build_config)
def fake_quant_per_layer(x, min_val, max_val, y, symmetric, narrow_range,
                         num_bits, kernel_name="fake_quant_per_layer"):
    """
    Build the CCE kernel for the FakeQuantPerLayer op.

    Parameters
    ----------
    x : dict
        input description ("shape", "dtype"); float32 or float16.
    min_val, max_val : dict
        scalar-like min / max descriptions ("ori_shape", "dtype"); each
        must check out as a single-element 1-D shape.
    y : dict
        output description, forwarded to the compute function.
    symmetric : forwarded to the compute function.
    narrow_range : bool
        when truthy the quantized minimum is 1 instead of 0.
    num_bits : int
        the quantized maximum is 2**num_bits - 1.
    kernel_name : str
        kernel name, default value is "fake_quant_per_layer"

    Returns
    -------
    None
    """
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    for one_dtype in (x_dtype, min_dtype, max_dtype):
        util.check_dtype_rule(one_dtype, check_list)

    # Flatten the input, then shape min/max to match that flat view.
    input_shape = (functools_reduce(lambda a, b: a * b, input_shape[:]), )
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    quant_min = 1 if narrow_range else 0
    quant_max = 2**num_bits - 1

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    # The max placeholder reuses the shape computed for min.
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res = fake_quant_per_layer_compute(input_data, min_data, max_data, y,
                                       quant_min, quant_max, symmetric,
                                       kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [input_data, min_data, max_data, res],
    }
    te.lang.cce.cce_build_code(sch, config)
def square_sum_v2(input_x, output1, output2, attr1, attr2=True,
                  kernel_name="square_sum_v2"):
    """
    Build the CCE kernel for the square_sum_v2 fusion operator.

    Parameters
    ----------
    input_x : dict
        input description; "shape" and "dtype" are read. Only the shape is
        validated here.
    output1, output2 : dict
        output descriptions, forwarded to the compute function.
    attr1, attr2 :
        attributes forwarded unchanged to the compute function
        (attr2 defaults to True).
    kernel_name : str
        kernel name, default value is "square_sum_v2"

    Returns
    -------
    None
    """
    in_shape = input_x.get("shape")
    in_dtype = input_x.get("dtype").lower()

    check_shape(in_shape, param_name="input_x")

    data_input = tvm.placeholder(in_shape, name="data_input",
                                 dtype=in_dtype)

    # The compute function (spelled "suqare" where it is defined) returns
    # an iterable of output tensors.
    outputs = suqare_sum_v2_compute(data_input, output1, output2,
                                    attr1, attr2, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(outputs)

    build_config = {"name": kernel_name,
                    "tensor_list": [data_input] + list(outputs)}
    te.lang.cce.cce_build_code(schedule, build_config)
def cos(input_x, output_y, kernel_name="cos"):
    """
    Build the CCE kernel for the element-wise cos operator.

    The original header documents a Taylor-series evaluation,
    cos x = 1 - x^2/2! + x^4/4! + ... + (-1)^k * x^(2k)/(2k)!;
    the arithmetic itself lives in `cos_compute` (not visible here).

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32
    output_y: dict
        description of the output, forwarded to `cos_compute`
    kernel_name : str
        kernel name, default value is "cos"

    Returns
    -------
    None
    """
    in_shape = input_x.get("shape")
    in_dtype = input_x.get("dtype").lower()

    check_shape(in_shape, param_name="input_x")
    supported_dtypes = ("float16", "float32")
    check_dtype(in_dtype, supported_dtypes, param_name="input_x")

    # Element-wise op: flatten all axes into one dimension.
    flat_shape = (functools_reduce(lambda a, b: a * b, in_shape[:]), )
    data_input = tvm.placeholder(flat_shape, name="data_input",
                                 dtype=in_dtype)
    result = cos_compute(data_input, output_y, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    build_config = {"name": kernel_name,
                    "tensor_list": [data_input, result]}
    te.lang.cce.cce_build_code(schedule, build_config)
def CusSquare(input_x, output_y, kernel_name="square"):
    """
    Build the CCE kernel for the custom square operator (y = x * x
    according to the original header; the arithmetic lives in
    `square_compute`).

    No dtype or shape validation is performed here; the shape is only
    passed through `util.shape_refine`.

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float32
    output_y: dict
        description of the output, forwarded to `square_compute`
    kernel_name : str
        kernel name, default value is "square"

    Returns
    -------
    None
    """
    in_dtype = input_x.get("dtype").lower()
    refined_shape = util.shape_refine(input_x.get("shape"))

    data = tvm.placeholder(refined_shape, name="data", dtype=in_dtype)

    # Here the compute is invoked inside the target context as well.
    with tvm.target.cce():
        result = square_compute(data, output_y, kernel_name)
        schedule = generic.auto_schedule(result)

    build_config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data, result],
    }
    te.lang.cce.cce_build_code(schedule, build_config)
def strided_read(x, y, axis, stride, kernel_name='strided_read'):
    """
    Set up the compute and schedule for reading data from a tensor by
    stride.

    This function only creates the schedule: nothing is built or returned
    here.

    Parameters:
    ----------
    x: dict of input ("shape" and "dtype" are read).
    y: dict of output.
    axis: which axis to read data by stride.
    stride: data read stride.
    kernel_name: cce kernel name, default value is "strided_read".

    Returns:
    -------
    None
    """
    check_params(x, y, axis)

    in_shape = x.get("shape")
    in_dtype = x.get("dtype")

    input_x = tvm.placeholder(in_shape, name="input_x", dtype=in_dtype)
    result = strided_read_compute(input_x, y, axis, stride, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(result)
def fused_mul_apply_momentum(var, accum, lr, x1, momentum, x2,
                             out_var, out_accum, use_nesterov=False,
                             kernel_name="fused_mul_apply_momentum"):
    """
    Build the CCE kernel that updates '*var' according to the
    ApplyMomentum algorithm.

    Documented update rule (implemented by
    `_fused_mul_apply_momentum_compute`, not visible here):

    accum = accum * momentum + x1 * x2
    if use_nesterov is True:
        var -= grad * lr + accum * momentum * lr
    else:
        var -= accum * lr

    Parameters:
    ----------
    var : the dict of mutable tensor var, only support float16, float32.
    accum: the dict of mutable tensor accum.
        Must have the same dtype as `var`.
    lr : the dict of scalar lr. Must have the same dtype as `var`.
    x1 : the dict of tensor grad. Must have the same dtype as `var`.
    momentum : the dict of scalar momentum.
        Must have the same dtype as `var`.
    x2 : the dict of scalar grad. Must have the same dtype as `var`.
    out_var : the dict of var output.
    out_accum : the dict of accum output
    use_nesterov: bool. If true, use nesterov computing grad,
        default value is False.
    kernel_name : cce kernel name,
        default value is "fused_mul_apply_momentum".

    Returns
    -------
    None
    """
    input_names = ['var', 'accum', 'lr', 'x1', 'momentum', 'x2']
    input_dicts = [var, accum, lr, x1, momentum, x2]

    # One placeholder per input dict, in the same order as the names.
    var_t, accum_t, lr_t, x1_t, momentum_t, x2_t = _get_placeholder(
        input_dicts, input_names)

    res_var, res_accum = _fused_mul_apply_momentum_compute(
        var_t, accum_t, lr_t, x1_t, momentum_t, x2_t,
        out_var, out_accum, use_nesterov)

    with tvm.target.cce():
        schedule = generic.auto_schedule([res_var, res_accum])

    build_config = {
        "name": kernel_name,
        "tensor_list": [var_t, accum_t, lr_t, x1_t, momentum_t, x2_t,
                        res_var, res_accum],
    }
    te.lang.cce.cce_build_code(schedule, build_config)
def gn_training_reduce(x, sum, square_sum, num_groups=2,
                       kernel_name="gn_training_reduce"):
    """
    Build the CCE kernel for the group-norm training reduce step.

    For the NCHW and NHWC formats the channel axis is split into
    (num_groups, channels // num_groups) before the placeholder is
    created; any other format keeps its shape unchanged.

    Parameters
    ----------
    x: dict
        dict of input, A 5HD Tensor for input data
        ("shape", "dtype", "format" are read; float16 or float32).
    sum: dict
        dict of sum, A `Tensor`. Sum of x (unused in this function).
    square_sum: dict
        dict of square_sum, A `Tensor`. Square sum of x
        (unused in this function).
    num_groups: int
        A integer value indicates the group in channel.
    kernel_name : str
        kernel name, default value is "gn_training_reduce"

    Returns
    -------
    None
    """
    shape_x = x.get("shape")
    input_dtype = x.get("dtype").lower()
    data_format = x.get("format")

    _shape_check(shape_x, data_format, num_groups)
    check_dtype(input_dtype, ("float16", "float32"), param_name="x")

    if data_format == "NCHW":
        # Reshape NCHW -> N[GD]HW
        batch, channels, height, width = shape_x[0:4]
        shape_x = [batch, num_groups, channels // num_groups,
                   height, width]
    elif data_format == "NHWC":
        # Reshape NHWC -> NHW[GD]
        batch, height, width, channels = shape_x[0:4]
        shape_x = [batch, height, width,
                   num_groups, channels // num_groups]

    x_input = tvm.placeholder(shape_x, name="x_input", dtype=input_dtype)

    # The compute function returns an iterable of output tensors.
    outputs = gn_training_reduce_compute(x_input, data_format, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(outputs)

    build_config = {"name": kernel_name,
                    "tensor_list": [x_input] + list(outputs)}
    te.lang.cce.cce_build_code(schedule, build_config)
def gelu_grad(input_dy, input_x, input_y, output_z, kernel_name="gelu_grad"):
    """
    Build the CCE kernel for the gelu gradient.

    Documented formula (implemented by `gelu_grad_compute`, not visible
    here): the result is dy*res' with
    res' = res/x +
           x*0.5*(1 - tanh(math_four)*tanh(math_four))*
           np.sqrt(2 / np.pi)*(1 + 3*0.044715*x2)
    math_four = (np.sqrt(2 / np.pi)*(x + 0.044715*tf.pow(x, 3)))

    Parameters
    ----------
    input_dy : dict
        shape and dtype of dy input, only support float16, float32;
        its dtype is used for all three placeholders
    input_x : dict
        shape of x input, must equal the shape of dy
    input_y : dict
        shape of y input, must equal the shape of dy
    output_z: dict
        description of the output, forwarded to `gelu_grad_compute`
    kernel_name : str
        cce kernel name, default value is gelu_grad

    Returns:
    -------
    none.
    """
    shape_dy = input_dy.get("shape")
    shape_x = input_x.get("shape")
    shape_y = input_y.get("shape")

    check_shape(shape_dy, param_name="input_dy")
    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")

    input_dtype = input_dy.get("dtype").lower()
    check_dtype(input_dtype, ("float16", "float32"),
                param_name="input_dy")

    # Compare as lists so tuples and lists with equal dims match.
    dims_dy, dims_x, dims_y = list(shape_dy), list(shape_x), list(shape_y)
    shapes_match = (operator.eq(dims_dy, dims_x)
                    and operator.eq(dims_dy, dims_y))
    if not shapes_match:
        raise RuntimeError("all input shape must be equal")

    # Element-wise op: fuse every axis into a single dimension.
    fused_shape = [reduceIns(lambda a, b: a * b, dims_dy)]
    data_dy = tvm.placeholder(fused_shape, name="data_dy",
                              dtype=input_dtype)
    data_x = tvm.placeholder(fused_shape, name="data_x",
                             dtype=input_dtype)
    data_gelu = tvm.placeholder(fused_shape, name="data_gelu",
                                dtype=input_dtype)
    res = gelu_grad_compute(data_dy, data_x, data_gelu, output_z,
                            kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data_dy, data_x, data_gelu, res],
    }
    te.lang.cce.cce_build_code(sch, config)
def atan_grad(y, dy, z, kernel_name="atan_grad"):
    """
    Build the CCE kernel for the gradient of atan(x).

    Parameters:
    ----------
    y : dict of y, include shape and dtype, dtype support float16, float32
    dy : dict of dy, include shape and dtype, dtype support float16,
        float32; must match y in shape and dtype
    z : dict of output, include shape and dtype
    kernel_name : cce kernel name, default value is atan_grad

    Algorithm :
    ----------
    forward :
        y = atan(x)
    backward gradient (as documented; implemented by `atan_grad_compute`):
        de/dx = dy/dx*de/dy = 1/(1+x^2)*grad

    Returns
    ----------
    None

    Raises
    ------
    RuntimeError
        if the two inputs differ in shape or in dtype.
    """
    # get the shape and dtype
    shape = y.get("shape")
    shape_grad = dy.get("shape")
    dtype = y.get("dtype")
    dtype_grad = dy.get("dtype")

    # check whether the shape is right
    check_shape(shape, param_name="y")
    check_shape(shape_grad, param_name="dy")
    # Compare as lists: a tuple and a list holding the same dimensions
    # describe the same shape and must not be rejected.
    if list(shape) != list(shape_grad):
        raise RuntimeError("all input shape must be the same")
    shape, _ = refine_shape_axes(shape, [])

    # check whether dtypes are fp16,fp32 and whether they are the same
    check_list = ("float16", "float32")
    check_dtype(dtype, check_list, param_name="y")
    check_dtype(dtype_grad, check_list, param_name="dy")
    dtype = dtype.lower()
    if dtype != dtype_grad.lower():
        raise RuntimeError("all input dtype must be same")

    # get 2 input placeholders: data_input, grad
    data_input = tvm.placeholder(shape, name="input_data", dtype=dtype)
    grad = tvm.placeholder(shape, name="input_grad", dtype=dtype)

    # compute the backward gradient
    res = atan_grad_compute(data_input, grad, z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_input, grad, res]}
    te.lang.cce.cce_build_code(sch, config)
def floor_mod(x1, x2, y, kernel_name="floor_mod"):
    """
    Build the CCE kernel computing the remainder of division with
    broadcasting, support fp16, fp32, int32.

    Documented formula (implemented by `floor_mod_compute`):
    res = x1 - floor(x1 / x2) * x2

    Parameters
    ----------
    x1: dict
        dict{"shape":tuple or list,"dtype":str}
        shape of data
        the data type, src_dtype equals dst_dtype, support fp16,fp32,int32
    x2: dict
        dict{"shape":tuple or list,"dtype":str}
        shape of data
        the data type, must equal the dtype of x1
    y: dict, reserved field
        dict with keys(shape and dtype) of output
    kernel_name: str
        cce kernel name, default value is "floor_mod"

    Returns
    ------
    None
    """
    # Collect dtype and shape of both operands.
    lhs_dtype = x1.get("dtype").lower()
    lhs_shape = x1.get("shape")
    rhs_dtype = x2.get("dtype").lower()
    rhs_shape = x2.get("shape")

    check_shape(lhs_shape, param_name="x1")
    check_shape(rhs_shape, param_name="x2")

    supported_dtypes = ("float16", "float32", "int32")
    check_dtype(lhs_dtype, supported_dtypes, param_name="x1")
    check_dtype(rhs_dtype, supported_dtypes, param_name="x2")

    if lhs_dtype != rhs_dtype:
        raise RuntimeError("the type of dtype in two dict is not the same")

    # Only the two broadcast-aligned shapes are needed; the broadcast
    # result shape is ignored here.
    lhs_shape, rhs_shape, _ = broadcast_shapes(
        lhs_shape, rhs_shape,
        param_name_input1="x1", param_name_input2="x2")
    lhs_shape, rhs_shape = refine_shapes_for_broadcast(lhs_shape,
                                                       rhs_shape)

    input_data_x = tvm.placeholder(lhs_shape, name="input_data_x",
                                   dtype=lhs_dtype)
    input_data_y = tvm.placeholder(rhs_shape, name="input_data_y",
                                   dtype=rhs_dtype)
    result = floor_mod_compute(input_data_x, input_data_y, y, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    build_config = {
        "name": kernel_name,
        "tensor_list": [input_data_x, input_data_y, result],
    }
    te.lang.cce.cce_build_code(schedule, build_config)
def sqrt(input_x, output_y, kernel_name="sqrt"):
    """
    Entry point of the custom sqrt operator: validate the input
    description, build the compute via sqrt_compute, schedule and compile.

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    output_y : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "sqrt"

    Returns
    -------
    None
    """
    # Banner printed on every call. The Chinese text reads: "when you see
    # this sentence, it means my custom sqrt operator has been executed".
    print("=================当你看到这句话时,说明我这个自定义sqrt算子被执行了============================")

    # operator check
    shape = input_x.get("shape")
    input_dtype = input_x.get("dtype").lower()
    util.check_shape_rule(shape)
    util.check_tensor_shape_size(shape)
    util.check_kernel_name(kernel_name)

    # operator compute, invoke sqrt_compute
    placeholder = tvm.placeholder(shape, name="data_input", dtype=input_dtype)
    result = sqrt_compute(placeholder, output_y, kernel_name)

    # auto schedule
    with tvm.target.cce():
        sch = generic.auto_schedule(result)

    # operator build
    build_config = {
        "name": kernel_name,
        "tensor_list": [placeholder, result],
    }
    te.lang.cce.cce_build_code(sch, build_config)
def softplus_grad(input_gradients, input_features, output_backprops,
                  kernel_name="softplus_grad"):
    """
    Computes softplus gradients for a softplus operation.
    The gradients: "dy * exp(x) / (1 + exp(x))".

    Parameters
    ----------
    input_gradients: dict
        The backpropagated gradients to the corresponding softplus operation.
    input_features: dict
        The input_features passed as input to the corresponding softplus
        operation. source data type support "float16", "float32", "int32",
        "int8", "uint8".
    output_backprops: dict
        data of output.
    kernel_name: str
        kernel name, default value is "softplus_grad".

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if the dtypes of input_gradients and input_features differ
        (compared case-insensitively)
    """
    shape_dy = input_gradients.get("shape")
    dtype_dy = input_gradients.get("dtype")
    shape_x = input_features.get("shape")
    dtype_x = input_features.get("dtype")

    # The dtype comparison runs before any other validation.
    if dtype_dy.lower() != dtype_x.lower():
        # Implicit literal concatenation keeps the message clean; a backslash
        # continuation inside the literal leaks stray characters into it.
        raise RuntimeError("type of dy and type of x must be same, "
                           "while the types are different")
    input_dtype = dtype_dy.lower()

    check_shape(shape_dy, param_name="input_gradients")
    check_shape(shape_x, param_name="input_features")

    # Both dtypes are equal at this point, so checking one is sufficient.
    check_list = ("float16", "float32", "int32", "int8", "uint8")
    check_dtype(input_dtype, check_list, param_name="input_gradients")

    # The third value returned by broadcast_shapes is not used here.
    shape_dy, shape_x, _ = broadcast_shapes(
        shape_dy, shape_x,
        param_name_input1="input_gradients",
        param_name_input2="input_features")
    reshape_dy, reshape_x = refine_shapes_for_broadcast(shape_dy, shape_x)

    data_dy = tvm.placeholder(reshape_dy, name="data_dy", dtype=input_dtype)
    data_x = tvm.placeholder(reshape_x, name="data_x", dtype=input_dtype)

    res = softplus_grad_compute(data_dy, data_x, output_backprops,
                                kernel_name=kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_dy, data_x, res]}
    te.lang.cce.cce_build_code(sch, config)
def log(input_x, output_y, base=-1.0, scale=1.0, shift=0.0, kernel_name="log"):
    """
    Entry point of the log operator: validate the arguments, flatten the
    input to one dimension, then build the kernel through log_compute.

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, dtype only supports float16, float32
    output_y : dict
        shape and dtype of output, should be same shape and type as input
    base : float
        logarithm base, must be strictly positive or -1, default value is -1.0
        (NOTE(review): -1 presumably selects the natural logarithm - confirm
        in log_compute)
    scale : float
        forwarded to log_compute, default value is 1.0
    shift : float
        forwarded to log_compute, default value is 0.0
    kernel_name : str
        kernel name, default value is "log"

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if base is not strictly positive and is not close to -1
    """
    shape = input_x.get("shape")
    input_dtype = input_x.get("dtype").lower()

    # input_x' shape check
    op_utils.check_shape(shape, param_name="input_x")

    # input_x' dtype check, only supports fp16 and fp32
    op_utils.check_dtype(input_dtype, ("float16", "float32"),
                         param_name="input_x")

    # base must be strictly positive, or (approximately) the sentinel -1
    if base <= 0 and not isclose(base, -1.0):
        error_info = {
            'errCode': 'E80000',
            'param_name': 'base',
            'op_name': 'log',
            'expect_value': "strictly positive or -1",
            'real_value': base,
        }
        message = "In op[%s], the parameter[%s] should be [%s], but actually is [%s]." % (
            error_info['op_name'],
            error_info['param_name'],
            error_info['expect_value'],
            error_info['real_value'],
        )
        raise RuntimeError(message)

    # collapse all dimensions into a single one (product of the dims)
    element_count = reduceIns(lambda lhs, rhs: lhs * rhs, shape[:])
    data_input = tvm.placeholder([element_count], name="data_input",
                                 dtype=input_dtype)

    result = log_compute(data_input, output_y, base, scale, shift, kernel_name)

    # auto schedule
    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    # operator build
    te.lang.cce.cce_build_code(
        schedule,
        {
            "name": kernel_name,
            "need_build": True,
            "tensor_list": (data_input, result),
        },
    )