def _infer_shape(format_pattern, x, y):
    shape_x = x.get("shape")
    shape_y = y.get("shape")
    ori_shape_x = x.get("ori_shape")
    ori_shape_y = y.get("ori_shape")
    shape_x = util.scalar2tensor_one(shape_x)
    shape_y = util.scalar2tensor_one(shape_y)

    if format_pattern == 1:
        ori_shape_x, shape_y, _ = util.produce_shapes(ori_shape_x, shape_y)
        if shape_y[-2] == 1 and shape_y[-1] == ori_shape_x[-1]:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-3] = 1
            shape_y[-1] = shape_x[-1]
            shape_y[-4] = shape_x[-4]
        elif shape_y[-2] == ori_shape_x[-2] and shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-4] = 1
            shape_y[-2] = shape_x[-2]
            shape_y[-3] = shape_x[-3]
        elif shape_y[-2] == shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)
    elif format_pattern == 2:
        shape_x, ori_shape_y, _ = util.produce_shapes(shape_x, ori_shape_y)
        if shape_x[-2] == 1 and shape_x[-1] == ori_shape_y[-1]:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-3] = 1
            shape_x[-1] = shape_y[-1]
            shape_x[-4] = shape_y[-4]
        elif shape_x[-2] == ori_shape_y[-2] and shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-4] = 1
            shape_x[-2] = shape_y[-2]
            shape_x[-3] = shape_y[-3]
        elif shape_x[-2] == shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)

    return shape_x, shape_y

def select_v2_compute(condition, x1, x2, y, kernel_name="select_v2"):
    """
    compute for select_v2

    Parameters
    ----------
    condition: TVM tensor
        the placeholder of input condition
    x1: TVM tensor
        the placeholder of input x1
    x2: TVM tensor
        the placeholder of input x2
    y: dict
        dict of y
    kernel_name: str
        cce kernel name, default value is "select_v2"

    Returns
    -------
    res: TVM tensor
        the result of compute
    """
    num_dtype = x1.dtype
    condition_dtype = condition.dtype
    x1 = te.lang.cce.cast_to(x1, "float32")
    x2 = te.lang.cce.cast_to(x2, "float32")
    condition = te.lang.cce.cast_to(condition, "float32")

    shape_x1list = te.lang.cce.util.shape_to_list(x1.shape)
    shape_x2list = te.lang.cce.util.shape_to_list(x2.shape)
    con_shapelist = te.lang.cce.util.shape_to_list(condition.shape)
    shape_x1list, con_shapelist, shape_max_x1 = util.produce_shapes(
        shape_x1list, con_shapelist)
    shape_x2list, shape_max_x1, shape_max = util.produce_shapes(
        shape_x2list, shape_max_x1)

    x1 = te.lang.cce.broadcast(x1, shape_max)
    x2 = te.lang.cce.broadcast(x2, shape_max)
    condition = te.lang.cce.broadcast(condition, shape_max)

    ones = te.lang.cce.broadcast(tvm.const(VALUE_ONE, dtype="float32"),
                                 shape_max, output_dtype="float32")
    res = te.lang.cce.vcmpsel(condition, rhs=ones, operation='eq',
                              slhs=x1, srhs=x2)
    res = te.lang.cce.cast_to(res, num_dtype)
    return res

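# Illustrative NumPy reference for the selection semantics above (not part of
# the operator): after broadcasting, the result takes x1 where condition is
# true and x2 elsewhere.
import numpy as np

def select_v2_reference(condition, x1, x2):
    """Hypothetical host-side helper mirroring select_v2_compute."""
    condition, x1, x2 = np.broadcast_arrays(np.asarray(condition, dtype=bool),
                                            x1, x2)
    return np.where(condition, x1, x2)

# Example: the condition broadcasts against x1 / x2.
# select_v2_reference(np.array([True, False]),
#                     np.array([[1, 2], [3, 4]]),
#                     np.zeros((2, 2)))  -> [[1, 0], [3, 0]]
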
def fake_quant_per_layer(x, min_val, max_val, y,
                         symmetric, narrow_range, num_bits,
                         kernel_name="fake_quant_per_layer"):
    """FakeQuantPerLayer"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]), )
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    quant_min = 0
    quant_max = 2**num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res = fake_quant_per_layer_compute(input_data, min_data, max_data, y,
                                       quant_min, quant_max, symmetric,
                                       kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [input_data, min_data, max_data, res]
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)

def custom_subtract(shape_x, shape_y, dtype, kernel_name="cce_subtract",
                    need_build=True, need_print=True):
    """
    do element-wise subtract operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input data1
    shape_y : shape of input data2
    dtype : source data type, support float16, float32, int32
    kernel_name : cce kernel name, default value is "cce_subtract"
    need_build : if need to build CCEC kernel, default value is True
    need_print : if need to print the ir, default value is True

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)
    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    dtype = dtype.lower()
    if not (dtype in check_list):
        raise RuntimeError(
            "tf_subtract_cce only support %s while dtype is %s"
            % (",".join(check_list), dtype))

    print("######## shape")
    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    data1 = tvm.placeholder(shape_x, dtype=dtype, name="data1")
    data2 = tvm.placeholder(shape_y, dtype=dtype, name="data2")

    with tvm.target.cce():
        data1_tmp1 = te.lang.cce.broadcast(data1, shape_max)
        data2_tmp1 = te.lang.cce.broadcast(data2, shape_max)
        res = te.lang.cce.vsub(data1_tmp1, data2_tmp1)
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": need_print,
        "need_build": need_build,
        "name": kernel_name,
        "tensor_list": [data1, data2, res]
    }
    te.lang.cce.cce_build_code(sch, config)

def custom_equal(shape_x, shape_y, dtype, kernel_name="cce_tf_equal",
                 need_build=False, need_print=False):
    """
    do element-wise equal operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input x
    shape_y : shape of input y
    dtype : source data type, support float16, float32, int32, int8, uint8, bool
    kernel_name : cce kernel name, default value is "cce_tf_equal"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)

    check_list = ["float16", "float32", "int32", "int8", "uint8", "bool"]
    dtype = dtype.lower()
    if not (dtype in check_list):
        raise RuntimeError(
            "tf_equal_cce only support %s while dtype is %s"
            % (",".join(check_list), dtype))

    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    x = tvm.placeholder(shape_x, dtype=dtype, name="x")
    y = tvm.placeholder(shape_y, dtype=dtype, name="y")

    x_tmp = te.lang.cce.broadcast(x, shape_max)
    y_tmp = te.lang.cce.broadcast(y, shape_max)

    res = tvm.compute(shape_max, lambda *i: x_tmp(*i) == y_tmp(*i), name='res')

    sch = tvm.create_schedule(res.op)

    if need_print:
        with build_config:
            print(tvm.lower(sch, [x, y, res], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(sch, [x, y, res], "cce", name=kernel_name)

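# Illustrative NumPy reference for the element-wise comparison above (not part
# of the operator): both inputs are broadcast to a common shape, then compared,
# matching tf.equal semantics.
import numpy as np

def equal_reference(x, y):
    """Hypothetical host-side check mirroring custom_equal."""
    return np.equal(x, y)  # NumPy broadcasts the inputs itself

# equal_reference(np.array([1, 2, 3]), 2) -> [False, True, False]
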
def _shape_check(shape_x1, shape_x2, shape_tgt):
    # check whether the shapes meet the broadcast requirements,
    # and output the broadcast shape
    try:
        _, _, x_shape = util.produce_shapes(shape_x1, shape_x2)
    except RuntimeError:
        raise RuntimeError("x1 and x2 can't be broadcast")

    x_shape_reduce = x_shape[:]
    x_shape_reduce.pop(1)
    try:
        _, _, tgt_shape = util.produce_shapes(x_shape_reduce, shape_tgt)
    except RuntimeError:
        raise RuntimeError("x and target can't be broadcast")

    min_dim = min(len(shape_x1), len(shape_x2), len(shape_tgt))
    if min_dim >= 3:
        # merge trailing dimensions that agree across the inputs
        reduce_dim = -1
        for i in range(-1, -min_dim, -1):
            if (shape_x1[i] == shape_x2[i]) or (shape_x1[i] == shape_tgt[i]):
                reduce_dim = i
            else:
                break
        if reduce_dim != -1:
            shape_x1 = list(shape_x1[:reduce_dim]) + [
                reduce(lambda x, y: x * y, shape_x1[reduce_dim:])
            ]
            shape_x2 = list(shape_x2[:reduce_dim]) + [
                reduce(lambda x, y: x * y, shape_x2[reduce_dim:])
            ]
            shape_tgt = list(shape_tgt[:reduce_dim]) + [
                reduce(lambda x, y: x * y, shape_tgt[reduce_dim:])
            ]
            x_shape = list(x_shape[:reduce_dim]) + [
                reduce(lambda x, y: x * y, x_shape[reduce_dim:])
            ]
            tgt_shape = list(tgt_shape[:reduce_dim]) + [
                reduce(lambda x, y: x * y, tgt_shape[reduce_dim:])
            ]

    util.check_shape_rule(shape_x1)
    util.check_shape_rule(shape_x2)
    util.check_shape_rule(shape_tgt)
    util.check_tensor_shape_size(shape_x1)
    util.check_tensor_shape_size(shape_x2)
    util.check_tensor_shape_size(shape_tgt)

    return x_shape, tgt_shape, shape_x1, shape_x2, shape_tgt

def leaky_relu_grad(g, x, y, negative_slope=0, kernel_name="leaky_relu_grad"):
    """
    calculate the backpropagation of leaky_relu operation
    y = gradients(x>0) or negative_slope*gradients(x<=0).

    support dtype: float16, float32

    Parameters
    ----------
    g : dict
        the backpropagated gradients to the corresponding leaky_relu operation
    x : dict
        the x passed as the output of the corresponding leaky_relu operation
    y : dict
        the output of leaky_relu back propagation
    negative_slope : float or int
        allow non-zero slope for negative inputs to speed up optimization
    kernel_name : str
        kernel name, default value is "leaky_relu_grad"

    Returns
    -------
    None
    """
    shape_g = g.get("shape")
    shape_x = x.get("shape")
    dtype_g = g.get("dtype").lower()
    dtype_x = x.get("dtype").lower()

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_g)
    util.check_shape_rule(shape_x)
    util.check_tensor_shape_size(shape_g)
    util.check_tensor_shape_size(shape_x)

    shape_list = util.produce_shapes(shape_g, shape_x)
    util.check_tensor_shape_size(shape_list[2])

    # check input tensor data_type
    check_list = ["float16", "float32"]
    util.check_dtype_rule(dtype_g, check_list)
    util.check_dtype_rule(dtype_x, check_list)
    util.compare_tensor_dict_key(g, x, "dtype")

    shape_g, shape_x = refine_shapes_for_broadcast(shape_list[0],
                                                   shape_list[1])
    data_g = tvm.placeholder(shape_g, name="data_g", dtype=dtype_g)
    data_x = tvm.placeholder(shape_x, name="data_x", dtype=dtype_g)

    res = leaky_relu_grad_compute(data_g, data_x, y, negative_slope,
                                  kernel_name)
    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_g, data_x, res]}
    te.lang.cce.cce_build_code(schedule, config)

def minmax_update_perlayer(x, min_val, max_val, min_up, max_up,
                           ema, ema_decay,
                           kernel_name="minmax_update_perlayer"):
    """MinMaxUpdatePerLayer op"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]), )
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res_list = minmax_update_perlayer_compute(input_data, min_data, max_data,
                                              ema, ema_decay)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)

def axpy_v2_compute(x1, x2, alpha, y, kernel_name="axpy_v2"):
    """
    calculating data

    Parameters
    ----------
    x1 : TVM tensor
        the placeholder of input_x
    x2 : TVM tensor
        the placeholder of x2
    alpha : TVM tensor
        scalar of mul-factor
    y : dict
        dict of y, include keys(shape and dtype)
    kernel_name : str
        kernel name, default value is "axpy_v2"

    Returns
    -------
    output tensor
    """
    # broadcast
    shape_x1 = te.lang.cce.util.shape_to_list(x1.shape)
    shape_x2 = te.lang.cce.util.shape_to_list(x2.shape)

    dtype_alpha = alpha.dtype.lower()
    dtype = x1.dtype.lower()
    precision_dtype = "float32"

    if dtype != precision_dtype:
        x1 = te.lang.cce.cast_to(x1, precision_dtype)
        x2 = te.lang.cce.cast_to(x2, precision_dtype)
    if dtype_alpha != precision_dtype:
        alpha = te.lang.cce.cast_to(alpha, precision_dtype)

    if shape_x1 != shape_x2:
        # if shape not equal, then apply broadcast.
        shape_x, shape_y, shape_max = util.produce_shapes(shape_x1, shape_x2)
        x1 = te.lang.cce.broadcast(x1, shape_max)
        x2 = te.lang.cce.broadcast(x2, shape_max)
        alpha = te.lang.cce.broadcast(alpha, shape_max)
    else:
        alpha = te.lang.cce.broadcast(alpha, shape_x1)

    # fused multiply-add: res = x2 * alpha + x1
    res = te.lang.cce.vmla(x2, alpha, x1)
    res = te.lang.cce.cast_to(res, dtype)

    return res

def mul_no_nan_compute(input_x1, input_x2, output_y, kernel_name="mul_no_nan"):
    """
    calculating data

    Parameters
    ----------
    input_x1 : TVM tensor
        the placeholder of input_x1
    input_x2 : TVM tensor
        the placeholder of input_x2
    output_y : dict
        dict of output_y, include keys(shape and dtype)
    kernel_name : str
        kernel name, default value is "mul_no_nan"

    Returns
    -------
    output tensor
    """
    # reference semantics:
    # np.where(np.equal(y, 0.), np.zeros((), dtype=dtype), np.multiply(x, y))
    src_dtype = input_x1.dtype.lower()
    shape_x1 = te.lang.cce.util.shape_to_list(input_x1.shape)
    shape_x2 = te.lang.cce.util.shape_to_list(input_x2.shape)
    shape_x1, shape_x2, shape_max = util.produce_shapes(shape_x1, shape_x2)
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    input_x1 = te.lang.cce.broadcast(input_x1, shape_max)
    input_x2 = te.lang.cce.broadcast(input_x2, shape_max)

    mul_res = te.lang.cce.vmul(input_x1, input_x2)
    zero = tvm.const(0, dtype=src_dtype)
    zeros = te.lang.cce.broadcast(zero, shape_max)
    res = te.lang.cce.vcmpsel(input_x2, zeros,
                              operation='eq',
                              slhs=zeros,
                              srhs=mul_res)
    return res

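# Illustrative NumPy reference for mul_no_nan (not part of the operator):
# wherever the second operand is exactly zero the product is forced to zero,
# which suppresses NaN/Inf coming from 0 * Inf or 0 * NaN in the first operand.
import numpy as np

def mul_no_nan_reference(x, y):
    """Hypothetical host-side check mirroring mul_no_nan_compute."""
    x, y = np.broadcast_arrays(x, y)
    return np.where(np.equal(y, 0.0), np.zeros_like(y), np.multiply(x, y))

# mul_no_nan_reference(np.array([np.inf, 2.0]), np.array([0.0, 3.0])) -> [0., 6.]
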
def xdivy_grad(x1, x2, grad, y1, y2, kernel_name="xdivy_grad"):
    """
    Returns gradient of xdivy(x, y) with respect to x and y.

    Parameters
    ----------
    x1 : dict
        shape and dtype of input, only support float16, float32
    x2 : dict
        shape and dtype of input, only support float16, float32
    grad : dict
        shape and dtype of input, only support float16, float32
    y1 : dict
        shape and dtype of output, should be same shape and type as input
    y2 : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "xdivy_grad"

    Returns
    -------
    None
    """
    shape_x1 = x1.get("shape")
    dtype_x1 = x1.get("dtype").lower()
    shape_x2 = x2.get("shape")
    dtype_x2 = x2.get("dtype").lower()
    shape_grad = grad.get("shape")
    dtype_grad = grad.get("dtype").lower()
    if dtype_x1 != dtype_x2 or dtype_x2 != dtype_grad or dtype_grad != dtype_x1:
        raise RuntimeError("the type of x1, x2 and grad must be the same.")

    op_utils.check_shape(shape_x1, param_name="x1")
    op_utils.check_shape(shape_x2, param_name="x2")
    op_utils.check_shape(shape_grad, param_name="grad")

    check_list = ("float16", "float32")
    op_utils.check_dtype(dtype_x1, check_list, param_name="x1")

    shape_x1, shape_x2, shape_max_x1x2 = util.produce_shapes(
        shape_x1, shape_x2)

    if len(shape_max_x1x2) < len(shape_grad):
        raise RuntimeError(
            "the length of shape_grad can not be longer than the maximum "
            "length of x1 and x2.")

    shape_grad, _, shape_max = util.produce_shapes(shape_grad, shape_max_x1x2)

    for (x, y) in zip(shape_max_x1x2, shape_grad):
        if x < y:
            raise RuntimeError("this shape is not supported.")

    op_utils.check_shape(shape_max, param_name="x")

    rx, ry = _broadcast_gradient_args(shape_x1, shape_x2)

    x1 = tvm.placeholder(shape_x1, name="x", dtype=dtype_x1)
    x2 = tvm.placeholder(shape_x2, name="y", dtype=dtype_x1)
    grad = tvm.placeholder(shape_grad, name="grad", dtype=dtype_x1)

    output_y1, output_y2 = xdivy_grad_compute([x1, x2, grad], shape_max,
                                              dtype_x1, rx, ry)

    with tvm.target.cce():
        sch = generic.auto_schedule([output_y1, output_y2])

    config = {
        "name": kernel_name,
        "tensor_list": [x1, x2, grad, output_y1, output_y2]
    }
    te.lang.cce.cce_build_code(sch, config)

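# Illustrative NumPy sketch of the gradient math that xdivy_grad_compute is
# expected to implement (assumption: TensorFlow's xdivy convention,
# xdivy(x, y) = 0 when x == 0, otherwise x / y). This is a host-side reference,
# not the device implementation, and it omits the reduction over the broadcast
# axes rx / ry that the operator above performs.
import numpy as np

def xdivy_grad_reference(x, y, grad):
    """Hypothetical reference: returns (dx, dy) before broadcast reduction."""
    x, y, grad = np.broadcast_arrays(x, y, grad)
    not_zero = np.not_equal(x, 0.0)
    dx = np.where(not_zero, grad / y, np.zeros_like(grad))
    dy = np.where(not_zero, -grad * x / (y * y), np.zeros_like(grad))
    return dx, dy
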
def threshold_grad_v2_d(input_gradients, input_features, output_backprops,
                        threshold, kernel_name="threshold_grad_v2_d"):
    """
    calculating data

    Parameters
    ----------
    input_gradients : dict
        shape and dtype of input_gradients
    input_features : dict
        shape and dtype of input_features
    output_backprops : dict
        shape and dtype of output_backprops,
        should be same shape and type as inputs
    threshold : dict
        shape and dtype of threshold, 0-dimensional array
    kernel_name : str
        kernel name, default value is "threshold_grad_v2_d"

    Returns
    -------
    None
    """
    shape_input_gradients = input_gradients.get("shape")
    dtype_input_gradients = input_gradients.get("dtype").lower()
    shape_input_features = input_features.get("shape")
    dtype_input_features = input_features.get("dtype").lower()

    shape_list = util.produce_shapes(shape_input_gradients,
                                     shape_input_features)
    util.check_tensor_shape_size(shape_list[2])
    shape_input_gradients, shape_input_features = \
        refine_shapes_for_broadcast(shape_list[0], shape_list[1])

    check_list = ("float16", "float32", "int32", "int8", "uint8")
    check_dtype(dtype_input_gradients, check_list)
    check_dtype(dtype_input_features, check_list)

    data_input_gradients = tvm.placeholder(shape_input_gradients,
                                           name="data_input_gradients",
                                           dtype=dtype_input_gradients)
    data_input_features = tvm.placeholder(shape_input_features,
                                          name="data_input_features",
                                          dtype=dtype_input_features)

    res = threshold_grad_v2_d_compute(data_input_gradients,
                                      data_input_features,
                                      output_backprops, threshold,
                                      kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": [data_input_gradients, data_input_features, res]
    }

    te.lang.cce.cce_build_code(schedule, config)

def select_v2(condition, x1, x2, y, kernel_name="select_v2"):
    """
    Selects elements from `x1` or `x2`, depending on `condition`.

    Parameters
    ----------
    condition: dict
        dict of condition, include keys(shape and dtype), only support bool
    x1: dict
        dict of x1, only support float16, float32, int32, int8, uint8
    x2: dict
        dict of x2, only support float16, float32, int32, int8, uint8
    y: dict
        dict of output
    kernel_name: str
        cce kernel name, default value is "select_v2"

    Returns
    -------
    None
    """
    shape_x1 = x1.get("shape")
    dtype_x1 = x1.get("dtype")
    shape_x2 = x2.get("shape")
    dtype_x2 = x2.get("dtype")
    bool_dtype = condition.get("dtype")
    con_shape = condition.get("shape")

    shape_x1, con_shape, shape_max_x1 = util.produce_shapes(
        shape_x1, con_shape)
    shape_x2, con_shape, shape_max_x2 = util.produce_shapes(
        shape_x2, con_shape)

    if shape_x1[-1] == 1 and shape_x2[-1] == 1 and con_shape[-1] == 1 \
            and shape_max_x1[-1] == 1:
        shape_x1 = shape_x1 if len(shape_x1) == 1 else shape_x1[:-1]
        shape_x2 = shape_x2 if len(shape_x2) == 1 else shape_x2[:-1]
        con_shape = con_shape if len(con_shape) == 1 else con_shape[:-1]

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x1)
    util.check_tensor_shape_size(shape_x1)

    if shape_x1 == shape_x2 == con_shape:
        shape_x1 = (functools_reduce(lambda x, y: x * y, shape_x1[:]), )
        shape_x2 = (functools_reduce(lambda x, y: x * y, shape_x2[:]), )
        con_shape = (functools_reduce(lambda x, y: x * y, con_shape[:]), )

    dtype_x1 = dtype_x1.lower()
    dtype_x2 = dtype_x2.lower()
    check_list = ("float16", "float32", "int32", "int8", "uint8")
    util.check_dtype_rule(dtype_x1, check_list)
    if dtype_x1 != dtype_x2:
        raise RuntimeError("Dtype of tensor x1 and x2 must be equal!")

    bool_dtype = bool_dtype.lower()
    bool_check_list = ("bool", "int8", "uint8")
    util.check_dtype_rule(bool_dtype, bool_check_list)

    condition = tvm.placeholder(con_shape, name="condition", dtype=bool_dtype)
    input_then = tvm.placeholder(shape_x1, name="input_then", dtype=dtype_x1)
    input_else = tvm.placeholder(shape_x2, name="input_else", dtype=dtype_x2)

    with tvm.target.cce():
        res = select_v2_compute(condition, input_then, input_else, y,
                                kernel_name)
        sch = generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": [condition, input_then, input_else, res],
        "bool_storage_as_1bit": False
    }
    te.lang.cce.cce_build_code(sch, config)

def sigmoid_cross_entropy_with_logits_grad_v2_compute(predict, target, dout,
                                                      weight, pos_weight,
                                                      reduction="mean"):
    """
    :param predict: TVM tensor, the placeholder of predict
    :param target: TVM tensor, the placeholder of target
    :param dout: TVM tensor, the placeholder of dout
    :param weight: TVM tensor, the placeholder of weight
    :param pos_weight: TVM tensor, the placeholder of pos_weight
    :param reduction: str, specifies the reduction mode: 'none' | 'mean' | 'sum'
    :return: TVM tensor
    """
    predict_shape = te.lang.cce.util.shape_to_list(predict.shape)
    predict_dtype = predict.dtype
    precision_dtype = "float32"

    if predict.dtype.lower() == "float16":
        predict = te.lang.cce.cast_to(predict, precision_dtype)
        target = te.lang.cce.cast_to(target, precision_dtype)

    # calculate sigmoid(predict)
    exp_predict = te.lang.cce.vexp(predict)
    exp_add1 = te.lang.cce.vadds(exp_predict, tvm.const(1, precision_dtype))
    sigmoid_tmp = te.lang.cce.vdiv(exp_predict, exp_add1)
    sigmoid_res = te.lang.cce.cast_to(sigmoid_tmp, precision_dtype)

    # calculate the result of
    # gradient = ((log_weight + 1 - target) * sigmoid(predict) - log_weight) * dout
    if pos_weight is not None:
        pos_weight_shape = te.lang.cce.util.shape_to_list(pos_weight.shape)
        if pos_weight_shape != predict_shape:
            _, _, broadcast_pos_shape = util.produce_shapes(pos_weight_shape,
                                                            predict_shape)
            pos_weight = te.lang.cce.broadcast(pos_weight,
                                               broadcast_pos_shape,
                                               precision_dtype)

        log_weight = te.lang.cce.vmul(pos_weight, target)
        weight_tmp = te.lang.cce.vadds(log_weight,
                                       tvm.const(1, precision_dtype))
        weight_sub = te.lang.cce.vsub(weight_tmp, target)
        grad_tmp = te.lang.cce.vmul(weight_sub, sigmoid_res)
        grad_cur = te.lang.cce.vsub(grad_tmp, log_weight)
        grad_output = te.lang.cce.vmul(grad_cur, dout)
    else:
        grad_cur = te.lang.cce.vsub(sigmoid_res, target)
        grad_output = te.lang.cce.vmul(grad_cur, dout)

    # calculate the result of gradient = gradient * weight
    if weight is not None:
        weight_shape = te.lang.cce.util.shape_to_list(weight.shape)
        if weight_shape != predict_shape:
            _, _, broadcast_weight_shape = util.produce_shapes(weight_shape,
                                                               predict_shape)
            weight = te.lang.cce.broadcast(weight, broadcast_weight_shape,
                                           precision_dtype)
        grad_output = te.lang.cce.vmul(grad_output, weight)

    # calculate the result of gradient = gradient / num
    if reduction == "mean":
        num = reduce(lambda x, y: x * y, predict_shape)
        norm = 1.0 / num
        grad_output = te.lang.cce.vmuls(grad_output, norm)

    grad_output = te.lang.cce.cast_to(grad_output, predict_dtype)

    return grad_output

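# Illustrative NumPy sketch of the gradient formula implemented above (a
# host-side reference, not the device code): with p = sigmoid(predict) and
# log_weight = pos_weight * target,
#   grad = ((log_weight + 1 - target) * p - log_weight) * dout
# optionally scaled by weight, and by 1/N when reduction == "mean".
import numpy as np

def bce_with_logits_grad_reference(predict, target, dout,
                                   weight=None, pos_weight=None,
                                   reduction="mean"):
    """Hypothetical reference mirroring the compute function above."""
    p = 1.0 / (1.0 + np.exp(-predict))
    if pos_weight is not None:
        log_weight = pos_weight * target
        grad = ((log_weight + 1.0 - target) * p - log_weight) * dout
    else:
        grad = (p - target) * dout
    if weight is not None:
        grad = grad * weight
    if reduction == "mean":
        grad = grad / predict.size
    return grad
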
def _broadcast_shape_check(input_shape, target_shape):
    try:
        util.produce_shapes(input_shape, target_shape)
    except RuntimeError:
        raise RuntimeError("input_shape can't be broadcast to target_shape")

def axpy_compute(x1, x2, y, alpha, kernel_name="axpy"):
    """
    calculating data

    Parameters
    ----------
    x1 : TVM tensor
        the placeholder of input_x
    x2 : TVM tensor
        the placeholder of x2
    y : dict
        dict of y, include keys(shape and dtype)
    alpha : float
        scalar of mul-factor
    kernel_name : str
        kernel name, default value is "axpy"

    Returns
    -------
    output tensor
    """
    # broadcast
    shape_x = te.lang.cce.util.shape_to_list(x1.shape)
    shape_y = te.lang.cce.util.shape_to_list(x2.shape)
    dtype = x1.dtype.lower()

    # neg_1_axis_flag
    neg_1_axis_flag = 0
    if shape_x != shape_y:
        # if shape not equal, then apply broadcast.
        shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)

        for i in range(len(shape_x) - 1):
            if shape_x[i] != shape_y[i]:
                neg_1_axis_flag = 1
                break

        util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)
        x1 = te.lang.cce.broadcast(x1, shape_max)
        x2 = te.lang.cce.broadcast(x2, shape_max)

    # start the main logic
    if tbe_platform.cce_conf.get_soc_spec("SOC_VERSION") == "Ascend910":
        if dtype in ("float16", "float32"):
            # fp16 or fp32
            if neg_1_axis_flag:
                res_muls = te.lang.cce.vmuls(x2, alpha)
                res = te.lang.cce.vadd(x1, res_muls)
            else:
                res = te.lang.cce.vaxpy(x2, x1, tvm.const(alpha, dtype=dtype))
        else:
            # int32
            if alpha != 1:
                # add+muls use fp32
                to_type = "float32"
                input_x_cast = te.lang.cce.cast_to(x1, to_type)
                input_y_cast = te.lang.cce.cast_to(x2, to_type)

                if neg_1_axis_flag:
                    res_muls = te.lang.cce.vmuls(x2, alpha)
                    res_tmp = te.lang.cce.vadd(x1, res_muls)
                else:
                    res_tmp = te.lang.cce.vaxpy(input_y_cast, input_x_cast,
                                                tvm.const(alpha,
                                                          dtype=to_type))

                res = te.lang.cce.cast_to(res_tmp, dtype)
            else:
                # if alpha == 1
                res = te.lang.cce.vadd(x2, x1)
    else:
        if dtype in ("float16", "float32"):
            # fp16 or fp32
            res_muls = te.lang.cce.vmuls(x2, alpha)
            res = te.lang.cce.vadd(x1, res_muls)
        else:
            # int32
            if alpha != 1:
                # add+muls use fp32
                to_type = "float32"
                input_x1_cast = te.lang.cce.cast_to(x1, to_type)
                input_x2_cast = te.lang.cce.cast_to(x2, to_type)

                res_muls = te.lang.cce.vmuls(input_x2_cast, alpha)
                res_tmp = te.lang.cce.vadd(input_x1_cast, res_muls)
                res = te.lang.cce.cast_to(res_tmp, dtype)
            else:
                # if alpha == 1
                res = te.lang.cce.vadd(x2, x1)

    return res

def axpy_v2(x1, x2, alpha, y, kernel_name="axpy_v2"):
    """
    calculating data

    Parameters
    ----------
    x1 : dict
        shape and dtype of input_x
    x2 : dict
        shape and dtype of input_y
    alpha : dict
        shape and dtype of alpha
        scalar apply to input_y: input_y * alpha
    y : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "axpy_v2"

    Returns
    -------
    None
    """
    # check kernel name
    util.check_kernel_name(kernel_name)

    # infer shape according to the format pattern
    format_pattern = _add_check_format(x1, x2)
    shape_x1, shape_x2 = _infer_shape(format_pattern, x1, x2)

    dtype_x1 = x1.get("dtype").lower()
    dtype_x2 = x2.get("dtype").lower()
    alpha_dtype = alpha.get("dtype").lower()
    alpha_shape = alpha.get("shape")

    # check shape
    shape_x1 = util.scalar2tensor_one(shape_x1)
    shape_x2 = util.scalar2tensor_one(shape_x2)
    alpha_shape = util.scalar2tensor_one(alpha_shape)
    op_utils.check_shape(shape_x1)
    op_utils.check_shape(shape_x2)
    op_utils.check_shape(alpha_shape)

    # check dtype
    dtype_list0 = ("float16", "float32", "int32")
    dtype_list1 = ("float16", "float32")
    check_dtype(dtype_x1, dtype_list0)
    check_dtype(dtype_x2, dtype_list0)
    check_dtype(alpha_dtype, dtype_list1)
    util.compare_tensor_dict_key(x1, x2, "dtype")

    # check alpha is 0D or 1D tensor
    if len(alpha_shape) and not util.is_scalar(alpha_shape):
        raise RuntimeError("alpha should be 0D or 1D tensor")

    # produce shapes
    shape_x1, shape_x2, shape_max = util.produce_shapes(shape_x1, shape_x2)
    if shape_x1[-1] == 1 and shape_x2[-1] == 1 and shape_max[-1] == 1:
        shape_x1 = shape_x1 if len(shape_x1) == 1 else shape_x1[:-1]
        shape_x2 = shape_x2 if len(shape_x2) == 1 else shape_x2[:-1]
        shape_max = shape_max if len(shape_max) == 1 else shape_max[:-1]
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)
    util.produce_shapes(shape_max, alpha_shape)

    shape_x1, shape_x2 = refine_shapes_for_broadcast(shape_x1, shape_x2)
    data_input_x1 = tvm.placeholder(shape_x1,
                                    name="data_input_x1", dtype=dtype_x1)
    data_input_x2 = tvm.placeholder(shape_x2,
                                    name="data_input_x2", dtype=dtype_x2)

    alpha_shape = tuple([1] * (len(shape_x1) - len(alpha_shape))) \
        + tuple(alpha_shape)
    alpha_input = tvm.placeholder(alpha_shape,
                                  name="alpha_input", dtype=alpha_dtype)

    res = axpy_v2_compute(data_input_x1, data_input_x2, alpha_input, y,
                          kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data_input_x1, data_input_x2,
                              alpha_input, res]}
    te.lang.cce.cce_build_code(schedule, config)

def custom_logical_and(shape_x, shape_y, dtype,
                       kernel_name="cce_tf_logical_and",
                       need_build=False, need_print=False):
    """
    do element-wise logical-and operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input data1
    shape_y : shape of input data2
    dtype : source data type, support "bool"
    kernel_name : cce kernel name, default value is "cce_tf_logical_and"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)

    check_list = ["bool"]
    if not (dtype.lower() in check_list):
        raise RuntimeError(
            "logical_and_cce only support %s while dtype is %s"
            % (",".join(check_list), dtype))

    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()

    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    data1 = tvm.placeholder(shape_x, dtype=inp_dtype, name="data1")
    data2 = tvm.placeholder(shape_y, dtype=inp_dtype, name="data2")

    with tvm.target.cce():
        data1_tmp1 = te.lang.cce.broadcast(data1, shape_max)
        data1_tmp2 = te.lang.cce.broadcast(data2, shape_max)
        min_value = tvm.const(0, dtype=inp_dtype)
        res = tvm.compute(
            shape_max,
            lambda *i: tvm.select(
                tvm.all(
                    tvm.any(data1_tmp1(*i) > min_value,
                            data1_tmp1(*i) < -min_value),
                    tvm.any(data1_tmp2(*i) > min_value,
                            data1_tmp2(*i) < -min_value)),
                True, False),
            name="res")

    sch = tvm.create_schedule(res.op)

    if need_print:
        with build_config:
            print(tvm.lower(sch, [data1, data2, res], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(sch, [data1, data2, res], "cce", name=kernel_name)

def custom_squared_difference(shape_x, shape_y, dtype,
                              kernel_name="cce_tf_squared_difference",
                              need_build=False, need_print=False):
    """
    algorithm: tf_squared_difference
    calculating data's tf_squared_difference, y = (x - y) * (x - y)

    Parameters
    ----------
    shape_x : shape of input x
    shape_y : shape of input y
    dtype : the data type, assume src_dtype equals dst_dtype,
        only support float16, float32, int32
    kernel_name : cce kernel name, default value is "cce_tf_squared_difference"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)
    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    if not dtype.lower() in check_list:
        raise RuntimeError(
            "tf_squared_difference_cce only support %s while dtype is %s"
            % (",".join(check_list), dtype))

    dtype = dtype.lower()
    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    data_x = tvm.placeholder(shape_x, dtype=dtype, name="data_x")
    data_y = tvm.placeholder(shape_y, dtype=dtype, name="data_y")

    with tvm.target.cce():
        data_x_tmp = te.lang.cce.broadcast(data_x, shape_max)
        data_y_tmp = te.lang.cce.broadcast(data_y, shape_max)
        data_sub = te.lang.cce.vsub(data_x_tmp, data_y_tmp)
        res = te.lang.cce.vmul(data_sub, data_sub)
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": need_print,
        "need_build": need_build,
        "name": kernel_name,
        "tensor_list": [data_x, data_y, res]
    }
    te.lang.cce.cce_build_code(sch, config)

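# Illustrative NumPy reference for squared_difference (not part of the
# operator): broadcast, subtract, then square, matching tf.math.squared_difference.
import numpy as np

def squared_difference_reference(x, y):
    """Hypothetical host-side check mirroring custom_squared_difference."""
    diff = np.subtract(x, y)  # broadcasting handled by NumPy
    return diff * diff

# squared_difference_reference(np.array([1.0, 5.0]), 3.0) -> [4., 4.]
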
def axpy(x1, x2, y, alpha, kernel_name="axpy"):
    """
    calculating data

    Parameters
    ----------
    x1 : dict
        shape and dtype of input_x
    x2 : dict
        shape and dtype of input_y
    y : dict
        shape and dtype of output, should be same shape and type as input
    alpha : float
        scalar apply to input_y: input_y * alpha
    kernel_name : str
        kernel name, default value is "axpy"

    Returns
    -------
    None
    """
    # check kernel name
    util.check_kernel_name(kernel_name)

    # infer shape according to the format pattern
    format_pattern = _add_check_format(x1, x2)
    shape_x1, shape_x2 = _infer_shape(format_pattern, x1, x2)

    # check shape
    shape_x1 = util.scalar2tensor_one(shape_x1)
    util.check_shape_size(shape_x1, SHAPE_SIZE_LIMIT)
    shape_x2 = util.scalar2tensor_one(shape_x2)
    util.check_shape_size(shape_x2, SHAPE_SIZE_LIMIT)
    util.check_shape_rule(shape_x1)
    util.check_tensor_shape_size(shape_x1)
    util.check_shape_rule(shape_x2)
    util.check_tensor_shape_size(shape_x2)

    # check dtype
    dtype_list = ("float16", "float32", "int32")
    dtype_x1 = x1.get("dtype").lower()
    check_dtype(dtype_x1, dtype_list)
    dtype_x2 = x2.get("dtype").lower()
    check_dtype(dtype_x2, dtype_list)

    # produce shapes
    shape_x1, shape_x2, shape_max = util.produce_shapes(shape_x1, shape_x2)
    if shape_x1[-1] == 1 and shape_x2[-1] == 1 and shape_max[-1] == 1:
        shape_x1 = shape_x1 if len(shape_x1) == 1 else shape_x1[:-1]
        shape_x2 = shape_x2 if len(shape_x2) == 1 else shape_x2[:-1]
        shape_max = shape_max if len(shape_max) == 1 else shape_max[:-1]
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    shape_x1, shape_x2 = refine_shapes_for_broadcast(shape_x1, shape_x2)
    data_input_x1 = tvm.placeholder(shape_x1,
                                    name="data_input_x1", dtype=dtype_x1)
    data_input_x2 = tvm.placeholder(shape_x2,
                                    name="data_input_x2", dtype=dtype_x2)

    res = axpy_compute(data_input_x1, data_input_x2, y, alpha, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data_input_x1, data_input_x2, res]}
    te.lang.cce.cce_build_code(schedule, config)

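# Illustrative NumPy reference for the axpy semantics above (not part of the
# operator): the result is x1 + alpha * x2, with the two inputs broadcast to a
# common shape first.
import numpy as np

def axpy_reference(x1, x2, alpha):
    """Hypothetical host-side check mirroring axpy / axpy_compute."""
    x1, x2 = np.broadcast_arrays(x1, x2)
    return x1 + alpha * x2

# axpy_reference(np.array([1.0, 2.0]), np.array([10.0, 20.0]), 0.5) -> [6., 12.]
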
def fused_minimum_or_maximum_grad_cce(shape_dz, shape_x, shape_y,
                                      grad_x=True, grad_y=True,
                                      cmp_type="LE", dtype="float32",
                                      kernel_name="cce_fused_minimum_or_maximum_grad",
                                      need_build=False, need_print=False):
    """
    algorithm:
    calculating minimum or maximum_grad of the two input data

    Parameters
    ----------
    shape_dz: list or tuple.
        shape of data_inputdz
    shape_x: list or tuple.
        shape of data_inputx
    shape_y: list or tuple.
        shape of data_inputy
    grad_x: bool
        if grad_x is true, output need return dx
    grad_y: bool
        if grad_y is true, output need return dy
    cmp_type: str
        LessEqual or GreaterEqual
    dtype: str
        the data type, assume src_dtype equals dst_dtype,
        only support float16, float32, int32
    kernel_name: str
        cce kernel name, default value is "cce_fused_minimum_or_maximum_grad"
    need_build: bool
        if need to build CCEC kernel, default value is False
    need_print: bool
        if need to print the ir, default value is False

    Returns:
    -------
    none.
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)
    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    util.check_shape_rule(shape_max)
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    if list(shape_dz) != list(shape_max):
        raise RuntimeError(
            "fused_minimum_or_maximum_grad_cce shape_dz != shape_max")

    dtype = dtype.lower()
    if dtype not in ["float16", "float32", "int32"]:
        raise RuntimeError("fused_minimum_or_maximum_grad_cce only support"
                           " float16, float32, int32")

    if (grad_x, grad_y) == (False, False):
        raise RuntimeError("at least one of grad_x and grad_y must be true")

    placeholders = []
    placeholders.append(tvm.placeholder(shape_dz, name="input_dz",
                                        dtype=dtype))
    placeholders.append(tvm.placeholder(shape_x, name="input_x", dtype=dtype))
    placeholders.append(tvm.placeholder(shape_y, name="input_y", dtype=dtype))

    outs = fused_minimum_or_maximum_grad_compute(placeholders, shape_x,
                                                 shape_y, shape_dz,
                                                 cmp_type, dtype)

    with tvm.target.cce():
        if (grad_x, grad_y) == (True, False):
            sch = generic.auto_schedule(outs[0])
            outs = [outs[0]]
        if (grad_x, grad_y) == (False, True):
            sch = generic.auto_schedule(outs[1])
            outs = [outs[1]]
        if (grad_x, grad_y) == (True, True):
            sch = generic.auto_schedule(outs)

    config = {
        "print_ir": need_print,
        "need_build": need_build,
        "name": kernel_name,
        "tensor_list": placeholders + outs
    }
    te.lang.cce.cce_build_code(sch, config)

def fake_quant_with_min_max_update(x, min_val, max_val, min_up, max_up,
                                   ema, ema_decay, symmetric, narrow_range,
                                   training, num_bits, quant_delay,
                                   kernel_name="fake_quant_update"):
    """FakeQuantWithMinMax op"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]), )
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    if symmetric:
        quant_min = 0 - 2**(num_bits - 1)
        quant_max = 2**(num_bits - 1) - 1
    else:
        quant_min = 0
        quant_max = 2**num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res_list = fake_quant_with_min_max_update_compute(input_data, min_data,
                                                      max_data, ema,
                                                      ema_decay, quant_min,
                                                      quant_max, training,
                                                      kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)

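# Quick arithmetic check of the quantization ranges computed above (assumption:
# the standard fake-quant convention used by fake_quant_per_layer and
# fake_quant_with_min_max_update):
#   num_bits = 8, symmetric = False, narrow_range = False -> [0, 255]
#   num_bits = 8, symmetric = False, narrow_range = True  -> [1, 255]
#   num_bits = 8, symmetric = True,  narrow_range = False -> [-128, 127]
#   num_bits = 8, symmetric = True,  narrow_range = True  -> [-127, 127]
def _quant_range(num_bits, symmetric, narrow_range):
    """Hypothetical helper restating the quant_min/quant_max computation."""
    if symmetric:
        quant_min, quant_max = -2 ** (num_bits - 1), 2 ** (num_bits - 1) - 1
    else:
        quant_min, quant_max = 0, 2 ** num_bits - 1
    if narrow_range:
        quant_min += 1
    return quant_min, quant_max
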
def custom_truncatemod(shape1, shape2, dtype,
                       kernel_name="cce_tf_truncatemod",
                       need_build=False, need_print=False):
    """
    do element-wise truncatemod operation between two input tensors

    Parameters:
    ----------
    shape1 : shape of input data1
    shape2 : shape of input data2
    dtype : source data type, support float16, float32, int32
    kernel_name : cce kernel name, default value is "cce_tf_truncatemod"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    max_dim = 8
    shape1_len = len(shape1)
    shape2_len = len(shape2)
    if shape1_len > max_dim or shape2_len > max_dim:
        raise RuntimeError(
            "mod_cce only support up to %d dimensions while the shape's "
            "dimensions is %d, %d" % (max_dim, shape1_len, shape2_len))

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape1)
    util.check_shape_rule(shape2)

    util.check_shape_size(shape1, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape2, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    device_api_map = {"float16": "cc_device_truncatemod_float16",
                      "float32": "cc_device_truncatemod_float",
                      "int32": "cc_device_truncatemod_int32"}

    dtype = dtype.lower()
    if dtype not in check_list:
        raise RuntimeError(
            "tf_truncatemod_cce only support %s while dtype is %s"
            % (",".join(check_list), dtype))

    shape1, shape2, shape_out = util.produce_shapes(shape1, shape2)
    util.check_shape_size(shape_out, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()
    device_api = device_api_map[inp_dtype]

    # block
    block_num = "block_num"
    block_idx = "block_idx"
    # x param
    v_xndim_cnt = tvm.const(len(shape1), "int32")
    p_xshape = util.create_param_ptr(shape1, "int32", "p_xshape")
    xpad_c0 = tvm.const(0, "int32")
    data_input_x = tvm.placeholder(shape1, name="data_input_x",
                                   dtype=inp_dtype)
    # y param
    v_yndim_cnt = tvm.const(len(shape2), "int32")
    p_yshape = util.create_param_ptr(shape2, "int32", "p_yshape")
    ypad_c0 = tvm.const(0, "int32")
    data_input_y = tvm.placeholder(shape2, name="data_input_y",
                                   dtype=inp_dtype)
    # output
    v_out_ndim_cnt = tvm.const(len(shape_out), "int32")
    p_out_shape = util.create_param_ptr(shape_out, "int32", "p_out_shape")
    out_padc0 = tvm.const(0, "int32")

    output = tvm.extern(
        shape_out,
        [p_xshape, data_input_x, p_yshape, data_input_y, p_out_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t", device_api,
            block_num,
            block_idx,
            v_xndim_cnt,
            ins[0].access_ptr("r"),  # shape x
            xpad_c0,
            ins[1].access_ptr("r"),  # input x
            v_yndim_cnt,
            ins[2].access_ptr("r"),  # shape y
            ypad_c0,
            ins[3].access_ptr("r"),  # input y
            v_out_ndim_cnt,
            ins[4].access_ptr("r"),  # shape out
            out_padc0,
            outs[0].access_ptr("w")),
        name="output", dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    # print IR
    if need_print:
        with build_config:
            print(tvm.lower(schedule, [data_input_x, data_input_y, output],
                            simple_mode=True))
    # Compile to generate the cce file
    if need_build:
        with build_config:
            tvm.build(schedule, [data_input_x, data_input_y, output],
                      "cce", name=kernel_name)

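# Illustrative NumPy sketch of truncated-modulo semantics (assumption: the
# device API called above follows tf.truncatemod, where the result takes the
# sign of the dividend):
#   truncatemod(x, y) = x - trunc(x / y) * y
import numpy as np

def truncatemod_reference(x, y):
    """Hypothetical host-side reference, not the device implementation."""
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    return x - np.trunc(x / y) * y

# truncatemod_reference(-7, 3) -> -1.0   (np.mod(-7, 3) would give 2.0)
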
def leaky_relu_grad_compute(g, x, y, negative_slope=0,
                            kernel_name="leaky_relu_grad"):
    """
    calculate the backpropagation of leaky_relu operation
    y = gradients(x>0) or negative_slope*gradients(x<=0).

    Parameters
    ----------
    g : TVM tensor
        the placeholder of input g
    x : TVM tensor
        the placeholder of input x
    y : dict
        dict of output y, include keys(shape and dtype)
    negative_slope : float or int
        allow non-zero slope for negative inputs to speed up optimization
    kernel_name : str
        kernel name, default value is "leaky_relu_grad"

    Returns
    -------
    res: TVM tensor
        the result of leaky_relu_grad_compute
    """
    shape_list = util.produce_shapes(te.lang.cce.util.shape_to_list(g.shape),
                                     te.lang.cce.util.shape_to_list(x.shape))
    util.check_tensor_shape_size(shape_list[2])
    dtype = g.dtype

    g = te.lang.cce.broadcast(g, shape_list[2])
    x = te.lang.cce.broadcast(x, shape_list[2])

    if dtype == "float32":
        help_min = tvm.const(2**(-126), "float32")
        help_rec_one = tvm.const(2**38, "float32")
        help_rec_sec = tvm.const(2**44, "float32")
    elif dtype == "float16":
        help_min = tvm.const(2**(-24), "float16")
        help_rec_one = tvm.const(2**12, "float16")
        help_rec_sec = help_rec_one

    tmp_min_x = te.lang.cce.vmins(x, help_min)
    tmp_max_x = te.lang.cce.vmaxs(tmp_min_x,
                                  tvm.const(SCALAR_ZERO, "float32"))
    tmp_mul_x = te.lang.cce.vmuls(tmp_max_x, help_rec_one)

    if dtype == "float32":
        tmp_mul_x = te.lang.cce.vmuls(tmp_mul_x, help_rec_sec)

    result_tmp_right = te.lang.cce.vmuls(tmp_mul_x, help_rec_sec)
    result_sub = te.lang.cce.vadds(result_tmp_right,
                                   tvm.const(NEGATIVE_ONE, "float32"))
    result_abs = te.lang.cce.vabs(result_sub)
    result_tmp_left = te.lang.cce.vmuls(result_abs, negative_slope)

    result_tmp = te.lang.cce.vadd(result_tmp_left, result_tmp_right)

    res = te.lang.cce.vmul(g, result_tmp)

    return res

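# Illustrative NumPy reference for the masking trick above (not the device
# code): the vmins/vmaxs/vmuls chain builds a {0, 1} indicator for x > 0, so
# the gradient reduces to
#   res = g                     where x > 0
#   res = negative_slope * g    where x <= 0
import numpy as np

def leaky_relu_grad_reference(g, x, negative_slope=0):
    """Hypothetical host-side check mirroring leaky_relu_grad_compute."""
    g, x = np.broadcast_arrays(g, x)
    return np.where(x > 0, g, negative_slope * g)

# leaky_relu_grad_reference(np.array([1.0, 1.0]), np.array([2.0, -3.0]), 0.1)
# -> [1. , 0.1]
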