def _set_reduce_axis(reduce_tensor):
    """Collect every axis of the tensor as the reduce-axis list."""
    shape_reduce = te.lang.cce.util.shape_to_list(reduce_tensor.shape)
    axis_d = []
    for i, _ in enumerate(shape_reduce):
        axis_d.append(i)
    axis_d = util.axis_check(len(shape_reduce), axis_d)
    return axis_d

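# Illustrative only: a minimal pure-Python sketch of the normalization that
# util.axis_check is assumed to perform here (validate the range and map
# negative indices to non-negative ones). The helper name is hypothetical
# and the real utility may differ in details.
def _normalize_axes_sketch(rank, axes):
    """Map possibly-negative axes into [0, rank) and validate them."""
    if isinstance(axes, int):
        axes = [axes]
    normalized = []
    for axis in axes:
        if axis < -rank or axis >= rank:
            raise ValueError("axis %d is out of range for rank %d"
                             % (axis, rank))
        normalized.append(axis % rank)
    return normalized

# Example: _normalize_axes_sketch(4, [-1, 2]) -> [3, 2]
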
def reduce_max_d(x, y, axes=None, keepdims=None, kernel_name="reduce_max_d"):
    """
    Reduce a tensor on certain axes based on max.

    Parameters
    ----------
    x : dict
        shape and dtype of input
    y : dict
        shape and dtype of output, should be same shape and type as input
    axes : list
        the first axes to reduce, may be negative to index from the end
        (e.g., -1 for the last axis). axes may be int or list (e.g. [1, 2])
    keepdims : bool
        if true, retains reduced dimensions with length 1,
        default value is None
    kernel_name : str
        kernel name, default value is "reduce_max_d"

    Returns
    -------
    None
    """
    dtype = x["dtype"]
    dtype_lower = dtype.lower()
    check_list = ("float16", "float32", "int8", "uint8", "int32")
    check_dtype(dtype_lower, check_list)

    with te.op.compute():
        shape = x["shape"]
        shape_range = x["range"]
        shape_len = len(shape)
        if not axes:
            axes = range(shape_len)
        if hasattr(axes, 'index'):
            axes = list(axes)
        axes = cce_util.axis_check(shape_len, axes)

        shape_new, shape_range_new, axes_new, fused_rel_dic = \
            fused_reduce_axis(shape, shape_range, axes)
        add_compile_info("fused_rel_dic", fused_rel_dic)
        x["shape"] = shape_new
        x["range"] = shape_range_new
        shape_var_new = variable_shape([x])[0]

        data_input = tvm.placeholder(shape_var_new, name="data_input",
                                     dtype=dtype_lower)
        res = reduce_max_d_compute(data_input, y, axes_new, keepdims)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    # build
    config = {"name": kernel_name,
              "tensor_list": [data_input, res]}
    te.lang.dynamic.build(sch, config)

def _param_check(shape_x, dtype_x, axis, kernel_name):
    """
    Check the input parameters

    Parameters
    ----------
    shape_x : tuple or list
        the shape of the input tensor
    dtype_x : string
        the dtype of the input tensor
    axis : list
        the axis list for reverse
    kernel_name : str
        kernel name, default value is "reverse_ext2"

    Returns
    -------
    axis : list
    """
    check_shape(shape_x, param_name="input_x")

    check_list = ("int8", "int16", "int32", "int64", "uint8", "uint16",
                  "uint32", "uint64", "float16", "float32")
    check_dtype(dtype_x.lower(), check_list, param_name="input_x")

    axis = list(set(axis))
    axis = util.axis_check(len(shape_x), axis)

    return axis

def caffe_reduction_layer_compute(placeholders, shape, dtype, axis, op, coeff,
                                  kernel_name="cce_reductionLayer",
                                  need_build=False, need_print=False):
    """
    Since the shape of the placeholder created by caffe_reduce is not the
    same as the input shape, fusion_op cannot fuse two ops with different
    shapes. Therefore, the caffe_reduce op cannot be fused until tvm supports
    reshape in D.
    """
    data = placeholders[0]
    inp_dtype = dtype.lower()
    axis = util.axis_check(len(shape), axis)
    shape = list(shape)
    shape1 = shape[:axis] + [
        functools_reduce(lambda x, y: x * y, shape[axis:])
    ]
    shape1, axis = util.shape_refine(shape1, axis)
    if not axis:
        axis = [0]
        shape1 = [1] + shape1

    if op == "ASUM":
        data_tmp_input = te.lang.cce.vabs(data)
        cof = coeff
        tmp = te.lang.cce.vmuls(data_tmp_input, cof)
    elif op == "SUMSQ":
        data_tmp_input = te.lang.cce.vmul(data, data)
        cof = coeff
        tmp = te.lang.cce.vmuls(data_tmp_input, cof)
    elif op == "MEAN":
        size = shape1[-1]
        cof = float(coeff) * (size**(-1))
        if inp_dtype == "int8" or inp_dtype == "uint8":
            data1 = te.lang.cce.vmuls(data, 1.0)
            data_cast = te.lang.cce.cast_to(data1, "float32")
            tmp = te.lang.cce.vmuls(data_cast, cof)
        else:
            tmp = te.lang.cce.vmuls(data, cof)
    elif op == "SUM":
        cof = coeff
        data_tmp_input = te.lang.cce.vmuls(data, cof)
        tmp = data_tmp_input

    res = te.lang.cce.sum(tmp, axis=axis)

    # Although the data type (int8/uint8) has changed, the data values remain
    # integers during the calculation of the other operators (SUM/ASUM/SUMSQ).
    if op != "MEAN":
        res = te.lang.cce.cast_to(res, inp_dtype, f1628IntegerFlag=True)

    return res

def reduce_sum_d(x, y, axis=None, keepdims=None, kernel_name="reduce_sum_d"):
    """reduce a tensor on a certain axis based on sum.

    Parameters:
    ----------
    x: dict
        the dict of input tensor.
    y: dict
        the dict of output tensor.
    axis: int, list, tuple or NONETYPE
        the axis for reduce.
    keepdims: bool or NONETYPE
        if true, retains reduced dimensions with length 1.
    kernel_name: str
        cce kernel name, default value is "reduce_sum_d".

    Returns
    -------
    None
    """
    dtype = x["dtype"]
    dtype_lower = dtype.lower()
    check_list = ("float16", "float32")
    check_dtype(dtype_lower, check_list, param_name="x")

    with te.op.compute():
        shape = x["shape"]
        shape_range = x["range"]

        axes = []
        shape_len = len(shape)
        if not axis:
            for i, _ in enumerate(shape):
                axes.append(i)
        else:
            axes = list(axis)
        axes = cce_util.axis_check(shape_len, axes)

        shape_new, shape_range_new, axes_new, fused_rel_dic = \
            fused_reduce_axis(shape, shape_range, axes)
        add_compile_info("fused_rel_dic", fused_rel_dic)
        x["shape"] = shape_new
        x["range"] = shape_range_new
        shape_var_new = variable_shape([x])[0]

        data_input = tvm.placeholder(shape_var_new, name="data_input",
                                     dtype=dtype_lower)
        res = reduce_sum_d_compute(data_input, y, axes_new, keepdims)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    # build
    config = {"name": kernel_name,
              "tensor_list": [data_input, res]}
    te.lang.dynamic.build(sch, config)

def reduce_sum_d(x, y, axis, keepdims=None, kernel_name="reduce_sum_d"):
    """reduce a tensor on a certain axis based on sum.

    Parameters:
    ----------
    x: dict
        the dict of input tensor.
    y: dict
        the dict of output tensor.
    axis: int, list, tuple or NONETYPE
        the axis for reduce.
    keepdims: bool or NONETYPE
        if true, retains reduced dimensions with length 1.
    kernel_name: str
        cce kernel name, default value is "reduce_sum_d".

    Returns
    -------
    None
    """
    shape = x.get("shape")
    dtype = x.get("dtype")
    dtype_lower = dtype.lower()
    check_list = ("float16", "float32")

    check_shape(shape, param_name="x")
    check_dtype(dtype_lower, check_list, param_name="x")

    axis_d = []
    shape_len = len(shape)
    if not axis:
        for i, _ in enumerate(shape):
            axis_d.append(i)
    else:
        axis_d = list(axis)
    axis_d = util.axis_check(shape_len, axis_d)

    # 5HD Special param for 5hd schedule
    is_5hdc = util.check_and_init_5hdc_reduce_support(x, axis)

    if not keepdims and not is_5hdc:
        shape, axis_d = util.shape_refine(list(shape), axis_d, keepdims)
        shape, axis_d = util.simplify_axis_shape(shape, axis_d)

    data_input = tvm.placeholder(shape, name="data_input_" + kernel_name,
                                 dtype=dtype_lower)
    res = reduce_sum_d_compute(data_input, y, axis_d, keepdims,
                               is_5hdc=is_5hdc)
    if is_5hdc:
        res.ori_shape = x["ori_shape"]
        res.ori_format = x["ori_format"]

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_input, res]}
    te.lang.cce.cce_build_code(sch, config)

def reduce_prod_d(x, y, axes, keep_dims=None, kernel_name="reduce_prod_d"):
    """
    Reduce a tensor on certain axes based on product.

    Parameters:
    ----------
    x : dict
        shape and dtype of input
    y : dict
        shape and dtype of output
    axes : int, list, tuple, NoneType
        The dimensions to reduce.
        If None (the default), reduces all dimensions.
        Must be in the range [-rank(input_tensor), rank(input_tensor)).
    keep_dims : bool, NoneType
        if true, retains reduced dimensions with length 1,
        default value is None.
    kernel_name : str
        cce kernel name, default value is reduce_prod_d

    Returns
    -------
    None
    """
    shape = x.get("shape")
    check_shape(shape, param_name="x")

    inp_dtype = x.get("dtype").lower()
    check_list = ["float16", "float32", "int8", "uint8"]
    check_dtype(inp_dtype, check_list, param_name="x")

    shape_len = len(shape)
    if not axes:
        axes = range(shape_len)
    if hasattr(axes, 'index'):
        axes = list(axes)
    axes = util.axis_check(shape_len, axes)

    util.check_reduce_shape_rule(shape)
    shape, axes = util.shape_refine(list(shape), axes)
    shape, axes = util.simplify_axis_shape(shape, axes)

    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)
    with tvm.target.cce():
        res = reduce_prod_d_compute(data_input, y, axes, keep_dims,
                                    kernel_name)
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data_input, res]
    }
    te.lang.cce.cce_build_code(sch, config)

def mse_loss_compute(predict, label, reduction='mean',
                     kernel_name="mse_loss"):
    '''
    calculating mse_loss

    :param predict: TVM tensor
        the output of previous layer
    :param label: TVM tensor
        label
    :param reduction: str
        reduce configuration parameter: mean/sum/none. Default: mean
    :param kernel_name: str
        kernel name, default value is "mse_loss"
    :return: y
        when reduction == 'none': TVM tensor, output tensor
        when reduction == 'sum'/'mean': a scalar
    '''
    ori_dtype = predict.dtype
    shape = te.lang.cce.util.shape_to_list(predict.shape)

    if ori_dtype == "float16" and tbe_platform.cce_conf.api_check_support(
            "te.lang.cce.vmul", "float32"):
        predict = te.lang.cce.cast_to(predict, "float32")
        label = te.lang.cce.cast_to(label, "float32")

    # get total number of tensor elements
    reduce_elts = 1.0
    for i in shape:
        reduce_elts *= i
    cof = reduce_elts**(-1)

    # get all axes for reduce
    axis_d = []
    for i, _ in enumerate(shape):
        axis_d.append(i)
    axis_d = util.axis_check(len(shape), axis_d)

    # calculate (predict_n - label_n)^2
    res = te.lang.cce.vsub(predict, label)
    res_sqr = te.lang.cce.vmul(res, res)

    y = 0.0
    if reduction == 'mean':
        # calculate mean
        y = te.lang.cce.sum(res_sqr, axis=axis_d, keepdims=False)
        y = te.lang.cce.vmuls(y, cof)
    elif reduction == 'sum':
        # calculate sum
        y = te.lang.cce.sum(res_sqr, axis=axis_d, keepdims=False)
    elif reduction == 'none':
        y = res_sqr

    if ori_dtype == "float16":
        y = te.lang.cce.cast_to(y, "float16")

    return y

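# Illustrative only: a NumPy reference of the same mse_loss semantics
# (mean/sum/none reductions), useful for checking the TBE compute above.
# The helper name _mse_loss_numpy_reference is hypothetical, not part of
# this operator.
import numpy as np

def _mse_loss_numpy_reference(predict, label, reduction="mean"):
    diff_sqr = (np.asarray(predict, dtype=np.float32)
                - np.asarray(label, dtype=np.float32)) ** 2
    if reduction == "mean":
        return diff_sqr.mean()
    if reduction == "sum":
        return diff_sqr.sum()
    return diff_sqr  # reduction == "none"
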
def _param_check(shape_x, dtype_x, axis, kernel_name):
    """check param

    Parameters
    ----------
    shape_x: list
        input shape
    dtype_x: str
        input dtype
    axis: int
        axis int num
    kernel_name: str
        kernel_name string

    Returns
    -------
    None
    """
    check_shape(shape_x, param_name="x")
    check_list = ("float16", "float32")
    check_dtype(dtype_x.lower(), check_list, param_name="x")
    axis = util.axis_check(len(shape_x), axis)

def check_param(input_x, output_y, tiles, axis, kernel_name):
    """
    Check the input parameters

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    output_y : dict
        shape and dtype of output, should be same type as input
    axis : int
        The index of the axis to tile
    tiles : int
        The number of copies (tiles) of the blob to output.
    kernel_name : str
        kernel name, default value is "tile_with_axis"

    Returns
    ----------
    axis : int
        The index of the axis to tile, adjusted to be non-negative
    """
    shape_x = input_x.get("shape")
    dtype_x = input_x.get("dtype").lower()
    shape_y = output_y.get("shape")
    dtype_y = output_y.get("dtype").lower()

    op_utils.check_shape(shape_x, param_name="input_x")
    op_utils.check_shape(shape_y, param_name="input_y")

    check_list = [
        "int8", "int16", "int32", "int64", "uint8", "uint16", "uint32",
        "uint64", "float16", "float32"
    ]
    op_utils.check_dtype(dtype_x, check_list, param_name="input_x")
    op_utils.check_dtype(dtype_y, check_list, param_name="input_x")

    if dtype_x != dtype_y:
        error_info = {}
        error_info['errCode'] = 'E80019'
        error_info['op_name'] = 'tile_with_axis'
        error_info['input1_name'] = 'x'
        error_info['input2_name'] = 'y'
        error_info['input1_dtype'] = str(dtype_x)
        error_info['input2_dtype'] = str(dtype_y)
        raise RuntimeError(
            "In op[%s], the dtype of input[%s] and input[%s] should be the "
            "same, but actually are [%s] and [%s]."
            % (error_info['op_name'], error_info['input1_name'],
               error_info['input2_name'], error_info['input1_dtype'],
               error_info['input2_dtype']))

    if tiles <= 0:
        check_param_range('tiles', 1, 'inf', tiles)

    shape_x_len = len(shape_x)

    # check for 5HD
    input_format = input_x.get("format")
    if input_format == "NC1HWC0":
        shape_x_ori = input_x.get("ori_shape")
        ori_format = input_x.get("ori_format")
        length_x_ori = len(shape_x_ori)

        if ori_format not in ("NCHW", "NHWC"):
            raise RuntimeError("input_x's ori_format is invalid for 5D Tensor")
        if shape_x_len != 5:
            raise RuntimeError("input_x's shape is invalid for 5D Tensor")
        if length_x_ori != 4:
            raise RuntimeError("input_x's ori_shape is invalid for 5D Tensor")

        axis = util.axis_check(length_x_ori, axis)
        axis = util.axis_transfrom_5d(axis, ori_format)
        if axis in (1, 4):
            raise RuntimeError("axis is invalid for 5D Tensor")
    else:
        if axis >= shape_x_len or axis < -shape_x_len:
            check_param_range('axis', -shape_x_len, shape_x_len - 1, axis)
        if axis < 0:
            axis += shape_x_len

    shape_y_expected = [0] * shape_x_len
    shape_y_expected[0:shape_x_len] = shape_x[0:shape_x_len]
    shape_y_expected[axis] *= tiles

    if not check_same_shape(shape_y, shape_y_expected):
        error_info = {}
        error_info['errCode'] = 'E80017'
        error_info['op_name'] = 'tile_with_axis'
        error_info['attr_name'] = 'shape_y'
        error_info['expect_value'] = str(shape_y_expected)
        error_info['real_value'] = str(shape_y)
        raise RuntimeError(
            "In op[%s], the parameter[%s] should be [%s], but actually "
            "is [%s]."
            % (error_info['op_name'], error_info['attr_name'],
               error_info['expect_value'], error_info['real_value']))

    shape_x_adapt = []
    shape_y_adapt = []
    for i in range(shape_x_len):
        if i == axis:
            shape_x_adapt.append(1)
            shape_y_adapt.append(tiles)
            if shape_x[i] == 1:
                continue
        shape_x_adapt.append(shape_x[i])
        shape_y_adapt.append(shape_x[i])

    return axis, shape_x_adapt, shape_y_adapt, dtype_x

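# Illustrative only: a plain-Python sketch of the shape bookkeeping done by
# check_param above (expected output shape plus the adapted shape pair).
# The helper name is hypothetical and not part of the operator.
def _tile_with_axis_shapes_sketch(shape_x, axis, tiles):
    shape_y_expected = list(shape_x)
    shape_y_expected[axis] *= tiles
    shape_x_adapt, shape_y_adapt = [], []
    for i, dim in enumerate(shape_x):
        if i == axis:
            shape_x_adapt.append(1)
            shape_y_adapt.append(tiles)
            if dim == 1:
                continue
        shape_x_adapt.append(dim)
        shape_y_adapt.append(dim)
    return shape_y_expected, shape_x_adapt, shape_y_adapt

# Example: _tile_with_axis_shapes_sketch([2, 3, 4], 1, 2)
#   -> ([2, 6, 4], [2, 1, 3, 4], [2, 2, 3, 4])
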
def reduce_mean_d(input_x, output_y, axes,
                  keepdims=None, kernel_name="reduce_mean_d",
                  impl_mode="high_performance"):
    """
    Reduce a tensor on certain axes based on mean.

    Parameters:
    ----------
    input_x : dict
        shape and dtype of input
    output_y : dict
        shape and dtype of output
    axes : int, list, tuple, NoneType
        The dimensions to reduce.
        If None (the default), reduces all dimensions.
        Must be in the range [-rank(input_tensor), rank(input_tensor)).
    keepdims : bool, NoneType
        if true, retains reduced dimensions with length 1,
        default value is None.
    kernel_name : str
        cce kernel name, default value is reduce_mean_d

    Returns
    -------
    None
    """
    global ori_shape
    global ori_format

    shape = input_x.get("shape")
    check_shape(shape, param_name="input_x")
    check_list = ["float16", "float32"]
    shape_len = len(shape)

    if not axes:
        axes = range(shape_len)
    if hasattr(axes, 'index'):
        axes = list(axes)

    inp_dtype = input_x.get("dtype").lower()
    check_dtype(inp_dtype, check_list, param_name="input_x")

    axes = util.axis_check(shape_len, axes)

    # Shape should not be modified in 5HD mode
    # 5HD Special param for 5hd schedule
    is_5hdc = util.check_and_init_5hdc_reduce_support(input_x, axes)
    if not is_5hdc:
        shape, axes = util.shape_refine(list(shape), axes)
        shape, axes = util.simplify_axis_shape(shape, axes)

    ori_shape = [input_x["ori_shape"], input_x["shape"]]
    ori_format = [input_x["ori_format"], input_x["format"]]
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)
    res = reduce_mean_d_compute(data_input, output_y, axes, keepdims,
                                impl_mode=impl_mode, is_5hdc=is_5hdc)
    if is_5hdc:
        res.ori_shape = input_x["ori_shape"]
        res.ori_format = input_x["ori_format"]

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data_input, res]
    }
    te.lang.cce.cce_build_code(sch, config)

def log_softmax_grad(input_dy, input_x, output_z, axis=-1,
                     kernel_name="log_softmax_grad"):
    """
    algorithm: log_softmax_grad
    calculating: gradient of log_softmax

    Parameters
    ----------
    input_dy : dict
        shape and dtype of grad input, only support float16, float32
    input_x : dict
        shape and dtype of input, only support float16, float32
    output_z : dict
        shape and dtype of output, should be the same shape and type as input
    axis : int, list or tuple
        the first axis to reduce, may be negative to index from the end
        (e.g., -1 for the last axis). axis may be int or list (e.g. [1, 2]);
        default value is -1
    kernel_name : str
        cce kernel name, default value is log_softmax_grad

    Returns
    -------
    None
    """
    check_list = ("float16", "float32")
    input_dtype = input_dy.get("dtype").lower()
    if not isinstance(axis, int):
        axis = list(axis)
    shape1 = input_dy.get("shape")
    shape2 = input_x.get("shape")

    check_shape(shape1, param_name="input_dy")
    check_shape(shape2, param_name="input_x")
    check_dtype(input_dtype, check_list, param_name="input_dy")

    axis = util.axis_check(len(shape1), axis)

    if not isinstance(axis, int):
        for i in axis:
            if list(shape1)[i] == 1:
                raise RuntimeError("Cannot reduce on an axis with dimension 1")
    else:
        if list(shape1)[axis] == 1:
            raise RuntimeError("Cannot reduce on an axis with dimension 1")

    if not operator.eq(list(shape1), list(shape2)):
        raise RuntimeError("all input shape must be equal")

    shape1, axis = util.shape_refine(list(shape1), axis)
    shape2 = shape1

    data1 = tvm.placeholder(shape1, dtype=input_dtype, name="data1")
    data2 = tvm.placeholder(shape2, dtype=input_dtype, name="data2")
    result = log_softmax_grad_compute(data1, data2, output_z, axis,
                                      kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(result)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data1, data2, result]
    }
    te.lang.cce.cce_build_code(sch, config)

def custom_Reduction(shape, dtype, axis, op, coeff,
                     kernel_name="cce_reductionLayer",
                     need_build=False, need_print=False):
    """
    Reduce a tensor on a certain axis, and scale output with coeff

    Parameters
    ----------
    shape : shape of data
    dtype : source data type, only support float16, float32, int8, uint8
    axis : the first axis to reduce, may be negative to index from the end
           (e.g., -1 for the last axis).
           If axis == 0, the output Blob always has the empty shape (count 1),
           performing reduction across the entire input.
    op : can only be one of "SUM, ASUM (sum of abs), SUMSQ (sum of sqr), MEAN"
    coeff : scale for output
    kernel_name : cce kernel name, default value is "cce_reductionLayer"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    check_list = ["float16", "float32", "int8", "uint8"]
    if not dtype.lower() in check_list:
        raise RuntimeError(
            "reductionLayer_cce only support %s while dtype is %s"
            % (",".join(check_list), dtype))

    reduction_op = ("SUM", "ASUM", "SUMSQ", "MEAN")

    if not isinstance(axis, int):
        raise RuntimeError("type of axis value should be int")
    if op not in reduction_op:
        raise RuntimeError("op can only be one of SUM, ASUM, SUMSQ, MEAN")
    if not isinstance(coeff, int) and not isinstance(coeff, float):
        raise RuntimeError("coeff must be a value")

    axis_origin = axis
    shape_origin = shape
    axis = util.axis_check(len(shape), axis)
    util.check_reduce_shape_rule(shape)
    shape = list(shape)
    shape1 = shape[:axis] + [
        functools_reduce(lambda x, y: x * y, shape[axis:])
    ]
    shape1, axis = util.shape_refine(shape1, axis)
    if not axis:
        axis = [0]
        shape1 = [1] + shape1

    inp_dtype = dtype.lower()
    data = tvm.placeholder(shape1, name="data_input", dtype=inp_dtype)
    with tvm.target.cce():
        res = caffe_reduction_layer_compute([data], shape_origin, dtype,
                                            axis_origin, op, coeff,
                                            kernel_name, need_build,
                                            need_print)

    if op == "MEAN" and (inp_dtype == "int8" or inp_dtype == "uint8"):
        util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
        res = te.lang.cce.cast_to(res, inp_dtype)
        schedule = tvm.create_schedule(res.op)
        if need_print:
            with build_config:
                print(tvm.lower(schedule, [data, res], simple_mode=True))
        if need_build:
            with build_config:
                tvm.build(schedule, [data, res], "cce", name=kernel_name)
    else:
        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        config = {
            "print_ir": need_print,
            "need_build": need_build,
            "name": kernel_name,
            "tensor_list": [data, res]
        }
        te.lang.cce.cce_build_code(sch, config)

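# Illustrative only: a NumPy reference for the Caffe-style reduction above
# (flatten the trailing axes starting at `axis`, apply SUM/ASUM/SUMSQ/MEAN,
# and scale by coeff). The helper name is hypothetical.
import numpy as np

def _caffe_reduction_numpy_reference(data, axis, op, coeff):
    data = np.asarray(data, dtype=np.float32)
    flat = data.reshape(data.shape[:axis] + (-1,))
    if op == "ASUM":
        reduced = np.abs(flat).sum(axis=-1)
    elif op == "SUMSQ":
        reduced = (flat * flat).sum(axis=-1)
    elif op == "MEAN":
        reduced = flat.mean(axis=-1)
    else:  # "SUM"
        reduced = flat.sum(axis=-1)
    return coeff * reduced
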
def reduce_mean_d_compute(x, y, axes, keepdims,
                          kernel_name="reduce_mean_d",
                          impl_mode="high_performance",
                          is_5hdc=False):
    """reduce_mean_d compute

    Parameters:
    ----------
    x: TVM tensor
        input tensor.
    y: dict
        the dict of output tensor.
    axes: int, list, tuple or NoneType
        the axes for reduce.
    keepdims: bool or NoneType
        if true, retains reduced dimensions with length 1.
    kernel_name: str
        cce kernel name, default value is "reduce_mean_d".

    Returns
    -------
    res: TVM tensor
        output tensor, has the same shape and type as input tensor.
    """
    shape = te.lang.cce.util.shape_to_list(x.shape)
    shape_len = len(shape)
    if not axes:
        axes = range(shape_len)
    if hasattr(axes, 'index'):
        axes = list(axes)
    axes = util.axis_check(shape_len, axes)

    reduce_elts = 1.0
    if isinstance(axes, Iterable):
        for i in axes:
            reduce_elts *= shape[i]
    else:
        reduce_elts = shape[axes]
    cof = reduce_elts**(-1)

    if ori_format[0] == 'NHWC' and ori_format[1] == 'NC1HWC0' \
            and len(axes) == 2 and axes == [1, 4] \
            and len(ori_shape[0]) == 4:
        cof = ori_shape[0][-1]**(-1)

    dtype = x.dtype
    data_input_tmp = x

    has_improve_precision = False
    cce_product = tbe_platform.cce_conf.get_soc_spec("SOC_VERSION")
    if cce_product not in ("Ascend310",) and dtype == "float16" and \
            tbe_platform.cce_conf.api_check_support(
                "te.lang.cce.sum", "float32") and not is_5hdc:
        data_input_tmp = te.lang.cce.cast_to(data_input_tmp, "float32")
        has_improve_precision = True
    elif cce_product in ("Ascend310",) and dtype == "float16" \
            and tbe_platform.cce_conf.api_check_support("te.lang.cce.sum",
                                                        "float32") \
            and not is_5hdc and impl_mode != "high_performance":
        data_input_tmp = te.lang.cce.cast_to(data_input_tmp, "float32")
        has_improve_precision = True

    data_input_tmp = te.lang.cce.vmuls(data_input_tmp, cof)
    res = te.lang.cce.sum(data_input_tmp, axis=axes, keepdims=keepdims)

    if has_improve_precision:
        res = te.lang.cce.cast_to(res, dtype)

    return res

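# Illustrative only: the compute above implements the mean as "multiply by
# 1/N, then sum", which is mathematically equivalent to sum-then-divide.
# A NumPy sketch of that ordering (the helper name is hypothetical):
import numpy as np

def _reduce_mean_scale_then_sum(x, axes, keepdims=False):
    x = np.asarray(x, dtype=np.float32)
    count = 1
    for axis in axes:
        count *= x.shape[axis]
    return (x * (1.0 / count)).sum(axis=tuple(axes), keepdims=keepdims)
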
def reduce_sum(x, axes, y, keepdims=False, kernel_name="reduce_sum"):
    """reduce a tensor on a certain axes based on sum.

    Parameters:
    ----------
    x: dict
        the dict of input tensor.
    axes: dict
        the axes for reduce.
    y: dict
        the dict of output tensor.
    keepdims: bool or NONETYPE
        if true, retains reduced dimensions with length 1.
    kernel_name: str
        cce kernel name, default value is "reduce_sum".

    Returns
    -------
    None
    """
    dtype_x = x["dtype"]
    dtype_lower_x = dtype_x.lower()
    check_list_x = ("float16", "float32")
    check_dtype(dtype_lower_x, check_list_x, param_name="x")

    dtype_axes = axes["dtype"]
    dtype_lower_axes = dtype_axes.lower()
    check_list_axes = ("int32", "int64")
    check_dtype(dtype_lower_axes, check_list_axes, param_name="axes")

    input_shape = x.get("shape")
    if not _check_data_shape_const(input_shape):
        schedules = []
        ins = classify([x, axes], Mode.REDUCE)
        tensors = []
        shape_axes = variable_shape([axes])[0]
        data_input_axes = tvm.placeholder(shape_axes, name="data_input_axes",
                                          dtype=dtype_lower_axes)

        for (x, axes) in ins:
            with te.op.compute():
                shape_x = variable_shape([x])[0]
                data_input_x = tvm.placeholder(shape_x, name="data_input_x",
                                               dtype=dtype_lower_x)
                shape_len = len(shape_x)
                axes_d = cce_util.axis_check(shape_len, axes)
                res = reduce_sum_compute(data_input_x, axes_d, y, keepdims)
                tensors.append([data_input_x, data_input_axes, res])

            with tvm.target.cce():
                schedule = generic.auto_schedule(res)
            schedules.append(schedule)

        # build
        config = {"name": kernel_name,
                  "tensor_list": tensors}
        te.lang.dynamic.build(schedules, config)
        add_compile_info("reduce_axis_unknown", 1)
    else:
        _reduce_sum_const(x, axes, keepdims, kernel_name)

def op_select_format(input_x, output_y, axis, kernel_name="reverse_v2_d"):
    """
    select format for op
    """
    input_ori_shape = input_x.get("ori_shape")
    input_ori_format = input_x.get("ori_format")
    axis = list(set(axis))
    axis = util.axis_check(len(input_ori_shape), axis)

    is_support_5hd = True
    if input_ori_format != "NCHW":
        is_support_5hd = False
    if (input_ori_format == "NCHW" and (1 in axis)) \
            or (input_ori_format == "NHWC" and (3 in axis)):
        is_support_5hd = False
    if (input_ori_format == "NCHW") and len(input_ori_shape) > 1 \
            and (input_ori_shape[1] % 16 != 0):
        is_support_5hd = False

    cce_product = cce.cce_conf.get_soc_spec("SOC_VERSION")
    if cce_product in ("Hi3796CV300ES", "Hi3796CV300CS"):
        dtype_base = [
            "float16", "int8", "int16", "int32", "int64", "uint8", "uint16",
            "uint32", "uint64"
        ]
        dtype_5hd = [
            "float16", "int8", "int16", "int32", "int64", "uint8", "uint16",
            "uint32", "uint64"
        ]
    else:
        dtype_base = [
            "float16", "float", "int8", "int16", "int32", "int64", "uint8",
            "uint16", "uint32", "uint64"
        ]
        dtype_5hd = [
            "float16", "float", "int8", "int16", "int32", "int64", "uint8",
            "uint16", "uint32", "uint64"
        ]

    format_base = ["ND"] * len(dtype_base)
    if is_support_5hd:
        dtype_base = dtype_base + dtype_5hd
        format_base = format_base + ["NC1HWC0"] * len(dtype_5hd)

    dtype_str = ','.join(dtype_base)
    format_str = ','.join(format_base)

    input0 = gen_param(classify="input0", name="x",
                       datatype=dtype_str, format=format_str)
    output0 = gen_param(classify="output0", name="y",
                        datatype=dtype_str, format=format_str)
    param_list = [input0, output0]
    param_dynamic_in_json = get_dynamic_param_in_json(param_list)

    return param_dynamic_in_json

def layer_norm(input_x, input_gamma, input_beta,
               output_y, output_mean, output_variance,
               begin_norm_axis, begin_params_axis,
               epsilon=1e-12, kernel_name="layer_norm",
               impl_mode="high_performance"):
    """
    layernorm operator interface implementation
    calculating: x, gamma, beta
        mean = np.mean(x, reduce_axis, keepdims=True)
        variance = np.mean(np.power((x - mean), 2), reduce_axis,
                           keepdims=True)
        result = gamma * ((x - mean) / np.sqrt(variance + 0.001)) + beta

    Parameters
    ----------
    input_x : dict
        shape and dtype of input x, only support float16, float32
    input_gamma : dict
        shape and dtype of input gamma, only support float16, float32
    input_beta : dict
        shape and dtype of input beta, only support float16, float32
    output_y : dict
        shape and dtype of output, only support float16, float32
    begin_norm_axis : int
        The first normalization dimension: normalization will be performed
        along dimensions `begin_norm_axis : rank(inputs)`
    begin_params_axis : int
        The first parameter (beta, gamma) dimension: scale and centering
        parameters will have dimensions `begin_params_axis : rank(inputs)`
        and will be broadcast with the normalized inputs accordingly.
    epsilon : float
        Minimum positive number greater than 0
    kernel_name : str
        cce kernel name, default value is "layernorm"

    Returns
    -------
    None
    """
    shape_x = list(input_x.get("shape"))
    input_gamma_shape = input_gamma.get("shape")
    input_beta_shape = input_beta.get("shape")
    ori_shape_x = list(input_x.get("ori_shape"))
    input_format = input_x.get("format").upper()
    input_gamma_format = input_gamma.get("format").upper()
    input_beta_format = input_beta.get("format").upper()

    check_shape(input_gamma_shape, param_name="input_gamma")
    check_shape(input_beta_shape, param_name="input_beta")
    check_shape(shape_x, param_name="input_x")

    check_list = ("float16", "float32")
    dtype = input_x.get("dtype").lower()
    dtype_gamma = input_gamma.get("dtype").lower()
    dtype_beta = input_gamma.get("dtype").lower()
    check_dtype(dtype, check_list, param_name="input_x")
    check_dtype(dtype_gamma, check_list, param_name="input_gamma")
    check_dtype(dtype_beta, check_list, param_name="input_gamma")

    shape_gamma = list(input_gamma.get("shape"))
    shape_beta = list(input_beta.get("shape"))

    if input_format == "FRACTAL_NZ":
        begin_norm_axis = util.axis_check(len(ori_shape_x), begin_norm_axis)
        begin_params_axis = util.axis_check(len(ori_shape_x),
                                            begin_params_axis)

        if input_gamma_format == "FRACTAL_NZ" or \
                input_beta_format == "FRACTAL_NZ":
            raise RuntimeError("gamma and beta do not support Nz in bert")
        if shape_gamma != shape_beta:
            raise RuntimeError("gamma and beta's shape must be same.")
        if ori_shape_x[begin_params_axis:] != shape_gamma:
            raise RuntimeError("x or gamma or begin_params_axis is wrong.")
        if len(shape_gamma) > 1:
            raise RuntimeError("shape of gamma or beta "
                               "only support 1D in bert")

        # make shape_x, shape_gamma, shape_beta dim same
        if begin_params_axis != 0:
            for i in range(begin_params_axis):
                shape_gamma.insert(i, 1)
        shape_gamma[-2] = shape_x[-4]
        shape_gamma[-1] = 1
        shape_gamma.append(1)
        shape_gamma.append(shape_x[-1])
        if begin_params_axis > len(ori_shape_x) - 2:
            shape_x[-3:] = [shape_x[-3] * shape_x[-2], shape_x[-1]]
            shape_gamma[-3:] = [shape_gamma[-3] * shape_gamma[-2],
                                shape_gamma[-1]]
        shape_beta = shape_gamma
    else:
        begin_norm_axis = util.axis_check(len(shape_x), begin_norm_axis)
        begin_params_axis = util.axis_check(len(shape_x), begin_params_axis)

        if shape_gamma != shape_beta:
            raise RuntimeError("gamma and beta's shape must be same.")
        no_need_fix_gamma = False
        no_need_fix_beta = False
        if shape_x[begin_params_axis:] != shape_gamma:
            if len(shape_x) == len(shape_gamma):
                no_need_fix_gamma = True
            else:
                raise RuntimeError(
                    "x or gamma or begin_params_axis is wrong.")
        if shape_x[begin_params_axis:] != shape_beta:
            if len(shape_x) == len(shape_beta):
                no_need_fix_beta = True
            else:
                raise RuntimeError(
                    "x or beta or begin_params_axis is wrong.")
        # make shape_x, shape_gamma, shape_beta dim same
        if begin_params_axis != 0 and not no_need_fix_gamma:
            for i in range(begin_params_axis):
                shape_gamma.insert(i, 1)
        if begin_params_axis != 0 and not no_need_fix_beta:
            for i in range(begin_params_axis):
                shape_beta.insert(i, 1)

    data_x = tvm.placeholder(shape_x, name="x", dtype=dtype)
    data_gamma = tvm.placeholder(shape_gamma, name="gamma", dtype=dtype)
    data_beta = tvm.placeholder(shape_beta, name="beta", dtype=dtype)

    if input_format == "FRACTAL_NZ":
        mean, variance, res = \
            layer_norm_compute_nz(data_x, data_gamma, data_beta,
                                  output_y, output_mean, output_variance,
                                  begin_norm_axis, begin_params_axis,
                                  ori_shape_x, epsilon, kernel_name,
                                  impl_mode)
    else:
        mean, variance, res = \
            layer_norm_compute(data_x, data_gamma, data_beta,
                               output_y, output_mean, output_variance,
                               begin_norm_axis, begin_params_axis,
                               epsilon, kernel_name, impl_mode)

    with tvm.target.cce():
        sch = generic.auto_schedule([res, mean, variance])

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data_x, data_gamma, data_beta,
                              res, mean, variance]}
    te.lang.cce.cce_build_code(sch, config)

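# Illustrative only: a NumPy reference of the layer normalization computed
# above, normalizing over dimensions begin_norm_axis..rank-1 and applying
# gamma/beta. The helper name is hypothetical and it ignores the FRACTAL_NZ
# path handled by the operator.
import numpy as np

def _layer_norm_numpy_reference(x, gamma, beta, begin_norm_axis,
                                epsilon=1e-12):
    x = np.asarray(x, dtype=np.float32)
    reduce_axes = tuple(range(begin_norm_axis, x.ndim))
    mean = x.mean(axis=reduce_axes, keepdims=True)
    variance = ((x - mean) ** 2).mean(axis=reduce_axes, keepdims=True)
    normalized = (x - mean) / np.sqrt(variance + epsilon)
    return gamma * normalized + beta, mean, variance
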
def reduce_max_d(x, y, axis, keepdims=False, kernel_name="reduce_max_d"):
    """
    calculating data

    Parameters
    ----------
    x : dict
        shape and dtype of input
    y : dict
        shape and dtype of output, should be same shape and type as input
    axis : list
        the first axis to reduce, may be negative to index from the end
        (e.g., -1 for the last axis). axis may be int or list (e.g. [1, 2])
    keepdims : bool
        if true, retains reduced dimensions with length 1,
        default value is None
    kernel_name : str
        kernel name, default value is "reduce_max_d"

    Returns
    -------
    None
    """
    shape = x.get("shape")
    dtype = x.get("dtype")
    input_dtype = dtype.lower()

    check_shape(shape, param_name="x")
    check_list = ["float16", "float32", "int8", "uint8", "int32"]
    check_dtype(input_dtype, check_list, param_name="x")

    shape_len = len(shape)

    if not axis:
        axis = range(shape_len)
    if hasattr(axis, 'index'):
        axis = list(axis)
    axis = util.axis_check(shape_len, axis)

    # Shape should not be modified in 5HD mode
    # 5HD Special param for 5hd schedule
    is_5hdc = util.check_and_init_5hdc_reduce_support(x, axis)
    if not is_5hdc:
        shape, axis = util.shape_refine(list(shape), axis)
        shape, axis = util.simplify_axis_shape(shape, axis)

    shape_len = len(shape)
    x["shape"] = shape

    if input_dtype in ("float32", "int32") and len(axis) == 1 \
            and ((axis[0] == (shape_len - 1)) or (axis[0] == -1)):
        reduce_max_d_tik(x, y, axis[0], kernel_name)
    else:
        data_input = tvm.placeholder(shape,
                                     name="data_input_" + kernel_name,
                                     dtype=input_dtype)
        res = reduce_max_d_compute(data_input, y, axis, keepdims, kernel_name)

        if is_5hdc:
            res.ori_shape = x["ori_shape"]
            res.ori_format = x["ori_format"]

        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        config = {"name": kernel_name,
                  "tensor_list": [data_input, res]}
        te.lang.cce.cce_build_code(sch, config)

def log_softmax_v2(input_x, output_y, axis=-1, kernel_name="log_softmax_v2",
                   impl_mode="high_performance"):
    """
    algorithm: log_softmax
    calculating data's log_softmax, x - log(sum(exp(x)))

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32
    output_y : dict
        shape and dtype of output, should be same shape and type as input
    axis : int, list or tuple
        the data's axis, range is [-d, d-1]
    kernel_name : str
        cce kernel name, default value is log_softmax_v2

    Returns
    -------
    None
    """
    check_list = ("float16", "float32")
    shape = input_x.get("shape")
    input_dtype = input_x.get("dtype").lower()
    shape_len = len(shape)
    shape_list = list(shape)

    if not isinstance(axis, int):
        axis = list(axis)

    check_shape(shape, param_name="input_x")
    check_dtype(input_dtype, check_list, param_name="input_x")

    axis = util.axis_check(shape_len, axis)

    if not isinstance(axis, int):
        for i in axis:
            if shape_list[i] == 1:
                raise RuntimeError("Cannot reduce on an axis with dimension 1")
    else:
        if shape_list[axis] == 1:
            raise RuntimeError("Cannot reduce on an axis with dimension 1")

    shape, axis = util.shape_refine(list(shape), axis)
    shape, axis = util.simplify_axis_shape(shape, axis)

    data_input = tvm.placeholder(shape, name="data_input", dtype=input_dtype)
    result = log_softmax_v2_compute(data_input, output_y, axis=axis,
                                    kernel_name=kernel_name,
                                    impl_mode=impl_mode)

    with tvm.target.cce():
        sch = generic.auto_schedule(result)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data_input, result]
    }
    te.lang.cce.cce_build_code(sch, config)

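# Illustrative only: a NumPy reference of log_softmax, x - log(sum(exp(x))),
# written in the max-subtracted form for numerical stability. The helper
# name is hypothetical.
import numpy as np

def _log_softmax_numpy_reference(x, axis=-1):
    x = np.asarray(x, dtype=np.float32)
    shifted = x - x.max(axis=axis, keepdims=True)
    return shifted - np.log(np.exp(shifted).sum(axis=axis, keepdims=True))
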
def reduce_all_d(input_data, output_data, axes,
                 keepdims=None, kernel_name="reduce_all_d"):
    """
    Reduce a tensor on certain axes based on min

    Parameters:
    ----------
    input_data : dict
        shape and dtype of input_data, only support int8
    output_data : dict
        source data type, only support int8
    axes : int, list, tuple or None
        the first axes to reduce, may be negative to index from the end
        (e.g., -1 for the last axis). axes may be int or list (e.g. [1, 2])
    keepdims : bool or None
        if true, retains reduced dimensions with length 1,
        default value is None
    kernel_name : str
        cce kernel name, default value is "reduce_all_d"

    Returns
    -------
    None
    """
    input_shape = input_data.get("shape")
    input_dtype = input_data.get("dtype").lower()
    if input_dtype == "bool":
        input_dtype = "int8"

    check_shape(input_shape, param_name="input_data")
    check_dtype(input_dtype, ("int8",), param_name="input_data")

    shape_len = len(input_shape)

    if not axes:
        axes = range(shape_len)
    if hasattr(axes, 'index'):
        axes = list(axes)
    axes = util.axis_check(shape_len, axes)

    if not isinstance(axes, int):
        for i in axes:
            if i >= len(input_shape):
                raise RuntimeError("axes should be less than dimension")
    else:
        if axes >= len(input_shape):
            raise RuntimeError("axes should be less than dimension")

    # 5HD Special param for 5hd schedule
    is_5hdc = util.check_and_init_5hdc_reduce_support(input_data, axes)
    if not is_5hdc:
        input_shape, axes = util.shape_refine(list(input_shape), axes)
        input_shape, axes = util.simplify_axis_shape(input_shape, axes)

    data_input = tvm.placeholder(input_shape,
                                 name="data_input_" + kernel_name,
                                 dtype=input_dtype)
    result = reduce_all_d_compute(data_input, output_data, axes,
                                  keepdims, kernel_name)
    if is_5hdc:
        result.ori_shape = input_data["ori_shape"]
        result.ori_format = input_data["ori_format"]

    with tvm.target.cce():
        sch = generic.auto_schedule(result)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data_input, result]
    }
    te.lang.cce.cce_build_code(sch, config)

def reduce_any_d(x, y, axes, keepdims=None, kernel_name="reduce_any_d"):
    """
    Reduce a tensor on certain axes based on max

    Parameters:
    ----------
    x : shape and dtype of input_data, only support int8
    y : shape and dtype of output_res, reserved parameter, not used now
    axes : the first axes to reduce, may be negative to index from the end
           (e.g., -1 for the last axis). axes may be int or list (e.g. [1, 2])
    keepdims : if true, retains reduced dimensions with length 1,
               default value is None
    kernel_name : cce kernel name, default value is "reduce_any_d"

    Returns
    -------
    None
    """
    shape = x.get("shape")
    dtype = x.get("dtype")

    check_shape(shape, param_name="x")

    if dtype == "bool":
        dtype = "int8"

    check_list = ("int8", )
    check_dtype(dtype, check_list, param_name="x")

    shape_len = len(shape)

    if not axes:
        axes = range(shape_len)
    if hasattr(axes, 'index'):
        axes = list(axes)
    axes = util.axis_check(shape_len, axes)

    is_5hdc = util.check_and_init_5hdc_reduce_support(x, axes)
    if not is_5hdc:
        shape, axes = util.shape_refine(list(shape), axes)
        shape, axes = util.simplify_axis_shape(shape, axes)

    inp_dtype = dtype.lower()
    data_input = tvm.placeholder(shape,
                                 name="data_input_" + kernel_name,
                                 dtype=inp_dtype)
    res = reduce_any_d_compute(data_input, y, axes, keepdims, kernel_name)

    if is_5hdc:
        res.ori_shape = x["ori_shape"]
        res.ori_format = x["ori_format"]

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_input, res]}
    te.lang.cce.cce_build_code(sch, config)

def reduce_min_d(input_min, output_min, axis,
                 keep_dims=None, kernel_name="reduce_min_d"):
    """
    Reduce a tensor on a certain axis based on min

    Parameters:
    ----------
    input_min : dict
        dict of input, which contains shape and dtype
    output_min : dict
        dict of output, which contains shape and dtype
    axis : int or None
        The dimensions to reduce. If None (the default), reduces all
        dimensions. Must be in the range
        (-rank(input_tensor), rank(input_tensor))
    keep_dims : True or False
        if true, retains reduced dimensions with length 1,
        default value is None
    kernel_name : str
        cce kernel name, default value is "reduce_min_d"

    Returns
    -------
    None
    """
    shape_input = input_min.get("shape")
    dtype_input = input_min.get("dtype")

    check_shape(shape_input, param_name="input_min")
    check_list = ("float16", "float32", "int8", "uint8")
    check_dtype(dtype_input.lower(), check_list, param_name="input_min")

    shape_len = len(shape_input)

    if not axis:
        axis = range(shape_len)
    if hasattr(axis, 'index'):
        axis = list(axis)
    axis = util.axis_check(shape_len, axis)

    is_5hdc = util.check_and_init_5hdc_reduce_support(input_min, axis)
    if not is_5hdc:
        shape_input, axis = util.shape_refine(list(shape_input), axis)
        shape_input, axis = util.simplify_axis_shape(shape_input, axis)

    data_input = tvm.placeholder(shape_input,
                                 name="data_input_" + kernel_name,
                                 dtype=dtype_input.lower())
    shape_len = len(shape_input)

    if dtype_input.lower() in ("float32", "int32") and len(axis) == 1 \
            and ((axis[0] == (shape_len - 1)) or (axis[0] == -1)):
        input_min["shape"] = tuple(shape_input)
        reduce_min_d_tik.reduce_min_d_tik(input_min, output_min, -1,
                                          kernel_name)
    else:
        res = reduce_min_d_compute(data_input, output_min, axis, keep_dims,
                                   kernel_name)
        if is_5hdc:
            res.ori_shape = input_min["ori_shape"]
            res.ori_format = input_min["ori_format"]

        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        config = {"name": kernel_name,
                  "tensor_list": [data_input, res]}
        te.lang.cce.cce_build_code(sch, config)

def op_select_format(input_x, output_y, tiles, axis=1,
                     kernel_name="tile_with_axis"):
    """
    select format dynamically
    """
    ori_format = input_x.get("ori_format")
    ori_shape = input_x.get("ori_shape")

    if ori_shape is not None:
        axis = util.axis_check(len(ori_shape), axis)

    cce_product = tbe_platform.cce_conf.get_soc_spec("SOC_VERSION")

    # for 5hd, axis is only valid for n, h, w
    if ((ori_format == "NHWC" and axis != 3) or
            (ori_format == "NCHW" and axis != 1)) and len(ori_shape) == 4:
        # NC1HWC0 + ND
        if cce_product in ("Hi3796CV300ES", "Hi3796CV300CS"):
            # fp16
            input0 = gen_param(
                classify="input0",
                name="x",
                datatype=
                "float16,int8,int16,int32,int64,uint8,uint16,uint32,uint64,"
                "float16,int8,int16,int32,int64,uint8,uint16,uint32,uint64",
                format="ND,ND,ND,ND,ND,ND,ND,ND,ND,"
                "NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0"
            )
            output0 = gen_param(
                classify="output0",
                name="y",
                datatype=
                "float16,int8,int16,int32,int64,uint8,uint16,uint32,uint64,"
                "float16,int8,int16,int32,int64,uint8,uint16,uint32,uint64",
                format="ND,ND,ND,ND,ND,ND,ND,ND,ND,"
                "NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0"
            )
        else:
            # fp16/fp32
            input0 = gen_param(
                classify="input0",
                name="x",
                datatype=
                "float16,float32,int8,int16,int32,int64,uint8,uint16,uint32,uint64,"
                "float16,float32,int8,int16,int32,int64,uint8,uint16,uint32,uint64",
                format="ND,ND,ND,ND,ND,ND,ND,ND,ND,ND,NC1HWC0,"
                "NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0"
            )
            output0 = gen_param(
                classify="output0",
                name="y",
                datatype=
                "float16,float32,int8,int16,int32,int64,uint8,uint16,uint32,uint64,"
                "float16,float32,int8,int16,int32,int64,uint8,uint16,uint32,uint64",
                format="ND,ND,ND,ND,ND,ND,ND,ND,ND,ND,NC1HWC0,"
                "NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0"
            )
    else:
        # ND
        if cce_product in ("Hi3796CV300ES", "Hi3796CV300CS"):
            # fp16
            input0 = gen_param(
                classify="input0",
                name="x",
                datatype=
                "float16,int8,int16,int32,int64,uint8,uint16,uint32,uint64",
                format="ND,ND,ND,ND,ND,ND,ND,ND,ND")
            output0 = gen_param(
                classify="output0",
                name="y",
                datatype=
                "float16,int8,int16,int32,int64,uint8,uint16,uint32,uint64",
                format="ND,ND,ND,ND,ND,ND,ND,ND,ND")
        else:
            # fp16/fp32
            input0 = gen_param(
                classify="input0",
                name="x",
                datatype=
                "float16,float32,int8,int16,int32,int64,uint8,uint16,uint32,uint64",
                format="ND,ND,ND,ND,ND,ND,ND,ND,ND,ND")
            output0 = gen_param(
                classify="output0",
                name="y",
                datatype=
                "float16,float32,int8,int16,int32,int64,uint8,uint16,uint32,uint64",
                format="ND,ND,ND,ND,ND,ND,ND,ND,ND,ND")

    param_list = [input0, output0]
    param_dynamic_in_json = get_dynamic_param_in_json(param_list)

    return param_dynamic_in_json

def split_v_d(input_value, output_data, size_splits, split_dim, num_split,
              kernel_name="split_v_d"):
    """Split a tensor into len(size_splits) tensors along one dimension.

    Parameters
    ----------
    input_value: dict
        the dict of input tensor.
    output_data: list or tuple
        the list of output tensor.
    size_splits: list or tuple
        a Python list containing the sizes of each output tensor
        along `split_dim`.
    split_dim: int
        the dimension along which to split_d.
    num_split: int
        used to specify the number of outputs.
    kernel_name: str
        cce kernel name, default value is "split_v_d".

    Returns
    -------
    None.
    """
    input_format = input_value.get("format")
    ori_format = input_value.get("ori_format")
    if input_format == "NC1HWC0":
        split_dim = util.axis_transfrom_5d(split_dim, ori_format)
        split_with_5hd_not_align = \
            SplitWith5HD(input_value, output_data,
                         split_dim, num_split, kernel_name)
        if split_with_5hd_not_align.check_5hd_vnchw():
            split_with_5hd_not_align.do_5hd_split_cut_by_batch()
            return
        if split_dim == 1:
            size_splits = list(size_splits)
            size_splits = [size // 16 for size in size_splits]

    shape = input_value.get("shape")
    dtype = input_value.get("dtype")
    dtype_lower = dtype.lower()
    check_list = ("int8", "int16", "int32", "int64", "uint8", "uint16",
                  "uint32", "uint64", "float16", "float32")

    check_shape(shape, param_name="input_value")
    check_dtype(dtype_lower, check_list, param_name="input_value")

    shape_len = len(shape)
    split_dim = util.axis_check(shape_len, split_dim)

    dim = shape[split_dim]

    if len(size_splits) + 1 == num_split or len(size_splits) == 0:
        split_list = []
        split_sum = 0
        if len(size_splits) != 0:
            for i, _ in enumerate(size_splits):
                split_list.append(size_splits[i])
                split_sum = split_sum + size_splits[i]
            if dim - split_sum > 0:
                split_list.append(dim - split_sum)
        else:
            batch = dim / num_split
            for i in range(0, num_split):
                split_list.append(int(batch))
        size_splits = split_list

    size_splits = list(size_splits)
    size_splits_sum = 0
    for size in size_splits:
        if size != -1:
            size_splits_sum += size
    if dim != size_splits_sum:
        for i, _ in enumerate(size_splits):
            if _ == -1:
                size_splits[i] = dim - size_splits_sum

    size_sum = 0
    for size in size_splits:
        if size < 1:
            raise RuntimeError(
                "The size (%d) of size_splits must be greater or equal to %d"
                % (size, 1))
        size_sum = size_sum + size
    if size_sum != shape[split_dim]:
        raise RuntimeError(
            "The sum size (%d) of size_splits must be equal to the length of "
            "split_dim (%d)" % (size_sum, shape[split_dim]))
    if len(size_splits) != num_split:
        raise RuntimeError(
            "The length (%d) of size_splits must be equal to num_split(%d)"
            % (len(size_splits), num_split))

    if num_split == 1:
        copy_only(input_value, input_value, kernel_name)
        return

    split_mov = SplitMov(shape, dtype_lower, split_dim, num_split,
                         size_splits, kernel_name)
    new_shape = split_mov.input_shape
    new_split_dim = split_mov.split_dim
    new_size_splits = split_mov.size_splits
    new_output_shapes = split_mov.output_shapes
    input_size = functools_reduce(lambda x, y: x * y, new_shape)

    last_dim_same = True
    input_last_dim = new_output_shapes[0][-1]
    for i, _ in enumerate(new_output_shapes):
        if input_last_dim != new_output_shapes[i][-1]:
            last_dim_same = False
            break

    if dtype_lower == "float16" and new_split_dim == len(new_shape) - 1 and \
            last_dim_same and new_size_splits[0] == 1 and num_split <= 16 \
            and input_size >= TRANSPOSE_SIZE * num_split:
        split_vnc = SplitLastDimVnv(new_shape, dtype_lower,
                                    new_output_shapes, new_split_dim,
                                    num_split, kernel_name)
        split_vnc.split_last_dim_vnc_compute()
        return

    if check_use_last_dim_branch(new_shape, dtype_lower, new_split_dim,
                                 num_split, new_size_splits):
        split_last_dim(new_shape, dtype_lower, new_split_dim, num_split,
                       new_size_splits, kernel_name)
        return

    if split_mov.check_whether_use_split_mov():
        split_mov.split_mov_compute()
        return

    data = tvm.placeholder(shape, name="data", dtype=dtype_lower)
    output_shape_list, output_tensor_list = split_v_d_compute(
        data, output_data, size_splits, split_dim, num_split, kernel_name)

    sch, build_list = te.lang.cce.split_schedule_com(data, split_dim,
                                                     output_shape_list,
                                                     output_tensor_list)

    with build_config:
        tvm.build(sch, build_list, "cce", name=kernel_name)

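# Illustrative only: how a -1 entry in size_splits is resolved against the
# length of split_dim, mirroring the bookkeeping in split_v_d above.
# The helper name is hypothetical.
def _resolve_size_splits_sketch(dim, size_splits):
    known = sum(size for size in size_splits if size != -1)
    return [dim - known if size == -1 else size for size in size_splits]

# Example: _resolve_size_splits_sketch(10, [2, -1, 3]) -> [2, 5, 3]
# The resolved sizes must then sum to dim, otherwise the op raises an error.
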
def split_d(input_value, output_data, split_dim, num_split,
            kernel_name="split_d"):
    """Split a tensor into `num_split` tensors along one dimension.

    Parameters
    ----------
    input_value: dict
        the dict of input tensor.
    output_data: list or tuple
        the list of output tensor.
    split_dim: int
        the dimension along which to split_d.
    num_split: int
        an integer indicating the number of split_d along `split_dim`.
    kernel_name: str
        cce kernel name, default value is "split_d".

    Returns
    -------
    None.
    """
    input_format = input_value.get("format")
    ori_format = input_value.get("ori_format")
    if input_format == "NC1HWC0":
        split_dim = util.axis_transfrom_5d(split_dim, ori_format)

    shape = input_value.get("shape")
    dtype = input_value.get("dtype")
    dtype_lower = dtype.lower()
    check_list = ("int8", "int16", "int32", "int64", "uint8", "uint16",
                  "uint32", "uint64", "float16", "float32")

    check_shape(shape, param_name="input_value")
    check_dtype(dtype_lower, check_list, param_name="input_value")

    shape_len = len(shape)
    split_dim = util.axis_check(shape_len, split_dim)

    if num_split < 1:
        raise RuntimeError(
            "The num_split (%d) must be greater or equal to %d"
            % (num_split, 1))

    split_with_5hd_not_align = \
        SplitWith5HD(input_value, output_data,
                     split_dim, num_split, kernel_name)
    if split_with_5hd_not_align.check_5hd_vnchw():
        split_with_5hd_not_align.do_5hd_split_cut_by_batch()
        return

    if shape[split_dim] % num_split != 0:
        raise RuntimeError(
            "The num_split (%d) must be divisible by the length of "
            "split_dim (%d)" % (num_split, shape[split_dim]))

    if num_split == 1:
        copy_only(input_value, input_value, kernel_name)
        return

    split_mov = SplitMov(shape, dtype_lower, split_dim, num_split, None,
                         kernel_name)
    new_shape = split_mov.input_shape
    new_split_dim = split_mov.split_dim
    new_size_splits = split_mov.size_splits
    new_output_shapes = split_mov.output_shapes
    input_size = functools_reduce(lambda x, y: x * y, new_shape)

    if dtype_lower == "float16" and new_split_dim == len(new_shape) - 1 and \
            new_size_splits[0] == 1 and num_split <= 16 \
            and input_size >= TRANSPOSE_SIZE * num_split:
        split_vnc = SplitLastDimVnv(new_shape, dtype_lower,
                                    new_output_shapes, new_split_dim,
                                    num_split, kernel_name)
        split_vnc.split_last_dim_vnc_compute()
        return

    if check_use_last_dim_branch(new_shape, dtype_lower, new_split_dim,
                                 num_split, new_size_splits):
        split_last_dim(new_shape, dtype_lower, new_split_dim, num_split,
                       new_size_splits, kernel_name)
        return

    if split_mov.check_whether_use_split_mov():
        split_mov.split_mov_compute()
        return

    data = tvm.placeholder(shape, name="data", dtype=dtype_lower)
    output_shape_list, output_tensor_list = split_d_compute(
        data, output_data, split_dim, num_split, kernel_name)

    sch, build_list = te.lang.cce.split_schedule_com(data, split_dim,
                                                     output_shape_list,
                                                     output_tensor_list)

    with build_config:
        tvm.build(sch, build_list, "cce", name=kernel_name)

def binary_cross_entropy_compute(x, y, weight, output,
                                 reduction, kernel_name):
    """
    calculating binary_cross_entropy

    Parameters
    ----------
    x : TVM tensor
        the output of previous layer
    y : TVM tensor
        label
    weight :
        a manual rescaling weight given to the loss of each batch element.
        If given, has to be a Tensor of size nbatch
    output :
        loss result after compute
    reduction :
        reduce configuration parameter: mean/sum/none. Default: mean
    kernel_name : str
        kernel name, default value is "binary_cross_entropy"

    Returns
    -------
    result : TVM tensor
        output tensor
    """
    ori_dtype = x.dtype
    trans_dtype = ori_dtype
    shape = te.lang.cce.util.shape_to_list(x.shape)
    if ori_dtype == "float16" and tbe_platform.cce_conf.api_check_support(
            "te.lang.cce.vmul", "float32"):
        x = te.lang.cce.cast_to(x, "float32")
        y = te.lang.cce.cast_to(y, "float32")
        if weight is not None:
            weight = te.lang.cce.cast_to(weight, "float32")
        trans_dtype = "float32"

    const_one = tvm.const(1, trans_dtype)
    const_neg_one = tvm.const(-1, trans_dtype)

    # calculate: y * log(x)
    x = te.lang.cce.vmaxs(x, tvm.const(SCALAR_EPS, trans_dtype))
    x_log_tmp = te.lang.cce.vlog(x, priority_flag=1)
    data_mul1 = te.lang.cce.vmul(x_log_tmp, y)

    # calculate: (1 - y) * log(1 - x)
    x_neg_tmp = te.lang.cce.vmuls(x, const_neg_one)
    x1_tmp = te.lang.cce.vadds(x_neg_tmp, const_one)
    y_neg_tmp = te.lang.cce.vmuls(y, const_neg_one)
    y1_tmp = te.lang.cce.vadds(y_neg_tmp, const_one)
    x1_tmp = te.lang.cce.vmaxs(x1_tmp, tvm.const(SCALAR_EPS, trans_dtype))
    x1_log_tmp = te.lang.cce.vlog(x1_tmp, priority_flag=1)
    data_mul2 = te.lang.cce.vmul(x1_log_tmp, y1_tmp)

    # calculate: y * log(x) + (1 - y) * log(1 - x)
    data_sum = te.lang.cce.vadd(data_mul1, data_mul2)

    # calculate: -(y * log(x) + (1 - y) * log(1 - x))
    result = te.lang.cce.vmuls(data_sum, const_neg_one)

    if weight is not None:
        result = te.lang.cce.vmul(result, weight)

    # get total number of tensor elements
    reduce_elts = 1.0
    for i in shape:
        reduce_elts *= i
    cof = reduce_elts**(-1)

    # get all axes for reduce
    axis_d = []
    for i, _ in enumerate(shape):
        axis_d.append(i)
    axis_d = util.axis_check(len(shape), axis_d)

    if reduction == "mean":
        result = te.lang.cce.vmuls(result, cof)
        result = te.lang.cce.sum(result, axis=axis_d, keepdims=False)
    elif reduction == "sum":
        result = te.lang.cce.sum(result, axis=axis_d, keepdims=False)
    elif reduction == "none":
        pass

    if ori_dtype == "float16":
        result = te.lang.cce.cast_to(result, "float16")

    return result

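# Illustrative only: a NumPy reference of the binary cross entropy computed
# above, including the epsilon clamp on x and 1 - x and the optional weight
# and reduction. The helper name is hypothetical, and `eps` stands in for
# SCALAR_EPS; its exact value here is an assumption.
import numpy as np

def _binary_cross_entropy_numpy_reference(x, y, weight=None,
                                          reduction="mean", eps=1e-12):
    x = np.maximum(np.asarray(x, dtype=np.float32), eps)
    y = np.asarray(y, dtype=np.float32)
    loss = -(y * np.log(x) + (1.0 - y) * np.log(np.maximum(1.0 - x, eps)))
    if weight is not None:
        loss = loss * weight
    if reduction == "mean":
        return loss.mean()
    if reduction == "sum":
        return loss.sum()
    return loss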