Example #1
def _set_reduce_axis(reduce_tensor):
    shape_reduce = te.lang.cce.util.shape_to_list(reduce_tensor.shape)
    axis_d = []
    for i, _ in enumerate(shape_reduce):
        axis_d.append(i)
    axis_d = util.axis_check(len(shape_reduce), axis_d)
    return axis_d
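
All of these examples funnel their axis arguments through util.axis_check. A minimal pure-Python sketch of the kind of normalization they rely on (illustrative only; axis_check_sketch is a made-up name, not the actual TE helper):

def axis_check_sketch(rank, axis):
    """Map a possibly-negative int or list of ints into [0, rank)."""
    def _one(a):
        if not -rank <= a < rank:
            raise ValueError("axis %d is out of range for rank %d" % (a, rank))
        return a % rank
    if isinstance(axis, int):
        return _one(axis)
    return [_one(a) for a in axis]

# e.g. axis_check_sketch(4, [-1, 2]) returns [3, 2]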
Example #2
def reduce_max_d(x, y, axes=None, keepdims=None, kernel_name="reduce_max_d"):
    """
    Reduce a tensor on certain axes based on max.

    Parameters
    ----------
    x : dict
        shape and dtype of input
    y : dict
        shape and dtype of output, should be same shape and type as input
    axes: list
        the axes to reduce, may be negative to index from the end
        (e.g., -1 for the last axis).
        axes may be int or list (e.g. [1,2])
    keepdims: bool
        if true, retains reduced dimensions with length 1,
        default value is None
    kernel_name : str
        kernel name, default value is "reduce_max_d"

    Returns
    -------
    None
    """

    dtype = x["dtype"]
    dtype_lower = dtype.lower()
    check_list = ("float16", "float32", "int8", "uint8", "int32")
    check_dtype(dtype_lower, check_list)

    with te.op.compute():
        shape = x["shape"]
        shape_range = x["range"]

        shape_len = len(shape)
        if not axes:
            axes = range(shape_len)
        if hasattr(axes, 'index'):
            axes = list(axes)
        axes = cce_util.axis_check(shape_len, axes)

        shape_new, shape_range_new, axes_new, fused_rel_dic = \
            fused_reduce_axis(shape, shape_range, axes)
        add_compile_info("fused_rel_dic", fused_rel_dic)

        x["shape"] = shape_new
        x["range"] = shape_range_new
        shape_var_new = variable_shape([x])[0]

        data_input = tvm.placeholder(shape_var_new, name="data_input",
                                     dtype=dtype_lower)
        res = reduce_max_d_compute(data_input, y, axes_new, keepdims)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    # build
    config = {"name": kernel_name,
              "tensor_list": [data_input, res]}
    te.lang.dynamic.build(sch, config)
Example #3
def _param_check(shape_x, dtype_x, axis, kernel_name):
    """
    Check the input parameter

    Parameters
    ----------
    shape_x: tuple or list
        the shape of input tensor
    dtype_x: string
        the dtype of input tensor
    axis: list
        the axis list for reverse
    kernel_name: str
        kernel name, default value is "reverse_ext2"

    Returns:
    axis: list
    """
    check_shape(shape_x, param_name="input_x")
    check_list = ("int8", "int16", "int32", "int64", "uint8", "uint16",
                  "uint32", "uint64", "float16", "float32")
    check_dtype(dtype_x.lower(), check_list, param_name="input_x")
    axis = list(set(axis))
    axis = util.axis_check(len(shape_x), axis)

    return axis
Example #4
def caffe_reduction_layer_compute(placeholders,
                                  shape,
                                  dtype,
                                  axis,
                                  op,
                                  coeff,
                                  kernel_name="cce_reductionLayer",
                                  need_build=False,
                                  need_print=False):
    """
        Since the shape of the placeholder created by caffe_reduce is not the
        same as input_shape, fusion_op cannot fuse two ops that have
        different shapes. So the caffe_reduce op cannot be fused until TVM
        supports reshape in D.
    """
    data = placeholders[0]
    inp_dtype = dtype.lower()

    axis = util.axis_check(len(shape), axis)
    shape = list(shape)
    shape1 = shape[:axis] + [
        functools_reduce(lambda x, y: x * y, shape[axis:])
    ]
    shape1, axis = util.shape_refine(shape1, axis)
    if not axis:
        axis = [0]
        shape1 = [1] + shape1

    if op == "ASUM":
        data_tmp_input = te.lang.cce.vabs(data)
        cof = coeff
        tmp = te.lang.cce.vmuls(data_tmp_input, cof)
    elif op == "SUMSQ":
        data_tmp_input = te.lang.cce.vmul(data, data)
        cof = coeff
        tmp = te.lang.cce.vmuls(data_tmp_input, cof)
    elif op == "MEAN":
        size = shape1[-1]
        cof = float(coeff) * (size**(-1))
        if inp_dtype == "int8" \
                or inp_dtype == "uint8":
            data1 = te.lang.cce.vmuls(data, 1.0)
            data_cast = te.lang.cce.cast_to(data1, "float32")
            tmp = te.lang.cce.vmuls(data_cast, cof)
        else:
            tmp = te.lang.cce.vmuls(data, cof)
    elif op == "SUM":
        cof = coeff
        data_tmp_input = te.lang.cce.vmuls(data, cof)
        tmp = data_tmp_input

    res = te.lang.cce.sum(tmp, axis=axis)
    # Although the data type (int8/uint8) has changed,
    # the data values remain integer
    # during the calculation of other operators (SUM/ASUM/SUMSQ).
    if op != "MEAN":
        res = te.lang.cce.cast_to(res, inp_dtype, f1628IntegerFlag=True)
    return res
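
For comparison, a rough NumPy reference for the four reduction modes handled above (an illustrative sketch, not the TE implementation; it flattens everything from `axis` onward the way the shape1 reshape does):

import numpy as np

def caffe_reduction_reference(data, axis, op, coeff):
    # collapse the dimensions from `axis` onward into one, mirroring shape1
    flat = data.reshape(data.shape[:axis] + (-1,))
    if op == "ASUM":
        return coeff * np.abs(flat).sum(axis=-1)
    if op == "SUMSQ":
        return coeff * (flat * flat).sum(axis=-1)
    if op == "MEAN":
        return coeff * flat.mean(axis=-1)
    return coeff * flat.sum(axis=-1)      # "SUM"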
Example #5
def reduce_sum_d(x, y, axis=None, keepdims=None, kernel_name="reduce_sum_d"):
    """reduce a tensor on a certain axis based on sum.

    Parameters:
    ----------
    x: dict
        the dict of input tensor.
    y: dict
        the dict of output tensor.
    axis: int, list, tuple or NONETYPE
        the axis for reduce.
    keepdims: bool or NONETYPE
        if true, retains reduced dimensions with length 1.
    kernel_name: str
        cce kernel name, default value is "reduce_sum_d".

    Returns
    -------
    None
    """

    dtype = x["dtype"]
    dtype_lower = dtype.lower()
    check_list = ("float16", "float32")
    check_dtype(dtype_lower, check_list, param_name="x")

    with te.op.compute():
        shape = x["shape"]
        shape_range = x["range"]

        axes = []
        shape_len = len(shape)
        if not axis:
            for i, _ in enumerate(shape):
                axes.append(i)
        else:
            axes = list(axis)
        axes = cce_util.axis_check(shape_len, axes)

        shape_new, shape_range_new, axes_new, fused_rel_dic = \
            fused_reduce_axis(shape, shape_range, axes)

        add_compile_info("fused_rel_dic", fused_rel_dic)
        x["shape"] = shape_new
        x["range"] = shape_range_new
        shape_var_new = variable_shape([x])[0]

        data_input = tvm.placeholder(shape_var_new,
                                     name="data_input",
                                     dtype=dtype_lower)
        res = reduce_sum_d_compute(data_input, y, axes_new, keepdims)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    # build
    config = {"name": kernel_name, "tensor_list": [data_input, res]}
    te.lang.dynamic.build(sch, config)
Example #6
def reduce_sum_d(x, y, axis, keepdims=None, kernel_name="reduce_sum_d"):
    """reduce a tensor on a certain axis based on sum.

    Parameters:
    ----------
    x: dict
        the dict of input tensor.
    y: dict
        the dict of output tensor.
    axis: int, list, tuple or NONETYPE
        the axis for reduce.
    keepdims: bool or NONETYPE
        if true, retains reduced dimensions with length 1.
    kernel_name: str
        cce kernel name, default value is "reduce_sum_d".

    Returns
    -------
    None
    """
    shape = x.get("shape")
    dtype = x.get("dtype")
    dtype_lower = dtype.lower()
    check_list = ("float16", "float32")

    check_shape(shape, param_name="x")
    check_dtype(dtype_lower, check_list, param_name="x")

    axis_d = []
    shape_len = len(shape)
    if not axis:
        for i, _ in enumerate(shape):
            axis_d.append(i)
    else:
        axis_d = list(axis)
    axis_d = util.axis_check(shape_len, axis_d)
    # 5HD Special param for 5hd schedule
    is_5hdc = util.check_and_init_5hdc_reduce_support(x, axis)

    if not keepdims and not is_5hdc:
        shape, axis_d = util.shape_refine(list(shape), axis_d, keepdims)
        shape, axis_d = util.simplify_axis_shape(shape, axis_d)

    data_input = tvm.placeholder(shape, name="data_input_" + kernel_name,
                                 dtype=dtype_lower)
    res = reduce_sum_d_compute(data_input, y, axis_d, keepdims,
                               is_5hdc=is_5hdc)
    if is_5hdc:
        res.ori_shape = x["ori_shape"]
        res.ori_format = x["ori_format"]

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_input, res]}
    te.lang.cce.cce_build_code(sch, config)
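
Semantically, the static kernel above is a plain sum reduction; a small NumPy equivalent (illustrative only, ignoring the 5HD and shape-refine handling) looks like this:

import numpy as np

def reduce_sum_reference(x, axis=None, keepdims=False):
    # an empty or None axis means "reduce over all dimensions", as above
    if axis in (None, [], ()):
        axis = None
    elif isinstance(axis, int):
        axis = (axis,)
    else:
        axis = tuple(axis)
    return np.sum(x, axis=axis, keepdims=bool(keepdims))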
Example #7
def reduce_prod_d(x, y, axes, keep_dims=None, kernel_name="reduce_prod_d"):
    """
    Reduce a tensor on certain axes based on product.

    Parameters:
    ----------
    x : dict
        shape and dtype of input
    y: dict
        shape and dtype of output
    axes : int, list, tuple, NoneType
        The dimensions to reduce. If None (the default), reduces all dimensions.
        Must be in the range [-rank(input_tensor), rank(input_tensor)).
    keep_dims : bool, NoneType
        if true, retains reduced dimensions with length 1,
        default value is None.
    kernel_name : str
        cce kernel name, default value is reduce_prod_d
    Returns
    -------
    None
    """
    shape = x.get("shape")
    check_shape(shape, param_name="x")

    inp_dtype = x.get("dtype").lower()
    check_list = ["float16", "float32", "int8", "uint8"]
    check_dtype(inp_dtype, check_list, param_name="x")

    shape_len = len(shape)

    if not axes:
        axes = range(shape_len)

    if hasattr(axes, 'index'):
        axes = list(axes)

    axes = util.axis_check(shape_len, axes)
    util.check_reduce_shape_rule(shape)

    shape, axes = util.shape_refine(list(shape), axes)
    shape, axes = util.simplify_axis_shape(shape, axes)

    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)
    with tvm.target.cce():
        res = reduce_prod_d_compute(data_input, y, axes, keep_dims,
                                    kernel_name)
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data_input, res]
    }
    te.lang.cce.cce_build_code(sch, config)
Example #8
def mse_loss_compute(predict, label, reduction='mean', kernel_name="mse_loss"):
    '''
    calculating mse_loss
    :param predict: TVM tensor
                   the output of previous layer
    :param label: TVM tensor
                label
    :param reduction: str
                    reduce configuration parameter: mean/sum/none. Default: mean
    :param kernel_name: str
                    kernel name, default value is "mse_loss"
    :return: y
            when reduction=none: TVM tensor, output tensor
            when reduction=sum/mean: a scalar
    '''
    ori_dtype = predict.dtype
    shape = te.lang.cce.util.shape_to_list(predict.shape)

    if ori_dtype == "float16" and tbe_platform.cce_conf.api_check_support(
            "te.lang.cce.vmul", "float32"):
        predict = te.lang.cce.cast_to(predict, "float32")
        label = te.lang.cce.cast_to(label, "float32")

    # get the total number of elements in the tensor
    reduce_elts = 1.0
    for i in shape:
        reduce_elts *= i
    cof = reduce_elts**(-1)

    # get total axis for reduce
    axis_d = []
    for i, _ in enumerate(shape):
        axis_d.append(i)
    axis_d = util.axis_check(len(shape), axis_d)

    # compute (predict_n - label_n)^2
    res = te.lang.cce.vsub(predict, label)
    res_sqr = te.lang.cce.vmul(res, res)

    y = 0.0

    if reduction == 'mean':
        # compute the mean
        y = te.lang.cce.sum(res_sqr, axis=axis_d, keepdims=False)
        y = te.lang.cce.vmuls(y, cof)
    elif reduction == 'sum':
        # compute the sum
        y = te.lang.cce.sum(res_sqr, axis=axis_d, keepdims=False)
    elif reduction == 'none':
        y = res_sqr

    if ori_dtype == "float16":
        y = te.lang.cce.cast_to(y, "float16")

    return y
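
For comparison, a rough NumPy reference for the three reduction modes computed above (an illustrative sketch; mse_loss_reference is a made-up name, not part of the operator code):

import numpy as np

def mse_loss_reference(predict, label, reduction="mean"):
    sq = (predict - label) ** 2   # (predict_n - label_n)^2
    if reduction == "mean":
        return sq.mean()          # sum scaled by 1 / element count, like cof above
    if reduction == "sum":
        return sq.sum()
    return sq                     # reduction == "none": elementwise result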
Example #9
def _param_check(shape_x, dtype_x, axis, kernel_name):
    """check param

    Parameters
    ----------
    shape_x: list
        input shape
    dtype_x: str
        input dtype
    axis: int
        axis int num
    kernel_name: str
        kernel_name string

    Returns
    -------
    None
    """
    check_shape(shape_x, param_name="x")
    check_list = ("float16", "float32")
    check_dtype(dtype_x.lower(), check_list, param_name="x")
    axis = util.axis_check(len(shape_x), axis)
Example #10
def check_param(input_x, output_y, tiles, axis, kernel_name):
    """
    Check the input parameter

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    output_y : dict
        shape and dtype of output, should be same type as input
    axis: int
         The index of the axis to tile
    tiles: int
        The number of copies (tiles) of the blob to output.
    kernel_name : str
        kernel name, default value is "tile_with_axis"

    Returns
    ----------
    axis: int
         The index of the axis to tile which is adjusted to positive
    """
    shape_x = input_x.get("shape")
    dtype_x = input_x.get("dtype").lower()
    shape_y = output_y.get("shape")
    dtype_y = output_y.get("dtype").lower()

    op_utils.check_shape(shape_x, param_name="input_x")
    op_utils.check_shape(shape_y, param_name="input_y")

    check_list = [
        "int8", "int16", "int32", "int64", "uint8", "uint16", "uint32",
        "uint64", "float16", "float32"
    ]

    op_utils.check_dtype(dtype_x, check_list, param_name="input_x")
    op_utils.check_dtype(dtype_y, check_list, param_name="input_x")

    if dtype_x != dtype_y:
        error_info = {}
        error_info['errCode'] = 'E80019'
        error_info['op_name'] = 'tile_with_axis'
        error_info['input1_name'] = 'x'
        error_info['input2_name'] = 'y'
        error_info['input1_dtype'] = str(dtype_x)
        error_info['input2_dtype'] = str(dtype_y)
        raise RuntimeError(
            "In op[%s], the dtype of input[%s] and input[%s] should be the same, but actually are [%s] and [%s]."
            % (error_info['op_name'], error_info['input1_name'],
               error_info['input2_name'], error_info['input1_dtype'],
               error_info['input2_dtype']))

    if tiles <= 0:
        check_param_range('tiles', 1, 'inf', tiles)

    shape_x_len = len(shape_x)

    # check for 5HD
    input_format = input_x.get("format")
    if input_format == "NC1HWC0":
        shape_x_ori = input_x.get("ori_shape")
        ori_format = input_x.get("ori_format")
        length_x_ori = len(shape_x_ori)

        if ori_format not in ("NCHW", "NHWC"):
            raise RuntimeError("input_x's ori_format is invalid for 5D Tensor")
        if shape_x_len != 5:
            raise RuntimeError("input_x's shape is invalid for 5D Tensor")
        if length_x_ori != 4:
            raise RuntimeError("input_x's ori_shape is invalid for 5D Tensor")
        axis = util.axis_check(length_x_ori, axis)
        axis = util.axis_transfrom_5d(axis, ori_format)
        if axis in (1, 4):
            raise RuntimeError("axis is invalid for 5D Tensor")
    else:
        if axis >= shape_x_len or axis < -shape_x_len:
            check_param_range('axis', -shape_x_len, shape_x_len - 1, axis)

        if axis < 0:
            axis += shape_x_len

    shape_y_expected = [0] * shape_x_len
    shape_y_expected[0:shape_x_len] = shape_x[0:shape_x_len]
    shape_y_expected[axis] *= tiles

    if not check_same_shape(shape_y, shape_y_expected):
        error_info = {}
        error_info['errCode'] = 'E80017'
        error_info['op_name'] = 'tile_with_axis'
        error_info['attr_name'] = 'shape_y'
        error_info['expect_value'] = str(shape_y_expected)
        error_info['real_value'] = str(shape_y)
        raise RuntimeError(
            "In op[%s], the parameter[%s] should be [%s], but actually is [%s]."
            % (error_info['op_name'], error_info['attr_name'],
               error_info['expect_value'], error_info['real_value']))

    shape_x_adapt = []
    shape_y_adapt = []
    for i in range(shape_x_len):
        if i == axis:
            shape_x_adapt.append(1)
            shape_y_adapt.append(tiles)
            if shape_x[i] == 1:
                continue
        shape_x_adapt.append(shape_x[i])
        shape_y_adapt.append(shape_x[i])

    return axis, shape_x_adapt, shape_y_adapt, dtype_x
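
A concrete walk-through of the adaptation loop above, on assumed example values, showing how the tiled axis is split out into a dedicated dimension:

shape_x, axis, tiles = [2, 3, 4], 1, 2   # hypothetical inputs
shape_x_adapt, shape_y_adapt = [], []
for i, dim in enumerate(shape_x):
    if i == axis:
        shape_x_adapt.append(1)          # length-1 axis inserted in the input view
        shape_y_adapt.append(tiles)      # the same position carries the tile count
        if dim == 1:
            continue                     # a size-1 axis is absorbed, not duplicated
    shape_x_adapt.append(dim)
    shape_y_adapt.append(dim)
print(shape_x_adapt)   # [2, 1, 3, 4]
print(shape_y_adapt)   # [2, 2, 3, 4]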
Example #11
def reduce_mean_d(input_x,
                  output_y,
                  axes,
                  keepdims=None,
                  kernel_name="reduce_mean_d",
                  impl_mode="high_performance"):
    """
    Reduce a tensor on certain axes based on mean.

    Parameters:
    ----------
    input_x : dict
        shape and dtype of input
    output_y: dict
        shape and dtype of output
    axes : int, list, tuple, NoneType
        The dimensions to reduce. If None (the default), reduces all dimensions.
        Must be in the range [-rank(input_tensor), rank(input_tensor)).
    keepdims : bool, NoneType
        if true, retains reduced dimensions with length 1,
        default value is None.
    kernel_name : str
        cce kernel name, default value is reduce_mean_d

    Returns
    -------
    None
    """
    global ori_shape
    global ori_format
    shape = input_x.get("shape")
    check_shape(shape, param_name="input_x")
    check_list = ["float16", "float32"]
    shape_len = len(shape)

    if not axes:
        axes = range(shape_len)

    if hasattr(axes, 'index'):
        axes = list(axes)

    inp_dtype = input_x.get("dtype").lower()
    check_dtype(inp_dtype, check_list, param_name="input_x")

    axes = util.axis_check(shape_len, axes)

    # Shape should not be modified in 5HD mode
    # 5HD Special param for 5hd schedule
    is_5hdc = util.check_and_init_5hdc_reduce_support(input_x, axes)
    if not is_5hdc:
        shape, axes = util.shape_refine(list(shape), axes)
        shape, axes = util.simplify_axis_shape(shape, axes)

    ori_shape = [input_x["ori_shape"], input_x["shape"]]
    ori_format = [input_x["ori_format"], input_x["format"]]
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)
    res = reduce_mean_d_compute(data_input,
                                output_y,
                                axes,
                                keepdims,
                                impl_mode=impl_mode,
                                is_5hdc=is_5hdc)
    if is_5hdc:
        res.ori_shape = input_x["ori_shape"]
        res.ori_format = input_x["ori_format"]

    with tvm.target.cce():
        sch = generic.auto_schedule(res)
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data_input, res]
    }
    te.lang.cce.cce_build_code(sch, config)
Example #12
def log_softmax_grad(input_dy,
                     input_x,
                     output_z,
                     axis=-1,
                     kernel_name="log_softmax_grad"):
    """
    algorithm: log_softmax_grad
    calculating: gradient of log_softmax

    Parameters
    ----------
    input_dy : dict
        shape and dtype of grad input, only support float16, float32
    input_x : dict
        shape and dtype of input, only support float16, float32
    output_z: dict
        shape and dtype of output, should be the same shape and type as input
    axis: int, list or tuple
        the axis to reduce, may be negative to index from the end
        (e.g., -1 for the last axis).
        axis may be int or list (e.g. [1,2]),
        default value is -1
    kernel_name: str
        cce kernel name, default value is log_softmax_grad

    Returns
    -------
    None
    """
    check_list = ("float16", "float32")
    input_dtype = input_dy.get("dtype").lower()

    if not isinstance(axis, int):
        axis = list(axis)

    shape1 = input_dy.get("shape")
    shape2 = input_x.get("shape")
    check_shape(shape1, param_name="input_dy")
    check_shape(shape2, param_name="input_x")
    check_dtype(input_dtype, check_list, param_name="input_dy")

    axis = util.axis_check(len(shape1), axis)

    if not isinstance(axis, int):
        for i in axis:
            if list(shape1)[i] == 1:
                raise RuntimeError("Cannot reduce on an axis with dimension 1")
    else:
        if list(shape1)[axis] == 1:
            raise RuntimeError("Cannot reduce on an axis with dimension 1")

    if not operator.eq(list(shape1), list(shape2)):
        raise RuntimeError("all input shape must be equal")

    shape1, axis = util.shape_refine(list(shape1), axis)
    shape2 = shape1

    data1 = tvm.placeholder(shape1, dtype=input_dtype, name="data1")
    data2 = tvm.placeholder(shape2, dtype=input_dtype, name="data2")
    result = log_softmax_grad_compute(data1, data2, output_z, axis,
                                      kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(result)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data1, data2, result]
    }
    te.lang.cce.cce_build_code(sch, config)
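
The interface above does not spell out the backward formula. Assuming input_x holds the original forward input (an assumption, not stated in the example), the gradient of log_softmax can be sketched in NumPy as:

import numpy as np

def log_softmax_grad_reference(dy, x, axis=-1):
    # dx_i = dy_i - softmax(x)_i * sum(dy) along the reduced axis
    e = np.exp(x - np.max(x, axis=axis, keepdims=True))
    softmax = e / e.sum(axis=axis, keepdims=True)
    return dy - softmax * dy.sum(axis=axis, keepdims=True)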
Example #13
def custom_Reduction(shape,
                     dtype,
                     axis,
                     op,
                     coeff,
                     kernel_name="cce_reductionLayer",
                     need_build=False,
                     need_print=False):
    """
    Reduce a tensor on a certain axis, and scale output with coeff

    Parameters
    ----------
    shape : shape of data

    dtype : source data type, only support float16, float32, int8, uint8

    axis : the first axis to reduce, may be negative to index from the end
           (e.g., -1 for the last axis).
           If axis == 0, the output Blob always has the empty shape (count 1),
           performing reduction across the entire input.

    op : can only be one of "SUM, ASUM (sum of abs), SUMSQ (sum of sqr), MEAN"

    coeff : scale for output

    kernel_name : cce kernel name, default value is "cce_reductionLayer"

    need_build : whether to build the CCEC kernel, default value is False

    need_print : whether to print the IR, default value is False

    Returns
    -------
    None

    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)

    check_list = ["float16", "float32", "int8", "uint8"]
    if not dtype.lower() in check_list:
        raise RuntimeError(
            "reductionLayer_cce only support %s while dtype is %s" %
            (",".join(check_list), dtype))

    reduction_op = ("SUM", "ASUM", "SUMSQ", "MEAN")

    if not isinstance(axis, int):
        raise RuntimeError("type of axis value should be int")
    if op not in reduction_op:
        raise RuntimeError("op can only be one of SUM, ASUM, SUMSQ , MEAN")
    if not isinstance(coeff, int) and not isinstance(coeff, float):
        raise RuntimeError("coeff must be a value")
    axis_origin = axis
    shape_origin = shape
    axis = util.axis_check(len(shape), axis)
    util.check_reduce_shape_rule(shape)
    shape = list(shape)
    shape1 = shape[:axis] + [
        functools_reduce(lambda x, y: x * y, shape[axis:])
    ]
    shape1, axis = util.shape_refine(shape1, axis)
    if not axis:
        axis = [0]
        shape1 = [1] + shape1
    inp_dtype = dtype.lower()
    data = tvm.placeholder(shape1, name="data_input", dtype=inp_dtype)
    with tvm.target.cce():
        res = caffe_reduction_layer_compute([data], shape_origin, dtype,
                                            axis_origin, op, coeff,
                                            kernel_name, need_build,
                                            need_print)

    if op == "MEAN" and (inp_dtype == "int8" or inp_dtype == "uint8"):
        util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
        res = te.lang.cce.cast_to(res, inp_dtype)
        schedule = tvm.create_schedule(res.op)
        if need_print:
            with build_config:
                print(tvm.lower(schedule, [data, res], simple_mode=True))
        if need_build:
            with build_config:
                tvm.build(schedule, [data, res], "cce", name=kernel_name)
    else:
        with tvm.target.cce():
            sch = generic.auto_schedule(res)

        config = {
            "print_ir": need_print,
            "need_build": need_build,
            "name": kernel_name,
            "tensor_list": [data, res]
        }
        te.lang.cce.cce_build_code(sch, config)
Example #14
def reduce_mean_d_compute(x,
                          y,
                          axes,
                          keepdims,
                          kernel_name="reduce_mean_d",
                          impl_mode="high_performance",
                          is_5hdc=False):
    """reduce_mean_d compute

    Parameters:
    ----------
    x: TVM tensor
        input tensor.
    y: dict
        the dict of output tensor.
    axes: int, list, tuple or NoneType
        the axes for reduce.
    keepdims: bool or NoneType
        if true, retains reduced dimensions with length 1.
    kernel_name: str
        cce kernel name, default value is "reduce_mean_d".

    Returns
    -------
    res: TVM tensor
        output tensor, has the same shape and type as input tensor.
    """
    shape = te.lang.cce.util.shape_to_list(x.shape)

    shape_len = len(shape)
    if not axes:
        axes = range(shape_len)
    if hasattr(axes, 'index'):
        axes = list(axes)
    axes = util.axis_check(shape_len, axes)

    reduce_elts = 1.0
    if isinstance(axes, Iterable):
        for i in axes:
            reduce_elts *= shape[i]
    else:
        reduce_elts = shape[axes]
    cof = reduce_elts**(-1)

    if ori_format[0] == 'NHWC' and ori_format[1] == 'NC1HWC0' and len(axes) == 2 \
            and axes == [1, 4] and len(ori_shape[0]) == 4:
        cof = ori_shape[0][-1]**(-1)

    dtype = x.dtype
    data_input_tmp = x

    has_improve_precision = False
    cce_product = tbe_platform.cce_conf.get_soc_spec("SOC_VERSION")

    if cce_product not in ("Ascend310",) and dtype == "float16" and \
            tbe_platform.cce_conf.api_check_support(
                "te.lang.cce.sum", "float32") and not is_5hdc:
        data_input_tmp = te.lang.cce.cast_to(data_input_tmp, "float32")
        has_improve_precision = True
    elif cce_product in ("Ascend310",) and dtype == "float16" \
            and tbe_platform.cce_conf.api_check_support("te.lang.cce.sum",
                                                        "float32") \
            and not is_5hdc and impl_mode != "high_performance":
        data_input_tmp = te.lang.cce.cast_to(data_input_tmp, "float32")
        has_improve_precision = True

    data_input_tmp = te.lang.cce.vmuls(data_input_tmp, cof)
    res = te.lang.cce.sum(data_input_tmp, axis=axes, keepdims=keepdims)

    if has_improve_precision:
        res = te.lang.cce.cast_to(res, dtype)

    return res
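
A quick NumPy check (illustrative only) of the scale-then-sum trick used above: multiplying by cof = 1 / (number of reduced elements) before summing yields the mean.

import numpy as np

x = np.random.rand(2, 3, 4).astype("float32")
axes = (1, 2)
cof = 1.0 / (x.shape[1] * x.shape[2])
assert np.allclose((x * cof).sum(axis=axes), x.mean(axis=axes))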
Example #15
def reduce_sum(x, axes, y, keepdims=False, kernel_name="reduce_sum"):
    """reduce a tensor on a certain axes based on sum.

    Parameters:
    ----------
    x: dict
        the dict of input tensor.
    axes: dict
        the axes for reduce.
    y: dict
        the dict of output tensor.
    keepdims: bool or NONETYPE
        if true, retains reduced dimensions with length 1.
    kernel_name: str
        cce kernel name, default value is "reduce_sum".

    Returns
    -------
    None
    """

    dtype_x = x["dtype"]
    dtype_lower_x = dtype_x.lower()
    check_list_x = ("float16", "float32")
    check_dtype(dtype_lower_x, check_list_x, param_name="x")

    dtype_axes = axes["dtype"]
    dtype_lower_axes = dtype_axes.lower()
    check_list_axes = ("int32", "int64")
    check_dtype(dtype_lower_axes, check_list_axes, param_name="axes")
    input_shape = x.get("shape")

    if not _check_data_shape_const(input_shape):
        schedules = []
        ins = classify([x, axes], Mode.REDUCE)
        tensors = []
        shape_axes = variable_shape([axes])[0]
        data_input_axes = tvm.placeholder(shape_axes,
                                          name="data_input_axes",
                                          dtype=dtype_lower_axes)

        for (x, axes) in ins:
            with te.op.compute():
                shape_x = variable_shape([x])[0]
                data_input_x = tvm.placeholder(shape_x,
                                               name="data_input_x",
                                               dtype=dtype_lower_x)
                shape_len = len(shape_x)
                axes_d = cce_util.axis_check(shape_len, axes)
                res = reduce_sum_compute(data_input_x, axes_d, y, keepdims)

                tensors.append([data_input_x, data_input_axes, res])

            with tvm.target.cce():
                schedule = generic.auto_schedule(res)
            schedules.append(schedule)

        # build
        config = {"name": kernel_name, "tensor_list": tensors}
        te.lang.dynamic.build(schedules, config)
        add_compile_info("reduce_axis_unknown", 1)

    else:
        _reduce_sum_const(x, axes, keepdims, kernel_name)
Example #16
def op_select_format(input_x, output_y, axis, kernel_name="reverse_v2_d"):
    """
    select format for op
    """
    input_ori_shape = input_x.get("ori_shape")
    input_ori_format = input_x.get("ori_format")

    axis = list(set(axis))
    axis = util.axis_check(len(input_ori_shape), axis)

    is_support_5hd = True

    if input_ori_format != "NCHW":
        is_support_5hd = False

    if (input_ori_format == "NCHW" and (1 in axis)) \
            or (input_ori_format == "NHWC" and (3 in axis)):
        is_support_5hd = False

    if (input_ori_format == "NCHW") and len(input_ori_shape) > 1 \
            and (input_ori_shape[1] % 16 != 0):
        is_support_5hd = False

    cce_product = cce.cce_conf.get_soc_spec("SOC_VERSION")
    if cce_product in ("Hi3796CV300ES", "Hi3796CV300CS"):
        dtype_base = [
            "float16", "int8", "int16", "int32", "int64", "uint8", "uint16",
            "uint32", "uint64"
        ]
        dtype_5hd = [
            "float16", "int8", "int16", "int32", "int64", "uint8", "uint16",
            "uint32", "uint64"
        ]
    else:
        dtype_base = [
            "float16", "float", "int8", "int16", "int32", "int64", "uint8",
            "uint16", "uint32", "uint64"
        ]
        dtype_5hd = [
            "float16", "float", "int8", "int16", "int32", "int64", "uint8",
            "uint16", "uint32", "uint64"
        ]

    format_base = ["ND"] * len(dtype_base)

    if is_support_5hd:
        dtype_base = dtype_base + dtype_5hd
        format_base = format_base + ["NC1HWC0"] * len(dtype_5hd)

    dtype_str = ','.join(dtype_base)
    format_str = ','.join(format_base)

    input0 = gen_param(classify="input0",
                       name="x",
                       datatype=dtype_str,
                       format=format_str)
    output0 = gen_param(classify="output0",
                        name="y",
                        datatype=dtype_str,
                        format=format_str)
    param_list = [input0, output0]
    param_dynamic_in_json = get_dynamic_param_in_json(param_list)

    return param_dynamic_in_json
Example #17
def layer_norm(input_x, input_gamma, input_beta,
               output_y, output_mean, output_variance,
               begin_norm_axis, begin_params_axis,
               epsilon=1e-12, kernel_name="layer_norm",
               impl_mode="high_performance"):
    """
    layernorm operator interface implementation
    calculating: x, gamma, beta
        mean  = np.mean(x, reduce_axis, keepdims=True)
        variance = np.mean(np.power((x - mean),2), reduce_axis, keepdims=True)
        result = gamma*((x - mean) / np.sqrt(variance + epsilon)) + beta

    Parameters
    ----------
    input_x : dict
        shape and dtype of input x, only support float16, float32
    input_gamma: dict
        shape and dtype of input gamma, only support float16, float32
    input_beta: dict
        shape and dtype of input beta, only support float16, float32
    output_y: dict
        shape and dtype of output, only support float16, float32
    begin_norm_axis: int
      The first normalization dimension: normalization will be
      performed along dimensions `begin_norm_axis : rank(inputs)`
    begin_params_axis: int
      The first parameter (beta, gamma) dimension: scale
      and centering parameters will have dimensions
      `begin_params_axis : rank(inputs)` and will be broadcast with the
      normalized inputs accordingly.
    epsilon: float,
      Minimum positive number greater than 0
    kernel_name: str
        cce kernel name, default value is "layernorm"

    Returns
    -------
    None
    """


    shape_x = list(input_x.get("shape"))
    input_gamma_shape = input_gamma.get("shape")
    input_beta_shape = input_beta.get("shape")
    ori_shape_x = list(input_x.get("ori_shape"))
    input_format = input_x.get("format").upper()
    input_gamma_format = input_gamma.get("format").upper()
    input_beta_format = input_beta.get("format").upper()

    check_shape(input_gamma_shape, param_name="input_gamma")
    check_shape(input_beta_shape, param_name="input_beta")
    check_shape(shape_x, param_name="input_x")

    check_list = ("float16", "float32")
    dtype = input_x.get("dtype").lower()
    dtype_gamma = input_gamma.get("dtype").lower()
    dtype_beta = input_gamma.get("dtype").lower()
    check_dtype(dtype, check_list, param_name="input_x")
    check_dtype(dtype_gamma, check_list, param_name="input_gamma")
    check_dtype(dtype_beta, check_list, param_name="input_gamma")

    shape_gamma = list(input_gamma.get("shape"))
    shape_beta = list(input_beta.get("shape"))

    if input_format == "FRACTAL_NZ":
        begin_norm_axis = util.axis_check(len(ori_shape_x), begin_norm_axis)
        begin_params_axis = util.axis_check(len(ori_shape_x), begin_params_axis)

        if input_gamma_format == "FRACTAL_NZ" or \
                input_beta_format == "FRACTAL_NZ":
            raise RuntimeError("gamma and beta not support Nz in bert")
        if shape_gamma != shape_beta:
            raise RuntimeError("gamma and beta's must be same.")
        if ori_shape_x[begin_params_axis:] != shape_gamma:
            raise RuntimeError("x or gamma or begin_params_axis is wrong.")
        if len(shape_gamma) > 1:
            raise RuntimeError("shape of gamma or beta only support 1D in bert")

        # make shape_x,shape_gamma,shape_beta dim same
        if begin_params_axis != 0:
            for i in range(begin_params_axis):
                shape_gamma.insert(i, 1)
        shape_gamma[-2] = shape_x[-4]
        shape_gamma[-1] = 1
        shape_gamma.append(1)
        shape_gamma.append(shape_x[-1])
        if begin_params_axis > len(ori_shape_x) - 2:
            shape_x[-3:] = [shape_x[-3]*shape_x[-2], shape_x[-1]]
            shape_gamma[-3:] = [shape_gamma[-3]*shape_gamma[-2], shape_gamma[-1]]
        shape_beta = shape_gamma
    else:
        begin_norm_axis = util.axis_check(len(shape_x), begin_norm_axis)
        begin_params_axis = util.axis_check(len(shape_x), begin_params_axis)

        if shape_gamma != shape_beta:
            raise RuntimeError("gamma and beta's must be same.")
        no_need_fix_gamma = False
        no_need_fix_beta = False
        if shape_x[begin_params_axis:] != shape_gamma:
            if len(shape_x) == len(shape_gamma):
                no_need_fix_gamma = True
            else:
                raise RuntimeError("x or gamma or begin_params_axis is wrong.")
        if shape_x[begin_params_axis:] != shape_beta:
            if len(shape_x) == len(shape_beta):
                no_need_fix_beta = True
            else:
                raise RuntimeError("x or beta or begin_params_axis is wrong.")
        # make shape_x,shape_gamma,shape_beta dim same
        if begin_params_axis != 0 and not no_need_fix_gamma:
            for i in range(begin_params_axis):
                shape_gamma.insert(i, 1)
        if begin_params_axis != 0 and not no_need_fix_beta:
            for i in range(begin_params_axis):
                shape_beta.insert(i, 1)

    data_x = tvm.placeholder(shape_x, name="x", dtype=dtype)
    data_gamma = tvm.placeholder(shape_gamma, name="gamma", dtype=dtype)
    data_beta = tvm.placeholder(shape_beta, name="beta", dtype=dtype)

    if input_format == "FRACTAL_NZ":

        mean, variance, res = \
            layer_norm_compute_nz(data_x, data_gamma, data_beta,
                                  output_y, output_mean, output_variance,
                                  begin_norm_axis, begin_params_axis,
                                  ori_shape_x, epsilon, kernel_name, impl_mode)
    else:

        mean, variance, res = \
            layer_norm_compute(data_x, data_gamma, data_beta,
                               output_y, output_mean,
                               output_variance,
                               begin_norm_axis, begin_params_axis,
                               epsilon, kernel_name, impl_mode)

    with tvm.target.cce():
        sch = generic.auto_schedule([res, mean, variance])

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data_x, data_gamma,
                              data_beta, res, mean, variance]}

    te.lang.cce.cce_build_code(sch, config)
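
The formulas in the docstring above correspond to the following NumPy reference (a sketch under the assumption that gamma and beta broadcast against x; not the TE implementation):

import numpy as np

def layer_norm_reference(x, gamma, beta, begin_norm_axis, epsilon=1e-12):
    reduce_axis = tuple(range(begin_norm_axis, x.ndim))
    mean = x.mean(axis=reduce_axis, keepdims=True)
    variance = np.mean((x - mean) ** 2, axis=reduce_axis, keepdims=True)
    y = gamma * ((x - mean) / np.sqrt(variance + epsilon)) + beta
    return y, mean, variance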
Example #18
def reduce_max_d(x, y, axis, keepdims=False, kernel_name="reduce_max_d"):
    """
    Reduce a tensor on certain axes based on max.

    Parameters
    ----------
    x : dict
        shape and dtype of input
    y : dict
        shape and dtype of output, should be same shape and type as input
    axis: list
        the axis to reduce, may be negative to index from the end
        (e.g., -1 for the last axis).
        axis may be int or list (e.g. [1,2])
    keepdims: bool
        if true, retains reduced dimensions with length 1,
        default value is False
    kernel_name : str
        kernel name, default value is "reduce_max_d"

    Returns
    -------
    None
    """
    shape = x.get("shape")
    dtype = x.get("dtype")
    input_dtype = dtype.lower()

    check_shape(shape, param_name="x")

    check_list = ["float16", "float32", "int8", "uint8", "int32"]
    check_dtype(input_dtype, check_list, param_name="x")

    shape_len = len(shape)

    if not axis:
        axis = range(shape_len)

    if hasattr(axis, 'index'):
        axis = list(axis)

    axis = util.axis_check(shape_len, axis)

    # Shape should not be modified in 5HD mode
    # 5HD Special param for 5hd schedule
    is_5hdc = util.check_and_init_5hdc_reduce_support(x, axis)
    if not is_5hdc:
        shape, axis = util.shape_refine(list(shape), axis)
        shape, axis = util.simplify_axis_shape(shape, axis)
    shape_len = len(shape)
    x["shape"] = shape
    if input_dtype in ("float32", "int32") and len(axis) == 1 \
            and ((axis[0] == (shape_len - 1)) or (axis[0] == -1)):
        reduce_max_d_tik(x, y, axis[0], kernel_name)
    else:
        data_input = tvm.placeholder(shape,
                                     name="data_input_" + kernel_name,
                                     dtype=input_dtype)
        res = reduce_max_d_compute(data_input, y, axis, keepdims, kernel_name)

        if is_5hdc:
            res.ori_shape = x["ori_shape"]
            res.ori_format = x["ori_format"]
        with tvm.target.cce():
            sch = generic.auto_schedule(res)

        config = {"name": kernel_name, "tensor_list": [data_input, res]}

        te.lang.cce.cce_build_code(sch, config)
Example #19
def log_softmax_v2(input_x,
                   output_y,
                   axis=-1,
                   kernel_name="log_softmax_v2",
                   impl_mode="high_performance"):
    """
    algorithm: log_softmax
    calculating data's log_softmax, x - log(sum(exp(x)))

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32
    output_y: dict
        shape and dtype of output, should be same shape and type as input
    axis: int, list or tuple
        the data's axis, range is [-d, d-1]
    kernel_name : str
        cce kernel name, default value is log_softmax_v2

    Returns
    -------
    None
    """
    check_list = ("float16", "float32")
    shape = input_x.get("shape")
    input_dtype = input_x.get("dtype").lower()
    shape_len = len(shape)
    shape_list = list(shape)

    if not isinstance(axis, int):
        axis = list(axis)

    check_shape(shape, param_name="input_x")
    check_dtype(input_dtype, check_list, param_name="input_x")

    axis = util.axis_check(shape_len, axis)

    if not isinstance(axis, int):
        for i in axis:
            if shape_list[i] == 1:
                raise RuntimeError("Cannot reduce on an axis with dimension 1")
    else:
        if shape_list[axis] == 1:
            raise RuntimeError("Cannot reduce on an axis with dimension 1")

    shape, axis = util.shape_refine(list(shape), axis)
    shape, axis = util.simplify_axis_shape(shape, axis)

    data_input = tvm.placeholder(shape, name="data_input", dtype=input_dtype)
    result = log_softmax_v2_compute(data_input,
                                    output_y,
                                    axis=axis,
                                    kernel_name=kernel_name,
                                    impl_mode=impl_mode)

    with tvm.target.cce():
        sch = generic.auto_schedule(result)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data_input, result]
    }

    te.lang.cce.cce_build_code(sch, config)
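
For reference, x - log(sum(exp(x))) along `axis` can be written in NumPy as below (an illustrative sketch; the max subtraction is the usual numerical-stability trick and is not claimed to match the kernel internals):

import numpy as np

def log_softmax_reference(x, axis=-1):
    x_max = np.max(x, axis=axis, keepdims=True)
    shifted = x - x_max
    return shifted - np.log(np.sum(np.exp(shifted), axis=axis, keepdims=True))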
Example #20
def reduce_all_d(input_data,
                 output_data,
                 axes,
                 keepdims=None,
                 kernel_name="reduce_all_d"):
    """
    Reduce a tensor on certain axes based on min

    Parameters:
    ----------
    input_data: dict
        shape and dtype of input_data, only support int8
    output_data: dict
        source data type, only support int8
    axes : int, list, tuple or None.
        the axes to reduce, may be negative to index from the end
        (e.g., -1 for the last axis).
        axes may be int or list (e.g. [1,2])
    keepdims : bool or None .
        if true, retains reduced dimensions with length 1,
        default value is None
    kernel_name : str
        cce kernel name, default value is "cce_all"

    Returns
    -------
    None
    """
    input_shape = input_data.get("shape")
    input_dtype = input_data.get("dtype").lower()
    if input_dtype == "bool":
        input_dtype = "int8"
    check_shape(input_shape, param_name="input_data")
    check_dtype(input_dtype, ("int8"), param_name="input_data")

    shape_len = len(input_shape)
    if not axes:
        axes = range(shape_len)

    if hasattr(axes, 'index'):
        axes = list(axes)
    axes = util.axis_check(shape_len, axes)

    if not isinstance(axes, int):
        for i in axes:
            if i >= len(input_shape):
                raise RuntimeError("axes should be less than dimension")
    else:
        if axes >= len(input_shape):
            raise RuntimeError("axes should be less than dimension")

    # 5HD Special param for 5hd schedule
    is_5hdc = util.check_and_init_5hdc_reduce_support(input_data, axes)
    if not is_5hdc:
        input_shape, axes = util.shape_refine(list(input_shape), axes)
        input_shape, axes = util.simplify_axis_shape(input_shape, axes)

    data_input = tvm.placeholder(input_shape,
                                 name="data_input_" + kernel_name,
                                 dtype=input_dtype)
    result = reduce_all_d_compute(data_input, output_data, axes, keepdims,
                                  kernel_name)
    if is_5hdc:
        result.ori_shape = input_data["ori_shape"]
        result.ori_format = input_data["ori_format"]

    with tvm.target.cce():
        sch = generic.auto_schedule(result)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data_input, result]
    }
    te.lang.cce.cce_build_code(sch, config)
Example #21
def reduce_any_d(x, y, axes, keepdims=None, kernel_name="reduce_any_d"):
    """
    Reduce a tensor on certain axes based on max

    Parameters:
    ----------
    x : shape and dtype of input_data, only support int8

    y : shape and dtype of output_res, reserved parameter, not used now

    axes : the axes to reduce, may be negative to index from the end
           (e.g., -1 for the last axis).
           axes may be int or list (e.g. [1,2])

    keepdims : if true, retains reduced dimensions with length 1,
               default value is None

    kernel_name : cce kernel name, default value is "reduce_any_d"

    Returns
    -------
    None
    """
    shape = x.get("shape")
    dtype = x.get("dtype")

    check_shape(shape, param_name="x")

    if dtype == "bool":
        dtype = "int8"
    check_list = ("int8", )
    check_dtype(dtype, check_list, param_name="x")

    shape_len = len(shape)
    if not axes:
        axes = range(shape_len)

    if hasattr(axes, 'index'):
        axes = list(axes)

    axes = util.axis_check(shape_len, axes)

    is_5hdc = util.check_and_init_5hdc_reduce_support(x, axes)
    if not is_5hdc:
        shape, axes = util.shape_refine(list(shape), axes)
        shape, axes = util.simplify_axis_shape(shape, axes)

    inp_dtype = dtype.lower()
    data_input = tvm.placeholder(shape,
                                 name="data_input_" + kernel_name,
                                 dtype=inp_dtype)
    res = reduce_any_d_compute(data_input, y, axes, keepdims, kernel_name)

    if is_5hdc:
        res.ori_shape = x["ori_shape"]
        res.ori_format = x["ori_format"]

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_input, res]}
    te.lang.cce.cce_build_code(sch, config)
Example #22
def reduce_min_d(input_min, output_min, axis,
                 keep_dims=None, kernel_name="reduce_min_d"):
    """
    Reduce a tensor on a certain axis based on min

    Parameters:
    ----------
    input_min: dict
        dict of input, which contains shape and dtype
    output_min: dict
        dict of output, which contains shape and dtype
    axis: int or None
        The dimensions to reduce. If None (the default), reduces all dimensions.
        Must be in the range [-rank(input_tensor), rank(input_tensor))
    keep_dims: True or False
        if true, retains reduced dimensions with length 1,
        default value is None
    kernel_name: str
        cce kernel name, default value is "reduce_min_d"

    Returns
    -------
    None
    """
    shape_input = input_min.get("shape")
    dtype_input = input_min.get("dtype")
    check_shape(shape_input, param_name="input_min")

    check_list = ("float16", "float32", "int8", "uint8")
    check_dtype(dtype_input.lower(), check_list, param_name="input_min")

    shape_len = len(shape_input)

    if not axis:
        axis = range(shape_len)

    if hasattr(axis, 'index'):
        axis = list(axis)

    axis = util.axis_check(shape_len, axis)

    is_5hdc = util.check_and_init_5hdc_reduce_support(input_min, axis)
    if not is_5hdc:
        shape_input, axis = util.shape_refine(list(shape_input), axis)
        shape_input, axis = util.simplify_axis_shape(shape_input, axis)

    data_input = tvm.placeholder(shape_input, name="data_input_" + kernel_name,
                                 dtype=dtype_input.lower())
    shape_len = len(shape_input)
    if dtype_input.lower() in ("float32", "int32") and len(axis) == 1 \
            and ((axis[0] == (shape_len - 1)) or (axis[0] == -1)):
        input_min["shape"] = tuple(shape_input)
        reduce_min_d_tik.reduce_min_d_tik(input_min, output_min, -1,
                                          kernel_name)
    else:
        res = reduce_min_d_compute(data_input, output_min, axis, keep_dims,
                                   kernel_name)
        if is_5hdc:
            res.ori_shape = input_min["ori_shape"]
            res.ori_format = input_min["ori_format"]
        with tvm.target.cce():
            sch = generic.auto_schedule(res)

        config = {"name": kernel_name,
                  "tensor_list": [data_input, res]}
        te.lang.cce.cce_build_code(sch, config)
Example #23
def op_select_format(input_x,
                     output_y,
                     tiles,
                     axis=1,
                     kernel_name="tile_with_axis"):
    """
    select format dynamically
    """
    ori_format = input_x.get("ori_format")
    ori_shape = input_x.get("ori_shape")

    if ori_shape is not None:
        axis = util.axis_check(len(ori_shape), axis)

    cce_product = tbe_platform.cce_conf.get_soc_spec("SOC_VERSION")

    # for 5hd, axis is only valid for n,h,w
    if ((ori_format == "NHWC" and axis != 3) or (ori_format == "NCHW" and axis != 1)) and \
            len(ori_shape) == 4:
        # NC1HWC0+ND
        if cce_product in ("Hi3796CV300ES", "Hi3796CV300CS"):
            # fp16
            input0 = gen_param(
                classify="input0",
                name="x",
                datatype=
                "float16,int8,int16,int32,int64,uint8,uint16,uint32,uint64,"
                "float16,int8,int16,int32,int64,uint8,uint16,uint32,uint64",
                format="ND,ND,ND,ND,ND,ND,ND,ND,ND,"
                "NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0"
            )
            output0 = gen_param(
                classify="output0",
                name="y",
                datatype=
                "float16,int8,int16,int32,int64,uint8,uint16,uint32,uint64,"
                "float16,int8,int16,int32,int64,uint8,uint16,uint32,uint64",
                format="ND,ND,ND,ND,ND,ND,ND,ND,ND,"
                "NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0"
            )
        else:
            # fp16/fp32
            input0 = gen_param(
                classify="input0",
                name="x",
                datatype=
                "float16,float32,int8,int16,int32,int64,uint8,uint16,uint32,uint64,"
                "float16,float32,int8,int16,int32,int64,uint8,uint16,uint32,uint64",
                format="ND,ND,ND,ND,ND,ND,ND,ND,ND,ND,NC1HWC0,"
                "NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0"
            )
            output0 = gen_param(
                classify="output0",
                name="y",
                datatype=
                "float16,float32,int8,int16,int32,int64,uint8,uint16,uint32,uint64,"
                "float16,float32,int8,int16,int32,int64,uint8,uint16,uint32,uint64",
                format="ND,ND,ND,ND,ND,ND,ND,ND,ND,ND,NC1HWC0,"
                "NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0,NC1HWC0"
            )
    else:
        # ND
        if cce_product in ("Hi3796CV300ES", "Hi3796CV300CS"):
            # fp16
            input0 = gen_param(
                classify="input0",
                name="x",
                datatype=
                "float16,int8,int16,int32,int64,uint8,uint16,uint32,uint64",
                format="ND,ND,ND,ND,ND,ND,ND,ND,ND")
            output0 = gen_param(
                classify="output0",
                name="y",
                datatype=
                "float16,int8,int16,int32,int64,uint8,uint16,uint32,uint64",
                format="ND,ND,ND,ND,ND,ND,ND,ND,ND")
        else:
            # fp16/fp32
            input0 = gen_param(
                classify="input0",
                name="x",
                datatype=
                "float16,float32,int8,int16,int32,int64,uint8,uint16,uint32,uint64",
                format="ND,ND,ND,ND,ND,ND,ND,ND,ND,ND")
            output0 = gen_param(
                classify="output0",
                name="y",
                datatype=
                "float16,float32,int8,int16,int32,int64,uint8,uint16,uint32,uint64",
                format="ND,ND,ND,ND,ND,ND,ND,ND,ND,ND")

    param_list = [input0, output0]
    param_dynamic_in_json = get_dynamic_param_in_json(param_list)
    return param_dynamic_in_json
Example #24
def split_v_d(input_value,
              output_data,
              size_splits,
              split_dim,
              num_split,
              kernel_name="split_v_d"):
    """Split a tensor into len(size_splits) tensors along one dimension.

    Parameters
    ----------
    input_value: dict
        the dict of input tensor.
    output_data: list or tuple
        the list of output tensor.
    size_splits: list or tuple
        a Python list containing the sizes of each output tensor
        along `split_dim`.
    split_dim: int
        the dimension along which to split_d.
    num_split: int
        used to specify the number of outputs.
    kernel_name: str
        cce kernel name, default value is "split_v_d".

    Returns
    -------
    None.
    """
    input_format = input_value.get("format")
    ori_format = input_value.get("ori_format")
    if input_format == "NC1HWC0":
        split_dim = util.axis_transfrom_5d(split_dim, ori_format)
        split_with_5hd_not_align = \
            SplitWith5HD(input_value, output_data,
                         split_dim, num_split, kernel_name)
        if split_with_5hd_not_align.check_5hd_vnchw():
            split_with_5hd_not_align.do_5hd_split_cut_by_batch()
            return
        if split_dim == 1:
            size_splits = list(size_splits)
            size_splits = [size // 16 for size in size_splits]

    shape = input_value.get("shape")
    dtype = input_value.get("dtype")
    dtype_lower = dtype.lower()
    check_list = ("int8", "int16", "int32", "int64", "uint8", "uint16",
                  "uint32", "uint64", "float16", "float32")

    check_shape(shape, param_name="input_value")
    check_dtype(dtype_lower, check_list, param_name="input_value")

    shape_len = len(shape)
    split_dim = util.axis_check(shape_len, split_dim)

    dim = shape[split_dim]
    if len(size_splits) + 1 == num_split or len(size_splits) == 0:
        split_list = []
        split_sum = 0
        if len(size_splits) != 0:
            for i, _ in enumerate(size_splits):
                split_list.append(size_splits[i])
                split_sum = split_sum + size_splits[i]
            if dim - split_sum > 0:
                split_list.append(dim - split_sum)
        else:
            batch = dim / num_split
            for i in range(0, num_split):
                split_list.append(int(batch))
        size_splits = split_list

    size_splits = list(size_splits)
    size_splits_sum = 0
    for size in size_splits:
        if size != -1:
            size_splits_sum += size
    if dim != size_splits_sum:
        for i, size in enumerate(size_splits):
            if size == -1:
                size_splits[i] = dim - size_splits_sum

    size_sum = 0
    for size in size_splits:
        if size < 1:
            raise RuntimeError(
                "The size (%d) of size_splits must be greater or equal to %d" %
                (size, 1))
        size_sum = size_sum + size
    if size_sum != shape[split_dim]:
        raise RuntimeError(
            "The sum size (%d) of size_splits must be equal to the length of "
            "split_dim (%d)" % (size_sum, shape[split_dim]))
    if len(size_splits) != num_split:
        raise RuntimeError(
            "The length (%d) of size_splits must be equal to num_split(%d)" %
            (len(size_splits), num_split))

    if num_split == 1:
        copy_only(input_value, input_value, kernel_name)
        return

    split_mov = SplitMov(shape, dtype_lower, split_dim, num_split, size_splits,
                         kernel_name)
    new_shape = split_mov.input_shape
    new_split_dim = split_mov.split_dim
    new_size_splits = split_mov.size_splits
    new_output_shapes = split_mov.output_shapes
    input_size = functools_reduce(lambda x, y: x * y, new_shape)
    last_dim_same = True
    input_last_dim = new_output_shapes[0][-1]
    for i, _ in enumerate(new_output_shapes):
        if input_last_dim != new_output_shapes[i][-1]:
            last_dim_same = False
            break

    if dtype_lower == "float16" and new_split_dim == len(new_shape) - 1 and \
            last_dim_same and new_size_splits[0] == 1 and num_split <= 16 \
            and input_size >= TRANSPOSE_SIZE * num_split:
        split_vnc = SplitLastDimVnv(new_shape, dtype_lower, new_output_shapes,
                                    new_split_dim, num_split, kernel_name)
        split_vnc.split_last_dim_vnc_compute()
        return

    if check_use_last_dim_branch(new_shape, dtype_lower, new_split_dim,
                                 num_split, new_size_splits):
        split_last_dim(new_shape, dtype_lower, new_split_dim, num_split,
                       new_size_splits, kernel_name)
        return

    if split_mov.check_whether_use_split_mov():
        split_mov.split_mov_compute()
        return

    data = tvm.placeholder(shape, name="data", dtype=dtype_lower)
    output_shape_list, output_tensor_list = split_v_d_compute(
        data, output_data, size_splits, split_dim, num_split,
        kernel_name)

    sch, build_list = te.lang.cce.split_schedule_com(data, split_dim,
                                                     output_shape_list,
                                                     output_tensor_list)

    with build_config:
        tvm.build(sch, build_list, "cce", name=kernel_name)
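
# A minimal usage sketch for this op (the enclosing function is presumably
# split_v_d, judging by split_v_d_compute above; the metadata dicts, shapes
# and keyword names below are illustrative assumptions taken from the names
# used in the body, not values from the source):
# split_v_d(input_value={"shape": (4, 10), "dtype": "float16",
#                        "format": "ND", "ori_format": "ND"},
#           output_data=[{"shape": (4, 3)}, {"shape": (4, 7)}],
#           size_splits=[3, -1], split_dim=1, num_split=2,
#           kernel_name="split_v_d_demo")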
Example #25
0
def split_d(input_value,
            output_data,
            split_dim,
            num_split,
            kernel_name="split_d"):
    """Split a tensor into `num_split` tensors along one dimension.

    Parameters
    ----------
    input_value: dict
        the dict of input tensor.
    output_data: list or tuple
        the list of output tensor.
    split_dim: int
        the dimension along which to split.
    num_split: int
        the number of output tensors to split into along `split_dim`.
    kernel_name: str
        cce kernel name, default value is "split_d".

    Returns
    -------
    None.
    """
    input_format = input_value.get("format")
    ori_format = input_value.get("ori_format")
    if input_format == "NC1HWC0":
        split_dim = util.axis_transfrom_5d(split_dim, ori_format)

    shape = input_value.get("shape")
    dtype = input_value.get("dtype")
    dtype_lower = dtype.lower()
    check_list = ("int8", "int16", "int32", "int64", "uint8", "uint16",
                  "uint32", "uint64", "float16", "float32")

    check_shape(shape, param_name="input_value")
    check_dtype(dtype_lower, check_list, param_name="input_value")

    shape_len = len(shape)
    split_dim = util.axis_check(shape_len, split_dim)

    if num_split < 1:
        raise RuntimeError(
            "The num_split (%d) must be greater or equal to %d" %
            (num_split, 1))

    split_with_5hd_not_align = \
        SplitWith5HD(input_value, output_data,
                     split_dim, num_split, kernel_name)
    if split_with_5hd_not_align.check_5hd_vnchw():
        split_with_5hd_not_align.do_5hd_split_cut_by_batch()
        return

    if shape[split_dim] % num_split != 0:
        raise RuntimeError(
            "The length of split_dim (%d) must be divisible by "
            "num_split (%d)" % (shape[split_dim], num_split))

    if num_split == 1:
        copy_only(input_value, input_value, kernel_name)
        return

    split_mov = SplitMov(shape, dtype_lower, split_dim, num_split, None,
                         kernel_name)
    new_shape = split_mov.input_shape
    new_split_dim = split_mov.split_dim
    new_size_splits = split_mov.size_splits
    new_output_shapes = split_mov.output_shapes
    input_size = functools_reduce(lambda x, y: x * y, new_shape)

    if dtype_lower == "float16" and new_split_dim == len(new_shape) - 1 and \
            new_size_splits[0] == 1 and num_split <= 16 \
            and input_size >= TRANSPOSE_SIZE * num_split:
        split_vnc = SplitLastDimVnv(new_shape, dtype_lower, new_output_shapes,
                                    new_split_dim, num_split, kernel_name)
        split_vnc.split_last_dim_vnc_compute()
        return

    if check_use_last_dim_branch(new_shape, dtype_lower, new_split_dim,
                                 num_split, new_size_splits):
        split_last_dim(new_shape, dtype_lower, new_split_dim, num_split,
                       new_size_splits, kernel_name)
        return

    if split_mov.check_whether_use_split_mov():
        split_mov.split_mov_compute()
        return

    data = tvm.placeholder(shape, name="data", dtype=dtype_lower)
    output_shape_list, output_tensor_list = split_d_compute(
        data, output_data, split_dim, num_split, kernel_name)

    sch, build_list = te.lang.cce.split_schedule_com(data, split_dim,
                                                     output_shape_list,
                                                     output_tensor_list)

    with build_config:
        tvm.build(sch, build_list, "cce", name=kernel_name)
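
# A minimal usage sketch for split_d (the metadata dicts and shapes are
# illustrative assumptions, not values from the source):
# split_d(input_value={"shape": (6, 8), "dtype": "float16",
#                      "format": "ND", "ori_format": "ND"},
#         output_data=[{"shape": (6, 4)}, {"shape": (6, 4)}],
#         split_dim=1, num_split=2, kernel_name="split_d_demo")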
Example #26
0
def binary_cross_entropy_compute(x, y, weight, output,
                                 reduction, kernel_name):
    """
    calculating binary_cross_entropy

    Parameters
    ----------
    x : TVM tensor
        the output of the previous layer
    y : TVM tensor
        label
    weight :
        a manual rescaling weight given to the loss of each batch element.
        If given, has to be a Tensor of size nbatch
    output :
        loss result after compute
    reduction :
        reduce configuration parameter: mean/sum/none. Default: mean
    kernel_name : str
        kernel name, default value is "binary_cross_entropy"

    Returns
    -------
    result : TVM tensor
        output tensor
    """
    ori_dtype = x.dtype
    trans_dtype = ori_dtype
    shape = te.lang.cce.util.shape_to_list(x.shape)
    if ori_dtype == "float16" and tbe_platform.cce_conf.api_check_support(
            "te.lang.cce.vmul", "float32"):
        x = te.lang.cce.cast_to(x, "float32")
        y = te.lang.cce.cast_to(y, "float32")
        if weight is not None:
            weight = te.lang.cce.cast_to(weight, "float32")
        trans_dtype = "float32"

    const_one = tvm.const(1, trans_dtype)
    const_neg_one = tvm.const(-1, trans_dtype)
    # compute y * log(x)
    x = te.lang.cce.vmaxs(x, tvm.const(SCALAR_EPS, trans_dtype))
    x_log_tmp = te.lang.cce.vlog(x, priority_flag=1)
    data_mul1 = te.lang.cce.vmul(x_log_tmp, y)
    # compute (1 - y) * log(1 - x)
    x_neg_tmp = te.lang.cce.vmuls(x, const_neg_one)
    x1_tmp = te.lang.cce.vadds(x_neg_tmp, const_one)
    y_neg_tmp = te.lang.cce.vmuls(y, const_neg_one)
    y1_tmp = te.lang.cce.vadds(y_neg_tmp, const_one)
    x1_tmp = te.lang.cce.vmaxs(x1_tmp, tvm.const(SCALAR_EPS, trans_dtype))
    x1_log_tmp = te.lang.cce.vlog(x1_tmp, priority_flag=1)
    data_mul2 = te.lang.cce.vmul(x1_log_tmp, y1_tmp)
    # compute y * log(x) + (1 - y) * log(1 - x)
    data_sum = te.lang.cce.vadd(data_mul1, data_mul2)
    # compute -(y * log(x) + (1 - y) * log(1 - x))
    result = te.lang.cce.vmuls(data_sum, const_neg_one)
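    # Numeric check: for x = 0.8 and y = 1 the per-element loss is
    # -(1 * log(0.8) + 0 * log(0.2)) = -log(0.8), about 0.2231.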

    if weight is not None:
        result = te.lang.cce.vmul(result, weight)

    # total number of elements in the tensor
    reduce_elts = 1.0
    for i in shape:
        reduce_elts *= i
    cof = reduce_elts**(-1)

    # collect all axes for the reduction
    axis_d = []
    for i, _ in enumerate(shape):
        axis_d.append(i)
    axis_d = util.axis_check(len(shape), axis_d)

    if reduction == "mean":
        result = te.lang.cce.vmuls(result, cof)
        result = te.lang.cce.sum(result, axis=axis_d, keepdims=False)
    elif reduction == "sum":
        result = te.lang.cce.sum(result, axis=axis_d, keepdims=False)
    elif reduction == "none":
        pass
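    # For example, with a (2, 3) input cof = 1/6, so the "mean" result equals
    # the "sum" result divided by 6, while "none" keeps the per-element losses.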

    if ori_dtype == "float16":
        result = te.lang.cce.cast_to(result, "float16")

    return result
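
# A minimal sketch of wiring the compute with TVM placeholders (shapes and
# names are illustrative assumptions; scheduling and build steps are omitted):
# data_x = tvm.placeholder((16, 10), name="data_x", dtype="float16")
# data_y = tvm.placeholder((16, 10), name="data_y", dtype="float16")
# res = binary_cross_entropy_compute(data_x, data_y, None, None,
#                                    reduction="mean",
#                                    kernel_name="binary_cross_entropy")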