Example 1
def maximum_compute(input_x, input_y, output_z, kernel_name="maximum"):
    """
    calculating data maximum

    Parameters
    ----------
    input_x: TVM tensor
        the placeholder of first input data
    input_y: TVM tensor
        the placeholder of second input data
    output_z: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name: str
        cce kernel name, default value is "maximum"

    Returns
    -------
    res: TVM tensor
        the result of element-wise maximum
    """
    shape1 = te.lang.cce.util.shape_to_list(input_x.shape)
    shape2 = te.lang.cce.util.shape_to_list(input_y.shape)
    shape1 = util.scalar2tensor_one(shape1)

    shape2 = util.scalar2tensor_one(shape2)

    shape1, shape2, shape_max = broadcast_shapes(
        shape1,
        shape2,
        param_name_input1="select1_result",
        param_name_input2="maximum_ones")

    data1_tmp1 = te.lang.cce.broadcast(input_x, shape_max)
    data2_tmp1 = te.lang.cce.broadcast(input_y, shape_max)
    res = te.lang.cce.vmax(data1_tmp1, data2_tmp1)
    return res
Example 2
def _mul_check_format(x, y):
    format_pattern = 0
    shape1 = x.get("shape")
    shape2 = y.get("shape")
    list_format = [x.get("format"), y.get("format")]
    shape1 = util.scalar2tensor_one(shape1)
    shape2 = util.scalar2tensor_one(shape2)
    check_list = [["FRACTAL_NZ", "ND"], ["ND", "FRACTAL_NZ"],
                  ["FRACTAL_NZ", "NHWC"], ["NHWC", "FRACTAL_NZ"],
                  ["FRACTAL_NZ", "NCHW"], ["NCHW", "FRACTAL_NZ"]]
    if list_format == check_list[0] \
            and (len(shape2) != 1 or (len(shape2) == 1 and shape2[0] != 1)):
        format_pattern = 1
    elif list_format == check_list[1] \
            and (len(shape1) != 1 or (len(shape1) == 1 and shape1[0] != 1)):
        format_pattern = 2
    elif list_format == check_list[2] \
            and (len(shape2) != 1 or (len(shape2) == 1 and shape2[0] != 1)):
        format_pattern = 1
    elif list_format == check_list[3] \
            and (len(shape1) != 1 or (len(shape1) == 1 and shape1[0] != 1)):
        format_pattern = 2
    elif list_format == check_list[4] \
            and (len(shape2) != 1 or (len(shape2) == 1 and shape2[0] != 1)):
        format_pattern = 1
    elif list_format == check_list[5] \
            and (len(shape1) != 1 or (len(shape1) == 1 and shape1[0] != 1)):
        format_pattern = 2

    return format_pattern
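As a quick illustration (not from the original source), the classifier above can be exercised with minimal tensor descriptors that carry only the keys it reads; this assumes the surrounding TBE util module is importable:

# Hypothetical descriptors: only "shape" and "format" are read by _mul_check_format.
nz_tensor = {"shape": (4, 2, 16, 16), "format": "FRACTAL_NZ"}
nd_vector = {"shape": (32,), "format": "ND"}
nd_scalar = {"shape": (1,), "format": "ND"}

print(_mul_check_format(nz_tensor, nd_vector))  # 1: Nz paired with a non-scalar ND tensor
print(_mul_check_format(nd_vector, nz_tensor))  # 2: the mirrored order
print(_mul_check_format(nz_tensor, nd_scalar))  # 0: Nz paired with an ND scalar falls through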
Example 3
def op_select_format(grad,
                     x1,
                     x2,
                     y,
                     axis,
                     keepdims,
                     kernel_name="softmax_grad_ext"):
    """
    select format dynamically
    """
    origin_shape0 = util.scalar2tensor_one(grad.get("ori_shape"))
    origin_shape1 = util.scalar2tensor_one(x1.get("ori_shape"))
    origin_shape2 = util.scalar2tensor_one(x2.get("ori_shape"))

    condition_0 = len(origin_shape2) == 1 and origin_shape2[0] == 1
    condition_1 = _division_sixteen(origin_shape0)
    condition_2 = _division_sixteen(origin_shape1)

    if condition_0 and condition_1 and condition_2:
        # NZ + NZ + Scalar
        input0 = gen_param(classify="input0",
                           name="grad",
                           datatype="float16,float",
                           format="FRACTAL_NZ, FRACTAL_NZ")
        input1 = gen_param(classify="input1",
                           name="x1",
                           datatype="float16,float",
                           format="FRACTAL_NZ, FRACTAL_NZ")
        input2 = gen_param(classify="input2",
                           name="x2",
                           datatype="float16,float",
                           format="ND,ND")
        output0 = gen_param(classify="output0",
                            name="y",
                            datatype="float16,float",
                            format="FRACTAL_NZ,FRACTAL_NZ")
    else:
        # ND+ND+ND
        input0 = gen_param(classify="input0",
                           name="grad",
                           datatype="float16,float",
                           format="ND,ND")
        input1 = gen_param(classify="input1",
                           name="x1",
                           datatype="float16,float",
                           format="ND,ND")
        input2 = gen_param(classify="input2",
                           name="x2",
                           datatype="float16,float",
                           format="ND,ND")
        output0 = gen_param(classify="output0",
                            name="y",
                            datatype="float16,float",
                            format="ND,ND")

    param_list = [input0, input1, input2, output0]
    param_dynamic_in_json = get_dynamic_param_in_json(param_list)

    return param_dynamic_in_json
Example 4
def mul(x, y, output, kernel_name="mul"):
    """
    do element-wise mul operation between two input tensors

    Parameters:
    ----------
    x : dict.
        shape, dtype of input x
    y : dict.
        shape, dtype of input y
    output : dict.
        shape, dtype of output
    kernel_name : str.
        cce kernel name, default value is "mul"

    Returns
    -------
    None
    """
    # format_pattern = 1  Nz and vector
    # format_pattern = 2  vector and Nz
    # format_pattern = 0  Nz scalar  Nz Nz  ND ND
    format_pattern = _mul_check_format(x, y)
    shape_x, shape_y = _infer_shape(format_pattern, x, y)

    shape_x = util.scalar2tensor_one(shape_x)
    dtype_x = x.get("dtype").lower()
    shape_y = util.scalar2tensor_one(shape_y)
    dtype_y = y.get("dtype").lower()

    op_utils.check_shape(shape_x, param_name="x")
    op_utils.check_shape(shape_y, param_name="y")

    if dtype_x != dtype_y:
        raise RuntimeError("dtype of inputs should be consistent")
    dtype = dtype_x
    check_list = ("int32", "float16", "float32", "int16")
    op_utils.check_dtype(dtype, check_list, param_name="x")

    vmul_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.vmul", "float32")
    if dtype_x == "float32" and not vmul_support:
        raise RuntimeError(
            "Input dtype is float32, but do not support on the platform")

    shape_x, shape_y, shape_max = op_utils.broadcast_shapes(
        shape_x, shape_y, param_name_input1="x", param_name_input2="y")

    shape_x, shape_y = op_utils.refine_shapes_for_broadcast(shape_x, shape_y)
    input_x = tvm.placeholder(shape_x, dtype=dtype, name="x")
    input_y = tvm.placeholder(shape_y, dtype=dtype, name="y")

    res = _mul_compute(input_x, input_y, output, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": (input_x, input_y, res)}
    te.lang.cce.cce_build_code(sch, config)
Example 5
def op_select_format(input_tensor,
                     input_mask,
                     input_keep_prob,
                     output,
                     kernel_name="dropout_do_mask"):
    """
    _division_sixteen : judge whether the last two dimensions are divided by 16
    scalar2tensor_one : convert scalar to tensor
    """
    shape_0 = input_tensor.get("ori_shape")
    shape_1 = input_mask.get("ori_shape")
    shape_2 = input_keep_prob.get("ori_shape")

    shape_0 = util.scalar2tensor_one(shape_0)
    shape_1 = util.scalar2tensor_one(shape_1)
    shape_2 = util.scalar2tensor_one(shape_2)

    if _division_sixteen(shape_0) and not _division_sixteen(
            shape_1) and not _division_sixteen(shape_2):
        # Nz+ND+ND
        input0 = gen_param(classify="input0",
                           name="x",
                           datatype="float16,float16,float,float",
                           format="ND,FRACTAL_NZ,ND,FRACTAL_NZ")
        input1 = gen_param(classify="input1",
                           name="mask",
                           datatype="uint8,uint8,uint8,uint8",
                           format="ND,ND,ND,ND")
        input2 = gen_param(classify="input2",
                           name="keep_prob",
                           datatype="float16,float16,float,float",
                           format="ND,ND,ND,ND")
        output0 = gen_param(classify="output0",
                            name="y",
                            datatype="float16,float16,float,float",
                            format="ND,FRACTAL_NZ,ND,FRACTAL_NZ")
    else:
        # ND+ND
        input0 = gen_param(classify="input0",
                           name="x",
                           datatype="float16,float",
                           format="ND,ND")
        input1 = gen_param(classify="input1",
                           name="mask",
                           datatype="uint8,uint8",
                           format="ND,ND")
        input2 = gen_param(classify="input2",
                           name="keep_prob",
                           datatype="float16,float",
                           format="ND,ND")
        output0 = gen_param(classify="output0",
                            name="y",
                            datatype="float16,float",
                            format="ND,ND")

    param_list = [input0, input1, input2, output0]
    param_dynamic_in_json = get_dynamic_param_in_json(param_list)
    return param_dynamic_in_json
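The docstring above refers to two helpers that never appear in these listings. Below is a minimal sketch of the behaviour it describes, based only on the docstring; the real implementations live in the TBE util modules and may differ:

def scalar2tensor_one(shape):
    # Sketch: promote a scalar (empty shape) to a rank-1 tensor of size 1.
    if shape is None or len(shape) == 0:
        return [1]
    return list(shape)


def _division_sixteen(shape):
    # Sketch: True only when the tensor has at least two dimensions and both
    # of its last two dimensions are non-zero multiples of 16.
    if len(shape) < 2:
        return False
    if shape[-1] == 0 or shape[-2] == 0:
        return False
    return shape[-1] % 16 == 0 and shape[-2] % 16 == 0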
Example 6
def fake_learned_scale_quant_perlayer(
        input_x,
        alpha,
        quant_max,
        out,
        neg_trunc,
        kernel_name="fake_learned_scale_quant_perlayer"):
    """FakeLearnedScaleQuantPerLayer"""
    input_shape = input_x.get("shape")
    input_dtype = input_x.get("dtype")
    alpha_shape = alpha.get("ori_shape")
    alpha_dtype = alpha.get("dtype")
    quant_max_shape = quant_max.get("ori_shape")
    quant_max_dtype = quant_max.get("dtype")

    alpha_shape = util.scalar2tensor_one(alpha_shape)
    quant_max_shape = util.scalar2tensor_one(quant_max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(alpha_shape, 1, 1, 1)
    util.check_shape_rule(quant_max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(alpha_shape)
    util.check_tensor_shape_size(quant_max_shape)

    check_list = ["float32", "float16"]
    input_dtype = input_dtype.lower()
    alpha_dtype = alpha_dtype.lower()
    quant_max_dtype = quant_max_dtype.lower()
    util.check_dtype_rule(input_dtype, check_list)
    util.check_dtype_rule(alpha_dtype, check_list)
    util.check_dtype_rule(quant_max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]), )

    input_data = tvm.placeholder(input_shape, name="x", dtype=input_dtype)
    alpha_data = tvm.placeholder(alpha_shape,
                                 name="alpha_data",
                                 dtype=alpha_dtype)
    quant_max_data = tvm.placeholder(quant_max_shape,
                                     name="quant_max_data",
                                     dtype=quant_max_dtype)
    res = fake_learned_scale_quant_perlayer_compute(input_data, alpha_data,
                                                    quant_max_data, neg_trunc,
                                                    kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [input_data, alpha_data, quant_max_data, res]
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list,
        "bool_storage_as_1bit": False
    }

    te.lang.cce.cce_build_code(sch, config)
Example 7
def add(input_x, input_y, output_z, kernel_name="add"):
    """
    algorithm: add
    calculating data's add, c = a + b

    Parameters
    ----------
    input_x : dict
        shape and dtype of first input, only support float16, float32, int32
    input_y : dict
        shape and dtype of second input, only support float16, float32, int32
    output_z: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name : str
        cce kernel name, default value is add

    Returns
    -------
    None
    """
    # format_pattern = 1  Nz and vector
    # format_pattern = 2  vector and Nz
    # format_pattern = 0  Nz scalar  Nz Nz  ND ND
    format_pattern = _add_check_format(input_x, input_y)
    shape_x, shape_y = _infer_shape(format_pattern, input_x, input_y)
    shape_x = util.scalar2tensor_one(shape_x)
    shape_y = util.scalar2tensor_one(shape_y)
    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")

    check_tuple = ("float16", "float32", "int32")
    input_data_type = input_x.get("dtype").lower()
    check_dtype(input_data_type, check_tuple, param_name="input_x")

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x,
                                                   shape_y,
                                                   param_name_input1="input_x",
                                                   param_name_input2="input_y")
    if shape_x[-1] == 1 and shape_y[-1] == 1 and shape_max[-1] == 1:
        shape_x = shape_x if len(shape_x) == 1 else shape_x[:-1]
        shape_y = shape_y if len(shape_y) == 1 else shape_y[:-1]
        shape_max = shape_max if len(shape_max) == 1 else shape_max[:-1]

    data_x = tvm.placeholder(shape_x, name="data_1", dtype=input_data_type)
    data_y = tvm.placeholder(shape_y, name="data_2", dtype=input_data_type)

    res = add_compute(data_x, data_y, output_z, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": (data_x, data_y, res)
    }
    te.lang.cce.cce_build_code(schedule, config)
Example 8
def _infer_shape(format_pattern, x, y):
    shape_x = x.get("shape")
    shape_y = y.get("shape")
    ori_shape_x = x.get("ori_shape")
    ori_shape_y = y.get("ori_shape")
    shape_x = util.scalar2tensor_one(shape_x)
    shape_y = util.scalar2tensor_one(shape_y)

    if format_pattern == 1:
        ori_shape_x, shape_y, shape_max = broadcast_shapes(
            ori_shape_x,
            shape_y,
            param_name_input1="input_x",
            param_name_input2="input_y")
        if shape_y[-2] == 1 and shape_y[-1] == ori_shape_x[-1]:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-3] = 1
            shape_y[-1] = shape_x[-1]
            shape_y[-4] = shape_x[-4]

        elif shape_y[-2] == ori_shape_x[-2] and shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-4] = 1
            shape_y[-2] = shape_x[-2]
            shape_y[-3] = shape_x[-3]

        elif shape_y[-2] == shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)

    elif format_pattern == 2:
        shape_x, ori_shape_y, shape_max = broadcast_shapes(
            shape_x,
            ori_shape_y,
            param_name_input1="input_x",
            param_name_input2="input_y")
        if shape_x[-2] == 1 and shape_x[-1] == ori_shape_y[-1]:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-3] = 1
            shape_x[-1] = shape_y[-1]
            shape_x[-4] = shape_y[-4]

        elif shape_x[-2] == ori_shape_y[-2] and shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-4] = 1
            shape_x[-2] = shape_y[-2]
            shape_x[-3] = shape_y[-3]

        elif shape_x[-2] == shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)

    return shape_x, shape_y
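To make the reshaping above concrete, here is a hedged walk-through of the format_pattern == 1 branch, assuming the usual FRACTAL_NZ tiling in which an ND matrix (H, W) is stored as (W/16, H/16, 16, 16):

# Assume x is FRACTAL_NZ with shape_x = [2, 4, 16, 16] (ori_shape_x = [64, 32])
# and y is an ND row vector with shape_y = [32].
# broadcast_shapes first pads the ranks: ori_shape_x = [64, 32], shape_y = [1, 32].
# shape_y[-2] == 1 and shape_y[-1] == ori_shape_x[-1], so the first branch runs:
#   shape_y.append(1) twice      -> [1, 32, 1, 1]
#   shape_y[-3] = 1              -> [1, 1, 1, 1]
#   shape_y[-1] = shape_x[-1]    -> [1, 1, 1, 16]
#   shape_y[-4] = shape_x[-4]    -> [2, 1, 1, 16]
# The length-32 vector is thus spread over the W1 and W0 axes of the Nz layout,
# so it broadcasts cleanly against shape_x = [2, 4, 16, 16].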
Example 9
def fake_quant_with_min_max_grad(dout, x, min_val, max_val, dx,
                                 num_bits, quant_delay, symmetric, narrow_range,
                                 kernel_name="fake_quant_with_min_max_grad"):
    """FakeQuantWithMinMaxGrad"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", 'float16']
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]),)
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    if symmetric:
        quant_min = 0 - 2 ** (num_bits - 1)
        quant_max = 2 ** (num_bits - 1) - 1
    else:
        quant_min = 0
        quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    dout_data = tvm.placeholder(input_shape, name="dout", dtype=x_dtype)
    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res = fake_quant_with_min_max_grad_compute(dout_data, input_data, min_data, max_data, quant_min,
                                               quant_max, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [dout_data, input_data, min_data, max_data, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}

    te.lang.cce.cce_build_code(sch, config)
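For reference, the quantization-range computation above works out as follows for num_bits = 8:

num_bits = 8
quant_min, quant_max = -2 ** (num_bits - 1), 2 ** (num_bits - 1) - 1  # symmetric: (-128, 127)
quant_min_asym, quant_max_asym = 0, 2 ** num_bits - 1                 # asymmetric: (0, 255)
# narrow_range then raises the lower bound by one: -127 (symmetric) or 1 (asymmetric).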
Example 10
def _infer_shape(format_pattern, x, y):
    shape_x = x.get("shape")
    shape_y = y.get("shape")
    ori_shape_x = x.get("ori_shape")
    ori_shape_y = y.get("ori_shape")
    shape_x = util.scalar2tensor_one(shape_x)
    shape_y = util.scalar2tensor_one(shape_y)
    if format_pattern == 1:
        ori_shape_x, shape_y, shape_max = op_utils.broadcast_shapes(
            ori_shape_x, shape_y, param_name_input1="x", param_name_input2="y")
        if shape_y[-2] == ori_shape_x[-2] and shape_y[-1] == ori_shape_x[-1]:
            raise RuntimeError("the inputshape of y is illegal")

        if shape_y[-2] == 1 and shape_y[-1] == ori_shape_x[-1]:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-3] = 1
            shape_y[-1] = shape_x[-1]
            shape_y[-4] = shape_x[-4]

        elif shape_y[-2] == ori_shape_x[-2] and shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-4] = 1
            shape_y[-2] = shape_x[-2]
            shape_y[-3] = shape_x[-3]

        elif shape_y[-2] == shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)

    elif format_pattern == 2:
        shape_x, ori_shape_y, shape_max = op_utils.broadcast_shapes(
            shape_x, ori_shape_y, param_name_input1="x", param_name_input2="y")
        if shape_x[-2] == ori_shape_y[-2] and shape_x[-1] == ori_shape_y[-1]:
            raise RuntimeError("the inputshape of x is illegal")

        if shape_x[-2] == 1 and shape_x[-1] == ori_shape_y[-1]:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-3] = 1
            shape_x[-1] = shape_y[-1]
            shape_x[-4] = shape_y[-4]

        elif shape_x[-2] == ori_shape_y[-2] and shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-4] = 1
            shape_x[-2] = shape_y[-2]
            shape_x[-3] = shape_y[-3]

        elif shape_x[-2] == shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)

    return shape_x, shape_y
Example 11
def real_div(x1, x2, y, kernel_name="real_div"):
    """
    algorithm: real_div
    calculating data's real_div, c = a / b

    Parameters
    ----------
    x1 : dict
        shape and dtype of first input, only support float16, float32, int32
    x2 : dict
        shape and dtype of second input, only support float16, float32, int32
    y: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name : str
        cce kernel name, default value is real_div

    Returns
    -------
    None
    """
    shape_x = util.scalar2tensor_one(x1.get("shape"))
    shape_y = util.scalar2tensor_one(x2.get("shape"))
    check_shape(shape_x, param_name="x1")
    check_shape(shape_y, param_name="x2")

    check_tuple = ("float16", "float32")
    input_data_type = x1.get("dtype").lower()
    check_dtype(input_data_type, check_tuple, param_name="x1")
    input_data_type_x2 = x2.get("dtype").lower()
    check_dtype(input_data_type_x2, check_tuple, param_name="x2")

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x,
                                                   shape_y,
                                                   param_name_input1="x1",
                                                   param_name_input2="x2")
    if shape_x[-1] == 1 and shape_y[-1] == 1 and shape_max[-1] == 1:
        shape_x = shape_x if len(shape_x) == 1 else shape_x[:-1]
        shape_y = shape_y if len(shape_y) == 1 else shape_y[:-1]
        shape_max = shape_max if len(shape_max) == 1 else shape_max[:-1]

    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)
    data_x = tvm.placeholder(shape_x, name="data_x", dtype=input_data_type)
    data_y = tvm.placeholder(shape_y, name="data_y", dtype=input_data_type)

    res = real_div_compute(data_x, data_y, y, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": (data_x, data_y, res)
    }

    te.lang.cce.cce_build_code(schedule, config)
Example 12
def fake_quant_minmax_update(x, min_val, max_val, min_up, max_up,
                             ema, ema_decay, symmetric, narrow_range, training, num_bits,
                             kernel_name="fake_quant_minmax_update"):
    """FakeQuantPerLayer op"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]),)
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    if symmetric:
        quant_min = 0 - 2 ** (num_bits - 1)
        quant_max = 2 ** (num_bits - 1) - 1
    else:
        quant_min = 0
        quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res_list = fake_quant_minmax_update_compute(input_data, min_data, max_data,
                                                ema, ema_decay, quant_min, quant_max, training, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}

    te.lang.cce.cce_build_code(sch, config)
Example 13
def minmax_update_perlayer(x,
                           min_val,
                           max_val,
                           min_up,
                           max_up,
                           ema,
                           ema_decay,
                           kernel_name="minmax_update_perlayer"):
    """MinMaxUpdatePerLayer op"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]), )
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res_list = minmax_update_perlayer_compute(input_data, min_data, max_data,
                                              ema, ema_decay)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
Example 14
def check_ori_shape(input0, input1, input2):
    """
    check the ND shapes whether they can be broadcasted
    """
    shape_0 = list(util.scalar2tensor_one(input0.get("ori_shape")))
    shape_1 = list(util.scalar2tensor_one(input1.get("ori_shape")))
    shape_2 = list(util.scalar2tensor_one(input2.get("ori_shape")))
    shape_input0, shape_input1, shape_max_mul = \
        broadcast_shapes(shape_0, shape_1, param_name_input1="input0",
                         param_name_input2="input1")
    shape_input2, shape_max_mul, shape_max_add0 = \
        broadcast_shapes(shape_0, shape_2, param_name_input1="input0",
                         param_name_input2="input2")
Example 15
def minimum(x1, x2, y, kernel_name="minimum"):
    """
    do element-wise minimum operation between two input tensors

    Parameters:
    ----------
    x1 : dict
        shape and dtype of first input, only support float16, float32, int32
    x2 : dict
        shape and dtype of second input, only support float16, float32, int32
    y: dict
        shape and dtype of output, should be the broadcast shape and
        type as input
    kernel_name : str
        cce kernel name, default value is minimum

    Returns
    -------
    None
    """
    shape1 = util.scalar2tensor_one(x1.get("shape"))
    shape2 = util.scalar2tensor_one(x2.get("shape"))

    check_shape(shape1, param_name="x1")
    check_shape(shape2, param_name="x2")

    check_list = ["float16", "float32", "int32"]
    dtype = x1.get("dtype").lower()
    dtype_x2 = x2.get("dtype").lower()
    check_dtype(dtype, check_list, param_name="x1")
    check_dtype(dtype_x2, check_list, param_name="x2")

    shape1, shape2, _ = broadcast_shapes(shape1,
                                         shape2,
                                         param_name_input1="x1",
                                         param_name_input2="x2")

    data1 = tvm.placeholder(shape1, dtype=dtype, name="data1")
    data2 = tvm.placeholder(shape2, dtype=dtype, name="data2")

    res = minimum_compute(data1, data2, y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data1, data2, res]
    }
    te.lang.cce.cce_build_code(sch, config)
Example 16
def _infer_shape(format_pattern, x, y):
    shape_x = x.get("shape")
    shape_y = y.get("shape")
    ori_shape_x = x.get("ori_shape")
    ori_shape_y = y.get("ori_shape")
    shape_x = util.scalar2tensor_one(shape_x)
    shape_y = util.scalar2tensor_one(shape_y)

    if format_pattern == 1:
        ori_shape_x, shape_y, _ = util.produce_shapes(ori_shape_x, shape_y)

        if shape_y[-2] == 1 and shape_y[-1] == ori_shape_x[-1]:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-3] = 1
            shape_y[-1] = shape_x[-1]
            shape_y[-4] = shape_x[-4]

        elif shape_y[-2] == ori_shape_x[-2] and shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-4] = 1
            shape_y[-2] = shape_x[-2]
            shape_y[-3] = shape_x[-3]

        elif shape_y[-2] == shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)

    elif format_pattern == 2:
        shape_x, ori_shape_y, _ = util.produce_shapes(shape_x, ori_shape_y)

        if shape_x[-2] == 1 and shape_x[-1] == ori_shape_y[-1]:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-3] = 1
            shape_x[-1] = shape_y[-1]
            shape_x[-4] = shape_y[-4]

        elif shape_x[-2] == ori_shape_y[-2] and shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-4] = 1
            shape_x[-2] = shape_y[-2]
            shape_x[-3] = shape_y[-3]

        elif shape_x[-2] == shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)

    return shape_x, shape_y
Example 17
def less(input_x, input_y, output_z, kernel_name="less"):
    """
    do element-wise less operation between two input tensors

    Parameters:
    ----------
    input_x : dict
        shape and dtype of first input, support float16,float32,int32,
        int8,uint8
    input_y : dict
        shape and dtype of second input, support float16,float32,int32,
        int8,uint8
    output_z: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name : str
        cce kernel name, default value is less

    Returns
    -------
    None
    """
    shape_x = util.scalar2tensor_one(input_x.get("shape"))
    shape_y = util.scalar2tensor_one(input_y.get("shape"))
    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")

    check_list = ("float16", "float32", "int32", "int8", "uint8")
    input_dtype = input_x.get("dtype").lower()
    check_dtype(input_dtype, check_list, param_name="input_x")

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x,
                                                   shape_y,
                                                   param_name_input1="input_x",
                                                   param_name_input2="input_y")

    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)
    data_x = tvm.placeholder(shape_x, dtype=input_dtype, name="data_x")
    data_y = tvm.placeholder(shape_y, dtype=input_dtype, name="data_y")

    res = less_compute(data_x, data_y, output_z, kernel_name="less")
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data_x, data_y, res]
    }
    te.lang.cce.cce_build_code(sch, config)
Example 18
def _check_para_and_getplaceholder(scalar_input, tensor_input, input_dict):
    check_list = ("float32", )
    var_shape = input_dict["var"].get("shape")
    var_dtype = input_dict["var"].get("dtype")
    list_placeholder = []
    for key, value in input_dict.items():
        shape = util.scalar2tensor_one(value.get("shape"))
        op_utils.check_shape(shape)
        if value in scalar_input:
            if not util.is_scalar(shape):
                raise RuntimeError("The shape of ", key, " must be scalar")
        if value in tensor_input:
            if shape != var_shape:
                raise RuntimeError("The shape of", key,
                                   "must be the same as the var")

        dtype = value.get("dtype").lower()
        op_utils.check_dtype(dtype, check_list, param_name="var")
        if dtype != var_dtype:
            raise RuntimeError("The dtype of", key,
                               "must be the same as the var")

        shape_refine = (functools_reduce(operator.mul, shape), )
        list_placeholder.append(
            tvm.placeholder(shape=shape_refine, name=key, dtype=dtype))
    return list_placeholder
Example 19
def _condition(x, perm, shape, transpose_first):
    shape_x = util.scalar2tensor_one(x.get("ori_shape"))

    if transpose_first:
        shape_reshapein = _shape_after_transpose(shape_x, perm)
    else:
        shape_reshapein = shape_x
        if not _division_sixteen(_shape_after_transpose(shape, perm)):
            return False

    if (len(perm) == 4 and _division_sixteen(shape_x) and perm[3] == 3):
        if len(shape_reshapein) == 2 and len(shape) == 4:
            if (shape[0] * shape[1] == shape_reshapein[0]
                    and shape[2] * shape[3] == shape_reshapein[1]):
                return True
        if len(shape_reshapein) == 4 and len(shape) == 2:
            if (shape_reshapein[0] * shape_reshapein[1] == shape[0]
                    and shape_reshapein[2] * shape_reshapein[3] == shape[1]):
                return True
        if len(shape_reshapein) == 3 and len(shape) == 4:
            if (shape[1] * shape[2] == shape_reshapein[1]
                    and shape[0] == shape_reshapein[0]
                    and shape[3] == shape_reshapein[2]):
                return True
        if len(shape_reshapein) == 4 and len(shape) == 3:
            if (shape_reshapein[1] * shape_reshapein[2] == shape[1]
                    and shape_reshapein[0] == shape[0]
                    and shape_reshapein[3] == shape[2]):
                return True

    return False
Example 20
def sub(input_x, input_y, output_z, kernel_name="sub"):
    """
    do element-wise sub operation between two input tensors

    Parameters:
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32,int32
    input_y : dict
        shape and dtype of input, only support float16, float32,int32
    output_z: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is "sub"

    Returns
    -------
    None
    """
    shape_x = util.scalar2tensor_one(input_x.get("shape"))
    shape_y = util.scalar2tensor_one(input_y.get("shape"))
    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")

    check_list = ["float16", "float32", "int32"]
    dtype = input_x.get("dtype").lower()
    if dtype not in check_list:
        raise RuntimeError("sub only support float16, float32, int32")

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x,
                                                   shape_y,
                                                   param_name_input1="input_x",
                                                   param_name_input2="input_y")

    data1 = tvm.placeholder(shape_x, dtype=dtype, name="data1")
    data2 = tvm.placeholder(shape_y, dtype=dtype, name="data2")

    res = sub_compute(data1, data2, output_z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data1, data2, res]
    }
    te.lang.cce.cce_build_code(sch, config)
Example 21
def greater(x, y, z, kernel_name="greater"):
    """
    do element-wise greater operation between two input tensors

    Parameters:
    ----------
    x : dict
        shape and dtype of input data_x
    y : dict
        shape and dtype of input data_y
    z : dict
        shape and dtype of output data_z
    kernel_name : str
        cce kernel name, default value is "greater"

    Returns
    -------
    None
    """
    shape_input_x = util.scalar2tensor_one(x.get("shape"))
    dtype_input_x = x.get("dtype").lower()
    shape_input_y = util.scalar2tensor_one(y.get("shape"))
    dtype_input_y = y.get("dtype").lower()

    check_shape(shape_input_x, param_name="x")
    check_shape(shape_input_y, param_name="y")

    check_list = ("float16", "float32", "int32", "int8", "uint8")
    check_dtype(dtype_input_x, check_list, param_name="x")

    shape_list = broadcast_shapes(shape_input_x,
                                  shape_input_y,
                                  param_name_input1="x",
                                  param_name_input2="y")

    reshape_x, reshape_y = refine_shapes_for_broadcast(shape_list[0],
                                                       shape_list[1])
    data_x = tvm.placeholder(reshape_x, dtype=dtype_input_x, name="data_x")
    data_y = tvm.placeholder(reshape_y, dtype=dtype_input_y, name="data_y")

    res = greater_compute(data_x, data_y, z, kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_x, data_y, res]}
    te.lang.cce.cce_build_code(sch, config)
Example 22
def _add_check_format(x, y):
    shape1 = x.get("shape")
    shape2 = y.get("shape")
    list_format = [x.get("format"), y.get("format")]
    shape1 = util.scalar2tensor_one(shape1)
    shape2 = util.scalar2tensor_one(shape2)

    format_list = ("ND", "NCHW", "NHWC")
    if list_format[0] == "FRACTAL_NZ" and list_format[1] in format_list \
            and (len(shape2) != 1 or (len(shape2) == 1 and shape2[0] != 1)):
        format_pattern = 1
    elif list_format[0] in format_list and list_format[1] == "FRACTAL_NZ" \
            and (len(shape1) != 1 or (len(shape1) == 1 and shape1[0] != 1)):
        format_pattern = 2
    else:
        format_pattern = 0

    return format_pattern
Example 23
def maximum(input_x, input_y, output_z, kernel_name="maximum"):
    """
    do element-wise maximum operation between two input tensors

    """
    shape1 = te.lang.cce.util.shape_to_list(input_x.shape)
    shape2 = te.lang.cce.util.shape_to_list(input_y.shape)
    shape1 = util.scalar2tensor_one(shape1)

    shape2 = util.scalar2tensor_one(shape2)

    shape1, shape2, shape_max = broadcast_shapes(shape1,
                                                 shape2,
                                                 param_name_input1="input_x",
                                                 param_name_input2="input_y")

    data1_tmp1 = te.lang.cce.broadcast(input_x, shape_max)
    data2_tmp1 = te.lang.cce.broadcast(input_y, shape_max)
    res = te.lang.cce.vmax(data1_tmp1, data2_tmp1)
    return res
Example 24
def relu6_d(input_x, output_y, scale=1.0, kernel_name="relu6_d"):
    """
       f(x) = 6  (x >= 6)
       f(x) = 0  (x <= 0)
       f(x) = x  (0 < x < 6)

    Parameters
    ----------
    input_x : dict
        shape and dtype of input_x
    output_y : dict
        shape and dtype of output_y, should be same shape and type as input

    kernel_name : str
        cce kernel name, default value is "relu6"

    Returns
    ------
    None
    """
    input_shape = util.scalar2tensor_one(input_x.get("shape"))
    input_dtype = input_x.get("dtype").lower()
    op_utils.check_shape(input_shape, param_name="input_x")

    vmaxs_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.vmaxs", "float32")
    if input_dtype == "float32" and not vmaxs_support:
        raise RuntimeError(
            "Input dtype is float32, but do not support on the platform")

    # check input tensor data_type
    check_list = ("int32", "float16", "float32")
    op_utils.check_dtype(input_dtype, check_list, param_name="input_x")

    input_shape = [reduce_ins(lambda x, y: x * y, input_shape[:])]
    input_data = tvm.placeholder(input_shape,
                                 name="input_data",
                                 dtype=input_dtype)
    final_res = relu6_d_compute(input_data,
                                output_y,
                                scale,
                                kernel_name=kernel_name)

    with tvm.target.cce():
        auto_sch = topi.generic.auto_schedule(final_res)

    config = {"name": kernel_name, "tensor_list": (input_data, final_res)}
    te.lang.cce.cce_build_code(auto_sch, config)
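A plain-Python reference for the piecewise definition in the docstring (the scale argument of relu6_d is not modelled in this sketch):

def relu6_reference(x):
    # Clamp x into [0, 6]: f(x) = 6 for x >= 6, 0 for x <= 0, x otherwise.
    return min(max(x, 0.0), 6.0)

assert relu6_reference(-3.0) == 0.0
assert relu6_reference(2.5) == 2.5
assert relu6_reference(7.0) == 6.0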
Example 25
def reciprocal(input_x, output_y, kernel_name="reciprocal"):
    """
    algorithm: reciprocal

    calculating data's reciprocal,y= 1 / x

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32
    output_y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is reciprocal

    Returns
    -------
    None
    """
    shape = util.scalar2tensor_one(input_x.get("shape"))
    check_shape(shape, param_name="input_x")

    check_list = ["float16", "float32"]
    inp_dtype = input_x.get("dtype").lower()
    check_dtype(inp_dtype, check_list, param_name="input_x")

    shape = util.shape_refine(shape)
    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)
    data = tvm.placeholder(fuseshape, name="data", dtype=inp_dtype)

    res = reciprocal_compute(data, output_y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data, res]
    }

    te.lang.cce.cce_build_code(sch, config)
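The fuseshape step above collapses the refined shape into a single flat dimension (reduceIns in the listing is presumably functools.reduce under an alias); for example:

from functools import reduce

shape = [2, 3, 4]
fuseshape = [reduce(lambda x, y: x * y, shape)]  # [24]: the tensor is scheduled as one flat axis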
Example 26
def fused_mul_add(input0, input1, input2, output, kernel_name="fused_mul_add"):
    """
    function: fused for mul+add

    Parameters
    ----------
    input0: dict
         the dict of input of mul, support float16,float32,int32
    input1: dict
         the dict of input of mul, support float16,float32,int32
    input2: dict
         the dict of input of add, support float16,float32,int32
    output: dict
         the dict of output of add, support float16,float32,int32
    kernel_name: str
        cce kernel name, default value is fused_mul_add

    Returns
    -------
    None
    """
    shape_input0 = list(util.scalar2tensor_one(input0.get("shape")))
    shape_input1 = list(util.scalar2tensor_one(input1.get("shape")))
    shape_input2 = list(util.scalar2tensor_one(input2.get("shape")))

    dtype_input0 = input0.get("dtype").lower()
    dtype_input1 = input1.get("dtype").lower()
    dtype_input2 = input2.get("dtype").lower()

    format_input0 = input0.get("format").upper()
    format_input1 = input1.get("format").upper()
    format_input2 = input2.get("format").upper()

    check_ori_shape(input0, input1, input2)
    format_pattern = check_format(format_input0, format_input1, format_input2)
    if format_pattern in [1, 2, 3]:
        shape_input0, shape_input1, shape_input2 = \
            _infer_shape_one(shape_input0, shape_input1,
                             shape_input2, format_pattern)
    elif format_pattern == 4:
        shape_input0, shape_input1, shape_input2 = \
            _infer_shape_two(shape_input0, shape_input1,
                             shape_input2, format_pattern)
    else:
        shape_input0, shape_input1, shape_max_mul = \
            broadcast_shapes(shape_input0, shape_input1, param_name_input1="input0",
                             param_name_input2="input1")
        shape_input2, shape_max_mul, shape_max_add0 = \
            broadcast_shapes(shape_input2, shape_max_mul, param_name_input1="input2",
                             param_name_input2="shape_max_mul")

    data_input0 = tvm.placeholder(shape_input0,
                                  name="data_input0",
                                  dtype=dtype_input0)
    data_input1 = tvm.placeholder(shape_input1,
                                  name="data_input1",
                                  dtype=dtype_input1)
    data_input2 = tvm.placeholder(shape_input2,
                                  name="data_input2",
                                  dtype=dtype_input2)

    res = fused_mul_add_compute(data_input0, data_input1, data_input2, output,
                                kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": (data_input0, data_input1, data_input2, res)
    }

    te.lang.cce.cce_build_code(sch, config)
Example 27
def op_select_format(input0,
                     input1,
                     input2,
                     output,
                     kernel_name="fused_mul_add"):
    """
    _division_sixteen : judge whether the last two dimensions are divided by 16
    scalar2tensor_one : convert scalar to tensor
    """
    shape_0 = input0.get("ori_shape")
    shape_1 = input1.get("ori_shape")
    shape_2 = input2.get("ori_shape")

    shape_0 = util.scalar2tensor_one(shape_0)
    shape_1 = util.scalar2tensor_one(shape_1)
    shape_2 = util.scalar2tensor_one(shape_2)

    if _division_sixteen(shape_0) and not _division_sixteen(shape_1) \
            and not _division_sixteen(shape_2):
        # Nz+ND+ND
        input0 = gen_param(classify="input0",
                           name="x1",
                           datatype="float16,float16,float16,float16,float16,\
                                     float,float,float,float,float,\
                                     int32,int32,int32,int32,int32",
                           format="NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,\
                                   NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,\
                                   NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ")
        input1 = gen_param(classify="input1",
                           name="x2",
                           datatype="float16,float16,float16,float16,float16,\
                                     float,float,float,float,float,\
                                     int32,int32,int32,int32,int32",
                           format="NCHW,NC1HWC0,NHWC,ND,ND,\
                                   NCHW,NC1HWC0,NHWC,ND,ND,\
                                   NCHW,NC1HWC0,NHWC,ND,ND")
        input2 = gen_param(classify="input2",
                           name="x3",
                           datatype="float16,float16,float16,float16,float16,\
                                     float,float,float,float,float,\
                                     int32,int32,int32,int32,int32",
                           format="NCHW,NC1HWC0,NHWC,ND,ND,\
                                   NCHW,NC1HWC0,NHWC,ND,ND,\
                                   NCHW,NC1HWC0,NHWC,ND,ND")
        output0 = gen_param(classify="output0",
                            name="y",
                            datatype="float16,float16,float16,float16,float16,\
                                      float,float,float,float,float,\
                                      int32,int32,int32,int32,int32",
                            format="NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,\
                                    NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,\
                                    NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ")

    elif _division_sixteen(shape_0) and not _division_sixteen(shape_1) \
            and _division_sixteen(shape_2):
        # Nz+ND+Nz
        input0 = gen_param(classify="input0",
                           name="x1",
                           datatype="float16,float16,float16,float16,float16,\
                                     float,float,float,float,float,\
                                     int32,int32,int32,int32,int32",
                           format="NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,\
                                   NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,\
                                   NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ")
        input1 = gen_param(classify="input1",
                           name="x2",
                           datatype="float16,float16,float16,float16,float16,\
                                     float,float,float,float,float,\
                                     int32,int32,int32,int32,int32",
                           format="NCHW,NC1HWC0,NHWC,ND,ND,\
                                   NCHW,NC1HWC0,NHWC,ND,ND,\
                                   NCHW,NC1HWC0,NHWC,ND,ND")
        input2 = gen_param(classify="input2",
                           name="x3",
                           datatype="float16,float16,float16,float16,float16,\
                                     float,float,float,float,float,\
                                     int32,int32,int32,int32,int32",
                           format="NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,\
                                   NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,\
                                   NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ")
        output0 = gen_param(classify="output0",
                            name="y",
                            datatype="float16,float16,float16,float16,float16,\
                                      float,float,float,float,float,\
                                      int32,int32,int32,int32,int32",
                            format="NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,\
                                    NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,\
                                    NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ")

    elif not _division_sixteen(shape_0) and _division_sixteen(shape_1) \
            and not _division_sixteen(shape_2):
        # ND+NZ+ND
        input0 = gen_param(classify="input0",
                           name="x1",
                           datatype="float16,float16,float16,float16,float16,\
                                     float,float,float,float,float,\
                                     int32,int32,int32,int32,int32",
                           format="NCHW,NC1HWC0,NHWC,ND,ND,\
                                   NCHW,NC1HWC0,NHWC,ND,ND,\
                                   NCHW,NC1HWC0,NHWC,ND,ND")
        input1 = gen_param(classify="input1",
                           name="x2",
                           datatype="float16,float16,float16,float16,float16,\
                                     float,float,float,float,float,\
                                     int32,int32,int32,int32,int32",
                           format="NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,\
                                   NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,\
                                   NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ")
        input2 = gen_param(classify="input2",
                           name="x3",
                           datatype="float16,float16,float16,float16,float16,\
                                     float,float,float,float,float,\
                                     int32,int32,int32,int32,int32",
                           format="NCHW,NC1HWC0,NHWC,ND,ND,\
                                   NCHW,NC1HWC0,NHWC,ND,ND,\
                                   NCHW,NC1HWC0,NHWC,ND,ND")
        output0 = gen_param(classify="output0",
                            name="y",
                            datatype="float16,float16,float16,float16,float16,\
                                      float,float,float,float,float,\
                                      int32,int32,int32,int32,int32",
                            format="NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,\
                                    NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,\
                                    NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ")

    elif not _division_sixteen(shape_0) and not _division_sixteen(shape_1) \
            and _division_sixteen(shape_2):
        # ND+ND+NZ
        input0 = gen_param(classify="input0",
                           name="x1",
                           datatype="float16,float16,float16,float16,float16,\
                                     float,float,float,float,float,\
                                     int32,int32,int32,int32,int32",
                           format="NCHW,NC1HWC0,NHWC,ND,ND,\
                                   NCHW,NC1HWC0,NHWC,ND,ND,\
                                   NCHW,NC1HWC0,NHWC,ND,ND")
        input1 = gen_param(classify="input1",
                           name="x2",
                           datatype="float16,float16,float16,float16,float16,\
                                     float,float,float,float,float,\
                                     int32,int32,int32,int32,int32",
                           format="NCHW,NC1HWC0,NHWC,ND,ND,\
                                   NCHW,NC1HWC0,NHWC,ND,ND,\
                                   NCHW,NC1HWC0,NHWC,ND,ND")
        input2 = gen_param(classify="input2",
                           name="x3",
                           datatype="float16,float16,float16,float16,float16,\
                                     float,float,float,float,float,\
                                     int32,int32,int32,int32,int32",
                           format="NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,\
                                   NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,\
                                   NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ")
        output0 = gen_param(classify="output0",
                            name="y",
                            datatype="float16,float16,float16,float16,float16,\
                                      float,float,float,float,float,\
                                      int32,int32,int32,int32,int32",
                            format="NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,\
                                    NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,\
                                    NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ")
    else:
        # ND+ND
        input0 = gen_param(classify="input0",
                           name="x1",
                           datatype="float16,float16,float16,float16,\
                                     float,float,float,float,\
                                     int32,int32,int32,int32",
                           format="NCHW,NC1HWC0,NHWC,ND,\
                                   NCHW,NC1HWC0,NHWC,ND,\
                                   NCHW,NC1HWC0,NHWC,ND")
        input1 = gen_param(classify="input1",
                           name="x2",
                           datatype="float16,float16,float16,float16,\
                                     float,float,float,float,\
                                     int32,int32,int32,int32",
                           format="NCHW,NC1HWC0,NHWC,ND,\
                                   NCHW,NC1HWC0,NHWC,ND,\
                                   NCHW,NC1HWC0,NHWC,ND")
        input2 = gen_param(classify="input2",
                           name="x3",
                           datatype="float16,float16,float16,float16,\
                                     float,float,float,float,\
                                     int32,int32,int32,int32",
                           format="NCHW,NC1HWC0,NHWC,ND,\
                                   NCHW,NC1HWC0,NHWC,ND,\
                                   NCHW,NC1HWC0,NHWC,ND")
        output0 = gen_param(classify="output0",
                            name="y",
                            datatype="float16,float16,float16,float16,\
                                      float,float,float,float,\
                                      int32,int32,int32,int32",
                            format="NCHW,NC1HWC0,NHWC,ND,\
                                   NCHW,NC1HWC0,NHWC,ND,\
                                   NCHW,NC1HWC0,NHWC,ND")

    param_list = [input0, input1, input2, output0]
    param_dynamic_in_json = get_dynamic_param_in_json(param_list)
    return param_dynamic_in_json
Example 28
def lamb_next_m_v(input_mul3,
                  input_mul2,
                  input_realdiv1,
                  input_mul1,
                  input_mul0,
                  input_realdiv0,
                  input_mul4,
                  mul0_x,
                  mul1_sub,
                  mul2_x,
                  mul3_sub1,
                  mul4_x,
                  add2_y,
                  y1,
                  y2,
                  y3,
                  y4,
                  kernel_name="lamb_next_m_v"):
    """
    function: fused computation of the next-m/next-v moment updates for the BERT LAMB optimizer

    Parameters
    ----------
    input_mul3: dict
        the dict of input of mul_3, and dtype supports 'float16', 'float32'
    input_mul2: dict
        the dict of input of mul_2, and dtype supports 'float16', 'float32'
    input_realdiv1: dict
        the dict of input of truediv_1,
        and dtype supports 'float16', 'float32'
    input_mul1: dict
        the dict of input of mul_1, and dtype supports 'float16', 'float32'
    input_mul0: dict
        the dict of input of mul, and dtype supports 'float16', 'float32'
    input_realdiv0: dict
        the dict of input of truediv, and dtype supports 'float16', 'float32'
    input_mul4: dict
        the dict of input of mul_4, and dtype supports 'float16', 'float32'
    mul0_x: dict
        the dict of input of mul, and dtype supports 'float16', 'float32'
    mul1_sub: dict
        the dict of input of mul_1, and dtype supports 'float16', 'float32'
    mul2_x: dict
        the dict of input of mul_2, and dtype supports 'float16', 'float32'
    mul3_sub1: dict
        the dict of input of mul_3, and dtype supports 'float16', 'float32'
    mul4_x: dict
        the dict of input of mul_4, and dtype supports 'float16', 'float32'
    add2_y: dict
        the dict of input of add_2 and add_4,
        and dtype supports 'float16', 'float32'
    y1: dict
        the dict of output of add_3, and dtype supports 'float16', 'float32'
    y2: dict
        the dict of output of add, and dtype supports 'float16', 'float32'
    y3: dict
        the dict of output of add_1, and dtype supports 'float16', 'float32'
    y4: dict
        the dict of output of truediv_4,
        and dtype supports 'float16', 'float32'
    kernel_name: str
        cce kernel name, default value is lamb_next_m_v

    Returns
    -------
    None
    """
    shape_input_mul3 = util.scalar2tensor_one(input_mul3.get("shape"))
    shape_input_mul2 = util.scalar2tensor_one(input_mul2.get("shape"))
    shape_input_realdiv1 = util.scalar2tensor_one(input_realdiv1.get("shape"))
    shape_input_mul1 = util.scalar2tensor_one(input_mul1.get("shape"))
    shape_input_mul0 = util.scalar2tensor_one(input_mul0.get("shape"))
    shape_input_realdiv0 = util.scalar2tensor_one(input_realdiv0.get("shape"))
    shape_input_mul4 = util.scalar2tensor_one(input_mul4.get("shape"))
    shape_mul0_x = util.scalar2tensor_one(mul0_x.get("shape"))
    shape_mul1_sub = util.scalar2tensor_one(mul1_sub.get("shape"))
    shape_mul2_x = util.scalar2tensor_one(mul2_x.get("shape"))
    shape_mul3_sub1 = util.scalar2tensor_one(mul3_sub1.get("shape"))
    shape_mul4_x = util.scalar2tensor_one(mul4_x.get("shape"))
    shape_add2_y = util.scalar2tensor_one(add2_y.get("shape"))

    input_dtype = input_mul3.get("dtype").lower()

    shape_input_mul3, shape_mul3_sub1, shape_max_mul3 = \
        broadcast_shapes(shape_input_mul3, shape_mul3_sub1, param_name_input1="input_mul3",
                         param_name_input2="mul3_sub1")
    shape_input_mul2, shape_mul2_x, shape_max_mul2 = \
        broadcast_shapes(shape_input_mul2, shape_mul2_x, param_name_input1="input_mul2",
                         param_name_input2="mul2_x")
    shape_max_mul2, shape_max_mul3, shape_max_add1 = \
        broadcast_shapes(shape_max_mul2, shape_max_mul3, param_name_input1="shape_max_mul2",
                         param_name_input2="shape_max_mul3")
    shape_input_realdiv1, shape_max_add1, shape_max_truediv1 = \
        broadcast_shapes(shape_input_realdiv1, shape_max_add1, param_name_input1="input_realdiv1",
                         param_name_input2="shape_max_add1")
    shape_max_truediv1, shape_add2_y, shape_max_add2 = \
        broadcast_shapes(shape_max_truediv1, shape_add2_y, param_name_input1="shape_max_truediv1",
                         param_name_input2="add2_y")
    shape_input_mul1, shape_mul1_sub, shape_max_mul1 = \
        broadcast_shapes(shape_input_mul1, shape_mul1_sub, param_name_input1="input_mul1",
                         param_name_input2="mul1_sub")
    shape_input_mul0, shape_mul0_x, shape_max_mul0 = \
        broadcast_shapes(shape_input_mul0, shape_mul0_x, param_name_input1="input_mul0",
                         param_name_input2="mul0_x")
    shape_max_mul0, shape_max_mul1, shape_max_add0 = \
        broadcast_shapes(shape_max_mul0, shape_max_mul1, param_name_input1="shape_max_mul0",
                         param_name_input2="shape_max_mul1")
    shape_max_add0, shape_input_realdiv0, shape_max_truediv0 = \
        broadcast_shapes(shape_max_add0, shape_input_realdiv0, param_name_input1="shape_max_add0",
                         param_name_input2="input_realdiv0")
    shape_input_mul4, shape_mul4_x, shape_max_mul4 = \
        broadcast_shapes(shape_input_mul4, shape_mul4_x, param_name_input1="input_mul4",
                         param_name_input2="mul4_x")

    data_input_mul3 = tvm.placeholder(shape_input_mul3,
                                      name="data_input_mul3",
                                      dtype=input_dtype)
    data_input_mul2 = tvm.placeholder(shape_input_mul2,
                                      name="data_input_mul2",
                                      dtype=input_dtype)
    data_input_realdiv1 = tvm.placeholder(shape_input_realdiv1,
                                          name="data_input_realdiv1",
                                          dtype=input_dtype)
    data_input_mul1 = tvm.placeholder(shape_input_mul1,
                                      name="data_input_mul1",
                                      dtype=input_dtype)
    data_input_mul0 = tvm.placeholder(shape_input_mul0,
                                      name="data_input_mul0",
                                      dtype=input_dtype)
    data_input_realdiv0 = tvm.placeholder(shape_input_realdiv0,
                                          name="data_input_realdiv0",
                                          dtype=input_dtype)
    data_input_mul4 = tvm.placeholder(shape_input_mul4,
                                      name="data_input_mul4",
                                      dtype=input_dtype)
    data_mul0_x = tvm.placeholder(shape_mul0_x,
                                  name="data_mul0_x",
                                  dtype=input_dtype)
    data_mul1_sub = tvm.placeholder(shape_mul1_sub,
                                    name="data_mul1_sub",
                                    dtype=input_dtype)
    data_mul2_x = tvm.placeholder(shape_mul2_x,
                                  name="data_mul2_x",
                                  dtype=input_dtype)
    data_mul3_sub1 = tvm.placeholder(shape_mul3_sub1,
                                     name="data_mul3_sub1",
                                     dtype=input_dtype)
    data_mul4_x = tvm.placeholder(shape_mul4_x,
                                  name="data_mul4_x",
                                  dtype=input_dtype)
    data_add2_y = tvm.placeholder(shape_add2_y,
                                  name="data_add2_y",
                                  dtype=input_dtype)

    res = lamb_next_m_v_compute(data_input_mul3, data_input_mul2,
                                data_input_realdiv1, data_input_mul1,
                                data_input_mul0, data_input_realdiv0,
                                data_input_mul4, data_mul0_x, data_mul1_sub,
                                data_mul2_x, data_mul3_sub1, data_mul4_x,
                                data_add2_y, y1, y2, y3, y4, kernel_name)

    inputlist = [
        data_input_mul3, data_input_mul2, data_input_realdiv1, data_input_mul1,
        data_input_mul0, data_input_realdiv0, data_input_mul4, data_mul0_x,
        data_mul1_sub, data_mul2_x, data_mul3_sub1, data_mul4_x, data_add2_y
    ]
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": list(inputlist) + list(res)}

    te.lang.cce.cce_build_code(sch, config)
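A hypothetical invocation sketch for the entry function above; the shapes are illustrative assumptions, and the call only compiles a kernel in an environment where the TE/TBE toolchain is available:

# Illustrative shapes only: moment/gradient inputs as tensors, hyper-parameter inputs as scalars.
tensor = {"shape": (1024, 1024), "dtype": "float32"}
scalar = {"shape": (1,), "dtype": "float32"}
out = dict(tensor)

lamb_next_m_v(tensor, tensor, scalar, tensor, tensor, scalar, tensor,
              scalar, scalar, scalar, scalar, scalar, scalar,
              out, out, out, out,
              kernel_name="lamb_next_m_v_demo")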
Example no. 29
def cast(input_x, output_y, dst_type, kernel_name="cast"):
    """
    cast a tensor/scalar with the input shape from the src data type to
    the dst data type. Restrictions on the input are as follows;
    only the type conversions below are supported for tensor processing:
        float16->float32
        float16->int32
        float32->float16
        float32->int32
        int8->float32
        uint8->float32
        int8->float16
        uint8->float16
        int8->int32
        uint8->int32
        int32->uint8 // numbers outside [0,255] may give unexpected results
        int32->int8 // numbers outside [-128,127] may give unexpected results
        int32->float32 // when the conversion goes through fp16, correct
                        results are only guaranteed for numbers in [-1023,1023]
        int32->float16 // correct results are only guaranteed for
                        numbers in [-1023,1023]
    scalar conversion support (only shape [1,] is supported):
        int64->int32
        int64->float32

    Parameters
    ----------
    input_x : dict
        shape and dtype of the input; supported dtypes are listed above
    output_y: dict
        shape and dtype of the output; same shape as the input,
        with dtype equal to the destination type of the cast
    dst_type : int
        destination data type, given as the type code consumed by
        _cast_dsttype_conversion
    kernel_name : str
        cce kernel name, default value is cast

    Returns
    -------
    None
    """
    shape = util.scalar2tensor_one(input_x.get("shape"))
    src_type = input_x.get("dtype").lower()
    check_shape(shape, param_name="input_x")

    if src_type == "bool":
        src_type = "int8"

    dst_type = _cast_dsttype_conversion(dst_type)
    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)
    data = tvm.placeholder(fuseshape, name="data", dtype=src_type)
    if src_type == "int64":
        check_dtype(dst_type, ("float32", "int32"), param_name="dst_type")
        res = tvm.extern(
            [fuseshape], [data],
            lambda ins, outs: _kernel_ir(outs, ins, dst_type, "int64"),
            name="res",
            dtype=dst_type)
        tensor_list = [data, res]
        schedule = tvm.create_schedule(res.op)
        with build_config:
            tvm.build(schedule, tensor_list, "cce", name=kernel_name)
    else:
        with tvm.target.cce():
            res = cast_compute(data, output_y, dst_type, kernel_name)
            sch = generic.auto_schedule(res)
        config = {
            "print_ir": False,
            "name": kernel_name,
            "tensor_list": [data, res]
        }
        te.lang.cce.cce_build_code(sch, config)
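Before compiling, cast flattens the multi-dimensional input into a one-dimensional shape via a product reduction (the fuseshape step above). A self-contained illustration of that step with functools.reduce, which the reduceIns alias presumably refers to:

from functools import reduce

shape = (2, 3, 16)                               # example input shape
fuseshape = [reduce(lambda a, b: a * b, shape)]  # -> [96]; the kernel operates on the flattened element count
print(fuseshape)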
Example no. 30
def fake_quant_with_min_max_vars(x, min, max, y, num_bits,
                                 narrow_range,
                                 kernel_name="fake_quant_with_min_max_vars"):
    """
    algorithm: calculate the fake quant value of input tensor
    calculating data's fake quant

    Parameters
    ----------
    x: dict
        shape and dtype of input data
    min: dict
        shape and dtype of min
    max: dict
        shape and dtype of max
    y: dict
        shape and dtype of the fake quant output
    num_bits: int
        number of quantization bits, which defines the quant max
    narrow_range: bool
        whether to use a narrowed quantization range, which defines the quant min
    kernel_name : string
        cce kernel name, default value is
        "fake_quant_with_min_max_vars"

    Returns
    -------
    None
    """
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min.get("shape")
    min_dtype = min.get("dtype")
    max_shape = max.get("shape")
    max_dtype = max.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    check_shape(input_shape, param_name="x")
    check_shape(min_shape, min_rank=1, max_rank=1, param_name="min")
    check_shape(max_shape, min_rank=1, max_rank=1, param_name="max")

    if num_bits > 16 or num_bits < 2:
        raise RuntimeError(
            "The value of num_bits must be between"
            "2 and 16")

    check_tuple = ("float32",)
    x_type = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    check_dtype(x_type, check_tuple, param_name="x")
    check_dtype(min_dtype, check_tuple, param_name="min")
    check_dtype(max_dtype, check_tuple, param_name="max")
    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]),)
    shape_min, shape_max, shape_broadcast = broadcast_shapes(min_shape, input_shape,
                                                             param_name_input1="min",
                                                             param_name_input2="x")
    data = tvm.placeholder(input_shape, dtype=x_type, name="data_input")
    data_min = tvm.placeholder(shape_min, dtype=min_dtype, name="data_min")
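    # max has the same one-element shape as min, so its placeholder reuses shape_min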
    data_max = tvm.placeholder(shape_min, dtype=max_dtype, name="data_max")

    res = fake_quant_with_min_max_vars_compute(data, data_min, data_max,
                                               y, num_bits, narrow_range,
                                               kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": (data, data_min, data_max, res)}

    te.lang.cce.cce_build_code(schedule, config)
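For reference, a pure-NumPy sketch of the fake-quant math, following the TensorFlow FakeQuantWithMinMaxVars definition (nudged min/max derived from num_bits and narrow_range); it is a reference model under that assumption, not a transcription of this kernel's compute function:

import numpy as np

def fake_quant_reference(x, min_v, max_v, num_bits=8, narrow_range=False):
    """Clamp x to the nudged [min, max] range and snap it onto the quantization grid."""
    quant_min = 1.0 if narrow_range else 0.0
    quant_max = float(2 ** num_bits - 1)
    scale = (max_v - min_v) / (quant_max - quant_min)
    # Nudge the zero point onto the integer grid, then recompute the representable range.
    zero_point_from_min = quant_min - min_v / scale
    nudged_zero_point = float(np.clip(np.round(zero_point_from_min), quant_min, quant_max))
    nudged_min = (quant_min - nudged_zero_point) * scale
    nudged_max = (quant_max - nudged_zero_point) * scale
    clamped = np.clip(x, nudged_min, nudged_max)
    return np.round((clamped - nudged_min) / scale) * scale + nudged_min

print(fake_quant_reference(np.array([-0.3, 0.0, 0.4, 1.2], dtype=np.float32), 0.0, 1.0))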