Example #1
def cosh(input_x, output_cosh, kernel_name="cosh"):
    """
    algorithm: cosh
    calculating data's cosh, y = (e^x + e^(-x))/2

    Parameters
    ----------
    input_x: dict
        shape and dtype of input, only support float16, float32
    output_cosh: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name: str
        kernel name, default value is "cosh"

    Returns
    -------
    None
    """
    shape = input_x.get("shape")
    dtype = input_x.get("dtype")
    check_shape(shape, param_name="input_x")
    check_list = ("float16", "float32")
    input_dtype = dtype.lower()
    check_dtype(input_dtype, check_list, param_name="input_x")
    reshape_input = (functools_reduce(lambda x, y: x * y, shape[:]), )
    data_input = tvm.placeholder(reshape_input,
                                 name="data_input",
                                 dtype=input_dtype)
    res = cosh_compute(data_input, output_cosh, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_input, res]}
    te.lang.cce.cce_build_code(sch, config)
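A minimal NumPy sketch of the formula in the docstring above (a hypothetical helper, not part of the TBE kernel), handy for checking the compiled operator's output:

import numpy as np

def cosh_reference(x):
    # y = (e^x + e^(-x)) / 2, the same formula documented above
    return (np.exp(x) + np.exp(-x)) / 2.0

x = np.random.uniform(-1.0, 1.0, (16, 16)).astype(np.float16)
expected = cosh_reference(x.astype(np.float32)).astype(np.float16)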
Example #2
def custom_Concat(shapes, dtype, axis, kernel_name="concat", need_build=False, need_print=False):
    """
    concatenate the input tensors along the given axis

    Parameters
    ----------
    shapes : list of input shapes

    dtype : the data type, assume src_dtype equals dst_dtype, support uint8, int8, int32, float16, float32

    axis : concat axis

    kernel_name : cce kernel name, default value is "concat"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """

    
    util.check_kernel_name(kernel_name)

    for i in range(len(shapes)):
        util.check_shape_rule(shapes[i])

    sum_dim = 0
    for shape in shapes:
        sum_dim += functools_reduce(lambda x, y: x*y, shape)

    if sum_dim > 2**31-1:
        raise RuntimeError("shape exceed 32bit limitation")

    check_list = ["uint8", "int8", "float16", "float32", "int32"]
    if not (dtype.lower() in check_list):
        raise RuntimeError(
            "concat_cce only support %s while dtype is %s" % (",".join(check_list), dtype))

    inp_dtype = dtype.lower()
    data = []
    for i in range(len(shapes)):
        shape = shapes[i]
        data.append(tvm.placeholder(shape, name="data_%d" % i, dtype=inp_dtype))

    with tvm.target.cce():
        res = te.lang.cce.concat(data, axis)
        sch = generic.auto_schedule(res)

    data.append(res)

    config = {"print_ir": need_print,
              "need_build": need_build,
              "name": kernel_name,
              "tensor_list": data}

    te.lang.cce.cce_build_code(sch, config)
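The signature above takes a tuple of shapes, a dtype string and the concat axis; below is a small NumPy sketch of the expected result, with a hypothetical invocation of the operator shown as a comment:

import numpy as np

data_0 = np.random.rand(2, 3).astype(np.float16)
data_1 = np.random.rand(2, 5).astype(np.float16)
expected = np.concatenate([data_0, data_1], axis=1)   # shape (2, 8)

# hypothetical build call matching the shapes above:
# custom_Concat(((2, 3), (2, 5)), "float16", 1, kernel_name="concat_demo")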
Example #3
def correction_mul(x, batch_std, running_std, y, channel, kernel_name="correction_mul"):
    """CorrectionMul op"""
    shape = x.get("shape")
    data_format = x.get("format")
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
    check_list = ["float16", "float32"]
    inp_dtype = x.get("dtype").lower()
    if not inp_dtype in check_list:
        raise RuntimeError("Dtype of input only support float16, float32")

    # shape = util.shape_refine(shape)
    x_t = tvm.placeholder(shape, name="x", dtype=inp_dtype)
    shape_c = [1] * len(shape)
    shape_c[channel] = batch_std.get("ori_shape")[0]
    if data_format == "NC1HWC0" and channel == 1:
        shape_c = batch_std.get("shape")
    batch_std_t = tvm.placeholder(shape_c, name="batch_std", dtype=inp_dtype)
    running_std_t = tvm.placeholder(shape_c, name="running_std", dtype=inp_dtype)
    res = correction_mul_compute(x_t, batch_std_t, running_std_t, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [x_t, batch_std_t, running_std_t, res]}

    te.lang.cce.cce_build_code(sch, config)
Example #4
def invert(input_x, output_y, kernel_name="invert"):
    """Flips all bits elementwise.

    Parameters
    ----------
    input_x: dict
        the dict of input tensor.
        Must be one of the following types: `int16`, `uint16`.
    output_y: dict
        the dict of output tensor.
    kernel_name: str
        cce kernel name, default value is "invert".

    Returns
    -------
    None.
    """
    shape_x = input_x.get("shape")
    dtype_x = input_x.get("dtype")
    dtype_x_lower = dtype_x.lower()
    check_list = ("int16", "uint16")

    check_shape(shape_x, param_name="input_x")
    check_dtype(dtype_x_lower, check_list, param_name="input_x")

    shape_x = (functools_reduce(lambda x, y: x * y, shape_x[:]), )
    data_x = tvm.placeholder(shape_x, name="data_x", dtype=dtype_x_lower)
    res = invert_compute(data_x, output_y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_x, res]}
    te.lang.cce.cce_build_code(sch, config)
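For reference, a NumPy sketch of the elementwise bit flip the kernel computes (hypothetical helper; int16/uint16 only, as the dtype check requires):

import numpy as np

def invert_reference(x):
    # flips all bits elementwise; for uint16 this maps 0 -> 65535, 1 -> 65534, ...
    return np.bitwise_not(x)

x = np.arange(8, dtype=np.uint16)
expected = invert_reference(x)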
Example #5
def strided_write(x, y, axis, stride, kernel_name='strided_write'):
    """
    write data to tensor by stride.

    Parameters:
    ----------
    x: dict of input.

    y: dict of output.

    axis: which axis to write data by stride.

    stride: data write stride.

    kernel_name: cce kernel name, default value is "strided_write".

    Returns:
    -------
    None
    """

    check_params(x, y, axis)
    dtype_x = x.get("dtype")
    n_i, c1_i, h_i, w_i, c0_i = x.get("shape")
    shape_x = n_i, c1_i, h_i*w_i, c0_i
    input_x = tvm.placeholder(shape_x, name="input_x", dtype=dtype_x)
    res = strided_write_compute(input_x, y, axis, stride, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)
Example #6
def reduce_std(x,
               y1,
               y2,
               dim=None,
               unbiased=True,
               keepdim=False,
               kernel_name="reduce_std"):

    # calculating data parameters
    check_list = ("float16", "float32")

    shape_x = x.get("shape")
    dtype_x = x.get("dtype").lower()
    util.check_dtype_rule(dtype_x, check_list)
    util.check_shape_rule(shape_x)

    data_x = tvm.placeholder(x.get("shape"),
                             dtype=x.get("dtype"),
                             name="data_x")

    res = reduce_std_compute(data_x, dim, unbiased, keepdim, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_x] + list(res)}
    te.lang.cce.cce_build_code(schedule, config)
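A hedged NumPy sketch of the two outputs, assuming y1 is the standard deviation and y2 the mean (the tensor_list above carries two results); unbiased toggles Bessel's correction:

import numpy as np

def reduce_std_reference(x, dim=None, unbiased=True, keepdim=False):
    axis = tuple(dim) if isinstance(dim, (list, tuple)) else dim
    ddof = 1 if unbiased else 0          # Bessel's correction when unbiased
    std = np.std(x, axis=axis, ddof=ddof, keepdims=keepdim)
    mean = np.mean(x, axis=axis, keepdims=keepdim)
    return std, mean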
Example #7
def minmax_update_perchannel(x,
                             min_val,
                             max_val,
                             min_up,
                             max_up,
                             ema,
                             ema_decay,
                             channel_axis,
                             kernel_name="minmax_update_perchannel"):
    """MinMaxUpdatePerChannel op"""
    x_shape = x.get("ori_shape")
    x_format = x.get("format")
    x_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")
    # for Dense weight quant, 2d[co,ci] -> 4d[1,co,ci,1], channel_axis_ needs to change to 1.
    if channel_axis == 0 and x_shape[0] != min_shape[0] and x_shape[
            1] == min_shape[0]:
        channel_axis_ = 1
    else:
        channel_axis_ = channel_axis
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(x_shape)
    util.check_shape_rule(min_shape, 1, 1, x_shape[channel_axis_])
    util.check_shape_rule(max_shape, 1, 1, x_shape[channel_axis_])
    util.check_tensor_shape_size(x_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = x_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    if channel_axis_ == 0:
        shape_c = min_val.get("ori_shape")
    else:
        shape_c = [min_val.get("shape")[1], min_val.get("shape")[-1]]
    input_data = tvm.placeholder(x.get("shape"), name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
    max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype)
    res_list = minmax_update_perchannel_compute(input_data, min_data, max_data,
                                                ema, ema_decay, channel_axis_)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
Example #8
def logical_not(x, y, kernel_name="logical_not"):
    """
    calculating data

    Parameters
    ----------
    x : dict
        shape and dtype of input, only support int8
    y : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "logical_not"

    Returns
    -------
    None
    """

    shape_x = x.get("shape")
    dtype_x = x.get("dtype").lower()

    check_shape(shape_x, param_name="x")
    check_dtype(dtype_x, ("int8",), param_name="x")

    reshape_x = (functools_reduce(lambda x, y: x * y, shape_x[:]), )
    data = tvm.placeholder(reshape_x, name="data", dtype=dtype_x)
    res = logical_not_compute(data, y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data, res]}

    te.lang.cce.cce_build_code(sch, config)
Example #9
def fake_learned_scale_quant_perchannel_grad_d_reduce(
        dout_alpha,
        dalpha,
        channel_axis,
        kernel_name="fake_learned_scale_quant_perchannel_grad_d_reduce"):
    """FakeLearnedScaleQuantPerChannelGradDReduce"""

    dout_alpha_shape = dout_alpha.get("shape")
    dout_alpha_dtype = dout_alpha.get("dtype")

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(dout_alpha_shape)
    util.check_tensor_shape_size(dout_alpha_shape)

    check_list = ["float32", 'float16']
    dout_alpha_dtype = dout_alpha_dtype.lower()
    util.check_dtype_rule(dout_alpha_dtype, check_list)

    dout_alpha_data = tvm.placeholder(dout_alpha_shape,
                                      name="dout_alpha",
                                      dtype=dout_alpha_dtype)
    res = fake_learned_scale_quant_perchannel_grad_d_reduce_compute(
        dout_alpha_data, dout_alpha, channel_axis, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [dout_alpha_data, res]
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
Example #10
def zeros_like(x, y, kernel_name="zeros_like"):
    """
    output a tensor of all zero, you can specify the output type

    Parameters
    ----------
    x: dict
        shape and dtype of input, only support float16, float32,
        int32,int8,uint8
    y: dict
        shape and dtype of output data
    kernel_name: str
        cce kernel name, default value is "zeros_like"

    Returns
    ------
    None
    """
    shape_x = x.get("shape")
    dtype_x = x.get("dtype")
    check_shape(shape_x, param_name="x")

    check_list_src = ("float16", "float32", "int32", "int8", "uint8")
    src_dtype = dtype_x.lower()
    check_dtype(src_dtype, check_list_src, param_name="x")
    shape_x = (functools_reduce(lambda x, y: x * y, shape_x[:]),)
    x_input = tvm.placeholder(shape_x, name="x_input", dtype=src_dtype)
    res = zeros_like_compute(x_input, y, kernel_name=kernel_name)

    with tvm.target.cce():
        auto_sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [x_input, res]}
    te.lang.cce.cce_build_code(auto_sch, config)
Example #11
def squared_difference(x1, x2, y, kernel_name="squared_difference"):
    """
    algorithm: squared_difference

    calculating data's tf_squared_difference, y = (x1 - x2) * (x1 - x2)

    Parameters
    ----------
    x1 : dict
        shape and dtype of first input, only support float16, float32, int32
    x2 : dict
        shape and dtype of second input, only support float16, float32, int32
    y : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is squared_difference

    Returns
    -------
    None
    """
    shape_x = x1.get("shape")
    shape_y = x2.get("shape")
    check_shape(shape_x, param_name="x1")
    check_shape(shape_y, param_name="x2")

    check_list = ["float16", "float32", "int32"]
    dtype = x1.get("dtype").lower()

    if not dtype in check_list:
        raise RuntimeError(
            "tf_squared_difference_cce only support float16, float32, int32")

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x,
                                                   shape_y,
                                                   param_name_input1="x1",
                                                   param_name_input2="x2")

    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)
    data_x = tvm.placeholder(shape_x, dtype=dtype, name="data_x")
    data_y = tvm.placeholder(shape_y, dtype=dtype, name="data_y")

    with tvm.target.cce():
        shape_x, shape_y, shape_max = broadcast_shapes(shape_x,
                                                       shape_y,
                                                       param_name_input1="x1",
                                                       param_name_input2="x2")
        data_x_tmp = te.lang.cce.broadcast(data_x, shape_max)
        data_y_tmp = te.lang.cce.broadcast(data_y, shape_max)
        data_sub = te.lang.cce.vsub(data_x_tmp, data_y_tmp)
        res = te.lang.cce.vmul(data_sub, data_sub)
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data_x, data_y, res]
    }

    te.lang.cce.cce_build_code(sch, config)
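A NumPy sketch of the computation above, with broadcasting standing in for te.lang.cce.broadcast (hypothetical helper):

import numpy as np

def squared_difference_reference(x1, x2):
    # y = (x1 - x2) * (x1 - x2), broadcasting the inputs to a common shape
    diff = np.subtract(x1, x2)
    return diff * diff

x1 = np.random.rand(4, 1, 8).astype(np.float32)
x2 = np.random.rand(1, 6, 8).astype(np.float32)
expected = squared_difference_reference(x1, x2)   # shape (4, 6, 8)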
Example #12
def sin(x, y, kernel_name="sin"):
    """
    algorithm: sin
    calculating data's sin x = x - x^3/3! + x^5/5! + ... + (-1)^k*x^(2k+1)/(2k+1)!

    Parameters
    ----------
    x : dict
        shape and dtype of input, only support float16, float32
    y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is "sin"

    Returns
    -------
    None
    """
    shape_input = x.get("shape")
    dtype_input = x.get("dtype").lower()

    check_shape(shape_input, param_name="x")
    check_list = (FLOAT_16, FLOAT_32)
    check_dtype(dtype_input, check_list, param_name="x")
    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape_input)
    data_input = tvm.placeholder(fuseshape,
                                 name="data_input",
                                 dtype=dtype_input)
    res = sin_compute(data_input, y, kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": (data_input, res)}
    te.lang.cce.cce_build_code(sch, config)
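As a sanity check of the Maclaurin series in the docstring, a small NumPy sketch that sums the first few terms (hypothetical helper; the kernel's own evaluation lives in sin_compute):

from math import factorial

import numpy as np

def sin_series_reference(x, terms=8):
    # sin(x) = x - x^3/3! + x^5/5! - ... + (-1)^k * x^(2k+1) / (2k+1)!
    x = np.asarray(x, dtype=np.float64)
    result = np.zeros_like(x)
    for k in range(terms):
        result += (-1.0) ** k * x ** (2 * k + 1) / factorial(2 * k + 1)
    return result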
Example #13
def JDEcoder(shape,
             dtype,
             num_classes,
             num_boxes,
             conf_thresh,
             iou_thresh,
             biases,
             masks,
             strides,
             kernel_name="JDEcoder",
             need_build=True,
             need_print=False):
    """
    
    Parameters
    ----------

    kernel_name : kernel name, default value is "JDEcoder"

    need_buid : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
        
    """
    """
    TODO:
    Please refer to the TE DSL Manual, And code here with TE DSL.
    """

    check_list = ["float16", "float32"]
    if not (dtype.lower() in check_list):
        raise RuntimeError("JDEcoder only support %s while dtype is %s" %
                           (",".join(check_list), dtype))

    if num_classes < 1:
        raise RuntimeError("num_classes must be > 1")

    inp_dtype = dtype.lower()
    inp_tensor = tvm.placeholder(shape, name='inp_tensor', dtype=inp_dtype)

    with tvm.target.cce():
        res = inp_tensor
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": need_print,
        "need_buid": need_build,
        "name": kernel_name,
        "tensor_list": [inp_tensor, res]
    }

    te.lang.cce.cce_build_code(sch, config)


#if __name__ == "__main__":
#    JDEcoder((1, 536, 10, 18), "float16", 1, 4, 0.5, 0.45, (6, 16, 8, 23, 11, 32, 16, 45, 21, 64, 30, 90, 43, 128, 60, 180, 85, 255, 120, 360, 170, 420, 340, 320), (8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3), (32, 16, 8))
Example #14
def reduce_max_d(x, y, axes=None, keepdims=None, kernel_name="reduce_max_d"):
    """
    reduce a tensor on a certain axes based on max.

    Parameters
    ----------
    x : dict
        shape and dtype of input
    y : dict
        shape and dtype of output, should be same shape and type as input
    axes: list
        the axes to reduce, may be negative to index from the end
        (e.g., -1 for the last axis).
        axes may be an int or a list (e.g. [1, 2])
    keepdims: bool
        if true, retains reduced dimensions with length 1,
        default value is None
    kernel_name : str
        kernel name, default value is "reduce_max_d"

    Returns
    -------
    None
    """

    dtype = x["dtype"]
    dtype_lower = dtype.lower()
    check_list = ("float16", "float32", "int8", "uint8", "int32")
    check_dtype(dtype_lower, check_list)

    with te.op.compute():
        shape = x["shape"]
        shape_range = x["range"]

        shape_len = len(shape)
        if not axes:
            axes = range(shape_len)
        if hasattr(axes, 'index'):
            axes = list(axes)
        axes = cce_util.axis_check(shape_len, axes)

        shape_new, shape_range_new, axes_new, fused_rel_dic = \
            fused_reduce_axis(shape, shape_range, axes)
        add_compile_info("fused_rel_dic", fused_rel_dic)

        x["shape"] = shape_new
        x["range"] = shape_range_new
        shape_var_new = variable_shape([x])[0]

        data_input = tvm.placeholder(shape_var_new, name="data_input",
                                     dtype=dtype_lower)
        res = reduce_max_d_compute(data_input, y, axes_new, keepdims)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    # build
    config = {"name": kernel_name,
              "tensor_list": [data_input, res]}
    te.lang.dynamic.build(sch, config)
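A static NumPy counterpart of the dynamic-shape kernel above (hypothetical helper), treating an empty axes argument as a full reduction, as the code does:

import numpy as np

def reduce_max_reference(x, axes=None, keepdims=False):
    axis = tuple(axes) if isinstance(axes, (list, tuple, range)) else axes
    return np.max(x, axis=axis, keepdims=bool(keepdims))

x = np.random.rand(2, 3, 4).astype(np.float32)
expected = reduce_max_reference(x, axes=[1, 2], keepdims=True)   # shape (2, 1, 1)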
Example #15
def diag_part_d(x, assist, y, kernel_name="diag_part_d"):
    """
    Returns the batched diagonal part of a batched tensor

    Parameters
    ----------
    x: dict
        dict of x, include keys(shape and dtype)
    assist: dict
        dict of the assist matrix, whose diagonal values are 1 and all other values are 0
    y: dict
        dict of output
    kernel_name: str
        cce kernel name, default value is "diag_part_d"

    Returns
    -------
    None
    """
    shape_x = x.get("shape")
    dtype_x = x.get("dtype")
    shape_assist = assist.get("shape")
    dtype_assist = assist.get("dtype")
    shape_y = y.get("shape")

    check_shape(shape_x, param_name="x")
    check_shape(shape_assist, param_name="assist")

    if len(shape_x) not in (2, 4, 6, 8):
        raise RuntimeError("Input tensors of rank 2,4,6,8 are supported!")
    if list(shape_x) != list(shape_assist):
        raise RuntimeError("the shape of data must be equal!")
    len_shape_out = len(shape_x) // VALUE_TWO
    for i in range(len_shape_out):
        if shape_x[i] != shape_x[i + len_shape_out]:
            raise RuntimeError("the shape of input is not supported!")
    if list(shape_x) != list(shape_y + shape_y):
        raise RuntimeError("the shape of output is not supported!")

    check_list = ("float16", "float32", "int32")
    dtype_x = dtype_x.lower()
    check_dtype(dtype_x, check_list, param_name="x")
    dtype_assist = dtype_assist.lower()
    check_dtype(dtype_assist, check_list, param_name="assist")
    if dtype_assist != dtype_x:
        raise RuntimeError("the dtype of data must be equal!")

    data_x = tvm.placeholder(shape_x, name="data_x", dtype=dtype_x)
    data_assist = tvm.placeholder(shape_assist,
                                  name="data_assist",
                                  dtype=dtype_assist)

    res = diag_part_d_compute(data_x, data_assist, y, kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_x, data_assist, res]}
    te.lang.cce.cce_build_code(sch, config)
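For the rank-2 case the batched diagonal part reduces to np.diagonal; a minimal sketch (the kernel generalizes this to ranks 4/6/8 by multiplying with the assist matrix and reducing the trailing half of the axes):

import numpy as np

def diag_part_reference_2d(x):
    # y[i] = x[i, i] for a square rank-2 input
    return np.diagonal(x).copy()

x = np.arange(16, dtype=np.float32).reshape(4, 4)
expected = diag_part_reference_2d(x)   # [0, 5, 10, 15]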
Example #16
def bn_infer_grad(grads, scale, batch_variance,
                  x_backprop, epsilon=0.0001,
                  kernel_name="bn_infer_grad"):
    """
    algorithm: fused_batch_norm_grad_v2
    bn_infer_grad.

    Parameters
    ----------
    grads: dict
        dict of grads, A 5D Tensor for input grads.
    scale: dict
        dict of scale, A 5D Tensor for input scale.
    batch_variance: dict
        dict of batch_variance, A 5D Tensor for input batch_variance.
    x_backprop: dict
        dict of x_backprop, A 5D Tensor for output x_backprop.
    epsilon: float
        A small float number added to the variance of x. Defaults to `0.0001`.
    kernel_name: str
        kernel name, default value is "bn_infer_grad"

    Returns
    -------
    None
    """

    shape_grads = grads.get("shape")
    shape_scale = scale.get("shape")
    shape_batch_variance = batch_variance.get("shape")

    input_grads_dtype = grads.get("dtype").lower()
    input_scale_dtype = scale.get("dtype").lower()
    batch_variance_dtype = batch_variance.get("dtype").lower()

    check_dtype(input_grads_dtype, ("float32", "float16"), param_name="grads")
    check_dtype(input_scale_dtype, ("float32",), param_name="scale")
    check_dtype(batch_variance_dtype, ("float32",), param_name="batch_variance")

    _check_shape(shape_grads, shape_batch_variance)
    util.compare_tensor_dict_key(scale, batch_variance, "shape")

    grads_input = tvm.placeholder(shape_grads, name="grads_input",
                                  dtype=input_grads_dtype)
    scale_input = tvm.placeholder(shape_scale, name="scale_input",
                                  dtype=input_scale_dtype)
    batch_variance_input = tvm.placeholder(shape_batch_variance,
                                           name="batch_variance_input",
                                           dtype=batch_variance_dtype)

    res = bn_infer_grad_compute(grads_input, scale_input,
                                batch_variance_input,
                                x_backprop, epsilon,
                                kernel_name=kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)
    tensor_list = [grads_input, scale_input, batch_variance_input, res]
    config = {"name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
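A hedged NumPy sketch of what bn_infer_grad_compute plausibly produces, assuming standard inference-mode batch-norm backprop (dx = dy * scale / sqrt(var + eps)); the authoritative formula is in the compute function, so treat this only as an illustration:

import numpy as np

def bn_infer_grad_reference(grads, scale, batch_variance, epsilon=0.0001):
    # assumed semantics: x_backprop = grads * scale / sqrt(batch_variance + epsilon),
    # with scale/variance broadcast over the 5D (N, C1, H, W, C0) input
    return grads * scale / np.sqrt(batch_variance + epsilon)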
Example #17
    def _conv3dbp_input_achieve_with_tvm():
        dedy = tvm.placeholder(shape_dedy,
                               name="dedy",
                               dtype=out_backprop_dtype)
        shape_filter_ncdhw = [
            filter_batch, filter_channel, filter_depth, filter_h, filter_w
        ]

        filters = tvm.placeholder(shape_filter_frac,
                                  name="filter",
                                  dtype=filter_dtype)

        dedx = te.lang.cce.conv3d_backprop_input_compute(
            filters=filters,
            out_backprop=dedy,
            filter_sizes=shape_filter_ncdhw,
            input_sizes=input_sizes,
            strides=strides,
            padding=pads,
            dilations=dilations,
            res_dtype=res_dtype,
            kernel_name=kernel_name)
        tensor_list = [filters, dedy, dedx]

        with tvm.target.cce():
            sch = generic.auto_schedule(dedx)

        config = {"name": kernel_name, "tensor_list": tensor_list}
        te.lang.cce.cce_build_code(sch, config)
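Example #18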
def fake_quant_per_layer(x,
                         min_val,
                         max_val,
                         y,
                         symmetric,
                         narrow_range,
                         num_bits,
                         kernel_name="fake_quant_per_layer"):
    """FakeQuantPerLayer"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]), )
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    quant_min = 0
    quant_max = 2**num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res = fake_quant_per_layer_compute(input_data, min_data, max_data, y,
                                       quant_min, quant_max, symmetric,
                                       kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [input_data, min_data, max_data, res]
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
Example #19
def square_sum_v2(input_x,
                  output1,
                  output2,
                  attr1,
                  attr2=True,
                  kernel_name="square_sum_v2"):
    """
    calculating data

    Parameters
    ----------
    Input and output of fusion graph

    Returns
    -------
    None
    """
    shape = input_x.get("shape")
    dtype = input_x.get("dtype")

    input_dtype = dtype.lower()

    check_shape(shape, param_name="input_x")

    data_input = tvm.placeholder(shape, name="data_input", dtype=input_dtype)

    res = suqare_sum_v2_compute(data_input, output1, output2, attr1, attr2,
                                kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_input] + list(res)}

    te.lang.cce.cce_build_code(sch, config)
Example #20
def cos(input_x, output_y, kernel_name="cos"):
    """
    algorithm: cos
    calculating data's cos x = 1 - x^2/2! + x^4/4! + ... + (-1)^k*x^2k/(2k)!

    Parameters
    ----------
    input_x : dict
              shape and dtype of input, only support float16, float32
    output_y: dict
              shape and dtype of output, should be same shape and type as input
    kernel_name : str
              kernel name, default value is "cos"

    Returns
    -------
    None
    """
    shape_input = input_x.get("shape")
    dtype_input = input_x.get("dtype").lower()

    check_shape(shape_input, param_name="input_x")
    check_list = ("float16", "float32")
    check_dtype(dtype_input, check_list, param_name="input_x")

    reshape_input = (functools_reduce(lambda x, y: x * y, shape_input[:]), )
    data_input = tvm.placeholder(reshape_input,
                                 name="data_input",
                                 dtype=dtype_input)
    res = cos_compute(data_input, output_y, kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_input, res]}
    te.lang.cce.cce_build_code(sch, config)
Example #21
def CusSquare(input_x, output_y, kernel_name="square"):
    """
    algorithm: square
    calculating data's square,y= x*x

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float32
    output_y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "square"

    Returns
    -------
    None
    """
    shape = input_x.get("shape")
    dtype = input_x.get("dtype").lower()

    shape = util.shape_refine(shape)
    data = tvm.placeholder(shape, name="data", dtype=dtype.lower())

    with tvm.target.cce():
        res = square_compute(data, output_y, kernel_name)
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data, res]
    }

    te.lang.cce.cce_build_code(sch, config)
Example #22
def strided_read(x, y, axis, stride, kernel_name='strided_read'):
    """
    read data from tensor by stride.

    Parameters:
    ----------
    x: dict of input.

    y: dict of output.

    axis: which axis to read data by stride.

    stride: data read stride.

    kernel_name: cce kernel name, default value is "strided_read".

    Returns:
    -------
    None
    """

    check_params(x, y, axis)
    shape_x = x.get("shape")
    dtype_x = x.get("dtype")

    input_x = tvm.placeholder(shape_x, name="input_x", dtype=dtype_x)
    res = strided_read_compute(input_x, y, axis, stride, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)
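Example #23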
def fused_mul_apply_momentum(var,
                             accum,
                             lr,
                             x1,
                             momentum,
                             x2,
                             out_var,
                             out_accum,
                             use_nesterov=False,
                             kernel_name="fused_mul_apply_momentum"):
    """
    Update '*var' according to the ApplyMomentum algorithm.

    accum = accum * momentum + x1 * x2
    if use_nesterov is True:
        var -= grad * lr + accum * momentum * lr
    else:
        var -= accum * lr

    Parameters:
    ----------
    var : the dict of mutable tensor var, only support float16, float32.

    accum: the dict of mutable tensor accum. Must have the same dtype as `var`.

    lr : the dict of scalar lr. Must have the same dtype as `var`.

    x1 : the dict of tensor grad. Must have the same dtype as `var`.

    momentum : the dict of scalar momentum. Must have the same dtype as `var`.

    x2 : the dict of scalar grad. Must have the same dtype as `var`.

    out_var : the dict of var output.

    out_accum : the dict of accum output

    use_nesterov: bool. If true, use nesterov computing grad,
                 default value is False.

    kernel_name : cce kernel name, default value is "fused_mul_apply_momentum".

    Returns
    -------
    None
    """

    input_name_list = ['var', 'accum', 'lr', 'x1', 'momentum', 'x2']
    var, accum, lr, x1, momentum, x2 = _get_placeholder(
        [var, accum, lr, x1, momentum, x2], input_name_list)
    out_var, out_accum = _fused_mul_apply_momentum_compute(
        var, accum, lr, x1, momentum, x2, out_var, out_accum, use_nesterov)
    outs = [out_var, out_accum]
    build_list = [var, accum, lr, x1, momentum, x2, out_var, out_accum]

    with tvm.target.cce():
        sch = generic.auto_schedule(outs)
    config = {"name": kernel_name, "tensor_list": build_list}
    te.lang.cce.cce_build_code(sch, config)
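A NumPy transcription of the update rule from the docstring (hypothetical helper; lr and momentum are scalars, and the gradient is x1 * x2):

import numpy as np

def fused_mul_apply_momentum_reference(var, accum, lr, x1, momentum, x2,
                                       use_nesterov=False):
    grad = x1 * x2
    accum = accum * momentum + grad
    if use_nesterov:
        var = var - (grad * lr + accum * momentum * lr)
    else:
        var = var - accum * lr
    return var, accum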
Example #24
def gn_training_reduce(x,
                       sum,
                       square_sum,
                       num_groups=2,
                       kernel_name="gn_training_reduce"):
    """
    calculating data

    Parameters
    ----------
    x: dict
        dict of input, A 5HD Tensor for input data.
    sum: dict
        dict of sum, A `Tensor`. Sum of x.
    square_sum: dict
        dict of square_sum, A `Tensor`. Square sum of x.
    num_groups: int
        An integer value indicating the number of groups in the channel dimension.
    kernel_name : str
        kernel name, default value is "gn_training_reduce"

    Returns
    -------
    None
    """
    shape_x = x.get("shape")
    dtype_x = x.get("dtype")
    data_format = x.get("format")
    input_dtype = dtype_x.lower()

    _shape_check(shape_x, data_format, num_groups)
    check_dtype(input_dtype, ("float16", "float32"), param_name="x")

    # Reshape NCHW -> N[GD]HW
    if data_format == "NCHW":
        shape_x = [
            shape_x[0], num_groups, shape_x[1] // num_groups, shape_x[2],
            shape_x[3]
        ]

    # Reshape NHWC -> NHW[GD]
    elif data_format == "NHWC":
        shape_x = [
            shape_x[0], shape_x[1], shape_x[2], num_groups,
            shape_x[3] // num_groups
        ]

    x_input = tvm.placeholder(shape_x, name="x_input", dtype=input_dtype)

    res = gn_training_reduce_compute(x_input, data_format, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    tensor_list = [x_input] + list(res)

    config = {"name": kernel_name, "tensor_list": tensor_list}

    te.lang.cce.cce_build_code(schedule, config)
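A hedged NumPy sketch of the two reductions for a plain NCHW input, following the N[GD]HW reshape above: group the channels, then accumulate the sum and square sum over the D, H, W axes of each group (assumed reduction axes; the exact layout handling is in gn_training_reduce_compute):

import numpy as np

def gn_training_reduce_reference(x, num_groups=2):
    n, c, h, w = x.shape
    xg = x.reshape(n, num_groups, c // num_groups, h, w)
    sum_x = xg.sum(axis=(2, 3, 4))              # per (N, G) sum
    square_sum_x = (xg * xg).sum(axis=(2, 3, 4))
    return sum_x, square_sum_x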
Example #25
def gelu_grad(input_dy, input_x, input_y, output_z, kernel_name="gelu_grad"):
    """
    algorithm: gelu_grad
    calculating: dy*res'
    res' = res/x +
           x*0.5*(1 - tanh(math_four)*tanh(math_four))*
           np.sqrt(2 / np.pi)*(1 + 3*0.044715*x^2)
    math_four = (np.sqrt(2 / np.pi)*(x + 0.044715*tf.pow(x, 3)))

    Parameters
    ----------
    input_dy : dict
        shape and dtype of dy input, only support float16, float32
    input_x : dict
        shape and dtype of x input, only support float16, float32
    input_y : dict
        shape and dtype of y input, only support float16, float32
    output_z: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is gelu_grad

    Returns:
    -------
    none.
    """
    shape_dy = input_dy.get("shape")
    shape_x = input_x.get("shape")
    shape_y = input_y.get("shape")

    check_shape(shape_dy, param_name="input_dy")
    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")
    input_dtype = input_dy.get("dtype").lower()
    check_list = ("float16", "float32")
    check_dtype(input_dtype, check_list, param_name="input_dy")
    shape_dy = list(shape_dy)
    shape_x = list(shape_x)
    shape_y = list(shape_y)
    if not (operator.eq(shape_dy, shape_x) and operator.eq(shape_dy, shape_y)):
        raise RuntimeError("all input shape must be equal")

    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape_dy)
    data_dy = tvm.placeholder(fuseshape, name="data_dy", dtype=input_dtype)
    data_x = tvm.placeholder(fuseshape, name="data_x", dtype=input_dtype)
    data_gelu = tvm.placeholder(fuseshape, name="data_gelu", dtype=input_dtype)
    res = gelu_grad_compute(data_dy, data_x, data_gelu, output_z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data_dy, data_x, data_gelu, res]
    }

    te.lang.cce.cce_build_code(sch, config)
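A NumPy transcription of the derivative documented in the docstring (hypothetical helper; x is assumed nonzero, since the res/x term is written in that naive form, and y is the forward gelu output fed in as the third placeholder):

import numpy as np

def gelu_grad_reference(dy, x, y):
    # math_four = sqrt(2/pi) * (x + 0.044715 * x^3)
    math_four = np.sqrt(2.0 / np.pi) * (x + 0.044715 * np.power(x, 3))
    tanh_val = np.tanh(math_four)
    res_grad = (y / x
                + x * 0.5 * (1.0 - tanh_val * tanh_val)
                * np.sqrt(2.0 / np.pi) * (1.0 + 3.0 * 0.044715 * x * x))
    return dy * res_grad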
Example #26
def atan_grad(y, dy, z, kernel_name="atan_grad"):
    """
    Gradient calculation for atan(x)

    Parameters:
    ----------
    y : dict of y, include shape and dtype, dtype support float16, float32
    dy : dict of dy, include shape and dtype, dtype support float16, float32
    z : dict of output, include shape and dtype
    kernel_name : cce kernel name, default value is atan_grad

    Algorithm :
    ----------
    forward :
        y = atan(x)
    backward gradient :
        de/dx = dy/dx*de/dy = 1/(1+x^2)*grad

    Returns
    ----------
    None
    """

    # get the shape and dtype
    shape = y.get("shape")
    shape_grad = dy.get("shape")
    dtype = y.get("dtype")
    dtype_grad = dy.get("dtype")

    # check whether kernel name is unique

    # check whether the shape is right
    check_shape(shape, param_name="y")
    check_shape(shape_grad, param_name="dy")
    if not operator.eq(shape, shape_grad):
        raise RuntimeError("all input shape must be the same")
    shape, _ = refine_shape_axes(shape, [])

    # check whether dtypes are fp16,fp32 and whether they are the same
    check_list = ("float16", "float32")
    check_dtype(dtype, check_list, param_name="y")
    check_dtype(dtype_grad, check_list, param_name="dy")
    dtype = dtype.lower()
    if dtype != dtype_grad.lower():
        raise RuntimeError("all input dtype must be same")

    # get 2 input placeholders: data_input, grad
    data_input = tvm.placeholder(shape, name="input_data", dtype=dtype)
    grad = tvm.placeholder(shape, name="input_grad", dtype=dtype)

    # compute the backward gradient
    res = atan_grad_compute(data_input, grad, z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_input, grad, res]}
    te.lang.cce.cce_build_code(sch, config)
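A NumPy sketch of the backward formula from the docstring, de/dx = grad / (1 + x^2), assuming the first placeholder carries the forward input x (the parameter is named y above, which is easy to misread):

import numpy as np

def atan_grad_reference(x, dy):
    # de/dx = 1 / (1 + x^2) * grad
    return dy / (1.0 + x * x)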
Example #27
def floor_mod(x1, x2, y, kernel_name="floor_mod"):
    """
    calculate the remainder of division, support fp16,fp32,int32
    res = x1 - floor(x1 / x2) * x2

    Parameters
    ----------
    x1: dict
        dict{"shape":tuple or list,"dtype":str}
        shape of data
        the data type, src_dtype equals dst_dtype, support fp16,fp32,int32
    x2: dict
        dict{"shape":tuple or list,"dtype":str}
        shape of data
        the data type, src_dtype equals dst_dtype, support fp16,fp32,int32
    y: dict, reserved field
        dict with keys(shape and dtype) of output
    kernel_name: str
        cce kernel name, default value is "floor_mod"

    Returns
    ------
    None
    """
    # get dtype and shape attributes
    dtype_x = x1.get("dtype").lower()
    shape_x = x1.get("shape")
    dtype_y = x2.get("dtype").lower()
    shape_y = x2.get("shape")

    # check_kernel_name & shape
    check_shape(shape_x, param_name="x1")
    check_shape(shape_y, param_name="x2")

    # check input tensor data_type
    check_list = ("float16", "float32", "int32")
    check_dtype(dtype_x, check_list, param_name="x1")
    check_dtype(dtype_y, check_list, param_name="x2")

    if dtype_x != dtype_y:
        raise RuntimeError("the type of dtype in two dict is not the same")

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x,
                                                   shape_y,
                                                   param_name_input1="x1",
                                                   param_name_input2="x2")
    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)

    input_data_x = tvm.placeholder(shape_x, name="input_data_x", dtype=dtype_x)
    input_data_y = tvm.placeholder(shape_y, name="input_data_y", dtype=dtype_y)
    res = floor_mod_compute(input_data_x, input_data_y, y, kernel_name)
    with tvm.target.cce():
        auto_sch = generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": [input_data_x, input_data_y, res]
    }
    te.lang.cce.cce_build_code(auto_sch, config)
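A NumPy sketch of the remainder formula in the docstring; for the supported dtypes it matches np.mod, i.e. the sign of the result follows x2:

import numpy as np

def floor_mod_reference(x1, x2):
    # res = x1 - floor(x1 / x2) * x2
    return x1 - np.floor_divide(x1, x2) * x2

x1 = np.array([5, -5, 5, -5], dtype=np.int32)
x2 = np.array([3, 3, -3, -3], dtype=np.int32)
expected = floor_mod_reference(x1, x2)   # [2, 1, -1, -2]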
Example #28
def sqrt(input_x, output_y, kernel_name="sqrt"):
    """
    calculating data

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    output_y : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "sqrt"

    Returns
    -------
    None
    """

    """
    TODO:
    Please refer to the TE DSL Manual, And code here with TE DSL.
    """

    """
    TODO:
    operator check
    """

    """
    TODO:
    operator compute, invoke sqrt_compute
    """
    print("=================当你看到这句话时,说明我这个自定义sqrt算子被执行了============================")
    shape = input_x.get("shape")
    dtype = input_x.get("dtype")
    input_dtype = dtype.lower()

    util.check_shape_rule(shape)
    util.check_tensor_shape_size(shape)
    util.check_kernel_name(kernel_name)

    data_input = tvm.placeholder(shape, name="data_input", dtype=input_dtype)
    res = sqrt_compute(data_input, output_y, kernel_name)

    """
    TODO:
    auto schedule
    """
    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    """
    TODO:
    operator build
    """
    config = {"name": kernel_name,
              "tensor_list": [data_input, res]}

    te.lang.cce.cce_build_code(schedule, config)
Example #29
def softplus_grad(input_gradients,
                  input_features,
                  output_backprops,
                  kernel_name="softplus_grad"):
    """
    Computes softplus gradients for a softplus operation.
    The gradients: "dy * exp(x) / (1 + exp(x))".

    Parameters
    ----------
    input_gradients: dict
        The backpropagated gradients to the corresponding softplus operation.
    input_features: dict
        The input_features passed as input to the corresponding softplus operation.
        source data type support "float16", "float32", "int32", "int8", "uint8".
    output_backprops: dict
        data of output.
    kernel_name: str
        kernel name, default value is "softplus_grad".

    Returns
    -------
    None
    """
    shape_dy = input_gradients.get("shape")
    dtype_dy = input_gradients.get("dtype")
    shape_x = input_features.get("shape")
    dtype_x = input_features.get("dtype")

    if dtype_dy.lower() != dtype_x.lower():
        raise RuntimeError("type of dy and type of x must be same, \
             while the types are different")
    dtype = dtype_dy

    check_shape(shape_dy, param_name="input_gradients")
    check_shape(shape_x, param_name="input_features")

    check_list = ("float16", "float32", "int32", "int8", "uint8")
    input_dtype = dtype.lower()
    check_dtype(input_dtype, check_list, param_name="input_gradients")
    shape_dy, shape_x, shape_max = broadcast_shapes(
        shape_dy,
        shape_x,
        param_name_input1="input_gradients",
        param_name_input2="input_features")
    reshape_dy, reshape_x = refine_shapes_for_broadcast(shape_dy, shape_x)

    data_dy = tvm.placeholder(reshape_dy, name="data_dy", dtype=input_dtype)
    data_x = tvm.placeholder(reshape_x, name="data_x", dtype=input_dtype)

    res = softplus_grad_compute(data_dy,
                                data_x,
                                output_backprops,
                                kernel_name=kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_dy, data_x, res]}
    te.lang.cce.cce_build_code(sch, config)
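A NumPy sketch of the gradient quoted in the docstring, dy * exp(x) / (1 + exp(x)), rewritten as dy * sigmoid(x) to avoid overflow for large positive x (hypothetical helper):

import numpy as np

def softplus_grad_reference(dy, x):
    # dy * exp(x) / (1 + exp(x)) == dy * sigmoid(x)
    return dy * (1.0 / (1.0 + np.exp(-x)))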
Example #30
def log(input_x, output_y, base=-1.0, scale=1.0, shift=0.0, kernel_name="log"):
    """
    calculating data

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    output_y : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "log"

    Returns
    -------
    None
    """

    shape = input_x.get("shape")
    dtype = input_x.get("dtype")
    input_dtype = dtype.lower()

    # input_x' shape check
    op_utils.check_shape(shape, param_name="input_x")

    # input_x' dtype check, only supports fp16 and fp32
    check_list = ("float16", "float32")
    op_utils.check_dtype(input_dtype, check_list, param_name="input_x")

    if base <= 0 and (not isclose(base, -1.0)):
        error_info = {}
        error_info['errCode'] = 'E80000'
        error_info['param_name'] = 'base'
        error_info['op_name'] = 'log'
        error_info['expect_value'] = "strictly positive or -1"
        error_info['real_value'] = base
        raise RuntimeError("In op[%s], the parameter[%s] should be [%s], but actually is [%s]."
                           % (error_info['op_name'], error_info['param_name'], \
                              error_info['expect_value'], error_info['real_value']))

    fused_shape = [reduceIns(lambda x, y: x * y, shape[:])]
    data_input = tvm.placeholder(fused_shape,
                                 name="data_input",
                                 dtype=input_dtype)

    res = log_compute(data_input, output_y, base, scale, shift, kernel_name)

    # auto schedule
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    # operator build
    config = {
        "name": kernel_name,
        "need_build": True,
        "tensor_list": (data_input, res)
    }

    te.lang.cce.cce_build_code(sch, config)
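The base/scale/shift parameters are not described in the docstring; below is a hedged NumPy sketch under the assumption of Caffe-style Log layer semantics, y = log_base(scale * x + shift) with base = -1.0 meaning the natural logarithm (verify against log_compute before relying on it):

import numpy as np

def log_reference(x, base=-1.0, scale=1.0, shift=0.0):
    inner = scale * x + shift
    if base <= 0:                      # -1.0 sentinel: natural logarithm
        return np.log(inner)
    return np.log(inner) / np.log(base)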