Example #1
def relu6_grad(input_grad, input_x, output_y, kernel_name="relu6_grad"):
    """
    Parameters
    ----------
    input_grad : dict
        shape and dtype of input_grad
    input_x : dict
        shape and dtype of input_x
    output_y : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is "relu6_grad"

    Returns
    ------
    None
    """
    # check input shape
    shape_x = input_x.get("shape")
    shape_grad = input_grad.get("shape")
    op_utils.check_shape(shape_x, param_name="input_x")
    op_utils.check_shape(shape_grad, param_name="input_grad")
    if list(shape_x) != list(shape_grad):
        raise RuntimeError("input_grad and input_x must have the same shape.")

    # check input tensor data_type and kernel_name
    check_list = ("float16", "float32")
    input_dtype = input_x.get("dtype").lower()
    grad_dtype = input_grad.get("dtype").lower()
    op_utils.check_dtype(input_dtype, check_list, param_name="input_x")
    op_utils.check_dtype(grad_dtype, check_list, param_name="input_grad")
    if input_dtype == "float32" and not tbe_platform.cce_conf.api_check_support(
            "te.lang.cce.vmuls", "float32"):
        raise RuntimeError(
            "Input dtype only support float16 while input dtype is float32")

    # fuse all dimensions into one for elementwise computation
    shape_x = [reduce_ins(lambda x, y: x * y, shape_x[:])]
    input_data_original = tvm.placeholder(shape_x,
                                          name="input_data",
                                          dtype=input_dtype)
    input_grad = tvm.placeholder(shape_x, name="input_grad", dtype=grad_dtype)

    final_res = relu6_grad_compute(input_grad,
                                   input_data_original,
                                   output_y,
                                   kernel_name=kernel_name)
    with tvm.target.cce():
        auto_sch = generic.auto_schedule(final_res)

    config = {
        "name": kernel_name,
        "tensor_list": (input_grad, input_data_orginal, final_res)
    }

    te.lang.cce.cce_build_code(auto_sch, config)
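relu6_grad_compute carries the actual elementwise math and is defined elsewhere in the module. As a hedged sanity check of what the kernel computes, here is a minimal NumPy sketch of the ReLU6 gradient (the name relu6_grad_reference is ours, not part of the operator):

import numpy as np

def relu6_grad_reference(dy, x):
    """NumPy sketch of the ReLU6 gradient: dy flows only where 0 < x < 6."""
    mask = np.logical_and(x > 0.0, x < 6.0)
    return dy * mask.astype(dy.dtype)

# quick check on values straddling the 0 and 6 breakpoints
x = np.array([-1.0, 0.0, 3.0, 6.0, 7.0], dtype=np.float32)
dy = np.ones_like(x)
print(relu6_grad_reference(dy, x))  # [0. 0. 1. 0. 0.]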
Example #2
def fast_gelu_grad(input_dy, input_x, output_z,
                   kernel_name="fast_gelu_grad"):
    """
    algorithm: fast_gelu_grad
    calculating: dy*res'
    res' = div_up/div_down
    div_up = e^(-1.702x) + 1.702xe^(-1.702x) + e^(1.702(x-|x|))
    div_down = (e^(-1.702x)+1)^2

    Parameters
    ----------
    input_dy : dict
        shape and dtype of dy input, only support float16, float32
    input_x : dict
        shape and dtype of x input, only support float16, float32
    output_z: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is fast_gelu_grad

    Returns
    -------
    none.
    """
    shape_dy = input_dy.get("shape")
    shape_x = input_x.get("shape")

    check_shape(shape_dy, param_name="input_dy")
    check_shape(shape_x, param_name="input_x")
    input_dtype = input_dy.get("dtype").lower()
    check_list = ("float16", "float32")
    check_dtype(input_dtype, check_list, param_name="input_dy")
    shape_dy = list(shape_dy)
    shape_x = list(shape_x)
    if not operator.eq(shape_dy, shape_x):
        raise RuntimeError("all input shapes must be equal")

    # fuse all dimensions into one for elementwise computation
    fuseshape = [reduce_ins(lambda x, y: x * y, shape_dy)]
    data_dy = tvm.placeholder(fuseshape, name="data_dy", dtype=input_dtype)
    data_x = tvm.placeholder(fuseshape, name="data_x", dtype=input_dtype)
    res = fast_gelu_grad_compute(data_dy, data_x, output_z, kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data_dy, data_x, res]}

    te.lang.cce.cce_build_code(sch, config)
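The docstring's res' = div_up / div_down expression can be evaluated directly; the following NumPy sketch (the reference function name is ours) just transcribes that formula for a quick numerical check:

import numpy as np

def fast_gelu_grad_reference(dy, x):
    """NumPy sketch of dy * res' with res' = div_up / div_down (see docstring)."""
    div_up = np.exp(-1.702 * x) + 1.702 * x * np.exp(-1.702 * x) \
        + np.exp(1.702 * (x - np.abs(x)))
    div_down = (np.exp(-1.702 * x) + 1.0) ** 2
    return dy * div_up / div_down

x = np.array([-2.0, 0.0, 2.0], dtype=np.float32)
print(fast_gelu_grad_reference(np.ones_like(x), x))  # value at x=0 is 0.5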
Example #3
def sigmoid_grad(x, y, z, kernel_name="sigmoid_grad"):
    """
    do sigmoid grad

    sigmoid_grad = (sigmoid - sigmoid*sigmoid)*grad

    Parameters
    ----------
    x : dict
        shape and dtype of the sigmoid value (forward output)

    y : dict
        shape and dtype of the grad

    z : dict
        shape and dtype of the output

    kernel_name : str
        cce kernel name, default value is "sigmoid_grad"

    Returns
    -------
    None
    """
    shape_sig = x.get("shape")
    shape_d = y.get("shape")
    dtype = x.get("dtype")
    dtype_y = y.get("dtype")
    if dtype != dtype_y:
        raise RuntimeError("Input dtypes must be equal")
    if not operator.eq(list(shape_sig), list(shape_d)):
        raise RuntimeError("Input shapes must be equal")
    op_utils.check_shape(shape_sig, param_name="x")
    input_dtype = dtype.lower()
    op_utils.check_dtype(input_dtype, ("float16", "float32"), param_name="x")

    # fuse all dimensions into one for elementwise computation
    shape_sig = [reduce_ins(lambda x, y: x * y, shape_sig[:])]
    input_sigmoid = tvm.placeholder(shape_sig,
                                    name="input_sigmoid",
                                    dtype=input_dtype)
    input_grad = tvm.placeholder(shape_sig,
                                 name="input_grad",
                                 dtype=input_dtype)

    with tvm.target.cce():
        res = sigmoid_grad_compute(input_sigmoid, input_grad, z, kernel_name)
        auto_sch = topi.generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": [input_sigmoid, input_grad, res]
    }

    te.lang.cce.cce_build_code(auto_sch, config)
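The docstring formula (sigmoid - sigmoid*sigmoid) * grad translates one-to-one into NumPy; a minimal sketch (function name ours) for checking the kernel's math:

import numpy as np

def sigmoid_grad_reference(sig, grad):
    """NumPy sketch of (sigmoid - sigmoid*sigmoid) * grad from the docstring."""
    return (sig - sig * sig) * grad

sig = np.array([0.1, 0.5, 0.9], dtype=np.float32)
grad = np.ones_like(sig)
print(sigmoid_grad_reference(sig, grad))  # roughly [0.09, 0.25, 0.09]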
Example #4
def relu6_d(input_x, output_y, scale=1.0, kernel_name="relu6_d"):
    """
       f(x) = 6    (x >= 6)
       f(x) = 0    (x <= 0)
       f(x) = x    (0 < x < 6)

    Parameters
    ----------
    input_x : dict
        shape and dtype of input_x
    output_y : dict
        shape and dtype of output_y, should be same shape and type as input
    scale : float
        scale factor, default value is 1.0
    kernel_name : str
        cce kernel name, default value is "relu6_d"

    Returns
    ------
    None
    """
    input_shape = util.scalar2tensor_one(input_x.get("shape"))
    input_dtype = input_x.get("dtype").lower()
    op_utils.check_shape(input_shape, param_name="input_x")

    vmaxs_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.vmaxs", "float32")
    if input_dtype == "float32" and not vmaxs_support:
        raise RuntimeError(
            "Input dtype is float32, but do not support on the platform")

    # check input tensor data_type
    check_list = ("int32", "float16", "float32")
    op_utils.check_dtype(input_dtype, check_list, param_name="input_x")

    # fuse all dimensions into one for elementwise computation
    input_shape = [reduce_ins(lambda x, y: x * y, input_shape[:])]
    input_data = tvm.placeholder(input_shape,
                                 name="input_data",
                                 dtype=input_dtype)
    final_res = relu6_d_compute(input_data,
                                output_y,
                                scale,
                                kernel_name=kernel_name)

    with tvm.target.cce():
        auto_sch = topi.generic.auto_schedule(final_res)

    config = {"name": kernel_name, "tensor_list": (input_data, final_res)}
    te.lang.cce.cce_build_code(auto_sch, config)
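The piecewise formula in the docstring is a clamp of x into [0, 6]; the effect of the scale argument is not described here, so this hedged NumPy sketch (name ours) covers only the piecewise part:

import numpy as np

def relu6_reference(x):
    """NumPy sketch of the piecewise formula: clamp x into [0, 6]."""
    return np.minimum(np.maximum(x, 0.0), 6.0)

x = np.array([-3.0, 2.5, 6.0, 10.0], dtype=np.float32)
print(relu6_reference(x))  # [0.  2.5 6.  6. ]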
Example #5
def sqrt_grad(x, dx, out, kernel_name="sqrt_grad"):
    """
    algorithm: sqrt_grad_cce

    Parameters
    ----------
    x : dict
        shape and dtype of input data

    dx : dict
        shape and dtype of input grad

    out : dict
        shape and dtype of output

    kernel_name : str
        cce kernel name, default value is "sqrt_grad"

    Returns
    -------
    None

    """

    shape_x = x.get("shape")
    shape_dx = dx.get("shape")
    dtype_x = x.get("dtype").lower()
    dtype_dx = dx.get("dtype").lower()
    if not operator.eq(list(shape_x), list(shape_dx)):
        raise RuntimeError("Input shapes must be equal")
    if dtype_x != dtype_dx:
        raise RuntimeError("Input dtypes must be the same")

    op_utils.check_shape(shape_x, param_name="x")
    op_utils.check_dtype(dtype_x, ("float16", "float32"), param_name="x")

    # fuse all dimensions into one for elementwise computation
    shape_x = [reduce_ins(lambda x, y: x * y, shape_x[:])]
    data_x = tvm.placeholder(shape_x, name="data_x", dtype=dtype_x)
    data_dx = tvm.placeholder(shape_x, name="data_dx", dtype=dtype_x)
    with tvm.target.cce():
        res = sqrt_grad_compute(data_x, data_dx, kernel_name)
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": (data_x, data_dx, res)}

    te.lang.cce.cce_build_code(sch, config)
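The docstring gives no formula; assuming sqrt_grad follows the standard sqrt backward rule, with x holding y = sqrt(input) and dx the incoming gradient, the result would be dx * 0.5 / x. A hedged NumPy sketch under that assumption (function name ours):

import numpy as np

def sqrt_grad_reference(x, dx):
    """Assumed rule: with x = sqrt(input) and dx the incoming gradient,
    d(input) = dx * 0.5 / x (the standard sqrt backward formula)."""
    return dx * 0.5 / x

x = np.array([1.0, 2.0, 4.0], dtype=np.float32)  # treated as sqrt outputs
dx = np.ones_like(x)
print(sqrt_grad_reference(x, dx))  # [0.5   0.25  0.125]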
Example #6
def fast_gelu(input_x, output_y, kernel_name="fast_gelu"):
    """
    algorithm: fast_gelu

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32
    output_y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is fast_gelu

    Returns
    -------
    none.
    """
    attr = 1.702
    shape = input_x.get("shape")
    check_shape(shape, param_name="input_x")

    check_list = ("float16", "float32")
    input_dtype = input_x.get("dtype").lower()
    check_dtype(input_dtype, check_list, param_name="input_x")

    # fuse all dimensions into one for elementwise computation
    fuseshape = [reduce_ins(lambda x, y: x * y, shape)]
    data = tvm.placeholder(fuseshape, name="data", dtype=input_dtype)
    result = fast_gelu_compute(data, output_y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(result)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data, result]
    }

    te.lang.cce.cce_build_code(sch, config)
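The forward formula is computed inside fast_gelu_compute. Assuming the commonly used approximation fast_gelu(x) = x * sigmoid(1.702 * x), whose coefficient matches attr above and the fast_gelu_grad docstring, a hedged NumPy sketch (name ours) looks like:

import numpy as np

def fast_gelu_reference(x):
    """NumPy sketch assuming fast_gelu(x) = x * sigmoid(1.702 * x); the 1.702
    coefficient matches `attr` above and the fast_gelu_grad docstring."""
    return x / (1.0 + np.exp(-1.702 * x))

x = np.array([-2.0, 0.0, 2.0], dtype=np.float32)
print(fast_gelu_reference(x))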
Example #7
def unsorted_segment_max_d(x,
                           segment_ids,
                           y,
                           num_segments,
                           kernel_name="unsorted_segment_max_d"):
    """
    Operation and Schedule for unsorted_segment_max_d.

    Parameters
    ----------
    x: dict
        shape and dtype of input.
        dtype only support float16, float32, int32
        on Ascend710, dtype also supports int16

    segment_ids : dict
        should be the size of the first dimension
        need not cover all values in the full range of valid values
        dtype only support int32

    y: dict
        shape and dtype of output.

    num_segments : the dimension of the first axis of
                   the output tensor (>= max(segment_ids) + 1)

    kernel_name : cce kernel name,
                  default value is "unsorted_segment_max_d"

    Returns
    -------
        None
    """
    shape = x.get("shape")
    dtype = x.get("dtype")
    segment_ids_shape = segment_ids.get("shape")
    segment_ids_dtype = segment_ids.get("dtype")

    segment_max_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.unsorted_segment_max", "float32")
    if dtype == "float32" and not segment_max_support:
        raise RuntimeError(
            "Input dtype only support float16 while input dtype is float32")
    if num_segments <= 0:
        raise RuntimeError("unsorted_segment_max_d only support num_segments"
                           " greater than 0, while num_segments is %d" %
                           (num_segments))

    first_shape = int(shape[0])
    ids_length = int(segment_ids_shape[0])
    if first_shape != ids_length:
        raise RuntimeError(
            "unsorted_segment_max_d only supports inputs[0] "
            "equal to segment_ids_shape[0], while inputs[0] is %d, "
            "segment_ids_shape[0] is %d" % (first_shape, ids_length))
    # rough estimate of unified buffer (UB) usage for the input rows and the
    # num_segments output rows, including block-alignment padding
    total_ub_size = (num_segments + first_shape) * BLOCK_LENGTH + (
        (BLOCK_LENGTH // 2 - first_shape %
         (BLOCK_LENGTH // 4)) + first_shape) * (BLOCK_LENGTH // 8)
    if total_ub_size > UB_SIZE_MAX // 2:
        raise RuntimeError("unsorted_segment_max_d num_segments=%d,"
                           "shape[0]=%d, greater than UB_SIZE_MAX" %
                           (num_segments, shape[0]))

    dtype = dtype.lower()
    if len(shape) != 1:
        shape = (first_shape, reduce_ins(lambda x, y: x * y, shape[1:]))
    data_inputs = tvm.placeholder(shape, name="data_inputs", dtype=dtype)
    data_segments_id = tvm.placeholder(segment_ids_shape,
                                       name="data_segments_id",
                                       dtype=segment_ids_dtype)
    with tvm.target.cce():
        res = unsorted_segment_max_d_compute(data_inputs, data_segments_id, y,
                                             num_segments, kernel_name)

        sch = generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": [data_inputs, data_segments_id, res]
    }
    te.lang.cce.cce_build_code(sch, config)
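As a hedged reference for the semantics, here is a NumPy sketch of unsorted segment max over the first axis (function name ours); empty segments are filled with the dtype's lowest value, which is the TensorFlow convention and an assumption here:

import numpy as np

def unsorted_segment_max_reference(x, segment_ids, num_segments):
    """NumPy sketch: per-segment max over the first axis of x (float dtypes);
    segments that receive no rows keep the dtype's lowest value (assumed
    TensorFlow-style convention)."""
    out_shape = (num_segments,) + x.shape[1:]
    out = np.full(out_shape, np.finfo(x.dtype).min, dtype=x.dtype)
    for row, seg in zip(x, segment_ids):
        out[seg] = np.maximum(out[seg], row)
    return out

x = np.array([[1.0, 4.0], [3.0, 2.0], [5.0, 0.0]], dtype=np.float32)
ids = np.array([0, 0, 1], dtype=np.int32)
print(unsorted_segment_max_reference(x, ids, 3))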