Example 1
def _infer_shape(format_pattern, x, y):
    shape_x = x.get("shape")
    shape_y = y.get("shape")
    ori_shape_x = x.get("ori_shape")
    ori_shape_y = y.get("ori_shape")
    shape_x = util.scalar2tensor_one(shape_x)
    shape_y = util.scalar2tensor_one(shape_y)

    if format_pattern == 1:
        ori_shape_x, shape_y, _ = util.produce_shapes(ori_shape_x, shape_y)

        if shape_y[-2] == 1 and shape_y[-1] == ori_shape_x[-1]:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-3] = 1
            shape_y[-1] = shape_x[-1]
            shape_y[-4] = shape_x[-4]

        elif shape_y[-2] == ori_shape_x[-2] and shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-4] = 1
            shape_y[-2] = shape_x[-2]
            shape_y[-3] = shape_x[-3]

        elif shape_y[-2] == shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)

    elif format_pattern == 2:
        shape_x, ori_shape_y, _ = util.produce_shapes(shape_x, ori_shape_y)

        if shape_x[-2] == 1 and shape_x[-1] == ori_shape_y[-1]:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-3] = 1
            shape_x[-1] = shape_y[-1]
            shape_x[-4] = shape_y[-4]

        elif shape_x[-2] == ori_shape_y[-2] and shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-4] = 1
            shape_x[-2] = shape_y[-2]
            shape_x[-3] = shape_y[-3]

        elif shape_x[-2] == shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)

    return shape_x, shape_y
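Every example in this listing goes through util.produce_shapes to align two shapes before broadcasting. The helper itself is not shown here; the sketch below only illustrates the NumPy-style rule it appears to follow (right-align the shapes, pad the shorter one with 1s, take the per-axis maximum), and produce_shapes_sketch is a name chosen for this illustration, not the real API.

# Illustrative sketch only, not the TE util implementation.
def produce_shapes_sketch(lhs, rhs):
    """Right-align two shapes, pad with 1s, return both padded shapes plus the broadcast shape."""
    rank = max(len(lhs), len(rhs))
    lhs = [1] * (rank - len(lhs)) + list(lhs)
    rhs = [1] * (rank - len(rhs)) + list(rhs)
    out = []
    for a, b in zip(lhs, rhs):
        if a != b and 1 not in (a, b):
            raise RuntimeError("shapes cannot be broadcast together")
        out.append(max(a, b))
    return lhs, rhs, out

# (16, 1, 32) broadcast against (3, 32) -> (16, 3, 32)
print(produce_shapes_sketch((16, 1, 32), (3, 32)))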
Example 2
def select_v2_compute(condition, x1, x2, y, kernel_name="select_v2"):
    """
    compute for select_v2

    Parameters
    ----------
    condition: TVM tensor
        the placeholder of input condition
    x1: TVM tensor
        the placeholder of input x1
    x2: TVM tensor
        the placeholder of input x2
    y: dict
        dict of y
    kernel_name: str
        cce kernel name, default value is "select_v2"

    Returns
    -------
    res: TVM tensor
        the result of compute
    """
    num_dtype = x1.dtype
    condition_dtype = condition.dtype
    x1 = te.lang.cce.cast_to(x1, "float32")
    x2 = te.lang.cce.cast_to(x2, "float32")
    condition = te.lang.cce.cast_to(condition, "float32")
    shape_x1list = te.lang.cce.util.shape_to_list(x1.shape)
    shape_x2list = te.lang.cce.util.shape_to_list(x2.shape)
    con_shapelist = te.lang.cce.util.shape_to_list(condition.shape)
    shape_x1list, con_shapelist, shape_max_x1 = util.produce_shapes(
        shape_x1list, con_shapelist)
    shape_x2list, shape_max_x1, shape_max = util.produce_shapes(
        shape_x2list, shape_max_x1)
    x1 = te.lang.cce.broadcast(x1, shape_max)
    x2 = te.lang.cce.broadcast(x2, shape_max)
    condition = te.lang.cce.broadcast(condition, shape_max)

    ones = te.lang.cce.broadcast(tvm.const(VALUE_ONE, dtype="float32"),
                                 shape_max,
                                 output_dtype="float32")

    res = te.lang.cce.vcmpsel(condition,
                              rhs=ones,
                              operation='eq',
                              slhs=x1,
                              srhs=x2)
    res = te.lang.cce.cast_to(res, num_dtype)
    return res
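The vcmpsel call above picks x1 where the (0/1-cast) condition equals 1 and x2 elsewhere, after all three tensors are broadcast to a common shape. A minimal NumPy reference of that behaviour, for illustration only:

import numpy as np

def select_v2_reference(condition, x1, x2):
    # pick x1 where condition == 1, otherwise x2; NumPy handles the broadcast
    return np.where(np.equal(condition, 1), x1, x2)

cond = np.array([[1], [0]], dtype=np.float32)      # shape (2, 1)
a = np.arange(6, dtype=np.float32).reshape(2, 3)   # shape (2, 3)
b = np.full((2, 3), -1.0, dtype=np.float32)
print(select_v2_reference(cond, a, b))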
Example 3
def fake_quant_per_layer(x,
                         min_val,
                         max_val,
                         y,
                         symmetric,
                         narrow_range,
                         num_bits,
                         kernel_name="fake_quant_per_layer"):
    """FakeQuantPerLayer"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]), )
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    quant_min = 0
    quant_max = 2**num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res = fake_quant_per_layer_compute(input_data, min_data, max_data, y,
                                       quant_min, quant_max, symmetric,
                                       kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [input_data, min_data, max_data, res]
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
Example 4
def custom_subtract(shape_x,
                    shape_y,
                    dtype,
                    kernel_name="cce_subtract",
                    need_build=True,
                    need_print=True):
    """
    do element-wise subtract operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input data1

    shape_y : shape of input data2

    dtype : source data type, support float16,float32,int32

    kernel_name : cce kernel name, default value is "cce_subtract"

    need_build : if need to build CCEC kernel, default value is True

    need_print : if need to print the ir, default value is True

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)
    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    dtype = dtype.lower()
    if not (dtype in check_list):
        raise RuntimeError(
            "tf_subtract_cce only support %s while dtype is %s" %
            (",".join(check_list), dtype))
    print("######## shape")
    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    data1 = tvm.placeholder(shape_x, dtype=dtype, name="data1")
    data2 = tvm.placeholder(shape_y, dtype=dtype, name="data2")

    with tvm.target.cce():
        data1_tmp1 = te.lang.cce.broadcast(data1, shape_max)
        data2_tmp1 = te.lang.cce.broadcast(data2, shape_max)
        res = te.lang.cce.vsub(data1_tmp1, data2_tmp1)
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": need_print,
        "need_build": need_build,
        "name": kernel_name,
        "tensor_list": [data1, data2, res]
    }
    te.lang.cce.cce_build_code(sch, config)
Example 5
def custom_equal(shape_x, shape_y, dtype, kernel_name="cce_tf_equal", need_build=False,
                 need_print=False):
    """
    do element-wise equal operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input x

    shape_y : shape of input y

    dtype : source data type, support float16,float32,int32,int8,uint8

    kernel_name : cce kernel name, default value is "cce_tf_equal"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)

    check_list = ["float16", "float32", "int32", "int8", "uint8", "bool"]

    dtype = dtype.lower()
    if not (dtype in check_list):
        raise RuntimeError(
            "tf_equal_cce only support %s while dtype is %s" % (",".join(check_list), dtype))

    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)

    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    x = tvm.placeholder(shape_x, dtype=dtype, name="x")
    y = tvm.placeholder(shape_y, dtype=dtype, name="y")

    x_tmp = te.lang.cce.broadcast(x, shape_max)
    y_tmp = te.lang.cce.broadcast(y, shape_max)

    res = tvm.compute(shape_max, lambda *i: x_tmp(*i) == y_tmp(*i), name='res')

    sch = tvm.create_schedule(res.op)

    if need_print:
        with build_config:
            print(tvm.lower(sch, [x, y, res], simple_mode=True))

    if need_build:
        with build_config:
            tvm.build(sch, [x, y, res], "cce", name=kernel_name)
Example 6
def _shape_check(shape_x1, shape_x2, shape_tgt):
    # check whether the shape meets the broadcast requirements, and output broadcast shape
    try:
        _, _, x_shape = util.produce_shapes(shape_x1, shape_x2)
    except RuntimeError:
        raise RuntimeError("x1 and x2 can't be broadcast")

    x_shape_reduce = x_shape[:]
    x_shape_reduce.pop(1)
    try:
        _, _, tgt_shape = util.produce_shapes(x_shape_reduce, shape_tgt)
    except RuntimeError:
        raise RuntimeError("x and target can't be broadcast")
    min_dim = min(len(shape_x1), len(shape_x2), len(shape_tgt))
    if min_dim >= 3:
        reduce_dim = -1
        for i in range(-1, -min_dim, -1):
            if (shape_x1[i] == shape_x2[i]) or (shape_x1[i] == shape_tgt[i]):
                reduce_dim = i
            else:
                break
        if reduce_dim != -1:
            shape_x1 = list(shape_x1[:reduce_dim]) + [
                reduce(lambda x, y: x * y, shape_x1[reduce_dim:])
            ]
            shape_x2 = list(shape_x2[:reduce_dim]) + [
                reduce(lambda x, y: x * y, shape_x2[reduce_dim:])
            ]
            shape_tgt = list(shape_tgt[:reduce_dim]) + [
                reduce(lambda x, y: x * y, shape_tgt[reduce_dim:])
            ]
            x_shape = list(x_shape[:reduce_dim]) + [
                reduce(lambda x, y: x * y, x_shape[reduce_dim:])
            ]
            tgt_shape = list(tgt_shape[:reduce_dim]) + [
                reduce(lambda x, y: x * y, tgt_shape[reduce_dim:])
            ]
    util.check_shape_rule(shape_x1)
    util.check_shape_rule(shape_x2)
    util.check_shape_rule(shape_tgt)
    util.check_tensor_shape_size(shape_x1)
    util.check_tensor_shape_size(shape_x2)
    util.check_tensor_shape_size(shape_tgt)

    return x_shape, tgt_shape, shape_x1, shape_x2, shape_tgt
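The dimension-fusing loop above collapses the trailing axes into a single axis once it finds how far back the operands still line up, which keeps the effective rank small for the later checks. A standalone sketch of that folding step, using functools.reduce as the snippet does (fuse_trailing is a name made up for this illustration):

from functools import reduce

def fuse_trailing(shape, reduce_dim):
    # collapse shape[reduce_dim:] into one axis by multiplying the trailing sizes
    return list(shape[:reduce_dim]) + [reduce(lambda x, y: x * y, shape[reduce_dim:])]

# (8, 4, 16, 16) with reduce_dim=-2 becomes [8, 4, 256]
print(fuse_trailing((8, 4, 16, 16), -2))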
Example 7
def leaky_relu_grad(g, x, y, negative_slope=0, kernel_name="leaky_relu_grad"):
    """
    calculate the backpropagation of leaky_relu operation
    y = gradients(x>0) or negative_slope*gradients(x<=0).
    support dtype:float16,float32

    Parameters
    ----------
    g : dict
        the backpropagated gradients to the corresponding leaky_relu operation
    x : dict
        the x passed as output of leaky_relu operation
    y : dict
        the output of leaky_relu back propagation
    negative_slope : float or int
        allow non-zero slope for negative inputs to speed up optimization
    kernel_name : str
        kernel name, default value is "leaky_relu_grad"

    Returns
    -------
    None
    """

    shape_g = g.get("shape")
    shape_x = x.get("shape")
    dtype_g = g.get("dtype").lower()
    dtype_x = x.get("dtype").lower()

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_g)
    util.check_shape_rule(shape_x)
    util.check_tensor_shape_size(shape_g)
    util.check_tensor_shape_size(shape_x)

    shape_list = util.produce_shapes(shape_g, shape_x)
    util.check_tensor_shape_size(shape_list[2])

    # check input tensor data_type
    check_list = ["float16", "float32"]
    util.check_dtype_rule(dtype_g, check_list)
    util.check_dtype_rule(dtype_x, check_list)
    util.compare_tensor_dict_key(g, x, "dtype")

    shape_g, shape_x = refine_shapes_for_broadcast(shape_list[0],
                                                   shape_list[1])
    data_g = tvm.placeholder(shape_g, name="data_g", dtype=dtype_g)
    data_x = tvm.placeholder(shape_x, name="data_x", dtype=dtype_x)
    res = leaky_relu_grad_compute(data_g, data_x, y, negative_slope,
                                  kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_g, data_x, res]}

    te.lang.cce.cce_build_code(schedule, config)
Example 8
def minmax_update_perlayer(x,
                           min_val,
                           max_val,
                           min_up,
                           max_up,
                           ema,
                           ema_decay,
                           kernel_name="minmax_update_perlayer"):
    """MinMaxUpdatePerLayer op"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]), )
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res_list = minmax_update_perlayer_compute(input_data, min_data, max_data,
                                              ema, ema_decay)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
Example 9
def axpy_v2_compute(x1, x2, alpha, y, kernel_name="axpy_v2"):
    """
    calculating data

    Parameters
    ----------
    x1 : TVM tensor
        the placeholder of input_x
    x2 : TVM tensor
        the placeholder of x2
    alpha : TVM tensor
        scalar of mul-factor
    y : dict
        dict of y, include keys(shape and dtype)
    kernel_name : str
        kernel name, default value is "axpy_v2"

    Returns
    -------
    output tensor
    """
    # broadcast
    shape_x1 = te.lang.cce.util.shape_to_list(x1.shape)
    shape_x2 = te.lang.cce.util.shape_to_list(x2.shape)
    dtype_alpha = alpha.dtype.lower()
    dtype = x1.dtype.lower()
    precision_dtype = "float32"

    if dtype != precision_dtype:
        x1 = te.lang.cce.cast_to(x1, precision_dtype)
        x2 = te.lang.cce.cast_to(x2, precision_dtype)

    if dtype_alpha != precision_dtype:
        alpha = te.lang.cce.cast_to(alpha, precision_dtype)

    if shape_x1 != shape_x2:
        # if shape not equal, then apply broadcast.
        shape_x, shape_y, shape_max = util.produce_shapes(shape_x1, shape_x2)
        x1 = te.lang.cce.broadcast(x1, shape_max)
        x2 = te.lang.cce.broadcast(x2, shape_max)
        alpha = te.lang.cce.broadcast(alpha, shape_max)
    else:
        alpha = te.lang.cce.broadcast(alpha, shape_x1)

    res = te.lang.cce.vmla(x2, alpha, x1)
    res = te.lang.cce.cast_to(res, dtype)
    return res
Example 10
def mul_no_nan_compute(input_x1, input_x2, output_y, kernel_name="mul_no_nan"):
    """
    calculating data

    Parameters
    ----------
    input_x1 : TVM tensor
        the placeholder of input_x1
    input_x2 : TVM tensor
        the placeholder of input_x2
    output_y : dict
        dict of output_y, include keys(shape and dtype)
    kernel_name : str
        kernel name, default value is "mul_no_nan"

    Returns
    -------
    output tensor
    """
    """
    np.where(np.equal(y, 0.), np.zeros((), dtype=dtype), np.multiply(x, y))
    """
    src_dtype = input_x1.dtype.lower()
    shape_x1 = te.lang.cce.util.shape_to_list(input_x1.shape)
    shape_x2 = te.lang.cce.util.shape_to_list(input_x2.shape)

    shape_x1, shape_x2, shape_max = util.produce_shapes(shape_x1, shape_x2)
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)
    input_x1 = te.lang.cce.broadcast(input_x1, shape_max)
    input_x2 = te.lang.cce.broadcast(input_x2, shape_max)

    mul_res = te.lang.cce.vmul(input_x1, input_x2)
    zero = tvm.const(0, dtype=src_dtype)
    zeros = te.lang.cce.broadcast(zero, shape_max)
    res = te.lang.cce.vcmpsel(input_x2,
                              zeros,
                              operation='eq',
                              slhs=zeros,
                              srhs=mul_res)
    return res
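The docstring above already gives the intended semantics in NumPy terms; spelled out as a runnable reference (illustrative only, not the TBE kernel):

import numpy as np

def mul_no_nan_reference(x, y):
    # where y == 0 the result is exactly 0, even if x is inf or NaN
    return np.where(np.equal(y, 0.), np.zeros((), dtype=y.dtype), np.multiply(x, y))

x = np.array([np.inf, 2.0, np.nan], dtype=np.float32)
y = np.array([0.0, 3.0, 0.0], dtype=np.float32)
print(mul_no_nan_reference(x, y))   # [0. 6. 0.]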
Example 11
def xdivy_grad(x1, x2, grad, y1, y2, kernel_name="xdivy_grad"):
    """
    Returns gradient of xdivy(x, y) with respect to x and y.

    Parameters
    ----------
    x1 : dict
        shape and dtype of input, only support float16, float32
    x2 : dict
        shape and dtype of input, only support float16, float32
    grad : dict
        shape and dtype of input, only support float16, float32
    y1 : dict
        shape and dtype of output, should be same shape and type as input
    y2 : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "xdivygrad"

    Returns
    -------
    None
    """
    shape_x1 = x1.get("shape")
    dtype_x1 = x1.get("dtype").lower()
    shape_x2 = x2.get("shape")
    dtype_x2 = x2.get("dtype").lower()
    shape_grad = grad.get("shape")
    dtype_grad = grad.get("dtype").lower()
    if not dtype_x1 == dtype_x2 == dtype_grad:
        raise RuntimeError("the type of x1, x2 and grad must be the same.")

    op_utils.check_shape(shape_x1, param_name="x1")
    op_utils.check_shape(shape_x2, param_name="x2")
    op_utils.check_shape(shape_grad, param_name="grad")
    check_list = ("float16", "float32")
    op_utils.check_dtype(dtype_x1, check_list, param_name="x1")
    shape_x1, shape_x2, shape_max_x1x2 = util.produce_shapes(
        shape_x1, shape_x2)
    if len(shape_max_x1x2) < len(shape_grad):
        raise RuntimeError(
            "the length of shape_grad can not be longer than the maximum "
            "length of x1 and x2.")

    shape_grad, _, shape_max = util.produce_shapes(shape_grad, shape_max_x1x2)

    for (x, y) in zip(shape_max_x1x2, shape_grad):
        if x < y:
            raise RuntimeError("this shape is not supported.")

    op_utils.check_shape(shape_max, param_name="x")
    rx, ry = _broadcast_gradient_args(shape_x1, shape_x2)

    x1 = tvm.placeholder(shape_x1, name="x", dtype=dtype_x1)
    x2 = tvm.placeholder(shape_x2, name="y", dtype=dtype_x1)
    grad = tvm.placeholder(shape_grad, name="grad", dtype=dtype_x1)

    output_y1, output_y2 = xdivy_grad_compute([x1, x2, grad], shape_max,
                                              dtype_x1, rx, ry)

    with tvm.target.cce():
        sch = generic.auto_schedule([output_y1, output_y2])

    config = {
        "name": kernel_name,
        "tensor_list": [x1, x2, grad, output_y1, output_y2]
    }
    te.lang.cce.cce_build_code(sch, config)
Example 12
def threshold_grad_v2_d(input_gradients,
                        input_features,
                        output_backprops,
                        threshold,
                        kernel_name="threshold_grad_v2_d"):
    """
    calculating data

    Parameters
    ----------
    input_gradients : dict
        shape and dtype of input_gradients
    input_features : dict
        shape and dtype of input_features
    output_backprops : dict
        shape and dtype of output_backprops,
        should be same shape and type as inputs
    threshold : dict
        shape and dtype of threshold, 0-dimensional array
    kernel_name : str
        kernel name, default value is "threshold_grad_v2_d"

    Returns
    -------
    None
    """
    shape_input_gradients = input_gradients.get("shape")
    dtype_input_gradients = input_gradients.get("dtype").lower()

    shape_input_features = input_features.get("shape")
    dtype_input_features = input_features.get("dtype").lower()

    shape_list = util.produce_shapes(shape_input_gradients,
                                     shape_input_features)
    util.check_tensor_shape_size(shape_list[2])
    shape_input_gradients, shape_input_features = \
        refine_shapes_for_broadcast(shape_list[0], shape_list[1])

    check_list = ("float16", "float32", "int32", "int8", "uint8")
    check_dtype(dtype_input_gradients, check_list)
    check_dtype(dtype_input_features, check_list)

    data_input_gradients = tvm.placeholder(shape_input_gradients,
                                           name="data_input_gradients",
                                           dtype=dtype_input_gradients)
    data_input_features = tvm.placeholder(shape_input_features,
                                          name="data_input_features",
                                          dtype=dtype_input_features)
    res = threshold_grad_v2_d_compute(data_input_gradients,
                                      data_input_features, output_backprops,
                                      threshold, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": [data_input_gradients, data_input_features, res]
    }

    te.lang.cce.cce_build_code(schedule, config)
Example 13
def select_v2(condition, x1, x2, y, kernel_name="select_v2"):
    """
      Selects elements from `x1` or `x2`, depending on `condition`.

      Parameters
      ----------
      condition: dict
          dict of condition, include keys(shape and dtype),
          only support bool
      x1: dict
          dict of x1, only support float16, float32, int32, int8, uint8
      x2: dict
          dict of x2, only support float16, float32, int32, int8, uint8
      y: dict
          dict of output
      kernel_name: str
          cce kernel name, default value is "select"

      Returns
      -------
      None
      """
    shape_x1 = x1.get("shape")
    dtype_x1 = x1.get("dtype")
    shape_x2 = x2.get("shape")
    dtype_x2 = x2.get("dtype")
    bool_dtype = condition.get("dtype")
    con_shape = condition.get("shape")

    shape_x1, con_shape, shape_max_x1 = util.produce_shapes(
        shape_x1, con_shape)
    shape_x2, con_shape, shape_max_x2 = util.produce_shapes(
        shape_x2, con_shape)

    if shape_x1[-1] == 1 and shape_x2[-1] == 1 and con_shape[-1] == 1 \
            and shape_max_x1[-1] == 1:
        shape_x1 = shape_x1 if len(shape_x1) == 1 else shape_x1[:-1]
        shape_x2 = shape_x2 if len(shape_x2) == 1 else shape_x2[:-1]
        con_shape = con_shape if len(con_shape) == 1 else con_shape[:-1]

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x1)
    util.check_tensor_shape_size(shape_x1)

    if shape_x1 == shape_x2 == con_shape:
        shape_x1 = (functools_reduce(lambda x, y: x * y, shape_x1[:]), )
        shape_x2 = (functools_reduce(lambda x, y: x * y, shape_x2[:]), )
        con_shape = (functools_reduce(lambda x, y: x * y, con_shape[:]), )

    dtype_x1 = dtype_x1.lower()
    dtype_x2 = dtype_x2.lower()
    check_list = ("float16", "float32", "int32", "int8", "uint8")
    util.check_dtype_rule(dtype_x1, check_list)
    if dtype_x1 != dtype_x2:
        raise RuntimeError("Dtype of tensor x1 and x2 must be equal!")

    bool_dtype = bool_dtype.lower()
    bool_check_list = ("bool", "int8", "uint8")
    util.check_dtype_rule(bool_dtype, bool_check_list)

    condition = tvm.placeholder(con_shape, name="condition", dtype=bool_dtype)
    input_then = tvm.placeholder(shape_x1, name="input_then", dtype=dtype_x1)
    input_else = tvm.placeholder(shape_x2, name="input_else", dtype=dtype_x2)

    with tvm.target.cce():
        res = select_v2_compute(condition, input_then, input_else, y,
                                kernel_name)
        sch = generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": [condition, input_then, input_else, res],
        "bool_storage_as_1bit": False
    }
    te.lang.cce.cce_build_code(sch, config)
Example 14
def sigmoid_cross_entropy_with_logits_grad_v2_compute(predict,
                                                      target,
                                                      dout,
                                                      weight,
                                                      pos_weight,
                                                      reduction="mean"):
    """
    :param predict: TVM tensor, the placeholder of predict
    :param target: TVM tensor, the placeholder of target
    :param dout: TVM tensor, the placeholder of dout
    :param weight: TVM tensor, the placeholder of weight
    :param pos_weight: TVM tensor, the placeholder of pos_weight
    :param reduction: str, specifies the reduction mode :'none' | 'mean' | 'sum'
    :return: TVM tensor
    """
    predict_shape = te.lang.cce.util.shape_to_list(predict.shape)
    predict_dtype = predict.dtype

    precision_dtype = "float32"

    if predict.dtype.lower() == "float16":
        predict = te.lang.cce.cast_to(predict, precision_dtype)
        target = te.lang.cce.cast_to(target, precision_dtype)

    # calculate sigmoid(predict)
    exp_predict = te.lang.cce.vexp(predict)
    exp_add1 = te.lang.cce.vadds(exp_predict, tvm.const(1, precision_dtype))
    sigmoid_tmp = te.lang.cce.vdiv(exp_predict, exp_add1)
    sigmoid_res = te.lang.cce.cast_to(sigmoid_tmp, precision_dtype)

    # calculate the result of gradient = ((log_weight + 1 - target) * sigmoid(predict) - log_weight) * dout
    if pos_weight is not None:
        pos_weight_shape = te.lang.cce.util.shape_to_list(pos_weight.shape)
        if pos_weight_shape != predict_shape:
            _, _, broadcast_pos_shape = util.produce_shapes(
                pos_weight_shape, predict_shape)
            pos_weight = te.lang.cce.broadcast(pos_weight, broadcast_pos_shape,
                                               precision_dtype)

        log_weight = te.lang.cce.vmul(pos_weight, target)
        weight_tmp = te.lang.cce.vadds(log_weight,
                                       tvm.const(1, precision_dtype))
        weight_sub = te.lang.cce.vsub(weight_tmp, target)
        grad_tmp = te.lang.cce.vmul(weight_sub, sigmoid_res)
        grad_cur = te.lang.cce.vsub(grad_tmp, log_weight)
        grad_output = te.lang.cce.vmul(grad_cur, dout)
    else:
        grad_cur = te.lang.cce.vsub(sigmoid_res, target)
        grad_output = te.lang.cce.vmul(grad_cur, dout)

    # calculate the result of gradient = gradient * weight
    if weight is not None:
        weight_shape = te.lang.cce.util.shape_to_list(weight.shape)
        if weight_shape != predict_shape:
            _, _, broadcast_weight_shape = util.produce_shapes(
                weight_shape, predict_shape)
            weight = te.lang.cce.broadcast(weight, broadcast_weight_shape,
                                           precision_dtype)

        grad_output = te.lang.cce.vmul(grad_output, weight)

    # calculate the result of gradient = gradient / num
    if reduction == "mean":
        num = reduce(lambda x, y: x * y, predict_shape)
        norm = 1.0 / num
        grad_output = te.lang.cce.vmuls(grad_output, norm)

    grad_output = te.lang.cce.cast_to(grad_output, predict_dtype)
    return grad_output
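Read step by step, the gradient built above is ((pos_weight*target + 1 - target) * sigmoid(predict) - pos_weight*target) * dout, optionally scaled by weight and divided by the element count when reduction is "mean". A NumPy transcription of that formula, illustrative only (broadcasting of weight and pos_weight is left to NumPy):

import numpy as np

def sigmoid_bce_grad_reference(predict, target, dout,
                               weight=None, pos_weight=None, reduction="mean"):
    sigmoid = 1.0 / (1.0 + np.exp(-predict))
    if pos_weight is not None:
        log_weight = pos_weight * target
        grad = ((log_weight + 1.0 - target) * sigmoid - log_weight) * dout
    else:
        grad = (sigmoid - target) * dout
    if weight is not None:
        grad = grad * weight
    if reduction == "mean":
        grad = grad / predict.size
    return grad

p = np.array([0.2, -1.0], dtype=np.float32)
t = np.array([1.0, 0.0], dtype=np.float32)
print(sigmoid_bce_grad_reference(p, t, np.ones_like(p)))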
Example 15
def _broadcast_shape_check(input_shape, target_shape):
    try:
        util.produce_shapes(input_shape, target_shape)
    except RuntimeError:
        raise RuntimeError("input_shape can't be broadcast to target_shape")
Example 16
def axpy_compute(x1, x2, y, alpha, kernel_name="axpy"):
    """
    calculating data

    Parameters
    ----------
    x1 : TVM tensor
        the placeholder of input_x
    x2 : TVM tensor
        the placeholder of x2
    y : dict
        dict of y, include keys(shape and dtype)
    alpha : float
        scalar of mul-factor
    kernel_name : str
        kernel name, default value is "axpy"

    Returns
    -------
    output tensor
    """
    # broadcast
    shape_x = te.lang.cce.util.shape_to_list(x1.shape)
    shape_y = te.lang.cce.util.shape_to_list(x2.shape)
    dtype = x1.dtype.lower()

    # neg_1_axis_flag
    neg_1_axis_flag = 0
    if shape_x != shape_y:
        # if shape not equal, then apply broadcast.
        shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)

        for i in range(len(shape_x) - 1):
            if shape_x[i] != shape_y[i]:
                neg_1_axis_flag = 1
                break
        util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

        x1 = te.lang.cce.broadcast(x1, shape_max)
        x2 = te.lang.cce.broadcast(x2, shape_max)

    # start the main logic
    if tbe_platform.cce_conf.get_soc_spec("SOC_VERSION") == "Ascend910":
        if dtype in ("float16", "float32"):
            # fp16 or fp32
            if neg_1_axis_flag:
                res_muls = te.lang.cce.vmuls(x2, alpha)
                res = te.lang.cce.vadd(x1, res_muls)
            else:
                res = te.lang.cce.vaxpy(x2, x1, tvm.const(alpha, dtype=dtype))
        else:
            # int32
            if alpha != 1:
                # add+muls use fp32
                to_type = "float32"
                input_x_cast = te.lang.cce.cast_to(x1, to_type)
                input_y_cast = te.lang.cce.cast_to(x2, to_type)

                if neg_1_axis_flag:
                    res_muls = te.lang.cce.vmuls(input_y_cast, alpha)
                    res_tmp = te.lang.cce.vadd(input_x_cast, res_muls)
                else:
                    res_tmp = te.lang.cce.vaxpy(input_y_cast, input_x_cast,
                                                tvm.const(alpha, dtype=to_type))

                res = te.lang.cce.cast_to(res_tmp, dtype)

            else:
                # if alpha == 1
                res = te.lang.cce.vadd(x2, x1)
    else:
        if dtype in ("float16", "float32"):
            # fp16 or fp32
            res_muls = te.lang.cce.vmuls(x2, alpha)
            res = te.lang.cce.vadd(x1, res_muls)
        else:
            # int32
            if alpha != 1:
                # add+muls use fp32
                to_type = "float32"
                input_x1_cast = te.lang.cce.cast_to(x1, to_type)
                input_x2_cast = te.lang.cce.cast_to(x2, to_type)

                res_muls = te.lang.cce.vmuls(input_x2_cast, alpha)
                res_tmp = te.lang.cce.vadd(input_x1_cast, res_muls)

                res = te.lang.cce.cast_to(res_tmp, dtype)
            else:
                # if alpha == 1
                res = te.lang.cce.vadd(x2, x1)

    return res
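Whichever hardware branch runs above, the fallback vmuls/vadd path makes the intent explicit: the result is x1 + alpha * x2. A one-line NumPy reference, for illustration:

import numpy as np

def axpy_reference(x1, x2, alpha):
    # same value as the vmuls/vadd path: x1 + alpha * x2, with NumPy broadcasting
    return x1 + alpha * x2

print(axpy_reference(np.ones((2, 3), dtype=np.float32),
                     np.arange(3, dtype=np.float32), 2.0))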
Example 17
def axpy_v2(x1, x2, alpha, y, kernel_name="axpy_v2"):
    """
    calculating data

    Parameters
    ----------
    x1 : dict
        shape and dtype of input_x
    x2 : dict
        shape and dtype of input_y
    alpha : dict
        shape and dtype of alpha
        scalar apply to input_y:input_y*alpha
    y : dict
        shape and dtype of output, should be same shape and type as input

    kernel_name : str
        kernel name, default value is "axpy"

    Returns
    -------
    None
    """
    # check kernel name
    util.check_kernel_name(kernel_name)

    # infer shape according to the format pattern
    format_pattern = _add_check_format(x1, x2)

    shape_x1, shape_x2 = _infer_shape(format_pattern, x1, x2)

    dtype_x1 = x1.get("dtype").lower()
    dtype_x2 = x2.get("dtype").lower()
    alpha_dtype = alpha.get("dtype").lower()
    alpha_shape = alpha.get("shape")

    # check shape
    shape_x1 = util.scalar2tensor_one(shape_x1)
    shape_x2 = util.scalar2tensor_one(shape_x2)
    alpha_shape = util.scalar2tensor_one(alpha_shape)
    op_utils.check_shape(shape_x1)
    op_utils.check_shape(shape_x2)
    op_utils.check_shape(alpha_shape)

    # check dtype
    dtype_list0 = ("float16", "float32", "int32")
    dtype_list1 = ("float16", "float32")

    check_dtype(dtype_x1, dtype_list0)
    check_dtype(dtype_x2, dtype_list0)
    check_dtype(alpha_dtype, dtype_list1)
    util.compare_tensor_dict_key(x1, x2, "dtype")

    # check alpha is 0D or 1D tensor
    if len(alpha_shape) and not util.is_scalar(alpha_shape):
        raise RuntimeError("alpha should be 0D or 1D tensor")

    # produce shapes
    shape_x1, shape_x2, shape_max = util.produce_shapes(shape_x1, shape_x2)
    if shape_x1[-1] == 1 and shape_x2[-1] == 1 and shape_max[-1] == 1:
        shape_x1 = shape_x1 if len(shape_x1) == 1 else shape_x1[:-1]
        shape_x2 = shape_x2 if len(shape_x2) == 1 else shape_x2[:-1]
        shape_max = shape_max if len(shape_max) == 1 else shape_max[:-1]

    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)
    util.produce_shapes(shape_max, alpha_shape)

    shape_x1, shape_x2 = refine_shapes_for_broadcast(shape_x1, shape_x2)

    data_input_x1 = tvm.placeholder(shape_x1,
                                    name="data_input_x1", dtype=dtype_x1)
    data_input_x2 = tvm.placeholder(shape_x2,
                                    name="data_input_x2", dtype=dtype_x2)

    alpha_shape = tuple([1] * (len(shape_x1) - len(alpha_shape))) + tuple(alpha_shape)
    alpha_input = tvm.placeholder(alpha_shape, name="alpha_input", dtype=alpha_dtype)

    res = axpy_v2_compute(data_input_x1, data_input_x2, alpha_input, y, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data_input_x1, data_input_x2, alpha_input, res]}

    te.lang.cce.cce_build_code(schedule, config)
Example 18
def custom_logical_and(shape_x,
                       shape_y,
                       dtype,
                       kernel_name="cce_tf_logical_and",
                       need_build=False,
                       need_print=False):
    """
    do element-wise logical-and operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input data1

    shape_y : shape of input data2

    dtype : source data type, support "bool"

    kernel_name : cce kernel name, default value is "cce_tf_logical_and"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)

    check_list = ["bool"]
    if not (dtype.lower() in check_list):
        raise RuntimeError(
            "logical_and_cce only support %s while dtype is %s" %
            (",".join(check_list), dtype))

    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()

    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    data1 = tvm.placeholder(shape_x, dtype=inp_dtype, name="data1")
    data2 = tvm.placeholder(shape_y, dtype=inp_dtype, name="data2")

    with tvm.target.cce():
        data1_tmp1 = te.lang.cce.broadcast(data1, shape_max)
        data1_tmp2 = te.lang.cce.broadcast(data2, shape_max)

        min_value = tvm.const(0, dtype=inp_dtype)
        res = tvm.compute(
            shape_max,
            lambda *i: tvm.select(
                tvm.all(
                    tvm.any(
                        data1_tmp1(*i) > min_value,
                        data1_tmp1(*i) < -min_value),
                    tvm.any(
                        data1_tmp2(*i) > min_value,
                        data1_tmp2(*i) < -min_value)), True, False),
            name="res")

        sch = tvm.create_schedule(res.op)

    if need_print:
        with build_config:
            print(tvm.lower(sch, [data1, data2, res], simple_mode=True))

    if need_build:
        with build_config:
            tvm.build(sch, [data1, data2, res], "cce", name=kernel_name)
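The tvm.select expression above treats any value with magnitude greater than zero as true, which for 0/1 bool inputs is plain logical AND. The NumPy equivalent, for reference only:

import numpy as np

a = np.array([True, True, False, False])
b = np.array([True, False, True, False])
print(np.logical_and(a, b))   # [ True False False False]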
Example 19
def custom_squared_difference(shape_x,
                              shape_y,
                              dtype,
                              kernel_name="cce_tf_squared_difference",
                              need_build=False,
                              need_print=False):
    """
    algorithm: tf_squared_difference

    calculating data's tf_squared_difference,y= (x - y) * (x - y)

    Parameters
    ----------
    shape_x : shape of input x

    shape_y : shape of input y

    dtype : the data type, assume src_dtype equals dst_dtype, only support \
    float16, float32, int32

    kernel_name : cce kernel name, default value is "cce_tf_squared_difference"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)
    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]

    if not dtype.lower() in check_list:
        raise RuntimeError(
            "tf_squared_difference_cce only support %s while dtype is %s" %
            (",".join(check_list), dtype))

    dtype = dtype.lower()

    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    data_x = tvm.placeholder(shape_x, dtype=dtype, name="data_x")
    data_y = tvm.placeholder(shape_y, dtype=dtype, name="data_y")

    with tvm.target.cce():
        data_x_tmp = te.lang.cce.broadcast(data_x, shape_max)
        data_y_tmp = te.lang.cce.broadcast(data_y, shape_max)
        data_sub = te.lang.cce.vsub(data_x_tmp, data_y_tmp)
        res = te.lang.cce.vmul(data_sub, data_sub)
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": need_print,
        "need_build": need_build,
        "name": kernel_name,
        "tensor_list": [data_x, data_y, res]
    }

    te.lang.cce.cce_build_code(sch, config)
Example 20
def axpy(x1, x2, y, alpha, kernel_name="axpy"):
    """
    calculating data

    Parameters
    ----------
    x1 : dict
        shape and dtype of input_x
    x2 : dict
        shape and dtype of input_y
    y : dict
        shape and dtype of output, should be same shape and type as input
    alpha : float
        scalar apply to input_y:input_y*alpha
    kernel_name : str
        kernel name, default value is "axpy"

    Returns
    -------
    None
    """
    # check kernel name
    util.check_kernel_name(kernel_name)

    # infer shape according to the format pattern
    format_pattern = _add_check_format(x1, x2)

    shape_x1, shape_x2 = _infer_shape(format_pattern, x1, x2)

    # check shape
    shape_x1 = util.scalar2tensor_one(shape_x1)
    util.check_shape_size(shape_x1, SHAPE_SIZE_LIMIT)
    shape_x2 = util.scalar2tensor_one(shape_x2)
    util.check_shape_size(shape_x2, SHAPE_SIZE_LIMIT)

    util.check_shape_rule(shape_x1)
    util.check_tensor_shape_size(shape_x1)
    util.check_shape_rule(shape_x2)
    util.check_tensor_shape_size(shape_x2)

    # check dtype
    dtype_list = ("float16", "float32", "int32")

    dtype_x1 = x1.get("dtype").lower()
    check_dtype(dtype_x1, dtype_list)
    dtype_x2 = x2.get("dtype").lower()
    check_dtype(dtype_x2, dtype_list)

    # produce shapes
    shape_x1, shape_x2, shape_max = util.produce_shapes(shape_x1, shape_x2)
    if shape_x1[-1] == 1 and shape_x2[-1] == 1 and shape_max[-1] == 1:
        shape_x1 = shape_x1 if len(shape_x1) == 1 else shape_x1[:-1]
        shape_x2 = shape_x2 if len(shape_x2) == 1 else shape_x2[:-1]
        shape_max = shape_max if len(shape_max) == 1 else shape_max[:-1]
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    shape_x1, shape_x2 = refine_shapes_for_broadcast(shape_x1, shape_x2)

    data_input_x1 = tvm.placeholder(shape_x1,
                                    name="data_input_x1", dtype=dtype_x1)
    data_input_x2 = tvm.placeholder(shape_x2,
                                    name="data_input_x2", dtype=dtype_x2)

    res = axpy_compute(data_input_x1, data_input_x2, y,
                       alpha, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data_input_x1, data_input_x2, res]}

    te.lang.cce.cce_build_code(schedule, config)
Example 21
def fused_minimum_or_maximum_grad_cce(
        shape_dz,
        shape_x,
        shape_y,
        grad_x=True,
        grad_y=True,
        cmp_type="LE",
        dtype="float32",
        kernel_name="cce_fused_minimum_or_maximum_grad",
        need_build=False,
        need_print=False):
    """
    algorithm:
    calculating minimum or maximum_grad of the two input data

    Parameters
    ----------
    shape_dz: list or tuple.
        shape of data_inputdz
    shape_x: list or tuple.
        shape of data_inputx
    shape_y: list or tuple.
        shape of data_inputy
    grad_x: bool
        if grad_x is true,output need return dx
    grad_y: bool
        if grad_y is true,output need return dy
    cmp_type: str
        LessEqual or GreaterEqual
    dtype: str
        the data type, assume src_dtype equals dst_dtype,
        only support float16, float32, int32
    kernel_name: str
        cce kernel name, default value is "cce_fused_minimum_or_maximum_grad"
    need_build: bool
        if need to build CCEC kernel, default value is False
    need_print: bool
        if need to print the ir, default value is False

    Returns:
    -------
    none.
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)
    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    util.check_shape_rule(shape_max)
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)
    if list(shape_dz) != list(shape_max):
        raise RuntimeError(
            "fused_minimum_or_maximum_grad_cce shape_dz != shape_max")

    dtype = dtype.lower()
    if dtype not in ["float16", "float32", "int32"]:
        raise RuntimeError("fused_minimum_or_maximum_grad_cce only support"
                           " float16, float32, int32")

    if (grad_x, grad_y) == (False, False):
        raise RuntimeError("grad_x and grad_x at least one is true")

    placeholders = []
    placeholders.append(tvm.placeholder(shape_dz, name="input_dz",
                                        dtype=dtype))
    placeholders.append(tvm.placeholder(shape_x, name="input_x", dtype=dtype))
    placeholders.append(tvm.placeholder(shape_y, name="input_y", dtype=dtype))

    outs = fused_minimum_or_maximum_grad_compute(placeholders, shape_x,
                                                 shape_y, shape_dz, cmp_type,
                                                 dtype)

    with tvm.target.cce():
        if (grad_x, grad_y) == (True, False):
            sch = generic.auto_schedule(outs[0])
            outs = [outs[0]]
        if (grad_x, grad_y) == (False, True):
            sch = generic.auto_schedule(outs[1])
            outs = [outs[1]]
        if (grad_x, grad_y) == (True, True):
            sch = generic.auto_schedule(outs)

    config = {
        "print_ir": need_print,
        "need_build": need_build,
        "name": kernel_name,
        "tensor_list": placeholders + outs
    }

    te.lang.cce.cce_build_code(sch, config)
Example 22
def fake_quant_with_min_max_update(x,
                                   min_val,
                                   max_val,
                                   min_up,
                                   max_up,
                                   ema,
                                   ema_decay,
                                   symmetric,
                                   narrow_range,
                                   training,
                                   num_bits,
                                   quant_delay,
                                   kernel_name="fake_quant_update"):
    """FakeQuantWithMinMax op"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]), )
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    if symmetric:
        quant_min = 0 - 2**(num_bits - 1)
        quant_max = 2**(num_bits - 1) - 1
    else:
        quant_min = 0
        quant_max = 2**num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res_list = fake_quant_with_min_max_update_compute(input_data, min_data,
                                                      max_data, ema, ema_decay,
                                                      quant_min, quant_max,
                                                      training, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
Example 23
def custom_truncatemod(shape1, shape2, dtype, kernel_name="cce_tf_truncatemod",
                       need_build=False, need_print=False):
    """
    do element-wise truncatemod operation between two input tensors

    Parameters:
    ----------
    shape1 : shape of input data1

    shape2 : shape of input data2

    dtype : source data type, support float16,float32,int32

    kernel_name : cce kernel name, default value is "cce_tf_truncatemod"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    max_dim = 8
    shape1_len = len(shape1)
    shape2_len = len(shape2)
    if shape1_len > max_dim or shape2_len > max_dim:
        raise RuntimeError(
            "mod_cce only supports up to %d dimensions while the shapes' "
            "dimensions are %d, %d" % (max_dim, shape1_len, shape2_len))
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape1)
    util.check_shape_rule(shape2)

    util.check_shape_size(shape1, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape2, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    device_api_map = {"float16": "cc_device_truncatemod_float16",
                      "float32": "cc_device_truncatemod_float",
                      "int32": "cc_device_truncatemod_int32"}

    dtype = dtype.lower()
    if dtype not in check_list:
        raise RuntimeError(
            "tf_truncatemod_cce only support %s while dtype is %s" % (
                ",".join(check_list), dtype))

    shape1, shape2, shape_out = util.produce_shapes(shape1, shape2)
    util.check_shape_size(shape_out, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()

    device_api = device_api_map[inp_dtype]

    # block
    block_num = "block_num"
    block_idx = "block_idx"
    # x param
    v_xndim_cnt = tvm.const(len(shape1), "int32")
    p_xshape = util.create_param_ptr(shape1, "int32", "p_xshape")
    xpad_c0 = tvm.const(0, "int32")
    data_input_x = tvm.placeholder(shape1, name="data_input_x",
                                   dtype=inp_dtype)
    # y param
    v_yndim_cnt = tvm.const(len(shape2), "int32")
    p_yshape = util.create_param_ptr(shape2, "int32", "p_yshape")
    ypad_c0 = tvm.const(0, "int32")
    data_input_y = tvm.placeholder(shape2, name="data_input_y",
                                   dtype=inp_dtype)
    # output
    v_out_ndim_cnt = tvm.const(len(shape_out), "int32")
    p_out_shape = util.create_param_ptr(shape_out, "int32", "p_out_shape")
    out_padc0 = tvm.const(0, "int32")

    output = tvm.extern(shape_out,
                        [p_xshape, data_input_x, p_yshape, data_input_y,
                         p_out_shape], lambda ins, outs:
                        tvm.call_extern("int32_t", device_api,
                                        block_num,
                                        block_idx,
                                        v_xndim_cnt,
                                        ins[0].access_ptr("r"),  # shape x
                                        xpad_c0,
                                        ins[1].access_ptr("r"),  # input x
                                        v_yndim_cnt,
                                        ins[2].access_ptr("r"),  # shape y
                                        ypad_c0,
                                        ins[3].access_ptr("r"),  # input y
                                        v_out_ndim_cnt,
                                        ins[4].access_ptr("r"),  # shape out
                                        out_padc0,
                                        outs[0].access_ptr("w")),
                        name="output", dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    # print IR
    if need_print:
        with build_config:
            print(tvm.lower(schedule, [data_input_x, data_input_y, output],
                            simple_mode=True))
    # Compile to generate the cce file
    if need_build:
        with build_config:
            tvm.build(schedule, [data_input_x, data_input_y, output], "cce",
                      name=kernel_name)
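truncatemod follows C-style truncated division, so the remainder keeps the sign of the dividend: rem = x - trunc(x / y) * y. The device API itself is not shown here; as an illustration, np.fmod implements exactly this truncated remainder for floats:

import numpy as np

x = np.array([7.0, -7.0, 7.0, -7.0], dtype=np.float32)
y = np.array([3.0, 3.0, -3.0, -3.0], dtype=np.float32)
print(np.fmod(x, y))             # [ 1. -1.  1. -1.]
print(x - np.trunc(x / y) * y)   # same values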
Example 24
def leaky_relu_grad_compute(g,
                            x,
                            y,
                            negative_slope=0,
                            kernel_name="leaky_relu_grad"):
    """
    calculate the backpropagation of leaky_relu operation
    y = gradients(x>0) or negative_slope*gradients(x<=0).

    Parameters
    ----------
    g : TVM tensor
        the placeholder of input g
    x : TVM tensor
        the placeholder of input x
    y : dict
        dict of output y, include keys(shape and dtype)
    negative_slope : float or int
        allow non-zero slope for negative inputs to speed up optimization
    kernel_name : str
        kernel name, default value is "leaky_relu_grad"

    Returns
    -------
    res: TVM tensor
        the result of leaky_relu_grad_compute
    """

    shape_list = util.produce_shapes(te.lang.cce.util.shape_to_list(g.shape),
                                     te.lang.cce.util.shape_to_list(x.shape))
    util.check_tensor_shape_size(shape_list[2])

    dtype = g.dtype
    g = te.lang.cce.broadcast(g, shape_list[2])
    x = te.lang.cce.broadcast(x, shape_list[2])

    if dtype == "float32":
        help_min = tvm.const(2**(-126), "float32")
        help_rec_one = tvm.const(2**38, "float32")
        help_rec_sec = tvm.const(2**44, "float32")
    elif dtype == "float16":
        help_min = tvm.const(2**(-24), "float16")
        help_rec_one = tvm.const(2**12, "float16")
        help_rec_sec = help_rec_one

    tmp_min_x = te.lang.cce.vmins(x, help_min)
    tmp_max_x = te.lang.cce.vmaxs(tmp_min_x, tvm.const(SCALAR_ZERO, "float32"))
    tmp_mul_x = te.lang.cce.vmuls(tmp_max_x, help_rec_one)

    if dtype == "float32":
        tmp_mul_x = te.lang.cce.vmuls(tmp_mul_x, help_rec_sec)

    result_tmp_right = te.lang.cce.vmuls(tmp_mul_x, help_rec_sec)

    result_sub = te.lang.cce.vadds(result_tmp_right,
                                   tvm.const(NEGATIVE_ONE, "float32"))
    result_abs = te.lang.cce.vabs(result_sub)
    result_tmp_left = te.lang.cce.vmuls(result_abs, negative_slope)

    result_tmp = te.lang.cce.vadd(result_tmp_left, result_tmp_right)

    res = te.lang.cce.vmul(g, result_tmp)
    return res
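The vmins/vmaxs/vmuls chain above is a branch-free way to build a 0/1 indicator of x > 0: clamping x to [0, help_min] and scaling by 2**38 * 2**44 * 2**44 (which exactly cancels 2**-126 for float32) drives every positive input to 1 and everything else to 0, and the abs trick then turns that into 1 for x > 0 and negative_slope for x <= 0. A NumPy sketch of the same idea for float32 inputs, illustrative only:

import numpy as np

def leaky_relu_grad_reference(g, x, negative_slope=0.0):
    # branch-free 0/1 indicator of x > 0, mirroring the vmins/vmaxs/vmuls chain
    help_min, help_rec_one, help_rec_sec = 2.0**-126, 2.0**38, 2.0**44
    right = np.maximum(np.minimum(x, help_min), 0.0)
    right = right * help_rec_one * help_rec_sec * help_rec_sec
    factor = negative_slope * np.abs(right - 1.0) + right   # 1 where x > 0 (normal floats), slope elsewhere
    return g * factor

g = np.ones(4, dtype=np.float32)
x = np.array([-2.0, -0.5, 0.0, 3.0], dtype=np.float32)
print(leaky_relu_grad_reference(g, x, negative_slope=0.2))  # [0.2 0.2 0.2 1. ]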