Example No. 1
    def _im2col_fractal_indices(indices, A):
        """
        calculate im2col_fractal tvm lambda function
        Parameters
        ----------
        indices : indices in lambda function

        A : feature map

        -------
        Returns : im2col_fractal tvm lambda function
        """
        block_size = config['mac'][1]
        block_size_M = config['mac'][0]
        n, hw, c1, kernel_h, kernel_w, c0 = A.shape
        batch_size, i1, j1, i0, j0 = indices
        n_index = batch_size

        hw_index = i1 * block_size_M + i0

        c1_index = (((j1 * block_size + j0) // c0.value) //
                    kernel_w.value) // kernel_h.value

        kh_index = (((j1 * block_size + j0) // c0.value) //
                    kernel_w.value) % kernel_h.value

        kw_index = ((j1 * block_size + j0) // c0.value) % kernel_w.value

        c0_index = (j1 * block_size + j0) % c0.value

        dtype = compute_dtype
        return tvm.select(
            tvm.any(hw_index < 0, hw_index > hw.value - 1),
            tvm.const(0.0, dtype),
            A(n_index, hw_index, c1_index, kh_index, kw_index, c0_index))
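The fractal index arithmetic above can be checked in plain Python; the block and kernel sizes below are made-up values, not taken from the example:

# Plain-Python illustration of the fractal index decomposition above.
# block_size, c0, kernel_h and kernel_w are made-up values for the example.
block_size, c0, kernel_h, kernel_w = 16, 16, 3, 3

def decompose(j1, j0):
    flat = j1 * block_size + j0
    c1_index = ((flat // c0) // kernel_w) // kernel_h
    kh_index = ((flat // c0) // kernel_w) % kernel_h
    kw_index = (flat // c0) % kernel_w
    c0_index = flat % c0
    return c1_index, kh_index, kw_index, c0_index

print(decompose(9, 6))   # (1, 0, 0, 6): element 150 of the flattened j axis
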
Example No. 2
    def lambda_func(*indice):
        new_indice = [0] * 5
        if tensor_flag:
            new_indice[4] = indice[c0_index]
            new_indice[1] = indice[c1_index]

        if tensor_flag:
            return tvm.select(
                indice[c1_index] < x_shape_list[c1_index],
                tvm.vdeq_cast(x(*indice),
                              req_scale(*new_indice),
                              "int8",
                              do_relu=relu_flag), tvm.const(0, dtype="int8"))
        return tvm.select(
            indice[c1_index] < x_shape_list[c1_index],
            tvm.deq_cast(x(*indice), req_scale(*new_indice), "int8"),
            tvm.const(0, dtype="int8"))
Example No. 3
def _input_compute_generate(x, in_shape, read_shape, c1_dim, c1_index):
    """
    generate lambda func
    """
    x_shape = te.lang.cce.util.shape_to_list(x.shape)
    dtype = x.dtype
    x_slice_offset = _get_input_attr(x, "slice_offset", [], True)
    l1_fusion_flag = _get_input_attr(x, "l1_fusion_flag", -1, False)
    if not x_slice_offset:
        x_slice_offset = [0, 0, 0, 0, 0]

    if l1_fusion_flag != -1:
        x_w = x_shape[3]
        n_offset, _, h_offset, w_offset, _ = x_slice_offset
        if c1_dim % 2 == 0:
            input_ub = tvm.compute(
                in_shape,
                lambda n, c1, m, c0: x(n + n_offset, c1, (m // x_w) + h_offset,
                                       (m % x_w) + w_offset, c0),
                name="input_ub",
                attrs={"c_out": c1_dim})
        else:
            input_ub = tvm.compute(
                read_shape,
                lambda n, c1, m, c0: tvm.select(
                    c1 <= in_shape[c1_index] - 1,
                    x(n + n_offset, c1, (m // x_w) + h_offset,
                      (m % x_w) + w_offset, c0), tvm.const(0, dtype=dtype)),
                name='input_ub',
                attrs={"c_out": c1_dim})
    else:
        if c1_dim % 2 == 0:
            input_ub = tvm.compute(in_shape,
                                   lambda *i: x(*i),
                                   name="input_ub",
                                   attrs={"c_out": c1_dim})
        else:
            input_ub = tvm.compute(
                read_shape,
                lambda *indice: tvm.select(
                    indice[c1_index] <= in_shape[c1_index] - 1, x(*indice),
                    tvm.const(0, dtype=dtype)),
                name='input_ub',
                attrs={"c_out": c1_dim})
    return input_ub
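A NumPy sketch of the odd-c1_dim branch above: the tensor is read with one extra C1 slice and that slice is zero-filled, which is what the tvm.select does per element. All shapes here are made up.

import numpy as np

# x has an odd C1 dimension (3); read_shape rounds it up to 4 and the
# extra slice is filled with zeros, mirroring the tvm.select above.
x = np.arange(2 * 3 * 4 * 16, dtype=np.float32).reshape(2, 3, 4, 16)
read_shape = (2, 4, 4, 16)
input_ub = np.zeros(read_shape, dtype=x.dtype)
input_ub[:, :x.shape[1], :, :] = x
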
Example No. 4
def custom_logical_not(shape,
                       dtype,
                       kernel_name="cce_tf_logical_not",
                       need_build=False,
                       need_print=False):
    """
    logical not for the input tensor

    Parameters
    ----------
    shape : input shape of data

    dtype : the data type, support bool

    kernel_name : cce kernel name, default value is "cce_tf_logical_not"

    need_build : if need to build the CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)

    check_list = ["bool"]
    if dtype.lower() not in check_list:
        raise RuntimeError(
            "logical_not_cce only supports %s while dtype is %s" %
            (",".join(check_list), dtype))

    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()

    data = tvm.placeholder(shape, name="data", dtype=inp_dtype)

    with tvm.target.cce():

        result = tvm.compute(
            shape,
            # "data[i] is True" would be a Python identity check that never
            # builds a TVM expression, so the comparison must be an equality
            lambda *i: tvm.select(data[i] == True, False, True),
            name="result")

        schedule = tvm.create_schedule(result.op)

        if need_print:
            with build_config:
                print(tvm.lower(schedule, [data, result], simple_mode=True))
        if need_build:
            with build_config:
                tvm.build(schedule, [data, result], "cce", name=kernel_name)
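For reference, a call to this op might look like the sketch below; the shape and kernel name are made up and the surrounding TE/CCE imports (util, tvm, build_config) are assumed to be in scope.

# Hypothetical invocation (sketch only, requires the TE/CCE environment).
custom_logical_not((8, 16),
                   "bool",
                   kernel_name="cce_tf_logical_not_8_16",
                   need_build=True,
                   need_print=False)
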
Example No. 5
    def lambda_func(*indice):
        new_indice = [0] * 5
        if tensor_flag:
            new_indice[4] = indice[c0_index]
            new_indice[1] = indice[c1_index]

        return tvm.select(
            indice[c1_index] < x_shape_list[c1_index],
            tvm.conv_vdeq(res_s16(*indice),
                          req_scale(*new_indice)).astype("int8"),
            tvm.const(0, dtype="int8"))
Example No. 6
    def _im2col_row_major_indices(indices, A, kernel_h, kernel_w, padding,
                                  stride, dilate):
        """
        calculate im2col_row_major tvm lambda function
        Parameters
        ----------
        indices : indices in lambda function

        A : feature map

        kernel_h: the kernel value in  h

        kernel_w: the kernel value in  w

        padding: the padding shape

        stride: the stride value

        -------
        Returns  im2col_row_major tvm lambda function
        """
        in_n, in_c1, in_h, in_w, in_c0 = A.shape

        n, hw, c1, kh, kw, c0 = indices
        stride_h, stride_w = stride
        dilate_h, dilate_w = dilate
        padding_top, padding_bottom, padding_left, padding_right = padding

        kernel_dilate_w = (kernel_w - 1) * dilate[1] + 1

        width_out = (in_w.value + padding_left + padding_right -
                     kernel_dilate_w) // (stride_w) + 1

        n_index = n
        c1_index = c1
        h_index = (hw // width_out) * stride_h + (kh * dilate_h)
        w_index = (hw % width_out) * stride_w + (kw * dilate_w)
        c0_index = c0
        return tvm.select(
            tvm.any(h_index < padding_top,
                    h_index > in_h.value + padding_top - 1,
                    w_index < padding_left,
                    w_index > in_w.value + padding_left - 1),
            tvm.const(0.0, compute_dtype),
            A(n_index, c1_index, h_index - padding_top, w_index - padding_left,
              c0_index))
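The output-width expression above is the usual convolution sizing rule; a worked example with made-up numbers:

# width_out for in_w=28, padding_left=1, padding_right=1,
# kernel_w=3, dilate_w=2, stride_w=2
kernel_dilate_w = (3 - 1) * 2 + 1                     # 5
width_out = (28 + 1 + 1 - kernel_dilate_w) // 2 + 1   # 13
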
Example No. 7
    def _copy_from_input_value(index, shape):
        for idx, _ in enumerate(shape):
            i = len(shape) - idx - 1
            if idx == 0:
                select_result = tvm.select(
                    begin[i] <= index[i],
                    input_value_ub(*_map_input_value_index(*index)))
                select_result = tvm.select(
                    (index[i] - begin[i]) % strides[i] == 0, select_result)
                select_result = tvm.select(end[i] > index[i], select_result)
            else:
                select_result = tvm.select(begin[i] <= index[i], select_result)
                select_result = tvm.select(
                    (index[i] - begin[i]) % strides[i] == 0, select_result)
                select_result = tvm.select(end[i] > index[i], select_result)
        return select_result
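The chained tvm.select calls above build a per-axis strided-slice predicate (begin <= i, i < end, (i - begin) % stride == 0). An equivalent boolean mask in NumPy, with made-up begin/end/strides values:

import numpy as np

begin, end, strides = (1, 0), (5, 6), (2, 3)
index = np.indices((6, 6))         # index[i] is the coordinate grid of axis i
mask = np.ones((6, 6), dtype=bool)
for i in range(2):
    mask &= (begin[i] <= index[i]) & (index[i] < end[i]) \
            & ((index[i] - begin[i]) % strides[i] == 0)
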
Example No. 8
def avg_pool_grad_compute(input_shape, weight, out, vealuemean, k_sizes,
                          strides, padding):
    """
    Computes the gradients of avg pool, insert input.

    Parameters
    ----------
    input_shape: a list or tuple representing the shape of input,
                6D format [N, C1, 1, H, W, C0]

    weight: a tensor, 5D with shape [C1, Hf*Wf, 1, C0, C0]

    out: a tensor, 6D format [N, Co1, 1, Ho, Wo, C0]

    weight_sizes: a list or tuple of two ints,[H, W]

    strides: a list or tuple of two ints,[H, W]

    padding: only support "SAME" yet, the type of padding algorithm to use

    Returns
    -------
    dx_res: compute of the gradients of avg pool grad
    """
    out_type = out.dtype
    _, _, _, input_h, input_w, _ = input_shape
    k_height, k_width = k_sizes
    out_shape = (int(i.value) for i in out.shape)
    out_n, out_cgroup, out_c1, out_h, out_w, out_c0 = out_shape
    out_mul_shape = out_n, out_cgroup, out_c1, out_h, out_w, out_c0
    out_mul = tvm.compute(out_mul_shape,
                          lambda *i: out(*i) * vealuemean(*i),
                          name='out_mul')

    dilated_shape, dilated_pad = calculation_dilation(input_shape, k_sizes,
                                                      strides, padding)
    dilated_strides = (1, 1)

    # compute of out_backprop dilation
    out_dilated = tvm.compute(
        dilated_shape,
        lambda n, cg, c1, h, w, c0: tvm.select(
            tvm.all(h % strides[0] == 0, w % strides[1] == 0), out_mul[
                n, cg, c1, h // strides[0], w // strides[1], c0],
            tvm.const(0, out.dtype)),
        attrs={'strides': strides},
        name='out_dilated')

    # image to column of dilated out_backprop
    out_im2col_row_major_shape = (out_n, out_cgroup, input_h * input_w, out_c1,
                                  k_height, k_width, BLOCK_SIZE)
    out_col = common.im2col_6d(out_dilated, out_im2col_row_major_shape,
                               k_height, k_width, dilated_pad, dilated_strides)
    hiwi_mad = (input_h * input_w + BLOCK_SIZE - 1) // BLOCK_SIZE * BLOCK_SIZE

    dout_im2col_fractal_shape = (out_n, out_cgroup, hiwi_mad // BLOCK_SIZE,
                                 out_c1 * k_height * k_width, BLOCK_SIZE,
                                 BLOCK_SIZE)
    dout_col_pad = common.im2col_fractal_6d(dout_im2col_fractal_shape, out_col)
    # unused, kept as a pass-through; to be deleted
    weight_unuse = tvm.compute(weight.shape,
                               lambda *index: weight(*index),
                               name='weight_rotated')

    res_dtype = "float32"

    # matrix multiplication of dilated out_backprop and rotated weight
    mad_shape = (out_n, out_cgroup, out_c1, hiwi_mad, out_c0)
    mad_res = common.mad(mad_shape, dout_col_pad, weight_unuse, res_dtype)

    # cast dX from float32 to float16
    dx_cast = tvm.compute(mad_res.shape,
                          lambda *index: mad_res(*index).astype(out_type),
                          name='dx_cast')

    # remove the padding of dX
    res_shape = (out_n, out_cgroup, out_c1, input_h * input_w, out_c0)
    dx_res = tvm.compute(res_shape,
                         lambda *index: dx_cast(*index).astype(out_type),
                         name='dx_res',
                         attrs={
                             'weight_height': k_height,
                             'weight_width': k_width,
                             'dilated_pad': dilated_pad,
                             'dilated_strides': dilated_strides
                         })
    return dx_res
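A NumPy sketch of the out_backprop dilation step above: a value is kept only where h % stride_h == 0 and w % stride_w == 0, every other position becomes zero. The sizes are made up.

import numpy as np

strides = (2, 2)
out_mul = np.arange(1, 10, dtype=np.float32).reshape(3, 3)
dilated = np.zeros((5, 5), dtype=out_mul.dtype)    # (3 - 1) * 2 + 1 = 5
dilated[::strides[0], ::strides[1]] = out_mul
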
Example No. 9
def tanh_split_input_by_val(shape, input_x, symbol):
    """
    split input into two tensor by 0.5
    shape : tensor shape
    input_x : tensor
    symbol : tensor symbol
    return: res, operations, scope
    """
    res = {}
    operation = {}
    scope = {}
    dtype_x = input_x.dtype
    const_zero = tvm.const(0.0, dtype="float16")
    const_0 = tvm.const(0.5, dtype="float16")

    key = "input_abs_" + symbol
    input_abs = tvm.compute(shape, lambda *i: tvm.abs(input_x(*i)), name=key)
    res[key] = input_abs
    operation[key] = "vector_abs"
    scope[key] = cce.scope_ubuf

    # vcmp only support fp16
    if dtype_x == "float32":
        key = "cmp_val_fp16_" + symbol
        cmp_val_fp16 = tvm.compute(
            shape, lambda *i: topi.cast(input_abs(*i), "float16"), name=key)
        res[key] = cmp_val_fp16
        operation[key] = "vector_conv"
        scope[key] = cce.scope_ubuf

        key = "input_val_fp16_" + symbol
        input_val_fp16 = tvm.compute(
            shape, lambda *i: topi.cast(input_x(*i), "float16"), name=key)
        res[key] = input_val_fp16
        operation[key] = "vector_conv"
        scope[key] = cce.scope_ubuf

        key = "input_gt_fp16_" + symbol
        input_gt_fp16 = \
            tvm.compute(shape,
                        lambda *i: tvm.select(cmp_val_fp16(*i) > const_0,
                                              input_val_fp16(*i), const_zero),
                        name=key)
        res[key] = input_gt_fp16
        operation[key] = "vector_select_gt"
        scope[key] = cce.scope_ubuf

        key = "input_lt_fp16_" + symbol
        input_lt_fp16 = \
            tvm.compute(shape,
                        lambda *i: tvm.select(cmp_val_fp16(*i) <= const_0,
                                              input_val_fp16(*i), const_zero),
                        name=key)
        res[key] = input_lt_fp16
        operation[key] = "vector_select_le"
        scope[key] = cce.scope_ubuf

        key = "input_gt_" + symbol
        input_gt = tvm.compute(
            shape,
            lambda *i: topi.cast(input_gt_fp16(*i), "float32"),
            name=key)
        res[key] = input_gt
        operation[key] = "vector_conv"
        scope[key] = cce.scope_ubuf

        key = "input_lt_" + symbol
        input_lt = tvm.compute(
            shape,
            lambda *i: topi.cast(input_lt_fp16(*i), "float32"),
            name=key)
        res[key] = input_lt
        operation[key] = "vector_conv"
        scope[key] = cce.scope_ubuf
    else:
        key = "input_gt_" + symbol
        input_gt = tvm.compute(
            shape,
            lambda *i: tvm.select(
                input_abs(*i) > const_0, input_x(*i), const_zero),
            name=key)
        res[key] = input_gt
        operation[key] = "vector_select_gt"
        scope[key] = cce.scope_ubuf

        key = "input_lt_" + symbol
        input_lt = tvm.compute(
            shape,
            lambda *i: tvm.select(
                input_abs(*i) <= const_0, input_x(*i), const_zero),
            name=key)
        res[key] = input_lt
        operation[key] = "vector_select_le"
        scope[key] = cce.scope_ubuf

    return res, operation, scope
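A NumPy sketch of the split performed above: elements with |x| > 0.5 go to one tensor and the rest to the other, so the two parts always sum back to the input.

import numpy as np

x = np.array([-1.2, -0.3, 0.1, 0.7], dtype=np.float32)
input_gt = np.where(np.abs(x) > 0.5, x, 0.0)       # [-1.2, 0.0, 0.0, 0.7]
input_lt = np.where(np.abs(x) <= 0.5, x, 0.0)      # [ 0.0, -0.3, 0.1, 0.0]
assert np.allclose(input_gt + input_lt, x)
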
def _max_pool_grad_grad_with_argmax_compute(
        placeholders,
        x,
        argmax,
        grad,
        y,
        ksize,
        strides,
        padding="VALID",
        ori_format_x="NCHW",
        kernel_name="cce_max_pool_grad_grad_with_argmax"):
    """
    Computes second-order gradients of the maxpooling function.

    Parameters
    ----------
        x: dict
             Include info about ori_input,
             format, ori_format, shape, ori_shape, dtype.
        grad: dict
             Include info about grad of ori_input,
             format, ori_format, shape, ori_shape, dtype.
        argmax: dict
             Include info about ori_input,
             format, ori_format, shape, ori_shape, dtype.
        y: dict
             Include info about result of function,
             format, ori_format, shape, ori_shape, dtype.
        ksize: list or tuple
            The size of the window for each dimension of the input tensor.
        strides: list or tuple
            The stride of the sliding window of the input tensor.
        padding: str
            The type of padding algorithm to use.
            Only support "VALID" or "SAME"
        kernel_name: str
            Cce kernel name,
            default value is "cce_max_pool_grad_grad_with_argmax"
    Returns
    -------
        grad_in_l1:
            process of movement of grad from gm to l1.
        grad_im2col:
            process of vm tensor of grad on l1.
        grad_fractal:
            process of fractal of grad from l1 to ub.
        grad_fractal_transp:
            process of transposition of grad.
        argmax_ub:
            process of movement of argmax from gm to ub.
        tensor_zero_ub:
            process of movement of zero tensor from gm to ub.
        grad_grad_col:
            tensor after selection.
        grad_grad:
            tensor after reduce_sum.
        output_res:
            output of the calculation.
    """
    argmax_tensor = placeholders[1]
    grad_tensor = placeholders[2]

    (grad_n, grad_c1, grad_h, grad_w, grad_c0) = grad.get("shape")
    if ori_format_x == "NHWC":
        _, kernel_h, kernel_w, _ = ksize
        _, stride_h, stride_w, _ = strides
    else:
        _, _, kernel_h, kernel_w = ksize
        _, _, stride_h, stride_w = strides

    shape_max_pool_h, pad_top, pad_bottom = \
        common.tf_get_windowed_output_size_verbose(
            grad_h, kernel_h, stride_h, padding)

    shape_max_pool_w, pad_left, pad_right = \
        common.tf_get_windowed_output_size_verbose(
            grad_w, kernel_w, stride_w, padding)

    pad_list = (pad_top, pad_bottom, pad_left, pad_right)
    stride = (stride_h, stride_w)

    # howo must be multiple of 16
    howo = _ceil_to(shape_max_pool_h * shape_max_pool_w, BLOCK_SIZE)

    # copy argmax from gm to ub
    shape_argmax_ub = (grad_n, grad_c1 * kernel_h * kernel_w,
                       howo // BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE)
    argmax_ub = tvm.compute(shape_argmax_ub,
                            lambda *i: argmax_tensor(*i),
                            name='argmax_ub')

    # load3d compute
    shape_grad = (grad_n, grad_c1, grad_h, grad_w, grad_c0)
    grad_in_l1 = tvm.compute(shape_grad,
                             lambda *i: grad_tensor[i],
                             name="grad_in_l1")
    # n howo c1 kh kw c0
    shape_grad_vm = (grad_n, shape_max_pool_h * shape_max_pool_w, grad_c1,
                     kernel_h, kernel_w, grad_c0)
    grad_im2col = common.img2col(
        grad_in_l1,
        shape_grad_vm,
        kernel_h,
        kernel_w,
        pad_list,
        stride,
    )
    # n hw c1 kh kw c0  ->  n c1 kh kw hw c0
    shape_fractal = (grad_n, howo // BLOCK_SIZE, grad_c1 * kernel_h * kernel_w,
                     BLOCK_SIZE, BLOCK_SIZE)
    grad_fractal = common.im2col_fractal(shape_fractal,
                                         grad_im2col,
                                         "ca",
                                         tag='')

    # (n, howo/16, c1khkw, 16, c0) -> (n, c1khkw, howo/16, 16, c0)
    shape_grad_fractal_transp = (grad_n, grad_c1 * kernel_h * kernel_w,
                                 howo // BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE)
    grad_fractal_transp = tvm.compute(
        shape_grad_fractal_transp,
        lambda i, j, k, l, m: grad_fractal[i, k, j, l, m],
        name='grad_fractal_transp')

    # declare a zero tensor, and move to ub for vsel
    dtype_tensor_zero = grad_tensor.dtype
    shape_tensor_zero = (BLOCK_SIZE, )
    tensor_zero_ub = tvm.compute(
        shape_tensor_zero,
        lambda *i: tvm.const(0, dtype=dtype_tensor_zero),
        name='tensor_zero_ub')

    # vsel compute
    shape_grad_grad_col = (grad_n, grad_c1 * kernel_h * kernel_w,
                           howo // BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE)
    grad_grad_col = tvm.compute(
        shape_grad_grad_col,
        lambda i, j, k, l, m: tvm.select(argmax_ub[
            i, j, k, l, m], grad_fractal_transp[i, j, k, l, m], tensor_zero_ub[
                m]),
        name='grad_grad_col')

    # reduce_sum
    # (n, c1khkw, howo/16, 16, c0) -> (n, c1, howo/16, 16, c0)
    m = tvm.reduce_axis((0, kernel_h * kernel_w), "m")
    shape_grad_grad = (grad_n, grad_c1, howo // BLOCK_SIZE, BLOCK_SIZE,
                       BLOCK_SIZE)
    grad_grad = tvm.compute(
        shape_grad_grad,
        lambda i, j, n, p, q: tvm.sum(
            grad_grad_col[i, j * kernel_h * kernel_w + m, n, p, q], axis=[m]),
        name="grad_grad")

    extract_params = {}
    extract_params["padding_mode"] = padding
    extract_params["shape_max_pool_h"] = shape_max_pool_h
    extract_params["shape_max_pool_w"] = shape_max_pool_w
    extract_params["fmap_shape"] = shape_grad
    extract_params["ksizes"] = ksize
    extract_params["strides"] = strides
    extract_params["pad"] = pad_list
    extract_params["fmap_vm_shape"] = shape_grad_vm
    extract_params["fractal_shape"] = shape_fractal
    extract_params["HoWo"] = howo

    setfmatrix_dict = {
        "conv_kernel_h": kernel_h,
        "conv_kernel_w": kernel_w,
        "conv_padding_top": pad_top,
        "conv_padding_bottom": pad_bottom,
        "conv_padding_left": pad_left,
        "conv_padding_right": pad_right,
        "conv_stride_h": stride_h,
        "conv_stride_w": stride_w,
        "conv_fm_c": grad_c1 * grad_c0,
        "conv_fm_h": grad_h,
        "conv_fm_w": grad_w,
    }

    # UB to OUT
    output_res = tvm.compute(
        (grad_n, grad_c1, shape_max_pool_h * shape_max_pool_w, BLOCK_SIZE),
        lambda i, j, l, m: grad_grad[i, j, l // 16, l % 16, m],
        name="ub_to_out",
        attrs={
            'extract_params': extract_params,
            'setfmatrix_dict': setfmatrix_dict
        })

    return grad_in_l1, grad_im2col, grad_fractal, grad_fractal_transp, \
           argmax_ub, tensor_zero_ub, grad_grad_col, grad_grad, output_res
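The vsel plus reduce_sum stage above amounts to: pick the grad value wherever argmax is set, otherwise zero, then sum over the kernel-window positions. A NumPy sketch with made-up sizes:

import numpy as np

kh_kw, howo, c0 = 4, 16, 16
argmax = np.random.rand(kh_kw, howo, c0) > 0.5
grad = np.random.rand(kh_kw, howo, c0).astype(np.float32)
grad_grad = np.where(argmax, grad, 0.0).sum(axis=0)   # shape (howo, c0)
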
def custom_logical_and(shape_x,
                       shape_y,
                       dtype,
                       kernel_name="cce_tf_logical_and",
                       need_build=False,
                       need_print=False):
    """
    do element-wise logical-and operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input data1

    shape_y : shape of input data2

    dtype : source data type, support "bool"

    kernel_name : cce kernel name, default value is "cce_tf_logical_and"

    need_build : if need to build the CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)

    check_list = ["bool"]
    if dtype.lower() not in check_list:
        raise RuntimeError(
            "logical_and_cce only supports %s while dtype is %s" %
            (",".join(check_list), dtype))

    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()

    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    data1 = tvm.placeholder(shape_x, dtype=inp_dtype, name="data1")
    data2 = tvm.placeholder(shape_y, dtype=inp_dtype, name="data2")

    with tvm.target.cce():
        data1_tmp1 = te.lang.cce.broadcast(data1, shape_max)
        data1_tmp2 = te.lang.cce.broadcast(data2, shape_max)

        min_value = tvm.const(0, dtype=inp_dtype)
        res = tvm.compute(
            shape_max,
            lambda *i: tvm.select(
                tvm.all(
                    tvm.any(
                        data1_tmp1(*i) > min_value,
                        data1_tmp1(*i) < -min_value),
                    tvm.any(
                        data1_tmp2(*i) > min_value,
                        data1_tmp2(*i) < -min_value)), True, False),
            name="res")

        sch = tvm.create_schedule(res.op)

    if need_print:
        with build_config:
            print(tvm.lower(sch, [data1, data2, res], simple_mode=True))

    if need_build:
        with build_config:
            tvm.build(sch, [data1, data2, res], "cce", name=kernel_name)
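A call to this op might look like the sketch below; the shapes and kernel name are made up and the TE/CCE environment is assumed to be in scope.

# Hypothetical invocation (sketch only, requires the TE/CCE environment).
custom_logical_and((8, 16),
                   (8, 1),              # broadcast against the first shape
                   "bool",
                   kernel_name="cce_tf_logical_and_8_16",
                   need_build=True)
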
Example No. 12
def dynamic_lstm(input_x, weight, bias,
                 output_h, kernel_name="dynamic_lstm"):
    """
    x : dict
        A dict object, contains a Tensor 's type and
        shape and format, the type can be float32,
        the format can be [FRACTAL_NZ]
    w : dict
        A dict object, contains a Tensor 's type and
        shape and format, the type can be float32,
        the format can be [FRACTAL_ZN_LSTM]
    b : dict
        A dict object, contains a Tensor 's type and
        shape and format, the type can be float32,
        the format can be [ND]
    output_h : dict
        A dict object, contains a Tensor 's type and
        shape and format, the type can be float32,
        the format can be [FRACTAL_NZ]
    """

    check_dtype(input_x, weight, bias, output_h)

    shape_x_input = input_x.get("shape")
    shape_w_input = weight.get("shape")
    shape_b_input = bias.get("shape")
    shape_output = output_h.get("shape")

    check(shape_x_input, shape_w_input, shape_b_input, shape_output)

    scan_one_num = 1
    t_size = shape_x_input[0] + scan_one_num
    m_size = shape_x_input[2]
    k_size = shape_w_input[0]
    n_size = shape_w_input[1]
    hidden_size = shape_output[1]
    block_size = n_size // hidden_size
    in_x = k_size - hidden_size

    shape_b = (1, k_size, block_size, hidden_size, 16, 16)
    shape_c = (1, block_size, hidden_size, m_size, 16, 16)
    shape_bias = (1, block_size, hidden_size, 1, 1, 16)
    shape_x = (t_size, in_x, m_size, 16, 16)
    shape_h = (1, k_size - in_x, m_size, 16, 16)
    shape_i = (1, hidden_size, m_size, 16, 16)
    shape_i_t = (t_size, hidden_size, m_size, 16, 16)

    core_num = cce.get_soc_spec("CORE_NUM")
    # each core uses four int64 values, which is 32B aligned
    shape_sync = (4 * core_num,)

    k0_size = 16

    input_dtype = input_x.get("dtype")
    data_dtype = 'float16'
    sync_dtype = 'int64'

    # define placeholder
    input_x = tvm.placeholder(shape_x, dtype=input_dtype, name='input_x')
    weight = tvm.placeholder(shape_b, dtype=input_dtype, name='weight')
    bias = tvm.placeholder(shape_bias, name='bias', dtype=input_dtype)
    s_state_h = tvm.placeholder(shape_h, dtype=input_dtype, name='state_h')
    s_state_c = tvm.placeholder(shape_i, dtype=input_dtype, name='state_c')

    sync0 = tvm.placeholder(shape_sync, name="sync0", dtype='int64')

    # compute

    # weight needs to be moved to ub first and cast to float16
    weight_ub = \
        tvm.compute(
            shape_b,
            lambda *indices: weight(*indices),
            name="weight_ub")

    weight_fp16 = \
        tvm.compute(shape_b,
                    lambda *indices: weight_ub(*indices).astype(data_dtype),
                    name="weight_fp16")

    # input and s_state_h need to be moved to ub first and cast to float16
    shape_a_z_bigz = (t_size, m_size, k_size, 16, 16)

    # input and s_state_h are Nz and need to be transformed to zZ,
    # so axes 1 and 2 are swapped
    a_ub = tvm.compute(shape_a_z_bigz,
                       lambda *indice:
                       tvm.select(indice[2] < in_x,
                                  input_x[indice[0],
                                          indice[2],
                                          indice[1],
                                          indice[3],
                                          indice[4]],
                                  s_state_h[0,
                                            indice[2] - in_x,
                                            indice[1],
                                            indice[3],
                                            indice[4]]
                                  ),
                       name="a_ub", tag="concat")

    shape_a_z_bigz_1 = (1, m_size, k_size, 16, 16)

    a_ub_fp16 = \
        tvm.compute(shape_a_z_bigz_1,
                    lambda *indices: a_ub(*indices).astype(data_dtype),
                    name="a_ub_fp16")

    a_l1 = tvm.compute(shape_a_z_bigz_1,
                       lambda *indices: a_ub_fp16(*indices),
                       name='a_l1')
    b_l1 = tvm.compute(shape_b,
                       lambda *indices: weight_fp16(*indices),
                       name='b_l1')

    # shape_a_z_bigz_1 = (1, m_size, k_size, 16, 16)
    a_l0a = tvm.compute(shape_a_z_bigz, lambda *indices: a_l1(*indices), name="a_l0a")
    b_l0b = tvm.compute(shape_b, lambda *indices: b_l1(*indices), name="b_l0b")

    k1 = tvm.reduce_axis((0, k_size), name='k1')
    k0 = tvm.reduce_axis((0, k0_size), name='k0')

    c_l0c = tvm.compute(shape_c,
                        lambda t, nb_0, nb_1, mb, mp, np:
                        tvm.sum((a_l0a[t, mb, k1, mp, k0] * \
                                b_l0b[t, k1, nb_0, nb_1, np, k0]) \
                                .astype('float32'),
                                axis=[k1, k0]),
                        name='c_l0c')

    c_ub = tvm.compute(shape_c, lambda *indices: c_l0c(*indices), name="c_ub")

    bias_ub = tvm.compute(shape_bias,
                          lambda *indices: bias(*indices),
                          name='bias_ub')

    bias_bc_ub = te.lang.cce.broadcast(bias_ub, shape_c)
    c_ub_bias = te.lang.cce.vadd(c_ub, bias_bc_ub)

    # split matmul res
    i_t_index = 0
    j_t_index = 1
    f_t_index = 2
    o_t_index = 3
    i_t = \
        tvm.compute(shape_i,
                    lambda t, i, j, k, l: c_ub_bias(t, i_t_index, i, j, k, l),
                    name="i_t")
    j_t = \
        tvm.compute(shape_i,
                    lambda t, i, j, k, l: c_ub_bias(t, j_t_index, i, j, k, l),
                    name="j_t")
    f_t = \
        tvm.compute(shape_i,
                    lambda t, i, j, k, l: c_ub_bias(t, f_t_index, i, j, k, l),
                    name="f_t")
    o_t = \
        tvm.compute(shape_i,
                    lambda t, i, j, k, l: c_ub_bias(t, o_t_index, i, j, k, l),
                    name="o_t")

    f_t_sigmoid = sigmoid_compute(f_t)
    i_t_sigmoid = sigmoid_compute(i_t)
    o_t_sigmoid = sigmoid_compute(o_t)
    j_t_tanh = tanh_compute(j_t)

    c_t_tmp1 = te.lang.cce.vmul(s_state_c, f_t_sigmoid)
    c_t_tmp2 = te.lang.cce.vmul(j_t_tanh, i_t_sigmoid)
    update_c = te.lang.cce.vadd(c_t_tmp1, c_t_tmp2)

    update_c_gm = tvm.compute(shape_i_t,
                              lambda t, i, j, k, l: update_c(0, i, j, k, l),
                              name="update_c_gm")

    c_t_tanh = tanh_compute(update_c)

    update_h = te.lang.cce.vmul(c_t_tanh, o_t_sigmoid)
    update_h_gm = tvm.compute(shape_i_t,
                              lambda t, i, j, k, l: update_h(0, i, j, k, l),
                              name="update_h_gm")

    update_hc_vn = \
        tvm.compute(
            shape_i_t,
            lambda t, i, j, k, l: update_c_gm(0, i, j, k, l) +\
                                  update_h_gm(t, i, j, k, l),
            name="update_hc_vn")

    update_c_gm_vn = \
        tvm.compute(
            shape_i_t,
            lambda t, i, j, k, l: update_hc_vn(0, i, j, k, l),
            name="update_c_gm_vn")

    update_h_gm_vn = \
        tvm.compute(
            shape_i_t,
            lambda t, i, j, k, l: update_hc_vn(0, i, j, k, l),
            name="update_h_gm_vn")

    update_c_ub = \
        tvm.compute(
            shape_i,
            lambda t, i, j, k, l: update_c_gm_vn(t, i, j, k, l),
            name="update_c_ub")

    update_c_gm_2 = \
        tvm.compute(shape_i_t,
                    lambda t, i, j, k, l: update_c_ub(0, i, j, k, l),
                    name="update_c_gm_2")
    update_h_ub = \
        tvm.compute(
            shape_i,
            lambda t, i, j, k, l: update_h_gm_vn(t, i, j, k, l),
            name="update_h_ub")

    update_h_gm_2 = \
        tvm.compute(
            shape_i_t,
            lambda t, i, j, k, l: update_h_ub(0, i, j, k, l) +\
                                  update_c_gm_2(t, i, j, k, l),
            name="update_h_gm_2")

    update_h_gm_2_dummy = \
        tvm.compute(shape_i_t,
                    lambda t, i, j, k, l: update_h_gm_2(t, i, j, k, l),
                    name="update_h_gm_2_dummy")

    # state init
    init_shape = (1, hidden_size, m_size, 16, 16)

    s_state_h_ub = \
        tvm.compute(shape_h,
                    lambda *indices: tvm.const(0.0, dtype=input_dtype),
                    name='s_state_h_ub')
    s_state_c_ub = \
        tvm.compute(shape_i,
                    lambda *indices: tvm.const(0.0, dtype=input_dtype),
                    name='s_state_c_ub')

    s_init_h = \
        tvm.compute(
            init_shape,
            lambda _, i, j, k, l: s_state_h_ub[0, i, j, k, l],
            name="s_init_h")

    s_init_c = \
        tvm.compute(
            init_shape,
            lambda _, i, j, k, l: s_state_c_ub[0, i, j, k, l],
            name="s_init_c")

    # scan
    scan_h, scan_c = tvm.scan(
        [s_init_h, s_init_c],
        [update_h_ub, update_c_ub],
        [s_state_h, s_state_c],
        scan_update=[update_h_gm_2, update_h_gm_2_dummy],
        name="lstm_scan")

    # end compute

    # schedule
    s = tvm.create_schedule([scan_h.op, scan_c.op])

    new_build_list = [input_x, weight, bias, update_h_gm, update_c_gm,
                      sync0, update_h_gm_vn, update_c_gm_vn]

    def gen_reversed_subgraph_list(out_tensor, tensor_list):
        """
        traverse tensors by Depth-First-Search
        """
        if out_tensor is None:
            return
        stack = [out_tensor]
        visited_list = []
        while stack:
            cur_tensor = stack.pop()
            visited_list.append(cur_tensor)
            for in_tensor in cur_tensor.op.input_tensors:
                if in_tensor not in visited_list:
                    stack.append(in_tensor)
                    if "elewise" in in_tensor.op.tag or \
                            "broadcast" == in_tensor.op.tag:
                        if in_tensor not in tensor_list:
                            tensor_list.append(in_tensor)

    elewise_tensors = []
    gen_reversed_subgraph_list(update_h_gm, elewise_tensors)

    barrier_tensor = c_ub_bias
    elewise_before_barrier_tensors = [bias_bc_ub]

    # set scope
    s[a_l1].set_scope(cce.scope_cbuf)
    s[b_l1].set_scope(cce.scope_cbuf)
    s[a_l0a].set_scope(cce.scope_ca)
    s[b_l0b].set_scope(cce.scope_cb)
    s[c_l0c].set_scope(cce.scope_cc)
    s[c_ub].set_scope(cce.scope_ubuf)
    s[s_init_h].set_scope(cce.scope_ubuf)
    s[bias_ub].set_scope(cce.scope_ubuf)
    s[bias_bc_ub].set_scope(cce.scope_ubuf)
    s[scan_h].set_scope(cce.scope_ubuf)
    s[scan_c].set_scope(cce.scope_ubuf)
    s[update_h_ub].set_scope(cce.scope_ubuf)
    s[update_c_ub].set_scope(cce.scope_ubuf)
    s[s_state_h_ub].set_scope(cce.scope_ubuf)
    s[s_state_c_ub].set_scope(cce.scope_ubuf)

    s[weight_ub].set_scope(cce.scope_ubuf)
    s[weight_fp16].set_scope(cce.scope_ubuf)
    s[a_ub].set_scope(cce.scope_ubuf)
    s[a_ub_fp16].set_scope(cce.scope_ubuf)

    for tensor in elewise_tensors:
        s[tensor].set_scope(cce.scope_ubuf)

    # compute inline
    compute_inline_tensors = [i_t, j_t, f_t, o_t]
    for tensor in compute_inline_tensors:
        s[tensor].compute_inline()

    # matmul tiling
    factor_l1_m, factor_l1_n, factor_l1_k, \
    factor_l0_m, factor_l0_n, factor_l0_k = \
        _get_lstm_tiling(m_size, k_size, n_size)
    l1_n_outer, l1_n_inner = \
        s[c_l0c].split(c_l0c.op.axis[2],
                       factor=factor_l1_n // block_size)

    l1_m_outer, l1_m_inner = \
        s[c_l0c].split(c_l0c.op.axis[3],
                       factor=factor_l1_m)
    l1_k_outer, l1_k_inner = \
        s[c_l0c].split(c_l0c.op.reduce_axis[0],
                       factor=factor_l1_k)

    l0_n_outer, l0_n_inner = s[c_l0c].split(l1_n_inner,
                                            factor=factor_l0_n)
    l0_m_outer, l0_m_inner = s[c_l0c].split(l1_m_inner,
                                            factor=factor_l0_m)
    l0_k_outer, l0_k_inner = s[c_l0c].split(l1_k_inner,
                                            factor=factor_l0_k)

    s[c_l0c].reorder(l1_n_outer, c_l0c.op.axis[1],
                     l1_m_outer, l1_k_outer,
                     l0_n_outer, l0_m_outer, l0_k_outer,
                     l0_n_inner, l0_m_inner, c_l0c.op.axis[3 + 1],
                     c_l0c.op.axis[4 + 1], l0_k_inner,
                     c_l0c.op.reduce_axis[1])

    s[weight_ub].compute_at(s[c_l0c], l1_k_outer)
    s[weight_fp16].compute_at(s[c_l0c], l1_k_outer)
    s[a_ub].compute_at(s[c_l0c], l1_k_outer)
    s[a_ub_fp16].compute_at(s[c_l0c], l1_k_outer)

    s[a_l0a].compute_at(s[c_l0c], l0_k_outer)
    s[b_l0b].compute_at(s[c_l0c], l0_k_outer)
    s[a_l1].compute_at(s[c_l0c], l1_k_outer)
    s[b_l1].compute_at(s[c_l0c], l1_k_outer)

    ub_n_outer, ub_n_inner = \
        s[c_ub].split(c_ub.op.axis[2],
                      factor=factor_l1_n // block_size)

    ub_m_outer, ub_m_inner = s[c_ub].split(c_ub.op.axis[3],
                                           factor=factor_l1_m)
    s[c_ub].reorder(ub_n_outer, c_ub.op.axis[1], ub_m_outer,
                    ub_n_inner, ub_m_inner, c_ub.op.axis[4],
                    c_ub.op.axis[5])

    s[c_l0c].compute_at(s[c_ub], ub_n_outer)

    # elewise compute_at
    barrier_outer, barrier_inner = \
        s[barrier_tensor].split(barrier_tensor.op.axis[2],
                                factor=factor_l1_n // block_size)

    s[barrier_tensor].reorder(
        barrier_tensor.op.axis[0], barrier_outer,
        barrier_tensor.op.axis[1], barrier_inner,
        barrier_tensor.op.axis[3],
        barrier_tensor.op.axis[4],
        barrier_tensor.op.axis[5])

    s[c_ub].compute_at(s[barrier_tensor], barrier_outer)
    s[bias_ub].compute_at(s[barrier_tensor], barrier_outer)

    for tensor in elewise_before_barrier_tensors:
        s[tensor].compute_at(s[barrier_tensor], barrier_outer)

    vn_outer, vn_inner = \
        s[update_hc_vn].split(update_hc_vn.op.axis[0 + 1],
                              factor=factor_l1_n // block_size)

    second_split_factor = \
        (hidden_size // (factor_l1_n // block_size)) // core_num

    vn_o_outer, vn_o_inner = \
        s[update_hc_vn].split(vn_outer,
                              factor=second_split_factor)

    s[barrier_tensor].compute_at(s[update_hc_vn], vn_o_inner)

    for tensor in elewise_tensors:
        if tensor not in elewise_before_barrier_tensors:
            s[tensor].compute_at(s[update_hc_vn], vn_o_inner)

    s[update_c_gm].compute_at(s[update_hc_vn], vn_o_inner)
    s[update_h_gm].compute_at(s[update_hc_vn], vn_o_inner)

    second_split_factor = hidden_size // core_num

    res_h_outer, res_h_inner = \
        s[update_h_gm_2].split(update_h_gm_2.op.axis[1],
                               factor=hidden_size)

    s[update_hc_vn].compute_at(s[update_h_gm_2], update_h_gm_2.op.axis[0])

    s[update_c_gm_vn].compute_at(s[update_h_gm_2], res_h_outer)
    s[update_h_gm_vn].compute_at(s[update_h_gm_2], res_h_outer)
    s[update_c_ub].compute_at(s[update_h_gm_2], res_h_outer)
    s[update_c_gm_2].compute_at(s[update_h_gm_2], res_h_outer)
    s[update_h_ub].compute_at(s[update_h_gm_2], res_h_outer)

    s[update_h_gm_vn].bind_buffer(
        update_h_gm_vn.op.axis[0], 0,
        scan_h.op.scan_axis + res_h_outer)
    s[update_c_gm_vn].bind_buffer(
        update_c_gm_vn.op.axis[0], 0,
        scan_h.op.scan_axis + res_h_outer)

    # bind
    s[update_hc_vn].bind(vn_o_outer, tvm.thread_axis("blockIdx.x"))

    # multi core sync
    s[update_hc_vn].pragma(update_hc_vn.op.axis[0],
                          pragma_type="multicore_sync_wait_after",
                          pragma_value=sync0[0])
    s[update_hc_vn].pragma(update_hc_vn.op.axis[0],
                          pragma_type="multicore_sync_set_after",
                          pragma_value=sync0[0])

    # modify for extend
    s[input_x].bind_buffer(0, 0, scan_h.op.scan_axis)

    s[update_h_gm].buffer_tile((scan_h.op.scan_axis*1, 1),
                               (None, None), (None, None),
                               (None, None), (None, None))

    s[update_c_gm].buffer_tile((scan_h.op.scan_axis*1, 1),
                               (None, None), (None, None),
                               (None, None), (None, None))

    s[update_h_gm_2].buffer_tile((0, 1), (None, None), (None, None),
                                 (None, None), (None, None))
    s[update_c_gm_2].buffer_tile((0, 1), (None, None), (None, None),
                                 (None, None), (None, None))

    # buffer reuse
    s[update_h_gm].reused_by(update_h_gm_vn)
    s[update_c_gm].reused_by(update_c_gm_vn)

    # emit_insn
    s[a_l1].emit_insn(a_l1.op.axis[0], 'dma_copy')
    s[b_l1].emit_insn(b_l1.op.axis[0], 'dma_copy')
    s[a_l0a].emit_insn(a_l0a.op.axis[0], 'dma_copy')
    s[b_l0b].emit_insn(b_l0b.op.axis[0], 'dma_copy')

    s[weight_ub].emit_insn(weight_ub.op.axis[0], 'dma_copy')
    s[weight_fp16].emit_insn(weight_fp16.op.axis[0], 'vector_conv')

    s[a_ub].emit_insn(a_ub.op.axis[0], 'dma_copy')
    s[a_ub_fp16].emit_insn(a_ub_fp16.op.axis[0], 'vector_conv')

    mad_dict = {"mad_pattern": 0, "k_outer": [l1_k_outer, l0_k_outer]}
    s[c_l0c].emit_insn(l0_n_inner, 'mad', mad_dict)
    s[c_ub].emit_insn(ub_n_inner, 'dma_copy')

    s[s_init_h].emit_insn(s_init_h.op.axis[0], 'dma_copy')
    s[s_init_c].emit_insn(s_init_c.op.axis[0], 'dma_copy')
    s[bias_bc_ub].emit_insn(bias_bc_ub.op.axis[0], 'unified_broadcast')

    s[s_state_h_ub].emit_insn(s_state_h_ub.op.axis[0], 'broadcast')
    s[s_state_c_ub].emit_insn(s_state_c_ub.op.axis[0], 'broadcast')

    s[barrier_tensor].emit_insn(barrier_tensor.op.axis[1], 'vector_add')

    for tensor in elewise_tensors:
        if tensor != barrier_tensor:
            insn = get_emit_insn_map(tensor)
            s[tensor].emit_insn(tensor.op.axis[0], insn)

    s[bias_ub].emit_insn(bias_ub.op.axis[0], 'dma_copy')

    s[update_c_gm].emit_insn(s[update_c_gm].op.axis[1], 'dma_copy')
    s[update_h_gm].emit_insn(s[update_h_gm].op.axis[1], 'dma_copy')

    s[update_c_ub].emit_insn(update_c_ub.op.axis[1], 'dma_copy')
    s[update_h_ub].emit_insn(update_h_ub.op.axis[1], 'dma_copy')

    s[update_hc_vn].emit_insn(vn_inner, 'phony_insn')
    s[update_c_gm_vn].emit_insn(s[update_c_gm_vn].op.axis[0], 'phony_insn')
    s[update_h_gm_vn].emit_insn(s[update_h_gm_vn].op.axis[0], 'phony_insn')
    s[update_h_gm_2].emit_insn(res_h_inner, 'phony_insn')
    s[update_c_gm_2].emit_insn(s[update_c_gm_2].op.axis[0], 'phony_insn')
    s[update_h_gm_2_dummy].emit_insn(
        update_h_gm_2_dummy.op.axis[0], 'phony_insn')

    def _write_workspace_info(shape_list, dtype_list, sync_num, kernel_name):
        """
        modify json after build
        """
        def _write_code(wkspace_dict, fname):
            fname = os.path.realpath(fname)
            if fname.startswith(os.getcwd()):
                if os.path.exists(fname):
                    with open(fname, "r") as f:
                        load_dict = json.load(f)

                    load_dict.update(wkspace_dict)
                    with open(fname, "w") as f:
                        json.dump(load_dict, f,
                                  sort_keys=True, indent=4,
                                  separators=(',', ':'))

        def _get_data_width(ele):
            """
            get data width
            """
            m_sea = re.search(r'\d+', ele)
            if m_sea:
                return int(m_sea.group(0)) // 8
            return 0

        if not os.path.exists("kernel_meta"):
            os.mkdir("kernel_meta")
            os.chmod("kernel_meta", stat.S_IRWXU + stat.S_IRGRP + stat.S_IXGRP)

        num = len(shape_list)
        wkspace_dict = {}
        if num:
            total_size = [functools_reduce(lambda x, y: x * y, list_i) for
                          list_i in shape_list]

            addr_type_list = []
            for i, element in enumerate(dtype_list):
                total_size[i] = total_size[i] * _get_data_width(element)
                addr_type_list.append(0)

            if not os.path.exists("kernel_meta"):
                os.mkdir("kernel_meta")
                os.chmod("kernel_meta",
                         stat.S_IRWXU + stat.S_IRGRP + stat.S_IXGRP)

            wkspace_dict["workspace"] = {"num": num,
                                         "size": total_size,
                                         "type": addr_type_list}

        if sync_num:
            parameters_list = \
                (len(new_build_list) - 2 - sync_num) * [0, ] + sync_num * [1, ]
            wkspace_dict["parameters"] = parameters_list

        if wkspace_dict:
            _write_code(wkspace_dict, "kernel_meta/" + kernel_name + ".json")

    with build_config:
        tvm.build(s, new_build_list, "cce", name=kernel_name)
        _write_workspace_info(
            [shape_i_t, shape_sync],
            [input_dtype, sync_dtype],
            1, kernel_name)
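The element-wise part of the cell update built above is the standard LSTM recurrence; a scalar NumPy restatement (the gate values are made up):

import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

# i_t, j_t, f_t, o_t are the four hidden-size slices of the matmul + bias result
i_t, j_t, f_t, o_t = 0.2, -0.5, 1.0, 0.3
c_prev = 0.4
c_t = sigmoid(f_t) * c_prev + sigmoid(i_t) * np.tanh(j_t)   # update_c
h_t = sigmoid(o_t) * np.tanh(c_t)                           # update_h
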
Example No. 13
def prior_box_compute(feature, img, data_h, data_w, box_height, box_width, y, \
                      rec_img_h, rec_img_w, step_h, step_w, clip, offset, scale, variance):
    """
    calculating data

    Parameters
    ----------
    input_x : TVM tensor
        the placeholder of input_x
    output_y : dict
        dict of output_y, include keys(shape and dtype)
    kernel_name : str
        kernel name, default value is "prior_box"

    Returns
    -------
    output tensor
    """

    """
    TODO:
    Please refer to the TE DSL Manual, And code here with TE DSL.
    """
    tensor_dic = {}
    tensor_list = []
    op_list = []
    ins_list = []
    shape_data_h = data_h.get("shape")
    shape_data_w = data_w.get("shape")
    data_dtype = data_h.get("dtype")
    shape_box = box_height.get("shape")
    box_dtype = box_height.get("dtype")
    shape_image = img.get("shape")
    shape_image_h = shape_image[2]
    feature_input = tvm.placeholder(shape_data_h, name="feature_input", \
                                    dtype=data_dtype)
    img_input = tvm.placeholder((shape_image_h,), name="img_input", \
                                dtype=data_dtype)
    data_h_input = tvm.placeholder(shape_data_h, name="data_h_input", \
                                   dtype=data_dtype)
    data_w_input = tvm.placeholder(shape_data_w, name="data_w_input", \
                                   dtype=data_dtype)
    box_height_input = tvm.placeholder(shape_box, name="box_h_input", \
                                       dtype=box_dtype)
    box_width_input = tvm.placeholder(shape_box, name="box_w_input", \
                                      dtype=box_dtype)
    tensor_list.append(feature_input)
    tensor_list.append(img_input)
    tensor_list.append(data_h_input)
    tensor_list.append(data_w_input)
    tensor_list.append(box_height_input)
    tensor_list.append(box_width_input)

    feature_ub = tvm.compute(shape_data_h, lambda *i: feature_input(*i), \
                             name="feature_ub")
    img_ub = tvm.compute((shape_image_h,), lambda *i: img_input(*i), \
                         name="img_ub")
    tensor_dic["feature_ub"] = feature_ub
    op_list += [feature_ub]
    ins_list += ["dma_copy"]
    tensor_dic["img_ub"] = img_ub
    op_list += [img_ub]
    ins_list += ["dma_copy"]

    move_value = tvm.const(0.0, data_dtype)
    feature_move = tvm.compute(shape_data_h, \
                               lambda *i: feature_ub(*i) * move_value, \
                               name="feature_move")
    img_move = tvm.compute((shape_image_h,), \
                           lambda *i: img_ub(*i) * move_value, name="img_move")
    tensor_dic["feature_move"] = feature_move
    op_list += [feature_move]
    ins_list += ["vector_muls"]
    tensor_dic["img_move"] = img_move
    op_list += [img_move]
    ins_list += ["vector_muls"]

    data_h_ub_temp = tvm.compute(shape_data_h, lambda *i: data_h_input(*i), \
                                 name="data_h_ub_temp")
    data_w_ub = tvm.compute(shape_data_w, lambda *i: data_w_input(*i), \
                            name="data_w_ub")
    box_height_ub = tvm.compute(shape_box, lambda *i: box_height_input(*i), \
                                name="box_height_ub")
    box_width_ub = tvm.compute(shape_box, lambda *i: box_width_input(*i), \
                               name="box_width_ub")
    tensor_dic["data_h_ub_temp"] = data_h_ub_temp
    op_list += [data_h_ub_temp]
    ins_list += ["dma_copy"]
    tensor_dic["data_w_ub"] = data_w_ub
    op_list += [data_w_ub]
    ins_list += ["dma_copy"]
    tensor_dic["box_height_ub"] = box_height_ub
    op_list += [box_height_ub]
    ins_list += ["dma_copy"]
    tensor_dic["box_width_ub"] = box_width_ub
    op_list += [box_width_ub]
    ins_list += ["dma_copy"]

    data_h_ub_temp1 = tvm.compute(shape_data_h, \
                                  lambda *i: data_h_ub_temp(*i) + feature_move(*i), \
                                  name="data_h_ub_temp1")
    data_h_ub = tvm.compute(shape_data_h, \
                            lambda *i: data_h_ub_temp1(*i) + img_move[0], name="data_h_ub")
    tensor_dic["data_h_ub_temp1"] = data_h_ub_temp1
    op_list += [data_h_ub_temp1]
    ins_list += ["vector_add"]
    tensor_dic["data_h_ub"] = data_h_ub
    op_list += [data_h_ub]
    ins_list += ["vector_adds"]

    offset_value = tvm.const(offset, data_dtype)
    step_w_value = tvm.const(step_w, data_dtype)
    step_h_value = tvm.const(step_h, data_dtype)
    rec_img_w_value = tvm.const(rec_img_w, data_dtype)
    rec_img_h_value = tvm.const(rec_img_h, data_dtype)
    scale_value = tvm.const(scale, data_dtype)
    scale_oppo = 0 - scale
    scale_value_oppo = tvm.const(scale_oppo, data_dtype)

    # define 1 or 4 variance_value
    if len(variance) == 1:
        variance_value = tvm.const(variance[0], data_dtype)
    else:
        variance_value0 = tvm.const(variance[0], data_dtype)
        variance_value1 = tvm.const(variance[1], data_dtype)
        variance_value2 = tvm.const(variance[2], data_dtype)
        variance_value3 = tvm.const(variance[3], data_dtype)

    w_offset = tvm.compute(shape_data_w, \
                           lambda *i: data_w_ub(*i) + offset_value, name="w_offset")
    h_offset = tvm.compute(shape_data_h, \
                           lambda *i: data_h_ub(*i) + offset_value, name="h_offset")
    center_x = tvm.compute(shape_data_w, \
                           lambda *i: w_offset(*i) * step_w_value, name="center_x")
    center_y = tvm.compute(shape_data_h, \
                           lambda *i: h_offset(*i) * step_h_value, name="center_y")
    tensor_dic["w_offset"] = w_offset
    op_list += [w_offset]
    ins_list += ["vector_adds"]
    tensor_dic["h_offset"] = h_offset
    op_list += [h_offset]
    ins_list += ["vector_adds"]
    tensor_dic["center_x"] = center_x
    op_list += [center_x]
    ins_list += ["vector_muls"]
    tensor_dic["center_y"] = center_y
    op_list += [center_y]
    ins_list += ["vector_muls"]

    box_width_scale = tvm.compute(shape_box, \
                                  lambda *i: box_width_ub(*i) * scale_value, name="box_width_scale")
    box_height_scale = tvm.compute(shape_box, \
                                   lambda *i: box_height_ub(*i) * scale_value, name="box_height_scale")
    box_width_scale_oppo = tvm.compute(shape_box, \
                                       lambda *i: box_width_ub(*i) * scale_value_oppo, \
                                       name="box_width_scale_oppo")
    box_height_scale_oppo = tvm.compute(shape_box, \
                                        lambda *i: box_height_ub(*i) * scale_value_oppo, \
                                        name="box_height_scale_oppo")
    tensor_dic["box_width_scale"] = box_width_scale
    op_list += [box_width_scale]
    ins_list += ["vector_muls"]
    tensor_dic["box_height_scale"] = box_height_scale
    op_list += [box_height_scale]
    ins_list += ["vector_muls"]
    tensor_dic["box_width_scale_oppo"] = box_width_scale_oppo
    op_list += [box_width_scale_oppo]
    ins_list += ["vector_muls"]
    tensor_dic["box_height_scale_oppo"] = box_height_scale_oppo
    op_list += [box_height_scale_oppo]
    ins_list += ["vector_muls"]


    num_box = shape_box[0]
    h_length = shape_data_h[0]
    w_length = shape_data_w[0]

    center_x_minus_calc = tvm.compute((w_length, num_box), \
                                      lambda w, c: center_x[w, 0, 0, 0] + box_width_scale_oppo[c, 0, 0, 0], \
                                      name="center_x_minus_calc")
    center_y_minus_calc = tvm.compute((h_length, num_box), \
                                      lambda h, c: center_y[h, 0, 0, 0] + box_height_scale_oppo[c, 0, 0, 0], \
                                      name="center_y_minus_calc")
    center_x_add_calc = tvm.compute((w_length, num_box), \
                                    lambda w, c: center_x[w, 0, 0, 0] + box_width_scale[c, 0, 0, 0], \
                                    name="center_x_add_calc")
    center_y_add_calc = tvm.compute((h_length, num_box), \
                                    lambda h, c: center_y[h, 0, 0, 0] + box_height_scale[c, 0, 0, 0], \
                                    name="center_y_add_calc")
    tensor_dic["center_x_minus_calc"] = center_x_minus_calc
    op_list += [center_x_minus_calc]
    ins_list += ["vector_add"]
    tensor_dic["center_y_minus_calc"] = center_y_minus_calc
    op_list += [center_y_minus_calc]
    ins_list += ["vector_add"]
    tensor_dic["center_x_add_calc"] = center_x_add_calc
    op_list += [center_x_add_calc]
    ins_list += ["vector_add"]
    tensor_dic["center_y_add_calc"] = center_y_add_calc
    op_list += [center_y_add_calc]
    ins_list += ["vector_add"]

    top_data_xmin = tvm.compute((w_length, num_box), \
                                lambda *i: center_x_minus_calc(*i) * rec_img_w_value, \
                                name="top_data_xmin")
    top_data_ymin = tvm.compute((h_length, num_box), \
                                lambda *i: center_y_minus_calc(*i) * rec_img_h_value, \
                                name="top_data_ymin")
    top_data_xmax = tvm.compute((w_length, num_box), \
                                lambda *i: center_x_add_calc(*i) * rec_img_w_value, \
                                name="top_data_xmax")
    top_data_ymax = tvm.compute((h_length, num_box), \
                                lambda *i: center_y_add_calc(*i) * rec_img_h_value, \
                                name="top_data_ymax")
    tensor_dic["top_data_xmin"] = top_data_xmin
    op_list += [top_data_xmin]
    ins_list += ["vector_muls"]
    tensor_dic["top_data_ymin"] = top_data_ymin
    op_list += [top_data_ymin]
    ins_list += ["vector_muls"]
    tensor_dic["top_data_xmax"] = top_data_xmax
    op_list += [top_data_xmax]
    ins_list += ["vector_muls"]
    tensor_dic["top_data_ymax"] = top_data_ymax
    op_list += [top_data_ymax]
    ins_list += ["vector_muls"]

    top_data_res1 = tvm.compute((h_length, w_length, num_box), \
                                lambda a, b, c: top_data_xmin[b, c] + move_value, \
                                name="top_data_res1")
    top_data_res2 = tvm.compute((h_length, w_length, num_box), \
                                lambda a, b, c: top_data_ymin[a, c] + move_value, \
                                name="top_data_res2")
    top_data_res3 = tvm.compute((h_length, w_length, num_box), \
                                lambda a, b, c: top_data_xmax[b, c] + move_value, \
                                name="top_data_res3")
    top_data_res4 = tvm.compute((h_length, w_length, num_box), \
                                lambda a, b, c: top_data_ymax[a, c] + move_value, \
                                name="top_data_res4")
    tensor_dic["top_data_res1"] = top_data_res1
    op_list += [top_data_res1]
    ins_list += ["vector_add"]
    tensor_dic["top_data_res2"] = top_data_res2
    op_list += [top_data_res2]
    ins_list += ["vector_add"]
    tensor_dic["top_data_res3"] = top_data_res3
    op_list += [top_data_res3]
    ins_list += ["vector_add"]
    tensor_dic["top_data_res4"] = top_data_res4
    op_list += [top_data_res4]
    ins_list += ["vector_add"]
    top_data = tvm.compute((h_length, w_length, num_box, 4), \
                           lambda a, b, c, idx: tvm.select(idx < 1, top_data_res1[a, b, c], \
                                                           tvm.select(idx < 2, top_data_res2[a, b, c], \
                                                                      tvm.select(idx < 3, top_data_res3[a, b, c], \
                                                                                 top_data_res4[a, b, c], \
                                                                                 ))), name="top_data")


    tensor_dic["top_data"] = top_data
    op_list += [top_data]
    ins_list += ["data_mov"]

    top_data_true = top_data
    if clip:
        top_data_temp = tvm.compute((h_length, w_length, num_box, 4), \
                                    lambda *i: tvm.max(top_data(*i), 0), name="top_data_temp")
        top_data_true = tvm.compute((h_length, w_length, num_box, 4), \
                                    lambda *i: tvm.min(top_data_temp(*i), 1), name="top_data_true")
        tensor_dic["top_data_temp"] = top_data_temp
        op_list += [top_data_temp]
        ins_list += ["vector_maxs"]
        tensor_dic["top_data_true"] = top_data_true
        op_list += [top_data_true]
        ins_list += ["vector_mins"]

    if len(variance) == 1:
        variance_data = tvm.compute((h_length, w_length, num_box, 4), \
                                    lambda a, b, c, idx: tvm.select(idx < 1, variance_value, \
                                                                    tvm.select(idx < 2, variance_value, \
                                                                               tvm.select(idx < 3, variance_value, \
                                                                                          variance_value, \
                                                                                          ))), name="variance_data")
        tensor_dic["variance_data"] = variance_data
        op_list += [variance_data]
        ins_list += ["data_mov"]
    else:
        variance_data = tvm.compute((h_length, w_length, num_box, 4), \
                                    lambda a, b, c, idx: tvm.select(idx < 1, variance_value0, \
                                                                    tvm.select(idx < 2, variance_value1, \
                                                                               tvm.select(idx < 3, variance_value2, \
                                                                                          variance_value3, \
                                                                                          ))), name="variance_data")
        tensor_dic["variance_data"] = variance_data
        op_list += [variance_data]
        ins_list += ["data_mov"]

    y = tvm.compute((1, 2, h_length, w_length, num_box, 4), \
                    lambda i, idx, j, k, l, m: tvm.select(idx == 1, \
                                                          variance_data[j, k, l, m],
                                                          top_data_true[j, k, l, m], \
                                                          ), name='result')
    tensor_dic["y"] = y
    op_list += [y]
    ins_list += ["dma_copy"]
    tensor_list.append(y)
    return op_list, ins_list, tensor_dic, y, tensor_list
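The nested tvm.select above packs four per-coordinate tensors into a trailing axis of size 4. Below is a minimal sketch of the same packing pattern, assuming the legacy top-level tvm API used throughout these examples; the names xmin, xmax and packed are illustrative only and not part of the original kernel.

import tvm

n = 8
xmin = tvm.placeholder((n,), name="xmin", dtype="float16")
xmax = tvm.placeholder((n,), name="xmax", dtype="float16")
# idx == 0 reads xmin, idx == 1 reads xmax, mirroring how top_data interleaves
# top_data_res1..top_data_res4 along its last axis of size 4
packed = tvm.compute((n, 2),
                     lambda i, idx: tvm.select(idx < 1, xmin[i], xmax[i]),
                     name="packed")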
Ejemplo n.º 14
0
def _dynamic_gru_inner(input_list, custom_list):
    input_x = input_list[0]
    weight1 = input_list[1]
    weight2 = input_list[2]
    bias1 = input_list[3]
    bias2 = input_list[4]
    s_init_h_gm = input_list[5]
    s_state_h_gm_last = input_list[6]

    is_gate_output = custom_list[0]
    is_first_round = custom_list[1]
    is_global_init = custom_list[2]

    input_dtype = 'float16'
    bias_dtype = bias1.dtype
    fp16_input_output = bias_dtype == 'float16'

    shape_x_input = input_x.shape
    shape_w1_input = weight1.shape
    w1_size = 2
    w2_size = 1
    t_size = shape_x_input[0].value
    m_size = shape_x_input[2].value
    k_size = shape_w1_input[1].value
    hidden_size = shape_w1_input[3].value
    in_x = k_size - hidden_size

    shape_b_1 = (1, k_size, w1_size, hidden_size, 16, 16)
    shape_b_2 = (1, k_size, w2_size, hidden_size, 16, 16)
    shape_c_1 = (1, w1_size, hidden_size, m_size, 16, 16)
    shape_c_2 = (1, w2_size, hidden_size, m_size, 16, 16)
    shape_bias_1 = (1, w1_size, hidden_size, 1, 1, 16)
    shape_bias_2 = (1, hidden_size, 1, 1, 16)
    shape_i = (1, hidden_size, m_size, 16, 16)
    shape_i_t = (t_size, hidden_size, m_size, 16, 16)
    k0_size = 16
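    # note: the trailing (..., 16, 16) dims in the shapes above are the 16x16
    # fractal blocks consumed by the cube unit; k0_size is that block edge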

    if is_first_round and not is_global_init:
        s_state_h = tvm.compute(
            shape_i,
            lambda *indices: tvm.const(0.0, dtype='float32'),
            name='s_state_h')
        s_state_h_fp16 = tvm.compute(
            shape_i,
            lambda *indices: s_state_h(*indices).astype('float16'),
            name="s_state_h_fp16")
    else:
        last_h = s_init_h_gm if is_first_round else s_state_h_gm_last
        if fp16_input_output:
            s_state_h_fp16 = tvm.compute(shape_i,
                                         lambda *indices: last_h(*indices),
                                         name='s_state_h_fp16')
            s_state_h = tvm.compute(
                shape_i,
                lambda *indices: s_state_h_fp16(*indices).astype('float32'),
                name="s_state_h")
        else:
            s_state_h = tvm.compute(shape_i,
                                    lambda *indices: last_h(*indices),
                                    name='s_state_h')
            s_state_h_fp16 = tvm.compute(
                shape_i,
                lambda *indices: s_state_h(*indices).astype('float16'),
                name="s_state_h_fp16")

    # compute
    # input and s_state_h first need to be moved to UB and cast to float16
    shape_a_z_bigz = (1, m_size, k_size, 16, 16)

    # input and s_state_h are in Nz format and need to be transposed to zZ,
    # so swap axes 1 and 2
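    # the select below is an inline concat along the k axis: indices with
    # indice[2] < in_x read from input_x, the rest read s_state_h_fp16 shifted
    # back by in_x, so the A matrix effectively becomes [x | h]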
    a_l1_1 = tvm.compute(
        shape_a_z_bigz,
        lambda *indice: tvm.select(
            indice[2] < in_x, input_x[indice[0], indice[2], indice[1], indice[
                3], indice[4]], s_state_h_fp16[0, indice[2] - in_x, indice[1],
                                               indice[3], indice[4]]),
        name="a_l1_1",
        tag="concat")
    b_l1_1 = tvm.compute(shape_b_1,
                         lambda *indices: weight1(*indices),
                         name='b_l1_1')
    a_l0a_1 = tvm.compute(shape_a_z_bigz,
                          lambda *indices: a_l1_1(*indices),
                          name="a_l0a_1")
    b_l0b_1 = tvm.compute(shape_b_1,
                          lambda *indices: b_l1_1(*indices),
                          name="b_l0b_1")
    k1_1 = tvm.reduce_axis((0, k_size), name='k1_1')
    k0_1 = tvm.reduce_axis((0, k0_size), name='k0_1')
    c_l0c_1 = tvm.compute(shape_c_1,
                          lambda t, nb_0, nb_1, mb, mp, np:
                          tvm.sum((a_l0a_1[t, mb, k1_1, mp, k0_1] * \
                                   b_l0b_1[t, k1_1, nb_0, nb_1, np, k0_1]) \
                                  .astype('float32'),
                                  axis=[k1_1, k0_1]),
                          name='c_l0c_1')
    c_ub_1 = tvm.compute(shape_c_1,
                         lambda *indices: c_l0c_1(*indices),
                         name="c_ub_1")
    bias_ub_1 = tvm.compute(shape_bias_1,
                            lambda *indices: bias1(*indices),
                            name='bias_ub_1')
    bias_ub_1_fp32 = bias_ub_1
    if fp16_input_output:
        bias_ub_1_fp32 = tvm.compute(
            shape_bias_1,
            lambda *indices: bias_ub_1(*indices).astype('float32'),
            name="bias_ub_1_fp32")
    bias_bc_ub_1 = tbe.broadcast(bias_ub_1_fp32, shape_c_1)
    c_ub_bias_1 = tbe.vadd(c_ub_1, bias_bc_ub_1)

    # split matmul res
    r_t_index = 0
    i_t_index = 1
    r_t = tvm.compute(
        shape_i,
        lambda t, i, j, k, l: c_ub_bias_1(t, r_t_index, i, j, k, l),
        name="r_t")
    i_t = tvm.compute(
        shape_i,
        lambda t, i, j, k, l: c_ub_bias_1(t, i_t_index, i, j, k, l),
        name="i_t")
    r_t_sigmoid = _sigmoid_compute(r_t)
    i_t_sigmoid = _sigmoid_compute(i_t)
    r_t_mid = r_t_sigmoid
    i_t_mid = i_t_sigmoid
    if is_gate_output:
        if fp16_input_output:
            r_t_sigmoid_fp16 = tvm.compute(
                shape_i,
                lambda *indices: r_t_sigmoid(*indices).astype('float16'),
                name="r_t_sigmoid_fp16")
            i_t_sigmoid_fp16 = tvm.compute(
                shape_i,
                lambda *indices: i_t_sigmoid(*indices).astype('float16'),
                name="i_t_sigmoid_fp16")

            r_t_gm = tvm.compute(shape_i,
                                 lambda *indices: r_t_sigmoid_fp16(*indices),
                                 name="r_t_gm")
            i_t_gm = tvm.compute(shape_i,
                                 lambda *indices: i_t_sigmoid_fp16(*indices),
                                 name="i_t_gm")

            r_t_gm_back = tvm.compute(shape_i,
                                      lambda *indices: r_t_gm(*indices),
                                      name="r_t_gm_back")
            i_t_gm_back = tvm.compute(shape_i,
                                      lambda *indices: i_t_gm(*indices),
                                      name="i_t_gm_back")

            r_t_gm_back_fp32 = tvm.compute(
                shape_i,
                lambda *indices: r_t_gm_back(*indices).astype('float32'),
                name="r_t_gm_back_fp32")
            i_t_gm_back_fp32 = tvm.compute(
                shape_i,
                lambda *indices: i_t_gm_back(*indices).astype('float32'),
                name="i_t_gm_back_fp32")

            r_t_mid = r_t_gm_back_fp32
            i_t_mid = i_t_gm_back_fp32
        else:
            r_t_gm = tvm.compute(shape_i,
                                 lambda *indices: r_t_sigmoid(*indices),
                                 name="r_t_gm")
            i_t_gm = tvm.compute(shape_i,
                                 lambda *indices: i_t_sigmoid(*indices),
                                 name="i_t_gm")

            r_t_gm_back = tvm.compute(shape_i,
                                      lambda *indices: r_t_gm(*indices),
                                      name="r_t_gm_back")
            i_t_gm_back = tvm.compute(shape_i,
                                      lambda *indices: i_t_gm(*indices),
                                      name="i_t_gm_back")

            r_t_mid = r_t_gm_back
            i_t_mid = i_t_gm_back
    r_t_h = tbe.vmul(r_t_mid, s_state_h)
    r_t_h_fp16 = \
        tvm.compute(shape_i,
                    lambda *indices: r_t_h(*indices).astype(input_dtype),
                    name="r_t_h_fp16")

    # second matmul
    a_l1_2 = tvm.compute(
        shape_a_z_bigz,
        lambda *indice: tvm.select(
            indice[2] < in_x, input_x[indice[0], indice[2], indice[1], indice[
                3], indice[4]], r_t_h_fp16[0, indice[2] - in_x, indice[1],
                                           indice[3], indice[4]]),
        name="a_l1_2",
        tag="concat")

    b_l1_2 = tvm.compute(shape_b_2,
                         lambda *indices: weight2(*indices),
                         name='b_l1_2')
    a_l0a_2 = tvm.compute(shape_a_z_bigz,
                          lambda *indices: a_l1_2(*indices),
                          name="a_l0a_2")
    b_l0b_2 = tvm.compute(shape_b_2,
                          lambda *indices: b_l1_2(*indices),
                          name="b_l0b_2")
    k1_2 = tvm.reduce_axis((0, k_size), name='k1_2')
    k0_2 = tvm.reduce_axis((0, k0_size), name='k0_2')
    c_l0c_2 = tvm.compute(shape_c_2,
                          lambda t, nb_0, nb_1, mb, mp, np:
                          tvm.sum((a_l0a_2[t, mb, k1_2, mp, k0_2] * \
                                   b_l0b_2[t, k1_2, nb_0, nb_1, np, k0_2]) \
                                  .astype('float32'),
                                  axis=[k1_2, k0_2]),
                          name='c_l0c_2')
    c_ub_2 = tvm.compute(shape_i,
                         lambda t, h, m, i, j: c_l0c_2(t, 0, h, m, i, j),
                         name="c_ub_2")
    bias_ub_2 = tvm.compute(shape_bias_2,
                            lambda t, h, m, i, j: bias2(t, h, m, i, j),
                            name='bias_ub_2')
    bias_ub_2_fp32 = bias_ub_2
    if fp16_input_output:
        bias_ub_2_fp32 = tvm.compute(
            shape_bias_2,
            lambda *indices: bias_ub_2(*indices).astype('float32'),
            name="bias_ub_2_fp32")
    bias_bc_ub_2 = tbe.broadcast(bias_ub_2_fp32, shape_i)
    c_ub_bias_2 = tbe.vadd(c_ub_2, bias_bc_ub_2)

    h_t_tanh = _tanh_compute(c_ub_bias_2)
    h_t_tanh_mid = h_t_tanh
    if is_gate_output:
        if fp16_input_output:
            h_t_tanh_fp16 = tvm.compute(
                shape_i,
                lambda *indices: h_t_tanh(*indices).astype('float16'),
                name="h_t_tanh_fp16")
            n_t_gm = tvm.compute(shape_i,
                                 lambda *indices: h_t_tanh_fp16(*indices),
                                 name="n_t_gm")
            n_t_gm_back = tvm.compute(shape_i,
                                      lambda *indices: n_t_gm(*indices),
                                      name="n_t_gm_back")
            n_t_gm_back_fp32 = tvm.compute(
                shape_i,
                lambda *indices: n_t_gm_back(*indices).astype('float32'),
                name="n_t_gm_back_fp32")
            h_t_tanh_mid = n_t_gm_back_fp32
        else:
            n_t_gm = tvm.compute(shape_i,
                                 lambda *indices: h_t_tanh(*indices),
                                 name="n_t_gm")
            n_t_gm_back = tvm.compute(shape_i,
                                      lambda *indices: n_t_gm(*indices),
                                      name="n_t_gm_back")
            h_t_tanh_mid = n_t_gm_back

    c_t_tmp1 = tbe.vsub(s_state_h, h_t_tanh_mid)
    c_t_tmp2 = tbe.vmul(c_t_tmp1, i_t_mid)
    update_h = tbe.vadd(c_t_tmp2, h_t_tanh_mid)
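    # the three ops above form the hidden-state update:
    # update_h = (s_state_h - n) * i + n  ==  i * h + (1 - i) * n,
    # with i_t_mid the update gate and h_t_tanh_mid the tanh candidate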
    update_h_ub = update_h
    if fp16_input_output:
        update_h_fp16 = tvm.compute(
            shape_i_t,
            lambda *indices: update_h(*indices).astype('float16'),
            name="update_h_fp16")
        update_h_ub = update_h_fp16
    update_y_gm = tvm.compute(shape_i_t,
                              lambda t, i, j, k, l: update_h_ub(0, i, j, k, l),
                              name="update_y_gm")
    update_y_gm_back = tvm.compute(
        shape_i_t,
        lambda t, i, j, k, l: update_y_gm(0, i, j, k, l),
        name="update_y_gm_back")
    update_h_gm = tvm.compute(
        shape_i_t,
        lambda t, i, j, k, l: update_y_gm_back(0, i, j, k, l),
        name="update_h_gm")
    # end compute

    # schedule
    s = tvm.schedule.create_schedule([update_h_gm.op])

    def gen_reversed_subgraph_list(out_tensor, tensor_list):
        """
        traverse tensors by Depth-First-Search
        """
        if out_tensor is None:
            return
        stack = [out_tensor]
        visited_list = []
        while stack:
            cur_tensor = stack.pop()
            visited_list.append(cur_tensor)
            for in_tensor in cur_tensor.op.input_tensors:
                if in_tensor not in visited_list:
                    stack.append(in_tensor)
                    if "elewise" in in_tensor.op.tag or \
                            "broadcast" == in_tensor.op.tag:
                        if in_tensor not in tensor_list:
                            tensor_list.append(in_tensor)

    elewise_tensors_r_t_h_fp16 = []
    gen_reversed_subgraph_list(r_t_h_fp16, elewise_tensors_r_t_h_fp16)

    elewise_tensors = []
    tmp_tensors = []
    gen_reversed_subgraph_list(update_h_gm, tmp_tensors)
    for i in tmp_tensors:
        if i not in elewise_tensors_r_t_h_fp16:
            elewise_tensors.append(i)
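    # elewise tensors feeding r_t_h_fp16 are attached at its outer axis below;
    # the remaining elementwise tensors are attached at update_h_gm's outer axis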

    # set scope
    s[s_state_h].set_scope(tbe_platform.scope_ubuf)
    s[s_state_h_fp16].set_scope(tbe_platform.scope_ubuf)
    s[a_l1_1].set_scope(tbe_platform.scope_cbuf)
    s[b_l1_1].set_scope(tbe_platform.scope_cbuf)
    s[a_l0a_1].set_scope(tbe_platform.scope_ca)
    s[b_l0b_1].set_scope(tbe_platform.scope_cb)
    s[c_l0c_1].set_scope(tbe_platform.scope_cc)
    s[c_ub_1].set_scope(tbe_platform.scope_ubuf)
    s[bias_ub_1].set_scope(tbe_platform.scope_ubuf)
    s[bias_bc_ub_1].set_scope(tbe_platform.scope_ubuf)
    s[r_t_h_fp16].set_scope(tbe_platform.scope_ubuf)
    s[a_l1_2].set_scope(tbe_platform.scope_cbuf)
    s[b_l1_2].set_scope(tbe_platform.scope_cbuf)
    s[a_l0a_2].set_scope(tbe_platform.scope_ca)
    s[b_l0b_2].set_scope(tbe_platform.scope_cb)
    s[c_l0c_2].set_scope(tbe_platform.scope_cc)
    s[c_ub_2].set_scope(tbe_platform.scope_ubuf)
    s[bias_ub_2].set_scope(tbe_platform.scope_ubuf)
    s[bias_bc_ub_2].set_scope(tbe_platform.scope_ubuf)
    s[update_y_gm_back].set_scope(tbe_platform.scope_ubuf)
    if is_gate_output:
        s[r_t_gm_back].set_scope(tbe_platform.scope_ubuf)
        s[i_t_gm_back].set_scope(tbe_platform.scope_ubuf)
        s[n_t_gm_back].set_scope(tbe_platform.scope_ubuf)
        if fp16_input_output:
            s[r_t_sigmoid_fp16].set_scope(tbe_platform.scope_ubuf)
            s[i_t_sigmoid_fp16].set_scope(tbe_platform.scope_ubuf)
            s[h_t_tanh_fp16].set_scope(tbe_platform.scope_ubuf)
            s[r_t_gm_back_fp32].set_scope(tbe_platform.scope_ubuf)
            s[i_t_gm_back_fp32].set_scope(tbe_platform.scope_ubuf)
            s[n_t_gm_back_fp32].set_scope(tbe_platform.scope_ubuf)
    if fp16_input_output:
        s[bias_ub_1_fp32].set_scope(tbe_platform.scope_ubuf)
        s[bias_ub_2_fp32].set_scope(tbe_platform.scope_ubuf)
        s[update_h_fp16].set_scope(tbe_platform.scope_ubuf)

    # compute inline
    compute_inline_tensors = [i_t, r_t]
    for tensor in compute_inline_tensors:
        s[tensor].compute_inline()
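    # inlining r_t/i_t folds the gate slicing into the consuming sigmoid
    # computations, so the slices are never materialized as separate UB buffers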

    # matmul tiling
    factor_l1_m, factor_l1_n, factor_l1_k, factor_l0_m, factor_l0_n, factor_l0_k = \
        _get_tiling(m_size, k_size, hidden_size)

    l1_n_outer_1, l1_n_inner_1 = s[c_l0c_1].split(c_l0c_1.op.axis[2],
                                                  factor=factor_l1_n)
    l1_m_outer_1, l1_m_inner_1 = s[c_l0c_1].split(c_l0c_1.op.axis[3],
                                                  factor=factor_l1_m)
    l1_k_outer_1, l1_k_inner_1 = s[c_l0c_1].split(c_l0c_1.op.reduce_axis[0],
                                                  factor=factor_l1_k)
    l0_n_outer_1, l0_n_inner_1 = s[c_l0c_1].split(l1_n_inner_1,
                                                  factor=factor_l0_n)
    l0_m_outer_1, l0_m_inner_1 = s[c_l0c_1].split(l1_m_inner_1,
                                                  factor=factor_l0_m)
    l0_k_outer_1, l0_k_inner_1 = s[c_l0c_1].split(l1_k_inner_1,
                                                  factor=factor_l0_k)
    s[c_l0c_1].reorder(c_l0c_1.op.axis[0], l1_n_outer_1, l1_k_outer_1,
                       c_l0c_1.op.axis[1], l1_m_outer_1, l0_n_outer_1,
                       l0_m_outer_1, l0_k_outer_1, l0_n_inner_1, l0_m_inner_1,
                       c_l0c_1.op.axis[4], c_l0c_1.op.axis[5], l0_k_inner_1,
                       c_l0c_1.op.reduce_axis[1])
    s[a_l1_1].double_buffer()
    s[b_l1_1].double_buffer()
    s[a_l0a_1].double_buffer()
    s[b_l0b_1].double_buffer()
    s[c_l0c_1].double_buffer()
    s[c_ub_1].double_buffer()
    s[a_l1_1].compute_at(s[c_l0c_1], l1_k_outer_1)
    s[b_l1_1].compute_at(s[c_l0c_1], c_l0c_1.op.axis[1])
    s[a_l0a_1].compute_at(s[c_l0c_1], l1_k_outer_1)
    s[b_l0b_1].compute_at(s[c_l0c_1], l0_k_outer_1)

    c_ub_bias_1_outer, c_ub_bias_1_inner = s[c_ub_bias_1].split(
        c_ub_bias_1.op.axis[2], factor=factor_l1_n)
    s[c_ub_bias_1].reorder(c_ub_bias_1.op.axis[0], c_ub_bias_1_outer,
                           c_ub_bias_1.op.axis[1], c_ub_bias_1_inner,
                           c_ub_bias_1.op.axis[3], c_ub_bias_1.op.axis[4],
                           c_ub_bias_1.op.axis[5])
    s[c_l0c_1].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    s[c_ub_1].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    s[bias_ub_1].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    s[bias_bc_ub_1].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    if fp16_input_output:
        s[bias_ub_1_fp32].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    s[c_ub_bias_1].emit_insn(c_ub_bias_1.op.axis[1], 'vector_add')

    r_t_h_fp16_outer, r_t_h_fp16_inner = s[r_t_h_fp16].split(
        r_t_h_fp16.op.axis[1], factor=factor_l1_n)
    for tensor in elewise_tensors_r_t_h_fp16:
        s[tensor].set_scope(tbe_platform.scope_ubuf)
        if tensor == c_ub_bias_1:
            continue
        s[tensor].compute_at(s[r_t_h_fp16], r_t_h_fp16_outer)
        insn = _get_emit_insn_map(tensor)
        s[tensor].emit_insn(tensor.op.axis[0], insn)
    if is_gate_output:
        s[r_t_gm].compute_at(s[r_t_h_fp16], r_t_h_fp16_outer)
        s[r_t_gm_back].compute_at(s[r_t_h_fp16], r_t_h_fp16_outer)
        if fp16_input_output:
            s[r_t_sigmoid_fp16].compute_at(s[r_t_h_fp16], r_t_h_fp16_outer)
            s[r_t_gm_back_fp32].compute_at(s[r_t_h_fp16], r_t_h_fp16_outer)
    s[r_t_h_fp16].emit_insn(r_t_h_fp16_inner, 'vector_conv')

    l1_n_outer_2, l1_n_inner_2 = s[c_l0c_2].split(c_l0c_2.op.axis[2],
                                                  factor=factor_l1_n)
    l1_m_outer_2, l1_m_inner_2 = s[c_l0c_2].split(c_l0c_2.op.axis[3],
                                                  factor=factor_l1_m)
    l1_k_outer_2, l1_k_inner_2 = s[c_l0c_2].split(c_l0c_2.op.reduce_axis[0],
                                                  factor=factor_l1_k)
    l0_n_outer_2, l0_n_inner_2 = s[c_l0c_2].split(l1_n_inner_2,
                                                  factor=factor_l0_n)
    l0_m_outer_2, l0_m_inner_2 = s[c_l0c_2].split(l1_m_inner_2,
                                                  factor=factor_l0_m)
    l0_k_outer_2, l0_k_inner_2 = s[c_l0c_2].split(l1_k_inner_2,
                                                  factor=factor_l0_k)
    s[c_l0c_2].reorder(c_l0c_2.op.axis[0], l1_n_outer_2, l1_k_outer_2,
                       c_l0c_2.op.axis[1], l1_m_outer_2, l0_n_outer_2,
                       l0_m_outer_2, l0_k_outer_2, l0_n_inner_2, l0_m_inner_2,
                       c_l0c_2.op.axis[4], c_l0c_2.op.axis[5], l0_k_inner_2,
                       c_l0c_2.op.reduce_axis[1])
    s[a_l1_2].double_buffer()
    s[b_l1_2].double_buffer()
    s[a_l0a_2].double_buffer()
    s[b_l0b_2].double_buffer()
    s[c_l0c_2].double_buffer()
    s[c_ub_2].double_buffer()
    s[a_l1_2].compute_at(s[c_l0c_2], l1_k_outer_2)
    s[b_l1_2].compute_at(s[c_l0c_2], c_l0c_2.op.axis[1])
    s[a_l0a_2].compute_at(s[c_l0c_2], l1_k_outer_2)
    s[b_l0b_2].compute_at(s[c_l0c_2], l0_k_outer_2)

    update_h_gm_outer, update_h_gm_inner = s[update_h_gm].split(
        update_h_gm.op.axis[1], factor=factor_l1_n)
    s[c_l0c_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[c_ub_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[bias_ub_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[bias_bc_ub_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[c_ub_bias_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[update_y_gm].compute_at(s[update_h_gm], update_h_gm_outer)
    s[update_y_gm_back].compute_at(s[update_h_gm], update_h_gm_outer)
    if fp16_input_output:
        s[bias_ub_2_fp32].compute_at(s[update_h_gm], update_h_gm_outer)
        s[update_h_fp16].compute_at(s[update_h_gm], update_h_gm_outer)
    if is_gate_output:
        s[i_t_gm].compute_at(s[update_h_gm], update_h_gm_outer)
        s[i_t_gm_back].compute_at(s[update_h_gm], update_h_gm_outer)
        s[n_t_gm].compute_at(s[update_h_gm], update_h_gm_outer)
        s[n_t_gm_back].compute_at(s[update_h_gm], update_h_gm_outer)
        if fp16_input_output:
            s[i_t_sigmoid_fp16].compute_at(s[update_h_gm], update_h_gm_outer)
            s[i_t_gm_back_fp32].compute_at(s[update_h_gm], update_h_gm_outer)
            s[h_t_tanh_fp16].compute_at(s[update_h_gm], update_h_gm_outer)
            s[n_t_gm_back_fp32].compute_at(s[update_h_gm], update_h_gm_outer)

    for tensor in elewise_tensors:
        s[tensor].set_scope(tbe_platform.scope_ubuf)
        s[tensor].compute_at(s[update_h_gm], update_h_gm_outer)
        insn = _get_emit_insn_map(tensor)
        s[tensor].emit_insn(tensor.op.axis[0], insn)

    # emit insn
    if is_first_round and not is_global_init:
        s[s_state_h].emit_insn(s_state_h.op.axis[0], 'broadcast')
        s[s_state_h_fp16].emit_insn(s_state_h_fp16.op.axis[0], 'vector_conv')
    else:
        if fp16_input_output:
            s[s_state_h_fp16].emit_insn(s_state_h_fp16.op.axis[0], 'dma_copy')
            s[s_state_h].emit_insn(s_state_h.op.axis[0], 'vector_conv')
        else:
            s[s_state_h].emit_insn(s_state_h.op.axis[0], 'dma_copy')
            s[s_state_h_fp16].emit_insn(s_state_h_fp16.op.axis[0],
                                        'vector_conv')

    s[a_l1_1].emit_insn(a_l1_1.op.axis[0], 'dma_copy')
    s[b_l1_1].emit_insn(b_l1_1.op.axis[0], 'dma_copy')
    s[a_l0a_1].emit_insn(a_l0a_1.op.axis[0], 'dma_copy')
    s[b_l0b_1].emit_insn(b_l0b_1.op.axis[0], 'dma_copy')
    mad_dict = {"mad_pattern": 0, "k_outer": [l1_k_outer_1, l0_k_outer_1]}
    s[c_l0c_1].emit_insn(l0_n_inner_1, 'mad', mad_dict)
    s[c_ub_1].emit_insn(c_ub_1.op.axis[0], 'dma_copy')
    s[bias_ub_1].emit_insn(bias_ub_1.op.axis[0], 'dma_copy')
    if fp16_input_output:
        s[bias_ub_1_fp32].emit_insn(bias_ub_1_fp32.op.axis[0], 'vector_conv')
        s[bias_ub_2_fp32].emit_insn(bias_ub_2_fp32.op.axis[0], 'vector_conv')
        s[update_h_fp16].emit_insn(update_h_fp16.op.axis[0], 'vector_conv')
    s[bias_bc_ub_1].emit_insn(bias_bc_ub_1.op.axis[0], 'unified_broadcast')
    s[a_l1_2].emit_insn(a_l1_2.op.axis[0], 'dma_copy')
    s[b_l1_2].emit_insn(b_l1_2.op.axis[0], 'dma_copy')
    s[a_l0a_2].emit_insn(a_l0a_2.op.axis[0], 'dma_copy')
    s[b_l0b_2].emit_insn(b_l0b_2.op.axis[0], 'dma_copy')
    mad_dict = {"mad_pattern": 0, "k_outer": [l1_k_outer_2, l0_k_outer_2]}
    s[c_l0c_2].emit_insn(l0_n_inner_2, 'mad', mad_dict)
    s[c_ub_2].emit_insn(c_ub_2.op.axis[0], 'dma_copy')
    s[bias_ub_2].emit_insn(bias_ub_2.op.axis[0], 'dma_copy')
    s[bias_bc_ub_2].emit_insn(bias_bc_ub_2.op.axis[0], 'unified_broadcast')
    s[update_y_gm].emit_insn(update_y_gm.op.axis[0], 'dma_copy')
    s[update_y_gm_back].emit_insn(update_y_gm_back.op.axis[0], 'phony_insn')
    s[update_y_gm_back].reused_by(update_h_ub)
    if is_gate_output:
        s[r_t_gm].emit_insn(r_t_gm.op.axis[0], 'dma_copy')
        s[i_t_gm].emit_insn(i_t_gm.op.axis[0], 'dma_copy')
        s[n_t_gm].emit_insn(n_t_gm.op.axis[0], 'dma_copy')
        s[r_t_gm_back].emit_insn(r_t_gm_back.op.axis[0], 'phony_insn')
        s[i_t_gm_back].emit_insn(i_t_gm_back.op.axis[0], 'phony_insn')
        s[n_t_gm_back].emit_insn(n_t_gm_back.op.axis[0], 'phony_insn')
        if fp16_input_output:
            s[r_t_sigmoid_fp16].emit_insn(r_t_sigmoid_fp16.op.axis[0],
                                          'vector_conv')
            s[i_t_sigmoid_fp16].emit_insn(i_t_sigmoid_fp16.op.axis[0],
                                          'vector_conv')
            s[h_t_tanh_fp16].emit_insn(h_t_tanh_fp16.op.axis[0], 'vector_conv')
            s[r_t_gm_back_fp32].emit_insn(r_t_gm_back_fp32.op.axis[0],
                                          'phony_insn')
            s[i_t_gm_back_fp32].emit_insn(i_t_gm_back_fp32.op.axis[0],
                                          'phony_insn')
            s[n_t_gm_back_fp32].emit_insn(n_t_gm_back_fp32.op.axis[0],
                                          'phony_insn')
            s[r_t_gm_back_fp32].reused_by(r_t_sigmoid)
            s[i_t_gm_back_fp32].reused_by(i_t_sigmoid)
            s[n_t_gm_back_fp32].reused_by(h_t_tanh)
            s[r_t_gm_back].reused_by(r_t_sigmoid_fp16)
            s[i_t_gm_back].reused_by(i_t_sigmoid_fp16)
            s[n_t_gm_back].reused_by(h_t_tanh_fp16)
        else:
            s[r_t_gm_back].reused_by(r_t_sigmoid)
            s[i_t_gm_back].reused_by(i_t_sigmoid)
            s[n_t_gm_back].reused_by(h_t_tanh)
    s[update_h_gm].emit_insn(update_h_gm_inner, 'dma_copy')

    output_list = [update_y_gm, update_h_gm]
    if is_gate_output:
        output_list.append(r_t_gm)
        output_list.append(i_t_gm)
        output_list.append(n_t_gm)
    return output_list, s
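For reference, the compute graph above reduces to the GRU recurrence h_new = i * h + (1 - i) * n, since update_h = (s_state_h - n) * i + n. The following NumPy sketch is only an illustrative assumption about the weight layout (w1 holding the r/i gates, w2 the candidate); it is not part of the original kernel.

import numpy as np

def gru_cell_reference(x, h, w1, b1, w2, b2):
    # x: (in_x,), h: (hidden,), w1: (in_x + hidden, 2 * hidden),
    # b1: (2 * hidden,), w2: (in_x + hidden, hidden), b2: (hidden,)
    xh = np.concatenate([x, h])
    r, i = np.split(1.0 / (1.0 + np.exp(-(xh @ w1 + b1))), 2)  # reset / update gates
    n = np.tanh(np.concatenate([x, r * h]) @ w2 + b2)          # candidate state
    return i * h + (1.0 - i) * n                               # matches update_h above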
Ejemplo n.º 15
0
def get_matmul_tensor(x, h, c, w, b, build_list, tensor_list, scope_list,
                      operation_list, is_hisi_es):
    shape_x = x.get("shape")
    shape_h = h.get("shape")
    shape_c = c.get("shape")
    dtype_x = x.get("dtype").lower()
    dtype_c = c.get("dtype").lower()
    dtype_b = b.get("dtype").lower()
    input_dim, batch_dim = shape_x[0:2]
    hidden_dim = shape_h[0]
    output_dim = hidden_dim
    shape_b = b.get("shape")
    shape_b = (shape_b[0] // 16, 16)
    shape_xh = (batch_dim, input_dim + hidden_dim, C0, C0)
    shape_w = w.get("shape")
    shape_w_split = list(shape_w)
    shape_w_split[1] = shape_w_split[1] // 4

    # Inputs in gm
    tensor_x = tvm.placeholder(shape_x, name='tensor_x', dtype=dtype_x)
    tensor_h = tvm.placeholder(shape_h, name='tensor_h', dtype=dtype_x)
    tensor_c = tvm.placeholder(shape_c, name='tensor_c', dtype=dtype_c)
    tensor_w = tvm.placeholder(shape_w, name='tensor_w', dtype=dtype_x)
    tensor_b = tvm.placeholder(shape_b, name='tensor_b', dtype=dtype_c)
    build_list["x"] = tensor_x
    build_list["h"] = tensor_h
    build_list["c"] = tensor_c
    build_list["w"] = tensor_w
    build_list["b"] = tensor_b

    symbol = ["it", "jt", "ft", "ot"]

    def _index_w(str_name, *index):
        if str_name == "it":
            return index[0], index[1], index[2], index[3]
        elif str_name == "jt":
            return index[0], index[1] + output_dim, index[2], index[3]
        elif str_name == "ft":
            return index[0], index[1] + output_dim * 2, index[2], index[3]
        return index[0], index[1] + output_dim * 3, index[2], index[3]

    def _index_bias(str_name):
        if str_name == "it":
            return 0
        elif str_name == "jt":
            return 1
        elif str_name == "ft":
            return 2
        return 3

    matmul_type = "float32"
    if is_hisi_es:
        matmul_type = "float16"

    for t in symbol:
        # concat x and h into one tensor and copy it to L1
        tensor_xh_l1_tmp = tvm.compute(
            shape_xh,
            lambda *indice: tvm.select(
                indice[1] < input_dim, tensor_x[indice[1], indice[0], indice[
                    2], indice[3]], tensor_h[indice[1] - input_dim, indice[0],
                                             indice[2], indice[3]]),
            name="tensor_xh_l1_" + t,
            tag="concat")
        tensor_list["tensor_xh_l1_" + t] = tensor_xh_l1_tmp
        scope_list["tensor_xh_l1_" + t] = cce.scope_cbuf
        # optimization: copy only once
        operation_list["tensor_xh_l1_" + t] = "dma_copy"

        # copy xh from L1 to L0A
        tensor_xh_l0a_tmp = tvm.compute(shape_xh,
                                        lambda *i: tensor_xh_l1_tmp(*i),
                                        name='tensor_xh_l0a_' + t)
        tensor_list["tensor_xh_l0a_" + t] = tensor_xh_l0a_tmp
        scope_list["tensor_xh_l0a_" + t] = cce.scope_ca
        # optimization: copy only once
        operation_list["tensor_xh_l0a_" + t] = "dma_copy"
        # copy w to the L1 buffer
        tensor_w_l1_tmp = tvm.compute(shape_w_split,
                                      lambda *i: tensor_w(*_index_w(t, *i)),
                                      name='tensor_w_l1_' + t)
        tensor_list["tensor_w_l1_" + t] = tensor_w_l1_tmp
        scope_list["tensor_w_l1_" + t] = cce.scope_cbuf
        operation_list["tensor_w_l1_" + t] = "dma_copy"

        # copy w from L1 to L0B
        tensor_w_l0b_tmp = tvm.compute(shape_w_split,
                                       lambda *i: tensor_w_l1_tmp(*i),
                                       name='tensor_w_l0b_' + t)
        tensor_list["tensor_w_l0b_" + t] = tensor_w_l0b_tmp
        scope_list["tensor_w_l0b_" + t] = cce.scope_cb
        operation_list["tensor_w_l0b_" + t] = "dma_copy"

        # copy bias to UB, splitting out the slice for the current gate
        tensor_b_ub_tmp = tvm.compute(
            shape_b,
            lambda i0, i1: tensor_b[_index_bias(t) * output_dim + i0, i1],
            name='tensor_b_ub_' + t)
        tensor_list["tensor_b_ub_" + t] = tensor_b_ub_tmp
        scope_list["tensor_b_ub_" + t] = cce.scope_ubuf
        operation_list["tensor_b_ub_" + t] = "dma_copy"

        # cast a float16 bias to float32 (skipped on HiSilicon ES)
        tensor_b_ub_true_tmp = tensor_b_ub_tmp
        if not is_hisi_es and dtype_b == "float16":
            tensor_b_ub_true_tmp = tvm.compute(
                shape_b,
                lambda *i: topi.cast(tensor_b_ub_tmp(*i), "float32"),
                name="tensor_b_ub_true_" + t)
            tensor_list["tensor_b_ub_true_" + t] = tensor_b_ub_true_tmp
            scope_list["tensor_b_ub_true_" + t] = cce.scope_ubuf
            operation_list["tensor_b_ub_true_" + t] = "vector_conv"

        # broadcast bias from [output_dim//16, 16] to [output_dim//16, N//16, 16, 16]
        tensor_b_loc_tmp = tvm.compute(
            shape_h,
            lambda i0, i1, i2, i3: tensor_b_ub_true_tmp[i0, i3],
            name='tensor_b_loc_' + t)
        tensor_list["tensor_b_loc_" + t] = tensor_b_loc_tmp
        scope_list["tensor_b_loc_" + t] = cce.scope_cc
        operation_list["tensor_b_loc_" + t] = "dma_copy"
        # DO MATMUL
        reduce_kb = tvm.reduce_axis((0, input_dim + output_dim),
                                    name='reduce_kb_' + t)
        reduce_kp = tvm.reduce_axis((0, C0), name='reduce_kp_' + t)
        tensor_matmul_l0c_tmp = tvm.compute(
            shape_h,
            lambda nb, mb, mp, np: tvm.sum((tensor_xh_l0a_tmp[
                mb, reduce_kb, mp, reduce_kp] * tensor_w_l0b_tmp[
                    reduce_kb, nb, np, reduce_kp]).astype(matmul_type),
                                           axis=[reduce_kb, reduce_kp]),
            name='tensor_matmul_l0c_' + t,
            attrs={'input_order': 'positive'})
        tensor_list["tensor_matmul_l0c_" + t] = tensor_matmul_l0c_tmp
        scope_list["tensor_matmul_l0c_" + t] = cce.scope_cc
        # matmul accumulation: add the broadcast bias (e.g. it + b_it)
        tensor_matmul_result_l0c_tmp = tvm.compute(
            shape_h,
            lambda *i: tensor_b_loc_tmp(*i) + tensor_matmul_l0c_tmp(*i),
            name="tensor_matmul_result_l0c_" + t)
        tensor_list["tensor_matmul_result_l0c_" +
                    t] = tensor_matmul_result_l0c_tmp
        scope_list["tensor_matmul_result_l0c_" + t] = cce.scope_cc
        operation_list["tensor_matmul_result_l0c_" + t] = "phony_insn"

        # copy matmul result from l0c to ub
        gate_ub_tmp = tvm.compute(
            shape_h,
            lambda *i: tensor_list["tensor_matmul_result_l0c_" + t](*i),
            name=t + "_ub")
        tensor_list[t + "_ub"] = gate_ub_tmp
        scope_list[t + "_ub"] = cce.scope_ubuf
        operation_list[t + "_ub"] = "dma_copy"