Example #1
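# The functions below are MindSpore Ascend custom operators written with the TBE
# TIK API and are shown without their module header. A minimal sketch of the
# imports they appear to rely on (exact module paths may vary across TBE
# versions); helpers such as _get_input_shape, _get_flattern_shape and
# _inner_matmul_new, plus SHAPE_SIZE_LIMIT, are defined elsewhere in the module:
import te.lang.cce
import te.platform.cce_params as cce
from te import tik
from te import tvm
from topi import generic
from topi.cce import util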
def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False,
                               kernel_name="matmulcube"):
    """CusMatMulCubeFraczRightMul"""
    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
        tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))

    input_x1_shape = input_x1.get("shape")
    input_x1_dtype = input_x1.get("dtype").lower()
    input_x2_shape = input_x2.get("shape")
    input_x2_dtype = input_x2.get("dtype").lower()
    input_x3_shape = input_x3.get("shape")
    input_x3_dtype = input_x3.get("dtype").lower()
    output_shape = output_y.get("shape")
    Supported = [((72, 8, 16, 16), "float16", (72, 72, 16, 16), "float16", (1,), "float32"),
                 ((32, 8, 16, 16), "float16", (32, 32, 16, 16), "float16", (1,), "float32"),
                 ((8, 32, 16, 16), "float16", (8, 8, 16, 16), "float16", (1,), "float32"),
                 ((4, 4, 16, 16), "float16", (4, 4, 16, 16), "float16", (1,), "float32"),
                 ((4, 16, 16, 16), 'float16', (4, 4, 16, 16), 'float16', (1,), 'float32'),
                 ((49, 4, 16, 16), 'float16', (49, 49, 16, 16), 'float16', (1,), 'float32'),
                 ((36, 4, 16, 16), 'float16', (36, 36, 16, 16), 'float16', (1,), 'float32'),
                 ((64, 16, 16, 16), 'float16', (64, 64, 16, 16), 'float16', (1,), 'float32'),
                 ((32, 64, 16, 16), 'float16', (32, 32, 16, 16), 'float16', (1,), 'float32'),
                 ((32, 16, 16, 16), 'float16', (32, 32, 16, 16), 'float16', (1,), 'float32'),
                 ((16, 32, 16, 16), 'float16', (16, 16, 16, 16), 'float16', (1,), 'float32'),
                 ((16, 8, 16, 16), 'float16', (16, 16, 16, 16), 'float16', (1,), 'float32'),
                 ((16, 4, 16, 16), 'float16', (16, 16, 16, 16), 'float16', (1,), 'float32'),
                 ((288, 32, 16, 16), 'float16', (288, 288, 16, 16), 'float16', (1,), 'float32'),
                 ((144, 16, 16, 16), 'float16', (144, 144, 16, 16), 'float16', (1,), 'float32'),
                 ((128, 32, 16, 16), 'float16', (128, 128, 16, 16), 'float16', (1,), 'float32'),
                 ((64, 128, 16, 16), 'float16', (64, 64, 16, 16), 'float16', (1,), 'float32'),
                 ((32, 128, 16, 16), 'float16', (32, 32, 16, 16), 'float16', (1,), 'float32'),
                 ((64, 32, 16, 16), 'float16', (64, 64, 16, 16), 'float16', (1,), 'float32'),
                 ((16, 64, 16, 16), 'float16', (16, 16, 16, 16), 'float16', (1,), 'float32')]
    input_shape = (
        tuple(input_x1_shape), input_x1_dtype, tuple(input_x2_shape), input_x2_dtype, tuple(input_x3_shape), input_x3_dtype)
    if input_shape not in Supported:
        raise RuntimeError("input_shape %s is not supported" % str(input_shape))

    input_x1 = tik_instance.Tensor("float16", input_x1_shape, name="left_matrix", scope=tik.scope_gm)
    input_x2 = tik_instance.Tensor("float16", input_x2_shape, name="right_matrix", scope=tik.scope_gm)
    input_x3 = tik_instance.Tensor("float32", input_x3_shape, name="matrix_max", scope=tik.scope_gm)
    resMatmul = tik_instance.Tensor("float32", output_shape, name="output", scope=tik.scope_gm)
    cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, resMatmul)
    tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2, input_x3], outputs=[resMatmul])
    return tik_instance
def CusMatMulCubeDenseRight(input_x1,
                            input_x2,
                            input_x3,
                            bias=None,
                            output_y={},
                            trans_a=False,
                            trans_b=False,
                            kernel_name="matmulcube"):
    """CusMatMulCubeDenseRight"""
    shape_a_temp = (128, 63, 16, 16)
    shape_b_temp = (128, 128, 16, 16)
    shape_output = output_y.get("shape")
    matrix_max_shape = (1, )
    support_shape = [
        (shape_a_temp, shape_b_temp, matrix_max_shape),
    ]
    shape_a_input = input_x1.get("shape")
    shape_b_input = input_x2.get("shape")
    matrix_max_input = input_x3.get("shape")
    input_shape = (tuple(shape_a_input), tuple(shape_b_input),
                   tuple(matrix_max_input))
    if input_shape not in support_shape:
        raise RuntimeError("input_shape %s is not supported" %
                           str(input_shape))

    if shape_a_temp[0] == 128 and shape_a_temp[1] == 63 and shape_b_temp[
            0] == 128 and shape_b_temp[1] == 128:
        if util.get_product_version() == util.VERSION_MINI:
            tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
        else:
            tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
        input_x1 = tik_instance.Tensor("float16",
                                       shape_a_temp,
                                       name="left_matrix",
                                       scope=tik.scope_gm)
        input_x2 = tik_instance.Tensor("float16",
                                       shape_b_temp,
                                       name="right_matrix",
                                       scope=tik.scope_gm)
        input_x3 = tik_instance.Tensor("float32", [
            1,
        ],
                                       name="matrix_max",
                                       scope=tik.scope_gm)
        resMatmul = tik_instance.Tensor("float32",
                                        shape_output,
                                        name="output",
                                        scope=tik.scope_gm)
        with tik_instance.for_range(0, 32, block_num=32) as block_index:
            core_m_idx = block_index // 16
            core_n_idx = block_index % 16
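            # 32 cores: core_m_idx in {0, 1} selects the row half handled by the
            # if/else below, core_n_idx in [0, 16) a block of 128 output columns;
            # every result tile is scaled by the single matrix_max value loaded here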
            matrix_max_scalar = tik_instance.Scalar("float32")
            matrix_max_local_UB = tik_instance.Tensor(
                "float32", (8, ),
                scope=tik.scope_ubuf,
                name="matrix_max_local_UB")
            tik_instance.data_move(matrix_max_local_UB, input_x3, 0, 1, 1, 0,
                                   0)
            matrix_max_scalar.set_as(matrix_max_local_UB[0])

            resMatmul_local_UB = tik_instance.Tensor("float32", (256 * 128, ),
                                                     scope=tik.scope_ubuf,
                                                     name="resMatmul_local_UB")
            resMatmul_local_UB1 = tik_instance.Tensor(
                "float32", (240 * 128, ),
                scope=tik.scope_ubuf,
                name="resMatmul_local_UB1")

            resMatmul_local_UB_local_L0C = tik_instance.Tensor(
                "float32", (256 * 128, ),
                scope=tik.scope_cc,
                name="resMatmul_local_UB_local_L0C")
            resMatmul_local_UB_local_L0C1 = tik_instance.Tensor(
                "float32", (240 * 128, ),
                scope=tik.scope_cc,
                name="resMatmul_local_UB_local_L0C1")

            input_1_local_L1_local_L0A = tik_instance.Tensor(
                "float16", (256 * 128, ),
                scope=tik.scope_ca,
                name="input_1_local_L1_local_L0A")
            input_2_local_L1 = tik_instance.Tensor("float16", (8 * 128 * 16, ),
                                                   scope=tik.scope_cbuf,
                                                   name="input_2_local_L1")
            input_2_local_L11 = tik_instance.Tensor("float16",
                                                    (8 * 128 * 16, ),
                                                    scope=tik.scope_cbuf,
                                                    name="input_2_local_L11")

            input_1_local_L1 = tik_instance.Tensor("float16", (8 * 256 * 16, ),
                                                   scope=tik.scope_cbuf,
                                                   name="input_1_local_L1")
            input_1_local_L11 = tik_instance.Tensor("float16",
                                                    (8 * 240 * 16, ),
                                                    scope=tik.scope_cbuf,
                                                    name="input_1_local_L11")

            input_2_local_L1_local_L0B = tik_instance.Tensor(
                "float16", (128 * 128, ),
                scope=tik.scope_cb,
                name="input_2_local_L1_local_L0B")
            input_2_local_L1_local_L0B1 = tik_instance.Tensor(
                "float16", (128 * 128, ),
                scope=tik.scope_cb,
                name="input_2_local_L1_local_L0B1")

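            # m = 63 * 16 = 1008 rows per column block, tiled as 256 + 256 rows
            # on core_m_idx == 0 and 256 + 240 rows on core_m_idx == 1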
            with tik_instance.if_scope(core_m_idx == 0):
                with tik_instance.for_range(0, 2) as cc1:
                    tik_instance.data_move(
                        input_2_local_L1,
                        input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0,
                        8, 128, 1920, 0)
                    tik_instance.data_move(
                        input_1_local_L1,
                        input_x1[core_n_idx * 129024 + cc1 * 4096], 0, 8, 256,
                        752, 0)
                    with tik_instance.for_range(0, 8) as cc10:
                        tik_instance.load2dv1(
                            input_2_local_L1_local_L0B[cc10 * 2048],
                            input_2_local_L1[cc10 * 256], 0, 8, 8, 0, True)
                    with tik_instance.for_range(0, 16) as cc101:
                        tik_instance.load2dv1(
                            input_1_local_L1_local_L0A[cc101 * 2048],
                            input_1_local_L1[cc101 * 256], 0, 8, 16, 0, False)

                    tik_instance.mmad(resMatmul_local_UB_local_L0C,
                                      input_1_local_L1_local_L0A,
                                      input_2_local_L1_local_L0B, 256, 128,
                                      128, 0)
                    tik_instance.data_move(resMatmul_local_UB,
                                           resMatmul_local_UB_local_L0C, 0, 1,
                                           128, 0, 0)
                    tik_instance.vmuls(64, resMatmul_local_UB,
                                       resMatmul_local_UB, matrix_max_scalar,
                                       255, 1, 1, 8, 8)
                    tik_instance.vmuls(64, resMatmul_local_UB[255 * 64],
                                       resMatmul_local_UB[255 * 64],
                                       matrix_max_scalar, 255, 1, 1, 8, 8)
                    tik_instance.vmuls(64, resMatmul_local_UB[510 * 64],
                                       resMatmul_local_UB[510 * 64],
                                       matrix_max_scalar, 2, 1, 1, 8, 8)

                    tik_instance.data_move(
                        resMatmul[core_n_idx * 129024 + cc1 * 4096],
                        resMatmul_local_UB, 0, 8, 512, 0, 1504)
            with tik_instance.else_scope():
                tik_instance.data_move(
                    input_2_local_L1,
                    input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8,
                    128, 1920, 0)
                tik_instance.data_move(
                    input_1_local_L1, input_x1[core_n_idx * 129024 + 2 * 4096],
                    0, 8, 256, 752, 0)
                with tik_instance.for_range(0, 8) as cc10:
                    tik_instance.load2dv1(
                        input_2_local_L1_local_L0B[cc10 * 2048],
                        input_2_local_L1[cc10 * 256], 0, 8, 8, 0, True)
                with tik_instance.for_range(0, 16) as cc101:
                    tik_instance.load2dv1(
                        input_1_local_L1_local_L0A[cc101 * 2048],
                        input_1_local_L1[cc101 * 256], 0, 8, 16, 0, False)

                tik_instance.mmad(resMatmul_local_UB_local_L0C,
                                  input_1_local_L1_local_L0A,
                                  input_2_local_L1_local_L0B, 256, 128, 128, 0)
                tik_instance.data_move(resMatmul_local_UB,
                                       resMatmul_local_UB_local_L0C, 0, 1, 128,
                                       0, 0)
                tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB,
                                   matrix_max_scalar, 255, 1, 1, 8, 8)
                tik_instance.vmuls(64, resMatmul_local_UB[255 * 64],
                                   resMatmul_local_UB[255 * 64],
                                   matrix_max_scalar, 255, 1, 1, 8, 8)
                tik_instance.vmuls(64, resMatmul_local_UB[510 * 64],
                                   resMatmul_local_UB[510 * 64],
                                   matrix_max_scalar, 2, 1, 1, 8, 8)

                tik_instance.data_move(
                    resMatmul[core_n_idx * 129024 + 2 * 4096],
                    resMatmul_local_UB, 0, 8, 512, 0, 1504)

                tik_instance.data_move(
                    input_2_local_L11,
                    input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8,
                    128, 1920, 0)
                tik_instance.data_move(input_1_local_L11,
                                       input_x1[core_n_idx * 129024 + 12288],
                                       0, 8, 240, 768, 0)

                with tik_instance.for_range(0, 8) as cc102:
                    tik_instance.load2dv1(
                        input_2_local_L1_local_L0B1[cc102 * 2048],
                        input_2_local_L11[cc102 * 256], 0, 8, 8, 0, True)
                with tik_instance.for_range(0, 16) as cc103:
                    tik_instance.load2dv1(
                        input_1_local_L1_local_L0A[cc103 * 2048],
                        input_1_local_L11[cc103 * 256], 0, 8, 15, 0, False)

                tik_instance.mmad(resMatmul_local_UB_local_L0C1,
                                  input_1_local_L1_local_L0A,
                                  input_2_local_L1_local_L0B1, 240, 128, 128,
                                  0)
                tik_instance.data_move(resMatmul_local_UB1,
                                       resMatmul_local_UB_local_L0C1, 0, 1,
                                       120, 0, 0)

                tik_instance.vmuls(64, resMatmul_local_UB1,
                                   resMatmul_local_UB1, matrix_max_scalar, 255,
                                   1, 1, 8, 8)
                tik_instance.vmuls(64, resMatmul_local_UB1[255 * 64],
                                   resMatmul_local_UB1[255 * 64],
                                   matrix_max_scalar, 225, 1, 1, 8, 8)

                tik_instance.data_move(resMatmul[core_n_idx * 129024 + 12288],
                                       resMatmul_local_UB1, 0, 8, 480, 0, 1536)

        tik_instance.BuildCCE(kernel_name=kernel_name,
                              inputs=[input_x1, input_x2, input_x3],
                              outputs=[resMatmul])
        return tik_instance
def CusCholeskyTrsm(input_x, output, kernel_name):
    """CusCholeskyTrsm"""
    input_x_shape = input_x.get("shape")
    output_shape = output.get("shape")
    split_dim = 128
    matrix_dim = input_x_shape[0]
    split_dim = min(matrix_dim, split_dim)
    vector_repeat_times = int(split_dim // 64)
    blocks = int(matrix_dim // split_dim)
    if blocks == 0:
        blocks = 1
    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
        tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))

    input_x = tik_instance.Tensor("float32",
                                  input_x_shape,
                                  name="input_x",
                                  scope=tik.scope_gm)
    res = tik_instance.Tensor("float32",
                              output_shape,
                              name="res",
                              scope=tik.scope_gm)
    with tik_instance.for_range(0, blocks, block_num=blocks) as block_index:
        input_x_ub = tik_instance.Tensor("float32", (split_dim, split_dim),
                                         name="input_x_ub",
                                         scope=tik.scope_ubuf)
        temp_ub = tik_instance.Tensor("float32", (split_dim, split_dim),
                                      name="temp_ub",
                                      scope=tik.scope_ubuf)
        assist_1_ub = tik_instance.Tensor("float32", (split_dim, ),
                                          name="assist_1_ub",
                                          scope=tik.scope_ubuf)
        assist_2_ub = tik_instance.Tensor("float32", (split_dim, ),
                                          name="assist_2_ub",
                                          scope=tik.scope_ubuf)
        with tik_instance.for_range(0, split_dim) as i:
            tik_instance.data_move(
                input_x_ub[i, 0], input_x[block_index * split_dim + i,
                                          block_index * split_dim], 0, 1,
                vector_repeat_times * 8, 0, 0)
        scalar1 = tik_instance.Scalar("float32", init_value=-0.5)

        with tik_instance.for_range(0, split_dim) as i:
            scalar2 = tik_instance.Scalar("float32")
            tik_instance.vln(64, assist_1_ub[0], input_x_ub[i, 0],
                             vector_repeat_times, 1, 1, 8, 8)
            tik_instance.vmuls(64, assist_2_ub[0], assist_1_ub[0], scalar1,
                               vector_repeat_times, 1, 1, 8, 8)
            tik_instance.vexp(64, assist_1_ub[0], assist_2_ub[0],
                              vector_repeat_times, 1, 1, 8, 8)
            scalar2.set_as(assist_1_ub[i])
            tik_instance.vmuls(64, input_x_ub[i, 0], input_x_ub[i, 0], scalar2,
                               vector_repeat_times, 1, 1, 8, 8)
            with tik_instance.for_range(i + 1, split_dim) as j:
                scalar3 = tik_instance.Scalar("float32")
                scalar3.set_as(input_x_ub[i, j])
                tik_instance.vmuls(64, temp_ub[j, 0], input_x_ub[i, 0],
                                   scalar3, vector_repeat_times, 1, 1, 8, 8)
            tik_instance.vsub(64, input_x_ub[i + 1, 0], input_x_ub[i + 1, 0],
                              temp_ub[i + 1, 0],
                              (split_dim - 1 - i) * vector_repeat_times, 1, 1,
                              1, 8, 8, 8)

        zero = tik_instance.Scalar("float32")
        zero.set_as(0.0)
        one = tik_instance.Scalar("float32")
        one.set_as(1.0)
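        # temp_ub is initialized to the identity below; the backward loop then
        # solves the triangular system row by row, leaving the inverse of the
        # Cholesky factor in temp_ub before it is written back to GM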
        with tik_instance.for_range(0, split_dim) as i:
            tik_instance.vector_dup(64, temp_ub[i, 0], zero,
                                    vector_repeat_times, 1, 8)
            temp_ub.__setitem__(i * split_dim + i, one)

        chol_diag_element_final = tik_instance.Scalar("float32")
        chol_diag_element_final.set_as(input_x_ub[split_dim * split_dim - 1])
        trsm_diag_element = tik_instance.Scalar("float32")
        trsm_diag_element.set_as(1.0 / chol_diag_element_final)
        temp_ub.__setitem__(split_dim * split_dim - 1, trsm_diag_element)

        with tik_instance.for_range(1, split_dim) as i:
            index = split_dim - i - 1
            tik_instance.vector_dup(64, assist_1_ub, zero, vector_repeat_times,
                                    1, 8)
            with tik_instance.for_range(0, i) as j:
                chol_diag_element_loop = tik_instance.Scalar("float32")
                chol_diag_element_loop.set_as(input_x_ub[index, index + 1 + j])
                tik_instance.vmuls(64, assist_2_ub, temp_ub[j + index + 1, 0],
                                   chol_diag_element_loop, vector_repeat_times,
                                   1, 1, 8, 8)
                tik_instance.vadd(64, assist_1_ub, assist_2_ub, assist_1_ub,
                                  vector_repeat_times, 1, 1, 1, 8, 8, 8)
            temp_scalar = tik_instance.Scalar("float32")
            temp_scalar.set_as(input_x_ub[index, index])
            chol_diag_element = tik_instance.Scalar("float32")
            chol_diag_element.set_as(1.0 / temp_scalar)
            tik_instance.vsub(64, temp_ub[index, 0], temp_ub[index,
                                                             0], assist_1_ub,
                              vector_repeat_times, 1, 1, 1, 8, 8, 8)
            tik_instance.vmuls(64, temp_ub[index, 0], temp_ub[index, 0],
                               chol_diag_element, vector_repeat_times, 1, 1, 8,
                               8)

        tik_instance.data_move(res[block_index, 0, 0], temp_ub, 0, 1,
                               8 * vector_repeat_times * split_dim, 0, 0)

    tik_instance.BuildCCE(kernel_name=kernel_name,
                          inputs=[input_x],
                          outputs=[res])
    return tik_instance
Example #4
def CusTranspose02314(input_x, output, kernel_name="transpose021354"):
    """CusTranspose02314"""
    input_x_shape = input_x.get("shape")
    output_shape = output.get("shape")
    perm = (0, 2, 3, 1, 4)
    input_x_shape = tuple(input_x_shape)
    support_shape = [(32, 128, 7, 7, 16),
                     (32, 32, 7, 7, 16),
                     (32, 32, 14, 14, 16),
                     (32, 64, 14, 14, 16),
                     (32, 16, 14, 14, 16),
                     (32, 16, 28, 28, 16),
                     (32, 32, 28, 28, 16),
                     (32, 8, 28, 28, 16),
                     (32, 8, 56, 56, 16),
                     (32, 16, 56, 56, 16),
                     (32, 4, 56, 56, 16),
                     (32, 4, 112, 112, 16)]
    if input_x_shape not in support_shape:
        raise RuntimeError("input_shape %s is not supported" % str(input_x_shape))

    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
        tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))

    input_x = tik_instance.Tensor("float16", input_x_shape, name="input_x", scope=tik.scope_gm)
    res = tik_instance.Tensor("float16", output_shape, name="res", scope=tik.scope_gm)

    dtype = "float16"
    if tuple(input_x_shape) == (32, 4, 112, 112, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 14) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    zero = tik_instance.Scalar(dtype="float16", init_value=0)
                    tik_instance.data_move(input_1_local_UB,
                                           input_x[block_idx * 802816 + cc1_db * 14336 + 7168 * db_idx], 0, 4, 448,
                                           12096, 0)
                    with tik_instance.for_range(0, 448) as cc7:
                        with tik_instance.for_range(0, 4) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16],
                                               input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1792, 0, 0)
    elif tuple(input_x_shape) == (32, 4, 56, 56, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 3) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    tik_instance.data_move(input_1_local_UB,
                                           input_x[block_idx * 200704 + cc1_db * 14336 + 7168 * db_idx], 0, 4, 448,
                                           2688, 0)
                    with tik_instance.for_range(0, 448) as cc7:
                        with tik_instance.for_range(0, 4) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16],
                                               input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1792, 0, 0)

            input_1_local_UB2 = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB2", scope=tik.scope_ubuf)
            T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2",
                                                        scope=tik.scope_ubuf)
            tik_instance.data_move(input_1_local_UB2, input_x[block_idx * 200704 + 43008], 0, 4, 448, 2688, 0)
            with tik_instance.for_range(0, 448) as cc72:
                with tik_instance.for_range(0, 4) as cc82:
                    tik_instance.vadds(16, T_transpose_local_UB2[cc72 * 64 + cc82 * 16],
                                       input_1_local_UB2[7168 * cc82 + cc72 * 16], zero, 1, 1, 1, 0, 0)
            tik_instance.data_move(res[block_idx * 200704 + 172032], T_transpose_local_UB2, 0, 1, 1792, 0, 0)
    elif tuple(input_x_shape) == (32, 16, 56, 56, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 14) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    tik_instance.data_move(input_1_local_UB,
                                           input_x[block_idx * 802816 + cc1_db * 3584 + 1792 * db_idx], 0, 16, 112,
                                           3024, 0)
                    with tik_instance.for_range(0, 112) as cc7:
                        with tik_instance.for_range(0, 16) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16],
                                               input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1792, 0, 0)
    elif tuple(input_x_shape) == (32, 8, 56, 56, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 7) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    tik_instance.data_move(input_1_local_UB,
                                           input_x[block_idx * 401408 + cc1_db * 7168 + 3584 * db_idx], 0, 8, 224, 2912,
                                           0)
                    with tik_instance.for_range(0, 224) as cc7:
                        with tik_instance.for_range(0, 8) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16],
                                               input_1_local_UB[3584 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1792, 0, 0)
    elif tuple(input_x_shape) == (32, 8, 28, 28, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 2) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [25088], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    tik_instance.data_move(input_1_local_UB,
                                           input_x[block_idx * 100352 + cc1_db * 6272 + 3136 * db_idx], 0, 8, 196, 588,
                                           0)
                    with tik_instance.for_range(0, 196) as cc7:
                        with tik_instance.for_range(0, 8) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16],
                                               input_1_local_UB[3136 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 100352 + cc1_db * 50176 + 25088 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1568, 0, 0)
    elif tuple(input_x_shape) == (32, 32, 28, 28, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 7) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    tik_instance.data_move(input_1_local_UB, input_x[block_idx * 401408 + cc1_db * 1792 + 896 * db_idx],
                                           0, 32, 56, 728, 0)
                    with tik_instance.for_range(0, 56) as cc7:
                        with tik_instance.for_range(0, 32) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 512 + cc8 * 16],
                                               input_1_local_UB[896 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1792, 0, 0)
    elif tuple(input_x_shape) == (32, 16, 28, 28, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 3) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    tik_instance.data_move(input_1_local_UB,
                                           input_x[block_idx * 200704 + cc1_db * 3584 + 1792 * db_idx], 0, 16, 112, 672,
                                           0)
                    with tik_instance.for_range(0, 112) as cc7:
                        with tik_instance.for_range(0, 16) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16],
                                               input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1792, 0, 0)

            input_1_local_UB2 = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB2", scope=tik.scope_ubuf)
            T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2",
                                                        scope=tik.scope_ubuf)
            tik_instance.data_move(input_1_local_UB2, input_x[block_idx * 200704 + 10752], 0, 16, 112, 672, 0)
            with tik_instance.for_range(0, 112) as cc7:
                with tik_instance.for_range(0, 16) as cc8:
                    tik_instance.vadds(16, T_transpose_local_UB2[cc7 * 256 + cc8 * 16],
                                       input_1_local_UB2[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
            tik_instance.data_move(res[block_idx * 200704 + 172032], T_transpose_local_UB2, 0, 1, 1792, 0, 0)

    elif tuple(input_x_shape) == (32, 16, 14, 14, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                input_1_local_UB = tik_instance.Tensor(dtype, [25088], name="input_1_local_UB", scope=tik.scope_ubuf)
                T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB",
                                                           scope=tik.scope_ubuf)
                tik_instance.data_move(input_1_local_UB, input_x[block_idx * 50176 + 1568 * db_idx], 0, 16, 98, 98, 0)
                with tik_instance.for_range(0, 98) as cc7:
                    with tik_instance.for_range(0, 16) as cc8:
                        tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16],
                                           input_1_local_UB[1568 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                tik_instance.data_move(res[block_idx * 50176 + 25088 * db_idx], T_transpose_local_UB, 0, 1, 1568, 0, 0)
    elif tuple(input_x_shape) == (32, 128, 7, 7, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16":
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 7, thread_num=2) as cc1:
                input_x_ub = tik_instance.Tensor(dtype, [1, 128, 1, 7, 16], name="input_1_local_UB",
                                                 scope=tik.scope_ubuf)
                transpose_ub = tik_instance.Tensor(dtype, [1, 1, 7, 128, 16], name="transpose_local_UB",
                                                   scope=tik.scope_ubuf)
                tik_instance.data_move(input_x_ub, input_x[block_idx, 0, cc1, 0, 0], 0, 128, 7, 42, 0)
                with tik_instance.for_range(0, 7) as cc7:
                    with tik_instance.for_range(0, 128) as cc8:
                        tik_instance.vadds(16, transpose_ub[0, 0, cc7, cc8, 0], input_x_ub[0, cc8, 0, cc7, 0], 0,
                                           1, 1, 1, 0, 0)
                tik_instance.data_move(res[block_idx * 100352 + 14336 * cc1], transpose_ub, 0, 1, 896, 0, 0)

    elif tuple(input_x_shape) == (32, 32, 7, 7, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16":
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            input_x_ub = tik_instance.Tensor(dtype, [1, 32, 7, 7, 16], name="input_1_local_UB",
                                             scope=tik.scope_ubuf)
            transpose_ub = tik_instance.Tensor(dtype, [1, 7, 7, 32, 16], name="transpose_local_UB",
                                               scope=tik.scope_ubuf)
            tik_instance.data_move(input_x_ub, input_x[block_idx, 0, 0, 0, 0], 0, 1, 1568, 0, 0)
            with tik_instance.for_range(0, 7) as cc1:
                with tik_instance.for_range(0, 7) as cc2:
                    with tik_instance.for_range(0, 32) as cc3:
                        tik_instance.vadds(16, transpose_ub[0, cc1, cc2, cc3, 0], input_x_ub[0, cc3, cc1, cc2, 0], 0,
                                           1, 1, 1, 0, 0)
            tik_instance.data_move(res[block_idx * 25088], transpose_ub, 0, 1, 1568, 0, 0)

    elif tuple(input_x_shape) == (32, 32, 14, 14, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16":
        def _inner_compute(split_index):
            input_x_ub = tik_instance.Tensor(dtype, [1, 32, 2, 14, 16], name="input_1_local_UB",
                                             scope=tik.scope_ubuf)
            transpose_ub = tik_instance.Tensor(dtype, [1, 2, 14, 32, 16], name="transpose_local_UB",
                                               scope=tik.scope_ubuf)
            tik_instance.data_move(input_x_ub, input_x[block_idx, 0, split_index * 2, 0, 0], 0, 32, 28, 168, 0)
            with tik_instance.for_range(0, 2) as cc2:
                with tik_instance.for_range(0, 14) as cc3:
                    with tik_instance.for_range(0, 32) as cc4:
                        tik_instance.vadds(16, transpose_ub[0, cc2, cc3, cc4, 0], input_x_ub[0, cc4, cc2, cc3, 0],
                                           0, 1, 1, 1, 0, 0)
            tik_instance.data_move(res[block_idx * 100352 + split_index * 2 * 7168], transpose_ub, 0, 1, 896, 0, 0)

        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 6, thread_num=2) as cc1:
                _inner_compute(cc1)
            _inner_compute(6)
    elif tuple(input_x_shape) == (32, 64, 14, 14, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16":
        def _inner_compute(split_index, block_idx):
            input_x_ub = tik_instance.Tensor(dtype, [1, 64, 2, 14, 16], name="input_1_local_UB",
                                             scope=tik.scope_ubuf)
            transpose_ub = tik_instance.Tensor(dtype, [1, 2, 14, 64, 16], name="transpose_local_UB",
                                               scope=tik.scope_ubuf)
            tik_instance.data_move(input_x_ub, input_x[block_idx, 0, split_index * 2, 0, 0], 0, 64, 28, 168, 0)
            with tik_instance.for_range(0, 2) as cc2:
                with tik_instance.for_range(0, 14) as cc3:
                    with tik_instance.for_range(0, 64) as cc4:
                        tik_instance.vadds(16, transpose_ub[0, cc2, cc3, cc4, 0], input_x_ub[0, cc4, cc2, cc3, 0],
                                           0, 1, 1, 1, 0, 0)
            tik_instance.data_move(res[block_idx * 200704 + split_index * 2 * 14336], transpose_ub, 0, 1, 1792, 0, 0)

        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 6, thread_num=2) as cc1:
                _inner_compute(cc1, block_idx)
            _inner_compute(6, block_idx)

    tik_instance.BuildCCE(kernel_name, inputs=[input_x], outputs=[res])
    return tik_instance
def CusMatMulCubeDenseLeft(input_x1,
                           input_x2,
                           bias=None,
                           output_y={},
                           trans_a=False,
                           trans_b=False,
                           kernel_name="matmulcube"):
    """
    Calculate matrix multiplication with bias, C = A*B + bias, supporting input
    data in fractal format.

    Parameters:
    shape_a: list or tuple
            Shape of the first tensor a with rank > 1
    shape_b: list or tuple
            Shape of the second tensor b, with the same dtype as a;
            shape_a and shape_b must both be 2-D
    src_dtype: str
            The data type of the inputs, supports "float32" and "float16"
    dst_dtype: str
            The data type of the output, supports "float32" and "float16"
    trans_a: bool
            If True, tensor a is transposed before multiplication
    trans_b: bool
            If True, tensor b is transposed before multiplication
    is_fractal: bool
            If True, the input data of a and b must be in fractal format
    shape_bias: list or tuple
            Shape of bias; only the ND input data format is supported

    Returns
    -------
    None
    """
    print("!!!!come into zzt~~~~~~~!!!!")
    shape_a = input_x1.get("ori_shape")
    shape_b = input_x2.get("ori_shape")
    shape_output = output_y.get("ori_shape")
    print("============")
    print(input_x1.get("format"), input_x2.get("format"))
    print(shape_a, shape_b)
    print("============")
    if input_x2.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_b
        c0 = 16
        c1 = c // c0
        if c1 == 0:
            c1 = 1
        shape_b = [n, c1 * h * w * c0]
        shape_a = [n, n]

    if input_x1.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_a
        c0 = 16
        c1 = c // c0
        if c1 == 0:
            c1 = 1
        shape_a = [n, c1 * h * w * c0]
        shape_b = [c1 * h * w * c0, c1 * h * w * c0]

    if input_x2.get("format") == "FRACTAL_NZ":
        shape_a = [shape_b[0], shape_b[0]]
        shape_b = shape_b

    if input_x1.get("format") == "FRACTAL_NZ":
        shape_a = shape_a
        shape_b = [shape_a[1], shape_a[1]]

    shape_a = list(shape_a)
    shape_b = list(shape_b)

    shape_a = _get_input_shape(shape_a)
    shape_b = _get_input_shape(shape_b)

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_a)
    util.check_shape_rule(shape_b)
    util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT)

    shape_a = [shape_a[1], shape_a[0]]
    trans_a = not trans_a

    shape_b = [shape_b[1], shape_b[0]]
    trans_b = not trans_b

    shape_bias = ()
    if bias is not None and bool(bias):
        shape_bias = bias.get("shape")
        shape_bias = list(shape_bias)
        shape_bias = _get_bias(shape_bias)

    src_dtype = input_x1.get("dtype").lower()
    dst_dtype = output_y.get("dtype").lower()
    _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b)

    m_shape = shape_a[len(shape_a) - 2]
    km_shape = shape_a[len(shape_a) - 1]
    kn_shape = shape_b[len(shape_b) - 2]
    n_shape = shape_b[len(shape_b) - 1]

    if src_dtype == "float16":
        block_reduce = cce.BLOCK_REDUCE

    block_in = cce.BLOCK_IN
    block_out = cce.BLOCK_OUT

    if trans_a and km_shape == 1:
        block_in = cce.BLOCK_VECTOR

    if not trans_a and m_shape == 1:
        block_in = cce.BLOCK_VECTOR

    if trans_b and kn_shape == 1:
        block_out = cce.BLOCK_VECTOR

    if not trans_b and n_shape == 1:
        block_out = cce.BLOCK_VECTOR

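    # FRACTAL_NZ tiling: the 2-D matrix is carved into 16x16 blocks, giving a
    # 4-D shape such as (m // 16, k // 16, 16, 16) in the non-transposed case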
    if trans_a:
        shape_a_temp = (m_shape // block_reduce, km_shape // block_in,
                        block_reduce, block_in)
    else:
        shape_a_temp = (m_shape // block_in, km_shape // block_reduce,
                        block_in, block_reduce)

    if trans_b:
        shape_b_temp = (kn_shape // block_out, n_shape // block_reduce,
                        block_reduce, block_out)
    else:
        shape_b_temp = (kn_shape // block_reduce, n_shape // block_out,
                        block_out, block_reduce)
    shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2],
                    shape_a_temp[3])
    format_a = "FRACTAL_NZ"
    shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2],
                    shape_b_temp[3])
    format_b = "FRACTAL_NZ"

    print("=======================================")
    print(shape_a_temp, shape_b_temp)
    print(format_a, format_b)
    print("=======================================")
    tensor_bias = None
    tensor_a = tvm.placeholder(shape_a_temp, name='tensor_a', dtype=src_dtype)
    tensor_b = tvm.placeholder(shape_b_temp, name='tensor_b', dtype=src_dtype)

    if shape_bias:
        tensor_bias = tvm.placeholder(shape_bias,
                                      name='tensor_bias',
                                      dtype=dst_dtype)

    if shape_a_temp[0] == 63 and shape_a_temp[1] == 63 and shape_b_temp[
            0] == 128 and shape_b_temp[1] == 63:
        if util.get_product_version() == util.VERSION_MINI:
            tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
        else:
            tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))

        input_x1 = tik_instance.Tensor("float16",
                                       shape_a_temp,
                                       name="left_matrix",
                                       scope=tik.scope_gm)
        input_x2 = tik_instance.Tensor("float16",
                                       shape_b_temp,
                                       name="right_matrix",
                                       scope=tik.scope_gm)
        resMatmul = tik_instance.Tensor("float16",
                                        shape_output,
                                        name="output",
                                        scope=tik.scope_gm)
        with tik_instance.for_range(0, 32, block_num=32) as block_index:
            resMatmul_local_UB = tik_instance.Tensor("float16", (128 * 256, ),
                                                     scope=tik.scope_ubuf,
                                                     name="resMatmul_local_UB")
            resMatmul_local_UB_local_L0C = tik_instance.Tensor(
                "float32", (128 * 256, ),
                scope=tik.scope_cc,
                name="resMatmul_local_UB_local_L0C")
            input_1_local_L1_local_L0A = tik_instance.Tensor(
                "float16", (128 * 128, ),
                scope=tik.scope_ca,
                name="input_1_local_L1_local_L0A")
            input_2_local_L1 = tik_instance.Tensor("float16", (128 * 256, ),
                                                   scope=tik.scope_cbuf,
                                                   name="input_2_local_L1")
            input_1_local_L1 = tik_instance.Tensor("float16", (128 * 128, ),
                                                   scope=tik.scope_cbuf,
                                                   name="input_1_local_L1")
            input_2_local_L1_local_L0B = tik_instance.Tensor(
                "float16", (128 * 256, ),
                scope=tik.scope_cb,
                name="input_2_local_L1_local_L0B")
            core_m_idx = block_index % 8
            core_n_idx = block_index // 8
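            # 32 cores: core_m_idx (block_index % 8) selects a row tile and
            # core_n_idx (block_index // 8) a column group; the last, narrower
            # row tile (core_m_idx == 7) is handled in the else branch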
            with tik_instance.if_scope(core_m_idx != 7):
                tik_instance.data_move(
                    input_1_local_L1,
                    input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 8, 128,
                    55 * 16, 0)
                tik_instance.data_move(
                    input_2_local_L1,
                    input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008],
                    0, 32, 128, 55 * 16, 0)
                with tik_instance.for_range(0, 8) as cc12:
                    tik_instance.load2dv1(
                        input_1_local_L1_local_L0A[cc12 * 2048],
                        input_1_local_L1[cc12 * 256], 0, 8, 8, 0, False)
                with tik_instance.for_range(0, 2) as cc6:
                    with tik_instance.for_range(0, 8) as cc121:
                        tik_instance.load2dv1(
                            input_2_local_L1_local_L0B[cc121 * 4096],
                            input_2_local_L1[cc6 * 32768 + cc121 * 256], 0, 16,
                            8, 0, True)
                    tik_instance.mmad(resMatmul_local_UB_local_L0C,
                                      input_1_local_L1_local_L0A,
                                      input_2_local_L1_local_L0B, 128, 128,
                                      256, 0)
                    tik_instance.data_move(resMatmul_local_UB,
                                           resMatmul_local_UB_local_L0C, 0, 1,
                                           128, 0, 0, 1)
                    tik_instance.data_move(
                        resMatmul[cc6 * 256 * 1008 + core_m_idx * 8 * 256 +
                                  core_n_idx * 512 * 1008], resMatmul_local_UB,
                        0, 16, 256 // 2, 0, 55 * 16 * 2 // 2)
            with tik_instance.else_scope():
                tik_instance.data_move(
                    input_1_local_L1,
                    input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 7, 112,
                    56 * 16, 0)
                tik_instance.data_move(
                    input_2_local_L1,
                    input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008],
                    0, 32, 112, 56 * 16, 0)
                with tik_instance.for_range(0, 7) as cc10:
                    tik_instance.load2dv1(
                        input_1_local_L1_local_L0A[cc10 * 1792],
                        input_1_local_L1[cc10 * 256], 0, 7, 7, 0, False)
                with tik_instance.for_range(0, 2) as cc5:
                    with tik_instance.for_range(0, 7) as cc101:
                        tik_instance.load2dv1(
                            input_2_local_L1_local_L0B[cc101 * 4096],
                            input_2_local_L1[cc5 * 28672 + cc101 * 256], 0, 16,
                            7, 0, True)
                    tik_instance.mmad(resMatmul_local_UB_local_L0C,
                                      input_1_local_L1_local_L0A,
                                      input_2_local_L1_local_L0B, 112, 112,
                                      256, 0)
                    tik_instance.data_move(resMatmul_local_UB,
                                           resMatmul_local_UB_local_L0C, 0, 1,
                                           112, 0, 0, 1)
                    tik_instance.data_move(
                        resMatmul[cc5 * 256 * 1008 + core_m_idx * 8 * 256 +
                                  core_n_idx * 512 * 1008], resMatmul_local_UB,
                        0, 16, 224 // 2, 0, 56 * 16 * 2 // 2)
        tik_instance.BuildCCE(kernel_name=kernel_name,
                              inputs=[input_x1, input_x2],
                              outputs=[resMatmul])
        return tik_instance

    print("come into tbe, shape is error!")
    result = te.lang.cce.matmul(tensor_a,
                                tensor_b,
                                trans_a,
                                trans_b,
                                format_a=format_a,
                                format_b=format_b,
                                dst_dtype=dst_dtype,
                                tensor_bias=tensor_bias)

    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    tensor_list = [tensor_a, tensor_b, result]
    if shape_bias:
        tensor_list = [tensor_a, tensor_b, tensor_bias, result]

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(schedule, config)
def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=True, kernel_name="batchmatmul"):
    """CusBatchMatMul"""
    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
        tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
    x1_shape = input_x1.get("shape")
    dtype = input_x1.get("dtype").lower()
    x2_shape = input_x2.get("shape")
    if dtype != input_x2.get("dtype").lower():
        raise RuntimeError("dtype of input_x1 and input_x2 must be same, but got %s vs %s" % (
            dtype, input_x2.get("dtype").lower()))
    input_shape = (tuple(x1_shape), tuple(x2_shape), dtype, transpose_a, transpose_b)
    support_shape = [((8, 128, 128), (8, 128, 128), "float32", False, True),
                     ((36, 128, 128), (36, 128, 128), "float32", False, True),
                     ((5, 128, 128), (5, 128, 128), "float32", False, True),
                     ((18, 128, 128), (18, 128, 128), "float32", False, True),
                     ((16, 128, 128), (16, 128, 128), "float32", False, True),
                     ((9, 128, 128), (9, 128, 128), "float32", False, True),
                     ((1, 64, 64), (1, 64, 64), "float32", False, True),
                     ((1, 128, 128), (1, 128, 128), "float32", False, True),
                     ((4, 128, 128), (4, 128, 128), "float32", False, True),
                     ((2, 128, 128), (2, 128, 128), "float32", False, True)]
    if input_shape not in support_shape:
        raise RuntimeError("input_shape %s is not supported" % str(input_shape))

    # every supported case uses transpose_a=False, transpose_b=True
    batch, m, k = x1_shape
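    # per-batch C = A * B^T on 128x128 (or 64x64) blocks; each supported shape
    # gets its own core/tile split below, and _inner_matmul_new (defined
    # elsewhere in this module) computes one 128-element output row from one
    # row of A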

    input1_shape = _get_flattern_shape(x1_shape)
    input1 = tik_instance.Tensor(dtype, input1_shape, name="input1", scope=tik.scope_gm)
    input2_shape = _get_flattern_shape(x2_shape)
    input2 = tik_instance.Tensor(dtype, input2_shape, name="input2", scope=tik.scope_gm)

    output_shape = x1_shape
    res_shape = _get_flattern_shape(output_shape)
    res = tik_instance.Tensor(dtype, res_shape, name="res", scope=tik.scope_gm)

    if input_shape == ((36, 128, 128), (36, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 18, block_num=18) as block_idx:
            with tik_instance.for_range(0, 2) as cc0:
                with tik_instance.for_range(0, 128, thread_num=2) as cc1:
                    input1_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128
                    input2_index = block_idx * 32768 + cc0 * 16384
                    res_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128
                    _inner_matmul_new(tik_instance, dtype,
                                      input1, input1_index,
                                      input2, input2_index,
                                      res, res_index)
    if input_shape == ((5, 128, 128), (5, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 30, block_num=30) as block_idx:
            with tik_instance.for_range(0, 11) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as thread_idx:
                    with tik_instance.if_scope(((((block_idx % 6) * 22) + (cc1_db * 2) + thread_idx) < 128)):
                        input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB",
                                                               scope=tik.scope_ubuf)
                        t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB",
                                                             scope=tik.scope_ubuf)
                        tik_instance.data_move(input_1_local_UB, input1[
                            (block_idx // 6) * 16384 + (block_idx % 6) * 2816 + cc1_db * 256 + thread_idx * 128], 0, 1,
                                               16, 0, 0)
                        with tik_instance.for_range(0, 2) as vec_i:
                            tik_instance.vadds(64, t_1_0_local_UB[vec_i * 64], input_1_local_UB[vec_i * 64], 0,
                                               64, 1, 1, 16, 0)
                        with tik_instance.for_range(0, 2, thread_num=2) as thread_idx2:
                            input_2_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="input_2_local_UB",
                                                                   scope=tik.scope_ubuf)
                            t_1_local_UB = input_2_local_UB
                            bisec_last_axis_local_UB = input_2_local_UB
                            matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [64],
                                                                             name="matmul_hybrid_f_t_local_UB",
                                                                             scope=tik.scope_ubuf)
                            matmul_hybrid_f_t_local_UB_dst_tmp = tik_instance.Tensor(dtype, [64],
                                                                                     name="matmul_hybrid_f_t_local_UB_dst_tmp",
                                                                                     scope=tik.scope_ubuf)
                            tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB, 0, 1, 1, 8)
                            tik_instance.data_move(input_2_local_UB,
                                                   input2[(block_idx // 6) * 16384 + thread_idx2 * 8192], 0, 1,
                                                   1024, 0, 0)
                            tik_instance.vmul(64, t_1_local_UB, t_1_0_local_UB, input_2_local_UB, 128, 1, 1, 1, 8, 8, 8)
                            tik_instance.vadd(64, bisec_last_axis_local_UB, t_1_local_UB, t_1_local_UB[64], 64, 1, 1, 1,
                                              16, 16, 16)
                            tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB_dst_tmp, 0, 1, 1, 8)
                            with tik_instance.for_range(0, 64) as cc6:
                                tik_instance.vcadd(64, matmul_hybrid_f_t_local_UB_dst_tmp[cc6],
                                                   bisec_last_axis_local_UB[cc6 * 128],
                                                   1, 1, 1, 8)
                            tik_instance.vadd(64, matmul_hybrid_f_t_local_UB, matmul_hybrid_f_t_local_UB_dst_tmp,
                                              matmul_hybrid_f_t_local_UB, 1, 1, 1, 1, 8, 8, 8)
                            tik_instance.data_move(
                                res[(block_idx // 6) * 16384 + (block_idx % 6) * 2816 + cc1_db * 256 +
                                    thread_idx * 128 + thread_idx2 * 64],
                                matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 0)

    if input_shape == ((18, 128, 128), (18, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 18, block_num=18) as block_idx:
            with tik_instance.for_range(0, 128, thread_num=2) as cc0:
                input1_index = block_idx * 16384 + cc0 * 128
                input2_index = block_idx * 16384
                res_index = block_idx * 16384 + cc0 * 128
                _inner_matmul_new(tik_instance, dtype,
                                  input1, input1_index,
                                  input2, input2_index,
                                  res, res_index)

    if input_shape == ((9, 128, 128), (9, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 27, block_num=27) as block_idx:
            with tik_instance.for_range(0, 42, thread_num=2) as cc0:
                input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128
                input2_index = (block_idx // 3) * 16384
                res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128
                _inner_matmul_new(tik_instance, dtype,
                                  input1, input1_index,
                                  input2, input2_index,
                                  res, res_index)
            # 128 = 3 * 42 + 2: the first two of the three groups each handle one leftover row
            with tik_instance.if_scope((block_idx % 3) < 2):
                input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128
                input2_index = (block_idx // 3) * 16384
                res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128
                _inner_matmul_new(tik_instance, dtype,
                                  input1, input1_index,
                                  input2, input2_index,
                                  res, res_index)

    if input_shape == ((1, 64, 64), (1, 64, 64), "float32", False, True):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 2, thread_num=2) as cc0:
                input1_index = block_idx * 128 + cc0 * 64
                input2_index = 0
                res_index = block_idx * 128 + cc0 * 64
                _inner_matmul_new_1_64_32_64(tik_instance, dtype,
                                             input1, input1_index,
                                             input2, input2_index,
                                             res, res_index)

    input_shape_list = [((1, 128, 128), (1, 128, 128), "float32", False, True),
                        ((2, 128, 128), (2, 128, 128), "float32", False, True),
                        ((4, 128, 128), (4, 128, 128), "float32", False, True),
                        ((8, 128, 128), (8, 128, 128), "float32", False, True),
                        ((16, 128, 128), (16, 128, 128), "float32", False, True)
                        ]
    if input_shape in input_shape_list:
        block_num = 32
        input1_unit_size = 128
        input2_unint_size = 128 * 128
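        # each of the 32 cores handles a contiguous slice of (batch * m * k) // 32 output
        # elements, 128 at a time; input2_index picks the 128x128 right-matrix block of the
        # batch that the current slice belongs to (block 0 when there is a single batch).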
        with tik_instance.for_range(0, block_num, block_num=block_num) as block_idx:
            block_process_ele_num = (batch * m * k) // block_num
            loop_time = (batch * m * k) // block_num // input1_unit_size
            thread_num = 2
            with tik_instance.for_range(0, loop_time, thread_num=thread_num) as cc0:
                input1_index = block_idx * block_process_ele_num + cc0 * input1_unit_size
                if batch > 1:
                    input2_index = block_idx // (block_num // batch) * input2_unint_size
                else:
                    input2_index = 0
                res_index = block_idx * block_process_ele_num + cc0 * input1_unit_size
                _inner_matmul_new(tik_instance, dtype,
                                  input1, input1_index,
                                  input2, input2_index,
                                  res, res_index)

    tik_instance.BuildCCE(kernel_name, inputs=[input1, input2], outputs=[res])
    return tik_instance
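
The supported shapes in this example all use transpose_a=False and transpose_b=True, so every branch ultimately computes res[b] = x1[b] @ x2[b].T through flattened GM offsets. A minimal NumPy sketch of that contract (my own reference check with illustrative shapes, not the TIK implementation itself):

import numpy as np

batch, m, k = 2, 128, 128
x1 = np.random.rand(batch, m, k).astype(np.float32)
x2 = np.random.rand(batch, k, k).astype(np.float32)

# transpose_a=False, transpose_b=True: res[b] = x1[b] @ x2[b].T
expected = np.einsum("bik,bjk->bij", x1, x2)

# the kernels above walk the same data through flattened offsets, one 128-element row at a time
flat1, flat2 = x1.reshape(-1), x2.reshape(-1)
res = np.empty(batch * m * k, dtype=np.float32)
for b in range(batch):
    block = flat2[b * k * k:(b + 1) * k * k].reshape(k, k)
    for i in range(m):
        row = flat1[b * m * k + i * k:b * m * k + (i + 1) * k]
        res[b * m * k + i * k:b * m * k + (i + 1) * k] = block @ row
assert np.allclose(res.reshape(batch, m, k), expected, rtol=1e-4, atol=1e-4)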
Example #7
0
def CusMatMulCubeFraczLeftCast(input_x1,
                               input_x2,
                               bias=None,
                               output_y={},
                               trans_a=False,
                               trans_b=False,
                               kernel_name="CusMatMulCubeFraczLeftCast"):
    """
    calculating matrix multiplication with bias, C = A*B + bias; supports input
    data in fractal format.

    Parameters:
    shape_a: list or tuple
            Shape of the first tensor a with rank > 1
    shape_b:  list or tuple
            Shape of the second tensor b with the same type with a,
            and shape_a, shape_b must be 2 dims
    src_dtype: str
            The data type of input, support "float32", "float16"
    dst_dtype: str
            The data type of output, support "float32", "float16"
    trans_a: bool
            If True, shape_a is transposed before multiplication
    trans_b: bool
            If True, shape_b is transposed before multiplication
    is_fractal: bool
            If True, the input data format of a and b must be fractal format
    shape_bias: list or tuple
            Shape of bias, only support the input data format with ND

    Returns
    -------
    None
    """
    shape_a = input_x1.get("ori_shape")
    shape_b = input_x2.get("ori_shape")
    print("============")
    print(input_x1.get("format"), input_x2.get("format"))
    print(shape_a, shape_b)
    print("============")
    if input_x2.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_b
        c0 = 16
        c1 = c // c0
        if c1 == 0:
            c1 = 1
        shape_b = [n, c1 * h * w * c0]
        shape_a = [n, n]

    if input_x1.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_a
        c0 = 16
        c1 = c // c0
        if c1 == 0:
            c1 = 1
        shape_a = [n, c1 * h * w * c0]
        shape_b = [c1 * h * w * c0, c1 * h * w * c0]

    if input_x2.get("format") == "FRACTAL_NZ":
        shape_a = [shape_b[0], shape_b[0]]

    if input_x1.get("format") == "FRACTAL_NZ":
        shape_b = [shape_a[1], shape_a[1]]

    shape_a = list(shape_a)
    shape_b = list(shape_b)

    shape_a = _get_input_shape(shape_a)
    shape_b = _get_input_shape(shape_b)

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_a)
    util.check_shape_rule(shape_b)
    util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT)

    # swap the two axes of each shape and flip the corresponding transpose flag,
    # which describes the same matrix layout
    shape_a = [shape_a[1], shape_a[0]]
    trans_a = bool(1 - trans_a)

    shape_b = [shape_b[1], shape_b[0]]
    trans_b = bool(1 - trans_b)

    shape_bias = ()
    if bias is not None and bool(bias):
        shape_bias = bias.get("shape")
        shape_bias = list(shape_bias)
        shape_bias = _get_bias(shape_bias)

    src_dtype = input_x1.get("dtype").lower()
    _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b)

    m_shape = shape_a[len(shape_a) - 2]
    km_shape = shape_a[len(shape_a) - 1]
    kn_shape = shape_b[len(shape_b) - 2]
    n_shape = shape_b[len(shape_b) - 1]

    # block_reduce is only defined for float16 input (it is used unconditionally below)
    if src_dtype == "float16":
        block_reduce = cce.BLOCK_REDUCE

    block_in = cce.BLOCK_IN
    block_out = cce.BLOCK_OUT

    if trans_a and km_shape == 1:
        block_in = cce.BLOCK_VECTOR

    if not trans_a and m_shape == 1:
        block_in = cce.BLOCK_VECTOR

    if trans_b and kn_shape == 1:
        block_out = cce.BLOCK_VECTOR

    if not trans_b and n_shape == 1:
        block_out = cce.BLOCK_VECTOR

    if trans_a:
        shape_a_temp = (m_shape // block_reduce, km_shape // block_in,
                        block_reduce, block_in)
    else:
        shape_a_temp = (m_shape // block_in, km_shape // block_reduce,
                        block_in, block_reduce)

    if trans_b:
        shape_b_temp = (kn_shape // block_out, n_shape // block_reduce,
                        block_reduce, block_out)
    else:
        shape_b_temp = (kn_shape // block_reduce, n_shape // block_out,
                        block_out, block_reduce)

    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
        tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
    input_x1 = tik_instance.Tensor(input_x1.get("dtype"),
                                   shape_a_temp,
                                   name="left_matrix",
                                   scope=tik.scope_gm)
    input_x2 = tik_instance.Tensor(input_x2.get("dtype"),
                                   shape_b_temp,
                                   name="right_matrix",
                                   scope=tik.scope_gm)
    res_matmul = tik_instance.Tensor(output_y.get("dtype"),
                                     output_y.get("shape"),
                                     name="output",
                                     scope=tik.scope_gm)
    DIAG_SIZE = 128
    mo_tile, ko_tile, no_tile, diag_opt = get_cus_tile_info(
        input_x1, input_x2, DIAG_SIZE)
    cus_cube_matmul_cast(tik_instance,
                         input_x1,
                         trans_a,
                         input_x2,
                         trans_b,
                         res_matmul,
                         mo_tile=mo_tile,
                         ko_tile=ko_tile,
                         no_tile=no_tile,
                         diag_opt=diag_opt,
                         diag_size=DIAG_SIZE)
    tik_instance.BuildCCE(kernel_name=kernel_name,
                          inputs=[input_x1, input_x2],
                          outputs=[res_matmul])
    return tik_instance
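
The shape_a_temp / shape_b_temp computation above packs each 2-D matrix into the 16x16 cube (fractal) layout. A small illustrative sketch of that tiling rule; fractal_shape is a hypothetical helper of mine, and the block constants are assumed to be the usual value of 16:

# assumed cube block sizes (cce.BLOCK_IN / BLOCK_OUT / BLOCK_REDUCE are normally 16)
BLOCK_IN = BLOCK_REDUCE = 16

def fractal_shape(rows, cols, trans):
    # mirrors shape_a_temp above: a transposed matrix swaps which axis is
    # split by block_in and which by block_reduce
    if trans:
        return (rows // BLOCK_REDUCE, cols // BLOCK_IN, BLOCK_REDUCE, BLOCK_IN)
    return (rows // BLOCK_IN, cols // BLOCK_REDUCE, BLOCK_IN, BLOCK_REDUCE)

# a 256x512 float16 matrix becomes 16x32 tiles of 16x16 elements
print(fractal_shape(256, 512, trans=False))  # (16, 32, 16, 16)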
Example #8
0
def fake_quant_with_min_max_vars_gradient_compute(gradients, x, min,
                                                  max, backprops_wrt_x,
                                                  backprop_wrt_min,
                                                  backprop_wrt_max,
                                                  num_bits, narrow_range,
                                                  kernel_name="fake_quant_"
                                                              "with_min_max"
                                                              "_vars_gradient"):
    """
    Compute gradients for a FakeQuantWithMinMaxVars operation.

    Parameters
    ----------
    gradients: tvm.tensor
        input tensor has shape and dtype attributes
    x: tvm.tensor
        input tensor has shape and dtype attributes
    min: tvm.tensor
        input tensor holding the minimum of the quantization range
    max: tvm.tensor
        input tensor holding the maximum of the quantization range
    backprops_wrt_x: tvm.tensor
        output tensor has shape and dtype attributes
    backprop_wrt_min: tvm.tensor
        output tensor has shape and dtype attributes
    backprop_wrt_max: tvm.tensor
        output tensor has shape and dtype attributes
    num_bits: int
        the bitwidth of the quantization, between 2 and 16
    narrow_range: bool
        whether to quantize into 2^num_bits - 1 distinct values
    kernel_name: str
        cce kernel name, default value is "fake_quant_with_min_max_vars_gradient"

    Returns
    -------
    output_list: list of tvm.tensor
        the calculation results: [backprops_wrt_x, backprop_wrt_min, backprop_wrt_max]
    """
    input_shape = te.lang.cce.util.shape_to_list(x.shape)
    dtype = x.dtype

    min_broadcast = te.lang.cce.broadcast(min, input_shape, dtype)
    max_broadcast = te.lang.cce.broadcast(max, input_shape, dtype)
    nudged_min, nudged_max = _nudged_min_max_compute(min_broadcast,
                                                     max_broadcast, num_bits,
                                                     narrow_range)
    nudged_min_backup = te.lang.cce.vadds(nudged_min, tvm.const(0, D_TYPE))
    nudged_max_backup = te.lang.cce.vadds(nudged_max, tvm.const(0, D_TYPE))

    between_nudged_min_max = _between_nudged_min_max_compute(x, nudged_min,
                                                             nudged_max)
    wrt_input_tensor = te.lang.cce.vmul(between_nudged_min_max, gradients)
    shape_list = []
    for i, _ in enumerate(input_shape):
        shape_list.append(i)

    bool_below_min = _less_compare_float32(x, nudged_min_backup)
    below_min_data = te.lang.cce.vmul(bool_below_min, gradients)

    bool_below_max = _less_compare_float32(nudged_max_backup, x)
    below_max_data = te.lang.cce.vmul(bool_below_max, gradients)

    # handle the case where min and max are both zero
    tensor_one = te.lang.cce.broadcast(1, input_shape, dtype)
    bool_both_no_zero = _both_min_max_zero(min, max, input_shape, dtype)

    bool_both_no_zero_reverse = te.lang.cce.vsub(tensor_one, bool_both_no_zero)
    bool_both_no_zero_broad = te.lang.cce.broadcast(bool_both_no_zero,
                                                    input_shape, dtype)
    bool_both_no_zero_reverse = te.lang.cce.broadcast(bool_both_no_zero_reverse,
                                                      input_shape, dtype)

    wrt_input_weight = te.lang.cce.vmul(wrt_input_tensor,
                                        bool_both_no_zero_broad)
    gradients_weight = te.lang.cce.vmul(gradients, bool_both_no_zero_reverse)
    backprops_wrt_x = te.lang.cce.vadd(wrt_input_weight, gradients_weight)

    # cloud version: optimization that eliminates the workspace by using atomic reduction
    if util.get_product_version() == util.VERSION_CLOUD:
        # insert temp nodes so that the last vadd node becomes a mid output tensor, eliminating the workspace
        temp_insert_node_mul = te.lang.cce.vmuls(backprops_wrt_x,
                                                 tvm.const(0, D_TYPE))
        temp_insert_node_add = te.lang.cce.vadd(temp_insert_node_mul,
                                                below_min_data)
        below_min_data_tensor = te.lang.cce.vmul(temp_insert_node_add,
                                                 bool_both_no_zero)
        below_max_data_tensor = te.lang.cce.vmul(below_max_data,
                                                 bool_both_no_zero)
        backprop_wrt_min_max_list = te.lang.cce.tuple_sum(
            [below_min_data_tensor,
             below_max_data_tensor],
            axis=shape_list)
        output_list = [backprops_wrt_x] + list(backprop_wrt_min_max_list)

    else:
        below_min_data_tensor = te.lang.cce.vmul(below_min_data,
                                                 bool_both_no_zero)
        below_max_data_tensor = te.lang.cce.vmul(below_max_data,
                                                 bool_both_no_zero)
        backprop_wrt_min = te.lang.cce.sum(below_min_data_tensor,
                                           axis=shape_list)
        backprop_wrt_max = te.lang.cce.sum(below_max_data_tensor,
                                           axis=shape_list)
        output_list = [backprops_wrt_x, backprop_wrt_min, backprop_wrt_max]

    return output_list
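
The mask-and-sum pattern above follows the standard FakeQuantWithMinMaxVars gradient rule: the incoming gradient passes through to x only where x lies inside [nudged_min, nudged_max], while gradients of elements clipped below (above) the range are accumulated into the min (max) gradient. A simplified NumPy sketch of that rule, leaving out the nudging math and the all-zero min/max special case handled above:

import numpy as np

def fake_quant_grad_reference(gradients, x, nudged_min, nudged_max):
    # gradient w.r.t. x: pass-through only inside the (nudged) quantization range
    inside = (x >= nudged_min) & (x <= nudged_max)
    d_x = gradients * inside
    # gradient w.r.t. min/max: sum of the gradients over the clipped elements
    d_min = np.sum(gradients * (x < nudged_min))
    d_max = np.sum(gradients * (x > nudged_max))
    return d_x, d_min, d_max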
Example #9
0
def CusMatrixCombine(input_x, output, kernel_name="matrix_combine"):
    """CusMatrixCombine"""
    input_x_shape = input_x.get("shape")
    output_shape = output.get("shape")
    split_dim = 128

    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
        tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))

    input_x = tik_instance.Tensor("float32",
                                  input_x_shape,
                                  name="input_x",
                                  scope=tik.scope_gm)
    res = tik_instance.Tensor("float32",
                              output_shape,
                              name="res",
                              scope=tik.scope_gm)

    blocks = 32
    matrix_dim = input_x_shape[0] * input_x_shape[1]
    if input_x_shape[0] == 1 and input_x_shape[1] == 64:
        tiling_dim = 2
        bs = 1
        with tik_instance.for_range(0, blocks,
                                    block_num=blocks) as block_index:
            input_x_ub = tik_instance.Tensor("float32",
                                             (tiling_dim, matrix_dim),
                                             name="input_x_ub",
                                             scope=tik.scope_ubuf)
            tik_instance.data_move(input_x_ub,
                                   input_x[0, block_index * tiling_dim,
                                           0], 0, 1, 16, 0, 0)
            tik_instance.data_move(res[block_index * tiling_dim, 0],
                                   input_x_ub, 0, 1, 16, 0, 0)
    else:
        tiling_dim = 4
        bs = input_x_shape[0]
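        # general case: 32 cores each copy tiling_dim = 4 rows of every 128-row block; the UB
        # tile is zero-filled first so that everything outside the diagonal block stays zero.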
        with tik_instance.for_range(0, blocks,
                                    block_num=blocks) as block_index:
            input_x_ub = tik_instance.Tensor("float32",
                                             (tiling_dim, matrix_dim),
                                             name="input_x_ub",
                                             scope=tik.scope_ubuf)
            zero = tik_instance.Scalar("float32")
            zero.set_as(0.0)
            with tik_instance.for_range(0, bs) as i:
                repeat_real = tiling_dim * matrix_dim // 64
                if repeat_real <= 255:
                    tik_instance.vector_dup(64, input_x_ub, zero, repeat_real,
                                            1, 8)
                else:
                    repeat_1 = 255
                    repeat_2 = repeat_real - 255
                    tik_instance.vector_dup(64, input_x_ub, zero, repeat_1, 1,
                                            8)
                    tik_instance.vector_dup(64, input_x_ub[255 * 64], zero,
                                            repeat_2, 1, 8)
                with tik_instance.for_range(0, tiling_dim) as j:
                    tik_instance.data_move(
                        input_x_ub[j, split_dim * i],
                        input_x[i, block_index * tiling_dim + j,
                                0], 0, 1, 16, 0, 0)
                tik_instance.data_move(
                    res[i * split_dim + block_index * tiling_dim, 0],
                    input_x_ub, 0, 1, tiling_dim * matrix_dim * 4 // 32, 0, 0)
    tik_instance.BuildCCE(kernel_name=kernel_name,
                          inputs=[input_x],
                          outputs=[res])
    return tik_instance
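
Functionally, this kernel assembles the bs square blocks in input_x into one large block-diagonal matrix (the (1, 64, 64) case is a plain copy). A NumPy sketch of the same result, assuming input_x holds bs blocks of size split_dim x split_dim:

import numpy as np

def matrix_combine_reference(blocks):
    # blocks: (bs, d, d) -> (bs * d, bs * d) with each block placed on the diagonal
    bs, d, _ = blocks.shape
    out = np.zeros((bs * d, bs * d), dtype=blocks.dtype)
    for i in range(bs):
        out[i * d:(i + 1) * d, i * d:(i + 1) * d] = blocks[i]
    return out

# e.g. (4, 128, 128) input blocks yield a (512, 512) block-diagonal output
print(matrix_combine_reference(np.ones((4, 128, 128), dtype=np.float32)).shape)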
Example #10
0
def CusBatchMatMul(input_x1,
                   input_x2,
                   output,
                   transpose_a=False,
                   transpose_b=True,
                   kernel_name="batchmatmul"):
    """CusBatchMatMul"""
    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
        tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
    x1_shape = input_x1.get("shape")
    dtype = input_x1.get("dtype").lower()
    x2_shape = input_x2.get("shape")
    if dtype != input_x2.get("dtype").lower():
        raise RuntimeError(
            "dtype of input_x1 and input_x2 must be same, but got %s vs %s" %
            (dtype, input_x2.get("dtype").lower()))
    input_shape = (tuple(x1_shape), tuple(x2_shape), dtype, transpose_a,
                   transpose_b)
    support_shape = [((8, 128, 128), (8, 128, 128), "float32", False, True),
                     ((36, 128, 128), (36, 128, 128), "float32", False, True),
                     ((5, 128, 128), (5, 128, 128), "float32", False, True),
                     ((18, 128, 128), (18, 128, 128), "float32", False, True),
                     ((16, 128, 128), (16, 128, 128), "float32", False, True),
                     ((9, 128, 128), (9, 128, 128), "float32", False, True),
                     ((1, 64, 64), (1, 64, 64), "float32", False, True),
                     ((1, 128, 128), (1, 128, 128), "float32", False, True),
                     ((4, 128, 128), (4, 128, 128), "float32", False, True),
                     ((2, 128, 128), (2, 128, 128), "float32", False, True),
                     ((6, 128, 128), (6, 128, 128), "float32", False, True),
                     ((24, 128, 128), (24, 128, 128), "float32", False, True),
                     ((32, 128, 128), (32, 128, 128), 'float32', False, True)]
    if input_shape not in support_shape:
        raise RuntimeError("input_shape %s is not supported" %
                           str(input_shape))

    # the supported shapes all use transpose_a=False and transpose_b=True
    batch, m, k = x1_shape

    input1_shape = _get_flattern_shape(x1_shape)
    input1 = tik_instance.Tensor(dtype,
                                 input1_shape,
                                 name="input1",
                                 scope=tik.scope_gm)
    input2_shape = _get_flattern_shape(x2_shape)
    input2 = tik_instance.Tensor(dtype,
                                 input2_shape,
                                 name="input2",
                                 scope=tik.scope_gm)

    output_shape = x1_shape
    res_shape = _get_flattern_shape(output_shape)
    res = tik_instance.Tensor(dtype, res_shape, name="res", scope=tik.scope_gm)

    if input_shape == ((36, 128, 128), (36, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 18, block_num=18) as block_idx:
            with tik_instance.for_range(0, 2) as cc0:
                with tik_instance.for_range(0, 128, thread_num=2) as cc1:
                    input1_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128
                    input2_index = block_idx * 32768 + cc0 * 16384
                    res_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128
                    _inner_matmul_new(tik_instance, dtype, input1,
                                      input1_index, input2, input2_index, res,
                                      res_index)

    process_input_shape_640(input_shape, tik_instance, dtype, input1, input2,
                            res)

    if input_shape == ((18, 128, 128), (18, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 18, block_num=18) as block_idx:
            with tik_instance.for_range(0, 128, thread_num=2) as cc0:
                input1_index = block_idx * 16384 + cc0 * 128
                input2_index = block_idx * 16384
                res_index = block_idx * 16384 + cc0 * 128
                _inner_matmul_new(tik_instance, dtype, input1, input1_index,
                                  input2, input2_index, res, res_index)

    process_input_shape_1152(input_shape, tik_instance, dtype, input1, input2,
                             res)

    if input_shape == ((1, 64, 64), (1, 64, 64), "float32", False, True):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 2, thread_num=2) as cc0:
                input1_index = block_idx * 128 + cc0 * 64
                input2_index = 0
                res_index = block_idx * 128 + cc0 * 64
                _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1,
                                             input1_index, input2,
                                             input2_index, res, res_index)

    input_shape_list = [
        ((1, 128, 128), (1, 128, 128), "float32", False, True),
        ((2, 128, 128), (2, 128, 128), "float32", False, True),
        ((4, 128, 128), (4, 128, 128), "float32", False, True),
        ((6, 128, 128), (6, 128, 128), "float32", False, True),
        ((8, 128, 128), (8, 128, 128), "float32", False, True),
        ((16, 128, 128), (16, 128, 128), "float32", False, True),
        ((24, 128, 128), (24, 128, 128), "float32", False, True),
        ((32, 128, 128), (32, 128, 128), 'float32', False, True)
    ]
    if input_shape in input_shape_list:
        block_num = 32
        input1_unit_size = 128
        input2_unint_size = 128 * 128
        with tik_instance.for_range(0, block_num,
                                    block_num=block_num) as block_idx:
            block_process_ele_num = (batch * m * k) // block_num
            loop_time = (batch * m * k) // block_num // input1_unit_size
            thread_num = 2
            with tik_instance.for_range(0, loop_time,
                                        thread_num=thread_num) as cc0:
                input1_index = block_idx * block_process_ele_num + cc0 * input1_unit_size
                if batch > 1:
                    input2_index = block_idx // (block_num //
                                                 batch) * input2_unint_size
                else:
                    input2_index = 0
                res_index = block_idx * block_process_ele_num + cc0 * input1_unit_size
                _inner_matmul_new(tik_instance, dtype, input1, input1_index,
                                  input2, input2_index, res, res_index)

    tik_instance.BuildCCE(kernel_name, inputs=[input1, input2], outputs=[res])
    return tik_instance
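
Like the other kernels on this page, CusBatchMatMul takes dict-style tensor descriptors with "shape" and "dtype" keys. A minimal invocation sketch for one of the supported shapes (the descriptor values and kernel name are illustrative):

x1 = {"shape": (8, 128, 128), "dtype": "float32"}
x2 = {"shape": (8, 128, 128), "dtype": "float32"}
out = {"shape": (8, 128, 128), "dtype": "float32"}

# builds the TIK kernel computing res[b] = x1[b] @ x2[b].T for every batch b
tik_instance = CusBatchMatMul(x1, x2, out,
                              transpose_a=False, transpose_b=True,
                              kernel_name="batchmatmul_8_128_128")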