# Example 1
def tc_gen_models_Kernels(each_configuration, idx_count, opt_print=0):
    """Mark whether external/internal tiles fully cover the problem size.

    For every (index, tile) pair in ``list_tile_sizes``, check that the tile
    evenly divides the representative problem size of that index.  Indices
    found in ``list_TB_K`` count as internal, all others as external.  The
    results are stored on ``each_configuration`` as ``kernel_full_ext`` and
    ``kernel_full_int`` (True only when every tile divides evenly).
    """
    if opt_print == 1:
        print ("each_config.representative_problem_size: ", each_configuration.list_representative_problem_size)
        print ("each_config.tile_sizes: ", each_configuration.list_tile_sizes)

    full_ext = True
    full_int = True

    for each_pair in each_configuration.list_tile_sizes:
        idx_name = each_pair[0]
        idx_tile = each_pair[1]

        # Internal index (mapped on TB_K) or external index?
        is_internal = tc_helper.tc_gen_helper_find_1d(
            each_configuration.list_TB_K, idx_name) != -1
        remainder = tc_helper.tc_gen_helper_find(
            each_configuration.list_representative_problem_size, idx_name) % idx_tile

        if remainder != 0:
            if is_internal:
                full_int = False
            else:
                full_ext = False

    if opt_print == 1:
        print (">>> opt_full_int: ", full_int, ", opt_full_ext: ", full_ext)

    each_configuration.kernel_full_ext = full_ext
    each_configuration.kernel_full_int = full_int
def tc_gen_code_Kernel_Load_Inputs_Boundary_External_REG(
        opt_gen_ext, opt_axis, l_tile_sizes, l_mapping_reg):
    """Return the boundary expression for an external index mapped on a register.

    ``opt_axis`` selects the register index: 0 -> x (``l_mapping_reg[0]``),
    any other value -> y (``l_mapping_reg[1]``).  When ``opt_gen_ext == 1``
    the runtime range variable ("rng_<idx>") is emitted; otherwise the
    static tile size looked up in ``l_tile_sizes`` is emitted.
    """
    print("[Code Generator][Load][Input][Boundary][External][REG]")

    # Register index for the requested axis.
    reg_idx = l_mapping_reg[0] if opt_axis == 0 else l_mapping_reg[1]

    # Partial (external) tile -> runtime range; full tile -> static size.
    if opt_gen_ext == 1:
        return "rng_" + reg_idx
    return str(tc_helper.tc_gen_helper_find(l_tile_sizes, reg_idx))
# Example 3
def tc_gen_models_Computes(each_configuration, idx_count, opt_print=0):
    """Store the kernel's arithmetic intensity on the configuration.

    With RX = product of the tile sizes of ``list_REG_X`` and RY = product
    over ``list_REG_Y``, the intensity is RX*RY / (RX + RY), written to
    ``each_configuration.kernel_arithmetic_intensity``.
    """
    if opt_print == 1:
        print ("[Model][Computes]")

    # Product of register-tile extents along each axis.
    reg_x = 1
    for idx in each_configuration.list_REG_X:
        reg_x *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, idx)

    reg_y = 1
    for idx in each_configuration.list_REG_Y:
        reg_y *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, idx)

    each_configuration.kernel_arithmetic_intensity = (reg_x * reg_y) / (reg_x + reg_y)
def tc_gen_code_Kernel_Load_Inputs_Boundary_Exteranl_TB(
        opt_load_ext_int, l_input_tensor, l_mapping_reg, l_internal_idx,
        l_info_matching_indices):
    """Build the '&&'-joined boundary check for external indices mapped on a TB.

    For each index of ``l_input_tensor`` that is neither internal nor mapped
    on a register, emit "<thread-idx> < rng_<idx>", where the thread index is
    "idx_<match>" from ``l_info_matching_indices`` (or the literal "0" when
    the matching entry is the string "0").  ``opt_load_ext_int`` is part of
    the call interface but is not used here.
    """
    print("[Code Generator][Load][Input][Boundary][External][TB]")
    print(
        "[Code Generator][Load][Input][Boundary][External][TB] l_input_tensor: ",
        l_input_tensor)

    conditions = list()
    for each_idx in l_input_tensor:
        # Internal indices and register-mapped indices need no TB boundary check.
        if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) != -1:
            continue
        if tc_helper.tc_gen_helper_find_1d(l_mapping_reg, each_idx) != -1:
            continue

        matched = tc_helper.tc_gen_helper_find(l_info_matching_indices, each_idx)
        specific = "0" if matched == "0" else "idx_" + matched
        conditions.append(specific + " < " + "rng_" + each_idx)

    return " && ".join(conditions)
# Example 5
def tc_gen_code_pre_BasicBlock_Ranges(f, l_t3_idx, l_t3_slices):
    """Emit host-side loops that fill each blk_<idx>_range[] array.

    Every block of index <idx> is assigned the full tile size found in
    ``l_t3_slices``; when the remainder ``rng_boundary_<idx>`` is non-zero
    the last block is overridden with that remainder.
    """
    for t3_idx in l_t3_idx:
        tile = tc_helper.tc_gen_helper_find(l_t3_slices, t3_idx)
        emitted = (
            "\tfor (int i = 0; i < n_blk_" + t3_idx + "; i++)\n",
            "\t{\n",
            "\t\tblk_" + t3_idx + "_range[i] = " + str(tile) + ";\n",
            "\t\tif (rng_boundary_" + t3_idx + " != 0 && i == n_blk_" + t3_idx + " - 1)\n",
            "\t\t{\n",
            "\t\t\tblk_" + t3_idx + "_range[i] = rng_boundary_" + t3_idx + ";\n",
            "\t\t}\n",
            "\t}\n",
        )
        for each_line in emitted:
            f.write(each_line)
    f.write("\n")
# Example 6
def tc_gen_code_pre_BasicBlock_Initial(f, l_t3_idx, l_t3_slices, idx_kernel):
    """Emit host-side declarations for tiling the output (t3) indices.

    Per index this writes: the block count (CEIL of size over tile), a
    per-block range array, the full-tile count (kept only for debugging)
    and the boundary remainder.  ``idx_kernel`` belongs to the call
    interface but is not used in this function.
    """
    # Number of blocks per t3 index.
    for t3_idx in l_t3_idx:
        cap = t3_idx.capitalize()
        tile = str(tc_helper.tc_gen_helper_find(l_t3_slices, t3_idx))
        f.write("\tint n_blk_" + t3_idx + " = CEIL(SIZE_IDX_" + cap + ", " + tile + ");\n")
    f.write("\n")

    # Per-block range array for each index.
    for t3_idx in l_t3_idx:
        f.write("\tint blk_" + t3_idx + "_range[n_blk_" + t3_idx + "];\n")
    f.write("\n")

    # Number of full tiles per index (debug aid only).
    for t3_idx in l_t3_idx:
        cap = t3_idx.capitalize()
        tile = str(tc_helper.tc_gen_helper_find(l_t3_slices, t3_idx))
        f.write("\tint n_blk_full_" + t3_idx + " = SIZE_IDX_" + cap + " / " + tile + ";\n")
    f.write("\n")

    # Remainder of the last (partial) tile per index.
    for t3_idx in l_t3_idx:
        cap = t3_idx.capitalize()
        tile = str(tc_helper.tc_gen_helper_find(l_t3_slices, t3_idx))
        f.write("\tint rng_boundary_" + t3_idx + " = SIZE_IDX_" + cap + " % " + tile + ";\n")
    f.write("\n")
# Example 7
def tc_gen_code_Kernel_Load_Checking_Boundary(f, l_blk_boundary_rng,
                                              tensor_contraction):
    """Collect boundary-range products for the left and right inputs.

    For each input's index list (``tensor_contraction[i][4]``), multiply the
    ranges found in ``l_blk_boundary_rng`` and remember which indices
    contributed.  Returns (upper_left, upper_right, l_left, l_right).
    ``f`` is accepted for interface symmetry but is not written to here.
    """
    print("l_blk_boundary_rng: ", l_blk_boundary_rng)

    def _accumulate(idx_list):
        # One pass over an input's indices: fold every present range
        # into the running product and record the index.
        upper = 1
        picked = list()
        for idx in idx_list:
            rng = tc_helper.tc_gen_helper_find(l_blk_boundary_rng, idx)
            if rng != -1:
                upper = upper * rng
                picked.append(idx)
        return upper, picked

    upper_left, l_left = _accumulate(tensor_contraction[0][4])
    upper_right, l_right = _accumulate(tensor_contraction[1][4])

    return upper_left, upper_right, l_left, l_right
# Example 8
def tc_gen_definition_strides_output(f, l_idx_size, l_t3_idx):
    """Emit #define'd strides for the output tensor t3.

    The fastest-varying (first) index gets stride 1; every later index gets
    stride(prev) * size(prev).  When the previous index has size 1 the
    stride chain collapses back to the literal "1".
    """
    f.write("// t3 for output\n")
    prev_size = 1
    prev_idx = ""
    for t3_idx in l_t3_idx:
        macro = "STR_SD2_T3_" + str(t3_idx.capitalize())
        if prev_size == 1:
            # First index, or previous dimension of extent 1.
            tc_gen_code_helper_define(f, macro, "1")
        else:
            tc_gen_code_helper_define(
                f, macro,
                "STR_SD2_T3_" + prev_idx.capitalize() + " * " + "SIZE_IDX_" +
                prev_idx.capitalize())
        prev_idx = t3_idx
        prev_size = tc_helper.tc_gen_helper_find(l_idx_size, t3_idx)
    f.write("\n")
# Example 9
def tc_gen_Inner_Group(l_outer_groups, tmp_count, tmp_config, opt_print, opt_data_type):
    """Split each outer-group of tensor contractions into inner-groups.

    An inner-group collects the tensor contractions of an outer-group that
    share one mapping/tiling choice and pass the fusion constraints checked
    below.  Returns ``(l_inner_groups, l_t3_interface_info)`` where each
    inner-group entry is::

        [0] mapping_tb, [1] mapping_2D, [2] mapping_reg,
        [3] tensor contractions, [4] tile sizes,
        [5] split info, [6] mapping_TB_K

    and each interface entry is::

        [0] all indices, [1] common output, [2] inputs,
        [3] kernel conditions, [4] register-transpose option, [5] split info

    NOTE(review): only the first outer-group is processed (see the ``break``
    at the bottom of the outer loop), and ``l_inner_groups`` /
    ``l_t3_interface_info`` are referenced after that loop — an empty
    ``l_outer_groups`` would raise NameError.  Confirm callers always pass a
    non-empty list.
    """
    #
    if opt_print == 1:
        print ("[Code Generator][Inner-Group] Working...")
        print ("====================== Step 2: Creating Inner-Groups =======================")
        print (" Only Support the First Outer-Group")

    #
    #   Permutations ([Assumption] Input: an arbitrary tensor contraction)
    #

    #
    #   Handling of 2D tensors lives in "tc_gen_permutations()".
    #

    #
    #   Cost-Model ([Version 1.0] DRAM Data Movement, [Version 2.0] Bank Conflicts ...)
    #
    list_configurations_outer_group = list()
    Configuration.get_configurations(l_outer_groups, list_configurations_outer_group, tmp_count, tmp_config, 0, opt_data_type)

    #
    #   Debug dump of every candidate configuration.
    #
    for each_config_outer_group in list_configurations_outer_group:
        each_config_outer_group.print_configuration(1)

    #
    #   (Temporary) Mapping and tile sizes are taken from the first
    #   configuration only; they should be determined by models in the future.
    #
    info_each_inner_group = Helper_Inputs.transform_config_innergroup(list_configurations_outer_group[0])

    #
    #   Every Outer-Group:
    #
    print ("[Code Generator][Inner-Group] # of Outer-Groups: ", len(l_outer_groups))
    for each_outer_group in l_outer_groups:
        #
        print ("[Code Generator][Inner-Group] # of Tensor Contractions (Candidates) within an Outer-Group: ", len(each_outer_group[1]))

        #
        #   Within an Outer-Group, there might be several Inner-Groups.
        #
        l_inner_groups              = list()
        l_each_group_mapping_tb     = list()
        l_each_group_mapping_2D     = list()
        l_each_group_mapping_reg    = list()
        l_t3_slices_size            = list()
        l_t3_interface_info         = list()
        l_t3_temp_inputs            = list()
        l_t3_temp_conditions        = list()

        #
        #   To create the "Interface": collect the inputs and a
        #   "cond_kernel_<n>" condition per tensor contraction; the common
        #   output name is taken from the first contraction.
        #
        idx_count           = 0
        str_common_output   = ""
        for each_tc in each_outer_group[1]:
            l_t3_temp_inputs.append([each_tc[4], each_tc[6]])
            l_t3_temp_conditions.append("cond_kernel_" + str(idx_count + 1))
            if idx_count == 0:
                str_common_output = each_tc[0]
            idx_count = idx_count + 1
        #
        #   Information: Split Indices
        #
        l_each_group_split_info = each_outer_group[3]

        #
        #   l_interface_info: [0] All Index, [1] Output, [2] Inputs, [3] Conditions, [4] Options
        #
        l_t3_interface_info.append([each_outer_group[2], str_common_output, l_t3_temp_inputs, l_t3_temp_conditions, "opt_register_transpose", l_each_group_split_info])

        #
        #   (Temporary)
        #                           [0]          [1]          [2]        [3]            [4]
        #   each_manual_group: Mapping_TB, Mapping_TB_2D, Mapping_Reg, Slices, Split-Info(Repre-size)
        #
        for each_manual_group in info_each_inner_group:
            #
            l_each_group_mapping_tb     = each_manual_group[0]
            l_each_group_mapping_2D     = each_manual_group[1]
            l_each_group_mapping_reg    = each_manual_group[2]
            l_t3_slices_size            = each_manual_group[3]
            l_info_split_ext            = each_manual_group[4]
            l_each_group_mapping_TB_K   = each_manual_group[5]
            l_tensor_contractions       = list()

            #
            print ("[Code Generator][Inner-Groups] Picked Tiles: ", l_t3_slices_size)

            #
            if opt_print == 1:
                print ("Target Outer-Group: ", each_outer_group[0])

            #
            #   Fusion-Constraint #2: expected shared-memory extents per axis,
            #   computed from the tile sizes of the 2D TB mapping plus the
            #   register index of each axis.
            #
            promissing_left     = 1
            promissing_right    = 1
            all_x_axis          = l_each_group_mapping_2D[0] + [l_each_group_mapping_reg[0]]
            all_y_axis          = l_each_group_mapping_2D[1] + [l_each_group_mapping_reg[1]]

            #
            #   X-Axis (Assumption: (Hypothetically) Left Input) including Output[0]
            #
            for each_idx in all_x_axis:
                promissing_left = promissing_left * tc_helper.tc_gen_helper_find(l_t3_slices_size, each_idx)

            #
            #   Y-Axis (Assumption: (Hypothetically) Right Input)
            #
            for each_idx in all_y_axis:
                promissing_right = promissing_right * tc_helper.tc_gen_helper_find(l_t3_slices_size, each_idx)

            print ("[Code Generator][Inner-Groups] Supposed Shared Memeory Lenghts: Left >>>", promissing_left, ", Right >>>", promissing_right)

            #
            #   Checking if All Tensor Contractions can be Fused or not.
            #
            l_picked_tc             = list()
            idx_tensor_contraction  = 0

            #   Checking if |info_each_inner_group| > |each_outer_group|,
            checking_used = 1
            if len(each_outer_group[1]) == 0:
                checking_used = -1

            #
            for each_tc in each_outer_group[1]:
                #
                #print (">> each_tc: ", each_tc)
                l_input_left = each_tc[5]
                l_input_right = each_tc[7]

                #
                #   Fusion-Constraint #1: Two indices for Register Tile should be on two different inputs.
                #
                #   LEFT
                idx_check_reg_left  = 0
                size_left           = 1
                for each_left_idx in l_input_left: #each_tc[7]:
                    #
                    #   Fusion-Constraint #1: count how many register-tile
                    #   indices appear in the left input.
                    #
                    if each_left_idx == l_each_group_mapping_reg[0]:
                        idx_check_reg_left = idx_check_reg_left + 1
                    if each_left_idx == l_each_group_mapping_reg[1]:
                        idx_check_reg_left = idx_check_reg_left + 1

                    #
                    #   Fusion-Constraint #2: shared-memory size used by the
                    #   left input (product of tiles of its outer-group indices).
                    #
                    #print ("[l] each_outer_group[0]: ", each_outer_group[0])
                    if tc_helper.tc_gen_helper_find_1d(each_outer_group[0], each_left_idx) != -1:
                        #print (">> ", each_left_idx)
                        size_left = size_left * tc_helper.tc_gen_helper_find(l_t3_slices_size, each_left_idx)

                #   RIGHT
                idx_check_reg_right = 0
                size_right          = 1
                for each_right_idx in l_input_right: #each_tc[5]:
                    #
                    #   Fusion-Constraint #1: count how many register-tile
                    #   indices appear in the right input.
                    #
                    if each_right_idx == l_each_group_mapping_reg[0]:
                        idx_check_reg_right = idx_check_reg_right + 1
                    if each_right_idx == l_each_group_mapping_reg[1]:
                        idx_check_reg_right = idx_check_reg_right + 1

                    #
                    #   Fusion-Constraint #2: shared-memory size used by the
                    #   right input.
                    #
                    #print ("[r] each_outer_group[0]: ", each_outer_group[0])
                    if tc_helper.tc_gen_helper_find_1d(each_outer_group[0], each_right_idx) != -1:
                        #print (">> ", each_right_idx)
                        size_right = size_right * tc_helper.tc_gen_helper_find(l_t3_slices_size, each_right_idx)

                #
                #   [Should be Fixed]  NOTE(review): a contraction that fails
                #   the constraints aborts the whole run via sys.exit()
                #   instead of being skipped — confirm this is intended.
                #
                if idx_check_reg_right != 2 and idx_check_reg_left != 2 and promissing_left == size_left and promissing_right == size_right:
                    l_picked_tc.append(idx_tensor_contraction)
                    l_tensor_contractions.append(each_tc)
                else:
                    print ("[DEBUG] promissing_left: ", promissing_left, ", promissing_right: ", promissing_right)
                    print ("[DEBUG] idx_check_reg_right: ", idx_check_reg_right, ", idx_check_reg_left: ", idx_check_reg_left, ", size_left: ", size_left, ", size_right: ", size_right)
                    sys.exit()

                #
                idx_tensor_contraction = idx_tensor_contraction + 1
                #
                #   End of For-Statement
                #

            #
            if checking_used == 1:
                #print ("added a tensor contraction to an inner-group")
                #print ("l_info_split_ext: ", l_info_split_ext)
                l_inner_groups.append([l_each_group_mapping_tb, l_each_group_mapping_2D, l_each_group_mapping_reg, l_tensor_contractions, l_t3_slices_size, l_info_split_ext, l_each_group_mapping_TB_K])

            #   To-Do: Should be checked in detail
            #   Remove the picked contractions (highest index first, so the
            #   remaining positions stay valid) from the candidate pool.
            for each_tc in list(reversed(l_picked_tc)):
                each_outer_group[1].pop(each_tc)

        #
        #   For-Outer-Group
        #
        break   # To-Do: (Currently) Only 1 Outer-Group.
    #
    #   NOTE(review): this summary is gated on opt_print == -1 while every
    #   other verbose block is gated on opt_print == 1 — confirm whether the
    #   -1 is a deliberate off switch or a typo.
    #
    if opt_print == -1:
        print ("============================================================================")
        #
        #
        #
        print ("===================== Step 2: [Output] Inner-Groups ========================")
        print (" Does not Support Register-Transpose")
        print (" These Tensor Contractions will be fused.")
        print (" # of Inner-Groups: ", len(l_inner_groups))

        # [l_each_group_mapping_tb, l_each_group_mapping_2D, l_each_group_mapping_reg, l_tensor_contractions, l_t3_slices_size, l_info_split_ext, l_each_group_mapping_TB_K]
        for each_inner_group in l_inner_groups:
            print ("Mapping All: ",     each_inner_group[0])
            print ("Mapping TB : ",     each_inner_group[1])
            print ("Mapping Reg: ",     each_inner_group[2])
            print ("Mapping TB_K: ",    each_inner_group[6])
            print ("Slices : ",         each_inner_group[4])
            print ("Split-Slices : ",   each_inner_group[5])
            print ("# of Tensor Contractions: ", len(each_inner_group[3]))
            for each_tc in each_inner_group[3]:
                print ("Each Tensor Contraction: ", each_tc)

        print ("============================================================================")

    #
    #   Return Output
    #
    return l_inner_groups, l_t3_interface_info
def tc_gen_code_Kernel_Load_Inputs_Addr_Global_External(
        opt_load_ext_int, opt_inner_load_input_tb_x, opt_inner_load_input_tb_y,
        l_internal_idx, l_input_tensor, l_mapping_tb, l_mapping_reg,
        l_tile_sizes, l_info_matching_indices, opt_matching_index_fully,
        idx_kernel, each_inner_load_inst_x, each_inner_load_inst_y, size_tb_x,
        size_tb_y):
    """Build the global-memory address string for loading one input tensor.

    Walks the tensor's indices innermost-first (reversed ``l_input_tensor``)
    and accumulates a Horner-style address expression:

      - internal indices wrap the running expression in "(...) * size_<idx>";
      - external indices mapped on registers contribute
        "blk_idx_<idx> * SIZE_SLICE_<k>_<Idx> + ll";
      - external indices mapped on the thread block contribute
        "blk_idx_<idx> * SIZE_SLICE_<k>_<Idx> + idx_<match>[ + offset]",
        where the offset "size_tb * inst" is added only for the
        multi-instruction load case (``opt_inner_load_input_tb_{x,y} == 2``).

    Returns ``(address_string, boundary_check_string)``; the boundary check
    covers the LAST external TB index visited.

    ``opt_matching_index_fully`` is part of the call interface but is not
    used in this function.

    NOTE(review): if the tensor has no external index mapped on the thread
    block, ``str_input_specific_idx`` (and its ``_multi`` companion) is never
    assigned and the return statement would raise NameError — confirm
    callers guarantee at least one such index.
    """
    #
    print("[Code Generator][Load][Input][Addr][Global][External]")
    print(
        "[Code Generator][Load][Input][Addr][Global][External] l_input_tensor: ",
        l_input_tensor)
    print(
        "[Code Generator][Load][Input][Addr][Global][External] l_mapping_tb: ",
        l_mapping_tb)
    print(
        "[Code Generator][Load][Input][Addr][Global][External] l_mapping_reg: ",
        l_mapping_reg)
    str_input_ext_global_addr = ""

    #
    #   Which TB axis performs this load:
    #   opt_load_ext_int == -1: |TB_X| -(loads)-> K, |TB_Y| -(loads)-> E ***
    #   otherwise:              |TB_X| -(loads)-> E, |TB_Y| -(loads)-> K
    #
    #   NOTE(review): l_idx_tb is computed but not referenced below —
    #   confirm whether it is dead code.
    #
    if opt_load_ext_int == -1:
        l_idx_tb = l_mapping_tb[1]
    else:
        l_idx_tb = l_mapping_tb[0]

    #
    #   [Symbols] Block Index: walk indices from the innermost one.
    #
    rev_l_input_tensor = list(reversed(l_input_tensor))
    idx_count = 0
    idx_base_ext = 0
    str_specific_idx = ""
    for each_idx in rev_l_input_tensor:
        #
        #   Internal Index: scale the running address by this dimension.
        #
        if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) != -1:
            #print ("[int.] ", each_idx, ", idx_count: ", idx_count)
            if idx_count != 0:
                str_input_ext_global_addr = "(" + str_input_ext_global_addr + ") * size_" + each_idx

        #
        #   External Index
        #
        else:
            #
            #   Mapped on REG: the in-tile offset is the loop variable "ll".
            #
            if tc_helper.tc_gen_helper_find_1d(l_mapping_reg, each_idx) != -1:
                #print (" >>> [ext.][REG] ", each_idx)
                if idx_base_ext == 0:
                    str_input_ext_global_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(
                        idx_kernel) + "_" + each_idx.capitalize() + " + ll"
                else:
                    str_input_ext_global_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(
                        idx_kernel
                    ) + "_" + each_idx.capitalize(
                    ) + " + ll + (" + str_input_ext_global_addr + ") * size_" + each_idx
            else:
                #print (" >>> opts: tb_x: ", opt_inner_load_input_tb_x, ", tb_y: " , opt_inner_load_input_tb_y)
                #print (" >>> [ext.][TB] ", each_idx, " >>> ", tc_helper.tc_gen_helper_find(l_info_matching_indices, each_idx))
                str_input_specific_idx_multi = ""
                #
                #   |TB_X| -(loads)-> K
                #   |TB_Y| -(loads)-> E ***
                #
                if opt_load_ext_int == -1:
                    if opt_inner_load_input_tb_y == 2:
                        str_input_specific_idx_multi = " + " + str(
                            int(size_tb_y * each_inner_load_inst_y))
                #
                #   |TB_X| -(loads)-> E ***
                #   |TB_Y| -(loads)-> K
                #
                else:
                    if opt_inner_load_input_tb_x == 2:
                        str_input_specific_idx_multi = " + " + str(
                            int(size_tb_x * each_inner_load_inst_x))

                #print (">>> str_input_specific_idx_multi: ", str_input_specific_idx_multi)
                #   A matching entry of "0" means no thread-index term (and
                #   any multi-instruction offset is dropped too).
                if tc_helper.tc_gen_helper_find(l_info_matching_indices,
                                                each_idx) == "0":
                    str_specific_idx = each_idx
                    str_input_specific_idx = tc_helper.tc_gen_helper_find(
                        l_info_matching_indices, each_idx)
                    str_input_specific_idx_multi = ""
                else:
                    str_specific_idx = each_idx
                    str_input_specific_idx = "idx_" + tc_helper.tc_gen_helper_find(
                        l_info_matching_indices, each_idx)

                #
                if idx_base_ext == 0:
                    str_input_ext_global_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(
                        idx_kernel
                    ) + "_" + each_idx.capitalize(
                    ) + " + " + str_input_specific_idx + str_input_specific_idx_multi
                else:
                    str_input_ext_global_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(
                        idx_kernel
                    ) + "_" + each_idx.capitalize(
                    ) + " + " + str_input_specific_idx + str_input_specific_idx_multi + " + (" + str_input_ext_global_addr + ") * size_" + each_idx
            #
            idx_base_ext += 1
        #
        idx_count += 1
    #
    return str_input_ext_global_addr, str_input_specific_idx + str_input_specific_idx_multi + " < rng_" + str_specific_idx
# Example 11
def tc_gen_code_new(tmp_count, str_tmp_count, str_tmp_config, l_inner_groups,
                    l_interface_info, opt_pre_computed, opt_data_type):
    #
    #   FILE: OPEN
    #
    output_name = "temp"
    if str_tmp_config == "-1":
        f = open(output_name + "__" + str_tmp_count + ".cu", "w")
    else:
        f = open(
            output_name + "__" + str_tmp_count + "__" + str_tmp_config + ".cu",
            "w")

    #
    #   Includes and Globel Methods
    #
    tc_code_include.tc_code_include(f)
    #tc_code_etc.tc_gen_global_methods(f, len(l_inner_groups))

    #
    #   should be changed.
    #
    interface_name = "sd_t_d2_fusion"
    kernel_name = "kernel_"

    #
    l_combined_opt_diffs = list()
    l_combined_opt_gen_fulls = list()
    l_combined_opt_gen_internal = list()
    l_combined_t3_slices_size = list()
    l_combined_mappings = list()

    l_combined_t3_d_decl_var = list()
    l_combined_t2_d_decl_var = list()
    l_combined_v2_d_decl_var = list()
    l_combined_t3_parameters = list()
    l_combined_t2_parameters = list()
    l_combined_v2_parameters = list()
    l_combined_device_dynamic = list()
    l_combined_host_dynamic = list()
    l_combined_cuda_malloc = list()

    #
    #   To Support Multiple Inner-Groups
    #
    for each_inner_group in l_inner_groups:
        l_combined_t3_slices_size.append(each_inner_group[8])
        l_combined_mappings.append([each_inner_group[1], each_inner_group[2]])

    #
    #   Inputs: T3-Slices, T3-Mappings, External Index, Internal Index
    #
    tc_code_define.tc_gen_definition_new(f, l_combined_t3_slices_size,
                                         l_combined_mappings,
                                         l_inner_groups[0][4],
                                         l_inner_groups[0][5])

    #
    #   Each Inner-Group Corresponds to A Kernel (There are Three Types per Kernel)
    #       Type #1: External (Full)    & Internal (Full)
    #       Type #2: External (Full)    & Internal (Partial)
    #       Type #3: External (Partial) & Internal (Full)
    #       Type #4: External (Partial) & Internal (Partial)
    #
    l_var_outputs = list()
    l_var_input_internal = list()

    l_combined_register_mappings = list()

    l_combined_var_input_left = list()
    l_combined_var_input_right = list()
    l_combined_var_outputs_helpers = list()
    l_combined_var_thread_block = list()

    l_combined_t3_d_decl_var = list()
    l_combined_t2_d_decl_var = list()
    l_combined_v2_d_decl_var = list()

    l_combined_t3_parameters = list()
    l_combined_t2_parameters = list()
    l_combined_v2_parameters = list()

    l_combined_inputs_int_strides = list()

    l_cuda_malloc = list()
    l_device_dynamic = list()
    l_host_dynamic = list()

    #
    #   To Handle Multiple Tensor Contractions in an Inner-Group
    #
    kernel_number = 1
    for each_inner_group in l_inner_groups:
        #
        '''
        idx_count = 0
        for each_info in each_inner_group:
            print ("> ", idx_count, ": ", each_info)
            idx_count += 1
        '''
        #
        l_var_tensor_block = list()
        l_var_input_left = list()
        l_var_input_right = list()
        l_var_outputs_helpers = list()

        l_t3_d_decl_var = list()
        l_t2_d_decl_var = list()
        l_v2_d_decl_var = list()

        l_t3_parameters = list()
        l_t2_parameters = list()
        l_v2_parameters = list()

        l_input_strides = list()

        #   Inputs:     kernel_number, each_inner_group[6](l_input_tensors), each_inner_group[4](l_extenral_index), each_inner_group[5](l_internal_index)
        #   Outputs:    l_t3_d_decl_var, l_t3_parameters, l_t2_d_decl_var, l_t2_parameters, l_v2_d_decl_var, l_v2_parameters,
        #               l_cuda_malloc, l_device_dynami,c
        #               l_var_tensor_block, l_var_outputs, l_var_outputs_helpers, l_var_input_left, l_var_input_right, l_var_input_internal
        tc_code_globalvar.tc_gen_variables(
            kernel_number, l_interface_info, each_inner_group[6],
            each_inner_group[4], each_inner_group[5], l_t3_d_decl_var,
            l_t3_parameters, l_t2_d_decl_var, l_t2_parameters, l_v2_d_decl_var,
            l_v2_parameters, l_input_strides, l_cuda_malloc, l_device_dynamic,
            l_var_tensor_block, l_var_outputs, l_var_outputs_helpers,
            l_var_input_left, l_var_input_right, l_var_input_internal,
            opt_data_type)

        #
        #   Variables are Used in Functions for Pre-Computed Arrays and In-Direction Arrays. (Finally, Kernels)
        #   : We need to differentiate them to be used in these functions.
        #

        #
        #   Tile-Appoach:
        #       Inputs: kernel_number, l_interface_info, each_inner_group[4](l_external_index), each_inner_group[8](l_t3_slices), each_inner_group[5](l_internal_index), each_inner_group[3](l_idx_size)
        #       Output: l_host_dynamic
        #
        if opt_pre_computed != -1:
            tc_pre_BasicBlock.tc_gen_code_pre_TileApproach(
                f, kernel_number, l_interface_info, each_inner_group[4],
                each_inner_group[8], each_inner_group[3], each_inner_group[5],
                l_host_dynamic)

            #
            #   Pre-Compuated Arrays and In-Direct Arrays
            #
            tc_pre_IndirectArray.tc_gen_code_driver_PreComputedArray(
                f, kernel_number, l_interface_info, l_var_outputs_helpers,
                l_var_input_left, l_var_input_right, l_var_tensor_block,
                each_inner_group[6], l_host_dynamic, each_inner_group[4],
                each_inner_group[4], each_inner_group[2], each_inner_group[0])

        #   Related to Kernel(s)
        size_smem_left, size_smem_right, str_left, str_right = tc_interface.tc_interface_SMEM_Size(
            each_inner_group[6], each_inner_group[4], each_inner_group[5],
            each_inner_group[8], each_inner_group[2])

        #
        size_TB_X, size_TB_Y = tc_interface.tc_interface_TB_Size(
            each_inner_group[1][0], each_inner_group[1][1],
            each_inner_group[8])
        size_smem_internal = tc_helper.tc_gen_helper_CheckingIntUnit(
            each_inner_group[4], each_inner_group[8], each_inner_group[5])
        size_REG_X = tc_helper.tc_gen_helper_find(
            each_inner_group[8], each_inner_group[2][0])  # 0 -> REG_X
        size_REG_Y = tc_helper.tc_gen_helper_find(
            each_inner_group[8], each_inner_group[2][1])  # 1 -> REG_Y
        opt_load_t2, opt_load_v2 = tc_helper.tc_gen_helper_CheckingInternalFVI(
            each_inner_group[6], each_inner_group[5])

        #print ("size_smem_internal: ", size_smem_internal)

        #
        #   Constratins:""
        #
        tc_gen.tc_gen_Constraints(f, size_TB_X, size_TB_Y, size_smem_left,
                                  size_smem_right, size_smem_internal)

        #
        opt_shared_padding = 0

        #
        #   Kernels: Different Types (External, Internal)
        #               (1) Full,       Full
        #               (2) Full,       Partial
        #               (3) Partial,    Full
        #               (4) Partial,    Partial
        #

        tc_code_kernel.tc_gen_code_Kernel(
            f, kernel_name + "_1_" + str(kernel_number), l_t3_d_decl_var,
            l_t2_d_decl_var, l_v2_d_decl_var, l_input_strides,
            each_inner_group[7], each_inner_group[1], each_inner_group[2],
            each_inner_group[4], each_inner_group[5], each_inner_group[8],
            size_smem_left, size_smem_right, size_smem_internal, size_REG_Y,
            size_REG_X, size_TB_Y, size_TB_X, str_left, str_right,
            l_blk_boundary_rng, -1, -1, opt_load_t2, opt_load_v2,
            opt_pre_computed, 1, opt_data_type, opt_shared_padding,
            kernel_number)

        tc_code_kernel.tc_gen_code_Kernel(
            f, kernel_name + "_2_" + str(kernel_number), l_t3_d_decl_var,
            l_t2_d_decl_var, l_v2_d_decl_var, l_input_strides,
            each_inner_group[7], each_inner_group[1], each_inner_group[2],
            each_inner_group[4], each_inner_group[5], each_inner_group[8],
            size_smem_left, size_smem_right, size_smem_internal, size_REG_Y,
            size_REG_X, size_TB_Y, size_TB_X, str_left, str_right,
            l_blk_boundary_rng, 1, -1, opt_load_t2, opt_load_v2,
            opt_pre_computed, 1, opt_data_type, opt_shared_padding,
            kernel_number)

        tc_code_kernel.tc_gen_code_Kernel(
            f, kernel_name + "_3_" + str(kernel_number), l_t3_d_decl_var,
            l_t2_d_decl_var, l_v2_d_decl_var, l_input_strides,
            each_inner_group[7], each_inner_group[1], each_inner_group[2],
            each_inner_group[4], each_inner_group[5], each_inner_group[8],
            size_smem_left, size_smem_right, size_smem_internal, size_REG_Y,
            size_REG_X, size_TB_Y, size_TB_X, str_left, str_right,
            l_blk_boundary_rng, -1, 1, opt_load_t2, opt_load_v2,
            opt_pre_computed, 1, opt_data_type, opt_shared_padding,
            kernel_number)

        tc_code_kernel.tc_gen_code_Kernel(
            f, kernel_name + "_4_" + str(kernel_number), l_t3_d_decl_var,
            l_t2_d_decl_var, l_v2_d_decl_var, l_input_strides,
            each_inner_group[7], each_inner_group[1], each_inner_group[2],
            each_inner_group[4], each_inner_group[5], each_inner_group[8],
            size_smem_left, size_smem_right, size_smem_internal, size_REG_Y,
            size_REG_X, size_TB_Y, size_TB_X, str_left, str_right,
            l_blk_boundary_rng, 1, 1, opt_load_t2, opt_load_v2,
            opt_pre_computed, 1, opt_data_type, opt_shared_padding,
            kernel_number)

        #
        #   multiple internal indices.
        #
        if len(each_inner_group[5]) > 1:

            tc_code_kernel.tc_gen_code_Kernel(
                f, kernel_name + "_1_tex_" + str(kernel_number),
                l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var,
                l_input_strides, each_inner_group[7], each_inner_group[1],
                each_inner_group[2], each_inner_group[4], each_inner_group[5],
                each_inner_group[8], size_smem_left, size_smem_right,
                size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y,
                size_TB_X, str_left, str_right, l_blk_boundary_rng, -1, -1,
                opt_load_t2, opt_load_v2, opt_pre_computed, 2, opt_data_type,
                opt_shared_padding, kernel_number)

            tc_code_kernel.tc_gen_code_Kernel(
                f, kernel_name + "_2_tex_" + str(kernel_number),
                l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var,
                l_input_strides, each_inner_group[7], each_inner_group[1],
                each_inner_group[2], each_inner_group[4], each_inner_group[5],
                each_inner_group[8], size_smem_left, size_smem_right,
                size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y,
                size_TB_X, str_left, str_right, l_blk_boundary_rng, 1, -1,
                opt_load_t2, opt_load_v2, opt_pre_computed, 2, opt_data_type,
                opt_shared_padding, kernel_number)

            tc_code_kernel.tc_gen_code_Kernel(
                f, kernel_name + "_3_tex_" + str(kernel_number),
                l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var,
                l_input_strides, each_inner_group[7], each_inner_group[1],
                each_inner_group[2], each_inner_group[4], each_inner_group[5],
                each_inner_group[8], size_smem_left, size_smem_right,
                size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y,
                size_TB_X, str_left, str_right, l_blk_boundary_rng, -1, 1,
                opt_load_t2, opt_load_v2, opt_pre_computed, 2, opt_data_type,
                opt_shared_padding, kernel_number)

            tc_code_kernel.tc_gen_code_Kernel(
                f, kernel_name + "_4_tex_" + str(kernel_number),
                l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var,
                l_input_strides, each_inner_group[7], each_inner_group[1],
                each_inner_group[2], each_inner_group[4], each_inner_group[5],
                each_inner_group[8], size_smem_left, size_smem_right,
                size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y,
                size_TB_X, str_left, str_right, l_blk_boundary_rng, 1, 1,
                opt_load_t2, opt_load_v2, opt_pre_computed, 2, opt_data_type,
                opt_shared_padding, kernel_number)

        #
        #   For the Interface
        #
        l_combined_var_input_left.append(l_var_input_left)
        l_combined_var_input_right.append(l_var_input_right)
        l_combined_var_outputs_helpers.append(l_var_outputs_helpers)
        l_combined_var_thread_block.append(l_var_tensor_block)
        l_combined_t3_d_decl_var.append(l_t3_d_decl_var)
        l_combined_t2_d_decl_var.append(l_t2_d_decl_var)
        l_combined_v2_d_decl_var.append(l_v2_d_decl_var)
        l_combined_t3_parameters.append(l_t3_parameters)
        l_combined_t2_parameters.append(l_t2_parameters)
        l_combined_v2_parameters.append(l_v2_parameters)
        l_combined_register_mappings.append(each_inner_group[2])
        l_combined_inputs_int_strides.append(l_input_strides)

        #
        kernel_number = kernel_number + 1
        #
        #   End of For-Statement: l_innter_groups
        #

    #
    #   Parts of Kernels for Register Transpose
    #   (Currently) It assumes that all inner groups can be grouped.
    #
    if tc_gen.tc_gen_Check_RegisterTranspose(l_inner_groups) == 1:
        print(
            "[Code Generator][Code] Register Transpose [Possible] according to the given mappings"
        )
        f.write("\n")
        f.write(
            "// This part is for Kernels which support Register Transpose\n")

        #
        #   This is also 4-different types.
        #
        tc_code_kernel_fusion.tc_gen_code_Kernel_Register_Transpose(
            f, kernel_name + "_4", l_inner_groups, l_combined_t3_d_decl_var,
            l_combined_t2_d_decl_var, l_combined_v2_d_decl_var,
            l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, 1, 1)
        tc_code_kernel_fusion.tc_gen_code_Kernel_Register_Transpose(
            f, kernel_name + "_3", l_inner_groups, l_combined_t3_d_decl_var,
            l_combined_t2_d_decl_var, l_combined_v2_d_decl_var,
            l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, 1, -1)
        tc_code_kernel_fusion.tc_gen_code_Kernel_Register_Transpose(
            f, kernel_name + "_2", l_inner_groups, l_combined_t3_d_decl_var,
            l_combined_t2_d_decl_var, l_combined_v2_d_decl_var,
            l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, -1, 1)
        tc_code_kernel_fusion.tc_gen_code_Kernel_Register_Transpose(
            f, kernel_name + "_1", l_inner_groups, l_combined_t3_d_decl_var,
            l_combined_t2_d_decl_var, l_combined_v2_d_decl_var,
            l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, -1, -1)
    else:
        print(
            "[Code Generator][Code] Register Transpose [Impossible] according to the given mappings"
        )

    #
    #   Temp...... For |K| > 1,
    #
    l_internal_addrs = list()
    for each_inner_group in l_inner_groups:
        #
        #   Each Inner-Group can have multiple Tensor Contractions which will be Fused.
        #   Among Inner-Groups, there might be possible to group them by using Register-Transposition.
        #
        print("each_inner_group: ", each_inner_group)
        l_tensor_contraction_in_inner_group = each_inner_group[6]

        #   To-Do: Need To Suport Multiple-Tensor Contractions
        for each_tensor_contraction in l_tensor_contraction_in_inner_group:
            info_input_left = each_tensor_contraction[0]
            info_input_right = each_tensor_contraction[1]

            # LEFT
            l_idx_info_left = list()
            for idx_tensor in info_input_left[1]:
                #
                if tc_helper.tc_gen_helper_find_1d(l_inner_groups[0][5],
                                                   idx_tensor) != -1:
                    l_idx_info_left.append([1, idx_tensor])
                else:
                    l_idx_info_left.append([0, idx_tensor])

            #
            l_rev_idx_info_left = list(reversed(l_idx_info_left))
            str_addr_left = ""
            idx_count = 0
            for each_idx_info in l_rev_idx_info_left:
                #
                if each_idx_info[0] == 1:  # internal index
                    if idx_count == 0:
                        str_addr_left = "idx_" + each_idx_info[1]
                    else:
                        str_addr_left = "idx_" + each_idx_info[
                            1] + " + (" + str_addr_left + ") * size_" + each_idx_info[
                                1]
                    #
                    idx_count = idx_count + 1
                else:  # external index
                    if idx_count != 0:
                        str_addr_left = "(" + str_addr_left + ") * size_" + each_idx_info[
                            1]

            # RIGHT
            str_addr_right = ""
            l_idx_info_right = list()
            for idx_tensor in info_input_right[1]:
                #
                if tc_helper.tc_gen_helper_find_1d(l_inner_groups[0][5],
                                                   idx_tensor) != -1:
                    l_idx_info_right.append([1, idx_tensor])
                else:
                    l_idx_info_right.append([0, idx_tensor])

            #
            l_rev_idx_info_right = list(reversed(l_idx_info_right))
            str_addr_right = ""
            idx_count = 0
            for each_idx_info in l_rev_idx_info_right:
                #
                if each_idx_info[0] == 1:  # internal index
                    if idx_count == 0:
                        str_addr_right = "idx_" + each_idx_info[1]
                    else:
                        str_addr_right = "idx_" + each_idx_info[
                            1] + " + (" + str_addr_right + ") * size_" + each_idx_info[
                                1]
                    #
                    idx_count = idx_count + 1
                else:  # external index
                    if idx_count != 0:
                        str_addr_right = "(" + str_addr_right + ") * size_" + each_idx_info[
                            1]

            #
            l_internal_addrs.append([str_addr_left, str_addr_right])
        #

    #
    #   Drivers
    #
    tc_interface.tc_gen_code_kernel_caller(
        f,
        interface_name,
        kernel_name,
        l_interface_info,
        l_inner_groups[0][4],
        l_inner_groups[0][5],
        l_inner_groups[0][10],
        l_var_tensor_block,
        l_var_outputs,
        l_var_outputs_helpers,
        l_var_input_left,
        l_var_input_right,
        l_var_input_internal,
        l_combined_var_input_left,
        l_combined_var_input_right,
        l_combined_var_outputs_helpers,
        l_combined_var_thread_block,
        l_combined_register_mappings,
        l_internal_addrs,  ## ADDEDEDEDE
        l_combined_inputs_int_strides,  ##
        l_cuda_malloc,
        l_device_dynamic,
        l_host_dynamic,
        l_t3_parameters,
        l_t2_parameters,
        l_v2_parameters,
        l_combined_t3_parameters,
        l_combined_t2_parameters,
        l_combined_v2_parameters,
        opt_pre_computed,
        opt_data_type)

    #
    '''
    idx_inner_count = 0
    for each_inner in l_inner_groups:
        idx_count = 0
        for each_info in each_inner:
            print ("[", idx_inner_count, "][", idx_count, "] each_info: ", each_info)
            idx_count = idx_count + 1
        idx_inner_count = idx_inner_count + 1
    '''

    #
    l_tile_sizes = l_inner_groups[0][8]
    l_split_representative_problem_size = l_inner_groups[0][9]

    #l_split_representative_problem_size = l_inner_groups[0][9]
    #print ("l_split_representative_problem_size: ", l_split_representative_problem_size)
    #print ("l_tile_sizes: ", l_tile_sizes)

    #
    #   "Interface"
    #
    tc_interface.tc_gen_code_interface(f, interface_name, l_interface_info,
                                       l_tile_sizes,
                                       l_split_representative_problem_size,
                                       opt_data_type)

    #
    #   FILE: CLOSE
    #
    f.close()
def tc_gen_code_Kernel_Load_Inputs_Addr_Global_External_Matching_Index(
        opt_load_ext_int, opt_inner_load_input_tb_x, opt_inner_load_input_tb_y,
        l_input_tensor, l_mapping_tb, l_mapping_reg, l_internal_idx,
        l_tile_sizes, num_inner_inst_tb_x, num_inner_inst_tb_y, size_tb_x,
        size_tb_y):
    #
    #   Determine which external index of an input tensor (not internal, not
    #   mapped on a register) can be "matched" with an index mapped on a
    #   thread-block (TB) dimension for global-memory address generation.
    #
    #   opt_load_ext_int == -1 : TB_X loads internal (K) indices, so the
    #                            external indices live on l_mapping_tb[1] (Y);
    #                            otherwise they live on l_mapping_tb[0] (X).
    #
    #   Returns:
    #       l_info_matching_indices  : list of [tensor_idx, matched_tb_idx]
    #                                  pairs ("0" for tile-size-1 indices).
    #       opt_matching_index_fully : 1 when exactly one non-trivial index
    #                                  exists on each side and they were
    #                                  paired one-to-one; -1 otherwise.
    #
    #   NOTE(review): opt_inner_load_input_tb_x/_tb_y, num_inner_inst_tb_x/_tb_y,
    #   size_tb_x/size_tb_y are currently unused here; kept for interface
    #   compatibility with callers.
    #
    l_info_matching_indices = []
    l_info_pruned_indices = []
    l_info_idx_tb = []
    l_info_pruned_indices_tb = []
    opt_matching_index_fully = -1

    #
    #   Select the TB dimension that carries the external indices.
    #
    if opt_load_ext_int == -1:
        l_idx_tb = l_mapping_tb[1]
    else:
        l_idx_tb = l_mapping_tb[0]

    #
    #   Indices mapped on TB, except those whose tile-size == 1 (they do not
    #   contribute to the address arithmetic).
    #
    for each_idx in l_idx_tb:
        if tc_helper.tc_gen_helper_find(l_tile_sizes, each_idx) != 1:
            l_info_pruned_indices_tb.append([
                each_idx,
                tc_helper.tc_gen_helper_find(l_tile_sizes, each_idx)
            ])

    #
    #   All TB indices with their tile sizes (reporting only).
    #
    for each_idx in l_idx_tb:
        l_info_idx_tb.append(
            [each_idx,
             tc_helper.tc_gen_helper_find(l_tile_sizes, each_idx)])

    #
    print(
        "[Code Generator][Load][Input][Addr][Global][External][Matching-Index] l_info_idx_tb: ",
        l_info_idx_tb)

    #
    #   External indices of the input tensor that are neither internal nor
    #   mapped on a register: candidates to be matched with TB indices.
    #
    for each_idx in l_input_tensor:
        if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1:
            if tc_helper.tc_gen_helper_find_1d(l_mapping_reg, each_idx) == -1:
                print(
                    "[Code Generator][Load][Input][Addr][Global][External][Matching-Index] ext. mapped on TB: ",
                    each_idx, ", size: ",
                    tc_helper.tc_gen_helper_find(l_tile_sizes, each_idx))
                #
                #   Indices with tile-size 1 are trivially addressed by "0";
                #   the rest must be matched against a TB index below.
                #
                if tc_helper.tc_gen_helper_find(l_tile_sizes, each_idx) == 1:
                    l_info_matching_indices.append([each_idx, "0"])
                else:
                    l_info_pruned_indices.append([
                        each_idx,
                        tc_helper.tc_gen_helper_find(l_tile_sizes, each_idx)
                    ])

    #
    #   Target indices should be handled.
    #
    print("[DEBUG] l_info_pruned_indices: ", l_info_pruned_indices)
    print("[DEBUG] l_info_pruned_indices_tb: ", l_info_pruned_indices_tb)

    #
    print(
        "[Code Generator][Load][Input][Addr][Global][External][Matching-Index] # of Pruned Indices mapped on TB: ",
        len(l_info_pruned_indices_tb))
    print(
        "[Code Generator][Load][Input][Addr][Global][External][Matching-Index] # of Pruned Indices: ",
        len(l_info_pruned_indices))

    if len(l_info_pruned_indices_tb) == len(l_info_pruned_indices) == 1:
        #
        #   Exactly one non-trivial index on each side: pair them directly.
        #   (The tile-volume products previously computed here were never
        #   used in this branch and have been removed.)
        #
        l_info_matching_indices.append(
            [l_info_pruned_indices[0][0], l_info_pruned_indices_tb[0][0]])
        opt_matching_index_fully = 1
    else:
        #
        #   Unsupported shape: report the tile volumes of both sides so the
        #   mapping can be handled manually.
        #
        print("Miserable Case: Should Handle Manually!")
        len_pruned_indices_tb = 1
        len_pruned_indice = 1

        #   Product of tile sizes of the pruned TB indices.
        for each_idx_info in l_info_pruned_indices_tb:
            len_pruned_indices_tb *= tc_helper.tc_gen_helper_find(
                l_tile_sizes, each_idx_info[0])

        #   Product of tile sizes of the pruned tensor indices.
        for each_idx_info in l_info_pruned_indices:
            len_pruned_indice *= tc_helper.tc_gen_helper_find(
                l_tile_sizes, each_idx_info[0])

        print("len_pruned_indices_tb: ", len_pruned_indices_tb)
        print("len_pruned_indice: ", len_pruned_indice)

    #
    return l_info_matching_indices, opt_matching_index_fully
def tc_gen_perms_exclusive_REG_Y(list_sizes_REG, list_sizes_TB,
                                 list_given_output_tensor,
                                 list_given_input_tensor_left,
                                 list_given_input_tensor_right,
                                 list_internal_indices,
                                 list_representative_problem_size,
                                 list_TB_K, list_TB_X,
                                 list_REG_X,
                                 list_inherited_Tile_Sizes,
                                 list_CLASS_configuration,
                                 opt_print):
    """Enumerate mappings of external indices of the RIGHT input tensor onto
    REG_Y, then recurse into the TB_X enumeration.

    For each candidate register tile-size |REG_Y| and each starting position
    in the right-hand input tensor, external indices (skipping internal
    indices and the output tensor's fastest-varying index) are greedily
    accumulated until their combined volume reaches |REG_Y|; the last index
    is split when the volume overshoots.  Each successful mapping is
    forwarded to tc_gen_perms_exclusive_TB_X().
    """
    #
    #   Count external vs. internal indices in the RIGHT input tensor.
    #
    num_ext_idx = 0
    num_int_idx = 0
    for each_right_idx in list_given_input_tensor_right:
        if tc_helper.tc_gen_helper_find_1d(list_internal_indices, each_right_idx) == -1:
            num_ext_idx += 1
        else:
            num_int_idx += 1
    #
    len_tensor_right = len(list_given_input_tensor_right)

    if opt_print == 1:
        print ("========================================== [Enumerations-REG_Y] ===================================================")
        print ("========================================== [Exclusive]          ===================================================")
        # Fixed label: this enumeration works on the RIGHT input tensor,
        # not the left one as the old message claimed.
        print ("Tensor (RIGHT): ", list_given_input_tensor_right)
        print ("len(RIGHT): ", len_tensor_right, ", # of External Indices: ", num_ext_idx, ", # of Internal Indices: ", num_int_idx)
        print ("list_representative_problem_size: ", list_representative_problem_size)
        print ("Given Tile-Sizes: ", list_inherited_Tile_Sizes)
        print ("Given list_REG_X: ", list_REG_X)

    #
    #   For each candidate tile-size for REG_Y
    #
    for size_REG_Y in list_sizes_REG:
        if opt_print == 1:
            print ("|REG_Y| = ", size_REG_Y)

        #
        #   Try every starting position within the right tensor's indices.
        #
        for start_index in range(0, len_tensor_right):
            REG_Y_Vol               = 1     # accumulated volume |REG_Y'|
            REG_Y_Vol_Prev          = 1     # volume before the current index
            list_REG_Y              = []    # indices mapped on REG_Y
            duplicated_Tile_Sizes   = copy.deepcopy(list_inherited_Tile_Sizes)    # inherited
            done_mapping_REG_Y      = -1    # 1 once |REG_Y| has been filled

            #
            for target_index in range(start_index, len_tensor_right):
                str_start_index = list_given_input_tensor_right[target_index]

                #
                #   #1. Skip internal indices.
                #
                if tc_helper.tc_gen_helper_find_1d(list_internal_indices, str_start_index) != -1:
                    continue

                #
                #   #2. Skip the FVI (fastest-varying index) of the output tensor.
                #
                if str_start_index == list_given_output_tensor[0]:
                    continue

                #
                #   |REG_Y'|: accumulate this index's problem size.
                #
                REG_Y_Vol *= tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)

                #
                #   |REG_Y'| >= |REG_Y|: the register tile is filled.
                #
                if REG_Y_Vol >= size_REG_Y:
                    #
                    #   |REG_Y'| > |REG_Y|: overshoot -> split the index so
                    #   only the remainder of |REG_Y| is blocked on it.
                    #
                    if REG_Y_Vol > size_REG_Y:
                        if done_mapping_REG_Y == -1:
                            # Integer floor division (sizes are positive ints);
                            # replaces int(a / b).
                            blocking_tile_size = size_REG_Y // REG_Y_Vol_Prev
                            list_REG_Y.append(str_start_index)
                            duplicated_Tile_Sizes.append([str_start_index, blocking_tile_size])
                            done_mapping_REG_Y = 1
                        else:
                            duplicated_Tile_Sizes.append([str_start_index, 1])  # ?
                    #
                    #   |REG_Y'| == |REG_Y|: exact fit -> take the full index.
                    #
                    else:
                        if done_mapping_REG_Y == -1:
                            list_REG_Y.append(str_start_index)
                            duplicated_Tile_Sizes.append([str_start_index, tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)])
                            done_mapping_REG_Y = 1
                        else:
                            duplicated_Tile_Sizes.append([str_start_index, 1])
                    #
                    #   The register tile is complete for this start index.
                    #
                    break
                #
                #   |REG_Y'| < |REG_Y|: keep accumulating whole indices.
                #
                else:
                    list_REG_Y.append(str_start_index)
                    duplicated_Tile_Sizes.append([str_start_index, tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)])

                #
                REG_Y_Vol_Prev *= tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)

            #
            #   Only fully-filled REG_Y mappings proceed to the TB_X stage.
            #
            if done_mapping_REG_Y == 1:
                tc_gen_perms_exclusive_TB_X(list_sizes_TB,
                                            list_given_output_tensor, list_given_input_tensor_left, list_given_input_tensor_right,
                                            list_internal_indices,
                                            list_representative_problem_size,
                                            list_TB_K, list_TB_X,
                                            list_REG_X, list_REG_Y,
                                            duplicated_Tile_Sizes,
                                            list_CLASS_configuration,
                                            opt_print)
Esempio n. 14
0
def _tc_models_TBs_tb_count(each_configuration, idx_name, size):
    #   Number of thread blocks along one index: ceil(problem-size / tile-size).
    return math.ceil(size / helper_base.helper_base_find_list_2D(
        each_configuration.list_tile_sizes, idx_name))


def _tc_models_TBs_pick_min(list_candidates):
    #   Index of the first candidate with the fewest thread blocks
    #   (each candidate is [index-size list, num_TBs]).
    min_idx = 0
    for idx_pos, each_comb in enumerate(list_candidates):
        if each_comb[1] < list_candidates[min_idx][1]:
            min_idx = idx_pos
    return min_idx


def _tc_models_TBs_commit(each_configuration, candidate):
    #   Record the chosen split/representative sizes and TB count on the
    #   configuration, and return the candidate (the caller's return value).
    each_configuration.add_split_representative_problem_size(candidate[0])
    each_configuration.num_TBs = candidate[1]
    return candidate


def tc_gen_models_TBs(each_configuration, idx_count, opt_print=0):
    """Model the number of thread blocks (TBs) for a configuration.

    Enumerates every factorization of each "split" index into two parts,
    combines them with the non-split external indices of tensor C, and
    selects the combination yielding the fewest thread blocks.  The chosen
    split and TB count are stored on *each_configuration*.

    Assumes len(each_configuration.list_splits) is 0, 1 or 2.

    Returns [list of [index, size] pairs, num_TBs].
    """
    #
    if opt_print == 1:
        print ("===[", idx_count, "]====================================== [Model][GMEM Load Inputs] =========================================")
        print (" tile-sizes: ", each_configuration.list_tile_sizes)
        print (" repr-sizes: ", each_configuration.list_representative_problem_size)
        print (" split-info: ", each_configuration.list_splits)

    #
    #   For each split (base -> first, second), enumerate every exact
    #   factorization of the base index's representative size.
    #
    list_possible_comb_splits = []
    for each_split in each_configuration.list_splits:
        idx_base            = each_split[0]
        idx_first           = each_split[1]
        idx_second          = each_split[2]

        idx_base_repre_size = tc_helper.tc_gen_helper_find(each_configuration.list_representative_problem_size, idx_base)
        list_possible_cases = []

        if opt_print == 2:
            print (idx_base, " --> both ", idx_first, " &", idx_second)

        #   Each divisor yields one (first, second) size pair.
        for denominator in range(1, idx_base_repre_size + 1):
            if idx_base_repre_size % denominator == 0:
                list_possible_cases.append([
                    [idx_base, idx_base_repre_size],
                    [idx_first, idx_base_repre_size // denominator],
                    [idx_second, denominator]])
        list_possible_comb_splits.append(list_possible_cases)

    #
    if opt_print == 2:
        for each_split in list_possible_comb_splits:
            print ("len(each_split): ", len(each_split))
            for each_comb in each_split:
                print (" >> ", each_comb)

    #
    #   External indices: related to # of TBs.
    #   [Assumption] len(list_possible_comb_splits) == 0, 1 or 2.
    #
    list_possible_representative_problem_sizes = []

    if len(list_possible_comb_splits) == 1:
        for each_comb in list_possible_comb_splits[0]:
            tmp_list    = []
            tmp_num_TBs = 1

            #   The two split parts contribute their own TB counts.
            for each_part in (each_comb[1], each_comb[2]):
                tmp_list.append(each_part)
                tmp_num_TBs *= _tc_models_TBs_tb_count(
                    each_configuration, each_part[0], each_part[1])

            #   Remaining external indices of tensor C (not in this split).
            for each_ext_idx in each_configuration.list_tensor_C:
                if helper_base.helper_base_find_list_2D(each_comb, each_ext_idx) == -1:
                    tmp_list.append([each_ext_idx, helper_base.helper_base_find_list_2D(each_configuration.list_tile_sizes, each_ext_idx)])
                    tmp_num_TBs *= _tc_models_TBs_tb_count(
                        each_configuration, each_ext_idx,
                        helper_base.helper_base_find_list_2D(each_configuration.list_representative_problem_size, each_ext_idx))

            list_possible_representative_problem_sizes.append([tmp_list, tmp_num_TBs])

        min_idx = _tc_models_TBs_pick_min(list_possible_representative_problem_sizes)
        return _tc_models_TBs_commit(
            each_configuration,
            list_possible_representative_problem_sizes[min_idx])
    #
    elif len(list_possible_comb_splits) == 2:
        for each_comb_out in list_possible_comb_splits[0]:
            for each_comb_in in list_possible_comb_splits[1]:
                tmp_list    = []
                tmp_num_TBs = 1

                #   All four split parts contribute their own TB counts.
                for each_part in (each_comb_out[1], each_comb_out[2],
                                  each_comb_in[1], each_comb_in[2]):
                    tmp_list.append(each_part)
                    tmp_num_TBs *= _tc_models_TBs_tb_count(
                        each_configuration, each_part[0], each_part[1])

                #   Remaining external indices of tensor C (in neither split).
                for each_ext_idx in each_configuration.list_tensor_C:
                    if helper_base.helper_base_find_list_2D(each_comb_out, each_ext_idx) == -1 and helper_base.helper_base_find_list_2D(each_comb_in, each_ext_idx) == -1:
                        tmp_list.append([each_ext_idx, helper_base.helper_base_find_list_2D(each_configuration.list_tile_sizes, each_ext_idx)])
                        tmp_num_TBs *= _tc_models_TBs_tb_count(
                            each_configuration, each_ext_idx,
                            helper_base.helper_base_find_list_2D(each_configuration.list_representative_problem_size, each_ext_idx))

                list_possible_representative_problem_sizes.append([tmp_list, tmp_num_TBs])

        min_idx = _tc_models_TBs_pick_min(list_possible_representative_problem_sizes)
        return _tc_models_TBs_commit(
            each_configuration,
            list_possible_representative_problem_sizes[min_idx])
    #
    else:
        #   No splits: TB count comes directly from tensor C's external indices.
        tmp_num_TBs = 1
        for each_ext_idx in each_configuration.list_tensor_C:
            tmp_num_TBs *= _tc_models_TBs_tb_count(
                each_configuration, each_ext_idx,
                helper_base.helper_base_find_list_2D(each_configuration.list_representative_problem_size, each_ext_idx))

        return _tc_models_TBs_commit(
            each_configuration,
            [each_configuration.list_representative_problem_size, tmp_num_TBs])
Esempio n. 15
0
def _tc_gen_models_GMEM_contiguous_span(each_configuration, l_tensor_idx,
                                        l_other_tensor_idx, opt_load_ext):
    #
    #   Product of tile sizes over the leading run of contiguous indices of
    #   one input tensor.
    #
    #   Walks l_tensor_idx (fastest-varying index first) and multiplies tile
    #   sizes while the indices stay in the tensor's leading "kind" (external
    #   when opt_load_ext == 1, internal otherwise) and each tile fully covers
    #   its representative extent.  A partial tile is still counted but ends
    #   the run (matches the original accumulate-then-stop behavior).
    #
    size_continuous = 1
    for each_idx in l_tensor_idx:
        #   An index is internal iff it also appears in the other input tensor.
        is_internal = tc_helper.tc_gen_helper_find_1d(l_other_tensor_idx, each_idx) != -1
        #   Stop as soon as the index kind flips away from the leading kind.
        if is_internal == (opt_load_ext == 1):
            break
        size_continuous *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
        #   A partial tile (tile-size != full extent) breaks contiguity here.
        if tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx) != tc_helper.tc_gen_helper_find(each_configuration.list_representative_problem_size, each_idx):
            break
    return size_continuous


def _tc_gen_models_GMEM_tile_sizes(each_configuration, l_tensor_idx,
                                   l_other_tensor_idx):
    #
    #   Split one input tensor's tile volume into three factors:
    #   (|E on TB|, |K on TB|, |E on REG|), where external indices are those
    #   not shared with the other input and internal (contracted) indices are
    #   the shared ones.
    #
    size_E_TB   = 1
    size_K_TB   = 1
    size_E_REG  = 1
    for each_idx in l_tensor_idx:
        #   External index: mapped either on the thread block or on registers.
        if tc_helper.tc_gen_helper_find_1d(l_other_tensor_idx, each_idx) == -1:
            if tc_helper.tc_gen_helper_find_1d(each_configuration.list_REG_X, each_idx) == -1 and tc_helper.tc_gen_helper_find_1d(each_configuration.list_REG_Y, each_idx) == -1:
                size_E_TB *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
            else:
                size_E_REG *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
        #   Internal (contracted) index: always resident on the thread block.
        else:
            size_K_TB *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
    return size_E_TB, size_K_TB, size_E_REG


def _tc_gen_models_GMEM_load_cost(each_configuration, size_E_TB, size_K_TB,
                                  size_E_REG, size_continuous, opt_load_ext,
                                  steps_main_loops, size_TB_K, label,
                                  opt_print=0):
    #
    #   Estimate the DRAM-transaction cost of loading one input tensor for a
    #   single thread block across the whole main loop.
    #
    #   opt_load_ext selects the thread-block mapping for THIS tensor:
    #       1  -> ThreadIdx.x covers external indices, ThreadIdx.y internal
    #      -1  -> ThreadIdx.x covers internal indices, ThreadIdx.y external
    #
    #   How many inner iterations each thread dimension needs when the tile
    #   footprint exceeds the thread-block extent (never below 1.0).
    #
    if opt_load_ext == 1:
        times_inner_TB_X    = max(size_E_TB / each_configuration.size_TB_X, 1.0)
        times_inner_TB_Y    = max(size_K_TB / each_configuration.size_TB_Y, 1.0)
    else:
        times_inner_TB_X    = max(size_K_TB / each_configuration.size_TB_X, 1.0)
        times_inner_TB_Y    = max(size_E_TB / each_configuration.size_TB_Y, 1.0)
    #
    if opt_print == 1:
        print (label + ": :times_inner_TB_X: ", times_inner_TB_X, ", times_inner_TB_Y: ", times_inner_TB_Y)

    #
    #   Effective thread-block extents actually used for this tensor's load.
    #
    if opt_load_ext == -1:      # TB_X -> K
        size_TB_X = min(size_K_TB, each_configuration.size_TB_X)
        size_TB_Y = min(size_E_TB, each_configuration.size_TB_Y)
    else:                       # TB_X -> E
        size_TB_X = min(size_E_TB, each_configuration.size_TB_X)
        size_TB_Y = min(size_K_TB, each_configuration.size_TB_Y)

    #
    #   A row along TB_X needs one transaction per contiguous run; scale up by
    #   rows, inner iterations, register-tile replication, and main-loop steps.
    #
    estimated_DRAM_transaction_per_TB_X                     = size_TB_X / min(size_continuous, size_TB_X)
    estimated_DRAM_transaction_per_TB                       = estimated_DRAM_transaction_per_TB_X * size_TB_Y
    estimated_DRAM_transaction_per_TB_inner_loops           = estimated_DRAM_transaction_per_TB * times_inner_TB_X * times_inner_TB_Y
    estimated_DRAM_transaction_per_TB_inner_loops_reg       = estimated_DRAM_transaction_per_TB_inner_loops * size_E_REG
    estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K   = estimated_DRAM_transaction_per_TB_inner_loops_reg * steps_main_loops

    if opt_print == 1:
        print ("size_TB_X: ", size_TB_X, ", size_TB_Y: ", size_TB_Y, ", size_TB_K: ", size_TB_K)
        print ("estimated_DRAM_transactions_per_TB_X (should be fixed): ", estimated_DRAM_transaction_per_TB_X)
        print ("estimated_DRAM_transactions_per_TB: ", estimated_DRAM_transaction_per_TB)
        print ("estimated_DRAM_transaction_per_TB_inner_loops: ", estimated_DRAM_transaction_per_TB_inner_loops)
        print ("estimated_DRAM_transaction_per_TB_inner_loops_reg: ", estimated_DRAM_transaction_per_TB_inner_loops_reg)
        print ("estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K: ", estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K)

    return estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K


def tc_gen_models_GMEM(each_configuration, list_comb, idx_count, opt_print=0):
    """Estimate the global-memory (GMEM) traffic cost of one configuration.

    Models the DRAM transactions needed to load the input tensors A and B and
    to store (and re-load) the output tensor C for every thread block, then
    writes the resulting cost components back onto ``each_configuration``
    (``cost_load_input``, ``cost_store_output``, ``cost_load_output``,
    ``cost_total``, ``steps_main_loops``).

    Parameters:
        each_configuration: configuration object carrying the index mappings
            (``list_TB_X/Y/K``, ``list_REG_X/Y``), the tensor index lists
            (``list_tensor_A/B/C``), ``list_tile_sizes`` and
            ``list_representative_problem_size``, plus the thread-block
            extents ``size_TB_X``/``size_TB_Y``.
        list_comb: pair ``[split_problem_size, num_TBs]`` chosen earlier;
            only ``list_comb[1]`` (the # of thread blocks) is used here.
        idx_count: configuration counter, used only in the debug banner.
        opt_print: 1 to emit verbose model tracing, 0 (default) for silent.

    Returns:
        1 (always; results are communicated via ``each_configuration``).

    Note:
        The original code sized B's thread-block footprint with
        ``opt_load_A_ext`` (a copy-paste from the A section — see the stale
        ``size_A_E_REG`` comment it left behind); B now follows its own
        fastest-varying-index kind via ``opt_load_B_ext``.
    """
    if opt_print == 1:
        print ("===[", idx_count, "]====================================== [Model][GMEM Load Inputs] =========================================")
        print (" mappings: TB_X <- ", each_configuration.list_TB_X, ", TB_Y <- ", each_configuration.list_TB_Y)
        print ("         : TB_K <- ", each_configuration.list_TB_K)
        print ("         : REG_X <- ", each_configuration.list_REG_X, ", REG_Y <-", each_configuration.list_REG_Y)
        print (" list_comb: ", list_comb)
        print (" tile-sizes: ", each_configuration.list_tile_sizes)

    #
    #   Internal (contracted) indices: tile volume |T_K| and full extent |N_K|.
    #
    size_TB_K   = 1
    size_N_K    = 1
    for each_int_idx in each_configuration.list_TB_K:
        size_TB_K   *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_int_idx)
        size_N_K    *= tc_helper.tc_gen_helper_find(each_configuration.list_representative_problem_size, each_int_idx)

    #
    #   # of iterations of the kernel's "main" loop (N_K / T_K).
    #
    steps_main_loops = size_N_K / size_TB_K

    #
    #   Fastest-varying-index (FVI) kind per input:
    #       -1: FVI = internal, 1: FVI = external
    #   (external = not shared with the other input tensor).
    #
    opt_load_A_ext = -1
    opt_load_B_ext = -1
    if tc_helper.tc_gen_helper_find_1d(each_configuration.list_tensor_B, each_configuration.list_tensor_A[0]) == -1:
        opt_load_A_ext = 1
    if tc_helper.tc_gen_helper_find_1d(each_configuration.list_tensor_A, each_configuration.list_tensor_B[0]) == -1:
        opt_load_B_ext = 1

    if opt_print == 1:
        print ("-1: FVI = internal, 1: FVI = external")
        print ("opt_load_A_ext: ", opt_load_A_ext, ", opt_load_B_ext: ", opt_load_B_ext)

    #
    #   Input: A
    #
    size_continuous_elements_A = _tc_gen_models_GMEM_contiguous_span(
        each_configuration, each_configuration.list_tensor_A,
        each_configuration.list_tensor_B, opt_load_A_ext)
    size_A_E_TB, size_A_K_TB, size_A_E_REG = _tc_gen_models_GMEM_tile_sizes(
        each_configuration, each_configuration.list_tensor_A,
        each_configuration.list_tensor_B)

    if opt_print == 1:
        print ("|SMEM_A| = ", size_A_E_TB * size_A_E_REG * size_A_K_TB, ", (", size_A_E_REG, " * ", size_A_E_TB, " * ", size_A_K_TB, ")")
        print ("|TB_X| = ", each_configuration.size_TB_X, ", |TB_Y| = ", each_configuration.size_TB_Y)

    cost_TB_load_A = _tc_gen_models_GMEM_load_cost(
        each_configuration, size_A_E_TB, size_A_K_TB, size_A_E_REG,
        size_continuous_elements_A, opt_load_A_ext, steps_main_loops,
        size_TB_K, "A", opt_print)

    #
    #   Input: B
    #
    size_continuous_elements_B = _tc_gen_models_GMEM_contiguous_span(
        each_configuration, each_configuration.list_tensor_B,
        each_configuration.list_tensor_A, opt_load_B_ext)

    if opt_print == 1:
        print ("size_continuous_elements_B: ", size_continuous_elements_B)

    size_B_E_TB, size_B_K_TB, size_B_E_REG = _tc_gen_models_GMEM_tile_sizes(
        each_configuration, each_configuration.list_tensor_B,
        each_configuration.list_tensor_A)

    #   FIX: B is sized by its own FVI kind (opt_load_B_ext), not A's.
    cost_TB_load_B = _tc_gen_models_GMEM_load_cost(
        each_configuration, size_B_E_TB, size_B_K_TB, size_B_E_REG,
        size_continuous_elements_B, opt_load_B_ext, steps_main_loops,
        size_TB_K, "B", opt_print)

    #
    #   Output: C
    #
    #   Contiguous elements along TB_X: the leading indices of C that coincide,
    #   position by position, with the TB_X mapping.
    #
    size_continuous_elements_C_based_TB_X = 1
    for pos in range(0, len(each_configuration.list_TB_X)):
        if each_configuration.list_TB_X[pos] == each_configuration.list_tensor_C[pos]:
            size_continuous_elements_C_based_TB_X *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_configuration.list_TB_X[pos])
        else:
            break

    #   Output-tile volume handled by one thread block.
    size_Output_TB = 1
    for each_idx in each_configuration.list_tensor_C:
        size_Output_TB *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)

    #   One transaction per contiguous run along TB_X.
    cost_TB_store_C = size_Output_TB / size_continuous_elements_C_based_TB_X

    #
    #   The # of Thread Blocks chosen for this configuration.
    #
    num_TBs = list_comb[1]

    if opt_print == 1:
        print (">>> # of TBs: ", num_TBs)

    each_configuration.cost_load_input      = (cost_TB_load_A + cost_TB_load_B) * num_TBs
    each_configuration.cost_store_output    = cost_TB_store_C * num_TBs
    #   NOTE(review): the output tile is also charged as a load — presumably
    #   modeling a read-modify-write of C (a zeroed alternative was commented
    #   out in the original); confirm against the generated kernel.
    each_configuration.cost_load_output     = cost_TB_store_C * num_TBs
    each_configuration.cost_total           = each_configuration.cost_load_input + each_configuration.cost_store_output + each_configuration.cost_load_output
    each_configuration.steps_main_loops     = steps_main_loops

    if opt_print == 1:
        print ("Cost Input (Load): ",   each_configuration.cost_load_input)
        print ("Cost Output (Store): ", each_configuration.cost_store_output)
        print ("Cost Output (Load): ",  each_configuration.cost_load_output)
        print ("Total Cost: ",          each_configuration.cost_total, ", # of steps for main-loop: ", each_configuration.steps_main_loops)
        print ("=============================================================================================================")

    #
    return 1
def tc_gen_code_Kernel_Load_Inputs(
        f, size_tb_x, size_tb_y, size_sm_a, size_sm_b, size_sm_p7, int_str_t2,
        int_str_v2, l_blk_boundary_rng, tensor_contraction, l_input_strides,
        l_t3_slices, l_internal_idx, l_t3_mapping_tb_2D, l_t3_mapping_reg,
        opt_gen_full, opt_gen_p7, opt_load_t2, opt_load_v2, opt_pre_computed,
        idx_kernel):
    # For Shared Memory,
    #   need to support non-fvi for p7
    #   >>> it affects how to generalize loading inputs.
    #   [To-Do] What is the purpose of this?
    #
    if len(l_blk_boundary_rng) > 0:
        upper_left, upper_right, l_left, l_right = tc_gen_code_Kernel_Load_Checking_Boundary(
            f, l_blk_boundary_rng, tensor_contraction)
    else:
        upper_left = size_tb_x
        upper_right = size_tb_x

    #
    #   # of Internal Indices
    #
    num_internal_indices = len(l_internal_idx)

    #
    f.write("\t\t// Load Input Tensor to Shared Memory: " + str(size_tb_x) +
            ":" +
            str(tc_helper.tc_gen_helper_find(l_t3_slices, l_internal_idx[0])) +
            "\n")
    f.write("\t\t// # of Internal Indices: " + str(num_internal_indices) +
            "\n")

    #
    #   Step 0. Boundaries for LEFT
    #       - size_tb_x & size_tb_y are determined by tiles' size.
    l_idx_x = l_t3_mapping_tb_2D[0]
    l_idx_y = l_t3_mapping_tb_2D[1]
    l_left_indices = tensor_contraction[0][4]
    l_left_target_indices = list()
    l_left_indices_reg = list()
    cond_boundary_left_ext = -1
    cond_boundary_left_int = -1
    str_cond_gen_external = ""
    str_cond_gen_internal = ""
    opt_gen_full_special_case_left = -1
    #
    #   - ThreadIdx.x -> Internal Indices -- |E_K|
    #   - ThreadIdx.y -> External Indices -- |E_LEFT|
    #
    if opt_load_t2 == -1:
        #
        #   OPTION #1. Partial Tiles for External Indices (Assumption: 4D Input Tensors)
        #
        opt_gen_full_special_case = -1
        if opt_gen_full == 1:
            cond_boundary_left_ext = 1
            for each_idx in l_left_indices:
                if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg,
                                                   each_idx) == -1:  #
                    if tc_helper.tc_gen_helper_find_1d(l_internal_idx,
                                                       each_idx) == -1:  #
                        l_left_target_indices.append(each_idx)
                else:
                    l_left_indices_reg.append(each_idx)
            #
            size_len_external_tiles = 1
            for each_target_index in l_left_target_indices:
                size_len_external_tiles = size_len_external_tiles * tc_helper.tc_gen_helper_find(
                    l_t3_slices, each_target_index)

            #
            if size_tb_y > size_len_external_tiles:
                len_covered_reg = int(size_tb_y / size_len_external_tiles)
                size_reg_tile = tc_helper.tc_gen_helper_find(
                    l_t3_slices, l_left_indices_reg[0])

                if len_covered_reg > 1:
                    opt_gen_full_special_case = 1

            #
            #   Assumption 4D Input Tensor (3 External (2 -> TB, 1 -> REG), 1 Internal)
            #   New-Version
            #
            #
            #   [1] indices along y-axis can be used directly to check boundaries for indices in the left input.
            #
            list_alternative_mapping = list()
            opt_boundary_left_input = -1
            len_l_idx_y = len(l_idx_y)
            for each_idx_left in l_left_target_indices:
                for each_idx_y in l_idx_y:
                    if each_idx_left == each_idx_y:
                        len_l_idx_y = len_l_idx_y - 1

            #
            #   [1] indices along y-axis can be used directly to check boundaries for indices in the left input.
            #
            if len_l_idx_y == 0:
                opt_boundary_left_input = 1
            #
            #       [2] indices along y-axis can be used in-directly to check boundaries for indices in the left input.
            #   or  [3] indices along y-axis cannot bu used to check boundaries for indices in the left input.
            #
            else:
                #
                #   [2] indices along y-axis can be used in-directly to check boundaries for indices in the left input.
                #
                #
                if len(l_idx_y) == len(l_left_target_indices):
                    #
                    #   The Simplest Version (Directly Replacing Indices)
                    #
                    for each_idx_left in l_left_target_indices:
                        for each_idx_y in l_idx_y:
                            if each_idx_left == each_idx_y:
                                list_alternative_mapping.append(
                                    [each_idx_y, each_idx_left])
                                break
                            else:
                                if tc_helper.tc_gen_helper_find(
                                        l_t3_slices, each_idx_left
                                ) == tc_helper.tc_gen_helper_find(
                                        l_t3_slices, each_idx_y):
                                    list_alternative_mapping.append(
                                        [each_idx_y, each_idx_left])
                                    break

                    #
                    if len(list_alternative_mapping) == len(l_idx_y):
                        print("[To-Do] list_alternative_mapping: ",
                              list_alternative_mapping)
                        opt_boundary_left_input = 2
                    else:
                        opt_boundary_left_input = 3

                else:
                    opt_boundary_left_input = 3

                #
                #   [3] indices along y-axis cannot bu used to check boundaries for indices in the left input.
                #

            if opt_boundary_left_input == 1:
                #print (" >>> [1] indices along y-axis can be used directly to check boundaries for indices in the left input.")
                idx_count = 0
                for idx_tb in l_left_target_indices:
                    if idx_count == 0:
                        str_cond_gen_external = "idx_" + idx_tb + " < rng_" + idx_tb
                    else:
                        str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + idx_tb
                    idx_count = idx_count + 1
            elif opt_boundary_left_input == 2:
                #print (" >>> [2] indices along y-axis can be used in-directly to check boundaries for indices in the left input.")
                idx_count = 0
                for idx_mapping in list_alternative_mapping:
                    if idx_count == 0:
                        str_cond_gen_external = "idx_" + idx_mapping[
                            0] + " < rng_" + idx_mapping[1]
                    else:
                        str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_mapping[
                            0] + " < rng_" + idx_mapping[1]
                    idx_count = idx_count + 1

            else:
                print(
                    " >>> [3] indices along y-axis cannot bu used to check boundaries for indices in the left input."
                )
                print(" >>> ERROR!!!! Not Support Yet")

            del list_alternative_mapping

        #
        #   OPTION #2. Partial Tiles for Internal Indices
        #
        if opt_gen_p7 == 1:
            cond_boundary_left_int = 1
            str_cond_gen_internal = "threadIdx.x < SIZE_INT_UNIT_" + str(
                idx_kernel) + " - internal_upperbound"

    #
    #   - ThreadIdx.x -> External Indices -- |E_LEFT|
    #   - ThreadIdx.y -> Internal Indices -- |E_K|
    #
    else:
        #
        #   OPTION #1. Partial Tiles for External Indices
        #
        if opt_gen_full == 1:
            #
            #   This case, TB_Y will load |T_K|
            #   However, when |TB_Y| < |T_K|, we need to load input (|T_K| / |TB_Y|) times.
            #
            cond_boundary_left_ext = 1
            for each_idx in l_left_indices:
                if tc_helper.tc_gen_helper_find_1d(
                        l_t3_mapping_reg,
                        each_idx) == -1:  #   Not mapped on REG
                    if tc_helper.tc_gen_helper_find_1d(
                            l_internal_idx,
                            each_idx) == -1:  #   External Indices
                        l_left_target_indices.append(each_idx)
                else:
                    l_left_indices_reg.append(
                        each_idx)  #   Indices Mapped on REG

            #
            #   To Calculate Length of Indices' Tile-Size mapped on TB
            #
            size_len_external_tiles = 1
            for each_target_index in l_left_target_indices:
                size_len_external_tiles = size_len_external_tiles * tc_helper.tc_gen_helper_find(
                    l_t3_slices, each_target_index)

            #
            #
            #
            if size_tb_x > size_len_external_tiles:
                opt_gen_full_special_case_left = 1

            #
            #   Assumption 4D Input Tensor (3 External (2 -> TB, 1 -> REG), 1 Internal)
            #   To-Do: Fusion is a little bit complecated.
            #
            #print ("l_left_target_indices: ", l_left_target_indices)
            #print ("l_idx_x: ", l_idx_x)
            #print ("[To-Do] Boundary Case for External Indices ---- Fusion")
            #
            #   For a Tensor Contraction,
            #
            if len(l_left_target_indices) == len(l_idx_x):
                #
                #print ("============================")
                opt_fusion = -1
                for each_target in l_left_target_indices:
                    is_common = -1
                    for each_idx_x in l_idx_x:
                        if each_target == each_idx_x:
                            is_common = 1

                    if is_common == -1:
                        opt_fusion = 1
                        break

                #
                if opt_fusion == 1:
                    #print ("opt_fusion == 1")
                    idx_count = 0
                    for idx_tb in l_idx_x:
                        if idx_count == 0:
                            #str_cond_gen_external = "idx_" + idx_tb + " < rng_" + idx_tb#l_left_target_indices[idx_count]
                            str_cond_gen_external = "idx_" + idx_tb + " < rng_" + l_left_target_indices[
                                idx_count]
                        else:
                            #str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + idx_tb#l_left_target_indices[idx_count]
                            str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + l_left_target_indices[
                                idx_count]
                        idx_count = idx_count + 1
                else:
                    #print ("opt_fusion != 1")
                    idx_count = 0
                    for idx_tb in l_idx_x:
                        if idx_count == 0:
                            str_cond_gen_external = "idx_" + idx_tb + " < rng_" + idx_tb
                        else:
                            str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + idx_tb
                        idx_count = idx_count + 1

            else:
                idx_count = 0
                for idx_tb in l_idx_x:
                    if idx_count == 0:
                        #str_cond_gen_external = "idx_" + idx_tb + " < rng_" + idx_tb#l_left_target_indices[idx_count]
                        str_cond_gen_external = "idx_" + idx_tb + " < rng_" + l_left_target_indices[
                            idx_count]
                    else:
                        #str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + idx_tb#l_left_target_indices[idx_count]
                        str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + l_left_target_indices[
                            idx_count]
                    idx_count = idx_count + 1

        #
        #   OPTION #2. Partial Tiles for Internal Indices
        #
        if opt_gen_p7 == 1:
            cond_boundary_left_int = 1
            str_cond_gen_internal = "threadIdx.y < SIZE_INT_UNIT_" + str(
                idx_kernel) + " - internal_upperbound"

    #
    #   To Write Code for Boundary Cases
    #
    if (cond_boundary_left_ext == 1 and opt_gen_full_special_case_left
            == -1) or cond_boundary_left_int == 1:
        #
        #
        #
        tc_code_kernel_helper.code_kernel_load_input_left_boundary_case(
            f, opt_gen_full_special_case_left, cond_boundary_left_ext,
            cond_boundary_left_int, str_cond_gen_external,
            str_cond_gen_internal)

    #
    #
    #
    l_left_indices_target_temp = list()
    l_left_indices_reg_temp = list()
    opt_gen_full_special_case = -1
    len_covered_reg = 1
    #
    #   To Figure out indices mapped on TB (l_left_indices_target_temp) and indices mapped on REG (l_left_indices_reg_temp)
    #
    for each_idx in l_left_indices:
        if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg, each_idx) == -1:
            if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1:
                l_left_indices_target_temp.append(each_idx)
        else:
            l_left_indices_reg_temp.append(each_idx)

    #   To Calculate Length of Indices' Tile-Size mapped on TB
    len_external_tiles_left = 1
    for each_target_index in l_left_indices_target_temp:
        len_external_tiles_left = len_external_tiles_left * tc_helper.tc_gen_helper_find(
            l_t3_slices, each_target_index)

    #   Assumed only one index is mapped on REG
    size_reg_tile = tc_helper.tc_gen_helper_find(l_t3_slices,
                                                 l_left_indices_reg_temp[0])

    #
    #   TB_X will load |E_LEFT| on TB_X without an index mapped on REG
    #
    if opt_load_t2 == -1:
        if size_tb_y > len_external_tiles_left:
            len_covered_reg = int(size_tb_y / len_external_tiles_left)
            opt_gen_full_special_case = 1
        else:
            len_covered_reg = 1
    #
    #   |TB_X| > |E_LEFT_TB|, Then, |TB_X| can cover |E_LEFT_TB|
    #
    else:
        if size_tb_x > len_external_tiles_left:
            print("[aft]size_tb_x > len_external_tiles_left :: ", size_tb_x,
                  " >? ", len_external_tiles_left)
            # how many steps for register tile can be covered by TB_X
            len_covered_reg = int(size_tb_x / len_external_tiles_left)
            opt_gen_full_special_case = 1
        #
        #   |TB_X| == |E_LEFT_TB|
        #
        elif size_tb_x == len_external_tiles_left:
            len_covered_reg = 1
    #
    #   |TB_X| < |E_LEFT_TB|
    #
    #else:
    #print ("HERE: ", size_tb_x, len_external_tiles_left)
    #len_covered_reg             = 1#size_tb_x / len_external_tiles_left # (To-Do) This case will be dealt in the loop.
    #opt_gen_full_special_case   = 2

    #
    #   Step 1: For-Statement: T2 (LEFT), |E_A'| * |E_A''|, where  A' is a set of indices mapped on Thread Block and
    #                                                             A'' is a set of indices mapped on Register Tile.
    tc_code_kernel_helper.code_kernel_load_input_left_for_statement(
        f, opt_gen_full, tensor_contraction[0][2], opt_gen_full_special_case,
        size_reg_tile, len_covered_reg, l_t3_mapping_reg)

    #
    str_str_t2 = ""
    idx_count = 0
    for each_idx in tensor_contraction[0][4]:
        if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1:
            if idx_count != 0:
                str_str_t2 = str_str_t2 + " * "
            str_str_t2 = str_str_t2 + "SIZE_SLICE_" + str(
                idx_kernel) + "_" + each_idx.capitalize()
            idx_count = idx_count + 1

    #   To Calculate Length of Indices' Tile-Size mapped on TB
    size_len_external_tiles_left = 1
    size_len_reg_tiles_left = 1
    for each_idx in tensor_contraction[0][4]:
        if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg, each_idx) == -1:
            if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1:
                size_len_external_tiles_left = size_len_external_tiles_left * tc_helper.tc_gen_helper_find(
                    l_t3_slices, each_idx)
        else:
            size_len_reg_tiles_left = size_len_reg_tiles_left * tc_helper.tc_gen_helper_find(
                l_t3_slices, each_idx)

    #
    #   [Sub-Routine] Load Tensor Inputs to sm_a[][]
    #
    if len(l_input_strides) > 0:
        tc_code_kernel_load_inputs_details.tc_gen_code_Kernel_Load_Inputs_Left(
            f, tensor_contraction, l_internal_idx, opt_load_t2, size_tb_x,
            size_tb_y, size_sm_p7, size_len_external_tiles_left, str_str_t2,
            num_internal_indices, idx_kernel, l_input_strides[0],
            opt_pre_computed, l_t3_mapping_tb_2D, l_t3_mapping_reg,
            l_t3_slices)
    else:
        tc_code_kernel_load_inputs_details.tc_gen_code_Kernel_Load_Inputs_Left(
            f, tensor_contraction, l_internal_idx, opt_load_t2, size_tb_x,
            size_tb_y, size_sm_p7, size_len_external_tiles_left, str_str_t2,
            num_internal_indices, idx_kernel, l_input_strides,
            opt_pre_computed, l_t3_mapping_tb_2D, l_t3_mapping_reg,
            l_t3_slices)

    #   To Calculate Length of Indices' Tile-Size mapped on TB
    size_len_external_tiles_right = 1
    size_len_reg_tiles_right = 1
    for each_idx in tensor_contraction[1][4]:
        if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg, each_idx) == -1:
            if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1:
                size_len_external_tiles_right = size_len_external_tiles_right * tc_helper.tc_gen_helper_find(
                    l_t3_slices, each_idx)
        else:
            size_len_reg_tiles_right = size_len_reg_tiles_right * tc_helper.tc_gen_helper_find(
                l_t3_slices, each_idx)

    #print ("size_len_reg_tiles_left, right:", size_len_reg_tiles_left, ",", size_len_reg_tiles_right)
    #
    #   When we cannot merge Two different loops for Both Inputs.
    #
    #if ((size_sm_a / size_tb_y) != (size_sm_b / size_tb_y)) or (opt_load_t2 != opt_load_v2) or (size_tb_x > upper_right) or (size_tb_x > upper_left) or opt_gen_full == 1 or (size_len_reg_tiles_left != size_len_reg_tiles_right):
    if ((size_sm_a / size_tb_y) != (size_sm_b / size_tb_y)) or (
            opt_load_t2 != opt_load_v2) or opt_gen_full == 1 or (
                size_len_reg_tiles_left != size_len_reg_tiles_right):
        f.write("\t\t}\n")
        f.write("\n")

        #
        #
        #
        l_right_indices = tensor_contraction[1][4]
        l_right_indices_target = list()
        l_right_indices_reg = list()
        cond_boundary_right_ext = -1
        cond_boundary_right_int = -1
        cond_boundary_right_tbx = -1
        cond_boundary_right_tby = -1
        str_cond_gen_external = ""
        str_cond_gen_internal = ""
        str_cond_gen_tb_x = ""
        str_cond_gen_tb_y = ""

        f.write("\t\t// Load Input Tensor to Shared Memory\n")
        #
        #   [Load][Right] TB_X -> |K| && TB_Y -> |E_RIGHT|
        #
        if opt_load_v2 == -1:
            #
            #   OPTION #1.   External Index
            #
            if opt_gen_full == 1:
                cond_boundary_right_ext = 1
                for each_idx in l_right_indices:
                    if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg,
                                                       each_idx) == -1:
                        if tc_helper.tc_gen_helper_find_1d(
                                l_internal_idx, each_idx) == -1:
                            l_right_indices_target.append(each_idx)
                    else:
                        l_right_indices_reg.append(each_idx)

                #print ("len(l_right_indices_target):", len(l_right_indices_target))
                #print ("len(l_idx_x);", len(l_idx_x))

                #
                #   To-Do
                #
                #   4D Input Tensor
                if len(l_right_indices_target) == len(l_idx_y):
                    idx_count = 0
                    for idx_tb in l_idx_y:
                        if idx_count == 0:
                            str_cond_gen_external = "idx_" + idx_tb + " < rng_" + l_right_indices_target[
                                idx_count]
                        else:
                            str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + l_right_indices_target[
                                idx_count]
                        idx_count = idx_count + 1
                else:
                    print("ERROR: (-1) Input Tensor Should be 4D...")

            #
            #   OPTION #2.  Internal Index
            #
            if opt_gen_p7 == 1:
                cond_boundary_right_int = 1
                str_cond_gen_internal = "threadIdx.x < SIZE_INT_UNIT_" + str(
                    idx_kernel) + " - internal_upperbound"
            #
            #   OPTION #3.
            #
            if size_tb_x > size_sm_p7:
                cond_boundary_right_tbx = 1
                str_cond_gen_tb_x = "threadIdx.x < " + str(size_sm_p7)
        else:
            #
            #   OPTION #1.  External Index
            #
            #print ("[CODE][LOAD-INPUT][RIGHT] TB_X -> |E_RIGHT| && TB_Y -> |K|")
            opt_gen_full_special_case = -1
            if opt_gen_full == 1:
                cond_boundary_right_ext = 1
                for each_idx in l_right_indices:
                    if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg,
                                                       each_idx) == -1:
                        if tc_helper.tc_gen_helper_find_1d(
                                l_internal_idx, each_idx) == -1:
                            l_right_indices_target.append(each_idx)
                    else:
                        l_right_indices_reg.append(each_idx)

                #   To Calculate Length of Indices' Tile-Size mapped on TB
                size_len_external_tiles = 1
                for each_target_index in l_right_indices_target:
                    size_len_external_tiles = size_len_external_tiles * tc_helper.tc_gen_helper_find(
                        l_t3_slices, each_target_index)

                #   TB_X will load |E_LEFT| on TB_X without the index mapped on REG
                if size_tb_x > size_len_external_tiles:
                    # how many steps for register tile can be covered by TB_X
                    len_covered_reg = int(size_tb_x / size_len_external_tiles)
                    # Assumed only one index is mapped on REG
                    size_reg_tile = tc_helper.tc_gen_helper_find(
                        l_t3_slices, l_right_indices_reg[0])

                    if len_covered_reg > 1:
                        opt_gen_full_special_case = 1
                '''
                if len(l_idx_y) > len(l_left_target_indices):
                idx_count = 0
                for idx_tb in l_left_target_indices:
                    if idx_count == 0:
                        str_cond_gen_external = "idx_" + l_idx_y[idx_count] + " < rng_" + idx_tb
                    else:
                        str_cond_gen_external = str_cond_gen_external + " && idx_" + l_idx_y[idx_count] + " < rng_" + idx_tb#l_left_target_indices[idx_count]
                    idx_count = idx_count + 1
                '''

                #

                #print ("len(l_right_indices_target):", len(l_right_indices_target))
                #print ("len(l_idx_x);", len(l_idx_x))

                if len(l_idx_x) > len(l_right_indices_target):
                    idx_count = 0
                    for idx_tb in l_right_indices_target:
                        if idx_count == 0:
                            str_cond_gen_external = "idx_" + l_idx_x[
                                idx_count] + " < rng_" + idx_tb
                        else:
                            temp = str_cond_gen_external  # bug??
                            str_cond_gen_external = temp + " && idx_" + l_idx_x[
                                idx_count] + " < rng_" + idx_tb
                        idx_count = idx_count + 1
                else:
                    idx_count = 0
                    for idx_tb in l_idx_x:
                        if idx_count == 0:
                            str_cond_gen_external = "idx_" + idx_tb + " < rng_" + l_right_indices_target[
                                idx_count]
                        else:
                            temp = str_cond_gen_external  # bug??
                            str_cond_gen_external = temp + " && idx_" + idx_tb + " < rng_" + l_right_indices_target[
                                idx_count]
                        idx_count = idx_count + 1
                '''
                if len(l_right_indices_target) == len(l_idx_x):
                    idx_count = 0
                    for idx_tb in l_idx_x:
                        if idx_count == 0:
                            str_cond_gen_external = "idx_" + idx_tb + " < rng_" + l_right_indices_target[idx_count]
                        else:   
                            temp = str_cond_gen_external     # bug??
                            str_cond_gen_external = temp + " && idx_" + idx_tb + " < rng_" + l_right_indices_target[idx_count]
                        idx_count = idx_count + 1
                else:
                    print ("ERROR: (!-1) Input Tensor Should be 4D...")
                '''

            #
            #   OPTION #2.
            #
            if opt_gen_p7 == 1:
                cond_boundary_right_int = 1
                str_cond_gen_internal = "threadIdx.y < SIZE_INT_UNIT_" + str(
                    idx_kernel) + " - internal_upperbound"

        #
        #   Boundary Cases (External, Internal and Thread Block)
        #   To Write Code for Boundary Cases
        #
        if cond_boundary_right_ext == 1 or cond_boundary_right_int == 1 or cond_boundary_right_tbx == 1:
            #
            #
            #
            tc_code_kernel_helper.code_kernel_load_input_right_boundary_case(
                f, cond_boundary_right_ext, cond_boundary_right_tbx,
                cond_boundary_right_int, str_cond_gen_external,
                str_cond_gen_tb_x, str_cond_gen_internal)

        #
        #
        l_right_indices_target_temp = list()
        l_right_indices_reg_temp = list()
        #
        for each_idx in l_right_indices:
            if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg,
                                               each_idx) == -1:
                if tc_helper.tc_gen_helper_find_1d(l_internal_idx,
                                                   each_idx) == -1:
                    l_right_indices_target_temp.append(each_idx)
            else:
                l_right_indices_reg_temp.append(each_idx)

        #   To Calculate Length of Indices' Tile-Size mapped on TB
        size_len_external_tiles = 1
        for each_target_index in l_right_indices_target_temp:
            size_len_external_tiles = size_len_external_tiles * tc_helper.tc_gen_helper_find(
                l_t3_slices, each_target_index)

        # Assumed only one index is mapped on REG
        size_reg_tile = tc_helper.tc_gen_helper_find(
            l_t3_slices, l_right_indices_reg_temp[0])

        #print ("size_tb_x: ", size_tb_x)
        #print ("size_tb_y: ", size_tb_y)
        #print ("size_len_external_tiles: ", size_len_external_tiles)
        opt_gen_full_special_case = -1
        #
        #   TB_X will load |E_K|.
        #
        if opt_load_v2 == -1:
            #
            #
            if size_tb_y > size_len_external_tiles:
                # how many steps for register tile can be covered by TB_X
                len_covered_reg = int(size_tb_y / size_len_external_tiles)

                if len_covered_reg > 1:
                    opt_gen_full_special_case = 1
            else:
                len_covered_reg = 1
        #
        #   TB_X will load |E_RIGHT|.
        #
        else:
            #
            #
            if size_tb_x > size_len_external_tiles:
                # how many steps for register tile can be covered by TB_X
                len_covered_reg = int(size_tb_x / size_len_external_tiles)

                if len_covered_reg > 1:
                    opt_gen_full_special_case = 1
            else:
                len_covered_reg = 1

        #
        #   Step 2: This "For-Statement" is related to "Regiter-Tile."
        #           The size of Thread Block depends on Indices' Tile-Size mapped on Thread Block.
        #           However, when the lengh of a dimension along thread block which load inputs can cover some "Register-Tile,"
        #           then, we need to change the ranges.
        #
        tc_code_kernel_helper.code_kernel_load_input_right_for_statement(
            f, opt_gen_full, tensor_contraction[1][2],
            opt_gen_full_special_case, size_len_reg_tiles_right, size_reg_tile,
            len_covered_reg, l_t3_mapping_reg)

        #
        #   free
        #
        del l_right_indices_target
        del l_right_indices_reg
        del l_right_indices_target_temp
        del l_right_indices_reg_temp
    #
    #
    #
    str_str_v2 = ""
    idx_count = 0
    for each_idx in tensor_contraction[1][4]:
        if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1:
            if idx_count != 0:
                str_str_v2 = str_str_v2 + " * "
            str_str_v2 = str_str_v2 + "SIZE_SLICE_" + str(
                idx_kernel) + "_" + each_idx.capitalize()
            idx_count = idx_count + 1

    #
    #   [Sub-Routine] Load Tensor Inputs to sm_b[][]
    #
    if len(l_input_strides) > 0:
        tc_code_kernel_load_inputs_details.tc_gen_code_Kernel_Load_Inputs_Right(
            f, tensor_contraction, l_internal_idx, opt_load_v2, size_tb_x,
            size_tb_y, size_sm_p7, size_len_external_tiles_right, str_str_v2,
            num_internal_indices, idx_kernel, l_input_strides[2],
            opt_pre_computed, l_t3_mapping_tb_2D, l_t3_mapping_reg,
            opt_gen_full, opt_gen_p7, l_t3_slices)
    else:
        tc_code_kernel_load_inputs_details.tc_gen_code_Kernel_Load_Inputs_Right(
            f, tensor_contraction, l_internal_idx, opt_load_v2, size_tb_x,
            size_tb_y, size_sm_p7, size_len_external_tiles_right, str_str_v2,
            num_internal_indices, idx_kernel, l_input_strides,
            opt_pre_computed, l_t3_mapping_tb_2D, l_t3_mapping_reg,
            opt_gen_full, opt_gen_p7, l_t3_slices)

    #
    #   END: To Load Inputs
    #
    f.write("\t\t}\n")
    f.write("\t\t__syncthreads();\n")
    f.write("\n")
def tc_gen_code_kernel_load_inputs_base(
        f, opt_gen_ext, opt_gen_int, opt_load_left, opt_load_right,
        opt_internal, tensor_contraction, l_t3_slices, l_internal_idx,
        l_t3_mapping_tb_2D, l_t3_mapping_reg, size_smem_k, size_tb_x,
        size_tb_y, idx_kernel):
    """Emit the "base form" code that stages both input tensors in shared memory.

    Writes (through the project helpers) the guarded copy sections for the
    left and right inputs, separated by a blank line and followed by a
    ``__syncthreads();`` barrier, all framed by comment rulers.

    Parameters
    ----------
    f : open output file the generated CUDA code is written to.
    opt_gen_ext, opt_gen_int, opt_load_left, opt_load_right, opt_internal :
        option flags forwarded unchanged to
        ``tc_gen_code_Kernel_Load_Inputs_Abstracts``.
    tensor_contraction : per-input descriptors; ``[i][2]`` is the register
        axis ("x" or not) and ``[i][4]`` the index list of input *i*.
    l_t3_slices, l_internal_idx, l_t3_mapping_tb_2D, l_t3_mapping_reg :
        tile sizes and index-to-hardware mapping lists.
    size_smem_k, size_tb_x, size_tb_y, idx_kernel : tile/thread-block
        extents and the kernel's numeric id.

    Returns None; the only effects are writes to *f* and progress lines
    printed to stdout.
    """
    tabs = 2
    emit = tc_helper.tc_gen_helper_code_a_line

    emit(f, tabs,
         "//---------------------------------------------------------------------------------------------------",
         1)
    emit(f, tabs, "// This is for the new version", 1)

    #   >>> Base Form <<<
    #   if (idx_a < rng_c2 && threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)  // boundary: external-smem, internal
    #   if (threadIdx.x < 8)                                                        // boundary: |TB| and |K|
    #   {
    #       for (int ll = 0; ll < rng_c1; ll++)                                     // related to |REG| && boundary: external0-reg
    #       {
    #           sm_b[][] = dev_v2[ext_addr + int_addr];
    #       }
    #   }
    #   __synchthread();

    # Axis on which each input's register-tiled index lives: 0 == x, 1 == y.
    opt_axis_reg_left = 0 if tensor_contraction[0][2] == "x" else 1
    opt_axis_reg_right = 0 if tensor_contraction[1][2] == "x" else 1

    def _tile_products(idx_list):
        # Product of the tile sizes of idx_list, split into the part mapped
        # on the thread block and the part mapped on the register tile.
        # Internal (contracted) indices contribute to neither product.
        size_ext = 1
        size_reg = 1
        for each_idx in idx_list:
            if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg,
                                               each_idx) != -1:
                size_reg = size_reg * tc_helper.tc_gen_helper_find(
                    l_t3_slices, each_idx)
            elif tc_helper.tc_gen_helper_find_1d(l_internal_idx,
                                                 each_idx) == -1:
                size_ext = size_ext * tc_helper.tc_gen_helper_find(
                    l_t3_slices, each_idx)
        return size_ext, size_reg

    size_len_external_tiles_left, size_len_reg_tiles_left = _tile_products(
        tensor_contraction[0][4])
    size_len_external_tiles_right, size_len_reg_tiles_right = _tile_products(
        tensor_contraction[1][4])

    print("[Code Generator][New][Load][Input] size_len_ext_left: ",
          size_len_external_tiles_left)
    print("[Code Generator][New][Load][Input] size_len_ext_right: ",
          size_len_external_tiles_right)
    print("[Code Generator][New][Load][Input] size_len_reg_left: ",
          size_len_reg_tiles_left)
    print("[Code Generator][New][Load][Input] size_len_reg_right: ",
          size_len_reg_tiles_right)

    #
    #   [Load-Input][Left]
    #
    emit(f, tabs, "// This Part is for Loading Input-Left", 1)
    tc_code_kernel_load_inputs_abstract.tc_gen_code_Kernel_Load_Inputs_Abstracts(
        f, tabs,
        tensor_contraction[0],
        #   options
        opt_load_left, 1, 1, opt_gen_ext, opt_gen_int,
        opt_axis_reg_left,  # need to make automatically
        opt_internal,
        #   lists
        l_t3_slices, l_internal_idx, l_t3_mapping_tb_2D, l_t3_mapping_reg,
        #   sizes
        size_len_external_tiles_left, size_len_reg_tiles_left, size_smem_k,
        size_tb_x, size_tb_y, idx_kernel)

    # blank line between the two loading sections
    emit(f, tabs, "", 1)

    #
    #   [Load-Input][Right]
    #
    emit(f, tabs, "// This Part is for Loading Input-Right", 1)
    tc_code_kernel_load_inputs_abstract.tc_gen_code_Kernel_Load_Inputs_Abstracts(
        f, tabs,
        tensor_contraction[1],
        #   options
        opt_load_right, 2, 2, opt_gen_ext, opt_gen_int,
        opt_axis_reg_right,
        opt_internal,
        #   lists
        l_t3_slices, l_internal_idx, l_t3_mapping_tb_2D, l_t3_mapping_reg,
        #   sizes
        size_len_external_tiles_right, size_len_reg_tiles_right, size_smem_k,
        size_tb_x, size_tb_y, idx_kernel)

    #
    #   END: barrier after both inputs have been staged in shared memory.
    #
    emit(f, tabs, "__syncthreads();", 1)

    emit(f, tabs,
         "//---------------------------------------------------------------------------------------------------",
         1)
    emit(f, tabs, "\n", 1)
def tc_gen_perms_exclusive_TB_Y(list_sizes_TB,
                                                list_given_output_tensor, list_given_input_tensor_left, list_given_input_tensor_right,
                                                list_internal_indices,
                                                list_representative_problem_size,
                                                list_TB_K, list_TB_X,
                                                list_REG_X, list_REG_Y,
                                                list_inherited_Tile_Sizes,
                                                list_CLASS_configuration, opt_print):
    """Enumerate TB_Y index mappings over the right input tensor.

    For each candidate thread-block size in ``list_sizes_TB``, greedily walks
    the external indices of ``list_given_input_tensor_right`` (skipping those
    already mapped on REG_X/REG_Y or internal), assigning tile sizes until the
    accumulated volume covers the requested |TB_Y|.  Each mapping that (a)
    completes, (b) fits the shared-memory budget, and (c) yields equal-sized
    left/right shared-memory tiles is packed into a
    ``tc_gen_permutations.Configuration`` and appended to
    ``list_CLASS_configuration``.

    Parameters:
        list_sizes_TB: candidate |TB_Y| values to try.
        list_given_output_tensor / ..._left / ..._right: index lists of C, A, B.
        list_internal_indices: contracted (internal) indices.
        list_representative_problem_size: (index, size) pairs for full extents.
        list_TB_K, list_TB_X, list_REG_X, list_REG_Y: indices already mapped.
        list_inherited_Tile_Sizes: (index, tile) pairs fixed by earlier stages.
        list_CLASS_configuration: output list, mutated in place.
        opt_print: 1 enables verbose tracing.

    Returns None; results are accumulated in ``list_CLASS_configuration``.
    """
    #
    #   Count external vs. internal indices of the right input (used only
    #   for the verbose trace below).
    #
    num_ext_idx = 0
    num_int_idx = 0
    for each_right_idx in list_given_input_tensor_right:
        if tc_helper.tc_gen_helper_find_1d(list_internal_indices, each_right_idx) == -1:
            num_ext_idx += 1
        else:
            num_int_idx += 1
    #
    len_tensor_right = len(list_given_input_tensor_right)

    if opt_print == 1:
        # NOTE(review): the label says "LEFT" but this prints the RIGHT
        # tensor — looks like a copy/paste slip in the trace text; confirm.
        print ("========================================== [Enumerations-TB_Y]  ===================================================")        
        print ("========================================== [Exclusive] [START]  ===================================================")
        print ("Tensor (LEFT): ", list_given_input_tensor_right)
        print ("len(LEFT): ", len_tensor_right, ", # of External Indices: ", num_ext_idx, ", # of Internal Indices: ", num_int_idx)
        print ("list_representative_problem_size: ", list_representative_problem_size)
        print ("Given Tile-Sizes: ", list_inherited_Tile_Sizes)
        print ("Given REG_X: ", list_REG_X)
        print ("Given REG_Y: ", list_REG_Y)
        print ("Given TB_X:  ", list_TB_X)
        print ("========================================== [Exclusive]   [END]  ===================================================")

    #
    #   Try every candidate |TB_Y|.
    #
    for size_TB_Y in list_sizes_TB:
        if opt_print == 1:
            print ("|TB_Y| = ", size_TB_Y)

        #
        #   Assumption: This Input Tensor does not have the FVI in the Output Tensor.
        #
        #   TB_Y_Vol: accumulated volume including the current index;
        #   TB_Y_Vol_Prev: volume BEFORE the current index (used to size the
        #   partial/blocking tile); done_mapping_TB_Y flips to 1 once |TB_Y|
        #   is fully covered.
        TB_Y_Vol                =  1
        TB_Y_Vol_Prev           =  1
        done_mapping_TB_Y       = -1
        list_TB_Y               = []
        duplicated_Tile_Sizes   = copy.deepcopy(list_inherited_Tile_Sizes)

        #
        #   Greedy walk over the right input's indices, in order.
        #
        for each_right_idx in list_given_input_tensor_right:
            #
            #   #1. Internal Index
            #
            if tc_helper.tc_gen_helper_find_1d(list_internal_indices, each_right_idx) != -1:
                continue
            
            #
            #   #2. (Just In Case) Indices Mapped on REG_X
            #
            if tc_helper.tc_gen_helper_find_1d(list_REG_X, each_right_idx) != -1:
                continue

            #
            #   #3. Indices Mapped on REG_Y
            #
            if tc_helper.tc_gen_helper_find_1d(list_REG_Y, each_right_idx) != -1:
                continue
            
            #
            #   |TB_Y'|: volume including the current index's full extent.
            #
            TB_Y_Vol *= tc_helper.tc_gen_helper_find(list_representative_problem_size, each_right_idx)

            #
            #   |TB_Y'| >= |TB_Y|
            #
            if TB_Y_Vol >= size_TB_Y:
                #
                #   |TB_Y'| > |TB_Y|
                #
                if TB_Y_Vol > size_TB_Y:
                    #
                    #   First index to overshoot: give it the partial
                    #   (blocking) tile that exactly fills the remainder.
                    #   NOTE(review): assumes size_TB_Y is divisible by
                    #   TB_Y_Vol_Prev, otherwise int() truncates — confirm.
                    #
                    if done_mapping_TB_Y == -1:
                        blocking_tile_size = size_TB_Y / TB_Y_Vol_Prev
                        list_TB_Y.append(each_right_idx)
                        duplicated_Tile_Sizes.append([each_right_idx, int(blocking_tile_size)])
                        done_mapping_TB_Y = 1
                    else:
                        # TB_Y already covered: remaining indices get tile 1.
                        list_TB_Y.append(each_right_idx)
                        duplicated_Tile_Sizes.append([each_right_idx, 1])
                        
                #
                #   |TB_Y'| = |TB_Y|
                #
                else:
                    #
                    #   Exact fit: this index keeps its full extent.
                    #   NOTE(review): this branch leaves done_mapping_TB_Y
                    #   at -1, so configurations are only emitted once a
                    #   later index takes the ">" branch — confirm intended.
                    #
                    if done_mapping_TB_Y == -1:
                        list_TB_Y.append(each_right_idx)
                        duplicated_Tile_Sizes.append([each_right_idx, tc_helper.tc_gen_helper_find(list_representative_problem_size, each_right_idx)])
                    else:
                        list_TB_Y.append(each_right_idx)
                        duplicated_Tile_Sizes.append([each_right_idx, 1])
            #
            #   |TB_Y'| < |TB_Y|: not covered yet, take the full extent.
            #
            else:
                list_TB_Y.append(each_right_idx)
                duplicated_Tile_Sizes.append([each_right_idx, tc_helper.tc_gen_helper_find(list_representative_problem_size, each_right_idx)])

            #
            #   Advance the "volume before current index" tracker.
            #
            TB_Y_Vol_Prev *= tc_helper.tc_gen_helper_find(list_representative_problem_size, each_right_idx)
        #
        #
        #
        #print ("list_TB_X: ", list_TB_X)
        #print ("list_TB_Y: ", list_TB_Y)
        #print ("Tile-Sizes: ", duplicated_Tile_Sizes)
        #
        #   Configuration: emitted only when |TB_Y| was fully covered.
        #
        if done_mapping_TB_Y == 1:
            #
            #   #1. Shared-Memory |SMEM_L| = |SMEM_R|
            #
            size_SMEM_Left      = 1
            size_SMEM_Right     = 1

            #
            #   Tile-size product per input (indices with no tile entry,
            #   i.e. find() == -1, are skipped).
            #
            for each_idx in list_given_input_tensor_left:
                if tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx) != -1:
                    size_SMEM_Left *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)
            
            for each_idx in list_given_input_tensor_right:
                if tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx) != -1:
                    size_SMEM_Right *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)

            #
            #   #1. H/W Constraint---- Shared Memory
            #   NOTE(review): 16 appears to be an internal-tile factor and
            #   4096 the per-block shared-memory element budget — confirm.
            #
            if (size_SMEM_Left * 16) + (size_SMEM_Right * 16) > 4096:
                continue

            #
            #   Only balanced configurations (|SMEM_L| == |SMEM_R|) are kept.
            #
            if size_SMEM_Left == size_SMEM_Right:
                tmp_config = tc_gen_permutations.Configuration()
                tmp_config.add_tensor_C(list_given_output_tensor)
                tmp_config.add_tensor_A(list_given_input_tensor_left)
                tmp_config.add_tensor_B(list_given_input_tensor_right)
                tmp_config.add_REG_X(list_REG_X)
                tmp_config.add_REG_Y(list_REG_Y)
                tmp_config.add_TB_X(list_TB_X)
                tmp_config.add_TB_Y(list_TB_Y)
                tmp_config.add_TB_K(list_TB_K)

                #
                #   [To-Do] Need to Make it Automatically
                #   NOTE(review): hard-coded tile for index "f" — problem
                #   specific; the commented alternatives were for other cases.
                #
                #duplicated_Tile_Sizes.append(["e", 16])
                #duplicated_Tile_Sizes.append(["f", 1])
                #duplicated_Tile_Sizes.append(["g", 16])

                #duplicated_Tile_Sizes.append(["d", 16]) # for 15
                duplicated_Tile_Sizes.append(["f", 16]) #
                #duplicated_Tile_Sizes.append(["e", 16]) # 3

                tmp_config.add_tile_size(duplicated_Tile_Sizes)
                tmp_config.add_representative_problem_size(list_representative_problem_size)

                #
                #   Derived scalar sizes: product of the tile sizes of the
                #   indices mapped on each hardware resource.
                tmp_config.size_REG_X   = 1
                tmp_config.size_REG_Y   = 1
                tmp_config.size_TB_X    = 1
                tmp_config.size_TB_Y    = 1
                tmp_config.size_TB_K    = 1
                for each_idx in list_REG_X:
                    tmp_config.size_REG_X *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)

                for each_idx in list_REG_Y:
                    tmp_config.size_REG_Y *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)

                for each_idx in list_TB_X:
                    tmp_config.size_TB_X *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)

                for each_idx in list_TB_Y:
                    tmp_config.size_TB_Y *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)

                for each_idx in list_TB_K:
                    tmp_config.size_TB_K *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)
                '''
                print (">>>> list_TB_X: ", tmp_config.list_TB_X)
                print (">>>> list_TB_Y: ", tmp_config.list_TB_Y)
                print (">>>> list_TB_K: ", tmp_config.list_TB_K)
                print (">>>> list_REG_X: ", tmp_config.list_REG_X)
                print (">>>> list_REG_Y: ", tmp_config.list_REG_Y)
                print (">>>> list_Tile_Sizes: ", tmp_config.list_tile_sizes)
                print (">>>> # of Elements in SMEM_L: ", size_SMEM_Left * 16)
                print (">>>> # of Elements in SMEM_R: ", size_SMEM_Right * 16)
                '''
                #   Record the finished configuration.
                list_CLASS_configuration.append(tmp_config)
# Example n. 19
# 0
def tc_gen_code_Kernel_Initial(
        f,
        size_sm_p7,
        size_sm_a,
        size_sm_b,  # For Shared memory
        l_t3_mapping_tb_2D,
        l_t3_idx,
        l_t3_slices,  # For T3 (Output)
        size_reg_x,
        size_reg_y,  # For Register-Tiling
        opt_gen_p7,
        opt_gen_full,
        opt_pre_computed,  # Options
        opt_shared_padding,
        opt_data_type,  # Options
        idx_kernel):  # For Options for Generalizing
    """Emit the prologue of one generated CUDA kernel into *f*.

    Writes, in order: the two shared-memory tile declarations (sm_a, sm_b),
    the per-thread output-index computations (either read from pre-computed
    indirection arrays, or computed manually from threadIdx/blockIdx), the
    per-block range ("rng_*") variables needed for partial tiles, and the
    register-tile declarations plus their zero-initialization loop.

    Parameters
    ----------
    f : writable file-like object that receives the generated CUDA source.
    size_sm_p7 : first (internal/"p7") extent of both shared-memory tiles.
    size_sm_a, size_sm_b : second extent of sm_a / sm_b respectively.
    l_t3_mapping_tb_2D : two lists, [indices mapped on TB_X, indices on TB_Y].
    l_t3_idx : ordered list of the output tensor's index names.
    l_t3_slices : (index name, tile size) pairs; only consulted on the
        pre-computed path when three or more indices are mapped on TB_X.
    size_reg_x, size_reg_y : register-tile extents (reg_tile[size_reg_y][size_reg_x]).
    opt_gen_p7 : 1 emits the internal-index boundary variables.
    opt_gen_full : 1 emits support for partial (non-full) tiles.
    opt_pre_computed : -1 computes all indices manually in the kernel;
        any other value makes the kernel read pre-computed offset arrays.
    opt_shared_padding : 1 pads each shared-memory row by "+ 1" element
        (presumably for bank-conflict avoidance -- inherited convention).
    opt_data_type : "DOUBLE" selects double precision; anything else float.
    idx_kernel : kernel number; suffixes every generated macro/array name.

    Returns
    -------
    None.  All output is written to *f*.

    Bug fixed: in the partial-tile "Ranges" section the generated macro
    suffix was hard-coded as ``SIZE_SLICE_1_`` instead of being
    parameterized with *idx_kernel*, so the emitted code was wrong for any
    kernel other than kernel #1 (every other emission site in this
    function uses ``"SIZE_SLICE_" + str(idx_kernel) + "_"``).
    """
    #   Shared Memory
    #   Dependson SIZE_P7_UNIT, SIZE_REG_T, SIZE_SLICE (or SIZE_TB)
    f.write("\t// For Shared Memory,\n")
    if opt_shared_padding == 1:
        # padded rows: "[size + 1]"
        if opt_data_type == "DOUBLE":
            f.write("\t__shared__ double sm_a[" + str(size_sm_p7) + "][" +
                    str(size_sm_a) + " + 1];\n")
            f.write("\t__shared__ double sm_b[" + str(size_sm_p7) + "][" +
                    str(size_sm_b) + " + 1];\n")
        else:
            f.write("\t__shared__ float sm_a[" + str(size_sm_p7) + "][" +
                    str(size_sm_a) + " + 1];\n")
            f.write("\t__shared__ float sm_b[" + str(size_sm_p7) + "][" +
                    str(size_sm_b) + " + 1];\n")
    else:
        # unpadded rows
        if opt_data_type == "DOUBLE":
            f.write("\t__shared__ double sm_a[" + str(size_sm_p7) + "][" +
                    str(size_sm_a) + "];\n")
            f.write("\t__shared__ double sm_b[" + str(size_sm_p7) + "][" +
                    str(size_sm_b) + "];\n")
        else:
            f.write("\t__shared__ float sm_a[" + str(size_sm_p7) + "][" +
                    str(size_sm_a) + "];\n")
            f.write("\t__shared__ float sm_b[" + str(size_sm_p7) + "][" +
                    str(size_sm_b) + "];\n")
    f.write("\n")

    #
    #   basic variables (used for pre-computed arrays)
    #
    if opt_pre_computed != -1:
        f.write(
            "\tint l_idx_t3         = threadIdx.x + threadIdx.y * SIZE_TB_" +
            str(idx_kernel) + "_X;\n")  # (Required)
        f.write(
            "\tint t3_base_thread   = dev_t3_output_base_" + str(idx_kernel) +
            "[blockIdx.x] + dev_t3_output_offset_" + str(idx_kernel) +
            "[l_idx_t3];\n"
        )  # Based on inputs (t3_output_base, t3_output_offset) (Required)
    f.write("\n")

    #   Generalized for "Internal Indice"
    if opt_gen_p7 == 1:
        f.write("\tint internal_upperbound   = 0;\n"
                )  # Based on p7b's size (Generalized)
        f.write(
            "\tint internal_offset;\n")  # Based on p7b's size (Generalized)
        f.write("\n")

    #
    #   "-1": pre_computed is off
    #   " 1": pre_computed is on
    #
    if opt_pre_computed == -1:
        # every index is derived manually from threadIdx/blockIdx
        numIdx_TB_X = len(l_t3_mapping_tb_2D[0])
        numIdx_TB_Y = len(l_t3_mapping_tb_2D[1])

        #
        f.write(
            "\t// when opt_pre_computed == -1, all indices will be calculated manually\n"
        )
        f.write("\t// # of indices mapped on TB_X: " + str(numIdx_TB_X) + "\n")
        f.write("\t// # of indices mapped on TB_Y: " + str(numIdx_TB_Y) + "\n")

        #
        #   TB_X: one or two indices are handled directly; three or more
        #   fall into the generic stride-division scheme below.
        #
        if numIdx_TB_X == 1:
            f.write("\tint idx_" + l_t3_mapping_tb_2D[0][0] +
                    " = threadIdx.x;\n")
        elif numIdx_TB_X == 2:
            f.write("\tint idx_" + l_t3_mapping_tb_2D[0][0] +
                    " = threadIdx.x % SIZE_SLICE_" + str(idx_kernel) + "_" +
                    l_t3_mapping_tb_2D[0][0].capitalize() + ";\n")
            f.write("\tint idx_" + l_t3_mapping_tb_2D[0][1] +
                    " = threadIdx.x / SIZE_SLICE_" + str(idx_kernel) + "_" +
                    l_t3_mapping_tb_2D[0][0].capitalize() + ";\n")
        else:
            #
            #   [To-Do] generic case: build the prefix-product strides first
            #
            list_strides = list()
            idx_count = 0
            prev_stride = ""
            for each_idx in l_t3_mapping_tb_2D[0]:
                if idx_count != len(l_t3_mapping_tb_2D[0]) - 1:
                    if prev_stride == "":
                        list_strides.append("SIZE_SLICE_" + str(idx_kernel) +
                                            "_" + each_idx.capitalize())
                        prev_stride = "SIZE_SLICE_" + str(
                            idx_kernel) + "_" + each_idx.capitalize()
                    else:
                        list_strides.append("SIZE_SLICE_" + str(idx_kernel) +
                                            "_" + each_idx.capitalize() +
                                            " * " + prev_stride)
                #
                idx_count += 1

            #
            #   peel indices off threadIdx.x from the slowest-varying down,
            #   carrying the remainder in "remaining_idx"
            #
            rev_l_idx_tb_x = list(reversed(l_t3_mapping_tb_2D[0]))
            idx_count = 0
            rev_idx_count = len(rev_l_idx_tb_x) - 1
            for each_rev_idx in rev_l_idx_tb_x:
                #
                rev_idx_count -= 1

                #
                if idx_count == 0:
                    if idx_count == len(rev_l_idx_tb_x) - 1:
                        f.write("\tint idx_" + each_rev_idx +
                                " = threadIdx.x;\n")
                    else:
                        f.write("\tint idx_" + each_rev_idx +
                                " = threadIdx.x / (" +
                                list_strides[rev_idx_count] + ");\n")
                        f.write("\tint remaining_idx = threadIdx.x % (" +
                                list_strides[rev_idx_count] + ");\n")
                else:
                    if idx_count == len(rev_l_idx_tb_x) - 1:
                        f.write("\tint idx_" + each_rev_idx +
                                " = remaining_idx;\n")
                    else:
                        f.write("\tint idx_" + each_rev_idx +
                                " = remaining_idx / (" +
                                list_strides[rev_idx_count] + ");\n")
                        f.write("\tremaining_idx = remaining_idx % (" +
                                list_strides[rev_idx_count] + ");\n")

                #
                idx_count += 1

        #
        #   TB_Y
        #
        if numIdx_TB_Y == 1:
            f.write("\tint idx_" + l_t3_mapping_tb_2D[1][0] +
                    " = threadIdx.y;\n")
        elif numIdx_TB_Y == 2:
            f.write("\tint idx_" + l_t3_mapping_tb_2D[1][0] +
                    " = threadIdx.y % SIZE_SLICE_" + str(idx_kernel) + "_" +
                    l_t3_mapping_tb_2D[1][0].capitalize() + ";\n")
            f.write("\tint idx_" + l_t3_mapping_tb_2D[1][1] +
                    " = threadIdx.y / SIZE_SLICE_" + str(idx_kernel) + "_" +
                    l_t3_mapping_tb_2D[1][0].capitalize() + ";\n")
        else:
            #
            #   [To-Do] |TB_Y| >= 3 is not generated yet; emit a marker
            #
            f.write("\t// not-yet: |TB_Y| > 2, " + str(numIdx_TB_Y) + "\n")
            for each_idx in l_t3_mapping_tb_2D[1]:
                f.write("\tidx_" + each_idx + "\n")

        #
        #   Block Numbers: decompose blockIdx.x into one block index per
        #   output dimension, dividing by the product of faster strides.
        #
        f.write("\n")
        f.write("\tint tmp_blkIdx;\n")
        rev_l_t3_idx = reversed(l_t3_idx)
        len_l_t3_idx = len(l_t3_idx)

        #
        idx_count = len_l_t3_idx
        for each_idx in rev_l_t3_idx:
            # product of the strides of all faster-varying block indices
            str_prod_strides = ""
            for each_num_idx in range(0, idx_count - 1):
                if each_num_idx == 0:
                    str_prod_strides = "numBlk_" + l_t3_idx[each_num_idx]
                else:
                    str_prod_strides = "numBlk_" + l_t3_idx[
                        each_num_idx] + " * " + str_prod_strides

            #
            if idx_count == len_l_t3_idx:
                f.write("\tint blk_idx_" + each_idx + " = blockIdx.x / (" +
                        str_prod_strides + ");\n")
                f.write("\ttmp_blkIdx = blockIdx.x % (" + str_prod_strides +
                        ");\n")
            else:
                if idx_count == 1:
                    # last (fastest) index: the remainder is the index itself
                    f.write("\tint  blk_idx_" + each_idx + " = tmp_blkIdx;\n")
                elif idx_count == 2:
                    f.write("\tint blk_idx_" + each_idx + " = tmp_blkIdx / " +
                            str_prod_strides + ";\n")
                    f.write("\ttmp_blkIdx = tmp_blkIdx % (" +
                            str_prod_strides + ");\n")
                else:
                    f.write("\tint blk_idx_" + each_idx + " = tmp_blkIdx / (" +
                            str_prod_strides + ");\n")
                    f.write("\ttmp_blkIdx = tmp_blkIdx % (" +
                            str_prod_strides + ");\n")

            #
            f.write("\n")
            idx_count = idx_count - 1

        #
        #   the output's base address for a thread block
        #
        str_t3_base_addr = ""
        rev_l_t3_idx = reversed(l_t3_idx)
        l_tb_idx = list()

        for each_axis in l_t3_mapping_tb_2D:
            for each_idx in each_axis:
                l_tb_idx.append(each_idx)

        idx_count = 0
        existing_idx = 0
        for each_idx in rev_l_t3_idx:
            # indices mapped on a TB axis additionally contribute "idx_<i>"
            idx_t3_count = 0
            for each_tb_idx in l_tb_idx:
                if each_idx == each_tb_idx:
                    existing_idx = 1
                    l_tb_idx.pop(idx_t3_count)
                    break
                idx_t3_count = idx_t3_count + 1

            #
            if existing_idx == 1:
                if idx_count == 0:
                    str_t3_base_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(
                        idx_kernel) + "_" + each_idx.capitalize(
                        ) + " + idx_" + each_idx
                else:
                    str_t3_base_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(
                        idx_kernel
                    ) + "_" + each_idx.capitalize(
                    ) + " + idx_" + each_idx + " + (" + str_t3_base_addr + ") * size_" + each_idx
            else:
                if idx_count == 0:
                    str_t3_base_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(
                        idx_kernel) + "_" + each_idx.capitalize()
                else:
                    str_t3_base_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(
                        idx_kernel) + "_" + each_idx.capitalize(
                        ) + " + (" + str_t3_base_addr + ") * size_" + each_idx
            #
            existing_idx = 0
            idx_count = idx_count + 1

        f.write("\tint t3_base_thread = " + str_t3_base_addr + ";\n")
        f.write("\n")

        #
        #   Ranges
        #
        if opt_gen_full == 1:
            # declare one "rng_<i>" per output index
            f.write("\t// need to support partial tiles\n")
            idx_count = 0
            for each_idx in l_t3_idx:
                if idx_count == 0:
                    f.write("\tint rng_" + each_idx)
                else:
                    f.write(", rng_" + each_idx)
                #
                idx_count = idx_count + 1
            f.write(";\n")

            #
            #   full tile -> rng = SIZE_SLICE; partial -> rng = size % SIZE_SLICE
            #   (fixed: the macro suffix below used to be hard-coded as
            #   "SIZE_SLICE_1_" regardless of idx_kernel)
            #
            for each_idx in l_t3_idx:
                #
                f.write("\tif ((size_" + each_idx + " - (blk_idx_" + each_idx +
                        " * SIZE_SLICE_" + str(idx_kernel) + "_" +
                        each_idx.capitalize() + ")) >= SIZE_SLICE_" +
                        str(idx_kernel) + "_" + each_idx.capitalize() + ")\n")
                f.write("\t{\n")
                #
                #   IF: a full tile remains
                #
                f.write("\t\trng_" + each_idx + " = SIZE_SLICE_" +
                        str(idx_kernel) + "_" + each_idx.capitalize() + ";\n")
                #
                f.write("\t}\n")
                f.write("\telse\n")
                f.write("\t{\n")
                #
                #   ELSE: only the remainder of the dimension is left
                #
                f.write("\t\trng_" + each_idx + " = size_" + each_idx +
                        " % SIZE_SLICE_" + str(idx_kernel) + "_" +
                        each_idx.capitalize() + ";\n")
                #
                f.write("\t}\n")

        #
        #
        #
        del l_tb_idx

    else:
        #
        #   Generalized for non-full tile
        #   not yet generalized....
        if opt_gen_full == 1:
            f.write("\t// should support for non-full tiles\n")

            #
            #   To-Do: It does not support multi-dimensional arrays fully.
            #
            # "x"-axis
            if len(l_t3_mapping_tb_2D) != 2:
                print(
                    "ERROR: This part does not support well when len(l_t3_mapping_tb_2D) != 2!"
                )

            #
            numIdxTB_X = len(l_t3_mapping_tb_2D[0])
            numIdxTB_Y = len(l_t3_mapping_tb_2D[1])

            #
            if numIdxTB_X == 2:
                f.write("\tint idx_" + l_t3_mapping_tb_2D[0][0] +
                        " = threadIdx.x % SIZE_SLICE_" + str(idx_kernel) +
                        "_" + l_t3_mapping_tb_2D[0][0].capitalize() + ";\n")
                f.write("\tint idx_" + l_t3_mapping_tb_2D[0][1] +
                        " = threadIdx.x / SIZE_SLICE_" + str(idx_kernel) +
                        "_" + l_t3_mapping_tb_2D[0][0].capitalize() + ";\n")
            elif numIdxTB_X == 1:
                f.write("\tint idx_" + l_t3_mapping_tb_2D[0][0] +
                        " = threadIdx.x;\n")
            else:
                #print ("[ERROR]!!! The number of indices mapped on TB_X: ", numIdxTB_X, " (Not Supported Yet)")
                #print ("[ERROR]!!! TB_X: ", l_t3_mapping_tb_2D[0])
                tc_code_etc.tc_gen_code_write_line(
                    f, 1,
                    "// The # of External Indices mapped on TB_X is equal to or greater than 3"
                )

                #   cumulative strides for the >=3-index TB_X decomposition
                l_stride_TB_X = list()
                tmp_str_stride = ""
                idx_count = 0
                for each_idx in l_t3_mapping_tb_2D[0]:
                    if idx_count == 0:
                        tmp_str_stride = "SIZE_SLICE_" + str(
                            idx_kernel) + "_" + each_idx.capitalize()
                    else:
                        tmp_str_stride = tmp_str_stride + " * SIZE_SLICE_" + str(
                            idx_kernel) + "_" + each_idx.capitalize()
                    #
                    l_stride_TB_X.append(tmp_str_stride)
                    idx_count = idx_count + 1

                #
                idx_first = 0
                idx_second = 0
                idx_count = 0
                str_remainning = "threadIdx.x"
                l_rev_l_t3_mapping_TB_X = list(reversed(l_t3_mapping_tb_2D[0]))
                l_rev_l_stride_TB_X = list(reversed(l_stride_TB_X))
                for each_idx in l_rev_l_t3_mapping_TB_X:
                    #
                    #   |T_i| == 1, no need to calculate index.
                    #
                    if tc_helper.tc_gen_helper_find(l_t3_slices,
                                                    each_idx) == 1:
                        tc_code_etc.tc_gen_code_write_line(
                            f, 1, "int idx_" + each_idx + " \t= 0;")
                    #
                    #   |T_i| != 1
                    #
                    else:
                        #
                        #   first emitted index divides threadIdx.x directly
                        #
                        if idx_first == 0:
                            #
                            #   THE FVI && The FIRST
                            #
                            if idx_count == len(l_rev_l_t3_mapping_TB_X) - 1:
                                tc_code_etc.tc_gen_code_write_line(
                                    f, 1, "int idx_" + each_idx + " \t= " +
                                    str_remainning + ";")
                            #
                            #   NOT THE FVI && The FIRST
                            #
                            else:
                                tc_code_etc.tc_gen_code_write_line(
                                    f, 1, "int idx_" + each_idx + " \t= " +
                                    str_remainning + " / " +
                                    l_rev_l_stride_TB_X[idx_count - 1] + ";")
                                str_remainning = str_remainning + " % " + l_rev_l_stride_TB_X[
                                    idx_count - 1]
                            idx_first = 1
                        else:
                            if idx_second == 0:
                                tc_code_etc.tc_gen_code_write_line(
                                    f, 1, "int tmp_remainning \t= " +
                                    str_remainning + ";")
                                idx_second = 1
                            else:
                                tc_code_etc.tc_gen_code_write_line(
                                    f, 1, "tmp_remainning \t= " +
                                    str_remainning + ";")

                            #
                            if idx_count == len(l_rev_l_t3_mapping_TB_X) - 1:
                                tc_code_etc.tc_gen_code_write_line(
                                    f, 1, "int idx_" + each_idx +
                                    " = tmp_remainning ;")
                            else:
                                tc_code_etc.tc_gen_code_write_line(
                                    f, 1, "int idx_" + each_idx +
                                    " = tmp_remainning / " +
                                    l_rev_l_stride_TB_X[idx_count - 1] + ";")

                    #
                    idx_count = idx_count + 1

            #
            if numIdxTB_Y == 2:
                f.write("\tint idx_" + l_t3_mapping_tb_2D[1][0] +
                        " \t= threadIdx.y % SIZE_SLICE_" + str(idx_kernel) +
                        "_" + l_t3_mapping_tb_2D[1][0].capitalize() + ";\n")
                f.write("\tint idx_" + l_t3_mapping_tb_2D[1][1] +
                        " \t= threadIdx.y / SIZE_SLICE_" + str(idx_kernel) +
                        "_" + l_t3_mapping_tb_2D[1][0].capitalize() + ";\n")
            elif numIdxTB_Y == 1:
                f.write("\tint idx_" + l_t3_mapping_tb_2D[1][0] +
                        " \t= threadIdx.y;\n")
            else:
                print("[ERROR]!!! The number of indices mapped on TB_Y: ",
                      numIdxTB_Y, " (Not Supported Yet)")
                print("[ERROR]!!! TB_Y: ", l_t3_mapping_tb_2D[1])
            f.write("\n")

            # block ranges for t3 (in order), read from the pre-computed array
            idx_count = 0
            for t3_idx in l_t3_idx:
                f.write("\tint rng_" + t3_idx + " \t= dev_t3_block_range_" +
                        str(idx_kernel) + "[blockIdx.x * NUM_INDEX + " +
                        str(idx_count) + "];\n")
                idx_count = idx_count + 1

    f.write("\n")

    #
    #   Register-Tile
    #   : Depends on SIZE_REG_T
    #
    if opt_data_type == "DOUBLE":
        f.write("\tdouble temp_av;\n")
        if size_reg_y >= size_reg_x:
            f.write("\tdouble temp_bv[" + str(size_reg_y) +
                    "];\n")  # min(size_reg_y, size_reg_x), basically
            f.write("\tdouble reg_tile[" + str(size_reg_y) + "][" +
                    str(size_reg_x) + "];\n")
        else:
            f.write("\tdouble temp_bv[" + str(size_reg_x) +
                    "];\n")  # min(size_reg_y, size_reg_x), basically
            f.write("\tdouble reg_tile[" + str(size_reg_y) + "][" +
                    str(size_reg_x) + "];\n")
    else:
        f.write("\tfloat temp_av;\n")
        if size_reg_y >= size_reg_x:
            f.write("\tfloat temp_bv[" + str(size_reg_y) +
                    "];\n")  # min(size_reg_y, size_reg_x), basically
            f.write("\tfloat reg_tile[" + str(size_reg_y) + "][" +
                    str(size_reg_x) + "];\n")
        else:
            f.write("\tfloat temp_bv[" + str(size_reg_x) +
                    "];\n")  # min(size_reg_y, size_reg_x), basically
            f.write("\tfloat reg_tile[" + str(size_reg_y) + "][" +
                    str(size_reg_x) + "];\n")
    f.write("\n")

    #   Initializing reg_tile[][]
    f.write("\tfor (int i = 0; i < " + str(size_reg_y) + "; i++)\n")
    f.write("\tfor (int j = 0; j < " + str(size_reg_x) + "; j++)\n")
    f.write("\treg_tile[i][j] = 0.0;\n")
    f.write("\n")
Esempio n. 20
0
def tc_gen_code(tmp_count, inner_groups):
    """Generate one complete CUDA source file ("temp_<tmp_count>.cu") from
    a list of inner-groups.

    Each inner-group describes one fused tensor-contraction kernel; the
    function emits, in order: includes, macro definitions, global
    variables, SD2/pre-computed-array/basic-block host functions, CUDA
    malloc/memcpy code, one (or two, full + partial) __global__ kernel(s)
    per inner-group, the fused launcher, correctness check, free routines,
    and main().

    Parameters
    ----------
    tmp_count : integer suffix for the output file name.
    inner_groups : list of per-kernel descriptors; each is indexed
        positionally (e.g. [1]=TB mapping, [2]=register mapping,
        [4]=external indices, [5]=internal indices, [6]=input tensors,
        [8]=tile sizes) -- inferred from the call sites below; confirm
        against the producer of inner_groups.

    NOTE(review): this function reads several names it never defines
    locally -- l_device_dynamic, l_cuda_malloc, l_host_dynamic,
    l_blk_boundary_rng -- presumably module-level globals populated
    elsewhere; verify before refactoring.
    """
    #
    #   This is for calculating cost function based on a given input.
    #
    output_name = "temp"  # Depends on # of groups.

    #
    #   FILE OPEN: A LIST of Inner-Groups is for A SINGLE CUDA File.
    #            : Each Inner-Group has multiple Tensor Contractions which will be fused.
    #            : Thus, each Inner-Group is for A SINGLE KERNEL.
    #
    f = open(output_name + "_" + str(tmp_count) + ".cu", "w")

    #
    #   Per Inner-Groups,
    #
    tc_code_include.tc_code_include(f)
    #tc_code_etc.tc_gen_global_methods(f, len(inner_groups))

    # per-kernel option lists, indexed by kernel number - 1
    l_combined_opt_diffs = list()
    l_combined_opt_gen_fulls = list()
    l_combined_opt_gen_internal = list()
    l_combined_input_tensors = list()
    l_combined_t3_slices_size = list()
    l_combined_mappings = list()

    # per-kernel code fragments collected from tc_gen_global_variables
    # NOTE(review): l_combined_device_dynamic, l_combined_host_dynamic and
    # l_combined_cuda_malloc are created here but never appended to or read
    # below; the code uses the (non-local) l_device_dynamic / l_host_dynamic
    # / l_cuda_malloc instead.  Possibly dead or unfinished.
    l_combined_t3_d_decl_var = list()
    l_combined_t2_d_decl_var = list()
    l_combined_v2_d_decl_var = list()
    l_combined_t3_parameters = list()
    l_combined_t3_parameters_f = list()
    l_combined_t3_parameters_nf = list()
    l_combined_t2_parameters = list()
    l_combined_t2_parameters_f = list()
    l_combined_t2_parameters_nf = list()
    l_combined_v2_parameters = list()
    l_combined_v2_parameters_f = list()
    l_combined_v2_parameters_nf = list()
    l_combined_device_dynamic = list()
    l_combined_host_dynamic = list()
    l_combined_cuda_malloc = list()

    #
    #   To Support Multiple Inner-Groups
    #
    for each_inner_group in inner_groups:
        l_combined_input_tensors.append(each_inner_group[6])
        l_combined_t3_slices_size.append(each_inner_group[8])
        l_combined_mappings.append([each_inner_group[1], each_inner_group[2]])

    # macro definitions (shared by all kernels in this file)
    # NOTE(review): inner_groups[0][4] is passed twice here -- confirm the
    # intended argument order of tc_gen_definition.
    tc_code_define.tc_gen_definition(f, l_combined_t3_slices_size,
                                     inner_groups[0][3], inner_groups[0][4],
                                     l_combined_mappings, inner_groups[0][4],
                                     inner_groups[0][5],
                                     l_combined_input_tensors)
    #
    #   Check Types
    #
    idx_kernel = 1
    for each_inner_group in inner_groups:
        opt_gen_full, opt_gen_p7, possible_diff = tc_helper.tc_gen_helper_CheckingTypes(
            each_inner_group[3], each_inner_group[8], each_inner_group[4])

        # possible_diff, opt_gen_full, and opt_gen_p7 are created at here.
        # NOTE: possible_diff is deliberately overridden with -1 (see the
        # commented-out line); the computed value is only printed.
        #l_combined_opt_diffs.append(possible_diff)
        l_combined_opt_diffs.append(-1)
        l_combined_opt_gen_fulls.append(opt_gen_full)
        l_combined_opt_gen_internal.append(opt_gen_p7)

        print("[Code Generator][tc_gen_code] Kernel #", idx_kernel,
              ">>> opt_diff:", possible_diff, "(but -1), opt_gen_full:",
              opt_gen_full, ", opt_gen_p7:", opt_gen_p7)
        idx_kernel = idx_kernel + 1

    #
    #   Code: Global Variables (Common)
    #
    tc_code_globalvar.tc_gen_global_variables_common(f)

    # collect per-kernel declarations/parameter lists
    idx_kernel = 0
    for each_inner_group in inner_groups:
        # fresh output lists filled by tc_gen_global_variables below
        l_t3_d_decl_var = list()
        l_t2_d_decl_var = list()
        l_v2_d_decl_var = list()

        l_t3_parameters = list()
        l_t2_parameters = list()
        l_v2_parameters = list()

        l_t3_parameters_nf = list()
        l_t2_parameters_nf = list()
        l_v2_parameters_nf = list()

        l_t3_parameters_f = list()
        l_t2_parameters_f = list()
        l_v2_parameters_f = list()

        #
        #   Code: Global Variables (Tensor Contractions)
        #
        # NOTE(review): l_device_dynamic and l_cuda_malloc are not defined
        # in this function -- presumably module-level accumulators.
        tc_code_globalvar.tc_gen_global_variables(
            f, each_inner_group[6], each_inner_group[4], each_inner_group[5],
            l_t3_d_decl_var, l_t3_parameters, l_t3_parameters_nf,
            l_t2_parameters_nf, l_v2_parameters_nf, l_t3_parameters_f,
            l_t2_parameters_f, l_v2_parameters_f, l_device_dynamic,
            l_t2_d_decl_var, l_v2_d_decl_var, l_t2_parameters, l_v2_parameters,
            l_cuda_malloc, l_combined_opt_diffs[idx_kernel], idx_kernel + 1)
        #
        l_combined_t3_d_decl_var.append(l_t3_d_decl_var)
        l_combined_t2_d_decl_var.append(l_t2_d_decl_var)
        l_combined_v2_d_decl_var.append(l_v2_d_decl_var)
        l_combined_t3_parameters.append(l_t3_parameters)
        l_combined_t2_parameters.append(l_t2_parameters)
        l_combined_v2_parameters.append(l_v2_parameters)
        l_combined_t3_parameters_f.append(l_t3_parameters_f)
        l_combined_t2_parameters_f.append(l_t2_parameters_f)
        l_combined_v2_parameters_f.append(l_v2_parameters_f)
        l_combined_t3_parameters_nf.append(l_t3_parameters_nf)
        l_combined_t2_parameters_nf.append(l_t2_parameters_nf)
        l_combined_v2_parameters_nf.append(l_v2_parameters_nf)

        #
        idx_kernel = idx_kernel + 1

    #
    #   Code: SD2 Functions
    #
    idx_kernel = 1
    for each_inner_group in inner_groups:
        # NOTE(review): l_host_dynamic is not defined locally either.
        tc_pre_SD2_Functions.tc_gen_code_pre_SD2_Functions(
            f, each_inner_group[4], each_inner_group[5], each_inner_group[6],
            l_host_dynamic, idx_kernel)
        idx_kernel = idx_kernel + 1

    #
    #   Code: CUDA Malloc | Memcpy
    #
    # NOTE(review): l_t3_parameters/l_t2_parameters/l_v2_parameters here are
    # the leftovers from the LAST iteration of the globals loop above --
    # confirm that tc_gen_code_pre_CUDA_Malloc really only needs the last
    # kernel's lists (or whether l_combined_* was intended).
    tc_pre_CUDA_Malloc.tc_gen_code_pre_CUDA_Malloc(
        f, l_cuda_malloc, l_t3_parameters, l_t2_parameters, l_v2_parameters,
        l_device_dynamic, inner_groups[0][5])

    #
    #   Code: Pre-Computed Arrays
    #
    idx_kernel = 1
    for each_inner_group in inner_groups:
        # NOTE(review): each_inner_group[4] is passed twice -- confirm
        # against tc_gen_code_pre_IndirectArray's signature.
        tc_pre_IndirectArray.tc_gen_code_pre_IndirectArray(
            f, each_inner_group[4], each_inner_group[0], each_inner_group[6],
            each_inner_group[2], each_inner_group[4], each_inner_group[5],
            l_host_dynamic, l_combined_opt_diffs[idx_kernel - 1], idx_kernel)
        idx_kernel = idx_kernel + 1

    #
    #   Code: BasicBlock
    #
    idx_kernel = 1
    for each_inner_group in inner_groups:
        tc_pre_BasicBlock.tc_gen_code_pre_BasicBlock(
            f, each_inner_group[4], each_inner_group[3], each_inner_group[8],
            l_host_dynamic, each_inner_group[4], each_inner_group[5],
            l_combined_opt_diffs[idx_kernel - 1], idx_kernel)
        idx_kernel = idx_kernel + 1

    #
    print("[Code Generator][tc_gen_code] # of Inner-Groups: ",
          len(inner_groups))

    #
    #   Each Inner-Group Corresponds to A Kernel.
    #
    idx_kernel = 1
    for an_inner_group in inner_groups:
        print("[Code Generator][tc_gen_code] Creating Kernel --- #",
              idx_kernel)
        #
        kernel_name = "kernel_ccsdT_" + str(
            idx_kernel)  # Depends on # of groups.

        #   Options for Each Kernel
        possible_diff = l_combined_opt_diffs[idx_kernel - 1]
        opt_gen_full = l_combined_opt_gen_fulls[idx_kernel - 1]
        opt_gen_p7 = l_combined_opt_gen_internal[idx_kernel - 1]

        #
        #   To-Do: check if boundaries are needed by all tensor contractions, or not.
        #
        if opt_gen_full != -1:
            #   Inputs:     l_blk_boundary_rng, l_idx_size, l_t3_slices, l_t3_mapping_reg, l_t3_mapping_tb_2D, info_left_index, info_right_index
            #   Outputs:    l_blk_boundary_rng
            # NOTE(review): l_blk_boundary_rng is not defined locally;
            # tc_gen_helper_CheckingBoundary appears to mutate it in place.
            tc_helper.tc_gen_helper_CheckingBoundary(
                l_blk_boundary_rng, an_inner_group[3], an_inner_group[8],
                an_inner_group[2], an_inner_group[1],
                an_inner_group[6][0][0][1], an_inner_group[6][0][1][1])
            print(">>> Boundaries for External Indices: ", l_blk_boundary_rng)

        # (....)
        #possible_diff = -1  # (To-Do: )

        #
        #   Inputs: l_input_tensors, l_internal_idx, l_external_idx, l_t3_slices, l_t3_mapping_reg,
        #   Outputs: int_size_sm_a, int_size_sm_b, int_str_t2, int_str_v2
        #
        # NOTE(review): the int_size_* / int_str_* values are recomputed
        # (reset to 1) for every tensor contraction in the group, so after
        # this loop they reflect only the LAST contraction -- confirm this
        # is intended for fused groups with more than one contraction.
        for each_tc in an_inner_group[6]:
            #
            int_size_sm_a = 1
            int_size_sm_b = 1
            int_str_t2 = 1
            int_str_v2 = 1

            # bool_found is set but never read -- apparent leftover.
            bool_found = 1
            for each_idx in each_tc[0][1]:
                # for size_sm_a
                if tc_helper.tc_gen_helper_find_1d(an_inner_group[5],
                                                   each_idx) == -1:
                    int_size_sm_a = int_size_sm_a * tc_helper.tc_gen_helper_find(
                        an_inner_group[8], each_idx)

                # for str_str_t2
                if tc_helper.tc_gen_helper_find_1d(
                        an_inner_group[4],
                        each_idx) != -1:  # external indices (all)
                    if tc_helper.tc_gen_helper_find_1d(
                            an_inner_group[2], each_idx
                    ) == -1:  # external indices mapped on ! regiter tiling
                        int_str_t2 = int_str_t2 * tc_helper.tc_gen_helper_find(
                            an_inner_group[8], each_idx)

            bool_found = 1
            for each_idx in each_tc[1][1]:
                # for size_sm_b
                if tc_helper.tc_gen_helper_find_1d(an_inner_group[5],
                                                   each_idx) == -1:
                    int_size_sm_b = int_size_sm_b * tc_helper.tc_gen_helper_find(
                        an_inner_group[8], each_idx)

                # for str_str_v2
                if tc_helper.tc_gen_helper_find_1d(
                        an_inner_group[4],
                        each_idx) != -1:  # external indices (all)
                    if tc_helper.tc_gen_helper_find_1d(
                            an_inner_group[2], each_idx
                    ) == -1:  # external indices mapped on ! register tiling
                        int_str_v2 = int_str_v2 * tc_helper.tc_gen_helper_find(
                            an_inner_group[8], each_idx)

        #
        #   Inputs:     l_t3_mapping_tb_2D, l_t3_slices
        #   Outputs:    int_size_tb_x, int_size_tb_y
        #
        int_size_tb_x = 1
        int_size_tb_y = 1
        for each_idx in an_inner_group[1][0]:  # "x"-axis
            int_size_tb_x = int_size_tb_x * tc_helper.tc_gen_helper_find(
                an_inner_group[8], each_idx)

        for each_idx in an_inner_group[1][1]:  # "y"-axis_idx
            int_size_tb_y = int_size_tb_y * tc_helper.tc_gen_helper_find(
                an_inner_group[8], each_idx)

        # the below information should be produced by "tc_gen_input()"
        size_sm_a = int_size_sm_a  #+ 1 (padding)
        size_sm_b = int_size_sm_b  #+ 1
        size_tb_x = int_size_tb_x
        size_tb_y = int_size_tb_y
        size_sm_p7 = tc_helper.tc_gen_helper_CheckingIntUnit(
            an_inner_group[4], an_inner_group[8], an_inner_group[5])
        size_reg_y = tc_helper.tc_gen_helper_find(an_inner_group[8],
                                                  an_inner_group[2][1])
        size_reg_x = tc_helper.tc_gen_helper_find(an_inner_group[8],
                                                  an_inner_group[2][0])

        #
        #   Options for two inputs
        #   Inputs: l_input_tensors, l_internal_idx
        #
        opt_load_t2, opt_load_v2 = tc_helper.tc_gen_helper_CheckingInternalFVI(
            an_inner_group[6], an_inner_group[5])

        #
        #   Constraints
        #   Inputs: f, size_tb_x, size_tb_y, size_sm_a, size_sm_b, size_sm_p7
        #
        tc_gen_Constraints(f, size_tb_x, size_tb_y, size_sm_a, size_sm_b,
                           size_sm_p7)

        # re-fetch this kernel's collected declarations/parameters
        # NOTE(review): l_v2_parameters_nf is NOT re-fetched here (the list
        # from the last globals-loop iteration is still in scope) -- it is
        # unused below, but the asymmetry is worth confirming.
        l_t3_d_decl_var = l_combined_t3_d_decl_var[idx_kernel - 1]
        l_t2_d_decl_var = l_combined_t2_d_decl_var[idx_kernel - 1]
        l_v2_d_decl_var = l_combined_v2_d_decl_var[idx_kernel - 1]
        l_t3_parameters = l_combined_t3_parameters[idx_kernel - 1]
        l_t2_parameters = l_combined_t2_parameters[idx_kernel - 1]
        l_v2_parameters = l_combined_v2_parameters[idx_kernel - 1]
        l_t3_parameters_f = l_combined_t3_parameters_f[idx_kernel - 1]
        l_t2_parameters_f = l_combined_t2_parameters_f[idx_kernel - 1]
        l_v2_parameters_f = l_combined_v2_parameters_f[idx_kernel - 1]
        l_t3_parameters_nf = l_combined_t3_parameters_nf[idx_kernel - 1]
        l_t2_parameters_nf = l_combined_t2_parameters_nf[idx_kernel - 1]

        #
        #   >>> Create Kernels <<<
        #
        # NOTE(review): both branches start with an identical
        # tc_gen_code_Kernel call; the possible_diff != -1 branch then
        # additionally emits a "<name>_full" variant with opt_gen_full
        # forced to -1.  Since l_combined_opt_diffs is always filled with
        # -1 above, only the first branch currently runs.
        if possible_diff == -1:
            tc_code_kernel.tc_gen_code_Kernel(
                f, kernel_name, l_t3_d_decl_var, l_t2_d_decl_var,
                l_v2_d_decl_var, an_inner_group[7], an_inner_group[1],
                an_inner_group[2], an_inner_group[4], an_inner_group[5],
                an_inner_group[8], size_sm_a, size_sm_b, size_sm_p7,
                size_reg_y, size_reg_x, size_tb_y, size_tb_x, int_str_t2,
                int_str_v2, l_blk_boundary_rng, opt_gen_p7, opt_gen_full,
                opt_load_t2, opt_load_v2, idx_kernel)
        else:
            tc_code_kernel.tc_gen_code_Kernel(
                f, kernel_name, l_t3_d_decl_var, l_t2_d_decl_var,
                l_v2_d_decl_var, an_inner_group[7], an_inner_group[1],
                an_inner_group[2], an_inner_group[4], an_inner_group[5],
                an_inner_group[8], size_sm_a, size_sm_b, size_sm_p7,
                size_reg_y, size_reg_x, size_tb_y, size_tb_x, int_str_t2,
                int_str_v2, l_blk_boundary_rng, opt_gen_p7, opt_gen_full,
                opt_load_t2, opt_load_v2, idx_kernel)
            opt_gen_full = -1  # 1 or -1
            tc_code_kernel.tc_gen_code_Kernel(
                f, kernel_name + "_full", l_t3_d_decl_var, l_t2_d_decl_var,
                l_v2_d_decl_var, an_inner_group[7], an_inner_group[1],
                an_inner_group[2], an_inner_group[4], an_inner_group[5],
                an_inner_group[8], size_sm_a, size_sm_b, size_sm_p7,
                size_reg_y, size_reg_x, size_tb_y, size_tb_x, int_str_t2,
                int_str_v2, l_blk_boundary_rng, opt_gen_p7, opt_gen_full,
                opt_load_t2, opt_load_v2, idx_kernel)

        #
        idx_kernel = idx_kernel + 1

    #
    #   Code: Function to Call Kernels.
    #
    kernel_name = "kernel_ccsdT"
    tc_code_etc.tc_gen_code_fusedKernels(
        f, kernel_name, l_combined_t3_parameters, l_combined_t2_parameters,
        l_combined_v2_parameters, l_combined_t3_parameters_nf,
        l_combined_t2_parameters_nf, l_combined_v2_parameters_nf,
        l_combined_t3_parameters_f, l_combined_t2_parameters_f,
        l_combined_v2_parameters_f, l_combined_opt_diffs, len(inner_groups))

    #
    #   Code: Function for Correctness Check.
    #
    # NOTE(review): an_inner_group here is the LAST inner group from the
    # loop above -- confirm the correctness check is meant to use it.
    tc_post_Correctness.tc_gen_code_Post_Correctness(f, an_inner_group[4],
                                                     an_inner_group[5],
                                                     l_combined_input_tensors)

    #
    #   Code: Delete Device Memory Allocated Dynamically.
    #
    tc_post_HostDevice_Free.tc_gen_code_post_CUDA_Free(f, l_cuda_malloc)

    #
    #   Code: Delete Host Memory Allocated Dynamically.
    #
    tc_post_HostDevice_Free.tc_gen_code_post_HostFree(f, l_host_dynamic)

    #
    #   Code: Main Function
    #
    f.write("// # of Inner-Groups: " + str(len(inner_groups)) + "\n")
    tc_code_etc.tc_gen_code_main(f, len(inner_groups))

    # FILE CLOSE
    f.close()
def tc_gen_perms_exclusive_TB_X(list_sizes_TB,
                                                list_given_output_tensor, list_given_input_tensor_left, list_given_input_tensor_right,
                                                list_internal_indices,
                                                list_representative_problem_size,
                                                list_TB_K, list_TB_X,
                                                list_REG_X, list_REG_Y,
                                                list_inherited_Tile_Sizes,
                                                list_CLASS_configuration,
                                                opt_print):
    """Enumerate TB_X (thread-block x-axis) index mappings over the left input tensor.

    For each candidate thread-block size in *list_sizes_TB*, greedily maps the
    left input tensor's remaining external indices onto the thread block's
    x-dimension — seeded with the already-chosen FVI (list_TB_X[0]) — until the
    accumulated index volume covers that size, recording a per-index tile size
    for every mapped index.  Each mapping that fully covers the requested size
    is handed to tc_gen_perms_exclusive_TB_Y(), which continues the
    enumeration along the y-axis and ultimately records complete
    configurations in *list_CLASS_configuration*.

    Parameters:
        list_sizes_TB: candidate thread-block sizes to try for TB_X.
        list_given_output_tensor, list_given_input_tensor_left,
        list_given_input_tensor_right: index-name lists of the output tensor
            and the two input tensors (only the left tensor is scanned here).
        list_internal_indices: names of the contraction (internal) indices.
        list_representative_problem_size: [index, extent] pairs giving the
            representative extent of each index.
        list_TB_K: indices mapped to the contraction tile (TB_K).
        list_TB_X: indices already mapped to TB_X; element 0 is treated as
            the fastest-varying index (FVI).
        list_REG_X, list_REG_Y: indices already mapped to register tiles.
        list_inherited_Tile_Sizes: [index, tile-size] pairs accumulated by
            earlier enumeration stages.
        list_CLASS_configuration: accumulator filled by the deeper recursion.
        opt_print: 1 enables verbose tracing output.

    Returns:
        None.  All results are produced via side effects of the recursion.
    """
    #
    #   Classify the left tensor's indices: an index is internal iff it
    #   appears in list_internal_indices; everything else is external.
    #
    num_ext_idx = 0
    num_int_idx = 0
    for each_right_idx in list_given_input_tensor_left:
        if tc_helper.tc_gen_helper_find_1d(list_internal_indices, each_right_idx) == -1:
            num_ext_idx += 1
        else:
            num_int_idx += 1
    #
    len_tensor_left = len(list_given_input_tensor_left)

    if opt_print == 1:
        print ("========================================== [Enumerations-TB_X]  ===================================================")
        print ("========================================== [Exclusive] [START]  ===================================================")
        print ("Tensor (LEFT): ", list_given_input_tensor_left)
        print ("len(LEFT): ", len_tensor_left, ", # of External Indices: ", num_ext_idx, ", # of Internal Indices: ", num_int_idx)
        print ("list_representative_problem_size: ", list_representative_problem_size)
        print ("Given Tile-Sizes: ", list_inherited_Tile_Sizes)
        print ("Given REG_X: ", list_REG_X)
        print ("Given REG_Y: ", list_REG_Y)
        print ("Given TB_X:  ", list_TB_X)
        print ("========================================== [Exclusive]   [END]  ===================================================")

    #
    #   Try every candidate |TB_X|.
    #
    for size_TB_X in list_sizes_TB:
        if opt_print == 1:
            print ("|TB_X| = ", size_TB_X)

        #
        #   Assumption: Input Tensor whose index is the FVI in the Output will be related to REG_X and TB_X.
        #
        #   TB_X_Vol:           volume mapped so far (including the current index).
        #   TB_X_Vol_Prev:      volume mapped before the current index.
        #   done_mapping_TB_X:  1 once the accumulated volume covers |TB_X|.
        #   Deep copies keep each candidate size's mapping independent.
        #
        TB_X_Vol                = -1
        TB_X_Vol_Prev           = -1
        done_mapping_TB_X       = -1
        duplicated_TB_X         = copy.deepcopy(list_TB_X)
        duplicated_Tile_Sizes   = copy.deepcopy(list_inherited_Tile_Sizes)

        #
        #   Handling the FVI (Default): seed the mapping with list_TB_X[0].
        #   NOTE(review): if the FVI is not among the left tensor's indices,
        #   TB_X_Vol stays -1 — presumably guaranteed not to happen by the
        #   caller; confirm.
        #
        for each_left_idx in list_given_input_tensor_left:
            if each_left_idx == list_TB_X[0]:
                size_FVI = tc_helper.tc_gen_helper_find(list_representative_problem_size, each_left_idx)
                #
                #   Need to Split: the FVI extent exceeds |TB_X|, so tile it
                #   down to |TB_X| and mark the mapping complete.
                #
                if size_FVI > size_TB_X:
                    duplicated_Tile_Sizes.append([each_left_idx, size_TB_X])
                    TB_X_Vol            = size_TB_X
                    TB_X_Vol_Prev       = size_TB_X
                    done_mapping_TB_X   = 1
                #
                #   No Need to Split (Fitted): the FVI alone fills |TB_X|.
                #
                elif size_FVI == size_TB_X:
                    duplicated_Tile_Sizes.append([each_left_idx, size_TB_X])
                    TB_X_Vol            = size_TB_X
                    TB_X_Vol_Prev       = size_TB_X
                    done_mapping_TB_X   = 1
                #
                #   No Need to Split: the FVI is smaller than |TB_X|; more
                #   indices must be appended by the loop below.
                #
                else:
                    duplicated_Tile_Sizes.append([each_left_idx, size_FVI])
                    TB_X_Vol        = size_FVI
                    TB_X_Vol_Prev   = size_FVI

        #
        #   Append further left-tensor indices until |TB_X| is covered.
        #
        for each_left_idx in list_given_input_tensor_left:
            #
            #   #1. Internal Index: never mapped to TB_X.
            #
            if tc_helper.tc_gen_helper_find_1d(list_internal_indices, each_left_idx) != -1:
                continue

            #
            #   #2. Indices Mapped on REG_X
            #
            if tc_helper.tc_gen_helper_find_1d(list_REG_X, each_left_idx) != -1:
                continue

            #
            #   #3. (Just In Case) Indices Mapped on REG_Y
            #
            if tc_helper.tc_gen_helper_find_1d(list_REG_Y, each_left_idx) != -1:
                continue

            #
            #   #4. Indices Mapped on TB_X already (uses the passed-in list_TB_X)
            #
            if tc_helper.tc_gen_helper_find_1d(list_TB_X, each_left_idx) != -1:
                continue

            #
            #   |TB_X'|: tentatively include this index's full extent.
            #
            TB_X_Vol *= tc_helper.tc_gen_helper_find(list_representative_problem_size, each_left_idx)

            #
            #   |TB_X'| >= |TB_X|: the target volume is now covered.
            #
            if TB_X_Vol >= size_TB_X:
                #
                #   |TB_X'| > |TB_X|
                #
                if TB_X_Vol > size_TB_X:
                    #
                    #   Overshoot: tile this index down so the mapped product
                    #   hits |TB_X| exactly.
                    #   NOTE(review): true division then int() truncation —
                    #   assumes size_TB_X is divisible by TB_X_Vol_Prev;
                    #   confirm.
                    #
                    if done_mapping_TB_X == -1:
                        blocking_tile_size = size_TB_X / TB_X_Vol_Prev
                        duplicated_TB_X.append(each_left_idx)
                        duplicated_Tile_Sizes.append([each_left_idx, int(blocking_tile_size)])
                        done_mapping_TB_X = 1
                    else:
                        duplicated_TB_X.append(each_left_idx)
                        duplicated_Tile_Sizes.append([each_left_idx, 1])
                #
                #   |TB_X'| = |TB_X|
                #
                else:
                    #
                    #   Exact fit: map the whole extent (or a 1-tile if the
                    #   mapping was already complete).
                    #   NOTE(review): this branch does NOT set
                    #   done_mapping_TB_X, unlike the analogous exact-fit case
                    #   in tc_gen_perms_exclusive_REG_X, so such mappings are
                    #   dropped by the check below — confirm this is intended.
                    #
                    if done_mapping_TB_X == -1:
                        duplicated_TB_X.append(each_left_idx)
                        duplicated_Tile_Sizes.append([each_left_idx, tc_helper.tc_gen_helper_find(list_representative_problem_size, each_left_idx)])
                    else:
                        duplicated_TB_X.append(each_left_idx)
                        duplicated_Tile_Sizes.append([each_left_idx, 1])
            #
            #   |TB_X'| < |TB_X|: not covered yet — map the full extent and
            #   keep scanning.
            #
            else:
                duplicated_TB_X.append(each_left_idx)
                duplicated_Tile_Sizes.append([each_left_idx, tc_helper.tc_gen_helper_find(list_representative_problem_size, each_left_idx)])

            #
            #   Track the pre-inclusion volume for the next iteration.
            #
            TB_X_Vol_Prev *= tc_helper.tc_gen_helper_find(list_representative_problem_size, each_left_idx)

        #
        #   Recurse only when |TB_X| was fully covered; partial mappings
        #   (done_mapping_TB_X == -1) are discarded.
        #
        if done_mapping_TB_X == 1:
            tc_gen_perms_exclusive_TB_Y(list_sizes_TB,
                                                list_given_output_tensor, list_given_input_tensor_left, list_given_input_tensor_right,
                                                list_internal_indices,
                                                list_representative_problem_size,
                                                list_TB_K, duplicated_TB_X,
                                                list_REG_X, list_REG_Y,
                                                duplicated_Tile_Sizes,
                                                list_CLASS_configuration,
                                                opt_print)
Esempio n. 22
0
def model_predictive_modeling(list_configurations):
    """Annotate each configuration with GEMM-style dimensions and a register estimate.

    For every configuration object in *list_configurations*, derives from the
    representative problem size:
      * m — product of the extents of tensor-A indices not mapped to TB_K,
      * n — product of the extents of tensor-B indices not mapped to TB_K,
      * k — product of the extents of the TB_K (contraction) indices,
    and an estimated register count, 2 * |REG_X tile| * |REG_Y tile|
    (the factor 2 presumably accounts for per-thread accumulator duplication —
    TODO confirm), storing them on the configuration as the attributes
    m, n, k, and num_Estimated_Registers.

    Returns:
        1000 — a fixed placeholder value; callers should not rely on it.
    """
    #   architectures: p100 (sm_60), v100 (sm_70) — currently unused.
    cuda_arch = "sm_70"

    #   Planned modeling pipeline (only steps (1) and (3) are implemented):
    #   0. initialize (indices, register tiles)
    #   1. load inputs (A, B)  2. compute  3. store output
    banner = "=========[model_predictive_modeling]======================================================================="
    print(banner)

    for each_config in list_configurations:
        problem_size = each_config.list_representative_problem_size

        #
        #   (1) m, n, k from the representative problem size.  This reflects
        #   the given equation, not the particular tiling of this
        #   configuration.
        #
        dim_m = 1
        for idx in each_config.list_tensor_A:
            if tc_helper.tc_gen_helper_find_1d(each_config.list_TB_K, idx) == -1:
                dim_m *= tc_helper.tc_gen_helper_find(problem_size, idx)
        each_config.m = dim_m

        dim_n = 1
        for idx in each_config.list_tensor_B:
            if tc_helper.tc_gen_helper_find_1d(each_config.list_TB_K, idx) == -1:
                dim_n *= tc_helper.tc_gen_helper_find(problem_size, idx)
        each_config.n = dim_n

        dim_k = 1
        for idx in each_config.list_TB_K:
            dim_k *= tc_helper.tc_gen_helper_find(problem_size, idx)
        each_config.k = dim_k

        #
        #   (2) # of thread blocks — computed later by the cost model.
        #

        #
        #   (3) Estimated number of registers in a thread block.
        #
        num_base = 20  # reference baseline from earlier measurements; unused.

        reg_tile_x = 1
        for idx in each_config.list_REG_X:
            reg_tile_x *= tc_helper.tc_gen_helper_find(
                each_config.list_tile_sizes, idx)

        reg_tile_y = 1
        for idx in each_config.list_REG_Y:
            reg_tile_y *= tc_helper.tc_gen_helper_find(
                each_config.list_tile_sizes, idx)

        each_config.num_Estimated_Registers = reg_tile_x * reg_tile_y * 2

        #
        #   (4) Kernel efficiencies — not modeled yet.
        #

    print(banner)

    return 1000
def tc_gen_perms_exclusive_REG_X(list_sizes_REG, list_sizes_TB,
                                                list_given_output_tensor, list_given_input_tensor_left, list_given_input_tensor_right,
                                                list_internal_indices,
                                                list_representative_problem_size,
                                                list_TB_K, list_TB_X,
                                                list_CLASS_configuration,
                                                opt_print):
    """Enumerate REG_X (x-axis register-tile) index mappings over the left input tensor.

    For each candidate register-tile size in *list_sizes_REG* and each
    starting position within the left input tensor's index list, greedily maps
    consecutive indices — skipping internal indices and the output tensor's
    FVI — onto the x-axis register tile until the accumulated volume covers
    that size, recording a per-index tile size for every mapped index.  Each
    complete mapping is handed to tc_gen_perms_exclusive_REG_Y(), which
    continues the enumeration and ultimately records configurations in
    *list_CLASS_configuration*.

    Parameters:
        list_sizes_REG: candidate register-tile sizes to try for REG_X.
        list_sizes_TB: candidate thread-block sizes (passed through unchanged).
        list_given_output_tensor, list_given_input_tensor_left,
        list_given_input_tensor_right: index-name lists of the output tensor
            and the two input tensors (only the left tensor is scanned here).
        list_internal_indices: names of the contraction (internal) indices.
        list_representative_problem_size: [index, extent] pairs giving the
            representative extent of each index.
        list_TB_K, list_TB_X: indices mapped to TB_K / TB_X (passed through).
        list_CLASS_configuration: accumulator filled by the deeper recursion.
        opt_print: 1 enables verbose tracing output.

    Returns:
        None.  All results are produced via side effects of the recursion.
    """
    #
    #   Classify the left tensor's indices: an index is internal iff it
    #   appears in list_internal_indices; everything else is external.
    #
    num_ext_idx = 0
    num_int_idx = 0
    for each_left_idx in list_given_input_tensor_left:
        if tc_helper.tc_gen_helper_find_1d(list_internal_indices, each_left_idx) == -1:
            num_ext_idx += 1
        else:
            num_int_idx += 1
    #
    len_tensor_left = len(list_given_input_tensor_left)

    if opt_print == 1:
        print ("========================================== [Enumerations-REG_X] ===================================================")
        print ("========================================== [Exclusive]          ===================================================")
        print ("Tensor (LEFT): ", list_given_input_tensor_left)
        print ("len(LEFT): ", len_tensor_left, ", # of External Indices: ", num_ext_idx, ", # of Internal Indices: ", num_int_idx)
        print ("list_representative_problem_size: ", list_representative_problem_size)

    #
    #   For Each Tile-Size for REG_X
    #
    for size_REG_X in list_sizes_REG:
        if opt_print == 1:
            print ("|REG_X| = ", size_REG_X)

        #
        #   Try every possible starting position within the left tensor.
        #
        for start_index in range(0, len_tensor_left):
            #   Per-candidate state:
            #   REG_X_Vol:          volume mapped so far (incl. current index).
            #   REG_X_Vol_Prev:     volume mapped before the current index.
            #   done_mapping_REG_X: 1 once the volume covers |REG_X|.
            REG_X_Vol           = 1
            REG_X_Vol_Prev      = 1
            list_REG_X          = []    # will be inherited
            list_Tile_Sizes     = []    # will be inherited
            done_mapping_REG_X  = -1    # not done

            #
            #   Scan consecutive indices from the starting position.
            #
            for target_index in range(start_index, len_tensor_left):
                str_start_index = list_given_input_tensor_left[target_index]
                if opt_print == 1:
                    print ("idx: ", str_start_index)

                #
                #   #1. Internal Index: never mapped to REG_X.
                #
                if tc_helper.tc_gen_helper_find_1d(list_internal_indices, str_start_index) != -1:
                    continue

                #
                #   #2. The FVI in the Output Tensor is excluded from REG_X.
                #
                if str_start_index == list_given_output_tensor[0]:
                    continue

                if opt_print == 1:
                    print (">> idx: ", str_start_index)
                #
                #   |REG_X'|: tentatively include this index's full extent.
                #
                REG_X_Vol *= tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)

                if opt_print == 1:
                    print (">> idx: ", str_start_index, ", REG_X_Vol: ", REG_X_Vol)

                #
                #   |REG_X'| >= |REG_X|: the target volume is now covered.
                #
                if REG_X_Vol >= size_REG_X:
                    #
                    #   |REG_X'| > |REG_X|
                    #
                    if REG_X_Vol > size_REG_X:
                        #
                        #   Need to Split (REG and BX): tile this index down
                        #   so the mapped product hits |REG_X| exactly.
                        #   NOTE(review): true division then int() truncation —
                        #   assumes size_REG_X is divisible by REG_X_Vol_Prev;
                        #   confirm.
                        #
                        if done_mapping_REG_X == -1:
                            blocking_tile_size = size_REG_X / REG_X_Vol_Prev
                            list_REG_X.append(str_start_index)
                            list_Tile_Sizes.append([str_start_index, int(blocking_tile_size)])
                            done_mapping_REG_X = 1
                        else:
                            list_Tile_Sizes.append([str_start_index, 1])
                    #
                    #   |REG_X'| = |REG_X|
                    #
                    else: 
                        #
                        #   Exact fit: map the whole extent.
                        #   NOTE(review): done_mapping_REG_X is only ever set
                        #   immediately before the break below, so the
                        #   done == 1 else-branches in this if/else appear
                        #   unreachable; confirm.
                        #
                        if done_mapping_REG_X == -1:
                            list_REG_X.append(str_start_index)
                            list_Tile_Sizes.append([str_start_index, tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)])
                            done_mapping_REG_X = 1
                        else:
                            list_Tile_Sizes.append([str_start_index, 1])
                    #
                    #   |REG_X| covered — stop scanning this candidate.
                    #
                    break
                else:
                    list_REG_X.append(str_start_index)
                    list_Tile_Sizes.append([str_start_index, tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)])

                #
                #   Track the pre-inclusion volume for the next iteration.
                #
                REG_X_Vol_Prev *= tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)
            #
            #   Recurse only when |REG_X| was fully covered; partial mappings
            #   are discarded.
            #
            if done_mapping_REG_X == 1:
                if opt_print == 1:
                    print ("list_REG_X: ", list_REG_X)
                    print ("list_Tile_sizes: ", list_Tile_Sizes)
                tc_gen_perms_exclusive_REG_Y(list_sizes_REG, list_sizes_TB,
                                        list_given_output_tensor, list_given_input_tensor_left, list_given_input_tensor_right,
                                        list_internal_indices, list_representative_problem_size,
                                        list_TB_K, list_TB_X,
                                        list_REG_X,
                                        list_Tile_Sizes,
                                        list_CLASS_configuration,
                                        opt_print)
Esempio n. 24
0
def tc_gen_code_Kernel_Register_Transpose(
        f, kernel_name, l_inner_groups, l_combined_t3_d_decl_var,
        l_combined_t2_d_decl_var, l_combined_v2_d_decl_var, l_t3_d_decl_var,
        l_t2_d_decl_var, l_v2_d_decl_var, opt_gen_p7, opt_gen_full):
    """Emit one fused kernel that processes several inner-groups in sequence,
    carrying partial results between groups via a register transpose.

    Writes the complete kernel text to *f*: a shared header, a one-time
    initialization block (derived from the first inner-group), then per
    inner-group the contraction loops (input loads plus cross-product
    computes), a register-transpose step between consecutive inner-groups,
    and finally the store of the register tiles to global memory based on the
    last inner-group.

    Parameters:
        f: open file object the generated kernel source is written to.
        kernel_name: base kernel name; "_rt" is appended for this variant.
        l_inner_groups: inner-group descriptors accessed positionally
            (e.g. [1] thread-block info, [2] register-tile indices,
            [7] tensor contractions, [8] tile sizes) — assumed to follow the
            project's inner-group layout; TODO confirm field meanings.
        l_combined_t3_d_decl_var, l_combined_t2_d_decl_var,
        l_combined_v2_d_decl_var: combined declaration-variable lists used
            for the kernel header.
        l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var: per-tensor
            declaration lists (not referenced in this body).
        opt_gen_p7: 1 enables generalized contraction-length handling
            (internal_offset / internal_upperbound bookkeeping).
        opt_gen_full: -1 skips the per-group boundary-range computation.

    Returns:
        None; output is produced by writing to *f*.
    """
    #
    #   [1] Header (To-Do: Strides for Non-FVI of Internal Index)
    #
    tc_gen_code_Kernel_Head_RT(f, kernel_name + "_rt",
                               l_combined_t3_d_decl_var,
                               l_combined_t2_d_decl_var,
                               l_combined_v2_d_decl_var)

    #   Open the kernel body.
    f.write("{\n")

    f.write("\t// Kernel for Register Transpose\n")
    f.write("\t// " + str(len(l_inner_groups)) +
            " of Inner Groups will be merged by using Register Transpose\n")
    f.write("\n")

    #
    #   Each Inner-Group
    #
    inner_count = 1
    # Thread-block info of the FIRST inner-group; reused for the loads and
    # the final store of every group (see the commented-out alternatives in
    # the call sites below).
    temp_tb_2D = list()
    for each_inner_group in l_inner_groups:
        #
        #   For Each Inner-Group: derive shared-memory tile sizes
        #   (left/right/internal), thread-block shape, register-tile extents,
        #   and the load options for this group.
        #
        size_smem_left, size_smem_right, str_left, str_right = tc_interface.tc_interface_SMEM_Size(
            each_inner_group[6], each_inner_group[4], each_inner_group[5],
            each_inner_group[8], each_inner_group[2])
        size_TB_X, size_TB_Y = tc_interface.tc_interface_TB_Size(
            each_inner_group[1][0], each_inner_group[1][1],
            each_inner_group[8])
        size_smem_internal = tc_helper.tc_gen_helper_CheckingIntUnit(
            each_inner_group[4], each_inner_group[8], each_inner_group[5])
        size_REG_X = tc_helper.tc_gen_helper_find(each_inner_group[8],
                                                  each_inner_group[2][1])
        size_REG_Y = tc_helper.tc_gen_helper_find(each_inner_group[8],
                                                  each_inner_group[2][0])
        opt_load_t2, opt_load_v2 = tc_helper.tc_gen_helper_CheckingInternalFVI(
            each_inner_group[6], each_inner_group[5])
        l_blk_boundary_rng = list()

        #print (inner_count, ">>> TB:", size_TB_X, size_TB_Y, ", REG: ", size_REG_X, size_REG_Y)
        #
        #   Boundary ranges are only computed for partial-tile kernels;
        #   opt_gen_full == -1 leaves l_blk_boundary_rng empty.
        #
        if opt_gen_full != -1:
            tc_helper.tc_gen_helper_CheckingBoundary(
                l_blk_boundary_rng, each_inner_group[3], each_inner_group[8],
                each_inner_group[2], each_inner_group[1],
                each_inner_group[6][0][0][1], each_inner_group[6][0][1][1])

        if inner_count == 1:
            #
            #   [2] Initialization
            #       Q) This first information can be used for the other inner-groups?
            #
            f.write("\t// Initialization\n")
            tc_gen_code_Kernel_Initial(f, size_smem_internal, size_smem_left,
                                       size_smem_right, each_inner_group[1],
                                       each_inner_group[4], size_REG_X,
                                       size_REG_Y, opt_gen_p7, opt_gen_full,
                                       inner_count)  # "1" kernel number...

            temp_tb_2D = each_inner_group[1]

        #
        #   [3] An Inner-Group
        #
        f.write("\t// Within Inner-Group\n")
        #
        #   For Each Tensor Contraction,
        #
        idx_countractions = 1
        for tensor_contraction in each_inner_group[7]:
            f.write("\t// Tensor Contraction\n")

            #   Reset the generalized-contraction bookkeeping for every
            #   contraction except the very first one of the first group.
            if opt_gen_p7 == 1 and (idx_countractions > 1 or inner_count > 1):
                f.write("\tinternal_upperbound = 0;\n")

            #   [START] Tensor Contraction
            f.write("\t#pragma unroll 1\n")
            f.write(
                "\tfor (int l = 0; l < size_internal; l += SIZE_INT_UNIT_" +
                str(inner_count) + ")\n")
            f.write("\t{\n")

            #
            #   For Generalizing Internal Index,
            #
            if opt_gen_p7 == 1:
                f.write("\t\t// For Generalizing Contraction Index\n")
                f.write("\t\tinternal_offset = (l + SIZE_INT_UNIT_" +
                        str(inner_count) + ") - size_internal;\n")
                f.write(
                    "\t\tif (internal_offset > 0) internal_upperbound = internal_offset;\n"
                )
                f.write("\n")

            #
            #   [Main] Loads Inputs >> To-Do: Need to Double-Check!
            #
            f.write("\t\t// Load Inputs\n")
            tc_gen_code_Kernel_Load_Inputs(
                f,
                size_TB_X,
                size_TB_Y,
                size_smem_left,
                size_smem_right,
                size_smem_internal,  #
                l_blk_boundary_rng,
                tensor_contraction,  #
                each_inner_group[8],
                each_inner_group[5],
                temp_tb_2D,
                each_inner_group[2],  # temp_tb_2D
                #each_inner_group[1], each_inner_group[2],
                opt_gen_full,
                opt_gen_p7,
                opt_load_t2,
                opt_load_v2,
                inner_count)

            #
            #   [Start] Computes
            #
            f.write("\t\t// Computes: Cross-Product\n")
            f.write("\t\tfor (int ll = 0; ll < SIZE_INT_UNIT_" +
                    str(inner_count))

            #   For "Internal Index",
            if opt_gen_p7 == 1:
                f.write(" - internal_upperbound")

            f.write("; ll++)\n")
            f.write("\t\t{\n")

            #
            #   [Main] Computes
            #
            f.write("\t\t\t// Computes\n")
            tc_gen_code_Kernel_Compute(f, size_REG_X, size_REG_Y, str_left,
                                       str_right, tensor_contraction)

            #   [End] Computes
            f.write("\t\t}\n")
            f.write("\t\t__syncthreads();\n")

            #   [END] Tensor Contraction
            f.write("\t}\n")
            f.write("\n")

            #
            idx_countractions = idx_countractions + 1

        #
        #   Part: Register Transpose (between this group and the next one)
        #
        if inner_count < len(l_inner_groups):
            #
            f.write("\t// Register-Transpose: " + str(inner_count - 1) +
                    " with " + str(inner_count) + "\n")

            #
            #   Emit the transpose that hands this group's register tile to
            #   the next inner-group.
            #
            tc_gen_code_Kernel_Process_Register_Transpose(
                f, l_inner_groups[inner_count - 1],
                l_inner_groups[inner_count], size_smem_internal,
                size_smem_left, size_smem_right,
                size_TB_X * size_TB_Y * size_REG_X * size_REG_Y)

            f.write("\n")

        #
        #   [5] Register Tiles -> Global Memory
        #
        if inner_count == len(l_inner_groups):
            f.write("\t// Store the Results to Global Memory\n")
            f.write("\t// This should be based on the last inner-group\n")
            tc_gen_code_Kernel_Store_Results(
                f,
                opt_gen_full,
                #each_inner_group[1], each_inner_group[2],
                temp_tb_2D,
                each_inner_group[2],
                size_REG_X,
                size_REG_Y,
                1,
                1)
            #size_REG_X, size_REG_Y, inner_count)

        inner_count = inner_count + 1
        #
        #   END OF FOR: INNER-GROUP
        #

    #   Close the kernel body.
    f.write("}\n")