def tc_gen_models_Kernels(each_configuration, idx_count, opt_print=0):
    """Mark whether every tile evenly divides its representative problem size.

    Sets ``each_configuration.kernel_full_int`` to True when every internal
    (TB_K) index tiles its representative problem size with no remainder, and
    ``kernel_full_ext`` likewise for the external indices.
    ``idx_count`` is unused here (kept for signature consistency with the
    other model functions).
    """
    if opt_print == 1:
        print ("each_config.representative_problem_size: ", each_configuration.list_representative_problem_size)
        print ("each_config.tile_sizes: ", each_configuration.list_tile_sizes)

    full_ext = True
    full_int = True
    for tile_entry in each_configuration.list_tile_sizes:
        name = tile_entry[0]
        tile = tile_entry[1]
        # An index listed in list_TB_K is an internal (K) index.
        on_internal = tc_helper.tc_gen_helper_find_1d(each_configuration.list_TB_K, name) != -1
        remainder = tc_helper.tc_gen_helper_find(each_configuration.list_representative_problem_size, name) % tile
        if remainder != 0:
            if on_internal:
                full_int = False
            else:
                full_ext = False

    if opt_print == 1:
        print (">>> opt_full_int: ", full_int, ", opt_full_ext: ", full_ext)

    each_configuration.kernel_full_ext = full_ext
    each_configuration.kernel_full_int = full_int
def tc_gen_code_Kernel_Load_Inputs_Boundary_External_REG(
        opt_gen_ext, opt_axis, l_tile_sizes, l_mapping_reg):
    """Return the boundary expression for the register-mapped external index.

    opt_axis selects the register index: 0 -> x (l_mapping_reg[0]),
    anything else -> y (l_mapping_reg[1]).  When opt_gen_ext == 1 the runtime
    range variable name ("rng_<idx>") is returned; otherwise the compile-time
    tile size looked up from l_tile_sizes is returned as a string.
    """
    print("[Code Generator][Load][Input][Boundary][External][REG]")
    str_reg_idx = l_mapping_reg[0] if opt_axis == 0 else l_mapping_reg[1]
    if opt_gen_ext == 1:
        return "rng_" + str_reg_idx
    return str(tc_helper.tc_gen_helper_find(l_tile_sizes, str_reg_idx))
def tc_gen_models_Computes(each_configuration, idx_count, opt_print=0):
    """Model the compute stage: set the kernel's arithmetic intensity.

    The intensity is (|REG_X| * |REG_Y|) / (|REG_X| + |REG_Y|) where each
    |REG_*| is the product of the tile sizes of the indices mapped on that
    register axis.  ``idx_count`` is unused here (kept for signature
    consistency with the other model functions).
    """
    if opt_print == 1:
        print ("[Model][Computes]")

    def _tile_volume(idx_list):
        # Product of the tile sizes of every index in idx_list.
        volume = 1
        for idx in idx_list:
            volume *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, idx)
        return volume

    size_REG_X = _tile_volume(each_configuration.list_REG_X)
    size_REG_Y = _tile_volume(each_configuration.list_REG_Y)
    each_configuration.kernel_arithmetic_intensity = (size_REG_X * size_REG_Y) / (size_REG_X + size_REG_Y)
def tc_gen_code_Kernel_Load_Inputs_Boundary_Exteranl_TB(
        opt_load_ext_int, l_input_tensor, l_mapping_reg, l_internal_idx,
        l_info_matching_indices):
    """Build the "&&"-joined boundary check for external indices mapped on TB.

    For every index of the input tensor that is neither internal nor mapped on
    a register, emit "<specific> < rng_<idx>", where <specific> is "0" when
    the matching-index table maps it to "0", else "idx_<matched>".
    opt_load_ext_int is unused here (kept for signature consistency).
    """
    print("[Code Generator][Load][Input][Boundary][External][TB]")
    print(
        "[Code Generator][Load][Input][Boundary][External][TB] l_input_tensor: ",
        l_input_tensor)

    conditions = []
    for each_idx in l_input_tensor:
        if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) != -1:
            continue  # internal index: no boundary check needed
        if tc_helper.tc_gen_helper_find_1d(l_mapping_reg, each_idx) != -1:
            continue  # mapped on a register, handled elsewhere
        matched = tc_helper.tc_gen_helper_find(l_info_matching_indices, each_idx)
        specific = "0" if matched == "0" else "idx_" + matched
        conditions.append(specific + " < " + "rng_" + each_idx)
    return " && ".join(conditions)
def tc_gen_code_pre_BasicBlock_Ranges(f, l_t3_idx, l_t3_slices):
    """Emit the C loops that initialize blk_<idx>_range[] for each t3 index.

    Each element gets the full tile size from l_t3_slices; the last block is
    overwritten with the remainder range (rng_boundary_<idx>) when the index
    does not divide evenly.
    """
    for t3_idx in l_t3_idx:
        full_tile = tc_helper.tc_gen_helper_find(l_t3_slices, t3_idx)
        f.write(f"\tfor (int i = 0; i < n_blk_{t3_idx}; i++)\n")
        f.write("\t{\n")
        f.write(f"\t\tblk_{t3_idx}_range[i] = {full_tile};\n")
        f.write(f"\t\tif (rng_boundary_{t3_idx} != 0 && i == n_blk_{t3_idx} - 1)\n")
        f.write("\t\t{\n")
        f.write(f"\t\t\tblk_{t3_idx}_range[i] = rng_boundary_{t3_idx};\n")
        f.write("\t\t}\n")
        f.write("\t}\n")
    f.write("\n")
def tc_gen_code_pre_BasicBlock_Initial(f, l_t3_idx, l_t3_slices, idx_kernel):
    """Emit the C declarations for per-index block bookkeeping of t3.

    For each t3 index this writes: the number of blocks (CEIL of size over
    tile), the blk_<idx>_range[] array, the number of full tiles (debug) and
    the boundary remainder.  idx_kernel is unused here (kept for signature
    consistency with the other code emitters).
    """
    # Number of blocks per index: CEIL(size, tile).
    for t3_idx in l_t3_idx:
        tile = tc_helper.tc_gen_helper_find(l_t3_slices, t3_idx)
        f.write(f"\tint n_blk_{t3_idx} = CEIL(SIZE_IDX_{t3_idx.capitalize()}, {tile});\n")
    f.write("\n")
    # Per-index block-range arrays.
    for t3_idx in l_t3_idx:
        f.write(f"\tint blk_{t3_idx}_range[n_blk_{t3_idx}];\n")
    f.write("\n")
    # (just-in-case) number of full tiles per index, for debugging.
    for t3_idx in l_t3_idx:
        tile = tc_helper.tc_gen_helper_find(l_t3_slices, t3_idx)
        f.write(f"\tint n_blk_full_{t3_idx} = SIZE_IDX_{t3_idx.capitalize()} / {tile};\n")
    f.write("\n")
    # Remainder (boundary) range per index.
    for t3_idx in l_t3_idx:
        tile = tc_helper.tc_gen_helper_find(l_t3_slices, t3_idx)
        f.write(f"\tint rng_boundary_{t3_idx} = SIZE_IDX_{t3_idx.capitalize()} % {tile};\n")
    f.write("\n")
def tc_gen_code_Kernel_Load_Checking_Boundary(f, l_blk_boundary_rng, tensor_contraction):
    """Collect boundary-range products for the left and right inputs.

    For each index of the left input (tensor_contraction[0][4]) and of the
    right input (tensor_contraction[1][4]) that appears in l_blk_boundary_rng,
    multiply its range into the corresponding upper bound and record the index.

    Parameters:
        f: file handle (unused here; kept for signature consistency with the
            other code-generation helpers).
        l_blk_boundary_rng: lookup table searched via
            tc_helper.tc_gen_helper_find (-1 means "not present").
        tensor_contraction: pair of input descriptors; element [i][4] holds
            the index list of input i.

    Returns:
        (upper_left, upper_right, l_left, l_right)
    """
    print("l_blk_boundary_rng: ", l_blk_boundary_rng)

    def _boundary_product(idx_list):
        # One lookup per index (the original code looked each index up twice).
        upper = 1
        picked = list()
        for idx in idx_list:
            rng = tc_helper.tc_gen_helper_find(l_blk_boundary_rng, idx)
            if rng != -1:
                upper = upper * rng
                picked.append(idx)
        return upper, picked

    upper_left, l_left = _boundary_product(tensor_contraction[0][4])
    upper_right, l_right = _boundary_product(tensor_contraction[1][4])
    return upper_left, upper_right, l_left, l_right
def tc_gen_definition_strides_output(f, l_idx_size, l_t3_idx):
    """Emit #define macros for the output (t3) strides.

    Walks the t3 indices innermost-first: an index whose predecessor had size
    1 (including the first index) gets stride "1"; otherwise the stride is the
    previous stride macro times the previous index's size macro.
    """
    f.write("// t3 for output\n")
    prev_size = 1
    prev_idx = ""
    for t3_idx in l_t3_idx:
        macro = "STR_SD2_T3_" + str(t3_idx.capitalize())
        if prev_size == 1:
            # Innermost index (or a size-1 predecessor): stride collapses to 1.
            tc_gen_code_helper_define(f, macro, "1")
        else:
            tc_gen_code_helper_define(
                f, macro,
                "STR_SD2_T3_" + prev_idx.capitalize() + " * " + "SIZE_IDX_" + prev_idx.capitalize())
        prev_idx = t3_idx
        prev_size = tc_helper.tc_gen_helper_find(l_idx_size, t3_idx)
    f.write("\n")
def tc_gen_Inner_Group(l_outer_groups, tmp_count, tmp_config, opt_print, opt_data_type):
    """Partition each outer-group's tensor contractions into fusable inner-groups.

    Builds candidate configurations via Configuration.get_configurations, then
    for (currently only) the first outer-group checks each tensor contraction
    against the fusion constraints and collects the survivors into
    l_inner_groups.  Contractions that fail a constraint terminate the program
    via sys.exit().  Picked contractions are removed (pop) from
    each_outer_group[1] in place.

    Returns:
        (l_inner_groups, l_t3_interface_info)

    NOTE(review): several helpers (tc_helper, Configuration, Helper_Inputs,
    sys) are module-level names defined elsewhere in this file's imports.
    """
    #
    if opt_print == 1:
        print ("[Code Generator][Inner-Group] Working...")
        print ("====================== Step 2: Creating Inner-Groups =======================")
        print (" Only Support the First Outer-Group")
    #
    # Permutations ([Assumption] Input: an arbitrary tensor contraction)
    #
    #
    # To Handle 2D Tensors is in "tc_gen_permutations()"*****
    #
    #
    # Cost-Model ([Verion 1.0] DRAM Data Movement, [Version 2.0] Bank Conflicts ...)
    #
    list_configurations_outer_group = list()
    Configuration.get_configurations(l_outer_groups, list_configurations_outer_group, tmp_count, tmp_config, 0, opt_data_type)
    #
    #
    #
    for each_config_outer_group in list_configurations_outer_group:
        each_config_outer_group.print_configuration(1)
    #
    # (Temporary, Mapping and Tile-Sizes should be Determined by Models in the future.)
    #
    info_each_inner_group = Helper_Inputs.transform_config_innergroup(list_configurations_outer_group[0])
    #
    # Every Outer-Groups:
    #
    print ("[Code Generator][Inner-Group] # of Outer-Groups: ", len(l_outer_groups))
    for each_outer_group in l_outer_groups:
        #
        print ("[Code Generator][Inner-Group] # of Tensor Contractions (Candidates) within an Outer-Group: ", len(each_outer_group[1]))
        #
        # Within an Outer-Group, there might be several Inner-Groups.
        #
        l_inner_groups = list()
        l_each_group_mapping_tb = list()
        l_each_group_mapping_2D = list()
        l_each_group_mapping_reg = list()
        l_t3_slices_size = list()
        l_t3_interface_info = list()
        l_t3_temp_inputs = list()
        l_t3_temp_conditions = list()
        #
        # To Create "Interface"
        #
        idx_count = 0
        str_common_output = ""
        for each_tc in each_outer_group[1]:
            l_t3_temp_inputs.append([each_tc[4], each_tc[6]])
            l_t3_temp_conditions.append("cond_kernel_" + str(idx_count + 1))
            if idx_count == 0:
                str_common_output = each_tc[0]
            idx_count = idx_count + 1
        #
        # Information: Split Indices
        #
        l_each_group_split_info = each_outer_group[3]
        #
        # l_interface_info: [0] All Index, [1] Output, [2] Inputs, [3] Conditions, [4] Options
        #
        l_t3_interface_info.append([each_outer_group[2], str_common_output, l_t3_temp_inputs, l_t3_temp_conditions, "opt_register_transpose", l_each_group_split_info])
        #
        # (Temporary)
        # [0] [1] [2] [3] [4]
        # each_manual_group: Mapping_TB, Mapping_TB_2D, Mapping_Reg, Slices, Split-Info(Repre-size)
        #
        for each_manual_group in info_each_inner_group:
            #
            l_each_group_mapping_tb = each_manual_group[0]
            l_each_group_mapping_2D = each_manual_group[1]
            l_each_group_mapping_reg = each_manual_group[2]
            l_t3_slices_size = each_manual_group[3]
            l_info_split_ext = each_manual_group[4]
            l_each_group_mapping_TB_K = each_manual_group[5]
            l_tensor_contractions = list()
            #
            print ("[Code Generator][Inner-Groups] Picked Tiles: ", l_t3_slices_size)
            #
            if opt_print == 1:
                print ("Target Outer-Group: ", each_outer_group[0])
            #
            # Fusion-Constraint #2:
            #
            promissing_left = 1
            promissing_right = 1
            all_x_axis = l_each_group_mapping_2D[0] + [l_each_group_mapping_reg[0]]
            all_y_axis = l_each_group_mapping_2D[1] + [l_each_group_mapping_reg[1]]
            #
            # X-Axis (Assumption: (Hypothetically) Left Input) including Output[0]
            #
            for each_idx in all_x_axis:
                promissing_left = promissing_left * tc_helper.tc_gen_helper_find(l_t3_slices_size, each_idx)
            #
            # Y-Axis (Assumption: (Hypothetically) Right Input)
            #
            for each_idx in all_y_axis:
                promissing_right = promissing_right * tc_helper.tc_gen_helper_find(l_t3_slices_size, each_idx)
            print ("[Code Generator][Inner-Groups] Supposed Shared Memeory Lenghts: Left >>>", promissing_left, ", Right >>>", promissing_right)
            #
            # Checking if All Tensor Contractions can be Fused or not.
            #
            l_picked_tc = list()
            idx_tensor_contraction = 0
            # Checking if |info_each_inner_group| > |each_outer_group|,
            checking_used = 1
            if len(each_outer_group[1]) == 0:
                checking_used = -1
            #
            for each_tc in each_outer_group[1]:
                #
                #print (">> each_tc: ", each_tc)
                l_input_left = each_tc[5]
                l_input_right = each_tc[7]
                #
                # Fusion-Constraint #1: Two indices for Register Tile should be on two different inputs.
                #
                # LEFT
                idx_check_reg_left = 0
                size_left = 1
                for each_left_idx in l_input_left: #each_tc[7]:
                    #
                    # Fusion-Constraint #1: Two indices for Register Tile should be on two different inputs.
                    #
                    if each_left_idx == l_each_group_mapping_reg[0]:
                        idx_check_reg_left = idx_check_reg_left + 1
                    if each_left_idx == l_each_group_mapping_reg[1]:
                        idx_check_reg_left = idx_check_reg_left + 1
                    #
                    # Fusion-Constraint #2: The Size of Shared Memeory
                    #
                    #print ("[l] each_outer_group[0]: ", each_outer_group[0])
                    if tc_helper.tc_gen_helper_find_1d(each_outer_group[0], each_left_idx) != -1:
                        #print (">> ", each_left_idx)
                        size_left = size_left * tc_helper.tc_gen_helper_find(l_t3_slices_size, each_left_idx)
                # RIGHT
                idx_check_reg_right = 0
                size_right = 1
                for each_right_idx in l_input_right: #each_tc[5]:
                    #
                    # Fusion-Constraint #1: Two indices for Register Tile should be on two different inputs.
                    #
                    if each_right_idx == l_each_group_mapping_reg[0]:
                        idx_check_reg_right = idx_check_reg_right + 1
                    if each_right_idx == l_each_group_mapping_reg[1]:
                        idx_check_reg_right = idx_check_reg_right + 1
                    #
                    # Fusion-Constraint #2: The Size of Shared Memeory
                    #
                    #print ("[r] each_outer_group[0]: ", each_outer_group[0])
                    if tc_helper.tc_gen_helper_find_1d(each_outer_group[0], each_right_idx) != -1:
                        #print (">> ", each_right_idx)
                        size_right = size_right * tc_helper.tc_gen_helper_find(l_t3_slices_size, each_right_idx)
                #
                # [Should be Fixed]
                #
                if idx_check_reg_right != 2 and idx_check_reg_left != 2 and promissing_left == size_left and promissing_right == size_right:
                    l_picked_tc.append(idx_tensor_contraction)
                    l_tensor_contractions.append(each_tc)
                else:
                    # NOTE(review): a failed constraint aborts the whole program here.
                    print ("[DEBUG] promissing_left: ", promissing_left, ", promissing_right: ", promissing_right)
                    print ("[DEBUG] idx_check_reg_right: ", idx_check_reg_right, ", idx_check_reg_left: ", idx_check_reg_left, ", size_left: ", size_left, ", size_right: ", size_right)
                    sys.exit()
                #
                idx_tensor_contraction = idx_tensor_contraction + 1
            #
            # End of For-Statement
            #
            if checking_used == 1:
                #print ("added a tensor contraction to an inner-group")
                #print ("l_info_split_ext: ", l_info_split_ext)
                l_inner_groups.append([l_each_group_mapping_tb, l_each_group_mapping_2D, l_each_group_mapping_reg, l_tensor_contractions, l_t3_slices_size, l_info_split_ext, l_each_group_mapping_TB_K])
                # To-Do: Should be checked in detail
                # Remove the picked contractions in reverse order so the
                # remaining pop() positions stay valid.
                for each_tc in list(reversed(l_picked_tc)):
                    each_outer_group[1].pop(each_tc)
        #
        # For-Outer-Group
        #
        break # To-Do: (Currently) Only 1 Outer-Group.
    #
    #
    #
    # NOTE(review): this compares against -1 while the other prints gate on
    # opt_print == 1 — possibly intentional (suppressed), confirm.
    if opt_print == -1:
        print ("============================================================================")
    #
    #
    #
    print ("===================== Step 2: [Output] Inner-Groups ========================")
    print (" Does not Support Register-Transpose")
    print (" These Tensor Contractions will be fused.")
    print (" # of Inner-Groups: ", len(l_inner_groups))
    # [l_each_group_mapping_tb, l_each_group_mapping_2D, l_each_group_mapping_reg, l_tensor_contractions, l_t3_slices_size, l_info_split_ext, l_each_group_mapping_TB_K]
    for each_inner_group in l_inner_groups:
        print ("Mapping All: ", each_inner_group[0])
        print ("Mapping TB : ", each_inner_group[1])
        print ("Mapping Reg: ", each_inner_group[2])
        print ("Mapping TB_K: ", each_inner_group[6])
        print ("Slices : ", each_inner_group[4])
        print ("Split-Slices : ", each_inner_group[5])
        print ("# of Tensor Contractions: ", len(each_inner_group[3]))
        for each_tc in each_inner_group[3]:
            print ("Each Tensor Contraction: ", each_tc)
    print ("============================================================================")
    #
    # Return Output
    #
    return l_inner_groups, l_t3_interface_info
def tc_gen_code_Kernel_Load_Inputs_Addr_Global_External(
        opt_load_ext_int, opt_inner_load_input_tb_x, opt_inner_load_input_tb_y,
        l_internal_idx, l_input_tensor, l_mapping_tb, l_mapping_reg,
        l_tile_sizes, l_info_matching_indices, opt_matching_index_fully,
        idx_kernel, each_inner_load_inst_x, each_inner_load_inst_y,
        size_tb_x, size_tb_y):
    """Build the global-memory address expression for an input-tensor load.

    Walks the input tensor's indices from the innermost (reversed order),
    folding each one into a Horner-style C expression.  Internal indices
    contribute only a "* size_<idx>" scaling; external indices contribute a
    block-offset term, either register-based ("+ ll") or thread-block-based
    via the matching-index table.

    Returns:
        (address expression, boundary expression "<specific> < rng_<idx>" for
        the last external TB-mapped index visited)

    NOTE(review): if no external TB-mapped index is visited, the second return
    value is built from an unassigned str_input_specific_idx — presumably the
    callers guarantee at least one such index; confirm.
    """
    #
    print("[Code Generator][Load][Input][Addr][Global][External]")
    print(
        "[Code Generator][Load][Input][Addr][Global][External] l_input_tensor: ",
        l_input_tensor)
    print(
        "[Code Generator][Load][Input][Addr][Global][External] l_mapping_tb: ",
        l_mapping_tb)
    print(
        "[Code Generator][Load][Input][Addr][Global][External] l_mapping_reg: ",
        l_mapping_reg)
    str_input_ext_global_addr = ""
    #
    # |TB_X| -(loads)-> K
    # |TB_Y| -(loads)-> E ***
    #
    if opt_load_ext_int == -1:
        l_idx_tb = l_mapping_tb[1]
    else:
        l_idx_tb = l_mapping_tb[0]
    #
    # [Symbols] Block Index,
    #
    rev_l_input_tensor = list(reversed(l_input_tensor))
    idx_count = 0
    idx_base_ext = 0
    str_specific_idx = ""
    for each_idx in rev_l_input_tensor:
        #
        # Internal Index
        #
        if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) != -1:
            #print ("[int.] ", each_idx, ", idx_count: ", idx_count)
            if idx_count != 0:
                # Scale the accumulated expression by this dimension only.
                str_input_ext_global_addr = "(" + str_input_ext_global_addr + ") * size_" + each_idx
        #
        # External Index
        #
        else:
            #
            # Mapped on REG
            #
            if tc_helper.tc_gen_helper_find_1d(l_mapping_reg, each_idx) != -1:
                #print (" >>> [ext.][REG] ", each_idx)
                if idx_base_ext == 0:
                    str_input_ext_global_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(
                        idx_kernel) + "_" + each_idx.capitalize() + " + ll"
                else:
                    str_input_ext_global_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(
                        idx_kernel
                    ) + "_" + each_idx.capitalize(
                    ) + " + ll + (" + str_input_ext_global_addr + ") * size_" + each_idx
            else:
                #print (" >>> opts: tb_x: ", opt_inner_load_input_tb_x, ", tb_y: " , opt_inner_load_input_tb_y)
                #print (" >>> [ext.][TB] ", each_idx, " >>> ", tc_helper.tc_gen_helper_find(l_info_matching_indices, each_idx))
                str_input_specific_idx_multi = ""
                #
                # |TB_X| -(loads)-> K
                # |TB_Y| -(loads)-> E ***
                #
                if opt_load_ext_int == -1:
                    if opt_inner_load_input_tb_y == 2:
                        str_input_specific_idx_multi = " + " + str(
                            int(size_tb_y * each_inner_load_inst_y))
                #
                # |TB_X| -(loads)-> E ***
                # |TB_Y| -(loads)-> K
                #
                else:
                    if opt_inner_load_input_tb_x == 2:
                        str_input_specific_idx_multi = " + " + str(
                            int(size_tb_x * each_inner_load_inst_x))
                #print (">>> str_input_specific_idx_multi: ", str_input_specific_idx_multi)
                if tc_helper.tc_gen_helper_find(l_info_matching_indices, each_idx) == "0":
                    # Tile size 1: the in-tile offset is the literal "0".
                    str_specific_idx = each_idx
                    str_input_specific_idx = tc_helper.tc_gen_helper_find(
                        l_info_matching_indices, each_idx)
                    str_input_specific_idx_multi = ""
                else:
                    str_specific_idx = each_idx
                    str_input_specific_idx = "idx_" + tc_helper.tc_gen_helper_find(
                        l_info_matching_indices, each_idx)
                #
                if idx_base_ext == 0:
                    str_input_ext_global_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(
                        idx_kernel
                    ) + "_" + each_idx.capitalize(
                    ) + " + " + str_input_specific_idx + str_input_specific_idx_multi
                else:
                    str_input_ext_global_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(
                        idx_kernel
                    ) + "_" + each_idx.capitalize(
                    ) + " + " + str_input_specific_idx + str_input_specific_idx_multi + " + (" + str_input_ext_global_addr + ") * size_" + each_idx
            #
            idx_base_ext += 1
        #
        idx_count += 1
    #
    return str_input_ext_global_addr, str_input_specific_idx + str_input_specific_idx_multi + " < rng_" + str_specific_idx
def tc_gen_code_new(tmp_count, str_tmp_count, str_tmp_config, l_inner_groups,
                    l_interface_info, opt_pre_computed, opt_data_type):
    """Drive full CUDA source generation for the given inner-groups.

    Opens "temp__<count>[__<config>].cu", emits includes/defines, then for
    each inner-group: global variables, optional tile-approach pre-computation,
    indirect-array drivers, and four kernel variants (full/partial external x
    full/partial internal), plus "_tex_" variants when there is more than one
    internal index.  Finishes with register-transpose kernels (if possible),
    the kernel caller, and the interface, then closes the file.

    NOTE(review): l_blk_boundary_rng is used below but never defined in this
    function — presumably a module-level global; confirm.
    """
    #
    # FILE: OPEN
    #
    output_name = "temp"
    if str_tmp_config == "-1":
        f = open(output_name + "__" + str_tmp_count + ".cu", "w")
    else:
        f = open(
            output_name + "__" + str_tmp_count + "__" + str_tmp_config + ".cu",
            "w")
    #
    # Includes and Globel Methods
    #
    tc_code_include.tc_code_include(f)
    #tc_code_etc.tc_gen_global_methods(f, len(l_inner_groups))
    #
    # should be changed.
    #
    interface_name = "sd_t_d2_fusion"
    kernel_name = "kernel_"
    #
    l_combined_opt_diffs = list()
    l_combined_opt_gen_fulls = list()
    l_combined_opt_gen_internal = list()
    l_combined_t3_slices_size = list()
    l_combined_mappings = list()
    l_combined_t3_d_decl_var = list()
    l_combined_t2_d_decl_var = list()
    l_combined_v2_d_decl_var = list()
    l_combined_t3_parameters = list()
    l_combined_t2_parameters = list()
    l_combined_v2_parameters = list()
    l_combined_device_dynamic = list()
    l_combined_host_dynamic = list()
    l_combined_cuda_malloc = list()
    #
    # To Support Multiple Inner-Groups
    #
    for each_inner_group in l_inner_groups:
        l_combined_t3_slices_size.append(each_inner_group[8])
        l_combined_mappings.append([each_inner_group[1], each_inner_group[2]])
    #
    # Inputs: T3-Slices, T3-Mappings, External Index, Internal Index
    #
    tc_code_define.tc_gen_definition_new(f, l_combined_t3_slices_size,
                                         l_combined_mappings,
                                         l_inner_groups[0][4],
                                         l_inner_groups[0][5])
    #
    # Each Inner-Group Corresponds to A Kernel (There are Three Types per Kernel)
    # Type #1: External (Full) & Internal (Full)
    # Type #2: External (Full) & Internal (Partial)
    # Type #3: External (Partial) & Internal (Full)
    # Type #4: External (Partial) & Internal (Partial)
    #
    l_var_outputs = list()
    l_var_input_internal = list()
    l_combined_register_mappings = list()
    l_combined_var_input_left = list()
    l_combined_var_input_right = list()
    l_combined_var_outputs_helpers = list()
    l_combined_var_thread_block = list()
    l_combined_t3_d_decl_var = list()
    l_combined_t2_d_decl_var = list()
    l_combined_v2_d_decl_var = list()
    l_combined_t3_parameters = list()
    l_combined_t2_parameters = list()
    l_combined_v2_parameters = list()
    l_combined_inputs_int_strides = list()
    l_cuda_malloc = list()
    l_device_dynamic = list()
    l_host_dynamic = list()
    #
    # To Handle Multiple Tensor Contractions in an Inner-Group
    #
    kernel_number = 1
    for each_inner_group in l_inner_groups:
        #
        '''
        idx_count = 0
        for each_info in each_inner_group:
            print ("> ", idx_count, ": ", each_info)
            idx_count += 1
        '''
        #
        l_var_tensor_block = list()
        l_var_input_left = list()
        l_var_input_right = list()
        l_var_outputs_helpers = list()
        l_t3_d_decl_var = list()
        l_t2_d_decl_var = list()
        l_v2_d_decl_var = list()
        l_t3_parameters = list()
        l_t2_parameters = list()
        l_v2_parameters = list()
        l_input_strides = list()
        # Inputs: kernel_number, each_inner_group[6](l_input_tensors), each_inner_group[4](l_extenral_index), each_inner_group[5](l_internal_index)
        # Outputs: l_t3_d_decl_var, l_t3_parameters, l_t2_d_decl_var, l_t2_parameters, l_v2_d_decl_var, l_v2_parameters,
        # l_cuda_malloc, l_device_dynami,c
        # l_var_tensor_block, l_var_outputs, l_var_outputs_helpers, l_var_input_left, l_var_input_right, l_var_input_internal
        tc_code_globalvar.tc_gen_variables(
            kernel_number, l_interface_info, each_inner_group[6],
            each_inner_group[4], each_inner_group[5], l_t3_d_decl_var,
            l_t3_parameters, l_t2_d_decl_var, l_t2_parameters,
            l_v2_d_decl_var, l_v2_parameters, l_input_strides, l_cuda_malloc,
            l_device_dynamic, l_var_tensor_block, l_var_outputs,
            l_var_outputs_helpers, l_var_input_left, l_var_input_right,
            l_var_input_internal, opt_data_type)
        #
        # Variables are Used in Functions for Pre-Computed Arrays and In-Direction Arrays. (Finally, Kernels)
        # : We need to differentiate them to be used in these functions.
        #
        #
        # Tile-Appoach:
        # Inputs: kernel_number, l_interface_info, each_inner_group[4](l_external_index), each_inner_group[8](l_t3_slices), each_inner_group[5](l_internal_index), each_inner_group[3](l_idx_size)
        # Output: l_host_dynamic
        #
        if opt_pre_computed != -1:
            tc_pre_BasicBlock.tc_gen_code_pre_TileApproach(
                f, kernel_number, l_interface_info, each_inner_group[4],
                each_inner_group[8], each_inner_group[3], each_inner_group[5],
                l_host_dynamic)
        #
        # Pre-Compuated Arrays and In-Direct Arrays
        #
        tc_pre_IndirectArray.tc_gen_code_driver_PreComputedArray(
            f, kernel_number, l_interface_info, l_var_outputs_helpers,
            l_var_input_left, l_var_input_right, l_var_tensor_block,
            each_inner_group[6], l_host_dynamic, each_inner_group[4],
            each_inner_group[4], each_inner_group[2], each_inner_group[0])
        # Related to Kernel(s)
        size_smem_left, size_smem_right, str_left, str_right = tc_interface.tc_interface_SMEM_Size(
            each_inner_group[6], each_inner_group[4], each_inner_group[5],
            each_inner_group[8], each_inner_group[2])
        #
        size_TB_X, size_TB_Y = tc_interface.tc_interface_TB_Size(
            each_inner_group[1][0], each_inner_group[1][1],
            each_inner_group[8])
        size_smem_internal = tc_helper.tc_gen_helper_CheckingIntUnit(
            each_inner_group[4], each_inner_group[8], each_inner_group[5])
        size_REG_X = tc_helper.tc_gen_helper_find(
            each_inner_group[8], each_inner_group[2][0])  # 0 -> REG_X
        size_REG_Y = tc_helper.tc_gen_helper_find(
            each_inner_group[8], each_inner_group[2][1])  # 1 -> REG_Y
        opt_load_t2, opt_load_v2 = tc_helper.tc_gen_helper_CheckingInternalFVI(
            each_inner_group[6], each_inner_group[5])
        #print ("size_smem_internal: ", size_smem_internal)
        #
        # Constratins:""
        #
        tc_gen.tc_gen_Constraints(f, size_TB_X, size_TB_Y, size_smem_left,
                                  size_smem_right, size_smem_internal)
        #
        opt_shared_padding = 0
        #
        # Kernels: Different Types (External, Internal)
        # (1) Full, Full
        # (2) Full, Partial
        # (3) Partial, Full
        # (4) Partial, Partial
        #
        tc_code_kernel.tc_gen_code_Kernel(
            f, kernel_name + "_1_" + str(kernel_number),
            l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var,
            l_input_strides, each_inner_group[7], each_inner_group[1],
            each_inner_group[2], each_inner_group[4], each_inner_group[5],
            each_inner_group[8], size_smem_left, size_smem_right,
            size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y, size_TB_X,
            str_left, str_right, l_blk_boundary_rng, -1, -1, opt_load_t2,
            opt_load_v2, opt_pre_computed, 1, opt_data_type,
            opt_shared_padding, kernel_number)
        tc_code_kernel.tc_gen_code_Kernel(
            f, kernel_name + "_2_" + str(kernel_number),
            l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var,
            l_input_strides, each_inner_group[7], each_inner_group[1],
            each_inner_group[2], each_inner_group[4], each_inner_group[5],
            each_inner_group[8], size_smem_left, size_smem_right,
            size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y, size_TB_X,
            str_left, str_right, l_blk_boundary_rng, 1, -1, opt_load_t2,
            opt_load_v2, opt_pre_computed, 1, opt_data_type,
            opt_shared_padding, kernel_number)
        tc_code_kernel.tc_gen_code_Kernel(
            f, kernel_name + "_3_" + str(kernel_number),
            l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var,
            l_input_strides, each_inner_group[7], each_inner_group[1],
            each_inner_group[2], each_inner_group[4], each_inner_group[5],
            each_inner_group[8], size_smem_left, size_smem_right,
            size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y, size_TB_X,
            str_left, str_right, l_blk_boundary_rng, -1, 1, opt_load_t2,
            opt_load_v2, opt_pre_computed, 1, opt_data_type,
            opt_shared_padding, kernel_number)
        tc_code_kernel.tc_gen_code_Kernel(
            f, kernel_name + "_4_" + str(kernel_number),
            l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var,
            l_input_strides, each_inner_group[7], each_inner_group[1],
            each_inner_group[2], each_inner_group[4], each_inner_group[5],
            each_inner_group[8], size_smem_left, size_smem_right,
            size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y, size_TB_X,
            str_left, str_right, l_blk_boundary_rng, 1, 1, opt_load_t2,
            opt_load_v2, opt_pre_computed, 1, opt_data_type,
            opt_shared_padding, kernel_number)
        #
        # multiple internal indices.
        #
        if len(each_inner_group[5]) > 1:
            tc_code_kernel.tc_gen_code_Kernel(
                f, kernel_name + "_1_tex_" + str(kernel_number),
                l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var,
                l_input_strides, each_inner_group[7], each_inner_group[1],
                each_inner_group[2], each_inner_group[4], each_inner_group[5],
                each_inner_group[8], size_smem_left, size_smem_right,
                size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y,
                size_TB_X, str_left, str_right, l_blk_boundary_rng, -1, -1,
                opt_load_t2, opt_load_v2, opt_pre_computed, 2, opt_data_type,
                opt_shared_padding, kernel_number)
            tc_code_kernel.tc_gen_code_Kernel(
                f, kernel_name + "_2_tex_" + str(kernel_number),
                l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var,
                l_input_strides, each_inner_group[7], each_inner_group[1],
                each_inner_group[2], each_inner_group[4], each_inner_group[5],
                each_inner_group[8], size_smem_left, size_smem_right,
                size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y,
                size_TB_X, str_left, str_right, l_blk_boundary_rng, 1, -1,
                opt_load_t2, opt_load_v2, opt_pre_computed, 2, opt_data_type,
                opt_shared_padding, kernel_number)
            tc_code_kernel.tc_gen_code_Kernel(
                f, kernel_name + "_3_tex_" + str(kernel_number),
                l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var,
                l_input_strides, each_inner_group[7], each_inner_group[1],
                each_inner_group[2], each_inner_group[4], each_inner_group[5],
                each_inner_group[8], size_smem_left, size_smem_right,
                size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y,
                size_TB_X, str_left, str_right, l_blk_boundary_rng, -1, 1,
                opt_load_t2, opt_load_v2, opt_pre_computed, 2, opt_data_type,
                opt_shared_padding, kernel_number)
            tc_code_kernel.tc_gen_code_Kernel(
                f, kernel_name + "_4_tex_" + str(kernel_number),
                l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var,
                l_input_strides, each_inner_group[7], each_inner_group[1],
                each_inner_group[2], each_inner_group[4], each_inner_group[5],
                each_inner_group[8], size_smem_left, size_smem_right,
                size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y,
                size_TB_X, str_left, str_right, l_blk_boundary_rng, 1, 1,
                opt_load_t2, opt_load_v2, opt_pre_computed, 2, opt_data_type,
                opt_shared_padding, kernel_number)
        #
        # For the Interface
        #
        l_combined_var_input_left.append(l_var_input_left)
        l_combined_var_input_right.append(l_var_input_right)
        l_combined_var_outputs_helpers.append(l_var_outputs_helpers)
        l_combined_var_thread_block.append(l_var_tensor_block)
        l_combined_t3_d_decl_var.append(l_t3_d_decl_var)
        l_combined_t2_d_decl_var.append(l_t2_d_decl_var)
        l_combined_v2_d_decl_var.append(l_v2_d_decl_var)
        l_combined_t3_parameters.append(l_t3_parameters)
        l_combined_t2_parameters.append(l_t2_parameters)
        l_combined_v2_parameters.append(l_v2_parameters)
        l_combined_register_mappings.append(each_inner_group[2])
        l_combined_inputs_int_strides.append(l_input_strides)
        #
        kernel_number = kernel_number + 1
    #
    # End of For-Statement: l_innter_groups
    #
    #
    # Parts of Kernels for Register Transpose
    # (Currently) It assumes that all inner groups can be grouped.
    #
    if tc_gen.tc_gen_Check_RegisterTranspose(l_inner_groups) == 1:
        print(
            "[Code Generator][Code] Register Transpose [Possible] according to the given mappings"
        )
        f.write("\n")
        f.write(
            "// This part is for Kernels which support Register Transpose\n")
        #
        # This is also 4-different types.
        #
        tc_code_kernel_fusion.tc_gen_code_Kernel_Register_Transpose(
            f, kernel_name + "_4", l_inner_groups, l_combined_t3_d_decl_var,
            l_combined_t2_d_decl_var, l_combined_v2_d_decl_var,
            l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, 1, 1)
        tc_code_kernel_fusion.tc_gen_code_Kernel_Register_Transpose(
            f, kernel_name + "_3", l_inner_groups, l_combined_t3_d_decl_var,
            l_combined_t2_d_decl_var, l_combined_v2_d_decl_var,
            l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, 1, -1)
        tc_code_kernel_fusion.tc_gen_code_Kernel_Register_Transpose(
            f, kernel_name + "_2", l_inner_groups, l_combined_t3_d_decl_var,
            l_combined_t2_d_decl_var, l_combined_v2_d_decl_var,
            l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, -1, 1)
        tc_code_kernel_fusion.tc_gen_code_Kernel_Register_Transpose(
            f, kernel_name + "_1", l_inner_groups, l_combined_t3_d_decl_var,
            l_combined_t2_d_decl_var, l_combined_v2_d_decl_var,
            l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, -1, -1)
    else:
        print(
            "[Code Generator][Code] Register Transpose [Impossible] according to the given mappings"
        )
    #
    # Temp...... For |K| > 1,
    #
    l_internal_addrs = list()
    for each_inner_group in l_inner_groups:
        #
        # Each Inner-Group can have multiple Tensor Contractions which will be Fused.
        # Among Inner-Groups, there might be possible to group them by using Register-Transposition.
        #
        print("each_inner_group: ", each_inner_group)
        l_tensor_contraction_in_inner_group = each_inner_group[6]
        # To-Do: Need To Suport Multiple-Tensor Contractions
        for each_tensor_contraction in l_tensor_contraction_in_inner_group:
            info_input_left = each_tensor_contraction[0]
            info_input_right = each_tensor_contraction[1]
            # LEFT
            l_idx_info_left = list()
            for idx_tensor in info_input_left[1]:
                # Tag each index: 1 = internal (appears in l_inner_groups[0][5]), 0 = external.
                if tc_helper.tc_gen_helper_find_1d(l_inner_groups[0][5], idx_tensor) != -1:
                    l_idx_info_left.append([1, idx_tensor])
                else:
                    l_idx_info_left.append([0, idx_tensor])
            #
            l_rev_idx_info_left = list(reversed(l_idx_info_left))
            str_addr_left = ""
            idx_count = 0
            for each_idx_info in l_rev_idx_info_left:
                #
                if each_idx_info[0] == 1:  # internal index
                    if idx_count == 0:
                        str_addr_left = "idx_" + each_idx_info[1]
                    else:
                        str_addr_left = "idx_" + each_idx_info[
                            1] + " + (" + str_addr_left + ") * size_" + each_idx_info[
                                1]
                    #
                    idx_count = idx_count + 1
                else:  # external index
                    if idx_count != 0:
                        str_addr_left = "(" + str_addr_left + ") * size_" + each_idx_info[
                            1]
            # RIGHT
            str_addr_right = ""
            l_idx_info_right = list()
            for idx_tensor in info_input_right[1]:
                #
                if tc_helper.tc_gen_helper_find_1d(l_inner_groups[0][5], idx_tensor) != -1:
                    l_idx_info_right.append([1, idx_tensor])
                else:
                    l_idx_info_right.append([0, idx_tensor])
            #
            l_rev_idx_info_right = list(reversed(l_idx_info_right))
            str_addr_right = ""
            idx_count = 0
            for each_idx_info in l_rev_idx_info_right:
                #
                if each_idx_info[0] == 1:  # internal index
                    if idx_count == 0:
                        str_addr_right = "idx_" + each_idx_info[1]
                    else:
                        str_addr_right = "idx_" + each_idx_info[
                            1] + " + (" + str_addr_right + ") * size_" + each_idx_info[
                                1]
                    #
                    idx_count = idx_count + 1
                else:  # external index
                    if idx_count != 0:
                        str_addr_right = "(" + str_addr_right + ") * size_" + each_idx_info[
                            1]
            #
            l_internal_addrs.append([str_addr_left, str_addr_right])
    #
    #
    # Drivers
    #
    tc_interface.tc_gen_code_kernel_caller(
        f, interface_name, kernel_name, l_interface_info,
        l_inner_groups[0][4], l_inner_groups[0][5], l_inner_groups[0][10],
        l_var_tensor_block, l_var_outputs, l_var_outputs_helpers,
        l_var_input_left, l_var_input_right, l_var_input_internal,
        l_combined_var_input_left, l_combined_var_input_right,
        l_combined_var_outputs_helpers, l_combined_var_thread_block,
        l_combined_register_mappings,
        l_internal_addrs,  ## ADDEDEDEDE
        l_combined_inputs_int_strides,  ##
        l_cuda_malloc, l_device_dynamic, l_host_dynamic, l_t3_parameters,
        l_t2_parameters, l_v2_parameters, l_combined_t3_parameters,
        l_combined_t2_parameters, l_combined_v2_parameters, opt_pre_computed,
        opt_data_type)
    #
    '''
    idx_inner_count = 0
    for each_inner in l_inner_groups:
        idx_count = 0
        for each_info in each_inner:
            print ("[", idx_inner_count, "][", idx_count, "] each_info: ", each_info)
            idx_count = idx_count + 1
        idx_inner_count = idx_inner_count + 1
    '''
    #
    l_tile_sizes = l_inner_groups[0][8]
    l_split_representative_problem_size = l_inner_groups[0][9]
    #l_split_representative_problem_size = l_inner_groups[0][9]
    #print ("l_split_representative_problem_size: ", l_split_representative_problem_size)
    #print ("l_tile_sizes: ", l_tile_sizes)
    #
    # "Interface"
    #
    tc_interface.tc_gen_code_interface(f, interface_name, l_interface_info,
                                       l_tile_sizes,
                                       l_split_representative_problem_size,
                                       opt_data_type)
    #
    # FILE: CLOSE
    #
    f.close()
def tc_gen_code_Kernel_Load_Inputs_Addr_Global_External_Matching_Index(
        opt_load_ext_int, opt_inner_load_input_tb_x, opt_inner_load_input_tb_y,
        l_input_tensor, l_mapping_tb, l_mapping_reg, l_internal_idx,
        l_tile_sizes, num_inner_inst_tb_x, num_inner_inst_tb_y, size_tb_x,
        size_tb_y):
    """Pair external input-tensor indices with thread-block indices of matching extent.

    Scans ``l_input_tensor`` for external indices (not in ``l_internal_idx``
    and not mapped on a register tile via ``l_mapping_reg``) and tries to
    match each one against the indices mapped on one thread-block axis, so
    the generated global-load address can reuse the thread-block index.

    Parameters (as used here; several are accepted but unused in this body):
        opt_load_ext_int: -1 selects ``l_mapping_tb[1]`` (the y-axis index
            list) as the matching pool, anything else selects
            ``l_mapping_tb[0]`` (the x-axis list).
        l_input_tensor: index names of the input tensor being loaded.
        l_mapping_tb: 2-element list of index-name lists, one per TB axis.
        l_mapping_reg: index names mapped on register tiles.
        l_internal_idx: names of the internal (contracted) indices.
        l_tile_sizes: [name, tile_size] pairs queried via tc_helper.
        opt_inner_load_input_tb_x, opt_inner_load_input_tb_y,
        num_inner_inst_tb_x, num_inner_inst_tb_y, size_tb_x, size_tb_y:
            not referenced in this function body.

    Returns:
        (l_info_matching_indices, opt_matching_index_fully) where the first
        element is a list of [tensor_index, matched_tb_index_or_"0"] pairs
        and the second is 1 when exactly one non-trivial index was matched
        on each side, -1 otherwise (the unhandled "miserable" case).
    """
    #
    # [Option]
    #
    l_info_matching_indices = []
    l_info_pruned_indices = []      # external tensor indices with tile size > 1
    l_info_idx_tb = []              # all TB-axis indices with their tile sizes (debug only)
    l_info_pruned_indices_tb = []   # TB-axis indices with tile size > 1
    opt_matching_index_fully = -1
    #
    # |TB_X| -(loads)-> K
    # |TB_Y| -(loads)-> E ***
    #
    # Choose which thread-block axis supplies candidate indices.
    if opt_load_ext_int == -1:
        l_idx_tb = l_mapping_tb[1]
    else:
        l_idx_tb = l_mapping_tb[0]
    #
    # [Option] Indices mapped on TB except for them having tile-size == 1.
    #
    for each_idx in l_idx_tb:
        if tc_helper.tc_gen_helper_find(l_tile_sizes, each_idx) != 1:
            l_info_pruned_indices_tb.append([
                each_idx,
                tc_helper.tc_gen_helper_find(l_tile_sizes, each_idx)
            ])
    #
    for each_idx in l_idx_tb:
        l_info_idx_tb.append(
            [each_idx, tc_helper.tc_gen_helper_find(l_tile_sizes, each_idx)])
    #
    print(
        "[Code Generator][Load][Input][Addr][Global][External][Matching-Index] l_info_idx_tb: ",
        l_info_idx_tb)
    #
    # [Option] Check if indices mapped on TB_X | TB_Y can be used for the external indices mapped on
    #
    for each_idx in l_input_tensor:
        if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1:
            if tc_helper.tc_gen_helper_find_1d(l_mapping_reg, each_idx) == -1:
                print(
                    "[Code Generator][Load][Input][Addr][Global][External][Matching-Index] ext. mapped on TB: ",
                    each_idx, ", size: ",
                    tc_helper.tc_gen_helper_find(l_tile_sizes, each_idx))
                #
                # To Prune Indices with Tile-Size = 1.
                # A size-1 index contributes nothing to the address, so it
                # is "matched" with the literal offset "0".
                #
                if tc_helper.tc_gen_helper_find(l_tile_sizes, each_idx) == 1:
                    l_info_matching_indices.append([each_idx, "0"])
                else:
                    l_info_pruned_indices.append([
                        each_idx,
                        tc_helper.tc_gen_helper_find(l_tile_sizes, each_idx)
                    ])
    #
    # Target Indices should be handled.
    #
    print("[DEBUG] l_info_pruned_indices: ", l_info_pruned_indices)
    print("[DEBUG] l_info_pruned_indices_tb: ", l_info_pruned_indices_tb)
    #
    print(
        "[Code Generator][Load][Input][Addr][Global][External][Matching-Index] # of Pruned Indices mapped on TB: ",
        len(l_info_pruned_indices_tb))
    print(
        "[Code Generator][Load][Input][Addr][Global][External][Matching-Index] # of Pruned Indices: ",
        len(l_info_pruned_indices))
    # Only the 1-vs-1 case is supported: pair the single non-trivial tensor
    # index with the single non-trivial TB index.
    if len(l_info_pruned_indices_tb) == len(l_info_pruned_indices) == 1:
        #print ("len(l_info_pruned_indices_tb) == len(l_info_pruned_indices) == 1")
        len_pruned_indices_tb = 1
        len_pruned_indice = 1
        # NOTE(review): the two products below are computed but never used
        # in this branch — they appear to be leftovers from debugging.
        for each_idx_info in l_info_pruned_indices_tb:
            len_pruned_indices_tb *= tc_helper.tc_gen_helper_find(
                l_tile_sizes, each_idx_info[0])
        #
        for each_idx_info in l_info_pruned_indices:
            len_pruned_indice *= tc_helper.tc_gen_helper_find(
                l_tile_sizes, each_idx_info[0])
        #
        l_info_matching_indices.append(
            [l_info_pruned_indices[0][0], l_info_pruned_indices_tb[0][0]])
        opt_matching_index_fully = 1
    else:
        #
        # Miserable Case: any other cardinality is not auto-matched; the
        # caller sees opt_matching_index_fully == -1.
        #
        print("Miserable Case: Should Handle Manually!")
        len_pruned_indices_tb = 1
        len_pruned_indice = 1
        #
        for each_idx_info in l_info_pruned_indices_tb:
            len_pruned_indices_tb *= tc_helper.tc_gen_helper_find(
                l_tile_sizes, each_idx_info[0])
        #
        for each_idx_info in l_info_pruned_indices:
            len_pruned_indice *= tc_helper.tc_gen_helper_find(
                l_tile_sizes, each_idx_info[0])
        print("len_pruned_indices_tb: ", len_pruned_indices_tb)
        print("len_pruned_indice: ", len_pruned_indice)
    #
    return l_info_matching_indices, opt_matching_index_fully
def tc_gen_perms_exclusive_REG_Y(list_sizes_REG, list_sizes_TB, list_given_output_tensor, list_given_input_tensor_left, list_given_input_tensor_right, list_internal_indices, list_representative_problem_size, list_TB_K, list_TB_X, list_REG_X, list_inherited_Tile_Sizes, list_CLASS_configuration, opt_print):
    """Enumerate REG_Y register-tile mappings over the RIGHT input tensor.

    For every candidate register-tile size in ``list_sizes_REG`` and every
    starting position in ``list_given_input_tensor_right``, greedily absorbs
    external indices (skipping internal indices and the output tensor's FVI)
    until the accumulated volume reaches the target |REG_Y|.  Each complete
    mapping is forwarded to ``tc_gen_perms_exclusive_TB_X`` together with a
    deep-copied, extended tile-size list.

    Fix over previous revision: the debug output labeled the RIGHT input
    tensor as "LEFT"; the labels now say RIGHT.

    Returns:
        None (results are accumulated via tc_gen_perms_exclusive_TB_X /
        list_CLASS_configuration).
    """
    #
    # Count external vs. internal indices of the RIGHT input tensor.
    #
    num_ext_idx = 0
    num_int_idx = 0
    for each_right_idx in list_given_input_tensor_right:
        if tc_helper.tc_gen_helper_find_1d(list_internal_indices, each_right_idx) == -1:
            num_ext_idx += 1
        else:
            num_int_idx += 1
    #
    len_tensor_right = len(list_given_input_tensor_right)
    if opt_print == 1:
        print ("========================================== [Enumerations-REG_Y] ===================================================")
        print ("========================================== [Exclusive] ===================================================")
        print ("Tensor (RIGHT): ", list_given_input_tensor_right)
        print ("len(RIGHT): ", len_tensor_right, ", # of External Indices: ", num_ext_idx, ", # of Internal Indices: ", num_int_idx)
        print ("list_representative_problem_size: ", list_representative_problem_size)
        print ("Given Tile-Sizes: ", list_inherited_Tile_Sizes)
        print ("Given list_REG_X: ", list_REG_X)
    #
    # For Each Tile-Size for REG_Y
    #
    for size_REG_Y in list_sizes_REG:
        if opt_print == 1:
            print ("|REG_Y| = ", size_REG_Y)
        #
        # Try every starting index position in the right tensor.
        #
        for start_index in range(0, len_tensor_right):
            #
            REG_Y_Vol = 1          # volume accumulated so far (|REG_Y'|)
            REG_Y_Vol_Prev = 1     # volume before the current index was added
            list_REG_Y = []        # indices chosen for the register tile
            # inherited tile sizes must not be shared across candidates
            duplicated_Tile_Sizes = copy.deepcopy(list_inherited_Tile_Sizes)
            done_mapping_REG_Y = -1  # -1: not done, 1: mapping complete
            #
            for target_index in range(start_index, len_tensor_right):
                str_start_index = list_given_input_tensor_right[target_index]
                #
                # 1. Internal indices never belong to REG_Y.
                #
                if tc_helper.tc_gen_helper_find_1d(list_internal_indices, str_start_index) != -1:
                    continue
                #
                # 2. The FVI in the Output Tensor is excluded as well.
                #
                if str_start_index == list_given_output_tensor[0]:
                    continue
                #
                # |REG_Y'|
                #
                REG_Y_Vol *= tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)
                #
                # |REG_Y'| >= |REG_Y|
                #
                if REG_Y_Vol >= size_REG_Y:
                    #
                    # |REG_Y'| > |REG_Y|: this index overshoots, so it must
                    # be split between the register tile and the block.
                    #
                    if REG_Y_Vol > size_REG_Y:
                        if done_mapping_REG_Y == -1:
                            blocking_tile_size = size_REG_Y / REG_Y_Vol_Prev
                            list_REG_Y.append(str_start_index)
                            duplicated_Tile_Sizes.append([str_start_index, int(blocking_tile_size)])
                            done_mapping_REG_Y = 1
                        else:
                            duplicated_Tile_Sizes.append([str_start_index, 1])  # ?
                    #
                    # |REG_Y'| = |REG_Y|: exact fit, take the full extent.
                    #
                    else:
                        if done_mapping_REG_Y == -1:
                            list_REG_Y.append(str_start_index)
                            duplicated_Tile_Sizes.append([str_start_index, tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)])
                            done_mapping_REG_Y = 1
                        else:
                            duplicated_Tile_Sizes.append([str_start_index, 1])
                    #
                    # NOTE(review): this break exits as soon as the target
                    # volume is reached, which makes the done_mapping == 1
                    # sub-branches above unreachable — confirm intent.
                    #
                    break
                #
                # |REG_Y'| < |REG_Y|: absorb this index and keep going.
                #
                else:
                    list_REG_Y.append(str_start_index)
                    duplicated_Tile_Sizes.append([str_start_index, tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)])
                #
                REG_Y_Vol_Prev *= tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)
            #
            # Only forward complete mappings to the TB_X enumeration stage.
            #
            if done_mapping_REG_Y == 1:
                tc_gen_perms_exclusive_TB_X(list_sizes_TB, list_given_output_tensor, list_given_input_tensor_left, list_given_input_tensor_right, list_internal_indices, list_representative_problem_size, list_TB_K, list_TB_X, list_REG_X, list_REG_Y, duplicated_Tile_Sizes, list_CLASS_configuration, opt_print)
def tc_gen_models_TBs(each_configuration, idx_count, opt_print=0):
    """Choose the split of the representative problem size that minimizes #TBs.

    For each entry in ``each_configuration.list_splits`` (a base index split
    into two sub-indices), enumerates every divisor-based partition of the
    base index's representative size, computes the resulting thread-block
    count for each combination, and commits the combination with the fewest
    thread blocks to ``each_configuration`` (via
    ``add_split_representative_problem_size`` and ``num_TBs``).

    Assumes at most two splits; with no splits the unsplit representative
    problem size is used directly.

    Returns:
        [chosen_problem_size_list, num_TBs] for the selected combination.
    """
    #
    # NOTE(review): the debug header below says "[Model][GMEM Load Inputs]"
    # but this is the TB model — looks copy-pasted from tc_gen_models_GMEM.
    #
    if opt_print == 1:
        print ("===[", idx_count, "]====================================== [Model][GMEM Load Inputs] =========================================")
        print (" tile-sizes: ", each_configuration.list_tile_sizes)
        print (" repr-sizes: ", each_configuration.list_representative_problem_size)
        print (" split-info: ", each_configuration.list_splits)
    #
    list_possible_comb_splits = []
    #
    # For every split, enumerate all divisor pairs of the base index's size:
    # [base, size] -> [first, size/d] x [second, d] for every divisor d.
    #
    for each_split in each_configuration.list_splits:
        #
        idx_base = each_split[0]
        idx_first = each_split[1]
        idx_second = each_split[2]
        idx_base_repre_size = tc_helper.tc_gen_helper_find(each_configuration.list_representative_problem_size, idx_base)
        list_possible_cases = []
        #
        if opt_print == 2:
            print (idx_base, " --> both ", idx_first, " &", idx_second)
        #
        for denominiator in range(1, idx_base_repre_size + 1):
            if idx_base_repre_size % denominiator == 0:
                #
                list_possible_cases.append([[idx_base, idx_base_repre_size], [idx_first, int(idx_base_repre_size / denominiator)], [idx_second, denominiator]])
        #
        list_possible_comb_splits.append(list_possible_cases)
    #
    if opt_print == 2:
        for each_split in list_possible_comb_splits:
            print ("len(each_split): ", len(each_split))
            for each_comb in each_split:
                print (" >> ", each_comb)
    #
    # External Indices: related to # of TBs, and Full-Tiles for External Indices
    # (opt_full_ext / opt_full_int are initialized but not used below.)
    #
    opt_full_ext = True
    opt_full_int = True
    list_possible_representative_problem_sizes = []
    #
    # [Assumption] len(list_possible_comb_splits) == 1 or 2.
    #
    if len(list_possible_comb_splits) == 1:
        # One split: each candidate contributes its two sub-indices, plus
        # every other external index of C at its unsplit size.
        for each_comb in list_possible_comb_splits[0]:
            tmp_list = []
            tmp_num_TBs = 1
            #
            tmp_list.append(each_comb[1])
            tmp_num_TBs *= math.ceil(each_comb[1][1] / helper_base.helper_base_find_list_2D(each_configuration.list_tile_sizes, each_comb[1][0]))
            tmp_list.append(each_comb[2])
            tmp_num_TBs *= math.ceil(each_comb[2][1] / helper_base.helper_base_find_list_2D(each_configuration.list_tile_sizes, each_comb[2][0]))
            #
            for each_ext_idx in each_configuration.list_tensor_C:
                if helper_base.helper_base_find_list_2D(each_comb, each_ext_idx) == -1:
                    tmp_list.append([each_ext_idx, helper_base.helper_base_find_list_2D(each_configuration.list_tile_sizes, each_ext_idx)])
                    tmp_num_TBs *= math.ceil(helper_base.helper_base_find_list_2D(each_configuration.list_representative_problem_size, each_ext_idx) / helper_base.helper_base_find_list_2D(each_configuration.list_tile_sizes, each_ext_idx))
            list_possible_representative_problem_sizes.append([tmp_list, tmp_num_TBs])
        #
        # Pick the candidate with the smallest thread-block count.
        tmp_min_num_TBs = 10000000000000000
        min_idx = 0
        idx_count = 0
        for each_comb in list_possible_representative_problem_sizes:
            if tmp_min_num_TBs > each_comb[1]:
                tmp_min_num_TBs = each_comb[1]
                min_idx = idx_count
            #
            idx_count += 1
        #
        each_configuration.add_split_representative_problem_size(list_possible_representative_problem_sizes[min_idx][0])
        each_configuration.num_TBs = list_possible_representative_problem_sizes[min_idx][1]
        return list_possible_representative_problem_sizes[min_idx]
    #
    elif len(list_possible_comb_splits) == 2:
        # Two splits: cross-product of both split enumerations.
        for each_comb_out in list_possible_comb_splits[0]:
            for each_comb_in in list_possible_comb_splits[1]:
                tmp_list = []
                tmp_num_TBs = 1
                #
                tmp_list.append(each_comb_out[1])
                tmp_num_TBs *= math.ceil(each_comb_out[1][1] / helper_base.helper_base_find_list_2D(each_configuration.list_tile_sizes, each_comb_out[1][0]))
                tmp_list.append(each_comb_out[2])
                tmp_num_TBs *= math.ceil(each_comb_out[2][1] / helper_base.helper_base_find_list_2D(each_configuration.list_tile_sizes, each_comb_out[2][0]))
                #
                tmp_list.append(each_comb_in[1])
                tmp_num_TBs *= math.ceil(each_comb_in[1][1] / helper_base.helper_base_find_list_2D(each_configuration.list_tile_sizes, each_comb_in[1][0]))
                tmp_list.append(each_comb_in[2])
                tmp_num_TBs *= math.ceil(each_comb_in[2][1] / helper_base.helper_base_find_list_2D(each_configuration.list_tile_sizes, each_comb_in[2][0]))
                #
                for each_ext_idx in each_configuration.list_tensor_C:
                    if helper_base.helper_base_find_list_2D(each_comb_out, each_ext_idx) == -1 and helper_base.helper_base_find_list_2D(each_comb_in, each_ext_idx) == -1:
                        tmp_list.append([each_ext_idx, helper_base.helper_base_find_list_2D(each_configuration.list_tile_sizes, each_ext_idx)])
                        tmp_num_TBs *= math.ceil(helper_base.helper_base_find_list_2D(each_configuration.list_representative_problem_size, each_ext_idx) / helper_base.helper_base_find_list_2D(each_configuration.list_tile_sizes, each_ext_idx))
                #
                list_possible_representative_problem_sizes.append([tmp_list, tmp_num_TBs])
        #
        # Pick the candidate with the smallest thread-block count.
        tmp_min_num_TBs = 1000000000000000000
        min_idx = 0
        idx_count = 0
        for each_comb in list_possible_representative_problem_sizes:
            if tmp_min_num_TBs > each_comb[1]:
                tmp_min_num_TBs = each_comb[1]
                min_idx = idx_count
            #
            idx_count += 1
        #print ("[2] min. comb: ", list_possible_representative_problem_sizes[min_idx])
        each_configuration.add_split_representative_problem_size(list_possible_representative_problem_sizes[min_idx][0])
        each_configuration.num_TBs = list_possible_representative_problem_sizes[min_idx][1]
        return list_possible_representative_problem_sizes[min_idx]
    #
    else:
        # No splits: use the representative problem size as-is.
        list_possible_representative_problem_sizes = each_configuration.list_representative_problem_size
        tmp_num_TBs = 1
        for each_ext_idx in each_configuration.list_tensor_C:
            tmp_num_TBs *= math.ceil(helper_base.helper_base_find_list_2D(each_configuration.list_representative_problem_size, each_ext_idx) / helper_base.helper_base_find_list_2D(each_configuration.list_tile_sizes, each_ext_idx))
        #print ("[3] min. comb: ", tmp_num_TBs)
        each_configuration.add_split_representative_problem_size(each_configuration.list_representative_problem_size)
        each_configuration.num_TBs = tmp_num_TBs
        return [each_configuration.list_representative_problem_size, tmp_num_TBs]
def tc_gen_models_GMEM(each_configuration, list_comb, idx_count, opt_print=0):
    """Estimate the global-memory traffic cost of one kernel configuration.

    Models the number of DRAM transactions needed to load input tensors A
    and B and to store (and re-load) output tensor C for the tile/mapping
    choices recorded in ``each_configuration``, then writes
    ``cost_load_input``, ``cost_store_output``, ``cost_load_output``,
    ``cost_total`` and ``steps_main_loops`` back onto the configuration.

    Args:
        each_configuration: configuration object carrying index lists
            (list_tensor_A/B/C, list_TB_K/X/Y, list_REG_X/Y), tile sizes and
            TB dimensions (size_TB_X/size_TB_Y).
        list_comb: [problem_size_list, num_TBs] as returned by
            tc_gen_models_TBs; only list_comb[1] (the TB count) is used.
        idx_count: configuration index, used only in debug output.
        opt_print: 1 enables verbose tracing.

    Returns:
        1 (costs are reported via side effects on each_configuration).
    """
    if opt_print == 1:
        print ("===[", idx_count, "]====================================== [Model][GMEM Load Inputs] =========================================")
        print (" mappings: TB_X <- ", each_configuration.list_TB_X, ", TB_Y <- ", each_configuration.list_TB_Y)
        print (" : TB_K <- ", each_configuration.list_TB_K)
        print (" : REG_X <- ", each_configuration.list_REG_X, ", REG_Y <-", each_configuration.list_REG_Y)
        print (" list_comb: ", list_comb)
        print (" tile-sizes: ", each_configuration.list_tile_sizes)
    #
    # 128-byte transaction / 8-byte double. NOTE(review): unused below.
    numElements_Double = int(128 / 8)
    #print ("numElements_Double: ", numElements_Double)
    #
    # For Internal Indicies: tile volume (T_K) and full volume (N_K).
    #
    size_TB_K = 1
    size_N_K = 1
    for each_int_idx in each_configuration.list_TB_K:
        size_TB_K *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_int_idx)
        size_N_K *= tc_helper.tc_gen_helper_find(each_configuration.list_representative_problem_size, each_int_idx)
    #
    # # of "main" loop (calculated by N_K / T_K)
    #
    steps_main_loops = size_N_K / size_TB_K
    #
    # Check Types of Input such as [E_K, ...] or [E_A, ...]:
    # is the fastest-varying index (FVI) of each input external or internal?
    #
    opt_load_A_ext = -1  # -1: FVI = internal
    opt_load_B_ext = -1  # 1: FVI = external
    if tc_helper.tc_gen_helper_find_1d(each_configuration.list_tensor_B, each_configuration.list_tensor_A[0]) == -1:
        opt_load_A_ext = 1
    if tc_helper.tc_gen_helper_find_1d(each_configuration.list_tensor_A, each_configuration.list_tensor_B[0]) == -1:
        opt_load_B_ext = 1
    #
    # Initial Values
    #
    size_continuous_elements_A = 1
    size_continuous_elements_B = 1
    size_continuous_elements_C = 1
    #
    # Based on the Representative Problem Size
    # (size_tiles_* are initialized but not used below.)
    #
    size_tiles_A = 1
    size_tiles_B = 1
    size_tiles_C = 1
    #
    if opt_print == 1:
        print ("-1: FVI = internal, 1: FVI = external")
        print ("opt_load_A_ext: ", opt_load_A_ext, ", opt_load_B_ext: ", opt_load_B_ext)
    #
    # Input: A (Continuous)
    # Count how many leading (fastest-varying) elements of A are contiguous:
    # multiply tile sizes from the FVI onward until a partial tile
    # (tile != representative size) or a run-breaking index is hit.
    #
    is_continuous = 1
    for each_idx in each_configuration.list_tensor_A:
        #
        # Starts from External Index
        #
        if opt_load_A_ext == 1:
            #
            # Internal index ends the external run.
            #
            if tc_helper.tc_gen_helper_find_1d(each_configuration.list_tensor_B, each_idx) != -1:
                break
            #
            # External
            #
            else:
                # Need to Check if This Index is Continuous Or NOT.
                if is_continuous == 1:
                    size_continuous_elements_A *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
                    # A partial tile breaks contiguity for later indices.
                    if tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx) != tc_helper.tc_gen_helper_find(each_configuration.list_representative_problem_size, each_idx):
                        is_continuous = -1
                else:
                    break
        #
        # Starts from Internal Index
        #
        else:
            #
            # External index ends the internal run.
            #
            if tc_helper.tc_gen_helper_find_1d(each_configuration.list_tensor_B, each_idx) == -1:
                break
            #
            # Internal
            #
            else:
                if is_continuous == 1:
                    size_continuous_elements_A *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
                    if tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx) != tc_helper.tc_gen_helper_find(each_configuration.list_representative_problem_size, each_idx):
                        is_continuous = -1
                else:
                    break
    #
    #print ("[A] is_continuous: ", is_continuous, ", size_continuous_elements_A: ", size_continuous_elements_A)
    #
    # Input: A (TB and REG)
    # Partition A's tile volume into external-on-TB, external-on-REG and
    # internal-on-TB factors.
    #
    size_A_E_TB = 1
    size_A_K_TB = 1
    size_A_E_REG = 1
    for each_idx in each_configuration.list_tensor_A:
        # External Index
        if tc_helper.tc_gen_helper_find_1d(each_configuration.list_tensor_B, each_idx) == -1:
            # TB
            if tc_helper.tc_gen_helper_find_1d(each_configuration.list_REG_X, each_idx) == -1 and tc_helper.tc_gen_helper_find_1d(each_configuration.list_REG_Y, each_idx) == -1:
                size_A_E_TB *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
            # REG
            else:
                size_A_E_REG *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
        else:
            size_A_K_TB *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
    #
    if opt_print == 1:
        print ("|SMEM_A| = ", size_A_E_TB * size_A_E_REG * size_A_K_TB, ", (", size_A_E_REG, " * ", size_A_E_TB, " * ", size_A_K_TB, ")")
        print ("|TB_X| = ", each_configuration.size_TB_X, ", |TB_Y| = ", each_configuration.size_TB_Y)
    #
    # TB_X -> External Index && TB_Y -> Internal Index
    # (how many inner load iterations each TB axis needs)
    #
    if opt_load_A_ext == 1:
        times_inner_TB_X = max(size_A_E_TB / each_configuration.size_TB_X, 1.0)
        times_inner_TB_Y = max(size_A_K_TB / each_configuration.size_TB_Y, 1.0)
    #
    # TB_X -> Internal Index && TB_Y -> External Index
    #
    else:
        times_inner_TB_X = max(size_A_K_TB / each_configuration.size_TB_X, 1.0)
        times_inner_TB_Y = max(size_A_E_TB / each_configuration.size_TB_Y, 1.0)
    #
    if opt_print == 1:
        print ("A: :times_inner_TB_X: ", times_inner_TB_X, ", times_inner_TB_Y: ", times_inner_TB_Y)
    #
    # Based on a row along TB_X, can the number of continuous elements be loaded concurrently?
    #
    if opt_load_A_ext == -1:  # TB_X -> K
        size_TB_X = min(size_A_K_TB, each_configuration.size_TB_X)
        size_TB_Y = min(size_A_E_TB, each_configuration.size_TB_Y)
    else:  # TB_X -> E
        size_TB_X = min(size_A_E_TB, each_configuration.size_TB_X)
        size_TB_Y = min(size_A_K_TB, each_configuration.size_TB_Y)
    #
    # Transactions per TB_X row, scaled up by rows, inner iterations,
    # register-tile replication and main-loop steps.
    #
    estimated_DRAM_transaction_per_TB_X = size_TB_X / min(size_continuous_elements_A, size_TB_X)
    estimated_DRAM_transaction_per_TB = estimated_DRAM_transaction_per_TB_X * size_TB_Y
    estimated_DRAM_transaction_per_TB_inner_loops = estimated_DRAM_transaction_per_TB * times_inner_TB_X * times_inner_TB_Y
    estimated_DRAM_transaction_per_TB_inner_loops_reg = estimated_DRAM_transaction_per_TB_inner_loops * size_A_E_REG
    estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K = estimated_DRAM_transaction_per_TB_inner_loops_reg * steps_main_loops
    if opt_print == 1:
        print ("size_TB_X: ", size_TB_X, ", size_TB_Y: ", size_TB_Y, ", size_TB_K: ", size_TB_K)
        print ("estimated_DRAM_transactions_per_TB_X (should be fixed): ", estimated_DRAM_transaction_per_TB_X)
        print ("estimated_DRAM_transactions_per_TB: ", estimated_DRAM_transaction_per_TB)
        print ("estimated_DRAM_transaction_per_TB_inner_loops: ", estimated_DRAM_transaction_per_TB_inner_loops)
        print ("estimated_DRAM_transaction_per_TB_inner_loops_reg: ", estimated_DRAM_transaction_per_TB_inner_loops_reg)
        print ("estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K: ", estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K)
    #
    # To Calculate The Cost of Loading Input Tensor per a Thread Block
    #
    cost_TB_load_A = estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K
    #
    # Input: B (same contiguity analysis as A, with roles of A/B swapped)
    #
    is_continuous = 1
    for each_idx in each_configuration.list_tensor_B:
        if opt_load_B_ext == 1:
            #
            # Internal
            #
            if tc_helper.tc_gen_helper_find_1d(each_configuration.list_tensor_A, each_idx) != -1:
                break
            #
            # External
            #
            else:
                if is_continuous == 1:
                    size_continuous_elements_B *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
                    if tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx) != tc_helper.tc_gen_helper_find(each_configuration.list_representative_problem_size, each_idx):
                        is_continuous = -1
                else:
                    break
        else:
            #
            # External
            #
            if tc_helper.tc_gen_helper_find_1d(each_configuration.list_tensor_A, each_idx) == -1:
                break
            #
            # Internal
            #
            else:
                if is_continuous == 1:
                    size_continuous_elements_B *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
                    if tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx) != tc_helper.tc_gen_helper_find(each_configuration.list_representative_problem_size, each_idx):
                        is_continuous = -1
                else:
                    break
    #
    if opt_print == 1:
        print ("size_continuous_elements_B: ", size_continuous_elements_B)
    #
    # Input: B (TB and REG)
    #
    size_B_E_TB = 1
    size_B_K_TB = 1
    size_B_E_REG = 1
    for each_idx in each_configuration.list_tensor_B:
        #
        if tc_helper.tc_gen_helper_find_1d(each_configuration.list_tensor_A, each_idx) == -1:
            # TB
            if tc_helper.tc_gen_helper_find_1d(each_configuration.list_REG_X, each_idx) == -1 and tc_helper.tc_gen_helper_find_1d(each_configuration.list_REG_Y, each_idx) == -1:
                size_B_E_TB *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
            # REG
            else:
                size_B_E_REG *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
        else:
            size_B_K_TB *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
    #
    # TB_X -> External Index && TB_Y -> Internal Index
    #
    if opt_load_B_ext == 1:
        times_inner_TB_X = max(size_B_E_TB / each_configuration.size_TB_X, 1.0)
        times_inner_TB_Y = max(size_B_K_TB / each_configuration.size_TB_Y, 1.0)
    #
    # TB_X -> Internal Index && TB_Y -> External Index
    #
    else:
        times_inner_TB_X = max(size_B_K_TB / each_configuration.size_TB_X, 1.0)
        times_inner_TB_Y = max(size_B_E_TB / each_configuration.size_TB_Y, 1.0)
    #
    if opt_print == 1:
        print ("B: :times_inner_TB_X: ", times_inner_TB_X, ", times_inner_TB_Y: ", times_inner_TB_Y)
    #
    # Based on a row along TB_X, can the number of continuous elements be loaded concurrently?
    # NOTE(review): this tests opt_load_A_ext although we are in the B
    # section (times_inner above uses opt_load_B_ext) — looks like a
    # copy-paste slip; confirm before changing, as it alters the cost model.
    #
    if opt_load_A_ext == -1:  # TB_X -> K
        size_TB_X = min(size_B_K_TB, each_configuration.size_TB_X)
        size_TB_Y = min(size_B_E_TB, each_configuration.size_TB_Y)
    else:  # TB_X -> E
        size_TB_X = min(size_B_E_TB, each_configuration.size_TB_X)
        size_TB_Y = min(size_B_K_TB, each_configuration.size_TB_Y)
    #
    estimated_DRAM_transaction_per_TB_X = size_TB_X / min(size_continuous_elements_B, size_TB_X)
    estimated_DRAM_transaction_per_TB = estimated_DRAM_transaction_per_TB_X * size_TB_Y
    estimated_DRAM_transaction_per_TB_inner_loops = estimated_DRAM_transaction_per_TB * times_inner_TB_X * times_inner_TB_Y
    #estimated_DRAM_transaction_per_TB_inner_loops_reg = estimated_DRAM_transaction_per_TB_inner_loops * size_A_E_REG
    estimated_DRAM_transaction_per_TB_inner_loops_reg = estimated_DRAM_transaction_per_TB_inner_loops * size_B_E_REG
    estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K = estimated_DRAM_transaction_per_TB_inner_loops_reg * steps_main_loops
    if opt_print == 1:
        print ("size_TB_X: ", size_TB_X, ", size_TB_Y: ", size_TB_Y, ", size_TB_K: ", size_TB_K)
        print ("estimated_DRAM_transactions_per_TB_X (should be fixed): ", estimated_DRAM_transaction_per_TB_X)
        print ("estimated_DRAM_transactions_per_TB: ", estimated_DRAM_transaction_per_TB)
        print ("estimated_DRAM_transaction_per_TB_inner_loops: ", estimated_DRAM_transaction_per_TB_inner_loops)
        print ("estimated_DRAM_transaction_per_TB_inner_loops_reg: ", estimated_DRAM_transaction_per_TB_inner_loops_reg)
        print ("estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K: ", estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K)
    #
    cost_TB_load_B = estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K
    #
    # Output: C — contiguity of the leading TB-mapped indices.
    #
    is_continuous = 1
    for each_idx in each_configuration.list_tensor_C:
        #
        # Index mapped on TB
        #
        if tc_helper.tc_gen_helper_find_1d(each_configuration.list_REG_X, each_idx) == -1 and tc_helper.tc_gen_helper_find_1d(each_configuration.list_REG_Y, each_idx) == -1:
            if is_continuous == 1:
                size_continuous_elements_C *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
                if tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx) != tc_helper.tc_gen_helper_find(each_configuration.list_representative_problem_size, each_idx):
                    is_continuous = -1
            else:
                break
        #
        # Index mapped on REG ends the run.
        #
        else:
            break
    #
    # Contiguous run of C seen along TB_X: leading positions where the
    # TB_X mapping matches C's index order.
    #
    size_continuous_elements_C_based_TB_X = 1
    for idx_count in range(0, len(each_configuration.list_TB_X)):
        if each_configuration.list_TB_X[idx_count] == each_configuration.list_tensor_C[idx_count]:
            size_continuous_elements_C_based_TB_X *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_configuration.list_TB_X[idx_count])
        else:
            break
    # Full output-tile volume per TB.
    size_Output_TB = 1
    for each_idx in each_configuration.list_tensor_C:
        size_Output_TB *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
    #
    cost_TB_store_C = size_Output_TB / size_continuous_elements_C_based_TB_X
    #
    # The # of Thread Blocks
    #
    num_TBs = list_comb[1]
    if opt_print == 1:
        print (">>> # of TBs: ", num_TBs)
    #
    each_configuration.cost_load_input = (cost_TB_load_A + cost_TB_load_B) * num_TBs
    each_configuration.cost_store_output = cost_TB_store_C * num_TBs
    #each_configuration.cost_load_output = 0
    # Output is charged for a load as well as a store (same cost).
    each_configuration.cost_load_output = cost_TB_store_C * num_TBs
    each_configuration.cost_total = each_configuration.cost_load_input + each_configuration.cost_store_output + each_configuration.cost_load_output
    each_configuration.steps_main_loops = steps_main_loops
    #
    if opt_print == 1:
        print ("Cost Input (Load): ", each_configuration.cost_load_input)
        print ("Cost Output (Store): ", each_configuration.cost_store_output)
        print ("Cost Output (Load): ", each_configuration.cost_load_output)
        print ("Total Cost: ", each_configuration.cost_total, ", # of steps for main-loop: ", each_configuration.steps_main_loops)
    #
    if opt_print == 1:
        print ("=============================================================================================================")
    #
    return 1
def tc_gen_code_Kernel_Load_Inputs( f, size_tb_x, size_tb_y, size_sm_a, size_sm_b, size_sm_p7, int_str_t2, int_str_v2, l_blk_boundary_rng, tensor_contraction, l_input_strides, l_t3_slices, l_internal_idx, l_t3_mapping_tb_2D, l_t3_mapping_reg, opt_gen_full, opt_gen_p7, opt_load_t2, opt_load_v2, opt_pre_computed, idx_kernel): # For Shared Memory, # need to support non-fvi for p7 # >>> it affects how to generalize loading inputs. # [To-Do] What is the purpose of this? # if len(l_blk_boundary_rng) > 0: upper_left, upper_right, l_left, l_right = tc_gen_code_Kernel_Load_Checking_Boundary( f, l_blk_boundary_rng, tensor_contraction) else: upper_left = size_tb_x upper_right = size_tb_x # # # of Internal Indices # num_internal_indices = len(l_internal_idx) # f.write("\t\t// Load Input Tensor to Shared Memory: " + str(size_tb_x) + ":" + str(tc_helper.tc_gen_helper_find(l_t3_slices, l_internal_idx[0])) + "\n") f.write("\t\t// # of Internal Indices: " + str(num_internal_indices) + "\n") # # Step 0. Boundaries for LEFT # - size_tb_x & size_tb_y are determined by tiles' size. l_idx_x = l_t3_mapping_tb_2D[0] l_idx_y = l_t3_mapping_tb_2D[1] l_left_indices = tensor_contraction[0][4] l_left_target_indices = list() l_left_indices_reg = list() cond_boundary_left_ext = -1 cond_boundary_left_int = -1 str_cond_gen_external = "" str_cond_gen_internal = "" opt_gen_full_special_case_left = -1 # # - ThreadIdx.x -> Internal Indices -- |E_K| # - ThreadIdx.y -> External Indices -- |E_LEFT| # if opt_load_t2 == -1: # # OPTION #1. 
Partial Tiles for External Indices (Assumption: 4D Input Tensors) # opt_gen_full_special_case = -1 if opt_gen_full == 1: cond_boundary_left_ext = 1 for each_idx in l_left_indices: if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg, each_idx) == -1: # if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1: # l_left_target_indices.append(each_idx) else: l_left_indices_reg.append(each_idx) # size_len_external_tiles = 1 for each_target_index in l_left_target_indices: size_len_external_tiles = size_len_external_tiles * tc_helper.tc_gen_helper_find( l_t3_slices, each_target_index) # if size_tb_y > size_len_external_tiles: len_covered_reg = int(size_tb_y / size_len_external_tiles) size_reg_tile = tc_helper.tc_gen_helper_find( l_t3_slices, l_left_indices_reg[0]) if len_covered_reg > 1: opt_gen_full_special_case = 1 # # Assumption 4D Input Tensor (3 External (2 -> TB, 1 -> REG), 1 Internal) # New-Version # # # [1] indices along y-axis can be used directly to check boundaries for indices in the left input. # list_alternative_mapping = list() opt_boundary_left_input = -1 len_l_idx_y = len(l_idx_y) for each_idx_left in l_left_target_indices: for each_idx_y in l_idx_y: if each_idx_left == each_idx_y: len_l_idx_y = len_l_idx_y - 1 # # [1] indices along y-axis can be used directly to check boundaries for indices in the left input. # if len_l_idx_y == 0: opt_boundary_left_input = 1 # # [2] indices along y-axis can be used in-directly to check boundaries for indices in the left input. # or [3] indices along y-axis cannot bu used to check boundaries for indices in the left input. # else: # # [2] indices along y-axis can be used in-directly to check boundaries for indices in the left input. 
# # if len(l_idx_y) == len(l_left_target_indices): # # The Simplest Version (Directly Replacing Indices) # for each_idx_left in l_left_target_indices: for each_idx_y in l_idx_y: if each_idx_left == each_idx_y: list_alternative_mapping.append( [each_idx_y, each_idx_left]) break else: if tc_helper.tc_gen_helper_find( l_t3_slices, each_idx_left ) == tc_helper.tc_gen_helper_find( l_t3_slices, each_idx_y): list_alternative_mapping.append( [each_idx_y, each_idx_left]) break # if len(list_alternative_mapping) == len(l_idx_y): print("[To-Do] list_alternative_mapping: ", list_alternative_mapping) opt_boundary_left_input = 2 else: opt_boundary_left_input = 3 else: opt_boundary_left_input = 3 # # [3] indices along y-axis cannot bu used to check boundaries for indices in the left input. # if opt_boundary_left_input == 1: #print (" >>> [1] indices along y-axis can be used directly to check boundaries for indices in the left input.") idx_count = 0 for idx_tb in l_left_target_indices: if idx_count == 0: str_cond_gen_external = "idx_" + idx_tb + " < rng_" + idx_tb else: str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + idx_tb idx_count = idx_count + 1 elif opt_boundary_left_input == 2: #print (" >>> [2] indices along y-axis can be used in-directly to check boundaries for indices in the left input.") idx_count = 0 for idx_mapping in list_alternative_mapping: if idx_count == 0: str_cond_gen_external = "idx_" + idx_mapping[ 0] + " < rng_" + idx_mapping[1] else: str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_mapping[ 0] + " < rng_" + idx_mapping[1] idx_count = idx_count + 1 else: print( " >>> [3] indices along y-axis cannot bu used to check boundaries for indices in the left input." ) print(" >>> ERROR!!!! Not Support Yet") del list_alternative_mapping # # OPTION #2. 
Partial Tiles for Internal Indices # if opt_gen_p7 == 1: cond_boundary_left_int = 1 str_cond_gen_internal = "threadIdx.x < SIZE_INT_UNIT_" + str( idx_kernel) + " - internal_upperbound" # # - ThreadIdx.x -> External Indices -- |E_LEFT| # - ThreadIdx.y -> Internal Indices -- |E_K| # else: # # OPTION #1. Partial Tiles for External Indices # if opt_gen_full == 1: # # This case, TB_Y will load |T_K| # However, when |TB_Y| < |T_K|, we need to load input (|T_K| / |TB_Y|) times. # cond_boundary_left_ext = 1 for each_idx in l_left_indices: if tc_helper.tc_gen_helper_find_1d( l_t3_mapping_reg, each_idx) == -1: # Not mapped on REG if tc_helper.tc_gen_helper_find_1d( l_internal_idx, each_idx) == -1: # External Indices l_left_target_indices.append(each_idx) else: l_left_indices_reg.append( each_idx) # Indices Mapped on REG # # To Calculate Length of Indices' Tile-Size mapped on TB # size_len_external_tiles = 1 for each_target_index in l_left_target_indices: size_len_external_tiles = size_len_external_tiles * tc_helper.tc_gen_helper_find( l_t3_slices, each_target_index) # # # if size_tb_x > size_len_external_tiles: opt_gen_full_special_case_left = 1 # # Assumption 4D Input Tensor (3 External (2 -> TB, 1 -> REG), 1 Internal) # To-Do: Fusion is a little bit complecated. 
# #print ("l_left_target_indices: ", l_left_target_indices) #print ("l_idx_x: ", l_idx_x) #print ("[To-Do] Boundary Case for External Indices ---- Fusion") # # For a Tensor Contraction, # if len(l_left_target_indices) == len(l_idx_x): # #print ("============================") opt_fusion = -1 for each_target in l_left_target_indices: is_common = -1 for each_idx_x in l_idx_x: if each_target == each_idx_x: is_common = 1 if is_common == -1: opt_fusion = 1 break # if opt_fusion == 1: #print ("opt_fusion == 1") idx_count = 0 for idx_tb in l_idx_x: if idx_count == 0: #str_cond_gen_external = "idx_" + idx_tb + " < rng_" + idx_tb#l_left_target_indices[idx_count] str_cond_gen_external = "idx_" + idx_tb + " < rng_" + l_left_target_indices[ idx_count] else: #str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + idx_tb#l_left_target_indices[idx_count] str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + l_left_target_indices[ idx_count] idx_count = idx_count + 1 else: #print ("opt_fusion != 1") idx_count = 0 for idx_tb in l_idx_x: if idx_count == 0: str_cond_gen_external = "idx_" + idx_tb + " < rng_" + idx_tb else: str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + idx_tb idx_count = idx_count + 1 else: idx_count = 0 for idx_tb in l_idx_x: if idx_count == 0: #str_cond_gen_external = "idx_" + idx_tb + " < rng_" + idx_tb#l_left_target_indices[idx_count] str_cond_gen_external = "idx_" + idx_tb + " < rng_" + l_left_target_indices[ idx_count] else: #str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + idx_tb#l_left_target_indices[idx_count] str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + l_left_target_indices[ idx_count] idx_count = idx_count + 1 # # OPTION #2. 
Partial Tiles for Internal Indices # if opt_gen_p7 == 1: cond_boundary_left_int = 1 str_cond_gen_internal = "threadIdx.y < SIZE_INT_UNIT_" + str( idx_kernel) + " - internal_upperbound" # # To Write Code for Boundary Cases # if (cond_boundary_left_ext == 1 and opt_gen_full_special_case_left == -1) or cond_boundary_left_int == 1: # # # tc_code_kernel_helper.code_kernel_load_input_left_boundary_case( f, opt_gen_full_special_case_left, cond_boundary_left_ext, cond_boundary_left_int, str_cond_gen_external, str_cond_gen_internal) # # # l_left_indices_target_temp = list() l_left_indices_reg_temp = list() opt_gen_full_special_case = -1 len_covered_reg = 1 # # To Figure out indices mapped on TB (l_left_indices_target_temp) and indices mapped on REG (l_left_indices_reg_temp) # for each_idx in l_left_indices: if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg, each_idx) == -1: if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1: l_left_indices_target_temp.append(each_idx) else: l_left_indices_reg_temp.append(each_idx) # To Calculate Length of Indices' Tile-Size mapped on TB len_external_tiles_left = 1 for each_target_index in l_left_indices_target_temp: len_external_tiles_left = len_external_tiles_left * tc_helper.tc_gen_helper_find( l_t3_slices, each_target_index) # Assumed only one index is mapped on REG size_reg_tile = tc_helper.tc_gen_helper_find(l_t3_slices, l_left_indices_reg_temp[0]) # # TB_X will load |E_LEFT| on TB_X without an index mapped on REG # if opt_load_t2 == -1: if size_tb_y > len_external_tiles_left: len_covered_reg = int(size_tb_y / len_external_tiles_left) opt_gen_full_special_case = 1 else: len_covered_reg = 1 # # |TB_X| > |E_LEFT_TB|, Then, |TB_X| can cover |E_LEFT_TB| # else: if size_tb_x > len_external_tiles_left: print("[aft]size_tb_x > len_external_tiles_left :: ", size_tb_x, " >? 
", len_external_tiles_left) # how many steps for register tile can be covered by TB_X len_covered_reg = int(size_tb_x / len_external_tiles_left) opt_gen_full_special_case = 1 # # |TB_X| == |E_LEFT_TB| # elif size_tb_x == len_external_tiles_left: len_covered_reg = 1 # # |TB_X| < |E_LEFT_TB| # #else: #print ("HERE: ", size_tb_x, len_external_tiles_left) #len_covered_reg = 1#size_tb_x / len_external_tiles_left # (To-Do) This case will be dealt in the loop. #opt_gen_full_special_case = 2 # # Step 1: For-Statement: T2 (LEFT), |E_A'| * |E_A''|, where A' is a set of indices mapped on Thread Block and # A'' is a set of indices mapped on Register Tile. tc_code_kernel_helper.code_kernel_load_input_left_for_statement( f, opt_gen_full, tensor_contraction[0][2], opt_gen_full_special_case, size_reg_tile, len_covered_reg, l_t3_mapping_reg) # str_str_t2 = "" idx_count = 0 for each_idx in tensor_contraction[0][4]: if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1: if idx_count != 0: str_str_t2 = str_str_t2 + " * " str_str_t2 = str_str_t2 + "SIZE_SLICE_" + str( idx_kernel) + "_" + each_idx.capitalize() idx_count = idx_count + 1 # To Calculate Length of Indices' Tile-Size mapped on TB size_len_external_tiles_left = 1 size_len_reg_tiles_left = 1 for each_idx in tensor_contraction[0][4]: if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg, each_idx) == -1: if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1: size_len_external_tiles_left = size_len_external_tiles_left * tc_helper.tc_gen_helper_find( l_t3_slices, each_idx) else: size_len_reg_tiles_left = size_len_reg_tiles_left * tc_helper.tc_gen_helper_find( l_t3_slices, each_idx) # # [Sub-Routine] Load Tensor Inputs to sm_a[][] # if len(l_input_strides) > 0: tc_code_kernel_load_inputs_details.tc_gen_code_Kernel_Load_Inputs_Left( f, tensor_contraction, l_internal_idx, opt_load_t2, size_tb_x, size_tb_y, size_sm_p7, size_len_external_tiles_left, str_str_t2, num_internal_indices, idx_kernel, 
l_input_strides[0], opt_pre_computed, l_t3_mapping_tb_2D, l_t3_mapping_reg, l_t3_slices) else: tc_code_kernel_load_inputs_details.tc_gen_code_Kernel_Load_Inputs_Left( f, tensor_contraction, l_internal_idx, opt_load_t2, size_tb_x, size_tb_y, size_sm_p7, size_len_external_tiles_left, str_str_t2, num_internal_indices, idx_kernel, l_input_strides, opt_pre_computed, l_t3_mapping_tb_2D, l_t3_mapping_reg, l_t3_slices) # To Calculate Length of Indices' Tile-Size mapped on TB size_len_external_tiles_right = 1 size_len_reg_tiles_right = 1 for each_idx in tensor_contraction[1][4]: if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg, each_idx) == -1: if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1: size_len_external_tiles_right = size_len_external_tiles_right * tc_helper.tc_gen_helper_find( l_t3_slices, each_idx) else: size_len_reg_tiles_right = size_len_reg_tiles_right * tc_helper.tc_gen_helper_find( l_t3_slices, each_idx) #print ("size_len_reg_tiles_left, right:", size_len_reg_tiles_left, ",", size_len_reg_tiles_right) # # When we cannot merge Two different loops for Both Inputs. 
# #if ((size_sm_a / size_tb_y) != (size_sm_b / size_tb_y)) or (opt_load_t2 != opt_load_v2) or (size_tb_x > upper_right) or (size_tb_x > upper_left) or opt_gen_full == 1 or (size_len_reg_tiles_left != size_len_reg_tiles_right): if ((size_sm_a / size_tb_y) != (size_sm_b / size_tb_y)) or ( opt_load_t2 != opt_load_v2) or opt_gen_full == 1 or ( size_len_reg_tiles_left != size_len_reg_tiles_right): f.write("\t\t}\n") f.write("\n") # # # l_right_indices = tensor_contraction[1][4] l_right_indices_target = list() l_right_indices_reg = list() cond_boundary_right_ext = -1 cond_boundary_right_int = -1 cond_boundary_right_tbx = -1 cond_boundary_right_tby = -1 str_cond_gen_external = "" str_cond_gen_internal = "" str_cond_gen_tb_x = "" str_cond_gen_tb_y = "" f.write("\t\t// Load Input Tensor to Shared Memory\n") # # [Load][Right] TB_X -> |K| && TB_Y -> |E_RIGHT| # if opt_load_v2 == -1: # # OPTION #1. External Index # if opt_gen_full == 1: cond_boundary_right_ext = 1 for each_idx in l_right_indices: if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg, each_idx) == -1: if tc_helper.tc_gen_helper_find_1d( l_internal_idx, each_idx) == -1: l_right_indices_target.append(each_idx) else: l_right_indices_reg.append(each_idx) #print ("len(l_right_indices_target):", len(l_right_indices_target)) #print ("len(l_idx_x);", len(l_idx_x)) # # To-Do # # 4D Input Tensor if len(l_right_indices_target) == len(l_idx_y): idx_count = 0 for idx_tb in l_idx_y: if idx_count == 0: str_cond_gen_external = "idx_" + idx_tb + " < rng_" + l_right_indices_target[ idx_count] else: str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + l_right_indices_target[ idx_count] idx_count = idx_count + 1 else: print("ERROR: (-1) Input Tensor Should be 4D...") # # OPTION #2. Internal Index # if opt_gen_p7 == 1: cond_boundary_right_int = 1 str_cond_gen_internal = "threadIdx.x < SIZE_INT_UNIT_" + str( idx_kernel) + " - internal_upperbound" # # OPTION #3. 
# if size_tb_x > size_sm_p7: cond_boundary_right_tbx = 1 str_cond_gen_tb_x = "threadIdx.x < " + str(size_sm_p7) else: # # OPTION #1. External Index # #print ("[CODE][LOAD-INPUT][RIGHT] TB_X -> |E_RIGHT| && TB_Y -> |K|") opt_gen_full_special_case = -1 if opt_gen_full == 1: cond_boundary_right_ext = 1 for each_idx in l_right_indices: if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg, each_idx) == -1: if tc_helper.tc_gen_helper_find_1d( l_internal_idx, each_idx) == -1: l_right_indices_target.append(each_idx) else: l_right_indices_reg.append(each_idx) # To Calculate Length of Indices' Tile-Size mapped on TB size_len_external_tiles = 1 for each_target_index in l_right_indices_target: size_len_external_tiles = size_len_external_tiles * tc_helper.tc_gen_helper_find( l_t3_slices, each_target_index) # TB_X will load |E_LEFT| on TB_X without the index mapped on REG if size_tb_x > size_len_external_tiles: # how many steps for register tile can be covered by TB_X len_covered_reg = int(size_tb_x / size_len_external_tiles) # Assumed only one index is mapped on REG size_reg_tile = tc_helper.tc_gen_helper_find( l_t3_slices, l_right_indices_reg[0]) if len_covered_reg > 1: opt_gen_full_special_case = 1 ''' if len(l_idx_y) > len(l_left_target_indices): idx_count = 0 for idx_tb in l_left_target_indices: if idx_count == 0: str_cond_gen_external = "idx_" + l_idx_y[idx_count] + " < rng_" + idx_tb else: str_cond_gen_external = str_cond_gen_external + " && idx_" + l_idx_y[idx_count] + " < rng_" + idx_tb#l_left_target_indices[idx_count] idx_count = idx_count + 1 ''' # #print ("len(l_right_indices_target):", len(l_right_indices_target)) #print ("len(l_idx_x);", len(l_idx_x)) if len(l_idx_x) > len(l_right_indices_target): idx_count = 0 for idx_tb in l_right_indices_target: if idx_count == 0: str_cond_gen_external = "idx_" + l_idx_x[ idx_count] + " < rng_" + idx_tb else: temp = str_cond_gen_external # bug?? 
str_cond_gen_external = temp + " && idx_" + l_idx_x[ idx_count] + " < rng_" + idx_tb idx_count = idx_count + 1 else: idx_count = 0 for idx_tb in l_idx_x: if idx_count == 0: str_cond_gen_external = "idx_" + idx_tb + " < rng_" + l_right_indices_target[ idx_count] else: temp = str_cond_gen_external # bug?? str_cond_gen_external = temp + " && idx_" + idx_tb + " < rng_" + l_right_indices_target[ idx_count] idx_count = idx_count + 1 ''' if len(l_right_indices_target) == len(l_idx_x): idx_count = 0 for idx_tb in l_idx_x: if idx_count == 0: str_cond_gen_external = "idx_" + idx_tb + " < rng_" + l_right_indices_target[idx_count] else: temp = str_cond_gen_external # bug?? str_cond_gen_external = temp + " && idx_" + idx_tb + " < rng_" + l_right_indices_target[idx_count] idx_count = idx_count + 1 else: print ("ERROR: (!-1) Input Tensor Should be 4D...") ''' # # OPTION #2. # if opt_gen_p7 == 1: cond_boundary_right_int = 1 str_cond_gen_internal = "threadIdx.y < SIZE_INT_UNIT_" + str( idx_kernel) + " - internal_upperbound" # # Boundary Cases (External, Internal and Thread Block) # To Write Code for Boundary Cases # if cond_boundary_right_ext == 1 or cond_boundary_right_int == 1 or cond_boundary_right_tbx == 1: # # # tc_code_kernel_helper.code_kernel_load_input_right_boundary_case( f, cond_boundary_right_ext, cond_boundary_right_tbx, cond_boundary_right_int, str_cond_gen_external, str_cond_gen_tb_x, str_cond_gen_internal) # # l_right_indices_target_temp = list() l_right_indices_reg_temp = list() # for each_idx in l_right_indices: if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg, each_idx) == -1: if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1: l_right_indices_target_temp.append(each_idx) else: l_right_indices_reg_temp.append(each_idx) # To Calculate Length of Indices' Tile-Size mapped on TB size_len_external_tiles = 1 for each_target_index in l_right_indices_target_temp: size_len_external_tiles = size_len_external_tiles * tc_helper.tc_gen_helper_find( 
l_t3_slices, each_target_index) # Assumed only one index is mapped on REG size_reg_tile = tc_helper.tc_gen_helper_find( l_t3_slices, l_right_indices_reg_temp[0]) #print ("size_tb_x: ", size_tb_x) #print ("size_tb_y: ", size_tb_y) #print ("size_len_external_tiles: ", size_len_external_tiles) opt_gen_full_special_case = -1 # # TB_X will load |E_K|. # if opt_load_v2 == -1: # # if size_tb_y > size_len_external_tiles: # how many steps for register tile can be covered by TB_X len_covered_reg = int(size_tb_y / size_len_external_tiles) if len_covered_reg > 1: opt_gen_full_special_case = 1 else: len_covered_reg = 1 # # TB_X will load |E_RIGHT|. # else: # # if size_tb_x > size_len_external_tiles: # how many steps for register tile can be covered by TB_X len_covered_reg = int(size_tb_x / size_len_external_tiles) if len_covered_reg > 1: opt_gen_full_special_case = 1 else: len_covered_reg = 1 # # Step 2: This "For-Statement" is related to "Regiter-Tile." # The size of Thread Block depends on Indices' Tile-Size mapped on Thread Block. # However, when the lengh of a dimension along thread block which load inputs can cover some "Register-Tile," # then, we need to change the ranges. 
# tc_code_kernel_helper.code_kernel_load_input_right_for_statement( f, opt_gen_full, tensor_contraction[1][2], opt_gen_full_special_case, size_len_reg_tiles_right, size_reg_tile, len_covered_reg, l_t3_mapping_reg) # # free # del l_right_indices_target del l_right_indices_reg del l_right_indices_target_temp del l_right_indices_reg_temp # # # str_str_v2 = "" idx_count = 0 for each_idx in tensor_contraction[1][4]: if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1: if idx_count != 0: str_str_v2 = str_str_v2 + " * " str_str_v2 = str_str_v2 + "SIZE_SLICE_" + str( idx_kernel) + "_" + each_idx.capitalize() idx_count = idx_count + 1 # # [Sub-Routine] Load Tensor Inputs to sm_b[][] # if len(l_input_strides) > 0: tc_code_kernel_load_inputs_details.tc_gen_code_Kernel_Load_Inputs_Right( f, tensor_contraction, l_internal_idx, opt_load_v2, size_tb_x, size_tb_y, size_sm_p7, size_len_external_tiles_right, str_str_v2, num_internal_indices, idx_kernel, l_input_strides[2], opt_pre_computed, l_t3_mapping_tb_2D, l_t3_mapping_reg, opt_gen_full, opt_gen_p7, l_t3_slices) else: tc_code_kernel_load_inputs_details.tc_gen_code_Kernel_Load_Inputs_Right( f, tensor_contraction, l_internal_idx, opt_load_v2, size_tb_x, size_tb_y, size_sm_p7, size_len_external_tiles_right, str_str_v2, num_internal_indices, idx_kernel, l_input_strides, opt_pre_computed, l_t3_mapping_tb_2D, l_t3_mapping_reg, opt_gen_full, opt_gen_p7, l_t3_slices) # # END: To Load Inputs # f.write("\t\t}\n") f.write("\t\t__syncthreads();\n") f.write("\n")
def tc_gen_code_kernel_load_inputs_base(
        f, opt_gen_ext, opt_gen_int, opt_load_left, opt_load_right,
        opt_internal, tensor_contraction, l_t3_slices, l_internal_idx,
        l_t3_mapping_tb_2D, l_t3_mapping_reg, size_smem_k, size_tb_x,
        size_tb_y, idx_kernel):
    """Emit the "base form" input-loading section of a generated kernel.

    Writes (through *f*) the code that stages both contraction inputs into
    shared memory and ends with a ``__syncthreads()``, delegating the
    per-input details to
    ``tc_code_kernel_load_inputs_abstract.tc_gen_code_Kernel_Load_Inputs_Abstracts``.

    The emitted C code has roughly this shape::

        if (idx_a < rng_c2 && threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound)
        if (threadIdx.x < 8)
        {
            for (int ll = 0; ll < rng_c1; ll++)
            {
                sm_b[][] = dev_v2[ext_addr + int_addr];
            }
        }
        __syncthreads();
    """
    tabs = 2  # indentation depth (in tabs) for every emitted line

    def tile_volumes(idx_list):
        # Product of tile sizes over *idx_list*, split into the external
        # (thread-block-mapped) part and the internal part; indices that are
        # mapped on the register tile (l_t3_mapping_reg) are excluded.
        vol_ext = 1
        vol_reg = 1
        for idx in idx_list:
            if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg, idx) != -1:
                continue
            tile = tc_helper.tc_gen_helper_find(l_t3_slices, idx)
            if tc_helper.tc_gen_helper_find_1d(l_internal_idx, idx) == -1:
                vol_ext = vol_ext * tile
            else:
                vol_reg = vol_reg * tile
        return vol_ext, vol_reg

    tc_helper.tc_gen_helper_code_a_line(
        f, tabs,
        "//---------------------------------------------------------------------------------------------------",
        1)
    tc_helper.tc_gen_helper_code_a_line(f, tabs,
                                        "// This is for the new version", 1)

    # Axis (0 == "x", 1 == "y") that carries the register-mapped index for
    # each input, read from the contraction descriptor.
    axis_reg_left = 0 if tensor_contraction[0][2] == "x" else 1
    axis_reg_right = 0 if tensor_contraction[1][2] == "x" else 1

    # Tile-size volumes for the indices mapped on the thread block.
    ext_left, reg_left = tile_volumes(tensor_contraction[0][4])
    ext_right, reg_right = tile_volumes(tensor_contraction[1][4])

    print("[Code Generator][New][Load][Input] size_len_ext_left: ", ext_left)
    print("[Code Generator][New][Load][Input] size_len_ext_right: ", ext_right)
    print("[Code Generator][New][Load][Input] size_len_reg_left: ", reg_left)
    print("[Code Generator][New][Load][Input] size_len_reg_right: ", reg_right)

    # [Load-Input][Left]
    tc_helper.tc_gen_helper_code_a_line(
        f, tabs, "// This Part is for Loading Input-Left", 1)
    tc_code_kernel_load_inputs_abstract.tc_gen_code_Kernel_Load_Inputs_Abstracts(
        f, tabs,
        tensor_contraction[0],
        # options
        opt_load_left, 1, 1, opt_gen_ext, opt_gen_int,
        axis_reg_left,  # need to make automatically
        opt_internal,
        # lists
        l_t3_slices, l_internal_idx, l_t3_mapping_tb_2D, l_t3_mapping_reg,
        # sizes
        ext_left, reg_left, size_smem_k, size_tb_x, size_tb_y, idx_kernel)

    tc_helper.tc_gen_helper_code_a_line(f, tabs, "", 1)

    # [Load-Input][Right]
    tc_helper.tc_gen_helper_code_a_line(
        f, tabs, "// This Part is for Loading Input-Right", 1)
    tc_code_kernel_load_inputs_abstract.tc_gen_code_Kernel_Load_Inputs_Abstracts(
        f, tabs,
        tensor_contraction[1],
        # options
        opt_load_right, 2, 2, opt_gen_ext, opt_gen_int,
        axis_reg_right, opt_internal,
        # lists
        l_t3_slices, l_internal_idx, l_t3_mapping_tb_2D, l_t3_mapping_reg,
        # sizes
        ext_right, reg_right, size_smem_k, size_tb_x, size_tb_y, idx_kernel)

    # END: after both inputs are staged in shared memory.
    tc_helper.tc_gen_helper_code_a_line(f, tabs, "__syncthreads();", 1)
    tc_helper.tc_gen_helper_code_a_line(
        f, tabs,
        "//---------------------------------------------------------------------------------------------------",
        1)
    tc_helper.tc_gen_helper_code_a_line(f, tabs, "\n", 1)
def tc_gen_perms_exclusive_TB_Y(list_sizes_TB, list_given_output_tensor,
                                list_given_input_tensor_left,
                                list_given_input_tensor_right,
                                list_internal_indices,
                                list_representative_problem_size, list_TB_K,
                                list_TB_X, list_REG_X, list_REG_Y,
                                list_inherited_Tile_Sizes,
                                list_CLASS_configuration, opt_print):
    """Enumerate TB_Y mappings over the right-hand input tensor's indices.

    For every candidate size in *list_sizes_TB*, greedily maps the external
    indices of *list_given_input_tensor_right* (skipping internal indices and
    indices already mapped on REG_X / REG_Y) onto TB_Y, deriving a tile size
    per index.  Candidates that fit the shared-memory budget and have equal
    left/right shared-memory footprints are appended to
    *list_CLASS_configuration* (mutated in place) as Configuration objects.

    Returns None; the result is accumulated in *list_CLASS_configuration*.
    """
    # Count external vs. internal indices of the right-hand input tensor.
    num_ext_idx = 0
    num_int_idx = 0
    for each_right_idx in list_given_input_tensor_right:
        if tc_helper.tc_gen_helper_find_1d(list_internal_indices, each_right_idx) == -1:
            num_ext_idx += 1
        else:
            num_int_idx += 1
    len_tensor_right = len(list_given_input_tensor_right)
    # NOTE(review): the "Tensor (LEFT)"/"len(LEFT)" labels below actually
    # print the RIGHT-hand input tensor — looks copy-pasted from the TB_X
    # variant; confirm before trusting these logs.
    if opt_print == 1:
        print("========================================== [Enumerations-TB_Y] ===================================================")
        print("========================================== [Exclusive] [START] ===================================================")
        print("Tensor (LEFT): ", list_given_input_tensor_right)
        print("len(LEFT): ", len_tensor_right, ", # of External Indices: ", num_ext_idx, ", # of Internal Indices: ", num_int_idx)
        print("list_representative_problem_size: ", list_representative_problem_size)
        print("Given Tile-Sizes: ", list_inherited_Tile_Sizes)
        print("Given REG_X: ", list_REG_X)
        print("Given REG_Y: ", list_REG_Y)
        print("Given TB_X: ", list_TB_X)
        print("========================================== [Exclusive] [END] ===================================================")
    #
    # Try every candidate thread-block-Y size.
    #
    for size_TB_Y in list_sizes_TB:
        if opt_print == 1:
            print("|TB_Y| = ", size_TB_Y)
        #
        # Assumption: This Input Tensor does not have the FVI in the Output Tensor.
        #
        TB_Y_Vol = 1        # accumulated volume of indices mapped so far
        TB_Y_Vol_Prev = 1   # volume before the current index was added
        done_mapping_TB_Y = -1  # becomes 1 once |TB_Y| is exactly covered
        list_TB_Y = []
        # Work on a copy so each candidate starts from the inherited sizes.
        duplicated_Tile_Sizes = copy.deepcopy(list_inherited_Tile_Sizes)
        #
        # Greedily assign eligible indices (external, not on REG_X/REG_Y).
        #
        for each_right_idx in list_given_input_tensor_right:
            #
            # #1. Internal Index
            #
            if tc_helper.tc_gen_helper_find_1d(list_internal_indices, each_right_idx) != -1:
                continue
            #
            # #2. (Just In Case) Indices Mapped on REG_X
            #
            if tc_helper.tc_gen_helper_find_1d(list_REG_X, each_right_idx) != -1:
                continue
            #
            # #3. Indices Mapped on REG_Y
            #
            if tc_helper.tc_gen_helper_find_1d(list_REG_Y, each_right_idx) != -1:
                continue
            #
            # |TB_Y'|
            #
            TB_Y_Vol *= tc_helper.tc_gen_helper_find(list_representative_problem_size, each_right_idx)
            #
            # |TB_Y'| >= |TB_Y|
            #
            if TB_Y_Vol >= size_TB_Y:
                #
                # |TB_Y'| > |TB_Y|: this index overshoots the target, so it
                # gets a partial (blocking) tile; later indices get tile 1.
                #
                if TB_Y_Vol > size_TB_Y:
                    if done_mapping_TB_Y == -1:
                        blocking_tile_size = size_TB_Y / TB_Y_Vol_Prev
                        list_TB_Y.append(each_right_idx)
                        duplicated_Tile_Sizes.append([each_right_idx, int(blocking_tile_size)])
                        done_mapping_TB_Y = 1
                    else:
                        list_TB_Y.append(each_right_idx)
                        duplicated_Tile_Sizes.append([each_right_idx, 1])
                #
                # |TB_Y'| == |TB_Y|: exact fit.
                # NOTE(review): done_mapping_TB_Y is NOT set to 1 here, so an
                # exact fit never passes the "done_mapping_TB_Y == 1" gate
                # below and produces no configuration — TODO confirm intended.
                #
                else:
                    if done_mapping_TB_Y == -1:
                        list_TB_Y.append(each_right_idx)
                        duplicated_Tile_Sizes.append([each_right_idx, tc_helper.tc_gen_helper_find(list_representative_problem_size, each_right_idx)])
                    else:
                        list_TB_Y.append(each_right_idx)
                        duplicated_Tile_Sizes.append([each_right_idx, 1])
            #
            # |TB_Y'| < |TB_Y|: take the whole extent of this index.
            #
            else:
                list_TB_Y.append(each_right_idx)
                duplicated_Tile_Sizes.append([each_right_idx, tc_helper.tc_gen_helper_find(list_representative_problem_size, each_right_idx)])
            #
            #
            #
            TB_Y_Vol_Prev *= tc_helper.tc_gen_helper_find(list_representative_problem_size, each_right_idx)
        #
        #print ("list_TB_X: ", list_TB_X)
        #print ("list_TB_Y: ", list_TB_Y)
        #print ("Tile-Sizes: ", duplicated_Tile_Sizes)
        #
        # Configuration
        #
        if done_mapping_TB_Y == 1:
            #
            # #1. Shared-Memory |SMEM_L| = |SMEM_R|
            #
            size_SMEM_Left = 1
            size_SMEM_Right = 1
            for each_idx in list_given_input_tensor_left:
                if tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx) != -1:
                    size_SMEM_Left *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)
            for each_idx in list_given_input_tensor_right:
                if tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx) != -1:
                    size_SMEM_Right *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)
            #
            # #1. H/W Constraint ---- Shared Memory
            # (factor 16 presumably the internal/K tile depth or element
            # scaling; 4096 the per-block element budget — TODO confirm)
            #
            if (size_SMEM_Left * 16) + (size_SMEM_Right * 16) > 4096:
                continue
            #
            # Only keep candidates with balanced shared-memory footprints.
            #
            if size_SMEM_Left == size_SMEM_Right:
                tmp_config = tc_gen_permutations.Configuration()
                tmp_config.add_tensor_C(list_given_output_tensor)
                tmp_config.add_tensor_A(list_given_input_tensor_left)
                tmp_config.add_tensor_B(list_given_input_tensor_right)
                tmp_config.add_REG_X(list_REG_X)
                tmp_config.add_REG_Y(list_REG_Y)
                tmp_config.add_TB_X(list_TB_X)
                tmp_config.add_TB_Y(list_TB_Y)
                tmp_config.add_TB_K(list_TB_K)
                #
                # [To-Do] Need to Make it Automatically
                # (hard-coded extra tile size for a specific test case)
                #
                #duplicated_Tile_Sizes.append(["e", 16])
                #duplicated_Tile_Sizes.append(["f", 1])
                #duplicated_Tile_Sizes.append(["g", 16])
                #duplicated_Tile_Sizes.append(["d", 16])
                # for 15
                duplicated_Tile_Sizes.append(["f", 16])
                #duplicated_Tile_Sizes.append(["e", 16])  # 3
                tmp_config.add_tile_size(duplicated_Tile_Sizes)
                tmp_config.add_representative_problem_size(list_representative_problem_size)
                # Cache the per-axis volumes implied by the tile sizes.
                tmp_config.size_REG_X = 1
                tmp_config.size_REG_Y = 1
                tmp_config.size_TB_X = 1
                tmp_config.size_TB_Y = 1
                tmp_config.size_TB_K = 1
                for each_idx in list_REG_X:
                    tmp_config.size_REG_X *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)
                for each_idx in list_REG_Y:
                    tmp_config.size_REG_Y *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)
                for each_idx in list_TB_X:
                    tmp_config.size_TB_X *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)
                for each_idx in list_TB_Y:
                    tmp_config.size_TB_Y *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)
                for each_idx in list_TB_K:
                    tmp_config.size_TB_K *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)
                '''
                print (">>>> list_TB_X: ", tmp_config.list_TB_X)
                print (">>>> list_TB_Y: ", tmp_config.list_TB_Y)
                print (">>>> list_TB_K: ", tmp_config.list_TB_K)
                print (">>>> list_REG_X: ", tmp_config.list_REG_X)
                print (">>>> list_REG_Y: ", tmp_config.list_REG_Y)
                print (">>>> list_Tile_Sizes: ", tmp_config.list_tile_sizes)
                print (">>>> # of Elements in SMEM_L: ", size_SMEM_Left * 16)
                print (">>>> # of Elements in SMEM_R: ", size_SMEM_Right * 16)
                '''
                #
                list_CLASS_configuration.append(tmp_config)
def tc_gen_code_Kernel_Initial(
        f,
        size_sm_p7, size_sm_a, size_sm_b,  # For Shared memory
        l_t3_mapping_tb_2D, l_t3_idx, l_t3_slices,  # For T3 (Output)
        size_reg_x, size_reg_y,  # For Register-Tiling
        opt_gen_p7, opt_gen_full, opt_pre_computed,  # Options
        opt_shared_padding, opt_data_type,  # Options
        idx_kernel):  # For Options for Generalizing
    """Emit the prologue of a generated CUDA tensor-contraction kernel.

    Writes to *f*: shared-memory declarations (sm_a/sm_b, optionally padded),
    per-thread index decomposition from threadIdx, per-block index
    decomposition from blockIdx, the output tensor's base address, the index
    ranges (rng_*), and the register-tile declarations plus their zero
    initialization.  When *opt_pre_computed* == -1 all indices/ranges are
    computed manually in the emitted code; otherwise they come from
    pre-computed device arrays (dev_t3_output_base_*, dev_t3_block_range_*).
    """
    # Shared Memory
    # Depends on SIZE_P7_UNIT, SIZE_REG_T, SIZE_SLICE (or SIZE_TB)
    f.write("\t// For Shared Memory,\n")
    if opt_shared_padding == 1:
        # "+ 1" padding on the second dimension (presumably to avoid bank
        # conflicts — TODO confirm).
        if opt_data_type == "DOUBLE":
            f.write("\t__shared__ double sm_a[" + str(size_sm_p7) + "][" + str(size_sm_a) + " + 1];\n")
            f.write("\t__shared__ double sm_b[" + str(size_sm_p7) + "][" + str(size_sm_b) + " + 1];\n")
        else:
            f.write("\t__shared__ float sm_a[" + str(size_sm_p7) + "][" + str(size_sm_a) + " + 1];\n")
            f.write("\t__shared__ float sm_b[" + str(size_sm_p7) + "][" + str(size_sm_b) + " + 1];\n")
    else:
        if opt_data_type == "DOUBLE":
            f.write("\t__shared__ double sm_a[" + str(size_sm_p7) + "][" + str(size_sm_a) + "];\n")
            f.write("\t__shared__ double sm_b[" + str(size_sm_p7) + "][" + str(size_sm_b) + "];\n")
        else:
            f.write("\t__shared__ float sm_a[" + str(size_sm_p7) + "][" + str(size_sm_a) + "];\n")
            f.write("\t__shared__ float sm_b[" + str(size_sm_p7) + "][" + str(size_sm_b) + "];\n")
    f.write("\n")
    #
    # basic variables (used for pre-computed arrays)
    #
    if opt_pre_computed != -1:
        f.write("\tint l_idx_t3 = threadIdx.x + threadIdx.y * SIZE_TB_" +
                str(idx_kernel) + "_X;\n")  # (Required)
        f.write("\tint t3_base_thread = dev_t3_output_base_" + str(idx_kernel) +
                "[blockIdx.x] + dev_t3_output_offset_" + str(idx_kernel) +
                "[l_idx_t3];\n")  # Based on inputs (t3_output_base, t3_output_offset) (Required)
        f.write("\n")
    # Generalized for "Internal Indices"
    if opt_gen_p7 == 1:
        f.write("\tint internal_upperbound = 0;\n")  # Based on p7b's size (Generalized)
        f.write("\tint internal_offset;\n")  # Based on p7b's size (Generalized)
        f.write("\n")
    #
    # "-1": pre_computed is off
    # " 1": pre_computed is on
    #
    if opt_pre_computed == -1:
        numIdx_TB_X = len(l_t3_mapping_tb_2D[0])
        numIdx_TB_Y = len(l_t3_mapping_tb_2D[1])
        f.write("\t// when opt_pre_computed == -1, all indices will be calculated manually\n")
        f.write("\t// # of indices mapped on TB_X: " + str(numIdx_TB_X) + "\n")
        f.write("\t// # of indices mapped on TB_Y: " + str(numIdx_TB_Y) + "\n")
        #
        # TB_X: decompose threadIdx.x into the mapped indices.
        #
        if numIdx_TB_X == 1:
            f.write("\tint idx_" + l_t3_mapping_tb_2D[0][0] + " = threadIdx.x;\n")
        elif numIdx_TB_X == 2:
            f.write("\tint idx_" + l_t3_mapping_tb_2D[0][0] +
                    " = threadIdx.x % SIZE_SLICE_" + str(idx_kernel) + "_" +
                    l_t3_mapping_tb_2D[0][0].capitalize() + ";\n")
            f.write("\tint idx_" + l_t3_mapping_tb_2D[0][1] +
                    " = threadIdx.x / SIZE_SLICE_" + str(idx_kernel) + "_" +
                    l_t3_mapping_tb_2D[0][0].capitalize() + ";\n")
        else:
            #
            # [To-Do] 3+ indices on TB_X: build cumulative stride strings.
            # NOTE(review): prev_stride is only set in the first branch, so
            # for 4+ indices later strides are not cumulative products —
            # TODO confirm intended.
            #
            list_strides = list()
            idx_count = 0
            prev_stride = ""
            for each_idx in l_t3_mapping_tb_2D[0]:
                if idx_count != len(l_t3_mapping_tb_2D[0]) - 1:
                    if prev_stride == "":
                        list_strides.append("SIZE_SLICE_" + str(idx_kernel) + "_" + each_idx.capitalize())
                        prev_stride = "SIZE_SLICE_" + str(idx_kernel) + "_" + each_idx.capitalize()
                    else:
                        list_strides.append("SIZE_SLICE_" + str(idx_kernel) + "_" + each_idx.capitalize() + " * " + prev_stride)
                #
                idx_count += 1
            #
            # Peel indices off threadIdx.x from the slowest-varying side.
            #
            rev_l_idx_tb_x = list(reversed(l_t3_mapping_tb_2D[0]))
            idx_count = 0
            rev_idx_count = len(rev_l_idx_tb_x) - 1
            for each_rev_idx in rev_l_idx_tb_x:
                #
                rev_idx_count -= 1
                #
                if idx_count == 0:
                    if idx_count == len(rev_l_idx_tb_x) - 1:
                        f.write("\tint idx_" + each_rev_idx + " = threadIdx.x;\n")
                    else:
                        f.write("\tint idx_" + each_rev_idx + " = threadIdx.x / (" + list_strides[rev_idx_count] + ");\n")
                        f.write("\tint remaining_idx = threadIdx.x % (" + list_strides[rev_idx_count] + ");\n")
                else:
                    if idx_count == len(rev_l_idx_tb_x) - 1:
                        f.write("\tint idx_" + each_rev_idx + " = remaining_idx;\n")
                    else:
                        f.write("\tint idx_" + each_rev_idx + " = remaining_idx / (" + list_strides[rev_idx_count] + ");\n")
                        f.write("\tremaining_idx = remaining_idx % (" + list_strides[rev_idx_count] + ");\n")
                #
                idx_count += 1
        #
        # TB_Y: decompose threadIdx.y into the mapped indices.
        #
        if numIdx_TB_Y == 1:
            f.write("\tint idx_" + l_t3_mapping_tb_2D[1][0] + " = threadIdx.y;\n")
        elif numIdx_TB_Y == 2:
            f.write("\tint idx_" + l_t3_mapping_tb_2D[1][0] +
                    " = threadIdx.y % SIZE_SLICE_" + str(idx_kernel) + "_" +
                    l_t3_mapping_tb_2D[1][0].capitalize() + ";\n")
            f.write("\tint idx_" + l_t3_mapping_tb_2D[1][1] +
                    " = threadIdx.y / SIZE_SLICE_" + str(idx_kernel) + "_" +
                    l_t3_mapping_tb_2D[1][0].capitalize() + ";\n")
        else:
            #
            # [To-Do] >2 indices on TB_Y not supported: emits placeholders.
            #
            f.write("\t// not-yet: |TB_Y| > 2, " + str(numIdx_TB_Y) + "\n")
            for each_idx in l_t3_mapping_tb_2D[1]:
                f.write("\tidx_" + each_idx + "\n")
        #
        # Block Numbers: decompose blockIdx.x into per-index block ids.
        #
        f.write("\n")
        f.write("\tint tmp_blkIdx;\n")
        rev_l_t3_idx = reversed(l_t3_idx)
        len_l_t3_idx = len(l_t3_idx)
        #
        idx_count = len_l_t3_idx
        for each_idx in rev_l_t3_idx:
            # Product of numBlk_* strides for the remaining faster indices.
            str_prod_strides = ""
            for each_num_idx in range(0, idx_count - 1):
                if each_num_idx == 0:
                    str_prod_strides = "numBlk_" + l_t3_idx[each_num_idx]
                else:
                    str_prod_strides = "numBlk_" + l_t3_idx[each_num_idx] + " * " + str_prod_strides
            #
            if idx_count == len_l_t3_idx:
                f.write("\tint blk_idx_" + each_idx + " = blockIdx.x / (" + str_prod_strides + ");\n")
                f.write("\ttmp_blkIdx = blockIdx.x % (" + str_prod_strides + ");\n")
            else:
                if idx_count == 1:
                    # Last index: take the remainder directly.
                    f.write("\tint blk_idx_" + each_idx + " = tmp_blkIdx;\n")
                elif idx_count == 2:
                    f.write("\tint blk_idx_" + each_idx + " = tmp_blkIdx / " + str_prod_strides + ";\n")
                    f.write("\ttmp_blkIdx = tmp_blkIdx % (" + str_prod_strides + ");\n")
                else:
                    f.write("\tint blk_idx_" + each_idx + " = tmp_blkIdx / (" + str_prod_strides + ");\n")
                    f.write("\ttmp_blkIdx = tmp_blkIdx % (" + str_prod_strides + ");\n")
            #
            f.write("\n")
            idx_count = idx_count - 1
        #
        # the output's base address for a thread block
        #
        str_t3_base_addr = ""
        rev_l_t3_idx = reversed(l_t3_idx)
        l_tb_idx = list()
        for each_axis in l_t3_mapping_tb_2D:
            for each_idx in each_axis:
                l_tb_idx.append(each_idx)
        idx_count = 0
        existing_idx = 0
        for each_idx in rev_l_t3_idx:
            # Is this output index mapped on the thread block? (consume it)
            idx_t3_count = 0
            for each_tb_idx in l_tb_idx:
                if each_idx == each_tb_idx:
                    existing_idx = 1
                    l_tb_idx.pop(idx_t3_count)
                    break
                idx_t3_count = idx_t3_count + 1
            #
            if existing_idx == 1:
                # TB-mapped index contributes "+ idx_<i>" as well.
                if idx_count == 0:
                    str_t3_base_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(idx_kernel) + "_" + each_idx.capitalize() + " + idx_" + each_idx
                else:
                    str_t3_base_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(idx_kernel) + "_" + each_idx.capitalize() + " + idx_" + each_idx + " + (" + str_t3_base_addr + ") * size_" + each_idx
            else:
                if idx_count == 0:
                    str_t3_base_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(idx_kernel) + "_" + each_idx.capitalize()
                else:
                    str_t3_base_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(idx_kernel) + "_" + each_idx.capitalize() + " + (" + str_t3_base_addr + ") * size_" + each_idx
            #
            existing_idx = 0
            idx_count = idx_count + 1
        f.write("\tint t3_base_thread = " + str_t3_base_addr + ";\n")
        f.write("\n")
        #
        # Ranges: rng_<i> = full tile or the partial remainder per index.
        #
        if opt_gen_full == 1:
            f.write("\t// need to support partial tiles\n")
            idx_count = 0
            for each_idx in l_t3_idx:
                if idx_count == 0:
                    f.write("\tint rng_" + each_idx)
                else:
                    f.write(", rng_" + each_idx)
                #
                idx_count = idx_count + 1
            f.write(";\n")
            #
            # NOTE(review): these lines hard-code "SIZE_SLICE_1_" instead of
            # using str(idx_kernel) like the rest — TODO confirm intended.
            #
            for each_idx in l_t3_idx:
                f.write("\tif ((size_" + each_idx + " - (blk_idx_" + each_idx + " * SIZE_SLICE_1_" + each_idx.capitalize() + ")) >= SIZE_SLICE_1_" + each_idx.capitalize() + ")\n")
                f.write("\t{\n")
                # IF: full tile
                f.write("\t\trng_" + each_idx + " = SIZE_SLICE_1_" + each_idx.capitalize() + ";\n")
                f.write("\t}\n")
                f.write("\telse\n")
                f.write("\t{\n")
                # ELSE: partial (remainder) tile
                f.write("\t\trng_" + each_idx + " = size_" + each_idx + " % SIZE_SLICE_1_" + each_idx.capitalize() + ";\n")
                f.write("\t}\n")
            #
            del l_tb_idx
    else:
        #
        # Generalized for non-full tile
        # not yet generalized....
        #
        if opt_gen_full == 1:
            f.write("\t// should support for non-full tiles\n")
        #
        # To-Do: It does not support multi-dimensional arrays fully.
        # "x"-axis
        #
        if len(l_t3_mapping_tb_2D) != 2:
            print("ERROR: This part does not support well when len(l_t3_mapping_tb_2D) != 2!")
        #
        numIdxTB_X = len(l_t3_mapping_tb_2D[0])
        numIdxTB_Y = len(l_t3_mapping_tb_2D[1])
        #
        if numIdxTB_X == 2:
            f.write("\tint idx_" + l_t3_mapping_tb_2D[0][0] +
                    " = threadIdx.x % SIZE_SLICE_" + str(idx_kernel) + "_" +
                    l_t3_mapping_tb_2D[0][0].capitalize() + ";\n")
            f.write("\tint idx_" + l_t3_mapping_tb_2D[0][1] +
                    " = threadIdx.x / SIZE_SLICE_" + str(idx_kernel) + "_" +
                    l_t3_mapping_tb_2D[0][0].capitalize() + ";\n")
        elif numIdxTB_X == 1:
            f.write("\tint idx_" + l_t3_mapping_tb_2D[0][0] + " = threadIdx.x;\n")
        else:
            #print ("[ERROR]!!! The number of indices mapped on TB_X: ", numIdxTB_X, " (Not Supported Yet)")
            #print ("[ERROR]!!! TB_X: ", l_t3_mapping_tb_2D[0])
            tc_code_etc.tc_gen_code_write_line(
                f, 1,
                "// The # of External Indices mapped on TB_X is equal to or greater than 3"
            )
            # Cumulative stride strings for each TB_X index.
            l_stride_TB_X = list()
            tmp_str_stride = ""
            idx_count = 0
            for each_idx in l_t3_mapping_tb_2D[0]:
                if idx_count == 0:
                    tmp_str_stride = "SIZE_SLICE_" + str(idx_kernel) + "_" + each_idx.capitalize()
                else:
                    tmp_str_stride = tmp_str_stride + " * SIZE_SLICE_" + str(idx_kernel) + "_" + each_idx.capitalize()
                #
                l_stride_TB_X.append(tmp_str_stride)
                idx_count = idx_count + 1
            #
            idx_first = 0
            idx_second = 0
            idx_count = 0
            str_remainning = "threadIdx.x"
            l_rev_l_t3_mapping_TB_X = list(reversed(l_t3_mapping_tb_2D[0]))
            l_rev_l_stride_TB_X = list(reversed(l_stride_TB_X))
            for each_idx in l_rev_l_t3_mapping_TB_X:
                #
                # |T_i| == 1, no need to calculate index.
                #
                if tc_helper.tc_gen_helper_find(l_t3_slices, each_idx) == 1:
                    tc_code_etc.tc_gen_code_write_line(
                        f, 1, "int idx_" + each_idx + " \t= 0;")
                #
                # |T_i| != 1
                #
                else:
                    if idx_first == 0:
                        #
                        # THE FVI && The FIRST
                        #
                        if idx_count == len(l_rev_l_t3_mapping_TB_X) - 1:
                            tc_code_etc.tc_gen_code_write_line(
                                f, 1,
                                "int idx_" + each_idx + " \t= " + str_remainning + ";")
                        #
                        # NOT THE FVI && The FIRST
                        #
                        else:
                            tc_code_etc.tc_gen_code_write_line(
                                f, 1,
                                "int idx_" + each_idx + " \t= " + str_remainning + " / " + l_rev_l_stride_TB_X[idx_count - 1] + ";")
                            str_remainning = str_remainning + " % " + l_rev_l_stride_TB_X[idx_count - 1]
                        idx_first = 1
                    else:
                        if idx_second == 0:
                            tc_code_etc.tc_gen_code_write_line(
                                f, 1,
                                "int tmp_remainning \t= " + str_remainning + ";")
                            idx_second = 1
                        else:
                            tc_code_etc.tc_gen_code_write_line(
                                f, 1,
                                "tmp_remainning \t= " + str_remainning + ";")
                        #
                        if idx_count == len(l_rev_l_t3_mapping_TB_X) - 1:
                            tc_code_etc.tc_gen_code_write_line(
                                f, 1, "int idx_" + each_idx + " = tmp_remainning ;")
                        else:
                            tc_code_etc.tc_gen_code_write_line(
                                f, 1,
                                "int idx_" + each_idx + " = tmp_remainning / " + l_rev_l_stride_TB_X[idx_count - 1] + ";")
                #
                idx_count = idx_count + 1
        #
        if numIdxTB_Y == 2:
            f.write("\tint idx_" + l_t3_mapping_tb_2D[1][0] +
                    " \t= threadIdx.y % SIZE_SLICE_" + str(idx_kernel) + "_" +
                    l_t3_mapping_tb_2D[1][0].capitalize() + ";\n")
            f.write("\tint idx_" + l_t3_mapping_tb_2D[1][1] +
                    " \t= threadIdx.y / SIZE_SLICE_" + str(idx_kernel) + "_" +
                    l_t3_mapping_tb_2D[1][0].capitalize() + ";\n")
        elif numIdxTB_Y == 1:
            f.write("\tint idx_" + l_t3_mapping_tb_2D[1][0] + " \t= threadIdx.y;\n")
        else:
            print("[ERROR]!!! The number of indices mapped on TB_Y: ", numIdxTB_Y, " (Not Supported Yet)")
            print("[ERROR]!!! TB_Y: ", l_t3_mapping_tb_2D[1])
        f.write("\n")
        # block ranges for t3 (in order): read from the pre-computed array.
        idx_count = 0
        for t3_idx in l_t3_idx:
            f.write("\tint rng_" + t3_idx + " \t= dev_t3_block_range_" + str(idx_kernel) + "[blockIdx.x * NUM_INDEX + " + str(idx_count) + "];\n")
            idx_count = idx_count + 1
        f.write("\n")
    #
    # Register-Tile
    # : Depends on SIZE_REG_T
    #
    if opt_data_type == "DOUBLE":
        f.write("\tdouble temp_av;\n")
        if size_reg_y >= size_reg_x:
            f.write("\tdouble temp_bv[" + str(size_reg_y) + "];\n")  # min(size_reg_y, size_reg_x), basically
            f.write("\tdouble reg_tile[" + str(size_reg_y) + "][" + str(size_reg_x) + "];\n")
        else:
            f.write("\tdouble temp_bv[" + str(size_reg_x) + "];\n")  # min(size_reg_y, size_reg_x), basically
            f.write("\tdouble reg_tile[" + str(size_reg_y) + "][" + str(size_reg_x) + "];\n")
    else:
        f.write("\tfloat temp_av;\n")
        if size_reg_y >= size_reg_x:
            f.write("\tfloat temp_bv[" + str(size_reg_y) + "];\n")  # min(size_reg_y, size_reg_x), basically
            f.write("\tfloat reg_tile[" + str(size_reg_y) + "][" + str(size_reg_x) + "];\n")
        else:
            f.write("\tfloat temp_bv[" + str(size_reg_x) + "];\n")  # min(size_reg_y, size_reg_x), basically
            f.write("\tfloat reg_tile[" + str(size_reg_y) + "][" + str(size_reg_x) + "];\n")
    f.write("\n")
    # Initializing reg_tile[][]
    f.write("\tfor (int i = 0; i < " + str(size_reg_y) + "; i++)\n")
    f.write("\tfor (int j = 0; j < " + str(size_reg_x) + "; j++)\n")
    f.write("\treg_tile[i][j] = 0.0;\n")
    f.write("\n")
def tc_gen_code(tmp_count, inner_groups):
    """Emit one complete CUDA source file ("temp_<tmp_count>.cu") covering
    all given inner-groups.

    Each inner-group is an indexable record; from the call sites below,
    slot [6] holds the input-tensor info, [8] the t3 slice sizes, and
    [1]/[2] the thread-block / register-tile mappings — the meaning of the
    remaining slots is not visible here; TODO confirm against the producer.

    NOTE(review): l_device_dynamic, l_cuda_malloc, l_host_dynamic and
    l_blk_boundary_rng are read in this function but never bound locally —
    presumably module-level globals (or a latent NameError); verify.
    """
    #
    # This is for calculating cost function based on a given input.
    #
    output_name = "temp"  # Depends on # of groups.
    #
    # FILE OPEN: A LIST of Inner-Groups is for A SINGLE CUDA File.
    # : Each Inner-Group has multiple Tensor Contractions which will be fused.
    # : Thus, each Inner-Group is for A SINGLE KERNEL.
    #
    f = open(output_name + "_" + str(tmp_count) + ".cu", "w")
    #
    # Per Inner-Groups,
    #
    tc_code_include.tc_code_include(f)
    #tc_code_etc.tc_gen_global_methods(f, len(inner_groups))
    #
    # Per-kernel accumulators, one entry per inner-group.
    l_combined_opt_diffs = list()
    l_combined_opt_gen_fulls = list()
    l_combined_opt_gen_internal = list()
    l_combined_input_tensors = list()
    l_combined_t3_slices_size = list()
    l_combined_mappings = list()
    l_combined_t3_d_decl_var = list()
    l_combined_t2_d_decl_var = list()
    l_combined_v2_d_decl_var = list()
    l_combined_t3_parameters = list()
    l_combined_t3_parameters_f = list()
    l_combined_t3_parameters_nf = list()
    l_combined_t2_parameters = list()
    l_combined_t2_parameters_f = list()
    l_combined_t2_parameters_nf = list()
    l_combined_v2_parameters = list()
    l_combined_v2_parameters_f = list()
    l_combined_v2_parameters_nf = list()
    l_combined_device_dynamic = list()  # NOTE(review): never appended to below
    l_combined_host_dynamic = list()    # NOTE(review): never appended to below
    l_combined_cuda_malloc = list()     # NOTE(review): never appended to below
    #
    # To Support Multiple Inner-Groups
    #
    for each_inner_group in inner_groups:
        l_combined_input_tensors.append(each_inner_group[6])
        l_combined_t3_slices_size.append(each_inner_group[8])
        l_combined_mappings.append([each_inner_group[1], each_inner_group[2]])
    #
    # File-level #define section (slice sizes, mappings). Note slot [4] is
    # passed twice — presumably intentional; verify against tc_gen_definition.
    tc_code_define.tc_gen_definition(f, l_combined_t3_slices_size,
                                     inner_groups[0][3], inner_groups[0][4],
                                     l_combined_mappings, inner_groups[0][4],
                                     inner_groups[0][5],
                                     l_combined_input_tensors)
    #
    # Check Types
    #
    idx_kernel = 1
    for each_inner_group in inner_groups:
        opt_gen_full, opt_gen_p7, possible_diff = tc_helper.tc_gen_helper_CheckingTypes(
            each_inner_group[3], each_inner_group[8], each_inner_group[4])
        # possible_diff, opt_gen_full, and opt_gen_p7 are created at here.
        # possible_diff is deliberately forced to -1 here (original value kept
        # only for the diagnostic print below).
        #l_combined_opt_diffs.append(possible_diff)
        l_combined_opt_diffs.append(-1)
        l_combined_opt_gen_fulls.append(opt_gen_full)
        l_combined_opt_gen_internal.append(opt_gen_p7)
        print("[Code Generator][tc_gen_code] Kernel #", idx_kernel,
              ">>> opt_diff:", possible_diff, "(but -1), opt_gen_full:",
              opt_gen_full, ", opt_gen_p7:", opt_gen_p7)
        idx_kernel = idx_kernel + 1
    #
    # Code: Global Variables (Common)
    #
    tc_code_globalvar.tc_gen_global_variables_common(f)
    #
    idx_kernel = 0
    for each_inner_group in inner_groups:
        #
        # Out-parameters filled in by tc_gen_global_variables for this kernel.
        l_t3_d_decl_var = list()
        l_t2_d_decl_var = list()
        l_v2_d_decl_var = list()
        l_t3_parameters = list()
        l_t2_parameters = list()
        l_v2_parameters = list()
        l_t3_parameters_nf = list()
        l_t2_parameters_nf = list()
        l_v2_parameters_nf = list()
        l_t3_parameters_f = list()
        l_t2_parameters_f = list()
        l_v2_parameters_f = list()
        #
        # Code: Global Variables (Tensor Contractions)
        #
        tc_code_globalvar.tc_gen_global_variables(
            f, each_inner_group[6], each_inner_group[4], each_inner_group[5],
            l_t3_d_decl_var, l_t3_parameters, l_t3_parameters_nf,
            l_t2_parameters_nf, l_v2_parameters_nf, l_t3_parameters_f,
            l_t2_parameters_f, l_v2_parameters_f, l_device_dynamic,
            l_t2_d_decl_var, l_v2_d_decl_var, l_t2_parameters, l_v2_parameters,
            l_cuda_malloc, l_combined_opt_diffs[idx_kernel], idx_kernel + 1)
        #
        l_combined_t3_d_decl_var.append(l_t3_d_decl_var)
        l_combined_t2_d_decl_var.append(l_t2_d_decl_var)
        l_combined_v2_d_decl_var.append(l_v2_d_decl_var)
        l_combined_t3_parameters.append(l_t3_parameters)
        l_combined_t2_parameters.append(l_t2_parameters)
        l_combined_v2_parameters.append(l_v2_parameters)
        l_combined_t3_parameters_f.append(l_t3_parameters_f)
        l_combined_t2_parameters_f.append(l_t2_parameters_f)
        l_combined_v2_parameters_f.append(l_v2_parameters_f)
        l_combined_t3_parameters_nf.append(l_t3_parameters_nf)
        l_combined_t2_parameters_nf.append(l_t2_parameters_nf)
        l_combined_v2_parameters_nf.append(l_v2_parameters_nf)
        #
        idx_kernel = idx_kernel + 1
    #
    # Code: SD2 Functions
    #
    idx_kernel = 1
    for each_inner_group in inner_groups:
        tc_pre_SD2_Functions.tc_gen_code_pre_SD2_Functions(
            f, each_inner_group[4], each_inner_group[5], each_inner_group[6],
            l_host_dynamic, idx_kernel)
        idx_kernel = idx_kernel + 1
    #
    # Code: CUDA Malloc | Memcpy
    # NOTE(review): uses l_t3/t2/v2_parameters left over from the LAST
    # iteration of the loop above — confirm this is intended.
    #
    tc_pre_CUDA_Malloc.tc_gen_code_pre_CUDA_Malloc(
        f, l_cuda_malloc, l_t3_parameters, l_t2_parameters, l_v2_parameters,
        l_device_dynamic, inner_groups[0][5])
    #
    # Code: Pre-Computed Arrays
    #
    idx_kernel = 1
    for each_inner_group in inner_groups:
        tc_pre_IndirectArray.tc_gen_code_pre_IndirectArray(
            f, each_inner_group[4], each_inner_group[0], each_inner_group[6],
            each_inner_group[2], each_inner_group[4], each_inner_group[5],
            l_host_dynamic, l_combined_opt_diffs[idx_kernel - 1], idx_kernel)
        idx_kernel = idx_kernel + 1
    #
    # Code: BasicBlock
    #
    idx_kernel = 1
    for each_inner_group in inner_groups:
        tc_pre_BasicBlock.tc_gen_code_pre_BasicBlock(
            f, each_inner_group[4], each_inner_group[3], each_inner_group[8],
            l_host_dynamic, each_inner_group[4], each_inner_group[5],
            l_combined_opt_diffs[idx_kernel - 1], idx_kernel)
        idx_kernel = idx_kernel + 1
    #
    print("[Code Generator][tc_gen_code] # of Inner-Groups: ",
          len(inner_groups))
    #
    # Each Inner-Group Corresponds to A Kernel.
    #
    idx_kernel = 1
    for an_inner_group in inner_groups:
        print("[Code Generator][tc_gen_code] Creating Kernel --- #",
              idx_kernel)
        #
        kernel_name = "kernel_ccsdT_" + str(
            idx_kernel)  # Depends on # of groups.
        # Options for Each Kernel
        possible_diff = l_combined_opt_diffs[idx_kernel - 1]
        opt_gen_full = l_combined_opt_gen_fulls[idx_kernel - 1]
        opt_gen_p7 = l_combined_opt_gen_internal[idx_kernel - 1]
        #
        # To-Do: check if boundaries are needed by all tensor contractions, or not.
        #
        if opt_gen_full != -1:
            # Inputs: l_blk_boundary_rng, l_idx_size, l_t3_slices, l_t3_mapping_reg, l_t3_mapping_tb_2D, info_left_index, info_right_index
            # Outputs: l_blk_boundary_rng
            tc_helper.tc_gen_helper_CheckingBoundary(
                l_blk_boundary_rng, an_inner_group[3], an_inner_group[8],
                an_inner_group[2], an_inner_group[1],
                an_inner_group[6][0][0][1], an_inner_group[6][0][1][1])
            print(">>> Boundaries for External Indices: ", l_blk_boundary_rng)
        # (....)
        #possible_diff = -1 # (To-Do: )
        #
        # Inputs: l_input_tensors, l_internal_idx, l_external_idx, l_t3_slices, l_t3_mapping_reg,
        # Outputs: int_size_sm_a, int_size_sm_b, int_str_t2, int_str_v2
        # NOTE(review): these four are recomputed per tensor-contraction, so
        # only the LAST each_tc's values survive the loop — confirm intended.
        #
        for each_tc in an_inner_group[6]:
            #
            int_size_sm_a = 1
            int_size_sm_b = 1
            int_str_t2 = 1
            int_str_v2 = 1
            #
            bool_found = 1  # NOTE(review): assigned but never read
            for each_idx in each_tc[0][1]:
                # for size_sm_a
                if tc_helper.tc_gen_helper_find_1d(an_inner_group[5],
                                                   each_idx) == -1:
                    int_size_sm_a = int_size_sm_a * tc_helper.tc_gen_helper_find(
                        an_inner_group[8], each_idx)
                # for str_str_t2
                if tc_helper.tc_gen_helper_find_1d(
                        an_inner_group[4],
                        each_idx) != -1:  # external indices (all)
                    if tc_helper.tc_gen_helper_find_1d(
                            an_inner_group[2], each_idx
                    ) == -1:  # external indices mapped on ! regiter tiling
                        int_str_t2 = int_str_t2 * tc_helper.tc_gen_helper_find(
                            an_inner_group[8], each_idx)
            bool_found = 1
            for each_idx in each_tc[1][1]:
                # for size_sm_b
                if tc_helper.tc_gen_helper_find_1d(an_inner_group[5],
                                                   each_idx) == -1:
                    int_size_sm_b = int_size_sm_b * tc_helper.tc_gen_helper_find(
                        an_inner_group[8], each_idx)
                # for str_str_v2
                if tc_helper.tc_gen_helper_find_1d(
                        an_inner_group[4],
                        each_idx) != -1:  # external indices (all)
                    if tc_helper.tc_gen_helper_find_1d(
                            an_inner_group[2], each_idx
                    ) == -1:  # external indices mapped on ! register tiling
                        int_str_v2 = int_str_v2 * tc_helper.tc_gen_helper_find(
                            an_inner_group[8], each_idx)
        #
        # Inputs: l_t3_mapping_tb_2D, l_t3_slices
        # Outputs: int_size_tb_x, int_size_tb_y
        #
        int_size_tb_x = 1
        int_size_tb_y = 1
        for each_idx in an_inner_group[1][0]:  # "x"-axis
            int_size_tb_x = int_size_tb_x * tc_helper.tc_gen_helper_find(
                an_inner_group[8], each_idx)
        for each_idx in an_inner_group[1][1]:  # "y"-axis_idx
            int_size_tb_y = int_size_tb_y * tc_helper.tc_gen_helper_find(
                an_inner_group[8], each_idx)
        # the below information should be produced by "tc_gen_input()"
        size_sm_a = int_size_sm_a  #+ 1 (padding)
        size_sm_b = int_size_sm_b  #+ 1
        size_tb_x = int_size_tb_x
        size_tb_y = int_size_tb_y
        size_sm_p7 = tc_helper.tc_gen_helper_CheckingIntUnit(
            an_inner_group[4], an_inner_group[8], an_inner_group[5])
        size_reg_y = tc_helper.tc_gen_helper_find(an_inner_group[8],
                                                  an_inner_group[2][1])
        size_reg_x = tc_helper.tc_gen_helper_find(an_inner_group[8],
                                                  an_inner_group[2][0])
        #
        # Options for two inputs
        # Inputs: l_input_tensors, l_internal_idx
        #
        opt_load_t2, opt_load_v2 = tc_helper.tc_gen_helper_CheckingInternalFVI(
            an_inner_group[6], an_inner_group[5])
        #
        # Constraints
        # Inputs: f, size_tb_x, size_tb_y, size_sm_a, size_sm_b, size_sm_p7
        #
        tc_gen_Constraints(f, size_tb_x, size_tb_y, size_sm_a, size_sm_b,
                           size_sm_p7)
        #
        l_t3_d_decl_var = l_combined_t3_d_decl_var[idx_kernel - 1]
        l_t2_d_decl_var = l_combined_t2_d_decl_var[idx_kernel - 1]
        l_v2_d_decl_var = l_combined_v2_d_decl_var[idx_kernel - 1]
        l_t3_parameters = l_combined_t3_parameters[idx_kernel - 1]
        l_t2_parameters = l_combined_t2_parameters[idx_kernel - 1]
        l_v2_parameters = l_combined_v2_parameters[idx_kernel - 1]
        l_t3_parameters_f = l_combined_t3_parameters_f[idx_kernel - 1]
        l_t2_parameters_f = l_combined_t2_parameters_f[idx_kernel - 1]
        l_v2_parameters_f = l_combined_v2_parameters_f[idx_kernel - 1]
        l_t3_parameters_nf = l_combined_t3_parameters_nf[idx_kernel - 1]
        l_t2_parameters_nf = l_combined_t2_parameters_nf[idx_kernel - 1]
        #
        # >>> Create Kernels <<<
        # When possible_diff != -1 both the partial-tile kernel and an extra
        # "_full" variant (opt_gen_full forced to -1) are emitted.
        #
        if possible_diff == -1:
            tc_code_kernel.tc_gen_code_Kernel(
                f, kernel_name, l_t3_d_decl_var, l_t2_d_decl_var,
                l_v2_d_decl_var, an_inner_group[7], an_inner_group[1],
                an_inner_group[2], an_inner_group[4], an_inner_group[5],
                an_inner_group[8], size_sm_a, size_sm_b, size_sm_p7,
                size_reg_y, size_reg_x, size_tb_y, size_tb_x, int_str_t2,
                int_str_v2, l_blk_boundary_rng, opt_gen_p7, opt_gen_full,
                opt_load_t2, opt_load_v2, idx_kernel)
        else:
            tc_code_kernel.tc_gen_code_Kernel(
                f, kernel_name, l_t3_d_decl_var, l_t2_d_decl_var,
                l_v2_d_decl_var, an_inner_group[7], an_inner_group[1],
                an_inner_group[2], an_inner_group[4], an_inner_group[5],
                an_inner_group[8], size_sm_a, size_sm_b, size_sm_p7,
                size_reg_y, size_reg_x, size_tb_y, size_tb_x, int_str_t2,
                int_str_v2, l_blk_boundary_rng, opt_gen_p7, opt_gen_full,
                opt_load_t2, opt_load_v2, idx_kernel)
            opt_gen_full = -1  # 1 or -1
            tc_code_kernel.tc_gen_code_Kernel(
                f, kernel_name + "_full", l_t3_d_decl_var, l_t2_d_decl_var,
                l_v2_d_decl_var, an_inner_group[7], an_inner_group[1],
                an_inner_group[2], an_inner_group[4], an_inner_group[5],
                an_inner_group[8], size_sm_a, size_sm_b, size_sm_p7,
                size_reg_y, size_reg_x, size_tb_y, size_tb_x, int_str_t2,
                int_str_v2, l_blk_boundary_rng, opt_gen_p7, opt_gen_full,
                opt_load_t2, opt_load_v2, idx_kernel)
        #
        idx_kernel = idx_kernel + 1
    #
    # Code: Function to Call Kernels.
    #
    kernel_name = "kernel_ccsdT"
    tc_code_etc.tc_gen_code_fusedKernels(
        f, kernel_name, l_combined_t3_parameters, l_combined_t2_parameters,
        l_combined_v2_parameters, l_combined_t3_parameters_nf,
        l_combined_t2_parameters_nf, l_combined_v2_parameters_nf,
        l_combined_t3_parameters_f, l_combined_t2_parameters_f,
        l_combined_v2_parameters_f, l_combined_opt_diffs, len(inner_groups))
    #
    # Code: Function for Correctness Check.
    # NOTE(review): an_inner_group here is the LAST inner-group from the loop
    # above — confirm the correctness check is meant to use only that one.
    #
    tc_post_Correctness.tc_gen_code_Post_Correctness(f, an_inner_group[4],
                                                     an_inner_group[5],
                                                     l_combined_input_tensors)
    #
    # Code: Delete Device Memory Allocated Dynamically.
    #
    tc_post_HostDevice_Free.tc_gen_code_post_CUDA_Free(f, l_cuda_malloc)
    #
    # Code: Delete Host Memory Allocated Dynamically.
    #
    tc_post_HostDevice_Free.tc_gen_code_post_HostFree(f, l_host_dynamic)
    #
    # Code: Main Function
    #
    f.write("// # of Inner-Groups: " + str(len(inner_groups)) + "\n")
    tc_code_etc.tc_gen_code_main(f, len(inner_groups))
    # FILE CLOSE
    f.close()
def tc_gen_perms_exclusive_TB_X(list_sizes_TB, list_given_output_tensor,
                                list_given_input_tensor_left,
                                list_given_input_tensor_right,
                                list_internal_indices,
                                list_representative_problem_size, list_TB_K,
                                list_TB_X, list_REG_X, list_REG_Y,
                                list_inherited_Tile_Sizes,
                                list_CLASS_configuration, opt_print):
    """Enumerate TB_X mappings for each candidate |TB_X| in list_sizes_TB.

    For every candidate thread-block-x size, external left-tensor indices not
    already claimed by REG_X/REG_Y/TB_X are greedily folded into TB_X (with a
    split tile when the accumulated volume overshoots), and each successful
    mapping recurses into tc_gen_perms_exclusive_TB_Y with the extended
    duplicated_TB_X / duplicated_Tile_Sizes.
    """
    #
    # Count external vs internal indices of the LEFT input tensor.
    # NOTE(review): the loop variable is named each_right_idx but iterates
    # list_given_input_tensor_left — name only; behavior unaffected.
    #
    num_ext_idx = 0
    num_int_idx = 0
    for each_right_idx in list_given_input_tensor_left:
        if tc_helper.tc_gen_helper_find_1d(list_internal_indices,
                                           each_right_idx) == -1:
            num_ext_idx += 1
        else:
            num_int_idx += 1
    #
    len_tensor_left = len(list_given_input_tensor_left)
    if opt_print == 1:
        print ("========================================== [Enumerations-TB_X] ===================================================")
        print ("========================================== [Exclusive] [START] ===================================================")
        print ("Tensor (LEFT): ", list_given_input_tensor_left)
        print ("len(LEFT): ", len_tensor_left, ", # of External Indices: ", num_ext_idx, ", # of Internal Indices: ", num_int_idx)
        print ("list_representative_problem_size: ", list_representative_problem_size)
        print ("Given Tile-Sizes: ", list_inherited_Tile_Sizes)
        print ("Given REG_X: ", list_REG_X)
        print ("Given REG_Y: ", list_REG_Y)
        print ("Given TB_X: ", list_TB_X)
        print ("========================================== [Exclusive] [END] ===================================================")
    #
    # Try each candidate |TB_X|.
    #
    for size_TB_X in list_sizes_TB:
        if opt_print == 1:
            print ("|TB_X| = ", size_TB_X)
        #
        # Assumption: Input Tensor whose index is the FVI in the Output will be related to REG_X and TB_X.
        #
        # NOTE(review): TB_X_Vol/TB_X_Vol_Prev start at -1 and are later
        # multiplied; if the FVI branch below never fires they stay negative
        # — presumably list_TB_X[0] is always a left-tensor index; verify.
        TB_X_Vol = -1
        TB_X_Vol_Prev = -1
        done_mapping_TB_X = -1
        # Deep copies so each candidate size extends its own mapping.
        duplicated_TB_X = copy.deepcopy(list_TB_X)
        duplicated_Tile_Sizes = copy.deepcopy(list_inherited_Tile_Sizes)
        #
        # Handling the FVI (Default)
        #
        for each_left_idx in list_given_input_tensor_left:
            if each_left_idx == list_TB_X[0]:
                size_FVI = tc_helper.tc_gen_helper_find(list_representative_problem_size, each_left_idx)
                #
                # Need to Split
                #
                if size_FVI > size_TB_X:
                    duplicated_Tile_Sizes.append([each_left_idx, size_TB_X])
                    TB_X_Vol = size_TB_X
                    TB_X_Vol_Prev = size_TB_X
                    done_mapping_TB_X = 1
                #
                # No Need to Split (Fitted)
                #
                elif size_FVI == size_TB_X:
                    duplicated_Tile_Sizes.append([each_left_idx, size_TB_X])
                    TB_X_Vol = size_TB_X
                    TB_X_Vol_Prev = size_TB_X
                    done_mapping_TB_X = 1
                #
                # No Need to Split
                #
                else:
                    duplicated_Tile_Sizes.append([each_left_idx, size_FVI])
                    TB_X_Vol = size_FVI
                    TB_X_Vol_Prev = size_FVI
        #
        # Fold the remaining eligible external indices into TB_X.
        #
        for each_left_idx in list_given_input_tensor_left:
            #
            # #1. Internal Index
            #
            if tc_helper.tc_gen_helper_find_1d(list_internal_indices, each_left_idx) != -1:
                continue
            #
            # #2. Indices Mapped on REG_X
            #
            if tc_helper.tc_gen_helper_find_1d(list_REG_X, each_left_idx) != -1:
                continue
            #
            # #3. (Just In Case) Indices Mapped on REG_Y
            #
            if tc_helper.tc_gen_helper_find_1d(list_REG_Y, each_left_idx) != -1:
                continue
            #
            # #4. Indiced Mapped on TB_X (Should Use list_TB_X (passed by))
            #
            if tc_helper.tc_gen_helper_find_1d(list_TB_X, each_left_idx) != -1:
                continue
            #
            # |TB_X'|
            #
            TB_X_Vol *= tc_helper.tc_gen_helper_find(list_representative_problem_size, each_left_idx)
            #
            # |TB_X'| >= |TB_X|
            #
            if TB_X_Vol >= size_TB_X:
                #
                # |TB_X'| > |TB_X|: split this index so the product hits
                # size_TB_X exactly (integer truncation of the quotient).
                #
                if TB_X_Vol > size_TB_X:
                    #
                    #
                    #
                    if done_mapping_TB_X == -1:
                        blocking_tile_size = size_TB_X / TB_X_Vol_Prev
                        duplicated_TB_X.append(each_left_idx)
                        duplicated_Tile_Sizes.append([each_left_idx, int(blocking_tile_size)])
                        done_mapping_TB_X = 1
                    else:
                        duplicated_TB_X.append(each_left_idx)
                        duplicated_Tile_Sizes.append([each_left_idx, 1])
                #
                # |TB_X'| = |TB_X|
                #
                else:
                    #
                    #
                    #
                    if done_mapping_TB_X == -1:
                        duplicated_TB_X.append(each_left_idx)
                        duplicated_Tile_Sizes.append([each_left_idx, tc_helper.tc_gen_helper_find(list_representative_problem_size, each_left_idx)])
                    else:
                        duplicated_TB_X.append(each_left_idx)
                        duplicated_Tile_Sizes.append([each_left_idx, 1])
            #
            # |TB_X'| < |TB_X|: take the whole index and keep folding.
            #
            else:
                duplicated_TB_X.append(each_left_idx)
                duplicated_Tile_Sizes.append([each_left_idx, tc_helper.tc_gen_helper_find(list_representative_problem_size, each_left_idx)])
            #
            # Running volume BEFORE the next index is folded in.
            #
            TB_X_Vol_Prev *= tc_helper.tc_gen_helper_find(list_representative_problem_size, each_left_idx)
        #
        # Recurse only when TB_X was fully mapped for this candidate size.
        #
        if done_mapping_TB_X == 1:
            tc_gen_perms_exclusive_TB_Y(list_sizes_TB, list_given_output_tensor, list_given_input_tensor_left, list_given_input_tensor_right, list_internal_indices, list_representative_problem_size, list_TB_K, duplicated_TB_X, list_REG_X, list_REG_Y, duplicated_Tile_Sizes, list_CLASS_configuration, opt_print)
def model_predictive_modeling(list_configurations):
    """Annotate every configuration with its GEMM-like problem extents and
    an estimated register count, then return a placeholder cost value.

    For each configuration this sets:
      .m  -- product of the representative sizes of tensor-A indices that
             are NOT contraction (TB_K) indices
      .n  -- same for tensor-B indices
      .k  -- product of the representative sizes of the TB_K indices
      .num_Estimated_Registers -- |REG_X tile| * |REG_Y tile| * 2

    Returns 1000 (placeholder; kernel-efficiency modeling not implemented).
    """
    # architectures: p100 (sm_60), v100 (sm_70)
    cuda_arch = "sm_70"
    # Pipeline sketch (not all stages are modeled below):
    #   0. initialize indices and register tiles
    #   1. load inputs A and B          (main loop)
    #   2. compute
    #   3. store output
    banner = "=========[model_predictive_modeling]======================================================================="
    print(banner)
    for cfg in list_configurations:
        # (1) m, n, k from the given representative problem size — this is
        #     based on the equation itself, not on a single configuration.
        vol_m = 1
        for idx_name in cfg.list_tensor_A:
            if tc_helper.tc_gen_helper_find_1d(cfg.list_TB_K, idx_name) == -1:
                vol_m = vol_m * tc_helper.tc_gen_helper_find(
                    cfg.list_representative_problem_size, idx_name)
        cfg.m = vol_m

        vol_n = 1
        for idx_name in cfg.list_tensor_B:
            if tc_helper.tc_gen_helper_find_1d(cfg.list_TB_K, idx_name) == -1:
                vol_n = vol_n * tc_helper.tc_gen_helper_find(
                    cfg.list_representative_problem_size, idx_name)
        cfg.n = vol_n

        vol_k = 1
        for idx_name in cfg.list_TB_K:
            vol_k = vol_k * tc_helper.tc_gen_helper_find(
                cfg.list_representative_problem_size, idx_name)
        cfg.k = vol_k

        # (2) the number of thread blocks is calculated in the cost-model.

        # (3) estimated number of registers in a thread block
        #     (num_base kept from the original calibration notes; unused)
        num_base = 20
        reg_tile_x = 1
        reg_tile_y = 1
        for idx_name in cfg.list_REG_X:
            reg_tile_x = reg_tile_x * tc_helper.tc_gen_helper_find(
                cfg.list_tile_sizes, idx_name)
        for idx_name in cfg.list_REG_Y:
            reg_tile_y = reg_tile_y * tc_helper.tc_gen_helper_find(
                cfg.list_tile_sizes, idx_name)
        cfg.num_Estimated_Registers = reg_tile_x * reg_tile_y * 2
        # (4) kernel efficiencies -- not modeled here.
    print(banner)
    return 1000
def tc_gen_perms_exclusive_REG_X(list_sizes_REG, list_sizes_TB,
                                 list_given_output_tensor,
                                 list_given_input_tensor_left,
                                 list_given_input_tensor_right,
                                 list_internal_indices,
                                 list_representative_problem_size, list_TB_K,
                                 list_TB_X, list_CLASS_configuration,
                                 opt_print):
    """Enumerate REG_X mappings for each candidate |REG_X| in list_sizes_REG.

    For every candidate register-tile size and every starting position in
    the left input tensor, consecutive eligible external indices are folded
    into REG_X until the accumulated volume reaches the candidate size
    (splitting the last index when it overshoots); each completed mapping
    recurses into tc_gen_perms_exclusive_REG_Y with the resulting
    list_REG_X / list_Tile_Sizes.
    """
    #
    # Count external vs internal indices of the LEFT input tensor
    # (diagnostics only).
    #
    num_ext_idx = 0
    num_int_idx = 0
    for each_left_idx in list_given_input_tensor_left:
        if tc_helper.tc_gen_helper_find_1d(list_internal_indices, each_left_idx) == -1:
            num_ext_idx += 1
        else:
            num_int_idx += 1
    #
    len_tensor_left = len(list_given_input_tensor_left)
    if opt_print == 1:
        print ("========================================== [Enumerations-REG_X] ===================================================")
        print ("========================================== [Exclusive] ===================================================")
        print ("Tensor (LEFT): ", list_given_input_tensor_left)
        print ("len(LEFT): ", len_tensor_left, ", # of External Indices: ", num_ext_idx, ", # of Internal Indices: ", num_int_idx)
        print ("list_representative_problem_size: ", list_representative_problem_size)
    #
    # For Each Tile-Size for REG_X
    #
    for size_REG_X in list_sizes_REG:
        if opt_print == 1:
            print ("|REG_X| = ", size_REG_X)
        #
        # Try every starting position in the left tensor.
        #
        for start_index in range(0, len_tensor_left):
            #
            REG_X_Vol = 1
            REG_X_Vol_Prev = 1
            list_REG_X = []  # will be inherited
            list_Tile_Sizes = []  # will be inherited
            done_mapping_REG_X = -1  # not done
            #
            # Fold consecutive indices starting at start_index.
            #
            for target_index in range(start_index, len_tensor_left):
                str_start_index = list_given_input_tensor_left[target_index]
                if opt_print == 1:
                    print ("idx: ", str_start_index)
                #
                # #1. Internal Index
                #
                if tc_helper.tc_gen_helper_find_1d(list_internal_indices, str_start_index) != -1:
                    continue
                #
                # #2. The FVI in the Output Tensor
                #
                if str_start_index == list_given_output_tensor[0]:
                    continue
                if opt_print == 1:
                    print (">> idx: ", str_start_index)
                #
                # |REG_X'|
                #
                REG_X_Vol *= tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)
                if opt_print == 1:
                    print (">> idx: ", str_start_index, ", REG_X_Vol: ", REG_X_Vol)
                #
                # Candidate size reached or exceeded?
                #
                if REG_X_Vol >= size_REG_X:
                    #
                    # |REG_X'| > |REG_X|
                    #
                    if REG_X_Vol > size_REG_X:
                        #
                        # Need to Split (REG and BX)
                        # NOTE(review): size_REG_X / REG_X_Vol_Prev truncated
                        # via int() — assumes the sizes divide evenly; confirm.
                        #
                        if done_mapping_REG_X == -1:
                            blocking_tile_size = size_REG_X / REG_X_Vol_Prev
                            list_REG_X.append(str_start_index)
                            list_Tile_Sizes.append([str_start_index, int(blocking_tile_size)])
                            done_mapping_REG_X = 1
                        else:
                            list_Tile_Sizes.append([str_start_index, 1])
                    #
                    # |REG_X'| = |REG_X|
                    #
                    else:
                        #
                        #
                        #
                        if done_mapping_REG_X == -1:
                            list_REG_X.append(str_start_index)
                            list_Tile_Sizes.append([str_start_index, tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)])
                            done_mapping_REG_X = 1
                        else:
                            list_Tile_Sizes.append([str_start_index, 1])
                    #
                    # Mapping complete for this start position.
                    #
                    break
                else:
                    # Still below the target: take the whole index.
                    list_REG_X.append(str_start_index)
                    list_Tile_Sizes.append([str_start_index, tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)])
                #
                # Running volume BEFORE the next index is folded in.
                #
                REG_X_Vol_Prev *= tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)
            #
            # Recurse only when REG_X was fully mapped.
            #
            if done_mapping_REG_X == 1:
                if opt_print == 1:
                    print ("list_REG_X: ", list_REG_X)
                    print ("list_Tile_sizes: ", list_Tile_Sizes)
                tc_gen_perms_exclusive_REG_Y(list_sizes_REG, list_sizes_TB, list_given_output_tensor, list_given_input_tensor_left, list_given_input_tensor_right, list_internal_indices, list_representative_problem_size, list_TB_K, list_TB_X, list_REG_X, list_Tile_Sizes, list_CLASS_configuration, opt_print)
def tc_gen_code_Kernel_Register_Transpose(
        f, kernel_name, l_inner_groups, l_combined_t3_d_decl_var,
        l_combined_t2_d_decl_var, l_combined_v2_d_decl_var, l_t3_d_decl_var,
        l_t2_d_decl_var, l_v2_d_decl_var, opt_gen_p7, opt_gen_full):
    """Write one fused CUDA kernel ("<kernel_name>_rt") that merges all
    inner-groups via register transpose.

    Per inner-group it emits the contraction loops (loads, cross-product
    computes, __syncthreads), inserts a register-transpose stage between
    consecutive inner-groups, and stores results to global memory after the
    last one.

    NOTE(review): l_t3_d_decl_var / l_t2_d_decl_var / l_v2_d_decl_var are
    accepted but never used in this function — confirm they can be dropped
    at the call sites.
    """
    #
    # [1] Header (To-Do: Strides for Non-FVI of Internal Index)
    #
    tc_gen_code_Kernel_Head_RT(f, kernel_name + "_rt",
                               l_combined_t3_d_decl_var,
                               l_combined_t2_d_decl_var,
                               l_combined_v2_d_decl_var)
    # Open
    f.write("{\n")
    f.write("\t// Kernel for Register Transpose\n")
    f.write("\t// " + str(len(l_inner_groups)) + " of Inner Groups will be merged by using Register Transpose\n")
    f.write("\n")
    #
    # Each Inner-Group
    #
    inner_count = 1
    temp_tb_2D = list()  # TB mapping of the FIRST inner-group, reused later
    for each_inner_group in l_inner_groups:
        #
        # For Each Inner-Group,
        #
        size_smem_left, size_smem_right, str_left, str_right = tc_interface.tc_interface_SMEM_Size(
            each_inner_group[6], each_inner_group[4], each_inner_group[5],
            each_inner_group[8], each_inner_group[2])
        size_TB_X, size_TB_Y = tc_interface.tc_interface_TB_Size(
            each_inner_group[1][0], each_inner_group[1][1],
            each_inner_group[8])
        size_smem_internal = tc_helper.tc_gen_helper_CheckingIntUnit(
            each_inner_group[4], each_inner_group[8], each_inner_group[5])
        # NOTE(review): REG_X reads mapping slot [2][1] and REG_Y slot [2][0]
        # here, which is the OPPOSITE of size_reg_x/size_reg_y in tc_gen_code
        # — confirm which orientation is intended.
        size_REG_X = tc_helper.tc_gen_helper_find(each_inner_group[8],
                                                  each_inner_group[2][1])
        size_REG_Y = tc_helper.tc_gen_helper_find(each_inner_group[8],
                                                  each_inner_group[2][0])
        opt_load_t2, opt_load_v2 = tc_helper.tc_gen_helper_CheckingInternalFVI(
            each_inner_group[6], each_inner_group[5])
        l_blk_boundary_rng = list()
        #print (inner_count, ">>> TB:", size_TB_X, size_TB_Y, ", REG: ", size_REG_X, size_REG_Y)
        #
        # Partial-tile boundaries are only needed when full tiles are not
        # guaranteed (opt_gen_full != -1).
        #
        if opt_gen_full != -1:
            tc_helper.tc_gen_helper_CheckingBoundary(
                l_blk_boundary_rng, each_inner_group[3], each_inner_group[8],
                each_inner_group[2], each_inner_group[1],
                each_inner_group[6][0][0][1], each_inner_group[6][0][1][1])
        if inner_count == 1:
            #
            # [2] Initialization
            # Q) This first information can be used for the other inner-groups?
            #
            f.write("\t// Initialization\n")
            tc_gen_code_Kernel_Initial(f, size_smem_internal, size_smem_left,
                                       size_smem_right, each_inner_group[1],
                                       each_inner_group[4], size_REG_X,
                                       size_REG_Y, opt_gen_p7, opt_gen_full,
                                       inner_count)  # "1" kernel number...
            temp_tb_2D = each_inner_group[1]
        #
        # [3] An Inner-Group
        #
        f.write("\t// Within Inner-Group\n")
        #
        # For Each Tensor Contraction,
        #
        idx_countractions = 1
        for tensor_contraction in each_inner_group[7]:
            f.write("\t// Tensor Contraction\n")
            #
            # Reset the upper bound for every contraction after the first.
            if opt_gen_p7 == 1 and (idx_countractions > 1 or inner_count > 1):
                f.write("\tinternal_upperbound = 0;\n")
            # [START] Tensor Contraction
            f.write("\t#pragma unroll 1\n")
            f.write("\tfor (int l = 0; l < size_internal; l += SIZE_INT_UNIT_" + str(inner_count) + ")\n")
            f.write("\t{\n")
            #
            # For Generalizing Internal Index,
            #
            if opt_gen_p7 == 1:
                f.write("\t\t// For Generalizing Contraction Index\n")
                f.write("\t\tinternal_offset = (l + SIZE_INT_UNIT_" + str(inner_count) + ") - size_internal;\n")
                f.write("\t\tif (internal_offset > 0) internal_upperbound = internal_offset;\n")
                f.write("\n")
            #
            # [Main] Loads Inputs >> To-Do: Need to Double-Check!
            #
            f.write("\t\t// Load Inputs\n")
            tc_gen_code_Kernel_Load_Inputs(
                f, size_TB_X, size_TB_Y, size_smem_left, size_smem_right,
                size_smem_internal,
                #
                l_blk_boundary_rng, tensor_contraction,
                #
                each_inner_group[8], each_inner_group[5], temp_tb_2D,
                each_inner_group[2],  # temp_tb_2D
                #each_inner_group[1], each_inner_group[2],
                opt_gen_full, opt_gen_p7, opt_load_t2, opt_load_v2,
                inner_count)
            #
            # [Start] Computes
            #
            f.write("\t\t// Computes: Cross-Product\n")
            f.write("\t\tfor (int ll = 0; ll < SIZE_INT_UNIT_" + str(inner_count))
            # For "Internal Index",
            if opt_gen_p7 == 1:
                f.write(" - internal_upperbound")
            f.write("; ll++)\n")
            f.write("\t\t{\n")
            #
            # [Main] Computes
            #
            f.write("\t\t\t// Computes\n")
            tc_gen_code_Kernel_Compute(f, size_REG_X, size_REG_Y, str_left,
                                       str_right, tensor_contraction)
            # [End] Computes
            f.write("\t\t}\n")
            f.write("\t\t__syncthreads();\n")
            # [END] Tensor Contraction
            f.write("\t}\n")
            f.write("\n")
            #
            idx_countractions = idx_countractions + 1
        #
        # Part: Register Transpose
        # (between this inner-group and the next one)
        #
        if inner_count < len(l_inner_groups):
            #
            f.write("\t// Register-Transpose: " + str(inner_count - 1) + " with " + str(inner_count) + "\n")
            #
            #
            #
            tc_gen_code_Kernel_Process_Register_Transpose(
                f, l_inner_groups[inner_count - 1],
                l_inner_groups[inner_count], size_smem_internal,
                size_smem_left, size_smem_right,
                size_TB_X * size_TB_Y * size_REG_X * size_REG_Y)
            f.write("\n")
        #
        # [5] Register Tiles -> Global Memory
        # (only after the LAST inner-group)
        #
        if inner_count == len(l_inner_groups):
            f.write("\t// Store the Results to Global Memory\n")
            f.write("\t// This should be based on the last inner-group\n")
            tc_gen_code_Kernel_Store_Results(
                f,
                opt_gen_full,
                #each_inner_group[1], each_inner_group[2],
                temp_tb_2D,
                each_inner_group[2],
                size_REG_X,
                size_REG_Y,
                1,
                1)  #size_REG_X, size_REG_Y, inner_count)
        inner_count = inner_count + 1
    #
    # END OF FOR: INNER-GROUP
    #
    # Close
    f.write("}\n")