def tc_gen_models_Kernels(each_configuration, idx_count, opt_print=0):
    """Record whether every tile evenly divides its representative problem size.

    Walks the configuration's (index, tile-size) pairs.  An index whose
    representative size is not a multiple of its tile size breaks the
    "full-tile" property: if that index is mapped on TB_K (internal) it
    clears ``kernel_full_int``; otherwise it clears ``kernel_full_ext``.
    The two flags are stored back on ``each_configuration``; nothing is
    returned.  ``idx_count`` is only used by callers for labelling.
    """
    if opt_print == 1:
        print ("each_config.representative_problem_size: ", each_configuration.list_representative_problem_size)
        print ("each_config.tile_sizes: ", each_configuration.list_tile_sizes)
    #
    full_ext = True
    full_int = True
    for idx_pair in each_configuration.list_tile_sizes:
        name = idx_pair[0]
        tile = idx_pair[1]
        # Is this index one of the internal (TB_K) indices?
        mapped_on_tb_k = tc_helper.tc_gen_helper_find_1d(each_configuration.list_TB_K, name) != -1
        rep_size = tc_helper.tc_gen_helper_find(each_configuration.list_representative_problem_size, name)
        if rep_size % tile != 0:
            if mapped_on_tb_k:
                full_int = False
            else:
                full_ext = False
    #
    if opt_print == 1:
        print (">>> opt_full_int: ", full_int, ", opt_full_ext: ", full_ext)
    #
    each_configuration.kernel_full_ext = full_ext
    each_configuration.kernel_full_int = full_int
def tc_gen_code_Kernel_Load_Inputs_Boundary_Exteranl_TB(
        opt_load_ext_int, l_input_tensor, l_mapping_reg, l_internal_idx,
        l_info_matching_indices):
    """Build the boundary-check expression for external indices mapped on TB.

    For every index of the input tensor that is neither internal nor
    mapped on a register, emits ``<specific-idx> < rng_<idx>`` where the
    specific index comes from ``l_info_matching_indices`` ("0" stays a
    literal zero, anything else becomes ``idx_<match>``).  The individual
    checks are joined with ``&&``; an empty string is returned when no
    external TB index exists.  (Note: the function name's ``Exteranl``
    typo is part of the public interface and is kept.)
    """
    print("[Code Generator][Load][Input][Boundary][External][TB]")
    print(
        "[Code Generator][Load][Input][Boundary][External][TB] l_input_tensor: ",
        l_input_tensor)
    #
    # External Indices Mapped on TB
    #
    boundary_terms = []
    for tensor_idx in l_input_tensor:
        # Skip internal indices.
        if tc_helper.tc_gen_helper_find_1d(l_internal_idx, tensor_idx) != -1:
            continue
        # Skip indices mapped on registers.
        if tc_helper.tc_gen_helper_find_1d(l_mapping_reg, tensor_idx) != -1:
            continue
        matched = tc_helper.tc_gen_helper_find(l_info_matching_indices, tensor_idx)
        if matched == "0":
            specific_idx = "0"
        else:
            specific_idx = "idx_" + matched
        boundary_terms.append(specific_idx + " < " + "rng_" + tensor_idx)
    #
    return " && ".join(boundary_terms)
def tc_gen_perms_exclusive_TB_X(list_sizes_TB, list_given_output_tensor, list_given_input_tensor_left, list_given_input_tensor_right, list_internal_indices, list_representative_problem_size, list_TB_K, list_TB_X, list_REG_X, list_REG_Y, list_inherited_Tile_Sizes, list_CLASS_configuration, opt_print):
    """Enumerate thread-block X-dimension (TB_X) mappings for the left input.

    For each candidate TB_X volume in ``list_sizes_TB``, greedily maps
    external indices of the left input tensor onto TB_X (splitting an index
    into a blocking tile when the accumulated volume overshoots the target)
    and, once a mapping fills the TB_X volume (``done_mapping_TB_X == 1``),
    recurses into ``tc_gen_perms_exclusive_TB_Y`` with the extended index
    list and tile sizes.  Results are accumulated downstream into
    ``list_CLASS_configuration``; nothing is returned.
    """
    #
    # Count external vs. internal indices of the left input (for reporting).
    #
    num_ext_idx = 0
    num_int_idx = 0
    for each_right_idx in list_given_input_tensor_left:
        if tc_helper.tc_gen_helper_find_1d(list_internal_indices, each_right_idx) == -1:
            num_ext_idx += 1
        else:
            num_int_idx += 1
    #
    len_tensor_left = len(list_given_input_tensor_left)
    if opt_print == 1:
        print ("========================================== [Enumerations-TB_X] ===================================================")
        print ("========================================== [Exclusive] [START] ===================================================")
        print ("Tensor (LEFT): ", list_given_input_tensor_left)
        print ("len(LEFT): ", len_tensor_left, ", # of External Indices: ", num_ext_idx, ", # of Internal Indices: ", num_int_idx)
        print ("list_representative_problem_size: ", list_representative_problem_size)
        print ("Given Tile-Sizes: ", list_inherited_Tile_Sizes)
        print ("Given REG_X: ", list_REG_X)
        print ("Given REG_Y: ", list_REG_Y)
        print ("Given TB_X: ", list_TB_X)
        print ("========================================== [Exclusive] [END] ===================================================")
    #
    # Try each candidate |TB_X|.
    #
    for size_TB_X in list_sizes_TB:
        if opt_print == 1:
            print ("|TB_X| = ", size_TB_X)
        #
        # Assumption: Input Tensor whose index is the FVI in the Output will be related to REG_X and TB_X.
        #
        TB_X_Vol = -1            # accumulated TB_X volume so far
        TB_X_Vol_Prev = -1       # volume before adding the current index (split divisor)
        done_mapping_TB_X = -1   # 1 once the TB_X volume is fully mapped
        duplicated_TB_X = copy.deepcopy(list_TB_X)
        duplicated_Tile_Sizes = copy.deepcopy(list_inherited_Tile_Sizes)
        #
        # Handling the FVI (Default)
        #
        for each_left_idx in list_given_input_tensor_left:
            if each_left_idx == list_TB_X[0]:
                size_FVI = tc_helper.tc_gen_helper_find(list_representative_problem_size, each_left_idx)
                #
                # Need to Split
                #
                if size_FVI > size_TB_X:
                    duplicated_Tile_Sizes.append([each_left_idx, size_TB_X])
                    TB_X_Vol = size_TB_X
                    TB_X_Vol_Prev = size_TB_X
                    done_mapping_TB_X = 1
                #
                # No Need to Split (Fitted)
                #
                elif size_FVI == size_TB_X:
                    duplicated_Tile_Sizes.append([each_left_idx, size_TB_X])
                    TB_X_Vol = size_TB_X
                    TB_X_Vol_Prev = size_TB_X
                    done_mapping_TB_X = 1
                #
                # No Need to Split
                #
                else:
                    duplicated_Tile_Sizes.append([each_left_idx, size_FVI])
                    TB_X_Vol = size_FVI
                    TB_X_Vol_Prev = size_FVI
        #
        # Extend TB_X with the remaining eligible external indices.
        #
        for each_left_idx in list_given_input_tensor_left:
            #
            # #1. Internal Index
            #
            if tc_helper.tc_gen_helper_find_1d(list_internal_indices, each_left_idx) != -1:
                continue
            #
            # #2. Indices Mapped on REG_X
            #
            if tc_helper.tc_gen_helper_find_1d(list_REG_X, each_left_idx) != -1:
                continue
            #
            # #3. (Just In Case) Indices Mapped on REG_Y
            #
            if tc_helper.tc_gen_helper_find_1d(list_REG_Y, each_left_idx) != -1:
                continue
            #
            # #4. Indiced Mapped on TB_X (Should Use list_TB_X (passed by))
            #
            if tc_helper.tc_gen_helper_find_1d(list_TB_X, each_left_idx) != -1:
                continue
            #
            # |TB_X'|
            #
            TB_X_Vol *= tc_helper.tc_gen_helper_find(list_representative_problem_size, each_left_idx)
            #
            # |TB_X'| >= |TB_X|
            #
            if TB_X_Vol >= size_TB_X:
                #
                # |TB_X'| > |TB_X|
                #
                if TB_X_Vol > size_TB_X:
                    # Overshoot: split this index so the product hits |TB_X| exactly.
                    if done_mapping_TB_X == -1:
                        blocking_tile_size = size_TB_X / TB_X_Vol_Prev
                        duplicated_TB_X.append(each_left_idx)
                        duplicated_Tile_Sizes.append([each_left_idx, int(blocking_tile_size)])
                        done_mapping_TB_X = 1
                    else:
                        duplicated_TB_X.append(each_left_idx)
                        duplicated_Tile_Sizes.append([each_left_idx, 1])
                #
                # |TB_X'| = |TB_X|
                #
                else:
                    # NOTE(review): unlike the analogous REG_Y routine, this exact-fit
                    # branch does not set done_mapping_TB_X = 1 (and does not break) --
                    # confirm whether that is intended.
                    if done_mapping_TB_X == -1:
                        duplicated_TB_X.append(each_left_idx)
                        duplicated_Tile_Sizes.append([each_left_idx, tc_helper.tc_gen_helper_find(list_representative_problem_size, each_left_idx)])
                    else:
                        duplicated_TB_X.append(each_left_idx)
                        duplicated_Tile_Sizes.append([each_left_idx, 1])
            #
            # |TB_X'| < |TB_X|
            #
            else:
                duplicated_TB_X.append(each_left_idx)
                duplicated_Tile_Sizes.append([each_left_idx, tc_helper.tc_gen_helper_find(list_representative_problem_size, each_left_idx)])
            #
            # Track the volume before the next index (used as split divisor).
            #
            TB_X_Vol_Prev *= tc_helper.tc_gen_helper_find(list_representative_problem_size, each_left_idx)
        #
        # Only recurse when the TB_X volume was fully mapped.
        #
        if done_mapping_TB_X == 1:
            tc_gen_perms_exclusive_TB_Y(list_sizes_TB, list_given_output_tensor, list_given_input_tensor_left, list_given_input_tensor_right, list_internal_indices, list_representative_problem_size, list_TB_K, duplicated_TB_X, list_REG_X, list_REG_Y, duplicated_Tile_Sizes, list_CLASS_configuration, opt_print)
def tc_gen_perms_exclusive_REG_Y(list_sizes_REG, list_sizes_TB, list_given_output_tensor, list_given_input_tensor_left, list_given_input_tensor_right, list_internal_indices, list_representative_problem_size, list_TB_K, list_TB_X, list_REG_X, list_inherited_Tile_Sizes, list_CLASS_configuration, opt_print):
    """Enumerate register-tile Y-dimension (REG_Y) mappings for the right input.

    For each candidate REG_Y volume and each starting position in the right
    input tensor, accumulates external indices (skipping internal indices
    and the output's fastest-varying index) until the product reaches the
    target volume, splitting the last index into a blocking tile when it
    overshoots.  Each completed mapping recurses into
    ``tc_gen_perms_exclusive_TB_X``.  Nothing is returned.
    """
    #
    # Count external vs. internal indices of the right input (for reporting).
    #
    num_ext_idx = 0
    num_int_idx = 0
    for each_right_idx in list_given_input_tensor_right:
        if tc_helper.tc_gen_helper_find_1d(list_internal_indices, each_right_idx) == -1:
            num_ext_idx += 1
        else:
            num_int_idx += 1
    #
    len_tensor_right = len(list_given_input_tensor_right)
    if opt_print == 1:
        print ("========================================== [Enumerations-REG_Y] ===================================================")
        print ("========================================== [Exclusive] ===================================================")
        print ("Tensor (LEFT): ", list_given_input_tensor_right)
        print ("len(LEFT): ", len_tensor_right, ", # of External Indices: ", num_ext_idx, ", # of Internal Indices: ", num_int_idx)
        print ("list_representative_problem_size: ", list_representative_problem_size)
        print ("Given Tile-Sizes: ", list_inherited_Tile_Sizes)
        print ("Given list_REG_X: ", list_REG_X)
    #
    # For Each Tile-Size for REG_X
    #
    for size_REG_Y in list_sizes_REG:
        if opt_print == 1:
            print ("|REG_Y| = ", size_REG_Y)
        #
        # Try every starting index position of the right tensor.
        #
        for start_index in range(0, len_tensor_right):
            #
            REG_Y_Vol = 1            # accumulated REG_Y volume so far
            REG_Y_Vol_Prev = 1       # volume before the current index (split divisor)
            list_REG_Y = []          # inherited
            duplicated_Tile_Sizes = copy.deepcopy(list_inherited_Tile_Sizes)  # inherited
            done_mapping_REG_Y = -1  # not done
            #
            # Accumulate indices from start_index to the end of the tensor.
            #
            for target_index in range(start_index, len_tensor_right):
                str_start_index = list_given_input_tensor_right[target_index]
                #
                # #1. Internal Index
                #
                if tc_helper.tc_gen_helper_find_1d(list_internal_indices, str_start_index) != -1:
                    continue
                #
                # #2. The FVI in the Output Tensor
                #
                if str_start_index == list_given_output_tensor[0]:
                    continue
                #
                # |REG_Y'|
                #
                REG_Y_Vol *= tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)
                #
                # |REG_Y'| >= |REG_Y|
                #
                if REG_Y_Vol >= size_REG_Y:
                    #
                    # |REG_Y'| > |REG_Y|
                    #
                    if REG_Y_Vol > size_REG_Y:
                        #
                        # Need to SPlit (REG and BX)
                        #
                        if done_mapping_REG_Y == -1:
                            blocking_tile_size = size_REG_Y / REG_Y_Vol_Prev
                            list_REG_Y.append(str_start_index)
                            duplicated_Tile_Sizes.append([str_start_index, int(blocking_tile_size)])
                            done_mapping_REG_Y = 1
                        else:
                            duplicated_Tile_Sizes.append([str_start_index, 1]) # ?
                    #
                    # |REG_Y'| = |REG_Y|
                    #
                    else:
                        # Exact fit: finish the mapping and stop scanning.
                        if done_mapping_REG_Y == -1:
                            list_REG_Y.append(str_start_index)
                            duplicated_Tile_Sizes.append([str_start_index, tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)])
                            done_mapping_REG_Y = 1
                        else:
                            duplicated_Tile_Sizes.append([str_start_index, 1])
                        #
                        break
                #
                # |REG_Y'| < |REG_Y|
                #
                else:
                    list_REG_Y.append(str_start_index)
                    duplicated_Tile_Sizes.append([str_start_index, tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)])
                #
                # Track the volume before the next index (used as split divisor).
                #
                REG_Y_Vol_Prev *= tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)
            #
            # Only recurse when the REG_Y volume was fully mapped.
            #
            if done_mapping_REG_Y == 1:
                tc_gen_perms_exclusive_TB_X(list_sizes_TB, list_given_output_tensor, list_given_input_tensor_left, list_given_input_tensor_right, list_internal_indices, list_representative_problem_size, list_TB_K, list_TB_X, list_REG_X, list_REG_Y, duplicated_Tile_Sizes, list_CLASS_configuration, opt_print)
def tc_gen_Inner_Group(l_outer_groups, tmp_count, tmp_config, opt_print, opt_data_type):
    """Partition the first outer-group of tensor contractions into inner-groups.

    Obtains candidate configurations via ``Configuration.get_configurations``,
    transforms the first one into per-inner-group mapping info, then checks
    each tensor contraction against two fusion constraints (register-tile
    indices must sit on different inputs; shared-memory tile sizes must match
    the expected products).  A contraction failing a constraint aborts the
    whole run via ``sys.exit()``.  Only the FIRST outer-group is processed
    (explicit ``break`` at the end of the loop).

    Returns ``(l_inner_groups, l_t3_interface_info)``.
    NOTE(review): ``each_outer_group[1]`` is mutated (fused contractions are
    popped) as a side effect — callers relying on it should be aware.
    """
    #
    if opt_print == 1:
        print ("[Code Generator][Inner-Group] Working...")
        print ("====================== Step 2: Creating Inner-Groups =======================")
        print (" Only Support the First Outer-Group")
    #
    # Permutations ([Assumption] Input: an arbitrary tensor contraction)
    #
    # # To Handle 2D Tensors is in "tc_gen_permutations()"*****
    #
    # # Cost-Model ([Verion 1.0] DRAM Data Movement, [Version 2.0] Bank Conflicts ...)
    #
    list_configurations_outer_group = list()
    Configuration.get_configurations(l_outer_groups, list_configurations_outer_group, tmp_count, tmp_config, 0, opt_data_type)
    #
    # Report every candidate configuration.
    #
    for each_config_outer_group in list_configurations_outer_group:
        each_config_outer_group.print_configuration(1)
    #
    # (Temporary, Mapping and Tile-Sizes should be Determined by Models in the future.)
    #
    info_each_inner_group = Helper_Inputs.transform_config_innergroup(list_configurations_outer_group[0])
    #
    # Every Outer-Groups:
    #
    print ("[Code Generator][Inner-Group] # of Outer-Groups: ", len(l_outer_groups))
    for each_outer_group in l_outer_groups:
        #
        print ("[Code Generator][Inner-Group] # of Tensor Contractions (Candidates) within an Outer-Group: ", len(each_outer_group[1]))
        #
        # Within an Outer-Group, there might be several Inner-Groups.
        #
        l_inner_groups = list()
        l_each_group_mapping_tb = list()
        l_each_group_mapping_2D = list()
        l_each_group_mapping_reg = list()
        l_t3_slices_size = list()
        l_t3_interface_info = list()
        l_t3_temp_inputs = list()
        l_t3_temp_conditions = list()
        #
        # To Create "Interface"
        #
        idx_count = 0
        str_common_output = ""
        for each_tc in each_outer_group[1]:
            l_t3_temp_inputs.append([each_tc[4], each_tc[6]])
            l_t3_temp_conditions.append("cond_kernel_" + str(idx_count + 1))
            if idx_count == 0:
                # The output tensor is shared by every contraction in the group.
                str_common_output = each_tc[0]
            idx_count = idx_count + 1
        #
        # Information: Split Indices
        #
        l_each_group_split_info = each_outer_group[3]
        #
        # l_interface_info: [0] All Index, [1] Output, [2] Inputs, [3] Conditions, [4] Options
        #
        l_t3_interface_info.append([each_outer_group[2], str_common_output, l_t3_temp_inputs, l_t3_temp_conditions, "opt_register_transpose", l_each_group_split_info])
        #
        # (Temporary)
        #                    [0]         [1]            [2]          [3]     [4]
        # each_manual_group: Mapping_TB, Mapping_TB_2D, Mapping_Reg, Slices, Split-Info(Repre-size)
        #
        for each_manual_group in info_each_inner_group:
            #
            l_each_group_mapping_tb = each_manual_group[0]
            l_each_group_mapping_2D = each_manual_group[1]
            l_each_group_mapping_reg = each_manual_group[2]
            l_t3_slices_size = each_manual_group[3]
            l_info_split_ext = each_manual_group[4]
            l_each_group_mapping_TB_K = each_manual_group[5]
            l_tensor_contractions = list()
            #
            print ("[Code Generator][Inner-Groups] Picked Tiles: ", l_t3_slices_size)
            #
            if opt_print == 1:
                print ("Target Outer-Group: ", each_outer_group[0])
            #
            # Fusion-Constraint #2:
            #
            promissing_left = 1
            promissing_right = 1
            all_x_axis = l_each_group_mapping_2D[0] + [l_each_group_mapping_reg[0]]
            all_y_axis = l_each_group_mapping_2D[1] + [l_each_group_mapping_reg[1]]
            #
            # X-Axis (Assumption: (Hypothetically) Left Input) including Output[0]
            #
            for each_idx in all_x_axis:
                promissing_left = promissing_left * tc_helper.tc_gen_helper_find(l_t3_slices_size, each_idx)
            #
            # Y-Axis (Assumption: (Hypothetically) Right Input)
            #
            for each_idx in all_y_axis:
                promissing_right = promissing_right * tc_helper.tc_gen_helper_find(l_t3_slices_size, each_idx)
            print ("[Code Generator][Inner-Groups] Supposed Shared Memeory Lenghts: Left >>>", promissing_left, ", Right >>>", promissing_right)
            #
            # Checking if All Tensor Contractions can be Fused or not.
            #
            l_picked_tc = list()
            idx_tensor_contraction = 0
            # Checking if |info_each_inner_group| > |each_outer_group|,
            checking_used = 1
            if len(each_outer_group[1]) == 0:
                checking_used = -1
            #
            for each_tc in each_outer_group[1]:
                #
                #print (">> each_tc: ", each_tc)
                l_input_left = each_tc[5]
                l_input_right = each_tc[7]
                #
                # Fusion-Constraint #1: Two indices for Register Tile should be on two different inputs.
                #
                # LEFT
                idx_check_reg_left = 0
                size_left = 1
                for each_left_idx in l_input_left: #each_tc[7]:
                    #
                    # Fusion-Constraint #1: Two indices for Register Tile should be on two different inputs.
                    #
                    if each_left_idx == l_each_group_mapping_reg[0]:
                        idx_check_reg_left = idx_check_reg_left + 1
                    if each_left_idx == l_each_group_mapping_reg[1]:
                        idx_check_reg_left = idx_check_reg_left + 1
                    #
                    # Fusion-Constraint #2: The Size of Shared Memeory
                    #
                    #print ("[l] each_outer_group[0]: ", each_outer_group[0])
                    if tc_helper.tc_gen_helper_find_1d(each_outer_group[0], each_left_idx) != -1:
                        #print (">> ", each_left_idx)
                        size_left = size_left * tc_helper.tc_gen_helper_find(l_t3_slices_size, each_left_idx)
                # RIGHT
                idx_check_reg_right = 0
                size_right = 1
                for each_right_idx in l_input_right: #each_tc[5]:
                    #
                    # Fusion-Constraint #1: Two indices for Register Tile should be on two different inputs.
                    #
                    if each_right_idx == l_each_group_mapping_reg[0]:
                        idx_check_reg_right = idx_check_reg_right + 1
                    if each_right_idx == l_each_group_mapping_reg[1]:
                        idx_check_reg_right = idx_check_reg_right + 1
                    #
                    # Fusion-Constraint #2: The Size of Shared Memeory
                    #
                    #print ("[r] each_outer_group[0]: ", each_outer_group[0])
                    if tc_helper.tc_gen_helper_find_1d(each_outer_group[0], each_right_idx) != -1:
                        #print (">> ", each_right_idx)
                        size_right = size_right * tc_helper.tc_gen_helper_find(l_t3_slices_size, each_right_idx)
                #
                # [Should be Fixed]
                # NOTE(review): a constraint violation hard-exits the process here
                # instead of skipping the contraction -- confirm intended.
                #
                if idx_check_reg_right != 2 and idx_check_reg_left != 2 and promissing_left == size_left and promissing_right == size_right:
                    l_picked_tc.append(idx_tensor_contraction)
                    l_tensor_contractions.append(each_tc)
                else:
                    print ("[DEBUG] promissing_left: ", promissing_left, ", promissing_right: ", promissing_right)
                    print ("[DEBUG] idx_check_reg_right: ", idx_check_reg_right, ", idx_check_reg_left: ", idx_check_reg_left, ", size_left: ", size_left, ", size_right: ", size_right)
                    sys.exit()
                #
                idx_tensor_contraction = idx_tensor_contraction + 1
            #
            # End of For-Statement
            #
            if checking_used == 1:
                #print ("added a tensor contraction to an inner-group")
                #print ("l_info_split_ext: ", l_info_split_ext)
                l_inner_groups.append([l_each_group_mapping_tb, l_each_group_mapping_2D, l_each_group_mapping_reg, l_tensor_contractions, l_t3_slices_size, l_info_split_ext, l_each_group_mapping_TB_K]) # To-Do: Should be checked in detail
                # Remove fused contractions from the candidate pool
                # (reverse order keeps the remaining pop indices valid).
                for each_tc in list(reversed(l_picked_tc)):
                    each_outer_group[1].pop(each_tc)
        #
        # For-Outer-Group
        #
        break # To-Do: (Currently) Only 1 Outer-Group.
    #
    if opt_print == -1:
        print ("============================================================================")
    #
    # Final report of the inner-groups that will be fused.
    #
    print ("===================== Step 2: [Output] Inner-Groups ========================")
    print (" Does not Support Register-Transpose")
    print (" These Tensor Contractions will be fused.")
    print (" # of Inner-Groups: ", len(l_inner_groups))
    # [l_each_group_mapping_tb, l_each_group_mapping_2D, l_each_group_mapping_reg, l_tensor_contractions, l_t3_slices_size, l_info_split_ext, l_each_group_mapping_TB_K]
    for each_inner_group in l_inner_groups:
        print ("Mapping All: ", each_inner_group[0])
        print ("Mapping TB : ", each_inner_group[1])
        print ("Mapping Reg: ", each_inner_group[2])
        print ("Mapping TB_K: ", each_inner_group[6])
        print ("Slices : ", each_inner_group[4])
        print ("Split-Slices : ", each_inner_group[5])
        print ("# of Tensor Contractions: ", len(each_inner_group[3]))
        for each_tc in each_inner_group[3]:
            print ("Each Tensor Contraction: ", each_tc)
    print ("============================================================================")
    #
    # Return Output
    #
    return l_inner_groups, l_t3_interface_info
def tc_gen_code_Kernel_Load_Inputs_Addr_Global_External(
        opt_load_ext_int, opt_inner_load_input_tb_x, opt_inner_load_input_tb_y,
        l_internal_idx, l_input_tensor, l_mapping_tb, l_mapping_reg,
        l_tile_sizes, l_info_matching_indices, opt_matching_index_fully,
        idx_kernel, each_inner_load_inst_x, each_inner_load_inst_y, size_tb_x,
        size_tb_y):
    """Build the global-memory address expression for an input tensor's
    external indices, walking the tensor's indices from fastest- to
    slowest-varying (reversed order) and nesting Horner-style
    ``(... ) * size_<idx>`` terms.

    Returns a 2-tuple: (address expression, boundary-check expression for
    the last external TB index processed).

    NOTE(review): if the input tensor has no external index mapped on TB,
    ``str_input_specific_idx`` (and ``_multi``) are never assigned and the
    return statement would raise NameError -- presumably callers guarantee
    at least one such index; confirm.
    """
    #
    print("[Code Generator][Load][Input][Addr][Global][External]")
    print(
        "[Code Generator][Load][Input][Addr][Global][External] l_input_tensor: ",
        l_input_tensor)
    print(
        "[Code Generator][Load][Input][Addr][Global][External] l_mapping_tb: ",
        l_mapping_tb)
    print(
        "[Code Generator][Load][Input][Addr][Global][External] l_mapping_reg: ",
        l_mapping_reg)
    str_input_ext_global_addr = ""
    #
    # |TB_X| -(loads)-> K
    # |TB_Y| -(loads)-> E ***
    #
    if opt_load_ext_int == -1:
        l_idx_tb = l_mapping_tb[1]
    else:
        l_idx_tb = l_mapping_tb[0]
    #
    # [Symbols] Block Index,
    #
    rev_l_input_tensor = list(reversed(l_input_tensor))
    idx_count = 0      # position in the reversed index list
    idx_base_ext = 0   # number of external indices consumed so far
    str_specific_idx = ""
    for each_idx in rev_l_input_tensor:
        #
        # Internal Index
        #
        if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) != -1:
            #print ("[int.] ", each_idx, ", idx_count: ", idx_count)
            if idx_count != 0:
                # Scale the address accumulated so far by this dimension.
                str_input_ext_global_addr = "(" + str_input_ext_global_addr + ") * size_" + each_idx
        #
        # External Index
        #
        else:
            #
            # Mapped on REG
            #
            if tc_helper.tc_gen_helper_find_1d(l_mapping_reg, each_idx) != -1:
                #print ("    >>> [ext.][REG] ", each_idx)
                if idx_base_ext == 0:
                    str_input_ext_global_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(
                        idx_kernel) + "_" + each_idx.capitalize() + " + ll"
                else:
                    str_input_ext_global_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(
                        idx_kernel
                    ) + "_" + each_idx.capitalize(
                    ) + " + ll + (" + str_input_ext_global_addr + ") * size_" + each_idx
            # Mapped on TB
            else:
                #print ("    >>> opts: tb_x: ", opt_inner_load_input_tb_x, ", tb_y: " , opt_inner_load_input_tb_y)
                #print ("    >>> [ext.][TB] ", each_idx, " >>> ", tc_helper.tc_gen_helper_find(l_info_matching_indices, each_idx))
                str_input_specific_idx_multi = ""
                #
                # |TB_X| -(loads)-> K
                # |TB_Y| -(loads)-> E ***
                #
                if opt_load_ext_int == -1:
                    if opt_inner_load_input_tb_y == 2:
                        # Offset for the second inner load instruction along TB_Y.
                        str_input_specific_idx_multi = " + " + str(
                            int(size_tb_y * each_inner_load_inst_y))
                #
                # |TB_X| -(loads)-> E ***
                # |TB_Y| -(loads)-> K
                #
                else:
                    if opt_inner_load_input_tb_x == 2:
                        # Offset for the second inner load instruction along TB_X.
                        str_input_specific_idx_multi = " + " + str(
                            int(size_tb_x * each_inner_load_inst_x))
                #print (">>> str_input_specific_idx_multi: ", str_input_specific_idx_multi)
                if tc_helper.tc_gen_helper_find(l_info_matching_indices, each_idx) == "0":
                    str_specific_idx = each_idx
                    str_input_specific_idx = tc_helper.tc_gen_helper_find(
                        l_info_matching_indices, each_idx)
                    str_input_specific_idx_multi = ""
                else:
                    str_specific_idx = each_idx
                    str_input_specific_idx = "idx_" + tc_helper.tc_gen_helper_find(
                        l_info_matching_indices, each_idx)
                #
                if idx_base_ext == 0:
                    str_input_ext_global_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(
                        idx_kernel
                    ) + "_" + each_idx.capitalize(
                    ) + " + " + str_input_specific_idx + str_input_specific_idx_multi
                else:
                    str_input_ext_global_addr = "blk_idx_" + each_idx + " * SIZE_SLICE_" + str(
                        idx_kernel
                    ) + "_" + each_idx.capitalize(
                    ) + " + " + str_input_specific_idx + str_input_specific_idx_multi + " + (" + str_input_ext_global_addr + ") * size_" + each_idx
            #
            idx_base_ext += 1
        #
        idx_count += 1
    #
    return str_input_ext_global_addr, str_input_specific_idx + str_input_specific_idx_multi + " < rng_" + str_specific_idx
def tc_gen_code_Kernel_Load_Inputs_Addr_Global_External_Matching_Index(
        opt_load_ext_int, opt_inner_load_input_tb_x, opt_inner_load_input_tb_y,
        l_input_tensor, l_mapping_tb, l_mapping_reg, l_internal_idx,
        l_tile_sizes, num_inner_inst_tb_x, num_inner_inst_tb_y, size_tb_x,
        size_tb_y):
    """Match external indices of the input tensor (mapped on TB) against the
    TB-mapped thread indices used to load them.

    Indices with tile-size 1 are matched to the literal "0"; when exactly
    one non-trivial index remains on each side, they are paired and
    ``opt_matching_index_fully`` is set to 1.  Otherwise the mismatch is
    only reported ("Miserable Case") and the flag stays -1.

    Returns ``(l_info_matching_indices, opt_matching_index_fully)`` where
    each matching entry is ``[tensor_index, matched_tb_index_or_"0"]``.
    """
    #
    # [Option]
    #
    l_info_matching_indices = []
    l_info_pruned_indices = []
    l_info_idx_tb = []
    l_info_pruned_indices_tb = []
    opt_matching_index_fully = -1
    #
    # |TB_X| -(loads)-> K
    # |TB_Y| -(loads)-> E ***
    #
    if opt_load_ext_int == -1:
        l_idx_tb = l_mapping_tb[1]
    else:
        l_idx_tb = l_mapping_tb[0]
    #
    # [Option] Indices mapped on TB except for them having tile-size == 1.
    #
    for each_idx in l_idx_tb:
        if tc_helper.tc_gen_helper_find(l_tile_sizes, each_idx) != 1:
            l_info_pruned_indices_tb.append([
                each_idx,
                tc_helper.tc_gen_helper_find(l_tile_sizes, each_idx)
            ])
    #
    for each_idx in l_idx_tb:
        l_info_idx_tb.append(
            [each_idx, tc_helper.tc_gen_helper_find(l_tile_sizes, each_idx)])
    #
    print(
        "[Code Generator][Load][Input][Addr][Global][External][Matching-Index] l_info_idx_tb: ",
        l_info_idx_tb)
    #
    # [Option] Check if indices mapped on TB_X | TB_Y can be used for the external indices mapped on
    #
    for each_idx in l_input_tensor:
        if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1:
            if tc_helper.tc_gen_helper_find_1d(l_mapping_reg, each_idx) == -1:
                print(
                    "[Code Generator][Load][Input][Addr][Global][External][Matching-Index] ext. mapped on TB: ",
                    each_idx, ", size: ",
                    tc_helper.tc_gen_helper_find(l_tile_sizes, each_idx))
                #
                # To Prune Indices with Tile-Size = 1.
                #
                if tc_helper.tc_gen_helper_find(l_tile_sizes, each_idx) == 1:
                    # Tile-size-1 indices always map to the constant "0".
                    l_info_matching_indices.append([each_idx, "0"])
                else:
                    l_info_pruned_indices.append([
                        each_idx,
                        tc_helper.tc_gen_helper_find(l_tile_sizes, each_idx)
                    ])
    #
    # Target Indices should be handled.
    #
    print("[DEBUG] l_info_pruned_indices: ", l_info_pruned_indices)
    print("[DEBUG] l_info_pruned_indices_tb: ", l_info_pruned_indices_tb)
    #
    print(
        "[Code Generator][Load][Input][Addr][Global][External][Matching-Index] # of Pruned Indices mapped on TB: ",
        len(l_info_pruned_indices_tb))
    print(
        "[Code Generator][Load][Input][Addr][Global][External][Matching-Index] # of Pruned Indices: ",
        len(l_info_pruned_indices))
    # One non-trivial index on each side: pair them up.
    if len(l_info_pruned_indices_tb) == len(l_info_pruned_indices) == 1:
        #print ("len(l_info_pruned_indices_tb) == len(l_info_pruned_indices) == 1")
        len_pruned_indices_tb = 1
        len_pruned_indice = 1
        #
        # Products of the remaining tile sizes (diagnostic only in this branch).
        #
        for each_idx_info in l_info_pruned_indices_tb:
            len_pruned_indices_tb *= tc_helper.tc_gen_helper_find(
                l_tile_sizes, each_idx_info[0])
        #
        for each_idx_info in l_info_pruned_indices:
            len_pruned_indice *= tc_helper.tc_gen_helper_find(
                l_tile_sizes, each_idx_info[0])
        #
        l_info_matching_indices.append(
            [l_info_pruned_indices[0][0], l_info_pruned_indices_tb[0][0]])
        opt_matching_index_fully = 1
    else:
        #
        # Miserable Case:
        #
        print("Miserable Case: Should Handle Manually!")
        len_pruned_indices_tb = 1
        len_pruned_indice = 1
        #
        # Report the tile-size products of both sides for manual handling.
        #
        for each_idx_info in l_info_pruned_indices_tb:
            len_pruned_indices_tb *= tc_helper.tc_gen_helper_find(
                l_tile_sizes, each_idx_info[0])
        #
        for each_idx_info in l_info_pruned_indices:
            len_pruned_indice *= tc_helper.tc_gen_helper_find(
                l_tile_sizes, each_idx_info[0])
        print("len_pruned_indices_tb: ", len_pruned_indices_tb)
        print("len_pruned_indice: ", len_pruned_indice)
    #
    return l_info_matching_indices, opt_matching_index_fully
def tc_gen_global_variables_outputs_input_right(
        f, each_input, l_external_idx, possible_diff, l_v2_d_decl_var,
        l_v2_parameters, l_v2_parameters_nf, l_v2_parameters_f, l_cuda_malloc,
        l_device_dynamic, kernel_number, opt_data_type):
    """Emit host/device variable declarations for the RIGHT input tensor.

    Writes declarations into file ``f`` and appends matching entries to the
    in/out bookkeeping lists (``l_v2_*``, ``l_cuda_malloc``,
    ``l_device_dynamic``).  Sizes are built as C macro-expression strings:
    full size from SIZE_IDX_* and sliced size from SIZE_SLICE_<k>_*.
    When ``possible_diff == 1`` additional full/non-full address buffers
    are declared.  ``opt_data_type`` selects double vs. float.

    NOTE(review): the ``l_v2_parameters_f`` entry uses ``num_blk_full``
    without the kernel-number suffix, while the malloc/dynamic entries use
    ``num_blk_full_<k>`` -- confirm whether this asymmetry is intended.
    """
    f.write("// Global Variables for Right Input\n")
    # Right Input
    d_input_name = "d_" + each_input[1][0]
    h_input_name = "h_" + each_input[1][0]
    input_f_size = ""
    input_s_size = ""
    #
    # Build full-size (input_f_size) and sliced-size (input_s_size) strings.
    # Only external indices contribute to the sliced size.
    #
    idx_f_count = 0
    idx_s_count = 0
    for each_index in each_input[1][1]:
        if tc_helper.tc_gen_helper_find_1d(l_external_idx, each_index) != -1:
            if idx_f_count == 0:
                input_f_size = "SIZE_IDX_" + each_index.capitalize()
            else:
                input_f_size = "SIZE_IDX_" + each_index.capitalize(
                ) + " * " + input_f_size
            if idx_s_count == 0:
                input_s_size = "SIZE_SLICE_" + str(
                    kernel_number) + "_" + each_index.capitalize()
            else:
                input_s_size = "SIZE_SLICE_" + str(
                    kernel_number) + "_" + each_index.capitalize(
                    ) + " * " + input_s_size
            idx_f_count = idx_f_count + 1
            idx_s_count = idx_s_count + 1
        else:
            # Internal index: contributes to the full size only.
            if idx_f_count == 0:
                input_f_size = "SIZE_IDX_" + each_index.capitalize()
            else:
                input_f_size = "SIZE_IDX_" + each_index.capitalize(
                ) + " * " + input_f_size
            idx_f_count = idx_f_count + 1
    #
    # Host/device pointer declarations.
    #
    if opt_data_type == "DOUBLE":
        tc_gen_code_helper_varible(f, "double*", d_input_name)
        tc_gen_code_helper_varible(f, "double*", h_input_name)
    else:
        tc_gen_code_helper_varible(f, "float*", d_input_name)
        tc_gen_code_helper_varible(f, "float*", h_input_name)
    tc_gen_code_helper_varible(f, "int*", d_input_name + "_addr")
    tc_gen_code_helper_varible(f, "int*", h_input_name + "_addr")
    tc_gen_code_helper_varible(f, "int*", d_input_name + "_offset")
    tc_gen_code_helper_varible(f, "int*", h_input_name + "_offset")
    #
    # Kernel parameter declarations (v2 interface).
    #
    if opt_data_type == "DOUBLE":
        l_v2_d_decl_var.append("double* " + d_input_name)
    else:
        l_v2_d_decl_var.append("float* " + d_input_name)
    l_v2_d_decl_var.append("const int* __restrict__ " + d_input_name + "_addr")
    l_v2_d_decl_var.append("const int* __restrict__ " + d_input_name + "_offset")
    #
    if opt_data_type == "DOUBLE":
        l_v2_parameters.append((d_input_name, "double", input_f_size))
    else:
        l_v2_parameters.append((d_input_name, "float", input_f_size))
    l_v2_parameters.append((d_input_name + "_addr", "int",
                            input_s_size + " * n_blks_" + str(kernel_number)))
    l_v2_parameters.append(
        (d_input_name + "_offset", "int", "SIZE_TB_" + str(kernel_number) +
         "_X * SIZE_TB_" + str(kernel_number) + "_Y"))
    #
    # cudaMalloc bookkeeping (name, type, size-expression).
    #
    if opt_data_type == "DOUBLE":
        l_cuda_malloc.append((d_input_name, "double", input_f_size))
    else:
        l_cuda_malloc.append((d_input_name, "float", input_f_size))
    l_cuda_malloc.append((d_input_name + "_addr", "int",
                          input_s_size + " * n_blks_" + str(kernel_number)))
    l_cuda_malloc.append(
        (d_input_name + "_offset", "int", "SIZE_TB_" + str(kernel_number) +
         "_X * SIZE_TB_" + str(kernel_number) + "_Y"))
    #
    # Host-to-device copy bookkeeping (type, device-name, host-name, size).
    #
    if opt_data_type == "DOUBLE":
        l_device_dynamic.append(
            ("double", d_input_name, h_input_name, input_f_size))
    else:
        l_device_dynamic.append(
            ("float", d_input_name, h_input_name, input_f_size))
    l_device_dynamic.append(
        ("int", d_input_name + "_addr", h_input_name + "_addr",
         input_s_size + " * n_blks_" + str(kernel_number)))
    l_device_dynamic.append(
        ("int", d_input_name + "_offset", h_input_name + "_offset",
         "SIZE_TB_" + str(kernel_number) + "_X * SIZE_TB_" +
         str(kernel_number) + "_Y"))
    #
    # Separate full / non-full address buffers when tiles can differ.
    #
    if possible_diff == 1:
        tc_gen_code_helper_varible(f, "int*", d_input_name + "_addr_full")
        tc_gen_code_helper_varible(f, "int*", h_input_name + "_addr_full")
        tc_gen_code_helper_varible(f, "int*", d_input_name + "_addr_non_full")
        tc_gen_code_helper_varible(f, "int*", h_input_name + "_addr_non_full")
        #
        if opt_data_type == "DOUBLE":
            l_v2_parameters_nf.append((d_input_name, "double", input_f_size))
        else:
            l_v2_parameters_nf.append((d_input_name, "float", input_f_size))
        l_v2_parameters_nf.append(
            (d_input_name + "_addr_non_full", "int",
             input_s_size + " * num_blk_non_full_" + str(kernel_number)))
        l_v2_parameters_nf.append(
            (d_input_name + "_offset", "int", "SIZE_TB_" + str(kernel_number) +
             "_X * SIZE_TB_" + str(kernel_number) + "_Y"))
        #
        if opt_data_type == "DOUBLE":
            l_v2_parameters_f.append((d_input_name, "double", input_f_size))
        else:
            l_v2_parameters_f.append((d_input_name, "float", input_f_size))
        l_v2_parameters_f.append((d_input_name + "_addr_full", "int",
                                  input_s_size + " * num_blk_full"))
        l_v2_parameters_f.append(
            (d_input_name + "_offset", "int", "SIZE_TB_" + str(kernel_number) +
             "_X * SIZE_TB_" + str(kernel_number) + "_Y"))
        l_device_dynamic.append(
            ("int", d_input_name + "_addr_full", h_input_name + "_addr_full",
             input_s_size + " * num_blk_full_" + str(kernel_number)))
        l_device_dynamic.append(
            ("int", d_input_name + "_addr_non_full",
             h_input_name + "_addr_non_full",
             input_s_size + " * num_blk_non_full_" + str(kernel_number)))
        l_cuda_malloc.append(
            (d_input_name + "_addr_full", "int",
             input_s_size + " * num_blk_full_" + str(kernel_number)))
        l_cuda_malloc.append(
            (d_input_name + "_addr_non_full", "int",
             input_s_size + " * num_blk_non_full_" + str(kernel_number)))
    f.write("\n")
def tc_gen_models_GMEM(each_configuration, list_comb, idx_count, opt_print=0):
    """Cost model for global-memory (GMEM) traffic of one kernel configuration.

    Estimates the number of DRAM transactions needed to load input tensors A
    and B and to store (and re-load) output tensor C, for the tile /
    thread-block / register mapping described by `each_configuration`, and
    writes the results back onto the configuration object
    (`cost_load_input`, `cost_store_output`, `cost_load_output`,
    `cost_total`, `steps_main_loops`).

    Parameters:
        each_configuration: configuration object carrying the index mappings
            (list_TB_X/Y/K, list_REG_X/Y), tensor index lists
            (list_tensor_A/B/C), tile sizes (list_tile_sizes), representative
            problem sizes and thread-block dims (size_TB_X, size_TB_Y).
        list_comb: combination info; only list_comb[1] (the number of thread
            blocks) is read here.
        idx_count: configuration counter, used only for printing.
        opt_print: 1 to print the model's intermediate values.

    Returns:
        int: always 1.

    BUGFIX: the Input-B section previously selected size_TB_X/size_TB_Y by
    testing `opt_load_A_ext`; it now tests `opt_load_B_ext`, consistent with
    the `times_inner_*` computation for B just above it (copy-paste defect
    from the A section).
    """
    #
    if opt_print == 1:
        print ("===[", idx_count, "]====================================== [Model][GMEM Load Inputs] =========================================")
        print (" mappings: TB_X <- ", each_configuration.list_TB_X, ", TB_Y <- ", each_configuration.list_TB_Y)
        print (" : TB_K <- ", each_configuration.list_TB_K)
        print (" : REG_X <- ", each_configuration.list_REG_X, ", REG_Y <-", each_configuration.list_REG_Y)
        print (" list_comb: ", list_comb)
        print (" tile-sizes: ", each_configuration.list_tile_sizes)
    #
    # For Internal Indices: tile volume (T_K) and full problem volume (N_K).
    #
    size_TB_K = 1
    size_N_K = 1
    for each_int_idx in each_configuration.list_TB_K:
        size_TB_K *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_int_idx)
        size_N_K *= tc_helper.tc_gen_helper_find(each_configuration.list_representative_problem_size, each_int_idx)
    #
    # # of "main" loop iterations (calculated by N_K / T_K)
    #
    steps_main_loops = size_N_K / size_TB_K
    #
    # Check Types of Input such as [E_K, ...] or [E_A, ...]:
    # is each input's fastest-varying index (FVI) external or internal?
    # An index of A that also appears in B (and vice versa) is internal.
    #
    opt_load_A_ext = -1     # -1: FVI = internal
    opt_load_B_ext = -1     #  1: FVI = external
    if tc_helper.tc_gen_helper_find_1d(each_configuration.list_tensor_B, each_configuration.list_tensor_A[0]) == -1:
        opt_load_A_ext = 1
    if tc_helper.tc_gen_helper_find_1d(each_configuration.list_tensor_A, each_configuration.list_tensor_B[0]) == -1:
        opt_load_B_ext = 1
    #
    # Initial Values
    #
    size_continuous_elements_A = 1
    size_continuous_elements_B = 1
    size_continuous_elements_C = 1
    #
    if opt_print == 1:
        print ("-1: FVI = internal, 1: FVI = external")
        print ("opt_load_A_ext: ", opt_load_A_ext, ", opt_load_B_ext: ", opt_load_B_ext)
    #
    # Input: A (Continuous)
    # Count contiguous elements starting from A's FVI: keep multiplying tile
    # sizes while the index kind (external/internal) matches the FVI's kind
    # and each tile fully covers its representative extent.
    #
    is_continuous = 1
    for each_idx in each_configuration.list_tensor_A:
        #
        # Starts from External Index
        #
        if opt_load_A_ext == 1:
            # Internal index ends the contiguous run.
            if tc_helper.tc_gen_helper_find_1d(each_configuration.list_tensor_B, each_idx) != -1:
                break
            # External
            else:
                # Need to Check if This Index is Continuous Or NOT.
                if is_continuous == 1:
                    size_continuous_elements_A *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
                    # A partial tile (tile != full extent) breaks contiguity for later indices.
                    if tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx) != tc_helper.tc_gen_helper_find(each_configuration.list_representative_problem_size, each_idx):
                        is_continuous = -1
                else:
                    break
        #
        # Starts from Internal Index
        #
        else:
            # External index ends the contiguous run.
            if tc_helper.tc_gen_helper_find_1d(each_configuration.list_tensor_B, each_idx) == -1:
                break
            # Internal
            else:
                if is_continuous == 1:
                    size_continuous_elements_A *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
                    if tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx) != tc_helper.tc_gen_helper_find(each_configuration.list_representative_problem_size, each_idx):
                        is_continuous = -1
                else:
                    break
    #
    # Input: A (TB and REG)
    # Partition A's tile volume into external-on-TB (E_TB), external-on-REG
    # (E_REG) and internal-on-TB (K_TB) factors.
    #
    size_A_E_TB = 1
    size_A_K_TB = 1
    size_A_E_REG = 1
    for each_idx in each_configuration.list_tensor_A:
        # External Index
        if tc_helper.tc_gen_helper_find_1d(each_configuration.list_tensor_B, each_idx) == -1:
            # TB
            if tc_helper.tc_gen_helper_find_1d(each_configuration.list_REG_X, each_idx) == -1 and tc_helper.tc_gen_helper_find_1d(each_configuration.list_REG_Y, each_idx) == -1:
                size_A_E_TB *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
            # REG
            else:
                size_A_E_REG *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
        else:
            size_A_K_TB *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
    #
    if opt_print == 1:
        print ("|SMEM_A| = ", size_A_E_TB * size_A_E_REG * size_A_K_TB, ", (", size_A_E_REG, " * ", size_A_E_TB, " * ", size_A_K_TB, ")")
        print ("|TB_X| = ", each_configuration.size_TB_X, ", |TB_Y| = ", each_configuration.size_TB_Y)
    #
    # TB_X -> External Index && TB_Y -> Internal Index
    #
    if opt_load_A_ext == 1:
        times_inner_TB_X = max(size_A_E_TB / each_configuration.size_TB_X, 1.0)
        times_inner_TB_Y = max(size_A_K_TB / each_configuration.size_TB_Y, 1.0)
    #
    # TB_X -> Internal Index && TB_Y -> External Index
    #
    else:
        times_inner_TB_X = max(size_A_K_TB / each_configuration.size_TB_X, 1.0)
        times_inner_TB_Y = max(size_A_E_TB / each_configuration.size_TB_Y, 1.0)
    #
    if opt_print == 1:
        print ("A: :times_inner_TB_X: ", times_inner_TB_X, ", times_inner_TB_Y: ", times_inner_TB_Y)
    #
    # Based on a row along TB_X, can the number of continuous elements be loaded concurrently?
    #
    if opt_load_A_ext == -1:
        # TB_X -> K
        size_TB_X = min(size_A_K_TB, each_configuration.size_TB_X)
        size_TB_Y = min(size_A_E_TB, each_configuration.size_TB_Y)
    else:
        # TB_X -> E
        size_TB_X = min(size_A_E_TB, each_configuration.size_TB_X)
        size_TB_Y = min(size_A_K_TB, each_configuration.size_TB_Y)
    #
    estimated_DRAM_transaction_per_TB_X = size_TB_X / min(size_continuous_elements_A, size_TB_X)
    estimated_DRAM_transaction_per_TB = estimated_DRAM_transaction_per_TB_X * size_TB_Y
    estimated_DRAM_transaction_per_TB_inner_loops = estimated_DRAM_transaction_per_TB * times_inner_TB_X * times_inner_TB_Y
    estimated_DRAM_transaction_per_TB_inner_loops_reg = estimated_DRAM_transaction_per_TB_inner_loops * size_A_E_REG
    estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K = estimated_DRAM_transaction_per_TB_inner_loops_reg * steps_main_loops
    if opt_print == 1:
        print ("size_TB_X: ", size_TB_X, ", size_TB_Y: ", size_TB_Y, ", size_TB_K: ", size_TB_K)
        print ("estimated_DRAM_transactions_per_TB_X (should be fixed): ", estimated_DRAM_transaction_per_TB_X)
        print ("estimated_DRAM_transactions_per_TB: ", estimated_DRAM_transaction_per_TB)
        print ("estimated_DRAM_transaction_per_TB_inner_loops: ", estimated_DRAM_transaction_per_TB_inner_loops)
        print ("estimated_DRAM_transaction_per_TB_inner_loops_reg: ", estimated_DRAM_transaction_per_TB_inner_loops_reg)
        print ("estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K: ", estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K)
    #
    # To Calculate The Cost of Loading Input Tensor per a Thread Block
    #
    cost_TB_load_A = estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K
    #
    # Input: B (Continuous) -- same contiguity scan as for A, with the roles
    # of the two tensors swapped.
    #
    is_continuous = 1
    for each_idx in each_configuration.list_tensor_B:
        if opt_load_B_ext == 1:
            #
            # Internal
            #
            if tc_helper.tc_gen_helper_find_1d(each_configuration.list_tensor_A, each_idx) != -1:
                break
            #
            # External
            #
            else:
                if is_continuous == 1:
                    size_continuous_elements_B *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
                    if tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx) != tc_helper.tc_gen_helper_find(each_configuration.list_representative_problem_size, each_idx):
                        is_continuous = -1
                else:
                    break
        else:
            #
            # External
            #
            if tc_helper.tc_gen_helper_find_1d(each_configuration.list_tensor_A, each_idx) == -1:
                break
            #
            # Internal
            #
            else:
                if is_continuous == 1:
                    size_continuous_elements_B *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
                    if tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx) != tc_helper.tc_gen_helper_find(each_configuration.list_representative_problem_size, each_idx):
                        is_continuous = -1
                else:
                    break
    #
    if opt_print == 1:
        print ("size_continuous_elements_B: ", size_continuous_elements_B)
    #
    # Input: B (TB and REG)
    #
    size_B_E_TB = 1
    size_B_K_TB = 1
    size_B_E_REG = 1
    for each_idx in each_configuration.list_tensor_B:
        #
        if tc_helper.tc_gen_helper_find_1d(each_configuration.list_tensor_A, each_idx) == -1:
            # TB
            if tc_helper.tc_gen_helper_find_1d(each_configuration.list_REG_X, each_idx) == -1 and tc_helper.tc_gen_helper_find_1d(each_configuration.list_REG_Y, each_idx) == -1:
                size_B_E_TB *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
            # REG
            else:
                size_B_E_REG *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
        else:
            size_B_K_TB *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
    #
    # TB_X -> External Index && TB_Y -> Internal Index
    #
    if opt_load_B_ext == 1:
        times_inner_TB_X = max(size_B_E_TB / each_configuration.size_TB_X, 1.0)
        times_inner_TB_Y = max(size_B_K_TB / each_configuration.size_TB_Y, 1.0)
    #
    # TB_X -> Internal Index && TB_Y -> External Index
    #
    else:
        times_inner_TB_X = max(size_B_K_TB / each_configuration.size_TB_X, 1.0)
        times_inner_TB_Y = max(size_B_E_TB / each_configuration.size_TB_Y, 1.0)
    #
    if opt_print == 1:
        print ("B: :times_inner_TB_X: ", times_inner_TB_X, ", times_inner_TB_Y: ", times_inner_TB_Y)
    #
    # Based on a row along TB_X, can the number of continuous elements be loaded concurrently?
    # BUGFIX: was `opt_load_A_ext`; B's TB_X/TB_Y sizes must be selected by
    # B's own FVI type, matching the `times_inner_*` branch above.
    #
    if opt_load_B_ext == -1:
        # TB_X -> K
        size_TB_X = min(size_B_K_TB, each_configuration.size_TB_X)
        size_TB_Y = min(size_B_E_TB, each_configuration.size_TB_Y)
    else:
        # TB_X -> E
        size_TB_X = min(size_B_E_TB, each_configuration.size_TB_X)
        size_TB_Y = min(size_B_K_TB, each_configuration.size_TB_Y)
    #
    estimated_DRAM_transaction_per_TB_X = size_TB_X / min(size_continuous_elements_B, size_TB_X)
    estimated_DRAM_transaction_per_TB = estimated_DRAM_transaction_per_TB_X * size_TB_Y
    estimated_DRAM_transaction_per_TB_inner_loops = estimated_DRAM_transaction_per_TB * times_inner_TB_X * times_inner_TB_Y
    estimated_DRAM_transaction_per_TB_inner_loops_reg = estimated_DRAM_transaction_per_TB_inner_loops * size_B_E_REG
    estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K = estimated_DRAM_transaction_per_TB_inner_loops_reg * steps_main_loops
    if opt_print == 1:
        print ("size_TB_X: ", size_TB_X, ", size_TB_Y: ", size_TB_Y, ", size_TB_K: ", size_TB_K)
        print ("estimated_DRAM_transactions_per_TB_X (should be fixed): ", estimated_DRAM_transaction_per_TB_X)
        print ("estimated_DRAM_transactions_per_TB: ", estimated_DRAM_transaction_per_TB)
        print ("estimated_DRAM_transaction_per_TB_inner_loops: ", estimated_DRAM_transaction_per_TB_inner_loops)
        print ("estimated_DRAM_transaction_per_TB_inner_loops_reg: ", estimated_DRAM_transaction_per_TB_inner_loops_reg)
        print ("estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K: ", estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K)
    #
    cost_TB_load_B = estimated_DRAM_transaction_per_TB_inner_loops_reg_N_K
    #
    # Output: C
    #
    is_continuous = 1
    for each_idx in each_configuration.list_tensor_C:
        #
        # Index mapped on TB
        #
        if tc_helper.tc_gen_helper_find_1d(each_configuration.list_REG_X, each_idx) == -1 and tc_helper.tc_gen_helper_find_1d(each_configuration.list_REG_Y, each_idx) == -1:
            if is_continuous == 1:
                size_continuous_elements_C *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
                if tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx) != tc_helper.tc_gen_helper_find(each_configuration.list_representative_problem_size, each_idx):
                    is_continuous = -1
            else:
                break
        #
        # Index mapped on REG
        #
        else:
            break
    #
    # Contiguity of C along TB_X: accumulate tile sizes while the TB_X mapping
    # matches C's index order position-by-position.
    # (loop variable renamed from `idx_count` to avoid shadowing the parameter)
    #
    size_continuous_elements_C_based_TB_X = 1
    for idx_pos in range(0, len(each_configuration.list_TB_X)):
        if each_configuration.list_TB_X[idx_pos] == each_configuration.list_tensor_C[idx_pos]:
            size_continuous_elements_C_based_TB_X *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_configuration.list_TB_X[idx_pos])
        else:
            break
    size_Output_TB = 1
    for each_idx in each_configuration.list_tensor_C:
        size_Output_TB *= tc_helper.tc_gen_helper_find(each_configuration.list_tile_sizes, each_idx)
    #
    cost_TB_store_C = size_Output_TB / size_continuous_elements_C_based_TB_X
    #
    # The # of Thread Blocks
    #
    num_TBs = list_comb[1]
    if opt_print == 1:
        print (">>> # of TBs: ", num_TBs)
    #
    each_configuration.cost_load_input = (cost_TB_load_A + cost_TB_load_B) * num_TBs
    each_configuration.cost_store_output = cost_TB_store_C * num_TBs
    # Output is both re-loaded and stored; model both with the same cost.
    each_configuration.cost_load_output = cost_TB_store_C * num_TBs
    each_configuration.cost_total = each_configuration.cost_load_input + each_configuration.cost_store_output + each_configuration.cost_load_output
    each_configuration.steps_main_loops = steps_main_loops
    #
    if opt_print == 1:
        print ("Cost Input (Load): ", each_configuration.cost_load_input)
        print ("Cost Output (Store): ", each_configuration.cost_store_output)
        print ("Cost Output (Load): ", each_configuration.cost_load_output)
        print ("Total Cost: ", each_configuration.cost_total, ", # of steps for main-loop: ", each_configuration.steps_main_loops)
    #
    if opt_print == 1:
        print ("=============================================================================================================")
    #
    return 1
def tc_gen_code_Kernel_Load_Inputs(
        f, size_tb_x, size_tb_y, size_sm_a, size_sm_b, size_sm_p7, int_str_t2,
        int_str_v2, l_blk_boundary_rng, tensor_contraction, l_input_strides,
        l_t3_slices, l_internal_idx, l_t3_mapping_tb_2D, l_t3_mapping_reg,
        opt_gen_full, opt_gen_p7, opt_load_t2, opt_load_v2, opt_pre_computed,
        idx_kernel):
    """Emit the shared-memory load phase of a tensor-contraction CUDA kernel.

    Writes source text to the open file object `f` that loads the LEFT input
    (t2 -> sm_a) and the RIGHT input (v2 -> sm_b) of one tensor contraction
    into shared memory, including boundary guards for partial external tiles
    (opt_gen_full == 1), partial internal tiles (opt_gen_p7 == 1) and
    thread-block over-coverage, and ends with a `__syncthreads();`.

    Parameters (as used by this function):
        f: writable file object receiving the generated code.
        size_tb_x, size_tb_y: thread-block dimensions (element counts).
        size_sm_a, size_sm_b: shared-memory extents for left/right inputs;
            used only in the "can the two load loops be merged" test.
        size_sm_p7: internal-tile extent; compared against size_tb_x and
            passed to the load sub-routines.
        int_str_t2, int_str_v2: unused here -- presumably kept for interface
            compatibility with callers; TODO confirm.
        l_blk_boundary_rng: when non-empty, boundary-checking code is emitted
            first via tc_gen_code_Kernel_Load_Checking_Boundary().
        tensor_contraction: nested list; [0] describes the left input and
            [1] the right input, with [i][2] and [i][4] read here
            ([i][4] is that input's index list).
        l_input_strides: per-input stride info; [0] is used for the left
            input and [2] for the right input when non-empty.
        l_t3_slices: (index name -> tile size) pairs, queried via
            tc_helper.tc_gen_helper_find().
        l_internal_idx: internal (contracted) index names.
        l_t3_mapping_tb_2D: [indices on TB_X, indices on TB_Y].
        l_t3_mapping_reg: indices mapped on register tiles.
        opt_gen_full: 1 -> emit guards for partial external tiles.
        opt_gen_p7: 1 -> emit guards for partial internal tiles.
        opt_load_t2 / opt_load_v2: -1 -> ThreadIdx.x walks the internal
            indices for that input; otherwise ThreadIdx.x walks the external
            indices (per the mapping comments below).
        opt_pre_computed: forwarded unchanged to the load sub-routines.
        idx_kernel: kernel number, spliced into generated macro names
            (e.g. SIZE_INT_UNIT_<n>, SIZE_SLICE_<n>_...).

    Returns:
        None; all output is side effects on `f` (plus diagnostic prints).
    """
    # For Shared Memory,
    # need to support non-fvi for p7
    # >>> it affects how to generalize loading inputs.
    # [To-Do] What is the purpose of this?
    #
    if len(l_blk_boundary_rng) > 0:
        upper_left, upper_right, l_left, l_right = tc_gen_code_Kernel_Load_Checking_Boundary(
            f, l_blk_boundary_rng, tensor_contraction)
    else:
        upper_left = size_tb_x
        upper_right = size_tb_x
    #
    # # of Internal Indices
    #
    num_internal_indices = len(l_internal_idx)
    #
    f.write("\t\t// Load Input Tensor to Shared Memory: " + str(size_tb_x) +
            ":" + str(tc_helper.tc_gen_helper_find(l_t3_slices, l_internal_idx[0])) + "\n")
    f.write("\t\t// # of Internal Indices: " + str(num_internal_indices) + "\n")
    #
    # Step 0. Boundaries for LEFT
    # - size_tb_x & size_tb_y are determined by tiles' size.
    l_idx_x = l_t3_mapping_tb_2D[0]
    l_idx_y = l_t3_mapping_tb_2D[1]
    l_left_indices = tensor_contraction[0][4]
    l_left_target_indices = list()
    l_left_indices_reg = list()
    cond_boundary_left_ext = -1
    cond_boundary_left_int = -1
    str_cond_gen_external = ""
    str_cond_gen_internal = ""
    opt_gen_full_special_case_left = -1
    #
    # - ThreadIdx.x -> Internal Indices -- |E_K|
    # - ThreadIdx.y -> External Indices -- |E_LEFT|
    #
    if opt_load_t2 == -1:
        #
        # OPTION #1. Partial Tiles for External Indices (Assumption: 4D Input Tensors)
        #
        opt_gen_full_special_case = -1
        if opt_gen_full == 1:
            cond_boundary_left_ext = 1
            # Split left-input indices into TB-mapped externals vs REG-mapped.
            for each_idx in l_left_indices:
                if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg, each_idx) == -1:
                    if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1:
                        l_left_target_indices.append(each_idx)
                else:
                    l_left_indices_reg.append(each_idx)
            #
            size_len_external_tiles = 1
            for each_target_index in l_left_target_indices:
                size_len_external_tiles = size_len_external_tiles * tc_helper.tc_gen_helper_find(
                    l_t3_slices, each_target_index)
            #
            if size_tb_y > size_len_external_tiles:
                len_covered_reg = int(size_tb_y / size_len_external_tiles)
                size_reg_tile = tc_helper.tc_gen_helper_find(
                    l_t3_slices, l_left_indices_reg[0])
                if len_covered_reg > 1:
                    opt_gen_full_special_case = 1
            #
            # Assumption 4D Input Tensor (3 External (2 -> TB, 1 -> REG), 1 Internal)
            # New-Version
            #
            # [1] indices along y-axis can be used directly to check boundaries for indices in the left input.
            #
            list_alternative_mapping = list()
            opt_boundary_left_input = -1
            len_l_idx_y = len(l_idx_y)
            # Count how many y-axis indices also appear in the left target set.
            for each_idx_left in l_left_target_indices:
                for each_idx_y in l_idx_y:
                    if each_idx_left == each_idx_y:
                        len_l_idx_y = len_l_idx_y - 1
            #
            # [1] indices along y-axis can be used directly to check boundaries for indices in the left input.
            #
            if len_l_idx_y == 0:
                opt_boundary_left_input = 1
            #
            # [2] indices along y-axis can be used in-directly to check boundaries for indices in the left input.
            # or [3] indices along y-axis cannot bu used to check boundaries for indices in the left input.
            #
            else:
                #
                # [2] indices along y-axis can be used in-directly to check boundaries for indices in the left input.
                #
                if len(l_idx_y) == len(l_left_target_indices):
                    #
                    # The Simplest Version (Directly Replacing Indices)
                    # Pair each left index with a y-axis index of the same
                    # name or (failing that) the same tile size.
                    #
                    for each_idx_left in l_left_target_indices:
                        for each_idx_y in l_idx_y:
                            if each_idx_left == each_idx_y:
                                list_alternative_mapping.append(
                                    [each_idx_y, each_idx_left])
                                break
                            else:
                                if tc_helper.tc_gen_helper_find(
                                        l_t3_slices, each_idx_left
                                ) == tc_helper.tc_gen_helper_find(
                                        l_t3_slices, each_idx_y):
                                    list_alternative_mapping.append(
                                        [each_idx_y, each_idx_left])
                                    break
                    #
                    if len(list_alternative_mapping) == len(l_idx_y):
                        print("[To-Do] list_alternative_mapping: ",
                              list_alternative_mapping)
                        opt_boundary_left_input = 2
                    else:
                        opt_boundary_left_input = 3
                else:
                    opt_boundary_left_input = 3
            #
            # [3] indices along y-axis cannot bu used to check boundaries for indices in the left input.
            #
            if opt_boundary_left_input == 1:
                #print (" >>> [1] indices along y-axis can be used directly to check boundaries for indices in the left input.")
                idx_count = 0
                for idx_tb in l_left_target_indices:
                    if idx_count == 0:
                        str_cond_gen_external = "idx_" + idx_tb + " < rng_" + idx_tb
                    else:
                        str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + idx_tb
                    idx_count = idx_count + 1
            elif opt_boundary_left_input == 2:
                #print (" >>> [2] indices along y-axis can be used in-directly to check boundaries for indices in the left input.")
                idx_count = 0
                for idx_mapping in list_alternative_mapping:
                    if idx_count == 0:
                        str_cond_gen_external = "idx_" + idx_mapping[
                            0] + " < rng_" + idx_mapping[1]
                    else:
                        str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_mapping[
                            0] + " < rng_" + idx_mapping[1]
                    idx_count = idx_count + 1
            else:
                print(
                    " >>> [3] indices along y-axis cannot bu used to check boundaries for indices in the left input."
                )
                print(" >>> ERROR!!!! Not Support Yet")
            del list_alternative_mapping
        #
        # OPTION #2. Partial Tiles for Internal Indices
        #
        if opt_gen_p7 == 1:
            cond_boundary_left_int = 1
            str_cond_gen_internal = "threadIdx.x < SIZE_INT_UNIT_" + str(
                idx_kernel) + " - internal_upperbound"
    #
    # - ThreadIdx.x -> External Indices -- |E_LEFT|
    # - ThreadIdx.y -> Internal Indices -- |E_K|
    #
    else:
        #
        # OPTION #1. Partial Tiles for External Indices
        #
        if opt_gen_full == 1:
            #
            # This case, TB_Y will load |T_K|
            # However, when |TB_Y| < |T_K|, we need to load input (|T_K| / |TB_Y|) times.
            #
            cond_boundary_left_ext = 1
            for each_idx in l_left_indices:
                if tc_helper.tc_gen_helper_find_1d(
                        l_t3_mapping_reg, each_idx) == -1:  # Not mapped on REG
                    if tc_helper.tc_gen_helper_find_1d(
                            l_internal_idx, each_idx) == -1:  # External Indices
                        l_left_target_indices.append(each_idx)
                else:
                    l_left_indices_reg.append(
                        each_idx)  # Indices Mapped on REG
            #
            # To Calculate Length of Indices' Tile-Size mapped on TB
            #
            size_len_external_tiles = 1
            for each_target_index in l_left_target_indices:
                size_len_external_tiles = size_len_external_tiles * tc_helper.tc_gen_helper_find(
                    l_t3_slices, each_target_index)
            #
            if size_tb_x > size_len_external_tiles:
                opt_gen_full_special_case_left = 1
            #
            # Assumption 4D Input Tensor (3 External (2 -> TB, 1 -> REG), 1 Internal)
            # To-Do: Fusion is a little bit complecated.
            #
            #print ("l_left_target_indices: ", l_left_target_indices)
            #print ("l_idx_x: ", l_idx_x)
            #print ("[To-Do] Boundary Case for External Indices ---- Fusion")
            #
            # For a Tensor Contraction,
            #
            if len(l_left_target_indices) == len(l_idx_x):
                #print ("============================")
                # opt_fusion == 1 when some left index is absent from TB_X
                # (i.e. TB_X indices stand in for different left indices).
                opt_fusion = -1
                for each_target in l_left_target_indices:
                    is_common = -1
                    for each_idx_x in l_idx_x:
                        if each_target == each_idx_x:
                            is_common = 1
                    if is_common == -1:
                        opt_fusion = 1
                        break
                #
                if opt_fusion == 1:
                    #print ("opt_fusion == 1")
                    idx_count = 0
                    for idx_tb in l_idx_x:
                        if idx_count == 0:
                            #str_cond_gen_external = "idx_" + idx_tb + " < rng_" + idx_tb#l_left_target_indices[idx_count]
                            str_cond_gen_external = "idx_" + idx_tb + " < rng_" + l_left_target_indices[
                                idx_count]
                        else:
                            #str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + idx_tb#l_left_target_indices[idx_count]
                            str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + l_left_target_indices[
                                idx_count]
                        idx_count = idx_count + 1
                else:
                    #print ("opt_fusion != 1")
                    idx_count = 0
                    for idx_tb in l_idx_x:
                        if idx_count == 0:
                            str_cond_gen_external = "idx_" + idx_tb + " < rng_" + idx_tb
                        else:
                            str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + idx_tb
                        idx_count = idx_count + 1
            else:
                idx_count = 0
                for idx_tb in l_idx_x:
                    if idx_count == 0:
                        #str_cond_gen_external = "idx_" + idx_tb + " < rng_" + idx_tb#l_left_target_indices[idx_count]
                        str_cond_gen_external = "idx_" + idx_tb + " < rng_" + l_left_target_indices[
                            idx_count]
                    else:
                        #str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + idx_tb#l_left_target_indices[idx_count]
                        str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + l_left_target_indices[
                            idx_count]
                    idx_count = idx_count + 1
        #
        # OPTION #2. Partial Tiles for Internal Indices
        #
        if opt_gen_p7 == 1:
            cond_boundary_left_int = 1
            str_cond_gen_internal = "threadIdx.y < SIZE_INT_UNIT_" + str(
                idx_kernel) + " - internal_upperbound"
    #
    # To Write Code for Boundary Cases
    #
    if (cond_boundary_left_ext == 1 and opt_gen_full_special_case_left == -1) or cond_boundary_left_int == 1:
        #
        #
        #
        tc_code_kernel_helper.code_kernel_load_input_left_boundary_case(
            f, opt_gen_full_special_case_left, cond_boundary_left_ext,
            cond_boundary_left_int, str_cond_gen_external,
            str_cond_gen_internal)
    #
    #
    #
    l_left_indices_target_temp = list()
    l_left_indices_reg_temp = list()
    opt_gen_full_special_case = -1
    len_covered_reg = 1
    #
    # To Figure out indices mapped on TB (l_left_indices_target_temp) and indices mapped on REG (l_left_indices_reg_temp)
    #
    for each_idx in l_left_indices:
        if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg, each_idx) == -1:
            if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1:
                l_left_indices_target_temp.append(each_idx)
        else:
            l_left_indices_reg_temp.append(each_idx)
    # To Calculate Length of Indices' Tile-Size mapped on TB
    len_external_tiles_left = 1
    for each_target_index in l_left_indices_target_temp:
        len_external_tiles_left = len_external_tiles_left * tc_helper.tc_gen_helper_find(
            l_t3_slices, each_target_index)
    # Assumed only one index is mapped on REG
    size_reg_tile = tc_helper.tc_gen_helper_find(l_t3_slices,
                                                 l_left_indices_reg_temp[0])
    #
    # TB_X will load |E_LEFT| on TB_X without an index mapped on REG
    #
    if opt_load_t2 == -1:
        if size_tb_y > len_external_tiles_left:
            len_covered_reg = int(size_tb_y / len_external_tiles_left)
            opt_gen_full_special_case = 1
        else:
            len_covered_reg = 1
    #
    # |TB_X| > |E_LEFT_TB|, Then, |TB_X| can cover |E_LEFT_TB|
    #
    else:
        if size_tb_x > len_external_tiles_left:
            print("[aft]size_tb_x > len_external_tiles_left :: ", size_tb_x,
                  " >? ", len_external_tiles_left)
            # how many steps for register tile can be covered by TB_X
            len_covered_reg = int(size_tb_x / len_external_tiles_left)
            opt_gen_full_special_case = 1
        #
        # |TB_X| == |E_LEFT_TB|
        #
        elif size_tb_x == len_external_tiles_left:
            len_covered_reg = 1
        #
        # |TB_X| < |E_LEFT_TB|
        #
        #else:
        #print ("HERE: ", size_tb_x, len_external_tiles_left)
        #len_covered_reg = 1#size_tb_x / len_external_tiles_left # (To-Do) This case will be dealt in the loop.
        #opt_gen_full_special_case = 2
    #
    # Step 1: For-Statement: T2 (LEFT), |E_A'| * |E_A''|, where A' is a set of indices mapped on Thread Block and
    #         A'' is a set of indices mapped on Register Tile.
    tc_code_kernel_helper.code_kernel_load_input_left_for_statement(
        f, opt_gen_full, tensor_contraction[0][2], opt_gen_full_special_case,
        size_reg_tile, len_covered_reg, l_t3_mapping_reg)
    #
    # Build "SIZE_SLICE_<k>_<Idx> * ..." product over the left input's
    # external indices.
    str_str_t2 = ""
    idx_count = 0
    for each_idx in tensor_contraction[0][4]:
        if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1:
            if idx_count != 0:
                str_str_t2 = str_str_t2 + " * "
            str_str_t2 = str_str_t2 + "SIZE_SLICE_" + str(
                idx_kernel) + "_" + each_idx.capitalize()
            idx_count = idx_count + 1
    # To Calculate Length of Indices' Tile-Size mapped on TB
    size_len_external_tiles_left = 1
    size_len_reg_tiles_left = 1
    for each_idx in tensor_contraction[0][4]:
        if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg, each_idx) == -1:
            if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1:
                size_len_external_tiles_left = size_len_external_tiles_left * tc_helper.tc_gen_helper_find(
                    l_t3_slices, each_idx)
        else:
            size_len_reg_tiles_left = size_len_reg_tiles_left * tc_helper.tc_gen_helper_find(
                l_t3_slices, each_idx)
    #
    # [Sub-Routine] Load Tensor Inputs to sm_a[][]
    #
    if len(l_input_strides) > 0:
        tc_code_kernel_load_inputs_details.tc_gen_code_Kernel_Load_Inputs_Left(
            f, tensor_contraction, l_internal_idx, opt_load_t2, size_tb_x,
            size_tb_y, size_sm_p7, size_len_external_tiles_left, str_str_t2,
            num_internal_indices, idx_kernel, l_input_strides[0],
            opt_pre_computed, l_t3_mapping_tb_2D, l_t3_mapping_reg,
            l_t3_slices)
    else:
        tc_code_kernel_load_inputs_details.tc_gen_code_Kernel_Load_Inputs_Left(
            f, tensor_contraction, l_internal_idx, opt_load_t2, size_tb_x,
            size_tb_y, size_sm_p7, size_len_external_tiles_left, str_str_t2,
            num_internal_indices, idx_kernel, l_input_strides,
            opt_pre_computed, l_t3_mapping_tb_2D, l_t3_mapping_reg,
            l_t3_slices)
    # To Calculate Length of Indices' Tile-Size mapped on TB
    size_len_external_tiles_right = 1
    size_len_reg_tiles_right = 1
    for each_idx in tensor_contraction[1][4]:
        if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg, each_idx) == -1:
            if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1:
                size_len_external_tiles_right = size_len_external_tiles_right * tc_helper.tc_gen_helper_find(
                    l_t3_slices, each_idx)
        else:
            size_len_reg_tiles_right = size_len_reg_tiles_right * tc_helper.tc_gen_helper_find(
                l_t3_slices, each_idx)
    #print ("size_len_reg_tiles_left, right:", size_len_reg_tiles_left, ",", size_len_reg_tiles_right)
    #
    # When we cannot merge Two different loops for Both Inputs.
    #
    #if ((size_sm_a / size_tb_y) != (size_sm_b / size_tb_y)) or (opt_load_t2 != opt_load_v2) or (size_tb_x > upper_right) or (size_tb_x > upper_left) or opt_gen_full == 1 or (size_len_reg_tiles_left != size_len_reg_tiles_right):
    if ((size_sm_a / size_tb_y) != (size_sm_b / size_tb_y)) or (
            opt_load_t2 != opt_load_v2) or opt_gen_full == 1 or (
                size_len_reg_tiles_left != size_len_reg_tiles_right):
        f.write("\t\t}\n")
        f.write("\n")
    #
    # Boundaries for RIGHT input.
    #
    l_right_indices = tensor_contraction[1][4]
    l_right_indices_target = list()
    l_right_indices_reg = list()
    cond_boundary_right_ext = -1
    cond_boundary_right_int = -1
    cond_boundary_right_tbx = -1
    cond_boundary_right_tby = -1
    str_cond_gen_external = ""
    str_cond_gen_internal = ""
    str_cond_gen_tb_x = ""
    str_cond_gen_tb_y = ""
    f.write("\t\t// Load Input Tensor to Shared Memory\n")
    #
    # [Load][Right] TB_X -> |K| && TB_Y -> |E_RIGHT|
    #
    if opt_load_v2 == -1:
        #
        # OPTION #1. External Index
        #
        if opt_gen_full == 1:
            cond_boundary_right_ext = 1
            for each_idx in l_right_indices:
                if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg,
                                                   each_idx) == -1:
                    if tc_helper.tc_gen_helper_find_1d(l_internal_idx,
                                                       each_idx) == -1:
                        l_right_indices_target.append(each_idx)
                else:
                    l_right_indices_reg.append(each_idx)
            #print ("len(l_right_indices_target):", len(l_right_indices_target))
            #print ("len(l_idx_x);", len(l_idx_x))
            #
            # To-Do
            #
            # 4D Input Tensor
            if len(l_right_indices_target) == len(l_idx_y):
                idx_count = 0
                for idx_tb in l_idx_y:
                    if idx_count == 0:
                        str_cond_gen_external = "idx_" + idx_tb + " < rng_" + l_right_indices_target[
                            idx_count]
                    else:
                        str_cond_gen_external = str_cond_gen_external + " && idx_" + idx_tb + " < rng_" + l_right_indices_target[
                            idx_count]
                    idx_count = idx_count + 1
            else:
                print("ERROR: (-1) Input Tensor Should be 4D...")
        #
        # OPTION #2. Internal Index
        #
        if opt_gen_p7 == 1:
            cond_boundary_right_int = 1
            str_cond_gen_internal = "threadIdx.x < SIZE_INT_UNIT_" + str(
                idx_kernel) + " - internal_upperbound"
        #
        # OPTION #3.
        #
        if size_tb_x > size_sm_p7:
            cond_boundary_right_tbx = 1
            str_cond_gen_tb_x = "threadIdx.x < " + str(size_sm_p7)
    else:
        #
        # OPTION #1. External Index
        #
        #print ("[CODE][LOAD-INPUT][RIGHT] TB_X -> |E_RIGHT| && TB_Y -> |K|")
        opt_gen_full_special_case = -1
        if opt_gen_full == 1:
            cond_boundary_right_ext = 1
            for each_idx in l_right_indices:
                if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg,
                                                   each_idx) == -1:
                    if tc_helper.tc_gen_helper_find_1d(l_internal_idx,
                                                       each_idx) == -1:
                        l_right_indices_target.append(each_idx)
                else:
                    l_right_indices_reg.append(each_idx)
            # To Calculate Length of Indices' Tile-Size mapped on TB
            size_len_external_tiles = 1
            for each_target_index in l_right_indices_target:
                size_len_external_tiles = size_len_external_tiles * tc_helper.tc_gen_helper_find(
                    l_t3_slices, each_target_index)
            # TB_X will load |E_LEFT| on TB_X without the index mapped on REG
            if size_tb_x > size_len_external_tiles:
                # how many steps for register tile can be covered by TB_X
                len_covered_reg = int(size_tb_x / size_len_external_tiles)
                # Assumed only one index is mapped on REG
                size_reg_tile = tc_helper.tc_gen_helper_find(
                    l_t3_slices, l_right_indices_reg[0])
                if len_covered_reg > 1:
                    opt_gen_full_special_case = 1
            '''
            if len(l_idx_y) > len(l_left_target_indices):
                idx_count = 0
                for idx_tb in l_left_target_indices:
                    if idx_count == 0:
                        str_cond_gen_external = "idx_" + l_idx_y[idx_count] + " < rng_" + idx_tb
                    else:
                        str_cond_gen_external = str_cond_gen_external + " && idx_" + l_idx_y[idx_count] + " < rng_" + idx_tb#l_left_target_indices[idx_count]
                    idx_count = idx_count + 1
            '''
            #
            #print ("len(l_right_indices_target):", len(l_right_indices_target))
            #print ("len(l_idx_x);", len(l_idx_x))
            if len(l_idx_x) > len(l_right_indices_target):
                idx_count = 0
                for idx_tb in l_right_indices_target:
                    if idx_count == 0:
                        str_cond_gen_external = "idx_" + l_idx_x[
                            idx_count] + " < rng_" + idx_tb
                    else:
                        temp = str_cond_gen_external  # bug??
                        str_cond_gen_external = temp + " && idx_" + l_idx_x[
                            idx_count] + " < rng_" + idx_tb
                    idx_count = idx_count + 1
            else:
                idx_count = 0
                for idx_tb in l_idx_x:
                    if idx_count == 0:
                        str_cond_gen_external = "idx_" + idx_tb + " < rng_" + l_right_indices_target[
                            idx_count]
                    else:
                        temp = str_cond_gen_external  # bug??
                        str_cond_gen_external = temp + " && idx_" + idx_tb + " < rng_" + l_right_indices_target[
                            idx_count]
                    idx_count = idx_count + 1
            '''
            if len(l_right_indices_target) == len(l_idx_x):
                idx_count = 0
                for idx_tb in l_idx_x:
                    if idx_count == 0:
                        str_cond_gen_external = "idx_" + idx_tb + " < rng_" + l_right_indices_target[idx_count]
                    else:
                        temp = str_cond_gen_external # bug??
                        str_cond_gen_external = temp + " && idx_" + idx_tb + " < rng_" + l_right_indices_target[idx_count]
                    idx_count = idx_count + 1
            else:
                print ("ERROR: (!-1) Input Tensor Should be 4D...")
            '''
        #
        # OPTION #2.
        #
        if opt_gen_p7 == 1:
            cond_boundary_right_int = 1
            str_cond_gen_internal = "threadIdx.y < SIZE_INT_UNIT_" + str(
                idx_kernel) + " - internal_upperbound"
    #
    # Boundary Cases (External, Internal and Thread Block)
    # To Write Code for Boundary Cases
    #
    if cond_boundary_right_ext == 1 or cond_boundary_right_int == 1 or cond_boundary_right_tbx == 1:
        #
        #
        #
        tc_code_kernel_helper.code_kernel_load_input_right_boundary_case(
            f, cond_boundary_right_ext, cond_boundary_right_tbx,
            cond_boundary_right_int, str_cond_gen_external, str_cond_gen_tb_x,
            str_cond_gen_internal)
    #
    # Re-derive the TB/REG split for the right input (unconditionally, since
    # the lists above are only filled when opt_gen_full == 1).
    #
    l_right_indices_target_temp = list()
    l_right_indices_reg_temp = list()
    #
    for each_idx in l_right_indices:
        if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg, each_idx) == -1:
            if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1:
                l_right_indices_target_temp.append(each_idx)
        else:
            l_right_indices_reg_temp.append(each_idx)
    # To Calculate Length of Indices' Tile-Size mapped on TB
    size_len_external_tiles = 1
    for each_target_index in l_right_indices_target_temp:
        size_len_external_tiles = size_len_external_tiles * tc_helper.tc_gen_helper_find(
            l_t3_slices, each_target_index)
    # Assumed only one index is mapped on REG
    size_reg_tile = tc_helper.tc_gen_helper_find(
        l_t3_slices, l_right_indices_reg_temp[0])
    #print ("size_tb_x: ", size_tb_x)
    #print ("size_tb_y: ", size_tb_y)
    #print ("size_len_external_tiles: ", size_len_external_tiles)
    opt_gen_full_special_case = -1
    #
    # TB_X will load |E_K|.
    #
    if opt_load_v2 == -1:
        #
        #
        if size_tb_y > size_len_external_tiles:
            # how many steps for register tile can be covered by TB_X
            len_covered_reg = int(size_tb_y / size_len_external_tiles)
            if len_covered_reg > 1:
                opt_gen_full_special_case = 1
        else:
            len_covered_reg = 1
    #
    # TB_X will load |E_RIGHT|.
    #
    else:
        #
        #
        if size_tb_x > size_len_external_tiles:
            # how many steps for register tile can be covered by TB_X
            len_covered_reg = int(size_tb_x / size_len_external_tiles)
            if len_covered_reg > 1:
                opt_gen_full_special_case = 1
        else:
            len_covered_reg = 1
    #
    # Step 2: This "For-Statement" is related to "Regiter-Tile."
    #         The size of Thread Block depends on Indices' Tile-Size mapped on Thread Block.
    #         However, when the lengh of a dimension along thread block which load inputs can cover some "Register-Tile,"
    #         then, we need to change the ranges.
    #
    tc_code_kernel_helper.code_kernel_load_input_right_for_statement(
        f, opt_gen_full, tensor_contraction[1][2], opt_gen_full_special_case,
        size_len_reg_tiles_right, size_reg_tile, len_covered_reg,
        l_t3_mapping_reg)
    #
    # free
    #
    del l_right_indices_target
    del l_right_indices_reg
    del l_right_indices_target_temp
    del l_right_indices_reg_temp
    #
    # Build "SIZE_SLICE_<k>_<Idx> * ..." product over the right input's
    # external indices.
    #
    str_str_v2 = ""
    idx_count = 0
    for each_idx in tensor_contraction[1][4]:
        if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1:
            if idx_count != 0:
                str_str_v2 = str_str_v2 + " * "
            str_str_v2 = str_str_v2 + "SIZE_SLICE_" + str(
                idx_kernel) + "_" + each_idx.capitalize()
            idx_count = idx_count + 1
    #
    # [Sub-Routine] Load Tensor Inputs to sm_b[][]
    #
    if len(l_input_strides) > 0:
        tc_code_kernel_load_inputs_details.tc_gen_code_Kernel_Load_Inputs_Right(
            f, tensor_contraction, l_internal_idx, opt_load_v2, size_tb_x,
            size_tb_y, size_sm_p7, size_len_external_tiles_right, str_str_v2,
            num_internal_indices, idx_kernel, l_input_strides[2],
            opt_pre_computed, l_t3_mapping_tb_2D, l_t3_mapping_reg,
            opt_gen_full, opt_gen_p7, l_t3_slices)
    else:
        tc_code_kernel_load_inputs_details.tc_gen_code_Kernel_Load_Inputs_Right(
            f, tensor_contraction, l_internal_idx, opt_load_v2, size_tb_x,
            size_tb_y, size_sm_p7, size_len_external_tiles_right, str_str_v2,
            num_internal_indices, idx_kernel, l_input_strides,
            opt_pre_computed, l_t3_mapping_tb_2D, l_t3_mapping_reg,
            opt_gen_full, opt_gen_p7, l_t3_slices)
    #
    # END: To Load Inputs
    #
    f.write("\t\t}\n")
    f.write("\t\t__syncthreads();\n")
    f.write("\n")
def get_configurations(l_outer_groups, list_configurations_outer_group,
                       tmp_count, tmp_config, opt_print, opt_data_type):
    """Enumerate, score, and select one kernel configuration per tensor contraction.

    Assumes (PACT 2018 setting) one tensor contraction per outer group, and a
    single outer group overall when reading the representative problem size.

    Steps per tensor contraction:
      1. Build a per-index representative problem size from the TCCG benchmark
         entry (l_outer_groups[0][1][0][9]); on a length mismatch every index
         falls back to size 16.
      2. If one of the two input tensors has exactly one external index, split
         that index into "<idx>1"/"<idx>2" (mutating the tensor-contraction
         lists, the problem-size list and all_indices in place) so mappings can
         be enumerated freely; otherwise splitting stays "limited".
      3. Enumerate candidate configurations (alg_enumeration_pruning), attach
         costs (cost_model) and predictions (prediction_model), sort by total
         cost, and pick the minimum-cost configuration (ties broken by fewer
         main-loop steps).
      4. Append the selection to list_configurations_outer_group (an output
         parameter) and the split bookkeeping to each outer group.

    Parameters
    ----------
    l_outer_groups : list of [base, list_of_tcs, all_indices]; mutated in place.
    list_configurations_outer_group : output list; one configuration is
        appended per tensor contraction.
    tmp_count : benchmark id; used only for logging here.
    tmp_config : manual configuration index; -1 (or an out-of-range value)
        means use the model-selected configuration.
    opt_print : not referenced in this body.
    opt_data_type : forwarded to the enumeration algorithm.
    """
    print(
        "=========================== [Configurations] ==============================="
    )
    #print (" # of Outer-Groups: ", len(l_outer_groups))
    # Assumed that there is only one tensor contraction (PACT 2018):
    # 1 outer group has 1 tensor contraction, which will be grouped
    # by 1 inner group.
    #
    # Representative problem size: one [index, size] pair per index.
    #
    list_representative_problem_size = list()
    #
    # Assumption: there is only one outer group and only one tensor
    # contraction in that outer group.
    #
    list_indices = l_outer_groups[0][2]
    #
    # TCCG benchmark problem sizes for this contraction.
    #
    #list_tccg_representative_problem_size = tccg_problem_size.get_tccg_representative_problem_sizes(tmp_count)
    list_tccg_representative_problem_size = l_outer_groups[0][1][0][9]
    #
    # On a length mismatch, fall back to a default size of 16 per index.
    #
    if len(list_indices) != len(list_tccg_representative_problem_size):
        print("list_representative_problem_size from TCCG Benchmark: ",
              list_tccg_representative_problem_size)
        print("len(list_indices): ", len(list_indices),
              " vs len(list_tccg_representative_problem_size): ",
              len(list_tccg_representative_problem_size))
        for idx_count in range(0, len(list_indices)):
            list_representative_problem_size.append(
                [list_indices[idx_count], 16])
        #print ("[ERROR] src.generators.configurations.get_configurations()")
        #sys.exit()
    else:
        for idx_count in range(0, len(list_indices)):
            list_representative_problem_size.append([
                list_indices[idx_count],
                list_tccg_representative_problem_size[idx_count]
            ])

    print(" (TCCG) Representative Problem Size--- ", tmp_count)
    print(" : ", list_representative_problem_size)
    #
    # [Result] List of Configurations
    # NOTE(review): list_configurations_temp is never used below — confirm
    # whether it is leftover scaffolding.
    #
    list_configurations_temp = list()
    #
    # Per outer group.
    #
    idx_outer_count = 1
    for each_outer_group in l_outer_groups:
        print(" > Outer-Group #. ", idx_outer_count)
        #
        base_outer_group = each_outer_group[0]  # not referenced below
        list_tc = each_outer_group[1]
        all_indices = each_outer_group[2]
        list_info_split = list()
        #
        # For each tensor contraction in this outer group.
        # NOTE(review): idx_tc_count is never incremented, so the log always
        # prints "[ 1 ]" — confirm whether that is intentional.
        #
        idx_tc_count = 1
        for each_tc in list_tc:
            print(" >> Tensor-Contraction [", idx_tc_count, "] ")
            print(" :", each_tc)
            #
            # Default: Input(Left) ---> TB_X and REG_X.
            # But if Input(Right) has the FVI in the output,
            # Input(Right) ---> TB_X and REG_X.
            #
            list_output_tensor = each_tc[1]
            list_internal_indices = each_tc[3]
            list_input_tensor_left = each_tc[5]
            list_input_tensor_right = each_tc[7]
            list_info_idx_split = []
            #
            # Number of external (non-contracted) indices per input tensor.
            #
            num_ext_left = len(list_input_tensor_left) - len(
                list_internal_indices)
            num_ext_right = len(list_input_tensor_right) - len(
                list_internal_indices)
            #
            # opt_limited_split (0: free, 1: limited)
            #
            opt_limited_split = 1
            if num_ext_left == 1 or num_ext_right == 1:
                print(
                    "[Code Generator][Configurations] One of Input Tensors has only one external index, resulting in splitting freely."
                )
                #
                # Tensor (Left): split its single external index in two.
                #
                if num_ext_left == 1:
                    print("(L) To Split First: ", list_input_tensor_left)
                    #
                    # Find the (unique) external index in the left input and
                    # replace it with "<idx>1", "<idx>2" in place.
                    #
                    idx_count = 0
                    prev_idx = ""
                    for each_idx in list_input_tensor_left:
                        #
                        # External index (|external indices| == 1 here).
                        #
                        if tc_helper.tc_gen_helper_find_1d(
                                list_internal_indices, each_idx) == -1:
                            prev_idx = each_idx
                            each_tc[5].insert(idx_count, each_idx + "1")
                            each_tc[5].insert(idx_count + 1, each_idx + "2")
                            each_tc[5].pop(idx_count + 2)
                            break
                        #
                        idx_count += 1
                    #
                    # Mirror the split in the output tensor and record it.
                    #
                    idx_count = 0
                    for each_idx in list_output_tensor:
                        if each_idx == prev_idx:
                            each_tc[1].insert(idx_count, each_idx + "1")
                            each_tc[1].insert(idx_count + 1, each_idx + "2")
                            each_tc[1].pop(idx_count + 2)
                            list_info_split.append(
                                [each_idx, each_idx + "1", each_idx + "2"])
                            list_info_idx_split.append(
                                [each_idx, each_idx + "1", each_idx + "2"])
                            break
                        #
                        idx_count += 1
                    #
                    # Mirror the split in the representative problem size.
                    # NOTE(review): unlike the other lists, the original
                    # [idx, size] entry is NOT popped here, so it remains in
                    # the list after the two split entries — confirm this is
                    # intended.
                    #
                    idx_count = 0
                    for each_element in list_representative_problem_size:
                        #
                        if each_element[0] == prev_idx:
                            list_representative_problem_size.insert(
                                idx_count,
                                [each_element[0] + "1", each_element[1]])
                            list_representative_problem_size.insert(
                                idx_count + 1,
                                [each_element[0] + "2", each_element[1]])
                            break
                        #
                        idx_count += 1
                    #
                    # [Outer-Group] Assumption: only one tensor contraction,
                    # so all_indices can be patched directly.
                    #
                    idx_count = 0
                    for each_idx in all_indices:
                        #
                        if each_idx == prev_idx:
                            all_indices.insert(idx_count, each_idx + "1")
                            all_indices.insert(idx_count + 1, each_idx + "2")
                            all_indices.pop(idx_count + 2)
                            break
                        #
                        idx_count += 1
                #
                # Tensor (Right): same treatment as the left input.
                #
                if num_ext_right == 1:
                    print("(R) To Split First: ", list_input_tensor_right)
                    #
                    # Find the (unique) external index in the right input and
                    # replace it with "<idx>1", "<idx>2" in place.
                    #
                    idx_count = 0
                    prev_idx = ""
                    for each_idx in list_input_tensor_right:
                        #
                        # External index (|external indices| == 1 here).
                        #
                        if tc_helper.tc_gen_helper_find_1d(
                                list_internal_indices, each_idx) == -1:
                            prev_idx = each_idx
                            each_tc[7].insert(idx_count, each_idx + "1")
                            each_tc[7].insert(idx_count + 1, each_idx + "2")
                            each_tc[7].pop(idx_count + 2)
                            break
                        #
                        idx_count += 1
                    #
                    # Mirror the split in the output tensor and record it.
                    #
                    idx_count = 0
                    for each_idx in list_output_tensor:
                        if each_idx == prev_idx:
                            each_tc[1].insert(idx_count, each_idx + "1")
                            each_tc[1].insert(idx_count + 1, each_idx + "2")
                            each_tc[1].pop(idx_count + 2)
                            list_info_split.append(
                                [each_idx, each_idx + "1", each_idx + "2"])
                            list_info_idx_split.append(
                                [each_idx, each_idx + "1", each_idx + "2"])
                            break
                        #
                        idx_count += 1
                    #
                    # Mirror the split in the representative problem size.
                    # NOTE(review): the original entry is NOT popped here
                    # either — see the note in the left-tensor branch.
                    #
                    idx_count = 0
                    for each_element in list_representative_problem_size:
                        #
                        if each_element[0] == prev_idx:
                            list_representative_problem_size.insert(
                                idx_count,
                                [each_element[0] + "1", each_element[1]])
                            list_representative_problem_size.insert(
                                idx_count + 1,
                                [each_element[0] + "2", each_element[1]])
                            break
                        #
                        idx_count += 1
                    #
                    # [Outer-Group] Assumption: only one tensor contraction.
                    #
                    idx_count = 0
                    for each_idx in all_indices:
                        #
                        if each_idx == prev_idx:
                            all_indices.insert(idx_count, each_idx + "1")
                            all_indices.insert(idx_count + 1, each_idx + "2")
                            all_indices.pop(idx_count + 2)
                            break
                        #
                        idx_count += 1
                #
                # This option (opt_limited_split) is related to "Interface".
                #
                opt_limited_split = 0
            else:
                print(
                    "[Code Generator][Configurations] Both Input Tensors have at lease two external indices, resulting in splitting exclusively."
                )
            #
            # Input:  an equation for a tensor contraction and a
            #         representative problem size.
            # Output: a list of configurations.
            #
            # New mapping algorithms.
            #
            list_temp = alg_configurations.alg_enumeration_pruning(
                each_tc, list_info_idx_split, list_representative_problem_size,
                opt_limited_split, 0, opt_data_type)
            print(
                "[Code Generator][Configurations] configurations: # of Configurations--- Total: ",
                len(list_temp))
            #
            if len(list_temp) < 1:
                print(
                    "[Code Generator][Configurations] ERROR: Problem(s) in Enumerating Configurations"
                )
                sys.exit()
            #
            # Models: each configuration gets its own cost and prediction.
            #
            cost_model.cost_model_total(list_temp, 0)
            prediction_model.model_predictive_modeling(list_temp)
            #
            #list_temp[0].print_configuration(0)
            #
            list_temp.sort(key=lambda x: x.cost_total)
            #
            # Pick the configuration with the minimum cost; ties are broken by
            # the smaller number of main-loop steps.
            #
            idx_configuration = 0
            min_cost = 1000000000000
            min_steps = 1000000000000
            idx_count = 0
            for each_config in list_temp:
                if min_cost > each_config.cost_total:
                    min_cost = each_config.cost_total
                    min_steps = each_config.steps_main_loops
                    idx_configuration = idx_count
                # (Right after the update above this comparison is true for
                # the same element; the tie-break is then a no-op because
                # min_steps was just set to this element's value.)
                if min_cost == each_config.cost_total:
                    if min_steps > each_config.steps_main_loops:
                        min_steps = each_config.steps_main_loops
                        idx_configuration = idx_count
                #
                idx_count += 1
            #
            print("[Code Generator][Configurations] # ", idx_configuration,
                  " in ", len(list_temp))
            #
            # Manual override via tmp_config (ignored when -1 or out of range).
            #
            if tmp_config < len(list_temp) and tmp_config != -1:
                print("[Code Generator][Configurations] manually picked # ",
                      tmp_config)
                list_configurations_outer_group.append(list_temp[tmp_config])
            else:
                list_configurations_outer_group.append(
                    list_temp[idx_configuration])
            #
            #for rank in range(0, len(list_temp)):
            #    list_temp[rank].print_configuration(0,str(rank))
            #
        each_outer_group.append(list_info_split)
        idx_outer_count = idx_outer_count + 1
    #
    #
    #
    print(
        "============================================================================"
    )
def tc_gen_code_kernel_load_inputs_base(
        f, opt_gen_ext, opt_gen_int, opt_load_left, opt_load_right,
        opt_internal, tensor_contraction, l_t3_slices, l_internal_idx,
        l_t3_mapping_tb_2D, l_t3_mapping_reg, size_smem_k, size_tb_x,
        size_tb_y, idx_kernel):
    """Emit the shared-memory load section (both inputs) of a generated kernel.

    Writes, via tc_helper / tc_code_kernel_load_inputs_abstract, the CUDA code
    that loads the left and the right input tensors into shared memory,
    followed by a __syncthreads().

    Parameters
    ----------
    f : open file the generated CUDA code is written to.
    opt_gen_ext, opt_gen_int : boundary-generation options (external/internal).
    opt_load_left, opt_load_right : per-input load options.
    opt_internal : internal-index option, forwarded to the abstract loader.
    tensor_contraction : [left, right] descriptors; [i][2] is the register
        axis tag (compared against "x") and [i][4] the input's index list.
    l_t3_slices : [index, tile-size] pairs.
    l_internal_idx : internal (contracted) indices.
    l_t3_mapping_tb_2D, l_t3_mapping_reg : thread-block / register mappings.
    size_smem_k : shared-memory extent along the internal (K) indices.
    size_tb_x, size_tb_y : thread-block extents.
    idx_kernel : kernel number used in generated names.
    """
    #
    # Indentation level of the emitted lines.
    #
    num_code_tabs = 2
    #
    tc_helper.tc_gen_helper_code_a_line(
        f, num_code_tabs,
        "//---------------------------------------------------------------------------------------------------",
        1)
    tc_helper.tc_gen_helper_code_a_line(f, num_code_tabs,
                                        "// This is for the new version", 1)
    #
    # >>> Base Form of the generated load <<<
    # if (idx_a < rng_c2 && threadIdx.y < SIZE_INT_UNIT_1 - internal_upperbound) // boundary: external-smem, internal
    # if (threadIdx.x < 8) // boundary: |TB| and |K|
    # {
    #     for (int ll = 0; ll < rng_c1; ll++) // related to |REG| && boundary: external0-reg
    #     {
    #         sm_b[][] = dev_v2[ext_addr + int_addr];
    #     }
    # }
    # __syncthreads();
    #
    #print ("tensor_contraction: ", tensor_contraction)
    #
    # Which axis is mapped on REG for Input-Left and Input-Right.
    #
    opt_axis_reg_left = 0
    opt_axis_reg_right = 0
    if tensor_contraction[0][2] != "x":
        opt_axis_reg_left = 1
    if tensor_contraction[1][2] != "x":
        opt_axis_reg_right = 1
    #
    # Tile-length products for the left input: indices mapped on register
    # tiling accumulate into size_len_reg_tiles_left, external TB-mapped
    # indices into size_len_external_tiles_left; internal indices contribute
    # to neither (they are covered by size_smem_k).
    # NOTE(review): the else-attachment below was reconstructed from
    # whitespace-mangled source — confirm against the upstream repository.
    #
    size_len_external_tiles_left = 1
    size_len_reg_tiles_left = 1
    for each_idx in tensor_contraction[0][4]:
        if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg, each_idx) == -1:
            if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1:
                size_len_external_tiles_left = size_len_external_tiles_left * tc_helper.tc_gen_helper_find(
                    l_t3_slices, each_idx)
        else:
            size_len_reg_tiles_left = size_len_reg_tiles_left * tc_helper.tc_gen_helper_find(
                l_t3_slices, each_idx)
    #
    # Same tile-length products for the right input.
    #
    size_len_external_tiles_right = 1
    size_len_reg_tiles_right = 1
    for each_idx in tensor_contraction[1][4]:
        if tc_helper.tc_gen_helper_find_1d(l_t3_mapping_reg, each_idx) == -1:
            if tc_helper.tc_gen_helper_find_1d(l_internal_idx, each_idx) == -1:
                size_len_external_tiles_right = size_len_external_tiles_right * tc_helper.tc_gen_helper_find(
                    l_t3_slices, each_idx)
        else:
            size_len_reg_tiles_right = size_len_reg_tiles_right * tc_helper.tc_gen_helper_find(
                l_t3_slices, each_idx)
    print("[Code Generator][New][Load][Input] size_len_ext_left: ",
          size_len_external_tiles_left)
    print("[Code Generator][New][Load][Input] size_len_ext_right: ",
          size_len_external_tiles_right)
    print("[Code Generator][New][Load][Input] size_len_reg_left: ",
          size_len_reg_tiles_left)
    print("[Code Generator][New][Load][Input] size_len_reg_right: ",
          size_len_reg_tiles_right)
    #
    # [Load-Input][Left]
    #
    tc_helper.tc_gen_helper_code_a_line(
        f, num_code_tabs, "// This Part is for Loading Input-Left", 1)
    #
    tc_code_kernel_load_inputs_abstract.tc_gen_code_Kernel_Load_Inputs_Abstracts(
        f,
        num_code_tabs,
        #
        tensor_contraction[0],
        # options
        opt_load_left,
        1,
        1,
        opt_gen_ext,
        opt_gen_int,
        opt_axis_reg_left,  # need to make automatically
        opt_internal,
        # lists
        l_t3_slices,
        l_internal_idx,
        l_t3_mapping_tb_2D,
        l_t3_mapping_reg,
        # sizes
        size_len_external_tiles_left,
        size_len_reg_tiles_left,
        size_smem_k,
        size_tb_x,
        size_tb_y,
        idx_kernel)
    #
    tc_helper.tc_gen_helper_code_a_line(f, num_code_tabs, "", 1)
    #
    # [Load-Input][Right]
    #
    tc_helper.tc_gen_helper_code_a_line(
        f, num_code_tabs, "// This Part is for Loading Input-Right", 1)
    #
    tc_code_kernel_load_inputs_abstract.tc_gen_code_Kernel_Load_Inputs_Abstracts(
        f,
        num_code_tabs,
        #
        tensor_contraction[1],
        # options
        opt_load_right,
        2,
        2,
        opt_gen_ext,
        opt_gen_int,
        opt_axis_reg_right,
        opt_internal,
        # lists
        l_t3_slices,
        l_internal_idx,
        l_t3_mapping_tb_2D,
        l_t3_mapping_reg,
        # sizes
        size_len_external_tiles_right,
        size_len_reg_tiles_right,
        size_smem_k,
        size_tb_x,
        size_tb_y,
        idx_kernel)
    #
    # END: after loading both inputs, synchronize the thread block.
    #
    tc_helper.tc_gen_helper_code_a_line(f, num_code_tabs, "__syncthreads();",
                                        1)
    #
    tc_helper.tc_gen_helper_code_a_line(
        f, num_code_tabs,
        "//---------------------------------------------------------------------------------------------------",
        1)
    tc_helper.tc_gen_helper_code_a_line(f, num_code_tabs, "\n", 1)
def tc_gen_code(tmp_count, inner_groups):
    """Generate a complete "temp_<tmp_count>.cu" CUDA file for the inner groups.

    Each inner group corresponds to one kernel (fused tensor contractions).
    Emits, in order: includes, #defines, global variables, SD2 helper
    functions, cudaMalloc/Memcpy code, pre-computed (indirect) arrays, basic
    blocks, the kernels, the fused-kernel launcher, a correctness check,
    device/host free code, and main().

    Inner-group layout used here (by position):
      [0] base info, [1] TB mapping ([x-axis indices, y-axis indices]),
      [2] register-mapped indices, [3] index sizes, [4] external indices,
      [5] internal indices, [6] input tensors, [7] output-related info,
      [8] tile sizes (t3 slices).

    NOTE(review): l_device_dynamic, l_cuda_malloc, l_host_dynamic and
    l_blk_boundary_rng are read below but never initialized in this function
    (tc_gen_code_new creates its own local copies) — presumably module-level
    globals, or this is legacy code superseded by tc_gen_code_new; confirm
    before relying on it.
    """
    #
    # This is for calculating the cost function based on a given input.
    #
    output_name = "temp"  # Depends on # of groups.
    #
    # FILE OPEN: a list of inner groups maps to a single CUDA file.
    # Each inner group has multiple tensor contractions which will be fused,
    # so each inner group is for a single kernel.
    #
    f = open(output_name + "_" + str(tmp_count) + ".cu", "w")
    #
    # Per inner group,
    #
    tc_code_include.tc_code_include(f)
    #tc_code_etc.tc_gen_global_methods(f, len(inner_groups))
    #
    # Accumulators, one entry per inner group (kernel).
    #
    l_combined_opt_diffs = list()
    l_combined_opt_gen_fulls = list()
    l_combined_opt_gen_internal = list()
    l_combined_input_tensors = list()
    l_combined_t3_slices_size = list()
    l_combined_mappings = list()
    l_combined_t3_d_decl_var = list()
    l_combined_t2_d_decl_var = list()
    l_combined_v2_d_decl_var = list()
    l_combined_t3_parameters = list()
    l_combined_t3_parameters_f = list()
    l_combined_t3_parameters_nf = list()
    l_combined_t2_parameters = list()
    l_combined_t2_parameters_f = list()
    l_combined_t2_parameters_nf = list()
    l_combined_v2_parameters = list()
    l_combined_v2_parameters_f = list()
    l_combined_v2_parameters_nf = list()
    l_combined_device_dynamic = list()
    l_combined_host_dynamic = list()
    l_combined_cuda_malloc = list()
    #
    # To support multiple inner groups.
    #
    for each_inner_group in inner_groups:
        l_combined_input_tensors.append(each_inner_group[6])
        l_combined_t3_slices_size.append(each_inner_group[8])
        l_combined_mappings.append([each_inner_group[1], each_inner_group[2]])
    #
    tc_code_define.tc_gen_definition(f, l_combined_t3_slices_size,
                                     inner_groups[0][3], inner_groups[0][4],
                                     l_combined_mappings, inner_groups[0][4],
                                     inner_groups[0][5],
                                     l_combined_input_tensors)
    #
    # Check Types: possible_diff, opt_gen_full and opt_gen_p7 per kernel.
    # NOTE(review): possible_diff is logged but -1 is stored instead.
    #
    idx_kernel = 1
    for each_inner_group in inner_groups:
        opt_gen_full, opt_gen_p7, possible_diff = tc_helper.tc_gen_helper_CheckingTypes(
            each_inner_group[3], each_inner_group[8], each_inner_group[4])
        # possible_diff, opt_gen_full, and opt_gen_p7 are created at here.
        #l_combined_opt_diffs.append(possible_diff)
        l_combined_opt_diffs.append(-1)
        l_combined_opt_gen_fulls.append(opt_gen_full)
        l_combined_opt_gen_internal.append(opt_gen_p7)
        print("[Code Generator][tc_gen_code] Kernel #", idx_kernel,
              ">>> opt_diff:", possible_diff, "(but -1), opt_gen_full:",
              opt_gen_full, ", opt_gen_p7:", opt_gen_p7)
        idx_kernel = idx_kernel + 1
    #
    # Code: Global Variables (Common)
    #
    tc_code_globalvar.tc_gen_global_variables_common(f)
    #
    idx_kernel = 0
    for each_inner_group in inner_groups:
        #
        # Per-kernel declaration/parameter lists filled by the callee.
        #
        l_t3_d_decl_var = list()
        l_t2_d_decl_var = list()
        l_v2_d_decl_var = list()
        l_t3_parameters = list()
        l_t2_parameters = list()
        l_v2_parameters = list()
        l_t3_parameters_nf = list()
        l_t2_parameters_nf = list()
        l_v2_parameters_nf = list()
        l_t3_parameters_f = list()
        l_t2_parameters_f = list()
        l_v2_parameters_f = list()
        #
        # Code: Global Variables (Tensor Contractions)
        #
        tc_code_globalvar.tc_gen_global_variables(
            f, each_inner_group[6], each_inner_group[4], each_inner_group[5],
            l_t3_d_decl_var, l_t3_parameters, l_t3_parameters_nf,
            l_t2_parameters_nf, l_v2_parameters_nf, l_t3_parameters_f,
            l_t2_parameters_f, l_v2_parameters_f, l_device_dynamic,
            l_t2_d_decl_var, l_v2_d_decl_var, l_t2_parameters,
            l_v2_parameters, l_cuda_malloc,
            l_combined_opt_diffs[idx_kernel], idx_kernel + 1)
        #
        l_combined_t3_d_decl_var.append(l_t3_d_decl_var)
        l_combined_t2_d_decl_var.append(l_t2_d_decl_var)
        l_combined_v2_d_decl_var.append(l_v2_d_decl_var)
        l_combined_t3_parameters.append(l_t3_parameters)
        l_combined_t2_parameters.append(l_t2_parameters)
        l_combined_v2_parameters.append(l_v2_parameters)
        l_combined_t3_parameters_f.append(l_t3_parameters_f)
        l_combined_t2_parameters_f.append(l_t2_parameters_f)
        l_combined_v2_parameters_f.append(l_v2_parameters_f)
        l_combined_t3_parameters_nf.append(l_t3_parameters_nf)
        l_combined_t2_parameters_nf.append(l_t2_parameters_nf)
        l_combined_v2_parameters_nf.append(l_v2_parameters_nf)
        #
        idx_kernel = idx_kernel + 1
    #
    # Code: SD2 Functions
    #
    idx_kernel = 1
    for each_inner_group in inner_groups:
        tc_pre_SD2_Functions.tc_gen_code_pre_SD2_Functions(
            f, each_inner_group[4], each_inner_group[5], each_inner_group[6],
            l_host_dynamic, idx_kernel)
        idx_kernel = idx_kernel + 1
    #
    # Code: CUDA Malloc | Memcpy
    # (Uses the parameter lists of the LAST inner group from the loop above.)
    #
    tc_pre_CUDA_Malloc.tc_gen_code_pre_CUDA_Malloc(
        f, l_cuda_malloc, l_t3_parameters, l_t2_parameters, l_v2_parameters,
        l_device_dynamic, inner_groups[0][5])
    #
    # Code: Pre-Computed Arrays
    #
    idx_kernel = 1
    for each_inner_group in inner_groups:
        tc_pre_IndirectArray.tc_gen_code_pre_IndirectArray(
            f, each_inner_group[4], each_inner_group[0], each_inner_group[6],
            each_inner_group[2], each_inner_group[4], each_inner_group[5],
            l_host_dynamic, l_combined_opt_diffs[idx_kernel - 1], idx_kernel)
        idx_kernel = idx_kernel + 1
    #
    # Code: BasicBlock
    #
    idx_kernel = 1
    for each_inner_group in inner_groups:
        tc_pre_BasicBlock.tc_gen_code_pre_BasicBlock(
            f, each_inner_group[4], each_inner_group[3], each_inner_group[8],
            l_host_dynamic, each_inner_group[4], each_inner_group[5],
            l_combined_opt_diffs[idx_kernel - 1], idx_kernel)
        idx_kernel = idx_kernel + 1
    #
    print("[Code Generator][tc_gen_code] # of Inner-Groups: ",
          len(inner_groups))
    #
    # Each inner group corresponds to a kernel.
    #
    idx_kernel = 1
    for an_inner_group in inner_groups:
        print("[Code Generator][tc_gen_code] Creating Kernel --- #",
              idx_kernel)
        #
        kernel_name = "kernel_ccsdT_" + str(
            idx_kernel)  # Depends on # of groups.
        # Options for each kernel.
        possible_diff = l_combined_opt_diffs[idx_kernel - 1]
        opt_gen_full = l_combined_opt_gen_fulls[idx_kernel - 1]
        opt_gen_p7 = l_combined_opt_gen_internal[idx_kernel - 1]
        #
        # To-Do: check if boundaries are needed by all tensor contractions,
        # or not.
        #
        if opt_gen_full != -1:
            # Inputs: l_blk_boundary_rng, l_idx_size, l_t3_slices,
            #         l_t3_mapping_reg, l_t3_mapping_tb_2D,
            #         info_left_index, info_right_index
            # Outputs: l_blk_boundary_rng
            tc_helper.tc_gen_helper_CheckingBoundary(
                l_blk_boundary_rng, an_inner_group[3], an_inner_group[8],
                an_inner_group[2], an_inner_group[1],
                an_inner_group[6][0][0][1], an_inner_group[6][0][1][1])
            print(">>> Boundaries for External Indices: ", l_blk_boundary_rng)
        # (....)
        #possible_diff = -1 # (To-Do: )
        #
        # Inputs:  l_input_tensors, l_internal_idx, l_external_idx,
        #          l_t3_slices, l_t3_mapping_reg
        # Outputs: int_size_sm_a, int_size_sm_b, int_str_t2, int_str_v2
        # (Only the last tensor contraction's values survive this loop.)
        #
        for each_tc in an_inner_group[6]:
            #
            int_size_sm_a = 1
            int_size_sm_b = 1
            int_str_t2 = 1
            int_str_v2 = 1
            #
            bool_found = 1  # NOTE(review): set but never read — confirm dead.
            for each_idx in each_tc[0][1]:
                # for size_sm_a: non-internal indices of the left input
                if tc_helper.tc_gen_helper_find_1d(an_inner_group[5],
                                                   each_idx) == -1:
                    int_size_sm_a = int_size_sm_a * tc_helper.tc_gen_helper_find(
                        an_inner_group[8], each_idx)
                # for str_str_t2
                if tc_helper.tc_gen_helper_find_1d(
                        an_inner_group[4],
                        each_idx) != -1:  # external indices (all)
                    if tc_helper.tc_gen_helper_find_1d(
                            an_inner_group[2], each_idx
                    ) == -1:  # external indices NOT mapped on register tiling
                        int_str_t2 = int_str_t2 * tc_helper.tc_gen_helper_find(
                            an_inner_group[8], each_idx)
            bool_found = 1
            for each_idx in each_tc[1][1]:
                # for size_sm_b: non-internal indices of the right input
                if tc_helper.tc_gen_helper_find_1d(an_inner_group[5],
                                                   each_idx) == -1:
                    int_size_sm_b = int_size_sm_b * tc_helper.tc_gen_helper_find(
                        an_inner_group[8], each_idx)
                # for str_str_v2
                if tc_helper.tc_gen_helper_find_1d(
                        an_inner_group[4],
                        each_idx) != -1:  # external indices (all)
                    if tc_helper.tc_gen_helper_find_1d(
                            an_inner_group[2], each_idx
                    ) == -1:  # external indices NOT mapped on register tiling
                        int_str_v2 = int_str_v2 * tc_helper.tc_gen_helper_find(
                            an_inner_group[8], each_idx)
        #
        # Inputs:  l_t3_mapping_tb_2D, l_t3_slices
        # Outputs: int_size_tb_x, int_size_tb_y
        #
        int_size_tb_x = 1
        int_size_tb_y = 1
        for each_idx in an_inner_group[1][0]:  # "x"-axis
            int_size_tb_x = int_size_tb_x * tc_helper.tc_gen_helper_find(
                an_inner_group[8], each_idx)
        for each_idx in an_inner_group[1][1]:  # "y"-axis
            int_size_tb_y = int_size_tb_y * tc_helper.tc_gen_helper_find(
                an_inner_group[8], each_idx)
        # The below information should be produced by "tc_gen_input()".
        size_sm_a = int_size_sm_a  #+ 1 (padding)
        size_sm_b = int_size_sm_b  #+ 1
        size_tb_x = int_size_tb_x
        size_tb_y = int_size_tb_y
        size_sm_p7 = tc_helper.tc_gen_helper_CheckingIntUnit(
            an_inner_group[4], an_inner_group[8], an_inner_group[5])
        size_reg_y = tc_helper.tc_gen_helper_find(an_inner_group[8],
                                                  an_inner_group[2][1])
        size_reg_x = tc_helper.tc_gen_helper_find(an_inner_group[8],
                                                  an_inner_group[2][0])
        #
        # Options for the two inputs.
        # Inputs: l_input_tensors, l_internal_idx
        #
        opt_load_t2, opt_load_v2 = tc_helper.tc_gen_helper_CheckingInternalFVI(
            an_inner_group[6], an_inner_group[5])
        #
        # Constraints
        # Inputs: f, size_tb_x, size_tb_y, size_sm_a, size_sm_b, size_sm_p7
        #
        tc_gen_Constraints(f, size_tb_x, size_tb_y, size_sm_a, size_sm_b,
                           size_sm_p7)
        #
        l_t3_d_decl_var = l_combined_t3_d_decl_var[idx_kernel - 1]
        l_t2_d_decl_var = l_combined_t2_d_decl_var[idx_kernel - 1]
        l_v2_d_decl_var = l_combined_v2_d_decl_var[idx_kernel - 1]
        l_t3_parameters = l_combined_t3_parameters[idx_kernel - 1]
        l_t2_parameters = l_combined_t2_parameters[idx_kernel - 1]
        l_v2_parameters = l_combined_v2_parameters[idx_kernel - 1]
        l_t3_parameters_f = l_combined_t3_parameters_f[idx_kernel - 1]
        l_t2_parameters_f = l_combined_t2_parameters_f[idx_kernel - 1]
        l_v2_parameters_f = l_combined_v2_parameters_f[idx_kernel - 1]
        l_t3_parameters_nf = l_combined_t3_parameters_nf[idx_kernel - 1]
        l_t2_parameters_nf = l_combined_t2_parameters_nf[idx_kernel - 1]
        #
        # >>> Create Kernels <<<
        # (Both branches emit the same kernel; when possible_diff != -1 an
        # additional "<name>_full" variant is also emitted with
        # opt_gen_full forced to -1.)
        #
        if possible_diff == -1:
            tc_code_kernel.tc_gen_code_Kernel(
                f, kernel_name, l_t3_d_decl_var, l_t2_d_decl_var,
                l_v2_d_decl_var, an_inner_group[7], an_inner_group[1],
                an_inner_group[2], an_inner_group[4], an_inner_group[5],
                an_inner_group[8], size_sm_a, size_sm_b, size_sm_p7,
                size_reg_y, size_reg_x, size_tb_y, size_tb_x, int_str_t2,
                int_str_v2, l_blk_boundary_rng, opt_gen_p7, opt_gen_full,
                opt_load_t2, opt_load_v2, idx_kernel)
        else:
            tc_code_kernel.tc_gen_code_Kernel(
                f, kernel_name, l_t3_d_decl_var, l_t2_d_decl_var,
                l_v2_d_decl_var, an_inner_group[7], an_inner_group[1],
                an_inner_group[2], an_inner_group[4], an_inner_group[5],
                an_inner_group[8], size_sm_a, size_sm_b, size_sm_p7,
                size_reg_y, size_reg_x, size_tb_y, size_tb_x, int_str_t2,
                int_str_v2, l_blk_boundary_rng, opt_gen_p7, opt_gen_full,
                opt_load_t2, opt_load_v2, idx_kernel)
            opt_gen_full = -1  # 1 or -1
            tc_code_kernel.tc_gen_code_Kernel(
                f, kernel_name + "_full", l_t3_d_decl_var, l_t2_d_decl_var,
                l_v2_d_decl_var, an_inner_group[7], an_inner_group[1],
                an_inner_group[2], an_inner_group[4], an_inner_group[5],
                an_inner_group[8], size_sm_a, size_sm_b, size_sm_p7,
                size_reg_y, size_reg_x, size_tb_y, size_tb_x, int_str_t2,
                int_str_v2, l_blk_boundary_rng, opt_gen_p7, opt_gen_full,
                opt_load_t2, opt_load_v2, idx_kernel)
        #
        idx_kernel = idx_kernel + 1
    #
    # Code: Function to Call Kernels.
    #
    kernel_name = "kernel_ccsdT"
    tc_code_etc.tc_gen_code_fusedKernels(
        f, kernel_name, l_combined_t3_parameters, l_combined_t2_parameters,
        l_combined_v2_parameters, l_combined_t3_parameters_nf,
        l_combined_t2_parameters_nf, l_combined_v2_parameters_nf,
        l_combined_t3_parameters_f, l_combined_t2_parameters_f,
        l_combined_v2_parameters_f, l_combined_opt_diffs, len(inner_groups))
    #
    # Code: Function for Correctness Check.
    # (Uses an_inner_group left over from the loop above, i.e. the LAST
    # inner group.)
    #
    tc_post_Correctness.tc_gen_code_Post_Correctness(f, an_inner_group[4],
                                                     an_inner_group[5],
                                                     l_combined_input_tensors)
    #
    # Code: Delete Device Memory Allocated Dynamically.
    #
    tc_post_HostDevice_Free.tc_gen_code_post_CUDA_Free(f, l_cuda_malloc)
    #
    # Code: Delete Host Memory Allocated Dynamically.
    #
    tc_post_HostDevice_Free.tc_gen_code_post_HostFree(f, l_host_dynamic)
    #
    # Code: Main Function
    #
    f.write("// # of Inner-Groups: " + str(len(inner_groups)) + "\n")
    tc_code_etc.tc_gen_code_main(f, len(inner_groups))
    # FILE CLOSE
    f.close()
def tc_gen_code_new(tmp_count, str_tmp_count, str_tmp_config, l_inner_groups, l_interface_info, opt_pre_computed, opt_data_type): # # FILE: OPEN # output_name = "temp" if str_tmp_config == "-1": f = open(output_name + "__" + str_tmp_count + ".cu", "w") else: f = open( output_name + "__" + str_tmp_count + "__" + str_tmp_config + ".cu", "w") # # Includes and Globel Methods # tc_code_include.tc_code_include(f) #tc_code_etc.tc_gen_global_methods(f, len(l_inner_groups)) # # should be changed. # interface_name = "sd_t_d2_fusion" kernel_name = "kernel_" # l_combined_opt_diffs = list() l_combined_opt_gen_fulls = list() l_combined_opt_gen_internal = list() l_combined_t3_slices_size = list() l_combined_mappings = list() l_combined_t3_d_decl_var = list() l_combined_t2_d_decl_var = list() l_combined_v2_d_decl_var = list() l_combined_t3_parameters = list() l_combined_t2_parameters = list() l_combined_v2_parameters = list() l_combined_device_dynamic = list() l_combined_host_dynamic = list() l_combined_cuda_malloc = list() # # To Support Multiple Inner-Groups # for each_inner_group in l_inner_groups: l_combined_t3_slices_size.append(each_inner_group[8]) l_combined_mappings.append([each_inner_group[1], each_inner_group[2]]) # # Inputs: T3-Slices, T3-Mappings, External Index, Internal Index # tc_code_define.tc_gen_definition_new(f, l_combined_t3_slices_size, l_combined_mappings, l_inner_groups[0][4], l_inner_groups[0][5]) # # Each Inner-Group Corresponds to A Kernel (There are Three Types per Kernel) # Type #1: External (Full) & Internal (Full) # Type #2: External (Full) & Internal (Partial) # Type #3: External (Partial) & Internal (Full) # Type #4: External (Partial) & Internal (Partial) # l_var_outputs = list() l_var_input_internal = list() l_combined_register_mappings = list() l_combined_var_input_left = list() l_combined_var_input_right = list() l_combined_var_outputs_helpers = list() l_combined_var_thread_block = list() l_combined_t3_d_decl_var = list() 
l_combined_t2_d_decl_var = list() l_combined_v2_d_decl_var = list() l_combined_t3_parameters = list() l_combined_t2_parameters = list() l_combined_v2_parameters = list() l_combined_inputs_int_strides = list() l_cuda_malloc = list() l_device_dynamic = list() l_host_dynamic = list() # # To Handle Multiple Tensor Contractions in an Inner-Group # kernel_number = 1 for each_inner_group in l_inner_groups: # ''' idx_count = 0 for each_info in each_inner_group: print ("> ", idx_count, ": ", each_info) idx_count += 1 ''' # l_var_tensor_block = list() l_var_input_left = list() l_var_input_right = list() l_var_outputs_helpers = list() l_t3_d_decl_var = list() l_t2_d_decl_var = list() l_v2_d_decl_var = list() l_t3_parameters = list() l_t2_parameters = list() l_v2_parameters = list() l_input_strides = list() # Inputs: kernel_number, each_inner_group[6](l_input_tensors), each_inner_group[4](l_extenral_index), each_inner_group[5](l_internal_index) # Outputs: l_t3_d_decl_var, l_t3_parameters, l_t2_d_decl_var, l_t2_parameters, l_v2_d_decl_var, l_v2_parameters, # l_cuda_malloc, l_device_dynami,c # l_var_tensor_block, l_var_outputs, l_var_outputs_helpers, l_var_input_left, l_var_input_right, l_var_input_internal tc_code_globalvar.tc_gen_variables( kernel_number, l_interface_info, each_inner_group[6], each_inner_group[4], each_inner_group[5], l_t3_d_decl_var, l_t3_parameters, l_t2_d_decl_var, l_t2_parameters, l_v2_d_decl_var, l_v2_parameters, l_input_strides, l_cuda_malloc, l_device_dynamic, l_var_tensor_block, l_var_outputs, l_var_outputs_helpers, l_var_input_left, l_var_input_right, l_var_input_internal, opt_data_type) # # Variables are Used in Functions for Pre-Computed Arrays and In-Direction Arrays. (Finally, Kernels) # : We need to differentiate them to be used in these functions. 
# # # Tile-Appoach: # Inputs: kernel_number, l_interface_info, each_inner_group[4](l_external_index), each_inner_group[8](l_t3_slices), each_inner_group[5](l_internal_index), each_inner_group[3](l_idx_size) # Output: l_host_dynamic # if opt_pre_computed != -1: tc_pre_BasicBlock.tc_gen_code_pre_TileApproach( f, kernel_number, l_interface_info, each_inner_group[4], each_inner_group[8], each_inner_group[3], each_inner_group[5], l_host_dynamic) # # Pre-Compuated Arrays and In-Direct Arrays # tc_pre_IndirectArray.tc_gen_code_driver_PreComputedArray( f, kernel_number, l_interface_info, l_var_outputs_helpers, l_var_input_left, l_var_input_right, l_var_tensor_block, each_inner_group[6], l_host_dynamic, each_inner_group[4], each_inner_group[4], each_inner_group[2], each_inner_group[0]) # Related to Kernel(s) size_smem_left, size_smem_right, str_left, str_right = tc_interface.tc_interface_SMEM_Size( each_inner_group[6], each_inner_group[4], each_inner_group[5], each_inner_group[8], each_inner_group[2]) # size_TB_X, size_TB_Y = tc_interface.tc_interface_TB_Size( each_inner_group[1][0], each_inner_group[1][1], each_inner_group[8]) size_smem_internal = tc_helper.tc_gen_helper_CheckingIntUnit( each_inner_group[4], each_inner_group[8], each_inner_group[5]) size_REG_X = tc_helper.tc_gen_helper_find( each_inner_group[8], each_inner_group[2][0]) # 0 -> REG_X size_REG_Y = tc_helper.tc_gen_helper_find( each_inner_group[8], each_inner_group[2][1]) # 1 -> REG_Y opt_load_t2, opt_load_v2 = tc_helper.tc_gen_helper_CheckingInternalFVI( each_inner_group[6], each_inner_group[5]) #print ("size_smem_internal: ", size_smem_internal) # # Constratins:"" # tc_gen.tc_gen_Constraints(f, size_TB_X, size_TB_Y, size_smem_left, size_smem_right, size_smem_internal) # opt_shared_padding = 0 # # Kernels: Different Types (External, Internal) # (1) Full, Full # (2) Full, Partial # (3) Partial, Full # (4) Partial, Partial # tc_code_kernel.tc_gen_code_Kernel( f, kernel_name + "_1_" + str(kernel_number), 
l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, l_input_strides, each_inner_group[7], each_inner_group[1], each_inner_group[2], each_inner_group[4], each_inner_group[5], each_inner_group[8], size_smem_left, size_smem_right, size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y, size_TB_X, str_left, str_right, l_blk_boundary_rng, -1, -1, opt_load_t2, opt_load_v2, opt_pre_computed, 1, opt_data_type, opt_shared_padding, kernel_number) tc_code_kernel.tc_gen_code_Kernel( f, kernel_name + "_2_" + str(kernel_number), l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, l_input_strides, each_inner_group[7], each_inner_group[1], each_inner_group[2], each_inner_group[4], each_inner_group[5], each_inner_group[8], size_smem_left, size_smem_right, size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y, size_TB_X, str_left, str_right, l_blk_boundary_rng, 1, -1, opt_load_t2, opt_load_v2, opt_pre_computed, 1, opt_data_type, opt_shared_padding, kernel_number) tc_code_kernel.tc_gen_code_Kernel( f, kernel_name + "_3_" + str(kernel_number), l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, l_input_strides, each_inner_group[7], each_inner_group[1], each_inner_group[2], each_inner_group[4], each_inner_group[5], each_inner_group[8], size_smem_left, size_smem_right, size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y, size_TB_X, str_left, str_right, l_blk_boundary_rng, -1, 1, opt_load_t2, opt_load_v2, opt_pre_computed, 1, opt_data_type, opt_shared_padding, kernel_number) tc_code_kernel.tc_gen_code_Kernel( f, kernel_name + "_4_" + str(kernel_number), l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, l_input_strides, each_inner_group[7], each_inner_group[1], each_inner_group[2], each_inner_group[4], each_inner_group[5], each_inner_group[8], size_smem_left, size_smem_right, size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y, size_TB_X, str_left, str_right, l_blk_boundary_rng, 1, 1, opt_load_t2, opt_load_v2, opt_pre_computed, 1, opt_data_type, opt_shared_padding, kernel_number) # # 
multiple internal indices. # if len(each_inner_group[5]) > 1: tc_code_kernel.tc_gen_code_Kernel( f, kernel_name + "_1_tex_" + str(kernel_number), l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, l_input_strides, each_inner_group[7], each_inner_group[1], each_inner_group[2], each_inner_group[4], each_inner_group[5], each_inner_group[8], size_smem_left, size_smem_right, size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y, size_TB_X, str_left, str_right, l_blk_boundary_rng, -1, -1, opt_load_t2, opt_load_v2, opt_pre_computed, 2, opt_data_type, opt_shared_padding, kernel_number) tc_code_kernel.tc_gen_code_Kernel( f, kernel_name + "_2_tex_" + str(kernel_number), l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, l_input_strides, each_inner_group[7], each_inner_group[1], each_inner_group[2], each_inner_group[4], each_inner_group[5], each_inner_group[8], size_smem_left, size_smem_right, size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y, size_TB_X, str_left, str_right, l_blk_boundary_rng, 1, -1, opt_load_t2, opt_load_v2, opt_pre_computed, 2, opt_data_type, opt_shared_padding, kernel_number) tc_code_kernel.tc_gen_code_Kernel( f, kernel_name + "_3_tex_" + str(kernel_number), l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, l_input_strides, each_inner_group[7], each_inner_group[1], each_inner_group[2], each_inner_group[4], each_inner_group[5], each_inner_group[8], size_smem_left, size_smem_right, size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y, size_TB_X, str_left, str_right, l_blk_boundary_rng, -1, 1, opt_load_t2, opt_load_v2, opt_pre_computed, 2, opt_data_type, opt_shared_padding, kernel_number) tc_code_kernel.tc_gen_code_Kernel( f, kernel_name + "_4_tex_" + str(kernel_number), l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, l_input_strides, each_inner_group[7], each_inner_group[1], each_inner_group[2], each_inner_group[4], each_inner_group[5], each_inner_group[8], size_smem_left, size_smem_right, size_smem_internal, size_REG_Y, size_REG_X, size_TB_Y, 
size_TB_X, str_left, str_right, l_blk_boundary_rng, 1, 1, opt_load_t2, opt_load_v2, opt_pre_computed, 2, opt_data_type, opt_shared_padding, kernel_number) # # For the Interface # l_combined_var_input_left.append(l_var_input_left) l_combined_var_input_right.append(l_var_input_right) l_combined_var_outputs_helpers.append(l_var_outputs_helpers) l_combined_var_thread_block.append(l_var_tensor_block) l_combined_t3_d_decl_var.append(l_t3_d_decl_var) l_combined_t2_d_decl_var.append(l_t2_d_decl_var) l_combined_v2_d_decl_var.append(l_v2_d_decl_var) l_combined_t3_parameters.append(l_t3_parameters) l_combined_t2_parameters.append(l_t2_parameters) l_combined_v2_parameters.append(l_v2_parameters) l_combined_register_mappings.append(each_inner_group[2]) l_combined_inputs_int_strides.append(l_input_strides) # kernel_number = kernel_number + 1 # # End of For-Statement: l_innter_groups # # # Parts of Kernels for Register Transpose # (Currently) It assumes that all inner groups can be grouped. # if tc_gen.tc_gen_Check_RegisterTranspose(l_inner_groups) == 1: print( "[Code Generator][Code] Register Transpose [Possible] according to the given mappings" ) f.write("\n") f.write( "// This part is for Kernels which support Register Transpose\n") # # This is also 4-different types. 
# tc_code_kernel_fusion.tc_gen_code_Kernel_Register_Transpose( f, kernel_name + "_4", l_inner_groups, l_combined_t3_d_decl_var, l_combined_t2_d_decl_var, l_combined_v2_d_decl_var, l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, 1, 1) tc_code_kernel_fusion.tc_gen_code_Kernel_Register_Transpose( f, kernel_name + "_3", l_inner_groups, l_combined_t3_d_decl_var, l_combined_t2_d_decl_var, l_combined_v2_d_decl_var, l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, 1, -1) tc_code_kernel_fusion.tc_gen_code_Kernel_Register_Transpose( f, kernel_name + "_2", l_inner_groups, l_combined_t3_d_decl_var, l_combined_t2_d_decl_var, l_combined_v2_d_decl_var, l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, -1, 1) tc_code_kernel_fusion.tc_gen_code_Kernel_Register_Transpose( f, kernel_name + "_1", l_inner_groups, l_combined_t3_d_decl_var, l_combined_t2_d_decl_var, l_combined_v2_d_decl_var, l_t3_d_decl_var, l_t2_d_decl_var, l_v2_d_decl_var, -1, -1) else: print( "[Code Generator][Code] Register Transpose [Impossible] according to the given mappings" ) # # Temp...... For |K| > 1, # l_internal_addrs = list() for each_inner_group in l_inner_groups: # # Each Inner-Group can have multiple Tensor Contractions which will be Fused. # Among Inner-Groups, there might be possible to group them by using Register-Transposition. 
# print("each_inner_group: ", each_inner_group) l_tensor_contraction_in_inner_group = each_inner_group[6] # To-Do: Need To Suport Multiple-Tensor Contractions for each_tensor_contraction in l_tensor_contraction_in_inner_group: info_input_left = each_tensor_contraction[0] info_input_right = each_tensor_contraction[1] # LEFT l_idx_info_left = list() for idx_tensor in info_input_left[1]: # if tc_helper.tc_gen_helper_find_1d(l_inner_groups[0][5], idx_tensor) != -1: l_idx_info_left.append([1, idx_tensor]) else: l_idx_info_left.append([0, idx_tensor]) # l_rev_idx_info_left = list(reversed(l_idx_info_left)) str_addr_left = "" idx_count = 0 for each_idx_info in l_rev_idx_info_left: # if each_idx_info[0] == 1: # internal index if idx_count == 0: str_addr_left = "idx_" + each_idx_info[1] else: str_addr_left = "idx_" + each_idx_info[ 1] + " + (" + str_addr_left + ") * size_" + each_idx_info[ 1] # idx_count = idx_count + 1 else: # external index if idx_count != 0: str_addr_left = "(" + str_addr_left + ") * size_" + each_idx_info[ 1] # RIGHT str_addr_right = "" l_idx_info_right = list() for idx_tensor in info_input_right[1]: # if tc_helper.tc_gen_helper_find_1d(l_inner_groups[0][5], idx_tensor) != -1: l_idx_info_right.append([1, idx_tensor]) else: l_idx_info_right.append([0, idx_tensor]) # l_rev_idx_info_right = list(reversed(l_idx_info_right)) str_addr_right = "" idx_count = 0 for each_idx_info in l_rev_idx_info_right: # if each_idx_info[0] == 1: # internal index if idx_count == 0: str_addr_right = "idx_" + each_idx_info[1] else: str_addr_right = "idx_" + each_idx_info[ 1] + " + (" + str_addr_right + ") * size_" + each_idx_info[ 1] # idx_count = idx_count + 1 else: # external index if idx_count != 0: str_addr_right = "(" + str_addr_right + ") * size_" + each_idx_info[ 1] # l_internal_addrs.append([str_addr_left, str_addr_right]) # # # Drivers # tc_interface.tc_gen_code_kernel_caller( f, interface_name, kernel_name, l_interface_info, l_inner_groups[0][4], l_inner_groups[0][5], 
l_inner_groups[0][10], l_var_tensor_block, l_var_outputs, l_var_outputs_helpers, l_var_input_left, l_var_input_right, l_var_input_internal, l_combined_var_input_left, l_combined_var_input_right, l_combined_var_outputs_helpers, l_combined_var_thread_block, l_combined_register_mappings, l_internal_addrs, ## ADDEDEDEDE l_combined_inputs_int_strides, ## l_cuda_malloc, l_device_dynamic, l_host_dynamic, l_t3_parameters, l_t2_parameters, l_v2_parameters, l_combined_t3_parameters, l_combined_t2_parameters, l_combined_v2_parameters, opt_pre_computed, opt_data_type) # ''' idx_inner_count = 0 for each_inner in l_inner_groups: idx_count = 0 for each_info in each_inner: print ("[", idx_inner_count, "][", idx_count, "] each_info: ", each_info) idx_count = idx_count + 1 idx_inner_count = idx_inner_count + 1 ''' # l_tile_sizes = l_inner_groups[0][8] l_split_representative_problem_size = l_inner_groups[0][9] #l_split_representative_problem_size = l_inner_groups[0][9] #print ("l_split_representative_problem_size: ", l_split_representative_problem_size) #print ("l_tile_sizes: ", l_tile_sizes) # # "Interface" # tc_interface.tc_gen_code_interface(f, interface_name, l_interface_info, l_tile_sizes, l_split_representative_problem_size, opt_data_type) # # FILE: CLOSE # f.close()
def tc_gen_perms_exclusive_TB_Y(list_sizes_TB, list_given_output_tensor, list_given_input_tensor_left, list_given_input_tensor_right, list_internal_indices, list_representative_problem_size, list_TB_K, list_TB_X, list_REG_X, list_REG_Y, list_inherited_Tile_Sizes, list_CLASS_configuration, opt_print):
    """Enumerate TB_Y mappings and append completed configurations.

    For each candidate thread-block-Y size in ``list_sizes_TB``, walks the
    indices of the right input tensor (skipping internal indices and indices
    already mapped on REG_X/REG_Y), greedily accumulating their representative
    extents until the requested |TB_Y| volume is covered.  Consumed indices
    become ``list_TB_Y`` and receive blocking tile sizes in a deep copy of
    ``list_inherited_Tile_Sizes``.  A mapping that fully covers |TB_Y|, passes
    the shared-memory capacity check, and has balanced left/right shared-memory
    volumes is packaged as a ``tc_gen_permutations.Configuration`` and appended
    to ``list_CLASS_configuration`` (mutated in place; nothing is returned).

    ``opt_print == 1`` enables verbose tracing.
    """
    #
    # Count external vs. internal (contraction) indices of the right tensor;
    # used only for the trace output below.
    #
    num_ext_idx = 0
    num_int_idx = 0
    for each_right_idx in list_given_input_tensor_right:
        if tc_helper.tc_gen_helper_find_1d(list_internal_indices, each_right_idx) == -1:
            num_ext_idx += 1
        else:
            num_int_idx += 1
    #
    len_tensor_right = len(list_given_input_tensor_right)
    if opt_print == 1:
        print ("========================================== [Enumerations-TB_Y] ===================================================")
        print ("========================================== [Exclusive] [START] ===================================================")
        print ("Tensor (LEFT): ", list_given_input_tensor_right)
        print ("len(LEFT): ", len_tensor_right, ", # of External Indices: ", num_ext_idx, ", # of Internal Indices: ", num_int_idx)
        print ("list_representative_problem_size: ", list_representative_problem_size)
        print ("Given Tile-Sizes: ", list_inherited_Tile_Sizes)
        print ("Given REG_X: ", list_REG_X)
        print ("Given REG_Y: ", list_REG_Y)
        print ("Given TB_X: ", list_TB_X)
        print ("========================================== [Exclusive] [END] ===================================================")
    #
    # Try every candidate |TB_Y| size.
    #
    for size_TB_Y in list_sizes_TB:
        if opt_print == 1:
            print ("|TB_Y| = ", size_TB_Y)
        #
        # Assumption: This Input Tensor does not have the FVI in the Output Tensor.
        #
        TB_Y_Vol = 1            # accumulated volume including the current index
        TB_Y_Vol_Prev = 1       # accumulated volume before the current index
        done_mapping_TB_Y = -1  # becomes 1 once |TB_Y| is fully covered
        list_TB_Y = []
        duplicated_Tile_Sizes = copy.deepcopy(list_inherited_Tile_Sizes)
        #
        # Greedily map the right tensor's external indices onto TB_Y.
        #
        for each_right_idx in list_given_input_tensor_right:
            #
            # #1. Internal Index: not mappable on TB_Y.
            #
            if tc_helper.tc_gen_helper_find_1d(list_internal_indices, each_right_idx) != -1:
                continue
            #
            # #2. (Just In Case) Indices Mapped on REG_X
            #
            if tc_helper.tc_gen_helper_find_1d(list_REG_X, each_right_idx) != -1:
                continue
            #
            # #3. Indices Mapped on REG_Y
            #
            if tc_helper.tc_gen_helper_find_1d(list_REG_Y, each_right_idx) != -1:
                continue
            #
            # |TB_Y'|: the accumulated volume if this index is included.
            #
            TB_Y_Vol *= tc_helper.tc_gen_helper_find(list_representative_problem_size, each_right_idx)
            #
            # |TB_Y'| >= |TB_Y|
            #
            if TB_Y_Vol >= size_TB_Y:
                #
                # |TB_Y'| > |TB_Y|: this index must be split (blocked).
                #
                if TB_Y_Vol > size_TB_Y:
                    if done_mapping_TB_Y == -1:
                        # Only the remaining factor of |TB_Y| is taken from
                        # this index; the rest stays outside the thread block.
                        blocking_tile_size = size_TB_Y / TB_Y_Vol_Prev
                        list_TB_Y.append(each_right_idx)
                        duplicated_Tile_Sizes.append([each_right_idx, int(blocking_tile_size)])
                        done_mapping_TB_Y = 1
                    else:
                        # TB_Y already full: map the index with tile size 1.
                        list_TB_Y.append(each_right_idx)
                        duplicated_Tile_Sizes.append([each_right_idx, 1])
                #
                # |TB_Y'| == |TB_Y|: the index fits exactly.
                #
                else:
                    if done_mapping_TB_Y == -1:
                        # NOTE(review): unlike the REG_X variant, this exact-fit
                        # branch never sets done_mapping_TB_Y = 1, so an exact
                        # match alone cannot yield a configuration -- confirm
                        # whether this is intentional.
                        list_TB_Y.append(each_right_idx)
                        duplicated_Tile_Sizes.append([each_right_idx, tc_helper.tc_gen_helper_find(list_representative_problem_size, each_right_idx)])
                    else:
                        list_TB_Y.append(each_right_idx)
                        duplicated_Tile_Sizes.append([each_right_idx, 1])
            #
            # |TB_Y'| < |TB_Y|: take the whole index and keep accumulating.
            #
            else:
                list_TB_Y.append(each_right_idx)
                duplicated_Tile_Sizes.append([each_right_idx, tc_helper.tc_gen_helper_find(list_representative_problem_size, each_right_idx)])
            #
            # Advance the "volume before the current index" tracker.
            #
            TB_Y_Vol_Prev *= tc_helper.tc_gen_helper_find(list_representative_problem_size, each_right_idx)
        #
        #print ("list_TB_X: ", list_TB_X)
        #print ("list_TB_Y: ", list_TB_Y)
        #print ("Tile-Sizes: ", duplicated_Tile_Sizes)
        #
        # Configuration: only emitted when TB_Y was fully mapped.
        #
        if done_mapping_TB_Y == 1:
            #
            # #1. Shared-Memory |SMEM_L| = |SMEM_R|
            # Element counts are products of the tile sizes of each input
            # tensor's indices.
            #
            size_SMEM_Left = 1
            size_SMEM_Right = 1
            #
            for each_idx in list_given_input_tensor_left:
                if tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx) != -1:
                    size_SMEM_Left *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)
            for each_idx in list_given_input_tensor_right:
                if tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx) != -1:
                    size_SMEM_Right *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)
            #
            # #1. H/W Constraint---- Shared Memory
            # (the factor 16 presumably accounts for the internal tile depth --
            # TODO confirm against the kernel generator.)
            #
            if (size_SMEM_Left * 16) + (size_SMEM_Right * 16) > 4096:
                continue
            #
            # Require balanced shared-memory buffers.
            #
            if size_SMEM_Left == size_SMEM_Right:
                tmp_config = tc_gen_permutations.Configuration()
                tmp_config.add_tensor_C(list_given_output_tensor)
                tmp_config.add_tensor_A(list_given_input_tensor_left)
                tmp_config.add_tensor_B(list_given_input_tensor_right)
                tmp_config.add_REG_X(list_REG_X)
                tmp_config.add_REG_Y(list_REG_Y)
                tmp_config.add_TB_X(list_TB_X)
                tmp_config.add_TB_Y(list_TB_Y)
                tmp_config.add_TB_K(list_TB_K)
                #
                # [To-Do] Need to Make it Automatically
                # (hard-coded internal-index tile below; problem-specific.)
                #
                #duplicated_Tile_Sizes.append(["e", 16])
                #duplicated_Tile_Sizes.append(["f", 1])
                #duplicated_Tile_Sizes.append(["g", 16])
                #duplicated_Tile_Sizes.append(["d", 16]) # for 15
                duplicated_Tile_Sizes.append(["f", 16])
                #
                #duplicated_Tile_Sizes.append(["e", 16]) # 3
                tmp_config.add_tile_size(duplicated_Tile_Sizes)
                tmp_config.add_representative_problem_size(list_representative_problem_size)
                #
                # Scalar resource sizes: products of the tile sizes of the
                # indices mapped onto each resource.
                #
                tmp_config.size_REG_X = 1
                tmp_config.size_REG_Y = 1
                tmp_config.size_TB_X = 1
                tmp_config.size_TB_Y = 1
                tmp_config.size_TB_K = 1
                for each_idx in list_REG_X:
                    tmp_config.size_REG_X *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)
                for each_idx in list_REG_Y:
                    tmp_config.size_REG_Y *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)
                for each_idx in list_TB_X:
                    tmp_config.size_TB_X *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)
                for each_idx in list_TB_Y:
                    tmp_config.size_TB_Y *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)
                for each_idx in list_TB_K:
                    tmp_config.size_TB_K *= tc_helper.tc_gen_helper_find(duplicated_Tile_Sizes, each_idx)
                '''
                print (">>>> list_TB_X: ", tmp_config.list_TB_X)
                print (">>>> list_TB_Y: ", tmp_config.list_TB_Y)
                print (">>>> list_TB_K: ", tmp_config.list_TB_K)
                print (">>>> list_REG_X: ", tmp_config.list_REG_X)
                print (">>>> list_REG_Y: ", tmp_config.list_REG_Y)
                print (">>>> list_Tile_Sizes: ", tmp_config.list_tile_sizes)
                print (">>>> # of Elements in SMEM_L: ", size_SMEM_Left * 16)
                print (">>>> # of Elements in SMEM_R: ", size_SMEM_Right * 16)
                '''
                #
                list_CLASS_configuration.append(tmp_config)
def model_predictive_modeling(list_configurations):
    """Annotate each configuration with estimated GEMM extents and register use.

    For every configuration in ``list_configurations`` this derives, from the
    representative problem size:
      * ``m`` -- product of the extents of tensor-A indices not mapped on TB_K,
      * ``n`` -- the analogous product for tensor-B indices,
      * ``k`` -- product of the extents of the TB_K indices,
      * ``num_Estimated_Registers`` -- 2 * (REG_X tile volume) * (REG_Y tile
        volume).

    The configurations are mutated in place.  Returns the constant 1000
    (placeholder score; the kernel-efficiency part of the model is not yet
    implemented).
    """
    #
    # architectures: p100 (sm_60), v100 (sm_70)
    #
    cuda_arch = "sm_70"  # NOTE(review): currently unused -- kept for parity

    def _volume(indices, size_table):
        # Product of the table entries for the given indices.
        vol = 1
        for idx in indices:
            vol *= tc_helper.tc_gen_helper_find(size_table, idx)
        return vol

    #
    # Model outline:
    # 0. initialize (indices, register tiles)
    # ----- main-loop -----
    # 1. load inputs (A, B)   2. compute   3. store output
    #
    print(
        "=========[model_predictive_modeling]======================================================================="
    )
    for cfg in list_configurations:
        #
        # (1) m, n, k from the representative problem size (based on the
        #     given equation, not a particular configuration).
        #
        sizes = cfg.list_representative_problem_size
        ext_a = [idx for idx in cfg.list_tensor_A
                 if tc_helper.tc_gen_helper_find_1d(cfg.list_TB_K, idx) == -1]
        cfg.m = _volume(ext_a, sizes)
        #
        ext_b = [idx for idx in cfg.list_tensor_B
                 if tc_helper.tc_gen_helper_find_1d(cfg.list_TB_K, idx) == -1]
        cfg.n = _volume(ext_b, sizes)
        #
        cfg.k = _volume(cfg.list_TB_K, sizes)
        #
        # (2) # of Thread Blocks (calculated in cost-model)
        #
        # (3) Estimated number of registers in a thread block.
        #     Empirical notes from the original author (tccg examples):
        #       4x4: 16 -> 32;  4x1 + 1x1 = 5 -> 10  : 42 ~ 29, 28, 32, 29
        #       6x6: 36 -> 72;  6x1 + 1x1 = 7 -> 14  : 86 ~ 42, 35, 40, 34
        #       4x8: 32 -> 64;  8x1 + 1x1 = 9 -> 18  : 82 ~ 38, 40, 30, 31
        #
        num_base = 20  # baseline register count (currently unused)
        reg_x_vol = _volume(cfg.list_REG_X, cfg.list_tile_sizes)
        reg_y_vol = _volume(cfg.list_REG_Y, cfg.list_tile_sizes)
        cfg.num_Estimated_Registers = reg_x_vol * reg_y_vol * 2
        #
        # (4) Kernel Efficiencies -- not implemented yet.
        #
    print(
        "=========[model_predictive_modeling]======================================================================="
    )
    return 1000
def tc_gen_perms_exclusive_REG_X(list_sizes_REG, list_sizes_TB, list_given_output_tensor, list_given_input_tensor_left, list_given_input_tensor_right, list_internal_indices, list_representative_problem_size, list_TB_K, list_TB_X, list_CLASS_configuration, opt_print):
    """Enumerate REG_X mappings and recurse into the REG_Y enumeration.

    For each candidate register-tile size in ``list_sizes_REG`` and each
    starting position in the left input tensor, greedily accumulates the
    extents of eligible indices (skipping internal indices and the output
    tensor's fastest-varying index) until the requested |REG_X| volume is
    covered, possibly splitting the last index with a blocking tile size.
    Every successful mapping is handed to ``tc_gen_perms_exclusive_REG_Y``
    together with the tile sizes chosen so far; results accumulate in
    ``list_CLASS_configuration`` (mutated in place; nothing is returned).

    ``opt_print == 1`` enables verbose tracing.
    """
    #
    # Count external vs. internal (contraction) indices of the left tensor;
    # used only for the trace output below.
    #
    num_ext_idx = 0
    num_int_idx = 0
    for each_left_idx in list_given_input_tensor_left:
        if tc_helper.tc_gen_helper_find_1d(list_internal_indices, each_left_idx) == -1:
            num_ext_idx += 1
        else:
            num_int_idx += 1
    #
    len_tensor_left = len(list_given_input_tensor_left)
    if opt_print == 1:
        print ("========================================== [Enumerations-REG_X] ===================================================")
        print ("========================================== [Exclusive] ===================================================")
        print ("Tensor (LEFT): ", list_given_input_tensor_left)
        print ("len(LEFT): ", len_tensor_left, ", # of External Indices: ", num_ext_idx, ", # of Internal Indices: ", num_int_idx)
        print ("list_representative_problem_size: ", list_representative_problem_size)
    #
    # For Each Tile-Size for REG_X
    #
    for size_REG_X in list_sizes_REG:
        if opt_print == 1:
            print ("|REG_X| = ", size_REG_X)
        #
        # Try every starting position within the left tensor's index list.
        #
        for start_index in range(0, len_tensor_left):
            #
            REG_X_Vol = 1               # accumulated volume including the current index
            REG_X_Vol_Prev = 1          # accumulated volume before the current index
            list_REG_X = []             # will be inherited
            list_Tile_Sizes = []        # will be inherited
            done_mapping_REG_X = -1     # not done
            #
            # Greedily consume indices from start_index onward.
            #
            for target_index in range(start_index, len_tensor_left):
                str_start_index = list_given_input_tensor_left[target_index]
                if opt_print == 1:
                    print ("idx: ", str_start_index)
                #
                # #1. Internal Index: not mappable on REG_X.
                #
                if tc_helper.tc_gen_helper_find_1d(list_internal_indices, str_start_index) != -1:
                    continue
                #
                # #2. The FVI in the Output Tensor
                #
                if str_start_index == list_given_output_tensor[0]:
                    continue
                if opt_print == 1:
                    print (">> idx: ", str_start_index)
                #
                # |REG_X'|: the accumulated volume if this index is included.
                #
                REG_X_Vol *= tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)
                if opt_print == 1:
                    print (">> idx: ", str_start_index, ", REG_X_Vol: ", REG_X_Vol)
                #
                # |REG_X'| >= |REG_X|: the target volume is reached.
                #
                if REG_X_Vol >= size_REG_X:
                    #
                    # |REG_X'| > |REG_X|
                    #
                    if REG_X_Vol > size_REG_X:
                        #
                        # Need to Split (REG and BX)
                        #
                        if done_mapping_REG_X == -1:
                            blocking_tile_size = size_REG_X / REG_X_Vol_Prev
                            list_REG_X.append(str_start_index)
                            list_Tile_Sizes.append([str_start_index, int(blocking_tile_size)])
                            done_mapping_REG_X = 1
                        else:
                            list_Tile_Sizes.append([str_start_index, 1])
                    #
                    # |REG_X'| = |REG_X|: the index fits exactly.
                    #
                    else:
                        if done_mapping_REG_X == -1:
                            list_REG_X.append(str_start_index)
                            list_Tile_Sizes.append([str_start_index, tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)])
                            done_mapping_REG_X = 1
                        else:
                            list_Tile_Sizes.append([str_start_index, 1])
                    #
                    # REG_X is covered; stop consuming indices.
                    #
                    break
                else:
                    # |REG_X'| < |REG_X|: take the whole index, keep going.
                    list_REG_X.append(str_start_index)
                    list_Tile_Sizes.append([str_start_index, tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)])
                #
                # Advance the "volume before the current index" tracker.
                #
                REG_X_Vol_Prev *= tc_helper.tc_gen_helper_find(list_representative_problem_size, str_start_index)
            #
            # Only recurse when REG_X was fully mapped for this start index.
            #
            if done_mapping_REG_X == 1:
                if opt_print == 1:
                    print ("list_REG_X: ", list_REG_X)
                    print ("list_Tile_sizes: ", list_Tile_Sizes)
                tc_gen_perms_exclusive_REG_Y(list_sizes_REG, list_sizes_TB, list_given_output_tensor, list_given_input_tensor_left, list_given_input_tensor_right, list_internal_indices, list_representative_problem_size, list_TB_K, list_TB_X, list_REG_X, list_Tile_Sizes, list_CLASS_configuration, opt_print)
def tc_gen_variables_input_right(kernel_number, each_input, l_external_idx,
                                 l_v2_d_decl_var, l_v2_parameters,
                                 l_cuda_malloc, l_device_dynamic,
                                 l_var_input_right, opt_data_type):
    """Register the right-hand input tensor's device/host bookkeeping entries.

    Builds the full-size (problem extents) and sliced-size (per-tile extents)
    volume expression strings for the tensor, then appends matching entries to
    the variable, declaration, malloc, parameter, and host<->device transfer
    lists.  All list arguments are mutated in place; nothing is returned.
    """
    # Right Input: device/host variable names derived from the tensor name.
    dev_name = "dev_" + each_input[1][0]
    host_name = "host_" + each_input[1][0]
    # Volume expressions, accumulated so the innermost index ends up last.
    full_size = ""
    slice_size = ""
    n_full = 0
    n_slice = 0
    for idx in each_input[1][1]:
        # Every index contributes to the full-tensor volume ...
        if n_full == 0:
            full_size = "size_" + idx
        else:
            full_size = "size_" + idx + " * " + full_size
        n_full = n_full + 1
        # ... but only external indices contribute to the sliced volume.
        if tc_helper.tc_gen_helper_find_1d(l_external_idx, idx) != -1:
            slice_term = "SIZE_SLICE_" + str(kernel_number) + "_" + idx.capitalize()
            if n_slice == 0:
                slice_size = slice_term
            else:
                slice_size = slice_term + " * " + slice_size
            n_slice = n_slice + 1
    #
    # Shared pieces: the element C type and the two auxiliary buffer sizes.
    #
    c_type = "double" if opt_data_type == "DOUBLE" else "float"
    addr_size = slice_size + " * num_thread_blocks_kernel_" + str(kernel_number)
    offset_size = ("SIZE_TB_" + str(kernel_number) + "_X * SIZE_TB_" +
                   str(kernel_number) + "_Y")
    #
    # Kernel-caller variable list.
    #
    l_var_input_right.append([c_type + "*", dev_name])
    l_var_input_right.append(["int*", dev_name + "_addr"])
    l_var_input_right.append(["int*", host_name + "_addr"])
    l_var_input_right.append(["int*", dev_name + "_offset"])
    l_var_input_right.append(["int*", host_name + "_offset"])
    #
    # Kernel declaration parameters.
    #
    l_v2_d_decl_var.append(c_type + "* " + dev_name)
    l_v2_d_decl_var.append("const int* __restrict__ " + dev_name + "_addr")
    l_v2_d_decl_var.append("const int* __restrict__ " + dev_name + "_offset")
    #
    # cudaMalloc requests: [name, type, size-expression].
    #
    l_cuda_malloc.append([dev_name, c_type, full_size])
    l_cuda_malloc.append([dev_name + "_addr", "int", addr_size])
    l_cuda_malloc.append([dev_name + "_offset", "int", offset_size])
    #
    # Kernel-call parameters: [name, type, size-expression].
    #
    l_v2_parameters.append([dev_name, c_type, full_size])
    l_v2_parameters.append([dev_name + "_addr", "int", addr_size])
    l_v2_parameters.append([dev_name + "_offset", "int", offset_size])
    #
    # Host-to-device transfers: [type, device-name, host-name, size-expression].
    #
    l_device_dynamic.append([c_type, dev_name, host_name, full_size])
    l_device_dynamic.append(["int", dev_name + "_addr", host_name + "_addr", addr_size])
    l_device_dynamic.append(["int", dev_name + "_offset", host_name + "_offset", offset_size])