def per_die_config_dse_multiAcc(sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R,
                                sub_conv_K, sub_conv_S, sub_flag):
    """Per-die DSE that searches one <Tm, Tn> configuration per layer.

    First pass: for every layer of every sub-net, allocate a share of the
    per-die DSP budget proportional to that layer's operation count, then
    let constrained_dse_layer() pick the best <Tm, Tn> pair under that
    budget.  Second pass: run constrained_dse() once per whole sub-net;
    its results are currently discarded (see NOTE below) — the trailing
    commented-out code suggests this function is work in progress.

    :param sub_conv_N/M/r/R/K/S: per-sub-net, per-layer model description lists
    :param sub_flag: per-sub-net, per-layer flags (presumably pooling —
        TODO confirm against model_extract())
    :return: (pair_list, lat_list, util_list) for the surviving entries
    """
    # NOTE(review): Python 2 integer division — presumably the DSP count of
    # one die out of three (6840 total DSPs); confirm against target device.
    DSP = 6840 / 3
    dsp_list = []
    pair_list = []
    lat_list = []
    util_list = []
    factor = 1
    opt_ratio = 0  # currently unused
    for i in range(0, len(sub_conv_N)):
        dsp_list.append([])
        # total operation count of sub-net i — denominator for DSP allocation
        sub_net_gop = gop_calculate(sub_conv_N[i], sub_conv_M[i],
                                    sub_conv_R[i], sub_conv_K[i])
        for j in range(0, len(sub_conv_N[i])):
            # allocate_dsp by layer gops
            dsp_list[i].append(
                DSP *
                (sub_conv_N[i][j] * sub_conv_M[i][j] * sub_conv_R[i][j] *
                 sub_conv_R[i][j] * sub_conv_K[i][j] * sub_conv_K[i][j]) /
                sub_net_gop)
            # do contrained dse for layer
            pair, cycle, cycle_per_layer = constrained_dse_layer(
                sub_conv_N[i][j], sub_conv_M[i][j], sub_conv_r[i][j],
                sub_conv_R[i][j], sub_conv_K[i][j], sub_conv_S[i][j],
                sub_flag[i][j], int(dsp_list[i][j]), int(37), factor)
            pair_list.append(pair)
            lat_list.append(cycle)
            # DSP utilization of the chosen <Tm, Tn> pair for this layer
            util_list.append(pair[0] * pair[1] / float(int(dsp_list[i][j])))
    print "dsp_list value: ", dsp_list, pair_list
    print "util_list value: ", util_list
    # note done best configuration
    for i in range(0, len(sub_conv_N)):
        # NOTE(review): pair/cycle are overwritten every iteration and never
        # returned — this pass looks unfinished; confirm intent.
        pair, cycle, cycle_per_layer = constrained_dse(
            sub_conv_N[i], sub_conv_M[i], sub_conv_r[i], sub_conv_R[i],
            sub_conv_K[i], sub_conv_S[i], sub_flag[i], int(DSP), int(37),
            factor)
        # drop the oldest len(sub_conv_N) entries (indent reconstructed from
        # a whitespace-mangled source — TODO confirm intended placement)
        if len(pair_list) > len(sub_conv_N):
            for remove_cnt in range(0, len(sub_conv_N)):
                pair_list.remove(pair_list[0])
                lat_list.remove(lat_list[0])
                util_list.remove(util_list[0])
    # # ratio_tmp = ((max(lat_list) - min(lat_list)) / float(min(lat_list)))
    # print "initial diff_ratio: ", ratio_tmp
    # # max_lat_index = lat_list.index(max(lat_list))
    # # find the max latency sub_net
    # for j in range(0, len(sub_conv_N[max_lat_index])):
    #     if len(sub_conv_N[max_lat_index]) >=4:
    #         max_acc_num = 4
    #     else:
    #         max_acc_num = len(sub_conv_N[max_lat_index])
    #     for acc_num in range(0, max_acc_num):
    #         #TODO: keep partitioning the sub_net and search the best number of acc and corresponding configuration
    return pair_list, lat_list, util_list
def run(self):
    """Worker entry point for the parallel global partition search.

    Enumerates all k-way layer partitions via partition_to_k() and handles
    only those whose index maps to this worker (idx % PROCESS_NUM ==
    self.processIdx).  For each handled partition, the model is split,
    local_search() finds per-sub-net configurations, and results that beat
    the latency threshold are collected (at most 6) and pushed onto
    self.result_Q as (pairs, items, gops, utils).

    Reads: self.processIdx, self.layer_list, self.acc_cluster_num,
    self.conv_* / self.flag model lists, self.overall_lat, self.result_Q.
    """
    start = time.time()
    process_gop_list = []
    process_item_list = []
    process_util_list = []
    process_pair_list = []
    search_counter = 0
    print("Process " + str(self.processIdx) + " starts global search.")
    for idx, item in enumerate(
            partition_to_k(self.layer_list, self.acc_cluster_num, False), 0):
        # round-robin work split across workers
        if idx % PROCESS_NUM == self.processIdx:
            sub_gop_list = []
            search_counter = search_counter + 1
            sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag \
                = model_split_by_list(self.conv_N, self.conv_M, self.conv_r,
                                      self.conv_R, self.conv_K, self.conv_S,
                                      self.flag, item)
            sub_pair_list, sub_lat_list, sub_util_list = \
                local_search(sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R,
                             sub_conv_K, sub_conv_S, sub_flag)
            for i in range(0, len(sub_conv_N)):
                sub_gop_list.append(
                    gop_calculate(sub_conv_N[i], sub_conv_M[i], sub_conv_R[i],
                                  sub_conv_K[i]))
            # keep this partition only if its slowest sub-net beats the
            # incoming latency threshold
            if max(sub_lat_list) < self.overall_lat:
                # NOTE(review): this binds a LOCAL overall_lat and never
                # updates self.overall_lat, so the acceptance threshold
                # stays fixed for the whole run — confirm that is intended.
                overall_lat = max(sub_lat_list)
                if len(process_pair_list) < 6:
                    process_item_list.append(item)
                    process_pair_list.append(sub_pair_list)
                    # process_pair_list.append([overall_lat])
                    process_util_list.append([overall_lat])
                    process_gop_list.append(sub_gop_list)
                    # process_util_list.append(sub_util_list)
                    # process_pair_list.append(sub_util_list)
                # else:
                #     max_among_mins = process_pair_list.index(max(overall_lat))
                #     process_pair_list.remove(process_pair_list[max_among_mins])
                #     process_pair_list.append(sub_pair_list)
                #     process_pair_list.append([overall_lat])
                #     process_pair_list.append(sub_util_list)
            # print "For set ID: " + str(idx) + ", the final explored points = ", search_counter
    # only report back when this worker actually found candidates
    if len(process_pair_list) != 0:
        self.result_Q.put((process_pair_list, process_item_list,
                           process_gop_list, process_util_list))
    end = time.time()
    print("Thread ", self.processIdx, " :", (end - start))
def model_partition_by_gop(conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, conv_G, flag, cut_flag): sub_conv_N = [] sub_conv_M = [] sub_conv_r = [] sub_conv_R = [] sub_conv_K = [] sub_conv_S = [] sub_flag = [] balance_ratio = 0 min_ration = 0.5 min_pair = [0, 0] sub_gops = [[], [], []] model_len = int(len(conv_N)) for i in range(0, model_len - 2): for j in range(i + 1, model_len - 1): sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag = model_partition_ordered( conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, conv_G, flag, i + 1, j + 1) # print sub_conv_N for k in range(0, 3): sub_gops[k] = gop_calculate(sub_conv_N[k], sub_conv_M[k], sub_conv_R[k], sub_conv_K[k]) # sub_gops[k] = conv_net_perf_theo(sub_conv_N[k], sub_conv_M[k], sub_conv_R[k], sub_conv_K[k]) balance_ratio = (max(sub_gops) - min(sub_gops)) / float( min(sub_gops)) # print sub_gops # print "2: ", i, j, sub_gops, balance_ratio, sub_conv_N, sub_conv_M print("Verigy cut status: ", i, j, cut_flag[i], cut_flag[j]) if i == 0 and j == 1: min_ration = balance_ratio print("initial balance ratio: ", balance_ratio) min_pair = [i, j] else: if cut_flag[i] == 1 & cut_flag[j] == 1: if balance_ratio < min_ration: min_ration = balance_ratio min_pair = [i, j] print "min_ratio: ", min_ration, min_pair return min_pair, min_ration
def multiAcc_dse(): # define the network parameter containers conv_N = [] conv_M = [] conv_r = [] conv_R = [] conv_K = [] conv_S = [] flag = [] cut_flag = [] sub_conv_N = [] sub_conv_M = [] sub_conv_r = [] sub_conv_R = [] sub_conv_K = [] sub_conv_S = [] sub_flag = [] pair_1 = [] pair_2 = [] pair_3 = [] lat_1 = 0 lat_2 = 0 lat_3 = 0 sub_lat_list = [] lat_list = [] util_1 = 0 util_2 = 0 util_3 = 0 sub_util_list = [] util_list = [] OPs = 0 sub_pair_list = [] item_list = [] pair_list = [] overall_lat = 60551400 layer_list = [] gop_list = [] """ step 1: extract model from txt file with parameter no_include_fc / include_fc """ conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, conv_G, flag, cut_flag = model_extract( 'no_include_fc') print("Extracted cut flag: ", cut_flag) OPs = gop_calculate(conv_N, conv_M, conv_R, conv_K) max_layerout = max_layer_dataout(conv_N, conv_M, conv_R, conv_K) print_line("Model extract phase") print "1: ", "Model extracted" print "1: ", "Overall convolution operation required: ", OPs print "1: ", "Max layer output data: ", max_layerout # print_line("Model split finish") """ step 2: randomly cluster, param k=4, layer label results are in item """ print_line("Model partition phase") for i in range(0, len(conv_N)): layer_list.append(i) # kmeans=clusters_layers_kmeans(conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, 2) # print kmeans partition_location, diff_ratio = model_partition_by_gop( conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, conv_G, flag, cut_flag) print "2: layers extracted", conv_N print "2: layers cutable ", cut_flag print "2: partition location", partition_location print "2: diff_ratio: ", diff_ratio sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag \ =model_partition_ordered(conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, conv_G, flag, partition_location[0]+1, partition_location[1]+1) # print "2: Best partition output: ", partition_location, diff_ratio print "2:", sub_conv_N sub_gop_list = [] for i in 
range(0, len(sub_conv_N)): sub_gop_list.append( gop_calculate(sub_conv_N[i], sub_conv_M[i], sub_conv_r[i], sub_conv_K[i])) print "2: gop of sub_nets", sub_gop_list print "2: length of sub_conv_N", len(sub_conv_N[0]), len( sub_conv_N[1]), len(sub_conv_N[2]) print "2", sub_flag print "2: length of sub_flag", len(sub_flag[0]), len(sub_flag[1]), len( sub_flag[2]) sub_pair_list = [] sub_lat_list = [] sub_util_list = [] print_line("Best Configuration Search") overall_start = time.time() # acc_cluster_num = 3 # pair_list, item_list, gop_list, util_list = global_search(layer_list, acc_cluster_num, conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, flag, overall_lat) # pair_list, gop_list, util_list = per_die_config_dse_multiAcc(sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, # sub_conv_S, sub_flag) pair_list = per_die_config_dse_multiAcc_flex(sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag) overall_end = time.time() print_line("DSEoutpout") print "Best Configuration Search Results: " for i in range(0, len(pair_list)): print pair_list[i] # print item_list #print "gop_list: ", gop_list #print "pair_list: ", pair_list #print "util_list: ", util_list # for i in range(0, len(util_list)): # print util_list[i], sum(util_list[i]) print "------------------------Final optimal configuration-------------------------------" # print "Network clustered results =", item_list[util_list.index(min(util_list))] # print "<Tm, Tn> = ", pair_list[util_list.index(min(util_list))] # print "Estimated overall latency = ", min(util_list) print "Overall time cost:", overall_end - overall_start, "s" print "----------------------------------------------------------------------------------" # item = return_partition(layer_list, 4, False) # # '''step 3: split the layers based on label clustering results''' # print("layer number is: ", int(len(conv_N))) # sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag \ # = 
model_split_by_list(conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, flag, item) # print sub_conv_N # print "model clustering test done!!!" # # '''step 4: do local search for all sub-models and find optimial <Tm, Tn> pair, lat, and util''' # sub_pair_list, sub_lat_list, sub_util_list = \ # local_search(sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag) # print sub_pair_list, sub_lat_list, sub_util_list # # if max(sub_lat_list) < overall_lat: # overall_lat = max(sub_lat_list) # if len(pair_list) < 10: # pair_list.append(sub_pair_list) # pair_list.append([overall_lat]) # else: # max_among_mins = pair_list.index(max(overall_lat)) # pair_list.remove(pair_list[max_among_mins]) # pair_list.append(sub_pair_list) # pair_list.append([overall_lat]) # print(pair_1, "%.2f" % util_1, pair_2, "%.2f" % util_2, pair_3, "%.2f" % util_3, lat_1, lat_2, lat_3) # for i in range(1, int(len(conv_N)-1)): # for j in range(int(i+1), int(len(conv_N))): # for i in range(1, 10): # for j in range(1, 10): # sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag = model_split_ordered(conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, flag, i, j) # sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag = model_split_unordered(conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, flag) # sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag = model_split_by_label(conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, flag, kmeans.labels_) # print(sub_conv_N) # pair_1, lat_1, pair_2, lat_2, pair_3, lat_3, util_1, util_2, util_3 = local_search(sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag) # # print(i, j, pair_1, "%.2f" % util_1, pair_2, "%.2f" % util_2, pair_3, "%.2f" % util_3, lat_1, lat_2, lat_3) # # if max(lat_1, lat_2, lat_3) < overall_lat: # overall_lat = max(lat_1, lat_2, lat_3) # # if len(pair_list) < 50: # pair_list.append([i, j]) # pair_list.append(pair_1) # 
pair_list.append(pair_2) # pair_list.append(pair_3) # pair_list.append([overall_lat]) # # else: # # max_among_mins = pair_list.index(max(overall_lat)) # # pair_list.remove(pair_list[max_among_mins]) # # pair_list.append(pair_1) # # pair_list.append(pair_2) # # pair_list.append(pair_3) # # pair_list.append(overall_lat) # print(pair_list) # #step 3: # find_min_in_pairs() # min_among_mins = pair_list.index(min(overall_lat)) # print(pair_list[min_among_mins]) print "---------------------------- test part -------------------------------------------" print conv_net_perf(sub_conv_N[2], sub_conv_M[2], sub_conv_R[2], sub_conv_S[2], sub_conv_K[2], sub_flag[2], 8, 274, 37, 4, 4)
def multiAcc_dse():
    """Top-level multi-accelerator DSE driver (live variant — this later
    definition shadows the earlier multiAcc_dse in this file).

    Workflow:
      1. extract the CNN model (plus pooling sizes) from the network txt file,
      2. partition the model into three GOP-balanced sub-nets,
      3. run the flexible per-die configuration search,
      4. derive per-accelerator and per-sub-net task lists and write the
         instruction/parameter file "acc_ins_params.txt".

    No parameters; no return value — results are printed and written to file.
    """
    # define the network parameter containers
    conv_N = []
    conv_M = []
    conv_r = []
    conv_R = []
    conv_K = []
    conv_S = []
    flag = []
    cut_flag = []
    pool_N = []
    sub_conv_N = []
    sub_conv_M = []
    sub_conv_r = []
    sub_conv_R = []
    sub_conv_K = []
    sub_conv_S = []
    sub_flag = []
    # NOTE(review): pair_1..util_3 below are only used by code that has been
    # commented out elsewhere; they are dead weight here.
    pair_1 = []
    pair_2 = []
    pair_3 = []
    lat_1 = 0
    lat_2 = 0
    lat_3 = 0
    sub_lat_list = []
    lat_list = []
    util_1 = 0
    util_2 = 0
    util_3 = 0
    sub_util_list = []
    util_list = []
    OPs = 0
    sub_pair_list = []
    item_list = []
    pair_list = []
    overall_lat = 60551400
    layer_list = []
    gop_list = []
    """ step 1: extract model from txt file with parameter no_include_fc / include_fc """
    conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, conv_G, flag, cut_flag, pool_N = model_extract(
        'no_include_fc')
    # print("Extracted cut flag: ", cut_flag)
    # print("Extracted pool flag:", flag)
    # total convolution work and largest per-layer output volume
    OPs = gop_calculate(conv_N, conv_M, conv_R, conv_K)
    max_layerout = max_layer_dataout(conv_N, conv_M, conv_R, conv_K)
    print_line("Model extract phase")
    print("1: ", "Model extracted")
    print("1: ", "Overall convolution operation required: ", OPs)
    print("1: ", "Max layer output data: ", max_layerout)
    # print_line("Model split finish")
    """ step 2: randomly cluster, param k=4, layer label results are in item """
    print_line("Model partition phase")
    for i in range(0, len(conv_N)):
        layer_list.append(i)
    # kmeans=clusters_layers_kmeans(conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, 2)
    # print kmeans
    partition_location, diff_ratio = model_partition_by_gop(
        conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, conv_G, flag,
        cut_flag)
    print("2: layers extracted", conv_N)
    print("2: layers cutable ", cut_flag)
    print("2: partition location", partition_location)
    print("2: diff_ratio: ", diff_ratio)
    # split the model at the chosen cut pair into three ordered sub-nets
    sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag \
        = model_partition_ordered(conv_N, conv_M, conv_r, conv_R, conv_K,
                                  conv_S, conv_G, flag,
                                  partition_location[0] + 1,
                                  partition_location[1] + 1)
    # print "2: Best partition output: ", partition_location, diff_ratio
    print("2:", sub_conv_N)
    sub_gop_list = []
    for i in range(0, len(sub_conv_N)):
        sub_gop_list.append(
            gop_calculate(sub_conv_N[i], sub_conv_M[i], sub_conv_R[i],
                          sub_conv_K[i]))
    print("2: gop of sub_nets", sub_gop_list)
    print("2: length of sub_conv_N", len(sub_conv_N[0]), len(sub_conv_N[1]),
          len(sub_conv_N[2]))
    print("2", sub_flag)
    print("2: length of sub_flag", len(sub_flag[0]), len(sub_flag[1]),
          len(sub_flag[2]))
    sub_pair_list = []
    sub_lat_list = []
    sub_util_list = []
    print_line("Best Configuration Search")
    overall_start = time.time()
    # acc_cluster_num = 3
    # pair_list, item_list, gop_list, util_list = global_search(layer_list, acc_cluster_num, conv_N, conv_M, conv_r, conv_R, conv_K, conv_S, flag, overall_lat)
    # pair_list, gop_list, util_list = per_die_config_dse_multiAcc(sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K,
    #                                                              sub_conv_S, sub_flag)
    pair_list = per_die_config_dse_multiAcc_flex(sub_conv_N, sub_conv_M,
                                                 sub_conv_r, sub_conv_R,
                                                 sub_conv_K, sub_conv_S,
                                                 sub_flag)
    overall_end = time.time()
    print_line("DSE Output")
    print("Best Configuration Search Results for layer accelerators: ")
    for i in range(0, len(pair_list)):
        print(pair_list[i])
    # map the chosen configurations onto concrete accelerator tasks
    acc_task_list, total_acc_num = acc_task_analysis(pair_list, sub_conv_N,
                                                     sub_conv_M, sub_conv_r,
                                                     sub_conv_R, sub_conv_K,
                                                     sub_conv_S, sub_flag)
    print("Accelerator task list: ")
    for acc_num in range(0, len(acc_task_list)):
        print("acc core", acc_num, " task list: ", acc_task_list[acc_num])
    print_line("Subnet Task Out")
    subnet_task_list = subnet_task_analysis(pair_list, acc_task_list,
                                            sub_conv_N, sub_conv_M,
                                            sub_conv_r, sub_conv_R,
                                            sub_conv_K, sub_conv_S, sub_flag)
    print("sub net interface list:")
    for i in range(0, len(subnet_task_list)):
        print(subnet_task_list[i])
    print_line("Write out configurations")
    print(len(pair_list), "sub-nets are generated")
    print(total_acc_num, "accelerators are written into the cofig file")
    # emit the accelerator instruction/parameter file consumed downstream
    generate_param_file(pair_list, pool_N, acc_task_list, subnet_task_list,
                        "acc_ins_params.txt")
    print_line("netGen run time system info")
    print("Overall time cost:", overall_end - overall_start, "s")
    print_line("line")
    print_line("test")
    print(
        conv_net_perf(sub_conv_N[2], sub_conv_M[2], sub_conv_R[2],
                      sub_conv_S[2], sub_conv_K[2], sub_flag[2], 8, 274, 37,
                      4, 4))
def local_search(sub_conv_N, sub_conv_M, sub_conv_r, sub_conv_R, sub_conv_K, sub_conv_S, sub_flag): """ :param sub_conv_N: the input sub_conv_N is already splitted into several sub-nets :param sub_conv_M: same as above :param sub_conv_r: saa :param sub_conv_R: saa :param sub_conv_K: saa :param sub_conv_S: saa :param sub_flag: saa :return: the most optimal configuration for current sub-nets for an optimal system latency """ DSP = 6840 / 3 # datatype = fixed factor = 1 pair_1 = [] lat_1 = 0 util_1 = 0 pair_2 = [] lat_2 = 0 util_2 = 0 pair_3 = [] lat_3 = 0 util_3 = 0 pair_list = [] lat_list = [] util_list = [] gop_list = [] gop_per_subnet = [] gop_total = 0 dsp_per_acc = [] dsp_occupied = 0 # print "lists in sub_conv_N" # print len(sub_conv_N) # print sub_conv_N step = int(1) ratio = 0.05 search_counter = 0 Resolution = 10 ratio_init = 0 """initializing the dsp number for per acc based on the ops requirement""" for i in range(0, len(sub_conv_N)): gop_per_subnet.append( gop_calculate(sub_conv_N[i], sub_conv_M[i], sub_conv_R[i], sub_conv_K[i])) gop_total += gop_per_subnet[i] print "gop_per_subnet in local_search: ", gop_per_subnet for i in range(0, len(sub_conv_N)): if i < len(sub_conv_N) - 1: dsp_per_acc.append( math.ceil(DSP * (gop_per_subnet[i] / float(gop_total)))) dsp_occupied += dsp_per_acc[i] else: dsp_per_acc.append(math.ceil(DSP - dsp_occupied)) """ Iteratively find the system level optimal configuration for the all the sub-nets""" search_stop = 0 while search_stop == 0 and search_counter < Resolution + 1: for i in range(0, len(sub_conv_N)): pair, cycle, cycle_per_layer = constrained_dse( sub_conv_N[i], sub_conv_M[i], sub_conv_r[i], sub_conv_R[i], sub_conv_K[i], sub_conv_S[i], sub_flag[i], int(2200), int(37), factor) pair_list.append(pair) lat_list.append(cycle) util_list.append(pair[0] * pair[1] / float(DSP)) if len(pair_list) > len(sub_conv_N): for remove_cnt in range(0, len(sub_conv_N)): pair_list.remove(pair_list[0]) lat_list.remove(lat_list[0]) 
util_list.remove(util_list[0]) ratio_tmp = ((max(lat_list) - min(lat_list)) / float(min(lat_list))) # print ratio_tmp if search_counter == 0: ratio_init = ratio_tmp # or search_counter == Resolution: if ratio_tmp < ratio: search_stop = 1 else: max_idx = lat_list.index(min(lat_list)) min_idx = lat_list.index(max(lat_list)) if ratio_tmp - ratio > float(0.1): if (dsp_per_acc[max_idx] - 5 * step > 0): dsp_per_acc[max_idx] = dsp_per_acc[max_idx] - 5 * step dsp_per_acc[min_idx] = dsp_per_acc[min_idx] + 5 * step else: dsp_per_acc[max_idx] = dsp_per_acc[max_idx] - step dsp_per_acc[min_idx] = dsp_per_acc[min_idx] + step else: if (dsp_per_acc[max_idx] - step > 0): dsp_per_acc[max_idx] = dsp_per_acc[max_idx] - step dsp_per_acc[min_idx] = dsp_per_acc[min_idx] + step search_counter = search_counter + 1 # if search_stop == 1: # and search_counter == 101 print "local search stopped at =", search_counter - 1, "current ratio: ", ratio_tmp print "initial ratio ->", ratio_init return pair_list, lat_list, util_list
def per_die_config_dse_multiAcc_flex(sub_conv_N, sub_conv_M, sub_conv_r,
                                     sub_conv_R, sub_conv_K, sub_conv_S,
                                     sub_flag):
    """Flexible per-die DSE (legacy Python-2 variant — shadowed by the later
    redefinition of the same name in this file).

    For each sub-net, tries 1 to 3 accelerators; for each count, enumerates
    every split produced by split_sub_net().  Each piece of a split gets a
    DSP budget proportional to its GOP share and constrained_dse() finds its
    best <Tm, Tn>.  The configuration whose slowest piece has the fewest
    cycles wins.

    :return: opt_res — per sub-net, [[acc_count, split, max_cycles], pairs]
    """
    print "sub_conv_N (original): ", sub_conv_N
    print "sub_flag (original): ", sub_flag
    opt_res = []
    # i: iterate over each sub-net
    for i in range(0, len(sub_conv_N)):
        # print "sub_conv_N[" + str(i) + "]: ", sub_conv_N[i]
        min_cycle = sys.maxint
        min_idx = -1
        # total GOPs of sub-net i — denominator for proportional DSP split
        sub_conv_net_gop = gop_calculate(sub_conv_N[i], sub_conv_M[i],
                                         sub_conv_R[i], sub_conv_K[i])
        cycle_list = []
        pair_list = []
        # when the number of accelerators is j
        # for j in range(1, 3 + 1):
        for j in range(1, 3 + 1):
            # cycle should be compared here, to find optimal accelerator number and config
            lat_list = []
            start_index = 0
            # k: the index to split the sub_conv_N
            for k in split_sub_net(0, len(sub_conv_N[i]), j):
                # NOTE(review): 80% of one die's DSP share (6840/3) —
                # presumably a routing/overhead margin; confirm.
                DSP = int(6840 / 3 * 0.8)
                dsp_list = []
                local_cycle_list = []
                local_pair_list = []
                sub_net_gop_list = []
                factor = 1
                # re-caculate sub_conv_N, sub_conv_M, sub_conv_R, sub_conv_K
                sub_conv_N_new = []
                sub_conv_M_new = []
                sub_conv_r_new = []
                sub_conv_R_new = []
                sub_conv_K_new = []
                sub_conv_S_new = []
                sub_flag_new = []
                # -2: illegal setting, pass
                if k[0] == -2:
                    print "illegal partitioning of sub-net, passing!"
                    continue
                # -1: only one accelerator
                if k[0] == -1:
                    sub_conv_N_new.append(sub_conv_N[i])
                    sub_conv_M_new.append(sub_conv_M[i])
                    sub_conv_r_new.append(sub_conv_r[i])
                    sub_conv_R_new.append(sub_conv_R[i])
                    sub_conv_K_new.append(sub_conv_K[i])
                    sub_conv_S_new.append(sub_conv_S[i])
                    sub_flag_new.append(sub_flag[i])
                # else: 2 or 3 accelerators
                else:
                    # pair each split boundary with its successor to form
                    # [start, end) slices over the flattened layer lists
                    zi = zip([0] + k, k + [None])
                    for idx in range(0, len(zi)):
                        sub_conv_M_new.append(
                            flatten(sub_conv_M[i])[zi[idx][0]:zi[idx][1]])
                        sub_conv_N_new.append(
                            flatten(sub_conv_N[i])[zi[idx][0]:zi[idx][1]])
                        sub_conv_r_new.append(
                            flatten(sub_conv_r[i])[zi[idx][0]:zi[idx][1]])
                        sub_conv_R_new.append(
                            flatten(sub_conv_R[i])[zi[idx][0]:zi[idx][1]])
                        sub_conv_K_new.append(
                            flatten(sub_conv_K[i])[zi[idx][0]:zi[idx][1]])
                        sub_conv_S_new.append(
                            flatten(sub_conv_S[i])[zi[idx][0]:zi[idx][1]])
                        sub_flag_new.append(
                            flatten(sub_flag[i])[zi[idx][0]:zi[idx][1]])
                # print "split index k = ", k, "accelerator j = ", j, "sub_conv_N_new: ", sub_conv_N_new
                # m: the mth sub-sub-net in the sub-net
                temp_pair_list = []
                for m in range(0, len(sub_conv_N_new)):
                    # print "sub_conv_N_new[" + str(m) + "]: ", sub_conv_N_new[m]
                    sub_net_gop_list.append(
                        gop_calculate(sub_conv_N_new[m], sub_conv_M_new[m],
                                      sub_conv_R_new[m], sub_conv_K_new[m]))
                    # allocate_dsp by layer gops
                    dsp_list.append(
                        math.ceil(DSP * (sub_net_gop_list[m]) /
                                  sub_conv_net_gop))
                    # search best <Tm,Tn> configurations
                    pair, cycle, cycle_per_layer = constrained_dse(
                        sub_conv_N_new[m], sub_conv_M_new[m],
                        sub_conv_r_new[m], sub_conv_R_new[m],
                        sub_conv_K_new[m], sub_conv_S_new[m],
                        sub_flag_new[m], int(dsp_list[m]), int(37), factor,
                        j)
                    local_cycle_list.append(cycle)
                    temp_pair_list.append(pair)
                    # local_pair_list.append(pair)
                # latency of this configuration = its slowest accelerator
                cycle_list.append([j, k, max(local_cycle_list)])
                pair_list.append(temp_pair_list)
        # find the minimum cycles and the corresponding index for each sub-net
        for n in range(0, len(cycle_list)):
            if cycle_list[n][2] < min_cycle:
                min_cycle = cycle_list[n][2]
                min_idx = n
        opt_res.append([cycle_list[min_idx], pair_list[min_idx]])
    return opt_res
def per_die_config_dse_multiAcc_flex(sub_conv_N, sub_conv_M, sub_conv_r,
                                     sub_conv_R, sub_conv_K, sub_conv_S,
                                     sub_flag):
    """Flexible per-die DSE (live variant — shadows the earlier definition
    of the same name in this file).

    For each sub-net, tries 1 to 3 accelerators; for each count, enumerates
    every split produced by split_sub_net().  Each piece of a split gets a
    DSP budget proportional to its GOP share and constrained_dse() finds its
    best <Tm, Tn>.  The configuration whose slowest piece has the fewest
    cycles wins.

    :return: opt_res — per sub-net, [[acc_count, split, max_cycles], pairs]
    """
    print("sub_conv_N (original): ", sub_conv_N)
    print("sub_flag (original): ", sub_flag)
    opt_res = []
    # i: iterate over each sub-net
    for i in range(0, len(sub_conv_N)):
        min_cycle = sys.maxsize
        min_idx = -1
        # total GOPs of sub-net i — denominator for proportional DSP split
        sub_conv_net_gop = gop_calculate(sub_conv_N[i], sub_conv_M[i],
                                         sub_conv_R[i], sub_conv_K[i])
        print("\t[DEBUG] processing sub_net - sub_conv_N[{}]: {} - sub_conv_M[{}]: {} - sub_conv_r[{}]: {} - TOTAL GOPs = {:0.4f}".format(\
            i,sub_conv_N[i],i,sub_conv_M[i],i,sub_conv_r[i],sub_conv_net_gop/1e9))
        cycle_list = []
        pair_list = []
        # when the number of accelerators is j
        for j in range(1, 3 + 1):
            # cycle should be compared here, to find optimal accelerator number and config
            lat_list = []
            start_index = 0
            # k: the index to split the sub_conv_N
            for k in split_sub_net(0, len(sub_conv_N[i]), j):
                # NOTE(review): message says "index to split" but prints j
                # (the accelerator count), not k — looks like a typo; confirm.
                print("\t\t[DEBUG] index to split sub_conv_N[{}] : {}".format(
                    i, j))
                # NOTE(review): 80% of one die's DSP share (6840/3) —
                # presumably a routing/overhead margin; confirm.
                DSP = int(6840 / 3 * 0.8)
                dsp_list = []
                local_cycle_list = []
                local_pair_list = []
                sub_net_gop_list = []
                factor = 1
                # re-caculate sub_conv_N, sub_conv_M, sub_conv_R, sub_conv_K
                sub_conv_N_new = []
                sub_conv_M_new = []
                sub_conv_r_new = []
                sub_conv_R_new = []
                sub_conv_K_new = []
                sub_conv_S_new = []
                sub_flag_new = []
                # -2: illegal setting, pass
                if k[0] == -2:
                    print("\t\tillegal partitioning of sub-net, passing!")
                    continue
                # -1: only one accelerator
                if k[0] == -1:
                    sub_conv_N_new.append(sub_conv_N[i])
                    sub_conv_M_new.append(sub_conv_M[i])
                    sub_conv_r_new.append(sub_conv_r[i])
                    sub_conv_R_new.append(sub_conv_R[i])
                    sub_conv_K_new.append(sub_conv_K[i])
                    sub_conv_S_new.append(sub_conv_S[i])
                    sub_flag_new.append(sub_flag[i])
                    print("\t\tsub_conv_N_new: ", sub_conv_N_new)
                # else: 2 or 3 accelerators
                else:
                    # pair each split boundary with its successor to form
                    # [start, end) slices over the flattened layer lists
                    zi = list(zip([0] + k, k + [None]))
                    print("\t\ttesting zi in python 3.5", zi, len(zi))
                    for idx in range(0, len(zi)):
                        sub_conv_M_new.append(
                            flatten(sub_conv_M[i])[zi[idx][0]:zi[idx][1]])
                        sub_conv_N_new.append(
                            flatten(sub_conv_N[i])[zi[idx][0]:zi[idx][1]])
                        sub_conv_r_new.append(
                            flatten(sub_conv_r[i])[zi[idx][0]:zi[idx][1]])
                        sub_conv_R_new.append(
                            flatten(sub_conv_R[i])[zi[idx][0]:zi[idx][1]])
                        sub_conv_K_new.append(
                            flatten(sub_conv_K[i])[zi[idx][0]:zi[idx][1]])
                        sub_conv_S_new.append(
                            flatten(sub_conv_S[i])[zi[idx][0]:zi[idx][1]])
                        sub_flag_new.append(
                            flatten(sub_flag[i])[zi[idx][0]:zi[idx][1]])
                    print("\t\tsub_conv_N_new: ", sub_conv_N_new)
                # m: the mth sub-sub-net in the sub-net
                temp_pair_list = []
                acc_layer_task_list = []
                for m in range(0, len(sub_conv_N_new)):
                    sub_net_gop_list.append(
                        gop_calculate(sub_conv_N_new[m], sub_conv_M_new[m],
                                      sub_conv_R_new[m], sub_conv_K_new[m]))
                    # allocate_dsp by layer gops
                    total_gops_ratio = sub_net_gop_list[m] / sub_conv_net_gop
                    dsp_list.append(math.ceil(DSP * total_gops_ratio))
                    print(
                        "\t\t\tsub_conv_N_new[{}]: {} - GOPs: {:0.4f} ({:0.2f})- DSPs {}"
                        .format(m, sub_conv_N_new[m],
                                sub_net_gop_list[m] / 1e9, total_gops_ratio,
                                dsp_list[m]))
                    # search best <Tm,Tn> configurations
                    pair, cycle, cycle_per_layer = constrained_dse(
                        sub_conv_N_new[m], sub_conv_M_new[m],
                        sub_conv_r_new[m], sub_conv_R_new[m],
                        sub_conv_K_new[m], sub_conv_S_new[m],
                        sub_flag_new[m], int(dsp_list[m]), int(37), factor,
                        j)
                    local_cycle_list.append(cycle)
                    temp_pair_list.append(pair)
                    # NOTE(review): appends the whole sub_conv_N_new list (not
                    # element m) and the result is never used or returned —
                    # looks vestigial; confirm before relying on it.
                    acc_layer_task_list.append(sub_conv_N_new)
                    # local_pair_list.append(pair)
                # latency of this configuration = its slowest accelerator
                cycle_list.append([j, k, max(local_cycle_list)])
                pair_list.append(temp_pair_list)
        # find the minimum cycles and the corresponding index for each sub-net
        for n in range(0, len(cycle_list)):
            if cycle_list[n][2] < min_cycle:
                min_cycle = cycle_list[n][2]
                min_idx = n
        print("\t[DEBUG] cycle_list[min_idx] = {} - pair_list[min_idx] = {}".
              format(cycle_list[min_idx], pair_list[min_idx]))
        opt_res.append([cycle_list[min_idx], pair_list[min_idx]])
        print("\t[DEBUG] DONE for sub_net_N[{}]".format(i))
    return opt_res