def layer_design_space_exploration(self):
    """
    Per-layer design space exploration.

    Fixes `P_fft` and `P_ifft` of the network to 1, queries
    `self.layer_i_param(li)` for every layer index, keeps the first entry
    of each returned parameter and finally replaces `self.layer` by the
    collected per-layer arrays. `self.Nmax` is set to the largest `Ni`.
    """
    self.network['P_fft'] = 1
    self.network['P_ifft'] = 1
    # The concrete value of T_img_i is irrelevant while exploring.
    self.layer['T_img_i'] = np.array([1])

    param_names = ('Ni', 'di', 'fin_pi', 'fout_pi', 'T_img_i')
    collected = {name: [] for name in param_names}

    for layer_idx in range(self.num_layers):
        best_for_layer = self.layer_i_param(layer_idx)
        for name in self.layer.keys():
            collected[name].append(best_for_layer[name][0])
        printf('finish layer dse for {}', layer_idx)
        self.print_layer_conf(best_for_layer, layer_idx)

    for name in self.layer.keys():
        collected[name] = np.array(collected[name])

    self.layer = collected
    self.layer['T_img_i'] = np.array([1] * self.num_layers)
    self.Nmax = self.layer['Ni'].max()
def go(args):
    """
    Run the algorithm-level and then the architecture-level design space
    exploration and print the results.

    args: object providing
        - `cnn`, `hardware`: paths of the YAML files describing the CNN model
          and the hardware model,
        - `cap`: truthy selects the 'CaP' architecture, otherwise 'OaA'.
    """
    # safe_load only builds plain YAML types and needs no explicit Loader
    # (yaml.load without a Loader is rejected by recent PyYAML releases).
    # NOTE(review): assumes the model files contain no python-specific tags.
    with open(args.cnn) as f_cnn:
        model_cnn = yaml.safe_load(f_cnn)
    with open(args.hardware) as f_hw:
        model_hw = yaml.safe_load(f_hw)

    #########################
    # algo level optimization
    #########################
    # OPS: {'tool': [x1,x1], 'spatial': [y1,y2]}
    # param_algo: {'fft': [N1,N2], 'folding': [f1,f2]}
    OPS, param_algo = algo_dse.algo_dse(model_cnn,
                                        model_hw,
                                        options={
                                            'CaP': args.cap,
                                            'max_folding': 0,
                                            'var_fft': True
                                        })

    #########################
    # arch level optimization
    #########################
    # performance: {'latency': [l1,l2], 'throughput': [t1,t2]}
    arch_type = 'CaP' if args.cap else 'OaA'
    performance, param_arch, stat_resource = arch_dse.arch_dse(model_cnn,
                                                               model_hw,
                                                               param_algo,
                                                               type=arch_type)

    printf('algo params: {}', param_algo)
    printf('operation count: {:5.4f}%',
           OPS['tool'].sum() / OPS['spatial'].sum())
    printf('latency: {:5.2f}ms throughput: {:5.1f}GOPS',
           performance['latency'], performance['throughput'])
    printf('arch params: {}', param_arch)
    printf('utilization: {}', stat_resource)
def _perm_to_int(perm): """ convert perm (which can be a str or int) to int (understandable by os module) e.g.: perm='0444' if for read-only policy However, I won't process the first char for now """ if type(perm) == type(0): return perm ERROR_PERM_FORMAT = 'format of perm is wrong!' try: assert len(perm) == 4 except AssertionError: printf(ERROR_PERM_FORMAT, type='ERROR') exit() p_pos = ['','USR', 'GRP', 'OTH'] # don't care, owner, group, others p_ret = 0 eval_str = 'stat.S_I{}{}' for n in range(1,4): p_int = int(perm[n]) try: assert p_int <= 7 and p_int >= 0 except AssertionError: printf(ERROR_PERM_FORMAT, type='ERROR') exit() if p_int >= 4: p_ret |= eval(eval_str.format('R', p_pos[n])) if p_int in [2,3,6,7]: p_ret |= eval(eval_str.format('W', p_pos[n])) if p_int%2 == 1: p_ret |= eval(eval_str.format('X', p_pos[n])) return p_ret
def compare_algo(cnn_dir, hw_config, tool_config):
    """
    Compare the operation counts of the spatial, CaP and OaA algorithms for
    every CNN model found in a directory.

    cnn_dir:        the directory holding a couple of cnn models
                    (only files ending in '.yaml' are used, in sorted order)
    hw_config:      the hw config yaml file
    tool_config:    the tool config yaml file

    For each model the comparison is printed, and the collected operation
    counts are saved to '<cnn_dir>/{OaA,CaP,SPA}_complexity.npy', reshaped
    to (num_cnn, -1) and transposed.

    Raises ValueError when the directory contains no '.yaml' file.
    """
    cnn_directory = os.fsencode(cnn_dir)
    cnn_root = cnn_directory.decode('utf-8')
    entry_names = (entry.decode('utf-8') for entry in os.listdir(cnn_directory))
    file_list = sorted(name for name in entry_names if name.endswith('.yaml'))
    num_cnn = len(file_list)
    if num_cnn == 0:
        # Nothing to reshape or save; fail with a message that names the cause.
        raise ValueError('no .yaml cnn model found in {}'.format(cnn_dir))

    # Gather per-model counts and concatenate once at the end.
    oaa_chunks = []
    cap_chunks = []
    spa_chunks = []
    for cnn_filename in file_list:
        params_cnn, params_algo, _1, _2, _3 = parse_input(
            '{}/{}'.format(cnn_root, cnn_filename), hw_config, tool_config)

        params_algo_spa = copy.deepcopy(params_algo)
        params_algo_spa['d max'] = 1
        params_algo_spa['name'] = 'Spatial'
        ae_spa = algo_engine.spatial_complexity(params_cnn, params_algo_spa)

        ae_cap = algo_engine.fft_complexity(params_cnn, params_algo)

        # OaA is modelled as the CaP configuration with 'd max' forced to 1.
        params_algo_oaa = copy.deepcopy(params_algo)
        params_algo_oaa['d max'] = 1
        params_algo_oaa['name'] = params_algo_oaa['name'].replace('CaP', 'OaA')
        ae_oaa = algo_engine.fft_complexity(params_cnn, params_algo_oaa)

        ae_spa.count()
        ae_cap.count(global_d_N=True)
        ae_oaa.count()
        printf(ae_cap.str_compare_algo(ae_oaa, ae_spa), type=None, separator='=')
        printf('{} d value: {}', cnn_filename,
               ae_cap.chosen_params_algo['batch fold'])

        oaa_chunks.append(ae_oaa.ops_count)
        cap_chunks.append(ae_cap.ops_count)
        spa_chunks.append(ae_spa.ops_count)

    # One row per layer position, one column per cnn model.
    oaa_count = np.concatenate(oaa_chunks).reshape(num_cnn, -1).T
    cap_count = np.concatenate(cap_chunks).reshape(num_cnn, -1).T
    spa_count = np.concatenate(spa_chunks).reshape(num_cnn, -1).T

    to_file = '{}/{}_complexity.npy'
    np.save(to_file.format(cnn_dir, 'OaA'), oaa_count)
    np.save(to_file.format(cnn_dir, 'CaP'), cap_count)
    np.save(to_file.format(cnn_dir, 'SPA'), spa_count)
def _mat_shape_check(*mat_l): """ check if mat is 2D and its shape is square (N x N) """ try: for mat in mat_l: assert len(mat.shape) == 2 assert mat.shape[0] == mat.shape[1] except AssertionError: printf("Currently only support 2D square matrix, get {}", mat.shape,type="ERROR")
def overlap_add(kern, base, N):
    """
    2D convolution of `base` with `kern` through N-point FFTs, using the
    overlap-and-add method.

    Suppose:
        kern            K x K
        base            B x B
        base window     W x W
        returned matrix R x R
    Then:
        - W + K - 1 = N
        - kern and every W x W window of base are zero padded to N x N
          (via mat_padding) before the FFT
        - each N x N tile result is added at the offset of its window
        - R = B + K - 1 (stride 1, padding K - 1)
    Note:
        - Currently support only square shaped kernel & base.
        - Requires K <= N.

    Returns the real part of the R x R result.
    """
    _mat_shape_check(kern, base)
    K = kern.shape[0]
    B = base.shape[0]
    assert K <= N
    # stride 1 and padding K - 1 give the full convolution size.
    R = B + K - 1
    W = N + 1 - K       # side of one base window
    # Padded working size: the last tile starts at (ceil(B/W) - 1) * W and
    # spans N = W + K - 1 entries, so every tile fits exactly.
    R_prime = ceil(B/W)*W + K - 1
    ret_mat = np.zeros((R_prime, R_prime),dtype=complex)
    kern_fft = np.fft.fft2(mat_padding(kern,N))
    for i in range(0,B,W):
        for j in range(0,B,W):
            base_win_fft = np.fft.fft2(mat_padding(base[i:i+W,j:j+W],N))
            ret_tile = np.fft.ifft2(kern_fft*base_win_fft)
            # Accumulate the tile in place; neighbouring tiles overlap by
            # K - 1 rows/columns and those overlaps add up.
            ret_mat[i:i+N,j:j+N] += ret_tile
    # sanity check: the result of real inputs must be real
    assert not np.any(np.around(ret_mat.imag,decimals=10))
    return ret_mat[0:R,0:R].real
def compare_CaP_OaA():
    """
    Print, for every entry of the module-level `layers`, the operation
    counts of the hybd1 FFT variant, of CaP (with its chosen folding) and
    of the spatial baseline, followed by the totals.

    Side effect: element 4 of every layer is overwritten with 16 before
    the CaP complexity is evaluated.
    """
    printf(
        " hybd1 complexity | CaP complexity | folding | spatial complexity",
        type=None)
    total_hybd = 0
    total_cap = 0
    total_spatial = 0
    for layer in layers:
        spatial_ops = op.op_count_spatial(*layer)
        hybd_ops = op.op_count_fft(*layer, folding_1D=1)
        # CaP is evaluated with element 4 of the layer fixed to 16.
        layer[4] = 16
        cap_ops, best_folding = complexity_CaP(layer)

        total_hybd += hybd_ops
        total_cap += cap_ops
        total_spatial += spatial_ops
        printf("{} | {} | {} | {}",
               hybd_ops / 1e9,
               cap_ops / 1e9,
               best_folding,
               spatial_ops / 1e9,
               type=None)
    printf("sum: hybd1 vs. CaP vs. spatial")
    printf("{} ({}) {} ({}) {}",
           total_hybd / 1e9, total_hybd / total_spatial,
           total_cap / 1e9, total_cap / total_spatial,
           total_spatial)
def mkdir_r(dir_r):
    """
    Create a directory together with all of its missing ancestors.

    dir_r of 'a/b/c' or 'a/b/c/' will both create directory a, b and c.
    Every directory actually created is reported through printf.

    WARNING: no explicit error checking:
        e.g.: if there is a file (not dir) called 'a/b', then this function
        will fail
    """
    # An empty dirname means "relative to the current directory".
    parent = os.path.dirname(dir_r) or '.'
    if not os.path.exists(parent):
        mkdir_r(parent)
    if os.path.exists(dir_r):
        return
    os.mkdir(dir_r)
    printf("created dir: {}", dir_r, separator=None)
def print_layer_conf(self, _opt_layer_li_param, li):
    """
    Print a small table with the configuration chosen for layer `li`.

    _opt_layer_li_param: mapping with the keys 'Ni', 'di', 'fin_pi',
        'fout_pi' and 'T_img_i'; the first element of each value is shown.
        The number of table columns follows len(_opt_layer_li_param).
    li: layer index, used in the title only.
    """
    columns = ('Ni', 'di', 'fin_pi', 'fout_pi', 'T_img_i')
    num_param = len(_opt_layer_li_param)
    header_fmt = ' '.join(['{:>10s}'] * num_param)
    values_fmt = ' '.join(['{:>10d}'] * num_param)
    rule_width = 10 * num_param + 2 * (num_param - 1)

    # Title, header row, rule and value row, each terminated by a newline.
    pieces = [
        stringf('LAYER {} CONF', li, type=None, separator='.'),
        header_fmt.format(*columns),
        '-' * rule_width,
        values_fmt.format(*[_opt_layer_li_param[c][0] for c in columns]),
    ]
    s = '\n'.join(pieces) + '\n'
    printf(s, type=None, separator='*')
def fft(N, ip, int_bits, tot_bits, format_='h'):
    """
    Row-wise N-point FFT of fixed point encoded samples.

    N:        FFT length; `ip` is flattened, truncated to a multiple of N
              and reshaped to rows of N samples (at least one row required).
    ip:       encoded input samples, decoded with fpt_to_decimal.
    int_bits, tot_bits: forwarded to the fixed point conversion helpers.
    format_:  encoding used for the OUTPUT strings. The input is always
              decoded with format_='h', independently of this argument.

    Returns an array of strings (at most 16 characters each) of the same
    shape as the reshaped input; every entry is '<real> <imag>' as produced
    by decimal_to_fpt.
    """
    flat = list(np.array(ip).flatten())
    usable = len(flat) // N * N
    flat = flat[0:usable]
    assert len(flat) >= N
    decoded = [fpt_to_decimal(int_bits, tot_bits, sample, format_='h')
               for sample in flat]
    rows = np.array(decoded).reshape(-1, N)

    spectrum = np.ndarray(shape=rows.shape, dtype=np.complex64)
    spectrum_str = np.ndarray(shape=rows.shape, dtype=(np.str_, 16))

    # First pass: transform each row (and echo the row being transformed).
    for row_idx in range(len(rows)):
        printf(rows[row_idx])
        spectrum[row_idx] = np.fft.fft(rows[row_idx])

    # Second pass: encode real and imaginary parts back to fixed point.
    for row_idx in range(len(spectrum)):
        encoded = []
        for value in spectrum[row_idx]:
            real_str = decimal_to_fpt(int_bits, tot_bits, value.real,
                                      format_=format_)
            imag_str = decimal_to_fpt(int_bits, tot_bits, value.imag,
                                      format_=format_)
            encoded.append('{} {}'.format(real_str, imag_str))
        spectrum_str[row_idx] = np.array(encoded)
    return spectrum_str
def log_streaming_data_SPN(N, p, input_data, output_data, resources):
    """
    Print the input and the output stream followed by the resources.

    N:           a blank line is emitted whenever exactly N elements have
                 been printed since the previous one; N >= 10 also widens
                 the per-field format to two characters.
    p:           not used by this function.
    input_data, output_data: iterables of groups; every element of a group
                 is shown as 'first,second' of its two leading items.
    resources:   mapping printed as 'key: value' lines.
    """
    field = '{:2s}' if N // 10 else '{:1s}'
    pair_fmt = field + ',' + field

    def _pair_to_str(pair):
        return pair_fmt.format(str(pair[0]), str(pair[1]))

    vec_pair_to_str = np.vectorize(_pair_to_str)

    streams = (('0: INPUT', input_data), ('1: OUTPUT', output_data))
    emitted = 0
    for label, stream in streams:
        printf('{}', label)
        for group in stream:
            emitted += len(group)
            print(' '.join(vec_pair_to_str(group)) + ' ', end='')
            if emitted == N:
                emitted = 0
                print('\n')

    for key in resources.keys():
        printf('{}: {}', key, resources[key])
def overlap_add_1D(kern,base,N):
    """
    1D convolution of `base` with `kern` through N-point FFTs, using the
    overlap-and-add method, printed next to np.convolve for comparison.

    kern: 1D array of length K (K <= N required).
    base: 1D array of length B.
    N:    FFT length; base is consumed in windows of W = N + 1 - K samples.

    Default padding & stride:
        padding = K-1
        stride = 1
    so the result has R = K + B - 1 samples.

    Returns the real-valued result of length R (also printed).
    Raises ValueError when N < K.
    """
    K = kern.shape[0]
    B = base.shape[0]
    if N < K:
        raise ValueError(
            "FFT size N={} is smaller than the kernel length K={}".format(N, K))
    R = K + B - 1
    W = N + 1 - K
    kern_pad = np.zeros(N)
    kern_pad[0:K] = kern
    kern_fft = np.fft.fft(kern_pad,N)
    ret = np.zeros(R)
    for i in range(0,B,W):
        # The last window may hold fewer than W samples; the rest stays 0.
        window = base[i:i+W]
        base_pad = np.zeros(N)
        base_pad[0:window.shape[0]] = window
        base_win_fft = np.fft.fft(base_pad,N)
        ret_win = np.fft.ifft(kern_fft*base_win_fft)
        # Clip the tile at the end of the result: entries past R only hold
        # the zero padding. Keep the real part explicitly (real inputs).
        stop = min(i + N, R)
        ret[i:stop] += ret_win[0:stop - i].real
    # debug
    printf("normal conv:\n{}", np.convolve(kern,base))
    printf("fft conv:\n{}", ret)
    return ret
def explore_fix_folding(layers, range_N=None, range_folding=None, name=''):
    """
    Give the full statistics for each fixed folding factor.

    layers:        sequence of layer parameter lists; elements 0..3 are
                   passed to core_fft_size_folding, elements 0..5 (with a
                   None inserted after the fourth) to op.op_count_spatial.
    range_N:       candidate FFT sizes, defaults to [16, 32].
    range_folding: candidate folding factors, defaults to 1..30.
    name:          not used by this function.

    Prints one table per folding factor and returns a tuple
    (per-layer N of the best folding, best folding, its summed op count),
    where "best" is the folding with the smallest sum over all layers.
    """
    if range_N is not None:
        range_N = np.array(range_N)
    else:
        range_N = [16, 32]
    if range_folding is not None:
        range_folding = np.array(range_folding)
    else:
        range_folding = np.arange(1, 30 + 1)

    num_fold = len(range_folding)
    num_layers = len(layers)

    # Spatial baseline per layer, scaled by 1e6.
    ops_spatial_layers = np.zeros(num_layers)
    for layer_idx, layer in enumerate(layers):
        ops_spatial_layers[layer_idx] = op.op_count_spatial(
            *(layer[0:4]), None, *(layer[4:6])) / 1e6
    ops_spatial_total = ops_spatial_layers.sum()

    min_ops_layers = np.zeros((num_fold, num_layers))
    N_layers = np.zeros((num_fold, num_layers))
    for fold_idx, fd in enumerate(range_folding):
        printf('optimal values (FFT,folding={}):', fd)
        printf(' layer N folding MinOps ratio', type=None, separator='-')
        for layer_idx, layer in enumerate(layers):
            best_ops, best_N, fd_i = core_fft_size_folding(*(layer[0:4]),
                                                           range_N=range_N,
                                                           range_folding=fd,
                                                           name='')
            min_ops_layers[fold_idx][layer_idx] = best_ops
            N_layers[fold_idx][layer_idx] = best_N
            stored_ops = min_ops_layers[fold_idx][layer_idx]
            printf('{:8d}{:8d}{:8d}{:12.2f}{:8.3f}',
                   layer_idx + 1,
                   int(N_layers[fold_idx][layer_idx]),
                   fd_i,
                   stored_ops,
                   stored_ops / ops_spatial_layers[layer_idx],
                   type=None,
                   separator=None)
        fold_total = min_ops_layers[fold_idx].sum()
        printf("Total ops: {:12.2f}; ratio: {:5.3f}",
               fold_total,
               fold_total / ops_spatial_total,
               type=None,
               separator='><')

    totals_per_fold = np.sum(min_ops_layers, axis=1)
    idx_folding = totals_per_fold.argmin()
    return (N_layers[idx_folding], range_folding[idx_folding],
            totals_per_fold.min())
def log_streaming_data_SPN(N, p, input_data, output_data, resources):
    """
    Print the input and the output stream followed by the resources.

    N:           groups longer than N are split into pieces of N elements
                 separated by two newlines; a blank line is emitted once at
                 least N elements were printed since the previous one.
                 N >= 10 also widens the per-field format to two characters.
    p:           not used by this function.
    input_data, output_data: iterables of groups; every element of a group
                 is shown as 'first,second' of its two leading items.
    resources:   mapping printed as 'key: value' lines.
    """
    field = '{:2s}' if N // 10 else '{:1s}'
    pair_fmt = field + ',' + field

    def _pair_to_str(pair):
        return pair_fmt.format(str(pair[0]), str(pair[1]))

    vec_pair_to_str = np.vectorize(_pair_to_str)

    streams = (('0: INPUT', input_data), ('1: OUTPUT', output_data))
    emitted = 0
    for label, stream in streams:
        printf('{}', label)
        for group in stream:
            emitted += len(group)
            pieces = [' '.join(vec_pair_to_str(group[start:start + N]))
                      for start in range(0, len(group), N)]
            print(('\n' * 2).join(pieces) + ' ', end='')
            if emitted >= N:
                emitted = 0
                print('\n')

    for key in resources.keys():
        printf('{}: {}', key, resources[key])
for N in np.array([4,8,16]): if N <= l_kern: continue for stride in np.array([1]): op_spatial = op_count_spatial(f_in,f_out,l_img,l_kern,-1,stride) op_fft = op_count_fft(f_in,f_out,l_img,l_kern,N,-1) printf("({:4d},{:4d},{:4d},{:4d},{:4d},{:4d})-->spatial: {:8.0f}, fft: {:8.0f}-->ratio: {:.3f}", f_in,f_out,l_img,l_kern,N,stride,op_spatial,op_fft,op_fft/op_spatial,separator=None) """ # param list: [fin, fout, l_img, l_kern, N, stride, padding] layers = [[3, 96, 224, 11, 64, 4, 0], [96, 256, 55, 5, 64, 1, 2], [256, 384, 27, 3, 64, 1, 1], [384, 384, 13, 3, 64, 1, 1], [384, 256, 13, 3, 64, 1, 1]] folding = -1 min_tot_op_ratio = float("inf") min_folding = -1 printf("operation count is in unit of Mega") _FD_MAX_1D = 9 _FD_MIN_1D = 1 #### matplotlib #### axis_folding = np.array([]) axis_layers_op_spatial = np.array([]) axis_layers_op_oaa = np.array([]) axis_tot_op_spatial = np.array([]) axis_tot_op_oaa = np.array([]) #################### for fd in range(_FD_MIN_1D, _FD_MAX_1D + 1): #### matplotlib #### axis_folding = np.append(axis_folding, [fd], axis=0) #################### printf("folding factor 1D: {:4d}", fd) printf("l FFT spatial OaA Ratio ", type=None, separator='-')
fig.colorbar(surf, shrink=0.5, aspect=5) ax.set_xlabel('FFT size') ax.set_ylabel('folding') ax.set_zlabel('# ops') #plt.show() plt.savefig('plots/{}.png'.format(title)) Z[Z == 0.] = float('Inf') min_args = np.unravel_index(Z.argmin(), Z.shape) return X[min_args], Y[min_args] #return min_args def ceiling_inverse(): x = np.arange(4, 111) k = 1000 y = np.log(x) * x**2 * np.ceil(k / (x - 1)**2) plt.plot(x, y, 'o') plt.show() if __name__ == '__main__': #ceiling_inverse() layers = [[3, 96, 224, 11], [96, 256, 55, 5], [256, 384, 27, 3], [384, 384, 13, 3], [384, 384, 13, 3]] printf('optimal values:') printf(' layer N folding', type=None, separator='-') for i, l in enumerate(layers): N_i, fd_i = plot_fft_size_folding(*l, name='layer{}'.format(i)) printf('{:8d}{:8d}{:8d}', i, N_i, fd_i, type=None, separator=None)
def plot_fixed_len_FFT():
    """
    Plot, per convolution layer, the operation count (in Giga operations)
    of the spatial algorithm, of OaA with fixed FFT sizes 16/32/64/128 and
    of the variable-size ("FFT-hybd") choice, save the figure to
    'plots/algo_I.pdf' and print the totals with their ratio to spatial.

    Works on the module-level `layers` and overwrites element 4 of its
    entries (the value passed as FFT size to op.op_count_fft).
    The hybrid choice is hard-coded for the first five layers.
    """
    # FFT size per layer for the hybrid curve (layers 0..4).
    hybrid_fft_sizes = (32, 16, 32, 16, 16)

    bars = {16: [], 32: [], 64: [], 128: []}
    for FFT_fixed in list(bars.keys()):
        for layer in layers:
            layer[4] = FFT_fixed
            bars[FFT_fixed].append(op.op_count_fft(*layer) / 1e9)

    lines = {"spatial": [], "var_fft": []}
    for layer in layers:
        lines["spatial"].append(op.op_count_spatial(*layer) / 1e9)

    for layer_idx, fft_size in enumerate(hybrid_fft_sizes):
        layers[layer_idx][4] = fft_size
    for layer_idx in range(len(hybrid_fft_sizes)):
        lines["var_fft"].append(op.op_count_fft(*layers[layer_idx]) / 1e9)

    conv_layers = np.arange(len(layers)) + 1
    plt.figure(1)
    ax = plt.subplot(111)
    ax.set_aspect(0.6)
    ax.set_xlabel("Convolution layers", fontsize=16)
    ax.set_ylabel("Giga Operations", fontsize=16)
    ax.set_ylim([0, 4.8])
    ax.set_xlim([0.5, 5.5])
    # Shrink the axes horizontally to leave room for the legend.
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax.plot(conv_layers, lines["spatial"], '-o', label='Spatial')
    # Single-letter colours must be lowercase in current Matplotlib.
    ax.plot(conv_layers,
            lines["var_fft"],
            '--^',
            label='FFT-hybd',
            color='g',
            markersize=10,
            linewidth=2)

    cmap = plt.get_cmap("autumn")
    colors = [cmap(i) for i in np.linspace(0, 1, len(layers))]
    bar_width = 0.1
    for i, FFT_fixed in enumerate(sorted(bars.keys())):
        ax.bar(conv_layers + bar_width * (i - 2),
               bars[FFT_fixed],
               bar_width,
               color=colors[i],
               label='OaA-{}'.format(FFT_fixed),
               edgecolor="none")
    ax.legend(loc='center left',
              bbox_to_anchor=(1, 0.5),
              fancybox=True,
              shadow=True,
              ncol=1)
    plt.savefig("plots/algo_I.pdf", bbox_inches='tight')

    _op_spatial_tot = sum(lines['spatial'])
    for K in list(bars.keys()):
        printf("[FFT-{}]: {}, {}x", K, sum(bars[K]),
               _op_spatial_tot / sum(bars[K]))
    printf("[FFT-hybd]: {}, {}x", sum(lines["var_fft"]),
           _op_spatial_tot / sum(lines["var_fft"]))
def DSE_baseline(model_cnn, model_hw, param_algo, type='CaP'):
    """
    Brute-force architecture design space exploration.

    Every combination of the candidate ranges built below is checked
    against the memory and logic budgets (consumption_mem /
    consumption_alm); among the feasible ones the combination with the
    smallest target function value is kept.

    model_cnn:  CNN description, forwarded to the target function and to
                total_OPS.
    model_hw:   mapping providing 'byte_per_word', 'clk_rate', 'logic' and
                'memory'.
    param_algo: mapping whose 'fft' entry offers .max() and .min().
    type:       'CaP' selects target_function_CaP, anything else
                target_function_OaA.

    Returns (performance, params):
        performance: {'latency': ..., 'throughput': ...} derived from the
                     best target function value and 'clk_rate'
                     (presumably ms and GOPS - confirm against the caller).
        params:      the chosen value of every explored parameter.

    Raises ValueError when no combination fits the resource budgets.
    """
    import itertools

    if type == 'CaP':
        target_function = target_function_CaP
    else:
        target_function = target_function_OaA

    byte_per_word = model_hw['byte_per_word']
    clk_rate = model_hw['clk_rate']
    logic_max = model_hw['logic']
    memory_max = model_hw['memory'] / byte_per_word * 1e6
    memory_max_2 = model_hw["memory"] / byte_per_word / 2 * 1e6  # double buffer
    memory_min = memory_max_2 * 0.5
    memory_stride = (memory_max_2 - memory_min) / 3

    M_img_range = np.arange(memory_min, memory_max_2, memory_stride)
    P_mac_range = np.arange(1, 10)
    exp_mac_max = np.log(param_algo['fft'].max()) / np.log(2)
    q_mac_range = 2**np.arange(exp_mac_max)
    exp_fft2_max = exp_mac_max
    exp_fft2_min = np.log(
        param_algo['fft'].max() / param_algo['fft'].min()) / np.log(2)
    exp_fft1_max = exp_fft2_max - 2  # -2 because of radix-4
    exp_fft1_min = exp_fft2_min
    q_2dfft_range = 2**np.arange(exp_fft2_min, exp_fft2_max)
    q_2difft_range = 2**np.arange(exp_fft2_min, exp_fft2_max)
    q_1dfft_range = 2**np.arange(exp_fft1_min, exp_fft1_max)
    q_1difft_range = 2**np.arange(exp_fft1_min, exp_fft1_max)
    P_fft_range = np.array([1, 4])
    P_ifft_range = np.array([1, 4])

    prev_opt = float('Inf')
    opt = None
    for i_M_img in M_img_range:
        printf(i_M_img, type=None)
        # product() visits the combinations in the same order as nested
        # loops over the ranges (last range varies fastest).
        for rest in itertools.product(P_mac_range, q_mac_range,
                                      q_2dfft_range, q_2difft_range,
                                      q_1dfft_range, q_1difft_range,
                                      P_fft_range, P_ifft_range):
            config = (i_M_img,) + rest
            if consumption_mem(param_algo, *config) >= memory_max:
                continue
            if consumption_alm(param_algo, *config) >= logic_max:
                continue
            cur_performance = target_function(model_cnn, model_hw,
                                              param_algo, *config)
            if cur_performance >= prev_opt:
                continue
            prev_opt = cur_performance
            opt = list(config)

    if opt is None:
        raise ValueError(
            'no configuration satisfies the memory / logic constraints')

    printf("optimal configuration: ")
    printf("{}", opt, type=None)
    # Use the value of the optimum, not of the candidate evaluated last.
    latency = prev_opt / (clk_rate * 1e6)
    _latency = latency
    return {'latency':_latency*1e3,'throughput':total_OPS(model_cnn)/latency/1e9},\
           {'Memory':opt[0],'P_mac':opt[1],'q_mac':opt[2],'q_2dfft':opt[3],'q_2difft':opt[4],
            'q_1dfft':opt[5],'q_1difft':opt[6],'P_fft':opt[7],'P_ifft':opt[8]}