コード例 #1
0
ファイル: arch_engine.py プロジェクト: ZimpleX/CNN_FPGA
 def layer_design_space_exploration(self):
     """
     Run the per-layer design space exploration and store its outcome.

     For every layer index, ``self.layer_i_param`` is queried and the first
     entry of each returned parameter is recorded.  ``self.layer`` is then
     replaced by the collected table, ``T_img_i`` is reset to all ones and
     ``self.Nmax`` is set to the largest recorded ``Ni``.
     """
     self.network['P_fft'] = 1
     self.network['P_ifft'] = 1
     # Value of T_img_i does not matter.
     self.layer['T_img_i'] = np.array([1])
     chosen = {
         name: []
         for name in ('Ni', 'di', 'fin_pi', 'fout_pi', 'T_img_i')
     }
     for layer_idx in range(self.num_layers):
         best = self.layer_i_param(layer_idx)
         for name in self.layer.keys():
             chosen[name].append(best[name][0])
         printf('finish layer dse for {}', layer_idx)
         self.print_layer_conf(best, layer_idx)
     # Only the keys present in the current self.layer become arrays.
     for name in self.layer.keys():
         chosen[name] = np.array(chosen[name])
     self.layer = chosen
     self.layer['T_img_i'] = np.array([1] * self.num_layers)
     self.Nmax = self.layer['Ni'].max()
コード例 #2
0
def go(args):
    """
    Run the algorithm-level and architecture-level exploration and print
    a report.

    args must provide:
      - cnn:      path of the YAML file describing the CNN model
      - hardware: path of the YAML file describing the hardware
      - cap:      truthy selects the 'CaP' architecture, otherwise 'OaA'
    """
    # safe_load works on every PyYAML release (plain yaml.load needs an
    # explicit Loader since PyYAML 6) and never builds arbitrary Python
    # objects; the `with` blocks make sure both files are closed.
    with open(args.cnn) as f_cnn:
        model_cnn = yaml.safe_load(f_cnn)
    with open(args.hardware) as f_hw:
        model_hw = yaml.safe_load(f_hw)
    #########################
    # algo level optimization
    #########################
    # OPS: {'tool': [x1,x1], 'spatial': [y1,y2]}
    # param_algo: {'fft': [N1,N2], 'folding': [f1,f2]}
    OPS, param_algo = algo_dse.algo_dse(model_cnn,
                                        model_hw,
                                        options={
                                            'CaP': args.cap,
                                            'max_folding': 0,
                                            'var_fft': True
                                        })
    #########################
    # arch level optimization
    #########################
    # performance: {'latency': [l1,l2], 'throughput': [t1,t2]}
    arch_type = 'CaP' if args.cap else 'OaA'
    performance, param_arch, stat_resource = arch_dse.arch_dse(model_cnn,
                                                               model_hw,
                                                               param_algo,
                                                               type=arch_type)

    printf('algo params: {}', param_algo)
    printf('operation count: {:5.4f}%',
           OPS['tool'].sum() / OPS['spatial'].sum())
    printf('latency: {:5.2f}ms throughput: {:5.1f}GOPS',
           performance['latency'], performance['throughput'])
    printf('arch params: {}', param_arch)
    printf('utilization: {}', stat_resource)
コード例 #3
0
def _perm_to_int(perm):
    """
    convert perm (which can be a str or int) to int (understandable by os module)
    e.g.: perm='0444' if for read-only policy
    However, I won't process the first char for now
    """
    if type(perm) == type(0):
        return perm
    ERROR_PERM_FORMAT = 'format of perm is wrong!'
    try:
        assert len(perm) == 4
    except AssertionError:
        printf(ERROR_PERM_FORMAT, type='ERROR')
        exit()
    p_pos = ['','USR', 'GRP', 'OTH']   # don't care, owner, group, others
    p_ret = 0
    eval_str = 'stat.S_I{}{}'
    for n in range(1,4):
        p_int = int(perm[n])
        try:
            assert p_int <= 7 and p_int >= 0
        except AssertionError:
            printf(ERROR_PERM_FORMAT, type='ERROR')
            exit()
        if p_int >= 4:
            p_ret |= eval(eval_str.format('R', p_pos[n]))
        if p_int in [2,3,6,7]:
            p_ret |= eval(eval_str.format('W', p_pos[n]))
        if p_int%2 == 1:
            p_ret |= eval(eval_str.format('X', p_pos[n]))
    return p_ret
コード例 #4
0
ファイル: auto_tool.py プロジェクト: ZimpleX/CNN_FPGA
def compare_algo(cnn_dir, hw_config, tool_config):
    """
    Compare the OaA, CaP and spatial operation counts of several CNN models.

    cnn_dir: the directory holding the cnn model files (*.yaml)
    hw_config: the hw config yaml file
    tool_config: the tool config yaml file

    The per-model counts are printed and the stacked results are saved as
    '<cnn_dir>/OaA_complexity.npy', '<cnn_dir>/CaP_complexity.npy' and
    '<cnn_dir>/SPA_complexity.npy'.
    """

    def _stack(acc, new):
        # first model: take the array as is; afterwards append to it
        if acc is None:
            return new
        return np.concatenate([acc, new])

    cnn_directory = os.fsencode(cnn_dir)
    decoded_names = (os.fsencode(entry).decode('utf-8')
                     for entry in os.listdir(cnn_directory))
    file_list = sorted(name for name in decoded_names
                       if name.endswith('.yaml'))
    num_cnn = len(file_list)

    oaa_count = cap_count = spa_count = None
    for cnn_f in file_list:
        cnn_filename = os.fsencode(cnn_f).decode('utf-8')
        model_path = '{}/{}'.format(cnn_directory.decode('utf-8'),
                                    cnn_filename)
        params_cnn, params_algo, _1, _2, _3 = parse_input(
            model_path, hw_config, tool_config)

        spatial_cfg = copy.deepcopy(params_algo)
        spatial_cfg['d max'] = 1
        spatial_cfg['name'] = 'Spatial'
        ae_spa = algo_engine.spatial_complexity(params_cnn, spatial_cfg)

        ae_cap = algo_engine.fft_complexity(params_cnn, params_algo)

        oaa_cfg = copy.deepcopy(params_algo)
        oaa_cfg['d max'] = 1
        oaa_cfg['name'] = oaa_cfg['name'].replace('CaP', 'OaA')
        ae_oaa = algo_engine.fft_complexity(params_cnn, oaa_cfg)

        ae_spa.count()
        ae_cap.count(global_d_N=True)
        ae_oaa.count()
        printf(ae_cap.str_compare_algo(ae_oaa, ae_spa),
               type=None,
               separator='=')
        printf('{} d value: {}', cnn_filename,
               ae_cap.chosen_params_algo['batch fold'])

        oaa_count = _stack(oaa_count, ae_oaa.ops_count)
        cap_count = _stack(cap_count, ae_cap.ops_count)
        spa_count = _stack(spa_count, ae_spa.ops_count)

    # one column per model; reshape everything before the first save
    oaa_count = oaa_count.reshape(num_cnn, -1).T
    cap_count = cap_count.reshape(num_cnn, -1).T
    spa_count = spa_count.reshape(num_cnn, -1).T
    to_file = '{}/{}_complexity.npy'
    for tag, table in (('OaA', oaa_count), ('CaP', cap_count),
                       ('SPA', spa_count)):
        np.save(to_file.format(cnn_dir, tag), table)
コード例 #5
0
def _mat_shape_check(*mat_l):
    """
    check if mat is 2D and its shape is square (N x N)
    """
    try:
        for mat in mat_l:
            assert len(mat.shape) == 2
            assert mat.shape[0] == mat.shape[1]
    except AssertionError:
        printf("Currently only support 2D square matrix, get {}", mat.shape,type="ERROR")
コード例 #6
0
def overlap_add(kern, base, N):
    """
    Convolve base with kern through N-point 2D FFTs, using the overlap and
    add method.

    Suppose:
        kern        K x K
        base        B x B
        base_window W x W
        ret_mat     R x R
    For each window of base, do the N-point FFT:
      - W + K - 1 = N
      - kern and the window are handed to mat_padding with size N
        (presumably zero padding to N x N -- confirm against mat_padding)
      - the two N x N spectra are multiplied and transformed back
      - the resulting N x N tile is added at the window's offset

    Returns the real part of the top-left R x R region of the accumulated
    result, with R = B + K - 1.

    Note:
      - Currently support only square shaped kernel & base.
      - Requires K <= N.
    """
    _mat_shape_check(kern, base)
    K = kern.shape[0]
    B = base.shape[0]
    stride = 1
    padding = K - 1
    R = int((B - K + 2*padding)/stride + 1)
    assert K <= N

    W = N + 1 - K           # window edge length
    R_prime = ceil(B/W)*W + K - 1     # accumulator size after padding
    ret_mat = np.zeros((R_prime, R_prime),dtype=complex)
    kern_fft = np.fft.fft2(mat_padding(kern,N))
    for i in range(0,B,W):
        for j in range(0,B,W):
            base_win_fft = np.fft.fft2(mat_padding(base[i:i+W,j:j+W],N))
            # The largest offset is (ceil(B/W)-1)*W, so offset + N never
            # exceeds R_prime: the tile always fits and can be accumulated
            # in place, without a full-size temporary per tile.
            ret_mat[i:i+N,j:j+N] += np.fft.ifft2(kern_fft*base_win_fft)
    # sanity check: the result is expected to have no imaginary part
    assert not np.any(np.around(ret_mat.imag,decimals=10))
    return ret_mat[0:R,0:R].real
コード例 #7
0
def compare_CaP_OaA():
    """
    Print, for every entry of the module-level `layers`, the hybd1 (FFT with
    folding_1D=1), CaP and spatial operation counts, followed by their sums.

    Side effect: element 4 of every layer is overwritten with 16 before the
    CaP evaluation, so the module-level `layers` is modified.
    """
    printf(
        "  hybd1 complexity | CaP complexity | folding | spatial complexity",
        type=None)
    total_hybd = 0
    total_cap = 0
    total_spatial = 0
    for layer in layers:
        spatial_ops = op.op_count_spatial(*layer)
        hybd_ops = op.op_count_fft(*layer, folding_1D=1)
        layer[4] = 16
        cap_ops, folding_opt = complexity_CaP(layer)
        total_hybd += hybd_ops
        total_cap += cap_ops
        total_spatial += spatial_ops
        printf("{} | {} | {} | {}",
               hybd_ops / 1e9,
               cap_ops / 1e9,
               folding_opt,
               spatial_ops / 1e9,
               type=None)
    printf("sum: hybd1 vs. CaP vs. spatial")
    hybd_ratio = total_hybd / total_spatial
    cap_ratio = total_cap / total_spatial
    printf("{} ({}) {} ({}) {}", total_hybd / 1e9, hybd_ratio,
           total_cap / 1e9, cap_ratio, total_spatial)
コード例 #8
0
def mkdir_r(dir_r):
    """
    Create dir_r together with any missing parent directories.

    Both 'a/b/c' and 'a/b/c/' end up creating the directories a, b and c.
    A message is printed for the directory created by each call.

    WARNING:
    there is no explicit error checking, e.g. if a regular file (not a
    directory) named 'a/b' already exists, this function will fail.
    """
    # an empty dirname means "current directory"
    parent = os.path.dirname(dir_r) or '.'
    if not os.path.exists(parent):
        mkdir_r(parent)
    if os.path.exists(dir_r):
        return
    os.mkdir(dir_r)
    printf("created dir: {}", dir_r, separator=None)
コード例 #9
0
ファイル: arch_engine.py プロジェクト: ZimpleX/CNN_FPGA
 def print_layer_conf(self, _opt_layer_li_param, li):
     """
     Print the configuration chosen for layer `li` as a small table.

     The table shows the first entry of 'Ni', 'di', 'fin_pi', 'fout_pi' and
     'T_img_i' from _opt_layer_li_param; the number of columns in the row
     templates follows len(_opt_layer_li_param).
     """
     columns = ('Ni', 'di', 'fin_pi', 'fout_pi', 'T_img_i')
     num_param = len(_opt_layer_li_param)
     header_fmt = '  '.join(['{:>10s}'] * num_param)
     value_fmt = '  '.join(['{:>10d}'] * num_param)
     title = stringf('LAYER {} CONF', li, type=None, separator='.')
     header = header_fmt.format(*columns)
     rule = '-' * (10 * num_param + 2 * (num_param - 1))
     values = value_fmt.format(
         *(_opt_layer_li_param[col][0] for col in columns))
     table = title + '\n' + header + '\n' + rule + '\n' + values + '\n'
     printf(table, type=None, separator='*')
コード例 #10
0
def fft(N, ip, int_bits, tot_bits, format_='h'):
    """
    Compute N-point FFTs over fixed-point encoded input and return the
    results re-encoded as strings.

    ip is flattened and truncated to a multiple of N (at least N values are
    required), decoded with fpt_to_decimal and split into rows of N values.
    Each row is printed and transformed with np.fft.fft.  The returned array
    has the same shape as the rows and holds '<real> <imag>' strings, both
    parts encoded by decimal_to_fpt with the given format_; its dtype keeps
    at most 16 characters per entry.
    """
    samples = list(np.array(ip).flatten())
    usable = len(samples) // N * N
    samples = samples[0:usable]
    assert len(samples) >= N
    # NOTE(review): the input is always decoded with format_='h', regardless
    # of the format_ argument -- confirm this is intended.
    decoded = [
        fpt_to_decimal(int_bits, tot_bits, sample, format_='h')
        for sample in samples
    ]
    rows = np.array(decoded).reshape(-1, N)
    op = np.ndarray(shape=rows.shape, dtype=np.complex64)
    op_str = np.ndarray(shape=rows.shape, dtype=(np.str_, 16))
    for row_idx, row in enumerate(rows):
        printf(row)
        op[row_idx] = np.fft.fft(row)
    for row_idx, spectrum in enumerate(op):
        encoded = []
        for value in spectrum:
            real_part = decimal_to_fpt(int_bits, tot_bits, value.real,
                                       format_=format_)
            imag_part = decimal_to_fpt(int_bits, tot_bits, value.imag,
                                       format_=format_)
            encoded.append('{} {}'.format(real_part, imag_part))
        op_str[row_idx] = np.array(encoded)
    return op_str
コード例 #11
0
ファイル: SPN.py プロジェクト: pgroup-usc/CNN-Generator
def log_streaming_data_SPN(N, p, input_data, output_data, resources):
    """
    Print the input stream, then the output stream, then the resources.

    Every element of a stream is rendered as 'a,b' from its first two
    entries; the field width is 2 when N // 10 is non-zero and 1 otherwise.
    A line break is emitted each time the running element count reaches
    exactly N.  The parameter p is not used.
    """
    field = '{:2s}' if N // 10 else '{:1s}'
    pair_fmt = field + ',' + field

    def tostr_tuple(e):
        return pair_fmt.format(str(e[0]), str(e[1]))

    v_tostr_tuple = np.vectorize(tostr_tuple)
    streams = (('0: INPUT', input_data), ('1: OUTPUT', output_data))
    emitted = 0  # running count; not reset between the two streams
    for label, stream in streams:
        printf('{}', label)
        for tup in stream:
            emitted += len(tup)
            print('  '.join(v_tostr_tuple(tup)) + ' ', end='')
            if emitted == N:
                emitted = 0
                print('\n')
    for k in resources.keys():
        printf('{}: {}', k, resources[k])
コード例 #12
0
def overlap_add_1D(kern,base,N):
    """
    1D convolution of base with kern through N-point FFTs (overlap and add).

    kern and base are 1D numpy arrays of length K and B, assumed to be
    real-valued (only the real part of each inverse FFT is kept).  base is
    processed in windows of W = N + 1 - K samples; each window yields an
    N-sample tile that is added at the window's offset.

    Both the reference result (np.convolve) and the FFT based result are
    printed for comparison.  The FFT based result (length K + B - 1) is also
    returned.

    Raises ValueError if the kernel is longer than N.
    """
    K = kern.shape[0]
    B = base.shape[0]
    if K > N:
        raise ValueError('kernel length {} exceeds FFT size {}'.format(K, N))
    R = K + B - 1
    W = N + 1 - K
    # np.fft.fft(x, N) zero-pads x to N points, so a short last window
    # needs no special handling.
    kern_fft = np.fft.fft(kern,N)
    # Accumulate into a buffer large enough for the last full tile, then
    # cut it back to R; writing N samples directly into an R-long buffer
    # overruns it whenever B is not a multiple of W.
    padded_len = -(-B // W)*W + K - 1
    acc = np.zeros(padded_len)
    for i in range(0,B,W):
        base_win_fft = np.fft.fft(base[i:i+W],N)
        acc[i:i+N] += np.fft.ifft(kern_fft*base_win_fft).real
    ret = acc[:R]
    # debug
    printf("normal conv:\n{}", np.convolve(kern,base))
    printf("fft conv:\n{}", ret)
    return ret
コード例 #13
0
def explore_fix_folding(layers, range_N=None, range_folding=None, name=''):
    """
    Print the full statistics for each fixed folding factor and return the
    best one.

    range_N defaults to [16, 32] and range_folding to 1..30.  For every
    folding factor, core_fft_size_folding is evaluated on each layer and a
    table is printed, with ratios taken against the spatial operation count
    (scaled by 1e6).  The parameter `name` is not used.

    Returns (per-layer N of the best folding factor, the best folding
    factor, its total operation count), where "best" minimises the
    operation count summed over all layers.
    """
    if range_N is None:
        range_N = [16, 32]
    else:
        range_N = np.array(range_N)
    if range_folding is None:
        range_folding = np.arange(1, 30 + 1)
    else:
        range_folding = np.array(range_folding)

    table_shape = (len(range_folding), len(layers))
    min_ops_layers = np.zeros(table_shape)
    ops_spatial_layers = np.zeros(len(layers))
    for layer_idx, layer in enumerate(layers):
        head, tail = layer[0:4], layer[4:6]
        ops_spatial_layers[layer_idx] = op.op_count_spatial(
            *head, None, *tail) / 1e6
    ops_spatial_total = ops_spatial_layers.sum()

    N_layers = np.zeros(table_shape)
    for fd_idx, fd in enumerate(range_folding):
        printf('optimal values (FFT,folding={}):', fd)
        printf('   layer       N folding      MinOps   ratio',
               type=None,
               separator='-')
        for layer_idx, layer in enumerate(layers):
            best_ops, best_N, fd_i = core_fft_size_folding(*(layer[0:4]),
                                                           range_N=range_N,
                                                           range_folding=fd,
                                                           name='')
            min_ops_layers[fd_idx][layer_idx] = best_ops
            N_layers[fd_idx][layer_idx] = best_N
            printf('{:8d}{:8d}{:8d}{:12.2f}{:8.3f}',
                   layer_idx + 1,
                   int(N_layers[fd_idx][layer_idx]),
                   fd_i,
                   min_ops_layers[fd_idx][layer_idx],
                   min_ops_layers[fd_idx][layer_idx]
                   / ops_spatial_layers[layer_idx],
                   type=None,
                   separator=None)
        min_ops_sum = min_ops_layers[fd_idx].sum()
        printf("Total ops: {:12.2f}; ratio: {:5.3f}",
               min_ops_sum,
               min_ops_sum / ops_spatial_total,
               type=None,
               separator='><')

    totals = np.sum(min_ops_layers, axis=1)
    idx_folding = totals.argmin()
    return N_layers[idx_folding], range_folding[idx_folding], totals.min()
コード例 #14
0
def log_streaming_data_SPN(N, p, input_data, output_data, resources):
    """
    Print the input stream, then the output stream, then the resources.

    Every element of a stream is rendered as 'a,b' from its first two
    entries; the field width is 2 when N // 10 is non-zero and 1 otherwise.
    An entry longer than N elements is printed in chunks of N separated by
    a blank line.  A line break is emitted each time the running element
    count reaches N or more.  The parameter p is not used.
    """
    field = '{:2s}' if N // 10 else '{:1s}'
    pair_fmt = field + ',' + field

    def tostr_tuple(e):
        return pair_fmt.format(str(e[0]), str(e[1]))

    v_tostr_tuple = np.vectorize(tostr_tuple)
    streams = (('0: INPUT', input_data), ('1: OUTPUT', output_data))
    emitted = 0  # running count; not reset between the two streams
    for label, stream in streams:
        printf('{}', label)
        for tup in stream:
            emitted += len(tup)
            chunks = [
                '  '.join(v_tostr_tuple(tup[start:start + N]))
                for start in range(0, len(tup), N)
            ]
            print(('\n' * 2).join(chunks) + ' ', end='')
            if emitted >= N:
                emitted = 0
                print('\n')
    for k in resources.keys():
        printf('{}: {}', k, resources[k])
コード例 #15
0
					for N in np.array([4,8,16]):
						if N <= l_kern: continue
						for stride in np.array([1]):
							op_spatial = op_count_spatial(f_in,f_out,l_img,l_kern,-1,stride)
							op_fft = op_count_fft(f_in,f_out,l_img,l_kern,N,-1)
							printf("({:4d},{:4d},{:4d},{:4d},{:4d},{:4d})-->spatial: {:8.0f}, fft: {:8.0f}-->ratio: {:.3f}", 
								f_in,f_out,l_img,l_kern,N,stride,op_spatial,op_fft,op_fft/op_spatial,separator=None)
	"""
    # param list: [fin, fout, l_img, l_kern, N, stride, padding]
    layers = [[3, 96, 224, 11, 64, 4, 0], [96, 256, 55, 5, 64, 1, 2],
              [256, 384, 27, 3, 64, 1, 1], [384, 384, 13, 3, 64, 1, 1],
              [384, 256, 13, 3, 64, 1, 1]]
    folding = -1
    min_tot_op_ratio = float("inf")
    min_folding = -1
    printf("operation count is in unit of Mega")
    _FD_MAX_1D = 9
    _FD_MIN_1D = 1
    #### matplotlib ####
    axis_folding = np.array([])
    axis_layers_op_spatial = np.array([])
    axis_layers_op_oaa = np.array([])
    axis_tot_op_spatial = np.array([])
    axis_tot_op_oaa = np.array([])
    ####################
    for fd in range(_FD_MIN_1D, _FD_MAX_1D + 1):
        #### matplotlib ####
        axis_folding = np.append(axis_folding, [fd], axis=0)
        ####################
        printf("folding factor 1D: {:4d}", fd)
        printf("l   FFT spatial OaA     Ratio ", type=None, separator='-')
コード例 #16
0
    fig.colorbar(surf, shrink=0.5, aspect=5)

    ax.set_xlabel('FFT size')
    ax.set_ylabel('folding')
    ax.set_zlabel('# ops')
    #plt.show()
    plt.savefig('plots/{}.png'.format(title))
    Z[Z == 0.] = float('Inf')
    min_args = np.unravel_index(Z.argmin(), Z.shape)
    return X[min_args], Y[min_args]
    #return min_args


def ceiling_inverse():
    """
    Plot log(x) * x**2 * ceil(k / (x - 1)**2) for the integers x = 4..110
    with k = 1000, as individual points, and show the figure.
    """
    k = 1000
    x = np.arange(4, 111)
    tile_count = np.ceil(k / (x - 1)**2)
    y = np.log(x) * x**2 * tile_count
    plt.plot(x, y, 'o')
    plt.show()


if __name__ == '__main__':
    # Script entry: report the chosen FFT size and folding for each layer.
    #ceiling_inverse()
    # One entry per layer; each entry is unpacked positionally into
    # plot_fft_size_folding.
    # NOTE(review): the four values look like (fin, fout, l_img, l_kern) --
    # confirm against the signature of plot_fft_size_folding.
    layers = [[3, 96, 224, 11], [96, 256, 55, 5], [256, 384, 27, 3],
              [384, 384, 13, 3], [384, 384, 13, 3]]
    printf('optimal values:')
    printf('   layer       N folding', type=None, separator='-')
    for i, l in enumerate(layers):
        # the returned pair is reported as (N, folding) for layer i
        N_i, fd_i = plot_fft_size_folding(*l, name='layer{}'.format(i))
        printf('{:8d}{:8d}{:8d}', i, N_i, fd_i, type=None, separator=None)
コード例 #17
0
def plot_fixed_len_FFT():
    """
    Plot the per-layer operation count (in giga operations) of fixed-size
    FFT, variable-size FFT and spatial convolution, and print the totals.

    Uses the module-level `layers`; element 4 of each layer is overwritten
    several times while evaluating (first with every fixed FFT size, then
    with the per-layer sizes of the hybrid scheme), so `layers` is modified.
    The figure is saved to "plots/algo_I.pdf".
    """
    bars = {16: [], 32: [], 64: [], 128: []}
    for FFT_fixed in bars:
        for layer in layers:
            layer[4] = FFT_fixed
            bars[FFT_fixed].append(op.op_count_fft(*layer) / 1e9)

    lines = {"spatial": [], "var_fft": [], "native_fft": []}
    for layer in layers:
        lines["spatial"].append(op.op_count_spatial(*layer) / 1e9)
    # FFT size picked per layer for the hybrid scheme
    # (layers 1-2: OaA, layers 3-5: native)
    hybrid_fft_sizes = (32, 16, 32, 16, 16)
    for layer_idx, fft_size in enumerate(hybrid_fft_sizes):
        layers[layer_idx][4] = fft_size
    for layer_idx in range(len(hybrid_fft_sizes)):
        lines["var_fft"].append(op.op_count_fft(*layers[layer_idx]) / 1e9)

    conv_layers = np.arange(len(layers)) + 1
    plt.figure(1)
    ax = plt.subplot(111)
    ax.set_aspect(0.6)
    ax.set_xlabel("Convolution layers", fontsize=16)
    ax.set_ylabel("Giga Operations", fontsize=16)
    ax.set_ylim([0, 4.8])
    ax.set_xlim([0.5, 5.5])

    # shrink the axes horizontally to leave room for the legend
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    ax.plot(conv_layers, lines["spatial"], '-o', label='Spatial')
    # lowercase 'g': uppercase single-letter colours are rejected by
    # current Matplotlib releases
    ax.plot(conv_layers,
            lines["var_fft"],
            '--^',
            label='FFT-hybd',
            color='g',
            markersize=10,
            linewidth=2)

    cmap = plt.get_cmap("autumn")
    colors = [cmap(i) for i in np.linspace(0, 1, len(layers))]
    bar_width = 0.1
    for i, FFT_fixed in enumerate(sorted(bars)):
        ax.bar(conv_layers + bar_width * (i - 2),
               bars[FFT_fixed],
               bar_width,
               color=colors[i],
               label='OaA-{}'.format(FFT_fixed),
               edgecolor="none")

    ax.legend(loc='center left',
              bbox_to_anchor=(1, 0.5),
              fancybox=True,
              shadow=True,
              ncol=1)
    plt.savefig("plots/algo_I.pdf", bbox_inches='tight')

    _op_spatial_tot = sum(lines['spatial'])
    for K in bars:
        printf("[FFT-{}]: {}, {}x", K, sum(bars[K]),
               _op_spatial_tot / sum(bars[K]))
    printf("[FFT-hybd]: {}, {}x", sum(lines["var_fft"]),
           _op_spatial_tot / sum(lines["var_fft"]))
コード例 #18
0
def DSE_baseline(model_cnn, model_hw, param_algo, type='CaP'):
    """
    Exhaustively search the architecture parameters for the configuration
    with the smallest value of the target function.

    model_cnn:  CNN description, forwarded to the target function and to
                total_OPS
    model_hw:   dict providing 'byte_per_word', 'clk_rate', 'logic' and
                'memory'
    param_algo: algorithm parameters; param_algo['fft'] must offer .max()
                and .min()
    type:       'CaP' selects target_function_CaP, anything else
                target_function_OaA

    Candidates whose memory or logic consumption reaches the hardware limit
    are skipped.

    Returns two dicts: {'latency', 'throughput'} of the best configuration
    and the best configuration itself ('Memory', 'P_mac', 'q_mac',
    'q_2dfft', 'q_2difft', 'q_1dfft', 'q_1difft', 'P_fft', 'P_ifft').

    Raises RuntimeError if no candidate satisfies the resource limits.
    """
    import itertools  # local import: used only to flatten the search loops

    if type == 'CaP':
        target_function = target_function_CaP
    else:
        target_function = target_function_OaA
    byte_per_word = model_hw['byte_per_word']
    clk_rate = model_hw['clk_rate']
    logic_max = model_hw['logic']
    memory_max = model_hw['memory'] / byte_per_word * 1e6
    memory_max_2 = model_hw["memory"] / byte_per_word / 2 * 1e6  # double buffer
    memory_min = memory_max_2 * 0.5
    memory_stride = (memory_max_2 - memory_min) / 3
    M_img_range = np.arange(memory_min, memory_max_2, memory_stride)
    # problematic for P_mac_range
    P_mac_range = np.arange(1, 10)
    exp_mac_max = np.log(param_algo['fft'].max()) / np.log(2)
    q_mac_range = 2**np.arange(exp_mac_max)  # N/2, N/4, N/8, N/16
    exp_fft2_max = exp_mac_max
    exp_fft2_min = np.log(
        param_algo['fft'].max() / param_algo['fft'].min()) / np.log(2)
    exp_fft1_max = exp_fft2_max - 2  # -2 because of radix-4
    exp_fft1_min = exp_fft2_min
    q_2dfft_range = 2**np.arange(exp_fft2_min, exp_fft2_max)
    q_2difft_range = 2**np.arange(exp_fft2_min, exp_fft2_max)
    q_1dfft_range = 2**np.arange(exp_fft1_min, exp_fft1_max)
    q_1difft_range = 2**np.arange(exp_fft1_min, exp_fft1_max)
    P_fft_range = np.array([1, 4])
    P_ifft_range = np.array([1, 4])

    # Same visiting order as eight nested loops: the last range varies
    # fastest.
    inner_ranges = (P_mac_range, q_mac_range, q_2dfft_range, q_2difft_range,
                    q_1dfft_range, q_1difft_range, P_fft_range, P_ifft_range)

    prev_opt = float('Inf')
    opt = []
    for i_M_img in M_img_range:
        printf(i_M_img, type=None)
        for candidate in itertools.product(*inner_ranges):
            mem = consumption_mem(param_algo, i_M_img, *candidate)
            if mem >= memory_max:
                continue
            alm = consumption_alm(param_algo, i_M_img, *candidate)
            if alm >= logic_max:
                continue
            cur_performance = target_function(model_cnn, model_hw,
                                              param_algo, i_M_img,
                                              *candidate)
            if cur_performance >= prev_opt:
                continue
            prev_opt = cur_performance
            opt = [i_M_img, *candidate]
    if not opt:
        raise RuntimeError(
            'no configuration satisfies the memory and logic limits')
    printf("optimal configuration: ")
    printf("{}", opt, type=None)
    # Use the best value found (prev_opt), not the value of whichever
    # candidate happened to be evaluated last.
    latency = prev_opt / (clk_rate * 1e6)
    performance = {
        'latency': latency * 1e3,
        'throughput': total_OPS(model_cnn) / latency / 1e9,
    }
    param_names = ('Memory', 'P_mac', 'q_mac', 'q_2dfft', 'q_2difft',
                   'q_1dfft', 'q_1difft', 'P_fft', 'P_ifft')
    return performance, dict(zip(param_names, opt))