def FFT_recursive(a_input_i: Array, *, N, Wn, output_dtype):
    """Build a recursive FFT structure over the elements of ``a_input_i``.

    Follows the recursive FFT from Cormen et al., "Introduction to
    Algorithms", 3rd ed., page 911:
    https://web.iiit.ac.in/~pratik.kamble/storage/Algorithms/Cormen_Algorithms_3rd.pdf

    Args:
        a_input_i: indexable input; the recursion stops once
            ``len(a_input_i.dtype)`` is 1.
        N: number of elements of ``a_input_i`` handled at this level.
        Wn: indexable collection of twiddle factors, sampled with a stride
            of ``len(Wn) // N``.
        output_dtype: forwarded unchanged to the recursive calls.

    Returns:
        ``a_input_i[:]`` for a single-element input, otherwise the ``N``
        ``butterfly_sum`` results concatenated and piped to ``Array``.
    """
    if len(a_input_i.dtype) == 1:
        # TODO: when the issue of passed input interface to output is fixed: remove [:]
        return a_input_i[:]

    # Bucket 0 collects even-indexed elements, bucket 1 the odd-indexed ones;
    # the input is still indexed in ascending order.
    buckets = ([], [])
    for idx in range(N):
        buckets[idx % 2].append(a_input_i[idx])
    evens, odds = buckets

    half = N // 2
    y0_i = FFT_recursive(ccat(*evens) | Array,
                         N=half,
                         Wn=Wn,
                         output_dtype=output_dtype)
    y1_i = FFT_recursive(ccat(*odds) | Array,
                         N=half,
                         Wn=Wn,
                         output_dtype=output_dtype)

    # Combine the two half-size results; both halves are reused for the
    # upper half of the outputs through the modulo index.
    combined = []
    for k in range(N):
        Wk = Wn[k * (len(Wn) // N)]
        combined.append(butterfly_sum(y0_i[k % half], y1_i[k % half], Wn=Wk))
    return ccat(*combined) | Array
def hopper(hopper_cfg):
    """Compose two chained ``rng`` generators from ``hopper_cfg``.

    The first ``rng`` is configured from ``hopper_cfg[0][0]``, the second from
    ``hopper_cfg[0][1]`` after its configuration has been passed through
    ``cart_sync_with`` the first generator's output.

    Returns:
        ``cart(hop_y, hop_x)``.
    """
    # Configuration triple (0, hopper_cfg[0][0], 1).
    # presumably (start, end, step) for rng -- TODO confirm
    cfg_hop_y = ccat(0, hopper_cfg[0][0], 1)
    hop_y = cfg_hop_y | rng

    # The x configuration is synchronised with hop_y before it drives rng.
    # presumably so that the x range is re-issued for every y value -- TODO confirm
    cfg_hop_x = ccat(0, hopper_cfg[0][1], 1)
    cfg_hop_x = cfg_hop_x | cart_sync_with(hop_y)
    hop_x = cfg_hop_x | rng

    dout = cart(hop_y, hop_x)
    return dout
def test_queue_3(cosim_cls, din_delay, dout_delay):
    """Verify ``ccat`` of three Queue inputs against a reference ``ccat``.

    Args:
        cosim_cls: passed as ``sim_cls`` to the ``ccat`` instance under test.
        din_delay: indexable with at least three entries; entry ``i`` is used
            for both arguments of ``delay_rng`` on input ``i``.
        dout_delay: used for both arguments of ``delay_rng`` on the output.
    """
    # (data width, nested sequence) for each input, in port order.
    stimuli = (
        (2, [[0, 1], [2, 3]]),
        (3, [[4, 5], [6, 7]]),
        (8, [[8, 9], [10, 11]]),
    )

    inputs = []
    for port, (width, seq) in enumerate(stimuli):
        source = drv(t=Queue[Uint[width]], seq=seq)
        inputs.append(source | delay_rng(din_delay[port], din_delay[port]))

    verif(*inputs,
          f=ccat(sim_cls=cosim_cls),
          ref=ccat(name='ref_model'),
          delays=[delay_rng(dout_delay, dout_delay)])
    sim()
def weighted_sum(din: Queue[Tuple[Uint['w_ii'], Uint[1], Int['w_weight']], 1]):
    """Sign-select, weight and accumulate the data carried by ``din``.

    Every element of ``din`` is a ``(value, select, weight)`` tuple. The value
    is made available both negated and as-is, ``select`` picks one of the two
    through ``mux_valve``, the pick is multiplied by ``weight`` and the
    products are fed to ``accum``.

    Returns:
        ``last_data`` applied to the accumulator output after its data has
        been recast to an ``Int`` of the same bit length.
    """
    # Negated copy of the value; the plain copy is cast to the negated dtype
    # so that both mux inputs share one type.
    data_neg = din[0][0] | neg
    data = din[0][0] | data_neg.dtype
    # din[0][1] selects between (data_neg, data).
    # presumably 0 -> negated, 1 -> plain, following ccat order -- TODO confirm
    signed_data = mux_valve(din[0][1], ccat(data_neg, data)) | union_collapse
    signed_data = signed_data * din[0][2]
    # Re-attach the second (eot) field of din and recast as a level-1 Queue.
    weighted_data = ccat(signed_data, din[1])
    weighted_data = weighted_data | Queue[weighted_data.dtype[0], 1]
    summed_data = weighted_data | accum(add_num=4)
    # Recast the accumulated data as Int with an unchanged bit length before
    # handing it to last_data.
    summed_data = summed_data | Queue[Int[len(summed_data.dtype[0])], 1] | last_data
    return summed_data
def leaf_vals(feat_addr: Queue[Uint['w_addr_feat'], 2], din: Uint[1], *, casc_hw):
    """Read both leaf values for a feature and pass on the one ``din`` selects.

    Args:
        feat_addr: Queue whose data field addresses both leaf-value ROMs.
        din: 1-bit selector fed to ``mux_valve``.
        casc_hw: configuration object providing ``leaf_vals_mem`` (indexed
            with 0 and 1 for the two ROM contents) and ``w_leaf_vals`` (width
            of the ``Int`` ROM data).

    Returns:
        The ``union_collapse``-d output of ``mux_valve``.
    """
    # Both ROMs are read with the same address: the data field of feat_addr.
    leaf0 = rom(feat_addr[0],
                data=casc_hw.leaf_vals_mem[0],
                dtype=Int[casc_hw.w_leaf_vals])
    leaf1 = rom(feat_addr[0],
                data=casc_hw.leaf_vals_mem[1],
                dtype=Int[casc_hw.w_leaf_vals])
    # Selector and both candidates are bundled into one interface first.
    # presumably to keep the three streams aligned -- TODO confirm
    sync = ccat(din, leaf0, leaf1)
    dout = mux_valve(sync[0], ccat(sync[1], sync[2])) | union_collapse
    return dout
def matrix_multiplication(cfg, mat1, mat2, *, cols_per_row):
    """Multiply ``mat1`` by ``mat2`` with several column multipliers in parallel.

    The multiplication is parallelized by multiplying one row with several
    columns at the same time; ``cols_per_row`` is the number of columns that
    are multiplied with one row simultaneously. ``column_multiplication``
    multiplies one row with one column at a time and can also store several
    columns. ``mat2`` is therefore first split by columns and dealt to the
    different ``column_multiplication`` instances to be stored, after which
    every row is sent to each of those instances.
    """
    col_chunks = qdeal(mat2, num=cols_per_row, lvl=1)
    row_chunks = (row_dispatch(mat1, cols_per_row=cols_per_row)
                  | dreg
                  | decouple(latency=2)
                  | dispatch)

    # Wrap a non-tuple result so that both sides can be zipped uniformly.
    col_chunks = col_chunks if isinstance(col_chunks, tuple) else (col_chunks, )
    row_chunks = row_chunks if isinstance(row_chunks, tuple) else (row_chunks, )

    products = []
    for col, row in zip(col_chunks, row_chunks):
        # After qdeal every col is a level-1 Queue with eot == True, so it is
        # flattened first and then regrouped by cols_per_multiplier, which
        # sets eot (last) after the last column that goes to a specific
        # column multiplier.
        col = col | flatten | group(size=cfg['cols_per_multiplier'])
        products.append(column_multiplication(cfg, row, col) | flatten)

    res = ccat(*products) | Array
    return res
def multi_filter(pixels: Queue[Array[Uint[8], 3]], coef: Queue[Array[Fixp, 3], 2], *, window_num, filt_num):
    """Run several ``filter`` instances on the same ``pixels`` input.

    Args:
        pixels: pixel input shared by every filter instance.
        coef: coefficient input, split with ``qdeal(num=filt_num)``.
        window_num: forwarded to every ``filter`` call.
        filt_num: number of outputs requested from ``qdeal``.

    Returns:
        ``ccat`` of all filter outputs.
    """
    filt_coef = coef | qdeal(num=filt_num)
    # One filter per coefficient output, all fed from the same pixels.
    # NOTE(review): ``filter`` is presumably a project gear that shadows the
    # builtin of the same name -- confirm against the module imports.
    res = [filter(pixels, c, window_num=window_num) for c in filt_coef]
    return ccat(*res)
def boundaries(scale_counter: Queue[Uint['w_scale'], 1], *, casc_hw):
    """Select the y/x boundary constants for the current scale.

    Args:
        scale_counter: level-1 Queue; its data field drives both
            ``mux_valve`` selectors and its second field is re-attached to
            the result.
        casc_hw: configuration object providing ``boundary_y``,
            ``boundary_x`` (iterables of numbers) and ``w_boundary``.

    Returns:
        A level-1 Queue carrying the ``(boundary_y, boundary_x)`` pair.
    """
    # Every configured boundary is stored incremented by one.
    y_limits = [Uint[casc_hw.w_boundary](val + 1) for val in casc_hw.boundary_y]
    x_limits = [Uint[casc_hw.w_boundary](val + 1) for val in casc_hw.boundary_x]

    boundary_y = mux_valve(scale_counter[0],
                           ccat(*y_limits)) | union_collapse
    boundary_x = mux_valve(scale_counter[0],
                           ccat(*x_limits)) | union_collapse

    boundary = ccat(boundary_y, boundary_x)
    boundary = ccat(boundary, scale_counter[1]) | Queue[boundary.dtype, 1]
    return boundary
def ii_gen(din: Queue[Uint['w_din'], 2], *, frame_size=(25, 25)):
    """Accumulate ``din`` and add a FIFO-delayed copy of the running result.

    presumably "ii" stands for integral image -- TODO confirm

    Args:
        din: level-2 Queue of unsigned data.
        frame_size: pair of frame dimensions. Their product is passed to
            ``accum_wrap`` as ``add_num``; ``frame_size[1]`` sets the FIFO
            preload count and, through ``bitw``, the FIFO depth.

    Returns:
        The adder output, recast as a level-2 Queue and registered with
        ``dreg_sp``.
    """
    fifo_depth = 2**bitw(frame_size[1])
    accum_s = din | dreg_sp | accum_wrap(add_num=frame_size[0] * frame_size[1])
    # Placeholder interface for the feedback path; it is connected below
    # with ``|=`` once the FIFO input exists.
    fifo_out = Intf(accum_s.dtype[0])
    add_s = ccat(accum_s[0], fifo_out) | add
    fifo_in = ccat(add_s, accum_s[1]) | Queue[add_s.dtype, 2]
    # Close the loop: the adder result goes through decouple, flatten and a
    # FIFO preloaded with frame_size[1] entries back into the adder.
    # presumably this delays the data by one row -- TODO confirm
    fifo_out |= fifo_in | decouple | flatten | fifo2(
        depth=fifo_depth, preload=frame_size[1], regout=False)
    ii_s = fifo_in
    return ii_s | dreg_sp
def sii_gen(din: Queue[Uint['w_din'], 2], *, frame_size=(25, 25)):
    """Square the data of ``din`` and feed the result through ``ii_gen``.

    Args:
        din: level-2 Queue of unsigned data.
        frame_size: forwarded unchanged to ``ii_gen``.

    Returns:
        The output of ``ii_gen`` applied to the squared data.
    """
    din = din | dreg_sp
    # Multiply the data field by itself.
    mult_s = din[0] * din[0]
    # Re-attach the second (eot) field of din so the squares form a level-2
    # Queue again.
    sii_in = ccat(mult_s, din[1]) | Queue[mult_s.dtype, 2]
    sii_s = sii_in | ii_gen(frame_size=frame_size)
    return sii_s
def get_leaf_num(din: Tuple[Int['w_sum'], Int['w_thr'], Uint['w_stddev']]):
    """Compare a sum against a threshold scaled by the standard deviation.

    Args:
        din: ``(sum, threshold, stddev)`` tuple.

    Returns:
        The output of ``lt`` applied to the pair ``(sum, stddev * threshold)``.
        presumably true when the sum is the smaller one -- TODO confirm
    """
    din = din | dreg_sp
    # Scale the threshold (field 1) by stddev (field 2).
    thresh_norm = din[2] * din[1]
    # Recast the product as Int of unchanged bit length, then register it.
    thresh_norm = thresh_norm | Int[len(thresh_norm.dtype)] | dreg_sp
    # The sum gets its own dreg_sp stage before being paired with the
    # registered threshold.
    dout = lt(ccat(din[0] | dreg_sp, thresh_norm))
    return dout
def addr_trans(din: Queue[Tuple[Uint['w_y'], Uint['w_x']], 4], *, img_size):
    """Translate ``(y, x)`` coordinates into a linear address.

    The address is ``x + y * img_size[1]`` cast to an unsigned type that is
    ``ceil(log2(img_size[0] * img_size[1]))`` bits wide.

    Args:
        din: level-4 Queue of ``(y, x)`` tuples.
        img_size: pair of image dimensions; their product is the RAM size
            and ``img_size[1]`` is the multiplier applied to ``y``.

    Returns:
        A level-4 Queue carrying the linear address together with the second
        (eot) field of ``din``.

    Raises:
        ValueError: if the RAM size is smaller than 1 (the float logarithm
            used previously raised the same exception type for sizes <= 0).
    """
    ram_size = img_size[0] * img_size[1]
    if ram_size < 1:
        raise ValueError(
            f"img_size must describe at least one element, got {img_size!r}")
    # ceil(log2(ram_size)) in exact integer arithmetic. The float form
    # math.log(n, 2) can land just above an integer for exact powers of two,
    # which would make the address one bit too wide.
    w_addr = (math.ceil(ram_size) - 1).bit_length()
    addr_abs = din[0][1] + din[0][0] * img_size[1] | Uint[w_addr]
    return ccat(addr_abs, din[1]) | Queue[addr_abs.dtype, 4]
def cordic_first_stage(i_xval, i_yval, i_phase, *, iw, ww, pw):
    """Select one of eight pre-rotations of ``(x, y, phase)`` from the phase bits.

    Three bits of ``i_phase`` (indices ``pw - 3`` .. ``pw - 1``) form a 3-bit
    selector. For each output, ``field_sel`` picks one entry of an
    eight-entry table:

    ======  ======  ======  ==============================
    entry   x       y       phase
    ======  ======  ======  ==============================
    0, 7    x       y       unchanged
    1, 2    -y      x       minus ``2**pw // 4``
    3, 4    -x      -y      minus ``2**pw // 2``
    5, 6    y       -x      minus ``2**pw // 2 + 2**pw // 4``
    ======  ======  ======  ==============================

    Args:
        i_xval, i_yval: input coordinates.
        i_phase: input phase.
        iw: input width used to size the zero padding (``ww - iw - 1`` bits).
        ww: width of the ``Int`` type the extended coordinates are brought to.
        pw: width of the ``Uint`` phase type.

    Returns:
        ``ccat(xv_0, yv_0, ph_0)`` registered with ``dreg``.
    """
    # Phase candidates: input phase reduced by 1/4, 1/2 and 3/4 of 2**pw.
    pv_0_mux_1 = (i_phase - Uint[pw](2**pw // 4)) >> Uint[pw]
    pv_0_mux_2 = (i_phase - Uint[pw](2**pw // 2)) >> Uint[pw]
    pv_0_mux_3 = (i_phase - Uint[pw]((2**pw // 2) + (2**pw // 4))) >> Uint[pw]

    # Widen the inputs to ww bits: zero padding, the value itself and a copy
    # of its last bit.
    # presumably zeros are the low bits and the copied bit is a sign
    # extension -- TODO confirm ccat field order
    e_xval = ccat(Uint[ww - iw - 1](0), i_xval, i_xval[-1]) >> Int[ww]
    e_yval = ccat(Uint[ww - iw - 1](0), i_yval, i_yval[-1]) >> Int[ww]

    # Unary minus binds tighter than ``>>``: negate first, then convert.
    n_e_xval = -e_xval >> Int[ww]
    n_e_yval = -e_yval >> Int[ww]

    # 3-bit table selector built from phase bits pw-3, pw-2 and pw-1.
    phase_ctrl = ccat(i_phase[pw - 3], i_phase[pw - 2], i_phase[pw - 1]) >> Uint[3]

    xv_0 = field_sel(
        phase_ctrl,
        ccat(e_xval, n_e_yval, n_e_yval, n_e_xval, n_e_xval, e_yval, e_yval,
             e_xval))
    yv_0 = field_sel(
        phase_ctrl,
        ccat(e_yval, e_xval, e_xval, n_e_yval, n_e_yval, n_e_xval, n_e_xval,
             e_yval))
    ph_0 = field_sel(
        phase_ctrl,
        ccat(i_phase, pv_0_mux_1, pv_0_mux_1, pv_0_mux_2, pv_0_mux_2,
             pv_0_mux_3, pv_0_mux_3, i_phase))

    return ccat(xv_0, yv_0, ph_0) | dreg
def frame_buffer(din: Queue[Uint['w_din'], 1],
                 rd_addr: Queue[Array[Tuple[Uint['w_rect'], Uint[1], Int['w_weight']], 3], 3],
                 rst_in: Unit,
                 *,
                 frame_size):
    """Store a frame and serve three reads from it at once.

    ``din`` is written to consecutive addresses produced by ``rng``. The
    same write stream feeds three ``sdp`` memories, each read with the
    address field of one of the three ``rd_addr`` array elements. The
    remaining two fields of each element are taken from a registered copy of
    the read request and re-attached to the matching read data.

    Args:
        din: level-1 Queue of data to store.
        rd_addr: level-3 Queue of 3-element arrays of
            ``(address, flag, weight)`` tuples.
        rst_in: piped into ``local_rst``.
        frame_size: pair of frame dimensions; their product is the memory
            depth.

    Returns:
        A level-3 Queue of 3-element arrays of ``(read data, flag, weight)``,
        passed through ``decouple_sp``.

    Raises:
        ValueError: if the memory depth is smaller than 1 (the float
            logarithm used previously raised the same exception type for
            sizes <= 0).
    """
    ##########Parameters###################
    ram_size = frame_size[0] * frame_size[1]
    if ram_size < 1:
        raise ValueError(
            f"frame_size must describe at least one element, got {frame_size!r}")
    # ceil(log2(ram_size)) in exact integer arithmetic; math.log(n, 2) is a
    # float quotient and can land just above an integer for powers of two.
    w_addr = (math.ceil(ram_size) - 1).bit_length()
    #######################################

    rst_in | local_rst

    din_i, rd_addr_sdp = alternate_queues(din, rd_addr)
    # Registered copy of the read request; supplies the pass-through fields.
    # presumably delayed to line up with the sdp read data -- TODO confirm
    rd_addr_sdp_dreg = rd_addr_sdp | dreg

    # NOTE(review): ram_size itself is stored in Uint[w_addr]; when ram_size
    # is an exact power of two it does not fit in w_addr bits -- confirm this
    # is intended.
    cfg_rng = ccat(0, Uint[w_addr](ram_size), 1)
    wr_addr = cfg_rng | rng
    wr_sdp = ccat(wr_addr[0], din_i[0])

    # Three memories written identically, each read at its own address.
    rd_data0 = sdp(wr_sdp, rd_addr_sdp[0][0][0], depth=ram_size)
    rd_data1 = sdp(wr_sdp, rd_addr_sdp[0][1][0], depth=ram_size)
    rd_data2 = sdp(wr_sdp, rd_addr_sdp[0][2][0], depth=ram_size)

    # Re-attach fields 1 and 2 of each request element to its read data.
    rd_data0 = ccat(rd_data0, rd_addr_sdp_dreg[0][0][1],
                    rd_addr_sdp_dreg[0][0][2])
    rd_data1 = ccat(rd_data1, rd_addr_sdp_dreg[0][1][1],
                    rd_addr_sdp_dreg[0][1][2])
    rd_data2 = ccat(rd_data2, rd_addr_sdp_dreg[0][2][1],
                    rd_addr_sdp_dreg[0][2][2])

    rd_data = ccat(rd_data0, rd_data1, rd_data2) | Array[rd_data0.dtype, 3]
    dout = ccat(rd_data, rd_addr_sdp_dreg[1]) | Queue[rd_data.dtype, 3]
    return dout | decouple_sp
def scale_ratio(scale_counter: Queue[Uint['w_scale'], 1], *, casc_hw):
    """Select the y/x ratio constants for the current scale.

    Args:
        scale_counter: level-1 Queue; its data field drives both
            ``mux_valve`` selectors and its second field is re-attached to
            the result.
        casc_hw: configuration object providing ``y_ratio``, ``x_ratio``
            (iterables of numbers) and ``w_ratio``.

    Returns:
        A level-1 Queue carrying the ``(y_ratio, x_ratio)`` pair.
    """
    y_consts = [Uint[casc_hw.w_ratio](val) for val in casc_hw.y_ratio]
    x_consts = [Uint[casc_hw.w_ratio](val) for val in casc_hw.x_ratio]

    y_ratio = mux_valve(scale_counter[0],
                        ccat(*y_consts)) | union_collapse
    x_ratio = mux_valve(scale_counter[0],
                        ccat(*x_consts)) | union_collapse

    ratio = ccat(y_ratio, x_ratio)
    ratio = ccat(ratio, scale_counter[1]) | Queue[ratio.dtype, 1]
    return ratio
def classifier(fb_data: Queue[Array[Tuple[Uint['w_ii'], Uint[1], Int['w_weight']], 3], 3],
               feat_addr: Queue[Uint['w_addr_feat'], 2],
               stage_addr: Queue[Uint['w_stage_addr'], 1],
               stddev: Uint['w_stddev'],
               rst_in: Unit,
               *,
               w_ii=b'w_ii',
               w_weight=b'w_weight',
               casc_hw):
    """Evaluate features on frame-buffer data and produce a per-stage result.

    Wiring, in order:

    1. ``rect_sum`` reduces ``fb_data``.
    2. A ROM addressed by ``feat_addr`` supplies the feature threshold.
    3. ``get_leaf_num`` turns ``(rect sum, threshold, stddev)`` into a
       selector, which ``leaf_vals`` uses to pick a leaf value.
    4. Leaf values are accumulated with ``accum_on_eot`` and the total goes
       to ``get_stage_res``.

    Args:
        fb_data: level-3 Queue of 3-element arrays from the frame buffer.
        feat_addr: level-2 Queue of feature addresses.
        stage_addr: level-1 Queue of stage addresses.
        stddev: standard-deviation input.
        rst_in: piped into ``local_rst``.
        w_ii, w_weight: width parameters forwarded to ``rect_sum``.
        casc_hw: configuration object providing ``feature_threshold_mem`` and
            ``w_feature_threshold``; it is also forwarded to ``leaf_vals``
            and ``get_stage_res``.

    Returns:
        A level-1 Queue holding the registered stage result together with
        the second (eot) field of ``stage_addr``.
    """
    rst_in | local_rst

    stage_addr = stage_addr | dreg
    stddev = stddev | dreg
    # fb_data = fb_data | dreg_sp
    feat_addr = feat_addr | dreg | dreg_sp

    # presumably repeats stddev up to 5000 times -- TODO confirm the
    # argument order expected by replicate
    stddev_repl = replicate(ccat(5000, stddev))
    stddev_repl = stddev_repl[0]

    rect_sum_s = fb_data | rect_sum(w_ii=w_ii, w_weight=w_weight)
    feature_threshold = rom(feat_addr[0],
                            data=casc_hw.feature_threshold_mem,
                            dtype=Int[casc_hw.w_feature_threshold])

    # rect_sum_s is wrapped as a level-1 Queue with a constant 0 second field
    # purely so that it can serve as the cart_sync_with partner.
    stddev_repl = stddev_repl | cart_sync_with(
        ccat(rect_sum_s, 0) | Queue[rect_sum_s.dtype, 1])

    res = ccat(rect_sum_s, feature_threshold, stddev_repl)
    leaf_num = res | get_leaf_num | dreg_sp
    leaf_val = leaf_vals(feat_addr=feat_addr, din=leaf_num, casc_hw=casc_hw)

    # Element 0 of the feat_addr eot field delimits the accumulation.
    stage_eot = feat_addr[1][0] | dreg_sp
    leaf_val = ccat(leaf_val, stage_eot) | Queue[leaf_val.dtype, 1]
    accum_stage = leaf_val | accum_on_eot(add_num=256)

    stage_res = accum_stage | get_stage_res(stage_addr=stage_addr,
                                            casc_hw=casc_hw)
    stage_res = ccat(stage_res | dreg_sp, stage_addr[1]) | Queue[stage_res.dtype, 1]
    return stage_res
def img_ram(din: Queue[Uint['w_data'], 1], rd_addr: Queue[Uint['w_addr'], 4], *, img_size=(240, 320)):
    """Store an image in an ``sdp`` memory and read it back by address.

    Args:
        din: level-1 Queue of data, written to consecutive addresses
            produced by ``rng``.
        rd_addr: level-4 Queue of read addresses.
        img_size: pair of image dimensions; their product is the memory
            depth.

    Returns:
        The read data recast as a level-2 Queue and registered with
        ``dreg_sp``.
    """
    ##########Parameters###################
    ram_size = img_size[0] * img_size[1]
    #######################################

    # Write-address configuration (0, ram_size, 1).
    # presumably (start, end, step) for rng -- TODO confirm
    cfg_rng = ccat(0, ram_size, 1)
    wr_addr = cfg_rng | rng

    # From here on ``din`` refers to the first output of alternate_queues,
    # not to the function argument.
    din, rd_addr_sdp = alternate_queues(din, rd_addr)
    wr_sdp = ccat(wr_addr[0], din[0])
    rd_data = sdp(wr_sdp, rd_addr_sdp[0], depth=ram_size)

    # NOTE(review): the second field is taken from the original ``rd_addr``
    # (annotated as a level-4 Queue), not from ``rd_addr_sdp``, and the
    # result is cast to a level-2 Queue -- confirm both are intended.
    dout = ccat(rd_data, rd_addr[1] | dreg) | Queue[rd_data.dtype, 2]
    return dout | dreg_sp
def get_stage_res(stage_addr: Queue[Uint['w_stage_addr'], 1], din: Int['w_din'], *, casc_hw):
    """Compare ``din`` against the threshold stored for the addressed stage.

    Args:
        stage_addr: level-1 Queue whose data field addresses the threshold
            ROM.
        din: value to compare.
        casc_hw: configuration object providing ``stage_threshold_mem`` and
            ``w_stage_threshold``.

    Returns:
        ``lt(threshold, din)``.
        presumably true when the threshold is below ``din`` -- TODO confirm
    """
    stage_threshold = rom(stage_addr[0],
                          data=casc_hw.stage_threshold_mem,
                          dtype=Int[casc_hw.w_stage_threshold])
    # Value and threshold are bundled first and then taken apart again.
    # presumably to keep the two streams aligned -- TODO confirm
    sync = ccat(din, stage_threshold)
    # Operand order: threshold (field 1) first, din (field 0) second.
    dout = lt(sync[1], sync[0])
    return dout
def rects_mem(rd_addr_if: Uint['w_addr'], *, inst_num, casc_hw):
    """Read one rectangle and its weight from ROM and emit its coordinates.

    The rectangle word is ``casc_hw.w_rect_data`` bits wide and is split
    into two fields of ``w_rect // 2`` bits and one field of ``w_rect`` bits,
    where ``w_rect = casc_hw.w_rect_data // 2``. ``calc_rect_coords`` expands
    the split word and each result is paired with the weight read from the
    same address.

    Args:
        rd_addr_if: address applied to both the rectangle and weight ROMs.
        inst_num: index into ``casc_hw.rects_mem`` and
            ``casc_hw.weights_mem``.
        casc_hw: configuration object providing ``w_rect_data``,
            ``rects_mem``, ``weights_mem`` and ``w_weight``; it is also
            forwarded to ``calc_rect_coords``.

    Returns:
        A level-1 Queue of ``(coordinate, flag, weight)`` tuples.
    """
    w_rect = casc_hw.w_rect_data // 2
    # Integer division keeps the field widths integers; ``w_rect / 2`` would
    # hand a float to Uint[...].
    # NOTE(review): the three widths only add up to w_rect_data when w_rect
    # is even -- confirm the configuration guarantees that.
    w_meas = w_rect // 2

    rect_tuple = rom(
        rd_addr_if,
        data=casc_hw.rects_mem[inst_num],
        dtype=Uint[casc_hw.w_rect_data]) | \
        Tuple[Uint[w_meas], Uint[w_meas], Uint[w_rect]]
    rect_coords = rect_tuple | calc_rect_coords(casc_hw=casc_hw)

    weight = rom(rd_addr_if,
                 data=casc_hw.weights_mem[inst_num],
                 dtype=Int[casc_hw.w_weight])

    # Only the dtype of this interface is used, as the output element type.
    data_t = Intf(Tuple[Uint[w_rect], Uint[1], Int[casc_hw.w_weight]])

    cart_sync = cart(rect_coords, weight)
    # Flatten ((coordinate, flag), weight) into one three-field tuple.
    tuple_rect = ccat(cart_sync[0][0], cart_sync[0][1]) | data_t.dtype
    dout = ccat(tuple_rect, cart_sync[1]) | Queue[data_t.dtype, 1]
    return dout
def FFT_list(din, *, index_lists, Wn, output_dtype):
    """Build an FFT as a sequence of butterfly stages described by index tables.

    Args:
        din: indexable input of the first stage.
        index_lists: one entry per stage; entry ``[0]`` and ``[1]`` hold the
            indices of the two butterfly operands in the previous stage's
            output and entry ``[2]`` the index into ``Wn``.
        Wn: indexable collection of twiddle factors.
        output_dtype: type passed to ``format_fixp`` for the final output.

    Returns:
        The last stage's outputs concatenated, piped to ``Array`` and
        formatted through a nested ``arraymap`` of ``format_fixp``.
    """
    # Every stage has 2 ** (number of stages) butterflies.
    butterflies = 2**len(index_lists)

    for stage_cfg in index_lists:
        stage_output = [
            butterfly_sum(din[stage_cfg[0][ix]],
                          din[stage_cfg[1][ix]],
                          Wn=Wn[stage_cfg[2][ix]])
            for ix in range(butterflies)
        ]
        # The next stage reads from this stage's outputs.
        din = stage_output

    return ccat(*stage_output) | Array | arraymap(f=arraymap(f=format_fixp(
        t=output_dtype)))
def sweeper(hop: Queue[Tuple[Uint['w_y'], Uint['w_x']], 2],
            scale_ratio: Queue[Tuple[Uint['w_ratio'], Uint['w_ratio']], 1],
            *,
            frame_size):
    """Generate scaled ``(y, x)`` sweep coordinates for every hop position.

    For each ``hop`` element a y range starting at the hop's y value and an
    x range starting at its x value are generated with ``rng``. Each counter
    value is multiplied by the matching ``scale_ratio`` field and shifted
    right by 16.

    Args:
        hop: level-2 Queue of ``(y, x)`` start positions.
        scale_ratio: level-1 Queue of ``(y ratio, x ratio)`` pairs.
        frame_size: pair used as the second field of the y and x range
            configurations respectively.

    Returns:
        A level-4 Queue of scaled ``(y, x)`` pairs, passed through
        ``decouple_sp``.
    """
    scale_ratio = scale_ratio | cart_sync_with(hop)

    # y sweep: range configuration (hop y, frame_size[0], 1).
    cfg_sweep_y = ccat(hop[0][0], frame_size[0], 1)
    sweep_y = cfg_sweep_y | rng(cnt_steps=True)
    ratio_y = scale_ratio | cart_sync_with(sweep_y)
    # Scale and cast back to the counter's own data type.
    # presumably the ratio is a fixed-point value with 16 fractional bits --
    # TODO confirm
    scaled_y = ((sweep_y[0] * ratio_y[0][0]) >> 16) | sweep_y.dtype[0]
    sweep_y = ccat(scaled_y, sweep_y[1]) | Queue[sweep_y.dtype[0], 1]

    # x sweep: same construction, synchronised with the y sweep.
    cfg_sweep_x = ccat(hop[0][1], frame_size[1], 1) \
        | cart_sync_with(sweep_y)
    sweep_x = cfg_sweep_x | rng(cnt_steps=True)
    ratio_x = ratio_y | cart_sync_with(sweep_x)
    scaled_x = (
        (sweep_x[0] * ratio_x[0][1]) >> 16) | sweep_x.dtype[0] | decouple_sp
    sweep_x = ccat(scaled_x, sweep_x[1] | decouple_sp) | Queue[sweep_x.dtype[0], 1]

    dout = cart(sweep_y | decouple_sp, sweep_x)
    dout = cart(hop | flatten, dout)
    # Append the ratio's eot field to the cart eot, giving 4 bits in total.
    dout_eot = ccat(dout[1], ratio_x[1] | decouple_sp) | Uint[4]
    # Only field 1 of the cart data is kept; the hop value itself is dropped.
    dout = ccat(dout[0][1], dout_eot) | Queue[dout.dtype[0][1], 4]
    return dout | decouple_sp
def test_directed(
        cosim_cls,
        wr0_delay,
        rd0_delay,
        wr1_delay,
        rd1_delay,
        dout_delay,
        depth,
):
    """Directed test of ``tdp`` against a reference ``tdp`` instance.

    Port 0 receives ``depth`` writes of data 0 followed by ``depth`` writes
    of ``addr * 2``, combined through ``priority_mux`` with ``depth`` reads.
    Port 1 receives ``depth`` reads wrapped into the request union type.

    Args:
        cosim_cls: passed as ``sim_cls`` to the instance under test.
        wr0_delay: delay yielded for every port-0 write after the first
            ``depth`` ones, which get delay 0.
        rd0_delay, rd1_delay: upper arguments of ``delay_rng`` for the read
            requests of port 0 and port 1.
        wr1_delay: accepted but not used by this test.
        dout_delay: upper argument of ``delay_rng`` for the first output.
        depth: memory depth and number of requests of each kind.
    """
    addr_width = 3
    data_width = 8
    write_t = TWrReq[addr_width, Uint[data_width]]
    read_t = Uint[addr_width]
    port_req_t = Union[read_t, write_t]

    def write_delays():
        """Yield 0 for the first ``depth`` writes, then ``wr0_delay`` forever."""
        for _ in range(depth):
            yield 0

        while True:
            yield wr0_delay

    data_writes = [(addr, addr * 2) for addr in range(depth)]
    init_writes = [(addr, 0) for addr in range(depth)]
    port0_reads = list(range(depth))
    port1_reads = list(range(depth))

    wr0_req = (drv(t=write_t, seq=init_writes + data_writes)
               | delay_gen(f=write_delays()))

    rd0_req = (drv(t=Uint[addr_width], seq=port0_reads)
               | delay_gen(f=iter([depth]))
               | delay_rng(0, rd0_delay))

    req0 = priority_mux(rd0_req, wr0_req)

    # Port 1 only reads: the address stream is cast to the union's data type
    # and tagged with a constant False control field.
    rd1_req = (drv(t=Uint[addr_width], seq=port1_reads)
               | port_req_t.data
               | delay_gen(f=iter([depth]))
               | delay_rng(0, rd1_delay))
    req1 = ccat(rd1_req, Bool(False)) | port_req_t

    verif(req0,
          req1,
          f=tdp(name='dut', sim_cls=cosim_cls, depth=depth),
          ref=tdp(depth=depth),
          delays=[delay_rng(0, dout_delay),
                  delay_rng(0, 0)])

    sim()
def feature_addr(stage_counter: Queue[Uint['w_stage_addr'], 1], rst_in: Unit, *, casc_hw):
    """Generate the feature addresses that belong to each stage.

    A ROM addressed by the stage counter returns a word that is split into
    two halves, unpacked as ``(cnt_end, cnt_start)``. The halves configure
    an ``rng`` that counts through the stage's features.

    Args:
        stage_counter: level-1 Queue of stage addresses.
        rst_in: piped into ``local_rst``.
        casc_hw: configuration object providing ``features_stage_count_mem``
            and ``w_features_stage_count``.

    Returns:
        A level-2 Queue of feature counter values, registered with ``dreg``.
    """
    rst_in | local_rst

    # One integer half-width shared by the split and by the output type.
    # The split used a float (``/ 2``) while the output type used
    # ``int(... / 2)``; both now use the same integer.
    w_cnt = casc_hw.w_features_stage_count // 2

    feature_num_in_stage = stage_counter[0] | rom(
        data=casc_hw.features_stage_count_mem,
        dtype=Uint[casc_hw.w_features_stage_count])

    cnt_end, cnt_start = feature_num_in_stage | Tuple[Uint[w_cnt], Uint[w_cnt]]

    feature_cnt = ccat(cnt_start, cnt_end, 1) | rng

    stage_counter = stage_counter | cart_sync_with(feature_cnt)

    # Two eot bits: the rng's own, followed by the stage counter's.
    dout_eot = ccat(feature_cnt[1], stage_counter[1]) | Uint[2]
    feature_cnt = ccat(feature_cnt[0], dout_eot) | Queue[Uint[w_cnt], 2]

    return feature_cnt | dreg
def test_uint_3(cosim_cls, din_delay, dout_delay):
    """Check ``ccat`` of three ``Uint`` inputs against the expected tuples.

    Args:
        cosim_cls: passed as ``sim_cls`` to the ``ccat`` instance under test.
        din_delay: indexable with at least three entries; entry ``i`` is used
            for both arguments of ``delay_rng`` on input ``i``.
        dout_delay: used for both arguments of ``delay_rng`` on the output.
    """
    # (data width, sequence) for each input, in port order.
    stimuli = (
        (2, [0, 1, 2, 3]),
        (3, [4, 5, 6, 7]),
        (8, [8, 9, 10, 11]),
    )

    inputs = []
    for port, (width, seq) in enumerate(stimuli):
        source = drv(t=Uint[width], seq=seq)
        inputs.append(source | delay_rng(din_delay[port], din_delay[port]))

    directed(*inputs,
             f=ccat(sim_cls=cosim_cls),
             ref=[(0, 4, 8), (1, 5, 9), (2, 6, 10), (3, 7, 11)],
             delays=[delay_rng(dout_delay, dout_delay)])
    sim()
def calc_rect_coords(din: Tuple[Uint['w_meas'], Uint['w_meas'], Uint['w_rect']], *, w_meas=b'w_meas', w_rect=b'w_rect', casc_hw):
    """Expand ``(height, width, A)`` into four serialized values with flags.

    With ``A = din[2]`` the other three values are:

    * ``B = A + width``
    * ``D = B + height * casc_hw.frame_size[1]``
    * ``C = D - width``

    They are emitted one at a time in the order A, B, C, D, paired with the
    1-bit flags 1, 0, 0, 1.

    Args:
        din: ``(height, width, A)`` tuple.
        w_meas: width parameter taken from the ``din`` annotation; it is not
            referenced in the body.
        w_rect: width of the ``Uint`` type every value is cast to.
        casc_hw: configuration object providing ``frame_size``.

    Returns:
        A level-1 Queue of ``(value, flag)`` tuples.
    """
    width = din[1]
    height = din[0]
    A = din[2]
    B = (A + width) | Uint[w_rect] | dreg_sp
    # ``*`` binds tighter than ``|``: the product is what gets registered.
    tmp = height * casc_hw.frame_size[1] | dreg_sp
    D = (B + tmp) | Uint[w_rect]
    C = (D - (width | dreg_sp)) | Uint[w_rect]

    # One flag per value, serialized in the same order as the values.
    # presumably selects the sign each value gets in a later sum -- TODO confirm
    sign = ccat(1, 0, 0, 1) | Array[Uint[1], 4] | serialize_plain
    rect_coord = ccat(A | dreg_sp, B, C, D) | Array[Uint[w_rect], 4]
    rect_coord = rect_coord | serialize_plain

    # Data comes from the value stream and the flag stream; the eot field is
    # taken from the value stream.
    return ccat(rect_coord[0], sign[0], rect_coord[1]) | \
        Queue[Tuple[Uint[w_rect], Uint[1]], 1]
def features_mem(rd_addr: Queue[Uint['w_addr'], 2], rst_in: Unit, *, casc_hw):
    """Read the three rectangles of a feature and emit them as one array.

    Three ``rects_mem`` instances (``inst_num`` 0, 1 and 2) are read with
    the same address, zipped together and combined with the eot field of
    ``rd_addr``.

    Args:
        rd_addr: level-2 Queue of read addresses.
        rst_in: piped into ``local_rst``.
        casc_hw: configuration object providing ``w_rect_data`` and
            ``w_weight``; it is also forwarded to ``rects_mem``.

    Returns:
        A level-3 Queue of 3-element arrays of
        ``(Uint[w_rect], Uint[1], Int[w_weight])`` tuples.
    """
    w_rect = casc_hw.w_rect_data // 2

    rst_in | local_rst

    rd_addr = rd_addr | decouple_sp

    features_data = []
    for i in range(3):
        feature = rects_mem(rd_addr_if=rd_addr[0], inst_num=i, casc_hw=casc_hw)
        features_data.append(feature | decouple_sp)

    # Only the dtype of this interface is used, as the array element type.
    feature_data_t = Intf(Tuple[Uint[w_rect], Uint[1], Int[casc_hw.w_weight]])
    features_zip = czip(*features_data) | Queue[Array[feature_data_t.dtype, 3], 1]

    sync = cart(rd_addr[1] | dreg, features_zip)

    # Three eot bits: the cart's own eot followed by the rd_addr eot that
    # travelled through the cart as data.
    dout_eot = ccat(sync[1], sync[0][0]) | Uint[3]
    dout = ccat(sync[0][1], dout_eot) | Queue[Array[feature_data_t.dtype, 3], 3]
    return dout
def ph_neg(data):
    """Apply one rotation step with the phase increased by ``cordic_angle``.

    ``i``, ``ww``, ``pw`` and ``cordic_angle`` are not parameters; they are
    read from the enclosing scope, which is not visible in this block.

    Args:
        data: ``(xv, yv, ph)`` triple.

    Returns:
        ``ccat`` of ``xv + (yv >> (i + 1))``, ``yv - (xv >> (i + 1))`` and
        ``ph + cordic_angle``, converted to ``Int[ww]``, ``Int[ww]`` and
        ``Uint[pw]`` respectively.
    """
    xv, yv, ph = data
    if i + 1 < ww:
        xv_shift = (xv >> (i + 1))
        yv_shift = (yv >> (i + 1))
    else:
        # A shift of ww bits or more is replaced by a constant zero.
        xv_shift = Uint[1](0)
        yv_shift = Uint[1](0)
    xv_neg = (xv + yv_shift) >> Int[ww]
    yv_neg = (yv - xv_shift) >> Int[ww]
    # This local shadows the function's own name inside the body only.
    ph_neg = (ph + cordic_angle) >> Uint[pw]
    return ccat(xv_neg, yv_neg, ph_neg)
def ph_pos(data):
    """Apply one rotation step with the phase decreased by ``cordic_angle``.

    ``i``, ``ww``, ``pw`` and ``cordic_angle`` are not parameters; they are
    read from the enclosing scope, which is not visible in this block.

    Args:
        data: ``(xv, yv, ph)`` triple.

    Returns:
        ``ccat`` of ``xv - (yv >> (i + 1))``, ``yv + (xv >> (i + 1))`` and
        ``ph - cordic_angle``, converted to ``Int[ww]``, ``Int[ww]`` and
        ``Uint[pw]`` respectively.
    """
    xv, yv, ph = data
    if i + 1 < ww:
        xv_shift = (xv >> (i + 1))
        yv_shift = (yv >> (i + 1))
    else:
        # A shift of ww bits or more is replaced by a constant zero.
        xv_shift = Uint[1](0)
        yv_shift = Uint[1](0)
    xv_pos = (xv - yv_shift) >> Int[ww]
    yv_pos = (yv + xv_shift) >> Int[ww]
    # This local shadows the function's own name inside the body only.
    ph_pos = (ph - cordic_angle) >> Uint[pw]
    return ccat(xv_pos, yv_pos, ph_pos)
def reorder(
    din: Queue[Tuple[Array['d1', 3], Array['d2', 3]]]
) -> (Queue[Tuple['d1', 'd2']], ) * 3:
    """Regroup a pair of 3-element arrays into three element pairs.

    Args:
        din: Queue of ``(a1, a2)`` tuples, where both are 3-element arrays.

    Returns:
        A 3-tuple; entry ``k`` is ``ccat(ccat(a1[k], a2[k]), eot)``, i.e. the
        ``k``-th elements of both arrays together with the eot of ``din``.
    """
    (a1, a2), eot = din
    return tuple(
        ccat(ccat(a1[lane], a2[lane]), eot) for lane in range(3))
def rect_sum(fb_data: Queue[Array[Tuple[Uint['w_ii'], Uint[1], Int['w_weight']], 3], 3], *, w_ii=b'w_ii', w_weight=b'w_weight'):
    """Sum the weighted contributions of the three rectangles in ``fb_data``.

    Each of the three array elements is turned into its own level-1 Queue,
    reduced with ``weighted_sum``, registered and multiplied by 4096. The
    three results are added.

    Args:
        fb_data: level-3 Queue of 3-element arrays of
            ``(value, flag, weight)`` tuples.
        w_ii, w_weight: widths used to build the element type.

    Returns:
        ``rect[0] + rect[1] + rect[2]``.
    """
    # Only the dtype of this interface is used, as the Queue element type.
    rect_data_t = Intf(Tuple[Uint[w_ii], Uint[1], Int[w_weight]])
    rect = []
    for i in range(3):
        # Element 0 of the fb_data eot field serves as the level-1 eot.
        rect_tmp = ccat(fb_data[0][i], fb_data[1][0]) | Queue[rect_data_t.dtype, 1] | weighted_sum | dreg_sp
        # presumably a fixed-point scaling by 2**12 -- TODO confirm
        rect_tmp = rect_tmp * 4096
        rect.append(rect_tmp)
    # This local shadows the function's own name inside the body only.
    rect_sum = rect[0] + rect[1] + rect[2]
    return rect_sum