def mkLed():
    """Build the 'blinkled' module: two RAMs and a thread that copies one
    into the other in reverse order.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    # Both RAMs share the same geometry: 32-bit data, 10-bit address.
    datawidth = 32
    addrwidth = 10
    ram0 = vthread.RAM(m, 'ram0', clk, rst, datawidth, addrwidth)
    ram1 = vthread.RAM(m, 'ram1', clk, rst, datawidth, addrwidth)

    # Thread body handed to vthread.Thread below; `times` comes from
    # th.start(10).
    def blink(times):
        # Fill ram0[0 .. times-1] with the index value itself.
        for i in range(times):
            wdata = i
            ram0.write(i, wdata)
            print('wdata = %d' % wdata)

        # reverse order: the source pattern starts at 0 with stride +1, the
        # destination pattern starts at times - 1 with stride -1
        # (pattern entries are presumably (count, stride) pairs; confirm
        # against the vthread.copy_pattern documentation).
        vthread.copy_pattern(ram0, ram1, 0, times - 1,
                             ((times, 1), ), ((times, -1), ))

        # Read the copy back from ram1 and accumulate it.
        sum = 0
        for i in range(times):
            rdata = ram1.read(i)
            sum += rdata
            print('rdata = %d' % rdata)

        print('sum = %d' % sum)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # The value returned by start() is bound to `fsm` but not used here.
    fsm = th.start(10)

    return m
def mkLed():
    """Build the 'blinkled' module comparing a streamed reduction with a
    sequential one.

    A stream reduces `size` words of ram_a into one word of ram_b; the same
    sum is then computed word by word in the thread, and the two single-word
    results in ram_b are compared.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)

    # Stream definition: source 'a' is reduced by ReduceAddValid over
    # 'size' items; the sink 'sum' is gated by the reduction's valid output.
    strm = vthread.Stream(m, 'mystream', clk, rst)
    a = strm.source('a')
    size = strm.constant('size')
    sum, sum_valid = strm.ReduceAddValid(a, size)
    strm.sink(sum, 'sum', when=sum_valid, when_name='sum_valid')

    # Bind the stream's source/constant/sink to RAM regions, run it and wait.
    def comp_stream(size, offset):
        strm.set_source('a', ram_a, offset, size)
        strm.set_constant('size', size)
        # Only one output word is written.
        strm.set_sink('sum', ram_b, offset, 1)
        strm.run()
        strm.join()

    # Reference implementation: accumulate ram_a[offset .. offset+size-1]
    # and store the total at ram_b[offset].
    def comp_sequential(size, offset):
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            sum += a
        ram_b.write(offset, sum)

    # Compare `size` words of ram_b at the two offsets and print 'OK' / 'NG'.
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size):
            st = ram_b.read(i + offset_stream)
            sq = ram_b.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('OK')
        else:
            print('NG')

    # Top-level thread body; `size` comes from th.start(32).
    def comp(size):
        # Streamed version, using RAM offset 0.
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        comp_stream(size, offset)
        myaxi.dma_write(ram_b, offset, 1024, 1)

        # Sequential version, using RAM offset `size`.
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        comp_sequential(size, offset)
        myaxi.dma_write(ram_b, offset, 1024 * 2, 1)

        # Each version produced exactly one result word.
        check(1, 0, offset)

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start(32)

    return m
def mkLed():
    """Build the 'blinkled' module: a matrix multiply driven through an
    AXI-lite register block.

    The thread waits on register 0, reads the matrix size and three offsets
    from registers 1-4, runs the multiplication and then writes register 5.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    # Declared as an 8-bit output register; not driven anywhere in this
    # function.
    led = m.OutputReg('led', 8, initval=0)

    datawidth = 32
    addrwidth = 10
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    maxi = vthread.AXIM(m, 'maxi', clk, rst, datawidth)
    saxi = vthread.AXISLiteRegister(m, 'saxi', clk, rst, datawidth, length=8)

    # Thread entry point: serve requests forever.
    def matmul():
        while True:
            # Register 0 is the start flag (waited on for value 1).
            saxi.wait_flag(0, value=1, resetvalue=0)
            matrix_size = saxi.read(1)
            a_offset = saxi.read(2)
            b_offset = saxi.read(3)
            c_offset = saxi.read(4)
            comp(matrix_size, a_offset, b_offset, c_offset)
            # Register 5 is written with 1 once the computation returns.
            saxi.write_flag(5, 1, resetvalue=0)

    # For every (i, j): load one line of A and one line of B, accumulate
    # their element-wise products, and store the result at ram_c[j].
    # After each i, one line of C is written out by DMA.
    def comp(matrix_size, a_offset, b_offset, c_offset):
        a_addr, c_addr = a_offset, c_offset

        for i in range(matrix_size):
            maxi.dma_read(ram_a, 0, a_addr, matrix_size)

            b_addr = b_offset
            for j in range(matrix_size):
                maxi.dma_read(ram_b, 0, b_addr, matrix_size)

                sum = 0
                for k in range(matrix_size):
                    x = ram_a.read(k)
                    y = ram_b.read(k)
                    sum += x * y
                ram_c.write(j, sum)

                # Advance by one line: matrix_size words of datawidth // 8
                # bytes each.
                b_addr += matrix_size * (datawidth // 8)

            maxi.dma_write(ram_c, 0, c_addr, matrix_size)
            a_addr += matrix_size * (datawidth // 8)
            c_addr += matrix_size * (datawidth // 8)

    th = vthread.Thread(m, 'th_matmul', clk, rst, matmul)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module: reverse-order RAM copy with a
    self-check.

    The thread fills ram0, copies it into ram1 through
    ``vthread.copy_pattern`` with a negative destination stride, and then
    verifies both the per-word values and the checksum.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    ram0 = vthread.RAM(m, 'ram0', clk, rst, datawidth, addrwidth)
    ram1 = vthread.RAM(m, 'ram1', clk, rst, datawidth, addrwidth)

    # Thread body; `times` comes from th.start(10).
    def blink(times):
        all_ok = True

        # Fill ram0[i] = i and keep a running checksum of what was written.
        write_sum = 0
        for i in range(times):
            wdata = i
            ram0.write(i, wdata)
            write_sum += wdata
            print('wdata = %d' % wdata)

        # reverse order: source starts at 0 with stride +1, destination
        # starts at times - 1 with stride -1.
        vthread.copy_pattern(ram0, ram1, 0, times - 1,
                             ((times, 1), ), ((times, -1), ))

        read_sum = 0
        for i in range(times):
            rdata = ram1.read(i)
            read_sum += rdata
            print('rdata = %d' % rdata)
            # reverse order: ram1[i] is expected to hold times - i - 1
            if vthread.verilog.NotEql(rdata, times - i - 1):
                all_ok = False

        print('read_sum = %d' % read_sum)

        # Reversal must not change the total.
        if vthread.verilog.NotEql(read_sum, write_sum):
            all_ok = False

        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    fsm = th.start(10)

    return m
def mkLed():
    """Build the 'blinkled' module: one RAM written and then read back by
    a thread.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myram = vthread.RAM(m, 'myram', clk, rst, datawidth, addrwidth)

    # Thread body; `times` comes from th.start(10).
    def blink(times):
        # Write phase: myram[i] = i.
        for i in range(times):
            wdata = i
            myram.write(i, wdata)
            print('wdata = %d' % wdata)

        # Read phase: read every word back and accumulate it.
        sum = 0
        for i in range(times):
            rdata = myram.read(i)
            sum += rdata
            print('rdata = %d' % rdata)

        print('sum = %d' % sum)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    fsm = th.start(10)

    return m
def mkLed():
    """Build the 'blinkled' module: a RAM created with initial values that
    a thread reads, increments and re-reads.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    numports = 1
    # Initial contents i + 10; the list is 100 entries shorter than the
    # 2 ** addrwidth address range.
    initvals = [i + 10 for i in range(2 ** addrwidth - 100)]
    myram = vthread.RAM(m, 'myram', clk, rst, datawidth, addrwidth,
                        numports, initvals)

    # Thread body; `times` comes from th.start(10).
    def blink(times):
        # Read-modify-write: store rdata + 1 back at the same address.
        for i in range(times):
            rdata = myram.read(i)
            print('rdata = %d' % rdata)
            wdata = rdata + 1
            myram.write(i, wdata)
            print('wdata = %d' % wdata)

        # Read the updated words back and accumulate them.
        sum = 0
        for i in range(times):
            rdata = myram.read(i)
            sum += rdata
            print('rdata = %d' % rdata)

        print('sum = %d' % sum)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    fsm = th.start(10)

    return m
def mkLed(numthreads=8):
    """Build the 'blinkled' module: a pool of threads updating one RAM
    under a mutex, followed by a self-check.

    Args:
        numthreads: Size of the thread pool and number of worker runs
            started by the main thread.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    led = m.OutputReg('LED', 8)

    datawidth = 32
    addrwidth = 10
    myram = vthread.RAM(m, 'myram', clk, rst, datawidth, addrwidth)
    mymutex = vthread.Mutex(m, 'mymutex', clk, rst)

    # Worker body: holding the mutex, add (tid + i) to every myram[i].
    def myfunc(tid, size):
        mymutex.lock()
        print("Thread %d Lock" % tid)

        for i in range(size):
            read_data = myram.read(i)
            write_data = read_data + tid + i
            myram.write(i, write_data)
            print("Thread %d ram[%d] <- %d" % (tid, i, write_data))

        mymutex.unlock()
        print("Thread %d Unlock" % tid)

    # Main thread body. `pool` is assigned further down in mkLed, before
    # th.start() is called.
    def blink():
        all_ok = True
        size = 16

        # Clear the region the workers will accumulate into.
        for i in range(size):
            myram.write(i, 0)

        # Start every worker (the first argument selects the pool slot,
        # the remaining ones are myfunc's tid and size), then wait for all.
        for tid in range(numthreads):
            pool.run(tid, tid, size)

        for tid in range(numthreads):
            pool.join(tid)

        for i in range(size):
            read_data = myram.read(i)
            led.value = read_data
            print("result ram[%d] = %d" % (i, read_data))
            # Sum over tid of (tid + i):
            # i * numthreads + (0 + 1 + ... + (numthreads - 1)).
            expected = i * numthreads + (0 + numthreads - 1) * numthreads // 2
            if vthread.verilog.NotEql(read_data, expected):
                all_ok = False
                print(i, read_data, expected)

        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    pool = vthread.ThreadPool(m, 'th_myfunc', clk, rst, myfunc, numthreads)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module: one RAM accessed both by a hand-built
    FSM (write_rtl / read_rtl) and by a thread.

    The thread writes the RAM and raises `write_done`; the FSM waits for
    that flag, writes the RAM through the RTL interface, then reads it back
    and accumulates the data into the `sum` register.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myram = vthread.RAM(m, 'myram', clk, rst, datawidth, addrwidth)

    read_size = 10
    write_size = read_size

    write_done = m.Reg('write_done', initval=0)
    write_addr = m.Reg('write_addr', addrwidth, initval=0)
    # The written data is the write address itself (same object).
    write_data = write_addr
    read_addr = m.Reg('read_addr', addrwidth, initval=0)
    sum = m.Reg('sum', datawidth, initval=0)

    fsm = FSM(m, 'fsm', clk, rst)
    # Stay in the first state until the thread sets write_done.
    fsm.If(write_done).goto_next()

    # write_rtl: write write_data at write_addr while the FSM is in this
    # state, incrementing the address until write_size - 1 is reached.
    myram.write_rtl(write_addr, write_data, cond=fsm)
    fsm(write_addr.inc())
    fsm(Display('wdata = %d', write_data))
    fsm.If(write_addr == write_size - 1).goto_next()

    # read_rtl: issue the first read, then move on.
    read_data, read_valid = myram.read_rtl(read_addr, cond=fsm)
    fsm.goto_next()

    # Keep issuing reads with an incrementing address, accumulating data
    # whenever read_valid is asserted.
    fsm(read_addr.inc())
    read_data, read_valid = myram.read_rtl(read_addr, cond=fsm)
    fsm.If(read_valid)(Display('rdata = %d', read_data), sum.add(read_data))
    fsm.If(read_addr == read_size - 1).goto_next()

    # Two further states repeat the same accumulate step, presumably to
    # collect read data still in flight (confirm against read_rtl latency).
    fsm.If(read_valid)(Display('rdata = %d', read_data), sum.add(read_data))
    fsm.goto_next()

    fsm.If(read_valid)(Display('rdata = %d', read_data), sum.add(read_data))
    fsm.goto_next()

    fsm(Display('sum = %d', sum))
    fsm.goto_next()

    # Thread body: write i + 100 to the RAM, then signal the FSM.
    def blink(times):
        write_done.value = 0
        for i in range(times):
            wdata = i + 100
            myram.write(i, wdata)
            print('wdata = %d' % wdata)
        write_done.value = 1

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # NOTE: this rebinds `fsm`, which previously named the FSM built above.
    fsm = th.start(read_size)

    return m
def mkLed():
    """Build the 'blinkled' module: a RAM whose initial values are
    ``Cat(...)`` expressions, checked by a thread.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    numports = 1
    # Each initial word is {16'd0, 12-bit i, 4'd0}; the thread below
    # expects that to read back as i * 16.
    initvals = [Cat(Int(0, width=16), Int(i, width=12), Int(0, width=4))
                for i in range(2 ** addrwidth - 100)]
    # nocheck_initvals=True is passed because the entries are expression
    # objects, not plain integers (presumably skipping a value check;
    # confirm in vthread.RAM).
    myram = vthread.RAM(m, 'myram', clk, rst, datawidth, addrwidth,
                        numports, initvals, nocheck_initvals=True)

    # Thread body; `times` comes from th.start(10).
    def blink(times):
        all_ok = True

        # Pass 1: check the initial value, then store value + 1.
        write_sum = 0
        for i in range(times):
            rdata = myram.read(i)
            print('rdata = %d' % rdata)
            if vthread.verilog.NotEql(rdata, i * 16):
                all_ok = False
            wdata = rdata + 1
            myram.write(i, wdata)
            write_sum += wdata
            print('wdata = %d' % wdata)

        # Pass 2: check the incremented value and accumulate a checksum.
        read_sum = 0
        for i in range(times):
            rdata = myram.read(i)
            read_sum += rdata
            print('rdata = %d' % rdata)
            if vthread.verilog.NotEql(rdata, i * 16 + 1):
                all_ok = False

        print('read_sum = %d' % read_sum)

        if vthread.verilog.NotEql(read_sum, write_sum):
            all_ok = False

        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    fsm = th.start(10)

    return m
def mkMemcpy():
    """Build a memory-copy module controlled through an AXI-lite register
    block.

    The thread waits on register 0, reads the byte count and the source and
    destination offsets from registers 1-3, copies the data in chunks
    through ram_a, and then writes register 4. The module is named
    'blinkled'.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    # Declared as an 8-bit output register; not driven anywhere in this
    # function.
    led = m.OutputReg('led', 8, initval=0)

    datawidth = 32
    addrwidth = 10
    # Maximum number of words moved per DMA chunk.
    # NOTE(review): this evaluates to 2**10 // 4 = 256, fewer than the
    # 2**addrwidth addresses of ram_a; confirm the division by the byte
    # width is intended.
    ram_words = (2**addrwidth) // (datawidth // 8)

    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)

    maxi = vthread.AXIM(m, 'maxi', clk, rst, datawidth)
    saxi = vthread.AXISLiteRegister(m, 'saxi', clk, rst, datawidth, length=8)

    # Thread entry point: serve copy requests forever.
    def memcpy():
        while True:
            saxi.wait_flag(0, value=1, resetvalue=0)

            copy_bytes = saxi.read(1)
            src_offset = saxi.read(2)
            dst_offset = saxi.read(3)

            copy(copy_bytes, src_offset, dst_offset)

            saxi.write_flag(4, 1, resetvalue=0)

    # Copy in chunks of at most ram_words words, staging each chunk in
    # ram_a.
    def copy(copy_bytes, src_offset, dst_offset):
        # Floor division: bytes beyond a whole number of words are not
        # copied.
        rest_words = copy_bytes // (datawidth // 8)
        src_global_addr = src_offset
        dst_global_addr = dst_offset
        local_addr = 0

        while rest_words > 0:
            if rest_words > ram_words:
                dma_size = ram_words
            else:
                dma_size = rest_words

            maxi.dma_read(ram_a, local_addr, src_global_addr, dma_size)
            maxi.dma_write(ram_a, local_addr, dst_global_addr, dma_size)

            # Global addresses advance in bytes, the remaining count in
            # words.
            src_global_addr += dma_size * (datawidth // 8)
            dst_global_addr += dma_size * (datawidth // 8)
            rest_words -= dma_size

    th = vthread.Thread(m, 'th_memcpy', clk, rst, memcpy)
    fsm = th.start()

    return m
def mkLed(numthreads=8):
    """Build the 'blinkled' module: a pool of threads updating one RAM
    under a mutex.

    Args:
        numthreads: Size of the thread pool and number of worker runs
            started by the main thread.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    led = m.OutputReg('LED', 8)

    datawidth = 32
    addrwidth = 10
    myram = vthread.RAM(m, 'myram', clk, rst, datawidth, addrwidth)
    mymutex = vthread.Mutex(m, 'mymutex', clk, rst)

    # Worker body: holding the mutex, add (tid + i) to every myram[i].
    def myfunc(tid, size):
        mymutex.lock()
        print("Thread %d Lock" % tid)

        for i in range(size):
            read_data = myram.read(i)
            write_data = read_data + tid + i
            myram.write(i, write_data)
            print("Thread %d ram[%d] <- %d" % (tid, i, write_data))

        mymutex.unlock()
        print("Thread %d Unlock" % tid)

    # Main thread body. `pool` is assigned further down in mkLed, before
    # th.start() is called.
    def blink():
        size = 16

        # Clear the region the workers will accumulate into.
        for i in range(size):
            myram.write(i, 0)

        # Start every worker, then wait for all of them.
        for tid in range(numthreads):
            pool.run(tid, tid, size)

        for tid in range(numthreads):
            pool.join(tid)

        # Print the final contents and mirror each word onto the LED
        # register.
        for i in range(size):
            read_data = myram.read(i)
            led.value = read_data
            print("result ram[%d] = %d" % (i, read_data))

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    pool = vthread.ThreadPool(m, 'th_myfunc', clk, rst, myfunc, numthreads)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module: a RAM declared with a ``ram_style``
    attribute string, written and verified by a thread.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    # The attribute text is passed through as given; it is presumably
    # attached to the generated memory declaration (confirm in vthread.RAM).
    myram = vthread.RAM(m, 'myram', clk, rst, datawidth, addrwidth,
                        ram_style='(* ram_style = "block" *)')

    # Thread body; `times` comes from th.start(10).
    def blink(times):
        all_ok = True

        # Write phase: myram[i] = i, with a running checksum.
        write_sum = 0
        for i in range(times):
            wdata = i
            myram.write(i, wdata)
            write_sum += wdata
            print('wdata = %d' % wdata)

        # Read phase: every word must equal its address.
        read_sum = 0
        for i in range(times):
            rdata = myram.read(i)
            read_sum += rdata
            print('rdata = %d' % rdata)
            if vthread.verilog.NotEql(rdata, i):
                all_ok = False

        print('read_sum = %d' % read_sum)

        if vthread.verilog.NotEql(read_sum, write_sum):
            all_ok = False

        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    fsm = th.start(10)

    return m
def mkLed():
    """Build the 'blinkled' module: a nine-input sum built from a stream
    Scratchpad, compared against a sequential version.

    The stream combines the current input, two delayed copies of it, and
    two scratchpad reads (each with two delayed copies) taken one and two
    `img_width` behind the write address.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)

    strm = vthread.Stream(m, 'mystream', clk, rst)
    img_width = strm.parameter('img_width')

    # Used only to gate the sink below.
    counter = strm.Counter()

    a = strm.source('a')
    a_addr = strm.Counter()

    # Scratchpad fed with `a`, addressed by the a_addr counter.
    sp = strm.Scratchpad(a, a_addr, length=128)

    # Current input and its two delayed copies.
    a0 = a
    a1 = a0.prev(1)
    a2 = a1.prev(1)

    # Scratchpad read one img_width behind the write address.
    a3_addr = a_addr - img_width
    a3 = sp.read(a3_addr)
    a4 = a3.prev(1)
    a5 = a4.prev(1)

    # Scratchpad read two img_width behind the write address.
    a6_addr = a3_addr - img_width
    a6 = sp.read(a6_addr)
    a7 = a6.prev(1)
    a8 = a7.prev(1)

    # Plain-operator form kept for reference:
    # b = a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8
    b = strm.AddN(a0, a1, a2, a3, a4, a5, a6, a7, a8)

    # Emit only once the counter has reached 2 * img_width + 2.
    strm.sink(b, 'b', when=counter >= img_width + img_width + 2)

    # Feed 3 * size input words and collect size - 2 outputs.
    def comp_stream(size, offset):
        strm.set_source('a', ram_a, offset, size * 3)
        strm.set_sink('b', ram_b, offset, size - 2)
        strm.set_parameter('img_width', size)
        strm.run()
        strm.join()

    # Reference implementation: for each i, sum three consecutive words
    # from each of three rows spaced `size` apart.
    def comp_sequential(size, offset):
        for i in range(size - 2):
            a0 = ram_a.read(i + offset)
            a1 = ram_a.read(i + offset + 1)
            a2 = ram_a.read(i + offset + 2)
            a3 = ram_a.read(i + offset + size)
            a4 = ram_a.read(i + offset + size + 1)
            a5 = ram_a.read(i + offset + size + 2)
            a6 = ram_a.read(i + offset + size + size)
            a7 = ram_a.read(i + offset + size + size + 1)
            a8 = ram_a.read(i + offset + size + size + 2)
            b = a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7 + a8
            ram_b.write(i + offset, b)

    # Compare the size - 2 result words of both versions.
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size - 2):
            st = ram_b.read(i + offset_stream)
            sq = ram_b.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    # Top-level thread body; `size` comes from th.start(32).
    def comp(size):
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size * 3)
        comp_stream(size, offset)
        myaxi.dma_write(ram_b, offset, 1024, size)

        # sequential (RAM region starting at size * 4)
        offset = size * 4
        myaxi.dma_read(ram_a, offset, 0, size * 3)
        comp_sequential(size, offset)
        myaxi.dma_write(ram_b, offset, 1024 * 2, size)

        # verification
        check(size, 0, offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start(32)

    return m
def mkLed(memory_datawidth=128):
    """Build the 'blinkled' module: DMA write/read round trips starting
    just below an AXI boundary, repeated four times.

    Args:
        memory_datawidth: Data width passed to the AXI master; also used to
            place the start address one memory word below the boundary.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, memory_datawidth)
    myram = vthread.RAM(m, 'myram', clk, rst, datawidth, addrwidth)

    # Module-level register so that both blink() and body() can update it.
    all_ok = m.TmpReg(initval=0)

    # Thread entry point; `size` comes from th.start(17).
    def blink(size):
        all_ok.value = True

        for i in range(4):
            print('# iter %d start' % i)
            # Test for 4KB boundary check: start memory_datawidth // 8
            # bytes below the boundary, in a different 16 KiB region on
            # each iteration.
            offset = i * 1024 * 16 + (myaxi.boundary_size - memory_datawidth // 8)
            body(size, offset)
            print('# iter %d end' % i)

        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    # Write two data sets to two global regions, then read each back and
    # verify it.
    def body(size, offset):
        # write: first data set (i + 100) to `offset`
        for i in range(size):
            wdata = i + 100
            myram.write(i, wdata)

        laddr = 0
        gaddr = offset
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # write: second data set (i + 1000) to (size + size) * 4 bytes
        # further on
        for i in range(size):
            wdata = i + 1000
            myram.write(i, wdata)

        laddr = 0
        gaddr = (size + size) * 4 + offset
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # read: first region, expect i + 100
        laddr = 0
        gaddr = offset
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read:  [%d] <- [%d]' % (laddr, gaddr))

        for i in range(size):
            rdata = myram.read(i)
            if vthread.verilog.NotEql(rdata, i + 100):
                print('rdata[%d] = %d' % (i, rdata))
                all_ok.value = False

        # read: second region, expect i + 1000
        laddr = 0
        gaddr = (size + size) * 4 + offset
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read:  [%d] <- [%d]' % (laddr, gaddr))

        for i in range(size):
            rdata = myram.read(i)
            if vthread.verilog.NotEql(rdata, i + 1000):
                print('rdata[%d] = %d' % (i, rdata))
                all_ok.value = False

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    fsm = th.start(17)

    return m
def mkLed():
    """Build the 'blinkled' module: an element-wise add stream whose
    sources and sink use multi-dimensional access patterns.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    shape = [16, 4, 8]
    # Total element count: 16 * 4 * 8 = 512.
    size = functools.reduce(lambda x, y: x * y, shape, 1)
    order = [1, 2, 0]

    def to_pattern(shape, order):
        """Return a list of (size, stride) pairs, one per axis in `order`.

        The stride of axis p is the product of all dimensions after p, so
        for shape [16, 4, 8] and order [1, 2, 0] the result is
        [(4, 8), (8, 1), (16, 32)].
        """
        pattern = []
        for p in order:
            size = shape[p]
            stride = functools.reduce(lambda x, y: x * y,
                                      shape[p + 1:], 1)
            pattern.append((size, stride))
        return pattern

    # All three ports use the same pattern.
    pattern_a = to_pattern(shape, order)
    pattern_b = to_pattern(shape, order)
    pattern_c = to_pattern(shape, order)

    strm = vthread.Stream(m, 'mystream', clk, rst)
    a = strm.source('a')
    b = strm.source('b')
    c = a + b
    strm.sink(c, 'c')

    # Bind the patterned sources/sink, run the stream and wait.
    def comp_stream(offset):
        strm.set_source_pattern('a', ram_a, offset, pattern_a)
        strm.set_source_pattern('b', ram_b, offset, pattern_b)
        strm.set_sink_pattern('c', ram_c, offset, pattern_c)
        strm.run()
        strm.join()

    # Reference implementation: linear element-wise add over `size` words.
    def comp_sequential(offset):
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a + b
            ram_c.write(i + offset, sum)

    # Compares only the first result word at each offset.
    def check(offset_stream, offset_seq):
        all_ok = True
        st = ram_c.read(offset_stream)
        sq = ram_c.read(offset_seq)
        if vthread.verilog.NotEql(st, sq):
            all_ok = False
        if all_ok:
            print('OK')
        else:
            print('NG')

    # Top-level thread body (started without arguments).
    def comp():
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream(offset)
        myaxi.dma_write(ram_c, offset, 1024 * 4, 1)

        # sequential (second half of the RAMs)
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential(offset)
        myaxi.dma_write(ram_c, offset, 1024 * 8, 1)

        # verification
        check(0, offset)

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module: a stream exercising several
    ``Counter`` configurations, checked against a sequential model.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    strm = vthread.Stream(m, 'mystream', clk, rst)
    # Six counters with different initval / size / interval / step
    # settings; comp_sequential below gives the value each is expected to
    # take.
    cnt1 = strm.Counter()
    cnt2 = strm.Counter(initval=1)
    cnt3 = strm.Counter(initval=2, size=5)
    cnt4 = strm.Counter(initval=3, interval=3)
    cnt5 = strm.Counter(initval=4, interval=3, size=7)
    cnt6 = strm.Counter(initval=4, step=2, interval=2)
    a = strm.source('a')
    b = strm.source('b')
    # a + b - a - b cancels arithmetically, so the output is the sum of
    # the counters while both sources still take part in the expression.
    c = a + b - a - b + cnt1 + cnt2 + cnt3 + cnt4 + cnt5 + cnt6
    strm.sink(c, 'c')

    # Bind sources/sink to RAM regions, run the stream and wait.
    def comp_stream(size, offset):
        strm.set_source('a', ram_a, offset, size)
        strm.set_source('b', ram_b, offset, size)
        strm.set_sink('c', ram_c, offset, size)
        strm.run()
        strm.join()

    # Reference model of the six counters, driven by one running index.
    def comp_sequential(size, offset):
        cnt = 0
        for i in range(size):
            cnt1 = cnt
            cnt2 = 1 + cnt
            cnt3 = (cnt + 2) % 5
            cnt4 = (cnt // 3) + 3
            cnt5 = ((cnt // 3) + 4) % 7
            cnt6 = (cnt // 2) * 2 + 4
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a + b - a - b + cnt1 + cnt2 + cnt3 + cnt4 + cnt5 + cnt6
            ram_c.write(i + offset, sum)
            cnt += 1

    # Compare `size` result words of both versions.
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    # Top-level thread body; `size` comes from th.start(32).
    def comp(size):
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size)

        # verification
        check(size, 0, offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start(32)

    return m
def mkLed(memory_datawidth=128):
    """Build the 'blinkled' module: DMA round trips through a four-bank
    ``MultibankRAM``, starting just below an AXI boundary.

    Args:
        memory_datawidth: Data width passed to the AXI master.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    numbanks = 4
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, memory_datawidth)
    # One RAM per bank, grouped behind a single MultibankRAM handle.
    myrams = [vthread.RAM(m, 'myram_%d' % i, clk, rst, datawidth, addrwidth)
              for i in range(numbanks)]
    myram = vthread.MultibankRAM(rams=myrams, name='myram')

    # Module-level register so that both blink() and body() can update it.
    all_ok = m.TmpReg(initval=0)

    array_len = 16
    # Distance between the two global regions used by body().
    array_size = (array_len + array_len) * 4 * numbanks

    # Thread entry point; `size` comes from th.start(array_len).
    def blink(size):
        all_ok.value = True

        for i in range(4):
            print('# iter %d start' % i)
            # Test for 4KB boundary check: start 4 bytes below the
            # boundary, in a different 16 KiB region on each iteration.
            offset = i * 1024 * 16 + (myaxi.boundary_size - 4)
            body(size, offset)
            print('# iter %d end' % i)

        if all_ok:
            print('ALL OK')

    # Write two data sets to two global regions, then read each back and
    # verify it bank by bank.
    def body(size, offset):
        # write: first data set (i + 100 + bank)
        for bank in range(numbanks):
            for i in range(size):
                wdata = i + 100 + bank
                myram.write_bank(bank, i, wdata)

        laddr = 0
        gaddr = offset
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # write: second data set (i + 1000 + bank)
        for bank in range(numbanks):
            for i in range(size):
                wdata = i + 1000 + bank
                myram.write_bank(bank, i, wdata)

        laddr = 0
        gaddr = array_size + offset
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # read: first region, expect i + 100 + bank
        laddr = 0
        gaddr = offset
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read:  [%d] <- [%d]' % (laddr, gaddr))

        for bank in range(numbanks):
            for i in range(size):
                rdata = myram.read_bank(bank, i)
                if vthread.verilog.NotEql(rdata, i + 100 + bank):
                    print('rdata[%d] = %d' % (i, rdata))
                    all_ok.value = False

        # read: second region, expect i + 1000 + bank
        laddr = 0
        gaddr = array_size + offset
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read:  [%d] <- [%d]' % (laddr, gaddr))

        for bank in range(numbanks):
            for i in range(size):
                rdata = myram.read_bank(bank, i)
                if vthread.verilog.NotEql(rdata, i + 1000 + bank):
                    print('rdata[%d] = %d' % (i, rdata))
                    all_ok.value = False

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    fsm = th.start(array_len)

    return m
def mkLed():
    """Build the 'blinkled' module: a stream using ``CounterValid`` and
    ``Mux``, checked against a sequential model.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    strm = vthread.Stream(m, 'mystream', clk, rst)
    size = strm.constant('size')
    cnt, valid = strm.CounterValid(size)
    a = strm.source('a')
    b = strm.source('b')
    # When `valid` is set the constant 1000 is used instead of the counter
    # value.
    cntval = strm.Mux(valid, 1000, cnt)
    c = a + b + cntval
    strm.sink(c, 'c')

    # The counter period is half of the data length.
    def comp_stream(size, offset):
        strm.set_constant('size', size // 2)
        strm.set_source('a', ram_a, offset, size)
        strm.set_source('b', ram_b, offset, size)
        strm.set_sink('c', ram_c, offset, size)
        strm.run()
        strm.join()

    # Reference model: `cnt` is added to each a + b; it jumps to 1000 when
    # it reaches size // 2 - 1 and wraps to 0 after 1000.
    def comp_sequential(size, offset):
        sum = 0
        cnt = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a + b + cnt
            cnt += 1
            if cnt == 1001:
                cnt = 0
            if cnt == size // 2 - 1:
                cnt = 1000
            ram_c.write(i + offset, sum)

    # Compare `size` result words of both versions.
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    # Top-level thread body; `size` comes from th.start(32).
    def comp(size):
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size)

        # verification
        check(size, 0, offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start(32)

    return m
def mkLed():
    """Build the 'blinkled' module: one multiply stream reused as a
    substream by two other streams (mac and act).

    Each stream is run and compared against a sequential implementation;
    the mac and act comparisons are performed twice.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    # mul stream: z = x * y
    mulstrm = vthread.Stream(m, 'mul_stream', clk, rst)
    mulx = mulstrm.source('x')
    muly = mulstrm.source('y')
    mulz = mulx * muly
    mulstrm.sink(mulz, 'z')

    # mac stream: sum of (a + 1) * (b + 1) over `size` items, with the
    # product computed by the mul stream used as a substream.
    macstrm = vthread.Stream(m, 'mac_stream', clk, rst)
    a = macstrm.source('a')
    b = macstrm.source('b')
    a = a + 1
    b = b + 1
    sub = macstrm.substream(mulstrm)
    sub.to_source('x', a)
    sub.to_source('y', b)
    c = sub.from_sink('z')
    size = macstrm.constant('size')
    sum, sum_valid = macstrm.ReduceAddValid(c, size)
    macstrm.sink(sum, 'sum', when=sum_valid, when_name='sum_valid')

    # act stream: like mac but with (a + 2) * (b + 2), and the result is
    # replaced by 0 unless it is greater than 0.
    actstrm = vthread.Stream(m, 'act_stream', clk, rst)
    a = actstrm.source('a')
    b = actstrm.source('b')
    a = a + 1
    b = b + 1
    a = a + 1
    b = b + 1
    sub = actstrm.substream(mulstrm)
    sub.to_source('x', a)
    sub.to_source('y', b)
    c = sub.from_sink('z')
    size = actstrm.constant('size')
    sum, sum_valid = actstrm.ReduceAddValid(c, size)
    sum = actstrm.Mux(sum > 0, sum, 0)
    actstrm.sink(sum, 'sum', when=sum_valid, when_name='sum_valid')

    # Run the mul stream on its own: `size` products into ram_c.
    def comp_stream_mul(size, offset):
        mulstrm.set_source('x', ram_a, offset, size)
        mulstrm.set_source('y', ram_b, offset, size)
        mulstrm.set_sink('z', ram_c, offset, size)
        mulstrm.run()
        mulstrm.join()

    # Run the mac stream: one result word into ram_c.
    def comp_stream_mac(size, offset):
        macstrm.set_source('a', ram_a, offset, size)
        macstrm.set_source('b', ram_b, offset, size)
        macstrm.set_constant('size', size)
        macstrm.set_sink('sum', ram_c, offset, 1)
        macstrm.run()
        macstrm.join()

    # Run the act stream: one result word into ram_c.
    def comp_stream_act(size, offset):
        actstrm.set_source('a', ram_a, offset, size)
        actstrm.set_source('b', ram_b, offset, size)
        actstrm.set_constant('size', size)
        actstrm.set_sink('sum', ram_c, offset, 1)
        actstrm.run()
        actstrm.join()

    # Reference for mul: element-wise product.
    def comp_sequential_mul(size, offset):
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a * b
            ram_c.write(i + offset, sum)

    # Reference for mac: accumulate (a + 1) * (b + 1), store one word.
    def comp_sequential_mac(size, offset):
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset) + 1
            b = ram_b.read(i + offset) + 1
            sum += a * b
        ram_c.write(offset, sum)

    # Reference for act: accumulate (a + 2) * (b + 2), clamp at 0, store
    # one word.
    def comp_sequential_act(size, offset):
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset) + 2
            b = ram_b.read(i + offset) + 2
            sum += a * b
        if sum <= 0:
            sum = 0
        ram_c.write(offset, sum)

    # Compare `size` words of ram_c at both offsets, printing mismatches.
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
                print(i, st, sq)
        if all_ok:
            print('OK')
        else:
            print('NG')

    # Top-level thread body; `size` comes from th.start(32).
    def comp(size):
        # mul
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_mul(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_mul(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size)

        # verification
        print('# MUL')
        check(size, 0, offset)

        # mac
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_mac(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, 1)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_mac(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, 1)

        # verification
        print('# MAC')
        check(1, 0, offset)

        # act
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_act(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, 1)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_act(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, 1)

        # verification
        print('# ACT')
        check(1, 0, offset)

        # mac 2 (second run of the mac stream, after act used the shared
        # mul substream)
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_mac(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, 1)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_mac(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, 1)

        # verification
        print('# MAC')
        check(1, 0, offset)

        # act 2 (second run of the act stream)
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_act(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, 1)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_act(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, 1)

        # verification
        print('# ACT')
        check(1, 0, offset)

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start(32)

    return m
def mkLed(matrix_size=16):
    """Build the 'blinkled' module: a matrix multiply whose inner product
    is computed by a function-style stream, with cycle counting and a
    result check.

    Args:
        matrix_size: Matrix dimension passed to the thread at start.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    # Free-running 32-bit counter, incremented by the `seq` block; used to
    # measure the computation time.
    seq = Seq(m, 'seq', clk, rst)
    timer = m.Reg('timer', 32, initval=0)
    seq(timer.inc())

    datawidth = 32
    addrwidth = 10
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)

    # Thread entry point: time comp(), print the difference, then verify.
    def matmul(matrix_size, a_offset, b_offset, c_offset):
        start_time = timer
        comp(matrix_size, a_offset, b_offset, c_offset)
        end_time = timer
        time = end_time - start_time
        print("Time (cycles): %d" % time)
        check(matrix_size, a_offset, b_offset, c_offset)

    # Stream body: multiply `size` words of ram_a and ram_b, reduce them
    # with RegionAdd and write one word to ram_c[waddr] when the result is
    # valid.
    def strm_madd(strm, size, waddr):
        a = strm.read(ram_a, 0, size)
        b = strm.read(ram_b, 0, size)
        sum, valid = strm.RegionAdd(a * b, size)
        strm.write(ram_c, waddr, 1, sum, when=valid)

    # For every (i, j): load one line of A and one line of B and run the
    # stream to produce ram_c[j]. After each i, one line of C is written
    # out. `stream` is assigned further down in mkLed, before th.start().
    def comp(matrix_size, a_offset, b_offset, c_offset):
        a_addr, c_addr = a_offset, c_offset

        for i in range(matrix_size):
            myaxi.dma_read(ram_a, 0, a_addr, matrix_size)

            b_addr = b_offset
            for j in range(matrix_size):
                myaxi.dma_read(ram_b, 0, b_addr, matrix_size)

                stream.run(matrix_size, j)
                stream.join()

                # Advance by one line: matrix_size words of datawidth // 8
                # bytes each.
                b_addr += matrix_size * (datawidth // 8)

            myaxi.dma_write(ram_c, 0, c_addr, matrix_size)
            a_addr += matrix_size * (datawidth // 8)
            c_addr += matrix_size * (datawidth // 8)

    # Read C back line by line. Diagonal entries are expected to be
    # (i + 1) * 2 and all others 0 (this depends on the memory contents
    # supplied by the test environment, which are not visible here).
    def check(matrix_size, a_offset, b_offset, c_offset):
        all_ok = True
        c_addr = c_offset
        for i in range(matrix_size):
            myaxi.dma_read(ram_c, 0, c_addr, matrix_size)
            for j in range(matrix_size):
                v = ram_c.read(j)
                if i == j and vthread.verilog.NotEql(v, (i + 1) * 2):
                    all_ok = False
                    print("NG [%d,%d] = %d" % (i, j, v))
                if i != j and vthread.verilog.NotEql(v, 0):
                    all_ok = False
                    print("NG [%d,%d] = %d" % (i, j, v))
            c_addr += matrix_size * (datawidth // 8)

        if all_ok:
            print("OK")
        else:
            print("NG")

    stream = vthread.Stream(m, 'strm_madd', clk, rst, strm_madd)
    th = vthread.Thread(m, 'th_matmul', clk, rst, matmul)
    # A at global offset 0, B at 1024, C at 2048.
    fsm = th.start(matrix_size, 0, 1024, 2048)

    return m
def mkLed():
    """Build the 'blinkled' module: a stream that writes only the sums
    satisfying a condition and reports how many were written.

    The stream adds two sources, keeps results with 140 < c < 150, and
    exposes the running count through a second sink. A sequential version
    does the same and the two result sets are compared.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    strm = vthread.Stream(m, 'mystream', clk, rst)
    a = strm.source('a')
    b = strm.source('b')
    c = a + b
    # v is true when 140 < c < 150.
    v = strm.Ands(c > 140, c < 150)
    # Running sum of v.
    cnt = strm.ReduceAdd(v)
    strm.sink(c, 'c', when=v, when_name='v')
    strm.sink(cnt, 'cnt')

    # Run the stream and return the value read from the 'cnt' sink.
    def comp_stream(size, offset):
        strm.set_source('a', ram_a, offset, size)
        strm.set_source('b', ram_b, offset, size)
        strm.set_sink('c', ram_c, offset, 0)  # max_size
        strm.set_sink_immediate('cnt', 0)  # max_size
        strm.run()
        strm.join()
        cnt = strm.read_sink('cnt')
        print('# num of counted: %d' % cnt)
        return cnt

    # Reference implementation: pack matching sums contiguously into
    # ram_c and return how many were written.
    def comp_sequential(size, offset):
        sum = 0
        addr = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            c = a + b
            if c > 140 and c < 150:
                ram_c.write(addr + offset, c)
                addr += 1
        print('# num of counted: %d' % addr)
        return addr

    # Compare `size` result words of both versions.
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    # Top-level thread body; `size` comes from th.start(32).
    def comp(size):
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        cnt = comp_stream(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, cnt)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        cnt = comp_sequential(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, cnt)

        # verification: reload both result sets from global memory and
        # compare `cnt` words (cnt here is the sequential count).
        myaxi.dma_read(ram_c, 0, 1024, cnt)
        myaxi.dma_read(ram_c, offset, 1024 * 2, cnt)
        check(cnt, 0, offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start(32)

    return m
def mkLed():
    """Build the 'blinkled' module: AXI DMA write/read-back test.

    Instantiates an AXI master and one RAM, then a thread that DMA-writes
    two different test vectors to two global addresses one
    `myaxi.boundary_size` apart, reads both back and checks the contents.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    myram = vthread.RAM(m, 'myram', clk, rst, datawidth, addrwidth)

    # Module-level register used as the pass/fail flag; set from `blink`
    # and cleared from `body` on any mismatch.
    all_ok = m.TmpReg(initval=0)

    # Thread entry point.
    def blink(size):
        all_ok.value = True

        # Test for 4KB boundary check
        # The transfer starts 4 below myaxi.boundary_size, presumably so the
        # burst straddles the boundary -- confirm the address unit.
        offset = myaxi.boundary_size - 4
        body(size, offset)

        if all_ok:
            print('ALL OK')

    # Writes two vectors (i + 100 and i + 1000), then reads both back.
    def body(size, offset):
        # write
        for i in range(size):
            wdata = i + 100
            myram.write(i, wdata)

        laddr = 0
        gaddr = offset
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # write
        for i in range(size):
            wdata = i + 1000
            myram.write(i, wdata)

        laddr = 0
        gaddr = offset + myaxi.boundary_size
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # read
        laddr = 0
        gaddr = offset
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read: [%d] <- [%d]' % (laddr, gaddr))

        # Expect the first vector (i + 100); report every mismatch.
        for i in range(size):
            rdata = myram.read(i)
            if vthread.verilog.NotEql(rdata, i + 100):
                print('rdata[%d] = %d' % (i, rdata))
                all_ok.value = False

        # read
        laddr = 0
        gaddr = offset + myaxi.boundary_size
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read: [%d] <- [%d]' % (laddr, gaddr))

        # Expect the second vector (i + 1000).
        for i in range(size):
            rdata = myram.read(i)
            if vthread.verilog.NotEql(rdata, i + 1000):
                print('rdata[%d] = %d' % (i, rdata))
                all_ok.value = False

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # Start the thread with size = 256 + 256 + 64 words.
    fsm = th.start(256 + 256 + 64)

    return m
def mkLed():
    """Build the 'blinkled' module: a MAC stream reused as a substream.

    Defines 'macstream' (multiply, then ReduceAddValid over `const`
    elements) and 'mystream', which feeds its sources through 'macstream'
    as a substream and adds `x` to the result.  A thread runs each stream
    and a sequential reference for each, comparing results in ram_c.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    # Number of products accumulated per reduction window.
    reduce_size = 4

    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)
    ram_d = vthread.RAM(m, 'ram_d', clk, rst, datawidth, addrwidth)

    # Inner stream: c, v = ReduceAddValid(a * b, const).
    macstrm = vthread.Stream(m, 'macstream', clk, rst)
    macstrm_a = macstrm.source('a')
    macstrm_b = macstrm.source('b')
    macstrm_const = macstrm.constant('const')
    macstrm_mul = macstrm_a * macstrm_b
    macstrm_c, macstrm_v = macstrm.ReduceAddValid(macstrm_mul, macstrm_const)
    # NOTE(review): adding 0 does not change the value; purpose is not
    # visible here (possibly an extra operator stage) -- confirm.
    macstrm_v += 0
    macstrm.sink(macstrm_c, 'c')
    macstrm.sink(macstrm_v, 'v')

    # Outer stream: routes x, y and const into macstrm, then z = c + x,
    # emitted only when the substream's valid flag is set.
    strm = vthread.Stream(m, 'mystream', clk, rst)
    x = strm.source('x')
    y = strm.source('y')
    const = strm.constant('const')
    sub = strm.substream(macstrm)
    sub.to_source('a', x)
    sub.to_source('b', y)
    sub.to_constant('const', const)
    z = sub.from_sink('c')
    v = sub.from_sink('v')
    z = z + x
    strm.sink(z, 'z', when=v, when_name='v')

    # Runs macstrm directly: running sums go to ram_c, valid flags to ram_d.
    def comp_stream_macstrm(size, offset):
        macstrm.set_source('a', ram_a, offset, size)
        macstrm.set_source('b', ram_b, offset, size)
        macstrm.set_constant('const', reduce_size)
        macstrm.set_sink('c', ram_c, offset, size)
        macstrm.set_sink('v', ram_d, offset, size)
        macstrm.run()
        macstrm.join()

    # Runs the outer stream; one output word per reduce_size inputs.
    def comp_stream_mystrm(size, offset):
        strm.set_source('x', ram_a, offset, size)
        strm.set_source('y', ram_b, offset, size)
        strm.set_constant('const', reduce_size)
        strm.set_sink('z', ram_c, offset, size // reduce_size)
        strm.run()
        strm.join()

    # Sequential reference for macstrm: writes the running sum for every
    # element and restarts the accumulation every reduce_size elements.
    def comp_sequential_macstrm(size, offset):
        sum = 0
        count = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum += a * b
            count += 1
            ram_c.write(i + offset, sum)
            # ram_d is written here but `check` only compares ram_c.
            ram_d.write(i + offset, count == (reduce_size - 1))
            if count == reduce_size:
                sum = 0
                count = 0

    # Sequential reference for the outer stream: writes sum + x once per
    # completed window, packed contiguously from `offset`.
    def comp_sequential_mystrm(size, offset):
        sum = 0
        count = 0
        write_offset = offset
        for i in range(size):
            x = ram_a.read(i + offset)
            y = ram_b.read(i + offset)
            sum += x * y
            val = sum + x
            count += 1
            if count == reduce_size:
                ram_c.write(write_offset, val)
                write_offset += 1
                sum = 0
                count = 0

    # Compares `size` words of ram_c at the two offsets; prints each
    # mismatching (index, stream value, sequential value) and the verdict.
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
                print(i, st, sq)
        if all_ok:
            print('OK')
        else:
            print('NG')

    # Thread entry point: macstream pass first, then mystream pass.
    def comp(size):
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream_macstrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential_macstrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size)

        # verification
        print('# macstream')
        check(size, 0, offset)

        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream_mystrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size // reduce_size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential_mystrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size // reduce_size)

        # verification
        print('# mystream')
        check(size // reduce_size, 0, offset)

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    # Start the thread with size = 16.
    fsm = th.start(16)

    return m
def mkLed():
    """Build the 'blinkled' module: three levels of nested substreams.

    Defines 'macstream' (multiply + ReduceAddValid), 'macstream2' wrapping
    it, 'neststream' wrapping 'macstream2', and 'mystream' wrapping
    'neststream'.  A thread runs 'macstream2' and 'mystream' against
    sequential reference implementations and compares results in ram_c.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    # Number of products accumulated per reduction window.
    reduce_size = 4

    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)
    ram_d = vthread.RAM(m, 'ram_d', clk, rst, datawidth, addrwidth)

    # Level 0: c, v = ReduceAddValid(a * b, const).
    macstrm = vthread.Stream(m, 'macstream', clk, rst)
    macstrm_a = macstrm.source('a')
    macstrm_b = macstrm.source('b')
    macstrm_const = macstrm.parameter('const')
    macstrm_mul = macstrm_a * macstrm_b
    macstrm_c, macstrm_v = macstrm.ReduceAddValid(macstrm_mul, macstrm_const)
    macstrm.sink(macstrm_c, 'c')
    macstrm.sink(macstrm_v, 'v')

    # Level 1: wraps macstrm.  The +1 / -1 / *1 operations cancel out
    # arithmetically, so the values passed down equal the raw sources.
    macstrm2 = vthread.Stream(m, 'macstream2', clk, rst)
    macstrm2_a = macstrm2.source('a')
    macstrm2_b = macstrm2.source('b')
    macstrm2_const = macstrm2.parameter('const')
    macstrm2_a = macstrm2_a + 1
    macstrm2_a = macstrm2_a - 1
    macstrm2_b = macstrm2_b * 1
    macsub = macstrm2.substream(macstrm)
    macsub.to_source('a', macstrm2_a)
    macsub.to_source('b', macstrm2_b)
    macsub.to_parameter('const', macstrm2_const)
    macstrm2_c = macsub.from_sink('c')
    macstrm2_v = macsub.from_sink('v')
    macstrm2.sink(macstrm2_c, 'c')
    macstrm2.sink(macstrm2_v, 'v')

    # Level 2: wraps macstrm2 with a = a + 1, b = b + 1 going in and
    # c = c + (a + 1) coming out (the "+= 0" steps leave values unchanged).
    neststrm = vthread.Stream(m, 'neststream', clk, rst)
    neststrm_a = neststrm.source('a')
    neststrm_b = neststrm.source('b')
    neststrm_const = neststrm.parameter('const')
    neststrm_a += 1
    neststrm_a += 0
    neststrm_b += 1
    # `macsub` is rebound; the level-1 substream object is no longer needed.
    macsub = neststrm.substream(macstrm2)
    macsub.to_source('a', neststrm_a)
    macsub.to_source('b', neststrm_b)
    macsub.to_parameter('const', neststrm_const)
    neststrm_c = macsub.from_sink('c')
    neststrm_c += neststrm_a
    neststrm_c += 0
    neststrm_v = macsub.from_sink('v')
    neststrm.sink(neststrm_c, 'c')
    neststrm.sink(neststrm_v, 'v')

    # Level 3: wraps neststrm and adds y; output gated by the valid flag.
    strm = vthread.Stream(m, 'mystream', clk, rst)
    x = strm.source('x')
    y = strm.source('y')
    const = strm.parameter('const')
    sub = strm.substream(neststrm)
    sub.to_source('a', x)
    sub.to_source('b', y)
    sub.to_parameter('const', const)
    z = sub.from_sink('c')
    v = sub.from_sink('v')
    z = z + y
    strm.sink(z, 'z', when=v, when_name='v')

    # Module-level pass/fail register; set in `comp`, cleared in `check`.
    all_ok = m.TmpReg(initval=0)

    # Runs macstrm2 (not macstrm): sums go to ram_c, valid flags to ram_d.
    def comp_stream_macstrm(size, offset):
        macstrm2.set_source('a', ram_a, offset, size)
        macstrm2.set_source('b', ram_b, offset, size)
        macstrm2.set_parameter('const', reduce_size)
        macstrm2.set_sink('c', ram_c, offset, size)
        macstrm2.set_sink('v', ram_d, offset, size)
        macstrm2.run()
        macstrm2.join()

    # Runs the outermost stream; one output word per reduce_size inputs.
    def comp_stream_mystrm(size, offset):
        strm.set_source('x', ram_a, offset, size)
        strm.set_source('y', ram_b, offset, size)
        strm.set_parameter('const', reduce_size)
        strm.set_sink('z', ram_c, offset, size // reduce_size)
        strm.run()
        strm.join()

    # Sequential reference for the MAC stream: running sum per element,
    # restarted every reduce_size elements.
    def comp_sequential_macstrm(size, offset):
        sum = 0
        count = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum += a * b
            count += 1
            ram_c.write(i + offset, sum)
            # ram_d is written here but `check` only compares ram_c.
            ram_d.write(i + offset, count == (reduce_size - 1))
            if count == reduce_size:
                sum = 0
                count = 0

    # Sequential reference for the outermost stream.  Mirrors the nested
    # definition: products of (x + 1) and (y + 1), plus (x + 1) from
    # neststream, plus y from mystream.
    def comp_sequential_mystrm(size, offset):
        sum = 0
        count = 0
        write_offset = offset
        for i in range(size):
            x = ram_a.read(i + offset)
            y = ram_b.read(i + offset)
            sum += (x + 1) * (y + 1)
            val = sum + (x + 1) + y
            count += 1
            if count == reduce_size:
                ram_c.write(write_offset, val)
                write_offset += 1
                sum = 0
                count = 0

    # Compares `size` words of ram_c at the two offsets.  The flag is the
    # shared `all_ok` register, so a failure in the first check also makes
    # the second verdict print FAILED.
    def check(size, offset_stream, offset_seq):
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok.value = False
                print(i, st, sq)
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    # Thread entry point: MAC-stream pass first, then mystream pass.
    def comp(size):
        all_ok.value = True

        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream_macstrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential_macstrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size)

        # verification
        print('# macstream')
        check(size, 0, offset)

        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream_mystrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size // reduce_size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential_mystrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size // reduce_size)

        # verification
        print('# mystream')
        check(size // reduce_size, 0, offset)

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    # Start the thread with size = 16.
    fsm = th.start(16)

    return m
def mkLed(matrix_size=16):
    """Build the 'blinkled' module: matrix multiply via a MAC stream.

    Instantiates a free-running cycle counter, three RAMs, an AXI master
    and a multiply-accumulate stream.  A thread computes the product row
    by row, prints the elapsed cycle count and verifies the result.

    NOTE(review): `datawidth`, `a_offset`, `b_offset` and `c_offset` are
    used below but not defined in this function; presumably they are
    module-level names -- confirm they exist in this file.

    Args:
        matrix_size: Row/column count passed to the thread.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    # 32-bit register incremented by `seq`; sampled in `matmul` for timing.
    seq = Seq(m, 'seq', clk, rst)
    timer = m.Reg('timer', 32, initval=0)
    seq(
        timer.inc()
    )

    addrwidth = 10
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)

    # Stream: sum = ReduceAddValid(a * b, size), emitted when sum_valid.
    strm = vthread.Stream(m, 'strm_madd', clk, rst)
    a = strm.source('a')
    b = strm.source('b')
    size = strm.constant('size')
    sum, sum_valid = strm.ReduceAddValid(a * b, size)
    strm.sink(sum, 'sum', when=sum_valid, when_name='sum_valid')

    # Multiply-accumulates `size` words of ram_a and ram_b (both from
    # address 0) and writes the single result to ram_c[waddr].
    def strm_madd(size, waddr):
        strm.set_source('a', ram_a, 0, size)
        strm.set_source('b', ram_b, 0, size)
        strm.set_constant('size', size)
        strm.set_sink('sum', ram_c, waddr, 1)
        strm.run()
        strm.join()

    # Thread entry point: time the computation, then verify it.
    def matmul(matrix_size, a_offset, b_offset, c_offset):
        start_time = timer
        comp(matrix_size, a_offset, b_offset, c_offset)
        end_time = timer
        time = end_time - start_time
        print("Time (cycles): %d" % time)
        check(matrix_size, a_offset, b_offset, c_offset)
        vthread.finish()

    # For each of matrix_size vectors read from the A region, combines it
    # with each of matrix_size vectors read from the B region, then writes
    # the resulting matrix_size words to the C region.
    def comp(matrix_size, a_offset, b_offset, c_offset):
        a_addr, c_addr = a_offset, c_offset

        for i in range(matrix_size):
            myaxi.dma_read(ram_a, 0, a_addr, matrix_size)

            b_addr = b_offset
            for j in range(matrix_size):
                myaxi.dma_read(ram_b, 0, b_addr, matrix_size)
                strm_madd(matrix_size, j)
                # Advance by one vector; (datawidth // 8) is bytes per word.
                b_addr += matrix_size * (datawidth // 8)

            myaxi.dma_write(ram_c, 0, c_addr, matrix_size)
            a_addr += matrix_size * (datawidth // 8)
            c_addr += matrix_size * (datawidth // 8)

    # Reads the result back row by row and expects (i + 1) * 2 on the
    # diagonal and 0 elsewhere; prints every mismatch.
    # NOTE(review): the expected values imply specific contents for the A
    # and B regions that are set up outside this function -- confirm.
    def check(matrix_size, a_offset, b_offset, c_offset):
        all_ok = True
        c_addr = c_offset
        for i in range(matrix_size):
            myaxi.dma_read(ram_c, 0, c_addr, matrix_size)
            for j in range(matrix_size):
                v = ram_c.read(j)
                if i == j and vthread.verilog.NotEql(v, (i + 1) * 2):
                    all_ok = False
                    print("NG [%d,%d] = %d" % (i, j, v))
                if i != j and vthread.verilog.NotEql(v, 0):
                    all_ok = False
                    print("NG [%d,%d] = %d" % (i, j, v))
            c_addr += matrix_size * (datawidth // 8)

        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    th = vthread.Thread(m, 'th_matmul', clk, rst, matmul)
    fsm = th.start(matrix_size, a_offset, b_offset, c_offset)

    return m
def mkLed(word_datawidth=128):
    """Build the 'blinkled' module: AXI-stream loopback through two FIFOs.

    Writes a test vector to memory, streams it back in through an
    AXI-stream input into `fifo_in`, copies it to `fifo_out`, streams it
    out to a second memory region, and compares the two regions.

    Args:
        word_datawidth: Bit width of the RAM and FIFO words; the AXI
            interfaces use the fixed 32-bit `datawidth`.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    myram = vthread.RAM(m, 'myram', clk, rst, word_datawidth, addrwidth,
                        numports=2)

    axi_in = vthread.AXIStreamInFifo(m, 'axi_in', clk, rst, datawidth,
                                     with_last=True, noio=True)
    axi_out = vthread.AXIStreamOutFifo(m, 'axi_out', clk, rst, datawidth,
                                       with_last=True, noio=True)

    # AXI masters attached to the stream interfaces created above.
    maxi_in = vthread.AXIM_for_AXIStreamIn(axi_in, 'maxi_in')
    maxi_out = vthread.AXIM_for_AXIStreamOut(axi_out, 'maxi_out')

    fifo_addrwidth = 8
    fifo_in = vthread.FIFO(m, 'fifo_in', clk, rst, word_datawidth,
                           fifo_addrwidth)
    fifo_out = vthread.FIFO(m, 'fifo_out', clk, rst, word_datawidth,
                            fifo_addrwidth)

    # Module-level pass/fail register; set in `blink`, cleared in `body`.
    all_ok = m.TmpReg(initval=0)

    # Thread entry point: four iterations at different base addresses.
    def blink(size):
        all_ok.value = True

        for i in range(4):
            print('# iter %d start' % i)
            # Test for 4KB boundary check
            # Start one word (word_datawidth // 8 bytes) below the boundary.
            offset = i * 1024 * 16 + (myaxi.boundary_size - (word_datawidth // 8))
            body(size, offset)
            print('# iter %d end' % i)

        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    def body(size, offset):
        # write a test vector
        for i in range(size):
            wdata = i + 100
            myram.write(i, wdata)

        laddr = 0
        gaddr = offset
        myaxi.dma_write(myram, laddr, gaddr, size, port=1)

        # AXI-stream read -> FIFO -> FIFO -> AXI-stream write
        # Transfer lengths are scaled by word_datawidth // datawidth, the
        # number of AXI words per RAM/FIFO word.
        maxi_in.dma_read_async(gaddr, size * (word_datawidth // datawidth))
        axi_in.write_fifo(fifo_in, size)

        for i in range(size):
            va = fifo_in.deq()
            fifo_out.enq(va)

        # Destination lies 2 * size words past `offset`.
        out_gaddr = (size + size) * (word_datawidth // 8) + offset
        maxi_out.dma_write_async(out_gaddr, size * (word_datawidth // datawidth))
        axi_out.read_fifo(fifo_out, size)

        # check
        # Load the source region to RAM[0:size] and the copy to
        # RAM[size:2*size], then compare word by word.
        myaxi.dma_read(myram, 0, gaddr, size, port=1)
        myaxi.dma_read(myram, size, out_gaddr, size, port=1)

        for i in range(size):
            v0 = myram.read(i)
            v1 = myram.read(i + size)
            if vthread.verilog.NotEql(v0, v1):
                all_ok.value = False

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # Start the thread with size = 17 words.
    fsm = th.start(17)

    return m
def mkLed():
    """Build the 'blinkled' module: stream with a scratchpad delay line.

    The stream stores source 'a' into a scratchpad and adds each element
    to the one read from `img_width` positions earlier.  Output is
    emitted once the counter has reached `img_width`.  A periodic disable
    condition is added on the stream's `oready`.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)

    strm = vthread.Stream(m, 'mystream', clk, rst)
    img_width = strm.parameter('img_width')

    # Gates the sink below.
    counter = strm.Counter()

    # Scratchpad written with `a` at the address given by its own counter.
    a = strm.source('a')
    a_addr = strm.Counter()
    sp = strm.Scratchpad(a, a_addr, length=128)

    # Read address trails a third counter by img_width.
    a_old_addr = strm.Counter() - img_width
    a_old = sp.read(a_old_addr)

    b = a + a_old

    strm.sink(b, 'b', when=counter >= img_width)

    # add a stall condition
    # `count` is a 4-bit register incremented by `seq`; the disable
    # condition is applied to strm.oready whenever count == 0.
    count = m.Reg('count', 4, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(count.inc())

    util.add_disable_cond(strm.oready, 1, count == 0)

    # Reads 2 * size inputs and produces size outputs.
    def comp_stream(size, offset):
        strm.set_source('a', ram_a, offset, size * 2)
        strm.set_sink('b', ram_b, offset, size)
        strm.set_parameter('img_width', size)
        strm.run()
        strm.join()

    # Sequential reference: b[i] = a[i] + a[i + size].
    def comp_sequential(size, offset):
        for i in range(size):
            a_buf = ram_a.read(i + offset)
            a = ram_a.read(i + offset + size)
            b = a_buf + a
            ram_b.write(i + offset, b)

    # Compares `size` words of ram_b at the two offsets and prints the verdict.
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size):
            st = ram_b.read(i + offset_stream)
            sq = ram_b.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    # Thread entry point.
    def comp(size):
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size * 2)
        comp_stream(size, offset)
        myaxi.dma_write(ram_b, offset, 1024, size)

        # sequential
        # size * 4 keeps this region clear of the 2 * size words used above.
        offset = size * 4
        myaxi.dma_read(ram_a, offset, 0, size * 2)
        comp_sequential(size, offset)
        myaxi.dma_write(ram_b, offset, 1024 * 2, size)

        # verification
        check(size, 0, offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    # Start the thread with size = 32.
    fsm = th.start(32)

    return m
def mkLed():
    """Build the 'blinkled' module: DMA test driven from the RAM object.

    Same write/read-back pattern as an AXI-master-driven DMA test, but the
    transfers are issued as `myram.dma_write(myaxi, ...)` and
    `myram.dma_read(myaxi, ...)`.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    myram = vthread.RAM(m, 'myram', clk, rst, datawidth, addrwidth)

    # Thread entry point (takes no arguments; sizes are fixed below).
    def blink():
        size = 256 * 2
        offset = 1024 * 4

        # write
        for i in range(size):
            wdata = i
            myram.write(i, wdata)

        laddr = 0
        gaddr = offset
        myram.dma_write(myaxi, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # overwrite
        # Refill the RAM with the constant 128 and send it to the region
        # size * 4 past the first one.
        for i in range(size):
            wdata = 128
            myram.write(i, wdata)

        laddr = 0
        gaddr = offset + size * 4
        myram.dma_write(myaxi, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # read
        all_ok = True
        laddr = 0
        gaddr = offset
        myram.dma_read(myaxi, laddr, gaddr, size)
        print('dma_read: [%d] <- [%d]' % (laddr, gaddr))

        # First region must still hold the ramp 0..size-1.
        for i in range(size):
            rdata = myram.read(i)
            if rdata != i:
                print('rdata[%d] = %d' % (i, rdata))
                all_ok = False

        # read
        laddr = 0
        gaddr = offset + size * 4
        myram.dma_read(myaxi, laddr, gaddr, size)
        print('dma_read: [%d] <- [%d]' % (laddr, gaddr))

        # Second region must hold the constant 128.
        for i in range(size):
            rdata = myram.read(i)
            if rdata != 128:
                print('rdata[%d] = %d' % (i, rdata))
                all_ok = False

        if all_ok:
            print('ALL OK')

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module: stream sources driven by a pattern.

    The stream adds sources 'a' and 'b', both configured with
    `set_source_pattern` using `pattern = [(size, 0)]`.  The sequential
    reference reads the same address (offset + 10) on every iteration.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    size = 16
    # NOTE(review): presumably a list of (count, stride) pairs, so this is
    # `size` accesses with stride 0 -- consistent with comp_sequential
    # below, but confirm against the set_source_pattern documentation.
    pattern = [(size, 0)]

    strm = vthread.Stream(m, 'mystream', clk, rst)
    a = strm.source('a')
    b = strm.source('b')
    sum = a + b
    strm.sink(sum, 'sum')

    # Stream version: pattern starts at offset + 10; `size` results go to
    # ram_c from `offset`.
    def comp_stream(offset):
        strm.set_source_pattern('a', ram_a, offset + 10, pattern)
        strm.set_source_pattern('b', ram_b, offset + 10, pattern)
        strm.set_sink('sum', ram_c, offset, size)
        strm.run()
        strm.join()

    # Sequential reference: the read address does not depend on i, so the
    # same sum is written to `size` consecutive ram_c words.
    def comp_sequential(offset):
        sum = 0
        for i in range(size):
            a = ram_a.read(offset + 10)
            b = ram_b.read(offset + 10)
            sum = a + b
            ram_c.write(i + offset, sum)

    # Compares `size` words of ram_c at the two offsets and prints the verdict.
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    # Thread entry point.  Only one word of each result is DMA-written out;
    # `check` compares the RAM contents directly.
    def comp():
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream(offset)
        myaxi.dma_write(ram_c, offset, 1024 * 4, 1)

        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential(offset)
        myaxi.dma_write(ram_c, offset, 1024 * 8, 1)

        check(size, 0, offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module: three back-to-back stream runs.

    The stream multiplies sources 'a' and 'b'.  `comp_stream` issues three
    runs over consecutive `size`-word windows, configuring the next run
    before waiting on `source_join()` for the previous one.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    strm = vthread.Stream(m, 'mystream', clk, rst)
    a = strm.source('a')
    b = strm.source('b')
    c = a * b
    strm.sink(c, 'c')

    # Processes 3 * size words in three runs.  Only the final run is
    # followed by a full join(); earlier runs are followed by source_join().
    def comp_stream(size, offset):
        strm.set_source('a', ram_a, offset, size)
        strm.set_source('b', ram_b, offset, size)
        strm.set_sink('c', ram_c, offset, size)
        strm.run()

        # double buffer of comp and cmd
        strm.set_source('a', ram_a, offset + size, size)
        strm.set_source('b', ram_b, offset + size, size)
        strm.set_sink('c', ram_c, offset + size, size)
        strm.source_join()
        strm.run()

        # double buffer of comp and cmd
        strm.set_source('a', ram_a, offset + size + size, size)
        strm.set_source('b', ram_b, offset + size + size, size)
        strm.set_sink('c', ram_c, offset + size + size, size)
        strm.source_join()
        strm.run()

        strm.source_join()
        strm.join()

    # Sequential reference: c[i] = a[i] * b[i].
    def comp_sequential(size, offset):
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a * b
            ram_c.write(i + offset, sum)

    # Compares `size` words of ram_c at the two offsets and prints the verdict.
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    # Thread entry point.
    def comp(size):
        # Total words covered by the three stream runs.
        new_size = size + size + size

        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, new_size)
        myaxi.dma_read(ram_b, offset, 512, new_size)
        comp_stream(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, new_size)

        # sequential
        offset = new_size
        myaxi.dma_read(ram_a, offset, 0, new_size)
        myaxi.dma_read(ram_b, offset, 512, new_size)
        comp_sequential(new_size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, new_size)

        # verification
        # Reload both result regions from the addresses written above.
        myaxi.dma_read(ram_c, 0, 1024, new_size)
        myaxi.dma_read(ram_c, offset, 1024 * 2, new_size)
        check(new_size, 0, offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    # Start the thread with size = 32 (96 words in total).
    fsm = th.start(32)

    return m