def mkLed():
    """Build the 'blinkled' module: an AXI-stream pass-through that adds 1.

    Instantiates an input stream `axi_a`, an output stream `axi_b` (both
    32 bit, with a `last` flag) and a register block `saxi`, then starts the
    thread `th_comp` running `comp`.

    Register usage visible in this function:
        0 -- start flag (waited on for value 1)
        1 -- written 1 while processing and 0 afterwards ("busy")
        2 -- number of stream words to process

    Returns:
        The constructed Module `m`.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32

    axi_a = vthread.AXIStreamIn(m, 'axi_a', clk, rst, datawidth, with_last=True)
    axi_b = vthread.AXIStreamOut(m, 'axi_b', clk, rst, datawidth, with_last=True)

    saxi = vthread.AXISLiteRegister(m, 'saxi', clk, rst, datawidth)

    # Thread body: for every start request, forward `size` words from axi_a
    # to axi_b, incrementing the data and passing the `last` flag through.
    def comp():
        while True:
            # presumably blocks until register 0 reads 1 and then resets it
            # to `resetvalue` -- confirm against the AXISLiteRegister API
            saxi.wait_flag(0, value=1, resetvalue=0)
            saxi.write(1, 1)  # set busy
            size = saxi.read(2)
            for i in range(size):
                a, a_last = axi_a.read()
                b = a + 1
                b_last = a_last
                axi_b.write(b, b_last)
            saxi.write(1, 0)  # unset busy

        # NOTE(review): nesting reconstructed from flattened source. Placed
        # here it follows an infinite loop with no `break`; confirm intent.
        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'add' module: a register-driven adder.

    Register usage visible in this function (register block `saxi`,
    32 bit wide, 8 entries):
        0 -- start flag (waited on for value 1)
        1 -- completion flag, set to 1 via `write_flag`
        2 -- operand a
        3 -- operand b
        4 -- result a + b

    Returns:
        The constructed Module `m`.
    """
    m = Module('add')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    saxi = vthread.AXISLiteRegister(m, 'saxi', clk, rst, datawidth=32, length=8)

    # Thread body: wait for start, add registers 2 and 3, store the sum in
    # register 4, then raise the flag in register 1.
    def add():
        while True:
            saxi.wait_flag(0, value=1, resetvalue=0)
            a = saxi.read(2)
            b = saxi.read(3)
            c = a + b
            saxi.write(4, c)
            saxi.write_flag(1, 1, resetvalue=0)

    th = vthread.Thread(m, 'th_add', clk, rst, add)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module: a sequential matrix multiply over DMA.

    Three RAMs (`ram_a`, `ram_b`, `ram_c`) hold one row of A, one row of the
    B operand and one row of the result. Data is moved with `maxi.dma_read`
    and `maxi.dma_write`.

    Register usage visible in this function (register block `saxi`):
        0 -- start flag (waited on for value 1)
        1 -- matrix_size
        2 -- a_offset
        3 -- b_offset
        4 -- c_offset
        5 -- completion flag, set to 1 via `write_flag`

    Returns:
        The constructed Module `m`.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    led = m.OutputReg('led', 8, initval=0)

    datawidth = 32
    addrwidth = 10
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    maxi = vthread.AXIM(m, 'maxi', clk, rst, datawidth)
    saxi = vthread.AXISLiteRegister(m, 'saxi', clk, rst, datawidth, length=8)

    # Thread entry point: read the job description from the registers, run
    # comp(), then signal completion in register 5.
    def matmul():
        while True:
            saxi.wait_flag(0, value=1, resetvalue=0)
            matrix_size = saxi.read(1)
            a_offset = saxi.read(2)
            b_offset = saxi.read(3)
            c_offset = saxi.read(4)
            comp(matrix_size, a_offset, b_offset, c_offset)
            saxi.write_flag(5, 1, resetvalue=0)

    # For each of `matrix_size` rows fetched from a_addr, compute the dot
    # product with each of `matrix_size` rows fetched from b_addr and write
    # the resulting row to c_addr.
    def comp(matrix_size, a_offset, b_offset, c_offset):
        a_addr, c_addr = a_offset, c_offset

        for i in range(matrix_size):
            maxi.dma_read(ram_a, 0, a_addr, matrix_size)

            # B is re-read from its start for every row of A
            b_addr = b_offset
            for j in range(matrix_size):
                maxi.dma_read(ram_b, 0, b_addr, matrix_size)

                sum = 0
                for k in range(matrix_size):
                    x = ram_a.read(k)
                    y = ram_b.read(k)
                    sum += x * y
                ram_c.write(j, sum)

                # advance by one row: matrix_size words of datawidth // 8 bytes
                b_addr += matrix_size * (datawidth // 8)

            maxi.dma_write(ram_c, 0, c_addr, matrix_size)
            a_addr += matrix_size * (datawidth // 8)
            c_addr += matrix_size * (datawidth // 8)

    th = vthread.Thread(m, 'th_matmul', clk, rst, matmul)
    fsm = th.start()

    return m
def mkMemcpy():
    """Build the 'blinkled' module: a chunked memory-to-memory copy.

    Data is staged through `ram_a`: each chunk is read with `maxi.dma_read`
    and written back out with `maxi.dma_write`.

    Register usage visible in this function (register block `saxi`):
        0 -- start flag (waited on for value 1)
        1 -- copy_bytes
        2 -- src_offset
        3 -- dst_offset
        4 -- completion flag, set to 1 via `write_flag`

    Returns:
        The constructed Module `m`.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    led = m.OutputReg('led', 8, initval=0)

    datawidth = 32
    addrwidth = 10
    # per-chunk limit in words: (2**10) // (32 // 8) = 256
    ram_words = (2**addrwidth) // (datawidth // 8)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)

    maxi = vthread.AXIM(m, 'maxi', clk, rst, datawidth)
    saxi = vthread.AXISLiteRegister(m, 'saxi', clk, rst, datawidth, length=8)

    # Thread entry point: read the job description, run copy(), then signal
    # completion in register 4.
    def memcpy():
        while True:
            saxi.wait_flag(0, value=1, resetvalue=0)
            copy_bytes = saxi.read(1)
            src_offset = saxi.read(2)
            dst_offset = saxi.read(3)
            copy(copy_bytes, src_offset, dst_offset)
            saxi.write_flag(4, 1, resetvalue=0)

    # Copy `copy_bytes // (datawidth // 8)` words in chunks of at most
    # `ram_words`; any remainder bytes are dropped by the floor division.
    def copy(copy_bytes, src_offset, dst_offset):
        rest_words = copy_bytes // (datawidth // 8)
        src_global_addr = src_offset
        dst_global_addr = dst_offset
        local_addr = 0

        while rest_words > 0:
            if rest_words > ram_words:
                dma_size = ram_words
            else:
                dma_size = rest_words

            maxi.dma_read(ram_a, local_addr, src_global_addr, dma_size)
            maxi.dma_write(ram_a, local_addr, dst_global_addr, dma_size)

            # global addresses advance in bytes, the remaining count in words
            src_global_addr += dma_size * (datawidth // 8)
            dst_global_addr += dma_size * (datawidth // 8)
            rest_words -= dma_size

    th = vthread.Thread(m, 'th_memcpy', clk, rst, memcpy)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module: an LED counter with a programmable delay.

    Register usage visible in this function (register block `saxi`):
        0 -- start flag (waited on for value 1)
        1 -- sleep (delay count passed to `wait`)
        2 -- size (number of LED increments)
        3 -- done: cleared with `write`, then set to 1 via `write_flag`

    Returns:
        The constructed Module `m`.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    led = m.OutputReg('led', 8, initval=0)

    datawidth = 32
    addrwidth = 10

    saxi = vthread.AXISLiteRegister(m, 'saxi', clk, rst, datawidth)

    # Registered below via `th.add_intrinsics`. Operates directly on the FSM
    # object: a 32-bit temporary counter increments while below `sleep`, is
    # cleared once it reaches `sleep`, and only then does the FSM move on.
    def wait(fsm, sleep):
        cnt = fsm.m.TmpReg(32, initval=0)
        fsm.If(cnt < sleep)(
            cnt.inc()
        )
        fsm.If(cnt >= sleep)(
            cnt(0)
        )
        # `Then()` presumably reuses the preceding `If` condition -- confirm
        fsm.Then().goto_next()

    # Thread body. NOTE(review): the `size` argument (16, from `th.start(16)`)
    # is overwritten by `saxi.read(2)` before it is ever read.
    def blink(size):
        while True:
            # wait start
            saxi.wait_flag(0, value=1, resetvalue=0)

            # reset done
            saxi.write(3, 0)

            sleep = saxi.read(1)
            size = saxi.read(2)

            for i in range(size):
                wait(sleep)
                led.value += 1

            # done
            saxi.write_flag(3, 1, resetvalue=0)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    th.add_intrinsics(wait)
    fsm = th.start(16)

    return m
def mkLed():
    """Build the 'blinkled' module: a DMA write/read-back self-test.

    The thread fills `myram`, writes it out through `myaxi`, reads it back
    and compares it against the expected pattern, for four different global
    offsets. The overall result is accumulated in the register `all_ok`.

    Register usage visible in this function (register block `saxi`):
        0 -- start flag (waited on for value 1)
        1 -- done: cleared with `write`, then set to 1 via `write_flag`
        2 -- result (`all_ok`)

    Returns:
        The constructed Module `m`.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    myram = vthread.RAM(m, 'myram', clk, rst, datawidth, addrwidth)

    saxi = vthread.AXISLiteRegister(m, 'saxi', clk, rst, datawidth)

    all_ok = m.TmpReg(initval=0)

    # Thread entry point; `size` is the word count supplied by `th.start(16)`.
    def blink(size):
        # wait start
        saxi.wait_flag(0, value=1, resetvalue=0)

        # reset done
        saxi.write(1, 0)

        all_ok.value = True

        for i in range(4):
            print('# iter %d start' % i)
            # Test for 4KB boundary check: each offset sits
            # `myaxi.boundary_size - 4` past a 16 KiB-aligned base, so the
            # transfer presumably straddles an AXI boundary -- confirm.
            offset = i * 1024 * 16 + (myaxi.boundary_size - 4)
            body(size, offset)
            print('# iter %d end' % i)

        if all_ok:
            print('# verify (local): PASSED')
        else:
            print('# verify (local): FAILED')

        # result
        saxi.write(2, all_ok)

        # done
        saxi.write_flag(1, 1, resetvalue=0)

    # One test round: two pattern writes to different global addresses,
    # followed by two read-backs that clear `all_ok` on any mismatch.
    def body(size, offset):
        # write pattern i + 100 to global address `offset`
        for i in range(size):
            wdata = i + 100
            myram.write(i, wdata)

        laddr = 0
        gaddr = offset
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # write pattern i + 1000 to a second region, (size + size) * 4 bytes
        # after `offset`
        for i in range(size):
            wdata = i + 1000
            myram.write(i, wdata)

        laddr = 0
        gaddr = (size + size) * 4 + offset
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # read back the first region and compare with i + 100
        laddr = 0
        gaddr = offset
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read: [%d] <- [%d]' % (laddr, gaddr))

        for i in range(size):
            rdata = myram.read(i)
            if vthread.verilog.NotEql(rdata, i + 100):
                print('rdata[%d] = %d' % (i, rdata))
                all_ok.value = False

        # read back the second region and compare with i + 1000
        laddr = 0
        gaddr = (size + size) * 4 + offset
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read: [%d] <- [%d]' % (laddr, gaddr))

        for i in range(size):
            rdata = myram.read(i)
            if vthread.verilog.NotEql(rdata, i + 1000):
                print('rdata[%d] = %d' % (i, rdata))
                all_ok.value = False

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    fsm = th.start(16)

    return m
def mkLed():
    """Build the 'blinkled' module: element-wise add of two AXI streams.

    Input streams `axi_a` and `axi_b` are buffered into `ram_a` / `ram_b`,
    the stream `mystream` computes c = a + b into `ram_c`, and `ram_c` is
    then sent out through `axi_c`. All three RAMs are created with
    `numports=2`; the AXI-stream transfers use `port=1`.

    Register usage visible in this function (register block `saxi`):
        0 -- start flag (waited on for value 1)
        1 -- written 1 while processing and 0 afterwards ("busy")
        2 -- number of words to process

    Returns:
        The constructed Module `m`.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10

    axi_a = vthread.AXIStreamIn(m, 'axi_a', clk, rst, datawidth, with_last=True)
    axi_b = vthread.AXIStreamIn(m, 'axi_b', clk, rst, datawidth, with_last=True)
    axi_c = vthread.AXIStreamOut(m, 'axi_c', clk, rst, datawidth, with_last=True)

    saxi = vthread.AXISLiteRegister(m, 'saxi', clk, rst, datawidth)

    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth, numports=2)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth, numports=2)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth, numports=2)

    # Stream definition: one sink 'c' fed by the sum of sources 'a' and 'b'.
    strm = vthread.Stream(m, 'mystream', clk, rst)
    a = strm.source('a')
    b = strm.source('b')
    c = a + b
    strm.sink(c, 'c')

    # Bind the stream ports to the RAMs for `size` words starting at
    # `offset`, start the stream and wait for it to finish.
    def comp_stream(size, offset):
        strm.set_source('a', ram_a, offset, size)
        strm.set_source('b', ram_b, offset, size)
        strm.set_sink('c', ram_c, offset, size)
        strm.run()
        strm.join()

    # Thread body: receive both inputs, run the stream, send the result.
    def comp():
        while True:
            saxi.wait_flag(0, value=1, resetvalue=0)
            saxi.write(1, 1)  # set busy
            size = saxi.read(2)
            offset = 0
            axi_a.write_ram(ram_a, offset, size, port=1)  # blocking read
            axi_b.write_ram(ram_b, offset, size, port=1)  # blocking read
            comp_stream(size, offset)
            axi_c.read_ram(ram_c, offset, size, port=1)  # blocking write
            saxi.write(1, 0)  # unset busy

        # NOTE(review): nesting reconstructed from flattened source. Placed
        # here it follows an infinite loop with no `break`; confirm intent.
        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start()

    return m
def mkMemcpy():
    """Build the 'blinkled' module: chunked copy with an RTL processing step.

    Same copy loop as a plain staged memcpy, but between the DMA read and
    the DMA write each chunk is handed to a Verilog submodule (`inst_pe`,
    built from `pe_verilog_code`, which is defined outside this function)
    that is wired to port 1 of the two-port `ram_a`.

    Register usage visible in this function (register block `saxi`):
        0 -- start flag (waited on for value 1)
        1 -- copy_bytes
        2 -- src_offset
        3 -- dst_offset
        4 -- completion flag, set to 1 via `write_flag`

    Returns:
        The constructed Module `m`.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    led = m.OutputReg('led', 8, initval=0)

    datawidth = 32
    addrwidth = 10
    # per-chunk limit in words: (2**10) // (32 // 8) = 256
    ram_words = (2**addrwidth) // (datawidth // 8)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth, numports=2)

    maxi = vthread.AXIM(m, 'maxi', clk, rst, datawidth)
    saxi = vthread.AXISLiteRegister(m, 'saxi', clk, rst, datawidth, length=8)

    # import verilog submodule: `start`, `busy` and `size` are its control
    # signals; addr/rdata/wdata/wenable are exposed as wires for the RAM.
    start = m.Reg('start', initval=0)
    busy = m.Wire('busy')
    size = m.Reg('size', addrwidth, initval=0)

    sub = Submodule(m, pe_verilog_code, 'inst_pe', prefix='pe_',
                    arg_params=(('ADDR_WIDTH', addrwidth),
                                ('DATA_WIDTH', datawidth)),
                    arg_ports=(('CLK', clk), ('RST', rst),
                               ('start', start), ('busy', busy),
                               ('size', size)),
                    as_wire=('addr', 'rdata', 'wdata', 'wenable'))

    # connect ports to RAM
    ram_a.connect_rtl(1, sub['addr'], sub['wdata'], sub['wenable'], sub['rdata'])

    # Set the word count, pulse `start` (1 then 0) and spin until the
    # submodule drops `busy`.
    def control_processing_unit(v):
        size.value = v
        start.value = 1
        start.value = 0
        while busy:
            pass

    # Thread entry point: read the job description, run copy(), then signal
    # completion in register 4.
    def memcpy():
        while True:
            saxi.wait_flag(0, value=1, resetvalue=0)
            copy_bytes = saxi.read(1)
            src_offset = saxi.read(2)
            dst_offset = saxi.read(3)
            copy(copy_bytes, src_offset, dst_offset)
            saxi.write_flag(4, 1, resetvalue=0)

    # Copy `copy_bytes // (datawidth // 8)` words in chunks of at most
    # `ram_words`, running the submodule on every chunk while it is staged.
    def copy(copy_bytes, src_offset, dst_offset):
        rest_words = copy_bytes // (datawidth // 8)
        src_global_addr = src_offset
        dst_global_addr = dst_offset
        local_addr = 0

        while rest_words > 0:
            if rest_words > ram_words:
                dma_size = ram_words
            else:
                dma_size = rest_words

            maxi.dma_read(ram_a, local_addr, src_global_addr, dma_size)
            control_processing_unit(dma_size)
            maxi.dma_write(ram_a, local_addr, dst_global_addr, dma_size)

            # global addresses advance in bytes, the remaining count in words
            src_global_addr += dma_size * (datawidth // 8)
            dst_global_addr += dma_size * (datawidth // 8)
            rest_words -= dma_size

    th = vthread.Thread(m, 'th_memcpy', clk, rst, memcpy)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module: stream vs. sequential 2x upsampling.

    The same input is expanded twice -- once by the line-buffer stream
    `mystream`, once by the plain loop nest `comp_sequential` -- and the two
    results in `ram_dst` are compared by `check`.

    Register usage visible in this function (register block `saxi`,
    5 entries):
        0 -- start flag (waited on for value 1)
        1 -- written 0 at thread start and 1 when finished
        2 -- channel
        3 -- width
        4 -- height

    Returns:
        The constructed Module `m`.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)

    saxi_length = 5
    saxi = vthread.AXISLiteRegister(m, 'saxi', clk, rst,
                                    datawidth=datawidth, length=saxi_length)

    ram_src = vthread.RAM(m, 'ram_src', clk, rst, datawidth, addrwidth)
    ram_dummy_src = vthread.RAM(m, 'ram_dummy_src', clk, rst, datawidth, addrwidth)
    ram_dst = vthread.RAM(m, 'ram_dst', clk, rst, datawidth, addrwidth)

    strm = vthread.Stream(m, 'mystream', clk, rst)
    # NOTE(review): `dummy_src` is never used in an expression; it presumably
    # only sets the stream length via `set_source` below -- confirm.
    dummy_src = strm.source('dummy_src')

    # Nested counters: `x` steps when c == 3, `y` steps when c == 3 and
    # x == 7.
    c = strm.Counter(initval=0, size=4)
    x = strm.Counter(initval=0, size=8, enable=(c == 3))
    y = strm.Counter(initval=0, size=8, enable=((c == 3) & (x == 7)))

    # shift when both x and y are even; otherwise rotate, using
    # rotate_cond1 for odd x and rotate_cond2 for even x
    shift_cond = (x & 1 == 0) & ((y & 1) == 0)
    rotate_cond1 = (((((x & 1) == 0) & ((y & 1) == 0)) == 0) & (((x & 1) == 0) == 0))
    rotate_cond2 = (((((x & 1) == 0) & ((y & 1) == 0)) == 0) & ((x & 1) == 0))
    read_cond = shift_cond

    # a new source word is fetched only on shift cycles
    addrcounter = strm.Counter(initval=0, enable=read_cond)
    src = strm.read_RAM('ram_src', addr=addrcounter, when=read_cond, datawidth=datawidth)

    counter = strm.Counter(initval=0)
    width = strm.parameter('width')
    height = strm.parameter('height')

    linebuf = strm.LineBuffer(shape=(1, 1, 1), memlens=[4, 13],
                              head_initvals=[0, 0], tail_initvals=[3, 12],
                              data=src, shift_cond=shift_cond,
                              rotate_conds=[rotate_cond1, rotate_cond2])
    dst = linebuf.get_window(0)

    strm.sink(dst, 'dst')

    # add a stall condition driven by a free-running 4-bit counter
    count = m.Reg('count', 4, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(count.inc())

    util.add_disable_cond(strm.oready, 1, count == 0)

    # Configure and run the stream for an output of
    # channel * width * height * 2 * 2 words.
    def comp_stream(channel, width, height, offset):
        strm.set_source('dummy_src', ram_dummy_src, offset, channel * width * height * 2 * 2)
        strm.set_read_RAM('ram_src', ram_src)
        strm.set_sink('dst', ram_dst, offset, channel * width * height * 2 * 2)
        strm.set_parameter('width', width)
        strm.set_parameter('height', height)
        strm.run()
        strm.join()

    # Reference implementation: each output element (c, xx, yy) copies the
    # input element (c, xx // 2, yy // 2).
    def comp_sequential(channel, width, height, roffset, woffset):
        for yy in range(height * 2):
            for xx in range(width * 2):
                for c in range(channel):
                    # f(c, x, y) = in(c, x/2, y/2);
                    src_i = (xx // 2) * channel + (yy // 2) * width * channel + c
                    dst_i = xx * channel + yy * width * 2 * channel + c
                    val = ram_src.read(roffset + src_i)
                    ram_dst.write(woffset + dst_i, val)

    # Compare `size` words of the stream result with the sequential result,
    # both held in `ram_dst`, and print the verdict.
    def check(offset_stream, offset_seq, size):
        all_ok = True
        for i in range(size):
            st = ram_dst.read(offset_stream + i)
            sq = ram_dst.read(offset_seq + i)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    # Thread body (runs once, no loop).
    def comp():
        saxi.write(addr=1, value=0)
        saxi.wait_flag(0, value=1, resetvalue=0)

        channel = saxi.read(2)
        width = saxi.read(3)
        height = saxi.read(4)

        insize = channel * width * height
        outsize = channel * width * 2 * height * 2

        # stream pass: input at RAM offset 0, result written out at 1024
        roffset = 0
        woffset = 0
        myaxi.dma_read(ram_src, roffset, 0, insize)
        comp_stream(channel, width, height, roffset)
        myaxi.dma_write(ram_dst, woffset, 1024, outsize)

        # sequential pass: placed right after the stream data in both RAMs
        roffset = insize
        woffset = outsize
        myaxi.dma_read(ram_src, roffset, 0, insize)
        comp_sequential(channel, width, height, roffset, woffset)
        myaxi.dma_write(ram_dst, woffset, 2 * 1024, outsize)

        check(0, woffset, outsize)

        saxi.write(addr=1, value=1)

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module: sequential matrix multiply (RAM-side DMA).

    The DMA transfers are issued through the RAM objects
    (`ram_x.dma_read(maxi, ...)` / `ram_c.dma_write(maxi, ...)`).

    Register usage visible in this function (register block `saxi`):
        0 -- start flag (waited on for value 1)
        1 -- matrix_size
        2 -- a_offset
        3 -- b_offset
        4 -- c_offset
        5 -- completion flag, set to 1 via `write_flag`

    Returns:
        The constructed Module `m`.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    led = m.OutputReg('led', 8, initval=0)

    datawidth = 32
    addrwidth = 10
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    maxi = vthread.AXIM(m, 'maxi', clk, rst, datawidth)
    saxi = vthread.AXISLiteRegister(m, 'saxi', clk, rst, datawidth, length=8)

    # Thread entry point: read the job description, run comp(), then signal
    # completion in register 5.
    def matmul():
        while True:
            saxi.wait_flag(0, value=1, resetvalue=0)
            matrix_size = saxi.read(1)
            a_offset = saxi.read(2)
            b_offset = saxi.read(3)
            c_offset = saxi.read(4)
            comp(matrix_size, a_offset, b_offset, c_offset)
            # on-device verification is left disabled:
            # check(matrix_size, a_offset, b_offset, c_offset)
            saxi.write_flag(5, 1, resetvalue=0)

    # For each of `matrix_size` rows fetched from a_addr, compute the dot
    # product with each of `matrix_size` rows fetched from b_addr and write
    # the resulting row to c_addr.
    def comp(matrix_size, a_offset, b_offset, c_offset):
        a_addr, c_addr = a_offset, c_offset

        for i in range(matrix_size):
            ram_a.dma_read(maxi, 0, a_addr, matrix_size)

            # B is re-read from its start for every row of A
            b_addr = b_offset
            for j in range(matrix_size):
                ram_b.dma_read(maxi, 0, b_addr, matrix_size)

                sum = 0
                for k in range(matrix_size):
                    x = ram_a.read(k)
                    y = ram_b.read(k)
                    sum += x * y
                ram_c.write(j, sum)

                # advance by one row: matrix_size words of datawidth // 8 bytes
                b_addr += matrix_size * (datawidth // 8)

            ram_c.dma_write(maxi, 0, c_addr, matrix_size)
            a_addr += matrix_size * (datawidth // 8)
            c_addr += matrix_size * (datawidth // 8)

    # Not called in the visible code. Reads the result back row by row and
    # expects (i + 1) * 2 on the diagonal and 0 elsewhere; the outcome is
    # shown on `led` and printed.
    def check(matrix_size, a_offset, b_offset, c_offset):
        all_ok = True
        c_addr = c_offset
        for i in range(matrix_size):
            ram_c.dma_read(maxi, 0, c_addr, matrix_size)
            for j in range(matrix_size):
                v = ram_c.read(j)
                if i == j and v != (i + 1) * 2:
                    all_ok = False
                    print("NG [%d,%d] = %d" % (i, j, v))
                if i != j and v != 0:
                    all_ok = False
                    print("NG [%d,%d] = %d" % (i, j, v))
            c_addr += matrix_size * (datawidth // 8)

        if all_ok:
            led.value = 0b01010101
            print("OK")
        else:
            led.value = 0x0f
            print("NG")

    th = vthread.Thread(m, 'th_matmul', clk, rst, matmul)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module: stream vs. sequential 2x upsampling (2-D).

    The same input is expanded twice -- once by the line-buffer stream
    `mystream`, once by the plain loop nest `comp_sequential` -- and the two
    results in `ram_dst` are compared by `check`.

    Register usage visible in this function (register block `saxi`,
    4 entries):
        0 -- start flag (waited on for value 1)
        1 -- written 0 at thread start and 1 when finished
        2 -- width
        3 -- height

    Returns:
        The constructed Module `m`.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)

    saxi_length = 4
    saxi = vthread.AXISLiteRegister(m, 'saxi', clk, rst,
                                    datawidth=datawidth, length=saxi_length)

    ram_src = vthread.RAM(m, 'ram_src', clk, rst, datawidth, addrwidth)
    ram_dummy_src = vthread.RAM(m, 'ram_dummy_src', clk, rst, datawidth, addrwidth)
    ram_dst = vthread.RAM(m, 'ram_dst', clk, rst, datawidth, addrwidth)

    strm = vthread.Stream(m, 'mystream', clk, rst)
    # NOTE(review): `dummy_src` is never used in an expression; it presumably
    # only sets the stream length via `set_source` below -- confirm.
    dummy_src = strm.source('dummy_src')

    # `y` steps when x == 7
    x = strm.Counter(initval=0, size=8)
    y = strm.Counter(initval=0, size=8, enable=(x == 7))

    # shift when both x and y are even; rotate when not shifting and x is even
    shift_cond = ((x & 1 == 0) & (y & 1 == 0))
    rotate_cond = ((shift_cond == 0) & (x & 1 == 0))
    read_cond = shift_cond

    # a new source word is fetched only on shift cycles
    addrcounter = strm.Counter(initval=0, enable=read_cond)
    src = strm.read_RAM('ram_src', addr=addrcounter, when=read_cond, datawidth=datawidth)

    counter = strm.Counter(initval=0)
    width = strm.parameter('width')
    height = strm.parameter('height')

    linebuf = strm.LineBuffer(shape=(1, 1), memlens=[4],
                              head_initvals=[0], tail_initvals=[3],
                              data=src, shift_cond=shift_cond,
                              rotate_conds=[rotate_cond])
    dst = linebuf.get_window(0)

    strm.sink(dst, 'dst')

    # Configure and run the stream for an output of
    # width * height * 2 * 2 words.
    def comp_stream(width, height, offset):
        strm.set_source('dummy_src', ram_dummy_src, offset, width * height * 2 * 2)
        strm.set_read_RAM('ram_src', ram_src)
        strm.set_sink('dst', ram_dst, offset, width * height * 2 * 2)
        strm.set_parameter('width', width)
        strm.set_parameter('height', height)
        strm.run()
        strm.join()

    # Reference implementation: output element (x, y) copies input element
    # (x // 2, y // 2).
    def comp_sequential(width, height, roffset, woffset):
        for y in range(height * 2):
            for x in range(width * 2):
                src_i = x // 2 + (y // 2) * width
                dst_i = x + y * width * 2
                val = ram_src.read(roffset + src_i)
                ram_dst.write(woffset + dst_i, val)

    # Compare `size` words of the stream result with the sequential result,
    # both held in `ram_dst`, and print the verdict.
    def check(offset_stream, offset_seq, size):
        all_ok = True
        for i in range(size):
            st = ram_dst.read(offset_stream + i)
            sq = ram_dst.read(offset_seq + i)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    # Thread body (runs once, no loop).
    def comp():
        saxi.write(addr=1, value=0)
        saxi.wait_flag(0, value=1, resetvalue=0)

        width = saxi.read(2)
        height = saxi.read(3)

        in_size = width * height
        out_size = width * height * 2 * 2

        # stream pass: input at RAM offset 0, result written out at 1024
        roffset = 0
        woffset = 0
        myaxi.dma_read(ram_src, roffset, 0, in_size)
        comp_stream(width, height, roffset)
        myaxi.dma_write(ram_dst, woffset, 1024, out_size)

        # sequential pass: placed right after the stream data in both RAMs
        roffset = in_size
        woffset = out_size
        myaxi.dma_read(ram_src, roffset, 0, in_size)
        comp_sequential(width, height, roffset, woffset)
        myaxi.dma_write(ram_dst, woffset, 2 * 1024, out_size)

        check(0, woffset, out_size)

        saxi.write(addr=1, value=1)

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module: FIFO-chained reduce and bias streams.

    Data path visible in this function:
        axi_in -> fifo_a -> strm0 (reduce-add of a * a) -> fifo_b
        fifo_b + ram_b (loaded by DMA) -> strm1 (x + y) -> fifo_c -> axi_out

    Register usage visible in this function (register block `saxi`,
    8 entries):
        0 -- start flag (waited on for value 1)
        1 -- written 1 while processing and 0 afterwards ("busy")
        2 -- read_size
        3 -- write_size
        4 -- reduce_size
        5 -- bias_addr (global address passed to `maxi.dma_read`)

    Returns:
        The constructed Module `m`.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10

    # the AXI master is used only for reads in this function
    maxi = vthread.AXIM(m, 'maxi', clk, rst, datawidth)
    maxi.disable_write()

    saxi = vthread.AXISLiteRegister(m, 'saxi', clk, rst, datawidth=32, length=8)

    axi_in = vthread.AXIStreamInFifo(m, 'axi_in', clk, rst, datawidth, with_last=True)
    axi_out = vthread.AXIStreamOutFifo(m, 'axi_out', clk, rst, datawidth, with_last=True)

    fifo_addrwidth = 8
    fifo_a = vthread.FIFO(m, 'fifo_a', clk, rst, datawidth, fifo_addrwidth)
    fifo_b = vthread.FIFO(m, 'fifo_b', clk, rst, datawidth, fifo_addrwidth)
    fifo_c = vthread.FIFO(m, 'fifo_c', clk, rst, datawidth, fifo_addrwidth)

    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)

    # Stream 0: square each input and reduce-add over `reduce_size` items;
    # the sink is written only when `sum_valid` is asserted.
    strm0 = vthread.Stream(m, 'mystream_reduce', clk, rst)
    a = strm0.source('a')
    reduce_size = strm0.parameter('reduce_size')
    v = a * a
    sum, sum_valid = strm0.ReduceAddValid(v, reduce_size)
    strm0.sink(sum, 'sum', when=sum_valid, when_name='sum_valid')

    # Stream 1: element-wise add of sources 'x' and 'y'.
    strm1 = vthread.Stream(m, 'mystream_bias', clk, rst)
    x = strm1.source('x')
    y = strm1.source('y')
    z = x + y
    strm1.sink(z, 'z')

    # Thread body.
    def comp():
        while True:
            saxi.wait_flag(0, value=1, resetvalue=0)
            saxi.write(1, 1)  # set busy

            read_size = saxi.read(2)
            write_size = saxi.read(3)
            reduce_size = saxi.read(4)
            bias_addr = saxi.read(5)

            # clamp non-positive sizes to 1
            if read_size <= 0:
                read_size = 1
            if write_size <= 0:
                write_size = 1
            if reduce_size <= 0:
                reduce_size = 1

            maxi.dma_read(ram_b, 0, bias_addr, write_size)

            # issued before the streams run; presumably non-blocking so the
            # FIFOs are filled and drained concurrently -- confirm
            axi_in.write_fifo(fifo_a, read_size)
            axi_out.read_fifo(fifo_c, write_size)

            strm0.set_source_fifo('a', fifo_a, read_size)
            strm0.set_parameter('reduce_size', reduce_size)
            strm0.set_sink_fifo('sum', fifo_b, write_size)

            strm1.set_source_fifo('x', fifo_b, write_size)
            strm1.set_source('y', ram_b, 0, write_size)
            strm1.set_sink_fifo('z', fifo_c, write_size)

            # both streams are started before either is joined
            strm0.run()
            strm1.run()

            strm0.join()
            strm1.join()

            saxi.write(1, 0)  # unset busy

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start()

    return m
def mkLed(axi_datawidth=32, datawidth=4, addrwidth=10):
    """Build the 'blinkled' module: DMA self-test on a multibank RAM.

    The thread reads `size` words into `myram`, checks them against an
    expected pattern, overwrites them, writes them out and reads them back
    for a second check. The overall result is accumulated in `all_ok`.

    Register usage visible in this function (register block `saxi`):
        0 -- start flag (waited on for value 1)
        1 -- done: cleared with `write`, then set to 1 via `write_flag`
        2 -- result (`all_ok`)

    Args:
        axi_datawidth: data width passed to `vthread.AXIM`.
        datawidth: data width of each RAM bank; must be below 8.
        addrwidth: address width passed to `vthread.MultibankRAM`.

    Returns:
        The constructed Module `m`.

    Raises:
        ValueError: if `datawidth` is 8 or more.
    """
    if datawidth >= 8:
        raise ValueError('not supported.')

    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    # enough banks of `datawidth` bits to cover one AXI word
    numbanks = int(math.ceil(axi_datawidth / datawidth))
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, axi_datawidth)
    myram = vthread.MultibankRAM(m, 'myram', clk, rst, datawidth, addrwidth,
                                 numbanks=numbanks)

    saxi = vthread.AXISLiteRegister(m, 'saxi', clk, rst, 32)

    all_ok = m.TmpReg(initval=0)

    # Thread entry point; `size` is the word count supplied by `th.start(32)`.
    def blink(size):
        # wait start
        saxi.wait_flag(0, value=1, resetvalue=0)

        # reset done
        saxi.write(1, 0)

        all_ok.value = True

        # Test for 4KB boundary check: the offset sits
        # `myaxi.boundary_size - 4` past a 16 KiB base, so the transfer
        # presumably straddles an AXI boundary -- confirm.
        offset = 1024 * 16 + (myaxi.boundary_size - 4)
        body(size, offset)

        if all_ok:
            print('# verify (local): PASSED')
        else:
            print('# verify (local): FAILED')

        # result
        saxi.write(2, all_ok)

        # done
        saxi.write_flag(1, 1, resetvalue=0)

    # One test round; clears `all_ok` on any mismatch.
    def body(size, offset):
        # read and modify
        laddr = 0
        gaddr = offset
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read: [%d] <- [%d]' % (laddr, gaddr))

        for i in range(size):
            # keep only the low `datawidth` bits of the word read
            rdata = myram.read(i) & (2**datawidth - 1)
            # expected initial memory content; the pattern itself is
            # presumably produced by the test bench -- confirm
            verify = (offset * 8 // datawidth + i) % (2**datawidth - 1) + 1
            wdata = (verify + 1000) % (2**datawidth - 1)
            myram.write(i, wdata)
            if vthread.verilog.NotEql(rdata, verify):
                print('rdata[%d] = %d (!= %d)' % (i, rdata, verify))
                all_ok.value = False

        # write
        laddr = 0
        gaddr = offset
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # read (verify)
        laddr = 0
        gaddr = offset
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read: [%d] <- [%d]' % (laddr, gaddr))

        for i in range(size):
            rdata = myram.read(i) & (2**datawidth - 1)
            # same value as `wdata` computed in the first loop
            verify = (((offset * 8 // datawidth + i) % (2**datawidth - 1) + 1 + 1000) % (2**datawidth - 1))
            if vthread.verilog.NotEql(rdata, verify):
                print('rdata[%d] = %d (!= %d)' % (i, rdata, verify))
                all_ok.value = False

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    fsm = th.start(32)

    return m
def mkMemcpy():
    """Build the 'blinkled' module: chunked copy with an embedded-Verilog tap.

    Besides the staged copy through `ram_a`, a block of raw Verilog is
    embedded that accumulates every word written on interface 0 of `ram_a`
    into `sum` and copies `sum` to `led`.

    Register usage visible in this function (register block `saxi`):
        0 -- start flag (waited on for value 1)
        1 -- copy_bytes
        2 -- src_offset
        3 -- dst_offset
        4 -- completion flag, set to 1 via `write_flag`

    Returns:
        The constructed Module `m`.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    led = m.OutputReg('led', 8, initval=0)

    datawidth = 32
    addrwidth = 10
    # per-chunk limit in words: (2 ** 10) // (32 // 8) = 256
    ram_words = (2 ** addrwidth) // (datawidth // 8)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)

    maxi = vthread.AXIM(m, 'maxi', clk, rst, datawidth)
    saxi = vthread.AXISLiteRegister(m, 'saxi', clk, rst, datawidth, length=8)

    # Raw Verilog; `{interface.wenable}` and `{interface.wdata}` are filled
    # in by `str.format` from interface 0 of `ram_a`. The literal is kept on
    # one line exactly as it appears in the source.
    code = m.EmbeddedCode(""" reg [31:0] sum; always @(posedge CLK) begin if(RST) begin sum <= 0; led <= 0; end else begin if({interface.wenable}) begin sum <= sum + {interface.wdata}; end led <= sum; end end """.format(interface=ram_a.interfaces[0]))

    # Thread entry point: read the job description, run copy(), then signal
    # completion in register 4.
    def memcpy():
        while True:
            saxi.wait_flag(0, value=1, resetvalue=0)
            copy_bytes = saxi.read(1)
            src_offset = saxi.read(2)
            dst_offset = saxi.read(3)
            copy(copy_bytes, src_offset, dst_offset)
            saxi.write_flag(4, 1, resetvalue=0)

    # Copy `copy_bytes // (datawidth // 8)` words in chunks of at most
    # `ram_words`; any remainder bytes are dropped by the floor division.
    def copy(copy_bytes, src_offset, dst_offset):
        rest_words = copy_bytes // (datawidth // 8)
        src_global_addr = src_offset
        dst_global_addr = dst_offset
        local_addr = 0

        while rest_words > 0:
            if rest_words > ram_words:
                dma_size = ram_words
            else:
                dma_size = rest_words

            maxi.dma_read(ram_a, local_addr, src_global_addr, dma_size)
            maxi.dma_write(ram_a, local_addr, dst_global_addr, dma_size)

            # global addresses advance in bytes, the remaining count in words
            src_global_addr += dma_size * (datawidth // 8)
            dst_global_addr += dma_size * (datawidth // 8)
            rest_words -= dma_size

    th = vthread.Thread(m, 'th_memcpy', clk, rst, memcpy)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module: 3x3 line-buffer stream vs. sequential model.

    The stream `mystream` feeds `src` through a 3x3 `LineBuffer` that either
    shifts in new data or rotates its contents depending on a running
    counter. `comp_sequential` models the same buffer with three helper RAMs
    and nine scalar window variables. `check` compares both results.

    Register usage visible in this function (register block `saxi`,
    4 entries):
        0 -- start flag (waited on for value 1)
        1 -- written 0 at thread start and 1 when finished
        2 -- width
        3 -- height

    Returns:
        The constructed Module `m`.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)

    saxi_length = 4
    saxi = vthread.AXISLiteRegister(m, 'saxi', clk, rst,
                                    datawidth=datawidth, length=saxi_length)

    ram_src = vthread.RAM(m, 'ram_src', clk, rst, datawidth, addrwidth)
    ram_dst = vthread.RAM(m, 'ram_dst', clk, rst, datawidth, addrwidth)

    strm = vthread.Stream(m, 'mystream', clk, rst)
    src = strm.source('src')
    counter = strm.Counter(initval=0)
    width = strm.parameter('width')
    height = strm.parameter('height')

    # Schedule by counter value:
    #   0..19  -- shift (x20)
    #   20..29 -- rotate (x10)
    #   30..   -- shift on even counts, rotate on odd counts (x34)
    shift_cond = strm.Or((counter < 20), ((counter >= 30) & (counter & 1 == 0)))
    rotate_cond = strm.Or(((counter >= 20) & (counter < 30)), ((counter >= 30) & (counter & 1 == 1)))

    linebuf = strm.LineBuffer(shape=(3, 3), memlens=[4], data=src,
                              head_initvals=[0], tail_initvals=[3],
                              shift_cond=shift_cond, rotate_conds=[rotate_cond])
    # collect the nine window taps in row-major order
    window = [None] * 9
    for y in range(3):
        for x in range(3):
            window[y * 3 + x] = linebuf.get_window(y * 3 + x)

    # The window registers contain invalid values at the beginning because
    # the initial contents of the shift memory are undefined.
    # Do not output the sum until all window registers hold valid values.
    # NOTE(review): for the first 20 items this emits window[8], while
    # comp_sequential below writes window_0 -- confirm that the LineBuffer
    # window index ordering makes the two equivalent.
    dst = strm.Mux(counter < 20, window[8], strm.AddN(*window))
    strm.sink(dst, 'dst')

    # add a stall condition driven by a free-running 4-bit counter
    count = m.Reg('count', 4, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(count.inc())

    util.add_disable_cond(strm.oready, 1, count == 0)

    # helper RAMs used only by the sequential model
    ram_bufs = [
        vthread.RAM(m, 'ram_buf' + str(i), clk, rst, datawidth, addrwidth)
        for i in range(3)
    ]

    # Configure and run the stream over width * height words.
    def comp_stream(width, height, offset):
        strm.set_source('src', ram_src, offset, width * height)
        strm.set_sink('dst', ram_dst, offset, width * height)
        strm.set_parameter('width', width)
        strm.set_parameter('height', height)
        strm.run()
        strm.join()

    # Sequential model of the line buffer. `head` / `tail` are read / write
    # positions that wrap from 3 back to 0; window_0..window_8 are the nine
    # window taps.
    def comp_sequential(width, height, offset):
        head = 0
        tail = 3
        window_0 = window_1 = window_2 = 0
        window_3 = window_4 = window_5 = 0
        window_6 = window_7 = window_8 = 0

        for i in range(width * height):
            src = ram_src.read(offset + i)
            # same schedule as shift_cond / rotate_cond above
            shift = ((i < 20) or ((i >= 30) and (i & 1 == 0)))
            rotate = (((i >= 20) and (i < 30)) or ((i >= 30) and (i & 1 == 1)))

            if shift:
                # each row takes its new element from the previous row's
                # buffer; the first row takes the fresh source word
                ram_bufs[2].write(tail, window_8)
                window_8 = window_7
                window_7 = window_6
                window_6 = ram_bufs[1].read(head)
                ram_bufs[1].write(tail, window_5)
                window_5 = window_4
                window_4 = window_3
                window_3 = ram_bufs[0].read(head)
                ram_bufs[0].write(tail, window_2)
                window_2 = window_1
                window_1 = window_0
                window_0 = src
                head = head + 1 if head < 3 else 0
                tail = tail + 1 if tail < 3 else 0
            elif rotate:
                # each row recirculates through its own buffer; `src` is
                # not consumed
                ram_bufs[2].write(tail, window_8)
                window_8 = window_7
                window_7 = window_6
                window_6 = ram_bufs[2].read(head)
                ram_bufs[1].write(tail, window_5)
                window_5 = window_4
                window_4 = window_3
                window_3 = ram_bufs[1].read(head)
                ram_bufs[0].write(tail, window_2)
                window_2 = window_1
                window_1 = window_0
                window_0 = ram_bufs[0].read(head)
                head = head + 1 if head < 3 else 0
                tail = tail + 1 if tail < 3 else 0

            sum = window_0 + window_1 + window_2 + window_3 + \
                window_4 + window_5 + window_6 + window_7 + window_8

            if i < 20:
                ram_dst.write(offset + i, window_0)
            else:
                ram_dst.write(offset + i, sum)

    # Compare `size` words of the stream result with the sequential result,
    # both held in `ram_dst`, and print the verdict.
    def check(offset_stream, offset_seq, size):
        all_ok = True
        for i in range(size):
            st = ram_dst.read(offset_stream + i)
            sq = ram_dst.read(offset_seq + i)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    # Thread body (runs once, no loop).
    def comp():
        saxi.write(addr=1, value=0)
        saxi.wait_flag(0, value=1, resetvalue=0)

        width = saxi.read(2)
        height = saxi.read(3)
        size = width * height

        # stream pass: RAM offset 0, result written out at 1024
        offset = 0
        myaxi.dma_read(ram_src, offset, 0, size)
        comp_stream(width, height, offset)
        myaxi.dma_write(ram_dst, offset, 1024, size)

        # sequential pass: RAM offset `size`, result written out at 2048
        offset = size
        myaxi.dma_read(ram_src, offset, 0, size)
        comp_sequential(width, height, offset)
        myaxi.dma_write(ram_dst, offset, 2 * 1024, size)

        check(0, offset, size)

        saxi.write(addr=1, value=1)

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module: matrix multiply with a stream dot product.

    The inner dot product is computed by the stream `strm_madd`
    (reduce-add of a * b) instead of a software loop.

    Register usage visible in this function (register block `saxi`,
    8 entries):
        0 -- start flag (waited on for value 1)
        1 -- matrix_size
        2 -- a_offset
        3 -- b_offset
        4 -- c_offset
        5 -- completion flag, set to 1 via `write_flag`

    NOTE(review): `datawidth` is used but never assigned inside this
    function. It must come from an enclosing or module scope that is not
    visible here -- confirm, otherwise the first use raises NameError.

    Returns:
        The constructed Module `m`.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    addrwidth = 10
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    maxi = vthread.AXIM(m, 'maxi', clk, rst, datawidth)
    saxi = vthread.AXISLiteRegister(m, 'saxi', clk, rst, datawidth, length=8)

    # Stream definition: multiply sources 'a' and 'b', reduce-add over
    # `size` items, and write the sink only when `sum_valid` is asserted.
    strm = vthread.Stream(m, 'strm_madd', clk, rst)
    a = strm.source('a')
    b = strm.source('b')
    size = strm.parameter('size')
    sum, sum_valid = strm.ReduceAddValid(a * b, size)
    strm.sink(sum, 'sum', when=sum_valid, when_name='sum_valid')

    # Run one dot product of `size` words from ram_a and ram_b and store the
    # single result word at `waddr` in ram_c.
    def strm_madd(size, waddr):
        strm.set_source('a', ram_a, 0, size)
        strm.set_source('b', ram_b, 0, size)
        strm.set_parameter('size', size)
        strm.set_sink('sum', ram_c, waddr, 1)
        strm.run()
        strm.join()

    # Thread entry point: read the job description, run comp(), then signal
    # completion in register 5.
    def matmul():
        while True:
            saxi.wait_flag(0, value=1, resetvalue=0)
            matrix_size = saxi.read(1)
            a_offset = saxi.read(2)
            b_offset = saxi.read(3)
            c_offset = saxi.read(4)
            comp(matrix_size, a_offset, b_offset, c_offset)
            saxi.write_flag(5, 1, resetvalue=0)

    # For each of `matrix_size` rows fetched from a_addr, compute the dot
    # product with each of `matrix_size` rows fetched from b_addr and write
    # the resulting row to c_addr.
    def comp(matrix_size, a_offset, b_offset, c_offset):
        a_addr, c_addr = a_offset, c_offset

        for i in range(matrix_size):
            maxi.dma_read(ram_a, 0, a_addr, matrix_size)

            # B is re-read from its start for every row of A
            b_addr = b_offset
            for j in range(matrix_size):
                maxi.dma_read(ram_b, 0, b_addr, matrix_size)

                strm_madd(matrix_size, j)

                # advance by one row: matrix_size words of datawidth // 8 bytes
                b_addr += matrix_size * (datawidth // 8)

            maxi.dma_write(ram_c, 0, c_addr, matrix_size)
            a_addr += matrix_size * (datawidth // 8)
            c_addr += matrix_size * (datawidth // 8)

    th = vthread.Thread(m, 'th_matmul', clk, rst, matmul)
    fsm = th.start()

    return m