Example #1
0
def mkLed(numthreads=8):
    """Build the 'blinkled' module where pooled threads contend for a mutex.

    A controller thread (``th_blink``) starts ``numthreads`` instances of
    ``myfunc`` through a ``vthread.ThreadPool`` and then joins each one.
    Every worker acquires ``mymutex`` by repeatedly calling ``try_lock()``
    and reports how many attempts failed before it succeeded.

    Args:
        numthreads: Number of pool threads to run and join (default 8).

    Returns:
        The constructed ``Module`` object.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    # Single mutex shared by every worker thread.
    mymutex = vthread.Mutex(m, 'mymutex', clk, rst)

    def myfunc(tid):
        # Worker body: poll try_lock() until it yields a truthy value,
        # hold the lock across a delay loop, then release it.
        print("-- Thread %d TryLock" % tid)
        lock = mymutex.try_lock()
        waitcount = 0  # number of failed try_lock() attempts so far
        while not lock:
            print("-- Thread %d TryLock" % tid)
            waitcount += 1
            lock = mymutex.try_lock()

        print("Thread %d Lock: waitcount=%d" % (tid, waitcount))

        # Empty loop executed while the lock is held (marked "sleep").
        for i in range(20):
            pass  # sleep

        print("Thread %d Hello" % tid)

        mymutex.unlock()
        print("Thread %d Unlock" % tid)

    def blink():
        # Controller body: launch every pool slot, passing the slot index
        # as the worker's tid, then wait for all of them.
        # `pool` is bound below, before th.start() is called.
        for tid in range(numthreads):
            pool.run(tid, tid)

        for tid in range(numthreads):
            pool.join(tid)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    pool = vthread.ThreadPool(m, 'th_myfunc', clk, rst, myfunc, numthreads)
    # The value returned by start() is kept in `fsm` but not used afterwards.
    fsm = th.start()

    return m
Example #2
0
def mkLed():
    """Build the 'blinkled' module exercising bit indexing and slicing.

    The thread function drives the 8-bit ``LED`` output register from the
    8-bit ``count`` register in three passes: bit-by-bit through an index,
    through the slice ``count[0:2]``, and through the stepped slice
    ``count[0:8:2]``. The thread is started with ``times=10``.

    Returns:
        The constructed ``Module`` object.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    led = m.OutputReg('LED', 8, initval=0)

    count = m.Reg('count', 8, initval=0)

    def blink(times):
        # Pass 1: copy count into led one bit at a time.
        led.value = 0
        count.value = 0
        for i in range(times):
            for x in range(8):
                # pointer
                led.value[x] = count[x]
            print("led = ", led)
            count.value += 1

        # Pass 2: assign a plain slice of count to led.
        led.value = 0
        count.value = 0
        for i in range(times):
            # slice
            led.value = count[0:2]
            print("led = ", led)
            count.value += 1

        # Pass 3: assign a slice with a step of 2 to led.
        led.value = 0
        count.value = 0
        for i in range(times):
            # slice with step
            led.value = count[0:8:2]
            print("led = ", led)
            count.value += 1

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # 10 is passed through as the `times` argument of blink.
    fsm = th.start(10)

    return m
Example #3
0
def mkLed():
    """Build the 'blinkled' module mixing a Seq-driven counter with a thread.

    ``count`` is updated through a ``Seq`` object (``count.inc()``), while
    the thread function repeatedly assigns ``count + global_value`` to the
    ``LED`` output register. ``global_value`` is not defined inside this
    function; it must be available from an enclosing/module scope.

    Returns:
        The constructed ``Module`` object.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    led = m.OutputReg('LED', 8, initval=0)

    count = m.Reg('count', 8, initval=0)

    # Register the increment of `count` with the sequential block.
    # NOTE(review): presumably this increments once per clock -- confirm.
    seq = Seq(m, 'seq', clk, rst)
    seq(
        count.inc()
    )

    def blink(times):
        led.value = 0
        for i in range(times):
            # `global_value` comes from outside this function.
            led.value = count + global_value
            print("led = ", led)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # 10 is passed through as the `times` argument of blink.
    fsm = th.start(10)

    return m
Example #4
0
def mkLed():
    """Build the 'blinkled' module using fixed-point constants in a thread.

    The thread accumulates ``FixedConst(0.5, 8)`` into ``next_val`` on each
    iteration and writes the accumulator's ``int_part`` to ``led``. A
    separate fixed-point register ``count`` is incremented through a
    ``Seq`` object but is not read by the thread.

    Returns:
        The constructed ``Module`` object.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    # Note: `led` is an internal Reg here, not an output port.
    led = m.Reg('LED', 8, initval=0)

    # 8-bit fixed-point register created with point=3.
    count = fx.FixedReg(m, 'count', 8, point=3, initval=0)

    seq = Seq(m, 'seq', clk, rst)
    seq(count.inc())

    def blink(times):
        led.value = 0
        # Accumulator starts at fixed-point zero (second argument is 8).
        next_val = vthread.fixed.FixedConst(0, 8)
        for i in range(times):
            next_val = next_val + vthread.fixed.FixedConst(0.5, 8)
            # Only the integer part of the accumulator is written to led.
            led.value = next_val.int_part
            print("led = ", led)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # 10 is passed through as the `times` argument of blink.
    fsm = th.start(10)

    return m
Example #5
0
def mkLed():
    """Build the 'blinkled' module that reads and updates a ``FixedRAM``.

    The RAM is created with ``point=8`` and initial values
    ``i * 0.5 + 10`` for ``i`` in ``range(2 ** addrwidth - 100)``. The
    thread reads the first ``times`` entries, rewrites each one with 0.25
    added, then reads them again while accumulating a sum.

    Returns:
        The constructed ``Module`` object.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    numports = 1
    # Initial contents cover all but the last 100 addresses of the RAM.
    initvals = [i * 0.5 + 10 for i in range(2 ** addrwidth - 100)]
    myram = vthread.FixedRAM(m, 'myram', clk, rst, datawidth, addrwidth,
                             point=8, numports=numports, initvals=initvals)

    def blink(times):
        # Phase 1: dump the initial contents.
        for i in range(times):
            rdata = myram.read(i)
            print('rdata = %f' % rdata)

        # Phase 2: read-modify-write, adding a fixed-point 0.25.
        for i in range(times):
            rdata = myram.read(i)

            b = vthread.fixed.FixedConst(0.25, 8)
            wdata = rdata + b
            myram.write(i, wdata)
            print('wdata = %f' % wdata)

        # Phase 3: read back and accumulate.
        # Note: `sum` shadows the Python builtin inside this function.
        sum = vthread.fixed.FixedConst(0, 8)
        for i in range(times):
            rdata = myram.read(i)
            print('rdata = %f' % rdata)
            sum += rdata

        print('sum = %f' % sum)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # 10 is passed through as the `times` argument of blink.
    fsm = th.start(10)

    return m
def mkLed():
    """Build the 'blinkled' module that accesses a ``MultibankRAM`` per bank.

    The thread writes an incrementing value to address ``i`` of every bank
    for ``i`` in ``range(times)``, then reads every location back through
    ``read_bank`` and prints the accumulated sum.

    Returns:
        The constructed ``Module`` object.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    numbanks = 4
    myram = vthread.MultibankRAM(m,
                                 'myram',
                                 clk,
                                 rst,
                                 datawidth,
                                 addrwidth,
                                 numbanks=numbanks)

    def blink(times):
        # Write phase: wdata increases by one for every (address, bank) pair.
        wdata = 0
        for i in range(times):
            for b in range(numbanks):
                myram.write_bank(b, i, wdata)
                print('bank:%d wdata = %d' % (b, wdata))
                wdata += 1

        # Read phase: visit the same (address, bank) pairs and sum the data.
        # Note: `sum` shadows the Python builtin inside this function.
        sum = 0
        for i in range(times):
            for b in range(numbanks):
                rdata = myram.read_bank(b, i)
                sum += rdata
                print('bank:%d rdata = %d' % (b, rdata))

        print('sum = %d' % sum)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # 10 is passed through as the `times` argument of blink.
    fsm = th.start(10)

    return m
Example #7
0
def mkLed(numthreads=8):
    """Build the 'blinkled' module where pooled threads update a shared counter.

    ``count`` wraps a 32-bit register in ``vthread.Shared``; its value is
    assigned to the ``LED`` output. Each worker locks the shared object,
    writes ``count.value + 1`` and unlocks. The controller thread zeroes
    the counter, runs and joins all workers, and prints the final value.

    Args:
        numthreads: Number of pool threads to run and join (default 8).

    Returns:
        The constructed ``Module`` object.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    led = m.Output('LED', 8)

    count = vthread.Shared(m.Reg('count', 32, initval=0))
    # LED is 8 bits wide while the counter register is 32 bits wide.
    led.assign(count.value)

    def myfunc(tid):
        # Worker body: increment the shared counter under its lock.
        count.lock()
        print("Thread %d Lock" % tid)

        # Empty loop executed while the lock is held (marked "sleep").
        for i in range(20):
            pass  # sleep

        count.write(count.value + 1)
        print("Thread %d count = %d" % (tid, count.value))

        count.unlock()
        print("Thread %d Unlock" % tid)

    def blink():
        # Controller body: reset the counter, launch every pool slot with
        # its index as tid, wait for all of them, then report the result.
        # `pool` is bound below, before th.start() is called.
        count.write(0)
        for tid in range(numthreads):
            pool.run(tid, tid)

        for tid in range(numthreads):
            pool.join(tid)

        print("result count = %d" % count.value)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    pool = vthread.ThreadPool(m, 'th_myfunc', clk, rst, myfunc, numthreads)
    # The value returned by start() is kept in `fsm` but not used afterwards.
    fsm = th.start()

    return m
Example #8
0
def mkLed(baudrate=19200, clockfreq=100 * 1000 * 1000):
    """Build the 'blinkled' module that echoes UART input offset by switches.

    The thread loops forever: it receives a value from ``uart_rx``, adds the
    16-bit ``sw`` input, stores the result in the ``led`` output register
    and sends the same result through ``uart_tx``.

    Args:
        baudrate: Forwarded to both ``UartTx`` and ``UartRx``
            (default 19200).
        clockfreq: Forwarded to both ``UartTx`` and ``UartRx``
            (default 100 * 1000 * 1000).

    Returns:
        The constructed ``Module`` object.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')
    sw = m.Input('sw', 16)
    led = m.OutputReg('led', 16, initval=0)
    tx = m.Output('utx')
    rx = m.Input('urx')

    # Transmitter bound to the 'utx' output port.
    uart_tx = UartTx(m,
                     'inst_tx',
                     'tx_',
                     clk,
                     rst,
                     tx,
                     baudrate=baudrate,
                     clockfreq=clockfreq)
    # Receiver bound to the 'urx' input port.
    uart_rx = UartRx(m,
                     'inst_rx',
                     'rx_',
                     clk,
                     rst,
                     rx,
                     baudrate=baudrate,
                     clockfreq=clockfreq)

    def blink():
        # Endless receive -> add switch value -> display -> transmit loop.
        while True:
            c = uart_rx.recv()
            data = c + sw
            led.value = data
            uart_tx.send(data)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # The value returned by start() is kept in `fsm` but not used afterwards.
    fsm = th.start()

    return m
def mkLed(numthreads=8):
    """Build the 'blinkled' module where pooled threads take a mutex via lock().

    A controller thread starts ``numthreads`` instances of ``myfunc`` through
    a ``vthread.ThreadPool`` created with ``fsm_as_module=True`` and then
    joins each one. Every worker calls ``mymutex.lock()``, runs a delay
    loop, and calls ``mymutex.unlock()``.

    Args:
        numthreads: Number of pool threads to run and join (default 8).

    Returns:
        The constructed ``Module`` object.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    # Single mutex shared by every worker thread.
    mymutex = vthread.Mutex(m, 'mymutex', clk, rst)

    def myfunc(tid):
        # Worker body: acquire, delay, release.
        mymutex.lock()
        print("Thread %d Lock" % tid)

        # Empty loop executed while the lock is held (marked "sleep").
        for i in range(20):
            pass  # sleep

        print("Thread %d Hello" % tid)

        mymutex.unlock()
        print("Thread %d Unlock" % tid)

    def blink():
        # Controller body: launch every pool slot with its index as tid,
        # then wait for all of them.
        # `pool` is bound below, before th.start() is called.
        for tid in range(numthreads):
            pool.run(tid, tid)

        for tid in range(numthreads):
            pool.join(tid)

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # NOTE(review): fsm_as_module=True presumably emits each worker FSM as
    # its own module -- confirm against the vthread documentation.
    pool = vthread.ThreadPool(m,
                              'th_myfunc',
                              clk,
                              rst,
                              myfunc,
                              numthreads,
                              fsm_as_module=True)
    # The value returned by start() is kept in `fsm` but not used afterwards.
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module using user-defined thread intrinsics.

    ``send`` and ``wait`` operate directly on an ``fsm`` object and are
    registered on the thread with ``add_intrinsics``. The thread function
    calls them as ``send(data)`` and ``wait()`` for each of ``times``
    iterations, with ``data`` built by ``vthread.verilog.Plus(i, 100)``.

    Returns:
        The constructed ``Module`` object.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    data = m.Reg('data', 8, initval=0)
    enable = m.Reg('enable', initval=0)
    # `ready` is tied to constant 1.
    ready = m.Wire('ready')
    ready.assign(1)

    def send(fsm, value):
        # Intrinsic: load `data`, raise `enable` and display the value in one
        # FSM step, then lower `enable` in the following step.
        fsm(data(value), enable(1), Display("data = %d", value))
        fsm.goto_next()
        fsm(enable(0))
        fsm.goto_next()
        return 0

    def wait(fsm):
        # Intrinsic: advance to the next state only when `ready` is set.
        fsm.If(ready).goto_next()
        return 0

    def blink(times):
        for i in range(times):
            #data = i + 100
            # This `data` is a name local to blink; it is not the `data`
            # register declared above.
            data = vthread.verilog.Plus(i, 100)
            # The call sites omit the `fsm` argument.
            # NOTE(review): presumably vthread supplies it for registered
            # intrinsics -- confirm.
            send(data)
            wait()

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)

    # add intrinsics
    th.add_intrinsics(send, wait)

    # 10 is passed through as the `times` argument of blink.
    fsm = th.start(10)

    return m
Example #11
0
def mkLed():
    """Build the 'blinkled' module comparing stream and sequential compute.

    Three ``vthread.Stream`` objects are defined:

    * ``mul_stream``: ``z = x * y``.
    * ``mac_stream``: adds 1 to each source, feeds ``mul_stream`` as a
      substream, and reduces the products with ``ReduceAddValid``.
    * ``act_stream``: adds 2 (two ``+ 1`` steps) to each source, feeds
      ``mul_stream`` as a substream, reduces the products, and replaces
      non-positive sums by 0 through ``Mux(sum > 0, sum, 0)``.

    The thread function ``comp`` runs each stream and an equivalent
    sequential implementation on data loaded via ``myaxi`` and compares the
    results stored in ``ram_c`` with ``check``. The mac and act comparisons
    are each performed twice.

    Returns:
        The constructed ``Module`` object.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    # Multiplier stream: z = x * y. Also reused as a substream below.
    mulstrm = vthread.Stream(m, 'mul_stream', clk, rst)
    mulx = mulstrm.source('x')
    muly = mulstrm.source('y')
    mulz = mulx * muly
    mulstrm.sink(mulz, 'z')

    # Multiply-accumulate stream: sum of (a + 1) * (b + 1) over `size` items.
    macstrm = vthread.Stream(m, 'mac_stream', clk, rst)
    a = macstrm.source('a')
    b = macstrm.source('b')
    a = a + 1
    b = b + 1
    sub = macstrm.substream(mulstrm)
    sub.to_source('x', a)
    sub.to_source('y', b)
    c = sub.from_sink('z')
    size = macstrm.constant('size')
    sum, sum_valid = macstrm.ReduceAddValid(c, size)
    # The sink is written only when sum_valid is asserted.
    macstrm.sink(sum, 'sum', when=sum_valid, when_name='sum_valid')

    # Activation stream: like mac_stream but with +2 on each input and the
    # final sum replaced by 0 unless it is greater than 0.
    actstrm = vthread.Stream(m, 'act_stream', clk, rst)
    a = actstrm.source('a')
    b = actstrm.source('b')
    a = a + 1
    b = b + 1
    a = a + 1
    b = b + 1
    sub = actstrm.substream(mulstrm)
    sub.to_source('x', a)
    sub.to_source('y', b)
    c = sub.from_sink('z')
    size = actstrm.constant('size')
    sum, sum_valid = actstrm.ReduceAddValid(c, size)
    sum = actstrm.Mux(sum > 0, sum, 0)
    actstrm.sink(sum, 'sum', when=sum_valid, when_name='sum_valid')

    def comp_stream_mul(size, offset):
        # Run mul_stream over `size` elements starting at `offset` in each RAM.
        mulstrm.set_source('x', ram_a, offset, size)
        mulstrm.set_source('y', ram_b, offset, size)
        mulstrm.set_sink('z', ram_c, offset, size)
        mulstrm.run()
        mulstrm.join()

    def comp_stream_mac(size, offset):
        # Run mac_stream; a single result word is written to ram_c[offset].
        macstrm.set_source('a', ram_a, offset, size)
        macstrm.set_source('b', ram_b, offset, size)
        macstrm.set_constant('size', size)
        macstrm.set_sink('sum', ram_c, offset, 1)
        macstrm.run()
        macstrm.join()

    def comp_stream_act(size, offset):
        # Run act_stream; a single result word is written to ram_c[offset].
        actstrm.set_source('a', ram_a, offset, size)
        actstrm.set_source('b', ram_b, offset, size)
        actstrm.set_constant('size', size)
        actstrm.set_sink('sum', ram_c, offset, 1)
        actstrm.run()
        actstrm.join()

    def comp_sequential_mul(size, offset):
        # Sequential counterpart of mul_stream: element-wise product.
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a * b
            ram_c.write(i + offset, sum)

    def comp_sequential_mac(size, offset):
        # Sequential counterpart of mac_stream: accumulate (a+1)*(b+1).
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset) + 1
            b = ram_b.read(i + offset) + 1
            sum += a * b
        ram_c.write(offset, sum)

    def comp_sequential_act(size, offset):
        # Sequential counterpart of act_stream: accumulate (a+2)*(b+2) and
        # replace a non-positive total by 0.
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset) + 2
            b = ram_b.read(i + offset) + 2
            sum += a * b
        if sum <= 0:
            sum = 0
        ram_c.write(offset, sum)

    def check(size, offset_stream, offset_seq):
        # Compare `size` words of ram_c at the two offsets; print each
        # mismatch and finally 'OK' or 'NG'.
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
                print(i, st, sq)
        if all_ok:
            print('OK')
        else:
            print('NG')

    def comp(size):
        # Each test below follows the same shape: load ram_a/ram_b at
        # offset 0 and run the stream version, then load them again at
        # offset `size` and run the sequential version, then compare the
        # two regions of ram_c.
        # NOTE(review): dma_read/dma_write arguments appear to be
        # (ram, local address, global address, size) -- confirm.

        # mul
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_mul(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_mul(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size)

        # verification
        print('# MUL')
        check(size, 0, offset)

        # mac
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_mac(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, 1)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_mac(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, 1)

        # verification
        # Only one result word is produced, so a single word is compared.
        print('# MAC')
        check(1, 0, offset)

        # act
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_act(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, 1)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_act(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, 1)

        # verification
        print('# ACT')
        check(1, 0, offset)

        # mac 2
        # Second run of the mac test, identical to the first.
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_mac(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, 1)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_mac(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, 1)

        # verification
        print('# MAC')
        check(1, 0, offset)

        # act 2
        # Second run of the act test, identical to the first.
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_act(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, 1)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_act(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, 1)

        # verification
        print('# ACT')
        check(1, 0, offset)

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    # 32 is passed through as the `size` argument of comp.
    fsm = th.start(32)

    return m
Example #12
0
def mkLed():
    """Build the 'blinkled' module testing a stream with ``CounterValid``.

    The stream computes ``c = a + b + cntval`` where ``cntval`` is
    ``Mux(valid, 1000, cnt)`` and ``(cnt, valid)`` come from
    ``CounterValid(size)``. The thread function runs the stream, runs a
    sequential implementation over a second region of the RAMs, compares
    the two result regions in ``ram_c`` and calls ``vthread.finish()``.

    Returns:
        The constructed ``Module`` object.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    strm = vthread.Stream(m, 'mystream', clk, rst)
    size = strm.constant('size')
    cnt, valid = strm.CounterValid(size)
    a = strm.source('a')
    b = strm.source('b')
    # Use 1000 in place of the counter value whenever `valid` is set.
    cntval = strm.Mux(valid, 1000, cnt)
    c = a + b + cntval
    strm.sink(c, 'c')

    def comp_stream(size, offset):
        # The stream's 'size' constant is set to half of the element count.
        strm.set_constant('size', size // 2)
        strm.set_source('a', ram_a, offset, size)
        strm.set_source('b', ram_b, offset, size)
        strm.set_sink('c', ram_c, offset, size)
        strm.run()
        strm.join()

    def comp_sequential(size, offset):
        # Sequential reference: adds a software counter to a + b. The
        # counter is forced to 1000 when it reaches size // 2 - 1 and
        # wraps to 0 after 1000.
        sum = 0
        cnt = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a + b + cnt
            cnt += 1
            if cnt == 1001:
                cnt = 0
            if cnt == size // 2 - 1:
                cnt = 1000
            ram_c.write(i + offset, sum)

    def check(size, offset_stream, offset_seq):
        # Compare `size` words of ram_c at the two offsets and print the
        # overall verdict.
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    def comp(size):
        # NOTE(review): dma_read/dma_write arguments appear to be
        # (ram, local address, global address, size) -- confirm.

        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size)

        # verification
        check(size, 0, offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    # 32 is passed through as the `size` argument of comp.
    fsm = th.start(32)

    return m
Example #13
0
def mkLed():
    """Build the 'blinkled' module testing a conditionally written stream sink.

    The stream computes ``c = a + b`` and the condition
    ``v = Ands(c > 140, c < 150)``. ``c`` is sunk only when ``v`` holds, and
    ``ReduceAdd(v)`` is sunk as ``cnt``. The thread function runs the
    stream and a sequential equivalent, writes both results out through
    ``myaxi``, reads them back into ``ram_c`` and compares them.

    Returns:
        The constructed ``Module`` object.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    strm = vthread.Stream(m, 'mystream', clk, rst)
    a = strm.source('a')
    b = strm.source('b')
    c = a + b
    # Select sums strictly between 140 and 150.
    v = strm.Ands(c > 140, c < 150)
    # Running total of the condition values.
    cnt = strm.ReduceAdd(v)
    strm.sink(c, 'c', when=v, when_name='v')
    strm.sink(cnt, 'cnt')

    def comp_stream(size, offset):
        # Run the stream and return the value read back from the 'cnt' sink.
        strm.set_source('a', ram_a, offset, size)
        strm.set_source('b', ram_b, offset, size)
        strm.set_sink('c', ram_c, offset, 0)  # max_size
        strm.set_sink_immediate('cnt', 0)  # max_size
        strm.run()
        strm.join()
        cnt = strm.read_sink('cnt')
        print('# num of counted: %d' % cnt)
        return cnt

    def comp_sequential(size, offset):
        # Sequential reference: store each selected sum contiguously in
        # ram_c starting at `offset` and return how many were stored.
        # `sum` is assigned but never read in this function.
        sum = 0
        addr = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            c = a + b
            if c > 140 and c < 150:
                ram_c.write(addr + offset, c)
                addr += 1
        print('# num of counted: %d' % addr)
        return addr

    def check(size, offset_stream, offset_seq):
        # Compare `size` words of ram_c at the two offsets and print the
        # overall verdict.
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    def comp(size):
        # NOTE(review): dma_read/dma_write arguments appear to be
        # (ram, local address, global address, size) -- confirm.

        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        cnt = comp_stream(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, cnt)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        # `cnt` is overwritten here; the verification below uses the
        # sequential count.
        cnt = comp_sequential(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, cnt)

        # verification
        myaxi.dma_read(ram_c, 0, 1024, cnt)
        myaxi.dma_read(ram_c, offset, 1024 * 2, cnt)
        check(cnt, 0, offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    # 32 is passed through as the `size` argument of comp.
    fsm = th.start(32)

    return m
Example #14
0
def run(a_shape=(15, 15),
        b_shape=(15, 15),
        a_dtype=ng.int32,
        b_dtype=ng.int32,
        c_dtype=ng.int32,
        par=1,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):
    """Generate, simulate and verify an NNgen ``relu(add(a, b))`` design.

    The function builds the NNgen dataflow, converts it with
    ``ng.to_veriloggen``, prepares a memory image holding the inputs and the
    software-evaluated expected output, wraps the design in a test module
    with an AXI memory model and a controller thread, and runs the
    simulator.

    Args:
        a_shape: Shape of placeholder ``a``.
        b_shape: Shape of placeholder ``b``.
        a_dtype: NNgen dtype of ``a``.
        b_dtype: NNgen dtype of ``b``.
        c_dtype: NNgen dtype used for both the add and the relu outputs.
        par: ``par`` value forwarded to ``ng.add`` and ``ng.relu``; also
            used as a lower bound for the last argument of
            ``axi.set_memory``.
        axi_datawidth: Data width passed as ``maxi_datawidth`` to the
            generator and as ``datawidth`` to the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not ``None``, the test module is written to this path
            with ``m.to_verilog``.
        simtype: Simulator name passed to ``simulation.Simulator``.
        outputfile: Simulator output file name; defaults to this source
            file's base name with a ``.out`` extension.

    Returns:
        The simulator output text. For ``simtype == 'verilator'`` a final
        line starting with ``'-'`` is removed.
    """

    # create target hardware
    a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')
    t = ng.add(a, b, dtype=c_dtype, par=par)
    c = ng.relu(t, dtype=c_dtype, par=par)

    targ = ng.to_veriloggen([c],
                            'matrix_add_relu',
                            silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [5] - [10]
    vb = (np.arange(b.length, dtype=np.int64).reshape(b.shape) +
          [100]) % [6] - [10]

    # Expected output computed in software by NNgen.
    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # to memory image
    # Largest tensor size rounded up to a multiple of 4096.
    size_max = int(
        math.ceil(
            max(a.memory_size, b.memory_size, c.memory_size) / 4096)) * 4096
    # The expected output is placed after the highest tensor address.
    check_addr = max(a.addr, b.addr, c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)],
                   dtype=np.int64)
    # Fill the image with a non-zero background value.
    mem = mem + [100]

    axi.set_memory(mem, va, memimg_datawidth, a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth, b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))
    axi.set_memory(mem, vc, memimg_datawidth, c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # Active-high reset derived from the active-low RESETN port.
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Product of every output dimension except the last one.
    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    def ctrl():
        # Empty loop executed before the design is started.
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        # Compare every output word against the expected image; rows are
        # addressed with the aligned size of the last dimension.
        ok = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # Fallback termination in case the controller thread never finishes.
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Drop a trailing line starting with '-' from verilator output. The
    # emptiness check avoids an IndexError when the simulator printed
    # nothing ('' splits into an empty list).
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Example #15
0
def mkLed():
    """Build the 'blinkled' module testing DMA transfers near a boundary.

    The thread function chooses a global offset of
    ``myaxi.boundary_size - 4`` and calls ``body``, which writes two test
    patterns (``i + 100`` and ``i + 1000``) out through ``dma_write`` at
    ``offset`` and ``offset + myaxi.boundary_size``, reads each back with
    ``dma_read`` and clears ``all_ok`` on any mismatch.

    Returns:
        The constructed ``Module`` object.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    myram = vthread.RAM(m, 'myram', clk, rst, datawidth, addrwidth)

    # Result flag; set at the start of blink and cleared on any mismatch.
    all_ok = m.TmpReg(initval=0)

    def blink(size):
        all_ok.value = True

        # Test for 4KB boundary check
        offset = myaxi.boundary_size - 4
        # `body` is defined below, before th.start() is called.
        body(size, offset)

        # Nothing is printed when the check fails, apart from the
        # per-word mismatch lines emitted by body.
        if all_ok:
            print('ALL OK')

    def body(size, offset):
        # write
        # First pattern: i + 100, sent to global address `offset`.
        for i in range(size):
            wdata = i + 100
            myram.write(i, wdata)

        laddr = 0
        gaddr = offset
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # write
        # Second pattern: i + 1000, sent one boundary_size further on.
        for i in range(size):
            wdata = i + 1000
            myram.write(i, wdata)

        laddr = 0
        gaddr = offset + myaxi.boundary_size
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # read
        # Read the first region back and compare with the first pattern.
        laddr = 0
        gaddr = offset
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read:  [%d] <- [%d]' % (laddr, gaddr))

        for i in range(size):
            rdata = myram.read(i)
            if vthread.verilog.NotEql(rdata, i + 100):
                print('rdata[%d] = %d' % (i, rdata))
                all_ok.value = False

        # read
        # Read the second region back and compare with the second pattern.
        laddr = 0
        gaddr = offset + myaxi.boundary_size
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read:  [%d] <- [%d]' % (laddr, gaddr))

        for i in range(size):
            rdata = myram.read(i)
            if vthread.verilog.NotEql(rdata, i + 1000):
                print('rdata[%d] = %d' % (i, rdata))
                all_ok.value = False

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # 576 (256 + 256 + 64) is passed through as the `size` argument of blink.
    fsm = th.start(256 + 256 + 64)

    return m
Example #16
0
def mkLed(word_datawidth=128):
    """Build the 'blinkled' module looping data through AXI-stream FIFOs.

    For each of four iterations the thread function writes a test vector
    from ``myram`` to memory, streams it in through ``axi_in`` into
    ``fifo_in``, moves every word to ``fifo_out``, streams it out through
    ``axi_out`` to a second memory region, and finally reads both regions
    back into ``myram`` to compare them word by word.

    Args:
        word_datawidth: Data width used for ``myram`` and both FIFOs
            (default 128). The AXI interfaces use a fixed width of 32.

    Returns:
        The constructed ``Module`` object.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10

    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    # Two-port RAM; the DMA calls below use port=1.
    myram = vthread.RAM(m,
                        'myram',
                        clk,
                        rst,
                        word_datawidth,
                        addrwidth,
                        numports=2)

    axi_in = vthread.AXIStreamInFifo(m,
                                     'axi_in',
                                     clk,
                                     rst,
                                     datawidth,
                                     with_last=True,
                                     noio=True)
    axi_out = vthread.AXIStreamOutFifo(m,
                                       'axi_out',
                                       clk,
                                       rst,
                                       datawidth,
                                       with_last=True,
                                       noio=True)

    # AXI master objects created from the two stream interfaces.
    maxi_in = vthread.AXIM_for_AXIStreamIn(axi_in, 'maxi_in')
    maxi_out = vthread.AXIM_for_AXIStreamOut(axi_out, 'maxi_out')

    fifo_addrwidth = 8
    fifo_in = vthread.FIFO(m, 'fifo_in', clk, rst, word_datawidth,
                           fifo_addrwidth)
    fifo_out = vthread.FIFO(m, 'fifo_out', clk, rst, word_datawidth,
                            fifo_addrwidth)

    # Result flag; set at the start of blink and cleared on any mismatch.
    all_ok = m.TmpReg(initval=0)

    def blink(size):
        all_ok.value = True

        for i in range(4):
            print('# iter %d start' % i)
            # Test for 4KB boundary check
            # Each iteration starts one RAM word below a boundary, shifted
            # by i * 16 KiB.
            offset = i * 1024 * 16 + (myaxi.boundary_size -
                                      (word_datawidth // 8))
            # `body` is defined below, before th.start() is called.
            body(size, offset)
            print('# iter %d end' % i)

        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    def body(size, offset):
        # write a test vector
        for i in range(size):
            wdata = i + 100
            myram.write(i, wdata)

        laddr = 0
        gaddr = offset
        myaxi.dma_write(myram, laddr, gaddr, size, port=1)

        # AXI-stream read -> FIFO -> FIFO -> AXI-stream write
        # The transfer length is scaled by the ratio of the RAM word width
        # to the AXI data width.
        maxi_in.dma_read_async(gaddr, size * (word_datawidth // datawidth))
        axi_in.write_fifo(fifo_in, size)

        for i in range(size):
            va = fifo_in.deq()
            fifo_out.enq(va)

        # Destination region: 2 * size words (in bytes) past `offset`.
        out_gaddr = (size + size) * (word_datawidth // 8) + offset
        maxi_out.dma_write_async(out_gaddr,
                                 size * (word_datawidth // datawidth))
        axi_out.read_fifo(fifo_out, size)

        # check
        # Load the source region at RAM address 0 and the looped-back
        # region at RAM address `size`, then compare pairwise.
        myaxi.dma_read(myram, 0, gaddr, size, port=1)
        myaxi.dma_read(myram, size, out_gaddr, size, port=1)

        for i in range(size):
            v0 = myram.read(i)
            v1 = myram.read(i + size)
            if vthread.verilog.NotEql(v0, v1):
                all_ok.value = False

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # 17 is passed through as the `size` argument of blink.
    fsm = th.start(17)

    return m
Example #17
0
def run(a_shape=(7, 15),
        b_shape=(7, 15),
        a_dtype=ng.int32,
        b_dtype=ng.int32,
        c_dtype=ng.int32,
        par=1,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):
    """Export ``MatrixMul`` to ONNX, compile it with NNgen and simulate it.

    The PyTorch model is exported to ``onnx_matrix_mul.onnx`` (relative path),
    imported with ``ng.from_onnx``, quantized, evaluated in software with
    ``ng.eval``, converted to a Veriloggen module and finally simulated inside
    a generated test bench.  The test bench compares every output word the
    hardware wrote to memory against the ``ng.eval`` result stored at
    ``check_addr`` and prints ``# verify: PASSED`` or ``# verify: FAILED``.

    Args:
        a_shape: Shape of the random dummy tensor exported as ONNX input ``a``.
        b_shape: Shape of the random dummy tensor exported as ONNX input ``b``.
        a_dtype: NNgen dtype assigned to value ``a``; its ``width`` is also
            used when writing ``a`` into the memory image.
        b_dtype: Same as ``a_dtype`` for value ``b``.
        c_dtype: NNgen dtype assigned to output ``c``; its ``width`` is used
            for the expected-output image and for the word-wise comparison.
        par: Passed as ``par`` to ``attribute()`` of every
            ``ng.scaled_multiply`` operator, and used as a lower bound for the
            last argument of ``axi.set_memory``.
        axi_datawidth: Used as ``maxi_datawidth`` in the NNgen config and as
            the data width of the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the test bench is written there as Verilog.
        simtype: Simulator name given to ``simulation.Simulator``.  If None,
            the function calls ``sys.exit()`` right after hardware generation.
        outputfile: Forwarded to ``sim.run`` and used to name the memory
            image.  Defaults to this file's base name with a ``.out`` suffix.

    Returns:
        The text returned by the simulator.  For ``simtype == 'verilator'``
        a final line starting with ``'-'`` is removed.
    """

    # pytorch model
    model = MatrixMul()

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_mul.onnx'
    dummy_a = torch.randn(*a_shape)
    dummy_b = torch.randn(*b_shape)
    dummy_inputs = (dummy_a, dummy_b)
    input_names = ['a', 'b']
    output_names = ['c']
    model.eval()
    torch.onnx.export(model,
                      dummy_inputs,
                      onnx_filename,
                      input_names=input_names,
                      output_names=output_names)

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    value_dtypes = {'a': a_dtype, 'b': b_dtype, 'c': c_dtype}

    (outputs, placeholders, variables, constants,
     operators) = ng.from_onnx(onnx_filename,
                               value_dtypes=value_dtypes,
                               default_placeholder_dtype=ng.int32,
                               default_variable_dtype=ng.int32,
                               default_constant_dtype=ng.int32,
                               default_operator_dtype=ng.int32,
                               default_scale_dtype=ng.int32,
                               default_bias_dtype=ng.int32,
                               disable_fusion=False)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    input_scale_factors = {'a': 10.0, 'b': 15.0}

    ng.quantize(outputs, input_scale_factors)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    for op in operators.values():
        if isinstance(op, ng.scaled_multiply):
            op.attribute(par=par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    a = placeholders['a']
    b = placeholders['b']
    c = outputs['c']

    # verification data
    # Deterministic integer patterns: a counts 0..16 repeatedly, b is the same
    # kind of ramp shifted by 100 and taken modulo 13.
    input_a = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [17]
    input_b = (np.arange(b.length, dtype=np.int64).reshape(b.shape) +
               [100]) % [13]

    # execution on pytorch
    model_a = input_a.astype(np.float32)
    if a.perm is not None:
        model_a = np.transpose(model_a, a.reversed_perm)

    model_b = input_b.astype(np.float32)
    if b.perm is not None:
        model_b = np.transpose(model_b, b.reversed_perm)

    model.eval()
    model_c = model(torch.from_numpy(model_a),
                    torch.from_numpy(model_b)).detach().numpy()
    if a.perm is not None:
        model_c = np.transpose(model_c, a.perm)
    scaled_model_c = model_c * c.scale_factor

    # software-based verification
    # Scale the inputs, clamp them to the symmetric signed range of the
    # placeholder dtype and round to integers before feeding ng.eval.
    va = input_a * input_scale_factors['a']
    va = np.clip(va, -1.0 * (2**(a.dtype.width - 1) - 1),
                 1.0 * (2**(a.dtype.width - 1) - 1))
    va = np.round(va).astype(np.int64)

    vb = input_b * input_scale_factors['b']
    vb = np.clip(vb, -1.0 * (2**(b.dtype.width - 1) - 1),
                 1.0 * (2**(b.dtype.width - 1) - 1))
    vb = np.round(vb).astype(np.int64)

    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # Quality metrics of the quantized result versus the float model.  They
    # are not used below; they are available when the breakpoint is enabled.
    mean_square_error = np.sum((vc - scaled_model_c)**2) / vc.size
    corrcoef = np.corrcoef(model_c.reshape([-1]), vc.reshape([-1]))

    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    targ = ng.to_veriloggen([c],
                            'onnx_matrix_mul',
                            silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # --------------------
    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    if simtype is None:
        sys.exit()

    # to memory image
    param_data = ng.export_ndarray([c])
    param_bytes = len(param_data)

    # Each region starts at the next multiple of 4096 after the previous one:
    # placeholders, then parameters, then expected output, then temporaries.
    variable_addr = int(
        math.ceil(
            max(a.addr + a.memory_size, b.addr + b.memory_size) / 4096)) * 4096
    check_addr = int(math.ceil((variable_addr + param_bytes) / 4096)) * 4096
    tmp_addr = int(math.ceil((check_addr + c.memory_size) / 4096)) * 4096

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)],
                   dtype=np.int64)
    # Every word of the image starts out as 100 rather than 0.
    mem = mem + [100]

    # placeholder
    axi.set_memory(mem, va, memimg_datawidth, a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth, b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(mem, vc, memimg_datawidth, c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # The target exposes an active-low reset; derive an active-high one for
    # the test-bench components.
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Product of all output dimensions except the last one, i.e. the number
    # of rows to compare.
    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    # Control thread: starts the accelerator through the AXI-Lite port, waits
    # for completion, reports the elapsed time_counter ticks and compares the
    # result region with the expected region.
    def ctrl():
        # Idle loop before touching the control port.
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                # Rows are addressed with the aligned last dimension, but
                # only the first c.shape[-1] words of each row are compared.
                orig = memory.read_word(i * c.aligned_shape[-1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # Watchdog: end the simulation after this delay even if ctrl never
    # reaches vthread.finish().
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # `lines` is empty when the simulator produced no output; test for that
    # first so that indexing the last line cannot raise IndexError.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def run(act_shape=(1, 7, 7, 15), weight_shape=(7, 3, 3, 15),
        bias_shape=None, scale_shape=None,
        act_dtype=ng.int32, weight_dtype=ng.int32,
        bias_dtype=ng.int32, scale_dtype=ng.int32,
        out_dtype=ng.int32,
        conv2d_stride=(1, 1, 1, 1),
        rshift_mul=None, rshift_sum=None, rshift_out=None,
        act_func=None,
        par_ich=1, par_och=1, par_col=1, par_row=1,
        concur_och=None, stationary='filter',
        input_ram_size=None, filter_ram_size=None,
        bias_ram_size=None, scale_ram_size=None,
        out_ram_size=None,
        ksize=(1, 2, 2, 1), pool_stride=(1, 2, 2, 1), par=1,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Build a conv2d followed by avg_pool with NNgen and simulate it.

    The dataflow is converted to a Veriloggen module, its expected output is
    computed with ``ng.eval`` on deterministic integer patterns, and a test
    bench compares every output word written by the hardware with the
    expected value stored at ``check_addr``.  The bench prints
    ``# verify: PASSED`` or ``# verify: FAILED``.

    Args:
        act_shape: Shape of the ``act`` placeholder.
        weight_shape: Shape of the ``weight`` variable.
        bias_shape: Shape of the ``bias`` variable, or None for no bias.
        scale_shape: Shape of the ``scale`` variable, or None for no scale.
        act_dtype, weight_dtype, bias_dtype, scale_dtype: NNgen dtypes of the
            corresponding tensors; their ``width`` is also used when writing
            the memory image.
        out_dtype: dtype given to both ``ng.conv2d`` and ``ng.avg_pool``, and
            used for the expected-output image and the comparison.
        conv2d_stride, rshift_mul, rshift_sum, rshift_out, act_func,
        par_ich, par_och, par_col, par_row, concur_och, stationary,
        input_ram_size, filter_ram_size, bias_ram_size, scale_ram_size,
        out_ram_size: Forwarded positionally to ``ng.conv2d``.  The
            ``par_ich`` / ``par_och`` values are additionally used as lower
            bounds for the last argument of ``axi.set_memory``.
        ksize: Forwarded to ``ng.avg_pool`` as ``ksize``.
        pool_stride: Forwarded to ``ng.avg_pool`` as ``strides``.
        par: Forwarded to ``ng.avg_pool`` as ``par``; also a lower bound for
            the last ``axi.set_memory`` argument of the expected output.
        axi_datawidth: Used as ``maxi_datawidth`` in the NNgen config and as
            the data width of the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the test bench is written there as Verilog.
        simtype: Simulator name given to ``simulation.Simulator``.
        outputfile: Forwarded to ``sim.run`` and used to name the memory
            image.  Defaults to this file's base name with a ``.out`` suffix.

    Returns:
        The text returned by the simulator.  For ``simtype == 'verilator'``
        a final line starting with ``'-'`` is removed.
    """

    # create target hardware
    act = ng.placeholder(act_dtype, shape=act_shape, name='act')
    weight = ng.variable(weight_dtype, shape=weight_shape, name='weight')

    if bias_shape is not None:
        bias = ng.variable(bias_dtype, bias_shape, name='bias')
    else:
        bias = None

    if scale_shape is not None:
        scale = ng.variable(scale_dtype, scale_shape, name='scale')
    else:
        scale = None

    tmp = ng.conv2d(act, weight, conv2d_stride,
                    bias, scale,
                    rshift_mul, rshift_sum, rshift_out,
                    act_func, 'SAME',
                    out_dtype, ng.int32, ng.int32,
                    'conv2d',
                    par_ich, par_och, par_col, par_row,
                    concur_och, stationary,
                    input_ram_size, filter_ram_size,
                    bias_ram_size, scale_ram_size,
                    None, None, None,
                    out_ram_size)

    out = ng.avg_pool(tmp, ksize=ksize,
                      strides=pool_stride,
                      sum_dtype=ng.int32, dtype=out_dtype, par=par)

    targ = ng.to_veriloggen([out], 'matrix_conv2d_avg_pool', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    # Deterministic ramps: act in 0..15, weight in -16..15, bias in 0..3,
    # scale in 0..5.
    vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [16]
    vweight = np.arange(weight.length,
                        dtype=np.int64).reshape(weight.shape) % [32] - [16]

    if bias is not None:
        vbias = np.arange(bias.length,
                          dtype=np.int64).reshape(bias.shape) % [4]
    else:
        vbias = None

    if scale is not None:
        vscale = np.arange(scale.length,
                           dtype=np.int64).reshape(scale.shape) % [6]
    else:
        vscale = None

    eval_outs = ng.eval([out], act=vact, weight=vweight, bias=vbias, scale=vscale)
    vout = eval_outs[0]

    # to memory image
    # size_max is the largest tensor footprint rounded up to a multiple of
    # 4096.  The expected output goes one such slot after the highest tensor
    # address, and the temporary area one more slot after that.
    size_max = int(math.ceil(max(act.memory_size, weight.memory_size,
                                 bias.memory_size if bias is not None else 0,
                                 scale.memory_size if scale is not None else 0,
                                 out.memory_size) / 4096)) * 4096
    check_addr = max(act.addr, weight.addr,
                     bias.addr if bias is not None else -1,
                     scale.addr if scale is not None else -1,
                     out.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)], dtype=np.int64)
    # Every word of the image starts out as 100 rather than 0.
    mem = mem + [100]

    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich))

    axi.set_memory(mem, vweight, memimg_datawidth,
                   weight_dtype.width, weight.addr,
                   max(int(math.ceil(axi_datawidth / weight_dtype.width)), par_ich))

    if bias is not None:
        axi.set_memory(mem, vbias, memimg_datawidth,
                       bias_dtype.width, bias.addr,
                       max(int(math.ceil(axi_datawidth / bias_dtype.width)), par_och))

    if scale is not None:
        axi.set_memory(mem, vscale, memimg_datawidth,
                       scale_dtype.width, scale.addr,
                       max(int(math.ceil(axi_datawidth / scale_dtype.width)), par_och))

    axi.set_memory(mem, vout, memimg_datawidth,
                   out_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / out_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # The target exposes an active-low reset; derive an active-high one for
    # the test-bench components.
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # Control thread: starts the accelerator through the AXI-Lite port, waits
    # for completion, reports the elapsed time_counter ticks and compares the
    # result region with the expected region.
    def ctrl():
        # Idle loop before touching the control port.
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        # Word index = flattened (bat, y, x, ch) position computed with the
        # aligned shape; only indices inside the logical shape are compared.
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3]
                                                + y * out.aligned_shape[2] * out.aligned_shape[3]
                                                + x * out.aligned_shape[3] + ch,
                                                out.addr, out_dtype.width)
                        check = memory.read_word(bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3]
                                                 + y * out.aligned_shape[2] * out.aligned_shape[3]
                                                 + x * out.aligned_shape[3] + ch,
                                                 check_addr, out_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch,
                                  ') orig: ', orig, ' check: ', check)
                            ok = False
                        # else:
                        #    print('OK (', bat, y, x, ch,
                        #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # Watchdog: end the simulation after this delay even if ctrl never
    # reaches vthread.finish().
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # `lines` is empty when the simulator produced no output; test for that
    # first so that indexing the last line cannot raise IndexError.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Example #19
0
def mkLed():
    """Build the 'blinkled' module that checks stream counters.

    A ``vthread.Stream`` computes ``a + b - a - b`` plus six ``Counter``
    values for every element.  The same values are recomputed element by
    element in a sequential thread function, and the two result ranges in
    ``ram_c`` are compared.  The thread is started with ``size=32``.

    Returns:
        The constructed ``Module``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    # Stream definition.  The a/b terms cancel out, so the sink value is the
    # sum of the six counters; comp_sequential() below holds the reference
    # expression for each counter configuration.
    strm = vthread.Stream(m, 'mystream', clk, rst)
    cnt1 = strm.Counter()
    cnt2 = strm.Counter(initval=1)
    cnt3 = strm.Counter(initval=2, size=5)
    cnt4 = strm.Counter(initval=3, interval=3)
    cnt5 = strm.Counter(initval=4, interval=3, size=7)
    cnt6 = strm.Counter(initval=4, step=2, interval=2)
    a = strm.source('a')
    b = strm.source('b')
    c = a + b - a - b + cnt1 + cnt2 + cnt3 + cnt4 + cnt5 + cnt6
    strm.sink(c, 'c')

    # Run the stream over `size` elements starting at `offset` in each RAM
    # and wait until it has finished.
    def comp_stream(size, offset):
        strm.set_source('a', ram_a, offset, size)
        strm.set_source('b', ram_b, offset, size)
        strm.set_sink('c', ram_c, offset, size)
        strm.run()
        strm.join()

    # Reference computation: for element index `cnt` the six terms are the
    # expected values of the stream counters declared above
    # (cnt1: cnt, cnt2: cnt + 1, cnt3: (cnt + 2) % 5, cnt4: cnt // 3 + 3,
    #  cnt5: (cnt // 3 + 4) % 7, cnt6: (cnt // 2) * 2 + 4).
    # The names cnt1..cnt6, a and b are assigned here, so inside this
    # function they are local values, not the stream objects of the same name.
    def comp_sequential(size, offset):
        cnt = 0
        for i in range(size):
            cnt1 = cnt
            cnt2 = 1 + cnt
            cnt3 = (cnt + 2) % 5
            cnt4 = (cnt // 3) + 3
            cnt5 = ((cnt // 3) + 4) % 7
            cnt6 = (cnt // 2) * 2 + 4
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a + b - a - b + cnt1 + cnt2 + cnt3 + cnt4 + cnt5 + cnt6
            ram_c.write(i + offset, sum)
            cnt += 1

    # Compare `size` words of ram_c starting at the two offsets and print the
    # verdict.
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    # Thread entry point.  Both passes load the same global addresses
    # (0 and 512) but use different local RAM offsets: the stream pass uses
    # [0, size) and the sequential pass uses [size, 2 * size).
    def comp(size):
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size)

        # verification
        check(size, 0, offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start(32)

    return m
def run(act_shape=(1, 4, 4, 3),
        weight0_shape=(9, 3, 3, 3),
        weight1_shape=(9, 36),
        act_dtype=ng.int32,
        weight_dtype=ng.int32,
        stride0=1,
        padding0=0,
        with_batchnorm0=False,
        with_batchnorm1=False,
        act_func0='ReLU',
        act_func1='relu',
        disable_fusion=False,
        par_ich=1,
        par_och=1,
        par_col=1,
        par_row=1,
        concur_och=None,
        stationary='filter',
        chunk_size=64,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):
    """Build Conv2d -> Transpose -> Flatten -> Linear via ONNX and simulate it.

    A PyTorch ``nn.Sequential`` model is assembled, exported to
    ``onnx_matrix_conv2d_transpose_linear.onnx`` (relative path), imported
    with ``ng.from_onnx``, quantized, evaluated in software with ``ng.eval``,
    converted to a Veriloggen module and simulated in a generated test bench.
    The bench compares every output word written by the hardware with the
    ``ng.eval`` result stored at ``check_addr`` and prints
    ``# verify: PASSED`` or ``# verify: FAILED``.

    Args:
        act_shape: Shape of the dummy input before ``transpose(1, 3)``.
        weight0_shape: Conv2d configuration; index 3 is used as the input
            channel count, index 0 as the output channel count and index 1 as
            the kernel size.
        weight1_shape: Linear configuration; index 1 is ``in_features`` and
            index 0 is ``out_features``.
        act_dtype: NNgen dtype of ``act`` and ``out`` and the default
            placeholder/operator dtype.
        weight_dtype: Default NNgen dtype of variables and constants.
        stride0: Stride of the Conv2d layer.
        padding0: Padding of the Conv2d layer.
        with_batchnorm0: Append ``nn.BatchNorm2d`` after the convolution.
        with_batchnorm1: Append ``nn.BatchNorm1d`` after the linear layer.
        act_func0: Name of the ``torch.nn`` activation class placed after the
            convolution, or None for no activation.
        act_func1: Same for the linear layer.  Names are matched
            case-insensitively when no exact attribute exists, so ``'relu'``
            selects ``nn.ReLU``.
        disable_fusion: Forwarded to ``ng.from_onnx``.
        par_ich, par_och, par_col, par_row, concur_och: Passed to
            ``attribute()`` of every ``ng.conv2d`` operator.  ``par_ich`` and
            ``par_och`` are also lower bounds for the last argument of
            ``axi.set_memory``.
        stationary: Accepted for signature compatibility; not used here.
        chunk_size: NNgen ``chunk_size`` config value, also used for
            ``ng.export_ndarray`` and as the address alignment of the regions.
        axi_datawidth: Used as ``maxi_datawidth`` in the NNgen config and as
            the data width of the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the test bench is written there as Verilog.
        simtype: Simulator name given to ``simulation.Simulator``.  If None,
            the function calls ``sys.exit()`` right after hardware generation.
        outputfile: Forwarded to ``sim.run`` and used to name the memory
            image.  Defaults to this file's base name with a ``.out`` suffix.

    Returns:
        The text returned by the simulator.  For ``simtype == 'verilator'``
        a final line starting with ``'-'`` is removed.

    Raises:
        AttributeError: If an activation name matches no ``torch.nn``
            attribute.
    """

    def make_activation(name):
        # torch.nn exposes activation layers under CamelCase names such as
        # ``ReLU``; the default ``act_func1='relu'`` is not an attribute of
        # the module.  Fall back to a case-insensitive match when it is
        # unambiguous, otherwise let getattr raise AttributeError as before.
        if not hasattr(nn, name):
            candidates = [attr for attr in dir(nn)
                          if attr.lower() == name.lower()]
            if len(candidates) == 1:
                name = candidates[0]
        return getattr(nn, name)()

    # pytorch model
    layers = []
    layers.append(
        nn.Conv2d(weight0_shape[3],
                  weight0_shape[0],
                  weight0_shape[1],
                  stride=stride0,
                  padding=padding0))

    if with_batchnorm0:
        layers.append(nn.BatchNorm2d(weight0_shape[0]))

    if act_func0 is not None:
        layers.append(make_activation(act_func0))

    class Transpose(nn.Module):
        """Module that permutes its input with a fixed axis order."""

        def __init__(self, perm):
            super(Transpose, self).__init__()
            self.perm = perm

        def forward(self, input):
            return input.permute(*self.perm)

    layers.append(Transpose([0, 1, 3, 2]))

    class Flatten(nn.Module):
        """Module that reshapes its input to (batch, -1)."""

        def forward(self, input):
            # return input.view(input.size(0), -1)
            return torch.reshape(input, (input.size(0), -1))

    layers.append(Flatten())
    layers.append(nn.Linear(weight1_shape[1], weight1_shape[0]))

    if with_batchnorm1:
        # The tensor is 2-D (batch, features) after Flatten and Linear.
        # BatchNorm2d only accepts 4-D input and would fail on the first
        # forward pass, so the 1-D variant is required here.
        layers.append(nn.BatchNorm1d(weight1_shape[0]))

    if act_func1 is not None:
        layers.append(make_activation(act_func1))

    model = nn.Sequential(*layers)

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_conv2d_transpose_linear.onnx'
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    model.eval()
    torch.onnx.export(model,
                      dummy_input,
                      onnx_filename,
                      input_names=input_names,
                      output_names=output_names)

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    # NOTE(review): with the default layer list the Linear layer sits at
    # index 4 of the Sequential, so '3.weight' may not name any ONNX value.
    # default_variable_dtype below is the same dtype, so the effective result
    # should not change - confirm against the exported graph.
    value_dtypes = {
        'act': act_dtype,
        '0.weight': weight_dtype,
        '3.weight': weight_dtype,
        'out': act_dtype
    }

    (outputs, placeholders, variables, constants,
     operators) = ng.from_onnx(onnx_filename,
                               value_dtypes=value_dtypes,
                               default_placeholder_dtype=act_dtype,
                               default_variable_dtype=weight_dtype,
                               default_constant_dtype=weight_dtype,
                               default_operator_dtype=act_dtype,
                               default_scale_dtype=ng.int32,
                               default_bias_dtype=ng.int32,
                               disable_fusion=disable_fusion)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    # Fixed factor of 128 for dtypes wider than 8 bits, otherwise half of the
    # positive range of the dtype.
    if act_dtype.width > 8:
        act_scale_factor = 128
    else:
        act_scale_factor = int(round(2**(act_dtype.width - 1) * 0.5))

    input_scale_factors = {'act': act_scale_factor}

    ng.quantize(outputs, input_scale_factors)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=par_ich,
                         par_och=par_och,
                         par_row=par_row,
                         par_col=par_col,
                         concur_och=concur_och)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    act = placeholders['act']
    out = outputs['out']

    # verification data
    # random data
    std = 0.2
    mean = 0.5
    img = np.random.normal(size=act.length).astype(np.float32).reshape(
        act.shape)
    img = img * std + mean

    # execution on pytorch
    model_input = img

    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    # Only permute the output back when it has the same rank as the input;
    # the flattened 2-D output of this model is left untouched.
    if act.perm is not None and len(model_out.shape) == len(act.shape):
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    # software-based verification
    # Scale the input, clamp it to the symmetric signed range of the
    # placeholder dtype and round to integers before feeding ng.eval.
    vact = img * act_scale_factor
    vact = np.clip(vact, -1.0 * (2**(act.dtype.width - 1) - 1),
                   1.0 * (2**(act.dtype.width - 1) - 1))
    vact = np.round(vact).astype(np.int64)

    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    # Quality metrics of the quantized result versus the float model.  They
    # are not used below; they are available when the breakpoint is enabled.
    mean_square_error = np.sum((vout - scaled_model_out)**2) / vout.size
    corrcoef = np.corrcoef(model_out.reshape([-1]), vout.reshape([-1]))

    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    targ = ng.to_veriloggen([out],
                            'onnx_matrix_conv2d_transpose_linear',
                            silent=silent,
                            config={
                                'maxi_datawidth': axi_datawidth,
                                'chunk_size': chunk_size
                            })

    # --------------------
    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    if simtype is None:
        sys.exit()

    # to memory image
    param_data = ng.export_ndarray([out], chunk_size)
    param_bytes = len(param_data)

    # Each region starts at the next multiple of chunk_size after the
    # previous one: placeholder, parameters, expected output, temporaries.
    variable_addr = int(math.ceil(
        (act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil(
        (variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil(
        (check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)],
                   dtype=np.int64)
    # Every word of the image starts out as 100 rather than 0.
    mem = mem + [100]

    # placeholder
    axi.set_memory(
        mem, vact, memimg_datawidth, act_dtype.width, act.addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(
        mem, vout, memimg_datawidth, act_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # The target exposes an active-low reset; derive an active-high one for
    # the test-bench components.
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Control thread: starts the accelerator through the AXI-Lite port, waits
    # for completion, reports the elapsed time_counter ticks and compares the
    # result region with the expected region.
    def ctrl():
        # Idle loop before touching the control port.
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(out.shape[0]):
            for j in range(out.shape[1]):
                # Rows are addressed with the aligned row length, but only
                # the first out.shape[1] words of each row are compared.
                orig = memory.read_word(i * out.aligned_shape[1] + j, out.addr,
                                        act_dtype.width)
                check = memory.read_word(i * out.aligned_shape[1] + j,
                                         check_addr, act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', i, j, ') orig: ', orig, 'check: ', check)
                    ok = False
                # else:
                #    print('OK (', i, j, ') orig: ', orig, 'check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # Watchdog: end the simulation after this delay even if ctrl never
    # reaches vthread.finish().
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # `lines` is empty when the simulator produced no output; test for that
    # first so that indexing the last line cannot raise IndexError.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def mkLed(memory_datawidth=128):
    """Build the 'blinkled' module that exercises DMA with a multibank RAM.

    Four 32-bit RAMs are grouped into one ``vthread.MultibankRAM``.  A thread
    writes two different test patterns to global memory through ``myaxi``,
    reads both back and checks every bank word.  The thread is started with
    ``size=array_len``.

    Args:
        memory_datawidth: Data width given to the ``vthread.AXIM`` interface.

    Returns:
        The constructed ``Module``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    numbanks = 4
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, memory_datawidth)

    myrams = [vthread.RAM(m, 'myram_%d' % i, clk, rst, datawidth, addrwidth)
              for i in range(numbanks)]
    myram = vthread.MultibankRAM(rams=myrams, name='myram')

    # Result flag shared by blink() and body(); set via `.value`.
    all_ok = m.TmpReg(initval=0)

    array_len = 16
    # Global-address distance between the first and the second test region
    # (see the `gaddr = array_size + offset` lines in body()).
    array_size = (array_len + array_len) * 4 * numbanks

    # Thread entry point: runs body() four times at different global offsets.
    # Prints 'ALL OK' only when no mismatch was recorded; nothing is printed
    # on failure apart from the per-word messages in body().
    def blink(size):
        all_ok.value = True

        for i in range(4):
            print('# iter %d start' % i)
            # Test for 4KB boundary check
            # Each iteration starts 4 bytes below myaxi.boundary_size within
            # a 16 KiB window - presumably so the transfer crosses a
            # boundary; confirm against the AXIM implementation.
            offset = i * 1024 * 16 + (myaxi.boundary_size - 4)
            body(size, offset)
            print('# iter %d end' % i)

        if all_ok:
            print('ALL OK')

    # One round trip: write pattern (i + 100 + bank) to `offset`, write
    # pattern (i + 1000 + bank) to `array_size + offset`, then read each
    # region back into the RAM and compare it with its pattern.
    def body(size, offset):
        # write
        for bank in range(numbanks):
            for i in range(size):
                wdata = i + 100 + bank
                myram.write_bank(bank, i, wdata)

        laddr = 0
        gaddr = offset
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # write
        for bank in range(numbanks):
            for i in range(size):
                wdata = i + 1000 + bank
                myram.write_bank(bank, i, wdata)

        laddr = 0
        gaddr = array_size + offset
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # read
        laddr = 0
        gaddr = offset
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read:  [%d] <- [%d]' % (laddr, gaddr))

        for bank in range(numbanks):
            for i in range(size):
                rdata = myram.read_bank(bank, i)
                if vthread.verilog.NotEql(rdata, i + 100 + bank):
                    print('rdata[%d] = %d' % (i, rdata))
                    all_ok.value = False

        # read
        laddr = 0
        gaddr = array_size + offset
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read:  [%d] <- [%d]' % (laddr, gaddr))

        for bank in range(numbanks):
            for i in range(size):
                rdata = myram.read_bank(bank, i)
                if vthread.verilog.NotEql(rdata, i + 1000 + bank):
                    print('rdata[%d] = %d' % (i, rdata))
                    all_ok.value = False

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    fsm = th.start(array_len)

    return m
Example #22
0
def mkTest(baudrate=19200, clockfreq=19200 * 10):
    """Build the 'test' module that exercises the mkLed design over UART.

    The module returned by ``mkLed(baudrate, clockfreq)`` is instantiated as
    ``uut``.  A UartTx/UartRx pair is created in the testbench and cross-wired
    to the ``urx``/``utx`` signals of the uut.  A vthread then sends ten
    values (100..109) and compares every reply against ``sent value + sw``.

    Args:
        baudrate: forwarded to mkLed, UartTx and UartRx.
        clockfreq: forwarded to mkLed, UartTx and UartRx.

    Returns:
        The constructed testbench Module.
    """
    m = Module('test')

    # target instance
    led = mkLed(baudrate, clockfreq)

    # 'utx' and 'urx' are requested as wires so they can be driven/observed
    # by the testbench UART instances below.
    uut = Submodule(m, led, name='uut', prefix='', as_wire=('utx', 'urx'))
    clk = uut['CLK']
    rst = uut['RST']
    tx = uut['utx']
    rx = uut['urx']
    sw = uut['sw']

    uart_tx = UartTx(m,
                     'inst_tx',
                     'tx_',
                     clk,
                     rst,
                     as_wire='txd',
                     baudrate=baudrate,
                     clockfreq=clockfreq)
    uart_rx = UartRx(m,
                     'inst_rx',
                     'rx_',
                     clk,
                     rst,
                     as_wire='rxd',
                     baudrate=baudrate,
                     clockfreq=clockfreq)

    # Cross-connect: the testbench transmitter drives the uut receive line and
    # the uut transmit line drives the testbench receiver.
    txd = uart_tx['txd']
    rxd = uart_rx['rxd']
    rx.assign(txd)
    rxd.assign(tx)

    #simulation.setup_waveform(m, uut, uart_tx, uart_rx)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, rst, m.make_reset(), period=100)

    # After reset: set sw to 10, then issue the 'finish' system task after
    # Delay(1000000) so the simulation ends even if the thread never finishes.
    init.add(sw(10), Delay(1000000), Systask('finish'))

    # NOTE(review): this register is not used by test() below, which assigns
    # its own local `all_ok`; it still adds a TmpReg to the module.
    all_ok = m.TmpReg(initval=0)

    def test():
        # Thread body: send 100..109 and expect each echoed value to equal
        # the sent value plus `sw`.
        all_ok = True
        for i in range(10):
            s = 100 + i
            uart_tx.send(s)
            r = uart_rx.recv()
            if vthread.verilog.Eql(r, s + sw):
                print('OK: %d + %d === %d' % (s, sw, r))
            else:
                print('NG: %d + %d !== %d' % (s, sw, r))
                all_ok = False

        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'test', clk, rst, test)
    th.start()

    return m
Example #23
0
def mkLed(memory_datawidth=128):
    """Build the 'blinkled' module that round-trips RAM contents through DMA.

    A vthread fills a RAM with two different patterns (``i + 100`` and
    ``i + 1000``), DMA-writes each to its own global address through an AXI
    master, then DMA-reads both regions back and compares them against the
    expected patterns.  This is repeated for four different offsets.

    Args:
        memory_datawidth: passed to vthread.AXIM as its fifth positional
            argument and used (divided by 8) when computing the global
            offset.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, memory_datawidth)
    myram = vthread.RAM(m, 'myram', clk, rst, datawidth, addrwidth)

    # Shared pass/fail flag: set by blink(), cleared by body() on a mismatch.
    all_ok = m.TmpReg(initval=0)

    def blink(size):
        # Thread entry point: run body() four times and report the verdict.
        all_ok.value = True

        for i in range(4):
            print('# iter %d start' % i)
            # Test for 4KB boundary check
            # The offset is placed memory_datawidth // 8 below
            # myaxi.boundary_size within each 16 * 1024 stride.
            offset = i * 1024 * 16 + (myaxi.boundary_size -
                                      memory_datawidth // 8)
            body(size, offset)
            print('# iter %d end' % i)

        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    def body(size, offset):
        # One write/write/read/read round trip starting at global `offset`.

        # write
        for i in range(size):
            wdata = i + 100
            myram.write(i, wdata)

        laddr = 0
        gaddr = offset
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # write
        for i in range(size):
            wdata = i + 1000
            myram.write(i, wdata)

        # Second region is placed (size + size) * 4 above the first one so
        # that the two transfers target different global addresses.
        laddr = 0
        gaddr = (size + size) * 4 + offset
        myaxi.dma_write(myram, laddr, gaddr, size)
        print('dma_write: [%d] -> [%d]' % (laddr, gaddr))

        # read
        laddr = 0
        gaddr = offset
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read:  [%d] <- [%d]' % (laddr, gaddr))

        # The first region must still hold the i + 100 pattern.
        for i in range(size):
            rdata = myram.read(i)
            if vthread.verilog.NotEql(rdata, i + 100):
                print('rdata[%d] = %d' % (i, rdata))
                all_ok.value = False

        # read
        laddr = 0
        gaddr = (size + size) * 4 + offset
        myaxi.dma_read(myram, laddr, gaddr, size)
        print('dma_read:  [%d] <- [%d]' % (laddr, gaddr))

        # The second region must hold the i + 1000 pattern.
        for i in range(size):
            rdata = myram.read(i)
            if vthread.verilog.NotEql(rdata, i + 1000):
                print('rdata[%d] = %d' % (i, rdata))
                all_ok.value = False

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # blink() is started with size=17.
    fsm = th.start(17)

    return m
def mkLed():
    """Build the 'blinkled' module sharing a MultibankRAM between RTL and a thread.

    A two-port MultibankRAM is created.  A vthread writes ``i + 100`` into it
    through ``myram.write`` and raises ``write_done``; a hand-written FSM,
    attached to port 1 via ``connect_rtl``, waits for ``write_done`` and then
    performs its own write phase followed by a read phase that accumulates
    the read values into ``sum``.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    numbanks = 4
    # NOTE(review): the subtrahend is log2(addrwidth), not log2(numbanks).
    # This looks like it was meant to reserve the bank-select bits
    # (log2(numbanks)) -- confirm before changing, since it alters the
    # generated RAM size.
    ram_addrwidth = addrwidth - int(math.log(addrwidth, 2))
    myram = vthread.MultibankRAM(m, 'myram', clk, rst, datawidth, ram_addrwidth,
                                 numbanks=numbanks, numports=2)

    read_size = 10
    write_size = read_size

    # Handshake flag written by the thread and polled by the FSM.
    write_done = m.Reg('write_done', initval=0)

    # Native signals that are connected to RAM port 1 further below.
    addr = m.Reg('addr', addrwidth, initval=0)
    wdata = m.Reg('wdata', datawidth, initval=0)
    wenable = m.Reg('wenable', initval=0)
    rdata = m.Wire('rdata', datawidth)
    # NOTE: shadows the builtin `sum` inside this function.
    sum = m.Reg('sum', datawidth, initval=0)

    fsm = FSM(m, 'fsm', clk, rst)
    # Leave the first state only once the thread has set write_done.
    fsm.If(write_done).goto_next()

    # write
    # addr and wdata start at -1 so the first inc() yields 0.
    fsm(
        addr(-1),
        wdata(-1),
        wenable(0)
    )
    fsm.goto_next()

    fsm(
        addr.inc(),
        wdata.inc(),
        wenable(1)
    )
    fsm.Delay(1)(
        Display('wdata =  %d', wdata),
        wenable(0)
    )
    fsm.If(addr == write_size - 2).goto_next()

    # read
    fsm(
        addr(-1),
        wenable(0)
    )
    fsm.goto_next()

    fsm(
        addr.inc()
    )
    # The read value is consumed in a statement delayed by 2 relative to the
    # address increment.
    fsm.Delay(2)(
        sum.add(rdata),
        Display('rdata =  %d', rdata)
    )
    fsm.If(addr == read_size - 2).goto_next()

    # Two extra states before the sum is displayed.
    fsm.goto_next()
    fsm.goto_next()

    # sum
    fsm(
        Display('sum =  %d', sum)
    )
    fsm.goto_next()

    # connect ports to RAM
    myram.connect_rtl(1, addr, wdata, wenable, rdata)

    def blink(times):
        # Thread body: write i + 100 to addresses 0..times-1, then signal the
        # FSM through write_done.  `wdata` here is a local of blink(), not the
        # `wdata` register declared above.
        write_done.value = 0
        for i in range(times):
            wdata = i + 100
            myram.write(i, wdata)
            print('wdata = %d' % wdata)
        write_done.value = 1

    th = vthread.Thread(m, 'th_blink', clk, rst, blink)
    # NOTE: rebinds the name `fsm`, which referred to the FSM object above.
    fsm = th.start(read_size)

    return m
Example #25
0
def mkLed():
    """Build the 'blinkled' module comparing a patterned stream add with a loop.

    Two RAMs are loaded by DMA, added element-wise into a third RAM once by a
    vthread.Stream that uses (size, stride) access patterns and once by a
    sequential loop, and one word of each result is compared.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    shape = [16, 4, 8]
    # Total number of elements: the product of all dimensions (16 * 4 * 8).
    size = functools.reduce(lambda x, y: x * y, shape, 1)
    order = [1, 2, 0]

    def to_pattern(shape, order):
        # Plain Python helper (evaluated while building the module): for each
        # axis p, in the given order, emit (shape[p], stride) where the stride
        # is the product of all dimensions after p.
        # For shape [16, 4, 8] and order [1, 2, 0] this yields
        # [(4, 8), (8, 1), (16, 32)].
        pattern = []
        for p in order:
            size = shape[p]
            stride = functools.reduce(lambda x, y: x * y, shape[p + 1:], 1)
            pattern.append((size, stride))
        return pattern

    # The same pattern is used for both sources and for the sink.
    pattern_a = to_pattern(shape, order)
    pattern_b = to_pattern(shape, order)
    pattern_c = to_pattern(shape, order)

    # Stream definition: c = a + b.
    strm = vthread.Stream(m, 'mystream', clk, rst)
    a = strm.source('a')
    b = strm.source('b')
    c = a + b
    strm.sink(c, 'c')

    def comp_stream(offset):
        # Bind the RAMs with their access patterns, run the stream and wait
        # for it to complete.
        strm.set_source_pattern('a', ram_a, offset, pattern_a)
        strm.set_source_pattern('b', ram_b, offset, pattern_b)
        strm.set_sink_pattern('c', ram_c, offset, pattern_c)
        strm.run()
        strm.join()

    def comp_sequential(offset):
        # Reference implementation: element-wise add in a plain loop.
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a + b
            ram_c.write(i + offset, sum)

    def check(offset_stream, offset_seq):
        # Compares a single word of ram_c at each of the two offsets.
        all_ok = True
        st = ram_c.read(offset_stream)
        sq = ram_c.read(offset_seq)
        if vthread.verilog.NotEql(st, sq):
            all_ok = False

        if all_ok:
            print('OK')
        else:
            print('NG')

    def comp():
        # Thread entry point.

        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream(offset)
        # The last argument is 1 here, whereas the dma_read calls pass `size`
        # in that position.
        myaxi.dma_write(ram_c, offset, 1024 * 4, 1)

        # sequential
        # The sequential run uses the RAM region starting at `size` so it does
        # not overwrite the stream result at offset 0.
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential(offset)
        myaxi.dma_write(ram_c, offset, 1024 * 8, 1)

        # verification
        check(0, offset)

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module testing a reduce-add stream and a substream.

    Two streams are defined:

    * ``macstrm`` multiplies sources ``a`` and ``b`` and feeds the product to
      ``ReduceAddValid`` together with the constant ``const``; both results of
      that call are written to sinks ``c`` and ``v``.
    * ``strm`` embeds ``macstrm`` as a substream, adds ``x`` to the substream
      output ``c`` and sinks the result with ``when=v``.

    For each stream the thread runs the stream version and a sequential
    reference version on separate RAM regions and compares the results.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    # Value passed as the 'const' constant of both streams; the sequential
    # models reset their accumulator every `reduce_size` iterations.
    reduce_size = 4

    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)
    ram_d = vthread.RAM(m, 'ram_d', clk, rst, datawidth, addrwidth)

    macstrm = vthread.Stream(m, 'macstream', clk, rst)
    macstrm_a = macstrm.source('a')
    macstrm_b = macstrm.source('b')
    macstrm_const = macstrm.constant('const')
    macstrm_mul = macstrm_a * macstrm_b
    macstrm_c, macstrm_v = macstrm.ReduceAddValid(macstrm_mul, macstrm_const)
    # NOTE(review): the purpose of adding 0 is not evident from this file --
    # confirm before removing.
    macstrm_v += 0
    macstrm.sink(macstrm_c, 'c')
    macstrm.sink(macstrm_v, 'v')

    strm = vthread.Stream(m, 'mystream', clk, rst)
    x = strm.source('x')
    y = strm.source('y')
    const = strm.constant('const')
    # Embed macstrm: route x, y and const into its inputs and take its two
    # sinks back as stream variables.
    sub = strm.substream(macstrm)
    sub.to_source('a', x)
    sub.to_source('b', y)
    sub.to_constant('const', const)
    z = sub.from_sink('c')
    v = sub.from_sink('v')
    z = z + x
    strm.sink(z, 'z', when=v, when_name='v')

    def comp_stream_macstrm(size, offset):
        # Run macstrm stand-alone: 'c' goes to ram_c and 'v' to ram_d.
        macstrm.set_source('a', ram_a, offset, size)
        macstrm.set_source('b', ram_b, offset, size)
        macstrm.set_constant('const', reduce_size)
        macstrm.set_sink('c', ram_c, offset, size)
        macstrm.set_sink('v', ram_d, offset, size)
        macstrm.run()
        macstrm.join()

    def comp_stream_mystrm(size, offset):
        # Run the outer stream; the sink is sized size // reduce_size.
        strm.set_source('x', ram_a, offset, size)
        strm.set_source('y', ram_b, offset, size)
        strm.set_constant('const', reduce_size)
        strm.set_sink('z', ram_c, offset, size // reduce_size)
        strm.run()
        strm.join()

    def comp_sequential_macstrm(size, offset):
        # Sequential model of macstrm: running sum of a * b written on every
        # iteration, reset after every reduce_size elements.
        sum = 0
        count = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum += a * b
            count += 1
            ram_c.write(i + offset, sum)
            ram_d.write(i + offset, count == (reduce_size - 1))
            if count == reduce_size:
                sum = 0
                count = 0

    def comp_sequential_mystrm(size, offset):
        # Sequential model of the outer stream: one output (sum + x) per
        # reduce_size inputs, written to consecutive addresses.
        sum = 0
        count = 0
        write_offset = offset
        for i in range(size):
            x = ram_a.read(i + offset)
            y = ram_b.read(i + offset)
            sum += x * y
            val = sum + x
            count += 1
            if count == reduce_size:
                ram_c.write(write_offset, val)
                write_offset += 1
                sum = 0
                count = 0

    def check(size, offset_stream, offset_seq):
        # Compare `size` words of ram_c at the two offsets and print every
        # mismatching index with both values.
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
                print(i, st, sq)
        if all_ok:
            print('OK')
        else:
            print('NG')

    def comp(size):
        # Thread entry point: test macstrm first, then the outer stream.

        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream_macstrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential_macstrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size)

        # verification
        print('# macstream')
        check(size, 0, offset)

        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream_mystrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size // reduce_size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential_mystrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size // reduce_size)

        # verification
        # Only size // reduce_size outputs are produced by the outer stream.
        print('# mystream')
        check(size // reduce_size, 0, offset)

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    # comp() is started with size=16.
    fsm = th.start(16)

    return m
def run(
        act_dtype=ng.int16,
        weight_dtype=ng.int8,
        bias_dtype=ng.int32,
        scale_dtype=ng.int8,
        with_batchnorm=True,
        disable_fusion=False,
        conv2d_par_ich=1,
        conv2d_par_och=1,
        conv2d_par_col=1,
        conv2d_par_row=1,
        conv2d_concur_och=None,
        conv2d_stationary='filter',
        pool_par=1,
        elem_par=1,
        chunk_size=64,
        axi_datawidth=32,
        silent=False,
        filename=None,
        # simtype='iverilog',
        # simtype='verilator',
        simtype=None,  # no RTL simulation
        outputfile=None):
    """Convert torchvision ResNet-18 to hardware with NNgen and optionally simulate it.

    Steps: export the PyTorch model to ONNX, import it as an NNgen dataflow,
    quantize it, assign parallelism attributes, compare the NNgen software
    evaluation with PyTorch (per layer and final top-10), generate IP-XACT,
    and -- when ``simtype`` is given -- build a testbench and run an RTL
    simulation that checks the hardware output against the software result.

    Side effects: writes ``resnet18_imagenet.onnx``, reads ``car.png`` and
    ``imagenet_class_index.json`` from the current directory, prints the
    top-10 comparison, and calls ``sys.exit()`` when ``simtype`` is None.

    Args:
        act_dtype: default placeholder and operator dtype for the import; its
            width also selects the activation scale factor and the memory
            image element width.
        weight_dtype: default variable and constant dtype for the import.
        bias_dtype: default bias dtype for the import.
        scale_dtype: default scale dtype for the import.
        with_batchnorm: must be True; any other value raises ValueError.
        disable_fusion: forwarded to ``ng.from_onnx``.
        conv2d_par_ich, conv2d_par_och, conv2d_par_col, conv2d_par_row,
        conv2d_concur_och, conv2d_stationary: attributes applied to every
            ``ng.conv2d`` operator.
        pool_par: ``par`` attribute applied to pooling operators.
        elem_par: ``par`` attribute applied to element-wise operators.
        chunk_size: alignment used for the memory-image addresses and passed
            to ``ng.export_ndarray``.
        axi_datawidth: used as the ``maxi_datawidth`` config value and as the
            data width of the AXI memory model.
        silent: forwarded to ``ng.to_ipxact``.
        filename: if not None, the testbench Verilog is written to this path.
        simtype: simulator name passed to ``simulation.Simulator``; None
            skips the RTL simulation (the process exits).
        outputfile: simulation output file name; defaults to this script's
            base name with a ``.out`` suffix.

    Returns:
        The text returned by the simulator run.  For ``simtype ==
        'verilator'`` a last line starting with ``-`` is removed.

    Raises:
        ValueError: if ``with_batchnorm`` is false.
    """

    # input mean and standard deviation
    imagenet_mean = np.array([0.485, 0.456, 0.406]).astype(np.float32)
    imagenet_std = np.array([0.229, 0.224, 0.225]).astype(np.float32)

    act_shape = (1, 224, 224, 3)

    if not with_batchnorm:
        raise ValueError('with_batchnorm must be True for ResNet18.')

    # pytorch model
    model = torchvision.models.resnet18(pretrained=True)

    # Pytorch to ONNX
    onnx_filename = 'resnet18_imagenet.onnx'
    # act_shape is channel-last; swapping axes 1 and 3 gives the
    # channel-first layout fed to the PyTorch model.
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    model.eval()
    torch.onnx.export(model,
                      dummy_input,
                      onnx_filename,
                      input_names=input_names,
                      output_names=output_names)

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    dtypes = {}
    (outputs, placeholders, variables, constants,
     operators) = ng.from_onnx(onnx_filename,
                               value_dtypes=dtypes,
                               default_placeholder_dtype=act_dtype,
                               default_variable_dtype=weight_dtype,
                               default_constant_dtype=weight_dtype,
                               default_operator_dtype=act_dtype,
                               default_scale_dtype=scale_dtype,
                               default_bias_dtype=bias_dtype,
                               disable_fusion=disable_fusion)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    # Fixed scale of 128 for activations wider than 8 bits; otherwise half of
    # the positive range of the activation type.
    if act_dtype.width > 8:
        act_scale_factor = 128
    else:
        act_scale_factor = int(round(2**(act_dtype.width - 1) * 0.5))

    input_scale_factors = {'act': act_scale_factor}
    input_means = {'act': imagenet_mean * act_scale_factor}
    input_stds = {'act': imagenet_std * act_scale_factor}

    ng.quantize(outputs, input_scale_factors, input_means, input_stds)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=conv2d_par_ich,
                         par_och=conv2d_par_och,
                         par_col=conv2d_par_col,
                         par_row=conv2d_par_row,
                         concur_och=conv2d_concur_och,
                         stationary=conv2d_stationary)

        if isinstance(op, (ng.avg_pool, ng.max_pool, ng.avg_pool_serial,
                           ng.max_pool_serial)):
            op.attribute(par=pool_par)

        if ng.is_elementwise_operator(op):
            op.attribute(par=elem_par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    act = placeholders['act']
    out = outputs['out']

    # verification data
    img = np.array(PIL.Image.open('car.png').convert('RGB')).astype(np.float32)
    img = img.reshape([1] + list(img.shape))

    # Normalize to [0, 1] and then with the ImageNet mean / std.
    img = img / 255
    img = (img - imagenet_mean) / imagenet_std

    # execution on pytorch
    model_input = np.broadcast_to(img, act_shape)

    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    if act.perm is not None and len(model_out.shape) == len(act.shape):
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    # software-based verification
    # Quantized input: scale, clip to the symmetric range of the activation
    # type, and round to integers.
    vact = img * act_scale_factor
    vact = np.clip(vact, -1.0 * (2**(act.dtype.width - 1) - 1),
                   1.0 * (2**(act.dtype.width - 1) - 1))
    vact = np.round(vact).astype(np.int64)
    vact = np.broadcast_to(vact, act_shape)

    # compare outputs of hidden layers
    # The first conv2d that is not a matmul is compared with the output of
    # model.conv1 -> bn1 -> relu below, hence the name.
    relu_op = [
        v for k, v in operators.items()
        if isinstance(v, ng.conv2d) and not isinstance(v, ng.matmul)
    ][0]
    maxpool_op = [
        v for k, v in operators.items()
        if isinstance(v, (ng.max_pool, ng.max_pool_serial))
    ][0]
    # The first eight ng.relu operators are paired, in order, with the
    # outputs of layer1[0], layer1, ..., layer4[0], layer4.
    relu_ops = [v for k, v in operators.items() if isinstance(v, ng.relu)]
    layer1_0_op = relu_ops[0]
    layer1_op = relu_ops[1]
    layer2_0_op = relu_ops[2]
    layer2_op = relu_ops[3]
    layer3_0_op = relu_ops[4]
    layer3_op = relu_ops[5]
    layer4_0_op = relu_ops[6]
    layer4_op = relu_ops[7]
    avgpool_op = [
        v for k, v in operators.items()
        if isinstance(v, (ng.avg_pool, ng.avg_pool_serial))
    ][0]
    fc_op = [v for k, v in operators.items() if isinstance(v, ng.matmul)][0]
    sub_ops = [
        relu_op, maxpool_op, layer1_0_op, layer1_op, layer2_0_op, layer2_op,
        layer3_0_op, layer3_op, layer4_0_op, layer4_op, avgpool_op, fc_op
    ]
    sub_outs = ng.eval(sub_ops, act=vact)
    # Every output except the last (fc) is transposed to channel-first so it
    # lines up with the PyTorch tensors.
    sub_outs = [sub_out.transpose([0, 3, 1, 2])
                for sub_out in sub_outs[:-1]] + sub_outs[-1:]
    sub_scale_factors = [sub_op.scale_factor for sub_op in sub_ops]

    # Reference outputs: run progressively longer prefixes of the PyTorch
    # model on the same input.
    model.eval()
    model_relu_out = nn.Sequential(model.conv1, model.bn1, model.relu)(
        torch.from_numpy(model_input)).detach().numpy()
    model_maxpool_out = nn.Sequential(
        model.conv1, model.bn1, model.relu,
        model.maxpool)(torch.from_numpy(model_input)).detach().numpy()

    #    class model_layer1_0(nn.Module):
    #        def __init__(self):
    #            super(model_layer1_0, self).__init__()
    #            self.conv1 = model.conv1
    #            self.bn1 = model.bn1
    #            self.relu = model.relu
    #            self.maxpool = model.maxpool
    #            self.layer1_0 = model.layer1[0]
    #
    #        def forward(self, x):
    #            x = self.relu(self.bn1(self.conv1(x)))
    #            x = self.maxpool(x)
    #            x = self.layer1_0(x)
    #            return x
    #
    #    model_layer1_0_out = model_layer1_0()(torch.from_numpy(model_input)).detach().numpy()

    model_layer1_0_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool,
        model.layer1[0])(torch.from_numpy(model_input)).detach().numpy()
    model_layer1_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool,
        model.layer1)(torch.from_numpy(model_input)).detach().numpy()

    model_layer2_0_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2[0])(torch.from_numpy(model_input)).detach().numpy()
    model_layer2_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2)(torch.from_numpy(model_input)).detach().numpy()

    model_layer3_0_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2,
        model.layer3[0])(torch.from_numpy(model_input)).detach().numpy()
    model_layer3_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2,
        model.layer3)(torch.from_numpy(model_input)).detach().numpy()

    model_layer4_0_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2, model.layer3,
        model.layer4[0])(torch.from_numpy(model_input)).detach().numpy()
    model_layer4_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2, model.layer3,
        model.layer4)(torch.from_numpy(model_input)).detach().numpy()

    model_avgpool_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2, model.layer3, model.layer4,
        model.avgpool)(torch.from_numpy(model_input)).detach().numpy()

    class Flatten(nn.Module):
        # Collapses everything after the batch dimension so the result can be
        # fed to model.fc.
        def forward(self, input):
            return input.view(input.size(0), -1)

    model_fc_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool,
        model.layer1, model.layer2, model.layer3, model.layer4, model.avgpool,
        Flatten(), model.fc)(torch.from_numpy(model_input)).detach().numpy()

    model_outs = [
        model_relu_out, model_maxpool_out, model_layer1_0_out,
        model_layer1_out, model_layer2_0_out, model_layer2_out,
        model_layer3_0_out, model_layer3_out, model_layer4_0_out,
        model_layer4_out, model_avgpool_out, model_fc_out
    ]
    scaled_outs = [
        model_out * scale_factor
        for model_out, scale_factor in zip(model_outs, sub_scale_factors)
    ]

    # Per-layer diagnostics.  They are computed but not printed or returned;
    # they are available for inspection at the commented-out breakpoint below.
    max_diffs = [
        model_out.max() / sub_out.max()
        for model_out, sub_out in zip(scaled_outs, sub_outs)
    ]
    overflows = [
        np.sum(np.abs(sub_out) >= abs(2**(sub_op.dtype.width - 1) - 1))
        for sub_op, sub_out in zip(sub_ops, sub_outs)
    ]
    mean_square_errors = [
        np.sum((sub_out - model_out)**2) / sub_out.size
        for model_out, sub_out in zip(scaled_outs, sub_outs)
    ]
    corrcoefs = [
        np.corrcoef(model_out.reshape([-1]), sub_out.reshape([-1]))
        for model_out, sub_out in zip(model_outs, sub_outs)
    ]

    # compare prediction results
    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    mean_square_error = np.sum((vout - scaled_model_out)**2) / vout.size
    corrcoef = np.corrcoef(model_out.reshape([-1]), vout.reshape([-1]))

    # Use a context manager so the label file is closed after loading.
    with open('imagenet_class_index.json', 'r') as class_index_file:
        class_index = json.load(class_index_file)
    labels = {int(key): value for (key, value) in class_index.items()}

    mout = scaled_model_out
    for bat in range(mout.shape[0]):
        m_top10 = list(
            sorted(enumerate(mout[bat]), key=lambda x: x[1],
                   reverse=True))[:10]
        m_top10_indexes = [index for index, value in m_top10]
        v_top10 = list(
            sorted(enumerate(vout[bat]), key=lambda x: x[1],
                   reverse=True))[:10]
        v_top10_indexes = [index for index, value in v_top10]
        num_hit = 0
        score = 0
        for index, value in m_top10:
            print("# mout: %s (%d) = %f" % (str(labels[index]), index, value))
        for index, value in v_top10:
            print("# vout: %s (%d) = %d" % (str(labels[index]), index, value))
            if index in m_top10_indexes:
                num_hit += 1
                # A class at the same rank in both lists scores 10; the score
                # drops by one per position of rank difference.
                score += 10 - abs(
                    m_top10_indexes.index(index) -
                    v_top10_indexes.index(index))
        print("# top-10 hit: %d" % num_hit)
        print("# top-10 score: %d" % score)

    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    # to Veriloggen object
    # targ = ng.to_veriloggen([out], 'resnet18', silent=silent,
    #                        config={'maxi_datawidth': axi_datawidth})

    # to IP-XACT (the method returns Veriloggen object, as well as to_veriloggen)
    targ = ng.to_ipxact([out],
                        'resnet18',
                        silent=silent,
                        config={'maxi_datawidth': axi_datawidth})

    # to Verilog HDL RTL (the method returns a source code text)
    # rtl = ng.to_verilog([out], 'resnet18', silent=silent,
    #                    config={'maxi_datawidth': axi_datawidth})

    # --------------------
    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    # NOTE: without a simulator the whole process exits here instead of
    # returning to the caller.
    if simtype is None:
        sys.exit()

    # to memory image
    param_data = ng.export_ndarray([out], chunk_size)
    param_bytes = len(param_data)

    # Memory layout, each region aligned up to chunk_size:
    # input | parameters | expected output | temporaries.
    variable_addr = int(math.ceil(
        (act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil(
        (variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil(
        (check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    # mem = np.zeros([1024 * 1024 * 256 // (memimg_datawidth // 8)], dtype=np.int64)
    mem = np.zeros([1024 * 1024 * 1024 // (memimg_datawidth // 8)],
                   dtype=np.int16)
    # Element-wise add: every word of the image starts out as 100, not 0.
    mem = mem + [100]

    # placeholder
    axi.set_memory(
        mem, vact, memimg_datawidth, act_dtype.width, act.addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(
        mem, vout, memimg_datawidth, act_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # The generated design exposes RESETN; the testbench components below are
    # given its inverse as RST.
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    def ctrl():
        # Testbench thread: configure and start the accelerator, wait for
        # completion, then compare the produced output with the expected one.

        # Idle loop of 100 iterations before touching the design.
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        # Compare each output word at out.addr with the expected word stored
        # at check_addr.
        ok = True
        for bat in range(out.shape[0]):
            for x in range(out.shape[1]):
                orig = memory.read_word(bat * out.aligned_shape[1] + x,
                                        out.addr, act_dtype.width)
                check = memory.read_word(bat * out.aligned_shape[1] + x,
                                         check_addr, act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', bat, x, ') orig: ', orig, ' check: ', check)
                    ok = False
                else:
                    print('OK (', bat, x, ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # Issue the 'finish' system task after Delay(10000000) so the simulation
    # ends even if ctrl() never reaches vthread.finish().
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # For verilator, drop a last output line that starts with '-'.
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Example #28
0
def mkLed(memory_datawidth=128):
    """Build the 'blinkled' module running a stream add on MultibankRAMs.

    Three MultibankRAMs (``numbanks`` banks each) hold the two inputs and the
    output of an element-wise add.  The add is executed once by a
    vthread.Stream and once by a sequential loop on separate RAM regions, and
    the two results are compared.

    Args:
        memory_datawidth: passed to vthread.AXIM as its fifth positional
            argument.

    Returns:
        The constructed Module.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    numbanks = 4
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, memory_datawidth)
    ram_a = vthread.MultibankRAM(m,
                                 'ram_a',
                                 clk,
                                 rst,
                                 datawidth,
                                 addrwidth,
                                 numbanks=numbanks)
    ram_b = vthread.MultibankRAM(m,
                                 'ram_b',
                                 clk,
                                 rst,
                                 datawidth,
                                 addrwidth,
                                 numbanks=numbanks)
    ram_c = vthread.MultibankRAM(m,
                                 'ram_c',
                                 clk,
                                 rst,
                                 datawidth,
                                 addrwidth,
                                 numbanks=numbanks)

    # Stream definition: c = a + b.
    strm = vthread.Stream(m, 'mystream', clk, rst)
    a = strm.source('a')
    b = strm.source('b')
    c = a + b
    strm.sink(c, 'c')

    def comp_stream(size, offset):
        # Bind the RAM regions, run the stream and wait for it to complete.
        strm.set_source('a', ram_a, offset, size)
        strm.set_source('b', ram_b, offset, size)
        strm.set_sink('c', ram_c, offset, size)
        strm.run()
        strm.join()

    def comp_sequential(size, offset):
        # Reference implementation: element-wise add in a plain loop.
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a + b
            ram_c.write(i + offset, sum)

    def check(size, offset_stream, offset_seq):
        # Compare `size` words of ram_c at the two offsets and print every
        # mismatching index with both values.
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
                print(i, st, sq)
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    def comp(size):
        # Thread entry point.  DMA transfers use `size`, while the RAM offset
        # of the second run and the length checked use size * numbanks.
        dma_size = size
        comp_size = size * numbanks

        # Stream version on the region starting at 0.
        dma_offset = 0
        comp_offset = 0
        myaxi.dma_read(ram_a, dma_offset, 0, dma_size)
        myaxi.dma_read(ram_b, dma_offset, 0, dma_size)
        comp_stream(size, comp_offset)
        myaxi.dma_write(ram_c, dma_offset, 1024, dma_size)

        # Sequential version on the region starting at comp_size.
        dma_offset = size
        comp_offset = comp_size
        myaxi.dma_read(ram_a, dma_offset, 0, dma_size)
        myaxi.dma_read(ram_b, dma_offset, 0, dma_size)
        comp_sequential(size, comp_offset)
        myaxi.dma_write(ram_c, dma_offset, 1024 * 2, dma_size)

        # NOTE(review): comp_stream / comp_sequential are called with `size`
        # but comp_size words are compared here -- confirm this is intended.
        check(comp_size, 0, comp_offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    # comp() is started with size=32.
    fsm = th.start(32)

    return m
def mkLed():
    """Build the 'blinkled' module comparing a patterned stream adder with a loop.

    The module has CLK and RST inputs, an AXI master (`myaxi`), three RAMs
    (`ram_a`, `ram_b`, `ram_c`) and a stream `mystream` whose sink 'sum' is
    the sum of sources 'a' and 'b'.  A thread `th_comp` runs `comp`, which
    computes the sums once with the stream and once with a sequential loop,
    then compares the two result regions of `ram_c`.

    Returns:
        The constructed module `m`.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    size = 16
    # NOTE(review): presumably a list of (count, stride) pairs, i.e. `size`
    # accesses with stride 0 (the same address every time) -- confirm against
    # the set_source_pattern documentation.  comp_sequential below reads one
    # fixed address per iteration, which is consistent with that reading.
    pattern = [(size, 0)]

    # Stream definition: sink 'sum' is the sum of sources 'a' and 'b'.
    strm = vthread.Stream(m, 'mystream', clk, rst)
    a = strm.source('a')
    b = strm.source('b')
    sum = a + b
    strm.sink(sum, 'sum')

    def comp_stream(offset):
        # Stream version: both sources use `pattern` starting at offset + 10;
        # the sink stores `size` results into ram_c starting at `offset`.
        strm.set_source_pattern('a', ram_a, offset + 10, pattern)
        strm.set_source_pattern('b', ram_b, offset + 10, pattern)
        strm.set_sink('sum', ram_c, offset, size)
        # Start the stream, then join it before returning to the caller.
        strm.run()
        strm.join()

    def comp_sequential(offset):
        # Reference version: on every iteration read the same address
        # (offset + 10) from ram_a and ram_b and write the sum to consecutive
        # ram_c addresses starting at `offset`.
        sum = 0
        for i in range(size):
            a = ram_a.read(offset + 10)
            b = ram_b.read(offset + 10)
            sum = a + b
            ram_c.write(i + offset, sum)

    def check(size, offset_stream, offset_seq):
        # Compare `size` entries of ram_c starting at offset_stream with the
        # entries starting at offset_seq and print a PASSED/FAILED verdict.
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            # NOTE(review): NotEql is presumably the inequality operator of
            # the underlying HDL layer -- confirm.
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    def comp():
        # Thread body.
        # NOTE(review): the dma_read/dma_write arguments appear to be
        # (ram, local address, global address, size) -- confirm.

        # Phase 1: load the operands at local address 0 and compute with the
        # stream.  The write-back passes 1 as its last argument, unlike the
        # reads, which pass `size`.
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream(offset)
        myaxi.dma_write(ram_c, offset, 1024 * 4, 1)

        # Phase 2: load the same global data (address 0) at local address
        # `size` and compute with the sequential loop.
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential(offset)
        myaxi.dma_write(ram_c, offset, 1024 * 8, 1)

        # Stream results start at 0, sequential results at `offset`.
        check(size, 0, offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start()

    return m
Example #30
0
def mkLed(matrix_size=16):
    """Build the 'blinkled' module that multiplies two matrices with a stream.

    The module has CLK and RST inputs, a `timer` register incremented through
    `seq`, three RAMs, an AXI master (`myaxi`), a stream `strm_madd` and a
    thread `th_matmul`.  The thread runs `matmul`, which performs the
    multiplication, prints the elapsed timer value, and checks the result.

    Args:
        matrix_size: Number of rows and columns of the square matrices; it is
            passed to the thread as its first start argument.

    Returns:
        The constructed module `m`.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    # Counter used by matmul to measure elapsed time.
    # NOTE(review): presumably incremented on every clock cycle -- confirm
    # against the Seq documentation.
    seq = Seq(m, 'seq', clk, rst)
    timer = m.Reg('timer', 32, initval=0)
    seq(timer.inc())

    datawidth = 32
    addrwidth = 10
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)

    def matmul(matrix_size, a_offset, b_offset, c_offset):
        # Thread body: sample the timer around comp(), print the difference,
        # then verify the result with check().
        start_time = timer
        comp(matrix_size, a_offset, b_offset, c_offset)
        end_time = timer
        time = end_time - start_time
        print("Time (cycles): %d" % time)
        check(matrix_size, a_offset, b_offset, c_offset)

    def strm_madd(strm, size, waddr):
        # Stream body: read `size` entries from ram_a and ram_b starting at
        # address 0, multiply them element-wise, and write one result to
        # ram_c at `waddr` when `valid` is asserted.
        # NOTE(review): RegionAdd presumably accumulates `size` products and
        # raises `valid` once the sum is complete -- confirm.
        a = strm.read(ram_a, 0, size)
        b = strm.read(ram_b, 0, size)
        sum, valid = strm.RegionAdd(a * b, size)
        strm.write(ram_c, waddr, 1, sum, when=valid)

    def comp(matrix_size, a_offset, b_offset, c_offset):
        # Compute the product one output line at a time.
        # NOTE(review): the dma_read/dma_write arguments appear to be
        # (ram, local address, global address, size), with global addresses
        # counted in bytes -- confirm.
        a_addr, c_addr = a_offset, c_offset

        for i in range(matrix_size):
            # Load the i-th line of the first operand into ram_a.
            myaxi.dma_read(ram_a, 0, a_addr, matrix_size)

            b_addr = b_offset
            for j in range(matrix_size):
                # Load the j-th line of the second operand into ram_b.
                myaxi.dma_read(ram_b, 0, b_addr, matrix_size)

                # Run strm_madd with size=matrix_size and waddr=j, so the
                # result lands in ram_c[j].  `stream` is assigned further
                # down in mkLed and resolved through the closure.
                stream.run(matrix_size, j)
                stream.join()

                # Advance by one line: matrix_size entries of datawidth // 8
                # bytes each.
                b_addr += matrix_size * (datawidth // 8)

            # Write the finished output line back and move to the next one.
            myaxi.dma_write(ram_c, 0, c_addr, matrix_size)
            a_addr += matrix_size * (datawidth // 8)
            c_addr += matrix_size * (datawidth // 8)

    def check(matrix_size, a_offset, b_offset, c_offset):
        # Read the result back line by line and compare it with the expected
        # matrix: (i + 1) * 2 on the diagonal and 0 everywhere else.  Every
        # mismatch is printed, followed by an overall OK/NG verdict.
        all_ok = True
        c_addr = c_offset
        for i in range(matrix_size):
            myaxi.dma_read(ram_c, 0, c_addr, matrix_size)
            for j in range(matrix_size):
                v = ram_c.read(j)
                if i == j and vthread.verilog.NotEql(v, (i + 1) * 2):
                    all_ok = False
                    print("NG [%d,%d] = %d" % (i, j, v))
                if i != j and vthread.verilog.NotEql(v, 0):
                    all_ok = False
                    print("NG [%d,%d] = %d" % (i, j, v))
            c_addr += matrix_size * (datawidth // 8)

        if all_ok:
            print("OK")
        else:
            print("NG")

    stream = vthread.Stream(m, 'strm_madd', clk, rst, strm_madd)
    th = vthread.Thread(m, 'th_matmul', clk, rst, matmul)
    # Start matmul with a_offset=0, b_offset=1024 and c_offset=2048.
    fsm = th.start(matrix_size, 0, 1024, 2048)

    return m