コード例 #1
0
def mkMain(n=128, datawidth=32, numports=2):
    """Build the 'main' module that feeds a dataflow counter into a RAM.

    The module receives CLK/RST inputs, a RAM named 'myram', a dataflow
    manager, an FSM named 'fsm', and a Seq named 'seq' that issues a
    ``display`` system task with port 0's address and write data under
    the condition of port 0's write enable.

    Args:
        n: Sizing parameter; the RAM address width is ``int(log2(n)) * 2``.
        datawidth: Data width handed to the RAM.
        numports: Accepted but not referenced by this function.

    Returns:
        The constructed ``Module``.
    """
    top = Module('main')

    clock = top.Input('CLK')
    reset = top.Input('RST')

    ram_addrwidth = 2 * int(math.log(n, 2))
    ram = RAM(top, 'myram', clock, reset, datawidth, ram_addrwidth)

    manager = dataflow.DataflowManager(top, clock, reset)
    control = FSM(top, 'fsm', clock, reset)

    # `payload` is the stream handed to the RAM write; a second counter of
    # size `period`, compared against 0, is passed as the `when` argument.
    period = 8
    payload = manager.Counter()
    gate_counter = manager.Counter(size=period)
    write_when = gate_counter == 0

    # Positional arguments of write_dataflow: port, address, data, length.
    write_port, write_addr, write_len = 0, 0, 32
    write_done = ram.write_dataflow(
        write_port, write_addr, payload, write_len,
        cond=control, when=write_when)

    control.goto_next()
    control.If(write_done).goto_next()

    monitor = Seq(top, 'seq', clock, reset)
    monitor.If(ram[0].wenable)(
        Systask('display', '[%d] <- %d', ram[0].addr, ram[0].wdata))

    return top
コード例 #2
0
def mkMain(n=128, datawidth=32, numports=2):
    """Create the 'main' module: conditional dataflow write into 'myram'.

    A free-running dataflow counter is written to RAM port 0 through
    ``write_dataflow``; a second counter of size 8, compared with 0, is
    supplied as the ``when`` argument. A Seq named 'seq' issues a
    ``display`` system task conditioned on port 0's write enable.

    Args:
        n: Sizing parameter; address width is ``int(log2(n)) * 2``.
        datawidth: Data width passed to the RAM constructor.
        numports: Present in the signature but unused in this body.

    Returns:
        The ``Module`` instance that was built.
    """
    mod = Module('main')
    clk_in = mod.Input('CLK')
    rst_in = mod.Input('RST')

    log2_n = int(math.log(n, 2))
    mem = RAM(mod, 'myram', clk_in, rst_in, datawidth, log2_n * 2)

    flow = dataflow.DataflowManager(mod, clk_in, rst_in)
    state_machine = FSM(mod, 'fsm', clk_in, rst_in)

    # Data stream and the size-8 counter used to gate the write.
    gate_size = 8
    stream = flow.Counter()
    gate = flow.Counter(size=gate_size)
    gate_is_zero = gate == 0

    port_index = 0
    start_addr = 0
    num_words = 32
    finished = mem.write_dataflow(port_index, start_addr, stream, num_words,
                                  cond=state_machine, when=gate_is_zero)

    # One unconditional transition, then one gated on the write's result.
    state_machine.goto_next()
    state_machine.If(finished).goto_next()

    tracer = Seq(mod, 'seq', clk_in, rst_in)
    tracer.If(mem[0].wenable)(
        Systask('display', '[%d] <- %d', mem[0].addr, mem[0].wdata)
    )

    return mod
コード例 #3
0
def mkMain():
    """Build the 'main' module: AXI read -> ReduceAdd -> RAM -> AXI write.

    The FSM named 'fsm' sequences an AXI read request, a RAM write of the
    reduced stream, an AXI write request and a RAM read that is forwarded
    to the AXI write channel. A Seq named 'seq' accumulates the AXI write
    data into the register 'sum' and displays it next to a value computed
    here in Python.

    Returns:
        The constructed ``Module``.
    """
    top = Module('main')
    clock = top.Input('CLK')
    reset = top.Input('RST')

    master = axi.AxiMaster(top, 'myaxi', clock, reset)
    ram = RAM(top, 'myram', clock, reset, numports=1)

    manager = dataflow.DataflowManager(top, clock, reset)
    control = FSM(top, 'fsm', clock, reset)

    # --- AXI read request ---
    read_addr = 1024
    read_len = 64
    ack, counter = master.read_request_counter(read_addr, read_len,
                                               cond=control)
    control.If(ack).goto_next()

    # --- AXI -> dataflow; ReduceAdd is reset by `axi_last.prev(1)` ---
    axi_data, axi_last, done = master.read_dataflow()
    reduced = manager.ReduceAdd(axi_data, reset=axi_last.prev(1))

    # --- dataflow -> RAM (port 0, address 0, same length as the read) ---
    ram_wport = 0
    ram_waddr = 0
    ram_wlen = read_len
    done = ram.write_dataflow(ram_wport, ram_waddr, reduced, ram_wlen,
                              cond=control)
    control.goto_next()
    control.If(done).goto_next()

    # --- AXI write request ---
    write_addr = 1024
    write_len = 64
    ack, counter = master.write_request_counter(write_addr, write_len,
                                                cond=control)
    control.If(ack).goto_next()

    # --- RAM -> dataflow ---
    ram_rport = 0
    ram_raddr = 0
    ram_rlen = read_len
    ram_data, ram_last, done = ram.read_dataflow(ram_rport, ram_raddr,
                                                 ram_rlen, cond=control)
    control.goto_next()

    # --- dataflow -> AXI write channel ---
    done = master.write_dataflow(ram_data, counter)
    control.If(done).goto_next()

    # --- verification ---
    total = top.Reg('sum', 32, initval=0)
    expected_sum = sum((read_addr + read_addr + i) * (i + 1) // 2
                       for i in range(read_len))

    monitor = Seq(top, 'seq', clock, reset)
    monitor.If(Ands(master.wdata.wvalid, master.wdata.wready))(
        total.add(master.wdata.wdata)
    )
    monitor.Then().If(master.wdata.wlast).Delay(1)(
        Systask('display', "sum=%d expected_sum=%d", total, expected_sum)
    )

    return top
コード例 #4
0
def mkMain():
    """Build the 'main' module: AXI read -> ReduceAdd -> RAM, then verify.

    The AXI master has its write side disabled. The reduced AXI read
    stream is written into 'myram'; the RAM is then read back as a
    dataflow, accumulated into the register 'sum', and displayed beside a
    value computed here in Python.

    Returns:
        The constructed ``Module``.
    """
    top = Module('main')
    clock = top.Input('CLK')
    reset = top.Input('RST')

    master = axi.AxiMaster(top, 'myaxi', clock, reset)
    master.disable_write()

    ram = RAM(top, 'myram', clock, reset, numports=1)

    manager = dataflow.DataflowManager(top, clock, reset)
    control = FSM(top, 'fsm', clock, reset)

    # --- AXI read request ---
    burst_addr = 1024
    burst_len = 64
    ack, axi_counter = master.read_request(burst_addr, burst_len,
                                           cond=control)
    control.If(ack).goto_next()

    # --- AXI -> dataflow; ReduceAdd is reset by `axi_last.prev(1)` ---
    axi_data, axi_last, done = master.read_dataflow()
    reduced = manager.ReduceAdd(axi_data, reset=axi_last.prev(1))

    # --- dataflow -> RAM ---
    write_port, write_addr, write_len = 0, 0, burst_len
    done = ram.write_dataflow(write_port, write_addr, reduced, write_len,
                              cond=control)
    control.goto_next()
    control.If(done).goto_next()

    # --- RAM -> dataflow, used for verification ---
    read_port, read_addr, read_len = 0, 0, burst_len
    stream, last, done = ram.read_dataflow(read_port, read_addr, read_len,
                                           cond=control)
    control.goto_next()
    control.If(done).goto_next()

    stream_data, stream_valid = stream.read()
    last_data, last_valid = last.read()

    total = top.Reg('sum', 32, initval=0)
    expected_sum = sum((burst_addr + burst_addr + i) * (i + 1) // 2
                       for i in range(burst_len))

    monitor = Seq(top, 'seq', clock, reset)
    monitor.If(stream_valid)(total.add(stream_data))
    monitor.Then().If(last_data == 1).Delay(1)(
        Systask('display', 'sum=%d expected_sum=%d', total, expected_sum))

    return top
コード例 #5
0
def mkMain():
    """Build the 'main' module: AXI read -> ReduceAdd -> RAM, self-checking.

    The AXI master has its write side disabled. The reduced AXI read
    stream is written into 'myram', read back as a dataflow and
    accumulated into the register 'sum'. The Seq named 'seq' then displays
    the sum and a '# verify: PASSED' / '# verify: FAILED' line depending
    on a comparison with a value computed here in Python.

    Returns:
        The constructed ``Module``.
    """
    top = Module('main')
    clock = top.Input('CLK')
    reset = top.Input('RST')

    master = axi.AxiMaster(top, 'myaxi', clock, reset)
    master.disable_write()

    ram = RAM(top, 'myram', clock, reset, numports=1)

    manager = dataflow.DataflowManager(top, clock, reset)
    control = FSM(top, 'fsm', clock, reset)

    # --- AXI read request ---
    burst_addr = 1024
    burst_len = 64
    ack, axi_counter = master.read_request_counter(burst_addr, burst_len,
                                                   cond=control)
    control.If(ack).goto_next()

    # --- AXI -> dataflow; ReduceAdd is reset by `axi_last.prev(1)` ---
    axi_data, axi_last, done = master.read_dataflow()
    reduced = manager.ReduceAdd(axi_data, reset=axi_last.prev(1))

    # --- dataflow -> RAM ---
    write_port = 0
    write_addr = 0
    write_len = burst_len
    done = ram.write_dataflow(write_port, write_addr, reduced, write_len,
                              cond=control)
    control.goto_next()
    control.If(done).goto_next()

    # --- RAM -> dataflow, used for verification ---
    read_port = 0
    read_addr = 0
    read_len = burst_len
    stream, last, done = ram.read_dataflow(read_port, read_addr, read_len,
                                           cond=control)
    control.goto_next()
    control.If(done).goto_next()

    stream_data, stream_valid = stream.read()
    last_data, last_valid = last.read()

    total = top.Reg('sum', 32, initval=0)
    expected_sum = sum((burst_addr + burst_addr + i) * (i + 1) // 2
                       for i in range(burst_len))

    monitor = Seq(top, 'seq', clock, reset)

    monitor.If(stream_valid)(
        total.add(stream_data)
    )
    monitor.Then().If(last_data == 1).Delay(1)(
        Systask('display', 'sum=%d expected_sum=%d', total, expected_sum),
        If(NotEql(total, expected_sum))(
            Display('# verify: FAILED')
        ).Else(
            Display('# verify: PASSED')
        )
    )

    return top
コード例 #6
0
def mkMain(n=128, datawidth=32, numports=2):
    """Build the 'main' module whose FSM accesses a RAM via write_rtl/read_rtl.

    The FSM named 'fsm' is assembled step by step; the order of the
    statements below (and of the interleaved ``goto_next`` calls) defines
    the generated state machine, so it must not be rearranged.

    Args:
        n: Sizing parameter; the RAM address width is ``int(log2(n)) * 2``.
        datawidth: Data width passed to the RAM.
        numports: Not referenced in this function; the RAM is created with
            a literal ``1`` as its last positional argument.

    Returns:
        The constructed ``Module``.
    """
    m = Module('main')

    clk = m.Input('CLK')
    rst = m.Input('RST')

    addrwidth = int(math.log(n, 2)) * 2

    # NOTE(review): the trailing 1 is presumably the port count -- confirm
    # against the RAM constructor.
    myram = RAM(m, 'myram', clk, rst, datawidth, addrwidth, 1)

    # example how to access RAM
    # NOTE: the local name `sum` shadows the Python builtin inside this
    # function.
    count = m.Reg('count', 32, initval=0)
    sum = m.Reg('sum', 32, initval=0)
    addr = m.Reg('addr', 32, initval=0)

    fsm = FSM(m, 'fsm', clk, rst)

    # First FSM step: assign 0 to addr, count and sum.
    fsm(addr(0), count(0), sum(0))

    fsm.goto_next()

    # Number of entries handled in each of the write and read phases below.
    step = 16

    # Write phase: `count` is written at `addr` through port 0, with the
    # FSM passed as the condition.
    myram.write_rtl(addr, count, port=0, cond=fsm)

    fsm(addr.inc(), count.inc())

    # When count reaches step - 1, clear addr/count ...
    fsm.If(count == step - 1)(addr(0), count(0))

    # ... and, under that same condition (presumably what Then() reuses),
    # move on.
    fsm.Then().goto_next()

    # Read phase: read `addr` through port 0, with the FSM as the condition.
    read_data, read_valid = myram.read_rtl(addr, port=0, cond=fsm)

    fsm(addr.inc(), count.inc())

    # Accumulate the read data into `sum` when read_valid is set.
    fsm.If(read_valid)(sum(sum + read_data))

    # Display `sum`, registered with Delay(1) on the preceding condition.
    fsm.Then().Delay(1)(Systask('display', "sum=%d", sum))

    fsm.If(count == step - 1)(addr(0), count(0)).Then().goto_next()

    # Same accumulate/display pair, registered again after the goto_next
    # above (presumably so the step that follows also captures read data).
    fsm.If(read_valid)(sum(sum + read_data))

    fsm.Then().Delay(1)(Systask('display', "sum=%d", sum))

    fsm.goto_next()

    # (step - 1) * step // 2 is the sum of the integers 0 .. step - 1.
    fsm(Systask('display', "expected_sum=%d", (step - 1) * step // 2))

    fsm.goto_next()

    fsm.make_always()

    return m
コード例 #7
0
def mkMain(n=128, datawidth=32, numports=2):
    """Build the 'main' module: counter -> RAM port 0, read back on port 1.

    A dataflow counter is written into 'myram' (64 entries from address
    0); 32 entries are then read back through port 1, accumulated into the
    register 'sum' and displayed beside a value computed here in Python.

    Args:
        n: Sizing parameter; the RAM address width is ``int(log2(n)) * 2``.
        datawidth: Data width passed to the RAM.
        numports: Not referenced in this body; the RAM is created with a
            literal ``2`` as its last positional argument.

    Returns:
        The constructed ``Module``.
    """
    top = Module('main')

    clock = top.Input('CLK')
    reset = top.Input('RST')

    ram_addrwidth = 2 * int(math.log(n, 2))
    ram = RAM(top, 'myram', clock, reset, datawidth, ram_addrwidth, 2)
    ram.disable_write(1)

    manager = dataflow.DataflowManager(top, clock, reset)
    control = FSM(top, 'fsm', clock, reset)

    # Source stream.
    stream_in = manager.Counter()

    # --- dataflow -> RAM ---
    write_port, write_addr, write_len = 0, 0, 64
    done = ram.write_dataflow(write_port, write_addr, stream_in, write_len,
                              cond=control)
    control.goto_next()
    control.If(done).goto_next()

    control.goto_next()

    # --- RAM -> dataflow ---
    read_port, read_addr, read_len = 1, 0, 32
    stream_out, last, done = ram.read_dataflow(read_port, read_addr,
                                               read_len, cond=control)
    control.goto_next()
    control.If(done).goto_next()

    # --- verification ---
    out_data, out_valid = stream_out.read()
    last_data, last_valid = last.read()

    total = top.Reg('sum', 32, initval=0)
    expected_sum = (read_addr + read_addr + read_len - 1) * read_len // 2

    monitor = Seq(top, 'seq', clock, reset)
    monitor.If(out_valid)(total.add(out_data))
    monitor.Then().If(last_data == 1).Delay(1)(
        Systask('display', 'sum=%d expected_sum=%d', total, expected_sum))

    return top
コード例 #8
0
def mkMain(n=128, datawidth=32, numports=2):
    """Build the 'main' module: gated counter write into RAM, self-checking.

    A dataflow counter is written to 'myram' port 0 through
    ``write_dataflow``; a size-8 counter compared with 0 is passed as the
    ``when`` argument. The Seq named 'seq' accumulates port 0's write data
    into the register 'sum' and then displays the sum together with a
    '# verify: PASSED' / '# verify: FAILED' line.

    Args:
        n: Sizing parameter; the RAM address width is ``int(log2(n)) * 2``.
        datawidth: Data width passed to the RAM.
        numports: Accepted but not referenced by this function.

    Returns:
        The constructed ``Module``.
    """
    top = Module('main')

    clock = top.Input('CLK')
    reset = top.Input('RST')

    ram_addrwidth = 2 * int(math.log(n, 2))
    ram = RAM(top, 'myram', clock, reset, datawidth, ram_addrwidth)

    manager = dataflow.DataflowManager(top, clock, reset)
    control = FSM(top, 'fsm', clock, reset)

    # Data stream and the size-`period` counter that gates the write.
    period = 8
    payload = manager.Counter()
    gate_counter = manager.Counter(size=period)
    write_when = gate_counter == 0

    write_port, write_addr, write_len = 0, 0, 32
    write_done = ram.write_dataflow(
        write_port, write_addr, payload, write_len,
        cond=control, when=write_when)

    control.goto_next()
    control.If(write_done).goto_next()

    # --- verification ---
    total = top.Reg('sum', 32, initval=0)
    expected_sum = (
        (write_addr + write_addr + (write_len - 1) * period)
        * write_len // 2
    )

    monitor = Seq(top, 'seq', clock, reset)
    monitor.If(ram[0].wenable)(total.add(ram[0].wdata))
    monitor.Then().If(ram[0].addr == write_len - 1).Delay(2)(
        Systask('display', 'sum=%d expected_sum=%d', total, expected_sum),
        If(NotEql(total, expected_sum))(
            Display('# verify: FAILED')
        ).Else(
            Display('# verify: PASSED')
        ))

    return top
コード例 #9
0
def mkMain(n=128, datawidth=32, numports=2):
    """Build the 'main' module: write (counter - 1) into RAM and sum it.

    A size-64 dataflow counter, decremented by one, is written to 'myram'
    port 0 (64 entries from address 0). The Seq named 'seq' accumulates
    port 0's write data into the register 'sum' and displays it beside a
    value computed here in Python.

    Args:
        n: Sizing parameter; the RAM address width is ``int(log2(n)) * 2``.
        datawidth: Data width passed to the RAM.
        numports: Not referenced in this body; the RAM is created with a
            literal ``2`` as its last positional argument.

    Returns:
        The constructed ``Module``.
    """
    top = Module('main')

    clock = top.Input('CLK')
    reset = top.Input('RST')

    ram_addrwidth = 2 * int(math.log(n, 2))
    ram = RAM(top, 'myram', clock, reset, datawidth, ram_addrwidth, 2)
    ram.disable_write(1)

    manager = dataflow.DataflowManager(top, clock, reset)
    control = FSM(top, 'fsm', clock, reset)

    control.goto_next()

    # Source stream: counter output minus one.
    shifted = manager.Counter(size=64) - 1

    # --- dataflow -> RAM ---
    write_port, write_addr, write_len = 0, 0, 64
    write_done = ram.write_dataflow(write_port, write_addr, shifted,
                                    write_len, cond=control)
    control.goto_next()
    control.If(write_done).goto_next()

    # --- verification ---
    total = top.Reg('sum', 32, initval=0)
    expected_sum = (
        (write_addr + write_addr + write_len - 1) * write_len // 2
        - write_len
    )

    monitor = Seq(top, 'seq', clock, reset)
    monitor.If(ram[0].wenable)(total.add(ram[0].wdata))
    monitor.Then().If(ram[0].addr == write_len - 1).Delay(2)(
        Systask('display', 'sum=%d expected_sum=%d', total, expected_sum))

    return top
コード例 #10
0
def mkMain(n=128, datawidth=32, numports=2):
    """Build the 'main' module: RAM write, then a two-output reuse read.

    A dataflow counter is written into 'myram' port 0 (64 entries). Port 1
    is then read with ``read_dataflow_reuse`` (two outputs, reuse size 4).
    Each output is accumulated into its own register ('sum0', 'sum1') and
    displayed; finally the combined sum is displayed beside a value
    computed here in Python.

    Args:
        n: Sizing parameter; the RAM address width is ``int(log2(n)) * 2``.
        datawidth: Data width passed to the RAM.
        numports: Not referenced in this body; the RAM is created with a
            literal ``2`` as its last positional argument.

    Returns:
        The constructed ``Module``.
    """
    top = Module('main')

    clock = top.Input('CLK')
    reset = top.Input('RST')

    ram_addrwidth = 2 * int(math.log(n, 2))
    ram = RAM(top, 'myram', clock, reset, datawidth, ram_addrwidth, 2)
    ram.disable_write(1)

    manager = dataflow.DataflowManager(top, clock, reset)
    control = FSM(top, 'fsm', clock, reset)

    # Source stream.
    stream_in = manager.Counter()

    # --- dataflow -> RAM ---
    write_port, write_addr, write_len = 0, 0, 64
    done = ram.write_dataflow(write_port, write_addr, stream_in, write_len,
                              cond=control)
    control.goto_next()
    control.If(done).goto_next()

    control.goto_next()

    # --- RAM -> dataflow with reuse ---
    read_port, read_addr, read_len = 1, 0, 32
    reuse = 4
    out0, out1, last, done = ram.read_dataflow_reuse(
        read_port, read_addr, read_len,
        num_outputs=2, reuse_size=reuse, cond=control)
    control.goto_next()
    control.If(done).goto_next()

    # --- verification ---
    out0_data, out0_valid = out0.read()
    out1_data, out1_valid = out1.read()
    last_data, last_valid = last.read()

    total0 = top.Reg('sum0', 32, initval=0)
    total1 = top.Reg('sum1', 32, initval=0)
    single_pass = (read_addr + read_addr + read_len - 1) * read_len // 2
    expected_sum = single_pass * reuse

    monitor = Seq(top, 'seq', clock, reset)

    monitor.If(out0_valid)(
        total0.add(out0_data),
        Systask('display', 'rdata0_data=%d', out0_data)
    )
    monitor.If(out1_valid)(
        total1.add(out1_data),
        Systask('display', 'rdata1_data=%d', out1_data)
    )
    monitor.Then().If(last_data == 1).Delay(1)(
        Systask('display', 'sum=%d expected_sum=%d',
                total0 + total1, expected_sum)
    )

    return top
コード例 #11
0
def mkMain(n=128, datawidth=32, numports=2):
    """Build the 'main' module: RAM write/read round trip, self-checking.

    A dataflow counter is written into 'myram' port 0 (64 entries); 32
    entries are read back through port 1 and accumulated into the register
    'sum'. The Seq named 'seq' then displays the sum and a
    '# verify: PASSED' / '# verify: FAILED' line depending on a comparison
    with a value computed here in Python.

    Args:
        n: Sizing parameter; the RAM address width is ``int(log2(n)) * 2``.
        datawidth: Data width passed to the RAM.
        numports: Not referenced in this body; the RAM is created with a
            literal ``2`` as its last positional argument.

    Returns:
        The constructed ``Module``.
    """
    top = Module('main')

    clock = top.Input('CLK')
    reset = top.Input('RST')

    ram_addrwidth = 2 * int(math.log(n, 2))
    ram = RAM(top, 'myram', clock, reset, datawidth, ram_addrwidth, 2)
    ram.disable_write(1)

    manager = dataflow.DataflowManager(top, clock, reset)
    control = FSM(top, 'fsm', clock, reset)

    # Source stream.
    stream_in = manager.Counter()

    # --- dataflow -> RAM ---
    write_port = 0
    write_addr = 0
    write_len = 64
    done = ram.write_dataflow(write_port, write_addr, stream_in, write_len,
                              cond=control)
    control.goto_next()
    control.If(done).goto_next()

    control.goto_next()

    # --- RAM -> dataflow ---
    read_port = 1
    read_addr = 0
    read_len = 32
    stream_out, last, done = ram.read_dataflow(read_port, read_addr,
                                               read_len, cond=control)
    control.goto_next()
    control.If(done).goto_next()

    # --- verification ---
    out_data, out_valid = stream_out.read()
    last_data, last_valid = last.read()

    total = top.Reg('sum', 32, initval=0)
    expected_sum = (read_addr + read_addr + read_len - 1) * read_len // 2

    monitor = Seq(top, 'seq', clock, reset)

    monitor.If(out_valid)(
        total.add(out_data)
    )
    monitor.Then().If(last_data == 1).Delay(1)(
        Systask('display', 'sum=%d expected_sum=%d', total, expected_sum),
        If(NotEql(total, expected_sum))(
            Display('# verify: FAILED')
        ).Else(
            Display('# verify: PASSED')
        )
    )

    return top
コード例 #12
0
ファイル: ram_rtl.py プロジェクト: PyHDI/veriloggen
def mkMain(n=128, datawidth=32, numports=2):
    """Build the 'main' module whose FSM accesses a RAM via write_rtl/read_rtl.

    The FSM named 'fsm' is assembled step by step; the order of the
    statements below (and of the interleaved ``goto_next`` calls) defines
    the generated state machine, so it must not be rearranged.

    Args:
        n: Sizing parameter; the RAM address width is ``int(log2(n)) * 2``.
        datawidth: Data width passed to the RAM.
        numports: Not referenced in this function; the RAM is created with
            a literal ``1`` as its last positional argument.

    Returns:
        The constructed ``Module``.
    """
    m = Module('main')

    clk = m.Input('CLK')
    rst = m.Input('RST')

    addrwidth = int(math.log(n, 2)) * 2

    # NOTE(review): the trailing 1 is presumably the port count -- confirm
    # against the RAM constructor.
    myram = RAM(m, 'myram', clk, rst, datawidth, addrwidth, 1)

    # example how to access RAM
    # NOTE: the local name `sum` shadows the Python builtin inside this
    # function.
    count = m.Reg('count', 32, initval=0)
    sum = m.Reg('sum', 32, initval=0)
    addr = m.Reg('addr', 32, initval=0)

    fsm = FSM(m, 'fsm', clk, rst)

    # First FSM step: assign 0 to addr, count and sum.
    fsm(
        addr(0),
        count(0),
        sum(0)
    )

    fsm.goto_next()

    # Number of entries handled in each of the write and read phases below.
    step = 16

    # Write phase: `count` is written at `addr` through port 0, with the
    # FSM passed as the condition.
    myram.write_rtl(addr, count, port=0, cond=fsm)

    fsm(
        addr.inc(),
        count.inc()
    )

    # When count reaches step - 1, clear addr/count ...
    fsm.If(count == step - 1)(
        addr(0),
        count(0)
    )

    # ... and, under that same condition (presumably what Then() reuses),
    # move on.
    fsm.Then().goto_next()

    # Read phase: read `addr` through port 0, with the FSM as the condition.
    read_data, read_valid = myram.read_rtl(addr, port=0, cond=fsm)

    fsm(
        addr.inc(),
        count.inc()
    )

    # Accumulate the read data into `sum` when read_valid is set.
    fsm.If(read_valid)(
        sum(sum + read_data)
    )

    # Display `sum`, registered with Delay(1) on the preceding condition.
    fsm.Then().Delay(1)(
        Systask('display', "sum=%d", sum)
    )

    fsm.If(count == step - 1)(
        addr(0),
        count(0)
    ).Then().goto_next()

    # Same accumulate/display pair, registered again after the goto_next
    # above (presumably so the step that follows also captures read data).
    fsm.If(read_valid)(
        sum(sum + read_data)
    )

    fsm.Then().Delay(1)(
        Systask('display', "sum=%d", sum)
    )

    fsm.make_always()

    return m
コード例 #13
0
ファイル: dataflow_stencil.py プロジェクト: yinxx/veriloggen
def mkStencil(n=16, size=3, datawidth=32, point=16, coe_test=False):
    """Build the 'stencil' module wrapping a stencil pipeline instance.

    The module declares CLK/RST/start inputs and a 'busy' output register,
    ``size`` source RAMs plus one destination RAM (port 1 of each is
    connected to an external RAM slave interface), a read FSM that feeds
    the source RAM read data into the pipeline instance, and a write FSM
    that stores the pipeline output into the destination RAM.

    Args:
        n: Number of entries processed per run; also used to derive the
            address widths (``int(log2(n))`` for the external interfaces,
            twice that for the internal RAMs).
        size: Number of source RAMs and of pipeline input ports. It is
            forwarded to ``mkStencilPipeline2D`` so the instantiated
            pipeline matches the ports and coefficients built here.
        datawidth: Data width of the RAMs, interfaces and 'odata' wire.
        point: ``point`` argument forwarded to the pipeline; it is also
            used for the constants built when ``coe_test`` is set, after
            which 0 is forwarded instead.
        coe_test: When true, a ``size`` x ``size`` matrix of
            ``dataflow.Constant(1, point=point)`` is passed as ``coe``.

    Returns:
        The constructed ``Module``.
    """
    m = Module('stencil')

    addrwidth = int(math.log(n, 2))

    clk = m.Input('CLK')
    rst = m.Input('RST')

    start = m.Input('start')
    busy = m.OutputReg('busy', initval=0)

    done = m.TmpReg(initval=0)

    # external RAM I/F
    ext_src_rams = [
        ram.RAMSlaveInterface(m,
                              'ext_src_ram%d' % i,
                              datawidth=datawidth,
                              addrwidth=addrwidth) for i in range(size)
    ]
    ext_dst_ram = ram.RAMSlaveInterface(m,
                                        'ext_dst_ram',
                                        datawidth=datawidth,
                                        addrwidth=addrwidth)

    # RAM (internal RAMs use twice the external address width)
    addrwidth = int(math.log(n, 2)) * 2

    src_rams = [
        RAM(m,
            'src_ram%d' % i,
            clk,
            rst,
            datawidth=datawidth,
            addrwidth=addrwidth,
            numports=2) for i in range(size)
    ]

    dst_ram = RAM(m,
                  'dst_ram',
                  clk,
                  rst,
                  datawidth=datawidth,
                  addrwidth=addrwidth,
                  numports=2)

    # connect RAM I/Fs: port 1 of every RAM goes to its external interface
    for src_ram, ext_src_ram in zip(src_rams, ext_src_rams):
        src_ram[1].connect(ext_src_ram)

    dst_ram[1].connect(ext_dst_ram)

    # read FSM
    read_fsm = FSM(m, 'read_fsm', clk, rst)
    read_count = m.Reg('read_count', 32, initval=0)
    read_addr = m.Reg('read_addr', 32, initval=0)

    read_fsm(read_addr(0), read_count(0), busy(0))

    read_fsm.If(start)(busy(1))

    read_fsm.Then().goto_next()

    read_fsm(read_addr.inc(), read_count.inc())

    # Port 0 of every source RAM is read-only here and is read at
    # `read_addr` with the read FSM as the condition.
    idata = []
    ivalid = []
    for src_ram in src_rams:
        src_ram.disable_write(0)
        rdata, rvalid = src_ram.read_rtl(read_addr, port=0, cond=read_fsm)
        idata.append(rdata)
        ivalid.append(rvalid)

    read_fsm.If(read_count == n - 1)(read_addr(0), read_count(0))

    read_fsm.Then().goto_next()

    read_fsm.If(done)(busy(0))

    read_fsm.Then().goto_init()

    read_fsm.make_always()

    # instance
    odata = m.Wire('odata', datawidth)
    ovalid = m.Wire('ovalid')

    ports = []
    ports.append(('CLK', clk))
    ports.append(('RST', rst))

    for i, (d, v) in enumerate(zip(idata, ivalid)):
        ports.append(('idata%d' % i, d))
        ports.append(('ivalid%d' % i, v))

    ports.append(('odata', odata))
    ports.append(('ovalid', ovalid))

    coe = None
    if coe_test:
        coe = [[dataflow.Constant(1, point=point) for i in range(size)]
               for j in range(size)]
        point = 0

    # Forward `size` rather than a literal 3: the port list and `coe` above
    # are sized by `size`, so the pipeline has to be built to match.
    st = mkStencilPipeline2D(size=size, width=datawidth, point=point, coe=coe)
    m.Instance(st, 'inst_stencil', ports=ports)

    skip_offset = size // 2

    # write FSM
    write_fsm = FSM(m, 'write_fsm', clk, rst)
    write_count = m.Reg('write_count', 32, initval=0)
    write_addr = m.Reg('write_addr', 32, initval=skip_offset)

    write_fsm(done(0))

    write_fsm.If(Ands(ovalid, write_count > skip_offset))(write_addr.inc())

    # The RAM write is conditioned on `write_fsm.then`, read right after the
    # If(...) above was registered.
    dst_ram.write_rtl(write_addr, odata, port=0, cond=write_fsm.then)

    write_fsm.If(ovalid)(write_count.inc())

    write_fsm.If(write_count == n)(write_count(0), write_addr(skip_offset),
                                   done(1))
    write_fsm.Then().goto_init()

    write_fsm.make_always()

    return m
コード例 #14
0
ファイル: dataflow_stencil.py プロジェクト: PyHDI/veriloggen
def mkStencil(n=16, size=3, datawidth=32, point=16, coe_test=False):
    """Build the 'stencil' module wrapping a stencil pipeline instance.

    The module declares CLK/RST/start inputs and a 'busy' output register,
    ``size`` source RAMs plus one destination RAM (port 1 of each is
    connected to an external RAM slave interface), a read FSM that feeds
    the source RAM read data into the pipeline instance, and a write FSM
    that stores the pipeline output into the destination RAM.

    Args:
        n: Number of entries processed per run; also used to derive the
            address widths (``int(log2(n))`` for the external interfaces,
            twice that for the internal RAMs).
        size: Number of source RAMs and of pipeline input ports. It is
            forwarded to ``mkStencilPipeline2D`` so the instantiated
            pipeline matches the ports and coefficients built here.
        datawidth: Data width of the RAMs, interfaces and 'odata' wire.
        point: ``point`` argument forwarded to the pipeline; it is also
            used for the constants built when ``coe_test`` is set, after
            which 0 is forwarded instead.
        coe_test: When true, a ``size`` x ``size`` matrix of
            ``dataflow.Constant(1, point=point)`` is passed as ``coe``.

    Returns:
        The constructed ``Module``.
    """
    m = Module('stencil')

    addrwidth = int(math.log(n, 2))

    clk = m.Input('CLK')
    rst = m.Input('RST')

    start = m.Input('start')
    busy = m.OutputReg('busy', initval=0)

    done = m.TmpReg(initval=0)

    # external RAM I/F
    ext_src_rams = [ram.RAMSlaveInterface(m, 'ext_src_ram%d' % i,
                                          datawidth=datawidth,
                                          addrwidth=addrwidth)
                    for i in range(size)]
    ext_dst_ram = ram.RAMSlaveInterface(m, 'ext_dst_ram',
                                        datawidth=datawidth,
                                        addrwidth=addrwidth)

    # RAM (internal RAMs use twice the external address width)
    addrwidth = int(math.log(n, 2)) * 2

    src_rams = [RAM(m, 'src_ram%d' % i, clk, rst,
                    datawidth=datawidth, addrwidth=addrwidth, numports=2)
                for i in range(size)]

    dst_ram = RAM(m, 'dst_ram', clk, rst,
                  datawidth=datawidth, addrwidth=addrwidth, numports=2)

    # connect RAM I/Fs: port 1 of every RAM goes to its external interface
    for src_ram, ext_src_ram in zip(src_rams, ext_src_rams):
        src_ram[1].connect(ext_src_ram)

    dst_ram[1].connect(ext_dst_ram)

    # read FSM
    read_fsm = FSM(m, 'read_fsm', clk, rst)
    read_count = m.Reg('read_count', 32, initval=0)
    read_addr = m.Reg('read_addr', 32, initval=0)

    read_fsm(
        read_addr(0),
        read_count(0),
        busy(0)
    )

    read_fsm.If(start)(
        busy(1)
    )

    read_fsm.Then().goto_next()

    read_fsm(
        read_addr.inc(),
        read_count.inc()
    )

    # Port 0 of every source RAM is read-only here and is read at
    # `read_addr` with the read FSM as the condition.
    idata = []
    ivalid = []
    for src_ram in src_rams:
        src_ram.disable_write(0)
        rdata, rvalid = src_ram.read_rtl(read_addr, port=0, cond=read_fsm)
        idata.append(rdata)
        ivalid.append(rvalid)

    read_fsm.If(read_count == n - 1)(
        read_addr(0),
        read_count(0)
    )

    read_fsm.Then().goto_next()

    read_fsm.If(done)(
        busy(0)
    )

    read_fsm.Then().goto_init()

    read_fsm.make_always()

    # instance
    odata = m.Wire('odata', datawidth)
    ovalid = m.Wire('ovalid')

    ports = []
    ports.append(('CLK', clk))
    ports.append(('RST', rst))

    for i, (d, v) in enumerate(zip(idata, ivalid)):
        ports.append(('idata%d' % i, d))
        ports.append(('ivalid%d' % i, v))

    ports.append(('odata', odata))
    ports.append(('ovalid', ovalid))

    coe = None
    if coe_test:
        coe = [[dataflow.Constant(1, point=point) for i in range(size)]
               for j in range(size)]
        point = 0

    # Forward `size` rather than a literal 3: the port list and `coe` above
    # are sized by `size`, so the pipeline has to be built to match.
    st = mkStencilPipeline2D(size=size, width=datawidth, point=point, coe=coe)
    m.Instance(st, 'inst_stencil', ports=ports)

    skip_offset = size // 2

    # write FSM
    write_fsm = FSM(m, 'write_fsm', clk, rst)
    write_count = m.Reg('write_count', 32, initval=0)
    write_addr = m.Reg('write_addr', 32, initval=skip_offset)

    write_fsm(
        done(0)
    )

    write_fsm.If(Ands(ovalid, write_count > skip_offset))(
        write_addr.inc()
    )

    # The RAM write is conditioned on `write_fsm.then`, read right after the
    # If(...) above was registered.
    dst_ram.write_rtl(write_addr, odata, port=0, cond=write_fsm.then)

    write_fsm.If(ovalid)(
        write_count.inc()
    )

    write_fsm.If(write_count == n)(
        write_count(0),
        write_addr(skip_offset),
        done(1)
    )
    write_fsm.Then().goto_init()

    write_fsm.make_always()

    return m