Beispiel #1
0
def mkTest():
    """Build the simulation testbench module for the design from mkMain().

    The testbench copies the target's parameters and simulation ports,
    connects an AXI memory model to the 'myaxi' ports, instantiates the
    target as 'uut', and configures waveform dumping, the clock, the reset
    and a 'finish' system task issued after a fixed delay.

    Returns:
        The assembled 'test' Module.
    """
    testbench = Module('test')
    target = mkMain()

    # Mirror the target's parameters and ports on the testbench.
    testbench.copy_params(target)
    sim_ports = testbench.copy_sim_ports(target)
    clock, reset = sim_ports['CLK'], sim_ports['RST']

    # AXI memory model connected to the 'myaxi' ports.
    memory_model = axi.AxiMemoryModel(testbench, 'memory', clock, reset)
    memory_model.connect(sim_ports, 'myaxi')

    # Unit under test: parameter bindings are built first, then port bindings.
    param_bindings = testbench.connect_params(target)
    port_bindings = testbench.connect_ports(target)
    instance = testbench.Instance(target, 'uut',
                                  params=param_bindings,
                                  ports=port_bindings)

    simulation.setup_waveform(testbench, instance, testbench.get_vars())
    simulation.setup_clock(testbench, clock, hperiod=5)
    reset_block = simulation.setup_reset(testbench, reset,
                                         testbench.make_reset(), period=100)

    # Terminate the simulation after a fixed delay.
    reset_block.add(Delay(100000), Systask('finish'))

    return testbench
def mkTest(memimg_name=None):
    """Build the simulation testbench module for the design from mkLed().

    Args:
        memimg_name: Forwarded to axi.AxiMemoryModel as ``memimg_name``.

    Returns:
        The assembled 'test' Module, with the target instantiated as 'uut',
        an AXI memory model connected to the 'myaxi' ports, and clock, reset
        and a delayed 'finish' system task configured.
    """
    testbench = Module('test')
    target = mkLed()

    # Mirror the target's parameters and ports on the testbench.
    testbench.copy_params(target)
    sim_ports = testbench.copy_sim_ports(target)
    clock, reset = sim_ports['CLK'], sim_ports['RST']

    # AXI memory model connected to the 'myaxi' ports.
    memory_model = axi.AxiMemoryModel(testbench, 'memory', clock, reset,
                                      memimg_name=memimg_name)
    memory_model.connect(sim_ports, 'myaxi')

    # Unit under test: parameter bindings are built first, then port bindings.
    param_bindings = testbench.connect_params(target)
    port_bindings = testbench.connect_ports(target)
    testbench.Instance(target, 'uut',
                       params=param_bindings,
                       ports=port_bindings)

    # Waveform dumping (simulation.setup_waveform) is intentionally not enabled.
    simulation.setup_clock(testbench, clock, hperiod=5)
    reset_block = simulation.setup_reset(testbench, reset,
                                         testbench.make_reset(), period=100)

    # Terminate the simulation after a fixed delay.
    reset_block.add(Delay(1000000), Systask('finish'))

    return testbench
def mkTest(memimg_name=None):
    """Build the 'test' testbench Module around the design from mkLed().

    The testbench connects an AXI memory model to the 'myaxi' ports and an
    AXI-lite master ('_saxi') to the 'saxi' ports, and runs a control thread
    that writes 1 to address 0, polls address 4 until it reads non-zero, and
    then reports PASSED/FAILED from the value read at address 8.

    Args:
        memimg_name: Forwarded to axi.AxiMemoryModel as ``memimg_name``.

    Returns:
        The assembled 'test' Module.
    """
    m = Module('test')

    # target instance
    led = mkLed()

    # copy params and ports
    params = m.copy_params(led)
    ports = m.copy_sim_ports(led)

    clk = ports['CLK']
    rst = ports['RST']

    # AXI memory model connected to the 'myaxi' ports
    memory = axi.AxiMemoryModel(m, 'memory', clk, rst, memimg_name=memimg_name)
    memory.connect(ports, 'myaxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # Control sequence handed to vthread.Thread below.
    def ctrl():
        # empty loop; presumably an initial wait before the first access
        for i in range(100):
            pass

        # write 1 to address 0 (presumably the start register -- confirm
        # against the register map of the design)
        awaddr = 0
        _saxi.write(awaddr, 1)

        # poll address 4 until a non-zero value is read
        araddr = 4
        v = _saxi.read(araddr)
        while v == 0:
            v = _saxi.read(araddr)

        # the value at address 8 decides the verdict: non-zero means PASSED
        araddr = 8
        v = _saxi.read(araddr)
        if v:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    # unit under test
    uut = m.Instance(led, 'uut',
                     params=m.connect_params(led),
                     ports=m.connect_ports(led))

    #simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, rst, m.make_reset(), period=100)

    # fallback termination: issue 'finish' after a fixed delay
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    return m
Beispiel #4
0
def mkTest(memimg_name=None):
    """Build the 'test' testbench Module around the design from mkLed().

    The testbench connects an AXI memory model to the 'myaxi' ports and an
    AXI-lite master ('maxi') to the 'saxi' ports. A control thread writes
    three configuration values (channel, width, height) to word addresses
    2, 3 and 4, writes 1 to word address 0, polls word address 1 until it
    reads non-zero, and then finishes the simulation.

    Args:
        memimg_name: Forwarded to axi.AxiMemoryModel as ``memimg_name``.

    Returns:
        The assembled 'test' Module.
    """
    m = Module('test')

    # target instance
    led = mkLed()

    # copy params and ports
    params = m.copy_params(led)
    ports = m.copy_sim_ports(led)

    clk = ports['CLK']
    rst = ports['RST']

    # AXI memory model on 'myaxi'; AXI-lite master on 'saxi'
    memory = axi.AxiMemoryModel(m, 'memory', clk, rst, memimg_name=memimg_name)
    memory.connect(ports, 'myaxi')
    maxi = vthread.AXIMLite(m, 'maxi', clk, rst, noio=True)
    maxi.connect(ports, 'saxi')

    # Control sequence handed to vthread.Thread below.
    def ctrl():
        channel, width, height = [4, 4, 4]

        # addresses are byte addresses of 4-byte words (index * 4)
        awaddr = 2 * 4
        maxi.write(awaddr, channel)

        awaddr = 3 * 4
        maxi.write(awaddr, width)

        awaddr = 4 * 4
        maxi.write(awaddr, height)

        # write 1 to word 0 (presumably the start register -- confirm
        # against the register map of the design)
        awaddr = 0 * 4
        maxi.write(awaddr, 1)

        # poll word 1 until a non-zero value is read
        araddr = 1 * 4
        v = maxi.read(araddr)
        while v == 0:
            v = maxi.read(araddr)
        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    # unit under test
    uut = m.Instance(led,
                     'uut',
                     params=m.connect_params(led),
                     ports=m.connect_ports(led))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, rst, m.make_reset(), period=100)

    # fallback termination: issue 'finish' after a fixed delay
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    return m
def mkTest(memimg_name=None, axi_datawidth=32, datawidth=4, addrwidth=10):
    """Build the testbench module for mkLed() with a pre-filled memory image.

    The memory image is written through axi.set_memory starting at address 0
    with the repeating pattern 1 .. 2**(datawidth - 1), using elements of
    ``datawidth`` bits.

    Args:
        memimg_name: Forwarded to axi.AxiMemoryModel as ``memimg_name``.
        axi_datawidth: Forwarded to mkLed().
        datawidth: Forwarded to mkLed(); also the element width of the
            pattern stored in the memory image.
        addrwidth: Forwarded to mkLed().

    Returns:
        The assembled 'test' Module.
    """
    testbench = Module('test')
    target = mkLed(axi_datawidth, datawidth, addrwidth)

    # Mirror the target's parameters and ports on the testbench.
    testbench.copy_params(target)
    sim_ports = testbench.copy_sim_ports(target)
    clock, reset = sim_ports['CLK'], sim_ports['RST']

    # Memory image: 1 MiB worth of 32-bit words, initially zero.
    image_width = 32
    word_count = 1024 * 1024 // (image_width // 8)
    image = np.zeros([word_count], dtype=np.int64)

    # Pattern values cycle through 1 .. 2**(datawidth - 1).
    modulus = 2 ** (datawidth - 1)
    pattern = np.arange(word_count, dtype=np.int64) % [modulus] + [1]
    axi.set_memory(image, pattern, image_width, datawidth, 0, None)

    # AXI memory model backed by the image, connected to the 'myaxi' ports.
    memory_model = axi.AxiMemoryModel(testbench, 'memory', clock, reset,
                                      memimg=image,
                                      memimg_name=memimg_name,
                                      memimg_datawidth=image_width)
    memory_model.connect(sim_ports, 'myaxi')

    # Unit under test: parameter bindings are built first, then port bindings.
    param_bindings = testbench.connect_params(target)
    port_bindings = testbench.connect_ports(target)
    testbench.Instance(target, 'uut',
                       params=param_bindings,
                       ports=port_bindings)

    # Waveform dumping (simulation.setup_waveform) is intentionally not enabled.
    simulation.setup_clock(testbench, clock, hperiod=5)
    reset_block = simulation.setup_reset(testbench, reset,
                                         testbench.make_reset(), period=100)

    # Terminate the simulation after a fixed delay.
    reset_block.add(Delay(1000000), Systask('finish'))

    return testbench
def mkTest(memimg_name=None):
    """Build the testbench module for an mkLed() design with active-low reset.

    The target exposes 'RESETN'; an inverted wire named 'RST' is derived from
    it and used as the reset of the AXI memory model, while the reset
    generator drives 'RESETN' with polarity 'low'.

    Args:
        memimg_name: Forwarded to axi.AxiMemoryModel as ``memimg_name``.

    Returns:
        The assembled 'test' Module.
    """
    testbench = Module('test')
    target = mkLed()

    # Mirror the target's parameters and ports on the testbench.
    testbench.copy_params(target)
    sim_ports = testbench.copy_sim_ports(target)
    clock = sim_ports['CLK']
    reset_n = sim_ports['RESETN']

    # Active-high view of the active-low reset.
    active_high_reset = testbench.Wire('RST')
    active_high_reset.assign(Not(reset_n))

    # AXI memory model connected to the 'myaxi' ports.
    memory_model = axi.AxiMemoryModel(testbench, 'memory', clock,
                                      active_high_reset,
                                      memimg_name=memimg_name)
    memory_model.connect(sim_ports, 'myaxi')

    # Unit under test: parameter bindings are built first, then port bindings.
    param_bindings = testbench.connect_params(target)
    port_bindings = testbench.connect_ports(target)
    testbench.Instance(target, 'uut',
                       params=param_bindings,
                       ports=port_bindings)

    # Waveform dumping (simulation.setup_waveform) is intentionally not enabled.
    simulation.setup_clock(testbench, clock, hperiod=5)
    reset_block = simulation.setup_reset(testbench, reset_n,
                                         testbench.make_reset(),
                                         period=100, polarity='low')

    # Terminate the simulation after a fixed delay.
    reset_block.add(Delay(1000000), Systask('finish'))

    return testbench
Beispiel #7
0
def run(act_shape=(1, 7, 7, 3),
        act_dtype=ng.int32,
        ksize=2, stride=2, padding=0,
        par=1,
        chunk_size=64,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Run the ONNX average-pooling flow end to end and simulate the result.

    A PyTorch ``AvgPool2d`` model is exported to ONNX, imported into NNgen,
    quantized, evaluated in software, converted to a Veriloggen module, and
    simulated inside a generated testbench that compares the hardware output
    in memory against the software result.

    Args:
        act_shape: Shape of the input activation; the dummy input for the
            ONNX export is created from it and its axes 1 and 3 are swapped.
        act_dtype: NNgen dtype used for the activation and the output.
        ksize: Pooling kernel size passed to ``nn.AvgPool2d``.
        stride: Pooling stride passed to ``nn.AvgPool2d``.
        padding: Pooling padding passed to ``nn.AvgPool2d``.
        par: ``par`` attribute applied to every ``ng.avg_pool`` operator.
        chunk_size: Alignment used for the memory layout and passed to
            ``ng.to_veriloggen`` as ``chunk_size``.
        axi_datawidth: Passed to ``ng.to_veriloggen`` as ``maxi_datawidth``
            and to the AXI memory model as ``datawidth``.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the testbench is written there as Verilog.
        simtype: Simulator name passed to ``simulation.Simulator``. If None,
            the process exits via ``sys.exit()`` before simulation.
        outputfile: Simulator output file name; defaults to this script's
            base name with an '.out' suffix.

    Returns:
        The simulator output as a string. For ``simtype == 'verilator'`` a
        trailing line starting with '-' is removed.
    """

    # pytorch model
    layers = []
    layers.append(nn.AvgPool2d(ksize, stride=stride, padding=padding))

    model = nn.Sequential(*layers)

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_avg_pool.onnx'
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    torch.onnx.export(model, dummy_input, onnx_filename,
                      input_names=input_names, output_names=output_names)

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    value_dtypes = {'act': act_dtype,
                    'out': act_dtype}

    (outputs, placeholders, variables,
     constants, operators) = ng.from_onnx(onnx_filename,
                                          value_dtypes=value_dtypes,
                                          default_placeholder_dtype=act_dtype,
                                          default_variable_dtype=ng.int32,
                                          default_constant_dtype=ng.int32,
                                          default_operator_dtype=act_dtype,
                                          default_scale_dtype=ng.int32,
                                          default_bias_dtype=ng.int32,
                                          disable_fusion=False)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    input_scale_factors = {'act': 1.0}

    ng.quantize(outputs, input_scale_factors)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    for op in operators.values():
        if isinstance(op, ng.avg_pool):
            op.attribute(par=par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    act = placeholders['act']
    out = outputs['out']

    # verification data: a repeating ramp, with a smaller range for narrow dtypes
    if act_dtype.width > 4:
        vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [11] + [1]
    else:
        vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [5] + [1]

    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    # software-based verification
    model_input = vact.astype(np.float32)
    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    if act.perm is not None:
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    # Error metrics are computed for inspection only (e.g. from a debugger);
    # they do not influence the result of this function.
    mean_square_error = np.sum((vout - scaled_model_out) ** 2) / vout.size
    corrcoef = np.corrcoef(model_out.reshape([-1]), vout.reshape([-1]))

    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    targ = ng.to_veriloggen([out], 'onnx_matrix_avg_pool', silent=silent,
                            config={'maxi_datawidth': axi_datawidth,
                                    'chunk_size': chunk_size})

    # --------------------
    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    if simtype is None:
        sys.exit()

    # to memory image
    param_data = ng.export_ndarray([out])
    param_bytes = len(param_data)

    # Memory layout, each region aligned up to a multiple of chunk_size:
    # activation | parameters | expected output | temporary area
    variable_addr = int(math.ceil((act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil((variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil((check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)], dtype=np.int64)
    # fill every word with 100 so untouched locations are not zero
    mem = mem + [100]

    # placeholder
    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), par))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth,
                   8, variable_addr)

    # verification data
    axi.set_memory(mem, vout, memimg_datawidth,
                   act_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # active low -> active high
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # Control sequence handed to vthread.Thread below; kept unchanged.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify: compare each output element with the expected value,
        # indexing both regions with the aligned output shape
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch,
                            out.addr, act_dtype.width)
                        check = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch,
                            check_addr, act_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch,
                                  ') orig: ', orig, ' check: ', check)
                            ok = False
                        # else:
                        #    print('OK (', bat, y, x, ch,
                        #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # fallback termination: issue 'finish' after a fixed delay
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Drop a trailing line starting with '-' from Verilator output. The
    # emptiness check avoids an IndexError when the simulator printed nothing.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Beispiel #8
0
def mkTest(memimg_name=None):
    """Build the testbench Module for the matrix design returned by mkLed().

    Two square input matrices are placed in the memory image: ``a`` is
    diagonal with entries 1, 2, 3, ... and ``b`` is diagonal with every entry
    2. A control thread programs the design over AXI-lite, waits for
    completion, and checks that the result at ``c_offset`` is diagonal with
    entries ``(y + 1) * 2``.

    Relies on names defined outside this function: ``matrix_size``,
    ``datawidth``, ``axi_datawidth``, ``a_offset``, ``b_offset``,
    ``c_offset``.

    Args:
        memimg_name: Forwarded to axi.AxiMemoryModel as ``memimg_name``.

    Returns:
        The assembled 'test' Module.
    """

    a_shape = (matrix_size, matrix_size)
    b_shape = (matrix_size, matrix_size)
    c_shape = (a_shape[0], b_shape[0])

    n_raw_a = axi.shape_to_length(a_shape)
    n_raw_b = axi.shape_to_length(b_shape)

    n_a = axi.shape_to_memory_size(a_shape, datawidth)
    n_b = axi.shape_to_memory_size(b_shape, datawidth)

    a = np.zeros(a_shape, dtype=np.int64)
    b = np.zeros(b_shape, dtype=np.int64)

    # a: diagonal matrix whose diagonal entries count up from 1
    value = 1
    for y in range(a_shape[0]):
        for x in range(a_shape[1]):
            if x == y:
                a[y][x] = value
                value += 1
            else:
                a[y][x] = 0

    # b: diagonal matrix with 2 on the diagonal
    for y in range(b_shape[0]):
        for x in range(b_shape[1]):
            if x == y:
                b[y][x] = 2
            else:
                b[y][x] = 0

    a_addr = a_offset
    size_a = n_a * datawidth // 8
    b_addr = b_offset
    size_b = n_b * datawidth // 8

    # memory image holding both input matrices
    mem = np.zeros([1024 * 1024 * 8 // axi_datawidth], dtype=np.int64)
    axi.set_memory(mem, a, axi_datawidth, datawidth, a_addr)
    axi.set_memory(mem, b, axi_datawidth, datawidth, b_addr)

    # target instance
    led = mkLed()

    m = Module('test')
    params = m.copy_params(led)
    ports = m.copy_sim_ports(led)
    clk = ports['CLK']
    rst = ports['RST']

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                mem_datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name)

    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # Timer
    counter = m.Reg('counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        counter.inc()
    )

    # Control sequence handed to vthread.Thread below.
    def ctrl():
        # empty loop; presumably an initial wait before the first access
        for i in range(100):
            pass

        # configuration registers at addresses 4, 8, 12 and 16
        awaddr = 4
        print('# matrix_size = %d' % matrix_size)
        _saxi.write(awaddr, matrix_size)

        awaddr = 8
        print('# a_offset = %d' % a_offset)
        _saxi.write(awaddr, a_offset)

        awaddr = 12
        print('# b_offset = %d' % b_offset)
        _saxi.write(awaddr, b_offset)

        awaddr = 16
        print('# c_offset = %d' % c_offset)
        _saxi.write(awaddr, c_offset)

        # write 1 to address 0 (presumably the start register -- confirm
        # against the register map of the design) and record the timer
        awaddr = 0
        start_time = counter
        print('# start time = %d' % start_time)
        _saxi.write(awaddr, 1)

        # poll address 20 until a non-zero value is read
        araddr = 20
        v = _saxi.read(araddr)
        while v == 0:
            v = _saxi.read(araddr)

        end_time = counter
        print('# end time = %d' % end_time)
        time = end_time - start_time
        print('# exec time = %d' % time)

        # verify: diagonal entries must equal (y + 1) * 2, all others 0
        all_ok = True
        for y in range(matrix_size):
            for x in range(matrix_size):
                v = memory.read(
                    c_offset + (y * matrix_size + x) * datawidth // 8)
                if y == x and vthread.verilog.NotEql(v, (y + 1) * 2):
                    all_ok = False
                    print("NG [%d,%d] = %d" % (y, x, v))
                if y != x and vthread.verilog.NotEql(v, 0):
                    all_ok = False
                    print("NG [%d,%d] = %d" % (y, x, v))

        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    # unit under test
    uut = m.Instance(led, 'uut',
                     params=m.connect_params(led),
                     ports=m.connect_ports(led))

    simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, rst, m.make_reset(), period=100)

    # fallback termination: issue 'finish' after a fixed delay
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    return m
def mkTest():
    """Build the 'test' testbench Module around the design from mkMemcpy().

    The target is embedded with ``Submodule`` as 'uut'. An AXI memory model
    is connected to its 'maxi' ports and an AXI-lite master to its 'saxi'
    ports. A control thread writes the copy size and the source and
    destination offsets, writes 1 to address 0, polls address 16 until it
    reads non-zero, and prints the elapsed timer count.

    Returns:
        The assembled 'test' Module.
    """
    m = Module('test')

    copy_bytes = 1024 * 4

    # target instance
    memcpy = mkMemcpy()

    uut = Submodule(m, memcpy, name='uut')
    clk = uut['CLK']
    rst = uut['RST']

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst)
    memory.connect(uut.get_inst_ports(), 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(uut.get_inst_ports(), 'saxi')

    # Timer
    counter = m.Reg('counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(counter.inc())

    # Control sequence handed to vthread.Thread below. It does not call
    # vthread.finish(); the simulation ends through the delayed 'finish'
    # system task added to the reset block further down.
    def ctrl():
        # empty loop; presumably an initial wait before the first access
        for i in range(100):
            pass

        # addresses are byte addresses of 4-byte words (4 * index)
        awaddr = 4 * 1
        print('# copy_bytes = %d' % copy_bytes)
        _saxi.write(awaddr, copy_bytes)

        awaddr = 4 * 2
        src_offset = 0
        print('# src_offset = %d' % src_offset)
        _saxi.write(awaddr, src_offset)

        awaddr = 4 * 3
        dst_offset = 1024 * 8
        print('# dst_offset = %d' % dst_offset)
        _saxi.write(awaddr, dst_offset)

        # write 1 to word 0 (presumably the start register -- confirm
        # against the register map of the design) and record the timer
        awaddr = 4 * 0
        start_time = counter
        print('# start time = %d' % start_time)
        _saxi.write(awaddr, 1)

        # poll word 4 until a non-zero value is read
        araddr = 4 * 4
        v = _saxi.read(araddr)
        while v == 0:
            v = _saxi.read(araddr)

        end_time = counter
        print('# end time = %d' % end_time)
        time = end_time - start_time
        print('# exec time = %d' % time)

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, rst, m.make_reset(), period=100)

    # terminate the simulation after a fixed delay
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    return m
def run(act_shape=(1, 7, 7, 3),
        weight0_shape=(9, 3, 3, 3),
        weight1_shape=(9, 3, 3, 9),
        act_dtype=ng.int32,
        weight_dtype=ng.int32,
        out_dtype=ng.int32,
        stride0=1,
        stride1=1,
        padding0=0,
        padding1=0,
        with_batchnorm0=False,
        with_batchnorm1=False,
        act_func0='relu',
        act_func1='relu',
        disable_fusion=False,
        par_ich=1,
        par_och=1,
        par_col=1,
        par_row=1,
        concur_och=None,
        stationary='filter',
        chunk_size=64,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):
    """Run the ONNX conv2d -> conv2d flow end to end and simulate the result.

    A two-layer PyTorch convolution model (each layer optionally followed by
    batch normalization and an activation) is exported to ONNX, imported into
    NNgen, quantized, converted to a Veriloggen module, and simulated inside
    a generated testbench that compares the hardware output in memory against
    the NNgen software evaluation.

    Args:
        act_shape: Shape of the input activation; the dummy input for the
            ONNX export is created from it and its axes 1 and 3 are swapped.
        weight0_shape: First layer weights; index 3 is used as the input
            channel count, index 0 as the output channel count and index 1
            as the kernel size.
        weight1_shape: Same as ``weight0_shape`` for the second layer.
        act_dtype: NNgen dtype of the activation and the 'out' value.
        weight_dtype: NNgen dtype of weights, variables and constants.
        out_dtype: NNgen default operator dtype; also the element width
            used when storing and reading the output.
        stride0: Stride of the first convolution.
        stride1: Stride of the second convolution.
        padding0: Padding of the first convolution.
        padding1: Padding of the second convolution.
        with_batchnorm0: Append ``BatchNorm2d`` after the first convolution.
        with_batchnorm1: Append ``BatchNorm2d`` after the second convolution.
        act_func0: 'relu' or 'leaky_relu' after the first layer; any other
            value adds no activation.
        act_func1: Same as ``act_func0`` for the second layer.
        disable_fusion: Forwarded to ``ng.from_onnx``.
        par_ich: ``par_ich`` attribute applied to every ``ng.conv2d``.
        par_och: ``par_och`` attribute applied to every ``ng.conv2d``.
        par_col: ``par_col`` attribute applied to every ``ng.conv2d``.
        par_row: ``par_row`` attribute applied to every ``ng.conv2d``.
        concur_och: ``concur_och`` attribute applied to every ``ng.conv2d``.
        stationary: Accepted for interface compatibility; not used in this
            function.
        chunk_size: Alignment used for the memory layout and passed to
            ``ng.to_veriloggen`` as ``chunk_size``.
        axi_datawidth: Passed to ``ng.to_veriloggen`` as ``maxi_datawidth``
            and to the AXI memory model as ``datawidth``.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the testbench is written there as Verilog.
        simtype: Simulator name passed to ``simulation.Simulator``.
        outputfile: Simulator output file name; defaults to this script's
            base name with an '.out' suffix.

    Returns:
        The simulator output as a string. For ``simtype == 'verilator'`` a
        trailing line starting with '-' is removed.
    """

    # model definition
    layers = []
    layers.append(
        nn.Conv2d(weight0_shape[3],
                  weight0_shape[0],
                  weight0_shape[1],
                  stride=stride0,
                  padding=padding0))

    if with_batchnorm0:
        layers.append(nn.BatchNorm2d(weight0_shape[0]))

    if act_func0 == 'relu':
        layers.append(nn.ReLU(inplace=True))
    elif act_func0 == 'leaky_relu':
        layers.append(nn.LeakyReLU(inplace=True))

    layers.append(
        nn.Conv2d(weight1_shape[3],
                  weight1_shape[0],
                  weight1_shape[1],
                  stride=stride1,
                  padding=padding1))

    if with_batchnorm1:
        layers.append(nn.BatchNorm2d(weight1_shape[0]))

    if act_func1 == 'relu':
        layers.append(nn.ReLU(inplace=True))
    elif act_func1 == 'leaky_relu':
        layers.append(nn.LeakyReLU(inplace=True))

    model = nn.Sequential(*layers)

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_conv2d_conv2d.onnx'
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    model.eval()
    torch.onnx.export(model,
                      dummy_input,
                      onnx_filename,
                      input_names=input_names,
                      output_names=output_names)

    # ONNX to NNgen
    value_dtypes = {
        'act': act_dtype,
        '0.weight': weight_dtype,
        '1.weight': weight_dtype,
        'out': act_dtype
    }

    (outputs, placeholders, variables, constants,
     operators) = ng.from_onnx(onnx_filename,
                               value_dtypes=value_dtypes,
                               default_placeholder_dtype=act_dtype,
                               default_variable_dtype=weight_dtype,
                               default_constant_dtype=weight_dtype,
                               default_operator_dtype=out_dtype,
                               default_scale_dtype=ng.int32,
                               default_bias_dtype=ng.int32,
                               disable_fusion=disable_fusion)

    # default linear quantization
    if act_dtype.width >= 8:
        value_ranges = {'act': (-120, 120)}
    else:
        value_ranges = {
            'act': (-(2**(act_dtype.width - 1)), (2**(act_dtype.width - 1)))
        }

    ng.quantize(outputs, value_ranges=value_ranges)

    # set attribute
    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=par_ich,
                         par_och=par_och,
                         par_row=par_row,
                         par_col=par_col,
                         concur_och=concur_och)

    # create target hardware
    act = placeholders['act']
    out = outputs['out']

    targ = ng.to_veriloggen([out],
                            'onnx_matrix_conv2d_conv2d',
                            silent=silent,
                            config={
                                'maxi_datawidth': axi_datawidth,
                                'chunk_size': chunk_size
                            })

    # verification data
    # if act_dtype.width > 4:
    #    vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [11] + [1]
    # else:
    #    vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [5] + [1]

    #vact = np.ones(act.shape)
    # Random normal input clipped to [-3, 3], then scaled so that 3.0 maps to
    # the largest positive value of a signed integer of vact_width bits.
    vact = np.random.normal(size=act.length).reshape(act.shape)
    vact = np.clip(vact, -3.0, 3.0)
    vact_min_val, vact_max_val = value_ranges['act']
    vact_max_abs_range = max(abs(vact_min_val), abs(vact_max_val))
    vact_width = vact_max_abs_range.bit_length() + 1
    vact = vact * (1.0 * (2**(vact_width - 1) - 1)) / 3.0
    vact = np.round(vact).astype(np.int64)

    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    # exec on pytorch
    model_input = vact.astype(np.float32)
    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    if act.perm is not None:
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    # Relative error against the PyTorch result; the small constant avoids a
    # division by exactly zero. Computed for inspection only (the check below
    # is disabled).
    out_diff = vout - scaled_model_out
    out_err = out_diff / (scaled_model_out + 0.00000001)
    max_out_err = np.max(np.abs(out_err))

    # if max_out_err > 0.1:
    #    raise ValueError("too large output error: %f > 0.1" % max_out_err)

    # to memory image
    param_data = ng.make_param_array(variables, constants, chunk_size)
    param_bytes = len(param_data)

    # Memory layout, each region aligned up to a multiple of chunk_size:
    # activation | parameters | expected output | temporary area
    variable_addr = int(math.ceil(
        (act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil(
        (variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil(
        (check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    # fill every word with 100 so untouched locations are not zero
    mem = mem + [100]

    # placeholder
    axi.set_memory(
        mem, vact, memimg_datawidth, act_dtype.width, act.addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(
        mem, vout, memimg_datawidth, out_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / out_dtype.width)), par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # active low -> active high
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Control sequence handed to vthread.Thread below; kept unchanged.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify: compare each output element with the expected value,
        # indexing both regions with the aligned output shape
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] *
                            out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch, out.addr,
                            out_dtype.width)
                        check = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] *
                            out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch, check_addr,
                            out_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch, ') orig: ', orig,
                                  ' check: ', check)
                            ok = False
                        # else:
                        #    print('OK (', bat, y, x, ch,
                        #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # fallback termination: issue 'finish' after a fixed delay
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Drop a trailing line starting with '-' from Verilator output. The
    # emptiness check avoids an IndexError when the simulator printed nothing.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Beispiel #11
0
def run(
    act_dtype=ng.int8,
    weight_dtype=ng.int8,
    bias_dtype=ng.int32,
    scale_dtype=ng.int8,
    par_ich=2,
    par_och=2,
    chunk_size=64,
    axi_datawidth=32,
    silent=False,
    weight_filename='cnn.npy',
    verilog_filename=None,
    sim_filename=None,
    # simtype='iverilog',
    simtype='verilator',
    # simtype=None,  # no RTL simulation
):
    """Build a four-layer CNN with NNgen and check it in RTL simulation.

    The function defines the dataflow (two conv2d layers followed by two
    matmul layers), assigns random weights, quantizes them, evaluates the
    model in software, generates the hardware through ``ng.to_ipxact``,
    saves the exported parameter image with ``np.save`` and finally runs a
    Veriloggen testbench that compares the hardware output word by word
    with the software result.

    Args:
        act_dtype: NNgen dtype of the input and of every layer output.
        weight_dtype: NNgen dtype of the weight variables.
        bias_dtype: NNgen dtype of the bias variables.
        scale_dtype: NNgen dtype of the scale variables.
        par_ich: ``par_ich`` attribute assigned to each conv2d/matmul.
        par_och: ``par_och`` attribute assigned to each conv2d/matmul; also
            used as ``par`` of the max-pool operator.
        chunk_size: Alignment (in bytes) of the exported parameter image and
            of the testbench address layout. It is also forwarded to the
            generator as ``offchipram_chunk_bytes`` so both sides agree.
        axi_datawidth: Passed as ``maxi_datawidth`` to the generator and as
            ``datawidth`` to the AXI memory model.
        silent: Forwarded to ``ng.to_ipxact``.
        weight_filename: File the exported parameter array is saved to.
        verilog_filename: If not None, the testbench is written to this file
            via ``m.to_verilog``.
        sim_filename: Output file handed to the simulator. When None, a name
            derived from this source file's base name plus ``.out`` is used.
        simtype: Simulator name given to ``simulation.Simulator``. None
            terminates the process after the weights were saved.

    Returns:
        The text returned by the simulator. For ``simtype='verilator'`` a
        trailing line starting with ``-`` is stripped.

    Raises:
        SystemExit: If ``simtype`` is None (``sys.exit()`` is called).
    """

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # input
    input_layer = ng.placeholder(
        dtype=act_dtype,
        shape=(1, 32, 32, 3),  # N, H, W, C
        name='input_layer')

    # layer 0: conv2d (with bias and scale (= batchnorm)), relu, max_pool
    w0 = ng.variable(
        dtype=weight_dtype,
        shape=(64, 3, 3, 3),  # Och, Ky, Kx, Ich
        name='w0')
    b0 = ng.variable(dtype=bias_dtype, shape=(w0.shape[0], ), name='b0')
    s0 = ng.variable(dtype=scale_dtype, shape=(w0.shape[0], ), name='s0')

    a0 = ng.conv2d(input_layer,
                   w0,
                   strides=(1, 1, 1, 1),
                   bias=b0,
                   scale=s0,
                   act_func=ng.relu,
                   dtype=act_dtype,
                   sum_dtype=ng.int32)

    a0p = ng.max_pool_serial(a0, ksize=(1, 2, 2, 1), strides=(1, 2, 2, 1))

    # layer 1: conv2d, relu, reshape
    w1 = ng.variable(weight_dtype, shape=(64, 3, 3, a0.shape[-1]), name='w1')
    b1 = ng.variable(bias_dtype, shape=(w1.shape[0], ), name='b1')
    s1 = ng.variable(scale_dtype, shape=(w1.shape[0], ), name='s1')

    a1 = ng.conv2d(a0p,
                   w1,
                   strides=(1, 1, 1, 1),
                   bias=b1,
                   scale=s1,
                   act_func=ng.relu,
                   dtype=act_dtype,
                   sum_dtype=ng.int32)

    a1r = ng.reshape(a1, [1, -1])

    # layer 2: full-connection, relu
    w2 = ng.variable(weight_dtype, shape=(256, a1r.shape[-1]), name='w2')
    b2 = ng.variable(bias_dtype, shape=(w2.shape[0], ), name='b2')
    s2 = ng.variable(scale_dtype, shape=(w2.shape[0], ), name='s2')

    a2 = ng.matmul(a1r,
                   w2,
                   bias=b2,
                   scale=s2,
                   transposed_b=True,
                   act_func=ng.relu,
                   dtype=act_dtype,
                   sum_dtype=ng.int32)

    # layer 3: full-connection (no activation function)
    w3 = ng.variable(weight_dtype, shape=(10, a2.shape[-1]), name='w3')
    b3 = ng.variable(bias_dtype, shape=(w3.shape[0], ), name='b3')
    s3 = ng.variable(scale_dtype, shape=(w3.shape[0], ), name='s3')

    # output
    output_layer = ng.matmul(a2,
                             w3,
                             bias=b3,
                             scale=s3,
                             transposed_b=True,
                             name='output_layer',
                             dtype=act_dtype,
                             sum_dtype=ng.int32)

    # --------------------
    # (2) Assign weights to the NNgen operators
    # --------------------

    # In this example, random floating-point values are assigned.
    # In a real case, you should assign actual weight values
    # obtained by training on a DNN framework.

    # If you don't use NNgen's quantizer, you can assign integer weights
    # to each tensor.

    def set_random_value(var):
        # Normally distributed values, limited to [-3, 3].
        value = np.random.normal(size=var.length).reshape(var.shape)
        var.set_value(np.clip(value, -3.0, 3.0))

    # The order (weight, bias per layer) keeps the random stream identical
    # to assigning every tensor one after another.
    for weight, bias, scale in ((w0, b0, s0), (w1, b1, s1),
                                (w2, b2, s2), (w3, b3, s3)):
        set_random_value(weight)
        set_random_value(bias)
        scale.set_value(np.ones(scale.shape))

    # Quantizing the floating-point weights by the NNgen quantizer.
    # Alternatively, you can assign integer weights by yourself to each tensor.

    imagenet_mean = np.array([0.485, 0.456, 0.406]).astype(np.float32)
    imagenet_std = np.array([0.229, 0.224, 0.225]).astype(np.float32)

    if act_dtype.width > 8:
        act_scale_factor = 128
    else:
        act_scale_factor = int(round(2**(act_dtype.width - 1) * 0.5))

    input_scale_factors = {'input_layer': act_scale_factor}
    input_means = {'input_layer': imagenet_mean * act_scale_factor}
    input_stds = {'input_layer': imagenet_std * act_scale_factor}

    ng.quantize([output_layer], input_scale_factors, input_means, input_stds)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    # conv2d, matmul
    # par_ich: parallelism in input-channel
    # par_och: parallelism in output-channel
    # par_col: parallelism in pixel column
    # par_row: parallelism in pixel row

    for op in (a0, a1, a2, output_layer):
        op.attribute(par_ich=par_ich, par_och=par_och)

    # cshamt_out: right shift amount after applying bias/scale
    # If you assign integer weights by yourself to each tensor,
    # cshamt (constant shift amount) must be assigned to each operator.

    # a0.attribute(cshamt_out=weight_dtype.width + 1)
    # a1.attribute(cshamt_out=weight_dtype.width + 1)
    # a2.attribute(cshamt_out=weight_dtype.width + 1)
    # output_layer.attribute(cshamt_out=weight_dtype.width + 1)

    # max_pool
    # par: parallelism in in/out channel

    par = par_och

    a0p.attribute(par=par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    # In this example, random integer values are assigned.
    # In real case, you should assign actual integer activation values, such as an image.

    input_layer_value = np.random.normal(size=input_layer.length).reshape(
        input_layer.shape)
    input_layer_value = input_layer_value * imagenet_std + imagenet_mean
    input_layer_value = np.clip(input_layer_value, -5.0, 5.0)
    input_layer_value = input_layer_value * act_scale_factor
    # Saturate to the signed range of act_dtype: [-2^(w-1), 2^(w-1) - 1].
    # Bounds one step wider on either side would not fit into the dtype.
    act_min = -(2**(act_dtype.width - 1))
    act_max = 2**(act_dtype.width - 1) - 1
    input_layer_value = np.clip(input_layer_value, act_min, act_max)
    input_layer_value = np.round(input_layer_value).astype(np.int64)

    eval_outs = ng.eval([output_layer], input_layer=input_layer_value)
    output_layer_value = eval_outs[0]

    # print(output_layer_value)
    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    # chunk_size is handed to the generator as well, so the parameter layout
    # of the hardware matches the exported image and the addresses below.
    hw_config = {'maxi_datawidth': axi_datawidth,
                 'offchipram_chunk_bytes': chunk_size}

    # to Veriloggen object
    # targ = ng.to_veriloggen([output_layer], 'cnn', silent=silent,
    #                        config=hw_config)

    # to IP-XACT (the method returns Veriloggen object, as well as to_veriloggen)
    targ = ng.to_ipxact([output_layer],
                        'cnn',
                        silent=silent,
                        config=hw_config)

    # to Verilog HDL RTL (the method returns a source code text)
    # rtl = ng.to_verilog([output_layer], 'cnn', silent=silent,
    #                    config=hw_config)

    # --------------------
    # (6) Save the quantized weights
    # --------------------

    # convert weight values to a memory image:
    # on a real FPGA platform, this image will be used as a part of the model definition.

    param_data = ng.export_ndarray([output_layer], chunk_size)
    np.save(weight_filename, param_data)

    # --------------------
    # (7) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    if simtype is None:
        sys.exit()

    param_bytes = len(param_data)

    def align_up(nbytes):
        # Round up to the next multiple of chunk_size.
        return int(math.ceil(nbytes / chunk_size)) * chunk_size

    # Layout: input placeholder | parameters | expected output | scratch
    variable_addr = align_up(input_layer.addr + input_layer.memory_size)
    check_addr = align_up(variable_addr + param_bytes)
    tmp_addr = align_up(check_addr + output_layer.memory_size)

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 256 // (memimg_datawidth // 8)],
                   dtype=np.int64)
    # Non-zero background value for every word of the image.
    mem = mem + [100]

    # placeholder
    axi.set_memory(
        mem, input_layer_value, memimg_datawidth, act_dtype.width,
        input_layer.addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(
        mem, output_layer_value, memimg_datawidth, act_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_och))

    # test controller
    m = Module('test')
    m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # The target exposes an active-low reset; the testbench logic uses RST.
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if sim_filename is None:
        sim_filename = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + sim_filename

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(output_layer.shape[0]):
            for x in range(output_layer.shape[1]):
                orig = memory.read_word(
                    bat * output_layer.aligned_shape[1] + x, output_layer.addr,
                    act_dtype.width)
                check = memory.read_word(
                    bat * output_layer.aligned_shape[1] + x, check_addr,
                    act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', bat, x, ') orig: ', orig, ' check: ', check)
                    ok = False
                else:
                    print('OK (', bat, x, ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # Safety net: stop the simulation even if the controller never finishes.
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if verilog_filename is not None:
        m.to_verilog(verilog_filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=sim_filename)
    lines = rslt.splitlines()
    # Drop the trailing '-' line when running under verilator; empty output
    # is passed through unchanged.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Beispiel #12
0
def mkTest():
    """Create the 'test' simulation module for the ``mkLed`` matrix design.

    Before the testbench is assembled, a memory image file ('mymem.out') is
    written with one byte per line in two-digit hex. It contains three
    matrix_size x matrix_size matrices of 32-bit words (least-significant
    byte first), each followed by zero bytes up to the end of its region:

    * ram_a (up to byte 1024): 1, 2, 3, ... on the diagonal, 0 elsewhere
    * ram_b (up to byte 2048): 2 on the diagonal, 0 elsewhere
    * ram_c (up to byte 2**20): 100 everywhere

    Returns:
        The assembled Veriloggen ``Module`` named 'test'.
    """
    m = Module('test')

    matrix_size = 16

    # design under test
    led = mkLed(matrix_size)

    # mirror the design's parameters and ports inside the testbench
    params = m.copy_params(led)
    ports = m.copy_sim_ports(led)
    clk, rst = ports['CLK'], ports['RST']

    # memory image
    memname = 'mymem.out'
    matrix_bytes = 4 * matrix_size * matrix_size

    def fwrite(f, value):
        # Emit the low four bytes of the word, lowest byte first.
        digits = '%08x' % value
        for pos in (6, 4, 2, 0):
            f.write('%s\n' % digits[pos:pos + 2])

    def write_matrix(f, element):
        # Walk the matrix and store element(x, y) as one word per entry.
        for x in range(matrix_size):
            for y in range(matrix_size):
                fwrite(f, element(x, y))

    def write_zero_bytes(f, count):
        # A non-positive count writes nothing.
        for _ in range(count):
            f.write('%s\n' % '00')

    # (first byte of region, first byte after region, element generator)
    regions = (
        (0, 1024, lambda x, y: x + 1 if x == y else 0),  # ram_a
        (1024, 2048, lambda x, y: 2 if x == y else 0),  # ram_b
        (2048, 2**20, lambda x, y: 100),  # ram_c
    )

    with open(memname, 'w') as f:
        for base, limit, element in regions:
            write_matrix(f, element)
            write_zero_bytes(f, limit - (base + matrix_bytes))

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst, memimg=memname)
    memory.connect(ports, 'myaxi')

    uut = m.Instance(
        led,
        'uut',
        params=m.connect_params(led),
        ports=m.connect_ports(led),
    )

    simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, rst, m.make_reset(), period=100)

    # stop the simulation after a fixed delay
    init.add(Delay(1000000), Systask('finish'))

    return m
def run(act_shape=(1, 7, 7, 15),
        weight1_shape=(7, 3, 3, 15), bias1_shape=None, scale1_shape=None,
        weight2_shape=(9, 3, 3, 7), bias2_shape=None, scale2_shape=None,
        act_dtype=ng.int32,
        weight1_dtype=ng.int32, bias1_dtype=ng.int32, scale1_dtype=ng.int32,
        weight2_dtype=ng.int32, bias2_dtype=ng.int32, scale2_dtype=ng.int32,
        tmp_dtype=ng.int32,
        out_dtype=ng.int32,
        stride1=(1, 1, 1, 1), stride2=(1, 1, 1, 1),
        rshift_mul1=None, rshift_sum1=None, rshift_out1=None,
        rshift_mul2=None, rshift_sum2=None, rshift_out2=None,
        act_func1=None, act_func2=None,
        par_ich1=1, par_och1=1, par_col1=1, par_row1=1,
        concur_och1=None, stationary1='filter',
        par_ich2=1, par_och2=1, par_col2=1, par_row2=1,
        concur_och2=None, stationary2='filter',
        input_ram_size1=None, filter_ram_size1=None,
        bias_ram_size1=None, scale_ram_size1=None,
        out_ram_size1=None,
        input_ram_size2=None, filter_ram_size2=None,
        bias_ram_size2=None, scale_ram_size2=None,
        out_ram_size2=None,
        chunk_size=64,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Generate two chained conv2d operators and verify them in simulation.

    The weights, biases and scales are NNgen variables whose memory images
    are placed by this testbench at custom addresses starting at
    ``variable_addr``. The expected result is computed with
    ``ng.verify.conv2d`` and compared word by word with the output the
    generated hardware writes to the AXI memory model.

    Args:
        act_shape: Shape of the input placeholder.
        weight1_shape, weight2_shape: Shapes of the two filter variables.
        bias1_shape, bias2_shape: Shapes of the bias variables, or None to
            build the corresponding conv2d without bias.
        scale1_shape, scale2_shape: Shapes of the scale variables, or None
            to build the corresponding conv2d without scale.
        act_dtype, weight*_dtype, bias*_dtype, scale*_dtype: NNgen dtypes of
            the respective tensors.
        tmp_dtype: dtype of the first conv2d's output.
        out_dtype: dtype of the second conv2d's output.
        stride1, stride2: Strides of the two conv2d operators.
        rshift_mul*, rshift_sum*, rshift_out*, act_func*, par_ich*, par_och*,
        par_col*, par_row*, concur_och*, stationary*, input_ram_size*,
        filter_ram_size*, bias_ram_size*, scale_ram_size*, out_ram_size*:
            Forwarded positionally to ``ng.conv2d`` / ``ng.verify.conv2d``
            for layer 1 and layer 2 respectively.
        chunk_size: Address alignment in bytes; also passed to the generator
            as ``offchipram_chunk_bytes``.
        axi_datawidth: Passed as ``maxi_datawidth`` and as the memory
            model's ``datawidth``.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the testbench is written there via
            ``m.to_verilog``.
        simtype: Simulator name for ``simulation.Simulator``.
        outputfile: Simulator output file. When None, this source file's
            base name plus ``.out`` is used.

    Returns:
        The text returned by the simulator. For ``simtype='verilator'`` a
        trailing line starting with ``-`` is stripped.
    """

    # create target hardware
    act = ng.placeholder(act_dtype, shape=act_shape, name='act')

    weight1 = ng.variable(weight1_dtype, shape=weight1_shape,
                          name='weight1')

    if bias1_shape is not None:
        bias1 = ng.variable(bias1_dtype, bias1_shape, name='bias1')
    else:
        bias1 = None

    if scale1_shape is not None:
        scale1 = ng.variable(scale1_dtype, scale1_shape, name='scale1')
    else:
        scale1 = None

    weight2 = ng.variable(weight2_dtype, shape=weight2_shape,
                          name='weight2')

    if bias2_shape is not None:
        bias2 = ng.variable(bias2_dtype, bias2_shape, name='bias2')
    else:
        bias2 = None

    if scale2_shape is not None:
        scale2 = ng.variable(scale2_dtype, scale2_shape, name='scale2')
    else:
        scale2 = None

    tmp = ng.conv2d(act, weight1, stride1,
                    bias1, scale1,
                    rshift_mul1, rshift_sum1, rshift_out1,
                    act_func1, 'SAME',
                    tmp_dtype, ng.int32, ng.int32,
                    'conv2d_1',
                    par_ich1, par_och1, par_col1, par_row1,
                    concur_och1, stationary1,
                    input_ram_size1, filter_ram_size1,
                    bias_ram_size1, scale_ram_size1,
                    None, None, None,
                    out_ram_size1)

    out = ng.conv2d(tmp, weight2, stride2,
                    bias2, scale2,
                    rshift_mul2, rshift_sum2, rshift_out2,
                    act_func2, 'SAME',
                    out_dtype, ng.int32, ng.int32,
                    'conv2d_2',
                    par_ich2, par_och2, par_col2, par_row2,
                    concur_och2, stationary2,
                    input_ram_size2, filter_ram_size2,
                    bias_ram_size2, scale_ram_size2,
                    None, None, None,
                    out_ram_size2)

    targ = ng.to_veriloggen([out], 'matrix_conv2d_conv2d_variable', silent=silent,
                            config={'maxi_datawidth': axi_datawidth,
                                    'offchipram_chunk_bytes': chunk_size})

    # verification data
    vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [16]

    vweight1 = np.arange(weight1.length,
                         dtype=np.int64).reshape(weight1_shape) % [32] - [16]

    if bias1 is not None:
        vbias1 = np.arange(bias1.length,
                           dtype=np.int64).reshape(bias1.shape) % [16]
    else:
        vbias1 = None

    if scale1 is not None:
        vscale1 = np.arange(scale1.length,
                            dtype=np.int64).reshape(scale1.shape) % [8]
    else:
        vscale1 = None

    vweight2 = np.arange(weight2.length,
                         dtype=np.int64).reshape(weight2_shape) % [32] - [16]

    if bias2 is not None:
        vbias2 = np.arange(bias2.length,
                           dtype=np.int64).reshape(bias2.shape) % [16]
    else:
        vbias2 = None

    if scale2 is not None:
        vscale2 = np.arange(scale2.length,
                            dtype=np.int64).reshape(scale2.shape) % [8]
    else:
        vscale2 = None

    vtmp = ng.verify.conv2d(vact, vweight1, stride1,
                            vbias1, vscale1,
                            rshift_mul1, rshift_sum1, rshift_out1,
                            act_func1, 'SAME',
                            tmp_dtype, ng.int32, ng.int32,
                            'conv2d_1',
                            par_ich1, par_och1, par_col1, par_row1,
                            concur_och1, stationary1,
                            input_ram_size1, filter_ram_size1,
                            bias_ram_size1, scale_ram_size1,
                            None, None, None,
                            out_ram_size1,
                            False,
                            act_dtype, weight1_dtype)

    vout = ng.verify.conv2d(vtmp, vweight2, stride2,
                            vbias2, vscale2,
                            rshift_mul2, rshift_sum2, rshift_out2,
                            act_func2, 'SAME',
                            out_dtype, ng.int32, ng.int32,
                            'conv2d_2',
                            par_ich2, par_och2, par_col2, par_row2,
                            concur_och2, stationary2,
                            input_ram_size2, filter_ram_size2,
                            bias_ram_size2, scale_ram_size2,
                            None, None, None,
                            out_ram_size2,
                            False,
                            tmp_dtype, weight2_dtype)

    # to memory image
    def align_up(nbytes):
        # Round a byte count up to the next multiple of chunk_size.
        return int(math.ceil(nbytes / chunk_size)) * chunk_size

    def aligned_size(tensor):
        # Space a tensor occupies in the layout; absent tensors take none.
        return align_up(tensor.memory_size) if tensor is not None else 0

    size_max = align_up(max(act.memory_size, weight1.memory_size,
                            bias1.memory_size if bias1 is not None else 0,
                            scale1.memory_size if scale1 is not None else 0,
                            weight2.memory_size,
                            bias2.memory_size if bias2 is not None else 0,
                            scale2.memory_size if scale2 is not None else 0,
                            out.memory_size))

    # assign custom addresses
    variable_addr = max(act.addr, out.addr) + size_max

    # Variables are laid out back to back in the order
    # weight1, bias1, scale1, weight2, bias2, scale2. Each address is the
    # previous address plus the previous tensor's aligned size, so a missing
    # bias/scale simply contributes zero bytes and never makes a later
    # tensor fall back onto the start of an earlier one.
    weight1_addr = variable_addr
    bias1_addr = weight1_addr + aligned_size(weight1)
    scale1_addr = bias1_addr + aligned_size(bias1)

    weight2_addr = scale1_addr + aligned_size(scale1)
    bias2_addr = weight2_addr + aligned_size(weight2)
    scale2_addr = bias2_addr + aligned_size(bias2)

    # size_max is at least as large as scale2's aligned size, so the
    # expected-output region starts behind the last variable.
    check_addr = scale2_addr + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    # Non-zero background value for every word of the image.
    mem = mem + [100]

    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich1))

    axi.set_memory(mem, vweight1, memimg_datawidth,
                   weight1_dtype.width, weight1_addr,
                   max(int(math.ceil(axi_datawidth / weight1_dtype.width)), par_ich1))
    if bias1_shape is not None:
        axi.set_memory(mem, vbias1, memimg_datawidth,
                       bias1_dtype.width, bias1_addr,
                       max(int(math.ceil(axi_datawidth / bias1_dtype.width)), par_och1))
    if scale1_shape is not None:
        axi.set_memory(mem, vscale1, memimg_datawidth,
                       scale1_dtype.width, scale1_addr,
                       max(int(math.ceil(axi_datawidth / scale1_dtype.width)), par_och1))

    axi.set_memory(mem, vweight2, memimg_datawidth,
                   weight2_dtype.width, weight2_addr,
                   max(int(math.ceil(axi_datawidth / weight2_dtype.width)), par_ich2))
    if bias2_shape is not None:
        axi.set_memory(mem, vbias2, memimg_datawidth,
                       bias2_dtype.width, bias2_addr,
                       max(int(math.ceil(axi_datawidth / bias2_dtype.width)), par_och2))
    if scale2_shape is not None:
        axi.set_memory(mem, vscale2, memimg_datawidth,
                       scale2_dtype.width, scale2_addr,
                       max(int(math.ceil(axi_datawidth / scale2_dtype.width)), par_och2))

    axi.set_memory(mem, vout, memimg_datawidth,
                   out_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / out_dtype.width)), par_och2))

    # test controller
    m = Module('test')
    m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # The target exposes an active-low reset; the testbench logic uses RST.
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    def ctrl():
        for i in range(100):
            pass

        # set custom addresses
        ng.sim.set_global_addrs(_saxi, tmp_addr, out.addr, act.addr, variable_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3]
                            + y * out.aligned_shape[2] * out.aligned_shape[3]
                            + x * out.aligned_shape[3] + ch,
                            out.addr, out_dtype.width)
                        check = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3]
                            + y * out.aligned_shape[2] * out.aligned_shape[3]
                            + x * out.aligned_shape[3] + ch,
                            check_addr, out_dtype.width)
                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch,
                                  ') orig: ', orig, ' check: ', check)
                            ok = False
                        # else:
                        #    print('OK (', bat, y, x, ch,
                        #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # Safety net: stop the simulation even if the controller never finishes.
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Drop the trailing '-' line when running under verilator; empty output
    # is passed through unchanged.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Beispiel #14
0
def run(act_shape=(1, 7, 7, 15),
        act_dtype=ng.int32,
        out_dtype=ng.int32,
        factors=(1, 2, 2, 1),
        par=1,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):
    """Generate an upsampling2d operator and verify it in RTL simulation.

    The expected result is computed with ``ng.verify.upsampling2d`` from an
    ``np.arange`` input and stored in the memory image next to the input.
    After the generated hardware has run, the testbench compares every
    output word with the expected one.

    Args:
        act_shape: Shape of the input placeholder.
        act_dtype: NNgen dtype of the input.
        out_dtype: NNgen dtype of the output.
        factors: ``factors`` argument of ``ng.upsampling2d``.
        par: ``par`` argument of ``ng.upsampling2d``; also a lower bound of
            the last argument given to ``axi.set_memory``.
        axi_datawidth: Passed as ``maxi_datawidth`` and as the memory
            model's ``datawidth``.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the testbench is written there via
            ``m.to_verilog``.
        simtype: Simulator name for ``simulation.Simulator``.
        outputfile: Simulator output file. When None, this source file's
            base name plus ``.out`` is used.

    Returns:
        The text returned by the simulator. For ``simtype='verilator'`` a
        trailing line starting with ``-`` is stripped.
    """

    # create target hardware
    act = ng.placeholder(act_dtype, shape=act_shape, name='act')
    out = ng.upsampling2d(act, factors=factors, dtype=out_dtype, par=par)

    targ = ng.to_veriloggen([out],
                            'matrix_upsampling2d',
                            silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    vact = np.arange(act.length, dtype=np.int64).reshape(act.shape)

    vout = ng.verify.upsampling2d(vact, factors=factors, dtype=out_dtype)

    # to memory image
    # Region size: the larger of input/output, rounded up to 4096 bytes.
    size_max = int(math.ceil(
        max(act.memory_size, out.memory_size) / 4096)) * 4096
    # Expected output is placed behind both tensors, scratch space after it.
    check_addr = max(act.addr, out.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    # Non-zero background value for every word of the image.
    mem = mem + [100]

    axi.set_memory(mem, vact, memimg_datawidth, act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), par))
    axi.set_memory(mem, vout, memimg_datawidth, out_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / out_dtype.width)), par))

    # test controller
    m = Module('test')
    m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # The target exposes an active-low reset; the testbench logic uses RST.
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] *
                            out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch, out.addr,
                            out_dtype.width)
                        check = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] *
                            out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch, check_addr,
                            out_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch, ') orig: ', orig,
                                  ' check: ', check)
                            ok = False
                        # else:
                        #    print('OK (', bat, y, x, ch,
                        #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # Safety net: stop the simulation even if the controller never finishes.
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Drop the trailing '-' line when running under verilator; empty output
    # is passed through unchanged.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Beispiel #15
0
def run(a_shape=(15, 15),
        b_shape=(15, 15),
        bias_shape=None,
        scale_shape=None,
        a_dtype=ng.int32,
        b_dtype=ng.int32,
        bias_dtype=ng.int32,
        scale_dtype=ng.int32,
        c_dtype=ng.int32,
        rshift_mul=None,
        rshift_sum=None,
        rshift_out=None,
        act_func=None,
        par_left_col=1,
        par_left_row=1,
        par_out_col=1,
        concur_out_col=None,
        stationary='right',
        left_ram_size=None,
        right_ram_size=None,
        bias_ram_size=None,
        scale_ram_size=None,
        out_ram_size=None,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):
    """Build an NNgen matmul circuit, simulate it and return the log.

    The function creates placeholders for ``a`` and ``b`` (and optionally
    ``bias`` / ``scale``), generates the hardware with ``ng.to_veriloggen``,
    computes the expected result in software with ``ng.verify.matmul``,
    places inputs and expected output in a memory image, and runs a
    testbench that compares the hardware output with the expected output
    word by word.

    Args:
        a_shape, b_shape: Shapes of the two input placeholders.
        bias_shape, scale_shape: Shapes of the optional bias / scale
            placeholders; ``None`` disables the corresponding input.
        a_dtype, b_dtype, bias_dtype, scale_dtype, c_dtype: NNgen dtypes of
            the inputs and of the output.
        rshift_mul, rshift_sum, rshift_out, act_func, par_left_col,
        par_left_row, par_out_col, concur_out_col, stationary,
        left_ram_size, right_ram_size, bias_ram_size, scale_ram_size,
        out_ram_size: Forwarded unchanged to ``ng.matmul`` and
            ``ng.verify.matmul``.
        axi_datawidth: Used as ``maxi_datawidth`` of the generated design
            and as the data width of the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not ``None``, the testbench is written there with
            ``m.to_verilog``.
        simtype: Simulator name given to ``simulation.Simulator``.
        outputfile: Passed to ``sim.run``; when ``None`` it defaults to the
            script's base name with a ``.out`` suffix. The memory image
            name is derived from it.

    Returns:
        The simulator output text. With ``simtype == 'verilator'`` the last
        line is dropped when it starts with ``'-'``.
    """

    # create target hardware
    a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')

    if bias_shape is not None:
        bias = ng.placeholder(bias_dtype, bias_shape, name='bias')
    else:
        bias = None

    if scale_shape is not None:
        scale = ng.placeholder(scale_dtype, scale_shape, name='scale')
    else:
        scale = None

    transposed_a = False
    transposed_b = True

    c = ng.matmul(a, b, bias, scale, transposed_a, transposed_b, rshift_mul,
                  rshift_sum, rshift_out, act_func, c_dtype, ng.int32,
                  ng.int32, 'matmul', par_left_col, par_left_row, par_out_col,
                  concur_out_col, stationary, left_ram_size, right_ram_size,
                  bias_ram_size, scale_ram_size, None, None, None,
                  out_ram_size)

    targ = ng.to_veriloggen([c],
                            'matrix_matmul',
                            silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [5]
    vb = np.arange(b.length, dtype=np.int64).reshape(b.shape) % [5] - [3]

    if bias is not None:
        vbias = np.arange(bias.length, dtype=np.int64).reshape(
            bias.shape) % [4]
    else:
        vbias = None

    if scale is not None:
        vscale = np.arange(scale.length, dtype=np.int64).reshape(
            scale.shape) % [6]
    else:
        vscale = None

    # The software reference works on numeric arrays, so it must receive the
    # generated vbias / vscale values rather than the placeholder objects.
    vc = ng.verify.matmul(va, vb, vbias, vscale, False, True, rshift_mul,
                          rshift_sum, rshift_out, act_func, c_dtype, ng.int32,
                          ng.int32, 'matmul', par_left_col, par_left_row,
                          par_out_col, concur_out_col, stationary,
                          left_ram_size, right_ram_size, bias_ram_size,
                          scale_ram_size, None, None, None, out_ram_size,
                          False, a_dtype, b_dtype, bias_dtype, scale_dtype)

    # to memory image
    # size_max: largest tensor footprint rounded up to a multiple of 4096
    size_max = int(
        math.ceil(
            max(a.memory_size, b.memory_size,
                bias.memory_size if bias is not None else 0, scale.memory_size
                if scale is not None else 0, c.memory_size) / 4096)) * 4096
    # the expected output is stored size_max past the highest tensor address
    check_addr = max(a.addr, b.addr, bias.addr if bias is not None else -1,
                     scale.addr if scale is not None else -1,
                     c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    # every word starts out as 100 instead of 0
    mem = mem + [100]

    axi.set_memory(
        mem, va, memimg_datawidth, a_dtype.width, a.addr,
        max(int(math.ceil(axi_datawidth / a_dtype.width)), par_left_col))

    axi.set_memory(
        mem, vb, memimg_datawidth, b_dtype.width, b.addr,
        max(int(math.ceil(axi_datawidth / b_dtype.width)), par_left_col))

    if bias is not None:
        axi.set_memory(
            mem, vbias, memimg_datawidth, bias_dtype.width, bias.addr,
            max(int(math.ceil(axi_datawidth / bias_dtype.width)), par_out_col))

    if scale is not None:
        axi.set_memory(
            mem, vscale, memimg_datawidth, scale_dtype.width, scale.addr,
            max(int(math.ceil(axi_datawidth / scale_dtype.width)),
                par_out_col))

    axi.set_memory(
        mem, vc, memimg_datawidth, c_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / c_dtype.width)), par_out_col))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # the design exposes an active-low RESETN; derive an active-high RST
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    def ctrl():
        # idle iterations before touching the bus
        # (presumably to let reset finish -- TODO confirm)
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        # compare each output word at c.addr with the expected word stored
        # at check_addr; rows are addressed with the aligned row length
        ok = True
        for i in range(c.shape[0]):
            for j in range(c.shape[1]):
                orig = memory.read_word(i * c.aligned_shape[1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print(i, j, orig, check)
                    ok = False

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # hard stop in case the controller thread never calls finish
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # drop a trailing line beginning with '-' from verilator output
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def mkTest(memname='mymem.out'):
    """Build the testbench module for the design returned by ``mkLed``.

    A byte-per-line hexadecimal memory image is written to ``memname`` and
    handed to the AXI memory model. It contains three 16x16 word matrices
    starting at byte offsets 0, 1024 and 2048, followed by zero bytes up to
    offset 2**20. A vthread controller programs the design through the
    AXI-Lite port, starts it, polls for completion and prints timing.

    Args:
        memname: Path of the memory image file to create.

    Returns:
        The constructed testbench ``Module``.
    """
    m = Module('test')

    matrix_size = 16

    # device under test
    dut = mkLed()

    # mirror the parameters and ports of the DUT on the testbench
    params = m.copy_params(dut)
    ports = m.copy_sim_ports(dut)

    clk, rst = ports['CLK'], ports['RST']

    def emit_word(fp, word):
        # one byte per line, lowest two hex digits first
        digits = '%08x' % word
        for lo in (6, 4, 2, 0):
            fp.write('%s\n' % digits[lo:lo + 2])

    def emit_padding(fp, count):
        # 'count' zero bytes; nothing is written when count <= 0
        for _ in range(count):
            fp.write('%s\n' % '00')

    # every (row, column) position of a matrix_size x matrix_size matrix
    cells = [(row, col)
             for row in range(matrix_size)
             for col in range(matrix_size)]

    with open(memname, 'w') as f:
        # ram_a: diagonal carries 1, 2, 3, ...; everything else is 0
        addr = 0
        diag = 1
        for row, col in cells:
            addr += 4
            if row == col:
                emit_word(f, diag)
                diag += 1
            else:
                emit_word(f, 0)
        emit_padding(f, 1024 - addr)

        # ram_b: 2 on the diagonal, 0 elsewhere
        addr = 1024
        for row, col in cells:
            addr += 4
            emit_word(f, 2 if row == col else 0)
        emit_padding(f, 2048 - addr)

        # ram_c: every word is 100
        addr = 2048
        for _ in cells:
            addr += 4
            emit_word(f, 100)
        emit_padding(f, 2**20 - addr)

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst, memimg=memname)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # Timer: free-running 32-bit counter read by the controller
    counter = m.Reg('counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(counter.inc())

    def ctrl():
        for i in range(100):
            pass

        awaddr = 4
        matrix_size = 16
        print('# matrix_size = %d' % matrix_size)
        _saxi.write(awaddr, matrix_size)

        awaddr = 8
        a_offset = 0
        print('# a_offset = %d' % a_offset)
        _saxi.write(awaddr, a_offset)

        awaddr = 12
        b_offset = 1024 * 1
        print('# b_offset = %d' % b_offset)
        _saxi.write(awaddr, b_offset)

        awaddr = 16
        c_offset = 1024 * 2
        print('# c_offset = %d' % c_offset)
        _saxi.write(awaddr, c_offset)

        awaddr = 0
        start_time = counter
        print('# start time = %d' % start_time)
        _saxi.write(awaddr, 1)

        araddr = 20
        v = _saxi.read(araddr)
        while v == 0:
            v = _saxi.read(araddr)

        end_time = counter
        print('# end time = %d' % end_time)
        time = end_time - start_time
        print('# exec time = %d' % time)

    fsm = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl).start()

    uut = m.Instance(dut,
                     'uut',
                     params=m.connect_params(dut),
                     ports=m.connect_ports(dut))

    simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, rst, m.make_reset(), period=100)

    # end the simulation after a fixed delay
    init.add(Delay(1000000), Systask('finish'))

    return m
Beispiel #17
0
def run(act_dtype=ng.int16, weight_dtype=ng.int16,
        bias_dtype=ng.int32, scale_dtype=ng.int16,
        with_batchnorm=False, disable_fusion=False,
        conv2d_par_ich=1, conv2d_par_och=1, conv2d_par_col=1, conv2d_par_row=1,
        conv2d_concur_och=None, conv2d_stationary='filter',
        pool_par=1, elem_par=1,
        chunk_size=64,
        axi_datawidth=32, silent=False,
        filename=None,
        simtype='iverilog',
        # simtype='verilator',
        # simtype=None,  # no RTL simulation
        outputfile=None):
    """Convert a torchvision VGG-11 to hardware via ONNX/NNgen and simulate it.

    Steps: build an untrained VGG-11 (optionally with batch norm) with a
    replaced classifier, export it to ``vgg11.onnx``, import it with
    ``ng.from_onnx``, quantize, assign parallelism attributes, evaluate the
    NNgen dataflow in software on random input, generate IP-XACT with
    ``ng.to_ipxact``, then simulate the generated design and compare its
    output with the software result.

    Args:
        act_dtype, weight_dtype, bias_dtype, scale_dtype: Default NNgen
            dtypes given to ``ng.from_onnx``. ``act_dtype`` is also the
            word width used for the input and for output comparison.
        with_batchnorm: Selects ``vgg11_bn`` instead of ``vgg11``.
        disable_fusion: Forwarded to ``ng.from_onnx``.
        conv2d_par_ich, conv2d_par_och, conv2d_par_col, conv2d_par_row,
        conv2d_concur_och, conv2d_stationary: Attributes applied to every
            ``ng.conv2d`` operator.
        pool_par: ``par`` attribute applied to pooling operators.
        elem_par: ``par`` attribute applied to element-wise operators.
        chunk_size: Given to ``ng.export_ndarray`` and used as the
            alignment of the parameter / check / temporary regions.
        axi_datawidth: ``maxi_datawidth`` of the design and data width of
            the AXI memory model.
        silent: Forwarded to ``ng.to_ipxact``.
        filename: If not ``None``, the testbench is written there with
            ``m.to_verilog``.
        simtype: Simulator name for ``simulation.Simulator``. If ``None``,
            the function calls ``sys.exit()`` after hardware generation.
        outputfile: Passed to ``sim.run``; defaults to the script's base
            name with a ``.out`` suffix.

    Returns:
        The simulator output text. With ``simtype == 'verilator'`` the last
        line is dropped when it starts with ``'-'``.
    """

    # input mean and standard deviation
    cifar10_mean = np.array([0.4914, 0.4822, 0.4465]).astype(np.float32)
    cifar10_std = np.array([0.247, 0.243, 0.261]).astype(np.float32)

    act_shape = (1, 32, 32, 3)

    # pytorch model
    if with_batchnorm:
        model = torchvision.models.vgg11_bn(pretrained=False)
    else:
        model = torchvision.models.vgg11(pretrained=False)

    model.features[0].in_channels = act_shape[-1]

    # the average pooling stage is replaced by a pass-through
    model.avgpool = nn.Identity()
    #model.classifier[0] = nn.Linear(512, 4096)
    #model.classifier[6] = nn.Linear(4096, 10)

    # replacement classifier: 512 -> 1024 -> 1024 -> 10
    model.classifier = nn.Sequential(
        nn.Linear(in_features=512, out_features=1024, bias=True),
        nn.ReLU(inplace=True),
        nn.Dropout(p=0.5),
        nn.Linear(in_features=1024, out_features=1024, bias=True),
        nn.ReLU(inplace=True),
        nn.Dropout(p=0.5),
        nn.Linear(in_features=1024, out_features=10, bias=True),
    )

    # Pytorch to ONNX
    onnx_filename = 'vgg11.onnx'
    # swapping axes 1 and 3 turns the (1, 32, 32, 3) tensor into (1, 3, 32, 32)
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    model.eval()
    torch.onnx.export(model, dummy_input, onnx_filename,
                      input_names=input_names, output_names=output_names)

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    dtypes = {}
    (outputs, placeholders, variables,
     constants, operators) = ng.from_onnx(onnx_filename,
                                          value_dtypes=dtypes,
                                          default_placeholder_dtype=act_dtype,
                                          default_variable_dtype=weight_dtype,
                                          default_constant_dtype=weight_dtype,
                                          default_operator_dtype=act_dtype,
                                          default_scale_dtype=scale_dtype,
                                          default_bias_dtype=bias_dtype,
                                          disable_fusion=disable_fusion)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    # fixed factor of 128 for types wider than 8 bits; otherwise half of
    # 2 ** (width - 1)
    if act_dtype.width > 8:
        act_scale_factor = 128
    else:
        act_scale_factor = int(round(2 ** (act_dtype.width - 1) * 0.5))

    input_scale_factors = {'act': act_scale_factor}
    input_means = {'act': cifar10_mean * act_scale_factor}
    input_stds = {'act': cifar10_std * act_scale_factor}

    ng.quantize(outputs, input_scale_factors, input_means, input_stds)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=conv2d_par_ich,
                         par_och=conv2d_par_och,
                         par_col=conv2d_par_col,
                         par_row=conv2d_par_row,
                         concur_och=conv2d_concur_och,
                         stationary=conv2d_stationary)

        if isinstance(op, (ng.avg_pool, ng.max_pool,
                           ng.avg_pool_serial, ng.max_pool_serial)):
            op.attribute(par=pool_par)

        if ng.is_elementwise_operator(op):
            op.attribute(par=elem_par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    act = placeholders['act']
    out = outputs['out']

    # verification data
    # random data
    img = np.random.uniform(size=act.length).astype(np.float32).reshape(act.shape)
    img = img * 12.0 * cifar10_std + cifar10_mean
    # img = np.random.normal(size=act.length).astype(np.float32).reshape(act.shape)
    # img = img * cifar10_std + cifar10_mean

    # execution on pytorch
    model_input = img

    # undo the placeholder's axis permutation before feeding PyTorch
    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    # re-apply the permutation only when the output has as many axes as the input
    if act.perm is not None and len(model_out.shape) == len(act.shape):
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    # software-based verification
    # scale, clip symmetrically to +/-(2 ** (width - 1) - 1), then round to integers
    vact = img * act_scale_factor
    vact = np.clip(vact,
                   -1.0 * (2 ** (act.dtype.width - 1) - 1),
                   1.0 * (2 ** (act.dtype.width - 1) - 1))
    vact = np.round(vact).astype(np.int64)

    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    labels = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

    # print up to ten (label, index, value) entries per batch, highest value
    # first, for the PyTorch output (mout) and the NNgen output (vout)
    mout = scaled_model_out
    for bat in range(mout.shape[0]):
        for index, value in list(sorted(enumerate(mout[bat]),
                                        key=lambda x: x[1], reverse=True))[:10]:
            print("# mout: %s (%d) = %f" % (str(labels[index]), index, value))
        for index, value in list(sorted(enumerate(vout[bat]),
                                        key=lambda x: x[1], reverse=True))[:10]:
            print("# vout: %s (%d) = %d" % (str(labels[index]), index, value))

    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    # to Veriloggen object
    # targ = ng.to_veriloggen([out], 'vgg11', silent=silent,
    #                        config={'maxi_datawidth': axi_datawidth})

    # to IP-XACT (the method returns Veriloggen object, as well as to_veriloggen)
    targ = ng.to_ipxact([out], 'onnx_vgg11', silent=silent,
                        config={'maxi_datawidth': axi_datawidth})

    # to Verilog HDL RTL (the method returns a source code text)
    # rtl = ng.to_verilog([out], 'vgg11', silent=silent,
    #                    config={'maxi_datawidth': axi_datawidth})

    # --------------------
    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    # NOTE: this terminates the interpreter rather than returning
    if simtype is None:
        sys.exit()

    # to memory image
    param_data = ng.export_ndarray([out], chunk_size)
    param_bytes = len(param_data)

    # layout: input | parameters | expected output | temporary area,
    # each region start rounded up to a multiple of chunk_size
    variable_addr = int(math.ceil((act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil((variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil((check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    # mem = np.zeros([1024 * 1024 * 256 // (memimg_datawidth // 8)], dtype=np.int64)
    mem = np.zeros([1024 * 1024 * 1024 // (memimg_datawidth // 8)], dtype=np.int16)
    # every word starts out as 100 instead of 0
    mem = mem + [100]

    # placeholder
    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth,
                   8, variable_addr)

    # verification data
    axi.set_memory(mem, vout, memimg_datawidth,
                   act_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # the design exposes an active-low RESETN; derive an active-high RST
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    def ctrl():
        # idle iterations before touching the bus
        # (presumably to let reset finish -- TODO confirm)
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        # compare each output word at out.addr with the expected word at check_addr
        ok = True
        for bat in range(out.shape[0]):
            for x in range(out.shape[1]):
                orig = memory.read_word(bat * out.aligned_shape[1] + x,
                                        out.addr, act_dtype.width)
                check = memory.read_word(bat * out.aligned_shape[1] + x,
                                         check_addr, act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', bat, x,
                          ') orig: ', orig, ' check: ', check)
                    ok = False
                # else:
                #    print('OK (', bat, x,
                #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # hard stop in case the controller thread never calls finish
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # drop a trailing line beginning with '-' from verilator output
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def mkTest():
    """Build the testbench module for the design returned by ``mkLed``.

    The testbench connects an AXI memory model to the design's ``myaxi``
    port and an AXI-Lite controller to its ``saxi`` port. A vthread
    controller first reads and rewrites 16 words of the memory model, then
    starts the design through the AXI-Lite port, polls until the word at
    address 4 is non-zero and reports the value read from address 8.

    Returns:
        The constructed testbench ``Module``.
    """
    m = Module('test')

    # target instance
    led = mkLed()

    # copy paras and ports
    params = m.copy_params(led)
    ports = m.copy_sim_ports(led)

    clk = ports['CLK']
    rst = ports['RST']

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst)
    memory.connect(ports, 'myaxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    def ctrl():
        # idle iterations before touching the bus
        # (presumably to let reset finish -- TODO confirm)
        for i in range(100):
            pass

        # add 1024 to each of the first 16 words of the memory model
        for i in range(16):
            # byte addressing
            v = memory.read(i * 4)
            print('read:  mem[%d] -> %x' % (i, v))
            v = v + 1024
            # byte addressing
            memory.write(i * 4, v)
            print('write: mem[%d] <- %x' % (i, v))

        # write 1 to address 0 of the AXI-Lite port
        awaddr = 0
        _saxi.write(awaddr, 1)

        # poll address 4 until it reads non-zero
        araddr = 4
        v = _saxi.read(araddr)
        while v == 0:
            v = _saxi.read(araddr)

        # the word at address 8 decides which message is printed
        araddr = 8
        v = _saxi.read(araddr)
        if v:
            print('SLAVE: ALL OK')
        else:
            print('SLAVE: NOT ALL OK')

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(led,
                     'uut',
                     params=m.connect_params(led),
                     ports=m.connect_ports(led))

    simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, rst, m.make_reset(), period=100)

    # end the simulation after a fixed delay; ctrl itself never calls finish
    init.add(
        Delay(100000),
        Systask('finish'),
    )

    return m
Beispiel #19
0
def mkTest(memimg_name=None):
    """Build the testbench module for a 16x16 matrix design from ``mkLed``.

    Two int32 matrices are prepared: ``a`` has 1, 2, 3, ... on its diagonal
    and ``b`` has 2 on its diagonal, with zeros elsewhere. Both are placed
    in a memory image at the module-level ``a_offset`` / ``b_offset``
    addresses and served to the design through an AXI memory model.

    Args:
        memimg_name: Forwarded to ``axi.AxiMemoryModel`` as ``memimg_name``.

    Returns:
        The constructed testbench ``Module``.
    """
    matrix_size = 16

    a_shape = (matrix_size, matrix_size)
    b_shape = (matrix_size, matrix_size)
    c_shape = (a_shape[0], b_shape[0])

    n_raw_a = axi.shape_to_length(a_shape)
    n_raw_b = axi.shape_to_length(b_shape)

    n_a = axi.shape_to_memory_size(a_shape, datawidth)
    n_b = axi.shape_to_memory_size(b_shape, datawidth)

    # both matrices start as zeros, so only the diagonals need writing
    a = np.zeros(a_shape, dtype=np.int32)
    b = np.zeros(b_shape, dtype=np.int32)

    # a: k-th diagonal entry holds k + 1
    for k in range(min(a_shape)):
        a[k][k] = k + 1

    # b: constant 2 along the diagonal
    for k in range(min(b_shape)):
        b[k][k] = 2

    a_addr = a_offset
    size_a = n_a * datawidth // 8
    b_addr = b_offset
    size_b = n_b * datawidth // 8

    # memory image with axi_datawidth-bit words
    mem = np.zeros([1024 * 1024 * 8 // axi_datawidth], dtype=np.int64)
    for matrix, base in ((a, a_addr), (b, b_addr)):
        axi.set_memory(mem, matrix, axi_datawidth, datawidth, base)

    dut = mkLed(matrix_size)

    m = Module('test')
    params = m.copy_params(dut)
    ports = m.copy_sim_ports(dut)
    clk, rst = ports['CLK'], ports['RST']

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                mem_datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name)
    memory.connect(ports, 'myaxi')

    uut = m.Instance(dut, 'uut',
                     params=m.connect_params(dut),
                     ports=m.connect_ports(dut))

    simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, rst, m.make_reset(), period=100)

    # end the simulation after a fixed delay
    init.add(Delay(1000000), Systask('finish'))

    return m
Beispiel #20
0
def run(a_shape=(7, 15),
        b_shape=(7, 15),
        a_dtype=ng.int32,
        b_dtype=ng.int32,
        c_dtype=ng.int32,
        par=1,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):
    """Convert the ``MatrixMul`` PyTorch model to hardware and simulate it.

    Steps: export the model to ``onnx_matrix_mul.onnx``, import it with
    ``ng.from_onnx``, quantize, evaluate the NNgen dataflow in software,
    generate the hardware with ``ng.to_veriloggen``, then simulate it and
    compare its output with the software result word by word.

    Args:
        a_shape, b_shape: Shapes of the dummy inputs used for ONNX export.
        a_dtype, b_dtype, c_dtype: NNgen dtypes assigned to the values
            named ``a``, ``b`` and ``c``.
        par: ``par`` attribute applied to ``ng.scaled_multiply`` operators;
            also used as a lower bound for the last ``axi.set_memory``
            argument.
        axi_datawidth: ``maxi_datawidth`` of the design and data width of
            the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not ``None``, the testbench is written there with
            ``m.to_verilog``.
        simtype: Simulator name for ``simulation.Simulator``. If ``None``,
            the function calls ``sys.exit()`` after hardware generation.
        outputfile: Passed to ``sim.run``; defaults to the script's base
            name with a ``.out`` suffix.

    Returns:
        The simulator output text. With ``simtype == 'verilator'`` the last
        line is dropped when it starts with ``'-'``.
    """

    # pytorch model
    model = MatrixMul()

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_mul.onnx'
    dummy_a = torch.randn(*a_shape)
    dummy_b = torch.randn(*b_shape)
    dummy_inputs = (dummy_a, dummy_b)
    input_names = ['a', 'b']
    output_names = ['c']
    model.eval()
    torch.onnx.export(model,
                      dummy_inputs,
                      onnx_filename,
                      input_names=input_names,
                      output_names=output_names)

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    value_dtypes = {'a': a_dtype, 'b': b_dtype, 'c': c_dtype}

    (outputs, placeholders, variables, constants,
     operators) = ng.from_onnx(onnx_filename,
                               value_dtypes=value_dtypes,
                               default_placeholder_dtype=ng.int32,
                               default_variable_dtype=ng.int32,
                               default_constant_dtype=ng.int32,
                               default_operator_dtype=ng.int32,
                               default_scale_dtype=ng.int32,
                               default_bias_dtype=ng.int32,
                               disable_fusion=False)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    input_scale_factors = {'a': 10.0, 'b': 15.0}

    ng.quantize(outputs, input_scale_factors)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    for op in operators.values():
        if isinstance(op, ng.scaled_multiply):
            op.attribute(par=par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    a = placeholders['a']
    b = placeholders['b']
    c = outputs['c']

    # verification data
    # deterministic patterns: a cycles through 0..16, b through 0..12
    input_a = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [17]
    input_b = (np.arange(b.length, dtype=np.int64).reshape(b.shape) +
               [100]) % [13]

    # execution on pytorch
    # undo each placeholder's axis permutation before feeding PyTorch
    model_a = input_a.astype(np.float32)
    if a.perm is not None:
        model_a = np.transpose(model_a, a.reversed_perm)

    model_b = input_b.astype(np.float32)
    if b.perm is not None:
        model_b = np.transpose(model_b, b.reversed_perm)

    model.eval()
    model_c = model(torch.from_numpy(model_a),
                    torch.from_numpy(model_b)).detach().numpy()
    if a.perm is not None:
        model_c = np.transpose(model_c, a.perm)
    scaled_model_c = model_c * c.scale_factor

    # software-based verification
    # scale, clip symmetrically to +/-(2 ** (width - 1) - 1), then round to integers
    va = input_a * input_scale_factors['a']
    va = np.clip(va, -1.0 * (2**(a.dtype.width - 1) - 1),
                 1.0 * (2**(a.dtype.width - 1) - 1))
    va = np.round(va).astype(np.int64)

    vb = input_b * input_scale_factors['b']
    vb = np.clip(vb, -1.0 * (2**(b.dtype.width - 1) - 1),
                 1.0 * (2**(b.dtype.width - 1) - 1))
    vb = np.round(vb).astype(np.int64)

    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # error metrics between NNgen and PyTorch results; computed but not used
    # further in this function
    mean_square_error = np.sum((vc - scaled_model_c)**2) / vc.size
    corrcoef = np.corrcoef(model_c.reshape([-1]), vc.reshape([-1]))

    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    targ = ng.to_veriloggen([c],
                            'onnx_matrix_mul',
                            silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # --------------------
    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    # NOTE: this terminates the interpreter rather than returning
    if simtype is None:
        sys.exit()

    # to memory image
    param_data = ng.export_ndarray([c])
    param_bytes = len(param_data)

    # layout: inputs | parameters | expected output | temporary area,
    # each region start rounded up to a multiple of 4096
    variable_addr = int(
        math.ceil(
            max(a.addr + a.memory_size, b.addr + b.memory_size) / 4096)) * 4096
    check_addr = int(math.ceil((variable_addr + param_bytes) / 4096)) * 4096
    tmp_addr = int(math.ceil((check_addr + c.memory_size) / 4096)) * 4096

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)],
                   dtype=np.int64)
    # every word starts out as 100 instead of 0
    mem = mem + [100]

    # placeholder
    axi.set_memory(mem, va, memimg_datawidth, a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth, b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(mem, vc, memimg_datawidth, c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # the design exposes an active-low RESETN; derive an active-high RST
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # product of all dimensions of c except the last one
    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    def ctrl():
        # idle iterations before touching the bus
        # (presumably to let reset finish -- TODO confirm)
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        # compare each output word at c.addr with the expected word at check_addr
        ok = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # hard stop in case the controller thread never calls finish
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # drop a trailing line beginning with '-' from verilator output
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Beispiel #21
0
# Testbench wiring fragment. It relies on names that are not defined in this
# excerpt: m, targ, outputfile, axi_datawidth, mem and memimg_datawidth.
ports = m.copy_sim_ports(targ)
clk = ports['CLK']
resetn = ports['RESETN']
# the design exposes an active-low RESETN; derive an active-high RST
rst = m.Wire('RST')
rst.assign(Not(resetn))

# AXI memory model
# default the output file to this script's base name with a '.out' suffix
if outputfile is None:
    outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

memimg_name = 'memimg_' + outputfile

memory = axi.AxiMemoryModel(m,
                            'memory',
                            clk,
                            rst,
                            datawidth=axi_datawidth,
                            memimg=mem,
                            memimg_name=memimg_name,
                            memimg_datawidth=memimg_datawidth)
memory.connect(ports, 'maxi')

# AXI-Slave controller
_saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
_saxi.connect(ports, 'saxi')

# timer: 32-bit register incremented by the 'seq' sequencer
time_counter = m.Reg('time_counter', 32, initval=0)
seq = Seq(m, 'seq', clk, rst)
seq(time_counter.inc())

Beispiel #22
0
def run(a_shape=(15, 15),
        b_shape=(15, 15),
        a_dtype=ng.int32,
        b_dtype=ng.int32,
        c_dtype=ng.int32,
        par=1,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):
    """Generate, simulate and verify a ``relu(a + b)`` NNgen design.

    The function builds the dataflow ``c = relu(add(a, b))``, converts it to
    a Veriloggen module named 'matrix_add_relu', prepares a memory image that
    holds the inputs and the software-evaluated expected output, wraps the
    design in a test bench and runs a Verilog simulation that compares the
    hardware result with the expected one word by word.

    Args:
        a_shape: Shape of placeholder ``a``.
        b_shape: Shape of placeholder ``b``.
        a_dtype: NNgen dtype of ``a``.
        b_dtype: NNgen dtype of ``b``.
        c_dtype: NNgen dtype passed to ``ng.add`` and ``ng.relu``.
        par: ``par`` argument passed to ``ng.add`` and ``ng.relu``; also used
            when laying out the memory image.
        axi_datawidth: Data width of the AXI master interface
            ('maxi_datawidth' in the generator config) and of the memory
            model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the test-bench Verilog is written there.
        simtype: Simulator name forwarded to ``simulation.Simulator``.
        outputfile: Forwarded to ``sim.run``; defaults to
            "<base name of this script>.out".

    Returns:
        The simulator output as a string. For ``simtype == 'verilator'`` a
        final line starting with '-' is removed.
    """

    # create target hardware
    a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')
    t = ng.add(a, b, dtype=c_dtype, par=par)
    c = ng.relu(t, dtype=c_dtype, par=par)

    targ = ng.to_veriloggen([c],
                            'matrix_add_relu',
                            silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    # Deterministic ramps folded by the modulo: va lies in [-10, -6] and
    # vb in [-10, -5].
    # NOTE(review): with these ranges a + b is always negative, so the relu
    # output is presumably all zeros -- confirm this is the intended coverage.
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [5] - [10]
    vb = (np.arange(b.length, dtype=np.int64).reshape(b.shape) +
          [100]) % [6] - [10]

    # Expected output computed in software.
    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # to memory image
    # Largest tensor size rounded up to a multiple of 4096; the expected
    # output is placed one such region above the highest tensor address and
    # tmp_addr one further region above that.
    size_max = int(
        math.ceil(
            max(a.memory_size, b.memory_size, c.memory_size) / 4096)) * 4096
    check_addr = max(a.addr, b.addr, c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)],
                   dtype=np.int64)
    # Every word starts at 100 rather than 0.
    mem = mem + [100]

    axi.set_memory(mem, va, memimg_datawidth, a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth, b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))
    axi.set_memory(mem, vc, memimg_datawidth, c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # RST is assigned the logical inverse of RESETN.
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Number of rows when all leading dimensions of c are flattened.
    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    # Thread body handed to vthread.Thread below: start the design, wait for
    # it, then compare the result region with the expected region.
    def ctrl():
        # empty loop before the first access (presumably a start-up delay)
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                # rows are addressed with the aligned size of the last axis
                orig = memory.read_word(i * c.aligned_shape[-1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # Fallback: finish the simulation after a fixed delay.
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Drop a final line starting with '-' from verilator output. The
    # `lines` check avoids an IndexError when the output is empty.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def run(act_shape=(1, 4, 4, 3),
        weight0_shape=(9, 3, 3, 3),
        weight1_shape=(9, 36),
        act_dtype=ng.int32,
        weight_dtype=ng.int32,
        stride0=1,
        padding0=0,
        with_batchnorm0=False,
        with_batchnorm1=False,
        act_func0='ReLU',
        act_func1='relu',
        disable_fusion=False,
        par_ich=1,
        par_och=1,
        par_col=1,
        par_row=1,
        concur_och=None,
        stationary='filter',
        chunk_size=64,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):
    """Export a PyTorch conv2d/transpose/linear model and verify it in NNgen.

    A small ``nn.Sequential`` model (Conv2d, optional BatchNorm and
    activation, a transpose of the last two axes, flatten, Linear, optional
    BatchNorm and activation) is exported to the ONNX file
    'onnx_matrix_conv2d_transpose_linear.onnx', imported with
    ``ng.from_onnx``, quantized, evaluated in software, converted to a
    Veriloggen module and finally simulated against the software result.

    Args:
        act_shape: Shape of the input; the dummy export input is
            ``torch.randn(*act_shape).transpose(1, 3)``.
        weight0_shape: Conv2d geometry: index 3 is the number of input
            channels, index 0 the number of output channels and index 1 the
            kernel size.
        weight1_shape: Linear geometry: index 1 is the number of input
            features and index 0 the number of output features.
        act_dtype: NNgen dtype for activations.
        weight_dtype: NNgen dtype for weights.
        stride0: ``stride`` of the Conv2d layer.
        padding0: ``padding`` of the Conv2d layer.
        with_batchnorm0: Append ``nn.BatchNorm2d`` after the Conv2d layer.
        with_batchnorm1: Append ``nn.BatchNorm1d`` after the Linear layer.
        act_func0: Name of a ``torch.nn`` layer class appended after the
            first stage, or None. Matched case-insensitively if the exact
            name does not exist.
        act_func1: Same as ``act_func0`` for the second stage.
        disable_fusion: Forwarded to ``ng.from_onnx``.
        par_ich, par_och, par_col, par_row, concur_och: Hardware attributes
            applied to every ``ng.conv2d`` operator; ``par_ich`` and
            ``par_och`` are also used when laying out the memory image.
        stationary: Accepted but not used by this function.
        chunk_size: Alignment used for the parameter export and the memory
            layout; also passed as 'chunk_size' in the generator config.
        axi_datawidth: Data width of the AXI master interface and of the
            memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the test-bench Verilog is written there.
        simtype: Simulator name; if None, ``sys.exit()`` is called after
            hardware generation and nothing is simulated.
        outputfile: Forwarded to ``sim.run``; defaults to
            "<base name of this script>.out".

    Returns:
        The simulator output as a string. For ``simtype == 'verilator'`` a
        final line starting with '-' is removed.

    Raises:
        AttributeError: If an activation name matches no ``torch.nn``
            attribute, even case-insensitively.
    """

    def make_activation(name):
        # torch.nn names its activation layers in CamelCase (e.g. ``ReLU``).
        # An exact match is used when it exists; otherwise fall back to a
        # case-insensitive match so that spellings such as the default
        # 'relu' resolve to the corresponding layer class.
        if hasattr(nn, name):
            return getattr(nn, name)()
        for candidate in dir(nn):
            if candidate.lower() == name.lower():
                return getattr(nn, candidate)()
        raise AttributeError('torch.nn has no layer named %r' % (name,))

    # pytorch model
    layers = []
    layers.append(
        nn.Conv2d(weight0_shape[3],
                  weight0_shape[0],
                  weight0_shape[1],
                  stride=stride0,
                  padding=padding0))

    if with_batchnorm0:
        layers.append(nn.BatchNorm2d(weight0_shape[0]))

    if act_func0 is not None:
        layers.append(make_activation(act_func0))

    class Transpose(nn.Module):
        """Module that permutes the axes of its input by ``perm``."""

        def __init__(self, perm):
            super(Transpose, self).__init__()
            self.perm = perm

        def forward(self, input):
            return input.permute(*self.perm)

    # swap the last two axes of the 4-D activation
    layers.append(Transpose([0, 1, 3, 2]))

    class Flatten(nn.Module):
        """Module that keeps axis 0 and flattens all remaining axes."""

        def forward(self, input):
            # return input.view(input.size(0), -1)
            return torch.reshape(input, (input.size(0), -1))

    layers.append(Flatten())
    layers.append(nn.Linear(weight1_shape[1], weight1_shape[0]))

    if with_batchnorm1:
        # The Linear output is 2-D (batch, features). BatchNorm2d only
        # accepts 4-D input, so the 1-D variant is the one that fits here.
        layers.append(nn.BatchNorm1d(weight1_shape[0]))

    if act_func1 is not None:
        layers.append(make_activation(act_func1))

    model = nn.Sequential(*layers)

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_conv2d_transpose_linear.onnx'
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    model.eval()
    torch.onnx.export(model,
                      dummy_input,
                      onnx_filename,
                      input_names=input_names,
                      output_names=output_names)

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    value_dtypes = {
        'act': act_dtype,
        '0.weight': weight_dtype,
        '3.weight': weight_dtype,
        'out': act_dtype
    }

    (outputs, placeholders, variables, constants,
     operators) = ng.from_onnx(onnx_filename,
                               value_dtypes=value_dtypes,
                               default_placeholder_dtype=act_dtype,
                               default_variable_dtype=weight_dtype,
                               default_constant_dtype=weight_dtype,
                               default_operator_dtype=act_dtype,
                               default_scale_dtype=ng.int32,
                               default_bias_dtype=ng.int32,
                               disable_fusion=disable_fusion)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    # Input scale factor: 128 for dtypes wider than 8 bits, otherwise half
    # of 2**(width - 1).
    if act_dtype.width > 8:
        act_scale_factor = 128
    else:
        act_scale_factor = int(round(2**(act_dtype.width - 1) * 0.5))

    input_scale_factors = {'act': act_scale_factor}

    ng.quantize(outputs, input_scale_factors)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=par_ich,
                         par_och=par_och,
                         par_row=par_row,
                         par_col=par_col,
                         concur_och=concur_och)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    act = placeholders['act']
    out = outputs['out']

    # verification data
    # random data
    std = 0.2
    mean = 0.5
    img = np.random.normal(size=act.length).astype(np.float32).reshape(
        act.shape)
    img = img * std + mean

    # execution on pytorch
    model_input = img

    # Undo the placeholder's axis permutation before feeding the torch model.
    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    if act.perm is not None and len(model_out.shape) == len(act.shape):
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    # software-based verification
    # Scale, clip to the symmetric range of the activation dtype, and round.
    vact = img * act_scale_factor
    vact = np.clip(vact, -1.0 * (2**(act.dtype.width - 1) - 1),
                   1.0 * (2**(act.dtype.width - 1) - 1))
    vact = np.round(vact).astype(np.int64)

    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    # Quality metrics versus the float model; computed for inspection only
    # and not used further below.
    mean_square_error = np.sum((vout - scaled_model_out)**2) / vout.size
    corrcoef = np.corrcoef(model_out.reshape([-1]), vout.reshape([-1]))

    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    targ = ng.to_veriloggen([out],
                            'onnx_matrix_conv2d_transpose_linear',
                            silent=silent,
                            config={
                                'maxi_datawidth': axi_datawidth,
                                'chunk_size': chunk_size
                            })

    # --------------------
    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    if simtype is None:
        sys.exit()

    # to memory image
    param_data = ng.export_ndarray([out], chunk_size)
    param_bytes = len(param_data)

    # Layout: input, parameters, expected output, temporary area; each
    # region starts at the next multiple of chunk_size.
    variable_addr = int(math.ceil(
        (act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil(
        (variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil(
        (check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)],
                   dtype=np.int64)
    # Every word starts at 100 rather than 0.
    mem = mem + [100]

    # placeholder
    axi.set_memory(
        mem, vact, memimg_datawidth, act_dtype.width, act.addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(
        mem, vout, memimg_datawidth, act_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # RST is assigned the logical inverse of RESETN.
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Thread body handed to vthread.Thread below: start the design, wait for
    # it, then compare the result region with the expected region.
    def ctrl():
        # empty loop before the first access (presumably a start-up delay)
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(out.shape[0]):
            for j in range(out.shape[1]):
                # rows are addressed with the aligned size of axis 1
                orig = memory.read_word(i * out.aligned_shape[1] + j, out.addr,
                                        act_dtype.width)
                check = memory.read_word(i * out.aligned_shape[1] + j,
                                         check_addr, act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', i, j, ') orig: ', orig, 'check: ', check)
                    ok = False
                # else:
                #    print('OK (', i, j, ') orig: ', orig, 'check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # Fallback: finish the simulation after a fixed delay.
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Drop a final line starting with '-' from verilator output. The
    # `lines` check avoids an IndexError when the output is empty.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def mkTest(memimg_name=None, axi_datawidth=32, datawidth=4, addrwidth=10):
    """Build the 'test' simulation module around the design from mkLed.

    Args:
        memimg_name: Forwarded to ``axi.AxiMemoryModel`` as ``memimg_name``.
        axi_datawidth: Forwarded to ``mkLed``.
        datawidth: Forwarded to ``mkLed``; also the word width used for the
            ``read_word`` / ``write_word`` accesses in the control thread.
        addrwidth: Forwarded to ``mkLed``.

    Returns:
        The constructed test ``Module``.
    """
    m = Module('test')

    # target instance
    led = mkLed(axi_datawidth, datawidth, addrwidth)

    # copy params and ports
    params = m.copy_params(led)
    ports = m.copy_sim_ports(led)

    clk = ports['CLK']
    rst = ports['RST']

    # Memory model attached to the ports selected by the name 'myaxi'.
    memory = axi.AxiMemoryModel(m, 'memory', clk, rst, memimg_name=memimg_name)
    memory.connect(ports, 'myaxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # Thread body handed to vthread.Thread below.
    def ctrl():
        # empty loop before the first access (presumably a start-up delay)
        for i in range(100):
            pass

        # Rewrite the first 16 words of the memory model.
        for i in range(16):
            # word addressing
            r = memory.read_word(i, 0, datawidth)
            print('read:  mem[%d] -> %x' % (i, r))

            # word addressing
            # new value is reduced modulo (2**datawidth - 1)
            w = (r + i + 100) % (2**datawidth - 1)
            memory.write_word(i, 0, w, datawidth)
            print('write: mem[%d] <- %x' % (i, w))

        # Write 1 to address 0 of the AXI-Lite interface
        # (presumably the start register of the design -- confirm).
        awaddr = 0
        _saxi.write(awaddr, 1)

        # Poll address 4 until it reads non-zero.
        araddr = 4
        v = _saxi.read(araddr)
        while v == 0:
            v = _saxi.read(araddr)

        # The value at address 8 decides PASSED / FAILED.
        araddr = 8
        v = _saxi.read(araddr)
        if v:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(led,
                     'uut',
                     params=m.connect_params(led),
                     ports=m.connect_ports(led))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, rst, m.make_reset(), period=100)

    # ctrl() does not call vthread.finish(); this delayed $finish is what
    # ends the simulation.
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    return m
def run(act_shape=(1, 7, 7, 15), weight_shape=(7, 3, 3, 15),
        bias_shape=None, scale_shape=None,
        act_dtype=ng.int32, weight_dtype=ng.int32,
        bias_dtype=ng.int32, scale_dtype=ng.int32,
        out_dtype=ng.int32,
        conv2d_stride=(1, 1, 1, 1),
        rshift_mul=None, rshift_sum=None, rshift_out=None,
        act_func=None,
        par_ich=1, par_och=1, par_col=1, par_row=1,
        concur_och=None, stationary='filter',
        input_ram_size=None, filter_ram_size=None,
        bias_ram_size=None, scale_ram_size=None,
        out_ram_size=None,
        ksize=(1, 2, 2, 1), pool_stride=(1, 2, 2, 1), par=1,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Generate, simulate and verify a conv2d + avg_pool NNgen design.

    The function builds ``out = avg_pool(conv2d(act, weight, ...))``,
    converts it to a Veriloggen module named 'matrix_conv2d_avg_pool',
    prepares a memory image with the inputs and the software-evaluated
    expected output, wraps the design in a test bench and runs a Verilog
    simulation that compares the hardware result with the expected one.

    Args:
        act_shape: Shape of the ``act`` placeholder.
        weight_shape: Shape of the ``weight`` variable.
        bias_shape: Shape of the ``bias`` variable, or None for no bias.
        scale_shape: Shape of the ``scale`` variable, or None for no scale.
        act_dtype, weight_dtype, bias_dtype, scale_dtype: NNgen dtypes of
            the corresponding tensors.
        out_dtype: dtype passed to ``ng.conv2d`` and ``ng.avg_pool``.
        conv2d_stride: Strides passed to ``ng.conv2d``.
        rshift_mul, rshift_sum, rshift_out, act_func: Passed positionally to
            ``ng.conv2d``.
        par_ich, par_och, par_col, par_row, concur_och, stationary: Passed
            positionally to ``ng.conv2d``; ``par_ich`` and ``par_och`` are
            also used when laying out the memory image.
        input_ram_size, filter_ram_size, bias_ram_size, scale_ram_size,
        out_ram_size: Passed positionally to ``ng.conv2d``.
        ksize: ``ksize`` of ``ng.avg_pool``.
        pool_stride: ``strides`` of ``ng.avg_pool``.
        par: ``par`` of ``ng.avg_pool``; also used for the expected-output
            memory layout.
        axi_datawidth: Data width of the AXI master interface and of the
            memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not None, the test-bench Verilog is written there.
        simtype: Simulator name forwarded to ``simulation.Simulator``.
        outputfile: Forwarded to ``sim.run``; defaults to
            "<base name of this script>.out".

    Returns:
        The simulator output as a string. For ``simtype == 'verilator'`` a
        final line starting with '-' is removed.
    """

    # create target hardware
    act = ng.placeholder(act_dtype, shape=act_shape, name='act')
    weight = ng.variable(weight_dtype, shape=weight_shape, name='weight')

    if bias_shape is not None:
        bias = ng.variable(bias_dtype, bias_shape, name='bias')
    else:
        bias = None

    if scale_shape is not None:
        scale = ng.variable(scale_dtype, scale_shape, name='scale')
    else:
        scale = None

    # All arguments are positional; the padding mode is fixed to 'SAME'.
    tmp = ng.conv2d(act, weight, conv2d_stride,
                    bias, scale,
                    rshift_mul, rshift_sum, rshift_out,
                    act_func, 'SAME',
                    out_dtype, ng.int32, ng.int32,
                    'conv2d',
                    par_ich, par_och, par_col, par_row,
                    concur_och, stationary,
                    input_ram_size, filter_ram_size,
                    bias_ram_size, scale_ram_size,
                    None, None, None,
                    out_ram_size)

    out = ng.avg_pool(tmp, ksize=ksize,
                      strides=pool_stride,
                      sum_dtype=ng.int32, dtype=out_dtype, par=par)

    targ = ng.to_veriloggen([out], 'matrix_conv2d_avg_pool', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    # Deterministic ramps folded into small ranges by the modulo.
    vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [16]
    vweight = np.arange(weight.length,
                        dtype=np.int64).reshape(weight.shape) % [32] - [16]

    if bias is not None:
        vbias = np.arange(bias.length,
                          dtype=np.int64).reshape(bias.shape) % [4]
    else:
        vbias = None

    if scale is not None:
        vscale = np.arange(scale.length,
                           dtype=np.int64).reshape(scale.shape) % [6]
    else:
        vscale = None

    # Expected output computed in software.
    eval_outs = ng.eval([out], act=vact, weight=vweight, bias=vbias, scale=vscale)
    vout = eval_outs[0]

    # to memory image
    # Largest tensor size rounded up to a multiple of 4096; the expected
    # output is placed one such region above the highest tensor address and
    # tmp_addr one further region above that.
    size_max = int(math.ceil(max(act.memory_size, weight.memory_size,
                                 bias.memory_size if bias is not None else 0,
                                 scale.memory_size if scale is not None else 0,
                                 out.memory_size) / 4096)) * 4096
    check_addr = max(act.addr, weight.addr,
                     bias.addr if bias is not None else -1,
                     scale.addr if scale is not None else -1,
                     out.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)], dtype=np.int64)
    # Every word starts at 100 rather than 0.
    mem = mem + [100]

    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich))

    axi.set_memory(mem, vweight, memimg_datawidth,
                   weight_dtype.width, weight.addr,
                   max(int(math.ceil(axi_datawidth / weight_dtype.width)), par_ich))

    if bias is not None:
        axi.set_memory(mem, vbias, memimg_datawidth,
                       bias_dtype.width, bias.addr,
                       max(int(math.ceil(axi_datawidth / bias_dtype.width)), par_och))

    if scale is not None:
        axi.set_memory(mem, vscale, memimg_datawidth,
                       scale_dtype.width, scale.addr,
                       max(int(math.ceil(axi_datawidth / scale_dtype.width)), par_och))

    axi.set_memory(mem, vout, memimg_datawidth,
                   out_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / out_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # RST is assigned the logical inverse of RESETN.
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # Thread body handed to vthread.Thread below: start the design, wait for
    # it, then compare the result region with the expected region.
    def ctrl():
        # empty loop before the first access (presumably a start-up delay)
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        # word index computed from the aligned shape of out
                        orig = memory.read_word(bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3]
                                                + y * out.aligned_shape[2] * out.aligned_shape[3]
                                                + x * out.aligned_shape[3] + ch,
                                                out.addr, out_dtype.width)
                        check = memory.read_word(bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3]
                                                 + y * out.aligned_shape[2] * out.aligned_shape[3]
                                                 + x * out.aligned_shape[3] + ch,
                                                 check_addr, out_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch,
                                  ') orig: ', orig, ' check: ', check)
                            ok = False
                        # else:
                        #    print('OK (', bat, y, x, ch,
                        #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # Fallback: finish the simulation after a fixed delay.
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Drop a final line starting with '-' from verilator output. The
    # `lines` check avoids an IndexError when the output is empty.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Beispiel #26
0
def run(a_shape=(15, 15),
        b_shape=(15, 15),
        a_dtype=ng.int32,
        b_dtype=ng.int32,
        c_dtype=ng.int32,
        par=1,
        axi_datawidth=32,
        interrupt_name='irq',
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):

    # create target hardware
    a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')

    d = ng.add(a, b, dtype=c_dtype, par=par)
    e = ng.add(b, a, dtype=c_dtype, par=par)

    # SW returns ng.add(x, y)
    f = ng.extern([d, e], shape=a_shape, opcode=0x1, func=lambda x, y: x + y)
    g = ng.sub(f, a)

    # SW returns d as-is
    h = ng.extern([g], shape=a_shape, opcode=0x2, func=lambda x: x)
    c = ng.sub(h, b)

    targ = ng.to_veriloggen([c],
                            'matrix_extern',
                            silent=silent,
                            config={
                                'maxi_datawidth': axi_datawidth,
                                'interrupt_name': interrupt_name
                            })

    # verification data
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [16]
    vb = np.arange(b.length, dtype=np.int64).reshape(b.shape) % [32] + [16]

    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # to memory image
    size_max = int(
        math.ceil(
            max(a.memory_size, b.memory_size, c.memory_size) / 4096)) * 4096
    check_addr = max(a.addr, b.addr, c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)],
                   dtype=np.int64)
    mem = mem + [100]

    axi.set_memory(mem, va, memimg_datawidth, a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth, b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))
    axi.set_memory(mem, vc, memimg_datawidth, c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    irq = ports[interrupt_name]
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg_datawidth=memimg_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    def irq_join(saxi, irq_bit):
        while irq == 0:
            pass
        araddr = ng.control_reg_interrupt_isr * 4
        irq_stat = saxi.read(araddr)

        if irq_stat != irq_bit:
            print('# Unexpected irq signal: %d' % irq_stat)
            print('# verify: FAILED')
            vthread.finish()

        print('# irq stat = %d' % irq_stat)
        awaddr = ng.control_reg_interrupt_iar * 4
        saxi.write(awaddr, irq_bit)

    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        araddr_ext_snd = ng.control_reg_extern_send * 4
        awaddr_ext_rcv = ng.control_reg_extern_recv * 4
        awaddr_irq_ier = ng.control_reg_interrupt_ier * 4
        araddr_irq_isr = ng.control_reg_interrupt_isr * 4
        awaddr_irq_iar = ng.control_reg_interrupt_iar * 4
        _saxi.write(awaddr_irq_ier, 3)  # irq enable

        ng.sim.sw_rst(_saxi)

        print('# 0st software reset (during idle)')

        for i in range(100):
            pass

        irq_stat = _saxi.read(araddr_irq_isr)
        if irq_stat != 0:
            print('# Unexpected irq signal: %d' % irq_stat)
            print('# verify: FAILED')
            vthread.finish()
        print('# irq stat = %d' %
              irq_stat)  # no irq busy by software reset when idle

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# 1st test start')

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - d.default_global_addr
                y_offset = tmp_addr - e.default_global_addr
                z_offset = tmp_addr - f.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     d.addr + x_offset, c_dtype.width)
                y = memory.read_word(i * c.aligned_shape[-1] + j,
                                     e.addr + y_offset, c_dtype.width)
                z = x + y
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  f.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        # software reset
        ng.sim.sw_rst(_saxi)

        print('# 1st software reset (before resume)')

        # from extern-send
        irq_join(_saxi, 1)

        # restart
        ng.sim.start(_saxi)

        print('# Restart')

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - d.default_global_addr
                y_offset = tmp_addr - e.default_global_addr
                z_offset = tmp_addr - f.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     d.addr + x_offset, c_dtype.width)
                y = memory.read_word(i * c.aligned_shape[-1] + j,
                                     e.addr + y_offset, c_dtype.width)
                z = x + y
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  f.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - g.default_global_addr
                z_offset = tmp_addr - h.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     g.addr + x_offset, c_dtype.width)
                z = x
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  h.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # from extern-send

        irq_join(_saxi, 1)
        #ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok_1st = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok_1st = False
                # else:
                #    print('OK', i, j, orig, check)

        # 2nd test

        # start
        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# 2nd test start')

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - d.default_global_addr
                y_offset = tmp_addr - e.default_global_addr
                z_offset = tmp_addr - f.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     d.addr + x_offset, c_dtype.width)
                y = memory.read_word(i * c.aligned_shape[-1] + j,
                                     e.addr + y_offset, c_dtype.width)
                z = x + y
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  f.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        while (memory.waddr.awvalid) == 0:
            pass

        ng.sim.sw_rst(_saxi)

        print('# 2nd software reset (during Master AXI transaction)')

        irq_join(_saxi, 1)  # irq busy by software reset

        # restart
        ng.sim.start(_saxi)

        print('# Restart')

        # from extern-send
        irq_join(_saxi, 2)
        araddr = ng.control_reg_extern_send * 4
        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - d.default_global_addr
                y_offset = tmp_addr - e.default_global_addr
                z_offset = tmp_addr - f.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     d.addr + x_offset, c_dtype.width)
                y = memory.read_word(i * c.aligned_shape[-1] + j,
                                     e.addr + y_offset, c_dtype.width)
                z = x + y
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  f.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - g.default_global_addr
                z_offset = tmp_addr - h.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     g.addr + x_offset, c_dtype.width)
                z = x
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  h.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # termination
        irq_join(_saxi, 1)
        #ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok_2nd = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok_2nd = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok_1st and ok_2nd:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def run(
        act_dtype=ng.int16,
        weight_dtype=ng.int8,
        bias_dtype=ng.int32,
        scale_dtype=ng.int8,
        with_batchnorm=True,
        disable_fusion=False,
        conv2d_par_ich=1,
        conv2d_par_och=1,
        conv2d_par_col=1,
        conv2d_par_row=1,
        conv2d_concur_och=None,
        conv2d_stationary='filter',
        pool_par=1,
        elem_par=1,
        chunk_size=64,
        axi_datawidth=32,
        silent=False,
        filename=None,
        # simtype='iverilog',
        # simtype='verilator',
        simtype=None,  # no RTL simulation
        outputfile=None):
    """Convert pretrained torchvision ResNet-18 to NNgen hardware and verify it.

    The function exports the PyTorch model to ONNX, imports it with
    ``ng.from_onnx``, quantizes it, compares the NNgen software evaluation
    against PyTorch (per selected layer and on the final output), generates
    an IP-XACT package, and, when ``simtype`` is given, runs an RTL
    simulation that compares the hardware output with the software result.

    Side effects visible in this function: writes ``resnet18_imagenet.onnx``,
    reads ``car.png`` and ``imagenet_class_index.json`` from the current
    directory, and prints the top-10 predictions.

    Args:
        act_dtype: Default dtype for placeholders and operators; its width
            is also used for the input/expected-output memory images and
            for reading result words in the simulation.
        weight_dtype: Default dtype for variables and constants.
        bias_dtype: Default dtype for biases.
        scale_dtype: Default dtype for scales.
        with_batchnorm: Must be true; any false value raises ``ValueError``.
        disable_fusion: Forwarded to ``ng.from_onnx``.
        conv2d_par_ich, conv2d_par_och, conv2d_par_col, conv2d_par_row,
        conv2d_concur_och, conv2d_stationary: Attributes applied to every
            ``ng.conv2d`` operator. ``conv2d_par_ich`` / ``conv2d_par_och``
            also bound the alignment word count of the input / expected
            output memory images.
        pool_par: ``par`` attribute applied to pooling operators.
        elem_par: ``par`` attribute applied to element-wise operators.
        chunk_size: Address alignment granularity, also passed to
            ``ng.export_ndarray``.
        axi_datawidth: Used as ``maxi_datawidth`` of the generated hardware
            and as the data width of the AXI memory model.
        silent: Forwarded to ``ng.to_ipxact``.
        filename: If not ``None``, the test bench is written there as Verilog.
        simtype: Simulator name passed to ``simulation.Simulator``. When
            ``None`` the function calls ``sys.exit()`` after hardware
            generation and never returns.
        outputfile: Passed to ``sim.run``; defaults to a name derived from
            this file's name with a ``.out`` suffix.

    Returns:
        The simulator output text. For ``simtype == 'verilator'`` a trailing
        line starting with ``'-'`` is removed.

    Raises:
        ValueError: If ``with_batchnorm`` is false.
        SystemExit: If ``simtype`` is ``None``.
    """

    # input mean and standard deviation
    imagenet_mean = np.array([0.485, 0.456, 0.406]).astype(np.float32)
    imagenet_std = np.array([0.229, 0.224, 0.225]).astype(np.float32)

    act_shape = (1, 224, 224, 3)

    if not with_batchnorm:
        raise ValueError('with_batchnorm must be True for ResNet18.')

    # pytorch model
    model = torchvision.models.resnet18(pretrained=True)

    # Pytorch to ONNX
    onnx_filename = 'resnet18_imagenet.onnx'
    # act_shape is channel-last; swap axes 1 and 3 for the PyTorch dummy input
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    model.eval()
    torch.onnx.export(model,
                      dummy_input,
                      onnx_filename,
                      input_names=input_names,
                      output_names=output_names)

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    dtypes = {}
    (outputs, placeholders, variables, constants,
     operators) = ng.from_onnx(onnx_filename,
                               value_dtypes=dtypes,
                               default_placeholder_dtype=act_dtype,
                               default_variable_dtype=weight_dtype,
                               default_constant_dtype=weight_dtype,
                               default_operator_dtype=act_dtype,
                               default_scale_dtype=scale_dtype,
                               default_bias_dtype=bias_dtype,
                               disable_fusion=disable_fusion)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    if act_dtype.width > 8:
        act_scale_factor = 128
    else:
        act_scale_factor = int(round(2**(act_dtype.width - 1) * 0.5))

    input_scale_factors = {'act': act_scale_factor}
    input_means = {'act': imagenet_mean * act_scale_factor}
    input_stds = {'act': imagenet_std * act_scale_factor}

    ng.quantize(outputs, input_scale_factors, input_means, input_stds)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=conv2d_par_ich,
                         par_och=conv2d_par_och,
                         par_col=conv2d_par_col,
                         par_row=conv2d_par_row,
                         concur_och=conv2d_concur_och,
                         stationary=conv2d_stationary)

        if isinstance(op, (ng.avg_pool, ng.max_pool, ng.avg_pool_serial,
                           ng.max_pool_serial)):
            op.attribute(par=pool_par)

        if ng.is_elementwise_operator(op):
            op.attribute(par=elem_par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    act = placeholders['act']
    out = outputs['out']

    # verification data
    img = np.array(PIL.Image.open('car.png').convert('RGB')).astype(np.float32)
    img = img.reshape([1] + list(img.shape))

    img = img / 255
    img = (img - imagenet_mean) / imagenet_std

    # execution on pytorch
    model_input = np.broadcast_to(img, act_shape)

    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    if act.perm is not None and len(model_out.shape) == len(act.shape):
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    # software-based verification
    vact = img * act_scale_factor
    # clamp to the symmetric range representable by the activation width
    vact = np.clip(vact, -1.0 * (2**(act.dtype.width - 1) - 1),
                   1.0 * (2**(act.dtype.width - 1) - 1))
    vact = np.round(vact).astype(np.int64)
    vact = np.broadcast_to(vact, act_shape)

    # compare outputs of hidden layers
    # NOTE: despite its name, relu_op is the first conv2d operator that is
    # not a matmul; it is compared against PyTorch's conv1 + bn1 + relu below.
    relu_op = [
        v for k, v in operators.items()
        if isinstance(v, ng.conv2d) and not isinstance(v, ng.matmul)
    ][0]
    maxpool_op = [
        v for k, v in operators.items()
        if isinstance(v, (ng.max_pool, ng.max_pool_serial))
    ][0]
    relu_ops = [v for k, v in operators.items() if isinstance(v, ng.relu)]
    layer1_0_op = relu_ops[0]
    layer1_op = relu_ops[1]
    layer2_0_op = relu_ops[2]
    layer2_op = relu_ops[3]
    layer3_0_op = relu_ops[4]
    layer3_op = relu_ops[5]
    layer4_0_op = relu_ops[6]
    layer4_op = relu_ops[7]
    avgpool_op = [
        v for k, v in operators.items()
        if isinstance(v, (ng.avg_pool, ng.avg_pool_serial))
    ][0]
    fc_op = [v for k, v in operators.items() if isinstance(v, ng.matmul)][0]
    sub_ops = [
        relu_op, maxpool_op, layer1_0_op, layer1_op, layer2_0_op, layer2_op,
        layer3_0_op, layer3_op, layer4_0_op, layer4_op, avgpool_op, fc_op
    ]
    sub_outs = ng.eval(sub_ops, act=vact)
    # all but the last (fc) output are moved to channel-first order so they
    # line up with the PyTorch intermediate outputs
    sub_outs = [sub_out.transpose([0, 3, 1, 2])
                for sub_out in sub_outs[:-1]] + sub_outs[-1:]
    sub_scale_factors = [sub_op.scale_factor for sub_op in sub_ops]

    model.eval()
    model_relu_out = nn.Sequential(model.conv1, model.bn1, model.relu)(
        torch.from_numpy(model_input)).detach().numpy()
    model_maxpool_out = nn.Sequential(
        model.conv1, model.bn1, model.relu,
        model.maxpool)(torch.from_numpy(model_input)).detach().numpy()

    #    class model_layer1_0(nn.Module):
    #        def __init__(self):
    #            super(model_layer1_0, self).__init__()
    #            self.conv1 = model.conv1
    #            self.bn1 = model.bn1
    #            self.relu = model.relu
    #            self.maxpool = model.maxpool
    #            self.layer1_0 = model.layer1[0]
    #
    #        def forward(self, x):
    #            x = self.relu(self.bn1(self.conv1(x)))
    #            x = self.maxpool(x)
    #            x = self.layer1_0(x)
    #            return x
    #
    #    model_layer1_0_out = model_layer1_0()(torch.from_numpy(model_input)).detach().numpy()

    model_layer1_0_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool,
        model.layer1[0])(torch.from_numpy(model_input)).detach().numpy()
    model_layer1_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool,
        model.layer1)(torch.from_numpy(model_input)).detach().numpy()

    model_layer2_0_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2[0])(torch.from_numpy(model_input)).detach().numpy()
    model_layer2_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2)(torch.from_numpy(model_input)).detach().numpy()

    model_layer3_0_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2,
        model.layer3[0])(torch.from_numpy(model_input)).detach().numpy()
    model_layer3_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2,
        model.layer3)(torch.from_numpy(model_input)).detach().numpy()

    model_layer4_0_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2, model.layer3,
        model.layer4[0])(torch.from_numpy(model_input)).detach().numpy()
    model_layer4_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2, model.layer3,
        model.layer4)(torch.from_numpy(model_input)).detach().numpy()

    model_avgpool_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
        model.layer2, model.layer3, model.layer4,
        model.avgpool)(torch.from_numpy(model_input)).detach().numpy()

    class Flatten(nn.Module):
        # Collapses every dimension after the first into one.
        def forward(self, input):
            return input.view(input.size(0), -1)

    model_fc_out = nn.Sequential(
        model.conv1, model.bn1, model.relu, model.maxpool,
        model.layer1, model.layer2, model.layer3, model.layer4, model.avgpool,
        Flatten(), model.fc)(torch.from_numpy(model_input)).detach().numpy()

    model_outs = [
        model_relu_out, model_maxpool_out, model_layer1_0_out,
        model_layer1_out, model_layer2_0_out, model_layer2_out,
        model_layer3_0_out, model_layer3_out, model_layer4_0_out,
        model_layer4_out, model_avgpool_out, model_fc_out
    ]
    scaled_outs = [
        model_out * scale_factor
        for model_out, scale_factor in zip(model_outs, sub_scale_factors)
    ]

    # The statistics below are not used further in this function; they are
    # kept for inspection (e.g. from the commented-out breakpoint below).
    max_diffs = [
        model_out.max() / sub_out.max()
        for model_out, sub_out in zip(scaled_outs, sub_outs)
    ]
    overflows = [
        np.sum(np.abs(sub_out) >= abs(2**(sub_op.dtype.width - 1) - 1))
        for sub_op, sub_out in zip(sub_ops, sub_outs)
    ]
    mean_square_errors = [
        np.sum((sub_out - model_out)**2) / sub_out.size
        for model_out, sub_out in zip(scaled_outs, sub_outs)
    ]
    corrcoefs = [
        np.corrcoef(model_out.reshape([-1]), sub_out.reshape([-1]))
        for model_out, sub_out in zip(model_outs, sub_outs)
    ]

    # compare prediction results
    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    mean_square_error = np.sum((vout - scaled_model_out)**2) / vout.size
    corrcoef = np.corrcoef(model_out.reshape([-1]), vout.reshape([-1]))

    # close the label file deterministically instead of leaking the handle
    with open('imagenet_class_index.json', 'r') as class_index_file:
        class_index = json.load(class_index_file)
    labels = {int(key): value for (key, value) in class_index.items()}

    mout = scaled_model_out
    for bat in range(mout.shape[0]):
        m_top10 = list(
            sorted(enumerate(mout[bat]), key=lambda x: x[1],
                   reverse=True))[:10]
        m_top10_indexes = [index for index, value in m_top10]
        v_top10 = list(
            sorted(enumerate(vout[bat]), key=lambda x: x[1],
                   reverse=True))[:10]
        v_top10_indexes = [index for index, value in v_top10]
        num_hit = 0
        score = 0
        for index, value in m_top10:
            print("# mout: %s (%d) = %f" % (str(labels[index]), index, value))
        for index, value in v_top10:
            print("# vout: %s (%d) = %d" % (str(labels[index]), index, value))
            if index in m_top10_indexes:
                num_hit += 1
                # a hit scores 10 minus the rank distance between both lists
                score += 10 - abs(
                    m_top10_indexes.index(index) -
                    v_top10_indexes.index(index))
        print("# top-10 hit: %d" % num_hit)
        print("# top-10 score: %d" % score)

    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    # to Veriloggen object
    # targ = ng.to_veriloggen([out], 'resnet18', silent=silent,
    #                        config={'maxi_datawidth': axi_datawidth})

    # to IP-XACT (the method returns Veriloggen object, as well as to_veriloggen)
    targ = ng.to_ipxact([out],
                        'resnet18',
                        silent=silent,
                        config={'maxi_datawidth': axi_datawidth})

    # to Verilog HDL RTL (the method returns a source code text)
    # rtl = ng.to_verilog([out], 'resnet18', silent=silent,
    #                    config={'maxi_datawidth': axi_datawidth})

    # --------------------
    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    # without a simulator the script ends here (the process exits)
    if simtype is None:
        sys.exit()

    # to memory image
    param_data = ng.export_ndarray([out], chunk_size)
    param_bytes = len(param_data)

    # each region starts at the next chunk_size-aligned address after the
    # previous one: input -> parameters -> expected output -> temporaries
    variable_addr = int(math.ceil(
        (act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil(
        (variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil(
        (check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    # mem = np.zeros([1024 * 1024 * 256 // (memimg_datawidth // 8)], dtype=np.int64)
    mem = np.zeros([1024 * 1024 * 1024 // (memimg_datawidth // 8)],
                   dtype=np.int16)
    # fill every word with 100 before the regions below are written
    mem = mem + [100]

    # placeholder
    axi.set_memory(
        mem, vact, memimg_datawidth, act_dtype.width, act.addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(
        mem, vout, memimg_datawidth, act_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # the design exposes RESETN; RST is its inverse for the test-bench parts
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Control thread body (passed to vthread.Thread below): starts the
    # design, waits for it, then compares output words with expected words.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for x in range(out.shape[1]):
                orig = memory.read_word(bat * out.aligned_shape[1] + x,
                                        out.addr, act_dtype.width)
                check = memory.read_word(bat * out.aligned_shape[1] + x,
                                         check_addr, act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', bat, x, ') orig: ', orig, ' check: ', check)
                    ok = False
                else:
                    print('OK (', bat, x, ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # watchdog: finish the simulation after this delay in any case
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # drop a trailing line starting with '-' from verilator output; the
    # emptiness check avoids an IndexError when nothing was printed
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Beispiel #28
0
def run(act_shape=(1, 32, 32, 3),
        act_dtype=ng.int32, weight_dtype=ng.int32,
        bias_dtype=ng.int32, scale_dtype=ng.int32,
        out_dtype=ng.int32,
        with_batchnorm=True, disable_fusion=False,
        conv2d_par_ich=1, conv2d_par_och=1, conv2d_par_col=1, conv2d_par_row=1,
        conv2d_concur_och=None, conv2d_stationary='filter',
        pool_par=1, elem_par=1,
        chunk_size=64,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Convert an untrained torchvision ResNet-18 to NNgen hardware and simulate it.

    The model gets a 10-output ``nn.Linear`` head, is exported to ONNX
    (``resnet18.onnx``), imported with ``ng.from_onnx``, quantized with a
    value range for the input, turned into a Veriloggen design, and
    simulated. The simulation compares the hardware output words with the
    NNgen software evaluation (``ng.eval``) for a random input.

    Args:
        act_shape: Shape of the dummy input used for the ONNX export; its
            last entry is assigned to ``model.conv1.in_channels``.
        act_dtype: Default dtype for placeholders; its width is used for
            the input memory image.
        weight_dtype: Default dtype for variables and constants.
        bias_dtype: Default dtype for biases.
        scale_dtype: Default dtype for scales.
        out_dtype: Default dtype for operators; its width is used for the
            expected-output memory image and for reading result words.
        with_batchnorm: Must be true; any false value raises ``ValueError``.
        disable_fusion: Forwarded to ``ng.from_onnx``.
        conv2d_par_ich, conv2d_par_och, conv2d_par_col, conv2d_par_row,
        conv2d_concur_och, conv2d_stationary: Attributes applied to every
            ``ng.conv2d`` operator. ``conv2d_par_ich`` / ``conv2d_par_och``
            also bound the alignment word count of the input / expected
            output memory images.
        pool_par: ``par`` attribute applied to pooling operators.
        elem_par: ``par`` attribute applied to element-wise operators.
        chunk_size: Address alignment granularity, also passed to
            ``ng.make_param_array``.
        axi_datawidth: Used as ``maxi_datawidth`` of the generated hardware
            and as the data width of the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not ``None``, the test bench is written there as Verilog.
        simtype: Simulator name passed to ``simulation.Simulator``.
        outputfile: Passed to ``sim.run``; defaults to a name derived from
            this file's name with a ``.out`` suffix.

    Returns:
        The simulator output text. For ``simtype == 'verilator'`` a trailing
        line starting with ``'-'`` is removed.

    Raises:
        ValueError: If ``with_batchnorm`` is false.
    """

    if not with_batchnorm:
        raise ValueError('with_batchnorm must be True for ResNet18.')

    # pytorch model
    model = torchvision.models.resnet18(pretrained=False)

    model.conv1.in_channels = act_shape[-1]
    model.fc = nn.Linear(in_features=model.fc.in_features,
                         out_features=10, bias=True)

    # Pytorch to ONNX
    onnx_filename = 'resnet18.onnx'
    # act_shape is channel-last; swap axes 1 and 3 for the PyTorch dummy input
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    model.eval()
    torch.onnx.export(model, dummy_input, onnx_filename,
                      input_names=input_names, output_names=output_names)

    # ONNX to NNgen
    dtypes = {}
    (outputs, placeholders, variables,
     constants, operators) = ng.from_onnx(onnx_filename,
                                          value_dtypes=dtypes,
                                          default_placeholder_dtype=act_dtype,
                                          default_variable_dtype=weight_dtype,
                                          default_constant_dtype=weight_dtype,
                                          default_operator_dtype=out_dtype,
                                          default_scale_dtype=scale_dtype,
                                          default_bias_dtype=bias_dtype,
                                          disable_fusion=disable_fusion)

    # default linear quantization
    value_ranges = {'act': (0, 255)}

    ng.quantize(outputs, value_ranges=value_ranges)

    # set attribute
    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=conv2d_par_ich,
                         par_och=conv2d_par_och,
                         par_col=conv2d_par_col,
                         par_row=conv2d_par_row,
                         concur_och=conv2d_concur_och,
                         stationary=conv2d_stationary)

        if isinstance(op, (ng.avg_pool, ng.max_pool,
                           ng.avg_pool_serial, ng.max_pool_serial)):
            op.attribute(par=pool_par)

        if ng.is_elementwise_operator(op):
            op.attribute(par=elem_par)

    # create target hardware
    act = placeholders['act']
    out = outputs['out']

    targ = ng.to_veriloggen([out], 'onnx_resnet18', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    # random normal samples clipped to [-3, 3], then rescaled so that 3
    # maps to the largest magnitude of a signed vact_width-bit integer
    vact = np.random.normal(size=act.length).reshape(act.shape)
    vact = np.clip(vact, -3.0, 3.0)
    vact_min_val, vact_max_val = value_ranges['act']
    vact_max_abs_range = max(abs(vact_min_val), abs(vact_max_val))
    vact_width = vact_max_abs_range.bit_length() + 1
    vact = vact * (1.0 * (2 ** (vact_width - 1) - 1)) / 3.0
    vact = np.round(vact).astype(np.int64)

    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    # exec on pytorch
    model_input = vact.astype(np.float32)
    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    if act.perm is not None and len(model_out.shape) == len(act.shape):
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    mout = scaled_model_out.astype(np.int64)
    for bat in range(vout.shape[0]):
        vout_max = np.max(vout[bat])
        vout_max_index = list(vout[bat]).index(vout_max)
        mout_max = np.max(mout[bat])
        mout_max_index = list(mout[bat]).index(mout_max)
        print("# vout[%d]: max = %d, index = %d" % (bat, vout_max, vout_max_index))
        print("# mout[%d]: max = %d, index = %d" % (bat, mout_max, mout_max_index))

    # out_diff = vout - scaled_model_out
    # out_err = out_diff / (scaled_model_out + 0.00000001)
    # max_out_err = np.max(np.abs(out_err))
    # breakpoint()

    # if max_out_err > 0.1:
    #    raise ValueError("too large output error: %f > 0.1" % max_out_err)

    # to memory image
    param_data = ng.make_param_array(variables, constants, chunk_size)
    param_bytes = len(param_data)

    # each region starts at the next chunk_size-aligned address after the
    # previous one: input -> parameters -> expected output -> temporaries
    variable_addr = int(math.ceil((act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil((variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil((check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 256 // memimg_datawidth], dtype=np.int64)
    # fill every word with 100 before the regions below are written
    mem = mem + [100]

    # placeholder
    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth,
                   8, variable_addr)

    # verification data
    # vout holds operator outputs, so it is laid out with out_dtype.width;
    # this matches the width the verify loop uses when reading check_addr
    axi.set_memory(mem, vout, memimg_datawidth,
                   out_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / out_dtype.width)), conv2d_par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # the design exposes RESETN; RST is its inverse for the test-bench parts
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # Control thread body (passed to vthread.Thread below): starts the
    # design, waits for it, then compares output words with expected words.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        # NOTE(review): the loops index out.shape[0..3], i.e. they assume a
        # rank-4 output, while the model ends in nn.Linear - confirm the
        # rank of `out` for this network.
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch,
                            out.addr, out_dtype.width)
                        check = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch,
                            check_addr, out_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch,
                                  ') orig: ', orig, ' check: ', check)
                            ok = False
                        # else:
                        #    print('OK (', bat, y, x, ch,
                        #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # watchdog: finish the simulation after this delay in any case
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # drop a trailing line starting with '-' from verilator output; the
    # emptiness check avoids an IndexError when nothing was printed
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Beispiel #29
0
def run(a_shape=(7, 15),
        b_shape=(7, 15),
        a_dtype=ng.int32,
        b_dtype=ng.int32,
        c_dtype=ng.int32,
        par=1,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):
    """Compile a PyTorch matrix-add model to hardware via ONNX and simulate it.

    The function exports ``MatrixAdd`` to ONNX, imports the ONNX file with
    ``ng.from_onnx``, generates the target module with ``ng.to_veriloggen``,
    prepares a memory image holding the inputs, the exported parameters and
    the expected output, builds a testbench around the target and finally
    runs the simulator. The testbench prints ``# verify: PASSED`` or
    ``# verify: FAILED`` after comparing the hardware result word by word
    against the expected output.

    Args:
        a_shape: Shape of the dummy tensor used for input ``a`` on export.
        b_shape: Shape of the dummy tensor used for input ``b`` on export.
        a_dtype: NNgen dtype assigned to the value named ``a``.
        b_dtype: NNgen dtype assigned to the value named ``b``.
        c_dtype: NNgen dtype assigned to the value named ``c``.
        par: Value passed to ``op.attribute(par=...)`` of every ``ng.add``
            operator; also a lower bound for the last argument given to
            ``axi.set_memory`` for ``a``, ``b`` and the expected output.
        axi_datawidth: Used as the ``maxi_datawidth`` config entry and as
            the data width of the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not ``None``, the testbench is written to this path
            with ``m.to_verilog``.
        simtype: Simulator name forwarded to ``simulation.Simulator``.
        outputfile: Forwarded to ``sim.run``. When ``None`` it defaults to
            the base name of this script with a ``.out`` extension. It is
            also used to derive the memory image name.

    Returns:
        The text returned by the simulator. With ``simtype='verilator'`` a
        final line starting with ``'-'`` is removed.
    """

    # model definition
    model = MatrixAdd()

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_add.onnx'
    dummy_a = torch.randn(*a_shape)
    dummy_b = torch.randn(*b_shape)
    dummy_inputs = (dummy_a, dummy_b)
    input_names = ['a', 'b']
    output_names = ['c']
    model.eval()
    torch.onnx.export(model,
                      dummy_inputs,
                      onnx_filename,
                      input_names=input_names,
                      output_names=output_names)

    # ONNX to NNgen
    value_dtypes = {'a': a_dtype, 'b': b_dtype, 'c': c_dtype}

    (outputs, placeholders, variables, constants,
     operators) = ng.from_onnx(onnx_filename,
                               value_dtypes=value_dtypes,
                               default_placeholder_dtype=ng.int32,
                               default_variable_dtype=ng.int32,
                               default_constant_dtype=ng.int32,
                               default_operator_dtype=ng.int32,
                               default_scale_dtype=ng.int32,
                               default_bias_dtype=ng.int32,
                               disable_fusion=False)

    # set attribute
    for op in operators.values():
        if isinstance(op, ng.add):
            op.attribute(par=par)

    # create target hardware
    a = placeholders['a']
    b = placeholders['b']
    c = outputs['c']

    targ = ng.to_veriloggen([c],
                            'onnx_matrix_add',
                            silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data: small deterministic integer patterns
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [5]
    vb = (np.arange(b.length, dtype=np.int64).reshape(b.shape) + [100]) % [6]

    # expected output according to the NNgen software evaluation
    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # exec on pytorch
    model_a = va.astype(np.float32)
    model_b = vb.astype(np.float32)
    if a.perm is not None:
        model_a = np.transpose(model_a, a.reversed_perm)
    if b.perm is not None:
        model_b = np.transpose(model_b, b.reversed_perm)

    model.eval()
    model_c = model(torch.from_numpy(model_a),
                    torch.from_numpy(model_b)).detach().numpy()
    if a.perm is not None:
        model_c = np.transpose(model_c, a.perm)
    scaled_model_c = model_c * c.scale_factor

    # Relative error between the NNgen evaluation and the scaled PyTorch
    # result. The small constant avoids a division by zero. The result only
    # feeds the tolerance check below, which is currently disabled.
    c_diff = vc - scaled_model_c
    c_err = c_diff / (scaled_model_c + 0.00000001)
    max_c_err = np.max(np.abs(c_err))

    # if max_c_err > 0.1:
    #    raise ValueError("too large output error: %f > 0.1" % max_c_err)

    # to memory image
    param_data = ng.export_ndarray([c])
    param_bytes = len(param_data)

    # Each region starts at the next multiple of 4096 after the previous one:
    # parameters follow the inputs, the expected output follows the
    # parameters, and tmp_addr follows the expected output.
    variable_addr = int(
        math.ceil(
            max(a.addr + a.memory_size, b.addr + b.memory_size) / 4096)) * 4096
    check_addr = int(math.ceil((variable_addr + param_bytes) / 4096)) * 4096
    tmp_addr = int(math.ceil((check_addr + c.memory_size) / 4096)) * 4096

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)],
                   dtype=np.int64)
    # every word starts out as 100 instead of 0
    mem = mem + [100]

    # placeholder
    axi.set_memory(mem, va, memimg_datawidth, a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth, b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(mem, vc, memimg_datawidth, c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    # the target exposes an active-low reset; derive an active-high one
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer: free-running counter incremented by the sequential block
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # number of rows when all leading dimensions of c are flattened
    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    # NOTE(review): ctrl is handed to vthread.Thread below; it is presumably
    # compiled from its source, so its statements are kept exactly as they
    # were -- confirm before restructuring.
    def ctrl():
        # NOTE(review): empty loop, presumably a start-up delay -- confirm
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify: compare the result at c.addr with the copy at check_addr
        ok = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    # safety timeout in case the controller thread never calls finish
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Drop a trailing line that starts with '-' from verilator output. The
    # `lines` check prevents an IndexError when the simulator printed nothing.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
Beispiel #30
0
def mkTest(ich=3, och=10, ch=64, ksize=3, stride=1, col=28, row=28):
    """Build a simulation testbench around a small NNgen CNN.

    The network is described with int32 NNgen operators in this order:
    conv2d, max_pool_serial, relu, conv2d, relu, reshape, matmul, relu and
    a final matmul named ``output_layer``. It is turned into a hardware
    module with ``ng.to_veriloggen`` and instantiated as ``uut`` inside a
    testbench module together with an AXI memory model, an AXI-lite
    controller thread, a cycle counter, a clock and a reset.

    Args:
        ich: Size of the last dimension of the input placeholder.
        och: First dimension of the last weight ``w3``.
        ch: First dimension of both convolution weights.
        ksize: Kernel size of both convolution weights (ksize x ksize).
        stride: Stride used for the two middle entries of the conv2d
            ``strides`` tuple.
        col: Third dimension of the input placeholder shape.
        row: Second dimension of the input placeholder shape.

    Returns:
        The testbench module named ``'test'``.
    """
    conv_strides = (1, stride, stride, 1)
    pool_window = (1, 2, 2, 1)

    # ---- target hardware -------------------------------------------------
    # layer 0: conv2d, max_pool_serial, relu
    input_layer = ng.placeholder(
        ng.int32, shape=(1, row, col, ich), name='input_layer')
    w0 = ng.variable(ng.int32, shape=(ch, ksize, ksize, ich), name='w0')
    feat0 = ng.conv2d(input_layer, w0, strides=conv_strides)
    feat0 = ng.max_pool_serial(feat0, ksize=pool_window, strides=pool_window)
    feat0 = ng.relu(feat0)

    # layer 1: conv2d, relu, reshape
    w1 = ng.variable(
        ng.int32, shape=(ch, ksize, ksize, feat0.shape[-1]), name='w1')
    feat1 = ng.conv2d(feat0, w1, strides=conv_strides)
    feat1 = ng.relu(feat1)
    feat1 = ng.reshape(feat1, [-1])

    # layer 2: full-connection
    w2 = ng.variable(ng.int32, shape=(16, feat1.shape[-1]), name='w2')
    feat2 = ng.relu(ng.matmul(feat1, w2, transposed_b=True))

    # layer 3: full-connection
    w3 = ng.variable(ng.int32, shape=(och, feat2.shape[-1]), name='w3')
    output_layer = ng.matmul(feat2, w3, transposed_b=True, name='output_layer')

    targ = ng.to_veriloggen([output_layer], 'cnn')
    #targ = ng.to_ipxact([output_layer], 'cnn')

    # ---- test controller -------------------------------------------------
    tb = Module('test')
    tb.copy_params(targ)
    sim_ports = tb.copy_sim_ports(targ)
    clk = sim_ports['CLK']
    resetn = sim_ports['RESETN']

    # the target exposes an active-low reset; derive an active-high one
    rst = tb.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    memory = axi.AxiMemoryModel(tb, 'memory', clk, rst, mem_addrwidth=23)
    memory.connect(sim_ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(tb, '_saxi', clk, rst, noio=True)
    _saxi.connect(sim_ports, 'saxi')

    # timer: free-running counter incremented by the sequential block
    time_counter = tb.Reg('time_counter', 32, initval=0)
    cycle_seq = Seq(tb, 'seq', clk, rst)
    cycle_seq(time_counter.inc())

    # NOTE(review): ctrl is handed to vthread.Thread below; it is presumably
    # compiled from its source, so its statements are left exactly as they
    # were -- confirm before restructuring.
    def ctrl():
        # NOTE(review): empty loop, presumably a start-up delay -- confirm
        for i in range(100):
            pass

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        vthread.finish()

    ctrl_thread = vthread.Thread(tb, 'th_ctrl', clk, rst, ctrl)
    ctrl_thread.start()

    uut = tb.Instance(
        targ,
        'uut',
        params=tb.connect_params(targ),
        ports=tb.connect_ports(targ),
    )

    # simulation.setup_waveform(tb, uut)
    simulation.setup_clock(tb, clk, hperiod=5)
    init = simulation.setup_reset(
        tb, resetn, tb.make_reset(), period=100, polarity='low')

    # safety timeout in case the controller thread never calls finish
    init.add(Delay(10 * 1000 * 1000), Systask('finish'))

    return tb