Ejemplos de make_param_array en Python

Lenguaje de programación: Python

Namespace/Package Name: nngen

Método / Función: make_param_array

Ejemplos en hotexamples.com: 3

Python make_param_array - 3 ejemplos encontrados. Estos son los ejemplos en Python del mundo real mejor valorados de nngen.make_param_array extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Ejemplo n.º 1

Mostrar archivo

def run(act_shape=(1, 32, 32, 3),
        act_dtype=ng.int32, weight_dtype=ng.int32,
        bias_dtype=ng.int32, scale_dtype=ng.int32,
        out_dtype=ng.int32,
        with_batchnorm=True, disable_fusion=False,
        conv2d_par_ich=1, conv2d_par_och=1, conv2d_par_col=1, conv2d_par_row=1,
        conv2d_concur_och=None, conv2d_stationary='filter',
        pool_par=1, elem_par=1,
        chunk_size=64,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):

    if not with_batchnorm:
        raise ValueError('with_batchnorm must be True for ResNet18.')

    # pytorch model
    model = torchvision.models.resnet18(pretrained=False)

    model.conv1.in_channels = act_shape[-1]
    model.fc = nn.Linear(in_features=model.fc.in_features,
                         out_features=10, bias=True)

    # Pytorch to ONNX
    onnx_filename = 'resnet18.onnx'
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    model.eval()
    torch.onnx.export(model, dummy_input, onnx_filename,
                      input_names=input_names, output_names=output_names)

    # ONNX to NNgen
    dtypes = {}
    (outputs, placeholders, variables,
     constants, operators) = ng.from_onnx(onnx_filename,
                                          value_dtypes=dtypes,
                                          default_placeholder_dtype=act_dtype,
                                          default_variable_dtype=weight_dtype,
                                          default_constant_dtype=weight_dtype,
                                          default_operator_dtype=out_dtype,
                                          default_scale_dtype=scale_dtype,
                                          default_bias_dtype=bias_dtype,
                                          disable_fusion=disable_fusion)

    # default linear quantization
    value_ranges = {'act': (0, 255)}

    ng.quantize(outputs, value_ranges=value_ranges)

    # set attribute
    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=conv2d_par_ich,
                         par_och=conv2d_par_och,
                         par_col=conv2d_par_col,
                         par_row=conv2d_par_row,
                         concur_och=conv2d_concur_och,
                         stationary=conv2d_stationary)

        if isinstance(op, (ng.avg_pool, ng.max_pool,
                           ng.avg_pool_serial, ng.max_pool_serial)):
            op.attribute(par=pool_par)

        if ng.is_elementwise_operator(op):
            op.attribute(par=elem_par)

    # create target hardware
    act = placeholders['act']
    out = outputs['out']

    targ = ng.to_veriloggen([out], 'onnx_resnet18', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    vact = np.random.normal(size=act.length).reshape(act.shape)
    vact = np.clip(vact, -3.0, 3.0)
    vact_min_val, vact_max_val = value_ranges['act']
    vact_max_abs_range = max(abs(vact_min_val), abs(vact_max_val))
    vact_width = vact_max_abs_range.bit_length() + 1
    vact = vact * (1.0 * (2 ** (vact_width - 1) - 1)) / 3.0
    vact = np.round(vact).astype(np.int64)

    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    # exec on pytorch
    model_input = vact.astype(np.float32)
    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    if act.perm is not None and len(model_out.shape) == len(act.shape):
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    mout = scaled_model_out.astype(np.int64)
    for bat in range(vout.shape[0]):
        vout_max = np.max(vout[bat])
        vout_max_index = list(vout[bat]).index(vout_max)
        mout_max = np.max(mout[bat])
        mout_max_index = list(mout[bat]).index(mout_max)
        print("# vout[%d]: max = %d, index = %d" % (bat, vout_max, vout_max_index))
        print("# mout[%d]: max = %d, index = %d" % (bat, mout_max, mout_max_index))

    # out_diff = vout - scaled_model_out
    # out_err = out_diff / (scaled_model_out + 0.00000001)
    # max_out_err = np.max(np.abs(out_err))
    # breakpoint()

    # if max_out_err > 0.1:
    #    raise ValueError("too large output error: %f > 0.1" % max_out_err)

    # to memory image
    param_data = ng.make_param_array(variables, constants, chunk_size)
    param_bytes = len(param_data)

    variable_addr = int(math.ceil((act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil((variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil((check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 256 // memimg_datawidth], dtype=np.int64)
    mem = mem + [100]

    # placeholder
    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth,
                   8, variable_addr)

    # verification data
    axi.set_memory(mem, vout, memimg_datawidth,
                   act_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch,
                            out.addr, out_dtype.width)
                        check = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch,
                            check_addr, out_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch,
                                  ') orig: ', orig, ' check: ', check)
                            ok = False
                        # else:
                        #    print('OK (', bat, y, x, ch,
                        #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt

Ejemplo n.º 2

Mostrar archivo

Archivo: onnx_matrix_conv2d_conv2d.py Proyecto: herosugi/nngen

def run(act_shape=(1, 7, 7, 3),
        weight0_shape=(9, 3, 3, 3),
        weight1_shape=(9, 3, 3, 9),
        act_dtype=ng.int32,
        weight_dtype=ng.int32,
        out_dtype=ng.int32,
        stride0=1,
        stride1=1,
        padding0=0,
        padding1=0,
        with_batchnorm0=False,
        with_batchnorm1=False,
        act_func0='relu',
        act_func1='relu',
        disable_fusion=False,
        par_ich=1,
        par_och=1,
        par_col=1,
        par_row=1,
        concur_och=None,
        stationary='filter',
        chunk_size=64,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):

    # model definition
    layers = []
    layers.append(
        nn.Conv2d(weight0_shape[3],
                  weight0_shape[0],
                  weight0_shape[1],
                  stride=stride0,
                  padding=padding0))

    if with_batchnorm0:
        layers.append(nn.BatchNorm2d(weight0_shape[0]))

    if act_func0 == 'relu':
        layers.append(nn.ReLU(inplace=True))
    elif act_func0 == 'leaky_relu':
        layers.append(nn.LeakyReLU(inplace=True))

    layers.append(
        nn.Conv2d(weight1_shape[3],
                  weight1_shape[0],
                  weight1_shape[1],
                  stride=stride1,
                  padding=padding1))

    if with_batchnorm1:
        layers.append(nn.BatchNorm2d(weight1_shape[0]))

    if act_func1 == 'relu':
        layers.append(nn.ReLU(inplace=True))
    elif act_func1 == 'leaky_relu':
        layers.append(nn.LeakyReLU(inplace=True))

    model = nn.Sequential(*layers)

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_conv2d_conv2d.onnx'
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    model.eval()
    torch.onnx.export(model,
                      dummy_input,
                      onnx_filename,
                      input_names=input_names,
                      output_names=output_names)

    # ONNX to NNgen
    value_dtypes = {
        'act': act_dtype,
        '0.weight': weight_dtype,
        '1.weight': weight_dtype,
        'out': act_dtype
    }

    (outputs, placeholders, variables, constants,
     operators) = ng.from_onnx(onnx_filename,
                               value_dtypes=value_dtypes,
                               default_placeholder_dtype=act_dtype,
                               default_variable_dtype=weight_dtype,
                               default_constant_dtype=weight_dtype,
                               default_operator_dtype=out_dtype,
                               default_scale_dtype=ng.int32,
                               default_bias_dtype=ng.int32,
                               disable_fusion=disable_fusion)

    # default linear quantization
    if act_dtype.width >= 8:
        value_ranges = {'act': (-120, 120)}
    else:
        value_ranges = {
            'act': (-(2**(act_dtype.width - 1)), (2**(act_dtype.width - 1)))
        }

    ng.quantize(outputs, value_ranges=value_ranges)

    # set attribute
    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=par_ich,
                         par_och=par_och,
                         par_row=par_row,
                         par_col=par_col,
                         concur_och=concur_och)

    # create target hardware
    act = placeholders['act']
    out = outputs['out']

    targ = ng.to_veriloggen([out],
                            'onnx_matrix_conv2d_conv2d',
                            silent=silent,
                            config={
                                'maxi_datawidth': axi_datawidth,
                                'chunk_size': chunk_size
                            })

    # verification data
    # if act_dtype.width > 4:
    #    vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [11] + [1]
    # else:
    #    vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [5] + [1]

    #vact = np.ones(act.shape)
    vact = np.random.normal(size=act.length).reshape(act.shape)
    vact = np.clip(vact, -3.0, 3.0)
    vact_min_val, vact_max_val = value_ranges['act']
    vact_max_abs_range = max(abs(vact_min_val), abs(vact_max_val))
    vact_width = vact_max_abs_range.bit_length() + 1
    vact = vact * (1.0 * (2**(vact_width - 1) - 1)) / 3.0
    vact = np.round(vact).astype(np.int64)

    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    # exec on pytorch
    model_input = vact.astype(np.float32)
    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    if act.perm is not None:
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    out_diff = vout - scaled_model_out
    out_err = out_diff / (scaled_model_out + 0.00000001)
    max_out_err = np.max(np.abs(out_err))

    # if max_out_err > 0.1:
    #    raise ValueError("too large output error: %f > 0.1" % max_out_err)

    # to memory image
    param_data = ng.make_param_array(variables, constants, chunk_size)
    param_bytes = len(param_data)

    variable_addr = int(math.ceil(
        (act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil(
        (variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil(
        (check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    mem = mem + [100]

    # placeholder
    axi.set_memory(
        mem, vact, memimg_datawidth, act_dtype.width, act.addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(
        mem, vout, memimg_datawidth, out_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / out_dtype.width)), par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] *
                            out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch, out.addr,
                            out_dtype.width)
                        check = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] *
                            out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch, check_addr,
                            out_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch, ') orig: ', orig,
                                  ' check: ', check)
                            ok = False
                        # else:
                        #    print('OK (', bat, y, x, ch,
                        #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt

Ejemplo n.º 3

Mostrar archivo

Archivo: onnx_matrix_add.py Proyecto: herosugi/nngen

def run(a_shape=(7, 15),
        b_shape=(7, 15),
        a_dtype=ng.int32,
        b_dtype=ng.int32,
        c_dtype=ng.int32,
        par=1,
        axi_datawidth=32,
        silent=False,
        filename=None,
        simtype='iverilog',
        outputfile=None):

    # model definition
    model = MatrixAdd()

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_add.onnx'
    dummy_a = torch.randn(*a_shape)
    dummy_b = torch.randn(*b_shape)
    dummy_inputs = (dummy_a, dummy_b)
    input_names = ['a', 'b']
    output_names = ['c']
    model.eval()
    torch.onnx.export(model,
                      dummy_inputs,
                      onnx_filename,
                      input_names=input_names,
                      output_names=output_names)

    # ONNX to NNgen
    value_dtypes = {'a': a_dtype, 'b': b_dtype, 'c': c_dtype}

    (outputs, placeholders, variables, constants,
     operators) = ng.from_onnx(onnx_filename,
                               value_dtypes=value_dtypes,
                               default_placeholder_dtype=ng.int32,
                               default_variable_dtype=ng.int32,
                               default_constant_dtype=ng.int32,
                               default_operator_dtype=ng.int32,
                               default_scale_dtype=ng.int32,
                               default_bias_dtype=ng.int32,
                               disable_fusion=False)

    # set attribute
    for op in operators.values():
        if isinstance(op, ng.add):
            op.attribute(par=par)

    # create target hardware
    a = placeholders['a']
    b = placeholders['b']
    c = outputs['c']

    targ = ng.to_veriloggen([c],
                            'onnx_matrix_add',
                            silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [5]
    vb = (np.arange(b.length, dtype=np.int64).reshape(b.shape) + [100]) % [6]

    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # exec on pytorch
    model_a = va.astype(np.float32)
    model_b = vb.astype(np.float32)
    if a.perm is not None:
        model_a = np.transpose(model_a, a.reversed_perm)
    if b.perm is not None:
        model_b = np.transpose(model_b, b.reversed_perm)

    model.eval()
    model_c = model(torch.from_numpy(model_a),
                    torch.from_numpy(model_b)).detach().numpy()
    if a.perm is not None:
        model_c = np.transpose(model_c, a.perm)
    scaled_model_c = model_c * c.scale_factor

    c_diff = vc - scaled_model_c
    c_err = c_diff / (scaled_model_c + 0.00000001)
    max_c_err = np.max(np.abs(c_err))

    # if max_c_err > 0.1:
    #    raise ValueError("too large output error: %f > 0.1" % max_c_err)

    # to memory image
    param_data = ng.make_param_array(variables, constants)
    param_bytes = len(param_data)

    variable_addr = int(
        math.ceil(
            max(a.addr + a.memory_size, b.addr + b.memory_size) / 4096)) * 4096
    check_addr = int(math.ceil((variable_addr + param_bytes) / 4096)) * 4096
    tmp_addr = int(math.ceil((check_addr + c.memory_size) / 4096)) * 4096

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    mem = mem + [100]

    # placeholder
    axi.set_memory(mem, va, memimg_datawidth, a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth, b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(mem, vc, memimg_datawidth, c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m,
                                'memory',
                                clk,
                                rst,
                                datawidth=axi_datawidth,
                                memimg=mem,
                                memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j, c.addr,
                                        c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ,
                     'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m,
                                  resetn,
                                  m.make_reset(),
                                  period=100,
                                  polarity='low')

    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    if simtype == 'verilator' and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt