def mkTest():
    """Build and return the 'test' top module wrapping the design from mkMain().

    The testbench mirrors the design's parameters and simulation ports,
    attaches an AXI memory model to the 'myaxi' interface, instantiates the
    design as 'uut', and registers waveform, clock and reset setup. A
    Delay followed by Systask('finish') is appended to the reset sequence.
    """
    testbench = Module('test')

    # design under test
    dut = mkMain()

    # mirror the design's parameters and ports on the testbench
    testbench.copy_params(dut)
    sim_ports = testbench.copy_sim_ports(dut)
    clock, reset = sim_ports['CLK'], sim_ports['RST']

    axi_mem = axi.AxiMemoryModel(testbench, 'memory', clock, reset)
    axi_mem.connect(sim_ports, 'myaxi')

    instance = testbench.Instance(
        dut, 'uut',
        params=testbench.connect_params(dut),
        ports=testbench.connect_ports(dut),
    )

    simulation.setup_waveform(testbench, instance, testbench.get_vars())
    simulation.setup_clock(testbench, clock, hperiod=5)
    startup = simulation.setup_reset(
        testbench, reset, testbench.make_reset(), period=100)

    # stop the simulation after a fixed delay
    timeout = 1000 * 100
    startup.add(Delay(timeout), Systask('finish'))

    return testbench
def mkTest(memimg_name=None):
    """Build and return the 'test' top module wrapping the design from mkLed().

    Args:
        memimg_name: forwarded unchanged to axi.AxiMemoryModel as
            ``memimg_name``.

    Returns:
        The testbench Module with the AXI memory model connected to 'myaxi',
        the design instantiated as 'uut', and clock/reset setup registered.
    """
    testbench = Module('test')

    # design under test
    dut = mkLed()

    # mirror the design's parameters and ports on the testbench
    testbench.copy_params(dut)
    sim_ports = testbench.copy_sim_ports(dut)
    clock, reset = sim_ports['CLK'], sim_ports['RST']

    axi_mem = axi.AxiMemoryModel(testbench, 'memory', clock, reset,
                                 memimg_name=memimg_name)
    axi_mem.connect(sim_ports, 'myaxi')

    testbench.Instance(
        dut, 'uut',
        params=testbench.connect_params(dut),
        ports=testbench.connect_ports(dut),
    )

    # waveform dumping is disabled in this testbench
    simulation.setup_clock(testbench, clock, hperiod=5)
    startup = simulation.setup_reset(
        testbench, reset, testbench.make_reset(), period=100)

    # stop the simulation after a fixed delay
    startup.add(Delay(1000000), Systask('finish'))

    return testbench
def mkTest(memimg_name=None):
    """Build the 'test' top module for mkLed() with an AXI-lite control thread.

    Besides the AXI memory model on 'myaxi', an AXIMLite master is connected
    to the design's 'saxi' interface. The ``ctrl`` thread writes 1 to
    address 0, polls address 4 until it reads non-zero, then reads address 8
    and prints PASSED/FAILED depending on that value.

    Args:
        memimg_name: forwarded unchanged to axi.AxiMemoryModel.

    Returns:
        The assembled testbench Module.
    """
    testbench = Module('test')

    # design under test
    dut = mkLed()

    # mirror the design's parameters and ports on the testbench
    testbench.copy_params(dut)
    sim_ports = testbench.copy_sim_ports(dut)
    clock, reset = sim_ports['CLK'], sim_ports['RST']

    axi_mem = axi.AxiMemoryModel(testbench, 'memory', clock, reset,
                                 memimg_name=memimg_name)
    axi_mem.connect(sim_ports, 'myaxi')

    # AXI-Slave controller (name kept: it is referenced from ctrl below)
    _saxi = vthread.AXIMLite(testbench, '_saxi', clock, reset, noio=True)
    _saxi.connect(sim_ports, 'saxi')

    # NOTE: ctrl is handed to vthread.Thread; its statements are kept as-is.
    def ctrl():
        for i in range(100):
            pass

        awaddr = 0
        _saxi.write(awaddr, 1)

        araddr = 4
        v = _saxi.read(araddr)
        while v == 0:
            v = _saxi.read(araddr)

        araddr = 8
        v = _saxi.read(araddr)
        if v:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    ctrl_thread = vthread.Thread(testbench, 'th_ctrl', clock, reset, ctrl)
    ctrl_thread.start()

    testbench.Instance(
        dut, 'uut',
        params=testbench.connect_params(dut),
        ports=testbench.connect_ports(dut),
    )

    # waveform dumping is disabled in this testbench
    simulation.setup_clock(testbench, clock, hperiod=5)
    startup = simulation.setup_reset(
        testbench, reset, testbench.make_reset(), period=100)

    # fallback stop in case the control thread never calls finish
    startup.add(Delay(1000000), Systask('finish'))

    return testbench
def mkTest(memimg_name=None):
    """Build the 'test' top module for mkLed() driven through an AXI-lite master.

    The ``ctrl`` thread writes channel/width/height (all 4) to word
    addresses 2, 3 and 4, writes 1 to word address 0, then polls word
    address 1 until it reads non-zero and finishes the simulation.

    Args:
        memimg_name: forwarded unchanged to axi.AxiMemoryModel.

    Returns:
        The assembled testbench Module.
    """
    testbench = Module('test')

    # design under test
    dut = mkLed()

    # mirror the design's parameters and ports on the testbench
    testbench.copy_params(dut)
    sim_ports = testbench.copy_sim_ports(dut)
    clock, reset = sim_ports['CLK'], sim_ports['RST']

    axi_mem = axi.AxiMemoryModel(testbench, 'memory', clock, reset,
                                 memimg_name=memimg_name)
    axi_mem.connect(sim_ports, 'myaxi')

    # AXI-lite master attached to the design's 'saxi' interface
    # (name kept: it is referenced from ctrl below)
    maxi = vthread.AXIMLite(testbench, 'maxi', clock, reset, noio=True)
    maxi.connect(sim_ports, 'saxi')

    # NOTE: ctrl is handed to vthread.Thread; its statements are kept as-is.
    def ctrl():
        channel, width, height = [4, 4, 4]
        awaddr = 2 * 4
        maxi.write(awaddr, channel)
        awaddr = 3 * 4
        maxi.write(awaddr, width)
        awaddr = 4 * 4
        maxi.write(awaddr, height)

        awaddr = 0 * 4
        maxi.write(awaddr, 1)

        araddr = 1 * 4
        v = maxi.read(araddr)
        while v == 0:
            v = maxi.read(araddr)

        vthread.finish()

    ctrl_thread = vthread.Thread(testbench, 'th_ctrl', clock, reset, ctrl)
    ctrl_thread.start()

    testbench.Instance(
        dut, 'uut',
        params=testbench.connect_params(dut),
        ports=testbench.connect_ports(dut),
    )

    # waveform dumping is disabled in this testbench
    simulation.setup_clock(testbench, clock, hperiod=5)
    startup = simulation.setup_reset(
        testbench, reset, testbench.make_reset(), period=100)

    # fallback stop in case the control thread never calls finish
    startup.add(Delay(1000000), Systask('finish'))

    return testbench
def mkTest(memimg_name=None, axi_datawidth=32, datawidth=4, addrwidth=10):
    """Build the 'test' top module for mkLed() with a pre-filled memory image.

    A zeroed int64 array is created and filled through axi.set_memory with
    the pattern ``arange(length) % 2**(datawidth - 1) + 1`` starting at
    address 0; that array is handed to the AXI memory model as ``memimg``.

    Args:
        memimg_name: forwarded to axi.AxiMemoryModel.
        axi_datawidth: forwarded to mkLed().
        datawidth: forwarded to mkLed(); also the element width passed to
            axi.set_memory and the basis of the data pattern.
        addrwidth: forwarded to mkLed().

    Returns:
        The assembled testbench Module.
    """
    testbench = Module('test')

    # design under test
    dut = mkLed(axi_datawidth, datawidth, addrwidth)

    # mirror the design's parameters and ports on the testbench
    testbench.copy_params(dut)
    sim_ports = testbench.copy_sim_ports(dut)
    clock, reset = sim_ports['CLK'], sim_ports['RST']

    # memory image: 1 MiB worth of memimg_datawidth-bit words
    memimg_datawidth = 32
    num_words = 1024 * 1024 // (memimg_datawidth // 8)
    mem = np.zeros([num_words], dtype=np.int64)

    modulus = 2**(datawidth - 1)
    pattern = np.arange(num_words, dtype=np.int64) % [modulus] + [1]
    base_addr = 0
    axi.set_memory(mem, pattern, memimg_datawidth, datawidth, base_addr, None)

    axi_mem = axi.AxiMemoryModel(testbench, 'memory', clock, reset,
                                 memimg=mem, memimg_name=memimg_name,
                                 memimg_datawidth=memimg_datawidth)
    axi_mem.connect(sim_ports, 'myaxi')

    testbench.Instance(
        dut, 'uut',
        params=testbench.connect_params(dut),
        ports=testbench.connect_ports(dut),
    )

    # waveform dumping is disabled in this testbench
    simulation.setup_clock(testbench, clock, hperiod=5)
    startup = simulation.setup_reset(
        testbench, reset, testbench.make_reset(), period=100)

    # stop the simulation after a fixed delay
    startup.add(Delay(1000000), Systask('finish'))

    return testbench
def mkTest(memimg_name=None):
    """Build the 'test' top module for an mkLed() design with an active-low reset.

    The design exposes 'RESETN'; a wire 'RST' is derived as ``Not(RESETN)``
    and used for the AXI memory model, while the reset sequence itself
    drives RESETN with ``polarity='low'``.

    Args:
        memimg_name: forwarded unchanged to axi.AxiMemoryModel.

    Returns:
        The assembled testbench Module.
    """
    testbench = Module('test')

    # design under test
    dut = mkLed()

    # mirror the design's parameters and ports on the testbench
    testbench.copy_params(dut)
    sim_ports = testbench.copy_sim_ports(dut)

    clock = sim_ports['CLK']
    reset_n = sim_ports['RESETN']  # active low

    # active low -> active high
    reset = testbench.Wire('RST')
    reset.assign(Not(reset_n))

    axi_mem = axi.AxiMemoryModel(testbench, 'memory', clock, reset,
                                 memimg_name=memimg_name)
    axi_mem.connect(sim_ports, 'myaxi')

    testbench.Instance(
        dut, 'uut',
        params=testbench.connect_params(dut),
        ports=testbench.connect_ports(dut),
    )

    # waveform dumping is disabled in this testbench
    simulation.setup_clock(testbench, clock, hperiod=5)
    startup = simulation.setup_reset(
        testbench, reset_n, testbench.make_reset(),
        period=100, polarity='low')

    # stop the simulation after a fixed delay
    startup.add(Delay(1000000), Systask('finish'))

    return testbench
def run(act_shape=(1, 7, 7, 3),
        act_dtype=ng.int32,
        ksize=2, stride=2, padding=0,
        par=1,
        chunk_size=64, axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Import an AvgPool2d model from ONNX into NNgen, verify it and simulate it.

    Steps: build a one-layer PyTorch model, export it to
    'onnx_matrix_avg_pool.onnx', import it with ng.from_onnx, quantize,
    evaluate in software with ng.eval, generate hardware with
    ng.to_veriloggen, then build a testbench that compares the hardware
    output region against the software result word by word.

    Args:
        act_shape: shape used for the dummy export input (axes 1 and 3 are
            swapped before export).
        act_dtype: NNgen dtype for the 'act' placeholder, 'out' and operators.
        ksize, stride, padding: forwarded to nn.AvgPool2d.
        par: ``par`` attribute set on every ng.avg_pool operator.
        chunk_size: address alignment unit; also passed in the hardware config.
        axi_datawidth: AXI data width for the hardware config and memory model.
        silent: forwarded to ng.to_veriloggen.
        filename: if not None, the testbench is written there as Verilog.
        simtype: simulator name passed to simulation.Simulator. If None,
            sys.exit() is called before any simulation is set up.
        outputfile: simulator output file; defaults to '<this file stem>.out'.

    Returns:
        The simulator output text. For simtype 'verilator', a trailing line
        starting with '-' is removed.
    """
    # pytorch model
    layers = []
    layers.append(nn.AvgPool2d(ksize, stride=stride, padding=padding))
    model = nn.Sequential(*layers)

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_avg_pool.onnx'
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    torch.onnx.export(model, dummy_input, onnx_filename,
                      input_names=input_names, output_names=output_names)

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    value_dtypes = {'act': act_dtype,
                    'out': act_dtype}

    (outputs, placeholders, variables,
     constants, operators) = ng.from_onnx(onnx_filename,
                                          value_dtypes=value_dtypes,
                                          default_placeholder_dtype=act_dtype,
                                          default_variable_dtype=ng.int32,
                                          default_constant_dtype=ng.int32,
                                          default_operator_dtype=act_dtype,
                                          default_scale_dtype=ng.int32,
                                          default_bias_dtype=ng.int32,
                                          disable_fusion=False)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    input_scale_factors = {'act': 1.0}
    ng.quantize(outputs, input_scale_factors)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    for op in operators.values():
        if isinstance(op, ng.avg_pool):
            op.attribute(par=par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    act = placeholders['act']
    out = outputs['out']

    # verification data
    if act_dtype.width > 4:
        vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [11] + [1]
    else:
        vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [5] + [1]

    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    # software-based verification
    model_input = vact.astype(np.float32)
    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    if act.perm is not None:
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    # Diagnostics only: not checked below, kept for inspection in a debugger.
    mean_square_error = np.sum((vout - scaled_model_out) ** 2) / vout.size
    corrcoef = np.corrcoef(model_out.reshape([-1]), vout.reshape([-1]))

    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    targ = ng.to_veriloggen([out], 'onnx_matrix_avg_pool', silent=silent,
                            config={'maxi_datawidth': axi_datawidth,
                                    'chunk_size': chunk_size})

    # --------------------
    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    if simtype is None:
        sys.exit()

    # to memory image
    param_data = ng.export_ndarray([out])
    param_bytes = len(param_data)

    # Region start addresses, each rounded up to a multiple of chunk_size.
    variable_addr = int(math.ceil((act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil((variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil((check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)], dtype=np.int64)
    mem = mem + [100]

    # Last argument of axi.set_memory for activation/verification data
    # (same expression was previously written out twice).
    act_num_align_words = max(int(math.ceil(axi_datawidth / act_dtype.width)), par)

    # placeholder
    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   act_num_align_words)

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth,
                   8, variable_addr)

    # verification data
    axi.set_memory(mem, vout, memimg_datawidth,
                   act_dtype.width, check_addr,
                   act_num_align_words)

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # NOTE: ctrl is handed to vthread.Thread; its statements are kept as-is.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch,
                            out.addr, act_dtype.width)
                        check = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch,
                            check_addr, act_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch,
                                  ') orig: ', orig, ' check: ', check)
                            ok = False

                        # else:
                        #     print('OK (', bat, y, x, ch,
                        #           ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)

    # ''.splitlines() is [], so check for emptiness before indexing the last
    # line; otherwise empty simulator output raises IndexError.
    lines = rslt.splitlines()
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def mkTest(memimg_name=None):
    """Build the 'test' top module for the matrix-multiply design from mkLed().

    Matrix A (diagonal 1, 2, 3, ...) and matrix B (diagonal of 2s) are
    written into a numpy memory image at a_offset / b_offset. The ``ctrl``
    thread programs the design through AXI-lite registers, waits for
    completion, prints timing, and checks that C has ``(y + 1) * 2`` on the
    diagonal and 0 elsewhere.

    Reads the module-level names matrix_size, datawidth, axi_datawidth,
    a_offset, b_offset and c_offset.

    Args:
        memimg_name: forwarded unchanged to axi.AxiMemoryModel.

    Returns:
        The assembled testbench Module.
    """
    a_shape = (matrix_size, matrix_size)
    b_shape = (matrix_size, matrix_size)

    # Size bookkeeping; these values are not consumed further in this function.
    n_raw_a = axi.shape_to_length(a_shape)
    n_raw_b = axi.shape_to_length(b_shape)
    n_a = axi.shape_to_memory_size(a_shape, datawidth)
    n_b = axi.shape_to_memory_size(b_shape, datawidth)
    size_a = n_a * datawidth // 8
    size_b = n_b * datawidth // 8

    # Both matrices are zero everywhere except on the diagonal.
    a = np.zeros(a_shape, dtype=np.int64)
    b = np.zeros(b_shape, dtype=np.int64)
    for diag in range(matrix_size):
        a[diag][diag] = diag + 1
        b[diag][diag] = 2

    a_addr = a_offset
    b_addr = b_offset

    mem = np.zeros([1024 * 1024 * 8 // axi_datawidth], dtype=np.int64)
    axi.set_memory(mem, a, axi_datawidth, datawidth, a_addr)
    axi.set_memory(mem, b, axi_datawidth, datawidth, b_addr)

    # design under test
    dut = mkLed()

    testbench = Module('test')
    testbench.copy_params(dut)
    sim_ports = testbench.copy_sim_ports(dut)
    clock, reset = sim_ports['CLK'], sim_ports['RST']

    # (names memory, _saxi and counter are kept: ctrl below references them)
    memory = axi.AxiMemoryModel(testbench, 'memory', clock, reset,
                                mem_datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name)
    memory.connect(sim_ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(testbench, '_saxi', clock, reset, noio=True)
    _saxi.connect(sim_ports, 'saxi')

    # Timer
    counter = testbench.Reg('counter', 32, initval=0)
    timer_seq = Seq(testbench, 'seq', clock, reset)
    timer_seq(counter.inc())

    # NOTE: ctrl is handed to vthread.Thread; its statements are kept as-is.
    def ctrl():
        for i in range(100):
            pass

        awaddr = 4
        print('# matrix_size = %d' % matrix_size)
        _saxi.write(awaddr, matrix_size)

        awaddr = 8
        print('# a_offset = %d' % a_offset)
        _saxi.write(awaddr, a_offset)

        awaddr = 12
        print('# b_offset = %d' % b_offset)
        _saxi.write(awaddr, b_offset)

        awaddr = 16
        print('# c_offset = %d' % c_offset)
        _saxi.write(awaddr, c_offset)

        awaddr = 0
        start_time = counter
        print('# start time = %d' % start_time)
        _saxi.write(awaddr, 1)

        araddr = 20
        v = _saxi.read(araddr)
        while v == 0:
            v = _saxi.read(araddr)

        end_time = counter
        print('# end time = %d' % end_time)
        time = end_time - start_time
        print('# exec time = %d' % time)

        all_ok = True
        for y in range(matrix_size):
            for x in range(matrix_size):
                v = memory.read(
                    c_offset + (y * matrix_size + x) * datawidth // 8)
                if y == x and vthread.verilog.NotEql(v, (y + 1) * 2):
                    all_ok = False
                    print("NG [%d,%d] = %d" % (y, x, v))
                if y != x and vthread.verilog.NotEql(v, 0):
                    all_ok = False
                    print("NG [%d,%d] = %d" % (y, x, v))

        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    ctrl_thread = vthread.Thread(testbench, 'th_ctrl', clock, reset, ctrl)
    ctrl_thread.start()

    instance = testbench.Instance(
        dut, 'uut',
        params=testbench.connect_params(dut),
        ports=testbench.connect_ports(dut),
    )

    simulation.setup_waveform(testbench, instance)
    simulation.setup_clock(testbench, clock, hperiod=5)
    startup = simulation.setup_reset(
        testbench, reset, testbench.make_reset(), period=100)

    # fallback stop in case the control thread never calls finish
    startup.add(Delay(1000000), Systask('finish'))

    return testbench
def mkTest():
    """Build the 'test' top module for the memcpy design from mkMemcpy().

    The design is attached with Submodule as 'uut'. An AXI memory model is
    connected to its 'maxi' interface and an AXI-lite master to 'saxi'. The
    ``ctrl`` thread writes copy_bytes, src_offset and dst_offset to word
    addresses 1-3, writes 1 to word address 0, polls word address 4 until it
    is non-zero, and prints start/end/elapsed counter values.

    Returns:
        The assembled testbench Module.
    """
    testbench = Module('test')

    # referenced from ctrl below, so the name is kept
    copy_bytes = 1024 * 4

    # design under test
    dut = mkMemcpy()

    submod = Submodule(testbench, dut, name='uut')
    clock, reset = submod['CLK'], submod['RST']

    axi_mem = axi.AxiMemoryModel(testbench, 'memory', clock, reset)
    axi_mem.connect(submod.get_inst_ports(), 'maxi')

    # AXI-Slave controller (name kept: referenced from ctrl)
    _saxi = vthread.AXIMLite(testbench, '_saxi', clock, reset, noio=True)
    _saxi.connect(submod.get_inst_ports(), 'saxi')

    # Timer (name kept: referenced from ctrl)
    counter = testbench.Reg('counter', 32, initval=0)
    timer_seq = Seq(testbench, 'seq', clock, reset)
    timer_seq(counter.inc())

    # NOTE: ctrl is handed to vthread.Thread; its statements are kept as-is.
    def ctrl():
        for i in range(100):
            pass

        awaddr = 4 * 1
        print('# copy_bytes = %d' % copy_bytes)
        _saxi.write(awaddr, copy_bytes)

        awaddr = 4 * 2
        src_offset = 0
        print('# src_offset = %d' % src_offset)
        _saxi.write(awaddr, src_offset)

        awaddr = 4 * 3
        dst_offset = 1024 * 8
        print('# dst_offset = %d' % dst_offset)
        _saxi.write(awaddr, dst_offset)

        awaddr = 4 * 0
        start_time = counter
        print('# start time = %d' % start_time)
        _saxi.write(awaddr, 1)

        araddr = 4 * 4
        v = _saxi.read(araddr)
        while v == 0:
            v = _saxi.read(araddr)

        end_time = counter
        print('# end time = %d' % end_time)
        time = end_time - start_time
        print('# exec time = %d' % time)

    ctrl_thread = vthread.Thread(testbench, 'th_ctrl', clock, reset, ctrl)
    ctrl_thread.start()

    simulation.setup_waveform(testbench, submod)
    simulation.setup_clock(testbench, clock, hperiod=5)
    startup = simulation.setup_reset(
        testbench, reset, testbench.make_reset(), period=100)

    # ctrl never calls finish, so this delayed $finish ends the simulation
    startup.add(Delay(1000000), Systask('finish'))

    return testbench
def run(act_shape=(1, 7, 7, 3),
        weight0_shape=(9, 3, 3, 3), weight1_shape=(9, 3, 3, 9),
        act_dtype=ng.int32, weight_dtype=ng.int32, out_dtype=ng.int32,
        stride0=1, stride1=1, padding0=0, padding1=0,
        with_batchnorm0=False, with_batchnorm1=False,
        act_func0='relu', act_func1='relu', disable_fusion=False,
        par_ich=1, par_och=1, par_col=1, par_row=1,
        concur_och=None, stationary='filter',
        chunk_size=64, axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Import a two-layer Conv2d model from ONNX into NNgen and simulate it.

    Steps: build a PyTorch Sequential of two Conv2d layers (each optionally
    followed by BatchNorm2d and ReLU/LeakyReLU), export it to
    'onnx_matrix_conv2d_conv2d.onnx', import with ng.from_onnx, quantize,
    generate hardware with ng.to_veriloggen, evaluate in software with
    ng.eval on clipped-normal random input, and build a testbench that
    compares the hardware output against the software result word by word.

    Args:
        act_shape: shape for the dummy export input (axes 1 and 3 swapped).
        weight0_shape, weight1_shape: index 3 is used as in_channels,
            index 0 as out_channels and index 1 as kernel size of each Conv2d.
        act_dtype, weight_dtype, out_dtype: NNgen dtypes for from_onnx.
        stride0, stride1, padding0, padding1: forwarded to nn.Conv2d.
        with_batchnorm0, with_batchnorm1: append nn.BatchNorm2d when true.
        act_func0, act_func1: 'relu' or 'leaky_relu' select the activation;
            any other value appends none.
        disable_fusion: forwarded to ng.from_onnx.
        par_ich, par_och, par_col, par_row, concur_och: attributes set on
            every ng.conv2d operator.
        stationary: accepted but not used in this function.
        chunk_size: address alignment unit; also passed in the hardware config.
        axi_datawidth: AXI data width for the hardware config and memory model.
        silent: forwarded to ng.to_veriloggen.
        filename: if not None, the testbench is written there as Verilog.
        simtype: simulator name passed to simulation.Simulator.
        outputfile: simulator output file; defaults to '<this file stem>.out'.

    Returns:
        The simulator output text. For simtype 'verilator', a trailing line
        starting with '-' is removed.
    """
    # model definition
    layers = []
    layers.append(
        nn.Conv2d(weight0_shape[3], weight0_shape[0], weight0_shape[1],
                  stride=stride0, padding=padding0))

    if with_batchnorm0:
        layers.append(nn.BatchNorm2d(weight0_shape[0]))

    if act_func0 == 'relu':
        layers.append(nn.ReLU(inplace=True))
    elif act_func0 == 'leaky_relu':
        layers.append(nn.LeakyReLU(inplace=True))

    layers.append(
        nn.Conv2d(weight1_shape[3], weight1_shape[0], weight1_shape[1],
                  stride=stride1, padding=padding1))

    if with_batchnorm1:
        layers.append(nn.BatchNorm2d(weight1_shape[0]))

    if act_func1 == 'relu':
        layers.append(nn.ReLU(inplace=True))
    elif act_func1 == 'leaky_relu':
        layers.append(nn.LeakyReLU(inplace=True))

    model = nn.Sequential(*layers)

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_conv2d_conv2d.onnx'
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    model.eval()
    torch.onnx.export(model, dummy_input, onnx_filename,
                      input_names=input_names, output_names=output_names)

    # ONNX to NNgen
    value_dtypes = {
        'act': act_dtype,
        '0.weight': weight_dtype,
        '1.weight': weight_dtype,
        'out': act_dtype,
    }

    (outputs, placeholders, variables,
     constants, operators) = ng.from_onnx(onnx_filename,
                                          value_dtypes=value_dtypes,
                                          default_placeholder_dtype=act_dtype,
                                          default_variable_dtype=weight_dtype,
                                          default_constant_dtype=weight_dtype,
                                          default_operator_dtype=out_dtype,
                                          default_scale_dtype=ng.int32,
                                          default_bias_dtype=ng.int32,
                                          disable_fusion=disable_fusion)

    # default linear quantization
    if act_dtype.width >= 8:
        value_ranges = {'act': (-120, 120)}
    else:
        value_ranges = {
            'act': (-(2**(act_dtype.width - 1)), (2**(act_dtype.width - 1)))
        }

    ng.quantize(outputs, value_ranges=value_ranges)

    # set attribute
    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=par_ich,
                         par_och=par_och,
                         par_row=par_row,
                         par_col=par_col,
                         concur_och=concur_och)

    # create target hardware
    act = placeholders['act']
    out = outputs['out']

    targ = ng.to_veriloggen([out], 'onnx_matrix_conv2d_conv2d', silent=silent,
                            config={
                                'maxi_datawidth': axi_datawidth,
                                'chunk_size': chunk_size
                            })

    # verification data
    # if act_dtype.width > 4:
    #     vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [11] + [1]
    # else:
    #     vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [5] + [1]

    #vact = np.ones(act.shape)

    # Normal samples clipped to [-3, 3], then scaled so that 3.0 maps to
    # 2**(vact_width - 1) - 1 and rounded to integers.
    vact = np.random.normal(size=act.length).reshape(act.shape)
    vact = np.clip(vact, -3.0, 3.0)
    vact_min_val, vact_max_val = value_ranges['act']
    vact_max_abs_range = max(abs(vact_min_val), abs(vact_max_val))
    vact_width = vact_max_abs_range.bit_length() + 1
    vact = vact * (1.0 * (2**(vact_width - 1) - 1)) / 3.0
    vact = np.round(vact).astype(np.int64)

    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    # exec on pytorch
    model_input = vact.astype(np.float32)
    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    if act.perm is not None:
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    # Diagnostics only: the threshold check below is disabled.
    out_diff = vout - scaled_model_out
    out_err = out_diff / (scaled_model_out + 0.00000001)
    max_out_err = np.max(np.abs(out_err))

    # if max_out_err > 0.1:
    #     raise ValueError("too large output error: %f > 0.1" % max_out_err)

    # to memory image
    param_data = ng.make_param_array(variables, constants, chunk_size)
    param_bytes = len(param_data)

    # Region start addresses, each rounded up to a multiple of chunk_size.
    variable_addr = int(math.ceil(
        (act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil(
        (variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil(
        (check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    mem = mem + [100]

    # placeholder
    axi.set_memory(
        mem, vact, memimg_datawidth, act_dtype.width, act.addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(
        mem, vout, memimg_datawidth, out_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / out_dtype.width)), par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # NOTE: ctrl is handed to vthread.Thread; its statements are kept as-is.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch,
                            out.addr, out_dtype.width)
                        check = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch,
                            check_addr, out_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch,
                                  ') orig: ', orig, ' check: ', check)
                            ok = False

                        # else:
                        #     print('OK (', bat, y, x, ch,
                        #           ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)

    # ''.splitlines() is [], so check for emptiness before indexing the last
    # line; otherwise empty simulator output raises IndexError.
    lines = rslt.splitlines()
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def run(
        act_dtype=ng.int8,
        weight_dtype=ng.int8,
        bias_dtype=ng.int32,
        scale_dtype=ng.int8,
        par_ich=2,
        par_och=2,
        chunk_size=64,
        axi_datawidth=32,
        silent=False,
        weight_filename='cnn.npy',
        verilog_filename=None,
        sim_filename=None,
        # simtype='iverilog',
        simtype='verilator',
        # simtype=None,  # no RTL simulation
):
    """Define a small CNN with NNgen operators, generate hardware and simulate it.

    The network is conv2d+relu+max_pool, conv2d+relu, reshape, matmul+relu
    and a final matmul. Random weights are assigned and quantized, the
    dataflow is evaluated in software, hardware is generated with
    ng.to_ipxact, the exported parameters are saved with np.save, and a
    testbench compares the hardware output with the software result.

    Args:
        act_dtype, weight_dtype, bias_dtype, scale_dtype: NNgen dtypes of
            activations, weights, biases and scales.
        par_ich, par_och: parallelism attributes for conv2d/matmul; par_och
            is also used as ``par`` for the max-pool operator.
        chunk_size: parameter/address alignment unit. It is passed to the
            hardware config, to ng.export_ndarray and used for the memory
            layout (previously it was overwritten with 64 and ignored).
        axi_datawidth: AXI data width for the hardware config and memory model.
        silent: forwarded to ng.to_ipxact.
        weight_filename: path given to np.save for the exported parameters.
        verilog_filename: if not None, the testbench is written there.
        sim_filename: simulator output file; defaults to
            '<this file stem>.out'.
        simtype: simulator name passed to simulation.Simulator. If None,
            sys.exit() is called after the weights are saved.

    Returns:
        The simulator output text. For simtype 'verilator', a trailing line
        starting with '-' is removed.
    """
    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # input
    input_layer = ng.placeholder(
        dtype=act_dtype,
        shape=(1, 32, 32, 3),  # N, H, W, C
        name='input_layer')

    # layer 0: conv2d (with bias and scale (= batchnorm)), relu, max_pool
    w0 = ng.variable(
        dtype=weight_dtype,
        shape=(64, 3, 3, 3),  # Och, Ky, Kx, Ich
        name='w0')
    b0 = ng.variable(dtype=bias_dtype, shape=(w0.shape[0], ), name='b0')
    s0 = ng.variable(dtype=scale_dtype, shape=(w0.shape[0], ), name='s0')

    a0 = ng.conv2d(input_layer,
                   w0,
                   strides=(1, 1, 1, 1),
                   bias=b0,
                   scale=s0,
                   act_func=ng.relu,
                   dtype=act_dtype,
                   sum_dtype=ng.int32)

    a0p = ng.max_pool_serial(a0, ksize=(1, 2, 2, 1), strides=(1, 2, 2, 1))

    # layer 1: conv2d, relu, reshape
    w1 = ng.variable(weight_dtype,
                     shape=(64, 3, 3, a0.shape[-1]),
                     name='w1')
    b1 = ng.variable(bias_dtype, shape=(w1.shape[0], ), name='b1')
    s1 = ng.variable(scale_dtype, shape=(w1.shape[0], ), name='s1')

    a1 = ng.conv2d(a0p,
                   w1,
                   strides=(1, 1, 1, 1),
                   bias=b1,
                   scale=s1,
                   act_func=ng.relu,
                   dtype=act_dtype,
                   sum_dtype=ng.int32)

    a1r = ng.reshape(a1, [1, -1])

    # layer 2: full-connection, relu
    w2 = ng.variable(weight_dtype,
                     shape=(256, a1r.shape[-1]),
                     name='w2')
    b2 = ng.variable(bias_dtype, shape=(w2.shape[0], ), name='b2')
    s2 = ng.variable(scale_dtype, shape=(w2.shape[0], ), name='s2')

    a2 = ng.matmul(a1r,
                   w2,
                   bias=b2,
                   scale=s2,
                   transposed_b=True,
                   act_func=ng.relu,
                   dtype=act_dtype,
                   sum_dtype=ng.int32)

    # layer 3: full-connection (no act_func is passed for the output layer)
    w3 = ng.variable(weight_dtype,
                     shape=(10, a2.shape[-1]),
                     name='w3')
    b3 = ng.variable(bias_dtype, shape=(w3.shape[0], ), name='b3')
    s3 = ng.variable(scale_dtype, shape=(w3.shape[0], ), name='s3')

    # output
    output_layer = ng.matmul(a2,
                             w3,
                             bias=b3,
                             scale=s3,
                             transposed_b=True,
                             name='output_layer',
                             dtype=act_dtype,
                             sum_dtype=ng.int32)

    # --------------------
    # (2) Assign weights to the NNgen operators
    # --------------------

    # In this example, random floating-point values are assigned.
    # In a real case, you should assign actual weight values
    # obtained by training on a DNN framework.

    # If you don't use NNgen's quantizer, you can assign integer weights to each tensor.

    def _random_clipped(tensor):
        # Normal samples in the tensor's shape, clipped to [-3, 3].
        value = np.random.normal(size=tensor.length).reshape(tensor.shape)
        return np.clip(value, -3.0, 3.0)

    # Same assignment (and random draw) order as writing each layer out:
    # weight, bias, then an all-ones scale, for layers 0..3.
    for weight, bias, scale in ((w0, b0, s0), (w1, b1, s1),
                                (w2, b2, s2), (w3, b3, s3)):
        weight.set_value(_random_clipped(weight))
        bias.set_value(_random_clipped(bias))
        scale.set_value(np.ones(scale.shape))

    # Quantizing the floating-point weights by the NNgen quantizer.
    # Alternatively, you can assign integer weights by yourself to each tensor.

    imagenet_mean = np.array([0.485, 0.456, 0.406]).astype(np.float32)
    imagenet_std = np.array([0.229, 0.224, 0.225]).astype(np.float32)

    if act_dtype.width > 8:
        act_scale_factor = 128
    else:
        act_scale_factor = int(round(2**(act_dtype.width - 1) * 0.5))

    input_scale_factors = {'input_layer': act_scale_factor}
    input_means = {'input_layer': imagenet_mean * act_scale_factor}
    input_stds = {'input_layer': imagenet_std * act_scale_factor}

    ng.quantize([output_layer], input_scale_factors, input_means, input_stds)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    # conv2d, matmul
    # par_ich: parallelism in input-channel
    # par_och: parallelism in output-channel
    # par_col: parallelism in pixel column
    # par_row: parallelism in pixel row

    a0.attribute(par_ich=par_ich, par_och=par_och)
    a1.attribute(par_ich=par_ich, par_och=par_och)
    a2.attribute(par_ich=par_ich, par_och=par_och)
    output_layer.attribute(par_ich=par_ich, par_och=par_och)

    # cshamt_out: right shift amount after applying bias/scale
    # If you assign integer weights by yourself to each tensor,
    # cshamt (constant shift amount) must be assigned to each operator.

    # a0.attribute(cshamt_out=weight_dtype.width + 1)
    # a1.attribute(cshamt_out=weight_dtype.width + 1)
    # a2.attribute(cshamt_out=weight_dtype.width + 1)
    # output_layer.attribute(cshamt_out=weight_dtype.width + 1)

    # max_pool
    # par: parallelism in in/out channel

    par = par_och

    a0p.attribute(par=par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    # In this example, random integer values are assigned.
    # In a real case, you should assign actual integer activation values, such as an image.

    input_layer_value = np.random.normal(size=input_layer.length).reshape(
        input_layer.shape)
    input_layer_value = input_layer_value * imagenet_std + imagenet_mean
    input_layer_value = np.clip(input_layer_value, -5.0, 5.0)
    input_layer_value = input_layer_value * act_scale_factor
    # Clip to the signed range of act_dtype: [-2^(w-1), 2^(w-1) - 1].
    # The previous bounds were one step too wide on both sides.
    input_layer_value = np.clip(input_layer_value,
                                -1 * 2**(act_dtype.width - 1),
                                2**(act_dtype.width - 1) - 1)
    input_layer_value = np.round(input_layer_value).astype(np.int64)

    eval_outs = ng.eval([output_layer], input_layer=input_layer_value)
    output_layer_value = eval_outs[0]

    # print(output_layer_value)
    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    # chunk_size is passed so that the generated hardware and the memory
    # layout computed below use the same alignment unit.
    hw_config = {'maxi_datawidth': axi_datawidth,
                 'chunk_size': chunk_size}

    # to Veriloggen object
    # targ = ng.to_veriloggen([output_layer], 'cnn', silent=silent,
    #                         config=hw_config)

    # to IP-XACT (the method returns Veriloggen object, as well as to_veriloggen)
    targ = ng.to_ipxact([output_layer], 'cnn', silent=silent,
                        config=hw_config)

    # to Verilog HDL RTL (the method returns a source code text)
    # rtl = ng.to_verilog([output_layer], 'cnn', silent=silent,
    #                     config=hw_config)

    # --------------------
    # (6) Save the quantized weights
    # --------------------

    # convert weight values to a memory image:
    # on a real FPGA platform, this image will be used as a part of the model definition.

    param_data = ng.export_ndarray([output_layer], chunk_size)
    np.save(weight_filename, param_data)

    # --------------------
    # (7) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    if simtype is None:
        sys.exit()

    param_bytes = len(param_data)

    # Region start addresses, each rounded up to a multiple of chunk_size.
    variable_addr = int(
        math.ceil((input_layer.addr + input_layer.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil(
        (variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(
        math.ceil(
            (check_addr + output_layer.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 256 // (memimg_datawidth // 8)],
                   dtype=np.int64)
    mem = mem + [100]

    # placeholder
    axi.set_memory(
        mem, input_layer_value, memimg_datawidth, act_dtype.width,
        input_layer.addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(
        mem, output_layer_value, memimg_datawidth, act_dtype.width,
        check_addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if sim_filename is None:
        sim_filename = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + sim_filename

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # NOTE: ctrl is handed to vthread.Thread; its statements are kept as-is.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(output_layer.shape[0]):
            for x in range(output_layer.shape[1]):
                orig = memory.read_word(
                    bat * output_layer.aligned_shape[1] + x,
                    output_layer.addr, act_dtype.width)
                check = memory.read_word(
                    bat * output_layer.aligned_shape[1] + x,
                    check_addr, act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', bat, x, ') orig: ', orig, ' check: ', check)
                    ok = False
                else:
                    print('OK (', bat, x, ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if verilog_filename is not None:
        m.to_verilog(verilog_filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=sim_filename)

    # ''.splitlines() is [], so check for emptiness before indexing the last
    # line; otherwise empty simulator output raises IndexError.
    lines = rslt.splitlines()
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def mkTest():
    """Build the 'test' module around ``mkLed(16)`` with a file-backed memory.

    A byte-per-line hex memory image is written to 'mymem.out' before the
    AXI memory model is created with ``memimg`` pointing at that file.
    The image holds three regions of ``matrix_size * matrix_size`` 32-bit
    words, each word emitted least-significant byte first:

    * from byte 0:    diagonal words 1, 2, 3, ... and zeros elsewhere
    * from byte 1024: diagonal words 2 and zeros elsewhere
    * from byte 2048: every word 100

    Each region is padded with '00' lines up to the start of the next one,
    and the last region is padded up to 2**20 lines in total.

    Returns:
        The constructed test ``Module``.
    """
    m = Module('test')
    matrix_size = 16

    # target instance
    led = mkLed(matrix_size)

    # copy paras and ports
    m.copy_params(led)
    ports = m.copy_sim_ports(led)

    clk = ports['CLK']
    rst = ports['RST']

    # memory image
    memname = 'mymem.out'

    def write_word(out, word):
        # one byte per line, least-significant byte first
        digits = '%08x' % word
        for end in (8, 6, 4, 2):
            out.write('%s\n' % digits[end - 2:end])

    def write_region(out, base, limit, word_at):
        # emit the matrix words, then pad with zero bytes up to `limit`
        addr = base
        for row in range(matrix_size):
            for col in range(matrix_size):
                addr += 4
                write_word(out, word_at(row, col))
        for _ in range(limit - addr):
            out.write('00\n')

    with open(memname, 'w') as image:
        # ram_a
        write_region(image, 0, 1024,
                     lambda row, col: row + 1 if row == col else 0)
        # ram_b
        write_region(image, 1024, 2048,
                     lambda row, col: 2 if row == col else 0)
        # ram_c
        write_region(image, 2048, 2 ** 20,
                     lambda row, col: 100)

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst, memimg=memname)
    memory.connect(ports, 'myaxi')

    uut = m.Instance(led, 'uut',
                     params=m.connect_params(led),
                     ports=m.connect_ports(led))

    simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    reset_seq = simulation.setup_reset(m, rst, m.make_reset(), period=100)

    # stop the simulation after a fixed delay
    reset_seq.add(
        Delay(1000000),
        Systask('finish'),
    )

    return m
def run(act_shape=(1, 7, 7, 15),
        weight1_shape=(7, 3, 3, 15), bias1_shape=None, scale1_shape=None,
        weight2_shape=(9, 3, 3, 7), bias2_shape=None, scale2_shape=None,
        act_dtype=ng.int32,
        weight1_dtype=ng.int32, bias1_dtype=ng.int32, scale1_dtype=ng.int32,
        weight2_dtype=ng.int32, bias2_dtype=ng.int32, scale2_dtype=ng.int32,
        tmp_dtype=ng.int32,
        out_dtype=ng.int32,
        stride1=(1, 1, 1, 1), stride2=(1, 1, 1, 1),
        rshift_mul1=None, rshift_sum1=None, rshift_out1=None,
        rshift_mul2=None, rshift_sum2=None, rshift_out2=None,
        act_func1=None, act_func2=None,
        par_ich1=1, par_och1=1, par_col1=1, par_row1=1,
        concur_och1=None, stationary1='filter',
        par_ich2=1, par_och2=1, par_col2=1, par_row2=1,
        concur_och2=None, stationary2='filter',
        input_ram_size1=None, filter_ram_size1=None,
        bias_ram_size1=None, scale_ram_size1=None,
        out_ram_size1=None,
        input_ram_size2=None, filter_ram_size2=None,
        bias_ram_size2=None, scale_ram_size2=None,
        out_ram_size2=None,
        chunk_size=64,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Generate, simulate and verify two chained conv2d layers with variables.

    The function builds ``conv2d_1 -> conv2d_2`` from NNgen operators,
    computes reference data with ``ng.verify.conv2d``, places inputs,
    variables and the expected output into a memory image at custom
    addresses, and runs an RTL simulation whose control thread compares the
    hardware output with the expected output word by word.

    Args:
        act_shape: Shape of the input placeholder.
        weight1_shape, weight2_shape: Shapes of the two filter variables.
        bias1_shape, scale1_shape, bias2_shape, scale2_shape: Shapes of the
            optional bias/scale variables; ``None`` omits the variable.
        act_dtype, weight*_dtype, bias*_dtype, scale*_dtype, tmp_dtype,
            out_dtype: NNgen dtypes of the corresponding tensors.
        stride1, stride2, rshift_*, act_func1, act_func2, par_*, concur_och*,
            stationary*, *_ram_size*: Forwarded positionally to ``ng.conv2d``
            and ``ng.verify.conv2d`` for the respective layer.
        chunk_size: Alignment unit (bytes) for variable addresses; also
            passed as ``offchipram_chunk_bytes``.
        axi_datawidth: Passed as ``maxi_datawidth`` and to the memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not ``None``, the test module is written there as
            Verilog.
        simtype: Simulator name handed to ``simulation.Simulator``.
        outputfile: Simulator output file; defaults to this script's base
            name with an '.out' suffix.

    Returns:
        The simulator output text.  For 'verilator', a final line starting
        with '-' is removed.
    """
    # create target hardware
    act = ng.placeholder(act_dtype, shape=act_shape, name='act')

    weight1 = ng.variable(weight1_dtype, shape=weight1_shape, name='weight1')

    if bias1_shape is not None:
        bias1 = ng.variable(bias1_dtype, bias1_shape, name='bias1')
    else:
        bias1 = None

    if scale1_shape is not None:
        scale1 = ng.variable(scale1_dtype, scale1_shape, name='scale1')
    else:
        scale1 = None

    weight2 = ng.variable(weight2_dtype, shape=weight2_shape, name='weight2')

    if bias2_shape is not None:
        bias2 = ng.variable(bias2_dtype, bias2_shape, name='bias2')
    else:
        bias2 = None

    if scale2_shape is not None:
        scale2 = ng.variable(scale2_dtype, scale2_shape, name='scale2')
    else:
        scale2 = None

    tmp = ng.conv2d(act, weight1, stride1,
                    bias1, scale1,
                    rshift_mul1, rshift_sum1, rshift_out1,
                    act_func1, 'SAME',
                    tmp_dtype, ng.int32, ng.int32,
                    'conv2d_1',
                    par_ich1, par_och1, par_col1, par_row1,
                    concur_och1, stationary1,
                    input_ram_size1, filter_ram_size1,
                    bias_ram_size1, scale_ram_size1,
                    None, None, None,
                    out_ram_size1)

    out = ng.conv2d(tmp, weight2, stride2,
                    bias2, scale2,
                    rshift_mul2, rshift_sum2, rshift_out2,
                    act_func2, 'SAME',
                    out_dtype, ng.int32, ng.int32,
                    'conv2d_2',
                    par_ich2, par_och2, par_col2, par_row2,
                    concur_och2, stationary2,
                    input_ram_size2, filter_ram_size2,
                    bias_ram_size2, scale_ram_size2,
                    None, None, None,
                    out_ram_size2)

    targ = ng.to_veriloggen([out], 'matrix_conv2d_conv2d_variable', silent=silent,
                            config={'maxi_datawidth': axi_datawidth,
                                    'offchipram_chunk_bytes': chunk_size})

    # verification data
    vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [16]
    vweight1 = np.arange(weight1.length,
                         dtype=np.int64).reshape(weight1_shape) % [32] - [16]

    if bias1 is not None:
        vbias1 = np.arange(bias1.length,
                           dtype=np.int64).reshape(bias1.shape) % [16]
    else:
        vbias1 = None

    if scale1 is not None:
        vscale1 = np.arange(scale1.length,
                            dtype=np.int64).reshape(scale1.shape) % [8]
    else:
        vscale1 = None

    vweight2 = np.arange(weight2.length,
                         dtype=np.int64).reshape(weight2_shape) % [32] - [16]

    if bias2 is not None:
        vbias2 = np.arange(bias2.length,
                           dtype=np.int64).reshape(bias2.shape) % [16]
    else:
        vbias2 = None

    if scale2 is not None:
        vscale2 = np.arange(scale2.length,
                            dtype=np.int64).reshape(scale2.shape) % [8]
    else:
        vscale2 = None

    vtmp = ng.verify.conv2d(vact, vweight1, stride1,
                            vbias1, vscale1,
                            rshift_mul1, rshift_sum1, rshift_out1,
                            act_func1, 'SAME',
                            tmp_dtype, ng.int32, ng.int32,
                            'conv2d_1',
                            par_ich1, par_och1, par_col1, par_row1,
                            concur_och1, stationary1,
                            input_ram_size1, filter_ram_size1,
                            bias_ram_size1, scale_ram_size1,
                            None, None, None,
                            out_ram_size1,
                            False,
                            act_dtype, weight1_dtype)

    vout = ng.verify.conv2d(vtmp, vweight2, stride2,
                            vbias2, vscale2,
                            rshift_mul2, rshift_sum2, rshift_out2,
                            act_func2, 'SAME',
                            out_dtype, ng.int32, ng.int32,
                            'conv2d_2',
                            par_ich2, par_och2, par_col2, par_row2,
                            concur_och2, stationary2,
                            input_ram_size2, filter_ram_size2,
                            bias_ram_size2, scale_ram_size2,
                            None, None, None,
                            out_ram_size2,
                            False,
                            tmp_dtype, weight2_dtype)

    # to memory image
    def aligned(size):
        # round a byte size up to a multiple of chunk_size
        return int(math.ceil(size / chunk_size)) * chunk_size

    size_max = aligned(max(act.memory_size,
                           weight1.memory_size,
                           bias1.memory_size if bias1 is not None else 0,
                           scale1.memory_size if scale1 is not None else 0,
                           weight2.memory_size,
                           bias2.memory_size if bias2 is not None else 0,
                           scale2.memory_size if scale2 is not None else 0,
                           out.memory_size))

    # assign custom addresses
    # Variables are packed back to back, each aligned to chunk_size; an
    # absent bias/scale takes no space, so the following variable starts at
    # the same address.  (The previous fallbacks pointed back at an earlier
    # variable, which made regions overlap when only one of bias/scale was
    # given.)
    variable_addr = max(act.addr, out.addr) + size_max

    next_addr = variable_addr
    last_addr = variable_addr  # start of the last variable actually present
    slot_addrs = []
    for var in (weight1, bias1, scale1, weight2, bias2, scale2):
        slot_addrs.append(next_addr)
        if var is not None:
            last_addr = next_addr
            next_addr += aligned(var.memory_size)

    (weight1_addr, bias1_addr, scale1_addr,
     weight2_addr, bias2_addr, scale2_addr) = slot_addrs

    # size_max covers the largest variable, so this lies past the last one
    check_addr = last_addr + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    mem = mem + [100]

    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich1))

    axi.set_memory(mem, vweight1, memimg_datawidth,
                   weight1_dtype.width, weight1_addr,
                   max(int(math.ceil(axi_datawidth / weight1_dtype.width)), par_ich1))

    if bias1_shape is not None:
        axi.set_memory(mem, vbias1, memimg_datawidth,
                       bias1_dtype.width, bias1_addr,
                       max(int(math.ceil(axi_datawidth / bias1_dtype.width)), par_och1))

    if scale1_shape is not None:
        axi.set_memory(mem, vscale1, memimg_datawidth,
                       scale1_dtype.width, scale1_addr,
                       max(int(math.ceil(axi_datawidth / scale1_dtype.width)), par_och1))

    axi.set_memory(mem, vweight2, memimg_datawidth,
                   weight2_dtype.width, weight2_addr,
                   max(int(math.ceil(axi_datawidth / weight2_dtype.width)), par_ich2))

    if bias2_shape is not None:
        axi.set_memory(mem, vbias2, memimg_datawidth,
                       bias2_dtype.width, bias2_addr,
                       max(int(math.ceil(axi_datawidth / bias2_dtype.width)), par_och2))

    if scale2_shape is not None:
        axi.set_memory(mem, vscale2, memimg_datawidth,
                       scale2_dtype.width, scale2_addr,
                       max(int(math.ceil(axi_datawidth / scale2_dtype.width)), par_och2))

    axi.set_memory(mem, vout, memimg_datawidth,
                   out_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / out_dtype.width)), par_och2))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # NOTE: ctrl is handed to vthread.Thread below; keep its body free of
    # constructs beyond what it already uses.
    def ctrl():
        for i in range(100):
            pass

        # set custom addresses
        ng.sim.set_global_addrs(_saxi, tmp_addr, out.addr, act.addr, variable_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch,
                            out.addr, out_dtype.width)
                        check = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch,
                            check_addr, out_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch,
                                  ') orig: ', orig, ' check: ', check)
                            ok = False
                        # else:
                        #     print('OK (', bat, y, x, ch,
                        #           ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # hard stop in case the control thread never reaches vthread.finish()
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # drop a trailing verilator line that starts with '-'; `lines` may be
    # empty when the simulator printed nothing
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def run(act_shape=(1, 7, 7, 15),
        act_dtype=ng.int32, out_dtype=ng.int32,
        factors=(1, 2, 2, 1), par=1,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Generate, simulate and verify an ``ng.upsampling2d`` operator.

    Reference output is computed with ``ng.verify.upsampling2d`` and stored
    in the memory image at ``check_addr``; the control thread compares it
    word by word with what the hardware wrote at ``out.addr``.

    Args:
        act_shape: Shape of the input placeholder.
        act_dtype, out_dtype: NNgen dtypes of input and output.
        factors: Forwarded to ``ng.upsampling2d`` / ``ng.verify.upsampling2d``.
        par: Forwarded to ``ng.upsampling2d``; also a lower bound for the
            last argument of ``axi.set_memory``.
        axi_datawidth: Passed as ``maxi_datawidth`` and to the memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not ``None``, the test module is written there as
            Verilog.
        simtype: Simulator name handed to ``simulation.Simulator``.
        outputfile: Simulator output file; defaults to this script's base
            name with an '.out' suffix.

    Returns:
        The simulator output text.  For 'verilator', a final line starting
        with '-' is removed.
    """
    # create target hardware
    act = ng.placeholder(act_dtype, shape=act_shape, name='act')
    out = ng.upsampling2d(act, factors=factors, dtype=out_dtype, par=par)

    targ = ng.to_veriloggen([out], 'matrix_upsampling2d', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    vact = np.arange(act.length, dtype=np.int64).reshape(act.shape)
    vout = ng.verify.upsampling2d(vact, factors=factors, dtype=out_dtype)

    # to memory image
    # region size rounded up to a multiple of 4096 bytes
    size_max = int(math.ceil(
        max(act.memory_size, out.memory_size) / 4096)) * 4096
    check_addr = max(act.addr, out.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    mem = mem + [100]

    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), par))
    axi.set_memory(mem, vout, memimg_datawidth,
                   out_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / out_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # NOTE: ctrl is handed to vthread.Thread below; keep its body free of
    # constructs beyond what it already uses.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch,
                            out.addr, out_dtype.width)
                        check = memory.read_word(
                            bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3] +
                            y * out.aligned_shape[2] * out.aligned_shape[3] +
                            x * out.aligned_shape[3] + ch,
                            check_addr, out_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch,
                                  ') orig: ', orig, ' check: ', check)
                            ok = False
                        # else:
                        #     print('OK (', bat, y, x, ch,
                        #           ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # hard stop in case the control thread never reaches vthread.finish()
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # drop a trailing verilator line that starts with '-'; `lines` may be
    # empty when the simulator printed nothing
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def run(a_shape=(15, 15), b_shape=(15, 15),
        bias_shape=None, scale_shape=None,
        a_dtype=ng.int32, b_dtype=ng.int32,
        bias_dtype=ng.int32, scale_dtype=ng.int32,
        c_dtype=ng.int32,
        rshift_mul=None, rshift_sum=None, rshift_out=None,
        act_func=None,
        par_left_col=1, par_left_row=1, par_out_col=1,
        concur_out_col=None, stationary='right',
        left_ram_size=None, right_ram_size=None,
        bias_ram_size=None, scale_ram_size=None,
        out_ram_size=None,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Generate, simulate and verify an ``ng.matmul`` operator.

    ``a`` and ``b`` (and optionally ``bias`` / ``scale``) are placeholders.
    Reference output is computed with ``ng.verify.matmul`` from generated
    numpy data and stored in the memory image at ``check_addr``; the control
    thread compares it word by word with what the hardware wrote at
    ``c.addr``.

    Args:
        a_shape, b_shape: Shapes of the two operands; ``b`` is used
            transposed (``transposed_b=True``).
        bias_shape, scale_shape: Shapes of the optional bias/scale inputs;
            ``None`` omits the input.
        a_dtype, b_dtype, bias_dtype, scale_dtype, c_dtype: NNgen dtypes.
        rshift_mul, rshift_sum, rshift_out, act_func, par_left_col,
            par_left_row, par_out_col, concur_out_col, stationary,
            left_ram_size, right_ram_size, bias_ram_size, scale_ram_size,
            out_ram_size: Forwarded positionally to ``ng.matmul`` and
            ``ng.verify.matmul``.
        axi_datawidth: Passed as ``maxi_datawidth`` and to the memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: If not ``None``, the test module is written there as
            Verilog.
        simtype: Simulator name handed to ``simulation.Simulator``.
        outputfile: Simulator output file; defaults to this script's base
            name with an '.out' suffix.

    Returns:
        The simulator output text.  For 'verilator', a final line starting
        with '-' is removed.
    """
    # create target hardware
    a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')

    if bias_shape is not None:
        bias = ng.placeholder(bias_dtype, bias_shape, name='bias')
    else:
        bias = None

    if scale_shape is not None:
        scale = ng.placeholder(scale_dtype, scale_shape, name='scale')
    else:
        scale = None

    transposed_a = False
    transposed_b = True

    c = ng.matmul(a, b,
                  bias, scale,
                  transposed_a, transposed_b,
                  rshift_mul, rshift_sum, rshift_out,
                  act_func,
                  c_dtype, ng.int32, ng.int32,
                  'matmul',
                  par_left_col, par_left_row, par_out_col,
                  concur_out_col, stationary,
                  left_ram_size, right_ram_size,
                  bias_ram_size, scale_ram_size,
                  None, None, None,
                  out_ram_size)

    targ = ng.to_veriloggen([c], 'matrix_matmul', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [5]
    vb = np.arange(b.length, dtype=np.int64).reshape(b.shape) % [5] - [3]

    if bias is not None:
        vbias = np.arange(bias.length, dtype=np.int64).reshape(
            bias.shape) % [4]
    else:
        vbias = None

    if scale is not None:
        vscale = np.arange(scale.length, dtype=np.int64).reshape(
            scale.shape) % [6]
    else:
        vscale = None

    # The reference model must receive the numpy data (vbias / vscale) that
    # is also written to the memory image below, not the graph placeholders,
    # and the same transposition flags as the hardware operator.
    vc = ng.verify.matmul(va, vb,
                          vbias, vscale,
                          transposed_a, transposed_b,
                          rshift_mul, rshift_sum, rshift_out,
                          act_func,
                          c_dtype, ng.int32, ng.int32,
                          'matmul',
                          par_left_col, par_left_row, par_out_col,
                          concur_out_col, stationary,
                          left_ram_size, right_ram_size,
                          bias_ram_size, scale_ram_size,
                          None, None, None,
                          out_ram_size,
                          False,
                          a_dtype, b_dtype,
                          bias_dtype, scale_dtype)

    # to memory image
    # region size rounded up to a multiple of 4096 bytes
    size_max = int(
        math.ceil(
            max(a.memory_size, b.memory_size,
                bias.memory_size if bias is not None else 0,
                scale.memory_size if scale is not None else 0,
                c.memory_size) / 4096)) * 4096
    check_addr = max(a.addr, b.addr,
                     bias.addr if bias is not None else -1,
                     scale.addr if scale is not None else -1,
                     c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    mem = mem + [100]

    axi.set_memory(
        mem, va, memimg_datawidth, a_dtype.width, a.addr,
        max(int(math.ceil(axi_datawidth / a_dtype.width)), par_left_col))

    axi.set_memory(
        mem, vb, memimg_datawidth, b_dtype.width, b.addr,
        max(int(math.ceil(axi_datawidth / b_dtype.width)), par_left_col))

    if bias is not None:
        axi.set_memory(
            mem, vbias, memimg_datawidth, bias_dtype.width, bias.addr,
            max(int(math.ceil(axi_datawidth / bias_dtype.width)), par_out_col))

    if scale is not None:
        axi.set_memory(
            mem, vscale, memimg_datawidth, scale_dtype.width, scale.addr,
            max(int(math.ceil(axi_datawidth / scale_dtype.width)), par_out_col))

    axi.set_memory(
        mem, vc, memimg_datawidth, c_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / c_dtype.width)), par_out_col))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # NOTE: ctrl is handed to vthread.Thread below; keep its body free of
    # constructs beyond what it already uses.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(c.shape[0]):
            for j in range(c.shape[1]):
                orig = memory.read_word(i * c.aligned_shape[1] + j,
                                        c.addr, c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print(i, j, orig, check)
                    ok = False

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # hard stop in case the control thread never reaches vthread.finish()
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # drop a trailing verilator line that starts with '-'; `lines` may be
    # empty when the simulator printed nothing
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def mkTest(memname='mymem.out'):
    """Build the 'test' module around ``mkLed()`` with a control thread.

    A byte-per-line hex memory image is written to ``memname`` and used as
    ``memimg`` of the AXI memory model connected to the 'maxi' ports.  The
    image holds three regions of 16 x 16 32-bit words (least-significant
    byte first): diagonal 1, 2, 3, ... from byte 0, diagonal 2 from byte
    1024, and all-100 from byte 2048, each padded with '00' lines.

    The ``ctrl`` thread writes the matrix size and three offsets through
    ``_saxi``, writes 1 to address 0, polls address 20 until it reads
    non-zero, and prints the counter values around that interval.

    Args:
        memname: Path of the memory image file to create.

    Returns:
        The constructed test ``Module``.
    """
    m = Module('test')
    matrix_size = 16

    # target instance
    led = mkLed()

    # copy paras and ports
    m.copy_params(led)
    ports = m.copy_sim_ports(led)

    clk = ports['CLK']
    rst = ports['RST']

    # memory image
    def write_word(out, word):
        # one byte per line, least-significant byte first
        digits = '%08x' % word
        for end in (8, 6, 4, 2):
            out.write('%s\n' % digits[end - 2:end])

    def write_region(out, base, limit, word_at):
        # emit the matrix words, then pad with zero bytes up to `limit`
        addr = base
        for row in range(matrix_size):
            for col in range(matrix_size):
                addr += 4
                write_word(out, word_at(row, col))
        for _ in range(limit - addr):
            out.write('00\n')

    with open(memname, 'w') as image:
        # ram_a
        write_region(image, 0, 1024,
                     lambda row, col: row + 1 if row == col else 0)
        # ram_b
        write_region(image, 1024, 2048,
                     lambda row, col: 2 if row == col else 0)
        # ram_c
        write_region(image, 2048, 2 ** 20,
                     lambda row, col: 100)

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst, memimg=memname)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # Timer
    counter = m.Reg('counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        counter.inc()
    )

    # NOTE: ctrl is handed to vthread.Thread below; its body is kept as is.
    def ctrl():
        for i in range(100):
            pass

        awaddr = 4
        matrix_size = 16
        print('# matrix_size = %d' % matrix_size)
        _saxi.write(awaddr, matrix_size)

        awaddr = 8
        a_offset = 0
        print('# a_offset = %d' % a_offset)
        _saxi.write(awaddr, a_offset)

        awaddr = 12
        b_offset = 1024 * 1
        print('# b_offset = %d' % b_offset)
        _saxi.write(awaddr, b_offset)

        awaddr = 16
        c_offset = 1024 * 2
        print('# c_offset = %d' % c_offset)
        _saxi.write(awaddr, c_offset)

        awaddr = 0
        start_time = counter
        print('# start time = %d' % start_time)
        _saxi.write(awaddr, 1)

        araddr = 20
        v = _saxi.read(araddr)
        while v == 0:
            v = _saxi.read(araddr)

        end_time = counter
        print('# end time = %d' % end_time)
        time = end_time - start_time
        print('# exec time = %d' % time)

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(led, 'uut',
                     params=m.connect_params(led),
                     ports=m.connect_ports(led))

    simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    reset_seq = simulation.setup_reset(m, rst, m.make_reset(), period=100)

    # stop the simulation after a fixed delay
    reset_seq.add(
        Delay(1000000),
        Systask('finish'),
    )

    return m
def run(act_dtype=ng.int16, weight_dtype=ng.int16,
        bias_dtype=ng.int32, scale_dtype=ng.int16,
        with_batchnorm=False, disable_fusion=False,
        conv2d_par_ich=1, conv2d_par_och=1, conv2d_par_col=1, conv2d_par_row=1,
        conv2d_concur_och=None, conv2d_stationary='filter',
        pool_par=1, elem_par=1,
        chunk_size=64, axi_datawidth=32, silent=False,
        filename=None,
        simtype='iverilog',
        # simtype='verilator',
        # simtype=None,  # no RTL simulation
        outputfile=None):
    """Export a torchvision VGG11 to ONNX, convert it with NNgen and simulate.

    Steps: build the PyTorch model with a reduced classifier, export it to
    'vgg11.onnx', import it with ``ng.from_onnx``, quantize, assign hardware
    attributes, evaluate the dataflow in software (``ng.eval``), generate
    IP-XACT (``ng.to_ipxact``), and run an RTL simulation whose control
    thread compares the hardware output with the software result.

    Args:
        act_dtype, weight_dtype, bias_dtype, scale_dtype: NNgen dtypes used
            as defaults in ``ng.from_onnx``.
        with_batchnorm: Use ``vgg11_bn`` instead of ``vgg11``.
        disable_fusion: Forwarded to ``ng.from_onnx``.
        conv2d_par_ich, conv2d_par_och, conv2d_par_col, conv2d_par_row,
            conv2d_concur_och, conv2d_stationary: Attributes applied to every
            ``ng.conv2d`` operator.
        pool_par: ``par`` attribute for pooling operators.
        elem_par: ``par`` attribute for element-wise operators.
        chunk_size: Alignment unit (bytes) for the memory image regions and
            argument of ``ng.export_ndarray``.
        axi_datawidth: Passed as ``maxi_datawidth`` and to the memory model.
        silent: Forwarded to ``ng.to_ipxact``.
        filename: If not ``None``, the test module is written there as
            Verilog.
        simtype: Simulator name; ``None`` exits via ``sys.exit()`` before
            the RTL simulation.
        outputfile: Simulator output file; defaults to this script's base
            name with an '.out' suffix.

    Returns:
        The simulator output text.  For 'verilator', a final line starting
        with '-' is removed.
    """
    # input mean and standard deviation
    cifar10_mean = np.array([0.4914, 0.4822, 0.4465]).astype(np.float32)
    cifar10_std = np.array([0.247, 0.243, 0.261]).astype(np.float32)
    act_shape = (1, 32, 32, 3)

    # pytorch model
    if with_batchnorm:
        model = torchvision.models.vgg11_bn(pretrained=False)
    else:
        model = torchvision.models.vgg11(pretrained=False)

    model.features[0].in_channels = act_shape[-1]
    model.avgpool = nn.Identity()
    # model.classifier[0] = nn.Linear(512, 4096)
    # model.classifier[6] = nn.Linear(4096, 10)
    model.classifier = nn.Sequential(
        nn.Linear(in_features=512, out_features=1024, bias=True),
        nn.ReLU(inplace=True),
        nn.Dropout(p=0.5),
        nn.Linear(in_features=1024, out_features=1024, bias=True),
        nn.ReLU(inplace=True),
        nn.Dropout(p=0.5),
        nn.Linear(in_features=1024, out_features=10, bias=True),
    )

    # Pytorch to ONNX
    onnx_filename = 'vgg11.onnx'
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    model.eval()
    torch.onnx.export(model, dummy_input, onnx_filename,
                      input_names=input_names, output_names=output_names)

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    dtypes = {}
    (outputs, placeholders, variables,
     constants, operators) = ng.from_onnx(onnx_filename,
                                          value_dtypes=dtypes,
                                          default_placeholder_dtype=act_dtype,
                                          default_variable_dtype=weight_dtype,
                                          default_constant_dtype=weight_dtype,
                                          default_operator_dtype=act_dtype,
                                          default_scale_dtype=scale_dtype,
                                          default_bias_dtype=bias_dtype,
                                          disable_fusion=disable_fusion)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    if act_dtype.width > 8:
        act_scale_factor = 128
    else:
        act_scale_factor = int(round(2 ** (act_dtype.width - 1) * 0.5))

    input_scale_factors = {'act': act_scale_factor}
    input_means = {'act': cifar10_mean * act_scale_factor}
    input_stds = {'act': cifar10_std * act_scale_factor}

    ng.quantize(outputs, input_scale_factors, input_means, input_stds)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=conv2d_par_ich,
                         par_och=conv2d_par_och,
                         par_col=conv2d_par_col,
                         par_row=conv2d_par_row,
                         concur_och=conv2d_concur_och,
                         stationary=conv2d_stationary)

        if isinstance(op, (ng.avg_pool, ng.max_pool,
                           ng.avg_pool_serial, ng.max_pool_serial)):
            op.attribute(par=pool_par)

        if ng.is_elementwise_operator(op):
            op.attribute(par=elem_par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    act = placeholders['act']
    out = outputs['out']

    # verification data
    # random data
    img = np.random.uniform(size=act.length).astype(np.float32).reshape(act.shape)
    img = img * 12.0 * cifar10_std + cifar10_mean

    # img = np.random.normal(size=act.length).astype(np.float32).reshape(act.shape)
    # img = img * cifar10_std + cifar10_mean

    # execution on pytorch
    model_input = img

    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    if act.perm is not None and len(model_out.shape) == len(act.shape):
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    # software-based verification
    vact = img * act_scale_factor
    vact = np.clip(vact,
                   -1.0 * (2 ** (act.dtype.width - 1) - 1),
                   1.0 * (2 ** (act.dtype.width - 1) - 1))
    vact = np.round(vact).astype(np.int64)

    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    labels = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

    # print the ten largest entries of the float model and of the NNgen result
    mout = scaled_model_out
    for bat in range(mout.shape[0]):
        for index, value in sorted(enumerate(mout[bat]),
                                   key=lambda x: x[1], reverse=True)[:10]:
            print("# mout: %s (%d) = %f" % (str(labels[index]), index, value))
        for index, value in sorted(enumerate(vout[bat]),
                                   key=lambda x: x[1], reverse=True)[:10]:
            print("# vout: %s (%d) = %d" % (str(labels[index]), index, value))

    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    # to Veriloggen object
    # targ = ng.to_veriloggen([out], 'vgg11', silent=silent,
    #                        config={'maxi_datawidth': axi_datawidth})

    # to IP-XACT (the method returns Veriloggen object, as well as to_veriloggen)
    targ = ng.to_ipxact([out], 'onnx_vgg11', silent=silent,
                        config={'maxi_datawidth': axi_datawidth})

    # to Verilog HDL RTL (the method returns a source code text)
    # rtl = ng.to_verilog([out], 'vgg11', silent=silent,
    #                    config={'maxi_datawidth': axi_datawidth})

    # --------------------
    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    if simtype is None:
        sys.exit()

    # to memory image
    param_data = ng.export_ndarray([out], chunk_size)
    param_bytes = len(param_data)

    # regions follow each other, each start rounded up to chunk_size
    variable_addr = int(math.ceil((act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil((variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil((check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    # mem = np.zeros([1024 * 1024 * 256 // (memimg_datawidth // 8)], dtype=np.int64)
    mem = np.zeros([1024 * 1024 * 1024 // (memimg_datawidth // 8)], dtype=np.int16)
    mem = mem + [100]

    # placeholder
    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth,
                   8, variable_addr)

    # verification data
    axi.set_memory(mem, vout, memimg_datawidth,
                   act_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # NOTE: ctrl is handed to vthread.Thread below; keep its body free of
    # constructs beyond what it already uses.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for x in range(out.shape[1]):
                orig = memory.read_word(bat * out.aligned_shape[1] + x,
                                        out.addr, act_dtype.width)
                check = memory.read_word(bat * out.aligned_shape[1] + x,
                                         check_addr, act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', bat, x,
                          ') orig: ', orig, ' check: ', check)
                    ok = False
                # else:
                #     print('OK (', bat, x,
                #           ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # hard stop in case the control thread never reaches vthread.finish()
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # drop a trailing verilator line that starts with '-'; `lines` may be
    # empty when the simulator printed nothing
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def mkTest(): m = Module('test') # target instance led = mkLed() # copy paras and ports params = m.copy_params(led) ports = m.copy_sim_ports(led) clk = ports['CLK'] rst = ports['RST'] memory = axi.AxiMemoryModel(m, 'memory', clk, rst) memory.connect(ports, 'myaxi') # AXI-Slave controller _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True) _saxi.connect(ports, 'saxi') def ctrl(): for i in range(100): pass for i in range(16): # byte addressing v = memory.read(i * 4) print('read: mem[%d] -> %x' % (i, v)) v = v + 1024 # byte addressing memory.write(i * 4, v) print('write: mem[%d] <- %x' % (i, v)) awaddr = 0 _saxi.write(awaddr, 1) araddr = 4 v = _saxi.read(araddr) while v == 0: v = _saxi.read(araddr) araddr = 8 v = _saxi.read(araddr) if v: print('SLAVE: ALL OK') else: print('SLAVE: NOT ALL OK') th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl) fsm = th.start() uut = m.Instance(led, 'uut', params=m.connect_params(led), ports=m.connect_ports(led)) simulation.setup_waveform(m, uut) simulation.setup_clock(m, clk, hperiod=5) init = simulation.setup_reset(m, rst, m.make_reset(), period=100) init.add( Delay(100000), Systask('finish'), ) return m
def mkTest(memimg_name=None):
    """Build the 'test' module around ``mkLed(16)`` with a numpy memory image.

    Two 16 x 16 int32 matrices are prepared: ``a`` with 1, 2, 3, ... on the
    diagonal and ``b`` with 2 on the diagonal, zeros elsewhere.  They are
    written into a zero-initialised image at the module-level ``a_offset``
    and ``b_offset`` using the module-level ``datawidth`` and
    ``axi_datawidth``, and the image is handed to the AXI memory model.

    Args:
        memimg_name: Forwarded to ``axi.AxiMemoryModel`` as ``memimg_name``.

    Returns:
        The constructed test ``Module``.
    """
    matrix_size = 16
    a_shape = (matrix_size, matrix_size)
    b_shape = (matrix_size, matrix_size)
    c_shape = (a_shape[0], b_shape[0])

    # size bookkeeping (not used further in this function)
    n_raw_a = axi.shape_to_length(a_shape)
    n_raw_b = axi.shape_to_length(b_shape)
    n_a = axi.shape_to_memory_size(a_shape, datawidth)
    n_b = axi.shape_to_memory_size(b_shape, datawidth)

    # a: diagonal 1, 2, 3, ...; off-diagonal entries stay zero
    a = np.zeros(a_shape, dtype=np.int32)
    for diag in range(min(a_shape)):
        a[diag][diag] = diag + 1

    # b: diagonal 2; off-diagonal entries stay zero
    b = np.zeros(b_shape, dtype=np.int32)
    for diag in range(min(b_shape)):
        b[diag][diag] = 2

    a_addr = a_offset
    size_a = n_a * datawidth // 8
    b_addr = b_offset
    size_b = n_b * datawidth // 8

    mem = np.zeros([1024 * 1024 * 8 // axi_datawidth], dtype=np.int64)
    for matrix, base in ((a, a_addr), (b, b_addr)):
        axi.set_memory(mem, matrix, axi_datawidth, datawidth, base)

    led = mkLed(matrix_size)

    m = Module('test')
    m.copy_params(led)
    ports = m.copy_sim_ports(led)
    clk = ports['CLK']
    rst = ports['RST']

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                mem_datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name)
    memory.connect(ports, 'myaxi')

    uut = m.Instance(led, 'uut',
                     params=m.connect_params(led),
                     ports=m.connect_ports(led))

    simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    reset_seq = simulation.setup_reset(m, rst, m.make_reset(), period=100)

    # stop the simulation after a fixed delay
    reset_seq.add(
        Delay(1000000),
        Systask('finish'),
    )

    return m
def run(a_shape=(7, 15), b_shape=(7, 15),
        a_dtype=ng.int32, b_dtype=ng.int32, c_dtype=ng.int32,
        par=1, axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Export MatrixMul to ONNX, convert it with NNgen and simulate the RTL.

    Args:
        a_shape, b_shape: Shapes of the dummy inputs used for the ONNX export.
        a_dtype, b_dtype, c_dtype: NNgen dtypes assigned to 'a', 'b' and 'c'.
        par: Parallelism set on every ng.scaled_multiply operator; also used
            when laying out the memory image.
        axi_datawidth: Passed as 'maxi_datawidth' and to the memory model.
        silent: Forwarded to ng.to_veriloggen.
        filename: If not None, the testbench Verilog is written there.
        simtype: Simulator name; None exits via sys.exit() before simulation.
        outputfile: Simulator output file; defaults to '<this file>.out'.

    Returns:
        The simulator output text. For verilator, a final line starting
        with '-' is removed.
    """
    # pytorch model
    model = MatrixMul()

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_mul.onnx'
    dummy_a = torch.randn(*a_shape)
    dummy_b = torch.randn(*b_shape)
    dummy_inputs = (dummy_a, dummy_b)
    input_names = ['a', 'b']
    output_names = ['c']
    model.eval()
    torch.onnx.export(model, dummy_inputs, onnx_filename,
                      input_names=input_names, output_names=output_names)

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    value_dtypes = {'a': a_dtype,
                    'b': b_dtype,
                    'c': c_dtype}

    (outputs, placeholders, variables,
     constants, operators) = ng.from_onnx(onnx_filename,
                                          value_dtypes=value_dtypes,
                                          default_placeholder_dtype=ng.int32,
                                          default_variable_dtype=ng.int32,
                                          default_constant_dtype=ng.int32,
                                          default_operator_dtype=ng.int32,
                                          default_scale_dtype=ng.int32,
                                          default_bias_dtype=ng.int32,
                                          disable_fusion=False)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    input_scale_factors = {'a': 10.0, 'b': 15.0}

    ng.quantize(outputs, input_scale_factors)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    for op in operators.values():
        if isinstance(op, ng.scaled_multiply):
            op.attribute(par=par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    a = placeholders['a']
    b = placeholders['b']
    c = outputs['c']

    # verification data
    input_a = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [17]
    input_b = (np.arange(b.length, dtype=np.int64).reshape(b.shape) + [100]) % [13]

    # execution on pytorch
    model_a = input_a.astype(np.float32)
    if a.perm is not None:
        model_a = np.transpose(model_a, a.reversed_perm)

    model_b = input_b.astype(np.float32)
    if b.perm is not None:
        model_b = np.transpose(model_b, b.reversed_perm)

    model.eval()
    model_c = model(torch.from_numpy(model_a),
                    torch.from_numpy(model_b)).detach().numpy()
    if a.perm is not None:
        model_c = np.transpose(model_c, a.perm)
    scaled_model_c = model_c * c.scale_factor

    # software-based verification
    va = input_a * input_scale_factors['a']
    va = np.clip(va,
                 -1.0 * (2**(a.dtype.width - 1) - 1),
                 1.0 * (2**(a.dtype.width - 1) - 1))
    va = np.round(va).astype(np.int64)

    vb = input_b * input_scale_factors['b']
    vb = np.clip(vb,
                 -1.0 * (2**(b.dtype.width - 1) - 1),
                 1.0 * (2**(b.dtype.width - 1) - 1))
    vb = np.round(vb).astype(np.int64)

    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # Not consumed below; presumably inspected via the commented-out
    # breakpoint when debugging quantization error.
    mean_square_error = np.sum((vc - scaled_model_c)**2) / vc.size
    corrcoef = np.corrcoef(model_c.reshape([-1]), vc.reshape([-1]))

    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    targ = ng.to_veriloggen([c], 'onnx_matrix_mul', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # --------------------
    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    if simtype is None:
        sys.exit()

    # to memory image
    param_data = ng.export_ndarray([c])
    param_bytes = len(param_data)

    # Each region starts on the next 4096-byte boundary after the previous one.
    variable_addr = int(
        math.ceil(
            max(a.addr + a.memory_size, b.addr + b.memory_size) / 4096)) * 4096
    check_addr = int(math.ceil((variable_addr + param_bytes) / 4096)) * 4096
    tmp_addr = int(math.ceil((check_addr + c.memory_size) / 4096)) * 4096

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)], dtype=np.int64)
    mem = mem + [100]

    # placeholder
    axi.set_memory(mem, va, memimg_datawidth,
                   a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth,
                   b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth,
                   8, variable_addr)

    # verification data
    axi.set_memory(mem, vc, memimg_datawidth,
                   c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Number of rows when all leading dimensions of c are flattened.
    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j,
                                        c.addr, c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Guard against empty simulator output before peeking at the last line.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
# NOTE(review): orphaned testbench wiring with no enclosing definition in this
# span; it relies on m, targ, mem, outputfile, axi_datawidth and
# memimg_datawidth being defined by surrounding code - confirm where it belongs.
ports = m.copy_sim_ports(targ)
clk = ports['CLK']
resetn = ports['RESETN']
# Active-high reset derived from the active-low RESETN port.
rst = m.Wire('RST')
rst.assign(Not(resetn))

# AXI memory model
if outputfile is None:
    outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

memimg_name = 'memimg_' + outputfile

memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                            datawidth=axi_datawidth,
                            memimg=mem, memimg_name=memimg_name,
                            memimg_datawidth=memimg_datawidth)
memory.connect(ports, 'maxi')

# AXI-Slave controller
_saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
_saxi.connect(ports, 'saxi')

# timer: 32-bit counter incremented by the 'seq' sequencer
time_counter = m.Reg('time_counter', 32, initval=0)
seq = Seq(m, 'seq', clk, rst)
seq(time_counter.inc())
def run(a_shape=(15, 15), b_shape=(15, 15),
        a_dtype=ng.int32, b_dtype=ng.int32, c_dtype=ng.int32,
        par=1, axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Generate and simulate hardware computing relu(a + b).

    Args:
        a_shape, b_shape: Shapes of the two placeholders.
        a_dtype, b_dtype, c_dtype: NNgen dtypes of the inputs and the result.
        par: Parallelism passed to ng.add / ng.relu and used when laying out
            the memory image.
        axi_datawidth: Passed as 'maxi_datawidth' and to the memory model.
        silent: Forwarded to ng.to_veriloggen.
        filename: If not None, the testbench Verilog is written there.
        simtype: Simulator name passed to simulation.Simulator.
        outputfile: Simulator output file; defaults to '<this file>.out'.

    Returns:
        The simulator output text. For verilator, a final line starting
        with '-' is removed.
    """
    # create target hardware
    a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')
    t = ng.add(a, b, dtype=c_dtype, par=par)
    c = ng.relu(t, dtype=c_dtype, par=par)

    targ = ng.to_veriloggen([c], 'matrix_add_relu', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [5] - [10]
    vb = (np.arange(b.length, dtype=np.int64).reshape(b.shape) + [100]) % [6] - [10]

    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # to memory image
    # The expected-result region is placed one 4096-byte-aligned slot past
    # the highest tensor address; scratch space follows it.
    size_max = int(
        math.ceil(
            max(a.memory_size, b.memory_size, c.memory_size) / 4096)) * 4096
    check_addr = max(a.addr, b.addr, c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)], dtype=np.int64)
    mem = mem + [100]

    axi.set_memory(mem, va, memimg_datawidth,
                   a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth,
                   b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))
    axi.set_memory(mem, vc, memimg_datawidth,
                   c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Number of rows when all leading dimensions of c are flattened.
    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j,
                                        c.addr, c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Guard against empty simulator output before peeking at the last line.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def run(act_shape=(1, 4, 4, 3),
        weight0_shape=(9, 3, 3, 3), weight1_shape=(9, 36),
        act_dtype=ng.int32, weight_dtype=ng.int32,
        stride0=1, padding0=0,
        with_batchnorm0=False, with_batchnorm1=False,
        act_func0='ReLU', act_func1='relu',
        disable_fusion=False,
        par_ich=1, par_och=1, par_col=1, par_row=1,
        concur_och=None, stationary='filter',
        chunk_size=64,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Export a Conv2d -> Transpose -> Flatten -> Linear model and simulate it.

    The PyTorch model is exported to ONNX, converted to an NNgen dataflow,
    quantized, checked in software, converted to Verilog and simulated.

    Args:
        act_shape: Shape of the input activation; the ONNX dummy input is
            this shape with axes 1 and 3 swapped.
        weight0_shape: Conv2d weight shape; indices 3, 0 and 1 supply the
            input channels, output channels and kernel size.
        weight1_shape: Linear weight shape as (out_features, in_features).
        act_dtype, weight_dtype: NNgen dtypes for activations and weights.
        stride0, padding0: Conv2d stride and padding.
        with_batchnorm0, with_batchnorm1: Append a batch-norm layer after
            the Conv2d / Linear layer respectively.
        act_func0, act_func1: Names of torch.nn activation layers, or None.
            The exact class name is tried first, then a case-insensitive
            match, so both 'ReLU' and 'relu' work.
        disable_fusion: Forwarded to ng.from_onnx.
        par_ich, par_och, par_col, par_row, concur_och: Hardware attributes
            applied to every ng.conv2d operator.
        stationary: Accepted for interface compatibility; not used here.
        chunk_size: Alignment for the memory layout; also passed to
            ng.export_ndarray and as the 'chunk_size' config entry.
        axi_datawidth: Passed as 'maxi_datawidth' and to the memory model.
        silent: Forwarded to ng.to_veriloggen.
        filename: If not None, the testbench Verilog is written there.
        simtype: Simulator name; None exits via sys.exit() before simulation.
        outputfile: Simulator output file; defaults to '<this file>.out'.

    Returns:
        The simulator output text. For verilator, a final line starting
        with '-' is removed.

    Raises:
        AttributeError: If an activation name matches no torch.nn layer.
    """

    def _make_activation(name):
        # torch.nn exposes activations as classes such as 'ReLU', so a
        # lower-case name like the 'relu' default is not a direct attribute.
        # Try the exact name first, then a case-insensitive match.
        layer_cls = getattr(nn, name, None)
        if layer_cls is None:
            wanted = name.lower()
            for attr in dir(nn):
                if attr.lower() != wanted:
                    continue
                candidate = getattr(nn, attr)
                if isinstance(candidate, type) and issubclass(candidate, nn.Module):
                    layer_cls = candidate
                    break
        if layer_cls is None:
            raise AttributeError(
                "torch.nn has no activation layer named '%s'" % name)
        return layer_cls()

    # pytorch model
    layers = []
    layers.append(
        nn.Conv2d(weight0_shape[3], weight0_shape[0], weight0_shape[1],
                  stride=stride0, padding=padding0))

    if with_batchnorm0:
        layers.append(nn.BatchNorm2d(weight0_shape[0]))

    if act_func0 is not None:
        layers.append(_make_activation(act_func0))

    class Transpose(nn.Module):
        def __init__(self, perm):
            super(Transpose, self).__init__()
            self.perm = perm

        def forward(self, input):
            return input.permute(*self.perm)

    layers.append(Transpose([0, 1, 3, 2]))

    class Flatten(nn.Module):
        def forward(self, input):
            # return input.view(input.size(0), -1)
            return torch.reshape(input, (input.size(0), -1))

    layers.append(Flatten())
    layers.append(nn.Linear(weight1_shape[1], weight1_shape[0]))

    if with_batchnorm1:
        # The Linear output is 2-D (batch, features); BatchNorm2d only
        # accepts 4-D input, so the 1-D variant is the one that applies.
        layers.append(nn.BatchNorm1d(weight1_shape[0]))

    if act_func1 is not None:
        layers.append(_make_activation(act_func1))

    model = nn.Sequential(*layers)

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_conv2d_transpose_linear.onnx'
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    model.eval()
    torch.onnx.export(model, dummy_input, onnx_filename,
                      input_names=input_names, output_names=output_names)

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    # NOTE(review): the keys '0.weight' / '3.weight' presumably follow the
    # nn.Sequential layer indices; with the Transpose layer inserted the
    # Linear layer may sit at a different index. Unmatched names fall back
    # to default_variable_dtype, which is the same dtype - confirm.
    value_dtypes = {'act': act_dtype,
                    '0.weight': weight_dtype,
                    '3.weight': weight_dtype,
                    'out': act_dtype}

    (outputs, placeholders, variables,
     constants, operators) = ng.from_onnx(onnx_filename,
                                          value_dtypes=value_dtypes,
                                          default_placeholder_dtype=act_dtype,
                                          default_variable_dtype=weight_dtype,
                                          default_constant_dtype=weight_dtype,
                                          default_operator_dtype=act_dtype,
                                          default_scale_dtype=ng.int32,
                                          default_bias_dtype=ng.int32,
                                          disable_fusion=disable_fusion)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    if act_dtype.width > 8:
        act_scale_factor = 128
    else:
        act_scale_factor = int(round(2**(act_dtype.width - 1) * 0.5))

    input_scale_factors = {'act': act_scale_factor}

    ng.quantize(outputs, input_scale_factors)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=par_ich, par_och=par_och,
                         par_row=par_row, par_col=par_col,
                         concur_och=concur_och)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    act = placeholders['act']
    out = outputs['out']

    # verification data
    # random data
    std = 0.2
    mean = 0.5
    img = np.random.normal(size=act.length).astype(np.float32).reshape(
        act.shape)
    img = img * std + mean

    # execution on pytorch
    model_input = img

    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    if act.perm is not None and len(model_out.shape) == len(act.shape):
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    # software-based verification
    vact = img * act_scale_factor
    vact = np.clip(vact,
                   -1.0 * (2**(act.dtype.width - 1) - 1),
                   1.0 * (2**(act.dtype.width - 1) - 1))
    vact = np.round(vact).astype(np.int64)

    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    # Not consumed below; presumably inspected via the commented-out
    # breakpoint when debugging quantization error.
    mean_square_error = np.sum((vout - scaled_model_out)**2) / vout.size
    corrcoef = np.corrcoef(model_out.reshape([-1]), vout.reshape([-1]))

    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    targ = ng.to_veriloggen([out], 'onnx_matrix_conv2d_transpose_linear',
                            silent=silent,
                            config={'maxi_datawidth': axi_datawidth,
                                    'chunk_size': chunk_size})

    # --------------------
    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    if simtype is None:
        sys.exit()

    # to memory image
    param_data = ng.export_ndarray([out], chunk_size)
    param_bytes = len(param_data)

    # Each region starts on the next chunk_size boundary after the previous one.
    variable_addr = int(math.ceil(
        (act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil(
        (variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil(
        (check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)], dtype=np.int64)
    mem = mem + [100]

    # placeholder
    axi.set_memory(
        mem, vact, memimg_datawidth, act_dtype.width, act.addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(
        mem, vout, memimg_datawidth, act_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(out.shape[0]):
            for j in range(out.shape[1]):
                orig = memory.read_word(i * out.aligned_shape[1] + j,
                                        out.addr, act_dtype.width)
                check = memory.read_word(i * out.aligned_shape[1] + j,
                                         check_addr, act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', i, j, ') orig: ', orig, 'check: ', check)
                    ok = False
                # else:
                #    print('OK (', i, j, ') orig: ', orig, 'check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Guard against empty simulator output before peeking at the last line.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def mkTest(memimg_name=None, axi_datawidth=32, datawidth=4, addrwidth=10):
    """Build the 'test' simulation module for mkLed(axi_datawidth, datawidth, addrwidth).

    The testbench attaches an AXI memory model to the 'myaxi' port group,
    drives the 'saxi' port group from a control thread that first rewrites
    16 memory words, instantiates the design as 'uut', and schedules a
    $finish after a fixed delay.

    Args:
        memimg_name: Forwarded to axi.AxiMemoryModel as ``memimg_name``.
        axi_datawidth, addrwidth: Forwarded to mkLed.
        datawidth: Forwarded to mkLed; also the word width used by the
            control thread when accessing the memory model.

    Returns:
        The assembled testbench Module.
    """
    m = Module('test')

    # Design under test.
    led = mkLed(axi_datawidth, datawidth, addrwidth)

    # Mirror the design's parameters and ports into the testbench.
    m.copy_params(led)
    ports = m.copy_sim_ports(led)
    clk, rst = ports['CLK'], ports['RST']

    # Memory model sitting behind the design's 'myaxi' interface.
    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                memimg_name=memimg_name)
    memory.connect(ports, 'myaxi')

    # AXI-lite controller that drives the design's 'saxi' interface.
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # NOTE: ctrl is handed to vthread.Thread below, so its statements are
    # kept exactly as they were; only comments are added.
    def ctrl():
        # idle loop before touching the memory model
        for i in range(100):
            pass

        # rewrite the first 16 words of the memory model
        for i in range(16):
            # word addressing
            r = memory.read_word(i, 0, datawidth)
            print('read: mem[%d] -> %x' % (i, r))
            # word addressing
            w = (r + i + 100) % (2**datawidth - 1)
            memory.write_word(i, 0, w, datawidth)
            print('write: mem[%d] <- %x' % (i, w))

        # write 1 to register address 0
        awaddr = 0
        _saxi.write(awaddr, 1)

        # poll register address 4 until it reads non-zero
        araddr = 4
        v = _saxi.read(araddr)
        while v == 0:
            v = _saxi.read(araddr)

        # register address 8 carries the result flag
        araddr = 8
        v = _saxi.read(araddr)
        if v:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    ctrl_thread = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    ctrl_thread.start()

    dut_params = m.connect_params(led)
    dut_ports = m.connect_ports(led)
    m.Instance(led, 'uut', params=dut_params, ports=dut_ports)

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)

    # After the reset sequence, wait a fixed delay and stop the simulation.
    reset_seq = simulation.setup_reset(m, rst, m.make_reset(), period=100)
    reset_seq.add(Delay(1000000), Systask('finish'))

    return m
def run(act_shape=(1, 7, 7, 15), weight_shape=(7, 3, 3, 15),
        bias_shape=None, scale_shape=None,
        act_dtype=ng.int32, weight_dtype=ng.int32,
        bias_dtype=ng.int32, scale_dtype=ng.int32,
        out_dtype=ng.int32,
        conv2d_stride=(1, 1, 1, 1),
        rshift_mul=None, rshift_sum=None, rshift_out=None,
        act_func=None,
        par_ich=1, par_och=1, par_col=1, par_row=1,
        concur_och=None, stationary='filter',
        input_ram_size=None, filter_ram_size=None,
        bias_ram_size=None, scale_ram_size=None,
        out_ram_size=None,
        ksize=(1, 2, 2, 1), pool_stride=(1, 2, 2, 1),
        par=1,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Generate and simulate hardware computing avg_pool(conv2d(act, weight)).

    Args:
        act_shape, weight_shape: Shapes of the activation placeholder and
            the weight variable.
        bias_shape, scale_shape: Shapes of the optional bias / scale
            variables; None disables the respective variable.
        act_dtype, weight_dtype, bias_dtype, scale_dtype, out_dtype: NNgen
            dtypes of the tensors.
        conv2d_stride, rshift_mul, rshift_sum, rshift_out, act_func,
        par_ich, par_och, par_col, par_row, concur_och, stationary,
        input_ram_size, filter_ram_size, bias_ram_size, scale_ram_size,
        out_ram_size: Passed positionally to ng.conv2d.
        ksize, pool_stride, par: Passed to ng.avg_pool.
        axi_datawidth: Passed as 'maxi_datawidth' and to the memory model.
        silent: Forwarded to ng.to_veriloggen.
        filename: If not None, the testbench Verilog is written there.
        simtype: Simulator name passed to simulation.Simulator.
        outputfile: Simulator output file; defaults to '<this file>.out'.

    Returns:
        The simulator output text. For verilator, a final line starting
        with '-' is removed.
    """
    # create target hardware
    act = ng.placeholder(act_dtype, shape=act_shape, name='act')
    weight = ng.variable(weight_dtype, shape=weight_shape, name='weight')

    if bias_shape is not None:
        bias = ng.variable(bias_dtype, bias_shape, name='bias')
    else:
        bias = None

    if scale_shape is not None:
        scale = ng.variable(scale_dtype, scale_shape, name='scale')
    else:
        scale = None

    tmp = ng.conv2d(act, weight, conv2d_stride,
                    bias, scale,
                    rshift_mul, rshift_sum, rshift_out,
                    act_func, 'SAME',
                    out_dtype, ng.int32, ng.int32,
                    'conv2d',
                    par_ich, par_och, par_col, par_row,
                    concur_och, stationary,
                    input_ram_size, filter_ram_size,
                    bias_ram_size, scale_ram_size,
                    None, None, None,
                    out_ram_size)

    out = ng.avg_pool(tmp, ksize=ksize,
                      strides=pool_stride,
                      sum_dtype=ng.int32, dtype=out_dtype, par=par)

    targ = ng.to_veriloggen([out], 'matrix_conv2d_avg_pool', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    vact = np.arange(act.length, dtype=np.int64).reshape(act.shape) % [16]
    vweight = np.arange(weight.length,
                        dtype=np.int64).reshape(weight.shape) % [32] - [16]

    if bias is not None:
        vbias = np.arange(bias.length,
                          dtype=np.int64).reshape(bias.shape) % [4]
    else:
        vbias = None

    if scale is not None:
        vscale = np.arange(scale.length,
                           dtype=np.int64).reshape(scale.shape) % [6]
    else:
        vscale = None

    eval_outs = ng.eval([out], act=vact, weight=vweight, bias=vbias, scale=vscale)
    vout = eval_outs[0]

    # to memory image
    # The expected-result region is placed one 4096-byte-aligned slot past
    # the highest tensor address; scratch space follows it.
    size_max = int(math.ceil(max(act.memory_size, weight.memory_size,
                                 bias.memory_size if bias is not None else 0,
                                 scale.memory_size if scale is not None else 0,
                                 out.memory_size) / 4096)) * 4096
    check_addr = max(act.addr, weight.addr,
                     bias.addr if bias is not None else -1,
                     scale.addr if scale is not None else -1,
                     out.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)], dtype=np.int64)
    mem = mem + [100]

    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich))

    axi.set_memory(mem, vweight, memimg_datawidth,
                   weight_dtype.width, weight.addr,
                   max(int(math.ceil(axi_datawidth / weight_dtype.width)), par_ich))

    if bias is not None:
        axi.set_memory(mem, vbias, memimg_datawidth,
                       bias_dtype.width, bias.addr,
                       max(int(math.ceil(axi_datawidth / bias_dtype.width)), par_och))

    if scale is not None:
        axi.set_memory(mem, vscale, memimg_datawidth,
                       scale_dtype.width, scale.addr,
                       max(int(math.ceil(axi_datawidth / scale_dtype.width)), par_och))

    axi.set_memory(mem, vout, memimg_datawidth,
                   out_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / out_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for y in range(out.shape[1]):
                for x in range(out.shape[2]):
                    for ch in range(out.shape[3]):
                        orig = memory.read_word(bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3]
                                                + y * out.aligned_shape[2] * out.aligned_shape[3]
                                                + x * out.aligned_shape[3] + ch,
                                                out.addr, out_dtype.width)
                        check = memory.read_word(bat * out.aligned_shape[1] * out.aligned_shape[2] * out.aligned_shape[3]
                                                 + y * out.aligned_shape[2] * out.aligned_shape[3]
                                                 + x * out.aligned_shape[3] + ch,
                                                 check_addr, out_dtype.width)

                        if vthread.verilog.NotEql(orig, check):
                            print('NG (', bat, y, x, ch,
                                  ') orig: ', orig, ' check: ', check)
                            ok = False
                        # else:
                        #    print('OK (', bat, y, x, ch,
                        #          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Guard against empty simulator output before peeking at the last line.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def run(a_shape=(15, 15), b_shape=(15, 15),
        a_dtype=ng.int32, b_dtype=ng.int32, c_dtype=ng.int32,
        par=1, axi_datawidth=32, interrupt_name='irq', silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Generate and simulate hardware with two extern operators and interrupts.

    The dataflow is c = extern2(extern1(a + b, b + a) - a) - b, where the
    control thread plays the software side of both extern operators. The
    thread runs the design twice, issuing software resets while idle,
    before resuming an extern operator, and during a master-AXI
    transaction, and checks the result after each run.

    Args:
        a_shape, b_shape: Shapes of the two placeholders; a_shape is also
            the shape of both extern operators.
        a_dtype, b_dtype, c_dtype: NNgen dtypes of the inputs and the
            intermediate/result tensors.
        par: Parallelism passed to the ng.add operators and used when
            laying out the memory image.
        axi_datawidth: Passed as 'maxi_datawidth' and to the memory model.
        interrupt_name: Name of the interrupt port; passed as the
            'interrupt_name' config entry and used to look the port up.
        silent: Forwarded to ng.to_veriloggen.
        filename: If not None, the testbench Verilog is written there.
        simtype: Simulator name passed to simulation.Simulator.
        outputfile: Simulator output file; defaults to '<this file>.out'.

    Returns:
        The simulator output text. For verilator, a final line starting
        with '-' is removed.
    """
    # create target hardware
    a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')

    d = ng.add(a, b, dtype=c_dtype, par=par)
    e = ng.add(b, a, dtype=c_dtype, par=par)

    # SW returns ng.add(x, y)
    f = ng.extern([d, e], shape=a_shape, opcode=0x1,
                  func=lambda x, y: x + y)
    g = ng.sub(f, a)

    # SW returns its input (g) as-is
    h = ng.extern([g], shape=a_shape, opcode=0x2,
                  func=lambda x: x)
    c = ng.sub(h, b)

    targ = ng.to_veriloggen([c], 'matrix_extern', silent=silent,
                            config={'maxi_datawidth': axi_datawidth,
                                    'interrupt_name': interrupt_name})

    # verification data
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [16]
    vb = np.arange(b.length, dtype=np.int64).reshape(b.shape) % [32] + [16]

    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # to memory image
    # The expected-result region is placed one 4096-byte-aligned slot past
    # the highest tensor address; scratch space follows it.
    size_max = int(
        math.ceil(
            max(a.memory_size, b.memory_size, c.memory_size) / 4096)) * 4096
    check_addr = max(a.addr, b.addr, c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)], dtype=np.int64)
    mem = mem + [100]

    axi.set_memory(mem, va, memimg_datawidth,
                   a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth,
                   b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))
    axi.set_memory(mem, vc, memimg_datawidth,
                   c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    irq = ports[interrupt_name]
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg_datawidth=memimg_datawidth,
                                memimg=mem, memimg_name=memimg_name)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # Number of rows when all leading dimensions of c are flattened.
    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    # Wait for the interrupt line, check that the interrupt status register
    # equals irq_bit (otherwise report failure and finish), then acknowledge
    # by writing irq_bit to the interrupt acknowledge register.
    def irq_join(saxi, irq_bit):
        while irq == 0:
            pass

        araddr = ng.control_reg_interrupt_isr * 4
        irq_stat = saxi.read(araddr)
        if irq_stat != irq_bit:
            print('# Unexpected irq signal: %d' % irq_stat)
            print('# verify: FAILED')
            vthread.finish()

        print('# irq stat = %d' % irq_stat)
        awaddr = ng.control_reg_interrupt_iar * 4
        saxi.write(awaddr, irq_bit)

    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        araddr_ext_snd = ng.control_reg_extern_send * 4
        awaddr_ext_rcv = ng.control_reg_extern_recv * 4
        awaddr_irq_ier = ng.control_reg_interrupt_ier * 4
        araddr_irq_isr = ng.control_reg_interrupt_isr * 4
        awaddr_irq_iar = ng.control_reg_interrupt_iar * 4

        _saxi.write(awaddr_irq_ier, 3)  # irq enable

        ng.sim.sw_rst(_saxi)
        print('# 0st software reset (during idle)')
        for i in range(100):
            pass

        # no irq busy by software reset when idle
        irq_stat = _saxi.read(araddr_irq_isr)
        if irq_stat != 0:
            print('# Unexpected irq signal: %d' % irq_stat)
            print('# verify: FAILED')
            vthread.finish()
        print('# irq stat = %d' % irq_stat)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# 1st test start')

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - d.default_global_addr
                y_offset = tmp_addr - e.default_global_addr
                z_offset = tmp_addr - f.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     d.addr + x_offset, c_dtype.width)
                y = memory.read_word(i * c.aligned_shape[-1] + j,
                                     e.addr + y_offset, c_dtype.width)
                z = x + y
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  f.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        # software reset
        ng.sim.sw_rst(_saxi)
        print('# 1st software reset (before resume)')

        # from extern-send
        irq_join(_saxi, 1)

        # restart
        ng.sim.start(_saxi)
        print('# Restart')

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - d.default_global_addr
                y_offset = tmp_addr - e.default_global_addr
                z_offset = tmp_addr - f.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     d.addr + x_offset, c_dtype.width)
                y = memory.read_word(i * c.aligned_shape[-1] + j,
                                     e.addr + y_offset, c_dtype.width)
                z = x + y
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  f.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - g.default_global_addr
                z_offset = tmp_addr - h.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     g.addr + x_offset, c_dtype.width)
                z = x
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  h.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # from extern-send
        irq_join(_saxi, 1)

        #ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok_1st = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j,
                                        c.addr, c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok_1st = False
                # else:
                #    print('OK', i, j, orig, check)

        # 2nd test
        # start
        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# 2nd test start')

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - d.default_global_addr
                y_offset = tmp_addr - e.default_global_addr
                z_offset = tmp_addr - f.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     d.addr + x_offset, c_dtype.width)
                y = memory.read_word(i * c.aligned_shape[-1] + j,
                                     e.addr + y_offset, c_dtype.width)
                z = x + y
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  f.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # wait until the design issues a write address on the master AXI
        while (memory.waddr.awvalid) == 0:
            pass

        ng.sim.sw_rst(_saxi)
        print('# 2nd software reset (during Master AXI transaction)')

        irq_join(_saxi, 1)  # irq busy by software reset

        # restart
        ng.sim.start(_saxi)
        print('# Restart')

        # from extern-send
        irq_join(_saxi, 2)
        # Read the opcode like every other extern-send site does; previously
        # a stale value of v was printed here.
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - d.default_global_addr
                y_offset = tmp_addr - e.default_global_addr
                z_offset = tmp_addr - f.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     d.addr + x_offset, c_dtype.width)
                y = memory.read_word(i * c.aligned_shape[-1] + j,
                                     e.addr + y_offset, c_dtype.width)
                z = x + y
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  f.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # from extern-send
        irq_join(_saxi, 2)
        v = _saxi.read(araddr_ext_snd)
        print('# opcode = %d' % v)

        for i in range(num_rep):
            for j in range(c.shape[-1]):
                x_offset = tmp_addr - g.default_global_addr
                z_offset = tmp_addr - h.default_global_addr
                x = memory.read_word(i * c.aligned_shape[-1] + j,
                                     g.addr + x_offset, c_dtype.width)
                z = x
                memory.write_word(i * c.aligned_shape[-1] + j,
                                  h.addr + z_offset, z, c_dtype.width)

        # to extern-recv
        _saxi.write(awaddr_ext_rcv, 1)

        # termination
        irq_join(_saxi, 1)

        #ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok_2nd = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j,
                                        c.addr, c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok_2nd = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok_1st and ok_2nd:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Guard against empty simulator output before peeking at the last line.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def run(
        act_dtype=ng.int16,
        weight_dtype=ng.int8,
        bias_dtype=ng.int32,
        scale_dtype=ng.int8,
        with_batchnorm=True,
        disable_fusion=False,
        conv2d_par_ich=1,
        conv2d_par_och=1,
        conv2d_par_col=1,
        conv2d_par_row=1,
        conv2d_concur_och=None,
        conv2d_stationary='filter',
        pool_par=1,
        elem_par=1,
        chunk_size=64,
        axi_datawidth=32,
        silent=False,
        filename=None,
        # simtype='iverilog',
        # simtype='verilator',
        simtype=None,  # no RTL simulation
        outputfile=None):
    """Convert torchvision ResNet18 to NNgen hardware and optionally simulate it.

    The flow is: export the PyTorch model to ONNX, import it with
    ``ng.from_onnx``, quantize, assign hardware attributes, compare the
    quantized dataflow against the float PyTorch model in software, emit
    IP-XACT via ``ng.to_ipxact``, and finally (only when ``simtype`` is not
    None) build a test bench and run an RTL simulation.

    Files touched: writes 'resnet18_imagenet.onnx'; reads 'car.png' and
    'imagenet_class_index.json' from the current directory.

    Args:
        act_dtype: dtype passed as default placeholder and operator dtype.
        weight_dtype: dtype passed as default variable and constant dtype.
        bias_dtype: dtype passed as default bias dtype.
        scale_dtype: dtype passed as default scale dtype.
        with_batchnorm: must be True; anything else raises ValueError.
        disable_fusion: forwarded to ``ng.from_onnx``.
        conv2d_par_ich, conv2d_par_och, conv2d_par_col, conv2d_par_row,
        conv2d_concur_och, conv2d_stationary: attributes applied to every
            ``ng.conv2d`` operator.
        pool_par: ``par`` attribute applied to pooling operators.
        elem_par: ``par`` attribute applied to element-wise operators.
        chunk_size: alignment (in address units) of the memory-image regions.
        axi_datawidth: used as 'maxi_datawidth' and as the memory model width.
        silent: forwarded to ``ng.to_ipxact``.
        filename: when not None, the test bench is written there as Verilog.
        simtype: simulator name handed to ``simulation.Simulator``. When it is
            None the function calls ``sys.exit()`` after hardware generation
            and never returns.
        outputfile: simulator output file; defaults to '<this file>.out'.

    Returns:
        The text returned by the simulator run. For 'verilator', a trailing
        line starting with '-' is stripped.

    Raises:
        ValueError: if ``with_batchnorm`` is False.
    """
    # input mean and standard deviation
    imagenet_mean = np.array([0.485, 0.456, 0.406]).astype(np.float32)
    imagenet_std = np.array([0.229, 0.224, 0.225]).astype(np.float32)
    act_shape = (1, 224, 224, 3)

    if not with_batchnorm:
        raise ValueError('with_batchnorm must be True for ResNet18.')

    # pytorch model
    model = torchvision.models.resnet18(pretrained=True)

    # Pytorch to ONNX
    onnx_filename = 'resnet18_imagenet.onnx'
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    model.eval()
    torch.onnx.export(model, dummy_input, onnx_filename,
                      input_names=input_names, output_names=output_names)

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # ONNX to NNgen
    dtypes = {}
    (outputs, placeholders, variables,
     constants, operators) = ng.from_onnx(onnx_filename,
                                          value_dtypes=dtypes,
                                          default_placeholder_dtype=act_dtype,
                                          default_variable_dtype=weight_dtype,
                                          default_constant_dtype=weight_dtype,
                                          default_operator_dtype=act_dtype,
                                          default_scale_dtype=scale_dtype,
                                          default_bias_dtype=bias_dtype,
                                          disable_fusion=disable_fusion)

    # --------------------
    # (2) Assign quantized weights to the NNgen operators
    # --------------------

    if act_dtype.width > 8:
        act_scale_factor = 128
    else:
        act_scale_factor = int(round(2**(act_dtype.width - 1) * 0.5))

    input_scale_factors = {'act': act_scale_factor}
    input_means = {'act': imagenet_mean * act_scale_factor}
    input_stds = {'act': imagenet_std * act_scale_factor}

    ng.quantize(outputs, input_scale_factors, input_means, input_stds)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=conv2d_par_ich,
                         par_och=conv2d_par_och,
                         par_col=conv2d_par_col,
                         par_row=conv2d_par_row,
                         concur_och=conv2d_concur_och,
                         stationary=conv2d_stationary)

        if isinstance(op, (ng.avg_pool, ng.max_pool,
                           ng.avg_pool_serial, ng.max_pool_serial)):
            op.attribute(par=pool_par)

        if ng.is_elementwise_operator(op):
            op.attribute(par=elem_par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    act = placeholders['act']
    out = outputs['out']

    # verification data
    img = np.array(PIL.Image.open('car.png').convert('RGB')).astype(np.float32)
    img = img.reshape([1] + list(img.shape))

    img = img / 255
    img = (img - imagenet_mean) / imagenet_std

    # execution on pytorch
    model_input = np.broadcast_to(img, act_shape)

    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    if act.perm is not None and len(model_out.shape) == len(act.shape):
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    # software-based verification
    vact = img * act_scale_factor
    vact = np.clip(vact,
                   -1.0 * (2**(act.dtype.width - 1) - 1),
                   1.0 * (2**(act.dtype.width - 1) - 1))
    vact = np.round(vact).astype(np.int64)
    vact = np.broadcast_to(vact, act_shape)

    # compare outputs of hidden layers
    # The first conv2d that is not a matmul stands in for conv1+bn1+relu.
    relu_op = [v for v in operators.values()
               if isinstance(v, ng.conv2d) and not isinstance(v, ng.matmul)][0]
    maxpool_op = [v for v in operators.values()
                  if isinstance(v, (ng.max_pool, ng.max_pool_serial))][0]
    relu_ops = [v for v in operators.values() if isinstance(v, ng.relu)]
    layer1_0_op = relu_ops[0]
    layer1_op = relu_ops[1]
    layer2_0_op = relu_ops[2]
    layer2_op = relu_ops[3]
    layer3_0_op = relu_ops[4]
    layer3_op = relu_ops[5]
    layer4_0_op = relu_ops[6]
    layer4_op = relu_ops[7]
    avgpool_op = [v for v in operators.values()
                  if isinstance(v, (ng.avg_pool, ng.avg_pool_serial))][0]
    fc_op = [v for v in operators.values() if isinstance(v, ng.matmul)][0]
    sub_ops = [relu_op, maxpool_op,
               layer1_0_op, layer1_op, layer2_0_op, layer2_op,
               layer3_0_op, layer3_op, layer4_0_op, layer4_op,
               avgpool_op, fc_op]
    sub_outs = ng.eval(sub_ops, act=vact)
    # All but the last (fc) output are transposed to the PyTorch axis order.
    sub_outs = ([sub_out.transpose([0, 3, 1, 2]) for sub_out in sub_outs[:-1]] +
                sub_outs[-1:])
    sub_scale_factors = [sub_op.scale_factor for sub_op in sub_ops]

    model.eval()

    def forward_through(*layers):
        # Evaluate the float PyTorch model up to (and including) the last
        # given layer, starting from the same input as the full model.
        return nn.Sequential(*layers)(
            torch.from_numpy(model_input)).detach().numpy()

    class Flatten(nn.Module):
        def forward(self, x):
            return x.view(x.size(0), -1)

    stem = (model.conv1, model.bn1, model.relu, model.maxpool)

    model_relu_out = forward_through(model.conv1, model.bn1, model.relu)
    model_maxpool_out = forward_through(*stem)
    model_layer1_0_out = forward_through(*stem, model.layer1[0])
    model_layer1_out = forward_through(*stem, model.layer1)
    model_layer2_0_out = forward_through(*stem, model.layer1,
                                         model.layer2[0])
    model_layer2_out = forward_through(*stem, model.layer1, model.layer2)
    model_layer3_0_out = forward_through(*stem, model.layer1, model.layer2,
                                         model.layer3[0])
    model_layer3_out = forward_through(*stem, model.layer1, model.layer2,
                                       model.layer3)
    model_layer4_0_out = forward_through(*stem, model.layer1, model.layer2,
                                         model.layer3, model.layer4[0])
    model_layer4_out = forward_through(*stem, model.layer1, model.layer2,
                                       model.layer3, model.layer4)
    model_avgpool_out = forward_through(*stem, model.layer1, model.layer2,
                                        model.layer3, model.layer4,
                                        model.avgpool)
    model_fc_out = forward_through(*stem, model.layer1, model.layer2,
                                   model.layer3, model.layer4,
                                   model.avgpool, Flatten(), model.fc)

    model_outs = [model_relu_out, model_maxpool_out,
                  model_layer1_0_out, model_layer1_out,
                  model_layer2_0_out, model_layer2_out,
                  model_layer3_0_out, model_layer3_out,
                  model_layer4_0_out, model_layer4_out,
                  model_avgpool_out, model_fc_out]
    scaled_outs = [layer_out * scale_factor
                   for layer_out, scale_factor in zip(model_outs, sub_scale_factors)]

    # The statistics below are not consumed by the code; they are kept for
    # inspection at the (commented-out) breakpoint further down.
    max_diffs = [layer_out.max() / sub_out.max()
                 for layer_out, sub_out in zip(scaled_outs, sub_outs)]
    overflows = [np.sum(np.abs(sub_out) >= abs(2**(sub_op.dtype.width - 1) - 1))
                 for sub_op, sub_out in zip(sub_ops, sub_outs)]
    mean_square_errors = [np.sum((sub_out - layer_out)**2) / sub_out.size
                          for layer_out, sub_out in zip(scaled_outs, sub_outs)]
    corrcoefs = [np.corrcoef(layer_out.reshape([-1]), sub_out.reshape([-1]))
                 for layer_out, sub_out in zip(model_outs, sub_outs)]

    # compare prediction results
    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    mean_square_error = np.sum((vout - scaled_model_out)**2) / vout.size
    corrcoef = np.corrcoef(model_out.reshape([-1]), vout.reshape([-1]))

    # Close the label file deterministically instead of leaking the handle.
    with open('imagenet_class_index.json', 'r') as class_index_file:
        class_index = json.load(class_index_file)
    labels = {int(key): value for (key, value) in class_index.items()}

    mout = scaled_model_out
    for bat in range(mout.shape[0]):
        m_top10 = list(sorted(enumerate(mout[bat]),
                              key=lambda x: x[1], reverse=True))[:10]
        m_top10_indexes = [index for index, value in m_top10]
        v_top10 = list(sorted(enumerate(vout[bat]),
                              key=lambda x: x[1], reverse=True))[:10]
        v_top10_indexes = [index for index, value in v_top10]
        num_hit = 0
        score = 0
        for index, value in m_top10:
            print("# mout: %s (%d) = %f" % (str(labels[index]), index, value))
        for index, value in v_top10:
            print("# vout: %s (%d) = %d" % (str(labels[index]), index, value))
            if index in m_top10_indexes:
                num_hit += 1
                # A hit scores more the closer its rank is in both lists.
                score += 10 - abs(m_top10_indexes.index(index) -
                                  v_top10_indexes.index(index))
        print("# top-10 hit: %d" % num_hit)
        print("# top-10 score: %d" % score)

    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    # to Veriloggen object
    # targ = ng.to_veriloggen([out], 'resnet18', silent=silent,
    #                        config={'maxi_datawidth': axi_datawidth})

    # to IP-XACT (the method returns Veriloggen object, as well as to_veriloggen)
    targ = ng.to_ipxact([out], 'resnet18', silent=silent,
                        config={'maxi_datawidth': axi_datawidth})

    # to Verilog HDL RTL (the method returns a source code text)
    # rtl = ng.to_verilog([out], 'resnet18', silent=silent,
    #                    config={'maxi_datawidth': axi_datawidth})

    # --------------------
    # (6) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    # With no simulator selected the whole process ends here.
    if simtype is None:
        sys.exit()

    # to memory image
    param_data = ng.export_ndarray([out], chunk_size)
    param_bytes = len(param_data)

    # Regions are laid out back to back, each rounded up to chunk_size.
    variable_addr = int(math.ceil(
        (act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil(
        (variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil(
        (check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    # mem = np.zeros([1024 * 1024 * 256 // (memimg_datawidth // 8)], dtype=np.int64)
    # NOTE(review): elements are int16 while memimg_datawidth is 32 --
    # confirm axi.set_memory never stores a value wider than 16 bits here.
    mem = np.zeros([1024 * 1024 * 1024 // (memimg_datawidth // 8)], dtype=np.int16)
    mem = mem + [100]

    # placeholder
    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth,
                   8, variable_addr)

    # verification data
    axi.set_memory(mem, vout, memimg_datawidth,
                   act_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # NOTE: ctrl is handed to vthread.Thread below, so its body is kept to
    # the plain statements used throughout this file.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(out.shape[0]):
            for x in range(out.shape[1]):
                orig = memory.read_word(bat * out.aligned_shape[1] + x,
                                        out.addr, act_dtype.width)
                check = memory.read_word(bat * out.aligned_shape[1] + x,
                                         check_addr, act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', bat, x,
                          ') orig: ', orig, ' check: ', check)
                    ok = False
                else:
                    print('OK (', bat, x,
                          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # Hard stop so a hung design still terminates the simulation.
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Guard against empty simulator output before peeking at the last line.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def run(act_shape=(1, 32, 32, 3),
        act_dtype=ng.int32, weight_dtype=ng.int32,
        bias_dtype=ng.int32, scale_dtype=ng.int32,
        out_dtype=ng.int32,
        with_batchnorm=True, disable_fusion=False,
        conv2d_par_ich=1, conv2d_par_och=1, conv2d_par_col=1, conv2d_par_row=1,
        conv2d_concur_och=None, conv2d_stationary='filter',
        pool_par=1, elem_par=1,
        chunk_size=64, axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Build NNgen hardware for an untrained ResNet18 and simulate it.

    An untrained torchvision ResNet18 (fc replaced by a 10-output Linear) is
    exported to 'resnet18.onnx', imported with ``ng.from_onnx``, quantized,
    converted with ``ng.to_veriloggen`` and run in an RTL simulation whose
    controller thread compares the hardware output in memory against the
    output of ``ng.eval`` on random input.

    Args:
        act_shape: input shape used for the dummy ONNX export input.
        act_dtype: default placeholder dtype.
        weight_dtype: default variable and constant dtype.
        bias_dtype: default bias dtype.
        scale_dtype: default scale dtype.
        out_dtype: default operator dtype; also the word width used when the
            expected output is stored and when the result is read back.
        with_batchnorm: must be True; anything else raises ValueError.
        disable_fusion: forwarded to ``ng.from_onnx``.
        conv2d_par_ich, conv2d_par_och, conv2d_par_col, conv2d_par_row,
        conv2d_concur_och, conv2d_stationary: attributes applied to every
            ``ng.conv2d`` operator.
        pool_par: ``par`` attribute applied to pooling operators.
        elem_par: ``par`` attribute applied to element-wise operators.
        chunk_size: alignment of the memory-image regions.
        axi_datawidth: used as 'maxi_datawidth' and as the memory model width.
        silent: forwarded to ``ng.to_veriloggen``.
        filename: when not None, the test bench is written there as Verilog.
        simtype: simulator name handed to ``simulation.Simulator``.
        outputfile: simulator output file; defaults to '<this file>.out'.

    Returns:
        The text returned by the simulator run. For 'verilator', a trailing
        line starting with '-' is stripped.

    Raises:
        ValueError: if ``with_batchnorm`` is False.
    """
    if not with_batchnorm:
        raise ValueError('with_batchnorm must be True for ResNet18.')

    # pytorch model
    model = torchvision.models.resnet18(pretrained=False)
    model.conv1.in_channels = act_shape[-1]
    model.fc = nn.Linear(in_features=model.fc.in_features, out_features=10, bias=True)

    # Pytorch to ONNX
    onnx_filename = 'resnet18.onnx'
    dummy_input = torch.randn(*act_shape).transpose(1, 3)
    input_names = ['act']
    output_names = ['out']
    model.eval()
    torch.onnx.export(model, dummy_input, onnx_filename,
                      input_names=input_names, output_names=output_names)

    # ONNX to NNgen
    dtypes = {}
    (outputs, placeholders, variables,
     constants, operators) = ng.from_onnx(onnx_filename,
                                          value_dtypes=dtypes,
                                          default_placeholder_dtype=act_dtype,
                                          default_variable_dtype=weight_dtype,
                                          default_constant_dtype=weight_dtype,
                                          default_operator_dtype=out_dtype,
                                          default_scale_dtype=scale_dtype,
                                          default_bias_dtype=bias_dtype,
                                          disable_fusion=disable_fusion)

    # default linear quantization
    value_ranges = {'act': (0, 255)}
    ng.quantize(outputs, value_ranges=value_ranges)

    # set attribute
    for op in operators.values():
        if isinstance(op, ng.conv2d):
            op.attribute(par_ich=conv2d_par_ich,
                         par_och=conv2d_par_och,
                         par_col=conv2d_par_col,
                         par_row=conv2d_par_row,
                         concur_och=conv2d_concur_och,
                         stationary=conv2d_stationary)

        if isinstance(op, (ng.avg_pool, ng.max_pool,
                           ng.avg_pool_serial, ng.max_pool_serial)):
            op.attribute(par=pool_par)

        if ng.is_elementwise_operator(op):
            op.attribute(par=elem_par)

    # create target hardware
    act = placeholders['act']
    out = outputs['out']

    targ = ng.to_veriloggen([out], 'onnx_resnet18', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    # Random normal input, clipped to +-3 and rescaled to the signed integer
    # range derived from value_ranges['act'].
    vact = np.random.normal(size=act.length).reshape(act.shape)
    vact = np.clip(vact, -3.0, 3.0)
    vact_min_val, vact_max_val = value_ranges['act']
    vact_max_abs_range = max(abs(vact_min_val), abs(vact_max_val))
    vact_width = vact_max_abs_range.bit_length() + 1
    vact = vact * (1.0 * (2 ** (vact_width - 1) - 1)) / 3.0
    vact = np.round(vact).astype(np.int64)

    eval_outs = ng.eval([out], act=vact)
    vout = eval_outs[0]

    # exec on pytorch
    model_input = vact.astype(np.float32)
    if act.perm is not None:
        model_input = np.transpose(model_input, act.reversed_perm)

    model.eval()
    model_out = model(torch.from_numpy(model_input)).detach().numpy()
    if act.perm is not None and len(model_out.shape) == len(act.shape):
        model_out = np.transpose(model_out, act.perm)
    scaled_model_out = model_out * out.scale_factor

    mout = scaled_model_out.astype(np.int64)
    for bat in range(vout.shape[0]):
        # argmax returns the first maximum, like list.index(max) did.
        vout_max_index = int(np.argmax(vout[bat]))
        vout_max = vout[bat][vout_max_index]
        mout_max_index = int(np.argmax(mout[bat]))
        mout_max = mout[bat][mout_max_index]
        print("# vout[%d]: max = %d, index = %d" % (bat, vout_max, vout_max_index))
        print("# mout[%d]: max = %d, index = %d" % (bat, mout_max, mout_max_index))

    # out_diff = vout - scaled_model_out
    # out_err = out_diff / (scaled_model_out + 0.00000001)
    # max_out_err = np.max(np.abs(out_err))

    # breakpoint()

    # if max_out_err > 0.1:
    #     raise ValueError("too large output error: %f > 0.1" % max_out_err)

    # to memory image
    param_data = ng.make_param_array(variables, constants, chunk_size)
    param_bytes = len(param_data)

    # Regions are laid out back to back, each rounded up to chunk_size.
    variable_addr = int(math.ceil((act.addr + act.memory_size) / chunk_size)) * chunk_size
    check_addr = int(math.ceil((variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(math.ceil((check_addr + out.memory_size) / chunk_size)) * chunk_size

    memimg_datawidth = 32
    # NOTE(review): other flows size the image as
    # `N // (memimg_datawidth // 8)`; here the divisor is the width in bits,
    # giving an 8x smaller image. Confirm this is large enough for the
    # ResNet18 parameters before changing it.
    mem = np.zeros([1024 * 1024 * 256 // memimg_datawidth], dtype=np.int64)
    mem = mem + [100]

    # placeholder
    axi.set_memory(mem, vact, memimg_datawidth,
                   act_dtype.width, act.addr,
                   max(int(math.ceil(axi_datawidth / act_dtype.width)), conv2d_par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth,
                   8, variable_addr)

    # verification data
    # Stored with out_dtype.width so it matches the width the controller
    # thread uses when reading it back (identical to before for the defaults,
    # where act_dtype and out_dtype are both int32).
    axi.set_memory(mem, vout, memimg_datawidth,
                   out_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / out_dtype.width)), conv2d_par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # Number of rows when all leading dimensions of `out` are flattened.
    # The output of this model comes from a Linear layer, so indexing
    # out.shape[2] / out.shape[3] (as a 4-D loop nest would) is out of range.
    # NOTE(review): assumes only the last dimension is padded in
    # aligned_shape -- confirm for outputs with rank > 2.
    num_rep = 1
    for dim in out.shape[:-1]:
        num_rep *= dim

    # NOTE: ctrl is handed to vthread.Thread below, so its body is kept to
    # the plain statements used throughout this file.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(num_rep):
            for j in range(out.shape[-1]):
                orig = memory.read_word(i * out.aligned_shape[-1] + j,
                                        out.addr, out_dtype.width)
                check = memory.read_word(i * out.aligned_shape[-1] + j,
                                         check_addr, out_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', i, j,
                          ') orig: ', orig, ' check: ', check)
                    ok = False
                # else:
                #     print('OK (', i, j,
                #           ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # Hard stop so a hung design still terminates the simulation.
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Guard against empty simulator output before peeking at the last line.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def run(a_shape=(7, 15), b_shape=(7, 15),
        a_dtype=ng.int32, b_dtype=ng.int32, c_dtype=ng.int32,
        par=1, axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Build NNgen hardware for an ONNX matrix add and simulate it.

    A ``MatrixAdd`` PyTorch model is exported to 'onnx_matrix_add.onnx',
    imported with ``ng.from_onnx``, converted with ``ng.to_veriloggen`` and
    run in an RTL simulation whose controller thread compares the hardware
    result in memory against the output of ``ng.eval``.

    Args:
        a_shape: shape of the dummy input 'a' used for the ONNX export.
        b_shape: shape of the dummy input 'b' used for the ONNX export.
        a_dtype: dtype assigned to 'a'.
        b_dtype: dtype assigned to 'b'.
        c_dtype: dtype assigned to the output 'c'.
        par: ``par`` attribute applied to every ``ng.add`` operator.
        axi_datawidth: used as 'maxi_datawidth' and as the memory model width.
        silent: forwarded to ``ng.to_veriloggen``.
        filename: when not None, the test bench is written there as Verilog.
        simtype: simulator name handed to ``simulation.Simulator``.
        outputfile: simulator output file; defaults to '<this file>.out'.

    Returns:
        The text returned by the simulator run. For 'verilator', a trailing
        line starting with '-' is stripped.
    """
    # model definition
    model = MatrixAdd()

    # Pytorch to ONNX
    onnx_filename = 'onnx_matrix_add.onnx'
    dummy_a = torch.randn(*a_shape)
    dummy_b = torch.randn(*b_shape)
    dummy_inputs = (dummy_a, dummy_b)
    input_names = ['a', 'b']
    output_names = ['c']
    model.eval()
    torch.onnx.export(model, dummy_inputs, onnx_filename,
                      input_names=input_names, output_names=output_names)

    # ONNX to NNgen
    value_dtypes = {'a': a_dtype,
                    'b': b_dtype,
                    'c': c_dtype}

    (outputs, placeholders, variables,
     constants, operators) = ng.from_onnx(onnx_filename,
                                          value_dtypes=value_dtypes,
                                          default_placeholder_dtype=ng.int32,
                                          default_variable_dtype=ng.int32,
                                          default_constant_dtype=ng.int32,
                                          default_operator_dtype=ng.int32,
                                          default_scale_dtype=ng.int32,
                                          default_bias_dtype=ng.int32,
                                          disable_fusion=False)

    # set attribute
    for op in operators.values():
        if isinstance(op, ng.add):
            op.attribute(par=par)

    # create target hardware
    a = placeholders['a']
    b = placeholders['b']
    c = outputs['c']

    targ = ng.to_veriloggen([c], 'onnx_matrix_add', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [5]
    vb = (np.arange(b.length, dtype=np.int64).reshape(b.shape) + [100]) % [6]

    eval_outs = ng.eval([c], a=va, b=vb)
    vc = eval_outs[0]

    # exec on pytorch
    model_a = va.astype(np.float32)
    model_b = vb.astype(np.float32)
    if a.perm is not None:
        model_a = np.transpose(model_a, a.reversed_perm)
    if b.perm is not None:
        model_b = np.transpose(model_b, b.reversed_perm)

    model.eval()
    model_c = model(torch.from_numpy(model_a), torch.from_numpy(model_b)).detach().numpy()
    if a.perm is not None:
        model_c = np.transpose(model_c, a.perm)
    scaled_model_c = model_c * c.scale_factor

    # Relative error against the PyTorch result; only consumed by the
    # disabled check below.
    c_diff = vc - scaled_model_c
    c_err = c_diff / (scaled_model_c + 0.00000001)
    max_c_err = np.max(np.abs(c_err))

    # if max_c_err > 0.1:
    #     raise ValueError("too large output error: %f > 0.1" % max_c_err)

    # to memory image
    param_data = ng.export_ndarray([c])
    param_bytes = len(param_data)

    # Regions are laid out back to back, each rounded up to a 4096 boundary.
    variable_addr = int(
        math.ceil(max(a.addr + a.memory_size, b.addr + b.memory_size) / 4096)) * 4096
    check_addr = int(math.ceil((variable_addr + param_bytes) / 4096)) * 4096
    tmp_addr = int(math.ceil((check_addr + c.memory_size) / 4096)) * 4096

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // (memimg_datawidth // 8)], dtype=np.int64)
    mem = mem + [100]

    # placeholder
    axi.set_memory(mem, va, memimg_datawidth,
                   a_dtype.width, a.addr,
                   max(int(math.ceil(axi_datawidth / a_dtype.width)), par))
    axi.set_memory(mem, vb, memimg_datawidth,
                   b_dtype.width, b.addr,
                   max(int(math.ceil(axi_datawidth / b_dtype.width)), par))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth,
                   8, variable_addr)

    # verification data
    axi.set_memory(mem, vc, memimg_datawidth,
                   c_dtype.width, check_addr,
                   max(int(math.ceil(axi_datawidth / c_dtype.width)), par))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(
        time_counter.inc()
    )

    # Number of rows when all leading dimensions of c are flattened.
    num_rep = functools.reduce(lambda x, y: x * y, c.shape[:-1], 1)

    # NOTE: ctrl is handed to vthread.Thread below, so its body is kept to
    # the plain statements used throughout this file.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(num_rep):
            for j in range(c.shape[-1]):
                orig = memory.read_word(i * c.aligned_shape[-1] + j,
                                        c.addr, c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[-1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG', i, j, orig, check)
                    ok = False
                # else:
                #    print('OK', i, j, orig, check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(), period=100, polarity='low')

    # Hard stop so a hung design still terminates the simulation.
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Guard against empty simulator output before peeking at the last line.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def mkTest(ich=3, och=10, ch=64, ksize=3, stride=1, col=28, row=28):
    """Assemble the simulation test bench for a small four-layer CNN.

    The network (conv/pool/relu, conv/relu/reshape, two matmul layers) is
    described with NNgen operators, converted by ``ng.to_veriloggen`` and
    instantiated inside a ``Module('test')`` together with an AXI memory
    model, an AXI-Lite controller thread and a cycle counter.

    Args:
        ich: channel count of the input placeholder.
        och: row count of the last weight matrix (w3).
        ch: output-channel count of both conv2d weights.
        ksize: spatial size of both conv2d kernels.
        stride: spatial stride of both conv2d layers.
        col: input width.
        row: input height.

    Returns:
        The ``Module('test')`` object holding the whole test bench.
    """
    conv_strides = (1, stride, stride, 1)
    pool_window = (1, 2, 2, 1)

    # create target hardware

    # layer 0: conv2d, max_pool_serial, relu
    net_input = ng.placeholder(ng.int32, shape=(1, row, col, ich),
                               name='input_layer')
    weight0 = ng.variable(ng.int32, shape=(ch, ksize, ksize, ich), name='w0')
    feat0 = ng.conv2d(net_input, weight0, strides=conv_strides)
    feat0 = ng.max_pool_serial(feat0, ksize=pool_window, strides=pool_window)
    feat0 = ng.relu(feat0)

    # layer 1: conv2d, relu, reshape
    weight1 = ng.variable(ng.int32,
                          shape=(ch, ksize, ksize, feat0.shape[-1]),
                          name='w1')
    feat1 = ng.conv2d(feat0, weight1, strides=conv_strides)
    feat1 = ng.relu(feat1)
    feat1 = ng.reshape(feat1, [-1])

    # layer 2: full-connection
    weight2 = ng.variable(ng.int32, shape=(16, feat1.shape[-1]), name='w2')
    feat2 = ng.matmul(feat1, weight2, transposed_b=True)
    feat2 = ng.relu(feat2)

    # layer 3: full-connection
    weight3 = ng.variable(ng.int32, shape=(och, feat2.shape[-1]), name='w3')
    net_output = ng.matmul(feat2, weight3, transposed_b=True,
                           name='output_layer')

    targ = ng.to_veriloggen([net_output], 'cnn')
    #targ = ng.to_ipxact([net_output], 'cnn')

    # test controller
    m = Module('test')
    m.copy_params(targ)
    sim_ports = m.copy_sim_ports(targ)
    clk = sim_ports['CLK']
    resetn = sim_ports['RESETN']
    # The helper blocks take an active-high reset, the target exports RESETN.
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    axi_memory = axi.AxiMemoryModel(m, 'memory', clk, rst, mem_addrwidth=23)
    axi_memory.connect(sim_ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(sim_ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    cycle_seq = Seq(m, 'seq', clk, rst)
    cycle_seq(
        time_counter.inc()
    )

    # NOTE: ctrl is handed to vthread.Thread below; its statements and local
    # names are left exactly as they were.
    def ctrl():
        for i in range(100):
            pass

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        vthread.finish()

    ctrl_thread = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    ctrl_thread.start()

    m.Instance(targ, 'uut',
               params=m.connect_params(targ),
               ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    reset_seq = simulation.setup_reset(m, resetn, m.make_reset(),
                                       period=100, polarity='low')

    # Hard stop so a hung design still terminates the simulation.
    reset_seq.add(
        Delay(10000000),
        Systask('finish'),
    )

    return m