scale=s1, act_func=ng.relu, dtype=act_dtype, sum_dtype=ng.int32) a1r = ng.reshape(a1, [batchsize, -1]) # layer 2: full-connection, relu w2 = ng.variable(weight_dtype, shape=(256, a1r.shape[-1]), name='w2') b2 = ng.variable(bias_dtype, shape=(w2.shape[0], ), name='b2') s2 = ng.variable(scale_dtype, shape=(w2.shape[0], ), name='s2') a2 = ng.matmul(a1r, w2, bias=b2, scale=s2, transposed_b=True, act_func=ng.relu, dtype=act_dtype, sum_dtype=ng.int32) # layer 3: full-connection, relu w3 = ng.variable(weight_dtype, shape=(10, a2.shape[-1]), name='w3') b3 = ng.variable(bias_dtype, shape=(w3.shape[0], ), name='b3') s3 = ng.variable(scale_dtype, shape=(w3.shape[0], ), name='s3') # output output_layer = ng.matmul(a2, w3, bias=b3, scale=s3, transposed_b=True,
def run(a_shape=(15, 15), b_shape=(15, 15),
        bias_shape=None, scale_shape=None,
        a_dtype=ng.int32, b_dtype=ng.int32,
        bias_dtype=ng.int32, scale_dtype=ng.int32,
        c_dtype=ng.int32,
        rshift_mul=None, rshift_sum=None, rshift_out=None,
        act_func=None,
        par_left_col=1, par_left_row=1, par_out_col=1,
        concur_out_col=None, stationary='right',
        left_ram_size=None, right_ram_size=None,
        bias_ram_size=None, scale_ram_size=None,
        out_ram_size=None,
        axi_datawidth=32, silent=False,
        filename=None, simtype='iverilog', outputfile=None):
    """Build, simulate and self-check a single ``ng.matmul`` operator.

    Placeholders are declared for both operands (and for bias / scale when
    their shapes are given), the matmul is converted to a Veriloggen module,
    a memory image is filled with the inputs and the software-computed
    expected output, and a test module containing a control thread is
    simulated.  The control thread compares the hardware output against the
    expected output word by word and prints the verdict.

    Args:
        a_shape, b_shape: Shapes of the operand placeholders.  ``b`` is
            always multiplied with ``transposed_b=True``.
        bias_shape, scale_shape: Shapes of the optional bias / scale
            placeholders; ``None`` disables the respective operand.
        a_dtype, b_dtype, bias_dtype, scale_dtype, c_dtype: NNgen dtypes of
            the operands and of the result.
        rshift_mul, rshift_sum, rshift_out, act_func: Forwarded unchanged to
            ``ng.matmul`` and ``ng.verify.matmul``.
        par_left_col, par_left_row, par_out_col, concur_out_col, stationary:
            Forwarded unchanged to ``ng.matmul`` and ``ng.verify.matmul``.
        left_ram_size, right_ram_size, bias_ram_size, scale_ram_size,
        out_ram_size: Forwarded unchanged to ``ng.matmul`` and
            ``ng.verify.matmul``.
        axi_datawidth: Used as the ``maxi_datawidth`` config entry and as the
            data width of the AXI memory model.
        silent: Forwarded to ``ng.to_veriloggen``.
        filename: When not ``None``, the test module is written there as
            Verilog source.
        simtype: Simulator name handed to ``simulation.Simulator``.
        outputfile: Simulator output file; defaults to this script's base
            name with the extension ``.out``.

    Returns:
        The simulator's output text.  For ``simtype == 'verilator'`` a final
        line starting with ``'-'`` is stripped.
    """

    # create target hardware
    a = ng.placeholder(a_dtype, shape=a_shape, name='a')
    b = ng.placeholder(b_dtype, shape=b_shape, name='b')

    if bias_shape is not None:
        bias = ng.placeholder(bias_dtype, bias_shape, name='bias')
    else:
        bias = None

    if scale_shape is not None:
        scale = ng.placeholder(scale_dtype, scale_shape, name='scale')
    else:
        scale = None

    transposed_a = False
    transposed_b = True

    c = ng.matmul(a, b,
                  bias, scale,
                  transposed_a, transposed_b,
                  rshift_mul, rshift_sum, rshift_out,
                  act_func,
                  c_dtype, ng.int32, ng.int32,
                  'matmul',
                  par_left_col, par_left_row, par_out_col,
                  concur_out_col, stationary,
                  left_ram_size, right_ram_size,
                  bias_ram_size, scale_ram_size,
                  None, None, None,
                  out_ram_size)

    targ = ng.to_veriloggen([c], 'matrix_matmul', silent=silent,
                            config={'maxi_datawidth': axi_datawidth})

    # verification data
    va = np.arange(a.length, dtype=np.int64).reshape(a.shape) % [5]
    vb = np.arange(b.length, dtype=np.int64).reshape(b.shape) % [5] - [3]

    if bias is not None:
        vbias = np.arange(bias.length, dtype=np.int64).reshape(
            bias.shape) % [4]
    else:
        vbias = None

    if scale is not None:
        vscale = np.arange(scale.length, dtype=np.int64).reshape(
            scale.shape) % [6]
    else:
        vscale = None

    # The software reference must be fed the numeric bias/scale arrays
    # (vbias / vscale), not the placeholder objects: those carry no values,
    # and the same arrays are what gets written to the memory image below.
    vc = ng.verify.matmul(va, vb,
                          vbias, vscale,
                          transposed_a, transposed_b,
                          rshift_mul, rshift_sum, rshift_out,
                          act_func,
                          c_dtype, ng.int32, ng.int32,
                          'matmul',
                          par_left_col, par_left_row, par_out_col,
                          concur_out_col, stationary,
                          left_ram_size, right_ram_size,
                          bias_ram_size, scale_ram_size,
                          None, None, None,
                          out_ram_size,
                          False,
                          a_dtype, b_dtype,
                          bias_dtype, scale_dtype)

    # to memory image
    # Largest tensor footprint, rounded up to a multiple of 4096.
    size_max = int(
        math.ceil(
            max(a.memory_size,
                b.memory_size,
                bias.memory_size if bias is not None else 0,
                scale.memory_size if scale is not None else 0,
                c.memory_size) / 4096)) * 4096
    # The expected output is stored one size_max past the highest tensor
    # address; the temporary area follows it.
    check_addr = max(a.addr,
                     b.addr,
                     bias.addr if bias is not None else -1,
                     scale.addr if scale is not None else -1,
                     c.addr) + size_max
    size_check = size_max
    tmp_addr = check_addr + size_check

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 8 // memimg_datawidth], dtype=np.int64)
    # Fill the image with a non-zero value (100) before the data is written.
    mem = mem + [100]

    axi.set_memory(
        mem, va, memimg_datawidth, a_dtype.width, a.addr,
        max(int(math.ceil(axi_datawidth / a_dtype.width)), par_left_col))
    axi.set_memory(
        mem, vb, memimg_datawidth, b_dtype.width, b.addr,
        max(int(math.ceil(axi_datawidth / b_dtype.width)), par_left_col))

    if bias is not None:
        axi.set_memory(
            mem, vbias, memimg_datawidth, bias_dtype.width, bias.addr,
            max(int(math.ceil(axi_datawidth / bias_dtype.width)),
                par_out_col))

    if scale is not None:
        axi.set_memory(
            mem, vscale, memimg_datawidth, scale_dtype.width, scale.addr,
            max(int(math.ceil(axi_datawidth / scale_dtype.width)),
                par_out_col))

    axi.set_memory(
        mem, vc, memimg_datawidth, c_dtype.width, check_addr,
        max(int(math.ceil(axi_datawidth / c_dtype.width)), par_out_col))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if outputfile is None:
        outputfile = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + outputfile

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # NOTE: ctrl is handed to vthread.Thread below, so its statements are
    # kept exactly as they were.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for i in range(c.shape[0]):
            for j in range(c.shape[1]):
                orig = memory.read_word(i * c.aligned_shape[1] + j,
                                        c.addr, c_dtype.width)
                check = memory.read_word(i * c.aligned_shape[1] + j,
                                         check_addr, c_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print(i, j, orig, check)
                    ok = False

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(),
                                  period=100, polarity='low')

    # Watchdog: finish the simulation after this delay at the latest.
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    # output source code
    if filename is not None:
        m.to_verilog(filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=outputfile)
    lines = rslt.splitlines()
    # Drop a final line starting with '-' from verilator output; guard
    # against completely empty output.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def mkTest(ich=3, och=10, ch=64, ksize=3, stride=1, col=28, row=28):
    """Assemble the simulation test bench for a small CNN.

    The network is conv2d -> max_pool_serial -> relu -> conv2d -> relu ->
    reshape -> matmul -> relu -> matmul.  It is converted to a Veriloggen
    module named 'cnn' and instantiated inside a test module together with
    an AXI memory model, an AXI-Lite controller, a cycle counter and a
    control thread that starts the hardware and prints the cycle count.

    Args:
        ich: Channel count of the input placeholder.
        och: Row count of the last weight matrix (network output size).
        ch: Output-channel count of both convolution kernels.
        ksize: Kernel height and width of both convolutions.
        stride: Stride used for the two spatial axes of both convolutions.
        col: Input width.
        row: Input height.

    Returns:
        The Veriloggen test module.
    """
    conv_strides = (1, stride, stride, 1)
    pool_window = (1, 2, 2, 1)

    # create target hardware

    # layer 0: conv2d, max_pool_serial, relu
    input_layer = ng.placeholder(ng.int32, shape=(1, row, col, ich),
                                 name='input_layer')
    w0 = ng.variable(ng.int32, shape=(ch, ksize, ksize, ich), name='w0')
    conv0 = ng.conv2d(input_layer, w0, strides=conv_strides)
    pool0 = ng.max_pool_serial(conv0, ksize=pool_window, strides=pool_window)
    act0 = ng.relu(pool0)

    # layer 1: conv2d, relu, reshape
    w1 = ng.variable(ng.int32, shape=(ch, ksize, ksize, act0.shape[-1]),
                     name='w1')
    conv1 = ng.conv2d(act0, w1, strides=conv_strides)
    act1 = ng.relu(conv1)
    flat1 = ng.reshape(act1, [-1])

    # layer 2: full-connection
    w2 = ng.variable(ng.int32, shape=(16, flat1.shape[-1]), name='w2')
    fc2 = ng.matmul(flat1, w2, transposed_b=True)
    act2 = ng.relu(fc2)

    # layer 3: full-connection
    w3 = ng.variable(ng.int32, shape=(och, act2.shape[-1]), name='w3')
    output_layer = ng.matmul(act2, w3, transposed_b=True,
                             name='output_layer')

    targ = ng.to_veriloggen([output_layer], 'cnn')
    #targ = ng.to_ipxact([output_layer], 'cnn')

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    memory = axi.AxiMemoryModel(m, 'memory', clk, rst, mem_addrwidth=23)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # NOTE: ctrl is handed to vthread.Thread below, so its statements are
    # kept exactly as they were.
    def ctrl():
        for i in range(100):
            pass

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(),
                                  period=100, polarity='low')

    # Watchdog: finish the simulation after this delay at the latest.
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    return m
def run(
        act_dtype=ng.int8,
        weight_dtype=ng.int8,
        bias_dtype=ng.int32,
        scale_dtype=ng.int8,
        par_ich=2,
        par_och=2,
        chunk_size=64,
        axi_datawidth=32,
        silent=False,
        weight_filename='cnn.npy',
        verilog_filename=None,
        sim_filename=None,
        # simtype='iverilog',
        simtype='verilator',
        # simtype=None,  # no RTL simulation
):
    """Build, quantize, export and simulate a small example CNN with NNgen.

    Steps: (1) describe the network as an NNgen dataflow, (2) assign random
    weights and quantize them, (3) set hardware parallelism attributes,
    (4) evaluate the dataflow in software on a random input, (5) convert it
    to hardware via ``ng.to_ipxact``, (6) save the exported parameters with
    ``np.save`` and (7) simulate the hardware and compare its output with
    the software result.

    Args:
        act_dtype: NNgen dtype of activations (input, layer outputs).
        weight_dtype: NNgen dtype of the weight variables.
        bias_dtype: NNgen dtype of the bias variables.
        scale_dtype: NNgen dtype of the scale variables.
        par_ich: ``par_ich`` attribute applied to every conv2d / matmul.
        par_och: ``par_och`` attribute applied to every conv2d / matmul;
            also used as ``par`` of the max-pool operator.
        chunk_size: Chunk size passed to ``ng.export_ndarray`` and used as
            the alignment of the addresses computed for the simulation.
        axi_datawidth: Used as the ``maxi_datawidth`` config entry and as the
            data width of the AXI memory model.
        silent: Forwarded to ``ng.to_ipxact``.
        weight_filename: File the exported parameter array is saved to.
        verilog_filename: When not ``None``, the test module is written
            there as Verilog source.
        sim_filename: Simulator output file; defaults to this script's base
            name with the extension ``.out``.
        simtype: Simulator name handed to ``simulation.Simulator``.  When
            ``None`` the function calls ``sys.exit()`` after saving the
            weights instead of simulating.

    Returns:
        The simulator's output text.  For ``simtype == 'verilator'`` a final
        line starting with ``'-'`` is stripped.
    """

    # --------------------
    # (1) Represent a DNN model as a dataflow by NNgen operators
    # --------------------

    # input
    input_layer = ng.placeholder(
        dtype=act_dtype,
        shape=(1, 32, 32, 3),  # N, H, W, C
        name='input_layer')

    # layer 0: conv2d (with bias and scale (= batchnorm)), relu, max_pool
    w0 = ng.variable(
        dtype=weight_dtype,
        shape=(64, 3, 3, 3),  # Och, Ky, Kx, Ich
        name='w0')
    b0 = ng.variable(dtype=bias_dtype, shape=(w0.shape[0], ), name='b0')
    s0 = ng.variable(dtype=scale_dtype, shape=(w0.shape[0], ), name='s0')

    a0 = ng.conv2d(input_layer, w0,
                   strides=(1, 1, 1, 1),
                   bias=b0,
                   scale=s0,
                   act_func=ng.relu,
                   dtype=act_dtype,
                   sum_dtype=ng.int32)

    a0p = ng.max_pool_serial(a0, ksize=(1, 2, 2, 1), strides=(1, 2, 2, 1))

    # layer 1: conv2d, relu, reshape
    w1 = ng.variable(weight_dtype,
                     shape=(64, 3, 3, a0.shape[-1]),
                     name='w1')
    b1 = ng.variable(bias_dtype, shape=(w1.shape[0], ), name='b1')
    s1 = ng.variable(scale_dtype, shape=(w1.shape[0], ), name='s1')

    a1 = ng.conv2d(a0p, w1,
                   strides=(1, 1, 1, 1),
                   bias=b1,
                   scale=s1,
                   act_func=ng.relu,
                   dtype=act_dtype,
                   sum_dtype=ng.int32)

    a1r = ng.reshape(a1, [1, -1])

    # layer 2: full-connection, relu
    w2 = ng.variable(weight_dtype,
                     shape=(256, a1r.shape[-1]),
                     name='w2')
    b2 = ng.variable(bias_dtype, shape=(w2.shape[0], ), name='b2')
    s2 = ng.variable(scale_dtype, shape=(w2.shape[0], ), name='s2')

    a2 = ng.matmul(a1r, w2,
                   bias=b2,
                   scale=s2,
                   transposed_b=True,
                   act_func=ng.relu,
                   dtype=act_dtype,
                   sum_dtype=ng.int32)

    # layer 3: full-connection, relu
    w3 = ng.variable(weight_dtype,
                     shape=(10, a2.shape[-1]),
                     name='w3')
    b3 = ng.variable(bias_dtype, shape=(w3.shape[0], ), name='b3')
    s3 = ng.variable(scale_dtype, shape=(w3.shape[0], ), name='s3')

    # output
    output_layer = ng.matmul(a2, w3,
                             bias=b3,
                             scale=s3,
                             transposed_b=True,
                             name='output_layer',
                             dtype=act_dtype,
                             sum_dtype=ng.int32)

    # --------------------
    # (2) Assign weights to the NNgen operators
    # --------------------

    # In this example, random floating-point values are assigned.
    # In a real case, you should assign actual weight values
    # obtained by training on a DNN framework.

    # If you don't use NNgen's quantizer, you can assign integer weights
    # to each tensor.

    # Weights and biases are drawn layer by layer (weight first, then bias)
    # and clipped to [-3, 3]; every scale is all ones.
    for weight, bias, scale in ((w0, b0, s0), (w1, b1, s1),
                                (w2, b2, s2), (w3, b3, s3)):
        weight_value = np.random.normal(
            size=weight.length).reshape(weight.shape)
        weight.set_value(np.clip(weight_value, -3.0, 3.0))

        bias_value = np.random.normal(size=bias.length).reshape(bias.shape)
        bias.set_value(np.clip(bias_value, -3.0, 3.0))

        scale.set_value(np.ones(scale.shape))

    # Quantizing the floating-point weights by the NNgen quantizer.
    # Alternatively, you can assign integer weights by yourself to each tensor.

    imagenet_mean = np.array([0.485, 0.456, 0.406]).astype(np.float32)
    imagenet_std = np.array([0.229, 0.224, 0.225]).astype(np.float32)

    if act_dtype.width > 8:
        act_scale_factor = 128
    else:
        act_scale_factor = int(round(2 ** (act_dtype.width - 1) * 0.5))

    input_scale_factors = {'input_layer': act_scale_factor}
    input_means = {'input_layer': imagenet_mean * act_scale_factor}
    input_stds = {'input_layer': imagenet_std * act_scale_factor}

    ng.quantize([output_layer], input_scale_factors, input_means, input_stds)

    # --------------------
    # (3) Assign hardware attributes
    # --------------------

    # conv2d, matmul
    # par_ich: parallelism in input-channel
    # par_och: parallelism in output-channel
    # par_col: parallelism in pixel column
    # par_row: parallelism in pixel row

    a0.attribute(par_ich=par_ich, par_och=par_och)
    a1.attribute(par_ich=par_ich, par_och=par_och)
    a2.attribute(par_ich=par_ich, par_och=par_och)
    output_layer.attribute(par_ich=par_ich, par_och=par_och)

    # cshamt_out: right shift amount after applying bias/scale
    # If you assign integer weights by yourself to each tensor,
    # cshamt (constant shift amount) must be assigned to each operator.

    # a0.attribute(cshamt_out=weight_dtype.width + 1)
    # a1.attribute(cshamt_out=weight_dtype.width + 1)
    # a2.attribute(cshamt_out=weight_dtype.width + 1)
    # output_layer.attribute(cshamt_out=weight_dtype.width + 1)

    # max_pool
    # par: parallelism in in/out channel

    par = par_och

    a0p.attribute(par=par)

    # --------------------
    # (4) Verify the DNN model behavior by executing the NNgen dataflow as a software
    # --------------------

    # In this example, random integer values are assigned.
    # In real case, you should assign actual integer activation values, such as an image.

    input_layer_value = np.random.normal(size=input_layer.length).reshape(
        input_layer.shape)
    input_layer_value = input_layer_value * imagenet_std + imagenet_mean
    input_layer_value = np.clip(input_layer_value, -5.0, 5.0)
    input_layer_value = input_layer_value * act_scale_factor
    # Clamp to the signed range of act_dtype.width bits,
    # i.e. [-2**(w-1), 2**(w-1) - 1].
    act_min = -(2 ** (act_dtype.width - 1))
    act_max = 2 ** (act_dtype.width - 1) - 1
    input_layer_value = np.clip(input_layer_value, act_min, act_max)
    input_layer_value = np.round(input_layer_value).astype(np.int64)

    eval_outs = ng.eval([output_layer], input_layer=input_layer_value)
    output_layer_value = eval_outs[0]

    # print(output_layer_value)
    # breakpoint()

    # --------------------
    # (5) Convert the NNgen dataflow to a hardware description (Verilog HDL and IP-XACT)
    # --------------------

    # to Veriloggen object
    # targ = ng.to_veriloggen([output_layer], 'cnn', silent=silent,
    #                        config={'maxi_datawidth': axi_datawidth})

    # to IP-XACT (the method returns Veriloggen object, as well as to_veriloggen)
    targ = ng.to_ipxact([output_layer], 'cnn', silent=silent,
                        config={'maxi_datawidth': axi_datawidth})

    # to Verilog HDL RTL (the method returns a source code text)
    # rtl = ng.to_verilog([output_layer], 'cnn', silent=silent,
    #                    config={'maxi_datawidth': axi_datawidth})

    # --------------------
    # (6) Save the quantized weights
    # --------------------

    # convert weight values to a memory image:
    # on a real FPGA platform, this image will be used as a part of the model definition.

    # The caller-supplied chunk_size is used here and for the address
    # alignment below.
    param_data = ng.export_ndarray([output_layer], chunk_size)
    np.save(weight_filename, param_data)

    # --------------------
    # (7) Simulate the generated hardware by Veriloggen and Verilog simulator
    # --------------------

    # NOTE: this terminates the whole process, not just this function.
    if simtype is None:
        sys.exit()

    # Memory layout, each region aligned to chunk_size:
    #   input placeholder | parameters | expected output | temporaries
    param_bytes = len(param_data)

    variable_addr = int(
        math.ceil((input_layer.addr + input_layer.memory_size)
                  / chunk_size)) * chunk_size
    check_addr = int(
        math.ceil((variable_addr + param_bytes) / chunk_size)) * chunk_size
    tmp_addr = int(
        math.ceil((check_addr + output_layer.memory_size)
                  / chunk_size)) * chunk_size

    memimg_datawidth = 32
    mem = np.zeros([1024 * 1024 * 256 // (memimg_datawidth // 8)],
                   dtype=np.int64)
    # Fill the image with a non-zero value (100) before the data is written.
    mem = mem + [100]

    # placeholder
    axi.set_memory(
        mem, input_layer_value, memimg_datawidth, act_dtype.width,
        input_layer.addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_ich))

    # parameters (variable and constant)
    axi.set_memory(mem, param_data, memimg_datawidth, 8, variable_addr)

    # verification data
    axi.set_memory(
        mem, output_layer_value, memimg_datawidth, act_dtype.width,
        check_addr,
        max(int(math.ceil(axi_datawidth / act_dtype.width)), par_och))

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    if sim_filename is None:
        sim_filename = os.path.splitext(os.path.basename(__file__))[0] + '.out'

    memimg_name = 'memimg_' + sim_filename

    memory = axi.AxiMemoryModel(m, 'memory', clk, rst,
                                datawidth=axi_datawidth,
                                memimg=mem, memimg_name=memimg_name,
                                memimg_datawidth=memimg_datawidth)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # NOTE: ctrl is handed to vthread.Thread below, so its statements are
    # kept exactly as they were.
    def ctrl():
        for i in range(100):
            pass

        ng.sim.set_global_addrs(_saxi, tmp_addr)

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        # verify
        ok = True
        for bat in range(output_layer.shape[0]):
            for x in range(output_layer.shape[1]):
                orig = memory.read_word(
                    bat * output_layer.aligned_shape[1] + x,
                    output_layer.addr, act_dtype.width)
                check = memory.read_word(
                    bat * output_layer.aligned_shape[1] + x,
                    check_addr, act_dtype.width)

                if vthread.verilog.NotEql(orig, check):
                    print('NG (', bat, x,
                          ') orig: ', orig, ' check: ', check)
                    ok = False
                else:
                    print('OK (', bat, x,
                          ') orig: ', orig, ' check: ', check)

        if ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(),
                                  period=100, polarity='low')

    # Watchdog: finish the simulation after this delay at the latest.
    init.add(
        Delay(10000000),
        Systask('finish'),
    )

    # output source code
    if verilog_filename is not None:
        m.to_verilog(verilog_filename)

    # run simulation
    sim = simulation.Simulator(m, sim=simtype)
    rslt = sim.run(outputfile=sim_filename)
    lines = rslt.splitlines()
    # Drop a final line starting with '-' from verilator output; guard
    # against completely empty output.
    if simtype == 'verilator' and lines and lines[-1].startswith('-'):
        rslt = '\n'.join(lines[:-1])
    return rslt
def mkTest(n_input=784, n_classes=10):
    """Assemble the simulation test bench for a three-layer perceptron.

    The network is matmul -> relu -> matmul -> relu -> matmul, with every
    weight matrix applied as ``transposed_b=True``.  It is converted to a
    Veriloggen module named 'mlp' and instantiated inside a test module
    together with an AXI memory model, an AXI-Lite controller, a cycle
    counter and a control thread that starts the hardware and prints the
    cycle count.

    Args:
        n_input: Length of the input vector; also both dimensions of the
            two hidden weight matrices.
        n_classes: Row count of the output weight matrix.

    Returns:
        The Veriloggen test module.
    """
    hidden_shape = (n_input, n_input)

    # create target hardware
    x = ng.placeholder(ng.int32, shape=[n_input])

    w1 = ng.variable(ng.int32, shape=hidden_shape, name='h1')
    w2 = ng.variable(ng.int32, shape=hidden_shape, name='h2')
    w3 = ng.variable(ng.int32, shape=(n_classes, n_input), name='out')

    hidden1 = ng.relu(ng.matmul(x, w1, transposed_b=True))
    hidden2 = ng.relu(ng.matmul(hidden1, w2, transposed_b=True))
    out = ng.matmul(hidden2, w3, transposed_b=True)

    targ = ng.to_veriloggen([out], 'mlp')
    #targ = ng.to_ipxact([out], 'mlp')

    # test controller
    m = Module('test')
    params = m.copy_params(targ)
    ports = m.copy_sim_ports(targ)
    clk = ports['CLK']
    resetn = ports['RESETN']
    rst = m.Wire('RST')
    rst.assign(Not(resetn))

    # AXI memory model
    memory = axi.AxiMemoryModel(m, 'memory', clk, rst)
    memory.connect(ports, 'maxi')

    # AXI-Slave controller
    _saxi = vthread.AXIMLite(m, '_saxi', clk, rst, noio=True)
    _saxi.connect(ports, 'saxi')

    # timer
    time_counter = m.Reg('time_counter', 32, initval=0)
    seq = Seq(m, 'seq', clk, rst)
    seq(time_counter.inc())

    # NOTE: ctrl is handed to vthread.Thread below, so its statements are
    # kept exactly as they were.
    def ctrl():
        for i in range(100):
            pass

        start_time = time_counter.value
        ng.sim.start(_saxi)

        print('# start')

        ng.sim.wait(_saxi)
        end_time = time_counter.value

        print('# end')
        print('# execution cycles: %d' % (end_time - start_time))

        vthread.finish()

    th = vthread.Thread(m, 'th_ctrl', clk, rst, ctrl)
    fsm = th.start()

    uut = m.Instance(targ, 'uut',
                     params=m.connect_params(targ),
                     ports=m.connect_ports(targ))

    # simulation.setup_waveform(m, uut)
    simulation.setup_clock(m, clk, hperiod=5)
    init = simulation.setup_reset(m, resetn, m.make_reset(),
                                  period=100, polarity='low')

    # Watchdog: finish the simulation after this delay at the latest.
    init.add(
        Delay(1000000),
        Systask('finish'),
    )

    return m