def rtlsim_exec(model, execution_context):
    """Execute the given model's stitched IP in RTL simulation (PyVerilator).

    Input tensors are read from *execution_context* and the produced output
    tensor is written back into it under the graph output's name.
    """
    if PyVerilator is None:
        raise ImportError("Installation of PyVerilator is required.")
    # the stitched-IP project must have been generated beforehand
    assert os.path.isfile(
        model.get_metadata_prop("wrapper_filename")
    ), """The file name from metadata property "wrapper_filename" doesn't exist."""
    assert os.path.isdir(
        model.get_metadata_prop("vivado_stitch_proj")
    ), """The directory from metadata property "vivado_stitch_proj" doesn't exist"""
    trace_file = model.get_metadata_prop("rtlsim_trace")
    # input side
    # TODO extend for multiple inputs
    in_name = model.graph.input[0].name
    in_tensor = execution_context[in_name]
    in_dtype = model.get_tensor_datatype(in_name)
    consumer = getCustomOp(model.find_consumer(in_name))
    in_stream_width = consumer.get_instream_width()
    # fold input into its time-multiplexed shape
    # TODO any other layout transformations need to happen here!
    in_tensor = in_tensor.reshape(consumer.get_folded_input_shape())
    # output side
    out_name = model.graph.output[0].name
    out_shape = model.get_tensor_shape(out_name)
    out_dtype = model.get_tensor_datatype(out_name)
    producer = getCustomOp(model.find_producer(out_name))
    folded_oshape = producer.get_folded_output_shape()
    # output stream width doubles as the packed bit width per output word
    packed_bits = producer.get_outstream_width()
    target_bits = out_dtype.bitwidth()
    # pack the input words for the AXI stream
    packed_input = npy_to_rtlsim_input(in_tensor, in_dtype, in_stream_width)
    num_out_values = producer.get_number_output_values()
    # reuse a previously compiled simulation library if it still exists,
    # otherwise (re)build it from the stitched IP
    rtlsim_so = model.get_metadata_prop("rtlsim_so")
    if rtlsim_so is not None and os.path.isfile(rtlsim_so):
        sim = PyVerilator(rtlsim_so)
    else:
        sim = pyverilate_stitched_ip(model)
        model.set_metadata_prop("rtlsim_so", sim.lib._name)
    _reset_rtlsim(sim)
    _toggle_clk(sim)
    ret = _run_rtlsim(sim, packed_input, num_out_values, trace_file)
    # ret = (packed output data, number of simulated cycles)
    model.set_metadata_prop("sim_cycles", str(ret[1]))
    # unpack the output stream and write it back into the context
    folded_output = rtlsim_output_to_npy(
        ret[0], None, out_dtype, folded_oshape, packed_bits, target_bits
    )
    execution_context[out_name] = folded_output.reshape(out_shape)
def test_fpgadataflow_ipstitch_rtlsim(mem_mode):
    model = load_test_checkpoint_or_skip(
        ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch_%s.onnx" % mem_mode
    )
    model.set_metadata_prop("rtlsim_trace", "whole_trace.vcd")
    sim = pyverilate_stitched_ip(model)
    # expected top-level I/O: clock/reset, one AXI input stream, one AXI
    # output stream, plus the AXI-lite control interface
    axilite_suffixes = [
        "araddr", "arready", "arvalid",
        "awaddr", "awready", "awvalid",
        "bready", "bresp", "bvalid",
        "rdata", "rready", "rresp", "rvalid",
        "wdata", "wready", "wstrb", "wvalid",
    ]
    expected_io = [
        "ap_clk",
        "ap_rst_n",
        "s_axis_0_tdata",
        "s_axis_0_tready",
        "s_axis_0_tvalid",
        "m_axis_0_tdata",
        "m_axis_0_tkeep",
        "m_axis_0_tlast",
        "m_axis_0_tready",
        "m_axis_0_tvalid",
    ] + ["s_axi_control_" + sfx for sfx in axilite_suffixes]
    # order-insensitive comparison of the exposed simulation ports
    assert sorted(dir(sim.io)) == sorted(expected_io)
    model.set_metadata_prop("exec_mode", "rtlsim")
    # drive a random tensor of the declared input datatype/shape through the
    # stitched IP; the output is expected to equal the input (asserted below)
    inp_dt = model.get_tensor_datatype("inp")
    inp_shape = model.get_tensor_shape("inp")
    x = gen_finn_dt_tensor(inp_dt, inp_shape)
    rtlsim_res = execute_onnx(model, {"inp": x})["outp"]
    assert (rtlsim_res == x).all()
def test_fpgadataflow_ipstitch_rtlsim():
    model = ModelWrapper(ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch.onnx")
    model.set_metadata_prop("rtlsim_trace", "whole_trace.vcd")
    sim = pyverilate_stitched_ip(model)
    # expected top-level I/O: clock/reset, one input stream, one output
    # stream, plus the AXI-lite control interface (all `_0`-suffixed)
    axilite_suffixes = [
        "araddr", "arready", "arvalid",
        "awaddr", "awready", "awvalid",
        "bready", "bresp", "bvalid",
        "rdata", "rready", "rresp", "rvalid",
        "wdata", "wready", "wstrb", "wvalid",
    ]
    expected_io = [
        "ap_clk_0",
        "ap_rst_n_0",
        "in0_V_V_0_tdata",
        "in0_V_V_0_tready",
        "in0_V_V_0_tvalid",
        "out_r_0_tdata",
        "out_r_0_tkeep",
        "out_r_0_tlast",
        "out_r_0_tready",
        "out_r_0_tvalid",
    ] + ["s_axi_control_0_" + sfx for sfx in axilite_suffixes]
    # NOTE(review): this comparison is order-sensitive (dir() output must
    # match the list exactly, element for element)
    assert dir(sim.io) == expected_io
    model.set_metadata_prop("exec_mode", "rtlsim")
    # drive a random tensor of the declared input datatype/shape through the
    # stitched IP; the output is expected to equal the input (asserted below)
    inp_dt = model.get_tensor_datatype("inp")
    inp_shape = model.get_tensor_shape("inp")
    x = gen_finn_dt_tensor(inp_dt, inp_shape)
    rtlsim_res = execute_onnx(model, {"inp": x})["outp"]
    assert (rtlsim_res == x).all()
def test_fpgadataflow_ipstitch_rtlsim():
    model = ModelWrapper(ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch.onnx")
    sim = pyverilate_stitched_ip(model)
    # expected top-level I/O: clock/reset plus one input and one output
    # AXI stream (no control interface in this variant)
    expected_io = ["ap_clk_0", "ap_rst_n_0"]
    expected_io += ["in0_V_V_0_" + sfx for sfx in ("tdata", "tready", "tvalid")]
    expected_io += [
        "out_r_0_" + sfx for sfx in ("tdata", "tkeep", "tlast", "tready", "tvalid")
    ]
    # NOTE(review): this comparison is order-sensitive (dir() output must
    # match the list exactly, element for element)
    assert dir(sim.io) == expected_io
    model.set_metadata_prop("exec_mode", "rtlsim")
    # drive a random tensor of the declared input datatype/shape through the
    # stitched IP; the output is expected to equal the input (asserted below)
    inp_dt = model.get_tensor_datatype("inp")
    inp_shape = model.get_tensor_shape("inp")
    x = gen_finn_dt_tensor(inp_dt, inp_shape)
    rtlsim_res = execute_onnx(model, {"inp": x})["outp"]
    assert (rtlsim_res == x).all()
def rtlsim_exec(model, execution_context, pre_hook=None, post_hook=None):
    """Execute the given model's stitched IP in RTL simulation (PyVerilator).

    Input tensors are read from *execution_context* and the produced output
    tensor is written back into it. Hook functions may be supplied to
    observe/alter circuit state; each receives the PyVerilator sim object
    as its first argument:

    - pre_hook : called before sim start (after reset)
    - post_hook : called after sim end
    """
    if PyVerilator is None:
        raise ImportError("Installation of PyVerilator is required.")
    # the stitched-IP project must have been generated beforehand
    assert os.path.isfile(
        model.get_metadata_prop("wrapper_filename")
    ), """The file name from metadata property "wrapper_filename" doesn't exist."""
    assert os.path.isdir(
        model.get_metadata_prop("vivado_stitch_proj")
    ), """The directory from metadata property "vivado_stitch_proj" doesn't exist"""
    trace_file = model.get_metadata_prop("rtlsim_trace")
    # input side
    # TODO extend for multiple inputs
    in_name = model.graph.input[0].name
    in_tensor = execution_context[in_name]
    in_dtype = model.get_tensor_datatype(in_name)
    consumer = getCustomOp(model.find_consumer(in_name))
    in_stream_width = consumer.get_instream_width()
    # time-multiplexed (folded) input shape, leading dim overridden by the
    # batch size of the tensor actually supplied in the context
    batchsize = in_tensor.shape[0]
    folded_ishape = (batchsize,) + tuple(consumer.get_folded_input_shape())[1:]
    # TODO any other layout transformations need to happen here!
    in_tensor = in_tensor.reshape(folded_ishape)
    # output side; batch dim of both output shapes is overridden the same way
    out_name = model.graph.output[0].name
    out_dtype = model.get_tensor_datatype(out_name)
    producer = getCustomOp(model.find_producer(out_name))
    out_shape = (batchsize,) + tuple(model.get_tensor_shape(out_name))[1:]
    folded_oshape = (batchsize,) + tuple(producer.get_folded_output_shape())[1:]
    # output stream width doubles as the packed bit width per output word
    packed_bits = producer.get_outstream_width()
    target_bits = out_dtype.bitwidth()
    # pack the input words for the AXI stream
    packed_input = npy_to_rtlsim_input(in_tensor, in_dtype, in_stream_width)
    num_out_values = producer.get_number_output_values() * batchsize
    # reuse a previously compiled simulation library if it still exists,
    # otherwise (re)build it from the stitched IP
    rtlsim_so = model.get_metadata_prop("rtlsim_so")
    if rtlsim_so is not None and os.path.isfile(rtlsim_so):
        sim = PyVerilator(rtlsim_so, auto_eval=False)
    else:
        sim = pyverilate_stitched_ip(model)
        model.set_metadata_prop("rtlsim_so", sim.lib._name)
    ret = _run_rtlsim(
        sim,
        packed_input,
        num_out_values,
        trace_file,
        pre_hook=pre_hook,
        post_hook=post_hook,
    )
    # ret = (packed output data, number of simulated cycles)
    model.set_metadata_prop("cycles_rtlsim", str(ret[1]))
    # unpack the output stream and write it back into the context
    folded_output = rtlsim_output_to_npy(
        ret[0], None, out_dtype, folded_oshape, packed_bits, target_bits
    )
    execution_context[out_name] = folded_output.reshape(out_shape)
def apply(self, model):
    """Size StreamingFIFO depths by observing occupancy in an RTL simulation.

    All node FIFO depths are first forced to ``self.max_depth``, the design is
    stitched and simulated with a steady input stream, and the maximum
    occupancy seen on each FIFO is recorded. Those observed maxima are then
    written back as the final (optimized) FIFO depths. Returns ``(model,
    False)`` per the transformation convention (no re-run needed).

    NOTE(review): relies on attributes set elsewhere on this transformation:
    max_depth, fpgapart, clk_ns, max_qsrl_depth, vivado_ram_style,
    swg_exception — confirm against the class __init__ (not visible here).
    """
    # change external to decoupled and warn user
    # this way we are sure we have exactly one input/output
    modified_fc_nodes = []
    for node in model.graph.node:
        # verify assumptions
        assert is_fpgadataflow_node(
            node), "Found non-fpgadataflow node: " + str(node)
        assert node.op_type != "StreamingFIFO", "Found existing StreamingFIFO node"
        node = getCustomOp(node)
        # force worst-case FIFO depths everywhere so the sim can reveal the
        # true occupancy needed on each stream
        node.set_nodeattr("inFIFODepth", self.max_depth)
        node.set_nodeattr("outFIFODepth", self.max_depth)
        if node.onnx_node.op_type == "StreamingFCLayer_Batch":
            mmode = node.get_nodeattr("mem_mode")
            if mmode == "external":
                # remember which FC nodes were switched so the change can be
                # reverted after sizing (see final loop below)
                modified_fc_nodes.append(node.onnx_node.name)
                node.set_nodeattr("mem_mode", "decoupled")
                reset_implementation(node)
                warnings.warn(
                    "Changed mem_mode from external to decoupled for " + node.onnx_node.name)
    # insert stream infrastructure (DWC/FIFO)
    model = model.transform(InsertDWC())
    model = model.transform(InsertFIFO())
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(GiveReadableTensorNames())
    # gather FIFO names, check they are of expected depth
    # fifos maps FIFO node name -> max occupancy observed so far
    fifos = {}
    for node in model.graph.node:
        if node.op_type == "StreamingFIFO":
            fifos[node.name] = 0
            node = getCustomOp(node)
            # check depths and fix as necessary
            if node.get_nodeattr("depth") != self.max_depth:
                node.set_nodeattr("depth", self.max_depth)
    # insert FIFOs and do all transformations for RTLsim
    model = model.transform(AnnotateCycles())
    perf = model.analysis(dataflow_performance)
    latency = perf["critical_path_cycles"]
    max_cycles = perf["max_cycles"]
    model = model.transform(PrepareIP(self.fpgapart, self.clk_ns))
    model = model.transform(HLSSynthIP())
    model = model.transform(CreateStitchedIP(self.fpgapart, self.clk_ns))
    model.set_metadata_prop("exec_mode", "rtlsim")
    # calculate input frequency (number of cycles for each input word)
    # = max_cycles / number of folded input words in one image
    first_node = getCustomOp(model.graph.node[0])
    ncycles_per_input = max(
        1,
        int(
            math.ceil(perf["max_cycles"] / (np.prod(first_node.get_folded_input_shape()) / first_node.get_folded_input_shape()[-1]))),
    )
    # set sufficiently large threshold for 1 image to fully execute and exit
    ncycles = int(latency + max_cycles)
    # prepare pyverilator model
    sim = pyverilate_stitched_ip(model)
    reset_rtlsim(sim)
    toggle_clk(sim)
    # set all input valids to 0 and output readies to 1
    # set input data to some constant
    set_signal(sim, "tvalid", 0)
    set_signal(sim, "tready", 1)
    set_signal(sim, "tdata", 0)
    output_detected = False
    while ncycles > 0:
        toggle_clk(sim)
        # set/unset valids: present one input word every ncycles_per_input
        # cycles, idle otherwise
        if ncycles % ncycles_per_input == 0:
            set_signal(sim, "tvalid", 1)
        else:
            set_signal(sim, "tvalid", 0)
        # check/update all fifo counts by probing internal sim state
        # NOTE(review): state==2 appears to encode "counting via addr" with a
        # +2 offset; smaller state values are used as the count directly —
        # confirm against the Verilog FIFO implementation
        for key in fifos:
            current_state = sim.internals["finn_design_i"][key]["inst"][
                key + "_" + key]["state"]
            current_addr = sim.internals["finn_design_i"][key]["inst"][
                key + "_" + key]["addr"]
            if current_state == 2:
                current_count = current_addr + 2
            else:
                current_count = current_state
            # track the running maximum occupancy per FIFO
            if current_count > fifos[key]:
                fifos[key] = current_count
        # since latency estimation is very pessimistic, detect first output
        # and fast-forward the sim
        if get_signal(sim, "tvalid") != 0 and not output_detected:
            ncycles = max_cycles
            output_detected = True
        else:
            ncycles = ncycles - 1
    if not output_detected:
        warnings.warn(
            "No output detected, calculated FIFO depths may not be correct"
        )
    # Apply depths back into the model;
    # also set in/outFIFODepth to zero for non-FIFO
    # nodes, preventing further FIFO insertion
    for node in model.graph.node:
        # set FIFO depth, reset FIFO implementation,
        # and set implementation/ram styles
        if node.op_type == "StreamingFIFO":
            assert node.name in fifos, "FIFO node not found in size dictionary"
            # set depth of FIFO (rounded up from observed occupancy)
            depth = optimize_depth(fifos[node.name])
            node_inst = getCustomOp(node)
            node_inst.set_nodeattr("depth", depth)
            # Set FIFO implementation/ram styles: deep FIFOs get the Vivado
            # implementation, shallow ones the QSRL-based RTL one
            if depth > self.max_qsrl_depth:
                node_inst.set_nodeattr("impl_style", "vivado")
                node_inst.set_nodeattr("ram_style", self.vivado_ram_style)
            else:
                node_inst.set_nodeattr("impl_style", "rtl")
            # reset implementation
            reset_implementation(node_inst)
            # remove from dict so leftovers can be detected below
            del fifos[node.name]
        else:
            getCustomOp(node).set_nodeattr("inFIFODepth", 0)
            getCustomOp(node).set_nodeattr("outFIFODepth", 0)
            # for every FC node we changed from external to decoupled,
            # change back and reset implementation
            if node.op_type == "StreamingFCLayer_Batch":
                if node.name in modified_fc_nodes:
                    node_inst = getCustomOp(node)
                    node_inst.set_nodeattr("mem_mode", "external")
                    reset_implementation(node_inst)
                    modified_fc_nodes.remove(node.name)
    # both bookkeeping collections must be fully drained by the loop above
    assert (len(modified_fc_nodes) == 0 and len(fifos.keys()) == 0
            ), "FIFO/FC nodes left untouched after model reconfiguration"
    # handle custom sizing for SWG FIFOs if desired
    if self.swg_exception:
        model = model.transform(
            CapConvolutionFIFODepths(max_qsrl_depth=self.max_qsrl_depth))
    # remove shallow FIFOs
    model = model.transform(RemoveShallowFIFOs())
    return (model, False)