def fold_cnv_small(model):
    """Apply a fixed folding configuration to the layers of ``model``.

    Sets PE, SIMD, inFIFODepth and ram_style on the StreamingFCLayer_Batch
    nodes, then SIMD and inFIFODepth on the ConvolutionInputGenerator nodes.
    The same ``model`` object is returned.
    """
    # one (PE, SIMD, inFIFODepth, ram_style) entry per StreamingFCLayer_Batch
    # node, paired with the nodes in the order the model returns them
    fc_folding = [
        (8, 3, 256, "auto"),
        (16, 16, 256, "auto"),
        (8, 16, 256, "auto"),
        (8, 16, 256, "block"),
        (4, 8, 214, "auto"),
        (1, 8, 2, "auto"),
        (1, 2, 126, "distributed"),
        (2, 2, 62, "block"),
        (5, 1, 6, "distributed"),
    ]
    fc_attr_names = ("PE", "SIMD", "inFIFODepth", "ram_style")
    fc_nodes = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
    for fc_node, fc_cfg in zip(fc_nodes, fc_folding):
        fc_op = getCustomOp(fc_node)
        for attr_name, attr_value in zip(fc_attr_names, fc_cfg):
            fc_op.set_nodeattr(attr_name, attr_value)

    # the i-th sliding window generator reuses the SIMD value of the i-th
    # folding entry; indexing (not zip) keeps the IndexError if there are
    # more generators than table entries
    swg_in_depths = [2, 51, 9, 106, 2, 2]
    swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator")
    for idx, swg_node in enumerate(swg_nodes):
        swg_op = getCustomOp(swg_node)
        swg_op.set_nodeattr("SIMD", fc_folding[idx][1])
        swg_op.set_nodeattr("inFIFODepth", swg_in_depths[idx])
    return model
def fold_cnv_large(model):
    """Apply a fixed folding configuration to the layers of ``model``.

    Sets PE, SIMD and inFIFODepth on the StreamingFCLayer_Batch nodes, then
    SIMD and inFIFODepth on the ConvolutionInputGenerator nodes. The same
    ``model`` object is returned.
    """
    # one (PE, SIMD, inFIFODepth) entry per StreamingFCLayer_Batch node
    fc_folding = [
        (16, 3, 256),
        (32, 32, 256),
        (16, 32, 256),
        (16, 32, 256),
        (4, 32, 214),
        (1, 32, 2),
        (1, 4, 126),
        (1, 8, 62),
        (5, 1, 6),
    ]
    fc_attr_names = ("PE", "SIMD", "inFIFODepth")
    fc_nodes = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
    for fc_node, fc_cfg in zip(fc_nodes, fc_folding):
        fc_op = getCustomOp(fc_node)
        for attr_name, attr_value in zip(fc_attr_names, fc_cfg):
            fc_op.set_nodeattr(attr_name, attr_value)

    # the i-th sliding window generator reuses the SIMD value of the i-th
    # folding entry; indexing keeps the IndexError on a length mismatch
    swg_in_depths = [2, 51, 9, 106, 2, 2]
    swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator")
    for idx, swg_node in enumerate(swg_nodes):
        swg_op = getCustomOp(swg_node)
        swg_op.set_nodeattr("SIMD", fc_folding[idx][1])
        swg_op.set_nodeattr("inFIFODepth", swg_in_depths[idx])
    return model
def fold_cnv_small(model):
    """Apply a fixed folding configuration to the layers of ``model``.

    Sets PE, SIMD and ram_style on the StreamingFCLayer_Batch nodes and SIMD
    on the ConvolutionInputGenerator nodes. The same ``model`` object is
    returned.
    """
    # one (PE, SIMD, ram_style) entry per StreamingFCLayer_Batch node
    fc_folding = [
        (8, 3, "auto"),
        (16, 16, "auto"),
        (8, 16, "auto"),
        (8, 16, "block"),
        (4, 8, "auto"),
        (1, 8, "auto"),
        (1, 2, "distributed"),
        (2, 2, "block"),
        (5, 1, "distributed"),
    ]
    fc_attr_names = ("PE", "SIMD", "ram_style")
    fc_nodes = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
    for fc_node, fc_cfg in zip(fc_nodes, fc_folding):
        fc_op = getCustomOp(fc_node)
        for attr_name, attr_value in zip(fc_attr_names, fc_cfg):
            fc_op.set_nodeattr(attr_name, attr_value)

    # the i-th sliding window generator reuses the SIMD value of the i-th
    # folding entry; indexing keeps the IndexError on a length mismatch
    swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator")
    for idx, swg_node in enumerate(swg_nodes):
        swg_op = getCustomOp(swg_node)
        swg_op.set_nodeattr("SIMD", fc_folding[idx][1])
    return model
def fold_cnv_large(model):
    """Apply a fixed folding configuration to the layers of ``model``.

    Sets PE and SIMD on the StreamingFCLayer_Batch nodes and SIMD on the
    ConvolutionInputGenerator nodes. The same ``model`` object is returned.
    """
    # one (PE, SIMD) entry per StreamingFCLayer_Batch node
    fc_folding = [
        (16, 3),
        (32, 32),
        (16, 32),
        (16, 32),
        (4, 32),
        (1, 32),
        (1, 4),
        (1, 8),
        (5, 1),
    ]
    fc_nodes = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
    for fc_node, fc_cfg in zip(fc_nodes, fc_folding):
        fc_op = getCustomOp(fc_node)
        for attr_name, attr_value in zip(("PE", "SIMD"), fc_cfg):
            fc_op.set_nodeattr(attr_name, attr_value)

    # the i-th sliding window generator reuses the SIMD value of the i-th
    # folding entry; indexing keeps the IndexError on a length mismatch
    swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator")
    for idx, swg_node in enumerate(swg_nodes):
        swg_op = getCustomOp(swg_node)
        swg_op.set_nodeattr("SIMD", fc_folding[idx][1])
    return model
def attach_child_models_to_parent_model(parent_model, ordered_list_of_child_model_paths):
    """Link child model files to the StreamingDataflowPartition nodes of a parent.

    The i-th path in ``ordered_list_of_child_model_paths`` is assigned to the
    i-th StreamingDataflowPartition node (entry 0 is assumed to be the first
    child model that is accessed). Each child model file is loaded, its first
    graph input/output tensors are renamed to the partition node's first
    input/output, and the file is saved back in place.

    Raises:
        ValueError: if the number of paths differs from the number of
            StreamingDataflowPartition nodes.

    Returns the ``parent_model`` object.
    """
    sdp_nodes = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")
    num_sdpn = len(sdp_nodes)
    num_child_models = len(ordered_list_of_child_model_paths)
    if num_child_models != num_sdpn:
        raise ValueError(
            f"Number of child models supplied ({num_child_models}) does not match number of StreamingDataflowPartition Nodes ({num_sdpn})"
        )

    # lengths are known to be equal here, so pairing with zip visits every entry
    for sdpn, child_model_path in zip(sdp_nodes, ordered_list_of_child_model_paths):
        getCustomOp(sdpn).set_nodeattr("model", child_model_path)
        # make the child's I/O tensor names agree with the partition node
        outer_input_name = sdpn.input[0]
        outer_output_name = sdpn.output[0]
        child = ModelWrapper(child_model_path)
        child.rename_tensor(child.graph.input[0].name, outer_input_name)
        child.rename_tensor(child.graph.output[0].name, outer_output_name)
        child.save(child_model_path)
    return parent_model
def step_create_stitched_ip(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Create stitched IP for a graph after all HLS IP blocks have been generated.
    Depends on the DataflowOutputType.STITCHED_IP output product."""

    if DataflowOutputType.STITCHED_IP in cfg.generate_outputs:
        stitched_ip_dir = cfg.output_dir + "/stitched_ip"
        stitch = CreateStitchedIP(cfg._resolve_fpga_part(), cfg.synth_clk_period_ns)
        model = model.transform(stitch)
        # TODO copy all ip sources into output dir? as zip?
        copytree(model.get_metadata_prop("vivado_stitch_proj"), stitched_ip_dir)
        print("Vivado stitched IP written into " + stitched_ip_dir)

    verification_steps = cfg._resolve_verification_steps()
    if VerificationStepType.STITCHED_IP_RTLSIM in verification_steps:
        # work on a copy so the returned model keeps its original impl styles
        verify_model = deepcopy(model)
        # rtlsim only supports impl_style=rtl for StreamingFIFO and
        # impl_style=hls for StreamingDataWidthConverter, so force both
        forced_impl_styles = (
            ("StreamingFIFO", "rtl"),
            ("StreamingDataWidthConverter_Batch", "hls"),
        )
        for op_type, impl_style in forced_impl_styles:
            for layer in verify_model.get_nodes_by_op_type(op_type):
                getCustomOp(layer).set_nodeattr("impl_style", impl_style)
        verify_model = verify_model.transform(PrepareRTLSim())
        verify_model.set_metadata_prop("exec_mode", "rtlsim")
        verify_step(verify_model, cfg, "stitched_ip_rtlsim", need_parent=True)
    return model
def test_end2end_cnv_w1a1_fold_and_tlastmarker():
    """Fold the CNV-w1a1 dataflow model, insert DWC/FIFO/TLastMarker, and save it."""
    model = ModelWrapper(build_dir + "/end2end_cnv_w1a1_dataflow_model.onnx")

    # one (PE, SIMD, inFIFODepth) entry per StreamingFCLayer_Batch node
    fc_folding = [
        (16, 3, 128),
        (32, 32, 128),
        (16, 32, 128),
        (16, 32, 128),
        (4, 32, 81),
        (1, 32, 2),
        (1, 4, 2),
        (1, 8, 128),
        (5, 1, 3),
    ]
    fc_attr_names = ("PE", "SIMD", "inFIFODepth")
    fc_nodes = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
    for fc_node, fc_cfg in zip(fc_nodes, fc_folding):
        fc_op = getCustomOp(fc_node)
        for attr_name, attr_value in zip(fc_attr_names, fc_cfg):
            fc_op.set_nodeattr(attr_name, attr_value)

    # the i-th sliding window generator reuses the SIMD of the i-th entry
    swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator")
    for idx, swg_node in enumerate(swg_nodes):
        swg_op = getCustomOp(swg_node)
        swg_op.set_nodeattr("SIMD", fc_folding[idx][1])

    model = (
        model.transform(InsertDWC())
        .transform(InsertFIFO())
        .transform(InsertTLastMarker())
        .transform(GiveUniqueNodeNames())
        .transform(AnnotateResources("estimate"))
    )
    model.save(build_dir + "/end2end_cnv_w1a1_folded.onnx")
def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfig):
    """Measure performance + latency of stitched-IP model in rtlsim (pyverilator).
    Depends on the DataflowOutputType.STITCHED_IP output product.

    The measurements are written to ``<output_dir>/report/rtlsim_performance.json``;
    the input ``model`` is returned unchanged.
    """
    if DataflowOutputType.RTLSIM_PERFORMANCE not in cfg.generate_outputs:
        return model

    assert (
        DataflowOutputType.STITCHED_IP in cfg.generate_outputs
    ), "rtlsim_perf needs stitched IP"

    # simulate a copy so the returned model keeps its original impl styles
    sim_model = deepcopy(model)
    # rtlsim only supports impl_style=rtl for StreamingFIFO and
    # impl_style=hls for StreamingDataWidthConverter, so force both
    forced_impl_styles = (
        ("StreamingFIFO", "rtl"),
        ("StreamingDataWidthConverter_Batch", "hls"),
    )
    for op_type, impl_style in forced_impl_styles:
        for layer in sim_model.get_nodes_by_op_type(op_type):
            getCustomOp(layer).set_nodeattr("impl_style", impl_style)
    sim_model = sim_model.transform(PrepareRTLSim())
    sim_model.set_metadata_prop("exec_mode", "rtlsim")

    # a single input gives the latency
    latency_cycles = throughput_test_rtlsim(sim_model, 1)["cycles"]
    # as many inputs as there are layers fills the whole pipeline, which
    # gives the steady-state throughput
    batch_size = len(sim_model.graph.node)
    perf = throughput_test_rtlsim(sim_model, batch_size)
    perf["latency_cycles"] = latency_cycles

    report_dir = cfg.output_dir + "/report"
    os.makedirs(report_dir, exist_ok=True)
    with open(report_dir + "/rtlsim_performance.json", "w") as report_file:
        json.dump(perf, report_file, indent=2)
    return model
def test_depthwise_conv_hls_rtlsim(act, pe, k, stride, padding):
    """Compare a depthwise-conv reference model against its HLS version in rtlsim."""
    idt = wdt = DataType.INT4
    ifm_dim = 6
    ifm_ch = 4

    # reference model consisting of Im2Col + MatMul (+ MultiThreshold)
    model = set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding)
    input_dict = {"inp": gen_finn_dt_tensor(idt, [1, ifm_dim, ifm_dim, ifm_ch])}

    new_model = model.transform(InferConvInpGen()).transform(InferVVAU())

    # the input generator's SIMD and the VVAU's PE both take the value of pe
    folded_attr_by_op_type = {
        "ConvolutionInputGenerator": "SIMD",
        "Vector_Vector_Activate_Batch": "PE",
    }
    for node in new_model.graph.node:
        attr_name = folded_attr_by_op_type.get(node.op_type)
        if attr_name is not None:
            getCustomOp(node).set_nodeattr(attr_name, pe)

    new_model = (
        new_model.transform(SetExecMode("rtlsim"))
        .transform(GiveUniqueNodeNames())
        .transform(PrepareIP("xc7z020clg400-1", 5))
        .transform(HLSSynthIP())
        .transform(PrepareRTLSim())
    )
    assert oxe.compare_execution(model, new_model, input_dict)
def test_end2end_tfc_w1a2_fold_and_tlastmarker():
    """Set folding/FIFO attributes on the first four nodes, add TLastMarker, save."""
    model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_dataflow_model.onnx")

    # attributes for graph nodes 0..3, applied in the listed order
    per_node_attrs = [
        {"inFIFODepth": 50, "SIMD": 8, "PE": 16, "outFIFODepth": 4},
        {"SIMD": 16, "PE": 16, "outFIFODepth": 4},
        {"SIMD": 16, "PE": 16, "outFIFODepth": 4},
        {"SIMD": 16, "PE": 10, "outFIFODepth": 50},
    ]
    # wrap all four nodes first so a too-short graph fails before any change
    wrappers = [getCustomOp(model.graph.node[idx]) for idx in range(4)]
    for wrapper, attrs in zip(wrappers, per_node_attrs):
        for attr_name, attr_value in attrs.items():
            wrapper.set_nodeattr(attr_name, attr_value)

    model = model.transform(InsertTLastMarker())
    model.save(build_dir + "/end2end_tfc_w1a2_folded.onnx")
def apply(self, model):
    """Annotate per-node and total resource figures on ``model``.

    ``self.mode`` selects the analysis ("estimate", "hls" or "synth"); if
    ``self.res_dict`` is None it is filled by running that analysis on the
    model. Each fpgadataflow node present in the result gets a
    ``res_<mode>`` attribute. StreamingDataflowPartition nodes are handled
    by loading their child model, annotating it recursively, saving it back
    and copying its totals onto the partition node. The summed totals are
    stored in the ``res_total_<mode>`` metadata property, and the "(top)"
    entry, if present, in ``res_total_top_<mode>``.

    Raises:
        Exception: if ``self.mode`` is not one of the recognized modes.

    Returns ``(model, False)``.
    """
    import ast

    graph = model.graph
    if self.mode == "estimate":
        res_fxn = res_estimation
    elif self.mode == "hls":
        res_fxn = hls_synth_res_estimation
    elif self.mode == "synth":
        res_fxn = post_synth_res
    else:
        raise Exception("Unrecognized mode for AnnotateResources")
    if self.res_dict is None:
        self.res_dict = model.analysis(res_fxn)

    attr_name = "res_" + self.mode
    total_prop = "res_total_" + self.mode
    children_dict = {}
    # annotate node resources
    for node in graph.node:
        if _is_fpgadataflow_node(node) and node.name in self.res_dict:
            node_res = self.res_dict[node.name]
            registry.getCustomOp(node).set_nodeattr(attr_name, str(node_res))
            children_dict[node.name] = node_res
        elif node.op_type == "StreamingDataflowPartition":
            # recurse into model to manually annotate per-layer resources
            sdp_inst = getCustomOp(node)
            sdp_model_filename = sdp_inst.get_nodeattr("model")
            sdp_model = ModelWrapper(sdp_model_filename)
            sdp_model = sdp_model.transform(
                AnnotateResources(self.mode, self.res_dict)
            )
            # The property holds str(dict) as written at the end of this
            # method. Parse it as a literal instead of eval() so that text
            # stored in a model file is never executed as code.
            sdp_dict = ast.literal_eval(sdp_model.get_metadata_prop(total_prop))
            # save transformed model
            sdp_model.save(sdp_model_filename)
            # set res attribute for sdp node
            sdp_inst.set_nodeattr(attr_name, str(sdp_dict))
            children_dict[node.name] = sdp_dict
    self.res_dict.update(children_dict)

    # sum every resource type over the annotated children
    total_dict = {}
    for lname in children_dict:
        for r_type, r_amount in self.res_dict[lname].items():
            total_dict[r_type] = total_dict.get(r_type, 0.0) + float(r_amount)
    # entries whose name contains "efficiency" are divided by the number
    # of graph nodes rather than left as a sum
    for r_type in total_dict:
        if "efficiency" in r_type:
            total_dict[r_type] = total_dict[r_type] / len(graph.node)
    model.set_metadata_prop(total_prop, str(total_dict))
    if "(top)" in self.res_dict:
        top_dict = self.res_dict["(top)"]
        model.set_metadata_prop("res_total_top_" + self.mode, str(top_dict))
    return (model, False)
def hw_accelerate_parent_model_setup(parent_onnx_model_dir, remote_exec_model_dir):
    """Point the parent model's dataflow partition node at a remote-exec model.

    Args:
        parent_onnx_model_dir: location passed to ``ModelWrapper`` to load
            the parent model.
        remote_exec_model_dir: value written to the "model" attribute of
            the partition node.

    The modified parent model is saved under ``BASE_DIR`` and returned.
    """
    parent_model = ModelWrapper(parent_onnx_model_dir)
    # NOTE: index 1 is hard-coded; inspect the parent model to confirm which
    # node is the StreamingDataflowPartition before reusing this elsewhere
    sdp_node = parent_model.graph.node[1]
    # use the caller-supplied path; previously the argument was ignored and
    # the module-level REMOTE_EXEC_MODEL_DIR was written instead
    getCustomOp(sdp_node).set_nodeattr("model", remote_exec_model_dir)
    parent_model.save(
        BASE_DIR + "/qnn_harnn_model_dataflow_parent_with_remote_bitfile_exec.onnx"
    )
    return parent_model
def apply(self, model):
    """Insert a StreamingDataWidthConverter_Batch between mismatched nodes.

    For every suitable node whose consumer is also suitable, the last
    dimension of the producer's folded output shape is compared with the
    last dimension of the consumer's folded input shape; when they differ,
    a converter node is inserted directly after the producer and the
    consumer's first input is rewired to the converter output.

    Returns ``(model, graph_modified)``.
    """
    graph = model.graph
    graph_modified = False
    # nodes are inserted while iterating; node_ind is the producer's index
    for node_ind, n in enumerate(graph.node):
        if not _suitable_node(n):
            continue
        n_output = n.output[0]
        consumer = model.find_consumer(n_output)
        if _suitable_node(consumer) is not True:
            continue
        n0 = getCustomOp(n)
        n1 = getCustomOp(consumer)
        n0_out_shape = n0.get_folded_output_shape()
        n1_in_shape = n1.get_folded_input_shape()
        if n0_out_shape[-1] == n1_in_shape[-1]:
            # widths already agree, nothing to convert
            continue
        graph_modified = True
        # stream widths on either side of the converter
        dwc_in_width = n0.get_outstream_width()
        dwc_out_width = n1.get_instream_width()
        # the converter carries the producer's output shape and datatype
        dwc_shape = n0.get_normal_output_shape()
        dtype = n0.get_output_datatype()
        dwc_output_tensor = oh.make_tensor_value_info(
            model.make_new_valueinfo_name(),
            TensorProto.FLOAT,
            dwc_shape,
        )
        graph.value_info.append(dwc_output_tensor)
        dwc_node = oh.make_node(
            "StreamingDataWidthConverter_Batch",
            [n_output],
            [dwc_output_tensor.name],
            domain="finn",
            backend="fpgadataflow",
            shape=dwc_shape,
            inWidth=dwc_in_width,
            outWidth=dwc_out_width,
            dataType=str(dtype.name),
        )
        # place the converter right behind the producer
        graph.node.insert(node_ind + 1, dwc_node)
        # the consumer now reads from the converter
        consumer.input[0] = dwc_output_tensor.name
    return (model, graph_modified)
def rtlsim_exec(model, execution_context):
    """Use PyVerilator to execute given model with stitched IP. The execution
    context contains the input values.

    The first graph input is read from ``execution_context``, reshaped to the
    folded input shape of its consumer node, packed and fed to the
    simulation. The unpacked output is written into ``execution_context``
    under the first graph output's name, and the reported cycle count is
    stored in the "sim_cycles" metadata property.
    """
    if PyVerilator is None:
        raise ImportError("Installation of PyVerilator is required.")
    # ensure stitched ip project already exists
    assert os.path.isfile(model.get_metadata_prop("wrapper_filename")), """The file name from metadata property "wrapper_filename" doesn't exist."""
    assert os.path.isdir(model.get_metadata_prop("vivado_stitch_proj")), """The directory from metadata property "vivado_stitch_proj" doesn't exist"""
    trace_file = model.get_metadata_prop("rtlsim_trace")

    # input side
    # TODO extend for multiple inputs
    in_name = model.graph.input[0].name
    in_tensor = execution_context[in_name]
    in_dtype = model.get_tensor_datatype(in_name)
    first_node = getCustomOp(model.find_consumer(in_name))
    in_stream_width = first_node.get_instream_width()
    # convert input into time multiplexed shape
    in_folded_shape = first_node.get_folded_input_shape()
    # TODO any other layout transformations need to happen here!
    in_tensor = in_tensor.reshape(in_folded_shape)

    # output side
    out_name = model.graph.output[0].name
    out_shape = model.get_tensor_shape(out_name)
    out_dtype = model.get_tensor_datatype(out_name)
    last_node = getCustomOp(model.find_producer(out_name))
    out_folded_shape = last_node.get_folded_output_shape()
    out_stream_width = last_node.get_outstream_width()
    packed_bits = out_stream_width
    target_bits = out_dtype.bitwidth()

    # pack input
    packed_input = npy_to_rtlsim_input(in_tensor, in_dtype, in_stream_width)
    num_out_values = last_node.get_number_output_values()

    # reuse the compiled simulation library when one is recorded and exists,
    # otherwise build it and remember where it was written
    rtlsim_so = model.get_metadata_prop("rtlsim_so")
    if rtlsim_so is not None and os.path.isfile(rtlsim_so):
        sim = PyVerilator(rtlsim_so)
    else:
        sim = pyverilate_stitched_ip(model)
        model.set_metadata_prop("rtlsim_so", sim.lib._name)

    _reset_rtlsim(sim)
    _toggle_clk(sim)
    sim_result = _run_rtlsim(sim, packed_input, num_out_values, trace_file)
    packed_output = sim_result[0]
    model.set_metadata_prop("sim_cycles", str(sim_result[1]))

    # unpack output and put into context
    out_folded_tensor = rtlsim_output_to_npy(
        packed_output, None, out_dtype, out_folded_shape, packed_bits, target_bits
    )
    execution_context[out_name] = out_folded_tensor.reshape(out_shape)
def test_dataflow_partition_tlastmarker():
    """Insert a TLastMarker into the child dataflow model and check its attributes."""
    parent = ModelWrapper(build_dir + "/test_dataflow_partition_create.onnx")
    # node 2 of the parent graph carries the path of the child model
    child_path = getCustomOp(parent.graph.node[2]).get_nodeattr("model")
    model = ModelWrapper(child_path).transform(InsertTLastMarker())

    last_node = model.graph.node[-1]
    assert last_node.op_type == "TLastMarker"
    assert last_node.domain == "finn"
    tl_node = getCustomOp(last_node)
    assert tl_node.get_nodeattr("NumIters") == 1
    assert tl_node.get_nodeattr("StreamWidth") == 320
    assert tl_node.get_nodeattr("ElemWidth") == 32
    model.save(build_dir + "/test_dataflow_partition_tlastmarker.onnx")

    # apply the transformation a second time and store that result too
    model = model.transform(InsertTLastMarker())
    model.save(build_dir + "/test_dataflow_partition_tlastmarker2.onnx")
def test_end2end_tfc_w1a2_verify_dataflow_part():
    """Check that npysim, node-by-node rtlsim and whole-network rtlsim agree."""
    model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_ipstitch.onnx")
    inp_name = model.graph.input[0].name
    out_name = model.graph.output[0].name
    inp_dict = {inp_name: np.zeros((1, 784), dtype=np.float32)}

    # npysim
    model = (
        model.transform(CodeGen_npysim())
        .transform(Compile())
        .transform(SetExecMode("npysim"))
    )
    model.save(build_dir + "/end2end_tfc_w1a2_ipstitch_npysim.onnx")
    res_npysim = execute_onnx(model, inp_dict, True)[out_name]

    # node-by-node rtlsim, with tracing enabled on the first four nodes
    model = model.transform(SetExecMode("rtlsim"))
    for idx in range(4):
        getCustomOp(model.graph.node[idx]).set_nodeattr("rtlsim_trace", "default")
    model.save(build_dir + "/end2end_tfc_w1a2_ipstitch_nodebynode_rtlsim.onnx")
    res_rtlsim_nodebynode = execute_onnx(model, inp_dict, True)[out_name]

    # whole-network (ip-stitched) rtlsim
    model.set_metadata_prop("exec_mode", "rtlsim")
    model.set_metadata_prop("rtlsim_trace", "whole_trace.vcd")
    model.save(build_dir + "/end2end_tfc_w1a2_ipstitch_whole_rtlsim.onnx")
    res_rtlsim_whole = execute_onnx(model, inp_dict, True)[out_name]

    assert np.isclose(res_npysim, res_rtlsim_nodebynode).all()
    assert np.isclose(res_npysim, res_rtlsim_whole).all()
def test_end2end_mobilenet_folding():
    """Apply the MobileNet folding configuration and save the folded model."""
    model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_hls_layers.onnx")
    # optional extra folding to use fewer resources,
    # applied while setting the attributes on each node
    assert extra_fold in [1, 2, 4]

    # folding for the StreamingFCLayer_Batch nodes:
    # each tuple is (PE, SIMD, ram_style) for a layer
    fc_folding = [
        (32, 3, "block"),
        (16, 16, "block"),
        (16, 16, "block"),
        (32, 16, "block"),
        (16, 16, "block"),
        (32, 16, "block"),
        (16, 16, "block"),
        (32, 16, "block"),
        (32, 16, "block"),
        (32, 16, "block"),
        (32, 16, "block"),
        (32, 16, "block"),
        (16, 16, "block"),
        (32, 16, "block"),
        (4, 4, "block"),
    ]
    fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
    for fc_node, (fc_pe, fc_simd, fc_ram_style) in zip(fc_layers, fc_folding):
        fc_op = getCustomOp(fc_node)
        fc_op.set_nodeattr("PE", fc_pe // extra_fold)
        fc_op.set_nodeattr("SIMD", fc_simd)
        fc_op.set_nodeattr("ram_style", fc_ram_style)
    # first layer uses 8-bit weights & activations
    # control its compute resource type explicitly
    getCustomOp(fc_layers[0]).set_nodeattr("resType", first_layer_res_type)

    # folding for the depthwise conv layers impl'd by VVAUs:
    # each value is PE for a layer
    vvau_pe_values = [32, 32, 64, 16, 32, 8, 16, 16, 16, 16, 16, 4, 8]
    vvau_layers = model.get_nodes_by_op_type("Vector_Vector_Activate_Batch")
    for vvau, vvau_pe in zip(vvau_layers, vvau_pe_values):
        folded_pe = vvau_pe // extra_fold
        getCustomOp(vvau).set_nodeattr("PE", folded_pe)
        # set SIMD in preceding ConvInputGen to same value
        convinputgen = model.find_direct_predecessors(vvau)[0]
        getCustomOp(convinputgen).set_nodeattr("SIMD", folded_pe)
        # set SIMD in preceding FMPadding to same value
        padding = model.find_direct_predecessors(convinputgen)[0]
        if padding.op_type == "FMPadding_Batch":
            getCustomOp(padding).set_nodeattr("SIMD", folded_pe)

    # adjust final pooling layer + its inpgen
    pool_node = model.get_nodes_by_op_type("Pool_Batch")[0]
    getCustomOp(pool_node).set_nodeattr("PE", 4 // extra_fold)
    pool_inpgen = model.find_direct_predecessors(pool_node)[0]
    getCustomOp(pool_inpgen).set_nodeattr("SIMD", 4 // extra_fold)

    model = model.transform(InferDataLayouts())
    model.save(build_dir + "/end2end_mobilenet_folded.onnx")
def apply(self, model):
    """Absorb an NHWC->NCHW Transpose in front of a MultiThreshold node.

    Looks for a non-forking Transpose with perm [0, 3, 1, 2] whose consumer
    is a non-forking MultiThreshold. The MultiThreshold is switched to the
    "NHWC" data layout and the leading Transpose is removed. If the node
    after the MultiThreshold is a Transpose with perm [0, 2, 3, 1], that
    one is removed as well; if it is any other kind of node, a new
    Transpose with perm [0, 3, 1, 2] is inserted behind the MultiThreshold
    instead. A trailing Transpose with a different perm leaves the graph
    untouched.

    A Transpose or MultiThreshold without a consumer is skipped rather
    than dereferenced.

    Returns ``(model, graph_modified)``; datatypes are re-inferred when the
    graph was modified.
    """
    graph = model.graph
    # incremented before use, so inside the loop it is the 1-based position
    node_ind = 0
    graph_modified = False
    for n in graph.node:
        node_ind += 1
        if n.op_type != "Transpose" or model.is_fork_node(n):
            continue
        perms = list(get_by_name(n.attribute, "perm").ints)
        if perms != [0, 3, 1, 2]:
            continue
        mt_cand = model.find_consumer(n.output[0])
        # find_consumer yields None when nothing consumes the tensor
        if mt_cand is None or mt_cand.op_type != "MultiThreshold":
            continue
        if model.is_fork_node(mt_cand):
            continue
        final_t_cand = model.find_consumer(mt_cand.output[0])
        if final_t_cand is None:
            # MultiThreshold output is not consumed; nothing to rewire
            continue
        if final_t_cand.op_type == "Transpose":
            perms = list(get_by_name(final_t_cand.attribute, "perm").ints)
            if perms == [0, 2, 3, 1]:
                mt = getCustomOp(mt_cand)
                mt.set_nodeattr("data_layout", "NHWC")
                # get rid of transpose nodes, wire MT directly
                mt_cand.input[0] = n.input[0]
                mt_cand.output[0] = final_t_cand.output[0]
                graph.node.remove(n)
                graph.node.remove(final_t_cand)
                graph_modified = True
        else:
            mt = getCustomOp(mt_cand)
            mt.set_nodeattr("data_layout", "NHWC")
            # get rid of first transpose node
            mt_cand.input[0] = n.input[0]
            graph.node.remove(n)
            # fix output shape for MultiThreshold
            mt_ishape = model.get_tensor_shape(mt_cand.input[0])
            model.set_tensor_shape(mt_cand.output[0], mt_ishape)
            # re-insert Transpose behind MultiThreshold
            transpose_output = model.make_new_valueinfo_name()
            new_transpose = oh.make_node(
                "Transpose",
                [mt_cand.output[0]],
                [transpose_output],
                perm=[0, 3, 1, 2],
            )
            graph.node.insert(node_ind + 1, new_transpose)
            final_t_cand.input[0] = transpose_output
            graph_modified = True
    if graph_modified:
        model = model.transform(InferDataTypes())
    return (model, graph_modified)
def apply(self, model):
    """Absorb an NHWC->NCHW Transpose in front of a MultiThreshold node.

    Looks for a non-forking Transpose with perm [0, 3, 1, 2] whose consumer
    is a non-forking MultiThreshold, then inspects the node after the
    MultiThreshold:

    * Transpose with perm [0, 2, 3, 1]: both Transposes are removed and the
      MultiThreshold is switched to the "NHWC" data layout.
    * Reshape with a 2-dimensional output: only the leading Transpose is
      removed, the MultiThreshold is switched to "NHWC" and its output
      shape is set to its new input shape.

    A Transpose or MultiThreshold without a consumer is skipped rather
    than dereferenced.

    Returns ``(model, graph_modified)``; datatypes are re-inferred when the
    graph was modified.
    """
    graph = model.graph
    graph_modified = False
    for n in graph.node:
        if n.op_type != "Transpose" or model.is_fork_node(n):
            continue
        perms = list(get_by_name(n.attribute, "perm").ints)
        if perms != [0, 3, 1, 2]:
            continue
        mt_cand = model.find_consumer(n.output[0])
        # find_consumer yields None when nothing consumes the tensor
        if mt_cand is None or mt_cand.op_type != "MultiThreshold":
            continue
        if model.is_fork_node(mt_cand):
            continue
        final_t_cand = model.find_consumer(mt_cand.output[0])
        if final_t_cand is None:
            # MultiThreshold output is not consumed; pattern cannot match
            continue
        if final_t_cand.op_type == "Transpose":
            perms = list(get_by_name(final_t_cand.attribute, "perm").ints)
            if perms == [0, 2, 3, 1]:
                mt = getCustomOp(mt_cand)
                mt.set_nodeattr("data_layout", "NHWC")
                # get rid of transpose nodes, wire MT directly
                mt_cand.input[0] = n.input[0]
                mt_cand.output[0] = final_t_cand.output[0]
                graph.node.remove(n)
                graph.node.remove(final_t_cand)
                graph_modified = True
        elif final_t_cand.op_type == "Reshape":
            oshape = model.get_tensor_shape(final_t_cand.output[0])
            if len(oshape) == 2:
                # transition to FC part, can still use NHWC
                mt = getCustomOp(mt_cand)
                mt.set_nodeattr("data_layout", "NHWC")
                # get rid of first transpose node
                mt_cand.input[0] = n.input[0]
                # fix output shape for MultiThreshold
                mt_ishape = model.get_tensor_shape(mt_cand.input[0])
                (_, h, w, _) = mt_ishape
                assert (h == 1 and w == 1), """Untested spatial dim in conv->fc transition, proceed with caution!"""
                model.set_tensor_shape(mt_cand.output[0], mt_ishape)
                graph.node.remove(n)
                graph_modified = True
    if graph_modified:
        model = model.transform(InferDataTypes())
    return (model, graph_modified)
def test_fpgadataflow_ipstitch_iodma_floorplan():
    """Insert IODMA nodes, run Floorplan, and check the assigned partition ids."""
    model = create_one_fc_model()
    first_node = model.graph.node[0]
    if first_node.op_type == "StreamingDataflowPartition":
        # continue with the child model referenced by the partition node
        sdp_node = getCustomOp(first_node)
        assert sdp_node.__class__.__name__ == "StreamingDataflowPartition"
        assert os.path.isfile(sdp_node.get_nodeattr("model"))
        model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model"))

    model = (
        model.transform(InferDataLayouts())
        .transform(InsertIODMA())
        .transform(Floorplan())
    )
    assert getCustomOp(model.graph.node[0]).get_nodeattr("partition_id") == 0
    assert getCustomOp(model.graph.node[1]).get_nodeattr("partition_id") == 2
    assert getCustomOp(model.graph.node[2]).get_nodeattr("partition_id") == 1
    model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_iodma_floorplan.onnx")
def apply(self, model):
    """Set node attributes from a configuration dict or JSON file.

    ``self.config`` is either a dict or a path to a JSON file. The
    configuration maps node names to ``{attribute: value}`` dicts, plus a
    "Defaults" entry mapping attribute names to ``[value, op_types]`` where
    ``op_types`` is "all" or a container of op types the default applies
    to. Defaults are applied first, then the node's own entry.

    Warnings are emitted for nodes without a configuration entry and for
    configuration entries that were not used. Returns ``(model, False)``.
    """
    if isinstance(self.config, dict):
        model_config = self.config
    else:
        with open(self.config, "r") as config_file:
            model_config = json.load(config_file)

    used_configurations = ["Defaults"]
    missing_configurations = []

    # Configure network
    for node in model.graph.node:
        try:
            node_config = model_config[node.name]
        except KeyError:
            missing_configurations.append(node.name)
            node_config = {}

        from finn.custom_op.registry import getCustomOp

        # nodes that cannot be wrapped as a custom op are skipped
        try:
            inst = getCustomOp(node)
        except Exception:
            continue
        used_configurations.append(node.name)

        # collect the defaults that apply to this node's op type before
        # setting anything, skipping names that are also top-level keys
        applicable_defaults = [
            (attr, spec[0])
            for attr, spec in model_config["Defaults"].items()
            if attr not in model_config
            and (spec[1] == "all" or node.op_type in spec[1])
        ]
        for attr, value in applicable_defaults:
            inst.set_nodeattr(attr, value)

        # the node's own configuration overrides the defaults
        for attr, value in node_config.items():
            inst.set_nodeattr(attr, value)

    # Configuration verification
    if missing_configurations:
        warnings.warn("\nNo HW configuration for nodes: " + ", ".join(missing_configurations))

    unused_configs = [
        name for name in model_config if name not in used_configurations
    ]
    if unused_configs:
        warnings.warn("\nUnused HW configurations: " + ", ".join(unused_configs))

    # one iteration is enough
    return (model, False)
def apply(self, model):
    """Rewrite relative paths in generated Verilog files to absolute ones.

    For every fpgadataflow node whose "ipgen_path" attribute names an
    existing directory, each ``.v`` file below it is rewritten so that
    ``$readmemh(".`` and ``"./`` prefixes point at the directory that
    contains the file. A KeyError raised while handling a node is ignored.

    Returns ``(model, False)``.
    """
    for node in model.graph.node:
        if is_fpgadataflow_node(node) is not True:
            continue
        try:
            # lookup op_type in registry of CustomOps
            inst = registry.getCustomOp(node)
            # find the IP gen dir
            ipgen_path = inst.get_nodeattr("ipgen_path")
            if ipgen_path is None or not os.path.isdir(ipgen_path):
                continue
            for dname, _dirs, files in os.walk(ipgen_path):
                for fname in files:
                    if not fname.endswith(".v"):
                        continue
                    fpath = os.path.join(dname, fname)
                    with open(fpath, "r") as src:
                        text = src.read()
                    # the $readmemh form is replaced first, then any
                    # remaining quoted "./" prefix
                    text = text.replace('$readmemh(".', '$readmemh("%s' % dname)
                    text = text.replace('"./', '"%s/' % dname)
                    with open(fpath, "w") as dst:
                        dst.write(text)
        except KeyError:
            pass
    return (model, False)
def applyNodeLocal(self, node):
    """Compile the generated cppsim code of a single fpgadataflow node.

    Nodes that are not fpgadataflow nodes are returned untouched. A
    KeyError raised while processing the node is re-raised as an Exception
    stating that the op type is not supported.

    Returns ``(node, False)``.
    """
    op_type = node.op_type
    if is_fpgadataflow_node(node) is not True:
        return (node, False)
    try:
        # lookup op_type in registry of CustomOps
        inst = registry.getCustomOp(node)
        # code generation must have happened already
        code_gen_dir = inst.get_nodeattr("code_gen_dir_cppsim")
        assert (
            code_gen_dir != ""
        ), """Node attribute "code_gen_dir_cppsim" is not set. Please run Transformation PrepareCppSim first."""
        # call the compilation function for this node
        inst.compile_singlenode_code()
        # compilation is expected to record the executable path
        executable_path = inst.get_nodeattr("executable_path")
        assert (
            executable_path != ""
        ), """Transformation compile was not successful, there is no path to executables set in node attribute "executable_path"."""
    except KeyError:
        # exception if op_type is not supported
        raise Exception(
            "Custom op_type %s is currently not supported." % op_type
        )
    return (node, False)
def connect_clk_rst(self, node):
    """Queue the commands that hook up clock and reset for ``node``.

    The first node handled has its clock and reset pins made external
    (renamed to ap_clk / ap_rst_n) and recorded in ``self.intf_names``;
    every later node is connected to those external ports. Commands are
    appended to ``self.connect_cmds``.
    """
    inst_name = node.name
    node_inst = getCustomOp(node)
    clock_intf_name = node_inst.get_verilog_top_module_intf_names()["clk"][0]
    reset_intf_name = node_inst.get_verilog_top_module_intf_names()["rst"][0]
    clk_pin = (inst_name, clock_intf_name)
    rst_pin = (inst_name, reset_intf_name)

    if self.clock_reset_are_external:
        # external ports already exist: just connect this node to them
        self.connect_cmds.append(
            "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/%s]" % rst_pin
        )
        self.connect_cmds.append(
            "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/%s]" % clk_pin
        )
        return

    # make clock and reset external, since they aren't already
    self.connect_cmds.append("make_bd_pins_external [get_bd_pins %s/%s]" % clk_pin)
    self.connect_cmds.append("set_property name ap_clk [get_bd_ports ap_clk_0]")
    self.connect_cmds.append("make_bd_pins_external [get_bd_pins %s/%s]" % rst_pin)
    self.connect_cmds.append("set_property name ap_rst_n [get_bd_ports ap_rst_n_0]")
    self.clock_reset_are_external = True
    self.intf_names["clk"] = ["ap_clk"]
    self.intf_names["rst"] = ["ap_rst_n"]
def connect_axi(self, node):
    """Queue the commands that make ``node``'s AXI interfaces external.

    An AXI-lite interface, if present, is made external and registered in
    ``self.intf_names["axilite"]`` under ``<name>_<index>``. An AXI-MM
    interface, if present, is made external and renamed to m_axi_gmem0;
    an assertion enforces that this happens at most once per instance.
    """
    inst_name = node.name
    node_inst = getCustomOp(node)
    axilite_names = node_inst.get_verilog_top_module_intf_names()["axilite"]
    aximm_names = node_inst.get_verilog_top_module_intf_names()["aximm"]

    if len(axilite_names) > 0:
        axilite_name = axilite_names[0]
        self.connect_cmds.append(
            "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
            % (inst_name, axilite_name)
        )
        # suffix with the count of AXI-lite interfaces registered so far
        registered = self.intf_names["axilite"]
        registered.append("%s_%d" % (axilite_name, len(registered)))

    if len(aximm_names) > 0:
        self.connect_cmds.append(
            "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]"
            % (inst_name, aximm_names[0])
        )
        self.connect_cmds.append(
            "set_property name m_axi_gmem0 [get_bd_intf_ports m_axi_gmem_0]"
        )
        self.intf_names["aximm"] = ["m_axi_gmem0"]
        assert self.has_aximm is False, "Currently limited to one AXI-MM interface"
        self.has_aximm = True
def apply(self, model):
    """Bypass and remove StreamingFIFO nodes at or below the depth threshold.

    A FIFO whose "depth" attribute is <= ``self.shallow_threshold`` is
    bypassed: if its output has consumers, the single consumer is rewired
    to read the FIFO's input tensor; otherwise the producer is rewired to
    write the FIFO's output tensor. The bypassed FIFOs are removed from
    the graph after the scan.

    Returns ``(model, False)``.
    """
    shallow_fifos = []
    for node in model.graph.node:
        if node.op_type != "StreamingFIFO":
            continue
        if getCustomOp(node).get_nodeattr("depth") > self.shallow_threshold:
            continue
        # bypass shallow fifos
        shallow_fifos.append(node)
        fifo_in = node.input[0]
        fifo_out = node.output[0]
        consumers = model.find_consumers(fifo_out)
        if consumers is not None:
            assert len(consumers) == 1, "Fanout detected from FIFO output"
            consumer = consumers[0]
            # the consumer now reads what used to feed the FIFO
            for idx, tensor_name in enumerate(consumer.input):
                if tensor_name == node.output[0]:
                    consumer.input[idx] = node.input[0]
        else:
            # nothing consumes the FIFO output, so rename the producer's
            # output to it instead
            producer = model.find_producer(fifo_in)
            for idx, tensor_name in enumerate(producer.output):
                if tensor_name == node.input[0]:
                    producer.output[idx] = node.output[0]
    # now filter out
    for fifo in shallow_fifos:
        model.graph.node.remove(fifo)
    return (model, False)
def apply(self, model):
    """Cap the depth of FIFOs between an input generator and an FC layer.

    Applies to the pattern
    ConvolutionInputGenerator -> StreamingFIFO -> StreamingFCLayer_Batch.
    The FIFO depth is reduced to at most ``optimize_depth(w * ifold)``
    (taken from the FIFO's folded input shape) and the implementation
    style is chosen by comparing the new depth to ``self.max_qsrl_depth``.

    Returns ``(model, False)``.
    """
    # TODO move this to own transformation
    for node in model.graph.node:
        if node.op_type != "StreamingFIFO":
            continue
        fifo_prod = model.find_producer(node.input[0])
        fifo_cons = model.find_consumer(node.output[0])
        # only the SWG -> FIFO -> FC pattern is of interest
        if fifo_prod is None or fifo_prod.op_type != "ConvolutionInputGenerator":
            continue
        if fifo_cons is None or fifo_cons.op_type != "StreamingFCLayer_Batch":
            continue
        fifo_inst = getCustomOp(node)
        current_depth = fifo_inst.get_nodeattr("depth")
        # SWG has an internal buffer of 1 row, so we use this as a
        # rule of thumb to set FIFO depth to be no larger than 1 row
        (bs, h, w, ifold, simd) = fifo_inst.get_folded_input_shape()
        capped_depth = min(optimize_depth(w * ifold), current_depth)
        fifo_inst.set_nodeattr("depth", capped_depth)
        # Set FIFO implementation/ram styles
        if capped_depth <= self.max_qsrl_depth:
            fifo_inst.set_nodeattr("impl_style", "rtl")
        else:
            fifo_inst.set_nodeattr("impl_style", "vivado")
            fifo_inst.set_nodeattr("ram_style", "auto")
    return (model, False)
def test_end2end_tfc_w1a2_run_on_pynq():
    """Run the deployed TFC-w1a2 model and compare against the streamlined model.

    The test is skipped when the PYNQ_IP environment variable is unset or
    empty.
    """
    # use the streamlined model as the "golden" model for right answers
    golden = ModelWrapper(build_dir + "/end2end_tfc_w1a2_streamlined.onnx")
    iname = golden.graph.input[0].name
    oname = golden.graph.output[0].name
    raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb")
    input_tensor = onnx.load_tensor_from_string(raw_i)
    x = nph.to_array(input_tensor)
    # run using FINN-based execution
    ret_golden = execute_onnx(golden, {iname: x}, True)
    y_golden = ret_golden[oname]
    # set up parent+child graph to test
    # we'll use models from the previous step as the child model
    parent_model = ModelWrapper(build_dir + "/end2end_tfc_w1a2_dataflow_parent.onnx")
    iname = parent_model.graph.input[0].name
    oname = parent_model.graph.output[0].name

    # Only the environment lookup may cause a skip. Previously the whole
    # execution sat inside `try/except KeyError`, so a KeyError raised by
    # the run itself was reported as a skip instead of a failure.
    ip = os.environ.get("PYNQ_IP", "")
    if ip == "":
        pytest.skip("PYNQ board IP address not specified")

    # point the partition node at the deployment model and execute
    sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
    sdp_node = getCustomOp(sdp_node)
    sdp_node.set_nodeattr("model", build_dir + "/end2end_tfc_w1a2_pynq_deploy.onnx")
    ret = execute_onnx(parent_model, {iname: x}, True)
    y = ret[oname]
    assert np.isclose(y, y_golden).all()
def applyNodeLocal(self, node):
    """Run IP generation for a single fpgadataflow node.

    Nodes that are not fpgadataflow nodes are returned untouched. If the
    node's "ipgen_path" already names an existing directory, generation is
    skipped and a warning is emitted. A KeyError raised while processing
    the node is re-raised as an Exception stating that the op type is not
    supported.

    Returns ``(node, False)``.
    """
    op_type = node.op_type
    if is_fpgadataflow_node(node) is not True:
        return (node, False)
    try:
        # lookup op_type in registry of CustomOps
        inst = registry.getCustomOp(node)
        # code generation must have happened already
        assert (inst.get_nodeattr("code_gen_dir_ipgen") != ""), """Node attribute "code_gen_dir_ipgen" is empty. Please run transformation PrepareIP first."""
        if os.path.isdir(inst.get_nodeattr("ipgen_path")):
            warnings.warn("Using pre-existing IP for %s" % node.name)
        else:
            # call the compilation function for this node
            inst.ipgen_singlenode_code()
        # the IP path is expected to be recorded at this point
        assert (inst.get_nodeattr("ipgen_path") != ""), """Transformation HLSSynthIP was not successful. Node attribute "ipgen_path" is empty."""
    except KeyError:
        # exception if op_type is not supported
        raise Exception(
            "Custom op_type %s is currently not supported." % op_type)
    return (node, False)
def apply(self, model):
    """Annotate every fpgadataflow node with its expected cycle count.

    The value returned by the node's ``get_exp_cycles()`` is stored in its
    "cycles_estimate" attribute. StreamingDataflowPartition nodes are
    handled by loading the referenced child model, annotating it with a
    fresh AnnotateCycles pass and saving it back to the same file.

    Returns ``(model, False)``.
    """
    for node in model.graph.node:
        if _is_fpgadataflow_node(node):
            # annotate node cycles
            op_inst = registry.getCustomOp(node)
            op_inst.set_nodeattr("cycles_estimate", op_inst.get_exp_cycles())
            continue
        if node.op_type != "StreamingDataflowPartition":
            continue
        # recurse into model to manually annotate per-layer cycles
        child_filename = getCustomOp(node).get_nodeattr("model")
        child_model = ModelWrapper(child_filename).transform(AnnotateCycles())
        # save transformed model
        child_model.save(child_filename)
    return (model, False)