def apply(self, model):
    """Shrink accumulator widths on every fpgadataflow node that supports it.

    Nodes without a minimize_accumulator_width method are left untouched.
    Returns (model, False) since a single pass suffices."""
    for graph_node in model.graph.node:
        if is_fpgadataflow_node(graph_node) is not True:
            continue
        custom_op = getCustomOp(graph_node)
        # only some layer types implement accumulator minimization
        if hasattr(custom_op, "minimize_accumulator_width"):
            custom_op.minimize_accumulator_width(model)
    return (model, False)
def floorplan_params(model):
    """Gathers SLR and partition IDs from nodes.

    Returns {node name : {slr, device id, partition id, memory port}}."""
    # global defaults, consumed by the floorplanning flow
    ret_dict = {
        "Defaults": {
            "slr": [-1, ["all"]],
            "partition_id": [0, ["all"]],
            "device_id": [0, ["all"]],
            "mem_port": ["", ["all"]],
        }
    }
    for node in model.graph.node:
        if is_fpgadataflow_node(node) is not True:
            continue
        inst = getCustomOp(node)
        ret_dict[node.name] = {
            "slr": inst.get_nodeattr("slr"),
            "partition_id": inst.get_nodeattr("partition_id"),
            # device_id is not a node attribute; always reported as 0
            "device_id": 0,
            "mem_port": inst.get_nodeattr("mem_port"),
        }
    return ret_dict
def applyNodeLocal(self, node):
    """Compile the generated cppsim code for a single fpgadataflow node.

    Expects PrepareCppSim to have been run beforehand; raises an Exception
    when the node's op_type has no CustomOp registry entry."""
    op_type = node.op_type
    if is_fpgadataflow_node(node) is True:
        try:
            # look the node up in the CustomOp registry
            node_inst = registry.getCustomOp(node)
            # code generation must already have happened
            assert node_inst.get_nodeattr("code_gen_dir_cppsim") != "", """Node attribute "code_gen_dir_cppsim" is not set. Please run Transformation PrepareCppSim first."""
            # compile the per-node executable
            node_inst.compile_singlenode_code()
            # compilation must have recorded an executable path
            assert node_inst.get_nodeattr("executable_path") != "", """Transformation compile was not successful, there is no path to executables set in node attribute "executable_path"."""
        except KeyError:
            # op_type has no registry entry
            raise Exception(
                "Custom op_type %s is currently not supported." % op_type
            )
    return (node, False)
def apply(self, model):
    """Rewrite relative paths in generated Verilog files to absolute ones.

    Walks each node's ipgen directory and anchors $readmemh / "./" style
    references at the directory that contains the Verilog file."""
    for node in model.graph.node:
        if is_fpgadataflow_node(node) is True:
            try:
                node_inst = registry.getCustomOp(node)
                # find the IP gen dir
                ipgen_path = node_inst.get_nodeattr("ipgen_path")
                if ipgen_path is None or not os.path.isdir(ipgen_path):
                    continue
                for dname, _, files in os.walk(ipgen_path):
                    for fname in files:
                        if not fname.endswith(".v"):
                            continue
                        fpath = os.path.join(dname, fname)
                        with open(fpath, "r") as fh:
                            contents = fh.read()
                        # order matters: fix $readmemh(". before the
                        # generic "./ replacement
                        contents = contents.replace(
                            '$readmemh(".', '$readmemh("%s' % dname
                        )
                        contents = contents.replace('"./', '"%s/' % dname)
                        with open(fpath, "w") as fh:
                            fh.write(contents)
            except KeyError:
                # node not in registry: nothing to rewrite, skip silently
                pass
    return (model, False)
def applyNodeLocal(self, node):
    """Run HLS IP generation for a single fpgadataflow node.

    Skips synthesis and warns when an IP directory already exists for the
    node. Expects PrepareIP to have been run beforehand."""
    op_type = node.op_type
    if is_fpgadataflow_node(node) is True:
        try:
            node_inst = registry.getCustomOp(node)
            # code generation (PrepareIP) must already have happened
            assert node_inst.get_nodeattr("code_gen_dir_ipgen") != "", """Node attribute "code_gen_dir_ipgen" is empty. Please run transformation PrepareIP first."""
            if os.path.isdir(node_inst.get_nodeattr("ipgen_path")):
                # reuse previously generated IP
                warnings.warn("Using pre-existing IP for %s" % node.name)
            else:
                # run HLS synthesis for this node
                node_inst.ipgen_singlenode_code()
            # synthesis must have recorded an IP directory path
            assert node_inst.get_nodeattr("ipgen_path") != "", """Transformation HLSSynthIP was not successful. Node attribute "ipgen_path" is empty."""
        except KeyError:
            # op_type has no registry entry
            raise Exception(
                "Custom op_type %s is currently not supported." % op_type)
    return (node, False)
def _suitable_node(node):
    """A node qualifies if it exists, is an fpgadataflow node, and is not
    itself a data-width converter (DWC)."""
    if node is None:
        return False
    if is_fpgadataflow_node(node) is not True:
        return False
    return _is_dwc_node(node) is False
def res_estimation(model):
    """Estimates the resources needed for the given model.

    Returns {node name : resource estimation}."""
    res_dict = {}
    for node in model.graph.node:
        if is_fpgadataflow_node(node) is True:
            # use the registry helper instead of indexing registry.custom_op
            # directly, consistent with the other analysis passes; also drops
            # the now-unneeded op_type local
            inst = registry.getCustomOp(node)
            res_dict[node.name] = inst.node_res_estimation()
    return res_dict
def res_estimation(model):
    """Estimates the resources needed for the given model.

    Ensure that all nodes have unique names (by calling the
    GiveUniqueNodeNames transformation) prior to calling this analysis pass
    to ensure all nodes are visible in the results.

    Returns {node name : resource estimation}."""
    estimates = {}
    for node in model.graph.node:
        if is_fpgadataflow_node(node) is not True:
            continue
        estimates[node.name] = registry.getCustomOp(node).node_res_estimation()
    return estimates
def _suitable_node(node):
    """DWC insertion candidates: fpgadataflow nodes that are neither DWCs
    nor IODMAs."""
    if node is None:
        return False
    if is_fpgadataflow_node(node) is not True:
        return False
    if _is_dwc_node(node):
        # no DWC for DWCs
        return False
    if node.op_type == "IODMA":
        # IODMA data shapes/widths need special handling
        return False
    return True
def exp_cycles_per_layer(model):
    """Estimates the number of cycles per sample for dataflow layers in
    the given model.

    Ensure that all nodes have unique names (by calling the
    GiveUniqueNodeNames transformation) prior to calling this analysis pass
    to ensure all nodes are visible in the results.

    Returns {node name : cycle estimation}."""
    cycle_dict = {}
    for node in model.graph.node:
        if is_fpgadataflow_node(node) is not True:
            continue
        node_inst = registry.getCustomOp(node)
        cycle_dict[node.name] = int(node_inst.get_exp_cycles())
    return cycle_dict
def applyNodeLocal(self, node):
    """Prepare RTL simulation (pyverilator) for a single fpgadataflow node.

    Raises an Exception when the node's op_type has no CustomOp registry
    entry, or asserts when preparation did not produce an rtlsim library."""
    op_type = node.op_type
    if is_fpgadataflow_node(node) is True:
        try:
            # use the registry helper instead of indexing registry.custom_op
            # directly, consistent with the other node-local transformations
            inst = registry.getCustomOp(node)
            inst.prepare_rtlsim()
            # ensure that the rtlsim shared library path is now set
            assert (
                inst.get_nodeattr("rtlsim_so") != ""
            ), "Failed to prepare RTLSim, no rtlsim_so attribute found."
        except KeyError:
            # exception if op_type is not supported
            raise Exception(
                "Custom op_type %s is currently not supported." % op_type)
    return (node, False)
def dataflow_performance(model):
    """Extract key performance indicators from given model with dataflow nodes.

    Note that the latency (critical path) analysis is very pessimistic: it
    assumes no overlap between executions and simply sums the expected cycles
    for each node along the critical path.

    Preconditions:
    - model consists of fpgadataflow nodes
    - model has cycle estimates annotated (see AnnotateCycles transformation)
    - nodes have unique names (see GiveUniqueNodeNames)

    Returns:
    - max_cycles : number of cycles for slowest node
    - max_cycles_node_name : name of slowest node
    - critical_path_cycles : pessimistic expected latency from input to output
    """
    latency_at_node_output = {}
    max_cycles = 0
    max_node_name = ""
    for node in model.graph.node:
        if is_fpgadataflow_node(node) is not True:
            continue
        node_cycles = int(getCustomOp(node).get_nodeattr("cycles_estimate"))
        # track the slowest node (throughput bottleneck)
        if node_cycles > max_cycles:
            max_cycles = node_cycles
            max_node_name = node.name
        if node.name in latency_at_node_output:
            continue
        # output latency = this node's cycles + slowest predecessor output
        predecessors = model.find_direct_predecessors(node)
        if predecessors is None:
            # graph input: nothing upstream contributes latency
            max_pred_latency = 0
        else:
            max_pred_latency = max(
                latency_at_node_output[pred.name] for pred in predecessors
            )
        latency_at_node_output[node.name] = node_cycles + max_pred_latency
    critical_path_cycles = max(latency_at_node_output.values())
    return {
        "critical_path_cycles": int(critical_path_cycles),
        "max_cycles": int(max_cycles),
        "max_cycles_node_name": max_node_name,
    }
def hls_synth_res_estimation(model):
    """Extracts the FPGA resource results from the Vivado HLS synthesis
    estimates.

    Ensure that all nodes have unique names (by calling the
    GiveUniqueNodeNames transformation) prior to calling this analysis pass
    to ensure all nodes are visible in the results.

    Returns {node name : resources_dict}."""
    res_dict = {}
    # identical message for both "no codegen dir" and "no report file" cases
    missing_report_warning = """Could not find report files, values will be set to zero for this node. Please run "PrepareIP" transformation and "HLSSynthIP" first to generate the report files"""
    for node in model.graph.node:
        if is_fpgadataflow_node(node) is True:
            # init values to zero
            res_dict[node.name] = {
                "BRAM_18K": 0,
                "FF": 0,
                "LUT": 0,
                "DSP48E": 0,
                "URAM": 0,
            }
            # use the registry helper instead of indexing registry.custom_op
            # directly, consistent with the other analysis passes
            inst = registry.getCustomOp(node)
            code_gen_dir = inst.get_nodeattr("code_gen_dir_ipgen")
            if code_gen_dir == "":
                warnings.warn(missing_report_warning)
            else:
                xmlfile = "{}/project_{}/sol1/syn/report/{}_csynth.xml".format(
                    code_gen_dir, node.name, node.name
                )
                if os.path.isfile(xmlfile):
                    tree = ET.parse(xmlfile)
                    root = tree.getroot()
                    # copy each reported resource figure into the result dict
                    for item in root.findall("AreaEstimates/Resources"):
                        for child in item:
                            res_dict[node.name][child.tag] = child.text
                else:
                    warnings.warn(missing_report_warning)
    return res_dict
def apply(self, model):
    """Delete all generated artifacts for the model: the PYNQ project, the
    IP stitching project, and each node's cppsim/ipgen directories.

    Also clears the corresponding metadata props and node attributes so the
    model no longer references deleted paths."""
    # delete PYNQ project, if any
    vivado_pynq_proj_dir = model.get_metadata_prop("vivado_pynq_proj")
    if vivado_pynq_proj_dir is not None and os.path.isdir(
            vivado_pynq_proj_dir):
        shutil.rmtree(vivado_pynq_proj_dir)
    model.set_metadata_prop("vivado_pynq_proj", "")
    # delete IP stitching project, if any
    ipstitch_path = model.get_metadata_prop("vivado_stitch_proj")
    if ipstitch_path is not None and os.path.isdir(ipstitch_path):
        shutil.rmtree(ipstitch_path)
    model.set_metadata_prop("vivado_stitch_proj", "")
    for node in model.graph.node:
        op_type = node.op_type
        if is_fpgadataflow_node(node) is True:
            try:
                # use the registry helper instead of indexing
                # registry.custom_op directly, consistent with the other
                # transformations in this file
                inst = registry.getCustomOp(node)
                # delete code_gen_dir from cppsim
                code_gen_dir = inst.get_nodeattr("code_gen_dir_cppsim")
                if os.path.isdir(code_gen_dir):
                    shutil.rmtree(code_gen_dir)
                inst.set_nodeattr("code_gen_dir_cppsim", "")
                inst.set_nodeattr("executable_path", "")
                # delete code_gen_dir from ipgen and project folder
                code_gen_dir = inst.get_nodeattr("code_gen_dir_ipgen")
                ipgen_path = inst.get_nodeattr("ipgen_path")
                if os.path.isdir(code_gen_dir):
                    shutil.rmtree(code_gen_dir)
                if os.path.isdir(ipgen_path):
                    shutil.rmtree(ipgen_path)
                inst.set_nodeattr("code_gen_dir_ipgen", "")
                inst.set_nodeattr("ipgen_path", "")
                # delete Java HotSpot Performance data log
                # (NOTE: runs once per node; redundant but harmless)
                for d_name in os.listdir("/tmp/"):
                    if "hsperfdata" in d_name:
                        shutil.rmtree("/tmp/" + str(d_name))
            except KeyError:
                # exception if op_type is not supported
                raise Exception(
                    "Custom op_type %s is currently not supported." % op_type)
    return (model, False)
def apply(self, model):
    """Set the exec_mode attribute on every fpgadataflow node to self.mode.

    Raises an Exception when a node's op_type has no CustomOp registry
    entry."""
    for node in model.graph.node:
        op_type = node.op_type
        if is_fpgadataflow_node(node) is not True:
            continue
        try:
            node_inst = registry.getCustomOp(node)
            # propagate the requested simulation mode to the node
            node_inst.set_nodeattr("exec_mode", self.mode)
            # verify the attribute actually took effect
            assert node_inst.get_nodeattr("exec_mode") != "", """Transformation was not successful. Node attribute "exec_mode" is not set"""
        except KeyError:
            # op_type has no registry entry
            raise Exception(
                "Custom op_type %s is currently not supported." % op_type)
    return (model, False)
def res_estimation_complete(model):
    """Estimates the resources needed for the given model and all values
    for resource-related switches.

    Ensure that all nodes have unique names (by calling the
    GiveUniqueNodeNames transformation) prior to calling this analysis pass
    to ensure all nodes are visible in the results.

    Returns {node name : [resource estimation(s)]}."""
    res_dict = {}
    for node in model.graph.node:
        if is_fpgadataflow_node(node) is not True:
            continue
        op_type = node.op_type
        inst = registry.getCustomOp(node)
        if op_type in ("StreamingFCLayer_Batch",
                       "Vector_Vector_Activate_Batch"):
            # sweep resType over dsp/lut, then restore the original
            orig_restype = inst.get_nodeattr("resType")
            res_dict[node.name] = []
            for restype in ["dsp", "lut"]:
                inst.set_nodeattr("resType", restype)
                res_dict[node.name].append(inst.node_res_estimation())
            inst.set_nodeattr("resType", orig_restype)
        elif op_type == "ConvolutionInputGenerator":
            # sweep ram_style over all options, then restore the original
            orig_ramstyle = inst.get_nodeattr("ram_style")
            res_dict[node.name] = []
            for ramstyle in ["block", "distributed", "ultra"]:
                inst.set_nodeattr("ram_style", ramstyle)
                res_dict[node.name].append(inst.node_res_estimation())
            inst.set_nodeattr("ram_style", orig_ramstyle)
        else:
            # no resource switches: single estimate
            res_dict[node.name] = [inst.node_res_estimation()]
    return res_dict
def copy_ip(model, copy_dir, src_dir):
    """Copy the folders containing the model's IP from src_dir into
    copy_dir and update the model/node attributes accordingly.

    If a stitched project is present, it is copied to copy_dir as well."""
    stitch_proj_dir = model.get_metadata_prop("vivado_stitch_proj")
    wrapper_file = model.get_metadata_prop("wrapper_filename")
    if stitch_proj_dir is not None:
        relocated_proj = change_dir_path(stitch_proj_dir, src_dir, copy_dir)
        relocated_wrapper = change_dir_path(wrapper_file, src_dir, copy_dir)
        shutil.copytree(stitch_proj_dir, relocated_proj)
        model.set_metadata_prop("vivado_stitch_proj", relocated_proj)
        model.set_metadata_prop("wrapper_filename", relocated_wrapper)
    # Make copies of the IP attached to each node from src_dir to copy_dir
    for node in model.graph.node:
        if not is_fpgadataflow_node(node):
            continue
        node_instance = getCustomOp(node)
        # code_gen_dir_ipgen: Vivado HLS project + scripts that generated it
        old_codegen_dir = node_instance.get_nodeattr("code_gen_dir_ipgen")
        # ipgen_path: Vivado HLS project containing the generated IP core
        old_ipgen_dir = node_instance.get_nodeattr("ipgen_path")
        # ip_path: location of the component.xml describing the IP core
        old_ip_dir = node_instance.get_nodeattr("ip_path")
        new_codegen_dir = change_dir_path(old_codegen_dir, src_dir, copy_dir)
        new_ipgen_dir = change_dir_path(old_ipgen_dir, src_dir, copy_dir)
        new_ip_dir = change_dir_path(old_ip_dir, src_dir, copy_dir)
        # only the codegen dir needs copying; the others live inside it
        shutil.copytree(old_codegen_dir, new_codegen_dir)
        node_instance.set_nodeattr("code_gen_dir_ipgen", new_codegen_dir)
        node_instance.set_nodeattr("ipgen_path", new_ipgen_dir)
        node_instance.set_nodeattr("ip_path", new_ip_dir)
    return model
def apply(self, model):
    """Run single-node code generation for every fpgadataflow node,
    targeting self.fpgapart at clock period self.clk."""
    for graph_node in model.graph.node:
        if is_fpgadataflow_node(graph_node) is not True:
            continue
        _codegen_single_node(graph_node, model, self.fpgapart, self.clk)
    return (model, False)
def apply(self, model):
    """Create a stitched Vivado IP from all per-node IPs in the model.

    Builds a tcl script that creates a Vivado project, instantiates and
    connects all node IPs in a block design, optionally synthesizes to a
    DCP (Vitis mode), packages the block design as an IP core, and then
    runs Vivado in batch mode via a generated shell script.
    """
    # ensure non-relative readmemh .dat files
    model = model.transform(ReplaceVerilogRelPaths())
    # "list" is the tcl list-constructor keyword for the ip_repo_paths
    # expression built below
    ip_dirs = ["list"]
    # add RTL streamer IP
    ip_dirs.append("/workspace/finn/finn-rtllib/memstream")
    if model.graph.node[0].op_type not in ["StreamingFIFO", "IODMA"]:
        warnings.warn("""First node is not StreamingFIFO or IODMA. You may experience incorrect stitched-IP rtlsim or hardware behavior. It is strongly recommended to insert FIFOs prior to calling CreateStitchedIP.""")
    # ensure that all nodes are fpgadataflow, and that IPs are generated
    for node in model.graph.node:
        assert is_fpgadataflow_node(
            node), "All nodes must be FINN fpgadataflow nodes."
        node_inst = getCustomOp(node)
        ip_dir_value = node_inst.get_nodeattr("ip_path")
        assert os.path.isdir(
            ip_dir_value), "IP generation directory doesn't exist."
        ip_dirs += [ip_dir_value]
        # collect block-design instantiation commands for this node
        self.create_cmds += node_inst.code_generation_ipi()
        self.connect_clk_rst(node)
        self.connect_axi(node)
        # wire each input either to the external interface or to its
        # producer's matching output stream
        for i in range(len(node.input)):
            if is_external_input(model, node, i):
                self.connect_s_axis_external(node, idx=i)
            else:
                producer = model.find_producer(node.input[i])
                if producer is None:
                    continue
                # j = producer output index feeding this input
                j = list(producer.output).index(node.input[i])
                src_intf_name = getCustomOp(
                    producer).get_verilog_top_module_intf_names(
                    )["m_axis"][j][0]
                dst_intf_name = node_inst.get_verilog_top_module_intf_names(
                    )["s_axis"][i][0]
                self.connect_cmds.append(
                    "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
                    "[get_bd_intf_pins %s/%s]"
                    % (producer.name, src_intf_name, node.name,
                       dst_intf_name))
        for i in range(len(node.output)):
            if is_external_output(model, node, i):
                self.connect_m_axis_external(node, idx=i)
    # create a temporary folder for the project
    prjname = "finn_vivado_stitch_proj"
    vivado_stitch_proj_dir = make_build_dir(prefix="vivado_stitch_proj_")
    model.set_metadata_prop("vivado_stitch_proj", vivado_stitch_proj_dir)
    # start building the tcl script
    tcl = []
    # create vivado project
    tcl.append("create_project %s %s -part %s" % (prjname,
               vivado_stitch_proj_dir, self.fpgapart))
    # add all the generated IP dirs to ip_repo_paths
    ip_dirs_str = " ".join(ip_dirs)
    tcl.append("set_property ip_repo_paths [%s] [current_project]" %
               ip_dirs_str)
    tcl.append("update_ip_catalog")
    # create block design and instantiate all layers
    block_name = self.ip_name
    tcl.append('create_bd_design "%s"' % block_name)
    tcl.extend(self.create_cmds)
    tcl.extend(self.connect_cmds)
    # derive clock frequency (Hz) from the requested period in ns
    fclk_mhz = 1 / (self.clk_ns * 0.001)
    fclk_hz = fclk_mhz * 1000000
    model.set_metadata_prop("clk_ns", str(self.clk_ns))
    tcl.append("set_property CONFIG.FREQ_HZ %f [get_bd_ports /ap_clk]" %
               fclk_hz)
    tcl.append("regenerate_bd_layout")
    tcl.append("validate_bd_design")
    tcl.append("save_bd_design")
    # create wrapper hdl (for rtlsim later on)
    bd_base = "%s/%s.srcs/sources_1/bd/%s" % (
        vivado_stitch_proj_dir,
        prjname,
        block_name,
    )
    bd_filename = "%s/%s.bd" % (bd_base, block_name)
    tcl.append("make_wrapper -files [get_files %s] -top" % bd_filename)
    wrapper_filename = "%s/hdl/%s_wrapper.v" % (bd_base, block_name)
    tcl.append("add_files -norecurse %s" % wrapper_filename)
    model.set_metadata_prop("wrapper_filename", wrapper_filename)
    # synthesize to DCP and export stub, DCP and constraints
    if self.vitis:
        tcl.append(
            "set_property SYNTH_CHECKPOINT_MODE Hierarchical [ get_files %s ]"
            % bd_filename)
        tcl.append(
            "set_property -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} "
            "-value {-mode out_of_context} -objects [get_runs synth_1]")
        num_workers = get_num_default_workers()
        assert num_workers >= 0, "Number of workers must be nonnegative."
        if num_workers == 0:
            # 0 means "use all available cores"
            num_workers = mp.cpu_count()
        tcl.append("launch_runs synth_1 -jobs %s" % str(num_workers))
        tcl.append("wait_on_run [get_runs synth_1]")
        tcl.append("open_run synth_1 -name synth_1")
        tcl.append("write_verilog -force -mode synth_stub %s.v" % block_name)
        tcl.append("write_checkpoint %s.dcp" % block_name)
        tcl.append("write_xdc %s.xdc" % block_name)
        tcl.append("report_utilization -file %s_partition_util.rpt" %
                   block_name)
    # export block design itself as an IP core
    block_vendor = "xilinx_finn"
    block_library = "finn"
    block_vlnv = "%s:%s:%s:1.0" % (block_vendor, block_library, block_name)
    model.set_metadata_prop("vivado_stitch_vlnv", block_vlnv)
    model.set_metadata_prop("vivado_stitch_ifnames",
                            json.dumps(self.intf_names))
    tcl.append(
        ("ipx::package_project -root_dir %s/ip -vendor %s "
         "-library %s -taxonomy /UserIP -module %s -import_files")
        % (vivado_stitch_proj_dir, block_vendor, block_library, block_name))
    tcl.append("set_property core_revision 2 [ipx::find_open_core %s]" %
               block_vlnv)
    tcl.append("ipx::create_xgui_files [ipx::find_open_core %s]" %
               block_vlnv)
    # mark bus interface params as user-resolvable to avoid FREQ_MHZ mismatches
    tcl.append(
        "set_property value_resolve_type user [ipx::get_bus_parameters "
        "-of [ipx::get_bus_interfaces -of [ipx::current_core ]]]")
    # if targeting Vitis, add some properties to the IP
    if self.vitis:
        # replace source code with dcp
        tcl.append(
            "set_property sdx_kernel true [ipx::find_open_core %s]" %
            block_vlnv)
        tcl.append(
            "set_property sdx_kernel_type rtl [ipx::find_open_core %s]" %
            block_vlnv)
        tcl.append(
            "set_property supported_families { } [ipx::find_open_core %s]" %
            block_vlnv)
        tcl.append(
            "set_property xpm_libraries {XPM_CDC XPM_MEMORY XPM_FIFO} "
            "[ipx::find_open_core %s]" % block_vlnv)
        tcl.append("set_property auto_family_support_level level_2 "
                   "[ipx::find_open_core %s]" % block_vlnv)
        # remove all files from synthesis and sim groups
        # we'll replace with DCP, stub, and xdc
        tcl.append(
            "ipx::remove_all_file "
            "[ipx::get_file_groups xilinx_anylanguagebehavioralsimulation]"
        )
        tcl.append("ipx::remove_all_file "
                   "[ipx::get_file_groups xilinx_anylanguagesynthesis]")
        tcl.append(
            "ipx::remove_file_group "
            "xilinx_anylanguagebehavioralsimulation [ipx::current_core]")
        tcl.append("ipx::remove_file_group "
                   "xilinx_anylanguagesynthesis [ipx::current_core]")
        # remove sim and src folders
        tcl.append("file delete -force %s/ip/sim" % vivado_stitch_proj_dir)
        tcl.append("file delete -force %s/ip/src" % vivado_stitch_proj_dir)
        # copy and add DCP, stub, and xdc
        tcl.append("file mkdir %s/ip/dcp" % vivado_stitch_proj_dir)
        tcl.append("file mkdir %s/ip/impl" % vivado_stitch_proj_dir)
        tcl.append("file copy -force %s.dcp %s/ip/dcp" %
                   (block_name, vivado_stitch_proj_dir))
        tcl.append("file copy -force %s.xdc %s/ip/impl" %
                   (block_name, vivado_stitch_proj_dir))
        tcl.append(
            "ipx::add_file_group xilinx_implementation [ipx::current_core]"
        )
        tcl.append(
            "ipx::add_file impl/%s.xdc [ipx::get_file_groups xilinx_implementation]"
            % block_name)
        tcl.append(
            "set_property used_in [list implementation] "
            "[ipx::get_files impl/%s.xdc "
            "-of_objects [ipx::get_file_groups xilinx_implementation]]"
            % block_name)
        tcl.append("ipx::add_file_group "
                   "xilinx_synthesischeckpoint [ipx::current_core]")
        tcl.append("ipx::add_file dcp/%s.dcp "
                   "[ipx::get_file_groups xilinx_synthesischeckpoint]"
                   % block_name)
        tcl.append(
            "ipx::add_file_group xilinx_simulationcheckpoint [ipx::current_core]"
        )
        tcl.append("ipx::add_file dcp/%s.dcp "
                   "[ipx::get_file_groups xilinx_simulationcheckpoint]"
                   % block_name)
    tcl.append("ipx::update_checksums [ipx::find_open_core %s]" % block_vlnv)
    tcl.append("ipx::save_core [ipx::find_open_core %s]" % block_vlnv)
    # export list of used Verilog files (for rtlsim later on)
    tcl.append(
        "set all_v_files [get_files -filter {FILE_TYPE == Verilog " +
        "&& USED_IN_SYNTHESIS == 1} ]")
    v_file_list = "%s/all_verilog_srcs.txt" % vivado_stitch_proj_dir
    tcl.append("set fp [open %s w]" % v_file_list)
    # write each verilog filename to all_verilog_srcs.txt
    tcl.append("foreach vf $all_v_files {puts $fp $vf}")
    tcl.append("close $fp")
    # write the project creator tcl script
    tcl_string = "\n".join(tcl) + "\n"
    with open(vivado_stitch_proj_dir + "/make_project.tcl", "w") as f:
        f.write(tcl_string)
    # create a shell script and call Vivado
    make_project_sh = vivado_stitch_proj_dir + "/make_project.sh"
    working_dir = os.environ["PWD"]
    with open(make_project_sh, "w") as f:
        f.write("#!/bin/bash \n")
        f.write("cd {}\n".format(vivado_stitch_proj_dir))
        f.write("vivado -mode batch -source make_project.tcl\n")
        f.write("cd {}\n".format(working_dir))
    bash_command = ["bash", make_project_sh]
    process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
    process_compile.communicate()
    return (model, False)
def apply(self, model):
    """Set PE/SIMD folding factors on all dataflow nodes to try to reach
    self.target_cycles_per_frame, then optionally run a relaxed second
    pass targeting the achieved bottleneck cycle count.
    """
    graph = model.graph
    # these ops use PE parallelism, up to a max value of NumChannels
    pe_ops = [
        "AddStreams_Batch",
        "ChannelwiseOp_Batch",
        "DuplicateStreams_Batch",
        "GlobalAccPool_Batch",
        "Thresholding_Batch",
    ]
    # these ops use SIMD parallelism, up to a max value of NumChannels
    # ConvolutionInputGenerator has a special case when depthwise=1
    simd_ops = [
        "DownSampler", "FMPadding_Batch", "ConvolutionInputGenerator"
    ]
    # these ops are preceded by depthwise SWG and have special behavior,
    # as explained in the SetFolding docstring
    depthwise_op_exceptions = [
        "Vector_Vector_Activate_Batch", "Pool_Batch"
    ]
    for node in graph.node:
        if not is_fpgadataflow_node(node):
            continue
        op_type = node.op_type
        node_inst = getCustomOp(node)
        if op_type == "StreamingFCLayer_Batch":
            # MVAU: SIMD bounded by MW (input), PE bounded by MH (output)
            max_simd = node_inst.get_nodeattr("MW")
            max_pe = node_inst.get_nodeattr("MH")
            node_inst.set_nodeattr("PE", 1)
            node_inst.set_nodeattr("SIMD", 1)
            # increase SIMD until either we meet
            # the target or weight stream becomes
            # too wide
            for simd_val in divisors(max_simd):
                prev_simd_val = node_inst.get_nodeattr("SIMD")
                node_inst.set_nodeattr("SIMD", simd_val)
                cyc = node_inst.get_exp_cycles()
                if cyc < self.target_cycles_per_frame:
                    # finish if target met
                    break
                if (node_inst.get_weight_datatype().bitwidth() *
                        node_inst.get_nodeattr("SIMD") >
                        self.mvau_wwidth_max):
                    # revert if we've gone above width threshold
                    node_inst.set_nodeattr("SIMD", prev_simd_val)
                    break
            # increase PE until target met or reached max_pe
            self.optimize_attribute_val(node_inst, max_pe, "PE")
        elif op_type in pe_ops:
            max_pe = node_inst.get_nodeattr("NumChannels")
            self.optimize_attribute_val(node_inst, max_pe, "PE")
        elif op_type == "LabelSelect_Batch":
            # LabelSelect parallelizes over labels, not channels
            max_pe = node_inst.get_nodeattr("Labels")
            self.optimize_attribute_val(node_inst, max_pe, "PE")
        elif op_type in depthwise_op_exceptions:
            max_pe = node_inst.get_nodeattr("Channels")
            self.optimize_attribute_val(node_inst, max_pe, "PE")
            # also set the folding of the upsteam DW SWU
            # which must be identical to this node
            swu_node = model.find_producer(node.input[0])
            if swu_node.op_type == "ConvolutionInputGenerator":
                swu_node_inst = getCustomOp(swu_node)
                pe = node_inst.get_nodeattr("PE")
                swu_node_inst.set_nodeattr("SIMD", pe)
            else:
                raise Exception("Expected SWU on DW op input, found " +
                                swu_node.op_type)
        elif op_type in simd_ops:
            if op_type == "ConvolutionInputGenerator":
                depthwise = node_inst.get_nodeattr("depthwise")
                if depthwise == 0:
                    max_simd = node_inst.get_nodeattr("IFMChannels")
                    self.optimize_attribute_val(node_inst, max_simd, "SIMD")
                else:
                    # depthwise SWGs are handled separately
                    continue
            else:
                max_simd = node_inst.get_nodeattr("NumChannels")
                self.optimize_attribute_val(node_inst, max_simd, "SIMD")
        else:
            warnings.warn(
                "SetFolding doesn't know how to handle op_type " + op_type)
    # re-annotate cycles so the second-pass analysis sees the new folding
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(AnnotateCycles())
    if self.two_pass_relaxation:
        perf_dict = model.analysis(dataflow_performance)
        if perf_dict["max_cycles"] > self.target_cycles_per_frame:
            # run again, but with lower target (that we managed) -- this
            # may be coming from a single node's constraints, but we want
            # to balance the entire dataflow pipeline instead
            # no two_pass_relaxation this time -- no guarantee we'll
            # converge otherwise
            warnings.warn(
                "Node %s is bottleneck with %d cycles, running second pass"
                % (perf_dict["max_cycles_node_name"],
                   perf_dict["max_cycles"]))
            model = model.transform(
                SetFolding(
                    target_cycles_per_frame=perf_dict["max_cycles"],
                    mvau_wwidth_max=self.mvau_wwidth_max,
                    two_pass_relaxation=False,
                ))
    return (model, False)
def res_estimation_complete(model, multivariant=True):
    """Estimates the resources needed for the given model and all values
    for resource-related switches.

    Ensure that all nodes have unique names (by calling the
    GiveUniqueNodeNames transformation) prior to calling this analysis pass
    to ensure all nodes are visible in the results.

    Returns {node name : [{config: {}, estimate: resource estimation(s)}]}."""
    res_dict = {}

    def _default_estimate(inst):
        # single estimate with the node's current configuration
        return [{"config": {}, "estimate": inst.node_res_estimation()}]

    for node in model.graph.node:
        if is_fpgadataflow_node(node) is not True:
            continue
        op_type = node.op_type
        inst = getCustomOp(node)
        if not multivariant:
            res_dict[node.name] = _default_estimate(inst)
            continue
        if op_type in ("StreamingFCLayer_Batch",
                       "Vector_Vector_Activate_Batch"):
            # sweep resType over dsp/lut, restoring the original afterwards
            orig_restype = inst.get_nodeattr("resType")
            res_dict[node.name] = []
            for restype in ["dsp", "lut"]:
                inst.set_nodeattr("resType", restype)
                res_dict[node.name].append({
                    "config": {"resType": restype},
                    "estimate": inst.node_res_estimation(),
                })
            inst.set_nodeattr("resType", orig_restype)
        elif op_type == "ConvolutionInputGenerator":
            # sweep ram_style, restoring the original afterwards
            orig_ramstyle = inst.get_nodeattr("ram_style")
            res_dict[node.name] = []
            for restype in ["block", "distributed", "ultra"]:
                inst.set_nodeattr("ram_style", restype)
                res_dict[node.name].append({
                    "config": {"ram_style": restype},
                    "estimate": inst.node_res_estimation(),
                })
            inst.set_nodeattr("ram_style", orig_ramstyle)
        elif op_type == "StreamingFIFO":
            # sweep ram_style with impl_style forced to "vivado",
            # restoring both originals afterwards
            orig_ramstyle = inst.get_nodeattr("ram_style")
            orig_impl_style = inst.get_nodeattr("impl_style")
            res_dict[node.name] = []
            inst.set_nodeattr("impl_style", "vivado")
            for restype in ["block", "distributed", "ultra"]:
                inst.set_nodeattr("ram_style", restype)
                res_dict[node.name].append({
                    "config": {"impl_style": "vivado", "ram_style": restype},
                    "estimate": inst.node_res_estimation(),
                })
            inst.set_nodeattr("ram_style", orig_ramstyle)
            inst.set_nodeattr("impl_style", orig_impl_style)
        else:
            res_dict[node.name] = _default_estimate(inst)
    return res_dict
def apply(self, model):
    """Size FIFOs by observing occupancy during stitched-IP RTL simulation.

    Sets all FIFO depths to self.max_depth, runs a one-image rtlsim while
    tracking per-FIFO peak occupancy, then writes optimized depths back and
    disables further FIFO insertion on all other nodes.
    """
    # change external to decoupled and warn user
    # this way we are sure we have exactly one input/output
    modified_fc_nodes = []
    for node in model.graph.node:
        # verify assumptions
        assert is_fpgadataflow_node(
            node), "Found non-fpgadataflow node: " + str(node)
        assert node.op_type != "StreamingFIFO", "Found existing StreamingFIFO node"
        node = getCustomOp(node)
        # start with worst-case FIFO depths everywhere
        node.set_nodeattr("inFIFODepth", self.max_depth)
        node.set_nodeattr("outFIFODepth", self.max_depth)
        if node.onnx_node.op_type == "StreamingFCLayer_Batch":
            mmode = node.get_nodeattr("mem_mode")
            if mmode == "external":
                # remember so we can restore external mode afterwards
                modified_fc_nodes.append(node.onnx_node.name)
                node.set_nodeattr("mem_mode", "decoupled")
                reset_implementation(node)
                warnings.warn(
                    "Changed mem_mode from external to decoupled for " +
                    node.onnx_node.name)
    # insert stream infrastructure (DWC/FIFO)
    model = model.transform(InsertDWC())
    model = model.transform(InsertFIFO())
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(GiveReadableTensorNames())
    # gather FIFO names, check they are of expected depth
    fifos = {}
    for node in model.graph.node:
        if node.op_type == "StreamingFIFO":
            # fifos maps FIFO name -> peak observed occupancy (starts at 0)
            fifos[node.name] = 0
            node = getCustomOp(node)
            # check depths and fix as necessary
            if node.get_nodeattr("depth") != self.max_depth:
                node.set_nodeattr("depth", self.max_depth)
    # insert FIFOs and do all transformations for RTLsim
    model = model.transform(AnnotateCycles())
    perf = model.analysis(dataflow_performance)
    latency = perf["critical_path_cycles"]
    max_cycles = perf["max_cycles"]
    model = model.transform(PrepareIP(self.fpgapart, self.clk_ns))
    model = model.transform(HLSSynthIP())
    model = model.transform(CreateStitchedIP(self.fpgapart, self.clk_ns))
    model.set_metadata_prop("exec_mode", "rtlsim")
    # calculate input frequency (number of cycles for each input word)
    first_node = getCustomOp(model.graph.node[0])
    ncycles_per_input = max(
        1,
        int(
            math.ceil(perf["max_cycles"] /
                      (np.prod(first_node.get_folded_input_shape()) /
                       first_node.get_folded_input_shape()[-1]))),
    )
    # set sufficiently large threshold for 1 image to fully execute and exit
    ncycles = int(latency + max_cycles)
    # prepare pyverilator model
    sim = pyverilate_stitched_ip(model)
    reset_rtlsim(sim)
    toggle_clk(sim)
    # set all input valids to 0 and output readies to 1
    # set input data to some constant
    set_signal(sim, "tvalid", 0)
    set_signal(sim, "tready", 1)
    set_signal(sim, "tdata", 0)
    output_detected = False
    while ncycles > 0:
        toggle_clk(sim)
        # set/unset valids
        if ncycles % ncycles_per_input == 0:
            set_signal(sim, "tvalid", 1)
        else:
            set_signal(sim, "tvalid", 0)
        # check/update all fifo counts
        for key in fifos:
            current_state = sim.internals["finn_design_i"][key]["inst"][
                key + "_" + key]["state"]
            current_addr = sim.internals["finn_design_i"][key]["inst"][
                key + "_" + key]["addr"]
            # state 2 means the FIFO is using addr as fill count offset;
            # otherwise the state value itself encodes the occupancy
            if current_state == 2:
                current_count = current_addr + 2
            else:
                current_count = current_state
            if current_count > fifos[key]:
                fifos[key] = current_count
        # since latency estimation is very pessimistic, detect first output
        # and fast-forward the sim
        if get_signal(sim, "tvalid") != 0 and not output_detected:
            ncycles = max_cycles
            output_detected = True
        else:
            ncycles = ncycles - 1
    if not output_detected:
        warnings.warn(
            "No output detected, calculated FIFO depths may not be correct"
        )
    # Apply depths back into the model;
    # also set in/outFIFODepth to zero for non-FIFO
    # nodes, preventing further FIFO insertion
    for node in model.graph.node:
        # set FIFO depth, reset FIFO implementation,
        # and set implementation/ram styles
        if node.op_type == "StreamingFIFO":
            assert node.name in fifos, "FIFO node not found in size dictionary"
            # set depth of FIFO
            depth = optimize_depth(fifos[node.name])
            node_inst = getCustomOp(node)
            node_inst.set_nodeattr("depth", depth)
            # Set FIFO implementation/ram styles
            if depth > self.max_qsrl_depth:
                node_inst.set_nodeattr("impl_style", "vivado")
                node_inst.set_nodeattr("ram_style", self.vivado_ram_style)
            else:
                node_inst.set_nodeattr("impl_style", "rtl")
            # reset implementation
            reset_implementation(node_inst)
            del fifos[node.name]
        else:
            getCustomOp(node).set_nodeattr("inFIFODepth", 0)
            getCustomOp(node).set_nodeattr("outFIFODepth", 0)
            # for every FC node we changed from external to decoupled,
            # change back and reset implementation
            if node.op_type == "StreamingFCLayer_Batch":
                if node.name in modified_fc_nodes:
                    node_inst = getCustomOp(node)
                    node_inst.set_nodeattr("mem_mode", "external")
                    reset_implementation(node_inst)
                    modified_fc_nodes.remove(node.name)
    # both bookkeeping collections must be fully consumed by now
    assert (len(modified_fc_nodes) == 0 and len(fifos.keys()) == 0
            ), "FIFO/FC nodes left untouched after model reconfiguration"
    # handle custom sizing for SWG FIFOs if desired
    if self.swg_exception:
        model = model.transform(
            CapConvolutionFIFODepths(max_qsrl_depth=self.max_qsrl_depth))
    # remove shallow FIFOs
    model = model.transform(RemoveShallowFIFOs())
    return (model, False)
def prepareCppSim_node(self, node):
    """Generate cppsim code for *node* if it is an fpgadataflow node."""
    if is_fpgadataflow_node(node) is not True:
        return (node, False)
    _codegen_single_node(node, self.model)
    return (node, False)