def test_fast_vs_slow_random(idt, ishape):
    """Verify that fast_mode packing produces the same bytes as the
    reference (slow) implementation for a random tensor of the given
    FINN datatype and shape."""
    sample = gen_finn_dt_tensor(idt, ishape)
    common_kwargs = dict(reverse_endian=True, reverse_inner=True)
    reference = finnpy_to_packed_bytearray(
        sample, idt, fast_mode=False, **common_kwargs
    )
    candidate = finnpy_to_packed_bytearray(
        sample, idt, fast_mode=True, **common_kwargs
    )
    assert (candidate == reference).all()
def fpga_single_run(self, input):
    """Execute one inference on the FPGA accelerator.

    Quantizes the input on the host via multithresholding, packs it into
    the device buffer, streams it through the DMA, then unpacks the
    result and re-applies the host-side scale/offset.

    NOTE(review): parameter name shadows the builtin `input`; kept
    unchanged since callers may pass it by keyword.
    """
    input = input.reshape(self.ishape_normal)
    # host-side quantization step (multithreshold with stored thresholds)
    input = MT.multithreshold(input, self.mt_node_thresholds)
    assert input.shape == self.ishape_normal
    ibuf_folded = input.reshape(self.ishape_folded)
    # pack the input buffer, reversing both SIMD dim and endianness
    ibuf_packed = finnpy_to_packed_bytearray(ibuf_folded, self.idt, reverse_endian=True, reverse_inner=True)
    # copy the packed data into the PYNQ buffer
    # TODO optimization: pack directly into the PYNQ buffer?
    np.copyto(self.ibuf_packed_device, ibuf_packed)
    # set up the DMA and wait until all transfers complete
    # (send must be queued before waiting; recv is queued so the
    # accelerator's output has somewhere to land)
    self.dma.sendchannel.transfer(self.ibuf_packed_device)
    self.dma.recvchannel.transfer(self.obuf_packed)
    self.dma.sendchannel.wait()
    self.dma.recvchannel.wait()
    # unpack the packed output buffer from accelerator
    obuf_folded = packed_bytearray_to_finnpy(self.obuf_packed, self.odt, self.oshape_folded, reverse_endian=True, reverse_inner=True)
    obuf_normal = obuf_folded.reshape(self.oshape_normal)
    # undo streamlining: re-apply the Mul constant and Add matrix that
    # were absorbed out of the graph
    obuf_normal = obuf_normal * self.multiply_node_const
    obuf_normal = obuf_normal + self.add_node_mat
    return obuf_normal
def pack_input(self, ibuf_folded):
    """Packs folded input and reverses both SIMD dim and endianness.

    Gets input data in folded shape and returns packed input data."""
    return finnpy_to_packed_bytearray(
        ibuf_folded,
        self.idt,
        reverse_endian=True,
        reverse_inner=True,
    )
def test_finnpy_to_packed_bytearray():
    """Check finnpy_to_packed_bytearray against hand-computed packed
    bytes for several FINN datatypes (binary, unsigned, signed, fixed
    point and float)."""
    cases = [
        ([[1, 1, 1, 0], [0, 1, 1, 0]], "BINARY", [[14], [6]]),
        (
            [[[3, 3], [3, 3]], [[1, 3], [3, 1]]],
            "UINT2",
            [[[15], [15]], [[7], [13]]],
        ),
        ([1, 7, 2, 5], "UINT4", [23, 37]),
        ([[1, 7, 2, 5], [2, 5, 1, 7]], "UINT4", [[23, 37], [37, 23]]),
        (
            [[-4, 0, -4, -4]],
            "INT32",
            [[255, 255, 255, 252, 0, 0, 0, 0,
              255, 255, 255, 252, 255, 255, 255, 252]],
        ),
        (
            [[17.125, -2.0], [-3.5, 11.25]],
            "FIXED<9,6>",
            [[1, 19, 240], [3, 200, 90]],
        ),
        (
            [[17.125, -2.0], [-3.5, 11.25]],
            "FLOAT32",
            [[65, 137, 0, 0, 192, 0, 0, 0], [192, 96, 0, 0, 65, 52, 0, 0]],
        ),
    ]
    for value, dt_name, expected in cases:
        expected_arr = np.asarray(expected, dtype=np.uint8)
        packed = finnpy_to_packed_bytearray(value, DataType[dt_name])
        assert (packed == expected_arr).all()
def test_finnpy_to_packed_bytearray():
    """Check finnpy_to_packed_bytearray against hand-computed packed
    bytes for binary, unsigned and signed integer datatypes."""
    cases = [
        ([[1, 1, 1, 0], [0, 1, 1, 0]], DataType.BINARY, [[14], [6]]),
        (
            [[[3, 3], [3, 3]], [[1, 3], [3, 1]]],
            DataType.UINT2,
            [[[15], [15]], [[7], [13]]],
        ),
        ([1, 7, 2, 5], DataType.UINT4, [23, 37]),
        ([[1, 7, 2, 5], [2, 5, 1, 7]], DataType.UINT4, [[23, 37], [37, 23]]),
        (
            [[-4, 0, -4, -4]],
            DataType.INT32,
            [[255, 255, 255, 252, 0, 0, 0, 0,
              255, 255, 255, 252, 255, 255, 255, 252]],
        ),
    ]
    for value, dt, expected in cases:
        expected_arr = np.asarray(expected, dtype=np.uint8)
        assert (finnpy_to_packed_bytearray(value, dt) == expected_arr).all()
def apply(self, model):
    """Generate a PYNQ Python driver for a (possibly multi-input /
    multi-output) dataflow-partitioned model.

    Copies the shared driver_base.py template, gathers normal / folded /
    packed shape and datatype info per graph input and output, saves
    external-weight .npy files, fills in the driver template, copies
    validate.py plus the finn util/core dependencies, and writes
    runtime-writable weight files. Records the output folder in the
    'pynq_driver_dir' metadata property. Returns (model, False), i.e.
    the graph itself is not modified.
    """
    # create a temporary folder for the generated driver
    pynq_driver_dir = make_build_dir(prefix="pynq_driver_")
    model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir)
    # create the base FINN driver -- same for all accels
    driver_base_template = pk.resource_filename(
        "finn.qnn-data", "templates/driver/driver_base.py"
    )
    driver_base_py = pynq_driver_dir + "/driver_base.py"
    shutil.copy(driver_base_template, driver_base_py)
    # extract input-output shapes from the graph
    # TODO convert this to an analysis pass?
    idt = []
    idma_names = []
    ishape_normal = []
    ishape_folded = []
    ishape_packed = []
    for idma_ind, graph_in in enumerate(model.graph.input):
        i_tensor_name = graph_in.name
        # get inp tensor properties
        i_tensor_dt = model.get_tensor_datatype(i_tensor_name)
        i_tensor_shape_normal = tuple(model.get_tensor_shape(i_tensor_name))
        # go down into dataflow partition to get folded shape info etc
        # TODO consider setting these as attributes during dataflow partitioning
        i_consumer = model.find_consumer(i_tensor_name)
        assert (
            i_consumer.op_type == "StreamingDataflowPartition"
        ), """ Ensure CreateDataflowPartition called before driver creation."""
        first_df_model = ModelWrapper(getCustomOp(i_consumer).get_nodeattr("model"))
        assert (
            first_df_model.graph.node[0].op_type == "IODMA"
        ), "First partition must hold input IODMA"
        # folded shape comes from the first compute node in the partition
        # AFTER the IODMA one, so follow the SDP successor and find which
        # of its inputs this IODMA feeds
        successors = model.find_direct_successors(i_consumer)
        successor_input_num = list(successors[0].input).index(i_consumer.output[0])
        successor_sdp = getCustomOp(successors[0])
        successor_df_model = ModelWrapper(successor_sdp.get_nodeattr("model"))
        first_node = successor_df_model.find_consumer(
            successor_df_model.graph.input[successor_input_num].name
        )
        i_tensor_shape_folded = tuple(
            getCustomOp(first_node).get_folded_input_shape()
        )
        # generate dummy folded i/o tensors and their packed versions
        # (packing a dummy tensor is the simplest way to learn the
        # packed shape)
        i_tensor_dummy_folded = gen_finn_dt_tensor(
            i_tensor_dt, i_tensor_shape_folded
        )
        i_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray(
            i_tensor_dummy_folded,
            i_tensor_dt
        )
        i_tensor_shape_packed = i_tensor_dummy_packed.shape
        # append all input tensor info to relevant lists
        # (datatype is stored as a source-code string to be pasted into
        # the driver template)
        idt.append("DataType['%s']" % i_tensor_dt.name)
        ishape_normal.append(i_tensor_shape_normal)
        ishape_folded.append(i_tensor_shape_folded)
        ishape_packed.append(i_tensor_shape_packed)
        idma_names.append(getCustomOp(i_consumer).get_nodeattr("instance_name"))
    odt = []
    odma_names = []
    oshape_normal = []
    oshape_folded = []
    oshape_packed = []
    for odma_ind, graph_out in enumerate(model.graph.output):
        o_tensor_name = graph_out.name
        # get inp tensor properties
        o_tensor_dt = model.get_tensor_datatype(o_tensor_name)
        o_tensor_shape_normal = tuple(model.get_tensor_shape(o_tensor_name))
        # go down into IODMA partition to get folded shape info etc
        # TODO consider setting these as attributes during dataflow partitioning
        o_producer = model.find_producer(o_tensor_name)
        assert (
            o_producer.op_type == "StreamingDataflowPartition"
        ), """ Ensure CreateDataflowPartition called before driver creation."""
        df_model = ModelWrapper(getCustomOp(o_producer).get_nodeattr("model"))
        assert (
            df_model.graph.node[-1].op_type == "IODMA"
        ), "Partition must hold output IODMA"
        # mirror of the input handling: follow the SDP predecessor to
        # find the last compute node feeding this output IODMA
        predecessors = model.find_direct_predecessors(o_producer)
        predecessor_output_num = list(predecessors[0].output).index(
            o_producer.input[0]
        )
        predecessor_sdp = getCustomOp(predecessors[0])
        predecessor_df_model = ModelWrapper(predecessor_sdp.get_nodeattr("model"))
        last_node = predecessor_df_model.find_producer(
            predecessor_df_model.graph.output[predecessor_output_num].name
        )
        o_tensor_shape_folded = tuple(
            getCustomOp(last_node).get_folded_output_shape()
        )
        o_tensor_dummy_folded = gen_finn_dt_tensor(
            o_tensor_dt, o_tensor_shape_folded
        )
        o_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray(
            o_tensor_dummy_folded, o_tensor_dt
        )
        o_tensor_shape_packed = o_tensor_dummy_packed.shape
        # append all output tensor info to relevant lists
        odt.append("DataType['%s']" % o_tensor_dt.name)
        oshape_normal.append(o_tensor_shape_normal)
        oshape_folded.append(o_tensor_shape_folded)
        oshape_packed.append(o_tensor_shape_packed)
        odma_names.append(getCustomOp(o_producer).get_nodeattr("instance_name"))
    # generate external weights npy files
    weights_dir = pynq_driver_dir + "/runtime_weights"
    os.makedirs(weights_dir)
    idma_idx = 0
    ext_weight_dma_cnt = 0
    for node in model.graph.node:
        assert (
            node.op_type == "StreamingDataflowPartition"
        ), "CreateDataflowPartition needs to be applied before driver generation"
        if len(node.input) > 0:
            producer = model.find_producer(node.input[0])
            init_tensor = model.get_initializer(node.input[0])
        else:
            producer = None
            init_tensor = None
        if producer is None:
            # input dma?
            sdp_inst = getCustomOp(node)
            idma_name = sdp_inst.get_nodeattr("instance_name")
            df_model = ModelWrapper(sdp_inst.get_nodeattr("model"))
            assert df_model.graph.node[0].op_type == "IODMA"
            iodma_node = getCustomOp(df_model.graph.node[0])
            if iodma_node.get_nodeattr("burstMode") == "wrap":
                # input weights dma?
                # wrap burst mode marks a weight-streaming IODMA: save
                # its initializer as an external-weight .npy
                init_tensor = df_model.get_initializer(
                    iodma_node.onnx_node.input[0]
                )
                ext_weight_dma_cnt += 1
                w_dtype = df_model.get_tensor_datatype(
                    iodma_node.onnx_node.input[0]
                )
                init_external_tensor = to_external_tensor(init_tensor, w_dtype)
                np.save(
                    weights_dir + "/" + idma_name + ".npy", init_external_tensor
                )
            idma_idx += 1
    # fill in the driver template
    # (quotes stripped from the datatype lists so they paste in as code)
    driver_py = pynq_driver_dir + "/driver.py"
    driver = template_driver.pynq_driver_template
    driver = driver.replace("$PLATFORM$", self.platform)
    driver = driver.replace("$INPUT_FINN_DATATYPE$", str(idt).replace('"', ""))
    driver = driver.replace("$INPUT_SHAPE_NORMAL$", str(ishape_normal))
    driver = driver.replace("$INPUT_SHAPE_FOLDED$", str(ishape_folded))
    driver = driver.replace("$INPUT_SHAPE_PACKED$", str(ishape_packed))
    driver = driver.replace("$OUTPUT_FINN_DATATYPE$", str(odt).replace('"', ""))
    driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", str(oshape_normal))
    driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", str(oshape_folded))
    driver = driver.replace("$OUTPUT_SHAPE_PACKED$", str(oshape_packed))
    driver = driver.replace("$INPUT_DMA_NAME$", "%s" % str(idma_names))
    driver = driver.replace("$OUTPUT_DMA_NAME$", "%s" % str(odma_names))
    driver = driver.replace("$NUM_INPUTS$", str(len(idma_names)))
    driver = driver.replace("$NUM_OUTPUTS$", str(len(odma_names)))
    driver = driver.replace("$EXT_WEIGHT_NUM$", str(ext_weight_dma_cnt))
    with open(driver_py, "w") as f:
        f.write(driver)
    # add validate.py to run full top-1 test (only for suitable networks)
    validate_py = pynq_driver_dir + "/validate.py"
    validate_template = pk.resource_filename(
        "finn.qnn-data", "templates/driver/validate.py"
    )
    shutil.copy(validate_template, validate_py)
    # copy all the dependencies into the driver folder
    # driver imports utils/data_packing and core/datatype
    # both of which are in finn-base
    # e.g. /workspace/finn-base/src/finn/util/data_packing.py
    dpk_root = dpk.__file__
    # e.g. /workspace/finn-base/src/finn/util
    dpk_root = dpk_root.replace("data_packing.py", "")
    # e.g. /workspace/finn-base/src/finn/core/datatype.py
    dtp_root = dtp.__file__
    # e.g. /workspace/finn-base/src/finn/core
    dtp_root = dtp_root.replace("datatype.py", "")
    shutil.copytree(dpk_root, pynq_driver_dir + "/finn/util")
    shutil.copytree(dtp_root, pynq_driver_dir + "/finn/core")
    # generate weight files for runtime-writable layers
    for sdp_ind, sdp_node in enumerate(model.graph.node):
        assert sdp_node.op_type == "StreamingDataflowPartition"
        # get dataflow model
        sdp_node = getCustomOp(sdp_node)
        dataflow_model_filename = sdp_node.get_nodeattr("model")
        dataflow_model = ModelWrapper(dataflow_model_filename)
        rt_layer_ind = 0
        for node in dataflow_model.graph.node:
            if node.op_type in ["StreamingFCLayer_Batch", "Thresholding_Batch"]:
                node_inst = getCustomOp(node)
                is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights")
                if is_rt_weights == 1:
                    fcl_w = dataflow_model.get_initializer(node.input[1])
                    # filename encodes partition index, layer index and
                    # node name so files are unique across partitions
                    w_filename = weights_dir + "/%d_%d_%s.dat" % (
                        sdp_ind,
                        rt_layer_ind,
                        node.name,
                    )
                    node_inst.make_weight_file(
                        fcl_w, "decoupled_runtime", w_filename
                    )
                    rt_layer_ind += 1
            elif node.op_type == "StreamingDataflowPartition":
                warnings.warn(
                    """Nested StreamingDataflowPartition are not supported """
                )
            else:
                continue
    return (model, False)
def apply(self, model):
    """Generate a PYNQ Python driver for a single-input/single-output
    model whose PYNQ project already exists.

    Raises Exception when the 'vivado_pynq_proj' metadata property is
    missing or does not point at an existing directory (i.e.
    MakePYNQProject was not run first). Writes driver.py into a fresh
    build dir, recorded as 'pynq_driver_dir' metadata, and copies the
    finn util/core sources next to it. Returns (model, False).
    """
    vivado_pynq_proj = model.get_metadata_prop("vivado_pynq_proj")
    if vivado_pynq_proj is None or (not os.path.isdir(vivado_pynq_proj)):
        raise Exception("No PYNQ project found, apply MakePYNQProject first.")
    # create a temporary folder for the generated driver
    pynq_driver_dir = make_build_dir(prefix="pynq_driver_")
    model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir)
    # extract input-output shapes from the graph
    # TODO convert this to an analysis pass
    i_tensor_name = model.graph.input[0].name
    o_tensor_name = model.graph.output[0].name
    i_tensor_shape_normal = tuple(model.get_tensor_shape(i_tensor_name))
    o_tensor_shape_normal = tuple(model.get_tensor_shape(o_tensor_name))
    i_tensor_dt = model.get_tensor_datatype(i_tensor_name)
    o_tensor_dt = model.get_tensor_datatype(o_tensor_name)
    # extract HLSCustomOp instances to get folded i/o shapes
    first_node = getCustomOp(model.find_consumer(i_tensor_name))
    last_node = getCustomOp(model.find_producer(o_tensor_name))
    i_tensor_shape_folded = tuple(first_node.get_folded_input_shape())
    o_tensor_shape_folded = tuple(last_node.get_folded_output_shape())
    # generate dummy folded i/o tensors and their packed versions
    # (packing a dummy tensor is the simplest way to learn the packed shape)
    i_tensor_dummy_folded = gen_finn_dt_tensor(i_tensor_dt, i_tensor_shape_folded)
    o_tensor_dummy_folded = gen_finn_dt_tensor(o_tensor_dt, o_tensor_shape_folded)
    i_tensor_dummy_packed = finnpy_to_packed_bytearray(
        i_tensor_dummy_folded, i_tensor_dt
    )
    o_tensor_dummy_packed = finnpy_to_packed_bytearray(
        o_tensor_dummy_folded, o_tensor_dt
    )
    i_tensor_shape_packed = i_tensor_dummy_packed.shape
    o_tensor_shape_packed = o_tensor_dummy_packed.shape
    # fill in the driver template
    driver_py = pynq_driver_dir + "/driver.py"
    driver = templates.pynq_driver_template

    def mss(x, batch_var_name="N"):
        # "make shape string"
        # for a shape like (1, ...) emit a string (N, ...)
        # where N is the default value for batch_var_name
        # this lets the driver work with a batch of samples at once
        ret = str(x)
        ret = ret.replace("(1,", "(%s," % batch_var_name)
        ret = ret.replace("[1,", "[%s," % batch_var_name)
        return ret

    driver = driver.replace("$INPUT_FINN_DATATYPE$", str(i_tensor_dt))
    driver = driver.replace("$INPUT_SHAPE_NORMAL$", mss(i_tensor_shape_normal))
    driver = driver.replace("$INPUT_SHAPE_FOLDED$", mss(i_tensor_shape_folded))
    driver = driver.replace("$INPUT_SHAPE_PACKED$", mss(i_tensor_shape_packed))
    driver = driver.replace("$OUTPUT_FINN_DATATYPE$", str(o_tensor_dt))
    driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", mss(o_tensor_shape_normal))
    driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", mss(o_tensor_shape_folded))
    driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed))
    with open(driver_py, "w") as f:
        f.write(driver)
    # copy all the dependencies into the driver folder
    shutil.copytree(
        get_finn_root() + "/src/finn/util", pynq_driver_dir + "/finn/util"
    )
    shutil.copytree(
        get_finn_root() + "/src/finn/core", pynq_driver_dir + "/finn/core"
    )
    return (model, False)
def apply(self, model):
    """Generate a PYNQ Python driver for a single-input/single-output
    model, deriving folded shapes directly from the normal shapes.

    Writes driver.py and validate.py into a fresh build dir (recorded
    as 'pynq_driver_dir' metadata) and copies the finn util/core
    sources next to it. Clock frequency for the driver is taken from
    the 'clk_ns' metadata property (default 10 ns / 100 MHz).
    Returns (model, False).
    """
    # create a temporary folder for the generated driver
    pynq_driver_dir = make_build_dir(prefix="pynq_driver_")
    model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir)
    # extract input-output shapes from the graph
    # TODO convert this to an analysis pass
    i_tensor_name = model.graph.input[0].name
    o_tensor_name = model.graph.output[0].name
    i_tensor_shape_normal = tuple(model.get_tensor_shape(i_tensor_name))
    o_tensor_shape_normal = tuple(model.get_tensor_shape(o_tensor_name))
    i_tensor_dt = model.get_tensor_datatype(i_tensor_name)
    o_tensor_dt = model.get_tensor_datatype(o_tensor_name)
    # folded shapes for i/o simply derived from regular tensor shapes
    # this used to be extracted from first/last node folded shapes, but
    # can't do this anymore due to IODMAs
    # insert(-1, 1) adds a folding dim of 1 just before the last axis
    i_tensor_shape_folded = list(i_tensor_shape_normal)
    i_tensor_shape_folded.insert(-1, 1)
    i_tensor_shape_folded = tuple(i_tensor_shape_folded)
    o_tensor_shape_folded = list(o_tensor_shape_normal)
    o_tensor_shape_folded.insert(-1, 1)
    o_tensor_shape_folded = tuple(o_tensor_shape_folded)
    # generate dummy folded i/o tensors and their packed versions
    # (packing a dummy tensor is the simplest way to learn the packed shape)
    i_tensor_dummy_folded = gen_finn_dt_tensor(i_tensor_dt, i_tensor_shape_folded)
    o_tensor_dummy_folded = gen_finn_dt_tensor(o_tensor_dt, o_tensor_shape_folded)
    i_tensor_dummy_packed = finnpy_to_packed_bytearray(
        i_tensor_dummy_folded, i_tensor_dt)
    o_tensor_dummy_packed = finnpy_to_packed_bytearray(
        o_tensor_dummy_folded, o_tensor_dt)
    i_tensor_shape_packed = i_tensor_dummy_packed.shape
    o_tensor_shape_packed = o_tensor_dummy_packed.shape
    # fill in the driver template
    driver_py = pynq_driver_dir + "/driver.py"
    driver = templates.pynq_driver_template

    def mss(x, batch_var_name="N"):
        # "make shape string"
        # for a shape like (1, ...) emit a string (N, ...)
        # where N is the default value for batch_var_name
        # this lets the driver work with a batch of samples at once
        ret = str(x)
        ret = ret.replace("(1,", "(%s," % batch_var_name)
        ret = ret.replace("[1,", "[%s," % batch_var_name)
        return ret

    driver = driver.replace("$PLATFORM$", self.platform)
    driver = driver.replace("$INPUT_FINN_DATATYPE$", str(i_tensor_dt))
    driver = driver.replace("$INPUT_SHAPE_NORMAL$", mss(i_tensor_shape_normal))
    driver = driver.replace("$INPUT_SHAPE_FOLDED$", mss(i_tensor_shape_folded))
    driver = driver.replace("$INPUT_SHAPE_PACKED$", mss(i_tensor_shape_packed))
    driver = driver.replace("$OUTPUT_FINN_DATATYPE$", str(o_tensor_dt))
    driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", mss(o_tensor_shape_normal))
    driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", mss(o_tensor_shape_folded))
    driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed))
    # clock settings for driver
    clk_ns = model.get_metadata_prop("clk_ns")
    # default to 10ns / 100 MHz if property not set
    if clk_ns is None:
        clk_ns = 10.0
    else:
        clk_ns = float(clk_ns)
    fclk_mhz = 1 / (clk_ns * 0.001)
    # TODO change according to PYNQ board?
    driver = driver.replace("$CLK_NAME$", "fclk0_mhz")
    driver = driver.replace("$CLOCK_FREQ_MHZ$", str(fclk_mhz))
    with open(driver_py, "w") as f:
        f.write(driver)
    # add validate.py to run full top-1 test (only for suitable networks)
    validate_py = pynq_driver_dir + "/validate.py"
    validate_src = templates.pynq_validation_template
    with open(validate_py, "w") as f:
        f.write(validate_src)
    # copy all the dependencies into the driver folder
    shutil.copytree(get_finn_root() + "/src/finn/util",
                    pynq_driver_dir + "/finn/util")
    shutil.copytree(get_finn_root() + "/src/finn/core",
                    pynq_driver_dir + "/finn/core")
    return (model, False)
def apply(self, model):
    """Generate a PYNQ Python driver for a single-input/single-output
    model, supporting both dataflow-partitioned and legacy graphs.

    Copies driver_base.py, resolves folded i/o shapes (descending into
    StreamingDataflowPartition/IODMA nodes when present), saves
    external-weight .npy files, fills the driver template, copies
    validate.py plus the finn util/core dependencies, and writes
    runtime-writable weight files per partition. Records the output
    folder as 'pynq_driver_dir' metadata. Returns (model, False).
    """
    # create a temporary folder for the generated driver
    pynq_driver_dir = make_build_dir(prefix="pynq_driver_")
    model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir)
    # create the base FINN driver -- same for all accels
    driver_base_template = pk.resource_filename(
        "finn.qnn-data", "templates/driver/driver_base.py")
    driver_base_py = pynq_driver_dir + "/driver_base.py"
    shutil.copy(driver_base_template, driver_base_py)
    # extract input-output shapes from the graph
    # TODO convert this to an analysis pass?
    i_tensor_name = model.graph.input[0].name
    o_tensor_name = model.graph.output[0].name
    i_tensor_shape_normal = tuple(model.get_tensor_shape(i_tensor_name))
    o_tensor_shape_normal = tuple(model.get_tensor_shape(o_tensor_name))
    i_tensor_dt = model.get_tensor_datatype(i_tensor_name)
    o_tensor_dt = model.get_tensor_datatype(o_tensor_name)
    first_node = model.find_consumer(i_tensor_name)
    last_node = model.find_producer(o_tensor_name)
    if first_node.op_type == "StreamingDataflowPartition":
        # IODMAs and dataflow partitions have already been created
        # extract folded i/o shapes from IODMA consumer/producer
        first_df_model = ModelWrapper(
            getCustomOp(first_node).get_nodeattr("model"))
        assert (first_df_model.graph.node[0].op_type == "IODMA"
                ), "First partition must hold input IODMA"
        successors = model.find_direct_successors(first_node)
        successor_sdp = getCustomOp(successors[0])
        successor_df_model = ModelWrapper(
            successor_sdp.get_nodeattr("model"))
        first_node = successor_df_model.find_consumer(
            successor_df_model.graph.input[0].name)
        last_df_model = ModelWrapper(
            getCustomOp(last_node).get_nodeattr("model"))
        # NOTE(review): this checks node[0] while the message says
        # "Last partition" — presumably the output partition holds only
        # the IODMA, so node[0] == node[-1]; verify against the
        # partitioning pass
        assert (last_df_model.graph.node[0].op_type == "IODMA"
                ), "Last partition must hold output IODMA"
        predecessors = model.find_direct_predecessors(last_node)
        predecessor_sdp = getCustomOp(predecessors[0])
        predecessor_df_model = ModelWrapper(
            predecessor_sdp.get_nodeattr("model"))
        last_node = predecessor_df_model.find_producer(
            predecessor_df_model.graph.output[0].name)
    # else: transformation called before IODMA/SDP creation (legacy flow)
    # can access folded i/o shapes directly
    i_tensor_shape_folded = tuple(
        getCustomOp(first_node).get_folded_input_shape())
    o_tensor_shape_folded = tuple(
        getCustomOp(last_node).get_folded_output_shape())
    # generate dummy folded i/o tensors and their packed versions
    # (packing a dummy tensor is the simplest way to learn the packed shape)
    i_tensor_dummy_folded = gen_finn_dt_tensor(i_tensor_dt,
                                               i_tensor_shape_folded)
    o_tensor_dummy_folded = gen_finn_dt_tensor(o_tensor_dt,
                                               o_tensor_shape_folded)
    i_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray(
        i_tensor_dummy_folded, i_tensor_dt)
    o_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray(
        o_tensor_dummy_folded, o_tensor_dt)
    i_tensor_shape_packed = i_tensor_dummy_packed.shape
    o_tensor_shape_packed = o_tensor_dummy_packed.shape
    # generate external weights npy files
    weights_dir = pynq_driver_dir + "/runtime_weights"
    os.makedirs(weights_dir)
    idma_idx = 0
    ext_weight_dma_cnt = 0
    for node in model.graph.node:
        assert (
            node.op_type == "StreamingDataflowPartition"
        ), "CreateDataflowPartition needs to be applied before driver generation"
        producer = model.find_producer(node.input[0])
        init_tensor = model.get_initializer(node.input[0])
        if producer is None:
            # input dma?
            idma_name = "idma" + str(idma_idx)
            if init_tensor is not None:
                # input weights dma?
                ext_weight_dma_cnt += 1
                w_dtype = model.get_tensor_datatype(node.input[0])
                init_external_tensor = to_external_tensor(
                    init_tensor, w_dtype)
                np.save(weights_dir + "/" + idma_name + ".npy",
                        init_external_tensor)
            else:
                # NOTE(review): net_input_name is only bound when a
                # non-weight input DMA exists; the $INPUT_DMA_NAME$
                # substitution below would raise NameError otherwise —
                # confirm every model has a regular input DMA
                net_input_name = idma_name
            idma_idx += 1
    # fill in the driver template
    driver_py = pynq_driver_dir + "/driver.py"
    driver = template_driver.pynq_driver_template

    def mss(x, batch_var_name="1"):
        # "make shape string"
        # for a shape like (1, ...) emit a string (N, ...)
        # where N is the default value for batch_var_name
        # this lets the driver work with a batch of samples at once
        # NOTE(review): with the default "1", replacing "(1," with
        # "(1," is a no-op — presumably intentional here (fixed batch
        # of 1); confirm against the driver template
        ret = str(x)
        ret = ret.replace("(1,", "(%s," % batch_var_name)
        ret = ret.replace("[1,", "[%s," % batch_var_name)
        return ret

    driver = driver.replace("$PLATFORM$", self.platform)
    driver = driver.replace("$INPUT_FINN_DATATYPE$", str(i_tensor_dt))
    driver = driver.replace("$INPUT_SHAPE_NORMAL$", mss(i_tensor_shape_normal))
    driver = driver.replace("$INPUT_SHAPE_FOLDED$", mss(i_tensor_shape_folded))
    driver = driver.replace("$INPUT_SHAPE_PACKED$", mss(i_tensor_shape_packed))
    driver = driver.replace("$OUTPUT_FINN_DATATYPE$", str(o_tensor_dt))
    driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", mss(o_tensor_shape_normal))
    driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", mss(o_tensor_shape_folded))
    driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed))
    driver = driver.replace("$INPUT_DMA_NAME$", "'%s'" % net_input_name)
    driver = driver.replace("$EXT_WEIGHT_NUM$", str(ext_weight_dma_cnt))
    with open(driver_py, "w") as f:
        f.write(driver)
    # add validate.py to run full top-1 test (only for suitable networks)
    validate_py = pynq_driver_dir + "/validate.py"
    validate_template = pk.resource_filename(
        "finn.qnn-data", "templates/driver/validate.py")
    shutil.copy(validate_template, validate_py)
    # copy all the dependencies into the driver folder
    # driver imports utils/data_packing and core/datatype
    # both of which are in finn-base
    # e.g. /workspace/finn-base/src/finn/util/data_packing.py
    dpk_root = dpk.__file__
    # e.g. /workspace/finn-base/src/finn/util
    dpk_root = dpk_root.replace("data_packing.py", "")
    # e.g. /workspace/finn-base/src/finn/core/datatype.py
    dtp_root = dtp.__file__
    # e.g. /workspace/finn-base/src/finn/core
    dtp_root = dtp_root.replace("datatype.py", "")
    shutil.copytree(dpk_root, pynq_driver_dir + "/finn/util")
    shutil.copytree(dtp_root, pynq_driver_dir + "/finn/core")
    # generate weight files for runtime-writable layers
    for sdp_ind, sdp_node in enumerate(model.graph.node):
        assert sdp_node.op_type == "StreamingDataflowPartition"
        # get dataflow model
        sdp_node = getCustomOp(sdp_node)
        dataflow_model_filename = sdp_node.get_nodeattr("model")
        dataflow_model = ModelWrapper(dataflow_model_filename)
        rt_layer_ind = 0
        for node in dataflow_model.graph.node:
            if node.op_type in [
                    "StreamingFCLayer_Batch", "Thresholding_Batch"
            ]:
                node_inst = getCustomOp(node)
                is_rt_weights = node_inst.get_nodeattr(
                    "runtime_writeable_weights")
                if is_rt_weights == 1:
                    fcl_w = dataflow_model.get_initializer(node.input[1])
                    # filename encodes partition index, layer index and
                    # node name so files are unique across partitions
                    w_filename = weights_dir + "/%d_%d_%s.dat" % (
                        sdp_ind,
                        rt_layer_ind,
                        node.name,
                    )
                    node_inst.make_weight_file(fcl_w, "decoupled_runtime",
                                               w_filename)
                    rt_layer_ind += 1
            elif node.op_type == "StreamingDataflowPartition":
                warnings.warn(
                    """Nested StreamingDataflowPartition are not supported """)
            else:
                continue
    return (model, False)
def apply(self, model):
    """Generate a PYNQ Python driver for a single-input/single-output
    model, deriving folded shapes directly from the normal shapes.

    Copies driver_base.py and validate.py templates, fills the driver
    template, copies the finn util/core dependencies, and writes
    runtime-writable weight files for HLSCustomOp layers found directly
    in the graph (must run BEFORE CreateDataflowPartition — see the
    warning below). Records the output folder as 'pynq_driver_dir'
    metadata. Returns (model, False).
    """
    # create a temporary folder for the generated driver
    pynq_driver_dir = make_build_dir(prefix="pynq_driver_")
    model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir)
    # create the base FINN driver -- same for all accels
    driver_base_template = pk.resource_filename(
        "finn.qnn-data", "templates/driver/driver_base.py")
    driver_base_py = pynq_driver_dir + "/driver_base.py"
    shutil.copy(driver_base_template, driver_base_py)
    # extract input-output shapes from the graph
    # TODO convert this to an analysis pass?
    i_tensor_name = model.graph.input[0].name
    o_tensor_name = model.graph.output[0].name
    i_tensor_shape_normal = tuple(model.get_tensor_shape(i_tensor_name))
    o_tensor_shape_normal = tuple(model.get_tensor_shape(o_tensor_name))
    i_tensor_dt = model.get_tensor_datatype(i_tensor_name)
    o_tensor_dt = model.get_tensor_datatype(o_tensor_name)
    # folded shapes for i/o simply derived from regular tensor shapes
    # this used to be extracted from first/last node folded shapes, but
    # can't do this anymore due to IODMAs
    # insert(-1, 1) adds a folding dim of 1 just before the last axis
    i_tensor_shape_folded = list(i_tensor_shape_normal)
    i_tensor_shape_folded.insert(-1, 1)
    i_tensor_shape_folded = tuple(i_tensor_shape_folded)
    o_tensor_shape_folded = list(o_tensor_shape_normal)
    o_tensor_shape_folded.insert(-1, 1)
    o_tensor_shape_folded = tuple(o_tensor_shape_folded)
    # generate dummy folded i/o tensors and their packed versions
    # (packing a dummy tensor is the simplest way to learn the packed shape)
    i_tensor_dummy_folded = gen_finn_dt_tensor(i_tensor_dt,
                                               i_tensor_shape_folded)
    o_tensor_dummy_folded = gen_finn_dt_tensor(o_tensor_dt,
                                               o_tensor_shape_folded)
    i_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray(
        i_tensor_dummy_folded, i_tensor_dt)
    o_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray(
        o_tensor_dummy_folded, o_tensor_dt)
    i_tensor_shape_packed = i_tensor_dummy_packed.shape
    o_tensor_shape_packed = o_tensor_dummy_packed.shape
    # fill in the driver template
    driver_py = pynq_driver_dir + "/driver.py"
    driver = template_driver.pynq_driver_template

    def mss(x, batch_var_name="1"):
        # "make shape string"
        # for a shape like (1, ...) emit a string (N, ...)
        # where N is the default value for batch_var_name
        # this lets the driver work with a batch of samples at once
        # NOTE(review): with the default "1", replacing "(1," with
        # "(1," is a no-op — presumably intentional here (fixed batch
        # of 1); confirm against the driver template
        ret = str(x)
        ret = ret.replace("(1,", "(%s," % batch_var_name)
        ret = ret.replace("[1,", "[%s," % batch_var_name)
        return ret

    driver = driver.replace("$PLATFORM$", self.platform)
    driver = driver.replace("$INPUT_FINN_DATATYPE$", str(i_tensor_dt))
    driver = driver.replace("$INPUT_SHAPE_NORMAL$", mss(i_tensor_shape_normal))
    driver = driver.replace("$INPUT_SHAPE_FOLDED$", mss(i_tensor_shape_folded))
    driver = driver.replace("$INPUT_SHAPE_PACKED$", mss(i_tensor_shape_packed))
    driver = driver.replace("$OUTPUT_FINN_DATATYPE$", str(o_tensor_dt))
    driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", mss(o_tensor_shape_normal))
    driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", mss(o_tensor_shape_folded))
    driver = driver.replace("$OUTPUT_SHAPE_PACKED$", mss(o_tensor_shape_packed))
    with open(driver_py, "w") as f:
        f.write(driver)
    # add validate.py to run full top-1 test (only for suitable networks)
    validate_py = pynq_driver_dir + "/validate.py"
    validate_template = pk.resource_filename(
        "finn.qnn-data", "templates/driver/validate.py")
    shutil.copy(validate_template, validate_py)
    # copy all the dependencies into the driver folder
    # driver imports utils/data_packing and core/datatype
    # both of which are in finn-base
    # e.g. /workspace/finn-base/src/finn/util/data_packing.py
    dpk_root = dpk.__file__
    # e.g. /workspace/finn-base/src/finn/util
    dpk_root = dpk_root.replace("data_packing.py", "")
    # e.g. /workspace/finn-base/src/finn/core/datatype.py
    dtp_root = dtp.__file__
    # e.g. /workspace/finn-base/src/finn/core
    dtp_root = dtp_root.replace("datatype.py", "")
    shutil.copytree(dpk_root, pynq_driver_dir + "/finn/util")
    shutil.copytree(dtp_root, pynq_driver_dir + "/finn/core")
    # generate weight files for runtime-writable layers
    weights_dir = pynq_driver_dir + "/runtime_weights"
    rt_layer_ind = 0
    os.makedirs(weights_dir)
    for node in model.graph.node:
        if node.op_type in [
                "StreamingFCLayer_Batch", "Thresholding_Batch"
        ]:
            node_inst = getCustomOp(node)
            is_rt_weights = node_inst.get_nodeattr(
                "runtime_writeable_weights")
            if is_rt_weights == 1:
                fcl_w = model.get_initializer(node.input[1])
                # filename encodes layer index and node name
                w_filename = weights_dir + "/%d_%s.dat" % (rt_layer_ind,
                                                           node.name)
                node_inst.make_weight_file(fcl_w, "decoupled_runtime",
                                           w_filename)
                rt_layer_ind += 1
        elif node.op_type == "StreamingDataflowPartition":
            warnings.warn("""Please call MakePYNQDriver prior to
            CreateDataflowPartition. Can only extract runtime-writable
            weights from HLSCustomOp instances and not StreamingDataflowPartition.
            """)
        else:
            continue
    return (model, False)