def test_sort_linear_graph(num_of_nodes):
    model = make_randomly_sorted_linear_model(num_of_nodes, seed=0)
    new_model = model.transform(SortGraph())

    # Test
    ret = new_model.analysis(ta.nodes_topologically_sorted)
    assert ret["nodes_topologically_sorted"], "Nodes are not topologically sorted."
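# A minimal sketch (assumed, not the exact FINN helper) of what
# make_randomly_sorted_linear_model could look like: build a linear chain of
# Add nodes, then shuffle the node list so it is no longer topologically sorted.
def make_linear_model_sketch(num_of_nodes, seed=None):
    if seed is not None:
        np.random.seed(seed)
    shape = (1, 4)
    top_in = helper.make_tensor_value_info("t0", TensorProto.FLOAT, shape)
    top_out = helper.make_tensor_value_info(
        "t%d" % num_of_nodes, TensorProto.FLOAT, shape
    )
    value_info = [
        helper.make_tensor_value_info("p%d" % i, TensorProto.FLOAT, shape)
        for i in range(num_of_nodes)
    ]
    nodes = [
        helper.make_node("Add", ["t%d" % i, "p%d" % i], ["t%d" % (i + 1)])
        for i in range(num_of_nodes)
    ]
    np.random.shuffle(nodes)  # destroy the topological order on purpose
    graph = helper.make_graph(
        nodes, "linear", [top_in], [top_out], value_info=value_info
    )
    model = ModelWrapper(helper.make_model(graph))
    for i in range(num_of_nodes):
        model.set_initializer("p%d" % i, np.random.rand(*shape).astype(np.float32))
    return model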
def apply(self, model):
    graph = model.graph
    graph_modified = False
    for n in graph.node:
        if n.op_type in self.join_node_op and model.is_join_node(n):
            in0 = n.input[0]
            in1 = n.input[1]
            if in0 is None or in1 is None:
                continue

            prod0 = model.find_producer(in0)
            prod1 = model.find_producer(in1)
            # find_producer returns None for graph inputs/initializers;
            # moving past the join requires two real producer nodes
            if prod0 is None or prod1 is None:
                continue
            # Checks if the join node is preceded by
            # two different, but identical operations
            if prod0 == prod1:
                continue

            identical_op = prod0.op_type == prod1.op_type

            if identical_op and prod0.op_type in self.ops_to_move:
                self.move_node(model, n, prod0, prod1)
                graph_modified = True

    if graph_modified:
        model = model.transform(SortGraph(), make_deepcopy=False, cleanup=False)

    return (model, graph_modified)
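# Usage sketch: a concrete subclass supplies join_node_op, ops_to_move and
# move_node. For example, a subclass (name illustrative) that hoists two
# identical Transpose producers past an eltwise Add rewrites
#
#   x -> Transpose --\                 x --\
#                     Add    into          Add -> Transpose
#   y -> Transpose --/                 y --/
#
# model = model.transform(MoveTransposePastJoinAdd())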
def cleanup(self):
    "Run cleanup transformations on the model."
    transformed_model = self
    cleanup_transforms = [
        RemoveUnusedTensors(),
        RemoveStaticGraphInputs(),
        SortGraph(),
    ]
    for trn in cleanup_transforms:
        transformed_model = transformed_model.transform(
            trn, cleanup=False, make_deepcopy=False
        )
    return transformed_model
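# Usage sketch: cleanup() is chained like any other helper. The explicit
# cleanup=False / make_deepcopy=False flags above keep the cleanup passes
# themselves from recursing back into cleanup or copying the model.
# model = ModelWrapper("model.onnx")
# model = model.cleanup()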
def test_sort_nonlinear_graph():
    ch = 2
    ifmdim = 16
    input_shape = (1, ch, ifmdim, ifmdim)

    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape)

    num_of_params = 8
    value_info = []
    for i in range(num_of_params):
        value_info += [
            helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape)
        ]

    modelproto = helper.make_model(
        helper.make_graph(
            name="test",
            inputs=[top_in],
            outputs=[top_out],
            value_info=value_info,
            nodes=[
                # Not sorted nodes
                helper.make_node("Mul", ["fork1", "p2"], ["t3"]),
                helper.make_node("Add", ["t4", "p3"], ["t5"]),
                helper.make_node("Add", ["t2", "t3"], ["t4"]),
                helper.make_node("Add", ["t6", "t7"], ["t8"]),
                helper.make_node("Add", ["fork3", "fork3"], ["top_out"]),
                helper.make_node("Mul", ["t5", "p4"], ["fork2"]),
                helper.make_node("Add", ["top_in", "p0"], ["fork1"]),
                helper.make_node("Mul", ["fork1", "p1"], ["t2"]),
                helper.make_node("Add", ["fork2", "p5"], ["t6"]),
                helper.make_node("Add", ["fork2", "p6"], ["t7"]),
                helper.make_node("Mul", ["t8", "p7"], ["fork3"]),
            ],
        )
    )
    model = ModelWrapper(modelproto)
    model = model.transform(InferShapes())

    np.random.seed(0)
    for i in range(num_of_params):
        model.set_initializer(
            "p" + str(i), np.random.rand(*input_shape).astype(np.float32)
        )

    new_model = model.transform(SortGraph())

    # Test
    ret = new_model.analysis(ta.nodes_topologically_sorted)
    assert ret["nodes_topologically_sorted"], "Nodes are not topologically sorted."
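# For reference, one valid topological order for the graph above (derivable
# from the tensor names) is:
#   Add(top_in,p0)->fork1; Mul(fork1,p1)->t2; Mul(fork1,p2)->t3;
#   Add(t2,t3)->t4; Add(t4,p3)->t5; Mul(t5,p4)->fork2;
#   Add(fork2,p5)->t6; Add(fork2,p6)->t7; Add(t6,t7)->t8;
#   Mul(t8,p7)->fork3; Add(fork3,fork3)->top_out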
def step_resnet50_convert_to_hls(model: ModelWrapper, cfg: DataflowBuildConfig):
    model.set_tensor_datatype(model.graph.input[0].name, DataType["UINT8"])
    model = model.transform(InferDataLayouts())

    try:
        from finn.transformation.fpgadataflow.infer_doublepacked_dsp import (
            InferDoublePackedConv,
        )

        model = model.transform(InferDoublePackedConv([1]))
    except ImportError:
        print("FINN experimental not available. Using non-packed convolution.")

    model = model.transform(DoubleToSingleFloat())
    model = model.transform(InferDataTypes())
    model = model.transform(SortGraph())

    to_hls_transformations = [
        to_hls.InferAddStreamsLayer,
        LowerConvsToMatMul,
        to_hls.InferChannelwiseLinearLayer,
        to_hls.InferPool_Batch,
        AbsorbTransposeIntoMultiThreshold,
        RoundAndClipThresholds,
        to_hls.InferQuantizedStreamingFCLayer,
        to_hls.InferThresholdingLayer,
        AbsorbConsecutiveTransposes,
        to_hls.InferConvInpGen,
        to_hls.InferDuplicateStreamsLayer,
        to_hls.InferLabelSelectLayer,
    ]
    for trn in to_hls_transformations:
        model = model.transform(trn())
        model = model.transform(InferDataLayouts())
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(InferDataTypes())

    model = model.transform(RemoveCNVtoFCFlatten())
    model = model.transform(GiveReadableTensorNames())
    model = model.transform(RemoveUnusedTensors())
    model = model.transform(SortGraph())

    return model
def step_resnet50_streamline(model: ModelWrapper, cfg: DataflowBuildConfig):
    for iter_id in range(4):
        model = step_resnet50_streamline_linear(model, cfg)
        model = step_resnet50_streamline_nonlinear(model, cfg)

        # big loop tidy up
        model = model.transform(RemoveUnusedTensors())
        model = model.transform(GiveReadableTensorNames())
        model = model.transform(InferDataTypes())
        model = model.transform(SortGraph())
        model = model.transform(DoubleToSingleFloat())

    return model
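# Usage sketch (hedged): build steps like step_resnet50_streamline and
# step_resnet50_convert_to_hls above are normally handed to the FINN builder
# through the steps list of DataflowBuildConfig, interleaved with the default
# steps, e.g.:
# cfg = DataflowBuildConfig(
#     steps=[..., step_resnet50_streamline, step_resnet50_convert_to_hls, ...],
#     output_dir="build_resnet50",  # illustrative value
#     ...
# )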
def apply(self, model):
    # only makes sense for a pure fpgadataflow graph -- so we check!
    all_nodes = list(model.graph.node)
    assert all(
        get_by_name(x.attribute, "backend").s.decode("UTF-8") == "fpgadataflow"
        for x in all_nodes
    )
    # parse streamingfclayers looking for external weights with no attached IODMA
    fc_extw_nodes = list(
        filter(
            lambda x: x.op_type == "StreamingFCLayer_Batch"
            and getCustomOp(x).get_nodeattr("mem_mode") == "external"
            and model.find_producer(x.input[1]) is None,
            all_nodes,
        )
    )
    graph_in_name = model.graph.input[0].name
    first_node = model.find_consumer(graph_in_name)
    graph_out_name = model.graph.output[0].name
    final_node = model.find_producer(graph_out_name)
    if (
        final_node.op_type == "IODMA"
        and first_node.op_type == "IODMA"
        and len(fc_extw_nodes) == 0
    ):
        # TODO maybe check the correctness of properties
        return (model, False)
    else:
        if final_node.op_type != "IODMA":
            out_shape = model.get_tensor_shape(graph_out_name)
            out_dtype = model.get_tensor_datatype(graph_out_name)
            final_node_inst = getCustomOp(final_node)
            out_folded_shape = final_node_inst.get_folded_output_shape()
            # take advantage of AXI stream width padding for DMA alignment
            # (AXI streams are always padded to 8 bits)
            # this is the width of stream input to DMA
            padded_outstream_width = final_node_inst.get_outstream_width_padded()
            padded_outstream_bytes = padded_outstream_width // 8
            # determine the feasible interface width
            transfer_bits = padded_outstream_width * np.prod(out_folded_shape[:-1])
            intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
            assert intfwidth % 8 == 0, "No feasible interface width for transfer size"
            # make new buffer
            final_node_out = oh.make_tensor_value_info(
                model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
            )
            model.graph.value_info.append(final_node_out)
            model.set_tensor_datatype(final_node_out.name, out_dtype)
            # reroute final node output to final_node_out_name
            final_node.output[0] = final_node_out.name
            # FIXME: currently always using 8-bit dtypes to work around the
            # padding problems for i/o DMA
            dma_node = oh.make_node(
                "IODMA",
                [final_node_out.name],
                [graph_out_name],
                numInputVectors=out_folded_shape[:-1],
                NumChannels=padded_outstream_bytes,
                dataType="UINT8",
                intfWidth=intfwidth,
                streamWidth=padded_outstream_width,
                direction="out",
                domain="finn.custom_op.fpgadataflow",
                backend="fpgadataflow",
            )
            model.graph.node.append(dma_node)
        if first_node.op_type != "IODMA":
            in_shape = model.get_tensor_shape(graph_in_name)
            in_dtype = model.get_tensor_datatype(graph_in_name)
            first_node_inst = getCustomOp(first_node)
            in_folded_shape = first_node_inst.get_folded_input_shape()
            # take advantage of AXI stream width padding for DMA alignment
            # (AXI streams are always padded to 8 bits)
            # this is the width of stream output expected from the DMA
            padded_instream_width = first_node_inst.get_instream_width_padded()
            padded_instream_bytes = padded_instream_width // 8
            # determine the feasible interface width
            transfer_bits = padded_instream_width * np.prod(in_folded_shape[:-1])
            intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
            assert intfwidth % 8 == 0, "No feasible interface width for transfer size"
            # make new buffer
            first_node_in = oh.make_tensor_value_info(
                model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
            )
            model.graph.value_info.append(first_node_in)
            model.set_tensor_datatype(first_node_in.name, in_dtype)
            # reroute first node input
            # FIXME: currently always using 8-bit dtypes to work around the
            # padding problems for i/o DMA
            first_node.input[0] = first_node_in.name
            dma_node = oh.make_node(
                "IODMA",
                [graph_in_name],
                [first_node_in.name],
                numInputVectors=in_folded_shape[:-1],
                NumChannels=padded_instream_bytes,
                dataType="UINT8",
                intfWidth=intfwidth,
                streamWidth=padded_instream_width,
                direction="in",
                domain="finn.custom_op.fpgadataflow",
                backend="fpgadataflow",
            )
            model.graph.node.insert(0, dma_node)
        for fc_node in fc_extw_nodes:
            fc_inst = getCustomOp(fc_node)
            fc_w_name = fc_node.input[1]
            w_shape = model.get_tensor_shape(fc_w_name)
            w_dtype = model.get_tensor_datatype(fc_w_name)
            # determine the feasible interface width
            transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
            intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
            assert intfwidth % 8 == 0, "No feasible interface width for transfer size"
            # calculate width of stream output from DMA
            pe = get_by_name(fc_node.attribute, "PE").i
            simd = get_by_name(fc_node.attribute, "SIMD").i
            streamWidth = fc_inst.get_weightstream_width_padded()
            # make new buffer
            W = model.get_initializer(fc_w_name)
            iodma_mem = self.get_mem_init(W, pe, simd)
            model.set_initializer(fc_w_name, iodma_mem)

            fc_node_in = oh.make_tensor_value_info(
                model.make_new_valueinfo_name(), TensorProto.FLOAT, iodma_mem.shape
            )
            model.graph.value_info.append(fc_node_in)
            model.set_tensor_datatype(fc_node_in.name, w_dtype)
            model.set_initializer(fc_node_in.name, W)

            dma_node = oh.make_node(
                "IODMA",
                [fc_w_name],
                [fc_node_in.name],
                numInputVectors=[iodma_mem.shape[0]],
                NumChannels=pe * simd,
                dataType=str(w_dtype.name),
                intfWidth=intfwidth,
                streamWidth=streamWidth,
                direction="in",
                burstMode="wrap",
                domain="finn.custom_op.fpgadataflow",
                backend="fpgadataflow",
            )
            fc_node.input[1] = fc_node_in.name
            model.graph.node.insert(0, dma_node)
        model = model.transform(SortGraph())
        return (model, True)
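# Usage sketch: this pass runs on a pure fpgadataflow graph before stitching.
# The max_intfwidth constructor argument (assumed, matching self.max_intfwidth
# above) bounds the AXI interface width:
# model = model.transform(InsertIODMA(max_intfwidth=512))
# model = model.transform(GiveUniqueNodeNames())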
sizes = [10, 50, 100, 500, 1000]
times = []
reps = 10

print("SortGraph performance test:")
print("Test sizes", sizes)
print("Repetitions per size:", reps)

for sz in sizes:
    acc_time = 0
    print(" Testing size ", sz)
    for i in range(reps):
        # sorting should take the same time even on an already-sorted model,
        # but building a new model each time is the more general approach
        model = make_randomly_sorted_linear_model(sz)  # new model as seed is None
        bef = time.time()
        new_model = model.transform(SortGraph(), make_deepcopy=False)
        acc_time += time.time() - bef
    times += [acc_time / reps]

# print csv
print("\nnum_of_nodes, seconds")
for sz, tm in zip(sizes, times):
    print("{:12d}, {:6.4e}".format(sz, tm))

# plot
# import matplotlib.pyplot as plt
# plt.plot(sizes, times, "--o")
# plt.grid(True)
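# To actually render the scaling plot, the commented snippet above can be
# completed as follows (matplotlib assumed available):
# import matplotlib.pyplot as plt
# plt.plot(sizes, times, "--o")
# plt.xlabel("num_of_nodes")
# plt.ylabel("seconds")
# plt.grid(True)
# plt.show()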
def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt):
    model = make_model(ch, ifmdim)
    model.save(export_onnx_path)
    model = ModelWrapper(export_onnx_path)
    model = model.transform(InferShapes())
    model = model.transform(FoldConstants())
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(GiveReadableTensorNames())
    model = model.transform(InferDataLayouts())
    # model.save("golden.onnx")

    # generate test vectors of correct shape
    if ifmdim == -1:
        input_tensor_shape = (1, ch)
    else:
        input_tensor_shape = (1, ch, ifmdim, ifmdim)
    x = gen_finn_dt_tensor(idt, input_tensor_shape)

    # generate expected value from streamlined net
    input_dict = {model.graph.input[0].name: x}
    output_dict = oxe.execute_onnx(model, input_dict, True)
    produced_sum = output_dict[model.graph.output[0].name]
    chw_mul = 1  # channelwise multiplier is 1 for this model
    expected_sum = (
        chw_mul * np.sum(2 * (2 * x + 15.0), axis=(2, 3)) / (ifmdim * ifmdim)
    )
    assert (produced_sum.flatten() == expected_sum.flatten()).all()

    model = model.transform(InferDataLayouts())

    # convert to hls
    model.set_tensor_datatype(model.graph.input[0].name, idt)
    # extra streamlining
    model = model.transform(MoveScalarLinearPastInvariants())
    model = model.transform(MoveAddPastMul())
    model = model.transform(CollapseRepeatedMul())
    model = model.transform(CollapseRepeatedAdd())
    # insert top-k node, which should absorb linear ops before it
    model = model.transform(InferShapes())
    model = model.transform(InferDataLayouts())
    model = model.transform(InferDataTypes())

    model = model.transform(to_hls.InferChannelwiseLinearLayer())
    model = model.transform(to_hls.InferAddStreamsLayer())
    model = model.transform(to_hls.InferGlobalAccPoolLayer())
    model = model.transform(MoveScalarLinearPastInvariants())
    model = model.transform(InsertTopK())
    model = model.transform(AbsorbScalarMulAddIntoTopK())
    model = model.transform(InferDataTypes())
    model = model.transform(to_hls.InferLabelSelectLayer())
    model = model.transform(AbsorbConsecutiveTransposes())
    model = model.transform(InferDataTypes())
    model = model.transform(to_hls.InferLabelSelectLayer())
    model = model.transform(to_hls.InferDuplicateStreamsLayer())

    model = model.transform(SortGraph())
    # model.save("golden_hls.onnx")

    # check topology status
    finn_nodes = model.get_finn_nodes()
    assert len(finn_nodes) == 9
    add_nodes = model.get_nodes_by_op_type("AddStreams_Batch")
    assert len(add_nodes) == 1
    pool_nodes = model.get_nodes_by_op_type("GlobalAccPool_Batch")
    assert len(pool_nodes) == 1
    label_nodes = model.get_nodes_by_op_type("LabelSelect_Batch")
    assert len(label_nodes) == 1
    channelwise_nodes = model.get_nodes_by_op_type("ChannelwiseOp_Batch")
    assert len(channelwise_nodes) == 5
    dup_nodes = model.get_nodes_by_op_type("DuplicateStreams_Batch")
    assert len(dup_nodes) == 1

    model = model.transform(PrepareCppSim())
    model = model.transform(CompileCppSim())
    model = model.transform(SetExecMode("cppsim"))

    output_dict = oxe.execute_onnx(model, input_dict, True)
    produced_topk_hls = output_dict[model.graph.output[0].name]
    topk_input = output_dict[model.graph.node[-1].input[0]]
    assert soft_verify_topk(topk_input, produced_topk_hls, 5)

    os.remove(export_onnx_path)
def apply(self, model):
    # only makes sense for a pure fpgadataflow graph -- so we check!
    all_nodes = list(model.graph.node)
    assert all(
        get_by_name(x.attribute, "backend").s.decode("UTF-8") == "fpgadataflow"
        for x in all_nodes
    )
    # parse streamingfclayers looking for external weights with no attached IODMA
    fc_extw_nodes = list(
        filter(
            lambda x: x.op_type == "StreamingFCLayer_Batch"
            and get_by_name(x.attribute, "mem_mode") is not None
            and get_by_name(x.attribute, "mem_mode").s.decode("UTF-8") == "external"
            and model.find_producer(x.input[1]) is None,
            all_nodes,
        )
    )
    graph_in_name = model.graph.input[0].name
    first_node = model.find_consumer(graph_in_name)
    graph_out_name = model.graph.output[0].name
    final_node = model.find_producer(graph_out_name)
    if (
        final_node.op_type == "IODMA"
        and first_node.op_type == "IODMA"
        and len(fc_extw_nodes) == 0
    ):
        # TODO maybe check the correctness of properties
        return (model, False)
    else:
        if final_node.op_type != "IODMA":
            # check if tensor is NHWC
            assert (
                model.get_tensor_layout(graph_out_name) == DataLayout.NHWC
                or model.get_tensor_layout(graph_out_name) == DataLayout.NC
            ), "Data layout of output tensor must be NHWC or NC"
            out_shape = model.get_tensor_shape(graph_out_name)
            out_dtype = model.get_tensor_datatype(graph_out_name)
            # determine the feasible interface width
            transfer_bits = np.prod(out_shape) * out_dtype.bitwidth()
            intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
            assert intfwidth % 8 == 0, "No feasible interface width for transfer size"
            # get width of stream input to DMA
            streamWidth = getCustomOp(final_node).get_outstream_width()
            # make new buffer
            final_node_out = oh.make_tensor_value_info(
                model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
            )
            model.graph.value_info.append(final_node_out)
            model.set_tensor_datatype(final_node_out.name, out_dtype)
            # reroute final node output to final_node_out_name
            final_node.output[0] = final_node_out.name
            dma_node = oh.make_node(
                "IODMA",
                [final_node_out.name],
                [graph_out_name],
                numInputVectors=out_shape[:-1],
                NumChannels=out_shape[-1],
                dataType=str(out_dtype.name),
                intfWidth=intfwidth,
                streamWidth=streamWidth,
                direction="out",
                domain="finn",
                backend="fpgadataflow",
            )
            model.graph.node.append(dma_node)
        if first_node.op_type != "IODMA":
            # check if tensor is NHWC
            assert (
                model.get_tensor_layout(graph_in_name) == DataLayout.NHWC
                or model.get_tensor_layout(graph_in_name) == DataLayout.NC
            ), "Data layout of input tensor must be NHWC or NC"
            in_shape = model.get_tensor_shape(graph_in_name)
            in_dtype = model.get_tensor_datatype(graph_in_name)
            # determine the feasible interface width
            transfer_bits = np.prod(in_shape) * in_dtype.bitwidth()
            intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
            assert intfwidth % 8 == 0, "No feasible interface width for transfer size"
            # get width of stream output from DMA
            streamWidth = getCustomOp(first_node).get_instream_width()
            # make new buffer
            first_node_in = oh.make_tensor_value_info(
                model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape
            )
            model.graph.value_info.append(first_node_in)
            model.set_tensor_datatype(first_node_in.name, in_dtype)
            # reroute first node input to first_node_in
            first_node.input[0] = first_node_in.name
            dma_node = oh.make_node(
                "IODMA",
                [graph_in_name],
                [first_node_in.name],
                numInputVectors=in_shape[:-1],
                NumChannels=in_shape[-1],
                dataType=str(in_dtype.name),
                intfWidth=intfwidth,
                streamWidth=streamWidth,
                direction="in",
                domain="finn",
                backend="fpgadataflow",
            )
            model.graph.node.insert(0, dma_node)
        for fc_node in fc_extw_nodes:
            # check if tensor is NHWC
            assert (
                model.get_tensor_layout(fc_node.input[1]) == DataLayout.NHWC
                or model.get_tensor_layout(fc_node.input[1]) == DataLayout.NC
            ), "Data layout of tensors must be NHWC or NC"
            fc_w_name = fc_node.input[1]
            w_shape = model.get_tensor_shape(fc_w_name)
            w_dtype = model.get_tensor_datatype(fc_w_name)
            # determine the feasible interface width
            transfer_bits = np.prod(w_shape) * w_dtype.bitwidth()
            intfwidth = math.gcd(transfer_bits, self.max_intfwidth)
            assert intfwidth % 8 == 0, "No feasible interface width for transfer size"
            # calculate width of stream output from DMA
            pe = get_by_name(fc_node.attribute, "PE").i
            simd = get_by_name(fc_node.attribute, "SIMD").i
            assert pe * simd == w_shape[0], "Malformed weight matrix"
            streamWidth = simd * pe * w_dtype.bitwidth()
            # make new buffer
            fc_node_in = oh.make_tensor_value_info(
                model.make_new_valueinfo_name(), TensorProto.FLOAT, w_shape
            )
            model.graph.value_info.append(fc_node_in)
            model.set_tensor_datatype(fc_node_in.name, w_dtype)
            model.set_initializer(fc_node_in.name, model.get_initializer(fc_w_name))
            dma_node = oh.make_node(
                "IODMA",
                [fc_w_name],
                [fc_node_in.name],
                numInputVectors=[w_shape[1]],
                NumChannels=w_shape[0],
                dataType=str(w_dtype.name),
                intfWidth=intfwidth,
                streamWidth=streamWidth,
                direction="in",
                burstMode="wrap",
                domain="finn",
                backend="fpgadataflow",
            )
            fc_node.input[1] = fc_node_in.name
            model.graph.node.insert(0, dma_node)
        model = model.transform(SortGraph())
        return (model, True)
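# Usage sketch: the NHWC/NC layout asserts above mean data layouts must be
# annotated before this pass runs, e.g. (transform names as used elsewhere
# in this document):
# model = model.transform(InferDataLayouts())
# model = model.transform(InsertIODMA(max_intfwidth=64))  # width illustrative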