def throughput_test_rtlsim(model, batchsize=100):
    """Runs a throughput test for the given IP-stitched model. When combined
    with tracing, useful to determine bottlenecks and required FIFO sizes."""

    assert (
        model.get_metadata_prop("exec_mode") == "rtlsim"
    ), """Top-level exec_mode metadata_prop must be set to rtlsim"""

    # make empty exec context and insert random inputs
    ctx = model.make_empty_exec_context()
    i_bytes = 0
    for i_vi in model.graph.input:
        # create random input
        iname = i_vi.name
        ishape = model.get_tensor_shape(iname)
        ishape_batch = ishape
        ishape_batch[0] = batchsize
        idt = model.get_tensor_datatype(iname)
        dummy_input = gen_finn_dt_tensor(idt, ishape_batch)
        ctx[iname] = dummy_input
        i_bytes += (np.prod(ishape_batch) * idt.bitwidth()) / 8
    # compute total output size as well
    o_bytes = 0
    for o_vi in model.graph.output:
        oname = o_vi.name
        oshape = model.get_tensor_shape(oname)
        oshape_batch = oshape
        oshape_batch[0] = batchsize
        odt = model.get_tensor_datatype(oname)
        o_bytes += (np.prod(oshape_batch) * odt.bitwidth()) / 8

    # remove liveness threshold, launch rtlsim
    os.environ["LIVENESS_THRESHOLD"] = "-1"
    rtlsim_exec(model, ctx)
    # extract metrics
    cycles = int(model.get_metadata_prop("cycles_rtlsim"))
    clk_ns = float(model.get_metadata_prop("clk_ns"))
    fclk_mhz = 1 / (clk_ns * 0.001)
    runtime_s = (cycles * clk_ns) * (10**-9)
    res = dict()
    res["cycles"] = cycles
    res["runtime[ms]"] = runtime_s * 1000
    res["throughput[images/s]"] = batchsize / runtime_s
    res["DRAM_in_bandwidth[Mb/s]"] = i_bytes * 0.000001 / runtime_s
    res["DRAM_out_bandwidth[Mb/s]"] = o_bytes * 0.000001 / runtime_s
    res["fclk[mhz]"] = fclk_mhz
    res["N"] = batchsize

    return res
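# Illustrative usage sketch for throughput_test_rtlsim. The model file name is a
# hypothetical placeholder; assumes the model has already been taken through
# CreateStitchedIP/PrepareRTLSim so that stitched-IP rtlsim is available, and
# that ModelWrapper is imported alongside the other helpers used in this file.
def example_throughput_test():
    model = ModelWrapper("stitched_ip.onnx")  # hypothetical stitched-IP model
    model.set_metadata_prop("exec_mode", "rtlsim")
    res = throughput_test_rtlsim(model, batchsize=100)
    print("%d cycles at %.1f MHz" % (res["cycles"], res["fclk[mhz]"]))
    print("throughput: %.1f images/s" % res["throughput[images/s]"])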
def test_runtime_thresholds_single_layer():
    mem_mode = "decoupled"
    act = DataType["INT4"]
    idt = DataType["INT16"]
    nf = 8
    ich = 16
    pe = ich // nf
    assert ich % pe == 0

    # generate input data
    in_tensor = gen_finn_dt_tensor(idt, (1, ich))

    odt = act
    n_steps = act.get_num_possible_values() - 1
    T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32)
    # provide non-decreasing thresholds
    T = np.sort(T, axis=1)

    if odt == DataType["BIPOLAR"]:
        actval = 0
    else:
        actval = odt.min()

    model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode)
    op_inst = getCustomOp(model.graph.node[0])
    op_inst.set_nodeattr("runtime_writeable_weights", 1)
    op_inst.make_weight_file(T, "decoupled_runtime", "old_weights.dat")
    with open("old_weights.dat", "r") as f:
        old_weight_stream = f.read().strip()
    os.remove("old_weights.dat")
    old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n"))
    old_weight_stream = list(old_weight_stream)
    # need to create stitched IP for runtime weight testing
    model = model.transform(InsertFIFO(True))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
    model = model.transform(HLSSynthIP())
    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
    model = model.transform(PrepareRTLSim())
    model.set_metadata_prop("exec_mode", "rtlsim")
    # add two copies of the input tensor as the first one is just used to
    # "flush out" the pipeline (as mvau already starts receiving old weights while
    # we read/write new ones and reads seem to cause a disturbance too)
    in_tensor = np.tile(in_tensor, (2, 1))
    exec_ctx = {"inp": in_tensor}
    extracted_weight_stream = []

    def read_weights(sim):
        addr = 0
        for i in range(len(old_weight_stream)):
            extracted_weight_stream.append(
                axilite_read(sim, addr, basename="s_axilite_0_")
            )
            addr += 4

    rtlsim_exec(model, exec_ctx, pre_hook=read_weights)
    assert extracted_weight_stream == old_weight_stream
    # only use second batch element in output; first will be invalid due to
    # old weights (see above)
    y = exec_ctx["outp"][1]
    expected = multithreshold(in_tensor, T)[1]
    if act == DataType["BIPOLAR"]:
        # binary to bipolar
        expected = 2 * expected - 1
    else:
        # signed offset
        expected += act.min()
    assert (y == expected).all()

    new_weights = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(
        np.float32
    )
    # provide non-decreasing thresholds
    new_weights = np.sort(new_weights, axis=1)
    op_inst.make_weight_file(new_weights, "decoupled_runtime", "new_weights.dat")
    with open("new_weights.dat", "r") as f:
        new_weight_stream = f.read().strip()
    os.remove("new_weights.dat")
    new_weight_stream = map(lambda x: int(x, 16), new_weight_stream.split("\n"))
    new_weight_stream = list(new_weight_stream)

    def write_weights(sim):
        addr = 0
        for nw in new_weight_stream:
            axilite_write(sim, addr, nw, basename="s_axilite_0_")
            addr += 4

    rtlsim_exec(model, exec_ctx, pre_hook=write_weights)
    y = exec_ctx["outp"][1]
    expected = multithreshold(in_tensor, new_weights)[1]
    if act == DataType["BIPOLAR"]:
        # binary to bipolar
        expected = 2 * expected - 1
    else:
        # signed offset
        expected += act.min()
    assert (y == expected).all()
def execute_onnx(
    model, input_dict, return_full_exec_context=False, start_node=None, end_node=None
):
    """Executes given ONNX ModelWrapper with given named inputs.

    If return_full_exec_context is False, a dict of named outputs is returned
    as indicated by the model.graph.output.

    If return_full_exec_context is True, the full set of tensors used by the
    execution (including inputs, weights, activations and final outputs) will
    be returned as a dict.

    When start_node and end_node are set to None, the whole graph is executed.
    If they are set to particular ONNX nodes, only the subgraph between (and
    including) those nodes is executed.
    """

    if not model.check_all_tensor_shapes_specified():
        raise Exception("Found unspecified tensor shapes, try infer_shapes")
    ret = model.analysis(ta.nodes_topologically_sorted)
    assert (
        ret["nodes_topologically_sorted"] is True
    ), """Nodes must be topologically sorted."""

    graph = model.graph
    # first, we need to make sure that every variable required by the graph has
    # some buffer associated with it. this includes graph inputs (which includes
    # the input data as well as the trained parameters) and the graph ValueInfo
    # (intermediate tensors between layers)
    # this is provided by the execution_context, which is a dict of np.ndarray
    execution_context = model.make_empty_exec_context()
    # fill in any inputs provided to this function
    for inp_name in input_dict.keys():
        if inp_name in execution_context:
            if execution_context[inp_name].shape == input_dict[inp_name].shape:
                execution_context[inp_name] = input_dict[inp_name]
            else:
                raise Exception(
                    "Shape mismatch for provided input %s: found %s expected %s "
                    % (
                        inp_name,
                        str(input_dict[inp_name].shape),
                        str(execution_context[inp_name].shape),
                    )
                )
        # else:
        #     raise Exception("Provided input not found in graph context: %s" % inp_name)

    # check if model has an execution mode set
    # if None, execute model node by node using execute_node()
    # if set to "remote_pynq" execute model on PYNQ board
    # if set to "rtlsim" execute model using pyverilator
    model_exec_mode = model.get_metadata_prop("exec_mode")
    if (model_exec_mode is None) or (model_exec_mode == ""):
        # execute the model node by node
        # we can simply walk down the list since the ONNX spec guarantees that it is
        # topologically sorted
        subgraph = []
        if start_node is None:
            start_node = model.graph.node[0]
        if end_node is None:
            end_node = model.graph.node[-1]
        # select the nodes between specified start/end nodes
        start_ind = model.get_node_index(start_node)
        end_ind = model.get_node_index(end_node) + 1
        assert end_ind >= start_ind, "Start/end nodes must define valid subgraph"
        subgraph = graph.node[start_ind:end_ind]
        for node in subgraph:
            if get_sanitize_quant_tensors() != 0:
                # round input values to match quantization annotation
                execution_context = sanitize_quant_values(
                    model, node.input, execution_context
                )
            execute_node(node, execution_context, graph, return_full_exec_context)
            if get_sanitize_quant_tensors() != 0:
                # round output values to quantization annotation
                execution_context = sanitize_quant_values(
                    model, node.output, execution_context
                )
    elif model_exec_mode == "remote_pynq":
        # use remote exec metadata built into model to execute on a remote PYNQ
        remote_exec(model, execution_context)
    elif model_exec_mode == "rtlsim":
        # use stitched IP for rtlsim
        rtlsim_exec(model, execution_context)
    else:
        raise Exception(
            """Metadata property "exec_mode" is set to an unknown value.
        Can be left unset or has to be set to "remote_pynq" for remote execution
        on PYNQ board or "rtlsim" for execution using pyverilator!"""
        )

    if return_full_exec_context:
        return execution_context
    else:
        # provide outputs as dict
        output_dict = dict()
        for out_tensor in graph.output:
            out_name = out_tensor.name
            output_dict[out_name] = execution_context[out_name]
        return output_dict
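# Minimal usage sketch for execute_onnx, including the optional subgraph
# selection via start_node/end_node. The model file name is a hypothetical
# placeholder; assumes the graph has at least two nodes and a single input.
def example_execute_onnx_subgraph():
    model = ModelWrapper("model.onnx")  # hypothetical model file
    iname = model.graph.input[0].name
    x = gen_finn_dt_tensor(
        model.get_tensor_datatype(iname), model.get_tensor_shape(iname)
    )
    # full execution: returns only the tensors named in model.graph.output
    outputs = execute_onnx(model, {iname: x})
    # subgraph execution: run only the first two nodes and keep all tensors
    full_ctx = execute_onnx(
        model,
        {iname: x},
        return_full_exec_context=True,
        start_node=model.graph.node[0],
        end_node=model.graph.node[1],
    )
    return outputs, full_ctx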
def test_runtime_weights_single_layer():
    idt = DataType["UINT32"]
    wdt = DataType["UINT4"]
    act = None
    mw = 64
    mh = 32
    pe = 4
    simd = 16
    layer_spec = {
        "idt": idt,
        "wdt": wdt,
        "mw": mw,
        "mh": mh,
        "act": act,
        "pe": pe,
        "simd": simd,
    }
    layer_spec_list = [layer_spec]
    model = hls_random_mlp_maker(layer_spec_list)
    fcl = model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0]
    op_inst = getCustomOp(fcl)
    op_inst.set_nodeattr("mem_mode", "decoupled")
    op_inst.set_nodeattr("runtime_writeable_weights", 1)
    old_weights = model.get_initializer(fcl.input[1])
    op_inst.make_weight_file(old_weights, "decoupled_runtime", "old_weights.dat")
    with open("old_weights.dat", "r") as f:
        old_weight_stream = f.read().strip()
    os.remove("old_weights.dat")
    old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n"))
    old_weight_stream = list(old_weight_stream)
    model = model.transform(InsertFIFO(True))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
    model = model.transform(HLSSynthIP())
    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
    model.set_metadata_prop("exec_mode", "rtlsim")
    in_tensor = np.asarray(range(mw), dtype=np.float32)
    # add two copies of the input tensor as the first one is just used to
    # "flush out" the pipeline (as mvau already starts receiving old weights while
    # we read/write new ones and reads seem to cause a disturbance too)
    in_tensor = np.tile(in_tensor, (2, 1))
    exec_ctx = {"act_0": in_tensor}
    extracted_weight_stream = []

    def read_weights(sim):
        addr = 0
        for i in range(len(old_weight_stream)):
            extracted_weight_stream.append(
                axilite_read(sim, addr, basename="s_axilite_0_")
            )
            addr += 4

    rtlsim_exec(model, exec_ctx, pre_hook=read_weights)
    assert extracted_weight_stream == old_weight_stream
    y = exec_ctx["act_1"]
    # only use second batch element in output; first will be invalid due to
    # old weights (see above)
    assert (y[1] == np.dot(in_tensor[1], old_weights)).all()

    new_weights = gen_finn_dt_tensor(wdt, (mw, mh))
    op_inst.make_weight_file(new_weights, "decoupled_runtime", "new_weights.dat")
    with open("new_weights.dat", "r") as f:
        new_weight_stream = f.read().strip()
    os.remove("new_weights.dat")
    new_weight_stream = map(lambda x: int(x, 16), new_weight_stream.split("\n"))
    new_weight_stream = list(new_weight_stream)

    def write_weights(sim):
        addr = 0
        for nw in new_weight_stream:
            axilite_write(sim, addr, nw, basename="s_axilite_0_")
            addr += 4

    rtlsim_exec(model, exec_ctx, pre_hook=write_weights)
    y = exec_ctx["act_1"]
    # only use second batch element in output; first will be invalid due to
    # old weights (see above)
    assert (y[1] == np.dot(in_tensor[1], new_weights)).all()
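# Generalized sketch of the AXI-lite access pattern used by the pre_hook
# functions above: each line of a "decoupled_runtime" .dat weight file is one
# 32-bit hex word, and consecutive words occupy consecutive 4-byte-aligned
# addresses on the runtime-weights AXI-lite interface. The helper name and the
# default basename here are assumptions for illustration.
def load_weight_stream(sim, weight_stream, basename="s_axilite_0_"):
    for i, word in enumerate(weight_stream):
        axilite_write(sim, 4 * i, word, basename=basename)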
def execute_onnx(model, input_dict, return_full_exec_context=False):
    """Executes given ONNX ModelWrapper with given named inputs.

    If return_full_exec_context is False, a dict of named outputs is returned
    as indicated by the model.graph.output.

    If return_full_exec_context is True, the full set of tensors used by the
    execution (including inputs, weights, activations and final outputs) will
    be returned as a dict."""

    if not model.check_all_tensor_shapes_specified():
        raise Exception("Found unspecified tensor shapes, try infer_shapes")
    graph = model.graph
    # first, we need to make sure that every variable required by the graph has
    # some buffer associated with it. this includes graph inputs (which includes
    # the input data as well as the trained parameters) and the graph ValueInfo
    # (intermediate tensors between layers)
    # this is provided by the execution_context, which is a dict of np.ndarray
    execution_context = model.make_empty_exec_context()
    # fill in any inputs provided to this function
    for inp_name in input_dict.keys():
        if inp_name in execution_context:
            if execution_context[inp_name].shape == input_dict[inp_name].shape:
                execution_context[inp_name] = input_dict[inp_name]
            else:
                raise Exception(
                    "Shape mismatch for provided input %s: found %s expected %s "
                    % (
                        inp_name,
                        str(input_dict[inp_name].shape),
                        str(execution_context[inp_name].shape),
                    )
                )
        # else:
        #     raise Exception("Provided input not found in graph context: %s" % inp_name)

    # check if model has an execution mode set
    # if None, execute model node by node using execute_node()
    # if set to "remote_pynq" execute model on PYNQ board
    # if set to "rtlsim" execute model using pyverilator
    model_exec_mode = model.get_metadata_prop("exec_mode")
    if (model_exec_mode is None) or (model_exec_mode == ""):
        # execute the model node by node
        # we can simply walk down the list since the ONNX spec guarantees that it is
        # topologically sorted
        for node in graph.node:
            execute_node(node, execution_context, graph)
    elif model_exec_mode == "remote_pynq":
        # use remote exec metadata built into model to execute on a remote PYNQ
        remote_exec(model, execution_context)
    elif model_exec_mode == "rtlsim":
        # use stitched IP for rtlsim
        rtlsim_exec(model, execution_context)
    else:
        raise Exception(
            """Metadata property "exec_mode" is set to an unknown value.
        Can be left unset or has to be set to "remote_pynq" for remote execution
        on PYNQ board or "rtlsim" for execution using pyverilator!"""
        )

    if return_full_exec_context:
        return execution_context
    else:
        # provide outputs as dict
        output_dict = dict()
        for out_tensor in graph.output:
            out_name = out_tensor.name
            output_dict[out_name] = execution_context[out_name]
        return output_dict
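# Sketch of how the exec_mode metadata property redirects execute_onnx: unset
# (or empty) runs the graph node by node in software, while "rtlsim" dispatches
# to the stitched-IP rtlsim backend. Assumes the model has already been
# prepared for rtlsim as in the tests above; the function name is hypothetical.
def example_exec_mode_dispatch(model, x):
    iname = model.graph.input[0].name
    model.set_metadata_prop("exec_mode", "")  # node-by-node software execution
    out_sw = execute_onnx(model, {iname: x})
    model.set_metadata_prop("exec_mode", "rtlsim")  # stitched-IP rtlsim
    out_hw = execute_onnx(model, {iname: x})
    return out_sw, out_hw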