Example #1

import os

import numpy as np

from finn.core.rtlsim_exec import rtlsim_exec
from finn.util.basic import gen_finn_dt_tensor

def throughput_test_rtlsim(model, batchsize=100):
    """Runs a throughput test for the given IP-stitched model. When combined
    with tracing, useful to determine bottlenecks and required FIFO sizes."""

    assert (
        model.get_metadata_prop("exec_mode") == "rtlsim"
    ), "Top-level exec_mode metadata_prop must be set to rtlsim"

    # make empty exec context and insert random inputs
    ctx = model.make_empty_exec_context()
    i_bytes = 0
    for i_vi in model.graph.input:
        # create random input
        iname = i_vi.name
        ishape = model.get_tensor_shape(iname)
        # copy the shape before overwriting the batch dimension
        ishape_batch = list(ishape)
        ishape_batch[0] = batchsize
        idt = model.get_tensor_datatype(iname)
        dummy_input = gen_finn_dt_tensor(idt, ishape_batch)
        ctx[iname] = dummy_input
        i_bytes += (np.prod(ishape_batch) * idt.bitwidth()) / 8

    # compute total output size as well
    o_bytes = 0
    for o_vi in model.graph.output:
        oname = o_vi.name
        oshape = model.get_tensor_shape(oname)
        # copy the shape before overwriting the batch dimension
        oshape_batch = list(oshape)
        oshape_batch[0] = batchsize
        odt = model.get_tensor_datatype(oname)
        o_bytes += (np.prod(oshape_batch) * odt.bitwidth()) / 8

    # remove liveness threshold, launch rtlsim
    os.environ["LIVENESS_THRESHOLD"] = "-1"
    rtlsim_exec(model, ctx)
    # extract metrics
    cycles = int(model.get_metadata_prop("cycles_rtlsim"))
    clk_ns = float(model.get_metadata_prop("clk_ns"))
    fclk_mhz = 1 / (clk_ns * 0.001)
    runtime_s = (cycles * clk_ns) * (10**-9)
    res = dict()
    res["cycles"] = cycles
    res["runtime[ms]"] = runtime_s * 1000
    res["throughput[images/s]"] = batchsize / runtime_s
    res["DRAM_in_bandwidth[Mb/s]"] = i_bytes * 0.000001 / runtime_s
    res["DRAM_out_bandwidth[Mb/s]"] = o_bytes * 0.000001 / runtime_s
    res["fclk[mhz]"] = fclk_mhz
    res["N"] = batchsize

    return res
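
A minimal usage sketch, assuming the model has already been through CreateStitchedIP and rtlsim preparation; the file name here is hypothetical:

import numpy as np
from finn.core.modelwrapper import ModelWrapper

# hypothetical path to an IP-stitched model saved earlier in the flow
model = ModelWrapper("stitched_ip.onnx")
model.set_metadata_prop("exec_mode", "rtlsim")
res = throughput_test_rtlsim(model, batchsize=100)
print("throughput: %.1f images/s at %.1f MHz" %
      (res["throughput[images/s]"], res["fclk[MHz]"]))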
Example #2

import os

import numpy as np

from finn.core.datatype import DataType
from finn.core.rtlsim_exec import rtlsim_exec
from finn.custom_op.general.multithreshold import multithreshold
from finn.custom_op.registry import getCustomOp
from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
from finn.transformation.general import GiveUniqueNodeNames
from finn.util.basic import gen_finn_dt_tensor
from finn.util.pyverilator import axilite_read, axilite_write

# test_fpga_part, target_clk_ns and make_single_thresholding_modelwrapper are
# defined elsewhere in the test module


def test_runtime_thresholds_single_layer():
    mem_mode = "decoupled"
    act = DataType["INT4"]
    idt = DataType["INT16"]
    nf = 8
    ich = 16
    pe = ich // nf
    assert ich % pe == 0

    # generate input data
    in_tensor = gen_finn_dt_tensor(idt, (1, ich))

    odt = act
    n_steps = act.get_num_possible_values() - 1
    T = np.random.randint(idt.min(),
                          idt.max() + 1, (ich, n_steps)).astype(np.float32)
    # provide non-decreasing thresholds
    T = np.sort(T, axis=1)

    if odt == DataType["BIPOLAR"]:
        actval = 0
    else:
        actval = odt.min()

    model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval,
                                                  mem_mode)
    op_inst = getCustomOp(model.graph.node[0])
    op_inst.set_nodeattr("runtime_writeable_weights", 1)
    op_inst.make_weight_file(T, "decoupled_runtime", "old_weights.dat")
    with open("old_weights.dat", "r") as f:
        old_weight_stream = f.read().strip()
    os.remove("old_weights.dat")
    old_weight_stream = [int(x, 16) for x in old_weight_stream.split("\n")]
    # need to create stitched IP for runtime weight testing
    model = model.transform(InsertFIFO(True))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
    model = model.transform(HLSSynthIP())
    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
    model = model.transform(PrepareRTLSim())
    model.set_metadata_prop("exec_mode", "rtlsim")
    # add two copies of the input tensor; the first one is just used to
    # "flush out" the pipeline (the layer already starts receiving old weights
    # while we read/write new ones, and reads seem to cause a disturbance too)
    in_tensor = np.tile(in_tensor, (2, 1))
    exec_ctx = {"inp": in_tensor}
    extracted_weight_stream = []

    def read_weights(sim):
        addr = 0
        for i in range(len(old_weight_stream)):
            extracted_weight_stream.append(
                axilite_read(sim, addr, basename="s_axilite_0_"))
            addr += 4

    rtlsim_exec(model, exec_ctx, pre_hook=read_weights)
    assert extracted_weight_stream == old_weight_stream
    # only use second batch element in output; first will be invalid due to
    # old weights (see above)
    y = exec_ctx["outp"][1]
    expected = multithreshold(in_tensor, T)[1]
    if act == DataType["BIPOLAR"]:
        # binary to bipolar
        expected = 2 * expected - 1
    else:
        # signed offset
        expected += act.min()
    assert (y == expected).all()

    new_weights = np.random.randint(idt.min(),
                                    idt.max() + 1,
                                    (ich, n_steps)).astype(np.float32)
    # provide non-decreasing thresholds (sort the freshly generated weights,
    # not the old T)
    new_weights = np.sort(new_weights, axis=1)
    op_inst.make_weight_file(new_weights, "decoupled_runtime",
                             "new_weights.dat")
    with open("new_weights.dat", "r") as f:
        new_weight_stream = f.read().strip()
    os.remove("new_weights.dat")
    new_weight_stream = [int(x, 16) for x in new_weight_stream.split("\n")]

    def write_weights(sim):
        addr = 0
        for nw in new_weight_stream:
            axilite_write(sim, addr, nw, basename="s_axilite_0_")
            addr += 4

    rtlsim_exec(model, exec_ctx, pre_hook=write_weights)
    y = exec_ctx["outp"][1]
    expected = multithreshold(in_tensor, new_weights)[1]
    if act == DataType["BIPOLAR"]:
        # binary to bipolar
        expected = 2 * expected - 1
    else:
        # signed offset
        expected += act.min()
    assert (y == expected).all()
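
The .dat parsing above (read, strip, split on newlines, one int(x, 16) per line) repeats for the old and new weight files; a small helper capturing that pattern, assuming make_weight_file emits one 32-bit hex word per line:

def load_weight_stream(path):
    # parse a "decoupled_runtime" weight file: one hex word per line
    with open(path, "r") as f:
        lines = f.read().strip().split("\n")
    return [int(x, 16) for x in lines]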
Example #3

import finn.analysis.topology as ta
from finn.core.remote_exec import remote_exec
from finn.core.rtlsim_exec import rtlsim_exec
from finn.util.basic import get_sanitize_quant_tensors, sanitize_quant_values

# execute_node is defined in the same module (finn.core.onnx_exec)


def execute_onnx(model,
                 input_dict,
                 return_full_exec_context=False,
                 start_node=None,
                 end_node=None):
    """Executes given ONNX ModelWrapper with given named inputs.

    If return_full_exec_context is False, a dict of named outputs is returned
    as indicated by the model.graph.output.

    If return_full_exec_context is True, the full set of tensors used by
    the execution (including inputs, weights, activations and final outputs)
    will be returned as a dict.

    When start_node and end_node are set to None, the whole graph is executed.
    If they are set to particular ONNX nodes, only the subgraph between (and
    including) those nodes is executed.
    """

    if not model.check_all_tensor_shapes_specified():
        raise Exception("Found unspecified tensor shapes, try infer_shapes")
    ret = model.analysis(ta.nodes_topologically_sorted)
    assert (ret["nodes_topologically_sorted"] is True), """Nodes must be
    topologically sorted."""

    graph = model.graph
    # first, we need to make sure that every variable required by the graph has
    # some buffer associated with it. this includes graph inputs (which includes
    # the input data as well as the trained parameters) and the graph ValueInfo
    # (intermediate tensors between layers)
    # this is provided by the execution_context, which is a dict of np.ndarray
    execution_context = model.make_empty_exec_context()
    # fill in any inputs provided to this function
    for inp_name in input_dict.keys():
        if inp_name in execution_context:
            if execution_context[inp_name].shape == input_dict[inp_name].shape:
                execution_context[inp_name] = input_dict[inp_name]
            else:
                raise Exception(
                    "Shape mismatch for provided input %s: found %s expected %s"
                    % (
                        inp_name,
                        str(input_dict[inp_name].shape),
                        str(execution_context[inp_name].shape),
                    ))
        # else:
        # raise Exception("Provided input not found in graph context: %s" % inp_name)

    # check if model has an execution mode set
    # if None, execute model node by node using execute_node()
    # if set to "remote_pynq" execute model on PYNQ board
    # if set to "rtlsim" execute model using pyverilator
    model_exec_mode = model.get_metadata_prop("exec_mode")
    if (model_exec_mode is None) or (model_exec_mode == ""):
        # execute the model node by node
        # we can simply walk down the list since the ONNX spec guarantees that it is
        # topologically sorted
        if start_node is None:
            start_node = model.graph.node[0]
        if end_node is None:
            end_node = model.graph.node[-1]
        # select the nodes between specified start/end nodes
        start_ind = model.get_node_index(start_node)
        end_ind = model.get_node_index(end_node) + 1
        assert end_ind >= start_ind, "Start/end nodes must define valid subgraph"
        subgraph = graph.node[start_ind:end_ind]
        for node in subgraph:
            if get_sanitize_quant_tensors() != 0:
                # round input values to match quantization annotation
                execution_context = sanitize_quant_values(
                    model, node.input, execution_context)
            execute_node(node, execution_context, graph,
                         return_full_exec_context)
            if get_sanitize_quant_tensors() != 0:
                # round output values to quantization annotation
                execution_context = sanitize_quant_values(
                    model, node.output, execution_context)
    elif model_exec_mode == "remote_pynq":
        # use remote exec metadata built into model to execute on a remote PYNQ
        remote_exec(model, execution_context)
    elif model_exec_mode == "rtlsim":
        # use stitched IP for rtlsim
        rtlsim_exec(model, execution_context)
    else:
        raise Exception(
            """Metadata property "exec_mode" is set to an unknown value.
        Can be left unset or has to be set to "remote_pynq" for remote execution
        on PYNQ board or "rtlsim" for execution using pyverilator!""")

    if return_full_exec_context:
        return execution_context
    else:
        # provide outputs as dict
        output_dict = dict()
        for out_tensor in graph.output:
            out_name = out_tensor.name
            output_dict[out_name] = execution_context[out_name]
        return output_dict
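
A usage sketch, assuming a shape-inferred model with at least three nodes; the file name and node indices are hypothetical:

import numpy as np
from finn.core.modelwrapper import ModelWrapper

model = ModelWrapper("model.onnx")  # hypothetical shape-inferred model
iname = model.graph.input[0].name
x = np.zeros(model.get_tensor_shape(iname), dtype=np.float32)
# run the whole graph and collect only the named outputs
outputs = execute_onnx(model, {iname: x})
# or run just the first three nodes and keep every intermediate tensor
ctx = execute_onnx(model, {iname: x}, return_full_exec_context=True,
                   start_node=model.graph.node[0],
                   end_node=model.graph.node[2])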
Example #4

import os

import numpy as np

from finn.core.datatype import DataType
from finn.core.rtlsim_exec import rtlsim_exec
from finn.custom_op.registry import getCustomOp
from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
from finn.transformation.general import GiveUniqueNodeNames
from finn.util.basic import gen_finn_dt_tensor
from finn.util.create import hls_random_mlp_maker
from finn.util.pyverilator import axilite_read, axilite_write

# test_fpga_part and target_clk_ns are defined elsewhere in the test module


def test_runtime_weights_single_layer():
    idt = DataType["UINT32"]
    wdt = DataType["UINT4"]
    act = None
    mw = 64
    mh = 32
    pe = 4
    simd = 16
    layer_spec = {
        "idt": idt,
        "wdt": wdt,
        "mw": mw,
        "mh": mh,
        "act": act,
        "pe": pe,
        "simd": simd,
    }
    layer_spec_list = [layer_spec]
    model = hls_random_mlp_maker(layer_spec_list)
    fcl = model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0]
    op_inst = getCustomOp(fcl)
    op_inst.set_nodeattr("mem_mode", "decoupled")
    op_inst.set_nodeattr("runtime_writeable_weights", 1)
    old_weights = model.get_initializer(fcl.input[1])
    op_inst.make_weight_file(old_weights, "decoupled_runtime",
                             "old_weights.dat")
    with open("old_weights.dat", "r") as f:
        old_weight_stream = f.read().strip()
    os.remove("old_weights.dat")
    old_weight_stream = [int(x, 16) for x in old_weight_stream.split("\n")]
    model = model.transform(InsertFIFO(True))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
    model = model.transform(HLSSynthIP())
    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
    model.set_metadata_prop("exec_mode", "rtlsim")
    in_tensor = np.asarray(range(mw), dtype=np.float32)
    # add two copies of the input tensor; the first one is just used to
    # "flush out" the pipeline (the MVAU already starts receiving old weights
    # while we read/write new ones, and reads seem to cause a disturbance too)
    in_tensor = np.tile(in_tensor, (2, 1))
    exec_ctx = {"act_0": in_tensor}
    extracted_weight_stream = []

    def read_weights(sim):
        addr = 0
        for i in range(len(old_weight_stream)):
            extracted_weight_stream.append(
                axilite_read(sim, addr, basename="s_axilite_0_"))
            addr += 4

    rtlsim_exec(model, exec_ctx, pre_hook=read_weights)
    assert extracted_weight_stream == old_weight_stream
    y = exec_ctx["act_1"]
    # only use second batch element in output; first will be invalid due to
    # old weights (see above)
    assert (y[1] == np.dot(in_tensor[1], old_weights)).all()

    new_weights = gen_finn_dt_tensor(wdt, (mw, mh))
    op_inst.make_weight_file(new_weights, "decoupled_runtime",
                             "new_weights.dat")
    with open("new_weights.dat", "r") as f:
        new_weight_stream = f.read().strip()
    os.remove("new_weights.dat")
    new_weight_stream = [int(x, 16) for x in new_weight_stream.split("\n")]

    def write_weights(sim):
        addr = 0
        for nw in new_weight_stream:
            axilite_write(sim, addr, nw, basename="s_axilite_0_")
            addr += 4

    rtlsim_exec(model, exec_ctx, pre_hook=write_weights)
    y = exec_ctx["act_1"]
    # only use second batch element in output; first will be invalid due to
    # old weights (see above)
    assert (y[1] == np.dot(in_tensor[1], new_weights)).all()
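
The read_weights/write_weights callbacks in both runtime-weights tests follow the same pattern: rtlsim_exec invokes pre_hook with the simulation handle before streaming inputs, and the registers sit at consecutive 4-byte offsets. A generic sketch of that pattern; the word count is an assumption, to be taken from the weight stream length:

def dump_axilite(sim, n_words, basename="s_axilite_0_"):
    # read n_words consecutive 32-bit registers over the AXI-lite interface
    return [axilite_read(sim, 4 * i, basename=basename) for i in range(n_words)]

# e.g. rtlsim_exec(model, exec_ctx,
#                  pre_hook=lambda sim: dump_axilite(sim, len(old_weight_stream)))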
Example #5

from finn.core.remote_exec import remote_exec
from finn.core.rtlsim_exec import rtlsim_exec

# execute_node is defined in the same module (finn.core.onnx_exec)


def execute_onnx(model, input_dict, return_full_exec_context=False):
    """Executes given ONNX ModelWrapper with given named inputs.

    If return_full_exec_context is False, a dict of named outputs is returned
    as indicated by the model.graph.output.

    If return_full_exec_context is True, the full set of tensors used by
    the execution (including inputs, weights, activations and final outputs)
    will be returned as a dict."""

    if not model.check_all_tensor_shapes_specified():
        raise Exception("Found unspecified tensor shapes, try infer_shapes")

    graph = model.graph
    # first, we need to make sure that every variable required by the graph has
    # some buffer associated with it. this includes graph inputs (which includes
    # the input data as well as the trained parameters) and the graph ValueInfo
    # (intermediate tensors between layers)
    # this is provided by the execution_context, which is a dict of np.ndarray
    execution_context = model.make_empty_exec_context()
    # fill in any inputs provided to this function
    for inp_name in input_dict.keys():
        if inp_name in execution_context:
            if execution_context[inp_name].shape == input_dict[inp_name].shape:
                execution_context[inp_name] = input_dict[inp_name]
            else:
                raise Exception(
                    "Shape mismatch for provided input %s: found %s expected %s"
                    % (
                        inp_name,
                        str(input_dict[inp_name].shape),
                        str(execution_context[inp_name].shape),
                    ))
        # else:
        # raise Exception("Provided input not found in graph context: %s" % inp_name)

    # check if model has an execution mode set
    # if None, execute model node by node using execute_node()
    # if set to "remote_pynq" execute model on PYNQ board
    # if set to "rtlsim" execute model using pyverilator
    model_exec_mode = model.get_metadata_prop("exec_mode")
    if (model_exec_mode is None) or (model_exec_mode == ""):
        # execute the model node by node
        # we can simply walk down the list since the ONNX spec guarantees that it is
        # topologically sorted
        for node in graph.node:
            execute_node(node, execution_context, graph)
    elif model_exec_mode == "remote_pynq":
        # use remote exec metadata built into model to execute on a remote PYNQ
        remote_exec(model, execution_context)
    elif model_exec_mode == "rtlsim":
        # use stitched IP for rtlsim
        rtlsim_exec(model, execution_context)
    else:
        raise Exception(
            """Metadata property "exec_mode" is set to an unknown value.
        Can be left unset or has to be set to "remote_pynq" for remote execution
        on PYNQ board or "rtlsim" for execution using pyverilator!""")

    if return_full_exec_context:
        return execution_context
    else:
        # provide outputs as dict
        output_dict = dict()
        for out_tensor in graph.output:
            out_name = out_tensor.name
            output_dict[out_name] = execution_context[out_name]
        return output_dict
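
A sketch of the exec_mode dispatch from the caller's side; the model path is hypothetical, and the same call falls back to node-by-node execution when the property is left unset:

import numpy as np
from finn.core.modelwrapper import ModelWrapper

model = ModelWrapper("stitched_ip.onnx")  # hypothetical stitched-IP model
model.set_metadata_prop("exec_mode", "rtlsim")  # or "remote_pynq", or leave unset
iname = model.graph.input[0].name
x = np.zeros(model.get_tensor_shape(iname), dtype=np.float32)
outputs = execute_onnx(model, {iname: x})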