def test_runtime_thresholds_single_layer():
    """Exercise runtime-writable thresholds on a single thresholding layer.

    A one-node thresholding model is built in "decoupled" memory mode with
    ``runtime_writeable_weights`` enabled, turned into stitched IP and run in
    rtlsim twice:

    1. A pre-hook reads the threshold memory back over AXI-lite and the words
       are compared against the generated weight file; the layer output is
       compared against ``multithreshold`` with the original thresholds.
    2. A pre-hook writes a freshly drawn threshold set over AXI-lite and the
       layer output is compared against ``multithreshold`` with the new
       thresholds.
    """
    mem_mode = "decoupled"
    act = DataType["INT4"]
    idt = DataType["INT16"]
    nf = 8
    ich = 16
    pe = ich // nf
    assert ich % pe == 0

    # generate input data
    in_tensor = gen_finn_dt_tensor(idt, (1, ich))

    odt = act
    n_steps = act.get_num_possible_values() - 1
    T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32)
    # provide non-decreasing thresholds
    T = np.sort(T, axis=1)

    if odt == DataType["BIPOLAR"]:
        actval = 0
    else:
        actval = odt.min()

    model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode)
    op_inst = getCustomOp(model.graph.node[0])
    op_inst.set_nodeattr("runtime_writeable_weights", 1)

    # dump the initial thresholds to a file and parse it back as a list of
    # integers (one hexadecimal word per line)
    op_inst.make_weight_file(T, "decoupled_runtime", "old_weights.dat")
    with open("old_weights.dat", "r") as f:
        old_weight_stream = f.read().strip()
    os.remove("old_weights.dat")
    old_weight_stream = [int(word, 16) for word in old_weight_stream.split("\n")]

    # need to create stitched IP for runtime weight testing
    model = model.transform(InsertFIFO(True))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
    model = model.transform(HLSSynthIP())
    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
    model = model.transform(PrepareRTLSim())
    model.set_metadata_prop("exec_mode", "rtlsim")

    # add two copies of the input tensor as the first one is just used to
    # "flush out" the pipeline (as mvau already starts receiving old weights while
    # we read/write new ones and reads seem to cause a disturbance too)
    in_tensor = np.tile(in_tensor, (2, 1))
    exec_ctx = {"inp": in_tensor}
    extracted_weight_stream = []

    def read_weights(sim):
        # one AXI-lite read per expected word, addresses advance by 4
        addr = 0
        for _ in old_weight_stream:
            extracted_weight_stream.append(
                axilite_read(sim, addr, basename="s_axilite_0_")
            )
            addr += 4

    rtlsim_exec(model, exec_ctx, pre_hook=read_weights)
    assert extracted_weight_stream == old_weight_stream

    # only use second batch element in output; first will be invalid due to
    # old weights (see above)
    y = exec_ctx["outp"][1]
    expected = multithreshold(in_tensor, T)[1]
    if act == DataType["BIPOLAR"]:
        # binary to bipolar
        expected = 2 * expected - 1
    else:
        # signed offset
        expected += act.min()
    assert (y == expected).all()

    new_weights = np.random.randint(
        idt.min(), idt.max() + 1, (ich, n_steps)
    ).astype(np.float32)
    # provide non-decreasing thresholds; sort the NEW draw (sorting T here
    # would silently reuse the old thresholds and make the write check vacuous)
    new_weights = np.sort(new_weights, axis=1)
    op_inst.make_weight_file(new_weights, "decoupled_runtime", "new_weights.dat")
    with open("new_weights.dat", "r") as f:
        new_weight_stream = f.read().strip()
    os.remove("new_weights.dat")
    new_weight_stream = [int(word, 16) for word in new_weight_stream.split("\n")]

    def write_weights(sim):
        # one AXI-lite write per new word, addresses advance by 4
        addr = 0
        for nw in new_weight_stream:
            axilite_write(sim, addr, nw, basename="s_axilite_0_")
            addr += 4

    rtlsim_exec(model, exec_ctx, pre_hook=write_weights)
    y = exec_ctx["outp"][1]
    expected = multithreshold(in_tensor, new_weights)[1]
    if act == DataType["BIPOLAR"]:
        # binary to bipolar
        expected = 2 * expected - 1
    else:
        # signed offset
        expected += act.min()
    assert (y == expected).all()
def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode):
    """Compare a single thresholding layer against ``multithreshold``.

    Args:
        idt: input datatype; its ``min()``/``max()`` bound the random thresholds.
        act: output (activation) datatype.
        nf: folding factor; ``-1`` selects ``ich``.
        ich: number of input channels.
        exec_mode: ``"cppsim"`` or ``"rtlsim"``.
        mem_mode: memory mode forwarded to the model builder.

    Raises:
        Exception: if ``exec_mode`` is neither ``"cppsim"`` nor ``"rtlsim"``.
    """
    if nf == -1:
        nf = ich
    pe = ich // nf
    assert ich % pe == 0

    # generate input data
    x = gen_finn_dt_tensor(idt, (1, ich))

    odt = act
    n_steps = act.get_num_possible_values() - 1
    T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32)
    # make the vivado_hls threshold bug appear (incorrect rtlsim result when first
    # threshold of first channel is zero, while using BIPOLAR output)
    # NOTE(review): the sort below can move this zero away from index 0 when
    # the row also holds negative values -- confirm this still triggers the bug
    if act == DataType["BIPOLAR"]:
        T[0][0] = 0
    # provide non-decreasing thresholds
    T = np.sort(T, axis=1)

    if odt == DataType["BIPOLAR"]:
        actval = 0
    else:
        actval = odt.min()

    model = make_single_thresholding_modelwrapper(T, pe, idt, odt, actval, mem_mode)

    if exec_mode == "cppsim":
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
        model = model.transform(SetExecMode("cppsim"))
    elif exec_mode == "rtlsim":
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
        model = model.transform(HLSSynthIP())
        model = model.transform(PrepareRTLSim())
    else:
        raise Exception("Unknown exec_mode")

    # package input data as dictionary
    input_dict = {"inp": x}

    # reference result
    y = multithreshold(x, T)
    if act == DataType["BIPOLAR"]:
        # binary to bipolar
        y = 2 * y - 1
    else:
        # signed offset
        y += act.min()

    oshape = model.get_tensor_shape("outp")
    y_expected = y.reshape(oshape)
    # execute model
    y_produced = oxe.execute_onnx(model, input_dict)["outp"]

    y_produced = y_produced.reshape(y_expected.shape)

    # report the mode that actually ran instead of always blaming cppsim
    assert (y_produced == y_expected).all(), exec_mode + " failed"

    if exec_mode == "rtlsim":
        hls_synt_res_est = model.analysis(hls_synth_res_estimation)
        assert "Thresholding_Batch_0" in hls_synt_res_est

        # measured rtlsim cycles must be within 10 of the analytical estimate
        node = model.get_nodes_by_op_type("Thresholding_Batch")[0]
        inst = getCustomOp(node)
        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
        exp_cycles = exp_cycles_dict[node.name]
        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
        assert exp_cycles != 0
def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim(
    mem_mode, idt, wdt, act, nf, sf, mw, mh
):
    """Run a single FC layer in rtlsim and compare it with a NumPy reference.

    ``nf == -1`` selects ``mh`` and ``sf == -1`` selects ``mw`` as folding
    factors. With ``act is None`` the layer produces raw accumulators;
    otherwise random sorted thresholds are generated and applied. After the
    functional check, the HLS resource estimate must contain the node and the
    rtlsim cycle count must be within 15 of the analytical estimate.
    """
    nf = mh if nf == -1 else nf
    sf = mw if sf == -1 else sf
    pe = mh // nf
    simd = mw // sf
    assert mh % pe == 0
    assert mw % sf == 0

    both_bipolar = wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR

    # random weights first, then the random input vector
    W = gen_finn_dt_tensor(wdt, (mw, mh))
    x = gen_finn_dt_tensor(idt, (1, mw))

    if act is not None:
        odt = act
        acc_lo, acc_hi = calculate_signed_dot_prod_range(idt, wdt, mw)
        n_steps = act.get_num_possible_values() - 1
        T = np.random.randint(acc_lo, acc_hi - 1, (mh, n_steps)).astype(np.float32)
        # thresholds must be non-decreasing along each row
        T = np.sort(T, axis=1)
        if both_bipolar:
            tdt = DataType.UINT32
            # shift thresholds into the non-negative range
            T = np.ceil((T + mw) / 2)
            assert (T >= 0).all()
        else:
            tdt = DataType.INT32
    else:
        # no activation: the layer emits accumulators
        T = None
        tdt = None
        odt = DataType.UINT32 if both_bipolar else DataType.INT32

    model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
    for graph_node in model.graph.node:
        # look up the CustomOp wrapper and set the requested memory mode
        getCustomOp(graph_node).set_nodeattr("mem_mode", mem_mode)

    # prepare input data
    input_dict = prepare_inputs(x, idt, wdt)

    # reference computation
    if both_bipolar:
        # map bipolar {-1, +1} to binary {0, 1} and use xnorpopcountmatmul
        y = xp.xnorpopcountmatmul((x + 1) / 2, (W + 1) / 2)
    else:
        y = np.matmul(x, W)
    if T is not None:
        y = multithreshold(y, T)
        if act == DataType.BIPOLAR:
            # binary to bipolar
            y = 2 * y - 1
        else:
            # signed offset
            y += act.min()
    y_expected = y.reshape(model.get_tensor_shape("outp"))

    # TODO split up into several dependent tests -- need to check how this
    # works for parametrized tests...
    model = model.transform(SetExecMode("rtlsim"))
    model = model.transform(GiveUniqueNodeNames())
    model = model.transform(PrepareIP("xc7z020clg400-1", 5))
    model = model.transform(HLSSynthIP())
    model = model.transform(PrepareRTLSim())

    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
    outputs_match = y_produced.reshape(y_expected.shape) == y_expected
    assert outputs_match.all(), "rtlsim failed"

    hls_synt_res_est = model.analysis(hls_synth_res_estimation)
    assert "StreamingFCLayer_Batch_0" in hls_synt_res_est

    fc_node = model.get_nodes_by_op_type("StreamingFCLayer_Batch")[0]
    cycles_rtlsim = getCustomOp(fc_node).get_nodeattr("cycles_rtlsim")
    exp_cycles = model.analysis(exp_cycles_per_layer)[fc_node.name]
    assert np.isclose(exp_cycles, cycles_rtlsim, atol=15)
    assert exp_cycles != 0
def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
    """Run a single FC layer in cppsim and compare it with a NumPy reference.

    ``nf == -1`` selects ``mh`` and ``sf == -1`` selects ``mw`` as folding
    factors. With ``act is None`` the layer produces raw accumulators;
    otherwise random sorted thresholds are generated and applied to the
    reference result via ``multithreshold``.
    """
    nf = mh if nf == -1 else nf
    sf = mw if sf == -1 else sf
    pe = mh // nf
    simd = mw // sf
    assert mh % pe == 0
    assert mw % sf == 0

    both_bipolar = wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR

    # random weights first, then the random input vector
    W = gen_finn_dt_tensor(wdt, (mw, mh))
    x = gen_finn_dt_tensor(idt, (1, mw))

    if act is not None:
        odt = act
        acc_lo, acc_hi = calculate_signed_dot_prod_range(idt, wdt, mw)
        n_steps = act.get_num_possible_values() - 1
        T = np.random.randint(acc_lo, acc_hi - 1, (mh, n_steps)).astype(np.float32)
        # thresholds must be non-decreasing along each row
        T = np.sort(T, axis=1)
        if both_bipolar:
            tdt = DataType.UINT32
            # shift thresholds into the non-negative range
            T = np.ceil((T + mw) / 2)
            assert (T >= 0).all()
        else:
            tdt = DataType.INT32
    else:
        # no activation: the layer emits accumulators
        T = None
        tdt = None
        odt = DataType.UINT32 if both_bipolar else DataType.INT32

    model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
    for graph_node in model.graph.node:
        # look up the CustomOp wrapper and set the requested memory mode
        getCustomOp(graph_node).set_nodeattr("mem_mode", mem_mode)

    model = model.transform(SetExecMode("cppsim"))
    model = model.transform(PrepareCppSim())
    model = model.transform(CompileCppSim())

    # prepare input data
    input_dict = prepare_inputs(x, idt, wdt)

    # reference computation
    if both_bipolar:
        # map bipolar {-1, +1} to binary {0, 1} and use xnorpopcountmatmul
        y = xp.xnorpopcountmatmul((x + 1) / 2, (W + 1) / 2)
    else:
        y = np.matmul(x, W)
    if T is not None:
        y = multithreshold(y, T)
        if act == DataType.BIPOLAR:
            # binary to bipolar
            y = 2 * y - 1
        else:
            # signed offset
            y += act.min()
    y_expected = y.reshape(model.get_tensor_shape("outp"))

    # execute model and bring the result into the reference shape
    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
    y_produced = y_produced.reshape(y_expected.shape)

    assert (y_produced == y_expected).all(), "cppsim failed"
def test_fpgadataflow_vvau(
    idt, wdt, act, pe, dim_h, dim_w, k_h, k_w, channels, exec_mode
):
    """Compare a single VVAU node against a NumPy matmul reference.

    Args:
        idt: input datatype.
        wdt: weight datatype.
        act: activation datatype, or ``None`` for raw accumulator output.
        pe: parallelism; the string ``"channels"`` selects ``channels``.
        dim_h, dim_w: spatial dimensions of the input.
        k_h, k_w: kernel dimensions.
        channels: number of channels.
        exec_mode: ``"cppsim"`` or ``"rtlsim"``.

    The test is skipped when ``dim_w == 1`` with ``k_w != 1`` or when
    ``channels`` is not divisible by ``pe``.

    Raises:
        Exception: if ``exec_mode`` is neither ``"cppsim"`` nor ``"rtlsim"``.
    """
    if pe == "channels":
        pe = channels

    if dim_w == 1 and k_w != 1:
        pytest.skip("1D image requires 1D kernel, skipping.")

    if channels % pe != 0:
        pytest.skip("Requirement Channels divisable by PE is violated.")

    # Generate weights in expected shape for ONNX and HLS node
    W = gen_finn_dt_tensor(wdt, (channels, 1, k_h, k_w))  # shape: [channels, 1, k, k]
    W_onnx = _infer_sparse_weight_tensor(
        W, k_h, k_w, channels
    )  # shape: [k*k*channels, channels]

    # Generate inputs in expected format for ONNX and HLS node
    x = gen_finn_dt_tensor(idt, (1, dim_h, dim_w, k_h * k_w * channels))
    # regroup the last axis from (k*k, channels//pe, pe) to (channels//pe, k*k, pe)
    x_vvau = x.reshape(1, dim_h, dim_w, k_h * k_w, channels // pe, pe)
    x_vvau = x_vvau.transpose(0, 1, 2, 4, 3, 5)
    x_vvau = x_vvau.reshape(1, dim_h, dim_w, channels * k_h * k_w)

    if act is None:
        T = None
        tdt = None
        odt = DataType["INT32"]
    else:
        odt = act
        (min_v, max_v) = _calculate_dot_prod_range(idt, wdt, k_h * k_w * channels)
        n_steps = act.get_num_possible_values() - 1
        T = np.random.randint(min_v, max_v - 1, (channels, n_steps)).astype(np.float32)
        # thresholds must be non-decreasing along each row
        T = np.sort(T, axis=1)
        tdt = DataType["INT32"]

    model = _make_single_vvau_modelwrapper(
        W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt
    )

    if exec_mode == "cppsim":
        model = model.transform(SetExecMode("cppsim"))
        model = model.transform(PrepareCppSim())
        model = model.transform(CompileCppSim())
    elif exec_mode == "rtlsim":
        model = model.transform(SetExecMode("rtlsim"))
        model = model.transform(GiveUniqueNodeNames())
        model = model.transform(PrepareIP("xc7z020clg400-1", 5))
        model = model.transform(HLSSynthIP())
        model = model.transform(PrepareRTLSim())
    else:
        raise Exception("Unknown exec_mode in test_fpgadataflow_vvau")

    input_dict = prepare_inputs(x_vvau)

    # Calculate output
    y_expected = np.matmul(x, W_onnx)  # Y is in [N, H, W, C] format

    if T is not None:
        # Reshape Y, as multithreshold expects Y to be in [N, C, H, W] format
        y_expected = np.transpose(y_expected, (0, 3, 1, 2))
        y_expected = multithreshold(y_expected, T)
        y_expected = np.transpose(y_expected, (0, 2, 3, 1))
        # signed offset
        y_expected += act.min()

    y_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=False)[
        "outp"
    ]

    # report the mode that actually ran instead of always blaming cppsim
    assert (y_produced == y_expected).all(), exec_mode + " failed"

    if exec_mode == "rtlsim":
        # measured rtlsim cycles must be within 10 of the analytical estimate
        node = model.get_nodes_by_op_type("Vector_Vector_Activate_Batch")[0]
        inst = getCustomOp(node)
        cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
        exp_cycles_dict = model.analysis(exp_cycles_per_layer)
        exp_cycles = exp_cycles_dict[node.name]
        assert np.isclose(exp_cycles, cycles_rtlsim, atol=10)
        assert exp_cycles != 0
def test_multithreshold():
    """Check ``multithreshold`` against fixed data and an elementwise version.

    First a hand-written (6, 3, 2, 2) input with a (3, 7) threshold matrix is
    compared against its known output, with and without the scale/bias
    arguments (2.0, -1.0). Then a seeded random (1, 256, 64, 64) input is run
    through both ``multithreshold`` and ``multithreshold_elementwise`` and the
    results must agree.

    Returns:
        tuple: ``(vector_runtime, non_vector_runtime)`` -- elapsed seconds of
        the ``multithreshold`` and ``multithreshold_elementwise`` calls on the
        random input.
    """
    inputs = np.array(
        [
            4.8, 3.2, 1.2, 4.9, 7.8, 2.4, 3.1, 4.7, 6.2, 5.1, 4.9, 2.2,
            6.2, 0.0, 0.8, 4.7, 0.2, 5.6, 8.9, 9.2, 9.1, 4.0, 3.3, 4.9,
            2.3, 1.7, 1.3, 2.2, 4.6, 3.4, 3.7, 9.8, 4.7, 4.9, 2.8, 2.7,
            8.3, 6.7, 4.2, 7.1, 2.8, 3.1, 0.8, 0.6, 4.4, 2.7, 6.3, 6.1,
            1.4, 5.3, 2.3, 1.9, 4.7, 8.1, 9.3, 3.7, 2.7, 5.1, 4.2, 1.8,
            4.1, 7.3, 7.1, 0.4, 0.2, 1.3, 4.3, 8.9, 1.4, 1.6, 8.3, 9.4,
        ]
    ).reshape((6, 3, 2, 2))

    # one row of 7 thresholds per channel
    thresholds = np.array(
        [
            0.8, 1.4, 1.7, 3.5, 5.2, 6.8, 8.2,
            0.2, 2.2, 3.5, 4.5, 6.6, 8.6, 9.2,
            1.3, 4.1, 4.5, 6.5, 7.8, 8.1, 8.9,
        ]
    ).reshape((3, 7))

    outputs = np.array(
        [
            4.0, 3.0, 1.0, 4.0, 5.0, 2.0, 2.0, 4.0, 3.0, 3.0, 3.0, 1.0,
            5.0, 0.0, 1.0, 4.0, 1.0, 4.0, 6.0, 7.0, 7.0, 1.0, 1.0, 3.0,
            3.0, 3.0, 1.0, 3.0, 4.0, 2.0, 3.0, 7.0, 3.0, 3.0, 1.0, 1.0,
            7.0, 5.0, 4.0, 6.0, 2.0, 2.0, 1.0, 1.0, 2.0, 1.0, 3.0, 3.0,
            2.0, 5.0, 3.0, 3.0, 4.0, 5.0, 7.0, 3.0, 1.0, 3.0, 2.0, 1.0,
            4.0, 6.0, 6.0, 0.0, 1.0, 1.0, 3.0, 6.0, 1.0, 1.0, 6.0, 7.0,
        ]
    ).reshape((6, 3, 2, 2))

    results = multithreshold(inputs, thresholds)
    assert (results == outputs).all()

    results_scaled = multithreshold(inputs, thresholds, 2.0, -1.0)
    outputs_scaled = 2.0 * outputs - 1.0
    assert (results_scaled == outputs_scaled).all()

    # performance and random test
    np.random.seed(0)
    inputs = np.random.random((1, 256, 64, 64))
    thresholds = (np.array([[1, 2, 3, 4, 5, 6]]) - 0.5) / 6

    # perf_counter is monotonic, so the measured intervals cannot be skewed
    # by wall-clock adjustments
    before = time.perf_counter()
    vec_results = multithreshold(inputs, thresholds)
    after = time.perf_counter()
    vector_runtime = after - before

    before = time.perf_counter()
    nonvec_results = multithreshold_elementwise(inputs, thresholds)
    after = time.perf_counter()
    non_vector_runtime = after - before

    assert (vec_results == nonvec_results).all()

    return vector_runtime, non_vector_runtime