def verify(test_vec, mock_enc_w):
    ethosu_conv2d_calls = extract_ethosu_conv2d_extern_calls(test_vec["tir_module"])
    buffer_info = tirtocs.extract_buffer_info(test_vec["tir_module"], test_vec["param_dict"])
    for ethosu_conv2d_call in ethosu_conv2d_calls:
        npu_op, w_zero_point = tirtocs.translate_ethosu_conv2d(ethosu_conv2d_call)
        weights = buffer_info[npu_op.weights[0].address.buffer_var][0]

        assert mock_enc_w.call_args[1]["accelerator"] == accel
        assert (
            mock_enc_w.call_args[1]["weights_volume"].flatten()
            == weights.astype(np.int64) - w_zero_point
        ).all()
        assert mock_enc_w.call_args[1]["dilation_xy"] == (
            npu_op.kernel.dilation_x,
            npu_op.kernel.dilation_y,
        )
        assert mock_enc_w.call_args[1]["ifm_bitdepth"] == npu_op.ifm.data_type.size_in_bits()
        assert mock_enc_w.call_args[1]["block_traversal"] == test_vec["block_traversal"]
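
# A minimal sketch (not part of the original tests; names are illustrative)
# of the unittest.mock mechanism verify() relies on: after a call to a
# patched function, `call_args[1]` holds the keyword arguments of the most
# recent call, which is exactly what the assertions above inspect.
def _example_call_args_capture():
    from unittest.mock import MagicMock

    mock_fn = MagicMock()
    mock_fn(ifm_bitdepth=8, block_traversal="DEPTH_FIRST")  # illustrative kwargs
    assert mock_fn.call_args[1]["ifm_bitdepth"] == 8
    assert mock_fn.call_args[1]["block_traversal"] == "DEPTH_FIRST"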
def create_mock(test_vec):
    with patch("ethosu.vela.api.npu_encode_weights") as mock_enc_w:
        with patch("ethosu.vela.api.npu_find_block_configs") as mock_blk_cfg:
            mock_blk_cfg.return_value = [vapi.NpuShape3D(8, 8, 8)]
            ethosu_conv2d_calls = extract_ethosu_conv2d_extern_calls(test_vec["tir_module"])
            buffer_info = tirtocs.extract_buffer_info(
                test_vec["tir_module"], test_vec["param_dict"]
            )
            for ethosu_conv2d_call in ethosu_conv2d_calls:
                npu_op, _ = tirtocs.translate_ethosu_conv2d(ethosu_conv2d_call)
                weights = buffer_info[npu_op.weights[0].address.buffer_var][0]
                vela_api.encode_weights(ethosu_conv2d_call, weights, accel)
            return mock_enc_w
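
# A minimal self-contained sketch (stdlib targets, not the vela API) of the
# nested `patch` pattern used by create_mock(): patching replaces the
# attribute at its import location for the duration of the `with` blocks,
# the mock records every call made inside, and the original is restored on
# exit, so the returned mock can be inspected afterwards.
def _example_nested_patch():
    import os
    from unittest.mock import patch

    with patch("os.path.exists") as mock_exists:
        with patch("os.path.getsize") as mock_getsize:
            mock_getsize.return_value = 16  # stub a fixed return value
            os.path.exists("/tmp/anything")  # recorded by mock_exists
            assert mock_exists.call_count == 1
            assert os.path.getsize("/tmp/anything") == 16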
def test_assign_addresses():
    test_cases = [
        {
            # Stimulus
            "tir_module": WeightStreamOnly,
            "param_dict": {
                2: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [144], "uint8"),
                3: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [20], "uint8"),
                4: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [144], "uint8"),
                5: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [20], "uint8"),
                6: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [144], "uint8"),
                7: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [20], "uint8"),
                8: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [144], "uint8"),
                9: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [20], "uint8"),
            },
        },
        {
            # Stimulus
            "tir_module": MixedRead,
            "param_dict": {
                1: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [592], "uint8"),
                3: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [160], "uint8"),
                4: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [80], "uint8"),
                5: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [20], "uint8"),
                6: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [80], "uint8"),
                7: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [20], "uint8"),
                8: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [80], "uint8"),
                9: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [20], "uint8"),
                10: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [80], "uint8"),
                11: np.random.randint(np.iinfo("uint8").min, np.iinfo("uint8").max, [20], "uint8"),
            },
        },
    ]

    def extract_extern_calls(mod):
        """Obtain all ethosu_conv2d calls from an NPU TIR module.

        Parameters
        ----------
        mod : tvm.IRModule
            An NPU TIR module.

        Returns
        -------
        list of tvm.tir.Call
            The tir extern calls for ethosu_conv2d.
        """
        # There should only be a single function
        assert len(mod.functions.items()) == 1
        primfunc = mod.functions.items()[0][1]
        extern_calls = list()

        def populate_extern_calls(stmt):
            if isinstance(stmt, tvm.tir.Call) and stmt.op.name == "tir.call_extern":
                extern_calls.append(stmt)

        stmt_functor.post_order_visit(primfunc.body, populate_extern_calls)
        return extern_calls

    def collect_tir_buffer_info(npu_ops):
        """This is run prior to address assignment to collect tir buffer information
        for verification later on."""
        _npu_op_tir_buffers = dict()
        for npu_op in npu_ops:
            if isinstance(npu_op, vapi.NpuDmaOperation):
                _npu_op_tir_buffers[npu_op] = (npu_op.src.address, npu_op.dest.address)
            elif issubclass(type(npu_op), vapi.NpuBlockOperation):
                _npu_op_tir_buffers[npu_op] = (
                    npu_op.ifm.tiles.addresses[0],
                    npu_op.ofm.tiles.addresses[0],
                    npu_op.weights,
                    npu_op.biases,
                )
        return _npu_op_tir_buffers

    def _check_buffer(address, region, length, buffer_var):
        """Check whether the buffer information is consistent with the original tir buffers.

        - If it is constant, check that the slice in the constant tensor has the values.
        - If it is scratch, check that the slice is within scratch and does not
          conflict with other scratch tensors.
        - If it is input/output, check that the address is zero.
        """
        inverse_region_map = {
            0: tir_to_cs_translator.BufferType.constant,
            1: tir_to_cs_translator.BufferType.scratch,
            3: tir_to_cs_translator.BufferType.input,
            4: tir_to_cs_translator.BufferType.output,
        }
        buffer_type = inverse_region_map[region]
        if buffer_type == tir_to_cs_translator.BufferType.constant:
            ref = buffer_info[buffer_var].values
            assert (constant_tensor[address : address + length] == ref).all()
            # Every buffer is adjusted to align to 16 bytes
            length = util.round_up(length, 16)
            # Mark these constants as read at least once
            constant_tensor_read_mask[address : address + length] = np.ones(length, dtype="uint8")
        elif buffer_type == tir_to_cs_translator.BufferType.scratch:
            shape = list(buffer_info[buffer_var].shape)
            assert length == np.prod(shape)
            assert address < scratch_size
            # Every buffer is adjusted to align to 16 bytes
            length = util.round_up(length, 16)
            assert address + length <= scratch_size
            # The scratch area should not be used by any other buffer
            assert not scratch_allocation_mask[address : address + length].any()
            # Mark the scratch area as used
            scratch_allocation_mask[address : address + length] = np.ones(length, dtype="uint8")
        elif buffer_type == tir_to_cs_translator.BufferType.input:
            assert address == 0
        else:
            assert buffer_type == tir_to_cs_translator.BufferType.output
            assert address == 0

    def verify(npu_ops):
        """This wrapper verifies that the allocated addresses match the original tir buffers."""
        checked_buffers = set()

        def check_buffer(address, region, length, buffer_var):
            if buffer_var not in checked_buffers:
                _check_buffer(address, region, length, buffer_var)
                checked_buffers.add(buffer_var)

        for npu_op in npu_ops:
            if isinstance(npu_op, vapi.NpuDmaOperation):
                src_tir_buffer_var = npu_op_tir_buffers[npu_op][0].buffer_var
                check_buffer(
                    npu_op.src.address, npu_op.src.region, npu_op.src.length, src_tir_buffer_var
                )
                dest_tir_load = npu_op_tir_buffers[npu_op][1].buffer_var
                check_buffer(
                    npu_op.dest.address,
                    npu_op.dest.region,
                    npu_op.dest.length,
                    dest_tir_load,
                )
            elif issubclass(type(npu_op), vapi.NpuBlockOperation):
                ifm_tir_buffer_var = npu_op_tir_buffers[npu_op][0].buffer_var
                ifm_length = (
                    npu_op.ifm.shape.height * npu_op.ifm.shape.width * npu_op.ifm.shape.depth
                )
                check_buffer(
                    npu_op.ifm.tiles.addresses[0],
                    npu_op.ifm.region,
                    ifm_length,
                    ifm_tir_buffer_var,
                )
                ofm_tir_buffer_var = npu_op_tir_buffers[npu_op][1].buffer_var
                ofm_length = (
                    npu_op.ofm.shape.height * npu_op.ofm.shape.width * npu_op.ofm.shape.depth
                )
                check_buffer(
                    npu_op.ofm.tiles.addresses[0],
                    npu_op.ofm.region,
                    ofm_length,
                    ofm_tir_buffer_var,
                )
                for idx, weight in enumerate(npu_op_tir_buffers[npu_op][2]):
                    assert isinstance(weight, vapi.NpuAddressRange)
                    check_buffer(
                        npu_op.weights[idx].address,
                        npu_op.weights[idx].region,
                        npu_op.weights[idx].length,
                        weight.address.buffer_var,
                    )
                for idx, bias in enumerate(npu_op_tir_buffers[npu_op][3]):
                    assert isinstance(bias, vapi.NpuAddressRange)
                    check_buffer(
                        npu_op.biases[idx].address,
                        npu_op.biases[idx].region,
                        npu_op.biases[idx].length,
                        bias.address.buffer_var,
                    )

    for test_case in test_cases:
        buffer_info = tir_to_cs_translator.extract_buffer_info(
            test_case["tir_module"], test_case["param_dict"]
        )
        extern_calls = extract_extern_calls(test_case["tir_module"])
        _npu_ops = list()
        for extern_call in extern_calls:
            _npu_ops.append(tir_to_cs_translator.translate_ethosu_tir_extern_call(extern_call))
        npu_op_tir_buffers = collect_tir_buffer_info(_npu_ops)
        _npu_ops, constant_tensor, scratch_size = tir_to_cs_translator.assign_addresses(
            buffer_info, _npu_ops
        )
        scratch_allocation_mask = np.zeros(scratch_size, dtype="uint8")
        constant_tensor_read_mask = np.zeros(constant_tensor.size, dtype="uint8")
        verify(_npu_ops)
        # This will only be 1 if all allocated scratch is used.
        assert np.prod(scratch_allocation_mask) == 1
        # This will only be 1 if every constant tensor is read at least once.
        assert np.prod(constant_tensor_read_mask) == 1
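
# The scratch/constant coverage checks in test_assign_addresses() use byte
# masks: each verified range is rounded up to 16-byte alignment and marked,
# and `np.prod(mask) == 1` holds only if every byte was marked by some
# range. A minimal sketch of that idea (a local _round_up stands in for
# util.round_up; the ranges are made up for illustration):
def _example_mask_coverage():
    import numpy as np

    def _round_up(value, multiple):
        return ((value + multiple - 1) // multiple) * multiple

    scratch_size = 32
    mask = np.zeros(scratch_size, dtype="uint8")
    for address, length in [(0, 16), (16, 10)]:  # second range pads to 16
        length = _round_up(length, 16)
        assert not mask[address : address + length].any()  # no overlaps
        mask[address : address + length] = 1
    assert np.prod(mask) == 1  # every scratch byte was covered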
def test_buffer_info_extraction():
    test_cases = [
        {
            # Stimulus
            "tir_module": SingleEthosUConv2D,
            "param_dict": {
                1: np.random.randint(
                    np.iinfo("uint8").min, np.iinfo("uint8").max, [1, 1, 3, 16], "uint8"
                ),
                2: np.random.randint(np.iinfo("int32").min, np.iinfo("int32").max, [16], "int32"),
            },
            # Reference Outputs
            "constants": {
                "placeholder_4": 1,
                "placeholder_5": 2,
            },
            "data_buffers": {
                "placeholder_3": (
                    [1, 8, 8, 3],
                    "uint8",
                    tir_to_cs_translator.BufferType.input_or_output,
                ),
                "ethosu_conv2d_1": (
                    [1, 8, 8, 16],
                    "uint8",
                    tir_to_cs_translator.BufferType.input_or_output,
                ),
            },
        },
        {
            "tir_module": MultiEthosUConv2D,
            "param_dict": {
                1: np.random.randint(
                    np.iinfo("uint8").min, np.iinfo("uint8").max, [1, 1, 3, 32], "uint8"
                ),
                2: np.random.randint(np.iinfo("int32").min, np.iinfo("int32").max, [32], "int32"),
                3: np.random.randint(
                    np.iinfo("uint8").min, np.iinfo("uint8").max, [1, 1, 32, 8], "uint8"
                ),
                4: np.random.randint(np.iinfo("int32").min, np.iinfo("int32").max, [8], "int32"),
            },
            # Reference Outputs
            "constants": {
                "placeholder_5": 4,
                "placeholder_7": 1,
                "placeholder_8": 2,
                "placeholder_9": 3,
            },
            "data_buffers": {
                "placeholder_6": (
                    [1, 8, 8, 3],
                    "uint8",
                    tir_to_cs_translator.BufferType.input_or_output,
                ),
                "ethosu_conv2d_1": (
                    [1, 8, 8, 8],
                    "uint8",
                    tir_to_cs_translator.BufferType.input_or_output,
                ),
                "ethosu_conv2d_2": ([1024], "uint8", tir_to_cs_translator.BufferType.scratch),
                "ethosu_conv2d_3": ([2048], "uint8", tir_to_cs_translator.BufferType.scratch),
            },
        },
    ]
    for test_case in test_cases:
        buffer_info = tir_to_cs_translator.extract_buffer_info(
            test_case["tir_module"], test_case["param_dict"]
        )
        for buffer_var, info in buffer_info.items():
            buffer_name = buffer_var.name
            if buffer_name in test_case["constants"].keys():
                assert (
                    info.values == test_case["param_dict"][test_case["constants"][buffer_name]]
                ).all()
                assert (
                    info.dtype == test_case["param_dict"][test_case["constants"][buffer_name]].dtype
                )
                assert info.btype == tir_to_cs_translator.BufferType.constant
            else:
                assert list(info.shape) == test_case["data_buffers"][buffer_name][0]
                assert info.dtype == test_case["data_buffers"][buffer_name][1]
                assert info.btype == test_case["data_buffers"][buffer_name][2]
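
# The stimulus dictionaries above generate random tensors bounded by the
# dtype's representable range. A minimal sketch of that pattern; note that
# np.random.randint's upper bound is exclusive, so np.iinfo("uint8").max
# as used above yields values in [0, 254].
def _example_random_stimulus():
    import numpy as np

    low, high = np.iinfo("uint8").min, np.iinfo("uint8").max
    data = np.random.randint(low, high, [1, 1, 3, 16], "uint8")
    assert data.dtype == np.uint8 and data.shape == (1, 1, 3, 16)
    assert data.min() >= low and data.max() < high  # high is exclusive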