def test_translate_ethosu_depthwise_conv2d(): def extract_ethosu_depthwise_conv2d_extern_call(mod): # There should only be a single function assert len(mod.functions.items()) == 1 primfunc = mod.functions.items()[0][1] ethosu_depthwise_conv2d_calls = list() def populate_ethosu_depthwise_conv2d_calls(stmt): if ( isinstance(stmt, tvm.tir.Call) and stmt.op.name == "tir.call_extern" and stmt.args[0] == "ethosu_depthwise_conv2d" ): ethosu_depthwise_conv2d_calls.append(stmt) stmt_functor.post_order_visit(primfunc.body, populate_ethosu_depthwise_conv2d_calls) return ethosu_depthwise_conv2d_calls[0] depthwise_conv2d_call = extract_ethosu_depthwise_conv2d_extern_call(SingleEthosuDepthwiseConv2D) npu_op, w_zero_point = tir_to_cs_translator.translate_ethosu_depthwise_conv2d( depthwise_conv2d_call ) assert npu_op.ifm.data_type == vapi.NpuDataType.INT8 assert npu_op.ifm.shape == vapi.NpuShape3D(8, 8, 3) assert npu_op.ifm.tiles.height_0 == vapi.NpuTileBox(8, 0, 8, [0, 0, 0, 0]).height_0 assert npu_op.ifm.tiles.height_1 == vapi.NpuTileBox(8, 0, 8, [0, 0, 0, 0]).height_1 assert npu_op.ifm.tiles.width_0 == vapi.NpuTileBox(8, 0, 8, [0, 0, 0, 0]).width_0 assert npu_op.ifm.quantization == pytest.approx(vapi.NpuQuantization(0.6, 11)) assert npu_op.ifm.layout == vapi.NpuLayout.NHWC assert npu_op.ifm.strides == vapi.NpuShape3D(24, 3, 1) # Compare OFM assert npu_op.ofm.data_type == vapi.NpuDataType.INT8 assert npu_op.ofm.shape == vapi.NpuShape3D(6, 7, 3) assert npu_op.ofm.tiles.height_0 == vapi.NpuTileBox(6, 0, 8, [0, 0, 0, 0]).height_0 assert npu_op.ofm.tiles.height_1 == vapi.NpuTileBox(6, 0, 7, [0, 0, 0, 0]).height_1 assert npu_op.ofm.tiles.width_0 == vapi.NpuTileBox(6, 0, 7, [0, 0, 0, 0]).width_0 assert npu_op.ofm.quantization == pytest.approx(vapi.NpuQuantization(0.26, 15)) assert npu_op.ofm.layout == vapi.NpuLayout.NHWC assert npu_op.ofm.strides == vapi.NpuShape3D(21, 3, 1) # Compare kernel and padding assert ( npu_op.kernel.__dict__ == vapi.NpuKernel(w=2, h=3, stride_x=1, stride_y=1, dilation_x=1, dilation_y=1).__dict__ ) assert npu_op.padding == vapi.NpuPadding(top=0, left=0, bottom=0, right=0) # Compare activation assert npu_op.activation.op_type == vapi.NpuActivationOp.NONE_OR_RELU assert npu_op.activation.min == 0 assert npu_op.activation.max == pytest.approx(23.4) # Compare ifm upscaling assert npu_op.ifm_upscale == vapi.NpuResamplingMode.NONE # Compare weight quantization parameters assert w_zero_point == 13
def _create_npu_feature_map( serial_feature_map: spec.SerialFeatureMap) -> vapi.NpuFeatureMap: """This is a helper function to capture a list of arguments to create Vela NpuFeatureMap object. """ layout_map = { "NHWC": vapi.NpuLayout.NHWC, "NHCWB16": vapi.NpuLayout.NHCWB16 } datatype_map = { "uint8": vapi.NpuDataType.UINT8, "int8": vapi.NpuDataType.INT8, "uint16": vapi.NpuDataType.UINT16, "int16": vapi.NpuDataType.INT16, "int32": vapi.NpuDataType.INT32, } layout = str(serial_feature_map.layout.value) data_type = str(serial_feature_map.data_type.value) date_type_bytes = np.iinfo(np.dtype(data_type)).bits // 8 assert layout in layout_map.keys() assert data_type in datatype_map.keys() nfm = vapi.NpuFeatureMap() nfm.data_type = datatype_map[data_type] nfm.shape = vapi.NpuShape3D( int(serial_feature_map.height), int(serial_feature_map.width), int(serial_feature_map.channels), ) nfm.tiles = vapi.NpuTileBox( int(serial_feature_map.tile_height_0), int(serial_feature_map.tile_height_1), int(serial_feature_map.tile_width_0), [ serial_feature_map.tile_address_0, serial_feature_map.tile_address_1, serial_feature_map.tile_address_2, serial_feature_map.tile_address_3, ], ) nfm.quantization = _create_npu_quantization(serial_feature_map.scale, serial_feature_map.zero_point) nfm.layout = layout_map[layout] nfm.strides = vapi.NpuShape3D( int(serial_feature_map.stride_h.value) * date_type_bytes, int(serial_feature_map.stride_w.value) * date_type_bytes, int(serial_feature_map.stride_c.value) * date_type_bytes, ) return nfm
def get_optimal_block_config( npu_op: vapi.NpuOperation, accel_config: vapi.NpuAccelerator) -> vapi.NpuShape3D: """ "The NPU's unit of work is known as a block. It will fetch block(s) from Input Feature Map (IFM) and a compute block for Output Feature Map (OFM). Therefore, we need to pick an optimal block configuration considering bandwidth to bring IFM blocks and the number of OFM block computes need to happen to cover the OFM as indicated by the npu op. Parameters ---------- npu_op : ethosu.vela.api.NpuOperation The NPU operation and its params accel_config : ethosu.vela.api.NpuAccelerator The NPU accelerator config Returns ------- ethosu.vela.api.NpuShape3D : The optimal block config for the operator """ options = tvm.transform.PassContext.current().config.get( "relay.ext.ethos-u.options", None) if options and options.dev_force_block_config: block_config = [ int(v) for v in options.dev_force_block_config.split("x") ] return vapi.NpuShape3D(height=block_config[0], width=block_config[1], depth=block_config[2]) all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_config) return _get_optimal_block_config(all_valid_block_configs)
def _create_npu_block_config(serial_block_config: spec.SerialBlockConfig) -> vapi.NpuShape3D: """A helper function to convert a SerialBlockConfig into an NpuShape3D""" if serial_block_config.height * serial_block_config.width * serial_block_config.depth == 0: return None block_config = vapi.NpuShape3D( height=int(serial_block_config.height), width=int(serial_block_config.width), depth=int(serial_block_config.depth), ) return block_config
def create_mock(test_vec): with patch("ethosu.vela.api.npu_encode_weights") as mock_enc_w: with patch("ethosu.vela.api.npu_find_block_configs") as mock_blk_cfg: mock_blk_cfg.return_value = [vapi.NpuShape3D(8, 8, 8)] ethosu_conv2d_calls = extract_ethosu_conv2d_extern_calls(test_vec["tir_module"]) buffer_info = tirtocs.extract_buffer_info( test_vec["tir_module"], test_vec["param_dict"] ) for ethosu_conv2d_call in ethosu_conv2d_calls: npu_op, _ = tirtocs.translate_ethosu_conv2d(ethosu_conv2d_call) weights = buffer_info[npu_op.weights[0].address.buffer_var][0] vela_api.encode_weights(ethosu_conv2d_call, weights, accel) return mock_enc_w
def test_get_optimal_block_config(): block_configs_cases = [ { "test": [ vapi.NpuShape3D(10, 20, 8), vapi.NpuShape3D(10, 30, 16), vapi.NpuShape3D(10, 40, 32), ], "ref": vapi.NpuShape3D(10, 40, 32), }, { "test": [ vapi.NpuShape3D(10, 20, 8), vapi.NpuShape3D(10, 50, 32), vapi.NpuShape3D(10, 40, 32), ], "ref": vapi.NpuShape3D(10, 50, 32), }, { "test": [ vapi.NpuShape3D(50, 50, 8), vapi.NpuShape3D(10, 30, 32), vapi.NpuShape3D(8, 8, 64), ], "ref": vapi.NpuShape3D(8, 8, 64), }, ] for test_case in block_configs_cases: assert vela_api._get_optimal_block_config(test_case["test"]) == test_case["ref"]
def test_translate_ethosu_conv2d(): test_cases = [ { # Stimulus "tir_module": SingleEthosUConv2D, "param_dict": { 1: np.random.randint( np.iinfo("uint8").min, np.iinfo("uint8").max, [1, 1, 3, 16], "uint8" ), 2: np.random.randint(np.iinfo("int32").min, np.iinfo("int32").max, [16], "int32"), }, # Reference outputs "ref": [ { "ifm": { "data_type": vapi.NpuDataType.UINT8, "shape": vapi.NpuShape3D(8, 8, 3), "tiles": vapi.NpuTileBox(8, 0, 8, [0, 0, 0, 0]), "quantization": vapi.NpuQuantization(0.5, 10), "layout": vapi.NpuLayout.NHWC, "strides": vapi.NpuShape3D(24, 3, 1), }, "ofm": { "data_type": vapi.NpuDataType.UINT8, "shape": vapi.NpuShape3D(8, 8, 16), "tiles": vapi.NpuTileBox(8, 0, 8, [0, 0, 0, 0]), "quantization": vapi.NpuQuantization(0.25, 14), "layout": vapi.NpuLayout.NHWC, "strides": vapi.NpuShape3D(128, 16, 1), }, "kernel": vapi.NpuKernel( w=1, h=1, stride_x=1, stride_y=1, dilation_x=1, dilation_y=1 ), "padding": vapi.NpuPadding(top=0, left=0, bottom=0, right=0), "activation": { "op": vapi.NpuActivationOp.NONE_OR_RELU, "min": -3.5, "max": 60.25, }, "ifm_upscale": vapi.NpuResamplingMode.NONE, "w_zero_point": 12, } ], }, { "tir_module": MultiEthosUConv2D, "param_dict": { 1: np.random.randint( np.iinfo("uint8").min, np.iinfo("uint8").max, [1, 1, 3, 32], "uint8" ), 2: np.random.randint(np.iinfo("int32").min, np.iinfo("int32").max, [32], "int32"), 3: np.random.randint( np.iinfo("uint8").min, np.iinfo("uint8").max, [1, 1, 32, 8], "uint8" ), 4: np.random.randint(np.iinfo("int32").min, np.iinfo("int32").max, [8], "int32"), }, # Reference Outputs "ref": [ { "ifm": { "data_type": vapi.NpuDataType.UINT8, "shape": vapi.NpuShape3D(4, 8, 3), "tiles": vapi.NpuTileBox(4, 0, 8, [0, 0, 0, 0]), "quantization": vapi.NpuQuantization(0.5, 10), "layout": vapi.NpuLayout.NHWC, "strides": vapi.NpuShape3D(24, 3, 1), }, "ofm": { "data_type": vapi.NpuDataType.UINT8, "shape": vapi.NpuShape3D(4, 8, 32), "tiles": vapi.NpuTileBox(4, 0, 8, [0, 0, 0, 0]), "quantization": vapi.NpuQuantization(0.25, 14), "layout": vapi.NpuLayout.NHWC, "strides": vapi.NpuShape3D(256, 32, 1), }, "kernel": vapi.NpuKernel( w=1, h=1, stride_x=1, stride_y=1, dilation_x=1, dilation_y=1 ), "padding": vapi.NpuPadding(top=0, left=0, bottom=0, right=0), "activation": {"op": None}, "ifm_upscale": vapi.NpuResamplingMode.NONE, "w_zero_point": 12, }, { "ifm": { "data_type": vapi.NpuDataType.UINT8, "shape": vapi.NpuShape3D(4, 8, 32), "tiles": vapi.NpuTileBox(4, 0, 8, [0, 0, 0, 0]), "quantization": vapi.NpuQuantization(0.5, 10), "layout": vapi.NpuLayout.NHWC, "strides": vapi.NpuShape3D(256, 32, 1), }, "ofm": { "data_type": vapi.NpuDataType.UINT8, "shape": vapi.NpuShape3D(4, 8, 8), "tiles": vapi.NpuTileBox(4, 0, 8, [0, 0, 0, 0]), "quantization": vapi.NpuQuantization(0.25, 14), "layout": vapi.NpuLayout.NHWC, "strides": vapi.NpuShape3D(64, 8, 1), }, "kernel": vapi.NpuKernel( w=1, h=1, stride_x=1, stride_y=1, dilation_x=1, dilation_y=1 ), "padding": vapi.NpuPadding(top=0, left=0, bottom=0, right=0), "activation": { "op": vapi.NpuActivationOp.NONE_OR_RELU, "min": -3.5, "max": 60.25, }, "ifm_upscale": vapi.NpuResamplingMode.NONE, "w_zero_point": 12, }, { "ifm": { "data_type": vapi.NpuDataType.UINT8, "shape": vapi.NpuShape3D(4, 8, 3), "tiles": vapi.NpuTileBox(4, 0, 8, [0, 0, 0, 0]), "quantization": vapi.NpuQuantization(0.5, 10), "layout": vapi.NpuLayout.NHWC, "strides": vapi.NpuShape3D(24, 3, 1), }, "ofm": { "data_type": vapi.NpuDataType.UINT8, "shape": vapi.NpuShape3D(4, 8, 32), "tiles": vapi.NpuTileBox(4, 0, 8, [0, 0, 0, 0]), "quantization": vapi.NpuQuantization(0.25, 14), "layout": vapi.NpuLayout.NHWC, "strides": vapi.NpuShape3D(256, 32, 1), }, "kernel": vapi.NpuKernel( w=1, h=1, stride_x=1, stride_y=1, dilation_x=1, dilation_y=1 ), "padding": vapi.NpuPadding(top=0, left=0, bottom=0, right=0), "activation": { "op": vapi.NpuActivationOp.NONE_OR_RELU, "min": -3.5, "max": 60.25, }, "ifm_upscale": vapi.NpuResamplingMode.NONE, "w_zero_point": 12, }, { "ifm": { "data_type": vapi.NpuDataType.UINT8, "shape": vapi.NpuShape3D(4, 8, 32), "tiles": vapi.NpuTileBox(4, 0, 8, [0, 0, 0, 0]), "quantization": vapi.NpuQuantization(0.5, 10), "layout": vapi.NpuLayout.NHWC, "strides": vapi.NpuShape3D(256, 32, 1), }, "ofm": { "data_type": vapi.NpuDataType.UINT8, "shape": vapi.NpuShape3D(4, 8, 8), "tiles": vapi.NpuTileBox(4, 0, 8, [0, 0, 0, 0]), "quantization": vapi.NpuQuantization(0.25, 14), "layout": vapi.NpuLayout.NHWC, "strides": vapi.NpuShape3D(64, 8, 1), }, "kernel": vapi.NpuKernel( w=1, h=1, stride_x=1, stride_y=1, dilation_x=1, dilation_y=1 ), "padding": vapi.NpuPadding(top=0, left=0, bottom=0, right=0), "activation": { "op": vapi.NpuActivationOp.NONE_OR_RELU, "min": -3.5, "max": 60.25, }, "ifm_upscale": vapi.NpuResamplingMode.NONE, "w_zero_point": 12, }, ], }, ] def extract_ethosu_conv2d_extern_calls(mod): """This function will obtain all ethosu_conv2d calls from a NPU TIR module Parameters ---------- mod : tvm.IRModule This is a NPU TIR Module Returns ------- list of tvm.tir.Call objects that are tir extern calls for ethosu_conv2d """ # There should only be a single function assert len(mod.functions.items()) == 1 primfunc = mod.functions.items()[0][1] ethosu_conv2d_calls = list() def populate_ethosu_conv2d_calls(stmt): if ( isinstance(stmt, tvm.tir.Call) and stmt.op.name == "tir.call_extern" and stmt.args[0] == "ethosu_conv2d" ): ethosu_conv2d_calls.append(stmt) stmt_functor.post_order_visit(primfunc.body, populate_ethosu_conv2d_calls) return ethosu_conv2d_calls for test_case in test_cases: ethosu_conv2d_calls = extract_ethosu_conv2d_extern_calls(test_case["tir_module"]) for idx, ethosu_conv2d_call in enumerate(ethosu_conv2d_calls): ref = test_case["ref"][idx] npu_op, w_zero_point = tir_to_cs_translator.translate_ethosu_conv2d(ethosu_conv2d_call) # Compare IFM assert npu_op.ifm.data_type == ref["ifm"]["data_type"] assert npu_op.ifm.shape == ref["ifm"]["shape"] assert npu_op.ifm.tiles.height_0 == ref["ifm"]["tiles"].height_0 assert npu_op.ifm.tiles.height_1 == ref["ifm"]["tiles"].height_1 assert npu_op.ifm.tiles.width_0 == ref["ifm"]["tiles"].width_0 assert npu_op.ifm.quantization == ref["ifm"]["quantization"] assert npu_op.ifm.layout == ref["ifm"]["layout"] assert npu_op.ifm.strides == ref["ifm"]["strides"] # Compare OFM assert npu_op.ofm.data_type == ref["ofm"]["data_type"] assert npu_op.ofm.shape == ref["ofm"]["shape"] assert npu_op.ofm.tiles.height_0 == ref["ofm"]["tiles"].height_0 assert npu_op.ofm.tiles.height_1 == ref["ofm"]["tiles"].height_1 assert npu_op.ofm.tiles.width_0 == ref["ofm"]["tiles"].width_0 assert npu_op.ofm.quantization == ref["ofm"]["quantization"] assert npu_op.ofm.layout == ref["ofm"]["layout"] assert npu_op.ofm.strides == ref["ofm"]["strides"] # Compare kernel and padding assert npu_op.kernel.__dict__ == ref["kernel"].__dict__ assert npu_op.padding == ref["padding"] # Compare activation if ref["activation"]["op"] is None: assert npu_op.activation is None else: assert npu_op.activation.op_type == ref["activation"]["op"] assert npu_op.activation.min == ref["activation"]["min"] assert npu_op.activation.max == ref["activation"]["max"] # Compare ifm upscaling assert npu_op.ifm_upscale == ref["ifm_upscale"] # Compare weight quantization parameters assert w_zero_point == ref["w_zero_point"]
vapi.NpuShape3D(10, 30, 32), vapi.NpuShape3D(8, 8, 64), ], "ref": vapi.NpuShape3D(8, 8, 64), }, ] for test_case in block_configs_cases: assert vela_api._get_optimal_block_config( test_case["test"]) == test_case["ref"] @pytest.mark.parametrize( "block_config_str, expected_block_config", [("4x4x8", vapi.NpuShape3D(4, 4, 8)), ("3x7x16", vapi.NpuShape3D(3, 7, 16))], ) def test_force_block_config(block_config_str, expected_block_config): config = { "dev_force_block_config": block_config_str, } with tvm.transform.PassContext( config={"relay.ext.ethos-u.options": config}): block_config = vela_api.get_optimal_block_config( None, vapi.NpuAccelerator.Ethos_U55_128) assert block_config == expected_block_config def test_compress_weights(): test_vecs = [