def _ethos_u55_cascader(sram, enable_striping) -> Callable: # TODO(ekalda): Extract the flash info from ConstantPools once it is implemented flash = MemoryRegion(name="FLASH", size=10**7, read_bandwidth=4, write_bandwidth=4) device_config = EthosuDeviceConfig(util.get_accelerator_config()) cascader_options = CascaderOptions( cascade_region=sram, max_proposals=64, stripe_factors=5, max_plan_size=10, always_copy_size=1024, max_open_plans=8, max_closed_plans=32, enable_striping=enable_striping, ) return _create_cascader( options=cascader_options, io_region=sram, constant_region=flash, working_regions=[sram], device_config=device_config, )
def test_compiler_attr_default(): default_config = { "accelerator_config": "ethos-u55-256", } with tvm.transform.PassContext(opt_level=3): with tvm.target.Target("c -device=micro_dev"): assert util.get_accelerator_config( ) == default_config["accelerator_config"]
def relay_to_tir(mod: tvm.ir.IRModule) -> tvm.ir.IRModule: """ This is the hook for python-based lowering of a Relay module which lowers NPU external functions to TIR. Parameters ---------- mod : tvm.ir.IRModule This is the Relay module. Returns ------- mod : tvm.ir.IRModule The Relay module with scheduled NPU external functions. """ mod = OutlineCompilerFunctions("ethos-u")(mod) mod = LegalizeEthosU()(mod) mod = LUTsOptimizer()(mod) mod = relay.transform.InferType()(mod) mod = IdentityOptimizer()(mod) mod = LayoutOptimizer()(mod) mod = relay.transform.InferType()(mod) device_contexts = { gv: "ethos-u" for gv, _ in filter(lambda x: util.is_npu_func(x[1]), mod.functions.items()) } mod = mod.with_attr("device_contexts", device_contexts) # Use the cascader if it is enabled for the U55 accelerator, otherwise use copy_constants # scheduler if util.is_cascader_enabled(): if util.get_accelerator_config() == "ethos-u65-256": raise ValueError( "Cascading is not supported for the U65 accelerator") workspace_memory_pools = mod.attrs["workspace_memory_pools"] if not workspace_memory_pools: raise ValueError( "Workspace memory pool needs to be provided for the U55 cascader" ) if len(workspace_memory_pools.pools) != 1: raise ValueError( "Exactly one workspace pool needs to be provided for the U55 cascader" ) memory_pressure = _calculate_memory_pressure(mod) sram = extract_memory_info(workspace_memory_pools.pools[0], memory_pressure) tir_mod = LowerToTIR( _ethos_u55_cascader(sram, util.is_striping_enabled()))(mod) else: tir_mod = LowerToTIR(copy_constants())(mod) return tir_mod
def test_compiler_attr(): config = { "accelerator_config": "ethos-u55-32", } with tvm.transform.PassContext(opt_level=3, config={"relay.ext.ethosu.options": config}): with tvm.target.Target("c -device=micro_dev"): assert util.get_accelerator_config( ) == config["accelerator_config"]
def get_target_accel_type(): """This is a helper function to convert cli accelerator type str argument to NpuAccelerator""" npu_accel_str_map = { "ethos-u55-256": vapi.NpuAccelerator.Ethos_U55_256, "ethos-u55-128": vapi.NpuAccelerator.Ethos_U55_128, "ethos-u55-64": vapi.NpuAccelerator.Ethos_U55_64, "ethos-u55-32": vapi.NpuAccelerator.Ethos_U55_32, } accel_type_str = util.get_accelerator_config() assert accel_type_str in npu_accel_str_map.keys( ), f"{accel_type_str} is not supported" return npu_accel_str_map[accel_type_str]
def assign_addresses(buffer_info, npu_ops, scratch_region_map): """This function will assign addresses to tensors within two buffers : scratch and constants. The scratch is the buffer created to hold all intermediary data The constants is the buffer created via unifying all the constant data (post-encoding). Parameters ---------- buffer_info : dict This is the dictionary obtained via calling extract_buffer_info. The key is the buffer name to BufferInfo npu_ops : list A list of Vela NpuOps with tir.BufferLoads for addresses A list of Vela NpuOps with tir.Loads for addresses scratch_region_map : Dict[tvm.tir.Var, RegionOffset] A buffer_var to region and offset map. Returns ------- npu_ops : list A list of Vela NpuOps with addesses within scratch and constant buffers constant_tensor : NDArray A unified constant data array of uint8 as the constant buffer """ def replace_npu_fm_with_address(npu_fm): assert isinstance(npu_fm.tiles.addresses[0], tvm.tir.BufferLoad) buffer = npu_fm.tiles.addresses[0].buffer.data if buffer in scratch_region_map.keys(): address = scratch_region_map[buffer].offset region = scratch_region_map[buffer].region else: assert buffer in buffer_addresses.keys() address, buffer_type = buffer_addresses[buffer] region = _get_region(buffer_type) assert (len(npu_fm.tiles.addresses[0].indices) == 1 ), "Ethos-U translation expects flattened buffers" index = npu_fm.tiles.addresses[0].indices[0] * ( np.iinfo(np.dtype(npu_fm.tiles.addresses[0])).bits // 8) npu_fm.tiles.addresses[0] = address + int(index) npu_fm.tiles.addresses[1] = (address if isinstance( npu_fm.tiles.addresses[1], tvm.tir.BufferLoad) else 0) npu_fm.tiles.addresses[2] = (address if isinstance( npu_fm.tiles.addresses[2], tvm.tir.BufferLoad) else 0) npu_fm.tiles.addresses[3] = 0 npu_fm.region = region return npu_fm def replace_npu_address_range_with_address(npu_addr_range): assert isinstance(npu_addr_range.address, tvm.tir.BufferLoad) buffer = npu_addr_range.address.buffer.data index = int(npu_addr_range.address.indices[0] * (np.iinfo(np.dtype(npu_addr_range.address)).bits // 8)) if buffer in scratch_region_map.keys(): return vapi.NpuAddressRange( scratch_region_map[buffer].region, scratch_region_map[buffer].offset + index, npu_addr_range.length, ) assert buffer in buffer_addresses.keys( ), f"searching for buffer : {buffer}, but not found" address, buffer_type = buffer_addresses[buffer] address = address + int(npu_addr_range.address.indices[0].value) return vapi.NpuAddressRange(_get_region(buffer_type), address, npu_addr_range.length) def replace_tir_loads(npu_object): if isinstance(npu_object, vapi.NpuFeatureMap): return replace_npu_fm_with_address(npu_object) if isinstance(npu_object, vapi.NpuAddressRange): return replace_npu_address_range_with_address(npu_object) return npu_object def classify_io(buffer): for _npu_op in npu_ops: if issubclass(type(_npu_op), vapi.NpuBlockOperation): if _npu_op.ifm and _npu_op.ifm.tiles.addresses[ 0].buffer.data == buffer: return BufferType.input if _npu_op.ifm2 and _npu_op.ifm2.tiles.addresses[ 0].buffer.data == buffer: return BufferType.input if _npu_op.ofm and _npu_op.ofm.tiles.addresses[ 0].buffer.data == buffer: return BufferType.output raise ValueError(f"Unused IO : {buffer} in tir module.") constant_hex_data = [] total_constant_len = 0 buffer_addresses = dict() for _buffer, info in buffer_info.items(): if info.values is not None: assert info.btype == BufferType.constant assert len(info.shape) == 1 buffer_addresses[_buffer] = ((total_constant_len, info.btype) if constant_hex_data else (0, info.btype)) dtype_bytes = np.iinfo(np.dtype(info.dtype)).bits // 8 size_in_bytes = dtype_bytes * np.prod(list(info.shape)) # Every memory address the NPU access have to be 16 byte aligned size_in_bytes = util.round_up(size_in_bytes, 16) constant_tensor = np.resize(info.values, size_in_bytes // dtype_bytes) constant_tensor = constant_tensor.tobytes().hex() constant_hex_data.append(constant_tensor) total_constant_len += len(constant_tensor) // 2 else: if info.btype == BufferType.input_or_output or info.btype == BufferType.input: buffer_type = info.btype if info.btype == BufferType.input_or_output: buffer_type = classify_io(_buffer) assert buffer_type in (BufferType.input, BufferType.output) address = 0 buffer_addresses[_buffer] = (address, buffer_type) buffer_info[_buffer] = BufferInfo(values=None, shape=info.dtype, dtype=info.dtype, btype=buffer_type) elif info.btype == BufferType.shram: accl_config = util.get_accelerator_config() arch_config = get_accelerator_arch_config(accl_config) address = arch_config.lut_start_address buffer_addresses[_buffer] = (address, info.btype) else: # These buffer_vars are already updated in scratch_region_map assert info.btype == BufferType.scratch for npu_op in npu_ops: for attr_name, attr in npu_op.__dict__.items(): if isinstance(attr, list): new_attr = list() for attr_ in attr: new_attr.append(replace_tir_loads(attr_)) setattr(npu_op, attr_name, new_attr) else: setattr(npu_op, attr_name, replace_tir_loads(attr)) constant_data = "".join(constant_hex_data) return (npu_ops, constant_data)
def assign_addresses(buffer_info, npu_ops): """This function will assign addresses to tensors within two buffers : scratch and constants. The scratch is the buffer created to hold all intermediary data The constants is the buffer created via unifying all the constant data (post-encoding). Parameters ---------- buffer_info : dict This is the dictionary obtained via calling extract_buffer_info. The key is the buffer name to BufferInfo npu_ops : list A list of Vela NpuOps with tir.Loads for addresses Returns ------- npu_ops : list A list of Vela NpuOps with addesses within scratch and constant buffers constant_tensor : NDArray A unified constant data array of uint8 as the constant buffer scratch_size : int The size of the scratch tensor. """ def replace_npu_fm_with_address(npu_fm): assert isinstance(npu_fm.tiles.addresses[0], tvm.tir.Load) # We currently does not support tiles # Change this when tiles are needed # (i.e. when using rolling buffers) assert npu_fm.tiles.addresses[1:] == [0, 0, 0] npu_fm.tiles.addresses[1:] = [0, 0, 0] buffer = npu_fm.tiles.addresses[0].buffer_var assert buffer in buffer_addresses.keys() address, buffer_type = buffer_addresses[buffer] index = npu_fm.tiles.addresses[0].index * ( np.iinfo(np.dtype(npu_fm.tiles.addresses[0])).bits // 8) npu_fm.tiles.addresses[0] = address + int(index) npu_fm.region = _REGION_MAP[buffer_type] return npu_fm def replace_npu_address_range_with_address(npu_addr_range): assert isinstance(npu_addr_range.address, tvm.tir.Load) buffer = npu_addr_range.address.buffer_var assert buffer in buffer_addresses.keys( ), f"searching for buffer : {buffer}, but not found" address, buffer_type = buffer_addresses[buffer] return vapi.NpuAddressRange(_REGION_MAP[buffer_type], address, npu_addr_range.length) def replace_tir_loads(npu_object): if isinstance(npu_object, vapi.NpuFeatureMap): return replace_npu_fm_with_address(npu_object) if isinstance(npu_object, vapi.NpuAddressRange): return replace_npu_address_range_with_address(npu_object) return npu_object def classify_io(buffer): for _npu_op in npu_ops: if issubclass(type(_npu_op), vapi.NpuBlockOperation): if _npu_op.ifm and _npu_op.ifm.tiles.addresses[ 0].buffer_var == buffer: return BufferType.input if _npu_op.ifm2 and _npu_op.ifm2.tiles.addresses[ 0].buffer_var == buffer: return BufferType.input if _npu_op.ofm and _npu_op.ofm.tiles.addresses[ 0].buffer_var == buffer: return BufferType.output raise ValueError(f"Unused IO : {buffer} in tir module.") scratch_size = 0 constant_tensor = None buffer_addresses = dict() for _buffer, info in buffer_info.items(): if info.values is not None: assert np.dtype(info.dtype) == np.uint8 assert info.btype == BufferType.constant assert len(info.shape) == 1 if constant_tensor is None: buffer_addresses[_buffer] = (0, info.btype) assert info.values.dtype == np.uint8 size_in_bytes = info.values.size # Every memory address the NPU access have to be 16 byte aligned size_in_bytes = util.round_up(size_in_bytes, 16) constant_tensor = np.resize(info.values, size_in_bytes) else: buffer_addresses[_buffer] = (constant_tensor.size, info.btype) assert info.values.dtype == np.uint8 size_in_bytes = info.values.size # Every memory address the NPU access have to be 16 byte aligned size_in_bytes = util.round_up(size_in_bytes, 16) constant_tensor = np.append( constant_tensor, np.resize(info.values, size_in_bytes)) else: if info.btype == BufferType.input_or_output: buffer_type = classify_io(_buffer) assert buffer_type in (BufferType.input, BufferType.output) address = 0 buffer_addresses[_buffer] = (address, buffer_type) elif info.btype == BufferType.shram: accl_config = util.get_accelerator_config() arch_config = get_accelerator_arch_config(accl_config) address = arch_config.lut_start_address buffer_addresses[_buffer] = (address, info.btype) else: size_in_bytes = int( (np.iinfo(np.dtype(info.dtype)).bits // 8) * np.prod(list(info.shape))) # Every memory address the NPU access have to be 16 byte aligned size_in_bytes = util.round_up(size_in_bytes, 16) assert info.btype == BufferType.scratch address = scratch_size scratch_size += size_in_bytes buffer_addresses[_buffer] = (address, info.btype) for npu_op in npu_ops: for attr_name, attr in npu_op.__dict__.items(): if isinstance(attr, list): new_attr = list() for attr_ in attr: new_attr.append(replace_tir_loads(attr_)) setattr(npu_op, attr_name, new_attr) else: setattr(npu_op, attr_name, replace_tir_loads(attr)) return npu_ops, constant_tensor, scratch_size