def _check_buffer(address, region, length, buffer_var):
    """Checks whether the buffer information is valid with original tir buffers.

    - If it is constant, this will check the slice in the constant tensor
      has the values.
    - If it is scratch, this will check the slice is within scratch and
      does not have conflicts with other scratch tensors.
    - If it is input/output, this will check the address is zero.
    """
    # Map the command-stream region number back to a BufferType.
    # NOTE(review): region 2 is deliberately absent -- presumably not
    # produced by the translator; confirm against _get_region / _REGION_MAP.
    inverse_region_map = {
        0: tir_to_cs_translator.BufferType.constant,
        1: tir_to_cs_translator.BufferType.scratch,
        3: tir_to_cs_translator.BufferType.input,
        4: tir_to_cs_translator.BufferType.output,
    }
    buffer_type = inverse_region_map[region]
    if buffer_type == tir_to_cs_translator.BufferType.constant:
        # The slice of the unified constant tensor must hold exactly the
        # original constant values recorded for this buffer.
        ref = buffer_info[buffer_var].values
        assert (constant_tensor[address : address + length] == ref).all()
        # Every buffer is adjusted to align to 16 bytes
        length = util.round_up(length, 16)
        # Mark that these constants are read at least once
        constant_tensor_read_mask[address : address + length] = np.ones(length, dtype="uint8")
    elif buffer_type == tir_to_cs_translator.BufferType.scratch:
        # The declared extent must match the flattened buffer shape.
        shape = list(buffer_info[buffer_var].shape)
        assert length == np.prod(shape)
        assert address < scratch_size
        # Every buffer is adjusted to align to 16 bytes
        length = util.round_up(length, 16)
        assert address + length <= scratch_size
        # The scratch area should not be used by any other buffer
        assert not scratch_allocation_mask[address : address + length].any()
        # The scratch area is marked as used
        scratch_allocation_mask[address : address + length] = np.ones(length, dtype="uint8")
    elif buffer_type == tir_to_cs_translator.BufferType.input:
        # Inputs are addressed from the start of their own region.
        assert address == 0
    else:
        assert buffer_type == tir_to_cs_translator.BufferType.output
        assert address == 0
def calculate_block_traversal_mode(
    is_depthwise: bool, weights_shape_ohwi: List[int], ifm_bitdepth: int
) -> vapi.NpuBlockTraversal:
    """Calculate a block traversal mode given whether the op is depthwise convolution,
    shape of weights and bit-depth of the ifm.
    """
    if is_depthwise:
        return vapi.NpuBlockTraversal.DEPTH_FIRST

    # Determine which block traversal strategy has better DPU utilization
    ifm_depth = weights_shape_ohwi[3]
    kernel_size = weights_shape_ohwi[1] * weights_shape_ohwi[2]

    # Depth-first packs the ifm depth into 32- (8-bit ifm) or 16-element granules.
    depth_granule = 32 if ifm_bitdepth == 8 else 16
    depth_utilization = ifm_depth / util.round_up(ifm_depth, depth_granule)

    # Part-kernel-first packs depth into granules of 8 and the kernel into
    # granules of 4 (8-bit ifm) or 2.
    kernel_granule = 4 if ifm_bitdepth == 8 else 2
    part_kernel_utilization = (ifm_depth / util.round_up(ifm_depth, 8)) * (
        kernel_size / util.round_up(kernel_size, kernel_granule)
    )

    # Part-kernel first is always better for ifm depths <= 8
    if part_kernel_utilization >= depth_utilization or ifm_depth <= 8:
        return vapi.NpuBlockTraversal.PART_KERNEL_FIRST
    return vapi.NpuBlockTraversal.DEPTH_FIRST
def analyze_remaining_allocates(stmt): nonlocal dynamic_allocation_size if isinstance(stmt, tvm.tir.stmt.Allocate): allocate = stmt pointer_type = allocate.buffer_var.type_annotation storage_scope = pointer_type.storage_scope if storage_scope == "global": dtype_bytes = np.iinfo(np.dtype(allocate.dtype)).bits // 8 size_in_bytes = int(dtype_bytes * np.prod(list(allocate.extents))) # Every memory address the NPU access have to be 16 byte aligned size_in_bytes = util.round_up(size_in_bytes, 16) address = dynamic_allocation_size dynamic_allocation_size += size_in_bytes scratch_region_map[allocate.buffer_var] = RegionOffset( region=dynamic_allocation_region, offset=address)
def assign_addresses(buffer_info, npu_ops, scratch_region_map):
    """This function will assign addresses to tensors
    within two buffers : scratch and constants.
    The scratch is the buffer created to hold all intermediary data
    The constants is the buffer created via unifying all the constant data
    (post-encoding).

    Parameters
    ----------
    buffer_info : dict
        This is the dictionary obtained via calling extract_buffer_info.
        The key is the buffer name to BufferInfo
    npu_ops : list
        A list of Vela NpuOps with tir.BufferLoads for addresses
    scratch_region_map : Dict[tvm.tir.Var, RegionOffset]
        A buffer_var to region and offset map.

    Returns
    -------
    npu_ops : list
        A list of Vela NpuOps with addresses within scratch and constant buffers
    constant_tensor : NDArray
        A unified constant data array of uint8 as the constant buffer
    """

    def replace_npu_fm_with_address(npu_fm):
        # Resolve the tir.BufferLoad held in tile 0 into a concrete
        # (region, byte address) pair understood by Vela.
        assert isinstance(npu_fm.tiles.addresses[0], tvm.tir.BufferLoad)
        buffer = npu_fm.tiles.addresses[0].buffer.data
        if buffer in scratch_region_map.keys():
            address = scratch_region_map[buffer].offset
            region = scratch_region_map[buffer].region
        else:
            assert buffer in buffer_addresses.keys()
            address, buffer_type = buffer_addresses[buffer]
            region = _get_region(buffer_type)
        assert (
            len(npu_fm.tiles.addresses[0].indices) == 1
        ), "Ethos-U translation expects flattened buffers"
        # Scale the element index by the element width to get a byte offset.
        index = npu_fm.tiles.addresses[0].indices[0] * (
            np.iinfo(np.dtype(npu_fm.tiles.addresses[0])).bits // 8
        )
        npu_fm.tiles.addresses[0] = address + int(index)
        npu_fm.tiles.addresses[1] = (
            address if isinstance(npu_fm.tiles.addresses[1], tvm.tir.BufferLoad) else 0
        )
        npu_fm.tiles.addresses[2] = (
            address if isinstance(npu_fm.tiles.addresses[2], tvm.tir.BufferLoad) else 0
        )
        npu_fm.tiles.addresses[3] = 0
        npu_fm.region = region
        return npu_fm

    def replace_npu_address_range_with_address(npu_addr_range):
        # Same resolution as above, but for a plain NpuAddressRange.
        assert isinstance(npu_addr_range.address, tvm.tir.BufferLoad)
        buffer = npu_addr_range.address.buffer.data
        index = int(
            npu_addr_range.address.indices[0]
            * (np.iinfo(np.dtype(npu_addr_range.address)).bits // 8)
        )
        if buffer in scratch_region_map.keys():
            return vapi.NpuAddressRange(
                scratch_region_map[buffer].region,
                scratch_region_map[buffer].offset + index,
                npu_addr_range.length,
            )
        assert buffer in buffer_addresses.keys(), f"searching for buffer : {buffer}, but not found"
        address, buffer_type = buffer_addresses[buffer]
        # NOTE(review): this path adds the raw element index without the
        # byte-width multiplier used above -- equivalent only if these
        # buffers are uint8; confirm against the constant encoding.
        address = address + int(npu_addr_range.address.indices[0].value)
        return vapi.NpuAddressRange(_get_region(buffer_type), address, npu_addr_range.length)

    def replace_tir_loads(npu_object):
        # Dispatch on the Vela object kind; anything else passes through.
        if isinstance(npu_object, vapi.NpuFeatureMap):
            return replace_npu_fm_with_address(npu_object)
        if isinstance(npu_object, vapi.NpuAddressRange):
            return replace_npu_address_range_with_address(npu_object)
        return npu_object

    def classify_io(buffer):
        # A buffer read as an ifm/ifm2 is an input; one written as an ofm
        # is an output. Anything unreferenced is an error.
        for _npu_op in npu_ops:
            if issubclass(type(_npu_op), vapi.NpuBlockOperation):
                if _npu_op.ifm and _npu_op.ifm.tiles.addresses[0].buffer.data == buffer:
                    return BufferType.input
                if _npu_op.ifm2 and _npu_op.ifm2.tiles.addresses[0].buffer.data == buffer:
                    return BufferType.input
                if _npu_op.ofm and _npu_op.ofm.tiles.addresses[0].buffer.data == buffer:
                    return BufferType.output

        raise ValueError(f"Unused IO : {buffer} in tir module.")

    constant_hex_data = []
    total_constant_len = 0
    buffer_addresses = dict()
    for _buffer, info in buffer_info.items():
        if info.values is not None:
            # Constants are packed back-to-back into one hex blob; each
            # constant starts at the current running length.
            assert info.btype == BufferType.constant
            assert len(info.shape) == 1
            buffer_addresses[_buffer] = (
                (total_constant_len, info.btype) if constant_hex_data else (0, info.btype)
            )
            dtype_bytes = np.iinfo(np.dtype(info.dtype)).bits // 8
            size_in_bytes = dtype_bytes * np.prod(list(info.shape))
            # Every memory address the NPU accesses has to be 16 byte aligned
            size_in_bytes = util.round_up(size_in_bytes, 16)
            constant_tensor = np.resize(info.values, size_in_bytes // dtype_bytes)
            constant_tensor = constant_tensor.tobytes().hex()
            constant_hex_data.append(constant_tensor)
            # Two hex characters encode one byte.
            total_constant_len += len(constant_tensor) // 2
        else:
            if info.btype == BufferType.input_or_output or info.btype == BufferType.input:
                buffer_type = info.btype
                if info.btype == BufferType.input_or_output:
                    buffer_type = classify_io(_buffer)
                assert buffer_type in (BufferType.input, BufferType.output)
                address = 0
                buffer_addresses[_buffer] = (address, buffer_type)
                # Bug fix: record the buffer's shape (was mistakenly
                # passing info.dtype as the shape).
                buffer_info[_buffer] = BufferInfo(
                    values=None, shape=info.shape, dtype=info.dtype, btype=buffer_type
                )
            elif info.btype == BufferType.shram:
                accl_config = util.get_accelerator_config()
                arch_config = get_accelerator_arch_config(accl_config)
                address = arch_config.lut_start_address
                buffer_addresses[_buffer] = (address, info.btype)
            else:
                # These buffer_vars are already updated in scratch_region_map
                assert info.btype == BufferType.scratch

    # Rewrite every NpuOp attribute (including list attributes) in place,
    # replacing tir loads with concrete addresses.
    for npu_op in npu_ops:
        for attr_name, attr in npu_op.__dict__.items():
            if isinstance(attr, list):
                new_attr = list()
                for attr_ in attr:
                    new_attr.append(replace_tir_loads(attr_))
                setattr(npu_op, attr_name, new_attr)
            else:
                setattr(npu_op, attr_name, replace_tir_loads(attr))

    constant_data = "".join(constant_hex_data)
    return (npu_ops, constant_data)
def assign_addresses(buffer_info, npu_ops):
    """This function will assign addresses to tensors
    within two buffers : scratch and constants.
    The scratch is the buffer created to hold all intermediary data
    The constants is the buffer created via unifying all the constant data
    (post-encoding).

    Parameters
    ----------
    buffer_info : dict
        This is the dictionary obtained via calling extract_buffer_info.
        The key is the buffer name to BufferInfo
    npu_ops : list
        A list of Vela NpuOps with tir.Loads for addresses

    Returns
    -------
    npu_ops : list
        A list of Vela NpuOps with addresses within scratch and constant buffers
    constant_tensor : NDArray
        A unified constant data array of uint8 as the constant buffer
    scratch_size : int
        The size of the scratch tensor.
    """

    def replace_npu_fm_with_address(npu_fm):
        assert isinstance(npu_fm.tiles.addresses[0], tvm.tir.Load)
        # We currently does not support tiles
        # Change this when tiles are needed
        # (i.e. when using rolling buffers)
        assert npu_fm.tiles.addresses[1:] == [0, 0, 0]
        # Re-assign explicitly so the invariant holds even when asserts
        # are stripped (python -O).
        npu_fm.tiles.addresses[1:] = [0, 0, 0]
        buffer = npu_fm.tiles.addresses[0].buffer_var
        assert buffer in buffer_addresses.keys()
        address, buffer_type = buffer_addresses[buffer]
        # Scale the element index by the element width to get a byte offset.
        index = npu_fm.tiles.addresses[0].index * (
            np.iinfo(np.dtype(npu_fm.tiles.addresses[0])).bits // 8
        )
        npu_fm.tiles.addresses[0] = address + int(index)
        npu_fm.region = _REGION_MAP[buffer_type]
        return npu_fm

    def replace_npu_address_range_with_address(npu_addr_range):
        assert isinstance(npu_addr_range.address, tvm.tir.Load)
        buffer = npu_addr_range.address.buffer_var
        assert buffer in buffer_addresses.keys(), f"searching for buffer : {buffer}, but not found"
        address, buffer_type = buffer_addresses[buffer]
        return vapi.NpuAddressRange(_REGION_MAP[buffer_type], address, npu_addr_range.length)

    def replace_tir_loads(npu_object):
        # Dispatch on the Vela object kind; anything else passes through.
        if isinstance(npu_object, vapi.NpuFeatureMap):
            return replace_npu_fm_with_address(npu_object)
        if isinstance(npu_object, vapi.NpuAddressRange):
            return replace_npu_address_range_with_address(npu_object)
        return npu_object

    def classify_io(buffer):
        # A buffer read as an ifm/ifm2 is an input; one written as an ofm
        # is an output. Anything unreferenced is an error.
        for _npu_op in npu_ops:
            if issubclass(type(_npu_op), vapi.NpuBlockOperation):
                if _npu_op.ifm and _npu_op.ifm.tiles.addresses[0].buffer_var == buffer:
                    return BufferType.input
                if _npu_op.ifm2 and _npu_op.ifm2.tiles.addresses[0].buffer_var == buffer:
                    return BufferType.input
                if _npu_op.ofm and _npu_op.ofm.tiles.addresses[0].buffer_var == buffer:
                    return BufferType.output

        raise ValueError(f"Unused IO : {buffer} in tir module.")

    scratch_size = 0
    constant_tensor = None
    buffer_addresses = dict()
    for _buffer, info in buffer_info.items():
        if info.values is not None:
            assert np.dtype(info.dtype) == np.uint8
            assert info.btype == BufferType.constant
            assert len(info.shape) == 1
            assert info.values.dtype == np.uint8
            # Constants are packed back-to-back in the unified constant
            # tensor; each one starts at the current end of the tensor.
            # (Deduplicated: the two previous branches differed only in
            # the start address and append-vs-create.)
            address = 0 if constant_tensor is None else constant_tensor.size
            buffer_addresses[_buffer] = (address, info.btype)
            # Every memory address the NPU accesses has to be 16 byte aligned
            size_in_bytes = util.round_up(info.values.size, 16)
            padded_values = np.resize(info.values, size_in_bytes)
            if constant_tensor is None:
                constant_tensor = padded_values
            else:
                constant_tensor = np.append(constant_tensor, padded_values)
        else:
            if info.btype == BufferType.input_or_output:
                buffer_type = classify_io(_buffer)
                assert buffer_type in (BufferType.input, BufferType.output)
                address = 0
                buffer_addresses[_buffer] = (address, buffer_type)
            elif info.btype == BufferType.shram:
                accl_config = util.get_accelerator_config()
                arch_config = get_accelerator_arch_config(accl_config)
                address = arch_config.lut_start_address
                buffer_addresses[_buffer] = (address, info.btype)
            else:
                # Everything else is scratch, packed back-to-back.
                assert info.btype == BufferType.scratch
                size_in_bytes = int(
                    (np.iinfo(np.dtype(info.dtype)).bits // 8) * np.prod(list(info.shape))
                )
                # Every memory address the NPU accesses has to be 16 byte aligned
                size_in_bytes = util.round_up(size_in_bytes, 16)
                address = scratch_size
                scratch_size += size_in_bytes
                buffer_addresses[_buffer] = (address, info.btype)

    # Rewrite every NpuOp attribute (including list attributes) in place,
    # replacing tir loads with concrete addresses.
    for npu_op in npu_ops:
        for attr_name, attr in npu_op.__dict__.items():
            if isinstance(attr, list):
                new_attr = list()
                for attr_ in attr:
                    new_attr.append(replace_tir_loads(attr_))
                setattr(npu_op, attr_name, new_attr)
            else:
                setattr(npu_op, attr_name, replace_tir_loads(attr))

    return npu_ops, constant_tensor, scratch_size