def test_depthwise():
    """Test a depthwise operation that is preceded by a DMA operation"""
    # The DMA destination range is also used as the weights of the depthwise operation below
    dma_source = NpuAddressRange(region=0, address=0x40, length=96)
    dma_target = NpuAddressRange(region=1, address=0x10000, length=96)
    weights_dma = NpuDmaOperation(dma_source, dma_target)

    depthwise = NpuConvDepthWiseOperation()
    depthwise.ifm = create_feature_map(
        NpuShape3D(height=64, width=64, depth=8),
        1,
        0x0,
        quant=NpuQuantization(scale_f32=0.007843138, zero_point=128),
    )
    depthwise.ofm = create_feature_map(
        NpuShape3D(height=64, width=64, depth=8),
        1,
        0x8000,
        quant=NpuQuantization(scale_f32=0.062745101749897, zero_point=128),
    )
    depthwise.kernel = NpuKernel(3, 3)
    depthwise.padding = NpuPadding(top=1, left=1, right=1, bottom=1)
    depthwise.weights = [dma_target]
    depthwise.biases = [NpuAddressRange(region=0, address=0, length=80)]
    # Block width and height are left at -1; only the depth is given
    depthwise.block_config = NpuShape3D(height=-1, width=-1, depth=8)

    cmds = npu_generate_register_command_stream([weights_dma, depthwise], NpuAccelerator.Ethos_U55_128)

    # Registers describing the DMA transfer, followed by the DMA start command
    dma_checks = (
        (check_cmd0, cmd0.NPU_SET_DMA0_SRC_REGION, 0),
        (check_cmd1, cmd1.NPU_SET_DMA0_SRC, 0x40),
        (check_cmd0, cmd0.NPU_SET_DMA0_DST_REGION, 1),
        (check_cmd1, cmd1.NPU_SET_DMA0_DST, 0x10000),
        (check_cmd1, cmd1.NPU_SET_DMA0_LEN, 96),
        (check_cmd0, cmd0.NPU_OP_DMA_START, 0),
    )
    for verify, register, expected in dma_checks:
        verify(cmds, register, expected)

    # A DMA WAIT should have been inserted
    check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 0)
    check_cmd0(cmds, cmd0.NPU_OP_DEPTHWISE, 0)

    # Both block dimensions must have been filled in with a positive register value
    block_height_m1 = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1)
    block_width_m1 = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1)
    assert block_height_m1 > 0
    assert block_width_m1 > 0
def create_fully_connected_op() -> NpuConv2DOperation:
    """Creates a conv2d operation with a 1x1 kernel on 1x1 feature maps (IFM depth 114, OFM depth 96)"""
    ifm_quantization = NpuQuantization(scale_f32=0.007843138, zero_point=128)
    ofm_quantization = NpuQuantization(scale_f32=0.20392157, zero_point=128)

    fc_op = NpuConv2DOperation()
    fc_op.ifm = create_feature_map(
        NpuShape3D(height=1, width=1, depth=114), 1, 0, quant=ifm_quantization, layout=NpuLayout.NHCWB16
    )
    fc_op.ofm = create_feature_map(
        NpuShape3D(height=1, width=1, depth=96), 1, 0x6A0, quant=ofm_quantization, layout=NpuLayout.NHCWB16
    )
    fc_op.kernel = NpuKernel(1, 1)
    fc_op.padding = NpuPadding(top=0, left=0, right=0, bottom=0)
    fc_op.block_traversal = NpuBlockTraversal.DEPTH_FIRST
    fc_op.weights = [NpuAddressRange(region=0, address=0x16880, length=13120)]
    fc_op.biases = [NpuAddressRange(region=0, address=0x19BC0, length=960)]
    # The weights in this example are assumed to be compressed with ofm depth 96;
    # block width and height are set to -1 so that vela picks suitable values
    fc_op.block_config = NpuShape3D(height=-1, width=-1, depth=96)
    return fc_op
def test_get_fm_strides():
    """Checks get_strides() on one feature map while its layout and data type are changed"""
    feature_map = NpuFeatureMap()
    feature_map.layout = NpuLayout.NHCWB16
    feature_map.data_type = NpuDataType.INT16
    feature_map.shape = NpuShape3D(height=7, width=10, depth=24)

    # NHCWB16, 16-bit elements
    strides_nhcwb16_int16 = get_strides(feature_map)
    assert strides_nhcwb16_int16 == NpuShape3D(height=640, width=32, depth=320)

    # Same shape and data type, NHWC layout
    feature_map.layout = NpuLayout.NHWC
    strides_nhwc_int16 = get_strides(feature_map)
    assert strides_nhwc_int16 == NpuShape3D(height=480, width=48, depth=2)

    # NHWC with UINT8 elements; every expected stride is half of the INT16 case
    feature_map.data_type = NpuDataType.UINT8
    strides_nhwc_uint8 = get_strides(feature_map)
    assert strides_nhwc_uint8 == NpuShape3D(height=240, width=24, depth=1)
def create_avg_pool_op() -> NpuPoolingOperation:
    """Creates an average pooling operation from a 29x30x27 IFM to a 10x10x27 OFM"""
    ifm_quantization = NpuQuantization(scale_f32=0.007843138, zero_point=128)
    ofm_quantization = NpuQuantization(scale_f32=0.20392157, zero_point=128)

    pool_op = NpuPoolingOperation(NpuPoolingOp.AVERAGE)
    pool_op.ifm = create_feature_map(NpuShape3D(height=29, width=30, depth=27), 2, 0, quant=ifm_quantization)
    pool_op.ofm = create_feature_map(NpuShape3D(height=10, width=10, depth=27), 2, 0x5BD0, quant=ofm_quantization)
    pool_op.kernel = NpuKernel(8, 2, 3, 3)
    pool_op.padding = NpuPadding(top=0, left=2, right=3, bottom=0)
    # No block config is set on purpose, vela is left to choose one
    return pool_op
def test_get_address_ranges_vertical_tiles():
    """Checks the feature map address ranges when tiles 0 and 1 (split along the width) are in use"""
    feature_map = NpuFeatureMap()
    feature_map.region = 6
    feature_map.layout = NpuLayout.NHWC
    feature_map.data_type = NpuDataType.INT8
    feature_map.shape = NpuShape3D(height=50, width=10, depth=20)
    # Strides are set explicitly for this test
    feature_map.strides = NpuShape3D(height=100, width=20, depth=1)
    feature_map.tiles = NpuTileBox(height_0=50, height_1=50, width_0=5, addresses=[16, 32000, 0, 0])

    actual_ranges = get_address_ranges(feature_map)

    # Only the first two tiles are expected to produce a range
    expected_ranges = [
        NpuAddressRange(region=6, address=16, length=5000),
        NpuAddressRange(region=6, address=32000, length=5000),
        None,
        None,
    ]
    assert actual_ranges == expected_ranges
def test_get_address_ranges_one_tile():
    """Checks the feature map address ranges when a single tile covers the whole feature map"""
    feature_map = NpuFeatureMap()
    feature_map.region = 4
    feature_map.layout = NpuLayout.NHWC
    feature_map.data_type = NpuDataType.INT16
    feature_map.shape = NpuShape3D(height=50, width=40, depth=3)
    feature_map.tiles = NpuTileBox(height_0=50, height_1=50, width_0=40, addresses=[8000, 0, 0, 0])

    actual_ranges = get_address_ranges(feature_map)

    # Tile 0 holds all 50 * 40 * 3 elements of 2 bytes each; the other tiles are unused
    only_tile = NpuAddressRange(region=4, address=8000, length=12000)
    assert actual_ranges == [only_tile, None, None, None]
def test_get_address_ranges_horizontal_tiles():
    """Checks the feature map address ranges when tiles 0 and 2 (split along the height) are in use"""
    feature_map = NpuFeatureMap()
    feature_map.region = 6
    feature_map.layout = NpuLayout.NHWC
    feature_map.data_type = NpuDataType.INT16
    feature_map.shape = NpuShape3D(height=50, width=10, depth=20)
    feature_map.tiles = NpuTileBox(height_0=20, height_1=30, width_0=10, addresses=[256, 0, 16000, 0])

    actual_ranges = get_address_ranges(feature_map)

    # Tile 0 covers the first 20 rows, tile 2 the remaining 30 rows
    upper_tile = NpuAddressRange(region=6, address=256, length=8000)
    lower_tile = NpuAddressRange(region=6, address=16000, length=12000)
    assert actual_ranges == [upper_tile, None, lower_tile, None]
def test_get_address_ranges_4_tiles():
    """Checks the feature map address ranges when all 4 tiles are in use"""
    feature_map = NpuFeatureMap()
    feature_map.region = 6
    feature_map.layout = NpuLayout.NHCWB16
    feature_map.data_type = NpuDataType.INT16
    feature_map.shape = NpuShape3D(height=50, width=10, depth=20)
    feature_map.tiles = NpuTileBox(height_0=30, height_1=10, width_0=3, addresses=[16, 32000, 8000, 16000])

    actual_ranges = get_address_ranges(feature_map)

    # Expected (address, length) per tile, in tile order.
    # Note that the range expected for tile 3 does not start at that tile's base address (16000).
    expected_ranges = [
        NpuAddressRange(region=6, address=tile_address, length=tile_length)
        for tile_address, tile_length in ((16, 18952), (32000, 6280), (8000, 12552), (28800, 12680))
    ]
    assert actual_ranges == expected_ranges
def test_conv2d():
    """Checks the register command stream that is generated for a single conv2d operation"""
    conv = NpuConv2DOperation()
    ifm_quantization = NpuQuantization(scale_f32=0.007843138, zero_point=128)
    ofm_quantization = NpuQuantization(scale_f32=0.20392157, zero_point=128)
    conv.ifm = create_feature_map(NpuShape3D(height=30, width=62, depth=46), 1, 512, quant=ifm_quantization)
    conv.ofm = create_feature_map(NpuShape3D(height=30, width=31, depth=46), 1, 0x14E40, quant=ofm_quantization)
    conv.kernel = NpuKernel(3, 2, 2, 1)
    conv.weights = [NpuAddressRange(region=0, address=0, length=7696)]
    conv.biases = [NpuAddressRange(region=0, address=32000, length=464)]
    conv.padding = NpuPadding(top=0, left=0, right=1, bottom=1)
    conv.block_traversal = NpuBlockTraversal.PART_KERNEL_FIRST
    # The weights in this example are assumed to be compressed with ofm depth 16;
    # block width and height are set to -1 so that vela picks suitable values
    conv.block_config = NpuShape3D(height=-1, width=-1, depth=16)

    cmds = npu_generate_register_command_stream([conv], NpuAccelerator.Ethos_U55_128)

    # (check function, register, expected value), verified in exactly this order
    expected_registers = [
        # IFM
        (check_cmd0, cmd0.NPU_SET_IFM_REGION, 1),
        (check_cmd1, cmd1.NPU_SET_IFM_BASE0, 512),
        (check_cmd1, cmd1.NPU_SET_IFM_BASE1, 0),
        (check_cmd1, cmd1.NPU_SET_IFM_BASE2, 0),
        (check_cmd1, cmd1.NPU_SET_IFM_BASE3, 0),
        (check_cmd0, cmd0.NPU_SET_IFM_HEIGHT0_M1, 29),
        (check_cmd0, cmd0.NPU_SET_IFM_HEIGHT1_M1, 29),
        (check_cmd0, cmd0.NPU_SET_IFM_WIDTH0_M1, 61),
        (check_cmd0, cmd0.NPU_SET_IFM_DEPTH_M1, 45),
        (check_cmd1, cmd1.NPU_SET_IFM_STRIDE_C, 1),
        (check_cmd1, cmd1.NPU_SET_IFM_STRIDE_Y, 2852),
        (check_cmd1, cmd1.NPU_SET_IFM_STRIDE_X, 46),
        (check_cmd0, cmd0.NPU_SET_IFM_ZERO_POINT, 128),
        (check_cmd0, cmd0.NPU_SET_IFM_PRECISION, 0),
        (check_cmd0, cmd0.NPU_SET_IFM_UPSCALE, 0),
        (check_cmd0, cmd0.NPU_SET_IFM_PAD_TOP, 0),
        (check_cmd0, cmd0.NPU_SET_IFM_PAD_LEFT, 0),
        (check_cmd0, cmd0.NPU_SET_IFM_PAD_BOTTOM, 1),
        (check_cmd0, cmd0.NPU_SET_IFM_PAD_RIGHT, 1),
        # OFM
        (check_cmd0, cmd0.NPU_SET_OFM_REGION, 1),
        (check_cmd1, cmd1.NPU_SET_OFM_BASE0, 85568),
        (check_cmd1, cmd1.NPU_SET_OFM_BASE1, 0),
        (check_cmd1, cmd1.NPU_SET_OFM_BASE2, 0),
        (check_cmd1, cmd1.NPU_SET_OFM_BASE3, 0),
        (check_cmd0, cmd0.NPU_SET_OFM_HEIGHT0_M1, 29),
        (check_cmd0, cmd0.NPU_SET_OFM_HEIGHT1_M1, 29),
        (check_cmd0, cmd0.NPU_SET_OFM_WIDTH0_M1, 30),
        (check_cmd0, cmd0.NPU_SET_OFM_HEIGHT_M1, 29),
        (check_cmd0, cmd0.NPU_SET_OFM_WIDTH_M1, 30),
        (check_cmd0, cmd0.NPU_SET_OFM_DEPTH_M1, 45),
        (check_cmd1, cmd1.NPU_SET_OFM_STRIDE_C, 1),
        (check_cmd1, cmd1.NPU_SET_OFM_STRIDE_Y, 1426),
        (check_cmd1, cmd1.NPU_SET_OFM_STRIDE_X, 46),
        (check_cmd0, cmd0.NPU_SET_OFM_ZERO_POINT, 128),
        (check_cmd0, cmd0.NPU_SET_OFM_PRECISION, 0),
        # Kernel
        (check_cmd0, cmd0.NPU_SET_KERNEL_HEIGHT_M1, 1),
        (check_cmd0, cmd0.NPU_SET_KERNEL_WIDTH_M1, 2),
        (check_cmd0, cmd0.NPU_SET_KERNEL_STRIDE, 5),
        # Weights and biases
        (check_cmd0, cmd0.NPU_SET_WEIGHT_REGION, 0),
        (check_cmd1, cmd1.NPU_SET_WEIGHT_BASE, 0),
        (check_cmd1, cmd1.NPU_SET_WEIGHT_LENGTH, 7696),
        (check_cmd0, cmd0.NPU_SET_SCALE_REGION, 0),
        (check_cmd1, cmd1.NPU_SET_SCALE_BASE, 32000),
        (check_cmd1, cmd1.NPU_SET_SCALE_LENGTH, 464),
        # Activation
        (check_cmd0, cmd0.NPU_SET_ACTIVATION, 0),
        (check_cmd0, cmd0.NPU_SET_ACTIVATION_MIN, 0),
        (check_cmd0, cmd0.NPU_SET_ACTIVATION_MAX, 255),
        # Block config and remaining registers
        (check_cmd0, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, 15),
        (check_cmd0, cmd0.NPU_SET_OFM_BLK_WIDTH_M1, 3),
        (check_cmd0, cmd0.NPU_SET_OFM_BLK_DEPTH_M1, 15),
        (check_cmd0, cmd0.NPU_SET_IFM_IB_END, 14),
        (check_cmd0, cmd0.NPU_SET_AB_START, 14),
        (check_cmd0, cmd0.NPU_SET_ACC_FORMAT, 0),
        (check_cmd0, cmd0.NPU_SET_BLOCKDEP, 0),
        # The operation itself
        (check_cmd0, cmd0.NPU_OP_CONV, 0),
    ]
    for verify, register, expected in expected_registers:
        verify(cmds, register, expected)

    # Check that block width/height were generated that fit
    block_height_m1 = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1)
    block_width_m1 = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1)
    assert block_height_m1 > 0
    assert block_width_m1 > 0
    assert (block_height_m1 + 1) * (block_width_m1 + 1) <= 64
def test_mul_with_broadcast_and_relu():
    """Test multiplication with broadcasted IFM2 and a RELU activation

    Generates the register command stream for an elementwise MUL where IFM2
    (1x22x1) is broadcast against the IFM (31x22x31), without an explicit block
    config, and checks the generated registers as well as the chosen block size.
    """
    op = NpuElementWiseOperation(NpuElementWiseOp.MUL)
    op.ifm = create_feature_map(NpuShape3D(height=31, width=22, depth=31), 1, 0x20)
    op.ifm2 = create_feature_map(NpuShape3D(height=1, width=22, depth=1), 1, 0)
    op.ofm = create_feature_map(NpuShape3D(height=31, width=22, depth=31), 1, 0x52C0)
    op.activation = NpuActivation(NpuActivationOp.NONE_OR_RELU)
    op.activation.min = 0  # RELU
    # Do not set a block config, let vela choose one
    cmds = npu_generate_register_command_stream([op], NpuAccelerator.Ethos_U55_32)

    check_cmd1(cmds, cmd1.NPU_SET_OFM_SCALE, 1073741824, 30)

    # IFM
    check_cmd0(cmds, cmd0.NPU_SET_IFM_REGION, 1)
    check_cmd1(cmds, cmd1.NPU_SET_IFM_BASE0, 32)
    check_cmd1(cmds, cmd1.NPU_SET_IFM_BASE1, 0)
    check_cmd1(cmds, cmd1.NPU_SET_IFM_BASE2, 0)
    check_cmd1(cmds, cmd1.NPU_SET_IFM_BASE3, 0)
    check_cmd0(cmds, cmd0.NPU_SET_IFM_HEIGHT0_M1, 30)
    check_cmd0(cmds, cmd0.NPU_SET_IFM_HEIGHT1_M1, 30)
    check_cmd0(cmds, cmd0.NPU_SET_IFM_WIDTH0_M1, 21)
    check_cmd0(cmds, cmd0.NPU_SET_IFM_DEPTH_M1, 30)
    check_cmd1(cmds, cmd1.NPU_SET_IFM_STRIDE_C, 1)
    check_cmd1(cmds, cmd1.NPU_SET_IFM_STRIDE_Y, 682)
    check_cmd1(cmds, cmd1.NPU_SET_IFM_STRIDE_X, 31)
    check_cmd0(cmds, cmd0.NPU_SET_IFM_ZERO_POINT, 0)
    check_cmd0(cmds, cmd0.NPU_SET_IFM_PRECISION, 0)
    check_cmd0(cmds, cmd0.NPU_SET_IFM_UPSCALE, 0)

    # OFM
    check_cmd0(cmds, cmd0.NPU_SET_OFM_REGION, 1)
    check_cmd1(cmds, cmd1.NPU_SET_OFM_BASE0, 21184)
    check_cmd1(cmds, cmd1.NPU_SET_OFM_BASE1, 0)
    check_cmd1(cmds, cmd1.NPU_SET_OFM_BASE2, 0)
    check_cmd1(cmds, cmd1.NPU_SET_OFM_BASE3, 0)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_HEIGHT0_M1, 30)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_HEIGHT1_M1, 30)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_WIDTH0_M1, 21)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_HEIGHT_M1, 30)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_WIDTH_M1, 21)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_DEPTH_M1, 30)
    check_cmd1(cmds, cmd1.NPU_SET_OFM_STRIDE_C, 1)
    check_cmd1(cmds, cmd1.NPU_SET_OFM_STRIDE_Y, 682)
    check_cmd1(cmds, cmd1.NPU_SET_OFM_STRIDE_X, 31)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_ZERO_POINT, 0)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_PRECISION, 256)

    # Activation
    check_cmd0(cmds, cmd0.NPU_SET_ACTIVATION, 0)
    check_cmd0(cmds, cmd0.NPU_SET_ACTIVATION_MIN, 0)
    check_cmd0(cmds, cmd0.NPU_SET_ACTIVATION_MAX, 255)

    # IFM2, the broadcasted operand
    check_cmd0(cmds, cmd0.NPU_SET_IFM2_REGION, 1)
    check_cmd1(cmds, cmd1.NPU_SET_IFM2_BASE0, 0)
    check_cmd1(cmds, cmd1.NPU_SET_IFM2_BASE1, 0)
    check_cmd1(cmds, cmd1.NPU_SET_IFM2_BASE2, 0)
    check_cmd1(cmds, cmd1.NPU_SET_IFM2_BASE3, 0)
    check_cmd0(cmds, cmd0.NPU_SET_IFM2_HEIGHT0_M1, 0)
    check_cmd0(cmds, cmd0.NPU_SET_IFM2_HEIGHT1_M1, 0)
    check_cmd0(cmds, cmd0.NPU_SET_IFM2_WIDTH0_M1, 21)
    check_cmd1(cmds, cmd1.NPU_SET_IFM2_STRIDE_C, 1)
    check_cmd1(cmds, cmd1.NPU_SET_IFM2_STRIDE_Y, 22)
    check_cmd1(cmds, cmd1.NPU_SET_IFM2_STRIDE_X, 1)
    check_cmd0(cmds, cmd0.NPU_SET_IFM2_ZERO_POINT, 0)
    check_cmd0(cmds, cmd0.NPU_SET_IFM2_PRECISION, 0)
    check_cmd0(cmds, cmd0.NPU_SET_IFM2_BROADCAST, 5)

    # Block config and remaining registers
    check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, 23)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1, 3)
    check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_DEPTH_M1, 31)
    check_cmd0(cmds, cmd0.NPU_SET_IFM_IB_END, 16)
    check_cmd0(cmds, cmd0.NPU_SET_AB_START, 16)
    check_cmd0(cmds, cmd0.NPU_SET_IFM2_IB_START, 9)
    check_cmd0(cmds, cmd0.NPU_SET_ACC_FORMAT, 0)
    check_cmd0(cmds, cmd0.NPU_SET_BLOCKDEP, 0)
    check_cmd0(cmds, cmd0.NPU_OP_ELEMENTWISE, 0)

    # Check that block width/height/depth were generated that fit
    blk_height = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1)
    blk_width = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1)
    blk_depth = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_DEPTH_M1)
    assert blk_height >= 0
    assert blk_width >= 0
    assert blk_depth >= 0
    # The *_M1 registers hold the dimension minus one (e.g. the IFM height of 31 is checked as 30 above),
    # so add one to each before computing the block volume. All three dimensions must be multiplied;
    # adding the depth instead would make this bound impossible to exceed. The block checked above
    # (24 x 4 x 32) has a volume of exactly 3072.
    # NOTE(review): 3072 is taken over from the original assertion - confirm what limit it represents.
    assert (blk_height + 1) * (blk_width + 1) * (blk_depth + 1) <= 3072