def test_small_graph():
    """Wire two inline parts into a graph and check connectivity and part IDs."""
    identity = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
    swapped = [[0, 1, 0], [1, 0, 0], [0, 0, 1]]
    te_subgraph = cs.TESubgraph([], None)

    # part_a carries two propagators (one per input), part_b a single one.
    part_a = cs.InlinePart(
        te_subgraph,
        [cs.Propagator(identity, [0, 0]), cs.Propagator(swapped, [-1, -1])],
    )
    part_b = cs.InlinePart(te_subgraph, [cs.Propagator(identity, [0, 0])])

    tensor_1 = cs.Tensor([10, 10], "uint8")
    tensor_2 = cs.Tensor([9, 9], "uint8")
    tensor_3 = cs.Tensor([10, 10], "uint8")
    tensor_4 = cs.Tensor([10, 10], "uint8")

    # tensor_1, tensor_2 -> part_a -> tensor_3
    for slot, tensor in enumerate((tensor_1, tensor_2)):
        part_a.set_input(slot, tensor)
    part_a.set_output(tensor_3)
    for tensor in (tensor_1, tensor_2):
        tensor.add_consumer(part_a)
    tensor_3.add_producer(part_a)

    # tensor_3 -> part_b -> tensor_4
    part_b.set_input(0, tensor_3)
    part_b.set_output(tensor_4)
    tensor_3.add_consumer(part_b)
    tensor_4.add_producer(part_b)

    # The parts see the tensors they were given.
    assert part_a.input_tensors == [tensor_1, tensor_2]
    assert part_a.output_tensor == tensor_3
    assert part_b.input_tensors == [tensor_3]
    assert part_b.output_tensor == tensor_4

    # The tensors see the parts that were registered on them.
    assert tensor_1.producers == []
    assert tensor_1.consumers == [part_a]
    assert tensor_2.producers == []
    assert tensor_2.consumers == [part_a]
    assert tensor_3.producers == [part_a]
    assert tensor_3.consumers == [part_b]
    assert tensor_4.producers == [part_b]
    assert tensor_4.consumers == []

    graph = cs.CascaderGraph([tensor_1, tensor_2], [tensor_4])
    assert graph.input_tensors == [tensor_1, tensor_2]
    assert graph.output_tensors == [tensor_4]
    assert graph.part_order == [part_b, part_a]

    # A part's ID must equal its position in part_order.
    for position, part in enumerate(graph.part_order):
        assert graph.get_part_id(part) == position
def test_generate_graph_plans(SRAM, DRAM):
    """Generate plans for a two-part graph and check how many closed plans result."""
    expected_plan_count = 3
    identity = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
    swapped = [[0, 1, 0], [1, 0, 0], [0, 0, 1]]
    te_subgraph = cs.TESubgraph([], None)

    part_a = cs.InlinePart(
        te_subgraph,
        [cs.Propagator(identity, [0, 0]), cs.Propagator(swapped, [-1, -1])],
    )
    part_b = cs.InlinePart(te_subgraph, [cs.Propagator(identity, [0, 0])])

    tensor_1 = cs.Tensor([10, 10], "int8")
    tensor_2 = cs.Tensor([9, 9], "int8")
    tensor_3 = cs.Tensor([10, 10], "int8")
    tensor_4 = cs.Tensor([10, 10], "int8")

    # tensor_1, tensor_2 -> part_a -> tensor_3
    for slot, tensor in enumerate((tensor_1, tensor_2)):
        part_a.set_input(slot, tensor)
    part_a.set_output(tensor_3)
    for tensor in (tensor_1, tensor_2):
        tensor.add_consumer(part_a)
    tensor_3.add_producer(part_a)

    # tensor_3 -> part_b -> tensor_4
    part_b.set_input(0, tensor_3)
    part_b.set_output(tensor_4)
    tensor_3.add_consumer(part_b)
    tensor_4.add_producer(part_b)

    graph = cs.CascaderGraph([tensor_1, tensor_2], [tensor_4])

    # Memory regions each tensor is allowed to live in.
    home_map = {
        tensor_1: [SRAM, DRAM],
        tensor_2: [SRAM],
        tensor_3: [SRAM],
        tensor_4: [SRAM, DRAM],
    }
    options = make_options(cascade_region=SRAM, stripe_factors=4, max_plan_size=10)

    closed_plans = _generate_graph_plans(graph, home_map, options)

    assert len(closed_plans) == expected_plan_count
def test_generate_output_stripe_configs_disable_striping(stripe_factors):
    """With striping disabled, exactly one output stripe config is generated."""
    propagator = cs.Propagator([[2, 0, 0], [0, 2, 0], [0, 0, 1]], [0, 0])
    part_1 = cs.InlinePart(cs.TESubgraph([], None), [propagator])

    ifm = cs.Tensor([800, 800], "uint8")
    ofm = cs.Tensor([400, 400], "uint8")
    part_1.set_input(0, ifm)
    part_1.set_output(ofm)
    ifm.add_consumer(part_1)
    ofm.add_producer(part_1)

    stripe_configs = _generate_output_stripe_configs(
        part_1, stripe_factors, enable_striping=False, multi_dimensional=False
    )

    assert len(stripe_configs) == 1
def test_generate_output_stripe_configs_single_dimension():
    """Single-dimension striping must yield exactly the expected stripe configs."""
    propagator = cs.Propagator([[2, 0, 0], [0, 2, 0], [0, 0, 1]], [0, 0])
    part_1 = cs.InlinePart(cs.TESubgraph([], None), [propagator])

    ifm = cs.Tensor([800, 800], "uint8")
    ofm = cs.Tensor([400, 400], "uint8")
    part_1.set_input(0, ifm)
    part_1.set_output(ofm)
    ifm.add_consumer(part_1)
    ofm.add_producer(part_1)

    # Each row is (first and third argument, order, stripes); the second argument
    # is always [400, 400] and the last always [0, 0].
    variants = [
        ([400, 1], [2, 1], [1, 400]),
        ([400, 200], [2, 1], [1, 2]),
        ([1, 400], [1, 2], [400, 1]),
        ([200, 400], [1, 2], [2, 1]),
        ([400, 400], [1, 2], [1, 1]),
    ]
    expected_stripe_configs = {
        cs.StripeConfig(shape, [400, 400], shape, order, stripes, [0, 0])
        for shape, order, stripes in variants
    }

    generated = _generate_output_stripe_configs(
        part=part_1, stripe_factors=3, enable_striping=True, multi_dimensional=False
    )

    # Comparing lengths first catches duplicates that the set comparison would hide.
    assert len(generated) == len(expected_stripe_configs)
    assert set(generated) == expected_stripe_configs
def test_ethosu_part():
    """Build an EthosuPart with one input and smoke-test its performance model."""
    quantum = [1, 2, 2, 8]
    subkernel_count = 3
    ifm_propagator = cs.Propagator(
        [
            [1, 0, 0, 0, 2],
            [0, 1, 0, 0, 2],
            [0, 0, 1, 0, 0],
            [0, 0, 0, 1, 0],
            [0, 0, 0, 0, 1],
        ],
        [0, 0, 0, 0],
    )
    block_configs = [cs.BlockConfig([1, 2, 4, 16], 15000, 7500)]

    part = EthosuPart(
        cs.TESubgraph([], None),
        [ifm_propagator],
        quantum,
        subkernel_count,
        block_configs,
        1,
    )
    part.set_input(0, cs.Tensor(shape=[1, 66, 74, 16], dtype="int8"))

    assert part.get_stripe_align_hint() == quantum

    output_stripes = cs.StripeConfig(
        [1, 4, 4, 16], [1, 64, 72, 96], [1, 4, 4, 16], [1, 2, 3, 4], [1, 16, 13, 6], [0, 0, 0, 0]
    )
    # Only check that the performance model runs; the results are not verified.
    for buffer_mode in (BufferMode.ROLLING, BufferMode.RECOMPUTE):
        part.get_performance_info(output_stripes, buffer_mode)
def test_inline_part():
    """Check the initial state of an InlinePart and its stripe propagation.

    A freshly built part has one unset input, no output, one propagator and
    is flagged as in-line. Its performance info reports no cycles and no
    traffic, and propagating the output stripe config through the
    axis-swapping propagator yields the single expected input stripe config.
    """
    subgraph = cs.TESubgraph([], None)
    part = cs.InlinePart(
        subgraph,
        [
            cs.Propagator(
                [[0, 1, 0], [1, 0, 0], [0, 0, 1]],
                [0, 0],
            ),
        ],
    )
    output_stripe_config = cs.StripeConfig([2, 4], [8, 8], [2, 4], [1, 2], [4, 2], [0, 0])
    input_stripe_config = cs.StripeConfig([4, 2], [8, 8], [4, 2], [2, 1], [2, 4], [0, 0])

    assert part.input_tensors == [None]
    # Singleton and truthiness checks instead of `== None` / `== True`.
    assert part.output_tensor is None
    assert len(part.propagators) == 1
    assert part.in_line
    assert part.get_stripe_align_hint() == [1, 1]

    performance_info = part.get_performance_info(output_stripe_config, is_rolling=False)
    assert performance_info.compute_cycles == 0
    assert performance_info.read_bytes == [0]
    assert performance_info.write_bytes == 0

    input_stripe_configs = part.calculate_input_stripe_configs(output_stripe_config)
    assert len(input_stripe_configs) == 1
    assert input_stripe_configs[0] == input_stripe_config
def test_generate_single_plans(SRAM, DRAM):
    """Every single-part plan must be interior to SRAM and cover both tensors."""
    propagator = cs.Propagator([[2, 0, 0], [0, 2, 0], [0, 0, 1]], [0, 0])
    part_1 = cs.InlinePart(cs.TESubgraph([], None), [propagator])

    ifm = cs.Tensor([800, 800], "int8")
    ofm = cs.Tensor([400, 400], "int8")
    part_1.set_input(0, ifm)
    part_1.set_output(ofm)
    ifm.add_consumer(part_1)
    ofm.add_producer(part_1)

    # Memory regions each tensor is allowed to live in.
    home_map = {ifm: [SRAM, DRAM], ofm: [SRAM]}
    options = make_options(cascade_region=SRAM, stripe_factors=1)

    stripe_configs = _generate_output_stripe_configs(part_1, options.stripe_factors)
    plans = _generate_single_plans(part_1, stripe_configs, home_map, options)

    for candidate in plans:
        assert candidate.interior_region == SRAM
        assert candidate.part_group == frozenset([part_1])
        assert set(candidate.tensor_configs.keys()) == {ifm, ofm}
        for tensor_config in candidate.open_configs:
            assert tensor_config.state == cs.TensorConfigState.INTERIOR
def test_force_block_config_kernelwise(ofm_layout, block_config_str, expected_block_shape):
    """A forced block config must be the only valid block config for a pooling op."""
    op_type = "ethosu_pooling"
    kernel = (2, 2)
    stride = (2, 2)
    dilation = (1, 1)
    out_shape = (1, 8, 10, 16)
    ifm_channels = 32
    ofm_channels = out_shape[3]

    # Only the IFM matrix and offset are needed; the remaining outputs are discarded.
    ifm_matrix, ifm_offset, _, _, _, _ = make_matrices(
        op_type, kernel, stride, (0, 0), "NHWC", ofm_layout, dilation, ifm_channels
    )
    ifm_propagator = cs.Propagator(ifm_matrix, ifm_offset)

    stride_h, stride_w = stride
    dilation_h, dilation_w = dilation
    op_attrs = {
        "op": op_type,
        "activation": "NONE",
        "stride_h": stride_h,
        "stride_w": stride_w,
        "dilation_h": dilation_h,
        "dilation_w": dilation_w,
    }

    pass_config = {
        "relay.ext.ethos-u.options": {
            "enable_cascader": True,
            "dev_force_block_config": block_config_str,
        }
    }
    with tvm.transform.PassContext(config=pass_config):
        device_config = cs.EthosuDeviceConfig("ethos-u55-128")
        block_configs = device_config.get_valid_block_configs(
            ifm_propagator,
            op_attrs,
            out_shape,
            ofm_channels,
            ifm_channels,
            ofm_layout,
            "NHWC",
            "int8",
            "int8",
            kernel[0],
            kernel[1],
        )

    assert len(block_configs) == 1
    assert block_configs[0].output_shape == expected_block_shape
def test_force_block_config_elementwise(ofm_layout, block_config_str, expected_block_shape):
    """A forced block config must be the only elementwise block config returned.

    Builds an identity propagator for a unary ABS elementwise op, enables the
    ``dev_force_block_config`` option inside a PassContext and checks that the
    device config returns exactly one block config whose output shape equals
    ``expected_block_shape``.
    """
    op_type = "ethosu_elementwise_unary"
    op_str = "ABS"
    activation = "NONE"
    ofm_shape = (1, 8, 10, 16)
    ifm_matrix = [
        [1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0],
        [0, 0, 0, 0, 1],
    ]
    ifm_offset = [0, 0, 0, 0]
    propagator = cs.Propagator(ifm_matrix, ifm_offset)

    op_attrs = {
        "op": op_type,
        "operator_type": op_str,
        "activation": activation,
        "clip_min": 0,
        "clip_max": 0,
        "rounding_mode": "TFL",
    }

    config = {
        "enable_cascader": True,
        "dev_force_block_config": block_config_str,
    }
    with tvm.transform.PassContext(config={"relay.ext.ethos-u.options": config}):
        device_config = cs.EthosuDeviceConfig("ethos-u55-128")
        block_configs = device_config.get_elementwise_block_config(
            propagator,
            None,
            op_attrs,
            ofm_shape,
            ofm_layout,
            "NHWC",  # input layout name
            None,
            "int8",
            "int8",
        )

    assert len(block_configs) == 1
    assert block_configs[0].output_shape == expected_block_shape
def test_ethosu_part():
    """Build an EthosuPart from a quantum and cycle count and smoke-test it."""
    quantum = [1, 2, 2, 8]
    cycles_per_quantum = 32
    ifm_propagator = pl.Propagator(
        [
            [1, 0, 0, 0, 2],
            [0, 1, 0, 0, 2],
            [0, 0, 1, 0, 0],
            [0, 0, 0, 1, 0],
            [0, 0, 0, 0, 1],
        ],
        [0, 0, 0, 0],
    )

    part = EthosuPart(pl.TESubgraph([], None), [ifm_propagator], quantum, cycles_per_quantum)

    assert part.get_stripe_align_hint() == quantum

    output_stripes = pl.StripeConfig(
        [1, 4, 4, 16], [1, 64, 72, 96], [1, 4, 4, 16], [1, 2, 3, 4], [1, 16, 13, 6], [0, 0, 0, 0]
    )
    # Only check that the performance model runs; the results are not verified.
    for flag in (False, True):
        part.get_performance_info(output_stripes, flag)
def test_generate_output_stripe_configs():
    """With three stripe factors, thirteen output stripe configs are expected."""
    propagator = cs.Propagator([[2, 0, 0], [0, 2, 0], [0, 0, 1]], [0, 0])
    part_1 = cs.InlinePart(cs.TESubgraph([], None), [propagator])

    ifm = cs.Tensor([800, 800], "uint8")
    ofm = cs.Tensor([400, 400], "uint8")
    part_1.set_input(0, ifm)
    part_1.set_output(ofm)
    ifm.add_consumer(part_1)
    ofm.add_producer(part_1)

    generated = _generate_output_stripe_configs(part_1, 3)

    assert len(generated) == 13
def test_best_block_config(
    test_id,
    op_type,
    activation,
    kernel,
    stride,
    dilation,
    padding,
    in_shape,
    out_shape,
    layouts,
    acc_config,
    expected_block_configs,
):
    """Check that the block config picked for a full-tensor stripe is an expected one.

    ``layouts`` is an ``(ifm_layout, ofm_layout)`` pair. ``in_shape`` and
    ``out_shape`` are indexed as 4-element tuples; whenever the matching
    layout is "NHCWB16" they are converted to that layout before use.
    ``expected_block_configs`` is indexed by ``test_id`` and holds the
    acceptable output block shapes as tuples of ints.
    """
    # Applied to a shape with a trailing 1 appended; the last row of the result is dropped.
    nhwc_to_nhcwb16 = [
        [1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 0, 1 / 16, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 0, 0, 16],
        [0, 0, 0, 0, 1],
    ]

    def _to_nhcwb16(shape):
        """Convert a 4-element shape tuple to NHCWB16, rounding fractions up."""
        converted = np.matmul(nhwc_to_nhcwb16, shape + (1,)).tolist()[:-1]
        return [int(math.ceil(n)) for n in converted]

    ifm_matrix, ifm_offset, weight_matrix, weight_offset, _, _ = make_matrices(
        op_type, kernel, stride, padding, layouts[0], layouts[1], dilation, in_shape[3]
    )

    # Channel counts must be read before the shapes are converted below.
    ofm_channels = out_shape[3]
    ifm_channels = in_shape[3]

    if layouts[0] == "NHCWB16":
        in_shape = _to_nhcwb16(in_shape)
    if layouts[1] == "NHCWB16":
        out_shape = _to_nhcwb16(out_shape)

    propagator = cs.Propagator(ifm_matrix, ifm_offset)
    weight_propagator = cs.Propagator(weight_matrix, weight_offset)

    subkernels = ((kernel[0] + 7) // 8) * ((kernel[1] + 7) // 8)

    op_attrs = {
        "op": op_type,
        "activation": activation,
        "stride_h": stride[0],
        "stride_w": stride[1],
        "dilation_h": dilation[0],
        "dilation_w": dilation[1],
    }

    device_config = cs.EthosuDeviceConfig(acc_config)
    block_configs = device_config.get_valid_block_configs(
        propagator,
        op_attrs,
        out_shape,
        ofm_channels,
        ifm_channels,
        layouts[1],
        layouts[0],
        "int8",
        "int8",
        kernel[0],
        kernel[1],
    )

    # The quantum and the stripe order must have the same rank as the output
    # shape: five entries for NHCWB16, four otherwise.
    if layouts[1] == "NHCWB16":
        output_quantum = [1, 1, 1, 2, 8]
        order = [1, 2, 4, 3, 0]
    else:
        output_quantum = [1, 1, 2, 8]
        order = [1, 2, 3, 4]

    # Create EthosUPart
    te_subgraph = cs.TESubgraph([], None)
    part = cs.EthosuPart(
        te_subgraph,
        [propagator, weight_propagator],
        output_quantum,
        subkernels,
        block_configs,
        1,
    )

    # A single stripe covering the whole output tensor.
    stripes = [1] * len(output_quantum)
    offset = [0] * len(output_quantum)
    stripe_config = cs.StripeConfig(out_shape, out_shape, out_shape, order, stripes, offset)

    block = part.get_block_config(stripe_config)
    block_shape = tuple(int(a) for a in block.output_shape)

    assert block_shape in expected_block_configs[test_id]
def test_conv_performance(
    accelerator,
    op_type,
    activation,
    kernel,
    stride,
    dilation,
    padding,
    in_shape,
    out_shape,
    block_shape,
    input_block_shape,
    expected,
):
    """Compute cycles reported for a whole-tensor stripe must be within 10% of ``expected``."""
    kernel_h, kernel_w = kernel[0], kernel[1]
    ifm_channels = in_shape[3]

    ifm_matrix, ifm_offset, weight_matrix, weight_offset, _, _ = make_matrices(
        op_type, kernel, stride, padding, "NHWC", "NHWC", dilation, ifm_channels
    )
    ifm_propagator = cs.Propagator(ifm_matrix, ifm_offset)
    weight_propagator = cs.Propagator(weight_matrix, weight_offset)

    subkernels = ((kernel_h + 7) // 8) * ((kernel_w + 7) // 8)

    device_config = cs.EthosuDeviceConfig(accelerator)

    # Output cycles are scaled by the product of the block dimensions.
    block_elements = reduce(lambda a, b: a * b, block_shape, 1)
    output_cycles = (
        device_config._get_output_cycles(op_type, "", "int8", "int8", activation) * block_elements
    )

    is_partkernel = device_config.is_partkernel(op_type, ifm_channels, "int8", kernel_h * kernel_w)
    block_compute_cycles = device_config._estimate_compute_cycles_per_block(
        op_type,
        _Shape(block_shape),
        _Shape(input_block_shape),
        kernel_h,
        kernel_w,
        ifm_channels,
        "int8",
        is_partkernel,
    )
    block_configs = [
        cs.BlockConfig(input_block_shape, block_shape, block_compute_cycles, int(output_cycles))
    ]

    output_quantum = [1, 1, 2, 8]
    part = cs.EthosuPart(
        cs.TESubgraph([], None),
        [ifm_propagator, weight_propagator],
        output_quantum,
        subkernels,
        block_configs,
        1,
    )
    part.set_input(0, cs.Tensor(in_shape, "int8"))
    part.set_input(1, cs.Tensor([ifm_channels, kernel_h, kernel_w, out_shape[-1]], "int8"))
    part.set_output(cs.Tensor(out_shape, "int8"))

    # A single stripe covering the whole output tensor.
    rank = len(output_quantum)
    stripe_config = cs.StripeConfig(
        out_shape, out_shape, out_shape, [1, 2, 3, 4], [1] * rank, [0] * rank
    )

    performance_info = part.get_performance_info(stripe_config, cs.BufferMode.ROLLING)
    measured_cycles = performance_info.compute_cycles

    tolerance = expected * 0.1
    assert expected - tolerance <= measured_cycles <= expected + tolerance