Example #1
def _create_npu_op_pooling(serial_pooling: spec.SerialPooling):
    pooling_type = serial_pooling.pooling_type
    if pooling_type == "AVG":
        npu_pooling_op = vapi.NpuPoolingOp.AVERAGE
    elif pooling_type == "MAX":
        npu_pooling_op = vapi.NpuPoolingOp.MAX

    npu_pooling_op = vapi.NpuPoolingOperation(npu_pooling_op)
    npu_pooling_op.ifm = _create_npu_feature_map(serial_pooling.ifm)
    npu_pooling_op.ofm = _create_npu_feature_map(serial_pooling.ofm)
    npu_pooling_op.kernel = _create_npu_kernel(serial_pooling.pool_shape)
    npu_pooling_op.padding = _create_npu_padding(serial_pooling.padding)

    npu_pooling_op.activation = _create_npu_activation(
        serial_pooling.activation)
    if (npu_pooling_op.activation and npu_pooling_op.activation.op_type
            == vapi.NpuActivationOp.NONE_OR_RELU):
        _convert_clip_bounds(npu_pooling_op)

    npu_pooling_op.rounding_mode = _create_npu_rounding_mode(
        serial_pooling.rounding_mode)
    npu_pooling_op.ifm_upscale = _create_npu_resampling_mode(
        serial_pooling.upscale)
    npu_pooling_op.block_config = _create_npu_block_config(
        serial_pooling.block_config)

    if not npu_pooling_op.block_config:
        target_accel_config = vela_api.get_accelerator_config()
        block_config = vela_api.get_optimal_block_config(
            npu_pooling_op, target_accel_config)
        npu_pooling_op.block_config = block_config

    return npu_pooling_op
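
The trailing block-config fallback is identical across all of these helper functions. As a minimal sketch (using only the vela_api wrappers already called above; the helper name itself is hypothetical), it could be factored out as:

def _assign_block_config(npu_op):
    # If the serialized spec did not carry an explicit block config,
    # ask Vela for the optimal one for the configured accelerator.
    if not npu_op.block_config:
        target_accel_config = vela_api.get_accelerator_config()
        npu_op.block_config = vela_api.get_optimal_block_config(
            npu_op, target_accel_config)
    return npu_op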
Example #2
def _create_npu_op_depthwise_conv2d(serial_2d_depthwise):
    npu_depthwise_conv2d_op = vapi.NpuConvDepthWiseOperation()

    npu_depthwise_conv2d_op.ifm = _create_npu_feature_map(serial_2d_depthwise.ifm)
    npu_depthwise_conv2d_op.ofm = _create_npu_feature_map(serial_2d_depthwise.ofm)
    npu_depthwise_conv2d_op.kernel = _create_npu_kernel(serial_2d_depthwise.kernel)
    npu_depthwise_conv2d_op.weights = [_create_npu_address_range(serial_2d_depthwise.weight)]
    weights_zero_point = np.int64(serial_2d_depthwise.weight_zero_point.value)
    npu_depthwise_conv2d_op.biases = [_create_npu_address_range(serial_2d_depthwise.scale_bias)]
    npu_depthwise_conv2d_op.padding = _create_npu_padding(serial_2d_depthwise.padding)

    npu_depthwise_conv2d_op.activation = _create_npu_activation(serial_2d_depthwise.activation)
    if (
        npu_depthwise_conv2d_op.activation
        and npu_depthwise_conv2d_op.activation.op_type == vapi.NpuActivationOp.NONE_OR_RELU
    ):
        _convert_clip_bounds(npu_depthwise_conv2d_op)

    npu_depthwise_conv2d_op.rounding_mode = _create_npu_rounding_mode(
        serial_2d_depthwise.rounding_mode
    )
    npu_depthwise_conv2d_op.ifm_upscale = _create_npu_resampling_mode(serial_2d_depthwise.upscale)
    npu_depthwise_conv2d_op.block_config = _create_npu_block_config(
        serial_2d_depthwise.block_config
    )

    if not npu_depthwise_conv2d_op.block_config:
        target_accel_config = vela_api.get_accelerator_config()
        block_config = vela_api.get_optimal_block_config(
            npu_depthwise_conv2d_op, target_accel_config
        )
        npu_depthwise_conv2d_op.block_config = block_config

    return npu_depthwise_conv2d_op, weights_zero_point
Example #3
def _create_npu_op_conv2d(
    serial_2d_convolution: spec.Serial2DConvolution,
) -> Tuple[vapi.NpuConv2DOperation, int]:
    """This is a helper function to capture a list
    of arguments to create Vela NpuConv2DOperation object.
    """
    has_two_weights = serial_2d_convolution.weight2.address != -1
    has_two_biases = serial_2d_convolution.scale_bias2.address != -1

    npu_conv2d_op = vapi.NpuConv2DOperation()
    npu_conv2d_op.ifm = _create_npu_feature_map(serial_2d_convolution.ifm)
    npu_conv2d_op.ofm = _create_npu_feature_map(serial_2d_convolution.ofm)
    npu_conv2d_op.kernel = _create_npu_kernel(serial_2d_convolution.kernel)
    npu_conv2d_op.weights = ([
        _create_npu_address_range(serial_2d_convolution.weight),
        _create_npu_address_range(serial_2d_convolution.weight2),
    ] if has_two_weights else [
        _create_npu_address_range(serial_2d_convolution.weight)
    ])
    weights_zero_point = np.int64(
        serial_2d_convolution.weight_zero_point.value)
    npu_conv2d_op.biases = ([
        _create_npu_address_range(serial_2d_convolution.scale_bias),
        _create_npu_address_range(serial_2d_convolution.scale_bias2),
    ] if has_two_biases else [
        _create_npu_address_range(serial_2d_convolution.scale_bias)
    ])
    npu_conv2d_op.padding = _create_npu_padding(serial_2d_convolution.padding)

    npu_conv2d_op.activation = _create_npu_activation(
        serial_2d_convolution.activation)
    if (npu_conv2d_op.activation and npu_conv2d_op.activation.op_type
            == vapi.NpuActivationOp.NONE_OR_RELU):
        _convert_clip_bounds(npu_conv2d_op)

    npu_conv2d_op.rounding_mode = _create_npu_rounding_mode(
        serial_2d_convolution.rounding_mode)
    npu_conv2d_op.ifm_upscale = _create_npu_resampling_mode(
        serial_2d_convolution.upscale)
    weights_shape_ohwi = [
        npu_conv2d_op.ofm.shape.depth,
        npu_conv2d_op.kernel.height,
        npu_conv2d_op.kernel.width,
        npu_conv2d_op.ifm.shape.depth,
    ]
    npu_conv2d_op.block_traversal = vela_api.calculate_block_traversal_mode(
        is_depthwise=False,
        weights_shape_ohwi=weights_shape_ohwi,
        ifm_bitdepth=npu_conv2d_op.ifm.data_type.size_in_bits(),
    )
    npu_conv2d_op.block_config = _create_npu_block_config(
        serial_2d_convolution.block_config)

    if not npu_conv2d_op.block_config:
        target_accel_config = vela_api.get_accelerator_config()
        block_config = vela_api.get_optimal_block_config(
            npu_conv2d_op, target_accel_config)
        npu_conv2d_op.block_config = block_config

    return npu_conv2d_op, weights_zero_point
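
The one-or-two address ranges chosen for weights and biases depend on whether the spec carries a second region (an address of -1 marks it as absent). A small illustrative helper, built only from the calls in the example above (the helper name is hypothetical):

def _weight_address_ranges(serial_2d_convolution):
    # Single-core case: just the primary weight region.
    ranges = [_create_npu_address_range(serial_2d_convolution.weight)]
    # Second weight region present: append its address range as well.
    if serial_2d_convolution.weight2.address != -1:
        ranges.append(_create_npu_address_range(serial_2d_convolution.weight2))
    return ranges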
Example #4
def _create_npu_op_unary_elementwise(serial_unary_elementwise):
    operator_type = serial_unary_elementwise.operator_type
    if operator_type == "ABS":
        op = vapi.NpuElementWiseOp.ABS
    elif operator_type == "CLZ":
        op = vapi.NpuElementWiseOp.CLZ

    npu_unary_elementwise_op = vapi.NpuElementWiseOperation(op)
    npu_unary_elementwise_op.ifm = _create_npu_feature_map(
        serial_unary_elementwise.ifm)
    npu_unary_elementwise_op.ofm = _create_npu_feature_map(
        serial_unary_elementwise.ofm)

    npu_unary_elementwise_op.activation = _create_npu_activation(
        serial_unary_elementwise.activation)
    if (npu_unary_elementwise_op.activation
            and npu_unary_elementwise_op.activation.op_type
            == vapi.NpuActivationOp.NONE_OR_RELU):
        _convert_clip_bounds(npu_unary_elementwise_op)

    npu_unary_elementwise_op.rounding_mode = _create_npu_rounding_mode(
        serial_unary_elementwise.rounding_mode)
    npu_unary_elementwise_op.block_config = _create_npu_block_config(
        serial_unary_elementwise.block_config)

    if not npu_unary_elementwise_op.block_config:
        target_accel_type = vela_api.get_accelerator_config()
        block_config = vela_api.get_optimal_block_config(
            npu_unary_elementwise_op, target_accel_type)
        npu_unary_elementwise_op.block_config = block_config

    return npu_unary_elementwise_op
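
The string-to-enum dispatch at the top of these helpers can also be written as a lookup table, which fails loudly for an unsupported operator instead of leaving op unbound. A minimal sketch covering only the operators handled above (the table and helper names are hypothetical):

_UNARY_ELEMENTWISE_OPS = {
    "ABS": vapi.NpuElementWiseOp.ABS,
    "CLZ": vapi.NpuElementWiseOp.CLZ,
}

def _lookup_unary_elementwise_op(operator_type):
    # Raises KeyError for anything the backend does not support.
    return _UNARY_ELEMENTWISE_OPS[operator_type]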
Example #5
def translate(tir_module, params):
    """This will take an tir module for the NPU
    and compile to command stream

    Parameters
    ----------
    tir_module : tvm.IRModule
        The TIR module containing ethosu extern calls
    params : dict
        A dictionary containing TIR primfunc argument ordering
        idx to constant NDArray map
    accel_type : ethosu.vela.api.NpuAccelerator
        the accelerator variant the tir module needs to compiled to

    Returns
    -------
    cs : str
        An hex string of the bytes of command stream
    encoded_constants : str
        An hex string of the bytes that includes concat'd
        encoded weights, encoded biases and scales.
    base_addresses : List[util.BaseAddress]
        base addresses to be used by the driver
    """

    # The NPU has 6 usable regions.
    # Regions 0, 3 and 4 are already used for input,
    # output and constants, respectively (see _get_regions()).
    # Thus, for scratch we are left with regions 5, 2 and 1.
    candidate_regions_for_scratch = [5, 2, 1]
    (
        scratch_region_map,
        dynamic_allocation_size,
        dynamic_allocation_region,
    ) = analyze_scratch_memory_acesses(tir_module,
                                       candidate_regions_for_scratch)
    buffer_info = extract_buffer_info(tir_module, params)
    call_extern_list = extract_call_extern_list(tir_module)
    _npu_ops = list()
    for call_extern in call_extern_list:
        _npu_ops.append(translate_ethosu_tir_call_extern(call_extern))
    _npu_ops, constant_data = assign_addresses(buffer_info, _npu_ops,
                                               scratch_region_map)
    base_addresses = extract_param_base_addresses(tir_module, buffer_info,
                                                  scratch_region_map)
    if dynamic_allocation_size:
        base_addresses.append(
            util.BaseAddress(
                name="dynamic_allocation",
                primfunc_param_idx=None,
                region=dynamic_allocation_region,
                size=dynamic_allocation_size,
                is_runtime_allocation=True,
            ))
    target_accel_config = vela_api.get_accelerator_config()
    cmds = vapi.npu_generate_register_command_stream(_npu_ops,
                                                     target_accel_config)
    payload = vapi.npu_create_driver_payload(cmds, target_accel_config)
    return payload.hex(), constant_data, base_addresses
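
A hypothetical driver-side use of translate, assuming tir_module and params were produced by the earlier lowering stages (all variable names below are illustrative, not part of the API):

command_stream_hex, encoded_constants_hex, base_addresses = translate(tir_module, params)

# The two hex strings can be turned back into raw bytes for the driver.
command_stream = bytes.fromhex(command_stream_hex)
encoded_constants = bytes.fromhex(encoded_constants_hex)

for base in base_addresses:
    # Assumes util.BaseAddress exposes its constructor fields as attributes.
    print(base.name, base.region, base.size)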
Example #6
def _create_npu_op_binary_elementwise(serial_binary_elementwise: spec.SerialBinaryElementwise):
    operator_type = serial_binary_elementwise.operator_type
    if operator_type == "ADD":
        op = vapi.NpuElementWiseOp.ADD
    elif operator_type == "SUB":
        op = vapi.NpuElementWiseOp.SUB
    elif operator_type == "MUL":
        op = vapi.NpuElementWiseOp.MUL
    elif operator_type == "MIN":
        op = vapi.NpuElementWiseOp.MIN
    elif operator_type == "MAX":
        op = vapi.NpuElementWiseOp.MAX
    elif operator_type == "SHR":
        op = vapi.NpuElementWiseOp.SHR
    elif operator_type == "SHL":
        op = vapi.NpuElementWiseOp.SHL

    npu_binary_elementwise_op = vapi.NpuElementWiseOperation(op)
    npu_binary_elementwise_op.ifm = _create_npu_feature_map(serial_binary_elementwise.ifm)
    npu_binary_elementwise_op.ifm2 = _create_npu_feature_map(serial_binary_elementwise.ifm2)
    npu_binary_elementwise_op.ofm = _create_npu_feature_map(serial_binary_elementwise.ofm)
    npu_binary_elementwise_op.reversed_operands = serial_binary_elementwise.reversed_operands

    npu_binary_elementwise_op.activation = _create_npu_activation(
        serial_binary_elementwise.activation
    )
    if (
        npu_binary_elementwise_op.activation
        and npu_binary_elementwise_op.activation.op_type == vapi.NpuActivationOp.NONE_OR_RELU
    ):
        _convert_clip_bounds(npu_binary_elementwise_op)

    npu_binary_elementwise_op.rounding_mode = _create_npu_rounding_mode(
        serial_binary_elementwise.rounding_mode
    )
    npu_binary_elementwise_op.block_config = _create_npu_block_config(
        serial_binary_elementwise.block_config
    )

    if not npu_binary_elementwise_op.block_config:
        target_accel_config = vela_api.get_accelerator_config()
        block_config = vela_api.get_optimal_block_config(
            npu_binary_elementwise_op, target_accel_config
        )
        npu_binary_elementwise_op.block_config = block_config

    return npu_binary_elementwise_op
Example #7
def translate(tir_module, params):
    """This will take an tir module for the NPU
    and compile to command stream

    Parameters
    ----------
    tir_module : tvm.IRModule
        The TIR module containing ethosu extern calls
    params : dict
        A dictionary containing TIR primfunc argument ordering
        idx to constant NDArray map
    accel_type : ethosu.vela.api.NpuAccelerator
        the accelerator variant the tir module needs to compiled to

    Returns
    -------
    cs : str
        An hex string of the bytes of command stream
    encoded_constants : str
        An hex string of the bytes that includes concat'd
        encoded weights, encoded biases and scales.
    base_addresses : List[util.BaseAddress]
        base addresses to be used by the driver
    """

    buffer_info = extract_buffer_info(tir_module, params)
    call_extern_list = extract_call_extern_list(tir_module)
    _npu_ops = list()
    for call_extern in call_extern_list:
        _npu_ops.append(translate_ethosu_tir_call_extern(call_extern))
    _npu_ops, constant_data, scratch_size = assign_addresses(buffer_info, _npu_ops)
    base_addresses = extract_param_base_addresses(tir_module, buffer_info)
    if scratch_size > 0:
        base_addresses.append(
            util.BaseAddress(
                "scratch",
                None,
                _REGION_MAP[BufferType.scratch],
                scratch_size,
                True,
            )
        )
    target_accel_config = vela_api.get_accelerator_config()
    cmds = vapi.npu_generate_register_command_stream(_npu_ops, target_accel_config)
    payload = vapi.npu_create_driver_payload(cmds, target_accel_config)
    return payload.hex(), constant_data, base_addresses
Example #8
    def _visit(stmt):
        new_args = []
        # Don't split a constant whose consumer will be executed on two cores in parallel
        is_u65_conv2d = (vela_api.get_accelerator_config()
                         == vapi.NpuAccelerator.Ethos_U65_512
                         and stmt.args[0] == "ethosu_conv2d")
        for i, arg in enumerate(stmt.args):
            if isinstance(arg, tvm.tir.expr.BufferLoad):
                # If we're trying to load a buffer that maps to a constant
                if arg.buffer.data in buffer_to_const:
                    const = buffer_to_const[arg.buffer.data]
                    flattened_const_shape = np.prod(const.shape)

                    offset = int(arg.indices[0])
                    # Note by convention the arg after a constant read is the length of the read
                    length = int(stmt.args[i + 1])
                    # If it's anything other than a full read, create a new buffer
                    if (offset != 0 or flattened_const_shape != length
                        ) and not is_u65_conv2d:
                        out_channels = const.shape[0]
                        offset_channels = int(
                            (offset * out_channels) / flattened_const_shape)
                        length_channels = int(
                            (length * out_channels) / flattened_const_shape)
                        # split the constant up across channels
                        split_const = np.split(const, out_channels, axis=0)
                        # create a new const out of the channels we want to keep
                        new_const = np.concatenate(
                            split_const[offset_channels:offset_channels +
                                        length_channels],
                            axis=0)
                        new_consts.append(new_const)
                        new_buffer = tvm.tir.decl_buffer(
                            (length, ), arg.dtype, scope=arg.buffer.scope())
                        new_buffers.append(new_buffer)
                        new_args.append(
                            tvm.tir.expr.BufferLoad(new_buffer, [0]))
                        continue
                    keep_buffers.add(arg.buffer.data)

            new_args.append(arg)

        return tvm.tir.Call(stmt.dtype, stmt.op, new_args, stmt.span)
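
The channel arithmetic above maps a flat element offset and read length back onto whole output channels. A small self-contained numeric sketch (all values are illustrative):

import numpy as np

const = np.arange(8 * 16).reshape(8, 16)       # 8 output channels, 16 values each
flattened_const_shape = np.prod(const.shape)   # 128
out_channels = const.shape[0]                  # 8

offset, length = 32, 64                        # read elements 32..95 of the flat view
offset_channels = int((offset * out_channels) / flattened_const_shape)  # 2
length_channels = int((length * out_channels) / flattened_const_shape)  # 4

split_const = np.split(const, out_channels, axis=0)
new_const = np.concatenate(
    split_const[offset_channels:offset_channels + length_channels], axis=0)
assert new_const.shape == (4, 16)              # channels 2..5 were kept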
Example #9
def translate(tir_module, params):
    """This will take an tir module for the NPU
    and compile to command stream

    Parameters
    ----------
    tir_module : tvm.IRModule
        The TIR module containing ethosu extern calls
    params : dict
        A dictionary containing TIR primfunc argument ordering
        idx to constant NDArray map
    accel_type : ethosu.vela.api.NpuAccelerator
        the accelerator variant the tir module needs to compiled to

    Returns
    -------
    cs : str
        An hex string of the bytes of command stream
    encoded_constants : str
        An hex string of the bytes that includes concat'd
        encoded weights, encoded biases and scales.
    scratch_size : int
        The size of the scratch buffer needed.
    """

    buffer_info = extract_buffer_info(tir_module, params)
    call_extern_list = extract_call_extern_list(tir_module)
    _npu_ops = list()
    for call_extern in call_extern_list:
        _npu_ops.append(translate_ethosu_tir_call_extern(call_extern))
    _npu_ops, constant_tensor, scratch_size = assign_addresses(
        buffer_info, _npu_ops)
    target_accel_config = vela_api.get_accelerator_config()
    cmds = vapi.npu_generate_register_command_stream(_npu_ops,
                                                     target_accel_config)
    payload = vapi.npu_create_driver_payload(cmds, target_accel_config)
    hex_value = "" if constant_tensor is None else constant_tensor.tobytes(
    ).hex()
    return payload.hex(), hex_value, scratch_size
Example #10
def EncodeConstants(const_dict):
    """the NPU requires that weights are compressed and bias/scales are 'encoded', both
    of which are performed by this pass.

    This pass modifies both the constant dict to contain the post-encoding values of the
    constants and the IR to adjust buffer types/sizes/accesses so they align with the
    encoded constants. Calls to the Vela API are made to perform the actual compression/
    encoding.

    """
    new_const_dict = {}
    buffer_to_const = {}
    pointer_to_buffer = {}
    rewrite_buffer = {}
    rewrite_pointer = {}
    accel_config = vela_api.get_accelerator_config()

    def _align_scale_bias(tir_extern_call, bias):
        """Align the scale_bias to 16 bytes."""
        value_bytes = bytearray()
        value_bytes.extend(bias.tobytes())
        # Align to 16
        remainder = (len(value_bytes)) % 16
        if remainder > 0:
            value_bytes.extend(bytearray(16 - remainder))
        value = np.frombuffer(value_bytes, dtype="uint8")
        return value

    def _encode_weights(tir_extern_call, weights):
        """Encode the weights for a TIR extern call."""
        value_bytes = vela_api.encode_weights(tir_extern_call, weights, accel_config)
        value = np.frombuffer(value_bytes, dtype="uint8")
        return value

    def _new_buffer(old_buffer, new_value):
        """Create a new buffer and add the old buffer and its pointer to the
        rewriting maps."""
        new_buffer = tvm.tir.decl_buffer((len(new_value),), str(new_value.dtype))
        pointer_to_buffer[new_buffer.data] = new_buffer
        rewrite_buffer[old_buffer] = new_buffer
        rewrite_pointer[old_buffer.data] = new_buffer.data
        buffer_to_const[new_buffer] = new_value

    def _visit_encode_pre(stmt):
        if isinstance(stmt, tvm.tir.Call):
            # Handle copies as a special-case by propagating the buffer information
            # from the read to the write pointer.
            if stmt.args[0] == "ethosu_copy":
                read_pointer = stmt.args[1].buffer_var
                if read_pointer in pointer_to_buffer:
                    write_pointer = stmt.args[3].buffer_var
                    # Assert writing to the base of the write_var (pre-StorageRewrite)
                    assert stmt.args[3].index == 0
                    assert stmt.args[1].index == 0
                    pointer_to_buffer[write_pointer] = pointer_to_buffer[read_pointer]
            else:
                # Encode the weights
                weights_pointer = get_weights_pointer(stmt)
                if weights_pointer is not None:
                    assert weights_pointer in pointer_to_buffer
                    weights_buffer = pointer_to_buffer[weights_pointer]
                    weights_value = buffer_to_const[weights_buffer]
                    new_weights_value = _encode_weights(stmt, weights_value)
                    _new_buffer(weights_buffer, new_weights_value)
                # Align the scale_bias to 16 bytes
                scale_bias_pointer = get_scale_bias_pointer(stmt)
                if scale_bias_pointer is not None:
                    assert scale_bias_pointer in pointer_to_buffer
                    scale_bias_buffer = pointer_to_buffer[scale_bias_pointer]
                    scale_bias_value = buffer_to_const[scale_bias_buffer]
                    new_scale_bias_value = _align_scale_bias(stmt, scale_bias_value)
                    _new_buffer(scale_bias_buffer, new_scale_bias_value)

    def _visit_encode_post(stmt):
        # Because encoding may change the data type (e.g. bias to uint8) and type information
        # is stored in pointer vars, it's necessary to rewrite all the pointers which point
        # to encoded data.
        if isinstance(stmt, tvm.tir.Allocate):
            allocate_pointer = stmt.buffer_var
            if allocate_pointer in pointer_to_buffer:
                buffer = pointer_to_buffer[allocate_pointer]
                if buffer in rewrite_buffer:  # If the pointer needs rewriting
                    # Create a new pointer var with the type of the new buffer
                    new_buffer = rewrite_buffer[buffer]
                    storage_type = tvm.ir.PrimType(new_buffer.dtype)
                    new_pointer = tvm.tir.Var(
                        allocate_pointer.name,
                        tvm.ir.PointerType(storage_type, buffer.scope()),
                        allocate_pointer.span,
                    )
                    # Set the new pointer to resolve to the new buffer
                    pointer_to_buffer[new_pointer] = new_buffer
                    # Add the old pointer to the pointer rewriting dict
                    rewrite_pointer[allocate_pointer] = new_pointer

    def _visit_rewrite(stmt):
        if isinstance(stmt, tvm.tir.Call):
            # For extern calls, we need to rewrite pairs of arguments corresponding to
            # base address load and the length of the load.
            new_args = [stmt.args[0]]
            new_buffers = rewrite_buffer.values()
            for i in range(1, len(stmt.args)):
                # If the previous argument was a load, the current should be a length
                if isinstance(stmt.args[i - 1], tvm.tir.Load):
                    load = stmt.args[i - 1]
                    pointer = load.buffer_var
                    if pointer in pointer_to_buffer:
                        buffer = pointer_to_buffer[pointer]
                        # Only rewrite the arguments of buffers that have been encoded
                        if buffer in new_buffers:
                            new_arg = np.prod(list(pointer_to_buffer[pointer].shape))
                            new_args.append(new_arg)
                            continue
                new_args.append(stmt.args[i])

            return tvm.tir.Call(stmt.dtype, stmt.op, new_args, stmt.span)
        if isinstance(stmt, tvm.tir.Allocate):
            # Where a pointer needs rewriting, the allocate for it must be rewritten
            allocate_pointer = stmt.buffer_var
            if allocate_pointer in pointer_to_buffer:
                if pointer_to_buffer[allocate_pointer] in rewrite_buffer:
                    new_buffer = rewrite_buffer[pointer_to_buffer[allocate_pointer]]
                    new_pointer = rewrite_pointer[allocate_pointer]
                    return tvm.tir.Allocate(
                        new_pointer,
                        new_buffer.dtype,
                        new_buffer.shape,
                        stmt.condition,
                        stmt.body,
                        stmt.span,
                    )
        # The following rewrites would be better expressed by just rewriting the Vars, however
        # ir_transform doesn't seem to visit Vars. So instead we do the next best thing and rewrite
        # the nodes which contain the Vars.
        if isinstance(stmt, tvm.tir.Load):
            load_pointer = stmt.buffer_var
            if load_pointer in rewrite_pointer:
                new_pointer = rewrite_pointer[load_pointer]
                element_type = new_pointer.type_annotation.element_type.dtype
                return tvm.tir.Load(
                    element_type, new_pointer, stmt.index, stmt.predicate, stmt.span
                )
        if isinstance(stmt, tvm.tir.AttrStmt):
            node_pointer = stmt.node
            if node_pointer in rewrite_pointer:
                return tvm.tir.AttrStmt(
                    rewrite_pointer[node_pointer], stmt.attr_key, stmt.value, stmt.body, stmt.span
                )
        return None

    def _ftransform(f, mod, ctx):
        for i, param in enumerate(f.params):
            if i in const_dict:
                buffer_to_const[f.buffer_map[param]] = const_dict[i].flatten()
                pointer_to_buffer[f.buffer_map[param].data] = f.buffer_map[param]

        # First analyse what needs to be rewritten; this pass only populates the
        # rewrite maps, so its returned body is discarded.
        tvm.tir.stmt_functor.ir_transform(
            f.body, _visit_encode_pre, _visit_encode_post, ["tir.Call", "tir.Allocate"]
        )
        # Then perform the rewrites
        new_body = tvm.tir.stmt_functor.ir_transform(
            f.body, None, _visit_rewrite, ["tir.Call", "tir.Allocate", "tir.Load", "tir.AttrStmt"]
        )
        new_buffer_map = {}
        # Rewrite the buffer map and const dict to instead use the encoded versions
        for i, param in enumerate(f.params):
            buffer = f.buffer_map[param]
            if buffer in rewrite_buffer:
                new_buffer = rewrite_buffer[buffer]
                new_buffer_map[param] = new_buffer
                new_value = buffer_to_const[new_buffer]
                new_const_dict[i] = new_value
            elif buffer in buffer_to_const:
                new_const_dict[i] = buffer_to_const[buffer]
                new_buffer_map[param] = buffer
            else:
                new_buffer_map[param] = buffer

        new_f = tvm.tir.PrimFunc(f.params, new_body, f.ret_type, new_buffer_map, f.attrs, f.span)
        return new_f

    def _encode_constants(mod):
        mod, divided_const_dict = DivideConstants(const_dict)(mod)
        const_dict.clear()
        for key, value in divided_const_dict.items():
            const_dict[key] = value
        transform_func = tvm.tir.transform.prim_func_pass(
            _ftransform, opt_level=0, name="tir.ethosu.encode_constants"
        )
        new_func = transform_func(mod)
        return new_func, new_const_dict

    return _encode_constants
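
The 16-byte alignment performed by _align_scale_bias can be checked in isolation with plain numpy (a standalone sketch, independent of the pass machinery above):

import numpy as np

bias = np.ones(10, dtype="int32")      # 40 bytes of raw scale/bias data
value_bytes = bytearray(bias.tobytes())
remainder = len(value_bytes) % 16      # 40 % 16 == 8
if remainder > 0:
    value_bytes.extend(bytearray(16 - remainder))
aligned = np.frombuffer(bytes(value_bytes), dtype="uint8")
assert len(aligned) == 48 and len(aligned) % 16 == 0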
Example #11
File: passes.py Project: wenxcs/tvm
    def collect_encoding_definitions(stmt, old_buffer_to_const):
        # Map from copy destination to copy source.
        copy_map = {}
        # List of buffer copies that occurred
        copied_buffers = []
        # List of encoded buffer information
        constant_buffer_replacements = []

        def _align_scale_bias(tir_extern_call, bias):
            """Align the scale_bias to 16 bytes."""
            value_bytes = bytearray()
            value_bytes.extend(bias.tobytes())
            # Align to 16
            remainder = (len(value_bytes)) % 16
            if remainder > 0:
                value_bytes.extend(bytearray(16 - remainder))
            value = np.frombuffer(value_bytes, dtype="uint8")
            return value

        accel_config = vela_api.get_accelerator_config()

        def _encode_weights(tir_extern_call, weights):
            """Encode the weights for a TIR extern call."""
            value_bytes = vela_api.encode_weights(tir_extern_call, weights,
                                                  accel_config)
            value = np.frombuffer(value_bytes, dtype="uint8")
            return value

        def _declare_constant_buffer(old_buffer, encoded_constants):
            """Create a new buffer and add the old buffer and its pointer to the
            rewriting maps."""
            new_buffer = tvm.tir.decl_buffer(
                shape=[len(encoded_constants)],
                dtype=str(encoded_constants.dtype),
                name=old_buffer.name + "_encoded",
                scope=old_buffer.scope(),
            )

            constant_buffer_replacements.append({
                "old_buffer": old_buffer,
                "new_buffer": new_buffer,
                "encoded_constants": encoded_constants,
            })

        def _visit(stmt):
            if isinstance(stmt, tvm.tir.Call):
                # Handle copies as a special-case by propagating the buffer information
                # from the read to the write pointer.
                if stmt.args[0] == "ethosu_copy":
                    read_buffer = stmt.args[1].buffer
                    write_buffer = stmt.args[3].buffer
                    # Assert writing to the base of the write_var (pre-StorageRewrite)
                    assert list(stmt.args[3].indices) == [0]
                    assert list(stmt.args[1].indices) == [0]
                    copied_buffers.append({
                        "source": read_buffer,
                        "dest": write_buffer
                    })
                    copy_map[write_buffer] = read_buffer

                else:
                    # Encode the weights
                    weights_buffer = get_weights_buffer(stmt)
                    if weights_buffer is not None:
                        if weights_buffer in copy_map:
                            weights_buffer = copy_map[weights_buffer]
                        unencoded_weights_value = old_buffer_to_const[
                            weights_buffer]
                        encoded_weights_value = _encode_weights(
                            stmt, unencoded_weights_value)
                        _declare_constant_buffer(weights_buffer,
                                                 encoded_weights_value)

                    # Align the scale_bias to 16 bytes
                    scale_bias_buffer = get_scale_bias_buffer(stmt)
                    if scale_bias_buffer is not None:
                        if scale_bias_buffer in copy_map:
                            scale_bias_buffer = copy_map[scale_bias_buffer]
                        scale_bias_value = old_buffer_to_const[
                            scale_bias_buffer]
                        aligned_scale_bias_value = _align_scale_bias(
                            stmt, scale_bias_value)
                        _declare_constant_buffer(scale_bias_buffer,
                                                 aligned_scale_bias_value)

        tvm.tir.stmt_functor.post_order_visit(stmt, _visit)

        return {
            "copied_buffers": copied_buffers,
            "constant_buffer_replacements": constant_buffer_replacements,
        }
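
A hypothetical consumer of the dictionary returned by collect_encoding_definitions, assuming old_buffer_to_const and the PrimFunc body have already been gathered (all names below are illustrative):

definitions = collect_encoding_definitions(prim_func.body, old_buffer_to_const)
for copy in definitions["copied_buffers"]:
    print("copy:", copy["source"].name, "->", copy["dest"].name)
for replacement in definitions["constant_buffer_replacements"]:
    print("encoded:", replacement["old_buffer"].name, "->",
          replacement["new_buffer"].name,
          len(replacement["encoded_constants"]), "bytes")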
Example #12
    def collect_encoding_definitions(stmt, old_buffer_to_const):
        # Map from copy destination to copy source.
        copy_map = {}
        # List of buffer copies that occurred
        copied_buffers = []
        # List of encoded buffer information
        constant_buffer_replacements = []

        def _align_scale_bias(tir_extern_call, bias):
            """Align the scale_bias to 16 bytes."""
            value_bytes = bytearray()
            value_bytes.extend(bias.tobytes())
            # Align to 16
            remainder = (len(value_bytes)) % 16
            if remainder > 0:
                value_bytes.extend(bytearray(16 - remainder))
            value = np.frombuffer(value_bytes, dtype="uint8")
            return value

        accel_config = vela_api.get_accelerator_config()

        def _encode_weights(tir_extern_call, weights):
            """Encode the weights for a TIR extern call."""
            value_bytes = vela_api.encode_weights(tir_extern_call, weights,
                                                  accel_config)
            value = np.frombuffer(value_bytes, dtype="uint8")
            return value

        def _declare_constant_buffer(old_buffer, encoded_constants, split_idx):
            """Create a new buffer and add the old buffer and its pointer to the
            rewriting maps."""
            new_buffer = tvm.tir.decl_buffer(
                shape=[len(encoded_constants)],
                dtype=str(encoded_constants.dtype),
                name=old_buffer.name + "_encoded",
                scope=old_buffer.scope(),
            )

            constant_buffer_replacements.append({
                "old_buffer": old_buffer,
                "new_buffer": new_buffer,
                "encoded_constants": encoded_constants,
                "split_idx": split_idx,
            })

        def _encode_weights_or_bias(buffer1, buffer2, stmt, encode_func):
            """Encode the weights or align the bias either for one or two cores,
            depending on the variant."""
            constant = old_buffer_to_const[buffer1]

            # If we have just one core, encode the whole constant
            if buffer2 is None:
                new_const = encode_func(stmt, constant)
                return new_const, None

            # Assume that the constant tensor has not been flattened yet
            assert len(constant.shape) != 1
            channels = constant.shape[0]
            split_const = np.split(constant, channels, axis=0)

            const_list = [
                split_const[i] for i in range(channels) if i % 2 == 0
            ]
            const_to_encode = np.concatenate(const_list, axis=0)

            new_const = encode_func(stmt, const_to_encode)
            split_idx = len(new_const)

            # Encode half of the constant separately for the other core if it exists
            assert buffer1.same_as(buffer2)
            const2_list = [
                split_const[i] for i in range(channels) if i % 2 == 1
            ]
            const2_to_encode = np.concatenate(const2_list, axis=0)

            new_const2 = encode_func(stmt, const2_to_encode)
            new_const = np.append(new_const, new_const2).astype("uint8")

            return new_const, split_idx

        def _visit(stmt):
            if isinstance(stmt, tvm.tir.Call):
                op = str(stmt.args[0].value)
                # Handle copies as a special-case by propagating the buffer information
                # from the read to the write pointer.
                if op == "ethosu_copy":
                    read_buffer = stmt.args[1].buffer
                    write_buffer = stmt.args[3].buffer
                    # Assert writing to the base of the write_var (pre-StorageRewrite)
                    assert list(stmt.args[3].indices) == [0]
                    assert list(stmt.args[1].indices) == [0]
                    copied_buffers.append({
                        "source": read_buffer,
                        "dest": write_buffer
                    })
                    copy_map[write_buffer] = read_buffer

                ops_with_weights = {
                    "ethosu_conv2d":
                    tirtocs.translate_ethosu_conv2d,
                    "ethosu_depthwise_conv2d":
                    tirtocs.translate_ethosu_depthwise_conv2d,
                }
                if op in ops_with_weights:
                    npu_op, _ = ops_with_weights[op](stmt)

                    # Encode the weights
                    weights_buffer = npu_op.weights[0].address.buffer
                    if weights_buffer in copy_map:
                        weights_buffer = copy_map[weights_buffer]

                    # In case of U65 512 mac variant the weights are split across two cores
                    # and need to be encoded separately
                    weights2_buffer = (
                        npu_op.weights[1].address.buffer if accel_config
                        == vapi.NpuAccelerator.Ethos_U65_512 else None)
                    if weights2_buffer in copy_map:
                        weights2_buffer = copy_map[weights2_buffer]

                    new_weights, split_idx = _encode_weights_or_bias(
                        weights_buffer, weights2_buffer, stmt, _encode_weights)
                    _declare_constant_buffer(weights_buffer, new_weights,
                                             split_idx)

                    # Align the scale_bias to 16 bytes
                    scale_bias_buffer = npu_op.biases[0].address.buffer
                    if scale_bias_buffer in copy_map:
                        scale_bias_buffer = copy_map[scale_bias_buffer]
                    scale_bias2_buffer = (
                        npu_op.biases[1].address.buffer if accel_config
                        == vapi.NpuAccelerator.Ethos_U65_512 else None)
                    if scale_bias2_buffer in copy_map:
                        scale_bias2_buffer = copy_map[scale_bias2_buffer]

                    new_scale_bias, split_idx = _encode_weights_or_bias(
                        scale_bias_buffer, scale_bias2_buffer, stmt,
                        _align_scale_bias)

                    _declare_constant_buffer(scale_bias_buffer, new_scale_bias,
                                             split_idx)

        tvm.tir.stmt_functor.post_order_visit(stmt, _visit)

        return {
            "copied_buffers": copied_buffers,
            "constant_buffer_replacements": constant_buffer_replacements,
        }
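
The even/odd channel interleave used by _encode_weights_or_bias for the two-core Ethos-U65 512 case can be illustrated with plain numpy; the identity "encoder" below stands in for _encode_weights or _align_scale_bias, and all names and values are illustrative:

import numpy as np

def encode_func(_stmt, const):
    # Stand-in for the real encoder: just flatten the constant.
    return const.flatten()

constant = np.arange(6 * 4).reshape(6, 4)          # 6 channels, 4 values each
channels = constant.shape[0]
split_const = np.split(constant, channels, axis=0)

const_to_encode = np.concatenate(split_const[0::2], axis=0)   # channels 0, 2, 4
new_const = encode_func(None, const_to_encode)
split_idx = len(new_const)                                    # first core's data ends here

const2_to_encode = np.concatenate(split_const[1::2], axis=0)  # channels 1, 3, 5
new_const2 = encode_func(None, const2_to_encode)
new_const = np.append(new_const, new_const2)

assert split_idx == 12 and len(new_const) == 24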