Example #1
def slice_nibbles(data: bytes, start_nibble: int, size: int = 1) -> int:
    """Slice out an integer value from data, indexed by nibble instead of byte.

    This function is only designed to work with current instruction formats. It
    makes a number of assumptions about byte order and positioning for these
    specific cases.
    """
    if size == 1:
        # Single nibble
        return int((data[start_nibble // 2] >> (((start_nibble + 1) % 2) * 4))
                   & 0xF)
    elif size == 2:
        # Single byte, assuming byte-alignment
        return data[start_nibble // 2]
    elif size == 4:
        # Normal 2-byte value, assuming byte-alignment
        return (data[start_nibble // 2] << 8) + data[start_nibble // 2 + 1]
    elif size == 8 or size == 16:
        # The 2-byte values are ordered from low to high
        res = 0
        for i, nibble in enumerate(range(start_nibble, start_nibble + size,
                                         4)):
            res += (
                (data[nibble // 2] << 8) + data[nibble // 2 + 1]) << (i * 16)
        return res
    else:
        log_error(
            f"slice_nibbles called with unexpected size: {size}. Returning 0")
        return 0
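
For reference, here is how the nibble addressing above plays out on a small buffer. This is a usage sketch with made-up bytes, not code from the plugin; it only exercises the sizes the function explicitly supports.

data = bytes.fromhex("12345678")

assert slice_nibbles(data, 0) == 0x1           # high nibble of the first byte
assert slice_nibbles(data, 1) == 0x2           # low nibble of the first byte
assert slice_nibbles(data, 2, size=2) == 0x34  # byte-aligned single byte
assert slice_nibbles(data, 0, size=4) == 0x1234
# 2-byte words are stored low word first, so the second word ends up in the high bits
assert slice_nibbles(data, 0, size=8) == 0x56781234
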
Example #2
def parse_proto_ids(
    self, data: bytes, size: int, offset_to_section: FileOffset
) -> None:
    # Skip any padding so reads start on a 4-byte boundary in the file
    i_offset = (4 - offset_to_section) % 4
    # Each proto_id_item is 12 bytes: shorty_idx, return_type_idx, parameters_off
    self.proto_ids: List[DexProtoId] = [
        DexProtoId(
            shorty=self.strings[self._parse_uint(data[i : i + 4])],
            return_type=self.type_ids[self._parse_uint(data[i + 4 : i + 8])],
            parameters=self.type_lists[
                cast(FileOffset, self._parse_uint(data[i + 8 : i + 12]))
            ]
            if self._parse_uint(data[i + 8 : i + 12])
            else list(),
        )
        for i in range(i_offset, size * 12 + i_offset, 12)
    ]
    for proto in self.proto_ids:
        if len(proto.shorty) - 1 != len(proto.parameters):
            log_error("Shorty does not match parameters")
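
parse_proto_ids walks proto_id_item records, which the DEX format lays out as three unsigned 32-bit fields (shorty_idx, return_type_idx, parameters_off) packed into 12 bytes. Below is a minimal standalone sketch of that raw layout, assuming a little-endian file and ignoring the alignment adjustment; ProtoIdItem and read_proto_id_items are illustrative names, not part of the plugin.

from struct import unpack_from
from typing import List, NamedTuple

class ProtoIdItem(NamedTuple):
    shorty_idx: int       # index into string_ids (short-form descriptor)
    return_type_idx: int  # index into type_ids
    parameters_off: int   # file offset of a type_list, or 0 if there are no parameters

def read_proto_id_items(data: bytes, size: int) -> List[ProtoIdItem]:
    # Each record is three little-endian uints packed into 12 bytes
    return [ProtoIdItem(*unpack_from("<III", data, i * 12)) for i in range(size)]
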
Example #3
def make_strings(self, data: bytes) -> None:
    self.strings: List[str] = list()
    for string_data_off in self.string_ids:
        # utf16_size is a ULEB128 prefix; off is the number of bytes it occupies
        utf16_size, off = parse_uleb128(data[string_data_off : string_data_off + 5])
        try:
            string, string_size_off = parse_mutf8(data[string_data_off + off :])
        except UnicodeDecodeError:
            # This should never be reached
            t = data[
                string_data_off
                + off : string_data_off
                + off
                + data[string_data_off + off :].index(b"\x00")
            ]
            log_error(f"Failed to decode MUTF8: {t!r}")
            raise
        self.strings.append(string)
        # Length in UTF-16 code units, minus one for the BOM added by "utf-16"
        plen = len(string.encode("utf-16", "surrogatepass")) // 2 - 1
        if plen != utf16_size:
            # This should never be reached
            log_error(
                f"String {string!r} at string offset {string_data_off}: "
                f"Python length {plen} does not match expected length {utf16_size}"
            )
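
make_strings relies on parse_uleb128, which is not shown here. The sketch below shows how such a decoder typically works, returning the decoded value plus the number of bytes consumed, which is the interface the call above expects; this is an assumption about the helper, not its actual source.

from typing import Tuple

def parse_uleb128(data: bytes) -> Tuple[int, int]:
    # Unsigned LEB128: 7 payload bits per byte, least significant group first,
    # continuation bit (0x80) set on every byte except the last. DEX caps the
    # encoding at 5 bytes, which is why the caller slices off 5 bytes above.
    value = 0
    for i, byte in enumerate(data[:5]):
        value |= (byte & 0x7F) << (i * 7)
        if not byte & 0x80:
            return value, i + 1
    raise ValueError("ULEB128 value did not terminate within 5 bytes")
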
Example #4
def disassemble_pseudoinstructions(data: bytes,
                                   addr: "FileOffset") -> PseudoInstructions:
    # Static variable (stored on disassemble so the instruction table is only loaded once)
    if "insns" not in disassemble.__dict__:
        disassemble.insns = load_insns()  # type: ignore[attr-defined]

    pseudoinstructions: PseudoInstructions = cast(PseudoInstructions, dict())
    code_offset = 0
    while code_offset < len(data):
        if data[code_offset + 1] == 0 and data[code_offset] != 0:
            # Pseudo-instruction
            # TODO performance benchmark swapping here vs. doing it once at
            # beginning of function
            data_swapped = endian_swap_shorts(data[code_offset + 2:])
            if data[code_offset] == 1:
                # packed-switch-payload
                size = unpack("<H", data_swapped[:2])[0]
                pseudoinstructions[cast(
                    "FileOffset",
                    addr + code_offset)] = SmaliPackedSwitchPayload(
                        _total_size=size * 4 + 8,
                        size=size,
                        first_key=unpack("<i", data_swapped[2:6])[0],
                        targets=[
                            unpack("<i", data_swapped[i:i + 4])[0]
                            for i in range(6, 6 + size * 4, 4)
                        ],
                    )
                code_offset += size * 4 + 8
            elif data[code_offset] == 2:
                # sparse-switch-payload
                size = unpack("<H", data_swapped[:2])[0]
                pseudoinstructions[cast(
                    "FileOffset",
                    addr + code_offset)] = SmaliSparseSwitchPayload(
                        _total_size=size * 8 + 4,
                        size=size,
                        keys=[
                            unpack("<i", data_swapped[i:i + 4])[0]
                            for i in range(2, 2 + size * 4, 4)
                        ],
                        targets=[
                            unpack("<i", data_swapped[i:i + 4])[0]
                            for i in range(2 + size * 4, 2 + size * 8, 4)
                        ],
                    )
                code_offset += size * 8 + 4
            elif data[code_offset] == 3:
                # fill-array-data-payload
                element_width = unpack("<H", data_swapped[:2])[0]
                size = unpack("<I", data_swapped[2:6])[0]
                pseudoinstructions[cast(
                    "FileOffset",
                    addr + code_offset)] = SmaliFillArrayDataPayload(
                        _total_size=((size * element_width + 1) // 2) * 2 + 8,
                        element_width=element_width,
                        size=size,
                        data=data_swapped[6:8 + (
                            (element_width * size + 1) // 2) * 2],
                    )
                code_offset += ((size * element_width + 1) // 2) * 2 + 8
            else:
                log_error(
                    f"Unknown pseudoinstruction {data[code_offset:code_offset+2]!r} at {addr + code_offset} in code block at {addr}"
                )
                code_offset += 2
        else:
            # Normal instruction
            insn_info = disassemble.insns[data[
                code_offset + 1]]  # type: ignore[attr-defined]
            code_offset += insn_info.fmt.insn_len * 2
    return pseudoinstructions
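
The pseudo-instruction parser above depends on endian_swap_shorts, which is not reproduced here. A plausible sketch follows, assuming the helper simply swaps the two bytes of every 16-bit code unit (Dalvik bytecode is a stream of 16-bit units) and passes any trailing odd byte through unchanged.

def endian_swap_shorts(data: bytes) -> bytes:
    # Swap each adjacent byte pair; a trailing odd byte is left as-is
    swapped = bytearray(data)
    for i in range(0, len(swapped) - 1, 2):
        swapped[i], swapped[i + 1] = swapped[i + 1], swapped[i]
    return bytes(swapped)
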
Example #5
def disassemble(df: "DexFile", data: bytes,
                addr: "FileOffset") -> Tuple[List[InstructionTextToken], int]:
    # Static variable
    if "insns" not in disassemble.__dict__:
        # https://github.com/python/mypy/issues/708
        disassemble.insns = load_insns()  # type: ignore[attr-defined]

    if len(data) < 2:
        log_warn(
            f"Trying to disassemble data of length {len(data)} at {addr}: {data!r}"
        )
        # Fun fact: if you return -1 here, binja segfaults
        return [], 0

    # Handle pseudo-instructions first
    if data[0] == 0 and data[1] != 0:
        if data[1] == 1:
            # packed-switch
            ps = cast(SmaliPackedSwitchPayload, df.pseudoinstructions[addr])
            text = f".packed-switch {hex(ps.first_key)}\n"
            text += "".join([
                f"        :pswitch_offset_{target:x}\n"
                for target in ps.targets
            ])
            text += "    .end packed-switch"
        elif data[1] == 2:
            # sparse-switch
            # FIXME why do these casts not work?
            ps = cast(SmaliSparseSwitchPayload, df.pseudoinstructions[addr])
            text = ".sparse-switch\n"
            text += "".join([
                f"        {hex(ps.keys[i])} -> :sswitch_offset_{ps.targets[i]:x}\n"
                for i in range(ps.size)
            ])
            text += "    .end sparse-switch"
        elif data[1] == 3:
            ps = cast(SmaliFillArrayDataPayload, df.pseudoinstructions[addr])
            text = f"pseudo-instruction: {ps}"
        else:
            raise ValueError(f"Invalid pseudo-instruction with type {data[1]}")
        return (
            [
                InstructionTextToken(
                    token_type=InstructionTextTokenType.InstructionToken,
                    text=text,
                ),
            ],
            df.pseudoinstructions[addr]._total_size,
        )

    # Now handle normal instructions
    tokens = list()
    insn_info = disassemble.insns[data[0]]  # type: ignore[attr-defined]
    tokens.append(
        InstructionTextToken(InstructionTextTokenType.InstructionToken,
                             insn_info.mnemonic))

    data_to_parse = endian_swap_shorts(data[:2 * insn_info.fmt.insn_len])
    if len(data_to_parse) != insn_info.fmt.insn_len * 2:
        log_error(
            "Disassembly failed: too few instruction bytes available to parse"
        )
        return list(), insn_info.fmt.insn_len * 2
    args = parse_with_format(data_to_parse, insn_info.fmt.format_)
    if "r" in insn_info._formatid:
        # Range instructions
        args["N"] = args["A"] + args["C"] - 1

    # Fix up syntax
    if insn_info._formatid == "35c":
        # 35c is weird for a couple reasons
        # 1. Its syntax uses the generic placeholder "kind" instead of the
        #    actual name of the constant pool it indexes into
        # 2. It forgets about "kind" for A=5 and lists them all out
        m = re.search(r"\s([a-z]+)@", insn_info.syntax)
        if m is None:
            log_error(f"Failed to parse 35c at {addr}")
            kind = "kind"  # Fall back to the placeholder so we still render something
        else:
            kind = m.group(1)
        if args["A"] == 5:
            syntax = f"{{vC, vD, vE, vF, vG}}, {kind}@BBBB"
        elif args["A"] == 4:
            syntax = f"{{vC, vD, vE, vF}}, {kind}@BBBB"
        elif args["A"] == 3:
            syntax = f"{{vC, vD, vE}}, {kind}@BBBB"
        elif args["A"] == 2:
            syntax = f"{{vC, vD}}, {kind}@BBBB"
        elif args["A"] == 1:
            syntax = f"{{vC}}, {kind}@BBBB"
        elif args["A"] == 0:
            syntax = f"{{}}, {kind}@BBBB"
        else:
            log_error(f"Failed to parse syntax for 35c instruction at {addr}")
            syntax = "error (35c)"
    elif "[A=" in insn_info.fmt.syntax:
        for line in insn_info.fmt.syntax.split("[A="):
            line = line.strip()
            if line and line[0] == str(args["A"]):
                syntax = line[6:]
                break
        else:
            log_error(f"Failed to parse syntax for instruction at {addr}")
            syntax = "error"
    else:
        syntax = insn_info.syntax

    for word in syntax.split(" "):
        if not word or word.isspace():
            continue
        tokens += tokenize_syntax(df, word, args)

    return tokens, insn_info.fmt.insn_len * 2
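
disassemble delegates operand extraction to parse_with_format, which maps each placeholder letter in a spec-style format string (for example "AA|op BBBB") onto the nibbles of the already byte-swapped instruction words. Here is a minimal sketch of that idea, assuming one nibble per placeholder letter, "op" standing for the opcode byte, and "Ø" for a literal zero nibble; the plugin's real parser may differ in its details.

from typing import Dict

def parse_with_format(data: bytes, format_: str) -> Dict[str, int]:
    args: Dict[str, int] = {}
    fmt = format_.replace("|", "").replace(" ", "")
    nibble = 0
    i = 0
    while i < len(fmt):
        if fmt[i : i + 2] == "op":
            nibble += 2  # Skip the two nibbles of the opcode byte
            i += 2
            continue
        if fmt[i] != "Ø":
            value = (data[nibble // 2] >> (0 if nibble % 2 else 4)) & 0xF
            # Repeated letters accumulate, so "BBBB" builds up a 16-bit value
            args[fmt[i]] = args.get(fmt[i], 0) * 16 + value
        nibble += 1
        i += 1
    return args
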
Example #6
def tokenize_syntax(df: "DexFile", word: str,
                    args: Dict[str, int]) -> List[InstructionTextToken]:
    tokens = list()
    tokens.append(InstructionTextToken(InstructionTextTokenType.TextToken,
                                       " "))

    # Check for prefixes and suffixes
    trailing_comma = False
    trailing_curly_brace = False
    if word[-1] == ",":
        trailing_comma = True
        word = word[:-1]
    if word[-1] == "}":  # Needs to be after ',' check
        trailing_curly_brace = True
        word = word[:-1]
    if word[0] == "{":
        tokens.append(
            InstructionTextToken(InstructionTextTokenType.TextToken, "{"))
        word = word[1:]

    # Format operand with numbers where the placeholders are
    word_formatted = format_args_with_syntax(args, word)

    # Add operand token
    if word_formatted == "":
        # {}
        pass
    elif word_formatted[0] == "v":
        # Register e.g. v01
        val = int(word_formatted[1:], 16)
        if val >= 256:
            # TODO add link to issue. See comment in Smali
            log_warn(
                f"Rendering v{val}, but Binary Ninja only knows about registers up to 255 for analysis."
            )
        tokens.append(
            InstructionTextToken(InstructionTextTokenType.RegisterToken,
                                 f"v{val}"))
    elif word_formatted[:2] == "#+":
        # Literal e.g. #+0001
        tokens.append(
            InstructionTextToken(InstructionTextTokenType.IntegerToken,
                                 hex(int(word_formatted[2:], 16))))
    elif "@" in word_formatted:
        # Lookup value e.g. call_site@0001
        # Possible lookup types: call_site, field, method, method_handle, proto, string, type
        lookup_type, lookup_index_str = word_formatted.split("@")
        lookup_index = int(lookup_index_str, 16)
        if lookup_type == "call_site":
            log_warn(lookup_type + " isn't implemented yet")
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     word_formatted))
        elif lookup_type == "field":
            field = df.field_ids[lookup_index]
            # Class name
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     field.class_))
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken, "->"))
            # Field name
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     field.name))
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken, ":"))
            # Type
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     field.type_))
        elif lookup_type == "meth":
            meth = df.method_ids[lookup_index]
            # Class and method names
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     meth.class_))
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken, "->"))

            if meth._insns_off is not None:
                tokens.append(
                    InstructionTextToken(
                        InstructionTextTokenType.PossibleAddressToken,
                        meth.name,
                        value=meth._insns_off,
                    ))
            else:
                tokens.append(
                    InstructionTextToken(InstructionTextTokenType.TextToken,
                                         meth.name))
            # Parameters
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken, "("))
            for param in meth.proto.parameters:
                tokens.append(
                    InstructionTextToken(InstructionTextTokenType.TextToken,
                                         param))
            # if meth.proto.parameters:
            #     # Remove trailing semicolon
            #     tokens.pop()
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken, ")"))
            # Return type
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     meth.proto.return_type))
        elif lookup_type == "method_handle":
            log_warn(lookup_type + " isn't implemented yet")
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     word_formatted))
        elif lookup_type == "proto":
            log_warn(lookup_type + " isn't implemented yet")
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     word_formatted))
        elif lookup_type == "string":
            string_ = df.strings[lookup_index]
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken, '"'))
            tokens.append(
                # Escape e.g. \n -> \\n or binja will render a literal newline
                InstructionTextToken(
                    InstructionTextTokenType.TextToken,
                    string_.encode("unicode-escape").decode(),
                ))
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken, '"'))
        elif lookup_type == "type":
            type_ = df.type_ids[lookup_index]
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     type_))
        else:
            log_error(f"Unknown lookup type: {word_formatted}")
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     word_formatted))
    elif word_formatted[0] == "+":
        # Address offset e.g. +0011
        if int(word_formatted[1:], 16) >= 0:
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken, "+"))
        tokens.append(
            InstructionTextToken(InstructionTextTokenType.PossibleAddressToken,
                                 word_formatted[1:]))
    elif word_formatted == "..":
        tokens.append(
            InstructionTextToken(InstructionTextTokenType.TextToken, ".."))
    else:
        # Other tokens. Investigate these
        log_warn(
            f'Formatting unknown token with syntax: "{word}": {word_formatted}'
        )
        tokens.append(
            InstructionTextToken(InstructionTextTokenType.TextToken,
                                 word_formatted))

    # Add suffixes
    if trailing_curly_brace:
        tokens.append(
            InstructionTextToken(InstructionTextTokenType.TextToken, "}"))
    if trailing_comma:
        tokens.append(
            InstructionTextToken(
                InstructionTextTokenType.OperandSeparatorToken, ","))
    return tokens
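
tokenize_syntax hands the raw syntax word to format_args_with_syntax before classifying it. The sketch below is one way that helper could work, consistent with how its output is consumed here (register numbers, literals, and lookup indices all arrive as hex digits); the regex-based substitution is an assumption, not the plugin's actual implementation.

import re
from typing import Dict

def format_args_with_syntax(args: Dict[str, int], word: str) -> str:
    # Replace each run of the same placeholder letter (e.g. "BBBB" in
    # "meth@BBBB") with the operand value, zero-padded in hex to the same width
    def substitute(match: "re.Match[str]") -> str:
        letter = match.group(0)[0]
        if letter not in args:
            return match.group(0)  # Leave unknown placeholders untouched
        return f"{args[letter]:0{len(match.group(0))}x}"

    return re.sub(r"([A-Z])\1*", substitute, word)
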
Example #7
    def __init__(self, data: bytes) -> None:
        endian_bytes = data[40:44]
        if endian_bytes == b"\x12\x34\x56\x78":
            self.endianness = Endianness.BigEndian
        elif endian_bytes == b"\x78\x56\x34\x12":
            self.endianness = Endianness.LittleEndian
        else:
            raise ValueError(f"Invalid endianness found: {endian_bytes!r}")
        if self.endianness == Endianness.BigEndian:
            # It is likely that these do not exist at all, but who knows
            log_warn(
                "This is a big-endian file. The author was unable to find one of these to test with, so there will probably be errors. Please open an issue with a copy of this file!"
            )

        map_off = self._parse_uint(data[52:56])
        map_size = self._parse_uint(data[map_off : map_off + 4])

        # Parse map list items. First we collect them all, and then we parse
        # them in an order that satisfies dependency relationships. For
        # example, string_ids/strings need to be parsed first, and type_ids
        # need to be parsed before type_lists, which need to be parsed before
        # protos. Strings are the first items in the map list, but protos come
        # before type_lists, so we can't just go in order.
        map_list = dict()
        for i in range(map_off + 4, 4 + map_off + map_size * 12, 12):
            item_type = self._parse_ushort(data[i : i + 2])
            item_size = self._parse_uint(data[i + 4 : i + 8])
            item_offset = cast(FileOffset, self._parse_uint(data[i + 8 : i + 12]))
            # log_debug(f'found type: "{item_type}", "{MapType(item_type).name}"')
            map_list[item_type] = MapListItem(size=item_size, offset=item_offset)

        # Ignore sections we don't need to reparse
        map_list.pop(MapType.TYPE_HEADER_ITEM)
        # The map list is what this part is parsing. No recursion
        map_list.pop(MapType.TYPE_MAP_LIST)

        # string_ids and strings
        mi = map_list.pop(MapType.TYPE_STRING_ID_ITEM)
        self.parse_string_ids(
            data[mi.offset : mi.offset + 4 * mi.size], mi.size, mi.offset
        )
        self.make_strings(data)
        del self.string_ids
        map_list.pop(MapType.TYPE_STRING_DATA_ITEM)  # Already handled

        # Then, type_ids and type_lists
        mi = map_list.pop(MapType.TYPE_TYPE_ID_ITEM)
        self.parse_type_ids(
            data[mi.offset : mi.offset + 4 * mi.size], mi.size, mi.offset
        )
        try:
            mi = map_list.pop(MapType.TYPE_TYPE_LIST)
            self.parse_type_lists(data[mi.offset :], mi.size, mi.offset)
        except KeyError:
            log_warn("No type list section")

        # Need proto ids before method ids and both method ids and field
        # ids before class data before class definitions
        mi = map_list.pop(MapType.TYPE_PROTO_ID_ITEM)
        self.parse_proto_ids(
            data[mi.offset : mi.offset + 12 * mi.size], mi.size, mi.offset
        )
        mi = map_list.pop(MapType.TYPE_METHOD_ID_ITEM)
        self.parse_method_ids(data[mi.offset :], mi.size, mi.offset)
        try:
            mi = map_list.pop(MapType.TYPE_FIELD_ID_ITEM)
            self.parse_field_ids(
                data[mi.offset : mi.offset + 8 * mi.size], mi.size, mi.offset
            )
        except KeyError:
            log_warn("No field id section.")
        mi = map_list.pop(MapType.TYPE_CODE_ITEM)
        self.parse_code_items(data[mi.offset :], mi.size, mi.offset)
        mi = map_list.pop(MapType.TYPE_CLASS_DATA_ITEM)
        self.parse_class_data(data[mi.offset :], mi.size, mi.offset)
        del self.code_items

        # Need encoded_array_items before class_defs
        try:
            mi = map_list.pop(MapType.TYPE_ENCODED_ARRAY_ITEM)
            self.parse_encoded_array_items(data[mi.offset :], mi.size, mi.offset)
        except KeyError:
            log_warn("No encoded array section.")
        del self.proto_ids

        # Rest are in order of MapType constant
        mi = map_list.pop(MapType.TYPE_CLASS_DEF_ITEM)
        self.parse_class_defs(
            data[mi.offset : mi.offset + 32 * mi.size], mi.size, mi.offset
        )
        try:
            del self.type_lists
            del self.class_data_items
        except AttributeError:
            pass

        try:
            mi = map_list.pop(MapType.TYPE_CALL_SITE_ID_ITEM)
            self.parse_call_site_ids(
                data[mi.offset : mi.offset + 4 * mi.size], mi.size, mi.offset
            )
        except KeyError:
            log_warn("No call site id section")

        try:
            mi = map_list.pop(MapType.TYPE_METHOD_HANDLE_ITEM)
            self.parse_method_handles(
                data[mi.offset : mi.offset + 8 * mi.size], mi.size, mi.offset
            )
        except KeyError:
            log_warn("No method handle section")

        # TODO annotations
        try:
            mi = map_list.pop(MapType.TYPE_ANNOTATION_ITEM)
            mi = map_list.pop(MapType.TYPE_ANNOTATIONS_DIRECTORY_ITEM)
            # self.parse_annotation_set_refs(data[mi.offset:mi.offset+4+mi.size*4], mi.size)
            mi = map_list.pop(MapType.TYPE_ANNOTATION_SET_ITEM)
        except KeyError:
            log_warn("No annotations")
        try:
            mi = map_list.pop(MapType.TYPE_ANNOTATION_SET_REF_LIST)
            # self.parse_annotation_sets(data[mi.offset:mi.offset+4+mi.size*4], mi.size)
        except KeyError:
            log_warn("No annotation set ref list")

        # TODO debug info
        try:
            mi = map_list.pop(MapType.TYPE_DEBUG_INFO_ITEM)
        except KeyError:
            log_warn("No debug info items")

        for item_type in map_list:
            log_error(f"unknown type {hex(item_type)}")
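
The constructor above leans on _parse_uint and _parse_ushort, which are not shown. A plausible sketch follows, assuming they just read fixed-width unsigned integers in the byte order detected from the header; the method names match the calls above, but these bodies are illustrative.

    def _parse_uint(self, data: bytes) -> int:
        # 4-byte unsigned integer in the file's byte order
        order = "little" if self.endianness == Endianness.LittleEndian else "big"
        return int.from_bytes(data[:4], order)

    def _parse_ushort(self, data: bytes) -> int:
        # 2-byte unsigned integer in the file's byte order
        order = "little" if self.endianness == Endianness.LittleEndian else "big"
        return int.from_bytes(data[:2], order)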