def slice_nibbles(data: bytes, start_nibble: int, size: int = 1) -> int:
    """Slice out integer value of bytes indexed by nibble instead of byte.

    This function is only designed to work with current instruction
    formats. It makes a number of assumptions about byte order and
    positioning for these specific cases.
    """
    if size == 1:
        # Single nibble
        return int((data[start_nibble // 2] >> (((start_nibble + 1) % 2) * 4)) & 0xF)
    elif size == 2:
        # Single byte, assuming byte-alignment
        return data[start_nibble // 2]
    elif size == 4:
        # Normal 2-byte value, assuming byte-alignment
        return (data[start_nibble // 2] << 8) + data[start_nibble // 2 + 1]
    elif size == 8 or size == 16:
        # The 2-byte values are ordered from low to high
        res = 0
        for i, nibble in enumerate(range(start_nibble, start_nibble + size, 4)):
            res += ((data[nibble // 2] << 8) + data[nibble // 2 + 1]) << (i * 16)
        return res
    else:
        log_error(
            f"slice_nibbles called with unexpected size: {size}. Returning 0")
        return 0
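# Worked example (illustrative only, not used by the plugin): for data = b"\x12\x34",
# slice_nibbles(data, 0) == 0x1 and slice_nibbles(data, 1) == 0x2 (single nibbles),
# slice_nibbles(data, 0, size=2) == 0x12 (one byte), and
# slice_nibbles(data, 0, size=4) == 0x1234 (a big-endian two-byte value).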
def parse_proto_ids(
    self, data: bytes, size: int, offset_to_section: FileOffset
) -> None:
    i_offset = (4 - offset_to_section) % 4
    self.proto_ids: List[DexProtoId] = [
        DexProtoId(
            shorty=self.strings[self._parse_uint(data[i : i + 4])],
            return_type=self.type_ids[self._parse_uint(data[i + 4 : i + 8])],
            parameters=self.type_lists[
                cast(FileOffset, self._parse_uint(data[i + 8 : i + 12]))
            ]
            if self._parse_uint(data[i + 8 : i + 12])
            else list(),
        )
        for i in range(i_offset, size * 12 + i_offset, 12)
    ]
    for proto in self.proto_ids:
        if len(proto.shorty) - 1 != len(proto.parameters):
            log_error("Shorty does not match parameters")
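# proto_id_item layout (per the dex format), 12 bytes per entry: shorty_idx (uint,
# index into string_ids), return_type_idx (uint, index into type_ids), and
# parameters_off (uint, file offset of a type_list, or 0 if there are no
# parameters). This is why parse_proto_ids above strides by 12 and reads three
# consecutive 4-byte fields.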
def make_strings(self, data: bytes) -> None:
    self.strings: List[str] = list()
    for string_data_off in self.string_ids:
        utf16_size, off = parse_uleb128(data[string_data_off : string_data_off + 5])
        try:
            string, string_size_off = parse_mutf8(data[string_data_off + off :])
        except UnicodeDecodeError:
            # This should never be reached
            t = data[
                string_data_off + off : string_data_off + off
                + data[string_data_off + off :].index(b"\x00")
            ]
            log_error(f"Failed to decode MUTF8: {t!r}")
            raise
        self.strings.append(string)
        # "utf-16" prepends a BOM code unit, hence the "- 1"
        plen = len(string.encode("utf-16", "surrogatepass")) // 2 - 1
        if plen != utf16_size:
            # This should never be reached
            log_error(
                f"String {string!r} at string offset {string_data_off}: Python length {plen} does not match expected length {utf16_size}"
            )
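# ULEB128 refresher (illustrative; the real decoding lives in parse_uleb128): each
# byte contributes 7 bits, least-significant group first, and the high bit of a
# byte marks continuation. For example, b"\x2a" decodes to 42 in one byte, while
# b"\x80\x01" decodes to 128 in two bytes. A uleb128 value in dex is at most five
# bytes long, which is why make_strings only slices five bytes for the length.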
def disassemble_pseudoinstructions(data: bytes, addr: "FileOffset") -> PseudoInstructions:
    # Static variable
    if "insns" not in disassemble.__dict__:
        disassemble.insns = load_insns()  # type: ignore[attr-defined]

    pseudoinstructions: PseudoInstructions = cast(PseudoInstructions, dict())
    code_offset = 0
    while code_offset < len(data):
        if data[code_offset + 1] == 0 and data[code_offset] != 0:
            # Pseudo-instruction
            # TODO performance benchmark swapping here vs. doing it once at
            # beginning of function
            data_swapped = endian_swap_shorts(data[code_offset + 2:])
            if data[code_offset] == 1:
                # packed-switch-payload
                size = unpack("<H", data_swapped[:2])[0]
                pseudoinstructions[cast(
                    "FileOffset", addr + code_offset)] = SmaliPackedSwitchPayload(
                        _total_size=size * 4 + 8,
                        size=size,
                        first_key=unpack("<i", data_swapped[2:6])[0],
                        targets=[
                            unpack("<i", data_swapped[i:i + 4])[0]
                            for i in range(6, 6 + size * 4, 4)
                        ],
                    )
                code_offset += size * 4 + 8
            elif data[code_offset] == 2:
                # sparse-switch-payload
                size = unpack("<H", data_swapped[:2])[0]
                pseudoinstructions[cast(
                    "FileOffset", addr + code_offset)] = SmaliSparseSwitchPayload(
                        _total_size=size * 8 + 4,
                        size=size,
                        keys=[
                            unpack("<i", data_swapped[i:i + 4])[0]
                            for i in range(2, 2 + size * 4, 4)
                        ],
                        targets=[
                            unpack("<i", data_swapped[i:i + 4])[0]
                            for i in range(2 + size * 4, 2 + size * 8, 4)
                        ],
                    )
                code_offset += size * 8 + 4
            elif data[code_offset] == 3:
                # fill-array-data-payload
                element_width = unpack("<H", data_swapped[:2])[0]
                size = unpack("<I", data_swapped[2:6])[0]
                pseudoinstructions[cast(
                    "FileOffset", addr + code_offset)] = SmaliFillArrayDataPayload(
                        _total_size=((size * element_width + 1) // 2) * 2 + 8,
                        element_width=element_width,
                        size=size,
                        data=data_swapped[6:8 + ((element_width * size + 1) // 2) * 2],
                    )
                code_offset += ((size * element_width + 1) // 2) * 2 + 8
            else:
                log_error(
                    f"Unknown pseudoinstruction {data[code_offset:code_offset+2]!r} at {addr + code_offset} in code block at {addr}"
                )
                code_offset += 2
        else:
            # Normal instruction
            insn_info = disassemble.insns[data[code_offset + 1]]  # type: ignore[attr-defined]
            code_offset += insn_info.fmt.insn_len * 2
    return pseudoinstructions
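# Payload layouts (per the Dalvik bytecode format), sizes in bytes, which is where
# the _total_size arithmetic above comes from:
#   packed-switch-payload:   ident(2) + size(2) + first_key(4) + targets(4*size)          = size*4 + 8
#   sparse-switch-payload:   ident(2) + size(2) + keys(4*size) + targets(4*size)          = size*8 + 4
#   fill-array-data-payload: ident(2) + element_width(2) + size(4) + data(padded to even) = ((size*element_width + 1)//2)*2 + 8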
def disassemble(df: "DexFile", data: bytes, addr: "FileOffset") -> Tuple[List[InstructionTextToken], int]:
    # Static variable
    if "insns" not in disassemble.__dict__:
        # https://github.com/python/mypy/issues/708
        disassemble.insns = load_insns()  # type: ignore[attr-defined]

    if len(data) < 2:
        log_warn(
            f"Trying to disassemble data of length {len(data)} at {addr}: {data!r}"
        )
        # Fun fact: if you return -1 here, binja segfaults
        return [], 0

    # Handle pseudo-instructions first
    if data[0] == 0 and data[1] != 0:
        if data[1] == 1:
            # packed-switch
            ps = cast(SmaliPackedSwitchPayload, df.pseudoinstructions[addr])
            text = f".packed-switch {hex(ps.first_key)}\n"
            text += "".join([
                f" :pswitch_offset_{target:x}\n" for target in ps.targets
            ])
            text += " .end packed-switch"
        elif data[1] == 2:
            # sparse-switch
            # FIXME why do these casts not work?
            ps = cast(SmaliSparseSwitchPayload, df.pseudoinstructions[addr])
            text = ".sparse-switch\n"
            text += "".join([
                f" {hex(ps.keys[i])} -> :sswitch_offset_{ps.targets[i]:x}\n"
                for i in range(ps.size)
            ])
            text += " .end sparse-switch"
        elif data[1] == 3:
            ps = cast(SmaliFillArrayDataPayload, df.pseudoinstructions[addr])
            text = f"pseudo-instruction: {ps}"
        else:
            raise ValueError(f"Invalid pseudo-instruction with type {data[1]}")
        return (
            [
                InstructionTextToken(
                    token_type=InstructionTextTokenType.InstructionToken,
                    text=text,
                ),
            ],
            df.pseudoinstructions[addr]._total_size,
        )

    # Now handle normal instructions
    tokens = list()
    insn_info = disassemble.insns[data[0]]  # type: ignore[attr-defined]
    tokens.append(
        InstructionTextToken(InstructionTextTokenType.InstructionToken,
                             insn_info.mnemonic))

    data_to_parse = endian_swap_shorts(data[:2 * insn_info.fmt.insn_len])
    if len(data_to_parse) != insn_info.fmt.insn_len * 2:
        log_error(
            "Disassembly failed: not enough bytes available to parse the full instruction"
        )
        return list(), insn_info.fmt.insn_len * 2
    args = parse_with_format(data_to_parse, insn_info.fmt.format_)

    if "r" in insn_info._formatid:
        # Range instructions
        args["N"] = args["A"] + args["C"] - 1

    # Fix up syntax
    if insn_info._formatid == "35c":
        # 35c is weird for a couple of reasons:
        # 1. It uses "kind" instead of the actual name of the constant-pool kind
        # 2. It forgets about "kind" for A=5 and lists them all out
        m = re.search("\\s([a-z]+)@", insn_info.syntax)
        if m is None:
            log_error(f"Failed to parse 35c at {addr}")
            syntax = "error (35c)"
        else:
            kind = m.group(1)
            if args["A"] == 5:
                syntax = f"{{vC, vD, vE, vF, vG}}, {kind}@BBBB"
            elif args["A"] == 4:
                syntax = f"{{vC, vD, vE, vF}}, {kind}@BBBB"
            elif args["A"] == 3:
                syntax = f"{{vC, vD, vE}}, {kind}@BBBB"
            elif args["A"] == 2:
                syntax = f"{{vC, vD}}, {kind}@BBBB"
            elif args["A"] == 1:
                syntax = f"{{vC}}, {kind}@BBBB"
            elif args["A"] == 0:
                syntax = f"{{}}, {kind}@BBBB"
            else:
                log_error(f"Failed to parse syntax for 35c instruction at {addr}")
                syntax = "error (35c)"
    elif "[A=" in insn_info.fmt.syntax:
        for line in insn_info.fmt.syntax.split("[A="):
            line = line.strip()
            if line and line[0] == str(args["A"]):
                syntax = line[6:]
                break
        else:
            log_error(f"Failed to parse syntax for instruction at {addr}")
            syntax = "error"
    else:
        syntax = insn_info.syntax

    for word in syntax.split(" "):
        if not word or word.isspace():
            continue
        tokens += tokenize_syntax(df, word, args)

    return tokens, insn_info.fmt.insn_len * 2
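# Note on format 35c (per the Dalvik instruction-formats documentation): the code
# units are laid out as A|G|op BBBB F|E|D|C, where A is the argument count (0 to 5),
# BBBB is the constant-pool index, and C..G are the argument registers. The fix-up
# in disassemble() above expands the documented "{vC .. vG}, kind@BBBB" syntax into
# an explicit register list for the decoded value of A.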
def tokenize_syntax(df: "DexFile", word: str, args: Dict[str, int]) -> List[InstructionTextToken]:
    tokens = list()
    tokens.append(InstructionTextToken(InstructionTextTokenType.TextToken, " "))

    # Check for prefixes and suffixes
    trailing_comma = False
    trailing_curly_brace = False
    if word[-1] == ",":
        trailing_comma = True
        word = word[:-1]
    if word[-1] == "}":  # Needs to be after ',' check
        trailing_curly_brace = True
        word = word[:-1]
    if word[0] == "{":
        tokens.append(
            InstructionTextToken(InstructionTextTokenType.TextToken, "{"))
        word = word[1:]

    # Format operand with numbers where the placeholders are
    word_formatted = format_args_with_syntax(args, word)

    # Add operand token
    if word_formatted == "":  # {}
        pass
    elif word_formatted[0] == "v":  # Register e.g. v01
        val = int(word_formatted[1:], 16)
        if val >= 256:
            # TODO add link to issue. See comment in Smali
            log_warn(
                f"Rendering v{val}, but Binary Ninja only knows about registers up to 255 for analysis."
            )
        tokens.append(
            InstructionTextToken(InstructionTextTokenType.RegisterToken,
                                 f"v{val}"))
    elif word_formatted[:2] == "#+":  # Literal e.g. #+0001
        tokens.append(
            InstructionTextToken(InstructionTextTokenType.IntegerToken,
                                 hex(int(word_formatted[2:], 16))))
    elif "@" in word_formatted:  # Lookup value e.g. call_site@0001
        # Possible lookup types: call_site, field, method, method_handle,
        # proto, string, type
        lookup_type, lookup_index_str = word_formatted.split("@")
        lookup_index = int(lookup_index_str, 16)
        if lookup_type == "call_site":
            log_warn(lookup_type + " isn't implemented yet")
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     word_formatted))
        elif lookup_type == "field":
            field = df.field_ids[lookup_index]
            # Class name
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     field.class_))
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken, "->"))
            # Field name
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     field.name))
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken, ":"))
            # Type
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     field.type_))
        elif lookup_type == "meth":
            meth = df.method_ids[lookup_index]
            # Class and method names
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     meth.class_))
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken, "->"))
            if meth._insns_off is not None:
                tokens.append(
                    InstructionTextToken(
                        InstructionTextTokenType.PossibleAddressToken,
                        meth.name,
                        value=meth._insns_off,
                    ))
            else:
                tokens.append(
                    InstructionTextToken(InstructionTextTokenType.TextToken,
                                         meth.name))
            # Parameters
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken, "("))
            for param in meth.proto.parameters:
                tokens.append(
                    InstructionTextToken(InstructionTextTokenType.TextToken,
                                         param))
            # if meth.proto.parameters:
            #     # Remove trailing semicolon
            #     tokens.pop()
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken, ")"))
            # Return type
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     meth.proto.return_type))
        elif lookup_type == "method_handle":
            log_warn(lookup_type + " isn't implemented yet")
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     word_formatted))
        elif lookup_type == "proto":
            log_warn(lookup_type + " isn't implemented yet")
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     word_formatted))
        elif lookup_type == "string":
            string_ = df.strings[lookup_index]
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken, '"'))
            tokens.append(
                # Escape e.g. \n -> \\n or binja will render a literal newline
                InstructionTextToken(
                    InstructionTextTokenType.TextToken,
                    string_.encode("unicode-escape").decode(),
                ))
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken, '"'))
        elif lookup_type == "type":
            type_ = df.type_ids[lookup_index]
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     type_))
        else:
            log_error(f"Unknown lookup type: {word_formatted}")
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken,
                                     word_formatted))
    elif word_formatted[0] == "+":  # Address offset e.g. +0011
        if int(word_formatted[1:], 16) >= 0:
            tokens.append(
                InstructionTextToken(InstructionTextTokenType.TextToken, "+"))
        tokens.append(
            InstructionTextToken(InstructionTextTokenType.PossibleAddressToken,
                                 word_formatted[1:]))
    elif word_formatted == "..":
        tokens.append(
            InstructionTextToken(InstructionTextTokenType.TextToken, ".."))
    else:
        # Other tokens. Investigate these
        log_warn(
            f'Formatting unknown token with syntax: "{word}": {word_formatted}'
        )
        tokens.append(
            InstructionTextToken(InstructionTextTokenType.TextToken,
                                 word_formatted))

    # Add suffixes
    if trailing_curly_brace:
        tokens.append(
            InstructionTextToken(InstructionTextTokenType.TextToken, "}"))
    if trailing_comma:
        tokens.append(
            InstructionTextToken(
                InstructionTextTokenType.OperandSeparatorToken, ","))
    return tokens
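# Example of the tokenization flow above (illustrative; the exact placeholder
# expansion is done by format_args_with_syntax): given the syntax word "vA," and
# args {"A": 2}, the prefix/suffix handling strips the trailing ",", the
# placeholder is presumably filled to something like "v2", which is emitted as a
# RegisterToken, and the "," is re-added as an OperandSeparatorToken.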
def __init__(self, data: bytes) -> None:
    endian_bytes = data[40:44]
    if endian_bytes == b"\x12\x34\x56\x78":
        self.endianness = Endianness.BigEndian
    elif endian_bytes == b"\x78\x56\x34\x12":
        self.endianness = Endianness.LittleEndian
    else:
        raise ValueError(f"Invalid endianness found: {endian_bytes!r}")
    if self.endianness == Endianness.BigEndian:
        # It is likely that these do not exist at all, but who knows
        log_warn(
            "This is a big-endian file. The author was unable to find one of these to test with, so there will probably be errors. Please open an issue with a copy of this file!"
        )

    map_off = self._parse_uint(data[52:56])
    map_size = self._parse_uint(data[map_off : map_off + 4])

    # Parse map list items. First we collect them all, and then we parse
    # them in an order that satisfies dependency relationships. For
    # example, string_ids/strings need to be parsed first, and type_ids
    # need to be parsed before type_lists, which need to be parsed before
    # protos. Strings are the first items in the map list, but protos come
    # before type_lists, so we can't just go in order.
    map_list = dict()
    for i in range(map_off + 4, 4 + map_off + map_size * 12, 12):
        item_type = self._parse_ushort(data[i : i + 2])
        item_size = self._parse_uint(data[i + 4 : i + 8])
        item_offset = cast(FileOffset, self._parse_uint(data[i + 8 : i + 12]))
        # log_debug(f'found type: "{item_type}", "{MapType(item_type).name}"')
        map_list[item_type] = MapListItem(size=item_size, offset=item_offset)

    # Ignore sections we don't need to reparse
    map_list.pop(MapType.TYPE_HEADER_ITEM)
    # The map list is what this part is parsing. No recursion
    map_list.pop(MapType.TYPE_MAP_LIST)

    # string_ids and strings
    mi = map_list.pop(MapType.TYPE_STRING_ID_ITEM)
    self.parse_string_ids(
        data[mi.offset : mi.offset + 4 * mi.size], mi.size, mi.offset
    )
    self.make_strings(data)
    del self.string_ids
    map_list.pop(MapType.TYPE_STRING_DATA_ITEM)  # Already handled

    # Then, type_ids and type_lists
    mi = map_list.pop(MapType.TYPE_TYPE_ID_ITEM)
    self.parse_type_ids(
        data[mi.offset : mi.offset + 4 * mi.size], mi.size, mi.offset
    )
    try:
        mi = map_list.pop(MapType.TYPE_TYPE_LIST)
        self.parse_type_lists(data[mi.offset :], mi.size, mi.offset)
    except KeyError:
        log_warn("No type list section")

    # Need proto ids before method ids, and both method ids and field ids
    # before class data, which is needed before class definitions
    mi = map_list.pop(MapType.TYPE_PROTO_ID_ITEM)
    self.parse_proto_ids(
        data[mi.offset : mi.offset + 12 * mi.size], mi.size, mi.offset
    )
    mi = map_list.pop(MapType.TYPE_METHOD_ID_ITEM)
    self.parse_method_ids(data[mi.offset :], mi.size, mi.offset)
    try:
        mi = map_list.pop(MapType.TYPE_FIELD_ID_ITEM)
        self.parse_field_ids(
            data[mi.offset : mi.offset + 8 * mi.size], mi.size, mi.offset
        )
    except KeyError:
        log_warn("No field id section.")
    mi = map_list.pop(MapType.TYPE_CODE_ITEM)
    self.parse_code_items(data[mi.offset :], mi.size, mi.offset)
    mi = map_list.pop(MapType.TYPE_CLASS_DATA_ITEM)
    self.parse_class_data(data[mi.offset :], mi.size, mi.offset)
    del self.code_items

    # Need encoded_array_items before class_defs
    try:
        mi = map_list.pop(MapType.TYPE_ENCODED_ARRAY_ITEM)
        self.parse_encoded_array_items(data[mi.offset :], mi.size, mi.offset)
    except KeyError:
        log_warn("No encoded array section.")
    del self.proto_ids

    # Rest are in order of MapType constant
    mi = map_list.pop(MapType.TYPE_CLASS_DEF_ITEM)
    self.parse_class_defs(
        data[mi.offset : mi.offset + 32 * mi.size], mi.size, mi.offset
    )
    try:
        del self.type_lists
        del self.class_data_items
    except AttributeError:
        pass
    try:
        mi = map_list.pop(MapType.TYPE_CALL_SITE_ID_ITEM)
        self.parse_call_site_ids(
            data[mi.offset : mi.offset + 4 * mi.size], mi.size, mi.offset
        )
    except KeyError:
        log_warn("No call site id section.")
    try:
        mi = map_list.pop(MapType.TYPE_METHOD_HANDLE_ITEM)
        self.parse_method_handles(
            data[mi.offset : mi.offset + 8 * mi.size], mi.size, mi.offset
        )
    except KeyError:
        log_warn("No method handle section.")

    # TODO annotations
    try:
        mi = map_list.pop(MapType.TYPE_ANNOTATION_ITEM)
        mi = map_list.pop(MapType.TYPE_ANNOTATIONS_DIRECTORY_ITEM)
        # self.parse_annotation_set_refs(data[mi.offset:mi.offset+4+mi.size*4], mi.size)
        mi = map_list.pop(MapType.TYPE_ANNOTATION_SET_ITEM)
    except KeyError:
        log_warn("No annotations")
    try:
        mi = map_list.pop(MapType.TYPE_ANNOTATION_SET_REF_LIST)
        # self.parse_annotation_sets(data[mi.offset:mi.offset+4+mi.size*4], mi.size)
    except KeyError:
        log_warn("No annotation set refs")

    # TODO debug info
    try:
        mi = map_list.pop(MapType.TYPE_DEBUG_INFO_ITEM)
    except KeyError:
        log_warn("No debug info items")

    for item_type in map_list:
        log_error(f"Unknown map list item type {hex(item_type)}")
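# map_list layout (per the dex format): a uint count followed by map_item entries
# of the form { ushort type; ushort unused; uint size; uint offset; }, 12 bytes
# each. That is why the parsing loop in __init__ above strides by 12 and reads the
# item type at +0, the size at +4, and the offset at +8.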