def handle_file(self, fname):
    """
    Unpack the archive at `fname` into the `self.name` directory and index
    its entries.

    When the file starts with the `dex\\n` magic, it is first converted with
    the external `dex2jar` command into `<self.name>/classes-dex2jar.jar`,
    and that JAR is processed instead (a failing conversion raises, because
    of `check=True`).

    Every archive entry is then dispatched on its extension:
      - `.class`: the dotted class name is appended to `self.classes`;
      - `.dex`:   the extracted file is processed recursively;
      - `.proto`: its UTF-8 decoded text is stored in `self.bonus_protos`,
                  keyed by the entry name;
      - `.so`:    the pairs produced by `walk_binary` on the extracted file
                  are merged into `self.bonus_protos`.
    """
    with open(fname, 'rb') as stream:
        is_dex = stream.read(4) == b'dex\n'

    if is_dex:
        converted = self.name + '/classes-dex2jar.jar'
        run(['dex2jar', fname, '-f', '-o', converted, '-e', '/dev/null'],
            stderr=DEVNULL, check=True)
        fname = converted

    with ZipFile(fname) as archive:
        # Everything is extracted up front, so that nested files can be
        # reopened from disk below.
        archive.extractall(self.name)

        for entry in archive.namelist():
            if entry.endswith('.class'):
                dotted_name = entry.replace('/', '.')
                self.classes.append(dotted_name[:-len('.class')])

            elif entry.endswith('.dex'):
                self.handle_file(self.name + '/' + entry)

            elif entry.endswith('.proto'):
                self.bonus_protos[entry] = archive.read(entry).decode('utf8')

            elif entry.endswith('.so'):
                found = walk_binary(self.name + '/' + entry)
                self.bonus_protos.update(found)
def handle_jar(path):
    """
    Scan the classes of a JAR (or of anything else `JarWrapper` accepts)
    for Java Protobuf signatures and extract the definitions found.

    This is a generator. It yields, in order:
      - `('_progress', (message, fraction_or_None))` tuples reporting status;
      - whatever `walk_binary` yields for descriptors embedded in classes;
      - whatever `nest_and_print_to_files` yields for the scraped messages;
      - the items of `jar.bonus_protos`.

    path: path of the input file. Any path not ending in `.jar` is reported
          as a DEX being converted.
    """

    if path.endswith('.jar'):
        yield '_progress', ('Decompressing JAR...', None)
    else:
        yield '_progress', ('Converting DEX to JAR...', None)

    # Method descriptors that do not depend on the class being scanned.
    SIG_NANO = b'([BII)V' # CodedInputByteBufferNano(final byte[] buffer, final int off, final int len)
    SIG_NANO_2 = b'([BI)V' # CodedInputByteBufferNano(final byte[] buffer, final int bufferSize)
    SIG_CALL = b'([BIIZ)V' # private ArrayDecoder(final byte[] buffer, final int offset, final int len, boolean immutable)
    SIG_CALL_2 = b'([BII)V' # private ArrayDecoder(final byte[] buffer, final int offset, final int len)
    SIG_CALL_3 = b'([BIIZL' # CodedInputStream$ArrayDecoder(byte abyte0[], int i, int j, boolean flag, com.google.protobuf.CodedInputStream$1 codedinputstream$1)

    with JarWrapper(path) as jar:
        enums = {}

        pkg_to_codedinputstream = {}
        pkg_to_codedoutputstream = {}
        map_entry_cls = []
        out_additional_cls = []

        pkg_to_j2me_protobuftype = {}

        # First iteration on classes: look for library classes signatures.

        for i, cls in enumerate(jar.classes):
            if i % 10 == 0:
                yield '_progress', ('Scanning Java package contents...', (i / len(jar.classes)) * 0.5)

            pkg = cls[:cls.rfind('.')] if '.' in cls else ''
            binr = jar.read(cls)

            # Search for CodedInputStream/CodedOutputStream

            raw_cls = cls.replace('.', '/').encode('utf8')

            # Handle multiple cases:
            # 1. CodedInputStream, before it was split out in multiple
            #    subclasses (cc8ca5b - oct 2016)
            # 2. CodedInputStream, after it was
            # 3. CodedInputByteBufferNano
            # 4. CodedInputStreamMicro
            #
            # The second case doesn't provide intelligible strings to search
            # for, so we'll use method signatures (the different kinds that
            # can be produced by Proguard) instead.

            SIG_DEF = b'([BIIZ)L%s;' % raw_cls # static CodedInputStream newInstance(final byte[] buf, final int off, final int len, final boolean bufferIsImmutable)
            SIG_DEF_2 = b'([BII)L%s;' % raw_cls # static CodedInputStream newInstance(final byte[] buf, final int off, final int len)

            has_constructor = SIG_DEF in binr or SIG_DEF_2 in binr
            calls_arraydecoder = SIG_CALL in binr or SIG_CALL_2 in binr or SIG_CALL_3 in binr
            is_legit_class = (b'Beginning index' not in binr and
                              b'Number too large' not in binr and
                              b'a byte array' not in binr)

            has_constructor_nano = SIG_NANO in binr or SIG_NANO_2 in binr
            has_relevant_string = b'message contained an invalid tag' in binr
            has_relevant_string_nano = b'is beyond current' in binr
            has_relevant_string_micro = b"when buffer wasn't empty" in binr

            # Try to match CodedOutputStream before CodedInputStream, as it
            # may have common points in signatures but always has a
            # recognizable string.

            has_out_constructor = b'([BII' in binr
            has_out_relevant_string = b'write as much data as' in binr
            has_out_relevant_string_old = b'UTF-8 not supported.' in binr
            has_out_relevant_string_nano = b'Unpaired surrogate at index ' in binr and b'wrap' in binr
            has_out_relevant_string_2 = b'Converting ill-formed UTF-16.' in binr and b'Pos:' not in binr
            is_legit_out_class = b'byte array' not in binr

            if has_out_constructor and (
                ((has_out_relevant_string or has_out_relevant_string_old) and is_legit_out_class) or
                has_out_relevant_string_nano or has_out_relevant_string_2): # CodedOutputStream

                # Several candidates may live in one package: keep them all
                # by suffixing the key until it is unused.
                while pkg in pkg_to_codedoutputstream:
                    pkg += '_'
                pkg_to_codedoutputstream[pkg] = cls

            elif (has_constructor and is_legit_class and (calls_arraydecoder or has_relevant_string)) or \
                 (has_constructor_nano and (has_relevant_string_nano or has_relevant_string_micro)): # CodedInputStream

                while pkg in pkg_to_codedinputstream:
                    pkg += '_'
                pkg_to_codedinputstream[pkg] = cls

            # Other classes that may be called for (de)serializing objects

            elif b'Generated message class' in binr: # GeneratedMessage*
                out_additional_cls.append(cls)

            elif b'is not a primitive type' in binr: # InternalNano
                map_entry_cls.append(cls)

            elif b'Groups are not allowed in maps' in binr or \
                 b'a map entry message.' in binr: # MapEntry*
                map_entry_cls.append(cls)

            # Search for J2ME implementation's ProtoBuf.java

            elif b'Unexp.EOF' in binr:
                code = jar.decomp(cls, True).raw

                # A class can contain the marker string without exposing the
                # expected one-argument constructor; skip it instead of
                # failing the whole scan on a missing match.
                ctor = search(r'public \w+\(([\w.$]+) \w+\)', code)
                if ctor:
                    protobuftype_cls = ctor.group(1)

                    default_consts = {}
                    for prop, const in findall(r'(\w+) = new Boolean\((\w+)\)', code):
                        default_consts[cls + '.' + prop] = const

                    while pkg in pkg_to_j2me_protobuftype:
                        pkg += '_'
                    pkg_to_j2me_protobuftype[pkg] = (protobuftype_cls, default_consts)

        # Only keep input stream classes that have an output stream class
        # registered under the same key.
        for pkg in list(pkg_to_codedinputstream):
            if pkg not in pkg_to_codedoutputstream:
                del pkg_to_codedinputstream[pkg]

        # Second iteration on classes: look for generated classes, that
        # contains method call signatures [1] for libraries we found, or
        # other extractible information.
        #
        # [1] https://docs.oracle.com/javase/specs/jvms/se7/html/jvms-4.html#jvms-4.3

        gen_classes = OrderedDict()
        gen_classes_j2me = OrderedDict()
        had_metadata = set()

        # The byte signatures only depend on the libraries found above, so
        # build them once rather than for every scanned class. Names are
        # encoded as UTF-8, like in the first pass, so that non-ASCII class
        # names do not raise.
        lite_signatures = []
        for impl, in_cls in pkg_to_codedinputstream.items():
            out_cls = pkg_to_codedoutputstream[impl]
            lite_signatures.append((
                in_cls,
                out_cls,
                in_cls.replace('.', '/').encode('utf8'),
                b'(L%s;' % out_cls.replace('.', '/').encode('utf8'),
            ))

        j2me_signatures = [
            (protobuftype_cls,
             consts,
             b'(IILjava/lang/Object;)L%s;' % protobuftype_cls.replace('.', '/').encode('utf8'))
            for protobuftype_cls, consts in pkg_to_j2me_protobuftype.values()
        ]

        for i, cls in enumerate(jar.classes):
            if i % 10 == 0:
                yield '_progress', ('Scanning Java package contents...', (i / len(jar.classes)) * 0.5 + 0.5)

            binr = jar.read(cls)

            # Search for metadata descriptors

            if b'.proto\x12' in binr or b'.protodevel\x12' in binr:
                code = jar.decomp(cls, True).raw
                # Join string literals that were split over several lines
                code = sub(r'",\s+"', '', code, flags=MULTILINE)
                meta = search(r'"(\\n.+?\.proto.+)"', code)
                if meta:
                    # Turn the escaped Java string literal back into bytes
                    meta = meta.group(1).encode('latin1')
                    meta = meta.decode('unicode_escape').encode('latin1')
                    yield from walk_binary(meta)
                    had_metadata.add(cls)

            # Search for signatures common to generated Java classes

            for in_cls, out_cls, in_sig, out_sig in lite_signatures:
                if in_sig in binr and out_sig in binr and \
                   cls not in (in_cls, out_cls):
                    gen_classes[cls] = (in_cls, out_cls)

            # Search for generated J2ME classes

            for protobuftype_cls, consts, j2me_sig in j2me_signatures:
                if j2me_sig in binr and cls != protobuftype_cls:
                    gen_classes_j2me[cls] = (protobuftype_cls, consts)

            # Search for enums

            if b'Ljava/lang/Enum<' in binr[:256]:
                enums[cls] = cls
                if '$' in cls:
                    enums[cls.replace('$', '.')] = cls
                    enums[cls.rsplit('.', 1)[0] + '.' + cls.rsplit('$', 1)[1]] = cls

        # Same mapping as gen_classes, with additional aliases for nested
        # classes where '$' is replaced or dropped.
        gen_classes_nodollar = OrderedDict(gen_classes)
        for cls, pkg in gen_classes.items():
            if '$' in cls:
                gen_classes_nodollar[cls.replace('$', '.')] = pkg
                gen_classes_nodollar[cls.rsplit('.', 1)[0] + '.' + cls.rsplit('$', 1)[1]] = pkg

        # Once we know what classes we should look at, do the actual code
        # scraping and extraction work.

        # These variables will be filled in by extract_* functions:

        msg_path_to_obj = {} # For the class name of a message/enum, its DescriptorProto object
        msg_to_referrers = defaultdict(list) # For a nested message/enum, all message fields that refer to it

        # Call the extraction routine for most implementations

        for i, (cls, (codedinputstream, codedoutputstream)) in enumerate(gen_classes.items()):
            yield '_progress', ('Extracting %s...' % cls, i / len(gen_classes))

            # Skip classes whose outer class already provided an embedded
            # descriptor during the scan.
            if cls.split('$')[0] not in had_metadata:
                extract_lite(jar, cls, enums, gen_classes_nodollar,
                             codedinputstream, codedoutputstream,
                             map_entry_cls, out_additional_cls,
                             msg_path_to_obj, msg_to_referrers)

        # Call the extraction routine for J2ME

        for i, (cls, (protobuftype_cls, consts)) in enumerate(gen_classes_j2me.items()):
            yield '_progress', ('Extracting %s...' % cls, i / len(gen_classes_j2me))

            extract_j2me(jar, cls, enums, gen_classes_j2me, protobuftype_cls,
                         consts, msg_path_to_obj, msg_to_referrers)

        yield '_progress', ('Dumping information to .protos...', None)

        # Merge nested Protobuf messages and write them to files

        yield from nest_and_print_to_files(msg_path_to_obj, msg_to_referrers)

        # If we got an APK and it contained .so's with embedded metadata or .protos, yield them

        yield from jar.bonus_protos.items()