Beispiel #1
0
    def handle_file(self, fname):
        """
        Unpack the archive at `fname` into `self.name` and index its entries.

        If the file starts with the DEX magic bytes, it is first converted
        with the external `dex2jar` command and the resulting JAR is
        processed instead (a failing conversion raises, because of
        `check=True`).

        While walking the archive entries:
          - `.class` entries are appended to `self.classes` as dotted
            class names;
          - `.dex` entries are handled recursively from their extracted
            location;
          - `.proto` entries are stored, decoded as UTF-8, in
            `self.bonus_protos` under their entry name;
          - `.so` entries are passed to `walk_binary()` and its result is
            merged into `self.bonus_protos`.
        """
        with open(fname, 'rb') as stream:
            magic = stream.read(4)
            if magic == b'dex\n':
                converted = self.name + '/classes-dex2jar.jar'
                command = ['dex2jar', fname, '-f', '-o', converted,
                           '-e', '/dev/null']
                run(command, stderr=DEVNULL, check=True)
                fname = converted

        with ZipFile(fname) as archive:
            archive.extractall(self.name)

            for entry in archive.namelist():
                if entry.endswith('.class'):
                    # "a/b/C.class" -> "a.b.C"
                    dotted = entry[:-6].replace('/', '.')
                    self.classes.append(dotted)
                    continue

                if entry.endswith('.dex'):
                    # Nested DEX: process it from where it was extracted
                    self.handle_file(self.name + '/' + entry)
                    continue

                if entry.endswith('.proto'):
                    text = archive.read(entry).decode('utf8')
                    self.bonus_protos[entry] = text
                    continue

                if entry.endswith('.so'):
                    found = walk_binary(self.name + '/' + entry)
                    self.bonus_protos.update(found)
Beispiel #2
0
def _reserve_key(taken, key):
    """Return `key`, suffixed with as many '_' as needed to be absent from `taken`."""
    while key in taken:
        key += '_'
    return key


def handle_jar(path):
    """
    Scan the classes of a JAR for Java Protobuf string signatures.

    This is a generator. `path` is handed to `JarWrapper`; a name ending
    in '.jar' is reported as being decompressed, anything else as being
    converted from DEX.

    Yields `('_progress', (message, fraction_or_None))` tuples while
    working, interleaved with whatever `walk_binary()` yields for embedded
    descriptors, then whatever `nest_and_print_to_files()` yields, and
    finally the `(key, value)` items of `jar.bonus_protos`.
    """

    if path.endswith('.jar'):
        yield '_progress', ('Decompressing JAR...', None)
    else:
        yield '_progress', ('Converting DEX to JAR...', None)

    # Method signatures used to recognize CodedInputStream flavours. They
    # do not depend on the scanned class, so they are built only once.
    SIG_NANO = b'([BII)V'  # CodedInputByteBufferNano(final byte[] buffer, final int off, final int len)
    SIG_NANO_2 = b'([BI)V'  # CodedInputByteBufferNano(final byte[] buffer, final int bufferSize)
    SIG_CALL = b'([BIIZ)V'  # private ArrayDecoder(final byte[] buffer, final int offset, final int len, boolean immutable)
    SIG_CALL_2 = b'([BII)V'  # private ArrayDecoder(final byte[] buffer, final int offset, final int len)
    SIG_CALL_3 = b'([BIIZL'  # CodedInputStream$ArrayDecoder(byte abyte0[], int i, int j, boolean flag, com.google.protobuf.CodedInputStream$1 codedinputstream$1)

    with JarWrapper(path) as jar:
        enums = {}

        pkg_to_codedinputstream = {}
        pkg_to_codedoutputstream = {}
        map_entry_cls = []
        out_additional_cls = []

        pkg_to_j2me_protobuftype = {}

        # First iteration on classes: look for library classes signatures.

        for i, cls in enumerate(jar.classes):
            if i % 10 == 0:
                yield '_progress', ('Scanning Java package contents...',
                                    (i / len(jar.classes)) * 0.5)

            pkg = cls.rpartition('.')[0]  # '' when the class has no package
            binr = jar.read(cls)

            # Search for CodedInputStream/CodedOutputStream

            raw_cls = cls.replace('.', '/').encode('utf8')

            # Handle multiple cases:
            # 1. CodedInputStream, before it was split out in multiple
            #    subclasses (cc8ca5b - oct 2016)
            # 2. CodedInputStream, after it was
            # 3. CodedInputByteBufferNano
            # 4. CodedInputStreamMicro
            #
            # The second case doesn't provide intelligible strings to
            # search for, so we'll use method signatures (the different
            # kinds that can be produced by Proguard) instead.

            SIG_DEF = b'([BIIZ)L%s;' % raw_cls  # static CodedInputStream newInstance(final byte[] buf, final int off, final int len, final boolean bufferIsImmutable)
            SIG_DEF_2 = b'([BII)L%s;' % raw_cls  # static CodedInputStream newInstance(final byte[] buf, final int off, final int len)

            has_constructor = SIG_DEF in binr or SIG_DEF_2 in binr
            calls_arraydecoder = (SIG_CALL in binr or SIG_CALL_2 in binr
                                  or SIG_CALL_3 in binr)
            is_legit_class = (b'Beginning index' not in binr
                              and b'Number too large' not in binr
                              and b'a byte array' not in binr)

            has_constructor_nano = SIG_NANO in binr or SIG_NANO_2 in binr
            has_relevant_string = b'message contained an invalid tag' in binr
            has_relevant_string_nano = b'is beyond current' in binr
            has_relevant_string_micro = b"when buffer wasn't empty" in binr

            # Try to match CodedOutputStream before CodedInputStream, as
            # it may have common points in signatures but always has a
            # recognizable string.

            has_out_constructor = b'([BII' in binr
            has_out_relevant_string = b'write as much data as' in binr
            has_out_relevant_string_old = b'UTF-8 not supported.' in binr
            has_out_relevant_string_nano = (
                b'Unpaired surrogate at index ' in binr and b'wrap' in binr)
            has_out_relevant_string_2 = (
                b'Converting ill-formed UTF-16.' in binr
                and b'Pos:' not in binr)
            is_legit_out_class = b'byte array' not in binr

            if has_out_constructor and (
                    ((has_out_relevant_string or has_out_relevant_string_old)
                     and is_legit_out_class)
                    or has_out_relevant_string_nano
                    or has_out_relevant_string_2):  # CodedOutputStream

                # Several implementations may live in one package: keep
                # them all by suffixing the key until it is unused.
                pkg_to_codedoutputstream[
                    _reserve_key(pkg_to_codedoutputstream, pkg)] = cls

            elif ((has_constructor and is_legit_class
                   and (calls_arraydecoder or has_relevant_string))
                  or (has_constructor_nano
                      and (has_relevant_string_nano
                           or has_relevant_string_micro))):  # CodedInputStream

                pkg_to_codedinputstream[
                    _reserve_key(pkg_to_codedinputstream, pkg)] = cls

            # Other classes that may be called for (de)serializing objects

            elif b'Generated message class' in binr:  # GeneratedMessage*
                out_additional_cls.append(cls)

            elif b'is not a primitive type' in binr:  # InternalNano
                map_entry_cls.append(cls)

            elif (b'Groups are not allowed in maps' in binr
                  or b'a map entry message.' in binr):  # MapEntry*
                map_entry_cls.append(cls)

            # Search for J2ME implementation's ProtoBuf.java

            elif b'Unexp.EOF' in binr:
                code = jar.decomp(cls, True).raw
                ctor = search(r'public \w+\(([\w.$]+) \w+\)', code)

                # A class may contain the marker string without having the
                # expected one-argument constructor; skip it rather than
                # aborting the whole scan on a failed match.
                if ctor:
                    protobuftype_cls = ctor.group(1)

                    default_consts = {}
                    for prop, const in findall(
                            r'(\w+) = new Boolean\((\w+)\)', code):
                        default_consts[cls + '.' + prop] = const

                    pkg_to_j2me_protobuftype[
                        _reserve_key(pkg_to_j2me_protobuftype, pkg)] = (
                            protobuftype_cls, default_consts)

        # Only keep input streams that have an output stream registered
        # under the same key.
        for pkg in list(pkg_to_codedinputstream):
            if pkg not in pkg_to_codedoutputstream:
                del pkg_to_codedinputstream[pkg]

        # Second iteration on classes: look for generated classes, that
        # contains method call signatures [1] for libraries we found, or
        # other extractible information.
        #
        # [1] https://docs.oracle.com/javase/specs/jvms/se7/html/jvms-4.html#jvms-4.3

        gen_classes = OrderedDict()
        gen_classes_j2me = OrderedDict()
        had_metadata = set()

        # Byte patterns to look for, built once instead of once per class.
        # Names are encoded as UTF-8, like in the first iteration, so that
        # a non-ASCII class name cannot raise here.
        stream_needles = []
        for impl, in_cls in pkg_to_codedinputstream.items():
            out_cls = pkg_to_codedoutputstream[impl]
            stream_needles.append((
                in_cls,
                out_cls,
                in_cls.replace('.', '/').encode('utf8'),
                b'(L%s;' % out_cls.replace('.', '/').encode('utf8'),
            ))

        j2me_needles = []
        for protobuftype_cls, consts in pkg_to_j2me_protobuftype.values():
            j2me_needles.append((
                protobuftype_cls,
                consts,
                b'(IILjava/lang/Object;)L%s;' %
                protobuftype_cls.replace('.', '/').encode('utf8'),
            ))

        for i, cls in enumerate(jar.classes):
            if i % 10 == 0:
                yield '_progress', ('Scanning Java package contents...',
                                    (i / len(jar.classes)) * 0.5 + 0.5)

            binr = jar.read(cls)

            # Search for metadata descriptors
            if b'.proto\x12' in binr or b'.protodevel\x12' in binr:
                code = jar.decomp(cls, True).raw
                # Re-join string literals that were split across lines
                code = sub(r'",\s+"', '', code, flags=MULTILINE)
                meta = search(r'"(\\n.+?\.proto.+)"', code)
                if meta:
                    # Turn the escaped Java string literal back into bytes
                    meta = meta.group(1).encode('latin1')
                    meta = meta.decode('unicode_escape').encode('latin1')

                    yield from walk_binary(meta)
                    had_metadata.add(cls)

            # Search for signatures common to generated Java classes
            for in_cls, out_cls, in_needle, out_needle in stream_needles:
                if (in_needle in binr and out_needle in binr
                        and cls not in (in_cls, out_cls)):
                    gen_classes[cls] = (in_cls, out_cls)

            # Search for generated J2ME classes
            for protobuftype_cls, consts, needle in j2me_needles:
                if needle in binr and cls != protobuftype_cls:
                    gen_classes_j2me[cls] = (protobuftype_cls, consts)

            # Search for enums
            if b'Ljava/lang/Enum<' in binr[:256]:
                enums[cls] = cls

                if '$' in cls:
                    # Also index nested enums under their dotted name and
                    # under "<package>.<innermost name>"
                    enums[cls.replace('$', '.')] = cls
                    enums[cls.rsplit('.', 1)[0] + '.' +
                          cls.rsplit('$', 1)[1]] = cls

        # Same aliases as for enums, for the generated message classes
        gen_classes_nodollar = OrderedDict(gen_classes)
        for cls, streams in gen_classes.items():
            if '$' in cls:
                gen_classes_nodollar[cls.replace('$', '.')] = streams
                gen_classes_nodollar[cls.rsplit('.', 1)[0] + '.' +
                                     cls.rsplit('$', 1)[1]] = streams

        # Once we know what classes we should look at, do the actual code
        # scraping and extraction work.

        # These variables will be filled in by extract_* functions:

        # For the class name of a message/enum, its DescriptorProto object
        msg_path_to_obj = {}
        # For a nested message/enum, all message fields that refer to it
        msg_to_referrers = defaultdict(list)

        # Call the extraction routine for most implementations
        for i, (cls, (codedinputstream,
                      codedoutputstream)) in enumerate(gen_classes.items()):
            yield '_progress', ('Extracting %s...' % cls, i / len(gen_classes))

            # Skip classes whose outer class already yielded a descriptor
            if cls.split('$')[0] not in had_metadata:
                extract_lite(jar, cls, enums, gen_classes_nodollar,
                             codedinputstream, codedoutputstream,
                             map_entry_cls, out_additional_cls,
                             msg_path_to_obj, msg_to_referrers)

        # Call the extraction routine for J2ME
        for i, (cls, (protobuftype_cls,
                      consts)) in enumerate(gen_classes_j2me.items()):
            yield '_progress', ('Extracting %s...' % cls,
                                i / len(gen_classes_j2me))

            extract_j2me(jar, cls, enums, gen_classes_j2me, protobuftype_cls,
                         consts, msg_path_to_obj, msg_to_referrers)

        yield '_progress', ('Dumping information to .protos...', None)

        # Merge nested Protobuf messages and write them to files
        yield from nest_and_print_to_files(msg_path_to_obj, msg_to_referrers)

        # If we got an APK and it contained .so's with embedded metadata or .protos, yield them
        yield from jar.bonus_protos.items()