Beispiel #1
0
 def _get_headparts(self, head):
     """
     Emit the message headers as `headers.txt` and `headers.json`.

     `head` is iterated as `(key, value)` pairs. It is walked once right away
     to build the JSON view and once more when the text extractor is invoked.
     For the JSON view, every value has the leading whitespace of each of its
     lines removed, the lines are joined, and the result is passed through
     `mimewords`. A key that occurs once maps to its single value; a key that
     occurs several times maps to the list of its non-empty values.
     """
     decoder = mimewords()
     # Call the function underneath the two wrappers around `process`.
     decode = partial(decoder.process.__wrapped__.__wrapped__, decoder)
     collected = defaultdict(list)
     for key, value in head:
         unfolded = ''.join(line.lstrip() for line in value.splitlines(False))
         collected[key].append(decode(unfolded))
     summary = {}
     for key, values in collected.items():
         if len(values) == 1:
             summary[key] = values[0]
         else:
             summary[key] = [item for item in values if item]

     def as_text(h=head):
         return '\n'.join(F'{k}: {v}' for k, v in h).encode(self.codec)

     def as_json(jsn=summary):
         return json.dumps(jsn, indent=4).encode(self.codec)

     yield UnpackResult('headers.txt', as_text)
     yield UnpackResult('headers.json', as_json)
Beispiel #2
0
 def make_message(name, msg):
     """
     Yield the plain text body of `msg` as `{name}.txt` and its HTML body as
     `{name}.htm`. A body that cannot be retrieved or is empty is skipped.
     """
     def fetch(attribute):
         # Any failure while reading the attribute counts as "no such body".
         try:
             return getattr(msg, attribute)
         except Exception:
             return None

     with NoLogging():
         html_body = fetch('htmlBody')
         text_body = fetch('body')
     if text_body:
         yield UnpackResult(F'{name}.txt', ensure_bytes(text_body))
     if html_body:
         yield UnpackResult(F'{name}.htm', ensure_bytes(html_body))
Beispiel #3
0
    def _search(self, pe, directory, level=0, *parts):
        """
        Recursively walk a resource directory and yield one result per leaf.

        The path of a leaf is built by joining `parts` and the identifier of
        each traversed entry with slashes. An identifier is the entry name if
        it has one; on the first level a numeric id that is a member of `RSRC`
        is replaced by the name of that member; otherwise the id is used.
        Entries with neither name nor id are reported and skipped. A warning
        is logged when the tree is deeper than three levels.
        """
        if level >= 3:
            self.log_warn(F'unexpected resource tree level {level + 1:d}')
        for entry in directory.entries:
            if entry.name:
                identifier = str(entry.name)
            elif level == 0 and entry.id in iter(RSRC):
                identifier = RSRC(entry.id).name
            elif entry.id is not None:
                identifier = str(entry.id)
            else:
                self.log_warn(
                    F'resource entry has name {entry.name} and id {entry.id} at level {level + 1:d}'
                )
                continue
            if not entry.struct.DataIsDirectory:
                # Leaf: the data is only read from the PE when the extractor is called.
                def extracted(p=pe, e=entry):
                    location = e.data.struct
                    return p.get_data(location.OffsetToData, location.Size)

                yield UnpackResult('/'.join(parts + (identifier,)), extracted)
                continue
            yield from self._search(
                pe, entry.directory, level + 1, *parts, identifier)
Beispiel #4
0
 def unpack(self, data):
     """
     Yield every object that the RTF object parser finds in `data`.

     The path is the file name of the object, or `carveN.bin` with a
     zero-padded running index when it has none. The payload is the package
     data if present, else the OLE data if present, else the raw data. OLE
     related properties and any bytes preceding the selected payload inside
     its container are attached as metadata.
     """
     parser = self._oletools.rtfobj.RtfObjParser(data)
     parser.parse()
     digits = len(str(len(parser.objects)))
     for index, item in enumerate(parser.objects):
         item: RtfObject
         path = item.filename or F'carve{index:0{digits}}.bin'
         payload = item.rawdata
         meta = {}
         if item.is_ole:
             ole_object = self._oletools.oleobj.OleObject
             if item.format_id == ole_object.TYPE_EMBEDDED:
                 meta['ole_type'] = 'EMBEDDED'
             elif item.format_id == ole_object.TYPE_LINKED:
                 meta['ole_type'] = 'LINKED'
             if item.is_package:
                 meta.update(src_path=item.src_path, tmp_path=item.temp_path)
             if item.clsid is not None:
                 meta.update(ole_info=item.clsid_desc, ole_guid=item.clsid)
             meta['ole_name'] = item.class_name
         if item.oledata:
             payload = item.oledata
             header_size = item.rawdata.find(payload)
             # Only a non-empty prefix is recorded for the raw header.
             if header_size > 0:
                 meta['raw_header'] = item.rawdata[:header_size]
             if item.olepkgdata:
                 payload = item.olepkgdata
                 header_size = item.oledata.find(payload)
                 if header_size >= 0:
                     meta['ole_header'] = item.oledata[:header_size]
         yield UnpackResult(path, payload, **meta)
Beispiel #5
0
        def tree(root: HTMLNode, *path):
            """
            Yield `root` and then, depth first, all of its non-textual
            descendants. Each child contributes a path component of the form
            `index.tag`, where the index counts non-textual siblings only.
            """
            def inner(root: HTMLNode = root):
                return root.recover().encode(self.codec)

            def outer(root: HTMLNode = root):
                return root.recover(inner=False).encode(self.codec)

            tagpath = '/'.join(path)
            # The outer variant is only used for non-root nodes, and only on request.
            use_outer = not root.root and self.args.outer
            yield UnpackResult(tagpath, outer if use_outer else inner)

            elements = (n for n in root.children if not n.textual)
            for k, node in enumerate(elements):
                yield from tree(node, *path, F'{k}.{node.tag}')
Beispiel #6
0
 def _walk(self, key, *path):
     """
     Yield all values stored under `key` and, recursively, under its subkeys.

     `path` holds the names of the keys leading here. The walk stops at any
     location that `_check_reachable` rejects.
     """
     location = '/'.join(path)
     if self._check_reachable(location):
         for entry in key.values():
             target = F'{location}/{entry.name()}'

             def raw(v=entry):
                 return v.raw_data()

             yield UnpackResult(target, raw)
         for child in key.subkeys():
             yield from self._walk(child, *path, child.name())
     else:
         self.log_debug(F'pruning search at {location}')
Beispiel #7
0
    def unpack(self, data):
        """
        Yield every resource listed in the .NET header of `data`, using the
        resource name as path.

        Raises `ValueError` when there are no resources, unless the unit is
        only listing contents, in which case nothing is yielded.
        """
        header = DotNetHeader(data)

        if header.resources:
            for resource in header.resources:
                yield UnpackResult(resource.Name, resource.Data)
        elif not self.args.list:
            raise ValueError('This file contains no resources.')
Beispiel #8
0
    def unpack(self, data):
        """
        Carve PE files out of `data`.

        The input is scanned for `MZ` signatures. A candidate is accepted when
        the 16-bit little-endian value at offset 0x3C points to a `PE`
        signature and the PE parser accepts the header. The size of the carved
        file is determined by `get_pe_size`. A file at offset zero is only
        emitted when `keep_root` is set. The path is the original file name
        from the version information when `fileinfo` is set and the field is
        available, and `carve-0xOFFSET.ext` otherwise.

        After a file has been emitted, the search resumes behind its headers
        when it is located at offset zero or `recursive` is set, and behind
        the entire file otherwise.
        """
        cursor = 0
        mv = memoryview(data)

        while True:
            offset = data.find(B'MZ', cursor)
            if offset < cursor:
                break
            # Default: continue right behind this signature.
            cursor = offset + 2
            ntoffset = mv[offset + 0x3C:offset + 0x3E]
            if len(ntoffset) < 2:
                return
            # The field is little-endian by format, regardless of the host byte order.
            ntoffset = int.from_bytes(ntoffset, 'little')
            if mv[offset + ntoffset:offset + ntoffset + 2] != B'PE':
                self.log_debug(
                    F'invalid NT header signature for candidate at 0x{offset:08X}'
                )
                continue
            try:
                pe = PE(data=data[offset:], fast_load=True)
            except PEFormatError as err:
                self.log_debug(
                    F'parsing of PE header at 0x{offset:08X} failed:', err)
                continue

            pesize = get_pe_size(pe, memdump=self.args.memdump)
            pedata = mv[offset:offset + pesize]
            info = {}
            if self.args.fileinfo:
                try:
                    info = pemeta().parse_version(pe) or {}
                except Exception as error:
                    self.log_warn(
                        F'Unable to obtain file information: {error!s}')
            try:
                path = info['OriginalFilename']
            except KeyError:
                if pe.is_exe():
                    extension = 'exe'
                elif pe.is_dll():
                    extension = 'dll'
                else:
                    extension = 'sys'
                path = F'carve-0x{offset:08X}.{extension}'

            if offset > 0 or self.args.keep_root:
                yield UnpackResult(path, pedata, offset=offset)
                self.log_info(
                    F'extracted PE file of size 0x{pesize:08X} from 0x{offset:08X}'
                )
            else:
                self.log_info(
                    F'ignored root file of size 0x{pesize:08X} from 0x{offset:08X}'
                )
                continue

            if not offset or self.args.recursive:
                skip = pe.OPTIONAL_HEADER.SizeOfHeaders
            else:
                skip = pesize
            # Measure the skip from the start of the file, not from behind its
            # signature; otherwise a file that starts immediately after the
            # skipped region is missed. Always move past the current signature
            # so that a zero-sized skip cannot stall the loop.
            cursor = offset + max(skip, 2)
Beispiel #9
0
 def _pack(
     self,
     path: str,
     date: Optional[Union[datetime, str]],
     data: Union[ByteString, Callable[[], ByteString]],
     **meta
 ) -> UnpackResult:
     """
     Build an `UnpackResult` for `path` and `data` with the given metadata.

     A `datetime` is rendered in ISO format with a space separator and
     second precision. If the date is (or has become) a string, it is stored
     in the metadata under the name configured in `self.args.date`. Any other
     date value is ignored.
     """
     stamp = date
     if isinstance(stamp, datetime):
         stamp = stamp.isoformat(' ', 'seconds')
     if isinstance(stamp, str):
         label = self.args.date.decode(self.codec)
         meta[label] = stamp
     return UnpackResult(path, data, **meta)
Beispiel #10
0
 def _unpack_file(self, data: bytearray):
     """
     Parse a textual registry export and yield its values.

     The input is decoded with the first of UTF-16, UTF-16LE and UTF-8 that
     succeeds and produces a first line starting with
     `Windows Registry Editor`. Everything after that line is handed to
     `WinRegFileParser`. Each value is emitted under `key/name`. When decoding
     a value produces several items, they are emitted as `key/name.0`,
     `key/name.1`, and so on.

     Raises `ParseException` when no codec yields a registry export header.
     """
     for codec in ('utf16', 'utf-16le', 'utf8'):
         try:
             reg = data.decode(codec).splitlines(keepends=True)
         except UnicodeError:
             continue
         # Empty input decodes to no lines at all. That is a failed match, not
         # a reason to index into an empty list.
         if reg and reg[0].startswith('Windows Registry Editor'):
             break
     else:
         raise ParseException
     config = WinRegFileParser()
     config.read_string(''.join(reg[1:]))
     for key in config.sections():
         self.log_debug(key)
         for value in config[key]:
             # The value name is the first shell-style token of the option.
             name = next(iter(shlex.split(value)))
             path = Path(key) / Path(name)
             exported = config[key][value]
             decoded = list(self._decode_registry_export(exported))
             if len(decoded) == 1:
                 yield UnpackResult(str(path), decoded[0])
                 continue
             for k, d in enumerate(decoded):
                 yield UnpackResult(F'{path!s}.{k}', d)
Beispiel #11
0
 def unpack(self, data):
     """
     Yield the source code of every macro that the VBA parser extracts from
     `data`, using the stream path as the result path.

     A random UUID is passed to the parser in place of a file name. Entries
     without a stream path are dropped when the first or third field of the
     entry equals that UUID. Raises `ValueError` when the parser cannot open
     the input.
     """
     marker = uuid4()
     try:
         parser = self._olevba.VBA_Parser(
             marker, data=bytes(data), relaxed=True)
     except self._olevba.FileOpenError:
         raise ValueError('Input data not recognized by VBA parser')
     for subfile, stream_path, vba_file, code in parser.extract_all_macros():
         if not stream_path and (subfile == marker or vba_file == marker):
             continue
         yield UnpackResult(stream_path, code.encode(self.codec))
Beispiel #12
0
 def unpack(self, data):
     """
     Yield a textual disassembly listing for every method of a Java class file.

     Each method that has a `Code` attribute produces one result named
     `<method>.jd`. The listing starts with a line of the form
     `class::method descriptor`, followed by one line per instruction with
     the raw bytes in hex, the opcode, and the arguments. Methods without a
     `Code` attribute are reported with a warning and skipped.
     """
     jc = JvClassFile(data)
     # Indentation unit used for every instruction line.
     tt = '  '
     # Column width reserved for the opcode representation.
     opcw = self._OPC_STRLEN
     for method in jc.methods:
         # Locate the attribute holding the bytecode; the else branch runs
         # only when the loop finishes without finding it.
         for attribute in method.attributes:
             if attribute.name == 'Code': break
         else:
             self.log_warn(F'no code found for method: {method.name}')
             continue
         code: JvCode = attribute.parse(JvCode)
         with io.StringIO() as display:
             # NOTE(review): neither of these two values is read before `args`
             # is reassigned in the loop below. The only visible effect of
             # this statement is that it fails when the descriptor does not
             # have the shape `(...)...`.
             args, retval = re.match(R'^\((.*?)\)(.*?)$',
                                     method.descriptor).groups()
             print(F'{jc.this!s}::{method!s}{method.descriptor}',
                   file=display)
             for op in code.disassembly:
                 olen = len(op.raw)
                 if op.table is None:
                     # Regular instruction: list the arguments on one line.
                     args = ', '.join(repr(a) for a in op.arguments)
                 else:
                     # Switch instruction: `ow` is the number of raw bytes per
                     # table entry. Presumably 4 bytes for tableswitch and 8
                     # for the other table-based opcode; confirm against the
                     # class file format.
                     ow = 4 if op.code is opc.tableswitch else 8
                     # Exclude the bytes of all table entries except one from
                     # the part shown on the instruction line.
                     olen = olen - (len(op.table) - 1) * ow
                     # The default jump target is stored under the key None.
                     args = F'defaultjmp => {op.table[None]:#010x}'
                     jmps = []
                     # The index k counts every table item, including the
                     # skipped None key.
                     for k, (key, jmp) in enumerate(op.table.items()):
                         if key is None:
                             continue
                         raw = self._hex(
                             op.raw[olen + k * ow:olen + k * ow + ow], ' ')
                         jmps.append(
                             F'{tt}{raw!s:<{opcw+15}} {key:#010x} => {jmp:#010x}'
                         )
                     # One line for the default jump, then one per table entry.
                     args = '\n'.join((args, *jmps))
                 opch = self._hex(op.raw[:olen], ' ')
                 # Hex dumps that overflow their column are followed by a line
                 # break and padding, so the opcode starts on the next line.
                 if len(opch) > 14:
                     opch += F'\n{tt}{tt:<15}'
                 print(F'{tt}{opch:<15}{op.code!r:<{opcw}} {args}',
                       file=display)
             name = method.name
             # Names starting with '<' are rewritten to `Class$name`, using the
             # last component of the class name and dropping the first and
             # last character of the method name.
             if name.startswith('<'):
                 this = jc.this.value.split('/')
                 this = this[-1]
                 name = F'{this}${name[1:-1]}'
             yield UnpackResult(F'{name}.jd',
                                display.getvalue().encode(self.codec))
Beispiel #13
0
    def _get_parts_outlook(self, data):
        """
        Unpack an Outlook message: headers, message bodies, and attachments.

        Attachments are visited recursively; anything attached to a
        non-`data` attachment is visited as well. Attached messages are
        emitted as `attachments/msg_N` bodies, other attachments with buffer
        contents are emitted under their long or short file name. Attachments
        of any other kind are reported with a warning.
        """
        def ensure_bytes(data):
            if isinstance(data, bytes):
                return data
            return data.encode(self.codec)

        def make_message(name, msg):
            def fetch(attribute):
                # Any failure while reading the attribute counts as "no such body".
                try:
                    return getattr(msg, attribute)
                except Exception:
                    return None

            with NoLogging():
                html_body = fetch('htmlBody')
                text_body = fetch('body')
            if text_body:
                yield UnpackResult(F'{name}.txt', ensure_bytes(text_body))
            if html_body:
                yield UnpackResult(F'{name}.htm', ensure_bytes(html_body))

        def attachments(msg):
            for attachment in getattr(msg, 'attachments', ()):
                yield attachment
                if attachment.type != 'data':
                    yield from attachments(attachment.data)

        with NoLogging():
            msg = self._extract_msg.Message(bytes(data))

        yield from self._get_headparts(msg.header.items())
        yield from make_message('body', msg)

        msgcount = 0

        for attachment in attachments(msg):
            self.log_debug(attachment)
            if attachment.type == 'msg':
                msgcount += 1
                yield from make_message(F'attachments/msg_{msgcount:d}', attachment.data)
            elif isbuffer(attachment.data):
                path = attachment.longFilename or attachment.shortFilename
                yield UnpackResult(F'attachments/{path}', attachment.data)
            else:
                self.log_warn(F'unknown attachment of type {attachment.type}, please report this!')
Beispiel #14
0
        def walk(node: xml.XMLNode, *path: str):
            """
            Yield `node` with its attributes as metadata, then recurse into its
            children. A child whose tag is unique among its siblings extends
            the path by the tag alone; otherwise the tag is followed by a
            zero-padded hexadecimal index in brackets.
            """
            def extract(node: xml.XMLNode = node):
                if node.children:
                    with MemoryFile() as stream:
                        node.write(stream)
                        return bytes(stream.getbuffer() | ppxml)
                return node.content.encode(self.codec)

            siblings_by_tag = {}
            for child in node.children:
                siblings_by_tag.setdefault(child.tag, []).append(child)
            yield UnpackResult('/'.join(path), extract, **node.attributes)
            for tag, siblings in siblings_by_tag.items():
                if len(siblings) > 1:
                    width = len(F'{len(siblings):X}')
                    for k, child in enumerate(siblings):
                        yield from walk(child, *path, F'{tag}[0x{k:0{width}X}]')
                else:
                    yield from walk(siblings[0], *path, tag)
Beispiel #15
0
 def unpack(self, data):
     """
     Yield the contents of every stream in an OLE document.

     When the input cannot be opened as an OLE file, it is passed to `xtzip`
     instead. A stream whose name starts with a character of code point
     below 20 is emitted with that character replaced by its decimal value
     in square brackets.
     """
     with MemoryFile(data) as stream:
         try:
             oledoc = self._olefile.OleFileIO(stream)
         except OSError as error:
             self.log_info(F'error, {error}, treating input as zip file')
             yield from xtzip().unpack(data)
             return
         for item in oledoc.listdir():
             if not (item and item[-1]):
                 continue
             leaf = item[-1]
             # The stream has to be opened under its original name.
             olestream = oledoc.openstream('/'.join(item))
             c0 = ord(leaf[:1])
             if c0 < 20:
                 item[-1] = F'[{c0:d}]{leaf[1:]}'
             path = '/'.join(item)
             self.log_debug('exploring:', path)
             yield UnpackResult(path, olestream.read())
Beispiel #16
0
 def unpack(self, data):
     """
     Yield every entry of the managed resources in `data` under its name.

     Unless the `raw` option is set, a string value is emitted encoded as
     UTF-16LE and a buffer value is emitted as is. In all other cases the
     entry data is emitted. Raises `RefineryPartialResult` carrying the
     input when no managed resources are found.
     """
     try:
         managed = NetStructuredResources(data)
     except NoManagedResource:
         managed = None
     if not managed:
         raise RefineryPartialResult('no managed resources found',
                                     partial=data)
     for entry in managed:
         if entry.Error:
             self.log_warn(
                 F'entry {entry.Name} carried error message: {entry.Error}')
         payload = entry.Data
         if not self.args.raw:
             value = entry.Value
             if isinstance(value, str):
                 payload = value.encode('utf-16le')
             elif isbuffer(value):
                 payload = value
         yield UnpackResult(entry.Name, payload)
Beispiel #17
0
    def _get_parts_regular(self, data):
        if not re.match(BR'^[\s!-~]+$', data):
            raise ValueError('This is not a plaintext email message.')

        msg = BytesParser().parsebytes(data)

        yield from self._get_headparts(msg.items())

        for k, part in enumerate(msg.walk()):
            path = part.get_filename()
            elog = None
            if path is None:
                extension = file_extension(part.get_content_type(), 'txt')
                path = F'body.{extension}'
            else:
                path = F'attachments/{path}'
            try:
                data = part.get_payload(decode=True)
            except Exception as E:
                try:
                    data = part.get_payload(decode=False)
                except Exception as E:
                    elog = str(E)
                    data = None
                else:
                    from refinery import carve
                    self.log_warn(F'manually decoding part {k}, data might be corrupted: {path}')
                    if isinstance(data, str):
                        data = data.encode('latin1')
                    if isbuffer(data):
                        data = next(data | carve('b64', stripspace=True, single=True, decode=True))
                    else:
                        elog = str(E)
                        data = None
            if not data:
                if elog is not None:
                    self.log_warn(F'could not get content of message part {k}: {elog!s}')
                continue
            yield UnpackResult(path, data)
Beispiel #18
0
    def unpack(self, data):
        """
        Yield every item nested inside a JSON document.

        Dictionary members are addressed by key and list members by `#index`
        with a zero-padded index. Children are emitted before the container
        that holds them, and the path of a container ends in a slash. The
        top-level value itself is not emitted. Containers are serialized as
        indented JSON, all other values via `str`. The Python type name of
        each item is attached as `type` metadata.
        """
        def crawl(path, cursor):
            if path and isinstance(cursor, (dict, list)):
                path = F'{path}/'
            if isinstance(cursor, dict):
                children = (
                    (F'{path}{key}', value) for key, value in cursor.items())
            elif isinstance(cursor, list):
                width = len(F'{len(cursor)-1:d}')
                children = (
                    (F'{path}#{index:0{width}d}', value)
                    for index, value in enumerate(cursor))
            else:
                children = ()
            for child_path, child in children:
                yield from crawl(child_path, child)
            if path:
                yield path, cursor, cursor.__class__.__name__

        for path, item, typename in crawl('', json.loads(data)):

            def extract(item=item):
                is_container = isinstance(item, (list, dict))
                dumped = json.dumps(item, indent=4) if is_container else str(item)
                return dumped.encode(self.codec)

            yield UnpackResult(path, extract, type=typename)
Beispiel #19
0
 def unpack(self, data):
     """
     Yield one result per `(name, start, size)` triple that `exeroute`
     produces for `data`, using the ELF, Mach-O and PE handlers of this unit.
     Each result is a view of the corresponding range of the input and
     carries the start offset as metadata.
     """
     view = memoryview(data)
     handlers = (self._unpack_elf, self._unpack_macho, self._unpack_pe)
     for name, start, size in exeroute(data, *handlers):
         yield UnpackResult(name, view[start:start + size], offset=start)
Beispiel #20
0
    def unpack(self, data):
        """
        Yield the initial data of fields in a .NET binary.

        The first pass handles every row of the FieldRVA table: the type and
        size of the field are guessed, and the bytes at the file offset that
        corresponds to the RVA are emitted as `<name>.<type>`. The second
        pass handles the fields that have no FieldRVA row: the input is
        searched for a byte pattern that stores a user string into the field,
        and the string is emitted as `<name>.str` when exactly one distinct
        value is found.
        """
        header = DotNetHeader(data, parse_resources=False)
        tables = header.meta.Streams.Tables
        fields = tables.FieldRVA
        if not fields:
            return
        # Widths used to zero-pad field indices and RVAs in names and logs.
        iwidth = len(str(len(fields)))
        rwidth = max(len(F'{field.RVA:X}') for field in fields)
        rwidth = max(rwidth, 4)
        # Zero-based indices of all fields; those covered by a FieldRVA row
        # are removed in the first loop.
        remaining_field_indices = set(range(len(tables.Field)))

        for k, rv in enumerate(fields):
            # The index stored in the row is used as one-based here.
            _index = rv.Field.Index
            field = tables.Field[_index - 1]
            remaining_field_indices.discard(_index - 1)
            fname = field.Name
            ftype = None
            if len(field.Signature) == 2:
                # Crude signature parser for non-array case. Reference:
                # https://www.codeproject.com/Articles/42649/NET-File-Format-Signatures-Under-the-Hood-Part-1
                # https://www.codeproject.com/Articles/42655/NET-file-format-Signatures-under-the-hood-Part-2
                guess = {
                    0x03: FieldInfo('Char', 1, 1, None),  # noqa
                    0x04: FieldInfo('SByte', 1, 1, None),  # noqa
                    0x05: FieldInfo('Byte', 1, 1, None),  # noqa
                    0x06: FieldInfo('Int16', 1, 2, None),  # noqa
                    0x07: FieldInfo('UInt16', 1, 2, None),  # noqa
                    0x08: FieldInfo('Int32', 1, 4, None),  # noqa
                    0x09: FieldInfo('UInt32', 1, 4, None),  # noqa
                    0x0A: FieldInfo('Int64', 1, 8, None),  # noqa
                    0x0B: FieldInfo('UInt64', 1, 8, None),  # noqa
                    0x0C: FieldInfo('Single', 1, 4, None),  # noqa
                    0x0D: FieldInfo('Double', 1, 8, None),  # noqa
                }.get(field.Signature[1], None)
            else:
                guess = self._guess_field_info(tables, data, _index)
            if guess is None:
                # NOTE(review): the message is labelled "name" but prints the
                # field signature.
                self.log_debug(
                    lambda:
                    F'field {k:0{iwidth}d} name {field.Signature}: unable to guess type information'
                )
                continue
            totalsize = guess.count * guess.size
            # A name provided by the guess takes precedence over the table name.
            if guess.name is not None:
                fname = guess.name
            # Unprintable names are replaced by a name derived from the RVA.
            if not fname.isprintable():
                fname = F'F{rv.RVA:0{rwidth}X}'
            ext = ftype = guess.type.lower()
            if guess.count > 1:
                ftype += F'[{guess.count}]'
            self.log_info(
                lambda:
                F'field {k:0{iwidth}d} at RVA 0x{rv.RVA:04X} of type {guess.type}, count: {guess.count}, name: {fname}'
            )
            offset = header.pe.get_offset_from_rva(rv.RVA)
            # Offset and size are bound as defaults so that each extractor
            # keeps the values of its own loop iteration.
            yield UnpackResult(
                F'{fname}.{ext}',
                lambda t=offset, s=totalsize: data[t:t + s],
                name=fname,
                type=ftype,
            )

        for _index in remaining_field_indices:
            field = tables.Field[_index]
            index = _index + 1
            name = field.Name
            if field.Flags.HasFieldRVA:
                self.log_warn(
                    F'field {name} has RVA flag set, but no RVA was found')
            # The one-based field index as three little-endian bytes, as it
            # is expected to appear inside the searched byte pattern.
            token = index.to_bytes(3, 'little')
            values = set()
            for match in re.finditer(
                (
                    BR'\x72(?P<token>...)\x70'  # ldstr
                    BR'(?:\x6F(?P<function>...)\x0A)?'  # call GetBytes
                    BR'\x80%s\x04'  # stsfld
                ) % re.escape(token),
                    data,
                    re.DOTALL):
                md = match.groupdict()
                fn_token = md.get('function')
                # NOTE(review): with this and/or form, a function index of
                # zero is treated the same as a missing function token.
                fn_index = fn_token and int.from_bytes(fn_token,
                                                       'little') or None
                if fn_index is not None:
                    # NOTE(review): the index is used as is to look up the
                    # MemberRef row, while field indices above are treated as
                    # one-based; confirm the indexing convention.
                    fn_name = tables.MemberRef[fn_index].Name
                    if fn_name != 'GetBytes':
                        self.log_warn(
                            F'skipping string assignment passing through call to {fn_name}'
                        )
                        continue
                # Look up the string in the user string stream.
                k = int.from_bytes(md['token'], 'little')
                values.add(header.meta.Streams.US[k].encode(self.codec))
            if not values:
                continue
            # NOTE(review): a field with more than one distinct candidate
            # value produces no output and no log message.
            if len(values) == 1:
                yield UnpackResult(F'{name}.str',
                                   next(iter(values)),
                                   name=name,
                                   type='string')
Beispiel #21
0
    def _walk(self, blob, memo: Optional[Set[int]] = None, *path):
        """
        Recursively traverse a PDF object and yield its extractable leaves.

        `memo` collects the identities of the objects visited so far; an
        object that is already recorded is not visited again. `path` holds
        the components leading to `blob`. The components are joined without
        a separator, since each one added here starts with a slash.

        Objects that provide `getData` are emitted with kind `object`, byte
        strings with kind `bytes`, and text strings, encoded with the codec
        of this unit, with kind `string`. Lists and dictionaries are
        traversed; the remaining scalar object types are ignored.
        """
        # Follow indirect references until an actual object is reached.
        while isinstance(blob, self._pypdf2.generic.IndirectObject):
            blob = blob.getObject()
        if memo is None:
            memo = {id(blob)}
        elif id(blob) in memo:
            return
        else:
            memo.add(id(blob))
        # If the object has both an '/F' entry and an '/EF' entry that in turn
        # contains '/F', continue with the latter and replace the last path
        # component by the value of the former. Presumably this is how an
        # embedded file is described; confirm against the PDF specification.
        # Any failure simply means that the object does not have this shape.
        try:
            name = blob['/F']
            blob = blob['/EF']['/F']
        except Exception:
            pass
        else:
            path = *path[:-1], F'/{name}'
        try:
            if TYPE_CHECKING:
                blob: EncodedStreamObject = cast(EncodedStreamObject, blob)
            # Objects without this method are handled further below.
            extract = blob.getData
        except AttributeError:
            pass
        else:
            yield UnpackResult(''.join(path), extract, kind='object')
            return

        if isinstance(blob, self._pypdf2.generic.ByteStringObject):
            yield UnpackResult(''.join(path), blob, kind='bytes')
            return
        if isinstance(blob, self._pypdf2.generic.TextStringObject):
            yield UnpackResult(''.join(path),
                               blob.encode(self.codec),
                               kind='string')
            return

        # NOTE(review): ByteStringObject is listed here again, but such
        # objects have already been emitted above and never get this far.
        if isinstance(blob, (
                self._pypdf2.generic.BooleanObject,
                self._pypdf2.generic.ByteStringObject,
                self._pypdf2.generic.FloatObject,
                self._pypdf2.generic.NameObject,
                self._pypdf2.generic.NullObject,
                self._pypdf2.generic.NumberObject,
                self._pypdf2.generic.RectangleObject,
        )):
            # unhandled PDF objects
            return

        if isinstance(blob, self._pypdf2.generic.TreeObject):
            blob = list(blob)

        pdf = self._pypdf2.generic.PdfObject

        if isinstance(blob, list):
            # A list of even length that alternates between strings and PDF
            # objects is interpreted as a sequence of key-value pairs and
            # converted to a dictionary. Any other list is traversed by index.
            if (len(blob) % 2 == 0 and all(
                    isinstance(key, str)
                    for key in islice(iter(blob), 0, None, 2)) and all(
                        isinstance(key, pdf)
                        for key in islice(iter(blob), 1, None, 2))):
                blob = dict(zip(*([iter(blob)] * 2)))
            else:
                for key, value in enumerate(blob):
                    yield from self._walk(value, memo, *path, F'/{key}')
                return

        if isinstance(blob, dict):
            for key, value in blob.items():
                # Entries with non-string keys are skipped.
                if not isinstance(key, str):
                    continue
                # Every path component has to start with a slash.
                if not key.startswith('/'):
                    key = F'/{key}'
                yield from self._walk(value, memo, *path, key)