Beispiel #1
0
    def is_whitelisted(self, task: IngestTask):
        """Check a task against the whitelist and drop it when it matches.

        The verdict from ``get_whitelist_verdict`` is consulted first; when it
        yields no reason, the ``self.whitelisted`` cache is looked up by the
        sha256 of the first file of the submission. A fresh (non-cached)
        verdict is stored in that cache. On a match the task's ``failure``
        message is set, ``_notify_drop`` is called and the ``'whitelisted'``
        counter is incremented.

        Returns:
            The whitelisting reason, or a falsy value when the task is not
            whitelisted.
        """
        reason, hit = self.get_whitelist_verdict(self.whitelist, task)
        hit = {key: dotdump(safe_str(value)) for key, value in hit.items()}
        sha256 = task.submission.files[0].sha256

        from_cache = False
        if not reason:
            # No direct verdict: fall back to a previously remembered one.
            with self.whitelisted_lock:
                reason = self.whitelisted.get(sha256, None)
            if reason:
                hit = 'cached'
                from_cache = True

        if not reason:
            return reason

        if not from_cache:
            # Remember the fresh verdict for later submissions of this file.
            with self.whitelisted_lock:
                self.whitelisted[sha256] = reason

        task.failure = "Whitelisting due to reason %s (%s)" % (
            dotdump(safe_str(reason)), hit)
        self._notify_drop(task)
        self.counter.increment('whitelisted')

        return reason
def ident(buf, length: int, path) -> Dict:
    """Identify a file from its leading bytes and the libmagic responses.

    The type is resolved in priority order: a ``custom:`` libmagic label, a
    mime type listed in ``trusted_mimes``, then the best ``tl_patterns`` match.
    Types that are not in ``recognized`` are reset to ``'unknown'``. Office
    documents still tagged ``document/office/unknown`` get a last chance by
    looking up the CLSID stored after the "Root Entry" marker in ``buf``.

    Args:
        buf: Bytes from the start of the file; the first 64 (at most) are
            dumped into the ``ascii`` and ``hex`` fields.
        length: Number of usable bytes in ``buf``. When it is ``<= 0`` the
            default result is returned without inspecting anything.
        path: Path of the file, handed to ``magic.magic_file``.

    Returns:
        A dict with the keys ``ascii``, ``hex``, ``magic``, ``mime`` and
        ``type``. Failures during identification are printed and leave the
        fields gathered so far in place.
    """
    data = {
        'ascii': None,
        'hex': None,
        'magic': None,
        'mime': None,
        'type': 'unknown'
    }

    if length <= 0:
        return data

    header = buf[:min(64, length)]
    data['ascii'] = dotdump(header)
    data['hex'] = safe_str(hexlify(header))

    # noinspection PyBroadException
    try:
        # Collect the labels returned by libmagic, dropping the "- " prefix
        # that marks continuation lines.
        labels = []
        if file_type:
            with magic_lock:
                labels = magic.magic_file(file_type, path).split(b'\n')
                labels = [
                    label[2:] if label.startswith(b'- ') else label
                    for label in labels
                ]

        mimes = []
        if mime_type:
            with magic_lock:
                mimes = magic.magic_file(mime_type, path).split(b'\n')
                mimes = [
                    mime[2:] if mime.startswith(b'- ') else mime
                    for mime in mimes
                ]

        # For user feedback set the mime and magic meta data to always be the
        # primary libmagic responses
        if labels:
            data['magic'] = safe_str(labels[0])

        if mimes and mimes[0] != b'':
            data['mime'] = safe_str(mimes[0])

        # Highest priority is given to labels matching the custom pattern
        tagged = False

        for label in labels:
            label = dotdump(label)

            if custom.match(label):
                data['type'] = label.split('custom: ')[1].strip()
                tagged = True
                break

        # Second priority is mime types marked as trusted
        if not tagged:
            for mime in mimes:
                mime = dotdump(mime)

                if mime in trusted_mimes:
                    data['type'] = trusted_mimes[mime]
                    tagged = True
                    break

        # As a third priority try matching the tl_patterns
        if not tagged:
            minimum = len(tl_patterns)
            sl_tag = None

            # Try each label and see how far down the tl_patterns list we go
            # before we hit a match, the closer to the beginning of the list
            # we are the better the tag match is. The final line of
            # tl_patterns matches anything and sets tag to 'unknown', so this
            # loop should never finish with sl_tag as None unless the
            # tl_patterns table has been changed inappropriately.
            for label in labels:
                label = dotdump(label)

                # ... match against our patterns and, ...
                index = 0
                for entry in tl_patterns:
                    if index >= minimum:
                        break

                    if entry[1].search(label):  # pylint:disable=E1101
                        break

                    index += 1

                # ... keep highest precedence (lowest index) match.
                if index < minimum:
                    minimum = index
                    sl_tag = subtype(label)

                    # If a label does match, take the best from that label
                    # Further labels from magic are probably terrible
                    break

            assert sl_tag is not None, "tl_patterns seems to be missing a match all => unknown rule at the end"

            # Based on the sub tag we found, figure out the top level tag to use
            tl_tag = sl_to_tl.get(sl_tag, tl_patterns[minimum][0])
            data['type'] = '/'.join((tl_tag, sl_tag))

    except Exception as e:
        # Best effort: report the problem and keep whatever was identified.
        print(str(e))

    if not recognized.get(data['type'], False):
        data['type'] = 'unknown'

    if data['type'] == 'document/office/unknown':
        # noinspection PyBroadException
        try:
            root_entry_property_offset = buf.find(
                u"Root Entry".encode("utf-16-le"))
            if -1 != root_entry_property_offset:
                # Get root entry's GUID and try to guess document type
                clsid_offset = root_entry_property_offset + 0x50
                if len(buf) >= clsid_offset + 16:
                    clsid = buf[clsid_offset:clsid_offset + 16]
                    # The slice is bytes, so the all-zero check must compare
                    # against bytes; comparing with a str is never equal.
                    if len(clsid) == 16 and clsid != b"\0" * len(clsid):
                        clsid_str = uuid.UUID(bytes_le=clsid)
                        clsid_str = clsid_str.urn.rsplit(':', 1)[-1].upper()
                        if clsid_str in OLE_CLSID_GUIDs:
                            data['type'] = OLE_CLSID_GUIDs[clsid_str]
        except Exception:
            # Best effort: a malformed header simply leaves the type as is.
            pass

    return data
        # The default magic file misidentifies PE files with a munged DOS header
        data['type'] = dos_ident(path)
    elif data['type'] == 'code/html':
        # Magic detects .hta files as .html, guess_language detects .hta files as .js/.vbs
        # If both conditions are met, it's fair to say that the file is an .hta
        lang, _ = guess_language(path)
        if lang in ["code/javascript", "code/vbs"]:
            data['type'] = 'code/hta'

    if not recognized.get(data['type'], False) and not cart_metadata_set:
        data['type'] = 'unknown'

    return data


if __name__ == '__main__':
    from pprint import pprint

    # Fields printed, tab separated, for every path read from stdin.
    columns = ('type', 'ascii', 'entropy', 'hex', 'magic', 'mime',
               'md5', 'sha1', 'sha256', 'ssdeep', 'size')

    if len(sys.argv) > 1:
        # A path on the command line: pretty-print its full file info.
        pprint(fileinfo(sys.argv[1]))
    else:
        # Otherwise read one path per line until EOF or an empty line.
        for name in iter(lambda: sys.stdin.readline().strip(), ''):
            info = fileinfo(name)
            print('\t'.join(dotdump(str(info[key])) for key in columns))
Beispiel #4
0
    def ident(self, buf, length: int, path) -> Dict:
        """Work out the type of a file from its header and libmagic output.

        Resolution order for ``type``: a ``custom:`` magic label, a mime type
        present in ``self.trusted_mimes``, then the first hit in
        ``self.compiled_magic_patterns``. Afterwards an unknown type with a
        ``text/`` mime becomes ``text/plain``, and ``document/office/unknown``
        is refined from the CLSID found after the "Root Entry" marker.

        Args:
            buf: Bytes from the start of the file.
            length: Number of usable bytes in ``buf``; ``<= 0`` returns the
                default result immediately.
            path: Path of the file, passed to ``magic.magic_file``.

        Returns:
            A dict with the keys ``ascii``, ``hex``, ``magic``, ``mime`` and
            ``type``. Errors during identification are logged, not raised.
        """
        data = {"ascii": None, "hex": None, "magic": None, "mime": None, "type": "unknown"}

        if length <= 0:
            return data

        header = buf[:min(64, length)]
        data["ascii"] = dotdump(header)
        data["hex"] = safe_str(hexlify(header))

        # noinspection PyBroadException
        try:
            labels = []
            mimes = []

            # Query libmagic for labels and mimes; when it raises, fall back
            # to the lines carried by the exception message.
            with self.lock:
                try:
                    labels = magic.magic_file(self.file_type, path).split(b"\n")
                except magic.MagicException as label_error:
                    labels = label_error.message.split(b"\n")

                try:
                    mimes = magic.magic_file(self.mime_type, path).split(b"\n")
                except magic.MagicException as mime_error:
                    mimes = mime_error.message.split(b"\n")

            # Drop the "- " continuation prefix and surrounding whitespace.
            mimes = [
                (raw[2:] if raw.startswith(b"- ") else raw).strip()
                for raw in mimes
            ]
            labels = [
                (raw[2:] if raw.startswith(b"- ") else raw).strip()
                for raw in labels
            ]

            # For user feedback the magic and mime meta data always reflect
            # the primary libmagic responses.
            if labels:
                # If an expected label is not the first one returned by magic,
                # move it to the front. Whether the mime list is reordered as
                # well depends on the special word.
                special_word_cases = (
                    (b"OLE 2 Compound Document : Microsoft Word Document", False),
                    (b"Lotus 1-2-3 WorKsheet", True),
                )
                for word, alter_mime in special_word_cases:
                    index = next(
                        (pos for pos, candidate in enumerate(labels)
                         if word in candidate),
                        -1,
                    )
                    if index < 0:
                        continue
                    labels.insert(0, labels.pop(index))
                    if alter_mime and len(labels) == len(mimes):
                        mimes.insert(0, mimes.pop(index))
                data["magic"] = safe_str(labels[0])

            # The first non-empty mime is the one reported.
            primary_mime = next((entry for entry in mimes if entry != b""), None)
            if primary_mime is not None:
                data["mime"] = safe_str(primary_mime)

            # First priority: custom types announced by a magic label.
            for raw_label in labels:
                label_text = dotdump(raw_label)
                if self.custom.match(label_text):
                    data["type"] = label_text.split("custom: ")[1].strip()
                    break

            # Second priority: mime types marked as trusted.
            if data["type"] == "unknown":
                with self.lock:
                    trusted_mimes = self.trusted_mimes

                for raw_mime in mimes:
                    mime_text = dotdump(raw_mime)
                    if mime_text in trusted_mimes:
                        data["type"] = trusted_mimes[mime_text]
                        break

            # Third priority: the first magic pattern matching any label.
            if data["type"] == "unknown":
                with self.lock:
                    compiled_magic_patterns = self.compiled_magic_patterns

                for raw_label in labels:
                    for entry in compiled_magic_patterns:
                        if entry[1].search(dotdump(raw_label)):  # pylint: disable=E1101
                            data['type'] = entry[0]
                            break
                    else:
                        # No pattern matched this label; try the next one.
                        continue
                    break

        except Exception as e:
            self.log.error(
                f"An error occured during file identification: {e.__class__.__name__}({str(e)})"
            )

        # If mime is text/* and type is unknown, set text/plain to trigger
        # language detection later.
        mime_value = data['mime']
        if data["type"] == "unknown" and mime_value is not None and mime_value.startswith("text/"):
            data["type"] = "text/plain"

        # Lookup office documents by GUID if we're still not sure what they are
        if data["type"] == "document/office/unknown":
            # noinspection PyBroadException
            try:
                root_offset = buf.find("Root Entry".encode("utf-16-le"))
                if root_offset != -1:
                    # Get root entry's GUID and try to guess document type
                    clsid_offset = root_offset + 0x50
                    if len(buf) >= clsid_offset + 16:
                        clsid = buf[clsid_offset:clsid_offset + 16]
                        if len(clsid) == 16 and clsid != b"\0" * len(clsid):
                            guid = uuid.UUID(bytes_le=clsid).urn.rsplit(":", 1)[-1].upper()
                            if guid in OLE_CLSID_GUIDs:
                                data["type"] = OLE_CLSID_GUIDs[guid]
                        else:
                            # No usable CLSID: a "Details" entry close to the
                            # root entry is tagged as a McAfee quarantine file.
                            window = buf[:root_offset + 0x100]
                            details_offset = window.find("Details".encode("utf-16-le"))
                            if details_offset != -1:
                                data["type"] = "quarantine/mcafee"
            except Exception:
                pass

        return data
Beispiel #5
0
    # Strip the optional --no-cache switch; whatever remains is treated as paths.
    args = sys.argv[1:]
    use_cache = "--no-cache" not in args
    if not use_cache:
        args.remove("--no-cache")

    identify = Identify(use_cache=use_cache)

    if args:
        # A path on the command line: pretty-print its full file info.
        pprint(identify.fileinfo(args[0]))
    else:
        # Fields printed, tab separated, for every path read from stdin.
        columns = (
            "type",
            "ascii",
            "entropy",
            "hex",
            "magic",
            "mime",
            "md5",
            "sha1",
            "sha256",
            "ssdeep",
            "size",
        )
        # Read one path per line until EOF or an empty line.
        for name in iter(lambda: sys.stdin.readline().strip(), ""):
            info = identify.fileinfo(name)
            print("\t".join(dotdump(str(info[column])) for column in columns))
Beispiel #6
0
def test_dotdump():
    """dotdump keeps 33, 66, 99 and 126 as characters and dots out the rest."""
    codes = [1, 8, 22, 33, 66, 99, 126, 127, 1000]
    assert str_utils.dotdump(codes) == "...!Bc~.."