Example #1
0
    def process(self, task: Task) -> None:  # type: ignore
        """
        Dispatch an incoming task according to its ``type`` header.

        ``sample`` tasks are passed straight to :meth:`analyze_sample`.
        ``analysis`` tasks have their ``dumps.zip`` resource extracted to a
        temporary directory; every entry of the ``dumps_metadata`` payload
        whose path passes ``_is_safe_path`` becomes a ``DumpInfo`` and the
        resulting list is passed to :meth:`analyze_dumps`.

        :param task: Task carrying the ``sample`` resource
        """
        sample = task.get_resource("sample")
        task_type = task.headers["type"]

        if task_type == "analysis":
            digest = hashlib.sha256(sample.content or b"").hexdigest()
            self.log.info(f"Processing analysis, sample: {digest}")
            dumps_archive = task.get_resource("dumps.zip")
            metadata_entries = task.get_payload("dumps_metadata")
            with dumps_archive.extract_temporary() as workdir:  # type: ignore
                collected = []
                for entry in metadata_entries:
                    candidate = os.path.join(workdir, entry["filename"])
                    if self._is_safe_path(workdir, candidate):
                        # base_address is parsed as a hexadecimal string
                        collected.append(
                            DumpInfo(path=candidate,
                                     base=int(entry["base_address"], 16)))
                    else:
                        self.log.warning(
                            f"Path traversal attempt: {candidate}")
                self.analyze_dumps(sample, collected)
        elif task_type == "sample":
            self.log.info("Analyzing original binary")
            self.analyze_sample(sample)

        # Garbage collector statistics are logged after every task
        self.log.debug("Printing gc stats")
        self.log.debug(gc.get_stats())
    def process(self, task: Task) -> None:  # type: ignore
        """
        Turn a malware configuration into a MISP event and upload it.

        The ``config`` payload is parsed together with the ``family``
        header; when parsing yields nothing the task is dropped silently.
        The event UUID is derived with ``uuid5`` from ``CONFIG_NAMESPACE``
        and the config dhash.

        :param task: Task carrying the ``config`` payload
        """
        raw_config = task.get_payload("config")
        malware_family = task.headers["family"]
        config_hash = config_dhash(raw_config)

        # Parse the config using iocextract library
        extracted = parse(malware_family, raw_config)
        if not extracted:
            # Nothing actionable found - skip the config
            return

        # Build the structured MISP event
        misp_event = MISPEvent()
        misp_event.uuid = str(uuid5(self.CONFIG_NAMESPACE, config_hash))
        misp_event.add_tag(f"mwdb:family:{malware_family}")
        misp_event.info = f"Malware configuration ({malware_family})"

        if self.mwdb_url is not None:
            config_link = f"{self.mwdb_url}/config/{config_hash}"
            misp_event.add_attribute("link", config_link)

        for misp_object in extracted.to_misp():
            misp_event.add_object(misp_object)

        # Upload the event
        client = ExpandedPyMISP(self.misp_url, self.misp_key,
                                self.misp_verifycert)
        client.add_event(misp_event)
    def process(self, task: Task) -> None:  # type: ignore
        """
        Upload the task's object and decorate it with payload metadata.

        Tasks whose ``type`` header is ``sample`` go through
        :meth:`process_sample`, everything else through
        :meth:`process_config`. When an object comes back, the ``tags``,
        ``attributes`` and ``comments`` (or ``additional_info``) payloads
        are applied to it; tags and attribute values already present on the
        object are skipped.

        :param task: Incoming task
        """
        client = self.mwdb()
        kind = task.headers["type"]

        handler = (self.process_sample
                   if kind == "sample" else self.process_config)
        uploaded: Optional[MWDBObject] = handler(task, client)
        if not uploaded:
            return

        # Add payload tags
        if task.has_payload("tags"):
            for tag in task.get_payload("tags"):
                if tag in uploaded.tags:
                    continue
                self.log.info("[%s %s] Adding tag %s", kind, uploaded.id, tag)
                uploaded.add_tag(tag)

        # Add payload attributes
        if task.has_payload("attributes"):
            for name, candidates in task.get_payload("attributes").items():
                for candidate in candidates:
                    if candidate in uploaded.metakeys.get(name, []):
                        continue
                    self.log.info("[%s %s] Adding metakey %s: %s", kind,
                                  uploaded.id, name, candidate)
                    uploaded.add_metakey(name, candidate)

        # Add payload comments ("additional_info" is used when "comments"
        # is missing or empty)
        notes = (task.get_payload("comments")
                 or task.get_payload("additional_info"))
        for note in notes or ():
            self.log.info("[%s %s] Adding comment: %s", kind, uploaded.id,
                          repr(note))
            uploaded.add_comment(note)
    def process_config(self, task: Task, mwdb: MWDB) -> MWDBConfig:
        """
        Processing of Config task

        Clarification:
            sample -> parent -> config
            sample is original sample
            parent is parent of the config
            config is config

        The family name is taken from the first non-empty value among the
        ``family`` task header, the ``family`` key of the config and the
        ``type`` key of the config; ``"unknown"`` is used when none is set.

        :param task: Task carrying the ``config`` payload and, optionally,
            the ``sample`` and ``parent`` payloads
        :param mwdb: MWDB instance
        :return: MWDBConfig object
        """
        config_data = task.get_payload("config")
        # Use .get() so that a missing "family" header falls through to the
        # config-derived values instead of raising KeyError, and make sure
        # an empty "type" value still ends up as "unknown" (family is
        # concatenated into a tag below and must be a non-empty string).
        family = (task.headers.get("family") or config_data.get("family")
                  or config_data.get("type") or "unknown")

        if task.has_payload("sample"):
            sample = self._upload_file(task, mwdb, task.get_payload("sample"))
            if sample:
                self.log.info("[sample %s] Adding tag ripped:%s", sample.id,
                              family)
                sample.add_tag("ripped:" + family)
            else:
                self.log.warning("Couldn't upload original sample")
        else:
            sample = None

        if task.has_payload("parent"):
            # The original sample (if any) becomes the parent of the
            # config's parent, giving the sample -> parent -> config chain.
            parent = self._upload_file(task,
                                       mwdb,
                                       task.get_payload("parent"),
                                       parent=sample)
            if parent:
                self.log.info("[sample %s] Adding tag %s", parent.id, family)
                parent.add_tag(family)
            else:
                self.log.warning("Couldn't upload parent sample")
        else:
            parent = None

        config = self._upload_config(task,
                                     mwdb,
                                     family,
                                     config_data,
                                     parent=parent)
        return config
    def process_sample(self, task: Task, mwdb: MWDB) -> Optional[MWDBFile]:
        """
        Processing of Sample task

        The ``parent`` payload, when present, is uploaded first and then
        passed as the parent of the ``sample`` payload upload.

        :param task: Task carrying the ``sample`` / ``parent`` payloads
        :param mwdb: MWDB instance
        :return: MWDBFile object or None
        """
        parent_file = (self._upload_file(task, mwdb,
                                         task.get_payload("parent"))
                       if task.has_payload("parent") else None)

        if not task.has_payload("sample"):
            return None

        return self._upload_file(task,
                                 mwdb,
                                 task.get_payload("sample"),
                                 parent=parent_file)
 def process_cuckoo(self, task: Task) -> List[str]:
     """
     Scan every dump of an extracted cuckoo analysis.

     The ``analysis`` payload is extracted to a temporary directory and
     each file below its ``dumps`` subdirectory is read and passed to
     ``scan_sample``; ``.txt`` and ``.metadata`` files are skipped.

     :param task: Task carrying the ``analysis`` payload
     :return: Concatenated results of all ``scan_sample`` calls
     """
     analysis = task.get_payload("analysis")
     log.info(f"Processing cuckoo analysis {analysis.name}")

     matches: List[str] = []
     skipped_suffixes = (".txt", ".metadata")
     with analysis.extract_temporary() as extracted_root:
         for current_dir, _subdirs, names in os.walk(
                 f"{extracted_root}/dumps"):
             for name in names:
                 if name.endswith(skipped_suffixes):
                     continue
                 log.debug(f"Checking {name}")
                 with open(f"{current_dir}/{name}", "rb") as handle:
                     data = handle.read()
                 matches.extend(self.scan_sample(data))
     return matches
Example #7
0
    def process(self, task: Task):
        """
        Compare the ripped family with the test case expectation.

        The dumps archive is extracted, ``analyze_dumps`` is run on it and
        its result is compared with the ``ripped`` field of the
        ``testcase`` payload. A JSON report holding the sample hash, both
        family values and an ``OK`` / ``FAIL`` verdict is sent out in an
        ``analysis-test-result`` task.

        :param task: Task carrying ``dumps.zip``, ``dumps_metadata``,
            ``sample`` and ``testcase``
        """
        dumps_archive = task.get_resource("dumps.zip")
        metadata = task.get_payload("dumps_metadata")
        sample = task.get_resource("sample")

        with dumps_archive.extract_temporary() as extracted_dir:
            ripped = self.analyze_dumps(sample, extracted_dir, metadata)
            expected = TestCase.from_json(task.payload["testcase"]).ripped

            matched = ripped is not None and expected == ripped
            if matched:
                self.log.info(f"Ripping {sample.sha256} OK: {ripped}")
            else:
                self.log.error(
                    f"Failed to rip {sample.sha256}. Expected {expected}, ripped {ripped}"
                )
            verdict = "OK" if matched else "FAIL"

            family_report = {"expected": expected, "ripped": ripped}
            report = json.dumps({
                "sample": sample.sha256,
                "family": family_report,
                "result": verdict,
            })

            result_task = Task({
                "type": "analysis-test-result",
                "kind": "drakrun"
            })
            result_resource = LocalResource(name=self.current_task.root_uid,
                                            bucket="draktestd",
                                            content=report)
            # The resource uid is forced to equal its name
            result_resource._uid = result_resource.name
            result_task.add_payload("result", result_resource)
            self.send_task(result_task)
Example #8
0
    def process(self, task: Task) -> None:
        """
        Unpack an archive sample and emit one task per extracted child.

        The password is taken from the ``password`` payload or, failing
        that, from the first ``password`` entry of the ``attributes``
        payload. Archives whose ``extraction_level`` payload exceeds
        ``self.max_depth`` are not unpacked. Children without contents or
        larger than ``self.max_size`` are skipped; every other child is
        sent as a ``sample``/``raw`` task with ``extraction_level``
        increased by one and the archive as its ``parent``.

        :param task: Task carrying the ``sample`` resource
        """
        sample = task.get_resource("sample")
        task_password = task.get_payload("password", default=None)

        attributes = task.get_payload("attributes", default={})
        if not task_password and attributes.get("password"):
            self.log.info("Accepting password from attributes")
            task_password = attributes.get("password")[0]

        # Bind fname up front: an unnamed sample must not leave it undefined
        # for the temporary file handling below.
        fname = None
        try:
            if sample.name:
                fname = sample.name.encode("utf8")

                # The "extension" header is optional; only append it when
                # it is present and the name does not already end with it.
                extension = task.headers.get("extension")
                if extension:
                    suffix = ("." + extension).encode("utf-8")
                    if not fname.endswith(suffix):
                        fname += suffix

        except Exception as e:
            # Best effort: fall back to an unnamed archive
            self.log.warning("Exception during extraction: %r", e)
            fname = None

        extraction_level = task.get_payload("extraction_level", 0)

        if extraction_level > self.max_depth:
            self.log.warning(
                "Maximum extraction depth exceeded. Can't extract this archive."
            )
            return

        with tempfile.TemporaryDirectory() as dir_name:
            # NOTE(review): fname is bytes (or None), so the f-string embeds
            # its repr (e.g. "b'name.zip'") in the path, and a "/" in the
            # sample name makes open() fail. Kept as-is because the unpacker
            # receives this exact path - confirm before changing the scheme.
            filepath = f"{dir_name}/{fname}"
            with open(filepath, "wb") as f:
                f.write(sample.content)

            archive_password = None
            if task_password is not None:
                archive_password = task_password.encode()

            unpacked = unpack(
                filename=fname,
                filepath=filepath.encode("utf-8"),
                password=archive_password,
            )

        try:
            archive_name = (unpacked.filename and
                            unpacked.filename.decode("utf8")) or unpacked.sha256
        except Exception as e:
            self.log.warning("Exception during extraction: %r", e)
            archive_name = "(unknown)"

        self.log.info("Got archive {}".format(archive_name))

        if not unpacked.children:
            self.log.warning("Don't know how to unpack this archive")
            return

        for child in unpacked.children:
            child_name = (child.filename
                          and child.filename.decode("utf8")) or child.sha256

            self.log.info("Unpacked child {}".format(child_name))

            if not child.contents:
                self.log.warning(
                    "Child has no contents or protected by unknown password")
                continue

            if len(child.contents) > self.max_size:
                self.log.warning("Child is too big for further processing")
                continue

            # Use a separate name for the outgoing task so that the
            # "quality" header is always read from the incoming task.
            child_task = Task(
                headers={
                    "type": "sample",
                    "kind": "raw",
                    "quality": task.headers.get("quality", "high"),
                },
                payload={
                    "sample": Resource(child_name, child.contents),
                    "parent": sample,
                    "extraction_level": extraction_level + 1,
                },
            )
            self.send_task(child_task)
Example #9
0
    def _classify(self, task: Task) -> Optional[Dict[str, Optional[str]]]:
        """
        Classify the task's sample by libmagic output, file extension and
        content heuristics.

        The checks run strictly in the order they are written and the first
        matching rule wins, so reordering them changes the result.

        :param task: Task carrying the ``sample`` resource and, optionally,
            the ``magic`` and ``mime`` payloads
        :return: Dictionary with ``magic``, ``mime``, ``kind``, ``platform``
            and ``extension`` keys, or None when no rule matched
        """
        sample = task.get_resource("sample")
        content = cast(bytes, sample.content)

        # Payload values only survive when self._magic raises below;
        # otherwise both are overwritten with freshly computed values.
        magic = task.get_payload("magic") or ""
        magic_mime = task.get_payload("mime") or ""
        try:
            magic = self._magic(content, mime=False)
            magic_mime = self._magic(content, mime=True)
        except Exception as ex:
            self.log.warning(f"unable to get magic: {ex}")

        extension = self._get_extension(sample.name or "sample")
        # Result template; kind/platform/extension are filled in by the
        # first rule that matches.
        sample_class = {
            "magic": magic if magic else None,
            "mime": magic_mime if magic_mime else None,
            "kind": None,
            "platform": None,
            "extension": None,
        }

        # Is PE file?
        if magic.startswith("PE32") or magic.startswith(
                "MS-DOS executable PE32"):
            sample_class.update({
                "kind": "runnable",
                "platform": "win32",
                "extension": "exe"
            })
            if magic.startswith("PE32+"):
                sample_class["platform"] = "win64"  # 64-bit only executable
            if "(DLL)" in magic:
                sample_class["extension"] = "dll"
            return sample_class

        # ZIP-contained files?
        def zip_has_file(path: str) -> bool:
            # Any failure (not a ZIP, missing member, ...) counts as "no"
            try:
                ZipFile(BytesIO(content)).getinfo(path)
                return True
            except Exception:
                return False

        if magic.startswith("Zip archive data") or magic.startswith(
                "Java archive data (JAR)"):
            if extension == "apk" or zip_has_file("AndroidManifest.xml"):
                sample_class.update({
                    "kind": "runnable",
                    "platform": "android",
                    "extension": "apk"
                })
                return sample_class

            if extension == "jar" or zip_has_file("META-INF/MANIFEST.MF"):
                sample_class.update({
                    "kind": "runnable",
                    "platform": "win32",  # Default platform should be Windows
                    "extension": "jar",
                })
                return sample_class

        # Dalvik Android files?
        if magic.startswith("Dalvik dex file") or extension == "dex":
            sample_class.update({
                "kind": "runnable",
                "platform": "android",
                "extension": "dex"
            })
            return sample_class

        # Shockwave Flash?
        if magic.startswith("Macromedia Flash") or extension == "swf":
            sample_class.update({
                "kind": "runnable",
                "platform": "win32",
                "extension": "swf"
            })
            return sample_class

        # Windows LNK?
        if magic.startswith("MS Windows shortcut") or extension == "lnk":
            sample_class.update({
                "kind": "runnable",
                "platform": "win32",
                "extension": "lnk"
            })
            return sample_class

        # Is ELF file?
        if magic.startswith("ELF"):
            sample_class.update({"kind": "runnable", "platform": "linux"})
            return sample_class

        # Is PKG file?
        if magic.startswith("xar archive") or extension == "pkg":
            sample_class.update({
                "kind": "runnable",
                "platform": "macos",
                "extension": "pkg"
            })
            return sample_class

        # Is DMG file?
        # Besides the extension, the last 512 bytes are inspected: they must
        # start with b"koly" and carry 00 00 02 00 at offset 8 of that block.
        if extension == "dmg" or all([
                len(content) > 512,
                content[-512:][:4] == b"koly",
                content[-512:][8:12] == b"\x00\x00\x02\x00",
        ]):
            sample_class.update({
                "kind": "runnable",
                "platform": "macos",
                "extension": "dmg"
            })
            return sample_class

        # Is mach-o file?
        if magic.startswith("Mach-O"):
            sample_class.update({"kind": "runnable", "platform": "macos"})
            return sample_class

        def zip_has_mac_app() -> bool:
            # True when any archive member path ends with
            # ".app/contents/info.plist" (compared case-insensitively)
            try:
                zipfile = ZipFile(BytesIO(content))
                return any(
                    x.filename.lower().endswith(".app/contents/info.plist")
                    for x in zipfile.filelist)
            except Exception:
                return False

        # macos app within zip
        if magic.startswith("Zip archive data") and zip_has_mac_app():
            sample_class.update({
                "kind": "runnable",
                "platform": "macos",
                "extension": "app"
            })
            return sample_class

        # Windows scripts (per extension)
        script_extensions = [
            "vbs",
            "vbe",
            "js",
            "jse",
            "wsh",
            "wsf",
            "hta",
            "cmd",
            "bat",
            "ps1",
        ]
        if extension in script_extensions:
            sample_class.update({
                "kind": "script",
                "platform": "win32",
                "extension": extension
            })
            return sample_class

        # Office documents
        office_extensions = {
            "doc": "Microsoft Word",
            "xls": "Microsoft Excel",
            "ppt": "Microsoft PowerPoint",
        }
        # Check RTF by libmagic
        if magic.startswith("Rich Text Format"):
            sample_class.update({
                "kind": "document",
                "platform": "win32",
                "extension": "rtf"
            })
            return sample_class
        # Check Composite Document (doc/xls/ppt) by libmagic and extension
        if magic.startswith("Composite Document File"):
            # MSI installers are also CDFs
            if "MSI Installer" in magic:
                sample_class.update({
                    "kind": "runnable",
                    "platform": "win32",
                    "extension": "msi"
                })
                return sample_class
            # If not MSI, treat it like Office document
            sample_class.update({
                "kind": "document",
                "platform": "win32",
            })

            # The creating application named in the magic string takes
            # precedence over the file extension
            for ext, typepart in office_extensions.items():
                if f"Name of Creating Application: {typepart}" in magic:
                    sample_class["extension"] = ext
                    return sample_class

            # Only the first three characters are compared, so longer
            # extensions sharing a doc/xls/ppt prefix are kept as they are
            if extension[:3] in office_extensions.keys():
                sample_class["extension"] = extension
            else:
                sample_class["extension"] = "doc"
            return sample_class

        # Check docx/xlsx/pptx by libmagic
        for ext, typepart in office_extensions.items():
            if magic.startswith(typepart):
                sample_class.update({
                    "kind": "document",
                    "platform": "win32",
                    "extension": ext + "x"
                })
                return sample_class

        # Check RTF by extension
        if extension == "rtf":
            sample_class.update({
                "kind": "document",
                "platform": "win32",
                "extension": "rtf"
            })
            return sample_class

        # Finally check document type only by extension
        if extension[:3] in office_extensions.keys():
            sample_class.update({
                "kind": "document",
                "platform": "win32",
                "extension": extension
            })
            return sample_class

        # Unclassified Open XML documents
        if magic.startswith("Microsoft OOXML"):
            # A failure here is logged and classification continues with
            # the remaining rules
            try:
                extn = classify_openxml(content)
                if extn:
                    sample_class.update({
                        "kind": "document",
                        "platform": "win32",
                        "extension": extn,
                    })
                    return sample_class
            except Exception:
                self.log.exception("Error while trying to classify OOXML")

        # PDF files
        if magic.startswith("PDF document") or extension == "pdf":
            sample_class.update({
                "kind": "document",
                "platform": "win32",
                "extension": "pdf"
            })
            return sample_class

        # Archives
        # Maps the reported extension to the magic prefixes identifying it
        archive_assoc = {
            "7z": ["7-zip archive data"],
            "ace": ["ACE archive data"],
            "bz2": ["bzip2 compressed data"],
            "cab": ["Microsoft Cabinet archive data"],
            "gz": ["gzip compressed"],
            "iso": ["ISO 9660 CD-ROM"],
            "lz": ["lzip compressed data"],
            "tar": ["tar archive", "POSIX tar archive"],
            "rar": ["RAR archive data"],
            "udf": ["UDF filesystem data"],
            "xz": ["XZ compressed data"],
            "zip": ["Zip archive data"],
            "zlib": ["zlib compressed data"],
        }
        # Extensions accepted when no magic prefix matched. "cab" is listed
        # twice (harmless for the membership test) and "arj" has no entry
        # in archive_assoc, so it is recognized by extension only.
        archive_extensions = [
            "ace",
            "zip",
            "rar",
            "tar",
            "cab",
            "gz",
            "7z",
            "bz2",
            "arj",
            "iso",
            "xz",
            "lz",
            "udf",
            "cab",
            "zlib",
        ]

        def apply_archive_headers(extension):
            # Marks sample_class as an archive; platform is left untouched
            headers = {"kind": "archive", "extension": extension}
            if extension == "xz":
                # libmagic >= 5.40 generates correct MIME type for XZ archives
                headers["mime"] = "application/x-xz"
            sample_class.update(headers)
            return sample_class

        for archive_extension, assocs in archive_assoc.items():
            if any(magic.startswith(assoc) for assoc in assocs):
                return apply_archive_headers(archive_extension)

        if extension in archive_extensions:
            return apply_archive_headers(extension)

        # E-mail
        # Patterns are matched anywhere in the magic string, not only at
        # its start; e-mails are reported with kind "archive"
        email_assoc = {
            "msg": ["Microsoft Outlook Message"],
            "eml": ["multipart/mixed", "RFC 822 mail", "SMTP mail"],
        }
        for ext, patterns in email_assoc.items():
            if any(pattern in magic for pattern in patterns):
                sample_class.update({"kind": "archive", "extension": ext})
                return sample_class

        if extension in email_assoc.keys():
            sample_class.update({"kind": "archive", "extension": extension})
            return sample_class

        # HTML
        if magic.startswith("HTML document"):
            sample_class.update({"kind": "html"})
            return sample_class

        # Linux scripts
        if ("script" in magic and "executable" in magic) or extension == "sh":
            sample_class.update({
                "kind": "script",
                "platform": "linux",
                "extension": extension
            })
            return sample_class

        # Content heuristics
        # First and last 2048 bytes; for content shorter than 2048 bytes
        # the two slices overlap, so the data appears twice in `partial`
        partial = content[:2048] + content[-2048:]

        # Dumped PE file heuristics (PE not recognized by libmagic)
        if b".text" in partial and b"This program cannot be run" in partial:
            sample_class.update({
                "kind": "dump",
                "platform": "win32",
                "extension": "exe"
            })
            return sample_class

        if len(partial) > 0x40:
            # Read a little-endian 16-bit value at offset 0x3C and look for
            # b"PE" at that offset inside `partial`
            pe_offs = struct.unpack("<H", partial[0x3C:0x3E])[0]
            if partial[pe_offs:pe_offs + 2] == b"PE":
                sample_class.update({
                    "kind": "dump",
                    "platform": "win32",
                    "extension": "exe"
                })
                return sample_class

        if partial.startswith(b"MZ"):
            sample_class.update({
                "kind": "dump",
                "platform": "win32",
                "extension": "exe"
            })
            return sample_class

        # Heuristics for scripts
        try:
            # When decoding fails, every check in the `else` branch below is
            # skipped - including the magic-based ASCII/ISO-8859/UTF-8/PGP/
            # pcap checks at its end - and None is returned.
            try:
                partial_str = partial.decode(
                    chardet.detect(partial)["encoding"]).lower()
            except Exception:
                self.log.warning("Heuristics disabled - unknown encoding")
            else:
                # All keywords are lowercase because partial_str is
                # lowercased above
                vbs_keywords = [
                    "end function",
                    "end if",
                    "array(",
                    "sub ",
                    "on error ",
                    "createobject",
                    "execute",
                ]
                js_keywords = [
                    "function ",
                    "function(",
                    "this.",
                    "this[",
                    "new ",
                    "createobject",
                    "activexobject",
                    "var ",
                    "catch",
                ]
                html_keywords = ["<!doctype", "<html", "<script"]
                ps_keywords = [
                    "powershell",
                    "-nop",
                    "bypass",
                    "new-object",
                    "invoke-expression",
                    "frombase64string(",
                    "| iex",
                    "|iex",
                ]
                # HTML, VBS and JS need at least two distinct keywords
                if (len([
                        True
                        for keyword in html_keywords if keyword in partial_str
                ]) >= 2):
                    sample_class.update({"kind": "html"})
                    return sample_class

                if (len([
                        True
                        for keyword in vbs_keywords if keyword in partial_str
                ]) >= 2):
                    sample_class.update({
                        "kind": "script",
                        "platform": "win32",
                        "extension": "vbs"
                    })
                    return sample_class
                # Powershell heuristics
                # Unlike the other script checks, a single keyword suffices
                if len([
                        True for keyword in ps_keywords
                        if keyword.lower() in partial_str
                ]):
                    sample_class.update({
                        "kind": "script",
                        "platform": "win32",
                        "extension": "ps1"
                    })
                    return sample_class
                # JS heuristics
                if (len([
                        True
                        for keyword in js_keywords if keyword in partial_str
                ]) >= 2):
                    sample_class.update({
                        "kind": "script",
                        "platform": "win32",
                        "extension": "js"
                    })
                    return sample_class
                # JSE heuristics
                # re.match anchors the pattern at the start of partial_str
                if re.match("#@~\\^[a-zA-Z0-9+/]{6}==", partial_str):
                    sample_class.update({
                        "kind": "script",
                        "platform": "win32",
                        "extension": "jse",  # jse is more possible than vbe
                    })
                    return sample_class
                # Remaining kinds are decided by the magic string alone;
                # platform and extension stay None for them
                if magic.startswith("ASCII"):
                    sample_class.update({
                        "kind": "ascii",
                    })
                    return sample_class
                if magic.startswith("ISO-8859"):
                    sample_class.update({
                        "kind": "iso-8859-1",
                    })
                    return sample_class
                if magic.startswith("UTF-8"):
                    sample_class.update({
                        "kind": "utf-8",
                    })
                    return sample_class
                if magic.startswith("PGP"):
                    sample_class.update({
                        "kind": "pgp",
                    })
                    return sample_class
                if magic.startswith("pcap capture file"):
                    sample_class.update({
                        "kind": "pcap",
                    })
                    return sample_class
                if magic.startswith("pcap") and "ng capture file" in magic:
                    sample_class.update({
                        "kind": "pcapng",
                    })
                    return sample_class
        except Exception as e:
            # Heuristic failures are logged and the sample is treated as
            # unrecognized
            self.log.exception(e)

        # If not recognized then unsupported
        return None
Example #10
0
    def _classify(self, task: Task) -> Optional[Dict[str, str]]:
        sample = task.get_resource("sample")
        content = cast(bytes, sample.content)
        magic = task.get_payload("magic") or pymagic.from_buffer(content)
        extension = self._get_extension(sample.name or "sample")
        sample_type = {
            "type": "sample",
            "stage": "recognized",
            "quality": task.headers.get("quality", "high"),
        }

        # Is PE file?
        if magic.startswith("PE32") or magic.startswith("MS-DOS executable PE32"):
            sample_type.update(
                {"kind": "runnable", "platform": "win32", "extension": "exe"}
            )
            if magic.startswith("PE32+"):
                sample_type["platform"] = "win64"  # 64-bit only executable
            if "(DLL)" in magic:
                sample_type["extension"] = "dll"
            return sample_type

        # ZIP-contained files?
        def zip_has_file(path: str) -> bool:
            try:
                ZipFile(BytesIO(content)).getinfo(path)
                return True
            except Exception:
                return False

        if magic.startswith("Zip archive data") or magic.startswith(
            "Java archive data (JAR)"
        ):
            if extension == "apk" or zip_has_file("AndroidManifest.xml"):
                sample_type.update(
                    {"kind": "runnable", "platform": "android", "extension": "apk"}
                )
                return sample_type

            if extension == "jar" or zip_has_file("META-INF/MANIFEST.MF"):
                sample_type.update(
                    {
                        "kind": "runnable",
                        "platform": "win32",  # Default platform should be Windows
                        "extension": "jar",
                    }
                )
                return sample_type

        # Dalvik Android files?
        if magic.startswith("Dalvik dex file") or extension == "dex":
            sample_type.update(
                {"kind": "runnable", "platform": "android", "extension": "dex"}
            )
            return sample_type

        # Shockwave Flash?
        if magic.startswith("Macromedia Flash") or extension == "swf":
            sample_type.update(
                {"kind": "runnable", "platform": "win32", "extension": "swf"}
            )
            return sample_type

        # Windows LNK?
        if magic.startswith("MS Windows shortcut") or extension == "lnk":
            sample_type.update(
                {"kind": "runnable", "platform": "win32", "extension": "lnk"}
            )
            return sample_type

        # Is ELF file?
        if magic.startswith("ELF"):
            sample_type.update({"kind": "runnable", "platform": "linux"})
            return sample_type

        # Windows scripts (per extension)
        script_extensions = [
            "vbs",
            "vbe",
            "js",
            "jse",
            "wsh",
            "wsf",
            "hta",
            "cmd",
            "bat",
            "ps1",
        ]
        if extension in script_extensions:
            sample_type.update(
                {"kind": "script", "platform": "win32", "extension": extension}
            )
            return sample_type

        # Office documents
        office_extensions = {
            "doc": "Microsoft Word",
            "xls": "Microsoft Excel",
            "ppt": "Microsoft PowerPoint",
        }
        # Check RTF by libmagic
        if magic.startswith("Rich Text Format"):
            sample_type.update(
                {"kind": "document", "platform": "win32", "extension": "rtf"}
            )
            return sample_type
        # Check Composite Document (doc/xls/ppt) by libmagic and extension
        if magic.startswith("Composite Document File"):
            # MSI installers are also CDFs
            if "MSI Installer" in magic:
                sample_type.update(
                    {"kind": "runnable", "platform": "win32", "extension": "msi"}
                )
                return sample_type
            # If not MSI, treat it like Office document
            sample_type.update(
                {
                    "kind": "document",
                    "platform": "win32",
                }
            )
            if extension[:3] in office_extensions.keys():
                sample_type["extension"] = extension
            else:
                sample_type["extension"] = "doc"
            return sample_type

        # Check docx/xlsx/pptx by libmagic
        for ext, typepart in office_extensions.items():
            if magic.startswith(typepart):
                sample_type.update(
                    {"kind": "document", "platform": "win32", "extension": ext + "x"}
                )
                return sample_type

        # Check RTF by extension
        if extension == "rtf":
            sample_type.update(
                {"kind": "document", "platform": "win32", "extension": "rtf"}
            )
            return sample_type

        # Finally check document type only by extension
        if extension[:3] in office_extensions.keys():
            sample_type.update(
                {"kind": "document", "platform": "win32", "extension": extension}
            )
            return sample_type

        # Unclassified Open XML documents
        if magic.startswith("Microsoft OOXML"):
            try:
                extn = classify_openxml(content)
                if extn:
                    sample_type.update(
                        {
                            "kind": "document",
                            "platform": "win32",
                            "extension": extn,
                        }
                    )
                    return sample_type
            except Exception:
                self.log.exception("Error while trying to classify OOXML")

        # PDF files
        if magic.startswith("PDF document") or extension == "pdf":
            sample_type.update(
                {"kind": "document", "platform": "win32", "extension": "pdf"}
            )
            return sample_type

        # Archives
        archive_assoc = {
            "7z": ["7-zip archive data"],
            "ace": ["ACE archive data"],
            "bz2": ["bzip2 compressed data"],
            "cab": ["Microsoft Cabinet archive data"],
            "gz": ["gzip compressed"],
            "iso": ["ISO 9660 CD-ROM"],
            "lz": ["lzip compressed data"],
            "tar": ["tar archive", "POSIX tar archive"],
            "rar": ["RAR archive data"],
            "udf": ["UDF filesystem data"],
            "xz": ["XZ compressed data"],
            "zip": ["Zip archive data"],
            "zlib": ["zlib compressed data"],
        }
        archive_extensions = [
            "ace",
            "zip",
            "rar",
            "tar",
            "cab",
            "gz",
            "7z",
            "bz2",
            "arj",
            "iso",
            "xz",
            "lz",
            "udf",
            "cab",
            "zlib",
        ]
        for ext in archive_extensions:
            if ext in archive_assoc:
                if any(magic.startswith(x) for x in archive_assoc[ext]):
                    sample_type.update({"kind": "archive", "extension": ext})
                    return sample_type
        if extension in archive_extensions:
            sample_type.update({"kind": "archive", "extension": extension})
            return sample_type

        # E-mail
        email_assoc = {"msg": "Microsoft Outlook Message", "eml": "multipart/mixed"}
        for ext in email_assoc.keys():
            if email_assoc[ext] in magic:
                sample_type.update({"kind": "archive", "extension": ext})
                return sample_type

        if extension in email_assoc.keys():
            sample_type.update({"kind": "archive", "extension": extension})
            return sample_type

        # HTML
        if magic.startswith("HTML document"):
            sample_type.update({"kind": "html"})
            return sample_type

        # Linux scripts
        if ("script" in magic and "executable" in magic) or extension == "sh":
            sample_type.update(
                {"kind": "script", "platform": "linux", "extension": extension}
            )
            return sample_type

        # Content heuristics
        partial = content[:2048] + content[-2048:]

        # Dumped PE file heuristics (PE not recognized by libmagic)
        if b".text" in partial and b"This program cannot be run" in partial:
            sample_type.update(
                {"kind": "dump", "platform": "win32", "extension": "exe"}
            )
            return sample_type

        if len(partial) > 0x40:
            pe_offs = struct.unpack("<H", partial[0x3C:0x3E])[0]
            if partial[pe_offs : pe_offs + 2] == b"PE":
                sample_type.update(
                    {"kind": "dump", "platform": "win32", "extension": "exe"}
                )
                return sample_type

        if partial.startswith(b"MZ"):
            sample_type.update(
                {"kind": "dump", "platform": "win32", "extension": "exe"}
            )
            return sample_type

        # Heuristics for scripts
        try:
            try:
                partial_str = partial.decode(
                    chardet.detect(partial)["encoding"]
                ).lower()
            except Exception:
                self.log.warning("Heuristics disabled - unknown encoding")
            else:
                vbs_keywords = [
                    "end function",
                    "end if",
                    "array(",
                    "sub ",
                    "on error ",
                    "createobject",
                    "execute",
                ]
                js_keywords = [
                    "function ",
                    "function(",
                    "this.",
                    "this[",
                    "new ",
                    "createobject",
                    "activexobject",
                    "var ",
                    "catch",
                ]
                html_keywords = ["<!doctype", "<html", "<script"]
                ps_keywords = [
                    "powershell",
                    "-nop",
                    "bypass",
                    "new-object",
                    "invoke-expression",
                    "frombase64string(",
                    "| iex",
                    "|iex",
                ]
                if (
                    len([True for keyword in html_keywords if keyword in partial_str])
                    >= 2
                ):
                    sample_type.update({"kind": "html"})
                    return sample_type

                if (
                    len([True for keyword in vbs_keywords if keyword in partial_str])
                    >= 2
                ):
                    sample_type.update(
                        {"kind": "script", "platform": "win32", "extension": "vbs"}
                    )
                    return sample_type

                if (
                    len([True for keyword in js_keywords if keyword in partial_str])
                    >= 2
                ):
                    sample_type.update(
                        {"kind": "script", "platform": "win32", "extension": "js"}
                    )
                    return sample_type

                # JSE heuristics
                if re.match("#@~\\^[a-zA-Z0-9+/]{6}==", partial_str):
                    sample_type.update(
                        {
                            "kind": "script",
                            "platform": "win32",
                            "extension": "jse",  # jse is more possible than vbe
                        }
                    )
                    return sample_type
                # Powershell heuristics
                if len(
                    [True for keyword in ps_keywords if keyword.lower() in partial_str]
                ):
                    sample_type.update(
                        {"kind": "script", "platform": "win32", "extension": "ps1"}
                    )
                    return sample_type
                if magic.startswith("ASCII"):
                    sample_type.update(
                        {
                            "kind": "ascii",
                        }
                    )
                    return sample_type
                if magic.startswith("ISO-8859"):
                    sample_type.update(
                        {
                            "kind": "iso-8859-1",
                        }
                    )
                    return sample_type
                if magic.startswith("UTF-8"):
                    sample_type.update(
                        {
                            "kind": "utf-8",
                        }
                    )
                    return sample_type
                if magic.startswith("PGP"):
                    sample_type.update(
                        {
                            "kind": "pgp",
                        }
                    )
                    return sample_type
        except Exception as e:
            self.log.exception(e)

        # If not recognized then unsupported
        return None