def process(self, task: Task) -> None:  # type: ignore
    """Route an incoming task to sample analysis or memory-dump analysis.

    A task whose ``type`` header is ``"sample"`` is handed to
    ``self.analyze_sample``. A task of type ``"analysis"`` has its
    ``dumps.zip`` resource extracted to a temporary directory; each entry of
    the ``dumps_metadata`` payload is converted to a ``DumpInfo`` and the
    resulting list is handed to ``self.analyze_dumps``. Other types are
    ignored. Garbage-collector statistics are logged at debug level in every
    case.

    Args:
        task: Incoming task; must carry a ``sample`` resource.
    """
    sample = task.get_resource("sample")
    task_type = task.headers["type"]

    if task_type == "sample":
        self.log.info("Analyzing original binary")
        self.analyze_sample(sample)
    elif task_type == "analysis":
        sample_hash = hashlib.sha256(sample.content or b"").hexdigest()
        self.log.info(f"Processing analysis, sample: {sample_hash}")

        dumps_archive = task.get_resource("dumps.zip")
        metadata_entries = task.get_payload("dumps_metadata")

        with dumps_archive.extract_temporary() as tmpdir:  # type: ignore
            collected = []
            for entry in metadata_entries:
                dump_path = os.path.join(tmpdir, entry["filename"])
                if self._is_safe_path(tmpdir, dump_path):
                    # base_address is parsed as a hexadecimal string
                    collected.append(
                        DumpInfo(
                            path=dump_path,
                            base=int(entry["base_address"], 16),
                        )
                    )
                else:
                    # Entries rejected by the path check are skipped
                    self.log.warning(f"Path traversal attempt: {dump_path}")
            # Must run while the temporary directory still exists
            self.analyze_dumps(sample, collected)

    self.log.debug("Printing gc stats")
    self.log.debug(gc.get_stats())
def process(self, task: Task):
    """Compare the family ripped from the dumps with the expected one.

    The ``dumps.zip`` resource is extracted to a temporary directory and
    passed, together with the sample, to ``self.analyze_dumps``. The returned
    family is compared with the ``ripped`` attribute of the test case stored
    in the ``testcase`` payload, and a JSON report with the verdict
    (``OK``/``FAIL``) is sent in an ``analysis-test-result`` task.

    Args:
        task: Incoming task with ``dumps.zip`` and ``sample`` resources and a
            ``testcase`` payload.
    """
    dumps_archive = task.get_resource("dumps.zip")
    sample = task.get_resource("sample")

    with dumps_archive.extract_temporary() as dumps_dir:
        family = self.analyze_dumps(sample, dumps_dir)
        expected_family = TestCase.from_json(task.payload["testcase"]).ripped

        # A missing result counts as a failure even if nothing was expected
        failed = family is None or expected_family != family
        if failed:
            self.log.error(
                f"Failed to rip {sample.sha256}. Expected {expected_family}, ripped {family}"
            )
        else:
            self.log.info(f"Ripping {sample.sha256} OK: {family}")

        report = {
            "sample": sample.sha256,
            "family": {
                "expected": expected_family,
                "ripped": family
            },
            "result": 'FAIL' if failed else 'OK'
        }

        result_task = Task({"type": "analysis-test-result", "kind": "drakrun"})
        result_resource = LocalResource(
            name=self.current_task.root_uid,
            bucket='draktestd',
            content=json.dumps(report),
        )
        # The resource identifier is forced to equal its name
        result_resource._uid = result_resource.name
        result_task.add_payload("result", result_resource)
        self.send_task(result_task)
def process_joesandbox(self, task: Task) -> List[str]:
    """Scan every file packed in a joesandbox ``dumps.zip`` archive.

    The archive is downloaded to a temporary directory, extracted with the
    password ``infected`` and each extracted file is passed to
    ``self.scan_sample``.

    Args:
        task: Analysis task carrying a ``dumps.zip`` resource.

    Returns:
        All matches reported by ``self.scan_sample``, concatenated in the
        order the files were visited (duplicates are kept).
    """
    log.info("Processing joesandbox analysis")
    yara_matches: List[str] = []

    with tempfile.TemporaryDirectory() as tmpdir:
        dumpsf = os.path.join(tmpdir, "dumps.zip")
        task.get_resource("dumps.zip").download_to_file(dumpsf)  # type: ignore

        # Extract into a dedicated subdirectory so that the walk below visits
        # only archive members and not the downloaded container itself.
        extract_dir = os.path.join(tmpdir, "extracted")
        os.mkdir(extract_dir)

        # The context manager guarantees the archive handle is closed.
        with zipfile.ZipFile(dumpsf) as zipf:
            zipf.extractall(extract_dir, pwd=b"infected")

        for rootdir, _dirs, files in os.walk(extract_dir):
            for filename in files:
                with open(os.path.join(rootdir, filename), "rb") as dumpf:
                    content = dumpf.read()
                yara_matches += self.scan_sample(content)

    return yara_matches
def process_drakrun(self, task: Task) -> List[str]:
    """Scan the memory dumps packed in a drakrun ``dumps.zip`` archive.

    The archive is downloaded and extracted to a temporary directory. Only
    files whose name matches the ``<hex>_<16 hex digits>`` pattern are passed
    to ``self.scan_sample``; everything else (including the archive itself)
    is skipped.

    Args:
        task: Analysis task carrying a ``dumps.zip`` resource.

    Returns:
        All matches reported by ``self.scan_sample``, concatenated in the
        order the files were visited (duplicates are kept).
    """
    log.info('Processing drakrun analysis')
    yara_matches: List[str] = []
    # Compiled once instead of being looked up for every file
    dump_name = re.compile(r"^[a-f0-9]{4,16}_[a-f0-9]{16}$")

    with tempfile.TemporaryDirectory() as tmpdir:
        dumpsf = os.path.join(tmpdir, 'dumps.zip')
        task.get_resource('dumps.zip').download_to_file(dumpsf)  # type: ignore

        # The context manager guarantees the archive handle is closed.
        with zipfile.ZipFile(dumpsf) as zipf:
            zipf.extractall(tmpdir)

        for rootdir, _dirs, files in os.walk(tmpdir):
            for filename in files:
                # skip non-dump files
                if not dump_name.match(filename):
                    continue
                with open(os.path.join(rootdir, filename), "rb") as dumpf:
                    content = dumpf.read()
                yara_matches += self.scan_sample(content)

    return yara_matches
def process(self, task: Task) -> None:  # type: ignore
    """Classify the sample and forward a task describing the result.

    When ``self._classify`` returns ``None`` an ``unrecognized`` task is
    derived and sent. Otherwise a task is derived from the classification,
    the classification tag is put in front of any tags already present in the
    payload, and a ``sha256`` entry is added to the sample metadata if it is
    missing.

    Args:
        task: Incoming task carrying a ``sample`` resource.
    """
    sample = task.get_resource("sample")
    sample_class = self._classify(task)
    file_name = sample.name or "sample"

    if sample_class is None:
        self.log.info(
            "Sample {!r} not recognized (unsupported type)".format(
                file_name.encode("utf8")))
        unrecognized = task.derive_task({
            "type": "sample",
            "stage": "unrecognized",
            "kind": "unknown",
            "quality": task.headers.get("quality", "high"),
        })
        self.send_task(unrecognized)
        return

    classification_tag = get_tag(sample_class)
    self.log.info("Classified {!r} as {} and tag {}".format(
        file_name.encode("utf8"), repr(sample_class), classification_tag))

    outgoing = task.derive_task(sample_class)

    # pass the original tags to the next task
    if outgoing.has_payload("tags"):
        tags = [classification_tag, *outgoing.get_payload("tags")]
        outgoing.remove_payload("tags")
    else:
        tags = [classification_tag]
    outgoing.add_payload("tags", tags)

    # add a sha256 digest in the outgoing task if there
    # isn't one in the incoming task
    sample_metadata = outgoing.payload["sample"].metadata
    if "sha256" not in sample_metadata:
        digest = sha256(cast(bytes, sample.content)).hexdigest()
        sample_metadata["sha256"] = digest

    self.send_task(outgoing)
def process(self, task: Task) -> None:  # type: ignore
    """Collect yara matches for a task and publish them as sample tags.

    ``sample`` tasks are scanned directly from the sample content.
    ``analysis`` tasks are delegated to the handler matching the ``kind``
    header (``cuckoo1``, ``drakrun`` or ``joesandbox``). If anything matched,
    an ``analyzed`` task is sent with the sorted, de-duplicated match names
    in the ``tags`` payload.

    Args:
        task: Incoming task carrying a ``sample`` resource.
    """
    headers = task.headers
    sample = task.get_resource("sample")
    yara_matches: List[str] = []

    task_type = headers["type"]
    if task_type == "sample":
        self.log.info(f"Processing sample {sample.metadata['sha256']}")
        if sample.content is not None:
            yara_matches = self.scan_sample(sample.content)
    elif task_type == "analysis":
        analysis_kind = headers["kind"]
        if analysis_kind == "cuckoo1":
            yara_matches += self.process_cuckoo(task)
        elif analysis_kind == "drakrun":
            yara_matches += self.process_drakrun(task)
        elif analysis_kind == "joesandbox":
            yara_matches += self.process_joesandbox(task)

    if not yara_matches:
        self.log.info("Couldn't match any yara rules")
        return None

    unique_matches = sorted(set(yara_matches))
    self.log.info(
        "Got %d yara hits in total with %s distinct names",
        len(yara_matches),
        len(unique_matches),
    )

    self.send_task(
        Task(
            {
                "type": "sample",
                "stage": "analyzed"
            },
            payload={
                "sample": sample,
                "tags": unique_matches
            },
        )
    )
def process(self, task: Task) -> None:
    """Run ``strings`` on the incoming sample and forward its output.

    The sample is downloaded to a temporary file, the external ``strings``
    program is executed on it and the captured output is sent as the
    ``sample`` resource of a new ``analyzed`` task, with the original sample
    attached as ``parent``.

    Args:
        task: Incoming task carrying a ``sample`` resource.
    """
    # Get the incoming sample
    sample_resource = task.get_resource("sample")

    # Log with self.log
    self.log.info(f"Hi {sample_resource.name}, let me analyse you!")

    # Download the resource to a temporary file and run `strings` on it
    with sample_resource.download_temporary_file() as sample_file:
        command = ["strings", sample_file.name]
        strings = subprocess.check_output(command)

    # Send our results for further processing or reporting
    headers = {
        "type": "sample",
        "stage": "analyzed"
    }
    payload = {
        "parent": sample_resource,
        "sample": Resource("string", strings)
    }
    self.send_task(Task(headers, payload=payload))
def process(self, task: Task) -> None:  # type: ignore
    """Decode an ASCII-encoded sample and forward the decoded payload.

    The sample content is classified with ``AsciiClassifier`` and decoded
    with ``Decoder`` according to the classifier's verdict. If decoding
    yields data, it is sent as a new sample: as a recognized win32
    executable when it starts with ``MZ``, otherwise as a ``raw`` sample.

    Args:
        task: Incoming task carrying a ``sample`` resource.
    """
    sample = task.get_resource("sample")
    ascii_content = sample.content

    classifier = AsciiClassifier(ascii_content)
    classifier.classify()

    decoder = Decoder(ascii_content, classifier.verdict)
    try:
        decoder.decode()
    except binascii.Error:
        # Report through the service logger, like every other message of
        # this method, instead of the root logger.
        self.log.warning("Error while trying to decode base64.")
        return

    if not decoder.decoded:
        # Nothing was decoded, so there is nothing to report
        return

    self.log.info("Decoded possible executable")
    if decoder.decoded[:2] == b"MZ":
        task_params = {
            "type": "sample",
            "kind": "runnable",
            "stage": "recognized",
            "platform": "win32",
            "extension": "exe",
        }
    else:
        task_params = {"type": "sample", "kind": "raw"}

    new_sample = Resource(
        sample.name,
        decoder.decoded,
    )
    decoded_task = Task(task_params, payload={
        "sample": new_sample,
        "parent": sample
    })
    self.send_task(decoded_task)
def process(self, task: Task) -> None:
    """Unpack an archive sample and send every extracted child as a new task.

    The password is taken from the ``password`` payload or, failing that,
    from the first ``password`` attribute. The sample is written to a
    temporary directory and handed to ``unpack``; each child that has
    contents and does not exceed ``self.max_size`` is sent as a ``raw``
    sample with ``extraction_level`` increased by one. Nothing is extracted
    when the incoming ``extraction_level`` already exceeds
    ``self.max_depth``.

    Args:
        task: Incoming task carrying a ``sample`` resource.
    """
    sample = task.get_resource("sample")

    task_password = task.get_payload("password", default=None)
    attributes = task.get_payload("attributes", default={})
    if not task_password and attributes.get("password"):
        self.log.info("Accepting password from attributes")
        task_password = attributes.get("password")[0]

    # Always bind ``fname``: an unnamed sample previously left it undefined
    # and the code below failed with UnboundLocalError.
    fname = None
    try:
        if sample.name:
            fname = sample.name.encode("utf8")
            # Append the classifier extension only when the header is
            # present; a missing header used to raise on ``"." + None`` and
            # discard the sample name altogether.
            extension = task.headers.get("extension")
            if extension:
                suffix = ("." + extension).encode("utf-8")
                if not fname.endswith(suffix):
                    fname += suffix
    except Exception as e:
        # Best effort: fall back to an unnamed archive
        self.log.warning("Exception during extraction: %r", e)
        fname = None

    extraction_level = task.get_payload("extraction_level", 0)
    if extraction_level > self.max_depth:
        self.log.warning(
            "Maximum extraction depth exceeded. Can't extract this archive."
        )
        return

    # Read once from the incoming task; the loop below no longer rebinds
    # ``task`` to the tasks it creates.
    quality = task.headers.get("quality", "high")

    with tempfile.TemporaryDirectory() as dir_name:
        filepath = f"{dir_name}/{fname}"
        with open(filepath, "wb") as f:
            f.write(sample.content)

        archive_password = None
        if task_password is not None:
            archive_password = task_password.encode()

        unpacked = unpack(
            filename=fname,
            filepath=filepath.encode("utf-8"),
            password=archive_password,
        )

        try:
            fname = (unpacked.filename
                     and unpacked.filename.decode("utf8")) or unpacked.sha256
        except Exception as e:
            self.log.warning("Exception during extraction: %r", e)
            fname = "(unknown)"

        self.log.info("Got archive {}".format(fname))

        if not unpacked.children:
            self.log.warning("Don't know how to unpack this archive")
            return

        for child in unpacked.children:
            try:
                child_name = (child.filename
                              and child.filename.decode("utf8")) or child.sha256
            except UnicodeDecodeError as e:
                # One undecodable member name must not abort the whole
                # archive; identify the child by its hash instead.
                self.log.warning("Exception during extraction: %r", e)
                child_name = child.sha256

            self.log.info("Unpacked child {}".format(child_name))

            if not child.contents:
                self.log.warning(
                    "Child has no contents or protected by unknown password")
                continue

            if len(child.contents) > self.max_size:
                self.log.warning("Child is too big for further processing")
                continue

            child_task = Task(
                headers={
                    "type": "sample",
                    "kind": "raw",
                    "quality": quality,
                },
                payload={
                    "sample": Resource(child_name, child.contents),
                    "parent": sample,
                    "extraction_level": extraction_level + 1,
                },
            )
            self.send_task(child_task)
def _classify(self, task: Task) -> Optional[Dict[str, Optional[str]]]:
    """Determine kind, platform and extension of the task's sample.

    The decision is based on the libmagic description of the content, the
    file extension and, as a last resort, keyword heuristics on the first and
    last 2048 bytes. Checks are evaluated in a fixed order and the first one
    that matches wins.

    Args:
        task: Task carrying a ``sample`` resource and optionally ``magic``
            and ``mime`` payload values, used when the magic lookup fails.

    Returns:
        A dict with the keys ``magic``, ``mime``, ``kind``, ``platform`` and
        ``extension``, or ``None`` when the sample is not recognized.
    """
    sample = task.get_resource("sample")
    content = cast(bytes, sample.content)

    # Payload values are only a fallback: they are replaced by freshly
    # computed ones unless the lookup raises.
    magic = task.get_payload("magic") or ""
    magic_mime = task.get_payload("mime") or ""
    try:
        magic = self._magic(content, mime=False)
        magic_mime = self._magic(content, mime=True)
    except Exception as ex:
        self.log.warning(f"unable to get magic: {ex}")

    extension = self._get_extension(sample.name or "sample")
    sample_class = {
        "magic": magic or None,
        "mime": magic_mime or None,
        "kind": None,
        "platform": None,
        "extension": None,
    }

    def classified(**fields):
        # Record the given fields and hand back the classification
        sample_class.update(fields)
        return sample_class

    def zip_has_file(path: str) -> bool:
        try:
            ZipFile(BytesIO(content)).getinfo(path)
        except Exception:
            return False
        return True

    def zip_has_mac_app() -> bool:
        try:
            archive = ZipFile(BytesIO(content))
            return any(
                member.filename.lower().endswith(".app/contents/info.plist")
                for member in archive.filelist)
        except Exception:
            return False

    # Is PE file?
    if magic.startswith(("PE32", "MS-DOS executable PE32")):
        classified(kind="runnable", platform="win32", extension="exe")
        if magic.startswith("PE32+"):
            sample_class["platform"] = "win64"  # 64-bit only executable
        if "(DLL)" in magic:
            sample_class["extension"] = "dll"
        return sample_class

    # ZIP-contained files?
    if magic.startswith(("Zip archive data", "Java archive data (JAR)")):
        if extension == "apk" or zip_has_file("AndroidManifest.xml"):
            return classified(
                kind="runnable", platform="android", extension="apk")
        if extension == "jar" or zip_has_file("META-INF/MANIFEST.MF"):
            return classified(
                kind="runnable",
                platform="win32",  # Default platform should be Windows
                extension="jar",
            )

    # Dalvik Android files?
    if magic.startswith("Dalvik dex file") or extension == "dex":
        return classified(
            kind="runnable", platform="android", extension="dex")

    # Shockwave Flash?
    if magic.startswith("Macromedia Flash") or extension == "swf":
        return classified(kind="runnable", platform="win32", extension="swf")

    # Windows LNK?
    if magic.startswith("MS Windows shortcut") or extension == "lnk":
        return classified(kind="runnable", platform="win32", extension="lnk")

    # Is ELF file?
    if magic.startswith("ELF"):
        return classified(kind="runnable", platform="linux")

    # Is PKG file?
    if magic.startswith("xar archive") or extension == "pkg":
        return classified(kind="runnable", platform="macos", extension="pkg")

    # Is DMG file? (by extension, or by the marker bytes checked in the
    # last 512 bytes of the content)
    if extension == "dmg" or (
            len(content) > 512
            and content[-512:][:4] == b"koly"
            and content[-512:][8:12] == b"\x00\x00\x02\x00"):
        return classified(kind="runnable", platform="macos", extension="dmg")

    # Is mach-o file?
    if magic.startswith("Mach-O"):
        return classified(kind="runnable", platform="macos")

    # macos app within zip
    if magic.startswith("Zip archive data") and zip_has_mac_app():
        return classified(kind="runnable", platform="macos", extension="app")

    # Windows scripts (per extension)
    script_extensions = (
        "vbs", "vbe", "js", "jse", "wsh", "wsf", "hta", "cmd", "bat", "ps1",
    )
    if extension in script_extensions:
        return classified(
            kind="script", platform="win32", extension=extension)

    # Office documents
    office_extensions = {
        "doc": "Microsoft Word",
        "xls": "Microsoft Excel",
        "ppt": "Microsoft PowerPoint",
    }

    # Check RTF by libmagic
    if magic.startswith("Rich Text Format"):
        return classified(kind="document", platform="win32", extension="rtf")

    # Check Composite Document (doc/xls/ppt) by libmagic and extension
    if magic.startswith("Composite Document File"):
        # MSI installers are also CDFs
        if "MSI Installer" in magic:
            return classified(
                kind="runnable", platform="win32", extension="msi")

        # If not MSI, treat it like Office document
        classified(kind="document", platform="win32")

        for ext, typepart in office_extensions.items():
            if f"Name of Creating Application: {typepart}" in magic:
                sample_class["extension"] = ext
                return sample_class

        sample_class["extension"] = (
            extension if extension[:3] in office_extensions else "doc")
        return sample_class

    # Check docx/xlsx/pptx by libmagic
    for ext, typepart in office_extensions.items():
        if magic.startswith(typepart):
            return classified(
                kind="document", platform="win32", extension=ext + "x")

    # Check RTF by extension
    if extension == "rtf":
        return classified(kind="document", platform="win32", extension="rtf")

    # Finally check document type only by extension
    if extension[:3] in office_extensions:
        return classified(
            kind="document", platform="win32", extension=extension)

    # Unclassified Open XML documents
    if magic.startswith("Microsoft OOXML"):
        try:
            extn = classify_openxml(content)
            if extn:
                return classified(
                    kind="document", platform="win32", extension=extn)
        except Exception:
            self.log.exception("Error while trying to classify OOXML")

    # PDF files
    if magic.startswith("PDF document") or extension == "pdf":
        return classified(kind="document", platform="win32", extension="pdf")

    # Archives
    archive_assoc = {
        "7z": ("7-zip archive data",),
        "ace": ("ACE archive data",),
        "bz2": ("bzip2 compressed data",),
        "cab": ("Microsoft Cabinet archive data",),
        "gz": ("gzip compressed",),
        "iso": ("ISO 9660 CD-ROM",),
        "lz": ("lzip compressed data",),
        "tar": ("tar archive", "POSIX tar archive"),
        "rar": ("RAR archive data",),
        "udf": ("UDF filesystem data",),
        "xz": ("XZ compressed data",),
        "zip": ("Zip archive data",),
        "zlib": ("zlib compressed data",),
    }
    archive_extensions = (
        "ace", "zip", "rar", "tar", "cab", "gz", "7z", "bz2", "arj", "iso",
        "xz", "lz", "udf", "cab", "zlib",
    )

    def apply_archive_headers(archive_ext):
        headers = {"kind": "archive", "extension": archive_ext}
        if archive_ext == "xz":
            # libmagic >= 5.40 generates correct MIME type for XZ archives
            headers["mime"] = "application/x-xz"
        return classified(**headers)

    for archive_extension, prefixes in archive_assoc.items():
        if magic.startswith(prefixes):
            return apply_archive_headers(archive_extension)

    if extension in archive_extensions:
        return apply_archive_headers(extension)

    # E-mail
    email_assoc = {
        "msg": ("Microsoft Outlook Message",),
        "eml": ("multipart/mixed", "RFC 822 mail", "SMTP mail"),
    }
    for ext, patterns in email_assoc.items():
        if any(pattern in magic for pattern in patterns):
            return classified(kind="archive", extension=ext)

    if extension in email_assoc:
        return classified(kind="archive", extension=extension)

    # HTML
    if magic.startswith("HTML document"):
        return classified(kind="html")

    # Linux scripts
    if ("script" in magic and "executable" in magic) or extension == "sh":
        return classified(
            kind="script", platform="linux", extension=extension)

    # Content heuristics
    partial = content[:2048] + content[-2048:]

    # Dumped PE file heuristics (PE not recognized by libmagic)
    if b".text" in partial and b"This program cannot be run" in partial:
        return classified(kind="dump", platform="win32", extension="exe")

    if len(partial) > 0x40:
        # Little-endian 16-bit value at offset 0x3C is used as the offset
        # at which the "PE" marker is looked for.
        pe_offs = struct.unpack("<H", partial[0x3C:0x3E])[0]
        if partial[pe_offs:pe_offs + 2] == b"PE":
            return classified(kind="dump", platform="win32", extension="exe")

    if partial.startswith(b"MZ"):
        return classified(kind="dump", platform="win32", extension="exe")

    # Heuristics for scripts
    vbs_keywords = (
        "end function", "end if", "array(", "sub ", "on error ",
        "createobject", "execute",
    )
    js_keywords = (
        "function ", "function(", "this.", "this[", "new ", "createobject",
        "activexobject", "var ", "catch",
    )
    html_keywords = ("<!doctype", "<html", "<script")
    ps_keywords = (
        "powershell", "-nop", "bypass", "new-object", "invoke-expression",
        "frombase64string(", "| iex", "|iex",
    )

    try:
        try:
            partial_str = partial.decode(
                chardet.detect(partial)["encoding"]).lower()
        except Exception:
            self.log.warning("Heuristics disabled - unknown encoding")
        else:
            def hits(keywords):
                # Number of distinct keywords present in the text
                return sum(1 for keyword in keywords
                           if keyword in partial_str)

            if hits(html_keywords) >= 2:
                return classified(kind="html")

            if hits(vbs_keywords) >= 2:
                return classified(
                    kind="script", platform="win32", extension="vbs")

            # Powershell heuristics
            if any(keyword.lower() in partial_str
                   for keyword in ps_keywords):
                return classified(
                    kind="script", platform="win32", extension="ps1")

            # JS heuristics
            if hits(js_keywords) >= 2:
                return classified(
                    kind="script", platform="win32", extension="js")

            # JSE heuristics
            if re.match("#@~\\^[a-zA-Z0-9+/]{6}==", partial_str):
                return classified(
                    kind="script",
                    platform="win32",
                    extension="jse",  # jse is more possible than vbe
                )

        # Plain kinds recognized only by the magic prefix, in this order
        magic_kinds = (
            ("ASCII", "ascii"),
            ("ISO-8859", "iso-8859-1"),
            ("UTF-8", "utf-8"),
            ("PGP", "pgp"),
            ("pcap capture file", "pcap"),
        )
        for prefix, kind in magic_kinds:
            if magic.startswith(prefix):
                return classified(kind=kind)

        if magic.startswith("pcap") and "ng capture file" in magic:
            return classified(kind="pcapng")
    except Exception as e:
        self.log.exception(e)

    # If not recognized then unsupported
    return None
def process(self, task: Task):
    """Run the sample in the sandbox and publish the collected artifacts.

    Steps, in order: log basic facts about the sample, reject timeouts above
    the hard limit of 20 minutes, derive and validate the file name, prepare
    the working directories, pick a start command, install the hooks file,
    attempt the analysis up to three times, post-process dumps and IPT
    traces, write ``metadata.json`` and finally call ``self.send_analysis``.

    Args:
        task: Incoming task carrying a ``sample`` resource. The payload keys
            ``timeout``, ``file_name``, ``start_command`` and
            ``custom_hooks`` and the headers ``extension`` and ``quality``
            are optional.
    """
    # Gather basic facts
    sample = task.get_resource("sample")
    magic_output = magic.from_buffer(sample.content)
    sha256sum = hashlib.sha256(sample.content).hexdigest()

    self.log.info(f"Running on: {socket.gethostname()}")
    self.log.info(f"Sample SHA256: {sha256sum}")
    self.log.info(f"Analysis UID: {self.analysis_uid}")

    # Timeout sanity check
    hard_time_limit = 60 * 20
    timeout = task.payload.get('timeout') or self.default_timeout
    if timeout > hard_time_limit:
        self.log.error(
            "Tried to run the analysis for more than hard limit of %d seconds",
            hard_time_limit)
        return

    self.update_vnc_info()

    # Get sample extension. If none set, fall back to exe/dll
    extension = task.headers.get("extension", "exe").lower()
    if '(DLL)' in magic_output:
        extension = 'dll'
    self.log.info("Running file as %s", extension)

    # Prepare sample file name
    base_name = task.payload.get("file_name", "malwar")
    file_name = base_name + f".{extension}"
    # Alphanumeric, dot, underscore, dash
    if re.match(r"^[a-zA-Z0-9\._\-]+$", file_name) is None:
        self.log.error("Filename contains invalid characters")
        return
    self.log.info("Using file name %s", file_name)

    # workdir - configs, sample, etc.
    # outdir - analysis artifacts
    workdir, outdir = self._prepare_workdir()

    sample_path = os.path.join(workdir, file_name)
    sample.download_to_file(sample_path)

    # Try to come up with a start command for this file
    # or use the one provided by the sender
    generated_command = self._get_start_command(extension, sample,
                                                sample_path)
    start_command = task.payload.get("start_command", generated_command)
    if not start_command:
        self.log.error(
            "Unable to run malware sample. Could not generate any suitable"
            " command to run it.")
        return
    self.log.info("Start command: %s", start_command)

    # If task contains 'custom_hooks' override local defaults
    hooks_path = os.path.join(workdir, "hooks.txt")
    with open(hooks_path, "wb") as hooks:
        if task.has_payload("custom_hooks"):
            custom_hooks = task.get_resource("custom_hooks")
            assert custom_hooks.content is not None
            hooks.write(custom_hooks.content)
        else:
            default_hooks_path = os.path.join(ETC_DIR, "hooks.txt")
            with open(default_hooks_path, "rb") as default_hooks:
                hooks.write(default_hooks.read())

    metadata = {
        "sample_sha256": sha256sum,
        "magic_output": magic_output,
        "time_started": int(time.time())
    }

    max_attempts = 3
    for i in range(max_attempts):
        try:
            self.log.info(
                f"Trying to analyze sample (attempt {i + 1}/{max_attempts})"
            )
            metadata.update(
                self.analyze_sample(sample_path, workdir, outdir,
                                    start_command, timeout))
            break
        except Exception:
            self.log.exception("Analysis attempt failed. Retrying...")
    else:
        # Reached only when no attempt ended with ``break``
        self.log.error(f"Giving up after {max_attempts} failures...")
        return

    self.log.info("Analysis done. Collecting artifacts...")

    # Make sure dumps have a reasonable size
    dumps_dir = os.path.join(outdir, 'dumps')
    self.crop_dumps(dumps_dir, os.path.join(outdir, 'dumps.zip'))

    # Compress IPT traces, they're quite large however they compress well
    ipt_dir = os.path.join(outdir, 'ipt')
    self.compress_ipt(ipt_dir, os.path.join(outdir, 'ipt.zip'))

    metadata['time_finished'] = int(time.time())

    with open(os.path.join(outdir, 'metadata.json'), 'w') as f:
        json.dump(metadata, f)

    quality = task.headers.get("quality", "high")
    self.send_analysis(sample, outdir, metadata, quality)
def process(self, task: Task) -> None:
    """Extract embedded resources from an AutoIt binary and report them.

    The sample is matched against ``self.yara``; depending on the rule that
    matched, ``extract`` is called with the corresponding ``AutoItVersion``.
    Extracted ``.dll``/``.exe`` resources are sent as ``raw`` samples and
    ``script.au3`` as an analyzed script. A binary found inside the script
    by ``extract_binary`` is sent as ``autoit_drop.exe`` with the script as
    its parent. Other resources are ignored.

    Args:
        task: Incoming task carrying a ``sample`` resource.
    """
    sample = task.get_resource("sample")
    resources = None

    m = self.yara.match(data=sample.content)
    if "autoit_v3_00" in m:
        self.log.info("Found a possible autoit v3.00 binary")
        resources = extract(data=sample.content, version=AutoItVersion.EA05)
    elif "autoit_v3_26" in m:
        self.log.info("Found a possible autoit v3.26+ binary")
        resources = extract(data=sample.content, version=AutoItVersion.EA06)

    if not resources:
        return

    self.log.info("Found embedded data, reporting!")

    for res_name, res_data in resources:
        is_script = False
        if res_name.endswith((".dll", ".exe")):
            task_params = {
                "type": "sample",
                "kind": "raw",
            }
        elif res_name == "script.au3":
            is_script = True
            task_params = {
                "type": "sample",
                "kind": "script",
                "stage": "analyzed",
                "extension": "au3",
            }
        else:
            continue

        self.log.info("Sending a task with %s", res_name)
        script = Resource(res_name, res_data)
        resource_task = Task(task_params, payload={
            "sample": script,
            "parent": sample
        })
        self.send_task(resource_task)

        if not is_script:
            continue

        self.log.info("Looking for a binary embedded in the script")
        drop = extract_binary(res_data.decode())
        if drop:
            self.log.info("Found an embedded binary")
            drop_task = Task(
                {
                    "type": "sample",
                    "kind": "raw"
                },
                payload={
                    "sample": Resource(name="autoit_drop.exe", content=drop),
                    "parent": script,
                },
            )
            self.send_task(drop_task)
def process(self, task: Task) -> None:
    """Run every configured unpacker module on the sample.

    Each module in ``self.modules`` is passed to ``unpacker_module_worker``
    together with the sample and ``self.user_config``; every task it returns
    is sent on.

    Args:
        task: Incoming task carrying a ``sample`` resource.
    """
    sample = task.get_resource("sample")
    for module in self.modules:
        produced = unpacker_module_worker(sample, self.user_config, module)
        for produced_task in produced:
            self.send_task(produced_task)
def _classify(self, task: Task) -> Optional[Dict[str, str]]:
    """Build the headers describing the type of the task's sample.

    The decision is based on the libmagic description (taken from the
    ``magic`` payload or computed from the content), the file extension and,
    as a last resort, keyword heuristics on the first and last 2048 bytes.
    Checks are evaluated in a fixed order and the first one that matches
    wins.

    Args:
        task: Task carrying a ``sample`` resource.

    Returns:
        A dict that always contains ``type``, ``stage`` and ``quality`` plus
        the ``kind``/``platform``/``extension`` values that were determined,
        or ``None`` when the sample is not recognized.
    """
    sample = task.get_resource("sample")
    content = cast(bytes, sample.content)

    magic = task.get_payload("magic") or pymagic.from_buffer(content)
    extension = self._get_extension(sample.name or "sample")
    sample_type = {
        "type": "sample",
        "stage": "recognized",
        "quality": task.headers.get("quality", "high"),
    }

    def classified(**fields):
        # Record the given fields and hand back the classification
        sample_type.update(fields)
        return sample_type

    def zip_has_file(path: str) -> bool:
        try:
            ZipFile(BytesIO(content)).getinfo(path)
        except Exception:
            return False
        return True

    # Is PE file?
    if magic.startswith(("PE32", "MS-DOS executable PE32")):
        classified(kind="runnable", platform="win32", extension="exe")
        if magic.startswith("PE32+"):
            sample_type["platform"] = "win64"  # 64-bit only executable
        if "(DLL)" in magic:
            sample_type["extension"] = "dll"
        return sample_type

    # ZIP-contained files?
    if magic.startswith(("Zip archive data", "Java archive data (JAR)")):
        if extension == "apk" or zip_has_file("AndroidManifest.xml"):
            return classified(
                kind="runnable", platform="android", extension="apk")
        if extension == "jar" or zip_has_file("META-INF/MANIFEST.MF"):
            return classified(
                kind="runnable",
                platform="win32",  # Default platform should be Windows
                extension="jar",
            )

    # Dalvik Android files?
    if magic.startswith("Dalvik dex file") or extension == "dex":
        return classified(
            kind="runnable", platform="android", extension="dex")

    # Shockwave Flash?
    if magic.startswith("Macromedia Flash") or extension == "swf":
        return classified(kind="runnable", platform="win32", extension="swf")

    # Windows LNK?
    if magic.startswith("MS Windows shortcut") or extension == "lnk":
        return classified(kind="runnable", platform="win32", extension="lnk")

    # Is ELF file?
    if magic.startswith("ELF"):
        return classified(kind="runnable", platform="linux")

    # Windows scripts (per extension)
    script_extensions = (
        "vbs", "vbe", "js", "jse", "wsh", "wsf", "hta", "cmd", "bat", "ps1",
    )
    if extension in script_extensions:
        return classified(
            kind="script", platform="win32", extension=extension)

    # Office documents
    office_extensions = {
        "doc": "Microsoft Word",
        "xls": "Microsoft Excel",
        "ppt": "Microsoft PowerPoint",
    }

    # Check RTF by libmagic
    if magic.startswith("Rich Text Format"):
        return classified(kind="document", platform="win32", extension="rtf")

    # Check Composite Document (doc/xls/ppt) by libmagic and extension
    if magic.startswith("Composite Document File"):
        # MSI installers are also CDFs
        if "MSI Installer" in magic:
            return classified(
                kind="runnable", platform="win32", extension="msi")

        # If not MSI, treat it like Office document
        classified(kind="document", platform="win32")
        sample_type["extension"] = (
            extension if extension[:3] in office_extensions else "doc")
        return sample_type

    # Check docx/xlsx/pptx by libmagic
    for ext, typepart in office_extensions.items():
        if magic.startswith(typepart):
            return classified(
                kind="document", platform="win32", extension=ext + "x")

    # Check RTF by extension
    if extension == "rtf":
        return classified(kind="document", platform="win32", extension="rtf")

    # Finally check document type only by extension
    if extension[:3] in office_extensions:
        return classified(
            kind="document", platform="win32", extension=extension)

    # Unclassified Open XML documents
    if magic.startswith("Microsoft OOXML"):
        try:
            extn = classify_openxml(content)
            if extn:
                return classified(
                    kind="document", platform="win32", extension=extn)
        except Exception:
            self.log.exception("Error while trying to classify OOXML")

    # PDF files
    if magic.startswith("PDF document") or extension == "pdf":
        return classified(kind="document", platform="win32", extension="pdf")

    # Archives
    archive_assoc = {
        "7z": ("7-zip archive data",),
        "ace": ("ACE archive data",),
        "bz2": ("bzip2 compressed data",),
        "cab": ("Microsoft Cabinet archive data",),
        "gz": ("gzip compressed",),
        "iso": ("ISO 9660 CD-ROM",),
        "lz": ("lzip compressed data",),
        "tar": ("tar archive", "POSIX tar archive"),
        "rar": ("RAR archive data",),
        "udf": ("UDF filesystem data",),
        "xz": ("XZ compressed data",),
        "zip": ("Zip archive data",),
        "zlib": ("zlib compressed data",),
    }
    archive_extensions = (
        "ace", "zip", "rar", "tar", "cab", "gz", "7z", "bz2", "arj", "iso",
        "xz", "lz", "udf", "cab", "zlib",
    )

    # Magic prefixes are tried in the order of ``archive_extensions``;
    # extensions without a known prefix (e.g. "arj") are skipped here.
    for ext in archive_extensions:
        prefixes = archive_assoc.get(ext)
        if prefixes and magic.startswith(prefixes):
            return classified(kind="archive", extension=ext)

    if extension in archive_extensions:
        return classified(kind="archive", extension=extension)

    # E-mail
    email_assoc = {"msg": "Microsoft Outlook Message", "eml": "multipart/mixed"}
    for ext, pattern in email_assoc.items():
        if pattern in magic:
            return classified(kind="archive", extension=ext)

    if extension in email_assoc:
        return classified(kind="archive", extension=extension)

    # HTML
    if magic.startswith("HTML document"):
        return classified(kind="html")

    # Linux scripts
    if ("script" in magic and "executable" in magic) or extension == "sh":
        return classified(
            kind="script", platform="linux", extension=extension)

    # Content heuristics
    partial = content[:2048] + content[-2048:]

    # Dumped PE file heuristics (PE not recognized by libmagic)
    if b".text" in partial and b"This program cannot be run" in partial:
        return classified(kind="dump", platform="win32", extension="exe")

    if len(partial) > 0x40:
        # Little-endian 16-bit value at offset 0x3C is used as the offset
        # at which the "PE" marker is looked for.
        pe_offs = struct.unpack("<H", partial[0x3C:0x3E])[0]
        if partial[pe_offs:pe_offs + 2] == b"PE":
            return classified(kind="dump", platform="win32", extension="exe")

    if partial.startswith(b"MZ"):
        return classified(kind="dump", platform="win32", extension="exe")

    # Heuristics for scripts
    vbs_keywords = (
        "end function", "end if", "array(", "sub ", "on error ",
        "createobject", "execute",
    )
    js_keywords = (
        "function ", "function(", "this.", "this[", "new ", "createobject",
        "activexobject", "var ", "catch",
    )
    html_keywords = ("<!doctype", "<html", "<script")
    ps_keywords = (
        "powershell", "-nop", "bypass", "new-object", "invoke-expression",
        "frombase64string(", "| iex", "|iex",
    )

    try:
        try:
            partial_str = partial.decode(
                chardet.detect(partial)["encoding"]
            ).lower()
        except Exception:
            self.log.warning("Heuristics disabled - unknown encoding")
        else:
            def hits(keywords):
                # Number of distinct keywords present in the text
                return sum(1 for keyword in keywords
                           if keyword in partial_str)

            if hits(html_keywords) >= 2:
                return classified(kind="html")

            if hits(vbs_keywords) >= 2:
                return classified(
                    kind="script", platform="win32", extension="vbs")

            if hits(js_keywords) >= 2:
                return classified(
                    kind="script", platform="win32", extension="js")

            # JSE heuristics
            if re.match("#@~\\^[a-zA-Z0-9+/]{6}==", partial_str):
                return classified(
                    kind="script",
                    platform="win32",
                    extension="jse",  # jse is more possible than vbe
                )

            # Powershell heuristics
            if any(keyword.lower() in partial_str
                   for keyword in ps_keywords):
                return classified(
                    kind="script", platform="win32", extension="ps1")

        # Plain kinds recognized only by the magic prefix, in this order
        magic_kinds = (
            ("ASCII", "ascii"),
            ("ISO-8859", "iso-8859-1"),
            ("UTF-8", "utf-8"),
            ("PGP", "pgp"),
        )
        for prefix, kind in magic_kinds:
            if magic.startswith(prefix):
                return classified(kind=kind)
    except Exception as e:
        self.log.exception(e)

    # If not recognized then unsupported
    return None