def scan(self, file_object, options):
    """Collects TNEF stream metadata and extracts attachments as child files."""
    self.metadata["total"] = {"attachments": 0, "extracted": 0}
    tnef = tnefparse.TNEF(file_object.data)
    self.metadata.setdefault("objectNames", [])

    # Descriptive TNEF object names that map directly onto metadata keys.
    name_to_key = {
        "Subject": "subject",
        "Message ID": "messageId",
        "Message Class": "messageClass",
    }

    tnef_objects = getattr(tnef, "objects", None)
    if tnef_objects is not None:
        for tnef_object in tnef_objects:
            descriptive_name = tnefparse.TNEF.codes.get(tnef_object.name)
            if descriptive_name not in self.metadata["objectNames"]:
                self.metadata["objectNames"].append(descriptive_name)
            tnef_data = tnef_object.data.strip(b"\0") or None
            if tnef_data is not None:
                metadata_key = name_to_key.get(descriptive_name)
                if metadata_key is not None:
                    self.metadata[metadata_key] = tnef_data

    tnef_attachments = getattr(tnef, "attachments", None)
    if tnef_attachments is not None:
        self.metadata["total"]["attachments"] = len(tnef_attachments)
        for attachment in tnef_attachments:
            child_fo = objects.StrelkaFile(
                data=attachment.data,
                filename=f"{self.scanner_name}::{attachment.name.decode()}",
                depth=file_object.depth + 1,
                parent_uid=file_object.uid,
                root_uid=file_object.root_uid,
                parent_hash=file_object.hash,
                root_hash=file_object.root_hash,
                source=self.scanner_name)
            self.children.append(child_fo)
            self.metadata["total"]["extracted"] += 1

    tnef_html = getattr(tnef, "htmlbody", None)
    if tnef_html is not None:
        # NOTE(review): tnef_html.data assumes htmlbody exposes a .data
        # attribute -- confirm against the tnefparse version in use.
        child_fo = objects.StrelkaFile(
            data=tnef_html.data,
            filename=f"{self.scanner_name}::htmlbody",
            depth=file_object.depth + 1,
            parent_uid=file_object.uid,
            root_uid=file_object.root_uid,
            parent_hash=file_object.hash,
            root_hash=file_object.root_hash,
            source=self.scanner_name)
        self.children.append(child_fo)
def scan(self, file_object, options):
    """Decompresses LZMA/XZ data and extracts the result as a child file."""
    try:
        with io.BytesIO(file_object.data) as lzma_object:
            with lzma.LZMAFile(filename=lzma_object) as lzma_file:
                try:
                    decompressed = lzma_file.read()
                    size = len(decompressed)
                    self.metadata["decompressedSize"] = size
                    child_fo = objects.StrelkaFile(
                        data=decompressed,
                        filename=f"{self.scanner_name}::size_{size}",
                        depth=file_object.depth + 1,
                        parent_uid=file_object.uid,
                        root_uid=file_object.root_uid,
                        parent_hash=file_object.hash,
                        root_hash=file_object.root_hash,
                        source=self.scanner_name)
                    self.children.append(child_fo)
                except EOFError:
                    # Stream ended before the compressed payload completed.
                    file_object.flags.append(
                        f"{self.scanner_name}::eof_error")
    except lzma.LZMAError:
        file_object.flags.append(f"{self.scanner_name}::lzma_error")
def scan(self, file_object, options):
    """Runs `upx -d` against the file and extracts the unpacked binary."""
    tempfile_directory = options.get("tempfile_directory", "/tmp/")
    with tempfile.NamedTemporaryFile(dir=tempfile_directory) as strelka_file:
        strelka_filename = strelka_file.name
        strelka_file.write(file_object.data)
        strelka_file.flush()
        upx_filename = strelka_filename + "_upx"
        upx_returncode = subprocess.call(
            ["upx", "-d", strelka_filename, "-o", upx_filename],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL)
        if upx_returncode != 0:
            file_object.flags.append(
                f"{self.scanner_name}::return_code_{upx_returncode}")
        else:
            with open(upx_filename, "rb") as fin:
                unpacked = fin.read()
            unpacked_size = len(unpacked)
            # Unpacked output should be larger than the packed input;
            # anything else suggests the unpack was a no-op.
            if unpacked_size > file_object.size:
                child_fo = objects.StrelkaFile(
                    data=unpacked,
                    filename=f"{self.scanner_name}::size_{unpacked_size}",
                    depth=file_object.depth + 1,
                    parent_uid=file_object.uid,
                    root_uid=file_object.root_uid,
                    parent_hash=file_object.hash,
                    root_hash=file_object.root_hash,
                    source=self.scanner_name)
                self.children.append(child_fo)
            os.remove(upx_filename)
def scan(self, file_object, options):
    """Extracts file entries from archives supported by libarchive."""
    file_limit = options.get("limit", 1000)
    self.metadata["total"] = {"files": 0, "extracted": 0}
    try:
        with libarchive.memory_reader(file_object.data) as archive:
            for entry in archive:
                self.metadata["total"]["files"] += 1
                if not entry.isfile:
                    continue
                if self.metadata["total"]["extracted"] >= file_limit:
                    # Keep counting files, but stop extracting.
                    continue
                entry_data = b"".join(entry.get_blocks())
                if entry.pathname:
                    child_filename = f"{self.scanner_name}::{entry.pathname}"
                else:
                    child_filename = f"{self.scanner_name}::size_{len(entry_data)}"
                child_fo = objects.StrelkaFile(
                    data=entry_data,
                    filename=child_filename,
                    depth=file_object.depth + 1,
                    parent_uid=file_object.uid,
                    root_uid=file_object.root_uid,
                    parent_hash=file_object.hash,
                    root_hash=file_object.root_hash,
                    source=self.scanner_name)
                self.children.append(child_fo)
                self.metadata["total"]["extracted"] += 1
    except libarchive.ArchiveError:
        file_object.flags.append(
            f"{self.scanner_name}::libarchive_archive_error")
def scan(self, file_object, options):
    """Extracts certificates from PKCS7 data (DER or PEM)."""
    self.metadata["total"] = {"certificates": 0, "extracted": 0}
    # DER-encoded PKCS7 starts with an ASN.1 SEQUENCE tag (0x30 == ASCII "0").
    if file_object.data[:1] == b"0":
        crypto_file_type = crypto.FILETYPE_ASN1
        self.metadata["cryptoType"] = "der"
    else:
        crypto_file_type = crypto.FILETYPE_PEM
        self.metadata["cryptoType"] = "pem"
    try:
        pkcs7 = crypto.load_pkcs7_data(crypto_file_type, file_object.data)
        certificates = pkcs7.get_certificates()
        if certificates is not None:
            self.metadata["total"]["certificates"] = len(certificates)
            for certificate in certificates:
                dumped_certificate = crypto.dump_certificate(
                    crypto_file_type, certificate)
                serial_number = certificate.get_serial_number()
                child_fo = objects.StrelkaFile(
                    data=dumped_certificate,
                    filename=f"{self.scanner_name}::serial_number_{serial_number}",
                    depth=file_object.depth + 1,
                    parent_uid=file_object.uid,
                    root_uid=file_object.root_uid,
                    parent_hash=file_object.hash,
                    root_hash=file_object.root_hash,
                    source=self.scanner_name)
                self.children.append(child_fo)
                self.metadata["total"]["extracted"] += 1
    except crypto.Error:
        file_object.flags.append(f"{self.scanner_name}::load_pkcs7_error")
def scan(self, file_object, options):
    """Decompresses compressed SWF (CWS/ZWS) into an uncompressed FWS child."""
    with io.BytesIO(file_object.data) as swf_object:
        # Bytes 4-7 hold the little-endian uncompressed length of the file.
        swf_object.seek(4)
        swf_size = struct.unpack("<i", swf_object.read(4))[0]
        swf_object.seek(0)
        magic = swf_object.read(3)
        # Rebuild an uncompressed header: "FWS" + version byte + length.
        child_file = b"FWS" + swf_object.read(5)
        if magic == b"CWS":
            self.metadata["type"] = "CWS"
            try:
                # zlib stream begins right after the 8-byte header.
                child_file += zlib.decompress(
                    swf_object.read())[:swf_size - 8]
                child_fo = objects.StrelkaFile(
                    data=child_file,
                    filename=f"{self.scanner_name}::size_{len(child_file)}",
                    depth=file_object.depth + 1,
                    parent_uid=file_object.uid,
                    root_uid=file_object.root_uid,
                    parent_hash=file_object.hash,
                    root_hash=file_object.root_hash,
                    source=self.scanner_name)
                self.children.append(child_fo)
            except zlib.error:
                file_object.flags.append(f"{self.scanner_name}::zlib_error")
        elif magic == b"ZWS":
            self.metadata["type"] = "ZWS"
            # LZMA payload begins at offset 12 (header + scratch bytes).
            swf_object.seek(12)
            child_file += pylzma.decompress(
                swf_object.read())[:swf_size - 8]
            child_fo = objects.StrelkaFile(
                data=child_file,
                filename=f"{self.scanner_name}::size_{len(child_file)}",
                depth=file_object.depth + 1,
                parent_uid=file_object.uid,
                root_uid=file_object.root_uid,
                parent_hash=file_object.hash,
                root_hash=file_object.root_hash,
                source=self.scanner_name)
            self.children.append(child_fo)
        elif magic == b"FWS":
            # Already uncompressed; record the type only.
            self.metadata["type"] = "FWS"
def scan(self, file_object, options):
    """Extracts VBA macros as child files and records olevba analysis."""
    analyze_macros = options.get("analyze_macros", True)
    self.metadata["total"] = {"files": 0, "extracted": 0}
    try:
        vba_parser = olevba3.VBA_Parser(filename=file_object.filename,
                                        data=file_object.data)
        if vba_parser.detect_vba_macros():
            extracted_macros = list(vba_parser.extract_macros())
            self.metadata["total"]["files"] = len(extracted_macros)
            for (_, _, vba_filename, vba_code) in extracted_macros:
                child_fo = objects.StrelkaFile(
                    data=vba_code,
                    filename=f"{self.scanner_name}::{vba_filename}",
                    depth=file_object.depth + 1,
                    parent_uid=file_object.uid,
                    root_uid=file_object.root_uid,
                    parent_hash=file_object.hash,
                    root_hash=file_object.root_hash,
                    source=self.scanner_name)
                self.children.append(child_fo)
                self.metadata["total"]["extracted"] += 1
            if analyze_macros:
                # olevba analysis categories mapped to metadata keys.
                keyword_keys = {
                    "AutoExec": "autoExec",
                    "Base64 String": "base64",
                    "Dridex String": "dridex",
                    "Hex String": "hex",
                    "IOC": "ioc",
                    "Suspicious": "suspicious",
                }
                for metadata_key in keyword_keys.values():
                    self.metadata.setdefault(metadata_key, [])
                for (macro_type, keyword, _) in vba_parser.analyze_macros():
                    metadata_key = keyword_keys.get(macro_type)
                    if metadata_key is not None:
                        self.metadata[metadata_key].append(keyword)
        vba_parser.close()
    except olevba3.FileOpenError:
        file_object.flags.append(f"{self.scanner_name}::file_open_error")
def scan(self, file_object, options):
    """Collects docx core properties and optionally extracts document text."""
    extract_text = options.get("extract_text", False)
    with io.BytesIO(file_object.data) as docx_object:
        docx_file = docx.Document(docx_object)
        core_properties = docx_file.core_properties
        # (attribute, metadata key, datetime?) in original insertion order;
        # datetime values are serialized to second precision.
        property_fields = [
            ("author", "author", False),
            ("category", "category", False),
            ("comments", "comments", False),
            ("content_status", "contentStatus", False),
            ("created", "created", True),
            ("identifier", "identifier", False),
            ("keywords", "keywords", False),
            ("language", "language", False),
            ("last_modified_by", "lastModifiedBy", False),
            ("last_printed", "lastPrinted", True),
            ("modified", "modified", True),
            ("revision", "revision", False),
            ("subject", "subject", False),
            ("title", "title", False),
            ("version", "version", False),
        ]
        for (attr, metadata_key, is_datetime) in property_fields:
            value = getattr(core_properties, attr)
            if value is not None:
                if is_datetime:
                    value = value.isoformat(timespec="seconds")
                self.metadata[metadata_key] = value
        if extract_text:
            document_text = "".join(paragraph.text
                                    for paragraph in docx_file.paragraphs)
            child_fo = objects.StrelkaFile(
                data=document_text,
                filename=f"{self.scanner_name}::text",
                depth=file_object.depth + 1,
                parent_uid=file_object.uid,
                root_uid=file_object.root_uid,
                parent_hash=file_object.hash,
                root_hash=file_object.root_hash,
                source=self.scanner_name)
            self.children.append(child_fo)
def _recurse_node(self, node, xml_args, file_object):
    """Recursively parses XML file.

    The XML file is recursively parsed down every node tree, collecting
    tag/namespace metadata for every element and, depending on xml_args,
    either recording tag text as metadata or extracting it as a child file.

    Args:
        node: node to be recursively parsed.
        xml_args: options set by the scanner that affect XML parsing
            (expects "metadata_tags" and "extract_tags" collections).
        file_object: file object being scanned.
    """
    if node is not None:
        # Real elements have a string (subscriptable) tag; comments and
        # processing instructions have a callable tag and are skipped.
        if hasattr(node.tag, "__getitem__"):
            if node.tag.startswith("{"):
                # Namespace-qualified tag: "{namespace}tag".
                namespace, separator, tag = node.tag[1:].partition("}")
            else:
                namespace = None
                tag = node.tag
            self.metadata["total"]["tags"] += 1
            if (namespace is not None and
                    namespace not in self.metadata["namespaces"]):
                self.metadata["namespaces"].append(namespace)
            if tag not in self.metadata["tags"]:
                self.metadata["tags"].append(tag)
            # Prefer the "name" attribute, fall back to element text.
            text = node.attrib.get("name", node.text)
            if text is not None:
                if (xml_args["metadata_tags"] and
                        tag in xml_args["metadata_tags"]):
                    tag_data = {"tag": tag, "text": text.strip()}
                    if tag_data not in self.metadata["tagData"]:
                        self.metadata["tagData"].append(tag_data)
                elif (xml_args["extract_tags"] and
                        tag in xml_args["extract_tags"]):
                    child_filename = f"{self.scanner_name}::{tag}"
                    child_fo = objects.StrelkaFile(
                        data=text,
                        filename=child_filename,
                        depth=file_object.depth + 1,
                        parent_uid=file_object.uid,
                        root_uid=file_object.root_uid,
                        parent_hash=file_object.hash,
                        root_hash=file_object.root_hash,
                        source=self.scanner_name)
                    self.children.append(child_fo)
                    self.metadata["total"]["extracted"] += 1
        # Fix: the original called self._recurse_node(self, child, ...),
        # passing self twice through a bound method and raising TypeError
        # on any node with children. Iterating the node directly also
        # replaces getchildren(), which was removed in Python 3.9.
        for child in node:
            self._recurse_node(child, xml_args, file_object)
    return
def scan(self, file_object, options):
    """Decompresses gzip data and extracts the result as a child file."""
    with io.BytesIO(file_object.data) as gzip_object:
        with gzip.GzipFile(fileobj=gzip_object) as gzip_file:
            decompressed = gzip_file.read()
            size = len(decompressed)
            self.metadata["decompressedSize"] = size
            child_fo = objects.StrelkaFile(
                data=decompressed,
                filename=f"{self.scanner_name}::size_{size}",
                depth=file_object.depth + 1,
                parent_uid=file_object.uid,
                root_uid=file_object.root_uid,
                parent_hash=file_object.hash,
                root_hash=file_object.root_hash,
                source=self.scanner_name)
            self.children.append(child_fo)
def scan(self, file_object, options):
    """Extracts regular files from tar archives.

    Options:
        limit: maximum number of members to extract (default 1000).

    Flags ``key_error`` when a member cannot be resolved and
    ``tarfile_read_error`` when the archive cannot be opened.
    """
    file_limit = options.get("limit", 1000)
    self.metadata["total"] = {"files": 0, "extracted": 0}
    with io.BytesIO(file_object.data) as tar_object:
        try:
            with tarfile.open(fileobj=tar_object) as tar_file:
                tar_members = tar_file.getmembers()
                self.metadata["total"]["files"] = len(tar_members)
                for tar_member in tar_members:
                    # Fix: TarInfo.isfile is a method -- the original
                    # tested the bound method object, which is always
                    # truthy, so the regular-file check never filtered.
                    if tar_member.isfile():
                        if self.metadata["total"]["extracted"] >= file_limit:
                            break
                        try:
                            extract_file = tar_file.extractfile(tar_member)
                            if extract_file is not None:
                                child_file = extract_file.read()
                                if tar_member.name:
                                    child_filename = f"{self.scanner_name}::{tar_member.name}"
                                else:
                                    child_filename = f"{self.scanner_name}::size_{len(child_file)}"
                                child_fo = objects.StrelkaFile(
                                    data=child_file,
                                    filename=child_filename,
                                    depth=file_object.depth + 1,
                                    parent_uid=file_object.uid,
                                    root_uid=file_object.root_uid,
                                    parent_hash=file_object.hash,
                                    root_hash=file_object.root_hash,
                                    source=self.scanner_name)
                                self.children.append(child_fo)
                                self.metadata["total"]["extracted"] += 1
                        except KeyError:
                            file_object.flags.append(
                                f"{self.scanner_name}::key_error")
        except tarfile.ReadError:
            file_object.flags.append(
                f"{self.scanner_name}::tarfile_read_error")
def scan(self, file_object, options):
    """Extracts data appended after a GIF trailer (0x00 0x3B)."""
    trailer = b"\x00\x3b"
    if file_object.data.endswith(trailer):
        # Well-formed GIF: nothing appended after the trailer.
        return
    trailer_index = file_object.data.rfind(trailer)
    if trailer_index == -1:
        file_object.flags.append(f"{self.scanner_name}::no_trailer")
        return
    trailer_data = file_object.data[trailer_index + 2:]
    if trailer_data:
        self.metadata["trailerIndex"] = trailer_index
        child_fo = objects.StrelkaFile(
            data=trailer_data,
            filename=f"{self.scanner_name}::size_{len(trailer_data)}",
            depth=file_object.depth + 1,
            parent_uid=file_object.uid,
            root_uid=file_object.root_uid,
            parent_hash=file_object.hash,
            root_hash=file_object.root_hash,
            source=self.scanner_name)
        self.children.append(child_fo)
def scan(self, file_object, options):
    """Decompresses bzip2 data and extracts the result as a child file."""
    with io.BytesIO(file_object.data) as bzip2_object:
        with bz2.BZ2File(filename=bzip2_object) as bzip2_file:
            try:
                decompressed = bzip2_file.read()
                size = len(decompressed)
                self.metadata["decompressedSize"] = size
                child_fo = objects.StrelkaFile(
                    data=decompressed,
                    filename=f"{self.scanner_name}::size_{size}",
                    depth=file_object.depth + 1,
                    parent_uid=file_object.uid,
                    root_uid=file_object.root_uid,
                    parent_hash=file_object.hash,
                    root_hash=file_object.root_hash,
                    source=self.scanner_name)
                self.children.append(child_fo)
            except OSError:
                # Raised on invalid or truncated bzip2 streams.
                file_object.flags.append(f"{self.scanner_name}::os_error")
def scan(self, file_object, options):
    """Extracts files from ZIP archives."""
    file_limit = options.get("limit", 1000)
    self.metadata["total"] = {"files": 0, "extracted": 0}
    with io.BytesIO(file_object.data) as zip_object:
        try:
            with zipfile.ZipFile(zip_object) as zip_archive:
                names = zip_archive.namelist()
                self.metadata["total"]["files"] = len(names)
                for name in names:
                    if name.endswith("/"):
                        continue  # directory entry, nothing to extract
                    if self.metadata["total"]["extracted"] >= file_limit:
                        break
                    try:
                        child_file = zip_archive.read(name)
                        child_fo = objects.StrelkaFile(
                            data=child_file,
                            filename=f"{self.scanner_name}::{name}",
                            depth=file_object.depth + 1,
                            parent_uid=file_object.uid,
                            root_uid=file_object.root_uid,
                            parent_hash=file_object.hash,
                            root_hash=file_object.root_hash,
                            source=self.scanner_name)
                        self.children.append(child_fo)
                        self.metadata["total"]["extracted"] += 1
                    except NotImplementedError:
                        file_object.flags.append(
                            f"{self.scanner_name}::unsupported_compression")
                    except RuntimeError:
                        file_object.flags.append(
                            f"{self.scanner_name}::runtime_error")
                    except ValueError:
                        file_object.flags.append(
                            f"{self.scanner_name}::value_error")
                    except zlib.error:
                        file_object.flags.append(
                            f"{self.scanner_name}::zlib_error")
        except zipfile.BadZipFile:
            file_object.flags.append(f"{self.scanner_name}::bad_zip_file")
def scan(self, file_object, options):
    """OCRs image data with tesseract, storing the words as metadata and
    optionally extracting the raw text as a child file.

    Fix: the original removed the tesseract output file unconditionally,
    which raises FileNotFoundError when tesseract fails before creating
    it; removal is now guarded by an existence check.

    Options:
        extract_text: extract OCR text as a child file (default False).
        tempfile_directory: directory for scratch files (default /tmp/).
    """
    extract_text = options.get("extract_text", False)
    tempfile_directory = options.get("tempfile_directory", "/tmp/")
    with tempfile.NamedTemporaryFile(dir=tempfile_directory) as strelka_file:
        strelka_filename = strelka_file.name
        strelka_file.write(file_object.data)
        strelka_file.flush()
        with tempfile.NamedTemporaryFile(
                dir=tempfile_directory) as tesseract_file:
            tesseract_filename = tesseract_file.name
            # tesseract appends ".txt" to the output base name it is given.
            tesseract_txt_filename = f"{tesseract_filename}.txt"
            tesseract_returncode = subprocess.call(
                ["tesseract", strelka_filename, tesseract_filename],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL)
            if tesseract_returncode == 0:
                with open(tesseract_txt_filename, "rb") as tesseract_txt:
                    ocr_file = tesseract_txt.read()
                if ocr_file:
                    self.metadata["text"] = ocr_file.split()
                    if extract_text:
                        child_fo = objects.StrelkaFile(
                            data=ocr_file,
                            filename=f"{self.scanner_name}::text",
                            depth=file_object.depth + 1,
                            parent_uid=file_object.uid,
                            root_uid=file_object.root_uid,
                            parent_hash=file_object.hash,
                            root_hash=file_object.root_hash,
                            source=self.scanner_name)
                        self.children.append(child_fo)
            else:
                file_object.flags.append(
                    f"{self.scanner_name}::return_code_{tesseract_returncode}"
                )
            # Only remove the output if tesseract actually produced it.
            if os.path.exists(tesseract_txt_filename):
                os.remove(tesseract_txt_filename)
def scan(self, file_object, options):
    """Extracts files from RAR archives.

    Fix: the original read each member *before* checking whether it
    needed a password, so encrypted members raised an exception from
    rarfile instead of being flagged. The member is now read only after
    the password check passes.

    Options:
        limit: maximum number of members to extract (default 1000).
    """
    file_limit = options.get("limit", 1000)
    self.metadata["total"] = {"files": 0, "extracted": 0}
    with io.BytesIO(file_object.data) as rar_object:
        with rarfile.RarFile(rar_object) as rar_archive:
            rar_info_list = rar_archive.infolist()
            self.metadata["total"]["files"] = len(rar_info_list)
            for rar_info in rar_info_list:
                if rar_info.isdir():
                    continue
                if self.metadata["total"]["extracted"] >= file_limit:
                    break
                child_info = rar_archive.getinfo(rar_info)
                if not child_info.needs_password():
                    child_file = rar_archive.read(rar_info)
                    rar_metadata = {
                        "scanRarHostOs": HOST_OS_MAPPING[child_info.host_os]
                    }
                    child_fo = objects.StrelkaFile(
                        data=child_file,
                        filename=f"{self.scanner_name}::{child_info.filename}",
                        depth=file_object.depth + 1,
                        parent_uid=file_object.uid,
                        root_uid=file_object.root_uid,
                        parent_hash=file_object.hash,
                        root_hash=file_object.root_hash,
                        source=self.scanner_name,
                        external_metadata=rar_metadata)
                    self.children.append(child_fo)
                    self.metadata["total"]["extracted"] += 1
                else:
                    file_object.flags.append(
                        f"{self.scanner_name}::password_protected")
def scan(self, file_object, options):
    """Parses email messages, recording headers and extracting MIME parts."""
    self.metadata["total"] = {"parts": 0, "extracted": 0}
    message = email.message_from_string(
        file_object.data.decode("UTF-8", "replace"))

    self.metadata.setdefault("headers", [])
    for (header, value) in message.items():
        header_entry = {
            "header": header,
            "value": objects.normalize_whitespace(value.strip()),
        }
        if header_entry not in self.metadata["headers"]:
            self.metadata["headers"].append(header_entry)

    self.metadata.setdefault("parts", [])
    for (index, part) in enumerate(message.walk()):
        self.metadata["total"]["parts"] += 1
        payload = part.get_payload(decode=True)
        if payload is None:
            # Multipart containers have no decodable payload of their own.
            continue
        part_filename = part.get_filename()
        if part_filename is not None:
            child_filename = f"{self.scanner_name}::{part_filename}"
            self.metadata["parts"].append(part_filename)
        else:
            child_filename = f"{self.scanner_name}::part_{index}"
        child_fo = objects.StrelkaFile(
            data=payload,
            filename=child_filename,
            depth=file_object.depth + 1,
            parent_uid=file_object.uid,
            root_uid=file_object.root_uid,
            parent_hash=file_object.hash,
            root_hash=file_object.root_hash,
            source=self.scanner_name,
            external_flavors=[part.get_content_type()])
        self.children.append(child_fo)
        self.metadata["total"]["extracted"] += 1
def scan(self, file_object, options):
    """Extracts embedded objects from RTF documents.

    Fixes:
      * Uses enumerate() instead of rtf.objects.index(object), which was
        O(n) per iteration and returned the wrong index when duplicate
        objects appear in the document.
      * Raw-object child filenames now use the "::" separator used by
        every other child filename (the original used a single ":").

    Options:
        limit: maximum number of objects to extract (default 1000).
    """
    file_limit = options.get("limit", 1000)
    self.metadata["total"] = {"objects": 0, "extracted": 0}
    rtf = rtfobj.RtfObjParser(file_object.data)
    rtf.parse()
    self.metadata["total"]["objects"] = len(rtf.objects)
    for (index, rtf_object) in enumerate(rtf.objects):
        if self.metadata["total"]["extracted"] >= file_limit:
            break
        if rtf_object.is_package:
            # OLE package: carries its own embedded filename.
            child_file = rtf_object.olepkgdata
            child_filename = f"{self.scanner_name}::{rtf_object.filename}"
        elif rtf_object.is_ole:
            child_file = rtf_object.oledata
            child_filename = f"{self.scanner_name}::object_{index}"
        else:
            # Not recognized as OLE; extract the raw bytes.
            child_file = rtf_object.rawdata
            child_filename = f"{self.scanner_name}::object_{index}"
        child_fo = objects.StrelkaFile(
            data=child_file,
            filename=child_filename,
            depth=file_object.depth + 1,
            parent_uid=file_object.uid,
            root_uid=file_object.root_uid,
            parent_hash=file_object.hash,
            root_hash=file_object.root_hash,
            source=self.scanner_name)
        self.children.append(child_fo)
        self.metadata["total"]["extracted"] += 1
def scan(self, file_object, options):
    """Converts Word documents to plain text with antiword and extracts
    the text as a child file.

    Improvement: uses subprocess.run instead of a bare
    Popen(...).communicate(), which leaked the process handle and is the
    discouraged low-level API for simple one-shot invocations.

    Options:
        tempfile_directory: directory for scratch files (default /tmp/).
    """
    tempfile_directory = options.get("tempfile_directory", "/tmp/")
    with tempfile.NamedTemporaryFile(dir=tempfile_directory) as strelka_file:
        strelka_file.write(file_object.data)
        strelka_file.flush()
        completed = subprocess.run(
            ["antiword", strelka_file.name],
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL)
        if completed.stdout:
            child_fo = objects.StrelkaFile(
                data=completed.stdout,
                filename=f"{self.scanner_name}::text",
                depth=file_object.depth + 1,
                parent_uid=file_object.uid,
                root_uid=file_object.root_uid,
                parent_hash=file_object.hash,
                root_hash=file_object.root_hash,
                source=self.scanner_name)
            self.children.append(child_fo)
def scan(self, file_object, options):
    """Parses PDF documents with pdfminer.

    Walks every xref object in the document: counts totals, flags
    auto-action and embedded-JavaScript dictionary keys, collects unique
    annotated URIs, and extracts stream objects as child files. When
    extract_text is enabled, the rendered page text is also extracted as
    a child file.

    Options:
        extract_text: extract page text as a child file (default False).
        limit: maximum number of objects to extract (default 2000).
    """
    extract_text = options.get("extract_text", False)
    file_limit = options.get("limit", 2000)
    self.metadata["total"] = {"objects": 0, "extracted": 0}
    extracted_objects = set()
    try:
        with io.BytesIO(file_object.data) as pdf_object:
            parsed_pdf = pdfparser.PDFParser(pdf_object)
            pdf_document = pdfdocument.PDFDocument(parsed_pdf)
            self.metadata.setdefault("annotatedUris", [])
            for xref in pdf_document.xrefs:
                for object_id in xref.get_objids():
                    self.metadata["total"]["objects"] += 1
                    try:
                        object = pdf_document.getobj(object_id)
                        if isinstance(object, dict):
                            for (key, value) in object.items():
                                # Automatic actions and embedded JavaScript
                                # are common maldoc indicators.
                                if key in ["AA", "OpenAction"]:
                                    file_object.flags.append(
                                        f"{self.scanner_name}::auto_action")
                                if key in ["JS", "Javascript"]:
                                    file_object.flags.append(
                                        f"{self.scanner_name}::javascript_embedded")
                                try:
                                    # "A" entries are annotation actions;
                                    # collect unique URI targets.
                                    if key == "A":
                                        uri = value.get("URI")
                                        if (uri is not None and
                                                uri not in self.metadata["annotatedUris"]):
                                            self.metadata["annotatedUris"].append(uri)
                                except AttributeError:
                                    # value is not dict-like; no URI to read.
                                    pass
                        if self.metadata["total"]["extracted"] >= file_limit:
                            # Keep counting/flagging objects, stop extracting.
                            continue
                        if isinstance(object, pdftypes.PDFStream):
                            try:
                                child_filename = f"{self.scanner_name}::object_{object_id}"
                                child_fo = objects.StrelkaFile(
                                    data=object.get_data(),
                                    filename=child_filename,
                                    depth=file_object.depth + 1,
                                    parent_uid=file_object.uid,
                                    root_uid=file_object.root_uid,
                                    parent_hash=file_object.hash,
                                    root_hash=file_object.root_hash,
                                    source=self.scanner_name)
                                # An object id can appear in multiple xrefs;
                                # extract each id only once.
                                if object_id not in extracted_objects:
                                    self.children.append(child_fo)
                                    extracted_objects.add(object_id)
                                    self.metadata["total"]["extracted"] += 1
                            except TypeError:
                                file_object.flags.append(
                                    f"{self.scanner_name}::type_error_{object_id}")
                            except struct.error:
                                file_object.flags.append(
                                    f"{self.scanner_name}::struct_error_{object_id}")
                            except ValueError:
                                file_object.flags.append(
                                    f"{self.scanner_name}::value_error_{object_id}")
                    except pdftypes.PDFObjectNotFound:
                        file_object.flags.append(
                            f"{self.scanner_name}::object_not_found_{object_id}")
                    except pdftypes.PDFNotImplementedError:
                        file_object.flags.append(
                            f"{self.scanner_name}::not_implemented_error_{object_id}")
                    except pdftypes.PSSyntaxError:
                        file_object.flags.append(
                            f"{self.scanner_name}::ps_syntax_error_{object_id}")
            if extract_text:
                # Render page text via pdfminer's layout engine.
                rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
                retstr = io.StringIO()
                la_params = layout.LAParams(detect_vertical=True,
                                            char_margin=1.0,
                                            line_margin=0.3,
                                            word_margin=0.3)
                device = converter.TextConverter(rsrcmgr, retstr,
                                                 codec="utf-8",
                                                 laparams=la_params)
                interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
                for page in pdfpage.PDFPage.get_pages(pdf_object, set()):
                    try:
                        interpreter.process_page(page)
                    except struct.error:
                        file_object.flags.append(
                            f"{self.scanner_name}::text_struct_error")
                pdf_object_text = retstr.getvalue()
                child_filename = f"{self.scanner_name}::text"
                child_fo = objects.StrelkaFile(
                    data=pdf_object_text,
                    filename=child_filename,
                    depth=file_object.depth + 1,
                    parent_uid=file_object.uid,
                    root_uid=file_object.root_uid,
                    parent_hash=file_object.hash,
                    root_hash=file_object.root_hash,
                    source=self.scanner_name)
                self.children.append(child_fo)
                file_object.flags.append(
                    f"{self.scanner_name}::extracted_text")
                device.close()
                retstr.close()
    except IndexError:
        file_object.flags.append(f"{self.scanner_name}::index_error")
    except pdfdocument.PDFEncryptionError:
        file_object.flags.append(f"{self.scanner_name}::encrypted_pdf")
    except pdfparser.PDFSyntaxError:
        file_object.flags.append(f"{self.scanner_name}::pdf_syntax_error")
    except psparser.PSEOF:
        file_object.flags.append(f"{self.scanner_name}::ps_eof")
    except psparser.PSSyntaxError:
        file_object.flags.append(f"{self.scanner_name}::ps_syntax_error")
def scan(self, file_object, options):
    """Collects RPM header metadata and extracts the embedded archive."""
    tempfile_directory = options.get("tempfile_directory", "/tmp/")
    with tempfile.NamedTemporaryFile(dir=tempfile_directory) as strelka_file:
        strelka_filename = strelka_file.name
        strelka_file.write(file_object.data)
        strelka_file.flush()
        try:
            with rpmfile.open(strelka_filename) as rpm_file:
                # Everything past the header offset is the embedded archive.
                child_file = file_object.data[rpm_file.data_offset:]
                child_filename = f"{self.scanner_name}::size_{len(child_file)}"
                # Headers copied straight into metadata under camelCase keys.
                simple_headers = {
                    "arch": "architecture",
                    "archive_compression": "archiveCompression",
                    "archive_format": "archiveFormat",
                    "authors": "authors",
                    "buildhost": "buildHost",
                    "copyright": "copyright",
                    "filenames": "filenames",
                    "group": "group",
                    "os": "os",
                    "packager": "packager",
                    "provides": "provides",
                    "release": "release",
                    "requirename": "requireName",
                    "rpmversion": "rpmVersion",
                    "serial": "serial",
                    "sourcerpm": "sourceRpm",
                    "summary": "summary",
                    "vendor": "vendor",
                    "version": "version",
                    "url": "url",
                }
                for (key, value) in rpm_file.headers.items():
                    if key == "buildtime":
                        self.metadata[
                            "buildTime"] = datetime.utcfromtimestamp(
                                value).isoformat(timespec="seconds")
                    elif key == "description":
                        self.metadata["description"] = value.replace(
                            b"\n", b" ")
                    elif key == "name":
                        self.metadata["name"] = value
                        # Prefer the package name for the child filename.
                        child_filename = f"{self.scanner_name}::{value.decode()}"
                    elif key in simple_headers:
                        self.metadata[simple_headers[key]] = value
                child_fo = objects.StrelkaFile(
                    data=child_file,
                    filename=child_filename,
                    depth=file_object.depth + 1,
                    parent_uid=file_object.uid,
                    root_uid=file_object.root_uid,
                    parent_hash=file_object.hash,
                    root_hash=file_object.root_hash,
                    source=self.scanner_name)
                self.children.append(child_fo)
        except ValueError:
            file_object.flags.append(f"{self.scanner_name}::value_error")
def scan(self, file_object, options):
    """Extracts files from ZIP archives, trying stored passwords against
    encrypted members.

    Fix: the password file is now opened read-only ("r") instead of
    "r+", which required write permission and failed on read-only
    mounts even though the file is never written.

    Options:
        limit: maximum number of members to extract (default 1000).
        password_file: newline-delimited password list
            (default etc/strelka/passwords.txt).
    """
    file_limit = options.get("limit", 1000)
    password_file = options.get("password_file",
                                "etc/strelka/passwords.txt")
    self.metadata["total"] = {"files": 0, "extracted": 0}
    try:
        # Load the password list once per scanner instance.
        if not self.rainbow_table:
            if os.path.isfile(password_file):
                with open(password_file, "r") as f:
                    for line in f:
                        self.rainbow_table.append(
                            bytes(line.strip(), "utf-8"))
    except IOError:
        file_object.flags.append(f"{self.scanner_name}::file_read_error")
    with io.BytesIO(file_object.data) as zip_object:
        try:
            with zipfile.ZipFile(zip_object) as zip_file_:
                name_list = zip_file_.namelist()
                self.metadata["total"]["files"] = len(name_list)
                for name in name_list:
                    if not name.endswith("/"):
                        if self.metadata["total"]["extracted"] >= file_limit:
                            break
                        try:
                            child_file = None
                            zinfo = zip_file_.getinfo(name)
                            # Bit 0 of the general-purpose flags marks an
                            # encrypted member.
                            if zinfo.flag_bits & 0x1 and self.rainbow_table:
                                for pwd in self.rainbow_table:
                                    try:
                                        child_file = zip_file_.read(name, pwd)
                                        if child_file is not None:
                                            file_object.flags.append(
                                                f"{self.scanner_name}::encrypted_archive_file")
                                            break
                                    except RuntimeError:
                                        # Wrong password; try the next one.
                                        pass
                            elif zinfo.flag_bits & 0x1 and not self.rainbow_table:
                                # Encrypted with no passwords available;
                                # nothing more can be extracted.
                                file_object.flags.append(
                                    f"{self.scanner_name}::no_archive_passwords")
                                return
                            else:
                                child_file = zip_file_.read(name)
                            if child_file is not None:
                                child_fo = objects.StrelkaFile(
                                    data=child_file,
                                    filename=f"{self.scanner_name}::{name}",
                                    depth=file_object.depth + 1,
                                    parent_uid=file_object.uid,
                                    root_uid=file_object.root_uid,
                                    parent_hash=file_object.hash,
                                    root_hash=file_object.root_hash,
                                    source=self.scanner_name)
                                self.children.append(child_fo)
                                self.metadata["total"]["extracted"] += 1
                        except NotImplementedError:
                            file_object.flags.append(
                                f"{self.scanner_name}::unsupported_compression")
                        except RuntimeError:
                            file_object.flags.append(
                                f"{self.scanner_name}::runtime_error")
                        except ValueError:
                            file_object.flags.append(
                                f"{self.scanner_name}::value_error")
                        except zlib.error:
                            file_object.flags.append(
                                f"{self.scanner_name}::zlib_error")
        except zipfile.BadZipFile:
            file_object.flags.append(f"{self.scanner_name}::bad_zip_file")
def scan(self, file_object, options):
    """Collects metadata from HTML files and extracts inline scripts as children.

    Options:
        parser: BeautifulSoup parser backend to use (default: "html.parser").
    """
    parser = options.get("parser", "html.parser")
    self.metadata["total"] = {
        "scripts": 0,
        "forms": 0,
        "inputs": 0,
        "frames": 0,
        "extracted": 0
    }

    def _tag_entry(tag, attr_names):
        """Returns a dict of the tag's non-None attributes, keyed by attribute name."""
        entry = {}
        for attr_name in attr_names:
            value = tag.get(attr_name)
            if value is not None:
                entry[attr_name] = value
        return entry

    def _append_unique(key, entry):
        """Appends entry to self.metadata[key] if it is non-empty and unseen."""
        if entry and entry not in self.metadata[key]:
            self.metadata[key].append(entry)

    try:
        soup = bs4.BeautifulSoup(file_object.data, parser)

        if soup.title:
            self.metadata["title"] = objects.normalize_whitespace(soup.title.text)

        # Hyperlinks: <a href> plus <img src>, deduplicated in document order.
        hyperlinks = []
        hyperlinks.extend(soup.find_all("a", href=True))
        hyperlinks.extend(soup.find_all("img", src=True))
        self.metadata.setdefault("hyperlinks", [])
        for hyperlink in hyperlinks:
            link = hyperlink.get("href") or hyperlink.get("src")
            if link is not None and link not in self.metadata["hyperlinks"]:
                self.metadata["hyperlinks"].append(link)

        forms = soup.find_all("form")
        self.metadata["total"]["forms"] = len(forms)
        self.metadata.setdefault("forms", [])
        for form in forms:
            _append_unique("forms", _tag_entry(form, ("action", "method")))

        # Frames: both legacy <frame> and <iframe> tags.
        frames = []
        frames.extend(soup.find_all("frame"))
        frames.extend(soup.find_all("iframe"))
        self.metadata["total"]["frames"] = len(frames)
        self.metadata.setdefault("frames", [])
        for frame in frames:
            _append_unique(
                "frames",
                _tag_entry(frame, ("src", "name", "height", "width",
                                   "border", "id", "style")))

        inputs = soup.find_all("input")
        self.metadata["total"]["inputs"] = len(inputs)
        self.metadata.setdefault("inputs", [])
        # NOTE: loop variable renamed from "input", which shadowed the builtin.
        for input_tag in inputs:
            _append_unique("inputs", _tag_entry(input_tag, ("type", "name", "value")))

        scripts = soup.find_all("script")
        self.metadata["total"]["scripts"] = len(scripts)
        self.metadata.setdefault("scripts", [])
        for (index, script) in enumerate(scripts):
            script_flavors = []
            script_entry = {}
            script_src = script.get("src")
            if script_src is not None:
                script_entry["src"] = script_src
            script_language = script.get("language")
            if script_language is not None:
                script_entry["language"] = script_language
                script_flavors.append(script_language.lower())
            script_type = script.get("type")
            if script_type is not None:
                script_entry["type"] = script_type
                script_flavors.append(script_type.lower())
            _append_unique("scripts", script_entry)
            if script.text:
                # Inline script content becomes a child file so downstream
                # scanners can analyze it; flavors hint at its language/type.
                child_filename = f"{self.scanner_name}::script_{index}"
                child_fo = objects.StrelkaFile(
                    data=script.text,
                    filename=child_filename,
                    depth=file_object.depth + 1,
                    parent_uid=file_object.uid,
                    root_uid=file_object.root_uid,
                    parent_hash=file_object.hash,
                    root_hash=file_object.root_hash,
                    source=self.scanner_name,
                    external_flavors=script_flavors)
                self.children.append(child_fo)
                self.metadata["total"]["extracted"] += 1

        spans = soup.find_all("span")
        self.metadata["total"]["spans"] = len(spans)
        self.metadata.setdefault("spans", [])
        for span in spans:
            _append_unique("spans", _tag_entry(span, ("class", "style")))

    except TypeError:
        file_object.flags.append(f"{self.scanner_name}::type_error")
def scan(self, file_object, options):
    """Collects metadata from PE (Portable Executable) files.

    Parses headers, import/export tables, resources, debug directories,
    sections, and the Authenticode signature (extracted as a child file).
    """
    self.metadata["total"] = {"sections": 0}
    try:
        pe = pefile.PE(data=file_object.data)
        pe_dictionary = pe.dump_dict()

        self.metadata["total"]["sections"] = pe.FILE_HEADER.NumberOfSections
        self.metadata["warnings"] = pe.get_warnings()
        self.metadata["timestamp"] = datetime.utcfromtimestamp(
            pe.FILE_HEADER.TimeDateStamp).isoformat(timespec="seconds")
        machine = pe.FILE_HEADER.Machine
        self.metadata["machine"] = {
            "id": machine,
            "type": pefile.MACHINE_TYPE.get(machine)
        }
        # Reference: http://msdn.microsoft.com/en-us/library/windows/desktop/ms680339%28v=vs.85%29.aspx
        self.metadata["imageMagic"] = IMAGE_MAGIC_LOOKUP.get(
            pe.OPTIONAL_HEADER.Magic, "Unknown")
        subsystem = pe.OPTIONAL_HEADER.Subsystem
        self.metadata["subsystem"] = pefile.SUBSYSTEM_TYPE.get(subsystem)
        self.metadata["stackReserveSize"] = pe.OPTIONAL_HEADER.SizeOfStackReserve
        self.metadata["stackCommitSize"] = pe.OPTIONAL_HEADER.SizeOfStackCommit
        self.metadata["heapReserveSize"] = pe.OPTIONAL_HEADER.SizeOfHeapReserve
        self.metadata["heapCommitSize"] = pe.OPTIONAL_HEADER.SizeOfHeapCommit
        # FIX: the original assigned entryPoint three identical times.
        self.metadata["entryPoint"] = pe.OPTIONAL_HEADER.AddressOfEntryPoint
        self.metadata["imageBase"] = pe.OPTIONAL_HEADER.ImageBase

        image_characteristics = pe_dictionary.get("Flags")
        if image_characteristics is not None:
            self.metadata["imageCharacteristics"] = image_characteristics
        dll_characteristics = pe_dictionary.get("DllCharacteristics")
        if dll_characteristics is not None:
            self.metadata["dllCharacteristics"] = dll_characteristics

        try:
            self.metadata["imphash"] = pe.get_imphash()
        except AttributeError:
            file_object.flags.append(f"{self.scanner_name}::no_imphash")

        # Exported symbol names (bytes), deduplicated.
        self.metadata.setdefault("exportFunctions", [])
        export_symbols = pe_dictionary.get("Exported symbols")
        if export_symbols is not None:
            for symbols in export_symbols:
                name = symbols.get("Name")
                if (name is not None and isinstance(name, bytes)
                        and name not in self.metadata["exportFunctions"]):
                    self.metadata["exportFunctions"].append(name)

        # Imported DLLs and their functions (ordinals resolved to names
        # via pefile's ordinal lookup tables where possible).
        import_cache = {}
        self.metadata.setdefault("imports", [])
        import_symbols = pe_dictionary.get("Imported symbols")
        if import_symbols is not None:
            for symbol in import_symbols:
                for import_ in symbol:
                    dll = import_.get("DLL")
                    if dll is not None:
                        if dll not in self.metadata["imports"]:
                            self.metadata["imports"].append(dll)
                        import_cache.setdefault(dll, [])
                        ordinal = import_.get("Ordinal")
                        if ordinal is not None:
                            ordinal = pefile.ordlookup.ordLookup(
                                dll.lower(), ordinal, make_name=True)
                            import_cache[dll].append(ordinal)
                        name = import_.get("Name")
                        if name is not None:
                            import_cache[dll].append(name)

        self.metadata.setdefault("importFunctions", [])
        for (import_, functions) in import_cache.items():
            import_entry = {"import": import_, "functions": functions}
            if import_entry not in self.metadata["importFunctions"]:
                self.metadata["importFunctions"].append(import_entry)

        # Resource directory entries with hashes of the raw resource data.
        self.metadata.setdefault("resources", [])
        try:
            for resource in pe.DIRECTORY_ENTRY_RESOURCE.entries:
                res_type = pefile.RESOURCE_TYPE.get(resource.id, "Unknown")
                for entry in resource.directory.entries:
                    for e_entry in entry.directory.entries:
                        sublang = pefile.get_sublang_name_for_lang(
                            e_entry.data.lang,
                            e_entry.data.sublang,
                        )
                        offset = e_entry.data.struct.OffsetToData
                        size = e_entry.data.struct.Size
                        r_data = pe.get_data(offset, size)
                        language = pefile.LANG.get(e_entry.data.lang, "Unknown")
                        data = {
                            "type": res_type,
                            "id": e_entry.id,
                            "name": e_entry.data.struct.name,
                            "offset": offset,
                            "size": size,
                            "sha256": hashlib.sha256(r_data).hexdigest(),
                            "sha1": hashlib.sha1(r_data).hexdigest(),
                            "md5": hashlib.md5(r_data).hexdigest(),
                            "language": language,
                            "subLanguage": sublang,
                        }
                        if data not in self.metadata["resources"]:
                            self.metadata["resources"].append(data)
        except AttributeError:
            file_object.flags.append(f"{self.scanner_name}::no_resources")

        # Debug directory: parse CodeView records (RSDS / NB10) for PDB info.
        if hasattr(pe, "DIRECTORY_ENTRY_DEBUG"):
            for e in pe.DIRECTORY_ENTRY_DEBUG:
                raw_data = pe.get_data(e.struct.AddressOfRawData,
                                       e.struct.SizeOfData)
                # FIX: a fresh dict per entry — the original created one dict
                # before the loop, so an RSDS and an NB10 record would mutate
                # and share (merge keys into) the same object under both the
                # "rsds" and "nb10" metadata keys.
                debug = {}
                if raw_data.find(b"RSDS") != -1 and len(raw_data) > 24:
                    pdb = raw_data[raw_data.find(b"RSDS"):]
                    debug["guid"] = b"%s-%s-%s-%s" % (
                        binascii.hexlify(pdb[4:8]),
                        binascii.hexlify(pdb[8:10]),
                        binascii.hexlify(pdb[10:12]),
                        binascii.hexlify(pdb[12:20]))
                    debug["age"] = struct.unpack("<L", pdb[20:24])[0]
                    debug["pdb"] = pdb[24:].rstrip(b"\x00")
                    self.metadata["rsds"] = debug
                elif raw_data.find(b"NB10") != -1 and len(raw_data) > 16:
                    pdb = raw_data[raw_data.find(b"NB10") + 8:]
                    debug["created"] = struct.unpack("<L", pdb[0:4])[0]
                    debug["age"] = struct.unpack("<L", pdb[4:8])[0]
                    debug["pdb"] = pdb[8:].rstrip(b"\x00")
                    self.metadata["nb10"] = debug

        # Section names (with literal "\x00" padding stripped), flags, structures.
        self.metadata.setdefault("sections", [])
        sections = pe_dictionary.get("PE Sections")
        if sections is not None:
            for section in sections:
                section_name = section.get("Name", {}).get("Value", "").replace(
                    "\\x00", "")
                section_entry = {
                    "name": section_name,
                    "flags": section.get("Flags", []),
                    "structure": section.get("Structure", "")
                }
                if section_entry not in self.metadata["sections"]:
                    self.metadata["sections"].append(section_entry)

        # Authenticode digital signature, extracted as a child file.
        security = pe.OPTIONAL_HEADER.DATA_DIRECTORY[
            pefile.DIRECTORY_ENTRY["IMAGE_DIRECTORY_ENTRY_SECURITY"]]
        digital_signature_virtual_address = security.VirtualAddress
        if security.Size > 0:
            # Skip the 8-byte WIN_CERTIFICATE header to reach the raw signature.
            signature_data = pe.write()[digital_signature_virtual_address + 8:]
            if len(signature_data) > 0:
                file_object.flags.append(f"{self.scanner_name}::signed")
                child_filename = f"{self.scanner_name}::digital_signature"
                child_fo = objects.StrelkaFile(
                    data=signature_data,
                    filename=child_filename,
                    depth=file_object.depth + 1,
                    parent_uid=file_object.uid,
                    root_uid=file_object.root_uid,
                    parent_hash=file_object.hash,
                    root_hash=file_object.root_hash,
                    source=self.scanner_name)
                self.children.append(child_fo)
            else:
                file_object.flags.append(
                    f"{self.scanner_name}::empty_signature")

    except IndexError:
        file_object.flags.append(f"{self.scanner_name}::pe_index_error")
    except pefile.PEFormatError:
        file_object.flags.append(f"{self.scanner_name}::pe_format_error")