def is_whitelisted(self, task: IngestTask):
    """Check whether the task's first file is whitelisted and drop it if so.

    The verdict from ``self.get_whitelist_verdict`` is tried first; when it
    yields nothing, the ``self.whitelisted`` cache is consulted by sha256.
    On a hit the task is marked as failed, ``self._notify_drop`` is called
    and the ``'whitelisted'`` counter is incremented.

    Returns the whitelisting reason, or a falsy value when there is none.
    """
    verdict, matches = self.get_whitelist_verdict(self.whitelist, task)
    details = {key: dotdump(safe_str(value)) for key, value in matches.items()}
    file_hash = task.submission.files[0].sha256

    from_cache = False
    if not verdict:
        # No fresh verdict: fall back to a previously recorded one.
        with self.whitelisted_lock:
            verdict = self.whitelisted.get(file_hash, None)
        if verdict:
            details = 'cached'
            from_cache = True

    if not verdict:
        return verdict

    if not from_cache:
        # Remember fresh verdicts so later submissions of the same file hit the cache.
        with self.whitelisted_lock:
            self.whitelisted[file_hash] = verdict

    task.failure = "Whitelisting due to reason %s (%s)" % (dotdump(
        safe_str(verdict)), details)
    self._notify_drop(task)
    self.counter.increment('whitelisted')
    return verdict
def ident(buf, length: int, path) -> Dict:
    """Identify a file from its leading bytes and the libmagic output for its path.

    Args:
        buf: Leading bytes of the file (bytes-like; searched and sliced below).
        length: Number of valid bytes in ``buf``; values <= 0 short-circuit.
        path: Path handed to ``magic.magic_file``.

    Returns:
        A dict with the keys ``ascii``, ``hex``, ``magic``, ``mime`` and
        ``type``. ``type`` is ``'unknown'`` unless one of the matching stages
        below produced a value that is present in ``recognized``.

    Errors raised while querying libmagic or matching labels are printed and
    swallowed, leaving whatever was identified up to that point.
    """
    data = {
        'ascii': None,
        'hex': None,
        'magic': None,
        'mime': None,
        'type': 'unknown',
    }

    if length <= 0:
        return data

    header = buf[:min(64, length)]
    data['ascii'] = dotdump(header)
    data['hex'] = safe_str(hexlify(header))

    # noinspection PyBroadException
    try:
        # Collect the labels returned by libmagic, stripping the "- " prefix
        # that marks continuation entries.
        labels = []
        if file_type:
            with magic_lock:
                labels = magic.magic_file(file_type, path).split(b'\n')
                labels = [
                    label[2:] if label.startswith(b'- ') else label
                    for label in labels
                ]

        mimes = []
        if mime_type:
            with magic_lock:
                mimes = magic.magic_file(mime_type, path).split(b'\n')
                mimes = [
                    mime[2:] if mime.startswith(b'- ') else mime
                    for mime in mimes
                ]

        # For user feedback set the mime and magic meta data to always be the
        # primary libmagic responses
        if labels:
            data['magic'] = safe_str(labels[0])

        if mimes and mimes[0] != b'':
            data['mime'] = safe_str(mimes[0])

        # Highest priority is given to labels matching a custom rule
        tagged = False
        for label in labels:
            label = dotdump(label)
            if custom.match(label):
                data['type'] = label.split('custom: ')[1].strip()
                tagged = True
                break

        # Second priority is mime types marked as trusted
        if not tagged:
            for mime in mimes:
                mime = dotdump(mime)
                if mime in trusted_mimes:
                    data['type'] = trusted_mimes[mime]
                    tagged = True
                    break

        # As a third priority try matching the tl_patterns
        if not tagged:
            minimum = len(tl_patterns)
            sl_tag = None

            # Try each label and see how far down the tl_patterns list we go
            # before we hit a match; the closer to the beginning of the list
            # the better the tag match is. The final line of tl_patterns
            # matches anything and sets the tag to 'unknown', so this loop
            # should never finish with sl_tag as None unless the tl_patterns
            # table has been changed inappropriately.
            for label in labels:
                label = dotdump(label)

                # ... match against our patterns and, ...
                index = 0
                for entry in tl_patterns:
                    if index >= minimum:
                        break
                    if entry[1].search(label):  # pylint:disable=E1101
                        break
                    index += 1

                # ... keep highest precedence (lowest index) match.
                if index < minimum:
                    minimum = index
                    sl_tag = subtype(label)
                    # If a label does match, take the best from that label.
                    # Further labels from magic are probably terrible.
                    break

            assert sl_tag is not None, "tl_patterns seems to be missing a match all => unknown rule at the end"

            # Based on the sub tag we found, figure out the top level tag to use
            tl_tag = sl_to_tl.get(sl_tag, tl_patterns[minimum][0])
            data['type'] = '/'.join((tl_tag, sl_tag))
    except Exception as e:
        # Best effort: report the problem and keep whatever was identified.
        print(str(e))

    if not recognized.get(data['type'], False):
        data['type'] = 'unknown'

    if data['type'] == 'document/office/unknown':
        # noinspection PyBroadException
        try:
            root_entry_property_offset = buf.find(
                u"Root Entry".encode("utf-16-le"))
            if -1 != root_entry_property_offset:
                # Get root entry's GUID and try to guess document type
                clsid_offset = root_entry_property_offset + 0x50
                if len(buf) >= clsid_offset + 16:
                    clsid = buf[clsid_offset:clsid_offset + 16]
                    # The slice is bytes, so the all-zero check must compare
                    # against bytes; a str operand is never equal in Python 3
                    # and would let a null CLSID through.
                    if len(clsid) == 16 and clsid != b"\0" * len(clsid):
                        clsid_str = uuid.UUID(bytes_le=clsid)
                        clsid_str = clsid_str.urn.rsplit(':', 1)[-1].upper()
                        if clsid_str in OLE_CLSID_GUIDs:
                            data['type'] = OLE_CLSID_GUIDs[clsid_str]
        except Exception:
            # Deliberately best-effort: a malformed header keeps the generic type.
            pass

    return data
# The default magic file misidentifies PE files with a munged DOS header data['type'] = dos_ident(path) elif data['type'] == 'code/html': # Magic detects .hta files as .html, guess_language detects .hta files as .js/.vbs # If both conditions are met, it's fair to say that the file is an .hta lang, _ = guess_language(path) if lang in ["code/javascript", "code/vbs"]: data['type'] = 'code/hta' if not recognized.get(data['type'], False) and not cart_metadata_set: data['type'] = 'unknown' return data if __name__ == '__main__': from pprint import pprint # noinspection PyBroadException if len(sys.argv) > 1: pprint(fileinfo(sys.argv[1])) else: name = sys.stdin.readline().strip() while name: a = fileinfo(name) print('\t'.join( dotdump(str(a[k])) for k in ('type', 'ascii', 'entropy', 'hex', 'magic', 'mime', 'md5', 'sha1', 'sha256', 'ssdeep', 'size'))) name = sys.stdin.readline().strip()
def ident(self, buf, length: int, path) -> Dict:
    """Identify a file from its leading bytes and the libmagic output for its path.

    Returns a dict with the keys ``ascii``, ``hex``, ``magic``, ``mime`` and
    ``type``. ``type`` starts as ``"unknown"`` and is refined, in order, by
    custom labels, trusted mime types, the compiled magic patterns, a
    ``text/*`` mime fallback and finally an OLE CLSID lookup for unknown
    office documents. Errors during the libmagic stage are logged and
    swallowed.
    """
    data = {
        "ascii": None,
        "hex": None,
        "magic": None,
        "mime": None,
        "type": "unknown",
    }

    if length <= 0:
        return data

    head = buf[:min(64, length)]
    data["ascii"] = dotdump(head)
    data["hex"] = safe_str(hexlify(head))

    # noinspection PyBroadException
    try:
        labels = []
        mimes = []

        # Query libmagic for labels and mimes; on a MagicException the
        # exception message is used as the output instead.
        with self.lock:
            try:
                labels = magic.magic_file(self.file_type, path).split(b"\n")
            except magic.MagicException as me:
                labels = me.message.split(b"\n")
            try:
                mimes = magic.magic_file(self.mime_type, path).split(b"\n")
            except magic.MagicException as me:
                mimes = me.message.split(b"\n")

        # Drop the "- " prefix and surrounding whitespace from every entry.
        mimes = [
            (entry[2:] if entry.startswith(b"- ") else entry).strip()
            for entry in mimes
        ]
        labels = [
            (entry[2:] if entry.startswith(b"- ") else entry).strip()
            for entry in labels
        ]

        # For user feedback the magic and mime metadata always reflect the
        # primary libmagic responses.
        if labels:
            # Labels that must be promoted to the front when present; the
            # flag says whether the mime at the same position moves as well.
            promoted_labels = [
                (b"OLE 2 Compound Document : Microsoft Word Document", False),
                (b"Lotus 1-2-3 WorKsheet", True),
            ]
            for marker, move_mime in promoted_labels:
                position = next(
                    (i for i, candidate in enumerate(labels) if marker in candidate),
                    -1,
                )
                if position < 0:
                    continue
                labels.insert(0, labels.pop(position))
                if move_mime and len(labels) == len(mimes):
                    mimes.insert(0, mimes.pop(position))
            data["magic"] = safe_str(labels[0])

        first_mime = next((entry for entry in mimes if entry != b""), None)
        if first_mime is not None:
            data["mime"] = safe_str(first_mime)

        # First priority: custom types
        for raw_label in labels:
            label_text = dotdump(raw_label)
            if self.custom.match(label_text):
                data["type"] = label_text.split("custom: ")[1].strip()
                break

        # Second priority: mime types marked as trusted
        if data["type"] == "unknown":
            with self.lock:
                trusted = self.trusted_mimes

            for raw_mime in mimes:
                mime_text = dotdump(raw_mime)
                if mime_text in trusted:
                    data["type"] = trusted[mime_text]
                    break

        # Third priority: the magic patterns, first label with a match wins
        if data["type"] == "unknown":
            with self.lock:
                patterns = self.compiled_magic_patterns

            for raw_label in labels:
                for entry in patterns:
                    if entry[1].search(dotdump(raw_label)):  # pylint: disable=E1101
                        data['type'] = entry[0]
                        break
                else:
                    # No pattern matched this label; try the next one.
                    continue
                break
    except Exception as e:
        self.log.error(
            f"An error occured during file identification: {e.__class__.__name__}({str(e)})"
        )

    # A text/* mime with an unknown type becomes text/plain to trigger
    # language detection later.
    detected_mime = data["mime"]
    if (data["type"] == "unknown" and detected_mime is not None
            and detected_mime.startswith("text/")):
        data["type"] = "text/plain"

    # Look up office documents by GUID if we're still not sure what they are
    if data["type"] == "document/office/unknown":
        # noinspection PyBroadException
        try:
            root_offset = buf.find(u"Root Entry".encode("utf-16-le"))
            if root_offset != -1:
                # The root entry's CLSID sits 0x50 bytes into the entry.
                clsid_start = root_offset + 0x50
                clsid_end = clsid_start + 16
                if len(buf) >= clsid_end:
                    clsid = buf[clsid_start:clsid_end]
                    if len(clsid) == 16 and clsid != b"\0" * len(clsid):
                        guid = uuid.UUID(bytes_le=clsid).urn.rsplit(":", 1)[-1].upper()
                        if guid in OLE_CLSID_GUIDs:
                            data["type"] = OLE_CLSID_GUIDs[guid]
                    else:
                        # Null CLSID: a "Details" entry near the root marks a
                        # McAfee quarantine file.
                        window = buf[:root_offset + 0x100]
                        if window.find(u"Details".encode("utf-16-le")) != -1:
                            data["type"] = "quarantine/mcafee"
        except Exception:
            pass

    return data
# Command-line driver: identify the file named by the first argument, or,
# without arguments, every path read from stdin until a blank line or EOF.
args = sys.argv[1:]
use_cache = "--no-cache" not in args
if not use_cache:
    # Remove the flag so it is not mistaken for a file path.
    args.remove("--no-cache")

identify = Identify(use_cache=use_cache)

if args:
    pprint(identify.fileinfo(args[0]))
else:
    # Fields printed, tab-separated, for each path read from stdin.
    columns = (
        "type",
        "ascii",
        "entropy",
        "hex",
        "magic",
        "mime",
        "md5",
        "sha1",
        "sha256",
        "ssdeep",
        "size",
    )
    while True:
        name = sys.stdin.readline().strip()
        if not name:
            break
        info = identify.fileinfo(name)
        print("\t".join(dotdump(str(info[column])) for column in columns))
def test_dotdump():
    """dotdump renders a list of integer codes as a nine-character string with dots for some entries."""
    codes = [1, 8, 22, 33, 66, 99, 126, 127, 1000]
    rendered = str_utils.dotdump(codes)
    assert rendered == "...!Bc~.."