def fileinfo(self, path: str) -> Dict:
    """Identify the file at *path* and return its digest/type information.

    Args:
        path: Filesystem path of the file to identify.

    Returns:
        Dict of digests plus a refined 'type' and an 'ssdeep' hash.
    """
    path = safe_str(path)
    data = get_digests_for_file(path, on_first_block=self.ident)
    data["ssdeep"] = ssdeep.hash_from_file(path)

    # Check if file empty
    if not int(data.get("size", -1)):
        data["type"] = "empty"

    # Further identify zip files based on their content
    elif data["type"] in ["archive/zip", "java/jar", "document/office/unknown"]:
        data["type"] = zip_ident(path, data["type"])

    # Further check CaRT files, they may have an explicit type set
    elif data["type"] == "archive/cart":
        data["type"] = cart_ident(path)

    # Further identify dos executables as this may be a PE that has been misidentified
    elif data["type"] == "executable/windows/dos":
        data["type"] = dos_ident(path)

    # If we have so far failed to identify the file, let's run the yara rules
    elif "unknown" in data["type"] or data["type"] == "text/plain":
        data["type"] = self.yara_ident(path, data, fallback=data["type"])

    # Extra checks for office documents
    # - Check for encryption
    if data["type"] in [
        "document/office/word",
        "document/office/excel",
        "document/office/powerpoint",
        "document/office/unknown",
    ]:
        try:
            # BUG FIX: the original passed an open file object that was never
            # closed; use a context manager so the handle is always released.
            with open(path, "rb") as office_fh:
                msoffcrypto_obj = msoffcrypto.OfficeFile(office_fh)
                if msoffcrypto_obj and msoffcrypto_obj.is_encrypted():
                    data["type"] = "document/office/passwordprotected"
        except Exception:
            # If msoffcrypto can't handle the file to confirm that it is/isn't password protected,
            # then it's not meant to be. Moving on!
            pass

    # Extra checks for PDF documents
    # - Check for encryption
    # - Check for PDF collection (portfolio)
    if data["type"] == "document/pdf":
        # BUG FIX: read via a context manager (original leaked the handle).
        with open(path, "rb") as pdf_fh:
            pdf_content = pdf_fh.read()
        # Password protected documents typically contain '/Encrypt'
        if re.search(b"/Encrypt", pdf_content):
            data["type"] = "document/pdf/passwordprotected"
        # Portfolios typically contain '/Type/Catalog/Collection'
        elif re.search(b"/Type/Catalog/Collection", pdf_content):
            data["type"] = "document/pdf/portfolio"

    return data
def fileinfo(path: str) -> Dict:
    """Build the digest and type-identification record for the file at *path*.

    Args:
        path: Filesystem path of the file to identify.

    Returns:
        Dict of digests plus refined 'type' and 'ssdeep' entries.
    """
    path = safe_str(path)
    data = get_digests_for_file(path, on_first_block=ident)

    # Magic only saw the first block. These mime values mean the input is
    # almost certainly an office file, but nothing more can be said from the
    # first block alone — re-identify using the complete file contents.
    mime = data['mime']
    if mime is not None and mime.lower() in ('application/cdfv2-corrupt', 'application/cdfv2-unknown'):
        with open(path, 'rb') as handle:
            whole_file = handle.read()
        data.update(ident(whole_file, len(whole_file)))

    data['ssdeep'] = ssdeep_from_file(path) if ssdeep_from_file else ''

    # A type taken from cart metadata is trusted, so it skips the final
    # recognition check below.
    trusted_cart_type = False

    initial_type = data['type']
    if not int(data.get('size', -1)):
        data['type'] = 'empty'
    elif initial_type in ('archive/zip', 'java/jar'):
        # Run zip_ident for jars too: magic has a high false-positive rate,
        # matching eclipse and other java-related files as jars.
        data['type'] = zip_ident(path)
    elif initial_type == 'document/office/unknown':
        # Try identifying unknown documents by unzipping them, but don't
        # commit to zip when extraction fails.
        data['type'] = zip_ident(path, initial_type)
    elif initial_type == 'unknown':
        data['type'], _ = guess_language(path)
    elif initial_type == 'archive/cart':
        data['type'] = cart_ident(path)
        trusted_cart_type = True
    elif initial_type == 'executable/windows/dos':
        # The default magic file misidentifies PE files with a munged DOS header
        data['type'] = dos_ident(path)
    elif initial_type == 'code/html':
        # Magic detects .hta files as .html while guess_language sees .js/.vbs;
        # when both conditions hold, the file is fairly called an .hta.
        detected_lang, _ = guess_language(path)
        if detected_lang in ("code/javascript", "code/vbs"):
            data['type'] = 'code/hta'

    if not recognized.get(data['type'], False) and not trusted_cart_type:
        data['type'] = 'unknown'
    return data
def presubmit_local_files(self, file_paths, **kw):
    """Run the presubmit step for a batch of local files.

    Args:
        file_paths: Iterable of local filesystem paths to presubmit.
        **kw: Options; 'ignore_size' (bool) skips the max-size check.

    Returns:
        Dict keyed by request id (stringified index) mapping to per-file
        result dicts; each result gains a 'path' entry pointing back at
        its source file.
    """
    default_error = {'succeeded': False, 'error': 'Unknown Error'}
    presubmit_requests = {}
    presubmit_results = {}
    ignore_size = kw.get('ignore_size', False)
    max_size = config.submissions.max.size

    # Prepare the batch presubmit.
    rid_map = {}
    for rid, local_path in enumerate(file_paths):
        rid = str(rid)
        rid_map[rid] = local_path
        try:
            assert_valid_file(local_path)
            d = digests.get_digests_for_file(local_path, calculate_entropy=False)
            if d['size'] > max_size and not ignore_size:
                presubmit_results[rid] = {
                    'succeeded': False,
                    'error': 'file too large (%d > %d). Skipping' % (d['size'], max_size),
                }
                continue
            presubmit_requests[rid] = d
            # Set a default error. Overwritten on success.
            presubmit_results[rid] = default_error.copy()
        except Exception as ex:  # pylint: disable=W0703
            log.error("Exception processing local file: %s. Skipping", ex)
            presubmit_results[rid] = {
                'succeeded': False,
                'error': 'local failure before presubmit: {0}'.format(ex),
            }
            continue

    if self.is_unix:
        presubmit_results = self._presubmit_local_files_unix(presubmit_requests, presubmit_results)
    else:
        presubmit_results = self._presubmit_local_files_windows(presubmit_requests, presubmit_results)

    if len(presubmit_results) != len(file_paths):
        log.error('Problem submitting %s: %s',
                  pprint.pformat(file_paths), pprint.pformat(presubmit_results))

    # BUG FIX: dict.iteritems() does not exist in Python 3 and raised
    # AttributeError here; use items() instead.
    for rid, result in presubmit_results.items():
        result['path'] = rid_map[rid]

    return presubmit_results
def add_extracted(
        self,
        path: str,
        name: str,
        description: str,
        classification: Optional[Classification] = None,
        safelist_interface: Optional[Union[ServiceAPI, PrivilegedServiceAPI]] = None
) -> bool:
    """Attach an extracted file to this result.

    Args:
        path: Local path of the extracted file.
        name: Display name for the file.
        description: Why/how the file was extracted.
        classification: Optional classification to apply to the file.
        safelist_interface: Optional API used to drop known-safe files.

    Returns:
        True when the file was added; False when it was safelisted or
        rejected by _add_file.

    Raises:
        MaxExtractedExceeded: When the extraction cap has been reached.
        ValueError: When path, name or description is empty.
    """
    # Service-based safelisting of files has to be configured at the global
    # configuration, and is bypassed during deep scans or when filtering is
    # ignored. The administrator chooses which hash types get looked up.
    if safelist_interface and self.safelist_config.enabled and not (
            self.deep_scan or self.ignore_filtering):
        # Drop files that are known to the system to be safe.
        file_digests = get_digests_for_file(path)
        for algorithm in self.safelist_config.hash_types:
            digest = file_digests[algorithm]
            lookup = safelist_interface.lookup_safelist(digest)
            self.log.debug(f'Checking system safelist for {algorithm}: {digest}')
            if lookup and lookup['enabled'] and lookup['type'] == 'file':
                self.log.info(f'Dropping safelisted, extracted file.. ({algorithm}: {digest})')
                return False

    # Enforce the configured cap on the number of extracted files.
    if self.max_extracted and len(self.extracted) >= int(self.max_extracted):
        raise MaxExtractedExceeded

    # Reject incomplete requests up front.
    for value, label in ((path, "Path"), (name, "Name"), (description, "Description")):
        if not value:
            raise ValueError(f"{label} cannot be empty")

    record = self._add_file(path, name, description, classification)
    if not record:
        return False

    self.extracted.append(record)
    return True
def fileinfo(path):
    """Identify the file at *path* and return its digest/tag information.

    Args:
        path: Filesystem path of the file to identify.

    Returns:
        Dict of digests plus refined 'tag' and 'ssdeep' entries.
    """
    path = safe_str(path)
    data = get_digests_for_file(path, on_first_block=ident)

    # Magic only saw the first block; a corrupt-CDFV2 mime means this is very
    # likely an office file, so re-identify using the whole file contents.
    # BUG FIX: 'mime' may be None (guard before .lower()), and the file must
    # be read in binary mode — text mode would hand ident() a str and raise
    # UnicodeDecodeError on real binary content.
    if data['mime'] is not None and data['mime'].lower() == 'application/cdfv2-corrupt':
        with open(path, 'rb') as fh:
            buf = fh.read()
        data.update(ident(buf, len(buf)))

    data['ssdeep'] = ssdeep_from_file(path) if ssdeep_from_file else ''

    if not int(data.get('size', -1)):
        data['tag'] = 'empty'
    elif data['tag'] == 'archive/zip' or data['tag'] == 'java/jar':
        data['tag'] = zip_ident(path)
    elif data['tag'] == 'unknown':
        data['tag'], _ = guess_language(path)
    elif data['tag'] == 'archive/cart':
        data['tag'] = cart_ident(path)
    elif data['tag'] == 'executable/windows/dos':
        # The default magic file misidentifies PE files with a munged DOS header
        data['tag'] = dos_ident(path)

    return data