def decode_file(original_path, fileinfo, identify):
    extracted_path = None
    hdr = {}
    with open(original_path, 'rb') as original_file:
        if is_cart(original_file.read(256)):
            original_file.seek(0)

            _, hdr, _ = _unpack_header(original_file)
            al_type = flatten(hdr).get('al.type', None)
            if not al_type:
                original_file.seek(0)

                extracted_fd, extracted_path = tempfile.mkstemp()
                extracted_file = os.fdopen(extracted_fd, 'wb')

                cart_extracted = False
                try:
                    hdr, _ = unpack_stream(original_file, extracted_file)
                    cart_extracted = True
                except Exception:
                    extracted_path = None
                    hdr = {}
                    fileinfo['type'] = 'corrupted/cart'
                finally:
                    extracted_file.close()

                if cart_extracted and extracted_path:
                    fileinfo = identify.fileinfo(extracted_path)

    return extracted_path, fileinfo, hdr
def parse_link(self, parent_res, path):
    with open(path, "rb") as fh:
        metadata = decode_lnk(fh.read())

    if metadata is None:
        return False

    body_output = {build_key(k): v for k, v in flatten(metadata).items() if v}
    res = ResultSection("Metadata extracted by parse_lnk",
                        body_format=BODY_FORMAT.KEY_VALUE,
                        body=json.dumps(body_output),
                        parent=parent_res)

    bp = metadata.get("BasePath", "").strip()
    rp = metadata.get("RELATIVE_PATH", "").strip()
    nn = metadata.get("NetName", "").strip()
    cla = metadata.get("COMMAND_LINE_ARGUMENTS", "").strip()

    s = BAD_LINK_RE.search(cla.lower())
    if s:
        res.set_heuristic(1)
    res.add_tag(tag_type="file.name.extracted", value=(bp or rp or nn).rsplit("\\")[-1])
    res.add_tag(tag_type="dynamic.process.command_line", value=f"{(rp or bp or nn)} {cla}".strip())

    for k, v in body_output.items():
        tag_type = TAG_MAP.get("LNK", {}).get(k, None) or \
            TAG_MAP.get(None, {}).get(k, None)
        if tag_type:
            res.add_tag(tag_type, v)

    return True
def _create_random_section(self):
    # choose a random body format
    body_format = random.choice(FORMAT_LIST)

    # create a section with a random title
    section = ResultSection(get_random_phrase(3, 7), body_format=body_format)

    # choose random amount of lines in the body
    for _ in range(1, 5):
        # generate random line
        section.add_line(get_random_phrase(5, 10))

    # choose random amount of tags
    tags = flatten(get_random_tags())
    for key, val in tags.items():
        for v in val:
            section.add_tag(key, v)

    # set a heuristic a third of the time
    if random.choice([False, False, True]):
        section.set_heuristic(random.randint(1, 4))

    # Create random sub-sections
    if random.choice([False, False, True]):
        section.add_subsection(self._create_random_section())

    return section
def test_compat_tag_map():
    flatten_map = flatten(tag_map)
    for _ in range(10):
        random_key = random.choice(list(v3_lookup_map.keys()))
        try:
            assert random_key in flatten_map[v3_lookup_map[random_key]]
        except KeyError:
            assert random_key in UNUSED
def fix_section_data(section):
    if section['body_format'] in JSON_SECTIONS and isinstance(section['body'], str):
        # Loading JSON formatted sections
        try:
            section['body'] = json.loads(section['body'])
        except ValueError:
            pass

    # Changing tags to a list
    section['tags'] = tag_dict_to_list(flatten(section['tags']), False)
    section['tags'] += tag_dict_to_list(section.pop('safelisted_tags', {}), True)

    return section
def test_dict_flatten():
    src = {
        "a": {
            "b": {
                "c": 1
            }
        },
        "b": {
            "d": {2}
        }
    }

    flat_src = flatten(src)
    assert src == unflatten(flat_src)
    assert list(flat_src.keys()) == ["a.b.c", "b.d"]
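# Minimal usage sketch of the flatten/unflatten semantics exercised by the test above:
# nested dictionaries collapse into a single level keyed by dot-joined paths, and
# unflatten() rebuilds the nested structure. The import path below is an assumption
# based on how these helpers are used throughout this codebase.
from assemblyline.common.dict_utils import flatten, unflatten  # assumed module path

nested = {"al": {"type": "archive/cart"}, "ts": "2021-01-01"}
assert flatten(nested) == {"al.type": "archive/cart", "ts": "2021-01-01"}
assert unflatten(flatten(nested)) == nested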
def generalize_result(result):
    # At first we were comparing the full result and removing the random/unpredictable information.
    # Now we are only keeping the strict minimum to compare with:
    # supplementary/extracted sha256 + heuristics heur_id + tags
    trimed_result = {}
    if "response" in result:
        trimed_result["response"] = {}
        if "supplementary" in result["response"]:
            trimed_result["response"]["supplementary"] = sorted(
                [x["sha256"] for x in result["response"]["supplementary"]])
        if "extracted" in result["response"]:
            trimed_result["response"]["extracted"] = sorted(
                [{"name": x["name"], "sha256": x["sha256"]} for x in result["response"]["extracted"]],
                key=lambda x: x["sha256"],
            )

    if "result" in result:
        trimed_result["result"] = {}
        if "sections" in result["result"]:
            trimed_result["result"] = {"heuristics": [], "tags": {}}
            for section in result["result"]["sections"]:
                if "heuristic" in section:
                    if section["heuristic"] is not None:
                        if "heur_id" in section["heuristic"]:
                            trimed_result["result"]["heuristics"].append(section["heuristic"]["heur_id"])
                if "tags" in section:
                    if section["tags"]:
                        for k, v in flatten(section["tags"]).items():
                            if k in trimed_result["result"]["tags"]:
                                trimed_result["result"]["tags"][k].extend(v)
                            else:
                                trimed_result["result"]["tags"][k] = v

            # Sort the heur_id and tags lists so they always appear in the same order even if
            # the result sections were moved around.
            trimed_result["result"]["heuristics"] = sorted(trimed_result["result"]["heuristics"])
            for k, v in trimed_result["result"]["tags"].items():
                trimed_result["result"]["tags"][k] = sorted(v)

    return trimed_result
def get_tag_list_from_keys(self, keys):
    if len(keys) == 0:
        return []

    keys = [x for x in list(keys) if not x.endswith(".e")]
    items = self.result.multiget(keys, as_obj=False)

    out = []
    for key, item in items.items():
        for section in item.get('result', {}).get('sections', []):
            for tag_type, tags in flatten(section.get('tags', {})).items():
                if tags is not None:
                    for tag in tags:
                        out.append({
                            'type': tag_type,
                            'short_type': tag_type.rsplit(".", 1)[-1],
                            'value': tag,
                            'key': key
                        })

    return out
def tag_dict_to_list(tag_dict: Dict) -> List[Dict]:
    return [
        {'type': k, 'value': t, 'short_type': k.rsplit(".", 1)[-1]}
        for k, v in flatten(tag_dict).items()
        if v is not None
        for t in v
    ]
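# Illustrative example (hypothetical input) of the conversion performed by tag_dict_to_list:
# a possibly nested tag dictionary is flattened to dotted tag types, then expanded so each
# individual tag value becomes its own {'type', 'value', 'short_type'} entry.
#
# >>> tag_dict_to_list({"network": {"static": {"ip": ["10.0.0.1", "10.0.0.2"]}}})
# [{'type': 'network.static.ip', 'value': '10.0.0.1', 'short_type': 'ip'},
#  {'type': 'network.static.ip', 'value': '10.0.0.2', 'short_type': 'ip'}]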
def get_summary_from_keys(self, keys):
    out = {
        "tags": [],
        "attack_matrix": [],
        "heuristics": {
            "info": [],
            "suspicious": [],
            "malicious": []
        }
    }
    done_map = {"heuristics": set(), "attack": set(), "tags": set()}

    if len(keys) == 0:
        return out

    keys = [x for x in list(keys) if not x.endswith(".e")]
    items = self.result.multiget(keys, as_obj=False)

    for key, item in items.items():
        for section in item.get('result', {}).get('sections', []):
            h_type = "info"

            if section.get('heuristic', False):
                # Get the heuristics data
                if section['heuristic']['score'] < 100:
                    h_type = "info"
                elif section['heuristic']['score'] < 1000:
                    h_type = "suspicious"
                else:
                    h_type = "malicious"

                cache_key = f"{section['heuristic']['heur_id']}_{key}"
                if cache_key not in done_map['heuristics']:
                    out['heuristics'][h_type].append({
                        'heur_id': section['heuristic']['heur_id'],
                        'name': section['heuristic']['name'],
                        'key': key
                    })
                    done_map['heuristics'].add(cache_key)

                if section['heuristic'].get('attack_id', False):
                    # Get attack matrix data
                    attack_id = section['heuristic']['attack_id']
                    cache_key = f"{attack_id}_{key}"
                    if cache_key not in done_map['attack']:
                        out['attack_matrix'].append({
                            "key": key,
                            "attack_id": attack_id,
                            "h_type": h_type,
                            "name": section['heuristic']['attack_pattern'],
                            "categories": section['heuristic']['attack_categories']
                        })
                        done_map['attack'].add(cache_key)

            # Get tagging data
            for tag_type, tags in flatten(section.get('tags', {})).items():
                if tags is not None:
                    for tag in tags:
                        cache_key = f"{tag_type}_{tag}_{key}"
                        if cache_key not in done_map['tags']:
                            out['tags'].append({
                                'type': tag_type,
                                'h_type': h_type,
                                'short_type': tag_type.rsplit(".", 1)[-1],
                                'value': tag,
                                'key': key
                            })
                            done_map['tags'].add(cache_key)

    return out
def ingest_single_file(**kwargs):
    """
    Ingest a single file, sha256 or URL in the system

    Note 1: If you are submitting a sha256 or a URL, you must use the application/json encoding
            and one of sha256 or url parameters must be included in the data block.

    Note 2: If you are submitting a file directly, you have to use multipart/form-data encoding.
            This was done to reduce the memory footprint and speed up file transfers.
             ** Read documentation of mime multipart standard if your library does not support it **

            The multipart/form-data for sending binary has two parts:
                - The first part contains a JSON dump of the optional params and uses the name 'json'
                - The last part contains the file binary, uses the name 'bin' and includes a filename

    Note 3: The ingest API uses the user's default settings to submit files to the system
            unless these settings are overridden in the 'params' field. There are, however,
            exceptions to that rule. Fields deep_scan, ignore_filtering and ignore_cache are
            reset to False because they lead to dangerous behavior in the system.

    Variables:
    None

    Arguments:
    None

    Data Block (SHA256 or URL):
    {
      // REQUIRED VALUES: One of the following
      "sha256": "1234...CDEF",         # SHA256 hash of the file
      "url": "http://...",             # Url to fetch the file from

      // OPTIONAL VALUES
      "name": "file.exe",              # Name of the file

      "metadata": {                    # Submission metadata
        "key": val,                    # Key/Value pair for metadata parameters
      },

      "params": {                      # Submission parameters
        "key": val,                    # Key/Value pair for params that differ from the user's defaults
      },                               # DEFAULT: /api/v3/user/submission_params/<user>/

      "generate_alert": False,         # Generate an alert in our alerting system or not
      "notification_queue": None,      # Name of the notification queue
      "notification_threshold": None,  # Threshold for notification
    }

    Data Block (Binary):

    --0b34a3c50d3c02dd804a172329a0b2aa        <-- Randomly generated boundary for this http request
    Content-Disposition: form-data; name="json"      <-- JSON data blob part (only previous optional values valid)

    {"params": {"ignore_cache": true}, "generate_alert": true}
    --0b34a3c50d3c02dd804a172329a0b2aa        <-- Switch to next part, file part
    Content-Disposition: form-data; name="bin"; filename="name_of_the_file_to_scan.bin"

    <BINARY DATA OF THE FILE TO SCAN... DOES NOT NEED TO BE ENCODED>

    --0b34a3c50d3c02dd804a172329a0b2aa--      <-- End of HTTP transmission

    Result example:
    { "ingest_id": <ID OF THE INGESTED FILE> }
    """
    user = kwargs['user']
    out_dir = os.path.join(TEMP_SUBMIT_DIR, get_random_id())
    extracted_path = original_file = None

    try:
        # Get data block and binary blob
        if 'multipart/form-data' in request.content_type:
            if 'json' in request.values:
                data = json.loads(request.values['json'])
            else:
                data = {}
            binary = request.files['bin']
            name = data.get("name", binary.filename)
            sha256 = None
            url = None
        elif 'application/json' in request.content_type:
            data = request.json
            binary = None
            sha256 = data.get('sha256', None)
            url = data.get('url', None)
            name = data.get("name", None) or sha256 or os.path.basename(url) or None
        else:
            return make_api_response({}, "Invalid content type", 400)

        if not data:
            return make_api_response({}, "Missing data block", 400)

        # Get notification queue parameters
        notification_queue = data.get('notification_queue', None)
        notification_threshold = data.get('notification_threshold', None)
        if not isinstance(notification_threshold, int) and notification_threshold:
            return make_api_response({}, "notification_threshold should be an int", 400)

        # Get file name
        if not name:
            return make_api_response({}, "Filename missing", 400)

        name = safe_str(os.path.basename(name))
        if not name:
            return make_api_response({}, "Invalid filename", 400)

        try:
            os.makedirs(out_dir)
        except Exception:
            pass
        original_file = out_file = os.path.join(out_dir, name)

        # Prepare variables
        extra_meta = {}
        fileinfo = None
        do_upload = True
        al_meta = {}

        # Load default user params
        s_params = ui_to_submission_params(load_user_settings(user))

        # Reset dangerous user settings to safe values
        s_params.update({
            'deep_scan': False,
            "priority": 150,
            "ignore_cache": False,
            "ignore_dynamic_recursion_prevention": False,
            "ignore_filtering": False,
            "type": "INGEST"
        })

        # Apply provided params
        s_params.update(data.get("params", {}))

        # Load file
        if not binary:
            if sha256:
                fileinfo = STORAGE.file.get_if_exists(
                    sha256, as_obj=False, archive_access=config.datastore.ilm.update_archive)
                if FILESTORE.exists(sha256):
                    if fileinfo:
                        if not Classification.is_accessible(user['classification'],
                                                            fileinfo['classification']):
                            return make_api_response({}, "SHA256 does not exist in our datastore", 404)
                        else:
                            # File's classification must be applied at a minimum
                            s_params['classification'] = Classification.max_classification(
                                s_params['classification'], fileinfo['classification'])
                    else:
                        # File is in storage and the DB, no need to upload anymore
                        do_upload = False

                    # File exists in the filestore and the user has appropriate file access
                    FILESTORE.download(sha256, out_file)
                else:
                    return make_api_response({}, "SHA256 does not exist in our datastore", 404)
            else:
                if url:
                    if not config.ui.allow_url_submissions:
                        return make_api_response({}, "URL submissions are disabled in this system", 400)

                    try:
                        safe_download(url, out_file)
                        extra_meta['submitted_url'] = url
                    except FileTooBigException:
                        return make_api_response({}, "File too big to be scanned.", 400)
                    except InvalidUrlException:
                        return make_api_response({}, "Url provided is invalid.", 400)
                    except ForbiddenLocation:
                        return make_api_response({}, "Hostname in this URL cannot be resolved.", 400)
                else:
                    return make_api_response(
                        {}, "Missing file to scan. No binary, sha256 or url provided.", 400)
        else:
            binary.save(out_file)

        if do_upload and os.path.getsize(out_file) == 0:
            return make_api_response({}, err="File empty. Ingestion failed", status_code=400)

        # Apply group params if not specified
        if 'groups' not in s_params:
            s_params['groups'] = user['groups']

        # Get generate alert parameter
        generate_alert = data.get('generate_alert', s_params.get('generate_alert', False))
        if not isinstance(generate_alert, bool):
            return make_api_response({}, "generate_alert should be a boolean", 400)

        # Override final parameters
        s_params.update({
            'generate_alert': generate_alert,
            'max_extracted': config.core.ingester.default_max_extracted,
            'max_supplementary': config.core.ingester.default_max_supplementary,
            'priority': min(s_params.get("priority", 150), config.ui.ingest_max_priority),
            'submitter': user['uname']
        })

        # Enforce maximum DTL
        if config.submission.max_dtl > 0:
            s_params['ttl'] = min(int(s_params['ttl']), config.submission.max_dtl) \
                if int(s_params['ttl']) else config.submission.max_dtl

        # No need to re-calculate fileinfo if we have it already
        if not fileinfo:
            # Calculate file digest
            fileinfo = IDENTIFY.fileinfo(out_file)

        # Validate file size
        if fileinfo['size'] > MAX_SIZE and not s_params.get('ignore_size', False):
            msg = f"File too large ({fileinfo['size']} > {MAX_SIZE}). Ingestion failed"
            return make_api_response({}, err=msg, status_code=413)
        elif fileinfo['size'] == 0:
            return make_api_response({}, err="File empty. Ingestion failed", status_code=400)

        # Decode cart if needed
        extracted_path, fileinfo, al_meta = decode_file(out_file, fileinfo, IDENTIFY)
        if extracted_path:
            out_file = extracted_path

        # Alter filename and classification based on CaRT output
        meta_classification = al_meta.pop('classification', s_params['classification'])
        if meta_classification != s_params['classification']:
            try:
                s_params['classification'] = Classification.max_classification(
                    meta_classification, s_params['classification'])
            except InvalidClassification as ic:
                return make_api_response(
                    {}, "The classification found inside the cart file cannot be merged with "
                        f"the classification the file was submitted as: {str(ic)}", 400)
        name = al_meta.pop('name', name)

        # Validate ingest classification
        if not Classification.is_accessible(user['classification'], s_params['classification']):
            return make_api_response({}, "You cannot start a submission with higher "
                                         "classification than you're allowed to see", 400)

        # Freshen file object
        expiry = now_as_iso(s_params['ttl'] * 24 * 60 * 60) if s_params.get('ttl', None) else None
        STORAGE.save_or_freshen_file(fileinfo['sha256'], fileinfo, expiry, s_params['classification'])

        # Save the file to the filestore if need be
        # (no need to test if it exists before upload because upload already does that)
        if do_upload:
            FILESTORE.upload(out_file, fileinfo['sha256'], location='far')

        # Setup notification queue if needed
        if notification_queue:
            notification_params = {
                "queue": notification_queue,
                "threshold": notification_threshold
            }
        else:
            notification_params = {}

        # Load metadata, setup some default values if they are missing and append the cart metadata
        ingest_id = get_random_id()
        metadata = flatten(data.get("metadata", {}))
        metadata['ingest_id'] = ingest_id
        metadata['type'] = s_params['type']
        metadata.update(al_meta)
        if 'ts' not in metadata:
            metadata['ts'] = now_as_iso()
        metadata.update(extra_meta)

        # Set description if it does not exist
        s_params['description'] = s_params['description'] or \
            f"[{s_params['type']}] Inspection of file: {name}"

        # Create submission object
        try:
            submission_obj = Submission({
                "sid": ingest_id,
                "files": [{
                    'name': name,
                    'sha256': fileinfo['sha256'],
                    'size': fileinfo['size']
                }],
                "notification": notification_params,
                "metadata": metadata,
                "params": s_params
            })
        except (ValueError, KeyError) as e:
            return make_api_response({}, err=str(e), status_code=400)

        # Send submission object for processing
        ingest.push(submission_obj.as_primitives())
        submission_received(submission_obj)
        return make_api_response({"ingest_id": ingest_id})

    finally:
        # Cleanup files on disk
        try:
            if original_file and os.path.exists(original_file):
                os.unlink(original_file)
        except Exception:
            pass

        try:
            if extracted_path and os.path.exists(extracted_path):
                os.unlink(extracted_path)
        except Exception:
            pass

        try:
            if os.path.exists(out_dir):
                shutil.rmtree(out_dir, ignore_errors=True)
        except Exception:
            pass
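# Client-side sketch (not part of the server code above): one way a caller might build the
# multipart/form-data request described in the ingest_single_file docstring. The endpoint URL
# and authentication headers are assumptions; only the 'json' and 'bin' part names and the
# shape of the JSON data blob come from the docstring.
import json

import requests


def ingest_example(host: str, user: str, apikey: str, path: str):
    with open(path, "rb") as fh:
        resp = requests.post(
            f"{host}/api/v4/ingest/",  # assumed endpoint path
            headers={"X-USER": user, "X-APIKEY": apikey},  # assumed auth scheme
            data={"json": json.dumps({"params": {"ignore_cache": True}, "generate_alert": True})},
            files={"bin": fh},
        )
    resp.raise_for_status()
    return resp.json()  # expected to contain the new ingest_id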
}

UNUSED = [
    'BASE64_ALPHABET',
    'DISPLAY_SEARCH_STRING',
    'DYNAMIC_MALICIOUSNESS',
    'DYNAMIC_MALWARE_PATTERN',
    'FILE_ATTRIBUTION',
    'FILE_EXTENSION',
    'FILE_MIMETYPE',
    'HEURISTIC',
    'REQUEST_SCORE',
    'REQUEST_USERNAME',
    'SERVICE_DESCRIPTION',
    'SERVICE_DISPLAY_NAME',
    'SERVICE_NAME',
]


def reverse_map(data: Dict) -> Dict:
    output = {}
    for k, v in data.items():
        for x in v:
            output[x] = k
    return output


v3_lookup_map = reverse_map(flatten(tag_map))
v3_lookup_map.update({k: None for k in UNUSED})
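# Small worked example (hypothetical tag_map slice) of what reverse_map produces: the flattened
# tag_map maps a v4 dotted tag type to a list of legacy v3 names, and reverse_map inverts it so
# each v3 name looks up its v4 type. Unused v3 names are then mapped to None via UNUSED.
#
# >>> reverse_map({'network.static.ip': ['NET_IP'], 'file.name.extracted': ['FILE_NAME']})
# {'NET_IP': 'network.static.ip', 'FILE_NAME': 'file.name.extracted'}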
def service_finished(self, sid: str, result_key: str, result: Result,
                     temporary_data: Optional[Dict[str, Any]] = None):
    """Notifies the dispatcher of service completion, and possible new files to dispatch."""
    # Make sure the dispatcher knows we were working on this task
    task_key = ServiceTask.make_key(sid=sid, service_name=result.response.service_name, sha=result.sha256)
    task = self.running_tasks.pop(task_key)
    if not task:
        self.log.warning(f"[{sid}/{result.sha256}] {result.response.service_name} could not find the specified "
                         f"task in its set of running tasks while processing successful results.")
        return
    task = ServiceTask(task)

    # Save or freshen the result, the CONTENT of the result shouldn't change, but we need to keep the
    # most distant expiry time to prevent pulling it out from under another submission too early
    if result.is_empty():
        # Empty results will not be archived, therefore result.archive_ts drives their deletion
        self.ds.emptyresult.save(result_key, {"expiry_ts": result.archive_ts})
    else:
        while True:
            old, version = self.ds.result.get_if_exists(
                result_key, archive_access=self.config.datastore.ilm.update_archive, version=True)
            if old:
                if old.expiry_ts and result.expiry_ts:
                    result.expiry_ts = max(result.expiry_ts, old.expiry_ts)
                else:
                    result.expiry_ts = None
            try:
                self.ds.result.save(result_key, result, version=version)
                break
            except VersionConflictException as vce:
                self.log.info(f"Retrying to save results due to version conflict: {str(vce)}")

    # Send the result key to any watching systems
    msg = {'status': 'OK', 'cache_key': result_key}
    for w in self._get_watcher_list(task.sid).members():
        NamedQueue(w, host=self.redis).push(msg)

    # Save the tags
    tags = []
    for section in result.result.sections:
        tags.extend(tag_dict_to_list(flatten(section.tags.as_primitives())))

    # Pull out file names if we have them
    file_names = {}
    for extracted_data in result.response.extracted:
        if extracted_data.name:
            file_names[extracted_data.sha256] = extracted_data.name

    dispatcher = task.metadata['dispatcher__']
    result_queue = self._get_queue_from_cache(DISPATCH_RESULT_QUEUE + dispatcher)
    ex_ts = result.expiry_ts.strftime(DATEFORMAT) if result.expiry_ts else result.archive_ts.strftime(DATEFORMAT)
    result_queue.push({
        # 'service_task': task.as_primitives(),
        # 'result': result.as_primitives(),
        'sid': task.sid,
        'sha256': result.sha256,
        'service_name': task.service_name,
        'service_version': result.response.service_version,
        'service_tool_version': result.response.service_tool_version,
        'archive_ts': result.archive_ts.strftime(DATEFORMAT),
        'expiry_ts': ex_ts,
        'result_summary': {
            'key': result_key,
            'drop': result.drop_file,
            'score': result.result.score,
            'children': [r.sha256 for r in result.response.extracted],
        },
        'tags': tags,
        'extracted_names': file_names,
        'temporary_data': temporary_data
    })
def _handle_task_result(self, exec_time: int, task: ServiceTask, result: Dict[str, Any],
                        client_id, service_name, freshen: bool, metric_factory):

    def freshen_file(file_info_list, item):
        file_info = file_info_list.get(item['sha256'], None)
        if file_info is None or not self.filestore.exists(item['sha256']):
            return True
        else:
            file_info['archive_ts'] = archive_ts
            file_info['expiry_ts'] = expiry_ts
            file_info['classification'] = item['classification']
            self.datastore.save_or_freshen_file(
                item['sha256'], file_info, file_info['expiry_ts'], file_info['classification'],
                is_section_image=item.get('is_section_image', False))
        return False

    archive_ts = now_as_iso(self.config.datastore.ilm.days_until_archive * 24 * 60 * 60)
    if task.ttl:
        expiry_ts = now_as_iso(task.ttl * 24 * 60 * 60)
    else:
        expiry_ts = None

    # Check if all files are in the filestore
    if freshen:
        missing_files = []
        hashes = list(set([f['sha256']
                           for f in result['response']['extracted'] + result['response']['supplementary']]))
        file_infos = self.datastore.file.multiget(hashes, as_obj=False, error_on_missing=False)

        with elasticapm.capture_span(name="handle_task_result.freshen_files", span_type="tasking_client"):
            with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
                res = {
                    f['sha256']: executor.submit(freshen_file, file_infos, f)
                    for f in result['response']['extracted'] + result['response']['supplementary']
                }
            for k, v in res.items():
                if v.result():
                    missing_files.append(k)

        if missing_files:
            return missing_files

    # Add scores to the heuristics, if any section set a heuristic
    with elasticapm.capture_span(name="handle_task_result.process_heuristics", span_type="tasking_client"):
        total_score = 0
        for section in result['result']['sections']:
            zeroize_on_sig_safe = section.pop('zeroize_on_sig_safe', True)
            section['tags'] = flatten(section['tags'])
            if section.get('heuristic'):
                heur_id = f"{service_name.upper()}.{str(section['heuristic']['heur_id'])}"
                section['heuristic']['heur_id'] = heur_id
                try:
                    section['heuristic'], new_tags = self.heuristic_handler.service_heuristic_to_result_heuristic(
                        section['heuristic'], self.heuristics, zeroize_on_sig_safe)
                    for tag in new_tags:
                        section['tags'].setdefault(tag[0], [])
                        if tag[1] not in section['tags'][tag[0]]:
                            section['tags'][tag[0]].append(tag[1])
                    total_score += section['heuristic']['score']
                except InvalidHeuristicException:
                    section['heuristic'] = None

    # Update the total score of the result
    result['result']['score'] = total_score

    # Add timestamps for creation, archive and expiry
    result['created'] = now_as_iso()
    result['archive_ts'] = archive_ts
    result['expiry_ts'] = expiry_ts

    # Pop the temporary submission data
    temp_submission_data = result.pop('temp_submission_data', None)
    if temp_submission_data:
        old_submission_data = {row.name: row.value for row in task.temporary_submission_data}
        temp_submission_data = {
            k: v for k, v in temp_submission_data.items()
            if k not in old_submission_data or v != old_submission_data[k]
        }
        big_temp_data = {
            k: len(str(v)) for k, v in temp_submission_data.items()
            if len(str(v)) > self.config.submission.max_temp_data_length
        }
        if big_temp_data:
            big_data_sizes = [f"{k}={v}" for k, v in big_temp_data.items()]
            self.log.warning(
                f"[{task.sid}] The following temporary submission keys were ignored because they are "
                "bigger than the maximum data size allowed "
                f"[{self.config.submission.max_temp_data_length}]: {' | '.join(big_data_sizes)}")
            temp_submission_data = {
                k: v for k, v in temp_submission_data.items()
                if k not in big_temp_data
            }

    # Process the tag values
    with elasticapm.capture_span(name="handle_task_result.process_tags", span_type="tasking_client"):
        for section in result['result']['sections']:
            # Perform tag safelisting
            tags, safelisted_tags = self.tag_safelister.get_validated_tag_map(section['tags'])
            section['tags'] = unflatten(tags)
            section['safelisted_tags'] = safelisted_tags

            section['tags'], dropped = construct_safe(Tagging, section.get('tags', {}))

            # Set section score to zero and lower total score if service is set to zeroize score
            # and all tags were safelisted
            if section.pop('zeroize_on_tag_safe', False) and \
                    section.get('heuristic') and \
                    len(tags) == 0 and \
                    len(safelisted_tags) != 0:
                result['result']['score'] -= section['heuristic']['score']
                section['heuristic']['score'] = 0

            if dropped:
                self.log.warning(f"[{task.sid}] Invalid tag data from {service_name}: {dropped}")

    result = Result(result)
    result_key = result.build_key(service_tool_version=result.response.service_tool_version, task=task)
    self.dispatch_client.service_finished(task.sid, result_key, result, temp_submission_data)

    # Metrics
    if result.result.score > 0:
        metric_factory.increment('scored')
    else:
        metric_factory.increment('not_scored')

    self.log.info(f"[{task.sid}] {client_id} - {service_name} "
                  f"successfully completed task {f' in {exec_time}ms' if exec_time else ''}")

    self.status_table.set(client_id, (service_name, ServiceStatus.Idle, time.time() + 5))
def get_summary_from_keys(self, keys, cl_engine=forge.get_classification(), user_classification=None):
    out = {
        "tags": [],
        "attack_matrix": [],
        "heuristics": {
            "info": [],
            "suspicious": [],
            "malicious": []
        },
        "classification": cl_engine.UNRESTRICTED,
        "filtered": False
    }
    done_map = {"heuristics": set(), "attack": set(), "tags": set()}

    if len(keys) == 0:
        return out

    keys = [x for x in list(keys) if not x.endswith(".e")]
    file_keys = list(set([x[:64] for x in keys]))

    try:
        items = self.result.multiget(keys, as_obj=False)
    except MultiKeyError as e:
        # Generate partial summaries even if results are missing
        log.warning(f"Trying to generate summary but we are missing result(s): {str(e.keys)}")
        items = e.partial_output
        out['missing_results'] = e.keys

    try:
        files = self.file.multiget(file_keys, as_obj=False)
    except MultiKeyError as e:
        # Generate partial summaries even if files are missing
        log.warning(f"Trying to generate summary but we are missing file(s): {str(e.keys)}")
        files = e.partial_output
        out['missing_files'] = e.keys

    for key, item in items.items():
        for section in item.get('result', {}).get('sections', []):
            file_classification = files.get(key[:64], {}).get('classification', section['classification'])
            if user_classification:
                if not cl_engine.is_accessible(user_classification, section['classification']):
                    out["filtered"] = True
                    continue
                if not cl_engine.is_accessible(user_classification, file_classification):
                    out["filtered"] = True
                    continue

            out["classification"] = cl_engine.max_classification(out["classification"], section['classification'])
            out["classification"] = cl_engine.max_classification(out["classification"], file_classification)

            h_type = "info"

            if section.get('heuristic', False):
                # Get the heuristics data
                if section['heuristic']['score'] < 100:
                    h_type = "info"
                elif section['heuristic']['score'] < 1000:
                    h_type = "suspicious"
                else:
                    h_type = "malicious"

                cache_key = f"{section['heuristic']['heur_id']}_{key}"
                if cache_key not in done_map['heuristics']:
                    out['heuristics'][h_type].append({
                        'heur_id': section['heuristic']['heur_id'],
                        'name': section['heuristic']['name'],
                        'key': key
                    })
                    done_map['heuristics'].add(cache_key)

                for attack in section['heuristic'].get('attack', []):
                    # Get attack matrix data
                    attack_id = attack['attack_id']
                    cache_key = f"{attack_id}_{key}"
                    if cache_key not in done_map['attack']:
                        out['attack_matrix'].append({
                            "key": key,
                            "attack_id": attack_id,
                            "h_type": h_type,
                            "name": attack['pattern'],
                            "categories": attack['categories']
                        })
                        done_map['attack'].add(cache_key)

            # Get tagging data
            for tag_type, tags in flatten(section.get('tags', {})).items():
                if tags is not None:
                    for tag in tags:
                        cache_key = f"{tag_type}_{tag}_{key}"
                        if cache_key not in done_map['tags']:
                            out['tags'].append({
                                'type': tag_type,
                                'h_type': h_type,
                                'short_type': tag_type.rsplit(".", 1)[-1],
                                'value': tag,
                                'key': key
                            })
                            done_map['tags'].add(cache_key)

    return out
def validate_tags(tag_map):
    tag_map, _ = construct_safe(Tagging, unflatten(tag_map))
    tag_map = flatten(tag_map.as_primitives(strip_null=True))
    return tag_map
def submit(**kwargs):
    """
    Submit a single file, sha256 or url for analysis

    Note 1: If you are submitting a sha256 or a URL, you must use the application/json encoding
            and one of sha256 or url parameters must be included in the data block.

    Note 2: If you are submitting a file directly, you have to use multipart/form-data encoding.
            This was done to reduce the memory footprint and speed up file transfers.
             ** Read documentation of mime multipart standard if your library does not support it **

            The multipart/form-data for sending binary has two parts:
                - The first part contains a JSON dump of the optional params and uses the name 'json'
                - The last part contains the file binary, uses the name 'bin' and includes a filename

    Variables:
    None

    Arguments:
    None

    Data Block (SHA256 or URL):
    {
      // REQUIRED: One of the two following
      "sha256": "123...DEF",    # SHA256 hash of the file already in the datastore
      "url": "http://...",      # Url to fetch the file from

      // OPTIONAL VALUES
      "name": "file.exe",       # Name of the file to scan, otherwise the sha256 or base file of the url

      "metadata": {             # Submission metadata
        "key": val,             # Key/Value pair metadata values
      },

      "params": {               # Submission parameters
        "key": val,             # Key/Value pair for params that differ from the defaults
      },                        # Default params can be fetched at /api/v3/user/submission_params/<user>/
    }

    Data Block (Binary):

    --0b34a3c50d3c02dd804a172329a0b2aa        <-- Randomly generated boundary for this http request
    Content-Disposition: form-data; name="json"      <-- JSON data blob part (only previous optional values valid)

    {"metadata": {"hello": "world"}}
    --0b34a3c50d3c02dd804a172329a0b2aa        <-- Switch to next part, file part
    Content-Disposition: form-data; name="bin"; filename="name_of_the_file_to_scan.bin"

    <BINARY DATA OF THE FILE TO SCAN... DOES NOT NEED TO BE ENCODED>

    --0b34a3c50d3c02dd804a172329a0b2aa--      <-- End of HTTP transmission

    Result example:
    <Submission message object as a json dictionary>
    """
    user = kwargs['user']

    quota_error = check_submission_quota(user)
    if quota_error:
        return make_api_response("", quota_error, 503)

    out_dir = os.path.join(TEMP_SUBMIT_DIR, get_random_id())

    with forge.get_filestore() as f_transport:
        try:
            # Get data block and binary blob
            if 'multipart/form-data' in request.content_type:
                if 'json' in request.values:
                    data = json.loads(request.values['json'])
                else:
                    data = {}
                binary = request.files['bin']
                name = data.get("name", binary.filename)
                sha256 = None
                url = None
            elif 'application/json' in request.content_type:
                data = request.json
                binary = None
                sha256 = data.get('sha256', None)
                url = data.get('url', None)
                name = data.get("name", None) or sha256 or os.path.basename(url) or None
            else:
                return make_api_response({}, "Invalid content type", 400)

            if data is None:
                return make_api_response({}, "Missing data block", 400)

            if not name:
                return make_api_response({}, "Filename missing", 400)

            name = os.path.basename(name)
            if not name:
                return make_api_response({}, "Invalid filename", 400)

            # Create task object
            if "ui_params" in data:
                s_params = ui_to_submission_params(data['ui_params'])
            else:
                s_params = ui_to_submission_params(STORAGE.user_settings.get(user['uname'], as_obj=False))

            if not s_params:
                s_params = get_default_user_settings(user)

            s_params.update(data.get("params", {}))
            if 'groups' not in s_params:
                s_params['groups'] = user['groups']

            s_params['quota_item'] = True
            s_params['submitter'] = user['uname']
            if not s_params['description']:
                s_params['description'] = "Inspection of file: %s" % name

            if not Classification.is_accessible(user['classification'], s_params['classification']):
                return make_api_response({}, "You cannot start a scan with higher "
                                             "classification than you're allowed to see", 400)

            # Prepare the output directory
            try:
                os.makedirs(out_dir)
            except Exception:
                pass
            out_file = os.path.join(out_dir, name)

            # Get the output file
            extra_meta = {}
            if not binary:
                if sha256:
                    if f_transport.exists(sha256):
                        f_transport.download(sha256, out_file)
                    else:
                        return make_api_response({}, "SHA256 does not exist in our datastore", 404)
                else:
                    if url:
                        if not config.ui.allow_url_submissions:
                            return make_api_response({}, "URL submissions are disabled in this system", 400)

                        try:
                            safe_download(url, out_file)
                            extra_meta['submitted_url'] = url
                        except FileTooBigException:
                            return make_api_response({}, "File too big to be scanned.", 400)
                        except InvalidUrlException:
                            return make_api_response({}, "Url provided is invalid.", 400)
                        except ForbiddenLocation:
                            return make_api_response({}, "Hostname in this URL cannot be resolved.", 400)
                    else:
                        return make_api_response(
                            {}, "Missing file to scan. No binary, sha256 or url provided.", 400)
            else:
                with open(out_file, "wb") as my_file:
                    my_file.write(binary.read())

            try:
                metadata = flatten(data.get('metadata', {}))
                metadata.update(extra_meta)
                submission_obj = Submission({
                    "files": [],
                    "metadata": metadata,
                    "params": s_params
                })
            except (ValueError, KeyError) as e:
                return make_api_response("", err=str(e), status_code=400)

            # Submit the task to the system
            try:
                result = SubmissionClient(datastore=STORAGE, filestore=f_transport, config=config).submit(
                    submission_obj, local_files=[out_file], cleanup=False)
            except SubmissionException as e:
                return make_api_response("", err=str(e), status_code=400)

            return make_api_response(result.as_primitives())

        finally:
            try:
                # noinspection PyUnboundLocalVariable
                os.unlink(out_file)
            except Exception:
                pass

            try:
                shutil.rmtree(out_dir, ignore_errors=True)
            except Exception:
                pass
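# Client-side sketch (assumed endpoint path and auth headers, not part of the API code above):
# submitting an existing sha256 with the application/json encoding described in the submit
# docstring. Only the JSON body fields come from the docstring.
import requests


def submit_by_hash_example(host: str, user: str, apikey: str, sha256: str):
    resp = requests.post(
        f"{host}/api/v4/submit/",  # assumed endpoint path
        headers={"X-USER": user, "X-APIKEY": apikey},  # assumed auth scheme
        json={"sha256": sha256, "name": "sample.exe", "metadata": {"hello": "world"}},
    )
    resp.raise_for_status()
    return resp.json()  # submission message object as a dictionary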
def submit(self, submission_obj: SubmissionObject, local_files: List = None, completed_queue=None):
    """Submit several files in a single submission.

    After this method runs, there should be no local copies of the file left.
    """
    if local_files is None:
        local_files = []

    if len(submission_obj.files) == 0 and len(local_files) == 0:
        raise SubmissionException("No files found to submit...")

    if submission_obj.params.ttl:
        expiry = epoch_to_iso(submission_obj.time.timestamp() + submission_obj.params.ttl * 24 * 60 * 60)
    else:
        expiry = None
    max_size = self.config.submission.max_file_size

    for local_file in local_files:
        # Upload/download, extract, analyze files
        original_classification = str(submission_obj.params.classification)
        file_hash, size, new_metadata = self._ready_file(local_file, expiry, original_classification)
        new_name = new_metadata.pop('name', safe_str(os.path.basename(local_file)))
        meta_classification = new_metadata.pop('classification', original_classification)
        if meta_classification != original_classification:
            try:
                submission_obj.params.classification = Classification.max_classification(
                    meta_classification, original_classification)
            except InvalidClassification as ic:
                raise SubmissionException(
                    "The classification found inside the cart file cannot be merged with "
                    f"the classification the file was submitted as: {str(ic)}")

        submission_obj.metadata.update(**flatten(new_metadata))

        # Check that after we have resolved exactly what to pass on, that it
        # remains a valid target for scanning
        if size > max_size and not submission_obj.params.ignore_size:
            msg = "File too large (%d > %d). Submission failed" % (size, max_size)
            raise SubmissionException(msg)
        elif size == 0:
            msg = "File empty. Submission failed"
            raise SubmissionException(msg)

        submission_obj.files.append(File({
            'name': new_name,
            'size': size,
            'sha256': file_hash,
        }))

    # Clearing runtime_excluded on initial submit or resubmit
    submission_obj.params.services.runtime_excluded = []

    # We should now have all the information we need to construct a submission object
    sub = Submission(dict(
        archive_ts=now_as_iso(self.config.datastore.ilm.days_until_archive * 24 * 60 * 60),
        classification=submission_obj.params.classification,
        error_count=0,
        errors=[],
        expiry_ts=expiry,
        file_count=len(submission_obj.files),
        files=submission_obj.files,
        max_score=0,
        metadata=submission_obj.metadata,
        params=submission_obj.params,
        results=[],
        sid=submission_obj.sid,
        state='submitted',
        scan_key=submission_obj.scan_key,
    ))

    if self.config.ui.allow_malicious_hinting and submission_obj.params.malicious:
        sub.verdict = {"malicious": [submission_obj.params.submitter]}

    self.datastore.submission.save(sub.sid, sub)

    self.log.debug("Submission complete. Dispatching: %s", sub.sid)
    self.dispatcher.dispatch_submission(sub, completed_queue=completed_queue)

    return sub
def submit(self, submission_obj: SubmissionObject, local_files: List = None, cleanup=True, completed_queue=None):
    """Submit several files in a single submission.

    After this method runs, there should be no local copies of the file left.
    """
    if local_files is None:
        local_files = []

    try:
        expiry = now_as_iso(submission_obj.params.ttl * 24 * 60 * 60) if submission_obj.params.ttl else None
        max_size = self.config.submission.max_file_size

        if len(submission_obj.files) == 0:
            if len(local_files) == 0:
                raise SubmissionException("No files found to submit...")

            for local_file in local_files:
                # Upload/download, extract, analyze files
                file_hash, size, new_metadata = self._ready_file(local_file, expiry,
                                                                 str(submission_obj.params.classification),
                                                                 cleanup, upload=True)
                new_name = new_metadata.pop('name', safe_str(os.path.basename(local_file)))
                submission_obj.params.classification = new_metadata.pop('classification',
                                                                         submission_obj.params.classification)
                submission_obj.metadata.update(**flatten(new_metadata))

                # Check that after we have resolved exactly what to pass on, that it
                # remains a valid target for scanning
                if size > max_size and not submission_obj.params.ignore_size:
                    msg = "File too large (%d > %d). Submission failed" % (size, max_size)
                    raise SubmissionException(msg)
                elif size == 0:
                    msg = "File empty. Submission failed"
                    raise SubmissionException(msg)

                submission_obj.files.append(File({
                    'name': new_name,
                    'size': size,
                    'sha256': file_hash,
                }))
        else:
            for f in submission_obj.files:
                temporary_path = None
                try:
                    fd, temporary_path = tempfile.mkstemp(prefix="submission.submit")
                    os.close(fd)  # We don't need the file descriptor open
                    self.filestore.download(f.sha256, temporary_path)
                    file_hash, size, new_metadata = self._ready_file(temporary_path, expiry,
                                                                     str(submission_obj.params.classification),
                                                                     cleanup, sha256=f.sha256)

                    new_name = new_metadata.pop('name', f.name)
                    submission_obj.params.classification = new_metadata.pop('classification',
                                                                             submission_obj.params.classification)
                    submission_obj.metadata.update(**flatten(new_metadata))

                    # Check that after we have resolved exactly what to pass on, that it
                    # remains a valid target for scanning
                    if size > max_size and not submission_obj.params.ignore_size:
                        msg = "File too large (%d > %d). Submission failed" % (size, max_size)
                        raise SubmissionException(msg)
                    elif size == 0:
                        msg = "File empty. Submission failed"
                        raise SubmissionException(msg)

                    if f.size is None:
                        f.size = size
                    f.name = new_name
                    f.sha256 = file_hash
                finally:
                    if temporary_path:
                        if os.path.exists(temporary_path):
                            os.unlink(temporary_path)

        # Initialize the temporary data from the submission parameter
        if submission_obj.params.initial_data:
            try:
                temp_hash_name = get_temporary_submission_data_name(submission_obj.sid,
                                                                    submission_obj.files[0].sha256)
                temporary_submission_data = ExpiringHash(temp_hash_name, host=self.redis)
                temporary_submission_data.multi_set(json.loads(submission_obj.params.initial_data))
            except ValueError as err:
                self.log.warning(f"[{submission_obj.sid}] could not process initialization data: {err}")

        # Clearing runtime_excluded on initial submit or resubmit
        submission_obj.params.services.runtime_excluded = []

        # We should now have all the information we need to construct a submission object
        sub = Submission(dict(
            archive_ts=now_as_iso(self.config.datastore.ilm.days_until_archive * 24 * 60 * 60),
            classification=submission_obj.params.classification,
            error_count=0,
            errors=[],
            expiry_ts=expiry,
            file_count=len(submission_obj.files),
            files=submission_obj.files,
            max_score=0,
            metadata=submission_obj.metadata,
            params=submission_obj.params,
            results=[],
            sid=submission_obj.sid,
            state='submitted'
        ))
        self.datastore.submission.save(sub.sid, sub)

        self.log.debug("Submission complete. Dispatching: %s", sub.sid)
        self.dispatcher.dispatch_submission(sub, completed_queue=completed_queue)

        return sub

    finally:
        # Just in case this method fails, clean up local files
        if cleanup:
            for path in local_files:
                if path and os.path.exists(path):
                    # noinspection PyBroadException
                    try:
                        os.unlink(path)
                    except Exception:
                        self.log.error("Couldn't delete dangling file %s", path)