def recurse_add_res(self, file_res, res_list, new_files, parent=None):
    """Turn each result dictionary of ``res_list`` into a ``ResultSection``.

    :param file_res: Result object receiving the sections created without a parent
    :param res_list: Iterable of dictionaries describing the sections to create
    :param new_files: List that is extended in place with the files to resubmit
    :param parent: Optional section the new sections are attached to
    :return: None
    """
    for entry in res_list:
        # Entries whose condition does not pass are ignored entirely
        if not self.pass_condition(entry.get("condition", None)):
            continue

        section = ResultSection(
            entry['title_text'],
            classification=entry.get('classification', Classification.UNRESTRICTED),
            parent=parent,
            body_format=entry.get('body_format', BODY_FORMAT.TEXT))
        section.set_heuristic(
            self.heuristic_alteration(entry.get('score_condition', None), entry['heur_id']))

        # Tags are (type, value) pairs
        for tag in entry.get('tags', []):
            section.add_tag(tag[0], tag[1])

        # Only a truthy body is written to the section
        content = entry.get('body', None)
        if content:
            section.set_body(content)

        # Files to resubmit: tuple entries carry the file in their second slot
        for candidate in entry.get('files', []):
            new_files.append(candidate[1] if isinstance(candidate, tuple) else candidate)

        # Sections without a parent are root sections of the file result
        if parent is None:
            file_res.add_section(section)
def _create_random_section(self):
    """Build a ``ResultSection`` filled with random content.

    The section gets a random body format and title, four random lines,
    random tags, and - one time out of three each - a random heuristic and a
    recursively generated sub-section.

    :return: The generated section
    """
    one_in_three = [False, False, True]

    # Random body format first, then a random title
    chosen_format = random.choice(FORMAT_LIST)
    generated = ResultSection(get_random_phrase(3, 7), body_format=chosen_format)

    # range(1, 5) always produces four iterations, so four lines are added
    for _ in range(1, 5):
        generated.add_line(get_random_phrase(5, 10))

    # Every value of every flattened tag type becomes a tag
    for tag_type, tag_values in flatten(get_random_tags()).items():
        for tag_value in tag_values:
            generated.add_tag(tag_type, tag_value)

    # A heuristic is set a third of the time
    if random.choice(one_in_three):
        generated.set_heuristic(random.randint(1, 4))

    # A random sub-section is nested a third of the time
    if random.choice(one_in_three):
        generated.add_subsection(self._create_random_section())

    return generated
def execute(self, request):
    """Process the submitted file twice (extraction only, then deobfuscation) and report.

    When either ``process_file`` call raises, a 'Failed to analyze' section
    holding the error text is attached to the result and the method returns
    without calling ``add_results``.

    :param request: The service request being processed
    :return: None
    """
    result = Result()
    request.result = result
    file_path = request.file_path
    password = request.get_param('password')
    start_point = request.get_param('start point')

    # Options shared by both process_file invocations
    shared_options = dict(file=file_path,
                          password=password,
                          noninteractive=True,
                          no_indent=True,
                          output_level=0,
                          return_deobfuscated=True)

    try:
        data = process_file(extract_only=True, **shared_options)
        data_deobfuscated = process_file(
            start_point=start_point,
            output_formula_format='[[CELL-ADDR]]: [[INT-FORMULA]]',
            **shared_options)
    except Exception as e:
        reason = str(e)
        failure = ResultSection('Failed to analyze', parent=request.result)
        failure.add_line(reason)
        # Decryption failures additionally raise a heuristic
        if reason.startswith('Failed to decrypt'):
            failure.set_heuristic(6)
        return

    add_results(result, data, data_deobfuscated)
def parse_link(self, parent_res, path):
    """Decode the LNK file at ``path`` and report its metadata under ``parent_res``.

    :param parent_res: Section the metadata section is attached to
    :param path: Path of the file to decode
    :return: False when ``decode_lnk`` yields no metadata, True otherwise
    """
    with open(path, "rb") as fh:
        metadata = decode_lnk(fh.read())

    if metadata is None:
        return False

    # Keep only the non-empty flattened values, keyed by their display name
    body_output = {}
    for raw_key, raw_value in flatten(metadata).items():
        if raw_value:
            body_output[build_key(raw_key)] = raw_value

    res = ResultSection("Metadata extracted by parse_lnk",
                        body_format=BODY_FORMAT.KEY_VALUE,
                        body=json.dumps(body_output),
                        parent=parent_res)

    def _stripped(field_name):
        # Missing fields fall back to an empty string
        return metadata.get(field_name, "").strip()

    bp = _stripped("BasePath")
    rp = _stripped("RELATIVE_PATH")
    nn = _stripped("NetName")
    cla = _stripped("COMMAND_LINE_ARGUMENTS")

    # The heuristic is raised when the lowercased command line matches BAD_LINK_RE
    if BAD_LINK_RE.search(cla.lower()):
        res.set_heuristic(1)

    target_name = (bp or rp or nn).rsplit("\\")[-1]
    res.add_tag(tag_type="file.name.extracted", value=target_name)
    command_line = f"{(rp or bp or nn)} {cla}".strip()
    res.add_tag(tag_type="dynamic.process.command_line", value=command_line)

    # LNK-specific tag mappings take precedence over the generic (None) ones
    for key, value in body_output.items():
        mapped_type = TAG_MAP.get("LNK", {}).get(key, None)
        if not mapped_type:
            mapped_type = TAG_MAP.get(None, {}).get(key, None)
        if mapped_type:
            res.add_tag(mapped_type, value)

    return True
def dump_invalid_properties(self, parent_res):
    """Report the number of invalid properties, if any were counted.

    :param parent_res: Section the report is attached to
    :return: None
    """
    # Nothing is reported when the counter is zero
    if not self.invalid_properties_count:
        return

    title = (f"We've found {self.invalid_properties_count} properties with IDs different than "
             f"1 (storage), 2 (stream) and 5 (root)")
    report = ResultSection(title, parent=parent_res)
    report.set_heuristic(50)
def get_result_subsection(result, title, heuristic):
    """Return the subsection of ``result`` titled ``title``, creating it when missing.

    When several subsections share the title, the last one is returned. A
    newly created subsection is added to ``result`` and given ``heuristic``;
    an existing one is returned untouched.

    :param result: Section whose ``subsections`` are searched
    :param title: Title of the wanted subsection
    :param heuristic: Heuristic ID set on a newly created subsection
    :return: The existing or newly created subsection
    """
    # Collect every match so that the last one wins
    candidates = [candidate for candidate in result.subsections if candidate.title_text == title]
    found = candidates[-1] if candidates else None

    if not found:
        found = ResultSection(title)
        result.add_subsection(found)
        found.set_heuristic(heuristic)

    return found
def test_parse_results(response, correct_res_secs, metadefender_class_instance):
    """Check that ``parse_results`` produces the sections described by ``correct_res_secs``.

    :param response: Scan response handed to ``parse_results``
    :param correct_res_secs: Dictionaries describing the expected sections
    :param metadefender_class_instance: Service instance under test
    """
    from assemblyline_v4_service.common.result import Result, ResultSection, BODY_FORMAT, Heuristic

    service = metadefender_class_instance
    service.blocklist = ["a"]
    service.sig_score_revision_map = {}
    service.kw_score_revision_map = {}
    service.current_node = "http://blah"
    service.nodes[service.current_node] = {
        "engine_map": {
            "z": {"version": "blah", "def_time": "blah"},
            "y": {"version": "blah", "def_time": "blah"},
        },
        "queue_times": [],
        "file_count": 0,
    }

    # Build the expected result from its dictionary description
    correct_result = Result()
    for expected in correct_res_secs:
        # Any truthy "body_format" entry selects JSON, otherwise TEXT is used
        section_format = BODY_FORMAT.JSON if expected.get("body_format") else BODY_FORMAT.TEXT
        section = ResultSection(expected["title_text"],
                                body_format=section_format,
                                body=expected.get("body"))

        for subsec in expected.get("subsections", []):
            subsection = ResultSection(
                subsec["title_text"],
                body=subsec["body"],
                body_format=BODY_FORMAT.KEY_VALUE,
                tags=subsec.get("tags"),
            )
            if subsec.get("heuristic"):
                heuristic_info = subsec["heuristic"]
                subsection.set_heuristic(heuristic_info["heur_id"])
                print(heuristic_info["signatures"])
                for signature_name in heuristic_info["signatures"].keys():
                    subsection.heuristic.add_signature_id(signature_name)
            section.add_subsection(subsection)

        correct_result.add_section(section)

    actual_result = service.parse_results(response)

    # Compare each produced section with the expected one at the same position
    for position, produced in enumerate(actual_result.sections):
        assert check_section_equality(produced, correct_result.sections[position])
def dump_properties(self, parent_res):
    """Dump every OLE2 property, then report the ones left unreached.

    Three passes are made: the tree is dumped from property '0', the
    directories still listed in ``self.parent`` and the unvisited storages
    are dumped as orphans, and finally the remaining named properties are
    dumped under an "ORPHAN" section.

    :param parent_res: Section the generated sections are attached to
    :return: None
    """
    # 1. start with id 0 and navigate the tree from there.
    self.dump_dir('0', '\\', parent_res, False)

    # 2. any missing properties, look for dir first?
    # NOTE(review): termination relies on dump_dir updating self.parent /
    # self.property_dict - confirm against dump_dir.
    while len(self.parent) > 0:
        cur_dir = next(iter(self.parent))
        if self.property_dict[cur_dir][1]:
            del self.parent[cur_dir]
        else:
            # Climb to the topmost ancestor whose flag is still False
            while cur_dir in self.parent and self.property_dict[self.parent[cur_dir]][1] is False:
                cur_dir = self.parent[cur_dir]
            self.dump_dir(cur_dir, '\\-ORPHAN-\\', parent_res, True)

    for p_id, field_struct in self.property_dict.items():
        if field_struct[1] is False and field_struct[0]['type'].display == 'storage':
            self.dump_dir(p_id, '\\-ORPHAN-\\', parent_res, True)

    if len(self.invalid_streams) > 0:
        res_error = ResultSection(
            "Trying to access stream content from the short block, but root[0] doesn't "
            "even exist. This file is either corrupted, patched or exploiting a "
            "vulnerability.",
            parent=parent_res)
        # The stream names are listed comma-separated; the previous expression
        # built a tuple and printed its repr instead.
        res_error.add_line(
            f"Unable to access the following stream(s): {', '.join(self.invalid_streams)}")
        res_error.set_heuristic(40)

    # 3. any missing properties, with no parent?
    orphans = {
        p_id: field_struct
        for p_id, field_struct in self.property_dict.items()
        if field_struct[1] is False and field_struct[0]['name'].value != ''
    }

    if len(orphans) > 0:
        res = ResultSection("OLE2 STORAGE: \\-ORPHAN-")
        for p_id, field_struct in orphans.items():
            self.dump_property(field_struct[0], '\\-ORPHAN-', p_id, res, parent_res, True)

        # The orphan section is only reported when something was added to it
        if len(res.subsections) > 0:
            parent_res.add_subsection(res)
def _set_heuristic_by_verdict(self, result_section: ResultSection, verdict: Optional[str]) -> None:
    """
    Pick the heuristic of a result section from the verdict of the file
    :param result_section: The result section that will have its heuristic set
    :param verdict: The verdict of the file
    :return: None
    """
    # No verdict, nothing to do
    if not verdict:
        return

    # Verdicts that are in neither known list are only logged
    is_known = (verdict in Verdicts.INTERESTING_VERDICTS.value
                or verdict in Verdicts.UNINTERESTING_VERDICTS.value)
    if not is_known:
        self.log.debug(f"{verdict} was spotted. Is this useful?")
        return

    if verdict in Verdicts.MALICIOUS_VERDICTS.value:
        result_section.set_heuristic(1)
        return

    if verdict in Verdicts.SUSPICIOUS_VERDICTS.value:
        result_section.set_heuristic(2)
        return

    if verdict in Verdicts.TRUSTED_VERDICTS.value:
        self.log.debug(
            f"The verdict was {verdict}. Can we do something with this?")
def execute(self, request):
    """Extract the first script found by ``autoit_ripper`` and report it.

    When something is extracted, its UTF-8 decoded content is added to a
    '[DUMP RESULT]' section carrying heuristic 1, written to ``script.au3``
    in the working directory and attached as an extracted file.

    :param request: The service request being processed
    :return: None
    """
    # Local import keeps this block self-contained
    import os

    result = Result()

    with open(request.file_path, "rb") as f:
        file_content = f.read()

    content_list = autoit_ripper.extract(data=file_content)

    if content_list:
        # Only the first extracted entry is reported; its second item is the raw content
        content = content_list[0][1].decode("utf-8")

        text_section = ResultSection('[DUMP RESULT]')
        text_section.add_line(content)
        text_section.set_heuristic(1)
        result.add_section(text_section)

        # Join the path properly: plain concatenation dropped the separator
        # and created the file next to the working directory instead of in it.
        script_path = os.path.join(self.working_directory, "script.au3")
        # Write with the encoding the content was decoded with, not the locale default
        with open(script_path, "w", encoding="utf-8") as f:
            f.write(content)

        request.add_extracted(script_path, 'script.au3', 'This is the unpacked script')

    request.result = result
def execute(self, request):
    """Scan the submitted file with ``clamscan`` and report its output.

    Every non-empty output line is added to the result section; heuristic 1
    is raised when the first line contains "FOUND".

    :param request: The service request being processed
    :return: None
    """
    result = Result()
    file_path = request.file_path

    # An argument list without a shell keeps spaces and shell metacharacters
    # in the path from altering the command. run() drains the pipe while
    # waiting, unlike wait() followed by communicate(), which can block once
    # the pipe buffer fills up.
    completed = subprocess.run(
        ["clamscan", "-a", "-z", "--detect-pua", "--alert-macros", file_path],
        stdout=subprocess.PIPE)
    stdout = completed.stdout.decode("utf-8")

    # Drop the empty lines of the report
    report = [line for line in stdout.split("\n") if line]

    text_section = ResultSection("Successfully scanned the file")
    # Guard against an empty report (e.g. clamscan produced no output)
    if report and "FOUND" in report[0]:
        text_section.set_heuristic(1)

    for line in report:
        text_section.add_line(line)

    result.add_section(text_section)
    request.result = result
def test_init(mocker):
    """Check the section built by ``AvHitSection`` for several score-map configurations."""
    from json import dumps
    from assemblyline_v4_service.common.result import BODY_FORMAT, ResultSection
    mocker.patch("assemblyline_v4_service.common.api.ServiceAPIError")
    from metadefender import AvHitSection

    av_name = "blah"
    virus_name = "blah"

    def build_expected(heuristic_id, signature_score_args, scan_result, engine_version, definition_time):
        # Assemble the section AvHitSection is expected to be equal to
        expected = ResultSection(
            f"{av_name} identified the file as {virus_name}")
        expected.add_tag("av.virus_name", virus_name)
        expected.set_heuristic(heuristic_id)
        expected.heuristic.add_signature_id(
            f"{av_name}.{virus_name}", *signature_score_args)
        expected.set_body(
            dumps({
                "av_name": av_name,
                "virus_name": virus_name,
                "scan_result": scan_result,
                "engine_version": engine_version,
                "engine_definition_time": definition_time
            }), BODY_FORMAT.KEY_VALUE)
        return expected

    # Case 1: no engine details, heuristic 1, no revision maps, no safelist match
    engine = {}
    heur_id = 1
    sig_score_rev_map = {}
    kw_score_rev_map = {}
    safelist_match = []
    actual_res_sec = AvHitSection(av_name, virus_name, engine, heur_id,
                                  sig_score_rev_map, kw_score_rev_map,
                                  safelist_match)
    correct_result_section = build_expected(1, (), "infected", "unknown", "unknown")
    assert check_section_equality(actual_res_sec, correct_result_section)

    # Case 2: engine details, heuristic 2 and a safelist match
    engine = {"version": "blah", "def_time": 1}
    heur_id = 2
    safelist_match = ["blah"]
    actual_res_sec = AvHitSection(av_name, virus_name, engine, heur_id,
                                  sig_score_rev_map, kw_score_rev_map,
                                  safelist_match)
    correct_result_section = build_expected(2, (0,), "suspicious", "blah", 1)
    assert check_section_equality(actual_res_sec, correct_result_section)

    # Case 3: one keyword score revision
    kw_score_rev_map = {"bla": 1}
    actual_res_sec = AvHitSection(av_name, virus_name, engine, heur_id,
                                  sig_score_rev_map, kw_score_rev_map,
                                  safelist_match)
    correct_result_section = build_expected(2, (1,), "suspicious", "blah", 1)
    assert check_section_equality(actual_res_sec, correct_result_section)

    # Case 4: two keyword score revisions
    kw_score_rev_map = {"bla": 1, "h": 2}
    actual_res_sec = AvHitSection(av_name, virus_name, engine, heur_id,
                                  sig_score_rev_map, kw_score_rev_map,
                                  safelist_match)
    correct_result_section = build_expected(2, (2,), "suspicious", "blah", 1)
    assert check_section_equality(actual_res_sec, correct_result_section)

    # Case 5: a signature score revision for this exact signature
    sig_score_rev_map = {f"{av_name}.{virus_name}": 10}
    actual_res_sec = AvHitSection(av_name, virus_name, engine, heur_id,
                                  sig_score_rev_map, kw_score_rev_map,
                                  safelist_match)
    correct_result_section = build_expected(2, (10,), "suspicious", "blah", 1)
    assert check_section_equality(actual_res_sec, correct_result_section)
def execute(self, request):
    """Run the submitted PCAP through Suricata and report what it found.

    The PCAP is submitted over ``self.suricata_sc``, the parsed output is
    turned into sections for extracted files, discovered IOCs, TLS
    information and signature hits, and Suricata's log files are attached as
    supplementary files.

    :param request: The service request being processed
    :raises RecoverableError: When the connection to Suricata is reset or an
        extracted file cannot be found
    :return: None
    """
    file_path = request.file_path
    result = Result()

    # Report the version of suricata as the service context
    request.set_service_context(
        f"Suricata version: {self.get_suricata_version()}")

    # restart Suricata if we need to
    self.start_suricata_if_necessary()

    # Strip frame headers from the PCAP, since Suricata sometimes has trouble parsing strange PCAPs
    stripped_filepath = self.strip_frame_headers(file_path)

    # Check to make sure the size of the stripped file isn't 0 - this happens on pcapng files
    # TODO: there's probably a better way to do this - don't even strip it if it's pcapng
    if os.stat(stripped_filepath).st_size == 0:
        stripped_filepath = file_path

    # Switch stdout and stderr so we don't get our logs polluted
    # NOTE: for now the captured content is ignored but it is kept just in case...
    mystdout = StringIO()
    mystderr = StringIO()
    old_stdout, old_stderr = sys.stdout, sys.stderr
    sys.stdout, sys.stderr = mystdout, mystderr
    try:
        # Pass the pcap file to Suricata via the socket
        ret = self.suricata_sc.send_command(
            "pcap-file", {
                "filename": stripped_filepath,
                "output-dir": self.working_directory
            })

        if not ret or ret["return"] != "OK":
            # ret may be empty here, so it cannot be subscripted blindly
            failure = ret.get("message") if ret else "no response received"
            self.log.error(
                f"Failed to submit PCAP for processing: {failure}")

        # Wait for the socket finish processing our PCAP
        while True:
            time.sleep(1)
            try:
                ret = self.suricata_sc.send_command("pcap-current")
                if ret and ret["message"] == "None":
                    break
            except ConnectionResetError as e:
                raise RecoverableError(e)
    finally:
        # Bring back stdout and stderr, even when the polling above raised
        sys.stdout, sys.stderr = old_stdout, old_stderr

    alerts, signatures, domains, ips, urls, email_addresses, tls_dict, extracted_files, reverse_lookup = \
        self.parse_suricata_output().values()

    file_extracted_section = ResultSection("File(s) extracted by Suricata")
    # Parse the json results of the service
    if request.get_param("extract_files"):
        for file in extracted_files:
            sha256, filename, extracted_file_path = file.values()
            self.log.info(f"extracted file {filename}")
            try:
                if request.add_extracted(
                        extracted_file_path,
                        filename,
                        "Extracted by Suricata",
                        safelist_interface=self.api_interface):
                    file_extracted_section.add_line(filename)
                    if filename != sha256:
                        file_extracted_section.add_tag(
                            'file.name.extracted', filename)
            except FileNotFoundError as e:
                # An intermittent issue, just try again
                raise RecoverableError(e)
            except MaxExtractedExceeded:
                # We've hit our limit
                pass

    # Report a null score to indicate that files were extracted. If no sigs hit, it's not clear
    # where the extracted files came from
    if file_extracted_section.body:
        result.add_section(file_extracted_section)

    # Add tags for the domains, urls, and IPs we've discovered
    root_section = ResultSection("Discovered IOCs", parent=result)
    if domains:
        domain_section = ResultSection("Domains", parent=root_section)
        for domain in domains:
            domain_section.add_line(domain)
            domain_section.add_tag('network.dynamic.domain', domain)
    if ips:
        ip_section = ResultSection("IP Addresses", parent=root_section)
        for ip in ips:
            # Make sure it's not a local IP
            is_local = ip.startswith(("127.", "192.168.", "10.")) or (
                ip.startswith("172.") and 16 <= int(ip.split(".")[1]) <= 31)
            if not is_local:
                ip_section.add_line(ip)
                ip_section.add_tag('network.dynamic.ip', ip)
    if urls:
        url_section = ResultSection("URLs", parent=root_section)
        for url in urls:
            url_section.add_line(url)
            url_section.add_tag('network.dynamic.uri', url)
    if email_addresses:
        email_section = ResultSection("Email Addresses", parent=root_section)
        for eml in email_addresses:
            email_section.add_line(eml)
            email_section.add_tag('network.email.address', eml)

    # Map between suricata key names and AL tag types
    tls_mappings = {
        "subject": 'cert.subject',
        "issuerdn": 'cert.issuer',
        "version": 'cert.version',
        "notbefore": 'cert.valid.start',
        "notafter": 'cert.valid.end',
        "fingerprint": 'cert.thumbprint',
        "sni": 'network.tls.sni'
    }

    if tls_dict:
        tls_section = ResultSection("TLS Information",
                                    parent=root_section,
                                    body_format=BODY_FORMAT.JSON)
        kv_body = {}
        for tls_type, tls_values in tls_dict.items():
            if tls_type == "fingerprint":
                # make sure the cert fingerprint/thumbprint matches other values,
                # like from PEFile
                tls_values = [v.replace(":", "").lower() for v in tls_values]

            if tls_type in tls_mappings:
                kv_body[tls_type] = tls_values
                tag_type = tls_mappings[tls_type]
                if tag_type is not None:
                    for tls_value in tls_values:
                        tls_section.add_tag(tag_type, tls_value)
            elif tls_type == "ja3":
                kv_body.setdefault('ja3_hash', [])
                kv_body.setdefault('ja3_string', [])
                for ja3_entry in tls_values:
                    ja3_hash = ja3_entry.get("hash")
                    ja3_string = ja3_entry.get("string")
                    if ja3_hash:
                        kv_body['ja3_hash'].append(ja3_hash)
                        tls_section.add_tag('network.tls.ja3_hash', ja3_hash)
                    if ja3_string:
                        kv_body['ja3_string'].append(ja3_string)
                        tls_section.add_tag('network.tls.ja3_string', ja3_string)
            else:
                kv_body[tls_type] = tls_values
                # stick a message in the logs about a new TLS type found in suricata logs
                self.log.info(
                    f"Found new TLS type {tls_type} with values {tls_values}")
        tls_section.set_body(json.dumps(kv_body))

    # Create the result sections if there are any hits
    if len(alerts) > 0:
        for signature_id, signature_details in signatures.items():
            signature = signature_details['signature']
            attributes = signature_details['attributes']
            section = ResultSection(f'{signature_id}: {signature}')

            # Heuristic 3 unless the signature text matches a configured keyword list
            heur_id = 3
            if any(x in signature for x in self.config.get("sure_score")):
                heur_id = 1
            elif any(x in signature for x in self.config.get("vhigh_score")):
                heur_id = 2
            section.set_heuristic(heur_id)

            if signature_details['al_signature']:
                section.add_tag("file.rule.suricata",
                                signature_details['al_signature'])

            # Only the first ten flows are listed in the body
            for timestamp, src_ip, src_port, dest_ip, dest_port in alerts[signature_id][:10]:
                section.add_line(
                    f"{timestamp} {src_ip}:{src_port} -> {dest_ip}:{dest_port}")
            if len(alerts[signature_id]) > 10:
                section.add_line(
                    f'And {len(alerts[signature_id]) - 10} more flows')

            # Tag IPs/Domains/URIs associated to signature
            for flow in alerts[signature_id]:
                dest_ip = flow[3]
                section.add_tag('network.dynamic.ip', dest_ip)
                if dest_ip in reverse_lookup:
                    section.add_tag('network.dynamic.domain',
                                    reverse_lookup[dest_ip])
                for uri in urls:
                    if dest_ip in uri or (reverse_lookup.get(dest_ip)
                                          and reverse_lookup[dest_ip] in uri):
                        section.add_tag('network.dynamic.uri', uri)

            # Add a tag for the signature id and the message
            section.add_tag('network.signature.signature_id', str(signature_id))
            section.add_tag('network.signature.message', signature)
            for attr in attributes:
                if attr.get('uri'):
                    section.add_tag('network.static.uri', attr['uri'])

            # Tag malware_family
            for malware_family in signature_details['malware_family']:
                section.add_tag('attribution.family', malware_family)

            result.add_section(section)
            self.ontology.add_result_part(
                Signature,
                data=dict(
                    name=signature_details['al_signature'],
                    type="SURICATA",
                    malware_families=signature_details['malware_family'] or None,
                    attributes=attributes))

    # Add the original Suricata output as a supplementary file in the result
    request.add_supplementary(
        os.path.join(self.working_directory, 'eve.json'),
        'SuricataEventLog.json', 'json')

    # Add the stats.log to the result, which can be used to determine service success
    if os.path.exists(os.path.join(self.working_directory, 'stats.log')):
        request.add_supplementary(
            os.path.join(self.working_directory, 'stats.log'), 'stats.log',
            'log')

    request.result = result
def _report_embedded_xdp(self, file_res, chunk_number, binary, leftover):
    """Add a section flagging a PDF embedded in an XDP to ``file_res``.

    :param file_res: Result object receiving the section
    :param chunk_number: Value reported in the section title
    :param binary: Not used by this method
    :param leftover: Not used by this method
    :return: None
    """
    behavior = "Embedded PDF (in XDP)"
    # The title is handed over as a list of parts, exactly as before
    title_parts = [f"Found {chunk_number}", "Embedded PDF (in XDP)"]

    embedded_section = ResultSection(title_parts)
    embedded_section.set_heuristic(1)
    embedded_section.add_tag('file.behavior', behavior)
    file_res.add_section(embedded_section)
def tag_data(data, data_deobfuscated, result_ioc, result_formula):
    """Tag the IoCs found in the formulas, before and after deobfuscation.

    IoCs and formulas already seen before deobfuscation are not reported a
    second time in the deobfuscated sections.

    :param data: Lines of the raw output; only lines starting with 'CELL' are read
    :param data_deobfuscated: Lines of the deobfuscated output ('<cell>: <formula>')
    :param result_ioc: Section receiving one subsection per IoC title
    :param result_formula: Section receiving the formula subsections
    :return: None
    """
    matcher = PatternMatch()

    # Get all IoCs without deobfuscation
    ioc_dict = {}
    formulas = collections.OrderedDict()
    for raw_line in data:
        if raw_line[:4] != 'CELL':
            continue
        pieces = raw_line.split(',', 1)
        cell = pieces[0].split(':')[1].strip()
        formula = pieces[1].rsplit(',', 1)[0].strip()
        # Add formula to list of formulas if it contains IoC(s)
        if matcher.ioc_match(formula, cell, ioc_dict):
            formulas[cell] = formula

    # Get all IoCs after deobfuscation
    ioc_deobfuscated_dict = {}
    formulas_deobfuscated = collections.OrderedDict()
    for raw_line in data_deobfuscated:
        pieces = raw_line.split(':', 1)
        cell = pieces[0].strip()
        formula = pieces[1].strip()
        # Add formula to list of deobfuscated formulas if it contains IoC(s)
        if matcher.ioc_match(formula, cell, ioc_deobfuscated_dict):
            formulas_deobfuscated[cell] = formula

    # Remove duplicate IoCs (found both before AND after deobfuscation)
    for ioc_tag in list(ioc_deobfuscated_dict):
        remaining = ioc_deobfuscated_dict[ioc_tag]
        for ioc_details in list(remaining):
            if ioc_tag in ioc_dict and ioc_details in ioc_dict[ioc_tag]:
                remaining.remove(ioc_details)
        # Remove ioc_tag if no IoCs are associated with it
        if len(remaining) == 0:
            del ioc_deobfuscated_dict[ioc_tag]

    # Remove duplicate formulas from the same cell (found both before AND after deobfuscation)
    for cell, formula in list(formulas_deobfuscated.items()):
        if cell in formulas and formula in formulas[cell]:
            del formulas_deobfuscated[cell]

    # Create the appropriate result subsections for formulas
    formulas_subsection = ResultSection('Formulas')
    formulas_deobfuscated_subsection = ResultSection('Deobfuscated Formulas')
    formulas_deobfuscated_subsection.set_heuristic(5)
    if formulas:
        result_formula.add_subsection(formulas_subsection)
    if formulas_deobfuscated:
        result_formula.add_subsection(formulas_deobfuscated_subsection)

    def _report_iocs(iocs, heuristics, title_prefix, formula_section):
        # Create/extend one subsection per IoC title and tag the formula section
        for ioc_tag, values in iocs.items():
            for ioc_details in values:
                ioc = ioc_details[0]
                title = ioc_details[1] if title_prefix is None else title_prefix + ioc_details[1]
                heuristic = heuristics[ioc_details[2]]
                ioc_subsection = get_result_subsection(result_ioc, title, heuristic)
                ioc_subsection.add_tag(ioc_tag, ioc)
                # The IoC is added to the body only if it is not already one of its lines
                whole_line = re.compile('(\\n|^)' + re.escape(ioc) + '(\\n|$)')
                current_body = ioc_subsection.body
                if current_body is None or not whole_line.search(current_body):
                    ioc_subsection.add_line(ioc)
                formula_section.add_tag(ioc_tag, ioc)

    def _report_formulas(found, formula_section):
        # Only add complete formulas: a FORMULA( call whose referenced cell is
        # itself reported is left out
        for cell, formula in found.items():
            if "FORMULA(" in formula:
                cell_referenced = formula.rsplit(',', 1)[1][:-1]
                if cell_referenced in found.keys():
                    continue
            formula_section.add_line(cell + ": " + formula)

    # Generate result subsections for IoCs found without deobfuscation
    _report_iocs(ioc_dict, [1, 2], None, formulas_subsection)

    # Generate result subsections for deobfuscated IoCs
    _report_iocs(ioc_deobfuscated_dict, [3, 4], 'Deobfuscated ', formulas_deobfuscated_subsection)

    # Populate 'Formulas' result subsection with all suspicious formulas found without deobfuscation
    _report_formulas(formulas, formulas_subsection)

    # Populate 'Deobfuscated Formulas' result subsection with all deobfuscated suspicious formulas
    _report_formulas(formulas_deobfuscated, formulas_deobfuscated_subsection)
def execute(self, request):
    """Main Module. See README for details.

    Files smaller than the configured ``MAX_PDF_SIZE`` (or any file during a
    deep scan) are analyzed, together with every ObjStm they contain; other
    files get a single section stating that the analysis was skipped.

    :param request: The service request being processed
    :return: None
    """
    max_size = self.config.get('MAX_PDF_SIZE', 3000000)
    request.result = result = Result()

    if os.path.getsize(request.file_path) < max_size or request.deep_scan:
        path = request.file_path
        working_dir = self.working_directory

        # CALL PDFID and identify all suspicious keyword streams
        additional_keywords = self.config.get('ADDITIONAL_KEYS', [])
        heur = deepcopy(self.config.get('HEURISTICS', []))
        all_errors = set()

        res_txt = "Main Document Results"
        res, contains_objstms, errors = self.analyze_pdf(
            request, res_txt, path, working_dir, heur, additional_keywords)
        result.add_section(res)
        all_errors.update(errors)

        # ObjStms: Treat all ObjStms like a standalone PDF document
        if contains_objstms:
            objstm_files = self.analyze_objstm(path, working_dir,
                                               request.deep_scan)
            # enumerate replaces the hand-maintained object counter
            for obj_cnt, osf in enumerate(objstm_files, start=1):
                parent_obj = os.path.basename(osf).split("_")[1]
                res_txt = f"ObjStream Object {obj_cnt} from Parent Object {parent_obj}"
                # It is going to look suspicious as the service created the PDF
                heur = [
                    x for x in heur
                    if 'plugin_suspicious_properties' not in x
                    and 'plugin_embeddedfile' not in x
                    and 'plugin_nameobfuscation' not in x
                ]
                # NOTE(review): the errors returned for ObjStm documents are not
                # merged into all_errors - confirm this is intended.
                res, contains_objstms, errors = self.analyze_pdf(
                    request,
                    res_txt,
                    osf,
                    working_dir,
                    heur,
                    additional_keywords,
                    get_malform=False)
                result.add_section(res)

        if len(all_errors) > 0:
            erres = ResultSection(title_text="Errors Analyzing PDF")
            for e in all_errors:
                erres.add_line(e)
            result.add_section(erres)
    else:
        # Report the configured limit instead of a hard-coded "3 MB"; the text
        # is unchanged for the default limit of 3000000 bytes.
        section = ResultSection(
            f"PDF Analysis of the file was skipped because the file is too big "
            f"(limit is {max_size / 1000000:g} MB).")
        section.set_heuristic(10)
        result.add_section(section)
def cache_fields(self, field, parent_res):
    """Iterate over ``field`` repeatedly and report the parsing errors raised.

    The field is walked up to 15 times. Each exception raised while walking
    it is turned into a ``ResultSection`` attached to ``parent_res`` with a
    heuristic matching the kind of failure.

    :param field: Field to iterate; only walked when it is a non-empty field set
    :param parent_res: Section the error sections are attached to
    :return: None
    """
    num_of_attempt = 15
    keep_trying = True
    previous_parser_error = None
    failed_again = False
    while keep_trying:
        # noinspection PyBroadException
        try:
            # Walking the field set is done for its side effects only
            # NOTE(review): presumably this makes the parser cache the sub-fields - confirm
            if field.is_field_set and field._getCurrentLength() > 0:
                for _ in field:
                    pass
        except MissingField as e:
            res = ResultSection(
                f"Hachoir lib COULD NOT get field '{e.key}' from "
                f"'{e.field.path}'. This file is either corrupted, "
                f"patched or exploiting a vulnerability.",
                parent=parent_res)
            res.set_heuristic(42)
        except ParserError as e:
            # NOTE(review): with "and", the second comparison is always True when the
            # first one holds, so only the first ParserError is ever reported - confirm
            # whether "or" (report every new error) was intended.
            if previous_parser_error is None and previous_parser_error != str(
                    e):
                previous_parser_error = str(e)
                # The heuristic is selected from the text of the parser error
                if str(e).startswith(
                        "OLE2: Unable to parse property of type "):
                    res = ResultSection(
                        f"Hachoir lib DID NOT successfully "
                        f"parse one of the property [{str(e)}]. This "
                        f"file is either corrupted, patched or exploiting a vulnerability.",
                        parent=parent_res)
                    res.set_heuristic(43)
                elif str(e).startswith('Unable to add ') and str(
                        e).endswith(" is too large"):
                    res = ResultSection(
                        f"Hachoir lib determined that a field "
                        f"is overflowing the file [{str(e)}]. This "
                        f"file is either corrupted, patched or exploiting a vulnerability.",
                        parent=parent_res)
                    res.set_heuristic(44)
                elif str(e).endswith(" is too large!"):
                    res = ResultSection(
                        f"Hachoir lib COULD NOT access a field "
                        f"[{str(e)}]. This file is either corrupted,"
                        f" patched or exploiting a vulnerability.",
                        parent=parent_res)
                    res.set_heuristic(45)
                elif str(e).startswith("Seek above field set end"):
                    res = ResultSection(
                        f"Hachoir lib determined that a field is "
                        f"overflowing the file [{str(e)}]. This "
                        f"file is either corrupted, patched or exploiting a vulnerability.",
                        parent=parent_res)
                    res.set_heuristic(44)
                elif "FAT chain: Found a loop" in str(e):
                    # Messages starting with 'B' are attributed to the BFAT, others to the SFAT
                    if str(e).startswith('B'):
                        fat = 'BFAT'
                    else:
                        fat = 'SFAT'
                    res = ResultSection(
                        f"Hachoir lib found a loop when navigating "
                        f"through the {fat} [{str(e)}]. This file "
                        f"is either corrupted, patched or exploiting a vulnerability.",
                        parent=parent_res)
                    res.set_heuristic(46)
                elif "FAT chain: Invalid block index" in str(e):
                    if str(e).startswith('B'):
                        fat = 'BFAT'
                    else:
                        fat = 'SFAT'
                    res = ResultSection(
                        f"Hachoir lib found an invalid block index "
                        f"in the {fat} [{str(e)}]. This file is "
                        f"either corrupted, patched or exploiting a vulnerability.",
                        parent=parent_res)
                    res.set_heuristic(47)
                elif str(e).startswith("OLE2: Invalid endian value"):
                    res = ResultSection(
                        f"The stream endian field is not valid "
                        f"[{str(e)}]. This file is either "
                        f"corrupted, patched or exploiting a vulnerability.",
                        parent=parent_res)
                    res.set_heuristic(48)
                else:
                    # Unrecognized parser error: report it and log the backtrace
                    res = ResultSection(
                        f"Hachoir lib DID NOT successfully parse the entire file ... "
                        f"odd [{str(e)}].",
                        parent=parent_res)
                    res.set_heuristic(49)
                    backtrace = getBacktrace(None)
                    self.log.info(
                        f"{self.task.sid}/{self.task.sha256}\n{backtrace}")
        except Exception:
            if num_of_attempt == 15:
                # Failure on the very first attempt
                res = ResultSection(
                    "Hachoir lib DID NOT successfully parse the entire file ... odd.",
                    parent=parent_res)
                res.set_heuristic(49)
                backtrace = getBacktrace(None)
                self.log.info(
                    f"{self.task.sid}/{self.task.sha256}\n{backtrace}")
            elif failed_again is False:
                # Failures on later attempts are reported once, without heuristic
                failed_again = True
                ResultSection(
                    "Hachoir failed to parse the entire file after retrying.",
                    parent=parent_res)
                backtrace = getBacktrace(None)
                self.log.info(
                    f"{self.task.sid}/{self.task.sha256}\n{backtrace}")
        # The attempt counter decreases on every pass, successful or not
        num_of_attempt -= 1
        keep_trying = num_of_attempt > 0
def dump_property(self, field, path, index, res, parent_res, is_orphan):
    """Describe one OLE2 property in a key/value subsection of ``res``.

    Properties with an empty name are ignored. The subsection is only added
    to ``res`` when more than the ``property_meta`` entry was collected.

    :param field: Property field being described
    :param path: Path of the parent storage, used to build the property name
    :param index: ID of the property ('0' is treated specially)
    :param res: Section receiving the property subsection
    :param parent_res: Section handed to ``parse_field`` for its own reporting
    :param is_orphan: True when the property was not reached from the root;
        unmappable orphans are not added to ``self.invalid_streams``
    :return: None
    """
    if field['name'].value == '':
        return

    name = field['name'].display[1:-1]
    p_type = field['type'].value

    if path[-1:] == '\\':
        abs_name = f"{path}{name}"
    else:
        abs_name = f"{path}\\{name}"

    prop_res = ResultSection(f"Property: {abs_name}",
                             body_format=BODY_FORMAT.KEY_VALUE,
                             body={})

    # if type is not: 1- storage, 2- stream and not 5- root, that is weird.
    if p_type not in (1, 2, 5):
        self.invalid_properties_count += 1

    # for properties not storage (which should be seen like a folder)
    size = field['size'].value if p_type != 1 else 0

    address = 0
    if size > 0:
        if field['size'].value < self.ole2parser['header/threshold'].value and index != '0':
            # we first get the offset from the short block but then we need
            # to map it back to the file, which is from root[X].
            offset = field['start'].value * self.ole2parser.ss_size
            keep_looping = True
            root_index = 0
            while keep_looping:
                try:
                    current_root = self.ole2parser[f"root[{root_index}]"]
                    if offset == 0 or current_root.size > offset:
                        address = current_root.address + offset
                        keep_looping = False
                    else:
                        offset -= current_root.size
                        root_index += 1
                except MissingField:
                    # The offset cannot be mapped to any root[X]
                    keep_looping = False
                    address = None
                    if not is_orphan:
                        self.invalid_streams.append(field['name'].display)
        else:
            address = HEADER_SIZE + field['start'].value * self.ole2parser.sector_size

    details = (f"size: {hex(size)} / {field['type'].display} / "
               f"{field['decorator'].display} / id={index} left={field['left'].display} "
               f"right={field['right'].display} child={field['child'].display}")
    # address is None when the mapping failed; comparing None with 0 raises a
    # TypeError on Python 3, so it has to be ruled out first.
    if address is not None and address >= 0:
        prop_res.body['property_meta'] = f"offset: {hex(address // 8)} {details}"
    else:
        prop_res.body['property_meta'] = f"offset: could not map.. {details}"

    # for root or storage
    if p_type == 5 or p_type == 1:
        if field['clsid'].display != "Null GUID: 00000000-0000-0000-0000-000000000000":
            clsid_desc = self.GUID_DESC.get(field['clsid'].display, "unknown clsid")
            prop_res.body["clsid"] = f"{field['clsid'].display} ({clsid_desc})"
            prop_res.add_tag('file.ole.clsid', field['clsid'].display)
        if field['creation'].display != "1601-01-01 00:00:00":
            prop_res.body["creation_date"] = field['creation'].display
            prop_res.add_tag('file.date.creation', field['creation'].display)
        if field['lastmod'].display != "1601-01-01 00:00:00":
            prop_res.body["last_modified_date"] = field['lastmod'].display
            prop_res.add_tag('file.date.last_modified', field['lastmod'].display)

    # fixes up a bug:
    if name == '\\1CompObj':
        if p_type != 2:
            res_error = ResultSection(
                f"\\1CompObj type is '{p_type}' and it should be 2 (stream) "
                f"... really suspicious.")
            res_error.set_heuristic(41)
            prop_res.add_subsection(res_error)
            size = field['size'].value

        # Apparently, we can get to this point and have office_root_entry_parser set to None.
        # Not sure what we should do about that but trying to use that member variable seems
        # like a bad idea...
        if self.office_root_entry_parser is not None:
            temp_field = None
            for f in self.office_root_entry_parser.createFields():
                if f.name.startswith('compobj'):
                    temp_field = f

            # Without a compobj field there is nothing to cache or parse
            # (iterating None would raise a TypeError)
            if temp_field is not None:
                # cache all the sub-fields....
                for _ in temp_field:
                    pass

                self.parse_field(temp_field, prop_res,
                                 self.PARSING_MODE_DISPLAY, parent_res)

    if size > 0 and index != '0':
        field_with_other_parser = self.additional_parsing_fields.get(address, None)
        if field_with_other_parser:
            # noinspection PyTypeChecker
            self.parse_field(field_with_other_parser, prop_res,
                             self.PARSING_MODE_DISPLAY, parent_res)

    # Only properties with more than the property_meta entry are reported
    if len(prop_res.body) > 1:
        prop_res.body = json.dumps(prop_res.body)
        res.add_subsection(prop_res)
def execute(self, request):
    """Run ViperMonkey against the submitted file and report what it recorded.

    The file is handed to the Python 2 compatibility script in a subprocess.
    The first stdout line that looks like a JSON object is parsed, and its
    ``vmonkey_values`` entry drives the result sections: recorded actions,
    runtime temporary IOCs, PowerShell detection, IP/URL tags and the list of
    called VBA functions.

    Args:
        request: the service request; ``file_path`` and ``sid`` are read and
            the result/supplementary file are attached to it.
    """
    # Local import: only needed here to quote the command-line arguments.
    import shlex

    self.result = Result()
    request.result = self.result
    self.request = request

    self.ip_list = []
    self.url_list = []
    self.found_powershell = False
    self.file_hashes = []

    vmonkey_err = False
    actions = []
    external_functions = []
    tmp_iocs = []
    output_results = {}

    # Running ViperMonkey.
    # The command still goes through the shell (the interpreter setting is
    # used verbatim), but both paths are quoted so that spaces or shell
    # metacharacters in them cannot alter the command.
    cmd = " ".join([
        PYTHON2_INTERPRETER,
        shlex.quote(os.path.join(os.path.dirname(__file__), 'vipermonkey_compat.py2')),
        shlex.quote(request.file_path),
    ])
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    stdout, _ = p.communicate()

    # Read output
    if stdout:
        for line in stdout.splitlines():
            if line.startswith(b"{") and line.endswith(b"}"):
                try:
                    output_results = json.loads(line)
                except UnicodeDecodeError:
                    output_results = json.loads(line.decode("utf-8", "replace"))
                break

        # If no macros were found the values are empty lists; if ViperMonkey
        # failed there is no dict at all, which is reported as an error.
        vmonkey_values = output_results.get('vmonkey_values')
        if isinstance(vmonkey_values, dict):
            # Each entry of "actions" is [action, parameters, description]:
            #   action:      'Found Entry Point', 'Execute Command', etc...
            #   parameters:  parameters of the call (a single value or a list)
            #   description: 'Shell function', etc... (may be empty)
            # external_functions is a list of built-in VBA functions that were called
            actions = vmonkey_values['actions']
            external_functions = vmonkey_values['external_funcs']
            tmp_iocs = vmonkey_values['tmp_iocs']
        else:
            vmonkey_err = True
    else:
        vmonkey_err = True

    # Add vmonkey log as a supplemental file
    if 'stdout' in output_results:
        temp_log_copy = os.path.join(tempfile.gettempdir(), f'{request.sid}_vipermonkey_output.log')
        with open(temp_log_copy, "w") as temp_log_file:
            temp_log_file.write(output_results['stdout'])

        self.request.add_supplementary(temp_log_copy, 'vipermonkey_output.log', 'ViperMonkey log output')
        if vmonkey_err is True:
            ResultSection('ViperMonkey has encountered an error, please check "vipermonkey_output.log"',
                          parent=self.result, heuristic=Heuristic(1))

    if len(actions) > 0:
        # Creating action section
        action_section = ResultSection('Recorded Actions:', parent=self.result)
        action_section.add_tag('technique.macro', 'Contains VBA Macro(s)')

        for action in actions:
            # Creating action sub-sections for each action
            cur_action = action[0]
            parameters = action[1]
            cur_description = action[2] or cur_action

            # Entry point actions have an empty description field, re-organize result section for this case
            if cur_action == 'Found Entry Point':
                sub_action_section = ResultSection('Found Entry Point', parent=action_section)
                sub_action_section.add_line(parameters)
                continue

            # Action's description will be the sub-section name
            sub_action_section = ResultSection(cur_description, parent=action_section)
            if cur_description == 'Shell function':
                sub_action_section.set_heuristic(2)

            # Parameters are sometimes stored as a list, account for this
            if isinstance(parameters, list):
                for item in parameters:
                    # Parameters includes more than strings (booleans for example)
                    if isinstance(item, str):
                        # Check for PowerShell
                        self.extract_powershell(item, sub_action_section)
                # Join list items into single string
                param = ', '.join(str(a) for a in parameters)
            else:
                param = parameters
                # Parameters includes more than strings (booleans for example)
                if isinstance(param, str):
                    self.extract_powershell(param, sub_action_section)

            sub_action_section.add_line(f'Action: {cur_action}')
            sub_action_section.add_line(f'Parameters: {param}')

            # Look for a possible base64 string in the parameters
            self.check_for_b64(param, sub_action_section)

            # Add urls/ips found in parameter to respective lists
            self.find_ip(param)

    # Check tmp_iocs
    res_temp_iocs = ResultSection('Runtime temporary IOCs')
    for ioc in tmp_iocs:
        self.extract_powershell(ioc, res_temp_iocs)
        self.check_for_b64(ioc, res_temp_iocs)
        self.find_ip(ioc)

    # Only attach the section when one of the checks above put something in it
    if len(res_temp_iocs.subsections) != 0 or res_temp_iocs.body:
        self.result.add_section(res_temp_iocs)

    # Add PowerShell score/tag if found
    if self.found_powershell:
        ResultSection('Discovered PowerShell code in file', parent=self.result, heuristic=Heuristic(3))

    # Add url/ip tags
    self.add_ip_tags()

    # Create section for built-in VBA functions called
    if len(external_functions) > 0:
        vba_builtin_dict = {}
        dict_path = os.path.join(os.path.dirname(__file__), 'VBA_built_ins.txt')
        with open(dict_path, 'r') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                fields = line.split(';')
                if len(fields) < 2:
                    # Malformed entry (no "name;description" pair): skip it
                    # instead of aborting the whole analysis on an IndexError.
                    continue
                vba_builtin_dict[fields[0].strip()] = fields[1].strip()

        external_func_section = ResultSection('VBA functions called', body_format=BODY_FORMAT.MEMORY_DUMP,
                                              parent=self.result)
        for func in external_functions:
            if func in vba_builtin_dict:
                external_func_section.add_line(func + ': ' + vba_builtin_dict[func])
            else:
                external_func_section.add_line(func)
def section_builder(self, parser, field_dict, result, parsertype="MWCP"):
    """Build the result section for one parser's output and add it to ``result``.

    Args:
        parser: name of the parser that produced ``field_dict``.
        field_dict: mapping of extracted field names to their values.
        result: object the finished section is added to via ``add_section``.
        parsertype: "MWCP" (attribution is looked up in ``self.file_parsers``)
            or "RATDecoder" (the parser name itself is used as the malware name).
    """
    summary_attributes = ('classification', 'mitre_group', 'mitre_att', 'malware', 'malware_types', 'category')
    other_key = "other"

    json_body = {}
    malware_name, mitre_group, mitre_att = '', '', ''
    malware_types = []
    category = 'malware'

    # get malware names from parser objects
    if parsertype == "RATDecoder":
        malware_name = parser
    elif parsertype == "MWCP":
        # First parser object whose parser_list contains this parser, if any
        matched = next(
            (candidate for candidate in self.file_parsers.values() if parser in candidate.parser_list),
            None,
        )
        if matched is not None:
            malware_name = matched.malware
            malware_types = matched.malware_types
            mitre_att = matched.mitre_att
            mitre_group = matched.mitre_group
            category = matched.category
            # Only attributes that are present and truthy end up in the body
            attribute_values = ((attr, getattr(matched, attr, None)) for attr in summary_attributes)
            json_body = {attr: value for attr, value in attribute_values if value}

    parser_section = classification_checker(
        ResultSection(f"{parsertype} : {parser}"), parser, self.file_parsers)

    has_decoder_output = len(field_dict) > 0
    if has_decoder_output:
        # if any decoder output exists raise heuristic
        parser_section.set_body(json.dumps(json_body), body_format=BODY_FORMAT.KEY_VALUE)
        parser_section.set_heuristic(HEURISTICS_MAP.get(category, 1), attack_id=mitre_att)
        parser_section.add_tag("source", parsertype)
        if malware_name:
            parser_section.add_tag('attribution.implant', malware_name.upper())
        if mitre_group:
            parser_section.add_tag('attribution.actor', mitre_group.upper())
        for family in malware_types:
            parser_section.add_tag('attribution.family', family.upper())

    # Create subsections and attach them to the main parser_section
    subsection_builder(parser_section, field_dict)

    if other_key in field_dict:
        parser_section.add_subsection(
            ResultSection("Other metadata found",
                          body_format=BODY_FORMAT.KEY_VALUE,
                          body=json.dumps(field_dict[other_key])))

    # Log every field that is neither the "other" bucket nor in FIELD_TAG_MAP
    for field in field_dict:
        if field == other_key or field in FIELD_TAG_MAP:
            continue
        self.log.debug(f"{field} does not exist in FIELD_TAG_MAP")

    result.add_section(parser_section)
def execute(self, request: ServiceRequest) -> None:
    """Run ViperMonkey against the submitted file and build the service result.

    Files typed as ``code/xml`` that do not start with ``<?`` are re-written
    to a temporary file without their BOM before analysis; if no known BOM is
    present the method returns without producing sections. ViperMonkey runs
    in a Python 2 subprocess; the first JSON-object line of its stdout is
    parsed and turned into result sections (recorded actions, runtime IOCs,
    PowerShell, base64, IP/URL tags, called VBA functions). Any exception
    raised while running or parsing is logged and analysis continues with
    whatever was collected.

    Args:
        request: the service request being processed.
    """
    # Local import: only needed here to quote the command-line arguments.
    import shlex

    self.result = Result()
    request.result = self.result

    self.ip_list = []
    self.url_list = []
    self.found_powershell = False
    self.file_hashes = []

    vmonkey_err = False
    # Each action is an [action, parameters, description] triple
    actions: List[List[Any]] = []
    external_functions: List[str] = []
    tmp_iocs: List[str] = []
    output_results: Dict[str, Any] = {}
    potential_base64: Set[str] = set()

    # Running ViperMonkey
    try:
        file_contents = request.file_contents
        input_file: str = request.file_path
        input_file_obj: Optional[IO] = None
        # Typical start to XML files
        if not file_contents.startswith(b"<?") and request.file_type == "code/xml":
            # Default encoding/decoding if BOM not found
            encoding: Optional[str] = None
            decoding: Optional[str] = None
            # Remove potential BOMs from contents
            if file_contents.startswith(BOM_UTF8):
                encoding = "utf-8"
                decoding = "utf-8-sig"
            elif file_contents.startswith(BOM_UTF16):
                encoding = "utf-16"
                decoding = "utf-16"

            if encoding and decoding:
                input_file_obj = tempfile.NamedTemporaryFile("w+", encoding=encoding)
                input_file_obj.write(file_contents.decode(decoding, errors="ignore"))
                # The subprocess opens this file by name, so the buffered
                # contents must be on disk before it starts.
                input_file_obj.flush()
                input_file = input_file_obj.name
            else:
                # If the file_type was detected as XML, it's probably buried within but not actually an XML file
                # Give no response as ViperMonkey can't process this kind of file
                return

        # The command still goes through the shell (the interpreter setting
        # is used verbatim), but every path is quoted so that spaces or shell
        # metacharacters in them cannot alter the command.
        cmd = " ".join([
            PYTHON2_INTERPRETER,
            shlex.quote(os.path.join(os.path.dirname(__file__), "vipermonkey_compat.py2")),
            shlex.quote(input_file),
            shlex.quote(self.working_directory),
        ])
        try:
            p = subprocess.run(cmd, capture_output=True, shell=True)
        finally:
            # Close (and thereby remove) the temporary file even if the run failed
            if input_file_obj and os.path.exists(input_file_obj.name):
                input_file_obj.close()
        stdout = p.stdout

        # Add artifacts
        artifact_dir = os.path.join(self.working_directory, os.path.basename(input_file) + "_artifacts")
        if os.path.exists(artifact_dir):
            for file in os.listdir(artifact_dir):
                try:
                    file_path = os.path.join(artifact_dir, file)
                    # Skip directories and empty files
                    if os.path.isfile(file_path) and os.path.getsize(file_path):
                        request.add_extracted(file_path, file, "File extracted by ViperMonkey during analysis")
                except os.error as e:
                    self.log.warning(e)

        # Read output
        if stdout:
            for line in stdout.splitlines():
                if line.startswith(b"{") and line.endswith(b"}"):
                    try:
                        output_results = json.loads(line)
                    except UnicodeDecodeError:
                        output_results = json.loads(line.decode("utf-8", "replace"))
                    break

            # If no macros were found the values are empty lists; if ViperMonkey
            # failed there is no dict at all.
            # vmonkey_err can still be set alongside valid values; it is logged as a warning instead of an error
            if isinstance(output_results.get("vmonkey_values"), dict):
                # Each entry of "actions" is [action, parameters, description]:
                #   action:      'Found Entry Point', 'Execute Command', etc...
                #   parameters:  Parameters for function
                #   description: 'Shell Function', etc...
                # external_functions is a list of built-in VBA functions that were called
                actions = output_results["vmonkey_values"]["actions"]
                external_functions = output_results["vmonkey_values"]["external_funcs"]
                tmp_iocs = output_results["vmonkey_values"]["tmp_iocs"]
                if output_results["vmonkey_err"]:
                    vmonkey_err = True
                    self.log.warning(output_results["vmonkey_err"])
            else:
                vmonkey_err = True
        else:
            vmonkey_err = True

    except Exception:
        self.log.exception(f"Vipermonkey failed to analyze file {request.sha256}")

    if actions:
        # Creating action section
        action_section = ResultSection("Recorded Actions:", parent=self.result)
        action_section.add_tag("technique.macro", "Contains VBA Macro(s)")
        # One sub-section per distinct description, reused across actions
        sub_action_sections: Dict[str, ResultSection] = {}
        for action, parameters, description in actions:
            # Creating action sub-sections for each action
            if not description:
                # For actions with no description, just use the type of action
                description = action

            if description not in sub_action_sections:
                # Action's description will be the sub-section name
                sub_action_section = ResultSection(description, parent=action_section)
                sub_action_sections[description] = sub_action_section
                if description == "Shell function":
                    sub_action_section.set_heuristic(2)
            else:
                # Reuse existing section
                sub_action_section = sub_action_sections[description]
                if sub_action_section.heuristic:
                    sub_action_section.heuristic.increment_frequency()

            # Parameters are sometimes stored as a list, account for this
            if isinstance(parameters, list):
                for item in parameters:
                    # Parameters includes more than strings (booleans for example)
                    if isinstance(item, str):
                        # Check for PowerShell
                        self.extract_powershell(item, sub_action_section, request)
                # Join list items into single string
                param = ", ".join(str(p) for p in parameters)
            else:
                param = parameters
                # Parameters includes more than strings (booleans for example)
                if isinstance(param, str):
                    self.extract_powershell(param, sub_action_section, request)

            # If the description field was empty, re-organize result section for this case
            if description == action:
                sub_action_section.add_line(param)
            else:
                sub_action_section.add_line(f"Action: {action}, Parameters: {param}")

            # Check later for base64
            potential_base64.add(param)

            # Add urls/ips found in parameter to respective lists
            self.find_ip(param)

    # Check tmp_iocs
    res_temp_iocs = ResultSection("Runtime temporary IOCs")
    for ioc in tmp_iocs:
        self.extract_powershell(ioc, res_temp_iocs, request)
        potential_base64.add(ioc)
        self.find_ip(ioc)

    # Only attach the section when something was put in it
    if len(res_temp_iocs.subsections) != 0 or res_temp_iocs.body:
        self.result.add_section(res_temp_iocs)

    # Add PowerShell score/tag if found
    if self.found_powershell:
        ResultSection("Discovered PowerShell code in file", parent=self.result, heuristic=Heuristic(3))

    # Check parameters and temp_iocs for base64
    base64_section = ResultSection("Possible Base64 found", heuristic=Heuristic(5, frequency=0))
    for param in potential_base64:
        self.check_for_b64(param, base64_section, request, request.file_contents)
    if base64_section.body:
        self.result.add_section(base64_section)

    # Add url/ip tags
    self.add_ip_tags()

    # Create section for built-in VBA functions called
    if len(external_functions) > 0:
        external_func_section = ResultSection("VBA functions called", body_format=BODY_FORMAT.MEMORY_DUMP,
                                              parent=self.result)
        for func in external_functions:
            if func in vba_builtins:
                external_func_section.add_line(func + ": " + vba_builtins[func])
            else:
                external_func_section.add_line(func)

    # Add vmonkey log as a supplemental file if we have results
    if "stdout" in output_results and (vmonkey_err or request.result.sections):
        temp_log_copy = os.path.join(tempfile.gettempdir(), f"{request.sid}_vipermonkey_output.log")
        with open(temp_log_copy, "w") as temp_log_file:
            temp_log_file.write(output_results["stdout"])

        request.add_supplementary(temp_log_copy, "vipermonkey_output.log", "ViperMonkey log output")
        if vmonkey_err is True:
            ResultSection(
                'ViperMonkey has encountered an error, please check "vipermonkey_output.log"',
                parent=self.result,
                heuristic=Heuristic(1),
            )
def execute(self, request):
    """Produce the example result for one request.

    Three cases are distinguished on ``request.sha256``:

    * any hash not in the list of three known hashes (the "main" file): one
      example of every section type is built, three files are extracted and
      two supplementary files are attached;
    * hash ``cc1d2f83...``: an empty ``Result`` is returned;
    * the two remaining known hashes: a ``Result`` made of randomly generated
      sections is returned.
    """
    # ==================================================================
    # Execute a request:
    #   Every time your service receives a new file to scan, the execute function is called
    #   This is where you should execute your processing code.
    #   For the purpose of this example, we will only generate results ...

    # You should run your code here...

    # ==================================================================
    # Check if we're scanning an embedded file
    #   This service always drops 3 embedded files; two of them generate random results and the
    #   other one empty results.
    #   We're making a check to see if we're scanning the embedded file.
    #   In a normal service this is not something you would do at all but since we are using this
    #   service in our unit test to test all features of our report generator, we have to do this
    if request.sha256 not in ['d729ecfb2cf40bc4af8038dac609a57f57dbe6515d35357af973677d5e66417a',
                              '5ce5ae8ef56a54af2c44415800a81ecffd49a33ae8895dfe38fc1075d3f619ec',
                              'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06']:
        # Main file results...

        # ==================================================================
        # Write the results:
        #   First, create a result object where all the result sections will be saved to
        result = Result()

        # ==================================================================
        # Standard text section: BODY_FORMAT.TEXT - DEFAULT
        #   Text sections basically just dump the text to the screen...
        #   All sections scores will be SUMed in the service result
        #   The Result classification will be the highest classification found in the sections
        text_section = ResultSection('Example of a default section')
        # You can add lines to your section one at a time
        #   Here we will generate a random line
        text_section.add_line(get_random_phrase())
        # Or you can add them from a list
        #   Here we will generate a random amount of random lines
        text_section.add_lines([get_random_phrase() for _ in range(random.randint(1, 5))])
        # If the section needs to affect the score of the file you need to set a heuristic
        #   Here we use heuristic 3
        #   In addition to adding a heuristic, we associate a signature with the heuristic,
        #   we're doing this by adding the signature name to the heuristic. (Here the name is "sig_one")
        text_section.set_heuristic(3, signature="sig_one")
        # You can attach attack ids to heuristics after they were defined
        text_section.heuristic.add_attack_id("T1066")
        # Same thing for the signatures, they can be added to heuristic after the fact and you can even say how
        #   many times the signature fired by setting its frequency. If you call add_signature_id twice with the
        #   same signature, this will effectively increase the frequency of the signature.
        text_section.heuristic.add_signature_id("sig_two", score=20, frequency=2)
        text_section.heuristic.add_signature_id("sig_two", score=20, frequency=3)
        text_section.heuristic.add_signature_id("sig_three")
        text_section.heuristic.add_signature_id("sig_three")
        text_section.heuristic.add_signature_id("sig_four", score=0)

        # The heuristic for text_section should have the following properties
        #   1. 1 attack ID: T1066
        #   2. 4 signatures: sig_one, sig_two, sig_three and sig_four
        #   3. Signature frequencies are cumulative therefore they will be as follows:
        #      - sig_one = 1
        #      - sig_two = 5
        #      - sig_three = 2
        #      - sig_four = 1
        #   4. The score used by each heuristic is driven by the following rules: signature_score_map is higher
        #      priority, then score value for the add_signature_id is in second place and finally the default
        #      heuristic score is used. Therefore the score used to calculate the total score for the text_section is
        #      as follows:
        #      - sig_one: 10    -> heuristic default score
        #      - sig_two: 20    -> score provided by the function add_signature_id
        #      - sig_three: 30  -> score provided by the heuristic map
        #      - sig_four: 40   -> score provided by the heuristic map because it's higher priority than the
        #                          function score
        #    5. Total section score is then: 1x10 + 5x20 + 2x30 + 1x40 = 210
        # Make sure you add your section to the result
        result.add_section(text_section)

        # ==================================================================
        # Color map Section: BODY_FORMAT.GRAPH_DATA
        #     Creates a color map bar using a minimum and maximum domain
        #     e.g. We are using this section to display the entropy distribution in some services
        cmap_min = 0
        cmap_max = 20
        color_map_data = {
            'type': 'colormap',
            'data': {
                'domain': [cmap_min, cmap_max],
                'values': [random.random() * cmap_max for _ in range(50)]
            }
        }
        # The classification of a section can be set to any valid classification for your system
        section_color_map = ResultSection("Example of colormap result section", body_format=BODY_FORMAT.GRAPH_DATA,
                                          body=json.dumps(color_map_data), classification=cl_engine.RESTRICTED)
        result.add_section(section_color_map)

        # ==================================================================
        # URL section: BODY_FORMAT.URL
        #   Generate a list of clickable urls using a json encoded format
        #   As you can see here, the body of the section can be set directly instead of line by line
        random_host = get_random_host()
        url_section = ResultSection('Example of a simple url section', body_format=BODY_FORMAT.URL,
                                    body=json.dumps({"name": "Random url!", "url": f"https://{random_host}/"}))

        # Since urls are very important features we can tag those features in the system so they are easy to find
        #   Tags are defined by a type and a value
        url_section.add_tag("network.static.domain", random_host)

        # You may also want to provide a list of urls!
        #   Also, no need to provide a name, the url link will be displayed
        host1 = get_random_host()
        host2 = get_random_host()
        ip1 = get_random_ip()
        ip2 = get_random_ip()
        ip3 = get_random_ip()
        urls = [
            {"url": f"https://{host1}/"},
            {"url": f"https://{host2}/"},
            {"url": f"https://{ip1}/"},
            {"url": f"https://{ip2}/"},
            {"url": f"https://{ip3}/"}]

        # A heuristic can fire more than once without being associated to a signature
        url_heuristic = Heuristic(4, frequency=len(urls))

        url_sub_section = ResultSection('Example of a url section with multiple links',
                                        body=json.dumps(urls), body_format=BODY_FORMAT.URL,
                                        heuristic=url_heuristic)
        url_sub_section.add_tag("network.static.ip", ip1)
        url_sub_section.add_tag("network.static.ip", ip2)
        url_sub_section.add_tag("network.static.ip", ip3)
        url_sub_section.add_tag("network.static.domain", host1)
        url_sub_section.add_tag("network.dynamic.domain", host2)
        # Since url_sub_section is a sub-section of url_section
        # we will add it as a sub-section of url_section not to the main result itself
        url_section.add_subsection(url_sub_section)
        result.add_section(url_section)

        # ==================================================================
        # Memory dump section: BODY_FORMAT.MEMORY_DUMP
        #     Dump whatever string content you have into a <pre/> html tag so you can do your own formatting
        data = hexdump(b"This is some random text that we will format as an hexdump and you'll see "
                       b"that the hexdump formatting will be preserved by the memory dump section!")
        memdump_section = ResultSection('Example of a memory dump section', body_format=BODY_FORMAT.MEMORY_DUMP,
                                        body=data)
        memdump_section.set_heuristic(random.randint(1, 4))
        result.add_section(memdump_section)

        # ==================================================================
        # KEY_VALUE section:
        #     This section allows the service writer to list a bunch of key/value pairs to be displayed in the UI
        #     while also providing easy to parse data for automated tools.
        #     NB: You should definitely use this over a JSON body type since this one will be displayed correctly
        #         in the UI for the user
        #     The body argument must be a json dumps of a dictionary (only str, int, and booleans are allowed)
        kv_body = {
            "a_str": "Some string",
            "a_bool": False,
            "an_int": 102,
        }
        kv_section = ResultSection('Example of a KEY_VALUE section', body_format=BODY_FORMAT.KEY_VALUE,
                                   body=json.dumps(kv_body))
        result.add_section(kv_section)

        # ==================================================================
        # JSON section:
        #     Re-use the JSON editor we use for administration (https://github.com/josdejong/jsoneditor)
        #     to display a tree view of JSON results.
        #     NB: Use this sparingly! As a service developer you should do your best to include important
        #     results as their own result sections.
        #     The body argument must be a json dump of a python dictionary
        json_body = {
            "a_str": "Some string",
            "a_list": ["a", "b", "c"],
            "a_bool": False,
            "an_int": 102,
            "a_dict": {
                "list_of_dict": [
                    {"d1_key": "val", "d1_key2": "val2"},
                    {"d2_key": "val", "d2_key2": "val2"}
                ],
                "bool": True
            }
        }
        json_section = ResultSection('Example of a JSON section', body_format=BODY_FORMAT.JSON,
                                     body=json.dumps(json_body))
        result.add_section(json_section)

        # ==================================================================
        # PROCESS_TREE section:
        #     This section allows the service writer to list a bunch of dictionary objects that have nested lists
        #     of dictionaries to be displayed in the UI. Each dictionary object represents a process, and therefore
        #     each dictionary must be of the following format:
        #     {
        #       "process_pid": int,
        #       "process_name": str,
        #       "command_line": str,
        #       "children": [] NB: This list either is empty or contains more dictionaries that have the same
        #                          structure
        #     }
        #     NB: the example below additionally gives every process a "signatures" dictionary
        #         (signature name -> int).
        nc_body = [
            {
                "process_pid": 123,
                "process_name": "evil.exe",
                "command_line": "C:\\evil.exe",
                "signatures": {},
                "children": [
                    {
                        "process_pid": 321,
                        "process_name": "takeovercomputer.exe",
                        "command_line": "C:\\Temp\\takeovercomputer.exe -f do_bad_stuff",
                        "signatures": {"one":250},
                        "children": [
                            {
                                "process_pid": 456,
                                "process_name": "evenworsethanbefore.exe",
                                "command_line": "C:\\Temp\\evenworsethanbefore.exe -f change_reg_key_cuz_im_bad",
                                "signatures": {"one":10, "two":10, "three":10},
                                "children": []
                            },
                            {
                                "process_pid": 234,
                                "process_name": "badfile.exe",
                                "command_line": "C:\\badfile.exe -k nothing_to_see_here",
                                "signatures": {"one":1000, "two":10, "three":10, "four":10, "five":10},
                                "children": []
                            }
                        ]
                    },
                    {
                        "process_pid": 345,
                        "process_name": "benignexe.exe",
                        "command_line": "C:\\benignexe.exe -f \"just kidding, i'm evil\"",
                        "signatures": {"one": 2000},
                        "children": []
                    }
                ]
            },
            {
                "process_pid": 987,
                "process_name": "runzeroday.exe",
                "command_line": "C:\\runzeroday.exe -f insert_bad_spelling",
                "signatures": {},
                "children": []
            }
        ]
        nc_section = ResultSection('Example of a PROCESS_TREE section',
                                   body_format=BODY_FORMAT.PROCESS_TREE,
                                   body=json.dumps(nc_body))
        result.add_section(nc_section)

        # ==================================================================
        # TABLE section:
        #     This section allows the service writer to have their content displayed in a table format in the UI
        #     The body argument must be a list [] of dict {} objects. A dict object can have a key value pair
        #     where the value is a flat nested dictionary, and this nested dictionary will be displayed as a nested
        #     table within a cell.
        table_body = [
            {
                "a_str": "Some string1",
                "extra_column_here": "confirmed",
                "a_bool": False,
                "an_int": 101,
            },
            {
                "a_str": "Some string2",
                "a_bool": True,
                "an_int": 102,
            },
            {
                "a_str": "Some string3",
                "a_bool": False,
                "an_int": 103,
            },
            {
                "a_str": "Some string4",
                "a_bool": None,
                "an_int": -1000000000000000000,
                "extra_column_there": "confirmed",
                "nested_table": {
                    "a_str": "Some string3",
                    "a_bool": False,
                    "nested_table_thats_too_deep": {
                        "a_str": "Some string3",
                        "a_bool": False,
                        "an_int": 103,
                    },
                },
            },
        ]
        table_section = ResultSection('Example of a TABLE section',
                                      body_format=BODY_FORMAT.TABLE,
                                      body=json.dumps(table_body))
        result.add_section(table_section)

        # ==================================================================
        # Re-Submitting files to the system
        #     Adding extracted files will have them resubmitted to the system for analysis

        # This file will generate random results on the next run
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "wb") as myfile:
            myfile.write(data.encode())
        request.add_extracted(temp_path, "file.txt", "Extracted by some magic!")

        # Embedded files can also have their own classification!
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "wb") as myfile:
            myfile.write(b"CLASSIFIED!!!__"+data.encode())
        request.add_extracted(temp_path, "classified.doc", "Classified file ... don't look",
                              classification=cl_engine.RESTRICTED)

        # This file will generate empty results on the next run
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "wb") as myfile:
            myfile.write(b"EMPTY")
        request.add_extracted(temp_path, "empty.txt", "Extracted empty resulting file")

        # ==================================================================
        # Supplementary files
        #     Adding supplementary files will save them on the datastore for future
        #      reference but won't reprocess those files.
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "w") as myfile:
            myfile.write(json.dumps(urls))
        request.add_supplementary(temp_path, "urls.json", "These are urls as a JSON file")
        # like embedded files, you can add more than one supplementary file
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "w") as myfile:
            myfile.write(json.dumps(json_body))
        request.add_supplementary(temp_path, "json_body.json", "This is the json_body as a JSON file")

        # ==================================================================
        # Wrap-up:
        #     Save your result object back into the request
        request.result = result

    # ==================================================================
    # Empty results file
    elif request.sha256 == 'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06':
        # Creating an empty result object
        request.result = Result()

    # ==================================================================
    # Randomized results file
    else:
        # For the randomized results file, we will completely randomize the results
        #   The content of those results does not matter since we've already shown you
        #   all the different result sections, tagging, heuristics and file upload functions
        embedded_result = Result()

        # range(1, 3) runs twice, so two random sections are added
        for _ in range(1, 3):
            embedded_result.add_section(self._create_random_section())

        request.result = embedded_result
def execute(self, request: ServiceRequest) -> None:
    """De-obfuscate the submitted script layer by layer and report the outcome.

    Stages:
      1. Optionally report the IOCs present in the original file.
      2. For HTML-like file types, extract the embedded scripts as first layer.
      3. Repeatedly apply the de-obfuscation techniques; every technique that
         returns a value adds a layer. When a pass adds nothing, the second
         pass techniques are enabled; when that also adds nothing (or the
         attempt limit is exceeded) a final pass is run and the loop ends.
      4. Report the techniques used, the start of the final layer, any URI
         that only appears after de-obfuscation, and extracted files.

    Args:
        request: the service request being processed.
    """
    # --- Setup ----------------------------------------------------------------------------------------------
    request.result = Result()
    patterns = PatternMatch()

    max_attempts = 100 if request.deep_scan else 10

    self.files_extracted = set()
    self.hashes = set()

    # --- Pre-Processing --------------------------------------------------------------------------------------
    # Get all IOCs prior to de-obfuscation
    pat_values = patterns.ioc_match(request.file_contents, bogon_ip=True, just_network=False)
    if pat_values and request.get_param('extract_original_iocs'):
        ioc_res = ResultSection("The following IOCs were found in the original file", parent=request.result,
                                body_format=BODY_FORMAT.MEMORY_DUMP)
        for k, val in pat_values.items():
            for v in val:
                # NOTE(review): truthiness check on the section kept from the original - confirm it is needed
                if ioc_res:
                    ioc_res.add_line(f"Found {k.upper().replace('.', ' ')}: {safe_str(v)}")
                    ioc_res.add_tag(k, v)

    # --- Prepare Techniques ----------------------------------------------------------------------------------
    techniques = [
        ('MSOffice Embedded script', self.msoffice_embedded_script_string),
        ('CHR and CHRB decode', self.chr_decode),
        ('String replace', self.string_replace),
        ('Powershell carets', self.powershell_carets),
        ('Array of strings', self.array_of_strings),
        ('Fake array vars', self.vars_of_fake_arrays),
        ('Reverse strings', self.str_reverse),
        ('B64 Decode', self.b64decode_str),
        ('Simple XOR function', self.simple_xor_function),
    ]
    second_pass = [
        ('Concat strings', self.concat_strings),
        ('MSWord macro vars', self.mswordmacro_vars),
        ('Powershell vars', self.powershell_vars),
        ('Charcode hex', self.charcode_hex),
    ]
    final_pass = [
        ('Charcode', self.charcode),
    ]

    code_extracts = [
        ('.*html.*', "HTML scripts extraction", self.extract_htmlscript),
    ]

    layers_list: List[Tuple[str, bytes]] = []
    layer = request.file_contents

    # --- Stage 1: Script Extraction --------------------------------------------------------------------------
    for pattern, name, func in code_extracts:
        if regex.match(regex.compile(pattern), request.task.file_type):
            extracted_parts = func(request.file_contents)
            layer = b"\n".join(extracted_parts).strip()
            layers_list.append((name, layer))
            break

    # --- Stage 2: Deobfuscation ------------------------------------------------------------------------------
    idx = 0
    first_pass_len = len(techniques)
    layers_count = len(layers_list)
    while True:
        if idx > max_attempts:
            # Attempt limit exceeded: run every technique once more, sequentially, then stop
            final_pass.extend(techniques)
            for name, technique in final_pass:
                res = technique(layer)
                if res:
                    layers_list.append((name, res))
            break

        # All techniques of this pass run concurrently on the same input layer
        with ThreadPoolExecutor() as executor:
            threads = [executor.submit(technique, layer) for _, technique in techniques]
            results = [thread.result() for thread in threads]
        for (name, _), result in zip(techniques, results):
            if result:
                layers_list.append((name, result))
                # Looks like it worked, restart with new layer
                layer = result

        # If the layers haven't changed in a passing, break
        if layers_count == len(layers_list):
            if len(techniques) != first_pass_len:
                # Second pass techniques were already enabled: finish with the final pass
                final_pass.extend(techniques)
                with ThreadPoolExecutor() as executor:
                    threads = [executor.submit(technique, layer) for _, technique in final_pass]
                    results = [thread.result() for thread in threads]
                # Results line up with final_pass, so the layer name must come from
                # final_pass as well (indexing `techniques` here mislabelled every
                # layer and could run past the end of that shorter list).
                for (name, _), result in zip(final_pass, results):
                    if result:
                        layers_list.append((name, result))
                break
            # Each second pass technique is inserted at the front of the list
            for x in second_pass:
                techniques.insert(0, x)
        layers_count = len(layers_list)
        idx += 1

    # --- Compiling results ----------------------------------------------------------------------------------
    if layers_list:
        extract_file = False
        num_layers = len(layers_list)

        # Compute heuristic
        if num_layers < 5:
            heur_id = 1
        elif num_layers < 10:
            heur_id = 2
        elif num_layers < 50:
            heur_id = 3
        elif num_layers < 100:
            heur_id = 4
        else:  # num_layers >= 100
            heur_id = 5

        # Cleanup final layer
        clean = self.clean_up_final_layer(layers_list[-1][1])
        if clean != request.file_contents:
            # Check for new IOCs
            pat_values = patterns.ioc_match(clean, bogon_ip=True, just_network=False)
            diff_tags: Dict[str, List[bytes]] = {}

            for uri in pat_values.get('network.static.uri', []):
                # Compare URIs without query string
                uri = uri.split(b'?', 1)[0]
                if uri not in request.file_contents:
                    diff_tags.setdefault('network.static.uri', []).append(uri)

            if request.deep_scan or (len(clean) > 1000 and heur_id >= 4) or diff_tags:
                extract_file = True

            # Display obfuscation steps
            mres = ResultSection("De-obfuscation steps taken by DeobsfuScripter", parent=request.result)
            if heur_id:
                mres.set_heuristic(heur_id)

            technique_counts = Counter(layer_name for layer_name, _ in layers_list)
            for layer_name, count in technique_counts.items():
                mres.add_line(f"{layer_name}, {count} time(s).")

            # Display final layer
            byte_count = 5000
            if extract_file:
                # Save extracted file; only a short preview is displayed in that case
                byte_count = 500
                file_name = f"{os.path.basename(request.file_name)}_decoded_final"
                file_path = os.path.join(self.working_directory, file_name)
                # Ensure directory exists before write
                os.makedirs(os.path.dirname(file_path), exist_ok=True)
                with open(file_path, 'wb+') as f:
                    f.write(clean)
                    self.log.debug(f"Submitted dropped file for analysis: {file_path}")
                request.add_extracted(file_path, file_name, "Final deobfuscation layer")

            ResultSection(f"First {byte_count} bytes of the final layer:", body=safe_str(clean[:byte_count]),
                          body_format=BODY_FORMAT.MEMORY_DUMP, parent=request.result)

            # Display new IOCs from final layer
            if diff_tags:
                ioc_new = ResultSection("New IOCs found after de-obfustcation", parent=request.result,
                                        body_format=BODY_FORMAT.MEMORY_DUMP)
                has_network_heur = False
                for ty, val in diff_tags.items():
                    for v in val:
                        if "network" in ty:
                            has_network_heur = True
                        ioc_new.add_line(f"Found {ty.upper().replace('.', ' ')}: {safe_str(v)}")
                        ioc_new.add_tag(ty, v)

                # Heuristic 7 when any tag type contains "network", heuristic 6 otherwise
                ioc_new.set_heuristic(7 if has_network_heur else 6)

            if len(self.files_extracted) > 0:
                ext_file_res = ResultSection("The following files were extracted during the deobfuscation",
                                             heuristic=Heuristic(8), parent=request.result)
                for extracted in self.files_extracted:
                    file_name = os.path.basename(extracted)
                    ext_file_res.add_line(file_name)
                    request.add_extracted(extracted, file_name, "File of interest deobfuscated from sample")
def execute(self, request):
    """Produce the demonstration result for one submitted file.

    The service drops two embedded files on its first pass and recognises them
    by SHA256 when they come back:

    * the "empty" file gets an empty ``Result``;
    * the "random" file gets two sections built by ``_create_random_section``;
    * any other file gets one example of every section type, two extracted
      files and two supplementary files.

    Nothing is returned; the outcome is stored on ``request.result``.
    """
    # Hashes of the two files this service extracts from the main sample.
    # A normal service would never special-case hashes like this; it is only
    # done here so the unit tests can exercise every report feature.
    random_results_sha256 = 'd729ecfb2cf40bc4af8038dac609a57f57dbe6515d35357af973677d5e66417a'
    empty_results_sha256 = 'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06'
    file_sha256 = request.sha256

    # ------------------------------------------------------------------
    # Embedded file #1: empty results
    if file_sha256 == empty_results_sha256:
        request.result = Result()
        return

    # ------------------------------------------------------------------
    # Embedded file #2: randomized results
    # Every section type is already demonstrated on the main file, so the
    # content here does not matter. Two random sections are always added.
    if file_sha256 == random_results_sha256:
        randomized = Result()
        for _ in range(2):
            randomized.add_section(self._create_random_section())
        request.result = randomized
        return

    def spill_to_temp(payload, mode):
        """Write payload to a fresh temp file in the working directory and return its path."""
        fd, path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, mode) as handle:
            handle.write(payload)
        return path

    # ------------------------------------------------------------------
    # Main file: every section is collected in this Result object
    report = Result()

    # ------------------------------------------------------------------
    # Default section (BODY_FORMAT.TEXT): plain lines of text
    default_section = ResultSection('Example of a default section')
    # Lines can be added one by one ...
    default_section.add_line(get_random_phrase())
    # ... or in bulk from a list
    extra_line_count = random.randint(1, 5)
    default_section.add_lines(
        [get_random_phrase() for _ in range(extra_line_count)])
    # A heuristic is what lets a section influence the score; a signature
    # name (random here) can be attached to it at the same time
    heuristic_id = random.randint(1, 4)
    signature_name = get_random_phrase(1, 4).lower().replace(" ", "_")
    default_section.set_heuristic(heuristic_id, signature=signature_name)
    # A section only shows up once it is attached to the result
    report.add_section(default_section)

    # ------------------------------------------------------------------
    # Color map section (BODY_FORMAT.GRAPH_DATA): a bar coloured over a
    # [min, max] domain
    domain_low, domain_high = 0, 20
    samples = [random.random() * domain_high for _ in range(50)]
    colormap_payload = {
        'type': 'colormap',
        'data': {
            'domain': [domain_low, domain_high],
            'values': samples
        }
    }
    colormap_section = ResultSection(
        "Example of colormap result section",
        body_format=BODY_FORMAT.GRAPH_DATA,
        body=json.dumps(colormap_payload))
    report.add_section(colormap_section)

    # ------------------------------------------------------------------
    # URL section (BODY_FORMAT.URL): JSON-encoded links; the body is passed
    # directly to the constructor instead of being built line by line
    single_host = get_random_host()
    single_link = {"name": "Random url!", "url": f"https://{single_host}/"}
    url_section = ResultSection('Example of a simple url section',
                                body_format=BODY_FORMAT.URL,
                                body=json.dumps(single_link))
    # Tags are (type, value) pairs
    url_section.add_tag("network.static.domain", single_host)

    # A list of links works too, and the "name" key is optional
    first_host = get_random_host()
    second_host = get_random_host()
    lone_ip = get_random_ip()
    links = [{"url": f"https://{target}/"}
             for target in (first_host, second_host, lone_ip)]
    multi_url_section = ResultSection(
        'Example of a url section with multiple links',
        body_format=BODY_FORMAT.URL,
        body=json.dumps(links))
    multi_url_section.set_heuristic(random.randint(1, 4))
    for tag_type, tag_value in (("network.static.ip", lone_ip),
                                ("network.static.domain", first_host),
                                ("network.dynamic.domain", second_host)):
        multi_url_section.add_tag(tag_type, tag_value)
    # This one is nested under url_section rather than added to the result
    url_section.add_subsection(multi_url_section)
    report.add_section(url_section)

    # ------------------------------------------------------------------
    # Memory dump section (BODY_FORMAT.MEMORY_DUMP): the body is a hexdump
    hexdump_text = hexdump(
        b"This is some random text that we will format as an hexdump and you'll see "
        b"that the hexdump formatting will be preserved by the memory dump section!"
    )
    memdump_section = ResultSection('Example of a memory dump section',
                                    body_format=BODY_FORMAT.MEMORY_DUMP,
                                    body=hexdump_text)
    memdump_section.set_heuristic(random.randint(1, 4))
    report.add_section(memdump_section)

    # ------------------------------------------------------------------
    # KEY_VALUE section: the body is the JSON dump of a flat dictionary
    flat_pairs = {
        "a_str": "Some string",
        "a_bool": False,
        "an_int": 102,
    }
    report.add_section(
        ResultSection('Example of a KEY_VALUE section',
                      body_format=BODY_FORMAT.KEY_VALUE,
                      body=json.dumps(flat_pairs)))

    # ------------------------------------------------------------------
    # JSON section: the body is the JSON dump of an arbitrarily nested dict
    nested_document = {
        "a_str": "Some string",
        "a_list": ["a", "b", "c"],
        "a_bool": False,
        "an_int": 102,
        "a_dict": {
            "list_of_dict": [{
                "d1_key": "val",
                "d1_key2": "val2"
            }, {
                "d2_key": "val",
                "d2_key2": "val2"
            }],
            "bool": True
        }
    }
    report.add_section(
        ResultSection('Example of a JSON section',
                      body_format=BODY_FORMAT.JSON,
                      body=json.dumps(nested_document)))

    # ------------------------------------------------------------------
    # Extracted files: registered through request.add_extracted
    # This one comes back as the "random results" file
    request.add_extracted(spill_to_temp(hexdump_text.encode(), "wb"),
                          "file.txt", "Extracted by some magic!")
    # This one comes back as the "empty results" file
    request.add_extracted(spill_to_temp(b"EMPTY", "wb"),
                          "empty.txt", "Extracted empty resulting file")

    # ------------------------------------------------------------------
    # Supplementary files: registered through request.add_supplementary;
    # several can be attached to the same request
    request.add_supplementary(spill_to_temp(json.dumps(links), "w"),
                              "urls.json", "These are urls as a JSON file")
    request.add_supplementary(spill_to_temp(json.dumps(nested_document), "w"),
                              "json_body.json",
                              "This is the json_body as a JSON file")

    # ------------------------------------------------------------------
    # Wrap-up: hand the populated result back through the request
    request.result = report
def peepdf_analysis(self, temp_filename, file_content, request):
    """Parse a PDF with PeePDF and publish the findings on ``request``.

    Builds a "PDF File Information" section with one sub-section per PDF
    version (suspicious events/actions/vulns/elements, found URLs), then a
    section per object that carries JavaScript or an unknown/embedded stream.
    Suspicious JavaScript, large buffers, unescaped buffers, big embedded
    files and a full JavaScript dump are written to ``self.working_directory``
    and registered with ``request.add_extracted``.

    Args:
        temp_filename: Path handed to ``PDFParser.parse``; its basename is
            also used in the description of every extracted file.
        file_content: Passed through to ``PDFParser.parse`` unchanged.
        request: Receives the extracted files and, always, ``request.result``.

    Returns:
        None. ``request.result`` is set in all cases, including when parsing
        fails, when the file turns out not to be a PDF (empty result), or when
        an exception propagates.
    """
    file_res = Result()
    res_list = []
    f_list = []
    js_dump = []
    # Pre-bound so the cleanup in ``finally`` works even if parsing raised.
    pdf_parser = None
    pdf_file = None
    # CVE sequence numbers are "four or more" digits; a fixed {4} would tag a
    # truncated (i.e. different) CVE for ids such as CVE-2014-12345.
    cve_pattern = re.compile(r"CVE-[0-9]{4}-[0-9]{4,}")

    def save_dump(file_name, payload):
        """Write payload into the working directory and queue it for extraction."""
        path = os.path.join(self.working_directory, file_name)
        # ``with`` closes the handle even when the write itself fails.
        with open(path, "wb") as handle:
            handle.write(payload)
        f_list.append(path)
        return path

    def known_vuln_line(section, name, hits):
        """Return the '<name> (<cve>,<cve>): <hits>' line parts and tag each CVE id on section."""
        parts = [name, ' (']
        for vuln_cve in vulnsDict[name]:
            if len(parts) != 2:
                parts.append(',')
            vuln_cve = "".join(vuln_cve) if isinstance(vuln_cve, list) else vuln_cve
            parts.append(vuln_cve)
            cve_found = cve_pattern.search(vuln_cve)
            if cve_found:
                section.add_tag('attribution.exploit', cve_found.group())
                section.add_tag('file.behavior', cve_found.group())
        parts.append('): ')
        parts.append(str(hits))
        return parts

    try:
        pdf_parser = PDFParser()
        ret, pdf_file = pdf_parser.parse(temp_filename, True, False, file_content)
        if ret != 0:
            file_res.add_section(ResultSection("ERROR: Could not parse file with PeePDF."))
            return

        stats_dict = pdf_file.getStats()
        errors_text = ", ".join(stats_dict['Errors'])
        if errors_text == ("Bad PDF header, %%EOF not found, PDF sections not found, No "
                           "indirect objects found in the body"):
            # Not a PDF
            return

        # ---- Whole-file information -------------------------------------
        json_body = dict(
            version=stats_dict['Version'],
            binary=stats_dict['Binary'],
            linearized=stats_dict['Linearized'],
            encrypted=stats_dict['Encrypted'],
        )
        if stats_dict['Encryption Algorithms']:
            json_body["encryption_algorithms"] = [
                f"{algorithm_info[0]} {str(algorithm_info[1])} bits"
                for algorithm_info in stats_dict['Encryption Algorithms']
            ]
        json_body.update(dict(
            updates=stats_dict['Updates'],
            objects=stats_dict['Objects'],
            streams=stats_dict['Streams'],
            comments=stats_dict['Comments'],
            errors=errors_text if len(stats_dict['Errors']) != 0 else "None",
        ))
        res = ResultSection("PDF File Information", body_format=BODY_FORMAT.KEY_VALUE,
                            body=json.dumps(json_body))

        for version, stats_version in enumerate(stats_dict['Versions']):
            # ---- Per-version statistics ----------------------------------
            v_json_body = dict(
                catalog=stats_version['Catalog'] or "no",
                info=stats_version['Info'] or "no",
                objects=self.list_first_x(stats_version['Objects'][1]),
            )
            if stats_version['Compressed Objects'] is not None:
                v_json_body['compressed_objects'] = self.list_first_x(stats_version['Compressed Objects'][1])
            if stats_version['Errors'] is not None:
                v_json_body['errors'] = self.list_first_x(stats_version['Errors'][1])
            v_json_body['streams'] = self.list_first_x(stats_version['Streams'][1])
            if stats_version['Xref Streams'] is not None:
                v_json_body['xref_streams'] = self.list_first_x(stats_version['Xref Streams'][1])
            if stats_version['Object Streams'] is not None:
                v_json_body['object_streams'] = self.list_first_x(stats_version['Object Streams'][1])
            if int(stats_version['Streams'][0]) > 0:
                v_json_body['encoded'] = self.list_first_x(stats_version['Encoded'][1])
                if stats_version['Decoding Errors'] is not None:
                    v_json_body['decoding_errors'] = self.list_first_x(stats_version['Decoding Errors'][1])
            if stats_version['Objects with JS code'] is not None:
                v_json_body['objects_with_js_code'] = \
                    self.list_first_x(stats_version['Objects with JS code'][1])

            res_version = ResultSection(f"Version {str(version)}", parent=res,
                                        body_format=BODY_FORMAT.KEY_VALUE, body=json.dumps(v_json_body))

            # ---- Suspicious elements -------------------------------------
            actions = stats_version['Actions']
            events = stats_version['Events']
            vulns = stats_version['Vulns']
            elements = stats_version['Elements']
            is_suspicious = False
            if events is not None or actions is not None or vulns is not None or elements is not None:
                res_suspicious = ResultSection('Suspicious elements', parent=res_version)
                if events is not None:
                    for event in events:
                        res_suspicious.add_line(f"{event}: {self.list_first_x(events[event])}")
                        is_suspicious = True
                if actions is not None:
                    for action in actions:
                        res_suspicious.add_line(f"{action}: {self.list_first_x(actions[action])}")
                        is_suspicious = True
                if vulns is not None:
                    for vuln in vulns:
                        if vuln in vulnsDict:
                            res_suspicious.add_line(known_vuln_line(res_suspicious, vuln, vulns[vuln]))
                        else:
                            res_suspicious.add_line(f"{vuln}: {str(vulns[vuln])}")
                        is_suspicious = True
                if elements is not None:
                    for element in elements:
                        if element in vulnsDict:
                            res_suspicious.add_line(known_vuln_line(res_suspicious, element, elements[element]))
                        else:
                            res_suspicious.add_line(f"\t\t{element}: {str(elements[element])}")
                        is_suspicious = True
                # Only score the section when at least one entry was listed.
                if is_suspicious:
                    res_suspicious.set_heuristic(8)

            # ---- URLs ----------------------------------------------------
            urls = stats_version['URLs']
            if urls is not None:
                res.add_line("")
                res_url = ResultSection('Found URLs', parent=res)
                for url in urls:
                    res_url.add_line(f"\t\t{url}")
                # The heuristic is the same for every URL, so set it once.
                if urls:
                    res_url.set_heuristic(9)

            # ---- Objects -------------------------------------------------
            for obj in stats_version['Objects'][1]:
                cur_obj = pdf_file.getObject(obj, version)

                if cur_obj.containsJScode:
                    cur_res = ResultSection(f"Object [{obj} {version}] contains {len(cur_obj.JSCode)} "
                                            f"block of JavaScript")
                    score_modifier = 0

                    for js_idx, js in enumerate(cur_obj.JSCode, start=1):
                        sub_res = ResultSection('Block of JavaScript', parent=cur_res)
                        js_code, unescaped_bytes, _, _, _ = analyseJS(js)
                        js_dump.extend(js_code)
                        js_text = "".join(js_code)

                        # Malicious characteristics
                        big_buffs = self.get_big_buffs(js_text)
                        buff_count = len(big_buffs)
                        js_score = 500 * buff_count
                        if buff_count == 1:
                            # NOTE(review): a lone buffer is weighted twice (1000 in
                            # total), kept from the previous implementation. Only the
                            # sign of the score is used below - confirm the intent.
                            js_score += 500
                        has_eval, has_unescape = self.check_dangerous_func(js_text)
                        if has_unescape:
                            js_score += 100
                        if has_eval:
                            js_score += 100

                        js_cmt = ""
                        if has_eval or has_unescape or buff_count > 0:
                            score_modifier += js_score
                            js_cmt = "Suspiciously malicious "
                            cur_res.add_tag('file.behavior', "Suspicious JavaScript in PDF")
                            sub_res.set_heuristic(7)
                        js_res = ResultSection(f"{js_cmt}JavaScript Code (block: {js_idx})", parent=sub_res)

                        if js_score > 0:
                            temp_js_outname = f"object{obj}-{version}_{js_idx}.js"
                            save_dump(temp_js_outname, js_text.encode("utf-8"))
                            js_res.add_line(f"The JavaScript block was saved as {temp_js_outname}")

                            if has_eval or has_unescape:
                                analysis_res = ResultSection("[Suspicious Functions]", parent=js_res)
                                if has_eval:
                                    analysis_res.add_line("eval: This JavaScript block uses eval() function "
                                                          "which is often used to launch deobfuscated "
                                                          "JavaScript code.")
                                if has_unescape:
                                    analysis_res.add_line("unescape: This JavaScript block uses unescape() "
                                                          "function. It may be legitimate but it is definitely "
                                                          "suspicious since malware often use this to "
                                                          "deobfuscate code blocks.")
                                # Both findings share heuristic 3; set it a single time.
                                analysis_res.set_heuristic(3)

                            for buff_idx, buff in enumerate(big_buffs, start=1):
                                error, new_buff = unescape(buff)
                                if error == 0:
                                    buff = new_buff

                                if buff in unescaped_bytes:
                                    # Reported below with the unescaped buffers.
                                    continue

                                temp_path_name = None
                                if ";base64," in buff[:100] and "data:" in buff[:100]:
                                    temp_path_name = f"obj{obj}_unb64_{buff_idx}.buff"
                                    try:
                                        buff = b64decode(buff.split(";base64,")[1].strip())
                                        save_dump(temp_path_name, buff)
                                    except Exception:
                                        self.log.error("Found 'data:;base64, ' buffer "
                                                       "but failed to base64 decode.")
                                        temp_path_name = None

                                if temp_path_name is not None:
                                    buff_cond = f" and was resubmitted as {temp_path_name}"
                                else:
                                    buff_cond = ""

                                # After a successful base64 decode ``buff`` is bytes, and
                                # bytes(<bytes>, "utf-8") raises TypeError, so only encode
                                # when the preview is still text.
                                preview = buff[:256]
                                if isinstance(preview, str):
                                    preview = preview.encode("utf-8")
                                buff_res = ResultSection(
                                    f"A {len(buff)} bytes buffer was found in the JavaScript "
                                    f"block{buff_cond}. Here are the first 256 bytes.",
                                    parent=js_res, body=hexdump(preview),
                                    body_format=BODY_FORMAT.MEMORY_DUMP)
                                buff_res.set_heuristic(2)

                        processed_sc = []
                        sc_idx = 0
                        for sc in unescaped_bytes:
                            if sc in processed_sc:
                                continue
                            sc_idx += 1
                            processed_sc.append(sc)

                            # NOTE(review): "hex" is a Python 2 codec. On Python 3 this
                            # call fails for both str (no .decode) and bytes (not a text
                            # encoding), so the buffer is used as-is. Confirm whether
                            # hex-decoding is still wanted before replacing it.
                            try:
                                sc = sc.decode("hex")
                            except Exception:
                                pass

                            temp_path_name = f"obj{obj}_unescaped_{sc_idx}.buff"
                            shell_res = ResultSection(f"Unknown unescaped {len(sc)} bytes JavaScript "
                                                      f"buffer (id: {sc_idx}) was resubmitted as "
                                                      f"{temp_path_name}. Here are the first 256 bytes.",
                                                      parent=js_res)
                            shell_res.set_body(hexdump(sc[:256]), body_format=BODY_FORMAT.MEMORY_DUMP)
                            save_dump(temp_path_name, sc)

                            cur_res.add_tag('file.behavior', "Unescaped JavaScript Buffer")
                            shell_res.set_heuristic(6)
                            score_modifier += 500

                    # Objects whose JavaScript raised no flag are not reported.
                    if score_modifier > 0:
                        res_list.append(cur_res)

                elif cur_obj.type == "stream":
                    if cur_obj.isEncodedStream and cur_obj.filter is not None:
                        data = cur_obj.decodedStream
                        encoding = cur_obj.filter.value.replace("[", "").replace("]", "").replace("/", "").strip()
                    else:
                        data = cur_obj.rawStream
                        encoding = None
                    val = cur_obj.rawValue
                    otype = cur_obj.elements.get("/Type", None)
                    sub_type = cur_obj.elements.get("/Subtype", None)
                    length = cur_obj.elements.get("/Length", None)

                    if otype:
                        otype = otype.value.replace("/", "").lower()
                    if sub_type:
                        sub_type = sub_type.value.replace("/", "").lower()
                    if length:
                        length = length.value

                    type_note = f"(Type: {otype}) " if otype is not None else ""
                    sub_type_note = f"(SubType: {sub_type}) " if sub_type is not None else ""
                    encoding_note = f"(Encoded with {encoding})" if encoding is not None else ""

                    if otype == "embeddedfile":
                        # Small embedded files are ignored.
                        if len(data) > 4096:
                            temp_encoding_str = f"_{encoding}" if encoding is not None else ""
                            cur_res = ResultSection(
                                f'Embedded file found ({length} bytes) [obj: {obj} {version}] '
                                f'and dumped for analysis {type_note}{sub_type_note}{encoding_note}'
                            )
                            temp_path_name = f"EmbeddedFile_{obj}{temp_encoding_str}.obj"
                            save_dump(temp_path_name, data)
                            cur_res.add_line(f"The EmbeddedFile object was saved as {temp_path_name}")
                            res_list.append(cur_res)

                    elif otype not in BANNED_TYPES:
                        cur_res = ResultSection(
                            f'Unknown stream found [obj: {obj} {version}] '
                            f'{type_note}{sub_type_note}{encoding_note}'
                        )
                        for line in val.splitlines():
                            cur_res.add_line(line)

                        emb_res = ResultSection('First 256 bytes', parent=cur_res)
                        first_256 = data[:256]
                        if isinstance(first_256, str):
                            first_256 = first_256.encode()
                        emb_res.set_body(hexdump(first_256), BODY_FORMAT.MEMORY_DUMP)
                        res_list.append(cur_res)

        # ---- Assemble the result -----------------------------------------
        file_res.add_section(res)
        for results in res_list:
            file_res.add_section(results)

        if js_dump:
            js_dump_res = ResultSection('Full JavaScript dump')
            temp_js_dump = "javascript_dump.js"
            # str.encode() raises UnicodeEncodeError, never UnicodeDecodeError, and
            # hashlib needs bytes, so un-encodable characters are replaced instead
            # of falling back to the raw str.
            temp_js_dump_bin = "\n\n----\n\n".join(js_dump).encode("utf-8", errors="replace")
            temp_js_dump_sha1 = hashlib.sha1(temp_js_dump_bin).hexdigest()
            save_dump(temp_js_dump, temp_js_dump_bin)

            js_dump_res.add_line(f"The JavaScript dump was saved as {temp_js_dump}")
            js_dump_res.add_line(f"The SHA-1 for the JavaScript dump is {temp_js_dump_sha1}")
            js_dump_res.add_tag('file.pdf.javascript.sha1', temp_js_dump_sha1)
            file_res.add_section(js_dump_res)

        for filename in f_list:
            request.add_extracted(filename, os.path.basename(filename),
                                  f"Dumped from {os.path.basename(temp_filename)}")
    finally:
        # Always publish whatever was collected, then drop the parser objects
        # and force a collection pass.
        request.result = file_res
        del pdf_file
        del pdf_parser
        gc.collect()
def execute(self, request): # ================================================================== # Execute a request: # Every time your service receives a new file to scan, the execute function is called # This is where you should execute your processing code. # For the purpose of this example, we will only generate results ... # You should run your code here... # ================================================================== # Check if we're scanning an embedded file # This service always drop 3 embedded file which two generates random results and the other empty results # We're making a check to see if we're scanning the embedded file. # In a normal service this is not something you would do at all but since we are using this # service in our unit test to test all features of our report generator, we have to do this if request.sha256 not in [ 'd729ecfb2cf40bc4af8038dac609a57f57dbe6515d35357af973677d5e66417a', '5ce5ae8ef56a54af2c44415800a81ecffd49a33ae8895dfe38fc1075d3f619ec', 'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06' ]: # Main file results... # ================================================================== # Write the results: # First, create a result object where all the result sections will be saved to result = Result() # ================================================================== # Standard text section: BODY_FORMAT.TEXT - DEFAULT # Text sections basically just dumps the text to the screen... 
# All sections scores will be SUMed in the service result # The Result classification will be the highest classification found in the sections text_section = ResultTextSection('Example of a default section') # You can add lines to your section one at a time # Here we will generate a random line text_section.add_line(get_random_phrase()) # Or your can add them from a list # Here we will generate random amount of random lines text_section.add_lines( [get_random_phrase() for _ in range(random.randint(1, 5))]) # You can tag data to a section, tagging is used to to quickly find defining information about a file text_section.add_tag("attribution.implant", "ResultSample") # If the section needs to affect the score of the file you need to set a heuristics # Here we will pick one at random # In addition to add a heuristic, we will associated a signature with the heuristic, # we're doing this by adding the signature name to the heuristic. (Here we generating a random name) text_section.set_heuristic(3, signature="sig_one") # You can attach attack ids to heuristics after they where defined text_section.heuristic.add_attack_id( random.choice(list(software_map.keys()))) text_section.heuristic.add_attack_id( random.choice(list(attack_map.keys()))) text_section.heuristic.add_attack_id( random.choice(list(group_map.keys()))) text_section.heuristic.add_attack_id( random.choice(list(revoke_map.keys()))) # Same thing for the signatures, they can be added to heuristic after the fact and you can even say how # many time the signature fired by setting its frequency. If you call add_signature_id twice with the # same signature, this will effectively increase the frequency of the signature. 
text_section.heuristic.add_signature_id("sig_two", score=20, frequency=2) text_section.heuristic.add_signature_id("sig_two", score=20, frequency=3) text_section.heuristic.add_signature_id("sig_three") text_section.heuristic.add_signature_id("sig_three") text_section.heuristic.add_signature_id("sig_four", score=0) # The heuristic for text_section should have the following properties # 1. 1 attack ID: T1066 # 2. 4 signatures: sig_one, sig_two, sig_three and sig_four # 3. Signature frequencies are cumulative therefor they will be as follow: # - sig_one = 1 # - sig_two = 5 # - sig_three = 2 # - sig_four = 1 # 4. The score used by each heuristic is driven by the following rules: signature_score_map is higher # priority, then score value for the add_signature_id is in second place and finally the default # heuristic score is use. Therefor the score used to calculate the total score for the text_section is # as follow: # - sig_one: 10 -> heuristic default score # - sig_two: 20 -> score provided by the function add_signature_id # - sig_three: 30 -> score provided by the heuristic map # - sig_four: 40 -> score provided by the heuristic map because it's higher priority than the # function score # 5. Total section score is then: 1x10 + 5x20 + 2x30 + 1x40 = 210 # Make sure you add your section to the result result.add_section(text_section) # Even if the section was added to the results you can still modify it by adding a subsection for example ResultSection( "Example of sub-section without a body added later in processing", parent=text_section) # ================================================================== # Color map Section: BODY_FORMAT.GRAPH_DATA # Creates a color map bar using a minimum and maximum domain # e.g. 
We are using this section to display the entropy distribution in some services cmap_min = 0 cmap_max = 20 cmap_values = [random.random() * cmap_max for _ in range(50)] # The classification of a section can be set to any valid classification for your system section_color_map = ResultGraphSection( "Example of colormap result section", classification=cl_engine.RESTRICTED) section_color_map.set_colormap(cmap_min, cmap_max, cmap_values) result.add_section(section_color_map) # ================================================================== # URL section: BODY_FORMAT.URL # Generate a list of clickable urls using a json encoded format # As you can see here, the body of the section can be set directly instead of line by line random_host = get_random_host() url_section = ResultURLSection('Example of a simple url section') url_section.add_url(f"https://{random_host}/", name="Random url!") # Since urls are very important features we can tag those features in the system so they are easy to find # Tags are defined by a type and a value url_section.add_tag("network.static.domain", random_host) # You may also want to provide a list of url! 
# Also, No need to provide a name, the url link will be displayed hosts = [get_random_host() for _ in range(2)] # A heuristic can fire more then once without being associated to a signature url_heuristic = Heuristic(4, frequency=len(hosts)) url_sub_section = ResultURLSection( 'Example of a url sub-section with multiple links', heuristic=url_heuristic, classification=cl_engine.RESTRICTED) for host in hosts: url_sub_section.add_url(f"https://{host}/") url_sub_section.add_tag("network.static.domain", host) # You can keep nesting sections if you really need to ips = [get_random_ip() for _ in range(3)] url_sub_sub_section = ResultURLSection( 'Exemple of a two level deep sub-section') for ip in ips: url_sub_sub_section.add_url(f"https://{ip}/") url_sub_sub_section.add_tag("network.static.ip", ip) # Since url_sub_sub_section is a sub-section of url_sub_section # we will add it as a sub-section of url_sub_section not to the main result itself url_sub_section.add_subsection(url_sub_sub_section) # Invalid sections will be ignored, and an error will apear in the logs # Sub-sections of invalid sections will be ignored too invalid_section = ResultSection("") ResultSection( "I won't make it to the report because my parent is invalid :(", parent=invalid_section) url_sub_section.add_subsection(invalid_section) # Since url_sub_section is a sub-section of url_section # we will add it as a sub-section of url_section not to the main result itself url_section.add_subsection(url_sub_section) result.add_section(url_section) # ================================================================== # Memory dump section: BODY_FORMAT.MEMORY_DUMP # Dump whatever string content you have into a <pre/> html tag so you can do your own formatting data = hexdump( b"This is some random text that we will format as an hexdump and you'll see " b"that the hexdump formatting will be preserved by the memory dump section!" 
) memdump_section = ResultMemoryDumpSection( 'Example of a memory dump section', body=data) memdump_section.set_heuristic(random.randint(1, 4)) result.add_section(memdump_section) # ================================================================== # KEY_VALUE section: # This section allows the service writer to list a bunch of key/value pairs to be displayed in the UI # while also providing easy to parse data for auto mated tools. # NB: You should definitely use this over a JSON body type since this one will be displayed correctly # in the UI for the user # The body argument must be a dictionary (only str, int, and booleans are allowed) kv_section = ResultKeyValueSection( 'Example of a KEY_VALUE section') # You can add items individually kv_section.set_item('key', "value") # Or simply add them in bulk kv_section.update_items({ "a_str": "Some string", "a_bool": False, "an_int": 102, }) result.add_section(kv_section) # ================================================================== # ORDERED_KEY_VALUE section: # This section provides the same functionality as the KEY_VALUE section except the order of the fields # are garanteed to be preserved in the order in which the fields are added to the section. Also with # this section, you can repeat the same key name multiple times oredered_kv_section = ResultOrderedKeyValueSection( 'Example of an ORDERED_KEY_VALUE section') # You can add items individually for x in range(random.randint(3, 6)): oredered_kv_section.add_item(f'key{x}', f"value{x}") result.add_section(oredered_kv_section) # ================================================================== # JSON section: # Re-use the JSON editor we use for administration (https://github.com/josdejong/jsoneditor) # to display a tree view of JSON results. # NB: Use this sparingly! As a service developer you should do your best to include important # results as their own result sections. 
# The body argument must be a python dictionary json_body = { "a_str": "Some string", "a_list": ["a", "b", "c"], "a_bool": False, "an_int": 102, "a_dict": { "list_of_dict": [{ "d1_key": "val", "d1_key2": "val2" }, { "d2_key": "val", "d2_key2": "val2" }], "bool": True } } json_section = ResultJSONSection('Example of a JSON section') # You can set the json result to a specific value json_section.set_json(json_body) # You can also update specific parts after the fact json_section.update_json({ 'an_int': 1000, 'updated_key': 'updated_value' }) result.add_section(json_section) # ================================================================== # PROCESS_TREE section: # This section allows the service writer to list a bunch of dictionary objects that have nested lists # of dictionaries to be displayed in the UI. Each dictionary object represents a process, and therefore # each dictionary must have be of the following format: # { # "process_pid": int, # "process_name": str, # "command_line": str, # "signatures": {} This dict has the signature name as a key and the score as it's value # "children": [] NB: This list either is empty or contains more dictionaries that have the same # structure # } process_tree_section = ResultProcessTreeSection( 'Example of a PROCESS_TREE section') # You can use the ProcessItem class to create the processes to add to the result section evil_process = ProcessItem(123, "evil.exe", "c:\\evil.exe") evil_process_child_1 = ProcessItem( 321, "takeovercomputer.exe", "C:\\Temp\\takeovercomputer.exe -f do_bad_stuff") # You can add child processes to the ProcessItem objects evil_process_child_1.add_child_process( ProcessItem( 456, "evenworsethanbefore.exe", "C:\\Temp\\evenworsethanbefore.exe -f change_reg_key_cuz_im_bad", signatures={ "one": 10, "two": 10, "three": 10 })) evil_process_child_1.add_child_process( ProcessItem(234, "badfile.exe", "C:\\badfile.exe -k nothing_to_see_here", signatures={ "one": 1000, "two": 10, "three": 10, "four": 10, "five": 
10 })) # You can add signatures that hit on a ProcessItem Object evil_process_child_1.add_signature('one', 250) # Or even directly create the ProcessItem object with the signature in it evil_process_child_2 = ProcessItem( 345, "benignexe.exe", "C:\\benignexe.exe -f \"just kidding, i'm evil\"", signatures={"one": 2000}) # You can also add counts for network, file and registry events to a ProcessItem object evil_process_child_2.add_network_events(4) evil_process_child_2.add_file_events(7000) evil_process_child_2.add_registry_events(10) # You can also indicate if the process tree item has been safelisted benign_process = ProcessItem(678, "trustme.exe", "C:\\trustme.exe") benign_process.safelist() evil_process.add_child_process(evil_process_child_1) evil_process.add_child_process(evil_process_child_2) # Add your processes to the result section via the add_process function process_tree_section.add_process(evil_process) process_tree_section.add_process( ProcessItem(987, "runzeroday.exe", "C:\\runzeroday.exe -f insert_bad_spelling")) process_tree_section.add_process(benign_process) result.add_section(process_tree_section) # ================================================================== # TABLE section: # This section allows the service writer to have their content displayed in a table format in the UI # The body argument must be a list [] of dict {} objects. A dict object can have a key value pair # where the value is a flat nested dictionary, and this nested dictionary will be displayed as a nested # table within a cell. 
table_section = ResultTableSection('Example of a TABLE section') # Use the TableRow class to help adding row to the Table section table_section.add_row( TableRow(a_str="Some string1", extra_column_here="confirmed", a_bool=False, an_int=101)) table_section.add_row( TableRow( { "a_str": "Some string2", "a_bool": True, "an_int": "to_be_overriden_by_kwargs" }, an_int=102)) table_section.add_row( TableRow(a_str="Some string3", a_bool=False, an_int=103)) # Valid values for the items in the TableRow are: str, int, bool, None, or dict of those values table_section.add_row( TableRow( { "a_str": "Some string4", "a_bool": None, "an_int": -1000000000000000000 }, { "extra_column_there": "confirmed", "nested_key_value_pair": { "a_str": "Some string3", "a_bool": False, "nested_kv_thats_too_deep": { "a_str": "Some string3", "a_bool": False, "an_int": 103, }, } })) result.add_section(table_section) # ================================================================== # Re-Submitting files to the system # Adding extracted files will have them resubmitted to the system for analysis # This file will generate random results on the next run fd, temp_path = tempfile.mkstemp(dir=self.working_directory) with os.fdopen(fd, "wb") as myfile: myfile.write(data.encode()) request.add_extracted(temp_path, "file.txt", "Extracted by some magic!") # Embedded files can also have their own classification! fd, temp_path = tempfile.mkstemp(dir=self.working_directory) with os.fdopen(fd, "wb") as myfile: myfile.write(b"CLASSIFIED!!!__" + data.encode()) request.add_extracted(temp_path, "classified.doc", "Classified file ... 
don't look", classification=cl_engine.RESTRICTED) # This file will generate empty results on the next run fd, temp_path = tempfile.mkstemp(dir=self.working_directory) with os.fdopen(fd, "wb") as myfile: myfile.write(b"EMPTY") request.add_extracted(temp_path, "empty.txt", "Extracted empty resulting file") # ================================================================== # Supplementary files # Adding supplementary files will save them on the datastore for future # reference but wont reprocess those files. fd, temp_path = tempfile.mkstemp(dir=self.working_directory) with os.fdopen(fd, "w") as myfile: myfile.write(url_sub_section.body) request.add_supplementary(temp_path, "urls.json", "These are urls as a JSON file") # like embedded files, you can add more then one supplementary files fd, temp_path = tempfile.mkstemp(dir=self.working_directory) with os.fdopen(fd, "w") as myfile: myfile.write(json.dumps(json_body)) request.add_supplementary(temp_path, "json_body.json", "This is the json_body as a JSON file") # ================================================================== # Zeroize on safe tags # When this feature is turned on, the section will get its score set to zero if all its tags # were safelisted by the safelisting engine zero_section = ResultSection('Example of zeroize-able section', zeroize_on_tag_safe=True) zero_section.set_heuristic(2) zero_section.add_line( "This section will have a zero score if all tags are safelisted." 
) zero_section.add_tag('network.static.ip', '127.0.0.1') result.add_section(zero_section) # ================================================================== # Auto-collapse # When this feature is turned on, the section will be collapsed when first displayed collapse_section = ResultSection( 'Example of auto-collapse section', auto_collapse=True) collapse_section.set_heuristic(2) collapse_section.add_line( "This section was collapsed when first loaded in the UI") result.add_section(collapse_section) # ================================================================== # Image Section # This type of section allows the service writer to display images to the user image_section = ResultImageSection(request, 'Example of Image section') for x in range(6): image_section.add_image(f'data/000{x+1}.jpg', f'000{x+1}.jpg', f'ResultSample screenshot 000{x+1}', ocr_heuristic_id=6) result.add_section(image_section) # ================================================================== # Multi Section # This type of section allows the service writer to display multiple section types # in the same result section. 
Here's a concrete exemple of this: multi_section = ResultMultiSection( 'Example of Multi-typed section') multi_section.add_section_part( TextSectionBody( body="We have detected very high entropy multiple sections " "of your file, this section is most-likely packed or " "encrypted.\n\nHere are affected sections:")) section_count = random.randint(1, 4) for x in range(section_count): multi_section.add_section_part( KVSectionBody(section_name=f".UPX{x}", offset=f'0x00{8+x}000', size='4196 bytes')) graph_part = GraphSectionBody() graph_part.set_colormap( 0, 8, [7 + random.random() for _ in range(20)]) multi_section.add_section_part(graph_part) if x != section_count - 1: multi_section.add_section_part(DividerSectionBody()) multi_section.add_tag("file.pe.sections.name", f".UPX{x}") multi_section.set_heuristic(5) result.add_section(multi_section) # ================================================================== # Propagate temporary submission data to other services # Sometimes two service can work in tandem were one extra some piece of information the other # one uses to do it's work. This is how a service can set temporary data that other # services that subscribe to can use. 
request.temp_submission_data['kv_section'] = kv_section.body request.temp_submission_data[ 'process_tree_section'] = process_tree_section.body request.temp_submission_data['url_section'] = url_sub_section.body # ================================================================== # Wrap-up: # Save your result object back into the request request.result = result # ================================================================== # Empty results file elif request.sha256 == 'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06': # Creating and empty result object request.result = Result() # ================================================================== # Randomized results file else: # For the randomized results file, we will completely randomize the results # The content of those results do not matter since we've already showed you # all the different result sections, tagging, heuristics and file upload functions embedded_result = Result() # random number of sections for _ in range(1, 3): embedded_result.add_section(self._create_random_section()) request.result = embedded_result
def analyze_pdf(self, request, res_txt, path, working_dir, heur, additional_keywords, get_malform=True):
    """Extract metadata, keyword objects and content of interest from a PDF sample.

    PDFId (and its plugins) is always run. PDF Parser is run afterwards when the request is a
    deep scan, when any PDFId plugin produced results, or when a flagged keyword was found.

    Args:
        request: AL request object.
        res_txt: Header string for AL result section title.
        path: Original PDF sample path.
        working_dir: AL working directory.
        heur: List of plugins to run on PDFId results (provided in service configuration).
        additional_keywords: List of additional keywords to be searched (provided in service configuration).
        get_malform: Extract malformed objects from PDF. When False, the version/properties/entropy,
            statistics and element/extraction steps are skipped (used when analysing object streams).

    Returns:
        A 3-tuple ``(res, objstms, all_errors)``:
        ``res`` is the top-level result section, ``objstms`` is True when PDFId flagged the
        ``/ObjStm`` keyword, and ``all_errors`` is the set of error messages collected along the way.

    Raises:
        NonRecoverableError: If running PDFId raises any exception.
    """
    triage_keywords = set()
    all_errors = set()
    embed_present = False
    objstms = False
    res = ResultSection(title_text=res_txt)
    carved_extracted_shas = set()

    if request.deep_scan:
        run_pdfparse = True
    else:
        run_pdfparse = False

    # Run PDFId
    try:
        pdfid_result, errors = self.get_pdfid(path, additional_keywords, heur, request.deep_scan)
    except Exception as e:
        raise NonRecoverableError(e)

    # Parse PDFId results
    pdfidres = ResultSection(title_text="PDFID Results", parent=res)
    if len(pdfid_result) == 0:
        pdfidres.add_line("No results generated for file. Please see errors.")
    else:
        # Do not run for objstms, which are being analyzed when get_malform == False
        if get_malform:
            version = pdfid_result.get("PDFID", None)
            if version:
                pdfidres.add_line(version[0])
            properties = pdfid_result.get("Properties", None)
            if properties:
                pres = ResultSection(title_text="PDF Properties", parent=pdfidres)
                for plist in properties:
                    pres.add_line("{0}: {1}".format(plist[0], plist[1]))
                    if plist[0] == "/ModDate":
                        pres.add_tag('file.pdf.date.modified', plist[1])
                    elif plist[0] == "/CreationDate":
                        pres.add_tag('file.date.creation', plist[1])
                    elif plist[0] == "/LastModified":
                        pres.add_tag('file.date.last_modified', plist[1])
                    elif plist[0] == "/SourceModified":
                        pres.add_tag('file.pdf.date.source_modified', plist[1])
                    elif plist[0] == "/pdfx":
                        pres.add_tag('file.pdf.date.pdfx', plist[1])
            entropy = pdfid_result.get("Entropy", None)
            if entropy:
                enres = ResultSection(title_text="Entropy", parent=pdfidres)
                for enlist in entropy:
                    enres.add_line("{0}: {1}, ({2})".format(enlist[0], enlist[1], enlist[2]))

        flags = pdfid_result.get("Flags", None)
        if flags:
            fres = ResultSection(title_text="PDF Keyword Flags", parent=pdfidres)
            for flist in flags:
                if flist[0] == "/ObjStm":
                    objstms = True
                if len(flist) == 3:
                    fres.add_line("{0}:Count: {1}, Hex-Encoded Count: {2}".format(flist[0], flist[1], flist[2]))
                else:
                    fres.add_line("{0}:Count: {1}".format(flist[0], flist[1]))
                fres.add_tag('file.string.extracted', flist[0].replace("/", "", 1))
                if flist[0] in additional_keywords:
                    triage_keywords.add(flist[0].replace("/", "", 1))

        plugin = pdfid_result.get("Plugin", [])

        # If any plugin results, or flagged keywords found, run PDF Parser
        if plugin or len(triage_keywords) > 0:
            run_pdfparse = True

        for pllist in plugin:
            pl_name, pl_heur, pl_text = pllist
            pl_heur = int(pl_heur)
            # The first 14 characters of the plugin text are dropped before it is used
            pl_text = pl_text[14:]
            if not pl_text or pl_text == "None":
                continue

            if pl_name in ['EmbeddedFile', 'Name Obfuscation']:
                modres = ResultSection(title_text=pl_text, parent=pdfidres)

                if pl_heur > 0:
                    modres.set_heuristic(pl_heur)

                if pl_name == 'EmbeddedFile':
                    embed_present = True

            elif pl_name in ['Triage', 'Suspicious Properties']:
                # Only the first JavaScript-related line of a plugin gets heuristic 19
                javascript_found = False
                for line in pl_text.splitlines():
                    lineres = ResultSection(title_text=line)
                    # Triage results
                    if '/JavaScript' in line:
                        triage_keywords.add('JavaScript')
                        if not javascript_found:
                            lineres.set_heuristic(19)
                            javascript_found = True
                    elif '/JS' in line:
                        triage_keywords.add('JS')
                        if not javascript_found:
                            lineres.set_heuristic(19)
                            javascript_found = True
                    elif '/JBIG2Decode' in line:
                        triage_keywords.add('JBIG2Decode')
                        lineres.set_heuristic(3)
                    elif '/Colors > 2^24' in line:
                        triage_keywords.add('Colors > 2^24')
                        lineres.set_heuristic(20)
                    elif '/AA' in line:
                        triage_keywords.add('AA')
                        lineres.set_heuristic(1)
                    elif '/Launch' in line:
                        triage_keywords.add('Launch')
                        lineres.set_heuristic(1)
                    elif '/OpenAction' in line:
                        triage_keywords.add('OpenAction')
                        lineres.set_heuristic(1)
                    elif '/GoToE' in line:
                        triage_keywords.add('GoToE')
                        lineres.set_heuristic(21)
                    elif '/GoToR' in line:
                        triage_keywords.add('GoToR')
                        lineres.set_heuristic(22)
                    elif '/Encrypt' in line:
                        triage_keywords.add('Encrypt')
                        lineres.set_heuristic(11)
                    elif '/AcroForm' in line:
                        triage_keywords.add('AcroForm')
                        lineres.set_heuristic(4)
                    elif '/RichMedia' in line:
                        triage_keywords.add('RichMedia')
                        lineres.set_heuristic(5)
                    elif '/XFA' in line:
                        triage_keywords.add('XFA')
                        lineres.set_heuristic(23)
                    elif '/Annot' in line:
                        triage_keywords.add('Annot')
                        lineres.set_heuristic(25)
                    elif '/ObjStm' in line:
                        triage_keywords.add('ObjStm')
                        lineres.set_heuristic(7)
                    elif '/URI' in line:
                        triage_keywords.add('URI')
                        lineres.set_heuristic(24)
                    # Suspicious properties results
                    elif "eof2" in line:
                        lineres.set_heuristic(2)
                    elif "eof5" in line:
                        lineres.set_heuristic(17)
                    elif "page" in line:
                        lineres.set_heuristic(26)
                    elif "entropy" in line:
                        lineres.set_heuristic(12)
                    elif "obj/endobj" in line:
                        lineres.set_heuristic(13)
                    elif "stream/endstream" in line:
                        lineres.set_heuristic(14)

                    # Lines that matched nothing above are not reported
                    if lineres.heuristic is not None:
                        pdfidres.add_subsection(lineres)

    for e in errors:
        all_errors.add(e)
        if e.startswith('Error running plugin'):
            # Logger.warn is a deprecated alias (removed in Python 3.13); use warning()
            self.log.warning(e)

    if run_pdfparse:
        # CALL PDF parser and extract further information
        pdf_parserres = ResultSection(title_text="PDF Parser Results")

        # STATISTICS
        # Do not run for objstms, which are being analyzed when get_malform == False
        if get_malform:
            options = {
                "stats": True,
            }
            pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)

            if pdf_parser_result:
                # NOTE(review): for a plain dict/list this branch cannot be taken, since a
                # truthy container never has length 0 -- confirm the result type.
                if len(pdf_parser_result) == 0:
                    pdf_parserres.add_line("No statistical results generated for file. Please see errors.")
                else:
                    version = pdf_parser_result.get("version", None)
                    if version and version[0] != '0':
                        pdf_parserres.add_line(version[0])
                    stats = pdf_parser_result.get("stats", None)
                    if stats:
                        sres = ResultSection(title_text="PDF Statistcs",
                                             parent=pdf_parserres,
                                             body_format=BODY_FORMAT.MEMORY_DUMP)
                        for p in stats:
                            sres.add_line(p)
                for e in errors:
                    all_errors.add(e)

        # Triage plugin -- search sample for keywords and carve content or extract object (if it contains a stream)
        carved_content = {}  # Format { "objnum": [{keyword: content list}}
        obj_extract_triage = set()
        jbig_objs = set()

        for keyword in triage_keywords:
            # ObjStms handled differently
            if keyword == 'ObjStm':
                continue

            options = {
                "search": keyword,
            }
            pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)

            if pdf_parser_result:
                for p in pdf_parser_result['parts']:
                    content = ""
                    references = []
                    # Trailer will be extracted anyways, try and grab all references anyways -- will be messy
                    if p.startswith("trailer:"):
                        # Grab the content after the keyword
                        # Check that keyword actually in content
                        if "/{}".format(keyword) in p:
                            try:
                                content = p.split(keyword, 1)[1].replace('>>++>>', '').split("/", 1)[0].strip()
                                references = re.findall("[0-9]* [0-9]* R", content)
                            except Exception:
                                continue
                    # If not trailer, should be object
                    elif 'Referencing:' in p:
                        # Grab the content after the keyword
                        if '>>++>>' in p:
                            try:
                                content = p.split(keyword, 1)[1].replace('>>++>>', '').strip()
                            except Exception:
                                try:
                                    content = p.split("\n", 3)[3]
                                except Exception:
                                    content = p
                        else:
                            try:
                                content = p.split("\n", 3)[3]
                            except Exception:
                                content = p
                        # Sometimes the content is the same keyword with references (i.e "/URI /URI 10 0 R"
                        if content.startswith("/{}".format(keyword)):
                            try:
                                # count passed by keyword: positional count is deprecated since Python 3.13
                                content = re.sub("/{}[ ]*".format(keyword), "", content, count=1)
                            except Exception:
                                pass
                        try:
                            references = p.split("\n", 3)[2].replace('Referencing:', '').strip().split(", ")
                        except Exception:
                            pass

                    # Only extract JBIG2Decode objects with deep scan, but always report on their presence
                    if keyword == "JBIG2Decode" and "/Filter" in p and "Contains stream" in p:
                        try:
                            objnum = p.split("\n", 1)[0].split(" ")[1]
                            if request.deep_scan:
                                obj_extract_triage.add(objnum)
                            jbig_objs.add(objnum)
                            continue
                        except Exception as e:
                            self.log.debug(e)
                            continue

                    # If no content, then keyword likely points to reference objects, so grab those
                    if content == '':
                        if len(references) > 0:
                            content = references
                        else:
                            # Something is wrong, drop it.
                            continue
                    else:
                        # Single-pass loop: every path breaks; it only normalises content into a list
                        while True:
                            # Multiple references might be in a list, i.e. /Annot # # R vs. /Annots [# # R # # R]
                            islist = re.match(r"[s]?[ ]?\[([0-9]* [0-9]* R[ \\rn]{0,8})*\]", content)
                            if islist:
                                content = re.sub(
                                    r"[\[\]]", "",
                                    islist.group(0).replace("s ", '').replace("R ", "R,")).split(",")
                                break
                            # References might be with instructions, i.e. [# # R /FitH null]
                            withinst = re.match(
                                r"[s]?[ \\']{0,3}\[[ ]?([0-9]* [0-9]* R)[ \\rn]{1,8}"
                                r"[/a-zA-Z0-9 ]*[ ]?\]", content)
                            if withinst:
                                content = [withinst.group(1)]
                                break
                            content = [content]
                            break

                    for c in content:
                        # If keyword = Javascript and content starts with '/JS', disregard as 'JS' will be extracted
                        if "JS" in triage_keywords and keyword == "JavaScript" and "/JS" in c[0:5]:
                            continue
                        if c in references or re.match("[0-9]* [0-9]* R", c):
                            try:
                                ref_obj = c.split(" ", 1)[0]
                                options = {
                                    "object": ref_obj,
                                    "get_object_detail": True
                                }
                                pdf_parser_subresult, err = self.get_pdf_parser(path, working_dir, options)

                                if pdf_parser_subresult:
                                    for sub_p in pdf_parser_subresult['parts']:
                                        sub_references = sub_p.split("\n", 3)[2].replace('Referencing:', '')\
                                            .strip().split(", ")
                                        ptyp = sub_p.split("\n", 2)[1].replace('Type:', '').strip().replace("/", "")
                                        # If the object contains a stream, extract the object.
                                        if "Contains stream" in sub_p:
                                            try:
                                                objnum = sub_p.split("\n", 1)[0].split(" ")[1]
                                                obj_extract_triage.add(objnum)
                                            except Exception:
                                                pass
                                        # Or if the object Type is the keyword, grab all referenced objects.
                                        elif sub_references[0] != '' and len(sub_references) >= 1 \
                                                and ptyp == keyword:
                                            for sr in sub_references:
                                                try:
                                                    objnum = sr.split(" ", 1)[0]
                                                    obj_extract_triage.add(objnum)
                                                except Exception:
                                                    pass
                                        # If not, extract object detail in to carved output
                                        elif pdf_parser_subresult['obj_details'] != "":
                                            try:
                                                objnum = sub_p.split("\n", 1)[0].split(" ")[1]
                                                if objnum in carved_content:
                                                    carved_content[objnum]\
                                                        .append({keyword: pdf_parser_subresult['obj_details']})
                                                else:
                                                    carved_content[objnum] = \
                                                        [{keyword: pdf_parser_subresult['obj_details']}]
                                            except Exception:
                                                continue

                                # NOTE(review): relies on `errors` supporting .add(); if it does not, the
                                # AttributeError is swallowed by the handler below -- confirm its type.
                                for e in err:
                                    errors.add(e)
                            except Exception:
                                # If none of that work, just extract the original object for examination.
                                try:
                                    objnum = p.split("\n", 1)[0].split(" ")[1]
                                    obj_extract_triage.add(objnum)
                                except Exception:
                                    pass
                        # If content does not look like a reference:
                        else:
                            if p.startswith("trailer:"):
                                continue
                            objnum = p.split("\n", 1)[0].split(" ")[1]
                            # If the object contains a stream extract the object
                            if p.split("\n", 4)[3] == "Contains stream":
                                obj_extract_triage.add(objnum)
                            else:
                                # Or just carve the content
                                if objnum in carved_content:
                                    carved_content[objnum].append({keyword: c})
                                else:
                                    carved_content[objnum] = [{keyword: c}]

                for e in errors:
                    all_errors.add(e)

        # Add carved content to result output
        show_content_of_interest = False
        if len(carved_content) > 0 or len(jbig_objs) > 0:
            carres = ResultSection(title_text="Content of Interest")
        else:
            carres = None

        if len(jbig_objs) > 0:
            jbigres = ResultSection(
                title_text="The following Object IDs are JBIG2DECODE streams:",
                body_format=BODY_FORMAT.MEMORY_DUMP,
                parent=carres)
            jbigres.add_line(', '.join(map(str, jbig_objs)))
            show_content_of_interest = True

        if len(carved_content) > 0:
            for k, l in sorted(carved_content.items()):
                for d in l:
                    for keyw, con in d.items():
                        subres = ResultSection(
                            title_text="Object {0}: Hits for Keyword '{1}':".format(k, keyw))
                        subres.set_heuristic(8)

                        con_bytes = con.encode()
                        if len(con) < 500:
                            subres.body_format = BODY_FORMAT.MEMORY_DUMP
                            subres.add_line(con)

                            # Check for IOC content; short content is only reported when IOCs are found
                            patterns = PatternMatch()
                            st_value = patterns.ioc_match(con_bytes, bogon_ip=True)
                            if len(st_value) > 0:
                                carres.add_subsection(subres)
                                show_content_of_interest = True
                                for ty, val in st_value.items():
                                    if val == "":
                                        asc_asc = unicodedata.normalize('NFKC', val).encode('ascii', 'ignore')
                                        subres.add_tag(ty, asc_asc)
                                    else:
                                        ulis = list(set(val))
                                        for v in ulis:
                                            subres.add_tag(ty, v)
                        else:
                            crv_sha = hashlib.sha256(con_bytes).hexdigest()

                            # Each distinct piece of carved content is written out and submitted once
                            if crv_sha not in carved_extracted_shas:
                                f_name = "carved_content_obj_{}_{}".format(k, crv_sha[0:7])
                                subres.add_lines([
                                    "Content over 500 bytes it will be extracted for analysis",
                                    "Name: {} - SHA256: {}".format(f_name, crv_sha)
                                ])
                                carres.add_subsection(subres)
                                show_content_of_interest = True
                                crvf = os.path.join(self.working_directory, f_name)
                                with open(crvf, 'wb') as f:
                                    f.write(con_bytes)
                                request.add_extracted(
                                    crvf, os.path.basename(crvf),
                                    "Extracted content from object {}".format(k))
                                carved_extracted_shas.add(crv_sha)

        if show_content_of_interest:
            pdf_parserres.add_subsection(carres)

        # ELEMENTS
        # Do not show for objstms
        if get_malform:
            if request.deep_scan:
                options = {
                    "verbose": True,
                    "nocanonicalizedoutput": True,
                    "get_malform": get_malform
                }
            elif embed_present:
                options = {
                    "verbose": True,
                    "elements": "ctsi",
                    "type": "/EmbeddedFile",
                    "get_malform": get_malform
                }
            else:
                options = {
                    "verbose": True,
                    "elements": "cst",
                    "get_malform": get_malform
                }
            pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)

            embed_extracted = set()
            if pdf_parser_result:
                if len(pdf_parser_result) == 0:
                    pdf_parserres.add_line("No structure information generated for file. Please see errors.")
                else:
                    # PDF Parser will write any malformed content over 100 bytes to a file
                    files = pdf_parser_result.get("files", None)
                    if files:
                        for f, l in files.items():
                            if f == 'malformed':
                                if len(l) > 0:
                                    pdf_parserres.set_heuristic(6)
                                for i in l:
                                    request.add_extracted(
                                        i, os.path.basename(i),
                                        "Extracted malformed content in PDF Parser Analysis.")

                    parts = pdf_parser_result.get("parts", None)
                    # Extract service will extract the sample's embedded files.
                    # However we want to make note of them so that they are not extracted again below
                    if parts:
                        for p in sorted(parts):
                            if "Type: /EmbeddedFile" in p:
                                getobj = p.split("\n", 1)[0].split(" ")[1]
                                embed_extracted.add(getobj)

            # Extract objects collected from above analysis
            obj_to_extract = obj_extract_triage - embed_extracted - jbig_objs

            if len(obj_to_extract) > 0:
                options = {
                    "filter": True,
                    "object": obj_to_extract,
                    "dump": "extracted_obj_",
                }

                pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)

                if pdf_parser_result:
                    files = pdf_parser_result.get("files", None)
                    extracted_files = []
                    if files:
                        for f, l in files.items():
                            if f == 'embedded':
                                for i in l:
                                    f_name = os.path.basename(i)
                                    obj_id = f_name.replace("extracted_obj_", "")
                                    extracted_files.append(
                                        "Extracted object {} as {}".format(obj_id, f_name))
                                    request.add_extracted(
                                        i, f_name,
                                        "Object {} extracted in PDF Parser Analysis.".format(obj_id))
                    for e in errors:
                        all_errors.add(e)

                    if extracted_files:
                        extract_res = ResultSection(title_text="Extracted embedded objects",
                                                    parent=pdf_parserres)
                        extract_res.set_heuristic(9)
                        extract_res.add_lines(extracted_files)

            # Extract jbig2decode objects in deep scan mode
            if request.deep_scan and len(jbig_objs) > 0:
                options = {
                    "object": jbig_objs,
                    "dump": "extracted_jb_obj_",
                }

                pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)

                if pdf_parser_result:
                    extracted_jb = []
                    files = pdf_parser_result.get("files", None)
                    if files:
                        for f, l in files.items():
                            if f == 'embedded':
                                for i in l:
                                    f_name = os.path.basename(i)
                                    obj_id = f_name.replace("extracted_jb_obj_", "")
                                    extracted_jb.append(
                                        "JBIG2DECODE object {} extracted as {}".format(obj_id, f_name))
                                    request.add_extracted(
                                        i, f_name,
                                        "JBIG2DECODE object {} extracted in PDF Parser Analysis."
                                        .format(obj_id))

                    for e in errors:
                        all_errors.add(e)

                    if extracted_jb:
                        jbig_extract_res = ResultSection(title_text="Extracted JBIG2Decode objects",
                                                         parent=pdf_parserres)
                        jbig_extract_res.set_heuristic(9)
                        jbig_extract_res.add_lines(extracted_jb)

        # Only attach the PDF Parser section when it actually holds something
        if len(pdf_parserres.subsections) > 0:
            res.add_subsection(pdf_parserres)

    return res, objstms, all_errors
def test_process_ttps(intezer_static_class_instance, dummy_api_interface_class, mocker):
    """Exercise ``_process_ttps`` against empty, failing and populated TTP responses."""
    from intezer_static import ALIntezerApi
    from intezer_sdk.api import IntezerApi
    from intezer_sdk.errors import UnsupportedOnPremiseVersion
    from assemblyline_v4_service.common.result import ResultSection, ResultTableSection, TableRow
    from requests import HTTPError

    service = intezer_static_class_instance

    def stub_ttps(ttps):
        # Make the AL wrapper hand back the given list of TTP dictionaries
        mocker.patch.object(ALIntezerApi, "get_dynamic_ttps", return_value=ttps)

    def make_ttp(name, data):
        # Every TTP in this test shares the same description and severity
        return {
            "name": name,
            "description": "blah",
            "data": data,
            "severity": 1
        }

    def expected_signature(name, heur_id, body_line=None):
        # Build the signature section the service is expected to produce
        expected = ResultSection(f"Signature: {name}", "blah")
        if body_line is not None:
            expected.add_line(body_line)
        expected.set_heuristic(heur_id)
        expected.heuristic.add_signature_id(name, 10)
        return expected

    mocker.patch.object(service, "get_api_interface", return_value=dummy_api_interface_class)
    service.start()
    parent = ResultSection("blah")

    # No TTPs returned: nothing is added
    stub_ttps([])
    service._process_ttps("blah", parent)
    assert parent.subsections == []

    # API errors: still nothing is added
    for failure in (HTTPError("FORBIDDEN"), UnsupportedOnPremiseVersion()):
        mocker.patch.object(IntezerApi, "get_dynamic_ttps", side_effect=failure)
        service._process_ttps("blah", parent)
        assert parent.subsections == []

    # Generic signature without data (same parent section as the calls above)
    stub_ttps([make_ttp("blah", [])])
    service._process_ttps("blah", parent)
    wanted = expected_signature("blah", 4)
    assert check_section_equality(parent.subsections[0].subsections[0], wanted)

    # Signature carrying an attack id
    parent = ResultSection("blah")
    stub_ttps([make_ttp("InjectionInterProcess", [])])
    service._process_ttps("blah", parent)
    wanted = expected_signature("InjectionInterProcess", 7)
    wanted.heuristic.add_attack_id("T1055")
    assert check_section_equality(parent.subsections[0].subsections[0], wanted)

    # Signature carrying an attack id, with data attached to the TTP
    parent = ResultSection("blah")
    stub_ttps([make_ttp("enumerates_running_processes", [{"wow": "print me!"}])])
    service._process_ttps("blah", parent)
    wanted = expected_signature("enumerates_running_processes", 8)
    wanted.heuristic.add_attack_id("T1057")
    assert check_section_equality(parent.subsections[0].subsections[0], wanted)

    # Signature whose data contains an IOC
    parent = ResultSection("blah")
    stub_ttps([make_ttp("blah", [{"IP": "blah 2.2.2.2 blah"}, ])])
    service._process_ttps("blah", parent)
    wanted = expected_signature("blah", 4, body_line="\tIP: blah 2.2.2.2 blah")
    ioc_table = ResultTableSection("IOCs found in signature marks")
    ioc_table.add_row(TableRow(ioc_type="ip", ioc="2.2.2.2"))
    ioc_table.add_tag("network.dynamic.ip", "2.2.2.2")
    wanted.add_subsection(ioc_table)
    assert check_section_equality(parent.subsections[0].subsections[0], wanted)
def execute(self, request):
    """De-obfuscate the submitted file layer by layer and report the outcome on ``request.result``.

    The file's IOCs are collected first so that only IOCs newly revealed by de-obfuscation are
    reported later. Techniques are then applied repeatedly until a pass changes nothing or the
    attempt budget (100 for deep scans, 10 otherwise) is exceeded, after which the final-pass
    techniques run once. When the cleaned final layer differs from the input, the steps taken,
    a preview of the final layer, any new IOCs and any extracted files are added to the result.

    Args:
        request: Service request; its ``result`` attribute is replaced with a fresh ``Result``.
    """
    # --- Setup -----------------------------------------------------------------------------------------------
    request.result = Result()
    ioc_matcher = PatternMatch()
    max_attempts = 100 if request.deep_scan else 10
    self.files_extracted = set()
    self.hashes = set()
    known_iocs = set()

    # --- Pre-Processing --------------------------------------------------------------------------------------
    # Get all IOCs prior to de-obfuscation
    original_iocs = ioc_matcher.ioc_match(request.file_contents, bogon_ip=True, just_network=False)
    if original_iocs:
        ioc_res = None
        if request.get_param('extract_original_iocs'):
            ioc_res = ResultSection(
                "The following IOCs were found in the original file",
                parent=request.result,
                body_format=BODY_FORMAT.MEMORY_DUMP)

        for ioc_type, found in original_iocs.items():
            # An empty-string match is recorded as a single normalised ASCII value
            if found == "":
                candidates = [unicodedata.normalize('NFKC', found).encode('ascii', 'ignore')]
            else:
                candidates = found
            for value in candidates:
                if ioc_res:
                    ioc_res.add_line(f"Found {ioc_type.upper().replace('.', ' ')}: {safe_str(value)}")
                    ioc_res.add_tag(ioc_type, value)
                known_iocs.add((ioc_type, value))

    # --- Prepare Techniques ----------------------------------------------------------------------------------
    techniques = [
        ('MSOffice Embedded script', self.msoffice_embedded_script_string),
        ('CHR and CHRB decode', self.chr_decode),
        ('String replace', self.string_replace),
        ('Powershell carets', self.powershell_carets),
        ('Array of strings', self.array_of_strings),
        ('Fake array vars', self.vars_of_fake_arrays),
        ('Reverse strings', self.str_reverse),
        ('B64 Decode', self.b64decode_str),
        ('Simple XOR function', self.simple_xor_function),
    ]
    second_pass = [
        ('Concat strings', self.concat_strings),
        ('MSWord macro vars', self.mswordmacro_vars),
        ('Powershell vars', self.powershell_vars),
        ('Charcode hex', self.charcode_hex),
    ]
    final_pass = [
        ('Charcode', self.charcode),
    ]
    code_extracts = [
        ('.*html.*', "HTML scripts extraction", self.extract_htmlscript),
    ]

    layers_list = []
    layer = request.file_contents

    # --- Stage 1: Script Extraction --------------------------------------------------------------------------
    for type_pattern, extract_name, extractor in code_extracts:
        if re.compile(type_pattern).match(request.task.file_type):
            layer = b"\n".join(extractor(request.file_contents)).strip()
            layers_list.append((extract_name, layer))
            break

    # --- Stage 2: Deobfuscation ------------------------------------------------------------------------------
    def run_final_pass(current_layer):
        # Append the regular techniques to the final ones and apply each once to the same layer
        final_pass.extend(techniques)
        for step_name, step in final_pass:
            outcome = step(current_layer)
            if outcome:
                layers_list.append((step_name, outcome))

    attempt = 0
    first_pass_len = len(techniques)
    layers_count = len(layers_list)
    while True:
        if attempt > max_attempts:
            run_final_pass(layer)
            break

        for step_name, step in techniques:
            outcome = step(layer)
            if outcome:
                layers_list.append((step_name, outcome))
                # Looks like it worked, restart with new layer
                layer = outcome

        # If the layers haven't changed in a passing, break
        if layers_count == len(layers_list):
            if len(techniques) != first_pass_len:
                run_final_pass(layer)
                break
            # First stall: bring in the second-pass techniques ahead of the others
            for extra in second_pass:
                techniques.insert(0, extra)

        layers_count = len(layers_list)
        attempt += 1

    # --- Compiling results -----------------------------------------------------------------------------------
    if not layers_list:
        return

    extract_file = False
    num_layers = len(layers_list)

    # Compute heuristic from the number of layers peeled off
    heur_id = 5
    for upper_bound, candidate_id in ((5, 1), (10, 2), (50, 3), (100, 4)):
        if num_layers < upper_bound:
            heur_id = candidate_id
            break

    # Cleanup final layer
    clean = self.clean_up_final_layer(layers_list[-1][1])
    if clean != request.file_contents:
        # Check for new IOCs
        final_iocs = ioc_matcher.ioc_match(clean, bogon_ip=True, just_network=False)
        diff_tags = {}

        for ioc_type, found in final_iocs.items():
            if found == "":
                candidates = [unicodedata.normalize('NFKC', found).encode('ascii', 'ignore')]
            else:
                candidates = found
            for value in candidates:
                if (ioc_type, value) not in known_iocs:
                    diff_tags.setdefault(ioc_type, []).append(value)

        if request.deep_scan or (len(clean) > 1000 and heur_id >= 4) or diff_tags:
            extract_file = True

        # Display obfuscation steps
        steps_section = ResultSection(
            "De-obfuscation steps taken by DeobsfuScripter",
            parent=request.result)
        if heur_id:
            steps_section.set_heuristic(heur_id)

        usage = Counter(entry[0] for entry in layers_list)
        for technique_name, times in usage.items():
            steps_section.add_line(f"{technique_name}, {times} time(s).")

        # Display final layer
        byte_count = 5000
        if extract_file:
            # Save extracted file
            byte_count = 500
            decoded_name = f"{request.file_name}_decoded_final"
            decoded_path = os.path.join(self.working_directory, decoded_name)
            with open(decoded_path, 'wb') as decoded_file:
                decoded_file.write(clean)
                self.log.debug(f"Submitted dropped file for analysis: {decoded_path}")
            request.add_extracted(decoded_path, decoded_name, "Final deobfuscation layer")

        ResultSection(
            f"First {byte_count} bytes of the final layer:",
            body=safe_str(clean[:byte_count]),
            body_format=BODY_FORMAT.MEMORY_DUMP,
            parent=request.result)

        # Display new IOCs from final layer
        if len(diff_tags) > 0:
            ioc_new = ResultSection(
                "New IOCs found after de-obfustcation",
                parent=request.result,
                body_format=BODY_FORMAT.MEMORY_DUMP)
            has_network_heur = False
            for tag_type, values in diff_tags.items():
                for value in values:
                    if "network" in tag_type:
                        has_network_heur = True
                    ioc_new.add_line(f"Found {tag_type.upper().replace('.', ' ')}: {safe_str(value)}")
                    ioc_new.add_tag(tag_type, value)

            ioc_new.set_heuristic(7 if has_network_heur else 6)

        if len(self.files_extracted) > 0:
            ext_file_res = ResultSection(
                "The following files were extracted during the deobfuscation",
                heuristic=Heuristic(8),
                parent=request.result)
            for extracted_path in self.files_extracted:
                ext_file_res.add_line(os.path.basename(extracted_path))
                request.add_extracted(
                    extracted_path,
                    os.path.basename(extracted_path),
                    "File of interest deobfuscated from sample")
def _add_resultinfo_for_match(self, result: Result, match):
    """
    Build a result section for a single Yara match and attach it to the service result.

    The section is tagged with the matching rule plus the attribution, technique, info
    and behavior metadata carried by the rule. Its title summarises those elements and
    its key/value body holds the rule metadata together with any string hits.

    Args:
        result: Result object that the new section is added to.
        match: Yara rules Match object item.

    Returns:
        None.
    """
    almeta = YaraMetadata(match)
    self._normalize_metadata(almeta)

    section = ResultSection('', classification=almeta.classification)
    rule_signature = f'{match.namespace}.{match.rule}'

    # A rule flagged NOISY only gets a heuristic when a deep scan is running
    skip_heuristic = not self.deep_scan and almeta.al_status == "NOISY"
    if not skip_heuristic:
        heur_id = self.YARA_HEURISTICS_MAP.get(almeta.category, 1)
        section.set_heuristic(heur_id,
                              signature=rule_signature,
                              attack_id=almeta.mitre_att)
    section.add_tag(f'file.rule.{self.name.lower()}', rule_signature)

    # The title is assembled piece by piece and joined once everything is known
    title_parts = [f"[{match.namespace}] {match.rule}"]

    if almeta.actor_type:
        section.add_tag('attribution.actor', almeta.actor_type)

    for extra_tag in almeta.tags:
        section.add_tag(extra_tag['type'], extra_tag['value'])

    # Malware tags: each entry may carry an implant name and/or a family name
    implant_names = []
    for implant_name, implant_family in almeta.malwares:
        labelled_values = (
            ('attribution.implant', implant_name),
            ('attribution.family', implant_family),
        )
        for tag_type, tag_value in labelled_values:
            if tag_value:
                implant_names.append(tag_value)
                section.add_tag(tag_type, tag_value)
    if implant_names:
        title_parts.append(f"- Implant(s): {', '.join(implant_names)}")

    # Threat actor metadata
    for actor in almeta.actors:
        title_parts.append(actor)
        section.add_tag('attribution.actor', actor)

    # Exploit / CVE metadata
    if almeta.exploits:
        title_parts.append(f"- Exploit(s): {', '.join(almeta.exploits)}")
    for exploit in almeta.exploits:
        section.add_tag('attribution.exploit', exploit)

    # Techniques first, then infos: a known category contributes a tag and
    # adds its description to the behavior collection
    described_groups = (
        (almeta.techniques, self.TECHNIQUE_DESCRIPTORS),
        (almeta.infos, self.INFO_DESCRIPTORS),
    )
    for entries, descriptor_map in described_groups:
        for category, name in entries:
            descriptor = descriptor_map.get(category, None)
            if not descriptor:
                continue
            tag_type, description = descriptor
            section.add_tag(tag_type, name)
            almeta.behavior.add(description)

    # Summaries
    if almeta.behavior:
        title_parts.append(f"- Behavior: {', '.join(almeta.behavior)}")
    for behavior in almeta.behavior:
        section.add_tag('file.behavior', behavior)

    section.title_text = " ".join(title_parts)

    # Body: the rule name followed by every metadata field that has a truthy value
    body_fields = (
        'id', 'version', 'author', 'description', 'source', 'malware', 'info',
        'technique', 'tool', 'exploit', 'actor', 'category', 'mitre_att',
    )
    meta_values = almeta.__dict__
    json_body = {'name': match.rule}
    for field_name in body_fields:
        field_value = meta_values.get(field_name, None)
        if field_value:
            json_body[field_name] = field_value

    string_hits = self._add_string_match_data(match)
    if string_hits:
        json_body['string_hits'] = string_hits

    section.set_body(json.dumps(json_body),
                     body_format=BODY_FORMAT.KEY_VALUE)
    result.add_section(section)