def test_handle_artefact(artefact, expected_result_section_title):
    from assemblyline_v4_service.common.dynamic_service_helper import SandboxOntology, Artefact
    from assemblyline_v4_service.common.result import ResultSection

    if artefact is None:
        with pytest.raises(Exception):
            SandboxOntology._handle_artefact(artefact, None)
        return

    expected_result_section = None
    if expected_result_section_title is not None:
        expected_result_section = ResultSection(expected_result_section_title)
        expected_result_section.add_tag("dynamic.process.file_name", artefact["path"])

    parent_result_section = ResultSection("blah")
    a = Artefact(
        name=artefact["name"],
        path=artefact["path"],
        description=artefact["description"],
        to_be_extracted=artefact["to_be_extracted"],
    )
    SandboxOntology._handle_artefact(a, parent_result_section)

    if len(parent_result_section.subsections) > 0:
        actual_result_section = parent_result_section.subsections[0]
    else:
        actual_result_section = None

    if expected_result_section is None and actual_result_section is None:
        assert True
    else:
        assert check_section_equality(actual_result_section, expected_result_section)
def decoded_result(text: bytes) -> Optional[ResultSection]:
    """Generates a ResultSection from the FLOSS decoded strings output section."""
    lines = text.splitlines()
    lines[0] = b'Most likely decoding functions:'
    body = b'\n'.join(lines[:-1])
    strings = re.findall(rb'^\[[A-Z]+\]\s+0x[0-9A-F]+\s+(.+)', body, flags=re.M)
    if not strings:
        return None

    result = ResultSection('FLARE FLOSS Decoded Strings',
                           body_format=BODY_FORMAT.MEMORY_DUMP,
                           heuristic=Heuristic(1))
    assert result.heuristic
    ioc = False
    for string in strings:
        ioc = ioc_tag(string, result, just_network=len(strings) > 1000) or ioc
        result.add_tag('file.string.decoded', string[:75])
    if ioc:
        result.heuristic.add_signature_id('decoded_ioc')
    result.add_line(body.decode())
    return result
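# Usage sketch for decoded_result (the sample bytes below are illustrative, not
# real FLOSS output; only lines shaped like "[TYPE] 0xADDR string" are treated
# as decoded strings by the regex above):
def _demo_decoded_result():
    sample = b"header\n[FUNCTION] 0x401000 hxxp://example.com/payload\ntrailer"
    section = decoded_result(sample)
    # section is a MEMORY_DUMP ResultSection carrying file.string.decoded tags,
    # or None when no decoded-string lines are present.
    return section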
def _get_category_section(self, category: str, tags: Iterator[AVClassTag]) -> ResultSection:
    """Gets a `ResultSection` for a list of tags from a single category.

    The result contains a table with AVclass tag information, in descending order by rank.

    :param category: Category of tags
    :param tags: Tags belonging to the category
    :return: `ResultSection`
    """
    tags = sorted(tags, key=lambda t: t.rank, reverse=True)
    category_name, heur_id, tag_type = AVCLASS_CATEGORY[category]
    tag_table = [{
        'name': tag.name,
        'category': category_name,
        'path': tag.path,
        'rank': tag.rank
    } for tag in tags]

    section = ResultSection(
        f'AVclass extracted {len(tags)} {category_name} tags',
        body=json.dumps(tag_table),
        body_format=BODY_FORMAT.TABLE,
        heuristic=Heuristic(heur_id) if heur_id is not None else None)
    if tag_type is not None:
        for tag in tags:
            section.add_tag(tag_type, tag.name)
    return section
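# Hypothetical shapes for the names _get_category_section relies on, inferred
# from the attribute access and tuple unpacking above; the real definitions
# live elsewhere in the service and may differ:
from collections import namedtuple

AVClassTag = namedtuple('AVClassTag', ['name', 'category', 'path', 'rank'])
AVCLASS_CATEGORY = {
    # category key -> (display name, heuristic id or None, AL tag type or None)
    'FAM': ('family', 1, 'attribution.family'),
}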
def additional_parsing(self, file_path: str) -> Optional[ResultSection]:
    urls = set()
    try:
        with pikepdf.open(file_path) as pdf:
            num_pages = len(pdf.pages)
            for page in pdf.pages:
                if '/Annots' not in page:
                    continue
                for annot in page['/Annots'].as_list():
                    if annot.get('/Subtype') == '/Link':
                        if '/A' not in annot:
                            continue
                        _url = annot['/A'].get('/URI')
                        if not hasattr(_url, '__str__'):
                            continue
                        url = str(_url)
                        if re.match(FULL_URI, url):
                            urls.add(url)

        if not urls:
            return None

        patterns = PatternMatch()
        body = '\n'.join(urls)
        tags: dict[str, set[bytes]] = patterns.ioc_match(body.encode())
        result = ResultSection(
            'URL in Annotations',
            heuristic=Heuristic(27, signature='one_page' if num_pages == 1 else None),
            body=body)
        for ty, vals in tags.items():
            for val in vals:
                result.add_tag(ty, val)
        return result
    except Exception as e:
        self.log.warning(f'pikepdf failed to parse sample: {e}')
        return None
def add_ip_tags(self):
    """Adds tags for URLs and IP addresses from the given lists."""
    if self.url_list or self.ip_list:
        sec_iocs = ResultSection(
            "ViperMonkey has found the following IOCs:",
            parent=self.result,
            heuristic=Heuristic(4))

        # Add URLs
        for url in set(self.url_list):
            sec_iocs.add_line(url)
            sec_iocs.add_tag('network.static.uri', url)
            try:
                parsed = urlparse(url)
                if not re.match(IP_ONLY_REGEX, parsed.hostname):
                    sec_iocs.add_tag('network.static.domain', parsed.hostname)
            except Exception:
                pass

        # Add IPs
        for ip in set(self.ip_list):
            sec_iocs.add_line(ip)
            # Check whether a port is attached to the IP and tag it separately
            if ":" in ip:
                net_ip, net_port = ip.split(':')
                sec_iocs.add_tag('network.static.ip', net_ip)
                sec_iocs.add_tag('network.port', net_port)
            else:
                sec_iocs.add_tag('network.static.ip', ip)
def run_strings_analysis(self, apk_file, result: Result):
    string_args = ['d', 'strings', apk_file]
    strings, _ = self.run_appt(string_args)
    # "unitialized" (sic) matches the misspelled output of the aapt tool
    if not strings or strings == "String pool is unitialized.\n":
        ResultSection("No strings found in APK",
                      body="This is highly unlikely and most likely malicious.",
                      parent=result, heuristic=Heuristic(6))
    else:
        res_strings = ResultSection("Strings Analysis", parent=result)

        config_args = ['d', 'configurations', apk_file]
        configs, _ = self.run_appt(config_args)
        languages = []
        for line in configs.splitlines():
            config = line.upper()
            if config in ISO_LOCALES:
                languages.append(config)
                res_strings.add_tag('file.apk.locale', config)

        data_line = strings.split("\n", 1)[0]
        count = int(data_line.split(" entries")[0].rsplit(" ", 1)[1])
        styles = int(data_line.split(" styles")[0].rsplit(" ", 1)[1])
        if count < 50:
            ResultSection("Low volume of strings, this is suspicious.",
                          parent=res_strings, body_format=BODY_FORMAT.MEMORY_DUMP,
                          body=safe_str(strings), heuristic=Heuristic(7))

        if len(languages) < 2:
            ResultSection("This app is not built for multiple languages. This is unlikely.",
                          parent=res_strings, heuristic=Heuristic(8))

        res_strings.add_line(f"Total string count: {count}")
        res_strings.add_line(f"Total styles: {styles}")
        if languages:
            res_strings.add_line(f"Languages: {', '.join(languages)}")
def recurse_add_res(self, file_res, res_list, new_files, parent=None):
    for res_dic in res_list:
        # Check if the condition is OK
        if self.pass_condition(res_dic.get("condition", None)):
            res = ResultSection(res_dic['title_text'],
                                classification=res_dic.get('classification',
                                                           Classification.UNRESTRICTED),
                                parent=parent,
                                body_format=res_dic.get('body_format', BODY_FORMAT.TEXT))
            heur_id = self.heuristic_alteration(res_dic.get('score_condition', None),
                                                res_dic['heur_id'])
            res.set_heuristic(heur_id)

            # Add tags
            tags = res_dic.get('tags', [])
            for res_tag in tags:
                res.add_tag(res_tag[0], res_tag[1])

            # Add body
            body = res_dic.get('body', None)
            if body:
                res.set_body(body)

            # Files for resubmission
            files = res_dic.get('files', [])
            for res_file in files:
                if isinstance(res_file, tuple):
                    res_file = res_file[1]
                new_files.append(res_file)

            # Add to the file result if this is a root result
            if parent is None:
                file_res.add_section(res)
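# Hypothetical res_list entry illustrating the dict shape recurse_add_res
# expects; the keys are inferred from the lookups above and the values are
# made up for illustration:
example_res_list = [{
    'title_text': 'Example section',
    'heur_id': 1,
    'tags': [('file.string.extracted', 'blah')],
    'body': 'text body',
    'files': ['/tmp/resubmit.bin'],
    # optional keys: 'condition', 'score_condition', 'classification', 'body_format'
}]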
def _create_random_section(self):
    # Choose a random body format
    body_format = random.choice(FORMAT_LIST)

    # Create a section with a random title
    section = ResultSection(get_random_phrase(3, 7), body_format=body_format)

    # Add a few random lines to the body
    for _ in range(1, 5):
        section.add_line(get_random_phrase(5, 10))

    # Add a random set of tags
    tags = flatten(get_random_tags())
    for key, val in tags.items():
        for v in val:
            section.add_tag(key, v)

    # Set a heuristic a third of the time
    if random.choice([False, False, True]):
        section.set_heuristic(random.randint(1, 4))

    # Create a random subsection a third of the time
    if random.choice([False, False, True]):
        section.add_subsection(self._create_random_section())

    return section
def _validate_tag(
        result_section: ResultSection,
        tag: str,
        value: Any,
        safelist: Dict[str, Dict[str, List[str]]] = None
) -> bool:
    """
    This method validates the value relative to the tag type before adding the value
    as a tag to the ResultSection.

    :param result_section: The ResultSection that the tag will be added to
    :param tag: The tag type that the value will be tagged under
    :param value: The item that will be tagged under the tag type
    :param safelist: The safelist containing matches and regexes. The product of a
                     service using self.get_api_interface().get_safelist().
    :return: Whether the tag was successfully added
    """
    if safelist is None:
        safelist = {}

    regex = _get_regex_for_tag(tag)
    if regex and not match(regex, value):
        return False

    if "ip" in tag and not is_valid_ip(value):
        return False

    if "domain" in tag:
        if not is_valid_domain(value):
            return False
        elif value in FALSE_POSITIVE_DOMAINS_FOUND_IN_PATHS:
            return False
        elif isinstance(value, str) and value.split(".")[-1] in COMMON_FILE_EXTENSIONS:
            return False

    if is_tag_safelisted(value, [tag], safelist):
        return False

    # If "uri" is in the tag, try to extract the URI's domain/ip and tag it as well.
    if "uri_path" not in tag and "uri" in tag:
        # First, try to get the domain
        valid_domain = False
        domain = search(DOMAIN_REGEX, value)
        if domain:
            domain = domain.group()
            valid_domain = _validate_tag(result_section, "network.dynamic.domain", domain, safelist)
        # Then try to get the IP
        valid_ip = False
        ip = search(IP_REGEX, value)
        if ip:
            ip = ip.group()
            valid_ip = _validate_tag(result_section, "network.dynamic.ip", ip, safelist)

        if value not in [domain, ip] and (valid_domain or valid_ip):
            result_section.add_tag(tag, safe_str(value))
        else:
            return False
    else:
        result_section.add_tag(tag, safe_str(value))

    return True
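# Usage sketch (hypothetical values; the tags actually produced depend on the
# service's regex tables and validators): validating a dynamic URI should also
# attempt to tag the embedded domain or IP before the URI itself is accepted.
section = ResultSection("IOCs")
if _validate_tag(section, "network.dynamic.uri", "http://example.com/payload"):
    # section should now carry both a network.dynamic.uri tag for the full URI
    # and a network.dynamic.domain tag for example.com
    pass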
def execute(self, request):
    result = Result()
    url = request.task.metadata.get('submitted_url')
    api_key = request.get_param("api_key")
    public = request.get_param("public")

    u = UrlScan(apikey=api_key, url=url, public=public)
    u.submit()

    # We need to wait for the API to process our request
    response = self.wait_processing(u)

    # We get the response parts that we want and merge them all together
    report = {
        **response.json()["verdicts"]["overall"],
        **response.json()["lists"],
        **response.json()["page"]
    }

    # We convert the "certificates" section from a list of dictionaries
    # to a dictionary of lists
    certificates = report.pop("certificates")
    certificates = {k: [dic[k] for dic in certificates] for k in certificates[0]}

    # We add the converted section to the report
    report = {**report, **certificates}

    # We create the KEY_VALUE section to add the report to the result page
    kv_section = ResultSection("Urlscan.io report",
                               body_format=BODY_FORMAT.KEY_VALUE,
                               body=json.dumps(report))
    for domain in report["domains"]:
        kv_section.add_tag("network.static.domain", domain.strip())
    result.add_section(kv_section)

    # We get the preview of the website
    screenshot = u.getScreenshot()
    with open(self.working_directory + "/preview.png", "wb") as ofile:
        ofile.write(screenshot)

    # Adding the preview on the result page
    url_section = ResultSection(
        'Urlscan.io website screenshot',
        body_format=BODY_FORMAT.URL,
        body=json.dumps({
            "name": "The preview is also available here!",
            "url": response.json()["task"]["screenshotURL"]
        }))
    result.add_section(url_section)
    request.add_extracted(self.working_directory + "/preview.png", "preview.png",
                          "Here's the preview of the site")

    request.result = result
def test_process_iocs(intezer_static_class_instance, dummy_api_interface_class, mocker):
    from intezer_static import ALIntezerApi
    from intezer_sdk.api import IntezerApi
    from assemblyline_v4_service.common.result import ResultSection
    from requests import HTTPError

    mocker.patch.object(intezer_static_class_instance, "get_api_interface",
                        return_value=dummy_api_interface_class)
    intezer_static_class_instance.start()
    parent_res_sec = ResultSection("blah")
    file_verdict_map = {}

    mocker.patch.object(ALIntezerApi, "get_iocs",
                        return_value={"files": [], "network": []})
    intezer_static_class_instance._process_iocs("blah", file_verdict_map, parent_res_sec)
    assert parent_res_sec.subsections == []
    assert file_verdict_map == {}

    mocker.patch.object(IntezerApi, "get_iocs", side_effect=HTTPError("FORBIDDEN"))
    intezer_static_class_instance._process_iocs("blah", file_verdict_map, parent_res_sec)
    assert parent_res_sec.subsections == []
    assert file_verdict_map == {}

    mocker.patch.object(ALIntezerApi, "get_iocs",
                        return_value={
                            "files": [{"sha256": "blah", "verdict": "malicious"}],
                            "network": [{"ioc": "1.1.1.1", "type": "ip"},
                                        {"ioc": "blah.com", "type": "domain"}]
                        })
    intezer_static_class_instance._process_iocs("blah", file_verdict_map, parent_res_sec)
    correct_res_sec = ResultSection("Network Communication Observed")
    correct_res_sec.add_tag("network.dynamic.ip", "1.1.1.1")
    correct_res_sec.add_tag("network.dynamic.domain", "blah.com")
    correct_res_sec.add_line("IOC: 1.1.1.1")
    correct_res_sec.add_line("IOC: blah.com")
    assert check_section_equality(parent_res_sec.subsections[0], correct_res_sec)
    assert file_verdict_map == {"blah": "malicious"}
def test_section_traverser(tags, correct_tags):
    from assemblyline_v4_service.common.section_reducer import _section_traverser
    from assemblyline_v4_service.common.result import ResultSection

    section = ResultSection("blah")
    subsection = ResultSection("subblah")
    for t_type, t_values in tags.items():
        for t_value in t_values:
            subsection.add_tag(t_type, t_value)
    section.add_subsection(subsection)
    assert _section_traverser(section).subsections[0].tags == correct_tags
def parse_plist(self, pdict):
    """Attempts to extract and identify all known and unknown keys of a plist file.

    Args:
        pdict: Plist dictionary item.

    Returns:
        A ResultSection of identified keys (or None) and a ResultSection of
        unidentified keys (or None).
    """
    idenkey_sec = ResultSection("Identified Keys")
    unkkey_sec = ResultSection("Unidentified Keys:")

    # Sometimes the plist is a list of dictionaries, or just a list; merge/convert
    # it to a dictionary for now
    if isinstance(pdict, list):
        pdict = self.transform_dicts(pdict)

    for k, i in list(pdict.items()):
        # Prepare the key
        k = str(safe_str(k))
        k_noipad = k.replace("~ipad", "")

        # Prepare the values
        if i is None:
            i = [""]
        elif not isinstance(i, list):
            i = [i]

        # Many plist files are duplicates of info.plist; do not report on keys
        # that have already been identified
        if k_noipad in self.reported_keys:
            if i in self.reported_keys[k_noipad]:
                continue
            self.reported_keys[k_noipad].append(i)
        else:
            self.reported_keys[k_noipad] = [i]

        # Process known keys
        if k_noipad in self.known_keys:
            desc, create_tag = self.known_keys[k_noipad]
            idenkey_sec.add_line(f"{k} ({desc}): {', '.join([safe_str(x, force_str=True) for x in i])}")
            if create_tag:
                for val in i:
                    idenkey_sec.add_tag(TAG_MAP[k_noipad.upper()], safe_str(val, force_str=True))
        else:
            unkkey_sec.add_line(f"{k}: {', '.join([safe_str(x, force_str=True) for x in i])}")

    if idenkey_sec.body is None:
        idenkey_sec = None
    if unkkey_sec.body is None:
        unkkey_sec = None

    return idenkey_sec, unkkey_sec
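# Hypothetical known_keys entry matching the unpacking above: each value is a
# (description, create_tag) pair, and TAG_MAP maps the upper-cased key to an AL
# tag type (illustrative values only; the real tables live in the service):
known_keys = {'cfbundleidentifier': ('Bundle identifier', True)}
TAG_MAP = {'CFBUNDLEIDENTIFIER': 'file.plist.cf_bundle.identifier'}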
def ioc_tag(text: bytes, result: ResultSection, just_network: bool = False) -> bool:
    """Tags IOCs found in text to the result section.

    text: text to search for IOCs
    result: ResultSection to tag with IOCs
    just_network: whether non-network IOCs should be skipped

    returns: whether any IOCs were found
    """
    pattern = PatternMatch()
    ioc = pattern.ioc_match(text, bogon_ip=True, just_network=just_network)
    for kind, values in ioc.items():
        for val in values:
            result.add_tag(kind, val[:MAX_TAG_LEN])
    # Return whether any IOCs were found
    return bool(ioc)
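# Usage sketch (hypothetical input; which tags are produced depends on
# PatternMatch's regex tables): tag any IOCs found in a blob, and flag the
# section when something was found.
section = ResultSection('Decoded strings')
if ioc_tag(b'callback to http://example.com/gate.php', section):
    # section should now carry a network URI tag for the embedded URL
    section.set_heuristic(1)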
def parse_link(self, parent_res, path):
    with open(path, "rb") as fh:
        metadata = decode_lnk(fh.read())
    if metadata is None:
        return False

    body_output = {build_key(k): v for k, v in flatten(metadata).items() if v}
    res = ResultSection("Metadata extracted by parse_lnk",
                        body_format=BODY_FORMAT.KEY_VALUE,
                        body=json.dumps(body_output),
                        parent=parent_res)

    bp = metadata.get("BasePath", "").strip()
    rp = metadata.get("RELATIVE_PATH", "").strip()
    nn = metadata.get("NetName", "").strip()
    cla = metadata.get("COMMAND_LINE_ARGUMENTS", "").strip()

    s = BAD_LINK_RE.search(cla.lower())
    if s:
        res.set_heuristic(1)

    res.add_tag(tag_type="file.name.extracted", value=(bp or rp or nn).rsplit("\\")[-1])
    res.add_tag(tag_type="dynamic.process.command_line", value=f"{(rp or bp or nn)} {cla}".strip())

    for k, v in body_output.items():
        tag_type = TAG_MAP.get("LNK", {}).get(k, None) or \
            TAG_MAP.get(None, {}).get(k, None)
        if tag_type:
            res.add_tag(tag_type, v)

    return True
def execute(self, request):
    qr = xqrcode.decode_from_file(request.file_path)
    if len(qr) > 0:
        result_url = qr[0]['data']
        result = Result()

        text_section = ResultSection('QR Code')
        text_section.add_line(result_url)
        result.add_section(text_section)

        url_section = ResultSection('url extracted',
                                    body_format=BODY_FORMAT.URL,
                                    body=json.dumps({"name": "QR Code Url",
                                                     "url": f"{result_url}"}))
        url_section.add_tag("network.static.domain", result_url)
        result.add_section(url_section)

        request.result = result
    else:
        request.result = Result()
def _handle_artefact(artefact: Artefact = None, artefacts_result_section: ResultSection = None):
    if artefact is None:
        raise Exception("Artefact cannot be None")

    # This is a dict whose key-value pairs follow the format {regex: result_section_title}
    artefact_map = {
        HOLLOWSHUNTER_EXE_REGEX: "HollowsHunter Injected Portable Executable",
        HOLLOWSHUNTER_SHC_REGEX: "HollowsHunter Shellcode",
        HOLLOWSHUNTER_DLL_REGEX: "HollowsHunter DLL",
    }
    artefact_result_section = None

    for regex, title in artefact_map.items():
        pattern = compile(regex)
        if pattern.match(artefact.name):
            artefact_result_section = ResultSection(title)
            artefact_result_section.add_tag("dynamic.process.file_name", artefact.path)

    if artefact_result_section is not None:
        artefacts_result_section.add_subsection(artefact_result_section)
def subsection_builder(parent_section: ResultSection = None, fields: dict = None):
    # Avoid a mutable default argument
    fields = fields or {}
    for mwcp_field, mwcp_field_data in fields.items():
        if mwcp_field in FIELD_TAG_MAP:
            tag = FIELD_TAG_MAP[mwcp_field]
            table_body = []
            table_section = ResultSection(f"Extracted {mwcp_field.capitalize()}")
            if tag:
                # Tag everything that we can
                for x in mwcp_field_data:
                    table_section.add_tag(tag, x)
            # Add data to the section body
            for line in mwcp_field_data:
                if isinstance(line, str):
                    table_body.append({mwcp_field: line})
                elif isinstance(line, list):
                    for item in line:
                        table_body.append({mwcp_field: item})
            table_section.set_body(body_format=BODY_FORMAT.TABLE, body=json.dumps(table_body))
            parent_section.add_subsection(table_section)
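# Usage sketch with a hypothetical FIELD_TAG_MAP entry (the real map is defined
# at module level in the service and maps MWCP field names to AL tag types):
FIELD_TAG_MAP = {'address': 'network.dynamic.ip'}
parent = ResultSection('MWCP parser output')
subsection_builder(parent, {'address': ['10.0.0.1']})
# parent now holds an "Extracted Address" TABLE subsection tagged with the IP.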
def validate_certs(apktool_out_dir: str, result: Result):
    has_cert = False
    for root, _, files in os.walk(os.path.join(apktool_out_dir, "original", "META-INF")):
        for f in files:
            cur_file = os.path.join(root, f)
            stdout, stderr = Popen(["keytool", "-printcert", "-file", cur_file],
                                   stderr=PIPE, stdout=PIPE).communicate()
            stdout = safe_str(stdout)
            if stdout:
                if "keytool error" not in stdout:
                    has_cert = True
                    issuer = ""
                    owner = ""
                    country = ""
                    valid_from = ""
                    valid_to = ""
                    valid_year_end = 0
                    valid_year_start = 0
                    valid_until_date = time.time()
                    play_store_min = 'Sat Oct 22 00:00:00 2033'
                    play_store_min_valid_date = time.mktime(
                        time.strptime(play_store_min, "%a %b %d %H:%M:%S %Y"))

                    for line in stdout.splitlines():
                        if "Owner:" in line:
                            owner = line.split(": ", 1)[1]
                            country = owner.split("C=")
                            if len(country) != 1:
                                country = country[1]
                            else:
                                country = ""
                        if "Issuer:" in line:
                            issuer = line.split(": ", 1)[1]
                        if "Valid from:" in line:
                            valid_from = line.split(": ", 1)[1].split(" until:")[0]
                            valid_to = line.rsplit(": ", 1)[1]
                            valid_from_splitted = valid_from.split(" ")
                            valid_to_splitted = valid_to.split(" ")
                            valid_year_start = int(valid_from_splitted[-1])
                            valid_year_end = int(valid_to_splitted[-1])
                            # Drop the timezone token so strptime can parse the date
                            valid_until = " ".join(valid_to_splitted[:-2] + valid_to_splitted[-1:])
                            valid_until_date = time.mktime(
                                time.strptime(valid_until, "%a %b %d %H:%M:%S %Y"))

                    res_cert = ResultSection("Certificate Analysis", body=safe_str(stdout),
                                             parent=result, body_format=BODY_FORMAT.MEMORY_DUMP)
                    res_cert.add_tag('cert.valid.start', valid_from)
                    res_cert.add_tag('cert.valid.end', valid_to)
                    res_cert.add_tag('cert.issuer', issuer)
                    res_cert.add_tag('cert.owner', owner)

                    if owner == issuer:
                        ResultSection("Certificate is self-signed",
                                      parent=res_cert, heuristic=Heuristic(10))
                    if not country:
                        ResultSection("Certificate owner has no country",
                                      parent=res_cert, heuristic=Heuristic(11))
                    if valid_year_start < 2008:
                        ResultSection("Certificate valid before first Android release",
                                      parent=res_cert, heuristic=Heuristic(12))
                    if valid_year_start > valid_year_end:
                        ResultSection("Certificate expires before validity date starts",
                                      parent=res_cert, heuristic=Heuristic(16))
                    if (valid_year_end - valid_year_start) > 30:
                        ResultSection("Certificate valid more than 30 years",
                                      parent=res_cert, heuristic=Heuristic(13))
                    if valid_until_date < play_store_min_valid_date:
                        ResultSection("Certificate expires before minimum valid Play Store date",
                                      parent=res_cert, heuristic=Heuristic(20))

                    if country:
                        try:
                            int(country)
                            is_int_country = True
                        except Exception:
                            is_int_country = False
                        if len(country) != 2 or is_int_country:
                            ResultSection("Invalid country code in certificate owner",
                                          parent=res_cert, heuristic=Heuristic(14))

                    if f != "CERT.RSA":
                        ResultSection(f"Certificate name not using conventional name: {f}",
                                      parent=res_cert, heuristic=Heuristic(15))

    if not has_cert:
        ResultSection("This APK is not signed", parent=result, heuristic=Heuristic(9))
def run_badging_analysis(self, apk_file: str, result: Result):
    badging_args = ['d', 'badging', apk_file]
    badging, errors = self.run_appt(badging_args)
    if not badging:
        return

    res_badging = ResultSection("Android application details")
    libs = []
    permissions = []
    components = []
    features = []
    pkg_version = None

    for line in badging.splitlines():
        if line.startswith("package:"):
            pkg_name = line.split("name='")[1].split("'")[0]
            pkg_version = line.split("versionCode='")[1].split("'")[0]
            res_badging.add_line(f"Package: {pkg_name} v.{pkg_version}")
            res_badging.add_tag('file.apk.pkg_name', pkg_name)
            res_badging.add_tag('file.apk.app.version', pkg_version)

        if line.startswith("sdkVersion:"):
            min_sdk = line.split(":'")[1][:-1]
            res_badging.add_line(f"Min SDK: {min_sdk}")
            res_badging.add_tag('file.apk.sdk.min', min_sdk)

        if line.startswith("targetSdkVersion:"):
            target_sdk = line.split(":'")[1][:-1]
            res_badging.add_line(f"Target SDK: {target_sdk}")
            res_badging.add_tag('file.apk.sdk.target', target_sdk)

        if line.startswith("application-label:"):
            label = line.split(":'")[1][:-1]
            res_badging.add_line(f"Default Label: {label}")
            res_badging.add_tag('file.apk.app.label', label)

        if line.startswith("launchable-activity:"):
            launch = line.split("name='")[1].split("'")[0]
            res_badging.add_line(f"Launchable activity: {launch}")
            res_badging.add_tag('file.apk.activity', launch)

        if line.startswith("uses-library-not-required:"):
            lib = line.split(":'")[1][:-1]
            if lib not in libs:
                libs.append(lib)

        if line.startswith("uses-permission:") or line.startswith("uses-implied-permission:"):
            perm = line.split("name='")[1].split("'")[0]
            if perm not in permissions:
                permissions.append(perm)

        if line.startswith("provides-component:"):
            component = line.split(":'")[1][:-1]
            if component not in components:
                components.append(component)

        if "uses-feature:" in line or "uses-implied-feature:" in line:
            feature = line.split("name='")[1].split("'")[0]
            if feature not in features:
                features.append(feature)

    if pkg_version is not None:
        pkg_version = int(pkg_version)
        if pkg_version < 15:
            ResultSection("Package version is suspiciously low",
                          parent=res_badging, heuristic=Heuristic(17))
        elif pkg_version > 999999999:
            ResultSection("Package version is suspiciously high",
                          parent=res_badging, heuristic=Heuristic(17))

    if libs:
        res_lib = ResultSection("Libraries used", parent=res_badging)
        for lib in libs:
            res_lib.add_line(lib)
            res_lib.add_tag('file.apk.used_library', lib)

    if permissions:
        res_permissions = ResultSection("Permissions used", parent=res_badging)
        dangerous_permissions = []
        unknown_permissions = []
        for perm in permissions:
            if perm in ALL_ANDROID_PERMISSIONS:
                if 'dangerous' in ALL_ANDROID_PERMISSIONS[perm]:
                    dangerous_permissions.append(perm)
                else:
                    res_permissions.add_line(perm)
                    res_permissions.add_tag('file.apk.permission', perm)
            else:
                unknown_permissions.append(perm)

        if len(set(permissions)) < len(permissions):
            ResultSection("Some permissions are defined more than once",
                          parent=res_badging, heuristic=Heuristic(18))

        if dangerous_permissions:
            res_dangerous_perm = ResultSection("Dangerous permissions used",
                                               parent=res_badging, heuristic=Heuristic(4))
            for perm in dangerous_permissions:
                res_dangerous_perm.add_line(perm)
                res_dangerous_perm.add_tag('file.apk.permission', perm)

        if unknown_permissions:
            res_unknown_perm = ResultSection("Unknown permissions used",
                                             parent=res_badging, heuristic=Heuristic(5))
            for perm in unknown_permissions:
                res_unknown_perm.add_line(perm)
                res_unknown_perm.add_tag('file.apk.permission', perm)

    if features:
        res_features = ResultSection("Features used", parent=res_badging)
        for feature in features:
            res_features.add_line(feature)
            res_features.add_tag('file.apk.feature', feature)

    if components:
        res_components = ResultSection("Components provided", parent=res_badging)
        for component in components:
            res_components.add_line(component)
            res_components.add_tag('file.apk.provides_component', component)

    result.add_section(res_badging)
def find_network_indicators(apktool_out_dir: str, result: Result):
    # Whitelist
    skip_list = [
        "android.intent",
        "com.google",
        "com.android",
    ]

    indicator_whitelist = [
        'google.to', 'google.ttl', 'google.delay',
        'google_tagmanager.db', 'gtm_urls.db', 'gtm.url',
        'google_analytics_v4.db',
        'Theme.Dialog.Alert',
        'popupLocationInfo.gravity', 'popupLocationInfo.displayId',
        'popupLocationInfo.left', 'popupLocationInfo.top',
        'popupLocationInfo.right', 'popupLocationInfo.bottom',
        'googleads.g.doubleclick.net', 'ad.doubleclick.net',
        '.doubleclick.net', '.googleadservices.com', '.googlesyndication.com',
        'android.hardware.type.watch',
        'mraid.js',
        'google_inapp_purchase.db',
        'mobileads.google.com',
        'share_history.xml',
        'activity_choser_model_history.xml',
        'FragmentPager.SavedState{',
        'android.remoteinput.results',
        'android.people', 'android.picture', 'android.icon',
        'android.text', 'android.title', 'android.title.big',
        'FragmentTabHost.SavedState{',
        'libcore.icu.ICU',
    ]

    file_list = []

    # Indicators
    url_list = []
    domain_list = []
    ip_list = []
    email_list = []

    # Build the dynamic whitelist
    smali_dir = os.path.join(apktool_out_dir, "smali")
    for root, dirs, files in os.walk(smali_dir):
        if not files:
            continue
        skip_list.append(root.replace(smali_dir + "/", "").replace("/", "."))
        for cdir in dirs:
            skip_list.append(os.path.join(root, cdir).replace(smali_dir + "/", "").replace("/", "."))

    asset_dir = os.path.join(apktool_out_dir, "assets")
    if os.path.exists(asset_dir):
        for root, dirs, files in os.walk(asset_dir):
            if not files:
                continue
            for asset_file in files:
                file_list.append(asset_file)
    skip_list = list(set(skip_list))

    # Find indicators
    proc = Popen(['grep', '-ER',
                  r'(([[:alpha:]](-?[[:alnum:]])*)\.)*[[:alpha:]](-?[[:alnum:]])+\.[[:alpha:]]{2,}',
                  smali_dir], stdout=PIPE, stderr=PIPE)
    grep, _ = proc.communicate()

    for line in safe_str(grep).splitlines():
        file_path, line = line.split(":", 1)
        if "const-string" in line or "Ljava/lang/String;" in line:
            data = line.split("\"", 1)[1].split("\"")[0]
            data_low = data.lower()
            data_split = data.split(".")
            if data in file_list:
                continue
            elif data in indicator_whitelist:
                continue
            elif data.startswith("/"):
                continue
            elif data_low.startswith("http://") or data_low.startswith('ftp://') \
                    or data_low.startswith('https://'):
                url_list.append(data)
            elif len(data_split[0]) < len(data_split[-1]) and len(data_split[-1]) > 3:
                continue
            elif data.startswith('android.') and data_low != data:
                continue
            elif "/" in data and "." in data and data.index("/") < data.index("."):
                continue
            elif " " in data:
                continue
            elif data_split[0] in ['com', 'org', 'net', 'java']:
                continue
            elif data_split[-1].lower() in ['so', 'properties', 'zip', 'read', 'id', 'store',
                                            'name', 'author', 'sh', 'soccer', 'fitness',
                                            'news', 'video']:
                continue
            elif data.endswith("."):
                continue
            else:
                do_skip = False
                for skip in skip_list:
                    if data.startswith(skip):
                        do_skip = True
                        break
                if do_skip:
                    continue

                data = data.strip(".")
                if is_valid_domain(data):
                    domain_list.append(data)
                elif is_valid_ip(data):
                    ip_list.append(data)
                elif is_valid_email(data):
                    email_list.append(data)

    url_list = list(set(url_list))
    for url in url_list:
        dom_ip = url.split("//")[1].split("/")[0]
        if ":" in dom_ip:
            dom_ip = dom_ip.split(":")[0]
        if is_valid_ip(dom_ip):
            ip_list.append(dom_ip)
        elif is_valid_domain(dom_ip):
            domain_list.append(dom_ip)

    ip_list = list(set(ip_list))
    domain_list = list(set(domain_list))
    email_list = list(set(email_list))

    if url_list or ip_list or domain_list or email_list:
        res_net = ResultSection("Network indicator(s) found", parent=result,
                                heuristic=Heuristic(3))

        if url_list:
            res_url = ResultSection("Found urls in the decompiled code", parent=res_net)
            count = 0
            for url in url_list:
                count += 1
                if count <= 20:
                    res_url.add_line(url)
                    res_url.add_tag('network.static.uri', url)
            if count > 20:
                res_url.add_line(f"and {count - 20} more...")

        if ip_list:
            res_ip = ResultSection("Found IPs in the decompiled code", parent=res_net)
            count = 0
            for ip in ip_list:
                count += 1
                if count <= 20:
                    res_ip.add_line(ip)
                    res_ip.add_tag('network.static.ip', ip)
            if count > 20:
                res_ip.add_line(f"and {count - 20} more...")

        if domain_list:
            res_domain = ResultSection("Found domains in the decompiled code", parent=res_net)
            count = 0
            for domain in domain_list:
                count += 1
                if count <= 20:
                    res_domain.add_line(domain)
                    res_domain.add_tag('network.static.domain', domain)
            if count > 20:
                res_domain.add_line(f"and {count - 20} more...")

        if email_list:
            res_email = ResultSection("Found email addresses in the decompiled code", parent=res_net)
            count = 0
            for email in email_list:
                count += 1
                if count <= 20:
                    res_email.add_line(email)
                    res_email.add_tag('network.email.address', email)
            if count > 20:
                res_email.add_line(f"and {count - 20} more...")
def _report_embedded_xdp(self, file_res, chunk_number, binary, leftover):
    res_section = ResultSection(f"Found {chunk_number} Embedded PDF (in XDP)")
    res_section.set_heuristic(1)
    res_section.add_tag('file.behavior', "Embedded PDF (in XDP)")
    file_res.add_section(res_section)
def peepdf_analysis(self, temp_filename, file_content, request):
    file_res = Result()
    try:
        res_list = []
        # js_stream = []
        f_list = []
        js_dump = []

        pdf_parser = PDFParser()
        ret, pdf_file = pdf_parser.parse(temp_filename, True, False, file_content)
        if ret == 0:
            stats_dict = pdf_file.getStats()

            if ", ".join(stats_dict['Errors']) == "Bad PDF header, %%EOF not found, PDF sections not found, No " \
                                                  "indirect objects found in the body":
                # Not a PDF
                return

            json_body = dict(
                version=stats_dict['Version'],
                binary=stats_dict['Binary'],
                linearized=stats_dict['Linearized'],
                encrypted=stats_dict['Encrypted'],
            )

            if stats_dict['Encryption Algorithms']:
                temp = []
                for algorithmInfo in stats_dict['Encryption Algorithms']:
                    temp.append(f"{algorithmInfo[0]} {str(algorithmInfo[1])} bits")
                json_body["encryption_algorithms"] = temp

            json_body.update(dict(
                updates=stats_dict['Updates'],
                objects=stats_dict['Objects'],
                streams=stats_dict['Streams'],
                comments=stats_dict['Comments'],
                errors={True: ", ".join(stats_dict['Errors']),
                        False: "None"}[len(stats_dict['Errors']) != 0]
            ))
            res = ResultSection("PDF File Information", body_format=BODY_FORMAT.KEY_VALUE,
                                body=json.dumps(json_body))

            for version in range(len(stats_dict['Versions'])):
                stats_version = stats_dict['Versions'][version]
                v_json_body = dict(
                    catalog=stats_version['Catalog'] or "no",
                    info=stats_version['Info'] or "no",
                    objects=self.list_first_x(stats_version['Objects'][1]),
                )

                if stats_version['Compressed Objects'] is not None:
                    v_json_body['compressed_objects'] = self.list_first_x(
                        stats_version['Compressed Objects'][1])

                if stats_version['Errors'] is not None:
                    v_json_body['errors'] = self.list_first_x(stats_version['Errors'][1])

                v_json_body['streams'] = self.list_first_x(stats_version['Streams'][1])

                if stats_version['Xref Streams'] is not None:
                    v_json_body['xref_streams'] = self.list_first_x(stats_version['Xref Streams'][1])

                if stats_version['Object Streams'] is not None:
                    v_json_body['object_streams'] = self.list_first_x(stats_version['Object Streams'][1])

                if int(stats_version['Streams'][0]) > 0:
                    v_json_body['encoded'] = self.list_first_x(stats_version['Encoded'][1])
                    if stats_version['Decoding Errors'] is not None:
                        v_json_body['decoding_errors'] = self.list_first_x(
                            stats_version['Decoding Errors'][1])

                if stats_version['Objects with JS code'] is not None:
                    v_json_body['objects_with_js_code'] = \
                        self.list_first_x(stats_version['Objects with JS code'][1])
                    # js_stream.extend(stats_version['Objects with JS code'][1])

                res_version = ResultSection(f"Version {str(version)}", parent=res,
                                            body_format=BODY_FORMAT.KEY_VALUE,
                                            body=json.dumps(v_json_body))

                actions = stats_version['Actions']
                events = stats_version['Events']
                vulns = stats_version['Vulns']
                elements = stats_version['Elements']
                is_suspicious = False
                if events is not None or actions is not None or vulns is not None or elements is not None:
                    res_suspicious = ResultSection('Suspicious elements', parent=res_version)
                    if events is not None:
                        for event in events:
                            res_suspicious.add_line(f"{event}: {self.list_first_x(events[event])}")
                        is_suspicious = True
                    if actions is not None:
                        for action in actions:
                            res_suspicious.add_line(f"{action}: {self.list_first_x(actions[action])}")
                        is_suspicious = True
                    if vulns is not None:
                        for vuln in vulns:
                            if vuln in vulnsDict:
                                temp = [vuln, ' (']
                                for vulnCVE in vulnsDict[vuln]:
                                    if len(temp) != 2:
                                        temp.append(',')
                                    vulnCVE = "".join(vulnCVE) if isinstance(vulnCVE, list) else vulnCVE
                                    temp.append(vulnCVE)
                                    cve_found = re.search("CVE-[0-9]{4}-[0-9]{4}", vulnCVE)
                                    if cve_found:
                                        res_suspicious.add_tag(
                                            'attribution.exploit',
                                            vulnCVE[cve_found.start():cve_found.end()])
                                        res_suspicious.add_tag(
                                            'file.behavior',
                                            vulnCVE[cve_found.start():cve_found.end()])
                                temp.append('): ')
                                temp.append(str(vulns[vuln]))
                                res_suspicious.add_line(temp)
                            else:
                                res_suspicious.add_line(f"{vuln}: {str(vulns[vuln])}")
                            is_suspicious = True
                    if elements is not None:
                        for element in elements:
                            if element in vulnsDict:
                                temp = [element, ' (']
                                for vulnCVE in vulnsDict[element]:
                                    if len(temp) != 2:
                                        temp.append(',')
                                    vulnCVE = "".join(vulnCVE) if isinstance(vulnCVE, list) else vulnCVE
                                    temp.append(vulnCVE)
                                    cve_found = re.search("CVE-[0-9]{4}-[0-9]{4}", vulnCVE)
                                    if cve_found:
                                        res_suspicious.add_tag(
                                            'attribution.exploit',
                                            vulnCVE[cve_found.start():cve_found.end()])
                                        res_suspicious.add_tag(
                                            'file.behavior',
                                            vulnCVE[cve_found.start():cve_found.end()])
                                temp.append('): ')
                                temp.append(str(elements[element]))
                                res_suspicious.add_line(temp)
                                is_suspicious = True
                            else:
                                res_suspicious.add_line(f"\t\t{element}: {str(elements[element])}")
                                is_suspicious = True
                    if is_suspicious:
                        res_suspicious.set_heuristic(8)

                urls = stats_version['URLs']
                if urls is not None:
                    res.add_line("")
                    res_url = ResultSection('Found URLs', parent=res)
                    for url in urls:
                        res_url.add_line(f"\t\t{url}")
                    res_url.set_heuristic(9)

                for obj in stats_version['Objects'][1]:
                    cur_obj = pdf_file.getObject(obj, version)

                    if cur_obj.containsJScode:
                        cur_res = ResultSection(f"Object [{obj} {version}] contains "
                                                f"{len(cur_obj.JSCode)} block of JavaScript")
                        score_modifier = 0

                        js_idx = 0
                        for js in cur_obj.JSCode:
                            sub_res = ResultSection('Block of JavaScript', parent=cur_res)
                            js_idx += 1
                            js_score = 0
                            js_code, unescaped_bytes, _, _, _ = analyseJS(js)

                            js_dump += [x for x in js_code]

                            # Malicious characteristics
                            big_buffs = self.get_big_buffs("".join(js_code))
                            if len(big_buffs) == 1:
                                js_score += 500 * len(big_buffs)
                            if len(big_buffs) > 0:
                                js_score += 500 * len(big_buffs)
                            has_eval, has_unescape = self.check_dangerous_func("".join(js_code))
                            if has_unescape:
                                js_score += 100
                            if has_eval:
                                js_score += 100

                            js_cmt = ""
                            if has_eval or has_unescape or len(big_buffs) > 0:
                                score_modifier += js_score
                                js_cmt = "Suspiciously malicious "
                                cur_res.add_tag('file.behavior', "Suspicious JavaScript in PDF")
                                sub_res.set_heuristic(7)
                            js_res = ResultSection(f"{js_cmt}JavaScript Code (block: {js_idx})",
                                                   parent=sub_res)

                            if js_score > 0:
                                temp_js_outname = f"object{obj}-{version}_{js_idx}.js"
                                temp_js_path = os.path.join(self.working_directory, temp_js_outname)
                                temp_js_bin = "".join(js_code).encode("utf-8")
                                with open(temp_js_path, "wb") as f:
                                    f.write(temp_js_bin)
                                f_list.append(temp_js_path)
                                js_res.add_line(f"The JavaScript block was saved as {temp_js_outname}")

                                if has_eval or has_unescape:
                                    analysis_res = ResultSection("[Suspicious Functions]", parent=js_res)
                                    if has_eval:
                                        analysis_res.add_line(
                                            "eval: This JavaScript block uses the eval() function,"
                                            " which is often used to launch deobfuscated"
                                            " JavaScript code.")
                                        analysis_res.set_heuristic(3)
                                    if has_unescape:
                                        analysis_res.add_line(
                                            "unescape: This JavaScript block uses the unescape()"
                                            " function. It may be legitimate, but it is definitely"
                                            " suspicious, since malware often uses this to"
                                            " deobfuscate code blocks.")
                                        analysis_res.set_heuristic(3)

                                buff_idx = 0
                                for buff in big_buffs:
                                    buff_idx += 1
                                    error, new_buff = unescape(buff)
                                    if error == 0:
                                        buff = new_buff

                                    if buff not in unescaped_bytes:
                                        temp_path_name = None
                                        if ";base64," in buff[:100] and "data:" in buff[:100]:
                                            temp_path_name = f"obj{obj}_unb64_{buff_idx}.buff"
                                            try:
                                                buff = b64decode(buff.split(";base64,")[1].strip())
                                                temp_path = os.path.join(self.working_directory,
                                                                         temp_path_name)
                                                with open(temp_path, "wb") as f:
                                                    f.write(buff)
                                                f_list.append(temp_path)
                                            except Exception:
                                                self.log.error("Found 'data:;base64, ' buffer "
                                                               "but failed to base64 decode.")
                                                temp_path_name = None

                                        if temp_path_name is not None:
                                            buff_cond = f" and was resubmitted as {temp_path_name}"
                                        else:
                                            buff_cond = ""
                                        buff_res = ResultSection(
                                            f"A {len(buff)} bytes buffer was found in the JavaScript "
                                            f"block{buff_cond}. Here are the first 256 bytes.",
                                            parent=js_res,
                                            body=hexdump(bytes(buff[:256], "utf-8")),
                                            body_format=BODY_FORMAT.MEMORY_DUMP)
                                        buff_res.set_heuristic(2)

                            processed_sc = []
                            sc_idx = 0
                            for sc in unescaped_bytes:
                                if sc not in processed_sc:
                                    sc_idx += 1
                                    processed_sc.append(sc)
                                    try:
                                        sc = sc.decode("hex")
                                    except Exception:
                                        pass

                                    shell_score = 500
                                    temp_path_name = f"obj{obj}_unescaped_{sc_idx}.buff"

                                    shell_res = ResultSection(
                                        f"Unknown unescaped {len(sc)} bytes JavaScript "
                                        f"buffer (id: {sc_idx}) was resubmitted as "
                                        f"{temp_path_name}. Here are the first 256 bytes.",
                                        parent=js_res)
                                    shell_res.set_body(hexdump(sc[:256]),
                                                       body_format=BODY_FORMAT.MEMORY_DUMP)

                                    temp_path = os.path.join(self.working_directory, temp_path_name)
                                    with open(temp_path, "wb") as f:
                                        f.write(sc)
                                    f_list.append(temp_path)

                                    cur_res.add_tag('file.behavior', "Unescaped JavaScript Buffer")
                                    shell_res.set_heuristic(6)
                                    score_modifier += shell_score

                        if score_modifier > 0:
                            res_list.append(cur_res)

                    elif cur_obj.type == "stream":
                        if cur_obj.isEncodedStream and cur_obj.filter is not None:
                            data = cur_obj.decodedStream
                            encoding = cur_obj.filter.value.replace("[", "").replace("]", "") \
                                .replace("/", "").strip()
                        else:
                            data = cur_obj.rawStream
                            encoding = None
                        val = cur_obj.rawValue
                        otype = cur_obj.elements.get("/Type", None)
                        sub_type = cur_obj.elements.get("/Subtype", None)
                        length = cur_obj.elements.get("/Length", None)

                        if otype:
                            otype = otype.value.replace("/", "").lower()
                        if sub_type:
                            sub_type = sub_type.value.replace("/", "").lower()
                        if length:
                            length = length.value

                        if otype == "embeddedfile":
                            if len(data) > 4096:
                                if encoding is not None:
                                    temp_encoding_str = f"_{encoding}"
                                else:
                                    temp_encoding_str = ""

                                cur_res = ResultSection(
                                    f'Embedded file found ({length} bytes) [obj: {obj} {version}] '
                                    f'and dumped for analysis {f"(Type: {otype}) " if otype is not None else ""}'
                                    f'{f"(SubType: {sub_type}) " if sub_type is not None else ""}'
                                    f'{f"(Encoded with {encoding})" if encoding is not None else ""}')

                                temp_path_name = f"EmbeddedFile_{obj}{temp_encoding_str}.obj"
                                temp_path = os.path.join(self.working_directory, temp_path_name)
                                with open(temp_path, "wb") as f:
                                    f.write(data)
                                f_list.append(temp_path)

                                cur_res.add_line(f"The EmbeddedFile object was saved as {temp_path_name}")
                                res_list.append(cur_res)

                        elif otype not in BANNED_TYPES:
                            cur_res = ResultSection(
                                f'Unknown stream found [obj: {obj} {version}] '
                                f'{f"(Type: {otype}) " if otype is not None else ""}'
                                f'{f"(SubType: {sub_type}) " if sub_type is not None else ""}'
                                f'{f"(Encoded with {encoding})" if encoding is not None else ""}')
                            for line in val.splitlines():
                                cur_res.add_line(line)

                            emb_res = ResultSection('First 256 bytes', parent=cur_res)
                            first_256 = data[:256]
                            if isinstance(first_256, str):
                                first_256 = first_256.encode()
                            emb_res.set_body(hexdump(first_256), BODY_FORMAT.MEMORY_DUMP)
                            res_list.append(cur_res)
                    else:
                        pass

            file_res.add_section(res)

            for results in res_list:
                file_res.add_section(results)

            if js_dump:
                js_dump_res = ResultSection('Full JavaScript dump')

                temp_js_dump = "javascript_dump.js"
                temp_js_dump_path = os.path.join(self.working_directory, temp_js_dump)
                try:
                    temp_js_dump_bin = "\n\n----\n\n".join(js_dump).encode("utf-8")
                except UnicodeDecodeError:
                    temp_js_dump_bin = "\n\n----\n\n".join(js_dump)
                temp_js_dump_sha1 = hashlib.sha1(temp_js_dump_bin).hexdigest()
                with open(temp_js_dump_path, "wb") as f:
                    f.write(temp_js_dump_bin)
                    f.flush()
                f_list.append(temp_js_dump_path)

                js_dump_res.add_line(f"The JavaScript dump was saved as {temp_js_dump}")
                js_dump_res.add_line(f"The SHA-1 for the JavaScript dump is {temp_js_dump_sha1}")
                js_dump_res.add_tag('file.pdf.javascript.sha1', temp_js_dump_sha1)
                file_res.add_section(js_dump_res)

            for filename in f_list:
                request.add_extracted(filename, os.path.basename(filename),
                                      f"Dumped from {os.path.basename(temp_filename)}")

        else:
            res = ResultSection("ERROR: Could not parse file with PeePDF.")
            file_res.add_section(res)
    finally:
        request.result = file_res
        try:
            del pdf_file
        except Exception:
            pass
        try:
            del pdf_parser
        except Exception:
            pass
        gc.collect()
def execute(self, request):
    request.result = Result()

    # 1. Calculate the entropy map
    with open(request.file_path, 'rb') as fin:
        (entropy, part_entropies) = calculate_partition_entropy(fin)

    entropy_graph_data = {
        'type': 'colormap',
        'data': {
            'domain': [0, 8],
            'values': part_entropies
        }
    }
    ResultSection(f"File entropy: {round(entropy, 3)}", parent=request.result,
                  body_format=BODY_FORMAT.GRAPH_DATA,
                  body=json.dumps(entropy_graph_data))

    if request.file_type == "meta/shortcut/windows":
        # 2. Parse Windows shortcuts
        self.parse_link(request.result, request.file_path)
    else:
        # 3. Get hachoir metadata
        parser = createParser(request.file_path)
        if parser is not None:
            with parser:
                tags = parser.getParserTags()
                parser_id = tags.get('id', 'unknown')

                # Do basic metadata extraction
                metadata = extractMetadata(parser, 1)

            if metadata:
                kv_body = {}
                tags = []
                for m in metadata:
                    if m.key == "comment":
                        for v in m.values:
                            key, val = get_type_val(v.text, "comment")
                            if not val:
                                continue
                            kv_body[key] = val
                            tag_type = TAG_MAP.get(parser_id, {}).get(key, None) or \
                                TAG_MAP.get(None, {}).get(key, None)
                            if tag_type is not None:
                                tags.append((tag_type, val))
                    elif m.key in ["mime_type"]:
                        pass
                    else:
                        values = [v.text for v in m.values]
                        if len(values) == 1 and values[0]:
                            kv_body[m.key] = values[0]
                        elif values:
                            kv_body[m.key] = values
                        for v in values:
                            tag_type = TAG_MAP.get(parser_id, {}).get(m.key, None) or \
                                TAG_MAP.get(None, {}).get(m.key, None)
                            if tag_type is not None:
                                tags.append((tag_type, v))

                if kv_body:
                    res = ResultSection(
                        f"Metadata extracted by hachoir-metadata [Parser: {parser_id}]",
                        body=json.dumps(kv_body),
                        body_format=BODY_FORMAT.KEY_VALUE,
                        parent=request.result)
                    for t_type, t_val in tags:
                        res.add_tag(t_type, t_val)

    # 4. Get ExifTool metadata
    exif = subprocess.run(["exiftool", "-j", request.file_path], capture_output=True, check=False)
    if exif.stdout:
        exif_data = json.loads(exif.stdout.decode('utf-8', errors="ignore"))
        res_data = exif_data[0]
        if "Error" not in res_data:
            exif_body = {
                build_key(k): v for k, v in res_data.items()
                if v and k not in [
                    "SourceFile", "ExifToolVersion", "FileName", "Directory",
                    "FileSize", "FileModifyDate", "FileAccessDate",
                    "FileInodeChangeDate", "FilePermissions", "FileType",
                    "FileTypeExtension", "MIMEType"
                ]
            }
            if exif_body:
                e_res = ResultSection("Metadata extracted by ExifTool",
                                      body=json.dumps(exif_body),
                                      body_format=BODY_FORMAT.KEY_VALUE,
                                      parent=request.result)
                for k, v in exif_body.items():
                    tag_type = TAG_MAP.get(res_data.get("FileTypeExtension", "UNK").upper(), {}).get(k, None) or \
                        TAG_MAP.get(None, {}).get(k, None)
                    if tag_type:
                        e_res.add_tag(tag_type, v)
def analyze_pdf(self, request, res_txt, path, working_dir, heur, additional_keywords, get_malform=True): """Extract metadata, keyword objects and content of interest from a PDF sample using PDFId, PDFId plugins, and PDF Parser. Args: request: AL request object. res_txt: Header string for AL result section title. path: Original PDF sample path. working_dir: AL working directory. heur: List of plugins to run on PDFId results (provided in service configuration). additional_keywords: List of additional keywords to be searched (provided in service configuration). get_malform: Extract malformed objects from PDF. Returns: AL result object, AL heuristics list to add to result, list of object streams (objstms), and an errors list. """ triage_keywords = set() all_errors = set() embed_present = False objstms = False res = ResultSection(title_text=res_txt) carved_extracted_shas = set() if request.deep_scan: run_pdfparse = True else: run_pdfparse = False # Run PDFId try: pdfid_result, errors = self.get_pdfid(path, additional_keywords, heur, request.deep_scan) except Exception as e: raise NonRecoverableError(e) # Parse PDFId results pdfidres = ResultSection(title_text="PDFID Results", parent=res) if len(pdfid_result) == 0: pdfidres.add_line( "No results generated for file. Please see errors.") else: # Do not run for objstms, which are being analyzed when get_malform == False if get_malform: version = pdfid_result.get("PDFID", None) if version: pdfidres.add_line(version[0]) properties = pdfid_result.get("Properties", None) if properties: pres = ResultSection(title_text="PDF Properties", parent=pdfidres) for plist in properties: pres.add_line("{0}: {1}".format(plist[0], plist[1])) if plist[0] == "/ModDate": pres.add_tag('file.pdf.date.modified', plist[1]) elif plist[0] == "/CreationDate": pres.add_tag('file.date.creation', plist[1]) elif plist[0] == "/LastModified": pres.add_tag('file.date.last_modified', plist[1]) elif plist[0] == "/SourceModified": pres.add_tag('file.pdf.date.source_modified', plist[1]) elif plist[0] == "/pdfx": pres.add_tag('file.pdf.date.pdfx', plist[1]) entropy = pdfid_result.get("Entropy", None) if entropy: enres = ResultSection(title_text="Entropy", parent=pdfidres) for enlist in entropy: enres.add_line("{0}: {1}, ({2})".format( enlist[0], enlist[1], enlist[2])) flags = pdfid_result.get("Flags", None) if flags: fres = ResultSection(title_text="PDF Keyword Flags", parent=pdfidres) for flist in flags: if flist[0] == "/ObjStm": objstms = True if len(flist) == 3: fres.add_line( "{0}:Count: {1}, Hex-Encoded Count: {2}".format( flist[0], flist[1], flist[2])) else: fres.add_line("{0}:Count: {1}".format( flist[0], flist[1])) fres.add_tag('file.string.extracted', flist[0].replace("/", "", 1)) if flist[0] in additional_keywords: triage_keywords.add(flist[0].replace("/", "", 1)) plugin = pdfid_result.get("Plugin", []) # If any plugin results, or flagged keywords found, run PDF Parser if plugin or len(triage_keywords) > 0: run_pdfparse = True for pllist in plugin: pl_name, pl_heur, pl_text = pllist pl_heur = int(pl_heur) pl_text = pl_text[14:] if not pl_text or pl_text == "None": continue if pl_name in ['EmbeddedFile', 'Name Obfuscation']: modres = ResultSection(title_text=pl_text, parent=pdfidres) if pl_heur > 0: modres.set_heuristic(pl_heur) if pl_name == 'EmbeddedFile': embed_present = True elif pl_name in ['Triage', 'Suspicious Properties']: javascript_found = False for line in pl_text.splitlines(): lineres = ResultSection(title_text=line) # Triage results if '/JavaScript' in line: 
triage_keywords.add('JavaScript') if not javascript_found: lineres.set_heuristic(19) javascript_found = True elif '/JS' in line: triage_keywords.add('JS') if not javascript_found: lineres.set_heuristic(19) javascript_found = True elif '/JBIG2Decode' in line: triage_keywords.add('JBIG2Decode') lineres.set_heuristic(3) elif '/Colors > 2^24' in line: triage_keywords.add('Colors > 2^24') lineres.set_heuristic(20) elif '/AA' in line: triage_keywords.add('AA') lineres.set_heuristic(1) elif '/Launch' in line: triage_keywords.add('Launch') lineres.set_heuristic(1) elif '/OpenAction' in line: triage_keywords.add('OpenAction') lineres.set_heuristic(1) elif '/GoToE' in line: triage_keywords.add('GoToE') lineres.set_heuristic(21) elif '/GoToR' in line: triage_keywords.add('GoToR') lineres.set_heuristic(22) elif '/Encrypt' in line: triage_keywords.add('Encrypt') lineres.set_heuristic(11) elif '/AcroForm' in line: triage_keywords.add('AcroForm') lineres.set_heuristic(4) elif '/RichMedia' in line: triage_keywords.add('RichMedia') lineres.set_heuristic(5) elif '/XFA' in line: triage_keywords.add('XFA') lineres.set_heuristic(23) elif '/Annot' in line: triage_keywords.add('Annot') lineres.set_heuristic(25) elif '/ObjStm' in line: triage_keywords.add('ObjStm') lineres.set_heuristic(7) elif '/URI' in line: triage_keywords.add('URI') lineres.set_heuristic(24) # Suspicious properties results elif "eof2" in line: lineres.set_heuristic(2) elif "eof5" in line: lineres.set_heuristic(17) elif "page" in line: lineres.set_heuristic(26) elif "entropy" in line: lineres.set_heuristic(12) elif "obj/endobj" in line: lineres.set_heuristic(13) elif "stream/endstream" in line: lineres.set_heuristic(14) if lineres.heuristic is not None: pdfidres.add_subsection(lineres) for e in errors: all_errors.add(e) if e.startswith('Error running plugin'): self.log.warn(e) if run_pdfparse: # CALL PDF parser and extract further information pdf_parserres = ResultSection(title_text="PDF Parser Results") # STATISTICS # Do not run for objstms, which are being analyzed when get_malform == False if get_malform: options = { "stats": True, } pdf_parser_result, errors = self.get_pdf_parser( path, working_dir, options) if pdf_parser_result: if len(pdf_parser_result) == 0: pdf_parserres.add_line( "No statistical results generated for file. Please see errors." 
) else: version = pdf_parser_result.get("version", None) if version and version[0] != '0': pdf_parserres.add_line(version[0]) stats = pdf_parser_result.get("stats", None) if stats: sres = ResultSection( title_text="PDF Statistcs", parent=pdf_parserres, body_format=BODY_FORMAT.MEMORY_DUMP) for p in stats: sres.add_line(p) for e in errors: all_errors.add(e) # Triage plugin -- search sample for keywords and carve content or extract object (if it contains a stream) carved_content = {} # Format { "objnum": [{keyword: content list}} obj_extract_triage = set() jbig_objs = set() for keyword in triage_keywords: # ObjStms handled differently if keyword == 'ObjStm': continue options = { "search": keyword, } pdf_parser_result, errors = self.get_pdf_parser( path, working_dir, options) if pdf_parser_result: for p in pdf_parser_result['parts']: content = "" references = [] # Trailer will be extracted anyways, try and grab all references anyways -- will be messy if p.startswith("trailer:"): # Grab the content after the keyword # Check that keyword actually in content if "/{}".format(keyword) in p: try: content = p.split(keyword, 1)[1].replace( '>>++>>', '').split("/", 1)[0].strip() references = re.findall( "[0-9]* [0-9]* R", content) except Exception: continue # If not trailer, should be object elif 'Referencing:' in p: # Grab the content after the keyword if '>>++>>' in p: try: content = p.split(keyword, 1)[1].replace( '>>++>>', '').strip() except Exception: try: content = p.split("\n", 3)[3] except Exception: content = p else: try: content = p.split("\n", 3)[3] except Exception: content = p # Sometimes the content is the same keyword with references (i.e "/URI /URI 10 0 R" if content.startswith("/{}".format(keyword)): try: content = re.sub("/{}[ ]*".format(keyword), "", content, 1) except Exception: pass try: references = p.split("\n", 3)[2].replace( 'Referencing:', '').strip().split(", ") except Exception: pass # Only extract JBIG2Decode objects with deep scan, but always report on their presence if keyword == "JBIG2Decode" and "/Filter" in p and "Contains stream" in p: try: objnum = p.split("\n", 1)[0].split(" ")[1] if request.deep_scan: obj_extract_triage.add(objnum) jbig_objs.add(objnum) continue except Exception as e: self.log.debug(e) continue # If no content, then keyword likely points to reference objects, so grab those if content == '': if len(references) > 0: content = references else: # Something is wrong, drop it. continue else: while True: # Multiple references might be in a list, i.e. /Annot # # R vs. /Annots [# # R # # R] islist = re.match( r"[s]?[ ]?\[([0-9]* [0-9]* R[ \\rn]{0,8})*\]", content) if islist: content = re.sub( r"[\[\]]", "", islist.group(0).replace( "s ", '').replace("R ", "R,")).split(",") break # References might be with instructions, i.e. 
        # [# # R /FitH null]
        withinst = re.match(r"[s]?[ \\']{0,3}\[[ ]?([0-9]* [0-9]* R)[ \\rn]{1,8}"
                            r"[/a-zA-Z0-9 ]*[ ]?\]", content)
        if withinst:
            content = [withinst.group(1)]
            break
        content = [content]
        break

    for c in content:
        # If keyword = Javascript and content starts with '/JS', disregard as 'JS' will be extracted
        if "JS" in triage_keywords and keyword == "JavaScript" and "/JS" in c[0:5]:
            continue
        if c in references or re.match("[0-9]* [0-9]* R", c):
            try:
                ref_obj = c.split(" ", 1)[0]
                options = {
                    "object": ref_obj,
                    "get_object_detail": True
                }
                pdf_parser_subresult, err = self.get_pdf_parser(path, working_dir, options)
                if pdf_parser_subresult:
                    for sub_p in pdf_parser_subresult['parts']:
                        sub_references = sub_p.split("\n", 3)[2].replace('Referencing:', '') \
                            .strip().split(", ")
                        ptyp = sub_p.split("\n", 2)[1].replace('Type:', '').strip().replace("/", "")
                        # If the object contains a stream, extract the object.
                        if "Contains stream" in sub_p:
                            try:
                                objnum = sub_p.split("\n", 1)[0].split(" ")[1]
                                obj_extract_triage.add(objnum)
                            except Exception:
                                pass
                        # Or if the object Type is the keyword, grab all referenced objects.
                        elif sub_references[0] != '' and len(sub_references) >= 1 and ptyp == keyword:
                            for sr in sub_references:
                                try:
                                    objnum = sr.split(" ", 1)[0]
                                    obj_extract_triage.add(objnum)
                                except Exception:
                                    pass
                        # If not, extract the object detail into carved output.
                        elif pdf_parser_subresult['obj_details'] != "":
                            try:
                                objnum = sub_p.split("\n", 1)[0].split(" ")[1]
                                if objnum in carved_content:
                                    carved_content[objnum] \
                                        .append({keyword: pdf_parser_subresult['obj_details']})
                                else:
                                    carved_content[objnum] = \
                                        [{keyword: pdf_parser_subresult['obj_details']}]
                            except Exception:
                                continue
                for e in err:
                    errors.add(e)
            except Exception:
                # If none of that works, just extract the original object for examination.
                try:
                    objnum = p.split("\n", 1)[0].split(" ")[1]
                    obj_extract_triage.add(objnum)
                except Exception:
                    pass
        # If content does not look like a reference:
        else:
            if p.startswith("trailer:"):
                continue
            objnum = p.split("\n", 1)[0].split(" ")[1]
            # If the object contains a stream, extract the object.
            if p.split("\n", 4)[3] == "Contains stream":
                obj_extract_triage.add(objnum)
            else:
                # Or just carve the content.
                if objnum in carved_content:
                    carved_content[objnum].append({keyword: c})
                else:
                    carved_content[objnum] = [{keyword: c}]

for e in errors:
    all_errors.add(e)

# Add carved content to result output
show_content_of_interest = False
if len(carved_content) > 0 or len(jbig_objs) > 0:
    carres = ResultSection(title_text="Content of Interest")
else:
    carres = None

if len(jbig_objs) > 0:
    jbigres = ResultSection(title_text="The following Object IDs are JBIG2DECODE streams:",
                            body_format=BODY_FORMAT.MEMORY_DUMP,
                            parent=carres)
    jbigres.add_line(', '.join(map(str, jbig_objs)))
    show_content_of_interest = True

if len(carved_content) > 0:
    for k, l in sorted(carved_content.items()):
        for d in l:
            for keyw, con in d.items():
                subres = ResultSection(title_text="Object {0}: Hits for Keyword '{1}':".format(k, keyw))
                subres.set_heuristic(8)
                con_bytes = con.encode()
                if len(con) < 500:
                    subres.body_format = BODY_FORMAT.MEMORY_DUMP
                    subres.add_line(con)
                    # Check for IOC content
                    patterns = PatternMatch()
                    st_value = patterns.ioc_match(con_bytes, bogon_ip=True)
                    if len(st_value) > 0:
                        carres.add_subsection(subres)
                        show_content_of_interest = True
                        for ty, val in st_value.items():
                            if val == "":
                                asc_asc = unicodedata.normalize('NFKC', val).encode('ascii', 'ignore')
                                subres.add_tag(ty, asc_asc)
                            else:
                                ulis = list(set(val))
                                for v in ulis:
                                    subres.add_tag(ty, v)
                else:
                    crv_sha = hashlib.sha256(con_bytes).hexdigest()
                    if crv_sha not in carved_extracted_shas:
                        f_name = "carved_content_obj_{}_{}".format(k, crv_sha[0:7])
                        subres.add_lines([
                            "Content over 500 bytes; it will be extracted for analysis",
                            "Name: {} - SHA256: {}".format(f_name, crv_sha)
                        ])
                        carres.add_subsection(subres)
                        show_content_of_interest = True
                        crvf = os.path.join(self.working_directory, f_name)
                        with open(crvf, 'wb') as f:
                            f.write(con_bytes)
                        request.add_extracted(crvf, os.path.basename(crvf),
                                              "Extracted content from object {}".format(k))
                        carved_extracted_shas.add(crv_sha)

if show_content_of_interest:
    pdf_parserres.add_subsection(carres)

# ELEMENTS
# Do not show for objstms
if get_malform:
    if request.deep_scan:
        options = {
            "verbose": True,
            "nocanonicalizedoutput": True,
            "get_malform": get_malform
        }
    elif embed_present:
        options = {
            "verbose": True,
            "elements": "ctsi",
            "type": "/EmbeddedFile",
            "get_malform": get_malform
        }
    else:
        options = {
            "verbose": True,
            "elements": "cst",
            "get_malform": get_malform
        }
    pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)

    embed_extracted = set()
    if pdf_parser_result:
        if len(pdf_parser_result) == 0:
            pdf_parserres.add_line("No structure information generated for file. Please see errors.")
        else:
            # PDF Parser will write any malformed content over 100 bytes to a file
            files = pdf_parser_result.get("files", None)
            if files:
                for f, l in files.items():
                    if f == 'malformed':
                        if len(l) > 0:
                            pdf_parserres.set_heuristic(6)
                        for i in l:
                            request.add_extracted(i, os.path.basename(i),
                                                  "Extracted malformed content in PDF Parser Analysis.")

            parts = pdf_parser_result.get("parts", None)
            # The Extract service will extract the sample's embedded files.
            # However, we want to make note of them so that they are not extracted again below.
            if parts:
                for p in sorted(parts):
                    if "Type: /EmbeddedFile" in p:
                        getobj = p.split("\n", 1)[0].split(" ")[1]
                        embed_extracted.add(getobj)

    # Extract objects collected from the above analysis
    obj_to_extract = obj_extract_triage - embed_extracted - jbig_objs

    if len(obj_to_extract) > 0:
        options = {
            "filter": True,
            "object": obj_to_extract,
            "dump": "extracted_obj_",
        }
        pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)
        if pdf_parser_result:
            files = pdf_parser_result.get("files", None)
            extracted_files = []
            if files:
                for f, l in files.items():
                    if f == 'embedded':
                        for i in l:
                            f_name = os.path.basename(i)
                            obj_id = f_name.replace("extracted_obj_", "")
                            extracted_files.append("Extracted object {} as {}".format(obj_id, f_name))
                            request.add_extracted(i, f_name,
                                                  "Object {} extracted in PDF Parser Analysis.".format(obj_id))
            for e in errors:
                all_errors.add(e)

            if extracted_files:
                extract_res = ResultSection(title_text="Extracted embedded objects",
                                            parent=pdf_parserres)
                extract_res.set_heuristic(9)
                extract_res.add_lines(extracted_files)

    # Extract jbig2decode objects in deep scan mode
    if request.deep_scan and len(jbig_objs) > 0:
        options = {
            "object": jbig_objs,
            "dump": "extracted_jb_obj_",
        }
        pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)
        if pdf_parser_result:
            extracted_jb = []
            files = pdf_parser_result.get("files", None)
            if files:
                for f, l in files.items():
                    if f == 'embedded':
                        for i in l:
                            f_name = os.path.basename(i)
                            obj_id = f_name.replace("extracted_jb_obj_", "")
                            extracted_jb.append("JBIG2DECODE object {} extracted as {}".format(obj_id, f_name))
                            request.add_extracted(i, f_name,
                                                  "JBIG2DECODE object {} extracted in PDF Parser Analysis.".format(obj_id))
            for e in errors:
                all_errors.add(e)

            if extracted_jb:
                jbig_extract_res = ResultSection(title_text="Extracted JBIG2Decode objects",
                                                 parent=pdf_parserres)
                jbig_extract_res.set_heuristic(9)
                jbig_extract_res.add_lines(extracted_jb)

if len(pdf_parserres.subsections) > 0:
    res.add_subsection(pdf_parserres)

return res, objstms, all_errors
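
# Illustrative sketch (not part of the service): the carving logic above writes each payload to
# disk at most once by tracking SHA256 digests (carved_extracted_shas). The standalone helper
# below shows the same pattern; `carve_payload` and `seen_hashes` are hypothetical names.
import hashlib
import os

def carve_payload(content: bytes, obj_id: str, out_dir: str, seen_hashes: set) -> str:
    """Write carved object content to disk once per unique SHA256; return the path or ''."""
    sha = hashlib.sha256(content).hexdigest()
    if sha in seen_hashes:
        return ""  # identical content was already carved from another object
    seen_hashes.add(sha)
    path = os.path.join(out_dir, f"carved_content_obj_{obj_id}_{sha[:7]}")
    with open(path, "wb") as f:
        f.write(content)
    return path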
def execute(self, request):
    # ==================================================================
    # Execute a request:
    #   Every time your service receives a new file to scan, the execute function is called.
    #   This is where you should execute your processing code.
    #   For the purpose of this example, we will only generate results...
    # ==================================================================

    # Check if we're scanning an embedded file.
    # This service always drops three embedded files: two generate random results and the other
    # generates empty results. We check here whether we're scanning one of those embedded files.
    # In a normal service this is not something you would do at all, but since we are using this
    # service in our unit tests to test all features of our report generator, we have to do this.
    if request.sha256 not in ['d729ecfb2cf40bc4af8038dac609a57f57dbe6515d35357af973677d5e66417a',
                              '5ce5ae8ef56a54af2c44415800a81ecffd49a33ae8895dfe38fc1075d3f619ec',
                              'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06']:
        # Main file results...

        # ==================================================================
        # Write the results:
        #   First, create a result object where all the result sections will be saved to
        result = Result()

        # ==================================================================
        # Standard text section: BODY_FORMAT.TEXT - DEFAULT
        #   Text sections basically just dump the text to the screen...
        #   All section scores will be SUMmed in the service result.
        #   The Result classification will be the highest classification found in the sections.
        text_section = ResultSection('Example of a default section')
        # You can add lines to your section one at a time
        #   Here we will generate a random line
        text_section.add_line(get_random_phrase())
        # Or you can add them from a list
        #   Here we will generate a random amount of random lines
        text_section.add_lines([get_random_phrase() for _ in range(random.randint(1, 5))])
        # If the section needs to affect the score of the file, you need to set a heuristic.
        #   Here we will pick one at random.
        #   In addition to adding a heuristic, we will associate a signature with the heuristic;
        #   we do this by adding the signature name to the heuristic. (Here we are generating a random name.)
        text_section.set_heuristic(3, signature="sig_one")
        # You can attach attack ids to heuristics after they were defined
        text_section.heuristic.add_attack_id("T1066")
        # Same thing for signatures: they can be added to the heuristic after the fact, and you can
        #   even say how many times the signature fired by setting its frequency. If you call
        #   add_signature_id twice with the same signature, this will effectively increase the
        #   frequency of the signature.
        text_section.heuristic.add_signature_id("sig_two", score=20, frequency=2)
        text_section.heuristic.add_signature_id("sig_two", score=20, frequency=3)
        text_section.heuristic.add_signature_id("sig_three")
        text_section.heuristic.add_signature_id("sig_three")
        text_section.heuristic.add_signature_id("sig_four", score=0)

        # The heuristic for text_section should have the following properties:
        #   1. One attack ID: T1066
        #   2. Four signatures: sig_one, sig_two, sig_three and sig_four
        #   3. Signature frequencies are cumulative, therefore they will be as follows:
        #        - sig_one = 1
        #        - sig_two = 5
        #        - sig_three = 2
        #        - sig_four = 1
        #   4. The score used for each signature is driven by the following precedence: the
        #      signature_score_map has the highest priority, then the score value passed to
        #      add_signature_id, and finally the default heuristic score. Therefore the scores used
        #      to calculate the total score for the text_section are as follows:
        #        - sig_one: 10   -> heuristic default score
        #        - sig_two: 20   -> score provided by the function add_signature_id
        #        - sig_three: 30 -> score provided by the heuristic map
        #        - sig_four: 40  -> score provided by the heuristic map because it has higher
        #                           priority than the function score
        #   5. Total section score is then: 1x10 + 5x20 + 2x30 + 1x40 = 210

        # Make sure you add your section to the result
        result.add_section(text_section)

        # ==================================================================
        # Color map section: BODY_FORMAT.GRAPH_DATA
        #   Creates a color map bar using a minimum and maximum domain
        #   e.g. We are using this section to display the entropy distribution in some services
        cmap_min = 0
        cmap_max = 20
        color_map_data = {
            'type': 'colormap',
            'data': {
                'domain': [cmap_min, cmap_max],
                'values': [random.random() * cmap_max for _ in range(50)]
            }
        }
        # The classification of a section can be set to any valid classification for your system
        section_color_map = ResultSection("Example of colormap result section",
                                          body_format=BODY_FORMAT.GRAPH_DATA,
                                          body=json.dumps(color_map_data),
                                          classification=cl_engine.RESTRICTED)
        result.add_section(section_color_map)

        # ==================================================================
        # URL section: BODY_FORMAT.URL
        #   Generates a list of clickable urls using a JSON-encoded format.
        #   As you can see here, the body of the section can be set directly instead of line by line.
        random_host = get_random_host()
        url_section = ResultSection('Example of a simple url section',
                                    body_format=BODY_FORMAT.URL,
                                    body=json.dumps({"name": "Random url!",
                                                     "url": f"https://{random_host}/"}))

        # Since urls are very important features, we can tag those features in the system so they
        #   are easy to find. Tags are defined by a type and a value.
        url_section.add_tag("network.static.domain", random_host)

        # You may also want to provide a list of urls!
        #   Also, no need to provide a name; the url link will be displayed.
        host1 = get_random_host()
        host2 = get_random_host()
        ip1 = get_random_ip()
        ip2 = get_random_ip()
        ip3 = get_random_ip()
        urls = [
            {"url": f"https://{host1}/"},
            {"url": f"https://{host2}/"},
            {"url": f"https://{ip1}/"},
            {"url": f"https://{ip2}/"},
            {"url": f"https://{ip3}/"}]

        # A heuristic can fire more than once without being associated to a signature
        url_heuristic = Heuristic(4, frequency=len(urls))

        url_sub_section = ResultSection('Example of a url section with multiple links',
                                        body=json.dumps(urls),
                                        body_format=BODY_FORMAT.URL,
                                        heuristic=url_heuristic)
        url_sub_section.add_tag("network.static.ip", ip1)
        url_sub_section.add_tag("network.static.ip", ip2)
        url_sub_section.add_tag("network.static.ip", ip3)
        url_sub_section.add_tag("network.static.domain", host1)
        url_sub_section.add_tag("network.dynamic.domain", host2)
        # Since url_sub_section is a sub-section of url_section,
        #   we add it as a sub-section of url_section, not to the main result itself.
        url_section.add_subsection(url_sub_section)
        result.add_section(url_section)

        # ==================================================================
        # Memory dump section: BODY_FORMAT.MEMORY_DUMP
        #   Dumps whatever string content you have into a <pre/> html tag so you can do your own formatting
        data = hexdump(b"This is some random text that we will format as an hexdump and you'll see "
                       b"that the hexdump formatting will be preserved by the memory dump section!")
        memdump_section = ResultSection('Example of a memory dump section',
                                        body_format=BODY_FORMAT.MEMORY_DUMP,
                                        body=data)
        memdump_section.set_heuristic(random.randint(1, 4))
        result.add_section(memdump_section)

        # ==================================================================
        # KEY_VALUE section:
        #   This section allows the service writer to list a bunch of key/value pairs to be displayed
        #   in the UI while also providing easy-to-parse data for automated tools.
        #   NB: You should definitely use this over a JSON body type since this one will be displayed
        #       correctly in the UI for the user.
        #   The body argument must be a json dumps of a dictionary (only str, int, and booleans are allowed).
        kv_body = {
            "a_str": "Some string",
            "a_bool": False,
            "an_int": 102,
        }
        kv_section = ResultSection('Example of a KEY_VALUE section',
                                   body_format=BODY_FORMAT.KEY_VALUE,
                                   body=json.dumps(kv_body))
        result.add_section(kv_section)

        # ==================================================================
        # JSON section:
        #   Re-uses the JSON editor we use for administration (https://github.com/josdejong/jsoneditor)
        #   to display a tree view of JSON results.
        #   NB: Use this sparingly! As a service developer you should do your best to include important
        #       results as their own result sections.
        #   The body argument must be a json dump of a python dictionary.
        json_body = {
            "a_str": "Some string",
            "a_list": ["a", "b", "c"],
            "a_bool": False,
            "an_int": 102,
            "a_dict": {
                "list_of_dict": [
                    {"d1_key": "val", "d1_key2": "val2"},
                    {"d2_key": "val", "d2_key2": "val2"}
                ],
                "bool": True
            }
        }
        json_section = ResultSection('Example of a JSON section',
                                     body_format=BODY_FORMAT.JSON,
                                     body=json.dumps(json_body))
        result.add_section(json_section)

        # ==================================================================
        # PROCESS_TREE section:
        #   This section allows the service writer to list a bunch of dictionary objects that have
        #   nested lists of dictionaries to be displayed in the UI. Each dictionary object represents
        #   a process, and therefore each dictionary must be of the following format:
        #   {
        #     "process_pid": int,
        #     "process_name": str,
        #     "command_line": str,
        #     "children": []    NB: This list is either empty or contains more dictionaries that
        #                           have the same structure
        #   }
        nc_body = [
            {
                "process_pid": 123,
                "process_name": "evil.exe",
                "command_line": "C:\\evil.exe",
                "signatures": {},
                "children": [
                    {
                        "process_pid": 321,
                        "process_name": "takeovercomputer.exe",
                        "command_line": "C:\\Temp\\takeovercomputer.exe -f do_bad_stuff",
                        "signatures": {"one": 250},
                        "children": [
                            {
                                "process_pid": 456,
                                "process_name": "evenworsethanbefore.exe",
                                "command_line": "C:\\Temp\\evenworsethanbefore.exe -f change_reg_key_cuz_im_bad",
                                "signatures": {"one": 10, "two": 10, "three": 10},
                                "children": []
                            },
                            {
                                "process_pid": 234,
                                "process_name": "badfile.exe",
                                "command_line": "C:\\badfile.exe -k nothing_to_see_here",
                                "signatures": {"one": 1000, "two": 10, "three": 10, "four": 10, "five": 10},
                                "children": []
                            }
                        ]
                    },
                    {
                        "process_pid": 345,
                        "process_name": "benignexe.exe",
                        "command_line": "C:\\benignexe.exe -f \"just kidding, i'm evil\"",
                        "signatures": {"one": 2000},
                        "children": []
                    }
                ]
            },
            {
                "process_pid": 987,
                "process_name": "runzeroday.exe",
                "command_line": "C:\\runzeroday.exe -f insert_bad_spelling",
                "signatures": {},
                "children": []
            }
        ]
        nc_section = ResultSection('Example of a PROCESS_TREE section',
                                   body_format=BODY_FORMAT.PROCESS_TREE,
                                   body=json.dumps(nc_body))
        result.add_section(nc_section)

        # ==================================================================
        # TABLE section:
        #   This section allows the service writer to have their content displayed in a table format
        #   in the UI. The body argument must be a list [] of dict {} objects. A dict object can have
        #   a key/value pair where the value is a flat nested dictionary, and this nested dictionary
        #   will be displayed as a nested table within a cell.
        table_body = [
            {
                "a_str": "Some string1",
                "extra_column_here": "confirmed",
                "a_bool": False,
                "an_int": 101,
            },
            {
                "a_str": "Some string2",
                "a_bool": True,
                "an_int": 102,
            },
            {
                "a_str": "Some string3",
                "a_bool": False,
                "an_int": 103,
            },
            {
                "a_str": "Some string4",
                "a_bool": None,
                "an_int": -1000000000000000000,
                "extra_column_there": "confirmed",
                "nested_table": {
                    "a_str": "Some string3",
                    "a_bool": False,
                    "nested_table_thats_too_deep": {
                        "a_str": "Some string3",
                        "a_bool": False,
                        "an_int": 103,
                    },
                },
            },
        ]
        table_section = ResultSection('Example of a TABLE section',
                                      body_format=BODY_FORMAT.TABLE,
                                      body=json.dumps(table_body))
        result.add_section(table_section)

        # ==================================================================
        # Re-submitting files to the system
        #   Adding extracted files will have them resubmitted to the system for analysis.

        # This file will generate random results on the next run
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "wb") as myfile:
            myfile.write(data.encode())
        request.add_extracted(temp_path, "file.txt", "Extracted by some magic!")

        # Embedded files can also have their own classification!
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "wb") as myfile:
            myfile.write(b"CLASSIFIED!!!__" + data.encode())
        request.add_extracted(temp_path, "classified.doc", "Classified file ... don't look",
                              classification=cl_engine.RESTRICTED)

        # This file will generate empty results on the next run
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "wb") as myfile:
            myfile.write(b"EMPTY")
        request.add_extracted(temp_path, "empty.txt", "Extracted empty resulting file")

        # ==================================================================
        # Supplementary files
        #   Adding supplementary files will save them on the datastore for future
        #   reference, but won't reprocess those files.
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "w") as myfile:
            myfile.write(json.dumps(urls))
        request.add_supplementary(temp_path, "urls.json", "These are urls as a JSON file")
        # Like embedded files, you can add more than one supplementary file
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "w") as myfile:
            myfile.write(json.dumps(json_body))
        request.add_supplementary(temp_path, "json_body.json", "This is the json_body as a JSON file")

        # ==================================================================
        # Wrap-up:
        #   Save your result object back into the request
        request.result = result

    # ==================================================================
    # Empty results file
    elif request.sha256 == 'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06':
        # Create an empty result object
        request.result = Result()

    # ==================================================================
    # Randomized results file
    else:
        # For the randomized results file, we will completely randomize the results.
        #   The content of those results does not matter since we've already shown you
        #   all the different result sections, tagging, heuristics and file upload functions.
        embedded_result = Result()

        # Add a random number of sections
        for _ in range(random.randint(1, 3)):
            embedded_result.add_section(self._create_random_section())

        request.result = embedded_result
def execute(self, request):
    parser = eml_parser.eml_parser.EmlParser(include_raw_body=True,
                                             include_attachment_data=True)
    content_str = request.file_contents

    # Attempt conversion of potential Outlook file -> eml
    if request.file_type == "document/office/email":
        try:
            content_str = msg2eml(request.file_path).as_bytes()
        except Exception:
            # Try using mailparser to convert
            converted_path, _ = msgconvert(request.file_path)
            content_str = open(converted_path, "rb").read()

    header_agg = {"From": set(), "To": set(), "Cc": set(),
                  "Sent": set(), "Reply-To": set(), "Date": set()}

    # Assume this is an email saved in HTML format
    if request.file_type == "code/html":
        parsed_html = BeautifulSoup(content_str, "lxml")
        valid_headers = ["To:", "Cc:", "Sent:", "From:", "Subject:", "Reply-To:"]

        if not parsed_html.body or not any(header in parsed_html.body.text for header in valid_headers):
            # We can assume this is just an HTML doc (or lacking a body), which we can't process
            request.result = Result()
            return

        # Can't trust 'Date' to determine the difference between HTML docs vs HTML emails
        valid_headers.append("Date:")

        html_email = email.message_from_bytes(content_str)
        generator_metadata_content = ""
        for meta in parsed_html.find_all("meta"):
            if meta.attrs.get("name", None) == "Generator":
                generator_metadata_content = meta.attrs.get("content", "")
                break

        # Process HTML emails generated from Outlook
        if generator_metadata_content == "Microsoft Word 15":
            paragraphs = parsed_html.body.find_all("p")
            # Likely an email that was exported with original email headers
            if any(header in paragraphs[0].text for header in valid_headers):
                for p in paragraphs:
                    if any(valid_header in p.text for valid_header in valid_headers):
                        h_key, h_value = p.text.replace("\xa0", "").replace("\r\n", " ").split(":", 1)
                        html_email[h_key] = h_value
                        # Subject line indicates the end of the email header, beginning of body
                        if "Subject" in p.text:
                            break
        # Process HTML emails from MS Exchange Server or missing top-level headers (aggregate headers)
        elif (generator_metadata_content == "Microsoft Word 15 (filtered medium)"
              or generator_metadata_content == "Microsoft Exchange Server"
              or generator_metadata_content == ""):
            subject = None
            for div in parsed_html.find_all("div"):
                # Header information within divs
                if (any(header in div.text for header in valid_headers)
                        and "WordSection1" not in div.attrs.get("class", [])):
                    # Usually expect headers to be \n separated in text output, but check first
                    if "\n" in div.text:
                        for h in div.text.split("\n"):
                            if any(header in h for header in valid_headers):
                                h_key, h_value = h.split(":", 1)
                                # Implying some malformed message got mixed with the headers of another message
                                if h_key not in valid_headers:
                                    for header in valid_headers:
                                        if header in h:
                                            h_key = header[:-1]
                                # Use the latest message's subject (this maintains FW:, RE:, etc.)
                                if h_key == "Subject" and not subject:
                                    subject = h_value
                                elif h_key != "Subject":
                                    header_agg[h_key].add(h_value)
                    # Document was probably not well formatted, so we'll use the headers as delimiters
                    else:
                        header_offset_map = {}
                        # Determine the position of each header
                        for header in list(header_agg.keys()) + ["Subject"]:
                            if header in div.text:
                                header_offset_map[div.text.index(header)] = header
                        # Use the positions and the length of the header name to determine an offset
                        for i in range(len(header_offset_map)):
                            sorted_keys = sorted(header_offset_map.keys())
                            header_name = header_offset_map[sorted_keys[i]]
                            offset = len(f"{header_name}: ") + sorted_keys[i]
                            value = (div.text[offset:sorted_keys[i + 1]]
                                     if i < len(header_offset_map) - 1 else div.text[offset:])
                            if header_name == "Subject":
                                subject = value
                            else:
                                header_agg[header_name].add(value)

            # Assign aggregated info to the email object
            html_email["Subject"] = subject
            for key, value in header_agg.items():
                html_email[key] = "; ".join(value)
        content_str = html_email.as_bytes()

    parsed_eml = parser.decode_email_bytes(content_str)
    result = Result()
    header = parsed_eml["header"]

    if "from" in header or "to" in header:
        all_uri = set()
        body_words = set(extract_passwords(header["subject"]))
        for body_counter, body in enumerate(parsed_eml["body"]):
            # Explicit parser to match the BeautifulSoup call above
            body_text = BeautifulSoup(body["content"], "lxml").text
            body_words.update(extract_passwords(body_text))
            if request.get_param("extract_body_text"):
                fd, path = mkstemp()
                with open(path, "w") as f:
                    f.write(body["content"])
                os.close(fd)
                request.add_extracted(path, "body_" + str(body_counter), "Body text")
            if "uri" in body:
                for uri in body["uri"]:
                    all_uri.add(uri)
        # Words in the email body, used by Extract to guess passwords
        request.temp_submission_data["email_body"] = list(body_words)

        kv_section = ResultSection("Email Headers",
                                   body_format=BODY_FORMAT.KEY_VALUE,
                                   parent=result)

        # Basic tags
        from_addr = header["from"].strip() if header.get("from", None) else None
        if from_addr and re.match(EMAIL_REGEX, from_addr):
            kv_section.add_tag("network.email.address", from_addr)
        for to in header["to"]:
            if re.match(EMAIL_REGEX, to.strip()):
                kv_section.add_tag("network.email.address", to.strip())
        kv_section.add_tag("network.email.date", str(header["date"]).strip())

        subject = header["subject"].strip() if header.get("subject", None) else None
        if subject:
            kv_section.add_tag("network.email.subject", subject)

        # Add CCs to body and tags
        if "cc" in header:
            for cc in header["cc"]:
                if re.match(EMAIL_REGEX, cc.strip()):
                    kv_section.add_tag("network.email.address", cc.strip())

        # Add Message ID to body and tags
        if "message-id" in header["header"]:
            kv_section.add_tag("network.email.msg_id", header["header"]["message-id"][0].strip())

        # Add tags for received IPs
        if "received_ip" in header:
            for ip in header["received_ip"]:
                ip = ip.strip()
                try:
                    if isinstance(ip_address(ip), IPv4Address):
                        kv_section.add_tag("network.static.ip", ip)
                except ValueError:
                    pass

        # Add tags for received domains
        if "received_domain" in header:
            for dom in header["received_domain"]:
                kv_section.add_tag("network.static.domain", dom.strip())

        # If we've found URIs, add them to a section
        if len(all_uri) > 0:
            uri_section = ResultSection("URIs Found:", parent=result)
            for uri in all_uri:
                uri_section.add_line(uri)
                uri_section.add_tag("network.static.uri", uri.strip())
                parsed_url = urlparse(uri)
                if parsed_url.hostname and re.match(IP_ONLY_REGEX, parsed_url.hostname):
                    uri_section.add_tag("network.static.ip", parsed_url.hostname)
                else:
                    uri_section.add_tag("network.static.domain", parsed_url.hostname)

        # Bring all headers together...
        extra_header = header.pop("header", {})
        header.pop("received", None)
        header.update(extra_header)

        # Convert to a common format
        header["date"] = [self.json_serial(header["date"])]

        # Replace with aggregated date(s) if any are available
        if header_agg["Date"]:
            # Replace
            if any(default_date in header["date"]
                   for default_date in ["1970-01-01T00:00:00", "Thu, 01 Jan 1970 00:00:00 +0000"]):
                header["date"] = list(header_agg["Date"])
            # Append
            else:
                header["date"] += list(header_agg["Date"])
            # NB: a plain loop here; a bare generator expression would never execute
            for date in header_agg["Date"]:
                kv_section.add_tag("network.email.date", str(date).strip())

        # Filter out useless headers from results
        self.log.debug(header.keys())
        for h in self.header_filter:
            if h in header.keys():
                header.pop(h)
        kv_section.set_body(json.dumps(header, default=self.json_serial))

        attachments_added = []
        if "attachment" in parsed_eml:
            attachments = parsed_eml["attachment"]
            for attachment in attachments:
                fd, path = mkstemp()
                with open(path, "wb") as f:
                    f.write(base64.b64decode(attachment["raw"]))
                os.close(fd)
                try:
                    if request.add_extracted(path, attachment["filename"], "Attachment ",
                                             safelist_interface=self.api_interface):
                        attachments_added.append(attachment["filename"])
                except MaxExtractedExceeded:
                    self.log.warning(f"Extract limit reached on attachments: "
                                     f"{len(attachments) - len(attachments_added)} not added")
                    break
            ResultSection("Extracted Attachments:",
                          body="\n".join([x for x in attachments_added]),
                          parent=result)

        if request.get_param("save_emlparser_output"):
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            attachments = parsed_eml.get("attachment", [])
            # Remove raw attachments; all attachments up to MaxExtractedExceeded were already extracted
            for attachment in attachments:
                _ = attachment.pop("raw", None)
            with os.fdopen(fd, "w") as myfile:
                myfile.write(json.dumps(parsed_eml, default=self.json_serial))
            request.add_supplementary(temp_path, "parsing.json",
                                      "These are the raw results of running GOVCERT-LU's eml_parser")
    else:
        self.log.warning("emlParser could not parse EML; no useful information in result's headers")

    request.result = result
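
# Illustrative sketch (not part of the service): the positional header parsing above
# (header_offset_map) slices each header's value out of a flat text blob by locating the next
# known header. `split_headers_by_offset` is a hypothetical standalone version of the same idea.
def split_headers_by_offset(text: str, header_names: list) -> dict:
    """Locate each known header in `text` and slice its value up to the next header."""
    offsets = {text.index(name): name for name in header_names if name in text}
    positions = sorted(offsets)
    parsed = {}
    for i, pos in enumerate(positions):
        name = offsets[pos]
        start = pos + len(f"{name}: ")
        end = positions[i + 1] if i < len(positions) - 1 else len(text)
        parsed[name] = text[start:end].strip()
    return parsed

# e.g. split_headers_by_offset("From: a@b.c To: d@e.f Subject: hi", ["From", "To", "Subject"])
#      returns {"From": "a@b.c", "To": "d@e.f", "Subject": "hi"}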
def execute(self, request):
    parser = eml_parser.eml_parser.EmlParser(include_raw_body=True,
                                             include_attachment_data=True)

    # Validate URLs in the sample, stripping out [] if found
    content_str = request.file_contents.decode(errors="ignore")
    content_str, retry = self.validate_urls(content_str)
    while retry:
        content_str, retry = self.validate_urls(content_str)

    parsed_eml = parser.decode_email_bytes(content_str.encode())
    result = Result()
    header = parsed_eml['header']

    if "from" in header:
        all_uri = set()

        for body_counter, body in enumerate(parsed_eml['body']):
            if request.get_param('extract_body_text'):
                fd, path = mkstemp()
                with open(path, 'w') as f:
                    f.write(body['content'])
                os.close(fd)
                request.add_extracted(path, "body_" + str(body_counter), "Body text")
            if "uri" in body:
                for uri in body['uri']:
                    all_uri.add(uri)

        kv_section = ResultSection('Email Headers',
                                   body_format=BODY_FORMAT.KEY_VALUE,
                                   parent=result)

        # Basic tags
        kv_section.add_tag("network.email.address", header['from'].strip())
        for to in header['to']:
            kv_section.add_tag("network.email.address", to)
        kv_section.add_tag("network.email.date", str(header['date']).strip())
        kv_section.add_tag("network.email.subject", header['subject'].strip())

        # Add CCs to body and tags
        if 'cc' in header:
            for cc in header['cc']:
                kv_section.add_tag("network.email.address", cc.strip())

        # Add Message ID to body and tags
        if 'message-id' in header['header']:
            kv_section.add_tag("network.email.msg_id", header['header']['message-id'][0].strip())

        # Add tags for received IPs
        if 'received_ip' in header:
            for ip in header['received_ip']:
                kv_section.add_tag('network.static.ip', ip.strip())

        # Add tags for received domains
        if 'received_domain' in header:
            for dom in header['received_domain']:
                kv_section.add_tag('network.static.domain', dom.strip())

        # If we've found URIs, add them to a section
        if len(all_uri) > 0:
            uri_section = ResultSection('URIs Found:', parent=result)
            for uri in all_uri:
                uri_section.add_line(uri)
                uri_section.add_tag('network.static.uri', uri.strip())
                parsed_url = urlparse(uri)
                if parsed_url.hostname and re.match(IP_ONLY_REGEX, parsed_url.hostname):
                    uri_section.add_tag('network.static.ip', parsed_url.hostname)
                else:
                    uri_section.add_tag('network.static.domain', parsed_url.hostname)

        # Bring all headers together...
        extra_header = header.pop('header', {})
        header.pop('received', None)
        header.update(extra_header)
        kv_section.body = json.dumps(header, default=self.json_serial)

        if "attachment" in parsed_eml:
            for attachment in parsed_eml['attachment']:
                fd, path = mkstemp()
                with open(path, 'wb') as f:
                    f.write(base64.b64decode(attachment['raw']))
                os.close(fd)
                request.add_extracted(path, attachment['filename'], "Attachment ")
            ResultSection('Extracted Attachments:',
                          body="\n".join([x['filename'] for x in parsed_eml['attachment']]),
                          parent=result)

        if request.get_param('save_emlparser_output'):
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "w") as myfile:
                myfile.write(json.dumps(parsed_eml, default=self.json_serial))
            request.add_supplementary(temp_path, "parsing.json",
                                      "These are the raw results of running GOVCERT-LU's eml_parser")
    else:
        text_section = ResultSection('EML parsing results')
        text_section.add_line("Could not parse EML")
        result.add_section(text_section)

    request.result = result
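
# Illustrative sketch (not part of the service): the retry loop above re-runs validate_urls until
# it stops reporting changes, i.e. until the text reaches a fixed point. A generic version of that
# pattern, with a hypothetical `normalise` callable returning (new_text, changed):
def apply_until_stable(text, normalise):
    """Re-apply a text transform until it no longer changes its input."""
    text, changed = normalise(text)
    while changed:
        text, changed = normalise(text)
    return text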
def execute(self, request):
    # --- Setup ----------------------------------------------------------------------------------------------
    request.result = Result()
    patterns = PatternMatch()

    if request.deep_scan:
        max_attempts = 100
    else:
        max_attempts = 10

    self.files_extracted = set()
    self.hashes = set()
    before = set()

    # --- Pre-Processing --------------------------------------------------------------------------------------
    # Get all IOCs prior to de-obfuscation
    pat_values = patterns.ioc_match(request.file_contents, bogon_ip=True, just_network=False)
    if pat_values:
        if request.get_param('extract_original_iocs'):
            ioc_res = ResultSection("The following IOCs were found in the original file",
                                    parent=request.result,
                                    body_format=BODY_FORMAT.MEMORY_DUMP)
        else:
            ioc_res = None
        for k, val in pat_values.items():
            if val == "":
                asc_asc = unicodedata.normalize('NFKC', val).encode('ascii', 'ignore')
                if ioc_res:
                    ioc_res.add_line(f"Found {k.upper().replace('.', ' ')}: {safe_str(asc_asc)}")
                    ioc_res.add_tag(k, asc_asc)
                before.add((k, asc_asc))
            else:
                for v in val:
                    if ioc_res:
                        ioc_res.add_line(f"Found {k.upper().replace('.', ' ')}: {safe_str(v)}")
                        ioc_res.add_tag(k, v)
                    before.add((k, v))

    # --- Prepare Techniques ----------------------------------------------------------------------------------
    techniques = [
        ('MSOffice Embedded script', self.msoffice_embedded_script_string),
        ('CHR and CHRB decode', self.chr_decode),
        ('String replace', self.string_replace),
        ('Powershell carets', self.powershell_carets),
        ('Array of strings', self.array_of_strings),
        ('Fake array vars', self.vars_of_fake_arrays),
        ('Reverse strings', self.str_reverse),
        ('B64 Decode', self.b64decode_str),
        ('Simple XOR function', self.simple_xor_function),
    ]
    second_pass = [
        ('Concat strings', self.concat_strings),
        ('MSWord macro vars', self.mswordmacro_vars),
        ('Powershell vars', self.powershell_vars),
        ('Charcode hex', self.charcode_hex),
    ]
    final_pass = [
        ('Charcode', self.charcode),
    ]

    code_extracts = [
        ('.*html.*', "HTML scripts extraction", self.extract_htmlscript),
    ]

    layers_list = []
    layer = request.file_contents

    # --- Stage 1: Script Extraction --------------------------------------------------------------------------
    for pattern, name, func in code_extracts:
        if re.match(re.compile(pattern), request.task.file_type):
            extracted_parts = func(request.file_contents)
            layer = b"\n".join(extracted_parts).strip()
            layers_list.append((name, layer))
            break

    # --- Stage 2: Deobfuscation ------------------------------------------------------------------------------
    idx = 0
    first_pass_len = len(techniques)
    layers_count = len(layers_list)
    while True:
        if idx > max_attempts:
            final_pass.extend(techniques)
            for name, technique in final_pass:
                res = technique(layer)
                if res:
                    layers_list.append((name, res))
            break
        for name, technique in techniques:
            res = technique(layer)
            if res:
                layers_list.append((name, res))
                # Looks like it worked, restart with the new layer
                layer = res
        # If the layers haven't changed in a pass, break
        if layers_count == len(layers_list):
            if len(techniques) != first_pass_len:
                final_pass.extend(techniques)
                for name, technique in final_pass:
                    res = technique(layer)
                    if res:
                        layers_list.append((name, res))
                break
            else:
                for x in second_pass:
                    techniques.insert(0, x)
        layers_count = len(layers_list)
        idx += 1

    # --- Compiling results -----------------------------------------------------------------------------------
    if len(layers_list) > 0:
        extract_file = False
        num_layers = len(layers_list)
        heur_id = None

        # Compute heuristic
        if num_layers < 5:
            heur_id = 1
        elif num_layers < 10:
            heur_id = 2
        elif num_layers < 50:
            heur_id = 3
        elif num_layers < 100:
            heur_id = 4
        elif num_layers >= 100:
            heur_id = 5

        # Clean up the final layer
        clean = self.clean_up_final_layer(layers_list[-1][1])
        if clean != request.file_contents:
            # Check for new IOCs
            pat_values = patterns.ioc_match(clean, bogon_ip=True, just_network=False)
            diff_tags = {}
            for k, val in pat_values.items():
                if val == "":
                    asc_asc = unicodedata.normalize('NFKC', val).encode('ascii', 'ignore')
                    if (k, asc_asc) not in before:
                        diff_tags.setdefault(k, [])
                        diff_tags[k].append(asc_asc)
                else:
                    for v in val:
                        if (k, v) not in before:
                            diff_tags.setdefault(k, [])
                            diff_tags[k].append(v)

            if request.deep_scan or (len(clean) > 1000 and heur_id >= 4) or diff_tags:
                extract_file = True

            # Display obfuscation steps
            mres = ResultSection("De-obfuscation steps taken by DeobfuScripter",
                                 parent=request.result)
            if heur_id:
                mres.set_heuristic(heur_id)

            lcount = Counter([x[0] for x in layers_list])
            for l, c in lcount.items():
                mres.add_line(f"{l}, {c} time(s).")

            # Display the final layer
            byte_count = 5000
            if extract_file:
                # Save the extracted file
                byte_count = 500
                fn = f"{request.file_name}_decoded_final"
                fp = os.path.join(self.working_directory, fn)
                with open(fp, 'wb') as dcf:
                    dcf.write(clean)
                self.log.debug(f"Submitted dropped file for analysis: {fp}")
                request.add_extracted(fp, fn, "Final deobfuscation layer")

            ResultSection(f"First {byte_count} bytes of the final layer:",
                          body=safe_str(clean[:byte_count]),
                          body_format=BODY_FORMAT.MEMORY_DUMP,
                          parent=request.result)

            # Display new IOCs from the final layer
            if len(diff_tags) > 0:
                ioc_new = ResultSection("New IOCs found after de-obfuscation",
                                        parent=request.result,
                                        body_format=BODY_FORMAT.MEMORY_DUMP)
                has_network_heur = False
                for ty, val in diff_tags.items():
                    for v in val:
                        if "network" in ty:
                            has_network_heur = True
                        ioc_new.add_line(f"Found {ty.upper().replace('.', ' ')}: {safe_str(v)}")
                        ioc_new.add_tag(ty, v)
                if has_network_heur:
                    ioc_new.set_heuristic(7)
                else:
                    ioc_new.set_heuristic(6)

            if len(self.files_extracted) > 0:
                ext_file_res = ResultSection("The following files were extracted during the deobfuscation",
                                             heuristic=Heuristic(8),
                                             parent=request.result)
                for f in self.files_extracted:
                    ext_file_res.add_line(os.path.basename(f))
                    request.add_extracted(f, os.path.basename(f),
                                          "File of interest deobfuscated from sample")
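
# Illustrative sketch (not part of the service): the staged loop above keeps applying decoding
# techniques until a full pass produces no new layer. A condensed version of that control flow;
# `peel_layers` and its arguments are hypothetical names (each technique returns decoded bytes
# or None when it does not apply):
def peel_layers(data, techniques, max_attempts=10):
    layers = []
    for _ in range(max_attempts):
        before = len(layers)
        for name, technique in techniques:
            res = technique(data)
            if res:
                layers.append((name, res))
                data = res  # keep decoding on top of the new layer
        if len(layers) == before:
            break  # no technique fired during this pass; nothing left to peel
    return layers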
def check_file_name_anomalies(self, filename):
    """Filename anomalies detection"""
    is_double_ext, f_ext = self.fna_check_double_extension(filename)
    is_empty_filename = self.fna_check_empty_filename(filename, f_ext)
    too_many_whitespaces = self.fna_check_filename_ws(filename, f_ext)
    has_unicode_ext_hiding_ctrls = self.fna_check_unicode_bidir_ctrls(filename, f_ext)

    file_res = Result()
    if too_many_whitespaces or is_double_ext or has_unicode_ext_hiding_ctrls or is_empty_filename:
        res = ResultSection(title_text="File Name Anomalies", parent=file_res)

        # Tag filename as it might be of interest
        res.add_tag("file.name.extracted", filename)

        # Remove Unicode controls, if any, for reporting
        fn_no_controls = "".join(
            c for c in filename
            if c not in ["\u202E", "\u202B", "\u202D", "\u202A", "\u200E", "\u200F"]
        )
        # Also add a line with the "actual" file name
        res.add_line(f"Actual file name: {wrap_bidir_unicode_string(fn_no_controls)}")

        if too_many_whitespaces:
            sec = ResultSection("Too many whitespaces", parent=res, heuristic=Heuristic(1))
            sec.add_tag("file.name.anomaly", "TOO_MANY_WHITESPACES")
            sec.add_tag("file.behavior", "File name has too many whitespaces")

        if is_double_ext:
            sec = ResultSection("Double file extension", parent=res, heuristic=Heuristic(2))
            sec.add_tag("file.name.anomaly", "DOUBLE_FILE_EXTENSION")
            sec.add_tag("file.behavior", "Double file extension")

        if has_unicode_ext_hiding_ctrls:
            sec = ResultSection("Hidden launchable file extension", parent=res, heuristic=Heuristic(3))
            sec.add_tag("file.name.anomaly", "UNICODE_EXTENSION_HIDING")
            sec.add_tag("file.behavior", "Real file extension hidden using unicode trickery")

        if is_empty_filename:
            sec = ResultSection("Empty Filename", parent=res, heuristic=Heuristic(4))
            sec.add_tag("file.name.anomaly", "FILENAME_EMPTY_OR_ALL_SPACES")
            sec.add_tag("file.behavior", "File name is empty or all whitespaces")

    return file_res
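
# Illustrative sketch (not part of the service): the Unicode bidirectional-control stripping used
# above, as a standalone helper. `strip_bidir_controls` is a hypothetical name; the character set
# is the same RLO/RLE/LRO/LRE/LRM/RLM set the service filters out.
BIDIR_CTRLS = {"\u202E", "\u202B", "\u202D", "\u202A", "\u200E", "\u200F"}

def strip_bidir_controls(filename: str) -> str:
    """Remove bidirectional control characters so the real file extension becomes visible."""
    return "".join(c for c in filename if c not in BIDIR_CTRLS)

# e.g. "invoice\u202Efdp.exe" renders as "invoiceexe.pdf" in many UIs;
#      strip_bidir_controls returns "invoicefdp.exe", exposing the real .exe extension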