def generate_results(self, presults, result, analysis_results, request):
    if presults['unpacked']:
        result.add_section(
            ResultSection("Successfully unpacked binary.", heuristic=Heuristic(1)))
        for r in presults['unpacked_samples']:
            if len(r['malware_id']) > 0:
                for rm in r['malware_id']:
                    section = ResultSection("{} - {}".format(r['sha256'], rm['name']),
                                            heuristic=Heuristic(2))
                    section.add_line("Details: {}".format(rm['reference']))
                    result.add_section(section)
            request.add_extracted(r['data_path'], r['sha256'],
                                  f'Unpacked from {request.sha256}')
        result.add_section(
            ResultSection("UNPACME Detailed Results",
                          body_format=BODY_FORMAT.JSON,
                          body=json.dumps(analysis_results['results'])))
    return result, request
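# Illustrative only: a minimal sketch of the presults structure generate_results() consumes,
# inferred from the field accesses above; the real UnpacMe payload may carry more fields.
example_presults = {
    'unpacked': True,
    'unpacked_samples': [{
        'sha256': 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855',
        'data_path': '/tmp/unpacked.bin',  # hypothetical path
        'malware_id': [{'name': 'ExampleFamily',
                        'reference': 'https://example.com/family-writeup'}],
    }],
}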
def execute(self, request):
    result = Result()
    file_path = request.file_path
    # Run clamscan on the file (list args instead of a shell string, so paths with
    # spaces or shell metacharacters cannot break the command)
    p1 = subprocess.Popen(
        ["clamscan", "-a", "-z", "--detect-pua", "--alert-macros", file_path],
        stdout=subprocess.PIPE)
    p1.wait()
    stdout = p1.communicate()[0].decode("utf-8")
    report = stdout.split("\n")
    report = list(filter(None, report))
    text_section = ResultSection("Successfully scanned the file")
    if "FOUND" in report[0]:
        text_section.set_heuristic(1)
    for line in report:
        text_section.add_line(line)
    result.add_section(text_section)
    request.result = result
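# Illustrative only: the clamscan stdout lines the parser above expects. A detection line ends
# with "FOUND", so checking the first report line catches single-file scans.
example_report = [
    "/tmp/sample.exe: Eicar-Signature FOUND",
    "----------- SCAN SUMMARY -----------",
    "Infected files: 1",
]
assert "FOUND" in example_report[0]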
def run_analysis(self, quark_out, result):
    with open(quark_out) as f:
        data = json.load(f)
    self.manage_threat_level(data, result)
    dic_report_crime = {}
    crimes_section = ResultSection("Crimes detected")
    # Order crimes by confidence: 100% first, then 80%, with 60% kept at the end
    crimes_array = []
    counter = 0
    for i in range(len(data['crimes'])):
        if data['crimes'][i]['confidence'] == "80%":
            crimes_array.insert(len(crimes_array) - counter, data['crimes'][i])
        if data['crimes'][i]['confidence'] == "100%":
            crimes_array.insert(0, data['crimes'][i])
        if data['crimes'][i]['confidence'] == "60%":
            counter += 1
            crimes_array.insert(len(crimes_array), data['crimes'][i])
    # Only report crimes with 80% or 100% confidence
    for i in range(len(crimes_array)):
        if crimes_array[i]['confidence'] in ["80%", "100%"]:
            crime = crimes_array[i]["crime"]
            dic_report_crime[crime] = ResultSection(crime, parent=crimes_section)
            dic_report_crime[crime].add_line(
                "confidence level : {0}".format(crimes_array[i]["confidence"]))
            if len(crimes_array[i]['permissions']) > 0:
                perm_section = ResultSection("permissions associated with the crime",
                                             parent=dic_report_crime[crime],
                                             body_format=BODY_FORMAT.MEMORY_DUMP)
                for permission in crimes_array[i]['permissions']:
                    perm_section.add_line(permission)
            if len(crimes_array[i]['native_api']) > 0:
                native_api_section = ResultSection("native_api",
                                                   parent=dic_report_crime[crime],
                                                   body_format=BODY_FORMAT.MEMORY_DUMP)
                for api in crimes_array[i]["native_api"]:
                    native_api_section.add_line("class : {0}".format(api["class"]))
                    native_api_section.add_line("method : {0}".format(api["method"]))
    result.add_section(crimes_section)
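# Illustrative only: the subset of the Quark-Engine JSON report that run_analysis() reads,
# with hypothetical values; real reports carry additional fields.
example_quark_report = {
    'crimes': [{
        'crime': 'Send location via SMS',
        'confidence': '100%',
        'permissions': ['android.permission.SEND_SMS'],
        'native_api': [{'class': 'Landroid/telephony/SmsManager;',
                        'method': 'sendTextMessage'}],
    }]
}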
def execute(self, request): """Main Module. See README for details.""" max_size = self.config.get('MAX_PDF_SIZE', 3000000) request.result = result = Result() if (os.path.getsize(request.file_path) or 0) < max_size or request.deep_scan: path = request.file_path working_dir = self.working_directory # CALL PDFID and identify all suspicious keyword streams additional_keywords = self.config.get('ADDITIONAL_KEYS', []) heur = deepcopy(self.config.get('HEURISTICS', [])) all_errors = set() res_txt = "Main Document Results" res, contains_objstms, errors = self.analyze_pdf( request, res_txt, path, working_dir, heur, additional_keywords) result.add_section(res) for e in errors: all_errors.add(e) # ObjStms: Treat all ObjStms like a standalone PDF document if contains_objstms: objstm_files = self.analyze_objstm(path, working_dir, request.deep_scan) obj_cnt = 1 for osf in objstm_files: parent_obj = os.path.basename(osf).split("_")[1] res_txt = "ObjStream Object {0} from Parent Object {1}".format( obj_cnt, parent_obj) # It is going to look suspicious as the service created the PDF heur = [ x for x in heur if 'plugin_suspicious_properties' not in x and 'plugin_embeddedfile' not in x and 'plugin_nameobfuscation' not in x ] res, contains_objstms, errors = self.analyze_pdf( request, res_txt, osf, working_dir, heur, additional_keywords, get_malform=False) obj_cnt += 1 result.add_section(res) if len(all_errors) > 0: erres = ResultSection(title_text="Errors Analyzing PDF") for e in all_errors: erres.add_line(e) result.add_section(erres) else: section = ResultSection( "PDF Analysis of the file was skipped because the file is too big (limit is 3 MB)." ) section.set_heuristic(10) result.add_section(section)
def run_badging_analysis(self, apk_file: str, result: Result):
    badging_args = ['d', 'badging', apk_file]
    badging, errors = self.run_appt(badging_args)
    if not badging:
        return
    res_badging = ResultSection("Android application details")
    libs = []
    permissions = []
    components = []
    features = []
    pkg_version = None
    for line in badging.splitlines():
        if line.startswith("package:"):
            pkg_name = line.split("name='")[1].split("'")[0]
            pkg_version = line.split("versionCode='")[1].split("'")[0]
            res_badging.add_line(f"Package: {pkg_name} v.{pkg_version}")
            res_badging.add_tag('file.apk.pkg_name', pkg_name)
            res_badging.add_tag('file.apk.app.version', pkg_version)

        if line.startswith("sdkVersion:"):
            min_sdk = line.split(":'")[1][:-1]
            res_badging.add_line(f"Min SDK: {min_sdk}")
            res_badging.add_tag('file.apk.sdk.min', min_sdk)

        if line.startswith("targetSdkVersion:"):
            target_sdk = line.split(":'")[1][:-1]
            res_badging.add_line(f"Target SDK: {target_sdk}")
            res_badging.add_tag('file.apk.sdk.target', target_sdk)

        if line.startswith("application-label:"):
            label = line.split(":'")[1][:-1]
            res_badging.add_line(f"Default Label: {label}")
            res_badging.add_tag('file.apk.app.label', label)

        if line.startswith("launchable-activity:"):
            launch = line.split("name='")[1].split("'")[0]
            res_badging.add_line(f"Launchable activity: {launch}")
            res_badging.add_tag('file.apk.activity', launch)

        if line.startswith("uses-library-not-required:"):
            lib = line.split(":'")[1][:-1]
            if lib not in libs:
                libs.append(lib)

        if line.startswith("uses-permission:") or line.startswith("uses-implied-permission:"):
            perm = line.split("name='")[1].split("'")[0]
            if perm not in permissions:
                permissions.append(perm)

        if line.startswith("provides-component:"):
            component = line.split(":'")[1][:-1]
            if component not in components:
                components.append(component)

        if "uses-feature:" in line or "uses-implied-feature:" in line:
            feature = line.split("name='")[1].split("'")[0]
            if feature not in features:
                features.append(feature)

    if pkg_version is not None:
        pkg_version = int(pkg_version)
        if pkg_version < 15:
            ResultSection("Package version is suspiciously low", parent=res_badging,
                          heuristic=Heuristic(17))
        elif pkg_version > 999999999:
            ResultSection("Package version is suspiciously high", parent=res_badging,
                          heuristic=Heuristic(17))

    if libs:
        res_lib = ResultSection("Libraries used", parent=res_badging)
        for lib in libs:
            res_lib.add_line(lib)
            res_lib.add_tag('file.apk.used_library', lib)

    if permissions:
        res_permissions = ResultSection("Permissions used", parent=res_badging)
        dangerous_permissions = []
        unknown_permissions = []
        for perm in permissions:
            if perm in ALL_ANDROID_PERMISSIONS:
                if 'dangerous' in ALL_ANDROID_PERMISSIONS[perm]:
                    dangerous_permissions.append(perm)
                else:
                    res_permissions.add_line(perm)
                    res_permissions.add_tag('file.apk.permission', perm)
            else:
                unknown_permissions.append(perm)

        if len(set(permissions)) < len(permissions):
            ResultSection("Some permissions are defined more than once", parent=res_badging,
                          heuristic=Heuristic(18))

        if dangerous_permissions:
            res_dangerous_perm = ResultSection("Dangerous permissions used", parent=res_badging,
                                               heuristic=Heuristic(4))
            for perm in dangerous_permissions:
                res_dangerous_perm.add_line(perm)
                res_dangerous_perm.add_tag('file.apk.permission', perm)

        if unknown_permissions:
            res_unknown_perm = ResultSection("Unknown permissions used", parent=res_badging,
                                             heuristic=Heuristic(5))
            for perm in unknown_permissions:
                res_unknown_perm.add_line(perm)
                res_unknown_perm.add_tag('file.apk.permission', perm)

    if features:
        res_features = ResultSection("Features used", parent=res_badging)
        for feature in features:
            res_features.add_line(feature)
            res_features.add_tag('file.apk.feature', feature)

    if components:
        res_components = ResultSection("Components provided", parent=res_badging)
        for component in components:
            res_components.add_line(component)
            res_components.add_tag('file.apk.provides_component', component)

    result.add_section(res_badging)
def find_scripts_and_exes(apktool_out_dir: str, result: Result):
    scripts = []
    executables = []
    apks = []
    # We are gonna do the full apktool output dir here but in case we want to do less,
    # you can edit the test_paths list
    test_paths = [apktool_out_dir]
    for path in test_paths:
        for root, _, files in os.walk(path):
            for f in files:
                if f.endswith(".smali"):
                    continue
                cur_file = os.path.join(root, f)
                file_type = fileinfo(cur_file)['type']
                if "code/sh" in file_type:
                    scripts.append(cur_file.replace(apktool_out_dir, ''))
                elif "executable/linux" in file_type:
                    executables.append(cur_file.replace(apktool_out_dir, ''))
                elif "android/apk" in file_type:
                    # Embedded APKs go in their own list (this was wrongly appended to
                    # executables, leaving apks always empty)
                    apks.append(cur_file.replace(apktool_out_dir, ''))

    if scripts:
        res_script = ResultSection("Shell script(s) found inside APK", parent=result,
                                   heuristic=Heuristic(1))
        for script in sorted(scripts)[:20]:
            res_script.add_line(script)
        if len(scripts) > 20:
            res_script.add_line(f"and {len(scripts) - 20} more...")

    if executables:
        res_exe = ResultSection("Executable(s) found inside APK", parent=result,
                                heuristic=Heuristic(2))
        for exe in sorted(executables)[:20]:
            res_exe.add_line(exe)
        if len(executables) > 20:
            res_exe.add_line(f"and {len(executables) - 20} more...")

    if apks:
        res_apk = ResultSection("Other APKs were found inside the APK", parent=result,
                                heuristic=Heuristic(19))
        for apk in sorted(apks)[:20]:
            res_apk.add_line(apk)
        if len(apks) > 20:
            res_apk.add_line(f"and {len(apks) - 20} more...")
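# Illustrative only: find_scripts_and_exes() routes on the 'type' string returned by
# assemblyline's fileinfo() identification helper; a stand-in for that routing decision:
def route_by_type_sketch(file_type: str) -> str:
    if "code/sh" in file_type:
        return "script"
    if "executable/linux" in file_type:
        return "executable"
    if "android/apk" in file_type:
        return "apk"
    return "other"

assert route_by_type_sketch("executable/linux/elf64") == "executable"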
def execute(self, request): """Main Module. See README for details.""" request.result = Result() self.result = request.result wrk_dir = self.working_directory ipa_path = request.file_path self.known_keys = None self.reported_keys = {} # Determine if PK container has IPA content to parse try: ipa_file = zipfile.ZipFile(ipa_path) except zipfile.BadZipfile: # Return if files cannot be extracted return # isipa returns False if Info.plist not found, or returns Info.plist path name_list, isipa = self.isipa(ipa_file) if not isipa: return # Extract Files of interest using 7zip (some files can be AES encrypted which standard zipfile library does not # support) extract_success = False try: self.extract_archive(ipa_path) extract_success = True except Exception as e: self.log.error(f"Could not extract IPA file due to 7zip error {e}") if not extract_success: return with open(os.path.join(os.path.dirname(__file__), "keys.json"), 'r') as f: keys_dict = json.load(f) self.known_keys = keys_dict['glossary'] patterns = PatternMatch() # Info.plist main_exe = None res = ResultSection("Info.plist") info_plist_path = os.path.join(wrk_dir, isipa) isempty, plist_dict = self.gen_plist_extract(info_plist_path, patterns) if plist_dict is None: res.add_line("Info.plist in sample cannot be parsed. Sample may be corrupt.") elif isempty: res.add_line("Empty Info.plist file. Archive contents may be encrypted.") else: # Grab the main executable name if plist_dict.get("CFBundleExecutable", None): i = plist_dict["CFBundleExecutable"] try: main_exe = (i, f"Name of bundle's main executable file: {i}") res.add_line(main_exe[1]) except UnicodeEncodeError: i = i.encode('utf8', 'replace') main_exe = (i, f"Name of bundle's main executable file: {i}") res.add_line(main_exe[1]) iden_key_res, unk_key_res = self.parse_plist(plist_dict) if iden_key_res: res.add_subsection(iden_key_res) if unk_key_res: res.add_subsection(unk_key_res) request.result.add_section(res) # PkgInfo file pkg_types = { 'APPL': 'application', 'FMWK': 'frameworks', 'BNDL': 'loadable bundle' } pattern = re.compile(r'Payload/[^/]*.app/PkgInfo') for fn in name_list: m = pattern.match(fn) if m is not None: res = ResultSection("PkgInfo Details") pkg_info_path = os.path.join(wrk_dir, m.group()) with open(pkg_info_path, 'r') as f: pkg_info = f.read() if pkg_info == "": res.add_line("Empty PkgInfo file. Archive contents may be encrypted.") elif len(pkg_info) == 8: # noinspection PyBroadException try: pkgtype = pkg_info[0:4] if pkgtype in pkg_types: pkgtype = pkg_types[pkgtype] creator_code = pkg_info[4:] res = ResultSection("PkgInfo Details") res.add_line(f"Package Type: {pkgtype}; Application Signature: {creator_code}") except Exception: continue request.result.add_section(res) if main_exe: main_exe_reg = (rf'.*{main_exe[0]}$', f"Main executable file {main_exe[0]}") else: main_exe_reg = ('$', 'Place holder for missing main executable name.') fextract_regs = [ main_exe_reg, (r'Payload.*\.(?:crt|cer|der|key|p12|p7b|p7c|pem|pfx)$', "Certificate or key file"), (r'Payload.*libswift[^\/]\.dylib$', "Swift code library files"), (r'Payload\/META-INF\/.*ZipMetadata.plist$', "IPA archive content info"), (r'Payload.*mobileprovision$', "Provisioning profile for limiting app uploads"), (r'.*plist$', "Plist information file"), ] empty_file_msg = "Empty file. Archive contents may be encrypted." 
int_files = {} plist_res = ResultSection("Other Plist File Information (displaying new key-value pairs only)") for root, dirs, files in os.walk(wrk_dir): for name in files: full_path = safe_str(os.path.join(root, name)) if os.path.getsize(full_path) == 0: if int_files.get(empty_file_msg, None): int_files[empty_file_msg].append(full_path) else: int_files[empty_file_msg] = [] int_files[empty_file_msg].append(full_path) else: for p, desc in fextract_regs: pattern = re.compile(p) m = pattern.match(full_path) if m is not None: # Already identify main executable file above if not desc.startswith("Main executable file "): if desc.startswith("Plist"): pres = ResultSection(f"{full_path.replace(wrk_dir, '')}") isempty, plist_parsed = self.gen_plist_extract(full_path, patterns) if not isempty and plist_parsed: iden_key_res, unk_key_res = self.parse_plist(plist_parsed) # If all keys have already been reported, skip this plist if not iden_key_res and not unk_key_res: continue if iden_key_res: pres.add_subsection(iden_key_res) if unk_key_res: pres.add_subsection(unk_key_res) plist_res.add_subsection(pres) elif int_files.get(desc, None): int_files[desc].append(full_path) else: int_files[desc] = [] int_files[desc].append(full_path) break if len(plist_res.subsections) > 0: request.result.add_section(plist_res) if len(int_files) > 0: intf_sec = ResultSection("Files of interest", parent=res) for intf_d, intf_p in int_files.items(): intf_subsec = ResultSection(intf_d, parent=intf_sec) for f in intf_p: intf_subsec.add_line(f.replace(f"{wrk_dir}/", ""))
def unicode_results(self, request: ServiceRequest,
                    patterns: PatternMatch) -> Optional[ResultSection]:
    """
    Find and report unicode-encoded strings.

    Args:
        request: AL request object with result section
        patterns: PatternMatch object

    Returns:
        The result section (with request.result as its parent) if one is created
    """
    unicode_al_results: Dict[str, Tuple[bytes, bytes]] = {}
    dropped_unicode: List[Tuple[str, str]] = []
    for hes in self.HEXENC_STRINGS:
        if re.search(re.escape(hes) + b'[A-Fa-f0-9]{2}', request.file_contents):
            dropped = self.decode_encoded_udata(request, hes, request.file_contents,
                                                unicode_al_results)
            for uhash in dropped:
                dropped_unicode.append((uhash, safe_str(hes)))

    # Report Unicode Encoded Data:
    unicode_heur = Heuristic(5, frequency=len(dropped_unicode)) if dropped_unicode else None
    unicode_emb_res = ResultSection("Found Unicode-Like Strings in Non-Executable:",
                                    body_format=BODY_FORMAT.MEMORY_DUMP,
                                    heuristic=unicode_heur)
    for uhash, uenc in dropped_unicode:
        unicode_emb_res.add_line(f"Extracted over 50 bytes of possible embedded unicode with "
                                 f"{uenc} encoding. SHA256: {uhash}. See extracted files.")

    for unires_index, (sha256, (decoded, encoded)) in enumerate(unicode_al_results.items()):
        sub_uni_res = ResultSection(f"Result {unires_index}", parent=unicode_emb_res)
        sub_uni_res.add_line(f'ENCODED TEXT SIZE: {len(decoded)}')
        sub_uni_res.add_line(f'ENCODED SAMPLE TEXT: {safe_str(encoded)}[........]')
        sub_uni_res.add_line(f'DECODED SHA256: {sha256}')
        subb_uni_res = ResultSection("DECODED ASCII DUMP:",
                                     body_format=BODY_FORMAT.MEMORY_DUMP,
                                     parent=sub_uni_res)
        subb_uni_res.add_line('{}'.format(safe_str(decoded)))
        # Look for IOCs of interest
        hits = self.ioc_to_tag(decoded, patterns, sub_uni_res, st_max_length=1000,
                               taglist=True)
        if hits:
            sub_uni_res.set_heuristic(6)
            subb_uni_res.add_line("Suspicious string(s) found in decoded data.")
        else:
            sub_uni_res.set_heuristic(4)

    if unicode_al_results or dropped_unicode:
        request.result.add_section(unicode_emb_res)
        return unicode_emb_res
    return None
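# Illustrative only: the byte pattern the HEXENC_STRINGS loop above searches for, assuming the
# class-level list holds escape prefixes such as b'\\u' (the real list lives on the service class).
import re
assert re.search(re.escape(b'\\u') + b'[A-Fa-f0-9]{2}', b'var s = "\\u0041\\u0042";')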
def execute(self, request):
    self.result = Result()
    request.result = self.result
    self.request = request
    self.ip_list = []
    self.url_list = []
    self.found_powershell = False
    self.file_hashes = []

    vmonkey_err = False
    actions = []
    external_functions = []
    tmp_iocs = []
    output_results = {}

    # Running ViperMonkey
    try:
        cmd = " ".join([
            PYTHON2_INTERPRETER,
            os.path.join(os.path.dirname(__file__), 'vipermonkey_compat.py2'),
            request.file_path
        ])
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
        stdout, _ = p.communicate()

        # Read output
        if stdout:
            for l in stdout.splitlines():
                if l.startswith(b"{") and l.endswith(b"}"):
                    try:
                        output_results = json.loads(l)
                    except UnicodeDecodeError:
                        output_results = json.loads(l.decode("utf-8", "replace"))
                    break

            # Checking for tuple in case vmonkey return is None
            # If no macros found, return is [][], if error, return is None
            if type(output_results.get('vmonkey_values')) == dict:
                '''
                Structure of variable "actions" is as follows:
                [action, parameters, description]
                action: 'Found Entry Point', 'Execute Command', etc...
                parameters: Parameters for function
                description: 'Shell Function', etc...

                external_functions is a list of built-in VBA functions that were called
                '''
                actions = output_results['vmonkey_values']['actions']
                external_functions = output_results['vmonkey_values']['external_funcs']
                tmp_iocs = output_results['vmonkey_values']['tmp_iocs']
            else:
                vmonkey_err = True
        else:
            vmonkey_err = True
    except Exception:
        raise

    # Add vmonkey log as a supplemental file
    if 'stdout' in output_results:
        temp_log_copy = os.path.join(tempfile.gettempdir(),
                                     f'{request.sid}_vipermonkey_output.log')
        with open(temp_log_copy, "w") as temp_log_file:
            temp_log_file.write(output_results['stdout'])

        self.request.add_supplementary(temp_log_copy, 'vipermonkey_output.log',
                                       'ViperMonkey log output')

    if vmonkey_err is True:
        ResultSection('ViperMonkey has encountered an error, please check "vipermonkey_output.log"',
                      parent=self.result, heuristic=Heuristic(1))

    if len(actions) > 0:
        # Creating action section
        action_section = ResultSection('Recorded Actions:', parent=self.result)
        action_section.add_tag('technique.macro', 'Contains VBA Macro(s)')
        # Creating action sub-sections for each action
        for action in actions:
            cur_action = action[0]
            cur_description = action[2] if action[2] else cur_action

            # Entry point actions have an empty description field, re-organize result
            # section for this case
            if cur_action == 'Found Entry Point':
                sub_action_section = ResultSection('Found Entry Point', parent=action_section)
                sub_action_section.add_line(action[1])
            else:
                # Action's description will be the sub-section name
                sub_action_section = ResultSection(cur_description, parent=action_section)
                if cur_description == 'Shell function':
                    sub_action_section.set_heuristic(2)

                # Parameters are sometimes stored as a list, account for this
                if isinstance(action[1], list):
                    for item in action[1]:
                        # Parameters include more than strings (booleans for example)
                        if isinstance(item, str):
                            # Check for PowerShell
                            self.extract_powershell(item, sub_action_section)
                    # Join list items into single string
                    param = ', '.join(str(a) for a in action[1])
                else:
                    param = action[1]
                    # Parameters include more than strings (booleans for example)
                    if isinstance(param, str):
                        self.extract_powershell(param, sub_action_section)

                sub_action_section.add_line(f'Action: {cur_action}')
                sub_action_section.add_line(f'Parameters: {param}')

                # If decoded is true, possible base64 string has been found
                self.check_for_b64(param, sub_action_section)

                # Add urls/ips found in parameter to respective lists
                self.find_ip(param)

    # Check tmp_iocs
    res_temp_iocs = ResultSection('Runtime temporary IOCs')
    for ioc in tmp_iocs:
        self.extract_powershell(ioc, res_temp_iocs)
        self.check_for_b64(ioc, res_temp_iocs)
        self.find_ip(ioc)

    if len(res_temp_iocs.subsections) != 0 or res_temp_iocs.body:
        self.result.add_section(res_temp_iocs)

    # Add PowerShell score/tag if found
    if self.found_powershell:
        ResultSection('Discovered PowerShell code in file', parent=self.result,
                      heuristic=Heuristic(3))

    # Add url/ip tags
    self.add_ip_tags()

    # Create section for built-in VBA functions called
    if len(external_functions) > 0:
        vba_builtin_dict = {}
        dict_path = os.path.join(os.path.dirname(__file__), 'VBA_built_ins.txt')
        with open(dict_path, 'r') as f:
            for line in f:
                line = line.strip()
                if re.search(r'^#', line):
                    continue
                if line:
                    line = line.split(';')
                    vba_builtin_dict[line[0].strip()] = line[1].strip()

        external_func_section = ResultSection('VBA functions called',
                                              body_format=BODY_FORMAT.MEMORY_DUMP,
                                              parent=self.result)
        for func in external_functions:
            if func in vba_builtin_dict:
                external_func_section.add_line(func + ': ' + vba_builtin_dict[func])
            else:
                external_func_section.add_line(func)
def execute(self, request: ServiceRequest) -> None:
    self.result = Result()
    request.result = self.result
    self.ip_list = []
    self.url_list = []
    self.found_powershell = False
    self.file_hashes = []

    vmonkey_err = False
    actions: List[str] = []
    external_functions: List[str] = []
    tmp_iocs: List[str] = []
    output_results: Dict[str, Any] = {}
    potential_base64: Set[str] = set()

    # Running ViperMonkey
    try:
        file_contents = request.file_contents
        input_file: str = request.file_path
        input_file_obj: Optional[IO] = None
        # Typical start to XML files
        if not file_contents.startswith(b"<?") and request.file_type == "code/xml":
            # Default encoding/decoding if BOM not found
            encoding: Optional[str] = None
            decoding: Optional[str] = None
            # Remove potential BOMs from contents
            if file_contents.startswith(BOM_UTF8):
                encoding = "utf-8"
                decoding = "utf-8-sig"
            elif file_contents.startswith(BOM_UTF16):
                encoding = "utf-16"
                decoding = "utf-16"
            if encoding and decoding:
                input_file_obj = tempfile.NamedTemporaryFile("w+", encoding=encoding)
                input_file_obj.write(file_contents.decode(decoding, errors="ignore"))
                input_file = input_file_obj.name
            else:
                # If the file_type was detected as XML, it's probably buried within but
                # not actually an XML file.
                # Give no response as ViperMonkey can't process this kind of file
                return
        cmd = " ".join([
            PYTHON2_INTERPRETER,
            os.path.join(os.path.dirname(__file__), "vipermonkey_compat.py2"),
            input_file,
            self.working_directory,
        ])
        p = subprocess.run(cmd, capture_output=True, shell=True)
        stdout = p.stdout

        # Close file
        if input_file_obj and os.path.exists(input_file_obj.name):
            input_file_obj.close()

        # Add artifacts
        artifact_dir = os.path.join(self.working_directory,
                                    os.path.basename(input_file) + "_artifacts")
        if os.path.exists(artifact_dir):
            for file in os.listdir(artifact_dir):
                try:
                    file_path = os.path.join(artifact_dir, file)
                    if os.path.isfile(file_path) and os.path.getsize(file_path):
                        request.add_extracted(file_path, file,
                                              "File extracted by ViperMonkey during analysis")
                except os.error as e:
                    self.log.warning(e)

        # Read output
        if stdout:
            for line in stdout.splitlines():
                if line.startswith(b"{") and line.endswith(b"}"):
                    try:
                        output_results = json.loads(line)
                    except UnicodeDecodeError:
                        output_results = json.loads(line.decode("utf-8", "replace"))
                    break

            # Checking for tuple in case vmonkey return is None
            # If no macros found, return is [][][], if error, return is None
            # vmonkey_err can still happen if return is [][][], log as warning instead of error
            if isinstance(output_results.get("vmonkey_values"), dict):
                """
                Structure of variable "actions" is as follows:
                [action, parameters, description]
                action: 'Found Entry Point', 'Execute Command', etc...
                parameters: Parameters for function
                description: 'Shell Function', etc...

                external_functions is a list of built-in VBA functions that were called
                """
                actions = output_results["vmonkey_values"]["actions"]
                external_functions = output_results["vmonkey_values"]["external_funcs"]
                tmp_iocs = output_results["vmonkey_values"]["tmp_iocs"]
                if output_results["vmonkey_err"]:
                    vmonkey_err = True
                    self.log.warning(output_results["vmonkey_err"])
            else:
                vmonkey_err = True
        else:
            vmonkey_err = True
    except Exception:
        self.log.exception(f"Vipermonkey failed to analyze file {request.sha256}")

    if actions:
        # Creating action section
        action_section = ResultSection("Recorded Actions:", parent=self.result)
        action_section.add_tag("technique.macro", "Contains VBA Macro(s)")
        sub_action_sections: Dict[str, ResultSection] = {}
        # Creating action sub-sections for each action
        for action, parameters, description in actions:
            if not description:
                # For actions with no description, just use the type of action
                description = action

            if description not in sub_action_sections:
                # Action's description will be the sub-section name
                sub_action_section = ResultSection(description, parent=action_section)
                sub_action_sections[description] = sub_action_section
                if description == "Shell function":
                    sub_action_section.set_heuristic(2)
            else:
                # Reuse existing section
                sub_action_section = sub_action_sections[description]
                if sub_action_section.heuristic:
                    sub_action_section.heuristic.increment_frequency()

            # Parameters are sometimes stored as a list, account for this
            if isinstance(parameters, list):
                for item in parameters:
                    # Parameters include more than strings (booleans for example)
                    if isinstance(item, str):
                        # Check for PowerShell
                        self.extract_powershell(item, sub_action_section, request)
                # Join list items into single string
                param = ", ".join(str(p) for p in parameters)
            else:
                param = parameters
                # Parameters include more than strings (booleans for example)
                if isinstance(param, str):
                    self.extract_powershell(param, sub_action_section, request)

            # If the description field was empty, re-organize result section for this case
            if description == action:
                sub_action_section.add_line(param)
            else:
                sub_action_section.add_line(f"Action: {action}, Parameters: {param}")

            # Check later for base64
            potential_base64.add(param)

            # Add urls/ips found in parameter to respective lists
            self.find_ip(param)

    # Check tmp_iocs
    res_temp_iocs = ResultSection("Runtime temporary IOCs")
    for ioc in tmp_iocs:
        self.extract_powershell(ioc, res_temp_iocs, request)
        potential_base64.add(ioc)
        self.find_ip(ioc)

    if len(res_temp_iocs.subsections) != 0 or res_temp_iocs.body:
        self.result.add_section(res_temp_iocs)

    # Add PowerShell score/tag if found
    if self.found_powershell:
        ResultSection("Discovered PowerShell code in file", parent=self.result,
                      heuristic=Heuristic(3))

    # Check parameters and temp_iocs for base64
    base64_section = ResultSection("Possible Base64 found",
                                   heuristic=Heuristic(5, frequency=0))
    for param in potential_base64:
        self.check_for_b64(param, base64_section, request, request.file_contents)
    if base64_section.body:
        self.result.add_section(base64_section)

    # Add url/ip tags
    self.add_ip_tags()

    # Create section for built-in VBA functions called
    if len(external_functions) > 0:
        external_func_section = ResultSection("VBA functions called",
                                              body_format=BODY_FORMAT.MEMORY_DUMP,
                                              parent=self.result)
        for func in external_functions:
            if func in vba_builtins:
                external_func_section.add_line(func + ": " + vba_builtins[func])
            else:
                external_func_section.add_line(func)

    # Add vmonkey log as a supplemental file if we have results
    if "stdout" in output_results and (vmonkey_err or request.result.sections):
        temp_log_copy = os.path.join(tempfile.gettempdir(),
                                     f"{request.sid}_vipermonkey_output.log")
        with open(temp_log_copy, "w") as temp_log_file:
            temp_log_file.write(output_results["stdout"])

        request.add_supplementary(temp_log_copy, "vipermonkey_output.log",
                                  "ViperMonkey log output")

        if vmonkey_err is True:
            ResultSection(
                'ViperMonkey has encountered an error, please check "vipermonkey_output.log"',
                parent=self.result,
                heuristic=Heuristic(1),
            )
def check_for_b64(self, data: str, section: ResultSection, request: ServiceRequest,
                  file_contents: bytes) -> bool:
    """Search and decode base64 strings in sample data.

    Args:
        data: Data to be parsed
        section: base64 subsection, must have heuristic set
        request: AL request object, used to attach extracted files
        file_contents: Original file contents, used to skip base64 already present in the sample

    Returns:
        decoded: Boolean which is true if base64 found
    """
    assert section.heuristic
    decoded_param = data
    decoded = False

    encoded_data = data.encode()
    for content, start, end in find_base64(encoded_data):
        if encoded_data[start:end] in file_contents:
            # Present in original file, not an intermediate IoC
            continue
        try:
            # Powershell base64 will be utf-16
            content = content.decode("utf-16").encode()
        except UnicodeDecodeError:
            pass
        try:
            if len(content) < FILE_PARAMETER_SIZE:
                decoded_param = (decoded_param[:start] + " " +
                                 content.decode(errors="ignore") + decoded_param[end:])
            else:
                b64hash = ""
                pe_files = find_pe_files(content)
                for pe_file in pe_files:
                    b64hash = hashlib.sha256(pe_file).hexdigest()
                    pe_path = os.path.join(self.working_directory, b64hash)
                    with open(pe_path, "wb") as f:
                        f.write(pe_file)
                    request.add_extracted(pe_path, b64hash,
                                          "PE file found in base64 encoded parameter")
                    section.heuristic.add_signature_id("pe_file")
                if not pe_files:
                    b64hash = hashlib.sha256(content).hexdigest()
                    content_path = os.path.join(self.working_directory, b64hash)
                    with open(content_path, "wb") as f:
                        f.write(content)
                    request.add_extracted(content_path, b64hash,
                                          "Large base64 encoded parameter")
                    section.heuristic.add_signature_id("possible_file")
                decoded_param = (decoded_param[:start] +
                                 f"[See extracted file {b64hash}]" + decoded_param[end:])
            decoded = True
        except Exception:
            pass

    if decoded:
        section.heuristic.increment_frequency()
        section.add_line(f"Possible Base64 {truncate(data)} decoded: {decoded_param}")
        self.find_ip(decoded_param)

    return decoded
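# Illustrative only: check_for_b64() above assumes find_base64() yields
# (decoded_content, start, end) tuples where encoded_data[start:end] is the matched base64
# substring. A hand-rolled equivalent under that assumption:
import base64
import re

def find_base64_sketch(data: bytes):
    """Yield (decoded, start, end) for plausible base64 runs in data."""
    for m in re.finditer(rb'[A-Za-z0-9+/]{16,}={0,2}', data):
        try:
            yield base64.b64decode(m.group(), validate=True), m.start(), m.end()
        except Exception:
            continue  # not valid base64 after all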
def execute(self, request):
    file_path = request.file_path
    result = Result()

    # Report the version of suricata as the service context
    request.set_service_context(f"Suricata version: {self.get_suricata_version()}")

    # Restart Suricata if we need to
    self.start_suricata_if_necessary()

    # Strip frame headers from the PCAP, since Suricata sometimes has trouble parsing strange PCAPs
    stripped_filepath = self.strip_frame_headers(file_path)

    # Check to make sure the size of the stripped file isn't 0 - this happens on pcapng files
    # TODO: there's probably a better way to do this - don't even strip it if it's pcapng
    if os.stat(stripped_filepath).st_size == 0:
        stripped_filepath = file_path

    # Switch stdout and stderr so we don't get our logs polluted
    mystdout = StringIO()
    old_stdout = sys.stdout
    sys.stdout = mystdout

    mystderr = StringIO()
    old_stderr = sys.stderr
    sys.stderr = mystderr

    # Pass the pcap file to Suricata via the socket
    ret = self.suricata_sc.send_command("pcap-file", {
        "filename": stripped_filepath,
        "output-dir": self.working_directory
    })

    if not ret or ret["return"] != "OK":
        self.log.exception(f"Failed to submit PCAP for processing: {ret['message']}")

    # Wait for the socket to finish processing our PCAP
    while True:
        time.sleep(1)
        try:
            ret = self.suricata_sc.send_command("pcap-current")
            if ret and ret["message"] == "None":
                break
        except ConnectionResetError as e:
            raise RecoverableError(e)

    # Bring back stdout and stderr
    sys.stdout = old_stdout
    sys.stderr = old_stderr

    # NOTE: for now we will ignore content of mystdout and mystderr but we have them just in case...

    alerts, signatures, domains, ips, urls, email_addresses, tls_dict, extracted_files, \
        reverse_lookup = self.parse_suricata_output().values()

    file_extracted_section = ResultSection("File(s) extracted by Suricata")

    # Parse the json results of the service
    if request.get_param("extract_files"):
        for file in extracted_files:
            sha256, filename, extracted_file_path = file.values()
            self.log.info(f"extracted file {filename}")
            try:
                if request.add_extracted(extracted_file_path, filename,
                                         "Extracted by Suricata",
                                         safelist_interface=self.api_interface):
                    file_extracted_section.add_line(filename)
                    if filename != sha256:
                        file_extracted_section.add_tag('file.name.extracted', filename)
            except FileNotFoundError as e:
                # An intermittent issue, just try again
                raise RecoverableError(e)
            except MaxExtractedExceeded:
                # We've hit our limit
                pass

    # Report a null score to indicate that files were extracted. If no sigs hit, it's not
    # clear where the extracted files came from
    if file_extracted_section.body:
        result.add_section(file_extracted_section)

    # Add tags for the domains, urls, and IPs we've discovered
    root_section = ResultSection("Discovered IOCs", parent=result)
    if domains:
        domain_section = ResultSection("Domains", parent=root_section)
        for domain in domains:
            domain_section.add_line(domain)
            domain_section.add_tag('network.dynamic.domain', domain)

    if ips:
        ip_section = ResultSection("IP Addresses", parent=root_section)
        for ip in ips:
            # Make sure it's not a local IP
            if not (ip.startswith("127.")
                    or ip.startswith("192.168.")
                    or ip.startswith("10.")
                    or (ip.startswith("172.") and 16 <= int(ip.split(".")[1]) <= 31)):
                ip_section.add_line(ip)
                ip_section.add_tag('network.dynamic.ip', ip)

    if urls:
        url_section = ResultSection("URLs", parent=root_section)
        for url in urls:
            url_section.add_line(url)
            url_section.add_tag('network.dynamic.uri', url)

    if email_addresses:
        email_section = ResultSection("Email Addresses", parent=root_section)
        for eml in email_addresses:
            email_section.add_line(eml)
            email_section.add_tag('network.email.address', eml)

    # Map between suricata key names and AL tag types
    tls_mappings = {
        "subject": 'cert.subject',
        "issuerdn": 'cert.issuer',
        "version": 'cert.version',
        "notbefore": 'cert.valid.start',
        "notafter": 'cert.valid.end',
        "fingerprint": 'cert.thumbprint',
        "sni": 'network.tls.sni'
    }

    if tls_dict:
        tls_section = ResultSection("TLS Information", parent=root_section,
                                    body_format=BODY_FORMAT.JSON)
        kv_body = {}
        for tls_type, tls_values in tls_dict.items():
            if tls_type == "fingerprint":
                # Make sure the cert fingerprint/thumbprint matches other values,
                # like from PEFile
                tls_values = [v.replace(":", "").lower() for v in tls_values]

            if tls_type in tls_mappings:
                kv_body[tls_type] = tls_values
                tag_type = tls_mappings[tls_type]
                if tag_type is not None:
                    for tls_value in tls_values:
                        tls_section.add_tag(tag_type, tls_value)
            elif tls_type == "ja3":
                kv_body.setdefault('ja3_hash', [])
                kv_body.setdefault('ja3_string', [])
                for ja3_entry in tls_values:
                    ja3_hash = ja3_entry.get("hash")
                    ja3_string = ja3_entry.get("string")
                    if ja3_hash:
                        kv_body['ja3_hash'].append(ja3_hash)
                        tls_section.add_tag('network.tls.ja3_hash', ja3_hash)
                    if ja3_string:
                        kv_body['ja3_string'].append(ja3_string)
                        tls_section.add_tag('network.tls.ja3_string', ja3_string)
            else:
                kv_body[tls_type] = tls_values
                # Stick a message in the logs about a new TLS type found in suricata logs
                self.log.info(f"Found new TLS type {tls_type} with values {tls_values}")
        tls_section.set_body(json.dumps(kv_body))

    # Create the result sections if there are any hits
    if len(alerts) > 0:
        for signature_id, signature_details in signatures.items():
            signature = signature_details['signature']
            attributes = signature_details['attributes']
            section = ResultSection(f'{signature_id}: {signature}')
            heur_id = 3
            if any(x in signature for x in self.config.get("sure_score")):
                heur_id = 1
            elif any(x in signature for x in self.config.get("vhigh_score")):
                heur_id = 2
            section.set_heuristic(heur_id)
            if signature_details['al_signature']:
                section.add_tag("file.rule.suricata", signature_details['al_signature'])

            for timestamp, src_ip, src_port, dest_ip, dest_port in alerts[signature_id][:10]:
                section.add_line(f"{timestamp} {src_ip}:{src_port} -> {dest_ip}:{dest_port}")
            if len(alerts[signature_id]) > 10:
                section.add_line(f'And {len(alerts[signature_id]) - 10} more flows')

            # Tag IPs/Domains/URIs associated to signature
            for flow in alerts[signature_id]:
                dest_ip = flow[3]
                section.add_tag('network.dynamic.ip', dest_ip)
                if dest_ip in reverse_lookup.keys():
                    section.add_tag('network.dynamic.domain', reverse_lookup[dest_ip])
                for uri in urls:
                    if dest_ip in uri or (reverse_lookup.get(dest_ip)
                                          and reverse_lookup[dest_ip] in uri):
                        section.add_tag('network.dynamic.uri', uri)

            # Add a tag for the signature id and the message
            section.add_tag('network.signature.signature_id', str(signature_id))
            section.add_tag('network.signature.message', signature)
            for attr in attributes:
                if attr.get('uri'):
                    section.add_tag('network.static.uri', attr['uri'])

            # Tag malware_family
            for malware_family in signature_details['malware_family']:
                section.add_tag('attribution.family', malware_family)

            result.add_section(section)
            self.ontology.add_result_part(Signature, data=dict(
                name=signature_details['al_signature'],
                type="SURICATA",
                malware_families=signature_details['malware_family'] or None,
                attributes=attributes))

    # Add the original Suricata output as a supplementary file in the result
    request.add_supplementary(os.path.join(self.working_directory, 'eve.json'),
                              'SuricataEventLog.json', 'json')

    # Add the stats.log to the result, which can be used to determine service success
    if os.path.exists(os.path.join(self.working_directory, 'stats.log')):
        request.add_supplementary(os.path.join(self.working_directory, 'stats.log'),
                                  'stats.log', 'log')

    request.result = result
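# Illustrative only: execute() above unpacks parse_suricata_output().values() positionally,
# so that method is assumed to return a dict built in exactly this key order (Python 3.7+
# dicts preserve insertion order). Values here are hypothetical.
example_suricata_output = {
    'alerts': {2019401: [("2021-01-01T00:00:00", "10.0.0.5", 4444, "1.2.3.4", 80)]},
    'signatures': {2019401: {'signature': 'ET EXPLOIT Example', 'attributes': [],
                             'al_signature': 'ET EXPLOIT Example', 'malware_family': []}},
    'domains': ['bad.example.com'],
    'ips': ['1.2.3.4'],
    'urls': ['http://bad.example.com/x'],
    'email_addresses': [],
    'tls_dict': {},
    'extracted_files': [],
    'reverse_lookup': {'1.2.3.4': 'bad.example.com'},
}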
def test_process_ttps(intezer_static_class_instance, dummy_api_interface_class, mocker):
    from intezer_static import ALIntezerApi
    from intezer_sdk.api import IntezerApi
    from intezer_sdk.errors import UnsupportedOnPremiseVersion
    from assemblyline_v4_service.common.result import ResultSection, ResultTableSection, TableRow
    from requests import HTTPError

    mocker.patch.object(intezer_static_class_instance, "get_api_interface",
                        return_value=dummy_api_interface_class)
    intezer_static_class_instance.start()
    parent_res_sec = ResultSection("blah")

    mocker.patch.object(ALIntezerApi, "get_dynamic_ttps", return_value=[])
    intezer_static_class_instance._process_ttps("blah", parent_res_sec)
    assert parent_res_sec.subsections == []

    mocker.patch.object(IntezerApi, "get_dynamic_ttps", side_effect=HTTPError("FORBIDDEN"))
    intezer_static_class_instance._process_ttps("blah", parent_res_sec)
    assert parent_res_sec.subsections == []

    mocker.patch.object(IntezerApi, "get_dynamic_ttps",
                        side_effect=UnsupportedOnPremiseVersion())
    intezer_static_class_instance._process_ttps("blah", parent_res_sec)
    assert parent_res_sec.subsections == []

    mocker.patch.object(ALIntezerApi, "get_dynamic_ttps",
                        return_value=[{"name": "blah", "description": "blah",
                                       "data": [], "severity": 1}])
    intezer_static_class_instance._process_ttps("blah", parent_res_sec)
    correct_res_sec = ResultSection("Signature: blah", "blah")
    correct_res_sec.set_heuristic(4)
    correct_res_sec.heuristic.add_signature_id("blah", 10)
    assert check_section_equality(parent_res_sec.subsections[0].subsections[0], correct_res_sec)

    parent_res_sec = ResultSection("blah")
    mocker.patch.object(ALIntezerApi, "get_dynamic_ttps",
                        return_value=[{"name": "InjectionInterProcess", "description": "blah",
                                       "data": [], "severity": 1}])
    intezer_static_class_instance._process_ttps("blah", parent_res_sec)
    correct_res_sec = ResultSection("Signature: InjectionInterProcess", "blah")
    correct_res_sec.set_heuristic(7)
    correct_res_sec.heuristic.add_signature_id("InjectionInterProcess", 10)
    correct_res_sec.heuristic.add_attack_id("T1055")
    assert check_section_equality(parent_res_sec.subsections[0].subsections[0], correct_res_sec)

    parent_res_sec = ResultSection("blah")
    mocker.patch.object(ALIntezerApi, "get_dynamic_ttps",
                        return_value=[{"name": "enumerates_running_processes",
                                       "description": "blah",
                                       "data": [{"wow": "print me!"}],
                                       "severity": 1}])
    intezer_static_class_instance._process_ttps("blah", parent_res_sec)
    correct_res_sec = ResultSection("Signature: enumerates_running_processes", "blah")
    correct_res_sec.set_heuristic(8)
    correct_res_sec.heuristic.add_signature_id("enumerates_running_processes", 10)
    correct_res_sec.heuristic.add_attack_id("T1057")
    assert check_section_equality(parent_res_sec.subsections[0].subsections[0], correct_res_sec)

    parent_res_sec = ResultSection("blah")
    mocker.patch.object(ALIntezerApi, "get_dynamic_ttps",
                        return_value=[{"name": "blah", "description": "blah",
                                       "data": [{"IP": "blah 2.2.2.2 blah"}],
                                       "severity": 1}])
    intezer_static_class_instance._process_ttps("blah", parent_res_sec)
    correct_res_sec = ResultSection("Signature: blah", "blah")
    correct_res_sec.add_line("\tIP: blah 2.2.2.2 blah")
    correct_res_sec.set_heuristic(4)
    correct_res_sec.heuristic.add_signature_id("blah", 10)
    correct_ioc_res_sec = ResultTableSection("IOCs found in signature marks")
    correct_ioc_res_sec.add_row(TableRow(ioc_type="ip", ioc="2.2.2.2"))
    correct_ioc_res_sec.add_tag("network.dynamic.ip", "2.2.2.2")
    correct_res_sec.add_subsection(correct_ioc_res_sec)
    assert check_section_equality(parent_res_sec.subsections[0].subsections[0], correct_res_sec)
def execute(self, request: ServiceRequest) -> None:
    # --- Setup -----------------------------------------------------------------------------------------------
    request.result = Result()
    patterns = PatternMatch()

    if request.deep_scan:
        max_attempts = 100
    else:
        max_attempts = 10

    self.files_extracted = set()
    self.hashes = set()

    # --- Pre-Processing --------------------------------------------------------------------------------------
    # Get all IOCs prior to de-obfuscation
    pat_values = patterns.ioc_match(request.file_contents, bogon_ip=True, just_network=False)
    if pat_values and request.get_param('extract_original_iocs'):
        ioc_res = ResultSection("The following IOCs were found in the original file",
                                parent=request.result,
                                body_format=BODY_FORMAT.MEMORY_DUMP)
        for k, val in pat_values.items():
            for v in val:
                if ioc_res:
                    ioc_res.add_line(f"Found {k.upper().replace('.', ' ')}: {safe_str(v)}")
                    ioc_res.add_tag(k, v)

    # --- Prepare Techniques ----------------------------------------------------------------------------------
    techniques = [
        ('MSOffice Embedded script', self.msoffice_embedded_script_string),
        ('CHR and CHRB decode', self.chr_decode),
        ('String replace', self.string_replace),
        ('Powershell carets', self.powershell_carets),
        ('Array of strings', self.array_of_strings),
        ('Fake array vars', self.vars_of_fake_arrays),
        ('Reverse strings', self.str_reverse),
        ('B64 Decode', self.b64decode_str),
        ('Simple XOR function', self.simple_xor_function),
    ]
    second_pass = [
        ('Concat strings', self.concat_strings),
        ('MSWord macro vars', self.mswordmacro_vars),
        ('Powershell vars', self.powershell_vars),
        ('Charcode hex', self.charcode_hex),
    ]
    final_pass = [
        ('Charcode', self.charcode),
    ]

    code_extracts = [
        ('.*html.*', "HTML scripts extraction", self.extract_htmlscript),
    ]

    layers_list: List[Tuple[str, bytes]] = []
    layer = request.file_contents

    # --- Stage 1: Script Extraction --------------------------------------------------------------------------
    for pattern, name, func in code_extracts:
        if regex.match(regex.compile(pattern), request.task.file_type):
            extracted_parts = func(request.file_contents)
            layer = b"\n".join(extracted_parts).strip()
            layers_list.append((name, layer))
            break

    # --- Stage 2: Deobfuscation ------------------------------------------------------------------------------
    idx = 0
    first_pass_len = len(techniques)
    layers_count = len(layers_list)
    while True:
        if idx > max_attempts:
            final_pass.extend(techniques)
            for name, technique in final_pass:
                res = technique(layer)
                if res:
                    layers_list.append((name, res))
            break

        with ThreadPoolExecutor() as executor:
            threads = [executor.submit(technique, layer)
                       for name, technique in techniques]
            results = [thread.result() for thread in threads]
            for i in range(len(results)):
                result = results[i]
                if result:
                    layers_list.append((techniques[i][0], result))
                    # Looks like it worked, restart with new layer
                    layer = result

        # If the layers haven't changed in a pass, break
        if layers_count == len(layers_list):
            if len(techniques) != first_pass_len:
                final_pass.extend(techniques)
            with ThreadPoolExecutor() as executor:
                threads = [executor.submit(technique, layer)
                           for name, technique in final_pass]
                results = [thread.result() for thread in threads]
                for i in range(len(results)):
                    result = results[i]
                    if result:
                        # Index into final_pass, not techniques, since final_pass is
                        # what was submitted to the executor
                        layers_list.append((final_pass[i][0], result))
            break
        for x in second_pass:
            techniques.insert(0, x)
        layers_count = len(layers_list)
        idx += 1

    # --- Compiling results -----------------------------------------------------------------------------------
    if len(layers_list) > 0:
        extract_file = False
        num_layers = len(layers_list)

        # Compute heuristic
        if num_layers < 5:
            heur_id = 1
        elif num_layers < 10:
            heur_id = 2
        elif num_layers < 50:
            heur_id = 3
        elif num_layers < 100:
            heur_id = 4
        else:  # num_layers >= 100
            heur_id = 5

        # Cleanup final layer
        clean = self.clean_up_final_layer(layers_list[-1][1])
        if clean != request.file_contents:
            # Check for new IOCs
            pat_values = patterns.ioc_match(clean, bogon_ip=True, just_network=False)
            diff_tags: Dict[str, List[bytes]] = {}

            for uri in pat_values.get('network.static.uri', []):
                # Compare URIs without query string
                uri = uri.split(b'?', 1)[0]
                if uri not in request.file_contents:
                    diff_tags.setdefault('network.static.uri', [])
                    diff_tags['network.static.uri'].append(uri)

            if request.deep_scan or (len(clean) > 1000 and heur_id >= 4) or diff_tags:
                extract_file = True

            # Display obfuscation steps
            mres = ResultSection("De-obfuscation steps taken by DeobfuScripter",
                                 parent=request.result)
            if heur_id:
                mres.set_heuristic(heur_id)

            lcount = Counter([x[0] for x in layers_list])
            for l, c in lcount.items():
                mres.add_line(f"{l}, {c} time(s).")

            # Display final layer
            byte_count = 5000
            if extract_file:
                # Save extracted file
                byte_count = 500
                file_name = f"{os.path.basename(request.file_name)}_decoded_final"
                file_path = os.path.join(self.working_directory, file_name)
                # Ensure directory exists before write
                os.makedirs(os.path.dirname(file_path), exist_ok=True)
                with open(file_path, 'wb+') as f:
                    f.write(clean)
                self.log.debug(f"Submitted dropped file for analysis: {file_path}")
                request.add_extracted(file_path, file_name, "Final deobfuscation layer")

            ResultSection(f"First {byte_count} bytes of the final layer:",
                          body=safe_str(clean[:byte_count]),
                          body_format=BODY_FORMAT.MEMORY_DUMP,
                          parent=request.result)

            # Display new IOCs from final layer
            if len(diff_tags) > 0:
                ioc_new = ResultSection("New IOCs found after de-obfuscation",
                                        parent=request.result,
                                        body_format=BODY_FORMAT.MEMORY_DUMP)
                has_network_heur = False
                for ty, val in diff_tags.items():
                    for v in val:
                        if "network" in ty:
                            has_network_heur = True
                        ioc_new.add_line(f"Found {ty.upper().replace('.', ' ')}: {safe_str(v)}")
                        ioc_new.add_tag(ty, v)

                if has_network_heur:
                    ioc_new.set_heuristic(7)
                else:
                    ioc_new.set_heuristic(6)

            if len(self.files_extracted) > 0:
                ext_file_res = ResultSection(
                    "The following files were extracted during the deobfuscation",
                    heuristic=Heuristic(8), parent=request.result)
                for extracted in self.files_extracted:
                    file_name = os.path.basename(extracted)
                    ext_file_res.add_line(file_name)
                    request.add_extracted(extracted, file_name,
                                          "File of interest deobfuscated from sample")
def execute(self, request):
    request.result = Result()
    request.set_service_context(self.get_tool_version())
    temp_filename = request.file_path
    filename = os.path.basename(temp_filename)
    extract_dir = os.path.join(self.working_directory, f"{filename}_extracted")
    decompiled_dir = os.path.join(self.working_directory, f"{filename}_decompiled")
    file_res = request.result
    new_files = []
    supplementary_files = []
    imp_res_list = []
    res_list = []

    if request.file_type == "java/jar":
        self.decompile_jar(temp_filename, decompiled_dir)
        if self.jar_extract(temp_filename, extract_dir):
            # Analysis properties
            self.classloader_found = 0
            self.security_found = 0
            self.url_found = 0
            self.runtime_found = 0
            self.applet_found = 0

            self.manifest_tags = []
            self.signature_block_certs = []

            def analyze_file(root, cf, file_res, imp_res_list, supplementary_files,
                             decompiled_dir, extract_dir):
                cur_file_path = os.path.join(root.decode('utf-8'), cf.decode('utf-8'))
                with open(cur_file_path, "rb") as cur_file:
                    start_bytes = cur_file.read(24)

                    ##############################
                    # Executables in JAR
                    ##############################
                    cur_ext = os.path.splitext(cf)[1][1:].upper()
                    if start_bytes[:2] == b"MZ":
                        mz_res = dict(
                            title_text=f"Embedded executable file found: {cf} "
                                       "There may be a malicious intent.",
                            heur_id=1,
                            tags=[('file.behavior', "Embedded PE")],
                            score_condition=APPLET_MZ,
                        )
                        imp_res_list.append(mz_res)

                    ##############################
                    # Launchable in JAR
                    ##############################
                    elif cur_ext in G_LAUNCHABLE_EXTENSIONS:
                        l_res = dict(
                            title_text=f"Launch-able file type found: {cf} "
                                       "There may be a malicious intent.",
                            heur_id=2,
                            tags=[('file.behavior', "Launch-able file in JAR")],
                            score_condition=APPLET_MZ,
                        )
                        imp_res_list.append(l_res)

                    if cur_file_path.upper().endswith('.CLASS'):
                        self.analyse_class_file(file_res, cf, cur_file, cur_file_path,
                                                start_bytes, imp_res_list,
                                                supplementary_files, decompiled_dir,
                                                extract_dir)

            for root, _, files in os.walk(extract_dir.encode('utf-8')):
                logging.info(f"Extracted: {root} - {files}")

                # If the META-INF folder is encountered
                if root.upper().endswith(b'META-INF'):  # only top level meta
                    self.analyse_meta_information(file_res, root, supplementary_files,
                                                  extract_dir)
                    continue

                with ThreadPoolExecutor() as executor:
                    for cf in files:
                        executor.submit(analyze_file, root, cf, file_res, imp_res_list,
                                        supplementary_files, decompiled_dir, extract_dir)

            res = ResultSection("Analysis of the JAR file")
            res_meta = ResultSection("[Meta Information]", parent=res)
            if len(self.manifest_tags) > 0:
                res_manifest = ResultSection("Manifest File Information Extract",
                                             parent=res_meta)
                for tag, val in self.manifest_tags:
                    res_manifest.add_tag(tag, val)

            for res_cert in self.signature_block_certs:
                res_meta.add_subsection(res_cert)

            if self.runtime_found > 0 \
                    or self.applet_found > 0 \
                    or self.classloader_found > 0 \
                    or self.security_found > 0 \
                    or self.url_found > 0:
                res.add_line("All suspicious class files were saved as supplementary files.")
                res_class = ResultSection("[Suspicious classes]", parent=res)

                if self.runtime_found > 0:
                    ResultSection("Runtime Found",
                                  body=f"java/lang/Runtime: {self.runtime_found}",
                                  heuristic=Heuristic(10), parent=res_class)

                if self.applet_found > 0:
                    ResultSection("Applet Found",
                                  body=f"java/applet/Applet: {self.applet_found}",
                                  heuristic=Heuristic(6), parent=res_class)

                if self.classloader_found > 0:
                    ResultSection("Classloader Found",
                                  body=f"java/lang/ClassLoader: {self.classloader_found}",
                                  heuristic=Heuristic(7), parent=res_class)

                if self.security_found > 0:
                    ResultSection("Security Found",
                                  body=f"java/security/*: {self.security_found}",
                                  heuristic=Heuristic(8), parent=res_class)

                if self.url_found > 0:
                    ResultSection("URL Found",
                                  body=f"java/net/URL: {self.url_found}",
                                  heuristic=Heuristic(9), parent=res_class)

            res_list.append(res)

    # Add results if any
    self.recurse_add_res(file_res, imp_res_list, new_files)
    for res in res_list:
        file_res.add_section(res)

    # Submit embedded files
    if len(new_files) > 0:
        new_files = sorted(list(set(new_files)))
        txt = f"Extracted from 'JAR' file {filename}"
        for embed in new_files:
            request.add_extracted(embed,
                                  embed.replace(extract_dir + "/", "")
                                       .replace(decompiled_dir + "/", ""),
                                  txt, safelist_interface=self.api_interface)

    if len(supplementary_files) > 0:
        supplementary_files = sorted(list(set(supplementary_files)))
        for path, name, desc in supplementary_files:
            request.add_supplementary(path, name, desc)
def execute(self, request):
    parser = eml_parser.eml_parser.EmlParser(include_raw_body=True,
                                             include_attachment_data=True)

    # Validate URLs in sample, strip out [] if found
    content_str = request.file_contents.decode(errors="ignore")
    content_str, retry = self.validate_urls(content_str)
    while retry:
        content_str, retry = self.validate_urls(content_str)
    parsed_eml = parser.decode_email_bytes(content_str.encode())

    result = Result()
    header = parsed_eml['header']

    if "from" in header:
        all_uri = set()

        for body_counter, body in enumerate(parsed_eml['body']):
            if request.get_param('extract_body_text'):
                fd, path = mkstemp()
                with open(path, 'w') as f:
                    f.write(body['content'])
                    os.close(fd)
                request.add_extracted(path, "body_" + str(body_counter), "Body text")
            if "uri" in body:
                for uri in body['uri']:
                    all_uri.add(uri)

        kv_section = ResultSection('Email Headers', body_format=BODY_FORMAT.KEY_VALUE,
                                   parent=result)

        # Basic tags
        kv_section.add_tag("network.email.address", header['from'].strip())
        for to in header['to']:
            kv_section.add_tag("network.email.address", to)
        kv_section.add_tag("network.email.date", str(header['date']).strip())
        kv_section.add_tag("network.email.subject", header['subject'].strip())

        # Add CCs to body and tags (this previously iterated the 'to' list by mistake)
        if 'cc' in header:
            for cc in header['cc']:
                kv_section.add_tag("network.email.address", cc.strip())

        # Add Message ID to body and tags
        if 'message-id' in header['header']:
            kv_section.add_tag("network.email.msg_id",
                               header['header']['message-id'][0].strip())

        # Add Tags for received IPs
        if 'received_ip' in header:
            for ip in header['received_ip']:
                kv_section.add_tag('network.static.ip', ip.strip())

        # Add Tags for received Domains
        if 'received_domain' in header:
            for dom in header['received_domain']:
                kv_section.add_tag('network.static.domain', dom.strip())

        # If we've found URIs, add them to a section
        if len(all_uri) > 0:
            uri_section = ResultSection('URIs Found:', parent=result)
            for uri in all_uri:
                uri_section.add_line(uri)
                uri_section.add_tag('network.static.uri', uri.strip())
                parsed_url = urlparse(uri)
                # Only tag when a hostname exists (avoids tagging None as a domain)
                if parsed_url.hostname:
                    if re.match(IP_ONLY_REGEX, parsed_url.hostname):
                        uri_section.add_tag('network.static.ip', parsed_url.hostname)
                    else:
                        uri_section.add_tag('network.static.domain', parsed_url.hostname)

        # Bring all headers together...
        extra_header = header.pop('header', {})
        header.pop('received', None)
        header.update(extra_header)

        kv_section.body = json.dumps(header, default=self.json_serial)

        if "attachment" in parsed_eml:
            for attachment in parsed_eml['attachment']:
                fd, path = mkstemp()
                with open(path, 'wb') as f:
                    f.write(base64.b64decode(attachment['raw']))
                    os.close(fd)
                request.add_extracted(path, attachment['filename'], "Attachment")
            ResultSection('Extracted Attachments:',
                          body="\n".join([x['filename'] for x in parsed_eml['attachment']]),
                          parent=result)

        if request.get_param('save_emlparser_output'):
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "w") as myfile:
                myfile.write(json.dumps(parsed_eml, default=self.json_serial))
            request.add_supplementary(
                temp_path, "parsing.json",
                "These are the raw results of running GOVCERT-LU's eml_parser")
    else:
        text_section = ResultSection('EML parsing results')
        text_section.add_line("Could not parse EML")
        result.add_section(text_section)

    request.result = result
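# Illustrative only: json.dumps(..., default=self.json_serial) above needs a fallback
# serializer for the non-JSON types eml_parser emits (datetimes, bytes). A typical
# implementation of that helper:
import datetime

def json_serial(obj):
    """Fallback serializer for json.dumps."""
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()
    if isinstance(obj, bytes):
        return obj.decode("utf-8", errors="replace")
    raise TypeError(f"Type {type(obj)} is not JSON serializable")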
def execute(self, request): """Main Module. See README for details.""" if not self.rules: return request.set_service_context( f"{self.name} version: {self.get_tool_version()}") self.deep_scan = request.task.deep_scan local_filename = request.file_path tags = { f"al_{k.replace('.', '_')}": i for k, i in request.task.tags.items() } yara_externals = {} for k in self.yara_externals.keys(): # Check default request.task fields sval = request.task.__dict__.get(k, None) # if not sval: # # Check metadata dictionary # sval = request.task.metadata.get(k, None) if not sval: # Check params dictionary sval = request.task.service_config.get(k, None) if not sval: # Check tags list val_list = tags.get(k, None) if val_list: sval = " | ".join(val_list) if not sval: # Check temp submission data sval = request.task.temp_submission_data.get(k, None) # Normalize unicode with safe_str and make sure everything else is a string if sval: yara_externals[k] = safe_str(sval) with self.initialization_lock: try: matches = self.rules.match(local_filename, externals=yara_externals) request.result = self._extract_result_from_matches(matches) except Exception as e: # Internal error 30 == exceeded max string matches on rule if e != "internal error: 30": raise else: try: # Fast mode == Yara skips strings already found matches = self.rules.match(local_filename, externals=yara_externals, fast=True) result = self._extract_result_from_matches(matches) section = ResultSection("Service Warnings", parent=result) section.add_line( "Too many matches detected with current ruleset. " f"{self.name} forced to scan in fast mode.") request.result = result except Exception: self.log.warning( f"YARA internal error 30 detected on submission {request.task.sid}" ) result = Result() section = ResultSection( f"{self.name} scan not completed.", parent=result) section.add_line( "File returned too many matches with current rule set and YARA exited." ) request.result = result
def execute(self, request):
    # ==================================================================
    # Execute a request:
    #   Every time your service receives a new file to scan, the execute function is called
    #   This is where you should execute your processing code.
    #   For the purpose of this example, we will only generate results ...

    # You should run your code here...

    # ==================================================================
    # Check if we're scanning an embedded file
    #   This service always drops 3 embedded files: two generate random results and the other empty results
    #   We're making a check to see if we're scanning the embedded file.
    #   In a normal service this is not something you would do at all but since we are using this
    #   service in our unit test to test all features of our report generator, we have to do this
    if request.sha256 not in ['d729ecfb2cf40bc4af8038dac609a57f57dbe6515d35357af973677d5e66417a',
                              '5ce5ae8ef56a54af2c44415800a81ecffd49a33ae8895dfe38fc1075d3f619ec',
                              'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06']:
        # Main file results...

        # ==================================================================
        # Write the results:
        #   First, create a result object where all the result sections will be saved to
        result = Result()

        # ==================================================================
        # Standard text section: BODY_FORMAT.TEXT - DEFAULT
        #   Text sections basically just dump the text to the screen...
        #     All section scores will be SUMed in the service result
        #     The Result classification will be the highest classification found in the sections
        text_section = ResultSection('Example of a default section')
        # You can add lines to your section one at a time
        #   Here we will generate a random line
        text_section.add_line(get_random_phrase())
        # Or you can add them from a list
        #   Here we will generate a random number of random lines
        text_section.add_lines([get_random_phrase() for _ in range(random.randint(1, 5))])
        # If the section needs to affect the score of the file you need to set a heuristic
        #   Here we will pick one at random
        #   In addition to adding a heuristic, we will associate a signature with the heuristic;
        #   we're doing this by adding the signature name to the heuristic. (Here we're generating a random name)
        text_section.set_heuristic(3, signature="sig_one")
        # You can attach attack ids to heuristics after they were defined
        text_section.heuristic.add_attack_id("T1066")
        # Same thing for the signatures: they can be added to the heuristic after the fact and you can even say how
        # many times the signature fired by setting its frequency. If you call add_signature_id twice with the
        # same signature, this will effectively increase the frequency of the signature.
        text_section.heuristic.add_signature_id("sig_two", score=20, frequency=2)
        text_section.heuristic.add_signature_id("sig_two", score=20, frequency=3)
        text_section.heuristic.add_signature_id("sig_three")
        text_section.heuristic.add_signature_id("sig_three")
        text_section.heuristic.add_signature_id("sig_four", score=0)
        # The heuristic for text_section should have the following properties
        #   1. 1 attack ID: T1066
        #   2. 4 signatures: sig_one, sig_two, sig_three and sig_four
        #   3. Signature frequencies are cumulative, therefore they will be as follows:
        #       - sig_one = 1
        #       - sig_two = 5
        #       - sig_three = 2
        #       - sig_four = 1
        #   4. The score used by each heuristic is driven by the following rules: signature_score_map has the
        #      highest priority, then the score value passed to add_signature_id, and finally the default
        #      heuristic score. Therefore the scores used to calculate the total score for the text_section
        #      are as follows:
        #       - sig_one: 10 -> heuristic default score
        #       - sig_two: 20 -> score provided by the function add_signature_id
        #       - sig_three: 30 -> score provided by the heuristic map
        #       - sig_four: 40 -> score provided by the heuristic map because it's higher priority than the
        #                         function score
        #   5. Total section score is then: 1x10 + 5x20 + 2x30 + 1x40 = 210
        #      (a standalone sketch of this arithmetic follows this function)
        # Make sure you add your section to the result
        result.add_section(text_section)

        # ==================================================================
        # Color map Section: BODY_FORMAT.GRAPH_DATA
        #   Creates a color map bar using a minimum and maximum domain
        #   e.g. We are using this section to display the entropy distribution in some services
        cmap_min = 0
        cmap_max = 20
        color_map_data = {
            'type': 'colormap',
            'data': {
                'domain': [cmap_min, cmap_max],
                'values': [random.random() * cmap_max for _ in range(50)]
            }
        }
        # The classification of a section can be set to any valid classification for your system
        section_color_map = ResultSection("Example of colormap result section",
                                          body_format=BODY_FORMAT.GRAPH_DATA,
                                          body=json.dumps(color_map_data),
                                          classification=cl_engine.RESTRICTED)
        result.add_section(section_color_map)

        # ==================================================================
        # URL section: BODY_FORMAT.URL
        #   Generate a list of clickable urls using a json encoded format
        #     As you can see here, the body of the section can be set directly instead of line by line
        random_host = get_random_host()
        url_section = ResultSection('Example of a simple url section',
                                    body_format=BODY_FORMAT.URL,
                                    body=json.dumps({"name": "Random url!",
                                                     "url": f"https://{random_host}/"}))

        # Since urls are very important features, we can tag those features in the system so they are easy to find
        #   Tags are defined by a type and a value
        url_section.add_tag("network.static.domain", random_host)

        # You may also want to provide a list of urls!
        #   Also, no need to provide a name; the url link will be displayed
        host1 = get_random_host()
        host2 = get_random_host()
        ip1 = get_random_ip()
        ip2 = get_random_ip()
        ip3 = get_random_ip()
        urls = [
            {"url": f"https://{host1}/"},
            {"url": f"https://{host2}/"},
            {"url": f"https://{ip1}/"},
            {"url": f"https://{ip2}/"},
            {"url": f"https://{ip3}/"}]

        # A heuristic can fire more than once without being associated to a signature
        url_heuristic = Heuristic(4, frequency=len(urls))

        url_sub_section = ResultSection('Example of a url section with multiple links',
                                        body=json.dumps(urls),
                                        body_format=BODY_FORMAT.URL,
                                        heuristic=url_heuristic)
        url_sub_section.add_tag("network.static.ip", ip1)
        url_sub_section.add_tag("network.static.ip", ip2)
        url_sub_section.add_tag("network.static.ip", ip3)
        url_sub_section.add_tag("network.static.domain", host1)
        url_sub_section.add_tag("network.dynamic.domain", host2)
        # Since url_sub_section is a sub-section of url_section,
        # we will add it as a sub-section of url_section, not to the main result itself
        url_section.add_subsection(url_sub_section)
        result.add_section(url_section)

        # ==================================================================
        # Memory dump section: BODY_FORMAT.MEMORY_DUMP
        #   Dump whatever string content you have into a <pre/> html tag so you can do your own formatting
        data = hexdump(b"This is some random text that we will format as an hexdump and you'll see "
                       b"that the hexdump formatting will be preserved by the memory dump section!")
        memdump_section = ResultSection('Example of a memory dump section',
                                        body_format=BODY_FORMAT.MEMORY_DUMP,
                                        body=data)
        memdump_section.set_heuristic(random.randint(1, 4))
        result.add_section(memdump_section)

        # ==================================================================
        # KEY_VALUE section:
        #   This section allows the service writer to list a bunch of key/value pairs to be displayed in the UI
        #   while also providing easy to parse data for automated tools.
        #   NB: You should definitely use this over a JSON body type since this one will be displayed correctly
        #       in the UI for the user
        #   The body argument must be a json dumps of a dictionary (only str, int, and booleans are allowed)
        kv_body = {
            "a_str": "Some string",
            "a_bool": False,
            "an_int": 102,
        }
        kv_section = ResultSection('Example of a KEY_VALUE section',
                                   body_format=BODY_FORMAT.KEY_VALUE,
                                   body=json.dumps(kv_body))
        result.add_section(kv_section)

        # ==================================================================
        # JSON section:
        #   Re-use the JSON editor we use for administration (https://github.com/josdejong/jsoneditor)
        #   to display a tree view of JSON results.
        #   NB: Use this sparingly! As a service developer you should do your best to include important
        #       results as their own result sections.
        #   The body argument must be a json dump of a python dictionary
        json_body = {
            "a_str": "Some string",
            "a_list": ["a", "b", "c"],
            "a_bool": False,
            "an_int": 102,
            "a_dict": {
                "list_of_dict": [
                    {"d1_key": "val", "d1_key2": "val2"},
                    {"d2_key": "val", "d2_key2": "val2"}
                ],
                "bool": True
            }
        }
        json_section = ResultSection('Example of a JSON section',
                                     body_format=BODY_FORMAT.JSON,
                                     body=json.dumps(json_body))
        result.add_section(json_section)

        # ==================================================================
        # PROCESS_TREE section:
        #   This section allows the service writer to list a bunch of dictionary objects that have nested lists
        #   of dictionaries to be displayed in the UI. Each dictionary object represents a process, and therefore
        #   each dictionary must be of the following format:
        #   {
        #     "process_pid": int,
        #     "process_name": str,
        #     "command_line": str,
        #     "children": []    NB: This list either is empty or contains more dictionaries that have the same
        #                           structure
        #   }
        nc_body = [
            {
                "process_pid": 123,
                "process_name": "evil.exe",
                "command_line": "C:\\evil.exe",
                "signatures": {},
                "children": [
                    {
                        "process_pid": 321,
                        "process_name": "takeovercomputer.exe",
                        "command_line": "C:\\Temp\\takeovercomputer.exe -f do_bad_stuff",
                        "signatures": {"one": 250},
                        "children": [
                            {
                                "process_pid": 456,
                                "process_name": "evenworsethanbefore.exe",
                                "command_line": "C:\\Temp\\evenworsethanbefore.exe -f change_reg_key_cuz_im_bad",
                                "signatures": {"one": 10, "two": 10, "three": 10},
                                "children": []
                            },
                            {
                                "process_pid": 234,
                                "process_name": "badfile.exe",
                                "command_line": "C:\\badfile.exe -k nothing_to_see_here",
                                "signatures": {"one": 1000, "two": 10, "three": 10, "four": 10, "five": 10},
                                "children": []
                            }
                        ]
                    },
                    {
                        "process_pid": 345,
                        "process_name": "benignexe.exe",
                        "command_line": "C:\\benignexe.exe -f \"just kidding, i'm evil\"",
                        "signatures": {"one": 2000},
                        "children": []
                    }
                ]
            },
            {
                "process_pid": 987,
                "process_name": "runzeroday.exe",
                "command_line": "C:\\runzeroday.exe -f insert_bad_spelling",
                "signatures": {},
                "children": []
            }
        ]
        nc_section = ResultSection('Example of a PROCESS_TREE section',
                                   body_format=BODY_FORMAT.PROCESS_TREE,
                                   body=json.dumps(nc_body))
        result.add_section(nc_section)

        # ==================================================================
        # TABLE section:
        #   This section allows the service writer to have their content displayed in a table format in the UI
        #   The body argument must be a list [] of dict {} objects. A dict object can have a key value pair
        #   where the value is a flat nested dictionary, and this nested dictionary will be displayed as a nested
        #   table within a cell.
        table_body = [
            {
                "a_str": "Some string1",
                "extra_column_here": "confirmed",
                "a_bool": False,
                "an_int": 101,
            },
            {
                "a_str": "Some string2",
                "a_bool": True,
                "an_int": 102,
            },
            {
                "a_str": "Some string3",
                "a_bool": False,
                "an_int": 103,
            },
            {
                "a_str": "Some string4",
                "a_bool": None,
                "an_int": -1000000000000000000,
                "extra_column_there": "confirmed",
                "nested_table": {
                    "a_str": "Some string3",
                    "a_bool": False,
                    "nested_table_thats_too_deep": {
                        "a_str": "Some string3",
                        "a_bool": False,
                        "an_int": 103,
                    },
                },
            },
        ]
        table_section = ResultSection('Example of a TABLE section',
                                      body_format=BODY_FORMAT.TABLE,
                                      body=json.dumps(table_body))
        result.add_section(table_section)

        # ==================================================================
        # Re-Submitting files to the system
        #   Adding extracted files will have them resubmitted to the system for analysis

        # This file will generate random results on the next run
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "wb") as myfile:
            myfile.write(data.encode())
        request.add_extracted(temp_path, "file.txt", "Extracted by some magic!")

        # Embedded files can also have their own classification!
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "wb") as myfile:
            myfile.write(b"CLASSIFIED!!!__" + data.encode())
        request.add_extracted(temp_path, "classified.doc", "Classified file ... don't look",
                              classification=cl_engine.RESTRICTED)

        # This file will generate empty results on the next run
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "wb") as myfile:
            myfile.write(b"EMPTY")
        request.add_extracted(temp_path, "empty.txt", "Extracted empty resulting file")

        # ==================================================================
        # Supplementary files
        #   Adding supplementary files will save them on the datastore for future
        #   reference but won't reprocess those files.
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "w") as myfile:
            myfile.write(json.dumps(urls))
        request.add_supplementary(temp_path, "urls.json", "These are urls as a JSON file")
        # Like embedded files, you can add more than one supplementary file
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "w") as myfile:
            myfile.write(json.dumps(json_body))
        request.add_supplementary(temp_path, "json_body.json", "This is the json_body as a JSON file")

        # ==================================================================
        # Wrap-up:
        #   Save your result object back into the request
        request.result = result

    # ==================================================================
    # Empty results file
    elif request.sha256 == 'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06':
        # Creating an empty result object
        request.result = Result()

    # ==================================================================
    # Randomized results file
    else:
        # For the randomized results file, we will completely randomize the results
        #   The content of those results does not matter since we've already shown you
        #   all the different result sections, tagging, heuristics and file upload functions
        embedded_result = Result()

        # random number of sections
        for _ in range(1, 3):
            embedded_result.add_section(self._create_random_section())

        request.result = embedded_result
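# --- Example (not part of the service above) ---
# A standalone sketch of the signature scoring arithmetic described in the
# comment block above. The helper below is hypothetical (the real computation
# happens server side); it only restates the documented precedence:
# signature_score_map > add_signature_id score > heuristic default score.
def section_score(signatures, heur_default, signature_score_map):
    total = 0
    for name, (kwarg_score, frequency) in signatures.items():
        if name in signature_score_map:
            score = signature_score_map[name]
        elif kwarg_score is not None:
            score = kwarg_score
        else:
            score = heur_default
        total += score * frequency
    return total

sigs = {
    "sig_one": (None, 1),    # no score given -> heuristic default (10)
    "sig_two": (20, 5),      # score passed to add_signature_id
    "sig_three": (None, 2),  # overridden by the heuristic score map (30)
    "sig_four": (0, 1),      # score map (40) outranks the kwarg score (0)
}
print(section_score(sigs, heur_default=10,
                    signature_score_map={"sig_three": 30, "sig_four": 40}))  # 210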
def execute(self, request):
    # ==================================================================
    # Execute a request:
    #   Every time your service receives a new file to scan, the execute function is called
    #   This is where you should execute your processing code.
    #   For the purpose of this example, we will only generate results ...

    # You should run your code here...

    # ==================================================================
    # Check if we're scanning an embedded file
    #   This service always drops two embedded files: one generates random results and the other empty results
    #   We're making a check to see if we're scanning the embedded file.
    #   In a normal service this is not something you would do at all but since we are using this
    #   service in our unit test to test all features of our report generator, we have to do this
    if request.sha256 not in [
            'd729ecfb2cf40bc4af8038dac609a57f57dbe6515d35357af973677d5e66417a',
            'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06'
    ]:
        # Main file results...

        # ==================================================================
        # Write the results:
        #   First, create a result object where all the result sections will be saved to
        result = Result()

        # ==================================================================
        # Standard text section: BODY_FORMAT.TEXT - DEFAULT
        #   Text sections basically just dump the text to the screen...
        #     All section scores will be SUMed in the service result
        #     The Result classification will be the highest classification found in the sections
        text_section = ResultSection('Example of a default section')
        # You can add lines to your section one at a time
        #   Here we will generate a random line
        text_section.add_line(get_random_phrase())
        # Or you can add them from a list
        #   Here we will generate a random number of random lines
        text_section.add_lines(
            [get_random_phrase() for _ in range(random.randint(1, 5))])
        # If the section needs to affect the score of the file you need to set a heuristic
        #   Here we will pick one at random
        #   In addition to adding a heuristic, we will associate a signature with the heuristic;
        #   we're doing this by adding the signature name to the heuristic. (Here we're generating a random name)
        text_section.set_heuristic(random.randint(1, 4),
                                   signature=get_random_phrase(1, 4).lower().replace(" ", "_"))
        # Make sure you add your section to the result
        result.add_section(text_section)

        # ==================================================================
        # Color map Section: BODY_FORMAT.GRAPH_DATA
        #   Creates a color map bar using a minimum and maximum domain
        #   e.g. We are using this section to display the entropy distribution in some services
        cmap_min = 0
        cmap_max = 20
        color_map_data = {
            'type': 'colormap',
            'data': {
                'domain': [cmap_min, cmap_max],
                'values': [random.random() * cmap_max for _ in range(50)]
            }
        }
        section_color_map = ResultSection("Example of colormap result section",
                                          body_format=BODY_FORMAT.GRAPH_DATA,
                                          body=json.dumps(color_map_data))
        result.add_section(section_color_map)

        # ==================================================================
        # URL section: BODY_FORMAT.URL
        #   Generate a list of clickable urls using a json encoded format
        #     As you can see here, the body of the section can be set directly instead of line by line
        random_host = get_random_host()
        url_section = ResultSection('Example of a simple url section',
                                    body_format=BODY_FORMAT.URL,
                                    body=json.dumps({
                                        "name": "Random url!",
                                        "url": f"https://{random_host}/"
                                    }))

        # Since urls are very important features, we can tag those features in the system so they are easy to find
        #   Tags are defined by a type and a value
        url_section.add_tag("network.static.domain", random_host)

        # You may also want to provide a list of urls!
        #   Also, no need to provide a name; the url link will be displayed
        host1 = get_random_host()
        host2 = get_random_host()
        ip1 = get_random_ip()
        urls = [{
            "url": f"https://{host1}/"
        }, {
            "url": f"https://{host2}/"
        }, {
            "url": f"https://{ip1}/"
        }]
        url_sub_section = ResultSection('Example of a url section with multiple links',
                                        body_format=BODY_FORMAT.URL,
                                        body=json.dumps(urls))
        url_sub_section.set_heuristic(random.randint(1, 4))
        url_sub_section.add_tag("network.static.ip", ip1)
        url_sub_section.add_tag("network.static.domain", host1)
        url_sub_section.add_tag("network.dynamic.domain", host2)
        # Since url_sub_section is a sub-section of url_section,
        # we will add it as a sub-section of url_section, not to the main result itself
        url_section.add_subsection(url_sub_section)
        result.add_section(url_section)

        # ==================================================================
        # Memory dump section: BODY_FORMAT.MEMORY_DUMP
        #   Dump whatever string content you have into a <pre/> html tag so you can do your own formatting
        data = hexdump(
            b"This is some random text that we will format as an hexdump and you'll see "
            b"that the hexdump formatting will be preserved by the memory dump section!")
        memdump_section = ResultSection('Example of a memory dump section',
                                        body_format=BODY_FORMAT.MEMORY_DUMP,
                                        body=data)
        memdump_section.set_heuristic(random.randint(1, 4))
        result.add_section(memdump_section)

        # ==================================================================
        # KEY_VALUE section:
        #   This section allows the service writer to list a bunch of key/value pairs to be displayed in the UI
        #   while also providing easy to parse data for automated tools.
        #   NB: You should definitely use this over a JSON body type since this one will be displayed correctly
        #       in the UI for the user
        #   The body argument must be a json dumps of a dictionary (only str, int, and booleans are allowed)
        kv_body = {
            "a_str": "Some string",
            "a_bool": False,
            "an_int": 102,
        }
        kv_section = ResultSection('Example of a KEY_VALUE section',
                                   body_format=BODY_FORMAT.KEY_VALUE,
                                   body=json.dumps(kv_body))
        result.add_section(kv_section)

        # ==================================================================
        # JSON section:
        #   Re-use the JSON editor we use for administration (https://github.com/josdejong/jsoneditor)
        #   to display a tree view of JSON results.
        #   NB: Use this sparingly! As a service developer you should do your best to include important
        #       results as their own result sections.
        #   The body argument must be a json dump of a python dictionary
        json_body = {
            "a_str": "Some string",
            "a_list": ["a", "b", "c"],
            "a_bool": False,
            "an_int": 102,
            "a_dict": {
                "list_of_dict": [{
                    "d1_key": "val",
                    "d1_key2": "val2"
                }, {
                    "d2_key": "val",
                    "d2_key2": "val2"
                }],
                "bool": True
            }
        }
        json_section = ResultSection('Example of a JSON section',
                                     body_format=BODY_FORMAT.JSON,
                                     body=json.dumps(json_body))
        result.add_section(json_section)

        # ==================================================================
        # Re-Submitting files to the system
        #   Adding extracted files will have them resubmitted to the system for analysis
        #   (a standalone sketch of the temp-file pattern used here follows this function)

        # This file will generate random results on the next run
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "wb") as myfile:
            myfile.write(data.encode())
        request.add_extracted(temp_path, "file.txt", "Extracted by some magic!")

        # This file will generate empty results on the next run
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "wb") as myfile:
            myfile.write(b"EMPTY")
        request.add_extracted(temp_path, "empty.txt", "Extracted empty resulting file")

        # ==================================================================
        # Supplementary files
        #   Adding supplementary files will save them on the datastore for future
        #   reference but won't reprocess those files.
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "w") as myfile:
            myfile.write(json.dumps(urls))
        request.add_supplementary(temp_path, "urls.json", "These are urls as a JSON file")
        # Like embedded files, you can add more than one supplementary file
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "w") as myfile:
            myfile.write(json.dumps(json_body))
        request.add_supplementary(temp_path, "json_body.json", "This is the json_body as a JSON file")

        # ==================================================================
        # Wrap-up:
        #   Save your result object back into the request
        request.result = result

    # ==================================================================
    # Empty results file
    elif request.sha256 == 'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06':
        # Creating an empty result object
        request.result = Result()

    # ==================================================================
    # Randomized results file
    else:
        # For the randomized results file, we will completely randomize the results
        #   The content of those results does not matter since we've already shown you
        #   all the different result sections, tagging, heuristics and file upload functions
        embedded_result = Result()

        # random number of sections
        for _ in range(1, 3):
            embedded_result.add_section(self._create_random_section())

        request.result = embedded_result
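# --- Example (not part of the service above) ---
# The mkstemp/fdopen pattern above is used for every extracted and
# supplementary file. A minimal standalone version of it (the directory
# argument is hypothetical; services pass self.working_directory):
import os
import tempfile

def write_temp_file(working_directory: str, payload: bytes) -> str:
    fd, temp_path = tempfile.mkstemp(dir=working_directory)
    # os.fdopen takes ownership of the descriptor, so the with-block closes it
    with os.fdopen(fd, "wb") as fh:
        fh.write(payload)
    return temp_path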
def peepdf_analysis(self, temp_filename, file_content, request):
    file_res = Result()
    try:
        res_list = []
        # js_stream = []
        f_list = []
        js_dump = []

        pdf_parser = PDFParser()
        ret, pdf_file = pdf_parser.parse(temp_filename, True, False, file_content)
        if ret == 0:
            stats_dict = pdf_file.getStats()

            if ", ".join(stats_dict['Errors']) == "Bad PDF header, %%EOF not found, PDF sections not found, No " \
                                                  "indirect objects found in the body":
                # Not a PDF
                return

            json_body = dict(
                version=stats_dict['Version'],
                binary=stats_dict['Binary'],
                linearized=stats_dict['Linearized'],
                encrypted=stats_dict['Encrypted'],
            )
            if stats_dict['Encryption Algorithms']:
                temp = []
                for algorithmInfo in stats_dict['Encryption Algorithms']:
                    temp.append(f"{algorithmInfo[0]} {str(algorithmInfo[1])} bits")
                json_body["encryption_algorithms"] = temp

            json_body.update(dict(
                updates=stats_dict['Updates'],
                objects=stats_dict['Objects'],
                streams=stats_dict['Streams'],
                comments=stats_dict['Comments'],
                errors={True: ", ".join(stats_dict['Errors']),
                        False: "None"}[len(stats_dict['Errors']) != 0]
            ))
            res = ResultSection("PDF File Information",
                                body_format=BODY_FORMAT.KEY_VALUE,
                                body=json.dumps(json_body))

            for version in range(len(stats_dict['Versions'])):
                stats_version = stats_dict['Versions'][version]
                v_json_body = dict(
                    catalog=stats_version['Catalog'] or "no",
                    info=stats_version['Info'] or "no",
                    objects=self.list_first_x(stats_version['Objects'][1]),
                )

                if stats_version['Compressed Objects'] is not None:
                    v_json_body['compressed_objects'] = self.list_first_x(stats_version['Compressed Objects'][1])

                if stats_version['Errors'] is not None:
                    v_json_body['errors'] = self.list_first_x(stats_version['Errors'][1])

                v_json_body['streams'] = self.list_first_x(stats_version['Streams'][1])

                if stats_version['Xref Streams'] is not None:
                    v_json_body['xref_streams'] = self.list_first_x(stats_version['Xref Streams'][1])

                if stats_version['Object Streams'] is not None:
                    v_json_body['object_streams'] = self.list_first_x(stats_version['Object Streams'][1])

                if int(stats_version['Streams'][0]) > 0:
                    v_json_body['encoded'] = self.list_first_x(stats_version['Encoded'][1])
                    if stats_version['Decoding Errors'] is not None:
                        v_json_body['decoding_errors'] = self.list_first_x(stats_version['Decoding Errors'][1])

                if stats_version['Objects with JS code'] is not None:
                    v_json_body['objects_with_js_code'] = \
                        self.list_first_x(stats_version['Objects with JS code'][1])
                    # js_stream.extend(stats_version['Objects with JS code'][1])

                res_version = ResultSection(f"Version {str(version)}", parent=res,
                                            body_format=BODY_FORMAT.KEY_VALUE,
                                            body=json.dumps(v_json_body))

                actions = stats_version['Actions']
                events = stats_version['Events']
                vulns = stats_version['Vulns']
                elements = stats_version['Elements']
                is_suspicious = False
                if events is not None or actions is not None or vulns is not None or elements is not None:
                    res_suspicious = ResultSection('Suspicious elements', parent=res_version)
                    if events is not None:
                        for event in events:
                            res_suspicious.add_line(f"{event}: {self.list_first_x(events[event])}")
                        is_suspicious = True
                    if actions is not None:
                        for action in actions:
                            res_suspicious.add_line(f"{action}: {self.list_first_x(actions[action])}")
                        is_suspicious = True
                    if vulns is not None:
                        for vuln in vulns:
                            if vuln in vulnsDict:
                                temp = [vuln, ' (']
                                for vulnCVE in vulnsDict[vuln]:
                                    if len(temp) != 2:
                                        temp.append(',')
                                    vulnCVE = "".join(vulnCVE) if isinstance(vulnCVE, list) else vulnCVE
                                    temp.append(vulnCVE)
                                    cve_found = re.search("CVE-[0-9]{4}-[0-9]{4}", vulnCVE)
                                    if cve_found:
                                        res_suspicious.add_tag('attribution.exploit',
                                                               vulnCVE[cve_found.start():cve_found.end()])
                                        res_suspicious.add_tag('file.behavior',
                                                               vulnCVE[cve_found.start():cve_found.end()])
                                temp.append('): ')
                                temp.append(str(vulns[vuln]))
                                res_suspicious.add_line(temp)
                            else:
                                res_suspicious.add_line(f"{vuln}: {str(vulns[vuln])}")
                            is_suspicious = True
                    if elements is not None:
                        for element in elements:
                            if element in vulnsDict:
                                temp = [element, ' (']
                                for vulnCVE in vulnsDict[element]:
                                    if len(temp) != 2:
                                        temp.append(',')
                                    vulnCVE = "".join(vulnCVE) if isinstance(vulnCVE, list) else vulnCVE
                                    temp.append(vulnCVE)
                                    cve_found = re.search("CVE-[0-9]{4}-[0-9]{4}", vulnCVE)
                                    if cve_found:
                                        res_suspicious.add_tag('attribution.exploit',
                                                               vulnCVE[cve_found.start():cve_found.end()])
                                        res_suspicious.add_tag('file.behavior',
                                                               vulnCVE[cve_found.start():cve_found.end()])
                                temp.append('): ')
                                temp.append(str(elements[element]))
                                res_suspicious.add_line(temp)
                                is_suspicious = True
                            else:
                                res_suspicious.add_line(f"\t\t{element}: {str(elements[element])}")
                                is_suspicious = True
                    if is_suspicious:
                        res_suspicious.set_heuristic(8)

                urls = stats_version['URLs']
                if urls is not None:
                    res.add_line("")
                    res_url = ResultSection('Found URLs', parent=res)
                    for url in urls:
                        res_url.add_line(f"\t\t{url}")
                    res_url.set_heuristic(9)

                for obj in stats_version['Objects'][1]:
                    cur_obj = pdf_file.getObject(obj, version)

                    if cur_obj.containsJScode:
                        cur_res = ResultSection(f"Object [{obj} {version}] contains {len(cur_obj.JSCode)} "
                                                f"block(s) of JavaScript")
                        score_modifier = 0

                        js_idx = 0
                        for js in cur_obj.JSCode:
                            sub_res = ResultSection('Block of JavaScript', parent=cur_res)
                            js_idx += 1
                            js_score = 0
                            js_code, unescaped_bytes, _, _, _ = analyseJS(js)

                            js_dump += [x for x in js_code]

                            # Malicious characteristics
                            # (the original scored len == 1 and len > 0 separately, which double-counted
                            # the single-buffer case; one check per buffer list is intended)
                            big_buffs = self.get_big_buffs("".join(js_code))
                            if len(big_buffs) > 0:
                                js_score += 500 * len(big_buffs)
                            has_eval, has_unescape = self.check_dangerous_func("".join(js_code))
                            if has_unescape:
                                js_score += 100
                            if has_eval:
                                js_score += 100

                            js_cmt = ""
                            if has_eval or has_unescape or len(big_buffs) > 0:
                                score_modifier += js_score
                                js_cmt = "Suspiciously malicious "
                                cur_res.add_tag('file.behavior', "Suspicious JavaScript in PDF")
                                sub_res.set_heuristic(7)
                            js_res = ResultSection(f"{js_cmt}JavaScript Code (block: {js_idx})", parent=sub_res)

                            if js_score > 0:
                                temp_js_outname = f"object{obj}-{version}_{js_idx}.js"
                                temp_js_path = os.path.join(self.working_directory, temp_js_outname)
                                temp_js_bin = "".join(js_code).encode("utf-8")
                                f = open(temp_js_path, "wb")
                                f.write(temp_js_bin)
                                f.close()
                                f_list.append(temp_js_path)
                                js_res.add_line(f"The JavaScript block was saved as {temp_js_outname}")

                                if has_eval or has_unescape:
                                    analysis_res = ResultSection("[Suspicious Functions]", parent=js_res)
                                    if has_eval:
                                        analysis_res.add_line("eval: This JavaScript block uses the eval() "
                                                              "function, which is often used to launch "
                                                              "deobfuscated JavaScript code.")
                                        analysis_res.set_heuristic(3)
                                    if has_unescape:
                                        analysis_res.add_line("unescape: This JavaScript block uses the unescape() "
                                                              "function. It may be legitimate but it is definitely "
                                                              "suspicious since malware often uses this to "
                                                              "deobfuscate code blocks.")
                                        analysis_res.set_heuristic(3)

                                buff_idx = 0
                                for buff in big_buffs:
                                    buff_idx += 1
                                    error, new_buff = unescape(buff)
                                    if error == 0:
                                        buff = new_buff

                                    if buff not in unescaped_bytes:
                                        temp_path_name = None
                                        # (a standalone sketch of this base64 carving follows this function)
                                        if ";base64," in buff[:100] and "data:" in buff[:100]:
                                            temp_path_name = f"obj{obj}_unb64_{buff_idx}.buff"
                                            try:
                                                buff = b64decode(buff.split(";base64,")[1].strip())
                                                temp_path = os.path.join(self.working_directory, temp_path_name)
                                                f = open(temp_path, "wb")
                                                f.write(buff)
                                                f.close()
                                                f_list.append(temp_path)
                                            except Exception:
                                                self.log.error("Found 'data:;base64, ' buffer "
                                                               "but failed to base64 decode.")
                                                temp_path_name = None

                                        if temp_path_name is not None:
                                            buff_cond = f" and was resubmitted as {temp_path_name}"
                                        else:
                                            buff_cond = ""
                                        # buff may be str or (after base64 decoding) bytes
                                        first_bytes = buff[:256] if isinstance(buff, bytes) \
                                            else bytes(buff[:256], "utf-8")
                                        buff_res = ResultSection(
                                            f"A {len(buff)} bytes buffer was found in the JavaScript "
                                            f"block{buff_cond}. Here are the first 256 bytes.",
                                            parent=js_res, body=hexdump(first_bytes),
                                            body_format=BODY_FORMAT.MEMORY_DUMP)
                                        buff_res.set_heuristic(2)

                            processed_sc = []
                            sc_idx = 0
                            for sc in unescaped_bytes:
                                if sc not in processed_sc:
                                    sc_idx += 1
                                    processed_sc.append(sc)
                                    try:
                                        sc = sc.decode("hex")
                                    except Exception:
                                        pass

                                    shell_score = 500
                                    temp_path_name = f"obj{obj}_unescaped_{sc_idx}.buff"

                                    shell_res = ResultSection(f"Unknown unescaped {len(sc)} bytes JavaScript "
                                                              f"buffer (id: {sc_idx}) was resubmitted as "
                                                              f"{temp_path_name}. Here are the first 256 bytes.",
                                                              parent=js_res)
                                    shell_res.set_body(hexdump(sc[:256]),
                                                       body_format=BODY_FORMAT.MEMORY_DUMP)

                                    temp_path = os.path.join(self.working_directory, temp_path_name)
                                    f = open(temp_path, "wb")
                                    f.write(sc)
                                    f.close()
                                    f_list.append(temp_path)

                                    cur_res.add_tag('file.behavior', "Unescaped JavaScript Buffer")
                                    shell_res.set_heuristic(6)
                                    score_modifier += shell_score

                        if score_modifier > 0:
                            res_list.append(cur_res)

                    elif cur_obj.type == "stream":
                        if cur_obj.isEncodedStream and cur_obj.filter is not None:
                            data = cur_obj.decodedStream
                            encoding = cur_obj.filter.value.replace("[", "").replace("]", "").replace("/", "").strip()
                            val = cur_obj.rawValue
                            otype = cur_obj.elements.get("/Type", None)
                            sub_type = cur_obj.elements.get("/Subtype", None)
                            length = cur_obj.elements.get("/Length", None)
                        else:
                            data = cur_obj.rawStream
                            encoding = None
                            val = cur_obj.rawValue
                            otype = cur_obj.elements.get("/Type", None)
                            sub_type = cur_obj.elements.get("/Subtype", None)
                            length = cur_obj.elements.get("/Length", None)

                        if otype:
                            otype = otype.value.replace("/", "").lower()
                        if sub_type:
                            sub_type = sub_type.value.replace("/", "").lower()
                        if length:
                            length = length.value

                        if otype == "embeddedfile":
                            if len(data) > 4096:
                                if encoding is not None:
                                    temp_encoding_str = f"_{encoding}"
                                else:
                                    temp_encoding_str = ""

                                cur_res = ResultSection(
                                    f'Embedded file found ({length} bytes) [obj: {obj} {version}] '
                                    f'and dumped for analysis {f"(Type: {otype}) " if otype is not None else ""}'
                                    f'{f"(SubType: {sub_type}) " if sub_type is not None else ""}'
                                    f'{f"(Encoded with {encoding})" if encoding is not None else ""}')

                                temp_path_name = f"EmbeddedFile_{obj}{temp_encoding_str}.obj"
                                temp_path = os.path.join(self.working_directory, temp_path_name)
                                f = open(temp_path, "wb")
                                f.write(data)
                                f.close()
                                f_list.append(temp_path)

                                cur_res.add_line(f"The EmbeddedFile object was saved as {temp_path_name}")
                                res_list.append(cur_res)

                        elif otype not in BANNED_TYPES:
                            cur_res = ResultSection(
                                f'Unknown stream found [obj: {obj} {version}] '
                                f'{f"(Type: {otype}) " if otype is not None else ""}'
                                f'{f"(SubType: {sub_type}) " if sub_type is not None else ""}'
                                f'{f"(Encoded with {encoding})" if encoding is not None else ""}')
                            for line in val.splitlines():
                                cur_res.add_line(line)

                            emb_res = ResultSection('First 256 bytes', parent=cur_res)
                            first_256 = data[:256]
                            if isinstance(first_256, str):
                                first_256 = first_256.encode()
                            emb_res.set_body(hexdump(first_256), BODY_FORMAT.MEMORY_DUMP)
                            res_list.append(cur_res)
                    else:
                        pass

            file_res.add_section(res)

            for results in res_list:
                file_res.add_section(results)

            if js_dump:
                js_dump_res = ResultSection('Full JavaScript dump')

                temp_js_dump = "javascript_dump.js"
                temp_js_dump_path = os.path.join(self.working_directory, temp_js_dump)
                try:
                    temp_js_dump_bin = "\n\n----\n\n".join(js_dump).encode("utf-8")
                except UnicodeDecodeError:
                    temp_js_dump_bin = "\n\n----\n\n".join(js_dump)
                temp_js_dump_sha1 = hashlib.sha1(temp_js_dump_bin).hexdigest()
                f = open(temp_js_dump_path, "wb")
                f.write(temp_js_dump_bin)
                f.flush()
                f.close()
                f_list.append(temp_js_dump_path)

                js_dump_res.add_line(f"The JavaScript dump was saved as {temp_js_dump}")
                js_dump_res.add_line(f"The SHA-1 for the JavaScript dump is {temp_js_dump_sha1}")

                js_dump_res.add_tag('file.pdf.javascript.sha1', temp_js_dump_sha1)
                file_res.add_section(js_dump_res)

            for filename in f_list:
                request.add_extracted(filename, os.path.basename(filename),
                                      f"Dumped from {os.path.basename(temp_filename)}")

        else:
            res = ResultSection("ERROR: Could not parse file with PeePDF.")
            file_res.add_section(res)
    finally:
        request.result = file_res
        try:
            del pdf_file
        except Exception:
            pass

        try:
            del pdf_parser
        except Exception:
            pass

        gc.collect()
def execute(self, request):
    # ==================================================================
    # Execute a request:
    #   Every time your service receives a new file to scan, the execute function is called
    #   This is where you should execute your processing code.
    #   For the purpose of this example, we will only generate results ...

    # You should run your code here...

    # ==================================================================
    # Check if we're scanning an embedded file
    #   This service always drops 3 embedded files: two generate random results and the other empty results
    #   We're making a check to see if we're scanning the embedded file.
    #   In a normal service this is not something you would do at all but since we are using this
    #   service in our unit test to test all features of our report generator, we have to do this
    if request.sha256 not in [
            'd729ecfb2cf40bc4af8038dac609a57f57dbe6515d35357af973677d5e66417a',
            '5ce5ae8ef56a54af2c44415800a81ecffd49a33ae8895dfe38fc1075d3f619ec',
            'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06'
    ]:
        # Main file results...

        # ==================================================================
        # Write the results:
        #   First, create a result object where all the result sections will be saved to
        result = Result()

        # ==================================================================
        # Standard text section: BODY_FORMAT.TEXT - DEFAULT
        #   Text sections basically just dump the text to the screen...
        #     All section scores will be SUMed in the service result
        #     The Result classification will be the highest classification found in the sections
        text_section = ResultTextSection('Example of a default section')
        # You can add lines to your section one at a time
        #   Here we will generate a random line
        text_section.add_line(get_random_phrase())
        # Or you can add them from a list
        #   Here we will generate a random number of random lines
        text_section.add_lines(
            [get_random_phrase() for _ in range(random.randint(1, 5))])
        # You can tag data to a section; tagging is used to quickly find defining information about a file
        text_section.add_tag("attribution.implant", "ResultSample")
        # If the section needs to affect the score of the file you need to set a heuristic
        #   Here we will pick one at random
        #   In addition to adding a heuristic, we will associate a signature with the heuristic;
        #   we're doing this by adding the signature name to the heuristic. (Here we're generating a random name)
        text_section.set_heuristic(3, signature="sig_one")
        # You can attach attack ids to heuristics after they were defined
        text_section.heuristic.add_attack_id(
            random.choice(list(software_map.keys())))
        text_section.heuristic.add_attack_id(
            random.choice(list(attack_map.keys())))
        text_section.heuristic.add_attack_id(
            random.choice(list(group_map.keys())))
        text_section.heuristic.add_attack_id(
            random.choice(list(revoke_map.keys())))
        # Same thing for the signatures: they can be added to the heuristic after the fact and you can even say how
        # many times the signature fired by setting its frequency. If you call add_signature_id twice with the
        # same signature, this will effectively increase the frequency of the signature.
        text_section.heuristic.add_signature_id("sig_two", score=20, frequency=2)
        text_section.heuristic.add_signature_id("sig_two", score=20, frequency=3)
        text_section.heuristic.add_signature_id("sig_three")
        text_section.heuristic.add_signature_id("sig_three")
        text_section.heuristic.add_signature_id("sig_four", score=0)
        # The heuristic for text_section should have the following properties
        #   1. 4 attack IDs: the ones randomly selected above
        #   2. 4 signatures: sig_one, sig_two, sig_three and sig_four
        #   3. Signature frequencies are cumulative, therefore they will be as follows:
        #       - sig_one = 1
        #       - sig_two = 5
        #       - sig_three = 2
        #       - sig_four = 1
        #   4. The score used by each heuristic is driven by the following rules: signature_score_map has the
        #      highest priority, then the score value passed to add_signature_id, and finally the default
        #      heuristic score. Therefore the scores used to calculate the total score for the text_section
        #      are as follows:
        #       - sig_one: 10 -> heuristic default score
        #       - sig_two: 20 -> score provided by the function add_signature_id
        #       - sig_three: 30 -> score provided by the heuristic map
        #       - sig_four: 40 -> score provided by the heuristic map because it's higher priority than the
        #                         function score
        #   5. Total section score is then: 1x10 + 5x20 + 2x30 + 1x40 = 210
        # Make sure you add your section to the result
        result.add_section(text_section)
        # Even if the section was added to the results you can still modify it, by adding a subsection for example
        ResultSection("Example of sub-section without a body added later in processing",
                      parent=text_section)

        # ==================================================================
        # Color map Section: BODY_FORMAT.GRAPH_DATA
        #   Creates a color map bar using a minimum and maximum domain
        #   e.g. We are using this section to display the entropy distribution in some services
        cmap_min = 0
        cmap_max = 20
        cmap_values = [random.random() * cmap_max for _ in range(50)]
        # The classification of a section can be set to any valid classification for your system
        section_color_map = ResultGraphSection("Example of colormap result section",
                                               classification=cl_engine.RESTRICTED)
        section_color_map.set_colormap(cmap_min, cmap_max, cmap_values)
        result.add_section(section_color_map)

        # ==================================================================
        # URL section: BODY_FORMAT.URL
        #   Generate a list of clickable urls using a json encoded format
        #     As you can see here, the body of the section can be set directly instead of line by line
        random_host = get_random_host()
        url_section = ResultURLSection('Example of a simple url section')
        url_section.add_url(f"https://{random_host}/", name="Random url!")

        # Since urls are very important features, we can tag those features in the system so they are easy to find
        #   Tags are defined by a type and a value
        url_section.add_tag("network.static.domain", random_host)

        # You may also want to provide a list of urls!
        #   Also, no need to provide a name; the url link will be displayed
        hosts = [get_random_host() for _ in range(2)]

        # A heuristic can fire more than once without being associated to a signature
        url_heuristic = Heuristic(4, frequency=len(hosts))

        url_sub_section = ResultURLSection('Example of a url sub-section with multiple links',
                                           heuristic=url_heuristic,
                                           classification=cl_engine.RESTRICTED)
        for host in hosts:
            url_sub_section.add_url(f"https://{host}/")
            url_sub_section.add_tag("network.static.domain", host)

        # You can keep nesting sections if you really need to
        ips = [get_random_ip() for _ in range(3)]
        url_sub_sub_section = ResultURLSection('Example of a two-level deep sub-section')
        for ip in ips:
            url_sub_sub_section.add_url(f"https://{ip}/")
            url_sub_sub_section.add_tag("network.static.ip", ip)

        # Since url_sub_sub_section is a sub-section of url_sub_section,
        # we will add it as a sub-section of url_sub_section, not to the main result itself
        url_sub_section.add_subsection(url_sub_sub_section)

        # Invalid sections will be ignored, and an error will appear in the logs
        # Sub-sections of invalid sections will be ignored too
        invalid_section = ResultSection("")
        ResultSection("I won't make it to the report because my parent is invalid :(",
                      parent=invalid_section)
        url_sub_section.add_subsection(invalid_section)

        # Since url_sub_section is a sub-section of url_section,
        # we will add it as a sub-section of url_section, not to the main result itself
        url_section.add_subsection(url_sub_section)

        result.add_section(url_section)

        # ==================================================================
        # Memory dump section: BODY_FORMAT.MEMORY_DUMP
        #   Dump whatever string content you have into a <pre/> html tag so you can do your own formatting
        data = hexdump(
            b"This is some random text that we will format as an hexdump and you'll see "
            b"that the hexdump formatting will be preserved by the memory dump section!")
        memdump_section = ResultMemoryDumpSection('Example of a memory dump section',
                                                  body=data)
        memdump_section.set_heuristic(random.randint(1, 4))
        result.add_section(memdump_section)

        # ==================================================================
        # KEY_VALUE section:
        #   This section allows the service writer to list a bunch of key/value pairs to be displayed in the UI
        #   while also providing easy to parse data for automated tools.
        #   NB: You should definitely use this over a JSON body type since this one will be displayed correctly
        #       in the UI for the user
        #   The body argument must be a dictionary (only str, int, and booleans are allowed)
        kv_section = ResultKeyValueSection('Example of a KEY_VALUE section')
        # You can add items individually
        kv_section.set_item('key', "value")
        # Or simply add them in bulk
        kv_section.update_items({
            "a_str": "Some string",
            "a_bool": False,
            "an_int": 102,
        })
        result.add_section(kv_section)

        # ==================================================================
        # ORDERED_KEY_VALUE section:
        #   This section provides the same functionality as the KEY_VALUE section except that the order of
        #   the fields is guaranteed to be preserved in the order in which the fields are added to the
        #   section. Also, with this section you can repeat the same key name multiple times
        ordered_kv_section = ResultOrderedKeyValueSection('Example of an ORDERED_KEY_VALUE section')
        # You can add items individually
        for x in range(random.randint(3, 6)):
            ordered_kv_section.add_item(f'key{x}', f"value{x}")
        result.add_section(ordered_kv_section)

        # ==================================================================
        # JSON section:
        #   Re-use the JSON editor we use for administration (https://github.com/josdejong/jsoneditor)
        #   to display a tree view of JSON results.
        #   NB: Use this sparingly! As a service developer you should do your best to include important
        #       results as their own result sections.
        #   The body argument must be a python dictionary
        json_body = {
            "a_str": "Some string",
            "a_list": ["a", "b", "c"],
            "a_bool": False,
            "an_int": 102,
            "a_dict": {
                "list_of_dict": [{
                    "d1_key": "val",
                    "d1_key2": "val2"
                }, {
                    "d2_key": "val",
                    "d2_key2": "val2"
                }],
                "bool": True
            }
        }
        json_section = ResultJSONSection('Example of a JSON section')
        # You can set the json result to a specific value
        json_section.set_json(json_body)
        # You can also update specific parts after the fact
        json_section.update_json({'an_int': 1000, 'updated_key': 'updated_value'})
        result.add_section(json_section)

        # ==================================================================
        # PROCESS_TREE section:
        #   This section allows the service writer to list a bunch of dictionary objects that have nested lists
        #   of dictionaries to be displayed in the UI. Each dictionary object represents a process, and therefore
        #   each dictionary must be of the following format:
        #   {
        #     "process_pid": int,
        #     "process_name": str,
        #     "command_line": str,
        #     "signatures": {}  This dict has the signature name as a key and the score as its value
        #     "children": []    NB: This list either is empty or contains more dictionaries that have the same
        #                           structure
        #   }
        process_tree_section = ResultProcessTreeSection('Example of a PROCESS_TREE section')
        # You can use the ProcessItem class to create the processes to add to the result section
        evil_process = ProcessItem(123, "evil.exe", "c:\\evil.exe")
        evil_process_child_1 = ProcessItem(321, "takeovercomputer.exe",
                                           "C:\\Temp\\takeovercomputer.exe -f do_bad_stuff")
        # You can add child processes to the ProcessItem objects
        evil_process_child_1.add_child_process(
            ProcessItem(456, "evenworsethanbefore.exe",
                        "C:\\Temp\\evenworsethanbefore.exe -f change_reg_key_cuz_im_bad",
                        signatures={"one": 10, "two": 10, "three": 10}))
        evil_process_child_1.add_child_process(
            ProcessItem(234, "badfile.exe", "C:\\badfile.exe -k nothing_to_see_here",
                        signatures={"one": 1000, "two": 10, "three": 10, "four": 10, "five": 10}))
        # You can add signatures that hit on a ProcessItem object
        evil_process_child_1.add_signature('one', 250)
        # Or even directly create the ProcessItem object with the signatures in it
        evil_process_child_2 = ProcessItem(345, "benignexe.exe",
                                           "C:\\benignexe.exe -f \"just kidding, i'm evil\"",
                                           signatures={"one": 2000})
        # You can also add counts for network, file and registry events to a ProcessItem object
        evil_process_child_2.add_network_events(4)
        evil_process_child_2.add_file_events(7000)
        evil_process_child_2.add_registry_events(10)
        # You can also indicate if the process tree item has been safelisted
        benign_process = ProcessItem(678, "trustme.exe", "C:\\trustme.exe")
        benign_process.safelist()
        evil_process.add_child_process(evil_process_child_1)
        evil_process.add_child_process(evil_process_child_2)
        # Add your processes to the result section via the add_process function
        process_tree_section.add_process(evil_process)
        process_tree_section.add_process(
            ProcessItem(987, "runzeroday.exe", "C:\\runzeroday.exe -f insert_bad_spelling"))
        process_tree_section.add_process(benign_process)
        result.add_section(process_tree_section)

        # ==================================================================
        # TABLE section:
        #   This section allows the service writer to have their content displayed in a table format in the UI
        #   The body argument must be a list [] of dict {} objects. A dict object can have a key value pair
        #   where the value is a flat nested dictionary, and this nested dictionary will be displayed as a nested
        #   table within a cell.
        table_section = ResultTableSection('Example of a TABLE section')
        # Use the TableRow class to help add rows to the Table section
        table_section.add_row(
            TableRow(a_str="Some string1", extra_column_here="confirmed",
                     a_bool=False, an_int=101))
        table_section.add_row(
            TableRow({"a_str": "Some string2", "a_bool": True,
                      "an_int": "to_be_overriden_by_kwargs"}, an_int=102))
        table_section.add_row(
            TableRow(a_str="Some string3", a_bool=False, an_int=103))
        # Valid values for the items in the TableRow are: str, int, bool, None, or a dict of those values
        table_section.add_row(
            TableRow(
                {"a_str": "Some string4", "a_bool": None, "an_int": -1000000000000000000},
                {"extra_column_there": "confirmed",
                 "nested_key_value_pair": {
                     "a_str": "Some string3",
                     "a_bool": False,
                     "nested_kv_thats_too_deep": {
                         "a_str": "Some string3",
                         "a_bool": False,
                         "an_int": 103,
                     },
                 }}))
        result.add_section(table_section)

        # ==================================================================
        # Re-Submitting files to the system
        #   Adding extracted files will have them resubmitted to the system for analysis

        # This file will generate random results on the next run
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "wb") as myfile:
            myfile.write(data.encode())
        request.add_extracted(temp_path, "file.txt", "Extracted by some magic!")

        # Embedded files can also have their own classification!
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "wb") as myfile:
            myfile.write(b"CLASSIFIED!!!__" + data.encode())
        request.add_extracted(temp_path, "classified.doc", "Classified file ... don't look",
                              classification=cl_engine.RESTRICTED)

        # This file will generate empty results on the next run
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "wb") as myfile:
            myfile.write(b"EMPTY")
        request.add_extracted(temp_path, "empty.txt", "Extracted empty resulting file")

        # ==================================================================
        # Supplementary files
        #   Adding supplementary files will save them on the datastore for future
        #   reference but won't reprocess those files.
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "w") as myfile:
            myfile.write(url_sub_section.body)
        request.add_supplementary(temp_path, "urls.json", "These are urls as a JSON file")
        # Like embedded files, you can add more than one supplementary file
        fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
        with os.fdopen(fd, "w") as myfile:
            myfile.write(json.dumps(json_body))
        request.add_supplementary(temp_path, "json_body.json", "This is the json_body as a JSON file")

        # ==================================================================
        # Zeroize on safe tags
        #   When this feature is turned on, the section will get its score set to zero if all its tags
        #   were safelisted by the safelisting engine
        zero_section = ResultSection('Example of zeroize-able section', zeroize_on_tag_safe=True)
        zero_section.set_heuristic(2)
        zero_section.add_line("This section will have a zero score if all tags are safelisted.")
        zero_section.add_tag('network.static.ip', '127.0.0.1')
        result.add_section(zero_section)

        # ==================================================================
        # Auto-collapse
        #   When this feature is turned on, the section will be collapsed when first displayed
        collapse_section = ResultSection('Example of auto-collapse section', auto_collapse=True)
        collapse_section.set_heuristic(2)
        collapse_section.add_line("This section was collapsed when first loaded in the UI")
        result.add_section(collapse_section)

        # ==================================================================
        # Image Section
        #   This type of section allows the service writer to display images to the user
        image_section = ResultImageSection(request, 'Example of Image section')
        for x in range(6):
            image_section.add_image(f'data/000{x+1}.jpg', f'000{x+1}.jpg',
                                    f'ResultSample screenshot 000{x+1}',
                                    ocr_heuristic_id=6)
        result.add_section(image_section)

        # ==================================================================
        # Multi Section
        #   This type of section allows the service writer to display multiple section types
        #   in the same result section. Here's a concrete example of this:
        multi_section = ResultMultiSection('Example of Multi-typed section')
        multi_section.add_section_part(
            TextSectionBody(body="We have detected very high entropy in multiple sections "
                                 "of your file; these sections are most likely packed or "
                                 "encrypted.\n\nHere are the affected sections:"))
        section_count = random.randint(1, 4)
        for x in range(section_count):
            multi_section.add_section_part(
                KVSectionBody(section_name=f".UPX{x}", offset=f'0x00{8+x}000',
                              size='4196 bytes'))
            graph_part = GraphSectionBody()
            graph_part.set_colormap(0, 8, [7 + random.random() for _ in range(20)])
            multi_section.add_section_part(graph_part)
            if x != section_count - 1:
                multi_section.add_section_part(DividerSectionBody())
            multi_section.add_tag("file.pe.sections.name", f".UPX{x}")
        multi_section.set_heuristic(5)
        result.add_section(multi_section)

        # ==================================================================
        # Propagate temporary submission data to other services
        #   Sometimes two services can work in tandem, where one extracts some piece of information
        #   that the other uses to do its work. This is how a service can set temporary data that
        #   other services which subscribe to it can use. (A sketch of the consuming side follows
        #   this function.)
        request.temp_submission_data['kv_section'] = kv_section.body
        request.temp_submission_data['process_tree_section'] = process_tree_section.body
        request.temp_submission_data['url_section'] = url_sub_section.body

        # ==================================================================
        # Wrap-up:
        #   Save your result object back into the request
        request.result = result

    # ==================================================================
    # Empty results file
    elif request.sha256 == 'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06':
        # Creating an empty result object
        request.result = Result()

    # ==================================================================
    # Randomized results file
    else:
        # For the randomized results file, we will completely randomize the results
        #   The content of those results does not matter since we've already shown you
        #   all the different result sections, tagging, heuristics and file upload functions
        embedded_result = Result()

        # random number of sections
        for _ in range(1, 3):
            embedded_result.add_section(self._create_random_section())

        request.result = embedded_result
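# --- Example (not part of the service above) ---
# A hedged sketch of the consuming side of temp_submission_data: a
# hypothetical downstream service reads back what the sample above stored.
# (Which keys a service actually receives depends on how the deployment is
# configured; the key name here matches the one set above.)
def execute(self, request):
    url_body = request.temp_submission_data.get('url_section')
    if url_body:
        # ... act on the URL list produced by the upstream service ...
        pass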
def find_network_indicators(apktool_out_dir: str, result: Result):
    # Whitelist
    skip_list = [
        "android.intent",
        "com.google",
        "com.android",
    ]

    indicator_whitelist = [
        'google.to',
        'google.ttl',
        'google.delay',
        'google_tagmanager.db',
        'gtm_urls.db',
        'gtm.url',
        'google_analytics_v4.db',
        'Theme.Dialog.Alert',
        'popupLocationInfo.gravity',
        'popupLocationInfo.displayId',
        'popupLocationInfo.left',
        'popupLocationInfo.top',
        'popupLocationInfo.right',
        'popupLocationInfo.bottom',
        'googleads.g.doubleclick.net',
        'ad.doubleclick.net',
        '.doubleclick.net',
        '.googleadservices.com',
        '.googlesyndication.com',
        'android.hardware.type.watch',
        'mraid.js',
        'google_inapp_purchase.db',
        'mobileads.google.com',
        'share_history.xml',
        'activity_choser_model_history.xml',
        'FragmentPager.SavedState{',
        'android.remoteinput.results',
        'android.people',
        'android.picture',
        'android.icon',
        'android.text',
        'android.title',
        'android.title.big',
        'FragmentTabHost.SavedState{',
        'libcore.icu.ICU',
    ]

    file_list = []

    # Indicators
    url_list = []
    domain_list = []
    ip_list = []
    email_list = []

    # Build dynamic whitelist
    smali_dir = os.path.join(apktool_out_dir, "smali")
    for root, dirs, files in os.walk(smali_dir):
        if not files:
            continue
        else:
            skip_list.append(root.replace(smali_dir + "/", "").replace("/", "."))

        for cdir in dirs:
            skip_list.append(os.path.join(root, cdir).replace(smali_dir + "/", "").replace("/", "."))

    asset_dir = os.path.join(apktool_out_dir, "assets")
    if os.path.exists(asset_dir):
        for root, dirs, files in os.walk(asset_dir):
            if not files:
                continue
            else:
                for asset_file in files:
                    file_list.append(asset_file)
    skip_list = list(set(skip_list))

    # Find indicators
    proc = Popen(['grep', '-ER',
                  r'(([[:alpha:]](-?[[:alnum:]])*)\.)*[[:alpha:]](-?[[:alnum:]])+\.[[:alpha:]]{2,}',
                  smali_dir], stdout=PIPE, stderr=PIPE)
    grep, _ = proc.communicate()
    for line in safe_str(grep).splitlines():
        file_path, line = line.split(":", 1)

        if "const-string" in line or "Ljava/lang/String;" in line:
            data = line.split("\"", 1)[1].split("\"")[0]
            data_low = data.lower()
            data_split = data.split(".")
            if data in file_list:
                continue
            elif data in indicator_whitelist:
                continue
            elif data.startswith("/"):
                continue
            elif data_low.startswith("http://") or data_low.startswith('ftp://') or data_low.startswith('https://'):
                url_list.append(data)
            elif len(data_split[0]) < len(data_split[-1]) and len(data_split[-1]) > 3:
                continue
            elif data.startswith('android.') and data_low != data:
                continue
            elif "/" in data and "." in data and data.index("/") < data.index("."):
                continue
            elif " " in data:
                continue
            elif data_split[0] in ['com', 'org', 'net', 'java']:
                continue
            elif data_split[-1].lower() in ['so', 'properties', 'zip', 'read', 'id', 'store', 'name',
                                            'author', 'sh', 'soccer', 'fitness', 'news', 'video']:
                continue
            elif data.endswith("."):
                continue
            else:
                do_skip = False
                for skip in skip_list:
                    if data.startswith(skip):
                        do_skip = True
                        break

                if do_skip:
                    continue

                data = data.strip(".")

                if is_valid_domain(data):
                    domain_list.append(data)
                elif is_valid_ip(data):
                    ip_list.append(data)
                elif is_valid_email(data):
                    email_list.append(data)

    url_list = list(set(url_list))
    for url in url_list:
        dom_ip = url.split("//")[1].split("/")[0]
        if ":" in dom_ip:
            dom_ip = dom_ip.split(":")[0]

        if is_valid_ip(dom_ip):
            ip_list.append(dom_ip)
        elif is_valid_domain(dom_ip):
            domain_list.append(dom_ip)

    ip_list = list(set(ip_list))
    domain_list = list(set(domain_list))
    email_list = list(set(email_list))

    if url_list or ip_list or domain_list or email_list:
        res_net = ResultSection("Network indicator(s) found", parent=result, heuristic=Heuristic(3))

        if url_list:
            res_url = ResultSection("Found urls in the decompiled code", parent=res_net)
            count = 0
            for url in url_list:
                count += 1
                if count <= 20:
                    res_url.add_line(url)
                res_url.add_tag('network.static.uri', url)
            if count > 20:
                res_url.add_line(f"and {count - 20} more...")

        if ip_list:
            res_ip = ResultSection("Found IPs in the decompiled code", parent=res_net)
            count = 0
            for ip in ip_list:
                count += 1
                if count <= 20:
                    res_ip.add_line(ip)
                res_ip.add_tag('network.static.ip', ip)
            if count > 20:
                res_ip.add_line(f"and {count - 20} more...")

        if domain_list:
            res_domain = ResultSection("Found domains in the decompiled code", parent=res_net)
            count = 0
            for domain in domain_list:
                count += 1
                if count <= 20:
                    res_domain.add_line(domain)
                res_domain.add_tag('network.static.domain', domain)
            if count > 20:
                res_domain.add_line(f"and {count - 20} more...")

        if email_list:
            res_email = ResultSection("Found email addresses in the decompiled code", parent=res_net)
            count = 0
            for email in email_list:
                count += 1
                if count <= 20:
                    res_email.add_line(email)
                res_email.add_tag('network.email.address', email)
            if count > 20:
                res_email.add_line(f"and {count - 20} more...")
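# The indicator loop above relies on is_valid_domain / is_valid_ip /
# is_valid_email helpers defined elsewhere. A rough sketch of plausible
# implementations, assuming simple regex/socket-based validation (the real
# helpers may well differ):
import re
import socket

def is_valid_ip(data: str) -> bool:
    try:
        socket.inet_aton(data)  # rejects non-IPv4 strings
        return data.count(".") == 3  # also reject shorthand forms like "127.1"
    except OSError:
        return False

def is_valid_domain(data: str) -> bool:
    # At least one dot, alphanumeric labels, alphabetic TLD of length >= 2
    return bool(re.fullmatch(r"(?:[a-zA-Z0-9](?:-?[a-zA-Z0-9])*\.)+[a-zA-Z]{2,}", data))

def is_valid_email(data: str) -> bool:
    return bool(re.fullmatch(r"[^@\s]+@[^@\s]+\.[a-zA-Z]{2,}", data))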
def execute(self, request):
    request.result = Result()
    self.result = request.result
    file_path = request.file_path
    fh = open(file_path, 'rb')
    try:
        self.swf = SWF(fh)
        if self.swf is None:
            raise Exception("self.swf is None")
    except Exception as e:
        self.log.exception(f"Unable to parse file {request.sha256}: {str(e)}")
        fh.close()
        raise

    self.tag_summary = defaultdict(list)
    self.symbols = {}
    self.binary_data = {}
    self.exported_assets = []
    self.big_buffers = set()
    self.has_product_info = False
    self.anti_decompilation = False
    self.recent_compile = False
    self.disasm_path = None

    header_subsection = ResultSection(title_text="SWF Header", parent=self.result)
    if self.swf.header.version:
        header_subsection.add_line("Version: %d" % self.swf.header.version)
        header_subsection.add_tag(tag_type="file.swf.header.version",
                                  value=str(self.swf.header.version))
    header_subsection.add_line("File length: %d" % self.swf.header.file_length)
    if str(self.swf.header.frame_size):
        header_subsection.add_line("Frame size: %s" % str(self.swf.header.frame_size))
        header_subsection.add_tag(tag_type="file.swf.header.frame.size",
                                  value=str(self.swf.header.frame_size))
    if self.swf.header.frame_rate:
        header_subsection.add_line("Frame rate: %d" % self.swf.header.frame_rate)
        header_subsection.add_tag(tag_type="file.swf.header.frame.rate",
                                  value=str(self.swf.header.frame_rate))
    if self.swf.header.frame_count:
        header_subsection.add_line("Frame count: %d" % self.swf.header.frame_count)
        header_subsection.add_tag(tag_type="file.swf.header.frame.count",
                                  value=str(self.swf.header.frame_count))

    # Parse Tags
    tag_subsection = ResultSection(title_text="SWF Tags", parent=self.result)
    tag_types = []
    for tag in self.swf.tags:
        self.tag_analyzers.get(SWF_TAGS.get(tag.type), self._dummy)(tag)
        tag_types.append(str(tag.type))
    tag_list = ','.join(tag_types)
    tags_ssdeep = ssdeep.hash(tag_list)
    tag_subsection.add_tag(tag_type="file.swf.tags_ssdeep", value=tags_ssdeep)
    # TODO: not sure we want to split those...
    # _, hash_one, hash_two = tags_ssdeep.split(':')
    # tag_subsection.add_tag(tag_type=TAG_TYPE.SWF_TAGS_SSDEEP, value=hash_one)
    # tag_subsection.add_tag(tag_type=TAG_TYPE.SWF_TAGS_SSDEEP, value=hash_two)

    # Script Overview
    if len(self.symbols.keys()) > 0:
        root_symbol = 'unspecified'
        if 0 in self.symbols:
            root_symbol = self.symbols[0]
            self.symbols.pop(0)
        symbol_subsection = ResultSection(title_text="Symbol Summary", parent=self.result)
        symbol_subsection.add_line(f'Main: {root_symbol}')
        if len(self.symbols.keys()) > 0:
            for tag_id, name in sorted(self.symbols.items()):
                symbol_subsection.add_line(f'ID:{tag_id} - {name}')

    if len(self.binary_data.keys()) > 0:
        binary_subsection = ResultSection(title_text="Attached Binary Data",
                                          heuristic=Heuristic(3),
                                          parent=self.result)
        for tag_id, tag_data in self.binary_data.items():
            tag_name = self.symbols.get(tag_id, 'unspecified')
            binary_subsection.add_line(f'ID:{tag_id} - {tag_name}')
            try:
                binary_filename = hashlib.sha256(tag_data).hexdigest() + '.attached_binary'
                binary_path = os.path.join(self.working_directory, binary_filename)
                # Use a dedicated handle so the main SWF file handle is not shadowed
                with open(binary_path, 'wb') as out_fh:
                    out_fh.write(tag_data)
                request.add_extracted(binary_path, f"{tag_name}_{tag_id}",
                                      f"SWF Embedded Binary Data {str(tag_id)}")
            except Exception:
                self.log.exception("Error submitting embedded binary data for swf:")

    tags_subsection = ResultSection(title_text="Tags of Interest")
    for tag in sorted(self.tag_summary.keys()):
        body = []
        summaries = self.tag_summary[tag]
        for summary in summaries:
            summary_line = '\t'.join(summary)
            body.append(summary_line)
        if body:
            subtag_section = ResultSection(title_text=tag, parent=tags_subsection)
            subtag_section.add_lines(body)
    if len(tags_subsection.subsections) > 0:
        self.result.add_section(tags_subsection)

    if len(self.big_buffers) > 0:
        bbs = ResultSection(title_text="Large String Buffers",
                            heuristic=Heuristic(1),
                            parent=self.result)
        for buf in self.big_buffers:
            if isinstance(buf, str):
                buf = buf.encode()
            bbs.add_line("Found a %d byte string." % len(buf))
            buf_filename = ""
            try:
                buf_filename = hashlib.sha256(buf).hexdigest() + '.stringbuf'
                buf_path = os.path.join(self.working_directory, buf_filename)
                with open(buf_path, 'wb') as out_fh:
                    out_fh.write(buf)
                # add_extracted takes (path, name, description)
                request.add_extracted(buf_path, buf_filename, "AVM2 Large String Buffer.")
            except Exception:
                self.log.exception("Error submitting AVM2 String Buffer %s" % buf_filename)

    if not self.has_product_info:
        self.log.debug("Missing product info.")
        no_info = ResultSection(title_text="Missing Product Information",
                                heuristic=Heuristic(5),
                                parent=self.result)
        no_info.add_line("This SWF doesn't specify information about the product that created it.")

    if self.anti_decompilation:
        self.log.debug("Anti-disassembly techniques may be present.")
        no_dis = ResultSection(title_text="Incomplete Disassembly",
                               heuristic=Heuristic(4),
                               parent=self.result)
        no_dis.add_line("This SWF may contain intentional corruption or obfuscation to prevent disassembly.")

    if self.recent_compile:
        recent_compile = ResultSection(title_text="Recent Compilation",
                                       heuristic=Heuristic(2),
                                       parent=self.result)
        recent_compile.add_line("This SWF was compiled within the last 24 hours.")

    fh.close()
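# The tag loop above dispatches each SWF tag through a self.tag_analyzers dict
# keyed on the tag name from SWF_TAGS, falling back to self._dummy when no
# analyzer exists. A sketch of that wiring; the handler names in the mapping
# are assumptions for illustration, not the service's real method names.
def _dummy(self, tag):
    # No-op fallback for tag types this service does not analyze
    pass

# e.g. built once in the service constructor:
# self.tag_analyzers = {
#     'DoABC': self._do_abc,                        # hypothetical names
#     'DefineBinaryData': self._define_binary_data,
#     'ExportAssets': self._export_assets,
# }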
def execute(self, request):
    parser = eml_parser.eml_parser.EmlParser(include_raw_body=True, include_attachment_data=True)
    content_str = request.file_contents

    # Attempt conversion of potential Outlook file -> eml
    if request.file_type == "document/office/email":
        try:
            content_str = msg2eml(request.file_path).as_bytes()
        except Exception:
            # Try using mailparser to convert
            converted_path, _ = msgconvert(request.file_path)
            content_str = open(converted_path, "rb").read()

    header_agg = {"From": set(), "To": set(), "Cc": set(), "Sent": set(), "Reply-To": set(), "Date": set()}

    # Assume this is an email saved in HTML format
    if request.file_type == "code/html":
        parsed_html = BeautifulSoup(content_str, "lxml")
        valid_headers = ["To:", "Cc:", "Sent:", "From:", "Subject:", "Reply-To:"]

        if not parsed_html.body or not any(header in parsed_html.body.text for header in valid_headers):
            # We can assume this is just an HTML doc (or lacking body), one of which we can't process
            request.result = Result()
            return

        # Can't trust 'Date' to determine the difference between HTML docs vs HTML emails
        valid_headers.append("Date:")

        html_email = email.message_from_bytes(content_str)
        generator_metadata_content = ""
        for meta in parsed_html.find_all("meta"):
            if meta.attrs.get("name", None) == "Generator":
                generator_metadata_content = meta.attrs.get("content", "")
                break

        # Process HTML emails generated from Outlook
        if generator_metadata_content == "Microsoft Word 15":
            paragraphs = parsed_html.body.find_all("p")
            # Likely an email that was exported with original email headers
            if any(header in paragraphs[0].text for header in valid_headers):
                for p in paragraphs:
                    if any(valid_header in p.text for valid_header in valid_headers):
                        h_key, h_value = p.text.replace("\xa0", "").replace("\r\n", " ").split(":", 1)
                        html_email[h_key] = h_value
                        # Subject line indicates the end of the email header, beginning of body
                        if "Subject" in p.text:
                            break
        # Process HTML emails from MS Exchange Server or missing top-level headers (aggregate headers)
        elif (generator_metadata_content == "Microsoft Word 15 (filtered medium)"
              or generator_metadata_content == "Microsoft Exchange Server"
              or generator_metadata_content == ""):
            subject = None
            for div in parsed_html.find_all("div"):
                # Header information within divs
                if (any(header in div.text for header in valid_headers)
                        and "WordSection1" not in div.attrs.get("class", [])):
                    # Usually expect headers to be \n separated in text output but check first
                    if "\n" in div.text:
                        for h in div.text.split("\n"):
                            if any(header in h for header in valid_headers):
                                h_key, h_value = h.split(":", 1)

                                # Implying some malformed message got mixed with the headers of another message
                                if h_key not in valid_headers:
                                    for header in valid_headers:
                                        if header in h:
                                            h_key = header[:-1]

                                # Use the latest message's subject (this maintains FW, RE, etc.)
                                if h_key == "Subject" and not subject:
                                    subject = h_value
                                elif h_key != "Subject":
                                    header_agg[h_key].add(h_value)
                    # Document was probably not well formatted, so we'll use the headers as delimiters
                    else:
                        header_offset_map = {}
                        # Determine the position of each header
                        for header in list(header_agg.keys()) + ["Subject"]:
                            if header in div.text:
                                header_offset_map[div.text.index(header)] = header
                        # Use the positions and length of header name to determine an offset
                        for i in range(len(header_offset_map)):
                            sorted_keys = sorted(header_offset_map.keys())
                            header_name = header_offset_map[sorted_keys[i]]
                            offset = len(f"{header_name}: ") + sorted_keys[i]
                            value = (div.text[offset:sorted_keys[i + 1]]
                                     if i < len(header_offset_map) - 1 else div.text[offset:])

                            if header_name == "Subject":
                                subject = value
                            else:
                                header_agg[header_name].add(value)

            # Assign aggregated info to email object
            html_email["Subject"] = subject
            for key, value in header_agg.items():
                html_email[key] = "; ".join(value)
        content_str = html_email.as_bytes()

    parsed_eml = parser.decode_email_bytes(content_str)
    result = Result()
    header = parsed_eml["header"]

    if "from" in header or "to" in header:
        all_uri = set()
        body_words = set(extract_passwords(header["subject"]))
        for body_counter, body in enumerate(parsed_eml["body"]):
            body_text = BeautifulSoup(body["content"], "lxml").text
            body_words.update(extract_passwords(body_text))
            if request.get_param("extract_body_text"):
                fd, path = mkstemp()
                with open(path, "w") as f:
                    f.write(body["content"])
                os.close(fd)
                request.add_extracted(path, "body_" + str(body_counter), "Body text")
            if "uri" in body:
                for uri in body["uri"]:
                    all_uri.add(uri)
        # Words in the email body, used by extract to guess passwords
        request.temp_submission_data["email_body"] = list(body_words)

        kv_section = ResultSection("Email Headers", body_format=BODY_FORMAT.KEY_VALUE, parent=result)

        # Basic tags
        from_addr = header["from"].strip() if header.get("from", None) else None
        if from_addr and re.match(EMAIL_REGEX, from_addr):
            kv_section.add_tag("network.email.address", from_addr)
        for to in header["to"]:
            if re.match(EMAIL_REGEX, to.strip()):
                kv_section.add_tag("network.email.address", to.strip())
        kv_section.add_tag("network.email.date", str(header["date"]).strip())

        subject = header["subject"].strip() if header.get("subject", None) else None
        if subject:
            kv_section.add_tag("network.email.subject", subject)

        # Add CCs to body and tags
        if "cc" in header:
            for cc in header["cc"]:
                if re.match(EMAIL_REGEX, cc.strip()):
                    kv_section.add_tag("network.email.address", cc.strip())

        # Add Message ID to body and tags
        if "message-id" in header["header"]:
            kv_section.add_tag("network.email.msg_id", header["header"]["message-id"][0].strip())

        # Add Tags for received IPs
        if "received_ip" in header:
            for ip in header["received_ip"]:
                ip = ip.strip()
                try:
                    if isinstance(ip_address(ip), IPv4Address):
                        kv_section.add_tag("network.static.ip", ip)
                except ValueError:
                    pass

        # Add Tags for received Domains
        if "received_domain" in header:
            for dom in header["received_domain"]:
                kv_section.add_tag("network.static.domain", dom.strip())

        # If we've found URIs, add them to a section
        if len(all_uri) > 0:
            uri_section = ResultSection("URIs Found:", parent=result)
            for uri in all_uri:
                uri_section.add_line(uri)
                uri_section.add_tag("network.static.uri", uri.strip())
                parsed_url = urlparse(uri)
                if parsed_url.hostname and re.match(IP_ONLY_REGEX, parsed_url.hostname):
                    uri_section.add_tag("network.static.ip", parsed_url.hostname)
                else:
                    uri_section.add_tag("network.static.domain", parsed_url.hostname)

        # Bring all headers together...
        extra_header = header.pop("header", {})
        header.pop("received", None)
        header.update(extra_header)

        # Convert to common format
        header["date"] = [self.json_serial(header["date"])]

        # Replace with aggregated date(s) if any available
        if header_agg["Date"]:
            # Replace
            if any(default_date in header["date"]
                   for default_date in ["1970-01-01T00:00:00", "Thu, 01 Jan 1970 00:00:00 +0000"]):
                header["date"] = list(header_agg["Date"])
            # Append
            else:
                header["date"] += list(header_agg["Date"])
            # Tag each aggregated date (a generator expression here would never execute)
            for date in header_agg["Date"]:
                kv_section.add_tag("network.email.date", str(date).strip())

        # Filter out useless headers from results
        self.log.debug(header.keys())
        for h in self.header_filter:
            if h in header.keys():
                header.pop(h)
        kv_section.set_body(json.dumps(header, default=self.json_serial))

        attachments_added = []
        if "attachment" in parsed_eml:
            attachments = parsed_eml["attachment"]
            for attachment in attachments:
                fd, path = mkstemp()
                with open(path, "wb") as f:
                    f.write(base64.b64decode(attachment["raw"]))
                os.close(fd)
                try:
                    if request.add_extracted(path, attachment["filename"], "Attachment",
                                             safelist_interface=self.api_interface):
                        attachments_added.append(attachment["filename"])
                except MaxExtractedExceeded:
                    self.log.warning(f"Extract limit reached on attachments: "
                                     f"{len(attachments) - len(attachments_added)} not added")
                    break
            ResultSection("Extracted Attachments:",
                          body="\n".join([x for x in attachments_added]),
                          parent=result)

        if request.get_param("save_emlparser_output"):
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            attachments = parsed_eml.get("attachment", [])
            # Remove raw attachments, all attachments up to MaxExtractedExceeded already extracted
            for attachment in attachments:
                _ = attachment.pop("raw", None)
            with os.fdopen(fd, "w") as myfile:
                myfile.write(json.dumps(parsed_eml, default=self.json_serial))
            request.add_supplementary(temp_path, "parsing.json",
                                      "These are the raw results of running GOVCERT-LU's eml_parser")
    else:
        self.log.warning("emlParser could not parse EML; no useful information in result's headers")

    request.result = result
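# Both json.dumps(..., default=self.json_serial) calls above assume a helper
# that renders non-JSON-native values (mainly datetimes). A typical minimal
# implementation (a sketch; the service's real helper may cover more types):
import datetime

def json_serial(self, obj):
    if isinstance(obj, (datetime.datetime, datetime.date)):
        return obj.isoformat()
    # Fall back to the string representation for anything else JSON can't encode
    return str(obj)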
def __init__(self, i, request=None, result=None, working_directory=None, logger=None):
    self.request = request
    self.result = result
    self.working_directory = working_directory
    self.log = logger
    if result:
        self.working_result = ResultSection("Image Steganography Module Results:",
                                            body_format=BODY_FORMAT.MEMORY_DUMP)
    else:
        self.result = result
    # Currently only supporting 8-bit pixel modes
    self.pixel_size = 8
    supported_modes = {
        'CMYK': 4,
        'P': 1,
        'RGB': 3,
        'RGBA': 4,
    }
    # Pillow seems to like non-corrupt images, so give it its best shot and exit on error
    try:
        img = Image.open(i)
    except Exception:
        raise NotSupported()
    try:
        self.iformat = img.format
        self.imode = img.mode.upper()
        self.isize = img.size
    except Exception:
        raise NotSupported()
    if not self.iformat and not self.imode and not self.isize:
        # Something is likely wrong
        raise NotSupported()
    if self.imode.upper() not in supported_modes:
        if not self.result:
            self.log.warning("{} image mode not currently supported for steganalysis modules"
                             .format(self.imode))
            exit()
        else:
            self.log.warning("not a supported mode: {}".format(self.imode))
            raise NotSupported()
    else:
        self.channels_to_process = supported_modes[self.imode]
    if result:
        pil_result = ResultSection("Pillow Image Data:", body_format=BODY_FORMAT.MEMORY_DUMP)
        if self.iformat:
            pil_result.add_line("Format:\t {}".format(self.iformat))
        if self.imode:
            pil_result.add_line("Mode:\t {}".format(self.imode))
            pil_result.add_tag('file.img.mode', self.imode)
        if self.isize:
            pil_result.add_line("Size:\t {}x{}".format(self.isize[0], self.isize[1]))
            pil_result.add_tag('file.img.size', "{}x{}".format(self.isize[0], self.isize[1]))
        self.result.add_section(pil_result)
    try:
        self.ipixels = iter(img.getdata())
    except Exception:
        raise NotSupported()
    try:
        img = Image.open(i)
        self.iobject = img.load()
    except Exception:
        raise NotSupported()
    self.binary_pixels = [self.convert_binary_string(self.imode, self.channels_to_process, self.ipixels)]
    self.pixel_count = self.isize[0] * self.isize[1] * self.channels_to_process
    # Chunk size equals (#bytes*8) bits/num byte-values per pixel. Therefore if 8 bits per pixel, and you want to
    # perform a test on every 512 bytes of data, chunk size will be (512*8)/8 == every 512 pixels examined.
    # Optimize chunk size if this is being run through AL
    if request is not None:
        # Integer division: with true division maximizer could never be 0 and
        # the fallback below would be dead code
        maximizer = self.pixel_count // 20000
        if maximizer == 0:
            maximizer = 1
        self.chunk = 128 * maximizer
    else:
        self.chunk = 256
    self.chunk = int(self.chunk)
    # total chunk bits / 8
    self.chunk_bytes = (self.chunk * self.pixel_size * self.channels_to_process) / 8
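# Worked example of the chunk arithmetic above (illustrative values): a
# 1000x500 RGB image gives pixel_count = 1000 * 500 * 3 = 1,500,000 samples;
# maximizer = 1,500,000 // 20000 = 75, so chunk = 128 * 75 = 9600 pixels per
# chunk, and chunk_bytes = 9600 * 8 * 3 / 8 = 28,800 bytes examined per chunk.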
def analyze_pdf(self, request, res_txt, path, working_dir, heur, additional_keywords, get_malform=True):
    """Extract metadata, keyword objects and content of interest from a PDF sample using PDFId, PDFId plugins,
    and PDF Parser.

    Args:
        request: AL request object.
        res_txt: Header string for AL result section title.
        path: Original PDF sample path.
        working_dir: AL working directory.
        heur: List of plugins to run on PDFId results (provided in service configuration).
        additional_keywords: List of additional keywords to be searched (provided in service configuration).
        get_malform: Extract malformed objects from PDF.

    Returns:
        AL result object, whether object streams (objstms) were found, and a set of errors.
    """
    triage_keywords = set()
    all_errors = set()
    embed_present = False
    objstms = False
    res = ResultSection(title_text=res_txt)
    carved_extracted_shas = set()

    if request.deep_scan:
        run_pdfparse = True
    else:
        run_pdfparse = False

    # Run PDFId
    try:
        pdfid_result, errors = self.get_pdfid(path, additional_keywords, heur, request.deep_scan)
    except Exception as e:
        raise NonRecoverableError(e)

    # Parse PDFId results
    pdfidres = ResultSection(title_text="PDFID Results", parent=res)
    if len(pdfid_result) == 0:
        pdfidres.add_line("No results generated for file. Please see errors.")
    else:
        # Do not run for objstms, which are being analyzed when get_malform == False
        if get_malform:
            version = pdfid_result.get("PDFID", None)
            if version:
                pdfidres.add_line(version[0])
            properties = pdfid_result.get("Properties", None)
            if properties:
                pres = ResultSection(title_text="PDF Properties", parent=pdfidres)
                for plist in properties:
                    pres.add_line("{0}: {1}".format(plist[0], plist[1]))
                    if plist[0] == "/ModDate":
                        pres.add_tag('file.pdf.date.modified', plist[1])
                    elif plist[0] == "/CreationDate":
                        pres.add_tag('file.date.creation', plist[1])
                    elif plist[0] == "/LastModified":
                        pres.add_tag('file.date.last_modified', plist[1])
                    elif plist[0] == "/SourceModified":
                        pres.add_tag('file.pdf.date.source_modified', plist[1])
                    elif plist[0] == "/pdfx":
                        pres.add_tag('file.pdf.date.pdfx', plist[1])
            entropy = pdfid_result.get("Entropy", None)
            if entropy:
                enres = ResultSection(title_text="Entropy", parent=pdfidres)
                for enlist in entropy:
                    enres.add_line("{0}: {1}, ({2})".format(enlist[0], enlist[1], enlist[2]))

        flags = pdfid_result.get("Flags", None)
        if flags:
            fres = ResultSection(title_text="PDF Keyword Flags", parent=pdfidres)
            for flist in flags:
                if flist[0] == "/ObjStm":
                    objstms = True
                if len(flist) == 3:
                    fres.add_line("{0}:Count: {1}, Hex-Encoded Count: {2}".format(flist[0], flist[1], flist[2]))
                else:
                    fres.add_line("{0}:Count: {1}".format(flist[0], flist[1]))
                fres.add_tag('file.string.extracted', flist[0].replace("/", "", 1))
                if flist[0] in additional_keywords:
                    triage_keywords.add(flist[0].replace("/", "", 1))

        plugin = pdfid_result.get("Plugin", [])

        # If any plugin results, or flagged keywords found, run PDF Parser
        if plugin or len(triage_keywords) > 0:
            run_pdfparse = True

        for pllist in plugin:
            pl_name, pl_heur, pl_text = pllist
            pl_heur = int(pl_heur)
            pl_text = pl_text[14:]
            if not pl_text or pl_text == "None":
                continue

            if pl_name in ['EmbeddedFile', 'Name Obfuscation']:
                modres = ResultSection(title_text=pl_text, parent=pdfidres)

                if pl_heur > 0:
                    modres.set_heuristic(pl_heur)

                if pl_name == 'EmbeddedFile':
                    embed_present = True

            elif pl_name in ['Triage', 'Suspicious Properties']:
                javascript_found = False
                for line in pl_text.splitlines():
                    lineres = ResultSection(title_text=line)
                    # Triage results
                    if '/JavaScript' in line:
                        triage_keywords.add('JavaScript')
                        if not javascript_found:
                            lineres.set_heuristic(19)
                            javascript_found = True
                    elif '/JS' in line:
                        triage_keywords.add('JS')
                        if not javascript_found:
                            lineres.set_heuristic(19)
                            javascript_found = True
                    elif '/JBIG2Decode' in line:
                        triage_keywords.add('JBIG2Decode')
                        lineres.set_heuristic(3)
                    elif '/Colors > 2^24' in line:
                        triage_keywords.add('Colors > 2^24')
                        lineres.set_heuristic(20)
                    elif '/AA' in line:
                        triage_keywords.add('AA')
                        lineres.set_heuristic(1)
                    elif '/Launch' in line:
                        triage_keywords.add('Launch')
                        lineres.set_heuristic(1)
                    elif '/OpenAction' in line:
                        triage_keywords.add('OpenAction')
                        lineres.set_heuristic(1)
                    elif '/GoToE' in line:
                        triage_keywords.add('GoToE')
                        lineres.set_heuristic(21)
                    elif '/GoToR' in line:
                        triage_keywords.add('GoToR')
                        lineres.set_heuristic(22)
                    elif '/Encrypt' in line:
                        triage_keywords.add('Encrypt')
                        lineres.set_heuristic(11)
                    elif '/AcroForm' in line:
                        triage_keywords.add('AcroForm')
                        lineres.set_heuristic(4)
                    elif '/RichMedia' in line:
                        triage_keywords.add('RichMedia')
                        lineres.set_heuristic(5)
                    elif '/XFA' in line:
                        triage_keywords.add('XFA')
                        lineres.set_heuristic(23)
                    elif '/Annot' in line:
                        triage_keywords.add('Annot')
                        lineres.set_heuristic(25)
                    elif '/ObjStm' in line:
                        triage_keywords.add('ObjStm')
                        lineres.set_heuristic(7)
                    elif '/URI' in line:
                        triage_keywords.add('URI')
                        lineres.set_heuristic(24)
                    # Suspicious properties results
                    elif "eof2" in line:
                        lineres.set_heuristic(2)
                    elif "eof5" in line:
                        lineres.set_heuristic(17)
                    elif "page" in line:
                        lineres.set_heuristic(26)
                    elif "entropy" in line:
                        lineres.set_heuristic(12)
                    elif "obj/endobj" in line:
                        lineres.set_heuristic(13)
                    elif "stream/endstream" in line:
                        lineres.set_heuristic(14)

                    if lineres.heuristic is not None:
                        pdfidres.add_subsection(lineres)

    for e in errors:
        all_errors.add(e)
        if e.startswith('Error running plugin'):
            self.log.warning(e)

    if run_pdfparse:
        # CALL PDF parser and extract further information
        pdf_parserres = ResultSection(title_text="PDF Parser Results")

        # STATISTICS
        # Do not run for objstms, which are being analyzed when get_malform == False
        if get_malform:
            options = {
                "stats": True,
            }
            pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)

            if pdf_parser_result:
                if len(pdf_parser_result) == 0:
                    pdf_parserres.add_line("No statistical results generated for file. Please see errors.")
                else:
                    version = pdf_parser_result.get("version", None)
                    if version and version[0] != '0':
                        pdf_parserres.add_line(version[0])
                    stats = pdf_parser_result.get("stats", None)
                    if stats:
                        sres = ResultSection(title_text="PDF Statistics",
                                             parent=pdf_parserres,
                                             body_format=BODY_FORMAT.MEMORY_DUMP)
                        for p in stats:
                            sres.add_line(p)
            for e in errors:
                all_errors.add(e)

        # Triage plugin -- search sample for keywords and carve content or extract object (if it contains a stream)
        carved_content = {}  # Format: {"objnum": [{keyword: content}, ...]}
        obj_extract_triage = set()
        jbig_objs = set()

        for keyword in triage_keywords:
            # ObjStms handled differently
            if keyword == 'ObjStm':
                continue

            options = {
                "search": keyword,
            }
            pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)

            if pdf_parser_result:
                for p in pdf_parser_result['parts']:
                    content = ""
                    references = []
                    # Trailer will be extracted anyways, try and grab all references anyways -- will be messy
                    if p.startswith("trailer:"):
                        # Grab the content after the keyword
                        # Check that the keyword is actually in the content
                        if "/{}".format(keyword) in p:
                            try:
                                content = p.split(keyword, 1)[1].replace('>>++>>', '').split("/", 1)[0].strip()
                                references = re.findall("[0-9]* [0-9]* R", content)
                            except Exception:
                                continue
                    # If not trailer, should be object
                    elif 'Referencing:' in p:
                        # Grab the content after the keyword
                        if '>>++>>' in p:
                            try:
                                content = p.split(keyword, 1)[1].replace('>>++>>', '').strip()
                            except Exception:
                                try:
                                    content = p.split("\n", 3)[3]
                                except Exception:
                                    content = p
                        else:
                            try:
                                content = p.split("\n", 3)[3]
                            except Exception:
                                content = p
                        # Sometimes the content is the same keyword with references (i.e. "/URI /URI 10 0 R")
                        if content.startswith("/{}".format(keyword)):
                            try:
                                content = re.sub("/{}[ ]*".format(keyword), "", content, 1)
                            except Exception:
                                pass
                        try:
                            references = p.split("\n", 3)[2].replace('Referencing:', '').strip().split(", ")
                        except Exception:
                            pass
                    # Only extract JBIG2Decode objects with deep scan, but always report on their presence
                    if keyword == "JBIG2Decode" and "/Filter" in p and "Contains stream" in p:
                        try:
                            objnum = p.split("\n", 1)[0].split(" ")[1]
                            if request.deep_scan:
                                obj_extract_triage.add(objnum)
                            jbig_objs.add(objnum)
                            continue
                        except Exception as e:
                            self.log.debug(e)
                            continue
                    # If no content, then keyword likely points to reference objects, so grab those
                    if content == '':
                        if len(references) > 0:
                            content = references
                        else:
                            # Something is wrong, drop it.
                            continue
                    else:
                        while True:
                            # Multiple references might be in a list, i.e. /Annot # # R vs. /Annots [# # R # # R]
                            islist = re.match(r"[s]?[ ]?\[([0-9]* [0-9]* R[ \\rn]{0,8})*\]", content)
                            if islist:
                                content = re.sub(r"[\[\]]", "",
                                                 islist.group(0).replace("s ", '').replace("R ", "R,")).split(",")
                                break
                            # References might be with instructions, i.e. [# # R /FitH null]
                            withinst = re.match(r"[s]?[ \\']{0,3}\[[ ]?([0-9]* [0-9]* R)[ \\rn]{1,8}"
                                                r"[/a-zA-Z0-9 ]*[ ]?\]", content)
                            if withinst:
                                content = [withinst.group(1)]
                                break
                            content = [content]
                            break
                    for c in content:
                        # If keyword = JavaScript and content starts with '/JS', disregard as 'JS' will be extracted
                        if "JS" in triage_keywords and keyword == "JavaScript" and "/JS" in c[0:5]:
                            continue
                        if c in references or re.match("[0-9]* [0-9]* R", c):
                            try:
                                ref_obj = c.split(" ", 1)[0]
                                options = {
                                    "object": ref_obj,
                                    "get_object_detail": True
                                }
                                pdf_parser_subresult, err = self.get_pdf_parser(path, working_dir, options)

                                if pdf_parser_subresult:
                                    for sub_p in pdf_parser_subresult['parts']:
                                        sub_references = sub_p.split("\n", 3)[2].replace('Referencing:', '') \
                                            .strip().split(", ")
                                        ptyp = sub_p.split("\n", 2)[1].replace('Type:', '').strip().replace("/", "")
                                        # If the object contains a stream, extract the object.
                                        if "Contains stream" in sub_p:
                                            try:
                                                objnum = sub_p.split("\n", 1)[0].split(" ")[1]
                                                obj_extract_triage.add(objnum)
                                            except Exception:
                                                pass
                                        # Or if the object Type is the keyword, grab all referenced objects.
                                        elif sub_references[0] != '' and len(sub_references) >= 1 \
                                                and ptyp == keyword:
                                            for sr in sub_references:
                                                try:
                                                    objnum = sr.split(" ", 1)[0]
                                                    obj_extract_triage.add(objnum)
                                                except Exception:
                                                    pass
                                        # If not, extract object detail into carved output
                                        elif pdf_parser_subresult['obj_details'] != "":
                                            try:
                                                objnum = sub_p.split("\n", 1)[0].split(" ")[1]
                                                if objnum in carved_content:
                                                    carved_content[objnum] \
                                                        .append({keyword: pdf_parser_subresult['obj_details']})
                                                else:
                                                    carved_content[objnum] = \
                                                        [{keyword: pdf_parser_subresult['obj_details']}]
                                            except Exception:
                                                continue

                                for e in err:
                                    errors.add(e)
                            except Exception:
                                # If none of that worked, just extract the original object for examination.
                                try:
                                    objnum = p.split("\n", 1)[0].split(" ")[1]
                                    obj_extract_triage.add(objnum)
                                except Exception:
                                    pass
                        # If content does not look like a reference:
                        else:
                            if p.startswith("trailer:"):
                                continue
                            objnum = p.split("\n", 1)[0].split(" ")[1]
                            # If the object contains a stream, extract the object
                            if p.split("\n", 4)[3] == "Contains stream":
                                obj_extract_triage.add(objnum)
                            else:
                                # Or just carve the content
                                if objnum in carved_content:
                                    carved_content[objnum].append({keyword: c})
                                else:
                                    carved_content[objnum] = [{keyword: c}]

            for e in errors:
                all_errors.add(e)

        # Add carved content to result output
        show_content_of_interest = False
        if len(carved_content) > 0 or len(jbig_objs) > 0:
            carres = ResultSection(title_text="Content of Interest")
        else:
            carres = None

        if len(jbig_objs) > 0:
            jbigres = ResultSection(title_text="The following Object IDs are JBIG2DECODE streams:",
                                    body_format=BODY_FORMAT.MEMORY_DUMP,
                                    parent=carres)
            jbigres.add_line(', '.join(map(str, jbig_objs)))
            show_content_of_interest = True

        if len(carved_content) > 0:
            for k, l in sorted(carved_content.items()):
                for d in l:
                    for keyw, con in d.items():
                        subres = ResultSection(title_text="Object {0}: Hits for Keyword '{1}':".format(k, keyw))
                        subres.set_heuristic(8)

                        con_bytes = con.encode()
                        if len(con) < 500:
                            subres.body_format = BODY_FORMAT.MEMORY_DUMP
                            subres.add_line(con)

                            # Check for IOC content
                            patterns = PatternMatch()
                            st_value = patterns.ioc_match(con_bytes, bogon_ip=True)
                            if len(st_value) > 0:
                                carres.add_subsection(subres)
                                show_content_of_interest = True
                                for ty, val in st_value.items():
                                    if val == "":
                                        asc_asc = unicodedata.normalize('NFKC', val).encode('ascii', 'ignore')
                                        subres.add_tag(ty, asc_asc)
                                    else:
                                        ulis = list(set(val))
                                        for v in ulis:
                                            subres.add_tag(ty, v)
                        else:
                            crv_sha = hashlib.sha256(con_bytes).hexdigest()
                            if crv_sha not in carved_extracted_shas:
                                f_name = "carved_content_obj_{}_{}".format(k, crv_sha[0:7])
                                subres.add_lines(["Content over 500 bytes; it will be extracted for analysis",
                                                  "Name: {} - SHA256: {}".format(f_name, crv_sha)])
                                carres.add_subsection(subres)
                                show_content_of_interest = True
                                crvf = os.path.join(self.working_directory, f_name)
                                with open(crvf, 'wb') as f:
                                    f.write(con_bytes)
                                request.add_extracted(crvf, os.path.basename(crvf),
                                                      "Extracted content from object {}".format(k))
                                carved_extracted_shas.add(crv_sha)

        if show_content_of_interest:
            pdf_parserres.add_subsection(carres)

        # ELEMENTS
        # Do not show for objstms
        if get_malform:
            if request.deep_scan:
                options = {
                    "verbose": True,
                    "nocanonicalizedoutput": True,
                    "get_malform": get_malform
                }
            elif embed_present:
                options = {
                    "verbose": True,
                    "elements": "ctsi",
                    "type": "/EmbeddedFile",
                    "get_malform": get_malform
                }
            else:
                options = {
                    "verbose": True,
                    "elements": "cst",
                    "get_malform": get_malform
                }
            pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)

            embed_extracted = set()
            if pdf_parser_result:
                if len(pdf_parser_result) == 0:
                    pdf_parserres.add_line("No structure information generated for file. Please see errors.")
                else:
                    # PDF Parser will write any malformed content over 100 bytes to a file
                    files = pdf_parser_result.get("files", None)
                    if files:
                        for f, l in files.items():
                            if f == 'malformed':
                                if len(l) > 0:
                                    pdf_parserres.set_heuristic(6)
                                for i in l:
                                    request.add_extracted(i, os.path.basename(i),
                                                          "Extracted malformed content in PDF Parser Analysis.")

                    parts = pdf_parser_result.get("parts", None)
                    # Extract service will extract the sample's embedded files.
                    # However we want to make note of them so that they are not extracted again below
                    if parts:
                        for p in sorted(parts):
                            if "Type: /EmbeddedFile" in p:
                                getobj = p.split("\n", 1)[0].split(" ")[1]
                                embed_extracted.add(getobj)

            # Extract objects collected from above analysis
            obj_to_extract = obj_extract_triage - embed_extracted - jbig_objs

            if len(obj_to_extract) > 0:
                options = {
                    "filter": True,
                    "object": obj_to_extract,
                    "dump": "extracted_obj_",
                }
                pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)

                if pdf_parser_result:
                    files = pdf_parser_result.get("files", None)
                    extracted_files = []
                    if files:
                        for f, l in files.items():
                            if f == 'embedded':
                                for i in l:
                                    f_name = os.path.basename(i)
                                    obj_id = f_name.replace("extracted_obj_", "")
                                    extracted_files.append("Extracted object {} as {}".format(obj_id, f_name))
                                    request.add_extracted(i, f_name,
                                                          "Object {} extracted in PDF Parser Analysis."
                                                          .format(obj_id))
                    for e in errors:
                        all_errors.add(e)

                    if extracted_files:
                        extract_res = ResultSection(title_text="Extracted embedded objects",
                                                    parent=pdf_parserres)
                        extract_res.set_heuristic(9)
                        extract_res.add_lines(extracted_files)

            # Extract jbig2decode objects in deep scan mode
            if request.deep_scan and len(jbig_objs) > 0:
                options = {
                    "object": jbig_objs,
                    "dump": "extracted_jb_obj_",
                }
                pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)

                if pdf_parser_result:
                    extracted_jb = []
                    files = pdf_parser_result.get("files", None)
                    if files:
                        for f, l in files.items():
                            if f == 'embedded':
                                for i in l:
                                    f_name = os.path.basename(i)
                                    obj_id = f_name.replace("extracted_jb_obj_", "")
                                    extracted_jb.append("JBIG2DECODE object {} extracted as {}".format(obj_id, f_name))
                                    request.add_extracted(i, f_name,
                                                          "JBIG2DECODE object {} extracted in PDF Parser Analysis."
                                                          .format(obj_id))
                    for e in errors:
                        all_errors.add(e)

                    if extracted_jb:
                        jbig_extract_res = ResultSection(title_text="Extracted JBIG2Decode objects",
                                                         parent=pdf_parserres)
                        jbig_extract_res.set_heuristic(9)
                        jbig_extract_res.add_lines(extracted_jb)

        if len(pdf_parserres.subsections) > 0:
            res.add_subsection(pdf_parserres)

    return res, objstms, all_errors
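# Quick illustration of the "<num> <num> R" indirect-reference patterns matched
# by the triage code above. The inputs are made up; the regexes are copied
# verbatim from the function.
import re

assert re.match(r"[0-9]* [0-9]* R", "10 0 R")  # single reference
# A reference list, e.g. /Annots [12 0 R 13 0 R]:
assert re.match(r"[s]?[ ]?\[([0-9]* [0-9]* R[ \\rn]{0,8})*\]", "[12 0 R 13 0 R]")
# A reference followed by instructions, e.g. [10 0 R /FitH null]:
m = re.match(r"[s]?[ \\']{0,3}\[[ ]?([0-9]* [0-9]* R)[ \\rn]{1,8}[/a-zA-Z0-9 ]*[ ]?\]",
             "[10 0 R /FitH null]")
assert m and m.group(1) == "10 0 R"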
def execute(self, request):
    """Main Module. See README for details."""
    global imginfo
    result = Result()
    request.result = result
    self.sha = request.sha256
    infile = request.file_path
    run_steg = request.get_param('run_steg')

    # Run image-specific modules
    supported_images = re.compile('image/(bmp|gif|jpeg|jpg|png)')
    if re.match(supported_images, request.file_type):
        # Extract img info using Pillow (already available in steg.py) and determine if steg modules should be run
        if self.config['run_steg_auto'] or run_steg:
            decloak = True
        else:
            decloak = False
        try:
            imginfo = ImageInfo(infile, request, result, self.working_directory, self.log)
        except NotSupported:
            decloak = False

        # Run Tesseract on sample
        # Process the command and save the csv result in the result object
        usable_out = None
        orig_outfile = os.path.join(self.working_directory, 'outfile')
        stdout, stderr = self.tesseract_call(infile, orig_outfile)
        if stdout or stderr:
            # Assess Tesseract warnings
            if b"pix too small" in stderr:
                # Make the image larger with convert command
                c_outfile = os.path.join(self.working_directory, 'enlrg_img')
                c_stdout, c_stderr = self.convert_img(infile, c_outfile)
                if c_stdout:
                    c_outfile = os.path.join(self.working_directory, 'c_outfile')
                    enlrg_infile = os.path.join(self.working_directory, 'enlrg')
                    if not c_stderr:
                        stdout, stderr = self.tesseract_call(enlrg_infile, c_outfile)
                        if stdout:
                            if not stderr:
                                outfile = c_outfile
                            else:
                                outfile = orig_outfile
                        else:
                            outfile = orig_outfile
                    else:
                        outfile = orig_outfile
                else:
                    outfile = orig_outfile
            else:
                outfile = orig_outfile
                self.log.debug("Tesseract errored/warned on sample {}. Error:{}".format(self.sha, stderr))

            usable_out = self.assess_output(outfile, request)

        if usable_out:
            ores = ResultSection("OCR Engine detected strings in image",
                                 body_format=BODY_FORMAT.MEMORY_DUMP)
            ores.add_line("Text preview (up to 500 bytes):\n")
            ores.add_line("{}".format(usable_out[0:500]))
            result.add_section(ores)

        # Find attached data
        additional_content = self.find_additional_content(infile)
        if additional_content:
            ares = ResultSection("Possible Appended Content Found",
                                 body_format=BODY_FORMAT.MEMORY_DUMP)
            ares.add_line("{} Bytes of content found at end of image file".format(len(additional_content)))
            ares.add_line("Text preview (up to 500 bytes):\n")
            ares.add_line("{}".format(safe_str(additional_content)[0:500]))
            ares.set_heuristic(2)
            result.add_section(ares)
            file_name = "{}_appended_img_content".format(hashlib.sha256(additional_content).hexdigest()[0:10])
            file_path = os.path.join(self.working_directory, file_name)
            # Write the carved content to disk before registering it as extracted
            with open(file_path, 'wb') as unibu_file:
                unibu_file.write(additional_content)
            request.add_extracted(file_path, file_name, "Carved content found at end of image.")

        # Steganography modules
        if decloak:
            if request.deep_scan:
                imginfo.decloak()
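# find_additional_content() above returns whatever trails the image data. One
# plausible way to implement it is to look for the format's end-of-image marker
# and carve everything after it; this is an assumption about the approach, not
# the service's actual code, and the naive rfind() can be fooled by markers
# occurring inside the appended data itself.
def find_additional_content(self, infile):
    with open(infile, 'rb') as f:
        data = f.read()
    # JPEG end-of-image marker, then the fixed tail of a PNG IEND chunk
    for eoi in (b'\xff\xd9', b'IEND\xaeB`\x82'):
        idx = data.rfind(eoi)
        if idx != -1 and idx + len(eoi) < len(data):
            return data[idx + len(eoi):]
    return b''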
def execute(self, request):
    # --- Setup ----------------------------------------------------------------------------------------------
    request.result = Result()
    patterns = PatternMatch()

    if request.deep_scan:
        max_attempts = 100
    else:
        max_attempts = 10

    self.files_extracted = set()
    self.hashes = set()
    before = set()

    # --- Pre-Processing --------------------------------------------------------------------------------------
    # Get all IOCs prior to de-obfuscation
    pat_values = patterns.ioc_match(request.file_contents, bogon_ip=True, just_network=False)
    if pat_values:
        if request.get_param('extract_original_iocs'):
            ioc_res = ResultSection("The following IOCs were found in the original file",
                                    parent=request.result,
                                    body_format=BODY_FORMAT.MEMORY_DUMP)
        else:
            ioc_res = None
        for k, val in pat_values.items():
            if val == "":
                asc_asc = unicodedata.normalize('NFKC', val).encode('ascii', 'ignore')
                if ioc_res:
                    ioc_res.add_line(f"Found {k.upper().replace('.', ' ')}: {safe_str(asc_asc)}")
                    ioc_res.add_tag(k, asc_asc)
                before.add((k, asc_asc))
            else:
                for v in val:
                    if ioc_res:
                        ioc_res.add_line(f"Found {k.upper().replace('.', ' ')}: {safe_str(v)}")
                        ioc_res.add_tag(k, v)
                    before.add((k, v))

    # --- Prepare Techniques -----------------------------------------------------------------------------------
    techniques = [
        ('MSOffice Embedded script', self.msoffice_embedded_script_string),
        ('CHR and CHRB decode', self.chr_decode),
        ('String replace', self.string_replace),
        ('Powershell carets', self.powershell_carets),
        ('Array of strings', self.array_of_strings),
        ('Fake array vars', self.vars_of_fake_arrays),
        ('Reverse strings', self.str_reverse),
        ('B64 Decode', self.b64decode_str),
        ('Simple XOR function', self.simple_xor_function),
    ]
    second_pass = [
        ('Concat strings', self.concat_strings),
        ('MSWord macro vars', self.mswordmacro_vars),
        ('Powershell vars', self.powershell_vars),
        ('Charcode hex', self.charcode_hex),
    ]
    final_pass = [
        ('Charcode', self.charcode),
    ]

    code_extracts = [
        ('.*html.*', "HTML scripts extraction", self.extract_htmlscript),
    ]

    layers_list = []
    layer = request.file_contents

    # --- Stage 1: Script Extraction ---------------------------------------------------------------------------
    for pattern, name, func in code_extracts:
        if re.match(re.compile(pattern), request.task.file_type):
            extracted_parts = func(request.file_contents)
            layer = b"\n".join(extracted_parts).strip()
            layers_list.append((name, layer))
            break

    # --- Stage 2: De-obfuscation ------------------------------------------------------------------------------
    idx = 0
    first_pass_len = len(techniques)
    layers_count = len(layers_list)
    while True:
        if idx > max_attempts:
            final_pass.extend(techniques)
            for name, technique in final_pass:
                res = technique(layer)
                if res:
                    layers_list.append((name, res))
            break

        for name, technique in techniques:
            res = technique(layer)
            if res:
                layers_list.append((name, res))
                # Looks like it worked, restart with the new layer
                layer = res

        # If the layers haven't changed in a pass, break
        if layers_count == len(layers_list):
            if len(techniques) != first_pass_len:
                final_pass.extend(techniques)
                for name, technique in final_pass:
                    res = technique(layer)
                    if res:
                        layers_list.append((name, res))
                break
            else:
                for x in second_pass:
                    techniques.insert(0, x)
        layers_count = len(layers_list)
        idx += 1

    # --- Compiling results ------------------------------------------------------------------------------------
    if len(layers_list) > 0:
        extract_file = False
        num_layers = len(layers_list)
        heur_id = None
        # Compute heuristic
        if num_layers < 5:
            heur_id = 1
        elif num_layers < 10:
            heur_id = 2
        elif num_layers < 50:
            heur_id = 3
        elif num_layers < 100:
            heur_id = 4
        elif num_layers >= 100:
            heur_id = 5

        # Cleanup final layer
        clean = self.clean_up_final_layer(layers_list[-1][1])
        if clean != request.file_contents:
            # Check for new IOCs
            pat_values = patterns.ioc_match(clean, bogon_ip=True, just_network=False)
            diff_tags = {}

            for k, val in pat_values.items():
                if val == "":
                    asc_asc = unicodedata.normalize('NFKC', val).encode('ascii', 'ignore')
                    if (k, asc_asc) not in before:
                        diff_tags.setdefault(k, [])
                        diff_tags[k].append(asc_asc)
                else:
                    for v in val:
                        if (k, v) not in before:
                            diff_tags.setdefault(k, [])
                            diff_tags[k].append(v)

            if request.deep_scan or (len(clean) > 1000 and heur_id >= 4) or diff_tags:
                extract_file = True

            # Display obfuscation steps
            mres = ResultSection("De-obfuscation steps taken by DeobfuScripter", parent=request.result)
            if heur_id:
                mres.set_heuristic(heur_id)

            lcount = Counter([x[0] for x in layers_list])
            for l, c in lcount.items():
                mres.add_line(f"{l}, {c} time(s).")

            # Display final layer
            byte_count = 5000
            if extract_file:
                # Save extracted file
                byte_count = 500
                fn = f"{request.file_name}_decoded_final"
                fp = os.path.join(self.working_directory, fn)
                with open(fp, 'wb') as dcf:
                    dcf.write(clean)
                self.log.debug(f"Submitted dropped file for analysis: {fp}")
                request.add_extracted(fp, fn, "Final deobfuscation layer")

            ResultSection(f"First {byte_count} bytes of the final layer:",
                          body=safe_str(clean[:byte_count]),
                          body_format=BODY_FORMAT.MEMORY_DUMP,
                          parent=request.result)

            # Display new IOCs from final layer
            if len(diff_tags) > 0:
                ioc_new = ResultSection("New IOCs found after de-obfuscation",
                                        parent=request.result,
                                        body_format=BODY_FORMAT.MEMORY_DUMP)
                has_network_heur = False
                for ty, val in diff_tags.items():
                    for v in val:
                        if "network" in ty:
                            has_network_heur = True
                        ioc_new.add_line(f"Found {ty.upper().replace('.', ' ')}: {safe_str(v)}")
                        ioc_new.add_tag(ty, v)

                if has_network_heur:
                    ioc_new.set_heuristic(7)
                else:
                    ioc_new.set_heuristic(6)

            if len(self.files_extracted) > 0:
                ext_file_res = ResultSection("The following files were extracted during the deobfuscation",
                                             heuristic=Heuristic(8),
                                             parent=request.result)
                for f in self.files_extracted:
                    ext_file_res.add_line(os.path.basename(f))
                    request.add_extracted(f, os.path.basename(f), "File of interest deobfuscated from sample")
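# Sketch of one technique from the lists above. 'Powershell carets' refers to
# the cmd.exe escaping trick of sprinkling carets through a command
# (p^ow^ersh^ell). The real method may be more careful (e.g. about quoted
# strings); this is a minimal plausible version honouring the contract that
# techniques return the transformed layer, or None when nothing changed.
def powershell_carets(self, text: bytes):
    if b'^' not in text:
        return None
    return text.replace(b'^', b'')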
def tag_data(data, data_deobfuscated, result_ioc, result_formula):
    pattern = PatternMatch()

    # Get all IoCs without deobfuscation
    ioc_dict = {}
    formulas = collections.OrderedDict()
    for line in data:
        if line[:4] == 'CELL':
            split_value = line.split(',', 1)
            cell = split_value[0].split(':')[1].strip()
            formula = split_value[1].rsplit(',', 1)[0].strip()
            # Add formula to list of formulas if it contains IoC(s)
            if pattern.ioc_match(formula, cell, ioc_dict):
                formulas[cell] = formula

    # Get all IoCs after deobfuscation
    ioc_deobfuscated_dict = {}
    formulas_deobfuscated = collections.OrderedDict()
    for line in data_deobfuscated:
        split_value = line.split(':', 1)
        cell = split_value[0].strip()
        formula = split_value[1].strip()
        # Add formula to list of deobfuscated formulas if it contains IoC(s)
        if pattern.ioc_match(formula, cell, ioc_deobfuscated_dict):
            formulas_deobfuscated[cell] = formula

    # Remove duplicate IoCs (found both before AND after deobfuscation)
    for ioc_tag, values in ioc_deobfuscated_dict.copy().items():
        for ioc_details in values.copy():
            if ioc_tag in ioc_dict and ioc_details in ioc_dict[ioc_tag]:
                ioc_deobfuscated_dict[ioc_tag].remove(ioc_details)
                # Remove ioc_tag if no IoCs are associated with it
                if len(ioc_deobfuscated_dict[ioc_tag]) == 0:
                    del ioc_deobfuscated_dict[ioc_tag]

    # Remove duplicate formulas from the same cell (found both before AND after deobfuscation)
    for cell, formula in formulas_deobfuscated.copy().items():
        if cell in formulas and formula in formulas[cell]:
            del formulas_deobfuscated[cell]

    # Create the appropriate result subsections for formulas
    formulas_subsection = ResultSection('Formulas')
    formulas_deobfuscated_subsection = ResultSection('Deobfuscated Formulas')
    formulas_deobfuscated_subsection.set_heuristic(5)
    if formulas:
        result_formula.add_subsection(formulas_subsection)
    if formulas_deobfuscated:
        result_formula.add_subsection(formulas_deobfuscated_subsection)

    # Generate result subsections for IoCs found without deobfuscation
    heuristics = [1, 2]
    for ioc_tag, values in ioc_dict.items():
        for ioc_details in values:
            ioc = ioc_details[0]
            title = ioc_details[1]
            heuristic = heuristics[ioc_details[2]]
            ioc_subsection = get_result_subsection(result_ioc, title, heuristic)
            ioc_subsection.add_tag(ioc_tag, ioc)
            line_pattern = re.compile('(\\n|^)' + re.escape(ioc) + '(\\n|$)')
            if ioc_subsection.body is not None and not line_pattern.search(ioc_subsection.body):
                ioc_subsection.add_line(ioc)
            elif ioc_subsection.body is None:
                ioc_subsection.add_line(ioc)
            formulas_subsection.add_tag(ioc_tag, ioc)

    # Generate result subsections for deobfuscated IoCs
    heuristics = [3, 4]
    for ioc_tag, values in ioc_deobfuscated_dict.items():
        for ioc_details in values:
            ioc = ioc_details[0]
            title = 'Deobfuscated ' + ioc_details[1]
            heuristic = heuristics[ioc_details[2]]
            ioc_subsection = get_result_subsection(result_ioc, title, heuristic)
            ioc_subsection.add_tag(ioc_tag, ioc)
            line_pattern = re.compile('(\\n|^)' + re.escape(ioc) + '(\\n|$)')
            if ioc_subsection.body is not None and not line_pattern.search(ioc_subsection.body):
                ioc_subsection.add_line(ioc)
            elif ioc_subsection.body is None:
                ioc_subsection.add_line(ioc)
            formulas_deobfuscated_subsection.add_tag(ioc_tag, ioc)

    # Populate 'Formulas' result subsection with all suspicious formulas found without deobfuscation
    for cell, formula in formulas.items():
        # Only add complete formulas
        if "FORMULA(" in formula:
            cell_referenced = formula.rsplit(',', 1)[1][:-1]
            if cell_referenced not in formulas.keys():
                formulas_subsection.add_line(cell + ": " + formula)
        else:
            formulas_subsection.add_line(cell + ": " + formula)

    # Populate 'Deobfuscated Formulas' result subsection with all deobfuscated suspicious formulas
    for cell, formula in formulas_deobfuscated.items():
        # Only add complete formulas
        if "FORMULA(" in formula:
            cell_referenced = formula.rsplit(',', 1)[1][:-1]
            if cell_referenced not in formulas_deobfuscated.keys():
                formulas_deobfuscated_subsection.add_line(cell + ": " + formula)
        else:
            formulas_deobfuscated_subsection.add_line(cell + ": " + formula)