def generate_results(self, presults, result, analysis_results, request):
        if presults['unpacked']:
            result.add_section(
                ResultSection("Successully unpacked binary.",
                              heuristic=Heuristic(1)))

        for r in presults['unpacked_samples']:
            if len(r['malware_id']) > 0:
                for rm in r['malware_id']:
                    section = ResultSection("{} - {}".format(
                        r['sha256'], rm['name']),
                                            heuristic=Heuristic(2))
                    section.add_line("Details: {}".format(rm['reference']))
                    result.add_section(section)
            request.add_extracted(r['data_path'], r['sha256'],
                                  f'Unpacked from {request.sha256}')

        result.add_section(
            ResultSection(f"UNPACME Detailed Results",
                          body_format=BODY_FORMAT.JSON,
                          body=json.dumps(analysis_results['results'])))

        return result, request
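
# A minimal sketch (assuming the assemblyline_v4_service package these
# snippets use; heuristic IDs must exist in the service manifest) of the
# Result/ResultSection/Heuristic pattern shown above.
from assemblyline_v4_service.common.result import Heuristic, Result, ResultSection

def build_unpack_result(unpacked: bool) -> Result:
    result = Result()
    if unpacked:
        # Attach a scoring heuristic directly to the section
        result.add_section(
            ResultSection("Successfully unpacked binary.",
                          heuristic=Heuristic(1)))
    return result
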
# Example #2
    def execute(self, request):
        result = Result()
        file_path = request.file_path

        p1 = subprocess.Popen("clamscan -a -z --detect-pua --alert-macros " +
                              file_path,
                              shell=True,
                              stdout=subprocess.PIPE)
        p1.wait()
        stdout = p1.communicate()[0].decode("utf-8")

        report = stdout.split("\n")
        report = list(filter(None, report))

        text_section = ResultSection("Successfully scanned the file")
        if "FOUND" in report[0]:
            text_section.set_heuristic(1)

        for line in report:
            text_section.add_line(line)

        result.add_section(text_section)
        request.result = result
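
# The same scan as a sketch using subprocess.run (assumes clamscan is on
# PATH); text=True decodes stdout and avoids a manual wait()+PIPE deadlock.
import subprocess

def run_clamscan(file_path: str) -> list:
    proc = subprocess.run(
        ["clamscan", "-a", "-z", "--detect-pua", "--alert-macros", file_path],
        capture_output=True, text=True)
    # One report line per finding; drop the empty trailing lines
    return [line for line in proc.stdout.split("\n") if line]
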
    def run_analysis(self, quark_out, result):
        with open(quark_out) as f:
            data = json.load(f)

        self.manage_threat_level(data, result)
        dic_report_crime = {}
        crimes_section = ResultSection("Crimes detected")
        # Order crimes by confidence: 100% first, then 80%, then 60% last
        confidence_order = {"100%": 0, "80%": 1, "60%": 2}
        crimes_array = sorted(
            (crime for crime in data['crimes']
             if crime['confidence'] in confidence_order),
            key=lambda crime: confidence_order[crime['confidence']])

        for crime in crimes_array:
            if crime['confidence'] not in ("80%", "100%"):
                continue
            crime_section = ResultSection(crime["crime"],
                                          parent=crimes_section)
            crime_section.add_line(
                "confidence level : {0}".format(crime["confidence"]))
            dic_report_crime[crime["crime"]] = crime_section

            if len(crime['permissions']) > 0:
                perm_section = ResultSection(
                    "permissions associated with the crime",
                    parent=crime_section,
                    body_format=BODY_FORMAT.MEMORY_DUMP)
                for permission in crime['permissions']:
                    perm_section.add_line(permission)

            if len(crime['native_api']) > 0:
                native_api_section = ResultSection(
                    "native_api",
                    parent=crime_section,
                    body_format=BODY_FORMAT.MEMORY_DUMP)
                for api in crime["native_api"]:
                    native_api_section.add_line("class : {0}".format(
                        api["class"]))
                    native_api_section.add_line("method : {0}".format(
                        api["method"]))
        result.add_section(crimes_section)
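
# Self-contained sketch of the confidence ordering above: 100% crimes first,
# then 80%, with 60% kept at the end (input dicts are illustrative).
def order_crimes(crimes):
    confidence_order = {"100%": 0, "80%": 1, "60%": 2}
    return sorted((c for c in crimes if c.get('confidence') in confidence_order),
                  key=lambda c: confidence_order[c['confidence']])

assert [c['confidence'] for c in order_crimes(
    [{'confidence': '60%'}, {'confidence': '100%'}, {'confidence': '80%'}]
)] == ['100%', '80%', '60%']
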
    def execute(self, request):
        """Main Module. See README for details."""
        max_size = self.config.get('MAX_PDF_SIZE', 3000000)
        request.result = result = Result()
        if os.path.getsize(request.file_path) < max_size or request.deep_scan:
            path = request.file_path
            working_dir = self.working_directory

            # CALL PDFID and identify all suspicious keyword streams
            additional_keywords = self.config.get('ADDITIONAL_KEYS', [])
            heur = deepcopy(self.config.get('HEURISTICS', []))
            all_errors = set()

            res_txt = "Main Document Results"
            res, contains_objstms, errors = self.analyze_pdf(
                request, res_txt, path, working_dir, heur, additional_keywords)
            result.add_section(res)

            for e in errors:
                all_errors.add(e)

            #  ObjStms: Treat all ObjStms like a standalone PDF document
            if contains_objstms:
                objstm_files = self.analyze_objstm(path, working_dir,
                                                   request.deep_scan)
                obj_cnt = 1
                for osf in objstm_files:
                    parent_obj = os.path.basename(osf).split("_")[1]
                    res_txt = "ObjStream Object {0} from Parent Object {1}".format(
                        obj_cnt, parent_obj)
                    # These plugins would flag the extracted object only because
                    # the service itself created the PDF, so drop them
                    heur = [
                        x for x in heur
                        if 'plugin_suspicious_properties' not in x
                        and 'plugin_embeddedfile' not in x
                        and 'plugin_nameobfuscation' not in x
                    ]

                    res, contains_objstms, errors = self.analyze_pdf(
                        request,
                        res_txt,
                        osf,
                        working_dir,
                        heur,
                        additional_keywords,
                        get_malform=False)

                    obj_cnt += 1
                    result.add_section(res)

            if len(all_errors) > 0:
                erres = ResultSection(title_text="Errors Analyzing PDF")
                for e in all_errors:
                    erres.add_line(e)
                result.add_section(erres)

        else:
            section = ResultSection(
                "PDF Analysis of the file was skipped because the file is too big (limit is 3 MB)."
            )
            section.set_heuristic(10)
            result.add_section(section)
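
# Sketch of the size gate used above: analyze when the file is under the
# configured limit or when a deep scan was requested (names are illustrative).
import os

def should_analyze(file_path: str, max_size: int = 3000000,
                   deep_scan: bool = False) -> bool:
    return deep_scan or os.path.getsize(file_path) < max_size
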
    def run_badging_analysis(self, apk_file: str, result: Result):
        badging_args = ['d', 'badging', apk_file]
        badging, errors = self.run_appt(badging_args)
        if not badging:
            return
        res_badging = ResultSection("Android application details")
        libs = []
        permissions = []
        components = []
        features = []
        pkg_version = None
        for line in badging.splitlines():
            if line.startswith("package:"):
                pkg_name = line.split("name='")[1].split("'")[0]
                pkg_version = line.split("versionCode='")[1].split("'")[0]
                res_badging.add_line(f"Package: {pkg_name} v.{pkg_version}")
                res_badging.add_tag('file.apk.pkg_name', pkg_name)
                res_badging.add_tag('file.apk.app.version', pkg_version)

            if line.startswith("sdkVersion:"):
                min_sdk = line.split(":'")[1][:-1]
                res_badging.add_line(f"Min SDK: {min_sdk}")
                res_badging.add_tag('file.apk.sdk.min', min_sdk)

            if line.startswith("targetSdkVersion:"):
                target_sdk = line.split(":'")[1][:-1]
                res_badging.add_line(f"Target SDK: {target_sdk}")
                res_badging.add_tag('file.apk.sdk.target', target_sdk)

            if line.startswith("application-label:"):
                label = line.split(":'")[1][:-1]
                res_badging.add_line(f"Default Label: {label}")
                res_badging.add_tag('file.apk.app.label', label)

            if line.startswith("launchable-activity:"):
                launch = line.split("name='")[1].split("'")[0]
                res_badging.add_line(f"Launchable activity: {launch}")
                res_badging.add_tag('file.apk.activity', launch)

            if line.startswith("uses-library-not-required:"):
                lib = line.split(":'")[1][:-1]
                if lib not in libs:
                    libs.append(lib)

            if line.startswith("uses-permission:") or line.startswith("uses-implied-permission:"):
                perm = line.split("name='")[1].split("'")[0]
                if perm not in permissions:
                    permissions.append(perm)

            if line.startswith("provides-component:"):
                component = line.split(":'")[1][:-1]
                if component not in components:
                    components.append(component)

            if "uses-feature:" in line or "uses-implied-feature:" in line:
                feature = line.split("name='")[1].split("'")[0]
                if feature not in features:
                    features.append(feature)

        if pkg_version is not None:
            pkg_version = int(pkg_version)
            if pkg_version < 15:
                ResultSection("Package version is suspiciously low", parent=res_badging,
                              heuristic=Heuristic(17))
            elif pkg_version > 999999999:
                ResultSection("Package version is suspiciously high", parent=res_badging,
                              heuristic=Heuristic(17))

        if libs:
            res_lib = ResultSection("Libraries used", parent=res_badging)
            for lib in libs:
                res_lib.add_line(lib)
                res_lib.add_tag('file.apk.used_library', lib)

        if permissions:
            res_permissions = ResultSection("Permissions used", parent=res_badging)
            dangerous_permissions = []
            unknown_permissions = []
            for perm in permissions:
                if perm in ALL_ANDROID_PERMISSIONS:
                    if 'dangerous' in ALL_ANDROID_PERMISSIONS[perm]:
                        dangerous_permissions.append(perm)
                    else:
                        res_permissions.add_line(perm)
                        res_permissions.add_tag('file.apk.permission', perm)
                else:
                    unknown_permissions.append(perm)

            if len(set(permissions)) < len(permissions):
                ResultSection("Some permissions are defined more then once", parent=res_badging,
                              heuristic=Heuristic(18))

            if dangerous_permissions:
                res_dangerous_perm = ResultSection("Dangerous permissions used", parent=res_badging,
                                                   heuristic=Heuristic(4))
                for perm in dangerous_permissions:
                    res_dangerous_perm.add_line(perm)
                    res_dangerous_perm.add_tag('file.apk.permission', perm)

            if unknown_permissions:
                res_unknown_perm = ResultSection("Unknown permissions used", parent=res_badging,
                                                 heuristic=Heuristic(5))
                for perm in unknown_permissions:
                    res_unknown_perm.add_line(perm)
                    res_unknown_perm.add_tag('file.apk.permission', perm)

        if features:
            res_features = ResultSection("Features used", parent=res_badging)
            for feature in features:
                res_features.add_line(feature)
                res_features.add_tag('file.apk.feature', feature)

        if components:
            res_components = ResultSection("Components provided", parent=res_badging)
            for component in components:
                res_components.add_line(component)
                res_components.add_tag('file.apk.provides_component', component)

        result.add_section(res_badging)
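
# Sketch of the quoted-value extraction applied to aapt badging lines above;
# the sample line mirrors the "key='value'" format the parser expects.
def badging_value(line: str, key: str) -> str:
    return line.split(f"{key}='")[1].split("'")[0]

assert badging_value("package: name='com.example' versionCode='7'",
                     "name") == "com.example"
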
    def find_scripts_and_exes(apktool_out_dir: str, result: Result):
        scripts = []
        executables = []
        apks = []

        # We scan the full apktool output dir here; trim the test_paths list
        # below if you only want to scan part of it
        test_paths = [apktool_out_dir]
        for path in test_paths:
            for root, _, files in os.walk(path):
                for f in files:
                    if f.endswith(".smali"):
                        continue
                    cur_file = os.path.join(root, f)
                    file_type = fileinfo(cur_file)['type']

                    if "code/sh" in file_type:
                        scripts.append(cur_file.replace(apktool_out_dir, ''))
                    elif "executable/linux" in file_type:
                        executables.append(cur_file.replace(apktool_out_dir, ''))
                    elif "android/apk" in file_type:
                        executables.append(cur_file.replace(apktool_out_dir, ''))

        if scripts:
            res_script = ResultSection("Shell script(s) found inside APK", parent=result,
                                       heuristic=Heuristic(1))
            for script in sorted(scripts)[:20]:
                res_script.add_line(script)
            if len(scripts) > 20:
                res_script.add_line(f"and {len(scripts) - 20} more...")

        if executables:
            res_exe = ResultSection("Executable(s) found inside APK", parent=result,
                                    heuristic=Heuristic(2))
            for exe in sorted(executables)[:20]:
                res_exe.add_line(exe)
            if len(executables) > 20:
                res_exe.add_line(f"and {len(executables) - 20} more...")

        if apks:
            res_apk = ResultSection("Other APKs where found inside the APK", parent=result,
                                    heuristic=Heuristic(19))
            for apk in sorted(apks)[:20]:
                res_apk.add_line(apk)
            if len(apks) > 20:
                res_apk.add_line(f"and {len(apks) - 20} more...")
# Example #7
    def execute(self, request):
        """Main Module. See README for details."""
        request.result = Result()
        self.result = request.result
        wrk_dir = self.working_directory
        ipa_path = request.file_path
        self.known_keys = None
        self.reported_keys = {}

        # Determine if PK container has IPA content to parse
        try:
            ipa_file = zipfile.ZipFile(ipa_path)
        except zipfile.BadZipfile:
            # Return if files cannot be extracted
            return
        # isipa returns the Info.plist path, or False if Info.plist is not found
        name_list, isipa = self.isipa(ipa_file)

        if not isipa:
            return

        # Extract Files of interest using 7zip (some files can be AES encrypted which standard zipfile library does not
        # support)
        extract_success = False
        try:
            self.extract_archive(ipa_path)
            extract_success = True
        except Exception as e:
            self.log.error(f"Could not extract IPA file due to 7zip error {e}")

        if not extract_success:
            return

        with open(os.path.join(os.path.dirname(__file__), "keys.json"), 'r') as f:
            keys_dict = json.load(f)
            self.known_keys = keys_dict['glossary']

        patterns = PatternMatch()

        # Info.plist
        main_exe = None
        res = ResultSection("Info.plist")
        info_plist_path = os.path.join(wrk_dir, isipa)

        isempty, plist_dict = self.gen_plist_extract(info_plist_path, patterns)

        if plist_dict is None:
            res.add_line("Info.plist in sample cannot be parsed. Sample may be corrupt.")

        elif isempty:
            res.add_line("Empty Info.plist file. Archive contents may be encrypted.")

        else:
            # Grab the main executable name
            if plist_dict.get("CFBundleExecutable", None):
                i = plist_dict["CFBundleExecutable"]
                try:
                    main_exe = (i, f"Name of bundle's main executable file: {i}")
                    res.add_line(main_exe[1])
                except UnicodeEncodeError:
                    i = i.encode('utf8', 'replace')
                    main_exe = (i, f"Name of bundle's main executable file: {i}")
                    res.add_line(main_exe[1])

            iden_key_res, unk_key_res = self.parse_plist(plist_dict)
            if iden_key_res:
                res.add_subsection(iden_key_res)
            if unk_key_res:
                res.add_subsection(unk_key_res)
            request.result.add_section(res)

        # PkgInfo file
        pkg_types = {
            'APPL': 'application',
            'FMWK': 'frameworks',
            'BNDL': 'loadable bundle'
        }
        pattern = re.compile(r'Payload/[^/]*\.app/PkgInfo')
        for fn in name_list:
            m = pattern.match(fn)
            if m is not None:
                res = ResultSection("PkgInfo Details")
                pkg_info_path = os.path.join(wrk_dir, m.group())
                with open(pkg_info_path, 'r') as f:
                    pkg_info = f.read()
                if pkg_info == "":
                    res.add_line("Empty PkgInfo file. Archive contents may be encrypted.")
                elif len(pkg_info) == 8:
                    # noinspection PyBroadException
                    try:
                        pkgtype = pkg_info[0:4]
                        if pkgtype in pkg_types:
                            pkgtype = pkg_types[pkgtype]
                        creator_code = pkg_info[4:]
                        res = ResultSection("PkgInfo Details")
                        res.add_line(f"Package Type: {pkgtype}; Application Signature: {creator_code}")
                    except Exception:
                        continue
                request.result.add_section(res)

        if main_exe:
            main_exe_reg = (rf'.*{main_exe[0]}$', f"Main executable file {main_exe[0]}")
        else:
            main_exe_reg = ('$', 'Placeholder for missing main executable name.')

        fextract_regs = [
            main_exe_reg,
            (r'Payload.*\.(?:crt|cer|der|key|p12|p7b|p7c|pem|pfx)$', "Certificate or key file"),
            (r'Payload.*libswift[^\/]*\.dylib$', "Swift code library files"),
            (r'Payload\/META-INF\/.*ZipMetadata.plist$', "IPA archive content info"),
            (r'Payload.*mobileprovision$', "Provisioning profile for limiting app uploads"),
            (r'.*plist$', "Plist information file"),
        ]

        empty_file_msg = "Empty file. Archive contents may be encrypted."
        int_files = {}
        plist_res = ResultSection("Other Plist File Information (displaying new key-value pairs only)")
        for root, dirs, files in os.walk(wrk_dir):
            for name in files:
                full_path = safe_str(os.path.join(root, name))
                if os.path.getsize(full_path) == 0:
                    int_files.setdefault(empty_file_msg, []).append(full_path)
                else:
                    for p, desc in fextract_regs:
                        pattern = re.compile(p)
                        m = pattern.match(full_path)
                        if m is not None:
                            # The main executable file was already identified above
                            if not desc.startswith("Main executable file "):
                                if desc.startswith("Plist"):
                                    pres = ResultSection(f"{full_path.replace(wrk_dir, '')}")
                                    isempty, plist_parsed = self.gen_plist_extract(full_path, patterns)
                                    if not isempty and plist_parsed:
                                        iden_key_res, unk_key_res = self.parse_plist(plist_parsed)
                                        # If all keys have already been reported, skip this plist
                                        if not iden_key_res and not unk_key_res:
                                            continue
                                        if iden_key_res:
                                            pres.add_subsection(iden_key_res)
                                        if unk_key_res:
                                            pres.add_subsection(unk_key_res)
                                        plist_res.add_subsection(pres)
                                else:
                                    int_files.setdefault(desc, []).append(full_path)
                            break

        if len(plist_res.subsections) > 0:
            request.result.add_section(plist_res)

        if len(int_files) > 0:
            intf_sec = ResultSection("Files of interest", parent=res)
            for intf_d, intf_p in int_files.items():
                intf_subsec = ResultSection(intf_d, parent=intf_sec)
                for f in intf_p:
                    intf_subsec.add_line(f.replace(f"{wrk_dir}/", ""))
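
# Sketch of the walk-and-classify loop above: the first matching regex wins
# and paths are grouped per description (patterns are illustrative).
import os
import re

def classify_files(root_dir, patterns):
    matches = {}
    for root, _, files in os.walk(root_dir):
        for name in files:
            full_path = os.path.join(root, name)
            for regex, desc in patterns:
                if re.match(regex, full_path):
                    matches.setdefault(desc, []).append(full_path)
                    break
    return matches
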
    def unicode_results(self, request: ServiceRequest,
                        patterns: PatternMatch) -> Optional[ResultSection]:
        """
        Finds and reports Unicode-encoded strings

        Args:
            request: AL request object with result section
            patterns: PatternMatch object

        Returns:
            The result section (with request.result as its parent) if one is created
        """
        unicode_al_results: Dict[str, Tuple[bytes, bytes]] = {}
        dropped_unicode: List[Tuple[str, str]] = []
        for hes in self.HEXENC_STRINGS:
            if re.search(
                    re.escape(hes) + b'[A-Fa-f0-9]{2}', request.file_contents):
                dropped = self.decode_encoded_udata(request, hes,
                                                    request.file_contents,
                                                    unicode_al_results)
                for uhash in dropped:
                    dropped_unicode.append((uhash, safe_str(hes)))

        # Report Unicode Encoded Data:
        unicode_heur = Heuristic(
            5, frequency=len(dropped_unicode)) if dropped_unicode else None
        unicode_emb_res = ResultSection(
            "Found Unicode-Like Strings in Non-Executable:",
            body_format=BODY_FORMAT.MEMORY_DUMP,
            heuristic=unicode_heur)
        for uhash, uenc in dropped_unicode:
            unicode_emb_res.add_line(
                f"Extracted over 50 bytes of possible embedded unicode with "
                f"{uenc} encoding. SHA256: {uhash}. See extracted files.")

        for unires_index, (sha256,
                           (decoded,
                            encoded)) in enumerate(unicode_al_results.items()):
            sub_uni_res = (ResultSection(f"Result {unires_index}",
                                         parent=unicode_emb_res))
            sub_uni_res.add_line(f'ENCODED TEXT SIZE: {len(decoded)}')
            sub_uni_res.add_line(
                f'ENCODED SAMPLE TEXT: {safe_str(encoded)}[........]')
            sub_uni_res.add_line(f'DECODED SHA256: {sha256}')
            subb_uni_res = (ResultSection("DECODED ASCII DUMP:",
                                          body_format=BODY_FORMAT.MEMORY_DUMP,
                                          parent=sub_uni_res))
            subb_uni_res.add_line(safe_str(decoded))
            # Look for IOCs of interest
            hits = self.ioc_to_tag(decoded,
                                   patterns,
                                   sub_uni_res,
                                   st_max_length=1000,
                                   taglist=True)
            if hits:
                sub_uni_res.set_heuristic(6)
                subb_uni_res.add_line(
                    "Suspicious string(s) found in decoded data.")
            else:
                sub_uni_res.set_heuristic(4)

        if unicode_al_results or dropped_unicode:
            request.result.add_section(unicode_emb_res)
            return unicode_emb_res
        return None
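
# Sketch of the detection step in unicode_results: look for an escape prefix
# followed by two hex digits anywhere in the raw bytes (prefix is illustrative).
import re

def has_hexenc_string(prefix: bytes, contents: bytes) -> bool:
    return re.search(re.escape(prefix) + b'[A-Fa-f0-9]{2}', contents) is not None

assert has_hexenc_string(b'\\u', b'payload \\u41\\u42 tail')
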
    def execute(self, request):
        self.result = Result()
        request.result = self.result
        self.request = request

        self.ip_list = []
        self.url_list = []
        self.found_powershell = False
        self.file_hashes = []

        vmonkey_err = False
        actions = []
        external_functions = []
        tmp_iocs = []
        output_results = {}

        # Running ViperMonkey
        try:
            cmd = " ".join([
                PYTHON2_INTERPRETER,
                os.path.join(os.path.dirname(__file__),
                             'vipermonkey_compat.py2'), request.file_path
            ])
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
            stdout, _ = p.communicate()

            # Read output
            if stdout:
                for line in stdout.splitlines():
                    if line.startswith(b"{") and line.endswith(b"}"):
                        try:
                            output_results = json.loads(line)
                        except UnicodeDecodeError:
                            output_results = json.loads(
                                line.decode("utf-8", "replace"))
                        break

                # vmonkey returns None on error and empty lists when no macros
                # are found, so make sure we actually received a dict
                if isinstance(output_results.get('vmonkey_values'), dict):
                    '''
                    Structure of variable "actions" is as follows:
                    [action, parameters, description]
                    action: 'Found Entry Point', 'Execute Command', etc...
                    parameters: Parameters for function
                    description: 'Shell Function', etc...

                    external_functions is a list of built-in VBA functions
                    that were called
                    '''
                    actions = output_results['vmonkey_values']['actions']
                    external_functions = output_results['vmonkey_values'][
                        'external_funcs']
                    tmp_iocs = output_results['vmonkey_values']['tmp_iocs']
                else:
                    vmonkey_err = True
            else:
                vmonkey_err = True

        except Exception:
            self.log.exception(
                f"ViperMonkey failed to analyze file {request.sha256}")
            raise

        # Add vmonkey log as a supplemental file
        if 'stdout' in output_results:
            temp_log_copy = os.path.join(
                tempfile.gettempdir(), f'{request.sid}_vipermonkey_output.log')
            with open(temp_log_copy, "w") as temp_log_file:
                temp_log_file.write(output_results['stdout'])

            self.request.add_supplementary(temp_log_copy,
                                           'vipermonkey_output.log',
                                           'ViperMonkey log output')
            if vmonkey_err is True:
                ResultSection(
                    'ViperMonkey has encountered an error, please check "vipermonkey_output.log"',
                    parent=self.result,
                    heuristic=Heuristic(1))

        if len(actions) > 0:
            # Creating action section
            action_section = ResultSection('Recorded Actions:',
                                           parent=self.result)
            action_section.add_tag('technique.macro', 'Contains VBA Macro(s)')
            for action in actions:  # Creating action sub-sections for each action
                cur_action = action[0]
                cur_description = action[2] if action[2] else cur_action

                # Entry point actions have an empty description field, re-organize result section for this case
                if cur_action == 'Found Entry Point':
                    sub_action_section = ResultSection('Found Entry Point',
                                                       parent=action_section)
                    sub_action_section.add_line(action[1])
                else:
                    # Action's description will be the sub-section name
                    sub_action_section = ResultSection(cur_description,
                                                       parent=action_section)
                    if cur_description == 'Shell function':
                        sub_action_section.set_heuristic(2)

                    # Parameters are sometimes stored as a list, account for this
                    if isinstance(action[1], list):
                        for item in action[1]:
                            # Parameters includes more than strings (booleans for example)
                            if isinstance(item, str):
                                # Check for PowerShell
                                self.extract_powershell(
                                    item, sub_action_section)
                        # Join list items into single string
                        param = ', '.join(str(a) for a in action[1])

                    else:
                        param = action[1]
                        # Parameters includes more than strings (booleans for example)
                        if isinstance(param, str):
                            self.extract_powershell(param, sub_action_section)

                    sub_action_section.add_line(f'Action: {cur_action}')
                    sub_action_section.add_line(f'Parameters: {param}')

                    # If decoded is true, possible base64 string has been found
                    self.check_for_b64(param, sub_action_section)

                    # Add urls/ips found in parameter to respective lists
                    self.find_ip(param)

        # Check tmp_iocs
        res_temp_iocs = ResultSection('Runtime temporary IOCs')
        for ioc in tmp_iocs:
            self.extract_powershell(ioc, res_temp_iocs)
            self.check_for_b64(ioc, res_temp_iocs)
            self.find_ip(ioc)

        if len(res_temp_iocs.subsections) != 0 or res_temp_iocs.body:
            self.result.add_section(res_temp_iocs)

        # Add PowerShell score/tag if found
        if self.found_powershell:
            ResultSection('Discovered PowerShell code in file',
                          parent=self.result,
                          heuristic=Heuristic(3))

        # Add url/ip tags
        self.add_ip_tags()

        # Create section for built-in VBA functions called
        if len(external_functions) > 0:
            vba_builtin_dict = {}
            dict_path = os.path.join(os.path.dirname(__file__),
                                     'VBA_built_ins.txt')
            with open(dict_path, 'r') as f:
                for line in f:
                    line = line.strip()
                    if not line or line.startswith('#'):
                        continue
                    name, _, description = line.partition(';')
                    vba_builtin_dict[name.strip()] = description.strip()

            external_func_section = ResultSection(
                'VBA functions called',
                body_format=BODY_FORMAT.MEMORY_DUMP,
                parent=self.result)
            for func in external_functions:
                if func in vba_builtin_dict:
                    external_func_section.add_line(func + ': ' +
                                                   vba_builtin_dict[func])
                else:
                    external_func_section.add_line(func)
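
# Sketch of the stdout handling above: the ViperMonkey wrapper prints a single
# JSON object on its own line, so take the first {...} line and decode it
# defensively (wrapper behavior is assumed from the loop in execute).
import json

def parse_json_line(stdout: bytes) -> dict:
    for line in stdout.splitlines():
        if line.startswith(b"{") and line.endswith(b"}"):
            try:
                return json.loads(line)
            except UnicodeDecodeError:
                return json.loads(line.decode("utf-8", "replace"))
    return {}
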
    def execute(self, request: ServiceRequest) -> None:
        self.result = Result()
        request.result = self.result

        self.ip_list = []
        self.url_list = []
        self.found_powershell = False
        self.file_hashes = []

        vmonkey_err = False
        actions: List[str] = []
        external_functions: List[str] = []
        tmp_iocs: List[str] = []
        output_results: Dict[str, Any] = {}
        potential_base64: Set[str] = set()

        # Running ViperMonkey
        try:
            file_contents = request.file_contents
            input_file: str = request.file_path
            input_file_obj: Optional[IO] = None
            # Typical start to XML files
            if not file_contents.startswith(
                    b"<?") and request.file_type == "code/xml":
                # Default encoding/decoding if BOM not found
                encoding: Optional[str] = None
                decoding: Optional[str] = None
                # Remove potential BOMs from contents
                if file_contents.startswith(BOM_UTF8):
                    encoding = "utf-8"
                    decoding = "utf-8-sig"
                elif file_contents.startswith(BOM_UTF16):
                    encoding = "utf-16"
                    decoding = "utf-16"
                if encoding and decoding:
                    input_file_obj = tempfile.NamedTemporaryFile(
                        "w+", encoding=encoding)
                    input_file_obj.write(
                        file_contents.decode(decoding, errors="ignore"))
                    input_file = input_file_obj.name
                else:
                    # The type was detected as XML, but the XML is probably buried
                    # within a file that isn't actually XML; give no response as
                    # ViperMonkey can't process this kind of file
                    return
            cmd = " ".join([
                PYTHON2_INTERPRETER,
                os.path.join(os.path.dirname(__file__),
                             "vipermonkey_compat.py2"),
                input_file,
                self.working_directory,
            ])
            p = subprocess.run(cmd, capture_output=True, shell=True)
            stdout = p.stdout

            # Close file
            if input_file_obj and os.path.exists(input_file_obj.name):
                input_file_obj.close()

            # Add artifacts
            artifact_dir = os.path.join(
                self.working_directory,
                os.path.basename(input_file) + "_artifacts")
            if os.path.exists(artifact_dir):
                for file in os.listdir(artifact_dir):
                    try:
                        file_path = os.path.join(artifact_dir, file)
                        if os.path.isfile(file_path) and os.path.getsize(
                                file_path):
                            request.add_extracted(
                                file_path, file,
                                "File extracted by ViperMonkey during analysis"
                            )
                    except OSError as e:
                        self.log.warning(e)

            # Read output
            if stdout:
                for line in stdout.splitlines():
                    if line.startswith(b"{") and line.endswith(b"}"):
                        try:
                            output_results = json.loads(line)
                        except UnicodeDecodeError:
                            output_results = json.loads(
                                line.decode("utf-8", "replace"))
                        break

                # vmonkey returns None on error and empty lists ([][][]) when no
                # macros are found; vmonkey_err can still be set in the empty-list
                # case, so log it as a warning instead of an error
                if isinstance(output_results.get("vmonkey_values"), dict):
                    """
                    Structure of variable "actions" is as follows:
                    [action, parameters, description]
                    action: 'Found Entry Point', 'Execute Command', etc...
                    parameters: Parameters for function
                    description: 'Shell Function', etc...

                    external_functions is a list of built-in VBA functions
                    that were called
                    """
                    actions = output_results["vmonkey_values"]["actions"]
                    external_functions = output_results["vmonkey_values"][
                        "external_funcs"]
                    tmp_iocs = output_results["vmonkey_values"]["tmp_iocs"]
                    if output_results["vmonkey_err"]:
                        vmonkey_err = True
                        self.log.warning(output_results["vmonkey_err"])
                else:
                    vmonkey_err = True
            else:
                vmonkey_err = True

        except Exception:
            self.log.exception(
                f"Vipermonkey failed to analyze file {request.sha256}")

        if actions:
            # Creating action section
            action_section = ResultSection("Recorded Actions:",
                                           parent=self.result)
            action_section.add_tag("technique.macro", "Contains VBA Macro(s)")
            sub_action_sections: Dict[str, ResultSection] = {}
            for action, parameters, description in actions:  # Creating action sub-sections for each action
                if not description:  # For actions with no description, just use the type of action
                    description = action

                if description not in sub_action_sections:
                    # Action's description will be the sub-section name
                    sub_action_section = ResultSection(description,
                                                       parent=action_section)
                    sub_action_sections[description] = sub_action_section
                    if description == "Shell function":
                        sub_action_section.set_heuristic(2)
                else:
                    # Reuse existing section
                    sub_action_section = sub_action_sections[description]
                    if sub_action_section.heuristic:
                        sub_action_section.heuristic.increment_frequency()

                # Parameters are sometimes stored as a list, account for this
                if isinstance(parameters, list):
                    for item in parameters:
                        # Parameters includes more than strings (booleans for example)
                        if isinstance(item, str):
                            # Check for PowerShell
                            self.extract_powershell(item, sub_action_section,
                                                    request)
                    # Join list items into single string
                    param = ", ".join(str(p) for p in parameters)

                else:
                    param = parameters
                    # Parameters includes more than strings (booleans for example)
                    if isinstance(param, str):
                        self.extract_powershell(param, sub_action_section,
                                                request)

                # If the description field was empty, re-organize result section for this case
                if description == action:
                    sub_action_section.add_line(param)
                else:
                    sub_action_section.add_line(
                        f"Action: {action}, Parameters: {param}")

                # Check later for base64
                potential_base64.add(param)

                # Add urls/ips found in parameter to respective lists
                self.find_ip(param)
        # Check tmp_iocs
        res_temp_iocs = ResultSection("Runtime temporary IOCs")
        for ioc in tmp_iocs:
            self.extract_powershell(ioc, res_temp_iocs, request)
            potential_base64.add(ioc)
            self.find_ip(ioc)

        if len(res_temp_iocs.subsections) != 0 or res_temp_iocs.body:
            self.result.add_section(res_temp_iocs)

        # Add PowerShell score/tag if found
        if self.found_powershell:
            ResultSection("Discovered PowerShell code in file",
                          parent=self.result,
                          heuristic=Heuristic(3))

        # Check parameters and temp_iocs for base64
        base64_section = ResultSection("Possible Base64 found",
                                       heuristic=Heuristic(5, frequency=0))
        for param in potential_base64:
            self.check_for_b64(param, base64_section, request,
                               request.file_contents)
        if base64_section.body:
            self.result.add_section(base64_section)

        # Add url/ip tags
        self.add_ip_tags()

        # Create section for built-in VBA functions called
        if len(external_functions) > 0:
            external_func_section = ResultSection(
                "VBA functions called",
                body_format=BODY_FORMAT.MEMORY_DUMP,
                parent=self.result)
            for func in external_functions:
                if func in vba_builtins:
                    external_func_section.add_line(func + ": " +
                                                   vba_builtins[func])
                else:
                    external_func_section.add_line(func)

        # Add vmonkey log as a supplemental file if we have results
        if "stdout" in output_results and (vmonkey_err
                                           or request.result.sections):
            temp_log_copy = os.path.join(
                tempfile.gettempdir(), f"{request.sid}_vipermonkey_output.log")
            with open(temp_log_copy, "w") as temp_log_file:
                temp_log_file.write(output_results["stdout"])

            request.add_supplementary(temp_log_copy, "vipermonkey_output.log",
                                      "ViperMonkey log output")
            if vmonkey_err is True:
                ResultSection(
                    'ViperMonkey has encountered an error, please check "vipermonkey_output.log"',
                    parent=self.result,
                    heuristic=Heuristic(1),
                )
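
# Sketch of the BOM-based codec choice above: pick the encoding for the temp
# file and the decoding for the original contents (BOM constants are stdlib).
from codecs import BOM_UTF8, BOM_UTF16

def pick_codecs(file_contents: bytes):
    if file_contents.startswith(BOM_UTF8):
        return "utf-8", "utf-8-sig"
    if file_contents.startswith(BOM_UTF16):
        return "utf-16", "utf-16"
    return None, None

assert pick_codecs(BOM_UTF8 + b"<?xml") == ("utf-8", "utf-8-sig")
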
    def check_for_b64(self, data: str, section: ResultSection,
                      request: ServiceRequest, file_contents: bytes) -> bool:
        """Search and decode base64 strings in sample data.

        Args:
            data: Data to be parsed
            section: base64 subsection, must have heuristic set
            request: AL request object, used to add extracted files
            file_contents: Original file contents, used to skip base64
                strings that are already present in the sample

        Returns:
            decoded: Boolean which is true if base64 found
        """
        assert section.heuristic

        decoded_param = data
        decoded = False

        encoded_data = data.encode()
        for content, start, end in find_base64(encoded_data):
            if encoded_data[start:end] in file_contents:
                # Present in original file, not an intermediate IoC
                continue
            try:
                # Powershell base64 will be utf-16
                content = content.decode("utf-16").encode()
            except UnicodeDecodeError:
                pass
            try:
                if len(content) < FILE_PARAMETER_SIZE:
                    decoded_param = (decoded_param[:start] + " " +
                                     content.decode(errors="ignore") +
                                     decoded_param[end:])
                else:
                    b64hash = ""
                    pe_files = find_pe_files(content)
                    for pe_file in pe_files:
                        b64hash = hashlib.sha256(pe_file).hexdigest()
                        pe_path = os.path.join(self.working_directory, b64hash)
                        with open(pe_path, "wb") as f:
                            f.write(pe_file)
                        request.add_extracted(
                            pe_path, b64hash,
                            "PE file found in base64 encoded parameter")
                        section.heuristic.add_signature_id("pe_file")
                    if not pe_files:
                        b64hash = hashlib.sha256(content).hexdigest()
                        content_path = os.path.join(self.working_directory,
                                                    b64hash)
                        with open(content_path, "wb") as f:
                            f.write(content)
                        request.add_extracted(
                            content_path, b64hash,
                            "Large base64 encoded parameter")
                        section.heuristic.add_signature_id("possible_file")
                    decoded_param = (decoded_param[:start] +
                                     f"[See extracted file {b64hash}]" +
                                     decoded_param[end:])
                decoded = True
            except Exception:
                pass

        if decoded:
            section.heuristic.increment_frequency()
            section.add_line(
                f"Possible Base64 {truncate(data)} decoded: {decoded_param}")
            self.find_ip(decoded_param)

        return decoded
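
# The splice step above as a standalone helper: swap data[start:end] for its
# decoded form (spans like these come from find_base64 in the service).
def splice(data: str, start: int, end: int, replacement: str) -> str:
    return data[:start] + replacement + data[end:]

assert splice("run UGF5bG9hZA== now", 4, 16, "Payload") == "run Payload now"
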
# Example #12
    def execute(self, request):
        file_path = request.file_path
        result = Result()

        # Report the version of suricata as the service context
        request.set_service_context(
            f"Suricata version: {self.get_suricata_version()}")

        # restart Suricata if we need to
        self.start_suricata_if_necessary()

        # Strip frame headers from the PCAP, since Suricata sometimes has trouble parsing strange PCAPs
        stripped_filepath = self.strip_frame_headers(file_path)

        # Check to make sure the size of the stripped file isn't 0 - this happens on pcapng files
        # TODO: there's probably a better way to do this - don't even strip it if it's pcapng
        if os.stat(stripped_filepath).st_size == 0:
            stripped_filepath = file_path

        # Switch stdout and stderr so we don't get our logs polluted
        mystdout = StringIO()
        old_stdout = sys.stdout
        sys.stdout = mystdout

        mystderr = StringIO()
        old_stderr = sys.stderr
        sys.stderr = mystderr

        # Pass the pcap file to Suricata via the socket
        ret = self.suricata_sc.send_command(
            "pcap-file", {
                "filename": stripped_filepath,
                "output-dir": self.working_directory
            })

        if not ret or ret["return"] != "OK":
            self.log.exception(
                f"Failed to submit PCAP for processing: {ret['message']}")

        # Wait for the socket to finish processing our PCAP
        while True:
            time.sleep(1)
            try:
                ret = self.suricata_sc.send_command("pcap-current")
                if ret and ret["message"] == "None":
                    break
            except ConnectionResetError as e:
                raise RecoverableError(e)

        # Bring back stdout and stderr
        sys.stdout = old_stdout
        sys.stderr = old_stderr
        # NOTE: for now we will ignore content of mystdout and mystderr but we have them just in case...

        alerts, signatures, domains, ips, urls, email_addresses, tls_dict, \
            extracted_files, reverse_lookup = self.parse_suricata_output().values()

        file_extracted_section = ResultSection("File(s) extracted by Suricata")
        # Parse the json results of the service
        if request.get_param("extract_files"):
            for file in extracted_files:
                sha256, filename, extracted_file_path = file.values()
                self.log.info(f"extracted file {filename}")
                try:
                    if request.add_extracted(
                            extracted_file_path,
                            filename,
                            "Extracted by Suricata",
                            safelist_interface=self.api_interface):
                        file_extracted_section.add_line(filename)
                        if filename != sha256:
                            file_extracted_section.add_tag(
                                'file.name.extracted', filename)
                except FileNotFoundError as e:
                    # An intermittent issue, just try again
                    raise RecoverableError(e)
                except MaxExtractedExceeded:
                    # We've hit our limit
                    pass

        # Report a null score to indicate that files were extracted. If no sigs hit, it's not clear
        # where the extracted files came from
        if file_extracted_section.body:
            result.add_section(file_extracted_section)

        # Add tags for the domains, urls, and IPs we've discovered
        root_section = ResultSection("Discovered IOCs", parent=result)
        if domains:
            domain_section = ResultSection("Domains", parent=root_section)
            for domain in domains:
                domain_section.add_line(domain)
                domain_section.add_tag('network.dynamic.domain', domain)
        if ips:
            ip_section = ResultSection("IP Addresses", parent=root_section)
            for ip in ips:
                # Make sure it's not a local IP
                if not (ip.startswith("127.") or ip.startswith("192.168.")
                        or ip.startswith("10.") or
                        (ip.startswith("172.")
                         and 16 <= int(ip.split(".")[1]) <= 31)):
                    ip_section.add_line(ip)
                    ip_section.add_tag('network.dynamic.ip', ip)

        if urls:
            url_section = ResultSection("URLs", parent=root_section)
            for url in urls:
                url_section.add_line(url)
                url_section.add_tag('network.dynamic.uri', url)
        if email_addresses:
            email_section = ResultSection("Email Addresses",
                                          parent=root_section)
            for eml in email_addresses:
                email_section.add_line(eml)
                email_section.add_tag('network.email.address', eml)

        # Map between suricata key names and AL tag types
        tls_mappings = {
            "subject": 'cert.subject',
            "issuerdn": 'cert.issuer',
            "version": 'cert.version',
            "notbefore": 'cert.valid.start',
            "notafter": 'cert.valid.end',
            "fingerprint": 'cert.thumbprint',
            "sni": 'network.tls.sni'
        }

        if tls_dict:
            tls_section = ResultSection("TLS Information",
                                        parent=root_section,
                                        body_format=BODY_FORMAT.JSON)
            kv_body = {}
            for tls_type, tls_values in tls_dict.items():
                if tls_type == "fingerprint":
                    # make sure the cert fingerprint/thumbprint matches other values,
                    # like from PEFile
                    tls_values = [
                        v.replace(":", "").lower() for v in tls_values
                    ]

                if tls_type in tls_mappings:
                    kv_body[tls_type] = tls_values

                    tag_type = tls_mappings[tls_type]
                    if tag_type is not None:
                        for tls_value in tls_values:
                            tls_section.add_tag(tag_type, tls_value)

                elif tls_type == "ja3":
                    kv_body.setdefault('ja3_hash', [])
                    kv_body.setdefault('ja3_string', [])

                    for ja3_entry in tls_values:
                        ja3_hash = ja3_entry.get("hash")
                        ja3_string = ja3_entry.get("string")
                        if ja3_hash:
                            kv_body['ja3_hash'].append(ja3_hash)
                            tls_section.add_tag('network.tls.ja3_hash',
                                                ja3_hash)
                        if ja3_string:
                            kv_body['ja3_string'].append(ja3_string)
                            tls_section.add_tag('network.tls.ja3_string',
                                                ja3_string)

                else:
                    kv_body[tls_type] = tls_values
                    # stick a message in the logs about a new TLS type found in suricata logs
                    self.log.info(
                        f"Found new TLS type {tls_type} with values {tls_values}"
                    )
            tls_section.set_body(json.dumps(kv_body))

        # Create the result sections if there are any hits
        if len(alerts) > 0:
            for signature_id, signature_details in signatures.items():
                signature = signature_details['signature']
                attributes = signature_details['attributes']
                section = ResultSection(f'{signature_id}: {signature}')
                heur_id = 3
                if any(x in signature for x in self.config.get("sure_score")):
                    heur_id = 1
                elif any(x in signature
                         for x in self.config.get("vhigh_score")):
                    heur_id = 2

                section.set_heuristic(heur_id)
                if signature_details['al_signature']:
                    section.add_tag("file.rule.suricata",
                                    signature_details['al_signature'])
                for timestamp, src_ip, src_port, dest_ip, dest_port in alerts[
                        signature_id][:10]:
                    section.add_line(
                        f"{timestamp} {src_ip}:{src_port} -> {dest_ip}:{dest_port}"
                    )
                if len(alerts[signature_id]) > 10:
                    section.add_line(
                        f'And {len(alerts[signature_id]) - 10} more flows')

                # Tag IPs/Domains/URIs associated to signature
                for flow in alerts[signature_id]:
                    dest_ip = flow[3]
                    section.add_tag('network.dynamic.ip', dest_ip)
                    if dest_ip in reverse_lookup:
                        section.add_tag('network.dynamic.domain',
                                        reverse_lookup[dest_ip])
                    for uri in urls:
                        if dest_ip in uri or (reverse_lookup.get(dest_ip) and
                                              reverse_lookup[dest_ip] in uri):
                            section.add_tag('network.dynamic.uri', uri)

                # Add a tag for the signature id and the message
                section.add_tag('network.signature.signature_id',
                                str(signature_id))
                section.add_tag('network.signature.message', signature)
                for attr in attributes:
                    if attr.get('uri'):
                        section.add_tag('network.static.uri', attr['uri'])
                # Tag malware_family
                for malware_family in signature_details['malware_family']:
                    section.add_tag('attribution.family', malware_family)

                result.add_section(section)
                self.ontology.add_result_part(
                    Signature,
                    data=dict(
                        name=signature_details['al_signature'],
                        type="SURICATA",
                        malware_families=signature_details['malware_family']
                        or None,
                        attributes=attributes))

            # Add the original Suricata output as a supplementary file in the result
            request.add_supplementary(
                os.path.join(self.working_directory, 'eve.json'),
                'SuricataEventLog.json', 'json')

        # Add the stats.log to the result, which can be used to determine service success
        if os.path.exists(os.path.join(self.working_directory, 'stats.log')):
            request.add_supplementary(
                os.path.join(self.working_directory, 'stats.log'), 'stats.log',
                'log')

        request.result = result
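
# The manual RFC1918/loopback check above, restated with the stdlib ipaddress
# module as a sketch (the service itself uses the string-prefix tests shown).
import ipaddress

def is_private_ip(ip: str) -> bool:
    try:
        return ipaddress.ip_address(ip).is_private
    except ValueError:
        return False

assert is_private_ip("10.1.2.3") and not is_private_ip("8.8.8.8")
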
    def test_process_ttps(intezer_static_class_instance,
                          dummy_api_interface_class, mocker):
        from intezer_static import ALIntezerApi
        from intezer_sdk.api import IntezerApi
        from intezer_sdk.errors import UnsupportedOnPremiseVersion
        from assemblyline_v4_service.common.result import ResultSection, ResultTableSection, TableRow
        from requests import HTTPError
        mocker.patch.object(intezer_static_class_instance,
                            "get_api_interface",
                            return_value=dummy_api_interface_class)
        intezer_static_class_instance.start()
        parent_res_sec = ResultSection("blah")

        mocker.patch.object(ALIntezerApi, "get_dynamic_ttps", return_value=[])
        intezer_static_class_instance._process_ttps("blah", parent_res_sec)
        assert parent_res_sec.subsections == []

        mocker.patch.object(IntezerApi,
                            "get_dynamic_ttps",
                            side_effect=HTTPError("FORBIDDEN"))
        intezer_static_class_instance._process_ttps("blah", parent_res_sec)
        assert parent_res_sec.subsections == []

        mocker.patch.object(IntezerApi,
                            "get_dynamic_ttps",
                            side_effect=UnsupportedOnPremiseVersion())
        intezer_static_class_instance._process_ttps("blah", parent_res_sec)
        assert parent_res_sec.subsections == []

        mocker.patch.object(ALIntezerApi,
                            "get_dynamic_ttps",
                            return_value=[{
                                "name": "blah",
                                "description": "blah",
                                "data": [],
                                "severity": 1
                            }])
        intezer_static_class_instance._process_ttps("blah", parent_res_sec)
        correct_res_sec = ResultSection("Signature: blah", "blah")
        correct_res_sec.set_heuristic(4)
        correct_res_sec.heuristic.add_signature_id("blah", 10)
        assert check_section_equality(
            parent_res_sec.subsections[0].subsections[0], correct_res_sec)

        parent_res_sec = ResultSection("blah")
        mocker.patch.object(ALIntezerApi,
                            "get_dynamic_ttps",
                            return_value=[{
                                "name": "InjectionInterProcess",
                                "description": "blah",
                                "data": [],
                                "severity": 1
                            }])
        intezer_static_class_instance._process_ttps("blah", parent_res_sec)
        correct_res_sec = ResultSection("Signature: InjectionInterProcess",
                                        "blah")
        correct_res_sec.set_heuristic(7)
        correct_res_sec.heuristic.add_signature_id("InjectionInterProcess", 10)
        correct_res_sec.heuristic.add_attack_id("T1055")
        assert check_section_equality(
            parent_res_sec.subsections[0].subsections[0], correct_res_sec)

        parent_res_sec = ResultSection("blah")
        mocker.patch.object(ALIntezerApi,
                            "get_dynamic_ttps",
                            return_value=[{
                                "name": "enumerates_running_processes",
                                "description": "blah",
                                "data": [{
                                    "wow": "print me!"
                                }],
                                "severity": 1
                            }])
        intezer_static_class_instance._process_ttps("blah", parent_res_sec)
        correct_res_sec = ResultSection(
            "Signature: enumerates_running_processes", "blah")
        correct_res_sec.set_heuristic(8)
        correct_res_sec.heuristic.add_signature_id(
            "enumerates_running_processes", 10)
        correct_res_sec.heuristic.add_attack_id("T1057")
        assert check_section_equality(
            parent_res_sec.subsections[0].subsections[0], correct_res_sec)

        parent_res_sec = ResultSection("blah")
        mocker.patch.object(ALIntezerApi,
                            "get_dynamic_ttps",
                            return_value=[{
                                "name":
                                "blah",
                                "description":
                                "blah",
                                "data": [
                                    {
                                        "IP": "blah 2.2.2.2 blah"
                                    },
                                ],
                                "severity":
                                1
                            }])
        intezer_static_class_instance._process_ttps("blah", parent_res_sec)
        correct_res_sec = ResultSection("Signature: blah", "blah")
        correct_res_sec.add_line("\tIP: blah 2.2.2.2 blah")
        correct_res_sec.set_heuristic(4)
        correct_res_sec.heuristic.add_signature_id("blah", 10)
        correct_ioc_res_sec = ResultTableSection(
            "IOCs found in signature marks")
        correct_ioc_res_sec.add_row(TableRow(ioc_type="ip", ioc="2.2.2.2"))
        correct_ioc_res_sec.add_tag("network.dynamic.ip", "2.2.2.2")
        correct_res_sec.add_subsection(correct_ioc_res_sec)
        assert check_section_equality(
            parent_res_sec.subsections[0].subsections[0], correct_res_sec)
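
# The assertions above rely on a check_section_equality helper defined
# elsewhere in the test suite. A minimal sketch of what such a helper might
# look like (hypothetical; the real helper may compare more fields):
def check_section_equality(this, that) -> bool:
    # Compare the fields the tests above rely on: title, body, heuristic
    # (ID, signatures, ATT&CK IDs) and subsections, recursively.
    if this.title_text != that.title_text or this.body != that.body:
        return False
    if (this.heuristic is None) != (that.heuristic is None):
        return False
    if this.heuristic is not None and (
            this.heuristic.heur_id != that.heuristic.heur_id
            or this.heuristic.signatures != that.heuristic.signatures
            or this.heuristic.attack_ids != that.heuristic.attack_ids):
        return False
    if len(this.subsections) != len(that.subsections):
        return False
    return all(check_section_equality(a, b)
               for a, b in zip(this.subsections, that.subsections))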
Example #14
    def execute(self, request: ServiceRequest) -> None:
        # --- Setup ----------------------------------------------------------------------------------------------
        request.result = Result()
        patterns = PatternMatch()

        if request.deep_scan:
            max_attempts = 100
        else:
            max_attempts = 10

        self.files_extracted = set()
        self.hashes = set()

        # --- Pre-Processing --------------------------------------------------------------------------------------
        # Get all IOCs prior to de-obfuscation
        pat_values = patterns.ioc_match(request.file_contents,
                                        bogon_ip=True,
                                        just_network=False)
        if pat_values and request.get_param('extract_original_iocs'):
            ioc_res = ResultSection(
                "The following IOCs were found in the original file",
                parent=request.result,
                body_format=BODY_FORMAT.MEMORY_DUMP)
            for k, val in pat_values.items():
                for v in val:
                    if ioc_res:
                        ioc_res.add_line(
                            f"Found {k.upper().replace('.', ' ')}: {safe_str(v)}"
                        )
                        ioc_res.add_tag(k, v)

        # --- Prepare Techniques ----------------------------------------------------------------------------------
        techniques = [
            ('MSOffice Embedded script', self.msoffice_embedded_script_string),
            ('CHR and CHRB decode', self.chr_decode),
            ('String replace', self.string_replace),
            ('Powershell carets', self.powershell_carets),
            ('Array of strings', self.array_of_strings),
            ('Fake array vars', self.vars_of_fake_arrays),
            ('Reverse strings', self.str_reverse),
            ('B64 Decode', self.b64decode_str),
            ('Simple XOR function', self.simple_xor_function),
        ]
        second_pass = [('Concat strings', self.concat_strings),
                       ('MSWord macro vars', self.mswordmacro_vars),
                       ('Powershell vars', self.powershell_vars),
                       ('Charcode hex', self.charcode_hex)]
        final_pass = [
            ('Charcode', self.charcode),
        ]

        code_extracts = [('.*html.*', "HTML scripts extraction",
                          self.extract_htmlscript)]

        layers_list: List[Tuple[str, bytes]] = []
        layer = request.file_contents

        # --- Stage 1: Script Extraction --------------------------------------------------------------------------
        for pattern, name, func in code_extracts:
            if regex.match(pattern, request.task.file_type):
                extracted_parts = func(request.file_contents)
                layer = b"\n".join(extracted_parts).strip()
                layers_list.append((name, layer))
                break

        # --- Stage 2: Deobfuscation ------------------------------------------------------------------------------
        idx = 0
        first_pass_len = len(techniques)
        layers_count = len(layers_list)
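        # Strategy: run every first-pass technique on the current layer in
        # parallel and record any output as a new layer. If a whole pass adds
        # nothing, promote the second-pass techniques to the front of the list
        # and keep going; once even the extended list stalls (or max_attempts
        # is exceeded), run the final-pass techniques once and stop.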
        while True:
            if idx > max_attempts:
                final_pass.extend(techniques)
                for name, technique in final_pass:
                    res = technique(layer)
                    if res:
                        layers_list.append((name, res))
                break
            with ThreadPoolExecutor() as executor:
                threads = [
                    executor.submit(technique, layer)
                    for name, technique in techniques
                ]
                results = [thread.result() for thread in threads]
                for i in range(len(results)):
                    result = results[i]
                    if result:
                        layers_list.append((techniques[i][0], result))
                        # Looks like it worked, restart with new layer
                        layer = result
            # If the layers haven't changed in a pass, break
            if layers_count == len(layers_list):
                if len(techniques) != first_pass_len:
                    final_pass.extend(techniques)
                    with ThreadPoolExecutor() as executor:
                        threads = [
                            executor.submit(technique, layer)
                            for name, technique in final_pass
                        ]
                        results = [thread.result() for thread in threads]
                        for i in range(len(results)):
                            result = results[i]
                            if result:
                                layers_list.append((final_pass[i][0], result))
                    break
                for x in second_pass:
                    techniques.insert(0, x)
            layers_count = len(layers_list)
            idx += 1

        # --- Compiling results ----------------------------------------------------------------------------------
        if len(layers_list) > 0:
            extract_file = False
            num_layers = len(layers_list)

            # Compute heuristic
            if num_layers < 5:
                heur_id = 1
            elif num_layers < 10:
                heur_id = 2
            elif num_layers < 50:
                heur_id = 3
            elif num_layers < 100:
                heur_id = 4
            else:  # num_layers >= 100
                heur_id = 5

            # Cleanup final layer
            clean = self.clean_up_final_layer(layers_list[-1][1])
            if clean != request.file_contents:
                # Check for new IOCs
                pat_values = patterns.ioc_match(clean,
                                                bogon_ip=True,
                                                just_network=False)
                diff_tags: Dict[str, List[bytes]] = {}

                for uri in pat_values.get('network.static.uri', []):
                    # Compare URIs without query string
                    uri = uri.split(b'?', 1)[0]
                    if uri not in request.file_contents:
                        diff_tags.setdefault('network.static.uri', [])
                        diff_tags['network.static.uri'].append(uri)

                if request.deep_scan or (len(clean) > 1000
                                         and heur_id >= 4) or diff_tags:
                    extract_file = True

                # Display obfuscation steps
                mres = ResultSection(
                    "De-obfuscation steps taken by DeobsfuScripter",
                    parent=request.result)
                if heur_id:
                    mres.set_heuristic(heur_id)

                lcount = Counter([x[0] for x in layers_list])
                for l, c in lcount.items():
                    mres.add_line(f"{l}, {c} time(s).")

                # Display final layer
                byte_count = 5000
                if extract_file:
                    # Save extracted file
                    byte_count = 500
                    file_name = f"{os.path.basename(request.file_name)}_decoded_final"
                    file_path = os.path.join(self.working_directory, file_name)
                    # Ensure directory exists before write
                    os.makedirs(os.path.dirname(file_path), exist_ok=True)
                    with open(file_path, 'wb+') as f:
                        f.write(clean)
                        self.log.debug(
                            f"Submitted dropped file for analysis: {file_path}"
                        )
                    request.add_extracted(file_path, file_name,
                                          "Final deobfuscation layer")

                ResultSection(f"First {byte_count} bytes of the final layer:",
                              body=safe_str(clean[:byte_count]),
                              body_format=BODY_FORMAT.MEMORY_DUMP,
                              parent=request.result)

                # Display new IOCs from final layer
                if len(diff_tags) > 0:
                    ioc_new = ResultSection(
                        "New IOCs found after de-obfustcation",
                        parent=request.result,
                        body_format=BODY_FORMAT.MEMORY_DUMP)
                    has_network_heur = False
                    for ty, val in diff_tags.items():
                        for v in val:
                            if "network" in ty:
                                has_network_heur = True
                            ioc_new.add_line(
                                f"Found {ty.upper().replace('.', ' ')}: {safe_str(v)}"
                            )
                            ioc_new.add_tag(ty, v)

                    if has_network_heur:
                        ioc_new.set_heuristic(7)
                    else:
                        ioc_new.set_heuristic(6)

                if len(self.files_extracted) > 0:
                    ext_file_res = ResultSection(
                        "The following files were extracted during the deobfuscation",
                        heuristic=Heuristic(8),
                        parent=request.result)
                    for extracted in self.files_extracted:
                        file_name = os.path.basename(extracted)
                        ext_file_res.add_line(file_name)
                        request.add_extracted(
                            extracted, file_name,
                            "File of interest deobfuscated from sample")
Example #15
    def execute(self, request):
        request.result = Result()
        request.set_service_context(self.get_tool_version())
        temp_filename = request.file_path
        filename = os.path.basename(temp_filename)
        extract_dir = os.path.join(self.working_directory, f"{filename}_extracted")
        decompiled_dir = os.path.join(self.working_directory, f"{filename}_decompiled")
        file_res = request.result
        new_files = []
        supplementary_files = []
        imp_res_list = []
        res_list = []

        if request.file_type == "java/jar":
            self.decompile_jar(temp_filename, decompiled_dir)
            if self.jar_extract(temp_filename, extract_dir):
                # Analysis properties
                self.classloader_found = 0
                self.security_found = 0
                self.url_found = 0
                self.runtime_found = 0
                self.applet_found = 0

                self.manifest_tags = []
                self.signature_block_certs = []

                def analyze_file(root, cf, file_res, imp_res_list, supplementary_files, decompiled_dir, extract_dir):
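                    # Worker submitted to the thread pool below: sniff the
                    # first bytes of each archive member to flag embedded
                    # executables (MZ header) and launch-able file types, and
                    # hand .class files to the class file analyser.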
                    cur_file_path = os.path.join(root.decode('utf-8'), cf.decode('utf-8'))
                    with open(cur_file_path, "rb") as cur_file:
                        start_bytes = cur_file.read(24)

                        ##############################
                        # Executables in JAR
                        ##############################
                        cur_ext = os.path.splitext(cf)[1][1:].upper()
                        if start_bytes[:2] == b"MZ":
                            mz_res = dict(
                                title_text=f"Embedded executable file found: {cf} "
                                "There may be a malicious intent.",
                                heur_id=1,
                                tags=[('file.behavior', "Embedded PE")],
                                score_condition=APPLET_MZ,
                            )
                            imp_res_list.append(mz_res)

                        ##############################
                        # Launchable in JAR
                        ##############################
                        elif cur_ext in G_LAUNCHABLE_EXTENSIONS:
                            l_res = dict(
                                title_text=f"Launch-able file type found: {cf}"
                                "There may be a malicious intent.",
                                heur_id=2,
                                tags=[('file.behavior', "Launch-able file in JAR")],
                                score_condition=APPLET_MZ,
                            )
                            imp_res_list.append(l_res)

                        if cur_file_path.upper().endswith('.CLASS'):
                            self.analyse_class_file(file_res, cf, cur_file, cur_file_path,
                                                    start_bytes, imp_res_list, supplementary_files,
                                                    decompiled_dir, extract_dir)

                for root, _, files in os.walk(extract_dir.encode('utf-8')):
                    logging.info(f"Extracted: {root} - {files}")

                    # if the META-INF folder is encountered
                    if root.upper().endswith(b'META-INF'):  # only top level meta
                        self.analyse_meta_information(file_res, root, supplementary_files, extract_dir)
                        continue

                    with ThreadPoolExecutor() as executor:
                        for cf in files:
                            executor.submit(analyze_file, root, cf, file_res, imp_res_list,
                                            supplementary_files, decompiled_dir, extract_dir)

                res = ResultSection("Analysis of the JAR file")

                res_meta = ResultSection("[Meta Information]", parent=res)
                if len(self.manifest_tags) > 0:
                    res_manifest = ResultSection("Manifest File Information Extract",
                                                 parent=res_meta)
                    for tag, val in self.manifest_tags:
                        res_manifest.add_tag(tag, val)

                for res_cert in self.signature_block_certs:
                    res_meta.add_subsection(res_cert)

                if self.runtime_found > 0 \
                        or self.applet_found > 0 \
                        or self.classloader_found > 0 \
                        or self.security_found > 0 \
                        or self.url_found > 0:
                    res.add_line("All suspicious class files were saved as supplementary files.")

                res_class = ResultSection("[Suspicious classes]", parent=res)

                if self.runtime_found > 0:
                    ResultSection("Runtime Found",
                                  body=f"java/lang/Runtime: {self.runtime_found}",
                                  heuristic=Heuristic(10),
                                  parent=res_class)

                if self.applet_found > 0:
                    ResultSection("Applet Found",
                                  body=f"java/applet/Applet: {self.applet_found}",
                                  heuristic=Heuristic(6),
                                  parent=res_class)

                if self.classloader_found > 0:
                    ResultSection("Classloader Found",
                                  body=f"java/lang/ClassLoader: {self.classloader_found}",
                                  heuristic=Heuristic(7),
                                  parent=res_class)

                if self.security_found > 0:
                    ResultSection("Security Found",
                                  body=f"java/security/*: {self.security_found}",
                                  heuristic=Heuristic(8),
                                  parent=res_class)

                if self.url_found > 0:
                    ResultSection("URL Found",
                                  body=f"java/net/URL: {self.url_found}",
                                  heuristic=Heuristic(9),
                                  parent=res_class)

                res_list.append(res)

        # Add results if any
        self.recurse_add_res(file_res, imp_res_list, new_files)
        for res in res_list:
            file_res.add_section(res)

        # Submit embedded files
        if len(new_files) > 0:
            new_files = sorted(list(set(new_files)))
            txt = f"Extracted from 'JAR' file {filename}"
            for embed in new_files:
                request.add_extracted(embed, embed.replace(extract_dir + "/", "").replace(decompiled_dir + "/", ""),
                                      txt, safelist_interface=self.api_interface)

        if len(supplementary_files) > 0:
            supplementary_files = sorted(list(set(supplementary_files)))
            for path, name, desc in supplementary_files:
                request.add_supplementary(path, name, desc)
    def execute(self, request):
        parser = eml_parser.eml_parser.EmlParser(include_raw_body=True,
                                                 include_attachment_data=True)

        # Validate URLs in sample, strip out [] if found
        content_str = request.file_contents.decode(errors="ignore")
        content_str, retry = self.validate_urls(content_str)
        while retry:
            content_str, retry = self.validate_urls(content_str)
        parsed_eml = parser.decode_email_bytes(content_str.encode())

        result = Result()
        header = parsed_eml['header']

        if "from" in header:
            all_uri = set()

            for body_counter, body in enumerate(parsed_eml['body']):
                if request.get_param('extract_body_text'):
                    fd, path = mkstemp()
                    with os.fdopen(fd, 'w') as f:
                        f.write(body['content'])
                    request.add_extracted(path, "body_" + str(body_counter),
                                          "Body text")
                if "uri" in body:
                    for uri in body['uri']:
                        all_uri.add(uri)

            kv_section = ResultSection('Email Headers',
                                       body_format=BODY_FORMAT.KEY_VALUE,
                                       parent=result)

            # Basic tags
            kv_section.add_tag("network.email.address", header['from'].strip())
            for to in header['to']:
                kv_section.add_tag("network.email.address", to)
            kv_section.add_tag("network.email.date",
                               str(header['date']).strip())
            kv_section.add_tag("network.email.subject",
                               header['subject'].strip())

            # Add CCs to body and tags
            if 'cc' in header:
                for cc in header['cc']:
                    kv_section.add_tag("network.email.address", cc.strip())

            # Add Message ID to body and tags
            if 'message-id' in header['header']:
                kv_section.add_tag("network.email.msg_id",
                                   header['header']['message-id'][0].strip())

            # Add Tags for received IPs
            if 'received_ip' in header:
                for ip in header['received_ip']:
                    kv_section.add_tag('network.static.ip', ip.strip())

            # Add Tags for received Domains
            if 'received_domain' in header:
                for dom in header['received_domain']:
                    kv_section.add_tag('network.static.domain', dom.strip())

            # If we've found URIs, add them to a section
            if len(all_uri) > 0:
                uri_section = ResultSection('URIs Found:', parent=result)
                for uri in all_uri:
                    uri_section.add_line(uri)
                    uri_section.add_tag('network.static.uri', uri.strip())
                    parsed_url = urlparse(uri)
                    if parsed_url.hostname and re.match(
                            IP_ONLY_REGEX, parsed_url.hostname):
                        uri_section.add_tag('network.static.ip',
                                            parsed_url.hostname)
                    elif parsed_url.hostname:
                        uri_section.add_tag('network.static.domain',
                                            parsed_url.hostname)

            # Bring all headers together...
            extra_header = header.pop('header', {})
            header.pop('received', None)
            header.update(extra_header)

            kv_section.body = json.dumps(header, default=self.json_serial)

            if "attachment" in parsed_eml:
                for attachment in parsed_eml['attachment']:
                    fd, path = mkstemp()

                    with os.fdopen(fd, 'wb') as f:
                        f.write(base64.b64decode(attachment['raw']))
                    request.add_extracted(path, attachment['filename'],
                                          "Attachment")
                ResultSection('Extracted Attachments:',
                              body="\n".join([
                                  x['filename']
                                  for x in parsed_eml['attachment']
                              ]),
                              parent=result)

            if request.get_param('save_emlparser_output'):
                fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
                with os.fdopen(fd, "w") as myfile:
                    myfile.write(
                        json.dumps(parsed_eml, default=self.json_serial))
                request.add_supplementary(
                    temp_path, "parsing.json",
                    "These are the raw results of running GOVCERT-LU's eml_parser"
                )
        else:
            text_section = ResultSection('EML parsing results')
            text_section.add_line("Could not parse EML")
            result.add_section(text_section)

        request.result = result
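
# The validate_urls loop above is re-run until the content stops changing,
# stripping bracketed/defanged URLs before parsing. A minimal sketch of such
# a helper, assuming it only needs to undo [.]-style defanging (hypothetical;
# the real implementation may handle more cases):
import re

def validate_urls(content: str):
    cleaned = re.sub(r'\[\.\]', '.', content)
    # Tell the caller whether another pass is needed
    return cleaned, cleaned != content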
Example #17
    def execute(self, request):
        """Main Module. See README for details."""
        if not self.rules:
            return

        request.set_service_context(
            f"{self.name} version: {self.get_tool_version()}")

        self.deep_scan = request.task.deep_scan
        local_filename = request.file_path
        tags = {
            f"al_{k.replace('.', '_')}": i
            for k, i in request.task.tags.items()
        }

        yara_externals = {}
        for k in self.yara_externals:
            # Check default request.task fields
            sval = request.task.__dict__.get(k, None)

            # if not sval:
            #     # Check metadata dictionary
            #     sval = request.task.metadata.get(k, None)

            if not sval:
                # Check params dictionary
                sval = request.task.service_config.get(k, None)

            if not sval:
                # Check tags list
                val_list = tags.get(k, None)
                if val_list:
                    sval = " | ".join(val_list)

            if not sval:
                # Check temp submission data
                sval = request.task.temp_submission_data.get(k, None)

            # Normalize unicode with safe_str and make sure everything else is a string
            if sval:
                yara_externals[k] = safe_str(sval)

        with self.initialization_lock:
            try:
                matches = self.rules.match(local_filename,
                                           externals=yara_externals)
                request.result = self._extract_result_from_matches(matches)
            except Exception as e:
                # Internal error 30 == exceeded max string matches on rule
                if "internal error: 30" not in str(e):
                    raise
                else:
                    try:
                        # Fast mode == Yara skips strings already found
                        matches = self.rules.match(local_filename,
                                                   externals=yara_externals,
                                                   fast=True)
                        result = self._extract_result_from_matches(matches)
                        section = ResultSection("Service Warnings",
                                                parent=result)
                        section.add_line(
                            "Too many matches detected with current ruleset. "
                            f"{self.name} forced to scan in fast mode.")
                        request.result = result
                    except Exception:
                        self.log.warning(
                            f"YARA internal error 30 detected on submission {request.task.sid}"
                        )
                        result = Result()
                        section = ResultSection(
                            f"{self.name} scan not completed.", parent=result)
                        section.add_line(
                            "File returned too many matches with current rule set and YARA exited."
                        )
                        request.result = result
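
# A minimal sketch of the yara-python calls this snippet builds up to; the
# rule path and external names here are illustrative only:
import yara

rules = yara.compile(filepath="rules.yar",
                     externals={"al_file_type": "", "submitter": ""})
matches = rules.match("/tmp/sample.bin",
                      externals={"al_file_type": "document/pdf"})
for match in matches:
    print(match.rule, match.tags, match.meta)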
    def execute(self, request):
        # ==================================================================
        # Execute a request:
        #   Every time your service receives a new file to scan, the execute function is called
        #   This is where you should execute your processing code.
        #   For the purpose of this example, we will only generate results ...

        # You should run your code here...

        # ==================================================================
        # Check if we're scanning an embedded file
        #   This service always drops 3 embedded files, two of which generate random results and the other empty results
        #   We're making a check to see if we're scanning the embedded file.
        #   In a normal service this is not something you would do at all but since we are using this
        #   service in our unit test to test all features of our report generator, we have to do this
        if request.sha256 not in ['d729ecfb2cf40bc4af8038dac609a57f57dbe6515d35357af973677d5e66417a',
                                  '5ce5ae8ef56a54af2c44415800a81ecffd49a33ae8895dfe38fc1075d3f619ec',
                                  'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06']:
            # Main file results...

            # ==================================================================
            # Write the results:
            #   First, create a result object where all the result sections will be saved to
            result = Result()

            # ==================================================================
            # Standard text section: BODY_FORMAT.TEXT - DEFAULT
            #   Text sections basically just dump the text to the screen...
            #     All sections scores will be SUMed in the service result
            #     The Result classification will be the highest classification found in the sections
            text_section = ResultSection('Example of a default section')
            # You can add lines to your section one at a time
            #   Here we will generate a random line
            text_section.add_line(get_random_phrase())
            # Or you can add them from a list
            #   Here we will generate a random number of random lines
            text_section.add_lines([get_random_phrase() for _ in range(random.randint(1, 5))])
            # If the section needs to affect the score of the file you need to set a heuristic
            #     In addition to adding a heuristic, we will associate a signature with the heuristic;
            #     we're doing this by adding the signature name to the heuristic.
            text_section.set_heuristic(3, signature="sig_one")
            # You can attach attack ids to heuristics after they were defined
            text_section.heuristic.add_attack_id("T1066")
            # Same thing for the signatures: they can be added to the heuristic after the fact and you can even say how
            #   many times the signature fired by setting its frequency. If you call add_signature_id twice with the
            #   same signature, this will effectively increase the frequency of the signature.
            text_section.heuristic.add_signature_id("sig_two", score=20, frequency=2)
            text_section.heuristic.add_signature_id("sig_two", score=20, frequency=3)
            text_section.heuristic.add_signature_id("sig_three")
            text_section.heuristic.add_signature_id("sig_three")
            text_section.heuristic.add_signature_id("sig_four", score=0)
            # The heuristic for text_section should have the following properties
            #   1. 1 attack ID: T1066
            #   2. 4 signatures: sig_one, sig_two, sig_three and sig_four
            #   3. Signature frequencies are cumulative, therefore they will be as follows:
            #      - sig_one = 1
            #      - sig_two = 5
            #      - sig_three = 2
            #      - sig_four = 1
            #   4. The score used by each heuristic is driven by the following rules: signature_score_map has the
            #      highest priority, then the score value passed to add_signature_id, and finally the default
            #      heuristic score. Therefore the scores used to calculate the total score for the text_section are
            #      as follows:
            #      - sig_one: 10    -> heuristic default score
            #      - sig_two: 20    -> score provided by the function add_signature_id
            #      - sig_three: 30  -> score provided by the heuristic map
            #      - sig_four: 40   -> score provided by the heuristic map because it's higher priority than the
            #                          function score
            #    5. Total section score is then: 1x10 + 5x20 + 2x30 + 1x40 = 210
            # Make sure you add your section to the result
            result.add_section(text_section)

            # ==================================================================
            # Color map Section: BODY_FORMAT.GRAPH_DATA
            #     Creates a color map bar using a minimum and maximum domain
            #     e.g. We are using this section to display the entropy distribution in some services
            cmap_min = 0
            cmap_max = 20
            color_map_data = {
                'type': 'colormap',
                'data': {
                    'domain': [cmap_min, cmap_max],
                    'values': [random.random() * cmap_max for _ in range(50)]
                }
            }
            # The classification of a section can be set to any valid classification for your system
            section_color_map = ResultSection("Example of colormap result section", body_format=BODY_FORMAT.GRAPH_DATA,
                                              body=json.dumps(color_map_data), classification=cl_engine.RESTRICTED)
            result.add_section(section_color_map)

            # ==================================================================
            # URL section: BODY_FORMAT.URL
            #   Generate a list of clickable urls using a json encoded format
            #     As you can see here, the body of the section can be set directly instead of line by line
            random_host = get_random_host()
            url_section = ResultSection('Example of a simple url section', body_format=BODY_FORMAT.URL,
                                        body=json.dumps({"name": "Random url!", "url": f"https://{random_host}/"}))

            # Since urls are very important features we can tag those features in the system so they are easy to find
            #   Tags are defined by a type and a value
            url_section.add_tag("network.static.domain", random_host)

            # You may also want to provide a list of urls!
            #   Also, no need to provide a name; the url link will be displayed
            host1 = get_random_host()
            host2 = get_random_host()
            ip1 = get_random_ip()
            ip2 = get_random_ip()
            ip3 = get_random_ip()
            urls = [
                {"url": f"https://{host1}/"},
                {"url": f"https://{host2}/"},
                {"url": f"https://{ip1}/"},
                {"url": f"https://{ip2}/"},
                {"url": f"https://{ip3}/"}]

            # A heuristic can fire more than once without being associated to a signature
            url_heuristic = Heuristic(4, frequency=len(urls))

            url_sub_section = ResultSection('Example of a url section with multiple links',
                                            body=json.dumps(urls), body_format=BODY_FORMAT.URL,
                                            heuristic=url_heuristic)
            url_sub_section.add_tag("network.static.ip", ip1)
            url_sub_section.add_tag("network.static.ip", ip2)
            url_sub_section.add_tag("network.static.ip", ip3)
            url_sub_section.add_tag("network.static.domain", host1)
            url_sub_section.add_tag("network.dynamic.domain", host2)
            # Since url_sub_section is a sub-section of url_section
            # we will add it as a sub-section of url_section not to the main result itself
            url_section.add_subsection(url_sub_section)
            result.add_section(url_section)

            # ==================================================================
            # Memory dump section: BODY_FORMAT.MEMORY_DUMP
            #     Dump whatever string content you have into a <pre/> html tag so you can do your own formatting
            data = hexdump(b"This is some random text that we will format as an hexdump and you'll see "
                           b"that the hexdump formatting will be preserved by the memory dump section!")
            memdump_section = ResultSection('Example of a memory dump section', body_format=BODY_FORMAT.MEMORY_DUMP,
                                            body=data)
            memdump_section.set_heuristic(random.randint(1, 4))
            result.add_section(memdump_section)

            # ==================================================================
            # KEY_VALUE section:
            #     This section allows the service writer to list a bunch of key/value pairs to be displayed in the UI
            #     while also providing easy-to-parse data for automated tools.
            #     NB: You should definitely use this over a JSON body type since this one will be displayed correctly
            #         in the UI for the user
            #     The body argument must be a json dumps of a dictionary (only str, int, and booleans are allowed)
            kv_body = {
                "a_str": "Some string",
                "a_bool": False,
                "an_int": 102,
            }
            kv_section = ResultSection('Example of a KEY_VALUE section', body_format=BODY_FORMAT.KEY_VALUE,
                                       body=json.dumps(kv_body))
            result.add_section(kv_section)

            # ==================================================================
            # JSON section:
            #     Re-use the JSON editor we use for administration (https://github.com/josdejong/jsoneditor)
            #     to display a tree view of JSON results.
            #     NB: Use this sparingly! As a service developer you should do your best to include important
            #     results as their own result sections.
            #     The body argument must be a json dump of a python dictionary
            json_body = {
                "a_str": "Some string",
                "a_list": ["a", "b", "c"],
                "a_bool": False,
                "an_int": 102,
                "a_dict": {
                    "list_of_dict": [
                        {"d1_key": "val", "d1_key2": "val2"},
                        {"d2_key": "val", "d2_key2": "val2"}
                    ],
                    "bool": True
                }
            }
            json_section = ResultSection('Example of a JSON section', body_format=BODY_FORMAT.JSON,
                                         body=json.dumps(json_body))
            result.add_section(json_section)

            # ==================================================================
            # PROCESS_TREE section:
            #     This section allows the service writer to list a bunch of dictionary objects that have nested lists
            #     of dictionaries to be displayed in the UI. Each dictionary object represents a process, and therefore
            #     each dictionary must be of the following format:
            #     {
            #       "process_pid": int,
            #       "process_name": str,
            #       "command_line": str,
            #       "children": [] NB: This list either is empty or contains more dictionaries that have the same
            #                          structure
            #     }
            nc_body = [
                {
                    "process_pid": 123,
                    "process_name": "evil.exe",
                    "command_line": "C:\\evil.exe",
                    "signatures": {},
                    "children": [
                        {
                            "process_pid": 321,
                            "process_name": "takeovercomputer.exe",
                            "command_line": "C:\\Temp\\takeovercomputer.exe -f do_bad_stuff",
                            "signatures": {"one":250},
                            "children": [
                                {
                                    "process_pid": 456,
                                    "process_name": "evenworsethanbefore.exe",
                                    "command_line": "C:\\Temp\\evenworsethanbefore.exe -f change_reg_key_cuz_im_bad",
                                    "signatures": {"one":10, "two":10, "three":10},
                                    "children": []
                                },
                                {
                                    "process_pid": 234,
                                    "process_name": "badfile.exe",
                                    "command_line": "C:\\badfile.exe -k nothing_to_see_here",
                                    "signatures": {"one":1000, "two":10, "three":10, "four":10, "five":10},
                                    "children": []
                                }
                            ]
                        },
                        {
                            "process_pid": 345,
                            "process_name": "benignexe.exe",
                            "command_line": "C:\\benignexe.exe -f \"just kidding, i'm evil\"",
                            "signatures": {"one": 2000},
                            "children": []
                        }
                    ]
                },
                {
                    "process_pid": 987,
                    "process_name": "runzeroday.exe",
                    "command_line": "C:\\runzeroday.exe -f insert_bad_spelling",
                    "signatures": {},
                    "children": []
                }
            ]
            nc_section = ResultSection('Example of a PROCESS_TREE section',
                                       body_format=BODY_FORMAT.PROCESS_TREE,
                                       body=json.dumps(nc_body))
            result.add_section(nc_section)
            
            # ==================================================================
            # TABLE section:
            #     This section allows the service writer to have their content displayed in a table format in the UI
            #     The body argument must be a list [] of dict {} objects. A dict object can have a key value pair
            #     where the value is a flat nested dictionary, and this nested dictionary will be displayed as a nested
            #     table within a cell.
            table_body = [
                {
                    "a_str": "Some string1",
                    "extra_column_here": "confirmed",
                    "a_bool": False,
                    "an_int": 101,
                },
                {
                    "a_str": "Some string2",
                    "a_bool": True,
                    "an_int": 102,
                },
                {
                    "a_str": "Some string3",
                    "a_bool": False,
                    "an_int": 103,
                },
                {
                    "a_str": "Some string4",
                    "a_bool": None,
                    "an_int": -1000000000000000000,
                    "extra_column_there": "confirmed",
                    "nested_table": {
                        "a_str": "Some string3",
                        "a_bool": False,
                        "nested_table_thats_too_deep": {
                            "a_str": "Some string3",
                            "a_bool": False,
                            "an_int": 103,
                        },
                    },
                },
            ]
            table_section = ResultSection('Example of a TABLE section',
                                          body_format=BODY_FORMAT.TABLE,
                                          body=json.dumps(table_body))
            result.add_section(table_section)

            # ==================================================================
            # Re-Submitting files to the system
            #     Adding extracted files will have them resubmitted to the system for analysis

            # This file will generate random results on the next run
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "wb") as myfile:
                myfile.write(data.encode())
            request.add_extracted(temp_path, "file.txt", "Extracted by some magic!")

            # Embedded files can also have their own classification!
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "wb") as myfile:
                myfile.write(b"CLASSIFIED!!!__"+data.encode())
            request.add_extracted(temp_path, "classified.doc", "Classified file ... don't look",
                                  classification=cl_engine.RESTRICTED)

            # This file will generate empty results on the next run
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "wb") as myfile:
                myfile.write(b"EMPTY")
            request.add_extracted(temp_path, "empty.txt", "Extracted empty resulting file")

            # ==================================================================
            # Supplementary files
            #     Adding supplementary files will save them on the datastore for future
            #      reference but wont reprocess those files.
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "w") as myfile:
                myfile.write(json.dumps(urls))
            request.add_supplementary(temp_path, "urls.json", "These are urls as a JSON file")
            # Like embedded files, you can add more than one supplementary file
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "w") as myfile:
                myfile.write(json.dumps(json_body))
            request.add_supplementary(temp_path, "json_body.json", "This is the json_body as a JSON file")

            # ==================================================================
            # Wrap-up:
            #     Save your result object back into the request
            request.result = result

        # ==================================================================
        # Empty results file
        elif request.sha256 == 'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06':
            # Creating an empty result object
            request.result = Result()

        # ==================================================================
        # Randomized results file
        else:
            # For the randomized results file, we will completely randomize the results
            #   The content of those results do not matter since we've already showed you
            #   all the different result sections, tagging, heuristics and file upload functions
            embedded_result = Result()

            # random number of sections
            for _ in range(1, 3):
                embedded_result.add_section(self._create_random_section())

            request.result = embedded_result
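
# _create_random_section is defined elsewhere in this sample service; a
# minimal sketch of what it could look like (hypothetical):
def _create_random_section(self):
    section = ResultSection(get_random_phrase())
    section.add_lines([get_random_phrase() for _ in range(random.randint(1, 5))])
    if random.random() > 0.5:
        section.set_heuristic(random.randint(1, 4))
    return section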
Example #19
    def execute(self, request):
        # ==================================================================
        # Execute a request:
        #   Every time your service receives a new file to scan, the execute function is called
        #   This is where you should execute your processing code.
        #   For the purpose of this example, we will only generate results ...

        # You should run your code here...

        # ==================================================================
        # Check if we're scanning an embedded file
        #   This service always drops two embedded files, one of which generates random results and the other empty results
        #   We're making a check to see if we're scanning the embedded file.
        #   In a normal service this is not something you would do at all but since we are using this
        #   service in our unit test to test all features of our report generator, we have to do this
        if request.sha256 not in [
                'd729ecfb2cf40bc4af8038dac609a57f57dbe6515d35357af973677d5e66417a',
                'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06'
        ]:
            # Main file results...

            # ==================================================================
            # Write the results:
            #   First, create a result object where all the result sections will be saved to
            result = Result()

            # ==================================================================
            # Standard text section: BODY_FORMAT.TEXT - DEFAULT
            #   Text sections basically just dump the text to the screen...
            #     All sections scores will be SUMed in the service result
            #     The Result classification will be the highest classification found in the sections
            text_section = ResultSection('Example of a default section')
            # You can add lines to your section one at a time
            #   Here we will generate a random line
            text_section.add_line(get_random_phrase())
            # Or you can add them from a list
            #   Here we will generate a random number of random lines
            text_section.add_lines(
                [get_random_phrase() for _ in range(random.randint(1, 5))])
            # If the section needs to affect the score of the file you need to set a heuristic
            #   Here we will pick one at random
            #     In addition to adding a heuristic, we will associate a signature with the heuristic;
            #     we're doing this by adding the signature name to the heuristic. (Here we're generating a random name)
            text_section.set_heuristic(random.randint(1, 4),
                                       signature=get_random_phrase(
                                           1, 4).lower().replace(" ", "_"))
            # Make sure you add your section to the result
            result.add_section(text_section)

            # ==================================================================
            # Color map Section: BODY_FORMAT.GRAPH_DATA
            #     Creates a color map bar using a minimum and maximum domain
            #     e.g. We are using this section to display the entropy distribution in some services
            cmap_min = 0
            cmap_max = 20
            color_map_data = {
                'type': 'colormap',
                'data': {
                    'domain': [cmap_min, cmap_max],
                    'values': [random.random() * cmap_max for _ in range(50)]
                }
            }
            section_color_map = ResultSection(
                "Example of colormap result section",
                body_format=BODY_FORMAT.GRAPH_DATA,
                body=json.dumps(color_map_data))
            result.add_section(section_color_map)

            # ==================================================================
            # URL section: BODY_FORMAT.URL
            #   Generate a list of clickable urls using a json encoded format
            #     As you can see here, the body of the section can be set directly instead of line by line
            random_host = get_random_host()
            url_section = ResultSection('Example of a simple url section',
                                        body_format=BODY_FORMAT.URL,
                                        body=json.dumps({
                                            "name":
                                            "Random url!",
                                            "url":
                                            f"https://{random_host}/"
                                        }))

            # Since urls are very important features we can tag those features in the system so they are easy to find
            #   Tags are defined by a type and a value
            url_section.add_tag("network.static.domain", random_host)

            # You may also want to provide a list of urls!
            #   Also, no need to provide a name; the url link will be displayed
            host1 = get_random_host()
            host2 = get_random_host()
            ip1 = get_random_ip()
            urls = [{
                "url": f"https://{host1}/"
            }, {
                "url": f"https://{host2}/"
            }, {
                "url": f"https://{ip1}/"
            }]
            url_sub_section = ResultSection(
                'Example of a url section with multiple links',
                body_format=BODY_FORMAT.URL,
                body=json.dumps(urls))
            url_sub_section.set_heuristic(random.randint(1, 4))
            url_sub_section.add_tag("network.static.ip", ip1)
            url_sub_section.add_tag("network.static.domain", host1)
            url_sub_section.add_tag("network.dynamic.domain", host2)
            # Since url_sub_section is a sub-section of url_section
            # we will add it as a sub-section of url_section not to the main result itself
            url_section.add_subsection(url_sub_section)
            result.add_section(url_section)

            # ==================================================================
            # Memory dump section: BODY_FORMAT.MEMORY_DUMP
            #     Dump whatever string content you have into a <pre/> html tag so you can do your own formatting
            data = hexdump(
                b"This is some random text that we will format as an hexdump and you'll see "
                b"that the hexdump formatting will be preserved by the memory dump section!"
            )
            memdump_section = ResultSection(
                'Example of a memory dump section',
                body_format=BODY_FORMAT.MEMORY_DUMP,
                body=data)
            memdump_section.set_heuristic(random.randint(1, 4))
            result.add_section(memdump_section)

            # ==================================================================
            # KEY_VALUE section:
            #     This section allows the service writer to list a bunch of key/value pairs to be displayed in the UI
            #     while also providing easy-to-parse data for automated tools.
            #     NB: You should definitely use this over a JSON body type since this one will be displayed correctly
            #         in the UI for the user
            #     The body argument must be a JSON dump of a dictionary (only str, int, and booleans are allowed)
            kv_body = {
                "a_str": "Some string",
                "a_bool": False,
                "an_int": 102,
            }
            kv_section = ResultSection('Example of a KEY_VALUE section',
                                       body_format=BODY_FORMAT.KEY_VALUE,
                                       body=json.dumps(kv_body))
            result.add_section(kv_section)

            # ==================================================================
            # JSON section:
            #     Re-use the JSON editor we use for administration (https://github.com/josdejong/jsoneditor)
            #     to display a tree view of JSON results.
            #     NB: Use this sparingly! As a service developer you should do your best to include important
            #     results as their own result sections.
            #     The body argument must be a JSON dump of a Python dictionary
            json_body = {
                "a_str": "Some string",
                "a_list": ["a", "b", "c"],
                "a_bool": False,
                "an_int": 102,
                "a_dict": {
                    "list_of_dict": [{
                        "d1_key": "val",
                        "d1_key2": "val2"
                    }, {
                        "d2_key": "val",
                        "d2_key2": "val2"
                    }],
                    "bool":
                    True
                }
            }
            json_section = ResultSection('Example of a JSON section',
                                         body_format=BODY_FORMAT.JSON,
                                         body=json.dumps(json_body))
            result.add_section(json_section)

            # ==================================================================
            # Re-Submitting files to the system
            #     Adding extracted files will have them resubmitted to the system for analysis

            # This file will generate random results on the next run
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "wb") as myfile:
                myfile.write(data.encode())
            request.add_extracted(temp_path, "file.txt",
                                  "Extracted by some magic!")

            # This file will generate empty results on the next run
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "wb") as myfile:
                myfile.write(b"EMPTY")
            request.add_extracted(temp_path, "empty.txt",
                                  "Extracted empty resulting file")

            # ==================================================================
            # Supplementary files
            #     Adding supplementary files will save them on the datastore for future
            #      reference but won't reprocess those files.
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "w") as myfile:
                myfile.write(json.dumps(urls))
            request.add_supplementary(temp_path, "urls.json",
                                      "These are urls as a JSON file")
            # like embedded files, you can add more than one supplementary file
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "w") as myfile:
                myfile.write(json.dumps(json_body))
            request.add_supplementary(temp_path, "json_body.json",
                                      "This is the json_body as a JSON file")

            # ==================================================================
            # Wrap-up:
            #     Save your result object back into the request
            request.result = result

        # ==================================================================
        # Empty results file
        elif request.sha256 == 'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06':
            # Creating an empty result object
            request.result = Result()

        # ==================================================================
        # Randomized results file
        else:
            # For the randomized results file, we will completely randomize the results
            #   The content of those results does not matter since we've already shown you
            #   all the different result sections, tagging, heuristics and file upload functions
            embedded_result = Result()

            # random number of sections
            for _ in range(1, 3):
                embedded_result.add_section(self._create_random_section())

            request.result = embedded_result
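    # For reference, a minimal sketch of what the hexdump() helper used in the memory
    # dump sections may look like. The real helper is imported from the AssemblyLine
    # libraries; this simplified stand-in (note the different name) is an assumption
    # for illustration only.
    @staticmethod
    def _hexdump_sketch(data: bytes, width: int = 16) -> str:
        lines = []
        for offset in range(0, len(data), width):
            chunk = data[offset:offset + width]
            hex_part = " ".join(f"{b:02x}" for b in chunk)
            ascii_part = "".join(chr(b) if 32 <= b < 127 else "." for b in chunk)
            lines.append(f"{offset:08x}  {hex_part:<{width * 3}} {ascii_part}")
        return "\n".join(lines)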
    def peepdf_analysis(self, temp_filename, file_content, request):
        file_res = Result()
        try:
            res_list = []
            # js_stream = []
            f_list = []
            js_dump = []

            pdf_parser = PDFParser()
            ret, pdf_file = pdf_parser.parse(temp_filename, True, False, file_content)
            if ret == 0:
                stats_dict = pdf_file.getStats()

                if ", ".join(stats_dict['Errors']) == "Bad PDF header, %%EOF not found, PDF sections not found, No " \
                                                      "indirect objects found in the body":
                    # Not a PDF
                    return

                json_body = dict(
                    version=stats_dict['Version'],
                    binary=stats_dict['Binary'],
                    linearized=stats_dict['Linearized'],
                    encrypted=stats_dict['Encrypted'],
                )

                if stats_dict['Encryption Algorithms']:
                    temp = []
                    for algorithmInfo in stats_dict['Encryption Algorithms']:
                        temp.append(f"{algorithmInfo[0]} {str(algorithmInfo[1])} bits")
                    json_body["encryption_algorithms"] = temp

                json_body.update(dict(
                    updates=stats_dict['Updates'],
                    objects=stats_dict['Objects'],
                    streams=stats_dict['Streams'],
                    comments=stats_dict['Comments'],
                    errors=", ".join(stats_dict['Errors']) if stats_dict['Errors'] else "None"
                ))
                res = ResultSection("PDF File Information", body_format=BODY_FORMAT.KEY_VALUE,
                                    body=json.dumps(json_body))

                for version in range(len(stats_dict['Versions'])):
                    stats_version = stats_dict['Versions'][version]
                    v_json_body = dict(
                        catalog=stats_version['Catalog'] or "no",
                        info=stats_version['Info'] or "no",
                        objects=self.list_first_x(stats_version['Objects'][1]),
                    )

                    if stats_version['Compressed Objects'] is not None:
                        v_json_body['compressed_objects'] = self.list_first_x(stats_version['Compressed Objects'][1])

                    if stats_version['Errors'] is not None:
                        v_json_body['errors'] = self.list_first_x(stats_version['Errors'][1])

                    v_json_body['streams'] = self.list_first_x(stats_version['Streams'][1])

                    if stats_version['Xref Streams'] is not None:
                        v_json_body['xref_streams'] = self.list_first_x(stats_version['Xref Streams'][1])

                    if stats_version['Object Streams'] is not None:
                        v_json_body['object_streams'] = self.list_first_x(stats_version['Object Streams'][1])

                    if int(stats_version['Streams'][0]) > 0:
                        v_json_body['encoded'] = self.list_first_x(stats_version['Encoded'][1])
                        if stats_version['Decoding Errors'] is not None:
                            v_json_body['decoding_errors'] = self.list_first_x(stats_version['Decoding Errors'][1])

                    if stats_version['Objects with JS code'] is not None:
                        v_json_body['objects_with_js_code'] = \
                            self.list_first_x(stats_version['Objects with JS code'][1])
                        # js_stream.extend(stats_version['Objects with JS code'][1])

                    res_version = ResultSection(f"Version {str(version)}", parent=res,
                                                body_format=BODY_FORMAT.KEY_VALUE, body=json.dumps(v_json_body))

                    actions = stats_version['Actions']
                    events = stats_version['Events']
                    vulns = stats_version['Vulns']
                    elements = stats_version['Elements']
                    is_suspicious = False
                    if events is not None or actions is not None or vulns is not None or elements is not None:
                        res_suspicious = ResultSection('Suspicious elements', parent=res_version)
                        if events is not None:
                            for event in events:
                                res_suspicious.add_line(f"{event}: {self.list_first_x(events[event])}")
                            is_suspicious = True
                        if actions is not None:
                            for action in actions:
                                res_suspicious.add_line(f"{action}: {self.list_first_x(actions[action])}")
                            is_suspicious = True
                        if vulns is not None:
                            for vuln in vulns:
                                if vuln in vulnsDict:
                                    temp = [vuln, ' (']
                                    for vulnCVE in vulnsDict[vuln]:
                                        if len(temp) != 2:
                                            temp.append(',')
                                        vulnCVE = "".join(vulnCVE) if isinstance(vulnCVE, list) else vulnCVE
                                        temp.append(vulnCVE)
                                        cve_found = re.search("CVE-[0-9]{4}-[0-9]{4}", vulnCVE)
                                        if cve_found:
                                            res_suspicious.add_tag('attribution.exploit',
                                                                   vulnCVE[cve_found.start():cve_found.end()])
                                            res_suspicious.add_tag('file.behavior',
                                                                   vulnCVE[cve_found.start():cve_found.end()])
                                    temp.append('): ')
                                    temp.append(str(vulns[vuln]))
                                    res_suspicious.add_line(temp)
                                else:
                                    res_suspicious.add_line(f"{vuln}: {str(vulns[vuln])}")
                                is_suspicious = True
                        if elements is not None:
                            for element in elements:
                                if element in vulnsDict:
                                    temp = [element, ' (']
                                    for vulnCVE in vulnsDict[element]:
                                        if len(temp) != 2:
                                            temp.append(',')
                                        vulnCVE = "".join(vulnCVE) if isinstance(vulnCVE, list) else vulnCVE
                                        temp.append(vulnCVE)
                                        cve_found = re.search("CVE-[0-9]{4}-[0-9]{4}", vulnCVE)
                                        if cve_found:
                                            res_suspicious.add_tag('attribution.exploit',
                                                                   vulnCVE[cve_found.start():cve_found.end()])
                                            res_suspicious.add_tag('file.behavior',
                                                                   vulnCVE[cve_found.start():cve_found.end()])
                                    temp.append('): ')
                                    temp.append(str(elements[element]))
                                    res_suspicious.add_line(temp)
                                    is_suspicious = True
                                else:
                                    res_suspicious.add_line(f"\t\t{element}: {str(elements[element])}")
                                    is_suspicious = True
                    if is_suspicious:
                        res_suspicious.set_heuristic(8)

                    urls = stats_version['URLs']
                    if urls is not None:
                        res.add_line("")
                        res_url = ResultSection('Found URLs', parent=res)
                        for url in urls:
                            res_url.add_line(f"\t\t{url}")
                        res_url.set_heuristic(9)

                    for obj in stats_version['Objects'][1]:
                        cur_obj = pdf_file.getObject(obj, version)

                        if cur_obj.containsJScode:
                            cur_res = ResultSection(f"Object [{obj} {version}] contains {len(cur_obj.JSCode)} "
                                                    f"block of JavaScript")
                            score_modifier = 0

                            js_idx = 0
                            for js in cur_obj.JSCode:
                                sub_res = ResultSection('Block of JavaScript', parent=cur_res)
                                js_idx += 1
                                js_score = 0
                                js_code, unescaped_bytes, _, _, _ = analyseJS(js)

                                js_dump += [x for x in js_code]

                                # Malicious characteristics
                                big_buffs = self.get_big_buffs("".join(js_code))
                                if len(big_buffs) > 0:
                                    js_score += 500 * len(big_buffs)
                                has_eval, has_unescape = self.check_dangerous_func("".join(js_code))
                                if has_unescape:
                                    js_score += 100
                                if has_eval:
                                    js_score += 100

                                js_cmt = ""
                                if has_eval or has_unescape or len(big_buffs) > 0:
                                    score_modifier += js_score
                                    js_cmt = "Suspiciously malicious "
                                    cur_res.add_tag('file.behavior', "Suspicious JavaScript in PDF")
                                    sub_res.set_heuristic(7)
                                js_res = ResultSection(f"{js_cmt}JavaScript Code (block: {js_idx})", parent=sub_res)

                                if js_score > 0:
                                    temp_js_outname = f"object{obj}-{version}_{js_idx}.js"
                                    temp_js_path = os.path.join(self.working_directory, temp_js_outname)
                                    temp_js_bin = "".join(js_code).encode("utf-8")
                                    with open(temp_js_path, "wb") as f:
                                        f.write(temp_js_bin)
                                    f_list.append(temp_js_path)

                                    js_res.add_line(f"The JavaScript block was saved as {temp_js_outname}")
                                    if has_eval or has_unescape:
                                        analysis_res = ResultSection("[Suspicious Functions]", parent=js_res)
                                        if has_eval:
                                            analysis_res.add_line("eval: This JavaScript block uses eval() function "
                                                                  "which is often used to launch deobfuscated "
                                                                  "JavaScript code.")
                                            analysis_res.set_heuristic(3)
                                        if has_unescape:
                                            analysis_res.add_line("unescape: This JavaScript block uses unescape() "
                                                                  "function. It may be legitimate but it is definitely "
                                                                  "suspicious since malware often use this to "
                                                                  "deobfuscate code blocks.")
                                            analysis_res.set_heuristic(3)

                                    buff_idx = 0
                                    for buff in big_buffs:
                                        buff_idx += 1
                                        error, new_buff = unescape(buff)
                                        if error == 0:
                                            buff = new_buff

                                        if buff not in unescaped_bytes:
                                            temp_path_name = None
                                            if ";base64," in buff[:100] and "data:" in buff[:100]:
                                                temp_path_name = f"obj{obj}_unb64_{buff_idx}.buff"
                                                try:
                                                    buff = b64decode(buff.split(";base64,")[1].strip())
                                                    temp_path = os.path.join(self.working_directory, temp_path_name)
                                                    with open(temp_path, "wb") as f:
                                                        f.write(buff)
                                                    f_list.append(temp_path)
                                                except Exception:
                                                    self.log.error("Found 'data:;base64, ' buffer "
                                                                   "but failed to base64 decode.")
                                                    temp_path_name = None

                                            if temp_path_name is not None:
                                                buff_cond = f" and was resubmitted as {temp_path_name}"
                                            else:
                                                buff_cond = ""
                                            buff_res = ResultSection(
                                                f"A {len(buff)} bytes buffer was found in the JavaScript "
                                                f"block{buff_cond}. Here are the first 256 bytes.",
                                                parent=js_res, body=hexdump(bytes(buff[:256], "utf-8")),
                                                body_format=BODY_FORMAT.MEMORY_DUMP)
                                            buff_res.set_heuristic(2)

                                processed_sc = []
                                sc_idx = 0
                                for sc in unescaped_bytes:
                                    if sc not in processed_sc:
                                        sc_idx += 1
                                        processed_sc.append(sc)

                                        try:
                                            # str.decode("hex") was Python 2 only; bytes.fromhex is the
                                            # Python 3 equivalent and fails the same way on non-hex data
                                            sc = bytes.fromhex(sc)
                                        except Exception:
                                            pass

                                        shell_score = 500
                                        temp_path_name = f"obj{obj}_unescaped_{sc_idx}.buff"

                                        shell_res = ResultSection(f"Unknown unescaped {len(sc)} bytes JavaScript "
                                                                  f"buffer (id: {sc_idx}) was resubmitted as "
                                                                  f"{temp_path_name}. Here are the first 256 bytes.",
                                                                  parent=js_res)
                                        shell_res.set_body(hexdump(sc[:256]), body_format=BODY_FORMAT.MEMORY_DUMP)

                                        temp_path = os.path.join(self.working_directory, temp_path_name)
                                        with open(temp_path, "wb") as f:
                                            f.write(sc)
                                        f_list.append(temp_path)

                                        cur_res.add_tag('file.behavior', "Unescaped JavaScript Buffer")
                                        shell_res.set_heuristic(6)
                                        score_modifier += shell_score

                            if score_modifier > 0:
                                res_list.append(cur_res)

                        elif cur_obj.type == "stream":
                            if cur_obj.isEncodedStream and cur_obj.filter is not None:
                                data = cur_obj.decodedStream
                                encoding = cur_obj.filter.value.replace("[", "").replace("]", "").replace("/",
                                                                                                          "").strip()
                                val = cur_obj.rawValue
                                otype = cur_obj.elements.get("/Type", None)
                                sub_type = cur_obj.elements.get("/Subtype", None)
                                length = cur_obj.elements.get("/Length", None)

                            else:
                                data = cur_obj.rawStream
                                encoding = None
                                val = cur_obj.rawValue
                                otype = cur_obj.elements.get("/Type", None)
                                sub_type = cur_obj.elements.get("/Subtype", None)
                                length = cur_obj.elements.get("/Length", None)

                            if otype:
                                otype = otype.value.replace("/", "").lower()
                            if sub_type:
                                sub_type = sub_type.value.replace("/", "").lower()
                            if length:
                                length = length.value

                            if otype == "embeddedfile":
                                if len(data) > 4096:
                                    if encoding is not None:
                                        temp_encoding_str = f"_{encoding}"
                                    else:
                                        temp_encoding_str = ""

                                    cur_res = ResultSection(
                                        f'Embedded file found ({length} bytes) [obj: {obj} {version}] '
                                        f'and dumped for analysis {f"(Type: {otype}) " if otype is not None else ""}'
                                        f'{f"(SubType: {sub_type}) " if sub_type is not None else ""}'
                                        f'{f"(Encoded with {encoding})" if encoding is not None else ""}'
                                    )

                                    temp_path_name = f"EmbeddedFile_{obj}{temp_encoding_str}.obj"
                                    temp_path = os.path.join(self.working_directory, temp_path_name)
                                    with open(temp_path, "wb") as f:
                                        f.write(data)
                                    f_list.append(temp_path)

                                    cur_res.add_line(f"The EmbeddedFile object was saved as {temp_path_name}")
                                    res_list.append(cur_res)

                            elif otype not in BANNED_TYPES:
                                cur_res = ResultSection(
                                    f'Unknown stream found [obj: {obj} {version}] '
                                    f'{f"(Type: {otype}) " if otype is not None else ""}'
                                    f'{f"(SubType: {sub_type}) " if sub_type is not None else ""}'
                                    f'{f"(Encoded with {encoding})" if encoding is not None else ""}'
                                )
                                for line in val.splitlines():
                                    cur_res.add_line(line)

                                emb_res = ResultSection('First 256 bytes', parent=cur_res)
                                first_256 = data[:256]
                                if isinstance(first_256, str):
                                    first_256 = first_256.encode()
                                emb_res.set_body(hexdump(first_256), BODY_FORMAT.MEMORY_DUMP)
                                res_list.append(cur_res)
                        else:
                            pass

                file_res.add_section(res)

                for results in res_list:
                    file_res.add_section(results)

                if js_dump:
                    js_dump_res = ResultSection('Full JavaScript dump')

                    temp_js_dump = "javascript_dump.js"
                    temp_js_dump_path = os.path.join(self.working_directory, temp_js_dump)
                    # .encode() can fail on unencodable content (it raises UnicodeEncodeError,
                    # not UnicodeDecodeError); fall back to replacement characters so the dump
                    # is always written out as bytes
                    try:
                        temp_js_dump_bin = "\n\n----\n\n".join(js_dump).encode("utf-8")
                    except UnicodeEncodeError:
                        temp_js_dump_bin = "\n\n----\n\n".join(js_dump).encode("utf-8", errors="replace")
                    temp_js_dump_sha1 = hashlib.sha1(temp_js_dump_bin).hexdigest()
                    with open(temp_js_dump_path, "wb") as f:
                        f.write(temp_js_dump_bin)
                    f_list.append(temp_js_dump_path)

                    js_dump_res.add_line(f"The JavaScript dump was saved as {temp_js_dump}")
                    js_dump_res.add_line(f"The SHA-1 for the JavaScript dump is {temp_js_dump_sha1}")

                    js_dump_res.add_tag('file.pdf.javascript.sha1', temp_js_dump_sha1)
                    file_res.add_section(js_dump_res)

                for filename in f_list:
                    request.add_extracted(filename, os.path.basename(filename),
                                          f"Dumped from {os.path.basename(temp_filename)}")

            else:
                res = ResultSection("ERROR: Could not parse file with PeePDF.")
                file_res.add_section(res)
        finally:
            request.result = file_res
            try:
                del pdf_file
            except Exception:
                pass

            try:
                del pdf_parser
            except Exception:
                pass

            gc.collect()
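    # A hypothetical sketch of the check_dangerous_func() helper used above: it only
    # needs to report whether eval()/unescape() calls appear in a JavaScript block.
    # The real service may detect these differently; this regex version (note the
    # different name) is an assumption for illustration only.
    @staticmethod
    def _check_dangerous_func_sketch(js_code: str):
        import re
        has_eval = re.search(r"\beval\s*\(", js_code) is not None
        has_unescape = re.search(r"\bunescape\s*\(", js_code) is not None
        return has_eval, has_unescape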
    def execute(self, request):
        # ==================================================================
        # Execute a request:
        #   Every time your service receives a new file to scan, the execute function is called
        #   This is where you should execute your processing code.
        #   For the purpose of this example, we will only generate results ...

        # You should run your code here...

        # ==================================================================
        # Check if we're scanning an embedded file
        #   This service always drops 3 embedded files: two of them generate random results and the other empty results
        #   We're making a check to see if we're scanning one of those embedded files.
        #   In a normal service this is not something you would do at all but since we are using this
        #   service in our unit tests to test all features of our report generator, we have to do this
        if request.sha256 not in [
                'd729ecfb2cf40bc4af8038dac609a57f57dbe6515d35357af973677d5e66417a',
                '5ce5ae8ef56a54af2c44415800a81ecffd49a33ae8895dfe38fc1075d3f619ec',
                'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06'
        ]:
            # Main file results...

            # ==================================================================
            # Write the results:
            #   First, create a result object where all the result sections will be saved to
            result = Result()

            # ==================================================================
            # Standard text section: BODY_FORMAT.TEXT - DEFAULT
            #   Text sections basically just dump the text to the screen...
            #     All section scores will be summed in the service result
            #     The Result classification will be the highest classification found in the sections
            text_section = ResultTextSection('Example of a default section')
            # You can add lines to your section one at a time
            #   Here we will generate a random line
            text_section.add_line(get_random_phrase())
            # Or you can add them from a list
            #   Here we will generate a random amount of random lines
            text_section.add_lines(
                [get_random_phrase() for _ in range(random.randint(1, 5))])
            # You can tag data to a section, tagging is used to quickly find defining information about a file
            text_section.add_tag("attribution.implant", "ResultSample")
            # If the section needs to affect the score of the file you need to set a heuristic
            #   Here we will pick one at random
            #     In addition to adding a heuristic, we will associate a signature with the heuristic;
            #     we do this by adding the signature name to the heuristic. (Here we are generating a random name)
            text_section.set_heuristic(3, signature="sig_one")
            # You can attach attack ids to heuristics after they were defined
            text_section.heuristic.add_attack_id(
                random.choice(list(software_map.keys())))
            text_section.heuristic.add_attack_id(
                random.choice(list(attack_map.keys())))
            text_section.heuristic.add_attack_id(
                random.choice(list(group_map.keys())))
            text_section.heuristic.add_attack_id(
                random.choice(list(revoke_map.keys())))
            # Same thing for the signatures: they can be added to the heuristic after the fact and you can even say
            #   how many times the signature fired by setting its frequency. If you call add_signature_id twice with
            #   the same signature, this will effectively increase the frequency of the signature.
            text_section.heuristic.add_signature_id("sig_two",
                                                    score=20,
                                                    frequency=2)
            text_section.heuristic.add_signature_id("sig_two",
                                                    score=20,
                                                    frequency=3)
            text_section.heuristic.add_signature_id("sig_three")
            text_section.heuristic.add_signature_id("sig_three")
            text_section.heuristic.add_signature_id("sig_four", score=0)
            # The heuristic for text_section should have the following properties
            #   1. 1 attack ID: T1066
            #   2. 4 signatures: sig_one, sig_two, sig_three and sig_four
            #   3. Signature frequencies are cumulative, therefore they will be as follows:
            #      - sig_one = 1
            #      - sig_two = 5
            #      - sig_three = 2
            #      - sig_four = 1
            #   4. The score used by each heuristic is driven by the following rules: the signature_score_map has the
            #      highest priority, then the score value passed to add_signature_id is in second place, and finally
            #      the default heuristic score is used. Therefore the scores used to calculate the total score for the
            #      text_section are as follows:
            #      - sig_one: 10    -> heuristic default score
            #      - sig_two: 20    -> score provided by the function add_signature_id
            #      - sig_three: 30  -> score provided by the heuristic map
            #      - sig_four: 40   -> score provided by the heuristic map because it's higher priority than the
            #                          function score
            #    5. Total section score is then: 1x10 + 5x20 + 2x30 + 1x40 = 210
            # Make sure you add your section to the result
            result.add_section(text_section)

            # Even if the section was added to the results you can still modify it by adding a subsection for example
            ResultSection(
                "Example of sub-section without a body added later in processing",
                parent=text_section)

            # ==================================================================
            # Color map Section: BODY_FORMAT.GRAPH_DATA
            #     Creates a color map bar using a minimum and maximum domain
            #     e.g. We are using this section to display the entropy distribution in some services
            cmap_min = 0
            cmap_max = 20
            cmap_values = [random.random() * cmap_max for _ in range(50)]
            # The classification of a section can be set to any valid classification for your system
            section_color_map = ResultGraphSection(
                "Example of colormap result section",
                classification=cl_engine.RESTRICTED)
            section_color_map.set_colormap(cmap_min, cmap_max, cmap_values)
            result.add_section(section_color_map)

            # ==================================================================
            # URL section: BODY_FORMAT.URL
            #   Generate a list of clickable urls using a json encoded format
            #     As you can see here, the body of the section can be set directly instead of line by line
            random_host = get_random_host()
            url_section = ResultURLSection('Example of a simple url section')
            url_section.add_url(f"https://{random_host}/", name="Random url!")

            # Since URLs are very important features, we can tag them in the system so they are easy to find
            #   Tags are defined by a type and a value
            url_section.add_tag("network.static.domain", random_host)

            # You may also want to provide a list of urls!
            #   Also, no need to provide a name, the url link will be displayed
            hosts = [get_random_host() for _ in range(2)]

            # A heuristic can fire more than once without being associated to a signature
            url_heuristic = Heuristic(4, frequency=len(hosts))

            url_sub_section = ResultURLSection(
                'Example of a url sub-section with multiple links',
                heuristic=url_heuristic,
                classification=cl_engine.RESTRICTED)
            for host in hosts:
                url_sub_section.add_url(f"https://{host}/")
                url_sub_section.add_tag("network.static.domain", host)

            # You can keep nesting sections if you really need to
            ips = [get_random_ip() for _ in range(3)]
            url_sub_sub_section = ResultURLSection(
                'Example of a two level deep sub-section')
            for ip in ips:
                url_sub_sub_section.add_url(f"https://{ip}/")
                url_sub_sub_section.add_tag("network.static.ip", ip)

            # Since url_sub_sub_section is a sub-section of url_sub_section
            # we will add it as a sub-section of url_sub_section not to the main result itself
            url_sub_section.add_subsection(url_sub_sub_section)

            # Invalid sections will be ignored, and an error will appear in the logs
            # Sub-sections of invalid sections will be ignored too
            invalid_section = ResultSection("")
            ResultSection(
                "I won't make it to the report because my parent is invalid :(",
                parent=invalid_section)
            url_sub_section.add_subsection(invalid_section)

            # Since url_sub_section is a sub-section of url_section
            # we will add it as a sub-section of url_section not to the main result itself
            url_section.add_subsection(url_sub_section)

            result.add_section(url_section)

            # ==================================================================
            # Memory dump section: BODY_FORMAT.MEMORY_DUMP
            #     Dump whatever string content you have into a <pre/> html tag so you can do your own formatting
            data = hexdump(
                b"This is some random text that we will format as an hexdump and you'll see "
                b"that the hexdump formatting will be preserved by the memory dump section!"
            )
            memdump_section = ResultMemoryDumpSection(
                'Example of a memory dump section', body=data)
            memdump_section.set_heuristic(random.randint(1, 4))
            result.add_section(memdump_section)

            # ==================================================================
            # KEY_VALUE section:
            #     This section allows the service writer to list a bunch of key/value pairs to be displayed in the UI
            #     while also providing easy-to-parse data for automated tools.
            #     NB: You should definitely use this over a JSON body type since this one will be displayed correctly
            #         in the UI for the user
            #     The body argument must be a dictionary (only str, int, and booleans are allowed)
            kv_section = ResultKeyValueSection(
                'Example of a KEY_VALUE section')
            # You can add items individually
            kv_section.set_item('key', "value")
            # Or simply add them in bulk
            kv_section.update_items({
                "a_str": "Some string",
                "a_bool": False,
                "an_int": 102,
            })
            result.add_section(kv_section)

            # ==================================================================
            # ORDERED_KEY_VALUE section:
            #     This section provides the same functionality as the KEY_VALUE section except that the order of the
            #     fields is guaranteed to be preserved in the order in which they are added to the section. Also with
            #     this section, you can repeat the same key name multiple times
            ordered_kv_section = ResultOrderedKeyValueSection(
                'Example of an ORDERED_KEY_VALUE section')
            # You can add items individually
            for x in range(random.randint(3, 6)):
                ordered_kv_section.add_item(f'key{x}', f"value{x}")

            result.add_section(ordered_kv_section)

            # ==================================================================
            # JSON section:
            #     Re-use the JSON editor we use for administration (https://github.com/josdejong/jsoneditor)
            #     to display a tree view of JSON results.
            #     NB: Use this sparingly! As a service developer you should do your best to include important
            #     results as their own result sections.
            #     The body argument must be a Python dictionary
            json_body = {
                "a_str": "Some string",
                "a_list": ["a", "b", "c"],
                "a_bool": False,
                "an_int": 102,
                "a_dict": {
                    "list_of_dict": [{
                        "d1_key": "val",
                        "d1_key2": "val2"
                    }, {
                        "d2_key": "val",
                        "d2_key2": "val2"
                    }],
                    "bool":
                    True
                }
            }
            json_section = ResultJSONSection('Example of a JSON section')
            # You can set the json result to a specific value
            json_section.set_json(json_body)
            # You can also update specific parts after the fact
            json_section.update_json({
                'an_int': 1000,
                'updated_key': 'updated_value'
            })

            result.add_section(json_section)

            # ==================================================================
            # PROCESS_TREE section:
            #     This section allows the service writer to list a bunch of dictionary objects that have nested lists
            #     of dictionaries to be displayed in the UI. Each dictionary object represents a process, and therefore
            #     each dictionary must be of the following format:
            #     {
            #       "process_pid": int,
            #       "process_name": str,
            #       "command_line": str,
            #       "signatures": {}  This dict has the signature name as a key and the score as it's value
            #       "children": []    NB: This list either is empty or contains more dictionaries that have the same
            #                             structure
            #     }
            process_tree_section = ResultProcessTreeSection(
                'Example of a PROCESS_TREE section')
            # You can use the ProcessItem class to create the processes to add to the result section
            evil_process = ProcessItem(123, "evil.exe", "c:\\evil.exe")
            evil_process_child_1 = ProcessItem(
                321, "takeovercomputer.exe",
                "C:\\Temp\\takeovercomputer.exe -f do_bad_stuff")
            # You can add child processes to the ProcessItem objects
            evil_process_child_1.add_child_process(
                ProcessItem(
                    456,
                    "evenworsethanbefore.exe",
                    "C:\\Temp\\evenworsethanbefore.exe -f change_reg_key_cuz_im_bad",
                    signatures={
                        "one": 10,
                        "two": 10,
                        "three": 10
                    }))
            evil_process_child_1.add_child_process(
                ProcessItem(234,
                            "badfile.exe",
                            "C:\\badfile.exe -k nothing_to_see_here",
                            signatures={
                                "one": 1000,
                                "two": 10,
                                "three": 10,
                                "four": 10,
                                "five": 10
                            }))

            # You can add signatures that hit on a ProcessItem Object
            evil_process_child_1.add_signature('one', 250)

            # Or even directly create the ProcessItem object with the signature in it
            evil_process_child_2 = ProcessItem(
                345,
                "benignexe.exe",
                "C:\\benignexe.exe -f \"just kidding, i'm evil\"",
                signatures={"one": 2000})

            # You can also add counts for network, file and registry events to a ProcessItem object
            evil_process_child_2.add_network_events(4)
            evil_process_child_2.add_file_events(7000)
            evil_process_child_2.add_registry_events(10)

            # You can also indicate if the process tree item has been safelisted
            benign_process = ProcessItem(678, "trustme.exe", "C:\\trustme.exe")
            benign_process.safelist()

            evil_process.add_child_process(evil_process_child_1)
            evil_process.add_child_process(evil_process_child_2)

            # Add your processes to the result section via the add_process function
            process_tree_section.add_process(evil_process)
            process_tree_section.add_process(
                ProcessItem(987, "runzeroday.exe",
                            "C:\\runzeroday.exe -f insert_bad_spelling"))
            process_tree_section.add_process(benign_process)

            result.add_section(process_tree_section)
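            # For reference, the processes added above roughly render as the following
            # tree in the UI (pids in parentheses):
            #   evil.exe (123)
            #   ├── takeovercomputer.exe (321)
            #   │   ├── evenworsethanbefore.exe (456)
            #   │   └── badfile.exe (234)
            #   └── benignexe.exe (345)
            #   runzeroday.exe (987)
            #   trustme.exe (678, safelisted)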

            # ==================================================================
            # TABLE section:
            #     This section allows the service writer to have their content displayed in a table format in the UI
            #     The body argument must be a list [] of dict {} objects. A dict object can have a key value pair
            #     where the value is a flat nested dictionary, and this nested dictionary will be displayed as a nested
            #     table within a cell.
            table_section = ResultTableSection('Example of a TABLE section')
            # Use the TableRow class to help add rows to the Table section
            table_section.add_row(
                TableRow(a_str="Some string1",
                         extra_column_here="confirmed",
                         a_bool=False,
                         an_int=101))
            table_section.add_row(
                TableRow(
                    {
                        "a_str": "Some string2",
                        "a_bool": True,
                        "an_int": "to_be_overriden_by_kwargs"
                    },
                    an_int=102))
            table_section.add_row(
                TableRow(a_str="Some string3", a_bool=False, an_int=103))
            # Valid values for the items in the TableRow are: str, int, bool, None, or dict of those values
            table_section.add_row(
                TableRow(
                    {
                        "a_str": "Some string4",
                        "a_bool": None,
                        "an_int": -1000000000000000000
                    }, {
                        "extra_column_there": "confirmed",
                        "nested_key_value_pair": {
                            "a_str": "Some string3",
                            "a_bool": False,
                            "nested_kv_thats_too_deep": {
                                "a_str": "Some string3",
                                "a_bool": False,
                                "an_int": 103,
                            },
                        }
                    }))
            result.add_section(table_section)

            # ==================================================================
            # Re-Submitting files to the system
            #     Adding extracted files will have them resubmitted to the system for analysis

            # This file will generate random results on the next run
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "wb") as myfile:
                myfile.write(data.encode())
            request.add_extracted(temp_path, "file.txt",
                                  "Extracted by some magic!")

            # Embedded files can also have their own classification!
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "wb") as myfile:
                myfile.write(b"CLASSIFIED!!!__" + data.encode())
            request.add_extracted(temp_path,
                                  "classified.doc",
                                  "Classified file ... don't look",
                                  classification=cl_engine.RESTRICTED)

            # This file will generate empty results on the next run
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "wb") as myfile:
                myfile.write(b"EMPTY")
            request.add_extracted(temp_path, "empty.txt",
                                  "Extracted empty resulting file")

            # ==================================================================
            # Supplementary files
            #     Adding supplementary files will save them on the datastore for future
            #      reference but won't reprocess those files.
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "w") as myfile:
                myfile.write(url_sub_section.body)
            request.add_supplementary(temp_path, "urls.json",
                                      "These are urls as a JSON file")
            # like embedded files, you can add more than one supplementary file
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "w") as myfile:
                myfile.write(json.dumps(json_body))
            request.add_supplementary(temp_path, "json_body.json",
                                      "This is the json_body as a JSON file")

            # ==================================================================
            # Zeroize on safe tags
            #     When this feature is turned on, the section will get its score set to zero if all its tags
            #     were safelisted by the safelisting engine
            zero_section = ResultSection('Example of zeroize-able section',
                                         zeroize_on_tag_safe=True)
            zero_section.set_heuristic(2)
            zero_section.add_line(
                "This section will have a zero score if all tags are safelisted."
            )
            zero_section.add_tag('network.static.ip', '127.0.0.1')
            result.add_section(zero_section)

            # ==================================================================
            # Auto-collapse
            #     When this feature is turned on, the section will be collapsed when first displayed
            collapse_section = ResultSection(
                'Example of auto-collapse section', auto_collapse=True)
            collapse_section.set_heuristic(2)
            collapse_section.add_line(
                "This section was collapsed when first loaded in the UI")
            result.add_section(collapse_section)

            # ==================================================================
            # Image Section
            #     This type of section allows the service writer to display images to the user
            image_section = ResultImageSection(request,
                                               'Example of Image section')
            for x in range(6):
                image_section.add_image(f'data/000{x+1}.jpg',
                                        f'000{x+1}.jpg',
                                        f'ResultSample screenshot 000{x+1}',
                                        ocr_heuristic_id=6)
            result.add_section(image_section)

            # ==================================================================
            # Multi Section
            #     This type of section allows the service writer to display multiple section types
            #     in the same result section. Here's a concrete example of this:
            multi_section = ResultMultiSection(
                'Example of Multi-typed section')
            multi_section.add_section_part(
                TextSectionBody(
                    body="We have detected very high entropy multiple sections "
                    "of your file, this section is most-likely packed or "
                    "encrypted.\n\nHere are affected sections:"))
            section_count = random.randint(1, 4)
            for x in range(section_count):
                multi_section.add_section_part(
                    KVSectionBody(section_name=f".UPX{x}",
                                  offset=f'0x00{8+x}000',
                                  size='4196 bytes'))
                graph_part = GraphSectionBody()
                graph_part.set_colormap(
                    0, 8, [7 + random.random() for _ in range(20)])
                multi_section.add_section_part(graph_part)
                if x != section_count - 1:
                    multi_section.add_section_part(DividerSectionBody())
                multi_section.add_tag("file.pe.sections.name", f".UPX{x}")

            multi_section.set_heuristic(5)
            result.add_section(multi_section)

            # ==================================================================
            # Propagate temporary submission data to other services
            #   Sometimes two services can work in tandem where one extracts some piece of information that the
            #   other one uses to do its work. This is how a service can set temporary data that other
            #   services that subscribe to it can use.
            request.temp_submission_data['kv_section'] = kv_section.body
            request.temp_submission_data[
                'process_tree_section'] = process_tree_section.body
            request.temp_submission_data['url_section'] = url_sub_section.body

            # ==================================================================
            # Wrap-up:
            #     Save your result object back into the request
            request.result = result

        # ==================================================================
        # Empty results file
        elif request.sha256 == 'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06':
            # Creating an empty result object
            request.result = Result()

        # ==================================================================
        # Randomized results file
        else:
            # For the randomized results file, we will completely randomize the results
            #   The content of those results does not matter since we've already shown you
            #   all the different result sections, tagging, heuristics and file upload functions
            embedded_result = Result()

            # random number of sections
            for _ in range(1, 3):
                embedded_result.add_section(self._create_random_section())

            request.result = embedded_result
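    # The is_valid_domain/is_valid_ip/is_valid_email helpers used below normally come
    # from the AssemblyLine network utilities. The sketches here (note the leading
    # underscore names) are simplified assumptions for illustration only:
    @staticmethod
    def _is_valid_ip_sketch(data: str) -> bool:
        # Accept anything the standard library parses as an IPv4/IPv6 address
        import ipaddress
        try:
            ipaddress.ip_address(data)
            return True
        except ValueError:
            return False

    @staticmethod
    def _is_valid_email_sketch(data: str) -> bool:
        # Very loose check: one '@', no whitespace, and a dotted TLD of 2+ letters
        import re
        return re.fullmatch(r"[^@\s]+@[^@\s]+\.[A-Za-z]{2,}", data) is not None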
    def find_network_indicators(apktool_out_dir: str, result: Result):
        # Whitelist
        skip_list = [
            "android.intent",
            "com.google",
            "com.android",
        ]

        indicator_whitelist = [
            'google.to',
            'google.ttl',
            'google.delay',
            'google_tagmanager.db',
            'gtm_urls.db',
            'gtm.url',
            'google_analytics_v4.db',
            'Theme.Dialog.Alert',
            'popupLocationInfo.gravity',
            'popupLocationInfo.displayId',
            'popupLocationInfo.left',
            'popupLocationInfo.top',
            'popupLocationInfo.right',
            'popupLocationInfo.bottom',
            'googleads.g.doubleclick.net',
            'ad.doubleclick.net',
            '.doubleclick.net',
            '.googleadservices.com',
            '.googlesyndication.com',
            'android.hardware.type.watch',
            'mraid.js',
            'google_inapp_purchase.db',
            'mobileads.google.com',
            'share_history.xml',
            'activity_choser_model_history.xml',
            'FragmentPager.SavedState{',
            'android.remoteinput.results',
            'android.people',
            'android.picture',
            'android.icon',
            'android.text',
            'android.title',
            'android.title.big',
            'FragmentTabHost.SavedState{',
            'libcore.icu.ICU',
        ]

        file_list = []

        # Indicators
        url_list = []
        domain_list = []
        ip_list = []
        email_list = []

        # Build dynamic whitelist
        smali_dir = os.path.join(apktool_out_dir, "smali")
        for root, dirs, files in os.walk(smali_dir):
            if not files:
                continue
            else:
                skip_list.append(root.replace(smali_dir + "/", "").replace("/", "."))

            for cdir in dirs:
                skip_list.append(os.path.join(root, cdir).replace(smali_dir + "/", "").replace("/", "."))

        asset_dir = os.path.join(apktool_out_dir, "assets")
        if os.path.exists(asset_dir):
            for root, dirs, files in os.walk(asset_dir):
                if not files:
                    continue
                else:
                    for asset_file in files:
                        file_list.append(asset_file)
        skip_list = list(set(skip_list))
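        # Illustrative note: the walk above turns smali package directories into
        # dotted prefixes, e.g. "smali/com/example/app" becomes "com.example.app",
        # so strings that merely name the app's own packages are not reported as
        # network indicators.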

        # Find indicators
        proc = Popen(['grep', '-ER', r'(([[:alpha:]](-?[[:alnum:]])*)\.)*[[:alpha:]](-?[[:alnum:]])+\.[[:alpha:]]{2,}',
                      smali_dir], stdout=PIPE, stderr=PIPE)
        grep, _ = proc.communicate()
        for line in safe_str(grep).splitlines():
            file_path, line = line.split(":", 1)

            if "const-string" in line or "Ljava/lang/String;" in line:
                data = line.split("\"", 1)[1].split("\"")[0]
                data_low = data.lower()
                data_split = data.split(".")
                if data in file_list:
                    continue
                elif data in indicator_whitelist:
                    continue
                elif data.startswith("/"):
                    continue
                elif data_low.startswith("http://") or data_low.startswith('ftp://') or data_low.startswith('https://'):
                    url_list.append(data)
                elif len(data_split[0]) < len(data_split[-1]) and len(data_split[-1]) > 3:
                    continue
                elif data.startswith('android.') and data_low != data:
                    continue
                elif "/" in data and "." in data and data.index("/") < data.index("."):
                    continue
                elif " " in data:
                    continue
                elif data_split[0] in ['com', 'org', 'net', 'java']:
                    continue
                elif data_split[-1].lower() in ['so', 'properties', 'zip', 'read', 'id', 'store',
                                                'name', 'author', 'sh', 'soccer', 'fitness', 'news', 'video']:
                    continue
                elif data.endswith("."):
                    continue
                else:
                    do_skip = False
                    for skip in skip_list:
                        if data.startswith(skip):
                            do_skip = True
                            break

                    if do_skip:
                        continue

                    data = data.strip(".")

                    if is_valid_domain(data):
                        domain_list.append(data)
                    elif is_valid_ip(data):
                        ip_list.append(data)
                    elif is_valid_email(data):
                        email_list.append(data)

        url_list = list(set(url_list))
        for url in url_list:
            dom_ip = url.split("//")[1].split("/")[0]
            if ":" in dom_ip:
                dom_ip = dom_ip.split(":")[0]

            if is_valid_ip(dom_ip):
                ip_list.append(dom_ip)
            elif is_valid_domain(dom_ip):
                domain_list.append(dom_ip)

        ip_list = list(set(ip_list))
        domain_list = list(set(domain_list))
        email_list = list(set(email_list))

        if url_list or ip_list or domain_list or email_list:
            res_net = ResultSection("Network indicator(s) found", parent=result, heuristic=Heuristic(3))

            if url_list:
                res_url = ResultSection("Found URLs in the decompiled code", parent=res_net)
                count = 0
                for url in url_list:
                    count += 1
                    if count <= 20:
                        res_url.add_line(url)
                    res_url.add_tag('network.static.uri', url)
                if count > 20:
                    res_url.add_line(f"and {count - 20} more...")

            if ip_list:
                res_ip = ResultSection("Found IPs in the decompiled code", parent=res_net)
                count = 0
                for ip in ip_list:
                    count += 1
                    if count <= 20:
                        res_ip.add_line(ip)
                    res_ip.add_tag('network.static.ip', ip)
                if count > 20:
                    res_ip.add_line(f"and {count - 20} more...")

            if domain_list:
                res_domain = ResultSection("Found domains in the decompiled code", parent=res_net)
                count = 0
                for domain in domain_list:
                    count += 1
                    if count <= 20:
                        res_domain.add_line(domain)
                    res_domain.add_tag('network.static.domain', domain)
                if count > 20:
                    res_domain.add_line(f"and {count - 20} more...")

            if email_list:
                res_email = ResultSection("Found email addresses in the decompiled code", parent=res_net)
                count = 0
                for email in email_list:
                    count += 1
                    if count <= 20:
                        res_email.add_line(email)
                    res_email.add_tag('network.email.address', email)
                if count > 20:
                    res_email.add_line(f"and {count - 20} more...")
Example #23
    def execute(self, request):
        request.result = Result()
        self.result = request.result
        file_path = request.file_path
        fh = open(file_path, 'rb')
        try:
            self.swf = SWF(fh)
            if self.swf is None:
                raise Exception("self.swf is None")
        except Exception as e:
            self.log.exception(
                f"Unable to parse file {request.sha256}: {str(e)}")
            fh.close()
            raise
        self.tag_summary = defaultdict(list)
        self.symbols = {}
        self.binary_data = {}
        self.exported_assets = []
        self.big_buffers = set()
        self.has_product_info = False
        self.anti_decompilation = False
        self.recent_compile = False
        self.disasm_path = None

        header_subsection = ResultSection(title_text="SWF Header",
                                          parent=self.result)
        if self.swf.header.version:
            header_subsection.add_line("Version: %d" % self.swf.header.version)
            header_subsection.add_tag(tag_type="file.swf.header.version",
                                      value=str(self.swf.header.version))
        header_subsection.add_line("File length: %d" %
                                   self.swf.header.file_length)
        frame_size = str(self.swf.header.frame_size)
        if frame_size:
            header_subsection.add_line("Frame size: %s" % frame_size)
            header_subsection.add_tag(tag_type="file.swf.header.frame.size",
                                      value=frame_size)
        if self.swf.header.frame_rate:
            header_subsection.add_line("Frame rate: %d" %
                                       self.swf.header.frame_rate)
            header_subsection.add_tag(tag_type="file.swf.header.frame.rate",
                                      value=str(self.swf.header.frame_rate))
        if self.swf.header.frame_count:
            header_subsection.add_line("Frame count: %d" %
                                       self.swf.header.frame_count)
            header_subsection.add_tag(tag_type="file.swf.header.frame.count",
                                      value=str(self.swf.header.frame_count))

        # Parse Tags
        tag_subsection = ResultSection(title_text="SWF Tags",
                                       parent=self.result)
        tag_types = []
        for tag in self.swf.tags:
            self.tag_analyzers.get(SWF_TAGS.get(tag.type), self._dummy)(tag)
            tag_types.append(str(tag.type))
        tag_list = ','.join(tag_types)
        tags_ssdeep = ssdeep.hash(tag_list)
        tag_subsection.add_tag(tag_type="file.swf.tags_ssdeep",
                               value=tags_ssdeep)
        # TODO: not sure we want to split those...
        # _, hash_one, hash_two = tags_ssdeep.split(':')
        # tag_subsection.add_tag(tag_type=TAG_TYPE.SWF_TAGS_SSDEEP, value=hash_one)
        # tag_subsection.add_tag(tag_type=TAG_TYPE.SWF_TAGS_SSDEEP, value=hash_two)
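        # Illustrative note: hashing the ordered tag-type list with ssdeep gives a
        # fuzzy fingerprint of the SWF's structure, so two samples with similar tag
        # layouts can be matched later with ssdeep.compare(hash_a, hash_b), which
        # returns a 0-100 similarity score.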

        # Script Overview
        if len(self.symbols.keys()) > 0:
            root_symbol = 'unspecified'
            if 0 in self.symbols:
                root_symbol = self.symbols[0]
                self.symbols.pop(0)
            symbol_subsection = ResultSection(title_text="Symbol Summary",
                                              parent=self.result)
            symbol_subsection.add_line(f'Main: {root_symbol}')
            if len(self.symbols.keys()) > 0:
                for tag_id, name in sorted([(k, v)
                                            for k, v in self.symbols.items()]):
                    symbol_subsection.add_line(f'ID:{tag_id} - {name}')

        if len(self.binary_data.keys()) > 0:
            binary_subsection = ResultSection(
                title_text="Attached Binary Data",
                heuristic=Heuristic(3),
                parent=self.result)
            for tag_id, tag_data in self.binary_data.items():
                tag_name = self.symbols.get(tag_id, 'unspecified')
                binary_subsection.add_line(f'ID:{tag_id} - {tag_name}')
                try:
                    binary_filename = hashlib.sha256(
                        tag_data).hexdigest() + '.attached_binary'
                    binary_path = os.path.join(self.working_directory,
                                               binary_filename)
                    # Use a distinct handle name so the SWF file handle (fh) is
                    # not shadowed and can still be closed at the end of execute
                    with open(binary_path, 'wb') as out_fh:
                        out_fh.write(tag_data)
                    request.add_extracted(
                        binary_path, f"{tag_name}_{tag_id}",
                        f"SWF Embedded Binary Data {str(tag_id)}")
                except Exception:
                    self.log.exception(
                        "Error submitting embedded binary data for swf:")

        tags_subsection = ResultSection(title_text="Tags of Interest")
        for tag in sorted(self.tag_summary.keys()):
            body = []
            summaries = self.tag_summary[tag]
            for summary in summaries:
                summary_line = '\t'.join(summary)
                body.append(summary_line)
            if body:
                subtag_section = ResultSection(title_text=tag,
                                               parent=tags_subsection)
                subtag_section.add_lines(body)
        if len(tags_subsection.subsections) > 0:
            self.result.add_section(tags_subsection)

        if len(self.big_buffers) > 0:
            bbs = ResultSection(title_text="Large String Buffers",
                                heuristic=Heuristic(1),
                                parent=self.result)
            for buf in self.big_buffers:
                if isinstance(buf, str):
                    buf = buf.encode()
                bbs.add_line("Found a %d byte string." % len(buf))
                buf_filename = ""
                try:
                    buf_filename = hashlib.sha256(
                        buf).hexdigest() + '.stringbuf'
                    buf_path = os.path.join(self.working_directory,
                                            buf_filename)
                    with open(buf_path, 'wb') as out_fh:
                        out_fh.write(buf)
                    request.add_extracted(buf_path, buf_filename,
                                          "AVM2 Large String Buffer")
                except Exception:
                    self.log.exception(
                        "Error submitting AVM2 String Buffer %s" %
                        buf_filename)

        if not self.has_product_info:
            self.log.debug("Missing product info.")
            no_info = ResultSection(title_text="Missing Product Information",
                                    heuristic=Heuristic(5),
                                    parent=self.result)
            no_info.add_line(
                "This SWF doesn't specify information about the product that created it."
            )

        if self.anti_decompilation:
            self.log.debug("Anti-disassembly techniques may be present.")
            no_dis = ResultSection(title_text="Incomplete Disassembly",
                                   heuristic=Heuristic(4),
                                   parent=self.result)
            no_dis.add_line(
                "This SWF may contain intentional corruption or obfuscation to prevent disassembly."
            )

        if self.recent_compile:
            recent_compile = ResultSection(title_text="Recent Compilation",
                                           heuristic=Heuristic(2),
                                           parent=self.result)
            recent_compile.add_line(
                "This SWF was compiled within the last 24 hours.")

        fh.close()
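
    # Hedged sketch of the `_dummy` fallback used in the tag-dispatch loop above
    # (`self.tag_analyzers.get(..., self._dummy)(tag)`). Any tag type without a
    # dedicated analyzer lands here; a no-op keeps the dispatch table simple.
    # Illustrative only -- the real service defines its own version.
    def _dummy(self, tag):
        # Unhandled SWF tag types are intentionally ignored
        pass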
    def execute(self, request):
        parser = eml_parser.eml_parser.EmlParser(include_raw_body=True,
                                                 include_attachment_data=True)
        content_str = request.file_contents

        # Attempt conversion of potential Outlook file -> eml
        if request.file_type == "document/office/email":
            try:
                content_str = msg2eml(request.file_path).as_bytes()
            except Exception:
                # Try using mailparser to convert
                converted_path, _ = msgconvert(request.file_path)
                content_str = open(converted_path, "rb").read()
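                # Illustrative note: msgconvert here is assumed to wrap the
                # Perl "msgconvert" utility (Email::Outlook::Message), which
                # rewrites an Outlook .msg file as an RFC 822 .eml on disk and
                # returns the converted file's path.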

        header_agg = {
            "From": set(),
            "To": set(),
            "Cc": set(),
            "Sent": set(),
            "Reply-To": set(),
            "Date": set()
        }
        # Assume this is an email saved in HTML format
        if request.file_type == "code/html":
            parsed_html = BeautifulSoup(content_str, "lxml")
            valid_headers = [
                "To:", "Cc:", "Sent:", "From:", "Subject:", "Reply-To:"
            ]

            if not parsed_html.body or not any(header in parsed_html.body.text
                                               for header in valid_headers):
                # We can assume this is just an HTML doc (or lacking body), one of which we can't process
                request.result = Result()
                return

            # Can't trust 'Date' to determine the difference between HTML docs vs HTML emails
            valid_headers.append("Date:")

            html_email = email.message_from_bytes(content_str)
            generator_metadata_content = ""
            for meta in parsed_html.find_all("meta"):
                if meta.attrs.get("name", None) == "Generator":
                    generator_metadata_content = meta.attrs.get("content", "")
                    break

            # Process HTML emails generated from Outlook
            if generator_metadata_content == "Microsoft Word 15":
                paragraphs = parsed_html.body.find_all("p")
                # Likely an email that was exported with original email headers
                if any(header in paragraphs[0].text for header in valid_headers):
                    for p in paragraphs:
                        if any(valid_header in p.text
                               for valid_header in valid_headers):
                            h_key, h_value = p.text.replace(
                                "\xa0", "").replace("\r\n", " ").split(":", 1)
                            html_email[h_key] = h_value
                            # Subject line indicates the end of the email header, beginning of body
                            if "Subject" in p.text:
                                break
            # Process HTML emails from MS Exchange Server or missing top-level headers (aggregate headers)
            elif (generator_metadata_content
                  == "Microsoft Word 15 (filtered medium)"
                  or generator_metadata_content == "Microsoft Exchange Server"
                  or generator_metadata_content == ""):
                subject = None
                for div in parsed_html.find_all("div"):
                    # Header information within divs
                    if any(header in div.text for header in valid_headers
                           ) and "WordSection1" not in div.attrs.get(
                               "class", []):
                        # Usually expect headers to be \n separated in text output but check first
                        if "\n" in div.text:
                            for h in div.text.split("\n"):
                                if any(header in h
                                       for header in valid_headers):
                                    h_key, h_value = h.split(":", 1)

                                    # Implying some malformed message got mixed with the headers of another message
                                    if h_key not in valid_headers:
                                        for header in valid_headers:
                                            if header in h:
                                                h_key = header[:-1]

                                    # Use the latest message's subject (this maintains FW, RE, etc.)
                                    if h_key == "Subject" and not subject:
                                        subject = h_value
                                    elif h_key != "Subject":
                                        header_agg[h_key].add(h_value)

                        # Document was probably not well formatted, so we'll use the headers as delimiters
                        else:
                            header_offset_map = {}
                            # Determine the position of each header
                            for header in list(
                                    header_agg.keys()) + ["Subject"]:
                                if header in div.text:
                                    header_offset_map[div.text.index(
                                        header)] = header
                            # Use the positions and length of header name to determine an offset
                            for i in range(len(header_offset_map)):
                                sorted_keys = sorted(header_offset_map.keys())
                                header_name = header_offset_map[sorted_keys[i]]
                                offset = len(
                                    f"{header_name}: ") + sorted_keys[i]
                                value = (div.text[offset:sorted_keys[i + 1]]
                                         if i < len(header_offset_map) - 1 else
                                         div.text[offset:])

                                if header_name == "Subject":
                                    subject = value
                                else:
                                    header_agg[header_name].add(value)

                # Assign aggregated info to email object
                html_email["Subject"] = subject
                for key, value in header_agg.items():
                    html_email[key] = "; ".join(value)
            content_str = html_email.as_bytes()

        parsed_eml = parser.decode_email_bytes(content_str)
        result = Result()
        header = parsed_eml["header"]

        if "from" in header or "to" in header:
            all_uri = set()
            body_words = set(extract_passwords(header["subject"]))
            for body_counter, body in enumerate(parsed_eml["body"]):
                body_text = BeautifulSoup(body["content"], "lxml").text
                body_words.update(extract_passwords(body_text))
                if request.get_param("extract_body_text"):
                    fd, path = mkstemp()
                    with open(path, "w") as f:
                        f.write(body["content"])
                        os.close(fd)
                    request.add_extracted(path, "body_" + str(body_counter),
                                          "Body text")
                if "uri" in body:
                    for uri in body["uri"]:
                        all_uri.add(uri)
            # Words in the email body, used by extract to guess passwords
            request.temp_submission_data["email_body"] = list(body_words)

            kv_section = ResultSection("Email Headers",
                                       body_format=BODY_FORMAT.KEY_VALUE,
                                       parent=result)

            # Basic tags
            from_addr = header["from"].strip() if header.get("from",
                                                             None) else None
            if from_addr and re.match(EMAIL_REGEX, from_addr):
                kv_section.add_tag("network.email.address", from_addr)
            [
                kv_section.add_tag("network.email.address", to.strip())
                for to in header["to"] if re.match(EMAIL_REGEX, to.strip())
            ]

            kv_section.add_tag("network.email.date",
                               str(header["date"]).strip())

            subject = header["subject"].strip() if header.get("subject",
                                                              None) else None
            if subject:
                kv_section.add_tag("network.email.subject", subject)

            # Add CCs to body and tags
            if "cc" in header:
                [
                    kv_section.add_tag("network.email.address", cc.strip())
                    for cc in header["cc"] if re.match(EMAIL_REGEX, cc.strip())
                ]
            # Add Message ID to body and tags
            if "message-id" in header["header"]:
                kv_section.add_tag("network.email.msg_id",
                                   header["header"]["message-id"][0].strip())

            # Add Tags for received IPs
            if "received_ip" in header:
                for ip in header["received_ip"]:
                    ip = ip.strip()
                    try:
                        if isinstance(ip_address(ip), IPv4Address):
                            kv_section.add_tag("network.static.ip", ip)
                    except ValueError:
                        pass

            # Add Tags for received Domains
            if "received_domain" in header:
                for dom in header["received_domain"]:
                    kv_section.add_tag("network.static.domain", dom.strip())

            # If we've found URIs, add them to a section
            if len(all_uri) > 0:
                uri_section = ResultSection("URIs Found:", parent=result)
                for uri in all_uri:
                    uri_section.add_line(uri)
                    uri_section.add_tag("network.static.uri", uri.strip())
                    parsed_url = urlparse(uri)
                    if parsed_url.hostname and re.match(
                            IP_ONLY_REGEX, parsed_url.hostname):
                        uri_section.add_tag("network.static.ip",
                                            parsed_url.hostname)
                    else:
                        uri_section.add_tag("network.static.domain",
                                            parsed_url.hostname)
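                # Illustrative example of the branch above: "http://10.0.0.1/x"
                # tags network.static.ip, while "http://example.com/x" tags
                # network.static.domain (urlparse supplies the hostname).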

            # Bring all headers together...
            extra_header = header.pop("header", {})
            header.pop("received", None)
            header.update(extra_header)

            # Convert to common format
            header["date"] = [self.json_serial(header["date"])]

            # Replace with aggregated date(s) if any available
            if header_agg["Date"]:
                # Replace
                if any(
                        default_date in header["date"] for default_date in
                    ["1970-01-01T00:00:00", "Thu, 01 Jan 1970 00:00:00 +0000"
                     ]):
                    header["date"] = list(header_agg["Date"])
                # Append
                else:
                    header["date"] += list(header_agg["Date"])
                for date in header_agg["Date"]:
                    kv_section.add_tag("network.email.date", str(date).strip())

            # Filter out useless headers from results
            self.log.debug(header.keys())
            [header.pop(h) for h in self.header_filter if h in header.keys()]
            kv_section.set_body(json.dumps(header, default=self.json_serial))

            attachments_added = []
            if "attachment" in parsed_eml:
                attachments = parsed_eml["attachment"]
                for attachment in attachments:
                    fd, path = mkstemp()

                    with open(path, "wb") as f:
                        f.write(base64.b64decode(attachment["raw"]))
                        os.close(fd)
                    try:
                        if request.add_extracted(
                                path,
                                attachment["filename"],
                                "Attachment",
                                safelist_interface=self.api_interface):
                            attachments_added.append(attachment["filename"])
                    except MaxExtractedExceeded:
                        self.log.warning(
                            f"Extract limit reached on attachments: "
                            f"{len(attachments) - len(attachments_added)} not added"
                        )
                        break
                ResultSection("Extracted Attachments:",
                              body="\n".join([x for x in attachments_added]),
                              parent=result)

            if request.get_param("save_emlparser_output"):
                fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
                attachments = parsed_eml.get("attachment", [])
                # Remove raw attachments, all attachments up to MaxExtractedExceeded already extracted
                for attachment in attachments:
                    _ = attachment.pop("raw", None)
                with os.fdopen(fd, "w") as myfile:
                    myfile.write(
                        json.dumps(parsed_eml, default=self.json_serial))
                request.add_supplementary(
                    temp_path, "parsing.json",
                    "These are the raw results of running GOVCERT-LU's eml_parser"
                )
        else:
            self.log.warning(
                "emlParser could not parse EML; no useful information in result's headers"
            )

        request.result = result
Example #25
    def __init__(self,
                 i,
                 request=None,
                 result=None,
                 working_directory=None,
                 logger=None):

        self.request = request
        self.result = result
        self.working_directory = working_directory
        self.log = logger

        if result:
            self.working_result = ResultSection(
                "Image Steganography Module Results:",
                body_format=BODY_FORMAT.MEMORY_DUMP)
        else:
            # No AL result supplied, so there is no section to attach output to
            self.working_result = None

        # Currently only supporting 8-bit pixel modes
        self.pixel_size = 8
        supported_modes = {
            'CMYK': 4,
            'P': 1,
            'RGB': 3,
            'RGBA': 4,
        }

        # Pillow only handles non-corrupt images, so give it its best shot and
        # bail out on error
        try:
            img = Image.open(i)
        except Exception:
            raise NotSupported()

        try:
            self.iformat = img.format
            self.imode = img.mode.upper()
            self.isize = img.size
        except Exception:
            raise NotSupported()

        if not self.iformat and not self.imode and not self.isize:
            # Something likely wrong
            raise NotSupported()

        if self.imode not in supported_modes:
            if not self.result:
                self.log.warning(
                    "{} image mode not currently supported for steganalysis modules"
                    .format(self.imode))
                exit()
            else:
                self.log.warning("Not a supported image mode: {}".format(
                    self.imode))
                raise NotSupported()
        else:
            self.channels_to_process = supported_modes[self.imode]

        if result:
            pil_result = ResultSection("Pillow Image Data:",
                                       body_format=BODY_FORMAT.MEMORY_DUMP)
            if self.iformat:
                pil_result.add_line("Format:\t {}".format(self.iformat))
            if self.imode:
                pil_result.add_line("Mode:\t {}".format(self.imode))
                pil_result.add_tag('file.img.mode', self.imode)
            if self.isize:
                pil_result.add_line("Size:\t {}x{}".format(
                    self.isize[0], self.isize[1]))
                pil_result.add_tag(
                    'file.img.size', "{}x{}".format(self.isize[0],
                                                    self.isize[1]))
            self.result.add_section(pil_result)

        try:
            self.ipixels = iter(img.getdata())
        except Exception:
            raise NotSupported()

        try:
            img = Image.open(i)
            self.iobject = img.load()
        except Exception:
            raise NotSupported()

        self.binary_pixels = [
            self.convert_binary_string(self.imode, self.channels_to_process,
                                       self.ipixels)
        ]
        self.pixel_count = (self.isize[0] * self.isize[1] *
                            self.channels_to_process)

        # Chunk size equals (#bytes * 8) bits divided by the number of bit-values
        # per pixel. With 8 bits per pixel, testing every 512 bytes of data means a
        # chunk size of (512 * 8) / 8 == 512 pixels examined per chunk.
        # Optimize chunk size if this is being run through AL
        if request is not None:
            # Integer division so the guard below can catch very small images
            maximizer = self.pixel_count // 20000
            if maximizer == 0:
                maximizer = 1
            self.chunk = 128 * maximizer
        else:
            self.chunk = 256

        self.chunk = int(self.chunk)
        # total chunk bits/8
        self.chunk_bytes = (self.chunk * self.pixel_size *
                            self.channels_to_process) / 8
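
        # Worked example (illustrative): a 1000x1000 RGB image gives
        # pixel_count = 1000 * 1000 * 3 = 3,000,000 values, so
        # maximizer = 3,000,000 // 20,000 = 150 and chunk = 128 * 150 = 19,200
        # pixels per chunk; chunk_bytes = (19200 * 8 * 3) / 8 = 57,600 bytes.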
    def analyze_pdf(self,
                    request,
                    res_txt,
                    path,
                    working_dir,
                    heur,
                    additional_keywords,
                    get_malform=True):
        """Extract metadata, keyword objects and content of interest from a PDF sample using PDFId, PDFId plugins,
        and PDF Parser.

        Args:
            request: AL request object.
            res_txt: Header string for AL result section title.
            path: Original PDF sample path.
            working_dir: AL working directory.
            heur: List of plugins to run on PDFId results (provided in service configuration).
            additional_keywords: List of additional keywords to be searched (provided in service configuration).
            get_malform: Extract malformed objects from PDF.

        Returns:
            AL result object, AL heuristics list to add to result, list of object streams (objstms), and an errors list.
        """
        triage_keywords = set()
        all_errors = set()
        embed_present = False
        objstms = False
        res = ResultSection(title_text=res_txt)
        carved_extracted_shas = set()

        if request.deep_scan:
            run_pdfparse = True
        else:
            run_pdfparse = False

        # Run PDFId
        try:
            pdfid_result, errors = self.get_pdfid(path, additional_keywords,
                                                  heur, request.deep_scan)
        except Exception as e:
            raise NonRecoverableError(e)
        # Parse PDFId results
        pdfidres = ResultSection(title_text="PDFID Results", parent=res)
        if len(pdfid_result) == 0:
            pdfidres.add_line(
                "No results generated for file. Please see errors.")
        else:
            # Do not run for objstms, which are being analyzed when get_malform == False
            if get_malform:
                version = pdfid_result.get("PDFID", None)
                if version:
                    pdfidres.add_line(version[0])
                properties = pdfid_result.get("Properties", None)
                if properties:
                    pres = ResultSection(title_text="PDF Properties",
                                         parent=pdfidres)
                    for plist in properties:
                        pres.add_line("{0}: {1}".format(plist[0], plist[1]))
                        if plist[0] == "/ModDate":
                            pres.add_tag('file.pdf.date.modified', plist[1])
                        elif plist[0] == "/CreationDate":
                            pres.add_tag('file.date.creation', plist[1])
                        elif plist[0] == "/LastModified":
                            pres.add_tag('file.date.last_modified', plist[1])
                        elif plist[0] == "/SourceModified":
                            pres.add_tag('file.pdf.date.source_modified',
                                         plist[1])
                        elif plist[0] == "/pdfx":
                            pres.add_tag('file.pdf.date.pdfx', plist[1])
                entropy = pdfid_result.get("Entropy", None)
                if entropy:
                    enres = ResultSection(title_text="Entropy",
                                          parent=pdfidres)
                    for enlist in entropy:
                        enres.add_line("{0}: {1}, ({2})".format(
                            enlist[0], enlist[1], enlist[2]))
            flags = pdfid_result.get("Flags", None)
            if flags:
                fres = ResultSection(title_text="PDF Keyword Flags",
                                     parent=pdfidres)
                for flist in flags:
                    if flist[0] == "/ObjStm":
                        objstms = True
                    if len(flist) == 3:
                        fres.add_line(
                            "{0}:Count: {1}, Hex-Encoded Count: {2}".format(
                                flist[0], flist[1], flist[2]))
                    else:
                        fres.add_line("{0}:Count: {1}".format(
                            flist[0], flist[1]))
                    fres.add_tag('file.string.extracted',
                                 flist[0].replace("/", "", 1))
                    if flist[0] in additional_keywords:
                        triage_keywords.add(flist[0].replace("/", "", 1))

            plugin = pdfid_result.get("Plugin", [])

            # If any plugin results, or flagged keywords found, run PDF Parser
            if plugin or len(triage_keywords) > 0:
                run_pdfparse = True

            for pllist in plugin:
                pl_name, pl_heur, pl_text = pllist
                pl_heur = int(pl_heur)
                pl_text = pl_text[14:]
                if not pl_text or pl_text == "None":
                    continue

                if pl_name in ['EmbeddedFile', 'Name Obfuscation']:
                    modres = ResultSection(title_text=pl_text, parent=pdfidres)

                    if pl_heur > 0:
                        modres.set_heuristic(pl_heur)

                    if pl_name == 'EmbeddedFile':
                        embed_present = True

                elif pl_name in ['Triage', 'Suspicious Properties']:
                    javascript_found = False
                    for line in pl_text.splitlines():
                        lineres = ResultSection(title_text=line)
                        # Triage results
                        if '/JavaScript' in line:
                            triage_keywords.add('JavaScript')
                            if not javascript_found:
                                lineres.set_heuristic(19)
                                javascript_found = True
                        elif '/JS' in line:
                            triage_keywords.add('JS')
                            if not javascript_found:
                                lineres.set_heuristic(19)
                                javascript_found = True
                        elif '/JBIG2Decode' in line:
                            triage_keywords.add('JBIG2Decode')
                            lineres.set_heuristic(3)
                        elif '/Colors > 2^24' in line:
                            triage_keywords.add('Colors > 2^24')
                            lineres.set_heuristic(20)
                        elif '/AA' in line:
                            triage_keywords.add('AA')
                            lineres.set_heuristic(1)
                        elif '/Launch' in line:
                            triage_keywords.add('Launch')
                            lineres.set_heuristic(1)
                        elif '/OpenAction' in line:
                            triage_keywords.add('OpenAction')
                            lineres.set_heuristic(1)
                        elif '/GoToE' in line:
                            triage_keywords.add('GoToE')
                            lineres.set_heuristic(21)
                        elif '/GoToR' in line:
                            triage_keywords.add('GoToR')
                            lineres.set_heuristic(22)
                        elif '/Encrypt' in line:
                            triage_keywords.add('Encrypt')
                            lineres.set_heuristic(11)
                        elif '/AcroForm' in line:
                            triage_keywords.add('AcroForm')
                            lineres.set_heuristic(4)
                        elif '/RichMedia' in line:
                            triage_keywords.add('RichMedia')
                            lineres.set_heuristic(5)
                        elif '/XFA' in line:
                            triage_keywords.add('XFA')
                            lineres.set_heuristic(23)
                        elif '/Annot' in line:
                            triage_keywords.add('Annot')
                            lineres.set_heuristic(25)
                        elif '/ObjStm' in line:
                            triage_keywords.add('ObjStm')
                            lineres.set_heuristic(7)
                        elif '/URI' in line:
                            triage_keywords.add('URI')
                            lineres.set_heuristic(24)

                        # Suspicious properties results
                        elif "eof2" in line:
                            lineres.set_heuristic(2)
                        elif "eof5" in line:
                            lineres.set_heuristic(17)
                        elif "page" in line:
                            lineres.set_heuristic(26)
                        elif "entropy" in line:
                            lineres.set_heuristic(12)
                        elif "obj/endobj" in line:
                            lineres.set_heuristic(13)
                        elif "stream/endstream" in line:
                            lineres.set_heuristic(14)

                        if lineres.heuristic is not None:
                            pdfidres.add_subsection(lineres)

        for e in errors:
            all_errors.add(e)
            if e.startswith('Error running plugin'):
                self.log.warning(e)

        if run_pdfparse:
            # CALL PDF parser and extract further information
            pdf_parserres = ResultSection(title_text="PDF Parser Results")
            # STATISTICS
            # Do not run for objstms, which are being analyzed when get_malform == False
            if get_malform:
                options = {
                    "stats": True,
                }
                pdf_parser_result, errors = self.get_pdf_parser(
                    path, working_dir, options)

                if pdf_parser_result:
                    if len(pdf_parser_result) == 0:
                        pdf_parserres.add_line(
                            "No statistical results generated for file. Please see errors."
                        )
                    else:
                        version = pdf_parser_result.get("version", None)
                        if version and version[0] != '0':
                            pdf_parserres.add_line(version[0])
                        stats = pdf_parser_result.get("stats", None)
                        if stats:
                            sres = ResultSection(
                                title_text="PDF Statistcs",
                                parent=pdf_parserres,
                                body_format=BODY_FORMAT.MEMORY_DUMP)
                            for p in stats:
                                sres.add_line(p)
                    for e in errors:
                        all_errors.add(e)

            # Triage plugin -- search sample for keywords and carve content or extract object (if it contains a stream)
            carved_content = {}  # Format: { "objnum": [{keyword: content}, ...] }
            obj_extract_triage = set()
            jbig_objs = set()

            for keyword in triage_keywords:
                # ObjStms handled differently
                if keyword == 'ObjStm':
                    continue

                options = {
                    "search": keyword,
                }
                pdf_parser_result, errors = self.get_pdf_parser(
                    path, working_dir, options)

                if pdf_parser_result:
                    for p in pdf_parser_result['parts']:
                        content = ""
                        references = []
                        # Trailer will be extracted anyways, try and grab all references anyways -- will be messy
                        if p.startswith("trailer:"):
                            # Grab the content after the keyword
                            # Check that keyword actually in content
                            if "/{}".format(keyword) in p:
                                try:
                                    content = p.split(keyword, 1)[1].replace(
                                        '>>++>>', '').split("/", 1)[0].strip()
                                    references = re.findall(
                                        "[0-9]* [0-9]* R", content)
                                except Exception:
                                    continue
                        # If not trailer, should be object
                        elif 'Referencing:' in p:
                            # Grab the content after the keyword
                            if '>>++>>' in p:
                                try:
                                    content = p.split(keyword, 1)[1].replace(
                                        '>>++>>', '').strip()
                                except Exception:
                                    try:
                                        content = p.split("\n", 3)[3]
                                    except Exception:
                                        content = p
                            else:
                                try:
                                    content = p.split("\n", 3)[3]
                                except Exception:
                                    content = p
                            # Sometimes the content is the same keyword with
                            # references (i.e. "/URI /URI 10 0 R")
                            if content.startswith("/{}".format(keyword)):
                                try:
                                    content = re.sub("/{}[ ]*".format(keyword),
                                                     "", content, 1)
                                except Exception:
                                    pass
                            try:
                                references = p.split("\n", 3)[2].replace(
                                    'Referencing:', '').strip().split(", ")
                            except Exception:
                                pass
                        # Only extract JBIG2Decode objects with deep scan, but always report on their presence
                        if keyword == "JBIG2Decode" and "/Filter" in p and "Contains stream" in p:
                            try:
                                objnum = p.split("\n", 1)[0].split(" ")[1]
                                if request.deep_scan:
                                    obj_extract_triage.add(objnum)
                                jbig_objs.add(objnum)
                                continue
                            except Exception as e:
                                self.log.debug(e)
                                continue
                        # If no content, then keyword likely points to reference objects, so grab those
                        if content == '':
                            if len(references) > 0:
                                content = references
                            else:
                                # Something is wrong, drop it.
                                continue
                        else:
                            while True:
                                # Multiple references might be in a list, i.e. /Annot # # R vs. /Annots [# # R # # R]
                                islist = re.match(
                                    r"[s]?[ ]?\[([0-9]* [0-9]* R[ \\rn]{0,8})*\]",
                                    content)
                                if islist:
                                    content = re.sub(
                                        r"[\[\]]", "",
                                        islist.group(0).replace(
                                            "s ", '').replace("R ",
                                                              "R,")).split(",")
                                    break
                                # References might be with instructions, i.e. [# # R /FitH null]
                                withinst = re.match(
                                    r"[s]?[ \\']{0,3}\[[ ]?([0-9]* [0-9]* R)[ \\rn]{1,8}"
                                    r"[/a-zA-Z0-9 ]*[ ]?\]", content)
                                if withinst:
                                    content = [withinst.group(1)]
                                    break
                                content = [content]
                                break
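                            # Illustrative trace of the branches above:
                            #   "[1 0 R 2 0 R]"       -> islist   -> ["1 0 R", "2 0 R"]
                            #   "[1 0 R /FitH null]"  -> withinst -> ["1 0 R"]
                            #   anything else         -> wrapped as a one-element list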
                        for c in content:
                            # If keyword = Javascript and content starts with '/JS', disregard as 'JS' will be extracted
                            if "JS" in triage_keywords and keyword == "JavaScript" and "/JS" in c[
                                    0:5]:
                                continue
                            if c in references or re.match(
                                    "[0-9]* [0-9]* R", c):
                                try:
                                    ref_obj = c.split(" ", 1)[0]
                                    options = {
                                        "object": ref_obj,
                                        "get_object_detail": True
                                    }
                                    pdf_parser_subresult, err = self.get_pdf_parser(
                                        path, working_dir, options)

                                    if pdf_parser_subresult:
                                        for sub_p in pdf_parser_subresult[
                                                'parts']:
                                            sub_references = sub_p.split("\n", 3)[2].replace('Referencing:', '')\
                                                .strip().split(", ")
                                            ptyp = sub_p.split(
                                                "\n", 2)[1].replace(
                                                    'Type:',
                                                    '').strip().replace(
                                                        "/", "")
                                            # If the object contains a stream, extract the object.
                                            if "Contains stream" in sub_p:
                                                try:
                                                    objnum = sub_p.split(
                                                        "\n",
                                                        1)[0].split(" ")[1]
                                                    obj_extract_triage.add(
                                                        objnum)
                                                except Exception:
                                                    pass
                                            # Or if the object Type is the keyword, grab all referenced objects.
                                            elif sub_references[0] != '' and len(sub_references) >= 1 \
                                                    and ptyp == keyword:
                                                for sr in sub_references:
                                                    try:
                                                        objnum = sr.split(
                                                            " ", 1)[0]
                                                        obj_extract_triage.add(
                                                            objnum)
                                                    except Exception:
                                                        pass
                                            # If not, extract object detail in to carved output
                                            elif pdf_parser_subresult[
                                                    'obj_details'] != "":
                                                try:
                                                    objnum = sub_p.split(
                                                        "\n",
                                                        1)[0].split(" ")[1]
                                                    if objnum in carved_content:
                                                        carved_content[objnum]\
                                                            .append({keyword: pdf_parser_subresult['obj_details']})
                                                    else:
                                                        carved_content[objnum] = \
                                                            [{keyword: pdf_parser_subresult['obj_details']}]
                                                except Exception:
                                                    continue

                                    for e in err:
                                        errors.add(e)
                                except Exception:
                                    # If none of that work, just extract the original object for examination.
                                    try:
                                        objnum = p.split("\n",
                                                         1)[0].split(" ")[1]
                                        obj_extract_triage.add(objnum)
                                    except Exception:
                                        pass
                            # If content does not look like a reference:
                            else:
                                if p.startswith("trailer:"):
                                    continue
                                objnum = p.split("\n", 1)[0].split(" ")[1]
                                # If the object contains a stream extract the object
                                if p.split("\n", 4)[3] == "Contains stream":
                                    obj_extract_triage.add(objnum)
                                else:
                                    # Or just carve the content
                                    if objnum in carved_content:
                                        carved_content[objnum].append(
                                            {keyword: c})
                                    else:
                                        carved_content[objnum] = [{keyword: c}]

                    for e in errors:
                        all_errors.add(e)

            # Add carved content to result output
            show_content_of_interest = False
            if len(carved_content) > 0 or len(jbig_objs) > 0:
                carres = ResultSection(title_text="Content of Interest")
            else:
                carres = None

            if len(jbig_objs) > 0:
                jbigres = ResultSection(
                    title_text=
                    "The following Object IDs are JBIG2DECODE streams:",
                    body_format=BODY_FORMAT.MEMORY_DUMP,
                    parent=carres)
                jbigres.add_line(', '.join(map(str, jbig_objs)))
                show_content_of_interest = True

            if len(carved_content) > 0:
                for k, l in sorted(carved_content.items()):
                    for d in l:
                        for keyw, con in d.items():
                            subres = ResultSection(
                                title_text="Object {0}: Hits for Keyword '{1}':"
                                .format(k, keyw))
                            subres.set_heuristic(8)

                            con_bytes = con.encode()
                            if len(con) < 500:
                                subres.body_format = BODY_FORMAT.MEMORY_DUMP
                                subres.add_line(con)

                                # Check for IOC content
                                patterns = PatternMatch()
                                st_value = patterns.ioc_match(con_bytes,
                                                              bogon_ip=True)
                                if len(st_value) > 0:
                                    carres.add_subsection(subres)
                                    show_content_of_interest = True
                                    for ty, val in st_value.items():
                                        if val == "":
                                            asc_asc = unicodedata.normalize(
                                                'NFKC',
                                                val).encode('ascii', 'ignore')
                                            subres.add_tag(ty, asc_asc)
                                        else:
                                            ulis = list(set(val))
                                            for v in ulis:
                                                subres.add_tag(ty, v)
                            else:
                                crv_sha = hashlib.sha256(con_bytes).hexdigest()

                                if crv_sha not in carved_extracted_shas:
                                    f_name = "carved_content_obj_{}_{}".format(
                                        k, crv_sha[0:7])
                                    subres.add_lines([
                                        "Content over 500 bytes; it will be extracted for analysis",
                                        "Name: {} - SHA256: {}".format(
                                            f_name, crv_sha)
                                    ])
                                    carres.add_subsection(subres)
                                    show_content_of_interest = True
                                    crvf = os.path.join(
                                        self.working_directory, f_name)
                                    with open(crvf, 'wb') as f:
                                        f.write(con_bytes)
                                    request.add_extracted(
                                        crvf, os.path.basename(crvf),
                                        "Extracted content from object {}".
                                        format(k))
                                    carved_extracted_shas.add(crv_sha)

            if show_content_of_interest:
                pdf_parserres.add_subsection(carres)

            # ELEMENTS
            # Do not show for objstms
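            # The 'elements' string selects pdf-parser.py element types; per
            # Didier Stevens' pdf-parser, the letters map to c=comment,
            # t=trailer, s=startxref and i=indirect object. Deep scans skip
            # the element filter entirely, and 'i' is only requested when
            # embedded files were detected.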
            if get_malform:
                if request.deep_scan:
                    options = {
                        "verbose": True,
                        "nocanonicalizedoutput": True,
                        "get_malform": get_malform
                    }
                elif embed_present:
                    options = {
                        "verbose": True,
                        "elements": "ctsi",
                        "type": "/EmbeddedFile",
                        "get_malform": get_malform
                    }
                else:
                    options = {
                        "verbose": True,
                        "elements": "cst",
                        "get_malform": get_malform
                    }
                pdf_parser_result, errors = self.get_pdf_parser(
                    path, working_dir, options)

                embed_extracted = set()
                if pdf_parser_result:
                    if len(pdf_parser_result) == 0:
                        pdf_parserres.add_line(
                            "No structure information generated for file. Please see errors."
                        )
                    else:
                        # PDF Parser will write any malformed content over 100 bytes to a file
                        files = pdf_parser_result.get("files", None)
                        if files:
                            for f, l in files.items():
                                if f == 'malformed':
                                    if len(l) > 0:
                                        pdf_parserres.set_heuristic(6)
                                    for i in l:
                                        request.add_extracted(
                                            i, os.path.basename(i),
                                            "Extracted malformed content in PDF Parser Analysis."
                                        )

                        parts = pdf_parser_result.get("parts", None)
                        # Extract service will extract the sample's embedded files.
                        # However we want to make note of them so that they are not extracted again below
                        if parts:
                            for p in sorted(parts):
                                if "Type: /EmbeddedFile" in p:
                                    getobj = p.split("\n", 1)[0].split(" ")[1]
                                    embed_extracted.add(getobj)

                # Extract objects collected from above analysis
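                # Set difference: skip objects already pulled out as embedded
                # files above, and JBIG2 streams, which get their own
                # deep-scan handling below.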
                obj_to_extract = obj_extract_triage - embed_extracted - jbig_objs

                if len(obj_to_extract) > 0:
                    options = {
                        "filter": True,
                        "object": obj_to_extract,
                        "dump": "extracted_obj_",
                    }
                    pdf_parser_result, errors = self.get_pdf_parser(
                        path, working_dir, options)

                    if pdf_parser_result:
                        files = pdf_parser_result.get("files", None)
                        extracted_files = []
                        if files:
                            for f, l in files.items():
                                if f == 'embedded':
                                    for i in l:
                                        f_name = os.path.basename(i)
                                        obj_id = f_name.replace(
                                            "extracted_obj_", "")
                                        extracted_files.append(
                                            "Extracted object {} as {}".format(
                                                obj_id, f_name))
                                        request.add_extracted(
                                            i, f_name,
                                            "Object {} extracted in PDF Parser Analysis."
                                            .format(obj_id))
                        for e in errors:
                            all_errors.add(e)

                        if extracted_files:
                            extract_res = ResultSection(
                                title_text="Extracted embedded objects",
                                parent=pdf_parserres)
                            extract_res.set_heuristic(9)
                            extract_res.add_lines(extracted_files)

                # Extract jbig2decode objects in deep scan mode
                if request.deep_scan and len(jbig_objs) > 0:
                    options = {
                        "object": jbig_objs,
                        "dump": "extracted_jb_obj_",
                    }
                    pdf_parser_result, errors = self.get_pdf_parser(
                        path, working_dir, options)

                    if pdf_parser_result:
                        extracted_jb = []
                        files = pdf_parser_result.get("files", None)
                        if files:
                            for f, l in files.items():
                                if f == 'embedded':
                                    for i in l:
                                        f_name = os.path.basename(i)
                                        obj_id = f_name.replace(
                                            "extracted_jb_obj_", "")
                                        extracted_jb.append(
                                            "JBIG2DECODE object {} extracted as {}"
                                            .format(obj_id, f_name))
                                        request.add_extracted(
                                            i, f_name,
                                            "JBIG2DECODE object {} extracted in PDF Parser Analysis."
                                            .format(obj_id))

                        for e in errors:
                            all_errors.add(e)

                        if extracted_jb:
                            jbig_extract_res = ResultSection(
                                title_text="Extracted JBIG2Decode objects",
                                parent=pdf_parserres)
                            jbig_extract_res.set_heuristic(9)
                            jbig_extract_res.add_lines(extracted_jb)

            if len(pdf_parserres.subsections) > 0:
                res.add_subsection(pdf_parserres)

        return res, objstms, all_errors
Example #27
    def execute(self, request):
        """Main Module. See README for details."""
        global imginfo
        result = Result()
        request.result = result
        self.sha = request.sha256
        infile = request.file_path
        run_steg = request.get_param('run_steg')

        # Run image-specific modules
        supported_images = re.compile('image/(bmp|gif|jpeg|jpg|png)')
        if re.match(supported_images, request.file_type):
            # Extract img info using Pillow (already available in steg.py) and determine if steg modules should be run
            if self.config['run_steg_auto'] or run_steg:
                decloak = True
            else:
                decloak = False
            try:
                imginfo = ImageInfo(infile, request, result,
                                    self.working_directory, self.log)
            except NotSupported:
                decloak = False

            # Run Tesseract on sample
            # Process the command and save the csv result in the result object
            usable_out = None
            orig_outfile = os.path.join(self.working_directory, 'outfile')
            stdout, stderr = self.tesseract_call(infile, orig_outfile)

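            # Tesseract writes warnings such as "pix too small" to stderr when
            # the image is below its preferred resolution; in that case retry
            # once against an enlarged copy, falling back to the original
            # output on any conversion or OCR failure.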
            if stdout or stderr:
                # Assess Tesseract warnings
                if b"pix too small" in stderr:
                    # Make the image larger with convert command
                    c_outfile = os.path.join(self.working_directory,
                                             'enlrg_img')
                    c_stdout, c_stderr = self.convert_img(infile, c_outfile)
                    if c_stdout:
                        c_outfile = os.path.join(self.working_directory,
                                                 'c_outfile')
                        # OCR the enlarged image written by convert_img above
                        enlrg_infile = os.path.join(self.working_directory,
                                                    'enlrg_img')
                        if not c_stderr:
                            stdout, stderr = self.tesseract_call(
                                enlrg_infile, c_outfile)
                            if stdout:
                                if not stderr:
                                    outfile = c_outfile
                                else:
                                    outfile = orig_outfile
                            else:
                                outfile = orig_outfile
                        else:
                            outfile = orig_outfile
                    else:
                        outfile = orig_outfile
                else:
                    outfile = orig_outfile
                    self.log.debug(
                        "Tesseract errored/warned on sample {}. Error:{}".
                        format(self.sha, stderr))

                usable_out = self.assess_output(outfile, request)

            if usable_out:
                ores = ResultSection("OCR Engine detected strings in image",
                                     body_format=BODY_FORMAT.MEMORY_DUMP)
                ores.add_line("Text preview (up to 500 bytes):\n")
                ores.add_line("{}".format(usable_out[0:500]))
                result.add_section(ores)
            # Find attached data
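            # Bytes trailing the image's end-of-file marker are a common way
            # to smuggle payloads, so anything found there is carved out.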
            additional_content = self.find_additional_content(infile)
            if additional_content:
                ares = (ResultSection("Possible Appended Content Found",
                                      body_format=BODY_FORMAT.MEMORY_DUMP))
                ares.add_line(
                    "{} Bytes of content found at end of image file".format(
                        len(additional_content)))
                ares.add_line("Text preview (up to 500 bytes):\n")
                ares.add_line("{}".format(safe_str(additional_content)[0:500]))
                ares.set_heuristic(2)
                result.add_section(ares)
                file_name = "{}_appended_img_content".format(
                    hashlib.sha256(additional_content).hexdigest()[0:10])
                file_path = os.path.join(self.working_directory, file_name)
                # Write the carved bytes to disk before registering the
                # extraction, since add_extracted expects the file to exist
                with open(file_path, 'wb') as unibu_file:
                    unibu_file.write(additional_content)
                request.add_extracted(file_path, file_name,
                                      "Carved content found at end of image.")
            # Steganography modules
            if decloak:
                if request.deep_scan:
                    imginfo.decloak()
    def execute(self, request):
        # --- Setup ----------------------------------------------------------------------------------------------
        request.result = Result()
        patterns = PatternMatch()

        if request.deep_scan:
            max_attempts = 100
        else:
            max_attempts = 10

        self.files_extracted = set()
        self.hashes = set()
        before = set()

        # --- Pre-Processing --------------------------------------------------------------------------------------
        # Get all IOCs prior to de-obfuscation
        pat_values = patterns.ioc_match(request.file_contents,
                                        bogon_ip=True,
                                        just_network=False)
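        # Record every IOC present before de-obfuscation in 'before' so that
        # only genuinely new indicators are reported on the final layer.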
        if pat_values:
            if request.get_param('extract_original_iocs'):
                ioc_res = ResultSection(
                    "The following IOCs were found in the original file",
                    parent=request.result,
                    body_format=BODY_FORMAT.MEMORY_DUMP)
            else:
                ioc_res = None
            for k, val in pat_values.items():
                if val == "":
                    asc_asc = unicodedata.normalize('NFKC', val).encode(
                        'ascii', 'ignore')
                    if ioc_res:
                        ioc_res.add_line(
                            f"Found {k.upper().replace('.', ' ')}: {safe_str(asc_asc)}"
                        )
                        ioc_res.add_tag(k, asc_asc)
                    before.add((k, asc_asc))
                else:
                    for v in val:
                        if ioc_res:
                            ioc_res.add_line(
                                f"Found {k.upper().replace('.', ' ')}: {safe_str(v)}"
                            )
                            ioc_res.add_tag(k, v)
                        before.add((k, v))

        # --- Prepare Techniques ----------------------------------------------------------------------------------
        techniques = [
            ('MSOffice Embedded script', self.msoffice_embedded_script_string),
            ('CHR and CHRB decode', self.chr_decode),
            ('String replace', self.string_replace),
            ('Powershell carets', self.powershell_carets),
            ('Array of strings', self.array_of_strings),
            ('Fake array vars', self.vars_of_fake_arrays),
            ('Reverse strings', self.str_reverse),
            ('B64 Decode', self.b64decode_str),
            ('Simple XOR function', self.simple_xor_function),
        ]
        second_pass = [('Concat strings', self.concat_strings),
                       ('MSWord macro vars', self.mswordmacro_vars),
                       ('Powershell vars', self.powershell_vars),
                       ('Charcode hex', self.charcode_hex)]
        final_pass = [
            ('Charcode', self.charcode),
        ]
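        # Three tiers of techniques: 'techniques' runs on every pass,
        # 'second_pass' entries are promoted into the rotation once a pass
        # makes no progress, and 'final_pass' (charcode) runs a single time
        # at the end, presumably because it is too noisy to iterate on.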

        code_extracts = [('.*html.*', "HTML scripts extraction",
                          self.extract_htmlscript)]

        layers_list = []
        layer = request.file_contents

        # --- Stage 1: Script Extraction --------------------------------------------------------------------------
        for pattern, name, func in code_extracts:
            if re.match(re.compile(pattern), request.task.file_type):
                extracted_parts = func(request.file_contents)
                layer = b"\n".join(extracted_parts).strip()
                layers_list.append((name, layer))
                break

        # --- Stage 2: De-obfuscation -----------------------------------------------------------------------------
        idx = 0
        first_pass_len = len(techniques)
        layers_count = len(layers_list)
        while True:
            if idx > max_attempts:
                final_pass.extend(techniques)
                for name, technique in final_pass:
                    res = technique(layer)
                    if res:
                        layers_list.append((name, res))
                break
            for name, technique in techniques:
                res = technique(layer)
                if res:
                    layers_list.append((name, res))
                    # Looks like it worked, restart with new layer
                    layer = res
            # If the layers haven't changed in a pass, break
            if layers_count == len(layers_list):
                if len(techniques) != first_pass_len:
                    final_pass.extend(techniques)
                    for name, technique in final_pass:
                        res = technique(layer)
                        if res:
                            layers_list.append((name, res))
                    break
                else:
                    for x in second_pass:
                        techniques.insert(0, x)
            layers_count = len(layers_list)
            idx += 1

        # --- Compiling results ----------------------------------------------------------------------------------
        if len(layers_list) > 0:
            extract_file = False
            num_layers = len(layers_list)
            heur_id = None

            # Compute heuristic
            if num_layers < 5:
                heur_id = 1
            elif num_layers < 10:
                heur_id = 2
            elif num_layers < 50:
                heur_id = 3
            elif num_layers < 100:
                heur_id = 4
            else:
                heur_id = 5

            # Cleanup final layer
            clean = self.clean_up_final_layer(layers_list[-1][1])
            if clean != request.file_contents:
                # Check for new IOCs
                pat_values = patterns.ioc_match(clean,
                                                bogon_ip=True,
                                                just_network=False)
                diff_tags = {}

                for k, val in pat_values.items():
                    if val == "":
                        asc_asc = unicodedata.normalize('NFKC', val).encode(
                            'ascii', 'ignore')
                        if (k, asc_asc) not in before:
                            diff_tags.setdefault(k, [])
                            diff_tags[k].append(asc_asc)
                    else:
                        for v in val:
                            if (k, v) not in before:
                                diff_tags.setdefault(k, [])
                                diff_tags[k].append(v)

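                # Extract the final layer when deep scanning, when heavy
                # obfuscation (heur_id >= 4) yielded a sizeable payload, or
                # when new IOCs surfaced.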
                if request.deep_scan or \
                        (len(clean) > 1000 and heur_id >= 4) or diff_tags:
                    extract_file = True

                # Display obfuscation steps
                mres = ResultSection(
                    "De-obfuscation steps taken by DeobfuScripter",
                    parent=request.result)
                if heur_id:
                    mres.set_heuristic(heur_id)

                lcount = Counter([x[0] for x in layers_list])
                for l, c in lcount.items():
                    mres.add_line(f"{l}, {c} time(s).")

                # Display final layer
                byte_count = 5000
                if extract_file:
                    # Save extracted file
                    byte_count = 500
                    fn = f"{request.file_name}_decoded_final"
                    fp = os.path.join(self.working_directory, fn)
                    with open(fp, 'wb') as dcf:
                        dcf.write(clean)
                        self.log.debug(
                            f"Submitted dropped file for analysis: {fp}")
                    request.add_extracted(fp, fn, "Final deobfuscation layer")

                ResultSection(f"First {byte_count} bytes of the final layer:",
                              body=safe_str(clean[:byte_count]),
                              body_format=BODY_FORMAT.MEMORY_DUMP,
                              parent=request.result)

                # Display new IOCs from final layer
                if len(diff_tags) > 0:
                    ioc_new = ResultSection(
                        "New IOCs found after de-obfustcation",
                        parent=request.result,
                        body_format=BODY_FORMAT.MEMORY_DUMP)
                    has_network_heur = False
                    for ty, val in diff_tags.items():
                        for v in val:
                            if "network" in ty:
                                has_network_heur = True
                            ioc_new.add_line(
                                f"Found {ty.upper().replace('.', ' ')}: {safe_str(v)}"
                            )
                            ioc_new.add_tag(ty, v)

                    if has_network_heur:
                        ioc_new.set_heuristic(7)
                    else:
                        ioc_new.set_heuristic(6)

                if len(self.files_extracted) > 0:
                    ext_file_res = ResultSection(
                        "The following files were extracted during the deobfuscation",
                        heuristic=Heuristic(8),
                        parent=request.result)
                    for f in self.files_extracted:
                        ext_file_res.add_line(os.path.basename(f))
                        request.add_extracted(
                            f, os.path.basename(f),
                            "File of interest deobfuscated from sample")
def tag_data(data, data_deobfuscated, result_ioc, result_formula):
    pattern = PatternMatch()

    # Get all IoCs without deobfuscation
    ioc_dict = {}
    formulas = collections.OrderedDict()
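    # Each CELL line is assumed to follow XLMMacroDeobfuscator-style output,
    # 'CELL:<address>, <formula>, <status>': the address sits after the first
    # colon and the trailing status field is stripped with rsplit.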
    for line in data:
        if line[:4] == 'CELL':
            split_value = line.split(',', 1)
            cell = split_value[0].split(':')[1].strip()
            formula = split_value[1].rsplit(',', 1)[0].strip()

            # Add formula to list of formulas if it contains IoC(s)
            if pattern.ioc_match(formula, cell, ioc_dict):
                formulas[cell] = formula

    # Get all IoCs after deobfuscation
    ioc_deobfuscated_dict = {}
    formulas_deobfuscated = collections.OrderedDict()
    for line in data_deobfuscated:
        split_value = line.split(':', 1)
        cell = split_value[0].strip()
        formula = split_value[1].strip()

        # Add formula to list of deobfuscated formulas if it contains IoC(s)
        if pattern.ioc_match(formula, cell, ioc_deobfuscated_dict):
            formulas_deobfuscated[cell] = formula

    # Remove duplicate IoCs (found both before AND after deobfuscation)
    for ioc_tag, values in ioc_deobfuscated_dict.copy().items():
        for ioc_details in values.copy():
            if ioc_tag in ioc_dict and ioc_details in ioc_dict[ioc_tag]:
                ioc_deobfuscated_dict[ioc_tag].remove(ioc_details)
                # Remove ioc_tag if no IoCs are associated with it
                if len(ioc_deobfuscated_dict[ioc_tag]) == 0:
                    del ioc_deobfuscated_dict[ioc_tag]

    # Remove duplicate formulas from the same cell (found both before AND after deobfuscation)
    for cell, formula in formulas_deobfuscated.copy().items():
        if cell in formulas and formula in formulas[cell]:
            del formulas_deobfuscated[cell]

    # Create the appropriate result subsections for formulas
    formulas_subsection = ResultSection('Formulas')
    formulas_deobfuscated_subsection = ResultSection('Deobfuscated Formulas')
    formulas_deobfuscated_subsection.set_heuristic(5)
    if formulas:
        result_formula.add_subsection(formulas_subsection)
    if formulas_deobfuscated:
        result_formula.add_subsection(formulas_deobfuscated_subsection)

    # Generate result subsections for IoCs found without deobfuscation
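    # Each IoC entry is unpacked below as (value, title, severity index);
    # the severity index selects between the two heuristic IDs.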
    heuristics = [1, 2]
    for ioc_tag, values in ioc_dict.items():
        for ioc_details in values:
            ioc = ioc_details[0]
            title = ioc_details[1]
            heuristic = heuristics[ioc_details[2]]

            ioc_subsection = get_result_subsection(result_ioc, title,
                                                   heuristic)
            ioc_subsection.add_tag(ioc_tag, ioc)
            pattern = re.compile('(\\n|^)' + re.escape(ioc) + '(\\n|$)')
            if ioc_subsection.body is None or not pattern.search(
                    ioc_subsection.body):
                ioc_subsection.add_line(ioc)

            formulas_subsection.add_tag(ioc_tag, ioc)

    # Generate result subsections for deobfuscated IoCs
    heuristics = [3, 4]
    for ioc_tag, values in ioc_deobfuscated_dict.items():
        for ioc_details in values:
            ioc = ioc_details[0]
            title = 'Deobfuscated ' + ioc_details[1]
            heuristic = heuristics[ioc_details[2]]

            ioc_subsection = get_result_subsection(result_ioc, title,
                                                   heuristic)
            ioc_subsection.add_tag(ioc_tag, ioc)
            pattern = re.compile('(\\n|^)' + re.escape(ioc) + '(\\n|$)')
            if ioc_subsection.body is None or not pattern.search(
                    ioc_subsection.body):
                ioc_subsection.add_line(ioc)

            formulas_deobfuscated_subsection.add_tag(ioc_tag, ioc)

    # Populate 'Formulas' result subsection with all suspicious formulas found without deobfuscation
    for cell, formula in formulas.items():
        # Only add complete formulas
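        # Excel 4.0's FORMULA(formula_text, target_cell) macro writes a
        # formula into another cell; if the target cell is itself flagged,
        # the chain is reported through that cell's own entry instead, so
        # only completed formulas are listed here.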
        if "FORMULA(" in formula:
            cell_referenced = formula.rsplit(',', 1)[1][:-1]
            if cell_referenced not in formulas.keys():
                formulas_subsection.add_line(cell + ": " + formula)
        else:
            formulas_subsection.add_line(cell + ": " + formula)

    # Populate 'Deobfuscated Formulas' result subsection with all deobfuscated suspicious formulas
    for cell, formula in formulas_deobfuscated.items():
        # Only add complete formulas
        if "FORMULA(" in formula:
            cell_referenced = formula.rsplit(',', 1)[1][:-1]
            if cell_referenced not in formulas_deobfuscated.keys():
                formulas_deobfuscated_subsection.add_line(cell + ": " +
                                                          formula)
        else:
            formulas_deobfuscated_subsection.add_line(cell + ": " + formula)