Example #1
    def execute(self, request):
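        # Copy the sample into the working directory, run the StegExpose JAR
        # over that directory, and report the LSB steganalysis CSV as key/value pairs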
        result = Result()

        file_path = request.file_path
        file_type = request.file_type

        shutil.copyfile(file_path, self.working_directory + "/analyzed")

        p1 = subprocess.Popen(
            ["java", "-jar", "/var/lib/assemblyline/StegExpose/StegExpose.jar",
             self.working_directory, "standard", "default",
             self.working_directory + "/report.csv"])
        p1.wait()

        lsb_steg_results = self.read_csv(self.working_directory +
                                         "/report.csv")
        lsb_steg_results = self.beautify_dict(lsb_steg_results)

        kv_section = ResultSection("Result of the LSB steganalysis",
                                   body_format=BODY_FORMAT.KEY_VALUE,
                                   body=json.dumps(lsb_steg_results))
        result.add_section(kv_section)

        request.result = result
Example #2
    def execute(self, request):
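        # Skip oversized files, look for an embedded XDP wrapper, then hand
        # the raw contents to the peepdf analysis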
        temp_filename = request.file_path

        # Filter out large documents
        if os.path.getsize(temp_filename) > self.max_pdf_size:
            file_res = Result()
            res = ResultSection(
                f"PDF Analysis of the file was skipped because the "
                f"file is too big (limit is {self.max_pdf_size / 1000 / 1000} MB).")

            file_res.add_section(res)
            request.result = file_res
            return

        filename = os.path.basename(temp_filename)
        with open(temp_filename, 'rb') as f:
            file_content = f.read()

        if b'<xdp:xdp' in file_content:
            self.find_xdp_embedded(filename, file_content, request)

        self.peepdf_analysis(temp_filename, file_content, request)
Example #3
    def test_reduce():
        from assemblyline_v4_service.common.section_reducer import reduce
        from assemblyline_v4_service.common.result import Result, ResultSection
        res = Result()
        result_section = ResultSection("blah")
        res.add_section(result_section)
        reduce(res)
        # Code coverage only
        assert True
Example #4
    def resubmit_dex2jar_output(self, apk_file: str, target: str, result: Result, request):
        dex = os.path.join(self.working_directory, "classes.dex")
        self.get_dex(apk_file, dex)
        if os.path.exists(dex):
            d2j = Popen([self.dex2jar, "--output", target, dex],
                        stdout=PIPE, stderr=PIPE)
            d2j.communicate()
            if os.path.exists(target):
                res_sec = ResultSection("Classes.dex file was recompiled as a JAR and re-submitted for analysis")
                res_sec.add_line(f"JAR file resubmitted as: {os.path.basename(target)}")
                request.add_extracted(target, os.path.basename(target), "Dex2Jar output JAR file")
                result.add_section(res_sec)
Example #5
    def execute(self, request: ServiceRequest) -> None:
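        # Fetch any existing Intezer analysis for this SHA256 and fold its
        # verdict, IOCs, TTPs and sub-analyses into a single report section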
        sha256 = request.sha256
        result = Result()

        # First, let's get the analysis metadata, if it exists on the system
        main_api_result = self._get_analysis_metadata(
            request.get_param('analysis_id'), sha256)

        if not main_api_result:
            self.log.debug(f"SHA256 {sha256} is not on the system.")
            request.result = result
            return

        if main_api_result.get(
                "verdict") in Verdicts.NOT_SUPPORTED_VERDICTS.value:
            self.log.debug(f"Unsupported file type: {request.file_type}")
            request.result = result
            return
        elif main_api_result.get("verdict") == AnalysisStatusCode.FAILED.value:
            self.log.warning("The Intezer server is not feeling well :(")
            request.result = result
            return

        analysis_id = main_api_result["analysis_id"]

        # Setup the main result section
        main_kv_section = ResultKeyValueSection(
            "IntezerStatic analysis report")
        processed_main_api_result = self._process_details(
            main_api_result.copy(), UNINTERESTING_ANALYSIS_KEYS)
        main_kv_section.update_items(processed_main_api_result)
        if "family_name" in main_api_result:
            main_kv_section.add_tag("attribution.family",
                                    main_api_result["family_name"])

        # This file-verdict map will be used later on to assign heuristics to sub-analyses
        file_verdict_map = {}
        self._process_iocs(analysis_id, file_verdict_map, main_kv_section)
        if not self.config["is_on_premise"]:
            self._process_ttps(analysis_id, main_kv_section)
        self._handle_subanalyses(request, sha256, analysis_id,
                                 file_verdict_map, main_kv_section)

        # Setting heuristic here to avoid FPs
        if main_kv_section.subsections:
            self._set_heuristic_by_verdict(main_kv_section,
                                           main_api_result["verdict"])

        if main_kv_section.subsections or main_kv_section.heuristic:
            result.add_section(main_kv_section)
        request.result = result
Example #6
    def test_execute(sample, metadefender_class_instance, mocker):
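        # Stub out version lookup, scanning and parsing so execute() can be
        # exercised end-to-end under different node-timing configs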
        from assemblyline_v4_service.common.task import Task
        from assemblyline_v4_service.common.result import Result
        from assemblyline.odm.messages.task import Task as ServiceTask
        from assemblyline_v4_service.common.request import ServiceRequest
        import json
        metadefender_class_instance.nodes["blah"] = {
            "engine_count": 1,
            "oldest_dat": 1,
            "newest_dat": 1
        }
        mocker.patch.object(metadefender_class_instance, "_get_version_map")
        metadefender_class_instance.start()

        service_task = ServiceTask(sample)
        task = Task(service_task)
        metadefender_class_instance._task = task
        service_request = ServiceRequest(task)

        mocker.patch.object(metadefender_class_instance, "scan_file")
        mocker.patch.object(metadefender_class_instance, "new_node")
        mocker.patch.object(metadefender_class_instance,
                            "parse_results",
                            return_value=Result())

        # Actually executing the sample
        metadefender_class_instance.execute(service_request)

        # For coverage
        metadefender_class_instance.config["max_node_time"] = 0
        metadefender_class_instance.execute(service_request)

        metadefender_class_instance.config["max_node_time"] = 1000
        metadefender_class_instance.config["min_node_time"] = 0
        metadefender_class_instance.execute(service_request)
Example #7
    def execute(self, request):
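        # Run the quark CLI against the APK (with optional call graphs) and
        # attach the JSON report plus any PNG graphs as supplementary files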
        result = Result()
        apk = request.file_path
        filename = os.path.basename(apk)
        quark_out = os.path.join(self.working_directory, 'quark_out')
        quark_graph = os.path.join('/opt/al_service/', 'call_graph_image')

        if request.get_param('generate_graphs'):
            call([
                "quark", "-a", apk, "-g", "-s", "-o", quark_out, "-r",
                "/opt/al_support/quark-rules"
            ])
        else:
            call([
                "quark", "-a", apk, "-o", quark_out, "-r",
                "/opt/al_support/quark-rules"
            ])
        if os.path.exists(quark_out):
            self.run_analysis(quark_out, result)
            request.add_supplementary(
                quark_out, "quark_out",
                "These are quark Results as a JSON file")

        if os.path.exists(quark_graph):
            for filename in os.listdir(quark_graph):
                if filename.endswith(".png"):
                    request.add_supplementary(
                        os.path.join(quark_graph, filename), filename,
                        "call graph : {0}".format(filename))
        request.result = result
Example #8
    def gen_results(self, api_response):
        procr = self.upmal.process_results(api_response, self.upm)
        result = Result()
        service_task = ServiceTask(sample1)
        task = Task(service_task)
        request = ServiceRequest(task)
        self.upmal.generate_results(procr, result, api_response, request)
Example #9
    def execute(self, request):
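        # Two process_file passes: extract-only first, then a full deobfuscation
        # from the configured start point; decryption failures raise heuristic 6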
        result = Result()
        request.result = result
        file_path = request.file_path
        password = request.get_param('password')
        start_point = request.get_param('start point')

        try:
            data = process_file(file=file_path,
                                password=password,
                                noninteractive=True,
                                no_indent=True,
                                output_level=0,
                                return_deobfuscated=True,
                                extract_only=True)

            data_deobfuscated = process_file(
                file=file_path,
                password=password,
                start_point=start_point,
                noninteractive=True,
                no_indent=True,
                output_level=0,
                output_formula_format='[[CELL-ADDR]]: [[INT-FORMULA]]',
                return_deobfuscated=True)
        except Exception as e:
            section = ResultSection('Failed to analyze', parent=request.result)
            section.add_line(str(e))
            if str(e).startswith('Failed to decrypt'):
                section.set_heuristic(6)
            return

        add_results(result, data, data_deobfuscated)
Example #10
    def test_parse_results(response, correct_res_secs,
                           metadefender_class_instance):
        from assemblyline_v4_service.common.result import Result, ResultSection, BODY_FORMAT, Heuristic
        metadefender_class_instance.blocklist = ["a"]
        metadefender_class_instance.sig_score_revision_map = {}
        metadefender_class_instance.kw_score_revision_map = {}
        metadefender_class_instance.current_node = "http://blah"
        metadefender_class_instance.nodes[
            metadefender_class_instance.current_node] = {
                "engine_map": {
                    "z": {
                        "version": "blah",
                        "def_time": "blah"
                    },
                    "y": {
                        "version": "blah",
                        "def_time": "blah"
                    }
                },
                "queue_times": [],
                "file_count": 0
            }
        correct_result = Result()
        for correct_res_sec in correct_res_secs:
            section = ResultSection(
                correct_res_sec["title_text"],
                body_format=BODY_FORMAT.TEXT if
                not correct_res_sec.get("body_format") else BODY_FORMAT.JSON,
                body=correct_res_sec.get("body"))
            for subsec in correct_res_sec.get("subsections", []):
                subsection = ResultSection(
                    subsec["title_text"],
                    body=subsec["body"],
                    body_format=BODY_FORMAT.KEY_VALUE,
                    tags=subsec.get("tags"),
                )
                if subsec.get("heuristic"):
                    subsection.set_heuristic(subsec["heuristic"]["heur_id"])
                    print(subsec["heuristic"]["signatures"])
                    for key in subsec["heuristic"]["signatures"].keys():
                        subsection.heuristic.add_signature_id(key)
                section.add_subsection(subsection)
            correct_result.add_section(section)
        actual_result = metadefender_class_instance.parse_results(response)
        for index, section in enumerate(actual_result.sections):
            assert check_section_equality(section,
                                          correct_result.sections[index])
Example #11
    def execute(self, request: ServiceRequest) -> None:
        """ Main Module. See README for details."""
        request.result = Result()
        patterns = PatternMatch()
        self.sample_type = request.file_type
        self.excess_extracted = 0
        # Filters for submission modes. Listed in order of use.
        if request.deep_scan:
            # Maximum size of submitted file to run this service:
            max_size = 8000000
            # String length maximum
            # Used in basic ASCII and UNICODE modules:
            max_length = 1000000
            # String list maximum size
            # List produced by basic ASCII and UNICODE module results and will determine
            # if patterns.py will only evaluate network IOC patterns:
            st_max_size = 1000000
            # BBcrack maximum size of submitted file to run module:
            bb_max_size = 200000
        else:
            max_size = self.config.get('max_size', 3000000)
            max_length = self.config.get('max_length', 5000)
            st_max_size = self.config.get('st_max_size', 0)
            bb_max_size = self.config.get('bb_max_size', 85000)

        # Begin analysis
        if len(request.file_contents) >= max_size or self.sample_type.startswith("archive/"):
            # No analysis is done if the file is an archive or too large
            return

        self.ascii_results(request, patterns, max_length, st_max_size)
        self.embedded_pe_results(request)

        # Possible encoded strings -- all sample types except code/* (code is handled by deobfuscripter service)
        if not self.sample_type.startswith('code'):
            self.base64_results(request, patterns)
            if len(request.file_contents) < bb_max_size:
                self.bbcrack_results(request)

        # Other possible encoded strings -- all sample types but code and executables
        if not self.sample_type.split('/', 1)[0] in ['executable', 'code']:
            self.unicode_results(request, patterns)
            # Go over again, looking for long ASCII-HEX character strings
            if not self.sample_type.startswith('document/office'):
                self.hex_results(request, patterns)

        if self.excess_extracted:
            self.log.warning(
                f"Too many files extracted from {request.sha256}, "
                f"{self.excess_extracted} files were not extracted")
            request.result.add_section(
                ResultSection(
                    f"Over extraction limit: "
                    f"{self.excess_extracted} files were not extracted"))
Example #12
    def execute(self, request: ServiceRequest) -> None:
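        # Aggregate av.virus_name tags from earlier services and classify the
        # family and categories with AVclass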
        result = Result()
        request.result = result

        # Get AV labels from previous services
        av_labels = request.task.tags.get('av.virus_name')
        if not av_labels:
            return

        # Extract AVclass tags
        av_tags = self._get_avclass_tags(request.md5, request.sha1,
                                         request.sha256, av_labels)
        if av_tags is None:
            return

        # Build results
        section = self._get_result_section(av_tags.family, av_tags.is_pup)
        for tag_section in self._get_category_sections(av_tags.tags):
            section.add_subsection(tag_section)

        result.add_section(section)
Example #13
    def execute(self, request):
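        # Run the RATDecoders first, then all MWCP parsers matched on file or
        # tags, keeping the raw JSON output as a supplementary file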
        self.mwcp_report = cli.register()
        result = Result()
        # Run Ratdecoders
        output = cli.run_ratdecoders(request.file_path, self.mwcp_report)
        if isinstance(output, str):
            self.log.info(output)
            output = ""
        if isinstance(output, dict):
            self.log.info(output)
            for parser, fields in output.items():
                self.section_builder(parser, fields, result, "RATDecoder")

        # YARA externals must be dicts whose key/value pairs are strings
        tags = {
            f"al_{k.replace('.', '_')}": v
            for k, v in request.task.tags.items()
        }
        newtags = {
            key: " | ".join(str(value) for value in values)
            for key, values in tags.items()
        }
        # get matches for both, dedup then run
        parsers = cli.deduplicate(self.file_parsers, self.tag_parsers,
                                  request.file_path, newtags)
        output_fields = cli.run(parsers, request.file_path, self.mwcp_report)

        for parser, field_dict in output_fields.items():
            self.section_builder(parser, field_dict, result)
            if "outputfile" in field_dict:
                # outputfile value is a list of lists containing the filename,
                # description and md5 hash of additional output files
                outputfiles = field_dict['outputfile']
                for output_list in outputfiles:
                    output_filename = output_list[0]
                    output_description = output_list[1]
                    output_md5 = output_list[2]
                    output_fullpath = os.path.join(
                        os.getcwd(), output_md5[:5] + '_' + output_filename)
                    request.add_supplementary(output_fullpath, output_filename,
                                              output_description)
        if output or output_fields:
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "w") as myfile:
                myfile.write(json.dumps(output))
                myfile.write(json.dumps(output_fields))
            request.add_supplementary(temp_path, "output.json",
                                      "This is MWCP output as a JSON file")
        request.result = result
Example #14
    def execute(self, request):
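        # DeXRAY un-quarantines AV quarantine files: list the extracted files
        # and report any quarantine metadata found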
        """Main Module. See README for details."""
        result = Result()
        self.sha = request.sha256
        local = request.file_path

        text_section = None
        kv_section = None

        extracted, metadata = self.dexray(request, local)

        num_extracted = len(request.extracted)
        if num_extracted != 0:
            text_section = ResultSection("DeXRAY found files:")
            for extracted_file in request.extracted:
                file_name = extracted_file.get('name')
                text_section.add_line(
                    f"Resubmitted un-quarantined file as: {file_name}")

        if metadata:
            # Can contain live URLs to the original content source
            kv_section = ResultSection("DeXRAY Quarantine Metadata",
                                       body_format=BODY_FORMAT.JSON,
                                       body=json.dumps(metadata))

        for section in (text_section, kv_section):
            if section:
                result.add_section(section)

        request.result = result
Example #15
    def parse_results(response: Dict[str, Any]):
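        # Build a permalink section plus one AV-hit subsection per engine that
        # flagged the file as malicious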
        res = Result()
        response = response['data']

        url_section = ResultSection('VirusTotal report permalink',
                                    body_format=BODY_FORMAT.URL,
                                    body=json.dumps(
                                        {"url": response['links']['self']}))
        res.add_section(url_section)
        response = response['attributes']
        scans = response['last_analysis_results']
        av_hits = ResultSection('Anti-Virus Detections')
        av_hits.add_line(
            f'Found {response["last_analysis_stats"]["malicious"]} AV hit(s) from '
            f'{len(response["last_analysis_results"])} scans.')
        for majorkey, subdict in sorted(scans.items()):
            if subdict['category'] == "malicious":
                virus_name = subdict['result']
                av_hit_section = AvHitSection(majorkey, virus_name)
                av_hit_section.set_heuristic(
                    1, signature=f'{majorkey}.{virus_name}')
                av_hit_section.add_tag('av.virus_name', virus_name)
                av_hits.add_subsection(av_hit_section)

        res.add_section(av_hits)

        return res
Example #16
    def execute(self, request):
        result = Result()
        file_path = request.file_path

        with open(file_path, "rb") as f:
            file_content = f.read()

        content_list = autoit_ripper.extract(data=file_content)

        if content_list:
            content = content_list[0][1].decode("utf-8")

            text_section = ResultSection('[DUMP RESULT]')
            text_section.add_line(content)
            text_section.set_heuristic(1)
            result.add_section(text_section)

            script_path = os.path.join(self.working_directory, "script.au3")
            with open(script_path, "w") as f:
                f.write(content)
            request.add_extracted(script_path, 'script.au3', 'This is the unpacked script')

        request.result = result
Example #17
    def _extract_result_from_matches(self, matches):
        """
        Iterate through Yara match object and send to parser.

        Args:
            matches: Yara rules Match object (list).

        Returns:
            AL Result object.
        """
        result = Result()
        for match in matches:
            self._add_resultinfo_for_match(result, match)
        return result
Example #18
    def execute(self, request):
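        # Shell out to clamscan and report every output line; a FOUND verdict
        # triggers heuristic 1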
        result = Result()
        file_path = request.file_path

        p1 = subprocess.Popen("clamscan -a -z --detect-pua --alert-macros " +
                              file_path,
                              shell=True,
                              stdout=subprocess.PIPE)
        p1.wait()
        stdout = p1.communicate()[0].decode("utf-8")

        report = stdout.split("\n")
        report = list(filter(None, report))

        text_section = ResultSection("Successfully scanned the file")
        if "FOUND" in report[0]:
            text_section.set_heuristic(1)

        for line in report:
            text_section.add_line(line)

        result.add_section(text_section)
        request.result = result
Example #19
    def execute(self, request: ServiceRequest) -> None:
        result = Result()
        self.hits = {}  # clear the hits dict
        path = request.file_path
        file_name = request.file_name
        self.log.info(f"Executing {file_name}")
        self.log.info(f"Number of rules {len(self.sigma_parser.rules)}")
        self.sigma_parser.register_callback(self.sigma_hit)
        self.sigma_parser.check_logfile(path)
        if len(self.hits) > 0:
            hit_section = ResultSection('Events detected as suspicious')
            # group alerts together
            for rule_id, events in self.hits.items():
                title = self.sigma_parser.rules[rule_id].title
                section = SigmaHitSection(title, events)
                tags = self.sigma_parser.rules[rule_id].tags
                attack_id = None
                if tags:
                    for tag in tags:
                        # strip the "attack." prefix from the rule tag
                        name = tag[7:]
                        if name.startswith(('t', 'g', 's')):
                            attack_id = name.upper()
                source = events[0]['signature_source']
                if attack_id:
                    section.set_heuristic(get_heur_id(events[0]['score']),
                                          attack_id=attack_id,
                                          signature=f"{source}.{title}")
                else:
                    section.set_heuristic(get_heur_id(events[0]['score']),
                                          signature=f"{source}.{title}")
                section.add_tag(f"file.rule.{source}", f"{source}.{title}")
                for event in events:
                    # add the event data as a subsection
                    section.add_subsection(EventDataSection(event))
                hit_section.add_subsection(section)
            result.add_section(hit_section)
        request.result = result
Example #20
    def execute(self, request):
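        # Submit the URL to urlscan.io, flatten verdicts/lists/page data into
        # one key/value report, and attach the page screenshot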
        result = Result()
        url = request.task.metadata.get('submitted_url')
        api_key = request.get_param("api_key")
        public = request.get_param("public")

        u = UrlScan(apikey=api_key, url=url, public=public)
        u.submit()

        # We need to wait for the API to process our request
        response = self.wait_processing(u)

        # We get the response parts that we want and merge them all together
        report = {
            **response.json()["verdicts"]["overall"],
            **response.json()["lists"],
            **response.json()["page"]
        }

        # We convert the "certicates" section from a list of dictionnaries to a dictionnary of lists
        certificates = report.pop("certificates")
        certificates = {
            k: [dic[k] for dic in certificates]
            for k in certificates[0]
        }

        # We add the converted section to the report
        report = {**report, **certificates}

        # We create the KEY_VALUE section to add the report to the result page
        kv_section = ResultSection("Urlscan.io report",
                                   body_format=BODY_FORMAT.KEY_VALUE,
                                   body=json.dumps(report))

        for domain in report["domains"]:
            kv_section.add_tag("network.static.domain", domain.strip())

        result.add_section(kv_section)

        # We get the preview of the website
        screenshot = u.getScreenshot()
        with open(self.working_directory + "/preview.png", "wb") as ofile:
            ofile.write(screenshot)

        # Adding the preview on the result page
        url_section = ResultSection(
            'Urlscan.io website screenshot',
            body_format=BODY_FORMAT.URL,
            body=json.dumps({
                "name": "The preview is also available here !",
                "url": response.json()["task"]["screenshotURL"]
            }))
        result.add_section(url_section)
        request.add_extracted(self.working_directory + "/preview.png",
                              "preview.png", "Here's the preview of the site")

        request.result = result
Example #21
    def execute(self, request):
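        # Static APK triage: badging, strings and apktool analysis, plus an
        # optional dex2jar pass that resubmits the recompiled JAR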
        result = Result()
        request.set_service_context(self.get_tool_version())

        apk = request.file_path
        filename = os.path.basename(apk)
        d2j_out = os.path.join(self.working_directory, f'{filename}.jar')
        apktool_out = os.path.join(self.working_directory, f'{filename}_apktool')
        apktool_workdir = os.path.join(self.working_directory, f'{filename}_apktool_workdir')

        self.run_badging_analysis(apk, result)
        self.run_strings_analysis(apk, result)
        self.run_apktool(apk, apktool_out, apktool_workdir, result)
        if request.get_param('resubmit_apk_as_jar'):
            self.resubmit_dex2jar_output(apk, d2j_out, result, request)

        request.result = result
Example #22
    def check_file_name_anomalies(self, filename):
        """Filename anomalies detection"""

        is_double_ext, f_ext = self.fna_check_double_extension(filename)
        is_empty_filename = self.fna_check_empty_filename(filename, f_ext)
        too_many_whitespaces = self.fna_check_filename_ws(filename, f_ext)
        has_unicode_ext_hiding_ctrls = self.fna_check_unicode_bidir_ctrls(filename, f_ext)

        file_res = Result()

        if too_many_whitespaces or is_double_ext or has_unicode_ext_hiding_ctrls or is_empty_filename:
            res = ResultSection(title_text="File Name Anomalies", parent=file_res)

            # Tag filename as it might be of interest
            res.add_tag("file.name.extracted", filename)

            # Remove Unicode controls, if any, for reporting
            fn_no_controls = "".join(
                c for c in filename if c not in ["\u202E", "\u202B", "\u202D", "\u202A", "\u200E", "\u200F"]
            )

            # Also add a line with "actual" file name
            res.add_line(f"Actual file name: {wrap_bidir_unicode_string(fn_no_controls)}")

            if too_many_whitespaces:
                sec = ResultSection("Too many whitespaces", parent=res, heuristic=Heuristic(1))
                sec.add_tag("file.name.anomaly", "TOO_MANY_WHITESPACES")
                sec.add_tag("file.behavior", "File name has too many whitespaces")

            if is_double_ext:
                sec = ResultSection("Double file extension", parent=res, heuristic=Heuristic(2))
                sec.add_tag("file.name.anomaly", "DOUBLE_FILE_EXTENSION")
                sec.add_tag("file.behavior", "Double file extension")

            if has_unicode_ext_hiding_ctrls:
                sec = ResultSection("Hidden launchable file extension", parent=res, heuristic=Heuristic(3))
                sec.add_tag("file.name.anomaly", "UNICODE_EXTENSION_HIDING")
                sec.add_tag("file.behavior", "Real file extension hidden using unicode trickery")

            if is_empty_filename:
                sec = ResultSection("Empty Filename", parent=res, heuristic=Heuristic(4))
                sec.add_tag("file.name.anomaly", "FILENAME_EMPTY_OR_ALL_SPACES")
                sec.add_tag("file.behavior", "File name is empty or all whitespaces")

        return file_res
Example #23
    def execute(self, request: ServiceRequest):
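        # Build the VirusTotal client from service config (falling back to the
        # request parameter), then scan the submitted URL or the file itself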
        try:
            self.client = Client(
                apikey=self.config.get("api_key", request.get_param("api_key")),
                proxy=self.config.get('proxy') or None)
        except Exception as e:
            self.log.error("No API key found for VirusTotal")
            raise e

        if request.task.metadata.get('submitted_url') and request.task.depth == 0:
            response = self.scan_url(request)
        else:
            response = self.scan_file(request)
        if response:
            result = self.parse_results(response)
            request.result = result
        else:
            request.result = Result()
Example #24
    def execute(self, request):
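        # Upload the sample to UNPACME, wait for the analysis to complete and
        # convert the processed results into result sections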
        # Result Object
        result = Result()
        api_key = request.get_param("api_key")

        if self.prechecks(request, api_key):
            upm = unpacme.UnpacMe(api_key)
            record = upm.upload_file(request.file_path)
            if record['success']:
                analysis_results = self.wait_for_completion(upm, record)
                if analysis_results:
                    presults = self.process_results(analysis_results, upm)
                    result, request = self.generate_results(
                        presults, result, analysis_results, request)
            else:
                self.log.error(
                    "An exception occurred while uploading the sample to UNPACME: "
                    f"{record['msg']}")

        request.result = result
Example #25
    def execute(self, request):
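        # Decode a QR code from the image and report the embedded URL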
        qr = xqrcode.decode_from_file(request.file_path)
        if len(qr) > 0:
            result_url = qr[0]['data']
            result = Result()
            text_section = ResultSection('QR Code')
            text_section.add_line(result_url)
            result.add_section(text_section)

            url_section = ResultSection('url extracted',
                                        body_format=BODY_FORMAT.URL,
                                        body=json.dumps({
                                            "name": "QR Code Url",
                                            "url": f"{result_url}"
                                        }))

            url_section.add_tag("network.static.domain", result_url)
            result.add_section(url_section)

            request.result = result
        else:
            request.result = Result()
Example #26
    def execute(self, request):
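        # Layered de-obfuscation: apply each technique until the layers stop
        # changing, then report the steps taken and any IOCs new to the final layer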
        # --- Setup ----------------------------------------------------------------------------------------------
        request.result = Result()
        patterns = PatternMatch()

        if request.deep_scan:
            max_attempts = 100
        else:
            max_attempts = 10

        self.files_extracted = set()
        self.hashes = set()
        before = set()

        # --- Pre-Processing --------------------------------------------------------------------------------------
        # Get all IOCs prior to de-obfuscation
        pat_values = patterns.ioc_match(request.file_contents,
                                        bogon_ip=True,
                                        just_network=False)
        if pat_values:
            if request.get_param('extract_original_iocs'):
                ioc_res = ResultSection(
                    "The following IOCs were found in the original file",
                    parent=request.result,
                    body_format=BODY_FORMAT.MEMORY_DUMP)
            else:
                ioc_res = None
            for k, val in pat_values.items():
                if val == "":
                    asc_asc = unicodedata.normalize('NFKC', val).encode(
                        'ascii', 'ignore')
                    if ioc_res:
                        ioc_res.add_line(
                            f"Found {k.upper().replace('.', ' ')}: {safe_str(asc_asc)}"
                        )
                        ioc_res.add_tag(k, asc_asc)
                    before.add((k, asc_asc))
                else:
                    for v in val:
                        if ioc_res:
                            ioc_res.add_line(
                                f"Found {k.upper().replace('.', ' ')}: {safe_str(v)}"
                            )
                            ioc_res.add_tag(k, v)
                        before.add((k, v))

        # --- Prepare Techniques ----------------------------------------------------------------------------------
        techniques = [
            ('MSOffice Embedded script', self.msoffice_embedded_script_string),
            ('CHR and CHRB decode', self.chr_decode),
            ('String replace', self.string_replace),
            ('Powershell carets', self.powershell_carets),
            ('Array of strings', self.array_of_strings),
            ('Fake array vars', self.vars_of_fake_arrays),
            ('Reverse strings', self.str_reverse),
            ('B64 Decode', self.b64decode_str),
            ('Simple XOR function', self.simple_xor_function),
        ]
        second_pass = [('Concat strings', self.concat_strings),
                       ('MSWord macro vars', self.mswordmacro_vars),
                       ('Powershell vars', self.powershell_vars),
                       ('Charcode hex', self.charcode_hex)]
        final_pass = [
            ('Charcode', self.charcode),
        ]

        code_extracts = [('.*html.*', "HTML scripts extraction",
                          self.extract_htmlscript)]

        layers_list = []
        layer = request.file_contents

        # --- Stage 1: Script Extraction --------------------------------------------------------------------------
        for pattern, name, func in code_extracts:
            if re.match(re.compile(pattern), request.task.file_type):
                extracted_parts = func(request.file_contents)
                layer = b"\n".join(extracted_parts).strip()
                layers_list.append((name, layer))
                break

        # --- Stage 2: De-obfuscation -----------------------------------------------------------------------------
        idx = 0
        first_pass_len = len(techniques)
        layers_count = len(layers_list)
        while True:
            if idx > max_attempts:
                final_pass.extend(techniques)
                for name, technique in final_pass:
                    res = technique(layer)
                    if res:
                        layers_list.append((name, res))
                break
            for name, technique in techniques:
                res = technique(layer)
                if res:
                    layers_list.append((name, res))
                    # Looks like it worked, restart with new layer
                    layer = res
            # If the layers haven't changed in a pass, break
            if layers_count == len(layers_list):
                if len(techniques) != first_pass_len:
                    final_pass.extend(techniques)
                    for name, technique in final_pass:
                        res = technique(layer)
                        if res:
                            layers_list.append((name, res))
                    break
                else:
                    for x in second_pass:
                        techniques.insert(0, x)
            layers_count = len(layers_list)
            idx += 1

        # --- Compiling results ----------------------------------------------------------------------------------
        if len(layers_list) > 0:
            extract_file = False
            num_layers = len(layers_list)
            heur_id = None

            # Compute heuristic
            if num_layers < 5:
                heur_id = 1
            elif num_layers < 10:
                heur_id = 2
            elif num_layers < 50:
                heur_id = 3
            elif num_layers < 100:
                heur_id = 4
            elif num_layers >= 100:
                heur_id = 5

            # Cleanup final layer
            clean = self.clean_up_final_layer(layers_list[-1][1])
            if clean != request.file_contents:
                # Check for new IOCs
                pat_values = patterns.ioc_match(clean,
                                                bogon_ip=True,
                                                just_network=False)
                diff_tags = {}

                for k, val in pat_values.items():
                    if val == "":
                        asc_asc = unicodedata.normalize('NFKC', val).encode(
                            'ascii', 'ignore')
                        if (k, asc_asc) not in before:
                            diff_tags.setdefault(k, [])
                            diff_tags[k].append(asc_asc)
                    else:
                        for v in val:
                            if (k, v) not in before:
                                diff_tags.setdefault(k, [])
                                diff_tags[k].append(v)

                if request.deep_scan or \
                        (len(clean) > 1000 and heur_id >= 4) or diff_tags:
                    extract_file = True

                # Display obfuscation steps
                mres = ResultSection(
                    "De-obfuscation steps taken by DeobfuScripter",
                    parent=request.result)
                if heur_id:
                    mres.set_heuristic(heur_id)

                lcount = Counter([x[0] for x in layers_list])
                for layer_name, count in lcount.items():
                    mres.add_line(f"{layer_name}, {count} time(s).")

                # Display final layer
                byte_count = 5000
                if extract_file:
                    # Save extracted file
                    byte_count = 500
                    fn = f"{request.file_name}_decoded_final"
                    fp = os.path.join(self.working_directory, fn)
                    with open(fp, 'wb') as dcf:
                        dcf.write(clean)
                        self.log.debug(
                            f"Submitted dropped file for analysis: {fp}")
                    request.add_extracted(fp, fn, "Final deobfuscation layer")

                ResultSection(f"First {byte_count} bytes of the final layer:",
                              body=safe_str(clean[:byte_count]),
                              body_format=BODY_FORMAT.MEMORY_DUMP,
                              parent=request.result)

                # Display new IOCs from final layer
                if len(diff_tags) > 0:
                    ioc_new = ResultSection(
                        "New IOCs found after de-obfustcation",
                        parent=request.result,
                        body_format=BODY_FORMAT.MEMORY_DUMP)
                    has_network_heur = False
                    for ty, val in diff_tags.items():
                        for v in val:
                            if "network" in ty:
                                has_network_heur = True
                            ioc_new.add_line(
                                f"Found {ty.upper().replace('.', ' ')}: {safe_str(v)}"
                            )
                            ioc_new.add_tag(ty, v)

                    if has_network_heur:
                        ioc_new.set_heuristic(7)
                    else:
                        ioc_new.set_heuristic(6)

                if len(self.files_extracted) > 0:
                    ext_file_res = ResultSection(
                        "The following files were extracted during the deobfuscation",
                        heuristic=Heuristic(8),
                        parent=request.result)
                    for f in self.files_extracted:
                        ext_file_res.add_line(os.path.basename(f))
                        request.add_extracted(
                            f, os.path.basename(f),
                            "File of interest deobfuscated from sample")
Example #27
    def peepdf_analysis(self, temp_filename, file_content, request):
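        # Walk the peepdf statistics: global info, per-version objects and
        # suspicious elements, JavaScript blocks, and embedded files worth dumping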
        file_res = Result()
        try:
            res_list = []
            # js_stream = []
            f_list = []
            js_dump = []

            pdf_parser = PDFParser()
            ret, pdf_file = pdf_parser.parse(temp_filename, True, False, file_content)
            if ret == 0:
                stats_dict = pdf_file.getStats()

                if ", ".join(stats_dict['Errors']) == "Bad PDF header, %%EOF not found, PDF sections not found, No " \
                                                      "indirect objects found in the body":
                    # Not a PDF
                    return

                json_body = dict(
                    version=stats_dict['Version'],
                    binary=stats_dict['Binary'],
                    linearized=stats_dict['Linearized'],
                    encrypted=stats_dict['Encrypted'],
                )

                if stats_dict['Encryption Algorithms']:
                    temp = []
                    for algorithmInfo in stats_dict['Encryption Algorithms']:
                        temp.append(f"{algorithmInfo[0]} {str(algorithmInfo[1])} bits")
                    json_body["encryption_algorithms"] = temp

                json_body.update(dict(
                    updates=stats_dict['Updates'],
                    objects=stats_dict['Objects'],
                    streams=stats_dict['Streams'],
                    comments=stats_dict['Comments'],
                    errors={True: ", ".join(stats_dict['Errors']),
                            False: "None"}[len(stats_dict['Errors']) != 0]
                ))
                res = ResultSection("PDF File Information", body_format=BODY_FORMAT.KEY_VALUE,
                                    body=json.dumps(json_body))

                for version in range(len(stats_dict['Versions'])):
                    stats_version = stats_dict['Versions'][version]
                    v_json_body = dict(
                        catalog=stats_version['Catalog'] or "no",
                        info=stats_version['Info'] or "no",
                        objects=self.list_first_x(stats_version['Objects'][1]),
                    )

                    if stats_version['Compressed Objects'] is not None:
                        v_json_body['compressed_objects'] = self.list_first_x(stats_version['Compressed Objects'][1])

                    if stats_version['Errors'] is not None:
                        v_json_body['errors'] = self.list_first_x(stats_version['Errors'][1])

                    v_json_body['streams'] = self.list_first_x(stats_version['Streams'][1])

                    if stats_version['Xref Streams'] is not None:
                        v_json_body['xref_streams'] = self.list_first_x(stats_version['Xref Streams'][1])

                    if stats_version['Object Streams'] is not None:
                        v_json_body['object_streams'] = self.list_first_x(stats_version['Object Streams'][1])

                    if int(stats_version['Streams'][0]) > 0:
                        v_json_body['encoded'] = self.list_first_x(stats_version['Encoded'][1])
                        if stats_version['Decoding Errors'] is not None:
                            v_json_body['decoding_errors'] = self.list_first_x(stats_version['Decoding Errors'][1])

                    if stats_version['Objects with JS code'] is not None:
                        v_json_body['objects_with_js_code'] = \
                            self.list_first_x(stats_version['Objects with JS code'][1])
                        # js_stream.extend(stats_version['Objects with JS code'][1])

                    res_version = ResultSection(f"Version {str(version)}", parent=res,
                                                body_format=BODY_FORMAT.KEY_VALUE, body=json.dumps(v_json_body))

                    actions = stats_version['Actions']
                    events = stats_version['Events']
                    vulns = stats_version['Vulns']
                    elements = stats_version['Elements']
                    is_suspicious = False
                    if events is not None or actions is not None or vulns is not None or elements is not None:
                        res_suspicious = ResultSection('Suspicious elements', parent=res_version)
                        if events is not None:
                            for event in events:
                                res_suspicious.add_line(f"{event}: {self.list_first_x(events[event])}")
                            is_suspicious = True
                        if actions is not None:
                            for action in actions:
                                res_suspicious.add_line(f"{action}: {self.list_first_x(actions[action])}")
                            is_suspicious = True
                        if vulns is not None:
                            for vuln in vulns:
                                if vuln in vulnsDict:
                                    temp = [vuln, ' (']
                                    for vulnCVE in vulnsDict[vuln]:
                                        if len(temp) != 2:
                                            temp.append(',')
                                            vulnCVE = "".join(vulnCVE) if isinstance(vulnCVE, list) else vulnCVE
                                            temp.append(vulnCVE)
                                            cve_found = re.search("CVE-[0-9]{4}-[0-9]{4}", vulnCVE)
                                            if cve_found:
                                                res_suspicious.add_tag('attribution.exploit',
                                                                       vulnCVE[cve_found.start():cve_found.end()])
                                                res_suspicious.add_tag('file.behavior',
                                                                       vulnCVE[cve_found.start():cve_found.end()])
                                    temp.append('): ')
                                    temp.append(str(vulns[vuln]))
                                    res_suspicious.add_line(temp)
                                else:
                                    res_suspicious.add_line(f"{vuln}: {str(vulns[vuln])}")
                                is_suspicious = True
                        if elements is not None:
                            for element in elements:
                                if element in vulnsDict:
                                    temp = [element, ' (']
                                    for vulnCVE in vulnsDict[element]:
                                        if len(temp) != 2:
                                            temp.append(',')
                                        vulnCVE = "".join(vulnCVE) if isinstance(vulnCVE, list) else vulnCVE
                                        temp.append(vulnCVE)
                                        cve_found = re.search("CVE-[0-9]{4}-[0-9]{4}", vulnCVE)
                                        if cve_found:
                                            res_suspicious.add_tag('attribution.exploit',
                                                                   vulnCVE[cve_found.start():cve_found.end()])
                                            res_suspicious.add_tag('file.behavior',
                                                                   vulnCVE[cve_found.start():cve_found.end()])
                                    temp.append('): ')
                                    temp.append(str(elements[element]))
                                    res_suspicious.add_line(temp)
                                    is_suspicious = True
                                else:
                                    res_suspicious.add_line(f"\t\t{element}: {str(elements[element])}")
                                    is_suspicious = True
                    if is_suspicious:
                        res_suspicious.set_heuristic(8)

                    urls = stats_version['URLs']
                    if urls is not None:
                        res.add_line("")
                        res_url = ResultSection('Found URLs', parent=res)
                        for url in urls:
                            res_url.add_line(f"\t\t{url}")
                        res_url.set_heuristic(9)

                    for obj in stats_version['Objects'][1]:
                        cur_obj = pdf_file.getObject(obj, version)

                        if cur_obj.containsJScode:
                            cur_res = ResultSection(f"Object [{obj} {version}] contains {len(cur_obj.JSCode)} "
                                                    f"block of JavaScript")
                            score_modifier = 0

                            js_idx = 0
                            for js in cur_obj.JSCode:
                                sub_res = ResultSection('Block of JavaScript', parent=cur_res)
                                js_idx += 1
                                js_score = 0
                                js_code, unescaped_bytes, _, _, _ = analyseJS(js)

                                js_dump += [x for x in js_code]

                                # Malicious characteristics
                                big_buffs = self.get_big_buffs("".join(js_code))
                                if big_buffs:
                                    js_score += 500 * len(big_buffs)
                                has_eval, has_unescape = self.check_dangerous_func("".join(js_code))
                                if has_unescape:
                                    js_score += 100
                                if has_eval:
                                    js_score += 100

                                js_cmt = ""
                                if has_eval or has_unescape or len(big_buffs) > 0:
                                    score_modifier += js_score
                                    js_cmt = "Suspiciously malicious "
                                    cur_res.add_tag('file.behavior', "Suspicious JavaScript in PDF")
                                    sub_res.set_heuristic(7)
                                js_res = ResultSection(f"{js_cmt}JavaScript Code (block: {js_idx})", parent=sub_res)

                                if js_score > 0:
                                    temp_js_outname = f"object{obj}-{version}_{js_idx}.js"
                                    temp_js_path = os.path.join(self.working_directory, temp_js_outname)
                                    temp_js_bin = "".join(js_code).encode("utf-8")
                                    f = open(temp_js_path, "wb")
                                    f.write(temp_js_bin)
                                    f.close()
                                    f_list.append(temp_js_path)

                                    js_res.add_line(f"The JavaScript block was saved as {temp_js_outname}")
                                    if has_eval or has_unescape:
                                        analysis_res = ResultSection("[Suspicious Functions]", parent=js_res)
                                        if has_eval:
                                            analysis_res.add_line("eval: This JavaScript block uses eval() function "
                                                                  "which is often used to launch deobfuscated "
                                                                  "JavaScript code.")
                                            analysis_res.set_heuristic(3)
                                        if has_unescape:
                                            analysis_res.add_line("unescape: This JavaScript block uses unescape() "
                                                                  "function. It may be legitimate but it is definitely "
                                                                  "suspicious since malware often use this to "
                                                                  "deobfuscate code blocks.")
                                            analysis_res.set_heuristic(3)

                                    buff_idx = 0
                                    for buff in big_buffs:
                                        buff_idx += 1
                                        error, new_buff = unescape(buff)
                                        if error == 0:
                                            buff = new_buff

                                        if buff not in unescaped_bytes:
                                            temp_path_name = None
                                            if ";base64," in buff[:100] and "data:" in buff[:100]:
                                                temp_path_name = f"obj{obj}_unb64_{buff_idx}.buff"
                                                try:
                                                    buff = b64decode(buff.split(";base64,")[1].strip())
                                                    temp_path = os.path.join(self.working_directory, temp_path_name)
                                                    f = open(temp_path, "wb")
                                                    f.write(buff)
                                                    f.close()
                                                    f_list.append(temp_path)
                                                except Exception:
                                                    self.log.error("Found 'data:;base64, ' buffer "
                                                                   "but failed to base64 decode.")
                                                    temp_path_name = None

                                            if temp_path_name is not None:
                                                buff_cond = f" and was resubmitted as {temp_path_name}"
                                            else:
                                                buff_cond = ""
                                            buff_res = ResultSection(
                                                f"A {len(buff)} bytes buffer was found in the JavaScript "
                                                f"block{buff_cond}. Here are the first 256 bytes.",
                                                parent=js_res, body=hexdump(bytes(buff[:256], "utf-8")),
                                                body_format=BODY_FORMAT.MEMORY_DUMP)
                                            buff_res.set_heuristic(2)

                                processed_sc = []
                                sc_idx = 0
                                for sc in unescaped_bytes:
                                    if sc not in processed_sc:
                                        sc_idx += 1
                                        processed_sc.append(sc)

                                        try:
                                            sc = sc.decode("hex")
                                        except Exception:
                                            pass

                                        shell_score = 500
                                        temp_path_name = f"obj{obj}_unescaped_{sc_idx}.buff"

                                        shell_res = ResultSection(f"Unknown unescaped {len(sc)} bytes JavaScript "
                                                                  f"buffer (id: {sc_idx}) was resubmitted as "
                                                                  f"{temp_path_name}. Here are the first 256 bytes.",
                                                                  parent=js_res)
                                        shell_res.set_body(hexdump(sc[:256]), body_format=BODY_FORMAT.MEMORY_DUMP)

                                        temp_path = os.path.join(self.working_directory, temp_path_name)
                                        with open(temp_path, "wb") as f:
                                            f.write(sc)
                                        f_list.append(temp_path)

                                        cur_res.add_tag('file.behavior', "Unescaped JavaScript Buffer")
                                        shell_res.set_heuristic(6)
                                        score_modifier += shell_score

                            if score_modifier > 0:
                                res_list.append(cur_res)

                        elif cur_obj.type == "stream":
                            if cur_obj.isEncodedStream and cur_obj.filter is not None:
                                data = cur_obj.decodedStream
                                encoding = (cur_obj.filter.value
                                            .replace("[", "").replace("]", "")
                                            .replace("/", "").strip())
                            else:
                                data = cur_obj.rawStream
                                encoding = None

                            # Metadata shared by both branches
                            val = cur_obj.rawValue
                            otype = cur_obj.elements.get("/Type", None)
                            sub_type = cur_obj.elements.get("/Subtype", None)
                            length = cur_obj.elements.get("/Length", None)

                            if otype:
                                otype = otype.value.replace("/", "").lower()
                            if sub_type:
                                sub_type = sub_type.value.replace("/", "").lower()
                            if length:
                                length = length.value

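                            # Only dump embedded files large enough to be worth analyzing (> 4 KiB)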
                            if otype == "embeddedfile":
                                if len(data) > 4096:
                                    if encoding is not None:
                                        temp_encoding_str = f"_{encoding}"
                                    else:
                                        temp_encoding_str = ""

                                    cur_res = ResultSection(
                                        f'Embedded file found ({length} bytes) [obj: {obj} {version}] '
                                        f'and dumped for analysis {f"(Type: {otype}) " if otype is not None else ""}'
                                        f'{f"(SubType: {sub_type}) " if sub_type is not None else ""}'
                                        f'{f"(Encoded with {encoding})" if encoding is not None else ""}'
                                    )

                                    temp_path_name = f"EmbeddedFile_{obj}{temp_encoding_str}.obj"
                                    temp_path = os.path.join(self.working_directory, temp_path_name)
                                    with open(temp_path, "wb") as f:
                                        f.write(data)
                                    f_list.append(temp_path)

                                    cur_res.add_line(f"The EmbeddedFile object was saved as {temp_path_name}")
                                    res_list.append(cur_res)

                            elif otype not in BANNED_TYPES:
                                cur_res = ResultSection(
                                    f'Unknown stream found [obj: {obj} {version}] '
                                    f'{f"(Type: {otype}) " if otype is not None else ""}'
                                    f'{f"(SubType: {sub_type}) " if sub_type is not None else ""}'
                                    f'{f"(Encoded with {encoding})" if encoding is not None else ""}'
                                )
                                for line in val.splitlines():
                                    cur_res.add_line(line)

                                emb_res = ResultSection('First 256 bytes', parent=cur_res)
                                first_256 = data[:256]
                                if isinstance(first_256, str):
                                    first_256 = first_256.encode()
                                emb_res.set_body(hexdump(first_256), BODY_FORMAT.MEMORY_DUMP)
                                res_list.append(cur_res)
                        else:
                            pass

                file_res.add_section(res)

                for section in res_list:
                    file_res.add_section(section)

                if js_dump:
                    js_dump_res = ResultSection('Full JavaScript dump')

                    temp_js_dump = "javascript_dump.js"
                    temp_js_dump_path = os.path.join(self.working_directory, temp_js_dump)
                    # errors="replace" guards against unencodable characters; the old fallback
                    # left a str, which would break the SHA-1 and the binary write below
                    temp_js_dump_bin = "\n\n----\n\n".join(js_dump).encode("utf-8", errors="replace")
                    temp_js_dump_sha1 = hashlib.sha1(temp_js_dump_bin).hexdigest()
                    with open(temp_js_dump_path, "wb") as f:
                        f.write(temp_js_dump_bin)
                    f_list.append(temp_js_dump_path)

                    js_dump_res.add_line(f"The JavaScript dump was saved as {temp_js_dump}")
                    js_dump_res.add_line(f"The SHA-1 for the JavaScript dump is {temp_js_dump_sha1}")

                    js_dump_res.add_tag('file.pdf.javascript.sha1', temp_js_dump_sha1)
                    file_res.add_section(js_dump_res)

                for filename in f_list:
                    request.add_extracted(filename, os.path.basename(filename),
                                          f"Dumped from {os.path.basename(temp_filename)}")

            else:
                res = ResultSection("ERROR: Could not parse file with PeePDF.")
                file_res.add_section(res)
        finally:
            request.result = file_res
            try:
                del pdf_file
            except Exception:
                pass

            try:
                del pdf_parser
            except Exception:
                pass

            gc.collect()
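
For reference, a minimal sketch of the unescape() helper the JavaScript handling above relies on, assuming it mirrors JavaScript's unescape() (%uXXXX and %XX sequences) and returns an (error, buffer) pair where error == 0 means success. The name and return convention are inferred from the call site, not taken from the service's actual implementation.

import re
import struct

def unescape(buff):
    """Best-effort JavaScript-style unescape; returns (error, buffer)."""
    def _sub(match):
        token = match.group(0)
        if token.startswith("%u"):
            # %uXXXX is a 16-bit code unit; emit it little-endian, the byte
            # order shellcode decoders typically expect
            return struct.pack("<H", int(token[2:], 16)).decode("latin-1")
        return chr(int(token[1:], 16))  # %XX is a single byte

    try:
        return 0, re.sub(r"%u[0-9a-fA-F]{4}|%[0-9a-fA-F]{2}", _sub, buff)
    except Exception:
        return 1, buff
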
    def execute(self, request):
        parser = eml_parser.eml_parser.EmlParser(include_raw_body=True,
                                                 include_attachment_data=True)
        content_str = request.file_contents

        # Attempt conversion of potential Outlook file -> eml
        if request.file_type == "document/office/email":
            try:
                content_str = msg2eml(request.file_path).as_bytes()
            except Exception:
                # Try using mailparser to convert
                converted_path, _ = msgconvert(request.file_path)
                with open(converted_path, "rb") as fh:
                    content_str = fh.read()

        header_agg = {
            "From": set(),
            "To": set(),
            "Cc": set(),
            "Sent": set(),
            "Reply-To": set(),
            "Date": set()
        }
        # Assume this is an email saved in HTML format
        if request.file_type == "code/html":
            parsed_html = BeautifulSoup(content_str, "lxml")
            valid_headers = [
                "To:", "Cc:", "Sent:", "From:", "Subject:", "Reply-To:"
            ]

            if not parsed_html.body or not any(header in parsed_html.body.text
                                               for header in valid_headers):
                # Either a plain HTML document or one without a body; neither can be processed as an email
                request.result = Result()
                return

            # Can't trust 'Date' to determine the difference between HTML docs vs HTML emails
            valid_headers.append("Date:")

            html_email = email.message_from_bytes(content_str)
            generator_metadata_content = ""
            for meta in parsed_html.find_all("meta"):
                if meta.attrs.get("name", None) == "Generator":
                    generator_metadata_content = meta.attrs.get("content", "")
                    break

            # Process HTML emails generated from Outlook
            if generator_metadata_content == "Microsoft Word 15":
                paragraphs = parsed_html.body.find_all("p")
                # Likely an email that was exported with original email headers
                if paragraphs and any(header in paragraphs[0].text for header in valid_headers):
                    for p in paragraphs:
                        if any(valid_header in p.text
                               for valid_header in valid_headers):
                            h_key, h_value = p.text.replace(
                                "\xa0", "").replace("\r\n", " ").split(":", 1)
                            html_email[h_key] = h_value
                            # Subject line indicates the end of the email header, beginning of body
                            if "Subject" in p.text:
                                break
            # Process HTML emails from MS Exchange Server or missing top-level headers (aggregate headers)
            elif generator_metadata_content in ("Microsoft Word 15 (filtered medium)",
                                                "Microsoft Exchange Server", ""):
                subject = None
                for div in parsed_html.find_all("div"):
                    # Header information within divs
                    if (any(header in div.text for header in valid_headers)
                            and "WordSection1" not in div.attrs.get("class", [])):
                        # Usually expect headers to be \n separated in text output but check first
                        if "\n" in div.text:
                            for h in div.text.split("\n"):
                                if any(header in h for header in valid_headers):
                                    h_key, h_value = h.split(":", 1)

                                    # Implying some malformed message got mixed with the headers of another message
                                    if h_key not in valid_headers:
                                        for header in valid_headers:
                                            if header in h:
                                                h_key = header[:-1]

                                    # Use the latest message's subject (this maintains FW, RE, etc.)
                                    if h_key == "Subject" and not subject:
                                        subject = h_value
                                    elif h_key != "Subject":
                                        header_agg[h_key].add(h_value)

                        # Document was probably not well formatted, so we'll use the headers as delimiters
                        else:
                            header_offset_map = {}
                            # Determine the position of each header
                            for header in list(header_agg.keys()) + ["Subject"]:
                                if header in div.text:
                                    header_offset_map[div.text.index(header)] = header
                            # Use the positions and length of header name to determine an offset
                            sorted_keys = sorted(header_offset_map.keys())
                            for i, key in enumerate(sorted_keys):
                                header_name = header_offset_map[key]
                                offset = key + len(f"{header_name}: ")
                                value = (div.text[offset:sorted_keys[i + 1]]
                                         if i < len(sorted_keys) - 1
                                         else div.text[offset:])

                                if header_name == "Subject":
                                    subject = value
                                else:
                                    header_agg[header_name].add(value)

                # Assign aggregated info to email object
                if subject:
                    html_email["Subject"] = subject
                for key, value in header_agg.items():
                    html_email[key] = "; ".join(value)
            content_str = html_email.as_bytes()

        parsed_eml = parser.decode_email_bytes(content_str)
        result = Result()
        header = parsed_eml["header"]

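        # Only process the file as an email if it carries at least a From or To header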
        if "from" in header or "to" in header:
            all_uri = set()
            body_words = set(extract_passwords(header["subject"]))
            for body_counter, body in enumerate(parsed_eml["body"]):
                body_text = BeautifulSoup(body["content"], "lxml").text
                body_words.update(extract_passwords(body_text))
                if request.get_param("extract_body_text"):
                    fd, path = mkstemp()
                    with os.fdopen(fd, "w") as f:
                        f.write(body["content"])
                    request.add_extracted(path, "body_" + str(body_counter),
                                          "Body text")
                if "uri" in body:
                    for uri in body["uri"]:
                        all_uri.add(uri)
            # Words in the email body, used by extract to guess passwords
            request.temp_submission_data["email_body"] = list(body_words)

            kv_section = ResultSection("Email Headers",
                                       body_format=BODY_FORMAT.KEY_VALUE,
                                       parent=result)

            # Basic tags
            from_addr = header["from"].strip() if header.get("from") else None
            if from_addr and re.match(EMAIL_REGEX, from_addr):
                kv_section.add_tag("network.email.address", from_addr)
            for to in header["to"]:
                if re.match(EMAIL_REGEX, to.strip()):
                    kv_section.add_tag("network.email.address", to.strip())

            kv_section.add_tag("network.email.date",
                               str(header["date"]).strip())

            subject = header["subject"].strip() if header.get("subject") else None
            if subject:
                kv_section.add_tag("network.email.subject", subject)

            # Add CCs to body and tags
            if "cc" in header:
                for cc in header["cc"]:
                    if re.match(EMAIL_REGEX, cc.strip()):
                        kv_section.add_tag("network.email.address", cc.strip())
            # Add Message ID to body and tags
            if "message-id" in header["header"]:
                kv_section.add_tag("network.email.msg_id",
                                   header["header"]["message-id"][0].strip())

            # Add Tags for received IPs
            if "received_ip" in header:
                for ip in header["received_ip"]:
                    ip = ip.strip()
                    try:
                        if isinstance(ip_address(ip), IPv4Address):
                            kv_section.add_tag("network.static.ip", ip)
                    except ValueError:
                        pass

            # Add Tags for received Domains
            if "received_domain" in header:
                for dom in header["received_domain"]:
                    kv_section.add_tag("network.static.domain", dom.strip())

            # If we've found URIs, add them to a section
            if len(all_uri) > 0:
                uri_section = ResultSection("URIs Found:", parent=result)
                for uri in all_uri:
                    uri_section.add_line(uri)
                    uri_section.add_tag("network.static.uri", uri.strip())
                    parsed_url = urlparse(uri)
                    if parsed_url.hostname:
                        if re.match(IP_ONLY_REGEX, parsed_url.hostname):
                            uri_section.add_tag("network.static.ip", parsed_url.hostname)
                        else:
                            uri_section.add_tag("network.static.domain", parsed_url.hostname)

            # Bring all headers together...
            extra_header = header.pop("header", {})
            header.pop("received", None)
            header.update(extra_header)

            # Convert to common format
            header["date"] = [self.json_serial(header["date"])]

            # Replace with aggregated date(s) if any available
            if header_agg["Date"]:
                # Replace
                if any(default_date in header["date"]
                       for default_date in ["1970-01-01T00:00:00",
                                            "Thu, 01 Jan 1970 00:00:00 +0000"]):
                    header["date"] = list(header_agg["Date"])
                # Append
                else:
                    header["date"] += list(header_agg["Date"])
                for date in header_agg["Date"]:
                    kv_section.add_tag("network.email.date", str(date).strip())

            # Filter out useless headers from results
            self.log.debug(header.keys())
            for h in self.header_filter:
                header.pop(h, None)
            kv_section.set_body(json.dumps(header, default=self.json_serial))

            attachments_added = []
            if "attachment" in parsed_eml:
                attachments = parsed_eml["attachment"]
                for attachment in attachments:
                    fd, path = mkstemp()
                    with os.fdopen(fd, "wb") as f:
                        f.write(base64.b64decode(attachment["raw"]))
                    try:
                        if request.add_extracted(
                                path,
                                attachment["filename"],
                                "Attachment ",
                                safelist_interface=self.api_interface):
                            attachments_added.append(attachment["filename"])
                    except MaxExtractedExceeded:
                        self.log.warning(
                            f"Extract limit reached on attachments: "
                            f"{len(attachments) - len(attachments_added)} not added"
                        )
                        break
                ResultSection("Extracted Attachments:",
                              body="\n".join(attachments_added),
                              parent=result)

            if request.get_param("save_emlparser_output"):
                fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
                attachments = parsed_eml.get("attachment", [])
                # Remove raw attachments, all attachments up to MaxExtractedExceeded already extracted
                for attachment in attachments:
                    _ = attachment.pop("raw", None)
                with os.fdopen(fd, "w") as myfile:
                    myfile.write(
                        json.dumps(parsed_eml, default=self.json_serial))
                request.add_supplementary(
                    temp_path, "parsing.json",
                    "These are the raw results of running GOVCERT-LU's eml_parser"
                )
        else:
            self.log.warning(
                "emlParser could not parse EML; no useful information in result's headers"
            )

        request.result = result
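
The json_serial helper passed as default= above is not shown in this excerpt. A plausible minimal version, assuming its only job is to make the datetime values eml_parser produces safe for json.dumps; the fallback branch is an assumption:

import datetime

def json_serial(self, obj):
    """default= hook for json.dumps: render datetimes as ISO 8601 strings."""
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()
    # Assumption: anything else non-serializable is rendered as plain text
    return str(obj)
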
    def run_badging_analysis(self, apk_file: str, result: Result):
        badging_args = ['d', 'badging', apk_file]
        badging, errors = self.run_appt(badging_args)
        if not badging:
            return
        res_badging = ResultSection("Android application details")
        libs = []
        permissions = []
        components = []
        features = []
        pkg_version = None
        for line in badging.splitlines():
            if line.startswith("package:"):
                pkg_name = line.split("name='")[1].split("'")[0]
                pkg_version = line.split("versionCode='")[1].split("'")[0]
                res_badging.add_line(f"Package: {pkg_name} v.{pkg_version}")
                res_badging.add_tag('file.apk.pkg_name', pkg_name)
                res_badging.add_tag('file.apk.app.version', pkg_version)

            if line.startswith("sdkVersion:"):
                min_sdk = line.split(":'")[1][:-1]
                res_badging.add_line(f"Min SDK: {min_sdk}")
                res_badging.add_tag('file.apk.sdk.min', min_sdk)

            if line.startswith("targetSdkVersion:"):
                target_sdk = line.split(":'")[1][:-1]
                res_badging.add_line(f"Target SDK: {target_sdk}")
                res_badging.add_tag('file.apk.sdk.target', target_sdk)

            if line.startswith("application-label:"):
                label = line.split(":'")[1][:-1]
                res_badging.add_line(f"Default Label: {label}")
                res_badging.add_tag('file.apk.app.label', label)

            if line.startswith("launchable-activity:"):
                launch = line.split("name='")[1].split("'")[0]
                res_badging.add_line(f"Launchable activity: {launch}")
                res_badging.add_tag('file.apk.activity', launch)

            if line.startswith("uses-library-not-required:"):
                lib = line.split(":'")[1][:-1]
                if lib not in libs:
                    libs.append(lib)

            if line.startswith("uses-permission:") or line.startswith("uses-implied-permission:"):
                perm = line.split("name='")[1].split("'")[0]
                if perm not in permissions:
                    permissions.append(perm)

            if line.startswith("provides-component:"):
                component = line.split(":'")[1][:-1]
                if component not in components:
                    components.append(component)

            if "uses-feature:" in line or "uses-implied-feature:" in line:
                feature = line.split("name='")[1].split("'")[0]
                if feature not in features:
                    features.append(feature)

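        # Sanity-check the version code: implausibly low or high values trigger Heuristic 17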
        if pkg_version is not None:
            pkg_version = int(pkg_version)
            if pkg_version < 15:
                ResultSection("Package version is suspiciously low", parent=res_badging,
                              heuristic=Heuristic(17))
            elif pkg_version > 999999999:
                ResultSection("Package version is suspiciously high", parent=res_badging,
                              heuristic=Heuristic(17))

        if libs:
            res_lib = ResultSection("Libraries used", parent=res_badging)
            for lib in libs:
                res_lib.add_line(lib)
                res_lib.add_tag('file.apk.used_library', lib)

        if permissions:
            res_permissions = ResultSection("Permissions used", parent=res_badging)
            dangerous_permissions = []
            unknown_permissions = []
            for perm in permissions:
                if perm in ALL_ANDROID_PERMISSIONS:
                    if 'dangerous' in ALL_ANDROID_PERMISSIONS[perm]:
                        dangerous_permissions.append(perm)
                    else:
                        res_permissions.add_line(perm)
                        res_permissions.add_tag('file.apk.permission', perm)
                else:
                    unknown_permissions.append(perm)

            if len(set(permissions)) < len(permissions):
                ResultSection("Some permissions are defined more than once", parent=res_badging,
                              heuristic=Heuristic(18))

            if dangerous_permissions:
                res_dangerous_perm = ResultSection("Dangerous permissions used", parent=res_badging,
                                                   heuristic=Heuristic(4))
                for perm in dangerous_permissions:
                    res_dangerous_perm.add_line(perm)
                    res_dangerous_perm.add_tag('file.apk.permission', perm)

            if unknown_permissions:
                res_unknown_perm = ResultSection("Unknown permissions used", parent=res_badging,
                                                 heuristic=Heuristic(5))
                for perm in unknown_permissions:
                    res_unknown_perm.add_line(perm)
                    res_unknown_perm.add_tag('file.apk.permission', perm)

        if features:
            res_features = ResultSection("Features used", parent=res_badging)
            for feature in features:
                res_features.add_line(feature)
                res_features.add_tag('file.apk.feature', feature)

        if components:
            res_components = ResultSection("Components provided", parent=res_badging)
            for component in components:
                res_components.add_line(component)
                res_components.add_tag('file.apk.provides_component', component)

        result.add_section(res_badging)
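
run_appt() is assumed to be a thin wrapper around the Android SDK's aapt binary. A minimal sketch; the aapt_path attribute and the decoding policy are illustrative assumptions, not the service's actual code:

from subprocess import PIPE, Popen

def run_appt(self, args):
    """Run aapt with the given arguments and return (stdout, stderr) as text."""
    proc = Popen([self.aapt_path] + args, stdout=PIPE, stderr=PIPE)
    stdout, stderr = proc.communicate()
    # badging output is mostly ASCII; ignore undecodable bytes rather than fail
    return stdout.decode(errors="ignore"), stderr.decode(errors="ignore")
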
    def execute(self, request):
        """Main Module. See README for details."""
        max_size = self.config.get('MAX_PDF_SIZE', 3000000)
        request.result = result = Result()
        if os.path.getsize(request.file_path) < max_size or request.deep_scan:
            path = request.file_path
            working_dir = self.working_directory

            # CALL PDFID and identify all suspicious keyword streams
            additional_keywords = self.config.get('ADDITIONAL_KEYS', [])
            heur = deepcopy(self.config.get('HEURISTICS', []))
            all_errors = set()

            res_txt = "Main Document Results"
            res, contains_objstms, errors = self.analyze_pdf(
                request, res_txt, path, working_dir, heur, additional_keywords)
            result.add_section(res)

            all_errors.update(errors)

            # ObjStms: treat each ObjStm like a standalone PDF document
            if contains_objstms:
                objstm_files = self.analyze_objstm(path, working_dir,
                                                   request.deep_scan)
                obj_cnt = 1
                for osf in objstm_files:
                    parent_obj = os.path.basename(osf).split("_")[1]
                    res_txt = f"ObjStream Object {obj_cnt} from Parent Object {parent_obj}"
                    # Skip plugins that would always fire, since the service itself created this PDF
                    heur = [
                        x for x in heur
                        if 'plugin_suspicious_properties' not in x
                        and 'plugin_embeddedfile' not in x
                        and 'plugin_nameobfuscation' not in x
                    ]

                    res, contains_objstms, errors = self.analyze_pdf(
                        request,
                        res_txt,
                        osf,
                        working_dir,
                        heur,
                        additional_keywords,
                        get_malform=False)

                    obj_cnt += 1
                    result.add_section(res)

            if all_errors:
                erres = ResultSection(title_text="Errors Analyzing PDF")
                for e in all_errors:
                    erres.add_line(e)
                result.add_section(erres)

        else:
            section = ResultSection(
                f"PDF Analysis of the file was skipped because the "
                f"file is too big (limit is {max_size / 1000 / 1000} MB)."
            )
            section.set_heuristic(10)
            result.add_section(section)
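
The parent-object parse above (os.path.basename(osf).split("_")[1]) implies that analyze_objstm() encodes the parent object number as the second underscore-separated token of each output file name. A hypothetical naming helper consistent with that parse; the exact scheme is an assumption:

import os

def objstm_path(working_dir, parent_obj, idx):
    # e.g. "objstm_12_3": basename.split("_")[1] == "12" is the parent object
    return os.path.join(working_dir, f"objstm_{parent_obj}_{idx}")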