def test_handle_artefact(artefact, expected_result_section_title):
        from assemblyline_v4_service.common.dynamic_service_helper import SandboxOntology, Artefact
        from assemblyline_v4_service.common.result import ResultSection

        if artefact is None:
            with pytest.raises(Exception):
                SandboxOntology._handle_artefact(artefact, None)
            return

        expected_result_section = None
        if expected_result_section_title is not None:
            expected_result_section = ResultSection(
                expected_result_section_title)
            expected_result_section.add_tag("dynamic.process.file_name",
                                            artefact["path"])

        parent_result_section = ResultSection("blah")
        a = Artefact(name=artefact["name"],
                     path=artefact["path"],
                     description=artefact["description"],
                     to_be_extracted=artefact["to_be_extracted"])
        SandboxOntology._handle_artefact(a, parent_result_section)
        if len(parent_result_section.subsections) > 0:
            actual_result_section = parent_result_section.subsections[0]
        else:
            actual_result_section = None

        if expected_result_section is None and actual_result_section is None:
            assert True
        else:
            assert check_section_equality(actual_result_section,
                                          expected_result_section)
Example #2
def decoded_result(text: bytes) -> Optional[ResultSection]:
    """ Generates a ResultSection from floss decoded strings output section """
    lines = text.splitlines()
    lines[0] = b'Most likely decoding functions:'
    body = b'\n'.join(lines[:-1])

    strings = re.findall(rb'^\[[A-Z]+\]\s+0x[0-9A-F]+\s+(.+)',
                         body,
                         flags=re.M)
    if not strings:
        return None

    result = ResultSection('FLARE FLOSS Decoded Strings',
                           body_format=BODY_FORMAT.MEMORY_DUMP,
                           heuristic=Heuristic(1))
    assert result.heuristic
    ioc = False
    for string in strings:
        ioc = ioc_tag(string, result, just_network=len(strings) > 1000) or ioc
        result.add_tag('file.string.decoded', string[:75])
    if ioc:
        result.heuristic.add_signature_id('decoded_ioc')

    result.add_line(body.decode())
    return result
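A quick way to exercise decoded_result is a synthetic blob shaped the way the slicing above expects: a header line (overwritten), "[TAG]  0xADDR  string" rows, and a trailing summary line that lines[:-1] drops. The exact floss output layout is assumed here, and Heuristic lookups only resolve inside a running service.

# Hypothetical floss "decoded strings" output (format assumed, not verified).
sample = (b"2 strings decoded\n"
          b"[MAIN]  0x401000  http://site.example/stage2\n"
          b"[MAIN]  0x401234  cmd.exe /c whoami\n"
          b"trailing summary line")
section = decoded_result(sample)
if section is not None:
    print(section.body)  # MEMORY_DUMP body listing the decoded strings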
Example #3
    def _get_category_section(self, category: str,
                              tags: Iterator[AVClassTag]) -> ResultSection:
        """
        Gets a `ResultSection` for a list of tags from a single category.

        Result contains table with AVclass tag information in descending order by rank.

        :param category: Category of tags
        :param tags: Tags belonging to category
        :return: `ResultSection`
        """
        tags = sorted(tags, key=lambda t: t.rank, reverse=True)

        category_name, heur_id, tag_type = AVCLASS_CATEGORY[category]
        tag_table = [{
            'name': tag.name,
            'category': category_name,
            'path': tag.path,
            'rank': tag.rank
        } for tag in tags]

        section = ResultSection(
            f'AVclass extracted {len(tags)} {category_name} tags',
            body=json.dumps(tag_table),
            body_format=BODY_FORMAT.TABLE,
            heuristic=Heuristic(heur_id) if heur_id is not None else None)
        if tag_type is not None:
            for tag in tags:
                section.add_tag(tag_type, tag.name)

        return section
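The TABLE pattern above (a JSON-encoded list of row dicts as the section body) stands on its own; a minimal sketch with illustrative values, assuming the assemblyline_v4_service package is installed:

import json
from assemblyline_v4_service.common.result import BODY_FORMAT, ResultSection

# Every row shares the same keys; the UI renders them as table columns.
rows = [{'name': 'emotet', 'category': 'FAM', 'path': 'FAM:emotet', 'rank': 9},
        {'name': 'zeus', 'category': 'FAM', 'path': 'FAM:zeus', 'rank': 3}]
section = ResultSection('AVclass extracted 2 FAM tags',
                        body=json.dumps(rows),
                        body_format=BODY_FORMAT.TABLE)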
Example #4
 def additional_parsing(self, file_path: str) -> Optional[ResultSection]:
     urls = set()
     try:
         with pikepdf.open(file_path) as pdf:
             num_pages = len(pdf.pages)
             for page in pdf.pages:
                 if '/Annots' not in page:
                     continue
                 for annot in page['/Annots'].as_list():
                     if annot.get('/Subtype') == '/Link':
                         if '/A' not in annot:
                             continue
                         _url = annot['/A'].get('/URI')
                         if not hasattr(_url, '__str__'):
                             continue
                         url = str(_url)
                         if re.match(FULL_URI, url):
                             urls.add(url)
         if not urls:
             return None
         patterns = PatternMatch()
         body = '\n'.join(urls)
         tags: dict[str, set[bytes]] = patterns.ioc_match(body.encode())
         result = ResultSection(
             'URL in Annotations',
             heuristic=Heuristic(
                 27, signature='one_page' if num_pages == 1 else None),
             body=body)
         for ty, vals in tags.items():
             for val in vals:
                 result.add_tag(ty, val)
         return result
     except Exception as e:
         self.log.warning(f'pikepdf failed to parse sample: {e}')
         return None
Example #5
    def add_ip_tags(self):
        """
        Adds tags for urls and ip addresses from given lists
        """

        if self.url_list or self.ip_list:
            sec_iocs = ResultSection(
                "ViperMonkey has found the following IOCs:",
                parent=self.result,
                heuristic=Heuristic(4))

            # Add Urls
            for url in set(self.url_list):
                sec_iocs.add_line(url)
                sec_iocs.add_tag('network.static.uri', url)
                try:
                    parsed = urlparse(url)
                    if not re.match(IP_ONLY_REGEX, parsed.hostname):
                        sec_iocs.add_tag('network.static.domain',
                                         parsed.hostname)

                except Exception:
                    pass

            # Add IPs
            for ip in set(self.ip_list):
                sec_iocs.add_line(ip)
                # Checking if IP ports also found and adding the corresponding tags
                if re.findall(":", ip):
                    net_ip, net_port = ip.split(':')
                    sec_iocs.add_tag('network.static.ip', net_ip)
                    sec_iocs.add_tag('network.port', net_port)
                else:
                    sec_iocs.add_tag('network.static.ip', ip)
Example #6
    def run_strings_analysis(self, apk_file, result: Result):
        string_args = ['d', 'strings', apk_file]
        strings, _ = self.run_appt(string_args)
        if not strings or strings == "String pool is unitialized.\n":
            ResultSection("No strings found in APK", body="This is highly unlikely and most-likely malicious.",
                          parent=result, heuristic=Heuristic(6))
        else:
            res_strings = ResultSection("Strings Analysis", parent=result)

            config_args = ['d', 'configurations', apk_file]
            configs, _ = self.run_appt(config_args)
            languages = []
            for line in configs.splitlines():
                config = line.upper()
                if config in ISO_LOCALES:
                    languages.append(config)
                    res_strings.add_tag('file.apk.locale', config)

            data_line = strings.split("\n", 1)[0]
            count = int(data_line.split(" entries")[0].rsplit(" ", 1)[1])
            styles = int(data_line.split(" styles")[0].rsplit(" ", 1)[1])
            if count < 50:
                ResultSection("Low volume of strings, this is suspicious.", parent=res_strings,
                              body_format=BODY_FORMAT.MEMORY_DUMP, body=safe_str(strings), heuristic=Heuristic(7))

            if len(languages) < 2:
                ResultSection("This app is not built for multiple languages. This is unlikely.",
                              parent=res_strings, heuristic=Heuristic(8))

            res_strings.add_line(f"Total string count: {count}")
            res_strings.add_line(f"Total styles: {styles}")
            if languages:
                res_strings.add_line(f"Languages: {', '.join(languages)}")
Example #7
    def recurse_add_res(self, file_res, res_list, new_files, parent=None):
        for res_dic in res_list:
            # Check if condition is OK
            if self.pass_condition(res_dic.get("condition", None)):
                res = ResultSection(res_dic['title_text'],
                                    classification=res_dic.get('classification', Classification.UNRESTRICTED),
                                    parent=parent, body_format=res_dic.get('body_format', BODY_FORMAT.TEXT))
                heur_id = self.heuristic_alteration(res_dic.get('score_condition', None), res_dic['heur_id'])
                res.set_heuristic(heur_id)

                # Add Tags
                tags = res_dic.get('tags', [])
                for res_tag in tags:
                    res.add_tag(res_tag[0], res_tag[1])

                # Add body
                body = res_dic.get('body', None)
                if body:
                    res.set_body(body)

                # File for resubmit
                files = res_dic.get('files', [])
                for res_file in files:
                    if isinstance(res_file, tuple):
                        res_file = res_file[1]
                    new_files.append(res_file)

                # Add to file res if root result
                if parent is None:
                    file_res.add_section(res)
Example #8
    def _create_random_section(self):
        # choose a random body format
        body_format = random.choice(FORMAT_LIST)

        # create a section with a random title
        section = ResultSection(get_random_phrase(3, 7), body_format=body_format)

        # choose random amount of lines in the body
        for _ in range(1, 5):
            # generate random line
            section.add_line(get_random_phrase(5, 10))

        # choose random amount of tags
        tags = flatten(get_random_tags())
        for key, val in tags.items():
            for v in val:
                section.add_tag(key, v)

        # set a heuristic a third of the time
        if random.choice([False, False, True]):
            section.set_heuristic(random.randint(1, 4))

        # Create random sub-sections
        if random.choice([False, False, True]):
            section.add_subsection(self._create_random_section())

        return section
Example #9
def _validate_tag(
    result_section: ResultSection,
    tag: str,
    value: Any,
    safelist: Optional[Dict[str, Dict[str, List[str]]]] = None
) -> bool:
    """
    This method validates the value relative to the tag type before adding the value as a tag to the ResultSection.
    :param result_section: The ResultSection that the tag will be added to
    :param tag: The tag type that the value will be tagged under
    :param value: The item that will be tagged under the tag type
    :param safelist: The safelist containing matches and regexes. The product of a service using self.get_api_interface().get_safelist().
    :return: Tag was successfully added
    """
    if safelist is None:
        safelist = {}

    regex = _get_regex_for_tag(tag)
    if regex and not match(regex, value):
        return False

    if "ip" in tag and not is_valid_ip(value):
        return False

    if "domain" in tag:
        if not is_valid_domain(value):
            return False
        elif value in FALSE_POSITIVE_DOMAINS_FOUND_IN_PATHS:
            return False
        elif isinstance(value, str) and value.split(".")[-1] in COMMON_FILE_EXTENSIONS:
            return False

    if is_tag_safelisted(value, [tag], safelist):
        return False

    # if "uri" is in the tag, let's try to extract its domain/ip and tag it.
    if "uri_path" not in tag and "uri" in tag:
        # First try to get the domain
        valid_domain = False
        domain = search(DOMAIN_REGEX, value)
        if domain:
            domain = domain.group()
            valid_domain = _validate_tag(result_section, "network.dynamic.domain", domain, safelist)
        # Then try to get the IP
        valid_ip = False
        ip = search(IP_REGEX, value)
        if ip:
            ip = ip.group()
            valid_ip = _validate_tag(result_section, "network.dynamic.ip", ip, safelist)

        if value not in [domain, ip] and (valid_domain or valid_ip):
            result_section.add_tag(tag, safe_str(value))
        else:
            return False
    else:
        result_section.add_tag(tag, safe_str(value))

    return True
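A hypothetical call site for _validate_tag: for a uri-type tag the helper only keeps the URI when it can also extract and validate an embedded domain or IP, tagging those as a side effect. This sketch assumes the module-level regexes and helpers the function references are importable.

from assemblyline_v4_service.common.result import ResultSection

section = ResultSection("Network IOCs")
# Illustrative value; on success both the URI and "network.dynamic.domain"
# for site.example end up in section.tags.
if _validate_tag(section, "network.dynamic.uri", "http://site.example/payload"):
    print(section.tags)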
Example #10
    def execute(self, request):
        result = Result()
        url = request.task.metadata.get('submitted_url')
        api_key = request.get_param("api_key")
        public = request.get_param("public")

        u = UrlScan(apikey=api_key, url=url, public=public)
        u.submit()

        # We need to wait for the API to process our request
        response = self.wait_processing(u)

        # We get the response parts that we want and merge them all together
        report = {
            **response.json()["verdicts"]["overall"],
            **response.json()["lists"],
            **response.json()["page"]
        }

        # We convert the "certicates" section from a list of dictionnaries to a dictionnary of lists
        certificates = report.pop("certificates")
        certificates = {
            k: [dic[k] for dic in certificates]
            for k in certificates[0]
        }

        # We add the converted section to the report
        report = {**report, **certificates}

        # We create the KEY_VALUE section to add the report to the result page
        kv_section = ResultSection("Urlscan.io report",
                                   body_format=BODY_FORMAT.KEY_VALUE,
                                   body=json.dumps(report))

        for domain in report["domains"]:
            kv_section.add_tag("network.static.domain", domain.strip())

        result.add_section(kv_section)

        # We get the preview of the website
        screenshot = u.getScreenshot()
        with open(self.working_directory + "/preview.png", "wb") as ofile:
            ofile.write(screenshot)

        # Adding the preview on the result page
        url_section = ResultSection(
            'Urlscan.io website screenshot',
            body_format=BODY_FORMAT.URL,
            body=json.dumps({
                "name": "The preview is also available here !",
                "url": response.json()["task"]["screenshotURL"]
            }))
        result.add_section(url_section)
        request.add_extracted(self.working_directory + "/preview.png",
                              "preview.png", "Here\'s the preview of the site")

        request.result = result
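The certificate reshaping in the middle of execute is a generic list-of-dicts to dict-of-lists transform; isolated below with illustrative field names (not urlscan.io's exact schema):

certificates = [{"subjectName": "a.example", "issuer": "CA-1"},
                {"subjectName": "b.example", "issuer": "CA-2"}]
# Keys come from the first entry; every entry is assumed to share them.
converted = {k: [cert[k] for cert in certificates] for k in certificates[0]}
# {'subjectName': ['a.example', 'b.example'], 'issuer': ['CA-1', 'CA-2']}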
Example #11
    def test_process_iocs(intezer_static_class_instance,
                          dummy_api_interface_class, mocker):
        from intezer_static import ALIntezerApi
        from intezer_sdk.api import IntezerApi
        from assemblyline_v4_service.common.result import ResultSection
        from requests import HTTPError
        mocker.patch.object(intezer_static_class_instance,
                            "get_api_interface",
                            return_value=dummy_api_interface_class)
        intezer_static_class_instance.start()
        parent_res_sec = ResultSection("blah")
        file_verdict_map = {}

        mocker.patch.object(ALIntezerApi,
                            "get_iocs",
                            return_value={
                                "files": [],
                                "network": []
                            })
        intezer_static_class_instance._process_iocs("blah", file_verdict_map,
                                                    parent_res_sec)
        assert parent_res_sec.subsections == []
        assert file_verdict_map == {}

        mocker.patch.object(IntezerApi,
                            "get_iocs",
                            side_effect=HTTPError("FORBIDDEN"))
        intezer_static_class_instance._process_iocs("blah", file_verdict_map,
                                                    parent_res_sec)
        assert parent_res_sec.subsections == []
        assert file_verdict_map == {}

        mocker.patch.object(ALIntezerApi,
                            "get_iocs",
                            return_value={
                                "files": [{
                                    "sha256": "blah",
                                    "verdict": "malicious"
                                }],
                                "network": [{
                                    "ioc": "1.1.1.1",
                                    "type": "ip"
                                }, {
                                    "ioc": "blah.com",
                                    "type": "domain"
                                }]
                            })
        intezer_static_class_instance._process_iocs("blah", file_verdict_map,
                                                    parent_res_sec)
        correct_res_sec = ResultSection("Network Communication Observed")
        correct_res_sec.add_tag("network.dynamic.ip", "1.1.1.1")
        correct_res_sec.add_tag("network.dynamic.domain", "blah.com")
        correct_res_sec.add_line("IOC: 1.1.1.1")
        correct_res_sec.add_line("IOC: blah.com")
        assert check_section_equality(parent_res_sec.subsections[0],
                                      correct_res_sec)
        assert file_verdict_map == {"blah": "malicious"}
Example #12
 def test_section_traverser(tags, correct_tags):
     from assemblyline_v4_service.common.section_reducer import _section_traverser
     from assemblyline_v4_service.common.result import ResultSection
     section = ResultSection("blah")
     subsection = ResultSection("subblah")
     for t_type, t_values in tags.items():
         for t_value in t_values:
             subsection.add_tag(t_type, t_value)
     section.add_subsection(subsection)
     assert _section_traverser(section).subsections[0].tags == correct_tags
Example #13
    def parse_plist(self, pdict):
        """Attempts to extract and identify all known and unknown keys of a plist file.

        Args:
            pdict: Plist dictionary item.

        Returns:
            A list of known keys and a list of unknown keys.
        """

        idenkey_sec = ResultSection("Identified Keys")
        unkkey_sec = ResultSection("Unidentified Keys:")

        # Sometimes the plist is a list of dictionaries, or just a list; merge/convert it into a single dict for now
        if isinstance(pdict, list):
            pdict = self.transform_dicts(pdict)

        for k, i in list(pdict.items()):
            # Prepare Keys
            k = str(safe_str(k))
            k_noipad = k.replace("~ipad", "")

            # Prepare values
            if i is None:
                i = [""]
            elif not isinstance(i, list):
                i = [i]

            # Many plist files are duplicates of info.plist, do not report on keys already identified
            if k_noipad in self.reported_keys:
                if i in self.reported_keys[k_noipad]:
                    continue
                self.reported_keys[k_noipad].append(i)
            else:
                self.reported_keys[k_noipad] = [i]

            # Process known keys
            if k_noipad in self.known_keys:
                desc, create_tag = self.known_keys[k_noipad]

                idenkey_sec.add_line(f"{k} ({desc}): {', '.join([safe_str(x, force_str=True) for x in i])}")
                if create_tag:
                    for val in i:
                        idenkey_sec.add_tag(TAG_MAP[k_noipad.upper()], safe_str(val, force_str=True))

            else:
                unkkey_sec.add_line(f"{k}: {', '.join([safe_str(x, force_str=True) for x in i])}")

        if idenkey_sec.body is None:
            idenkey_sec = None

        if unkkey_sec.body is None:
            unkkey_sec = None

        return idenkey_sec, unkkey_sec
Example #14
def ioc_tag(text: bytes,
            result: ResultSection,
            just_network: bool = False) -> bool:
    """ Tags iocs found in text to result

    text: text to search for iocs
    result: ResultSection to tag with iocs
    just_network: whether non-network iocs should be skipped

    returns: whether iocs are found
    """
    pattern = PatternMatch()
    ioc = pattern.ioc_match(text, bogon_ip=True, just_network=just_network)
    for kind, values in ioc.items():
        for val in values:
            result.add_tag(kind, val[:MAX_TAG_LEN])
    # Return whether any IOCs were found
    return bool(ioc)
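A minimal sketch of calling ioc_tag, assuming PatternMatch and MAX_TAG_LEN are importable as in the snippet's module:

from assemblyline_v4_service.common.result import ResultSection

section = ResultSection('Decoded Strings')
# True when ioc_match found anything; matching tags land on `section`.
found = ioc_tag(b'callback to http://198.51.100.7/gate.php', section)
print(found, section.tags)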
Example #15
    def parse_link(self, parent_res, path):
        with open(path, "rb") as fh:
            metadata = decode_lnk(fh.read())

        if metadata is None:
            return False

        body_output = {
            build_key(k): v
            for k, v in flatten(metadata).items() if v
        }
        res = ResultSection("Metadata extracted by parse_lnk",
                            body_format=BODY_FORMAT.KEY_VALUE,
                            body=json.dumps(body_output),
                            parent=parent_res)

        bp = metadata.get("BasePath", "").strip()
        rp = metadata.get("RELATIVE_PATH", "").strip()
        nn = metadata.get("NetName", "").strip()
        cla = metadata.get("COMMAND_LINE_ARGUMENTS", "").strip()
        s = BAD_LINK_RE.search(cla.lower())
        if s:
            res.set_heuristic(1)
        res.add_tag(tag_type="file.name.extracted",
                    value=(bp or rp or nn).rsplit("\\")[-1])
        res.add_tag(tag_type="dynamic.process.command_line",
                    value=f"{(rp or bp or nn)} {cla}".strip())

        for k, v in body_output.items():
            tag_type = TAG_MAP.get("LNK", {}).get(k, None) or \
                       TAG_MAP.get(None, {}).get(k, None)
            if tag_type:
                res.add_tag(tag_type, v)

        return True
Example #16
    def execute(self, request):
        qr = xqrcode.decode_from_file(request.file_path)
        if len(qr) > 0:
            result_url = qr[0]['data']
            result = Result()
            text_section = ResultSection('QR Code')
            text_section.add_line(result_url)
            result.add_section(text_section)

            url_section = ResultSection('url extracted',
                                        body_format=BODY_FORMAT.URL,
                                        body=json.dumps({
                                            "name": "QR Code Url",
                                            "url": f"{result_url}"
                                        }))

            url_section.add_tag("network.static.domain", result_url)
            result.add_section(url_section)

            request.result = result
        else:
            request.result = Result()
Example #17
    def _handle_artefact(artefact: Artefact = None,
                         artefacts_result_section: ResultSection = None):
        if artefact is None:
            raise Exception("Artefact cannot be None")

        # This is a dict whose key-value pairs follow the format {regex: result_section_title}
        artefact_map = {
            HOLLOWSHUNTER_EXE_REGEX:
            "HollowsHunter Injected Portable Executable",
            HOLLOWSHUNTER_SHC_REGEX: "HollowsHunter Shellcode",
            HOLLOWSHUNTER_DLL_REGEX: "HollowsHunter DLL",
        }
        artefact_result_section = None

        for regex, title in artefact_map.items():
            pattern = compile(regex)
            if pattern.match(artefact.name):
                artefact_result_section = ResultSection(title)
                artefact_result_section.add_tag("dynamic.process.file_name",
                                                artefact.path)

        if artefact_result_section is not None:
            artefacts_result_section.add_subsection(artefact_result_section)
Example #18
def subsection_builder(parent_section: ResultSection = None,
                       fields: dict = {}):
    for mwcp_field, mwcp_field_data in fields.items():
        if mwcp_field in FIELD_TAG_MAP:
            tag = FIELD_TAG_MAP[mwcp_field]
            table_body = []
            table_section = ResultSection(
                f"Extracted {mwcp_field.capitalize()}")
            if tag:
                for x in mwcp_field_data:
                    table_section.add_tag(tag, x)
                # Tag everything that we can
            # Add data to section body
            for line in mwcp_field_data:
                if type(line) is str:
                    table_body.append({mwcp_field: line})
                elif type(line) is list:
                    for item in line:
                        table_body.append({mwcp_field: item})
            table_section.set_body(body_format=BODY_FORMAT.TABLE,
                                   body=json.dumps(table_body))

            parent_section.add_subsection(table_section)
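A hypothetical invocation of subsection_builder; it presumes FIELD_TAG_MAP has an entry for the field name used (values are illustrative):

from assemblyline_v4_service.common.result import ResultSection

parent = ResultSection('MWCP configuration output')
# 'address' is assumed to be a FIELD_TAG_MAP key for this sketch.
subsection_builder(parent, {'address': ['10.0.0.1:443', '10.0.0.2:80']})
print([sub.title_text for sub in parent.subsections])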
Example #19
    def validate_certs(apktool_out_dir: str, result: Result):
        has_cert = False
        for root, _, files in os.walk(os.path.join(apktool_out_dir, "original", "META-INF")):
            for f in files:
                cur_file = os.path.join(root, f)
                stdout, stderr = Popen(["keytool", "-printcert", "-file", cur_file],
                                       stderr=PIPE, stdout=PIPE).communicate()
                stdout = safe_str(stdout)
                if stdout:
                    if "keytool error" not in stdout:
                        has_cert = True
                        issuer = ""
                        owner = ""
                        country = ""
                        valid_from = ""
                        valid_to = ""
                        valid_year_end = 0
                        valid_year_start = 0
                        valid_until_date = time.time()
                        play_store_min = 'Sat Oct 22 00:00:00 2033'
                        play_store_min_valid_date = time.mktime(time.strptime(play_store_min, "%a %b %d %H:%M:%S %Y"))

                        for line in stdout.splitlines():
                            if "Owner:" in line:
                                owner = line.split(": ", 1)[1]
                                country = owner.split("C=")
                                if len(country) != 1:
                                    country = country[1]
                                else:
                                    country = ""

                            if "Issuer:" in line:
                                issuer = line.split(": ", 1)[1]

                            if "Valid from:" in line:
                                valid_from = line.split(": ", 1)[1].split(" until:")[0]
                                valid_to = line.rsplit(": ", 1)[1]

                                valid_from_splitted = valid_from.split(" ")
                                valid_to_splitted = valid_to.split(" ")

                                valid_year_start = int(valid_from_splitted[-1])
                                valid_year_end = int(valid_to_splitted[-1])

                                valid_until = " ".join(valid_to_splitted[:-2] + valid_to_splitted[-1:])
                                valid_until_date = time.mktime(time.strptime(valid_until, "%a %b %d %H:%M:%S %Y"))

                        res_cert = ResultSection("Certificate Analysis", body=safe_str(stdout),
                                                 parent=result, body_format=BODY_FORMAT.MEMORY_DUMP)

                        res_cert.add_tag('cert.valid.start', valid_from)
                        res_cert.add_tag('cert.valid.end', valid_to)
                        res_cert.add_tag('cert.issuer', issuer)
                        res_cert.add_tag('cert.owner', owner)

                        if owner == issuer:
                            ResultSection("Certificate is self-signed", parent=res_cert,
                                          heuristic=Heuristic(10))

                        if not country:
                            ResultSection("Certificate owner has no country", parent=res_cert,
                                          heuristic=Heuristic(11))

                        if valid_year_start < 2008:
                            ResultSection("Certificate valid before first android release", parent=res_cert,
                                          heuristic=Heuristic(12))

                        if valid_year_start > valid_year_end:
                            ResultSection("Certificate expires before validity date starts", parent=res_cert,
                                          heuristic=Heuristic(16))

                        if (valid_year_end - valid_year_start) > 30:
                            ResultSection("Certificate valid more then 30 years", parent=res_cert,
                                          heuristic=Heuristic(13))

                        if valid_until_date < play_store_min_valid_date:
                            ResultSection("Certificate not valid until minimum valid playstore date", parent=res_cert,
                                          heuristic=Heuristic(20))

                        if country:
                            try:
                                int(country)
                                is_int_country = True
                            except Exception:
                                is_int_country = False

                            if len(country) != 2 or is_int_country:
                                ResultSection("Invalid country code in certificate owner", parent=res_cert,
                                              heuristic=Heuristic(14))

                        if f != "CERT.RSA":
                            ResultSection(f"Certificate name not using conventional name: {f}", parent=res_cert,
                                          heuristic=Heuristic(15))

        if not has_cert:
            ResultSection("This APK is not signed", parent=result, heuristic=Heuristic(9))
Example #20
    def run_badging_analysis(self, apk_file: str, result: Result):
        badging_args = ['d', 'badging', apk_file]
        badging, errors = self.run_appt(badging_args)
        if not badging:
            return
        res_badging = ResultSection("Android application details")
        libs = []
        permissions = []
        components = []
        features = []
        pkg_version = None
        for line in badging.splitlines():
            if line.startswith("package:"):
                pkg_name = line.split("name='")[1].split("'")[0]
                pkg_version = line.split("versionCode='")[1].split("'")[0]
                res_badging.add_line(f"Package: {pkg_name} v.{pkg_version}")
                res_badging.add_tag('file.apk.pkg_name', pkg_name)
                res_badging.add_tag('file.apk.app.version', pkg_version)

            if line.startswith("sdkVersion:"):
                min_sdk = line.split(":'")[1][:-1]
                res_badging.add_line(f"Min SDK: {min_sdk}")
                res_badging.add_tag('file.apk.sdk.min', min_sdk)

            if line.startswith("targetSdkVersion:"):
                target_sdk = line.split(":'")[1][:-1]
                res_badging.add_line(f"Target SDK: {target_sdk}")
                res_badging.add_tag('file.apk.sdk.target', target_sdk)

            if line.startswith("application-label:"):
                label = line.split(":'")[1][:-1]
                res_badging.add_line(f"Default Label: {label}")
                res_badging.add_tag('file.apk.app.label', label)

            if line.startswith("launchable-activity:"):
                launch = line.split("name='")[1].split("'")[0]
                res_badging.add_line(f"Launchable activity: {launch}")
                res_badging.add_tag('file.apk.activity', launch)

            if line.startswith("uses-library-not-required:"):
                lib = line.split(":'")[1][:-1]
                if lib not in libs:
                    libs.append(lib)

            if line.startswith("uses-permission:") or line.startswith("uses-implied-permission:"):
                perm = line.split("name='")[1].split("'")[0]
                if perm not in permissions:
                    permissions.append(perm)

            if line.startswith("provides-component:"):
                component = line.split(":'")[1][:-1]
                if component not in components:
                    components.append(component)

            if "uses-feature:" in line or "uses-implied-feature:" in line:
                feature = line.split("name='")[1].split("'")[0]
                if feature not in features:
                    features.append(feature)

        if pkg_version is not None:
            pkg_version = int(pkg_version)
            if pkg_version < 15:
                ResultSection("Package version is suspiciously low", parent=res_badging,
                              heuristic=Heuristic(17))
            elif pkg_version > 999999999:
                ResultSection("Package version is suspiciously high", parent=res_badging,
                              heuristic=Heuristic(17))

        if libs:
            res_lib = ResultSection("Libraries used", parent=res_badging)
            for lib in libs:
                res_lib.add_line(lib)
                res_lib.add_tag('file.apk.used_library', lib)

        if permissions:
            res_permissions = ResultSection("Permissions used", parent=res_badging)
            dangerous_permissions = []
            unknown_permissions = []
            for perm in permissions:
                if perm in ALL_ANDROID_PERMISSIONS:
                    if 'dangerous' in ALL_ANDROID_PERMISSIONS[perm]:
                        dangerous_permissions.append(perm)
                    else:
                        res_permissions.add_line(perm)
                        res_permissions.add_tag('file.apk.permission', perm)
                else:
                    unknown_permissions.append(perm)

            if len(set(permissions)) < len(permissions):
                ResultSection("Some permissions are defined more then once", parent=res_badging,
                              heuristic=Heuristic(18))

            if dangerous_permissions:
                res_dangerous_perm = ResultSection("Dangerous permissions used", parent=res_badging,
                                                   heuristic=Heuristic(4))
                for perm in dangerous_permissions:
                    res_dangerous_perm.add_line(perm)
                    res_dangerous_perm.add_tag('file.apk.permission', perm)

            if unknown_permissions:
                res_unknown_perm = ResultSection("Unknown permissions used", parent=res_badging,
                                                 heuristic=Heuristic(5))
                for perm in unknown_permissions:
                    res_unknown_perm.add_line(perm)
                    res_unknown_perm.add_tag('file.apk.permission', perm)

        if features:
            res_features = ResultSection("Features used", parent=res_badging)
            for feature in features:
                res_features.add_line(feature)
                res_features.add_tag('file.apk.feature', feature)

        if components:
            res_components = ResultSection("Components provided", parent=res_badging)
            for component in components:
                res_components.add_line(component)
                res_components.add_tag('file.apk.provides_component', component)

        result.add_section(res_badging)
Example #21
    def find_network_indicators(apktool_out_dir: str, result: Result):
        # Whitelist
        skip_list = [
            "android.intent",
            "com.google",
            "com.android",
        ]

        indicator_whitelist = [
            'google.to',
            'google.ttl',
            'google.delay',
            'google_tagmanager.db',
            'gtm_urls.db',
            'gtm.url',
            'google_tagmanager.db',
            'google_analytics_v4.db',
            'Theme.Dialog.Alert',
            'popupLocationInfo.gravity',
            'popupLocationInfo.displayId',
            'popupLocationInfo.left',
            'popupLocationInfo.top',
            'popupLocationInfo.right',
            'popupLocationInfo.bottom',
            'googleads.g.doubleclick.net',
            'ad.doubleclick.net',
            '.doubleclick.net',
            '.googleadservices.com',
            '.googlesyndication.com',
            'android.hardware.type.watch',
            'mraid.js',
            'google_inapp_purchase.db',
            'mobileads.google.com',
            'mobileads.google.com',
            'share_history.xml',
            'share_history.xml',
            'activity_choser_model_history.xml',
            'FragmentPager.SavedState{',
            'android.remoteinput.results',
            'android.people',
            'android.picture',
            'android.icon',
            'android.text',
            'android.title',
            'android.title.big',
            'FragmentTabHost.SavedState{',
            'android.remoteinput.results',
            'android.remoteinput.results',
            'android.remoteinput.results',
            'libcore.icu.ICU',
        ]

        file_list = []

        # Indicators
        url_list = []
        domain_list = []
        ip_list = []
        email_list = []

        # Build dynamic whitelist
        smali_dir = os.path.join(apktool_out_dir, "smali")
        for root, dirs, files in os.walk(smali_dir):
            if not files:
                continue
            else:
                skip_list.append(root.replace(smali_dir + "/", "").replace("/", "."))

            for cdir in dirs:
                skip_list.append(os.path.join(root, cdir).replace(smali_dir + "/", "").replace("/", "."))

        asset_dir = os.path.join(apktool_out_dir, "assets")
        if os.path.exists(asset_dir):
            for root, dirs, files in os.walk(asset_dir):
                if not files:
                    continue
                else:
                    for asset_file in files:
                        file_list.append(asset_file)
        skip_list = list(set(skip_list))

        # Find indicators
        proc = Popen(['grep', '-ER', r'(([[:alpha:]](-?[[:alnum:]])*)\.)*[[:alpha:]](-?[[:alnum:]])+\.[[:alpha:]]{2,}',
                      smali_dir], stdout=PIPE, stderr=PIPE)
        grep, _ = proc.communicate()
        for line in safe_str(grep).splitlines():
            file_path, line = line.split(":", 1)

            if "const-string" in line or "Ljava/lang/String;" in line:
                data = line.split("\"", 1)[1].split("\"")[0]
                data_low = data.lower()
                data_split = data.split(".")
                if data in file_list:
                    continue
                elif data in indicator_whitelist:
                    continue
                elif data.startswith("/"):
                    continue
                elif data_low.startswith("http://") or data_low.startswith('ftp://') or data_low.startswith('https://'):
                    url_list.append(data)
                elif len(data_split[0]) < len(data_split[-1]) and len(data_split[-1]) > 3:
                    continue
                elif data.startswith('android.') and data_low != data:
                    continue
                elif "/" in data and "." in data and data.index("/") < data.index("."):
                    continue
                elif " " in data:
                    continue
                elif data_split[0] in ['com', 'org', 'net', 'java']:
                    continue
                elif data_split[-1].lower() in ['so', 'properties', 'zip', 'read', 'id', 'store',
                                                'name', 'author', 'sh', 'soccer', 'fitness', 'news', 'video']:
                    continue
                elif data.endswith("."):
                    continue
                else:
                    do_skip = False
                    for skip in skip_list:
                        if data.startswith(skip):
                            do_skip = True
                            break

                    if do_skip:
                        continue

                    data = data.strip(".")

                    if is_valid_domain(data):
                        domain_list.append(data)
                    elif is_valid_ip(data):
                        ip_list.append(data)
                    elif is_valid_email(data):
                        email_list.append(data)

        url_list = list(set(url_list))
        for url in url_list:
            dom_ip = url.split("//")[1].split("/")[0]
            if ":" in dom_ip:
                dom_ip = dom_ip.split(":")[0]

            if is_valid_ip(dom_ip):
                ip_list.append(dom_ip)
            elif is_valid_domain(dom_ip):
                domain_list.append(dom_ip)

        ip_list = list(set(ip_list))
        domain_list = list(set(domain_list))
        email_list = list(set(email_list))

        if url_list or ip_list or domain_list or email_list:
            res_net = ResultSection("Network indicator(s) found", parent=result, heuristic=Heuristic(3))

            if url_list:
                res_url = ResultSection("Found urls in the decompiled code", parent=res_net)
                count = 0
                for url in url_list:
                    count += 1
                    if count <= 20:
                        res_url.add_line(url)
                    res_url.add_tag('network.static.uri', url)
                if count > 20:
                    res_url.add_line(f"and {count - 20} more...")

            if ip_list:
                res_ip = ResultSection("Found IPs in the decompiled code", parent=res_net)
                count = 0
                for ip in ip_list:
                    count += 1
                    if count <= 20:
                        res_ip.add_line(ip)
                    res_ip.add_tag('network.static.ip', ip)
                if count > 20:
                    res_ip.add_line(f"and {count - 20} more...")

            if domain_list:
                res_domain = ResultSection("Found domains in the decompiled code", parent=res_net)
                count = 0
                for domain in domain_list:
                    count += 1
                    if count <= 20:
                        res_domain.add_line(domain)
                    res_domain.add_tag('network.static.domain', domain)
                if count > 20:
                    res_domain.add_line(f"and {count - 20} more...")

            if email_list:
                res_email = ResultSection("Found email addresses in the decompiled code", parent=res_net)
                count = 0
                for email in email_list:
                    count += 1
                    if count <= 20:
                        res_email.add_line(email)
                    res_email.add_tag('network.email.address', email)
                if count > 20:
                    res_email.add_line(f"and {count - 20} more...")
Example #22
 def _report_embedded_xdp(self, file_res, chunk_number, binary, leftover):
     res_section = ResultSection([f"Found {chunk_number}", "Embedded PDF (in XDP)"])
     res_section.set_heuristic(1)
     res_section.add_tag('file.behavior', "Embedded PDF (in XDP)")
     file_res.add_section(res_section)
Example #23
    def peepdf_analysis(self, temp_filename, file_content, request):
        file_res = Result()
        try:
            res_list = []
            # js_stream = []
            f_list = []
            js_dump = []

            pdf_parser = PDFParser()
            ret, pdf_file = pdf_parser.parse(temp_filename, True, False, file_content)
            if ret == 0:
                stats_dict = pdf_file.getStats()

                if ", ".join(stats_dict['Errors']) == "Bad PDF header, %%EOF not found, PDF sections not found, No " \
                                                      "indirect objects found in the body":
                    # Not a PDF
                    return

                json_body = dict(
                    version=stats_dict['Version'],
                    binary=stats_dict['Binary'],
                    linearized=stats_dict['Linearized'],
                    encrypted=stats_dict['Encrypted'],
                )

                if stats_dict['Encryption Algorithms']:
                    temp = []
                    for algorithmInfo in stats_dict['Encryption Algorithms']:
                        temp.append(f"{algorithmInfo[0]} {str(algorithmInfo[1])} bits")
                    json_body["encryption_algorithms"] = temp

                json_body.update(dict(
                    updates=stats_dict['Updates'],
                    objects=stats_dict['Objects'],
                    streams=stats_dict['Streams'],
                    comments=stats_dict['Comments'],
                    errors={True: ", ".join(stats_dict['Errors']),
                            False: "None"}[len(stats_dict['Errors']) != 0]
                ))
                res = ResultSection("PDF File Information", body_format=BODY_FORMAT.KEY_VALUE,
                                    body=json.dumps(json_body))

                for version in range(len(stats_dict['Versions'])):
                    stats_version = stats_dict['Versions'][version]
                    v_json_body = dict(
                        catalog=stats_version['Catalog'] or "no",
                        info=stats_version['Info'] or "no",
                        objects=self.list_first_x(stats_version['Objects'][1]),
                    )

                    if stats_version['Compressed Objects'] is not None:
                        v_json_body['compressed_objects'] = self.list_first_x(stats_version['Compressed Objects'][1])

                    if stats_version['Errors'] is not None:
                        v_json_body['errors'] = self.list_first_x(stats_version['Errors'][1])

                    v_json_body['streams'] = self.list_first_x(stats_version['Streams'][1])

                    if stats_version['Xref Streams'] is not None:
                        v_json_body['xref_streams'] = self.list_first_x(stats_version['Xref Streams'][1])

                    if stats_version['Object Streams'] is not None:
                        v_json_body['object_streams'] = self.list_first_x(stats_version['Object Streams'][1])

                    if int(stats_version['Streams'][0]) > 0:
                        v_json_body['encoded'] = self.list_first_x(stats_version['Encoded'][1])
                        if stats_version['Decoding Errors'] is not None:
                            v_json_body['decoding_errors'] = self.list_first_x(stats_version['Decoding Errors'][1])

                    if stats_version['Objects with JS code'] is not None:
                        v_json_body['objects_with_js_code'] = \
                            self.list_first_x(stats_version['Objects with JS code'][1])
                        # js_stream.extend(stats_version['Objects with JS code'][1])

                    res_version = ResultSection(f"Version {str(version)}", parent=res,
                                                body_format=BODY_FORMAT.KEY_VALUE, body=json.dumps(v_json_body))

                    actions = stats_version['Actions']
                    events = stats_version['Events']
                    vulns = stats_version['Vulns']
                    elements = stats_version['Elements']
                    is_suspicious = False
                    if events is not None or actions is not None or vulns is not None or elements is not None:
                        res_suspicious = ResultSection('Suspicious elements', parent=res_version)
                        if events is not None:
                            for event in events:
                                res_suspicious.add_line(f"{event}: {self.list_first_x(events[event])}")
                            is_suspicious = True
                        if actions is not None:
                            for action in actions:
                                res_suspicious.add_line(f"{action}: {self.list_first_x(actions[action])}")
                            is_suspicious = True
                        if vulns is not None:
                            for vuln in vulns:
                                if vuln in vulnsDict:
                                    temp = [vuln, ' (']
                                    for vulnCVE in vulnsDict[vuln]:
                                        if len(temp) != 2:
                                            temp.append(',')
                                            vulnCVE = "".join(vulnCVE) if isinstance(vulnCVE, list) else vulnCVE
                                            temp.append(vulnCVE)
                                            cve_found = re.search("CVE-[0-9]{4}-[0-9]{4}", vulnCVE)
                                            if cve_found:
                                                res_suspicious.add_tag('attribution.exploit',
                                                                       vulnCVE[cve_found.start():cve_found.end()])
                                                res_suspicious.add_tag('file.behavior',
                                                                       vulnCVE[cve_found.start():cve_found.end()])
                                    temp.append('): ')
                                    temp.append(str(vulns[vuln]))
                                    res_suspicious.add_line(temp)
                                else:
                                    res_suspicious.add_line(f"{vuln}: {str(vulns[vuln])}")
                                is_suspicious = True
                        if elements is not None:
                            for element in elements:
                                if element in vulnsDict:
                                    temp = [element, ' (']
                                    for vulnCVE in vulnsDict[element]:
                                        if len(temp) != 2:
                                            temp.append(',')
                                        vulnCVE = "".join(vulnCVE) if isinstance(vulnCVE, list) else vulnCVE
                                        temp.append(vulnCVE)
                                        cve_found = re.search("CVE-[0-9]{4}-[0-9]{4}", vulnCVE)
                                        if cve_found:
                                            res_suspicious.add_tag('attribution.exploit',
                                                                   vulnCVE[cve_found.start():cve_found.end()])
                                            res_suspicious.add_tag('file.behavior',
                                                                   vulnCVE[cve_found.start():cve_found.end()])
                                    temp.append('): ')
                                    temp.append(str(elements[element]))
                                    res_suspicious.add_line(temp)
                                    is_suspicious = True
                                else:
                                    res_suspicious.add_line(f"\t\t{element}: {str(elements[element])}")
                                    is_suspicious = True
                    if is_suspicious:
                        res_suspicious.set_heuristic(8)

                    urls = stats_version['URLs']
                    if urls is not None:
                        res.add_line("")
                        res_url = ResultSection('Found URLs', parent=res)
                        for url in urls:
                            res_url.add_line(f"\t\t{url}")
                            res_url.set_heuristic(9)

                    for obj in stats_version['Objects'][1]:
                        cur_obj = pdf_file.getObject(obj, version)

                        if cur_obj.containsJScode:
                            cur_res = ResultSection(f"Object [{obj} {version}] contains {len(cur_obj.JSCode)} "
                                                    f"block of JavaScript")
                            score_modifier = 0

                            js_idx = 0
                            for js in cur_obj.JSCode:
                                sub_res = ResultSection('Block of JavaScript', parent=cur_res)
                                js_idx += 1
                                js_score = 0
                                js_code, unescaped_bytes, _, _, _ = analyseJS(js)

                                js_dump += [x for x in js_code]

                                # Malicious characteristics
                                big_buffs = self.get_big_buffs("".join(js_code))
                                if len(big_buffs) == 1:
                                    js_score += 500 * len(big_buffs)
                                if len(big_buffs) > 0:
                                    js_score += 500 * len(big_buffs)
                                has_eval, has_unescape = self.check_dangerous_func("".join(js_code))
                                if has_unescape:
                                    js_score += 100
                                if has_eval:
                                    js_score += 100

                                js_cmt = ""
                                if has_eval or has_unescape or len(big_buffs) > 0:
                                    score_modifier += js_score
                                    js_cmt = "Suspiciously malicious "
                                    cur_res.add_tag('file.behavior', "Suspicious JavaScript in PDF")
                                    sub_res.set_heuristic(7)
                                js_res = ResultSection(f"{js_cmt}JavaScript Code (block: {js_idx})", parent=sub_res)

                                if js_score > 0:
                                    temp_js_outname = f"object{obj}-{version}_{js_idx}.js"
                                    temp_js_path = os.path.join(self.working_directory, temp_js_outname)
                                    temp_js_bin = "".join(js_code).encode("utf-8")
                                    f = open(temp_js_path, "wb")
                                    f.write(temp_js_bin)
                                    f.close()
                                    f_list.append(temp_js_path)

                                    js_res.add_line(f"The JavaScript block was saved as {temp_js_outname}")
                                    if has_eval or has_unescape:
                                        analysis_res = ResultSection("[Suspicious Functions]", parent=js_res)
                                        if has_eval:
                                            analysis_res.add_line("eval: This JavaScript block uses eval() function "
                                                                  "which is often used to launch deobfuscated "
                                                                  "JavaScript code.")
                                            analysis_res.set_heuristic(3)
                                        if has_unescape:
                                            analysis_res.add_line("unescape: This JavaScript block uses unescape() "
                                                                  "function. It may be legitimate but it is definitely "
                                                                  "suspicious since malware often use this to "
                                                                  "deobfuscate code blocks.")
                                            analysis_res.set_heuristic(3)

                                    buff_idx = 0
                                    for buff in big_buffs:
                                        buff_idx += 1
                                        error, new_buff = unescape(buff)
                                        if error == 0:
                                            buff = new_buff

                                        if buff not in unescaped_bytes:
                                            temp_path_name = None
                                            if ";base64," in buff[:100] and "data:" in buff[:100]:
                                                temp_path_name = f"obj{obj}_unb64_{buff_idx}.buff"
                                                try:
                                                    buff = b64decode(buff.split(";base64,")[1].strip())
                                                    temp_path = os.path.join(self.working_directory, temp_path_name)
                                                    with open(temp_path, "wb") as f:
                                                        f.write(buff)
                                                    f_list.append(temp_path)
                                                except Exception:
                                                    self.log.error("Found 'data:;base64,' buffer "
                                                                   "but failed to base64 decode.")
                                                    temp_path_name = None

                                            if temp_path_name is not None:
                                                buff_cond = f" and was resubmitted as {temp_path_name}"
                                            else:
                                                buff_cond = ""
                                            # buff is bytes after a successful base64 decode, str otherwise
                                            buff_bytes = (buff if isinstance(buff, bytes)
                                                          else buff.encode("utf-8", "replace"))
                                            buff_res = ResultSection(
                                                f"A {len(buff)}-byte buffer was found in the JavaScript "
                                                f"block{buff_cond}. Here are the first 256 bytes.",
                                                parent=js_res, body=hexdump(buff_bytes[:256]),
                                                body_format=BODY_FORMAT.MEMORY_DUMP)
                                            buff_res.set_heuristic(2)

                                processed_sc = []
                                sc_idx = 0
                                for sc in unescaped_bytes:
                                    if sc not in processed_sc:
                                        sc_idx += 1
                                        processed_sc.append(sc)

                                        try:
                                            # bytes.decode("hex") is Python 2 only; unhexlify
                                            # (assuming a module-level `import binascii`) is
                                            # the Python 3 equivalent
                                            sc = binascii.unhexlify(sc)
                                        except Exception:
                                            pass

                                        shell_score = 500
                                        temp_path_name = f"obj{obj}_unescaped_{sc_idx}.buff"

                                        shell_res = ResultSection(f"Unknown unescaped {len(sc)}-byte JavaScript "
                                                                  f"buffer (id: {sc_idx}) was resubmitted as "
                                                                  f"{temp_path_name}. Here are the first 256 bytes.",
                                                                  parent=js_res)
                                        shell_res.set_body(hexdump(sc[:256]), body_format=BODY_FORMAT.MEMORY_DUMP)

                                        temp_path = os.path.join(self.working_directory, temp_path_name)
                                        with open(temp_path, "wb") as f:
                                            f.write(sc)
                                        f_list.append(temp_path)

                                        cur_res.add_tag('file.behavior', "Unescaped JavaScript Buffer")
                                        shell_res.set_heuristic(6)
                                        score_modifier += shell_score

                            if score_modifier > 0:
                                res_list.append(cur_res)

                        elif cur_obj.type == "stream":
                            if cur_obj.isEncodedStream and cur_obj.filter is not None:
                                data = cur_obj.decodedStream
                                encoding = cur_obj.filter.value.replace(
                                    "[", "").replace("]", "").replace("/", "").strip()
                            else:
                                data = cur_obj.rawStream
                                encoding = None

                            val = cur_obj.rawValue
                            otype = cur_obj.elements.get("/Type", None)
                            sub_type = cur_obj.elements.get("/Subtype", None)
                            length = cur_obj.elements.get("/Length", None)

                            if otype:
                                otype = otype.value.replace("/", "").lower()
                            if sub_type:
                                sub_type = sub_type.value.replace("/", "").lower()
                            if length:
                                length = length.value

                            if otype == "embeddedfile":
                                if len(data) > 4096:
                                    if encoding is not None:
                                        temp_encoding_str = f"_{encoding}"
                                    else:
                                        temp_encoding_str = ""

                                    cur_res = ResultSection(
                                        f'Embedded file found ({length} bytes) [obj: {obj} {version}] '
                                        f'and dumped for analysis {f"(Type: {otype}) " if otype is not None else ""}'
                                        f'{f"(SubType: {sub_type}) " if sub_type is not None else ""}'
                                        f'{f"(Encoded with {encoding})" if encoding is not None else ""}'
                                    )

                                    temp_path_name = f"EmbeddedFile_{obj}{temp_encoding_str}.obj"
                                    temp_path = os.path.join(self.working_directory, temp_path_name)
                                    with open(temp_path, "wb") as f:
                                        f.write(data)
                                    f_list.append(temp_path)

                                    cur_res.add_line(f"The EmbeddedFile object was saved as {temp_path_name}")
                                    res_list.append(cur_res)

                            elif otype not in BANNED_TYPES:
                                cur_res = ResultSection(
                                    f'Unknown stream found [obj: {obj} {version}] '
                                    f'{f"(Type: {otype}) " if otype is not None else ""}'
                                    f'{f"(SubType: {sub_type}) " if sub_type is not None else ""}'
                                    f'{f"(Encoded with {encoding})" if encoding is not None else ""}'
                                )
                                for line in val.splitlines():
                                    cur_res.add_line(line)

                                emb_res = ResultSection('First 256 bytes', parent=cur_res)
                                first_256 = data[:256]
                                if isinstance(first_256, str):
                                    first_256 = first_256.encode()
                                emb_res.set_body(hexdump(first_256), BODY_FORMAT.MEMORY_DUMP)
                                res_list.append(cur_res)
                        else:
                            pass

                file_res.add_section(res)

                for results in res_list:
                    file_res.add_section(results)

                if js_dump:
                    js_dump_res = ResultSection('Full JavaScript dump')

                    temp_js_dump = "javascript_dump.js"
                    temp_js_dump_path = os.path.join(self.working_directory, temp_js_dump)
                    try:
                        temp_js_dump_bin = "\n\n----\n\n".join(js_dump).encode("utf-8")
                    except UnicodeEncodeError:
                        # Joining strings then encoding can only raise on encode;
                        # fall back to a lossy encode so the dump is still bytes
                        temp_js_dump_bin = "\n\n----\n\n".join(js_dump).encode("utf-8", "replace")
                    temp_js_dump_sha1 = hashlib.sha1(temp_js_dump_bin).hexdigest()
                    with open(temp_js_dump_path, "wb") as f:
                        f.write(temp_js_dump_bin)
                    f_list.append(temp_js_dump_path)

                    js_dump_res.add_line(f"The JavaScript dump was saved as {temp_js_dump}")
                    js_dump_res.add_line(f"The SHA-1 for the JavaScript dump is {temp_js_dump_sha1}")

                    js_dump_res.add_tag('file.pdf.javascript.sha1', temp_js_dump_sha1)
                    file_res.add_section(js_dump_res)

                for filename in f_list:
                    request.add_extracted(filename, os.path.basename(filename),
                                          f"Dumped from {os.path.basename(temp_filename)}")

            else:
                res = ResultSection("ERROR: Could not parse file with PeePDF.")
                file_res.add_section(res)
        finally:
            request.result = file_res
            try:
                del pdf_file
            except Exception:
                pass

            try:
                del pdf_parser
            except Exception:
                pass

            gc.collect()
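# Before the next example: the block above guards its base64 handling with a
# substring check on "data:" and ";base64,". A minimal standalone sketch of
# that pattern (extract_data_uri is a hypothetical helper, not part of the
# service, and it assumes the buffer is a str):
from base64 import b64decode
from typing import Optional


def extract_data_uri(buff: str) -> Optional[bytes]:
    # Only treat the buffer as a data URI if both markers appear early on
    if "data:" not in buff[:100] or ";base64," not in buff[:100]:
        return None
    try:
        return b64decode(buff.split(";base64,", 1)[1].strip())
    except Exception:
        # Mirror the service behaviour above: skip buffers that fail to decode
        return None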
Example #24
    def execute(self, request):
        request.result = Result()

        # 1. Calculate entropy map
        with open(request.file_path, 'rb') as fin:
            (entropy, part_entropies) = calculate_partition_entropy(fin)

        entropy_graph_data = {
            'type': 'colormap',
            'data': {
                'domain': [0, 8],
                'values': part_entropies
            }
        }

        ResultSection(f"File entropy: {round(entropy, 3)}",
                      parent=request.result,
                      body_format=BODY_FORMAT.GRAPH_DATA,
                      body=json.dumps(entropy_graph_data))
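        # As a rough illustration of what the imported helper computes (a
        # hypothetical sketch, not necessarily the real implementation), each
        # partition's Shannon entropy falls in the [0, 8] domain used above:
        #
        #     from collections import Counter
        #     from math import log2
        #
        #     def shannon_entropy(block: bytes) -> float:
        #         size = len(block)
        #         if not size:
        #             return 0.0
        #         return -sum((n / size) * log2(n / size)
        #                     for n in Counter(block).values())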

        if request.file_type == "meta/shortcut/windows":
            # 2. Parse windows shortcuts
            self.parse_link(request.result, request.file_path)
        else:
            # 3. Get hachoir metadata
            parser = createParser(request.file_path)
            if parser is not None:
                with parser:
                    tags = parser.getParserTags()
                    parser_id = tags.get('id', 'unknown')

                    # Do basic metadata extraction
                    metadata = extractMetadata(parser, 1)

                    if metadata:
                        kv_body = {}
                        tags = []
                        for m in metadata:
                            if m.key == "comment":
                                for v in m.values:
                                    key, val = get_type_val(v.text, "comment")
                                    if not val:
                                        continue

                                    kv_body[key] = val

                                    tag_type = TAG_MAP.get(parser_id, {}).get(key, None) or \
                                        TAG_MAP.get(None, {}).get(key, None)
                                    if tag_type is not None:
                                        tags.append((tag_type, val))
                            elif m.key in ["mime_type"]:
                                pass
                            else:
                                values = [v.text for v in m.values]
                                if len(values) == 1 and values[0]:
                                    kv_body[m.key] = values[0]
                                elif values:
                                    kv_body[m.key] = values

                                for v in values:
                                    tag_type = TAG_MAP.get(parser_id, {}).get(m.key, None) or \
                                        TAG_MAP.get(None, {}).get(m.key, None)
                                    if tag_type is not None:
                                        tags.append((tag_type, v))

                        if kv_body:
                            res = ResultSection(
                                f"Metadata extracted by hachoir-metadata [Parser: {parser_id}]",
                                body=json.dumps(kv_body),
                                body_format=BODY_FORMAT.KEY_VALUE,
                                parent=request.result)

                            for t_type, t_val in tags:
                                res.add_tag(t_type, t_val)

        # 4. Get Exiftool Metadata
        exif = subprocess.run(["exiftool", "-j", request.file_path],
                              capture_output=True,
                              check=False)
        if exif.stdout:
            exif_data = json.loads(exif.stdout.decode('utf-8',
                                                      errors="ignore"))
            res_data = exif_data[0]
            if "Error" not in res_data:
                exif_body = {
                    build_key(k): v
                    for k, v in res_data.items() if v and k not in [
                        "SourceFile", "ExifToolVersion", "FileName",
                        "Directory", "FileSize", "FileModifyDate",
                        "FileAccessDate", "FileInodeChangeDate",
                        "FilePermissions", "FileType", "FileTypeExtension",
                        "MIMEType"
                    ]
                }
                if exif_body:
                    e_res = ResultSection("Metadata extracted by ExifTool",
                                          body=json.dumps(exif_body),
                                          body_format=BODY_FORMAT.KEY_VALUE,
                                          parent=request.result)
                    for k, v in exif_body.items():
                        tag_type = TAG_MAP.get(res_data.get("FileTypeExtension", "UNK").upper(), {}).get(k, None) or \
                                   TAG_MAP.get(None, {}).get(k, None)
                        if tag_type:
                            e_res.add_tag(tag_type, v)
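        # For reference, "exiftool -j" prints a JSON array with one object per
        # input file, which is why exif_data[0] is taken above. A minimal
        # hypothetical sketch of the same call outside the service:
        #
        #     import json
        #     import subprocess
        #
        #     proc = subprocess.run(["exiftool", "-j", "/tmp/sample.bin"],
        #                           capture_output=True, check=False)
        #     if proc.stdout:
        #         metadata = json.loads(proc.stdout.decode("utf-8", "ignore"))[0]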
    def analyze_pdf(self,
                    request,
                    res_txt,
                    path,
                    working_dir,
                    heur,
                    additional_keywords,
                    get_malform=True):
        """Extract metadata, keyword objects and content of interest from a PDF sample using PDFId, PDFId plugins,
        and PDF Parser.

        Args:
            request: AL request object.
            res_txt: Header string for AL result section title.
            path: Original PDF sample path.
            working_dir: AL working directory.
            heur: List of plugins to run on PDFId results (provided in service configuration).
            additional_keywords: List of additional keywords to be searched (provided in service configuration).
            get_malform: Extract malformed objects from PDF.

        Returns:
            AL result object, a flag indicating whether object streams (objstms) were found, and a set of errors.
        """
        triage_keywords = set()
        all_errors = set()
        embed_present = False
        objstms = False
        res = ResultSection(title_text=res_txt)
        carved_extracted_shas = set()

        if request.deep_scan:
            run_pdfparse = True
        else:
            run_pdfparse = False

        # Run PDFId
        try:
            pdfid_result, errors = self.get_pdfid(path, additional_keywords,
                                                  heur, request.deep_scan)
        except Exception as e:
            raise NonRecoverableError(e)
        # Parse PDFId results
        pdfidres = ResultSection(title_text="PDFID Results", parent=res)
        if len(pdfid_result) == 0:
            pdfidres.add_line(
                "No results generated for file. Please see errors.")
        else:
            # Do not run for objstms, which are being analyzed when get_malform == False
            if get_malform:
                version = pdfid_result.get("PDFID", None)
                if version:
                    pdfidres.add_line(version[0])
                properties = pdfid_result.get("Properties", None)
                if properties:
                    pres = ResultSection(title_text="PDF Properties",
                                         parent=pdfidres)
                    for plist in properties:
                        pres.add_line("{0}: {1}".format(plist[0], plist[1]))
                        if plist[0] == "/ModDate":
                            pres.add_tag('file.pdf.date.modified', plist[1])
                        elif plist[0] == "/CreationDate":
                            pres.add_tag('file.date.creation', plist[1])
                        elif plist[0] == "/LastModified":
                            pres.add_tag('file.date.last_modified', plist[1])
                        elif plist[0] == "/SourceModified":
                            pres.add_tag('file.pdf.date.source_modified',
                                         plist[1])
                        elif plist[0] == "/pdfx":
                            pres.add_tag('file.pdf.date.pdfx', plist[1])
                entropy = pdfid_result.get("Entropy", None)
                if entropy:
                    enres = ResultSection(title_text="Entropy",
                                          parent=pdfidres)
                    for enlist in entropy:
                        enres.add_line("{0}: {1}, ({2})".format(
                            enlist[0], enlist[1], enlist[2]))
            flags = pdfid_result.get("Flags", None)
            if flags:
                fres = ResultSection(title_text="PDF Keyword Flags",
                                     parent=pdfidres)
                for flist in flags:
                    if flist[0] == "/ObjStm":
                        objstms = True
                    if len(flist) == 3:
                        fres.add_line(
                            "{0}:Count: {1}, Hex-Encoded Count: {2}".format(
                                flist[0], flist[1], flist[2]))
                    else:
                        fres.add_line("{0}:Count: {1}".format(
                            flist[0], flist[1]))
                    fres.add_tag('file.string.extracted',
                                 flist[0].replace("/", "", 1))
                    if flist[0] in additional_keywords:
                        triage_keywords.add(flist[0].replace("/", "", 1))

            plugin = pdfid_result.get("Plugin", [])

            # If any plugin results, or flagged keywords found, run PDF Parser
            if plugin or len(triage_keywords) > 0:
                run_pdfparse = True

            for pllist in plugin:
                pl_name, pl_heur, pl_text = pllist
                pl_heur = int(pl_heur)
                pl_text = pl_text[14:]
                if not pl_text or pl_text == "None":
                    continue

                if pl_name in ['EmbeddedFile', 'Name Obfuscation']:
                    modres = ResultSection(title_text=pl_text, parent=pdfidres)

                    if pl_heur > 0:
                        modres.set_heuristic(pl_heur)

                    if pl_name == 'EmbeddedFile':
                        embed_present = True

                elif pl_name in ['Triage', 'Suspicious Properties']:
                    javascript_found = False
                    for line in pl_text.splitlines():
                        lineres = ResultSection(title_text=line)
                        # Triage results
                        if '/JavaScript' in line:
                            triage_keywords.add('JavaScript')
                            if not javascript_found:
                                lineres.set_heuristic(19)
                                javascript_found = True
                        elif '/JS' in line:
                            triage_keywords.add('JS')
                            if not javascript_found:
                                lineres.set_heuristic(19)
                                javascript_found = True
                        elif '/JBIG2Decode' in line:
                            triage_keywords.add('JBIG2Decode')
                            lineres.set_heuristic(3)
                        elif '/Colors > 2^24' in line:
                            triage_keywords.add('Colors > 2^24')
                            lineres.set_heuristic(20)
                        elif '/AA' in line:
                            triage_keywords.add('AA')
                            lineres.set_heuristic(1)
                        elif '/Launch' in line:
                            triage_keywords.add('Launch')
                            lineres.set_heuristic(1)
                        elif '/OpenAction' in line:
                            triage_keywords.add('OpenAction')
                            lineres.set_heuristic(1)
                        elif '/GoToE' in line:
                            triage_keywords.add('GoToE')
                            lineres.set_heuristic(21)
                        elif '/GoToR' in line:
                            triage_keywords.add('GoToR')
                            lineres.set_heuristic(22)
                        elif '/Encrypt' in line:
                            triage_keywords.add('Encrypt')
                            lineres.set_heuristic(11)
                        elif '/AcroForm' in line:
                            triage_keywords.add('AcroForm')
                            lineres.set_heuristic(4)
                        elif '/RichMedia' in line:
                            triage_keywords.add('RichMedia')
                            lineres.set_heuristic(5)
                        elif '/XFA' in line:
                            triage_keywords.add('XFA')
                            lineres.set_heuristic(23)
                        elif '/Annot' in line:
                            triage_keywords.add('Annot')
                            lineres.set_heuristic(25)
                        elif '/ObjStm' in line:
                            triage_keywords.add('ObjStm')
                            lineres.set_heuristic(7)
                        elif '/URI' in line:
                            triage_keywords.add('URI')
                            lineres.set_heuristic(24)

                        # Suspicious properties results
                        elif "eof2" in line:
                            lineres.set_heuristic(2)
                        elif "eof5" in line:
                            lineres.set_heuristic(17)
                        elif "page" in line:
                            lineres.set_heuristic(26)
                        elif "entropy" in line:
                            lineres.set_heuristic(12)
                        elif "obj/endobj" in line:
                            lineres.set_heuristic(13)
                        elif "stream/endstream" in line:
                            lineres.set_heuristic(14)

                        if lineres.heuristic is not None:
                            pdfidres.add_subsection(lineres)

        for e in errors:
            all_errors.add(e)
            if e.startswith('Error running plugin'):
                self.log.warning(e)

        if run_pdfparse:
            # CALL PDF parser and extract further information
            pdf_parserres = ResultSection(title_text="PDF Parser Results")
            # STATISTICS
            # Do not run for objstms, which are being analyzed when get_malform == False
            if get_malform:
                options = {
                    "stats": True,
                }
                pdf_parser_result, errors = self.get_pdf_parser(
                    path, working_dir, options)

                if pdf_parser_result is not None:
                    if len(pdf_parser_result) == 0:
                        pdf_parserres.add_line(
                            "No statistical results generated for file. Please see errors."
                        )
                    else:
                        version = pdf_parser_result.get("version", None)
                        if version and version[0] != '0':
                            pdf_parserres.add_line(version[0])
                        stats = pdf_parser_result.get("stats", None)
                        if stats:
                            sres = ResultSection(
                                title_text="PDF Statistcs",
                                parent=pdf_parserres,
                                body_format=BODY_FORMAT.MEMORY_DUMP)
                            for p in stats:
                                sres.add_line(p)
                    for e in errors:
                        all_errors.add(e)

            # Triage plugin -- search sample for keywords and carve content or extract object (if it contains a stream)
            carved_content = {}  # Format: {"objnum": [{keyword: content}, ...]}
            obj_extract_triage = set()
            jbig_objs = set()

            for keyword in triage_keywords:
                # ObjStms handled differently
                if keyword == 'ObjStm':
                    continue

                options = {
                    "search": keyword,
                }
                pdf_parser_result, errors = self.get_pdf_parser(
                    path, working_dir, options)

                if pdf_parser_result:
                    for p in pdf_parser_result['parts']:
                        content = ""
                        references = []
                        # The trailer is always extracted; try to grab all references anyway -- this will be messy
                        if p.startswith("trailer:"):
                            # Grab the content after the keyword
                            # Check that keyword actually in content
                            if "/{}".format(keyword) in p:
                                try:
                                    content = p.split(keyword, 1)[1].replace(
                                        '>>++>>', '').split("/", 1)[0].strip()
                                    references = re.findall(
                                        "[0-9]* [0-9]* R", content)
                                except Exception:
                                    continue
                        # If not trailer, should be object
                        elif 'Referencing:' in p:
                            # Grab the content after the keyword
                            if '>>++>>' in p:
                                try:
                                    content = p.split(keyword, 1)[1].replace(
                                        '>>++>>', '').strip()
                                except Exception:
                                    try:
                                        content = p.split("\n", 3)[3]
                                    except Exception:
                                        content = p
                            else:
                                try:
                                    content = p.split("\n", 3)[3]
                                except Exception:
                                    content = p
                            # Sometimes the content is the same keyword followed by references (e.g. "/URI /URI 10 0 R")
                            if content.startswith("/{}".format(keyword)):
                                try:
                                    content = re.sub("/{}[ ]*".format(keyword),
                                                     "", content, 1)
                                except Exception:
                                    pass
                            try:
                                references = p.split("\n", 3)[2].replace(
                                    'Referencing:', '').strip().split(", ")
                            except Exception:
                                pass
                        # Only extract JBIG2Decode objects with deep scan, but always report on their presence
                        if keyword == "JBIG2Decode" and "/Filter" in p and "Contains stream" in p:
                            try:
                                objnum = p.split("\n", 1)[0].split(" ")[1]
                                if request.deep_scan:
                                    obj_extract_triage.add(objnum)
                                jbig_objs.add(objnum)
                                continue
                            except Exception as e:
                                self.log.debug(e)
                                continue
                        # If no content, then keyword likely points to reference objects, so grab those
                        if content == '':
                            if len(references) > 0:
                                content = references
                            else:
                                # Something is wrong, drop it.
                                continue
                        else:
                            while True:
                                # Multiple references might be in a list, e.g. /Annot # # R vs. /Annots [# # R # # R]
                                islist = re.match(
                                    r"[s]?[ ]?\[([0-9]* [0-9]* R[ \\rn]{0,8})*\]",
                                    content)
                                if islist:
                                    content = re.sub(
                                        r"[\[\]]", "",
                                        islist.group(0).replace(
                                            "s ", '').replace("R ",
                                                              "R,")).split(",")
                                    break
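                                    # e.g. "[10 0 R 11 0 R]" becomes
                                    # ["10 0 R", "11 0 R"] at this point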
                                # References might be with instructions, i.e. [# # R /FitH null]
                                withinst = re.match(
                                    r"[s]?[ \\']{0,3}\[[ ]?([0-9]* [0-9]* R)[ \\rn]{1,8}"
                                    r"[/a-zA-Z0-9 ]*[ ]?\]", content)
                                if withinst:
                                    content = [withinst.group(1)]
                                    break
                                content = [content]
                                break
                        for c in content:
                            # If keyword = Javascript and content starts with '/JS', disregard as 'JS' will be extracted
                            if "JS" in triage_keywords and keyword == "JavaScript" and "/JS" in c[
                                    0:5]:
                                continue
                            if c in references or re.match(
                                    "[0-9]* [0-9]* R", c):
                                try:
                                    ref_obj = c.split(" ", 1)[0]
                                    options = {
                                        "object": ref_obj,
                                        "get_object_detail": True
                                    }
                                    pdf_parser_subresult, err = self.get_pdf_parser(
                                        path, working_dir, options)

                                    if pdf_parser_subresult:
                                        for sub_p in pdf_parser_subresult['parts']:
                                            sub_references = sub_p.split("\n", 3)[2].replace(
                                                'Referencing:', '').strip().split(", ")
                                            ptyp = sub_p.split("\n", 2)[1].replace(
                                                'Type:', '').strip().replace("/", "")
                                            # If the object contains a stream, extract the object.
                                            if "Contains stream" in sub_p:
                                                try:
                                                    objnum = sub_p.split("\n", 1)[0].split(" ")[1]
                                                    obj_extract_triage.add(objnum)
                                                except Exception:
                                                    pass
                                            # Or if the object Type is the keyword, grab all referenced objects.
                                            elif sub_references and sub_references[0] != '' and ptyp == keyword:
                                                for sr in sub_references:
                                                    try:
                                                        objnum = sr.split(" ", 1)[0]
                                                        obj_extract_triage.add(objnum)
                                                    except Exception:
                                                        pass
                                            # If not, extract the object detail into the carved output
                                            elif pdf_parser_subresult['obj_details'] != "":
                                                try:
                                                    objnum = sub_p.split("\n", 1)[0].split(" ")[1]
                                                    if objnum in carved_content:
                                                        carved_content[objnum].append(
                                                            {keyword: pdf_parser_subresult['obj_details']})
                                                    else:
                                                        carved_content[objnum] = \
                                                            [{keyword: pdf_parser_subresult['obj_details']}]
                                                except Exception:
                                                    continue

                                    for e in err:
                                        errors.add(e)
                                except Exception:
                                    # If none of that works, just extract the original object for examination.
                                    try:
                                        objnum = p.split("\n", 1)[0].split(" ")[1]
                                        obj_extract_triage.add(objnum)
                                    except Exception:
                                        pass
                            # If content does not look like a reference:
                            else:
                                if p.startswith("trailer:"):
                                    continue
                                objnum = p.split("\n", 1)[0].split(" ")[1]
                                # If the object contains a stream extract the object
                                if p.split("\n", 4)[3] == "Contains stream":
                                    obj_extract_triage.add(objnum)
                                else:
                                    # Or just carve the content
                                    if objnum in carved_content:
                                        carved_content[objnum].append(
                                            {keyword: c})
                                    else:
                                        carved_content[objnum] = [{keyword: c}]

                    for e in errors:
                        all_errors.add(e)

            # Add carved content to result output
            show_content_of_interest = False
            if len(carved_content) > 0 or len(jbig_objs) > 0:
                carres = ResultSection(title_text="Content of Interest")
            else:
                carres = None

            if len(jbig_objs) > 0:
                jbigres = ResultSection(
                    title_text="The following Object IDs are JBIG2DECODE streams:",
                    body_format=BODY_FORMAT.MEMORY_DUMP,
                    parent=carres)
                jbigres.add_line(', '.join(map(str, jbig_objs)))
                show_content_of_interest = True

            if len(carved_content) > 0:
                for k, l in sorted(carved_content.items()):
                    for d in l:
                        for keyw, con in d.items():
                            subres = ResultSection(
                                title_text="Object {0}: Hits for Keyword '{1}':"
                                .format(k, keyw))
                            subres.set_heuristic(8)

                            con_bytes = con.encode()
                            if len(con) < 500:
                                subres.body_format = BODY_FORMAT.MEMORY_DUMP
                                subres.add_line(con)

                                # Check for IOC content
                                patterns = PatternMatch()
                                st_value = patterns.ioc_match(con_bytes,
                                                              bogon_ip=True)
                                if len(st_value) > 0:
                                    carres.add_subsection(subres)
                                    show_content_of_interest = True
                                    for ty, val in st_value.items():
                                        if val == "":
                                            asc_asc = unicodedata.normalize(
                                                'NFKC',
                                                val).encode('ascii', 'ignore')
                                            subres.add_tag(ty, asc_asc)
                                        else:
                                            ulis = list(set(val))
                                            for v in ulis:
                                                subres.add_tag(ty, v)
                            else:
                                crv_sha = hashlib.sha256(con_bytes).hexdigest()

                                if crv_sha not in carved_extracted_shas:
                                    f_name = "carved_content_obj_{}_{}".format(
                                        k, crv_sha[0:7])
                                    subres.add_lines([
                                        "Content is over 500 bytes; it will be extracted for analysis",
                                        "Name: {} - SHA256: {}".format(f_name, crv_sha)
                                    ])
                                    carres.add_subsection(subres)
                                    show_content_of_interest = True
                                    crvf = os.path.join(
                                        self.working_directory, f_name)
                                    with open(crvf, 'wb') as f:
                                        f.write(con_bytes)
                                    request.add_extracted(
                                        crvf, os.path.basename(crvf),
                                        "Extracted content from object {}".
                                        format(k))
                                    carved_extracted_shas.add(crv_sha)

            if show_content_of_interest:
                pdf_parserres.add_subsection(carres)

            # ELEMENTS
            # Do not show for objstms
            if get_malform:
                if request.deep_scan:
                    options = {
                        "verbose": True,
                        "nocanonicalizedoutput": True,
                        "get_malform": get_malform
                    }
                elif embed_present:
                    options = {
                        "verbose": True,
                        "elements": "ctsi",
                        "type": "/EmbeddedFile",
                        "get_malform": get_malform
                    }
                else:
                    options = {
                        "verbose": True,
                        "elements": "cst",
                        "get_malform": get_malform
                    }
                pdf_parser_result, errors = self.get_pdf_parser(
                    path, working_dir, options)

                embed_extracted = set()
                if pdf_parser_result is not None:
                    if len(pdf_parser_result) == 0:
                        pdf_parserres.add_line(
                            "No structure information generated for file. Please see errors."
                        )
                    else:
                        # PDF Parser will write any malformed content over 100 bytes to a file
                        files = pdf_parser_result.get("files", None)
                        if files:
                            for f, l in files.items():
                                if f == 'malformed':
                                    if len(l) > 0:
                                        pdf_parserres.set_heuristic(6)
                                    for i in l:
                                        request.add_extracted(
                                            i, os.path.basename(i),
                                            "Extracted malformed content in PDF Parser Analysis."
                                        )

                        parts = pdf_parser_result.get("parts", None)
                        # Extract service will extract the sample's embedded files.
                        # However we want to make note of them so that they are not extracted again below
                        if parts:
                            for p in sorted(parts):
                                if "Type: /EmbeddedFile" in p:
                                    getobj = p.split("\n", 1)[0].split(" ")[1]
                                    embed_extracted.add(getobj)

                # Extract objects collected from above analysis
                obj_to_extract = obj_extract_triage - embed_extracted - jbig_objs

                if len(obj_to_extract) > 0:
                    options = {
                        "filter": True,
                        "object": obj_to_extract,
                        "dump": "extracted_obj_",
                    }
                    pdf_parser_result, errors = self.get_pdf_parser(
                        path, working_dir, options)

                    if pdf_parser_result:
                        files = pdf_parser_result.get("files", None)
                        extracted_files = []
                        if files:
                            for f, l in files.items():
                                if f == 'embedded':
                                    for i in l:
                                        f_name = os.path.basename(i)
                                        obj_id = f_name.replace(
                                            "extracted_obj_", "")
                                        extracted_files.append(
                                            "Extracted object {} as {}".format(
                                                obj_id, f_name))
                                        request.add_extracted(
                                            i, f_name,
                                            "Object {} extracted in PDF Parser Analysis."
                                            .format(obj_id))
                        for e in errors:
                            all_errors.add(e)

                        if extracted_files:
                            extract_res = ResultSection(
                                title_text="Extracted embedded objects",
                                parent=pdf_parserres)
                            extract_res.set_heuristic(9)
                            extract_res.add_lines(extracted_files)

                # Extract jbig2decode objects in deep scan mode
                if request.deep_scan and len(jbig_objs) > 0:
                    options = {
                        "object": jbig_objs,
                        "dump": "extracted_jb_obj_",
                    }
                    pdf_parser_result, errors = self.get_pdf_parser(
                        path, working_dir, options)

                    if pdf_parser_result:
                        extracted_jb = []
                        files = pdf_parser_result.get("files", None)
                        if files:
                            for f, l in files.items():
                                if f == 'embedded':
                                    for i in l:
                                        f_name = os.path.basename(i)
                                        obj_id = f_name.replace(
                                            "extracted_jb_obj_", "")
                                        extracted_jb.append(
                                            "JBIG2DECODE object {} extracted as {}"
                                            .format(obj_id, f_name))
                                        request.add_extracted(
                                            i, f_name,
                                            "JBIG2DECODE object {} extracted in PDF Parser Analysis."
                                            .format(obj_id))

                        for e in errors:
                            all_errors.add(e)

                        if extracted_jb:
                            jbig_extract_res = ResultSection(
                                title_text="Extracted JBIG2Decode objects",
                                parent=pdf_parserres)
                            jbig_extract_res.set_heuristic(9)
                            jbig_extract_res.add_lines(extracted_jb)

            if len(pdf_parserres.subsections) > 0:
                res.add_subsection(pdf_parserres)

        return res, objstms, all_errors
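        # A hypothetical call site for analyze_pdf (the header string and the
        # config keys below are assumptions, not confirmed service settings):
        #
        #     res, objstms, errors = self.analyze_pdf(
        #         request, "PeePDF analysis", request.file_path,
        #         self.working_directory, heur=self.config.get("heuristics", []),
        #         additional_keywords=self.config.get("keywords", []))
        #     request.result.add_section(res)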
    def execute(self, request):
        # ==================================================================
        # Execute a request:
        #   Every time your service receives a new file to scan, the execute function is called
        #   This is where you should execute your processing code.
        #   For the purpose of this example, we will only generate results ...

        # You should run your code here...

        # ==================================================================
        # Check if we're scanning an embedded file
        #   This service always drops 3 embedded files: two generate random results and the other empty results.
        #   We check whether we're currently scanning one of those embedded files.
        #   In a normal service this is not something you would do at all, but since we are using this
        #   service in our unit tests to exercise all features of our report generator, we have to do this.
        if request.sha256 not in ['d729ecfb2cf40bc4af8038dac609a57f57dbe6515d35357af973677d5e66417a',
                                  '5ce5ae8ef56a54af2c44415800a81ecffd49a33ae8895dfe38fc1075d3f619ec',
                                  'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06']:
            # Main file results...

            # ==================================================================
            # Write the results:
            #   First, create a result object where all the result sections will be saved to
            result = Result()

            # ==================================================================
            # Standard text section: BODY_FORMAT.TEXT - DEFAULT
            #   Text sections basically just dump their text to the screen...
            #     All section scores will be summed in the service result
            #     The Result classification will be the highest classification found in the sections
            text_section = ResultSection('Example of a default section')
            # You can add lines to your section one at a time
            #   Here we will generate a random line
            text_section.add_line(get_random_phrase())
            # Or you can add them from a list
            #   Here we will generate a random number of random lines
            text_section.add_lines([get_random_phrase() for _ in range(random.randint(1, 5))])
            # If the section needs to affect the score of the file you need to set a heuristic
            #   Here we will pick one at random
            #     In addition to adding a heuristic, we will associate a signature with it
            #     by adding the signature name to the heuristic. (Here we generate a random name)
            text_section.set_heuristic(3, signature="sig_one")
            # You can attach attack ids to heuristics after they were defined
            text_section.heuristic.add_attack_id("T1066")
            # Same thing for the signatures: they can be added to the heuristic after the fact, and you can even say
            #   how many times the signature fired by setting its frequency. If you call add_signature_id twice with
            #   the same signature, this will effectively increase the frequency of the signature.
            text_section.heuristic.add_signature_id("sig_two", score=20, frequency=2)
            text_section.heuristic.add_signature_id("sig_two", score=20, frequency=3)
            text_section.heuristic.add_signature_id("sig_three")
            text_section.heuristic.add_signature_id("sig_three")
            text_section.heuristic.add_signature_id("sig_four", score=0)
            # The heuristic for text_section should have the following properties
            #   1. 1 attack ID: T1066
            #   2. 4 signatures: sig_one, sig_two, sig_three and sig_four
            #   3. Signature frequencies are cumulative, therefore they will be as follows:
            #      - sig_one = 1
            #      - sig_two = 5
            #      - sig_three = 2
            #      - sig_four = 1
            #   4. The score used by each signature is driven by the following rules: the signature_score_map has the
            #      highest priority, then the score value passed to add_signature_id, and finally the default
            #      heuristic score. Therefore the scores used to calculate the total score for the text_section are
            #      as follows:
            #      - sig_one: 10    -> heuristic default score
            #      - sig_two: 20    -> score provided by the function add_signature_id
            #      - sig_three: 30  -> score provided by the heuristic map
            #      - sig_four: 40   -> score provided by the heuristic map because it's higher priority than the
            #                          function score
            #    5. Total section score is then: 1x10 + 5x20 + 2x30 + 1x40 = 210
            # Make sure you add your section to the result
            result.add_section(text_section)

            # ==================================================================
            # Color map Section: BODY_FORMAT.GRAPH_DATA
            #     Creates a color map bar using a minimum and maximum domain
            #     e.g. We are using this section to display the entropy distribution in some services
            cmap_min = 0
            cmap_max = 20
            color_map_data = {
                'type': 'colormap',
                'data': {
                    'domain': [cmap_min, cmap_max],
                    'values': [random.random() * cmap_max for _ in range(50)]
                }
            }
            # The classification of a section can be set to any valid classification for your system
            section_color_map = ResultSection("Example of colormap result section", body_format=BODY_FORMAT.GRAPH_DATA,
                                              body=json.dumps(color_map_data), classification=cl_engine.RESTRICTED)
            result.add_section(section_color_map)

            # ==================================================================
            # URL section: BODY_FORMAT.URL
            #   Generate a list of clickable urls using a json encoded format
            #     As you can see here, the body of the section can be set directly instead of line by line
            random_host = get_random_host()
            url_section = ResultSection('Example of a simple url section', body_format=BODY_FORMAT.URL,
                                        body=json.dumps({"name": "Random url!", "url": f"https://{random_host}/"}))

            # Since urls are very important features we can tag those features in the system so they are easy to find
            #   Tags are defined by a type and a value
            url_section.add_tag("network.static.domain", random_host)

            # You may also want to provide a list of urls!
            #   Also, no need to provide a name; the url link will be displayed
            host1 = get_random_host()
            host2 = get_random_host()
            ip1 = get_random_ip()
            ip2 = get_random_ip()
            ip3 = get_random_ip()
            urls = [
                {"url": f"https://{host1}/"},
                {"url": f"https://{host2}/"},
                {"url": f"https://{ip1}/"},
                {"url": f"https://{ip2}/"},
                {"url": f"https://{ip3}/"}]

            # A heuristic can fire more than once without being associated to a signature
            url_heuristic = Heuristic(4, frequency=len(urls))

            url_sub_section = ResultSection('Example of a url section with multiple links',
                                            body=json.dumps(urls), body_format=BODY_FORMAT.URL,
                                            heuristic=url_heuristic)
            url_sub_section.add_tag("network.static.ip", ip1)
            url_sub_section.add_tag("network.static.ip", ip2)
            url_sub_section.add_tag("network.static.ip", ip3)
            url_sub_section.add_tag("network.static.domain", host1)
            url_sub_section.add_tag("network.dynamic.domain", host2)
            # Since url_sub_section is a sub-section of url_section
            # we will add it as a sub-section of url_section not to the main result itself
            url_section.add_subsection(url_sub_section)
            result.add_section(url_section)

            # ==================================================================
            # Memory dump section: BODY_FORMAT.MEMORY_DUMP
            #     Dump whatever string content you have into a <pre/> html tag so you can do your own formatting
            data = hexdump(b"This is some random text that we will format as an hexdump and you'll see "
                           b"that the hexdump formatting will be preserved by the memory dump section!")
            memdump_section = ResultSection('Example of a memory dump section', body_format=BODY_FORMAT.MEMORY_DUMP,
                                            body=data)
            memdump_section.set_heuristic(random.randint(1, 4))
            result.add_section(memdump_section)

            # ==================================================================
            # KEY_VALUE section:
            #     This section allows the service writer to list a bunch of key/value pairs to be displayed in the UI
            #     while also providing easy-to-parse data for automated tools.
            #     NB: You should definitely use this over a JSON body type since this one will be displayed correctly
            #         in the UI for the user
            #     The body argument must be a json dump of a dictionary (only str, int, and booleans are allowed)
            kv_body = {
                "a_str": "Some string",
                "a_bool": False,
                "an_int": 102,
            }
            kv_section = ResultSection('Example of a KEY_VALUE section', body_format=BODY_FORMAT.KEY_VALUE,
                                       body=json.dumps(kv_body))
            result.add_section(kv_section)

            # ==================================================================
            # JSON section:
            #     Re-use the JSON editor we use for administration (https://github.com/josdejong/jsoneditor)
            #     to display a tree view of JSON results.
            #     NB: Use this sparingly! As a service developer you should do your best to include important
            #     results as their own result sections.
            #     The body argument must be a JSON dump of a Python dictionary
            json_body = {
                "a_str": "Some string",
                "a_list": ["a", "b", "c"],
                "a_bool": False,
                "an_int": 102,
                "a_dict": {
                    "list_of_dict": [
                        {"d1_key": "val", "d1_key2": "val2"},
                        {"d2_key": "val", "d2_key2": "val2"}
                    ],
                    "bool": True
                }
            }
            json_section = ResultSection('Example of a JSON section', body_format=BODY_FORMAT.JSON,
                                         body=json.dumps(json_body))
            result.add_section(json_section)

            # ==================================================================
            # PROCESS_TREE section:
            #     This section allows the service writer to list a bunch of dictionary objects that have nested lists
            #     of dictionaries to be displayed in the UI. Each dictionary object represents a process, and therefore
            #     each dictionary must be of the following format:
            #     {
            #       "process_pid": int,
            #       "process_name": str,
            #       "command_line": str,
            #       "signatures": {} NB: This dict maps signature names to the scores they contribute, as in the
            #                           example below
            #       "children": []   NB: This list is either empty or contains more dictionaries that have the same
            #                           structure
            #     }
            nc_body = [
                {
                    "process_pid": 123,
                    "process_name": "evil.exe",
                    "command_line": "C:\\evil.exe",
                    "signatures": {},
                    "children": [
                        {
                            "process_pid": 321,
                            "process_name": "takeovercomputer.exe",
                            "command_line": "C:\\Temp\\takeovercomputer.exe -f do_bad_stuff",
                            "signatures": {"one":250},
                            "children": [
                                {
                                    "process_pid": 456,
                                    "process_name": "evenworsethanbefore.exe",
                                    "command_line": "C:\\Temp\\evenworsethanbefore.exe -f change_reg_key_cuz_im_bad",
                                    "signatures": {"one":10, "two":10, "three":10},
                                    "children": []
                                },
                                {
                                    "process_pid": 234,
                                    "process_name": "badfile.exe",
                                    "command_line": "C:\\badfile.exe -k nothing_to_see_here",
                                    "signatures": {"one":1000, "two":10, "three":10, "four":10, "five":10},
                                    "children": []
                                }
                            ]
                        },
                        {
                            "process_pid": 345,
                            "process_name": "benignexe.exe",
                            "command_line": "C:\\benignexe.exe -f \"just kidding, i'm evil\"",
                            "signatures": {"one": 2000},
                            "children": []
                        }
                    ]
                },
                {
                    "process_pid": 987,
                    "process_name": "runzeroday.exe",
                    "command_line": "C:\\runzeroday.exe -f insert_bad_spelling",
                    "signatures": {},
                    "children": []
                }
            ]
            nc_section = ResultSection('Example of a PROCESS_TREE section',
                                       body_format=BODY_FORMAT.PROCESS_TREE,
                                       body=json.dumps(nc_body))
            result.add_section(nc_section)
            
            # ==================================================================
            # TABLE section:
            #     This section allows the service writer to have their content displayed in a table format in the UI
            #     The body argument must be a list [] of dict {} objects. A dict object can have a key/value pair
            #     where the value is a flat dictionary, and this nested dictionary will be displayed as a nested
            #     table within a cell.
            table_body = [
                {
                    "a_str": "Some string1",
                    "extra_column_here": "confirmed",
                    "a_bool": False,
                    "an_int": 101,
                },
                {
                    "a_str": "Some string2",
                    "a_bool": True,
                    "an_int": 102,
                },
                {
                    "a_str": "Some string3",
                    "a_bool": False,
                    "an_int": 103,
                },
                {
                    "a_str": "Some string4",
                    "a_bool": None,
                    "an_int": -1000000000000000000,
                    "extra_column_there": "confirmed",
                    "nested_table": {
                        "a_str": "Some string3",
                        "a_bool": False,
                        "nested_table_thats_too_deep": {
                            "a_str": "Some string3",
                            "a_bool": False,
                            "an_int": 103,
                        },
                    },
                },
            ]
            table_section = ResultSection('Example of a TABLE section',
                                          body_format=BODY_FORMAT.TABLE,
                                          body=json.dumps(table_body))
            result.add_section(table_section)

            # ==================================================================
            # Re-Submitting files to the system
            #     Adding extracted files will have them resubmitted to the system for analysis

            # This file will generate random results on the next run
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "wb") as myfile:
                myfile.write(data.encode())
            request.add_extracted(temp_path, "file.txt", "Extracted by some magic!")

            # Embedded files can also have their own classification!
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "wb") as myfile:
                myfile.write(b"CLASSIFIED!!!__"+data.encode())
            request.add_extracted(temp_path, "classified.doc", "Classified file ... don't look",
                                  classification=cl_engine.RESTRICTED)

            # This file will generate empty results on the next run
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "wb") as myfile:
                myfile.write(b"EMPTY")
            request.add_extracted(temp_path, "empty.txt", "Extracted empty resulting file")

            # ==================================================================
            # Supplementary files
            #     Adding supplementary files will save them in the datastore for future
            #      reference but won't reprocess those files.
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "w") as myfile:
                myfile.write(json.dumps(urls))
            request.add_supplementary(temp_path, "urls.json", "These are urls as a JSON file")
            # Like extracted files, you can add more than one supplementary file
            fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
            with os.fdopen(fd, "w") as myfile:
                myfile.write(json.dumps(json_body))
            request.add_supplementary(temp_path, "json_body.json", "This is the json_body as a JSON file")

            # ==================================================================
            # Wrap-up:
            #     Save your result object back into the request
            request.result = result

        # ==================================================================
        # Empty results file
        elif request.sha256 == 'cc1d2f838445db7aec431df9ee8a871f40e7aa5e064fc056633ef8c60fab7b06':
            # Creating an empty result object
            request.result = Result()

        # ==================================================================
        # Randomized results file
        else:
            # For the randomized results file, we will completely randomize the results.
            #   The content of those results does not matter since we've already shown you
            #   all the different result sections, tagging, heuristics and file upload functions
            embedded_result = Result()

            # Random number of sections
            for _ in range(random.randint(1, 3)):
                embedded_result.add_section(self._create_random_section())

            request.result = embedded_result
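
The _create_random_section helper used above is not part of this excerpt; a minimal
hypothetical sketch of what such a helper could look like (names and behaviour are
assumptions, not the service's actual code):

    def _create_random_section(self):
        # Hypothetical sketch only: one small section with a random title,
        # a line of text and, half the time, a random heuristic
        section = ResultSection(f"Random section {random.randint(0, 1000)}")
        section.add_line("Randomly generated content")
        if random.random() > 0.5:
            section.set_heuristic(random.randint(1, 4))
        return section
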
    def execute(self, request):
        parser = eml_parser.eml_parser.EmlParser(include_raw_body=True,
                                                 include_attachment_data=True)
        content_str = request.file_contents

        # Attempt conversion of potential Outlook file -> eml
        if request.file_type == "document/office/email":
            try:
                content_str = msg2eml(request.file_path).as_bytes()
            except Exception:
                # Try using mailparser to convert
                converted_path, _ = msgconvert(request.file_path)
                with open(converted_path, "rb") as converted_file:
                    content_str = converted_file.read()

        header_agg = {
            "From": set(),
            "To": set(),
            "Cc": set(),
            "Sent": set(),
            "Reply-To": set(),
            "Date": set()
        }
        # Assume this is an email saved in HTML format
        if request.file_type == "code/html":
            parsed_html = BeautifulSoup(content_str, "lxml")
            valid_headers = [
                "To:", "Cc:", "Sent:", "From:", "Subject:", "Reply-To:"
            ]

            if not parsed_html.body or not any(header in parsed_html.body.text
                                               for header in valid_headers):
                # We can assume this is just an HTML doc (or one lacking a body), which we can't process
                request.result = Result()
                return

            # Can't trust 'Date' to determine the difference between HTML docs vs HTML emails
            valid_headers.append("Date:")

            html_email = email.message_from_bytes(content_str)
            generator_metadata_content = ""
            for meta in parsed_html.find_all("meta"):
                if meta.attrs.get("name", None) == "Generator":
                    generator_metadata_content = meta.attrs.get("content", "")
                    break

            # Process HTML emails generated from Outlook
            if generator_metadata_content == "Microsoft Word 15":
                paragraphs = parsed_html.body.find_all("p")
                # Likely an email that was exported with original email headers
                if any(header in paragraphs[0].text for header in valid_headers):
                    for p in paragraphs:
                        if any(valid_header in p.text
                               for valid_header in valid_headers):
                            h_key, h_value = p.text.replace(
                                "\xa0", "").replace("\r\n", " ").split(":", 1)
                            html_email[h_key] = h_value
                            # Subject line indicates the end of the email header, beginning of body
                            if "Subject" in p.text:
                                break
            # Process HTML emails from MS Exchange Server or missing top-level headers (aggregate headers)
            elif (generator_metadata_content
                  == "Microsoft Word 15 (filtered medium)"
                  or generator_metadata_content == "Microsoft Exchange Server"
                  or generator_metadata_content == ""):
                subject = None
                for div in parsed_html.find_all("div"):
                    # Header information within divs
                    if any(header in div.text for header in valid_headers
                           ) and "WordSection1" not in div.attrs.get(
                               "class", []):
                        # Usually expect headers to be \n separated in text output but check first
                        if "\n" in div.text:
                            for h in div.text.split("\n"):
                                if any(header in h
                                       for header in valid_headers):
                                    h_key, h_value = h.split(":", 1)

                                    # Implying some malformed message got mixed with the headers of another message
                                    if h_key not in valid_headers:
                                        for header in valid_headers:
                                            if header in h:
                                                h_key = header[:-1]

                                    # Use the latest message's subject (this maintains FW, RE, etc.)
                                    if h_key == "Subject" and not subject:
                                        subject = h_value
                                    elif h_key != "Subject":
                                        header_agg[h_key].add(h_value)

                        # Document was probably not well formatted, so we'll use the headers as delimiters
                        else:
                            header_offset_map = {}
                            # Determine the position of each header
                            for header in list(
                                    header_agg.keys()) + ["Subject"]:
                                if header in div.text:
                                    header_offset_map[div.text.index(
                                        header)] = header
                            # Use the positions and length of header name to determine an offset
                            for i in range(len(header_offset_map)):
                                sorted_keys = sorted(header_offset_map.keys())
                                header_name = header_offset_map[sorted_keys[i]]
                                offset = len(
                                    f"{header_name}: ") + sorted_keys[i]
                                value = (div.text[offset:sorted_keys[i + 1]]
                                         if i < len(header_offset_map) - 1 else
                                         div.text[offset:])

                                if header_name == "Subject":
                                    subject = value
                                else:
                                    header_agg[header_name].add(value)

                # Assign aggregated info to email object
                html_email["Subject"] = subject
                for key, value in header_agg.items():
                    html_email[key] = "; ".join(value)
            content_str = html_email.as_bytes()

        parsed_eml = parser.decode_email_bytes(content_str)
        result = Result()
        header = parsed_eml["header"]

        if "from" in header or "to" in header:
            all_uri = set()
            body_words = set(extract_passwords(header["subject"]))
            for body_counter, body in enumerate(parsed_eml["body"]):
                body_text = BeautifulSoup(body["content"], "lxml").text
                body_words.update(extract_passwords(body_text))
                if request.get_param("extract_body_text"):
                    fd, path = mkstemp()
                    with open(path, "w") as f:
                        f.write(body["content"])
                        os.close(fd)
                    request.add_extracted(path, "body_" + str(body_counter),
                                          "Body text")
                if "uri" in body:
                    for uri in body["uri"]:
                        all_uri.add(uri)
            # Words in the email body, used by extract to guess passwords
            request.temp_submission_data["email_body"] = list(body_words)

            kv_section = ResultSection("Email Headers",
                                       body_format=BODY_FORMAT.KEY_VALUE,
                                       parent=result)

            # Basic tags
            from_addr = header["from"].strip() if header.get("from",
                                                             None) else None
            if from_addr and re.match(EMAIL_REGEX, from_addr):
                kv_section.add_tag("network.email.address", from_addr)
            for to in header["to"]:
                if re.match(EMAIL_REGEX, to.strip()):
                    kv_section.add_tag("network.email.address", to.strip())

            kv_section.add_tag("network.email.date",
                               str(header["date"]).strip())

            subject = header["subject"].strip() if header.get("subject",
                                                              None) else None
            if subject:
                kv_section.add_tag("network.email.subject", subject)

            # Add CCs to body and tags
            if "cc" in header:
                for cc in header["cc"]:
                    if re.match(EMAIL_REGEX, cc.strip()):
                        kv_section.add_tag("network.email.address", cc.strip())
            # Add Message ID to body and tags
            if "message-id" in header["header"]:
                kv_section.add_tag("network.email.msg_id",
                                   header["header"]["message-id"][0].strip())

            # Add Tags for received IPs
            if "received_ip" in header:
                for ip in header["received_ip"]:
                    ip = ip.strip()
                    try:
                        if isinstance(ip_address(ip), IPv4Address):
                            kv_section.add_tag("network.static.ip", ip)
                    except ValueError:
                        pass

            # Add Tags for received Domains
            if "received_domain" in header:
                for dom in header["received_domain"]:
                    kv_section.add_tag("network.static.domain", dom.strip())

            # If we've found URIs, add them to a section
            if len(all_uri) > 0:
                uri_section = ResultSection("URIs Found:", parent=result)
                for uri in all_uri:
                    uri_section.add_line(uri)
                    uri_section.add_tag("network.static.uri", uri.strip())
                    parsed_url = urlparse(uri)
                    if parsed_url.hostname and re.match(
                            IP_ONLY_REGEX, parsed_url.hostname):
                        uri_section.add_tag("network.static.ip",
                                            parsed_url.hostname)
                    else:
                        uri_section.add_tag("network.static.domain",
                                            parsed_url.hostname)

            # Bring all headers together...
            extra_header = header.pop("header", {})
            header.pop("received", None)
            header.update(extra_header)

            # Convert to common format
            header["date"] = [self.json_serial(header["date"])]

            # Replace with aggregated date(s) if any available
            if header_agg["Date"]:
                # Replace
                if any(
                        default_date in header["date"] for default_date in
                    ["1970-01-01T00:00:00", "Thu, 01 Jan 1970 00:00:00 +0000"
                     ]):
                    header["date"] = list(header_agg["Date"])
                # Append
                else:
                    header["date"] += list(header_agg["Date"])
                # Tag the aggregated date(s)
                for date in header_agg["Date"]:
                    kv_section.add_tag("network.email.date", str(date).strip())

            # Filter out useless headers from results
            self.log.debug(header.keys())
            for h in self.header_filter:
                header.pop(h, None)
            kv_section.set_body(json.dumps(header, default=self.json_serial))

            attachments_added = []
            if "attachment" in parsed_eml:
                attachments = parsed_eml["attachment"]
                for attachment in attachments:
                    fd, path = mkstemp()

                    with open(path, "wb") as f:
                        f.write(base64.b64decode(attachment["raw"]))
                        os.close(fd)
                    try:
                        if request.add_extracted(
                                path,
                                attachment["filename"],
                                "Attachment ",
                                safelist_interface=self.api_interface):
                            attachments_added.append(attachment["filename"])
                    except MaxExtractedExceeded:
                        self.log.warning(
                            f"Extract limit reached on attachments: "
                            f"{len(attachments) - len(attachments_added)} not added"
                        )
                        break
                ResultSection("Extracted Attachments:",
                              body="\n".join([x for x in attachments_added]),
                              parent=result)

            if request.get_param("save_emlparser_output"):
                fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
                attachments = parsed_eml.get("attachment", [])
                # Remove raw attachment data; attachments up to the extraction limit were already extracted above
                for attachment in attachments:
                    _ = attachment.pop("raw", None)
                with os.fdopen(fd, "w") as myfile:
                    myfile.write(
                        json.dumps(parsed_eml, default=self.json_serial))
                request.add_supplementary(
                    temp_path, "parsing.json",
                    "These are the raw results of running GOVCERT-LU's eml_parser"
                )
        else:
            self.log.warning(
                "emlParser could not parse EML; no useful information in result's headers"
            )

        request.result = result
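
The offset-delimiter branch above (used when headers are not newline-separated) is the
trickiest part of this example. A minimal standalone sketch of the same idea, on a
hypothetical input string (assumes each header name appears at most once):

# Sketch of the offset-delimiter approach: find each header's position,
# then slice the text between consecutive header positions
text = "From: alice@example.com To: bob@example.com Subject: hello"
headers = ["From", "To", "Subject"]

header_offset_map = {text.index(h): h for h in headers if h in text}
sorted_keys = sorted(header_offset_map.keys())
for i, start in enumerate(sorted_keys):
    header_name = header_offset_map[start]
    offset = start + len(f"{header_name}: ")
    end = sorted_keys[i + 1] if i < len(sorted_keys) - 1 else len(text)
    print(header_name, "->", text[offset:end].strip())
# From -> alice@example.com
# To -> bob@example.com
# Subject -> hello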
    def execute(self, request):
        parser = eml_parser.eml_parser.EmlParser(include_raw_body=True,
                                                 include_attachment_data=True)

        # Validate URLs in sample, strip out [] if found
        content_str = request.file_contents.decode(errors="ignore")
        content_str, retry = self.validate_urls(content_str)
        while retry:
            content_str, retry = self.validate_urls(content_str)
        parsed_eml = parser.decode_email_bytes(content_str.encode())

        result = Result()
        header = parsed_eml['header']

        if "from" in header:
            all_uri = set()

            for body_counter, body in enumerate(parsed_eml['body']):
                if request.get_param('extract_body_text'):
                    fd, path = mkstemp()
                    with open(path, 'w') as f:
                        f.write(body['content'])
                        os.close(fd)
                    request.add_extracted(path, "body_" + str(body_counter),
                                          "Body text")
                if "uri" in body:
                    for uri in body['uri']:
                        all_uri.add(uri)

            kv_section = ResultSection('Email Headers',
                                       body_format=BODY_FORMAT.KEY_VALUE,
                                       parent=result)

            # Basic tags
            kv_section.add_tag("network.email.address", header['from'].strip())
            for to in header['to']:
                kv_section.add_tag("network.email.address", to)
            kv_section.add_tag("network.email.date",
                               str(header['date']).strip())
            kv_section.add_tag("network.email.subject",
                               header['subject'].strip())

            # Add CCs to body and tags
            if 'cc' in header:
                for cc in header['cc']:
                    kv_section.add_tag("network.email.address", cc.strip())

            # Add Message ID to body and tags
            if 'message-id' in header['header']:
                kv_section.add_tag("network.email.msg_id",
                                   header['header']['message-id'][0].strip())

            # Add Tags for received IPs
            if 'received_ip' in header:
                for ip in header['received_ip']:
                    kv_section.add_tag('network.static.ip', ip.strip())

            # Add Tags for received Domains
            if 'received_domain' in header:
                for dom in header['received_domain']:
                    kv_section.add_tag('network.static.domain', dom.strip())

            # If we've found URIs, add them to a section
            if len(all_uri) > 0:
                uri_section = ResultSection('URIs Found:', parent=result)
                for uri in all_uri:
                    uri_section.add_line(uri)
                    uri_section.add_tag('network.static.uri', uri.strip())
                    parsed_url = urlparse(uri)
                    if parsed_url.hostname and re.match(
                            IP_ONLY_REGEX, parsed_url.hostname):
                        uri_section.add_tag('network.static.ip',
                                            parsed_url.hostname)
                    else:
                        uri_section.add_tag('network.static.domain',
                                            parsed_url.hostname)

            # Bring all headers together...
            extra_header = header.pop('header', {})
            header.pop('received', None)
            header.update(extra_header)

            kv_section.body = json.dumps(header, default=self.json_serial)

            if "attachment" in parsed_eml:
                for attachment in parsed_eml['attachment']:
                    fd, path = mkstemp()

                    with open(path, 'wb') as f:
                        f.write(base64.b64decode(attachment['raw']))
                        os.close(fd)
                    request.add_extracted(path, attachment['filename'],
                                          "Attachment")
                ResultSection('Extracted Attachments:',
                              body="\n".join([
                                  x['filename']
                                  for x in parsed_eml['attachment']
                              ]),
                              parent=result)

            if request.get_param('save_emlparser_output'):
                fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
                with os.fdopen(fd, "w") as myfile:
                    myfile.write(
                        json.dumps(parsed_eml, default=self.json_serial))
                request.add_supplementary(
                    temp_path, "parsing.json",
                    "These are the raw results of running GOVCERT-LU's eml_parser"
                )
        else:
            text_section = ResultSection('EML parsing results')
            text_section.add_line("Could not parse EML")
            result.add_section(text_section)

        request.result = result
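
The validate_urls helper is not shown in this excerpt. Based on the comment above
("strip out [] if found"), a hypothetical sketch of what such a de-fanging pass could
look like (an assumption about its behaviour, not the actual implementation):

import re

def validate_urls(content: str):
    # Hypothetical: undo one "[.]"-style defang, e.g. "evil[.]com" -> "evil.com";
    # the second return value tells the caller whether another pass is needed
    new_content = re.sub(r"\[(\.)\]", r"\1", content, count=1)
    return new_content, new_content != content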
    def execute(self, request):
        # --- Setup ----------------------------------------------------------------------------------------------
        request.result = Result()
        patterns = PatternMatch()

        if request.deep_scan:
            max_attempts = 100
        else:
            max_attempts = 10

        self.files_extracted = set()
        self.hashes = set()
        before = set()

        # --- Pre-Processing --------------------------------------------------------------------------------------
        # Get all IOCs prior to de-obfuscation
        pat_values = patterns.ioc_match(request.file_contents,
                                        bogon_ip=True,
                                        just_network=False)
        if pat_values:
            if request.get_param('extract_original_iocs'):
                ioc_res = ResultSection(
                    "The following IOCs were found in the original file",
                    parent=request.result,
                    body_format=BODY_FORMAT.MEMORY_DUMP)
            else:
                ioc_res = None
            for k, val in pat_values.items():
                if val == "":
                    asc_asc = unicodedata.normalize('NFKC', val).encode(
                        'ascii', 'ignore')
                    if ioc_res:
                        ioc_res.add_line(
                            f"Found {k.upper().replace('.', ' ')}: {safe_str(asc_asc)}"
                        )
                        ioc_res.add_tag(k, asc_asc)
                    before.add((k, asc_asc))
                else:
                    for v in val:
                        if ioc_res:
                            ioc_res.add_line(
                                f"Found {k.upper().replace('.', ' ')}: {safe_str(v)}"
                            )
                            ioc_res.add_tag(k, v)
                        before.add((k, v))

        # --- Prepare Techniques ----------------------------------------------------------------------------------
        techniques = [
            ('MSOffice Embedded script', self.msoffice_embedded_script_string),
            ('CHR and CHRB decode', self.chr_decode),
            ('String replace', self.string_replace),
            ('Powershell carets', self.powershell_carets),
            ('Array of strings', self.array_of_strings),
            ('Fake array vars', self.vars_of_fake_arrays),
            ('Reverse strings', self.str_reverse),
            ('B64 Decode', self.b64decode_str),
            ('Simple XOR function', self.simple_xor_function),
        ]
        second_pass = [('Concat strings', self.concat_strings),
                       ('MSWord macro vars', self.mswordmacro_vars),
                       ('Powershell vars', self.powershell_vars),
                       ('Charcode hex', self.charcode_hex)]
        final_pass = [
            ('Charcode', self.charcode),
        ]

        code_extracts = [('.*html.*', "HTML scripts extraction",
                          self.extract_htmlscript)]

        layers_list = []
        layer = request.file_contents

        # --- Stage 1: Script Extraction --------------------------------------------------------------------------
        for pattern, name, func in code_extracts:
            if re.match(re.compile(pattern), request.task.file_type):
                extracted_parts = func(request.file_contents)
                layer = b"\n".join(extracted_parts).strip()
                layers_list.append((name, layer))
                break

        # --- Stage 2: Deobfuscation ------------------------------------------------------------------------------
        idx = 0
        first_pass_len = len(techniques)
        layers_count = len(layers_list)
        while True:
            if idx > max_attempts:
                final_pass.extend(techniques)
                for name, technique in final_pass:
                    res = technique(layer)
                    if res:
                        layers_list.append((name, res))
                break
            for name, technique in techniques:
                res = technique(layer)
                if res:
                    layers_list.append((name, res))
                    # Looks like it worked, restart with new layer
                    layer = res
            # If the layers haven't changed in a pass, break
            if layers_count == len(layers_list):
                if len(techniques) != first_pass_len:
                    final_pass.extend(techniques)
                    for name, technique in final_pass:
                        res = technique(layer)
                        if res:
                            layers_list.append((name, res))
                    break
                else:
                    for x in second_pass:
                        techniques.insert(0, x)
            layers_count = len(layers_list)
            idx += 1

        # --- Compiling results ----------------------------------------------------------------------------------
        if len(layers_list) > 0:
            extract_file = False
            num_layers = len(layers_list)
            heur_id = None

            # Compute heuristic
            if num_layers < 5:
                heur_id = 1
            elif num_layers < 10:
                heur_id = 2
            elif num_layers < 50:
                heur_id = 3
            elif num_layers < 100:
                heur_id = 4
            elif num_layers >= 100:
                heur_id = 5

            # Cleanup final layer
            clean = self.clean_up_final_layer(layers_list[-1][1])
            if clean != request.file_contents:
                # Check for new IOCs
                pat_values = patterns.ioc_match(clean,
                                                bogon_ip=True,
                                                just_network=False)
                diff_tags = {}

                for k, val in pat_values.items():
                    if val == "":
                        asc_asc = unicodedata.normalize('NFKC', val).encode(
                            'ascii', 'ignore')
                        if (k, asc_asc) not in before:
                            diff_tags.setdefault(k, [])
                            diff_tags[k].append(asc_asc)
                    else:
                        for v in val:
                            if (k, v) not in before:
                                diff_tags.setdefault(k, [])
                                diff_tags[k].append(v)

                if request.deep_scan or \
                        (len(clean) > 1000 and heur_id >= 4) or diff_tags:
                    extract_file = True

                # Display obfuscation steps
                mres = ResultSection(
                    "De-obfuscation steps taken by DeobsfuScripter",
                    parent=request.result)
                if heur_id:
                    mres.set_heuristic(heur_id)

                lcount = Counter([x[0] for x in layers_list])
                for technique_name, count in lcount.items():
                    mres.add_line(f"{technique_name}, {count} time(s).")

                # Display final layer
                byte_count = 5000
                if extract_file:
                    # Save extracted file
                    byte_count = 500
                    fn = f"{request.file_name}_decoded_final"
                    fp = os.path.join(self.working_directory, fn)
                    with open(fp, 'wb') as dcf:
                        dcf.write(clean)
                        self.log.debug(
                            f"Submitted dropped file for analysis: {fp}")
                    request.add_extracted(fp, fn, "Final deobfuscation layer")

                ResultSection(f"First {byte_count} bytes of the final layer:",
                              body=safe_str(clean[:byte_count]),
                              body_format=BODY_FORMAT.MEMORY_DUMP,
                              parent=request.result)

                # Display new IOCs from final layer
                if len(diff_tags) > 0:
                    ioc_new = ResultSection(
                        "New IOCs found after de-obfustcation",
                        parent=request.result,
                        body_format=BODY_FORMAT.MEMORY_DUMP)
                    has_network_heur = False
                    for ty, val in diff_tags.items():
                        for v in val:
                            if "network" in ty:
                                has_network_heur = True
                            ioc_new.add_line(
                                f"Found {ty.upper().replace('.', ' ')}: {safe_str(v)}"
                            )
                            ioc_new.add_tag(ty, v)

                    if has_network_heur:
                        ioc_new.set_heuristic(7)
                    else:
                        ioc_new.set_heuristic(6)

                if len(self.files_extracted) > 0:
                    ext_file_res = ResultSection(
                        "The following files were extracted during the deobfuscation",
                        heuristic=Heuristic(8),
                        parent=request.result)
                    for f in self.files_extracted:
                        ext_file_res.add_line(os.path.basename(f))
                        request.add_extracted(
                            f, os.path.basename(f),
                            "File of interest deobfuscated from sample")
    def check_file_name_anomalies(self, filename):
        """Filename anomalies detection"""

        is_double_ext, f_ext = self.fna_check_double_extension(filename)
        is_empty_filename = self.fna_check_empty_filename(filename, f_ext)
        too_many_whitespaces = self.fna_check_filename_ws(filename, f_ext)
        has_unicode_ext_hiding_ctrls = self.fna_check_unicode_bidir_ctrls(filename, f_ext)

        file_res = Result()

        if too_many_whitespaces or is_double_ext or has_unicode_ext_hiding_ctrls or is_empty_filename:
            res = ResultSection(title_text="File Name Anomalies", parent=file_res)

            # Tag filename as it might be of interest
            res.add_tag("file.name.extracted", filename)

            # Remove Unicode controls, if any, for reporting
            fn_no_controls = "".join(
                c for c in filename if c not in ["\u202E", "\u202B", "\u202D", "\u202A", "\u200E", "\u200F"]
            )

            # Also add a line with "actual" file name
            res.add_line(f"Actual file name: {wrap_bidir_unicode_string(fn_no_controls)}")

            if too_many_whitespaces:
                sec = ResultSection("Too many whitespaces", parent=res, heuristic=Heuristic(1))
                sec.add_tag("file.name.anomaly", "TOO_MANY_WHITESPACES")
                sec.add_tag("file.behavior", "File name has too many whitespaces")

            if is_double_ext:
                sec = ResultSection("Double file extension", parent=res, heuristic=Heuristic(2))
                sec.add_tag("file.name.anomaly", "DOUBLE_FILE_EXTENSION")
                sec.add_tag("file.behavior", "Double file extension")

            if has_unicode_ext_hiding_ctrls:
                sec = ResultSection("Hidden launchable file extension", parent=res, heuristic=Heuristic(3))
                sec.add_tag("file.name.anomaly", "UNICODE_EXTENSION_HIDING")
                sec.add_tag("file.behavior", "Real file extension hidden using unicode trickery")

            if is_empty_filename:
                sec = ResultSection("Empty Filename", parent=res, heuristic=Heuristic(4))
                sec.add_tag("file.name.anomaly", "FILENAME_EMPTY_OR_ALL_SPACES")
                sec.add_tag("file.behavior", "File name is empty or all whitespaces")

        return file_res
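
The fna_* helper checks are defined elsewhere in the service. For illustration, a hedged
sketch of a bidirectional-control check consistent with the control-character list used
above (the real logic lives in fna_check_unicode_bidir_ctrls):

# Hedged sketch only, not the service's actual implementation
BIDIR_CTRLS = {"\u202E", "\u202B", "\u202D", "\u202A", "\u200E", "\u200F"}

def has_bidir_ctrls(filename: str) -> bool:
    # True if the filename contains any Unicode bidirectional control character
    return any(c in BIDIR_CTRLS for c in filename)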