def ioc_to_tag(self, data: bytes, patterns: PatternMatch, res: Optional[ResultSection] = None,
               taglist: bool = False, check_length: bool = False, strs_max_size: int = 0,
               st_max_length: int = 300) -> Dict[str, Set[str]]:
    """Searches data for patterns and adds as AL tag to result output.

    Args:
        data: Data to be searched.
        patterns: FrankenStrings Patterns() object.
        res: AL result.
        taglist: True if tag list should be returned.
        check_length: True if length of string should be compared to st_max_length.
        strs_max_size: Maximum size of strings list. If greater then only network IOCs will be searched.
        st_max_length: Maximum length of a string from data that can be searched.

    Returns:
        Tag list as dictionary (always empty if taglist is false).
    """
    found: Dict[str, Set[str]] = {}
    floor = self.st_min_length if check_length else 4

    # Collect candidate strings with both Flare-FLOSS extractors
    # (ascii and unicode), optionally capping individual string length.
    candidates: Set[bytes] = set()
    for extractor in (strings.extract_ascii_strings, strings.extract_unicode_strings):
        for hit in extractor(data, n=floor):
            if not check_length or len(hit.s) < st_max_length:
                candidates.add(hit.s)

    # Too many strings: restrict matching to network indicators only.
    network_only = check_length and len(candidates) > strs_max_size

    for candidate in candidates:
        matches: Dict[str, Iterable[bytes]] = patterns.ioc_match(
            candidate, bogon_ip=True, just_network=network_only)
        for tag_type, values in matches.items():
            if taglist and tag_type not in found:
                found[tag_type] = set()
            for value in values:
                # Drop invalid domains/emails before tagging.
                if tag_type == 'network.static.domain' and not is_valid_domain(value.decode('utf-8')):
                    continue
                if tag_type == 'network.email.address' and not is_valid_email(value.decode('utf-8')):
                    continue
                if len(value) < 1001:
                    if res:
                        res.add_tag(tag_type, safe_str(value))
                    if taglist:
                        found[tag_type].add(safe_str(value))
    return found
def additional_parsing(self, file_path: str) -> Optional[ResultSection]:
    """Collect full URIs from PDF link annotations and report them.

    Opens the sample with pikepdf, walks every page's /Annots entries for
    /Link annotations with an action URI, and builds a tagged result
    section from the URLs found. Returns None when the PDF yields no URLs
    or cannot be parsed at all.
    """
    found_urls = set()
    try:
        with pikepdf.open(file_path) as pdf:
            num_pages = len(pdf.pages)
            for page in pdf.pages:
                if '/Annots' not in page:
                    continue
                for annotation in page['/Annots'].as_list():
                    if annotation.get('/Subtype') != '/Link':
                        continue
                    if '/A' not in annotation:
                        continue
                    raw_uri = annotation['/A'].get('/URI')
                    # Skip values that cannot be rendered as text.
                    if not hasattr(raw_uri, '__str__'):
                        continue
                    candidate = str(raw_uri)
                    if re.match(FULL_URI, candidate):
                        found_urls.add(candidate)

        if not found_urls:
            return None

        body = '\n'.join(found_urls)
        tags: dict[str, set[bytes]] = PatternMatch().ioc_match(body.encode())
        section = ResultSection(
            'URL in Annotations',
            heuristic=Heuristic(27, signature='one_page' if num_pages == 1 else None),
            body=body)
        for tag_type, tag_values in tags.items():
            for tag_value in tag_values:
                section.add_tag(tag_type, tag_value)
        return section
    except Exception as e:
        self.log.warning(f'pikepdf failed to parse sample: {e}')
        return None
def ioc_tag(text: bytes, result: ResultSection, just_network: bool = False) -> bool:
    """Tag IOCs found in text onto a result section.

    Args:
        text: text to search for iocs
        result: ResultSection to tag with iocs
        just_network: whether non-network iocs should be skipped

    Returns:
        Whether any IOCs were found.
    """
    matches = PatternMatch().ioc_match(text, bogon_ip=True, just_network=just_network)
    for tag_kind, tag_values in matches.items():
        for tag_value in tag_values:
            # Truncate values so they respect the maximum tag length.
            result.add_tag(tag_kind, tag_value[:MAX_TAG_LEN])
    return bool(matches)
def execute(self, request):
    """Run the deobfuscation service on a submitted file.

    Pipeline: record IOCs present in the original file, extract embedded
    scripts (HTML), repeatedly apply deobfuscation techniques in passes
    until no new layers appear (or max_attempts is exceeded), then report
    the layers taken, the cleaned final layer, any NEW IOCs that appeared
    after deobfuscation, and any files extracted along the way.
    """
    # --- Setup ----------------------------------------------------------------------------------------------
    request.result = Result()
    patterns = PatternMatch()
    # Deep scans get far more deobfuscation passes.
    if request.deep_scan:
        max_attempts = 100
    else:
        max_attempts = 10
    self.files_extracted = set()
    self.hashes = set()
    before = set()  # (tag_type, value) pairs seen in the ORIGINAL file, for diffing later

    # --- Pre-Processing --------------------------------------------------------------------------------------
    # Get all IOCs prior to de-obfuscation
    pat_values = patterns.ioc_match(request.file_contents,
                                    bogon_ip=True,
                                    just_network=False)
    if pat_values:
        if request.get_param('extract_original_iocs'):
            ioc_res = ResultSection(
                "The following IOCs were found in the original file",
                parent=request.result,
                body_format=BODY_FORMAT.MEMORY_DUMP)
        else:
            ioc_res = None
        for k, val in pat_values.items():
            # NOTE(review): `val` is the collection of matched values from ioc_match, so
            # `val == ""` looks like it can never be true (and normalize() would fail on a
            # non-str) — this branch appears dead; confirm against PatternMatch.ioc_match.
            if val == "":
                asc_asc = unicodedata.normalize('NFKC', val).encode(
                    'ascii', 'ignore')
                if ioc_res:
                    ioc_res.add_line(
                        f"Found {k.upper().replace('.', ' ')}: {safe_str(asc_asc)}"
                    )
                    ioc_res.add_tag(k, asc_asc)
                before.add((k, asc_asc))
            else:
                for v in val:
                    if ioc_res:
                        ioc_res.add_line(
                            f"Found {k.upper().replace('.', ' ')}: {safe_str(v)}"
                        )
                        ioc_res.add_tag(k, v)
                    before.add((k, v))

    # --- Prepare Techniques ----------------------------------------------------------------------------------
    # First-pass techniques, tried on every layer.
    techniques = [
        ('MSOffice Embedded script', self.msoffice_embedded_script_string),
        ('CHR and CHRB decode', self.chr_decode),
        ('String replace', self.string_replace),
        ('Powershell carets', self.powershell_carets),
        ('Array of strings', self.array_of_strings),
        ('Fake array vars', self.vars_of_fake_arrays),
        ('Reverse strings', self.str_reverse),
        ('B64 Decode', self.b64decode_str),
        ('Simple XOR function', self.simple_xor_function),
    ]
    # Added in front of `techniques` once a pass produces no new layers.
    second_pass = [('Concat strings', self.concat_strings),
                   ('MSWord macro vars', self.mswordmacro_vars),
                   ('Powershell vars', self.powershell_vars),
                   ('Charcode hex', self.charcode_hex)]
    # Only run once, at the very end.
    final_pass = [
        ('Charcode', self.charcode),
    ]

    code_extracts = [('.*html.*', "HTML scripts extraction",
                      self.extract_htmlscript)]

    layers_list = []
    layer = request.file_contents

    # --- Stage 1: Script Extraction --------------------------------------------------------------------------
    for pattern, name, func in code_extracts:
        if re.match(re.compile(pattern), request.task.file_type):
            extracted_parts = func(request.file_contents)
            layer = b"\n".join(extracted_parts).strip()
            layers_list.append((name, layer))
            break

    # --- Stage 2: Deobsfucation ------------------------------------------------------------------------------
    idx = 0
    first_pass_len = len(techniques)
    layers_count = len(layers_list)
    while True:
        if idx > max_attempts:
            # Out of attempts: run every remaining technique once and stop.
            final_pass.extend(techniques)
            for name, technique in final_pass:
                res = technique(layer)
                if res:
                    layers_list.append((name, res))
            break
        for name, technique in techniques:
            res = technique(layer)
            if res:
                layers_list.append((name, res))
                # Looks like it worked, restart with new layer
                layer = res
        # If the layers haven't changed in a passing, break
        if layers_count == len(layers_list):
            if len(techniques) != first_pass_len:
                # Second pass also stalled: run the final techniques and stop.
                final_pass.extend(techniques)
                for name, technique in final_pass:
                    res = technique(layer)
                    if res:
                        layers_list.append((name, res))
                break
            else:
                # First stall: promote the second-pass techniques and keep going.
                for x in second_pass:
                    techniques.insert(0, x)
        layers_count = len(layers_list)
        idx += 1

    # --- Compiling results -----------------------------------------------------------------------------------
    if len(layers_list) > 0:
        extract_file = False
        num_layers = len(layers_list)
        heur_id = None
        # Compute heuristic: severity scales with layer count.
        if num_layers < 5:
            heur_id = 1
        elif num_layers < 10:
            heur_id = 2
        elif num_layers < 50:
            heur_id = 3
        elif num_layers < 100:
            heur_id = 4
        elif num_layers >= 100:
            heur_id = 5

        # Cleanup final layer
        clean = self.clean_up_final_layer(layers_list[-1][1])
        if clean != request.file_contents:
            # Check for new IOCs
            pat_values = patterns.ioc_match(clean,
                                            bogon_ip=True,
                                            just_network=False)
            diff_tags = {}  # IOCs present after deobfuscation but not before
            for k, val in pat_values.items():
                # NOTE(review): same apparently-dead `val == ""` branch as above — confirm.
                if val == "":
                    asc_asc = unicodedata.normalize('NFKC', val).encode(
                        'ascii', 'ignore')
                    if (k, asc_asc) not in before:
                        diff_tags.setdefault(k, [])
                        diff_tags[k].append(asc_asc)
                else:
                    for v in val:
                        if (k, v) not in before:
                            diff_tags.setdefault(k, [])
                            diff_tags[k].append(v)

            # Extract the final layer when the scan is deep, heavily layered, or new IOCs appeared.
            if request.deep_scan or \
                    (len(clean) > 1000 and heur_id >= 4) or diff_tags:
                extract_file = True

            # Display obfuscation steps
            mres = ResultSection(
                "De-obfuscation steps taken by DeobsfuScripter",
                parent=request.result)
            if heur_id:
                mres.set_heuristic(heur_id)
            lcount = Counter([x[0] for x in layers_list])
            for l, c in lcount.items():
                mres.add_line(f"{l}, {c} time(s).")

            # Display final layer
            byte_count = 5000
            if extract_file:
                # Save extracted file
                byte_count = 500
                fn = f"{request.file_name}_decoded_final"
                fp = os.path.join(self.working_directory, fn)
                with open(fp, 'wb') as dcf:
                    dcf.write(clean)
                    self.log.debug(
                        f"Submitted dropped file for analysis: {fp}")
                request.add_extracted(fp, fn, "Final deobfuscation layer")

            ResultSection(f"First {byte_count} bytes of the final layer:",
                          body=safe_str(clean[:byte_count]),
                          body_format=BODY_FORMAT.MEMORY_DUMP,
                          parent=request.result)

            # Display new IOCs from final layer
            if len(diff_tags) > 0:
                ioc_new = ResultSection(
                    "New IOCs found after de-obfustcation",
                    parent=request.result,
                    body_format=BODY_FORMAT.MEMORY_DUMP)
                has_network_heur = False
                for ty, val in diff_tags.items():
                    for v in val:
                        if "network" in ty:
                            has_network_heur = True
                        ioc_new.add_line(
                            f"Found {ty.upper().replace('.', ' ')}: {safe_str(v)}"
                        )
                        ioc_new.add_tag(ty, v)
                # Network IOCs are scored more severely than others.
                if has_network_heur:
                    ioc_new.set_heuristic(7)
                else:
                    ioc_new.set_heuristic(6)

            # Report files extracted by individual techniques during the run.
            if len(self.files_extracted) > 0:
                ext_file_res = ResultSection(
                    "The following files were extracted during the deobfuscation",
                    heuristic=Heuristic(8),
                    parent=request.result)
                for f in self.files_extracted:
                    ext_file_res.add_line(os.path.basename(f))
                    request.add_extracted(
                        f, os.path.basename(f),
                        "File of interest deobfuscated from sample")
def analyze_pdf(self, request, res_txt, path, working_dir, heur, additional_keywords, get_malform=True):
    """Extract metadata, keyword objects and content of interest from a PDF sample
    using PDFId, PDFId plugins, and PDF Parser.

    Args:
        request: AL request object.
        res_txt: Header string for AL result section title.
        path: Original PDF sample path.
        working_dir: AL working directory.
        heur: List of plugins to run on PDFId results (provided in service configuration).
        additional_keywords: List of additional keywords to be searched (provided in service configuration).
        get_malform: Extract malformed objects from PDF.

    Returns:
        Tuple of (AL result section, whether object streams (objstms) were seen, set of error strings).
    """
    triage_keywords = set()   # keywords that will drive PDF Parser searching/carving
    all_errors = set()
    embed_present = False
    objstms = False
    res = ResultSection(title_text=res_txt)
    carved_extracted_shas = set()  # dedupe carved-content extractions by sha256

    # Deep scans always run PDF Parser; otherwise only when PDFId flags something.
    if request.deep_scan:
        run_pdfparse = True
    else:
        run_pdfparse = False

    # Run PDFId
    try:
        pdfid_result, errors = self.get_pdfid(path, additional_keywords, heur, request.deep_scan)
    except Exception as e:
        raise NonRecoverableError(e)

    # Parse PDFId results
    pdfidres = ResultSection(title_text="PDFID Results", parent=res)
    if len(pdfid_result) == 0:
        pdfidres.add_line("No results generated for file. Please see errors.")
    else:
        # Do not run for objstms, which are being analyzed when get_malform == False
        if get_malform:
            version = pdfid_result.get("PDFID", None)
            if version:
                pdfidres.add_line(version[0])
            properties = pdfid_result.get("Properties", None)
            if properties:
                pres = ResultSection(title_text="PDF Properties", parent=pdfidres)
                for plist in properties:
                    pres.add_line("{0}: {1}".format(plist[0], plist[1]))
                    # Map known date properties to their AL tag types.
                    if plist[0] == "/ModDate":
                        pres.add_tag('file.pdf.date.modified', plist[1])
                    elif plist[0] == "/CreationDate":
                        pres.add_tag('file.date.creation', plist[1])
                    elif plist[0] == "/LastModified":
                        pres.add_tag('file.date.last_modified', plist[1])
                    elif plist[0] == "/SourceModified":
                        pres.add_tag('file.pdf.date.source_modified', plist[1])
                    elif plist[0] == "/pdfx":
                        pres.add_tag('file.pdf.date.pdfx', plist[1])
            entropy = pdfid_result.get("Entropy", None)
            if entropy:
                enres = ResultSection(title_text="Entropy", parent=pdfidres)
                for enlist in entropy:
                    enres.add_line("{0}: {1}, ({2})".format(
                        enlist[0], enlist[1], enlist[2]))
        flags = pdfid_result.get("Flags", None)
        if flags:
            fres = ResultSection(title_text="PDF Keyword Flags", parent=pdfidres)
            for flist in flags:
                if flist[0] == "/ObjStm":
                    objstms = True
                if len(flist) == 3:
                    fres.add_line(
                        "{0}:Count: {1}, Hex-Encoded Count: {2}".format(
                            flist[0], flist[1], flist[2]))
                else:
                    fres.add_line("{0}:Count: {1}".format(
                        flist[0], flist[1]))
                fres.add_tag('file.string.extracted', flist[0].replace("/", "", 1))
                if flist[0] in additional_keywords:
                    triage_keywords.add(flist[0].replace("/", "", 1))

        plugin = pdfid_result.get("Plugin", [])

        # If any plugin results, or flagged keywords found, run PDF Parser
        if plugin or len(triage_keywords) > 0:
            run_pdfparse = True

        for pllist in plugin:
            pl_name, pl_heur, pl_text = pllist
            pl_heur = int(pl_heur)
            pl_text = pl_text[14:]  # strip the fixed-width plugin-output prefix
            if not pl_text or pl_text == "None":
                continue

            if pl_name in ['EmbeddedFile', 'Name Obfuscation']:
                modres = ResultSection(title_text=pl_text, parent=pdfidres)
                if pl_heur > 0:
                    modres.set_heuristic(pl_heur)
                if pl_name == 'EmbeddedFile':
                    embed_present = True

            elif pl_name in ['Triage', 'Suspicious Properties']:
                javascript_found = False
                for line in pl_text.splitlines():
                    lineres = ResultSection(title_text=line)
                    # Triage results: each matched keyword gets its heuristic
                    # and is queued for PDF Parser keyword search.
                    if '/JavaScript' in line:
                        triage_keywords.add('JavaScript')
                        if not javascript_found:
                            lineres.set_heuristic(19)
                            javascript_found = True
                    elif '/JS' in line:
                        triage_keywords.add('JS')
                        if not javascript_found:
                            lineres.set_heuristic(19)
                            javascript_found = True
                    elif '/JBIG2Decode' in line:
                        triage_keywords.add('JBIG2Decode')
                        lineres.set_heuristic(3)
                    elif '/Colors > 2^24' in line:
                        triage_keywords.add('Colors > 2^24')
                        lineres.set_heuristic(20)
                    elif '/AA' in line:
                        triage_keywords.add('AA')
                        lineres.set_heuristic(1)
                    elif '/Launch' in line:
                        triage_keywords.add('Launch')
                        lineres.set_heuristic(1)
                    elif '/OpenAction' in line:
                        triage_keywords.add('OpenAction')
                        lineres.set_heuristic(1)
                    elif '/GoToE' in line:
                        triage_keywords.add('GoToE')
                        lineres.set_heuristic(21)
                    elif '/GoToR' in line:
                        triage_keywords.add('GoToR')
                        lineres.set_heuristic(22)
                    elif '/Encrypt' in line:
                        triage_keywords.add('Encrypt')
                        lineres.set_heuristic(11)
                    elif '/AcroForm' in line:
                        triage_keywords.add('AcroForm')
                        lineres.set_heuristic(4)
                    elif '/RichMedia' in line:
                        triage_keywords.add('RichMedia')
                        lineres.set_heuristic(5)
                    elif '/XFA' in line:
                        triage_keywords.add('XFA')
                        lineres.set_heuristic(23)
                    elif '/Annot' in line:
                        triage_keywords.add('Annot')
                        lineres.set_heuristic(25)
                    elif '/ObjStm' in line:
                        triage_keywords.add('ObjStm')
                        lineres.set_heuristic(7)
                    elif '/URI' in line:
                        triage_keywords.add('URI')
                        lineres.set_heuristic(24)
                    # Suspicious properties results
                    elif "eof2" in line:
                        lineres.set_heuristic(2)
                    elif "eof5" in line:
                        lineres.set_heuristic(17)
                    elif "page" in line:
                        lineres.set_heuristic(26)
                    elif "entropy" in line:
                        lineres.set_heuristic(12)
                    elif "obj/endobj" in line:
                        lineres.set_heuristic(13)
                    elif "stream/endstream" in line:
                        lineres.set_heuristic(14)

                    # Only report lines that actually triggered a heuristic.
                    if lineres.heuristic is not None:
                        pdfidres.add_subsection(lineres)

    for e in errors:
        all_errors.add(e)
        if e.startswith('Error running plugin'):
            self.log.warn(e)

    if run_pdfparse:
        # CALL PDF parser and extract further information
        pdf_parserres = ResultSection(title_text="PDF Parser Results")
        # STATISTICS
        # Do not run for objstms, which are being analyzed when get_malform == False
        if get_malform:
            options = {
                "stats": True,
            }
            pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)

            if pdf_parser_result:
                if len(pdf_parser_result) == 0:
                    pdf_parserres.add_line("No statistical results generated for file. Please see errors.")
                else:
                    version = pdf_parser_result.get("version", None)
                    if version and version[0] != '0':
                        pdf_parserres.add_line(version[0])
                    stats = pdf_parser_result.get("stats", None)
                    if stats:
                        sres = ResultSection(title_text="PDF Statistcs",
                                             parent=pdf_parserres,
                                             body_format=BODY_FORMAT.MEMORY_DUMP)
                        for p in stats:
                            sres.add_line(p)
                for e in errors:
                    all_errors.add(e)

        # Triage plugin -- search sample for keywords and carve content or extract object (if it contains a stream)
        carved_content = {}  # Format: {objnum: [{keyword: content}, ...]}
        obj_extract_triage = set()
        jbig_objs = set()

        for keyword in triage_keywords:
            # ObjStms handled differently
            if keyword == 'ObjStm':
                continue

            options = {
                "search": keyword,
            }
            pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)

            if pdf_parser_result:
                for p in pdf_parser_result['parts']:
                    content = ""
                    references = []
                    # Trailer will be extracted anyways, try and grab all references anyways -- will be messy
                    if p.startswith("trailer:"):
                        # Grab the content after the keyword
                        # Check that keyword actually in content
                        if "/{}".format(keyword) in p:
                            try:
                                content = p.split(keyword, 1)[1].replace('>>++>>', '').split("/", 1)[0].strip()
                                references = re.findall("[0-9]* [0-9]* R", content)
                            except Exception:
                                continue
                    # If not trailer, should be object
                    elif 'Referencing:' in p:
                        # Grab the content after the keyword
                        if '>>++>>' in p:
                            try:
                                content = p.split(keyword, 1)[1].replace('>>++>>', '').strip()
                            except Exception:
                                try:
                                    content = p.split("\n", 3)[3]
                                except Exception:
                                    content = p
                        else:
                            try:
                                content = p.split("\n", 3)[3]
                            except Exception:
                                content = p
                        # Sometimes the content is the same keyword with references (i.e "/URI /URI 10 0 R"
                        if content.startswith("/{}".format(keyword)):
                            try:
                                content = re.sub("/{}[ ]*".format(keyword), "", content, 1)
                            except Exception:
                                pass
                        try:
                            references = p.split("\n", 3)[2].replace('Referencing:', '').strip().split(", ")
                        except Exception:
                            pass
                    # Only extract JBIG2Decode objects with deep scan, but always report on their presence
                    if keyword == "JBIG2Decode" and "/Filter" in p and "Contains stream" in p:
                        try:
                            objnum = p.split("\n", 1)[0].split(" ")[1]
                            if request.deep_scan:
                                obj_extract_triage.add(objnum)
                            jbig_objs.add(objnum)
                            continue
                        except Exception as e:
                            self.log.debug(e)
                            continue
                    # If no content, then keyword likely points to reference objects, so grab those
                    if content == '':
                        if len(references) > 0:
                            content = references
                        else:
                            # Something is wrong, drop it.
                            continue
                    else:
                        while True:
                            # Multiple references might be in a list, i.e. /Annot # # R vs. /Annots [# # R # # R]
                            islist = re.match(r"[s]?[ ]?\[([0-9]* [0-9]* R[ \\rn]{0,8})*\]", content)
                            if islist:
                                content = re.sub(
                                    r"[\[\]]", "",
                                    islist.group(0).replace("s ", '').replace("R ", "R,")).split(",")
                                break
                            # References might be with instructions, i.e. [# # R /FitH null]
                            withinst = re.match(
                                r"[s]?[ \\']{0,3}\[[ ]?([0-9]* [0-9]* R)[ \\rn]{1,8}"
                                r"[/a-zA-Z0-9 ]*[ ]?\]", content)
                            if withinst:
                                content = [withinst.group(1)]
                                break
                            content = [content]
                            break

                    for c in content:
                        # If keyword = Javascript and content starts with '/JS', disregard as 'JS' will be extracted
                        if "JS" in triage_keywords and keyword == "JavaScript" and "/JS" in c[0:5]:
                            continue

                        if c in references or re.match("[0-9]* [0-9]* R", c):
                            # Content is an object reference: resolve it and decide
                            # whether to extract, follow, or carve it.
                            try:
                                ref_obj = c.split(" ", 1)[0]
                                options = {
                                    "object": ref_obj,
                                    "get_object_detail": True
                                }
                                pdf_parser_subresult, err = self.get_pdf_parser(path, working_dir, options)

                                if pdf_parser_subresult:
                                    for sub_p in pdf_parser_subresult['parts']:
                                        sub_references = sub_p.split("\n", 3)[2].replace('Referencing:', '')\
                                            .strip().split(", ")
                                        ptyp = sub_p.split("\n", 2)[1].replace('Type:', '').strip().replace("/", "")
                                        # If the object contains a stream, extract the object.
                                        if "Contains stream" in sub_p:
                                            try:
                                                objnum = sub_p.split("\n", 1)[0].split(" ")[1]
                                                obj_extract_triage.add(objnum)
                                            except Exception:
                                                pass
                                        # Or if the object Type is the keyword, grab all referenced objects.
                                        elif sub_references[0] != '' and len(sub_references) >= 1 \
                                                and ptyp == keyword:
                                            for sr in sub_references:
                                                try:
                                                    objnum = sr.split(" ", 1)[0]
                                                    obj_extract_triage.add(objnum)
                                                except Exception:
                                                    pass
                                        # If not, extract object detail in to carved output
                                        elif pdf_parser_subresult['obj_details'] != "":
                                            try:
                                                objnum = sub_p.split("\n", 1)[0].split(" ")[1]
                                                if objnum in carved_content:
                                                    carved_content[objnum]\
                                                        .append({keyword: pdf_parser_subresult['obj_details']})
                                                else:
                                                    carved_content[objnum] = \
                                                        [{keyword: pdf_parser_subresult['obj_details']}]
                                            except Exception:
                                                continue
                                    for e in err:
                                        errors.add(e)
                            except Exception:
                                # If none of that work, just extract the original object for examination.
                                try:
                                    objnum = p.split("\n", 1)[0].split(" ")[1]
                                    obj_extract_triage.add(objnum)
                                except Exception:
                                    pass
                        # If content does not look like a reference:
                        else:
                            if p.startswith("trailer:"):
                                continue
                            objnum = p.split("\n", 1)[0].split(" ")[1]
                            # If the object contains a stream extract the object
                            if p.split("\n", 4)[3] == "Contains stream":
                                obj_extract_triage.add(objnum)
                            else:
                                # Or just carve the content
                                if objnum in carved_content:
                                    carved_content[objnum].append({keyword: c})
                                else:
                                    carved_content[objnum] = [{keyword: c}]

            for e in errors:
                all_errors.add(e)

        # Add carved content to result output
        show_content_of_interest = False
        if len(carved_content) > 0 or len(jbig_objs) > 0:
            carres = ResultSection(title_text="Content of Interest")
        else:
            carres = None

        if len(jbig_objs) > 0:
            jbigres = ResultSection(
                title_text="The following Object IDs are JBIG2DECODE streams:",
                body_format=BODY_FORMAT.MEMORY_DUMP,
                parent=carres)
            jbigres.add_line(', '.join(map(str, jbig_objs)))
            show_content_of_interest = True

        if len(carved_content) > 0:
            for k, l in sorted(carved_content.items()):
                for d in l:
                    for keyw, con in d.items():
                        subres = ResultSection(
                            title_text="Object {0}: Hits for Keyword '{1}':".format(k, keyw))
                        subres.set_heuristic(8)
                        con_bytes = con.encode()
                        if len(con) < 500:
                            # Short content: show inline, report only if it contains IOCs.
                            subres.body_format = BODY_FORMAT.MEMORY_DUMP
                            subres.add_line(con)
                            # Check for IOC content
                            patterns = PatternMatch()
                            st_value = patterns.ioc_match(con_bytes, bogon_ip=True)
                            if len(st_value) > 0:
                                carres.add_subsection(subres)
                                show_content_of_interest = True
                                for ty, val in st_value.items():
                                    # NOTE(review): `val` is the matched-value collection; the
                                    # `val == ""` branch appears unreachable — confirm against
                                    # PatternMatch.ioc_match.
                                    if val == "":
                                        asc_asc = unicodedata.normalize(
                                            'NFKC', val).encode('ascii', 'ignore')
                                        subres.add_tag(ty, asc_asc)
                                    else:
                                        ulis = list(set(val))
                                        for v in ulis:
                                            subres.add_tag(ty, v)
                        else:
                            # Long content: extract to a file instead (deduped by sha256).
                            crv_sha = hashlib.sha256(con_bytes).hexdigest()
                            if crv_sha not in carved_extracted_shas:
                                f_name = "carved_content_obj_{}_{}".format(k, crv_sha[0:7])
                                subres.add_lines([
                                    "Content over 500 bytes it will be extracted for analysis",
                                    "Name: {} - SHA256: {}".format(f_name, crv_sha)
                                ])
                                carres.add_subsection(subres)
                                show_content_of_interest = True
                                crvf = os.path.join(self.working_directory, f_name)
                                with open(crvf, 'wb') as f:
                                    f.write(con_bytes)
                                request.add_extracted(
                                    crvf, os.path.basename(crvf),
                                    "Extracted content from object {}".format(k))
                                carved_extracted_shas.add(crv_sha)

        if show_content_of_interest:
            pdf_parserres.add_subsection(carres)

        # ELEMENTS
        # Do not show for objstms
        if get_malform:
            if request.deep_scan:
                options = {
                    "verbose": True,
                    "nocanonicalizedoutput": True,
                    "get_malform": get_malform
                }
            elif embed_present:
                options = {
                    "verbose": True,
                    "elements": "ctsi",
                    "type": "/EmbeddedFile",
                    "get_malform": get_malform
                }
            else:
                options = {
                    "verbose": True,
                    "elements": "cst",
                    "get_malform": get_malform
                }
            pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)

            embed_extracted = set()
            if pdf_parser_result:
                if len(pdf_parser_result) == 0:
                    pdf_parserres.add_line("No structure information generated for file. Please see errors.")
                else:
                    # PDF Parser will write any malformed content over 100 bytes to a file
                    files = pdf_parser_result.get("files", None)
                    if files:
                        for f, l in files.items():
                            if f == 'malformed':
                                if len(l) > 0:
                                    pdf_parserres.set_heuristic(6)
                                for i in l:
                                    request.add_extracted(
                                        i, os.path.basename(i),
                                        "Extracted malformed content in PDF Parser Analysis.")

                    parts = pdf_parser_result.get("parts", None)
                    # Extract service will extract the sample's embedded files.
                    # However we want to make note of them so that they are not extracted again below
                    if parts:
                        for p in sorted(parts):
                            if "Type: /EmbeddedFile" in p:
                                getobj = p.split("\n", 1)[0].split(" ")[1]
                                embed_extracted.add(getobj)

            # Extract objects collected from above analysis
            # NOTE(review): nesting reconstructed — `embed_extracted` is only assigned in this
            # get_malform branch, so this subtraction is kept inside it; confirm against upstream.
            obj_to_extract = obj_extract_triage - embed_extracted - jbig_objs

            if len(obj_to_extract) > 0:
                options = {
                    "filter": True,
                    "object": obj_to_extract,
                    "dump": "extracted_obj_",
                }
                pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)
                if pdf_parser_result:
                    files = pdf_parser_result.get("files", None)
                    extracted_files = []
                    if files:
                        for f, l in files.items():
                            if f == 'embedded':
                                for i in l:
                                    f_name = os.path.basename(i)
                                    obj_id = f_name.replace("extracted_obj_", "")
                                    extracted_files.append(
                                        "Extracted object {} as {}".format(obj_id, f_name))
                                    request.add_extracted(
                                        i, f_name,
                                        "Object {} extracted in PDF Parser Analysis.".format(obj_id))
                    for e in errors:
                        all_errors.add(e)

                    if extracted_files:
                        extract_res = ResultSection(title_text="Extracted embedded objects",
                                                    parent=pdf_parserres)
                        extract_res.set_heuristic(9)
                        extract_res.add_lines(extracted_files)

            # Extract jbig2decode objects in deep scan mode
            if request.deep_scan and len(jbig_objs) > 0:
                options = {
                    "object": jbig_objs,
                    "dump": "extracted_jb_obj_",
                }
                pdf_parser_result, errors = self.get_pdf_parser(path, working_dir, options)
                if pdf_parser_result:
                    extracted_jb = []
                    files = pdf_parser_result.get("files", None)
                    if files:
                        for f, l in files.items():
                            if f == 'embedded':
                                for i in l:
                                    f_name = os.path.basename(i)
                                    obj_id = f_name.replace("extracted_jb_obj_", "")
                                    extracted_jb.append(
                                        "JBIG2DECODE object {} extracted as {}".format(obj_id, f_name))
                                    request.add_extracted(
                                        i, f_name,
                                        "JBIG2DECODE object {} extracted in PDF Parser Analysis.".format(obj_id))
                    for e in errors:
                        all_errors.add(e)

                    if extracted_jb:
                        jbig_extract_res = ResultSection(title_text="Extracted JBIG2Decode objects",
                                                         parent=pdf_parserres)
                        jbig_extract_res.set_heuristic(9)
                        jbig_extract_res.add_lines(extracted_jb)

        if len(pdf_parserres.subsections) > 0:
            res.add_subsection(pdf_parserres)

    return res, objstms, all_errors
def execute(self, request: ServiceRequest) -> None:
    """Run the deobfuscation service on a submitted file.

    Pipeline: record IOCs present in the original file, extract embedded
    scripts (HTML), repeatedly apply deobfuscation techniques in parallel
    passes until no new layers appear (or max_attempts is exceeded), then
    report the layers taken, the cleaned final layer, any new URIs that
    appeared after deobfuscation, and any files extracted along the way.

    Args:
        request: AL service request; results are attached to request.result.
    """
    # --- Setup ----------------------------------------------------------------------------------------------
    request.result = Result()
    patterns = PatternMatch()
    # Deep scans get far more deobfuscation passes.
    max_attempts = 100 if request.deep_scan else 10
    self.files_extracted = set()
    self.hashes = set()

    # --- Pre-Processing --------------------------------------------------------------------------------------
    # Get all IOCs prior to de-obfuscation
    pat_values = patterns.ioc_match(request.file_contents,
                                    bogon_ip=True,
                                    just_network=False)
    if pat_values and request.get_param('extract_original_iocs'):
        ioc_res = ResultSection(
            "The following IOCs were found in the original file",
            parent=request.result,
            body_format=BODY_FORMAT.MEMORY_DUMP)
        for k, val in pat_values.items():
            for v in val:
                ioc_res.add_line(
                    f"Found {k.upper().replace('.', ' ')}: {safe_str(v)}")
                ioc_res.add_tag(k, v)

    # --- Prepare Techniques ----------------------------------------------------------------------------------
    # First-pass techniques, tried on every layer.
    techniques = [
        ('MSOffice Embedded script', self.msoffice_embedded_script_string),
        ('CHR and CHRB decode', self.chr_decode),
        ('String replace', self.string_replace),
        ('Powershell carets', self.powershell_carets),
        ('Array of strings', self.array_of_strings),
        ('Fake array vars', self.vars_of_fake_arrays),
        ('Reverse strings', self.str_reverse),
        ('B64 Decode', self.b64decode_str),
        ('Simple XOR function', self.simple_xor_function),
    ]
    # Added in front of `techniques` once a pass produces no new layers.
    second_pass = [('Concat strings', self.concat_strings),
                   ('MSWord macro vars', self.mswordmacro_vars),
                   ('Powershell vars', self.powershell_vars),
                   ('Charcode hex', self.charcode_hex)]
    # Only run once, at the very end.
    final_pass = [
        ('Charcode', self.charcode),
    ]

    code_extracts = [('.*html.*', "HTML scripts extraction",
                      self.extract_htmlscript)]

    layers_list: List[Tuple[str, bytes]] = []
    layer = request.file_contents

    # --- Stage 1: Script Extraction --------------------------------------------------------------------------
    for pattern, name, func in code_extracts:
        if regex.match(regex.compile(pattern), request.task.file_type):
            extracted_parts = func(request.file_contents)
            layer = b"\n".join(extracted_parts).strip()
            layers_list.append((name, layer))
            break

    # --- Stage 2: Deobsfucation ------------------------------------------------------------------------------
    idx = 0
    first_pass_len = len(techniques)
    layers_count = len(layers_list)
    while True:
        if idx > max_attempts:
            # Out of attempts: run every remaining technique once and stop.
            final_pass.extend(techniques)
            for name, technique in final_pass:
                res = technique(layer)
                if res:
                    layers_list.append((name, res))
            break

        # Run the current techniques in parallel against the current layer.
        with ThreadPoolExecutor() as executor:
            threads = [
                executor.submit(technique, layer)
                for name, technique in techniques
            ]
            results = [thread.result() for thread in threads]
        for (name, _technique), result in zip(techniques, results):
            if result:
                layers_list.append((name, result))
                # Looks like it worked, restart with new layer
                layer = result

        # If the layers haven't changed in a pass, escalate or stop.
        if layers_count == len(layers_list):
            if len(techniques) != first_pass_len:
                # Second pass also stalled: run the final techniques and stop.
                final_pass.extend(techniques)
                with ThreadPoolExecutor() as executor:
                    threads = [
                        executor.submit(technique, layer)
                        for name, technique in final_pass
                    ]
                    results = [thread.result() for thread in threads]
                # BUGFIX: results are produced from final_pass, so the layer names
                # must come from final_pass as well. The previous code indexed
                # `techniques[i][0]`, which mislabels layers and raises IndexError
                # once final_pass grows longer than techniques.
                for (name, _technique), result in zip(final_pass, results):
                    if result:
                        layers_list.append((name, result))
                break
            # First stall: promote the second-pass techniques and keep going.
            for x in second_pass:
                techniques.insert(0, x)
        layers_count = len(layers_list)
        idx += 1

    # --- Compiling results -----------------------------------------------------------------------------------
    if len(layers_list) > 0:
        extract_file = False
        num_layers = len(layers_list)

        # Compute heuristic: severity scales with layer count.
        if num_layers < 5:
            heur_id = 1
        elif num_layers < 10:
            heur_id = 2
        elif num_layers < 50:
            heur_id = 3
        elif num_layers < 100:
            heur_id = 4
        else:  # num_layers >= 100
            heur_id = 5

        # Cleanup final layer
        clean = self.clean_up_final_layer(layers_list[-1][1])
        if clean != request.file_contents:
            # Check for new IOCs
            pat_values = patterns.ioc_match(clean,
                                            bogon_ip=True,
                                            just_network=False)
            diff_tags: Dict[str, List[bytes]] = {}
            for uri in pat_values.get('network.static.uri', []):
                # Compare URIs without query string
                uri = uri.split(b'?', 1)[0]
                if uri not in request.file_contents:
                    diff_tags.setdefault('network.static.uri', [])
                    diff_tags['network.static.uri'].append(uri)

            # Extract the final layer when the scan is deep, heavily layered,
            # or new URIs appeared.
            if request.deep_scan or (len(clean) > 1000 and heur_id >= 4) or diff_tags:
                extract_file = True

            # Display obfuscation steps
            mres = ResultSection(
                "De-obfuscation steps taken by DeobsfuScripter",
                parent=request.result)
            if heur_id:
                mres.set_heuristic(heur_id)
            lcount = Counter([x[0] for x in layers_list])
            for l, c in lcount.items():
                mres.add_line(f"{l}, {c} time(s).")

            # Display final layer
            byte_count = 5000
            if extract_file:
                # Save extracted file
                byte_count = 500
                file_name = f"{os.path.basename(request.file_name)}_decoded_final"
                file_path = os.path.join(self.working_directory, file_name)
                # Ensure directory exists before write
                os.makedirs(os.path.dirname(file_path), exist_ok=True)
                with open(file_path, 'wb+') as f:
                    f.write(clean)
                self.log.debug(
                    f"Submitted dropped file for analysis: {file_path}")
                request.add_extracted(file_path, file_name,
                                      "Final deobfuscation layer")

            ResultSection(f"First {byte_count} bytes of the final layer:",
                          body=safe_str(clean[:byte_count]),
                          body_format=BODY_FORMAT.MEMORY_DUMP,
                          parent=request.result)

            # Display new IOCs from final layer
            if len(diff_tags) > 0:
                ioc_new = ResultSection(
                    "New IOCs found after de-obfustcation",
                    parent=request.result,
                    body_format=BODY_FORMAT.MEMORY_DUMP)
                has_network_heur = False
                for ty, val in diff_tags.items():
                    for v in val:
                        if "network" in ty:
                            has_network_heur = True
                        ioc_new.add_line(
                            f"Found {ty.upper().replace('.', ' ')}: {safe_str(v)}")
                        ioc_new.add_tag(ty, v)
                # Network IOCs are scored more severely than others.
                if has_network_heur:
                    ioc_new.set_heuristic(7)
                else:
                    ioc_new.set_heuristic(6)

            # Report files extracted by individual techniques during the run.
            if len(self.files_extracted) > 0:
                ext_file_res = ResultSection(
                    "The following files were extracted during the deobfuscation",
                    heuristic=Heuristic(8),
                    parent=request.result)
                for extracted in self.files_extracted:
                    file_name = os.path.basename(extracted)
                    ext_file_res.add_line(file_name)
                    request.add_extracted(
                        extracted, file_name,
                        "File of interest deobfuscated from sample")