Example #1
    def recurse_add_res(self, file_res, res_list, new_files, parent=None):
        for res_dic in res_list:
            # Check if condition is OK
            if self.pass_condition(res_dic.get("condition", None)):
                res = ResultSection(res_dic['title_text'],
                                    classification=res_dic.get('classification', Classification.UNRESTRICTED),
                                    parent=parent, body_format=res_dic.get('body_format', BODY_FORMAT.TEXT))
                heur_id = self.heuristic_alteration(res_dic.get('score_condition', None), res_dic['heur_id'])
                res.set_heuristic(heur_id)

                # Add Tags
                tags = res_dic.get('tags', [])
                for res_tag in tags:
                    res.add_tag(res_tag[0], res_tag[1])

                # Add body
                body = res_dic.get('body', None)
                if body:
                    res.set_body(body)

                # File for resubmit
                files = res_dic.get('files', [])
                for res_file in files:
                    if isinstance(res_file, tuple):
                        res_file = res_file[1]
                    new_files.append(res_file)
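
                # Recurse into any child result dictionaries so nested sections are built too
                # (the 'subsections' key is an assumption; adjust it to the schema in use)
                self.recurse_add_res(file_res, res_dic.get('subsections', []),
                                     new_files, parent=res)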

                # Add to file res if root result
                if parent is None:
                    file_res.add_section(res)
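
The pass_condition and heuristic_alteration helpers used above are not shown. The following is a minimal sketch of what they might look like, assuming conditions are booleans pre-computed on the service instance and that a score condition simply remaps the heuristic ID; self.conditions and self.heur_id_remap are hypothetical names, not the service's actual attributes.

    def pass_condition(self, condition):
        # No condition means the section is always included
        if condition is None:
            return True
        # Assumption: conditions were pre-computed and stored on the service instance
        return bool(self.conditions.get(condition, False))

    def heuristic_alteration(self, score_condition, heur_id):
        # Assumption: when the score condition holds, the heuristic ID is remapped
        if score_condition is not None and self.pass_condition(score_condition):
            return self.heur_id_remap.get(heur_id, heur_id)
        return heur_id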
Example #2
    def test_init(mocker):
        from assemblyline_v4_service.common.result import ResultSection
        mocker.patch("assemblyline_v4_service.common.api.ServiceAPIError")
        from metadefender import AvErrorSection
        av_name = "blah"
        engine = {}
        actual_res_sec = AvErrorSection(av_name, engine)
        correct_result_section = ResultSection(
            f"{av_name} failed to scan the file")
        correct_result_section.set_body("")
        assert check_section_equality(actual_res_sec, correct_result_section)

        engine = {"version": "blah", "def_time": "blah"}
        actual_res_sec = AvErrorSection(av_name, engine)
        correct_result_section = ResultSection(
            f"{av_name} failed to scan the file")
        correct_result_section.set_body(
            f"Engine: {engine['version']} :: Definition: {engine['def_time']}")
        assert check_section_equality(actual_res_sec, correct_result_section)
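
The check_section_equality helper is not shown above. A minimal sketch that compares only the fields these tests exercise (title, body, heuristic, and tags); the attribute names follow assemblyline_v4_service's ResultSection, and the real helper may compare more.

    def check_section_equality(this, that) -> bool:
        # Compare the basic section fields
        same_basics = (this.title_text == that.title_text
                       and this.body == that.body
                       and this.body_format == that.body_format)
        # Compare heuristics, tolerating sections without one
        if this.heuristic is None or that.heuristic is None:
            same_heuristic = this.heuristic == that.heuristic
        else:
            same_heuristic = (this.heuristic.heur_id == that.heuristic.heur_id
                              and this.heuristic.signatures == that.heuristic.signatures)
        return same_basics and same_heuristic and this.tags == that.tags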
Example #3
def subsection_builder(parent_section: ResultSection = None,
                       fields: dict = None):
    # A mutable default argument ({}) is shared across calls; use None and normalize instead
    fields = fields or {}
    for mwcp_field, mwcp_field_data in fields.items():
        if mwcp_field in FIELD_TAG_MAP:
            tag = FIELD_TAG_MAP[mwcp_field]
            table_body = []
            table_section = ResultSection(
                f"Extracted {mwcp_field.capitalize()}")
            if tag:
                # Tag everything that we can
                for x in mwcp_field_data:
                    table_section.add_tag(tag, x)
            # Add data to section body
            for line in mwcp_field_data:
                if isinstance(line, str):
                    table_body.append({mwcp_field: line})
                elif isinstance(line, list):
                    for item in line:
                        table_body.append({mwcp_field: item})
            table_section.set_body(body_format=BODY_FORMAT.TABLE,
                                   body=json.dumps(table_body))

            parent_section.add_subsection(table_section)
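
For context, a hypothetical call site for subsection_builder; the FIELD_TAG_MAP entries below are illustrative only, not the service's real mapping.

# Illustrative only: fields mapped to a tag type get tagged, fields mapped to None are only tabled
FIELD_TAG_MAP = {
    "address": "network.dynamic.uri",
    "password": None,
}

parent = ResultSection("MWCP Results")
subsection_builder(parent, {"address": ["http://example.com/payload"],
                            "password": ["hunter2"]})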
Example #4
    def execute(self, request):
        parser = eml_parser.eml_parser.EmlParser(include_raw_body=True,
                                                 include_attachment_data=True)
        content_str = request.file_contents

        # Attempt conversion of potential Outlook file -> eml
        if request.file_type == "document/office/email":
            try:
                content_str = msg2eml(request.file_path).as_bytes()
            except Exception:
                # Try using mailparser to convert
                converted_path, _ = msgconvert(request.file_path)
                with open(converted_path, "rb") as converted_file:
                    content_str = converted_file.read()

        header_agg = {
            "From": set(),
            "To": set(),
            "Cc": set(),
            "Sent": set(),
            "Reply-To": set(),
            "Date": set()
        }
        # Assume this is an email saved in HTML format
        if request.file_type == "code/html":
            parsed_html = BeautifulSoup(content_str, "lxml")
            valid_headers = [
                "To:", "Cc:", "Sent:", "From:", "Subject:", "Reply-To:"
            ]

            if not parsed_html.body or not any(header in parsed_html.body.text
                                               for header in valid_headers):
                # We can assume this is just an HTML doc (or lacking body), one of which we can't process
                request.result = Result()
                return

            # Can't trust 'Date' to determine the difference between HTML docs vs HTML emails
            valid_headers.append("Date:")

            html_email = email.message_from_bytes(content_str)
            generator_metadata_content = ""
            for meta in parsed_html.find_all("meta"):
                if meta.attrs.get("name", None) == "Generator":
                    generator_metadata_content = meta.attrs.get("content", "")
                    break

            # Process HTML emails generated from Outlook
            if generator_metadata_content == "Microsoft Word 15":
                paragraphs = parsed_html.body.find_all("p")
                # Likely an email that was exported with original email headers
                if any(header in paragraphs[0] for header in valid_headers):
                    for p in paragraphs:
                        if any(valid_header in p.text
                               for valid_header in valid_headers):
                            h_key, h_value = p.text.replace(
                                "\xa0", "").replace("\r\n", " ").split(":", 1)
                            html_email[h_key] = h_value
                            # Subject line indicates the end of the email header, beginning of body
                            if "Subject" in p.text:
                                break
            # Process HTML emails from MS Exchange Server or missing top-level headers (aggregate headers)
            elif generator_metadata_content in ("Microsoft Word 15 (filtered medium)",
                                                "Microsoft Exchange Server", ""):
                subject = None
                for div in parsed_html.find_all("div"):
                    # Header information within divs
                    if (any(header in div.text for header in valid_headers)
                            and "WordSection1" not in div.attrs.get("class", [])):
                        # Usually expect headers to be \n separated in text output but check first
                        if "\n" in div.text:
                            for h in div.text.split("\n"):
                                if any(header in h
                                       for header in valid_headers):
                                    h_key, h_value = h.split(":", 1)

                                    # Implying some malformed message got mixed with the headers of another message
                                    # (valid_headers entries carry a trailing colon, so compare against h_key + ":")
                                    if f"{h_key}:" not in valid_headers:
                                        for header in valid_headers:
                                            if header in h:
                                                h_key = header[:-1]

                                    # Use the latest message's subject (this maintains FW, RE, etc.)
                                    if h_key == "Subject" and not subject:
                                        subject = h_value
                                    elif h_key != "Subject":
                                        header_agg[h_key].add(h_value)

                        # Document was probably not well formatted, so we'll use the headers as delimiters
                        else:
                            header_offset_map = {}
                            # Determine the position of each header
                            for header in list(
                                    header_agg.keys()) + ["Subject"]:
                                if header in div.text:
                                    header_offset_map[div.text.index(
                                        header)] = header
                            # Use the positions and length of header name to determine an offset
                            sorted_keys = sorted(header_offset_map.keys())
                            for i in range(len(sorted_keys)):
                                header_name = header_offset_map[sorted_keys[i]]
                                offset = len(
                                    f"{header_name}: ") + sorted_keys[i]
                                value = (div.text[offset:sorted_keys[i + 1]]
                                         if i < len(header_offset_map) - 1 else
                                         div.text[offset:])

                                if header_name == "Subject":
                                    subject = value
                                else:
                                    header_agg[header_name].add(value)

                # Assign aggregated info to email object
                html_email["Subject"] = subject
                for key, value in header_agg.items():
                    html_email[key] = "; ".join(value)
            content_str = html_email.as_bytes()

        parsed_eml = parser.decode_email_bytes(content_str)
        result = Result()
        header = parsed_eml["header"]

        if "from" in header or "to" in header:
            all_uri = set()
            body_words = set(extract_passwords(header["subject"]))
            for body_counter, body in enumerate(parsed_eml["body"]):
                body_text = BeautifulSoup(body["content"]).text
                body_words.update(extract_passwords(body_text))
                if request.get_param("extract_body_text"):
                    fd, path = mkstemp()
                    with os.fdopen(fd, "w") as f:
                        f.write(body["content"])
                    request.add_extracted(path, "body_" + str(body_counter),
                                          "Body text")
                if "uri" in body:
                    for uri in body["uri"]:
                        all_uri.add(uri)
            # Words in the email body, used by extract to guess passwords
            request.temp_submission_data["email_body"] = list(body_words)

            kv_section = ResultSection("Email Headers",
                                       body_format=BODY_FORMAT.KEY_VALUE,
                                       parent=result)

            # Basic tags
            from_addr = header["from"].strip() if header.get("from",
                                                             None) else None
            if from_addr and re.match(EMAIL_REGEX, from_addr):
                kv_section.add_tag("network.email.address", from_addr)
            for to in header["to"]:
                if re.match(EMAIL_REGEX, to.strip()):
                    kv_section.add_tag("network.email.address", to.strip())

            kv_section.add_tag("network.email.date",
                               str(header["date"]).strip())

            subject = header["subject"].strip() if header.get("subject",
                                                              None) else None
            if subject:
                kv_section.add_tag("network.email.subject", subject)

            # Add CCs to body and tags
            if "cc" in header:
                for cc in header["cc"]:
                    if re.match(EMAIL_REGEX, cc.strip()):
                        kv_section.add_tag("network.email.address", cc.strip())
            # Add Message ID to body and tags
            if "message-id" in header["header"]:
                kv_section.add_tag("network.email.msg_id",
                                   header["header"]["message-id"][0].strip())

            # Add Tags for received IPs
            if "received_ip" in header:
                for ip in header["received_ip"]:
                    ip = ip.strip()
                    try:
                        if isinstance(ip_address(ip), IPv4Address):
                            kv_section.add_tag("network.static.ip", ip)
                    except ValueError:
                        pass

            # Add Tags for received Domains
            if "received_domain" in header:
                for dom in header["received_domain"]:
                    kv_section.add_tag("network.static.domain", dom.strip())

            # If we've found URIs, add them to a section
            if len(all_uri) > 0:
                uri_section = ResultSection("URIs Found:", parent=result)
                for uri in all_uri:
                    uri_section.add_line(uri)
                    uri_section.add_tag("network.static.uri", uri.strip())
                    parsed_url = urlparse(uri)
                    if parsed_url.hostname and re.match(IP_ONLY_REGEX, parsed_url.hostname):
                        uri_section.add_tag("network.static.ip", parsed_url.hostname)
                    elif parsed_url.hostname:
                        uri_section.add_tag("network.static.domain", parsed_url.hostname)

            # Bring all headers together...
            extra_header = header.pop("header", {})
            header.pop("received", None)
            header.update(extra_header)

            # Convert to common format
            header["date"] = [self.json_serial(header["date"])]

            # Replace with aggregated date(s) if any available
            if header_agg["Date"]:
                # Replace the default epoch date entirely
                if any(default_date in header["date"] for default_date in
                       ["1970-01-01T00:00:00", "Thu, 01 Jan 1970 00:00:00 +0000"]):
                    header["date"] = list(header_agg["Date"])
                # Otherwise, append the aggregated dates
                else:
                    header["date"] += list(header_agg["Date"])
                # A bare generator expression is never consumed, so use a real loop to add the tags
                for date in header_agg["Date"]:
                    kv_section.add_tag("network.email.date", str(date).strip())

            # Filter out useless headers from results
            self.log.debug(header.keys())
            for h in self.header_filter:
                header.pop(h, None)
            kv_section.set_body(json.dumps(header, default=self.json_serial))

            attachments_added = []
            if "attachment" in parsed_eml:
                attachments = parsed_eml["attachment"]
                for attachment in attachments:
                    fd, path = mkstemp()
                    with os.fdopen(fd, "wb") as f:
                        f.write(base64.b64decode(attachment["raw"]))
                    try:
                        if request.add_extracted(
                                path,
                                attachment["filename"],
                                "Attachment ",
                                safelist_interface=self.api_interface):
                            attachments_added.append(attachment["filename"])
                    except MaxExtractedExceeded:
                        self.log.warning(
                            f"Extract limit reached on attachments: "
                            f"{len(attachments) - len(attachments_added)} not added"
                        )
                        break
                ResultSection("Extracted Attachments:",
                              body="\n".join([x for x in attachments_added]),
                              parent=result)

            if request.get_param("save_emlparser_output"):
                fd, temp_path = tempfile.mkstemp(dir=self.working_directory)
                attachments = parsed_eml.get("attachment", [])
                # Remove raw attachments, all attachments up to MaxExtractedExceeded already extracted
                for attachment in attachments:
                    _ = attachment.pop("raw", None)
                with os.fdopen(fd, "w") as myfile:
                    myfile.write(
                        json.dumps(parsed_eml, default=self.json_serial))
                request.add_supplementary(
                    temp_path, "parsing.json",
                    "These are the raw results of running GOVCERT-LU's eml_parser"
                )
        else:
            self.log.warning(
                "emlParser could not parse EML; no useful information in result's headers"
            )

        request.result = result
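
The json_serial method is used as the default= hook for json.dumps throughout this service but is not shown. A minimal sketch, assuming its job is to make datetime objects (the usual offender in parsed email headers) JSON-safe; it requires "from datetime import datetime".

    def json_serial(self, obj):
        # Render datetimes as ISO-8601 strings
        if isinstance(obj, datetime):
            return obj.isoformat()
        # Fall back to the string representation for anything else json can't handle
        return str(obj)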
Example #5
    def peepdf_analysis(self, temp_filename, file_content, request):
        file_res = Result()
        try:
            res_list = []
            # js_stream = []
            f_list = []
            js_dump = []

            pdf_parser = PDFParser()
            ret, pdf_file = pdf_parser.parse(temp_filename, True, False, file_content)
            if ret == 0:
                stats_dict = pdf_file.getStats()

                if ", ".join(stats_dict['Errors']) == "Bad PDF header, %%EOF not found, PDF sections not found, No " \
                                                      "indirect objects found in the body":
                    # Not a PDF
                    return

                json_body = dict(
                    version=stats_dict['Version'],
                    binary=stats_dict['Binary'],
                    linearized=stats_dict['Linearized'],
                    encrypted=stats_dict['Encrypted'],
                )

                if stats_dict['Encryption Algorithms']:
                    temp = []
                    for algorithmInfo in stats_dict['Encryption Algorithms']:
                        temp.append(f"{algorithmInfo[0]} {str(algorithmInfo[1])} bits")
                    json_body["encryption_algorithms"] = temp

                json_body.update(dict(
                    updates=stats_dict['Updates'],
                    objects=stats_dict['Objects'],
                    streams=stats_dict['Streams'],
                    comments=stats_dict['Comments'],
                    errors={True: ", ".join(stats_dict['Errors']),
                            False: "None"}[len(stats_dict['Errors']) != 0]
                ))
                res = ResultSection("PDF File Information", body_format=BODY_FORMAT.KEY_VALUE,
                                    body=json.dumps(json_body))

                for version in range(len(stats_dict['Versions'])):
                    stats_version = stats_dict['Versions'][version]
                    v_json_body = dict(
                        catalog=stats_version['Catalog'] or "no",
                        info=stats_version['Info'] or "no",
                        objects=self.list_first_x(stats_version['Objects'][1]),
                    )

                    if stats_version['Compressed Objects'] is not None:
                        v_json_body['compressed_objects'] = self.list_first_x(stats_version['Compressed Objects'][1])

                    if stats_version['Errors'] is not None:
                        v_json_body['errors'] = self.list_first_x(stats_version['Errors'][1])

                    v_json_body['streams'] = self.list_first_x(stats_version['Streams'][1])

                    if stats_version['Xref Streams'] is not None:
                        v_json_body['xref_streams'] = self.list_first_x(stats_version['Xref Streams'][1])

                    if stats_version['Object Streams'] is not None:
                        v_json_body['object_streams'] = self.list_first_x(stats_version['Object Streams'][1])

                    if int(stats_version['Streams'][0]) > 0:
                        v_json_body['encoded'] = self.list_first_x(stats_version['Encoded'][1])
                        if stats_version['Decoding Errors'] is not None:
                            v_json_body['decoding_errors'] = self.list_first_x(stats_version['Decoding Errors'][1])

                    if stats_version['Objects with JS code'] is not None:
                        v_json_body['objects_with_js_code'] = \
                            self.list_first_x(stats_version['Objects with JS code'][1])
                        # js_stream.extend(stats_version['Objects with JS code'][1])

                    res_version = ResultSection(f"Version {str(version)}", parent=res,
                                                body_format=BODY_FORMAT.KEY_VALUE, body=json.dumps(v_json_body))

                    actions = stats_version['Actions']
                    events = stats_version['Events']
                    vulns = stats_version['Vulns']
                    elements = stats_version['Elements']
                    is_suspicious = False
                    if events is not None or actions is not None or vulns is not None or elements is not None:
                        res_suspicious = ResultSection('Suspicious elements', parent=res_version)
                        if events is not None:
                            for event in events:
                                res_suspicious.add_line(f"{event}: {self.list_first_x(events[event])}")
                            is_suspicious = True
                        if actions is not None:
                            for action in actions:
                                res_suspicious.add_line(f"{action}: {self.list_first_x(actions[action])}")
                            is_suspicious = True
                        if vulns is not None:
                            for vuln in vulns:
                                if vuln in vulnsDict:
                                    temp = [vuln, ' (']
                                    for vulnCVE in vulnsDict[vuln]:
                                        # Separate multiple CVEs with commas (the first iteration skips the comma)
                                        if len(temp) != 2:
                                            temp.append(',')
                                        vulnCVE = "".join(vulnCVE) if isinstance(vulnCVE, list) else vulnCVE
                                        temp.append(vulnCVE)
                                        cve_found = re.search("CVE-[0-9]{4}-[0-9]{4}", vulnCVE)
                                        if cve_found:
                                            res_suspicious.add_tag('attribution.exploit',
                                                                   vulnCVE[cve_found.start():cve_found.end()])
                                            res_suspicious.add_tag('file.behavior',
                                                                   vulnCVE[cve_found.start():cve_found.end()])
                                    temp.append('): ')
                                    temp.append(str(vulns[vuln]))
                                    res_suspicious.add_line(temp)
                                else:
                                    res_suspicious.add_line(f"{vuln}: {str(vulns[vuln])}")
                                is_suspicious = True
                        if elements is not None:
                            for element in elements:
                                if element in vulnsDict:
                                    temp = [element, ' (']
                                    for vulnCVE in vulnsDict[element]:
                                        if len(temp) != 2:
                                            temp.append(',')
                                        vulnCVE = "".join(vulnCVE) if isinstance(vulnCVE, list) else vulnCVE
                                        temp.append(vulnCVE)
                                        cve_found = re.search("CVE-[0-9]{4}-[0-9]{4}", vulnCVE)
                                        if cve_found:
                                            res_suspicious.add_tag('attribution.exploit',
                                                                   vulnCVE[cve_found.start():cve_found.end()])
                                            res_suspicious.add_tag('file.behavior',
                                                                   vulnCVE[cve_found.start():cve_found.end()])
                                    temp.append('): ')
                                    temp.append(str(elements[element]))
                                    res_suspicious.add_line(temp)
                                    is_suspicious = True
                                else:
                                    res_suspicious.add_line(f"\t\t{element}: {str(elements[element])}")
                                    is_suspicious = True
                    if is_suspicious:
                        res_suspicious.set_heuristic(8)

                    urls = stats_version['URLs']
                    if urls is not None:
                        res.add_line("")
                        res_url = ResultSection('Found URLs', parent=res)
                        for url in urls:
                            res_url.add_line(f"\t\t{url}")
                        # Set the heuristic once for the whole section rather than per URL
                        res_url.set_heuristic(9)

                    for obj in stats_version['Objects'][1]:
                        cur_obj = pdf_file.getObject(obj, version)

                        if cur_obj.containsJScode:
                            cur_res = ResultSection(f"Object [{obj} {version}] contains {len(cur_obj.JSCode)} "
                                                    f"block of JavaScript")
                            score_modifier = 0

                            js_idx = 0
                            for js in cur_obj.JSCode:
                                sub_res = ResultSection('Block of JavaScript', parent=cur_res)
                                js_idx += 1
                                js_score = 0
                                js_code, unescaped_bytes, _, _, _ = analyseJS(js)

                                js_dump += [x for x in js_code]

                                # Malicious characteristics
                                big_buffs = self.get_big_buffs("".join(js_code))
                                if big_buffs:
                                    # Score each large buffer once (the original double-counted a single buffer)
                                    js_score += 500 * len(big_buffs)
                                has_eval, has_unescape = self.check_dangerous_func("".join(js_code))
                                if has_unescape:
                                    js_score += 100
                                if has_eval:
                                    js_score += 100

                                js_cmt = ""
                                if has_eval or has_unescape or len(big_buffs) > 0:
                                    score_modifier += js_score
                                    js_cmt = "Suspiciously malicious "
                                    cur_res.add_tag('file.behavior', "Suspicious JavaScript in PDF")
                                    sub_res.set_heuristic(7)
                                js_res = ResultSection(f"{js_cmt}JavaScript Code (block: {js_idx})", parent=sub_res)

                                if js_score > 0:
                                    temp_js_outname = f"object{obj}-{version}_{js_idx}.js"
                                    temp_js_path = os.path.join(self.working_directory, temp_js_outname)
                                    temp_js_bin = "".join(js_code).encode("utf-8")
                                    f = open(temp_js_path, "wb")
                                    f.write(temp_js_bin)
                                    f.close()
                                    f_list.append(temp_js_path)

                                    js_res.add_line(f"The JavaScript block was saved as {temp_js_outname}")
                                    if has_eval or has_unescape:
                                        analysis_res = ResultSection("[Suspicious Functions]", parent=js_res)
                                        if has_eval:
                                            analysis_res.add_line("eval: This JavaScript block uses eval() function "
                                                                  "which is often used to launch deobfuscated "
                                                                  "JavaScript code.")
                                            analysis_res.set_heuristic(3)
                                        if has_unescape:
                                            analysis_res.add_line("unescape: This JavaScript block uses unescape() "
                                                                  "function. It may be legitimate but it is definitely "
                                                                  "suspicious since malware often use this to "
                                                                  "deobfuscate code blocks.")
                                            analysis_res.set_heuristic(3)

                                    buff_idx = 0
                                    for buff in big_buffs:
                                        buff_idx += 1
                                        error, new_buff = unescape(buff)
                                        if error == 0:
                                            buff = new_buff

                                        if buff not in unescaped_bytes:
                                            temp_path_name = None
                                            if ";base64," in buff[:100] and "data:" in buff[:100]:
                                                temp_path_name = f"obj{obj}_unb64_{buff_idx}.buff"
                                                try:
                                                    buff = b64decode(buff.split(";base64,")[1].strip())
                                                    temp_path = os.path.join(self.working_directory, temp_path_name)
                                                    f = open(temp_path, "wb")
                                                    f.write(buff)
                                                    f.close()
                                                    f_list.append(temp_path)
                                                except Exception:
                                                    self.log.error("Found 'data:;base64, ' buffer "
                                                                   "but failed to base64 decode.")
                                                    temp_path_name = None

                                            if temp_path_name is not None:
                                                buff_cond = f" and was resubmitted as {temp_path_name}"
                                            else:
                                                buff_cond = ""
                                            # buff is str for raw buffers but bytes after base64 decoding
                                            first_256 = buff[:256]
                                            if isinstance(first_256, str):
                                                first_256 = first_256.encode("utf-8", errors="replace")
                                            buff_res = ResultSection(
                                                f"A {len(buff)} bytes buffer was found in the JavaScript "
                                                f"block{buff_cond}. Here are the first 256 bytes.",
                                                parent=js_res, body=hexdump(first_256),
                                                body_format=BODY_FORMAT.MEMORY_DUMP)
                                            buff_res.set_heuristic(2)

                                processed_sc = []
                                sc_idx = 0
                                for sc in unescaped_bytes:
                                    if sc not in processed_sc:
                                        sc_idx += 1
                                        processed_sc.append(sc)

                                        # "hex" was a Python 2 codec; on Python 3 use bytes.fromhex()
                                        try:
                                            sc = bytes.fromhex(sc if isinstance(sc, str) else sc.decode())
                                        except Exception:
                                            pass

                                        shell_score = 500
                                        temp_path_name = f"obj{obj}_unescaped_{sc_idx}.buff"

                                        shell_res = ResultSection(f"Unknown unescaped {len(sc)} bytes JavaScript "
                                                                  f"buffer (id: {sc_idx}) was resubmitted as "
                                                                  f"{temp_path_name}. Here are the first 256 bytes.",
                                                                  parent=js_res)
                                        shell_res.set_body(hexdump(sc[:256]), body_format=BODY_FORMAT.MEMORY_DUMP)

                                        temp_path = os.path.join(self.working_directory, temp_path_name)
                                        f = open(temp_path, "wb")
                                        f.write(sc)
                                        f.close()
                                        f_list.append(temp_path)

                                        cur_res.add_tag('file.behavior', "Unescaped JavaScript Buffer")
                                        shell_res.set_heuristic(6)
                                        score_modifier += shell_score

                            if score_modifier > 0:
                                res_list.append(cur_res)

                        elif cur_obj.type == "stream":
                            if cur_obj.isEncodedStream and cur_obj.filter is not None:
                                data = cur_obj.decodedStream
                                encoding = cur_obj.filter.value.replace("[", "").replace("]", "").replace("/",
                                                                                                          "").strip()
                                val = cur_obj.rawValue
                                otype = cur_obj.elements.get("/Type", None)
                                sub_type = cur_obj.elements.get("/Subtype", None)
                                length = cur_obj.elements.get("/Length", None)

                            else:
                                data = cur_obj.rawStream
                                encoding = None
                                val = cur_obj.rawValue
                                otype = cur_obj.elements.get("/Type", None)
                                sub_type = cur_obj.elements.get("/Subtype", None)
                                length = cur_obj.elements.get("/Length", None)

                            if otype:
                                otype = otype.value.replace("/", "").lower()
                            if sub_type:
                                sub_type = sub_type.value.replace("/", "").lower()
                            if length:
                                length = length.value

                            if otype == "embeddedfile":
                                if len(data) > 4096:
                                    if encoding is not None:
                                        temp_encoding_str = f"_{encoding}"
                                    else:
                                        temp_encoding_str = ""

                                    cur_res = ResultSection(
                                        f'Embedded file found ({length} bytes) [obj: {obj} {version}] '
                                        f'and dumped for analysis {f"(Type: {otype}) " if otype is not None else ""}'
                                        f'{f"(SubType: {sub_type}) " if sub_type is not None else ""}'
                                        f'{f"(Encoded with {encoding})" if encoding is not None else ""}'
                                    )

                                    temp_path_name = f"EmbeddedFile_{obj}{temp_encoding_str}.obj"
                                    temp_path = os.path.join(self.working_directory, temp_path_name)
                                    f = open(temp_path, "wb")
                                    f.write(data)
                                    f.close()
                                    f_list.append(temp_path)

                                    cur_res.add_line(f"The EmbeddedFile object was saved as {temp_path_name}")
                                    res_list.append(cur_res)

                            elif otype not in BANNED_TYPES:
                                cur_res = ResultSection(
                                    f'Unknown stream found [obj: {obj} {version}] '
                                    f'{f"(Type: {otype}) " if otype is not None else ""}'
                                    f'{f"(SubType: {sub_type}) " if sub_type is not None else ""}'
                                    f'{f"(Encoded with {encoding})" if encoding is not None else ""}'
                                )
                                for line in val.splitlines():
                                    cur_res.add_line(line)

                                emb_res = ResultSection('First 256 bytes', parent=cur_res)
                                first_256 = data[:256]
                                if isinstance(first_256, str):
                                    first_256 = first_256.encode()
                                emb_res.set_body(hexdump(first_256), BODY_FORMAT.MEMORY_DUMP)
                                res_list.append(cur_res)
                        else:
                            pass

                file_res.add_section(res)

                for results in res_list:
                    file_res.add_section(results)

                if js_dump:
                    js_dump_res = ResultSection('Full JavaScript dump')

                    temp_js_dump = "javascript_dump.js"
                    temp_js_dump_path = os.path.join(self.working_directory, temp_js_dump)
                    # str.encode() raises UnicodeEncodeError (not UnicodeDecodeError), and the file
                    # is opened in binary mode, so encode with replacement characters instead
                    temp_js_dump_bin = "\n\n----\n\n".join(js_dump).encode("utf-8", errors="replace")
                    temp_js_dump_sha1 = hashlib.sha1(temp_js_dump_bin).hexdigest()
                    with open(temp_js_dump_path, "wb") as f:
                        f.write(temp_js_dump_bin)
                    f_list.append(temp_js_dump_path)

                    js_dump_res.add_line(f"The JavaScript dump was saved as {temp_js_dump}")
                    js_dump_res.add_line(f"The SHA-1 for the JavaScript dump is {temp_js_dump_sha1}")

                    js_dump_res.add_tag('file.pdf.javascript.sha1', temp_js_dump_sha1)
                    file_res.add_section(js_dump_res)

                for filename in f_list:
                    request.add_extracted(filename, os.path.basename(filename),
                                          f"Dumped from {os.path.basename(temp_filename)}")

            else:
                res = ResultSection("ERROR: Could not parse file with PeePDF.")
                file_res.add_section(res)
        finally:
            request.result = file_res
            try:
                del pdf_file
            except Exception:
                pass

            try:
                del pdf_parser
            except Exception:
                pass

            gc.collect()
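
The check_dangerous_func and get_big_buffs helpers drive the JavaScript scoring above but are not shown. A minimal sketch of plausible implementations; the regexes and the 256-character threshold are assumptions, not the service's actual values.

    def check_dangerous_func(self, js_code):
        # Flag the two functions the scoring logic cares about
        has_eval = re.search(r"\beval\s*\(", js_code) is not None
        has_unescape = re.search(r"\bunescape\s*\(", js_code) is not None
        return has_eval, has_unescape

    def get_big_buffs(self, js_code, min_size=256):
        # Return quoted string literals longer than min_size characters
        pattern = '["\']([^"\']{%d,})["\']' % min_size
        return [m.group(1) for m in re.finditer(pattern, js_code)]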
Example #6
    def test_init(mocker):
        from json import dumps
        from assemblyline_v4_service.common.result import BODY_FORMAT, ResultSection
        mocker.patch("assemblyline_v4_service.common.api.ServiceAPIError")
        from metadefender import AvHitSection
        av_name = "blah"
        virus_name = "blah"
        engine = {}
        heur_id = 1
        sig_score_rev_map = {}
        kw_score_rev_map = {}
        safelist_match = []
        actual_res_sec = AvHitSection(av_name, virus_name, engine, heur_id,
                                      sig_score_rev_map, kw_score_rev_map,
                                      safelist_match)
        correct_result_section = ResultSection(
            f"{av_name} identified the file as {virus_name}")
        correct_result_section.set_heuristic(1)
        correct_result_section.heuristic.add_signature_id(
            f"{av_name}.{virus_name}")
        correct_result_section.add_tag("av.virus_name", virus_name)
        correct_result_section.set_body(
            dumps({
                "av_name": av_name,
                "virus_name": virus_name,
                "scan_result": "infected",
                "engine_version": "unknown",
                "engine_definition_time": "unknown"
            }), BODY_FORMAT.KEY_VALUE)
        assert check_section_equality(actual_res_sec, correct_result_section)

        engine = {"version": "blah", "def_time": 1}
        heur_id = 2
        safelist_match = ["blah"]
        actual_res_sec = AvHitSection(av_name, virus_name, engine, heur_id,
                                      sig_score_rev_map, kw_score_rev_map,
                                      safelist_match)
        correct_result_section = ResultSection(
            f"{av_name} identified the file as {virus_name}")
        correct_result_section.add_tag("av.virus_name", virus_name)
        correct_result_section.set_heuristic(2)
        correct_result_section.heuristic.add_signature_id(
            f"{av_name}.{virus_name}", 0)
        correct_result_section.set_body(
            dumps({
                "av_name": av_name,
                "virus_name": virus_name,
                "scan_result": "suspicious",
                "engine_version": "blah",
                "engine_definition_time": 1
            }), BODY_FORMAT.KEY_VALUE)
        assert check_section_equality(actual_res_sec, correct_result_section)

        kw_score_rev_map = {"bla": 1}
        actual_res_sec = AvHitSection(av_name, virus_name, engine, heur_id,
                                      sig_score_rev_map, kw_score_rev_map,
                                      safelist_match)
        correct_result_section = ResultSection(
            f"{av_name} identified the file as {virus_name}")
        correct_result_section.add_tag("av.virus_name", virus_name)
        correct_result_section.set_heuristic(2)
        correct_result_section.heuristic.add_signature_id(
            f"{av_name}.{virus_name}", 1)
        correct_result_section.set_body(
            dumps({
                "av_name": av_name,
                "virus_name": virus_name,
                "scan_result": "suspicious",
                "engine_version": "blah",
                "engine_definition_time": 1
            }), BODY_FORMAT.KEY_VALUE)
        assert check_section_equality(actual_res_sec, correct_result_section)

        kw_score_rev_map = {"bla": 1, "h": 2}
        actual_res_sec = AvHitSection(av_name, virus_name, engine, heur_id,
                                      sig_score_rev_map, kw_score_rev_map,
                                      safelist_match)
        correct_result_section = ResultSection(
            f"{av_name} identified the file as {virus_name}")
        correct_result_section.add_tag("av.virus_name", virus_name)
        correct_result_section.set_heuristic(2)
        correct_result_section.heuristic.add_signature_id(
            f"{av_name}.{virus_name}", 2)
        correct_result_section.set_body(
            dumps({
                "av_name": av_name,
                "virus_name": virus_name,
                "scan_result": "suspicious",
                "engine_version": "blah",
                "engine_definition_time": 1
            }), BODY_FORMAT.KEY_VALUE)
        assert check_section_equality(actual_res_sec, correct_result_section)

        sig_score_rev_map = {f"{av_name}.{virus_name}": 10}
        actual_res_sec = AvHitSection(av_name, virus_name, engine, heur_id,
                                      sig_score_rev_map, kw_score_rev_map,
                                      safelist_match)
        correct_result_section = ResultSection(
            f"{av_name} identified the file as {virus_name}")
        correct_result_section.add_tag("av.virus_name", virus_name)
        correct_result_section.set_heuristic(2)
        correct_result_section.heuristic.add_signature_id(
            f"{av_name}.{virus_name}", 10)
        correct_result_section.set_body(
            dumps({
                "av_name": av_name,
                "virus_name": virus_name,
                "scan_result": "suspicious",
                "engine_version": "blah",
                "engine_definition_time": 1
            }), BODY_FORMAT.KEY_VALUE)
        assert check_section_equality(actual_res_sec, correct_result_section)
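
These assertions imply a precedence order for signature scoring: an exact match in sig_score_rev_map wins, then the highest-scoring keyword from kw_score_rev_map found in the virus name, then a score of 0 for safelisted names. Below is a minimal AvHitSection sketch consistent with the behaviour asserted above; the real metadefender implementation may differ in detail.

    class AvHitSection(ResultSection):
        def __init__(self, av_name, virus_name, engine, heur_id,
                     sig_score_rev_map, kw_score_rev_map, safelist_match):
            super().__init__(f"{av_name} identified the file as {virus_name}")
            self.add_tag("av.virus_name", virus_name)
            self.set_heuristic(heur_id)
            sig = f"{av_name}.{virus_name}"
            if sig in sig_score_rev_map:
                # Exact signature overrides take highest precedence
                self.heuristic.add_signature_id(sig, sig_score_rev_map[sig])
            elif any(kw in virus_name for kw in kw_score_rev_map):
                # Otherwise score by the highest-scoring matching keyword
                self.heuristic.add_signature_id(
                    sig, max(s for kw, s in kw_score_rev_map.items() if kw in virus_name))
            elif virus_name in safelist_match:
                self.heuristic.add_signature_id(sig, 0)
            else:
                self.heuristic.add_signature_id(sig)
            self.set_body(
                dumps({
                    "av_name": av_name,
                    "virus_name": virus_name,
                    "scan_result": "infected" if heur_id == 1 else "suspicious",
                    "engine_version": engine.get("version", "unknown"),
                    "engine_definition_time": engine.get("def_time", "unknown")
                }), BODY_FORMAT.KEY_VALUE)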
Example #7
    def analyze_pdf(self,
                    request,
                    res_txt,
                    path,
                    working_dir,
                    heur,
                    additional_keywords,
                    get_malform=True):
        """Extract metadata, keyword objects and content of interest from a PDF sample using PDFId, PDFId plugins,
        and PDF Parser.

        Args:
            request: AL request object.
            res_txt: Header string for AL result section title.
            path: Original PDF sample path.
            working_dir: AL working directory.
            heur: List of plugins to run on PDFId results (provided in service configuration).
            additional_keywords: List of additional keywords to be searched (provided in service configuration).
            get_malform: Extract malformed objects from PDF.

        Returns:
            AL result object, AL heuristics list to add to result, list of object streams (objstms), and an errors list.
        """
        triage_keywords = set()
        all_errors = set()
        embed_present = False
        objstms = False
        res = ResultSection(title_text=res_txt)
        carved_extracted_shas = set()

        if request.deep_scan:
            run_pdfparse = True
        else:
            run_pdfparse = False

        # Run PDFId
        try:
            pdfid_result, errors = self.get_pdfid([path], additional_keywords,
                                                  heur, request.deep_scan)
        except Exception as e:
            raise NonRecoverableError(e)
        # Parse PDFId results
        pdfidres = ResultSection(title_text="PDFID Results", parent=res)
        if len(pdfid_result) == 0:
            pdfidres.add_line(
                "No results generated for file. Please see errors.")
        else:
            # Do not run for objstms, which are being analyzed when get_malform == False
            if get_malform:
                version = pdfid_result.get("PDFID", None)
                if version:
                    pdfidres.add_line(version)
                properties = pdfid_result.get("Properties", None)
                if properties:
                    pres = ResultSection(title_text="PDF Properties",
                                         parent=pdfidres)
                    for k, v in properties.items():
                        pres.add_line(f"{k}: {v}")
                        if k == "/ModDate":
                            pres.add_tag('file.pdf.date.modified', v)
                        elif k == "/CreationDate":
                            pres.add_tag('file.date.creation', v)
                        elif k == "/LastModified":
                            pres.add_tag('file.date.last_modified', v)
                        elif k == "/SourceModified":
                            pres.add_tag('file.pdf.date.source_modified', v)
                        elif k == "/pdfx":
                            pres.add_tag('file.pdf.date.pdfx', v)
                entropy = pdfid_result.get("Entropy", None)
                if entropy:
                    enres = ResultSection(title_text="Entropy",
                                          parent=pdfidres)
                    for enlist in entropy:
                        enres.add_line(
                            f"{enlist[0]}: {enlist[1]}, ({enlist[2]})")
            flags = pdfid_result.get("Flags", None)
            if isinstance(flags, dict):
                fres = ResultSection(title_text="PDF Keyword Flags (Count)",
                                     parent=pdfidres)
                for k, v in flags.items():
                    if k == "/ObjStm":
                        objstms = True
                    # Filter out seemingly meaningless keywords
                    if ((not isinstance(v, dict) and int(v) > 1)
                            or isinstance(v, dict)) and len(k) > 2:
                        fres.add_line(f"{k}: {v}")
                        fres.add_tag('file.string.extracted',
                                     k.replace("/", "", 1))
                    if k in additional_keywords:
                        triage_keywords.add(k.replace("/", "", 1))

            plugin = pdfid_result.get("Plugin", [])

            # If any plugin results, or flagged keywords found, run PDF Parser
            if plugin or len(triage_keywords) > 0:
                run_pdfparse = True

            for pllist in plugin:
                pl_name, pl_heur, pl_text = pllist
                pl_heur = int(pl_heur)
                pl_text = pl_text[14:]
                if not pl_text or pl_text == "None":
                    continue

                if pl_name in ['EmbeddedFile', 'Name Obfuscation']:
                    modres = ResultSection(title_text=pl_text, parent=pdfidres)

                    if pl_heur > 0:
                        modres.set_heuristic(pl_heur)

                    if pl_name == 'EmbeddedFile':
                        embed_present = True

                elif pl_name in ['Triage', 'Suspicious Properties']:
                    javascript_found = False
                    for line in pl_text.splitlines():
                        lineres = ResultSection(title_text=line)
                        # Triage results
                        if '/JavaScript' in line:
                            triage_keywords.add('JavaScript')
                            if not javascript_found:
                                lineres.set_heuristic(19)
                                javascript_found = True
                        elif '/JS' in line:
                            triage_keywords.add('JS')
                            if not javascript_found:
                                lineres.set_heuristic(19)
                                javascript_found = True
                        elif '/JBIG2Decode' in line:
                            triage_keywords.add('JBIG2Decode')
                            lineres.set_heuristic(3)
                        elif '/Colors > 2^24' in line:
                            triage_keywords.add('Colors > 2^24')
                            lineres.set_heuristic(20)
                        elif '/AA' in line:
                            triage_keywords.add('AA')
                            lineres.set_heuristic(1)
                        elif '/Launch' in line:
                            triage_keywords.add('Launch')
                            lineres.set_heuristic(1)
                        elif '/OpenAction' in line:
                            triage_keywords.add('OpenAction')
                            lineres.set_heuristic(1)
                        elif '/GoToE' in line:
                            triage_keywords.add('GoToE')
                            lineres.set_heuristic(21)
                        elif '/GoToR' in line:
                            triage_keywords.add('GoToR')
                            lineres.set_heuristic(22)
                        elif '/Encrypt' in line:
                            triage_keywords.add('Encrypt')
                            lineres.set_heuristic(11)
                        elif '/AcroForm' in line:
                            triage_keywords.add('AcroForm')
                            lineres.set_heuristic(4)
                        elif '/RichMedia' in line:
                            triage_keywords.add('RichMedia')
                            lineres.set_heuristic(5)
                        elif '/XFA' in line:
                            triage_keywords.add('XFA')
                            lineres.set_heuristic(23)
                        elif '/Annot' in line:
                            triage_keywords.add('Annot')
                            lineres.set_heuristic(25)
                        elif '/ObjStm' in line:
                            triage_keywords.add('ObjStm')
                            lineres.set_heuristic(7)
                        elif '/URI' in line:
                            triage_keywords.add('URI')
                            lineres.set_heuristic(24)

                        # Suspicious properties results
                        elif "eof2" in line:
                            lineres.set_heuristic(2)
                        elif "eof5" in line:
                            lineres.set_heuristic(17)
                        elif "page" in line:
                            lineres.set_heuristic(26)
                        elif "entropy" in line:
                            lineres.set_heuristic(12)
                        elif "obj/endobj" in line:
                            lineres.set_heuristic(13)
                        elif "stream/endstream" in line:
                            lineres.set_heuristic(14)

                        if lineres.heuristic is not None:
                            pdfidres.add_subsection(lineres)

        for e in errors:
            all_errors.add(e)
            if e.startswith('Error running plugin'):
                self.log.warning(e)

        if run_pdfparse:
            # CALL PDF parser and extract further information
            pdf_parserres = ResultSection(title_text="PDF Parser Results")
            # STATISTICS
            # Do not run for objstms, which are being analyzed when get_malform == False
            if get_malform:
                options = {
                    "stats": True,
                }
                pdf_parser_result, errors = self.get_pdf_parser(
                    path, working_dir, options)

                if pdf_parser_result is not None:
                    if len(pdf_parser_result) == 0:
                        pdf_parserres.add_line(
                            "No statistical results generated for file. Please see errors."
                        )
                    else:
                        version = pdf_parser_result.get("version", None)
                        if version and version[0] != '0':
                            pdf_parserres.add_line(version[0])
                        stats = pdf_parser_result.get("stats", None)
                        if stats:
                            sres = ResultSection(
                                title_text="PDF Statistcs",
                                parent=pdf_parserres,
                                body_format=BODY_FORMAT.MEMORY_DUMP)
                            for p in stats:
                                sres.add_line(p)
                    for e in errors:
                        all_errors.add(e)

            # Triage plugin -- search sample for keywords and carve content or extract object (if it contains a stream)
            carved_content = {}  # Format: {objnum: [{keyword: content}]}
            obj_extract_triage = set()
            jbig_objs = set()

            for keyword in triage_keywords:
                # ObjStms handled differently
                if keyword == 'ObjStm':
                    continue

                options = {
                    "search": keyword,
                }
                pdf_parser_result, errors = self.get_pdf_parser(
                    path, working_dir, options)

                if pdf_parser_result:
                    for p in pdf_parser_result['parts']:
                        content = ""
                        references = []
                        # The trailer is always extracted; try to grab all references anyway -- this will be messy
                        if p.startswith("trailer:"):
                            # Grab the content after the keyword
                            # Check that keyword actually in content
                            if f"/{keyword}" in p:
                                try:
                                    content = p.split(keyword, 1)[1].replace(
                                        '>>++>>', '').split("/", 1)[0].strip()
                                    references = re.findall(
                                        "[0-9]* [0-9]* R", content)
                                except Exception:
                                    continue
                        # If not trailer, should be object
                        elif 'Referencing:' in p:
                            # Grab the content after the keyword
                            if '>>++>>' in p:
                                try:
                                    content = p.split(keyword, 1)[1].replace(
                                        '>>++>>', '').strip()
                                except Exception:
                                    try:
                                        content = p.split("\n", 3)[3]
                                    except Exception:
                                        content = p
                            else:
                                try:
                                    content = p.split("\n", 3)[3]
                                except Exception:
                                    content = p
                            # Sometimes the content is the keyword itself followed by references (e.g. "/URI /URI 10 0 R")
                            if content.startswith(f"/{keyword}"):
                                try:
                                    content = re.sub(f"/{keyword}[ ]*", "",
                                                     content, 1)
                                except Exception:
                                    pass
                            try:
                                references = p.split("\n", 3)[2].replace(
                                    'Referencing:', '').strip().split(", ")
                            except Exception:
                                pass
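                        # For orientation, the parsing above assumes each pdf-parser
                        # "part" for an object looks roughly like (sample values only):
                        #   obj 12 0
                        #    Type: /Action
                        #    Referencing: 13 0 R, 14 0 R
                        #    Contains stream / raw dictionary ('>>++>>' marks the dict end)
                        # while trailer parts begin with "trailer:".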
                        # Only extract JBIG2Decode objects with deep scan, but always report on their presence
                        if keyword == "JBIG2Decode" and "/Filter" in p and "Contains stream" in p:
                            try:
                                objnum = p.split("\n", 1)[0].split(" ")[1]
                                if request.deep_scan:
                                    obj_extract_triage.add(objnum)
                                jbig_objs.add(objnum)
                                continue
                            except Exception as e:
                                self.log.debug(e)
                                continue
                        # If no content, then keyword likely points to reference objects, so grab those
                        if content == '':
                            if len(references) > 0:
                                content = references
                            else:
                                # Something is wrong, drop it.
                                continue
                        else:
                            while True:
                                # Multiple references might be in a list, e.g. /Annot # # R vs. /Annots [# # R # # R]
                                islist = re.match(
                                    r"[s]?[ ]?\[([0-9]* [0-9]* R[ \\rn]{0,8})*\]",
                                    content)
                                if islist:
                                    content = re.sub(
                                        r"[\[\]]", "",
                                        islist.group(0).replace(
                                            "s ", '').replace("R ",
                                                              "R,")).split(",")
                                    break
                                # References might come with instructions, e.g. [# # R /FitH null]
                                withinst = re.match(
                                    r"[s]?[ \\']{0,3}\[[ ]?([0-9]* [0-9]* R)[ \\rn]{1,8}"
                                    r"[/a-zA-Z0-9 ]*[ ]?\]", content)
                                if withinst:
                                    content = [withinst.group(1)]
                                    break
                                content = [content]
                                break
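                        # Shapes normalized by the loop above (sample values are assumptions):
                        #   "s [12 0 R 13 0 R]"    -> ["12 0 R", "13 0 R"]      (reference list)
                        #   "[12 0 R /FitH null]"  -> ["12 0 R"]                (reference + instructions)
                        #   "http://example.com/x" -> ["http://example.com/x"]  (anything else)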
                        for c in content:
                            # If the keyword is JavaScript and the content starts with '/JS',
                            # skip it: the 'JS' keyword will extract the same content
                            if "JS" in triage_keywords and keyword == "JavaScript" and "/JS" in c[
                                    0:5]:
                                continue
                            if c in references or re.match(
                                    "[0-9]* [0-9]* R", c):
                                try:
                                    ref_obj = c.split(" ", 1)[0]
                                    options = {
                                        "object": ref_obj,
                                        "get_object_detail": True
                                    }
                                    pdf_parser_subresult, err = self.get_pdf_parser(
                                        path, working_dir, options)

                                    if pdf_parser_subresult:
                                        for sub_p in pdf_parser_subresult[
                                                'parts']:
                                            sub_references = sub_p.split("\n", 3)[2].replace('Referencing:', '')\
                                                .strip().split(", ")
                                            ptyp = sub_p.split(
                                                "\n", 2)[1].replace(
                                                    'Type:',
                                                    '').strip().replace(
                                                        "/", "")
                                            # If the object contains a stream, extract the object.
                                            if "Contains stream" in sub_p:
                                                try:
                                                    objnum = sub_p.split(
                                                        "\n",
                                                        1)[0].split(" ")[1]
                                                    obj_extract_triage.add(
                                                        objnum)
                                                except Exception:
                                                    pass
                                            # Or if the object Type is the keyword, grab all referenced objects.
                                            # split(", ") always returns at least
                                            # one element, so indexing [0] is safe
                                            elif sub_references[0] != '' and ptyp == keyword:
                                                for sr in sub_references:
                                                    try:
                                                        objnum = sr.split(
                                                            " ", 1)[0]
                                                        obj_extract_triage.add(
                                                            objnum)
                                                    except Exception:
                                                        pass
                                            # If not, extract object detail in to carved output
                                            elif pdf_parser_subresult[
                                                    'obj_details'] != "":
                                                try:
                                                    objnum = sub_p.split(
                                                        "\n",
                                                        1)[0].split(" ")[1]
                                                    if objnum in carved_content:
                                                        carved_content[objnum]\
                                                            .append({keyword: pdf_parser_subresult['obj_details']})
                                                    else:
                                                        carved_content[objnum] = \
                                                            [{keyword: pdf_parser_subresult['obj_details']}]
                                                except Exception:
                                                    continue

                                    for e in err:
                                        errors.add(e)
                                except Exception:
                                    # If none of that works, just extract the original object for examination.
                                    try:
                                        objnum = p.split("\n",
                                                         1)[0].split(" ")[1]
                                        obj_extract_triage.add(objnum)
                                    except Exception:
                                        pass
                            # If content does not look like a reference:
                            else:
                                if p.startswith("trailer:"):
                                    continue
                                objnum = p.split("\n", 1)[0].split(" ")[1]
                                # If the object contains a stream extract the object
                                if p.split("\n", 4)[3] == "Contains stream":
                                    obj_extract_triage.add(objnum)
                                else:
                                    # Or just carve the content
                                    if objnum in carved_content:
                                        carved_content[objnum].append(
                                            {keyword: c})
                                    else:
                                        carved_content[objnum] = [{keyword: c}]

                    for e in errors:
                        all_errors.add(e)

            # Add carved content to result output
            show_content_of_interest = False
            if len(carved_content) > 0 or len(jbig_objs) > 0:
                carres = ResultSection(title_text="Content of Interest")
            else:
                carres = None

            if len(jbig_objs) > 0:
                jbigres = ResultSection(
                    title_text=
                    "The following Object IDs are JBIG2DECODE streams:",
                    body_format=BODY_FORMAT.MEMORY_DUMP,
                    parent=carres)
                jbigres.add_line(', '.join(map(str, jbig_objs)))
                show_content_of_interest = True

            if len(carved_content) > 0:
                carved_obj_size_limit = request.get_param(
                    'carved_obj_size_limit')
                for k, l in sorted(carved_content.items()):
                    for d in l:
                        for keyw, con in d.items():
                            subres = ResultSection(
                                title_text=
                                f"Object {k}: Hits for Keyword '{keyw}':")
                            subres.set_heuristic(8)

                            con_bytes = con.encode()
                            if len(con_bytes) < carved_obj_size_limit:
                                subres.set_body(con, BODY_FORMAT.MEMORY_DUMP)

                                # Check for IOC content
                                patterns = PatternMatch()
                                st_value = patterns.ioc_match(con_bytes,
                                                              bogon_ip=True)
                                if len(st_value) > 0:
                                    carres.add_subsection(subres)
                                    show_content_of_interest = True
                                    for ty, val in st_value.items():
                                        for v in val:
                                            subres.add_tag(ty, v)
                            else:
                                crv_sha = hashlib.sha256(con_bytes).hexdigest()
                                is_supplementary = keyw in ['URI']
                                extraction_purpose = "as supplementary file" if is_supplementary else "for analysis"

                                if crv_sha not in carved_extracted_shas:
                                    f_name = f"carved_content_obj_{k}_{crv_sha[0:7]}"
                                    subres.add_lines([
                                        f"Content over {carved_obj_size_limit} bytes it will be extracted {extraction_purpose}",
                                        f"Name: {f_name} - SHA256: {crv_sha}"
                                    ])
                                    carres.add_subsection(subres)
                                    show_content_of_interest = True
                                    crvf = os.path.join(
                                        self.working_directory, f_name)
                                    with open(crvf, 'wb') as f:
                                        f.write(con_bytes)
                                    try:
                                        if is_supplementary:
                                            # Add as supplementary
                                            request.add_supplementary(
                                                crvf, os.path.basename(crvf),
                                                f"Supplementary content from object {k}"
                                            )
                                        else:
                                            request.add_extracted(
                                                crvf,
                                                os.path.basename(crvf),
                                                f"Extracted content from object {k}",
                                                safelist_interface=self.
                                                api_interface)
                                    except MaxExtractedExceeded:
                                        pass
                                    carved_extracted_shas.add(crv_sha)

            if show_content_of_interest:
                pdf_parserres.add_subsection(carres)

            # ELEMENTS
            # Do not show for objstms
            if get_malform:
                if request.deep_scan:
                    options = {
                        "verbose": True,
                        "nocanonicalizedoutput": True,
                        "get_malform": get_malform
                    }
                elif embed_present:
                    options = {
                        "verbose": True,
                        "elements": "ctsi",
                        "type": "/EmbeddedFile",
                        "get_malform": get_malform
                    }
                else:
                    options = {
                        "verbose": True,
                        "elements": "cst",
                        "get_malform": get_malform
                    }
                pdf_parser_result, errors = self.get_pdf_parser(
                    path, working_dir, options)

                embed_extracted = set()
                # As above, check against None so the empty-result message can fire
                if pdf_parser_result is not None:
                    if len(pdf_parser_result) == 0:
                        pdf_parserres.add_line(
                            "No structure information generated for file. Please see errors."
                        )
                    else:
                        # PDF Parser will write any malformed content over 100 bytes to a file
                        files = pdf_parser_result.get("files", None)
                        if files:
                            for f, l in files.items():
                                if f == 'malformed':
                                    if len(l) > 0:
                                        pdf_parserres.set_heuristic(6)
                                    for i in l:
                                        try:
                                            request.add_extracted(
                                                i,
                                                os.path.basename(i),
                                                "Extracted malformed content in PDF Parser Analysis.",
                                                safelist_interface=self.
                                                api_interface)
                                        except MaxExtractedExceeded:
                                            break

                        parts = pdf_parser_result.get("parts", None)
                        # The Extract service will pull out the sample's embedded files;
                        # note them here so they are not extracted again below
                        if parts:
                            for p in sorted(parts):
                                if "Type: /EmbeddedFile" in p:
                                    getobj = p.split("\n", 1)[0].split(" ")[1]
                                    embed_extracted.add(getobj)

                # Extract objects collected from above analysis
                obj_to_extract = obj_extract_triage - embed_extracted - jbig_objs

                if len(obj_to_extract) > 0:
                    options = {
                        "filter": True,
                        "object": obj_to_extract,
                        "dump": "extracted_obj_",
                    }
                    pdf_parser_result, errors = self.get_pdf_parser(
                        path, working_dir, options)

                    if pdf_parser_result:
                        files = pdf_parser_result.get("files", None)
                        extracted_files = []
                        if files:
                            for f, l in files.items():
                                if f == 'embedded':
                                    for i in l:
                                        f_name = os.path.basename(i)
                                        obj_id = f_name.replace(
                                            "extracted_obj_", "")
                                        try:
                                            if request.add_extracted(
                                                    i,
                                                    f_name,
                                                    f"Object {obj_id} extracted in PDF Parser Analysis.",
                                                    safelist_interface=self.
                                                    api_interface):
                                                extracted_files.append(
                                                    f"Extracted object {obj_id} as {f_name}"
                                                )
                                        except MaxExtractedExceeded:
                                            break
                        for e in errors:
                            all_errors.add(e)

                        if extracted_files:
                            extract_res = ResultSection(
                                title_text="Extracted embedded objects",
                                parent=pdf_parserres)
                            extract_res.set_heuristic(9)
                            extract_res.add_lines(extracted_files)

                # Extract jbig2decode objects in deep scan mode
                if request.deep_scan and len(jbig_objs) > 0:
                    options = {
                        "object": jbig_objs,
                        "dump": "extracted_jb_obj_",
                    }
                    pdf_parser_result, errors = self.get_pdf_parser(
                        path, working_dir, options)

                    if pdf_parser_result:
                        extracted_jb = []
                        files = pdf_parser_result.get("files", None)
                        if files:
                            for f, l in files.items():
                                if f == 'embedded':
                                    for i in l:
                                        f_name = os.path.basename(i)
                                        obj_id = f_name.replace(
                                            "extracted_jb_obj_", "")
                                        try:
                                            if request.add_extracted(
                                                    i,
                                                    f_name,
                                                    f"JBIG2DECODE object {obj_id} extracted in PDF Parser Analysis.",
                                                    safelist_interface=self.
                                                    api_interface):
                                                extracted_jb.append(
                                                    f"JBIG2DECODE object {obj_id} extracted as {f_name}"
                                                )
                                        except MaxExtractedExceeded:
                                            break

                        for e in errors:
                            all_errors.add(e)

                        if extracted_jb:
                            jbig_extract_res = ResultSection(
                                title_text="Extracted JBIG2Decode objects",
                                parent=pdf_parserres)
                            jbig_extract_res.set_heuristic(9)
                            jbig_extract_res.add_lines(extracted_jb)

            if len(pdf_parserres.subsections) > 0:
                res.add_subsection(pdf_parserres)

        return res, objstms, all_errors
Example #8
0
    def _add_resultinfo_for_match(self, result: Result, match):
        """
        Parse a Yara signature match and add information to the overall AL service result. This method determines
        the result score and identifies any AL tags that should be added (e.g. IMPLANT_NAME, THREAT_ACTOR, etc.).

        Args:
            result: AL ResultSection object.
            match: Yara rules Match object item.

        Returns:
            None.
        """
        almeta = YaraMetadata(match)
        self._normalize_metadata(almeta)

        section = ResultSection('', classification=almeta.classification)
        if self.deep_scan or almeta.al_status != "NOISY":
            section.set_heuristic(self.YARA_HEURISTICS_MAP.get(
                almeta.category, 1),
                                  signature=f'{match.namespace}.{match.rule}',
                                  attack_id=almeta.mitre_att)
        section.add_tag(f'file.rule.{self.name.lower()}',
                        f'{match.namespace}.{match.rule}')

        title_elements = [
            f"[{match.namespace}] {match.rule}",
        ]

        if almeta.actor_type:
            section.add_tag('attribution.actor', almeta.actor_type)

        for tag in almeta.tags:
            section.add_tag(tag['type'], tag['value'])

        # Malware Tags
        implant_title_elements = []
        for (implant_name, implant_family) in almeta.malwares:
            if implant_name:
                implant_title_elements.append(implant_name)
                section.add_tag('attribution.implant', implant_name)
            if implant_family:
                implant_title_elements.append(implant_family)
                section.add_tag('attribution.family', implant_family)
        if implant_title_elements:
            title_elements.append(
                f"- Implant(s): {', '.join(implant_title_elements)}")

        # Threat Actor metadata
        for actor in almeta.actors:
            title_elements.append(actor)
            section.add_tag('attribution.actor', actor)

        # Exploit / CVE metadata
        if almeta.exploits:
            title_elements.append(
                f"- Exploit(s): {', '.join(almeta.exploits)}")
        for exploit in almeta.exploits:
            section.add_tag('attribution.exploit', exploit)

        # Include technique descriptions in the section behavior
        for (category, name) in almeta.techniques:
            descriptor = self.TECHNIQUE_DESCRIPTORS.get(category, None)
            if descriptor:
                technique_type, technique_description = descriptor
                section.add_tag(technique_type, name)
                almeta.behavior.add(technique_description)

        for (category, name) in almeta.infos:
            descriptor = self.INFO_DESCRIPTORS.get(category, None)
            if descriptor:
                info_type, info_description = descriptor
                section.add_tag(info_type, name)
                almeta.behavior.add(info_description)

        # Summaries
        if almeta.behavior:
            title_elements.append(f"- Behavior: {', '.join(almeta.behavior)}")
        for element in almeta.behavior:
            section.add_tag('file.behavior', element)

        title = " ".join(title_elements)
        section.title_text = title

        json_body = dict(name=match.rule)

        for item in [
                'id', 'version', 'author', 'description', 'source', 'malware',
                'info', 'technique', 'tool', 'exploit', 'actor', 'category',
                'mitre_att'
        ]:
            val = almeta.__dict__.get(item, None)
            if val:
                json_body[item] = val

        string_match_data = self._add_string_match_data(match)
        if string_match_data:
            json_body['string_hits'] = string_match_data

        section.set_body(json.dumps(json_body),
                         body_format=BODY_FORMAT.KEY_VALUE)

        result.add_section(section)
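
A minimal driver sketch for the method above. The names `service` (an instance of
this service class) and `sample_bytes` are hypothetical, and the rule source is
illustrative; only the yara-python calls (yara.compile, rules.match) are real API:

    import yara
    from assemblyline_v4_service.common.result import Result

    rules = yara.compile(source='rule demo { strings: $mz = "MZ" condition: $mz }')
    result = Result()
    for match in rules.match(data=sample_bytes):  # one Match object per rule hit
        service._add_resultinfo_for_match(result, match)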
Example #9
0
    def section_builder(self, parser, field_dict, result, parsertype="MWCP"):
        json_body = {}
        malware_name = ''
        malware_types = []
        mitre_group = ''
        mitre_att = ''
        category = 'malware'
        # get malware names from parser objects
        if parsertype == "RATDecoder":
            malware_name = parser
        if parsertype == "MWCP":
            for name, obj in self.file_parsers.items():
                if parser in obj.parser_list:
                    malware_name = obj.malware
                    malware_types = obj.malware_types
                    mitre_att = obj.mitre_att
                    mitre_group = obj.mitre_group
                    category = obj.category
                    for item in [
                            'classification', 'mitre_group', 'mitre_att',
                            'malware', 'malware_types', 'category'
                    ]:
                        val = getattr(obj, item, None)
                        if val:
                            json_body[item] = val
                    break
        parser_section = ResultSection(f"{parsertype} : {parser}")

        parser_section = classification_checker(parser_section, parser,
                                                self.file_parsers)
        if len(field_dict) > 0:  # if any decoder output exists, raise a heuristic
            parser_section.set_body(json.dumps(json_body),
                                    body_format=BODY_FORMAT.KEY_VALUE)
            parser_section.set_heuristic(HEURISTICS_MAP.get(category, 1),
                                         attack_id=mitre_att)
            parser_section.add_tag("source", parsertype)

            if malware_name:
                parser_section.add_tag('attribution.implant',
                                       malware_name.upper())
            if mitre_group:
                parser_section.add_tag('attribution.actor',
                                       mitre_group.upper())
            for malware_type in malware_types:
                parser_section.add_tag('attribution.family',
                                       malware_type.upper())
        # Create subsections and attach them to the main parser_section
        subsection_builder(parser_section, field_dict)

        other_key = "other"
        if other_key in field_dict:
            other_content = field_dict[other_key]
            other_section = ResultSection(f"Other metadata found",
                                          body_format=BODY_FORMAT.KEY_VALUE,
                                          body=json.dumps(other_content))
            parser_section.add_subsection(other_section)

        for field in field_dict:
            if field != other_key and field not in FIELD_TAG_MAP:
                self.log.debug(f"{field} does not exist in FIELD_TAG_MAP")
        result.add_section(parser_section)
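
For context, a sketch of the lookup tables the method above assumes; the entries
shown are illustrative assumptions, not the service's real maps:

    # category -> Assemblyline heuristic ID (1 is the fallback used above)
    HEURISTICS_MAP = {"malware": 1, "banker": 2, "ransomware": 5}
    # parser output field -> AL tag type, as consumed by subsection_builder
    FIELD_TAG_MAP = {"address": "network.dynamic.ip", "c2_url": "network.dynamic.uri"}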
Example #10
0
    def execute(self, request):
        file_path = request.file_path
        result = Result()

        # Report the version of suricata as the service context
        request.set_service_context(
            f"Suricata version: {self.get_suricata_version()}")

        # restart Suricata if we need to
        self.start_suricata_if_necessary()

        # Strip frame headers from the PCAP, since Suricata sometimes has trouble parsing strange PCAPs
        stripped_filepath = self.strip_frame_headers(file_path)

        # Check to make sure the size of the stripped file isn't 0 - this happens on pcapng files
        # TODO: there's probably a better way to do this - don't even strip it if it's pcapng
        if os.stat(stripped_filepath).st_size == 0:
            stripped_filepath = file_path

        # Switch stdout and stderr so we don't get our logs polluted
        mystdout = StringIO()
        old_stdout = sys.stdout
        sys.stdout = mystdout

        mystderr = StringIO()
        old_stderr = sys.stderr
        sys.stderr = mystderr

        # Pass the pcap file to Suricata via the socket
        ret = self.suricata_sc.send_command(
            "pcap-file", {
                "filename": stripped_filepath,
                "output-dir": self.working_directory
            })

        if not ret or ret["return"] != "OK":
            self.log.exception(
                f"Failed to submit PCAP for processing: {ret['message']}")

        # Wait for the socket to finish processing our PCAP
        while True:
            time.sleep(1)
            try:
                ret = self.suricata_sc.send_command("pcap-current")
                if ret and ret["message"] == "None":
                    break
            except ConnectionResetError as e:
                raise RecoverableError(e)

        # Bring back stdout and stderr
        sys.stdout = old_stdout
        sys.stderr = old_stderr
        # NOTE: for now we will ignore content of mystdout and mystderr but we have them just in case...
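        # A tidier alternative (a sketch, not what runs here) is the stdlib
        # context managers, which restore the real streams even on exceptions:
        #   from contextlib import redirect_stdout, redirect_stderr
        #   with redirect_stdout(mystdout), redirect_stderr(mystderr):
        #       ...  # send pcap-file and poll pcap-current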

        alerts, signatures, domains, ips, urls, email_addresses, tls_dict, extracted_files, reverse_lookup = self.parse_suricata_output(
        ).values()
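        # Note: the unpacking above relies on parse_suricata_output() returning a
        # dict whose keys are in exactly this order (insertion order is guaranteed
        # on Python 3.7+); an assumed shape:
        #   {"alerts": {...}, "signatures": {...}, "domains": [...], "ips": [...], ...}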

        file_extracted_section = ResultSection("File(s) extracted by Suricata")
        # Parse the json results of the service
        if request.get_param("extract_files"):
            for file in extracted_files:
                sha256, filename, extracted_file_path = file.values()
                self.log.info(f"extracted file {filename}")
                try:
                    if request.add_extracted(
                            extracted_file_path,
                            filename,
                            "Extracted by Suricata",
                            safelist_interface=self.api_interface):
                        file_extracted_section.add_line(filename)
                        if filename != sha256:
                            file_extracted_section.add_tag(
                                'file.name.extracted', filename)
                except FileNotFoundError as e:
                    # An intermittent issue, just try again
                    raise RecoverableError(e)
                except MaxExtractedExceeded:
                    # We've hit our limit
                    pass

        # Report a null score to indicate that files were extracted; if no signatures
        # hit, it's not clear where the extracted files came from
        if file_extracted_section.body:
            result.add_section(file_extracted_section)

        # Add tags for the domains, urls, and IPs we've discovered
        root_section = ResultSection("Discovered IOCs", parent=result)
        if domains:
            domain_section = ResultSection("Domains", parent=root_section)
            for domain in domains:
                domain_section.add_line(domain)
                domain_section.add_tag('network.dynamic.domain', domain)
        if ips:
            ip_section = ResultSection("IP Addresses", parent=root_section)
            for ip in ips:
                # Make sure it's not a local IP
                if not (ip.startswith("127.") or ip.startswith("192.168.")
                        or ip.startswith("10.") or
                        (ip.startswith("172.")
                         and 16 <= int(ip.split(".")[1]) <= 31)):
                    ip_section.add_line(ip)
                    ip_section.add_tag('network.dynamic.ip', ip)
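                # The prefix checks above can be sketched equivalently with the
                # stdlib ipaddress module, which also covers loopback and other
                # reserved ranges:
                #   import ipaddress
                #   if not ipaddress.ip_address(ip).is_private:
                #       ip_section.add_line(ip)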

        if urls:
            url_section = ResultSection("URLs", parent=root_section)
            for url in urls:
                url_section.add_line(url)
                url_section.add_tag('network.dynamic.uri', url)
        if email_addresses:
            email_section = ResultSection("Email Addresses",
                                          parent=root_section)
            for eml in email_addresses:
                email_section.add_line(eml)
                email_section.add_tag('network.email.address', eml)

        # Map between suricata key names and AL tag types
        tls_mappings = {
            "subject": 'cert.subject',
            "issuerdn": 'cert.issuer',
            "version": 'cert.version',
            "notbefore": 'cert.valid.start',
            "notafter": 'cert.valid.end',
            "fingerprint": 'cert.thumbprint',
            "sni": 'network.tls.sni'
        }

        if tls_dict:
            tls_section = ResultSection("TLS Information",
                                        parent=root_section,
                                        body_format=BODY_FORMAT.JSON)
            kv_body = {}
            for tls_type, tls_values in tls_dict.items():
                if tls_type == "fingerprint":
                    # make sure the cert fingerprint/thumbprint matches other values,
                    # like from PEFile
                    tls_values = [
                        v.replace(":", "").lower() for v in tls_values
                    ]

                if tls_type in tls_mappings:
                    kv_body[tls_type] = tls_values

                    tag_type = tls_mappings[tls_type]
                    if tag_type is not None:
                        for tls_value in tls_values:
                            tls_section.add_tag(tag_type, tls_value)

                elif tls_type == "ja3":
                    kv_body.setdefault('ja3_hash', [])
                    kv_body.setdefault('ja3_string', [])

                    for ja3_entry in tls_values:
                        ja3_hash = ja3_entry.get("hash")
                        ja3_string = ja3_entry.get("string")
                        if ja3_hash:
                            kv_body['ja3_hash'].append(ja3_hash)
                            tls_section.add_tag('network.tls.ja3_hash',
                                                ja3_hash)
                        if ja3_string:
                            kv_body['ja3_string'].append(ja3_string)
                            tls_section.add_tag('network.tls.ja3_string',
                                                ja3_string)

                else:
                    kv_body[tls_type] = tls_values
                    # log a message about a new TLS type found in the Suricata logs
                    self.log.info(
                        f"Found new TLS type {tls_type} with values {tls_values}"
                    )
            tls_section.set_body(json.dumps(kv_body))

        # Create the result sections if there are any hits
        if len(alerts) > 0:
            for signature_id, signature_details in signatures.items():
                signature = signature_details['signature']
                attributes = signature_details['attributes']
                section = ResultSection(f'{signature_id}: {signature}')
                heur_id = 3
                if any(x in signature for x in self.config.get("sure_score")):
                    heur_id = 1
                elif any(x in signature
                         for x in self.config.get("vhigh_score")):
                    heur_id = 2

                section.set_heuristic(heur_id)
                if signature_details['al_signature']:
                    section.add_tag("file.rule.suricata",
                                    signature_details['al_signature'])
                for timestamp, src_ip, src_port, dest_ip, dest_port in alerts[
                        signature_id][:10]:
                    section.add_line(
                        f"{timestamp} {src_ip}:{src_port} -> {dest_ip}:{dest_port}"
                    )
                if len(alerts[signature_id]) > 10:
                    section.add_line(
                        f'And {len(alerts[signature_id]) - 10} more flows')

                # Tag IPs/Domains/URIs associated to signature
                for flow in alerts[signature_id]:
                    dest_ip = flow[3]
                    section.add_tag('network.dynamic.ip', dest_ip)
                    if dest_ip in reverse_lookup:
                        section.add_tag('network.dynamic.domain',
                                        reverse_lookup[dest_ip])
                    for uri in urls:
                        if dest_ip in uri or (reverse_lookup.get(dest_ip)
                                              and reverse_lookup[dest_ip] in uri):
                            section.add_tag('network.dynamic.uri', uri)

                # Add a tag for the signature id and the message
                section.add_tag('network.signature.signature_id',
                                str(signature_id))
                section.add_tag('network.signature.message', signature)
                for attr in attributes:
                    if attr.get('uri'):
                        section.add_tag('network.static.uri', attr['uri'])
                # Tag malware_family
                for malware_family in signature_details['malware_family']:
                    section.add_tag('attribution.family', malware_family)

                result.add_section(section)
                self.ontology.add_result_part(
                    Signature,
                    data=dict(
                        name=signature_details['al_signature'],
                        type="SURICATA",
                        malware_families=signature_details['malware_family']
                        or None,
                        attributes=attributes))

            # Add the original Suricata output as a supplementary file in the result
            request.add_supplementary(
                os.path.join(self.working_directory, 'eve.json'),
                'SuricataEventLog.json', 'json')

        # Add the stats.log to the result, which can be used to determine service success
        if os.path.exists(os.path.join(self.working_directory, 'stats.log')):
            request.add_supplementary(
                os.path.join(self.working_directory, 'stats.log'), 'stats.log',
                'log')

        request.result = result