Example #1
1
def _parse_email_window(text: str, position: int, before: int, after: int) -> str:
    """Return ``text[position - before:position + after]`` with the start clamped to 0.

    A plain slice with a negative start counts from the END of the string, so a
    hit located closer than ``before`` characters to the beginning of ``text``
    would produce a wrong (usually empty) window. Clamping keeps the window
    anchored at the start of the text instead.
    """
    return text[max(0, position - before):position + after]


def _parse_email_scan_body(body: str, white_ips: typing.Any) -> typing.Tuple[
        typing.Set[str], typing.Set[str], typing.Set[str], typing.Set[str]]:
    """Collect the unique URLs, e-mail addresses, domains and IPs found in ``body``.

    Args:
      body (str): Decoded text of one body part.
      white_ips: Container of IP strings that must not be reported.

    Returns:
      tuple: ``(urls, emails, domains, ips)`` as sets. E-mail addresses, domains
             and IPv6 addresses are lower-cased; URLs and IPv4 addresses are
             kept as matched.
    """
    urls = set()  # type: typing.Set[str]
    emails = set()  # type: typing.Set[str]
    domains = set()  # type: typing.Set[str]
    ips = set()  # type: typing.Set[str]

    def add_emails(text: str) -> None:
        emails.update(match.lower() for match in eml_parser.regex.email_regex.findall(text))

    def add_domains(text: str) -> None:
        domains.update(match.lower() for match in eml_parser.regex.dom_regex.findall(text))

    def add_ipv4(text: str) -> None:
        for match in eml_parser.regex.ipv4_regex.findall(text):
            if not eml_parser.regex.priv_ip_regex.match(match) and match not in white_ips:
                ips.add(match)

    def add_ipv6(text: str) -> None:
        for match in eml_parser.regex.ipv6_regex.findall(text):
            if not eml_parser.regex.priv_ip_regex.match(match) and match.lower() not in white_ips:
                ips.add(match.lower())

    # Running findall() directly on a 500K+ body causes time and memory issues.
    # Above 4K we only run the regexes on small windows cut around the markers
    # we are interested in ("://", "@", ".", ":").
    if len(body) < 4096:
        urls.update(get_uri_ondata(body))
        add_emails(body)
        add_domains(body)
        add_ipv4(body)
        add_ipv6(body)
        return urls, emails, domains, ips

    # NOTE: findall() here is the helper defined elsewhere in this file; its
    # results are used as offsets into ``body``.
    for position in findall('://', body):
        urls.update(get_uri_ondata(_parse_email_window(body, position, 16, 4096)))

    for position in findall('@', body):
        # RFC 3696, 5322, 5321 for email size limitations
        add_emails(_parse_email_window(body, position, 64, 255))

    for position in findall('.', body):
        # The maximum length of a fqdn, not a hostname, is 1004 characters RFC1035
        # The maximum length of a hostname is 253 characters. Imputed from RFC952, RFC1123 and RFC1035.
        add_domains(_parse_email_window(body, position, 253, 1004))

        # IPv4: seen from the last dot there are up to 11 characters before it
        # ("255.255.255") and the dot plus up to 3 digits after it, hence 4.
        # (An end offset of 3 would cut a three digit final octet short.)
        add_ipv4(_parse_email_window(body, position, 11, 4))

    for position in findall(':', body):
        # The maximum length of IPv6 is 32 Char + 7 ":"
        add_ipv6(_parse_email_window(body, position, 4, 35))

    return urls, emails, domains, ips


def parse_email(msg: email.message.Message, include_raw_body: bool = False, include_attachment_data: bool = False,
                pconf: typing.Optional[dict] = None) -> dict:
    """Parse an e-mail and return a dictionary containing the various parts of
    the e-mail broken down into key-value pairs.

    Args:
      msg (email.message.Message): The already parsed e-mail message. Note that
                               the 'from' and 'date' headers of this object may
                               be replaced while working around parsing problems.
      include_raw_body (bool, optional): If True, every body entry carries its
                               text ('content') and the observed 'uri', 'email',
                               'domain' and 'ip' values in clear; otherwise only
                               their hashes ('uri_hash', 'email_hash',
                               'domain_hash', 'ip_hash') are reported.
                               Defaults to False.
      include_attachment_data (bool, optional): Passed on to traverse_multipart;
                                                if True, includes the full attachment
                                                data in the resulting dictionary.
                                                Defaults to False.
      pconf (dict, optional): A dict with various optional configuration parameters.
                              Keys read here: 'whiteip' (IPs that are not reported),
                              'whitefor' ("for" addresses that are not reported) and
                              'byhostentry' (strings searched in the "by" part of
                              the received headers). The dict is not modified.

    Returns:
      dict: A dictionary with the keys 'header' (dict), 'body' (list with one
            dict per body part) and, if any attachment was found, 'attachment'
            (list).
    """
    header = {}  # type: typing.Dict[str, typing.Any]
    report_struc = {}  # type: typing.Dict[str, typing.Any]  # Final structure
    headers_struc = {}  # type: typing.Dict[str, typing.Any]  # header_structure

    # If no pconf was specified, default to empty dict. The whitelists are read
    # with defaults instead of being written back, so the caller's dict is left
    # untouched.
    pconf = pconf or {}
    white_ips = pconf.get('whiteip', [])
    white_for = pconf.get('whitefor', [])

    # parse and decode subject
    subject = msg.get('subject', '')
    headers_struc['subject'] = eml_parser.decode.decode_field(subject)

    # If parsing had problems, report them
    if msg.defects:
        headers_struc['defect'] = [str(defect) for defect in msg.defects]

    # parse and decode from
    # @TODO verify if this hack is necessary for other e-mail fields as well
    try:
        msg_header_field = str(msg.get('from', '')).lower()
    except (IndexError, AttributeError):
        # We have hit current open issue #27257
        # https://bugs.python.org/issue27257
        # The header is replaced by the workaround value (or by an empty
        # string if there is none).
        logger.exception('We hit bug 27257!')

        recovered_from = eml_parser.decode.workaround_bug_27257(msg, 'from')
        del msg['from']

        if recovered_from:
            msg.add_header('from', recovered_from[0])
            msg_header_field = recovered_from[0].lower()
        else:
            msg.add_header('from', '')
            msg_header_field = ''

    if msg_header_field != '':
        m = eml_parser.regex.email_regex.search(msg_header_field)
        if m:
            headers_struc['from'] = m.group(1)
        else:
            # Re-use the already stringified and lower-cased field rather than
            # calling .lower() on the raw header value a second time.
            headers_struc['from'] = email.utils.parseaddr(msg_header_field)[1]

    # parse and decode to
    headers_struc['to'] = headeremail2list(msg, 'to')

    # parse and decode Cc, only reported if not empty
    cc_list = headeremail2list(msg, 'cc')
    if cc_list:
        headers_struc['cc'] = cc_list

    # parse and decode delivered-to, only reported if not empty
    delivered_to = headeremail2list(msg, 'delivered-to')
    if delivered_to:
        headers_struc['delivered_to'] = delivered_to

    # parse and decode Date
    if 'date' in msg:
        try:
            headers_struc['date'] = eml_parser.decode.robust_string2date(msg.get('date'))
        except Exception:
            # Deliberately broad: whatever goes wrong, fall back to the epoch
            # and store that value in the message as well.
            logger.warning('Error parsing date.')
            headers_struc['date'] = dateutil.parser.parse('1970-01-01T00:00:00+0000')
            msg.replace_header('date', headers_struc['date'])
    else:
        # If date field is absent...
        headers_struc['date'] = dateutil.parser.parse('1970-01-01T00:00:00+0000')

    # mail receiver path / parse any domain, e-mail
    # @TODO parse case where domain is specified but in parentheses only an IP
    headers_struc['received'] = []
    headers_struc['received_email'] = []
    headers_struc['received_domain'] = []
    headers_struc['received_ip'] = []
    try:
        found_smtpin = collections.Counter()  # type: collections.Counter  # hits per watched "by" host

        for received_line in msg.get_all('received', []):
            line = str(received_line).lower()

            received_line_flat = re.sub(r'(\r|\n|\s|\t)+', ' ', line, flags=re.UNICODE)

            # Parse and split routing headers.
            # Return dict of list
            #   date string
            #   from list
            #   for list
            #   by list
            #   with string
            #   warning list
            parsed_routing = eml_parser.routing.parserouting(received_line_flat)

            # If required collect the IP of the gateway that have injected the mail.
            # Iterate all parsed item and find IP
            # It is parsed from the MOST recent to the OLDEST (from IN > Out)
            # We match external IP from the most "OUT" Found.
            # Warning .. It may be spoofed !!
            # It add a warning if multiple identical items are found.
            if pconf.get('byhostentry'):
                for by_item in parsed_routing.get('by', []):  # type: ignore
                    for byhostentry_ in pconf['byhostentry']:
                        byhostentry = byhostentry_.lower()
                        if byhostentry not in by_item:
                            continue

                        # Save the last Found.. ( most external )
                        headers_struc['received_src'] = parsed_routing.get('from')

                        # Increment watched by detection counter, and warn if needed
                        found_smtpin[byhostentry] += 1
                        if found_smtpin[byhostentry] > 1:  # Twice found the header...
                            # Warnings are plain strings in both branches.
                            if parsed_routing.get('warning'):
                                parsed_routing['warning'].append('Duplicate SMTP by entrypoint')
                            else:
                                parsed_routing['warning'] = ['Duplicate SMTP by entrypoint']

            headers_struc['received'].append(parsed_routing)

            # Parse IPs in "received headers"
            ips_in_received_line = eml_parser.regex.ipv6_regex.findall(received_line_flat) + \
                eml_parser.regex.ipv4_regex.findall(received_line_flat)
            for ip in ips_in_received_line:
                try:
                    ip_obj = ipaddress.ip_address(ip)  # type: ignore  # findall yields list[str]
                except ValueError:
                    logger.debug('Invalid IP in received line - "%s"', ip)
                else:
                    if not (ip_obj.is_private or str(ip_obj) in white_ips):
                        headers_struc['received_ip'].append(str(ip_obj))

            # search for domain
            for m in eml_parser.regex.recv_dom_regex.findall(received_line_flat):
                try:
                    ipaddress.ip_address(m)  # type: ignore  # findall yields list[str]
                except ValueError:
                    # IPs were already collected above, hence only candidates
                    # which are NOT a valid IP are reported as domain.
                    headers_struc['received_domain'].append(m)

            # search for e-mail addresses
            for mail_candidate in eml_parser.regex.email_regex.findall(received_line_flat):
                if mail_candidate not in parsed_routing.get('for', []):
                    headers_struc['received_email'].append(mail_candidate)

    except TypeError:  # Ready to parse email without received headers.
        logger.exception('Exception occured while parsing received lines.')

    # Concatenate the "for" e-mails of all hops into one collection | uniq
    # for rapid "find"
    received_foremail = set()
    for _parsed_routing in headers_struc['received']:
        for itemfor in _parsed_routing.get('for', []):
            if itemfor not in white_for:
                received_foremail.add(itemfor)

    # Uniq data found; drop what is empty
    for key in ('received_email', 'received_domain', 'received_ip'):
        unique_values = list(set(headers_struc[key]))
        if unique_values:
            headers_struc[key] = unique_values
        else:
            del headers_struc[key]

    if received_foremail:
        headers_struc['received_foremail'] = list(received_foremail)
    ####################

    # Parse text body
    raw_body = get_raw_body_text(msg)

    # Is it a multipart email ? (more or less than exactly "one" part)
    multipart = len(raw_body) != 1

    bodies = []  # type: typing.List[typing.Dict[str, typing.Any]]
    for _, body, body_multhead in raw_body:
        bodie = {}  # type: typing.Dict[str, typing.Any]

        # Parse any URLs, e-mail addresses, domains and IPs found in the body
        observed_urls, observed_emails, observed_doms, observed_ips = _parse_email_scan_body(body, white_ips)

        # Report uri, email and observed domain or hash if no raw body
        if include_raw_body:
            if observed_urls:
                bodie['uri'] = list(observed_urls)
            if observed_emails:
                bodie['email'] = list(observed_emails)
            if observed_doms:
                bodie['domain'] = list(observed_doms)
            if observed_ips:
                bodie['ip'] = list(observed_ips)
        else:
            if observed_urls:
                bodie['uri_hash'] = [wrap_hash_sha256(uri.lower()) for uri in observed_urls]
            if observed_emails:
                # Email already lowered
                bodie['email_hash'] = [wrap_hash_sha256(address) for address in observed_emails]
            if observed_doms:
                bodie['domain_hash'] = [wrap_hash_sha256(domain.lower()) for domain in observed_doms]
            if observed_ips:
                # IP (v6) already lowered
                bodie['ip_hash'] = [wrap_hash_sha256(fip) for fip in observed_ips]

        # For mail without multipart we will only get the "content....something" headers
        # all other headers are in "header"
        # but we need to convert header tuples in dict..
        # "a","toto"           a: [toto,titi]
        # "a","titi"   --->    c: [truc]
        # "c","truc"
        ch = {}  # type: typing.Dict[str, typing.List]
        for k, v in body_multhead:
            # We are using replace . to : for avoiding issue in mongo
            k = k.lower().replace('.', ':')
            # If not multipart, store only the content-xx related headers with
            # the part; values are always stored as strings.
            if multipart or k.startswith('content'):
                ch.setdefault(k, []).append(str(v))
        bodie['content_header'] = ch  # Store content headers dict

        if include_raw_body:
            bodie['content'] = body

        # Sometimes bad people play with multiple header instances.
        # We "display" the "LAST" one .. as does thunderbird
        val = ch.get('content-type')
        if val:
            bodie['content_type'] = val[-1].split(';', 1)[0].strip()

        # Hash the body
        bodie['hash'] = hashlib.sha256(body.encode('utf-8')).hexdigest()

        bodies.append(bodie)

    # Get all other bulk raw headers
    # "a","toto"           a: [toto,titi]
    # "a","titi"   --->    c: [truc]
    # "c","truc"
    #
    # The names are lower-cased BEFORE being de-duplicated: get_all() ignores
    # the case, so visiting "Received" and "received" separately would store
    # every value twice.
    for k in {name.lower() for name in msg.keys()}:
        try:
            decoded_values = [value for value in msg.get_all(k, []) if value]
        except (IndexError, AttributeError):
            # We have hit current open issue #27257
            # https://bugs.python.org/issue27257
            # The values of the workaround are stored exactly once.
            logger.exception('We hit bug 27257!')
            header[k] = eml_parser.decode.workaround_bug_27257_field_value(msg, k)
            continue

        if decoded_values:
            header[k] = decoded_values

    headers_struc['header'] = header

    # parse attachments
    try:
        attachments = traverse_multipart(msg, 0, include_attachment_data)
    except (binascii.Error, AssertionError):
        # we hit this exception if the payload contains invalid data
        logger.exception('Exception occured while parsing attachment data. Collected data will not be complete!')
        attachments = None

    # The attachments are reported as a list (mandatory to search efficiently
    # in mongodb, see Bug 11 of eml_parser); the key is left out if there are none.
    if attachments:
        report_struc['attachment'] = list(attachments.values())

    report_struc['body'] = bodies

    # Get all other bulk headers
    report_struc['header'] = headers_struc

    return report_struc
Example #2
0
def parse_email(msg: email.message.Message,
                include_raw_body: bool = False,
                include_attachment_data: bool = False,
                pconf: typing.Optional[dict] = None,
                parse_attachments: bool = True) -> dict:
    """Parse an e-mail and return a dictionary containing the various parts of
    the e-mail broken down into key-value pairs.

    Args:
      msg (str): Raw EML e-mail string.
      include_raw_body (bool, optional): If True, includes the raw body in the resulting
                               dictionary. Defaults to False.
      include_attachment_data (bool, optional): If True, includes the full attachment
                                                data in the resulting dictionary.
                                                Defaults to False.
      pconf (dict, optional): A dict with various optional configuration parameters,
                              e.g. whitelist IPs, whitelist e-mail addresses, etc.

      parse_attachments (bool, optional): Set this to false if you want to disable the parsing of attachments.
                                          Please note that HTML attachments as well as other text data marked to be
                                          in-lined, will always be parsed.

    Returns:
      dict: A dictionary with the content of the EML parsed and broken down into
            key-value pairs.
    """
    header = {}  # type: typing.Dict[str, typing.Any]
    report_struc = {}  # type: typing.Dict[str, typing.Any]  # Final structure
    headers_struc = {
    }  # type: typing.Dict[str, typing.Any]  # header_structure
    bodys_struc = {}  # type: typing.Dict[str, typing.Any]  # body structure

    # If no pconf was specified, default to empty dict
    pconf = pconf or {}

    # If no whitelisting is required, set to emtpy list
    if 'whiteip' not in pconf:
        pconf['whiteip'] = []
    # If no whitelisting is required, set to emtpy list
    if 'whitefor' not in pconf:
        pconf['whitefor'] = []

    # parse and decode subject
    subject = msg.get('subject', '')
    headers_struc[
        'subject'] = auxiliar.external.eml_parser.decode.decode_field(subject)

    # If parsing had problems, report it
    if msg.defects:
        headers_struc['defect'] = []
        for exception in msg.defects:
            headers_struc['defect'].append(str(exception))

    # parse and decode "from"
    # @TODO verify if this hack is necessary for other e-mail fields as well
    try:
        msg_header_field = str(msg.get('from', '')).lower()
    except (IndexError, AttributeError):
        # We have hit current open issue #27257
        # https://bugs.python.org/issue27257
        # The field will be set to emtpy as a workaround.
        #
        logger.exception('We hit bug 27257!')

        _from = auxiliar.external.eml_parser.decode.workaround_bug_27257(
            msg, 'from')
        msg.__delitem__('from')

        if _from:
            msg.add_header('from', _from[0])
            __from = _from[0].lower()
        else:
            msg.add_header('from', '')
            __from = ''

        msg_header_field = __from

    if msg_header_field != '':
        m = auxiliar.external.eml_parser.regex.email_regex.search(
            msg_header_field)
        if m:
            headers_struc['from'] = m.group(1)
        else:
            from_ = email.utils.parseaddr(msg.get('from', '').lower())
            headers_struc['from'] = from_[1]

    # parse and decode "to"
    headers_struc['to'] = headeremail2list(msg, 'to')
    # parse and decode "cc"
    headers_struc['cc'] = headeremail2list(msg, 'cc')
    if not headers_struc['cc']:
        headers_struc.pop('cc')

    # parse and decode delivered-to
    headers_struc['delivered_to'] = headeremail2list(msg, 'delivered-to')
    if not headers_struc['delivered_to']:
        headers_struc.pop('delivered_to')

    # parse and decode Date
    # If date field is present
    if 'date' in msg:
        try:
            headers_struc[
                'date'] = auxiliar.external.eml_parser.decode.robust_string2date(
                    msg.get('date'))
        except (TypeError, Exception):
            logger.warning('Error parsing date.')
            headers_struc['date'] = dateutil.parser.parse(
                '1970-01-01T00:00:00+0000')
            msg.replace_header('date', headers_struc['date'])
    else:
        # If date field is absent...
        headers_struc['date'] = dateutil.parser.parse(
            '1970-01-01T00:00:00+0000')

    # mail receiver path / parse any domain, e-mail
    # @TODO parse case where domain is specified but in parentheses only an IP
    headers_struc['received'] = []
    headers_struc['received_email'] = []
    headers_struc['received_domain'] = []
    headers_struc['received_ip'] = []
    try:
        found_smtpin = collections.Counter(
        )  # type: collections.Counter  # Array for storing potential duplicate "HOP"

        for received_line in msg.get_all('received', []):
            line = str(received_line).lower()

            received_line_flat = re.sub(r'(\r|\n|\s|\t)+',
                                        ' ',
                                        line,
                                        flags=re.UNICODE)

            # Parse and split routing headers.
            # Return dict of list
            #   date string
            #   from list
            #   for list
            #   by list
            #   with string
            #   warning list
            parsed_routing = auxiliar.external.eml_parser.routing.parserouting(
                received_line_flat)

            # If required collect the IP of the gateway that have injected the mail.
            # Iterate all parsed item and find IP
            # It is parsed from the MOST recent to the OLDEST (from IN > Out)
            # We match external IP from the most "OUT" Found.
            # Warning .. It may be spoofed !!
            # It add a warning if multiple identical items are found.

            if pconf.get('byhostentry'):
                for by_item in parsed_routing.get('by', []):  # type: ignore
                    for byhostentry_ in pconf['byhostentry']:
                        byhostentry = byhostentry_.lower()
                        # print ("%s %s" % (byhostentry, by_item))
                        if byhostentry in by_item:
                            # Save the last Found.. ( most external )
                            headers_struc['received_src'] = parsed_routing.get(
                                'from')

                            # Increment watched by detection counter, and warn if needed
                            found_smtpin[byhostentry] += 1
                            if found_smtpin[
                                    byhostentry] > 1:  # Twice found the header...
                                if parsed_routing.get('warning'):
                                    parsed_routing['warning'].append(
                                        ['Duplicate SMTP by entrypoint'])
                                else:
                                    parsed_routing['warning'] = [
                                        'Duplicate SMTP by entrypoint'
                                    ]

            headers_struc['received'].append(parsed_routing)

            # Parse IPs in "received headers"
            ips_in_received_line = auxiliar.external.eml_parser.regex.ipv6_regex.findall(received_line_flat) + \
                                   auxiliar.external.eml_parser.regex.ipv4_regex.findall(received_line_flat)
            for ip in ips_in_received_line:
                try:
                    ip_obj = ipaddress.ip_address(
                        ip
                    )  # type: ignore  # type of findall is list[str], so this is correct
                except ValueError:
                    logger.debug(
                        'Invalid IP in received line - "{}"'.format(ip))
                else:
                    if not (ip_obj.is_private
                            or str(ip_obj) in pconf['whiteip']):
                        headers_struc['received_ip'].append(str(ip_obj))

            # search for domain
            for m in auxiliar.external.eml_parser.regex.recv_dom_regex.findall(
                    received_line_flat):
                try:
                    ip_obj = ipaddress.ip_address(
                        m
                    )  # type: ignore  # type of findall is list[str], so this is correct
                except ValueError:
                    # we find IPs using the previous IP crawler, hence we ignore them
                    # here.
                    # iff the regex fails, we add the entry
                    headers_struc['received_domain'].append(m)

            # search for e-mail addresses
            for mail_candidate in auxiliar.external.eml_parser.regex.email_regex.findall(
                    received_line_flat):
                if mail_candidate not in parsed_routing.get('for', []):
                    headers_struc['received_email'] += [mail_candidate]

    except TypeError:  # Ready to parse email without received headers.
        logger.exception('Exception occured while parsing received lines.')

    # Concatenate for emails into one array | uniq
    # for rapid "find"
    headers_struc['received_foremail'] = []
    if 'received' in headers_struc:
        for _parsed_routing in headers_struc['received']:
            for itemfor in _parsed_routing.get('for', []):
                if itemfor not in pconf['whitefor']:
                    headers_struc['received_foremail'].append(itemfor)

    # Uniq data found
    headers_struc['received_email'] = list(set(
        headers_struc['received_email']))
    headers_struc['received_domain'] = list(
        set(headers_struc['received_domain']))
    headers_struc['received_ip'] = list(set(headers_struc['received_ip']))

    # Clean up if empty
    if not headers_struc['received_email']:
        del headers_struc['received_email']

    if 'received_foremail' in headers_struc:
        if not headers_struc['received_foremail']:
            del headers_struc['received_foremail']
        else:
            headers_struc['received_foremail'] = list(
                set(headers_struc['received_foremail']))

    if not headers_struc['received_domain']:
        del headers_struc['received_domain']

    if not headers_struc['received_ip']:
        del headers_struc['received_ip']
    ####################

    # Parse text body
    raw_body = get_raw_body_text(msg)

    if include_raw_body:
        bodys_struc['raw_body'] = raw_body

    bodys = {}

    # Is it a multipart email ?
    if len(raw_body) == 1:
        multipart = False
    else:
        multipart = True

    for body_tup in raw_body:
        bodie = {}  # type: typing.Dict[str, typing.Any]
        _, body, body_multhead = body_tup
        # Parse any URLs and mail found in the body
        list_observed_urls = []  # type: typing.List[str]
        list_observed_email = []  # type: typing.List[str]
        list_observed_dom = []  # type: typing.List[str]
        list_observed_ip = []  # type: typing.List[str]

        # If we start directly a findall on 500K+ body we got time and memory issues...
        # if more than 4K.. lets cheat, we will cut around the thing we search "://, @, ."
        # in order to reduce regex complexity.
        if len(body) < 4096:
            list_observed_urls = get_uri_ondata(body)
            for match in auxiliar.external.eml_parser.regex.email_regex.findall(
                    body):
                list_observed_email.append(match.lower())
            for match in auxiliar.external.eml_parser.regex.dom_regex.findall(
                    body):
                list_observed_dom.append(match.lower())
            for match in auxiliar.external.eml_parser.regex.ipv4_regex.findall(
                    body):
                if not auxiliar.external.eml_parser.regex.priv_ip_regex.match(
                        match):
                    if match not in pconf['whiteip']:
                        list_observed_ip.append(match)
            for match in auxiliar.external.eml_parser.regex.ipv6_regex.findall(
                    body):
                if not auxiliar.external.eml_parser.regex.priv_ip_regex.match(
                        match):
                    if match.lower() not in pconf['whiteip']:
                        list_observed_ip.append(match.lower())
        else:
            for scn_pt in findall('://', body):
                list_observed_urls = get_uri_ondata(
                    body[scn_pt - 16:scn_pt + 4096]) + list_observed_urls

            for scn_pt in findall('@', body):
                # RFC 3696, 5322, 5321 for email size limitations
                for match in auxiliar.external.eml_parser.regex.email_regex.findall(
                        body[scn_pt - 64:scn_pt + 255]):
                    list_observed_email.append(match.lower())

            for scn_pt in findall('.', body):
                # The maximum length of a fqdn, not a hostname, is 1004 characters RFC1035
                # The maximum length of a hostname is 253 characters. Imputed from RFC952, RFC1123 and RFC1035.
                for match in auxiliar.external.eml_parser.regex.dom_regex.findall(
                        body[scn_pt - 253:scn_pt + 1004]):
                    list_observed_dom.append(match.lower())

                # Find IPv4 addresses
                for match in auxiliar.external.eml_parser.regex.ipv4_regex.findall(
                        body[scn_pt - 11:scn_pt + 3]):
                    if not auxiliar.external.eml_parser.regex.priv_ip_regex.match(
                            match):
                        if match not in pconf['whiteip']:
                            list_observed_ip.append(match)

            for scn_pt in findall(':', body):
                # The maximum length of IPv6 is 32 Char + 7 ":"
                for match in auxiliar.external.eml_parser.regex.ipv6_regex.findall(
                        body[scn_pt - 4:scn_pt + 35]):
                    if not auxiliar.external.eml_parser.regex.priv_ip_regex.match(
                            match):
                        if match.lower() not in pconf['whiteip']:
                            list_observed_ip.append(match.lower())

        # Report uri,email and observed domain or hash if no raw body
        if include_raw_body:
            if list_observed_urls:
                bodie['uri'] = list(set(list_observed_urls))

            if list_observed_email:
                bodie['email'] = list(set(list_observed_email))

            if list_observed_dom:
                bodie['domain'] = list(set(list_observed_dom))

            if list_observed_ip:
                bodie['ip'] = list(set(list_observed_ip))

        else:
            if list_observed_urls:
                bodie['uri_hash'] = []
                for uri in list(set(list_observed_urls)):
                    bodie['uri_hash'].append(wrap_hash_sha256(uri.lower()))
            if list_observed_email:
                bodie['email_hash'] = []
                for emel in list(set(list_observed_email)):
                    # Email already lowered
                    bodie['email_hash'].append(wrap_hash_sha256(emel))
            if list_observed_dom:
                bodie['domain_hash'] = []
                for uri in list(set(list_observed_dom)):
                    bodie['domain_hash'].append(wrap_hash_sha256(uri.lower()))
            if list_observed_ip:
                bodie['ip_hash'] = []
                for fip in list(set(list_observed_ip)):
                    # IP (v6) already lowered
                    bodie['ip_hash'].append(wrap_hash_sha256(fip))

        # For mail without multipart we will only get the "content....something" headers
        # all other headers are in "header"
        # but we need to convert header tuples in dict..
        # "a","toto"           a: [toto,titi]
        # "a","titi"   --->    c: [truc]
        # "c","truc"
        ch = {}  # type: typing.Dict[str, typing.List]
        for k, v in body_multhead:
            # make sure we are working with strings only
            v = str(v)

            # We are using replace . to : for avoiding issue in mongo
            k = k.lower().replace('.', ':')  # Lot of lowers, pre-compute :) .
            # print v
            if multipart:
                if k in ch:
                    ch[k].append(v)
                else:
                    ch[k] = [v]
            else:  # if not multipart, store only content-xx related header with part
                if k.startswith(
                        'content'):  # otherwise, we got all header headers
                    if k in ch:
                        ch[k].append(v)
                    else:
                        ch[k] = [v]
        bodie['content_header'] = ch  # Store content headers dict

        if include_raw_body:
            bodie['content'] = body

        # Sometimes bad people play with multiple header instances.
        # We "display" the "LAST" one .. as does thunderbird
        val = ch.get('content-type')
        if val:
            header_val = val[-1]
            bodie['content_type'] = header_val.split(';', 1)[0].strip()

        # Hash the body
        bodie['hash'] = hashlib.sha256(body.encode('utf-8')).hexdigest()

        uid = str(uuid.uuid1())
        bodys[uid] = bodie

    bodys_struc = bodys

    # Get all other bulk raw headers
    # "a","toto"           a: [toto,titi]
    # "a","titi"   --->    c: [truc]
    # "c","truc"
    #
    for k in set(msg.keys()):
        # We are using replace . to : for avoiding issue in mongo
        k = k.lower()  # Lot of lower, precompute...
        decoded_values = []

        try:
            for value in msg.get_all(k, []):
                if value:
                    decoded_values.append(value)
        except (IndexError, AttributeError):
            # We have hit current open issue #27257
            # https://bugs.python.org/issue27257
            # The field will be set to empty as a workaround.
            logger.exception('We hit bug 27257!')

            decoded_values = auxiliar.external.eml_parser.decode.workaround_bug_27257_field_value(
                msg, k)

            if k in header:
                header[k] += decoded_values
            else:
                header[k] = decoded_values

        if decoded_values:
            if k in header:
                header[k] += decoded_values
            else:
                header[k] = decoded_values

    headers_struc['header'] = header

    # parse attachments
    if parse_attachments:
        try:
            report_struc['attachment'] = traverse_multipart(
                msg, 0, include_attachment_data)
        except (binascii.Error, AssertionError):
            # we hit this exception if the payload contains invalid data
            logger.exception(
                'Exception occured while parsing attachment data. Collected data will not be complete!'
            )
            report_struc['attachment'] = None

        # Dirty hack... transform hash into list.. need to be done in the function.
        # Mandatory to search efficiently in mongodb
        # See Bug 11 of eml_parser
        if not report_struc['attachment']:
            del report_struc['attachment']
        else:
            newattach = []
            for attachment in report_struc['attachment']:
                newattach.append(report_struc['attachment'][attachment])
            report_struc['attachment'] = newattach

    newbody = []
    for body in bodys_struc:
        newbody.append(bodys_struc[body])
    report_struc['body'] = newbody
    # End of dirty hack

    # Get all other bulk headers
    report_struc['header'] = headers_struc

    return report_struc
Example #3
0
    def prepare_multipart_part_attachment(
            self,
            msg: email.message.Message,
            counter: int = 0) -> typing.Dict[str, typing.Any]:
        """Extract meta-information from a multipart-part.

        A part is reported as an attachment when it carries a
        ``Content-Disposition`` header whose disposition is not ``inline``,
        or when its main content type is not ``text``. Any other part yields
        an empty dict.

        Args:
            msg (email.message.Message): An e-mail message object.
            counter (int, optional): A counter which is used for generating attachments
                file-names in case there are none found in the header. Default = 0.
                The value is only read; incrementing it is up to the caller.

        Returns:
            dict: Either an empty dict, or a dict with a single entry keyed by a
                freshly generated UUID string. The entry holds ``filename``,
                ``size``, ``extension`` (only if the file name has one), ``hash``,
                ``mime_type``/``mime_type_short`` (only if they could be
                determined), ``raw`` (base64 encoded bytes, only if
                ``self.include_attachment_data`` is set) and ``content_header``
                (lower-cased header names mapped to lists of values).
        """
        attachment: typing.Dict[str, typing.Any] = {}

        # In case we hit bug 27257, try to downgrade the used policy
        try:
            lower_keys = [k.lower() for k in msg.keys()]
        except AttributeError:
            former_policy: email.policy.Policy = msg.policy  # type: ignore
            msg.policy = email.policy.compat32  # type: ignore
            try:
                lower_keys = [k.lower() for k in msg.keys()]
            finally:
                # Always put the original policy back, even if reading the
                # keys fails a second time.
                msg.policy = former_policy  # type: ignore

        is_attachment = (
            ('content-disposition' in lower_keys and msg.get_content_disposition() != 'inline')
            or msg.get_content_maintype() != 'text'
        )
        if not is_attachment:
            return attachment

        # It is an attachment-type part: pull out the raw data so that the
        # size (in bytes), hashes and mime-type can be computed from it.
        if msg.get_content_type() == 'message/rfc822':
            payload = msg.get_payload()
            if len(payload) > 1:
                logger.warning(
                    'More than one payload for "message/rfc822" part detected. This is not supported, please report!'
                )

            try:
                data = payload[0].as_bytes()
            except UnicodeEncodeError:
                data = payload[0].as_bytes(policy=email.policy.compat32)
        else:
            data = msg.get_payload(decode=True)
            if data is None:
                # get_payload(decode=True) returns None when there is nothing
                # to decode (e.g. a multipart container); treat that as an
                # empty payload instead of failing on len(None).
                data = b''

        file_size = len(data)

        filename = msg.get_filename('')
        if filename == '':
            filename = 'part-{0:03d}'.format(counter)
        else:
            filename = eml_parser.decode.decode_field(filename)

        file_id = str(uuid.uuid1())
        entry: typing.Dict[str, typing.Any] = {
            'filename': filename,
            'size': file_size,
        }
        attachment[file_id] = entry

        # os.path always returns the extension as second element
        # in case there is no extension it returns an empty string
        extension = os.path.splitext(filename)[1].lower()
        if extension:
            # strip leading dot
            entry['extension'] = extension[1:]

        entry['hash'] = self.get_file_hash(data)

        mime_type, mime_type_short = self.get_mime_type(data)

        if mime_type is not None and mime_type_short is not None:
            entry['mime_type'] = mime_type
            entry['mime_type_short'] = mime_type_short
        elif magic is not None:
            # Only complain when the magic module is available; without it
            # there is no mime-type to be expected in the first place.
            logger.warning(
                'Error determining attachment mime-type - "%s"', file_id)

        if self.include_attachment_data:
            entry['raw'] = base64.b64encode(data)

        # Group the part headers by lower-cased name; a header which occurs
        # several times keeps all of its values, in order of appearance.
        ch: typing.Dict[str, typing.List[str]] = {}
        for k, v in msg.items():
            ch.setdefault(k.lower(), []).append(str(v))

        entry['content_header'] = ch

        return attachment