Ejemplo n.º 1
1
def parse_email(msg: email.message.Message, include_raw_body: bool = False, include_attachment_data: bool = False,
                pconf: typing.Optional[dict] = None) -> dict:
    """Parse an e-mail and return a dictionary containing the various parts of
    the e-mail broken down into key-value pairs.

    Args:
      msg (email.message.Message): An e-mail message object.
      include_raw_body (bool, optional): If True, includes the raw body in the resulting
                               dictionary. Defaults to False.
      include_attachment_data (bool, optional): If True, includes the full attachment
                                                data in the resulting dictionary.
                                                Defaults to False.
      pconf (dict, optional): A dict with various optional configuration parameters.
                              The keys read here are 'whiteip', 'whitefor' and
                              'byhostentry'. The dict passed in is not modified.

    Returns:
      dict: A dictionary with the keys 'header' and 'body' and, when attachments
            were found, 'attachment'.
    """
    header = {}  # type: typing.Dict[str, typing.Any]
    report_struc = {}  # type: typing.Dict[str, typing.Any]  # Final structure
    headers_struc = {}  # type: typing.Dict[str, typing.Any]  # header_structure
    bodys_struc = {}  # type: typing.Dict[str, typing.Any]  # body structure

    # Work on a shallow copy so that the defaults added below never leak into
    # the caller's configuration dict.
    pconf = dict(pconf) if pconf else {}

    # If no IP whitelisting is required, start with an empty list
    pconf.setdefault('whiteip', [])
    # If no "for" address whitelisting is required, start with an empty list
    pconf.setdefault('whitefor', [])

    # parse and decode subject
    subject = msg.get('subject', '')
    headers_struc['subject'] = eml_parser.decode.decode_field(subject)

    # If parsing had problem... report it...
    if msg.defects:
        headers_struc['defect'] = []
        for exception in msg.defects:
            headers_struc['defect'].append(str(exception))

    # parse and decode from
    # @TODO verify if this hack is necessary for other e-mail fields as well
    try:
        msg_header_field = str(msg.get('from', '')).lower()
    except (IndexError, AttributeError):
        # We have hit current open issue #27257
        # https://bugs.python.org/issue27257
        # The field will be set to empty as a workaround.
        #
        logger.exception('We hit bug 27257!')

        _from = eml_parser.decode.workaround_bug_27257(msg, 'from')
        msg.__delitem__('from')

        if _from:
            msg.add_header('from', _from[0])
            __from = _from[0].lower()
        else:
            msg.add_header('from', '')
            __from = ''

        msg_header_field = __from

    if msg_header_field != '':
        m = eml_parser.regex.email_regex.search(msg_header_field)
        if m:
            headers_struc['from'] = m.group(1)
        else:
            from_ = email.utils.parseaddr(msg.get('from', '').lower())
            headers_struc['from'] = from_[1]

    # parse and decode to
    headers_struc['to'] = headeremail2list(msg, 'to')
    # parse and decode Cc
    headers_struc['cc'] = headeremail2list(msg, 'cc')
    if not headers_struc['cc']:
        headers_struc.pop('cc')

    # parse and decode delivered-to
    headers_struc['delivered_to'] = headeremail2list(msg, 'delivered-to')
    if not headers_struc['delivered_to']:
        headers_struc.pop('delivered_to')

    # parse and decode Date
    # If date field is present
    if 'date' in msg:
        try:
            headers_struc['date'] = eml_parser.decode.robust_string2date(msg.get('date'))
        except Exception:
            logger.warning('Error parsing date.')
            headers_struc['date'] = dateutil.parser.parse('1970-01-01T00:00:00+0000')
            # NOTE(review): this stores a datetime object (not a string) as the
            # header value - confirm that this is intended.
            msg.replace_header('date', headers_struc['date'])
    else:
        # If date field is absent...
        headers_struc['date'] = dateutil.parser.parse('1970-01-01T00:00:00+0000')

    # mail receiver path / parse any domain, e-mail
    # @TODO parse case where domain is specified but in parentheses only an IP
    headers_struc['received'] = []
    headers_struc['received_email'] = []
    headers_struc['received_domain'] = []
    headers_struc['received_ip'] = []
    try:
        found_smtpin = collections.Counter()  # type: collections.Counter  # Array for storing potential duplicate "HOP"

        for received_line in msg.get_all('received', []):
            line = str(received_line).lower()

            received_line_flat = re.sub(r'(\r|\n|\s|\t)+', ' ', line, flags=re.UNICODE)

            # Parse and split routing headers.
            # Return dict of list
            #   date string
            #   from list
            #   for list
            #   by list
            #   with string
            #   warning list
            parsed_routing = eml_parser.routing.parserouting(received_line_flat)

            # If required collect the IP of the gateway that have injected the mail.
            # Iterate all parsed item and find IP
            # It is parsed from the MOST recent to the OLDEST (from IN > Out)
            # We match external IP from the most "OUT" Found.
            # Warning .. It may be spoofed !!
            # It add a warning if multiple identical items are found.

            if pconf.get('byhostentry'):
                for by_item in parsed_routing.get('by', []):  # type: ignore
                    for byhostentry_ in pconf['byhostentry']:
                        byhostentry = byhostentry_.lower()
                        if byhostentry in by_item:
                            # Save the last Found.. ( most external )
                            headers_struc['received_src'] = parsed_routing.get('from')

                            # Increment watched by detection counter, and warn if needed
                            found_smtpin[byhostentry] += 1
                            if found_smtpin[byhostentry] > 1:  # Twice found the header...
                                # The warning list holds plain strings, so append
                                # the message itself rather than a nested list.
                                if parsed_routing.get('warning'):
                                    parsed_routing['warning'].append('Duplicate SMTP by entrypoint')
                                else:
                                    parsed_routing['warning'] = ['Duplicate SMTP by entrypoint']

            headers_struc['received'].append(parsed_routing)

            # Parse IPs in "received headers"
            ips_in_received_line = eml_parser.regex.ipv6_regex.findall(received_line_flat) + \
                                   eml_parser.regex.ipv4_regex.findall(received_line_flat)
            for ip in ips_in_received_line:
                try:
                    ip_obj = ipaddress.ip_address(
                        ip)  # type: ignore  # type of findall is list[str], so this is correct
                except ValueError:
                    logger.debug('Invalid IP in received line - "{}"'.format(ip))
                else:
                    if not (ip_obj.is_private or str(ip_obj) in pconf['whiteip']):
                        headers_struc['received_ip'].append(str(ip_obj))

            # search for domain
            for m in eml_parser.regex.recv_dom_regex.findall(received_line_flat):
                try:
                    ip_obj = ipaddress.ip_address(m)  # type: ignore  # type of findall is list[str], so this is correct
                except ValueError:
                    # we find IPs using the previous IP crawler, hence we ignore them
                    # here.
                    # iff the IP conversion fails, we add the entry
                    headers_struc['received_domain'].append(m)

            # search for e-mail addresses
            for mail_candidate in eml_parser.regex.email_regex.findall(received_line_flat):
                if mail_candidate not in parsed_routing.get('for', []):
                    headers_struc['received_email'] += [mail_candidate]

    except TypeError:  # Ready to parse email without received headers.
        logger.exception('Exception occured while parsing received lines.')

    # Concatenate for emails into one array | uniq
    # for rapid "find"
    headers_struc['received_foremail'] = []
    if 'received' in headers_struc:
        for _parsed_routing in headers_struc['received']:
            for itemfor in _parsed_routing.get('for', []):
                if itemfor not in pconf['whitefor']:
                    headers_struc['received_foremail'].append(itemfor)

    # Uniq data found
    headers_struc['received_email'] = list(set(headers_struc['received_email']))
    headers_struc['received_domain'] = list(set(headers_struc['received_domain']))
    headers_struc['received_ip'] = list(set(headers_struc['received_ip']))

    # Clean up if empty
    if not headers_struc['received_email']:
        del headers_struc['received_email']

    if 'received_foremail' in headers_struc:
        if not headers_struc['received_foremail']:
            del headers_struc['received_foremail']
        else:
            headers_struc['received_foremail'] = list(set(headers_struc['received_foremail']))

    if not headers_struc['received_domain']:
        del headers_struc['received_domain']

    if not headers_struc['received_ip']:
        del headers_struc['received_ip']
    ####################

    # Parse text body
    raw_body = get_raw_body_text(msg)

    if include_raw_body:
        bodys_struc['raw_body'] = raw_body

    bodys = {}
    multipart = True  # Is it a multipart email ?
    if len(raw_body) == 1:
        multipart = False  # No only "one" Part
    for body_tup in raw_body:
        bodie = {}  # type: typing.Dict[str, typing.Any]
        _, body, body_multhead = body_tup
        # Parse any URLs and mail found in the body
        list_observed_urls = []  # type: typing.List[str]
        list_observed_email = []  # type: typing.List[str]
        list_observed_dom = []  # type: typing.List[str]
        list_observed_ip = []  # type: typing.List[str]

        # If we start directly a findall on 500K+ body we got time and memory issues...
        # if more than 4K.. lets cheat, we will cut around the thing we search "://, @, ."
        # in order to reduce regex complexity.
        if len(body) < 4096:
            list_observed_urls = get_uri_ondata(body)
            for match in eml_parser.regex.email_regex.findall(body):
                list_observed_email.append(match.lower())
            for match in eml_parser.regex.dom_regex.findall(body):
                list_observed_dom.append(match.lower())
            for match in eml_parser.regex.ipv4_regex.findall(body):
                if not eml_parser.regex.priv_ip_regex.match(match):
                    if match not in pconf['whiteip']:
                        list_observed_ip.append(match)
            for match in eml_parser.regex.ipv6_regex.findall(body):
                if not eml_parser.regex.priv_ip_regex.match(match):
                    if match.lower() not in pconf['whiteip']:
                        list_observed_ip.append(match.lower())
        else:
            # Every window start below is clamped to 0: a negative start index
            # would make the slice wrap around to the end of the body and miss
            # anything located near the beginning.
            for scn_pt in findall('://', body):
                list_observed_urls = get_uri_ondata(body[max(0, scn_pt - 16):scn_pt + 4096]) + list_observed_urls

            for scn_pt in findall('@', body):
                # RFC 3696, 5322, 5321 for email size limitations
                for match in eml_parser.regex.email_regex.findall(body[max(0, scn_pt - 64):scn_pt + 255]):
                    list_observed_email.append(match.lower())

            for scn_pt in findall('.', body):
                # The maximum length of a fqdn, not a hostname, is 1004 characters RFC1035
                # The maximum length of a hostname is 253 characters. Imputed from RFC952, RFC1123 and RFC1035.
                for match in eml_parser.regex.dom_regex.findall(body[max(0, scn_pt - 253):scn_pt + 1004]):
                    list_observed_dom.append(match.lower())

                # Find IPv4 addresses
                for match in eml_parser.regex.ipv4_regex.findall(body[max(0, scn_pt - 11):scn_pt + 3]):
                    if not eml_parser.regex.priv_ip_regex.match(match):
                        if match not in pconf['whiteip']:
                            list_observed_ip.append(match)

            for scn_pt in findall(':', body):
                # The maximum length of IPv6 is 32 Char + 7 ":"
                for match in eml_parser.regex.ipv6_regex.findall(body[max(0, scn_pt - 4):scn_pt + 35]):
                    if not eml_parser.regex.priv_ip_regex.match(match):
                        if match.lower() not in pconf['whiteip']:
                            list_observed_ip.append(match.lower())

        # Report uri,email and observed domain or hash if no raw body
        if include_raw_body:
            if list_observed_urls:
                bodie['uri'] = list(set(list_observed_urls))

            if list_observed_email:
                bodie['email'] = list(set(list_observed_email))

            if list_observed_dom:
                bodie['domain'] = list(set(list_observed_dom))

            if list_observed_ip:
                bodie['ip'] = list(set(list_observed_ip))

        else:
            if list_observed_urls:
                bodie['uri_hash'] = []
                for uri in list(set(list_observed_urls)):
                    bodie['uri_hash'].append(wrap_hash_sha256(uri.lower()))
            if list_observed_email:
                bodie['email_hash'] = []
                for emel in list(set(list_observed_email)):
                    # Email already lowered
                    bodie['email_hash'].append(wrap_hash_sha256(emel))
            if list_observed_dom:
                bodie['domain_hash'] = []
                for uri in list(set(list_observed_dom)):
                    bodie['domain_hash'].append(wrap_hash_sha256(uri.lower()))
            if list_observed_ip:
                bodie['ip_hash'] = []
                for fip in list(set(list_observed_ip)):
                    # IP (v6) already lowered
                    bodie['ip_hash'].append(wrap_hash_sha256(fip))

        # For mail without multipart we will only get the "content....something" headers
        # all other headers are in "header"
        # but we need to convert header tuples in dict..
        # "a","toto"           a: [toto,titi]
        # "a","titi"   --->    c: [truc]
        # "c","truc"
        ch = {}  # type: typing.Dict[str, typing.List]
        for k, v in body_multhead:
            # make sure we are working with strings only
            v = str(v)

            # We are using replace . to : for avoiding issue in mongo
            k = k.lower().replace('.', ':')  # Lot of lowers, precompute :) .
            if multipart:
                if k in ch:
                    ch[k].append(v)
                else:
                    ch[k] = [v]
            else:  # if not multipart, store only content-xx related header with part
                if k.startswith('content'):  # otherwise, we got all header headers
                    k = k.lower().replace('.', ':')
                    if k in ch:
                        ch[k].append(v)
                    else:
                        ch[k] = [v]
        bodie['content_header'] = ch  # Store content headers dict

        if include_raw_body:
            bodie['content'] = body

        # Sometimes bad people play with multiple header instances.
        # We "display" the "LAST" one .. as does thunderbird
        val = ch.get('content-type')
        if val:
            header_val = val[-1]
            bodie['content_type'] = header_val.split(';', 1)[0].strip()

        # Hash the body
        bodie['hash'] = hashlib.sha256(body.encode('utf-8')).hexdigest()

        uid = str(uuid.uuid1())
        bodys[uid] = bodie

    bodys_struc = bodys

    # Get all other bulk raw headers
    # "a","toto"           a: [toto,titi]
    # "a","titi"   --->    c: [truc]
    # "c","truc"
    #
    # Header lookups are case-insensitive, so de-duplicate on the lower-cased
    # name; otherwise "Received" and "received" would both fetch (and store)
    # the very same values.
    for k in {key.lower() for key in msg.keys()}:
        decoded_values = []

        try:
            for value in msg.get_all(k, []):
                if value:
                    decoded_values.append(value)
        except (IndexError, AttributeError):
            # We have hit current open issue #27257
            # https://bugs.python.org/issue27257
            # Fall back to the workaround helper for this field.
            logger.exception('We hit bug 27257!')

            decoded_values = eml_parser.decode.workaround_bug_27257_field_value(msg, k)

        # Values are stored exactly once, whichever path produced them.
        if decoded_values:
            header[k] = decoded_values

    headers_struc['header'] = header

    # parse attachments
    try:
        report_struc['attachment'] = traverse_multipart(msg, 0, include_attachment_data)
    except (binascii.Error, AssertionError):
        # we hit this exception if the payload contains invalid data
        logger.exception('Exception occured while parsing attachment data. Collected data will not be complete!')
        report_struc['attachment'] = None

    # Dirty hack... transform hash into list.. need to be done in the function.
    # Mandatory to search efficiently in mongodb
    # See Bug 11 of eml_parser
    if not report_struc['attachment']:
        del report_struc['attachment']
    else:
        report_struc['attachment'] = list(report_struc['attachment'].values())

    report_struc['body'] = list(bodys_struc.values())
    # End of dirty hack

    # Get all other bulk headers
    report_struc['header'] = headers_struc

    return report_struc
Ejemplo n.º 2
0
    def traverse_multipart(self,
                           msg: email.message.Message,
                           counter: int = 0) -> typing.Dict[str, typing.Any]:
        """Walk a message tree and merge the attachment data of all its parts.

        Args:
            msg (email.message.Message): An e-mail message object.
            counter (int, optional): Passed unchanged to
                prepare_multipart_part_attachment and to every recursive call.
                Default = 0.

        Returns:
            dict: For a non-multipart message, whatever
                prepare_multipart_part_attachment returns for it. For a multipart
                message, the merged results of all sub-parts (plus the message
                itself when it is a 'message/rfc822' container).
        """
        # Leaf part: nothing to recurse into.
        if not msg.is_multipart():
            return self.prepare_multipart_part_attachment(msg, counter)

        collected = {}

        # An embedded e-mail is reported as an attachment of its own, in addition
        # to whatever its sub-parts contribute below.
        is_embedded_mail = 'content-type' in msg and msg.get_content_type() == 'message/rfc822'
        if is_embedded_mail:
            collected.update(self.prepare_multipart_part_attachment(msg, counter))

        for child in msg.get_payload():
            collected.update(self.traverse_multipart(child, counter))

        return collected
Ejemplo n.º 3
0
def traverse_multipart(msg: email.message.Message, counter: int = 0, include_attachment_data: bool = False) -> \
        typing.Dict[str, typing.Any]:
    """Walk a message tree and merge the attachment data of all its parts.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): Passed unchanged to prepare_multipart_part_attachment
            and to every recursive call. Default = 0.
        include_attachment_data (bool, optional): Forwarded unchanged to
            prepare_multipart_part_attachment. Default = False.

    Returns:
        dict: For a non-multipart message, whatever prepare_multipart_part_attachment
            returns for it. For a multipart message, the merged results of all
            sub-parts (plus the message itself when it is a 'message/rfc822' container).
    """
    # Leaf part: delegate straight to the attachment helper.
    if not msg.is_multipart():
        return prepare_multipart_part_attachment(msg, counter, include_attachment_data)

    collected = {}

    # An embedded e-mail is itself listed as an attachment before its parts are walked.
    if 'content-type' in msg and msg.get_content_type() == 'message/rfc822':
        own_entry = prepare_multipart_part_attachment(msg, counter, include_attachment_data)  # type: ignore
        collected.update(own_entry)

    for child in msg.get_payload():  # type: ignore
        child_entries = traverse_multipart(child, counter, include_attachment_data)  # type: ignore
        collected.update(child_entries)

    return collected
Ejemplo n.º 4
0
def workaround_bug_27257_field_value(msg: email.message.Message, header: str) -> typing.List[str]:
    """Work around bug 27257 by reading a header under the compat32 policy.

    The message policy is switched to compat32 for the duration of the lookup
    and is always restored afterwards, even if the lookup itself raises.

    Args:
        msg (email.message.Message): An e-mail message object.
        header (str): The header field to decode.

    Returns:
        list: All values of the header which are not the empty string.
    """
    original_policy = msg.policy  # type: ignore
    msg.policy = email.policy.compat32  # type: ignore

    try:
        return [value for value in msg.get_all(header, []) if value != '']
    finally:
        # Restore the policy unconditionally so that a failing lookup cannot
        # leave the message permanently switched to compat32.
        msg.policy = original_policy  # type: ignore
Ejemplo n.º 5
0
def traverse_multipart(msg: email.message.Message, counter: int = 0, include_attachment_data: bool = False) -> \
        typing.Dict[str, typing.Any]:
    """Recursively gather the attachment entries of an e-mail message.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): Handed on as-is to prepare_multipart_part_attachment
            and to the recursive calls. Default = 0.
        include_attachment_data (bool, optional): Handed on as-is to
            prepare_multipart_part_attachment. Default = False.

    Returns:
        dict: The result of prepare_multipart_part_attachment for a single-part
            message; otherwise the union of the entries of every sub-part, with the
            message itself included when its content type is 'message/rfc822'.
    """
    if msg.is_multipart():
        found = {}

        # A wrapped e-mail counts as an attachment in its own right.
        wraps_mail = ('content-type' in msg) and (msg.get_content_type() == 'message/rfc822')
        if wraps_mail:
            found.update(prepare_multipart_part_attachment(msg, counter, include_attachment_data))  # type: ignore

        # Later sub-parts overwrite earlier ones on key collisions (dict.update).
        for sub_part in msg.get_payload():  # type: ignore
            found.update(traverse_multipart(sub_part, counter, include_attachment_data))  # type: ignore

        return found

    return prepare_multipart_part_attachment(msg, counter, include_attachment_data)
Ejemplo n.º 6
0
def checkContainsArchive(msg: email.message.Message) -> bool:
    """Check whether the given envelope is an Azazo transfer file.

    A message qualifies when both the Const.PROJECT_NAME_HEADER and the
    Const.PROJECT_VERSION_HEADER headers are present with non-empty values.

    Returns:
        bool: True if both headers are set, False otherwise. False is also
            returned when the lookup raises; the traceback is printed in that case.
    """
    try:
        has_name = bool(msg.get(Const.PROJECT_NAME_HEADER))
        has_version = bool(msg.get(Const.PROJECT_VERSION_HEADER))
    except Exception:
        # Best effort: report the problem but treat the message as "not an archive".
        traceback.print_exc()
        return False
    return has_name and has_version
Ejemplo n.º 7
0
def extract_files_from_email(
        message: email.message.Message) -> ParseMessageResult:
    """
    Parses an email Message and returns a ParseMessageResult:
    1. The message must carry a non-empty Subject and From header (asserted).
    2. Does the subject contain the word "unsubscribe"? If yes, return
       with status UNSUBSCRIBE.
    3. Is the subject 8 characters long? If yes, return with status REGISTER.
    4. Otherwise:
        - Walk through the message and grab all parts whose Content-Type
          header contains "application/pdf;" or "application/epub".
        - Return a ParseMessageResult with those files: status SUCCESS when
          at least one file was found, FAILURE otherwise.

    Parts without a Content-Type header are skipped.
    """
    subject: str = message.get("Subject")
    sent_from: str = message.get("From")
    assert subject and sent_from
    if "unsubscribe" in subject.lower():
        return ParseMessageResult(
            sent_from=sent_from,
            subject=subject,
            status=MessageStatus.UNSUBSCRIBE,
            extracted_files=[],
        )
    # FIXME: need a more robust check here
    if len(subject) == 8:
        return ParseMessageResult(
            sent_from=sent_from,
            subject=subject,
            status=MessageStatus.REGISTER,
            extracted_files=[],
        )

    # (marker searched in the Content-Type header, filename used when the
    # part does not name itself); checked in this order.
    attachment_kinds = (
        ("application/pdf;", "Remailable_Attachment.pdf"),
        ("application/epub", "Remailable_Attachment.epub"),
    )

    # Now we're done parsing the subject, we should check if there are any attachments
    files: List[FileTuple] = []
    for part in message.walk():
        raw_content_type = part["Content-Type"]
        # A part is not required to carry a Content-Type header; treat a
        # missing one as "no match" instead of failing on `in None`.
        content_type = "" if raw_content_type is None else str(raw_content_type)
        for marker, fallback_name in attachment_kinds:
            if marker in content_type:
                filename = part.get_filename() or fallback_name
                # NOTE(review): assumes the payload is base64-encoded - confirm.
                filebytes = base64.b64decode(part.get_payload())
                assert isinstance(filename, str)
                files.append((filename, filebytes))
                break

    # An empty list means no supported attachment could be extracted.
    status = MessageStatus.SUCCESS if files else MessageStatus.FAILURE
    return ParseMessageResult(
        sent_from=sent_from,
        subject=subject,
        status=status,
        extracted_files=files,
    )
Ejemplo n.º 8
0
def get_first_text_block(
    email_message_instance: email.message.Message
) -> 'list':  # http://python-3.ru/page/imap-email-python
    """Read the inside of the message and extract its content.

    For a 'text' message the payload is split on ', '. For a 'multipart'
    message the same is done with the first direct sub-part whose main type
    is 'text'. In every other case None is returned.
    """
    top_level_type = email_message_instance.get_content_maintype()

    if top_level_type == 'text':
        return email_message_instance.get_payload().split(', ')

    if top_level_type != 'multipart':
        return None

    for sub_part in email_message_instance.get_payload():
        if sub_part.get_content_maintype() != 'text':
            continue
        return sub_part.get_payload().split(', ')

    return None
Ejemplo n.º 9
0
def headeremail2list(mail: email.message.Message, header: str) -> typing.List[str]:
    """Extract the lower-cased e-mail addresses found in a header field.

    Args:
        mail (email.message.Message): An e-mail message object.
        header (str): The header field to decode.

    Returns:
        list: The non-empty addresses of all occurrences of the header, lower-cased.
            When eml_parser.regex.parsing_email_force_tld is set, only addresses
            matching eml_parser.regex.email_force_tld_regex are kept.
    """
    try:
        parsed_pairs = email.utils.getaddresses(mail.get_all(header, []))
    except (IndexError, AttributeError):
        # Reading the header failed; retry through the bug 27257 workaround helper.
        fallback_values = eml_parser.decode.workaround_bug_27257(mail, header)
        parsed_pairs = email.utils.getaddresses(fallback_values)

    addresses = []

    for pair in parsed_pairs:
        address = pair[1]
        if address == '':
            continue

        # The TLD filter is consulted per address, only once one is actually present.
        if eml_parser.regex.parsing_email_force_tld:
            if not eml_parser.regex.email_force_tld_regex.match(address):
                continue

        addresses.append(address.lower())

    return addresses
Ejemplo n.º 10
0
def hash_headers(mail: email.message.Message, header_to_hash: str, bh: str) -> SHA256.SHA256Hash:
    """Build the header text to be hashed for a DKIM check and hash it.

    Args:
        mail: email.message.Message object.
        header_to_hash: email header names to hash, separated by a colon, e.g.
            'from:from:reply-to:subject:subject:date:date:message-id:to:cc'.
            Repeated names are used once, in order of first appearance.
        bh: body hash of the email body, base64 encoded. Currently unused here.

    Returns:
        A SHA256 hash object over the text
        "from:Christian Schneider <*****@*****.**>\\r\\n...", followed by the
        DKIM-Signature header with its b= value emptied.

    Raises:
        TypeError: if the mail has no DKIM-Signature header.
    """
    # dict.fromkeys drops repeated names while keeping first-occurrence order.
    # (Removing entries from the list while iterating over it, as was done
    # before, skips the element following every removal.)
    unique_names = dict.fromkeys(header_to_hash.split(":"))

    lines = []
    for name in unique_names:
        value = mail[name]
        if value:
            lines.append(name.lower() + ":" + value.strip() + "\r\n")

    # Unfold the signature header and collapse runs of whitespace.
    dkim_header = mail.get("DKIM-Signature")
    dkim_header = re.sub(r'(\n|\r)', "", dkim_header)
    dkim_header = re.sub(r'\s+', " ", dkim_header)
    lines.append("dkim-signature:{}\r\n".format(dkim_header))

    # Replace b=... with b= (applied to the whole header text).
    headers = re.sub(r'b=[\w0-9\s/+=]+', "b=", "".join(lines))

    # The former assert against one fixed digest was a debugging leftover which
    # made the function fail for every other e-mail; it has been removed.
    return SHA256.new(headers.encode())
Ejemplo n.º 11
0
def get_email_body_as_html(message: email.message.Message) -> str:
    """
    Return the payload of the message's second sub-part (index 1).
    """
    # Not necessarily the safest assumption: the HTML body is expected to be
    # the second part of a multipart message.
    second_part = message.get_payload(1)
    return second_part.get_payload()
Ejemplo n.º 12
0
def headeremail2list(mail: email.message.Message,
                     header: str) -> typing.List[str]:
    """Turn a header field holding e-mail addresses into a list of addresses.

    Args:
        mail (email.message.Message): An e-mail message object.
        header (str): The header field to decode.

    Returns:
        list: Lower-cased, non-empty addresses taken from every occurrence of the
            header. With eml_parser.regex.parsing_email_force_tld enabled, addresses
            not matching eml_parser.regex.email_force_tld_regex are dropped.
    """

    def _is_wanted(address: str) -> bool:
        # Empty addresses never count; the TLD check applies only when enabled.
        if address == '':
            return False
        if eml_parser.regex.parsing_email_force_tld:
            return bool(eml_parser.regex.email_force_tld_regex.match(address))
        return True

    try:
        entries = email.utils.getaddresses(mail.get_all(header, []))
    except (IndexError, AttributeError):
        # Header access failed; use the bug 27257 workaround helper instead.
        entries = email.utils.getaddresses(
            eml_parser.decode.workaround_bug_27257(mail, header))

    return [entry[1].lower() for entry in entries if _is_wanted(entry[1])]
Ejemplo n.º 13
0
def get_by_msg(msg: email.message.Message, attr: str, decode=False) -> Union[bytes, str]:
    """Get a header value from msg.

    Args:
        msg: The message to read from.
        attr: Name of the header.
        decode: If True, RFC 2047 encoded words in the value are decoded and
            the result is returned as one string.

    Returns:
        The header value as returned by msg.get (None when the header is
        missing), or its decoded text when decode is True.
    """
    value = msg.get(attr)
    if not decode or value is None:
        return value
    # decode_header returns a list of (fragment, charset) pairs - one per
    # encoded or plain run - so it cannot be unpacked into two names.
    # make_header joins the fragments using their charsets.
    fragments = email.header.decode_header(value)
    return str(email.header.make_header(fragments))
Ejemplo n.º 14
0
def get_dmarc_report(msg: email.message.Message) -> Any:
    """Return the first XML file found in a zip attachment, converted by xml2dict.

    Every part of the message is visited; parts whose content type is not
    "application/zip" are skipped. The first member of the first non-empty
    archive is parsed and its root element is passed to xml2dict. None is
    returned when nothing is found.
    """
    for part in msg.walk():
        if part.get_content_type() != "application/zip":
            continue

        zipped_bytes = io.BytesIO(part.get_payload(decode=True))
        with zipfile.ZipFile(zipped_bytes, "r") as archive:
            for member_name in archive.namelist():
                with archive.open(member_name) as member:
                    root = ET.parse(member).getroot()
                    return xml2dict(root)

    return None
Ejemplo n.º 15
0
 def __init__(self, message: email.message.Message):
     """Store the message and parse it; only multipart messages are accepted.

     Raises:
         NotMultipart: if the message is not multipart.
     """
     # I am assuming that all messages are sent as pgp-signed
     self._message = message
     # Nothing has been extracted yet (presumably filled in by self._parse()).
     self._body = self._signature = self._public_key = None
     if message.is_multipart():
         self._parse()
     else:
         raise NotMultipart(sender=self.sender_address)
Ejemplo n.º 16
0
def analysisArchive(msg: email.message.Message) -> ProjectArchiveInfo:
    """Build a ``ProjectArchiveInfo`` from the first archive part of ``msg``.

    The direct sub-parts of ``msg`` are scanned for one whose content type
    equals ``Const.ARCHIVE_TYPE``. The file name and decoded payload come
    from that part; the project name and version are read from headers of
    the enclosing message.

    Args:
        msg: The message whose payload is a list of sub-parts.

    Returns:
        The archive information, or ``None`` when no sub-part matches.
    """
    for part in msg.get_payload():  # type: email.message.Message
        if part.get_content_type() != Const.ARCHIVE_TYPE:
            continue
        archive_name = part.get_filename()
        archive_bytes = part.get_payload(decode=True)
        project = get_by_msg(msg, Const.PROJECT_NAME_HEADER)
        release = get_by_msg(msg, Const.PROJECT_VERSION_HEADER)
        return ProjectArchiveInfo(
            fileName=archive_name,
            data=archive_bytes,
            projectName=project,
            version=release,
        )
    return None
Ejemplo n.º 17
0
    def decode_body(message: email.message.Message):
        """Return the decoded body of ``message``.

        For a multipart message the direct sub-parts are scanned in order,
        skipping every part that carries a file name (attachments). The first
        ``text/plain`` part is returned as a stripped string; the first
        ``text/html`` part is returned wrapped with ``html(...)``. For a
        non-multipart message the whole payload is decoded and stripped.

        Args:
            message: The message to extract the body from.

        Returns:
            The body text (or the ``html(...)`` wrapper for HTML bodies), or
            ``None`` when a multipart message has no usable text part.
        """

        def _decode(part):
            # get_content_charset() returns None when the part declares no
            # charset, and bytes.decode(None) raises TypeError; fall back to
            # UTF-8, which also covers plain ASCII.
            charset = part.get_content_charset() or 'utf-8'
            return part.get_payload(decode=True).decode(charset)

        if not message.is_multipart():
            return _decode(message).strip()

        # The message comes in multiple portions (i.e. there are attachments).
        for part in message.get_payload():
            # Throw away attachments
            if part.get_filename():
                continue

            # Return an html object or plaintext. Returning on the first hit
            # is fine: the other parts are usually just extra attachments.
            content_type = part.get_content_type()
            if content_type == 'text/plain':
                return _decode(part).strip()
            if content_type == 'text/html':
                return html(_decode(part))
        return None
Ejemplo n.º 18
0
def get_revolut_statement_part(
        msg: email.message.Message) -> Optional[email.message.Message]:
    """Return the message part that holds a Revolut account statement.

    Every part of ``msg`` is visited and the first one whose file name looks
    like ``account-statement_<from-date>_<to-date>_de-ch_<hex>.csv`` is
    returned.

    Args:
        msg: The message to search.

    Returns:
        The matching part, or ``None`` when no part has such a file name.
    """
    # Raw string so that ``\d`` is a regex class rather than an invalid string
    # escape; the dot before "csv" is escaped so it only matches a literal dot.
    statement_pattern = (
        r'^account-statement_\d*-\d*-\d*_\d*-\d*-\d*_de-ch_[0-9a-f]*\.csv$')
    for part in msg.walk():
        filename = part.get_filename()
        if not filename:
            continue
        if re.match(statement_pattern, filename):
            return part
    return None
Ejemplo n.º 19
0
def _find_timepie_part(msg: email.message.Message) -> t.Sequence[Ping]:
    """Locate and parse the TagTime log inside a possibly nested message.

    Each part's decoded payload is read as UTF-8 and split into lines. The
    first part whose lines are non-empty and all match the log-line pattern
    (at least seven digits, a space, free text, then a bracketed section) is
    handed to ``parse_timepie``.

    Args:
        msg: The message to search.

    Returns:
        Whatever ``parse_timepie`` returns for the matching lines.

    Raises:
        ValueError: If no part looks like a timepie log.
    """
    log_line = r'^[0-9]{7,} [^\[]*\[.*\]$'
    for part in msg.walk():
        raw = part.get_payload(decode=True)
        if not raw:
            continue
        lines = raw.decode('utf-8').splitlines()
        if not lines:
            continue
        if any(re.match(log_line, line) is None for line in lines):
            # At least one line is not a log entry; not the part we want.
            continue
        return parse_timepie(lines)
    raise ValueError('given message has no timepie.log part')
Ejemplo n.º 20
0
    def resolve_attachments(message: email.message.Message):
        """Collect the attachments of ``message`` as ``IMAPAttachment`` objects.

        Multipart containers and parts without a ``Content-Disposition``
        header are ignored. For every remaining part that has a file name, an
        ``IMAPAttachment`` is created from the soft-decoded name and the
        decoded payload is written into it.

        Args:
            message: The message to scan.

        Returns:
            list: The attachments found, in the order they were visited.
        """
        found = []
        for part in message.walk():
            if part.get_content_maintype() == 'multipart':
                continue
            if part.get('Content-Disposition') is None:
                continue
            decoded_name = IMAPEmail.soft_decode(part.get_filename())
            if not decoded_name:
                continue
            # The name is passed through soft_decode a second time, as before.
            attachment = IMAPAttachment(IMAPEmail.soft_decode(decoded_name))
            attachment.write(part.get_payload(decode=True))
            found.append(attachment)
        return found
Ejemplo n.º 21
0
 def from_email(cls, m: email.message.Message):
     """Create a ``Message`` from the headers of the e-mail ``m``.

     Missing headers default to an empty string (an empty list for ``cc``).
     ``from`` and ``to`` are parsed into single address pairs, ``cc`` into a
     list of address pairs, and the date is taken from ``_parsedate``.

     Args:
         m: The parsed e-mail to read headers from.

     Returns:
         Message: The new instance. Note that ``Message`` is constructed
         directly; ``cls`` is not used.
     """
     fields = {
         "message_id": m.get("message-id", ""),
         "in_reply_to": m.get("in-reply-to", ""),
         "date": _parsedate(m),
         "email_from": email.utils.parseaddr(m.get("from", "")),
         "email_to": email.utils.parseaddr(m.get("to", "")),
         "carbon_copy": email.utils.getaddresses(m.get_all("cc", [])),
         "subject": m.get("subject", ""),
         "reply_to": m.get("reply-to", ""),
     }
     return Message(**fields)
Ejemplo n.º 22
0
def extract_pdf(message: email.message.Message) -> Tuple[str, bytes]:
    """
    Get a PDF from the email.

    The first part whose content type is ``application/pdf`` is used. When no
    part qualifies, an 8-character subject is treated as a registration code
    for the sender.

    Args:
        message: The incoming e-mail.

    Returns:
        ``(filename, filebytes)`` for the first PDF part; the file name
        defaults to ``"Remailable_Attachment.pdf"``.
        NOTE(review): the registration branch returns ``True`` instead of a
        tuple; kept as-is because callers may rely on it.

    Raises:
        ValueError: If there is no PDF part and the subject is not an
            8-character code.

    TODO: This is the thing to change to accommodate more than one PDF per msg.
    """
    filename = None
    filebytes = None
    for part in message.walk():
        # get_content_type() never returns None and ignores header
        # parameters, so parts without a Content-Type header no longer crash
        # and "application/pdf" without a trailing ";" is recognised.
        if part.get_content_type() == "application/pdf":
            filename = part.get_filename() or "Remailable_Attachment.pdf"
            filebytes = base64.b64decode(part.get_payload())
            break
    else:
        # Let's try getting the subjectline and body and see if there's a code
        # for us to gobble up in there :)
        code = message.get("Subject")
        if code and len(code) == 8:
            register_user(message.get("From"), code)
            return True
        raise ValueError("No PDF in this message.")

    return (filename, filebytes)
Ejemplo n.º 23
0
def extract_pdf(message: email.message.Message) -> Tuple[str, bytes]:
    """
    Get a PDF from the email.

    Handling order:
      1. A subject containing "unsubscribe" removes the sender.
      2. The first part whose content type is ``application/pdf`` is
         returned.
      3. Without a PDF, an 8-character subject is treated as a verification
         code for the sender; anything else triggers an error e-mail.

    Args:
        message: The incoming e-mail.

    Returns:
        ``(filename, filebytes)`` for the first PDF part (the file name
        defaults to ``"Remailable_Attachment.pdf"``), or ``(False, False)``
        when the message was handled without yielding a PDF.

    TODO: This is the thing to change to accommodate more than one PDF per msg.
    """

    # Handle unsubscribes. The Subject header may be absent, in which case
    # there is nothing to lower-case.
    subject = message.get("Subject")
    if subject and "unsubscribe" in subject.lower():
        plog(f"Permanently removing user {message.get('From')}.")
        delete_user(message.get("From"))
        return (False, False)

    filename = None
    filebytes = None
    for part in message.walk():
        # get_content_type() never returns None and ignores header
        # parameters, so parts without a Content-Type header no longer crash
        # and "application/pdf" without a trailing ";" is recognised.
        if part.get_content_type() == "application/pdf":
            filename = part.get_filename() or "Remailable_Attachment.pdf"
            filebytes = base64.b64decode(part.get_payload())
            break
    else:
        # Let's try getting the subjectline and body and see if there's a code
        # for us to gobble up in there :)
        code = message.get("Subject")
        if code and len(code) == 8:
            register_user(message.get("From"), code)
            plog(f"Registered a new user {message.get('From')}.")
            send_email_if_enabled(
                message.get("From"),
                subject="Your email address is now verified!",
                message=
                "Your verification succeeded, and you can now email documents to your reMarkable tablet. Try responding to this email with a PDF attachment!",
            )
            return (False, False)
        else:
            send_email_if_enabled(
                message.get("From"),
                subject="A problem with your document :(",
                message=
                "Unfortunately, a problem occurred while processing your email. Remailable only supports PDF attachments for now. If you're still encountering issues, please get in touch with Jordan at [email protected] or on Twitter at @j6m8.",
            )
            plog(
                f"ERROR: Encountered no PDF in message from {message.get('From')}"
            )
            return (False, False)

    return (filename, filebytes)
Ejemplo n.º 24
0
 def __init__(self, message: email.message.Message, uid=None):
     """Parse a raw e-mail and expose its headers, attachments and body.

     Args:
         message: The raw message. It is passed to
             ``email.message_from_bytes``, so bytes are expected here
             despite the annotation.
         uid: Optional identifier stored unchanged on ``self.uid``.
     """

     def _split_addresses(field):
         # Drop line breaks, turn tabs into spaces, then split on ', '.
         flattened = parsed.get(field, '')
         for old, new in (('\n', ''), ('\r', ''), ('\t', ' ')):
             flattened = flattened.replace(old, new)
         return flattened.split(', ')

     self.msgraw = message
     parsed = email.message_from_bytes(message)
     self.uid = uid
     self.recipients = _split_addresses('to')
     self.sender = parsed.get('from', '')
     self.subject = self.soft_decode(parsed.get('subject', ''))
     self.date = parsed.get('date', '')
     self.cc = _split_addresses('cc')
     self.bcc = _split_addresses('bcc')
     self.server_uid = parsed.get('message-id', '').strip()
     self.attachments = self.resolve_attachments(parsed)
     self.body = self.decode_body(parsed) or ''
     # decode_body yields a BeautifulSoup object for HTML bodies.
     self.type = 'html' if isinstance(self.body, BeautifulSoup) else 'text'
Ejemplo n.º 25
0
def prepare_multipart_part_attachment(
        msg: email.message.Message,
        counter: int = 0,
        include_attachment_data: bool = False) -> typing.Dict[str, typing.Any]:
    """Build the attachment record for a single message part.

    A part is treated as an attachment when it has a content-disposition
    header whose disposition is not ``inline``, or when its main content type
    is not ``text``.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): Used to generate a ``part-NNN`` file name
            when the part declares none. Default = 0.
        include_attachment_data (bool, optional): If true, the base64-encoded
            payload is included under ``raw``. Default = False.

    Returns:
        dict: Empty when the part is not an attachment. Otherwise a single
            entry keyed by a freshly generated UUID string, holding the file
            name, size, extension (if any), hash, detected mime types (when
            the magic detectors are available), optional raw data and the
            part's headers.
    """
    # In case we hit bug 27257, retry the header scan with the compat32
    # policy and put the original policy back afterwards.
    try:
        header_names = {name.lower() for name, _ in msg.items()}
    except AttributeError:
        saved_policy = msg.policy
        msg.policy = email.policy.compat32
        header_names = {name.lower() for name, _ in msg.items()}
        msg.policy = saved_policy

    flagged_by_disposition = ('content-disposition' in header_names
                              and msg.get_content_disposition() != 'inline')
    if not (flagged_by_disposition or msg.get_content_maintype() != 'text'):
        return {}

    # It's an attachment-type: fetch the data and its size in bytes.
    if msg.get_content_type() == 'message/rfc822':
        nested = msg.get_payload()
        if len(nested) > 1:
            logger.warning(
                'More than one payload for "message/rfc822" part detected. This is not supported, please report!'
            )
        data = bytes(nested[0])
    else:
        # With decode=True the payload is always bytes here.
        data = msg.get_payload(decode=True)
    file_size = len(data)

    declared_name = msg.get_filename('')
    if declared_name == '':
        filename = 'part-{0:03d}'.format(counter)
    else:
        filename = eml_parser.decode.decode_field(declared_name)

    file_id = str(uuid.uuid1())
    record = {'filename': filename, 'size': file_size}  # type: typing.Dict[str, typing.Any]

    # splitext always yields the extension as its second element, or an
    # empty string when there is none.
    suffix = os.path.splitext(filename)[1].lower()
    if suffix:
        # strip leading dot
        record['extension'] = suffix[1:]

    record['hash'] = get_file_hash(data)

    if magic_mime is not None and magic_none is not None:
        mime_long = magic_none.buffer(data)
        mime_short = magic_mime.buffer(data)

        if mime_long is None or mime_short is None:
            logger.warning(
                'Error determining attachment mime-type - "{}"'.format(
                    file_id))
        else:
            record['mime_type'] = mime_long
            record['mime_type_short'] = mime_short

    if include_attachment_data:
        record['raw'] = base64.b64encode(data)

    # Group header values by lower-cased header name.
    content_header = {}  # type: typing.Dict[str, typing.List[str]]
    for name, value in msg.items():
        content_header.setdefault(name.lower(), []).append(str(value))
    record['content_header'] = content_header

    return {file_id: record}
Ejemplo n.º 26
0
def _parsedate(m: email.message.Message) -> datetime.datetime:
    parsed = email.utils.parsedate_to_datetime(m.get("date", ""))
    return parsed.astimezone(datetime.timezone.utc)
Ejemplo n.º 27
0
def prepare_multipart_part_attachment(msg: email.message.Message, counter: int = 0,
                                      include_attachment_data: bool = False) -> typing.Dict[str, typing.Any]:
    """Build the attachment record for a single message part.

    A part is treated as an attachment when it carries a content-disposition
    header or when its main content type is not ``text``.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): Used to generate a ``part-NNN`` file name
            when the part declares none. Default = 0.
        include_attachment_data (bool, optional): If true, the base64-encoded
            payload is included under ``raw``. Default = False.

    Returns:
        dict: Empty when the part is not an attachment. Otherwise a single
            entry keyed by a freshly generated UUID string, holding the file
            name, size, extension (if any), hash, detected mime types (when
            the magic detectors are available), optional raw data and the
            part's headers.
    """
    # In case we hit bug 27257, retry the header scan with the compat32
    # policy and put the original policy back afterwards.
    try:
        seen_headers = {key.lower() for key, _ in msg.items()}
    except AttributeError:
        previous_policy = msg.policy
        msg.policy = email.policy.compat32
        seen_headers = {key.lower() for key, _ in msg.items()}
        msg.policy = previous_policy

    if 'content-disposition' not in seen_headers and msg.get_content_maintype() == 'text':
        # Plain text without a disposition header: not an attachment.
        return {}

    # It's an attachment-type: fetch the data and its size in bytes.
    if msg.get_content_type() == 'message/rfc822':
        embedded = msg.get_payload()
        if len(embedded) > 1:
            logger.warning(
                'More than one payload for "message/rfc822" part detected. This is not supported, please report!')
        data = bytes(embedded[0])
    else:
        # With decode=True the payload is always bytes here.
        data = msg.get_payload(decode=True)
    file_size = len(data)

    header_filename = msg.get_filename('')
    if header_filename == '':
        filename = 'part-{0:03d}'.format(counter)
    else:
        filename = eml_parser.decode.decode_field(header_filename)

    file_id = str(uuid.uuid1())
    entry = {'filename': filename, 'size': file_size}  # type: typing.Dict[str, typing.Any]

    # splitext always yields the extension as its second element, or an
    # empty string when there is none.
    dotted_extension = os.path.splitext(filename)[1].lower()
    if dotted_extension:
        # strip leading dot
        entry['extension'] = dotted_extension[1:]

    entry['hash'] = get_file_hash(data)

    if magic_mime is not None and magic_none is not None:
        full_mime = magic_none.buffer(data)
        short_mime = magic_mime.buffer(data)

        if full_mime is None or short_mime is None:
            logger.warning('Error determining attachment mime-type - "{}"'.format(file_id))
        else:
            entry['mime_type'] = full_mime
            entry['mime_type_short'] = short_mime

    if include_attachment_data:
        entry['raw'] = base64.b64encode(data)

    # Group header values by lower-cased header name.
    grouped_headers = {}  # type: typing.Dict[str, typing.List[str]]
    for key, value in msg.items():
        grouped_headers.setdefault(key.lower(), []).append(str(value))
    entry['content_header'] = grouped_headers

    return {file_id: entry}
Ejemplo n.º 28
0
def get_tls_report(msg: email.message.Message) -> Any:
    """Return the parsed JSON TLS report carried by ``msg``.

    Every part of the message is visited. The first part of type
    ``application/tlsrpt+gzip`` (decompressed first) or
    ``application/tlsrpt+json`` is parsed as JSON and returned.

    Args:
        msg: The message to search.

    Returns:
        The decoded JSON document, or ``None`` when no part qualifies.
    """
    for part in msg.walk():
        content_type = part.get_content_type()
        if content_type == "application/tlsrpt+gzip":
            compressed = part.get_payload(decode=True)
            return json.loads(gzip.decompress(compressed))
        if content_type == "application/tlsrpt+json":
            return json.loads(part.get_payload(decode=True))
    return None
Ejemplo n.º 29
0
def traverse_multipart(
        msg: email.message.Message,
        counter: int = 0,
        include_attachment_data: bool = False) -> typing.Dict[str, typing.Any]:
    """Recursively collect attachment records from all parts of a message.

    Multipart containers are descended into; each leaf part is treated as an
    attachment when it carries a content-disposition header or when its main
    content type is not ``text``.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): Used to generate a ``part-NNN`` file name
            when a part declares none. It is passed unchanged to every
            recursive call. Default = 0.
        include_attachment_data (bool, optional): If true, the base64-encoded
            payload is included under ``raw``. Default = False.

    Returns:
        dict: One entry per attachment, keyed by a freshly generated UUID
            string, holding the file name, size, extension (if any), hash,
            detected mime types (when the magic detectors are available),
            optional raw data and the part's headers.
    """
    if msg.is_multipart():
        merged = {}  # type: typing.Dict[str, typing.Any]
        for child in msg.get_payload():  # type: ignore
            merged.update(
                traverse_multipart(child, counter, include_attachment_data))  # type: ignore
        return merged

    # In case we hit bug 27257, retry the header scan with the compat32
    # policy and put the original policy back afterwards.
    try:
        header_names = {key.lower() for key, _ in msg.items()}
    except AttributeError:
        saved_policy = msg.policy
        msg.policy = email.policy.compat32
        header_names = {key.lower() for key, _ in msg.items()}
        msg.policy = saved_policy

    if 'content-disposition' not in header_names and msg.get_content_maintype() == 'text':
        # Plain text without a disposition header: not an attachment.
        return {}

    # It's an attachment-type: fetch the data and its size in bytes.
    # With decode=True the payload is always bytes here.
    data = msg.get_payload(decode=True)
    file_size = len(data)

    header_filename = msg.get_filename('')
    if header_filename == '':
        filename = 'part-{0:03d}'.format(counter)
    else:
        filename = eml_parser.decode.decode_field(header_filename)

    file_id = str(uuid.uuid1())
    entry = {'filename': filename, 'size': file_size}  # type: typing.Dict[str, typing.Any]

    # splitext always yields the extension as its second element, or an
    # empty string when there is none.
    dotted_extension = os.path.splitext(filename)[1].lower()
    if dotted_extension:
        # strip leading dot
        entry['extension'] = dotted_extension[1:]

    entry['hash'] = get_file_hash(data)

    if magic_mime is not None and magic_none is not None:
        entry['mime_type'] = magic_none.buffer(data)
        entry['mime_type_short'] = magic_mime.buffer(data)

    if include_attachment_data:
        entry['raw'] = base64.b64encode(data)

    # Group header values by lower-cased header name.
    grouped_headers = {}  # type: typing.Dict[str, typing.List[str]]
    for key, value in msg.items():
        grouped_headers.setdefault(key.lower(), []).append(str(value))
    entry['content_header'] = grouped_headers

    return {file_id: entry}
Ejemplo n.º 30
0
def extract_message_content(mime_msg: email.message.Message) -> str:
    """Return the text of the first ``text/*`` part of ``mime_msg``.

    For a multipart message the direct sub-parts are inspected in order; a
    non-multipart message is treated as its own single part.

    Args:
        mime_msg: The message to read.

    Returns:
        The decoded text of the first text part, or ``''`` when there is
        none.
    """
    # The payload of a non-multipart message is a string, so iterating it
    # would yield single characters rather than parts.
    parts = mime_msg.get_payload() if mime_msg.is_multipart() else [mime_msg]
    for part in parts:
        if part.get_content_maintype() != 'text':
            continue
        raw = part.get_payload(decode=True)
        if raw is None:
            # No decodable payload on this part.
            continue
        declared = part.get_content_charset()
        if declared:
            try:
                return raw.decode(declared)
            except (LookupError, UnicodeDecodeError):
                # Unknown or wrong charset declaration: fall through to the
                # UTF-8 decoding that was always used before.
                pass
        return raw.decode()
    return ''
Ejemplo n.º 31
0
def parse_email(msg: email.message.Message,
                include_raw_body: bool = False,
                include_attachment_data: bool = False,
                pconf: typing.Optional[dict] = None,
                parse_attachments: bool = True) -> dict:
    """Parse an e-mail and return a dictionary containing the various parts of
    the e-mail broken down into key-value pairs.

    Args:
      msg (str): Raw EML e-mail string.
      include_raw_body (bool, optional): If True, includes the raw body in the resulting
                               dictionary. Defaults to False.
      include_attachment_data (bool, optional): If True, includes the full attachment
                                                data in the resulting dictionary.
                                                Defaults to False.
      pconf (dict, optional): A dict with various optional configuration parameters,
                              e.g. whitelist IPs, whitelist e-mail addresses, etc.

      parse_attachments (bool, optional): Set this to false if you want to disable the parsing of attachments.
                                          Please note that HTML attachments as well as other text data marked to be
                                          in-lined, will always be parsed.

    Returns:
      dict: A dictionary with the content of the EML parsed and broken down into
            key-value pairs.
    """
    header = {}  # type: typing.Dict[str, typing.Any]
    report_struc = {}  # type: typing.Dict[str, typing.Any]  # Final structure
    headers_struc = {
    }  # type: typing.Dict[str, typing.Any]  # header_structure
    bodys_struc = {}  # type: typing.Dict[str, typing.Any]  # body structure

    # If no pconf was specified, default to empty dict
    pconf = pconf or {}

    # If no whitelisting is required, set to emtpy list
    if 'whiteip' not in pconf:
        pconf['whiteip'] = []
    # If no whitelisting is required, set to emtpy list
    if 'whitefor' not in pconf:
        pconf['whitefor'] = []

    # parse and decode subject
    subject = msg.get('subject', '')
    headers_struc['subject'] = eml_parser.decode.decode_field(subject)

    # If parsing had problems, report it
    if msg.defects:
        headers_struc['defect'] = []
        for exception in msg.defects:
            headers_struc['defect'].append(str(exception))

    # parse and decode "from"
    # @TODO verify if this hack is necessary for other e-mail fields as well
    try:
        msg_header_field = str(msg.get('from', '')).lower()
    except (IndexError, AttributeError):
        # We have hit current open issue #27257
        # https://bugs.python.org/issue27257
        # The field will be set to emtpy as a workaround.
        #
        logger.exception('We hit bug 27257!')

        _from = eml_parser.decode.workaround_bug_27257(msg, 'from')
        msg.__delitem__('from')

        if _from:
            msg.add_header('from', _from[0])
            __from = _from[0].lower()
        else:
            msg.add_header('from', '')
            __from = ''

        msg_header_field = __from

    if msg_header_field != '':
        m = eml_parser.regex.email_regex.search(msg_header_field)
        if m:
            headers_struc['from'] = m.group(1)
        else:
            from_ = email.utils.parseaddr(msg.get('from', '').lower())
            headers_struc['from'] = from_[1]

    # parse and decode "to"
    headers_struc['to'] = headeremail2list(msg, 'to')
    # parse and decode "cc"
    headers_struc['cc'] = headeremail2list(msg, 'cc')
    if not headers_struc['cc']:
        headers_struc.pop('cc')

    # parse and decode delivered-to
    headers_struc['delivered_to'] = headeremail2list(msg, 'delivered-to')
    if not headers_struc['delivered_to']:
        headers_struc.pop('delivered_to')

    # parse and decode Date
    # If date field is present
    if 'date' in msg:
        try:
            headers_struc['date'] = eml_parser.decode.robust_string2date(
                msg.get('date'))
        except (TypeError, Exception):
            logger.warning('Error parsing date.')
            headers_struc['date'] = dateutil.parser.parse(
                '1970-01-01T00:00:00+0000')
            msg.replace_header('date', headers_struc['date'])
    else:
        # If date field is absent...
        headers_struc['date'] = dateutil.parser.parse(
            '1970-01-01T00:00:00+0000')

    # mail receiver path / parse any domain, e-mail
    # @TODO parse case where domain is specified but in parentheses only an IP
    headers_struc['received'] = []
    headers_struc['received_email'] = []
    headers_struc['received_domain'] = []
    headers_struc['received_ip'] = []
    try:
        found_smtpin = collections.Counter(
        )  # type: collections.Counter  # Array for storing potential duplicate "HOP"

        for received_line in msg.get_all('received', []):
            line = str(received_line).lower()

            received_line_flat = re.sub(r'(\r|\n|\s|\t)+',
                                        ' ',
                                        line,
                                        flags=re.UNICODE)

            # Parse and split routing headers.
            # Return dict of list
            #   date string
            #   from list
            #   for list
            #   by list
            #   with string
            #   warning list
            parsed_routing = eml_parser.routing.parserouting(
                received_line_flat)

            # If required collect the IP of the gateway that have injected the mail.
            # Iterate all parsed item and find IP
            # It is parsed from the MOST recent to the OLDEST (from IN > Out)
            # We match external IP from the most "OUT" Found.
            # Warning .. It may be spoofed !!
            # It add a warning if multiple identical items are found.

            if pconf.get('byhostentry'):
                for by_item in parsed_routing.get('by', []):  # type: ignore
                    for byhostentry_ in pconf['byhostentry']:
                        byhostentry = byhostentry_.lower()
                        # print ("%s %s" % (byhostentry, by_item))
                        if byhostentry in by_item:
                            # Save the last Found.. ( most external )
                            headers_struc['received_src'] = parsed_routing.get(
                                'from')

                            # Increment watched by detection counter, and warn if needed
                            found_smtpin[byhostentry] += 1
                            if found_smtpin[
                                    byhostentry] > 1:  # Twice found the header...
                                if parsed_routing.get('warning'):
                                    parsed_routing['warning'].append(
                                        ['Duplicate SMTP by entrypoint'])
                                else:
                                    parsed_routing['warning'] = [
                                        'Duplicate SMTP by entrypoint'
                                    ]

            headers_struc['received'].append(parsed_routing)

            # Parse IPs in "received headers"
            ips_in_received_line = eml_parser.regex.ipv6_regex.findall(received_line_flat) + \
                                   eml_parser.regex.ipv4_regex.findall(received_line_flat)
            for ip in ips_in_received_line:
                try:
                    ip_obj = ipaddress.ip_address(
                        ip
                    )  # type: ignore  # type of findall is list[str], so this is correct
                except ValueError:
                    logger.debug(
                        'Invalid IP in received line - "{}"'.format(ip))
                else:
                    if not (ip_obj.is_private
                            or str(ip_obj) in pconf['whiteip']):
                        headers_struc['received_ip'].append(str(ip_obj))

            # search for domain
            for m in eml_parser.regex.recv_dom_regex.findall(
                    received_line_flat):
                try:
                    ip_obj = ipaddress.ip_address(
                        m
                    )  # type: ignore  # type of findall is list[str], so this is correct
                except ValueError:
                    # we find IPs using the previous IP crawler, hence we ignore them
                    # here.
                    # iff the regex fails, we add the entry
                    headers_struc['received_domain'].append(m)

            # search for e-mail addresses
            for mail_candidate in eml_parser.regex.email_regex.findall(
                    received_line_flat):
                if mail_candidate not in parsed_routing.get('for', []):
                    headers_struc['received_email'] += [mail_candidate]

    except TypeError:  # Ready to parse email without received headers.
        logger.exception('Exception occured while parsing received lines.')

    # Concatenate for emails into one array | uniq
    # for rapid "find"
    headers_struc['received_foremail'] = []
    if 'received' in headers_struc:
        for _parsed_routing in headers_struc['received']:
            for itemfor in _parsed_routing.get('for', []):
                if itemfor not in pconf['whitefor']:
                    headers_struc['received_foremail'].append(itemfor)

    # Uniq data found
    headers_struc['received_email'] = list(set(
        headers_struc['received_email']))
    headers_struc['received_domain'] = list(
        set(headers_struc['received_domain']))
    headers_struc['received_ip'] = list(set(headers_struc['received_ip']))

    # Clean up if empty
    if not headers_struc['received_email']:
        del headers_struc['received_email']

    if 'received_foremail' in headers_struc:
        if not headers_struc['received_foremail']:
            del headers_struc['received_foremail']
        else:
            headers_struc['received_foremail'] = list(
                set(headers_struc['received_foremail']))

    if not headers_struc['received_domain']:
        del headers_struc['received_domain']

    if not headers_struc['received_ip']:
        del headers_struc['received_ip']
    ####################

    # Parse text body
    raw_body = get_raw_body_text(msg)

    if include_raw_body:
        bodys_struc['raw_body'] = raw_body

    bodys = {}

    # Is it a multipart email ?
    if len(raw_body) == 1:
        multipart = False
    else:
        multipart = True

    for body_tup in raw_body:
        bodie = {}  # type: typing.Dict[str, typing.Any]
        _, body, body_multhead = body_tup
        # Parse any URLs and mail found in the body
        list_observed_urls = []  # type: typing.List[str]
        list_observed_email = []  # type: typing.List[str]
        list_observed_dom = []  # type: typing.List[str]
        list_observed_ip = []  # type: typing.List[str]

        # If we start directly a findall on 500K+ body we got time and memory issues...
        # if more than 4K.. lets cheat, we will cut around the thing we search "://, @, ."
        # in order to reduce regex complexity.
        if len(body) < 4096:
            list_observed_urls = get_uri_ondata(body)
            for match in eml_parser.regex.email_regex.findall(body):
                list_observed_email.append(match.lower())
            for match in eml_parser.regex.dom_regex.findall(body):
                list_observed_dom.append(match.lower())
            for match in eml_parser.regex.ipv4_regex.findall(body):
                if not eml_parser.regex.priv_ip_regex.match(match):
                    if match not in pconf['whiteip']:
                        list_observed_ip.append(match)
            for match in eml_parser.regex.ipv6_regex.findall(body):
                if not eml_parser.regex.priv_ip_regex.match(match):
                    if match.lower() not in pconf['whiteip']:
                        list_observed_ip.append(match.lower())
        else:
            for scn_pt in findall('://', body):
                list_observed_urls = get_uri_ondata(
                    body[scn_pt - 16:scn_pt + 4096]) + list_observed_urls

            for scn_pt in findall('@', body):
                # RFC 3696, 5322, 5321 for email size limitations
                for match in eml_parser.regex.email_regex.findall(
                        body[scn_pt - 64:scn_pt + 255]):
                    list_observed_email.append(match.lower())

            for scn_pt in findall('.', body):
                # The maximum length of a fqdn, not a hostname, is 1004 characters RFC1035
                # The maximum length of a hostname is 253 characters. Imputed from RFC952, RFC1123 and RFC1035.
                for match in eml_parser.regex.dom_regex.findall(
                        body[scn_pt - 253:scn_pt + 1004]):
                    list_observed_dom.append(match.lower())

                # Find IPv4 addresses
                for match in eml_parser.regex.ipv4_regex.findall(
                        body[scn_pt - 11:scn_pt + 3]):
                    if not eml_parser.regex.priv_ip_regex.match(match):
                        if match not in pconf['whiteip']:
                            list_observed_ip.append(match)

            for scn_pt in findall(':', body):
                # The maximum length of IPv6 is 32 Char + 7 ":"
                for match in eml_parser.regex.ipv6_regex.findall(
                        body[scn_pt - 4:scn_pt + 35]):
                    if not eml_parser.regex.priv_ip_regex.match(match):
                        if match.lower() not in pconf['whiteip']:
                            list_observed_ip.append(match.lower())

        # Report uri,email and observed domain or hash if no raw body
        if include_raw_body:
            if list_observed_urls:
                bodie['uri'] = list(set(list_observed_urls))

            if list_observed_email:
                bodie['email'] = list(set(list_observed_email))

            if list_observed_dom:
                bodie['domain'] = list(set(list_observed_dom))

            if list_observed_ip:
                bodie['ip'] = list(set(list_observed_ip))

        else:
            if list_observed_urls:
                bodie['uri_hash'] = []
                for uri in list(set(list_observed_urls)):
                    bodie['uri_hash'].append(wrap_hash_sha256(uri.lower()))
            if list_observed_email:
                bodie['email_hash'] = []
                for emel in list(set(list_observed_email)):
                    # Email already lowered
                    bodie['email_hash'].append(wrap_hash_sha256(emel))
            if list_observed_dom:
                bodie['domain_hash'] = []
                for uri in list(set(list_observed_dom)):
                    bodie['domain_hash'].append(wrap_hash_sha256(uri.lower()))
            if list_observed_ip:
                bodie['ip_hash'] = []
                for fip in list(set(list_observed_ip)):
                    # IP (v6) already lowered
                    bodie['ip_hash'].append(wrap_hash_sha256(fip))

        # For mail without multipart we will only get the "content....something" headers
        # all other headers are in "header"
        # but we need to convert header tuples in dict..
        # "a","toto"           a: [toto,titi]
        # "a","titi"   --->    c: [truc]
        # "c","truc"
        ch = {}  # type: typing.Dict[str, typing.List]
        for k, v in body_multhead:
            # make sure we are working with strings only
            v = str(v)

            # We are using replace . to : for avoiding issue in mongo
            k = k.lower().replace('.', ':')  # Lot of lowers, pre-compute :) .
            # print v
            if multipart:
                if k in ch:
                    ch[k].append(v)
                else:
                    ch[k] = [v]
            else:  # if not multipart, store only content-xx related header with part
                if k.startswith(
                        'content'):  # otherwise, we got all header headers
                    if k in ch:
                        ch[k].append(v)
                    else:
                        ch[k] = [v]
        bodie['content_header'] = ch  # Store content headers dict

        if include_raw_body:
            bodie['content'] = body

        # Sometimes bad people play with multiple header instances.
        # We "display" the "LAST" one .. as does thunderbird
        val = ch.get('content-type')
        if val:
            header_val = val[-1]
            bodie['content_type'] = header_val.split(';', 1)[0].strip()

        # Hash the body
        bodie['hash'] = hashlib.sha256(body.encode('utf-8')).hexdigest()

        uid = str(uuid.uuid1())
        bodys[uid] = bodie

    bodys_struc = bodys

    # Get all other bulk raw headers
    # "a","toto"           a: [toto,titi]
    # "a","titi"   --->    c: [truc]
    # "c","truc"
    #
    for k in set(msg.keys()):
        # We are using replace . to : for avoiding issue in mongo
        k = k.lower()  # Lot of lower, precompute...
        decoded_values = []

        try:
            for value in msg.get_all(k, []):
                if value:
                    decoded_values.append(value)
        except (IndexError, AttributeError):
            # We have hit current open issue #27257
            # https://bugs.python.org/issue27257
            # The field will be set to emtpy as a workaround.
            logger.exception('We hit bug 27257!')

            decoded_values = eml_parser.decode.workaround_bug_27257_field_value(
                msg, k)

            if k in header:
                header[k] += decoded_values
            else:
                header[k] = decoded_values

        if decoded_values:
            if k in header:
                header[k] += decoded_values
            else:
                header[k] = decoded_values

    headers_struc['header'] = header

    # parse attachments
    if parse_attachments:
        try:
            report_struc['attachment'] = traverse_multipart(
                msg, 0, include_attachment_data)
        except (binascii.Error, AssertionError):
            # we hit this exception if the payload contains invalid data
            logger.exception(
                'Exception occured while parsing attachment data. Collected data will not be complete!'
            )
            report_struc['attachment'] = None

        # Dirty hack... transform hash into list.. need to be done in the function.
        # Mandatory to search efficiently in mongodb
        # See Bug 11 of eml_parser
        if not report_struc['attachment']:
            del report_struc['attachment']
        else:
            newattach = []
            for attachment in report_struc['attachment']:
                newattach.append(report_struc['attachment'][attachment])
            report_struc['attachment'] = newattach

    newbody = []
    for body in bodys_struc:
        newbody.append(bodys_struc[body])
    report_struc['body'] = newbody
    # End of dirty hack

    # Get all other bulk headers
    report_struc['header'] = headers_struc

    return report_struc
Ejemplo n.º 32
0
def get_raw_body_text(
    msg: email.message.Message
) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]:
    """Recursively collect the textual body parts of an e-mail.

    Multipart messages are walked recursively, in payload order. A leaf part is
    treated as belonging to the body when one of the following holds:

    * it has no Content-Disposition header and its main type is ``text``;
    * its filename ends in ``.htm``/``.html`` (so HTML attachments do not
      escape later checks, e.g. the URL scan);
    * its Content-Disposition is ``inline`` and its main type is ``text``.

    Args:
        msg (email.message.Message): The actual e-mail message or sub-message.

    Returns:
        list: A list of tuples in the form
            ``(content_transfer_encoding, decoded_body_string, header_items)``.
    """
    raw_body = [
    ]  # type: typing.List[typing.Tuple[typing.Any, typing.Any,typing.Any]]

    if msg.is_multipart():
        for part in msg.get_payload():  # type: ignore
            raw_body.extend(get_raw_body_text(part))  # type: ignore
        return raw_body

    try:
        filename = msg.get_filename('').lower()
    except (binascii.Error, AssertionError):
        logger.exception(
            'Exception occured while trying to parse the content-disposition header. Collected data will not be complete.'
        )
        filename = ''

    has_disposition = 'content-disposition' in msg
    is_body_part = (
        (not has_disposition and msg.get_content_maintype() == 'text')
        or filename.endswith(('.html', '.htm'))
        or (has_disposition and msg.get_content_disposition() == 'inline'
            and msg.get_content_maintype() == 'text')
    )
    if not is_body_part:
        return raw_body

    # The header value may be a Header object instead of a plain str,
    # so stringify it before lower-casing.
    encoding = str(msg.get('content-transfer-encoding', '')).lower()

    charset = msg.get_content_charset()
    # Undo the transfer encoding exactly once; the result is reused by the
    # fallback decoding below.
    payload = msg.get_payload(decode=True)

    if charset is None:
        raw_body_str = eml_parser.decode.decode_string(payload, None)
    else:
        try:
            raw_body_str = payload.decode(charset, 'ignore')
        except Exception:
            # Best effort: e.g. an unknown charset name; fall back to ASCII.
            logger.debug(
                'An exception occured while decoding the payload!',
                exc_info=True)
            raw_body_str = payload.decode('ascii', 'ignore')

    # In case we hit bug 27257, try to downgrade the used policy
    try:
        header_items = msg.items()
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            header_items = msg.items()
        finally:
            # Always restore the caller's policy, even if the retry fails too.
            msg.policy = former_policy

    raw_body.append((encoding, raw_body_str, header_items))

    return raw_body
Ejemplo n.º 33
0
def get_raw_body_text(msg: email.message.Message) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]:
    """Recursively collect the textual body parts of an e-mail.

    Multipart messages are walked recursively, in payload order. A leaf part is
    treated as belonging to the body when it has no Content-Disposition header
    and its main type is ``text``, or when its filename ends in ``.htm``/``.html``
    (so HTML attachments do not escape later checks, e.g. the URL scan).

    Args:
        msg (email.message.Message): The actual e-mail message or sub-message.

    Returns:
        list: A list of tuples in the form
            ``(content_transfer_encoding, decoded_body_string, header_items)``.
    """
    raw_body = []  # type: typing.List[typing.Tuple[typing.Any, typing.Any,typing.Any]]

    if msg.is_multipart():
        for part in msg.get_payload():  # type: ignore
            raw_body.extend(get_raw_body_text(part))  # type: ignore
        return raw_body

    try:
        filename = msg.get_filename('').lower()
    except (binascii.Error, AssertionError):
        logger.exception(
            'Exception occured while trying to parse the content-disposition header. Collected data will not be complete.')
        filename = ''

    is_body_part = (
        ('content-disposition' not in msg and msg.get_content_maintype() == 'text')
        or filename.endswith(('.html', '.htm'))
    )
    if not is_body_part:
        return raw_body

    # The header value may be a Header object instead of a plain str,
    # so stringify it before lower-casing.
    encoding = str(msg.get('content-transfer-encoding', '')).lower()

    charset = msg.get_content_charset()
    # Undo the transfer encoding exactly once; the result is reused by the
    # fallback decoding below.
    payload = msg.get_payload(decode=True)

    if charset is None:
        raw_body_str = eml_parser.decode.decode_string(payload, None)
    else:
        try:
            raw_body_str = payload.decode(charset, 'ignore')
        except Exception:
            # Best effort: e.g. an unknown charset name; fall back to ASCII.
            logger.debug('An exception occured while decoding the payload!', exc_info=True)
            raw_body_str = payload.decode('ascii', 'ignore')

    # In case we hit bug 27257, try to downgrade the used policy
    try:
        header_items = msg.items()
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            header_items = msg.items()
        finally:
            # Always restore the caller's policy, even if the retry fails too.
            msg.policy = former_policy

    raw_body.append((encoding, raw_body_str, header_items))

    return raw_body