class MailMessage(object):
    """
    Mail message structure.

    Takes a mail in raw RFC 2822 / RFC 5322 format, parses it with
    ``Message`` and exposes its bodies, participants, attachments,
    references, flags and grouped headers.

    Instance attributes set by the constructor:
        raw: the raw data the message was built from.
        mail: the object returned by ``Message(raw_data)``.
        warnings: list of the parser defects found on ``mail``.
        body_plain / body_html: bodies extracted by ``get_bodies``.
    """

    zope.interface.implements(IMessageParser)

    # Headers read, in this order, to build ``participants``.
    recipient_headers = ['From', 'To', 'Cc', 'Bcc']
    message_protocol = 'email'
    # Class-level defaults; every instance gets its own ``warnings`` list
    # in ``__init__`` and its own bodies in ``get_bodies``.
    warnings = []
    body_html = ""
    body_plain = ""

    def __init__(self, raw_data):
        """Parse an RFC2822,5322 mail message.

        :param raw_data: raw message, handed unchanged to ``Message``.
        :raises Exception: whatever ``Message`` raises; the error is
            logged and re-raised with its original traceback.
        """
        self.raw = raw_data
        self._extra_parameters = {}
        self.warnings = []
        try:
            self.mail = Message(raw_data)
        except Exception as exc:
            log.error('Parse message failed %s' % exc)
            # Bare raise keeps the original traceback.
            raise
        if self.mail.defects:
            # XXX what to do ?
            log.warn('Defects on parsed mail %r' % self.mail.defects)
            self.warnings = list(self.mail.defects)
            # Historical (misspelled) attribute, kept for existing readers.
            self.warning = self.mail.defects
        self.get_bodies()

    @staticmethod
    def _decode_text(part):
        """Return the decoded payload of ``part`` passed through to_utf8."""
        charset = part.get_param("charset")
        if isinstance(charset, tuple):
            # RFC 2231 encoded parameter: (charset, language, value).
            charset = unicode(charset[2], charset[0] or "us-ascii")
        return to_utf8(part.get_payload(decode=True), charset)

    def _store_text_part(self, part, default_plain=False):
        """Store ``part`` as html or plain body according to its type.

        With ``default_plain`` set, any type other than text/html is
        stored as the plain body; otherwise only text/plain is.
        """
        content_type = part.get_content_type()
        if content_type == "text/html":
            self.body_html = self._decode_text(part)
        elif default_plain or content_type == "text/plain":
            self.body_plain = self._decode_text(part)

    def get_bodies(self):
        """Extract body alternatives, if any.

        Fills ``self.body_plain`` and ``self.body_html``:

        - without a Content-Type header the decoded payload is stored as
          the plain body;
        - a non multipart mail is stored as html when it is text/html and
          as plain otherwise;
        - a multipart/encrypted mail made of exactly two parts stores the
          payload of the second part, undecoded, as the plain body;
        - otherwise top level parts are scanned for text/plain and
          text/html, and scanning stops after the first nested multipart
          container.
        """
        if not self.mail.get("Content-Type", None):
            self.body_plain = self.mail.get_payload(decode=True)
            return

        if not self.mail.is_multipart():
            self._store_text_part(self.mail, default_plain=True)
            return

        if self.mail.get_content_subtype() == 'encrypted':
            parts = self.mail.get_payload()
            if len(parts) == 2:
                self.body_plain = parts[1].get_payload()
                return
            log.warn('Encrypted message with invalid parts count')

        for top_level_part in self.mail.get_payload():
            if top_level_part.get_content_maintype() != "multipart":
                self._store_text_part(top_level_part)
                continue
            if not top_level_part.is_multipart():
                # Declared multipart but the payload is not a list of
                # parts: iterating it would fail, so skip it.
                log.warn('Multipart part without sub-parts, ignoring it')
                continue
            for alternative in top_level_part.get_payload():
                self._store_text_part(alternative)
            # Only the first nested multipart container is inspected.
            break

    @property
    def subject(self):
        """Mail subject.

        Every chunk returned by ``decode_header`` is decoded and the
        chunks are joined with a single space. As before, the result is
        an utf-8 encoded string when the first chunk declares a charset
        and a unicode string otherwise. A missing Subject header gives
        an empty string.
        """
        raw_subject = self.mail.get('Subject')
        if raw_subject is None:
            return ''
        chunks = decode_header(raw_subject)
        if not chunks:
            return ''
        decoded = []
        for value, charset in chunks:
            if charset is None:
                decoded.append(value.decode('utf-8', 'ignore'))
                continue
            try:
                decoded.append(value.decode(charset, "replace"))
            except LookupError:
                # Charset name unknown to the codec registry.
                log.warn('Unknown subject charset %r' % charset)
                decoded.append(value.decode('utf-8', 'replace'))
        subject = u' '.join(decoded)
        if chunks[0][1] is not None:
            return subject.encode("utf-8", "replace")
        return subject

    @property
    def size(self):
        """Get mail size: length of the message serialized as a string."""
        return len(self.mail.as_string())

    @property
    def external_references(self):
        """Return mail references to be used as external references.

        Makes use of these RFC5322 headers:
            message-id
            in-reply-to
            references
        Header strings are pruned to extract the address-like identifier
        only; when pruning Message-Id or In-Reply-To gives nothing, the
        raw header value is kept.

        :return: dict with ``message_id``, ``parent_id`` and
            ``ancestors_ids`` (a list, possibly empty).
        """
        ext_id = self.mail.get('Message-Id')
        parent_id = self.mail.get('In-Reply-To')
        ref = self.mail.get_all("References")
        ref_ids = [address[1] for address in getaddresses(ref)] if ref else []

        mid = clean_email_address(ext_id)[1] if ext_id else None
        if not mid:
            log.error('Unable to find correct message_id {}'.format(ext_id))
            mid = ext_id
        pid = clean_email_address(parent_id)[1] if parent_id else None
        if not pid:
            pid = parent_id
        return {'message_id': mid, 'parent_id': pid, 'ancestors_ids': ref_ids}

    @property
    def date(self):
        """Get UTC date from a mail message.

        :return: a naive datetime expressed in UTC built from the Date
            header; when the header is missing or invalid, the current
            time as a UTC-aware datetime.
        """
        # NOTE(review): the two return paths differ in awareness (naive
        # vs tz-aware); kept as is for existing callers.
        mail_date = self.mail.get('Date')
        if mail_date:
            try:
                # parsedate_tz returns None on failure, which makes
                # mktime_tz raise TypeError.
                timestamp = mktime_tz(parsedate_tz(mail_date))
                # mktime_tz gives a UTC timestamp; utcfromtimestamp keeps
                # it in UTC where fromtimestamp would shift to local time.
                return datetime.datetime.utcfromtimestamp(timestamp)
            except (TypeError, ValueError, OverflowError):
                log.error('Invalid date in mail {}'.format(mail_date))
        log.debug('No date on mail using now (UTC)')
        return datetime.datetime.now(tz=pytz.utc)

    @property
    def participants(self):
        """Mail participants.

        Builds one ``MailParticipant`` per address found in the first
        occurrence of each header listed in ``recipient_headers``.
        Entries without an ``@`` (such as ``undisclosed-recipients:;``)
        are skipped, and participants ending up with neither address nor
        label are logged and dropped.
        """
        from email.utils import formataddr

        participants = []
        for header in self.recipient_headers:
            participant_type = header.capitalize()
            value = self.mail.get(header)
            if not value:
                continue
            # getaddresses copes with quoted display names containing
            # commas and with bare, bracket-less addresses.
            for name, email_address in getaddresses([value]):
                if '@' not in email_address:
                    continue
                addr = formataddr((name, email_address))
                participant = MailParticipant(participant_type, addr.lower())
                if participant.address == '' and participant.label == '':
                    log.warn('Invalid email address {}'.format(addr))
                else:
                    participants.append(participant)
        return participants

    @property
    def attachments(self):
        """Extract parts which we consider as attachments.

        A ``pgp-encrypted`` part is never returned as an attachment; it
        sets ``encrypted: pgp`` in the extra parameters instead.
        """
        if not self.mail.is_multipart():
            return []
        attchs = []
        for p in walk_with_boundary(self.mail, ""):
            if p.is_multipart():
                continue
            if p.get_content_subtype() == 'pgp-encrypted':
                # Special consideration. Do not present it as an attachment
                # but set _extra_parameters accordingly
                self._extra_parameters.update({'encrypted': 'pgp'})
                continue
            if MailAttachment.is_attachment(p):
                attchs.append(MailAttachment(p))
        return attchs

    @property
    def extra_parameters(self):
        """Mail message extra parameters.

        :return: the instance dict of extra parameters, holding ``lists``
            (identifiers taken from the List-ID headers) and, for PGP
            encrypted mails, ``encrypted``.
        """
        lists = self.mail.get_all("List-ID")
        lists_ids = [address[1] for address in getaddresses(lists)] \
            if lists else []
        self._extra_parameters.update({'lists': lists_ids})
        if 'encrypted' not in self._extra_parameters \
                and self.mail.is_multipart():
            # Do not depend on ``attachments`` having been read first.
            # NOTE(review): assumes walk() visits the same parts as
            # walk_with_boundary() - confirm.
            for part in self.mail.walk():
                if not part.is_multipart() and \
                        part.get_content_subtype() == 'pgp-encrypted':
                    self._extra_parameters['encrypted'] = 'pgp'
                    break
        return self._extra_parameters

    # Others parameters specific for mail message

    @property
    def headers(self):
        """Extract all headers into a dict.

        Headers may be duplicated: they are grouped by name, each name
        being mapped to the list of its values in message order.
        """
        headers = {}
        # Sorted by name so that keys are inserted in the same order as
        # the previous sort + groupby implementation; the sort is stable,
        # so values keep their message order.
        for name, value in sorted(self.mail.items(),
                                  key=lambda item: item[0]):
            headers.setdefault(name, []).append(value)
        return headers

    @property
    def external_flags(self):
        """
        Get headers added by our fetcher that represent flags or labels
        set by external provider,
        returned as list of tags
        """
        tags = []
        for header in ['X-Fetched-Imap-Flags', 'X-Fetched-X-GM-LABELS']:
            enc_flags = self.mail.get(header)
            if not enc_flags:
                continue
            # NOTE(review): base64.decodestring only exists on Python 2
            # and Python 3 < 3.9.
            flags_str = base64.decodestring(enc_flags)
            for flag in flags_str.split('\r\n'):
                # Empty entries (empty value, trailing separator) would
                # create a nameless tag.
                if not flag or flag in EXCLUDED_EXT_FLAGS:
                    continue
                tag = Tag()
                tag.name = flag
                tag.label = flag
                tag.type = 'imported'
                tags.append(tag)
        return tags
Exemple #2
0
class MailMessage(object):
    """
    Mail message structure.

    Takes a mail in raw RFC 2822 / RFC 5322 format, parses it with
    ``Message`` and exposes its bodies, participants, attachments,
    references, discussion lookup sequence and grouped headers.

    Instance attributes set by the constructor:
        raw: the raw data the message was built from.
        mail: the object returned by ``Message(raw_data)``.
        warnings: list of the parser defects found on ``mail``.
        body_plain / body_html: bodies extracted by ``get_bodies``.
    """

    zope.interface.implements(IMessageParser)

    # Headers read, in this order, to build ``participants``.
    recipient_headers = ['From', 'To', 'Cc', 'Bcc']
    message_protocol = 'email'
    # Class-level defaults; every instance gets its own ``warnings`` list
    # in ``__init__`` and its own bodies in ``get_bodies``.
    warnings = []
    body_html = ""
    body_plain = ""

    def __init__(self, raw_data):
        """Parse an RFC2822,5322 mail message.

        :param raw_data: raw message, handed unchanged to ``Message``.
        :raises Exception: whatever ``Message`` raises; the error is
            logged and re-raised with its original traceback.
        """
        self.raw = raw_data
        self.warnings = []
        try:
            self.mail = Message(raw_data)
        except Exception as exc:
            log.error('Parse message failed %s' % exc)
            # Bare raise keeps the original traceback.
            raise
        if self.mail.defects:
            # XXX what to do ?
            log.warn('Defects on parsed mail %r' % self.mail.defects)
            self.warnings = list(self.mail.defects)
            # Historical (misspelled) attribute, kept for existing readers.
            self.warning = self.mail.defects
        self.get_bodies()

    @staticmethod
    def _decode_text(part):
        """Return the decoded payload of ``part`` passed through to_utf8."""
        charset = part.get_param("charset")
        if isinstance(charset, tuple):
            # RFC 2231 encoded parameter: (charset, language, value).
            charset = unicode(charset[2], charset[0] or "us-ascii")
        return to_utf8(part.get_payload(decode=True), charset)

    def _store_text_part(self, part, default_plain=False):
        """Store ``part`` as html or plain body according to its type.

        With ``default_plain`` set, any type other than text/html is
        stored as the plain body; otherwise only text/plain is.
        """
        content_type = part.get_content_type()
        if content_type == "text/html":
            self.body_html = self._decode_text(part)
        elif default_plain or content_type == "text/plain":
            self.body_plain = self._decode_text(part)

    def get_bodies(self):
        """Extract body alternatives, if any.

        Fills ``self.body_plain`` and ``self.body_html``:

        - without a Content-Type header the decoded payload is stored as
          the plain body;
        - a non multipart mail is stored as html when it is text/html and
          as plain otherwise;
        - for a multipart mail, top level parts are scanned for
          text/plain and text/html, and scanning stops after the first
          nested multipart container.
        """
        if not self.mail.get("Content-Type", None):
            self.body_plain = self.mail.get_payload(decode=True)
            return

        if not self.mail.is_multipart():
            self._store_text_part(self.mail, default_plain=True)
            return

        for top_level_part in self.mail.get_payload():
            if top_level_part.get_content_maintype() != "multipart":
                self._store_text_part(top_level_part)
                continue
            if not top_level_part.is_multipart():
                # Declared multipart but the payload is not a list of
                # parts: iterating it would fail, so skip it.
                log.warn('Multipart part without sub-parts, ignoring it')
                continue
            for alternative in top_level_part.get_payload():
                self._store_text_part(alternative)
            # Only the first nested multipart container is inspected.
            break

    @property
    def subject(self):
        """Mail subject.

        Every chunk returned by ``decode_header`` is kept: chunks with a
        charset are re-encoded to utf-8, chunks without one are kept
        untouched, and all of them are joined with a single space.
        A missing Subject header gives an empty string.
        """
        raw_subject = self.mail.get('Subject')
        if raw_subject is None:
            return ''
        pieces = []
        for value, charset in decode_header(raw_subject):
            if charset is not None:
                try:
                    text = value.decode(charset, "replace")
                except LookupError:
                    # Charset name unknown to the codec registry.
                    log.warn('Unknown subject charset %r' % charset)
                    text = value.decode('utf-8', 'replace')
                value = text.encode("utf-8", "replace")
            pieces.append(value)
        return ' '.join(pieces)

    @property
    def size(self):
        """Get mail size: length of the message serialized as a string."""
        return len(self.mail.as_string())

    @property
    def external_references(self):
        """Return mail references to be used as external references.

        Makes use of these RFC5322 headers:
            message-id
            in-reply-to
            references
        Header strings are pruned to extract the address-like identifier
        only.

        :return: dict with ``message_id``, ``parent_id`` (both None when
            the header is absent) and ``ancestors_ids`` (a list,
            possibly empty).
        """
        ext_id = self.mail.get('Message-Id')
        parent_id = self.mail.get('In-Reply-To')
        ref = self.mail.get_all("References")
        ref_ids = [address[1] for address in getaddresses(ref)] if ref else []
        mid = clean_email_address(ext_id)[1] if ext_id else None
        pid = clean_email_address(parent_id)[1] if parent_id else None
        return {'message_id': mid, 'parent_id': pid, 'ancestors_ids': ref_ids}

    @property
    def date(self):
        """Get UTC date from a mail message.

        :return: a naive datetime expressed in UTC built from the Date
            header; when the header is missing or invalid, the current
            time as a UTC-aware datetime.
        """
        # NOTE(review): the two return paths differ in awareness (naive
        # vs tz-aware); kept as is for existing callers.
        mail_date = self.mail.get('Date')
        if mail_date:
            try:
                # parsedate_tz returns None on failure, which makes
                # mktime_tz raise TypeError.
                timestamp = mktime_tz(parsedate_tz(mail_date))
                # mktime_tz gives a UTC timestamp; utcfromtimestamp keeps
                # it in UTC where fromtimestamp would shift to local time.
                return datetime.datetime.utcfromtimestamp(timestamp)
            except (TypeError, ValueError, OverflowError):
                log.error('Invalid date in mail {}'.format(mail_date))
        log.debug('No date on mail using now (UTC)')
        return datetime.datetime.now(tz=pytz.utc)

    @property
    def participants(self):
        """Mail participants.

        Builds one ``MailParticipant`` per address found in the first
        occurrence of each header listed in ``recipient_headers``.
        Entries without an ``@`` (such as ``undisclosed-recipients:;``)
        are skipped.
        """
        from email.utils import formataddr

        participants = []
        for header in self.recipient_headers:
            participant_type = header.capitalize()
            value = self.mail.get(header)
            if not value:
                continue
            # getaddresses copes with quoted display names containing
            # commas, which a plain split(',') cuts in two.
            for name, email_address in getaddresses([value]):
                if '@' not in email_address:
                    continue
                addr = formataddr((name, email_address))
                participants.append(MailParticipant(participant_type, addr))
        return participants

    @property
    def hash_participants(self):
        """Create an hash from participants addresses for global lookup.

        :return: hex sha256 digest of the sorted, de-duplicated
            participant addresses concatenated without separator.
        """
        addresses = sorted(set(x.address for x in self.participants))
        return hashlib.sha256(''.join(addresses)).hexdigest()

    @property
    def attachments(self):
        """Extract parts which we consider as attachments.

        :return: list of ``MailAttachment`` built from the non multipart
            parts accepted by ``MailAttachment.is_attachment``; empty
            when the mail is not multipart.
        """
        if not self.mail.is_multipart():
            return []
        return [MailAttachment(p)
                for p in walk_with_boundary(self.mail, "")
                if not p.is_multipart() and MailAttachment.is_attachment(p)]

    @property
    def extra_parameters(self):
        """Mail message extra parameters.

        :return: dict with ``lists``, the identifiers taken from the
            List-ID headers (empty list when there is none).
        """
        lists = self.mail.get_all("List-ID")
        lists_ids = [address[1] for address in getaddresses(lists)] \
            if lists else []
        return {'lists': lists_ids}

    def lookup_discussion_sequence(self, *args, **kwargs):
        """Return list of lookup type, value from a mail message.

        The sequence holds one ``('list', id)`` entry per mailing list,
        then ``('global', participants hash)``, then at most one
        ``('thread', id)`` entry using the first ancestor, else the
        parent, else the message's own id. Arguments are ignored.
        """
        seq = [('list', list_id)
               for list_id in self.extra_parameters.get('lists', [])]
        seq.append(('global', self.hash_participants))

        # try to link message to external thread's root message-id
        # The property re-parses the headers on each access: read it once.
        refs = self.external_references
        if len(refs["ancestors_ids"]) > 0:
            seq.append(("thread", refs["ancestors_ids"][0]))
        elif refs["parent_id"]:
            seq.append(("thread", refs["parent_id"]))
        elif refs["message_id"]:
            seq.append(("thread", refs["message_id"]))

        return seq

    # Others parameters specific for mail message

    @property
    def headers(self):
        """Extract all headers into a dict.

        Headers may be duplicated: they are grouped by name, each name
        being mapped to the list of its values in message order.
        """
        headers = {}
        # Sorted by name so that keys are inserted in the same order as
        # the previous sort + groupby implementation; the sort is stable,
        # so values keep their message order.
        for name, value in sorted(self.mail.items(),
                                  key=lambda item: item[0]):
            headers.setdefault(name, []).append(value)
        return headers
Exemple #3
0
class MailMessage(object):
    """
    Mail message structure.

    Takes a mail in raw RFC 2822 / RFC 5322 format, parses it with
    ``Message`` and exposes its bodies, participants, attachments,
    references and grouped headers.

    Instance attributes set by the constructor:
        raw: the raw data the message was built from.
        mail: the object returned by ``Message(raw_data)``.
        warnings: list of the parser defects found on ``mail``.
        body_plain / body_html: bodies extracted by ``get_bodies``.
    """

    zope.interface.implements(IMessageParser)

    # Headers read, in this order, to build ``participants``.
    recipient_headers = ['From', 'To', 'Cc', 'Bcc']
    message_protocol = 'email'
    # Class-level defaults; every instance gets its own ``warnings`` list
    # in ``__init__`` and its own bodies in ``get_bodies``.
    warnings = []
    body_html = ""
    body_plain = ""

    def __init__(self, raw_data):
        """Parse an RFC2822,5322 mail message.

        :param raw_data: raw message, handed unchanged to ``Message``.
        :raises Exception: whatever ``Message`` raises; the error is
            logged and re-raised with its original traceback.
        """
        self.raw = raw_data
        self.warnings = []
        try:
            self.mail = Message(raw_data)
        except Exception as exc:
            log.error('Parse message failed %s' % exc)
            # Bare raise keeps the original traceback.
            raise
        if self.mail.defects:
            # XXX what to do ?
            log.warn('Defects on parsed mail %r' % self.mail.defects)
            self.warnings = list(self.mail.defects)
            # Historical (misspelled) attribute, kept for existing readers.
            self.warning = self.mail.defects
        self.get_bodies()

    @staticmethod
    def _decode_text(part):
        """Return the decoded payload of ``part`` passed through to_utf8."""
        charset = part.get_param("charset")
        if isinstance(charset, tuple):
            # RFC 2231 encoded parameter: (charset, language, value).
            charset = unicode(charset[2], charset[0] or "us-ascii")
        return to_utf8(part.get_payload(decode=True), charset)

    def _store_text_part(self, part, default_plain=False):
        """Store ``part`` as html or plain body according to its type.

        With ``default_plain`` set, any type other than text/html is
        stored as the plain body; otherwise only text/plain is.
        """
        content_type = part.get_content_type()
        if content_type == "text/html":
            self.body_html = self._decode_text(part)
        elif default_plain or content_type == "text/plain":
            self.body_plain = self._decode_text(part)

    def get_bodies(self):
        """Extract body alternatives, if any.

        Fills ``self.body_plain`` and ``self.body_html``:

        - without a Content-Type header the decoded payload is stored as
          the plain body;
        - a non multipart mail is stored as html when it is text/html and
          as plain otherwise;
        - a multipart/encrypted mail made of exactly two parts stores the
          payload of the second part, undecoded, as the plain body;
        - otherwise top level parts are scanned for text/plain and
          text/html, and scanning stops after the first nested multipart
          container.
        """
        if not self.mail.get("Content-Type", None):
            self.body_plain = self.mail.get_payload(decode=True)
            return

        if not self.mail.is_multipart():
            self._store_text_part(self.mail, default_plain=True)
            return

        if self.mail.get_content_subtype() == 'encrypted':
            parts = self.mail.get_payload()
            if len(parts) == 2:
                self.body_plain = parts[1].get_payload()
                return
            log.warn('Encrypted message with invalid parts count')

        for top_level_part in self.mail.get_payload():
            if top_level_part.get_content_maintype() != "multipart":
                self._store_text_part(top_level_part)
                continue
            if not top_level_part.is_multipart():
                # Declared multipart but the payload is not a list of
                # parts: iterating it would fail, so skip it.
                log.warn('Multipart part without sub-parts, ignoring it')
                continue
            for alternative in top_level_part.get_payload():
                self._store_text_part(alternative)
            # Only the first nested multipart container is inspected.
            break

    @property
    def subject(self):
        """Mail subject.

        Every chunk returned by ``decode_header`` is kept: chunks with a
        charset are re-encoded to utf-8, chunks without one are kept
        untouched, and all of them are joined with a single space.
        A missing Subject header gives an empty string.
        """
        raw_subject = self.mail.get('Subject')
        if raw_subject is None:
            return ''
        pieces = []
        for value, charset in decode_header(raw_subject):
            if charset is not None:
                try:
                    text = value.decode(charset, "replace")
                except LookupError:
                    # Charset name unknown to the codec registry.
                    log.warn('Unknown subject charset %r' % charset)
                    text = value.decode('utf-8', 'replace')
                value = text.encode("utf-8", "replace")
            pieces.append(value)
        return ' '.join(pieces)

    @property
    def size(self):
        """Get mail size: length of the message serialized as a string."""
        return len(self.mail.as_string())

    @property
    def external_references(self):
        """Return mail references to be used as external references.

        Makes use of these RFC5322 headers:
            message-id
            in-reply-to
            references
        Header strings are pruned to extract the address-like identifier
        only.

        :return: dict with ``message_id``, ``parent_id`` (both None when
            the header is absent) and ``ancestors_ids`` (a list,
            possibly empty).
        """
        ext_id = self.mail.get('Message-Id')
        parent_id = self.mail.get('In-Reply-To')
        ref = self.mail.get_all("References")
        ref_ids = [address[1] for address in getaddresses(ref)] if ref else []
        mid = clean_email_address(ext_id)[1] if ext_id else None
        pid = clean_email_address(parent_id)[1] if parent_id else None
        return {
            'message_id': mid,
            'parent_id': pid,
            'ancestors_ids': ref_ids}

    @property
    def date(self):
        """Get UTC date from a mail message.

        :return: a naive datetime expressed in UTC built from the Date
            header; when the header is missing or invalid, the current
            time as a UTC-aware datetime.
        """
        # NOTE(review): the two return paths differ in awareness (naive
        # vs tz-aware); kept as is for existing callers.
        mail_date = self.mail.get('Date')
        if mail_date:
            try:
                # parsedate_tz returns None on failure, which makes
                # mktime_tz raise TypeError.
                timestamp = mktime_tz(parsedate_tz(mail_date))
                # mktime_tz gives a UTC timestamp; utcfromtimestamp keeps
                # it in UTC where fromtimestamp would shift to local time.
                return datetime.datetime.utcfromtimestamp(timestamp)
            except (TypeError, ValueError, OverflowError):
                log.error('Invalid date in mail {}'.format(mail_date))
        log.debug('No date on mail using now (UTC)')
        return datetime.datetime.now(tz=pytz.utc)

    @property
    def participants(self):
        """Mail participants.

        Builds one ``MailParticipant`` per address found in the first
        occurrence of each header listed in ``recipient_headers``.
        Entries without an ``@`` (such as ``undisclosed-recipients:;``)
        are skipped.
        """
        from email.utils import formataddr

        participants = []
        for header in self.recipient_headers:
            participant_type = header.capitalize()
            value = self.mail.get(header)
            if not value:
                continue
            # getaddresses copes with quoted display names containing
            # commas, which a plain split(',') cuts in two.
            for name, email_address in getaddresses([value]):
                if '@' not in email_address:
                    continue
                addr = formataddr((name, email_address))
                participants.append(MailParticipant(participant_type, addr))
        return participants

    @property
    def attachments(self):
        """Extract parts which we consider as attachments.

        :return: list of ``MailAttachment`` built from the non multipart
            parts accepted by ``MailAttachment.is_attachment``; empty
            when the mail is not multipart.
        """
        if not self.mail.is_multipart():
            return []
        return [MailAttachment(p)
                for p in walk_with_boundary(self.mail, "")
                if not p.is_multipart() and MailAttachment.is_attachment(p)]

    @property
    def extra_parameters(self):
        """Mail message extra parameters.

        :return: dict with ``lists``, the identifiers taken from the
            List-ID headers (empty list when there is none).
        """
        lists = self.mail.get_all("List-ID")
        lists_ids = [address[1] for address in getaddresses(lists)] \
            if lists else []
        return {'lists': lists_ids}

    # Others parameters specific for mail message

    @property
    def headers(self):
        """Extract all headers into a dict.

        Headers may be duplicated: they are grouped by name, each name
        being mapped to the list of its values in message order.
        """
        headers = {}
        # Sorted by name so that keys are inserted in the same order as
        # the previous sort + groupby implementation; the sort is stable,
        # so values keep their message order.
        for name, value in sorted(self.mail.items(),
                                  key=lambda item: item[0]):
            headers.setdefault(name, []).append(value)
        return headers