class MailMessage(object):
    """
    Mail message structure.

    Got a mail in raw rfc2822 format, parse it to
    resolve all recipients emails, parts and group headers
    """

    zope.interface.implements(IMessageParser)

    recipient_headers = ['From', 'To', 'Cc', 'Bcc']
    message_protocol = 'email'
    # Class-level defaults; instances overwrite them while parsing.
    warnings = []
    body_html = ""
    body_plain = ""

    def __init__(self, raw_data):
        """Parse an RFC2822,5322 mail message.

        :param raw_data: raw message, handed unchanged to ``Message``.
        :raises Exception: whatever ``Message`` raised, after logging it.
        """
        self.raw = raw_data
        # Give every instance its own list instead of the shared class one.
        self.warnings = []
        try:
            self.mail = Message(raw_data)
        except Exception as exc:
            log.error('Parse message failed %s' % exc)
            # Bare raise keeps the original traceback.
            raise
        if self.mail.defects:
            # XXX what to do ?
            log.warn('Defects on parsed mail %r' % self.mail.defects)
            self.warnings = self.mail.defects
            # Historical attribute name, kept so existing readers still work.
            self.warning = self.warnings
        self.get_bodies()

    def _decoded_text(self, part):
        """Return the decoded payload of ``part`` passed through to_utf8."""
        charset = part.get_param("charset")
        if isinstance(charset, tuple):
            # get_param handed back a 3-tuple; rebuild the charset name
            # from its third element using the first as encoding.
            charset = unicode(charset[2], charset[0] or "us-ascii")
        return to_utf8(part.get_payload(decode=True), charset)

    def _store_text_part(self, part):
        """Keep ``part`` as plain or html body when it is a text part."""
        content_type = part.get_content_type()
        if content_type == "text/plain":
            self.body_plain = self._decoded_text(part)
        elif content_type == "text/html":
            self.body_html = self._decoded_text(part)

    def get_bodies(self):
        """Extract body alternatives, if any.

        Fills ``self.body_plain`` and/or ``self.body_html`` from the
        parsed mail; returns nothing.
        """
        if not self.mail.get("Content-Type", None):
            # No declared content type: keep the decoded payload as is.
            self.body_plain = self.mail.get_payload(decode=True)
            return

        if not self.mail.is_multipart():
            text = self._decoded_text(self.mail)
            # Anything that is not html is stored as the plain body.
            if self.mail.get_content_type() == "text/html":
                self.body_html = text
            else:
                self.body_plain = text
            return

        for top_level_part in self.mail.get_payload():
            if top_level_part.get_content_maintype() == "multipart":
                for alternative in top_level_part.get_payload():
                    self._store_text_part(alternative)
                # Stop after the first nested multipart container.
                break
            self._store_text_part(top_level_part)

    @property
    def subject(self):
        """Mail subject.

        Returns an empty string when the mail has no Subject header.
        """
        raw_subject = self.mail.get('Subject')
        if raw_subject is None:
            # Do not hand None to decode_header.
            return ''
        # NOTE(review): only the first decoded chunk is used, so a subject
        # split over several encoded words is returned truncated.
        text, charset = decode_header(raw_subject)[0]
        if charset is not None:
            return text.decode(charset, "replace"). \
                encode("utf-8", "replace")
        return text

    @property
    def size(self):
        """Get mail size in bytes."""
        return len(self.mail.as_string())

    @property
    def external_references(self):
        """Return mail references to be used as external references.

        making use of RFC5322 headers :
            message-id
            in-reply-to
            references
        headers' strings are pruned to extract email addresses only.

        :return: dict with keys ``message_id``, ``parent_id`` (each may be
                 None) and ``ancestors_ids`` (a list, possibly empty).
        """
        ext_id = self.mail.get('Message-Id')
        parent_id = self.mail.get('In-Reply-To')
        ref = self.mail.get_all("References")
        ref_ids = [address[1] for address in getaddresses(ref)] if ref else []
        mid = clean_email_address(ext_id)[1] if ext_id else None
        pid = clean_email_address(parent_id)[1] if parent_id else None
        return {'message_id': mid,
                'parent_id': pid,
                'ancestors_ids': ref_ids}

    @property
    def date(self):
        """Get UTC date from a mail message.

        Falls back to the current UTC time when the Date header is
        missing or cannot be parsed.
        """
        # NOTE(review): the parsed branch returns a naive datetime built
        # with fromtimestamp, the fallback an aware one - confirm callers
        # cope with both.
        mail_date = self.mail.get('Date')
        if mail_date:
            try:
                tmp_date = parsedate_tz(mail_date)
                return datetime.datetime.fromtimestamp(mktime_tz(tmp_date))
            except TypeError:
                # parsedate_tz gave None for an unparsable value.
                log.error('Invalid date in mail {}'.format(mail_date))
        log.debug('No date on mail using now (UTC)')
        return datetime.datetime.now(tz=pytz.utc)

    @property
    def participants(self):
        """Mail participants.

        :return: list of MailParticipant, one per address found in the
                 headers named by ``recipient_headers``.
        """
        participants = []
        for header in self.recipient_headers:
            value = self.mail.get(header)
            if not value:
                continue
            participant_type = header.capitalize()
            if ',' in value:
                # NOTE(review): a plain comma split also cuts display
                # names that contain a comma.
                addrs = [x for x in value.split(',') if '@' in x]
            else:
                addrs = [value]
            for addr in addrs:
                participants.append(MailParticipant(participant_type, addr))
        return participants

    @property
    def hash_participants(self):
        """Create an hash from participants addresses for global lookup."""
        addresses = sorted(set(x.address for x in self.participants))
        return hashlib.sha256(''.join(addresses)).hexdigest()

    @property
    def attachments(self):
        """Extract parts which we consider as attachments."""
        if not self.mail.is_multipart():
            return []
        return [MailAttachment(p)
                for p in walk_with_boundary(self.mail, "")
                if not p.is_multipart() and MailAttachment.is_attachment(p)]

    @property
    def extra_parameters(self):
        """Mail message extra parameters.

        :return: dict with a ``lists`` key built from the List-ID headers.
        """
        lists = self.mail.get_all("List-ID")
        lists_ids = [address[1] for address in getaddresses(lists)] \
            if lists else []
        return {'lists': lists_ids}

    def lookup_discussion_sequence(self, *args, **kwargs):
        """Return list of lookup type, value from a mail message."""
        # list lookup first
        seq = [('list', list_id)
               for list_id in self.extra_parameters.get('lists', [])]

        seq.append(('global', self.hash_participants))

        # external_references is a computed property: evaluate it once.
        refs = self.external_references
        # try to link message to external thread's root message-id
        if refs["ancestors_ids"]:
            seq.append(("thread", refs["ancestors_ids"][0]))
        elif refs["parent_id"]:
            seq.append(("thread", refs["parent_id"]))
        elif refs["message_id"]:
            seq.append(("thread", refs["message_id"]))

        return seq

    # Others parameters specific for mail message

    @property
    def headers(self):
        """Extract all headers into list.

        Duplicate on headers exists, group them by name
        with a related list of values

        :return: dict mapping header name to the list of its values, in
                 the order they appear in the mail.
        """
        headers = {}
        for name, value in self.mail.items():
            headers.setdefault(name, []).append(value)
        return headers
class MailMessage(object):
    """
    Mail message structure.

    Got a mail in raw rfc2822 format, parse it to
    resolve all recipients emails, parts and group headers
    """

    recipient_headers = ['From', 'To', 'Cc', 'Bcc']
    message_type = 'mail'

    def __init__(self, raw):
        """Initialize structure from a raw mail.

        :param raw: raw message, handed unchanged to ``Message``; its
                    length is kept as ``size``.
        :raises Exception: whatever ``Message`` raised, after logging it.
        """
        try:
            self.mail = Message(raw)
        except Exception as exc:
            log.error('Parse message failed %s' % exc)
            raise
        if self.mail.defects:
            # XXX what to do ?
            log.warn('Defects on parsed mail %r' % self.mail.defects)
        self.recipients = self._extract_recipients()
        self.parts = self._extract_parts()
        self.headers = self._extract_headers()
        self.subject = self.mail.get('Subject')
        self.date = self._extract_date()
        self.external_message_id = self.mail.get('Message-Id')
        self.external_parent_id = self.mail.get('In-Reply-To')
        self.size = len(raw)

    @property
    def text(self):
        """Message all text."""
        # XXX : more complexity ?
        return "\n".join([x.data for x in self.parts if x.can_index])

    def _extract_date(self):
        """Return the Date header as a datetime, or now when unusable."""
        raw_date = self.mail.get('Date')
        if raw_date:
            parsed = parsedate_tz(raw_date)
            if parsed is not None:
                return datetime.fromtimestamp(mktime_tz(parsed))
            log.error('Invalid date in mail {}'.format(raw_date))
        # Missing or unparsable header: same naive local flavour as above.
        return datetime.now()

    def _extract_recipients(self):
        """Map each lower-cased recipient header to its cleaned addresses.

        Every header of ``recipient_headers`` gets an entry, empty when
        the mail does not carry it.
        """
        recip = {}
        for header in self.recipient_headers:
            value = self.mail.get(header)
            if not value:
                addrs = []
            elif ',' in value:
                addrs = value.split(',')
            else:
                addrs = [value]
            recip[header.lower()] = [clean_email_address(x) for x in addrs]
        return recip

    def _extract_headers(self):
        """
        Extract all headers into list.

        Duplicate on headers exists, group them by name
        with a related list of values
        """
        headers = {}
        for name, value in self.mail.items():
            headers.setdefault(name, []).append(value)
        return headers

    def _extract_parts(self):
        """Multipart message, extract parts."""
        return [self._process_part(p)
                for p in self.mail.walk()
                if not p.is_multipart()]

    def _process_part(self, part):
        """Wrap one leaf part of the mail into a MailPart."""
        return MailPart(part)

    @property
    def transport_privacy_index(self):
        """Evaluate transport privacy index."""
        # XXX : TODO
        return random.randint(0, 50)

    @property
    def content_privacy_index(self):
        """Evaluate content privacy index."""
        # XXX: real evaluation needed ;)
        if 'PGP' in [x.content_type for x in self.parts]:
            return random.randint(50, 100)
        else:
            return 0.0

    @property
    def spam_level(self):
        """Report spam level.

        Reads the first ``X-Spam-Score`` header value; a missing or
        non-numeric score counts as 0.

        :return: 0.0 below a score of 5, 100.0 from 15 upwards, otherwise
                 ten times the score capped at 100.0.
        """
        try:
            score = float(self.headers.get('X-Spam-Score')[0])
        except (TypeError, IndexError, ValueError):
            # Header absent, empty value list, or not a number.
            score = 0.0
        if score < 5.0:
            return 0.0
        if score < 15.0:
            return min(score * 10, 100.0)
        return 100.0

    @property
    def importance_level(self):
        """Return percent estimated importance level of this message."""
        # XXX. real compute needed
        return 0 if self.spam_level else random.randint(50, 100)

    @property
    def lists(self):
        """List related to message."""
        return list(self.headers.get('List-ID', []))

    @property
    def from_(self):
        """Get from recipient.

        :return: second item of the first cleaned ``from`` recipient, or
                 None when there is none.
        """
        from_ = self.recipients.get('from')
        if from_:
            # XXX should do better
            return from_[0][1]
        return None

    def lookup_sequence(self):
        """Build parameter sequence for lookups.

        :return: list of ``(kind, value)`` tuples, ordered parent, lists,
                 then sender.
        """
        seq = []
        # first from parent
        if self.external_parent_id:
            seq.append(('parent', self.external_parent_id))
        # then list lookup
        seq.extend(('list', listname) for listname in self.lists)
        # last try to lookup from sender address
        sender = self.from_
        if sender:
            seq.append(('from', sender))
        return seq

    def to_parameter(self):
        """Transform mail to a NewMessage parameter."""
        msg = NewMessage()
        msg.type = 'email'
        msg.subject = self.subject
        msg.from_ = self.from_
        # XXX need transform to part parameter
        for part in self.parts:
            param = Part()
            param.content_type = part.content_type
            param.data = part.data
            param.size = part.size
            param.filename = part.filename
            param.can_index = part.can_index
            msg.parts.append(param)
        msg.headers = self.headers
        msg.date = self.date
        msg.size = self.size
        msg.text = self.text
        msg.external_parent_id = self.external_parent_id
        msg.external_message_id = self.external_message_id
        # XXX well ....
        msg.privacy_index = (self.transport_privacy_index +
                             self.content_privacy_index) / 2
        msg.importance_level = self.importance_level
        return msg
class MailMessage(object):
    """
    Mail message structure.

    Got a mail in raw rfc2822 format, parse it to
    resolve all recipients emails, parts and group headers
    """

    zope.interface.implements(IMessageParser)

    recipient_headers = ['From', 'To', 'Cc', 'Bcc']
    message_protocol = 'email'
    # Class-level defaults; instances overwrite them while parsing.
    warnings = []
    body_html = ""
    body_plain = ""

    def __init__(self, raw_data):
        """Parse an RFC2822,5322 mail message.

        :param raw_data: raw message, handed unchanged to ``Message``.
        :raises Exception: whatever ``Message`` raised, after logging it.
        """
        self.raw = raw_data
        self._extra_parameters = {}
        # Give every instance its own list instead of the shared class one.
        self.warnings = []
        try:
            self.mail = Message(raw_data)
        except Exception as exc:
            log.error('Parse message failed %s' % exc)
            # Bare raise keeps the original traceback.
            raise
        if self.mail.defects:
            # XXX what to do ?
            log.warn('Defects on parsed mail %r' % self.mail.defects)
            self.warnings = self.mail.defects
            # Historical attribute name, kept so existing readers still work.
            self.warning = self.warnings
        self.get_bodies()

    def _decoded_text(self, part):
        """Return the decoded payload of ``part`` passed through to_utf8."""
        charset = part.get_param("charset")
        if isinstance(charset, tuple):
            # get_param handed back a 3-tuple; rebuild the charset name
            # from its third element using the first as encoding.
            charset = unicode(charset[2], charset[0] or "us-ascii")
        return to_utf8(part.get_payload(decode=True), charset)

    def _store_text_part(self, part):
        """Keep ``part`` as plain or html body when it is a text part."""
        content_type = part.get_content_type()
        if content_type == "text/plain":
            self.body_plain = self._decoded_text(part)
        elif content_type == "text/html":
            self.body_html = self._decoded_text(part)

    def get_bodies(self):
        """Extract body alternatives, if any.

        Fills ``self.body_plain`` and/or ``self.body_html`` from the
        parsed mail; returns nothing.
        """
        if not self.mail.get("Content-Type", None):
            # No declared content type: keep the decoded payload as is.
            self.body_plain = self.mail.get_payload(decode=True)
            return

        if not self.mail.is_multipart():
            text = self._decoded_text(self.mail)
            # Anything that is not html is stored as the plain body.
            if self.mail.get_content_type() == "text/html":
                self.body_html = text
            else:
                self.body_plain = text
            return

        if self.mail.get_content_subtype() == 'encrypted':
            parts = self.mail.get_payload()
            if len(parts) == 2:
                # Second part is kept undecoded as the plain body.
                self.body_plain = parts[1].get_payload()
                return
            # Unexpected layout: warn, then treat it like any multipart.
            log.warn('Encrypted message with invalid parts count')

        for top_level_part in self.mail.get_payload():
            if top_level_part.get_content_maintype() == "multipart":
                for alternative in top_level_part.get_payload():
                    self._store_text_part(alternative)
                # Stop after the first nested multipart container.
                break
            self._store_text_part(top_level_part)

    @property
    def subject(self):
        """Mail subject.

        Returns an empty string when the mail has no Subject header.
        """
        raw_subject = self.mail.get('Subject')
        if raw_subject is None:
            # Do not hand None to decode_header.
            return ''
        # NOTE(review): only the first decoded chunk is used, so a subject
        # split over several encoded words is returned truncated.
        text, charset = decode_header(raw_subject)[0]
        if charset is not None:
            return text.decode(charset, "replace"). \
                encode("utf-8", "replace")
        try:
            return text.decode('utf-8', errors='ignore')
        except UnicodeError:
            log.warn('Invalid subject encoding')
            return text

    @property
    def size(self):
        """Get mail size in bytes."""
        return len(self.mail.as_string())

    @property
    def external_references(self):
        """Return mail references to be used as external references.

        making use of RFC5322 headers :
            message-id
            in-reply-to
            references
        headers' strings are pruned to extract email addresses only.

        :return: dict with keys ``message_id``, ``parent_id`` and
                 ``ancestors_ids`` (a list, possibly empty). When the
                 cleaned id is empty the raw header value is used instead.
        """
        ext_id = self.mail.get('Message-Id')
        parent_id = self.mail.get('In-Reply-To')
        ref = self.mail.get_all("References")
        ref_ids = [address[1] for address in getaddresses(ref)] if ref else []
        mid = clean_email_address(ext_id)[1] if ext_id else None
        if not mid:
            log.error('Unable to find correct message_id {}'.format(ext_id))
            mid = ext_id
        pid = clean_email_address(parent_id)[1] if parent_id else None
        if not pid:
            pid = parent_id
        return {'message_id': mid,
                'parent_id': pid,
                'ancestors_ids': ref_ids}

    @property
    def date(self):
        """Get UTC date from a mail message.

        Falls back to the current UTC time when the Date header is
        missing or cannot be parsed.
        """
        # NOTE(review): the parsed branch returns a naive datetime built
        # with fromtimestamp, the fallback an aware one - confirm callers
        # cope with both.
        mail_date = self.mail.get('Date')
        if mail_date:
            try:
                tmp_date = parsedate_tz(mail_date)
                return datetime.datetime.fromtimestamp(mktime_tz(tmp_date))
            except TypeError:
                log.error('Invalid date in mail {}'.format(mail_date))
        log.debug('No date on mail using now (UTC)')
        return datetime.datetime.now(tz=pytz.utc)

    @property
    def participants(self):
        """Mail participants.

        :return: list of MailParticipant built from the lower-cased
                 addresses of the headers named by ``recipient_headers``;
                 entries with empty address and label are logged and
                 skipped.
        """
        participants = []
        for header in self.recipient_headers:
            value = self.mail.get(header)
            if not value:
                continue
            participant_type = header.capitalize()
            # NOTE(review): splitting on '>,' strips the closing '>' from
            # every piece but the last - confirm MailParticipant copes.
            # Pieces without '@' (e.g. 'undisclosed-recipients:;') are
            # dropped by the filter.
            for addr in [x for x in value.split('>,') if '@' in x]:
                participant = MailParticipant(participant_type, addr.lower())
                if participant.address == '' and participant.label == '':
                    log.warn('Invalid email address {}'.format(addr))
                else:
                    participants.append(participant)
        return participants

    @property
    def attachments(self):
        """Extract parts which we consider as attachments.

        Side effect: records ``{'encrypted': 'pgp'}`` in
        ``_extra_parameters`` when a pgp-encrypted part is met.
        """
        # NOTE(review): extra_parameters only reports 'encrypted' once this
        # property has been read - confirm callers rely on that order.
        if not self.mail.is_multipart():
            return []
        attchs = []
        for p in walk_with_boundary(self.mail, ""):
            if p.is_multipart():
                continue
            if p.get_content_subtype() == 'pgp-encrypted':
                # Special consideration. Do not present it as an attachment
                # but set _extra_parameters accordingly
                self._extra_parameters.update({'encrypted': 'pgp'})
                continue
            if MailAttachment.is_attachment(p):
                attchs.append(MailAttachment(p))
        return attchs

    @property
    def extra_parameters(self):
        """Mail message extra parameters.

        :return: the instance's ``_extra_parameters`` dict, refreshed
                 with a ``lists`` key built from the List-ID headers.
        """
        lists = self.mail.get_all("List-ID")
        lists_ids = [address[1] for address in getaddresses(lists)] \
            if lists else []
        self._extra_parameters.update({'lists': lists_ids})
        return self._extra_parameters

    # Others parameters specific for mail message

    @property
    def headers(self):
        """Extract all headers into list.

        Duplicate on headers exists, group them by name
        with a related list of values

        :return: dict mapping header name to the list of its values, in
                 the order they appear in the mail.
        """
        headers = {}
        for name, value in self.mail.items():
            headers.setdefault(name, []).append(value)
        return headers

    @property
    def external_flags(self):
        """
        Get headers added by our fetcher that represent flags or labels
        set by external provider,
        returned as list of tags
        """
        tags = []
        for h in ['X-Fetched-Imap-Flags', 'X-Fetched-X-GM-LABELS']:
            enc_flags = self.mail.get(h)
            if not enc_flags:
                continue
            # Header value is base64 text holding CRLF separated flags.
            flags_str = base64.decodestring(enc_flags)
            for flag in flags_str.split('\r\n'):
                if flag not in EXCLUDED_EXT_FLAGS:
                    tag = Tag()
                    tag.name = flag
                    tag.label = flag
                    tag.type = 'imported'
                    tags.append(tag)
        return tags
class MailMessage(object):
    """
    Mail message structure.

    Got a mail in raw rfc2822 format, parse it to
    resolve all recipients emails, parts and group headers
    """

    zope.interface.implements(IMessageParser)

    recipient_headers = ['From', 'To', 'Cc', 'Bcc']
    message_protocol = 'email'
    # Class-level defaults; instances overwrite them while parsing.
    warnings = []
    body_html = ""
    body_plain = ""

    def __init__(self, raw_data):
        """Parse an RFC2822,5322 mail message.

        :param raw_data: raw message, handed unchanged to ``Message``.
        :raises Exception: whatever ``Message`` raised, after logging it.
        """
        self.raw = raw_data
        # Give every instance its own list instead of the shared class one.
        self.warnings = []
        try:
            self.mail = Message(raw_data)
        except Exception as exc:
            log.error('Parse message failed %s' % exc)
            # Bare raise keeps the original traceback.
            raise
        if self.mail.defects:
            # XXX what to do ?
            log.warn('Defects on parsed mail %r' % self.mail.defects)
            self.warnings = self.mail.defects
            # Historical attribute name, kept so existing readers still work.
            self.warning = self.warnings
        self.get_bodies()

    def _decoded_text(self, part):
        """Return the decoded payload of ``part`` passed through to_utf8."""
        charset = part.get_param("charset")
        if isinstance(charset, tuple):
            # get_param handed back a 3-tuple; rebuild the charset name
            # from its third element using the first as encoding.
            charset = unicode(charset[2], charset[0] or "us-ascii")
        return to_utf8(part.get_payload(decode=True), charset)

    def _store_text_part(self, part):
        """Keep ``part`` as plain or html body when it is a text part."""
        content_type = part.get_content_type()
        if content_type == "text/plain":
            self.body_plain = self._decoded_text(part)
        elif content_type == "text/html":
            self.body_html = self._decoded_text(part)

    def get_bodies(self):
        """Extract body alternatives, if any.

        Fills ``self.body_plain`` and/or ``self.body_html`` from the
        parsed mail; returns nothing.
        """
        if not self.mail.get("Content-Type", None):
            # No declared content type: keep the decoded payload as is.
            self.body_plain = self.mail.get_payload(decode=True)
            return

        if not self.mail.is_multipart():
            text = self._decoded_text(self.mail)
            # Anything that is not html is stored as the plain body.
            if self.mail.get_content_type() == "text/html":
                self.body_html = text
            else:
                self.body_plain = text
            return

        if self.mail.get_content_subtype() == 'encrypted':
            parts = self.mail.get_payload()
            if len(parts) == 2:
                # Second part is kept undecoded as the plain body.
                self.body_plain = parts[1].get_payload()
                return
            # Unexpected layout: warn, then treat it like any multipart.
            log.warn('Encrypted message with invalid parts count')

        for top_level_part in self.mail.get_payload():
            if top_level_part.get_content_maintype() == "multipart":
                for alternative in top_level_part.get_payload():
                    self._store_text_part(alternative)
                # Stop after the first nested multipart container.
                break
            self._store_text_part(top_level_part)

    @property
    def subject(self):
        """Mail subject.

        Returns an empty string when the mail has no Subject header.
        """
        raw_subject = self.mail.get('Subject')
        if raw_subject is None:
            # Do not hand None to decode_header.
            return ''
        # NOTE(review): only the first decoded chunk is used, so a subject
        # split over several encoded words is returned truncated.
        text, charset = decode_header(raw_subject)[0]
        if charset is not None:
            return text.decode(charset, "replace"). \
                encode("utf-8", "replace")
        return text

    @property
    def size(self):
        """Get mail size in bytes."""
        return len(self.mail.as_string())

    @property
    def external_references(self):
        """Return mail references to be used as external references.

        making use of RFC5322 headers :
            message-id
            in-reply-to
            references
        headers' strings are pruned to extract email addresses only.

        :return: dict with keys ``message_id``, ``parent_id`` (each may be
                 None) and ``ancestors_ids`` (a list, possibly empty).
        """
        ext_id = self.mail.get('Message-Id')
        parent_id = self.mail.get('In-Reply-To')
        ref = self.mail.get_all("References")
        ref_ids = [address[1] for address in getaddresses(ref)] if ref else []
        mid = clean_email_address(ext_id)[1] if ext_id else None
        pid = clean_email_address(parent_id)[1] if parent_id else None
        return {
            'message_id': mid,
            'parent_id': pid,
            'ancestors_ids': ref_ids}

    @property
    def date(self):
        """Get UTC date from a mail message.

        Falls back to the current UTC time when the Date header is
        missing or cannot be parsed.
        """
        # NOTE(review): the parsed branch returns a naive datetime built
        # with fromtimestamp, the fallback an aware one - confirm callers
        # cope with both.
        mail_date = self.mail.get('Date')
        if mail_date:
            try:
                tmp_date = parsedate_tz(mail_date)
                return datetime.datetime.fromtimestamp(mktime_tz(tmp_date))
            except TypeError:
                # parsedate_tz gave None for an unparsable value.
                log.error('Invalid date in mail {}'.format(mail_date))
        log.debug('No date on mail using now (UTC)')
        return datetime.datetime.now(tz=pytz.utc)

    @property
    def participants(self):
        """Mail participants.

        :return: list of MailParticipant, one per address found in the
                 headers named by ``recipient_headers``.
        """
        participants = []
        for header in self.recipient_headers:
            value = self.mail.get(header)
            if not value:
                continue
            participant_type = header.capitalize()
            if ',' in value:
                # NOTE(review): a plain comma split also cuts display
                # names that contain a comma.
                addrs = [x for x in value.split(',') if '@' in x]
            else:
                addrs = [value]
            for addr in addrs:
                participants.append(MailParticipant(participant_type, addr))
        return participants

    @property
    def attachments(self):
        """Extract parts which we consider as attachments."""
        if not self.mail.is_multipart():
            return []
        return [MailAttachment(p)
                for p in walk_with_boundary(self.mail, "")
                if not p.is_multipart() and MailAttachment.is_attachment(p)]

    @property
    def extra_parameters(self):
        """Mail message extra parameters.

        :return: dict with a ``lists`` key built from the List-ID headers.
        """
        lists = self.mail.get_all("List-ID")
        lists_ids = [address[1] for address in getaddresses(lists)] \
            if lists else []
        return {'lists': lists_ids}

    # Others parameters specific for mail message

    @property
    def headers(self):
        """Extract all headers into list.

        Duplicate on headers exists, group them by name
        with a related list of values

        :return: dict mapping header name to the list of its values, in
                 the order they appear in the mail.
        """
        headers = {}
        for name, value in self.mail.items():
            headers.setdefault(name, []).append(value)
        return headers
def msg_has_attachment(msg: Message) -> bool:
    """Tell whether *msg* is a non-multipart part carrying a named file.

    A part qualifies when its main content type is not ``multipart``, it
    has a ``Content-Disposition`` header and it exposes a filename.

    :param msg: message (or message part) to inspect.
    :return: True when all three conditions hold, False otherwise.
    """
    # get_content_type() yields "maintype/subtype", so it can never equal
    # "multipart"; the main type is what has to be compared.
    return bool(
        msg.get_content_maintype() != "multipart"
        and msg.get("Content-Disposition")
        and msg.get_filename()
    )