def message_keywords(self, session, msg_mid, msg_id, msg, msg_date,
                     mailbox=None):
    """Collect the search keywords for a single message.

    Every part yielded by ``msg.walk()`` is inspected: text/plain and
    text/html bodies and attachment file names are tokenised with
    WORD_REGEXP.  Date, id, subject, sender, mailbox, header-print and
    per-header keywords are then added.

    Args:
        session: Only used for ``session.ui.warning`` when HTML parsing
            fails.
        msg_mid: Message identifier; only used in that warning text.
        msg_id: Emitted as the ``<msg_id>:id`` keyword (and used in the
            warning text).
        msg: Message object providing ``walk()`` and ``keys()``
            (presumably an ``email.message.Message`` -- confirm with
            callers).
        msg_date: Timestamp passed to ``datetime.date.fromtimestamp``.
        mailbox: Optional mailbox name; when truthy it is lowercased and
            emitted as ``<mailbox>:mailbox``.

    Returns:
        A set of keyword strings with the members of STOPLIST removed.
    """
    keywords = []
    for part in msg.walk():
        # Start every part with no text so that a previous part's body is
        # not tokenised again for attachments and container parts.
        textpart = None
        ctype = part.get_content_type()
        charset = part.get_charset() or 'iso-8859-1'

        if ctype == 'text/plain':
            textpart = self.try_decode(part.get_payload(None, True), charset)
        elif ctype == 'text/html':
            payload = self.try_decode(part.get_payload(None, True), charset)
            if len(payload) > 3:
                try:
                    textpart = lxml.html.fromstring(payload).text_content()
                except Exception:
                    # Unparseable HTML: warn and index the raw markup
                    # instead.  Not a bare ``except`` so that
                    # KeyboardInterrupt/SystemExit still propagate.
                    session.ui.warning('=%s/%s has bogus HTML.'
                                       % (msg_mid, msg_id))
                    textpart = payload
            else:
                # Too short to be worth handing to the HTML parser.
                textpart = payload
        elif 'pgp' in ctype:
            keywords.append('pgp:has')

        att = part.get_filename()
        if att:
            att = self.try_decode(att, charset)
            keywords.append('attachment:has')
            keywords.extend([t + ':att'
                             for t in re.findall(WORD_REGEXP, att.lower())])
            # The file name is also indexed as ordinary text below.
            textpart = (textpart or '') + ' ' + att

        if textpart:
            # FIXME: Does this lowercase non-ASCII characters correctly?
            # FIXME: What about encrypted content?
            keywords.extend(re.findall(WORD_REGEXP, textpart.lower()))
            # FIXME: Do this better.
            if '-----BEGIN PGP' in textpart and '-----END PGP' in textpart:
                keywords.append('pgp:has')

    mdate = datetime.date.fromtimestamp(msg_date)
    keywords.append('%s:year' % mdate.year)
    keywords.append('%s:month' % mdate.month)
    keywords.append('%s:day' % mdate.day)
    keywords.append('%s-%s-%s:date' % (mdate.year, mdate.month, mdate.day))
    keywords.append('%s:id' % msg_id)
    keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, 'subject').lower()))
    keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, 'from').lower()))
    if mailbox:
        keywords.append('%s:mailbox' % mailbox.lower())
    keywords.append('%s:hprint' % HeaderPrint(msg))

    for key in msg.keys():
        key_lower = key.lower()
        if key_lower in BORING_HEADERS:
            continue
        words = set(re.findall(WORD_REGEXP, self.hdr(msg, key).lower()))
        words -= STOPLIST
        # Each remaining header word becomes "<word>:<header-name>".
        keywords.extend(['%s:%s' % (t, key_lower) for t in words])
        if 'list' in key_lower:
            # Any header whose name contains "list" also feeds ":list".
            keywords.extend(['%s:list' % t for t in words])

    return (set(keywords) - STOPLIST)
def message_keywords(self, session, msg_mid, msg_id, msg, msg_date,
                     mailbox=None):
    """Collect the search keywords for a single message.

    Every part yielded by ``msg.walk()`` is tokenised (text bodies and
    attachment file names) and offered to the text and data keyword
    extractors registered with ``plugins``.  Id, subject, sender,
    mailbox, header-print and per-header keywords (including e-mail
    addresses found in headers) are then added, followed by the output
    of the meta keyword extractors.

    Args:
        session: Only used for ``session.ui.warning`` when HTML parsing
            fails.
        msg_mid: Message identifier; used in that warning text and passed
            to the meta keyword extractors.
        msg_id: Emitted as the ``<msg_id>:id`` keyword (and used in the
            warning text).
        msg: Message object providing ``walk()`` and ``keys()``
            (presumably an ``email.message.Message`` -- confirm with
            callers).
        msg_date: Passed through unchanged to the meta keyword extractors.
        mailbox: Optional mailbox name; when truthy it is lowercased and
            emitted as ``<mailbox>:mailbox``.

    Returns:
        A set of keyword strings with the members of STOPLIST removed.
    """
    keywords = []
    # One-element list used as a mutable cell: the nested loader rebinds
    # the cached payload through it without needing ``nonlocal``.
    payload = [None]
    for part in msg.walk():
        # Reset both the text and the cached payload for every part, so
        # text extractors never receive the body of an earlier part.
        textpart = payload[0] = None
        ctype = part.get_content_type()
        charset = part.get_charset() or 'iso-8859-1'

        def _loader(p):
            # Decode the part's payload on first use and cache it.
            if payload[0] is None:
                payload[0] = self.try_decode(p.get_payload(None, True),
                                             charset)
            return payload[0]

        if ctype == 'text/plain':
            textpart = _loader(part)
        elif ctype == 'text/html':
            html = _loader(part)
            if len(html) > 3:
                try:
                    textpart = lxml.html.fromstring(html).text_content()
                except Exception:
                    # Unparseable HTML: warn and index the raw markup
                    # instead.  Not a bare ``except`` so that
                    # KeyboardInterrupt/SystemExit still propagate.
                    session.ui.warning('=%s/%s has bogus HTML.'
                                       % (msg_mid, msg_id))
                    textpart = html
            else:
                # Too short to be worth handing to the HTML parser.
                textpart = html
        elif 'pgp' in ctype:
            keywords.append('pgp:has')

        att = part.get_filename()
        if att:
            att = self.try_decode(att, charset)
            keywords.append('attachment:has')
            keywords.extend([t + ':att'
                             for t in re.findall(WORD_REGEXP, att.lower())])
            # The file name is also indexed as ordinary text below.
            textpart = (textpart or '') + ' ' + att

        if textpart:
            # FIXME: Does this lowercase non-ASCII characters correctly?
            # FIXME: What about encrypted content?
            keywords.extend(re.findall(WORD_REGEXP, textpart.lower()))
            # FIXME: Do this better.
            if '-----BEGIN PGP' in textpart and '-----END PGP' in textpart:
                keywords.append('pgp:has')
            # NOTE(review): the callable reads ``textpart`` lazily, so it
            # is only meaningful while this part is being processed.
            for extract in plugins.get_text_kw_extractors():
                keywords.extend(extract(self, msg, ctype, lambda: textpart))

        # Data extractors see every part; the payload is only decoded if
        # an extractor actually calls the loader.
        for extract in plugins.get_data_kw_extractors():
            keywords.extend(extract(self, msg, ctype, att, part,
                                    lambda: _loader(part)))

    keywords.append('%s:id' % msg_id)
    keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, 'subject').lower()))
    keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, 'from').lower()))
    if mailbox:
        keywords.append('%s:mailbox' % mailbox.lower())
    keywords.append('%s:hprint' % HeaderPrint(msg))

    for key in msg.keys():
        key_lower = key.lower()
        if key_lower in BORING_HEADERS:
            continue
        # Decode and lowercase the header once; both tokenisers share it.
        value = self.hdr(msg, key).lower()
        emails = ExtractEmails(value)
        words = set(re.findall(WORD_REGEXP, value))
        words -= STOPLIST
        keywords.extend(['%s:%s' % (t, key_lower) for t in words])
        keywords.extend(['%s:%s' % (e, key_lower) for e in emails])
        keywords.extend(['%s:email' % e for e in emails])
        if 'list' in key_lower:
            # Any header whose name contains "list" also feeds ":list".
            keywords.extend(['%s:list' % t for t in words])

    for extract in plugins.get_meta_kw_extractors():
        keywords.extend(extract(self, msg_mid, msg, msg_date))

    return (set(keywords) - STOPLIST)
def message_keywords(self, session, msg_mid, msg_id, msg, msg_date,
                     mailbox=None):
    """Collect the search keywords for a single message.

    Every part yielded by ``msg.walk()`` is tokenised (text bodies and
    attachment file names) and offered to the text and data keyword
    extractors registered with ``plugins``.  Id, subject, sender,
    mailbox, header-print and per-header keywords (including e-mail
    addresses found in headers) are then added, followed by the output
    of the meta keyword extractors.

    Args:
        session: Only used for ``session.ui.warning`` when HTML parsing
            fails.
        msg_mid: Message identifier; used in that warning text and passed
            to the meta keyword extractors.
        msg_id: Emitted as the ``<msg_id>:id`` keyword (and used in the
            warning text).
        msg: Message object providing ``walk()`` and ``keys()``
            (presumably an ``email.message.Message`` -- confirm with
            callers).
        msg_date: Passed through unchanged to the meta keyword extractors.
        mailbox: Optional mailbox name; when truthy it is lowercased and
            emitted as ``<mailbox>:mailbox``.

    Returns:
        A set of keyword strings with the members of STOPLIST removed.
    """
    keywords = []
    # One-element list used as a mutable cell: the nested loader rebinds
    # the cached payload through it without needing ``nonlocal``.
    payload = [None]
    for part in msg.walk():
        textpart = payload[0] = None
        ctype = part.get_content_type()
        charset = part.get_charset() or 'iso-8859-1'

        def _loader(p):
            # Decode the part's payload on first use and cache it.
            if payload[0] is None:
                payload[0] = self.try_decode(p.get_payload(None, True),
                                             charset)
            return payload[0]

        if ctype == 'text/plain':
            textpart = _loader(part)
        elif ctype == 'text/html':
            html = _loader(part)
            if len(html) > 3:
                try:
                    textpart = lxml.html.fromstring(html).text_content()
                except Exception:
                    # Unparseable HTML: warn and index the raw markup
                    # instead.  Not a bare ``except`` so that
                    # KeyboardInterrupt/SystemExit still propagate.
                    session.ui.warning('=%s/%s has bogus HTML.'
                                       % (msg_mid, msg_id))
                    textpart = html
            else:
                # Too short to be worth handing to the HTML parser.
                textpart = html
        elif 'pgp' in ctype:
            keywords.append('pgp:has')

        att = part.get_filename()
        if att:
            att = self.try_decode(att, charset)
            keywords.append('attachment:has')
            keywords.extend(
                [t + ':att' for t in re.findall(WORD_REGEXP, att.lower())])
            # The file name is also indexed as ordinary text below.
            textpart = (textpart or '') + ' ' + att

        if textpart:
            # FIXME: Does this lowercase non-ASCII characters correctly?
            # FIXME: What about encrypted content?
            keywords.extend(re.findall(WORD_REGEXP, textpart.lower()))
            # FIXME: Do this better.
            if '-----BEGIN PGP' in textpart and '-----END PGP' in textpart:
                keywords.append('pgp:has')
            # NOTE(review): the callable reads ``textpart`` lazily, so it
            # is only meaningful while this part is being processed.
            for extract in plugins.get_text_kw_extractors():
                keywords.extend(extract(self, msg, ctype, lambda: textpart))

        # Data extractors see every part; the payload is only decoded if
        # an extractor actually calls the loader.
        for extract in plugins.get_data_kw_extractors():
            keywords.extend(
                extract(self, msg, ctype, att, part, lambda: _loader(part)))

    keywords.append('%s:id' % msg_id)
    keywords.extend(
        re.findall(WORD_REGEXP, self.hdr(msg, 'subject').lower()))
    keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, 'from').lower()))
    if mailbox:
        keywords.append('%s:mailbox' % mailbox.lower())
    keywords.append('%s:hprint' % HeaderPrint(msg))

    for key in msg.keys():
        key_lower = key.lower()
        if key_lower in BORING_HEADERS:
            continue
        # Decode and lowercase the header once; both tokenisers share it.
        value = self.hdr(msg, key).lower()
        emails = ExtractEmails(value)
        words = set(re.findall(WORD_REGEXP, value))
        words -= STOPLIST
        keywords.extend(['%s:%s' % (t, key_lower) for t in words])
        keywords.extend(['%s:%s' % (e, key_lower) for e in emails])
        keywords.extend(['%s:email' % e for e in emails])
        if 'list' in key_lower:
            # Any header whose name contains "list" also feeds ":list".
            keywords.extend(['%s:list' % t for t in words])

    for extract in plugins.get_meta_kw_extractors():
        keywords.extend(extract(self, msg_mid, msg, msg_date))

    return (set(keywords) - STOPLIST)
def index_message(self, session, msg_mid, msg_id, msg, msg_date,
                  mailbox=None, compact=True, filter_hooks=None):
    """Extract a message's keywords and append it to their posting lists.

    Text bodies, attachment file names, the message date, id, subject,
    sender, mailbox and non-boring headers are turned into keywords.
    The keyword set is passed through ``filter_hooks`` and then every
    keyword is registered via ``PostingList.Append``.

    Args:
        session: Passed to the hooks and to ``PostingList.Append``; also
            used for ``session.ui.warning`` when HTML parsing fails.
        msg_mid: Message identifier stored in the posting lists.
        msg_id: Emitted as the ``<msg_id>:id`` keyword.
        msg: Message object providing ``walk()`` and ``keys()``
            (presumably an ``email.message.Message`` -- confirm with
            callers).
        msg_date: Timestamp passed to ``datetime.date.fromtimestamp``.
        mailbox: Optional mailbox name; when truthy it is lowercased and
            emitted as ``<mailbox>:mailbox``.
        compact: Forwarded to ``PostingList.Append``.
        filter_hooks: Optional iterable of callables invoked as
            ``hook(session, msg_mid, msg, keywords)``; each return value
            replaces the keyword collection.  ``None`` means no hooks.

    Returns:
        The keyword collection after all hooks have run.
    """
    # ``None`` replaces the former shared mutable ``[]`` default.
    if filter_hooks is None:
        filter_hooks = ()

    keywords = []
    for part in msg.walk():
        ctype = part.get_content_type()
        charset = part.get_charset() or 'iso-8859-1'

        if ctype == 'text/plain':
            textpart = self.try_decode(part.get_payload(None, True), charset)
        elif ctype == 'text/html':
            payload = self.try_decode(part.get_payload(None, True), charset)
            if len(payload) > 3:
                try:
                    textpart = lxml.html.fromstring(payload).text_content()
                except Exception:
                    # Unparseable HTML: warn and index the raw markup
                    # instead.  Not a bare ``except`` so that
                    # KeyboardInterrupt/SystemExit still propagate.
                    session.ui.warning('=%s/%s has bogus HTML.'
                                       % (msg_mid, msg_id))
                    textpart = payload
            else:
                # Too short to be worth handing to the HTML parser.
                textpart = payload
        else:
            textpart = None

        att = part.get_filename()
        if att:
            att = self.try_decode(att, charset)
            keywords.append('attachment:has')
            keywords.extend([t + ':att'
                             for t in re.findall(WORD_REGEXP, att.lower())])
            # The file name is also indexed as ordinary text below.
            textpart = (textpart or '') + ' ' + att

        if textpart:
            # FIXME: Does this lowercase non-ASCII characters correctly?
            keywords.extend(re.findall(WORD_REGEXP, textpart.lower()))

    mdate = datetime.date.fromtimestamp(msg_date)
    keywords.append('%s:year' % mdate.year)
    keywords.append('%s:month' % mdate.month)
    keywords.append('%s:day' % mdate.day)
    keywords.append('%s-%s-%s:date' % (mdate.year, mdate.month, mdate.day))
    keywords.append('%s:id' % msg_id)
    keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, 'subject').lower()))
    keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, 'from').lower()))
    if mailbox:
        keywords.append('%s:mailbox' % mailbox.lower())

    for key in msg.keys():
        key_lower = key.lower()
        if key_lower in BORING_HEADERS:
            continue
        words = set(re.findall(WORD_REGEXP, self.hdr(msg, key).lower()))
        words -= STOPLIST
        # Each remaining header word becomes "<word>:<header-name>".
        keywords.extend(['%s:%s' % (t, key_lower) for t in words])
        if 'list' in key_lower:
            # Any header whose name contains "list" also feeds ":list".
            keywords.extend(['%s:list' % t for t in words])

    keywords = set(keywords)
    keywords -= STOPLIST

    for hook in filter_hooks:
        keywords = hook(session, msg_mid, msg, keywords)

    for word in keywords:
        try:
            PostingList.Append(session, word, msg_mid, compact=compact)
        except UnicodeDecodeError:
            # FIXME: we just ignore garbage
            # Deliberately best-effort: an undecodable keyword is skipped
            # so that the rest of the message still gets indexed.
            pass

    return keywords