Ejemplo n.º 1
0
  def message_keywords(self, session, msg_mid, msg_id, msg, msg_date,
                       mailbox=None):
    """Build the set of search keywords for a single message.

    Every MIME part yielded by msg.walk() is inspected: text/plain and
    text/html bodies and attachment filenames are tokenized with
    WORD_REGEXP. Date, ID, subject, sender, mailbox, header-print and
    per-header terms are added afterwards.

    Args:
      session: Object whose ui.warning() is used to report unparsable HTML.
      msg_mid: Message identifier; only used in the warning text here.
      msg_id: Message ID, indexed as "<msg_id>:id".
      msg: Message object providing walk() and keys(); presumably an
        email.message.Message (not confirmed by this block).
      msg_date: Timestamp handed to datetime.date.fromtimestamp().
      mailbox: Optional mailbox name, indexed lower-cased as
        "<mailbox>:mailbox".

    Returns:
      A set of keyword strings with the members of STOPLIST removed.
    """
    keywords = []
    for part in msg.walk():
      # Start every part with no text. Carrying the previous part's text
      # over made each later part tokenize it again and let the PGP marker
      # check below match across unrelated parts.
      textpart = None
      ctype = part.get_content_type()
      # NOTE(review): on a stdlib message get_charset() yields a Charset
      # object (or None), not a charset name; confirm try_decode copes.
      charset = part.get_charset() or 'iso-8859-1'

      if ctype == 'text/plain':
        textpart = self.try_decode(part.get_payload(None, True), charset)
      elif ctype == 'text/html':
        payload = self.try_decode(part.get_payload(None, True), charset)
        # Fall back to the raw markup when it is too short to be worth
        # parsing or when lxml cannot parse it.
        textpart = payload
        if len(payload) > 3:
          try:
            textpart = lxml.html.fromstring(payload).text_content()
          except Exception:
            # Not a bare except: Ctrl-C and SystemExit must still propagate.
            session.ui.warning('=%s/%s has bogus HTML.' % (msg_mid, msg_id))
      elif 'pgp' in ctype:
        keywords.append('pgp:has')

      att = part.get_filename()
      if att:
        att = self.try_decode(att, charset)
        keywords.append('attachment:has')
        keywords.extend([t+':att' for t in re.findall(WORD_REGEXP, att.lower())])
        # The filename is also indexed as ordinary text.
        textpart = (textpart or '') + ' ' + att

      if textpart:
        # FIXME: Does this lowercase non-ASCII characters correctly?
        # FIXME: What about encrypted content?
        keywords.extend(re.findall(WORD_REGEXP, textpart.lower()))
        # FIXME: Do this better.
        if '-----BEGIN PGP' in textpart and '-----END PGP' in textpart:
          keywords.append('pgp:has')

    mdate = datetime.date.fromtimestamp(msg_date)
    keywords.append('%s:year' % mdate.year)
    keywords.append('%s:month' % mdate.month)
    keywords.append('%s:day' % mdate.day)
    keywords.append('%s-%s-%s:date' % (mdate.year, mdate.month, mdate.day))
    keywords.append('%s:id' % msg_id)
    keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, 'subject').lower()))
    keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, 'from').lower()))
    if mailbox:
      keywords.append('%s:mailbox' % mailbox.lower())
    keywords.append('%s:hprint' % HeaderPrint(msg))

    # Index the words of every non-boring header as "<word>:<header>".
    for key in msg.keys():
      key_lower = key.lower()
      if key_lower in BORING_HEADERS:
        continue
      words = set(re.findall(WORD_REGEXP, self.hdr(msg, key).lower()))
      words -= STOPLIST
      keywords.extend(['%s:%s' % (t, key_lower) for t in words])
      if 'list' in key_lower:
        # Headers with "list" in their name also feed a shared ":list" term.
        keywords.extend(['%s:list' % t for t in words])

    return (set(keywords) - STOPLIST)
Ejemplo n.º 2
0
  def message_keywords(self, session, msg_mid, msg_id, msg, msg_date,
                       mailbox=None):
    """Build the set of search keywords for a single message.

    Every MIME part yielded by msg.walk() is inspected: text/plain and
    text/html bodies and attachment filenames are tokenized with
    WORD_REGEXP, and the text/data keyword extractors registered in
    `plugins` are run per part. ID, subject, sender, mailbox, header-print,
    per-header and e-mail address terms are added afterwards, followed by
    the output of the meta keyword extractors.

    Args:
      session: Object whose ui.warning() is used to report unparsable HTML.
      msg_mid: Message identifier; used in the warning text and passed to
        the meta keyword extractors.
      msg_id: Message ID, indexed as "<msg_id>:id".
      msg: Message object providing walk() and keys(); presumably an
        email.message.Message (not confirmed by this block).
      msg_date: Passed through unchanged to the meta keyword extractors.
      mailbox: Optional mailbox name, indexed lower-cased as
        "<mailbox>:mailbox".

    Returns:
      A set of keyword strings with the members of STOPLIST removed.
    """
    keywords = []
    # One-element list so the nested loader can rebind the cached payload.
    payload = [None]
    for part in msg.walk():
      # Reset both the cached payload and the text for every part.
      # Previously textpart leaked from one part to the next, so the text
      # extractors were handed an earlier part's text together with the
      # content type of the current (possibly non-text) part.
      textpart = payload[0] = None
      ctype = part.get_content_type()
      # NOTE(review): on a stdlib message get_charset() yields a Charset
      # object (or None), not a charset name; confirm try_decode copes.
      charset = part.get_charset() or 'iso-8859-1'

      def _loader(p):
        # Decode the payload at most once per part, on first request.
        if payload[0] is None:
          payload[0] = self.try_decode(p.get_payload(None, True), charset)
        return payload[0]

      if ctype == 'text/plain':
        textpart = _loader(part)
      elif ctype == 'text/html':
        html = _loader(part)
        # Fall back to the raw markup when it is too short to be worth
        # parsing or when lxml cannot parse it.
        textpart = html
        if len(html) > 3:
          try:
            textpart = lxml.html.fromstring(html).text_content()
          except Exception:
            # Not a bare except: Ctrl-C and SystemExit must still propagate.
            session.ui.warning('=%s/%s has bogus HTML.' % (msg_mid, msg_id))
      elif 'pgp' in ctype:
        keywords.append('pgp:has')

      att = part.get_filename()
      if att:
        att = self.try_decode(att, charset)
        keywords.append('attachment:has')
        keywords.extend([t+':att' for t in re.findall(WORD_REGEXP, att.lower())])
        # The filename is also indexed as ordinary text.
        textpart = (textpart or '') + ' ' + att

      if textpart:
        # FIXME: Does this lowercase non-ASCII characters correctly?
        # FIXME: What about encrypted content?
        keywords.extend(re.findall(WORD_REGEXP, textpart.lower()))
        # FIXME: Do this better.
        if '-----BEGIN PGP' in textpart and '-----END PGP' in textpart:
          keywords.append('pgp:has')
        for extract in plugins.get_text_kw_extractors():
          keywords.extend(extract(self, msg, ctype, lambda: textpart))

      # Data extractors run for every part and may load the payload lazily.
      for extract in plugins.get_data_kw_extractors():
        keywords.extend(extract(self, msg, ctype, att, part,
                                lambda: _loader(part)))

    keywords.append('%s:id' % msg_id)
    keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, 'subject').lower()))
    keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, 'from').lower()))
    if mailbox:
      keywords.append('%s:mailbox' % mailbox.lower())
    keywords.append('%s:hprint' % HeaderPrint(msg))

    # Index words and e-mail addresses of every non-boring header.
    for key in msg.keys():
      key_lower = key.lower()
      if key_lower in BORING_HEADERS:
        continue
      # Fetch and lower-case the header once; it feeds both extractions.
      value = self.hdr(msg, key).lower()
      emails = ExtractEmails(value)
      words = set(re.findall(WORD_REGEXP, value))
      words -= STOPLIST
      keywords.extend(['%s:%s' % (t, key_lower) for t in words])
      keywords.extend(['%s:%s' % (e, key_lower) for e in emails])
      keywords.extend(['%s:email' % e for e in emails])
      if 'list' in key_lower:
        # Headers with "list" in their name also feed a shared ":list" term.
        keywords.extend(['%s:list' % t for t in words])

    for extract in plugins.get_meta_kw_extractors():
      keywords.extend(extract(self, msg_mid, msg, msg_date))

    return (set(keywords) - STOPLIST)
Ejemplo n.º 3
0
    def message_keywords(self,
                         session,
                         msg_mid,
                         msg_id,
                         msg,
                         msg_date,
                         mailbox=None):
        """Build the set of search keywords for a single message.

        Every MIME part yielded by msg.walk() is inspected: text/plain and
        text/html bodies and attachment filenames are tokenized with
        WORD_REGEXP, and the text/data keyword extractors registered in
        `plugins` are run per part. ID, subject, sender, mailbox,
        header-print, per-header and e-mail address terms are added
        afterwards, followed by the output of the meta keyword extractors.

        Args:
            session: Object whose ui.warning() reports unparsable HTML.
            msg_mid: Message identifier; used in the warning text and
                passed to the meta keyword extractors.
            msg_id: Message ID, indexed as "<msg_id>:id".
            msg: Message object providing walk() and keys(); presumably an
                email.message.Message (not confirmed by this block).
            msg_date: Passed through unchanged to the meta keyword
                extractors.
            mailbox: Optional mailbox name, indexed lower-cased as
                "<mailbox>:mailbox".

        Returns:
            A set of keyword strings with the members of STOPLIST removed.
        """
        keywords = []
        # One-element list so the nested loader can rebind the cached payload.
        payload = [None]
        for part in msg.walk():
            # Each part starts with no text and no cached payload.
            textpart = payload[0] = None
            ctype = part.get_content_type()
            # NOTE(review): on a stdlib message get_charset() yields a
            # Charset object (or None), not a charset name; confirm
            # try_decode copes with that.
            charset = part.get_charset() or 'iso-8859-1'

            def _loader(p):
                # Decode the payload at most once per part, on first request.
                if payload[0] is None:
                    payload[0] = self.try_decode(p.get_payload(None, True),
                                                 charset)
                return payload[0]

            if ctype == 'text/plain':
                textpart = _loader(part)
            elif ctype == 'text/html':
                html = _loader(part)
                # Fall back to the raw markup when it is too short to be
                # worth parsing or when lxml cannot parse it.
                textpart = html
                if len(html) > 3:
                    try:
                        textpart = lxml.html.fromstring(html).text_content()
                    except Exception:
                        # Not a bare except: Ctrl-C and SystemExit must
                        # still propagate.
                        session.ui.warning('=%s/%s has bogus HTML.' %
                                           (msg_mid, msg_id))
            elif 'pgp' in ctype:
                keywords.append('pgp:has')

            att = part.get_filename()
            if att:
                att = self.try_decode(att, charset)
                keywords.append('attachment:has')
                keywords.extend(
                    [t + ':att' for t in re.findall(WORD_REGEXP, att.lower())])
                # The filename is also indexed as ordinary text.
                textpart = (textpart or '') + ' ' + att

            if textpart:
                # FIXME: Does this lowercase non-ASCII characters correctly?
                # FIXME: What about encrypted content?
                keywords.extend(re.findall(WORD_REGEXP, textpart.lower()))
                # FIXME: Do this better.
                if '-----BEGIN PGP' in textpart and '-----END PGP' in textpart:
                    keywords.append('pgp:has')
                for extract in plugins.get_text_kw_extractors():
                    keywords.extend(extract(self, msg, ctype,
                                            lambda: textpart))

            # Data extractors run for every part and may load the payload
            # lazily through the callback.
            for extract in plugins.get_data_kw_extractors():
                keywords.extend(
                    extract(self, msg, ctype, att, part,
                            lambda: _loader(part)))

        keywords.append('%s:id' % msg_id)
        keywords.extend(
            re.findall(WORD_REGEXP,
                       self.hdr(msg, 'subject').lower()))
        keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, 'from').lower()))
        if mailbox:
            keywords.append('%s:mailbox' % mailbox.lower())
        keywords.append('%s:hprint' % HeaderPrint(msg))

        # Index words and e-mail addresses of every non-boring header.
        for key in msg.keys():
            key_lower = key.lower()
            if key_lower in BORING_HEADERS:
                continue
            # Fetch and lower-case the header once; it feeds both extractions.
            value = self.hdr(msg, key).lower()
            emails = ExtractEmails(value)
            words = set(re.findall(WORD_REGEXP, value))
            words -= STOPLIST
            keywords.extend(['%s:%s' % (t, key_lower) for t in words])
            keywords.extend(['%s:%s' % (e, key_lower) for e in emails])
            keywords.extend(['%s:email' % e for e in emails])
            if 'list' in key_lower:
                # Headers with "list" in their name also feed a shared
                # ":list" term.
                keywords.extend(['%s:list' % t for t in words])

        for extract in plugins.get_meta_kw_extractors():
            keywords.extend(extract(self, msg_mid, msg, msg_date))

        return (set(keywords) - STOPLIST)
Ejemplo n.º 4
0
  def index_message(self, session, msg_mid, msg_id, msg, msg_date,
                    mailbox=None, compact=True, filter_hooks=None):
    """Extract a message's keywords and append it to their posting lists.

    Text/plain and text/html bodies and attachment filenames of every MIME
    part are tokenized with WORD_REGEXP; date, ID, subject, sender, mailbox
    and per-header terms are added. The resulting set is passed through
    each filter hook and every remaining keyword is recorded with
    PostingList.Append().

    Args:
      session: Passed to the filter hooks and PostingList.Append(); its
        ui.warning() is used to report unparsable HTML.
      msg_mid: Message identifier stored in the posting lists.
      msg_id: Message ID, indexed as "<msg_id>:id".
      msg: Message object providing walk() and keys(); presumably an
        email.message.Message (not confirmed by this block).
      msg_date: Timestamp handed to datetime.date.fromtimestamp().
      mailbox: Optional mailbox name, indexed lower-cased as
        "<mailbox>:mailbox".
      compact: Forwarded to PostingList.Append() as `compact`.
      filter_hooks: Optional iterable of callables invoked as
        hook(session, msg_mid, msg, keywords); each return value replaces
        the keyword collection. None (the default) means no hooks.

    Returns:
      The keyword collection after all filter hooks have run.
    """
    # None instead of a shared mutable [] default argument.
    if filter_hooks is None:
      filter_hooks = []

    keywords = []
    for part in msg.walk():
      textpart = None
      ctype = part.get_content_type()
      # NOTE(review): on a stdlib message get_charset() yields a Charset
      # object (or None), not a charset name; confirm try_decode copes.
      charset = part.get_charset() or 'iso-8859-1'

      if ctype == 'text/plain':
        textpart = self.try_decode(part.get_payload(None, True), charset)
      elif ctype == 'text/html':
        payload = self.try_decode(part.get_payload(None, True), charset)
        # Fall back to the raw markup when it is too short to be worth
        # parsing or when lxml cannot parse it.
        textpart = payload
        if len(payload) > 3:
          try:
            textpart = lxml.html.fromstring(payload).text_content()
          except Exception:
            # Not a bare except: Ctrl-C and SystemExit must still propagate.
            session.ui.warning('=%s/%s has bogus HTML.' % (msg_mid, msg_id))

      att = part.get_filename()
      if att:
        att = self.try_decode(att, charset)
        keywords.append('attachment:has')
        keywords.extend([t+':att' for t in re.findall(WORD_REGEXP, att.lower())])
        # The filename is also indexed as ordinary text.
        textpart = (textpart or '') + ' ' + att

      if textpart:
        # FIXME: Does this lowercase non-ASCII characters correctly?
        keywords.extend(re.findall(WORD_REGEXP, textpart.lower()))

    mdate = datetime.date.fromtimestamp(msg_date)
    keywords.append('%s:year' % mdate.year)
    keywords.append('%s:month' % mdate.month)
    keywords.append('%s:day' % mdate.day)
    keywords.append('%s-%s-%s:date' % (mdate.year, mdate.month, mdate.day))
    keywords.append('%s:id' % msg_id)
    keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, 'subject').lower()))
    keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, 'from').lower()))
    if mailbox:
      keywords.append('%s:mailbox' % mailbox.lower())

    # Index the words of every non-boring header as "<word>:<header>".
    for key in msg.keys():
      key_lower = key.lower()
      if key_lower in BORING_HEADERS:
        continue
      words = set(re.findall(WORD_REGEXP, self.hdr(msg, key).lower()))
      words -= STOPLIST
      keywords.extend(['%s:%s' % (t, key_lower) for t in words])
      if 'list' in key_lower:
        # Headers with "list" in their name also feed a shared ":list" term.
        keywords.extend(['%s:list' % t for t in words])

    keywords = set(keywords)
    keywords -= STOPLIST

    # Hooks may add, drop or rewrite keywords; each sees the previous result.
    for hook in filter_hooks:
      keywords = hook(session, msg_mid, msg, keywords)

    for word in keywords:
      try:
        PostingList.Append(session, word, msg_mid, compact=compact)
      except UnicodeDecodeError:
        # FIXME: we just ignore garbage
        # Deliberate best effort: an undecodable keyword is skipped so the
        # rest of the message still gets indexed.
        pass

    return keywords