Ejemplo n.º 1
0
    def read_message(self, session, msg_mid, msg_id, msg, msg_size, msg_ts,
                     mailbox=None):
        """Extract index keywords and a text snippet from a parsed message.

        Every MIME part of `msg` is visited: text/plain and text/html bodies
        and attachment filenames are split into words, and the registered
        text/data keyword extractors are run on each part. Header-derived
        keywords and the meta keyword extractors are applied afterwards.

        Arguments:
            session: supplies `ui.warning()` and `config.prefs`.
            msg_mid: message identifier; used in warnings and handed to the
                     meta keyword extractors.
            msg_id: identifier indexed as the `<msg_id>:id` keyword.
            msg: the parsed message (needs `walk()`, `keys()`, item lookup).
            msg_size, msg_ts: handed unchanged to the meta keyword extractors.
            mailbox: optional mailbox name, indexed as `<mailbox>:mailbox`.

        Returns:
            A `(keywords, snippet)` tuple: the set of collected keywords with
            STOPLIST removed, and the snippet text with newlines and tabs
            turned into spaces, carriage returns dropped, and ends stripped.

        Side effects:
            When a 'crypto:has' keyword was collected, `msg.signature_info`
            and `msg.encryption_info` are set from the message tree.
        """
        keywords = []
        snippet = ''
        # A one-element list lets the nested _loader() update the cached
        # payload without needing `nonlocal`.
        payload = [None]
        for part in msg.walk():
            textpart = payload[0] = None
            ctype = part.get_content_type()
            # get_content_charset() reads the charset parameter from the
            # part's Content-Type header. get_charset() only reports a
            # Charset object attached with set_charset(), which is normally
            # absent on parsed messages, so it would nearly always fall
            # through to the iso-8859-1 default.
            charset = part.get_content_charset() or 'iso-8859-1'

            def _loader(p):
                # Decode the payload at most once per part; the data keyword
                # extractors below may ask for it again.
                if payload[0] is None:
                    payload[0] = self.try_decode(p.get_payload(None, True),
                                                 charset)
                return payload[0]

            if ctype == 'text/plain':
                textpart = _loader(part)
            elif ctype == 'text/html':
                html = _loader(part)
                if len(html) > 3:
                    try:
                        textpart = lxml.html.fromstring(html).text_content()
                    except Exception:
                        # Unparseable markup: warn and index the raw HTML.
                        # Only Exception is caught so KeyboardInterrupt and
                        # SystemExit still propagate.
                        session.ui.warning(_('=%s/%s has bogus HTML.'
                                             ) % (msg_mid, msg_id))
                        textpart = html
                else:
                    # Very short payloads are used verbatim, unparsed.
                    textpart = html
            elif 'pgp' in ctype:
                keywords.append('pgp:has')

            att = part.get_filename()
            if att:
                att = self.try_decode(att, charset)
                keywords.append('attachment:has')
                keywords.extend([t + ':att' for t
                                 in re.findall(WORD_REGEXP, att.lower())])
                # The filename is indexed together with the part's text.
                textpart = (textpart or '') + ' ' + att

            if textpart:
                # FIXME: Does this lowercase non-ASCII characters correctly?
                keywords.extend(re.findall(WORD_REGEXP, textpart.lower()))

                # NOTE: As a side effect here, the cryptostate plugin will
                #       add a 'crypto:has' keyword which we check for below
                #       before performing further processing.
                for kwe in plugins.get_text_kw_extractors():
                    keywords.extend(kwe(self, msg, ctype, textpart))

                # Whole parts are appended until the snippet has reached
                # 1024 characters, so the result may be longer than that.
                if len(snippet) < 1024:
                    snippet += ' ' + textpart

            for extract in plugins.get_data_kw_extractors():
                keywords.extend(extract(self, msg, ctype, att, part,
                                        lambda: _loader(part)))

        if 'crypto:has' in keywords:
            e = Email(self, -1)
            e.msg_parsed = msg
            e.msg_info = self.BOGUS_METADATA[:]
            tree = e.get_message_tree(want=(e.WANT_MSG_TREE_PGP +
                                            ('text_parts', )))

            # Look for inline PGP parts, update our status if found
            e.evaluate_pgp(tree, decrypt=session.config.prefs.index_encrypted)
            msg.signature_info = tree['crypto']['signature']
            msg.encryption_info = tree['crypto']['encryption']

            # Index the contents, if configured to do so
            if session.config.prefs.index_encrypted:
                for text in [t['data'] for t in tree['text_parts']]:
                    keywords.extend(re.findall(WORD_REGEXP, text.lower()))
                    for kwe in plugins.get_text_kw_extractors():
                        keywords.extend(kwe(self, msg, 'text/plain', text))

        keywords.append('%s:id' % msg_id)
        keywords.extend(re.findall(WORD_REGEXP,
                                   self.hdr(msg, 'subject').lower()))
        keywords.extend(re.findall(WORD_REGEXP,
                                   self.hdr(msg, 'from').lower()))
        if mailbox:
            keywords.append('%s:mailbox' % mailbox.lower())
        keywords.append('%s:hp' % HeaderPrint(msg))

        for key in msg.keys():
            key_lower = key.lower()
            if key_lower in BORING_HEADERS:
                continue
            # Decode and lowercase the header once; it feeds both the
            # e-mail address extraction and the word extraction.
            value = self.hdr(msg, key).lower()
            emails = ExtractEmails(value)
            words = set(re.findall(WORD_REGEXP, value))
            words -= STOPLIST
            keywords.extend(['%s:%s' % (t, key_lower) for t in words])
            keywords.extend(['%s:%s' % (a, key_lower) for a in emails])
            keywords.extend(['%s:email' % a for a in emails])
            if 'list' in key_lower:
                keywords.extend(['%s:list' % t for t in words])

        # Flag expected headers that are absent or empty.
        for key in EXPECTED_HEADERS:
            if not msg[key]:
                keywords.append('missing:%s' % key)

        for extract in plugins.get_meta_kw_extractors():
            keywords.extend(extract(self, msg_mid, msg, msg_size, msg_ts))

        snippet = snippet.replace('\n', ' '
                                  ).replace('\t', ' ').replace('\r', '')
        return (set(keywords) - STOPLIST), snippet.strip()
Ejemplo n.º 2
0
    def read_message(self, session, msg_mid, msg_id, msg, msg_size, msg_ts,
                     mailbox=None):
        """Extract index keywords and a text snippet from a parsed message.

        Every MIME part of `msg` is visited: text/plain and text/html bodies
        and attachment filenames are split into words, inline PGP armour is
        flagged, and the registered text/data keyword extractors are run on
        each part. Header-derived keywords and the meta keyword extractors
        are applied afterwards.

        Arguments:
            session: supplies `ui.warning()` and `config.prefs`.
            msg_mid: message identifier; used in warnings and handed to the
                     meta keyword extractors.
            msg_id: identifier indexed as the `<msg_id>:id` keyword.
            msg: the parsed message (needs `walk()`, `keys()`, item lookup).
            msg_size, msg_ts: handed unchanged to the meta keyword extractors.
            mailbox: optional mailbox name, indexed as `<mailbox>:mailbox`.

        Returns:
            A `(keywords, snippet)` tuple: the set of collected keywords with
            STOPLIST removed, and the snippet text with newlines and tabs
            turned into spaces, carriage returns dropped, and ends stripped.
        """
        keywords = []
        snippet = ''
        # A one-element list lets the nested _loader() update the cached
        # payload without needing `nonlocal`.
        payload = [None]
        for part in msg.walk():
            textpart = payload[0] = None
            ctype = part.get_content_type()
            # get_content_charset() reads the charset parameter from the
            # part's Content-Type header. get_charset() only reports a
            # Charset object attached with set_charset(), which is normally
            # absent on parsed messages, so it would nearly always fall
            # through to the iso-8859-1 default.
            charset = part.get_content_charset() or 'iso-8859-1'

            def _loader(p):
                # Decode the payload at most once per part; the data keyword
                # extractors below may ask for it again.
                if payload[0] is None:
                    payload[0] = self.try_decode(p.get_payload(None, True),
                                                 charset)
                return payload[0]

            if ctype == 'text/plain':
                textpart = _loader(part)
            elif ctype == 'text/html':
                html = _loader(part)
                if len(html) > 3:
                    try:
                        textpart = lxml.html.fromstring(html).text_content()
                    except Exception:
                        # Unparseable markup: warn and index the raw HTML.
                        # Only Exception is caught so KeyboardInterrupt and
                        # SystemExit still propagate.
                        session.ui.warning(_('=%s/%s has bogus HTML.'
                                             ) % (msg_mid, msg_id))
                        textpart = html
                else:
                    # Very short payloads are used verbatim, unparsed.
                    textpart = html
            elif 'pgp' in ctype:
                keywords.append('pgp:has')

            att = part.get_filename()
            if att:
                att = self.try_decode(att, charset)
                keywords.append('attachment:has')
                keywords.extend([t + ':att' for t
                                 in re.findall(WORD_REGEXP, att.lower())])
                # The filename is indexed together with the part's text.
                textpart = (textpart or '') + ' ' + att

            if textpart:
                # FIXME: Does this lowercase non-ASCII characters correctly?
                # FIXME: What about encrypted content?
                # FIXME: Do this better.
                # Inline PGP detection: text holding both armour markers is
                # tagged, and sub-classified as encrypted or signed.
                if ('-----BEGIN PGP' in textpart and
                        '-----END PGP' in textpart):
                    keywords.append('pgp:has')
                    if '-----BEGIN PGP ENCRYPTED' in textpart:
                        keywords.append('pgp-encrypted-text:has')
                    else:
                        keywords.append('pgp-signed-text:has')
                keywords.extend(re.findall(WORD_REGEXP, textpart.lower()))
                for extract in plugins.get_text_kw_extractors():
                    keywords.extend(extract(self, msg, ctype,
                                            lambda: textpart))

                # Whole parts are appended until the snippet has reached
                # 1024 characters, so the result may be longer than that.
                if len(snippet) < 1024:
                    snippet += ' ' + textpart

            for extract in plugins.get_data_kw_extractors():
                keywords.extend(extract(self, msg, ctype, att, part,
                                        lambda: _loader(part)))

        if (session.config.prefs.index_encrypted and
                'pgp-encrypted-text:has' in keywords):
            # Build a throwaway Email around the parsed message to obtain its
            # text parts (presumably decrypted by get_message_tree -- TODO
            # confirm) and index those as well.
            e = Email(None, -1)
            e.msg_parsed = msg
            e.msg_info = [''] * self.MSG_FIELDS_V2
            tree = e.get_message_tree(want=['text_parts'])
            for text in [t['data'] for t in tree['text_parts']]:
                keywords.extend(re.findall(WORD_REGEXP, text.lower()))
                for extract in plugins.get_text_kw_extractors():
                    keywords.extend(extract(self, msg, 'text/plain',
                                            lambda: text))

        keywords.append('%s:id' % msg_id)
        keywords.extend(re.findall(WORD_REGEXP,
                                   self.hdr(msg, 'subject').lower()))
        keywords.extend(re.findall(WORD_REGEXP,
                                   self.hdr(msg, 'from').lower()))
        if mailbox:
            keywords.append('%s:mailbox' % mailbox.lower())
        keywords.append('%s:hp' % HeaderPrint(msg))

        for key in msg.keys():
            key_lower = key.lower()
            if key_lower in BORING_HEADERS:
                continue
            # Decode and lowercase the header once; it feeds both the
            # e-mail address extraction and the word extraction.
            value = self.hdr(msg, key).lower()
            emails = ExtractEmails(value)
            words = set(re.findall(WORD_REGEXP, value))
            words -= STOPLIST
            keywords.extend(['%s:%s' % (t, key_lower) for t in words])
            keywords.extend(['%s:%s' % (a, key_lower) for a in emails])
            keywords.extend(['%s:email' % a for a in emails])
            if 'list' in key_lower:
                keywords.extend(['%s:list' % t for t in words])

        # Flag expected headers that are absent or empty.
        for key in EXPECTED_HEADERS:
            if not msg[key]:
                keywords.append('missing:%s' % key)

        for extract in plugins.get_meta_kw_extractors():
            keywords.extend(extract(self, msg_mid, msg, msg_size, msg_ts))

        snippet = snippet.replace('\n', ' '
                                  ).replace('\t', ' ').replace('\r', '')
        return (set(keywords) - STOPLIST), snippet.strip()
Ejemplo n.º 3
0
  def message_keywords(self, session, msg_mid, msg_id, msg, msg_date,
                       mailbox=None):
    """Collect the index keywords for a parsed message.

    Every MIME part of `msg` is visited: text/plain and text/html bodies and
    attachment filenames are split into words, inline PGP armour is flagged,
    and the registered text/data keyword extractors are run on each part.
    Header-derived keywords and the meta keyword extractors come last.

    Arguments:
      session: supplies `ui.warning()`.
      msg_mid: message identifier; used in warnings and handed to the meta
               keyword extractors.
      msg_id: identifier indexed as the `<msg_id>:id` keyword.
      msg: the parsed message (needs `walk()` and `keys()`).
      msg_date: handed unchanged to the meta keyword extractors.
      mailbox: optional mailbox name, indexed as `<mailbox>:mailbox`.

    Returns:
      The set of collected keywords with STOPLIST removed.
    """
    keywords = []
    # A one-element list lets the nested _loader() update the cached payload
    # without needing `nonlocal`.
    payload = [None]
    for part in msg.walk():
      # Reset per part: otherwise the text of an earlier part would be
      # indexed again for (and mixed into) every later non-text part.
      textpart = payload[0] = None
      ctype = part.get_content_type()
      # get_content_charset() reads the charset parameter from the part's
      # Content-Type header. get_charset() only reports a Charset object
      # attached with set_charset(), which is normally absent on parsed
      # messages, so it would nearly always fall through to the default.
      charset = part.get_content_charset() or 'iso-8859-1'

      def _loader(p):
        # Decode the payload at most once per part; the data keyword
        # extractors below may ask for it again.
        if payload[0] is None:
          payload[0] = self.try_decode(p.get_payload(None, True), charset)
        return payload[0]

      if ctype == 'text/plain':
        textpart = _loader(part)
      elif ctype == 'text/html':
        html = _loader(part)
        if len(html) > 3:
          try:
            textpart = lxml.html.fromstring(html).text_content()
          except Exception:
            # Unparseable markup: warn and index the raw HTML. Only
            # Exception is caught so KeyboardInterrupt/SystemExit propagate.
            session.ui.warning('=%s/%s has bogus HTML.' % (msg_mid, msg_id))
            textpart = html
        else:
          # Very short payloads are used verbatim, unparsed.
          textpart = html
      elif 'pgp' in ctype:
        keywords.append('pgp:has')

      att = part.get_filename()
      if att:
        att = self.try_decode(att, charset)
        keywords.append('attachment:has')
        keywords.extend([t+':att' for t in re.findall(WORD_REGEXP, att.lower())])
        # The filename is indexed together with the part's text.
        textpart = (textpart or '') + ' ' + att

      if textpart:
        # FIXME: Does this lowercase non-ASCII characters correctly?
        # FIXME: What about encrypted content?
        keywords.extend(re.findall(WORD_REGEXP, textpart.lower()))
        # FIXME: Do this better.
        if '-----BEGIN PGP' in textpart and '-----END PGP' in textpart:
          keywords.append('pgp:has')
        for extract in plugins.get_text_kw_extractors():
          keywords.extend(extract(self, msg, ctype, lambda: textpart))

      for extract in plugins.get_data_kw_extractors():
        keywords.extend(extract(self, msg, ctype, att, part,
                                lambda: _loader(part)))

    keywords.append('%s:id' % msg_id)
    keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, 'subject').lower()))
    keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, 'from').lower()))
    if mailbox:
      keywords.append('%s:mailbox' % mailbox.lower())
    keywords.append('%s:hprint' % HeaderPrint(msg))

    for key in msg.keys():
      key_lower = key.lower()
      if key_lower in BORING_HEADERS:
        continue
      # Decode and lowercase the header once; it feeds both the e-mail
      # address extraction and the word extraction.
      value = self.hdr(msg, key).lower()
      emails = ExtractEmails(value)
      words = set(re.findall(WORD_REGEXP, value))
      words -= STOPLIST
      keywords.extend(['%s:%s' % (t, key_lower) for t in words])
      keywords.extend(['%s:%s' % (a, key_lower) for a in emails])
      keywords.extend(['%s:email' % a for a in emails])
      if 'list' in key_lower:
        keywords.extend(['%s:list' % t for t in words])

    for extract in plugins.get_meta_kw_extractors():
      keywords.extend(extract(self, msg_mid, msg, msg_date))

    return (set(keywords) - STOPLIST)
Ejemplo n.º 4
0
    def read_message(self,
                     session,
                     msg_mid,
                     msg_id,
                     msg,
                     msg_ts,
                     mailbox=None):
        """Extract index keywords and a text snippet from a parsed message.

        Every MIME part of `msg` is visited: text/plain and text/html bodies
        and attachment filenames are split into words, inline PGP armour is
        flagged, and the registered text/data keyword extractors are run on
        each part. Header-derived keywords and the meta keyword extractors
        are applied afterwards.

        Arguments:
            session: supplies `ui.warning()`.
            msg_mid: message identifier; used in warnings and handed to the
                     meta keyword extractors.
            msg_id: identifier indexed as the `<msg_id>:id` keyword.
            msg: the parsed message (needs `walk()` and `keys()`).
            msg_ts: handed unchanged to the meta keyword extractors.
            mailbox: optional mailbox name, indexed as `<mailbox>:mailbox`.

        Returns:
            A `(keywords, snippet)` tuple: the set of collected keywords with
            STOPLIST removed, and the snippet text with newlines and tabs
            turned into spaces, carriage returns dropped, and ends stripped.
        """
        keywords = []
        snippet = ''
        # A one-element list lets the nested _loader() update the cached
        # payload without needing `nonlocal`.
        payload = [None]
        for part in msg.walk():
            textpart = payload[0] = None
            ctype = part.get_content_type()
            # get_content_charset() reads the charset parameter from the
            # part's Content-Type header. get_charset() only reports a
            # Charset object attached with set_charset(), which is normally
            # absent on parsed messages, so it would nearly always fall
            # through to the iso-8859-1 default.
            charset = part.get_content_charset() or 'iso-8859-1'

            def _loader(p):
                # Decode the payload at most once per part; the data keyword
                # extractors below may ask for it again.
                if payload[0] is None:
                    payload[0] = self.try_decode(p.get_payload(None, True),
                                                 charset)
                return payload[0]

            if ctype == 'text/plain':
                textpart = _loader(part)
            elif ctype == 'text/html':
                html = _loader(part)
                if len(html) > 3:
                    try:
                        textpart = lxml.html.fromstring(html).text_content()
                    except Exception:
                        # Unparseable markup: warn and index the raw HTML.
                        # Only Exception is caught so KeyboardInterrupt and
                        # SystemExit still propagate.
                        session.ui.warning(
                            ('=%s/%s has bogus HTML.') % (msg_mid, msg_id))
                        textpart = html
                else:
                    # Very short payloads are used verbatim, unparsed.
                    textpart = html
            elif 'pgp' in ctype:
                keywords.append('pgp:has')

            att = part.get_filename()
            if att:
                att = self.try_decode(att, charset)
                keywords.append('attachment:has')
                keywords.extend(
                    [t + ':att' for t in re.findall(WORD_REGEXP, att.lower())])
                # The filename is indexed together with the part's text.
                textpart = (textpart or '') + ' ' + att

            if textpart:
                # FIXME: Does this lowercase non-ASCII characters correctly?
                # FIXME: What about encrypted content?
                keywords.extend(re.findall(WORD_REGEXP, textpart.lower()))
                # FIXME: Do this better.
                if ('-----BEGIN PGP' in textpart
                        and '-----END PGP' in textpart):
                    keywords.append('pgp:has')
                for extract in plugins.get_text_kw_extractors():
                    keywords.extend(extract(self, msg, ctype,
                                            lambda: textpart))

                # Whole parts are appended until the snippet has reached
                # 1024 characters, so the result may be longer than that.
                if len(snippet) < 1024:
                    snippet += ' ' + textpart

            for extract in plugins.get_data_kw_extractors():
                keywords.extend(
                    extract(self, msg, ctype, att, part,
                            lambda: _loader(part)))

        keywords.append('%s:id' % msg_id)
        keywords.extend(
            re.findall(WORD_REGEXP,
                       self.hdr(msg, 'subject').lower()))
        keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, 'from').lower()))
        if mailbox:
            keywords.append('%s:mailbox' % mailbox.lower())
        keywords.append('%s:hprint' % HeaderPrint(msg))

        for key in msg.keys():
            key_lower = key.lower()
            if key_lower in BORING_HEADERS:
                continue
            # Decode and lowercase the header once; it feeds both the
            # e-mail address extraction and the word extraction.
            value = self.hdr(msg, key).lower()
            emails = ExtractEmails(value)
            words = set(re.findall(WORD_REGEXP, value))
            words -= STOPLIST
            keywords.extend(['%s:%s' % (t, key_lower) for t in words])
            keywords.extend(['%s:%s' % (a, key_lower) for a in emails])
            keywords.extend(['%s:email' % a for a in emails])
            if 'list' in key_lower:
                keywords.extend(['%s:list' % t for t in words])

        for extract in plugins.get_meta_kw_extractors():
            keywords.extend(extract(self, msg_mid, msg, msg_ts))

        snippet = snippet.replace('\n', ' ').replace('\t',
                                                     ' ').replace('\r', '')
        return (set(keywords) - STOPLIST), snippet.strip()
Ejemplo n.º 5
0
    def read_message(self, session, msg_mid, msg_id, msg, msg_size, msg_ts, mailbox=None):
        """Extract index keywords and a text snippet from a parsed message.

        Every MIME part of `msg` is visited: text/plain and text/html bodies
        and attachment filenames are split into words, and the registered
        text/data keyword extractors are run on each part. Header-derived
        keywords and the meta keyword extractors are applied afterwards.

        Arguments:
            session: supplies `ui.warning()` and `config.prefs`.
            msg_mid: message identifier; used in warnings and handed to the
                     meta keyword extractors.
            msg_id: identifier indexed as the `<msg_id>:id` keyword.
            msg: the parsed message (needs `walk()`, `keys()`, item lookup).
            msg_size, msg_ts: handed unchanged to the meta keyword extractors.
            mailbox: optional mailbox name, indexed as `<mailbox>:mailbox`.

        Returns:
            A `(keywords, snippet)` tuple: the set of collected keywords with
            STOPLIST removed, and the snippet text with newlines and tabs
            turned into spaces, carriage returns dropped, and ends stripped.

        Side effects:
            When a "crypto:has" keyword was collected, `msg.signature_info`
            and `msg.encryption_info` are set from the message tree.
        """
        keywords = []
        snippet = ""
        # A one-element list lets the nested _loader() update the cached
        # payload without needing `nonlocal`.
        payload = [None]
        for part in msg.walk():
            textpart = payload[0] = None
            ctype = part.get_content_type()
            charset = part.get_content_charset() or "iso-8859-1"

            def _loader(p):
                # Decode the payload at most once per part; the data keyword
                # extractors below may ask for it again.
                if payload[0] is None:
                    payload[0] = self.try_decode(p.get_payload(None, True), charset)
                return payload[0]

            if ctype == "text/plain":
                textpart = _loader(part)
            elif ctype == "text/html":
                html = _loader(part)
                if len(html) > 3:
                    try:
                        textpart = lxml.html.fromstring(html).text_content()
                    except Exception:
                        # Unparseable markup: warn and index the raw HTML.
                        # Only Exception is caught so KeyboardInterrupt and
                        # SystemExit still propagate.
                        session.ui.warning(_("=%s/%s has bogus HTML.") % (msg_mid, msg_id))
                        textpart = html
                else:
                    # Very short payloads are used verbatim, unparsed.
                    textpart = html
            elif "pgp" in ctype:
                keywords.append("pgp:has")

            att = part.get_filename()
            if att:
                att = self.try_decode(att, charset)
                keywords.append("attachment:has")
                keywords.extend([t + ":att" for t in re.findall(WORD_REGEXP, att.lower())])
                # The filename is indexed together with the part's text.
                textpart = (textpart or "") + " " + att

            if textpart:
                # FIXME: Does this lowercase non-ASCII characters correctly?
                keywords.extend(re.findall(WORD_REGEXP, textpart.lower()))

                # NOTE: As a side effect here, the cryptostate plugin will
                #       add a 'crypto:has' keyword which we check for below
                #       before performing further processing.
                for kwe in plugins.get_text_kw_extractors():
                    keywords.extend(kwe(self, msg, ctype, textpart))

                # Whole parts are appended until the snippet has reached
                # 1024 characters, so the result may be longer than that.
                if len(snippet) < 1024:
                    snippet += " " + textpart

            for extract in plugins.get_data_kw_extractors():
                keywords.extend(extract(self, msg, ctype, att, part, lambda: _loader(part)))

        if "crypto:has" in keywords:
            e = Email(self, -1)
            e.msg_parsed = msg
            e.msg_info = self.BOGUS_METADATA[:]
            tree = e.get_message_tree(want=(e.WANT_MSG_TREE_PGP + ("text_parts",)))

            # Look for inline PGP parts, update our status if found
            e.evaluate_pgp(tree, decrypt=session.config.prefs.index_encrypted)
            msg.signature_info = tree["crypto"]["signature"]
            msg.encryption_info = tree["crypto"]["encryption"]

            # Index the contents, if configured to do so
            if session.config.prefs.index_encrypted:
                for text in [t["data"] for t in tree["text_parts"]]:
                    keywords.extend(re.findall(WORD_REGEXP, text.lower()))
                    for kwe in plugins.get_text_kw_extractors():
                        keywords.extend(kwe(self, msg, "text/plain", text))

        keywords.append("%s:id" % msg_id)
        keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, "subject").lower()))
        keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, "from").lower()))
        if mailbox:
            keywords.append("%s:mailbox" % mailbox.lower())
        keywords.append("%s:hp" % HeaderPrint(msg))

        for key in msg.keys():
            key_lower = key.lower()
            if key_lower in BORING_HEADERS:
                continue
            # Decode and lowercase the header once; it feeds both the
            # e-mail address extraction and the word extraction.
            value = self.hdr(msg, key).lower()
            emails = ExtractEmails(value)
            words = set(re.findall(WORD_REGEXP, value))
            words -= STOPLIST
            keywords.extend(["%s:%s" % (t, key_lower) for t in words])
            keywords.extend(["%s:%s" % (a, key_lower) for a in emails])
            keywords.extend(["%s:email" % a for a in emails])
            if "list" in key_lower:
                keywords.extend(["%s:list" % t for t in words])

        # Flag expected headers that are absent or empty.
        for key in EXPECTED_HEADERS:
            if not msg[key]:
                keywords.append("%s:missing" % key)

        for extract in plugins.get_meta_kw_extractors():
            keywords.extend(extract(self, msg_mid, msg, msg_size, msg_ts))

        snippet = snippet.replace("\n", " ").replace("\t", " ").replace("\r", "")
        return (set(keywords) - STOPLIST), snippet.strip()
Ejemplo n.º 6
0
    def read_message(self,
                     session,
                     msg_mid,
                     msg_id,
                     msg,
                     msg_size,
                     msg_ts,
                     mailbox=None):
        """Extract index keywords and a text snippet from a parsed message.

        Every MIME part of `msg` is visited: text/plain and text/html bodies
        and attachment filenames are split into words, and the registered
        text/data keyword extractors are run on each part. Header-derived
        keywords and the meta keyword extractors are applied afterwards.

        Arguments:
            session: supplies `ui.warning()` and `config.prefs`.
            msg_mid: message identifier; used in warnings and handed to the
                     meta keyword extractors.
            msg_id: identifier indexed as the `<msg_id>:id` keyword.
            msg: the parsed message (needs `walk()`, `keys()`, item lookup).
            msg_size, msg_ts: handed unchanged to the meta keyword extractors.
            mailbox: optional mailbox name, indexed as `<mailbox>:mailbox`.

        Returns:
            A `(keywords, snippet)` tuple: the set of collected keywords with
            STOPLIST removed, and the snippet text with newlines and tabs
            turned into spaces, carriage returns dropped, and ends stripped.

        Side effects:
            When a 'crypto:has' keyword was collected, `msg.signature_info`
            and `msg.encryption_info` are set from the message tree.
        """
        keywords = []
        snippet = ''
        # A one-element list lets the nested _loader() update the cached
        # payload without needing `nonlocal`.
        payload = [None]
        for part in msg.walk():
            textpart = payload[0] = None
            ctype = part.get_content_type()
            charset = part.get_content_charset() or 'iso-8859-1'

            def _loader(p):
                # Decode the payload at most once per part; the data keyword
                # extractors below may ask for it again.
                if payload[0] is None:
                    payload[0] = self.try_decode(p.get_payload(None, True),
                                                 charset)
                return payload[0]

            if ctype == 'text/plain':
                textpart = _loader(part)
            elif ctype == 'text/html':
                html = _loader(part)
                if len(html) > 3:
                    try:
                        textpart = lxml.html.fromstring(html).text_content()
                    except Exception:
                        # Unparseable markup: warn and index the raw HTML.
                        # Only Exception is caught so KeyboardInterrupt and
                        # SystemExit still propagate.
                        session.ui.warning(
                            _('=%s/%s has bogus HTML.') % (msg_mid, msg_id))
                        textpart = html
                else:
                    # Very short payloads are used verbatim, unparsed.
                    textpart = html
            elif 'pgp' in ctype:
                keywords.append('pgp:has')

            att = part.get_filename()
            if att:
                att = self.try_decode(att, charset)
                keywords.append('attachment:has')
                keywords.extend(
                    [t + ':att' for t in re.findall(WORD_REGEXP, att.lower())])
                # The filename is indexed together with the part's text.
                textpart = (textpart or '') + ' ' + att

            if textpart:
                # FIXME: Does this lowercase non-ASCII characters correctly?
                keywords.extend(re.findall(WORD_REGEXP, textpart.lower()))

                # NOTE: As a side effect here, the cryptostate plugin will
                #       add a 'crypto:has' keyword which we check for below
                #       before performing further processing.
                for kwe in plugins.get_text_kw_extractors():
                    keywords.extend(kwe(self, msg, ctype, textpart))

                # Whole parts are appended until the snippet has reached
                # 1024 characters, so the result may be longer than that.
                if len(snippet) < 1024:
                    snippet += ' ' + textpart

            for extract in plugins.get_data_kw_extractors():
                keywords.extend(
                    extract(self, msg, ctype, att, part,
                            lambda: _loader(part)))

        if 'crypto:has' in keywords:
            e = Email(self, -1)
            e.msg_parsed = msg
            e.msg_info = self.BOGUS_METADATA[:]
            tree = e.get_message_tree(want=(e.WANT_MSG_TREE_PGP +
                                            ('text_parts', )))

            # Look for inline PGP parts, update our status if found
            e.evaluate_pgp(tree, decrypt=session.config.prefs.index_encrypted)
            msg.signature_info = tree['crypto']['signature']
            msg.encryption_info = tree['crypto']['encryption']

            # Index the contents, if configured to do so
            if session.config.prefs.index_encrypted:
                for text in [t['data'] for t in tree['text_parts']]:
                    keywords.extend(re.findall(WORD_REGEXP, text.lower()))
                    for kwe in plugins.get_text_kw_extractors():
                        keywords.extend(kwe(self, msg, 'text/plain', text))

        keywords.append('%s:id' % msg_id)
        keywords.extend(
            re.findall(WORD_REGEXP,
                       self.hdr(msg, 'subject').lower()))
        keywords.extend(re.findall(WORD_REGEXP, self.hdr(msg, 'from').lower()))
        if mailbox:
            keywords.append('%s:mailbox' % mailbox.lower())
        keywords.append('%s:hp' % HeaderPrint(msg))

        for key in msg.keys():
            key_lower = key.lower()
            if key_lower in BORING_HEADERS:
                continue
            # Decode and lowercase the header once; it feeds both the
            # e-mail address extraction and the word extraction.
            value = self.hdr(msg, key).lower()
            emails = ExtractEmails(value)
            words = set(re.findall(WORD_REGEXP, value))
            words -= STOPLIST
            keywords.extend(['%s:%s' % (t, key_lower) for t in words])
            keywords.extend(['%s:%s' % (a, key_lower) for a in emails])
            keywords.extend(['%s:email' % a for a in emails])
            if 'list' in key_lower:
                keywords.extend(['%s:list' % t for t in words])

        # Flag expected headers that are absent or empty.
        for key in EXPECTED_HEADERS:
            if not msg[key]:
                keywords.append('%s:missing' % key)

        for extract in plugins.get_meta_kw_extractors():
            keywords.extend(extract(self, msg_mid, msg, msg_size, msg_ts))

        snippet = snippet.replace('\n', ' ').replace('\t',
                                                     ' ').replace('\r', '')
        return (set(keywords) - STOPLIST), snippet.strip()