Example #1
0
    def _get_message_keys(self, messageid):
        """Collect (key-info, raw-keydata) pairs found in a message.

        Looks first at the message's Autocrypt and Autocrypt-Gossip
        headers, then at any attachments that look like PGP keys.
        Results are memoized in self.key_cache per message ID; a
        previously cached non-empty list is returned unchanged.
        """
        found = self.key_cache.get(messageid, [])
        if found:
            return found

        email = Email(self._idx(), messageid)

        # First we check the Autocrypt headers
        msg = email.get_msg(pgpmime='all')
        ac_headers = [extract_autocrypt_header(msg)]
        ac_headers.extend(extract_autocrypt_gossip_headers(msg))
        for ach in ac_headers:
            if 'keydata' not in ach:
                continue
            raw = ach['keydata']
            for keydata in get_keydata(raw,
                                       autocrypt_header=ach,
                                       include_subkeys=False):
                found.append((keydata, raw))

        # Then go looking at the attachments
        tree = email.get_message_tree(want=["attachments"])
        for part in tree["attachments"]:
            if len(found) > 100:  # Just to set some limit...
                break
            if not _might_be_pgp_key(part["filename"], part["mimetype"]):
                continue
            raw = part["part"].get_payload(None, True)
            for keydata in get_keydata(raw, include_subkeys=False):
                found.append((keydata, raw))

        self.key_cache[messageid] = found
        return found
Example #2
0
    def _get_message_keys(self,
                          messageid,
                          autocrypt=True,
                          autocrypt_gossip=True,
                          attachments=True):
        """Collect (MailpileKeyInfo, raw-keydata) pairs from a message.

        Sources are selectable via the keyword flags: the message's
        Autocrypt header, its Autocrypt-Gossip headers, and attachments
        that look like PGP keys.  Results are cached per message ID; a
        cached non-empty list is returned without re-parsing.
        """
        keys = self.key_cache.get(messageid, [])
        if not keys:
            email = Email(self._idx(), messageid)

            # First we check the Autocrypt headers.  Each header is
            # paired with an explicit gossip flag: the previous
            # loop-counter approach (is_gossip = loop_count > 1)
            # mislabeled the first gossip header as non-gossip whenever
            # autocrypt=False suppressed the leading Autocrypt header.
            msg = email.get_msg(pgpmime='all')
            ac_headers = []
            if autocrypt:
                ac_headers.append((False, extract_autocrypt_header(msg)))
            if autocrypt_gossip:
                ac_headers.extend(
                    (True, gh)
                    for gh in extract_autocrypt_gossip_headers(msg))
            for is_gossip, ach in ac_headers:
                if 'keydata' in ach:
                    for keyinfo in get_keyinfo(ach['keydata'],
                                               autocrypt_header=ach,
                                               key_info_class=MailpileKeyInfo):
                        keyinfo.is_autocrypt = True
                        keyinfo.is_gossip = is_gossip
                        keys.append((keyinfo, ach['keydata']))

            # Then go looking at the attachments
            atts = []
            if attachments:
                atts.extend(
                    email.get_message_tree(
                        want=["attachments"])["attachments"])
            for part in atts:
                if len(keys) > 100:  # Just to set some limit...
                    break
                if _might_be_pgp_key(part["filename"], part["mimetype"]):
                    key = part["part"].get_payload(None, True)
                    for keyinfo in get_keyinfo(key,
                                               key_info_class=MailpileKeyInfo):
                        keys.append((keyinfo, key))
            self.key_cache[messageid] = keys
        return keys
Example #3
0
    def _retrain(self, tags=None):
        """Retrain the SpamBayes autotaggers.

        Args:
            tags: optional list of match-tag names to retrain; defaults
                to the match tags of every configured autotagger.

        Returns:
            A success result listing the retrained tag names, the
            indexes of unreadable messages, and how many messages were
            read in total.
        """
        session, config, idx = self.session, self.session.config, self._idx()
        tags = tags or [asb.match_tag for asb in autotag_configs(config)]
        tids = [config.get_tag(t)._key for t in tags if t]

        session.ui.mark(_('Retraining SpamBayes autotaggers'))
        if not config.real_hasattr('autotag'):
            config.real_setattr('autotag', {})

        # Find all the interesting messages! We don't look in the trash,
        # but we do look at interesting spam.
        #
        # Note: By specifically stating that we DON'T want trash, we
        #       disable the search engine's default result suppression
        #       and guarantee these results don't corrupt the somewhat
        #       lame/broken result cache.
        #
        no_trash = ['-in:%s' % t._key for t in config.get_tags(type='trash')]
        # interest maps an "interest type" to the set of message indexes
        # matching tags of that type (trash excluded).  These messages
        # are preferred training candidates below.
        interest = {}
        for ttype in ('replied', 'read', 'tagged'):
            interest[ttype] = set()
            for tag in config.get_tags(type=ttype):
                interest[ttype] |= idx.search(session, ['in:%s' % tag.slug] +
                                              no_trash).as_set()
            session.ui.notify(
                _('Have %d interesting %s messages') %
                (len(interest[ttype]), ttype))

        retrained, unreadable = [], []
        count_all = 0
        for at_config in autotag_configs(config):
            at_tag = config.get_tag(at_config.match_tag)
            if at_tag and at_tag._key in tids:
                session.ui.mark('Retraining: %s' % at_tag.name)

                # One entry per class: (training set, candidate message
                # set, search term, target classification).  The sets
                # are mutated in place via the yn tuples below.
                yn = [(set(), set(), 'in:%s' % at_tag.slug, True),
                      (set(), set(), '-in:%s' % at_tag.slug, False)]

                # Get the current message sets: tagged and untagged messages
                # excluding trash.
                for tset, mset, srch, which in yn:
                    mset |= idx.search(session, [srch] + no_trash).as_set()

                # If we have any exclude_tags, they are particularly
                # interesting, so we'll look at them first.
                interesting = []
                for etagid in at_config.exclude_tags:
                    etag = config.get_tag(etagid)
                    if etag._key not in interest:
                        srch = ['in:%s' % etag._key] + no_trash
                        interest[etag._key] = idx.search(session,
                                                         srch).as_set()
                    interesting.append(etag._key)
                interesting.extend(['replied', 'read', 'tagged', None])

                # Go through the interest types in order of preference and
                # while we still lack training data, add to the training set.
                for ttype in interesting:
                    for tset, mset, srch, which in yn:
                        # False positives are really annoying, and generally
                        # speaking any autotagged subset should be a small
                        # part of the Universe. So we divide the corpus
                        # budget 33% True, 67% False.
                        full_size = int(at_config.corpus_size *
                                        (0.33 if which else 0.67))
                        want = min(full_size // len(interesting),
                                   max(0, full_size - len(tset)))
                        # Make sure we always fully utilize our budget
                        if full_size > len(tset) and not ttype:
                            want = full_size - len(tset)

                        if want:
                            if ttype:
                                adding = sorted(list(mset & interest[ttype]))
                            else:
                                adding = sorted(list(mset))
                            # Take from the high end of the sorted indexes
                            # (presumably the most recent messages).
                            adding = set(list(reversed(adding))[:want])
                            tset |= adding
                            mset -= adding

                # Load classifier, reset
                atagger = config.load_auto_tagger(at_config)
                atagger.reset(at_config)
                for tset, mset, srch, which in yn:
                    count = 0
                    # We go through the list of message in order, to avoid
                    # thrashing caches too badly.
                    for msg_idx in sorted(list(tset)):
                        try:
                            e = Email(idx, msg_idx)
                            count += 1
                            count_all += 1
                            session.ui.mark(
                                _('Reading %s (%d/%d, %s=%s)') %
                                (e.msg_mid(), count, len(tset), at_tag.name,
                                 which))
                            atagger.learn(at_config, e.get_msg(),
                                          self._get_keywords(e), which)
                            play_nice_with_threads()
                            if mailpile.util.QUITTING:
                                return self._error('Aborted')
                        except (IndexError, TypeError, ValueError, OSError,
                                IOError):
                            # Unreadable messages are recorded and skipped,
                            # not fatal to the retraining run.
                            if 'autotag' in session.config.sys.debug:
                                import traceback
                                traceback.print_exc()
                            unreadable.append(msg_idx)
                            session.ui.warning(
                                _('Failed to process message at =%s') %
                                (b36(msg_idx)))

                # We got this far without crashing, so save the result.
                config.save_auto_tagger(at_config)
                retrained.append(at_tag.name)

        message = _('Retrained SpamBayes auto-tagging for %s') % ', '.join(
            retrained)
        session.ui.mark(message)
        return self._success(message,
                             result={
                                 'retrained': retrained,
                                 'unreadable': unreadable,
                                 'read_messages': count_all
                             })
Example #4
0
    def _retrain(self, tags=None):
        """Retrain the SpamBayes autotaggers.

        Args:
            tags: optional list of match-tag names to retrain; defaults
                to the match tags of every configured autotagger.

        Returns:
            A success result listing the retrained tag names, the
            indexes of unreadable messages, and how many messages were
            read in total.
        """
        session, config, idx = self.session, self.session.config, self._idx()
        tags = tags or [asb.match_tag for asb in autotag_configs(config)]
        tids = [config.get_tag(t)._key for t in tags if t]

        session.ui.mark(_('Retraining SpamBayes autotaggers'))
        if not config.real_hasattr('autotag'):
            config.real_setattr('autotag', {})

        # Find all the interesting messages! We don't look in the trash,
        # but we do look at interesting spam.
        #
        # Note: By specifically stating that we DON'T want trash, we
        #       disable the search engine's default result suppression
        #       and guarantee these results don't corrupt the somewhat
        #       lame/broken result cache.
        #
        no_trash = ['-in:%s' % t._key for t in config.get_tags(type='trash')]
        # interest maps an "interest type" to the set of message indexes
        # matching tags of that type (trash excluded).  These messages
        # are preferred training candidates below.
        interest = {}
        for ttype in ('replied', 'read', 'tagged'):
            interest[ttype] = set()
            for tag in config.get_tags(type=ttype):
                interest[ttype] |= idx.search(session,
                                              ['in:%s' % tag.slug] + no_trash
                                              ).as_set()
            session.ui.notify(_('Have %d interesting %s messages'
                                ) % (len(interest[ttype]), ttype))

        retrained, unreadable = [], []
        count_all = 0
        for at_config in autotag_configs(config):
            at_tag = config.get_tag(at_config.match_tag)
            if at_tag and at_tag._key in tids:
                session.ui.mark('Retraining: %s' % at_tag.name)

                # One entry per class: (training set, candidate message
                # set, search term, target classification).  The sets
                # are mutated in place via the yn tuples below.
                yn = [(set(), set(), 'in:%s' % at_tag.slug, True),
                      (set(), set(), '-in:%s' % at_tag.slug, False)]

                # Get the current message sets: tagged and untagged messages
                # excluding trash.
                for tset, mset, srch, which in yn:
                    mset |= idx.search(session, [srch] + no_trash).as_set()

                # If we have any exclude_tags, they are particularly
                # interesting, so we'll look at them first.
                interesting = []
                for etagid in at_config.exclude_tags:
                    etag = config.get_tag(etagid)
                    if etag._key not in interest:
                        srch = ['in:%s' % etag._key] + no_trash
                        interest[etag._key] = idx.search(session, srch
                                                         ).as_set()
                    interesting.append(etag._key)
                interesting.extend(['replied', 'read', 'tagged', None])

                # Go through the interest types in order of preference and
                # while we still lack training data, add to the training set.
                for ttype in interesting:
                    for tset, mset, srch, which in yn:
                        # False positives are really annoying, and generally
                        # speaking any autotagged subset should be a small
                        # part of the Universe. So we divide the corpus
                        # budget 33% True, 67% False.
                        full_size = int(at_config.corpus_size *
                                        (0.33 if which else 0.67))
                        want = min(full_size // len(interesting),
                                   max(0, full_size - len(tset)))
                        # Make sure we always fully utilize our budget
                        if full_size > len(tset) and not ttype:
                            want = full_size - len(tset)

                        if want:
                            if ttype:
                                adding = sorted(list(mset & interest[ttype]))
                            else:
                                adding = sorted(list(mset))
                            # Take from the high end of the sorted indexes
                            # (presumably the most recent messages).
                            adding = set(list(reversed(adding))[:want])
                            tset |= adding
                            mset -= adding

                # Load classifier, reset
                atagger = config.load_auto_tagger(at_config)
                atagger.reset(at_config)
                for tset, mset, srch, which in yn:
                    count = 0
                    # We go through the list of message in order, to avoid
                    # thrashing caches too badly.
                    for msg_idx in sorted(list(tset)):
                        try:
                            e = Email(idx, msg_idx)
                            count += 1
                            count_all += 1
                            session.ui.mark(
                                _('Reading %s (%d/%d, %s=%s)'
                                  ) % (e.msg_mid(), count, len(tset),
                                       at_tag.name, which))
                            atagger.learn(at_config,
                                          e.get_msg(),
                                          self._get_keywords(e),
                                          which)
                            play_nice_with_threads()
                            if mailpile.util.QUITTING:
                                return self._error('Aborted')
                        except (IndexError, TypeError, ValueError,
                                OSError, IOError):
                            # Unreadable messages are recorded and skipped,
                            # not fatal to the retraining run.
                            if 'autotag' in session.config.sys.debug:
                                import traceback
                                traceback.print_exc()
                            unreadable.append(msg_idx)
                            session.ui.warning(
                                _('Failed to process message at =%s'
                                  ) % (b36(msg_idx)))

                # We got this far without crashing, so save the result.
                config.save_auto_tagger(at_config)
                retrained.append(at_tag.name)

        message = _('Retrained SpamBayes auto-tagging for %s'
                    ) % ', '.join(retrained)
        session.ui.mark(message)
        return self._success(message, result={
            'retrained': retrained,
            'unreadable': unreadable,
            'read_messages': count_all
        })