Exemple #1
0
    def createWorkers(self):
        """Using the options that were initialised in __init__ and then
        possibly overridden by the driver code, create the Bayes object,
        the Corpuses, the Trainers and so on."""
        print "Loading database...",
        if self.isTest:
            self.useDB = "pickle"
            self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
        if not hasattr(self, "DBName"):
            self.DBName, self.useDB = storage.database_type([])
        self.bayes = storage.open_storage(self.DBName, self.useDB)
        
        self.buildStatusStrings()

        # Don't set up the caches and training objects when running the self-test,
        # so as not to clutter the filesystem.
        if not self.isTest:
            def ensureDir(dirname):
                try:
                    os.mkdir(dirname)
                except OSError, e:
                    if e.errno != errno.EEXIST:
                        raise

            # Create/open the Corpuses.  Use small cache sizes to avoid hogging
            # lots of memory.
            sc = get_pathname_option("Storage", "spam_cache")
            hc = get_pathname_option("Storage", "ham_cache")
            uc = get_pathname_option("Storage", "unknown_cache")
            map(ensureDir, [sc, hc, uc])
            if self.gzipCache:
                factory = GzipFileMessageFactory()
            else:
                factory = FileMessageFactory()
            age = options["Storage", "cache_expiry_days"]*24*60*60
            self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                               '[0123456789\-]*',
                                               cacheSize=20)
            self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                              '[0123456789\-]*',
                                              cacheSize=20)
            self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                                  '[0123456789\-]*',
                                                  cacheSize=20)

            # Given that (hopefully) users will get to the stage
            # where they do not need to do any more regular training to
            # be satisfied with spambayes' performance, we expire old
            # messages from not only the trained corpora, but the unknown
            # as well.
            self.spamCorpus.removeExpiredMessages()
            self.hamCorpus.removeExpiredMessages()
            self.unknownCorpus.removeExpiredMessages()

            # Create the Trainers.
            self.spamTrainer = storage.SpamTrainer(self.bayes)
            self.hamTrainer = storage.HamTrainer(self.bayes)
            self.spamCorpus.addObserver(self.spamTrainer)
            self.hamCorpus.addObserver(self.hamTrainer)
Exemple #2
0
    def create_workers(self):
        """Build the worker objects: the Bayes database, the stats
        manager and, outside of the self-test, the three message corpora
        and the two trainers.

        The settings used are the attributes initialised in __init__,
        possibly overridden afterwards by the driver code."""
        if self.is_test:
            # The self-test uses a throwaway pickle; it is never saved.
            self.use_db = "pickle"
            self.db_name = '_core_server.pickle'
        if not hasattr(self, "db_name"):
            self.db_name, self.use_db = storage.database_type([])
        self.bayes = storage.open_storage(self.db_name, self.use_db)

        # The stats manager works off the message info database.
        info_db = spambayes.message.Message().message_info_db
        self.stats = Stats.Stats(options, info_db)

        self.build_status_strings()

        # The self-test stops here: no caches or training objects are
        # created, so as not to clutter the filesystem.
        if self.is_test:
            return

        # Make sure the three cache directories exist.
        spam_dir = get_pathname_option("Storage", "core_spam_cache")
        ham_dir = get_pathname_option("Storage", "core_ham_cache")
        unknown_dir = get_pathname_option("Storage", "core_unknown_cache")
        for cache_dir in [spam_dir, ham_dir, unknown_dir]:
            storage.ensureDir(cache_dir)

        if not self.gzip_cache:
            factory = FileMessageFactory()
        else:
            factory = GzipFileMessageFactory()

        # cache_expiry_days is converted to seconds.
        max_age = options["Storage", "cache_expiry_days"]*24*60*60

        def open_corpus(directory):
            # A small cacheSize keeps the corpus from hogging memory.
            return ExpiryFileCorpus(max_age, factory, directory,
                                    '[0123456789\-]*',
                                    cacheSize=20)

        self.spamCorpus = open_corpus(spam_dir)
        self.hamCorpus = open_corpus(ham_dir)
        self.unknownCorpus = open_corpus(unknown_dir)

        # Old messages are expired from the unknown corpus as well as
        # from the two trained corpora.
        for corpus in (self.spamCorpus, self.hamCorpus, self.unknownCorpus):
            corpus.removeExpiredMessages()

        # Each trainer observes the corpus of its own kind.
        self.spam_trainer = storage.SpamTrainer(self.bayes)
        self.ham_trainer = storage.HamTrainer(self.bayes)
        self.spamCorpus.addObserver(self.spam_trainer)
        self.hamCorpus.addObserver(self.ham_trainer)
Exemple #3
0
    def setup(self):
        """Install the global urllib2 opener and open the URL caches.

        Builds an opener (routed through the configured HTTP proxy when
        options["globals", "proxy_server"] is non-empty) and installs it
        with urllib2.install_opener.  Then creates the URL cache
        directory if needed, opens the expiring corpus of retrieved URLs
        as self.urlCorpus, and loads the pickled "bad URL" cache into
        self.bad_urls when its file exists.

        Raises ValueError if a proxy server is configured with a port
        that is not an integer.
        """
        # Can't import this at the top because it's circular.
        # XXX Someone smarter than me, please figure out the right
        # XXX way to do this.
        from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory

        username = options["globals", "proxy_username"]
        password = options["globals", "proxy_password"]
        server = options["globals", "proxy_server"]
        if server.find(":") != -1:
            server, port = server.split(':', 1)
        else:
            port = 8080
        if server:
            # Build a new opener that uses a proxy requiring authorization.
            # The port may have come from split() as a string, while the
            # URL template formats it with %d, so convert it first.
            proxy_url = "http://%s:%s@%s:%d" % (username, password,
                                                server, int(port))
            proxy_support = urllib2.ProxyHandler({"http" : proxy_url})
            opener = urllib2.build_opener(proxy_support,
                                          urllib2.HTTPHandler)
        else:
            # Build a new opener without any proxy information.
            opener = urllib2.build_opener(urllib2.HTTPHandler)

        # Install it
        urllib2.install_opener(opener)

        # Setup the cache for retrieved urls
        age = options["URLRetriever", "x-cache_expiry_days"]*24*60*60
        dir = options["URLRetriever", "x-cache_directory"]
        if not os.path.exists(dir):
            # Create the directory.
            if options["globals", "verbose"]:
                print >>sys.stderr, "Creating URL cache directory"
            os.makedirs(dir)

        self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(),
                                          dir, cacheSize=20)
        # Kill any old information in the cache
        self.urlCorpus.removeExpiredMessages()

        # Setup caches for unretrievable urls
        self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
        self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
        if os.path.exists(self.bad_url_cache_name):
            # NOTE(review): the file is opened in text mode, as before;
            # confirm this matches how the cache is written.
            b_file = open(self.bad_url_cache_name, "r")
            try:
                try:
                    self.bad_urls = pickle.load(b_file)
                except (IOError, ValueError):
                    # Something went wrong loading it (bad pickle,
                    # probably).  Start afresh.  The exception types
                    # must be a tuple: "except IOError, ValueError"
                    # only catches IOError.
                    if options["globals", "verbose"]:
                        print >>sys.stderr, "Bad URL pickle, using new."
                    self.bad_urls = {"url:non_resolving": (),
                                     "url:non_html": (),
                                     "url:unknown_error": ()}
            finally:
                # Close the handle even if an unexpected error escapes.
                b_file.close()
Exemple #4
0
 def setup(self):
     """Install the global urllib opener and load the URL caches.

     Builds an opener (routed through the configured HTTP proxy when
     options["globals", "proxy_server"] is non-empty) and installs it
     with urllib.request.install_opener.  Then creates the URL cache
     directory if needed, opens the expiring corpus of retrieved URLs
     as self.urlCorpus, and loads self.bad_urls and
     self.http_error_urls from their pickle files.  A cache whose file
     is missing, or whose load raises IOError or ValueError, starts
     out empty.

     Raises ValueError if a proxy server is configured with a port
     that is not an integer.
     """
     # Imported here rather than at module level, as in the original.
     from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory

     def new_bad_urls():
         # A fresh, empty "bad URL" cache.
         return {"url:non_resolving": (),
                 "url:non_html": (),
                 "url:unknown_error": ()}

     verbose = options["globals", "verbose"]
     username = options["globals", "proxy_username"]
     password = options["globals", "proxy_password"]
     server = options["globals", "proxy_server"]
     if server.find(":") != -1:
         server, port = server.split(':', 1)
     else:
         port = 8080
     if server:
         # The port may have come from split() as a string, while the
         # URL template formats it with %d, so convert it first.
         proxy_url = "http://%s:%s@%s:%d" % (username, password,
                                             server, int(port))
         proxy_support = urllib.request.ProxyHandler({"http": proxy_url})
         opener = urllib.request.build_opener(proxy_support,
                                              urllib.request.HTTPHandler)
     else:
         # No proxy configured: plain HTTP handler only.
         opener = urllib.request.build_opener(urllib.request.HTTPHandler)
     urllib.request.install_opener(opener)

     # Set up the cache for retrieved URLs (expiry is given in days).
     age = options["URLRetriever", "x-cache_expiry_days"]*24*60*60
     cache_dir = options["URLRetriever", "x-cache_directory"]
     if not os.path.exists(cache_dir):
         if verbose:
             print("Creating URL cache directory", file=sys.stderr)
         os.makedirs(cache_dir)
     self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(),
                                       cache_dir, cacheSize=20)
     # Drop anything in the cache that has already expired.
     self.urlCorpus.removeExpiredMessages()

     # Set up the caches for unretrievable URLs.
     self.bad_url_cache_name = os.path.join(cache_dir, "bad_urls.pck")
     self.http_error_cache_name = os.path.join(cache_dir,
                                               "http_error_urls.pck")
     if os.path.exists(self.bad_url_cache_name):
         try:
             self.bad_urls = pickle_read(self.bad_url_cache_name)
         except (IOError, ValueError):
             if verbose:
                 print("Bad URL pickle, using new.", file=sys.stderr)
             self.bad_urls = new_bad_urls()
     else:
         if verbose:
             print("URL caches don't exist: creating")
         self.bad_urls = new_bad_urls()
     if os.path.exists(self.http_error_cache_name):
         try:
             self.http_error_urls = pickle_read(self.http_error_cache_name)
         except (IOError, ValueError):
             # Must be a tuple: "except IOError as ValueError" only
             # catches IOError and rebinds the name ValueError.
             if verbose:
                 print("Bad HHTP error pickle, using new.", file=sys.stderr)
             self.http_error_urls = {}
     else:
         self.http_error_urls = {}
Exemple #5
0
 def createWorkers(self):
     """Build the worker objects: the Bayes database, the message info
     database handle, the stats manager and, outside of the self-test,
     the three message corpora and the two trainers.

     The settings used are the attributes initialised in __init__,
     possibly overridden afterwards by the driver code."""
     print("Loading database...", end=' ')
     if self.isTest:
         # The self-test uses a throwaway pickle; it is never saved.
         self.useDB = "pickle"
         self.DBName = '_pop3proxy_test.pickle'
     if not hasattr(self, "DBName"):
         self.DBName, self.useDB = storage.database_type([])
     self.bayes = storage.open_storage(self.DBName, self.useDB)
     self.mdb = spambayes.message.Message().message_info_db
     self.stats = Stats.Stats(options, self.mdb)
     self.buildStatusStrings()

     # The self-test stops here: no caches or trainers are created.
     if self.isTest:
         return

     # Make sure the three cache directories exist.
     spam_dir = get_pathname_option("Storage", "spam_cache")
     ham_dir = get_pathname_option("Storage", "ham_cache")
     unknown_dir = get_pathname_option("Storage", "unknown_cache")
     for cache_dir in [spam_dir, ham_dir, unknown_dir]:
         storage.ensureDir(cache_dir)

     if not self.gzipCache:
         factory = FileMessageFactory()
     else:
         factory = GzipFileMessageFactory()

     # cache_expiry_days is converted to seconds.
     max_age = options["Storage", "cache_expiry_days"]*24*60*60

     def open_corpus(directory):
         # All three corpora share the same age, factory and cache size.
         return ExpiryFileCorpus(max_age, factory, directory,
                                 '[0123456789\-]*',
                                 cacheSize=20)

     self.spamCorpus = open_corpus(spam_dir)
     self.hamCorpus = open_corpus(ham_dir)
     self.unknownCorpus = open_corpus(unknown_dir)

     # Expire old messages from all three corpora.
     for corpus in (self.spamCorpus, self.hamCorpus, self.unknownCorpus):
         corpus.removeExpiredMessages()

     # Each trainer observes the corpus of its own kind.
     self.spamTrainer = storage.SpamTrainer(self.bayes)
     self.hamTrainer = storage.HamTrainer(self.bayes)
     self.spamCorpus.addObserver(self.spamTrainer)
     self.hamCorpus.addObserver(self.hamTrainer)
Exemple #6
0
class CoreState:
    """This keeps the global state of the module - the command-line options,
    statistics like how many mails have been classified, the handle of the
    log file, the Classifier and FileCorpus objects, and so on."""

    def __init__(self):
        """Initialises the State object that holds the state of the app.
        The default settings are read from Options.py and bayescustomize.ini
        and are then overridden by the command-line processing code in the
        __main__ code below."""
        self.log_file = None
        self.bayes = None
        self.mutex = None
        self.prepared = False
        self.can_stop = True
        self.plugin = None

        # Unique names for cached messages.
        self.last_base_message_name = ''
        self.uniquifier = 2

        # Session counters, updated by record_classification().
        self.numSpams = 0
        self.numHams = 0
        self.numUnsure = 0

        self.servers = ""

        # Defaults taken from the options.
        self.ui_port = options["html_ui", "port"]
        self.launch_ui = options["html_ui", "launch_browser"]
        self.gzip_cache = options["Storage", "cache_use_gzip"]
        self.run_test_server = False
        self.is_test = False

        # Filled in by create_workers() when not running the self-test.
        self.spamCorpus = self.hamCorpus = self.unknownCorpus = None
        self.spam_trainer = self.ham_trainer = None

        self.init()

    def init(self):
        """Reset the language manager and reported-error map and, when
        the verbose option is set, open '_core_server.log' unbuffered.

        Must not be called while the state is prepared."""
        assert not self.prepared, "init after prepare, but before close"
        self.lang_manager = None
        # NOTE(review): init() runs from both __init__ and prepare(), so
        # with verbose set the log file is opened (and truncated) twice.
        if options["globals", "verbose"]:
            self.log_file = open('_core_server.log', 'wb', 0)
        self.reported_errors = {}

    def close(self):
        """Store and close the database, drop the corpora and trainers,
        mark the state as unprepared and release the platform mutex.

        Must only be called after prepare()."""
        assert self.prepared, "closed without being prepared!"
        if self.bayes is not None:
            # Only store the database if there is both ham and spam in it.
            if self.bayes.nham != 0 and self.bayes.nspam != 0:
                self.bayes.store()
            self.bayes.close()
            self.bayes = None
        spambayes.message.Message().message_info_db = None

        self.spamCorpus = self.hamCorpus = self.unknownCorpus = None
        self.spam_trainer = self.ham_trainer = None

        self.prepared = False
        # NOTE(review): close_platform_mutex is not defined in the part
        # of the class visible here - presumably defined further on.
        self.close_platform_mutex()

    def prepare(self, can_stop=True):
        """Do whatever needs to be done to prepare for running.  If
        can_stop is False, then we may not let the user shut down the
        proxy - for example, running as a Windows service this should
        be the case."""
        self.init()
        # If we can't acquire the mutex, open_platform_mutex raises
        # before any workers are created.
        assert self.mutex is None, "Should not already have the mutex"
        self.open_platform_mutex()
        self.can_stop = can_stop
        self.create_workers()
        self.prepared = True

    def build_status_strings(self):
        """Build the status message(s) to display on the home page of the
        web interface.

        The result is stored in self.warning; it is based on the ham and
        spam counts of self.bayes and on the configured cutoffs."""
        nspam = self.bayes.nspam
        nham = self.bayes.nham
        if nspam > 10 and nham > 10:
            # Enough training to judge the balance between ham and spam.
            db_ratio = nham/float(nspam)
            if db_ratio > 5.0:
                self.warning = _("Warning: you have much more ham than " \
                                 "spam - SpamBayes works best with " \
                                 "approximately even numbers of ham and " \
                                 "spam.")
            elif db_ratio < (1/5.0):
                self.warning = _("Warning: you have much more spam than " \
                                 "ham - SpamBayes works best with " \
                                 "approximately even numbers of ham and " \
                                 "spam.")
            else:
                self.warning = ""
        elif nspam > 0 or nham > 0:
            self.warning = _("Database only has %d good and %d spam - " \
                             "you should consider performing additional " \
                             "training.") % (nham, nspam)
        else:
            self.warning = _("Database has no training information.  " \
                             "SpamBayes will classify all messages as " \
                             "'unsure', ready for you to train.")

        # Append warnings about questionable cutoff settings.
        spam_cut = options["Categorization", "spam_cutoff"]
        ham_cut = options["Categorization", "ham_cutoff"]
        if spam_cut < 0.5:
            self.warning += _("<br/>Warning: we do not recommend " \
                              "setting the spam threshold less than 0.5.")
        if ham_cut > 0.5:
            self.warning += _("<br/>Warning: we do not recommend " \
                              "setting the ham threshold greater than 0.5.")
        if ham_cut > spam_cut:
            self.warning += _("<br/>Warning: your ham threshold is " \
                              "<b>higher</b> than your spam threshold. " \
                              "Results are unpredictable.")

    def create_workers(self):
        """Using the options that were initialised in __init__ and then
        possibly overridden by the driver code, create the Bayes object,
        the Corpuses, the Trainers and so on."""
        if self.is_test:
            self.use_db = "pickle"
            self.db_name = '_core_server.pickle'   # This is never saved.
        if not hasattr(self, "db_name"):
            self.db_name, self.use_db = storage.database_type([])
        self.bayes = storage.open_storage(self.db_name, self.use_db)

        # Load stats manager.
        self.stats = Stats.Stats(options,
                                 spambayes.message.Message().message_info_db)

        self.build_status_strings()

        # Don't set up the caches and training objects when running the
        # self-test.
        if not self.is_test:
            # Create/open the Corpuses, making sure their directories
            # exist first.
            sc = get_pathname_option("Storage", "core_spam_cache")
            hc = get_pathname_option("Storage", "core_ham_cache")
            uc = get_pathname_option("Storage", "core_unknown_cache")
            for d in [sc, hc, uc]:
                storage.ensureDir(d)
            if self.gzip_cache:
                factory = GzipFileMessageFactory()
            else:
                factory = FileMessageFactory()
            # cache_expiry_days is converted to seconds.
            age = options["Storage", "cache_expiry_days"]*24*60*60
            self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                               '[0123456789\-]*',
                                               cacheSize=20)
            self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                              '[0123456789\-]*',
                                              cacheSize=20)
            self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                                  '[0123456789\-]*',
                                                  cacheSize=20)

            # Expire old messages from all three corpora.
            self.spamCorpus.removeExpiredMessages()
            self.hamCorpus.removeExpiredMessages()
            self.unknownCorpus.removeExpiredMessages()

            # Create the Trainers; each observes its own corpus.
            self.spam_trainer = storage.SpamTrainer(self.bayes)
            self.ham_trainer = storage.HamTrainer(self.bayes)
            self.spamCorpus.addObserver(self.spam_trainer)
            self.hamCorpus.addObserver(self.ham_trainer)

    def getNewMessageName(self):
        """The message name is the time it arrived with a uniquifier
        appended if two arrive within one clock tick of each other.
        """
        # int() rather than long(): the same value on Python 2, and it
        # also exists on Python 3.
        message_name = "%10.10d" % int(time.time())
        if message_name == self.last_base_message_name:
            message_name = "%s-%d" % (message_name, self.uniquifier)
            self.uniquifier += 1
        else:
            self.last_base_message_name = message_name
            self.uniquifier = 2
        return message_name

    def record_classification(self, cls, score):
        """Record the classification in the session statistics.
        cls should match one of the options["Headers", "header_*_string"]
        values.
        score is the score the message received.
        """
        if cls == options["Headers", "header_ham_string"]:
            self.numHams += 1
        elif cls == options["Headers", "header_spam_string"]:
            self.numSpams += 1
        else:
            self.numUnsure += 1
        self.stats.RecordClassification(score)

    def buildStatusStrings(self):
        """Return an empty string.

        NOTE(review): build_status_strings() is the method that
        actually builds self.warning - confirm this one is still needed."""
        return ""

    def recreate_state(self):
        """Close this state if it is prepared, then create, prepare and
        return a brand new CoreState."""
        if self.prepared:
            # Close the state (which saves if necessary).
            self.close()
        state = CoreState()
        state.prepare()
        return state

    def open_platform_mutex(self, mutex_name="SpamBayesServer"):
        """Implementations of a mutex or other resource which can prevent
        multiple servers starting at once.  Platform specific as no
        reasonable cross-platform solution exists (however, an old trick is
        to use a directory for a mutex, as a create/test atomic API
        generally exists).  Will set self.mutex or may throw
        AlreadyRunningException
        """
        if sys.platform.startswith("win"):
            try:
                import win32event, win32api, winerror
                try:
                    hmutex = win32event.CreateMutex(None, True, mutex_name)
                except win32event.error as details:
                    # Access denied is treated as "already running";
                    # any other error is re-raised unchanged.
                    if details.args[0] != winerror.ERROR_ACCESS_DENIED:
                        raise
                    raise AlreadyRunningException
                # CreateMutex succeeded, but the mutex may already
                # have existed.
                if win32api.GetLastError()==winerror.ERROR_ALREADY_EXISTS:
                    win32api.CloseHandle(hmutex)
                    raise AlreadyRunningException
                self.mutex = hmutex
                return
            except ImportError:
                # The win32 modules are not available; no mutex is taken.
                pass
Exemple #7
0
class Classifier:
    # Defining __slots__ here made Jeremy's life needlessly difficult when
    # trying to hook this all up to ZODB as a persistent object.  There's
    # no space benefit worth getting from slots in this class; slots were
    # used solely to help catch errors earlier, when this code was changing
    # rapidly.

    #__slots__ = ('wordinfo',  # map word to WordInfo record
    #             'nspam',     # number of spam messages learn() has seen
    #             'nham',      # number of non-spam messages learn() has seen
    #            )

    # allow a subclass to use a different class for WordInfo
    WordInfoClass = WordInfo

    def __init__(self):
        """Start out untrained: no word records, an empty probability
        cache, and zero spam and ham message counts."""
        self.wordinfo = {}
        self.probcache = {}
        self.nspam = 0
        self.nham = 0

    def __getstate__(self):
        """Return the state to pickle: the pickle version followed by
        the word records and the spam and ham message counts.  The
        probability cache is not included."""
        state = (PICKLE_VERSION, self.wordinfo, self.nspam, self.nham)
        return state

    def __setstate__(self, t):
        """Restore the state tuple produced by __getstate__.

        Raises ValueError when the version in the tuple is not
        PICKLE_VERSION.  The probability cache starts out empty."""
        version = t[0]
        if version != PICKLE_VERSION:
            raise ValueError("Can't unpickle -- version %s unknown" % version)
        self.wordinfo, self.nspam, self.nham = t[1:]
        self.probcache = {}

    # spamprob() implementations.  One of the following is aliased to
    # spamprob, depending on option settings.
    # Currently only chi-squared is available, but maybe there will be
    # an alternative again someday.

    # Across vectors of length n, containing random uniformly-distributed
    # probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution
    # with 2*n degrees of freedom.  This has been proven (in some
    # appropriate sense) to be the most sensitive possible test for
    # rejecting the hypothesis that a vector of probabilities is uniformly
    # distributed.  Gary Robinson's original scheme was monotonic *with*
    # this test, but skipped the details.  Turns out that getting closer
    # to the theoretical roots gives a much sharper classification, with
    # a very small (in # of msgs), but also very broad (in range of scores),
    # "middle ground", where most of the mistakes live.  In particular,
    # this scheme seems immune to all forms of "cancellation disease":  if
    # there are many strong ham *and* spam clues, this reliably scores
    # close to 0.5.  Most other schemes are extremely certain then -- and
    # often wrong.
    def chi2_spamprob(self, wordstream, evidence=False):
        """Return best-guess probability that wordstream is spam.

        wordstream is an iterable object producing words.
        The return value is a float in [0.0, 1.0].

        If optional arg evidence is True, the return value is a pair
            probability, evidence
        where evidence is a list of (word, probability) pairs, sorted by
        increasing probability and preceded by the two pseudo-clues
        ('*H*', H) and ('*S*', S).
        """

        from math import frexp, log as ln

        # We compute two chi-squared statistics, one for ham and one for
        # spam.  The sum-of-the-logs business is more sensitive to probs
        # near 0 than to probs near 1, so the spam measure uses 1-p (so
        # that high-spamprob words have greatest effect), and the ham
        # measure uses p directly (so that lo-spamprob words have greatest
        # effect).
        #
        # For optimization, sum-of-logs == log-of-product, and f.p.
        # multiplication is a lot cheaper than calling ln().  It's easy
        # to underflow to 0.0, though, so we simulate unbounded dynamic
        # range via frexp.  The real product H = this H * 2**Hexp, and
        # likewise the real product S = this S * 2**Sexp.
        H = S = 1.0
        Hexp = Sexp = 0

        clues = self._getclues(wordstream)
        for prob, word, record in clues:
            S *= 1.0 - prob
            H *= prob
            if S < 1e-200:  # prevent underflow
                S, e = frexp(S)
                Sexp += e
            if H < 1e-200:  # prevent underflow
                H, e = frexp(H)
                Hexp += e

        # Compute the natural log of the product = sum of the logs:
        # ln(x * 2**i) = ln(x) + i * ln(2).
        S = ln(S) + Sexp * LN2
        H = ln(H) + Hexp * LN2

        n = len(clues)
        if n:
            S = 1.0 - chi2Q(-2.0 * S, 2*n)
            H = 1.0 - chi2Q(-2.0 * H, 2*n)

            # How to combine these into a single spam score?  We originally
            # used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H).  A
            # systematic problem is that we could end up being near-certain
            # a thing was (for example) spam, even if S was small, provided
            # that H was much smaller.
            # Rob Hooft stared at these problems and invented the measure
            # we use now, the simpler S-H, scaled into [0., 1.].
            prob = (S-H + 1.0) / 2.0
        else:
            # No clues at all: completely unsure.
            prob = 0.5

        if evidence:
            clues = [(w, p) for p, w, _r in clues]
            # Sort by probability with a key function; this gives the
            # same stable ordering as a cmp function on the probability,
            # without needing cmp().
            clues.sort(key=lambda clue: clue[1])
            clues.insert(0, ('*S*', S))
            clues.insert(0, ('*H*', H))
            return prob, clues
        else:
            return prob

    def slurping_spamprob(self, wordstream, evidence=False):
        """Score wordstream with chi2_spamprob, then possibly rescore it
        with extra "slurped" tokens.

        The rescore happens only when the first score lies strictly
        between the ham and spam cutoffs, fewer than max_discriminators
        clues were returned, and slurp_wordstream is true.  The second
        score replaces the first only if it falls outside the cutoffs.

        Returns the probability, or a (probability, clues) pair when
        evidence is true."""
        ham_cutoff = options["Categorization", "ham_cutoff"]
        spam_cutoff = options["Categorization", "spam_cutoff"]

        # The raw score, always with its evidence.
        prob, clues = self.chi2_spamprob(wordstream, True)

        if len(clues) < options["Classifier", "max_discriminators"] and \
           ham_cutoff < prob < spam_cutoff and slurp_wordstream:
            # Rescore using the slurped tokens followed by the words of
            # the clues already found.
            enhanced = list(self._generate_slurp())
            for clue_word, _clue_prob in clues:
                enhanced.append(clue_word)
            slurped_prob, slurped_clues = self.chi2_spamprob(enhanced, True)
            if slurped_prob < ham_cutoff or slurped_prob > spam_cutoff:
                prob, clues = slurped_prob, slurped_clues

        if not evidence:
            return prob
        return prob, clues

    # Bind the spamprob name when the class body is executed, based on
    # the options: the URL-slurping variant when "x-slurp_urls" is set,
    # the plain chi-squared one otherwise.
    # NOTE(review): nothing is bound here when
    # "use_chi_squared_combining" is false.
    if options["Classifier", "use_chi_squared_combining"]:
        if options["URLRetriever", "x-slurp_urls"]:
            spamprob = slurping_spamprob
        else:
            spamprob = chi2_spamprob

    def learn(self, wordstream, is_spam):
        """Train the classifier on one message.

        wordstream is the word stream of the message.  A true is_spam
        says the message is definitely spam; a false one says it is
        definitely not spam.
        """
        tokens = wordstream
        # Optional token enhancements, applied in this order.
        bigrams_enabled = options["Classifier", "use_bigrams"]
        if bigrams_enabled:
            tokens = self._enhance_wordstream(tokens)
        slurping_enabled = options["URLRetriever", "x-slurp_urls"]
        if slurping_enabled:
            tokens = self._add_slurped(tokens)
        self._add_msg(tokens, is_spam)

    def unlearn(self, wordstream, is_spam):
        """Undo an earlier learn() call made by mistake.

        Call it as soon as possible after the mistake, with the same
        arguments that were passed to learn().
        """
        tokens = wordstream
        # Apply the same optional token enhancements as learn() does.
        bigrams_enabled = options["Classifier", "use_bigrams"]
        if bigrams_enabled:
            tokens = self._enhance_wordstream(tokens)
        slurping_enabled = options["URLRetriever", "x-slurp_urls"]
        if slurping_enabled:
            tokens = self._add_slurped(tokens)
        self._remove_msg(tokens, is_spam)

    def probability(self, record):
        """Compute, cache and return prob(msg is spam | msg contains word).

        record supplies the word's spamcount and hamcount.  The raw
        probability is the Graham calculation without its biases and
        without clamping into 0.01 thru 0.99; Robinson's Bayesian
        adjustment is then applied, which keeps the result in a sane
        range that depends on how much evidence backs it up.

        Results are cached in self.probcache, keyed by spamcount and
        then hamcount.
        """
        spam_seen = record.spamcount
        ham_seen = record.hamcount

        # A cached answer for this pair of counts wins.
        cached_row = self.probcache.get(spam_seen)
        if cached_row is not None and ham_seen in cached_row:
            return cached_row[ham_seen]

        # A count of zero trained messages is treated as one.
        total_ham = float(self.nham or 1)
        total_spam = float(self.nspam or 1)

        assert ham_seen <= total_ham, "Token seen in more ham than ham trained."
        ham_fraction = ham_seen / total_ham

        assert spam_seen <= total_spam, "Token seen in more spam than spam trained."
        spam_fraction = spam_seen / total_spam

        raw_prob = spam_fraction / (ham_fraction + spam_fraction)

        strength = options["Classifier", "unknown_word_strength"]
        strength_times_x = strength * options["Classifier", "unknown_word_prob"]

        # Robinson's Bayesian adjustment:
        #
        #         s*x + n*p(w)
        # f(w) = --------------
        #           s + n
        #
        # which, for s != 0, is the same as p + (x - p) / (1 + n/s):
        # p is moved part of the way towards x, and less so the larger
        # n is or the smaller s is.
        evidence_count = ham_seen + spam_seen
        adjusted = (strength_times_x + evidence_count * raw_prob) \
                   / (strength + evidence_count)

        # Remember the answer for this pair of counts.
        self.probcache.setdefault(spam_seen, {})[ham_seen] = adjusted

        return adjusted

    # NOTE:  Graham's scheme had a strange asymmetry:  when a word appeared
    # n>1 times in a single message, training added n to the word's hamcount
    # or spamcount, but predicting scored words only once.  Tests showed
    # that adding only 1 in training, or scoring more than once when
    # predicting, hurt under the Graham scheme.
    # This isn't so under Robinson's scheme, though:  results improve
    # if training also counts a word only once.  The mean ham score decreases
    # significantly and consistently, ham score variance decreases likewise,
    # mean spam score decreases (but less than mean ham score, so the spread
    # increases), and spam score variance increases.
    # I (Tim) speculate that adding n times under the Graham scheme helped
    # because it acted against the various ham biases, giving frequently
    # repeated spam words (like "Viagra") a quick ramp-up in spamprob; else,
    # adding only once in training, a word like that was simply ignored until
    # it appeared in 5 distinct training spams.  Without the ham-favoring
    # biases, though, and never ignoring words, counting n times introduces
    # a subtle and unhelpful bias.
    # There does appear to be some useful info in how many times a word
    # appears in a msg, but distorting spamprob doesn't appear a correct way
    # to exploit it.
    def _add_msg(self, wordstream, is_spam):
        self.probcache = {}    # nuke the prob cache
        if is_spam:
            self.nspam += 1
        else:
            self.nham += 1

        for word in set(wordstream):
            record = self._wordinfoget(word)
            if record is None:
                record = self.WordInfoClass()

            if is_spam:
                record.spamcount += 1
            else:
                record.hamcount += 1

            self._wordinfoset(word, record)

        self._post_training()

    def _remove_msg(self, wordstream, is_spam):
        self.probcache = {}    # nuke the prob cache
        if is_spam:
            if self.nspam <= 0:
                raise ValueError("spam count would go negative!")
            self.nspam -= 1
        else:
            if self.nham <= 0:
                raise ValueError("non-spam count would go negative!")
            self.nham -= 1

        for word in set(wordstream):
            record = self._wordinfoget(word)
            if record is not None:
                if is_spam:
                    if record.spamcount > 0:
                        record.spamcount -= 1
                else:
                    if record.hamcount > 0:
                        record.hamcount -= 1
                if record.hamcount == 0 == record.spamcount:
                    self._wordinfodel(word)
                else:
                    self._wordinfoset(word, record)

        self._post_training()

    def _post_training(self):
        """This is called after training on a wordstream.  Subclasses might
        want to ensure that their databases are in a consistent state at
        this point.  Introduced to fix bug #797890."""
        pass

    # Return list of (prob, word, record) triples, sorted by increasing
    # distance of prob from 0.5 (i.e. weakest clue first, strongest last).
    # "word" is a token from wordstream; "prob" is its spamprob (a
    # float in 0.0 through 1.0); and "record" is word's associated
    # WordInfo record if word is in the training database, or None if it's
    # not.  No more than max_discriminators items are returned, and have
    # the strongest (farthest from 0.5) spamprobs of all tokens in wordstream.
    # Tokens with spamprobs less than minimum_prob_strength away from 0.5
    # aren't returned.
    def _getclues(self, wordstream):
        """Return the strongest clues found in wordstream.

        The result is a list of (prob, word, record) triples ordered by
        increasing distance of prob from 0.5 (weakest clue first).  word is
        a token from wordstream or, when the "use_bigrams" option is on,
        possibly a synthesized "bi:" pair of adjacent tokens; prob is its
        spamprob and record is whatever _wordinfoget() returned for it.

        Clues closer to 0.5 than the "minimum_prob_strength" option are
        dropped, and at most "max_discriminators" clues are kept (the ones
        farthest from 0.5).  A limit of zero or less yields no clues.
        """
        mindist = options["Classifier", "minimum_prob_strength"]

        if options["Classifier", "use_bigrams"]:
            # This scheme mixes single tokens with pairs of adjacent tokens.
            # wordstream is "tiled" into non-overlapping unigrams and
            # bigrams.  Non-overlap is important to prevent a single original
            # token from contributing to more than one spamprob returned
            # (systematic correlation probably isn't a good thing).

            # First fill list raw with
            #     (distance, prob, word, record), indices
            # pairs, one for each unigram and bigram in wordstream.
            # indices is a tuple containing the indices (0-based relative to
            # the start of wordstream) of the tokens that went into word.
            # indices is a 1-tuple for an original token, and a 2-tuple for
            # a synthesized bigram token.  The indices are needed to detect
            # overlap later.
            raw = []
            push = raw.append
            pair = None
            # Keep track of which tokens we've already seen.
            # Don't use a set here!  This is an innermost loop, so speed is
            # important here (direct dict fiddling is much quicker than
            # invoking Python-level set methods; in Python 2.4 that will
            # change).
            seen = {pair: 1} # so the bigram token is skipped on 1st loop trip
            for i, token in enumerate(wordstream):
                if i:   # not the 1st loop trip, so there is a preceding token
                    # This string interpolation must match the one in
                    # _enhance_wordstream().
                    pair = "bi:%s %s" % (last_token, token)
                last_token = token
                for clue, indices in (token, (i,)), (pair, (i-1, i)):
                    if clue not in seen:    # as always, skip duplicates
                        seen[clue] = 1
                        tup = self._worddistanceget(clue)
                        if tup[0] >= mindist:
                            push((tup, indices))

            # Sort raw, strongest (farthest from 0.5) to weakest.
            raw.sort()
            raw.reverse()
            # Fill clues with the strongest non-overlapping clues.
            clues = []
            push = clues.append
            # Keep track of which indices have already contributed to a
            # clue in clues.
            seen = {}
            for tup, indices in raw:
                overlap = [i for i in indices if i in seen]
                if not overlap: # no overlap with anything already in clues
                    for i in indices:
                        seen[i] = 1
                    push(tup)
            # Leave sorted from weakest to strongest clue.
            clues.reverse()

        else:
            # The all-unigram scheme just scores the tokens as-is.  A set()
            # is used to weed out duplicates at high speed.
            clues = []
            push = clues.append
            for word in set(wordstream):
                tup = self._worddistanceget(word)
                if tup[0] >= mindist:
                    push(tup)
            clues.sort()

        # Keep only the strongest clues, which sit at the end of the list.
        # The surplus is computed explicitly: slicing with a negated limit
        # (clues[0:-limit]) selects nothing when the limit is 0, which
        # would keep every clue instead of none.
        limit = options["Classifier", "max_discriminators"]
        if len(clues) > limit:
            del clues[:len(clues) - limit]
        # Return (prob, word, record).
        return [t[1:] for t in clues]

    def _worddistanceget(self, word):
        record = self._wordinfoget(word)
        if record is None:
            prob = options["Classifier", "unknown_word_prob"]
        else:
            prob = self.probability(record)
        distance = abs(prob - 0.5)
        return distance, prob, word, record

    def _wordinfoget(self, word):
        """Return the record stored for word in self.wordinfo.

        The lookup goes through the mapping's get(), so a word with no
        record yields get()'s default (None for a plain dict) instead of
        raising.
        """
        return self.wordinfo.get(word)

    def _wordinfoset(self, word, record):
        """Store record under word in self.wordinfo, replacing any old one."""
        self.wordinfo[word] = record

    def _wordinfodel(self, word):
        """Delete word's entry from self.wordinfo.

        Uses a plain ``del``, so the mapping decides what happens when word
        is absent (a dict raises KeyError).
        """
        del self.wordinfo[word]

    def _enhance_wordstream(self, wordstream):
        """Add bigrams to the wordstream.

        For example, a b c -> a b "a b" c "b c"

        Note that these are *token* bigrams, and not *word* bigrams - i.e.
        'synthetic' tokens get bigram'ed, too.

        The bigram token is simply "bi:unigram1 unigram2" - a space should
        be sufficient as a separator, since spaces aren't in any other
        tokens, apart from 'synthetic' ones.  The "bi:" prefix is added
        to avoid conflict with tokens we generate (like "subject: word",
        which could be "word" in a subject, or a bigram of "subject:" and
        "word").

        If the "Classifier":"use_bigrams" option is removed, this function
        can be removed, too.
        """

        last = None
        for token in wordstream:
            yield token
            if last:
                # This string interpolation must match the one in
                # _getclues().
                yield "bi:%s %s" % (last, token)
            last = token

    def _generate_slurp(self):
        # We don't want to do this recursively and check URLs
        # on webpages, so we have this little cheat.
        if not hasattr(self, "setup_done"):
            self.setup()
            self.setup_done = True
        if not hasattr(self, "do_slurp") or self.do_slurp:
            if slurp_wordstream:
                self.do_slurp = False

                tokens = self.slurp(*slurp_wordstream)
                self.do_slurp = True
                self._save_caches()
                return tokens
        return []

    def setup(self):
        # Can't import this at the top because it's circular.
        # XXX Someone smarter than me, please figure out the right
        # XXX way to do this.
        from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory

        username = options["globals", "proxy_username"]
        password = options["globals", "proxy_password"]
        server = options["globals", "proxy_server"]
        if server.find(":") != -1:
            server, port = server.split(':', 1)
        else:
            port = 8080
        if server:
            # Build a new opener that uses a proxy requiring authorization
            proxy_support = urllib2.ProxyHandler({"http" : \
                                                  "http://%s:%[email protected]%s:%d" % \
                                                  (username, password,
                                                   server, port)})
            opener = urllib2.build_opener(proxy_support,
                                          urllib2.HTTPHandler)
        else:
            # Build a new opener without any proxy information.
            opener = urllib2.build_opener(urllib2.HTTPHandler)

        # Install it
        urllib2.install_opener(opener)

        # Setup the cache for retrieved urls
        age = options["URLRetriever", "x-cache_expiry_days"]*24*60*60
        dir = options["URLRetriever", "x-cache_directory"]
        if not os.path.exists(dir):
            # Create the directory.
            if options["globals", "verbose"]:
                print >> sys.stderr, "Creating URL cache directory"
            os.makedirs(dir)

        self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(),
                                          dir, cacheSize=20)
        # Kill any old information in the cache
        self.urlCorpus.removeExpiredMessages()

        # Setup caches for unretrievable urls
        self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
        self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
        if os.path.exists(self.bad_url_cache_name):
            try:
                self.bad_urls = pickle_read(self.bad_url_cache_name)
            except (IOError, ValueError):
                # Something went wrong loading it (bad pickle,
                # probably).  Start afresh.
                if options["globals", "verbose"]:
                    print >> sys.stderr, "Bad URL pickle, using new."
                self.bad_urls = {"url:non_resolving": (),
                                 "url:non_html": (),
                                 "url:unknown_error": ()}
        else:
            if options["globals", "verbose"]:
                print "URL caches don't exist: creating"
            self.bad_urls = {"url:non_resolving": (),
                        "url:non_html": (),
                        "url:unknown_error": ()}
        if os.path.exists(self.http_error_cache_name):
            try:
                self.http_error_urls = pickle_read(self.http_error_cache_name)
            except IOError, ValueError:
                # Something went wrong loading it (bad pickle,
                # probably).  Start afresh.
                if options["globals", "verbose"]:
                    print >> sys.stderr, "Bad HHTP error pickle, using new."
                self.http_error_urls = {}
        else:
Exemple #8
0
class State:
    def __init__(self):
        """Initialises the State object that holds the state of the app.
        The default settings are read from Options.py and bayescustomize.ini
        and are then overridden by the command-line processing code in the
        __main__ code below."""
        self.logFile = None
        self.bayes = None
        self.platform_mutex = None
        self.prepared = False
        self.can_stop = True
        self.init()

        # Load up the other settings from Option.py / bayescustomize.ini
        self.uiPort = options["html_ui", "port"]
        self.launchUI = options["html_ui", "launch_browser"]
        self.gzipCache = options["Storage", "cache_use_gzip"]
        self.cacheExpiryDays = options["Storage", "cache_expiry_days"]
        self.runTestServer = False
        self.isTest = False

    def init(self):
        assert not self.prepared, "init after prepare, but before close"
        # Load the environment for translation.
        self.lang_manager = i18n.LanguageManager()
        # Set the system user default language.
        self.lang_manager.set_language(\
            self.lang_manager.locale_default_lang())
        # Set interface to use the user language in the configuration file.
        for language in reversed(options["globals", "language"]):
            # We leave the default in there as the last option, to fall
            # back on if necessary.
            self.lang_manager.add_language(language)
        if options["globals", "verbose"]:
            print "Asked to add languages: " + \
                  ", ".join(options["globals", "language"])
            print "Set language to " + \
                  str(self.lang_manager.current_langs_codes)

        # Open the log file.
        if options["globals", "verbose"]:
            self.logFile = open('_pop3proxy.log', 'wb', 0)

        if not hasattr(self, "servers"):
            # Could have already been set via the command line.
            self.servers = []
            if options["pop3proxy", "remote_servers"]:
                for server in options["pop3proxy", "remote_servers"]:
                    server = server.strip()
                    if server.find(':') > -1:
                        server, port = server.split(':', 1)
                    else:
                        port = '110'
                    self.servers.append((server, int(port)))

        if not hasattr(self, "proxyPorts"):
            # Could have already been set via the command line.
            self.proxyPorts = []
            if options["pop3proxy", "listen_ports"]:
                splitPorts = options["pop3proxy", "listen_ports"]
                self.proxyPorts = map(_addressAndPort, splitPorts)

        if len(self.servers) != len(self.proxyPorts):
            print "pop3proxy_servers & pop3proxy_ports are different lengths!"
            sys.exit()

        # Remember reported errors.
        self.reported_errors = {}

        # Set up the statistics.
        self.totalSessions = 0
        self.activeSessions = 0
        self.numSpams = 0
        self.numHams = 0
        self.numUnsure = 0

        # Unique names for cached messages - see `getNewMessageName()` below.
        self.lastBaseMessageName = ''
        self.uniquifier = 2

    def close(self):
        assert self.prepared, "closed without being prepared!"
        self.servers = None
        if self.bayes is not None:
            # Only store a non-empty db.
            if self.bayes.nham != 0 and self.bayes.nspam != 0:
                state.bayes.store()
            self.bayes.close()
            self.bayes = None
        if self.mdb is not None:
            self.mdb.store()
            self.mdb.close()
            self.mdb = None
            spambayes.message.Message().message_info_db = None

        self.spamCorpus = self.hamCorpus = self.unknownCorpus = None
        self.spamTrainer = self.hamTrainer = None

        self.prepared = False
        close_platform_mutex(self.platform_mutex)
        self.platform_mutex = None

    def prepare(self, can_stop=True):
        """Do whatever needs to be done to prepare for running.  If
        can_stop is False, then we may not let the user shut down the
        proxy - for example, running as a Windows service this should
        be the case."""
        # If we can, prevent multiple servers from running at the same time.
        assert self.platform_mutex is None, "Should not already have the mutex"
        self.platform_mutex = open_platform_mutex()

        self.can_stop = can_stop

        # Do whatever we've been asked to do...
        self.createWorkers()
        self.prepared = True

    def buildServerStrings(self):
        """After the server details have been set up, this creates string
        versions of the details, for display in the Status panel."""
        serverStrings = ["%s:%s" % (s, p) for s, p in self.servers]
        self.serversString = ', '.join(serverStrings)
        self.proxyPortsString = ', '.join(map(_addressPortStr, self.proxyPorts))

    def buildStatusStrings(self):
        """Build the status message(s) to display on the home page of the
        web interface."""
        nspam = self.bayes.nspam
        nham = self.bayes.nham
        if nspam > 10 and nham > 10:
            db_ratio = nham/float(nspam)
            if db_ratio > 5.0:
                self.warning = _("Warning: you have much more ham than " \
                                 "spam - SpamBayes works best with " \
                                 "approximately even numbers of ham and " \
                                 "spam.")
            elif db_ratio < (1/5.0):
                self.warning = _("Warning: you have much more spam than " \
                                 "ham - SpamBayes works best with " \
                                 "approximately even numbers of ham and " \
                                 "spam.")
            else:
                self.warning = ""
        elif nspam > 0 or nham > 0:
            self.warning = _("Database only has %d good and %d spam - " \
                             "you should consider performing additional " \
                             "training.") % (nham, nspam)
        else:
            self.warning = _("Database has no training information.  " \
                             "SpamBayes will classify all messages as " \
                             "'unsure', ready for you to train.")
        # Add an additional warning message if the user's thresholds are
        # truly odd.
        spam_cut = options["Categorization", "spam_cutoff"]
        ham_cut = options["Categorization", "ham_cutoff"]
        if spam_cut < 0.5:
            self.warning += _("<br/>Warning: we do not recommend " \
                              "setting the spam threshold less than 0.5.")
        if ham_cut > 0.5:
            self.warning += _("<br/>Warning: we do not recommend " \
                              "setting the ham threshold greater than 0.5.")
        if ham_cut > spam_cut:
            self.warning += _("<br/>Warning: your ham threshold is " \
                              "<b>higher</b> than your spam threshold. " \
                              "Results are unpredictable.")

    def createWorkers(self):
        """Using the options that were initialised in __init__ and then
        possibly overridden by the driver code, create the Bayes object,
        the Corpuses, the Trainers and so on."""
        print "Loading database...",
        if self.isTest:
            self.useDB = "pickle"
            self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
        if not hasattr(self, "DBName"):
            self.DBName, self.useDB = storage.database_type([])
        self.bayes = storage.open_storage(self.DBName, self.useDB)
        self.mdb = spambayes.message.Message().message_info_db

        # Load stats manager.
        self.stats = Stats.Stats(options, self.mdb)

        self.buildStatusStrings()

        # Don't set up the caches and training objects when running the self-test,
        # so as not to clutter the filesystem.
        if not self.isTest:
            # Create/open the Corpuses.  Use small cache sizes to avoid hogging
            # lots of memory.
            sc = get_pathname_option("Storage", "spam_cache")
            hc = get_pathname_option("Storage", "ham_cache")
            uc = get_pathname_option("Storage", "unknown_cache")
            map(storage.ensureDir, [sc, hc, uc])
            if self.gzipCache:
                factory = GzipFileMessageFactory()
            else:
                factory = FileMessageFactory()
            age = options["Storage", "cache_expiry_days"]*24*60*60
            self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                               '[0123456789\-]*',
                                               cacheSize=20)
            self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                              '[0123456789\-]*',
                                              cacheSize=20)
            self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                                  '[0123456789\-]*',
                                                  cacheSize=20)

            # Given that (hopefully) users will get to the stage
            # where they do not need to do any more regular training to
            # be satisfied with spambayes' performance, we expire old
            # messages from not only the trained corpora, but the unknown
            # as well.
            self.spamCorpus.removeExpiredMessages()
            self.hamCorpus.removeExpiredMessages()
            self.unknownCorpus.removeExpiredMessages()

            # Create the Trainers.
            self.spamTrainer = storage.SpamTrainer(self.bayes)
            self.hamTrainer = storage.HamTrainer(self.bayes)
            self.spamCorpus.addObserver(self.spamTrainer)
            self.hamCorpus.addObserver(self.hamTrainer)

    def getNewMessageName(self):
        # The message name is the time it arrived, with a uniquifier
        # appended if two arrive within one clock tick of each other.
        messageName = "%10.10d" % long(time.time())
        if messageName == self.lastBaseMessageName:
            messageName = "%s-%d" % (messageName, self.uniquifier)
            self.uniquifier += 1
        else:
            self.lastBaseMessageName = messageName
            self.uniquifier = 2
        return messageName

    def RecordClassification(self, cls, score):
        """Record the classification in the session statistics.

        cls should match one of the options["Headers", "header_*_string"]
        values.

        score is the score the message received.        
        """
        if cls == options["Headers", "header_ham_string"]:
            self.numHams += 1
        elif cls == options["Headers", "header_spam_string"]:
            self.numSpams += 1
        else:
            self.numUnsure += 1
        self.stats.RecordClassification(score)
Exemple #9
0
class CoreState:
    """This keeps the global state of the module - the command-line options,
    statistics like how many mails have been classified, the handle of the
    log file, the Classifier and FileCorpus objects, and so on."""

    def __init__(self):
        """Initialises the State object that holds the state of the app.
        The default settings are read from Options.py and bayescustomize.ini
        and are then overridden by the command-line processing code in the
        __main__ code below."""
        self.log_file = None
        self.bayes = None
        self.mutex = None
        self.prepared = False
        self.can_stop = True
        self.plugin = None

        # Unique names for cached messages - see `getNewMessageName()` below.
        self.last_base_message_name = ''
        self.uniquifier = 2

        # Set up the statistics.
        self.numSpams = 0
        self.numHams = 0
        self.numUnsure = 0

        self.servers = ""

        # Load up the other settings from Option.py / bayescustomize.ini
        self.ui_port = options["html_ui", "port"]
        self.launch_ui = options["html_ui", "launch_browser"]
        self.gzip_cache = options["Storage", "cache_use_gzip"]
        self.run_test_server = False
        self.is_test = False

        self.spamCorpus = self.hamCorpus = self.unknownCorpus = None
        self.spam_trainer = self.ham_trainer = None

        self.init()

    def init(self):
        """Reset the language manager, the log file and the record of
        reported errors.

        Called from both __init__() and prepare(); must not be called
        while the state is prepared (asserted).
        """
        assert not self.prepared, "init after prepare, but before close"
## no i18n yet...
##         # Load the environment for translation.
##         self.lang_manager = i18n.LanguageManager()
##         # Set the system user default language.
##         self.lang_manager.set_language(\
##             self.lang_manager.locale_default_lang())
##         # Set interface to use the user language in the configuration file.
##         for language in reversed(options["globals", "language"]):
##             # We leave the default in there as the last option, to fall
##             # back on if necessary.
##             self.lang_manager.add_language(language)
##         if options["globals", "verbose"]:
##             print "Asked to add languages: " + \
##                   ", ".join(options["globals", "language"])
##             print "Set language to " + \
##                   str(self.lang_manager.current_langs_codes)
        self.lang_manager = None

        # Open the log file.
        if options["globals", "verbose"]:
            # This method runs more than once per object, so close a log
            # opened by an earlier call rather than dropping its handle.
            if self.log_file is not None:
                self.log_file.close()
            self.log_file = open('_core_server.log', 'wb', 0)

        # Remember reported errors.
        self.reported_errors = {}

    def close(self):
        """Store (if non-empty) and close the classifier, drop the corpora
        and trainers, and release the platform mutex.

        Must only be called on a prepared state (asserted).
        """
        assert self.prepared, "closed without being prepared!"
        if self.bayes is not None:
            # Only store a non-empty db.
            if self.bayes.nham != 0 and self.bayes.nspam != 0:
                self.bayes.store()
            self.bayes.close()
            self.bayes = None
        spambayes.message.Message().message_info_db = None

        self.spamCorpus = self.hamCorpus = self.unknownCorpus = None
        self.spam_trainer = self.ham_trainer = None

        self.prepared = False
        self.close_platform_mutex()

    def prepare(self, can_stop=True):
        """Do whatever needs to be done to prepare for running.  If
        can_stop is False, then we may not let the user shut down the
        proxy - for example, running as a Windows service this should
        be the case."""

        self.init()
        # If we can, prevent multiple servers from running at the same time.
        assert self.mutex is None, "Should not already have the mutex"
        self.open_platform_mutex()

        self.can_stop = can_stop

        # Do whatever we've been asked to do...
        self.create_workers()
        self.prepared = True

    def build_status_strings(self):
        """Build the status message(s) to display on the home page of the
        web interface.

        The result is stored in self.warning: a note about the amount and
        balance of training data, followed by a warning for each odd
        threshold setting.
        """
        nspam = self.bayes.nspam
        nham = self.bayes.nham
        if nspam > 10 and nham > 10:
            # Enough data to judge the balance between ham and spam.
            db_ratio = nham/float(nspam)
            if db_ratio > 5.0:
                self.warning = _("Warning: you have much more ham than " \
                                 "spam - SpamBayes works best with " \
                                 "approximately even numbers of ham and " \
                                 "spam.")
            elif db_ratio < (1/5.0):
                self.warning = _("Warning: you have much more spam than " \
                                 "ham - SpamBayes works best with " \
                                 "approximately even numbers of ham and " \
                                 "spam.")
            else:
                self.warning = ""
        elif nspam > 0 or nham > 0:
            self.warning = _("Database only has %d good and %d spam - " \
                             "you should consider performing additional " \
                             "training.") % (nham, nspam)
        else:
            self.warning = _("Database has no training information.  " \
                             "SpamBayes will classify all messages as " \
                             "'unsure', ready for you to train.")
        # Add an additional warning message if the user's thresholds are
        # truly odd.
        spam_cut = options["Categorization", "spam_cutoff"]
        ham_cut = options["Categorization", "ham_cutoff"]
        if spam_cut < 0.5:
            self.warning += _("<br/>Warning: we do not recommend " \
                              "setting the spam threshold less than 0.5.")
        if ham_cut > 0.5:
            self.warning += _("<br/>Warning: we do not recommend " \
                              "setting the ham threshold greater than 0.5.")
        if ham_cut > spam_cut:
            self.warning += _("<br/>Warning: your ham threshold is " \
                              "<b>higher</b> than your spam threshold. " \
                              "Results are unpredictable.")

    def create_workers(self):
        """Using the options that were initialised in __init__ and then
        possibly overridden by the driver code, create the Bayes object,
        the Corpuses, the Trainers and so on."""
        if self.is_test:
            self.use_db = "pickle"
            self.db_name = '_core_server.pickle'   # This is never saved.
        if not hasattr(self, "db_name"):
            self.db_name, self.use_db = storage.database_type([])
        self.bayes = storage.open_storage(self.db_name, self.use_db)

        # Load stats manager.
        self.stats = Stats.Stats(options,
                                 spambayes.message.Message().message_info_db)

        self.build_status_strings()

        # Don't set up the caches and training objects when running the
        # self-test, so as not to clutter the filesystem.
        if not self.is_test:
            # Create/open the Corpuses.  Use small cache sizes to avoid
            # hogging lots of memory.
            sc = get_pathname_option("Storage", "core_spam_cache")
            hc = get_pathname_option("Storage", "core_ham_cache")
            uc = get_pathname_option("Storage", "core_unknown_cache")
            for d in [sc, hc, uc]:
                storage.ensureDir(d)
            if self.gzip_cache:
                factory = GzipFileMessageFactory()
            else:
                factory = FileMessageFactory()
            age = options["Storage", "cache_expiry_days"]*24*60*60
            self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                               '[0123456789\-]*',
                                               cacheSize=20)
            self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                              '[0123456789\-]*',
                                              cacheSize=20)
            self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                                  '[0123456789\-]*',
                                                  cacheSize=20)

            # Given that (hopefully) users will get to the stage
            # where they do not need to do any more regular training to
            # be satisfied with spambayes' performance, we expire old
            # messages from not only the trained corpora, but the unknown
            # as well.
            self.spamCorpus.removeExpiredMessages()
            self.hamCorpus.removeExpiredMessages()
            self.unknownCorpus.removeExpiredMessages()

            # Create the Trainers.
            self.spam_trainer = storage.SpamTrainer(self.bayes)
            self.ham_trainer = storage.HamTrainer(self.bayes)
            self.spamCorpus.addObserver(self.spam_trainer)
            self.hamCorpus.addObserver(self.ham_trainer)

    def getNewMessageName(self):
        """The message name is the time it arrived with a uniquifier
        appended if two arrive within one clock tick of each other.
        """
        message_name = "%10.10d" % long(time.time())
        if message_name == self.last_base_message_name:
            message_name = "%s-%d" % (message_name, self.uniquifier)
            self.uniquifier += 1
        else:
            self.last_base_message_name = message_name
            self.uniquifier = 2
        return message_name

    def record_classification(self, cls, score):
        """Record the classification in the session statistics.

        cls should match one of the options["Headers", "header_*_string"]
        values.

        score is the score the message received.
        """
        if cls == options["Headers", "header_ham_string"]:
            self.numHams += 1
        elif cls == options["Headers", "header_spam_string"]:
            self.numSpams += 1
        else:
            # Anything that is neither the ham nor the spam string.
            self.numUnsure += 1
        self.stats.RecordClassification(score)

    def buildStatusStrings(self):
        """Return an empty string.

        NOTE(review): the status text itself is built by
        build_status_strings(); this looks like a placeholder kept for
        callers using the old name - confirm.
        """
        return ""

    def recreate_state(self):
        """Close this state if it is prepared, then return a new, prepared
        CoreState."""
        if self.prepared:
            # Close the state (which saves if necessary)
            self.close()
        # And get a new one going.
        state = CoreState()

        state.prepare()
        return state

    def open_platform_mutex(self, mutex_name="SpamBayesServer"):
        """Implementations of a mutex or other resource which can prevent
        multiple servers starting at once.  Platform specific as no
        reasonable cross-platform solution exists (however, an old trick is
        to use a directory for a mutex, as a create/test atomic API
        generally exists).  Will set self.mutex or may throw
        AlreadyRunningException
        """

        if sys.platform.startswith("win"):
            try:
                import win32event, win32api, winerror
                # ideally, the mutex name could include either the username,
                # or the munged path to the INI file - this would mean we
                # would allow multiple starts so long as they weren't for
                # the same user.  However, as of now, the service version
                # is likely to start as a different user, so a single mutex
                # is best for now.
                # XXX - even if we do get clever with another mutex name, we
                # should consider still creating a non-exclusive
                # "SpamBayesServer" mutex, if for no better reason than so
                # an installer can check if we are running
                try:
                    hmutex = win32event.CreateMutex(None, True, mutex_name)
                except win32event.error as details:
                    # If another user has the mutex open, we get an "access
                    # denied" error - this is still telling us what we need
                    # to know.
                    if details[0] != winerror.ERROR_ACCESS_DENIED:
                        raise
                    raise AlreadyRunningException
                # mutex opened - now check if we actually created it.
                if win32api.GetLastError()==winerror.ERROR_ALREADY_EXISTS:
                    win32api.CloseHandle(hmutex)
                    raise AlreadyRunningException
                self.mutex = hmutex
                return
            except ImportError:
                # no win32all - no worries, just start
                pass
        # No mutex support here: run without one.
        self.mutex = None

    def close_platform_mutex(self):
        """Toss out the current mutex."""
        if sys.platform.startswith("win"):
            if self.mutex is not None:
                self.mutex.Close()
        self.mutex = None
Exemple #10
0
class Classifier:
    WordInfoClass = WordInfo
    def __init__(self):
        self.wordinfo = {}
        self.probcache = {}
        self.nspam = self.nham = 0
    def __getstate__(self):
        """Return the state to pickle: a version tag, the token table and
        the spam and ham message counts.  The probability cache is not
        included."""
        state = (PICKLE_VERSION, self.wordinfo, self.nspam, self.nham)
        return state
    def __setstate__(self, t):
        """Restore state from the tuple produced by __getstate__.

        Raises ValueError if the leading version tag is not PICKLE_VERSION.
        The probability cache always starts out empty.
        """
        version = t[0]
        if version != PICKLE_VERSION:
            raise ValueError("Can't unpickle -- version %s unknown" % version)
        self.wordinfo, self.nspam, self.nham = t[1:]
        self.probcache = {}
    def chi2_spamprob(self, wordstream, evidence=False):
        """Score wordstream with chi-squared combining.

        wordstream is an iterable object producing words.  The result is
        the best-guess probability that the message is spam, a float in
        [0.0, 1.0]; it is 0.5 when no clues were found.

        If evidence is true, a pair (probability, clues) is returned
        instead, where clues is a list of (word, probability) pairs sorted
        by probability and preceded by the two summary entries
        ('*H*', H) and ('*S*', S).
        """
        from math import frexp, log as ln

        clues = self._getclues(wordstream)

        # Accumulate the products of (1 - p) and of p.  Each product is
        # kept as mantissa * 2**exponent so it cannot underflow to zero.
        spam_mant = ham_mant = 1.0
        spam_exp = ham_exp = 0
        for prob, word, record in clues:
            spam_mant *= 1.0 - prob
            ham_mant *= prob
            if spam_mant < 1e-200:  # prevent underflow
                spam_mant, shift = frexp(spam_mant)
                spam_exp += shift
            if ham_mant < 1e-200:  # prevent underflow
                ham_mant, shift = frexp(ham_mant)
                ham_exp += shift

        # Natural logs of the two products, with the exponents folded back.
        S = ln(spam_mant) + spam_exp * LN2
        H = ln(ham_mant) + ham_exp * LN2

        n = len(clues)
        if n:
            S = 1.0 - chi2Q(-2.0 * S, 2*n)
            H = 1.0 - chi2Q(-2.0 * H, 2*n)
            prob = (S-H + 1.0) / 2.0
        else:
            prob = 0.5

        if not evidence:
            return prob

        clues = [(w, p) for p, w, r in clues]
        clues.sort(lambda a, b: cmp(a[1], b[1]))
        clues[:0] = [('*H*', H), ('*S*', S)]
        return prob, clues
    def slurping_spamprob(self, wordstream, evidence=False):
        """Do the standard chi-squared spamprob, but if the evidence
        leaves the score in the unsure range, and we have fewer tokens
        than max_discriminators, also generate tokens from the text
        obtained by following http URLs in the message.

        The second score replaces the first only when it falls outside
        the unsure range.  Returns the probability, or the pair
        (probability, clues) when evidence is true; clues has the same
        layout as the evidence list from chi2_spamprob.
        """
        h_cut = options["Categorization", "ham_cutoff"]
        s_cut = options["Categorization", "spam_cutoff"]
        prob, clues = self.chi2_spamprob(wordstream, True)
        # chi2_spamprob puts the '*H*' and '*S*' summary entries at the
        # front of the evidence list.  They are not tokens, so they must
        # neither count towards max_discriminators nor be fed back into
        # the classifier as words.
        token_clues = clues[2:]
        if len(token_clues) < options["Classifier", "max_discriminators"] and \
           prob > h_cut and prob < s_cut and slurp_wordstream:
            slurp_tokens = list(self._generate_slurp())
            slurp_tokens.extend([w for (w, p) in token_clues])
            sprob, sclues = self.chi2_spamprob(slurp_tokens, True)
            if sprob < h_cut or sprob > s_cut:
                prob = sprob
                clues = sclues
        if evidence:
            return prob, clues
        return prob
    # Choose the public spamprob entry point once, while the class body is
    # being executed: the URL-slurping scorer when "x-slurp_urls" is on,
    # the plain chi-squared scorer otherwise.
    # NOTE(review): nothing here binds spamprob when
    # "use_chi_squared_combining" is off - confirm that is intended.
    if options["Classifier", "use_chi_squared_combining"]:
        if options["URLRetriever", "x-slurp_urls"]:
            spamprob = slurping_spamprob
        else:
            spamprob = chi2_spamprob
    def learn(self, wordstream, is_spam):
        """Train the classifier on one message.

        wordstream is the word stream representing the message.  A true
        is_spam says the message is definitely spam; a false one says it
        is definitely not spam.  Bigram and slurped-URL tokens are added
        to the stream first when the matching options are switched on.
        """
        tokens = wordstream
        if options["Classifier", "use_bigrams"]:
            tokens = self._enhance_wordstream(tokens)
        if options["URLRetriever", "x-slurp_urls"]:
            tokens = self._add_slurped(tokens)
        self._add_msg(tokens, is_spam)
    def unlearn(self, wordstream, is_spam):
        """Undo an earlier learn() call made by mistake.

        Call this as soon as possible after the mistake, passing the same
        arguments that were passed to learn().  The word stream goes
        through the same optional bigram / slurped-URL expansion before
        the message is removed.
        """
        tokens = wordstream
        if options["Classifier", "use_bigrams"]:
            tokens = self._enhance_wordstream(tokens)
        if options["URLRetriever", "x-slurp_urls"]:
            tokens = self._add_slurped(tokens)
        self._remove_msg(tokens, is_spam)
    def probability(self, record):
        """Compute, store, and return prob(msg is spam | msg contains word).

        This is the Graham calculation, but stripped of biases, and
        stripped of clamping into 0.01 thru 0.99.  The Bayesian
        adjustment following keeps them in a sane range, and one
        that naturally grows the more evidence there is to back up
        a probability.

        record supplies the spamcount and hamcount of one token.  Results
        are memoised in self.probcache, keyed by that pair of counts.  A
        record whose counts are both zero carries no evidence and gets
        the configured "unknown_word_prob".
        """
        spamcount = record.spamcount
        hamcount = record.hamcount
        try:
            return self.probcache[spamcount][hamcount]
        except KeyError:
            pass

        # Treat an untrained category as one message to avoid dividing
        # by zero below.
        nham = float(self.nham or 1)
        nspam = float(self.nspam or 1)
        assert hamcount <= nham, "Token seen in more ham than ham trained."
        hamratio = hamcount / nham
        assert spamcount <= nspam, "Token seen in more spam than spam trained."
        spamratio = spamcount / nspam

        S = options["Classifier", "unknown_word_strength"]
        X = options["Classifier", "unknown_word_prob"]
        n = hamcount + spamcount
        if n:
            prob = spamratio / (hamratio + spamratio)
            # Bayesian adjustment: pull the raw estimate towards X, less
            # strongly the more often the token has been seen.
            StimesX = S * X
            prob = (StimesX + n * prob) / (S + n)
        else:
            # With no sightings the raw ratio is 0/0; the adjusted value
            # reduces to the prior.
            prob = X

        self.probcache.setdefault(spamcount, {})[hamcount] = prob
        return prob
    def _add_msg(self, wordstream, is_spam):
        """Count one more trained message and bump the matching counter
        (spamcount or hamcount) of every distinct token in wordstream,
        creating records for tokens not seen before."""
        self.probcache = {}    # cached probabilities are about to go stale
        if is_spam:
            self.nspam += 1
            counter = "spamcount"
        else:
            self.nham += 1
            counter = "hamcount"
        for word in Set(wordstream):
            record = self._wordinfoget(word)
            if record is None:
                record = self.WordInfoClass()
            setattr(record, counter, getattr(record, counter) + 1)
            self._wordinfoset(word, record)
        self._post_training()
    def _remove_msg(self, wordstream, is_spam):
        """Reverse _add_msg for one message.

        Raises ValueError if the relevant message count is already zero.
        Token counters never go below zero, and a record whose two
        counters both reach zero is deleted.
        """
        self.probcache = {}    # cached probabilities are about to go stale
        if is_spam:
            if self.nspam <= 0:
                raise ValueError("spam count would go negative!")
            self.nspam -= 1
            counter = "spamcount"
        else:
            if self.nham <= 0:
                raise ValueError("non-spam count would go negative!")
            self.nham -= 1
            counter = "hamcount"
        for word in Set(wordstream):
            record = self._wordinfoget(word)
            if record is None:
                continue
            current = getattr(record, counter)
            if current > 0:
                setattr(record, counter, current - 1)
            if record.hamcount == 0 == record.spamcount:
                self._wordinfodel(word)
            else:
                self._wordinfoset(word, record)
        self._post_training()
    def _post_training(self):
        """This is called after training on a wordstream.  Subclasses might
        want to ensure that their databases are in a consistent state at
        this point.  Introduced to fix bug #797890."""
        pass
    def _getclues(self, wordstream):
        """Pick the tokens of wordstream that will be used for scoring.

        Returns a list of (prob, word, record) tuples, at most
        "max_discriminators" long, ordered from the weakest to the
        strongest clue, where strength is the distance of prob from 0.5.
        Tokens closer to 0.5 than "minimum_prob_strength" are ignored.
        """
        mindist = options["Classifier", "minimum_prob_strength"]
        if options["Classifier", "use_bigrams"]:
            # Pass 1: collect every distinct unigram and bigram that is
            # strong enough, along with the token positions it covers.
            raw = []
            push = raw.append
            pair = None
            seen = {pair: 1} # so the bigram token is skipped on 1st loop trip
            for i, token in enumerate(wordstream):
                if i:   # not the 1st loop trip, so there is a preceding token
                    pair = "bi:%s %s" % (last_token, token)
                last_token = token
                # A unigram covers position i; a bigram covers i-1 and i.
                for clue, indices in (token, (i,)), (pair, (i-1, i)):
                    if clue not in seen:    # as always, skip duplicates
                        seen[clue] = 1
                        tup = self._worddistanceget(clue)
                        if tup[0] >= mindist:
                            push((tup, indices))
            # Pass 2: walk the candidates from strongest to weakest and
            # keep one only if none of its positions has been claimed.
            raw.sort()
            raw.reverse()
            clues = []
            push = clues.append
            seen = {}   # now holds the token positions already claimed
            for tup, indices in raw:
                overlap = [i for i in indices if i in seen]
                if not overlap: # no overlap with anything already in clues
                    for i in indices:
                        seen[i] = 1
                    push(tup)
            # Back to weakest-first, the order the other branch produces.
            clues.reverse()
        else:
            # Unigrams only: each distinct strong-enough word, sorted
            # weakest-first.
            clues = []
            push = clues.append
            for word in Set(wordstream):
                tup = self._worddistanceget(word)
                if tup[0] >= mindist:
                    push(tup)
            clues.sort()
        # The strongest clues sit at the end, so trim from the front.
        if len(clues) > options["Classifier", "max_discriminators"]:
            del clues[0 : -options["Classifier", "max_discriminators"]]
        # Drop the leading distance from each tuple.
        return [t[1:] for t in clues]
    def _worddistanceget(self, word):
        """Return (distance, prob, word, record) for word.

        prob is the word's spam probability, or "unknown_word_prob" when
        there is no record for it; distance is how far prob lies from
        0.5; record is the stored record or None.
        """
        record = self._wordinfoget(word)
        if record is not None:
            prob = self.probability(record)
        else:
            prob = options["Classifier", "unknown_word_prob"]
        return abs(prob - 0.5), prob, word, record
    def _wordinfoget(self, word):
        return self.wordinfo.get(word)
    def _wordinfoset(self, word, record):
        self.wordinfo[word] = record
    def _wordinfodel(self, word):
        del self.wordinfo[word]
    def _enhance_wordstream(self, wordstream):
        """Add bigrams to the wordstream.
        For example, a b c -> a b "a b" c "b c"
        Note that these are *token* bigrams, and not *word* bigrams - i.e.
        'synthetic' tokens get bigram'ed, too.
        The bigram token is simply "bi:unigram1 unigram2" - a space should
        be sufficient as a separator, since spaces aren't in any other
        tokens, apart from 'synthetic' ones.  The "bi:" prefix is added
        to avoid conflict with tokens we generate (like "subject: word",
        which could be "word" in a subject, or a bigram of "subject:" and
        "word").
        If the "Classifier":"use_bigrams" option is removed, this function
        can be removed, too.
        """
        last = None
        for token in wordstream:
            yield token
            if last:
                yield "bi:%s %s" % (last, token)
            last = token
    def _generate_slurp(self):
        """Return the tokens obtained by slurping slurp_wordstream, or an
        empty list when there is nothing to slurp or slurping is
        currently switched off through self.do_slurp.

        self.setup() is run the first time this is called.  After a
        successful slurp the caches are saved with self._save_caches().
        """
        if not hasattr(self, "setup_done"):
            self.setup()
            self.setup_done = True
        if not hasattr(self, "do_slurp") or self.do_slurp:
            if slurp_wordstream:
                # do_slurp is held False for the duration of the slurp.
                # Restore it even if slurp() raises, otherwise one failure
                # would disable slurping for the rest of this object's
                # life.
                self.do_slurp = False
                try:
                    tokens = self.slurp(*slurp_wordstream)
                finally:
                    self.do_slurp = True
                self._save_caches()
                return tokens
        return []
    def setup(self):
        """Prepare for URL retrieval: install a urllib2 opener (through
        the configured proxy when a proxy server is set), open the URL
        cache corpus and load the pickled cache of bad URLs."""
        # NOTE(review): this definition is cut off in the excerpt - it
        # ends on a bare "else:" - so only the visible part is documented.
        from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory
        username = options["globals", "proxy_username"]
        password = options["globals", "proxy_password"]
        server = options["globals", "proxy_server"]
        # "host:port" is split apart; a bare host gets port 8080.
        # NOTE(review): after the split, port is a string, but it is
        # formatted with %d below - confirm whether int(port) is needed.
        if server.find(":") != -1:
            server, port = server.split(':', 1)
        else:
            port = 8080
        if server:
            # NOTE(review): the format string below has 3 conversions for
            # 4 values and contains "[email protected]", which looks like
            # e-mail obfuscation damage; presumably it was
            # "http://%s:%s@%s:%d" - TODO confirm against upstream.
            proxy_support = urllib2.ProxyHandler({"http" : \
                                                  "http://%s:%[email protected]%s:%d" % \
                                                  (username, password,
                                                   server, port)})
            opener = urllib2.build_opener(proxy_support,
                                          urllib2.HTTPHandler)
        else:
            opener = urllib2.build_opener(urllib2.HTTPHandler)
        urllib2.install_opener(opener)
        # The cache expiry option is in days; convert it to seconds.
        age = options["URLRetriever", "x-cache_expiry_days"]*24*60*60
        dir = options["URLRetriever", "x-cache_directory"]
        if not os.path.exists(dir):
            if options["globals", "verbose"]:
                print >>sys.stderr, "Creating URL cache directory"
            os.makedirs(dir)
        self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(),
                                          dir, cacheSize=20)
        self.urlCorpus.removeExpiredMessages()
        self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
        self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
        if os.path.exists(self.bad_url_cache_name):
            b_file = file(self.bad_url_cache_name, "r")
            try:
                self.bad_urls = pickle.load(b_file)
            # NOTE(review): in Python 2 "except IOError, ValueError" catches
            # only IOError and binds it to the name ValueError; presumably
            # "except (IOError, ValueError):" was intended - confirm.
            except IOError, ValueError:
                if options["globals", "verbose"]:
                    print >>sys.stderr, "Bad URL pickle, using new."
                # Fall back to an empty cache, one entry per failure kind.
                self.bad_urls = {"url:non_resolving": (),
                                 "url:non_html": (),
                                 "url:unknown_error": ()}
            b_file.close()
        else:
Exemple #11
0
class State:
    def __init__(self):
        """Set up the object that holds the state of the application.

        The defaults come from Options.py and bayescustomize.ini; the
        command-line processing in the __main__ code may override them
        afterwards.
        """
        self.logFile = self.bayes = self.platform_mutex = None
        self.prepared = False
        # init() relies on the attributes above already being in place.
        self.init()

        # Remaining settings from Options.py / bayescustomize.ini.
        self.uiPort = options["html_ui", "port"]
        self.launchUI = options["html_ui", "launch_browser"]
        self.gzipCache = options["Storage", "cache_use_gzip"]
        self.cacheExpiryDays = options["Storage", "cache_expiry_days"]

        # Self-test switches, off by default.
        self.runTestServer = False
        self.isTest = False

    def init(self):
        assert not self.prepared, "init after prepare, but before close"
        # Open the log file.
        if options["globals", "verbose"]:
            self.logFile = open('_pop3proxy.log', 'wb', 0)
        self.servers = []
        self.proxyPorts = []
        if options["pop3proxy", "remote_servers"]:
            for server in options["pop3proxy", "remote_servers"]:
                server = server.strip()
                if server.find(':') > -1:
                    server, port = server.split(':', 1)
                else:
                    port = '110'
                self.servers.append((server, int(port)))

        if options["pop3proxy", "listen_ports"]:
            splitPorts = options["pop3proxy", "listen_ports"]
            self.proxyPorts = map(_addressAndPort, splitPorts)

        if len(self.servers) != len(self.proxyPorts):
            print "pop3proxy_servers & pop3proxy_ports are different lengths!"
            sys.exit()

        # Remember reported errors.
        self.reported_errors = {}

        # Set up the statistics.
        self.totalSessions = 0
        self.activeSessions = 0
        self.numSpams = 0
        self.numHams = 0
        self.numUnsure = 0

        # Unique names for cached messages - see `getNewMessageName()` below.
        self.lastBaseMessageName = ''
        self.uniquifier = 2

    def close(self):
        """Shut down a prepared state: store and close the database, drop
        the corpora and trainers, and release the platform mutex.

        Must only be called after prepare(); afterwards the object is no
        longer marked as prepared.
        """
        assert self.prepared, "closed without being prepared!"
        self.servers = None
        if self.bayes is not None:
            # Only store a db that has both ham and spam trained.
            if self.bayes.nham != 0 and self.bayes.nspam != 0:
                # Store this object's own database rather than reaching
                # for a module-level "state" instance, which need not be
                # this object.
                self.bayes.store()
            self.bayes.close()
            self.bayes = None

        self.spamCorpus = self.hamCorpus = self.unknownCorpus = None
        self.spamTrainer = self.hamTrainer = None

        self.prepared = False
        close_platform_mutex(self.platform_mutex)
        self.platform_mutex = None

    def prepare(self):
        """Take the platform mutex, create the workers and mark the state
        as prepared.  The mutex must not already be held."""
        assert self.platform_mutex is None, "Should not already have the mutex"
        # If we can, prevent multiple servers from running at the same time.
        mutex = open_platform_mutex()
        self.platform_mutex = mutex

        # Do whatever we've been asked to do...
        self.createWorkers()
        self.prepared = True

    def buildServerStrings(self):
        """Build the display strings for the Status panel: serversString
        from self.servers and proxyPortsString from self.proxyPorts.
        Call this once the server details have been set up."""
        server_labels = []
        for host, port in self.servers:
            server_labels.append("%s:%s" % (host, port))
        self.serversString = ', '.join(server_labels)

        port_labels = [_addressPortStr(entry) for entry in self.proxyPorts]
        self.proxyPortsString = ', '.join(port_labels)

    def buildStatusStrings(self):
        """Build the status message(s) to display on the home page of the
        web interface."""
        nspam = self.bayes.nspam
        nham = self.bayes.nham
        if nspam > 10 and nham > 10:
            db_ratio = nham/float(nspam)
            big = small = None
            if db_ratio > 5.0:
                big = "ham"
                small = "spam"
            elif db_ratio < (1/5.0):
                big = "spam"
                small = "ham"
            if big is not None:
                self.warning = "Warning: you have much more %s than %s - " \
                               "SpamBayes works best with approximately even " \
                               "numbers of ham and spam." % (big, small)
            else:
                self.warning = ""
        elif nspam > 0 or nham > 0:
            self.warning = "Database only has %d good and %d spam - you should " \
                           "consider performing additional training." % (nham, nspam)
        else:
            self.warning = "Database has no training information.  SpamBayes " \
                           "will classify all messages as 'unsure', " \
                           "ready for you to train."

    def createWorkers(self):
        """Using the options that were initialised in __init__ and then
        possibly overridden by the driver code, create the Bayes object,
        the Corpuses, the Trainers and so on."""
        print "Loading database...",
        if self.isTest:
            self.useDB = "pickle"
            self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
        if not hasattr(self, "DBName"):
            self.DBName, self.useDB = storage.database_type([])
        self.bayes = storage.open_storage(self.DBName, self.useDB)
        
        self.buildStatusStrings()

        # Don't set up the caches and training objects when running the self-test,
        # so as not to clutter the filesystem.
        if not self.isTest:
            def ensureDir(dirname):
                try:
                    os.mkdir(dirname)
                except OSError, e:
                    if e.errno != errno.EEXIST:
                        raise

            # Create/open the Corpuses.  Use small cache sizes to avoid hogging
            # lots of memory.
            sc = get_pathname_option("Storage", "spam_cache")
            hc = get_pathname_option("Storage", "ham_cache")
            uc = get_pathname_option("Storage", "unknown_cache")
            map(ensureDir, [sc, hc, uc])
            if self.gzipCache:
                factory = GzipFileMessageFactory()
            else:
                factory = FileMessageFactory()
            age = options["Storage", "cache_expiry_days"]*24*60*60
            self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                               '[0123456789\-]*',
                                               cacheSize=20)
            self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                              '[0123456789\-]*',
                                              cacheSize=20)
            self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                                  '[0123456789\-]*',
                                                  cacheSize=20)

            # Given that (hopefully) users will get to the stage
            # where they do not need to do any more regular training to
            # be satisfied with spambayes' performance, we expire old
            # messages from not only the trained corpora, but the unknown
            # as well.
            self.spamCorpus.removeExpiredMessages()
            self.hamCorpus.removeExpiredMessages()
            self.unknownCorpus.removeExpiredMessages()

            # Create the Trainers.
            self.spamTrainer = storage.SpamTrainer(self.bayes)
            self.hamTrainer = storage.HamTrainer(self.bayes)
            self.spamCorpus.addObserver(self.spamTrainer)
            self.hamCorpus.addObserver(self.hamTrainer)