class CoreState:
    """This keeps the global state of the module - the command-line
    options, statistics like how many mails have been classified, the
    handle of the log file, the Classifier and FileCorpus objects, and
    so on."""

    def __init__(self):
        """Initialises the State object that holds the state of the app.
        The default settings are read from Options.py and
        bayescustomize.ini and are then overridden by the command-line
        processing code in the __main__ code below."""
        self.log_file = None
        self.bayes = None
        self.mutex = None
        self.prepared = False
        self.can_stop = True
        self.plugin = None

        self.last_base_message_name = ''
        self.uniquifier = 2

        self.numSpams = 0
        self.numHams = 0
        self.numUnsure = 0

        self.servers = ""

        self.ui_port = options["html_ui", "port"]
        self.launch_ui = options["html_ui", "launch_browser"]
        self.gzip_cache = options["Storage", "cache_use_gzip"]

        self.run_test_server = False
        self.is_test = False

        self.spamCorpus = self.hamCorpus = self.unknownCorpus = None
        self.spam_trainer = self.ham_trainer = None

        self.init()

    def init(self):
        assert not self.prepared, "init after prepare, but before close"
        self.lang_manager = None

        # Open the log file.
        if options["globals", "verbose"]:
            self.log_file = open('_core_server.log', 'wb', 0)

        # Remember reported errors.
        self.reported_errors = {}

    def close(self):
        assert self.prepared, "closed without being prepared!"
        if self.bayes is not None:
            # Only store a non-empty db.
            if self.bayes.nham != 0 and self.bayes.nspam != 0:
                self.bayes.store()
            self.bayes.close()
            self.bayes = None
        spambayes.message.Message().message_info_db = None

        self.spamCorpus = self.hamCorpus = self.unknownCorpus = None
        self.spam_trainer = self.ham_trainer = None

        self.prepared = False
        self.close_platform_mutex()

    def prepare(self, can_stop=True):
        """Do whatever needs to be done to prepare for running.  If
        can_stop is False, then we may not let the user shut down the
        proxy - for example, running as a Windows service this should
        be the case."""
        self.init()
        # If we can, prevent multiple servers from running at the same time.
        assert self.mutex is None, "Should not already have the mutex"
        self.open_platform_mutex()

        self.can_stop = can_stop

        self.create_workers()
        self.prepared = True

    def build_status_strings(self):
        """Build the status message(s) to display on the home page of
        the web interface."""
        nspam = self.bayes.nspam
        nham = self.bayes.nham
        if nspam > 10 and nham > 10:
            db_ratio = nham / float(nspam)
            if db_ratio > 5.0:
                self.warning = _("Warning: you have much more ham than "
                                 "spam - SpamBayes works best with "
                                 "approximately even numbers of ham and "
                                 "spam.")
            elif db_ratio < (1 / 5.0):
                self.warning = _("Warning: you have much more spam than "
                                 "ham - SpamBayes works best with "
                                 "approximately even numbers of ham and "
                                 "spam.")
            else:
                self.warning = ""
        elif nspam > 0 or nham > 0:
            self.warning = _("Database only has %d good and %d spam - "
                             "you should consider performing additional "
                             "training.") % (nham, nspam)
        else:
            self.warning = _("Database has no training information. "
                             "SpamBayes will classify all messages as "
                             "'unsure', ready for you to train.")

        # Add an additional warning message if the user's thresholds are
        # truly odd.
        spam_cut = options["Categorization", "spam_cutoff"]
        ham_cut = options["Categorization", "ham_cutoff"]
        if spam_cut < 0.5:
            self.warning += _("<br/>Warning: we do not recommend "
                              "setting the spam threshold less than 0.5.")
        if ham_cut > 0.5:
            self.warning += _("<br/>Warning: we do not recommend "
                              "setting the ham threshold greater than 0.5.")
        if ham_cut > spam_cut:
            self.warning += _("<br/>Warning: your ham threshold is "
                              "<b>higher</b> than your spam threshold. "
                              "Results are unpredictable.")

    def create_workers(self):
        """Using the options that were initialised in __init__ and then
        possibly overridden by the driver code, create the Bayes object,
        the Corpuses, the Trainers and so on."""
        if self.is_test:
            self.use_db = "pickle"
            self.db_name = '_core_server.pickle'   # This is never saved.
        if not hasattr(self, "db_name"):
            self.db_name, self.use_db = storage.database_type([])
        self.bayes = storage.open_storage(self.db_name, self.use_db)

        self.stats = Stats.Stats(options,
                                 spambayes.message.Message().message_info_db)

        self.build_status_strings()

        # Don't set up the caches and training objects when running the
        # self-test, so as not to clutter the filesystem.
        if not self.is_test:
            sc = get_pathname_option("Storage", "core_spam_cache")
            hc = get_pathname_option("Storage", "core_ham_cache")
            uc = get_pathname_option("Storage", "core_unknown_cache")
            for d in [sc, hc, uc]:
                storage.ensureDir(d)
            if self.gzip_cache:
                factory = GzipFileMessageFactory()
            else:
                factory = FileMessageFactory()
            age = options["Storage", "cache_expiry_days"] * 24 * 60 * 60
            self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                               '[0123456789\-]*',
                                               cacheSize=20)
            self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                              '[0123456789\-]*',
                                              cacheSize=20)
            self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                                  '[0123456789\-]*',
                                                  cacheSize=20)

            self.spamCorpus.removeExpiredMessages()
            self.hamCorpus.removeExpiredMessages()
            self.unknownCorpus.removeExpiredMessages()

            # Create the Trainers.
            self.spam_trainer = storage.SpamTrainer(self.bayes)
            self.ham_trainer = storage.HamTrainer(self.bayes)
            self.spamCorpus.addObserver(self.spam_trainer)
            self.hamCorpus.addObserver(self.ham_trainer)

    def getNewMessageName(self):
        """The message name is the time it arrived, with a uniquifier
        appended if two arrive within one clock tick of each other."""
        message_name = "%10.10d" % long(time.time())
        if message_name == self.last_base_message_name:
            message_name = "%s-%d" % (message_name, self.uniquifier)
            self.uniquifier += 1
        else:
            self.last_base_message_name = message_name
            self.uniquifier = 2
        return message_name

    def record_classification(self, cls, score):
        """Record the classification in the session statistics.

        cls should match one of the options["Headers", "header_*_string"]
        values.  score is the score the message received."""
        if cls == options["Headers", "header_ham_string"]:
            self.numHams += 1
        elif cls == options["Headers", "header_spam_string"]:
            self.numSpams += 1
        else:
            self.numUnsure += 1
        self.stats.RecordClassification(score)

    def buildStatusStrings(self):
        return ""

    def recreate_state(self):
        if self.prepared:
            # Close the state (which saves if necessary).
            self.close()
        # And get a new one going.
        state = CoreState()
        state.prepare()
        return state

    def open_platform_mutex(self, mutex_name="SpamBayesServer"):
        """Implementations of a mutex or other resource which can prevent
        multiple servers starting at once.  Platform specific as no
        reasonable cross-platform solution exists (however, an old trick
        is to use a directory for a mutex, as a create/test atomic API
        generally exists).  Will set self.mutex or may throw
        AlreadyRunningException."""
        if sys.platform.startswith("win"):
            try:
                import win32event, win32api, winerror
                try:
                    hmutex = win32event.CreateMutex(None, True, mutex_name)
                except win32event.error, details:
                    # If another user has the mutex open, we get an "access
                    # denied" error - this is still telling us what we need
                    # to know.
                    if details[0] != winerror.ERROR_ACCESS_DENIED:
                        raise
                    raise AlreadyRunningException
                # Mutex opened - now check if we actually created it.
                if win32api.GetLastError() == winerror.ERROR_ALREADY_EXISTS:
                    win32api.CloseHandle(hmutex)
                    raise AlreadyRunningException
                self.mutex = hmutex
                return
            except ImportError:
                # No win32all - no worries, just start.
                pass
        self.mutex = None

    def close_platform_mutex(self):
        """Toss out the current mutex."""
        if sys.platform.startswith("win"):
            if self.mutex is not None:
                self.mutex.Close()
        self.mutex = None
class State:
    def __init__(self):
        """Initialises the State object that holds the state of the app.
        The default settings are read from Options.py and
        bayescustomize.ini and are then overridden by the command-line
        processing code in the __main__ code below."""
        self.logFile = None
        self.bayes = None
        self.platform_mutex = None
        self.prepared = False
        self.can_stop = True
        self.init()

        # Load up the other settings from Option.py / bayescustomize.ini
        self.uiPort = options["html_ui", "port"]
        self.launchUI = options["html_ui", "launch_browser"]
        self.gzipCache = options["Storage", "cache_use_gzip"]
        self.cacheExpiryDays = options["Storage", "cache_expiry_days"]
        self.runTestServer = False
        self.isTest = False

    def init(self):
        assert not self.prepared, "init after prepare, but before close"
        # Load the environment for translation.
        self.lang_manager = i18n.LanguageManager()
        # Set the system user default language.
        self.lang_manager.set_language(
            self.lang_manager.locale_default_lang())
        # Set interface to use the user language in the configuration file.
        for language in reversed(options["globals", "language"]):
            # We leave the default in there as the last option, to fall
            # back on if necessary.
            self.lang_manager.add_language(language)
        if options["globals", "verbose"]:
            print "Asked to add languages: " + \
                  ", ".join(options["globals", "language"])
            print "Set language to " + \
                  str(self.lang_manager.current_langs_codes)

        # Open the log file.
        if options["globals", "verbose"]:
            self.logFile = open('_pop3proxy.log', 'wb', 0)

        if not hasattr(self, "servers"):
            # Could have already been set via the command line.
            self.servers = []
            if options["pop3proxy", "remote_servers"]:
                for server in options["pop3proxy", "remote_servers"]:
                    server = server.strip()
                    if server.find(':') > -1:
                        server, port = server.split(':', 1)
                    else:
                        port = '110'
                    self.servers.append((server, int(port)))
        if not hasattr(self, "proxyPorts"):
            # Could have already been set via the command line.
            self.proxyPorts = []
            if options["pop3proxy", "listen_ports"]:
                splitPorts = options["pop3proxy", "listen_ports"]
                self.proxyPorts = map(_addressAndPort, splitPorts)

        if len(self.servers) != len(self.proxyPorts):
            print "pop3proxy_servers & pop3proxy_ports are different lengths!"
            sys.exit()

        # Remember reported errors.
        self.reported_errors = {}

        # Set up the statistics.
        self.totalSessions = 0
        self.activeSessions = 0
        self.numSpams = 0
        self.numHams = 0
        self.numUnsure = 0

        # Unique names for cached messages - see `getNewMessageName()` below.
        self.lastBaseMessageName = ''
        self.uniquifier = 2

    def close(self):
        assert self.prepared, "closed without being prepared!"
        self.servers = None
        if self.bayes is not None:
            # Only store a non-empty db.
            if self.bayes.nham != 0 and self.bayes.nspam != 0:
                self.bayes.store()
            self.bayes.close()
            self.bayes = None
        if self.mdb is not None:
            self.mdb.store()
            self.mdb.close()
            self.mdb = None
        spambayes.message.Message().message_info_db = None

        self.spamCorpus = self.hamCorpus = self.unknownCorpus = None
        self.spamTrainer = self.hamTrainer = None

        self.prepared = False
        close_platform_mutex(self.platform_mutex)
        self.platform_mutex = None

    def prepare(self, can_stop=True):
        """Do whatever needs to be done to prepare for running.  If
        can_stop is False, then we may not let the user shut down the
        proxy - for example, running as a Windows service this should
        be the case."""
        # If we can, prevent multiple servers from running at the same time.
        assert self.platform_mutex is None, "Should not already have the mutex"
        self.platform_mutex = open_platform_mutex()

        self.can_stop = can_stop

        # Do whatever we've been asked to do...
        self.createWorkers()
        self.prepared = True

    def buildServerStrings(self):
        """After the server details have been set up, this creates string
        versions of the details, for display in the Status panel."""
        serverStrings = ["%s:%s" % (s, p) for s, p in self.servers]
        self.serversString = ', '.join(serverStrings)
        self.proxyPortsString = ', '.join(map(_addressPortStr,
                                              self.proxyPorts))

    def buildStatusStrings(self):
        """Build the status message(s) to display on the home page of
        the web interface."""
        nspam = self.bayes.nspam
        nham = self.bayes.nham
        if nspam > 10 and nham > 10:
            db_ratio = nham / float(nspam)
            if db_ratio > 5.0:
                self.warning = _("Warning: you have much more ham than "
                                 "spam - SpamBayes works best with "
                                 "approximately even numbers of ham and "
                                 "spam.")
            elif db_ratio < (1 / 5.0):
                self.warning = _("Warning: you have much more spam than "
                                 "ham - SpamBayes works best with "
                                 "approximately even numbers of ham and "
                                 "spam.")
            else:
                self.warning = ""
        elif nspam > 0 or nham > 0:
            self.warning = _("Database only has %d good and %d spam - "
                             "you should consider performing additional "
                             "training.") % (nham, nspam)
        else:
            self.warning = _("Database has no training information. "
                             "SpamBayes will classify all messages as "
                             "'unsure', ready for you to train.")

        # Add an additional warning message if the user's thresholds are
        # truly odd.
        spam_cut = options["Categorization", "spam_cutoff"]
        ham_cut = options["Categorization", "ham_cutoff"]
        if spam_cut < 0.5:
            self.warning += _("<br/>Warning: we do not recommend "
                              "setting the spam threshold less than 0.5.")
        if ham_cut > 0.5:
            self.warning += _("<br/>Warning: we do not recommend "
                              "setting the ham threshold greater than 0.5.")
        if ham_cut > spam_cut:
            self.warning += _("<br/>Warning: your ham threshold is "
                              "<b>higher</b> than your spam threshold. "
                              "Results are unpredictable.")

    def createWorkers(self):
        """Using the options that were initialised in __init__ and then
        possibly overridden by the driver code, create the Bayes object,
        the Corpuses, the Trainers and so on."""
        print "Loading database...",
        if self.isTest:
            self.useDB = "pickle"
            self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
        if not hasattr(self, "DBName"):
            self.DBName, self.useDB = storage.database_type([])
        self.bayes = storage.open_storage(self.DBName, self.useDB)
        self.mdb = spambayes.message.Message().message_info_db

        # Load stats manager.
        self.stats = Stats.Stats(options, self.mdb)

        self.buildStatusStrings()

        # Don't set up the caches and training objects when running the
        # self-test, so as not to clutter the filesystem.
        if not self.isTest:
            # Create/open the Corpuses.  Use small cache sizes to avoid
            # hogging lots of memory.
            sc = get_pathname_option("Storage", "spam_cache")
            hc = get_pathname_option("Storage", "ham_cache")
            uc = get_pathname_option("Storage", "unknown_cache")
            map(storage.ensureDir, [sc, hc, uc])
            if self.gzipCache:
                factory = GzipFileMessageFactory()
            else:
                factory = FileMessageFactory()
            age = options["Storage", "cache_expiry_days"] * 24 * 60 * 60
            self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                               '[0123456789\-]*',
                                               cacheSize=20)
            self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                              '[0123456789\-]*',
                                              cacheSize=20)
            self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                                  '[0123456789\-]*',
                                                  cacheSize=20)

            # Given that (hopefully) users will get to the stage where they
            # do not need to do any more regular training to be satisfied
            # with spambayes' performance, we expire old messages from not
            # only the trained corpora, but the unknown as well.
            self.spamCorpus.removeExpiredMessages()
            self.hamCorpus.removeExpiredMessages()
            self.unknownCorpus.removeExpiredMessages()

            # Create the Trainers.
            self.spamTrainer = storage.SpamTrainer(self.bayes)
            self.hamTrainer = storage.HamTrainer(self.bayes)
            self.spamCorpus.addObserver(self.spamTrainer)
            self.hamCorpus.addObserver(self.hamTrainer)

    def getNewMessageName(self):
        # The message name is the time it arrived, with a uniquifier
        # appended if two arrive within one clock tick of each other.
        messageName = "%10.10d" % long(time.time())
        if messageName == self.lastBaseMessageName:
            messageName = "%s-%d" % (messageName, self.uniquifier)
            self.uniquifier += 1
        else:
            self.lastBaseMessageName = messageName
            self.uniquifier = 2
        return messageName

    def RecordClassification(self, cls, score):
        """Record the classification in the session statistics.

        cls should match one of the options["Headers", "header_*_string"]
        values.  score is the score the message received."""
        if cls == options["Headers", "header_ham_string"]:
            self.numHams += 1
        elif cls == options["Headers", "header_spam_string"]:
            self.numSpams += 1
        else:
            self.numUnsure += 1
        self.stats.RecordClassification(score)
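
# init() above splits each configured "host:port" entry on the first colon
# and falls back to the standard POP3 port.  A standalone sketch of that
# parsing rule - a hypothetical helper shown only for illustration; the
# real code inlines this loop and uses _addressAndPort for the listen
# ports:

def _example_parse_server(spec, default_port=110):
    """'mail.example.com:2110' -> ('mail.example.com', 2110)."""
    spec = spec.strip()
    if ':' in spec:
        host, port = spec.split(':', 1)
        return host, int(port)
    return spec, default_port

# e.g. _example_parse_server("pop.example.com") == ("pop.example.com", 110)
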
class Classifier:
    # Defining __slots__ here made Jeremy's life needlessly difficult when
    # trying to hook this all up to ZODB as a persistent object.  There's
    # no space benefit worth getting from slots in this class; slots were
    # used solely to help catch errors earlier, when this code was changing
    # rapidly.
    #
    #__slots__ = ('wordinfo',  # map word to WordInfo record
    #             'nspam',     # number of spam messages learn() has seen
    #             'nham',      # number of non-spam messages learn() has seen
    #            )

    # allow a subclass to use a different class for WordInfo
    WordInfoClass = WordInfo

    def __init__(self):
        self.wordinfo = {}
        self.probcache = {}
        self.nspam = self.nham = 0

    def __getstate__(self):
        return (PICKLE_VERSION, self.wordinfo, self.nspam, self.nham)

    def __setstate__(self, t):
        if t[0] != PICKLE_VERSION:
            raise ValueError("Can't unpickle -- version %s unknown" % t[0])
        (self.wordinfo, self.nspam, self.nham) = t[1:]
        self.probcache = {}

    # spamprob() implementations.  One of the following is aliased to
    # spamprob, depending on option settings.
    # Currently only chi-squared is available, but maybe there will be
    # an alternative again someday.

    # Across vectors of length n, containing random uniformly-distributed
    # probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution
    # with 2*n degrees of freedom.  This has been proven (in some
    # appropriate sense) to be the most sensitive possible test for
    # rejecting the hypothesis that a vector of probabilities is uniformly
    # distributed.  Gary Robinson's original scheme was monotonic *with*
    # this test, but skipped the details.  Turns out that getting closer
    # to the theoretical roots gives a much sharper classification, with
    # a very small (in # of msgs), but also very broad (in range of scores),
    # "middle ground", where most of the mistakes live.  In particular,
    # this scheme seems immune to all forms of "cancellation disease": if
    # there are many strong ham *and* spam clues, this reliably scores
    # close to 0.5.  Most other schemes are extremely certain then -- and
    # often wrong.
    def chi2_spamprob(self, wordstream, evidence=False):
        """Return best-guess probability that wordstream is spam.

        wordstream is an iterable object producing words.
        The return value is a float in [0.0, 1.0].

        If optional arg evidence is True, the return value is a pair
            probability, evidence
        where evidence is a list of (word, probability) pairs.
        """
        from math import frexp, log as ln

        # We compute two chi-squared statistics, one for ham and one for
        # spam.  The sum-of-the-logs business is more sensitive to probs
        # near 0 than to probs near 1, so the spam measure uses 1-p (so
        # that high-spamprob words have greatest effect), and the ham
        # measure uses p directly (so that lo-spamprob words have greatest
        # effect).
        #
        # For optimization, sum-of-logs == log-of-product, and f.p.
        # multiplication is a lot cheaper than calling ln().  It's easy
        # to underflow to 0.0, though, so we simulate unbounded dynamic
        # range via frexp.  The real product H = this H * 2**Hexp, and
        # likewise the real product S = this S * 2**Sexp.
        H = S = 1.0
        Hexp = Sexp = 0

        clues = self._getclues(wordstream)
        for prob, word, record in clues:
            S *= 1.0 - prob
            H *= prob
            if S < 1e-200:  # prevent underflow
                S, e = frexp(S)
                Sexp += e
            if H < 1e-200:  # prevent underflow
                H, e = frexp(H)
                Hexp += e

        # Compute the natural log of the product = sum of the logs:
        # ln(x * 2**i) = ln(x) + i * ln(2).
        S = ln(S) + Sexp * LN2
        H = ln(H) + Hexp * LN2

        n = len(clues)
        if n:
            S = 1.0 - chi2Q(-2.0 * S, 2*n)
            H = 1.0 - chi2Q(-2.0 * H, 2*n)

            # How to combine these into a single spam score?  We originally
            # used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H).
            # A systematic problem is that we could end up being
            # near-certain a thing was (for example) spam, even if S was
            # small, provided that H was much smaller.
            # Rob Hooft stared at these problems and invented the measure
            # we use now, the simpler S-H, scaled into [0., 1.].
            prob = (S-H + 1.0) / 2.0
        else:
            prob = 0.5

        if evidence:
            clues = [(w, p) for p, w, _r in clues]
            clues.sort(lambda a, b: cmp(a[1], b[1]))
            clues.insert(0, ('*S*', S))
            clues.insert(0, ('*H*', H))
            return prob, clues
        else:
            return prob

    def slurping_spamprob(self, wordstream, evidence=False):
        """Do the standard chi-squared spamprob, but if the evidence
        leaves the score in the unsure range, and we have fewer tokens
        than max_discriminators, also generate tokens from the text
        obtained by following http URLs in the message."""
        h_cut = options["Categorization", "ham_cutoff"]
        s_cut = options["Categorization", "spam_cutoff"]

        # Get the raw score.
        prob, clues = self.chi2_spamprob(wordstream, True)

        # If necessary, enhance it with the tokens from whatever is
        # at the URL's destination.
        if len(clues) < options["Classifier", "max_discriminators"] and \
           prob > h_cut and prob < s_cut and slurp_wordstream:
            slurp_tokens = list(self._generate_slurp())
            slurp_tokens.extend([w for (w, _p) in clues])
            sprob, sclues = self.chi2_spamprob(slurp_tokens, True)
            if sprob < h_cut or sprob > s_cut:
                prob = sprob
                clues = sclues
        if evidence:
            return prob, clues
        return prob

    if options["Classifier", "use_chi_squared_combining"]:
        if options["URLRetriever", "x-slurp_urls"]:
            spamprob = slurping_spamprob
        else:
            spamprob = chi2_spamprob

    def learn(self, wordstream, is_spam):
        """Teach the classifier by example.

        wordstream is a word stream representing a message.  If is_spam is
        True, you're telling the classifier this message is definitely spam,
        else that it's definitely not spam.
        """
        if options["Classifier", "use_bigrams"]:
            wordstream = self._enhance_wordstream(wordstream)
        if options["URLRetriever", "x-slurp_urls"]:
            wordstream = self._add_slurped(wordstream)
        self._add_msg(wordstream, is_spam)

    def unlearn(self, wordstream, is_spam):
        """In case of pilot error, call unlearn ASAP after screwing up.

        Pass the same arguments you passed to learn().
        """
        if options["Classifier", "use_bigrams"]:
            wordstream = self._enhance_wordstream(wordstream)
        if options["URLRetriever", "x-slurp_urls"]:
            wordstream = self._add_slurped(wordstream)
        self._remove_msg(wordstream, is_spam)

    def probability(self, record):
        """Compute, store, and return prob(msg is spam | msg contains word).

        This is the Graham calculation, but stripped of biases, and
        stripped of clamping into 0.01 thru 0.99.  The Bayesian
        adjustment following keeps them in a sane range, and one that
        naturally grows the more evidence there is to back up a
        probability.
        """
        spamcount = record.spamcount
        hamcount = record.hamcount

        # Try the cache first.
        try:
            return self.probcache[spamcount][hamcount]
        except KeyError:
            pass

        nham = float(self.nham or 1)
        nspam = float(self.nspam or 1)

        assert hamcount <= nham, "Token seen in more ham than ham trained."
        hamratio = hamcount / nham

        assert spamcount <= nspam, "Token seen in more spam than spam trained."
        spamratio = spamcount / nspam

        prob = spamratio / (hamratio + spamratio)

        S = options["Classifier", "unknown_word_strength"]
        StimesX = S * options["Classifier", "unknown_word_prob"]

        # Now do Robinson's Bayesian adjustment.
        #
        #         s*x + n*p(w)
        # f(w) = --------------
        #           s + n
        #
        # I find this easier to reason about like so (equivalent when
        # s != 0):
        #
        #        x - p
        #  p +  -------
        #       1 + n/s
        #
        # IOW, it moves p a fraction of the distance from p to x, and
        # less so the larger n is, or the smaller s is.
        n = hamcount + spamcount
        prob = (StimesX + n * prob) / (S + n)

        # Update the cache.
        try:
            self.probcache[spamcount][hamcount] = prob
        except KeyError:
            self.probcache[spamcount] = {hamcount: prob}

        return prob

    # NOTE:  Graham's scheme had a strange asymmetry:  when a word appeared
    # n>1 times in a single message, training added n to the word's hamcount
    # or spamcount, but predicting scored words only once.  Tests showed
    # that adding only 1 in training, or scoring more than once when
    # predicting, hurt under the Graham scheme.
    #
    # This isn't so under Robinson's scheme, though:  results improve if
    # training also counts a word only once.  The mean ham score decreases
    # significantly and consistently, ham score variance decreases likewise,
    # mean spam score decreases (but less than mean ham score, so the spread
    # increases), and spam score variance increases.
    #
    # I (Tim) speculate that adding n times under the Graham scheme helped
    # because it acted against the various ham biases, giving frequently
    # repeated spam words (like "Viagra") a quick ramp-up in spamprob; else,
    # adding only once in training, a word like that was simply ignored until
    # it appeared in 5 distinct training spams.  Without the ham-favoring
    # biases, though, and never ignoring words, counting n times introduces
    # a subtle and unhelpful bias.
    #
    # There does appear to be some useful info in how many times a word
    # appears in a msg, but distorting spamprob doesn't appear a correct way
    # to exploit it.
    def _add_msg(self, wordstream, is_spam):
        self.probcache = {}   # nuke the prob cache
        if is_spam:
            self.nspam += 1
        else:
            self.nham += 1

        for word in set(wordstream):
            record = self._wordinfoget(word)
            if record is None:
                record = self.WordInfoClass()
            if is_spam:
                record.spamcount += 1
            else:
                record.hamcount += 1
            self._wordinfoset(word, record)

        self._post_training()

    def _remove_msg(self, wordstream, is_spam):
        self.probcache = {}   # nuke the prob cache
        if is_spam:
            if self.nspam <= 0:
                raise ValueError("spam count would go negative!")
            self.nspam -= 1
        else:
            if self.nham <= 0:
                raise ValueError("non-spam count would go negative!")
            self.nham -= 1

        for word in set(wordstream):
            record = self._wordinfoget(word)
            if record is not None:
                if is_spam:
                    if record.spamcount > 0:
                        record.spamcount -= 1
                else:
                    if record.hamcount > 0:
                        record.hamcount -= 1
                if record.hamcount == 0 == record.spamcount:
                    self._wordinfodel(word)
                else:
                    self._wordinfoset(word, record)

        self._post_training()

    def _post_training(self):
        """This is called after training on a wordstream.  Subclasses might
        want to ensure that their databases are in a consistent state at
        this point.  Introduced to fix bug #797890."""
        pass

    # Return list of (prob, word, record) triples, sorted by increasing
    # prob.  "word" is a token from wordstream; "prob" is its spamprob (a
    # float in 0.0 through 1.0); and "record" is word's associated
    # WordInfo record if word is in the training database, or None if it's
    # not.  No more than max_discriminators items are returned, and have
    # the strongest (farthest from 0.5) spamprobs of all tokens in
    # wordstream.  Tokens with spamprobs less than minimum_prob_strength
    # away from 0.5 aren't returned.
    def _getclues(self, wordstream):
        mindist = options["Classifier", "minimum_prob_strength"]

        if options["Classifier", "use_bigrams"]:
            # This scheme mixes single tokens with pairs of adjacent tokens.
            # wordstream is "tiled" into non-overlapping unigrams and
            # bigrams.  Non-overlap is important to prevent a single
            # original token from contributing to more than one spamprob
            # returned (systematic correlation probably isn't a good thing).

            # First fill list raw with
            #     (distance, prob, word, record), indices
            # pairs, one for each unigram and bigram in wordstream.
            # indices is a tuple containing the indices (0-based relative to
            # the start of wordstream) of the tokens that went into word.
            # indices is a 1-tuple for an original token, and a 2-tuple for
            # a synthesized bigram token.  The indices are needed to detect
            # overlap later.
            raw = []
            push = raw.append
            pair = None
            # Keep track of which tokens we've already seen.
            # Don't use a set here!  This is an innermost loop, so speed is
            # important here (direct dict fiddling is much quicker than
            # invoking Python-level set methods; in Python 2.4 that will
            # change).
            seen = {pair: 1}  # so the bigram token is skipped on 1st loop trip
            for i, token in enumerate(wordstream):
                if i:  # not the 1st loop trip, so there is a preceding token
                    # This string interpolation must match the one in
                    # _enhance_wordstream().
                    pair = "bi:%s %s" % (last_token, token)
                last_token = token
                for clue, indices in (token, (i,)), (pair, (i-1, i)):
                    if clue not in seen:    # as always, skip duplicates
                        seen[clue] = 1
                        tup = self._worddistanceget(clue)
                        if tup[0] >= mindist:
                            push((tup, indices))

            # Sort raw, strongest to weakest spamprob.
            raw.sort()
            raw.reverse()

            # Fill clues with the strongest non-overlapping clues.
            clues = []
            push = clues.append
            # Keep track of which indices have already contributed to a
            # clue in clues.
            seen = {}
            for tup, indices in raw:
                overlap = [i for i in indices if i in seen]
                if not overlap:  # no overlap with anything already in clues
                    for i in indices:
                        seen[i] = 1
                    push(tup)

            # Leave sorted from smallest to largest spamprob.
            clues.reverse()

        else:
            # The all-unigram scheme just scores the tokens as-is.  A set()
            # is used to weed out duplicates at high speed.
            clues = []
            push = clues.append
            for word in set(wordstream):
                tup = self._worddistanceget(word)
                if tup[0] >= mindist:
                    push(tup)
            clues.sort()

        if len(clues) > options["Classifier", "max_discriminators"]:
            del clues[0 : -options["Classifier", "max_discriminators"]]

        # Return (prob, word, record).
        return [t[1:] for t in clues]

    def _worddistanceget(self, word):
        record = self._wordinfoget(word)
        if record is None:
            prob = options["Classifier", "unknown_word_prob"]
        else:
            prob = self.probability(record)
        distance = abs(prob - 0.5)
        return distance, prob, word, record

    def _wordinfoget(self, word):
        return self.wordinfo.get(word)

    def _wordinfoset(self, word, record):
        self.wordinfo[word] = record

    def _wordinfodel(self, word):
        del self.wordinfo[word]

    def _enhance_wordstream(self, wordstream):
        """Add bigrams to the wordstream.

        For example, a b c -> a b "a b" c "b c"

        Note that these are *token* bigrams, and not *word* bigrams - i.e.
        'synthetic' tokens get bigram'ed, too.

        The bigram token is simply "bi:unigram1 unigram2" - a space should
        be sufficient as a separator, since spaces aren't in any other
        tokens, apart from 'synthetic' ones.  The "bi:" prefix is added to
        avoid conflict with tokens we generate (like "subject: word", which
        could be "word" in a subject, or a bigram of "subject:" and "word").

        If the "Classifier":"use_bigrams" option is removed, this function
        can be removed, too.
        """
        last = None
        for token in wordstream:
            yield token
            if last:
                # This string interpolation must match the one in
                # _getclues().
                yield "bi:%s %s" % (last, token)
            last = token

    def _generate_slurp(self):
        # We don't want to do this recursively and check URLs
        # on webpages, so we have this little cheat.
        if not hasattr(self, "setup_done"):
            self.setup()
            self.setup_done = True
        if not hasattr(self, "do_slurp") or self.do_slurp:
            if slurp_wordstream:
                self.do_slurp = False
                tokens = self.slurp(*slurp_wordstream)
                self.do_slurp = True
                self._save_caches()
                return tokens
        return []

    def setup(self):
        # Can't import this at the top because it's circular.
        # XXX Someone smarter than me, please figure out the right
        # XXX way to do this.
        from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory

        username = options["globals", "proxy_username"]
        password = options["globals", "proxy_password"]
        server = options["globals", "proxy_server"]
        if server.find(":") != -1:
            server, port = server.split(':', 1)
            port = int(port)
        else:
            port = 8080
        if server:
            # Build a new opener that uses a proxy requiring authorization.
            proxy_support = urllib2.ProxyHandler(
                {"http": "http://%s:%s@%s:%d" %
                         (username, password, server, port)})
            opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        else:
            # Build a new opener without any proxy information.
            opener = urllib2.build_opener(urllib2.HTTPHandler)

        # Install it.
        urllib2.install_opener(opener)

        # Setup the cache for retrieved urls.
        age = options["URLRetriever", "x-cache_expiry_days"] * 24 * 60 * 60
        dir = options["URLRetriever", "x-cache_directory"]
        if not os.path.exists(dir):
            # Create the directory.
            if options["globals", "verbose"]:
                print >> sys.stderr, "Creating URL cache directory"
            os.makedirs(dir)

        self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(),
                                          dir, cacheSize=20)
        # Kill any old information in the cache.
        self.urlCorpus.removeExpiredMessages()

        # Setup caches for unretrievable urls.
        self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
        self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
        if os.path.exists(self.bad_url_cache_name):
            try:
                self.bad_urls = pickle_read(self.bad_url_cache_name)
            except (IOError, ValueError):
                # Something went wrong loading it (bad pickle,
                # probably).  Start afresh.
                if options["globals", "verbose"]:
                    print >> sys.stderr, "Bad URL pickle, using new."
                self.bad_urls = {"url:non_resolving": (),
                                 "url:non_html": (),
                                 "url:unknown_error": ()}
        else:
            if options["globals", "verbose"]:
                print "URL caches don't exist: creating"
            self.bad_urls = {"url:non_resolving": (),
                             "url:non_html": (),
                             "url:unknown_error": ()}
        if os.path.exists(self.http_error_cache_name):
            try:
                self.http_error_urls = pickle_read(self.http_error_cache_name)
            except (IOError, ValueError):
                # Something went wrong loading it (bad pickle,
                # probably).  Start afresh.
                if options["globals", "verbose"]:
                    print >> sys.stderr, "Bad HTTP error pickle, using new."
                self.http_error_urls = {}
        else:
            self.http_error_urls = {}
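
# The comment block above chi2_spamprob() compresses a lot of math: for
# uniformly-distributed p_i, -2*sum(ln(p_i)) is chi-squared with 2*n
# degrees of freedom.  The sketch below shows the combining step in
# isolation.  _example_chi2Q is an illustrative stand-in for
# spambayes.chi2.chi2Q, written with the standard series expansion of the
# chi-squared survival function at even degrees of freedom; treat the
# claim that it matches the library routine exactly as an assumption.

import math

def _example_chi2Q(x2, v):
    """Return P(chisq >= x2) for even degrees of freedom v."""
    assert v % 2 == 0, "series below is only valid for even v"
    m = x2 / 2.0
    term = prob = math.exp(-m)
    for i in range(1, v // 2):
        term *= m / i
        prob += term
    return min(prob, 1.0)

def _example_combine(probs):
    """Combine per-token spamprobs (each strictly inside (0, 1)) the way
    chi2_spamprob() does, minus the frexp underflow machinery."""
    n = len(probs)
    if not n:
        return 0.5
    S = -2.0 * sum([math.log(1.0 - p) for p in probs])  # spam evidence
    H = -2.0 * sum([math.log(p) for p in probs])        # ham evidence
    S = 1.0 - _example_chi2Q(S, 2 * n)
    H = 1.0 - _example_chi2Q(H, 2 * n)
    return (S - H + 1.0) / 2.0   # Rob Hooft's S-H, scaled into [0, 1]

# Two strong but conflicting clues land exactly on the fence, which is the
# immunity to "cancellation disease" described above:
#     _example_combine([0.99, 0.01]) == 0.5
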
class CoreState:
    """This keeps the global state of the module - the command-line
    options, statistics like how many mails have been classified, the
    handle of the log file, the Classifier and FileCorpus objects, and
    so on."""

    def __init__(self):
        """Initialises the State object that holds the state of the app.
        The default settings are read from Options.py and
        bayescustomize.ini and are then overridden by the command-line
        processing code in the __main__ code below."""
        self.log_file = None
        self.bayes = None
        self.mutex = None
        self.prepared = False
        self.can_stop = True
        self.plugin = None

        # Unique names for cached messages - see `getNewMessageName()` below.
        self.last_base_message_name = ''
        self.uniquifier = 2

        # Set up the statistics.
        self.numSpams = 0
        self.numHams = 0
        self.numUnsure = 0

        self.servers = ""

        # Load up the other settings from Option.py / bayescustomize.ini
        self.ui_port = options["html_ui", "port"]
        self.launch_ui = options["html_ui", "launch_browser"]
        self.gzip_cache = options["Storage", "cache_use_gzip"]
        self.run_test_server = False
        self.is_test = False

        self.spamCorpus = self.hamCorpus = self.unknownCorpus = None
        self.spam_trainer = self.ham_trainer = None

        self.init()

    def init(self):
        assert not self.prepared, "init after prepare, but before close"
        ## no i18n yet...
        ## # Load the environment for translation.
        ## self.lang_manager = i18n.LanguageManager()
        ## # Set the system user default language.
        ## self.lang_manager.set_language(
        ##     self.lang_manager.locale_default_lang())
        ## # Set interface to use the user language in the configuration file.
        ## for language in reversed(options["globals", "language"]):
        ##     # We leave the default in there as the last option, to fall
        ##     # back on if necessary.
        ##     self.lang_manager.add_language(language)
        ## if options["globals", "verbose"]:
        ##     print "Asked to add languages: " + \
        ##           ", ".join(options["globals", "language"])
        ##     print "Set language to " + \
        ##           str(self.lang_manager.current_langs_codes)
        self.lang_manager = None

        # Open the log file.
        if options["globals", "verbose"]:
            self.log_file = open('_core_server.log', 'wb', 0)

        # Remember reported errors.
        self.reported_errors = {}

    def close(self):
        assert self.prepared, "closed without being prepared!"
        if self.bayes is not None:
            # Only store a non-empty db.
            if self.bayes.nham != 0 and self.bayes.nspam != 0:
                self.bayes.store()
            self.bayes.close()
            self.bayes = None
        spambayes.message.Message().message_info_db = None

        self.spamCorpus = self.hamCorpus = self.unknownCorpus = None
        self.spam_trainer = self.ham_trainer = None

        self.prepared = False
        self.close_platform_mutex()

    def prepare(self, can_stop=True):
        """Do whatever needs to be done to prepare for running.  If
        can_stop is False, then we may not let the user shut down the
        proxy - for example, running as a Windows service this should
        be the case."""
        self.init()
        # If we can, prevent multiple servers from running at the same time.
        assert self.mutex is None, "Should not already have the mutex"
        self.open_platform_mutex()

        self.can_stop = can_stop

        # Do whatever we've been asked to do...
        self.create_workers()
        self.prepared = True

    def build_status_strings(self):
        """Build the status message(s) to display on the home page of
        the web interface."""
        nspam = self.bayes.nspam
        nham = self.bayes.nham
        if nspam > 10 and nham > 10:
            db_ratio = nham / float(nspam)
            if db_ratio > 5.0:
                self.warning = _("Warning: you have much more ham than "
                                 "spam - SpamBayes works best with "
                                 "approximately even numbers of ham and "
                                 "spam.")
            elif db_ratio < (1 / 5.0):
                self.warning = _("Warning: you have much more spam than "
                                 "ham - SpamBayes works best with "
                                 "approximately even numbers of ham and "
                                 "spam.")
            else:
                self.warning = ""
        elif nspam > 0 or nham > 0:
            self.warning = _("Database only has %d good and %d spam - "
                             "you should consider performing additional "
                             "training.") % (nham, nspam)
        else:
            self.warning = _("Database has no training information. "
                             "SpamBayes will classify all messages as "
                             "'unsure', ready for you to train.")

        # Add an additional warning message if the user's thresholds are
        # truly odd.
        spam_cut = options["Categorization", "spam_cutoff"]
        ham_cut = options["Categorization", "ham_cutoff"]
        if spam_cut < 0.5:
            self.warning += _("<br/>Warning: we do not recommend "
                              "setting the spam threshold less than 0.5.")
        if ham_cut > 0.5:
            self.warning += _("<br/>Warning: we do not recommend "
                              "setting the ham threshold greater than 0.5.")
        if ham_cut > spam_cut:
            self.warning += _("<br/>Warning: your ham threshold is "
                              "<b>higher</b> than your spam threshold. "
                              "Results are unpredictable.")

    def create_workers(self):
        """Using the options that were initialised in __init__ and then
        possibly overridden by the driver code, create the Bayes object,
        the Corpuses, the Trainers and so on."""
        if self.is_test:
            self.use_db = "pickle"
            self.db_name = '_core_server.pickle'   # This is never saved.
        if not hasattr(self, "db_name"):
            self.db_name, self.use_db = storage.database_type([])
        self.bayes = storage.open_storage(self.db_name, self.use_db)

        # Load stats manager.
        self.stats = Stats.Stats(options,
                                 spambayes.message.Message().message_info_db)

        self.build_status_strings()

        # Don't set up the caches and training objects when running the
        # self-test, so as not to clutter the filesystem.
        if not self.is_test:
            # Create/open the Corpuses.  Use small cache sizes to avoid
            # hogging lots of memory.
            sc = get_pathname_option("Storage", "core_spam_cache")
            hc = get_pathname_option("Storage", "core_ham_cache")
            uc = get_pathname_option("Storage", "core_unknown_cache")
            for d in [sc, hc, uc]:
                storage.ensureDir(d)
            if self.gzip_cache:
                factory = GzipFileMessageFactory()
            else:
                factory = FileMessageFactory()
            age = options["Storage", "cache_expiry_days"] * 24 * 60 * 60
            self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                               '[0123456789\-]*',
                                               cacheSize=20)
            self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                              '[0123456789\-]*',
                                              cacheSize=20)
            self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                                  '[0123456789\-]*',
                                                  cacheSize=20)

            # Given that (hopefully) users will get to the stage where they
            # do not need to do any more regular training to be satisfied
            # with spambayes' performance, we expire old messages from not
            # only the trained corpora, but the unknown as well.
            self.spamCorpus.removeExpiredMessages()
            self.hamCorpus.removeExpiredMessages()
            self.unknownCorpus.removeExpiredMessages()

            # Create the Trainers.
            self.spam_trainer = storage.SpamTrainer(self.bayes)
            self.ham_trainer = storage.HamTrainer(self.bayes)
            self.spamCorpus.addObserver(self.spam_trainer)
            self.hamCorpus.addObserver(self.ham_trainer)

    def getNewMessageName(self):
        """The message name is the time it arrived, with a uniquifier
        appended if two arrive within one clock tick of each other."""
        message_name = "%10.10d" % long(time.time())
        if message_name == self.last_base_message_name:
            message_name = "%s-%d" % (message_name, self.uniquifier)
            self.uniquifier += 1
        else:
            self.last_base_message_name = message_name
            self.uniquifier = 2
        return message_name

    def record_classification(self, cls, score):
        """Record the classification in the session statistics.

        cls should match one of the options["Headers", "header_*_string"]
        values.  score is the score the message received."""
        if cls == options["Headers", "header_ham_string"]:
            self.numHams += 1
        elif cls == options["Headers", "header_spam_string"]:
            self.numSpams += 1
        else:
            self.numUnsure += 1
        self.stats.RecordClassification(score)

    def buildStatusStrings(self):
        return ""

    def recreate_state(self):
        if self.prepared:
            # Close the state (which saves if necessary)
            self.close()
        # And get a new one going.
        state = CoreState()
        state.prepare()
        return state

    def open_platform_mutex(self, mutex_name="SpamBayesServer"):
        """Implementations of a mutex or other resource which can prevent
        multiple servers starting at once.  Platform specific as no
        reasonable cross-platform solution exists (however, an old trick
        is to use a directory for a mutex, as a create/test atomic API
        generally exists).  Will set self.mutex or may throw
        AlreadyRunningException."""
        if sys.platform.startswith("win"):
            try:
                import win32event, win32api, winerror
                # Ideally, the mutex name could include either the username,
                # or the munged path to the INI file - this would mean we
                # would allow multiple starts so long as they weren't for
                # the same user.  However, as of now, the service version
                # is likely to start as a different user, so a single mutex
                # is best for now.
                # XXX - even if we do get clever with another mutex name, we
                # should consider still creating a non-exclusive
                # "SpamBayesServer" mutex, if for no better reason than so
                # an installer can check if we are running
                try:
                    hmutex = win32event.CreateMutex(None, True, mutex_name)
                except win32event.error, details:
                    # If another user has the mutex open, we get an "access
                    # denied" error - this is still telling us what we need
                    # to know.
                    if details[0] != winerror.ERROR_ACCESS_DENIED:
                        raise
                    raise AlreadyRunningException
                # Mutex opened - now check if we actually created it.
                if win32api.GetLastError() == winerror.ERROR_ALREADY_EXISTS:
                    win32api.CloseHandle(hmutex)
                    raise AlreadyRunningException
                self.mutex = hmutex
                return
            except ImportError:
                # No win32all - no worries, just start.
                pass
        self.mutex = None

    def close_platform_mutex(self):
        """Toss out the current mutex."""
        if sys.platform.startswith("win"):
            if self.mutex is not None:
                self.mutex.Close()
        self.mutex = None
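
# The open_platform_mutex() docstring above mentions the portable fallback
# trick: a directory can stand in for a mutex, because directory creation
# is an atomic create-and-test operation on every mainstream platform.
# A minimal sketch of that idea - not part of the original module, and the
# lock path below is an assumption chosen for illustration:

def _example_dir_mutex(lock_dir="/tmp/spambayes_server.lock"):
    """Return True if we acquired the directory lock, False otherwise."""
    import errno
    import os
    try:
        os.mkdir(lock_dir)      # atomic: fails if the directory exists
    except OSError, e:
        if e.errno == errno.EEXIST:
            return False        # another server already holds the lock
        raise
    return True                 # caller should os.rmdir(lock_dir) on exit
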
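
# getNewMessageName() above derives cache file names from arrival time,
# adding "-2", "-3", ... when several messages land within one clock tick.
# A standalone sketch of the same naming rule (a hypothetical helper, not
# part of the original module):

def _example_message_names(arrival_times):
    """Yield a unique zero-padded name for each arrival timestamp."""
    last, uniquifier = '', 2
    for t in arrival_times:
        name = "%10.10d" % long(t)
        if name == last:
            yield "%s-%d" % (name, uniquifier)
            uniquifier += 1
        else:
            last, uniquifier = name, 2
        if '-' not in name or name != last:
            pass  # (no-op; names are yielded above and below)
        yield name
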
class Classifier:
    # Allow a subclass to use a different class for WordInfo.
    WordInfoClass = WordInfo

    def __init__(self):
        self.wordinfo = {}
        self.probcache = {}
        self.nspam = self.nham = 0

    def __getstate__(self):
        return (PICKLE_VERSION, self.wordinfo, self.nspam, self.nham)

    def __setstate__(self, t):
        if t[0] != PICKLE_VERSION:
            raise ValueError("Can't unpickle -- version %s unknown" % t[0])
        (self.wordinfo, self.nspam, self.nham) = t[1:]
        self.probcache = {}

    def chi2_spamprob(self, wordstream, evidence=False):
        """Return best-guess probability that wordstream is spam.

        wordstream is an iterable object producing words.
        The return value is a float in [0.0, 1.0].

        If optional arg evidence is True, the return value is a pair
            probability, evidence
        where evidence is a list of (word, probability) pairs.
        """
        from math import frexp, log as ln

        H = S = 1.0
        Hexp = Sexp = 0

        clues = self._getclues(wordstream)
        for prob, word, record in clues:
            S *= 1.0 - prob
            H *= prob
            if S < 1e-200:  # prevent underflow
                S, e = frexp(S)
                Sexp += e
            if H < 1e-200:  # prevent underflow
                H, e = frexp(H)
                Hexp += e

        S = ln(S) + Sexp * LN2
        H = ln(H) + Hexp * LN2

        n = len(clues)
        if n:
            S = 1.0 - chi2Q(-2.0 * S, 2*n)
            H = 1.0 - chi2Q(-2.0 * H, 2*n)
            prob = (S-H + 1.0) / 2.0
        else:
            prob = 0.5

        if evidence:
            clues = [(w, p) for p, w, r in clues]
            clues.sort(lambda a, b: cmp(a[1], b[1]))
            clues.insert(0, ('*S*', S))
            clues.insert(0, ('*H*', H))
            return prob, clues
        else:
            return prob

    def slurping_spamprob(self, wordstream, evidence=False):
        """Do the standard chi-squared spamprob, but if the evidence
        leaves the score in the unsure range, and we have fewer tokens
        than max_discriminators, also generate tokens from the text
        obtained by following http URLs in the message."""
        h_cut = options["Categorization", "ham_cutoff"]
        s_cut = options["Categorization", "spam_cutoff"]

        prob, clues = self.chi2_spamprob(wordstream, True)

        if len(clues) < options["Classifier", "max_discriminators"] and \
           prob > h_cut and prob < s_cut and slurp_wordstream:
            slurp_tokens = list(self._generate_slurp())
            slurp_tokens.extend([w for (w, p) in clues])
            sprob, sclues = self.chi2_spamprob(slurp_tokens, True)
            if sprob < h_cut or sprob > s_cut:
                prob = sprob
                clues = sclues
        if evidence:
            return prob, clues
        return prob

    if options["Classifier", "use_chi_squared_combining"]:
        if options["URLRetriever", "x-slurp_urls"]:
            spamprob = slurping_spamprob
        else:
            spamprob = chi2_spamprob

    def learn(self, wordstream, is_spam):
        """Teach the classifier by example.

        wordstream is a word stream representing a message.  If is_spam is
        True, you're telling the classifier this message is definitely spam,
        else that it's definitely not spam.
        """
        if options["Classifier", "use_bigrams"]:
            wordstream = self._enhance_wordstream(wordstream)
        if options["URLRetriever", "x-slurp_urls"]:
            wordstream = self._add_slurped(wordstream)
        self._add_msg(wordstream, is_spam)

    def unlearn(self, wordstream, is_spam):
        """In case of pilot error, call unlearn ASAP after screwing up.

        Pass the same arguments you passed to learn().
        """
        if options["Classifier", "use_bigrams"]:
            wordstream = self._enhance_wordstream(wordstream)
        if options["URLRetriever", "x-slurp_urls"]:
            wordstream = self._add_slurped(wordstream)
        self._remove_msg(wordstream, is_spam)

    def probability(self, record):
        """Compute, store, and return prob(msg is spam | msg contains word).

        This is the Graham calculation, but stripped of biases, and
        stripped of clamping into 0.01 thru 0.99.  The Bayesian
        adjustment following keeps them in a sane range, and one that
        naturally grows the more evidence there is to back up a
        probability.
        """
        spamcount = record.spamcount
        hamcount = record.hamcount

        # Try the cache first.
        try:
            return self.probcache[spamcount][hamcount]
        except KeyError:
            pass

        nham = float(self.nham or 1)
        nspam = float(self.nspam or 1)

        assert hamcount <= nham, "Token seen in more ham than ham trained."
        hamratio = hamcount / nham

        assert spamcount <= nspam, "Token seen in more spam than spam trained."
        spamratio = spamcount / nspam

        prob = spamratio / (hamratio + spamratio)

        S = options["Classifier", "unknown_word_strength"]
        StimesX = S * options["Classifier", "unknown_word_prob"]

        n = hamcount + spamcount
        prob = (StimesX + n * prob) / (S + n)

        # Update the cache.
        try:
            self.probcache[spamcount][hamcount] = prob
        except KeyError:
            self.probcache[spamcount] = {hamcount: prob}

        return prob

    def _add_msg(self, wordstream, is_spam):
        self.probcache = {}   # nuke the prob cache
        if is_spam:
            self.nspam += 1
        else:
            self.nham += 1

        # Set comes from the pre-2.4 sets module.
        for word in Set(wordstream):
            record = self._wordinfoget(word)
            if record is None:
                record = self.WordInfoClass()
            if is_spam:
                record.spamcount += 1
            else:
                record.hamcount += 1
            self._wordinfoset(word, record)

        self._post_training()

    def _remove_msg(self, wordstream, is_spam):
        self.probcache = {}   # nuke the prob cache
        if is_spam:
            if self.nspam <= 0:
                raise ValueError("spam count would go negative!")
            self.nspam -= 1
        else:
            if self.nham <= 0:
                raise ValueError("non-spam count would go negative!")
            self.nham -= 1

        for word in Set(wordstream):
            record = self._wordinfoget(word)
            if record is not None:
                if is_spam:
                    if record.spamcount > 0:
                        record.spamcount -= 1
                else:
                    if record.hamcount > 0:
                        record.hamcount -= 1
                if record.hamcount == 0 == record.spamcount:
                    self._wordinfodel(word)
                else:
                    self._wordinfoset(word, record)

        self._post_training()

    def _post_training(self):
        """This is called after training on a wordstream.  Subclasses might
        want to ensure that their databases are in a consistent state at
        this point.  Introduced to fix bug #797890."""
        pass

    def _getclues(self, wordstream):
        mindist = options["Classifier", "minimum_prob_strength"]

        if options["Classifier", "use_bigrams"]:
            raw = []
            push = raw.append
            pair = None
            seen = {pair: 1}  # so the bigram token is skipped on 1st loop trip
            for i, token in enumerate(wordstream):
                if i:  # not the 1st loop trip, so there is a preceding token
                    pair = "bi:%s %s" % (last_token, token)
                last_token = token
                for clue, indices in (token, (i,)), (pair, (i-1, i)):
                    if clue not in seen:    # as always, skip duplicates
                        seen[clue] = 1
                        tup = self._worddistanceget(clue)
                        if tup[0] >= mindist:
                            push((tup, indices))

            raw.sort()
            raw.reverse()

            clues = []
            push = clues.append
            seen = {}
            for tup, indices in raw:
                overlap = [i for i in indices if i in seen]
                if not overlap:  # no overlap with anything already in clues
                    for i in indices:
                        seen[i] = 1
                    push(tup)

            clues.reverse()

        else:
            clues = []
            push = clues.append
            for word in Set(wordstream):
                tup = self._worddistanceget(word)
                if tup[0] >= mindist:
                    push(tup)
            clues.sort()

        if len(clues) > options["Classifier", "max_discriminators"]:
            del clues[0 : -options["Classifier", "max_discriminators"]]

        return [t[1:] for t in clues]

    def _worddistanceget(self, word):
        record = self._wordinfoget(word)
        if record is None:
            prob = options["Classifier", "unknown_word_prob"]
        else:
            prob = self.probability(record)
        distance = abs(prob - 0.5)
        return distance, prob, word, record

    def _wordinfoget(self, word):
        return self.wordinfo.get(word)

    def _wordinfoset(self, word, record):
        self.wordinfo[word] = record

    def _wordinfodel(self, word):
        del self.wordinfo[word]

    def _enhance_wordstream(self, wordstream):
        """Add bigrams to the wordstream.

        For example, a b c -> a b "a b" c "b c"

        Note that these are *token* bigrams, and not *word* bigrams - i.e.
        'synthetic' tokens get bigram'ed, too.

        The bigram token is simply "bi:unigram1 unigram2" - a space should
        be sufficient as a separator, since spaces aren't in any other
        tokens, apart from 'synthetic' ones.  The "bi:" prefix is added to
        avoid conflict with tokens we generate (like "subject: word", which
        could be "word" in a subject, or a bigram of "subject:" and "word").

        If the "Classifier":"use_bigrams" option is removed, this function
        can be removed, too.
        """
        last = None
        for token in wordstream:
            yield token
            if last:
                yield "bi:%s %s" % (last, token)
            last = token

    def _generate_slurp(self):
        if not hasattr(self, "setup_done"):
            self.setup()
            self.setup_done = True
        if not hasattr(self, "do_slurp") or self.do_slurp:
            if slurp_wordstream:
                self.do_slurp = False
                tokens = self.slurp(*slurp_wordstream)
                self.do_slurp = True
                self._save_caches()
                return tokens
        return []

    def setup(self):
        from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory

        username = options["globals", "proxy_username"]
        password = options["globals", "proxy_password"]
        server = options["globals", "proxy_server"]
        if server.find(":") != -1:
            server, port = server.split(':', 1)
            port = int(port)
        else:
            port = 8080
        if server:
            proxy_support = urllib2.ProxyHandler(
                {"http": "http://%s:%s@%s:%d" %
                         (username, password, server, port)})
            opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        else:
            opener = urllib2.build_opener(urllib2.HTTPHandler)

        urllib2.install_opener(opener)

        age = options["URLRetriever", "x-cache_expiry_days"] * 24 * 60 * 60
        dir = options["URLRetriever", "x-cache_directory"]
        if not os.path.exists(dir):
            if options["globals", "verbose"]:
                print >> sys.stderr, "Creating URL cache directory"
            os.makedirs(dir)

        self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(),
                                          dir, cacheSize=20)
        self.urlCorpus.removeExpiredMessages()

        self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
        self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
        if os.path.exists(self.bad_url_cache_name):
            b_file = file(self.bad_url_cache_name, "r")
            try:
                self.bad_urls = pickle.load(b_file)
            except (IOError, ValueError):
                if options["globals", "verbose"]:
                    print >> sys.stderr, "Bad URL pickle, using new."
                self.bad_urls = {"url:non_resolving": (),
                                 "url:non_html": (),
                                 "url:unknown_error": ()}
            b_file.close()
        else:
            self.bad_urls = {"url:non_resolving": (),
                             "url:non_html": (),
                             "url:unknown_error": ()}
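
# probability() above applies Robinson's adjustment
#     f(w) = (s*x + n*p(w)) / (s + n)
# to pull sparse evidence toward the unknown-word prior.  A worked
# version, assuming the stock defaults x = 0.5 (unknown_word_prob) and
# s = 0.45 (unknown_word_strength); both defaults are assumptions here:

def _example_robinson(spamcount, hamcount, nspam, nham, s=0.45, x=0.5):
    """Spamprob of a word seen at least once, given corpus totals."""
    hamratio = hamcount / float(nham or 1)
    spamratio = spamcount / float(nspam or 1)
    p = spamratio / (hamratio + spamratio)   # raw Graham-style estimate
    n = hamcount + spamcount
    return (s * x + n * p) / (s + n)

# A word seen once, only in spam, out of 100 spam and 100 ham scores about
# 0.84 rather than a reckless 1.0:
#     _example_robinson(1, 0, 100, 100) -> 0.8448...
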
class State:
    def __init__(self):
        """Initialises the State object that holds the state of the app.
        The default settings are read from Options.py and bayescustomize.ini
        and are then overridden by the command-line processing code in the
        __main__ code below."""
        self.logFile = None
        self.bayes = None
        self.platform_mutex = None
        self.prepared = False
        self.init()

        # Load up the other settings from Option.py / bayescustomize.ini
        self.uiPort = options["html_ui", "port"]
        self.launchUI = options["html_ui", "launch_browser"]
        self.gzipCache = options["Storage", "cache_use_gzip"]
        self.cacheExpiryDays = options["Storage", "cache_expiry_days"]
        self.runTestServer = False
        self.isTest = False

    def init(self):
        assert not self.prepared, "init after prepare, but before close"
        # Open the log file.
        if options["globals", "verbose"]:
            self.logFile = open('_pop3proxy.log', 'wb', 0)

        self.servers = []
        self.proxyPorts = []
        if options["pop3proxy", "remote_servers"]:
            for server in options["pop3proxy", "remote_servers"]:
                server = server.strip()
                if server.find(':') > -1:
                    server, port = server.split(':', 1)
                else:
                    port = '110'
                self.servers.append((server, int(port)))
        if options["pop3proxy", "listen_ports"]:
            splitPorts = options["pop3proxy", "listen_ports"]
            self.proxyPorts = map(_addressAndPort, splitPorts)
        if len(self.servers) != len(self.proxyPorts):
            print "pop3proxy_servers & pop3proxy_ports are different lengths!"
            sys.exit()

        # Remember reported errors.
        self.reported_errors = {}

        # Set up the statistics.
        self.totalSessions = 0
        self.activeSessions = 0
        self.numSpams = 0
        self.numHams = 0
        self.numUnsure = 0

        # Unique names for cached messages - see `getNewMessageName()` below.
        self.lastBaseMessageName = ''
        self.uniquifier = 2

    def close(self):
        assert self.prepared, "closed without being prepared!"
        self.servers = None
        if self.bayes is not None:
            # Only store a non-empty db.
            if self.bayes.nham != 0 and self.bayes.nspam != 0:
                self.bayes.store()
            self.bayes.close()
            self.bayes = None
        self.spamCorpus = self.hamCorpus = self.unknownCorpus = None
        self.spamTrainer = self.hamTrainer = None
        self.prepared = False
        close_platform_mutex(self.platform_mutex)
        self.platform_mutex = None

    def prepare(self):
        # If we can, prevent multiple servers from running at the same time.
        assert self.platform_mutex is None, "Should not already have the mutex"
        self.platform_mutex = open_platform_mutex()

        # Do whatever we've been asked to do...
        self.createWorkers()
        self.prepared = True

    def buildServerStrings(self):
        """After the server details have been set up, this creates string
        versions of the details, for display in the Status panel."""
        serverStrings = ["%s:%s" % (s, p) for s, p in self.servers]
        self.serversString = ', '.join(serverStrings)
        self.proxyPortsString = ', '.join(map(_addressPortStr, self.proxyPorts))

    def buildStatusStrings(self):
        """Build the status message(s) to display on the home page of
        the web interface."""
        nspam = self.bayes.nspam
        nham = self.bayes.nham
        if nspam > 10 and nham > 10:
            db_ratio = nham / float(nspam)
            big = small = None
            if db_ratio > 5.0:
                big = "ham"
                small = "spam"
            elif db_ratio < (1 / 5.0):
                big = "spam"
                small = "ham"
            if big is not None:
                self.warning = "Warning: you have much more %s than %s - " \
                               "SpamBayes works best with approximately " \
                               "even numbers of ham and spam." % (big, small)
            else:
                self.warning = ""
        elif nspam > 0 or nham > 0:
            self.warning = "Database only has %d good and %d spam - you " \
                           "should consider performing additional " \
                           "training." % (nham, nspam)
        else:
            self.warning = "Database has no training information.  " \
                           "SpamBayes will classify all messages as " \
                           "'unsure', ready for you to train."

    def createWorkers(self):
        """Using the options that were initialised in __init__ and then
        possibly overridden by the driver code, create the Bayes object,
        the Corpuses, the Trainers and so on."""
        print "Loading database...",
        if self.isTest:
            self.useDB = "pickle"
            self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
        if not hasattr(self, "DBName"):
            self.DBName, self.useDB = storage.database_type([])
        self.bayes = storage.open_storage(self.DBName, self.useDB)
        self.buildStatusStrings()

        # Don't set up the caches and training objects when running the
        # self-test, so as not to clutter the filesystem.
        if not self.isTest:
            def ensureDir(dirname):
                try:
                    os.mkdir(dirname)
                except OSError, e:
                    if e.errno != errno.EEXIST:
                        raise

            # Create/open the Corpuses.  Use small cache sizes to avoid
            # hogging lots of memory.
            sc = get_pathname_option("Storage", "spam_cache")
            hc = get_pathname_option("Storage", "ham_cache")
            uc = get_pathname_option("Storage", "unknown_cache")
            map(ensureDir, [sc, hc, uc])
            if self.gzipCache:
                factory = GzipFileMessageFactory()
            else:
                factory = FileMessageFactory()
            age = options["Storage", "cache_expiry_days"] * 24 * 60 * 60
            self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                               '[0123456789\-]*', cacheSize=20)
            self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                              '[0123456789\-]*', cacheSize=20)
            self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                                  '[0123456789\-]*', cacheSize=20)

            # Given that (hopefully) users will get to the stage
            # where they do not need to do any more regular training to
            # be satisfied with spambayes' performance, we expire old
            # messages from not only the trained corpora, but the unknown
            # as well.
            self.spamCorpus.removeExpiredMessages()
            self.hamCorpus.removeExpiredMessages()
            self.unknownCorpus.removeExpiredMessages()

            # Create the Trainers.
            self.spamTrainer = storage.SpamTrainer(self.bayes)
            self.hamTrainer = storage.HamTrainer(self.bayes)
            self.spamCorpus.addObserver(self.spamTrainer)
            self.hamCorpus.addObserver(self.hamTrainer)
class Classifier:
    # Defining __slots__ here made Jeremy's life needlessly difficult when
    # trying to hook this all up to ZODB as a persistent object.  There's
    # no space benefit worth getting from slots in this class; slots were
    # used solely to help catch errors earlier, when this code was changing
    # rapidly.
    #__slots__ = ('wordinfo',  # map word to WordInfo record
    #             'nspam',     # number of spam messages learn() has seen
    #             'nham',      # number of non-spam messages learn() has seen
    #            )

    # allow a subclass to use a different class for WordInfo
    WordInfoClass = WordInfo

    def __init__(self):
        self.wordinfo = {}
        self.probcache = {}
        self.nspam = self.nham = 0

    def __getstate__(self):
        return (PICKLE_VERSION, self.wordinfo, self.nspam, self.nham)

    def __setstate__(self, t):
        if t[0] != PICKLE_VERSION:
            raise ValueError("Can't unpickle -- version %s unknown" % t[0])
        (self.wordinfo, self.nspam, self.nham) = t[1:]
        self.probcache = {}

    # spamprob() implementations.  One of the following is aliased to
    # spamprob, depending on option settings.
    # Currently only chi-squared is available, but maybe there will be
    # an alternative again someday.

    # Across vectors of length n, containing random uniformly-distributed
    # probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution
    # with 2*n degrees of freedom.  This has been proven (in some
    # appropriate sense) to be the most sensitive possible test for
    # rejecting the hypothesis that a vector of probabilities is uniformly
    # distributed.  Gary Robinson's original scheme was monotonic *with*
    # this test, but skipped the details.  Turns out that getting closer
    # to the theoretical roots gives a much sharper classification, with
    # a very small (in # of msgs), but also very broad (in range of scores),
    # "middle ground", where most of the mistakes live.  In particular,
    # this scheme seems immune to all forms of "cancellation disease": if
    # there are many strong ham *and* spam clues, this reliably scores
    # close to 0.5.  Most other schemes are extremely certain then -- and
    # often wrong.

    def chi2_spamprob(self, wordstream, evidence=False):
        """Return best-guess probability that wordstream is spam.

        wordstream is an iterable object producing words.
        The return value is a float in [0.0, 1.0].

        If optional arg evidence is True, the return value is a pair
            probability, evidence
        where evidence is a list of (word, probability) pairs.
        """
        from math import frexp, log as ln

        # We compute two chi-squared statistics, one for ham and one for
        # spam.  The sum-of-the-logs business is more sensitive to probs
        # near 0 than to probs near 1, so the spam measure uses 1-p (so
        # that high-spamprob words have greatest effect), and the ham
        # measure uses p directly (so that lo-spamprob words have greatest
        # effect).
        #
        # For optimization, sum-of-logs == log-of-product, and f.p.
        # multiplication is a lot cheaper than calling ln().  It's easy
        # to underflow to 0.0, though, so we simulate unbounded dynamic
        # range via frexp.  The real product H = this H * 2**Hexp, and
        # likewise the real product S = this S * 2**Sexp.
        H = S = 1.0
        Hexp = Sexp = 0

        clues = self._getclues(wordstream)
        ## wordstream.allclues = list(set(wordstream.allclues + clues))
        for prob, word, record in clues:
            S *= 1.0 - prob
            H *= prob
            if S < 1e-200:  # prevent underflow
                S, e = frexp(S)
                Sexp += e
            if H < 1e-200:  # prevent underflow
                H, e = frexp(H)
                Hexp += e

        # Compute the natural log of the product = sum of the logs:
        # ln(x * 2**i) = ln(x) + i * ln(2).
        S = ln(S) + Sexp * LN2
        H = ln(H) + Hexp * LN2

        n = len(clues)
        if n:
            S = 1.0 - chi2Q(-2.0 * S, 2 * n)
            H = 1.0 - chi2Q(-2.0 * H, 2 * n)

            # How to combine these into a single spam score?  We originally
            # used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H).  A
            # systematic problem is that we could end up being near-certain
            # a thing was (for example) spam, even if S was small, provided
            # that H was much smaller.
            # Rob Hooft stared at these problems and invented the measure
            # we use now, the simpler S-H, scaled into [0., 1.].
            prob = (S - H + 1.0) / 2.0
        else:
            prob = 0.5

        if evidence:
            clues = [(w, p) for p, w, _r in clues]
            clues.sort(lambda a, b: cmp(a[1], b[1]))
            clues.insert(0, ('*S*', S))
            clues.insert(0, ('*H*', H))
            wordstream.prob = prob
            return prob, clues
        else:
            wordstream.prob = prob
            return prob

    def slurping_spamprob(self, wordstream, evidence=False):
        """Do the standard chi-squared spamprob, but if the evidence
        leaves the score in the unsure range, and we have fewer tokens
        than max_discriminators, also generate tokens from the text
        obtained by following http URLs in the message."""
        h_cut = options["Categorization", "ham_cutoff"]
        s_cut = options["Categorization", "spam_cutoff"]

        # Get the raw score.
        prob, clues = self.chi2_spamprob(wordstream, True)

        # If necessary, enhance it with the tokens from whatever is
        # at the URL's destination.
        if len(clues) < options["Classifier", "max_discriminators"] and \
           prob > h_cut and prob < s_cut and slurp_wordstream:
            slurp_tokens = list(self._generate_slurp())
            slurp_tokens.extend([w for (w, _p) in clues])
            sprob, sclues = self.chi2_spamprob(slurp_tokens, True)
            if sprob < h_cut or sprob > s_cut:
                prob = sprob
                clues = sclues
        if evidence:
            return prob, clues
        return prob

    if options["Classifier", "use_chi_squared_combining"]:
        if options["URLRetriever", "x-slurp_urls"]:
            spamprob = slurping_spamprob
        else:
            spamprob = chi2_spamprob

    def learn(self, wordstream, is_spam):
        """Teach the classifier by example.

        wordstream is a word stream representing a message.  If is_spam is
        True, you're telling the classifier this message is definitely spam,
        else that it's definitely not spam.
        """
        if options["Classifier", "use_bigrams"]:
            wordstream = self._enhance_wordstream(wordstream)
        if options["URLRetriever", "x-slurp_urls"]:
            wordstream = self._add_slurped(wordstream)
        self._add_msg(wordstream, is_spam)

    def unlearn(self, wordstream, is_spam):
        """In case of pilot error, call unlearn ASAP after screwing up.

        Pass the same arguments you passed to learn().
        """
        if options["Classifier", "use_bigrams"]:
            wordstream = self._enhance_wordstream(wordstream)
        if options["URLRetriever", "x-slurp_urls"]:
            wordstream = self._add_slurped(wordstream)
        self._remove_msg(wordstream, is_spam)
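
    # --- Illustrative sketch, not part of the original code ---
    # A self-contained rehearsal of the chi-squared combining performed in
    # chi2_spamprob() above.  The local chi2q is a stand-in for the
    # module's chi2Q (survival function of the chi-squared distribution,
    # for even degrees of freedom); the frexp underflow games are skipped
    # since the inputs here are tiny.  Strong agreeing clues push the
    # score toward 0 or 1; strong *conflicting* clues land near 0.5.
    def _example_chi2_combine(probs):
        """Sketch only: combine clue spamprobs the chi-squared way.

        >>> Classifier._example_chi2_combine([0.99, 0.99, 0.99]) > 0.9
        True
        >>> round(Classifier._example_chi2_combine([0.99, 0.01]), 6)
        0.5
        """
        from math import exp, log as ln

        def chi2q(x2, v):
            # prob(chisq >= x2 | v degrees of freedom); v must be even.
            m = x2 / 2.0
            total = term = exp(-m)
            for i in range(1, v // 2):
                term *= m / i
                total += term
            return min(total, 1.0)

        n = len(probs)
        if not n:
            return 0.5
        S = sum([ln(1.0 - p) for p in probs])  # spam measure uses 1-p
        H = sum([ln(p) for p in probs])        # ham measure uses p
        S = 1.0 - chi2q(-2.0 * S, 2 * n)
        H = 1.0 - chi2q(-2.0 * H, 2 * n)
        return (S - H + 1.0) / 2.0
    _example_chi2_combine = staticmethod(_example_chi2_combine)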
    def probability(self, record):
        """Compute, store, and return prob(msg is spam | msg contains word).

        This is the Graham calculation, but stripped of biases, and
        stripped of clamping into 0.01 thru 0.99.  The Bayesian
        adjustment following keeps them in a sane range, and one
        that naturally grows the more evidence there is to back up
        a probability.
        """
        spamcount = record.spamcount
        hamcount = record.hamcount

        # Try the cache first
        try:
            return self.probcache[spamcount][hamcount]
        except KeyError:
            pass

        nham = float(self.nham or 1)
        nspam = float(self.nspam or 1)

        assert hamcount <= nham, "Token seen in more ham than ham trained."
        hamratio = hamcount / nham

        assert spamcount <= nspam, "Token seen in more spam than spam trained."
        spamratio = spamcount / nspam

        prob = spamratio / (hamratio + spamratio)

        S = options["Classifier", "unknown_word_strength"]
        StimesX = S * options["Classifier", "unknown_word_prob"]

        # Now do Robinson's Bayesian adjustment.
        #
        #         s*x + n*p(w)
        # f(w) = --------------
        #           s + n
        #
        # I find this easier to reason about like so (equivalent when
        # s != 0):
        #
        #        x - p
        #  p +  -------
        #       1 + n/s
        #
        # IOW, it moves p a fraction of the distance from p to x, and
        # less so the larger n is, or the smaller s is.
        n = hamcount + spamcount
        prob = (StimesX + n * prob) / (S + n)

        # Update the cache
        try:
            self.probcache[spamcount][hamcount] = prob
        except KeyError:
            self.probcache[spamcount] = {hamcount: prob}

        return prob

    # NOTE:  Graham's scheme had a strange asymmetry:  when a word appeared
    # n>1 times in a single message, training added n to the word's hamcount
    # or spamcount, but predicting scored words only once.  Tests showed
    # that adding only 1 in training, or scoring more than once when
    # predicting, hurt under the Graham scheme.
    # This isn't so under Robinson's scheme, though:  results improve
    # if training also counts a word only once.  The mean ham score decreases
    # significantly and consistently, ham score variance decreases likewise,
    # mean spam score decreases (but less than mean ham score, so the spread
    # increases), and spam score variance increases.
    # I (Tim) speculate that adding n times under the Graham scheme helped
    # because it acted against the various ham biases, giving frequently
    # repeated spam words (like "Viagra") a quick ramp-up in spamprob; else,
    # adding only once in training, a word like that was simply ignored until
    # it appeared in 5 distinct training spams.  Without the ham-favoring
    # biases, though, and never ignoring words, counting n times introduces
    # a subtle and unhelpful bias.
    # There does appear to be some useful info in how many times a word
    # appears in a msg, but distorting spamprob doesn't appear a correct way
    # to exploit it.

    def _add_msg(self, wordstream, is_spam):
        self.probcache = {}    # nuke the prob cache
        if is_spam:
            self.nspam += 1
        else:
            self.nham += 1

        for word in wordstream:
            record = self._wordinfoget(word)
            if record is None:
                record = self.WordInfoClass()

            if is_spam:
                record.spamcount += 1
                if record.spamcount > self.nspam:
                    print wordstream.guts
                    print "Word: " + word
                    raise AssertionError(
                        str(record.spamcount) + " " + str(self.nspam))
            else:
                record.hamcount += 1

            self._wordinfoset(word, record)

        self._post_training()

    def _remove_msg(self, wordstream, is_spam):
        self.probcache = {}    # nuke the prob cache
        if is_spam:
            if self.nspam <= 0:
                raise ValueError("spam count would go negative!")
            self.nspam -= 1
        else:
            if self.nham <= 0:
                raise ValueError("non-spam count would go negative!")
            self.nham -= 1

        for word in wordstream:
            record = self._wordinfoget(word)
            if record is not None:
                if is_spam:
                    if record.spamcount > 0:
                        record.spamcount -= 1
                else:
                    if record.hamcount > 0:
                        record.hamcount -= 1
                if record.hamcount == 0 == record.spamcount:
                    self._wordinfodel(word)
                else:
                    self._wordinfoset(word, record)

        self._post_training()

    def _post_training(self):
        """This is called after training on a wordstream.  Subclasses might
        want to ensure that their databases are in a consistent state at
        this point.  Introduced to fix bug #797890."""
        pass
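
    # --- Illustrative sketch, not part of the original code ---
    # A worked instance of Robinson's adjustment from probability() above,
    #     f(w) = (s*x + n*p) / (s + n)
    # assuming the stock defaults s = 0.45 (unknown_word_strength) and
    # x = 0.5 (unknown_word_prob).  A word seen once, and only in spam,
    # has raw p = 1.0, but one observation is weak evidence, so f(w) is
    # pulled well back toward x.  The helper name is hypothetical.
    def _example_robinson_adjustment(spamcount, hamcount, nspam, nham,
                                     s=0.45, x=0.5):
        """Sketch only: the spamprob for a single word's counts.

        >>> print round(Classifier._example_robinson_adjustment(
        ...     1, 0, 10, 10), 3)
        0.845
        >>> print round(Classifier._example_robinson_adjustment(
        ...     10, 10, 100, 100), 3)
        0.5
        """
        hamratio = hamcount / float(nham or 1)
        spamratio = spamcount / float(nspam or 1)
        p = spamratio / (hamratio + spamratio)  # raw Graham-style prob
        n = hamcount + spamcount                # evidence for this word
        return (s * x + n * p) / (s + n)
    _example_robinson_adjustment = staticmethod(_example_robinson_adjustment)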
    # Return list of (prob, word, record) triples, sorted by increasing
    # prob.  "word" is a token from wordstream; "prob" is its spamprob (a
    # float in 0.0 through 1.0); and "record" is word's associated
    # WordInfo record if word is in the training database, or None if it's
    # not.  No more than max_discriminators items are returned, and they
    # have the strongest (farthest from 0.5) spamprobs of all tokens in
    # wordstream.  Tokens with spamprobs less than minimum_prob_strength
    # away from 0.5 aren't returned.
    def _getclues(self, wordstream):
        mindist = options["Classifier", "minimum_prob_strength"]

        if options["Classifier", "use_bigrams"]:
            # This scheme mixes single tokens with pairs of adjacent tokens.
            # wordstream is "tiled" into non-overlapping unigrams and
            # bigrams.  Non-overlap is important to prevent a single original
            # token from contributing to more than one spamprob returned
            # (systematic correlation probably isn't a good thing).

            # First fill list raw with
            #     (distance, prob, word, record), indices
            # pairs, one for each unigram and bigram in wordstream.
            # indices is a tuple containing the indices (0-based relative to
            # the start of wordstream) of the tokens that went into word.
            # indices is a 1-tuple for an original token, and a 2-tuple for
            # a synthesized bigram token.  The indices are needed to detect
            # overlap later.
            raw = []
            push = raw.append
            pair = None
            # Keep track of which tokens we've already seen.
            # Don't use a set here!  This is an innermost loop, so speed is
            # important here (direct dict fiddling is much quicker than
            # invoking Python-level set methods; in Python 2.4 that will
            # change).
            seen = {pair: 1}  # so the bigram token is skipped on 1st loop trip
            for i, token in enumerate(wordstream):
                if i:   # not the 1st loop trip, so there is a preceding token
                    # This string interpolation must match the one in
                    # _enhance_wordstream().
                    pair = "bi:%s %s" % (last_token, token)
                last_token = token
                for clue, indices in (token, (i,)), (pair, (i - 1, i)):
                    if clue not in seen:    # as always, skip duplicates
                        seen[clue] = 1
                        tup = self._worddistanceget(clue)
                        if tup[0] >= mindist:
                            push((tup, indices))

            # Sort raw, strongest to weakest spamprob.
            raw.sort()
            raw.reverse()

            # Fill clues with the strongest non-overlapping clues.
            clues = []
            push = clues.append
            # Keep track of which indices have already contributed to a
            # clue in clues.
            seen = {}
            for tup, indices in raw:
                overlap = [i for i in indices if i in seen]
                if not overlap:     # no overlap with anything already in clues
                    for i in indices:
                        seen[i] = 1
                    push(tup)
            # Leave sorted from smallest to largest spamprob.
            clues.reverse()
        else:
            if len(wordstream.clues) != 0:
                clues = map(self._tupledistanceget, wordstream.clues)
                wordstream.clues = clues
                return clues
            else:
                # The all-unigram scheme just scores the tokens as-is.
                clues = []
                push = clues.append
                ## for word in wordstream:
                ##     tup = self._worddistanceget(word)
                ##     if tup[0] >= mindist:
                ##         push(tup)
                for word in wordstream:
                    record = self.wordinfo.get(word)
                    if record is not None:
                        try:
                            prob = self.probability(record)
                        except AssertionError:
                            print "Word: " + word
                            print wordstream.guts
                            print "Record: " + str(record)
                            raise
                    else:
                        prob = options["Classifier", "unknown_word_prob"]
                    distance = abs(prob - 0.5)
                    if distance >= mindist:
                        push((distance, prob, word, record))

        clues.sort()
        if len(clues) > options["Classifier", "max_discriminators"]:
            del clues[0:-options["Classifier", "max_discriminators"]]
        # Return (prob, word, record).
        trunc_clues = [t[1:] for t in clues]
        wordstream.clues = trunc_clues
        return trunc_clues
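
    # --- Illustrative sketch, not part of the original code ---
    # The bigram branch of _getclues() "tiles" the stream: every unigram
    # and adjacent-pair bigram is a candidate, candidates are ranked
    # strongest-first, and a candidate is kept only if none of its token
    # indices was already claimed by a stronger clue.  The selection step,
    # reduced to its bones (candidates are (distance, clue, indices)):
    def _example_tile_selection(candidates):
        """Sketch only: greedy non-overlapping clue selection.

        >>> Classifier._example_tile_selection([
        ...     (0.4, 'bi:a b', (0, 1)),
        ...     (0.3, 'a', (0,)),
        ...     (0.2, 'c', (2,))])
        ['bi:a b', 'c']
        """
        candidates = list(candidates)
        candidates.sort()
        candidates.reverse()            # strongest first
        seen = {}
        picked = []
        for distance, clue, indices in candidates:
            overlap = [i for i in indices if i in seen]
            if not overlap:             # no index already claimed
                for i in indices:
                    seen[i] = 1
                picked.append(clue)
        return picked
    _example_tile_selection = staticmethod(_example_tile_selection)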
    def update_clue_prob(self, record):
        mindist = options["Classifier", "minimum_prob_strength"]
        prob = self.probability(record)
        if abs(prob - 0.5) >= mindist:
            return prob
        else:
            raise AssertionError("Cached record has become too weak.")

    def _tupledistanceget(self, clue):
        return tuple(self._worddistanceget(clue[1])[1:])

    def _worddistanceget(self, word):
        record = self._wordinfoget(word)
        if record is None:
            prob = options["Classifier", "unknown_word_prob"]
        else:
            prob = self.probability(record)
        distance = abs(prob - 0.5)
        return distance, prob, word, record

    def _wordinfoget(self, word):
        return self.wordinfo.get(word)

    def _wordinfoset(self, word, record):
        self.wordinfo[word] = record

    def _wordinfodel(self, word):
        del self.wordinfo[word]

    def _enhance_wordstream(self, wordstream):
        """Add bigrams to the wordstream.

        For example, a b c -> a b "a b" c "b c"

        Note that these are *token* bigrams, and not *word* bigrams - i.e.
        'synthetic' tokens get bigram'ed, too.

        The bigram token is simply "bi:unigram1 unigram2" - a space should
        be sufficient as a separator, since spaces aren't in any other
        tokens, apart from 'synthetic' ones.  The "bi:" prefix is added to
        avoid conflict with tokens we generate (like "subject: word", which
        could be "word" in a subject, or a bigram of "subject:" and "word").

        If the "Classifier":"use_bigrams" option is removed, this function
        can be removed, too.
        """
        last = None
        for token in wordstream:
            yield token
            if last:
                # This string interpolation must match the one in
                # _getclues().
                yield "bi:%s %s" % (last, token)
            last = token

    def _generate_slurp(self):
        # We don't want to do this recursively and check URLs
        # on webpages, so we have this little cheat.
        if not hasattr(self, "setup_done"):
            self.setup()
            self.setup_done = True
        if not hasattr(self, "do_slurp") or self.do_slurp:
            if slurp_wordstream:
                self.do_slurp = False

                tokens = self.slurp(*slurp_wordstream)
                self.do_slurp = True
                self._save_caches()
                return tokens
        return []
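
    # --- Illustrative sketch, not part of the original code ---
    # What _enhance_wordstream() yields for a three-token stream: each
    # token as-is, followed by a "bi:" token pairing it with its
    # predecessor (the interpolation matches _getclues()).
    def _example_enhance_wordstream(tokens):
        """Sketch only: token-bigram enhancement, standalone.

        >>> Classifier._example_enhance_wordstream(['a', 'b', 'c'])
        ['a', 'b', 'bi:a b', 'c', 'bi:b c']
        """
        out = []
        last = None
        for token in tokens:
            out.append(token)
            if last:
                out.append("bi:%s %s" % (last, token))
            last = token
        return out
    _example_enhance_wordstream = staticmethod(_example_enhance_wordstream)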
if options["globals", "verbose"]: print >> sys.stderr, "Creating URL cache directory" os.makedirs(dir) self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(), dir, cacheSize=20) # Kill any old information in the cache self.urlCorpus.removeExpiredMessages() # Setup caches for unretrievable urls self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck") self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck") if os.path.exists(self.bad_url_cache_name): try: self.bad_urls = pickle_read(self.bad_url_cache_name) except (IOError, ValueError): # Something went wrong loading it (bad pickle, # probably). Start afresh. if options["globals", "verbose"]: print >> sys.stderr, "Bad URL pickle, using new." self.bad_urls = { "url:non_resolving": (), "url:non_html": (), "url:unknown_error": () } else: if options["globals", "verbose"]: print "URL caches don't exist: creating" self.bad_urls = { "url:non_resolving": (), "url:non_html": (), "url:unknown_error": () } if os.path.exists(self.http_error_cache_name): try: self.http_error_urls = pickle_read(self.http_error_cache_name) except IOError, ValueError: # Something went wrong loading it (bad pickle, # probably). Start afresh. if options["globals", "verbose"]: print >> sys.stderr, "Bad HHTP error pickle, using new." self.http_error_urls = {} else: