def onTrain(self, file, text, which):
    """Train on an uploaded or pasted message.

    Parameters:
        file: uploaded content.  When truthy it takes precedence over
            `text` and is first passed through `self._convertToMbox`.
        text: pasted content, used only when `file` is falsy.
        which: label of the submit button that was pressed.  The messages
            are trained as spam when it equals the translated
            'Train as Spam' label, and as ham otherwise.

    The page (preamble, progress line, result line, a fresh training box
    and the postamble) is emitted through `self.write`; nothing is
    returned.
    """
    self._writePreamble(_("Train"))

    # Upload wins over pasted text.
    content = file or text
    isSpam = (which == _('Train as Spam'))

    # Only uploads go through the mbox conversion step.
    if file:
        content = self._convertToMbox(content)

    # Normalise CRLF and bare CR line endings to LF.
    content = content.replace('\r\n', '\n').replace('\r', '\n')

    messages = self._convertUploadToMessageList(content)

    # Find (or lazily create and remember on self) the corpus to train.
    if isSpam:
        desired_corpus = "spamCorpus"
    else:
        desired_corpus = "hamCorpus"
    if hasattr(self, desired_corpus):
        corpus = getattr(self, desired_corpus)
    else:
        if hasattr(self, "state"):
            # Borrow the corpus and the message namer from the shared state.
            corpus = getattr(self.state, desired_corpus)
            setattr(self, desired_corpus, corpus)
            self.msg_name_func = self.state.getNewMessageName
        else:
            # No shared state: build a corpus from the configured options.
            if isSpam:
                fn = storage.get_pathname_option("Storage", "spam_cache")
            else:
                fn = storage.get_pathname_option("Storage", "ham_cache")
            storage.ensureDir(fn)
            if options["Storage", "cache_use_gzip"]:
                factory = FileCorpus.GzipFileMessageFactory()
            else:
                factory = FileCorpus.FileMessageFactory()
            # The option is expressed in days; convert it to seconds.
            age = options["Storage", "cache_expiry_days"] * 24 * 60 * 60
            # Raw string: "\-" is not a valid escape in a normal literal.
            corpus = FileCorpus.ExpiryFileCorpus(age, factory, fn,
                                                 r'[0123456789\-]*',
                                                 cacheSize=20)
            setattr(self, desired_corpus, corpus)

            class UniqueNamer(object):
                """Generate "<timestamp>-<counter>" message names."""
                count = -1

                def generate_name(self):
                    self.count += 1
                    # int() formats identically to the Python 2-only long().
                    return "%10.10d-%d" % (int(time.time()), self.count)

            Namer = UniqueNamer()
            self.msg_name_func = Namer.generate_name

    # Train on each message in turn.
    self.write("<b>" + _("Training") + "...</b>\n")
    self.flush()
    for message in messages:
        key = self.msg_name_func()
        msg = corpus.makeMessage(key, message)
        msg.setId(key)
        corpus.addMessage(msg)
        msg.RememberTrained(isSpam)
        # RecordTraining receives the negation of the spam flag.
        self.stats.RecordTraining(not isSpam)

    # Save, then offer a link Home and another training form.
    self._doSave()
    # The anchor must be closed with "</a>"; "</a" produced broken HTML.
    self.write(_("%sOK. Return %sHome%s or train again:%s") %
               ("<p>", "<a href='home'>", "</a>", "</p>"))
    self.write(self._buildTrainBox())
    self._writePostamble()
def train_mime(self, msg_text, encoding, is_spam):
    """Train on a single MIME message supplied as text.

    Parameters:
        msg_text: source of the message.  A `str` is decoded with
            `encoding`; a `unicode` object is then encoded as UTF-8
            before the message is parsed.
        encoding: codec name used to decode `msg_text` when it is a `str`.
        is_spam: true to add the message to the spam corpus, false to
            add it to the ham corpus.

    NOTE(review): this relies on the `unicode` builtin, so it presumably
    targets Python 2 -- confirm before porting.
    """
    if self.state.bayes is None:
        self.state.create_workers()

    # Get msg_text into canonical string representation.
    # Make sure we have a unicode object...
    if isinstance(msg_text, str):
        msg_text = unicode(msg_text, encoding)
    # ... then encode it as utf-8.
    if isinstance(msg_text, unicode):
        msg_text = msg_text.encode("utf-8")
    msg = message_from_string(msg_text,
                              _class=spambayes.message.SBHeaderMessage)

    # Find (or lazily create and remember on self) the corpus to train.
    if is_spam:
        desired_corpus = "spamCorpus"
    else:
        desired_corpus = "hamCorpus"
    if hasattr(self, desired_corpus):
        corpus = getattr(self, desired_corpus)
    else:
        if hasattr(self, "state"):
            # Borrow the corpus and the message namer from the shared state.
            corpus = getattr(self.state, desired_corpus)
            setattr(self, desired_corpus, corpus)
            self.msg_name_func = self.state.getNewMessageName
        else:
            # NOTE(review): self.state is already dereferenced at the top
            # of this method, so this fallback looks unreachable -- confirm.
            if is_spam:
                fn = storage.get_pathname_option("Storage", "spam_cache")
            else:
                fn = storage.get_pathname_option("Storage", "ham_cache")
            storage.ensureDir(fn)
            if options["Storage", "cache_use_gzip"]:
                factory = FileCorpus.GzipFileMessageFactory()
            else:
                factory = FileCorpus.FileMessageFactory()
            # The option is expressed in days; convert it to seconds.
            age = options["Storage", "cache_expiry_days"] * 24 * 60 * 60
            # Raw string: "\-" is not a valid escape in a normal literal.
            corpus = FileCorpus.ExpiryFileCorpus(age, factory, fn,
                                                 r'[0123456789\-]*',
                                                 cacheSize=20)
            setattr(self, desired_corpus, corpus)

            class UniqueNamer(object):
                """Generate "<timestamp>-<counter>" message names."""
                count = -1

                def generate_name(self):
                    self.count += 1
                    # int() formats identically to the Python 2-only long().
                    return "%10.10d-%d" % (int(time.time()), self.count)

            Namer = UniqueNamer()
            self.msg_name_func = Namer.generate_name

    # Store the parsed message in the corpus under a fresh key.
    key = self.msg_name_func()
    mime_message = unicode(msg.as_string(), "utf-8").encode("utf-8")
    msg = corpus.makeMessage(key, mime_message)
    msg.setId(key)
    corpus.addMessage(msg)
    msg.RememberTrained(is_spam)
def create_workers(self):
    """Create the Bayes object, the Corpuses, the Trainers and so on.

    Uses the options that were initialised in __init__ and then possibly
    overridden by the driver code.  Sets `self.bayes` and `self.stats`
    and calls `self.build_status_strings()`.  Unless `self.is_test` is
    set, it also creates `self.spamCorpus`, `self.hamCorpus` and
    `self.unknownCorpus`, removes their expired messages, and attaches
    `self.spam_trainer` / `self.ham_trainer` as observers of the spam and
    ham corpora.
    """
    if self.is_test:
        self.use_db = "pickle"
        self.db_name = '_core_server.pickle'   # This is never saved.
    if not hasattr(self, "db_name"):
        self.db_name, self.use_db = storage.database_type([])
    self.bayes = storage.open_storage(self.db_name, self.use_db)

    # Load stats manager.
    self.stats = Stats.Stats(options,
                             spambayes.message.Message().message_info_db)

    self.build_status_strings()

    # Don't set up the caches and training objects when running the
    # self-test, so as not to clutter the filesystem.
    if not self.is_test:
        # Create/open the Corpuses.  Use small cache sizes to avoid
        # hogging lots of memory.
        sc = get_pathname_option("Storage", "core_spam_cache")
        hc = get_pathname_option("Storage", "core_ham_cache")
        uc = get_pathname_option("Storage", "core_unknown_cache")
        for d in [sc, hc, uc]:
            storage.ensureDir(d)
        if self.gzip_cache:
            factory = GzipFileMessageFactory()
        else:
            factory = FileMessageFactory()
        # The option is expressed in days; convert it to seconds.
        age = options["Storage", "cache_expiry_days"] * 24 * 60 * 60
        # Raw string: "\-" is not a valid escape in a normal literal.
        name_pattern = r'[0123456789\-]*'
        self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                           name_pattern, cacheSize=20)
        self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                          name_pattern, cacheSize=20)
        self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                              name_pattern, cacheSize=20)

        # Given that (hopefully) users will get to the stage
        # where they do not need to do any more regular training to
        # be satisfied with spambayes' performance, we expire old
        # messages from not only the trained corpora, but the unknown
        # as well.
        self.spamCorpus.removeExpiredMessages()
        self.hamCorpus.removeExpiredMessages()
        self.unknownCorpus.removeExpiredMessages()

        # Create the Trainers.
        self.spam_trainer = storage.SpamTrainer(self.bayes)
        self.ham_trainer = storage.HamTrainer(self.bayes)
        self.spamCorpus.addObserver(self.spam_trainer)
        self.hamCorpus.addObserver(self.ham_trainer)
def train_mime(self, msg_text, encoding, is_spam):
    """Train on a single MIME message supplied as text.

    Parameters:
        msg_text: source of the message.  A `str` is decoded with
            `encoding`; a `unicode` object is then encoded as UTF-8
            before the message is parsed.
        encoding: codec name used to decode `msg_text` when it is a `str`.
        is_spam: true to add the message to the spam corpus, false to
            add it to the ham corpus.

    NOTE(review): this relies on the `unicode` builtin, so it presumably
    targets Python 2 -- confirm before porting.
    """
    if self.state.bayes is None:
        self.state.create_workers()

    # Get msg_text into canonical string representation.
    # Make sure we have a unicode object...
    if isinstance(msg_text, str):
        msg_text = unicode(msg_text, encoding)
    # ... then encode it as utf-8.
    if isinstance(msg_text, unicode):
        msg_text = msg_text.encode("utf-8")
    msg = message_from_string(msg_text,
                              _class=spambayes.message.SBHeaderMessage)

    # Find (or lazily create and remember on self) the corpus to train.
    if is_spam:
        desired_corpus = "spamCorpus"
    else:
        desired_corpus = "hamCorpus"
    if hasattr(self, desired_corpus):
        corpus = getattr(self, desired_corpus)
    else:
        if hasattr(self, "state"):
            # Borrow the corpus and the message namer from the shared state.
            corpus = getattr(self.state, desired_corpus)
            setattr(self, desired_corpus, corpus)
            self.msg_name_func = self.state.getNewMessageName
        else:
            # NOTE(review): self.state is already dereferenced at the top
            # of this method, so this fallback looks unreachable -- confirm.
            if is_spam:
                fn = storage.get_pathname_option("Storage", "spam_cache")
            else:
                fn = storage.get_pathname_option("Storage", "ham_cache")
            storage.ensureDir(fn)
            if options["Storage", "cache_use_gzip"]:
                factory = FileCorpus.GzipFileMessageFactory()
            else:
                factory = FileCorpus.FileMessageFactory()
            # The option is expressed in days; convert it to seconds.
            age = options["Storage", "cache_expiry_days"] * 24 * 60 * 60
            # Raw string: "\-" is not a valid escape in a normal literal.
            corpus = FileCorpus.ExpiryFileCorpus(age, factory, fn,
                                                 r'[0123456789\-]*',
                                                 cacheSize=20)
            setattr(self, desired_corpus, corpus)

            class UniqueNamer(object):
                """Generate "<timestamp>-<counter>" message names."""
                count = -1

                def generate_name(self):
                    self.count += 1
                    # int() formats identically to the Python 2-only long().
                    return "%10.10d-%d" % (int(time.time()), self.count)

            Namer = UniqueNamer()
            self.msg_name_func = Namer.generate_name

    # Store the parsed message in the corpus under a fresh key.
    key = self.msg_name_func()
    mime_message = unicode(msg.as_string(), "utf-8").encode("utf-8")
    msg = corpus.makeMessage(key, mime_message)
    msg.setId(key)
    corpus.addMessage(msg)
    msg.RememberTrained(is_spam)
def createWorkers(self):
    """Create the Bayes object, the Corpuses, the Trainers and so on.

    Uses the options that were initialised in __init__ and then possibly
    overridden by the driver code.  Sets `self.bayes`, `self.mdb` and
    `self.stats` and calls `self.buildStatusStrings()`.  Unless
    `self.isTest` is set, it also creates `self.spamCorpus`,
    `self.hamCorpus` and `self.unknownCorpus`, removes their expired
    messages, and attaches `self.spamTrainer` / `self.hamTrainer` as
    observers of the spam and ham corpora.
    """
    # Progress message; the trailing space (no newline) keeps the cursor
    # on the same line.
    print("Loading database...", end=' ')
    if self.isTest:
        self.useDB = "pickle"
        self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
    if not hasattr(self, "DBName"):
        self.DBName, self.useDB = storage.database_type([])
    self.bayes = storage.open_storage(self.DBName, self.useDB)
    self.mdb = spambayes.message.Message().message_info_db

    # The stats manager is built on the message info database.
    self.stats = Stats.Stats(options, self.mdb)

    self.buildStatusStrings()

    # The caches and training objects are only set up outside test mode.
    if not self.isTest:
        # Create/open the three corpora with a cache size of 20 each.
        sc = get_pathname_option("Storage", "spam_cache")
        hc = get_pathname_option("Storage", "ham_cache")
        uc = get_pathname_option("Storage", "unknown_cache")
        for d in [sc, hc, uc]:
            storage.ensureDir(d)
        if self.gzipCache:
            factory = GzipFileMessageFactory()
        else:
            factory = FileMessageFactory()
        # The option is expressed in days; convert it to seconds.
        age = options["Storage", "cache_expiry_days"] * 24 * 60 * 60
        # Raw string: "\-" is not a valid escape in a normal literal.
        name_pattern = r'[0123456789\-]*'
        self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                           name_pattern, cacheSize=20)
        self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                          name_pattern, cacheSize=20)
        self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                              name_pattern, cacheSize=20)

        # Expire old messages from all three corpora, not only the
        # trained ones.
        self.spamCorpus.removeExpiredMessages()
        self.hamCorpus.removeExpiredMessages()
        self.unknownCorpus.removeExpiredMessages()

        # Create the trainers and attach them as corpus observers.
        self.spamTrainer = storage.SpamTrainer(self.bayes)
        self.hamTrainer = storage.HamTrainer(self.bayes)
        self.spamCorpus.addObserver(self.spamTrainer)
        self.hamCorpus.addObserver(self.hamTrainer)
def create_workers(self):
    """Create the Bayes object, the Corpuses, the Trainers and so on.

    Uses the options that were initialised in __init__ and then possibly
    overridden by the driver code.  Sets `self.bayes` and `self.stats`
    and calls `self.build_status_strings()`.  Unless `self.is_test` is
    set, it also creates `self.spamCorpus`, `self.hamCorpus` and
    `self.unknownCorpus`, removes their expired messages, and attaches
    `self.spam_trainer` / `self.ham_trainer` as observers of the spam and
    ham corpora.
    """
    if self.is_test:
        self.use_db = "pickle"
        self.db_name = '_core_server.pickle'   # This is never saved.
    if not hasattr(self, "db_name"):
        self.db_name, self.use_db = storage.database_type([])
    self.bayes = storage.open_storage(self.db_name, self.use_db)

    # Load stats manager.
    self.stats = Stats.Stats(options,
                             spambayes.message.Message().message_info_db)

    self.build_status_strings()

    # Don't set up the caches and training objects when running the
    # self-test, so as not to clutter the filesystem.
    if not self.is_test:
        # Create/open the Corpuses.  Use small cache sizes to avoid
        # hogging lots of memory.
        sc = get_pathname_option("Storage", "core_spam_cache")
        hc = get_pathname_option("Storage", "core_ham_cache")
        uc = get_pathname_option("Storage", "core_unknown_cache")
        for d in [sc, hc, uc]:
            storage.ensureDir(d)
        if self.gzip_cache:
            factory = GzipFileMessageFactory()
        else:
            factory = FileMessageFactory()
        # The option is expressed in days; convert it to seconds.
        age = options["Storage", "cache_expiry_days"] * 24 * 60 * 60
        # Raw string: "\-" is not a valid escape in a normal literal.
        name_pattern = r'[0123456789\-]*'
        self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                           name_pattern, cacheSize=20)
        self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                          name_pattern, cacheSize=20)
        self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                              name_pattern, cacheSize=20)

        # Given that (hopefully) users will get to the stage
        # where they do not need to do any more regular training to
        # be satisfied with spambayes' performance, we expire old
        # messages from not only the trained corpora, but the unknown
        # as well.
        self.spamCorpus.removeExpiredMessages()
        self.hamCorpus.removeExpiredMessages()
        self.unknownCorpus.removeExpiredMessages()

        # Create the Trainers.
        self.spam_trainer = storage.SpamTrainer(self.bayes)
        self.ham_trainer = storage.HamTrainer(self.bayes)
        self.spamCorpus.addObserver(self.spam_trainer)
        self.hamCorpus.addObserver(self.ham_trainer)