def _update(self, folders, is_spam):
    """Bring the classifier up to date with the given folders.

    Each folder is read for added and removed messages.  Added messages
    are learned and removed messages are unlearned, both with the
    ``is_spam`` label.

    Returns True if at least one folder reported additions or removals,
    False otherwise.
    """
    changed = False
    for folder in folders:
        log("update from %s" % folder.path)
        added, removed = folder.read()
        if added:
            log("added %d" % len(added))
        if removed:
            log("removed %d" % len(removed))
        get_transaction().commit()
        if not added and not removed:
            # Nothing happened in this folder.
            continue
        changed = True

        # It's important not to commit a transaction until
        # after update_probabilities is called in update().
        # Otherwise some new entries will cause scoring to fail.
        for message in added.keys():
            self.classifier.learn(tokenize(message), is_spam)
        del added
        get_transaction().commit(1)
        log("learned")

        for message in removed.keys():
            self.classifier.unlearn(tokenize(message), is_spam)
        if removed:
            log("unlearned")
        del removed
        get_transaction().commit(1)
    return changed
def test_dbm_export(self):
    """Export a dbm classifier to CSV and check the CSV against it."""
    # Create a dbm classifier to export.
    bayes = DBDictClassifier(TEMP_DBM_NAME)
    # Stuff some messages in it so it's not empty.
    bayes.learn(tokenize(spam1), True)
    bayes.learn(tokenize(good1), False)
    # Save & Close.
    bayes.store()
    bayes.close()
    # Export.
    sb_dbexpimp.runExport(TEMP_DBM_NAME, "dbm", TEMP_CSV_NAME)
    # Reopen the original.
    bayes = open_storage(TEMP_DBM_NAME, "dbm")
    try:
        # Verify that the CSV holds all the original data (and, by using
        # the CSV module to open it, that it is valid CSV data).
        fp = open(TEMP_CSV_NAME, "rb")
        try:
            reader = sb_dbexpimp.csv.reader(fp)
            # The first row holds the message counts.
            (nham, nspam) = reader.next()
            self.assertEqual(int(nham), bayes.nham)
            self.assertEqual(int(nspam), bayes.nspam)
            # Every following row is (word, hamcount, spamcount).
            for (word, hamcount, spamcount) in reader:
                word = sb_dbexpimp.uunquote(word)
                self.assert_(word in bayes._wordinfokeys())
                wi = bayes._wordinfoget(word)
                self.assertEqual(int(hamcount), wi.hamcount)
                self.assertEqual(int(spamcount), wi.spamcount)
        finally:
            # Release the CSV handle even when an assertion fails.
            fp.close()
    finally:
        # Likewise release the reopened database.
        bayes.close()
def train_message(msg, is_spam, cdata):
    """Train an individual message.

    Returns True if newly added (the message is untrained from its old
    category first when it was in the wrong one), False if it was
    already in the correct category.  Exceptions are not caught here;
    callers must handle their own.
    """
    from spambayes.tokenizer import tokenize

    cdata.message_db.load_msg(msg)
    previous = msg.t
    if previous == is_spam:
        # Already correctly classified.
        return False

    # Brand new (previous is None), or incorrectly classified.
    stream = msg.GetEmailPackageObject()
    if previous is not None:
        # The classification has changed; unlearn the old classification.
        cdata.bayes.unlearn(tokenize(stream), previous)

    # Learn the correct classification and persist it.
    cdata.bayes.learn(tokenize(stream), is_spam)
    msg.t = is_spam
    cdata.message_db.store_msg(msg)
    cdata.dirty = True
    return True
def _update(self, folders, is_spam):
    """Learn additions and unlearn removals reported by each folder.

    Every folder in ``folders`` is read; messages it reports as added
    are learned and messages it reports as removed are unlearned, both
    labelled ``is_spam``.

    Returns True if any folder had additions or removals.
    """
    changed = False
    for folder in folders:
        log("update from %s" % folder.path)
        added, removed = folder.read()
        if added:
            log("added %d" % len(added))
        if removed:
            log("removed %d" % len(removed))
        get_transaction().commit()
        if not added and not removed:
            # Folder is unchanged; move on.
            continue
        changed = True

        for message in added.keys():
            self.classifier.learn(tokenize(message), is_spam)
        del added
        get_transaction().commit(1)
        log("learned")

        for message in removed.keys():
            self.classifier.unlearn(tokenize(message), is_spam)
        if removed:
            log("unlearned")
        del removed
        get_transaction().commit(1)
    return changed
def _map_tokens(tokens, msgid, f, mboxtype, mapdb):
    """Record that each token in ``tokens`` occurs in message ``msgid``.

    ``mapdb`` maps a token to a ``(ham, spam)`` pair of dicts; each dict
    maps a mailbox name to the set of message-ids containing the token.
    """
    for t in tokens:
        ham, spam = mapdb.get(t, ({}, {}))
        if mboxtype == "ham":
            bucket = ham
        else:
            bucket = spam
        msgids = bucket.get(f, set())
        msgids.add(msgid)
        bucket[f] = msgids
        # Store the pair back explicitly rather than relying on in-place
        # mutation of what mapdb.get returned.
        mapdb[t] = (ham, spam)


def mapmessages(f, mboxtype, mapdb):
    """Map every token of every message in mailbox ``f`` into ``mapdb``.

    Messages without a message-id header are skipped.  Tokens are filed
    under the ham side when ``mboxtype`` is "ham" and under the spam side
    otherwise.  When the "x-use_bigrams" classifier option is set, the
    enhanced word stream of each message is recorded as well.  A progress
    counter is written to stdout.
    """
    i = 0
    for msg in getmbox(f):
        i += 1
        sys.stdout.write('\r%s: %d' % (f, i))
        sys.stdout.flush()
        msgid = msg.get("message-id")
        if msgid is None:
            continue
        _map_tokens(tokenize(msg), msgid, f, mboxtype, mapdb)
        if options["Classifier", "x-use_bigrams"]:
            enhanced = Classifier()._enhance_wordstream(tokenize(msg))
            _map_tokens(enhanced, msgid, f, mboxtype, mapdb)
    sys.stdout.write("\n")
def train_message(msg, is_spam, cdata):
    """Train an individual message.

    Returns True if newly added (the message is untrained from its old
    category first when it was in the wrong one), False if it was
    already in the correct category.  Exceptions are not caught here;
    callers must handle their own.

    The message database stores '1' for spam and '0' for ham under the
    message's search key.
    """
    from spambayes.tokenizer import tokenize

    key = msg.searchkey
    if cdata.message_db.has_key(key):
        was_spam = cdata.message_db[key] == '1'
    else:
        was_spam = None
    if was_spam == is_spam:
        # Already correctly classified.
        return False

    # Brand new (was_spam is None), or incorrectly classified.
    stream = msg.GetEmailPackageObject()
    if was_spam is not None:
        # The classification has changed; unlearn the old classification.
        cdata.bayes.unlearn(tokenize(stream), was_spam)

    # Learn the correct classification.
    cdata.bayes.learn(tokenize(stream), is_spam)
    flags = ['0', '1']
    cdata.message_db[msg.searchkey] = flags[is_spam]
    cdata.dirty = True
    return True
def untrain_message(msg, cdata):
    """Remove a message's training from the classifier.

    Returns True if the message had been trained as spam, False if it
    had been trained as ham, and None if it had not been trained at all
    (in which case nothing is changed).
    """
    from spambayes.tokenizer import tokenize

    stream = msg.GetEmailPackageObject()
    cdata.message_db.load_msg(msg)

    if been_trained_as_spam(msg):
        assert not been_trained_as_ham(msg), "Can't have been both!"
        trained_as_spam = True
    elif been_trained_as_ham(msg):
        assert not been_trained_as_spam(msg), "Can't have been both!"
        trained_as_spam = False
    else:
        return None

    cdata.bayes.unlearn(tokenize(stream), trained_as_spam)
    cdata.message_db.remove_msg(msg)
    cdata.dirty = True
    return trained_as_spam
def test_merge_to_pickle(self):
    """Merge a CSV file into an existing pickle and verify the totals."""
    # Build a non-empty pickled classifier to merge into.
    bayes = PickledClassifier(TEMP_PICKLE_NAME)
    bayes.learn(tokenize(spam1), True)
    bayes.learn(tokenize(good1), False)
    bayes.store()

    # Write the CSV file to import: counts first, then one row per word.
    nham, nspam = 3, 4
    csv_data = {"this":(2,1), "is":(0,1), "a":(3,4), 'test':(1,1),
                "of":(1,0), "the":(1,2), "import":(3,1)}
    csv_file = open(TEMP_CSV_NAME, "wb")
    csv_file.write("%d,%d\n" % (nham, nspam))
    for word, (ham, spam) in csv_data.items():
        csv_file.write("%s,%s,%s\n" % (word, ham, spam))
    csv_file.close()

    sb_dbexpimp.runImport(TEMP_PICKLE_NAME, "pickle", False, TEMP_CSV_NAME)

    # The merged store must hold the sum of the original and CSV data.
    merged = open_storage(TEMP_PICKLE_NAME, "pickle")
    self.assertEqual(merged.nham, nham + bayes.nham)
    self.assertEqual(merged.nspam, nspam + bayes.nspam)
    words = bayes._wordinfokeys()
    words.extend(csv_data.keys())
    for word in words:
        word = sb_dbexpimp.uquote(word)
        self.assert_(word in merged._wordinfokeys())
        expected_ham, expected_spam = csv_data.get(word, (0,0))
        original = bayes._wordinfoget(word)
        if original:
            expected_ham += original.hamcount
            expected_spam += original.spamcount
        merged_info = merged._wordinfoget(word)
        self.assertEqual(expected_ham, merged_info.hamcount)
        self.assertEqual(expected_spam, merged_info.spamcount)
def train_message(msg, is_spam, cdata):
    """Train one message as spam or ham.

    Returns False when the message's stored classification already
    equals ``is_spam``; otherwise (re)trains it, stores the new
    classification, marks ``cdata`` dirty and returns True.
    """
    from spambayes.tokenizer import tokenize

    cdata.message_db.load_msg(msg)
    old_label = msg.t
    if old_label == is_spam:
        return False

    stream = msg.GetEmailPackageObject()
    if old_label is not None:
        # Previously trained the other way: undo that first.
        cdata.bayes.unlearn(tokenize(stream), old_label)
    cdata.bayes.learn(tokenize(stream), is_spam)

    msg.t = is_spam
    cdata.message_db.store_msg(msg)
    cdata.dirty = True
    return True
def test_filter_train(self):
    """filter_train must add the classification header and train."""
    self.h.open('c')
    self.h.h.bayes.learn(tokenize(good1), False)
    self.h.h.bayes.learn(tokenize(spam1), True)
    self.h.h.store()

    # Spam: header says spam, and the spam count goes up to two.
    result = email.message_from_string(self.h.filter_train(spam1))
    header = result[options["Headers", "classification_header_name"]]
    self.assert_(
        header.startswith(options["Headers", "header_spam_string"]))
    self.assertEqual(self.h.h.bayes.nspam, 2)

    # Ham: header says ham, and the ham count goes up to two.
    result = email.message_from_string(self.h.filter_train(good1))
    header = result[options["Headers", "classification_header_name"]]
    self.assert_(
        header.startswith(options["Headers", "header_ham_string"]))
    self.assertEqual(self.h.h.bayes.nham, 2)
def runUIAndProxy():
    """Start the web UI and the proxy listener, then run Dibbler.

    The UI server is created on port 8881 and a proxy listener on
    ('localhost', 8110) / ('', 8111).  The classifier in ``state`` is
    given one spam and one ham message before ``proxyReady`` is set.
    """
    ui_server = UserInterfaceServer(8881)
    ui_server.register(ProxyUserInterface(state, _recreateState))
    BayesProxyListener('localhost', 8110, ('', 8111))

    # Give the classifier something to work with.
    classifier = state.bayes
    classifier.learn(tokenizer.tokenize(spam1), True)
    classifier.learn(tokenizer.tokenize(good1), False)

    proxyReady.set()
    Dibbler.run()
def test_untrain_spam(self):
    """Untraining a learned spam message leaves the classifier empty."""
    self.h.open('c')
    self.h.h.bayes.learn(tokenize(spam1), True)
    self.h.untrain_spam(spam1)

    classifier = self.h.h.bayes
    self.assertEqual(classifier.nham, 0)
    self.assertEqual(classifier.nspam, 0)
    # No token of the message may still have word info.
    for token in tokenize(spam1):
        self.assertEqual(self.h.h.bayes._wordinfoget(token), None)
def test_untrain_spam(self):
    """Check that untrain_spam removes a message from the classifier."""
    self.h.open('c')
    # Put a message in the classifier to be removed.
    self.h.h.bayes.learn(tokenize(spam1), True)
    # Verify that the classifier gets untrained with the message.
    self.h.untrain_spam(spam1)

    classifier = self.h.h.bayes
    self.assertEqual(classifier.nham, 0)
    self.assertEqual(classifier.nspam, 0)
    for token in tokenize(spam1):
        self.assertEqual(self.h.h.bayes._wordinfoget(token), None)
def test_newdb(self):
    """newdb() must leave no classifier open and reset the stored one."""
    existing = open_storage(TEMP_DBM_NAME, "dbm")
    existing.learn(tokenize(spam1), True)
    existing.learn(tokenize(good1), False)
    existing.store()
    existing.close()

    self.h.newdb()
    self.assertEqual(self.h.h, None)

    reopened = open_storage(TEMP_DBM_NAME, "dbm")
    self.assertEqual(reopened.nham, 0)
    self.assertEqual(reopened.nspam, 0)
    reopened.close()
def test_merge_to_dbm(self):
    """Merge a CSV file into an existing dbm store and verify the sums."""
    # Create a dbm classifier to merge with, and make it non-empty.
    bayes = DBDictClassifier(TEMP_DBM_NAME)
    bayes.learn(tokenize(spam1), True)
    bayes.learn(tokenize(good1), False)

    # Snapshot the data to check against before closing the store.
    original_nham = bayes.nham
    original_nspam = bayes.nspam
    original_data = {}
    for key in bayes._wordinfokeys():
        original_data[key] = bayes._wordinfoget(key)

    # Save & Close.
    bayes.store()
    bayes.close()

    # Create a CSV file to import.
    nham, nspam = 3, 4
    csv_data = {
        "this": (2, 1),
        "is": (0, 1),
        "a": (3, 4),
        'test': (1, 1),
        "of": (1, 0),
        "the": (1, 2),
        "import": (3, 1)
    }
    csv_file = open(TEMP_CSV_NAME, "wb")
    csv_file.write("%d,%d\n" % (nham, nspam))
    for word, (ham, spam) in csv_data.items():
        csv_file.write("%s,%s,%s\n" % (word, ham, spam))
    csv_file.close()

    sb_dbexpimp.runImport(TEMP_DBM_NAME, "dbm", False, TEMP_CSV_NAME)

    # Open the converted file and verify that it has all the data from
    # the CSV file (and by opening it, that it is a valid dbm file),
    # and the data from the original dbm database.
    merged = open_storage(TEMP_DBM_NAME, "dbm")
    self.assertEqual(merged.nham, nham + original_nham)
    self.assertEqual(merged.nspam, nspam + original_nspam)
    words = original_data.keys()[:]
    words.extend(csv_data.keys())
    for word in words:
        word = sb_dbexpimp.uquote(word)
        self.assert_(word in merged._wordinfokeys())
        expected_ham, expected_spam = csv_data.get(word, (0, 0))
        before = original_data.get(word, None)
        if before:
            expected_ham += before.hamcount
            expected_spam += before.spamcount
        after = merged._wordinfoget(word)
        self.assertEqual(expected_ham, after.hamcount)
        self.assertEqual(expected_spam, after.spamcount)
def test_filter(self):
    """Verify that the msg has the classification header added."""
    self.h.open('c')
    self.h.h.bayes.learn(tokenize(good1), False)
    self.h.h.bayes.learn(tokenize(spam1), True)
    self.h.h.store()

    # The spam message must be tagged with the spam string...
    result = email.message_from_string(self.h.filter(spam1))
    header = result[options["Headers", "classification_header_name"]]
    self.assert_(
        header.startswith(options["Headers", "header_spam_string"]))

    # ...and the ham message with the ham string.
    result = email.message_from_string(self.h.filter(good1))
    header = result[options["Headers", "classification_header_name"]]
    self.assert_(
        header.startswith(options["Headers", "header_ham_string"]))
def score(unsure, h, cls, scores, msgids=None, skipspam=False): """See what effect on others each msg in unsure has""" ham_cutoff = options["Categorization", "ham_cutoff"] spam_cutoff = options["Categorization", "spam_cutoff"] n = 0 total = 0.0 okalready = set() add = okalready.add for msg in getmbox(unsure): prob = cls.spamprob(tokenize(msg)) n += 1 if prob >= spam_cutoff: add(msg['message-id']) else: total += prob first_mean = total/n print len(okalready), "out of", n, "messages already score as spam" print "initial mean spam prob: %.3f" % first_mean print "%5s %3s %5s %5s %s" % ("prob", "new", "mean", "sdev", "msgid") for msg in getmbox(unsure): msgid = msg['message-id'] if msgids is not None and msgid not in msgids: continue msgprob = cls.spamprob(tokenize(msg)) if skipspam and msgprob >= spam_cutoff: continue n = j = 0 h.train(msg, True) total = 0.0 probs = [] for trial in getmbox(unsure): if trial['message-id'] in okalready: continue n += 1 if n % 10 == 0: counter("", n) prob = cls.spamprob(tokenize(trial)) probs.append(prob) total += prob if prob >= spam_cutoff: j += 1 counter("", n) h.untrain(msg, True) mean = total/n meankey = round(mean, 3) scores.setdefault(meankey, []).append(msgid) sdev = math.sqrt(sum([(mean-prob)**2 for prob in probs])/n) print "\r%.3f %3d %.3f %.3f %s" % (msgprob, j, mean, sdev, msgid)
def main(fp): charset = locale.getdefaultlocale()[1] if not charset: charset = 'us-ascii' db = pspam.database.open() r = db.open().root() p = r["profile"] msg = email.message_from_file(fp) prob, evidence = p.classifier.spamprob(tokenize(msg), True) print "Score:", prob print print "Clues" print "-----" for clue, prob in evidence: if isinstance(clue, UnicodeType): clue = clue.encode(charset, 'replace') print clue, prob
def main(args):
    """Command-line entry point; returns the process exit status.

    Options: -h/--help, -r/--re, -t/--tokenize and -o/--option.  The
    -r and -t options are mutually exclusive.  With -t the tokens come
    from a message on stdin instead of the command line.
    """
    try:
        opts, args = getopt.getopt(args, "hrto:",
                                   ["help", "re", "tokenize", "option="])
    except getopt.GetoptError as msg:
        usage(msg)
        return 1

    use_regex = False
    from_stdin = False
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-r", "--re"):
            use_regex = True
        elif opt in ("-t", "--tokenize"):
            from_stdin = True
        elif opt in ("-o", "--option"):
            options.set_from_cmdline(arg, sys.stderr)

    if use_regex and from_stdin:
        usage("-r and -t may not be used at the same time")
        return 1

    dbname, usedb = database_type(opts)
    db = open_storage(dbname, usedb)

    if from_stdin:
        args = tokenize(sys.stdin)

    if not args:
        usage("need tokens on cmd line or -t w/ msg on stdin")
        return 1
    print_spamcounts(args, db, use_regex)
    return 0
def filter_message(hamdir, spamdir):
    """Deliver the message on stdin to the ham or spam directory.

    The message is spooled to a temporary file created via
    maketmp(hamdir) and scored unless it reaches SIZE_LIMIT bytes (large
    messages get a score of 0.0).  The file is then renamed into
    ``spamdir``/new when the score exceeds SPAM_CUTOFF, else into
    ``hamdir``/new.  On any error the temporary file is removed and the
    error re-raised.
    """
    # Give up (exit status 1) if delivery takes longer than a day.
    signal.signal(signal.SIGALRM, lambda s, f: sys.exit(1))
    signal.alarm(24 * 60 * 60)

    tmpfile, pathname, filename = maketmp(hamdir)
    try:
        tmpfile.write(os.environ.get("DTLINE", "")) # delivered-to line
        nbytes = 0
        blocks = []
        while True:
            block = sys.stdin.read(BLOCK_SIZE)
            if not block:
                break
            nbytes += len(block)
            # Only keep the data in memory while it is small enough to score.
            if nbytes < SIZE_LIMIT:
                blocks.append(block)
            tmpfile.write(block)
        tmpfile.close()

        if nbytes < SIZE_LIMIT:
            msgdata = ''.join(blocks)
            del blocks
            msg = email.message_from_string(msgdata)
            del msgdata
            bayes = CdbClassifier(open(DB_FILE, 'rb'))
            prob = bayes.spamprob(tokenize(msg))
        else:
            prob = 0.0

        if prob > SPAM_CUTOFF:
            destdir = spamdir
        else:
            destdir = hamdir
        os.rename(pathname, "%s/new/%s" % (destdir, filename))
    except:
        # Deliberately broad: clean up on anything, then re-raise.
        os.unlink(pathname)
        raise
def print_message_score(msg_name, msg_fp): msg = email.message_from_file(msg_fp) bayes = CdbClassifier(open(DB_FILE, 'rb')) prob, evidence = bayes.spamprob(tokenize(msg), evidence=True) print msg_name, prob for word, prob in evidence: print ' ', repr(word), prob
def train(text, isSpam):
    """Trains the classifier on the given text."""
    bayes.learn(tokenizer.tokenize(text), isSpam)
def classify(text):
    """Classifies the given text, returning the spamprob."""
    return bayes.spamprob(tokenizer.tokenize(text))
def processAndTrain(v, vmoveto, bayes, is_spam, notesindex, log): if is_spam: header_str = options["Headers", "header_spam_string"] else: header_str = options["Headers", "header_ham_string"] print "Training %s" % (header_str,) docstomove = [] doc = v.GetFirstDocument() while doc: message = getMessage(doc) options["Tokenizer", "generate_long_skips"] = False tokens = tokenizer.tokenize(message) nid = doc.NOTEID if notesindex.has_key(nid): trainedas = notesindex[nid] if trainedas == options["Headers", "header_spam_string"] and \ not is_spam: bayes.unlearn(tokens, True) elif trainedas == options["Headers", "header_ham_string"] and \ is_spam: bayes.unlearn(tokens, False) bayes.learn(tokens, is_spam) notesindex[nid] = header_str docstomove += [doc] doc = v.GetNextDocument(doc) for doc in docstomove: doc.RemoveFromFolder(v.Name) doc.PutInFolder(vmoveto.Name) print "%s documents trained" % (len(docstomove),) if log: log.LogAction("%s documents trained" % (len(docstomove),))
def classifyInbox(v, vmoveto, bayes, ldbname, notesindex, log): if len(notesindex.keys()) == 0: firsttime = 1 else: firsttime = 0 docstomove = [] numham = 0 numspam = 0 numuns = 0 numdocs = 0 doc = v.GetFirstDocument() while doc: nid = doc.NOTEID if firsttime: notesindex[nid] = 'never classified' else: if not notesindex.has_key(nid): numdocs += 1 message = getMessage(doc) options["Tokenizer", "generate_long_skips"] = False tokens = tokenizer.tokenize(message) prob, clues = bayes.spamprob(tokens, evidence=True) if prob < options["Categorization", "ham_cutoff"]: disposition = options["Headers", "header_ham_string"] numham += 1 elif prob > options["Categorization", "spam_cutoff"]: disposition = options["Headers", "header_spam_string"] docstomove += [doc] numspam += 1 else: disposition = options["Headers", "header_unsure_string"] numuns += 1 notesindex[nid] = 'classified' try: print "%s spamprob is %s" % (subj[:30], prob) if log: log.LogAction("%s spamprob is %s" % (subj[:30], prob)) except UnicodeError: print "<subject not printed> spamprob is %s" % (prob) if log: log.LogAction("<subject not printed> spamprob " \ "is %s" % (prob,)) item = doc.ReplaceItemValue("Spam", prob) item.IsSummary = True doc.save(False, True, False) doc = v.GetNextDocument(doc) for doc in docstomove: doc.RemoveFromFolder(v.Name) doc.PutInFolder(vmoveto.Name) print "%s documents processed" % (numdocs,) print " %s classified as spam" % (numspam,) print " %s classified as ham" % (numham,) print " %s classified as unsure" % (numuns,) if log: log.LogAction("%s documents processed" % (numdocs,)) log.LogAction(" %s classified as spam" % (numspam,)) log.LogAction(" %s classified as ham" % (numham,)) log.LogAction(" %s classified as unsure" % (numuns,))
def untrain(self, msg, is_spam):
    """Untrain bayes with a message.

    msg can be a string, a file object, or a Message object.

    is_spam should be True if the message is spam, False if not.

    """
    unlearn = self.bayes.unlearn
    unlearn(tokenize(msg), is_spam)
def test_pickle_export(self):
    """Export a pickled classifier to CSV and check the CSV against it."""
    # Create a non-empty pickled classifier to export.
    bayes = PickledClassifier(TEMP_PICKLE_NAME)
    bayes.learn(tokenize(spam1), True)
    bayes.learn(tokenize(good1), False)
    bayes.store()

    sb_dbexpimp.runExport(TEMP_PICKLE_NAME, "pickle", TEMP_CSV_NAME)

    # Reading through the csv module also checks that the file is
    # valid CSV.
    fp = open(TEMP_CSV_NAME, "rb")
    try:
        reader = sb_dbexpimp.csv.reader(fp)
        # The first row holds the message counts.
        (nham, nspam) = reader.next()
        self.assertEqual(int(nham), bayes.nham)
        self.assertEqual(int(nspam), bayes.nspam)
        # Every following row is (word, hamcount, spamcount).
        for (word, hamcount, spamcount) in reader:
            word = sb_dbexpimp.uunquote(word)
            self.assert_(word in bayes._wordinfokeys())
            wi = bayes._wordinfoget(word)
            self.assertEqual(int(hamcount), wi.hamcount)
            self.assertEqual(int(spamcount), wi.spamcount)
    finally:
        # Release the CSV handle even when an assertion fails.
        fp.close()
def train(bayes, msgs, is_spam):
    """Train bayes with all messages from a mailbox."""
    for message in mboxutils.getmbox(msgs):
        tokens = tokenize(message)
        bayes.learn(tokens, is_spam)
def test_train_spam(self):
    """Training one spam message sets nspam and per-token spam counts."""
    self.h.open('c')
    self.h.train_spam(spam1)

    classifier = self.h.h.bayes
    self.assertEqual(classifier.nham, 0)
    self.assertEqual(classifier.nspam, 1)
    for token in tokenize(spam1):
        info = self.h.h.bayes._wordinfoget(token)
        self.assertEqual(info.hamcount, 0)
        self.assertEqual(info.spamcount, 1)
def test_merge_to_dbm(self):
    """Import a CSV file into an existing dbm store and check the merge."""
    # Create a dbm classifier to merge with.
    bayes = DBDictClassifier(TEMP_DBM_NAME)
    # Stuff some messages in it so it's not empty.
    bayes.learn(tokenize(spam1), True)
    bayes.learn(tokenize(good1), False)

    # Save data to check against.
    original_nham = bayes.nham
    original_nspam = bayes.nspam
    original_data = {}
    for key in bayes._wordinfokeys():
        original_data[key] = bayes._wordinfoget(key)

    # Save & Close.
    bayes.store()
    bayes.close()

    # Create a CSV file to import.
    nham, nspam = 3,4
    csv_data = {"this":(2,1), "is":(0,1), "a":(3,4), 'test':(1,1),
                "of":(1,0), "the":(1,2), "import":(3,1)}
    out = open(TEMP_CSV_NAME, "wb")
    out.write("%d,%d\n" % (nham, nspam))
    for word, (ham, spam) in csv_data.items():
        out.write("%s,%s,%s\n" % (word, ham, spam))
    out.close()

    sb_dbexpimp.runImport(TEMP_DBM_NAME, "dbm", False, TEMP_CSV_NAME)

    # Open the converted file and verify that it has all the data from
    # the CSV file (and by opening it, that it is a valid dbm file),
    # and the data from the original dbm database.
    bayes2 = open_storage(TEMP_DBM_NAME, "dbm")
    self.assertEqual(bayes2.nham, nham + original_nham)
    self.assertEqual(bayes2.nspam, nspam + original_nspam)
    words = original_data.keys()[:]
    words.extend(csv_data.keys())
    for word in words:
        word = sb_dbexpimp.uquote(word)
        self.assert_(word in bayes2._wordinfokeys())
        want_ham, want_spam = csv_data.get(word, (0,0))
        old_info = original_data.get(word, None)
        if old_info:
            want_ham += old_info.hamcount
            want_spam += old_info.spamcount
        new_info = bayes2._wordinfoget(word)
        self.assertEqual(want_ham, new_info.hamcount)
        self.assertEqual(want_spam, new_info.spamcount)
def test_train_spam(self):
    """Verify that the classifier gets trained with the message."""
    self.h.open('c')
    self.h.train_spam(spam1)

    classifier = self.h.h.bayes
    self.assertEqual(classifier.nham, 0)
    self.assertEqual(classifier.nspam, 1)
    # Every token must have been counted once, as spam only.
    for token in tokenize(spam1):
        info = self.h.h.bayes._wordinfoget(token)
        self.assertEqual(info.hamcount, 0)
        self.assertEqual(info.spamcount, 1)
def _scoremsg(self, msg, evidence=False):
    """Score a Message.

    msg can be a string, a file object, or a Message object.

    Returns the probability the message is spam.  If evidence is
    true, returns a tuple: (probability, clues), where clues is a
    list of the words which contributed to the score.

    """
    spamprob = self.bayes.spamprob
    return spamprob(tokenize(msg), evidence)
def __init__(self, dir, name):
    """Load the file ``dir``/``name`` and keep its distinct tokens.

    ``self.tag`` is the file's path.  ``self.guts`` ends up as the
    distinct tokens of the file's contents joined by "`~`".  The score
    fields (``prob``, ``probdiff``) start as None and the clue lists
    start empty.
    """
    path = dir + "/" + name
    self.tag = path
    f = open(path, 'rb')
    try:
        self.guts = f.read()
    finally:
        # Close the handle even if the read fails.
        f.close()
    self.prob = None
    self.probdiff = None
    self.allclues = []
    self.clues = []
    # Replace the raw text by its set of tokens.
    self.guts = "`~`".join(set(tokenize(self.guts)))
def test_newdb(self):
    """Check that newdb() replaces an existing classifier database."""
    # Create an existing classifier.
    old = open_storage(TEMP_DBM_NAME, "dbm")
    old.learn(tokenize(spam1), True)
    old.learn(tokenize(good1), False)
    old.store()
    old.close()

    # Create the fresh classifier.
    self.h.newdb()

    # Verify that the classifier isn't open.
    self.assertEqual(self.h.h, None)

    # Verify that any existing classifier with the same name
    # is overwritten.
    fresh = open_storage(TEMP_DBM_NAME, "dbm")
    self.assertEqual(fresh.nham, 0)
    self.assertEqual(fresh.nspam, 0)
    fresh.close()
def __init__(self, dir, name, index=None):
    """Load the file ``dir``/``name`` and keep its distinct tokens.

    ``self.tag`` is the file's path and ``self.index`` the given
    ``index``.  ``self.guts`` ends up as the distinct tokens of the
    file's contents joined by "`~`".  The score fields (``prob``,
    ``probdiff``) start as None and the clue lists start empty.
    """
    path = dir + "/" + name
    self.tag = path
    f = open(path, 'rb')
    try:
        self.guts = f.read()
    finally:
        # Close the handle even if the read fails.
        f.close()
    self.prob = None
    self.probdiff = None
    self.allclues = []
    self.clues = []
    # all words of email, separated by `~`
    self.guts = "`~`".join(set(tokenize(self.guts)))
    self.index = index
def message_parse_error(self, buf):
    """Handle a message that could not be parsed.

    We've already told the client to expect more bytes than this buffer
    contains, but there's no clean way to add the header.  The raw
    buffer is logged and scored, and the score header is inserted after
    the first line.  Returns the modified text.
    """
    self.server.log.write("# error: %s\n" % repr(buf))
    # XXX what to do?  let's just add it after the first line
    score = self.server.classifier.spamprob(tokenize(buf))
    lines = buf.split("\n")
    lines.insert(1, HEADER % score)
    return "\n".join(lines)
def train(self, msg, is_spam, add_header=False):
    """Train bayes with a message.

    msg can be a string, a file object, or a Message object.

    is_spam should be 1 if the message is spam, 0 if not.

    If add_header is True, add a header with how it was trained (in
    case we need to untrain later)

    """
    self.bayes.learn(tokenize(msg), is_spam)
    if not add_header:
        return

    if is_spam:
        trained = options["Headers", "header_spam_string"]
    else:
        trained = options["Headers", "header_ham_string"]
    # Drop any previous trained header before adding the new one.
    del msg[options["Headers", "trained_header_name"]]
    msg.add_header(options["Headers", "trained_header_name"], trained)
def setUp(self):
    """Build the test message plus reference probabilities and clues."""
    self.msg = email.message_from_string(spam1, _class=SBHeaderMessage)

    # Get a prob and some clues.
    classifier = Classifier()
    # Score of the ham message against an untrained classifier.
    self.u_prob, _ignored = classifier.spamprob(tokenize(good1), True)
    # Score of the ham message after learning it as ham.
    classifier.learn(tokenize(good1), False)
    self.g_prob, _ignored = classifier.spamprob(tokenize(good1), True)
    # Swap the training: forget the ham, learn the spam, score the spam.
    classifier.unlearn(tokenize(good1), False)
    classifier.learn(tokenize(spam1), True)
    self.s_prob, self.clues = classifier.spamprob(tokenize(spam1), True)

    self.ham = options['Headers', 'header_ham_string']
    self.spam = options['Headers', 'header_spam_string']
    self.unsure = options['Headers', 'header_unsure_string']

    self.to = "[email protected];[email protected]"
    self.msg["to"] = self.to
def processAndTrain(v, vmoveto, bayes, is_spam, notesindex, log): if is_spam: header_str = options["Headers", "header_spam_string"] else: header_str = options["Headers", "header_ham_string"] print "Training %s" % (header_str,) docstomove = [] doc = v.GetFirstDocument() while doc: message = getMessage(doc) options["Tokenizer", "generate_long_skips"] = False tokens = tokenizer.tokenize(message) nid = doc.NOTEID if notesindex.has_key(nid): trainedas = notesindex[nid] if trainedas == options["Headers", "header_spam_string"] and \ not is_spam: # msg is trained as spam, is to be retrained as ham bayes.unlearn(tokens, True) elif trainedas == options["Headers", "header_ham_string"] and \ is_spam: # msg is trained as ham, is to be retrained as spam bayes.unlearn(tokens, False) bayes.learn(tokens, is_spam) notesindex[nid] = header_str docstomove += [doc] doc = v.GetNextDocument(doc) for doc in docstomove: doc.RemoveFromFolder(v.Name) doc.PutInFolder(vmoveto.Name) print "%s documents trained" % (len(docstomove),) if log: log.LogAction("%s documents trained" % (len(docstomove),))
def score_mime(self, msg_text, encoding):
    """Score a message representing a MIME document.

    The msg argument will be a string in the given encoding.

    The message gets the SpamBayes headers added, its classification is
    recorded in the state and, unless this is a test run or message
    caching is off, it is stored in the unknown corpus.  Returns the
    spam probability.
    """
    # XXX Much of this probably belongs in the core server...
    state = self.state
    if state.bayes is None:
        state.create_workers()

    # Get msg_text into canonical string representation.
    # Make sure we have a unicode object...
    if isinstance(msg_text, str):
        msg_text = unicode(msg_text, encoding)
    # ... then encode it as utf-8.
    if isinstance(msg_text, unicode):
        msg_text = msg_text.encode("utf-8")

    msg = message_from_string(msg_text,
                              _class=spambayes.message.SBHeaderMessage)
    tokens = tokenize(msg)

    # XXX Maybe from here on down...
    prob, clues = self.state.bayes.spamprob(tokens, evidence=True)
    msg.addSBHeaders(prob, clues)
    self.state.record_classification(msg.GetClassification(), prob)

    # Cache the message.
    if self.state.is_test or not options["Storage", "cache_messages"]:
        return prob
    msg.setId(self.state.getNewMessageName())
    # Write the message into the Unknown cache.
    makeMessage = self.state.unknownCorpus.makeMessage
    cached = makeMessage(msg.getId(), msg.as_string())
    self.state.unknownCorpus.addMessage(cached)
    return prob
def filter_message(hamdir, spamdir):
    """Read a message from stdin and file it under ham or spam.

    The message is copied to a temporary file and, unless it reaches
    SIZE_LIMIT bytes, scored; oversized messages get a score of 0.0.
    The temporary file is renamed into ``spamdir``/new when the score
    exceeds SPAM_CUTOFF and into ``hamdir``/new otherwise.  If anything
    fails, the temporary file is deleted and the error propagates.
    """
    # Exit with status 1 if we are still running after 24 hours.
    signal.signal(signal.SIGALRM, lambda s, f: sys.exit(1))
    signal.alarm(24 * 60 * 60)

    # write message to temporary file (must be on same partition)
    tmpfile, pathname, filename = maketmp(hamdir)
    try:
        tmpfile.write(os.environ.get("DTLINE", "")) # delivered-to line
        size = 0
        blocks = []
        while True:
            block = sys.stdin.read(BLOCK_SIZE)
            if not block:
                break
            size += len(block)
            # Stop buffering once the message is too big to be scored.
            if size < SIZE_LIMIT:
                blocks.append(block)
            tmpfile.write(block)
        tmpfile.close()

        if size < SIZE_LIMIT:
            msgdata = ''.join(blocks)
            del blocks
            msg = email.message_from_string(msgdata)
            del msgdata
            bayes = CdbClassifier(open(DB_FILE, 'rb'))
            prob = bayes.spamprob(tokenize(msg))
        else:
            prob = 0.0

        if prob > SPAM_CUTOFF:
            target = spamdir
        else:
            target = hamdir
        os.rename(pathname, "%s/new/%s" % (target, filename))
    except:
        # Deliberately broad: remove the temp file, then re-raise.
        os.unlink(pathname)
        raise
if opt in ("-h", "--help"): usage() return 0 elif opt in ("-r", "--re"): usere = True elif opt in ("-t", "--tokenize"): tokenizestdin = True elif opt in ('-o', '--option'): options.set_from_cmdline(arg, sys.stderr) if usere and tokenizestdin: usage("-r and -t may not be used at the same time") return 1 dbname, usedb = database_type(opts) db = open_storage(dbname, usedb) if tokenizestdin: args = tokenize(sys.stdin) if args: print_spamcounts(args, db, usere) return 0 else: usage("need tokens on cmd line or -t w/ msg on stdin") return 1 if __name__ == "__main__": sys.exit(main(sys.argv[1:]))
def __iter__(self):
    """Iterate over the tokens produced by tokenizing ``self.guts``."""
    tokens = tokenize(self.guts)
    return tokens
def test_tokenize(self):
    """The message's own tokenize() must match the module tokenizer."""
    actual_tokens = self.msg.tokenize()
    expected = tuple(tokenize(spam1))
    self.assertEqual(expected, tuple(actual_tokens))
def train(store, hambox, spambox, maxmsgs, maxrounds, tdict, reverse, verbose, ratio): round = 0 ham_cutoff = Options.options["Categorization", "ham_cutoff"] spam_cutoff = Options.options["Categorization", "spam_cutoff"] # list-ify ham and spam iterators immediately. We don't really want to # fetch the messages multiple times, and this is no worse than what happened # before when -R was passed. hambone_ = list(mboxutils.getmbox(hambox)) spamcan_ = list(mboxutils.getmbox(spambox)) if reverse: hambone_ = list(reversed(hambone_)) spamcan_ = list(reversed(spamcan_)) nspam, nham = len(spamcan_), len(hambone_) if ratio: rspam, rham = ratio # If the actual ratio of spam to ham in the database is better than # what was asked for, use that better ratio. if (rspam > rham) == (rspam * nham > rham * nspam): rspam, rham = nspam, nham # define some indexing constants ham = 0 spam = 1 name = ('ham', 'spam') misses = [0, 0] misclassified = lambda is_spam, score: (is_spam and score < spam_cutoff or not is_spam and score > ham_cutoff) while round < maxrounds and (misses[ham] or misses[spam] or round == 0): round += 1 if verbose: print >> sys.stderr, "*** round", round, "***" start = datetime.datetime.now() hambone = iter(hambone_) spamcan = iter(spamcan_) i = [0, 0] msgs_processed = 0 misses = [0, 0] training_sets = [hambone, spamcan] while not maxmsgs or msgs_processed < maxmsgs: # should the next message come from hambone or spamcan? 
train_spam = i[ham] * rspam > i[spam] * rham try: train_msg = training_sets[train_spam].next() except StopIteration: break i[train_spam] += 1 msgs_processed += 1 sys.stdout.write("\r%5d" % msgs_processed) sys.stdout.flush() tokens = list(tokenize(train_msg)) score = store.spamprob(tokens) selector = train_msg["message-id"] or train_msg["subject"] if misclassified(train_spam, score) and selector is not None: if verbose: print >> sys.stderr, "\tmiss %s: %.6f %s" % ( name[train_spam], score, selector) misses[train_spam] += 1 tdict[train_msg["message-id"]] = True store.learn(tokens, train_spam) delta = datetime.datetime.now() - start seconds = delta.seconds + delta.microseconds / 1000000 print "\rround: %2d, msgs: %4d, ham misses: %3d, spam misses: %3d, %.1fs" % \ (round, msgs_processed, misses[0], misses[1], seconds) training_sets = [hambone, spamcan] # We count all untrained messages so the user knows what was skipped. # We also tag them for saving so we don't lose messages which might have # value in a future run for is_spam in ham, spam: nleft = 0 try: while True: msg = training_sets[is_spam].next() score = store.spamprob(tokenize(msg)) if misclassified(is_spam, score): tdict[msg["message-id"]] = True nleft += 1 except StopIteration: if nleft: print nleft, "untrained %ss" % name[is_spam]
def tokenize(self):
    """Return the result of the module-level tokenize() for this object."""
    tokens = tokenize(self)
    return tokens
def classifyInbox(v, vmoveto, bayes, ldbname, notesindex, log): # the notesindex hash ensures that a message is looked at only once if len(notesindex.keys()) == 0: firsttime = 1 else: firsttime = 0 docstomove = [] numham = 0 numspam = 0 numuns = 0 numdocs = 0 doc = v.GetFirstDocument() while doc: nid = doc.NOTEID if firsttime: notesindex[nid] = 'never classified' else: if not notesindex.has_key(nid): numdocs += 1 # Notes returns strings in unicode, and the Python # decoder has trouble with these strings when # you try to print them. So don't... message = getMessage(doc) # generate_long_skips = True blows up on occasion, # probably due to this unicode problem. options["Tokenizer", "generate_long_skips"] = False tokens = tokenizer.tokenize(message) prob = bayes.spamprob(tokens) if prob < options["Categorization", "ham_cutoff"]: numham += 1 elif prob > options["Categorization", "spam_cutoff"]: docstomove += [doc] numspam += 1 else: numuns += 1 notesindex[nid] = 'classified' subj = message["subject"] try: print "%s spamprob is %s" % (subj[:30], prob) if log: log.LogAction("%s spamprob is %s" % (subj[:30], prob)) except UnicodeError: print "<subject not printed> spamprob is %s" % (prob) if log: log.LogAction("<subject not printed> spamprob " \ "is %s" % (prob,)) item = doc.ReplaceItemValue("Spam", prob) item.IsSummary = True doc.save(False, True, False) doc = v.GetNextDocument(doc) # docstomove list is built because moving documents in the middle of # the classification loop loses the iterator position for doc in docstomove: doc.RemoveFromFolder(v.Name) doc.PutInFolder(vmoveto.Name) print "%s documents processed" % (numdocs,) print " %s classified as spam" % (numspam,) print " %s classified as ham" % (numham,) print " %s classified as unsure" % (numuns,) if log: log.LogAction("%s documents processed" % (numdocs,)) log.LogAction(" %s classified as spam" % (numspam,)) log.LogAction(" %s classified as ham" % (numham,)) log.LogAction(" %s classified as unsure" % (numuns,))