def set_classifier(self, classifier): """Specify a classifier to be used for further testing.""" self.classifier = classifier self.tester = Tester.Test() self.tester.set_classifier(classifier) self.trained_ham_hist = Hist() self.trained_spam_hist = Hist()
def __init__(self): self.falsepos = set() self.falseneg = set() self.unsure = set() self.global_ham_hist = Hist() self.global_spam_hist = Hist() self.ntimes_finishtest_called = 0 self.new_classifier() from spambayes import CostCounter self.cc = CostCounter.default()
def pol_test(self, ham, spam, update=False, all_opt=False): c = self.classifier t = self.tester local_ham_hist = Hist() local_spam_hist = Hist() t.reset_test_results() print "-> Predicting", ham, "&", spam, "..." t.train_predict(ham, 3, update=update, all_opt=all_opt) t.train_predict(spam, 2, update=update, all_opt=all_opt)
def dict_test(self, ham, spam): c = self.classifier t = self.tester local_ham_hist = Hist() local_spam_hist = Hist() t.reset_test_results() print "-> Predicting", ham, "&", spam, "..." t.train_predict(ham, 3) t.train_predict(spam, 2)
def finishtest(self): if options["TestDriver", "show_histograms"]: printhist("all in this training set:", self.trained_ham_hist, self.trained_spam_hist) self.global_ham_hist += self.trained_ham_hist self.global_spam_hist += self.trained_spam_hist self.trained_ham_hist = Hist() self.trained_spam_hist = Hist() self.ntimes_finishtest_called += 1 if options["TestDriver", "save_trained_pickles"]: fname = "%s%d.pik" % (options["TestDriver", "pickle_basename"], self.ntimes_finishtest_called) print " saving pickle to", fname pickle_write(fname, self.classifier, 1)
def main(): from spambayes.Histogram import Hist import sys class WrappedRandom: # There's no way W-H is equidistributed in 50 dimensions, so use # Marsaglia-wrapping to shuffle it more. def __init__(self, baserandom=random.random, tabsize=513): self.baserandom = baserandom self.n = tabsize self.tab = [baserandom() for i in range(tabsize)] self.next = baserandom() def random(self): result = self.next i = int(result * self.n) self.next = self.tab[i] self.tab[i] = self.baserandom() return result random = WrappedRandom().random #from uni import uni as random #print random def judge(ps, ln=_math.log, ln2=_math.log(2), frexp=_math.frexp): H = S = 1.0 Hexp = Sexp = 0 for p in ps: S *= 1.0 - p H *= p if S < 1e-200: S, e = frexp(S) Sexp += e if H < 1e-200: H, e = frexp(H) Hexp += e S = ln(S) + Sexp * ln2 H = ln(H) + Hexp * ln2 n = len(ps) S = 1.0 - chi2Q(-2.0 * S, 2*n) H = 1.0 - chi2Q(-2.0 * H, 2*n) return S, H, (S-H + 1.0) / 2.0 warp = 0 bias = 0.99 if len(sys.argv) > 1: warp = int(sys.argv[1]) if len(sys.argv) > 2: bias = float(sys.argv[2]) h = Hist(20, lo=0.0, hi=1.0) s = Hist(20, lo=0.0, hi=1.0) score = Hist(20, lo=0.0, hi=1.0) for i in range(5000): ps = [random() for j in range(50)] s1, h1, score1 = judge(ps + [bias] * warp) s.add(s1) h.add(h1) score.add(score1) print "Result for random vectors of 50 probs, +", warp, "forced to", bias # Should be uniformly distributed on all-random data. print print 'H', h.display() # Should be uniformly distributed on all-random data. print print 'S', s.display() # Distribution doesn't really matter. print print '(S-H+1)/2', score.display()
def main(): from spambayes.Histogram import Hist import sys class WrappedRandom: def __init__(self, baserandom=random.random, tabsize=513): self.baserandom = baserandom self.n = tabsize self.tab = [baserandom() for i in range(tabsize)] self.next = baserandom() def random(self): result = self.next i = int(result * self.n) self.next = self.tab[i] self.tab[i] = self.baserandom() return result random = WrappedRandom().random def judge(ps, ln=_math.log, ln2=_math.log(2), frexp=_math.frexp): H = S = 1.0 Hexp = Sexp = 0 for p in ps: S *= 1.0 - p H *= p if S < 1e-200: S, e = frexp(S) Sexp += e if H < 1e-200: H, e = frexp(H) Hexp += e S = ln(S) + Sexp * ln2 H = ln(H) + Hexp * ln2 n = len(ps) S = 1.0 - chi2Q(-2.0 * S, 2*n) H = 1.0 - chi2Q(-2.0 * H, 2*n) return S, H, (S-H + 1.0) / 2.0 warp = 0 bias = 0.99 if len(sys.argv) > 1: warp = int(sys.argv[1]) if len(sys.argv) > 2: bias = float(sys.argv[2]) h = Hist(20, lo=0.0, hi=1.0) s = Hist(20, lo=0.0, hi=1.0) score = Hist(20, lo=0.0, hi=1.0) for i in range(5000): ps = [random() for j in range(50)] s1, h1, score1 = judge(ps + [bias] * warp) s.add(s1) h.add(h1) score.add(score1) print "Result for random vectors of 50 probs, +", warp, "forced to", bias print print 'H', h.display() print print 'S', s.display() print print '(S-H+1)/2', score.display()
def test(self, ham, spam): c = self.classifier t = self.tester local_ham_hist = Hist() local_spam_hist = Hist() def new_ham(msg, prob, lo=options["TestDriver", "show_ham_lo"], hi=options["TestDriver", "show_ham_hi"]): local_ham_hist.add(prob * 100.0) self.cc.ham(prob) if lo <= prob <= hi: print print "Ham with prob =", prob prob, clues = c.spamprob(msg, True) printmsg(msg, prob, clues) def new_spam(msg, prob, lo=options["TestDriver", "show_spam_lo"], hi=options["TestDriver", "show_spam_hi"]): local_spam_hist.add(prob * 100.0) self.cc.spam(prob) if lo <= prob <= hi: print print "Spam with prob =", prob prob, clues = c.spamprob(msg, True) printmsg(msg, prob, clues) t.reset_test_results() print "-> Predicting", ham, "&", spam, "..." t.predict(spam, True, new_spam) t.predict(ham, False, new_ham) print "-> <stat> tested", t.nham_tested, "hams &", t.nspam_tested, \ "spams against", c.nham, "hams &", c.nspam, "spams" print "-> <stat> false positive %:", t.false_positive_rate() print "-> <stat> false negative %:", t.false_negative_rate() print "-> <stat> unsure %:", t.unsure_rate() print "-> <stat> cost: $%.2f" % ( t.nham_wrong * options["TestDriver", "best_cutoff_fp_weight"] + t.nspam_wrong * options["TestDriver", "best_cutoff_fn_weight"] + (t.nham_unsure + t.nspam_unsure) * options["TestDriver", "best_cutoff_unsure_weight"]) newfpos = set(t.false_positives()) - self.falsepos self.falsepos |= newfpos print "-> <stat> %d new false positives" % len(newfpos) if newfpos: print " new fp:", [e.tag for e in newfpos] if not options["TestDriver", "show_false_positives"]: newfpos = () for e in newfpos: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) newfneg = set(t.false_negatives()) - self.falseneg self.falseneg |= newfneg print "-> <stat> %d new false negatives" % len(newfneg) if newfneg: print " new fn:", [e.tag for e in newfneg] if not options["TestDriver", "show_false_negatives"]: newfneg = () for e in newfneg: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) newunsure = set(t.unsures()) - self.unsure self.unsure |= newunsure print "-> <stat> %d new unsure" % len(newunsure) if newunsure: print " new unsure:", [e.tag for e in newunsure] if not options["TestDriver", "show_unsure"]: newunsure = () for e in newunsure: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) if options["TestDriver", "show_histograms"]: printhist("this pair:", local_ham_hist, local_spam_hist) self.trained_ham_hist += local_ham_hist self.trained_spam_hist += local_spam_hist
class Driver: def __init__(self): self.falsepos = set() self.falseneg = set() self.unsure = set() self.global_ham_hist = Hist() self.global_spam_hist = Hist() self.ntimes_finishtest_called = 0 self.new_classifier() from spambayes import CostCounter self.cc = CostCounter.default() def new_classifier(self): """Create and use a new, virgin classifier.""" self.set_classifier(classifier.Bayes()) def set_classifier(self, classifier): """Specify a classifier to be used for further testing.""" self.classifier = classifier self.tester = Tester.Test(classifier) self.trained_ham_hist = Hist() self.trained_spam_hist = Hist() def train(self, ham, spam): print "-> Training on", ham, "&", spam, "...", c = self.classifier nham, nspam = c.nham, c.nspam self.tester.train(ham, spam) print c.nham - nham, "hams &", c.nspam - nspam, "spams" def untrain(self, ham, spam): print "-> Forgetting", ham, "&", spam, "...", c = self.classifier nham, nspam = c.nham, c.nspam self.tester.untrain(ham, spam) print nham - c.nham, "hams &", nspam - c.nspam, "spams" def finishtest(self): if options["TestDriver", "show_histograms"]: printhist("all in this training set:", self.trained_ham_hist, self.trained_spam_hist) self.global_ham_hist += self.trained_ham_hist self.global_spam_hist += self.trained_spam_hist self.trained_ham_hist = Hist() self.trained_spam_hist = Hist() self.ntimes_finishtest_called += 1 if options["TestDriver", "save_trained_pickles"]: fname = "%s%d.pik" % (options["TestDriver", "pickle_basename"], self.ntimes_finishtest_called) print " saving pickle to", fname pickle_write(fname, self.classifier, 1) def alldone(self): if options["TestDriver", "show_histograms"]: besthamcut, bestspamcut = printhist("all runs:", self.global_ham_hist, self.global_spam_hist) else: besthamcut = options["Categorization", "ham_cutoff"] bestspamcut = options["Categorization", "spam_cutoff"] self.global_ham_hist.compute_stats() self.global_spam_hist.compute_stats() nham = self.global_ham_hist.n nspam = self.global_spam_hist.n nfp = len(self.falsepos) nfn = len(self.falseneg) nun = len(self.unsure) print "-> <stat> all runs false positives:", nfp print "-> <stat> all runs false negatives:", nfn print "-> <stat> all runs unsure:", nun print "-> <stat> all runs false positive %:", (nfp * 1e2 / nham) print "-> <stat> all runs false negative %:", (nfn * 1e2 / nspam) print "-> <stat> all runs unsure %:", (nun * 1e2 / (nham + nspam)) print "-> <stat> all runs cost: $%.2f" % ( nfp * options["TestDriver", "best_cutoff_fp_weight"] + nfn * options["TestDriver", "best_cutoff_fn_weight"] + nun * options["TestDriver", "best_cutoff_unsure_weight"]) # Set back the options for the delayed calculations in self.cc options["Categorization", "ham_cutoff"] = besthamcut options["Categorization", "spam_cutoff"] = bestspamcut print self.cc if options["TestDriver", "save_histogram_pickles"]: for f, h in (('ham', self.global_ham_hist), ('spam', self.global_spam_hist)): fname = "%s_%shist.pik" % (options["TestDriver", "pickle_basename"], f) print " saving %s histogram pickle to %s" % (f, fname) pickle_write(fname, h, 1) def test(self, ham, spam): c = self.classifier t = self.tester local_ham_hist = Hist() local_spam_hist = Hist() def new_ham(msg, prob, lo=options["TestDriver", "show_ham_lo"], hi=options["TestDriver", "show_ham_hi"]): local_ham_hist.add(prob * 100.0) self.cc.ham(prob) if lo <= prob <= hi: print print "Ham with prob =", prob prob, clues = c.spamprob(msg, True) printmsg(msg, prob, clues) def new_spam(msg, prob, lo=options["TestDriver", "show_spam_lo"], hi=options["TestDriver", "show_spam_hi"]): local_spam_hist.add(prob * 100.0) self.cc.spam(prob) if lo <= prob <= hi: print print "Spam with prob =", prob prob, clues = c.spamprob(msg, True) printmsg(msg, prob, clues) t.reset_test_results() print "-> Predicting", ham, "&", spam, "..." t.predict(spam, True, new_spam) t.predict(ham, False, new_ham) print "-> <stat> tested", t.nham_tested, "hams &", t.nspam_tested, \ "spams against", c.nham, "hams &", c.nspam, "spams" print "-> <stat> false positive %:", t.false_positive_rate() print "-> <stat> false negative %:", t.false_negative_rate() print "-> <stat> unsure %:", t.unsure_rate() print "-> <stat> cost: $%.2f" % ( t.nham_wrong * options["TestDriver", "best_cutoff_fp_weight"] + t.nspam_wrong * options["TestDriver", "best_cutoff_fn_weight"] + (t.nham_unsure + t.nspam_unsure) * options["TestDriver", "best_cutoff_unsure_weight"]) newfpos = set(t.false_positives()) - self.falsepos self.falsepos |= newfpos print "-> <stat> %d new false positives" % len(newfpos) if newfpos: print " new fp:", [e.tag for e in newfpos] if not options["TestDriver", "show_false_positives"]: newfpos = () for e in newfpos: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) newfneg = set(t.false_negatives()) - self.falseneg self.falseneg |= newfneg print "-> <stat> %d new false negatives" % len(newfneg) if newfneg: print " new fn:", [e.tag for e in newfneg] if not options["TestDriver", "show_false_negatives"]: newfneg = () for e in newfneg: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) newunsure = set(t.unsures()) - self.unsure self.unsure |= newunsure print "-> <stat> %d new unsure" % len(newunsure) if newunsure: print " new unsure:", [e.tag for e in newunsure] if not options["TestDriver", "show_unsure"]: newunsure = () for e in newunsure: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) if options["TestDriver", "show_histograms"]: printhist("this pair:", local_ham_hist, local_spam_hist) self.trained_ham_hist += local_ham_hist self.trained_spam_hist += local_spam_hist
class Driver: def __init__(self): self.falsepos = set() self.falseneg = set() self.unsure = set() self.global_ham_hist = Hist() self.global_spam_hist = Hist() self.ntimes_finishtest_called = 0 self.new_classifier() from spambayes import CostCounter self.cc = CostCounter.default() def new_classifier(self): """Create and use a new, virgin classifier.""" self.set_classifier(classifier.Bayes()) def set_classifier(self, classifier): """Specify a classifier to be used for further testing.""" self.classifier = classifier self.tester = Tester.Test(classifier) self.trained_ham_hist = Hist() self.trained_spam_hist = Hist() def train(self, ham, spam): print "-> Training on", ham, "&", spam, "...", c = self.classifier nham, nspam = c.nham, c.nspam self.tester.train(ham, spam) print c.nham - nham, "hams &", c.nspam- nspam, "spams" def untrain(self, ham, spam): print "-> Forgetting", ham, "&", spam, "...", c = self.classifier nham, nspam = c.nham, c.nspam self.tester.untrain(ham, spam) print nham - c.nham, "hams &", nspam - c.nspam, "spams" def finishtest(self): if options["TestDriver", "show_histograms"]: printhist("all in this training set:", self.trained_ham_hist, self.trained_spam_hist) self.global_ham_hist += self.trained_ham_hist self.global_spam_hist += self.trained_spam_hist self.trained_ham_hist = Hist() self.trained_spam_hist = Hist() self.ntimes_finishtest_called += 1 if options["TestDriver", "save_trained_pickles"]: fname = "%s%d.pik" % (options["TestDriver", "pickle_basename"], self.ntimes_finishtest_called) print " saving pickle to", fname pickle_write(fname, self.classifier, 1) def alldone(self): if options["TestDriver", "show_histograms"]: besthamcut, bestspamcut = printhist("all runs:", self.global_ham_hist, self.global_spam_hist) else: besthamcut = options["Categorization", "ham_cutoff"] bestspamcut = options["Categorization", "spam_cutoff"] self.global_ham_hist.compute_stats() self.global_spam_hist.compute_stats() nham = self.global_ham_hist.n nspam = self.global_spam_hist.n nfp = len(self.falsepos) nfn = len(self.falseneg) nun = len(self.unsure) print "-> <stat> all runs false positives:", nfp print "-> <stat> all runs false negatives:", nfn print "-> <stat> all runs unsure:", nun print "-> <stat> all runs false positive %:", (nfp * 1e2 / nham) print "-> <stat> all runs false negative %:", (nfn * 1e2 / nspam) print "-> <stat> all runs unsure %:", (nun * 1e2 / (nham + nspam)) print "-> <stat> all runs cost: $%.2f" % ( nfp * options["TestDriver", "best_cutoff_fp_weight"] + nfn * options["TestDriver", "best_cutoff_fn_weight"] + nun * options["TestDriver", "best_cutoff_unsure_weight"]) options["Categorization", "ham_cutoff"] = besthamcut options["Categorization", "spam_cutoff"] = bestspamcut print self.cc if options["TestDriver", "save_histogram_pickles"]: for f, h in (('ham', self.global_ham_hist), ('spam', self.global_spam_hist)): fname = "%s_%shist.pik" % (options["TestDriver", "pickle_basename"], f) print " saving %s histogram pickle to %s" % (f, fname) pickle_write(fname, h, 1) def test(self, ham, spam): c = self.classifier t = self.tester local_ham_hist = Hist() local_spam_hist = Hist() def new_ham(msg, prob, lo=options["TestDriver", "show_ham_lo"], hi=options["TestDriver", "show_ham_hi"]): local_ham_hist.add(prob * 100.0) self.cc.ham(prob) if lo <= prob <= hi: print print "Ham with prob =", prob prob, clues = c.spamprob(msg, True) printmsg(msg, prob, clues) def new_spam(msg, prob, lo=options["TestDriver", "show_spam_lo"], hi=options["TestDriver", "show_spam_hi"]): local_spam_hist.add(prob * 100.0) self.cc.spam(prob) if lo <= prob <= hi: print print "Spam with prob =", prob prob, clues = c.spamprob(msg, True) printmsg(msg, prob, clues) t.reset_test_results() print "-> Predicting", ham, "&", spam, "..." t.predict(spam, True, new_spam) t.predict(ham, False, new_ham) print "-> <stat> tested", t.nham_tested, "hams &", t.nspam_tested, \ "spams against", c.nham, "hams &", c.nspam, "spams" print "-> <stat> false positive %:", t.false_positive_rate() print "-> <stat> false negative %:", t.false_negative_rate() print "-> <stat> unsure %:", t.unsure_rate() print "-> <stat> cost: $%.2f" % ( t.nham_wrong * options["TestDriver", "best_cutoff_fp_weight"] + t.nspam_wrong * options["TestDriver", "best_cutoff_fn_weight"] + (t.nham_unsure + t.nspam_unsure) * options["TestDriver", "best_cutoff_unsure_weight"]) newfpos = set(t.false_positives()) - self.falsepos self.falsepos |= newfpos print "-> <stat> %d new false positives" % len(newfpos) if newfpos: print " new fp:", [e.tag for e in newfpos] if not options["TestDriver", "show_false_positives"]: newfpos = () for e in newfpos: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) newfneg = set(t.false_negatives()) - self.falseneg self.falseneg |= newfneg print "-> <stat> %d new false negatives" % len(newfneg) if newfneg: print " new fn:", [e.tag for e in newfneg] if not options["TestDriver", "show_false_negatives"]: newfneg = () for e in newfneg: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) newunsure = set(t.unsures()) - self.unsure self.unsure |= newunsure print "-> <stat> %d new unsure" % len(newunsure) if newunsure: print " new unsure:", [e.tag for e in newunsure] if not options["TestDriver", "show_unsure"]: newunsure = () for e in newunsure: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) if options["TestDriver", "show_histograms"]: printhist("this pair:", local_ham_hist, local_spam_hist) self.trained_ham_hist += local_ham_hist self.trained_spam_hist += local_spam_hist
def main(): from spambayes.Histogram import Hist import sys class WrappedRandom: # There's no way W-H is equidistributed in 50 dimensions, so use # Marsaglia-wrapping to shuffle it more. def __init__(self, baserandom=random.random, tabsize=513): self.baserandom = baserandom self.n = tabsize self.tab = [baserandom() for i in range(tabsize)] self.next = baserandom() def random(self): result = self.next i = int(result * self.n) self.next = self.tab[i] self.tab[i] = self.baserandom() return result random = WrappedRandom().random #from uni import uni as random #print random def judge(ps, ln=_math.log, ln2=_math.log(2), frexp=_math.frexp): H = S = 1.0 Hexp = Sexp = 0 for p in ps: S *= 1.0 - p H *= p if S < 1e-200: S, e = frexp(S) Sexp += e if H < 1e-200: H, e = frexp(H) Hexp += e S = ln(S) + Sexp * ln2 H = ln(H) + Hexp * ln2 n = len(ps) S = 1.0 - chi2Q(-2.0 * S, 2 * n) H = 1.0 - chi2Q(-2.0 * H, 2 * n) return S, H, (S - H + 1.0) / 2.0 warp = 0 bias = 0.99 if len(sys.argv) > 1: warp = int(sys.argv[1]) if len(sys.argv) > 2: bias = float(sys.argv[2]) h = Hist(20, lo=0.0, hi=1.0) s = Hist(20, lo=0.0, hi=1.0) score = Hist(20, lo=0.0, hi=1.0) for i in range(5000): ps = [random() for j in range(50)] s1, h1, score1 = judge(ps + [bias] * warp) s.add(s1) h.add(h1) score.add(score1) print "Result for random vectors of 50 probs, +", warp, "forced to", bias # Should be uniformly distributed on all-random data. print print 'H', h.display() # Should be uniformly distributed on all-random data. print print 'S', s.display() # Distribution doesn't really matter. print print '(S-H+1)/2', score.display()
class Driver : def __init__(self): self.falsepos = Set() self.falseneg = Set() self.unsure = Set() self.global_ham_hist = Hist() self.global_spam_hist = Hist() self.ntimes_finishtest_called = 0 self.new_classifier() from spambayes import CostCounter self.cc=CostCounter.default() def new_classifier(self): """Create and use a new, virgin classifier.""" self.set_classifier(classifier.Bayes()) def set_classifier(self, classifier): """Specify a classifier to be used for further testing.""" self.classifier = classifier self.tester = Tester.Test(classifier) self.trained_ham_hist = Hist() self.trained_spam_hist = Hist() def train(self, ham, spam): print "-> Training on", ham, "&", spam, "...", c = self.classifier nham, nspam = c.nham, c.nspam self.tester.train(ham, spam) print c.nham - nham, "hams &", c.nspam- nspam, "spams" def untrain(self, ham, spam): print "-> Forgetting", ham, "&", spam, "...", c = self.classifier nham, nspam = c.nham, c.nspam self.tester.untrain(ham, spam) print nham - c.nham, "hams &", nspam - c.nspam, "spams" def finishtest(self): if options["TestDriver", "show_histograms"]: printhist("all in this training set:", self.trained_ham_hist, self.trained_spam_hist) self.global_ham_hist += self.trained_ham_hist self.global_spam_hist += self.trained_spam_hist self.trained_ham_hist = Hist() self.trained_spam_hist = Hist() self.ntimes_finishtest_called += 1 if options["TestDriver", "save_trained_pickles"]: fname = "%s%d.pik" % (options["TestDriver", "pickle_basename"], self.ntimes_finishtest_called) print " saving pickle to", fname fp = file(fname, 'wb') pickle.dump(self.classifier, fp, 1) fp.close() def alldone(self): if options["TestDriver", "show_histograms"]: besthamcut,bestspamcut = printhist("all runs:", self.global_ham_hist, self.global_spam_hist) else: besthamcut = options["Categorization", "ham_cutoff"] bestspamcut = options["Categorization", "spam_cutoff"] self.global_ham_hist.compute_stats() self.global_spam_hist.compute_stats() nham = self.global_ham_hist.n nspam = self.global_spam_hist.n nfp = len(self.falsepos) nfn = len(self.falseneg) nun = len(self.unsure) print "-> <stat> all runs false positives:", nfp print "-> <stat> all runs false negatives:", nfn print "-> <stat> all runs unsure:", nun print "-> <stat> all runs false positive %:", (nfp * 1e2 / nham) print "-> <stat> all runs false negative %:", (nfn * 1e2 / nspam) print "-> <stat> all runs unsure %:", (nun * 1e2 / (nham + nspam)) print "-> <stat> all runs cost: $%.2f" % ( nfp * options["TestDriver", "best_cutoff_fp_weight"] + nfn * options["TestDriver", "best_cutoff_fn_weight"] + nun * options["TestDriver", "best_cutoff_unsure_weight"]) options["Categorization", "ham_cutoff"] = besthamcut options["Categorization", "spam_cutoff"] = bestspamcut print self.cc if options["TestDriver", "save_histogram_pickles"]: for f, h in (('ham', self.global_ham_hist), ('spam', self.global_spam_hist)): fname = "%s_%shist.pik" % (options["TestDriver", "pickle_basename"], f) print " saving %s histogram pickle to %s" %(f, fname) fp = file(fname, 'wb') pickle.dump(h, fp, 1) fp.close() def test(self, ham, spam): c = self.classifier t = self.tester local_ham_hist = Hist() local_spam_hist = Hist() def new_ham(msg, prob, lo=options["TestDriver", "show_ham_lo"], hi=options["TestDriver", "show_ham_hi"]): local_ham_hist.add(prob * 100.0) self.cc.ham(prob) if lo <= prob <= hi: print print "Ham with prob =", prob prob, clues = c.spamprob(msg, True) printmsg(msg, prob, clues) def new_spam(msg, prob, lo=options["TestDriver", "show_spam_lo"], hi=options["TestDriver", "show_spam_hi"]): local_spam_hist.add(prob * 100.0) self.cc.spam(prob) if lo <= prob <= hi: print print "Spam with prob =", prob prob, clues = c.spamprob(msg, True) printmsg(msg, prob, clues) t.reset_test_results() print "-> Predicting", ham, "&", spam, "..." t.predict(spam, True, new_spam) t.predict(ham, False, new_ham) print "-> <stat> tested", t.nham_tested, "hams &", t.nspam_tested, \ "spams against", c.nham, "hams &", c.nspam, "spams" print "-> <stat> false positive %:", t.false_positive_rate() print "-> <stat> false negative %:", t.false_negative_rate() print "-> <stat> unsure %:", t.unsure_rate() print "-> <stat> cost: $%.2f" % ( t.nham_wrong * options["TestDriver", "best_cutoff_fp_weight"] + t.nspam_wrong * options["TestDriver", "best_cutoff_fn_weight"] + (t.nham_unsure + t.nspam_unsure) * options["TestDriver", "best_cutoff_unsure_weight"]) newfpos = Set(t.false_positives()) - self.falsepos self.falsepos |= newfpos print "-> <stat> %d new false positives" % len(newfpos) if newfpos: print " new fp:", [e.tag for e in newfpos] if not options["TestDriver", "show_false_positives"]: newfpos = () for e in newfpos: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) newfneg = Set(t.false_negatives()) - self.falseneg self.falseneg |= newfneg print "-> <stat> %d new false negatives" % len(newfneg) if newfneg: print " new fn:", [e.tag for e in newfneg] if not options["TestDriver", "show_false_negatives"]: newfneg = () for e in newfneg: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) newunsure = Set(t.unsures()) - self.unsure self.unsure |= newunsure print "-> <stat> %d new unsure" % len(newunsure) if newunsure: print " new unsure:", [e.tag for e in newunsure] if not options["TestDriver", "show_unsure"]: newunsure = () for e in newunsure: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) if options["TestDriver", "show_histograms"]: printhist("this pair:", local_ham_hist, local_spam_hist) self.trained_ham_hist += local_ham_hist self.trained_spam_hist += local_spam_hist try: Set = set except NameError: try: from sets import Set except ImportError: from spambayes.compatsets import Set try: from heapq import heapreplace except ImportError: from spambayes.compatheapq import heapreplace try: True, False except NameError: True, False = 1, 0 def printhist(tag, ham, spam, nbuckets=options["TestDriver", "nbuckets"]): print print "-> <stat> Ham scores for", tag, ham.display(nbuckets) print print "-> <stat> Spam scores for", tag, spam.display(nbuckets) if not options["TestDriver", "compute_best_cutoffs_from_histograms"]: return if ham.n == 0 or spam.n == 0: return assert ham.nbuckets == spam.nbuckets n = ham.nbuckets FPW = options["TestDriver", "best_cutoff_fp_weight"] FNW = options["TestDriver", "best_cutoff_fn_weight"] UNW = options["TestDriver", "best_cutoff_unsure_weight"] htotal = [0] * (n+1) stotal = [0] * (n+1) for i in range(1, n+1): htotal[i] = htotal[i-1] + ham.buckets[i-1] stotal[i] = stotal[i-1] + spam.buckets[i-1] assert htotal[-1] == ham.n assert stotal[-1] == spam.n best_cost = 1e200 bests = [] for h in range(n+1): num_fn = stotal[h] fn_cost = num_fn * FNW for s in xrange(h, n+1): num_fp = htotal[-1] - htotal[s] num_un = htotal[s] - htotal[h] + stotal[s] - stotal[h] cost = num_fp * FPW + fn_cost + num_un * UNW if cost <= best_cost: if cost < best_cost: best_cost = cost bests = [] bests.append((h, s)) print '-> best cost for %s $%.2f' % (tag, best_cost) print '-> per-fp cost $%.2f; per-fn cost $%.2f; per-unsure cost $%.2f' % ( FPW, FNW, UNW) if len(bests) > 1: print '-> achieved at', len(bests), 'cutoff pairs' info = [('smallest ham & spam cutoffs', bests[0]), ('largest ham & spam cutoffs', bests[-1])] else: info = [('achieved at ham & spam cutoffs', bests[0])] for tag, (h, s) in info: print '-> %s %g & %g' % (tag, float(h)/n, float(s)/n) num_fn = stotal[h] num_fp = htotal[-1] - htotal[s] num_unh = htotal[s] - htotal[h] num_uns = stotal[s] - stotal[h] print '-> fp %d; fn %d; unsure ham %d; unsure spam %d' % ( num_fp, num_fn, num_unh, num_uns) print '-> fp rate %.3g%%; fn rate %.3g%%; unsure rate %.3g%%' % ( num_fp*1e2 / ham.n, num_fn*1e2 / spam.n, (num_unh + num_uns)*1e2 / (ham.n + spam.n)) return float(bests[0][0])/n,float(bests[0][1])/n def printmsg(msg, prob, clues): print msg.tag print "prob =", prob for clue in clues: print "prob(%r) = %g" % clue print guts = str(msg) if options["TestDriver", "show_charlimit"] > 0: guts = guts[:options["TestDriver", "show_charlimit"]] print guts class Driver : def __init__(self): self.falsepos = Set() self.falseneg = Set() self.unsure = Set() self.global_ham_hist = Hist() self.global_spam_hist = Hist() self.ntimes_finishtest_called = 0 self.new_classifier() from spambayes import CostCounter self.cc=CostCounter.default() def new_classifier(self): """Create and use a new, virgin classifier.""" self.set_classifier(classifier.Bayes()) def set_classifier(self, classifier): """Specify a classifier to be used for further testing.""" self.classifier = classifier self.tester = Tester.Test(classifier) self.trained_ham_hist = Hist() self.trained_spam_hist = Hist() def train(self, ham, spam): print "-> Training on", ham, "&", spam, "...", c = self.classifier nham, nspam = c.nham, c.nspam self.tester.train(ham, spam) print c.nham - nham, "hams &", c.nspam- nspam, "spams" def untrain(self, ham, spam): print "-> Forgetting", ham, "&", spam, "...", c = self.classifier nham, nspam = c.nham, c.nspam self.tester.untrain(ham, spam) print nham - c.nham, "hams &", nspam - c.nspam, "spams" def finishtest(self): if options["TestDriver", "show_histograms"]: printhist("all in this training set:", self.trained_ham_hist, self.trained_spam_hist) self.global_ham_hist += self.trained_ham_hist self.global_spam_hist += self.trained_spam_hist self.trained_ham_hist = Hist() self.trained_spam_hist = Hist() self.ntimes_finishtest_called += 1 if options["TestDriver", "save_trained_pickles"]: fname = "%s%d.pik" % (options["TestDriver", "pickle_basename"], self.ntimes_finishtest_called) print " saving pickle to", fname fp = file(fname, 'wb') pickle.dump(self.classifier, fp, 1) fp.close() def alldone(self): if options["TestDriver", "show_histograms"]: besthamcut,bestspamcut = printhist("all runs:", self.global_ham_hist, self.global_spam_hist) else: besthamcut = options["Categorization", "ham_cutoff"] bestspamcut = options["Categorization", "spam_cutoff"] self.global_ham_hist.compute_stats() self.global_spam_hist.compute_stats() nham = self.global_ham_hist.n nspam = self.global_spam_hist.n nfp = len(self.falsepos) nfn = len(self.falseneg) nun = len(self.unsure) print "-> <stat> all runs false positives:", nfp print "-> <stat> all runs false negatives:", nfn print "-> <stat> all runs unsure:", nun print "-> <stat> all runs false positive %:", (nfp * 1e2 / nham) print "-> <stat> all runs false negative %:", (nfn * 1e2 / nspam) print "-> <stat> all runs unsure %:", (nun * 1e2 / (nham + nspam)) print "-> <stat> all runs cost: $%.2f" % ( nfp * options["TestDriver", "best_cutoff_fp_weight"] + nfn * options["TestDriver", "best_cutoff_fn_weight"] + nun * options["TestDriver", "best_cutoff_unsure_weight"]) options["Categorization", "ham_cutoff"] = besthamcut options["Categorization", "spam_cutoff"] = bestspamcut print self.cc if options["TestDriver", "save_histogram_pickles"]: for f, h in (('ham', self.global_ham_hist), ('spam', self.global_spam_hist)): fname = "%s_%shist.pik" % (options["TestDriver", "pickle_basename"], f) print " saving %s histogram pickle to %s" %(f, fname) fp = file(fname, 'wb') pickle.dump(h, fp, 1) fp.close() def test(self, ham, spam): c = self.classifier t = self.tester local_ham_hist = Hist() local_spam_hist = Hist() def new_ham(msg, prob, lo=options["TestDriver", "show_ham_lo"], hi=options["TestDriver", "show_ham_hi"]): local_ham_hist.add(prob * 100.0) self.cc.ham(prob) if lo <= prob <= hi: print print "Ham with prob =", prob prob, clues = c.spamprob(msg, True) printmsg(msg, prob, clues) def new_spam(msg, prob, lo=options["TestDriver", "show_spam_lo"], hi=options["TestDriver", "show_spam_hi"]): local_spam_hist.add(prob * 100.0) self.cc.spam(prob) if lo <= prob <= hi: print print "Spam with prob =", prob prob, clues = c.spamprob(msg, True) printmsg(msg, prob, clues) t.reset_test_results() print "-> Predicting", ham, "&", spam, "..." t.predict(spam, True, new_spam) t.predict(ham, False, new_ham) print "-> <stat> tested", t.nham_tested, "hams &", t.nspam_tested, \ "spams against", c.nham, "hams &", c.nspam, "spams" print "-> <stat> false positive %:", t.false_positive_rate() print "-> <stat> false negative %:", t.false_negative_rate() print "-> <stat> unsure %:", t.unsure_rate() print "-> <stat> cost: $%.2f" % ( t.nham_wrong * options["TestDriver", "best_cutoff_fp_weight"] + t.nspam_wrong * options["TestDriver", "best_cutoff_fn_weight"] + (t.nham_unsure + t.nspam_unsure) * options["TestDriver", "best_cutoff_unsure_weight"]) newfpos = Set(t.false_positives()) - self.falsepos self.falsepos |= newfpos print "-> <stat> %d new false positives" % len(newfpos) if newfpos: print " new fp:", [e.tag for e in newfpos] if not options["TestDriver", "show_false_positives"]: newfpos = () for e in newfpos: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) newfneg = Set(t.false_negatives()) - self.falseneg self.falseneg |= newfneg print "-> <stat> %d new false negatives" % len(newfneg) if newfneg: print " new fn:", [e.tag for e in newfneg] if not options["TestDriver", "show_false_negatives"]: newfneg = () for e in newfneg: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) newunsure = Set(t.unsures()) - self.unsure self.unsure |= newunsure print "-> <stat> %d new unsure" % len(newunsure) if newunsure: print " new unsure:", [e.tag for e in newunsure] if not options["TestDriver", "show_unsure"]: newunsure = () for e in newunsure: print '*' * 78 prob, clues = c.spamprob(e, True) printmsg(e, prob, clues) if options["TestDriver", "show_histograms"]: printhist("this pair:", local_ham_hist, local_spam_hist) self.trained_ham_hist += local_ham_hist self.trained_spam_hist += local_spam_hist