def finishtest(self):
    """Fold the current training set's histograms into the global ones.

    When the "show_histograms" option is set, the per-training-set
    histograms are printed first.  They are then added to the global
    histograms and replaced by fresh ``Hist`` objects, and the call
    counter is incremented.  When the "save_trained_pickles" option is
    set, the classifier is pickled to ``<pickle_basename><counter>.pik``.
    """
    ham_hist = self.trained_ham_hist
    spam_hist = self.trained_spam_hist
    if options["TestDriver", "show_histograms"]:
        printhist("all in this training set:", ham_hist, spam_hist)

    # Accumulate, then start the next training set from empty histograms.
    self.global_ham_hist += ham_hist
    self.global_spam_hist += spam_hist
    self.trained_ham_hist = Hist()
    self.trained_spam_hist = Hist()

    self.ntimes_finishtest_called += 1
    if not options["TestDriver", "save_trained_pickles"]:
        return

    basename = options["TestDriver", "pickle_basename"]
    pickle_path = "%s%d.pik" % (basename, self.ntimes_finishtest_called)
    print(" saving pickle to", pickle_path)
    # Third argument is passed through unchanged (presumably the pickle
    # protocol -- confirm against pickle_write).
    pickle_write(pickle_path, self.classifier, 1)
def _save_caches(self):
    """Pickle the bad-URL and HTTP-error URL caches to their cache files."""
    # XXX Note that these caches are never refreshed, which might not
    # XXX be a good thing long-term (if a previously invalid URL
    # XXX becomes valid, for example).
    cache_files = (self.bad_url_cache_name, self.http_error_cache_name)
    cache_contents = (self.bad_urls, self.http_error_urls)
    for cache_file, contents in zip(cache_files, cache_contents):
        pickle_write(cache_file, contents)
def alldone(self):
    """Print the summary statistics for all runs.

    Reports the false positive / false negative / unsure counts, their
    percentages and the weighted cost, then prints ``self.cc``.  When the
    "save_histogram_pickles" option is set, the global ham and spam
    histograms are pickled to ``<pickle_basename>_hamhist.pik`` and
    ``<pickle_basename>_spamhist.pik``.

    Raises ZeroDivisionError if either global histogram reports ``n == 0``,
    because the percentages divide by those counts.
    """
    if options["TestDriver", "show_histograms"]:
        besthamcut, bestspamcut = printhist("all runs:",
                                            self.global_ham_hist,
                                            self.global_spam_hist)
    else:
        besthamcut = options["Categorization", "ham_cutoff"]
        bestspamcut = options["Categorization", "spam_cutoff"]
    # NOTE(review): the stats are refreshed on both paths here; confirm
    # whether these two calls originally belonged to the else-branch only.
    self.global_ham_hist.compute_stats()
    self.global_spam_hist.compute_stats()

    nham = self.global_ham_hist.n
    nspam = self.global_spam_hist.n
    nfp = len(self.falsepos)
    nfn = len(self.falseneg)
    nun = len(self.unsure)

    # print() calls instead of Python 2 print statements, which are a
    # syntax error under Python 3.
    print("-> <stat> all runs false positives:", nfp)
    print("-> <stat> all runs false negatives:", nfn)
    print("-> <stat> all runs unsure:", nun)
    print("-> <stat> all runs false positive %:", (nfp * 1e2 / nham))
    print("-> <stat> all runs false negative %:", (nfn * 1e2 / nspam))
    print("-> <stat> all runs unsure %:", (nun * 1e2 / (nham + nspam)))
    cost = (nfp * options["TestDriver", "best_cutoff_fp_weight"] +
            nfn * options["TestDriver", "best_cutoff_fn_weight"] +
            nun * options["TestDriver", "best_cutoff_unsure_weight"])
    print("-> <stat> all runs cost: $%.2f" % cost)

    # Store the chosen cutoffs in the options before self.cc is printed.
    options["Categorization", "ham_cutoff"] = besthamcut
    options["Categorization", "spam_cutoff"] = bestspamcut
    print(self.cc)

    if options["TestDriver", "save_histogram_pickles"]:
        for f, h in (('ham', self.global_ham_hist),
                     ('spam', self.global_spam_hist)):
            fname = "%s_%shist.pik" % (options["TestDriver",
                                               "pickle_basename"], f)
            print(" saving %s histogram pickle to %s" % (f, fname))
            pickle_write(fname, h, 1)
def main(args):
    """Command-line entry point: add mailbox messages to a pickled map.

    Options: ``-h/--help``, ``-d/--database FILE`` (required) and
    ``-t/--type ham|spam`` (required).  Each remaining argument is passed
    to ``mapmessages`` together with the type and the map, and the map is
    written back to the database file afterwards.

    Returns 1 on a usage error and 0 after printing help; falls off the
    end (returning None) on success.
    """
    try:
        opts, args = getopt.getopt(args, "hd:t:",
                                   ["type=", "help", "database="])
    except getopt.GetoptError as msg:
        usage(msg)
        return 1

    db_path = None
    kind = None
    for flag, value in opts:
        if flag in ("-h", "--help"):
            usage()
            return 0
        if flag in ("-d", "--database"):
            db_path = value
        elif flag in ("-t", "--type"):
            kind = value

    # Validate in the same order as the messages are listed; the first
    # failing check decides which message is shown.
    if db_path is None:
        problem = "'-d mapfile' is required"
    elif kind is None:
        problem = "'-t ham|spam' is required"
    elif kind not in ("ham", "spam"):
        problem = "mboxtype must be 'ham' or 'spam'"
    else:
        problem = None
    if problem is not None:
        usage(problem)
        return 1

    # An unreadable map file means we start from an empty mapping.
    try:
        mapping = pickle_read(db_path)
    except IOError:
        mapping = {}

    for mbox in args:
        mapmessages(mbox, kind, mapping)
    pickle_write(db_path, mapping)
def _save_caches(self):
    """Write both URL caches (bad URLs, HTTP-error URLs) out as pickles."""
    # XXX Note that these caches are never refreshed, which might not
    # XXX be a good thing long-term (if a previously invalid URL
    # XXX becomes valid, for example).
    pending = [
        (self.bad_url_cache_name, self.bad_urls),
        (self.http_error_cache_name, self.http_error_urls),
    ]
    for entry in pending:
        # Each entry is a (file name, data) pair.
        pickle_write(*entry)
def close(self):
    """Write the cache to ``self.cachefile``.

    When the "verbose" global option is set, a one-line summary (item
    count, target file and, if any lookups were recorded, the hit rate)
    is printed to stderr before the cache is written.
    """
    if options["globals", "verbose"]:
        # Build the line once and emit it with print(); the former
        # "print >> sys.stderr" statements are Python 2 only.
        report = ["saving", str(len(self.cache)),
                  "items to", str(self.cachefile)]
        lookups = self.hits + self.misses
        if lookups:
            # 100.0 forces float division so the "%.2f" figure is not
            # truncated to a whole number when hits/misses are ints.
            report.append("%.2f%% hit rate" % (100.0 * self.hits / lookups))
        print(" ".join(report), file=sys.stderr)
    pickle_write(self.cachefile, self.cache)
def close(self):
    """Persist the cache to ``self.cachefile``.

    In verbose mode a summary line is written to stderr first: the number
    of cached items, the cache file and, when at least one hit or miss
    was recorded, the hit rate as a percentage.
    """
    if options["globals", "verbose"]:
        pieces = ["saving", len(self.cache), "items to", self.cachefile]
        if self.hits + self.misses:
            rate = 100 * self.hits / (self.hits + self.misses)
            pieces.append("%.2f%% hit rate" % rate)
        # Every piece is followed by a single space and the line ends
        # with a newline, hence the ' \n' terminator.
        print(*pieces, end=' \n', file=sys.stderr)
    pickle_write(self.cachefile, self.cache)
def close(self):
    """Print the statistics if requested, then save the caches.

    ``self.printStats()`` is called when ``self.printStatsAtEnd`` is true.
    ``self.caches`` is pickled to ``self.cachefile`` only when a cache
    file is configured (i.e. ``self.cachefile`` is truthy).
    """
    if self.printStatsAtEnd:
        self.printStats()

    target = self.cachefile
    if not target:
        # No cache file configured: nothing to save.
        return
    pickle_write(target, self.caches)
def finishtest(self):
    """Merge the per-training-set histograms into the global histograms.

    Prints the per-training-set histograms when the "show_histograms"
    option is set, adds them to the global histograms, resets them to
    fresh ``Hist`` objects and increments ``ntimes_finishtest_called``.
    When the "save_trained_pickles" option is set, the classifier is
    pickled to ``<pickle_basename><call count>.pik``.
    """
    if options["TestDriver", "show_histograms"]:
        printhist("all in this training set:",
                  self.trained_ham_hist, self.trained_spam_hist)
    self.global_ham_hist += self.trained_ham_hist
    self.global_spam_hist += self.trained_spam_hist
    self.trained_ham_hist = Hist()
    self.trained_spam_hist = Hist()

    self.ntimes_finishtest_called += 1
    if options["TestDriver", "save_trained_pickles"]:
        fname = "%s%d.pik" % (options["TestDriver", "pickle_basename"],
                              self.ntimes_finishtest_called)
        # print() call instead of a Python 2 print statement, which is a
        # syntax error under Python 3.
        print(" saving pickle to", fname)
        pickle_write(fname, self.classifier, 1)
def alldone(self):
    """Report the statistics accumulated over all runs.

    Prints the false positive / false negative / unsure counts and
    percentages plus the weighted cost, writes the chosen cutoffs back to
    the options, prints ``self.cc`` and, when the
    "save_histogram_pickles" option is set, pickles the global ham and
    spam histograms.

    Raises ZeroDivisionError if either global histogram reports ``n == 0``,
    because the percentages divide by those counts.
    """
    if options["TestDriver", "show_histograms"]:
        besthamcut, bestspamcut = printhist("all runs:",
                                            self.global_ham_hist,
                                            self.global_spam_hist)
    else:
        besthamcut = options["Categorization", "ham_cutoff"]
        bestspamcut = options["Categorization", "spam_cutoff"]
    # NOTE(review): the stats are refreshed on both paths here; confirm
    # whether these two calls originally belonged to the else-branch only.
    self.global_ham_hist.compute_stats()
    self.global_spam_hist.compute_stats()

    nham = self.global_ham_hist.n
    nspam = self.global_spam_hist.n
    nfp = len(self.falsepos)
    nfn = len(self.falseneg)
    nun = len(self.unsure)

    # print() calls instead of Python 2 print statements, which are a
    # syntax error under Python 3.
    print("-> <stat> all runs false positives:", nfp)
    print("-> <stat> all runs false negatives:", nfn)
    print("-> <stat> all runs unsure:", nun)
    print("-> <stat> all runs false positive %:", (nfp * 1e2 / nham))
    print("-> <stat> all runs false negative %:", (nfn * 1e2 / nspam))
    print("-> <stat> all runs unsure %:", (nun * 1e2 / (nham + nspam)))
    cost = (nfp * options["TestDriver", "best_cutoff_fp_weight"] +
            nfn * options["TestDriver", "best_cutoff_fn_weight"] +
            nun * options["TestDriver", "best_cutoff_unsure_weight"])
    print("-> <stat> all runs cost: $%.2f" % cost)

    # Set back the options for the delayed calculations in self.cc
    options["Categorization", "ham_cutoff"] = besthamcut
    options["Categorization", "spam_cutoff"] = bestspamcut
    print(self.cc)

    if options["TestDriver", "save_histogram_pickles"]:
        for f, h in (('ham', self.global_ham_hist),
                     ('spam', self.global_spam_hist)):
            fname = "%s_%shist.pik" % (options["TestDriver",
                                               "pickle_basename"], f)
            print(" saving %s histogram pickle to %s" % (f, fname))
            pickle_write(fname, h, 1)
return 0 elif opt in ("-d", "--database"): mapfile = arg elif opt in ("-t", "--type"): mboxtype = arg if mapfile is None: usage("'-d mapfile' is required") return 1 if mboxtype is None: usage("'-t ham|spam' is required") return 1 if mboxtype not in ("ham", "spam"): usage("mboxtype must be 'ham' or 'spam'") return 1 try: mapd = pickle_read(mapfile) except IOError: mapd = {} for f in args: mapmessages(f, mboxtype, mapd) pickle_write(mapfile, mapd) if __name__ == "__main__": sys.exit(main(sys.argv[1:]))
def _pick_bestfile(candidates):
    """Return the first candidate that exists or can be created, else None."""
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
        try:
            # Probe writability by creating the file; the with-block
            # closes the handle (the old file() call leaked it and does
            # not exist in Python 3).
            with open(candidate, "w"):
                pass
        except IOError:
            continue
        # Remove the probe file and settle on this location.  Previously
        # the search carried on after a successful probe, so a writable
        # location was never selected.
        os.unlink(candidate)
        return candidate
    return None


def main(args):
    """Command-line entry point: train on ham/spam, then score the unsures.

    Options: ``-h`` prints usage, ``-b N`` restricts scoring to the
    message ids stored under the N highest keys of the previously saved
    best.pck file, ``-s`` sets the skipspam flag passed to ``score``.
    Exactly three positional arguments are required: the ham, spam and
    unsure message piles.

    Without ``-b`` the collected scores are written to the best.pck file.
    Returns 0 on success and 1 on a usage error.  Raises ValueError if the
    ``-b`` argument is not an integer.
    """
    try:
        opts, args = getopt.getopt(args, "b:sh")
    except getopt.error as msg:
        usage(msg)
        return 1

    best = 0
    skipspam = False
    for opt, arg in opts:
        if opt == "-h":
            usage()
            return 0
        if opt == "-b":
            best = int(arg)
        elif opt == "-s":
            skipspam = True

    if len(args) != 3:
        usage("require ham, spam and unsure message piles")
        return 1

    ham, spam, unsure = args

    # Candidate locations for best.pck, in order of preference.
    choices = ["best.pck"]
    if "HOME" in os.environ:
        home = os.environ["HOME"]
        choices.append(os.path.join(home, "tmp", "best.pck"))
        choices.append(os.path.join(home, "best.pck"))

    bestfile = _pick_bestfile(choices)
    if bestfile is None:
        usage("can't find a place to write best.pck file")
        return 1

    # NOTE(review): h and cls are not defined in this function; presumably
    # module-level objects -- confirm.
    print("establish base training")
    learn(ham, h, False)
    learn(spam, h, True)

    print("scoring")
    if best:
        last_scores = sorted(pickle_read(bestfile).items())
        msgids = set()
        for (k, v) in last_scores[-best:]:
            msgids.update(v)
    else:
        msgids = None

    scores = {}
    try:
        score(unsure, h, cls, scores, msgids, skipspam)
    except KeyboardInterrupt:
        # Ctrl-C stops scoring early; whatever was collected is kept.
        pass

    if not best:
        pickle_write(bestfile, scores)

    return 0
try: print "Replicating..." db.Replicate(rdbname) print "Done" except pywintypes.com_error: print "Could not replicate" if doClassify: classifyInbox(vinbox, vtrainspam, bayes, ldbname, notesindex, log) print "The Spambayes database currently has %s Spam and %s Ham" \ % (bayes.nspam, bayes.nham) bayes.store() pickle_write(idxname, notesindex) if log: log.LogAction("Finished running spambayes") if __name__ == '__main__': try: opts, args = getopt.getopt(sys.argv[1:], 'htcPd:p:l:r:f:o:i:W:L:') except getopt.error, msg: print >> sys.stderr, str(msg) + '\n\n' + __doc__ sys.exit() ldbname = None # local notes database name rdbname = None # remote notes database location sbfname = None # spambayes folder name
except IOError: pass else: os.unlink(bestfile) if bestfile is None: usage("can't find a place to write best.pck file") return 1 print "establish base training" learn(ham, h, False) learn(spam, h, True) print "scoring" if best: last_scores = pickle_read(bestfile) last_scores = last_scores.items() last_scores.sort() msgids = set() for (k, v) in last_scores[-best:]: msgids.update(set(v)) else: msgids = None scores = {} try: score(unsure, h, cls, scores, msgids, skipspam) except KeyboardInterrupt: pass if not best: pickle_write(bestfile, scores) return 0 if __name__ == "__main__": sys.exit(main(sys.argv[1:]))
def store(self):
    """Pickle this object to ``self.db_name``.

    In verbose mode a note naming the target is printed to stderr first.
    """
    target = self.db_name
    verbose = options["globals", "verbose"]
    if verbose:
        print("Persisting", target, "as a pickle", file=sys.stderr)
    pickle_write(target, self, PICKLE_TYPE)
def store(self):
    """Pickle ``self.db`` to ``self.db_name``, passing ``self.mode`` along."""
    destination, payload, mode = self.db_name, self.db, self.mode
    pickle_write(destination, payload, mode)
def run(bdbname, useDBM, ldbname, rdbname, foldname, doTrain, doClassify,
        pwd, idxname, logname):
    """Train and/or classify mail in a Lotus Notes database.

    Opens the classifier storage and the pickled notes index, starts a
    Notes session, opens the database (remotely first, falling back to the
    local copy when a remote name was given), then optionally trains from
    the "Train as Spam" / "Train as Ham" views and classifies the inbox.
    The classifier and the notes index are stored at the end.

    bdbname, useDBM -- passed to ``storage.open_storage``
    ldbname, rdbname -- local and remote Notes database names
    foldname -- folder holding the Spam/Ham/Train views
    doTrain, doClassify -- which phases to run
    pwd -- session password; a falsy value initializes without one
    idxname -- file name of the pickled notes index
    logname -- name passed to ``OpenNotesLog``

    Calls ``sys.exit()`` (exit status zero) when the session or the
    database cannot be opened.  Re-raises an IOError from reading the
    index unless it is "file not found".
    """
    bayes = storage.open_storage(bdbname, useDBM)

    try:
        notesindex = pickle_read(idxname)
    except IOError as e:
        if e.errno != errno.ENOENT:
            raise
        # A missing index file just means this is the first run.
        notesindex = {}
        print("%s file not found, this is a first time run" % (idxname,))
        print("No classification will be performed")

    need_replicate = False

    sess = win32com.client.Dispatch("Lotus.NotesSession")
    try:
        if pwd:
            sess.initialize(pwd)
        else:
            sess.initialize()
    except pywintypes.com_error:
        print("Session aborted")
        sys.exit()

    try:
        db = sess.GetDatabase(rdbname, ldbname)
    except pywintypes.com_error:
        if rdbname:
            print("Could not open database remotely, trying locally")
            try:
                db = sess.GetDatabase("", ldbname)
                # Working on the local copy: replicate after training.
                need_replicate = True
            except pywintypes.com_error:
                print("Could not open database")
                sys.exit()
        else:
            raise

    log = sess.CreateLog("SpambayesAgentLog")
    try:
        log.OpenNotesLog("", logname)
    except pywintypes.com_error:
        print("Could not open log")
        log = None

    if log:
        log.LogAction("Running spambayes")

    # The view names use a literal backslash as separator.  It is written
    # as "\\" because "\S", "\H" and "\T" are invalid escape sequences;
    # the resulting strings are unchanged.
    vinbox = db.getView('($Inbox)')
    vspam = db.getView("%s\\Spam" % (foldname,))
    vham = db.getView("%s\\Ham" % (foldname,))
    vtrainspam = db.getView("%s\\Train as Spam" % (foldname,))
    vtrainham = db.getView("%s\\Train as Ham" % (foldname,))

    if doTrain:
        processAndTrain(vtrainspam, vspam, bayes, True, notesindex, log)
        processAndTrain(vtrainham, vham, bayes, False, notesindex, log)

    if need_replicate:
        try:
            print("Replicating...")
            db.Replicate(rdbname)
            print("Done")
        except pywintypes.com_error:
            print("Could not replicate")

    if doClassify:
        classifyInbox(vinbox, vtrainspam, bayes, ldbname, notesindex, log)

    print("The Spambayes database currently has %s Spam and %s Ham"
          % (bayes.nspam, bayes.nham))

    bayes.store()
    pickle_write(idxname, notesindex)

    if log:
        log.LogAction("Finished running spambayes")
elif opt in ("-d", "--database"): mapfile = arg elif opt in ("-t", "--type"): mboxtype = arg if mapfile is None: usage("'-d mapfile' is required") return 1 if mboxtype is None: usage("'-t ham|spam' is required") return 1 if mboxtype not in ("ham", "spam"): usage("mboxtype must be 'ham' or 'spam'") return 1 try: mapd = pickle_read(mapfile) except IOError: mapd = {} for f in args: mapmessages(f, mboxtype, mapd) pickle_write(mapfile, mapd) if __name__ == "__main__": sys.exit(main(sys.argv[1:]))
def _save_caches(self):
    """Save the bad-URL cache and the HTTP-error URL cache as pickles."""
    # Read all four attributes up front, then write the two files in the
    # same order as before: bad URLs first, HTTP errors second.
    bad_target, bad_payload = self.bad_url_cache_name, self.bad_urls
    error_target, error_payload = (self.http_error_cache_name,
                                   self.http_error_urls)
    pickle_write(bad_target, bad_payload)
    pickle_write(error_target, error_payload)