def open(self, mode):
    """Open the classifier database in *mode* and keep the handle.

    mode -- 'r' or 'c'; it is handed on to hammie.open() unchanged.

    Raises AssertionError when a handle is already held, when a mode is
    still recorded from an earlier open, or when *mode* is not one of
    the accepted values.
    """
    accepted = ('r', 'c')
    assert not self.h, "Cannot reopen, close first."
    assert not self.mode, "Mode should be None on open, bad state."
    assert mode in accepted, "Must give a valid mode: r, c."
    # The mode is recorded before the open call, which reads it back
    # from the instance.
    self.mode = mode
    handle = hammie.open(self.dbname, self.usedb, self.mode)
    self.h = handle
def open(self, mode):
    """Make sure a database handle opened in *mode* is held.

    Nothing happens when a handle already exists and was opened in the
    requested mode.  Otherwise a handle that was opened in any mode
    other than 'r' is stored first, and a new handle is obtained from
    hammie.open() with the requested mode.
    """
    already_open = self.h is not None
    if already_open and self.mode == mode:
        return
    if already_open and self.mode != 'r':
        # Persist the existing handle before it is replaced.
        self.h.store()
    self.mode = mode
    self.h = hammie.open(self.dbname, self.usedb, self.mode)
def main(args):
    """Tag the message read from stdin with the first matching classifier.

    args -- command-line arguments: an optional -h flag followed by any
            number of TAG=DATABASE pairs.

    The databases are tried in the order given.  The first one whose
    score reaches the configured spam_cutoff supplies the value of the
    X-Spambayes-Classification header ("TAG; score"); when none does,
    the header is set to "unsure".  The message is then written to
    stdout.

    Returns 0.  A pair without '=' makes the tuple unpacking raise
    ValueError.
    """
    opts, args = getopt.getopt(args, "h")

    for opt, arg in opts:
        if opt == '-h':
            # NOTE(review): presumably a help() defined in this module
            # rather than the interactive builtin -- confirm.
            help()
            return 0

    msg = mboxutils.get_message(sys.stdin)
    # Drop any classification header left by an earlier run.
    try:
        del msg["X-Spambayes-Classification"]
    except KeyError:
        pass

    for pair in args:
        tag, db = pair.split('=', 1)
        h = hammie.open(db, True, 'r')
        score = h.score(msg)
        if score >= Options.options["Categorization", "spam_cutoff"]:
            msg["X-Spambayes-Classification"] = "%s; %.2f" % (tag, score)
            break
    else:
        # The loop finished without a break: no database claimed it.
        msg["X-Spambayes-Classification"] = "unsure"

    # Emit the envelope "From " line only if the input carried one.
    sys.stdout.write(msg.as_string(unixfrom=(msg.get_unixfrom()
                                             is not None)))
    return 0
def main(args):
    """Classify the message on stdin against several tagged databases.

    args -- an optional -h flag followed by TAG=DATABASE pairs.

    With -h, usage() is called and 0 is returned.  Otherwise each
    database is opened read-only in turn; the first whose score reaches
    the configured spam_cutoff sets X-Spambayes-Classification to
    "TAG; score".  When no database matches, the header becomes
    "unsure".  The resulting message is written to stdout and 0 is
    returned.
    """
    opts, args = getopt.getopt(args, "h")
    if any(flag == '-h' for flag, _ in opts):
        usage()
        return 0

    header = "X-Spambayes-Classification"
    msg = mboxutils.get_message(sys.stdin)
    # Remove a header left over from a previous classification.
    try:
        del msg[header]
    except KeyError:
        pass

    for pair in args:
        tag, db = pair.split('=', 1)
        score = hammie.open(db, True, 'r').score(msg)
        if score >= Options.options["Categorization", "spam_cutoff"]:
            msg[header] = "%s; %.2f" % (tag, score)
            break
    else:
        # Reached only when no database broke out of the loop.
        msg[header] = "unsure"

    had_unixfrom = msg.get_unixfrom() is not None
    sys.stdout.write(msg.as_string(unixfrom=had_unixfrom))
    return 0
def __init__(self):
    """Merge the hammie rc files and open the classifier read-only.

    Sets self.dbname (user-expanded storage path), self.usedb and
    self.ham (the handle returned by hammie.open in mode 'r').
    """
    opts = Options.options
    rc_files = ['/etc/hammierc', os.path.expanduser('~/.hammierc')]
    opts.mergefiles(rc_files)

    storage_file = opts.hammiefilter_persistent_storage_file
    self.dbname = os.path.expanduser(storage_file)
    self.usedb = opts.hammiefilter_persistent_use_database
    self.ham = hammie.open(self.dbname, self.usedb, 'r')
def open(self, mode):
    """Make sure a database handle opened in *mode* is held.

    A handle that already exists in the requested mode is kept as is.
    Otherwise any existing handle is stored (unless it was opened in
    mode 'r') and closed, and a fresh handle is obtained from
    hammie.open() with the requested mode.
    """
    if self.h is not None and self.mode == mode:
        return

    previous = self.h
    if previous is not None:
        if self.mode != 'r':
            # Persist the existing handle before letting go of it.
            previous.store()
        previous.close()

    self.mode = mode
    self.h = hammie.open(self.dbname, self.usedb, self.mode)
def open(self):
    """(Re)open the database read-only when it changed on disk.

    The file's modification time is compared with self.modtime; a new
    handle is opened when none is held yet or when the file is newer
    than the recorded time, and the recorded time is then updated.
    """
    latest = os.path.getmtime(self.dbname)
    # self.modtime is only consulted once a handle exists.
    if self.h is not None and not (self.modtime < latest):
        return
    self.h = hammie.open(self.dbname, self.usedb, 'r')
    self.modtime = latest
def __init__(self):
    """Load the hammie rc files, then open the classifier in mode 'r'.

    Attributes set: dbname (storage path with '~' expanded), usedb and
    ham (the opened classifier handle).
    """
    settings = Options.options
    settings.mergefiles([
        '/etc/hammierc',
        os.path.expanduser('~/.hammierc'),
    ])

    self.dbname = os.path.expanduser(
        settings.hammiefilter_persistent_storage_file)
    self.usedb = settings.hammiefilter_persistent_use_database
    self.ham = hammie.open(self.dbname, self.usedb, 'r')
def drive(nsets,decision): print options.display() spamdirs = [get_pathname_option("TestDriver", "spam_directories") % \ i for i in range(1, nsets+1)] hamdirs = [get_pathname_option("TestDriver", "ham_directories") % \ i for i in range(1, nsets+1)] spamfns = [(x,y,1) for x in spamdirs for y in os.listdir(x)] hamfns = [(x,y,0) for x in hamdirs for y in os.listdir(x)] nham = len(hamfns) nspam = len(spamfns) cc = CostCounter.nodelay() allfns = {} for fn in spamfns+hamfns: allfns[fn] = None d = hammie.open('weaktest.db', False) hamtrain = 0 spamtrain = 0 n = 0 for dir,name, is_spam in allfns.iterkeys(): n += 1 m=msgs.Msg(dir, name).guts if debug > 1: print "trained:%dH+%dS"%(hamtrain,spamtrain) scr=d.score(m) if debug > 1: print "score:%.3f"%scr if not decision.tooearly(): if is_spam: if debug > 0: print "Spam with score %.2f"%scr cc.spam(scr) else: if debug > 0: print "Ham with score %.2f"%scr cc.ham(scr) de = decision(scr,is_spam) if de == TRAIN_AS_SPAM: d.train_spam(m) spamtrain += 1 elif de == TRAIN_AS_HAM: d.train_ham(m) hamtrain += 1 if n % 100 == 0: print "%5d trained:%dH+%dS wrds:%d"%( n, hamtrain, spamtrain, len(d.bayes.wordinfo)) print cc print "="*70 print "%5d trained:%dH+%dS wrds:%d"%( n, hamtrain, spamtrain, len(d.bayes.wordinfo)) print cc
def main():
    """Main program; parse options and go.

    Options handled here: -h (usage), -f (force), -n (train new only),
    -q (quiet), -r (remove trained), -g PATH (ham source, repeatable),
    -s PATH (spam source, repeatable) and -o (option override).  The
    database selection is delegated to storage.database_type(opts);
    when it yields no database type, the configured persistent storage
    settings are used instead.

    The database is stored only if at least one ham or spam source was
    trained.
    """
    global loud

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hfqnrd:p:g:s:o:')
    except getopt.error as msg:
        usage(2, msg)

    if not opts:
        usage(2, "No options given")

    force = False
    trainnew = False
    removetrained = False
    good = []
    spam = []
    for opt, arg in opts:
        if opt == '-h':
            usage(0)
        elif opt == "-f":
            force = True
        elif opt == "-n":
            trainnew = True
        elif opt == "-q":
            loud = False
        elif opt == '-g':
            good.append(arg)
        elif opt == '-s':
            spam.append(arg)
        elif opt == "-r":
            removetrained = True
        elif opt == '-o':
            options.set_from_cmdline(arg, sys.stderr)
    pck, usedb = storage.database_type(opts)
    if args:
        usage(2, "Positional arguments not allowed")

    if usedb is None:
        # Use settings in configuration file.
        usedb = options["Storage", "persistent_use_database"]
        pck = get_pathname_option("Storage", "persistent_storage_file")

    h = hammie.open(pck, usedb, "c")

    # Must start out False: with neither -g nor -s given, nothing below
    # assigns it and the final check would otherwise fail on an unbound
    # local.
    save = False

    for g in good:
        if loud:
            print("Training ham (%s):" % g)
        train(h, g, False, force, trainnew, removetrained)
        sys.stdout.flush()
        save = True

    for s in spam:
        if loud:
            print("Training spam (%s):" % s)
        train(h, s, True, force, trainnew, removetrained)
        sys.stdout.flush()
        save = True

    if save:
        h.store()
class SpambayesFilter(BufferAllFilter): checker = hammie.open(dbf, 1, 'r') def filter(self, s): if self.reply.split()[1] == '200': prob = self.checker.score("%s\r\n%s" % (self.serverheaders, s)) print "| prob: %.5f" % prob if prob >= Options.options["Categorization", "spam_cutoff"]: print self.serverheaders print "text:", s[0:40], "...", s[-40:] return "not authorized" return s
def drive(nsets, decision): print options.display() spamdirs = [get_pathname_option("TestDriver", "spam_directories") % i for i in range(1, nsets + 1)] hamdirs = [get_pathname_option("TestDriver", "ham_directories") % i for i in range(1, nsets + 1)] spamfns = [(x, y, 1) for x in spamdirs for y in os.listdir(x)] hamfns = [(x, y, 0) for x in hamdirs for y in os.listdir(x)] nham = len(hamfns) nspam = len(spamfns) cc = CostCounter.nodelay() allfns = {} for fn in spamfns + hamfns: allfns[fn] = None d = hammie.open("weaktest.db", False) hamtrain = 0 spamtrain = 0 n = 0 for dir, name, is_spam in allfns.iterkeys(): n += 1 m = msgs.Msg(dir, name).guts if debug > 1: print "trained:%dH+%dS" % (hamtrain, spamtrain) scr = d.score(m) if debug > 1: print "score:%.3f" % scr if not decision.tooearly(): if is_spam: if debug > 0: print "Spam with score %.2f" % scr cc.spam(scr) else: if debug > 0: print "Ham with score %.2f" % scr cc.ham(scr) de = decision(scr, is_spam) if de == TRAIN_AS_SPAM: d.train_spam(m) spamtrain += 1 elif de == TRAIN_AS_HAM: d.train_ham(m) hamtrain += 1 if n % 100 == 0: print "%5d trained:%dH+%dS wrds:%d" % (n, hamtrain, spamtrain, len(d.bayes.wordinfo)) print cc print "=" * 70 print "%5d trained:%dH+%dS wrds:%d" % (n, hamtrain, spamtrain, len(d.bayes.wordinfo)) print cc
def check_spambayes(self, arguments, cache=[]):
    """Score self.message with Spambayes and kill, reject or report it.

    arguments -- up to two optional values: the REJECT threshold, then
                 the KILL threshold.  Without the first, the threshold
                 comes from spambayes.Options (the check is skipped when
                 that import fails); without the second, 1.00 is used.
    cache     -- one-element memo holding the opened data base, or None
                 when loading failed.  The mutable default is what keeps
                 the entry alive between calls.

    Returns None in every case.
    """
    remaining = arguments

    # All arguments are optional.  First is REJECT threshold.
    if remaining:
        spam_cutoff = float(remaining[0])
        remaining = remaining[1:]
    else:
        try:
            from spambayes.Options import options
        except ImportError:
            return
        spam_cutoff = options.spam_cutoff
    assert 0.0 < spam_cutoff <= 1.0, spam_cutoff

    # Second argument is KILL threshold.
    if remaining:
        kill_cutoff = float(remaining[0])
        remaining = remaining[1:]
    else:
        kill_cutoff = 1.00
    assert spam_cutoff <= kill_cutoff <= 1.0, (spam_cutoff, kill_cutoff)
    assert not remaining, remaining

    # Fetch data base, caching it on first call.
    if not cache:
        path = os.path.expanduser('~pinard/etc/nospam/hammie.db')
        try:
            from spambayes import hammie
            loaded = hammie.open(path, True, 'r')
        except (ImportError, IOError):
            loaded = None
        cache.append(loaded)
    data_base = cache[0]
    if data_base is None:
        return

    # Evaluate spamicity and act accordingly.
    spamicity = data_base.score(self.message)
    if spamicity > kill_cutoff:
        self.checker.kill("Spambayes score is %4.2f." % spamicity)
    elif spamicity > spam_cutoff:
        self.checker.reject("Spambayes score is %4.2f." % spamicity)
    elif self.run.debug >= 2:
        self.checker.report("DEBUG: Spambayes score is %4.2f." % spamicity)
elif opt == '-u': unknown.append(arg) elif opt == '-U': untrain_mode = 1 elif opt == '-r': reverse = 1 pck, usedb = storage.database_type(opts) if args: usage(2, "Positional arguments not allowed") if usedb == None: usage(2, "Must specify one of -d or -D") save = False h = hammie.open(pck, usedb, mode) if not untrain_mode: for g in good: print "Training ham (%s):" % g train(h, g, False) save = True for s in spam: print "Training spam (%s):" % s train(h, s, True) save = True else: for g in good: print "Untraining ham (%s):" % g untrain(h, g, False)
elif opt == '-s': spam.append(arg) elif opt == "-r": removetrained = True elif opt == '-o': options.set_from_cmdline(arg, sys.stderr) pck, usedb = storage.database_type(opts) if args: usage(2, "Positional arguments not allowed") if usedb == None: # Use settings in configuration file. usedb = options["Storage", "persistent_use_database"] pck = get_pathname_option("Storage", "persistent_storage_file") h = hammie.open(pck, usedb, "c") for g in good: if loud: print "Training ham (%s):" % g train(h, g, False, force, trainnew, removetrained) sys.stdout.flush() save = True for s in spam: if loud: print "Training spam (%s):" % s train(h, s, True, force, trainnew, removetrained) sys.stdout.flush() save = True
def main():
    """Main program; parse options and go.

    Options handled here: -h (usage), -g PATH (ham source), -s PATH
    (spam source), -f (filter stdin to stdout), -u PATH (score unknown
    messages), -U (untrain instead of train) and -r (passed on to
    score() as its reverse flag).  The database itself is selected by
    storage.database_type(opts); usage(2, ...) is invoked when that
    yields no database type.

    The database is opened in mode 'c' as soon as -g or -s is given and
    in mode 'r' otherwise.  It is stored only after training or
    untraining took place.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hd:Ufg:s:p:u:r')
    except getopt.error as msg:
        usage(2, msg)

    if not opts:
        usage(2, "No options given")

    pck = DEFAULTDB
    good = []
    spam = []
    unknown = []
    reverse = 0
    untrain_mode = 0
    do_filter = False
    usedb = None
    mode = 'r'
    for opt, arg in opts:
        if opt == '-h':
            usage(0)
        elif opt == '-g':
            good.append(arg)
            mode = 'c'
        elif opt == '-s':
            spam.append(arg)
            mode = 'c'
        elif opt == "-f":
            do_filter = True
        elif opt == '-u':
            unknown.append(arg)
        elif opt == '-U':
            untrain_mode = 1
        elif opt == '-r':
            reverse = 1
    pck, usedb = storage.database_type(opts)
    if args:
        usage(2, "Positional arguments not allowed")

    if usedb is None:
        usage(2, "Must specify one of -d or -D")

    save = False

    h = hammie.open(pck, usedb, mode)

    if not untrain_mode:
        for g in good:
            print("Training ham (%s):" % g)
            train(h, g, False)
            save = True

        for s in spam:
            print("Training spam (%s):" % s)
            train(h, s, True)
            save = True
    else:
        for g in good:
            print("Untraining ham (%s):" % g)
            untrain(h, g, False)
            save = True

        for s in spam:
            print("Untraining spam (%s):" % s)
            untrain(h, s, True)
            save = True

    if save:
        h.store()

    if do_filter:
        msg = sys.stdin.read()
        filtered = h.filter(msg)
        sys.stdout.write(filtered)

    if unknown:
        spams = hams = unsures = 0
        for source in unknown:
            if len(unknown) > 1:
                print("Scoring", source)
            # Distinct names: the per-source counts must not rebind the
            # loop variable or the ham/spam names used above.
            n_spam, n_ham, n_unsure = score(h, source, reverse)
            spams += n_spam
            hams += n_ham
            unsures += n_unsure
        print("Total %d spam, %d ham, %d unsure" % (spams, hams, unsures))
def classifier(scam, mode='c'):
    """Return the hammie classifier stored under scams/<scam>/db.

    scam -- name of the scam; it selects the database directory.
    mode -- open mode handed to hammie.open(); defaults to 'c'.

    The mode is passed by keyword.  hammie.open() takes the use-db flag
    as its second positional argument, so passing *mode* positionally
    put it in that slot and the requested mode never took effect.
    """
    # Might have to first create the folder/file
    return hammie.open('scams/%s/db' % scam, mode=mode)
def main():
    """Build the filter chain, then poll the configured servers forever.

    Each pass runs filter_server() once per entry in server_configs,
    adds the SPAM and VIRUS counts from the returned log to running
    totals, prints statistics when VERBOSE_LEVEL is set, calls
    restart_network() after a pass that hit an error, and then waits
    (600 seconds by default) before the next pass.  A KeyboardInterrupt
    during the wait opens a small prompt for changing the delay or
    quitting.

    NOTE(review): the nesting below was restored from a copy whose
    whitespace had been collapsed -- confirm it against the canonical
    file.
    """
    filters = Filters()
    duplicate = Duplicate()
    # Each add() pairs a test with a second argument; presumably that
    # is the action taken when the test matches -- confirm in Filters.
    filters.add(duplicate, AppendFile("spam2.mbox"))
    filters.add(WhiteListFrom("good_emails.txt"), KEEP)
    filters.add(WhiteListSubstrings("subject", [
        'ABCD:',
        '[Python-announce]',
        '[Python]',
        '[Bioinfo]',
        '[EuroPython]',
        ]), KEEP)
    filters.add(WhiteListSubstrings("to", [
        "*****@*****.**",
        "*****@*****.**",
        ]), KEEP)
    names = ["john", "", "jon", "johnathan"]
    # Every name is combined with both domains, plus one fixed address.
    valid_emails = ([name + "@lectroid.com" for name in names] +
                    [name + "@bigboote.org" for name in names] +
                    ["*****@*****.**"])
    filters.add(IllegalDeliveredTo(valid_emails), DELETE)
    filters.add(SpamAssassin(), AppendFile("spam2.mbox"))
    filters.add(IsVirus, DELETE)
    h = hammie.open("cull.spambayes", "dbm", "r")
    filters.add(IsSpam(h, 0.90), AppendFile("spam.mbox"))
    # (server, user, password) triples, unpacked in the loop below.
    server_configs = [("mail.example.com",
                       "*****@*****.**", "password"),
                      ("popserver.big.com", "ceo", "12345"),
                      ]
    error_count = 0
    cumulative_log = {SPAM: 0, VIRUS: 0}
    initial_log = None
    start_time = None  # init'ed only after initial_log is created
    while 1:
        error_flag = False
        duplicate.unique.clear()  # Hack!
        for server, user, pwd in server_configs:
            try:
                log = filter_server( (server, user, pwd), filters)
            except KeyboardInterrupt:
                # NOTE(review): once the prompt returns, execution falls
                # through to the code below, where `log` is unbound (on
                # the first server) or left over from the previous one.
                raw_input("Press enter to continue. \n")
            except StandardError:
                raise
            except:
                # Anything that is not a StandardError is logged and the
                # server is skipped for this pass.
                error_flag = True
                traceback.print_exc()
                continue
            if VERBOSE_LEVEL > 1 and log:
                print " ** Summary **"
                for x in (log.tests, log.actions):
                    items = x.items()
                    if items:
                        items.sort()
                        for k, v in items:
                            print " %s: %s" % (k, v)
                        print
            cumulative_log[SPAM] += log.tests.get(SPAM, 0)
            cumulative_log[VIRUS] += log.tests.get(VIRUS, 0)
        if initial_log is None:
            # First pass: remember the baseline the rates are based on.
            initial_log = cumulative_log.copy()
            start_time = time.time()
            if VERBOSE_LEVEL:
                print "Stats: %d spams, %d virus" % (
                    initial_log[SPAM], initial_log[VIRUS])
        else:
            if VERBOSE_LEVEL:
                delta_t = time.time() - start_time
                delta_t = max(delta_t, 1)  # never divide by less than 1 second
                print "Stats: %d spams (%.2f/hr), %d virus (%.2f/hr)" % (
                    cumulative_log[SPAM],
                    (cumulative_log[SPAM] - initial_log[SPAM]) /
                    delta_t * 3600,
                    cumulative_log[VIRUS],
                    (cumulative_log[VIRUS] - initial_log[VIRUS]) /
                    delta_t * 3600)
        if error_flag:
            error_count += 1
        if error_count > 0:
            restart_network()
            error_count = 0
        delay = 10 * 60
        while delay:
            try:
                wait(delay)
                break
            except KeyboardInterrupt:
                print
                # Prompt until a recognised command arrives: quit, an
                # empty line (stop waiting) or a new delay in digits.
                while 1:
                    cmd = raw_input("enter, delay, or quit? ")
                    if cmd in ("q", "quit"):
                        raise SystemExit(0)
                    elif cmd == "":
                        delay = 0
                        break
                    elif cmd.isdigit():
                        delay = int(cmd)
                        break
                    else:
                        print "Unknown command."
def __init__(self):
    """Open the database named by self.hammieFile in mode 'c'.

    The handle returned by hammie.open() is kept in self.h.
    """
    handle = hammie.open(self.hammieFile, mode='c')
    self.h = handle
def main():
    """Build the filter chain, then poll the configured servers forever.

    Each pass runs filter_server() once per entry in server_configs,
    adds the SPAM and VIRUS counts from the returned log to running
    totals, prints statistics when VERBOSE_LEVEL is set, calls
    restart_network() after a pass that hit an error, and then waits
    (600 seconds by default) before the next pass.  A KeyboardInterrupt
    during the wait opens a small prompt for changing the delay or
    quitting.

    NOTE(review): the nesting below was restored from a copy whose
    whitespace had been collapsed -- confirm it against the canonical
    file.
    """
    filters = Filters()
    duplicate = Duplicate()
    filters.add(duplicate, AppendFile("spam2.mbox"))

    # A list of everyone who has emailed me this year.
    # Keep their messages on the server.
    filters.add(WhiteListFrom("good_emails.txt"), KEEP)

    # My mailing lists.
    filters.add(
        WhiteListSubstrings("subject", [
            'ABCD:',
            '[Python-announce]',
            '[Python]',
            '[Bioinfo]',
            '[EuroPython]',
        ]), KEEP)
    filters.add(
        WhiteListSubstrings("to", [
            "*****@*****.**",
            "*****@*****.**",
        ]), KEEP)

    names = ["john", "", "jon", "johnathan"]
    # Every name is combined with both domains, plus one fixed address.
    valid_emails = ([name + "@lectroid.com" for name in names] +
                    [name + "@bigboote.org" for name in names] +
                    ["*****@*****.**"])
    filters.add(IllegalDeliveredTo(valid_emails), DELETE)
    filters.add(SpamAssassin(), AppendFile("spam2.mbox"))

    # Get rid of anything which smells like an executable.
    filters.add(IsVirus, DELETE)

    # Use SpamBayes to identify spam.  Make a local copy then
    # delete from the server.
    h = hammie.open("cull.spambayes", "dbm", "r")
    filters.add(IsSpam(h, 0.90), AppendFile("spam.mbox"))

    # These are my POP3 accounts.
    server_configs = [
        ("mail.example.com", "*****@*****.**", "password"),
        ("popserver.big.com", "ceo", "12345"),
    ]

    # The main culling loop.
    error_count = 0
    cumulative_log = {SPAM: 0, VIRUS: 0}
    initial_log = None
    start_time = None  # init'ed only after initial_log is created
    while 1:
        error_flag = False
        duplicate.unique.clear()  # Hack!
        for server, user, pwd in server_configs:
            try:
                log = filter_server((server, user, pwd), filters)
            except KeyboardInterrupt:
                # NOTE(review): once the prompt returns, execution falls
                # through to the code below, where `log` is unbound (on
                # the first server) or left over from the previous one.
                raw_input("Press enter to continue. \n")
            except StandardError:
                raise
            except:
                # Anything that is not a StandardError is logged and the
                # server is skipped for this pass.
                error_flag = True
                traceback.print_exc()
                continue
            if VERBOSE_LEVEL > 1 and log:
                print " ** Summary **"
                for x in (log.tests, log.actions):
                    items = x.items()
                    if items:
                        items.sort()
                        for k, v in items:
                            print " %s: %s" % (k, v)
                        print
            cumulative_log[SPAM] += log.tests.get(SPAM, 0)
            cumulative_log[VIRUS] += log.tests.get(VIRUS, 0)
        if initial_log is None:
            # First pass: remember the baseline the rates are based on.
            initial_log = cumulative_log.copy()
            start_time = time.time()
            if VERBOSE_LEVEL:
                print "Stats: %d spams, %d virus" % (initial_log[SPAM],
                                                     initial_log[VIRUS])
        else:
            if VERBOSE_LEVEL:
                delta_t = time.time() - start_time
                delta_t = max(delta_t, 1)  # never divide by less than 1 second
                print "Stats: %d spams (%.2f/hr), %d virus (%.2f/hr)" % (
                    cumulative_log[SPAM],
                    (cumulative_log[SPAM] - initial_log[SPAM]) /
                    delta_t * 3600,
                    cumulative_log[VIRUS],
                    (cumulative_log[VIRUS] - initial_log[VIRUS]) /
                    delta_t * 3600)
        if error_flag:
            error_count += 1
        if error_count > 0:
            restart_network()
            error_count = 0
        delay = 10 * 60
        while delay:
            try:
                wait(delay)
                break
            except KeyboardInterrupt:
                print
                # Prompt until a recognised command arrives: quit, an
                # empty line (stop waiting) or a new delay in digits.
                while 1:
                    cmd = raw_input("enter, delay, or quit? ")
                    if cmd in ("q", "quit"):
                        raise SystemExit(0)
                    elif cmd == "":
                        delay = 0
                        break
                    elif cmd.isdigit():
                        delay = int(cmd)
                        break
                    else:
                        print "Unknown command."
spam.append(arg) elif opt == "-r": removetrained = True elif opt == '-o': options.set_from_cmdline(arg, sys.stderr) pck, usedb = storage.database_type(opts) if args: usage(2, "Positional arguments not allowed") if usedb == None: # Use settings in configuration file. usedb = options["Storage", "persistent_use_database"] pck = get_pathname_option("Storage", "persistent_storage_file") h = hammie.open(pck, usedb, "c") for g in good: if loud: print "Training ham (%s):" % g train(h, g, False, force, trainnew, removetrained) sys.stdout.flush() save = True for s in spam: if loud: print "Training spam (%s):" % s train(h, s, True, force, trainnew, removetrained) sys.stdout.flush() save = True
#!/bin/python """Wrapper script for testing the performance of SpamBayes. Run a canned mailbox through a SpamBayes ham/spam classifier. """ import os.path from spambayes import hammie, mboxutils __author__ = "[email protected] (Skip Montanaro)" __contact__ = "[email protected] (Collin Winter)" def bench_spambayes(ham_classifier, messages): for msg in messages: ham_classifier.score(msg) # data_dir = os.path.join(os.path.dirname(__file__), "data") data_dir = os.path.dirname(__file__) mailbox = os.path.join(data_dir, "spambayes_mailbox") #mailbox = os.path.join(data_dir, "small_mailbox") ham_data = os.path.join(data_dir, "spambayes_hammie.pkl") messages = list(mboxutils.getmbox(mailbox)) ham_classifier = hammie.open(ham_data, "pickle", "r") bench_spambayes(ham_classifier, messages)
Run a canned mailbox through a SpamBayes ham/spam classifier. """ import os.path import perf from spambayes import hammie, mboxutils __author__ = "[email protected] (Skip Montanaro)" __contact__ = "[email protected] (Collin Winter)" def bench_spambayes(ham_classifier, messages): for msg in messages: ham_classifier.score(msg) if __name__ == "__main__": runner = perf.Runner() runner.metadata['description'] = "Run the SpamBayes benchmark." data_dir = os.path.join(os.path.dirname(__file__), "data") mailbox = os.path.join(data_dir, "spambayes_mailbox") ham_data = os.path.join(data_dir, "spambayes_hammie.pkl") messages = list(mboxutils.getmbox(mailbox)) ham_classifier = hammie.open(ham_data, "pickle", "r") runner.bench_func('spambayes', bench_spambayes, ham_classifier, messages)