def msg_train(h, msg, is_spam, force):
    """Train bayes with a single message.

    Args:
        h: trainer exposing ``train(msg, is_spam)`` and
            ``untrain(msg, is_spam)``.
        msg: message object; its "trained" header is read and rewritten.
        is_spam: True to train as spam, False to train as ham.
        force: if True, train even when the trained header says the
            message was trained before (the old header is dropped
            without untraining).

    Returns:
        True if the message was trained; False if it was skipped, either
        because it cannot be rendered as text or because its trained
        header already matches the requested classification.
    """
    # XXX: big hack -- why is email.Message unable to represent
    # multipart/alternative?
    try:
        mboxutils.as_string(msg)
    except TypeError:
        # We'll be unable to represent this as text :(
        return False

    if is_spam:
        spamtxt = options["Headers", "header_spam_string"]
    else:
        spamtxt = options["Headers", "header_ham_string"]
    trained_header = options["Headers", "trained_header_name"]
    oldtxt = msg.get(trained_header)

    if force:
        # Train no matter what.
        if oldtxt is not None:
            del msg[trained_header]
    elif oldtxt == spamtxt:
        # Skip this one, we've already trained with it.
        return False
    elif oldtxt is not None:
        # It's been trained, but as something else.  Untrain.
        del msg[trained_header]
        h.untrain(msg, not is_spam)

    h.train(msg, is_spam)
    msg.add_header(trained_header, spamtxt)
    return True
def _calc_response(self, switches, body):
    """Run the hammie actions named by ``switches`` on ``body``.

    ``switches`` is a whitespace-separated option string using the flags
    f, g, s, t, G and S; each flag selects one method of
    ``self.server.hammie``.  With no flags the message is just filtered.
    The actions are applied in the order given and the resulting message
    is returned as a string.
    """
    method_for_switch = {
        '-f': 'filter',
        '-g': 'train_ham',
        '-s': 'train_spam',
        '-t': 'filter_train',
        '-G': 'untrain_ham',
        '-S': 'untrain_spam',
    }
    parsed, _unused = getopt.getopt(switches.split(), 'fgstGS')
    hammie = self.server.hammie

    # Look each method up only when its switch is actually present.
    actions = [getattr(hammie, method_for_switch[flag])
               for flag, _value in parsed
               if flag in method_for_switch]
    if not actions:
        actions = [hammie.filter]

    from spambayes import mboxutils
    message = mboxutils.get_message(body)
    for run in actions:
        run(message)
    return mboxutils.as_string(message, 1)
def main(profiling=False):
    """Command-line entry point: parse options and run actions on mail.

    Options select which ``HammieFilter`` actions (filter, train,
    untrain) are applied to every message of every mailbox named on the
    command line; with no mailbox argument, "-" is used.  Each processed
    message is written to standard output.

    Args:
        profiling: True when this call is already running under cProfile
            (set by the ``-P`` re-entry), so ``-P`` does not recurse.

    Returns:
        The result of ``cProfile.run`` when ``-P`` triggers a profiled
        re-run, otherwise None.
    """
    h = HammieFilter()
    actions = []
    opts, args = getopt.getopt(sys.argv[1:], 'hvxd:p:nfgstGSo:P',
                               ['help', 'version', 'examples', 'option='])
    create_newdb = False
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-v', '--version'):
            version()
        elif opt in ('-x', '--examples'):
            examples()
        elif opt in ('-o', '--option'):
            Options.options.set_from_cmdline(arg, sys.stderr)
        elif opt == '-f':
            actions.append(h.filter)
        elif opt == '-g':
            actions.append(h.train_ham)
        elif opt == '-s':
            actions.append(h.train_spam)
        elif opt == '-t':
            actions.append(h.filter_train)
        elif opt == '-G':
            actions.append(h.untrain_ham)
        elif opt == '-S':
            actions.append(h.untrain_spam)
        elif opt == '-P':
            if not profiling:
                try:
                    import cProfile
                except ImportError:
                    # No profiler available: carry on unprofiled.
                    pass
                else:
                    # Re-enter main under the profiler; the nested call
                    # re-parses sys.argv with profiling=True.
                    return cProfile.run("main(True)")
        elif opt == "-n":
            create_newdb = True
    h.dbname, h.usedb = storage.database_type(opts)

    if create_newdb or not os.path.exists(h.dbname):
        h.newdb()
        print("Created new database in", h.dbname, file=sys.stderr)
        if create_newdb:
            # -n means "create the database and stop".
            sys.exit(0)

    if not actions:
        actions = [h.filter]
    if not args:
        args = ["-"]
    for fname in args:
        mbox = mboxutils.getmbox(fname)
        for msg in mbox:
            for action in actions:
                action(msg)
            if args == ["-"]:
                # For "-", only emit a "From " line if the message
                # already carried one.
                unixfrom = msg.get_unixfrom() is not None
            else:
                unixfrom = True
            result = mboxutils.as_string(msg, unixfrom=unixfrom)
            sys.stdout.write(result)
def mhdir_train(h, path, is_spam, force): """Train bayes with an mh directory""" if loud: print " Reading as MH mailbox" import glob counter = 0 trained = 0 for fn in glob.glob(os.path.join(path, "[0-9]*")): counter += 1 cfn = fn tfn = os.path.join(path, "spambayes.tmp") if loud and counter % 10 == 0: sys.stdout.write("\r%6d" % counter) sys.stdout.flush() f = file(fn, "rb") msg = get_message(f) f.close() if not msg: print "Malformed message: %s. Skipping..." % cfn continue msg_train(h, msg, is_spam, force) trained += 1 if not options["Headers", "include_trained"]: continue f = file(tfn, "wb") f.write(mboxutils.as_string(msg)) f.close() shutil.copystat(cfn, tfn) os.rename(tfn, cfn) if loud: sys.stdout.write("\r%6d" % counter) sys.stdout.write("\r Trained %d out of %d messages\n" % (trained, counter))
def mbox_train(h, path, is_spam, force): """Train bayes with a Unix mbox""" if loud: print " Reading as Unix mbox" import mailbox import fcntl # Open and lock the mailbox. Some systems require it be opened for # writes in order to assert an exclusive lock. f = file(path, "r+b") fcntl.flock(f, fcntl.LOCK_EX) mbox = mailbox.PortableUnixMailbox(f, get_message) outf = os.tmpfile() counter = 0 trained = 0 for msg in mbox: if not msg: print "Malformed message number %d. I can't train on this mbox, sorry." % counter return counter += 1 if loud and counter % 10 == 0: sys.stdout.write("\r%6d" % counter) sys.stdout.flush() if msg_train(h, msg, is_spam, force): trained += 1 if options["Headers", "include_trained"]: # Write it out with the Unix "From " line outf.write(mboxutils.as_string(msg, True)) if options["Headers", "include_trained"]: outf.seek(0) try: os.ftruncate(f.fileno(), 0) f.seek(0) except: # If anything goes wrong, don't try to write print "Problem truncating mbox--nothing written" raise try: for line in outf.xreadlines(): f.write(line) except: print >> sys.stderr ("Problem writing mbox! Sorry, " "I tried my best, but your mail " "may be corrupted.") raise fcntl.flock(f, fcntl.LOCK_UN) f.close() if loud: sys.stdout.write("\r%6d" % counter) sys.stdout.write("\r Trained %d out of %d messages\n" % (trained, counter))
def mbox_train(h, path, is_spam, force): """Train bayes with a Unix mbox""" if loud: print " Reading as Unix mbox" import mailbox import fcntl # Open and lock the mailbox. Some systems require it be opened for # writes in order to assert an exclusive lock. f = file(path, "r+b") fcntl.flock(f, fcntl.LOCK_EX) mbox = mailbox.PortableUnixMailbox(f, get_message) outf = os.tmpfile() counter = 0 trained = 0 for msg in mbox: if not msg: print "Malformed message number %d. I can't train on this mbox, sorry." % counter return counter += 1 if loud and counter % 10 == 0: sys.stdout.write("\r%6d" % counter) sys.stdout.flush() if msg_train(h, msg, is_spam, force): trained += 1 if options["Headers", "include_trained"]: # Write it out with the Unix "From " line outf.write(mboxutils.as_string(msg, True)) if options["Headers", "include_trained"]: outf.seek(0) try: os.ftruncate(f.fileno(), 0) f.seek(0) except: # If anything goes wrong, don't try to write print "Problem truncating mbox--nothing written" raise try: for line in outf.xreadlines(): f.write(line) except: print >> sys.stderr("Problem writing mbox! Sorry, " "I tried my best, but your mail " "may be corrupted.") raise fcntl.flock(f, fcntl.LOCK_UN) f.close() if loud: sys.stdout.write("\r%6d" % counter) sys.stdout.write("\r Trained %d out of %d messages\n" % (trained, counter))
def msg_train(h, msg, is_spam, force):
    """Train bayes with a single message.

    The message's "trained" header records how it was last trained.  A
    message already trained the requested way is skipped; one trained
    the other way is untrained first.  With ``force`` the old header is
    simply dropped and the message is trained regardless.

    Returns:
        True when the message was trained, False when it was skipped
        (already trained this way, or ``mboxutils.as_string`` raised
        TypeError for it).
    """
    try:
        # Probe only: a message that cannot be rendered is not trained.
        mboxutils.as_string(msg)
    except TypeError:
        return False

    if is_spam:
        spamtxt = options["Headers", "header_spam_string"]
    else:
        spamtxt = options["Headers", "header_ham_string"]
    header_name = options["Headers", "trained_header_name"]
    oldtxt = msg.get(header_name)

    if force:
        if oldtxt is not None:
            del msg[header_name]
    elif oldtxt == spamtxt:
        return False
    elif oldtxt is not None:
        # Previously trained as the opposite class: undo that first.
        del msg[header_name]
        h.untrain(msg, not is_spam)

    h.train(msg, is_spam)
    msg.add_header(header_name, spamtxt)
    return True
def maildir_train(h, path, is_spam, force, removetrained): """Train bayes with all messages from a maildir.""" if loud: print " Reading %s as Maildir" % (path, ) import time import socket pid = os.getpid() host = socket.gethostname() counter = 0 trained = 0 for fn in os.listdir(path): cfn = os.path.join(path, fn) tfn = os.path.normpath( os.path.join(path, "..", "tmp", "%d.%d_%d.%s" % (time.time(), pid, counter, host))) if (os.path.isdir(cfn)): continue counter += 1 if loud and counter % 10 == 0: sys.stdout.write("\r%6d" % counter) sys.stdout.flush() f = file(cfn, "rb") msg = get_message(f) f.close() if not msg: print "Malformed message: %s. Skipping..." % cfn continue if not msg_train(h, msg, is_spam, force): continue trained += 1 if not options["Headers", "include_trained"]: continue f = file(tfn, "wb") f.write(mboxutils.as_string(msg)) f.close() shutil.copystat(cfn, tfn) # XXX: This will raise an exception on Windows. Do any Windows # people actually use Maildirs? os.rename(tfn, cfn) if (removetrained): os.unlink(cfn) if loud: sys.stdout.write("\r%6d" % counter) sys.stdout.write("\r Trained %d out of %d messages\n" % (trained, counter))
def maildir_train(h, path, is_spam, force, removetrained): """Train bayes with all messages from a maildir.""" if loud: print " Reading %s as Maildir" % (path,) import time import socket pid = os.getpid() host = socket.gethostname() counter = 0 trained = 0 for fn in os.listdir(path): cfn = os.path.join(path, fn) tfn = os.path.normpath(os.path.join(path, "..", "tmp", "%d.%d_%d.%s" % (time.time(), pid, counter, host))) if (os.path.isdir(cfn)): continue counter += 1 if loud and counter % 10 == 0: sys.stdout.write("\r%6d" % counter) sys.stdout.flush() f = file(cfn, "rb") msg = get_message(f) f.close() if not msg: print "Malformed message: %s. Skipping..." % cfn continue if not msg_train(h, msg, is_spam, force): continue trained += 1 if not options["Headers", "include_trained"]: continue f = file(tfn, "wb") f.write(mboxutils.as_string(msg)) f.close() shutil.copystat(cfn, tfn) # XXX: This will raise an exception on Windows. Do any Windows # people actually use Maildirs? os.rename(tfn, cfn) if (removetrained): os.unlink(cfn) if loud: sys.stdout.write("\r%6d" % counter) sys.stdout.write("\r Trained %d out of %d messages\n" % (trained, counter))
def main():
    """Command-line entry point for the hammie filter.

    Parses ``sys.argv``, builds the list of ``HammieFilter`` actions to
    apply (defaulting to plain filtering), runs them on every message of
    every named mailbox ("-" if none is given) and writes each resulting
    message to standard output.  ``-n`` creates a new database and exits.
    """
    h = HammieFilter()
    actions = []
    opts, args = getopt.getopt(sys.argv[1:], 'hxd:p:nfgstGSo:',
                               ['help', 'examples', 'option='])

    # Switches that simply queue one HammieFilter method.
    action_methods = {
        '-f': 'filter',
        '-g': 'train_ham',
        '-s': 'train_spam',
        '-t': 'filter_train',
        '-G': 'untrain_ham',
        '-S': 'untrain_spam',
    }
    create_newdb = False
    for flag, value in opts:
        if flag in action_methods:
            actions.append(getattr(h, action_methods[flag]))
        elif flag in ('-h', '--help'):
            usage(0)
        elif flag in ('-x', '--examples'):
            examples()
        elif flag in ('-o', '--option'):
            Options.options.set_from_cmdline(value, sys.stderr)
        elif flag == "-n":
            create_newdb = True
    h.dbname, h.usedb = storage.database_type(opts)

    if create_newdb:
        h.newdb()
        sys.exit(0)

    if not actions:
        actions = [h.filter]
    if not args:
        args = ["-"]

    for fname in args:
        for msg in mboxutils.getmbox(fname):
            for act in actions:
                act(msg)
            # For "-", keep a "From " line only if the message had one.
            if args == ["-"]:
                unixfrom = msg.get_unixfrom() is not None
            else:
                unixfrom = True
            sys.stdout.write(mboxutils.as_string(msg, unixfrom=unixfrom))
def mbox_train(h, path, is_spam, force):
    """Train bayes with a Unix mbox.

    The mbox at ``path`` is opened read/write and exclusively locked with
    ``fcntl.flock`` while it is processed.  Each message goes through
    ``msg_train``; when the "include_trained" header option is set, the
    messages are collected in a temporary file and the mbox is rewritten
    from it once every message has been read.

    If a malformed (falsy) message is met, an error is printed and the
    function returns at once, leaving the mbox untouched.  The lock is
    released and the file closed on every exit path.
    """
    if loud:
        print(" Reading as Unix mbox")

    import mailbox
    import fcntl

    # Open and lock the mailbox.  Some systems require it be opened for
    # writes in order to assert an exclusive lock.
    f = file(path, "r+b")
    try:
        fcntl.flock(f, fcntl.LOCK_EX)
        try:
            mbox = mailbox.PortableUnixMailbox(f, get_message)

            outf = os.tmpfile()
            counter = 0
            trained = 0

            for msg in mbox:
                if not msg:
                    print("Malformed message number %d. I can't train on this mbox, sorry." % counter)
                    return
                counter += 1
                if loud and counter % 10 == 0:
                    sys.stdout.write("\r%6d" % counter)
                    sys.stdout.flush()
                if msg_train(h, msg, is_spam, force):
                    trained += 1
                if options["Headers", "include_trained"]:
                    # Write it out with the Unix "From " line
                    outf.write(mboxutils.as_string(msg, True))

            if options["Headers", "include_trained"]:
                outf.seek(0)
                try:
                    os.ftruncate(f.fileno(), 0)
                    f.seek(0)
                except:
                    # If anything goes wrong, don't try to write
                    print("Problem truncating mbox--nothing written")
                    raise
                try:
                    for line in outf:
                        f.write(line)
                except:
                    # The message is the positional argument and the
                    # stream goes in file=; the original called
                    # sys.stderr(...) and raised TypeError here.
                    print("Problem writing mbox! Sorry, "
                          "I tried my best, but your mail "
                          "may be corrupted.", file=sys.stderr)
                    raise
        finally:
            fcntl.flock(f, fcntl.LOCK_UN)
    finally:
        f.close()

    if loud:
        sys.stdout.write("\r%6d" % counter)
        sys.stdout.write("\r Trained %d out of %d messages\n" %
                         (trained, counter))
def mhdir_train(h, path, is_spam, force): """Train bayes with an mh directory""" if loud: print " Reading as MH mailbox" import glob counter = 0 trained = 0 for fn in glob.glob(os.path.join(path, "[0-9]*")): counter += 1 cfn = fn tfn = os.path.join(path, "spambayes.tmp") if loud and counter % 10 == 0: sys.stdout.write("\r%6d" % counter) sys.stdout.flush() f = file(fn, "rb") msg = get_message(f) f.close() if not msg: print "Malformed message: %s. Skipping..." % cfn continue msg_train(h, msg, is_spam, force) trained += 1 if not options["Headers", "include_trained"]: continue f = file(tfn, "wb") f.write(mboxutils.as_string(msg)) f.close() shutil.copystat(cfn, tfn) # XXX: This will raise an exception on Windows. Do any Windows # people actually use MH directories? os.rename(tfn, cfn) if loud: sys.stdout.write("\r%6d" % counter) sys.stdout.write("\r Trained %d out of %d messages\n" % (trained, counter))
def main(profiling=False):
    """Command-line entry point: parse options and run actions on mail.

    Builds the list of ``HammieFilter`` actions requested on the command
    line (plain filtering if none), applies them to every message of
    every named mailbox ("-" if none is given) and writes each resulting
    message to standard output.  A database is created when ``-n`` is
    given or the database path does not exist; ``-n`` then exits.

    Args:
        profiling: True when already running under cProfile (the ``-P``
            re-entry), which stops ``-P`` from recursing.

    Returns:
        The result of ``cProfile.run`` when ``-P`` triggers a profiled
        re-run, otherwise None.
    """
    h = HammieFilter()
    actions = []
    opts, args = getopt.getopt(sys.argv[1:], 'hvxd:p:nfgstGSo:P',
                               ['help', 'version', 'examples', 'option='])
    create_newdb = False
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-v', '--version'):
            version()
        elif opt in ('-x', '--examples'):
            examples()
        elif opt in ('-o', '--option'):
            Options.options.set_from_cmdline(arg, sys.stderr)
        elif opt == '-f':
            actions.append(h.filter)
        elif opt == '-g':
            actions.append(h.train_ham)
        elif opt == '-s':
            actions.append(h.train_spam)
        elif opt == '-t':
            actions.append(h.filter_train)
        elif opt == '-G':
            actions.append(h.untrain_ham)
        elif opt == '-S':
            actions.append(h.untrain_spam)
        elif opt == '-P':
            if not profiling:
                try:
                    import cProfile
                except ImportError:
                    # No profiler available: carry on unprofiled.
                    pass
                else:
                    # Re-run main under the profiler; the nested call
                    # parses sys.argv again with profiling=True.
                    return cProfile.run("main(True)")
        elif opt == "-n":
            create_newdb = True
    h.dbname, h.usedb = storage.database_type(opts)

    if create_newdb or not os.path.exists(h.dbname):
        h.newdb()
        print >> sys.stderr, "Created new database in", h.dbname
        if create_newdb:
            sys.exit(0)

    if not actions:
        actions = [h.filter]
    if not args:
        args = ["-"]
    for fname in args:
        mbox = mboxutils.getmbox(fname)
        for msg in mbox:
            for action in actions:
                action(msg)
            if args == ["-"]:
                # For "-", keep a "From " line only if one was present.
                unixfrom = msg.get_unixfrom() is not None
            else:
                unixfrom = True
            result = mboxutils.as_string(msg, unixfrom=unixfrom)
            sys.stdout.write(result)
def score_and_filter(self, msg, header=None, spam_cutoff=None,
                     ham_cutoff=None, debugheader=None,
                     debug=None, train=None):
    """Score (judge) a message and add a disposition header.

    msg can be a string, a file object, or a Message object.

    Optionally, set header to the name of the header to add, and/or
    spam_cutoff/ham_cutoff to the probability thresholds: a score
    strictly below ham_cutoff gets the ham disposition, a score strictly
    above spam_cutoff gets the spam disposition, and anything else gets
    the unsure disposition.

    An extra debugging header can be added if 'debug' is set to True.
    The name of the debugging header is given as 'debugheader'.

    If 'train' is True, also train on the result of scoring the
    message (ie. train as ham if it's ham, train as spam if it's
    spam).  If the message already has a trained header, it will be
    untrained first.  You'll want to be very diligent about
    retraining mistakes if you use this option.

    All defaults for optional parameters come from the Options file.

    Returns the score and the same message, as a string, with a new
    disposition header.
    """
    if header is None:
        header = options["Headers", "classification_header_name"]
    if spam_cutoff is None:
        spam_cutoff = options["Categorization", "spam_cutoff"]
    if ham_cutoff is None:
        ham_cutoff = options["Categorization", "ham_cutoff"]
    if debugheader is None:
        debugheader = options["Headers", "evidence_header_name"]
    if debug is None:
        debug = options["Headers", "include_evidence"]
    if train is None:
        train = options["Hammie", "train_on_filter"]

    msg = mboxutils.get_message(msg)
    # Drop any stale disposition header before scoring.
    try:
        del msg[header]
    except KeyError:
        pass
    if train:
        self.untrain_from_header(msg)
    prob, clues = self._scoremsg(msg, True)
    if prob < ham_cutoff:
        is_spam = False
        disp = options["Headers", "header_ham_string"]
    elif prob > spam_cutoff:
        is_spam = True
        disp = options["Headers", "header_spam_string"]
    else:
        # Unsure messages are treated as not-spam for training below.
        is_spam = False
        disp = options["Headers", "header_unsure_string"]
    if train:
        self.train(msg, is_spam, True)

    basic_disp = disp
    disp += "; %.*f" % (options["Headers", "header_score_digits"], prob)
    if options["Headers", "header_score_logarithm"]:
        # Import once for both branches: a function-level import makes
        # "math" a local name, so importing it in only one branch left
        # it unbound in the other.
        import math
        if prob <= 0.005 and prob > 0.0:
            x = -math.log10(prob)
            disp += " (%d)" % x
        if prob >= 0.995 and prob < 1.0:
            x = -math.log10(1.0 - prob)
            disp += " (%d)" % x
    del msg[header]
    msg.add_header(header, disp)

    # Obey notate_to and notate_subject.
    for notated in ('to', 'subject'):
        if basic_disp in options["Headers", "notate_" + notated]:
            orig = msg[notated]
            del msg[notated]
            if orig is None:
                # Header was absent: don't emit a literal "None".
                msg[notated] = basic_disp
            else:
                msg[notated] = "%s,%s" % (basic_disp, orig)

    if debug:
        disp = self.formatclues(clues)
        del msg[debugheader]
        msg.add_header(debugheader, disp)
    result = mboxutils.as_string(msg, unixfrom=(msg.get_unixfrom()
                                                is not None))
    return prob, result
def score_and_filter(self, msg, header=None, spam_cutoff=None,
                     ham_cutoff=None, debugheader=None,
                     debug=None, train=None):
    """Score (judge) a message and add a disposition header.

    msg can be a string, a file object, or a Message object.

    Optionally, set header to the name of the header to add, and/or
    spam_cutoff/ham_cutoff to the probability thresholds: a score
    strictly below ham_cutoff gets the ham disposition, a score strictly
    above spam_cutoff gets the spam disposition, and anything else gets
    the unsure disposition.

    An extra debugging header can be added if 'debug' is set to True.
    The name of the debugging header is given as 'debugheader'.

    If 'train' is True, also train on the result of scoring the
    message (ie. train as ham if it's ham, train as spam if it's
    spam).  If the message already has a trained header, it will be
    untrained first.  You'll want to be very diligent about
    retraining mistakes if you use this option.

    All defaults for optional parameters come from the Options file.

    Returns the score and the same message, as a string, with a new
    disposition header.
    """
    if header is None:
        header = options["Headers", "classification_header_name"]
    if spam_cutoff is None:
        spam_cutoff = options["Categorization", "spam_cutoff"]
    if ham_cutoff is None:
        ham_cutoff = options["Categorization", "ham_cutoff"]
    if debugheader is None:
        debugheader = options["Headers", "evidence_header_name"]
    if debug is None:
        debug = options["Headers", "include_evidence"]
    if train is None:
        train = options["Hammie", "train_on_filter"]

    msg = mboxutils.get_message(msg)
    # Remove any disposition header left by an earlier run.
    try:
        del msg[header]
    except KeyError:
        pass
    if train:
        self.untrain_from_header(msg)
    prob, clues = self._scoremsg(msg, True)
    if prob < ham_cutoff:
        is_spam = False
        disp = options["Headers", "header_ham_string"]
    elif prob > spam_cutoff:
        is_spam = True
        disp = options["Headers", "header_spam_string"]
    else:
        # Unsure: trained (if at all) as not-spam.
        is_spam = False
        disp = options["Headers", "header_unsure_string"]
    if train:
        self.train(msg, is_spam, True)

    basic_disp = disp
    disp += "; %.*f" % (options["Headers", "header_score_digits"], prob)
    if options["Headers", "header_score_logarithm"]:
        # One import serves both the near-0 and the near-1 branch.
        import math
        if 0.0 < prob <= 0.005:
            disp += " (%d)" % -math.log10(prob)
        if 0.995 <= prob < 1.0:
            disp += " (%d)" % -math.log10(1.0 - prob)
    del msg[header]
    msg.add_header(header, disp)

    # Prefix To/Subject with the bare disposition where configured.  A
    # separate loop name keeps the 'header' argument intact.
    for notated in ('to', 'subject'):
        if basic_disp in options["Headers", "notate_" + notated]:
            orig = msg[notated]
            del msg[notated]
            if orig is None:
                # Header was absent: don't emit a literal "None".
                msg[notated] = basic_disp
            else:
                msg[notated] = "%s,%s" % (basic_disp, orig)

    if debug:
        disp = self.formatclues(clues)
        del msg[debugheader]
        msg.add_header(debugheader, disp)
    result = mboxutils.as_string(msg, unixfrom=(msg.get_unixfrom()
                                                is not None))
    return prob, result