def setup(self):
    """Install a (possibly proxied) urllib opener and set up the URL caches.

    Reads proxy and cache settings from the global options, installs a
    global urllib opener, creates/expires the retrieved-URL corpus, and
    loads the bad-URL / HTTP-error pickles (starting fresh when missing
    or corrupt).
    """
    # Imported here, not at module top, because FileCorpus imports this
    # module (circular import).
    from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory
    import urllib.request

    username = options["globals", "proxy_username"]
    password = options["globals", "proxy_password"]
    server = options["globals", "proxy_server"]
    if server.find(":") != -1:
        server, port = server.split(':', 1)
        # split() yields a string; the "%d" format below needs an int.
        port = int(port)
    else:
        port = 8080
    if server:
        # Build a new opener that authenticates against the proxy.
        # (The original referenced urllib2.HTTPHandler, which does not
        # exist under Python 3 -- urllib.request is used throughout.)
        proxy_support = urllib.request.ProxyHandler(
            {"http": "http://%s:%s@%s:%d" % (username, password,
                                             server, port)})
        opener = urllib.request.build_opener(proxy_support,
                                             urllib.request.HTTPHandler)
    else:
        # Build a new opener without any proxy information.
        opener = urllib.request.build_opener(urllib.request.HTTPHandler)
    # Install it globally so all urlopen() calls go through it.
    urllib.request.install_opener(opener)

    # Set up the cache for retrieved urls.
    age = options["URLRetriever", "x-cache_expiry_days"] * 24 * 60 * 60
    dir = options["URLRetriever", "x-cache_directory"]
    if not os.path.exists(dir):
        if options["globals", "verbose"]:
            print("Creating URL cache directory", file=sys.stderr)
        os.makedirs(dir)
    self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(),
                                      dir, cacheSize=20)
    # Kill any expired information in the cache.
    self.urlCorpus.removeExpiredMessages()

    # Set up caches for unretrievable urls.
    self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
    self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
    if os.path.exists(self.bad_url_cache_name):
        try:
            self.bad_urls = pickle_read(self.bad_url_cache_name)
        except (IOError, ValueError):
            # Something went wrong loading it (bad pickle, probably).
            # Start afresh.
            if options["globals", "verbose"]:
                print("Bad URL pickle, using new.", file=sys.stderr)
            self.bad_urls = {"url:non_resolving": (),
                             "url:non_html": (),
                             "url:unknown_error": ()}
    else:
        if options["globals", "verbose"]:
            print("URL caches don't exist: creating")
        self.bad_urls = {"url:non_resolving": (),
                         "url:non_html": (),
                         "url:unknown_error": ()}
    if os.path.exists(self.http_error_cache_name):
        try:
            self.http_error_urls = pickle_read(self.http_error_cache_name)
        except (IOError, ValueError):
            # Fixed: "except IOError as ValueError" only caught IOError
            # and shadowed the ValueError builtin; both are meant to be
            # caught, matching the bad_urls handler above.
            if options["globals", "verbose"]:
                print("Bad HTTP error pickle, using new.", file=sys.stderr)
            self.http_error_urls = {}
    else:
        self.http_error_urls = {}
def __init__(self, dnsServer=None, cachefile=""):
    """Create a DNS cache, optionally persisted to *cachefile*.

    When *dnsServer* is None the platform's name servers are discovered;
    otherwise queries go to the given server.
    """
    # User-settable attributes.
    self.printStatsAtEnd = False
    # Reverse ("PTR") lookups return a single name unless this is False.
    self.returnSinglePTR = True
    # How long to cache an error as "no data" (seconds).
    self.cacheErrorSecs = 5 * 60
    # How long to wait for the DNS server (seconds).
    self.dnsTimeout = 10

    self.cachefile = os.path.expanduser(cachefile)
    self.caches = None
    if self.cachefile and os.path.exists(self.cachefile):
        try:
            self.caches = pickle_read(self.cachefile)
        except Exception:
            # Unreadable/corrupt cache: delete it and rebuild from
            # scratch.  (Was a bare "except:", which also swallowed
            # KeyboardInterrupt/SystemExit.)
            os.unlink(self.cachefile)
    if self.caches is None:
        self.caches = {"A": {}, "PTR": {}}
    if options["globals", "verbose"]:
        if self.caches["A"] or self.caches["PTR"]:
            print("opened existing cache with", end=' ', file=sys.stderr)
            print(len(self.caches["A"]), "A records", end=' ', file=sys.stderr)
            print("and", len(self.caches["PTR"]), end=' ', file=sys.stderr)
            print("PTR records", file=sys.stderr)
        else:
            print("opened new cache", file=sys.stderr)
    # Statistics counters.
    self.hits = 0
    self.misses = 0
    self.pruneTicker = 0
    if dnsServer is None:
        DNS.DiscoverNameServers()
        self.queryObj = DNS.DnsRequest()
    else:
        self.queryObj = DNS.DnsRequest(server=dnsServer)
def main(args):
    """Command-line driver: map messages from mailboxes into the database."""
    try:
        opts, args = getopt.getopt(args, "hd:t:",
                                   ["type=", "help", "database="])
    except getopt.GetoptError as msg:
        usage(msg)
        return 1
    mapfile = mboxtype = None
    for flag, value in opts:
        if flag in ("-h", "--help"):
            usage()
            return 0
        if flag in ("-d", "--database"):
            mapfile = value
        elif flag in ("-t", "--type"):
            mboxtype = value
    # Validate the required options before touching the database.
    if mapfile is None:
        usage("'-d mapfile' is required")
        return 1
    if mboxtype is None:
        usage("'-t ham|spam' is required")
        return 1
    if mboxtype not in ("ham", "spam"):
        usage("mboxtype must be 'ham' or 'spam'")
        return 1
    # Load the existing map, starting fresh when the file is absent.
    try:
        mapd = pickle_read(mapfile)
    except IOError:
        mapd = {}
    for mbox in args:
        mapmessages(mbox, mboxtype, mapd)
    pickle_write(mapfile, mapd)
def main(args):
    """Extract ham/spam messages whose evidence matches the given features.

    Features come from -f options or, failing that, are harvested from
    X-Spambayes-Evidence headers of the message files given as arguments.
    """
    try:
        opts, args = getopt.getopt(args, "hd:S:H:f:",
                                   ["help", "database=", "spamfile=",
                                    "hamfile=", "feature="])
    except getopt.GetoptError as msg:
        usage(msg)
        return 1
    charset = locale.getdefaultlocale()[1]
    if not charset:
        charset = 'us-ascii'
    mapfile = spamfile = hamfile = None
    features = set()
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-d", "--database"):
            mapfile = arg
        elif opt in ("-H", "--hamfile"):
            hamfile = arg
        elif opt in ("-S", "--spamfile"):
            spamfile = arg
        elif opt in ("-f", "--feature"):
            # getopt hands back str under Python 3; str(arg, charset)
            # raises TypeError on str input, so only decode raw bytes.
            if isinstance(arg, bytes):
                arg = arg.decode(charset)
            features.add(arg)
    if hamfile is None and spamfile is None:
        usage("At least one of -S or -H are required")
        return 1
    if mapfile is None:
        usage("'-d mapfile' is required")
        return 1
    try:
        mapd = pickle_read(mapfile)
    except IOError:
        usage("Mapfile %s does not exist" % mapfile)
        return 1
    if not features and not args:
        usage("Require at least one feature (-f) arg or one message file")
        return 1
    if not features:
        # Extract significant tokens from each message's evidence header.
        for f in args:
            for msg in getmbox(f):
                evidence = msg.get("X-Spambayes-Evidence", "")
                evidence = re.sub(r"\s+", " ", evidence)
                l = [e.rsplit(": ", 1)[0]
                     for e in evidence.split("; ")[2:]]
                for s in l:
                    try:
                        # Header.__unicode__() no longer exists under
                        # Python 3; str() performs the same conversion.
                        s = str(make_header(decode_header(s)))
                    except Exception:
                        # str(s, ...) would fail when s is already str;
                        # only decode genuine bytes.
                        if isinstance(s, bytes):
                            s = s.decode('us-ascii', 'replace')
                    features.add(s)
        if not features:
            usage("No X-Spambayes-Evidence headers found")
            return 1
    # The 'file' builtin is gone in Python 3 -- open() is the equivalent.
    if spamfile is not None:
        spamfile = open(spamfile, "w")
    if hamfile is not None:
        hamfile = open(hamfile, "w")
    extractmessages(features, mapd, hamfile, spamfile)
def load(self):
    """Load the pickled database, starting empty when the file is missing.

    Raises the original IOError for anything other than a missing file.
    """
    try:
        self.db = pickle_read(self.db_name)
    except IOError as e:
        # "except IOError, e" is Python-2-only syntax; "as e" is the
        # form valid here.
        if e.errno == errno.ENOENT:
            # No database on disk yet -- begin with an empty one.
            self.db = {}
        else:
            raise
def load(self):
    """Load the pickled database, starting empty when the file is missing.

    Raises the original IOError for anything other than a missing file.
    """
    try:
        self.db = pickle_read(self.db_name)
    except IOError as e:
        # "except IOError, e" is Python-2-only syntax; "as e" is the
        # form valid here.
        if e.errno == errno.ENOENT:
            # New pickle
            self.db = {}
        else:
            raise
def __init__(self, dnsServer=None, cachefile=""):
    """Create a DNS cache, optionally persisted to *cachefile*.

    Ported to Python 3 print functions to match the rest of the file.
    """
    # These attributes intended for user setting
    self.printStatsAtEnd = False
    # As far as I can tell from the standards, it's legal to have more
    # than one PTR record for an address -- i.e. a reverse lookup on an
    # IP may return several names.  That is rarely done and rarely
    # expected, so forward ("A") lookups always return a list, while
    # reverse ("PTR") lookups return a single name unless this
    # attribute is set to False.
    self.returnSinglePTR = True
    # How long to cache an error as no data (seconds).
    self.cacheErrorSecs = 5 * 60
    # How long to wait for the server (seconds).
    self.dnsTimeout = 10
    # end of user-settable attributes

    self.cachefile = os.path.expanduser(cachefile)
    self.caches = None
    if self.cachefile and os.path.exists(self.cachefile):
        try:
            self.caches = pickle_read(self.cachefile)
        except Exception:
            # Unreadable cache: discard it and rebuild.  (Was a bare
            # "except:", which also swallowed KeyboardInterrupt.)
            os.unlink(self.cachefile)
    if self.caches is None:
        self.caches = {"A": {}, "PTR": {}}
    if options["globals", "verbose"]:
        if self.caches["A"] or self.caches["PTR"]:
            print("opened existing cache with", end=' ', file=sys.stderr)
            print(len(self.caches["A"]), "A records", end=' ', file=sys.stderr)
            print("and", len(self.caches["PTR"]), end=' ', file=sys.stderr)
            print("PTR records", file=sys.stderr)
        else:
            print("opened new cache", file=sys.stderr)
    # These two for statistics.
    self.hits = 0
    self.misses = 0
    self.pruneTicker = 0
    if dnsServer is None:
        DNS.DiscoverNameServers()
        self.queryObj = DNS.DnsRequest()
    else:
        self.queryObj = DNS.DnsRequest(server=dnsServer)
def __init__(self, dnsServer=None, cachefile=""):
    """Create a DNS cache, optionally persisted to *cachefile*.

    Ported to Python 3 print functions to match the rest of the file.
    """
    # These attributes intended for user setting
    self.printStatsAtEnd = False
    # As far as I can tell from the standards, it's legal to have more
    # than one PTR record for an address -- i.e. a reverse lookup on an
    # IP may return several names.  That is rarely done and rarely
    # expected, so forward ("A") lookups always return a list, while
    # reverse ("PTR") lookups return a single name unless this
    # attribute is set to False.
    self.returnSinglePTR = True
    # How long to cache an error as no data (seconds).
    self.cacheErrorSecs = 5 * 60
    # How long to wait for the server (seconds).
    self.dnsTimeout = 10
    # end of user-settable attributes

    self.cachefile = os.path.expanduser(cachefile)
    self.caches = None
    if self.cachefile and os.path.exists(self.cachefile):
        try:
            self.caches = pickle_read(self.cachefile)
        except Exception:
            # Unreadable cache: discard it and rebuild.  (Was a bare
            # "except:", which also swallowed KeyboardInterrupt.)
            os.unlink(self.cachefile)
    if self.caches is None:
        self.caches = {"A": {}, "PTR": {}}
    if options["globals", "verbose"]:
        if self.caches["A"] or self.caches["PTR"]:
            print("opened existing cache with", end=' ', file=sys.stderr)
            print(len(self.caches["A"]), "A records", end=' ', file=sys.stderr)
            print("and", len(self.caches["PTR"]), end=' ', file=sys.stderr)
            print("PTR records", file=sys.stderr)
        else:
            print("opened new cache", file=sys.stderr)
    # These two for statistics.
    self.hits = 0
    self.misses = 0
    self.pruneTicker = 0
    if dnsServer is None:
        DNS.DiscoverNameServers()
        self.queryObj = DNS.DnsRequest()
    else:
        self.queryObj = DNS.DnsRequest(server=dnsServer)
def __init__(self, cachefile=""):
    """Open the lookup cache, loading any previously pickled contents."""
    path = os.path.expanduser(cachefile)
    self.cachefile = path
    # Start from the on-disk pickle when one exists, otherwise empty.
    self.cache = pickle_read(path) if os.path.exists(path) else {}
    self.hits = 0
    self.misses = 0
    # Persist the cache at interpreter exit when a file name was given.
    if path:
        atexit.register(self.close)
    self.engine = None
def run(bdbname, useDBM, ldbname, rdbname, foldname, doTrain, doClassify,
        pwd, idxname, logname):
    # Open (or create) the bayes training store.
    bayes = storage.open_storage(bdbname, useDBM)
    # Load the index of already-processed documents; a missing file just
    # means this is the first run.
    try:
        notesindex = pickle_read(idxname)
    except IOError, e:
        # Only a missing file is tolerated; re-raise anything else.
        if e.errno != errno.ENOENT:
            raise
        notesindex = {}
        print "%s file not found, this is a first time run" % (idxname,)
        print "No classification will be performed"
    # NOTE(review): this chunk appears truncated -- bayes and notesindex
    # are built but never used here; compare the fuller run() variant
    # elsewhere in this file before relying on this copy.
def load(self):
    """Load this instance from the pickle.

    On any read failure the classifier starts fresh with empty word info
    and zero ham/spam counts.
    """
    if options["globals", "verbose"]:
        print("Loading state from", self.db_name, "pickle", file=sys.stderr)
    try:
        tempbayes = pickle_read(self.db_name)
    except Exception:
        # Missing or corrupt pickle: fall through and start fresh.
        # (Was a bare "except:", which also swallowed KeyboardInterrupt
        # and SystemExit.)
        tempbayes = None
    if tempbayes:
        # Adopt the loaded classifier's state wholesale.
        classifier.Classifier.__setstate__(self, tempbayes.__getstate__())
        if options["globals", "verbose"]:
            print(
                ("%s is an existing pickle,"
                 " with %d ham and %d spam") %
                (self.db_name, self.nham, self.nspam),
                file=sys.stderr,
            )
    else:
        if options["globals", "verbose"]:
            print(self.db_name, "is a new pickle", file=sys.stderr)
        self.wordinfo = {}
        self.nham = 0
        self.nspam = 0
def run(bdbname, useDBM, ldbname, rdbname, foldname, doTrain, doClassify,
        pwd, idxname, logname):
    """Train and/or classify mail in a Lotus Notes database via COM."""
    # Open (or create) the bayes training store.
    bayes = storage.open_storage(bdbname, useDBM)
    # Index of already-processed documents; a missing file just means
    # this is the first run.
    try:
        notesindex = pickle_read(idxname)
    except IOError as e:
        if e.errno != errno.ENOENT:
            raise
        notesindex = {}
        print("%s file not found, this is a first time run" % (idxname,))
        print("No classification will be performed")
    need_replicate = False
    # Attach to the Notes client through COM and log in.
    sess = win32com.client.Dispatch("Lotus.NotesSession")
    try:
        if pwd:
            sess.initialize(pwd)
        else:
            sess.initialize()
    except pywintypes.com_error:
        print("Session aborted")
        sys.exit()
    try:
        db = sess.GetDatabase(rdbname, ldbname)
    except pywintypes.com_error:
        if rdbname:
            # Remote open failed: fall back to the local replica and
            # remember to replicate changes back afterwards.
            print("Could not open database remotely, trying locally")
            try:
                db = sess.GetDatabase("", ldbname)
                need_replicate = True
            except pywintypes.com_error:
                print("Could not open database")
                sys.exit()
        else:
            raise
    # Best-effort Notes logging; continue without it on failure.
    log = sess.CreateLog("SpambayesAgentLog")
    try:
        log.OpenNotesLog("", logname)
    except pywintypes.com_error:
        print("Could not open log")
        log = None
    if log:
        log.LogAction("Running spambayes")
    # Views used for classification and training.
    # NOTE(review): "\S", "\H", "\T" are not recognised escapes -- Python
    # keeps the backslash literally, which appears intended for the Notes
    # folder paths; confirm before switching to raw strings.
    vinbox = db.getView('($Inbox)')
    vspam = db.getView("%s\Spam" % (foldname,))
    vham = db.getView("%s\Ham" % (foldname,))
    vtrainspam = db.getView("%s\Train as Spam" % (foldname,))
    vtrainham = db.getView("%s\Train as Ham" % (foldname,))
    if doTrain:
        # Train on the user-sorted folders, moving messages as we go.
        processAndTrain(vtrainspam, vspam, bayes, True, notesindex, log)
        processAndTrain(vtrainham, vham, bayes, False, notesindex, log)
    if need_replicate:
        # Push local training changes back to the remote replica.
        try:
            print("Replicating...")
            db.Replicate(rdbname)
            print("Done")
        except pywintypes.com_error:
            print("Could not replicate")
    if doClassify:
        classifyInbox(vinbox, vtrainspam, bayes, ldbname, notesindex, log)
    print("The Spambayes database currently has %s Spam and %s Ham" \
          % (bayes.nspam, bayes.nham))
    # Persist classifier state and the processed-document index.
    bayes.store()
    pickle_write(idxname, notesindex)
    if log:
        log.LogAction("Finished running spambayes")
def loadHist(path):
    """Read a pickled histogram object from *path* and return it."""
    histogram = pickle_read(path)
    return histogram
def setup(self):
    """Install a (possibly proxied) urllib opener and set up the URL caches.

    Ported to Python 3 (print functions, urllib.request) to match the
    already-converted copy of this method elsewhere in the file.
    """
    # Can't import this at the top because it's circular.
    # XXX Someone smarter than me, please figure out the right
    # XXX way to do this.
    from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory
    import urllib.request

    username = options["globals", "proxy_username"]
    password = options["globals", "proxy_password"]
    server = options["globals", "proxy_server"]
    if server.find(":") != -1:
        server, port = server.split(':', 1)
        # split() yields a string; the "%d" format below needs an int.
        port = int(port)
    else:
        port = 8080
    if server:
        # Build a new opener that uses a proxy requiring authorization.
        proxy_support = urllib.request.ProxyHandler(
            {"http": "http://%s:%s@%s:%d" % (username, password,
                                             server, port)})
        opener = urllib.request.build_opener(proxy_support,
                                             urllib.request.HTTPHandler)
    else:
        # Build a new opener without any proxy information.
        opener = urllib.request.build_opener(urllib.request.HTTPHandler)
    # Install it globally.
    urllib.request.install_opener(opener)
    # Setup the cache for retrieved urls.
    age = options["URLRetriever", "x-cache_expiry_days"] * 24 * 60 * 60
    dir = options["URLRetriever", "x-cache_directory"]
    if not os.path.exists(dir):
        # Create the directory.
        if options["globals", "verbose"]:
            print("Creating URL cache directory", file=sys.stderr)
        os.makedirs(dir)
    self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(),
                                      dir, cacheSize=20)
    # Kill any old information in the cache.
    self.urlCorpus.removeExpiredMessages()
    # Setup caches for unretrievable urls.
    self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
    self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
    if os.path.exists(self.bad_url_cache_name):
        try:
            self.bad_urls = pickle_read(self.bad_url_cache_name)
        except (IOError, ValueError):
            # Something went wrong loading it (bad pickle, probably).
            # Start afresh.
            if options["globals", "verbose"]:
                print("Bad URL pickle, using new.", file=sys.stderr)
            self.bad_urls = {"url:non_resolving": (),
                             "url:non_html": (),
                             "url:unknown_error": ()}
    else:
        if options["globals", "verbose"]:
            print("URL caches don't exist: creating")
        self.bad_urls = {"url:non_resolving": (),
                         "url:non_html": (),
                         "url:unknown_error": ()}
    if os.path.exists(self.http_error_cache_name):
        try:
            self.http_error_urls = pickle_read(self.http_error_cache_name)
        except (IOError, ValueError):
            # Fixed: "except IOError, ValueError" bound the caught
            # IOError to the name ValueError instead of catching both.
            if options["globals", "verbose"]:
                print("Bad HTTP error pickle, using new.", file=sys.stderr)
            self.http_error_urls = {}
    else:
        self.http_error_urls = {}
hamfile = arg elif opt in ("-S", "--spamfile"): spamfile = arg elif opt in ("-f", "--feature"): features.add(unicode(arg, charset)) if hamfile is None and spamfile is None: usage("At least one of -S or -H are required") return 1 if mapfile is None: usage("'-d mapfile' is required") return 1 try: mapd = pickle_read(mapfile) except IOError: usage("Mapfile %s does not exist" % mapfile) return 1 if not features and not args: usage("Require at least one feature (-f) arg or one message file") return 1 if not features: # extract significant tokens from each message and identify # where they came from for f in args: for msg in getmbox(f): evidence = msg.get("X-Spambayes-Evidence", "") evidence = re.sub(r"\s+", " ", evidence)
def main(args):
    """Train on ham/spam piles, then score the unsure pile.

    With -b N, rescores only the N highest-scoring message groups from
    the previous best.pck; otherwise writes a fresh best.pck.
    """
    try:
        opts, args = getopt.getopt(args, "b:sh")
    except getopt.error as msg:
        usage(msg)
        return 1
    best = 0
    skipspam = False
    for opt, arg in opts:
        if opt == "-h":
            usage()
            return 0
        if opt == "-b":
            best = int(arg)
        elif opt == "-s":
            skipspam = True
    if len(args) != 3:
        usage("require ham, spam and unsure message piles")
        return 1
    ham, spam, unsure = args

    # Find a usable location for best.pck: an existing file wins,
    # otherwise the first candidate we can create.
    choices = ["best.pck"]
    if "HOME" in os.environ:
        home = os.environ["HOME"]
        choices.append(os.path.join(home, "tmp", "best.pck"))
        choices.append(os.path.join(home, "best.pck"))
    choices.append(None)
    for bestfile in choices:
        if bestfile is None:
            break
        if os.path.exists(bestfile):
            break
        # Probe writability.  The 'file' builtin is gone in Python 3;
        # open() is the equivalent.
        try:
            open(bestfile, "w").close()
        except IOError:
            pass
        else:
            os.unlink(bestfile)
            # Fixed: without this break a successful probe was thrown
            # away and the loop always fell through to None unless a
            # best.pck already existed.
            break
    if bestfile is None:
        usage("can't find a place to write best.pck file")
        return 1

    print("establish base training")
    learn(ham, h, False)
    learn(spam, h, True)
    print("scoring")
    if best:
        # Restrict scoring to the message ids of the previous top-N.
        last_scores = pickle_read(bestfile)
        last_scores = sorted(last_scores.items())
        msgids = set()
        for (k, v) in last_scores[-best:]:
            msgids.update(set(v))
    else:
        msgids = None
    scores = {}
    try:
        score(unsure, h, cls, scores, msgids, skipspam)
    except KeyboardInterrupt:
        # Partial results are still usable below.
        pass
    if not best:
        pickle_write(bestfile, scores)
    return 0
def setup(self):
    """Install a (possibly proxied) urllib opener and set up the URL caches.

    Ported to Python 3 (print functions, urllib.request) to match the
    already-converted copy of this method elsewhere in the file.
    """
    # Can't import this at the top because it's circular.
    # XXX Someone smarter than me, please figure out the right
    # XXX way to do this.
    from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory
    import urllib.request

    username = options["globals", "proxy_username"]
    password = options["globals", "proxy_password"]
    server = options["globals", "proxy_server"]
    if server.find(":") != -1:
        server, port = server.split(':', 1)
        # split() yields a string; the "%d" format below needs an int.
        port = int(port)
    else:
        port = 8080
    if server:
        # Build a new opener that uses a proxy requiring authorization.
        proxy_support = urllib.request.ProxyHandler(
            {"http": "http://%s:%s@%s:%d" % (username, password,
                                             server, port)})
        opener = urllib.request.build_opener(proxy_support,
                                             urllib.request.HTTPHandler)
    else:
        # Build a new opener without any proxy information.
        opener = urllib.request.build_opener(urllib.request.HTTPHandler)
    # Install it globally.
    urllib.request.install_opener(opener)
    # Setup the cache for retrieved urls.
    age = options["URLRetriever", "x-cache_expiry_days"] * 24 * 60 * 60
    dir = options["URLRetriever", "x-cache_directory"]
    if not os.path.exists(dir):
        # Create the directory.
        if options["globals", "verbose"]:
            print("Creating URL cache directory", file=sys.stderr)
        os.makedirs(dir)
    self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(),
                                      dir, cacheSize=20)
    # Kill any old information in the cache.
    self.urlCorpus.removeExpiredMessages()
    # Setup caches for unretrievable urls.
    self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
    self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
    if os.path.exists(self.bad_url_cache_name):
        try:
            self.bad_urls = pickle_read(self.bad_url_cache_name)
        except (IOError, ValueError):
            # Something went wrong loading it (bad pickle, probably).
            # Start afresh.
            if options["globals", "verbose"]:
                print("Bad URL pickle, using new.", file=sys.stderr)
            self.bad_urls = {"url:non_resolving": (),
                             "url:non_html": (),
                             "url:unknown_error": ()}
    else:
        if options["globals", "verbose"]:
            print("URL caches don't exist: creating")
        self.bad_urls = {"url:non_resolving": (),
                         "url:non_html": (),
                         "url:unknown_error": ()}
    if os.path.exists(self.http_error_cache_name):
        try:
            self.http_error_urls = pickle_read(self.http_error_cache_name)
        except (IOError, ValueError):
            # Fixed: "except IOError, ValueError" bound the caught
            # IOError to the name ValueError instead of catching both.
            if options["globals", "verbose"]:
                print("Bad HTTP error pickle, using new.", file=sys.stderr)
            self.http_error_urls = {}
    else:
        self.http_error_urls = {}
break try: file(bestfile, "w") except IOError: pass else: os.unlink(bestfile) if bestfile is None: usage("can't find a place to write best.pck file") return 1 print "establish base training" learn(ham, h, False) learn(spam, h, True) print "scoring" if best: last_scores = pickle_read(bestfile) last_scores = last_scores.items() last_scores.sort() msgids = set() for (k, v) in last_scores[-best:]: msgids.update(set(v)) else: msgids = None scores = {} try: score(unsure, h, cls, scores, msgids, skipspam) except KeyboardInterrupt: pass if not best: pickle_write(bestfile, scores) return 0