Example #1
 def setup(self):
     from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory
     username = options["globals", "proxy_username"]
     password = options["globals", "proxy_password"]
     server = options["globals", "proxy_server"]
     if server.find(":") != -1:
         server, port = server.split(':', 1)
     else:
         port = 8080
     if server:
         proxy_support = urllib.request.ProxyHandler({"http":
                                               "http://%s:%s@%s:%s" %
                                               (username, password,
                                                server, port)})
         opener = urllib.request.build_opener(proxy_support,
                                       urllib.request.HTTPHandler)
     else:
         opener = urllib.request.build_opener(urllib.request.HTTPHandler)
     urllib.request.install_opener(opener)
     age = options["URLRetriever", "x-cache_expiry_days"]*24*60*60
     dir = options["URLRetriever", "x-cache_directory"]
     if not os.path.exists(dir):
         if options["globals", "verbose"]:
             print("Creating URL cache directory", file=sys.stderr)
         os.makedirs(dir)
     self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(),
                                       dir, cacheSize=20)
     self.urlCorpus.removeExpiredMessages()
     self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
     self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
     if os.path.exists(self.bad_url_cache_name):
         try:
             self.bad_urls = pickle_read(self.bad_url_cache_name)
         except (IOError, ValueError):
             if options["globals", "verbose"]:
                 print("Bad URL pickle, using new.", file=sys.stderr)
             self.bad_urls = {"url:non_resolving": (),
                              "url:non_html": (),
                              "url:unknown_error": ()}
     else:
         if options["globals", "verbose"]:
             print("URL caches don't exist: creating")
         self.bad_urls = {"url:non_resolving": (),
                     "url:non_html": (),
                     "url:unknown_error": ()}
     if os.path.exists(self.http_error_cache_name):
         try:
             self.http_error_urls = pickle_read(self.http_error_cache_name)
         except (IOError, ValueError):
             if options["globals", "verbose"]:
                 print("Bad HTTP error pickle, using new.", file=sys.stderr)
             self.http_error_urls = {}
     else:
         self.http_error_urls = {}
Example #2
 def __init__(self, dnsServer=None, cachefile=""):
     self.printStatsAtEnd = False
     self.returnSinglePTR = True
     self.cacheErrorSecs = 5 * 60
     self.dnsTimeout = 10
     self.cachefile = os.path.expanduser(cachefile)
     self.caches = None
     if self.cachefile and os.path.exists(self.cachefile):
         try:
             self.caches = pickle_read(self.cachefile)
         except Exception:
             os.unlink(self.cachefile)
     if self.caches is None:
         self.caches = {"A": {}, "PTR": {}}
     if options["globals", "verbose"]:
         if self.caches["A"] or self.caches["PTR"]:
             print("opened existing cache with", end=' ', file=sys.stderr)
             print(len(self.caches["A"]), "A records", end=' ', file=sys.stderr)
             print("and", len(self.caches["PTR"]), end=' ', file=sys.stderr)
             print("PTR records", file=sys.stderr)
         else:
             print("opened new cache", file=sys.stderr)
     self.hits = 0  # These two for statistics
     self.misses = 0
     self.pruneTicker = 0
     if dnsServer is None:
         DNS.DiscoverNameServers()
         self.queryObj = DNS.DnsRequest()
     else:
         self.queryObj = DNS.DnsRequest(server=dnsServer)
     return None
Example #3
def main(args):
    try:
        opts, args = getopt.getopt(args, "hd:t:",
                                   ["type=", "help", "database="])
    except getopt.GetoptError as msg:
        usage(msg)
        return 1
    mapfile = None
    mboxtype = None
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-d", "--database"):
            mapfile = arg
        elif opt in ("-t", "--type"):
            mboxtype = arg
    if mapfile is None:
        usage("'-d mapfile' is required")
        return 1
    if mboxtype is None:
        usage("'-t ham|spam' is required")
        return 1
    if mboxtype not in ("ham", "spam"):
        usage("mboxtype must be 'ham' or 'spam'")
        return 1
    try:
        mapd = pickle_read(mapfile)
    except IOError:
        mapd = {}
    for f in args:
        mapmessages(f, mboxtype, mapd)
    pickle_write(mapfile, mapd)
Example #4
def main(args):
    try:
        opts, args = getopt.getopt(args, "hd:S:H:f:",
                                   ["help", "database=", "spamfile=",
                                    "hamfile=", "feature="])
    except getopt.GetoptError as msg:
        usage(msg)
        return 1
    charset = locale.getdefaultlocale()[1]
    if not charset:
        charset = 'us-ascii'
    mapfile = spamfile = hamfile = None
    features = set()
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-d", "--database"):
            mapfile = arg
        elif opt in ("-H", "--hamfile"):
            hamfile = arg
        elif opt in ("-S", "--spamfile"):
            spamfile = arg
        elif opt in ("-f", "--feature"):
            features.add(arg)  # getopt already yields str in Python 3
    if hamfile is None and spamfile is None:
        usage("At least one of -S or -H are required")
        return 1
    if mapfile is None:
        usage("'-d mapfile' is required")
        return 1
    try:
        mapd = pickle_read(mapfile)
    except IOError:
        usage("Mapfile %s does not exist" % mapfile)
        return 1
    if not features and not args:
        usage("Require at least one feature (-f) arg or one message file")
        return 1
    if not features:
        for f in args:
            for msg in getmbox(f):
                evidence = msg.get("X-Spambayes-Evidence", "")
                evidence = re.sub(r"\s+", " ", evidence)
                l = [e.rsplit(": ", 1)[0]
                     for e in evidence.split("; ")[2:]]
                for s in l:
                    try:
                        s = str(make_header(decode_header(s)))
                    except Exception:
                        pass  # keep the raw header fragment if it can't be decoded
                    features.add(s)
        if not features:
            usage("No X-Spambayes-Evidence headers found")
            return 1
    if spamfile is not None:
        spamfile = open(spamfile, "w")
    if hamfile is not None:
        hamfile = open(hamfile, "w")
    extractmessages(features, mapd, hamfile, spamfile)
Example #5
 def load(self):
     try:
         self.db = pickle_read(self.db_name)
     except IOError as e:
         if e.errno == errno.ENOENT:
             self.db = {}
         else:
             raise
Example #6
 def load(self):
     try:
         self.db = pickle_read(self.db_name)
     except IOError as e:
         if e.errno == errno.ENOENT:
             # New pickle
             self.db = {}
         else:
             raise
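Examples #5 and #6 show the recurring pattern in these listings: treat a missing database file as an empty one, but re-raise every other I/O failure. Below is a standalone sketch of the same idea; load_or_empty and its db_name argument are illustrative names rather than SpamBayes API, and it assumes the same pickle_read helper used throughout this page.

import errno

def load_or_empty(db_name):
    """Return the pickled mapping stored at db_name, or {} if it does not exist yet."""
    try:
        return pickle_read(db_name)
    except IOError as e:
        if e.errno != errno.ENOENT:
            raise       # propagate real I/O problems (permissions, bad media, ...)
        return {}       # no file yet: start with an empty database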
Example #7
    def __init__(self, dnsServer=None, cachefile=""):
        # These attributes intended for user setting
        self.printStatsAtEnd = False

        # As far as I can tell from the standards,
        # it's legal to have more than one PTR record
        # for an address. That is, it's legal to get
        # more than one name back when you do a
        # reverse lookup on an IP address. I don't
        # know of a use for that and I've never seen
        # it done. And I don't think that most
        # people would expect it. So forward ("A")
        # lookups always return a list. Reverse
        # ("PTR") lookups return a single name unless
        # this attribute is set to False.
        self.returnSinglePTR = True

        # How long to cache an error as no data
        self.cacheErrorSecs = 5 * 60

        # How long to wait for the server
        self.dnsTimeout = 10

        # end of user-settable attributes

        self.cachefile = os.path.expanduser(cachefile)
        self.caches = None

        if self.cachefile and os.path.exists(self.cachefile):
            try:
                self.caches = pickle_read(self.cachefile)
            except Exception:
                os.unlink(self.cachefile)

        if self.caches is None:
            self.caches = {"A": {}, "PTR": {}}

        if options["globals", "verbose"]:
            if self.caches["A"] or self.caches["PTR"]:
                print("opened existing cache with", end=' ', file=sys.stderr)
                print(len(self.caches["A"]), "A records", end=' ', file=sys.stderr)
                print("and", len(self.caches["PTR"]), end=' ', file=sys.stderr)
                print("PTR records", file=sys.stderr)
            else:
                print("opened new cache", file=sys.stderr)

        self.hits = 0  # These two for statistics
        self.misses = 0
        self.pruneTicker = 0

        if dnsServer is None:
            DNS.DiscoverNameServers()
            self.queryObj = DNS.DnsRequest()
        else:
            self.queryObj = DNS.DnsRequest(server=dnsServer)
        return None
Example #8
    def __init__(self, dnsServer=None, cachefile=""):
        # These attributes intended for user setting
        self.printStatsAtEnd = False

        # As far as I can tell from the standards,
        # it's legal to have more than one PTR record
        # for an address. That is, it's legal to get
        # more than one name back when you do a
        # reverse lookup on an IP address. I don't
        # know of a use for that and I've never seen
        # it done. And I don't think that most
        # people would expect it. So forward ("A")
        # lookups always return a list. Reverse
        # ("PTR") lookups return a single name unless
        # this attribute is set to False.
        self.returnSinglePTR = True

        # How long to cache an error as no data
        self.cacheErrorSecs = 5 * 60

        # How long to wait for the server
        self.dnsTimeout = 10

        # end of user-settable attributes

        self.cachefile = os.path.expanduser(cachefile)
        self.caches = None

        if self.cachefile and os.path.exists(self.cachefile):
            try:
                self.caches = pickle_read(self.cachefile)
            except Exception:
                os.unlink(self.cachefile)

        if self.caches is None:
            self.caches = {"A": {}, "PTR": {}}

        if options["globals", "verbose"]:
            if self.caches["A"] or self.caches["PTR"]:
                print("opened existing cache with", end=' ', file=sys.stderr)
                print(len(self.caches["A"]), "A records", end=' ', file=sys.stderr)
                print("and", len(self.caches["PTR"]), end=' ', file=sys.stderr)
                print("PTR records", file=sys.stderr)
            else:
                print("opened new cache", file=sys.stderr)

        self.hits = 0  # These two for statistics
        self.misses = 0
        self.pruneTicker = 0

        if dnsServer is None:
            DNS.DiscoverNameServers()
            self.queryObj = DNS.DnsRequest()
        else:
            self.queryObj = DNS.DnsRequest(server=dnsServer)
        return None
Example #9
 def __init__(self, cachefile=""):
     self.cachefile = os.path.expanduser(cachefile)
     if os.path.exists(self.cachefile):
         self.cache = pickle_read(self.cachefile)
     else:
         self.cache = {}
     self.misses = self.hits = 0
     if self.cachefile:
         atexit.register(self.close)
     self.engine = None
Example #10
 def __init__(self, cachefile=""):
     self.cachefile = os.path.expanduser(cachefile)
     if os.path.exists(self.cachefile):
         self.cache = pickle_read(self.cachefile)
     else:
         self.cache = {}
     self.misses = self.hits = 0
     if self.cachefile:
         atexit.register(self.close)
     self.engine = None
Example #11
def run(bdbname, useDBM, ldbname, rdbname, foldname, doTrain, doClassify,
        pwd, idxname, logname):
    bayes = storage.open_storage(bdbname, useDBM)
    try:
        notesindex = pickle_read(idxname)
    except IOError as e:
        if e.errno != errno.ENOENT:
            raise
        notesindex = {}
        print("%s file not found, this is a first time run" % (idxname,))
        print("No classification will be performed")
Example #12
def run(bdbname, useDBM, ldbname, rdbname, foldname, doTrain, doClassify,
        pwd, idxname, logname):
    bayes = storage.open_storage(bdbname, useDBM)

    try:
        notesindex = pickle_read(idxname)
    except IOError as e:
        if e.errno != errno.ENOENT:
            raise
        notesindex = {}
        print("%s file not found, this is a first time run" % (idxname,))
        print("No classification will be performed")
Example #13
 def load(self):
     """Load this instance from the pickle."""
     if options["globals", "verbose"]:
         print("Loading state from", self.db_name, "pickle", file=sys.stderr)
     try:
         tempbayes = pickle_read(self.db_name)
     except Exception:
         tempbayes = None
     if tempbayes:
         classifier.Classifier.__setstate__(self, tempbayes.__getstate__())
         if options["globals", "verbose"]:
             print(
                 ("%s is an existing pickle," " with %d ham and %d spam") % (self.db_name, self.nham, self.nspam),
                 file=sys.stderr,
             )
     else:
         if options["globals", "verbose"]:
             print(self.db_name, "is a new pickle", file=sys.stderr)
         self.wordinfo = {}
         self.nham = 0
         self.nspam = 0
Example #14
def run(bdbname, useDBM, ldbname, rdbname, foldname, doTrain, doClassify,
        pwd, idxname, logname):
    bayes = storage.open_storage(bdbname, useDBM)
    try:
        notesindex = pickle_read(idxname)
    except IOError as e:
        if e.errno != errno.ENOENT:
            raise
        notesindex = {}
        print("%s file not found, this is a first time run" % (idxname,))
        print("No classification will be performed")
    need_replicate = False
    sess = win32com.client.Dispatch("Lotus.NotesSession")
    try:
        if pwd:
            sess.initialize(pwd)
        else:
            sess.initialize()
    except pywintypes.com_error:
        print("Session aborted")
        sys.exit()
    try:
        db = sess.GetDatabase(rdbname, ldbname)
    except pywintypes.com_error:
        if rdbname:
            print("Could not open database remotely, trying locally")
            try:
                db = sess.GetDatabase("", ldbname)
                need_replicate = True
            except pywintypes.com_error:
                print("Could not open database")
                sys.exit()
        else:
            raise
    log = sess.CreateLog("SpambayesAgentLog")
    try:
        log.OpenNotesLog("", logname)
    except pywintypes.com_error:
        print("Could not open log")
        log = None
    if log:
        log.LogAction("Running spambayes")
    vinbox = db.getView('($Inbox)')
    vspam = db.getView("%s\\Spam" % (foldname,))
    vham = db.getView("%s\\Ham" % (foldname,))
    vtrainspam = db.getView("%s\\Train as Spam" % (foldname,))
    vtrainham = db.getView("%s\\Train as Ham" % (foldname,))
    if doTrain:
        processAndTrain(vtrainspam, vspam, bayes, True, notesindex, log)
        processAndTrain(vtrainham, vham, bayes, False, notesindex, log)
    if need_replicate:
        try:
            print("Replicating...")
            db.Replicate(rdbname)
            print("Done")
        except pywintypes.com_error:
            print("Could not replicate")
    if doClassify:
        classifyInbox(vinbox, vtrainspam, bayes, ldbname, notesindex, log)
    print("The Spambayes database currently has %s Spam and %s Ham" \
          % (bayes.nspam, bayes.nham))
    bayes.store()
    pickle_write(idxname, notesindex)
    if log:
        log.LogAction("Finished running spambayes")
Example #15
def loadHist(path):
    """Load the histogram pickle object"""
    return pickle_read(path)
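loadHist above is the smallest possible use of pickle_read: it simply returns whatever object the pickle file holds. pickle_read and pickle_write themselves are SpamBayes helpers whose implementation is not shown on this page; the sketch below is only a minimal stand-in for the round-trip these examples assume (a plain pickle load/dump, without any locking or safeguards the real helpers may add), not the project's actual code.

import pickle

def pickle_read(filename):
    # Minimal stand-in: load and return the single object pickled in filename.
    with open(filename, "rb") as fp:
        return pickle.load(fp)

def pickle_write(filename, value):
    # Minimal stand-in: pickle value into filename, replacing any previous contents.
    with open(filename, "wb") as fp:
        pickle.dump(value, fp)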
Example #16
    def setup(self):
        # Can't import this at the top because it's circular.
        # XXX Someone smarter than me, please figure out the right
        # XXX way to do this.
        from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory

        username = options["globals", "proxy_username"]
        password = options["globals", "proxy_password"]
        server = options["globals", "proxy_server"]
        if server.find(":") != -1:
            server, port = server.split(':', 1)
        else:
            port = 8080
        if server:
            # Build a new opener that uses a proxy requiring authorization
            proxy_support = urllib.request.ProxyHandler({"http":
                                                  "http://%s:%s@%s:%s" %
                                                  (username, password,
                                                   server, port)})
            opener = urllib.request.build_opener(proxy_support,
                                                 urllib.request.HTTPHandler)
        else:
            # Build a new opener without any proxy information.
            opener = urllib.request.build_opener(urllib.request.HTTPHandler)

        # Install it
        urllib.request.install_opener(opener)

        # Setup the cache for retrieved urls
        age = options["URLRetriever", "x-cache_expiry_days"]*24*60*60
        dir = options["URLRetriever", "x-cache_directory"]
        if not os.path.exists(dir):
            # Create the directory.
            if options["globals", "verbose"]:
                print("Creating URL cache directory", file=sys.stderr)
            os.makedirs(dir)

        self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(),
                                          dir, cacheSize=20)
        # Kill any old information in the cache
        self.urlCorpus.removeExpiredMessages()

        # Setup caches for unretrievable urls
        self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
        self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
        if os.path.exists(self.bad_url_cache_name):
            try:
                self.bad_urls = pickle_read(self.bad_url_cache_name)
            except (IOError, ValueError):
                # Something went wrong loading it (bad pickle,
                # probably).  Start afresh.
                if options["globals", "verbose"]:
                    print("Bad URL pickle, using new.", file=sys.stderr)
                self.bad_urls = {"url:non_resolving": (),
                                 "url:non_html": (),
                                 "url:unknown_error": ()}
        else:
            if options["globals", "verbose"]:
                print("URL caches don't exist: creating")
            self.bad_urls = {"url:non_resolving": (),
                        "url:non_html": (),
                        "url:unknown_error": ()}
        if os.path.exists(self.http_error_cache_name):
            try:
                self.http_error_urls = pickle_read(self.http_error_cache_name)
            except (IOError, ValueError):
                # Something went wrong loading it (bad pickle,
                # probably).  Start afresh.
                if options["globals", "verbose"]:
                    print("Bad HTTP error pickle, using new.", file=sys.stderr)
                self.http_error_urls = {}
Example #17
            hamfile = arg
        elif opt in ("-S", "--spamfile"):
            spamfile = arg
        elif opt in ("-f", "--feature"):
            features.add(arg)  # getopt already yields str in Python 3

    if hamfile is None and spamfile is None:
        usage("At least one of -S or -H are required")
        return 1

    if mapfile is None:
        usage("'-d mapfile' is required")
        return 1

    try:
        mapd = pickle_read(mapfile)
    except IOError:
        usage("Mapfile %s does not exist" % mapfile)
        return 1

    if not features and not args:
        usage("Require at least one feature (-f) arg or one message file")
        return 1

    if not features:
        # extract significant tokens from each message and identify
        # where they came from
        for f in args:
            for msg in getmbox(f):
                evidence = msg.get("X-Spambayes-Evidence", "")
                evidence = re.sub(r"\s+", " ", evidence)
Example #18
def main(args):
    try:
        opts, args = getopt.getopt(args, "b:sh")
    except getopt.error as msg:
        usage(msg)
        return 1
    best = 0
    skipspam = False
    for opt, arg in opts:
        if opt == "-h":
            usage()
            return 0
        if opt == "-b":
            best = int(arg)
        elif opt == "-s":
            skipspam = True
    if len(args) != 3:
        usage("require ham, spam and unsure message piles")
        return 1
    ham, spam, unsure = args
    choices = ["best.pck"]
    if "HOME" in os.environ:
        home = os.environ["HOME"]
        choices.append(os.path.join(home, "tmp", "best.pck"))
        choices.append(os.path.join(home, "best.pck"))
    choices.append(None)
    for bestfile in choices:
        if bestfile is None:
            break
        if os.path.exists(bestfile):
            break
        try:
            open(bestfile, "w")
        except IOError:
            pass
        else:
            os.unlink(bestfile)
    if bestfile is None:
        usage("can't find a place to write best.pck file")
        return 1
    print("establish base training")
    learn(ham, h, False)
    learn(spam, h, True)
    print("scoring")
    if best:
        last_scores = pickle_read(bestfile)
        last_scores = list(last_scores.items())
        last_scores.sort()
        msgids = set()
        for (k, v) in last_scores[-best:]:
            msgids.update(set(v))
    else:
        msgids = None
    scores = {}
    try:
        score(unsure, h, cls, scores, msgids, skipspam)
    except KeyboardInterrupt:
        pass
    if not best:
        pickle_write(bestfile, scores)
    return 0
Example #19
    def setup(self):
        # Can't import this at the top because it's circular.
        # XXX Someone smarter than me, please figure out the right
        # XXX way to do this.
        from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory

        username = options["globals", "proxy_username"]
        password = options["globals", "proxy_password"]
        server = options["globals", "proxy_server"]
        if server.find(":") != -1:
            server, port = server.split(':', 1)
        else:
            port = 8080
        if server:
            # Build a new opener that uses a proxy requiring authorization
            proxy_support = urllib.request.ProxyHandler({"http":
                                                  "http://%s:%s@%s:%s" %
                                                  (username, password,
                                                   server, port)})
            opener = urllib.request.build_opener(proxy_support,
                                                 urllib.request.HTTPHandler)
        else:
            # Build a new opener without any proxy information.
            opener = urllib.request.build_opener(urllib.request.HTTPHandler)

        # Install it
        urllib.request.install_opener(opener)

        # Setup the cache for retrieved urls
        age = options["URLRetriever", "x-cache_expiry_days"] * 24 * 60 * 60
        dir = options["URLRetriever", "x-cache_directory"]
        if not os.path.exists(dir):
            # Create the directory.
            if options["globals", "verbose"]:
                print("Creating URL cache directory", file=sys.stderr)
            os.makedirs(dir)

        self.urlCorpus = ExpiryFileCorpus(age,
                                          FileMessageFactory(),
                                          dir,
                                          cacheSize=20)
        # Kill any old information in the cache
        self.urlCorpus.removeExpiredMessages()

        # Setup caches for unretrievable urls
        self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
        self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
        if os.path.exists(self.bad_url_cache_name):
            try:
                self.bad_urls = pickle_read(self.bad_url_cache_name)
            except (IOError, ValueError):
                # Something went wrong loading it (bad pickle,
                # probably).  Start afresh.
                if options["globals", "verbose"]:
                    print("Bad URL pickle, using new.", file=sys.stderr)
                self.bad_urls = {
                    "url:non_resolving": (),
                    "url:non_html": (),
                    "url:unknown_error": ()
                }
        else:
            if options["globals", "verbose"]:
                print("URL caches don't exist: creating")
            self.bad_urls = {
                "url:non_resolving": (),
                "url:non_html": (),
                "url:unknown_error": ()
            }
        if os.path.exists(self.http_error_cache_name):
            try:
                self.http_error_urls = pickle_read(self.http_error_cache_name)
            except (IOError, ValueError):
                # Something went wrong loading it (bad pickle,
                # probably).  Start afresh.
                if options["globals", "verbose"]:
                    print("Bad HTTP error pickle, using new.", file=sys.stderr)
                self.http_error_urls = {}
Example #20
         break
     try:
         open(bestfile, "w")
     except IOError:
         pass
     else:
         os.unlink(bestfile)
 if bestfile is None:
     usage("can't find a place to write best.pck file")
     return 1
 print("establish base training")
 learn(ham, h, False)
 learn(spam, h, True)
 print("scoring")
 if best:
     last_scores = pickle_read(bestfile)
     last_scores = list(last_scores.items())
     last_scores.sort()
     msgids = set()
     for (k, v) in last_scores[-best:]:
         msgids.update(set(v))
 else:
     msgids = None
 scores = {}
 try:
     score(unsure, h, cls, scores, msgids, skipspam)
 except KeyboardInterrupt:
     pass
 if not best:
     pickle_write(bestfile, scores)
 return 0
Example #21
def loadHist(path):
    """Load the histogram pickle object"""
    return pickle_read(path)