Exemple #1
0
    def createWorkers(self):
        """Using the options that were initialised in __init__ and then
        possibly overridden by the driver code, create the Bayes object,
        the Corpuses, the Trainers and so on."""
        print "Loading database...",
        if self.isTest:
            self.useDB = "pickle"
            self.DBName = '_pop3proxy_test.pickle'   # This is never saved.
        if not hasattr(self, "DBName"):
            self.DBName, self.useDB = storage.database_type([])
        self.bayes = storage.open_storage(self.DBName, self.useDB)
        self.mdb = spambayes.message.Message().message_info_db

        # Load stats manager.
        self.stats = Stats.Stats(options, self.mdb)

        self.buildStatusStrings()

        # Don't set up the caches and training objects when running the self-test,
        # so as not to clutter the filesystem.
        if not self.isTest:
            # Create/open the Corpuses.  Use small cache sizes to avoid hogging
            # lots of memory.
            sc = get_pathname_option("Storage", "spam_cache")
            hc = get_pathname_option("Storage", "ham_cache")
            uc = get_pathname_option("Storage", "unknown_cache")
            map(storage.ensureDir, [sc, hc, uc])
            if self.gzipCache:
                factory = GzipFileMessageFactory()
            else:
                factory = FileMessageFactory()
            age = options["Storage", "cache_expiry_days"]*24*60*60
            self.spamCorpus = ExpiryFileCorpus(age, factory, sc,
                                               '[0123456789\-]*',
                                               cacheSize=20)
            self.hamCorpus = ExpiryFileCorpus(age, factory, hc,
                                              '[0123456789\-]*',
                                              cacheSize=20)
            self.unknownCorpus = ExpiryFileCorpus(age, factory, uc,
                                                  '[0123456789\-]*',
                                                  cacheSize=20)

            # Given that (hopefully) users will get to the stage
            # where they do not need to do any more regular training to
            # be satisfied with spambayes' performance, we expire old
            # messages from not only the trained corpora, but the unknown
            # as well.
            self.spamCorpus.removeExpiredMessages()
            self.hamCorpus.removeExpiredMessages()
            self.unknownCorpus.removeExpiredMessages()

            # Create the Trainers.
            self.spamTrainer = storage.SpamTrainer(self.bayes)
            self.hamTrainer = storage.HamTrainer(self.bayes)
            self.spamCorpus.addObserver(self.spamTrainer)
            self.hamCorpus.addObserver(self.hamTrainer)
 def setUp(self):
     """Build the fixture: a stuffed corpus directory opened for expiry.

     Runs the base-class fixture, records the cache size, directory and
     message factory on the test case, calls stuff_corpus(), and then
     opens the directory as an ExpiryFileCorpus stored in self.corpus.
     """
     _FileCorpusBaseTest.setUp(self)
     self.cache_size = 100
     self.directory = 'fctesthamcorpus'
     self.factory = FileMessageFactory()
     self.stuff_corpus()
     # Positional arguments of ExpiryFileCorpus, named here for readability.
     expiry_age = 1.0
     name_filter = '?'
     self.corpus = ExpiryFileCorpus(
         expiry_age, self.factory, self.directory, name_filter,
         self.cache_size)
    def setup(self):
        # Can't import this at the top because it's circular.
        # XXX Someone smarter than me, please figure out the right
        # XXX way to do this.
        from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory

        username = options["globals", "proxy_username"]
        password = options["globals", "proxy_password"]
        server = options["globals", "proxy_server"]
        if server.find(":") != -1:
            server, port = server.split(':', 1)
        else:
            port = 8080
        if server:
            # Build a new opener that uses a proxy requiring authorization
            proxy_support = urllib2.ProxyHandler({"http" : \
                                                  "http://%s:%s@%s:%d" % \
                                                  (username, password,
                                                   server, port)})
            opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
        else:
            # Build a new opener without any proxy information.
            opener = urllib2.build_opener(urllib2.HTTPHandler)

        # Install it
        urllib2.install_opener(opener)

        # Setup the cache for retrieved urls
        age = options["URLRetriever", "x-cache_expiry_days"] * 24 * 60 * 60
        dir = options["URLRetriever", "x-cache_directory"]
        if not os.path.exists(dir):
            # Create the directory.
            if options["globals", "verbose"]:
                print >> sys.stderr, "Creating URL cache directory"
            os.makedirs(dir)

        self.urlCorpus = ExpiryFileCorpus(age,
                                          FileMessageFactory(),
                                          dir,
                                          cacheSize=20)
        # Kill any old information in the cache
        self.urlCorpus.removeExpiredMessages()

        # Setup caches for unretrievable urls
        self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
        self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
        if os.path.exists(self.bad_url_cache_name):
            try:
                self.bad_urls = pickle_read(self.bad_url_cache_name)
            except (IOError, ValueError):
                # Something went wrong loading it (bad pickle,
                # probably).  Start afresh.
                if options["globals", "verbose"]:
                    print >> sys.stderr, "Bad URL pickle, using new."
                self.bad_urls = {
                    "url:non_resolving": (),
                    "url:non_html": (),
                    "url:unknown_error": ()
                }
        else:
            if options["globals", "verbose"]:
                print "URL caches don't exist: creating"
            self.bad_urls = {
                "url:non_resolving": (),
                "url:non_html": (),
                "url:unknown_error": ()
            }
        if os.path.exists(self.http_error_cache_name):
            try:
                self.http_error_urls = pickle_read(self.http_error_cache_name)
            except IOError, ValueError:
                # Something went wrong loading it (bad pickle,
                # probably).  Start afresh.
                if options["globals", "verbose"]:
                    print >> sys.stderr, "Bad HHTP error pickle, using new."
                self.http_error_urls = {}