Esempio n. 1
0
 def finishtest(self):
     """Close out the current training set.

     Optionally prints the histograms of this training set (when the
     ``show_histograms`` option is set), adds them into the global
     histograms, replaces them with fresh ``Hist()`` instances and
     increments ``ntimes_finishtest_called``.  When the
     ``save_trained_pickles`` option is set, ``self.classifier`` is handed
     to ``pickle_write`` under a file name built from ``pickle_basename``
     and the call counter.
     """
     ham_hist = self.trained_ham_hist
     spam_hist = self.trained_spam_hist
     if options["TestDriver", "show_histograms"]:
         printhist("all in this training set:", ham_hist, spam_hist)

     # Accumulate into the running totals, then start the next training
     # set with empty histograms.
     self.global_ham_hist += ham_hist
     self.global_spam_hist += spam_hist
     self.trained_ham_hist = Hist()
     self.trained_spam_hist = Hist()

     self.ntimes_finishtest_called += 1
     if not options["TestDriver", "save_trained_pickles"]:
         return
     basename = options["TestDriver", "pickle_basename"]
     fname = "%s%d.pik" % (basename, self.ntimes_finishtest_called)
     print("    saving pickle to", fname)
     pickle_write(fname, self.classifier, 1)
Esempio n. 2
0
 def _save_caches(self):
     """Write the bad-URL and HTTP-error URL caches with ``pickle_write``."""
     # XXX Note that these caches are never refreshed, which might not
     # XXX be a good thing long-term (if a previously invalid URL
     # XXX becomes valid, for example).
     caches = (
         (self.bad_url_cache_name, self.bad_urls),
         (self.http_error_cache_name, self.http_error_urls),
     )
     for cache_name, cached_urls in caches:
         pickle_write(cache_name, cached_urls)
Esempio n. 3
0
 def alldone(self):
     if options["TestDriver", "show_histograms"]:
         besthamcut, bestspamcut = printhist("all runs:",
                                             self.global_ham_hist,
                                             self.global_spam_hist)
     else:
         besthamcut = options["Categorization", "ham_cutoff"]
         bestspamcut = options["Categorization", "spam_cutoff"]
         self.global_ham_hist.compute_stats()
         self.global_spam_hist.compute_stats()
     nham = self.global_ham_hist.n
     nspam = self.global_spam_hist.n
     nfp = len(self.falsepos)
     nfn = len(self.falseneg)
     nun = len(self.unsure)
     print "-> <stat> all runs false positives:", nfp
     print "-> <stat> all runs false negatives:", nfn
     print "-> <stat> all runs unsure:", nun
     print "-> <stat> all runs false positive %:", (nfp * 1e2 / nham)
     print "-> <stat> all runs false negative %:", (nfn * 1e2 / nspam)
     print "-> <stat> all runs unsure %:", (nun * 1e2 / (nham + nspam))
     print "-> <stat> all runs cost: $%.2f" % (
           nfp * options["TestDriver", "best_cutoff_fp_weight"] +
           nfn * options["TestDriver", "best_cutoff_fn_weight"] +
           nun * options["TestDriver", "best_cutoff_unsure_weight"])
     options["Categorization", "ham_cutoff"] = besthamcut
     options["Categorization", "spam_cutoff"] = bestspamcut
     print self.cc
     if options["TestDriver", "save_histogram_pickles"]:
         for f, h in (('ham', self.global_ham_hist),
                      ('spam', self.global_spam_hist)):
             fname = "%s_%shist.pik" % (options["TestDriver",
                                                "pickle_basename"], f)
             print "    saving %s histogram pickle to %s" % (f, fname)
             pickle_write(fname, h, 1)
Esempio n. 4
0
def main(args):
    """Parse the command line and update the pickled message map.

    Recognised options are ``-h/--help``, ``-d/--database`` (the map
    file, required) and ``-t/--type`` (``ham`` or ``spam``, required).
    Every remaining argument is passed to ``mapmessages`` together with
    the mailbox type and the map, and the map is then written back with
    ``pickle_write``.

    Returns 1 after a usage error and 0 after ``--help``; falls off the
    end (returning None) on success.
    """
    try:
        opts, args = getopt.getopt(args, "hd:t:",
                                   ["type=", "help", "database="])
    except getopt.GetoptError as msg:
        usage(msg)
        return 1

    mapfile = mboxtype = None
    for flag, value in opts:
        if flag in ("-h", "--help"):
            usage()
            return 0
        if flag in ("-d", "--database"):
            mapfile = value
        elif flag in ("-t", "--type"):
            mboxtype = value

    # Report the first problem found, in the same order as the checks
    # were originally written.
    problem = None
    if mapfile is None:
        problem = "'-d mapfile' is required"
    elif mboxtype is None:
        problem = "'-t ham|spam' is required"
    elif mboxtype not in ("ham", "spam"):
        problem = "mboxtype must be 'ham' or 'spam'"
    if problem is not None:
        usage(problem)
        return 1

    # Any IOError while reading the map means starting from an empty one.
    try:
        mapd = pickle_read(mapfile)
    except IOError:
        mapd = {}
    for mbox in args:
        mapmessages(mbox, mboxtype, mapd)
    pickle_write(mapfile, mapd)
Esempio n. 5
0
 def _save_caches(self):
     """Persist both URL caches by calling ``pickle_write`` on each."""
     # XXX Note that these caches are never refreshed, which might not
     # XXX be a good thing long-term (if a previously invalid URL
     # XXX becomes valid, for example).
     file_names = (self.bad_url_cache_name, self.http_error_cache_name)
     contents = (self.bad_urls, self.http_error_urls)
     for file_name, urls in zip(file_names, contents):
         pickle_write(file_name, urls)
Esempio n. 6
0
 def close(self):
     """Write the cache to ``self.cachefile`` with ``pickle_write``.

     When the ``verbose`` option is set, a one-line summary (item count,
     target file and, if any lookups were recorded, the hit rate) is
     printed to stderr first.
     """
     if options["globals", "verbose"]:
         print >> sys.stderr, "saving", len(self.cache),
         print >> sys.stderr, "items to", self.cachefile,
         lookups = self.hits + self.misses
         if lookups:
             # 100.0 forces float division: with integer counters,
             # Python 2's "/" truncates and "%.2f" would always show a
             # whole number (e.g. 33.00 instead of 33.33).
             print >> sys.stderr, "%.2f%% hit rate" % \
                   (100.0 * self.hits / lookups),
         print >> sys.stderr
     pickle_write(self.cachefile, self.cache)
Esempio n. 7
0
 def close(self):
     """Write the cache to ``self.cachefile`` with ``pickle_write``.

     When the ``verbose`` option is set, a one-line summary (item count,
     target file and, if any lookups were recorded, the hit rate) is
     printed to stderr first.
     """
     if options["globals", "verbose"]:
         err = sys.stderr
         print("saving", len(self.cache), "items to", self.cachefile,
               end=' ', file=err)
         lookups = self.hits + self.misses
         if lookups:
             rate = 100 * self.hits / lookups
             print("%.2f%% hit rate" % rate, end=' ', file=err)
         # Terminate the summary line.
         print(file=err)
     pickle_write(self.cachefile, self.cache)
Esempio n. 8
0
 def close(self):
     """Save ``self.cache`` to ``self.cachefile`` via ``pickle_write``.

     With the ``verbose`` option set, stderr first receives a single
     line giving the number of cached items, the destination file and,
     when at least one hit or miss was counted, the hit rate.
     """
     if options["globals", "verbose"]:
         print >> sys.stderr, "saving", len(self.cache),
         print >> sys.stderr, "items to", self.cachefile,
         total = self.hits + self.misses
         if total:
             # Use a float numerator: integer counters would make
             # Python 2's "/" truncate, so "%.2f" could never show a
             # fractional hit rate.
             print >> sys.stderr, "%.2f%% hit rate" % \
                   (100.0 * self.hits / total),
         print >> sys.stderr
     pickle_write(self.cachefile, self.cache)
Esempio n. 9
0
 def close(self):
     """Print statistics if requested, then save the caches.

     ``printStats()`` runs when ``printStatsAtEnd`` is true; the caches
     are handed to ``pickle_write`` only when ``cachefile`` is set.
     """
     if self.printStatsAtEnd:
         self.printStats()
     target = self.cachefile
     if not target:
         return
     pickle_write(target, self.caches)
Esempio n. 10
0
    def finishtest(self):
        if options["TestDriver", "show_histograms"]:
            printhist("all in this training set:", self.trained_ham_hist,
                      self.trained_spam_hist)
        self.global_ham_hist += self.trained_ham_hist
        self.global_spam_hist += self.trained_spam_hist
        self.trained_ham_hist = Hist()
        self.trained_spam_hist = Hist()

        self.ntimes_finishtest_called += 1
        if options["TestDriver", "save_trained_pickles"]:
            fname = "%s%d.pik" % (options["TestDriver", "pickle_basename"],
                                  self.ntimes_finishtest_called)
            print "    saving pickle to", fname
            pickle_write(fname, self.classifier, 1)
Esempio n. 11
0
    def alldone(self):
        if options["TestDriver", "show_histograms"]:
            besthamcut, bestspamcut = printhist("all runs:",
                                                self.global_ham_hist,
                                                self.global_spam_hist)
        else:
            besthamcut = options["Categorization", "ham_cutoff"]
            bestspamcut = options["Categorization", "spam_cutoff"]
            self.global_ham_hist.compute_stats()
            self.global_spam_hist.compute_stats()
        nham = self.global_ham_hist.n
        nspam = self.global_spam_hist.n
        nfp = len(self.falsepos)
        nfn = len(self.falseneg)
        nun = len(self.unsure)
        print "-> <stat> all runs false positives:", nfp
        print "-> <stat> all runs false negatives:", nfn
        print "-> <stat> all runs unsure:", nun
        print "-> <stat> all runs false positive %:", (nfp * 1e2 / nham)
        print "-> <stat> all runs false negative %:", (nfn * 1e2 / nspam)
        print "-> <stat> all runs unsure %:", (nun * 1e2 / (nham + nspam))
        print "-> <stat> all runs cost: $%.2f" % (
            nfp * options["TestDriver", "best_cutoff_fp_weight"] +
            nfn * options["TestDriver", "best_cutoff_fn_weight"] +
            nun * options["TestDriver", "best_cutoff_unsure_weight"])
        # Set back the options for the delayed calculations in self.cc
        options["Categorization", "ham_cutoff"] = besthamcut
        options["Categorization", "spam_cutoff"] = bestspamcut
        print self.cc

        if options["TestDriver", "save_histogram_pickles"]:
            for f, h in (('ham', self.global_ham_hist),
                         ('spam', self.global_spam_hist)):
                fname = "%s_%shist.pik" % (options["TestDriver",
                                                   "pickle_basename"], f)
                print "    saving %s histogram pickle to %s" % (f, fname)
                pickle_write(fname, h, 1)
Esempio n. 12
0
            return 0
        elif opt in ("-d", "--database"):
            mapfile = arg

        elif opt in ("-t", "--type"):
            mboxtype = arg

    if mapfile is None:
        usage("'-d mapfile' is required")
        return 1

    if mboxtype is None:
        usage("'-t ham|spam' is required")
        return 1

    if mboxtype not in ("ham", "spam"):
        usage("mboxtype must be 'ham' or 'spam'")
        return 1

    try:
        mapd = pickle_read(mapfile)
    except IOError:
        mapd = {}

    for f in args:
        mapmessages(f, mboxtype, mapd)
    pickle_write(mapfile, mapd)

# Script entry point: run main() on the command-line arguments (minus the
# program name) and use its return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Esempio n. 13
0
 def close(self):
     """Optionally print statistics, then optionally save the caches.

     Calls ``printStats()`` when ``printStatsAtEnd`` is true, and passes
     ``self.caches`` to ``pickle_write`` when ``cachefile`` is truthy.
     """
     want_stats = self.printStatsAtEnd
     if want_stats:
         self.printStats()
     destination = self.cachefile
     if destination:
         pickle_write(destination, self.caches)
Esempio n. 14
0
def main(args):
    """Train on the ham and spam piles, then score the unsure pile.

    Options: ``-h`` prints usage, ``-b N`` restricts scoring to the
    message ids found in the last N entries of the sorted, previously
    saved scores, and ``-s`` sets the ``skipspam`` flag passed to
    ``score``.  Exactly three positional arguments are required: the
    ham, spam and unsure message piles.

    The scores file ``best.pck`` is looked for in the current directory
    and, when ``HOME`` is set, in ``$HOME/tmp`` and ``$HOME``.  The first
    location that already holds the file, or where it can be created, is
    used.  Scores are written there only when ``-b`` was not given.

    Returns 0 on success (and after ``-h``) and 1 after a usage error.
    """
    try:
        opts, args = getopt.getopt(args, "b:sh")
    except getopt.error as msg:
        usage(msg)
        return 1

    best = 0
    skipspam = False
    for opt, arg in opts:
        if opt == "-h":
            usage()
            return 0
        if opt == "-b":
            best = int(arg)
        elif opt == "-s":
            skipspam = True

    if len(args) != 3:
        usage("require ham, spam and unsure message piles")
        return 1
    ham, spam, unsure = args

    candidates = ["best.pck"]
    if "HOME" in os.environ:
        home = os.environ["HOME"]
        candidates.append(os.path.join(home, "tmp", "best.pck"))
        candidates.append(os.path.join(home, "best.pck"))

    bestfile = None
    for candidate in candidates:
        if os.path.exists(candidate):
            bestfile = candidate
            break
        # Probe for writability by creating the file.  open() replaces
        # the Python 2-only file() builtin (a NameError on Python 3 that
        # "except IOError" would not catch), and the with-block closes
        # the handle before the probe file is removed again.
        try:
            with open(candidate, "w"):
                pass
        except IOError:
            continue
        os.unlink(candidate)
        # Stop at the first writable location; without this the search
        # fell through to the "can't find a place" error even though a
        # usable location had just been found.
        bestfile = candidate
        break
    if bestfile is None:
        usage("can't find a place to write best.pck file")
        return 1

    # NOTE(review): h and cls are not defined in this function;
    # presumably they are module-level objects -- confirm.
    print("establish base training")
    learn(ham, h, False)
    learn(spam, h, True)

    print("scoring")
    if best:
        last_scores = sorted(pickle_read(bestfile).items())
        msgids = set()
        for _, ids in last_scores[-best:]:
            msgids.update(ids)
    else:
        msgids = None

    scores = {}
    try:
        score(unsure, h, cls, scores, msgids, skipspam)
    except KeyboardInterrupt:
        # An interrupted run still saves whatever scores were collected.
        pass
    if not best:
        pickle_write(bestfile, scores)
    return 0
Esempio n. 15
0
        try:
            print "Replicating..."
            db.Replicate(rdbname)
            print "Done"
        except pywintypes.com_error:
            print "Could not replicate"

    if doClassify:
        classifyInbox(vinbox, vtrainspam, bayes, ldbname, notesindex, log)

    print "The Spambayes database currently has %s Spam and %s Ham" \
          % (bayes.nspam, bayes.nham)

    bayes.store()

    pickle_write(idxname, notesindex)

    if log:
        log.LogAction("Finished running spambayes")


if __name__ == '__main__':
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'htcPd:p:l:r:f:o:i:W:L:')
    except getopt.error, msg:
        print >> sys.stderr, str(msg) + '\n\n' + __doc__
        sys.exit()

    ldbname = None  # local notes database name
    rdbname = None  # remote notes database location
    sbfname = None  # spambayes folder name
Esempio n. 16
0
        except IOError:
            pass
        else:
            os.unlink(bestfile)
    if bestfile is None:
        usage("can't find a place to write best.pck file")
        return 1
    print "establish base training"
    learn(ham, h, False)
    learn(spam, h, True)
    print "scoring"
    if best:
        last_scores = pickle_read(bestfile)
        last_scores = last_scores.items()
        last_scores.sort()
        msgids = set()
        for (k, v) in last_scores[-best:]:
            msgids.update(set(v))
    else:
        msgids = None
    scores = {}
    try:
        score(unsure, h, cls, scores, msgids, skipspam)
    except KeyboardInterrupt:
        pass
    if not best:
        pickle_write(bestfile, scores)
    return 0
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Esempio n. 17
0
        try:
            print "Replicating..."
            db.Replicate(rdbname)
            print "Done"
        except pywintypes.com_error:
            print "Could not replicate"

    if doClassify:
        classifyInbox(vinbox, vtrainspam, bayes, ldbname, notesindex, log)

    print "The Spambayes database currently has %s Spam and %s Ham" \
          % (bayes.nspam, bayes.nham)

    bayes.store()

    pickle_write(idxname, notesindex)

    if log:
        log.LogAction("Finished running spambayes")


if __name__ == '__main__':
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'htcPd:p:l:r:f:o:i:W:L:')
    except getopt.error, msg:
        print >> sys.stderr, str(msg) + '\n\n' + __doc__
        sys.exit()

    ldbname = None  # local notes database name
    rdbname = None  # remote notes database location
    sbfname = None  # spambayes folder name
Esempio n. 18
0
 def store(self):
     """Store self as a pickle.

     Announces the write on stderr when the ``verbose`` option is set,
     then passes ``self`` to ``pickle_write`` with ``PICKLE_TYPE``.
     """
     target = self.db_name
     if options["globals", "verbose"]:
         print("Persisting", target, "as a pickle", file=sys.stderr)
     pickle_write(target, self, PICKLE_TYPE)
Esempio n. 19
0
 def store(self):
     """Pass ``self.db`` to ``pickle_write`` under ``self.db_name``, with
     ``self.mode`` as the third argument."""
     destination = self.db_name
     payload = self.db
     pickle_write(destination, payload, self.mode)
Esempio n. 20
0
def run(bdbname, useDBM, ldbname, rdbname, foldname, doTrain, doClassify,
        pwd, idxname, logname):
    """Train on and/or classify mail held in a Lotus Notes database.

    Args:
        bdbname: First argument handed to ``storage.open_storage``.
        useDBM: Second argument handed to ``storage.open_storage``.
        ldbname: Database name (second argument to ``GetDatabase``).
        rdbname: Remote location (first argument to ``GetDatabase``).
            If opening with it fails, the database is opened with an
            empty location instead and ``Replicate(rdbname)`` is
            attempted after training.
        foldname: Prefix of the "Spam", "Ham", "Train as Spam" and
            "Train as Ham" views.
        doTrain: When true, ``processAndTrain`` runs on the training
            views.
        doClassify: When true, ``classifyInbox`` runs on the inbox view.
        pwd: Session password; when false, ``initialize()`` is called
            without arguments.
        idxname: File holding the pickled notes index; it is rewritten
            at the end of the run.
        logname: Name passed to ``OpenNotesLog``.

    Calls ``sys.exit()`` when the session cannot be initialised or the
    database cannot be opened at all.  An ``IOError`` other than ENOENT
    while reading the index is re-raised, as is a ``com_error`` from
    ``GetDatabase`` when ``rdbname`` is empty.
    """
    bayes = storage.open_storage(bdbname, useDBM)
    try:
        notesindex = pickle_read(idxname)
    except IOError as e:
        if e.errno != errno.ENOENT:
            raise
        # A missing index file just means nothing has been indexed yet.
        notesindex = {}
        print("%s file not found, this is a first time run" % (idxname,))
        print("No classification will be performed")
    need_replicate = False
    sess = win32com.client.Dispatch("Lotus.NotesSession")
    try:
        if pwd:
            sess.initialize(pwd)
        else:
            sess.initialize()
    except pywintypes.com_error:
        print("Session aborted")
        sys.exit()
    try:
        db = sess.GetDatabase(rdbname, ldbname)
    except pywintypes.com_error:
        if rdbname:
            print("Could not open database remotely, trying locally")
            try:
                db = sess.GetDatabase("", ldbname)
                need_replicate = True
            except pywintypes.com_error:
                print("Could not open database")
                sys.exit()
        else:
            raise
    log = sess.CreateLog("SpambayesAgentLog")
    try:
        log.OpenNotesLog("", logname)
    except pywintypes.com_error:
        # Logging is optional: carry on without a log.
        print("Could not open log")
        log = None
    if log:
        log.LogAction("Running spambayes")
    vinbox = db.getView('($Inbox)')
    # The backslashes are escaped explicitly: "\S", "\H" and "\T" are
    # invalid escape sequences that newer Pythons warn about.  The
    # resulting view names are unchanged.
    vspam = db.getView("%s\\Spam" % (foldname,))
    vham = db.getView("%s\\Ham" % (foldname,))
    vtrainspam = db.getView("%s\\Train as Spam" % (foldname,))
    vtrainham = db.getView("%s\\Train as Ham" % (foldname,))
    if doTrain:
        processAndTrain(vtrainspam, vspam, bayes, True, notesindex, log)
        processAndTrain(vtrainham, vham, bayes, False, notesindex, log)
    if need_replicate:
        try:
            print("Replicating...")
            db.Replicate(rdbname)
            print("Done")
        except pywintypes.com_error:
            print("Could not replicate")
    if doClassify:
        classifyInbox(vinbox, vtrainspam, bayes, ldbname, notesindex, log)
    print("The Spambayes database currently has %s Spam and %s Ham"
          % (bayes.nspam, bayes.nham))
    bayes.store()
    pickle_write(idxname, notesindex)
    if log:
        log.LogAction("Finished running spambayes")
Esempio n. 21
0
        elif opt in ("-d", "--database"):
            mapfile = arg

        elif opt in ("-t", "--type"):
            mboxtype = arg

    if mapfile is None:
        usage("'-d mapfile' is required")
        return 1

    if mboxtype is None:
        usage("'-t ham|spam' is required")
        return 1

    if mboxtype not in ("ham", "spam"):
        usage("mboxtype must be 'ham' or 'spam'")
        return 1

    try:
        mapd = pickle_read(mapfile)
    except IOError:
        mapd = {}

    for f in args:
        mapmessages(f, mboxtype, mapd)
    pickle_write(mapfile, mapd)


# Script entry point: exit with whatever status main() returns.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Esempio n. 22
0
 def store(self):
     """Hand ``self.db`` to ``pickle_write``, using ``self.db_name`` as
     the target and ``self.mode`` as the third argument."""
     target, contents, mode = self.db_name, self.db, self.mode
     pickle_write(target, contents, mode)
Esempio n. 23
0
 def _save_caches(self):
     """Write the bad-URL cache, then the HTTP-error URL cache, using
     ``pickle_write``."""
     # Gather both (name, data) pairs up front, as the original list did.
     bad_name, bad_data = self.bad_url_cache_name, self.bad_urls
     err_name, err_data = self.http_error_cache_name, self.http_error_urls
     pickle_write(bad_name, bad_data)
     pickle_write(err_name, err_data)