Example #1
0
    def _update(self, folders, is_spam):
        changed = False
        for f in folders:
            log("update from %s" % f.path)
            added, removed = f.read()
            if added:
                log("added %d" % len(added))
            if removed:
                log("removed %d" % len(removed))
            get_transaction().commit()
            if not (added or removed):
                continue
            changed = True

            # It's important not to commit a transaction until
            # after update_probabilities is called in update().
            # Otherwise some new entries will cause scoring to fail.
            for msg in added.keys():
                self.classifier.learn(tokenize(msg), is_spam)
            del added
            get_transaction().commit(1)
            log("learned")
            for msg in removed.keys():
                self.classifier.unlearn(tokenize(msg), is_spam)
            if removed:
                log("unlearned")
            del removed
            get_transaction().commit(1)
        return changed
Example #2
0
 def test_dbm_export(self):
     """Export a dbm classifier to CSV and compare the CSV with the db."""
     # Create a dbm classifier to export, with some messages in it so
     # that it is not empty.
     bayes = DBDictClassifier(TEMP_DBM_NAME)
     bayes.learn(tokenize(spam1), True)
     bayes.learn(tokenize(good1), False)
     # Save & close before exporting.
     bayes.store()
     bayes.close()
     sb_dbexpimp.runExport(TEMP_DBM_NAME, "dbm", TEMP_CSV_NAME)
     # Reopen the original to compare against.
     bayes = open_storage(TEMP_DBM_NAME, "dbm")
     # Reading the file through the csv module also verifies that the
     # export is valid CSV data.
     fp = open(TEMP_CSV_NAME, "rb")
     try:
         reader = sb_dbexpimp.csv.reader(fp)
         # The first row carries the ham and spam message counts.
         (nham, nspam) = reader.next()
         self.assertEqual(int(nham), bayes.nham)
         self.assertEqual(int(nspam), bayes.nspam)
         # Every following row is (word, hamcount, spamcount).
         for (word, hamcount, spamcount) in reader:
             word = sb_dbexpimp.uunquote(word)
             self.assert_(word in bayes._wordinfokeys())
             wi = bayes._wordinfoget(word)
             self.assertEqual(int(hamcount), wi.hamcount)
             self.assertEqual(int(spamcount), wi.spamcount)
     finally:
         # Close the CSV even when an assertion fails, so the handle on
         # the temporary file is not leaked.
         fp.close()
Example #3
0
def train_message(msg, is_spam, cdata):
    """Train the classifier on a single message.

    Returns False when the message is already recorded with the
    requested classification, and True once it has been (re)trained.
    A message previously trained in the other category is unlearned
    first.  Exceptions are not caught here; that is the caller's job.
    """
    from spambayes.tokenizer import tokenize

    cdata.message_db.load_msg(msg)
    previous = msg.t
    if previous == is_spam:
        # Already trained this way: nothing to do.
        return False

    # Either never trained (previous is None) or trained the other way.
    stream = msg.GetEmailPackageObject()
    classifier = cdata.bayes
    if previous is not None:
        # Undo the old classification before learning the new one.
        classifier.unlearn(tokenize(stream), previous)
    classifier.learn(tokenize(stream), is_spam)

    # Record the new classification and mark cdata as dirty.
    msg.t = is_spam
    cdata.message_db.store_msg(msg)
    cdata.dirty = True
    return True
 def test_dbm_export(self):
     """Export a dbm classifier to CSV and compare the CSV with the db."""
     # Create a dbm classifier to export, with some messages in it so
     # that it is not empty.
     bayes = DBDictClassifier(TEMP_DBM_NAME)
     bayes.learn(tokenize(spam1), True)
     bayes.learn(tokenize(good1), False)
     # Save & close before exporting.
     bayes.store()
     bayes.close()
     sb_dbexpimp.runExport(TEMP_DBM_NAME, "dbm", TEMP_CSV_NAME)
     # Reopen the original to compare against.
     bayes = open_storage(TEMP_DBM_NAME, "dbm")
     # Reading the file through the csv module also verifies that the
     # export is valid CSV data.
     fp = open(TEMP_CSV_NAME, "rb")
     try:
         reader = sb_dbexpimp.csv.reader(fp)
         # The first row carries the ham and spam message counts.
         (nham, nspam) = reader.next()
         self.assertEqual(int(nham), bayes.nham)
         self.assertEqual(int(nspam), bayes.nspam)
         # Every following row is (word, hamcount, spamcount).
         for (word, hamcount, spamcount) in reader:
             word = sb_dbexpimp.uunquote(word)
             self.assert_(word in bayes._wordinfokeys())
             wi = bayes._wordinfoget(word)
             self.assertEqual(int(hamcount), wi.hamcount)
             self.assertEqual(int(spamcount), wi.spamcount)
     finally:
         # Close the CSV even when an assertion fails, so the handle on
         # the temporary file is not leaked.
         fp.close()
Example #5
0
 def _update(self, folders, is_spam):
     changed = False
     for f in folders:
         log("update from %s" % f.path)
         added, removed = f.read()
         if added:
             log("added %d" % len(added))
         if removed:
             log("removed %d" % len(removed))
         get_transaction().commit()
         if not (added or removed):
             continue
         changed = True
         for msg in added.keys():
             self.classifier.learn(tokenize(msg), is_spam)
         del added
         get_transaction().commit(1)
         log("learned")
         for msg in removed.keys():
             self.classifier.unlearn(tokenize(msg), is_spam)
         if removed:
             log("unlearned")
         del removed
         get_transaction().commit(1)
     return changed
Example #6
0
def mapmessages(f, mboxtype, mapdb):
    """Record, per token, which messages of mailbox `f` contain it.

    mapdb maps each token to a (ham, spam) pair of dicts; each dict maps
    a mailbox name to the set of message-ids in which the token was
    seen.  Messages from `f` are recorded in the ham dict when
    mboxtype == "ham" and in the spam dict otherwise.  Messages without
    a message-id header are skipped.  A running "<mailbox>: <count>"
    progress line is written to stdout.
    """
    is_ham = mboxtype == "ham"

    def record(token, msgid):
        # Add msgid under mailbox f for this token, then store the pair
        # back into mapdb so the update is kept even when mapdb hands
        # out fresh default dicts.
        ham, spam = mapdb.get(token, ({}, {}))
        if is_ham:
            target = ham
        else:
            target = spam
        msgids = target.get(f, set())
        msgids.add(msgid)
        target[f] = msgids
        mapdb[token] = (ham, spam)

    i = 0
    for msg in getmbox(f):
        i += 1
        sys.stdout.write('\r%s: %d' % (f, i))
        sys.stdout.flush()
        msgid = msg.get("message-id")
        if msgid is None:
            continue
        for t in tokenize(msg):
            record(t, msgid)
        if options["Classifier", "x-use_bigrams"]:
            # Also index the tokens of the enhanced word stream.
            for t in Classifier()._enhance_wordstream(tokenize(msg)):
                record(t, msgid)
    sys.stdout.write("\n")
Example #7
0
def train_message(msg, is_spam, cdata):
    """Train the classifier on a single message.

    cdata.message_db maps msg.searchkey to '1' (trained as spam) or '0'
    (trained as ham).  Returns False when the message is already
    recorded with the requested classification; otherwise any previous
    classification is unlearned, the new one is learned and True is
    returned.  Exceptions are left for the caller to handle.
    """
    from spambayes.tokenizer import tokenize

    # None means "never trained"; otherwise a bool for the old category.
    was_spam = None
    if cdata.message_db.has_key(msg.searchkey):
        was_spam = cdata.message_db[msg.searchkey] == '1'
    if was_spam == is_spam:
        return False

    stream = msg.GetEmailPackageObject()
    if was_spam is not None:
        # Trained the other way before: undo that first.
        cdata.bayes.unlearn(tokenize(stream), was_spam)
    cdata.bayes.learn(tokenize(stream), is_spam)

    # Remember the new category and mark cdata as dirty.
    cdata.message_db[msg.searchkey] = ('0', '1')[is_spam]
    cdata.dirty = True
    return True
Example #8
0
 def untrain_message(msg, cdata):
     """Undo the training recorded for msg.

     Returns True if the message had been trained as spam, False if it
     had been trained as ham, and None if neither check reports prior
     training.
     """
     from spambayes.tokenizer import tokenize

     stream = msg.GetEmailPackageObject()
     cdata.message_db.load_msg(msg)

     # (check, opposite check, classification); spam is tested first.
     candidates = (
         (been_trained_as_spam, been_trained_as_ham, True),
         (been_trained_as_ham, been_trained_as_spam, False),
     )
     for was_trained, was_trained_other, as_spam in candidates:
         if not was_trained(msg):
             continue
         assert not was_trained_other(msg), "Can't have been both!"
         cdata.bayes.unlearn(tokenize(stream), as_spam)
         cdata.message_db.remove_msg(msg)
         cdata.dirty = True
         return as_spam
     return None
Example #9
0
 def test_merge_to_pickle(self):
     """Importing a CSV into an existing pickle must add the counts."""
     # A pickled classifier holding one spam and one ham message.
     bayes = PickledClassifier(TEMP_PICKLE_NAME)
     bayes.learn(tokenize(spam1), True)
     bayes.learn(tokenize(good1), False)
     bayes.store()

     # Write the CSV to merge: a counts row, then one row per word.
     nham, nspam = 3, 4
     csv_data = {"this": (2, 1), "is": (0, 1), "a": (3, 4),
                 'test': (1, 1), "of": (1, 0), "the": (1, 2),
                 "import": (3, 1)}
     temp = open(TEMP_CSV_NAME, "wb")
     temp.write("%d,%d\n" % (nham, nspam))
     for word, counts in csv_data.items():
         temp.write("%s,%s,%s\n" % (word, counts[0], counts[1]))
     temp.close()

     sb_dbexpimp.runImport(TEMP_PICKLE_NAME, "pickle", False,
                           TEMP_CSV_NAME)

     # The merged store must hold the sums of both sources.
     merged = open_storage(TEMP_PICKLE_NAME, "pickle")
     self.assertEqual(merged.nham, nham + bayes.nham)
     self.assertEqual(merged.nspam, nspam + bayes.nspam)
     words = bayes._wordinfokeys()
     words += csv_data.keys()
     for word in words:
         word = sb_dbexpimp.uquote(word)
         self.assert_(word in merged._wordinfokeys())
         expected_ham, expected_spam = csv_data.get(word, (0, 0))
         before = bayes._wordinfoget(word)
         if before:
             expected_ham += before.hamcount
             expected_spam += before.spamcount
         after = merged._wordinfoget(word)
         self.assertEqual(expected_ham, after.hamcount)
         self.assertEqual(expected_spam, after.spamcount)
Example #10
0
 def train_message(msg, is_spam, cdata):
     """Train the classifier on a single message.

     Returns False when the message is already recorded with the
     requested classification, and True once it has been (re)trained.
     A message previously trained in the other category is unlearned
     first.
     """
     from spambayes.tokenizer import tokenize

     cdata.message_db.load_msg(msg)
     previous = msg.t
     if previous == is_spam:
         # Already trained this way: nothing to do.
         return False

     # Either never trained (previous is None) or trained the other way.
     stream = msg.GetEmailPackageObject()
     classifier = cdata.bayes
     if previous is not None:
         # Undo the old classification before learning the new one.
         classifier.unlearn(tokenize(stream), previous)
     classifier.learn(tokenize(stream), is_spam)

     # Record the new classification and mark cdata as dirty.
     msg.t = is_spam
     cdata.message_db.store_msg(msg)
     cdata.dirty = True
     return True
Example #11
0
def mapmessages(f, mboxtype, mapdb):
    """Record, per token, which messages of mailbox `f` contain it.

    mapdb maps each token to a (ham, spam) pair of dicts; each dict maps
    a mailbox name to the set of message-ids in which the token was
    seen.  Messages from `f` are recorded in the ham dict when
    mboxtype == "ham" and in the spam dict otherwise.  Messages without
    a message-id header are skipped.  A running "<mailbox>: <count>"
    progress line is written to stdout.
    """
    is_ham = mboxtype == "ham"

    def record(token, msgid):
        # Add msgid under mailbox f for this token, then store the pair
        # back into mapdb so the update is kept even when mapdb hands
        # out fresh default dicts.
        ham, spam = mapdb.get(token, ({}, {}))
        if is_ham:
            target = ham
        else:
            target = spam
        msgids = target.get(f, set())
        msgids.add(msgid)
        target[f] = msgids
        mapdb[token] = (ham, spam)

    i = 0
    for msg in getmbox(f):
        i += 1
        sys.stdout.write('\r%s: %d' % (f, i))
        sys.stdout.flush()
        msgid = msg.get("message-id")
        if msgid is None:
            continue
        for t in tokenize(msg):
            record(t, msgid)
        if options["Classifier", "x-use_bigrams"]:
            # Also index the tokens of the enhanced word stream.
            for t in Classifier()._enhance_wordstream(tokenize(msg)):
                record(t, msgid)
    sys.stdout.write("\n")
Example #12
0
    def _update(self, folders, is_spam):
        changed = False
        for f in folders:
            log("update from %s" % f.path)
            added, removed = f.read()
            if added:
                log("added %d" % len(added))
            if removed:
                log("removed %d" % len(removed))
            get_transaction().commit()
            if not (added or removed):
                continue
            changed = True

            # It's important not to commit a transaction until
            # after update_probabilities is called in update().
            # Otherwise some new entries will cause scoring to fail.
            for msg in added.keys():
                self.classifier.learn(tokenize(msg), is_spam)
            del added
            get_transaction().commit(1)
            log("learned")
            for msg in removed.keys():
                self.classifier.unlearn(tokenize(msg), is_spam)
            if removed:
                log("unlearned")
            del removed
            get_transaction().commit(1)
        return changed
Example #13
0
     def test_filter_train(self):
         """filter_train must tag the message and train on it as well."""
         self.h.open('c')
         self.h.h.bayes.learn(tokenize(good1), False)
         self.h.h.bayes.learn(tokenize(spam1), True)
         self.h.h.store()

         # Spam: tagged with the spam string, spam count goes from 1 to 2.
         result = email.message_from_string(self.h.filter_train(spam1))
         header_name = options["Headers", "classification_header_name"]
         spam_tag = options["Headers", "header_spam_string"]
         self.assert_(result[header_name].startswith(spam_tag))
         self.assertEqual(self.h.h.bayes.nspam, 2)

         # Ham: tagged with the ham string, ham count goes from 1 to 2.
         result = email.message_from_string(self.h.filter_train(good1))
         header_name = options["Headers", "classification_header_name"]
         ham_tag = options["Headers", "header_ham_string"]
         self.assert_(result[header_name].startswith(ham_tag))
         self.assertEqual(self.h.h.bayes.nham, 2)
 def runUIAndProxy():
     """Set up the UI server and proxy listener, then run Dibbler.

     The shared classifier (state.bayes) is trained on one spam and one
     ham message before proxyReady is set and Dibbler.run() is entered.
     """
     ui_server = UserInterfaceServer(8881)
     ui_server.register(ProxyUserInterface(state, _recreateState))
     BayesProxyListener('localhost', 8110, ('', 8111))
     # Seed the classifier so that it is not empty.
     for text, is_spam in ((spam1, True), (good1, False)):
         state.bayes.learn(tokenizer.tokenize(text), is_spam)
     proxyReady.set()
     Dibbler.run()
Example #15
0
 def runUIAndProxy():
     """Set up the UI server and proxy listener, then run Dibbler.

     The shared classifier (state.bayes) is trained on one spam and one
     ham message before proxyReady is set and Dibbler.run() is entered.
     """
     ui_server = UserInterfaceServer(8881)
     ui_server.register(ProxyUserInterface(state, _recreateState))
     BayesProxyListener('localhost', 8110, ('', 8111))
     # Seed the classifier so that it is not empty.
     for text, is_spam in ((spam1, True), (good1, False)):
         state.bayes.learn(tokenizer.tokenize(text), is_spam)
     proxyReady.set()
     Dibbler.run()
Example #16
0
 def test_untrain_spam(self):
     """Untraining the only spam message must empty the classifier."""
     self.h.open('c')
     # Seed the classifier with the message that will be removed.
     self.h.h.bayes.learn(tokenize(spam1), True)
     self.h.untrain_spam(spam1)
     # Both message counts are zero again ...
     bayes = self.h.h.bayes
     self.assertEqual(bayes.nham, 0)
     self.assertEqual(bayes.nspam, 0)
     # ... and no token of the message is known any more.
     for token in tokenize(spam1):
         self.assertEqual(bayes._wordinfoget(token), None)
Example #17
0
 def test_untrain_spam(self):
     """Untraining the only spam message must empty the classifier."""
     self.h.open('c')
     # Seed the classifier with the message that will be removed.
     self.h.h.bayes.learn(tokenize(spam1), True)
     self.h.untrain_spam(spam1)
     # Both message counts are zero again ...
     bayes = self.h.h.bayes
     self.assertEqual(bayes.nham, 0)
     self.assertEqual(bayes.nspam, 0)
     # ... and no token of the message is known any more.
     for token in tokenize(spam1):
         self.assertEqual(bayes._wordinfoget(token), None)
Example #18
0
 def test_untrain_spam(self):
     """Untraining the only spam message must empty the classifier."""
     self.h.open('c')
     # Seed the classifier with the message that will be removed.
     self.h.h.bayes.learn(tokenize(spam1), True)
     self.h.untrain_spam(spam1)
     # Both message counts are zero again ...
     bayes = self.h.h.bayes
     self.assertEqual(bayes.nham, 0)
     self.assertEqual(bayes.nspam, 0)
     # ... and no token of the message is known any more.
     for token in tokenize(spam1):
         self.assertEqual(bayes._wordinfoget(token), None)
Example #19
0
 def test_newdb(self):
     """newdb() must leave an empty database behind."""
     # Populate and save a database so there is something to discard.
     store = open_storage(TEMP_DBM_NAME, "dbm")
     for text, is_spam in ((spam1, True), (good1, False)):
         store.learn(tokenize(text), is_spam)
     store.store()
     store.close()

     self.h.newdb()
     self.assertEqual(self.h.h, None)

     # Reopening the database must show no trained messages.
     store = open_storage(TEMP_DBM_NAME, "dbm")
     self.assertEqual(store.nham, 0)
     self.assertEqual(store.nspam, 0)
     store.close()
Example #20
0
 def test_merge_to_dbm(self):
     """Importing a CSV into an existing dbm store must add the counts."""
     # A dbm classifier to merge with, holding one spam and one ham.
     bayes = DBDictClassifier(TEMP_DBM_NAME)
     bayes.learn(tokenize(spam1), True)
     bayes.learn(tokenize(good1), False)
     # Snapshot its contents before closing, to check against later.
     original_nham = bayes.nham
     original_nspam = bayes.nspam
     original_data = dict([(key, bayes._wordinfoget(key))
                           for key in bayes._wordinfokeys()])
     bayes.store()
     bayes.close()

     # Write the CSV to import: a counts row, then one row per word.
     nham, nspam = 3, 4
     csv_data = {
         "this": (2, 1),
         "is": (0, 1),
         "a": (3, 4),
         'test': (1, 1),
         "of": (1, 0),
         "the": (1, 2),
         "import": (3, 1)
     }
     temp = open(TEMP_CSV_NAME, "wb")
     temp.write("%d,%d\n" % (nham, nspam))
     for word, counts in csv_data.items():
         temp.write("%s,%s,%s\n" % (word, counts[0], counts[1]))
     temp.close()
     sb_dbexpimp.runImport(TEMP_DBM_NAME, "dbm", False, TEMP_CSV_NAME)

     # Opening the result proves it is a valid dbm file; it must hold
     # the data of both the CSV file and the original database.
     merged = open_storage(TEMP_DBM_NAME, "dbm")
     self.assertEqual(merged.nham, nham + original_nham)
     self.assertEqual(merged.nspam, nspam + original_nspam)
     words = original_data.keys() + csv_data.keys()
     for word in words:
         word = sb_dbexpimp.uquote(word)
         self.assert_(word in merged._wordinfokeys())
         expected_ham, expected_spam = csv_data.get(word, (0, 0))
         before = original_data.get(word, None)
         if before:
             expected_ham += before.hamcount
             expected_spam += before.spamcount
         after = merged._wordinfoget(word)
         self.assertEqual(expected_ham, after.hamcount)
         self.assertEqual(expected_spam, after.spamcount)
Example #21
0
 def test_filter(self):
     """filter() must add the classification header to the message."""
     self.h.open('c')
     self.h.h.bayes.learn(tokenize(good1), False)
     self.h.h.bayes.learn(tokenize(spam1), True)
     self.h.h.store()

     # The spam message is tagged with the spam string.
     result = email.message_from_string(self.h.filter(spam1))
     header_name = options["Headers", "classification_header_name"]
     spam_tag = options["Headers", "header_spam_string"]
     self.assert_(result[header_name].startswith(spam_tag))

     # The ham message is tagged with the ham string.
     result = email.message_from_string(self.h.filter(good1))
     header_name = options["Headers", "classification_header_name"]
     ham_tag = options["Headers", "header_ham_string"]
     self.assert_(result[header_name].startswith(ham_tag))
Example #22
0
 def test_filter(self):
     """filter() must add the classification header to the message."""
     self.h.open('c')
     self.h.h.bayes.learn(tokenize(good1), False)
     self.h.h.bayes.learn(tokenize(spam1), True)
     self.h.h.store()

     # The spam message is tagged with the spam string.
     result = email.message_from_string(self.h.filter(spam1))
     header_name = options["Headers", "classification_header_name"]
     spam_tag = options["Headers", "header_spam_string"]
     self.assert_(result[header_name].startswith(spam_tag))

     # The ham message is tagged with the ham string.
     result = email.message_from_string(self.h.filter(good1))
     header_name = options["Headers", "classification_header_name"]
     ham_tag = options["Headers", "header_ham_string"]
     self.assert_(result[header_name].startswith(ham_tag))
Example #23
0
def score(unsure, h, cls, scores, msgids=None, skipspam=False):
    """See what effect on others each msg in unsure has"""
    ham_cutoff = options["Categorization", "ham_cutoff"]
    spam_cutoff = options["Categorization", "spam_cutoff"]
    n = 0
    total = 0.0
    okalready = set()
    add = okalready.add
    for msg in getmbox(unsure):
        prob = cls.spamprob(tokenize(msg))
        n += 1
        if prob >= spam_cutoff:
            add(msg['message-id'])
        else:
            total += prob
    first_mean = total/n
    print len(okalready), "out of", n, "messages already score as spam"
    print "initial mean spam prob: %.3f" % first_mean
    print "%5s %3s %5s %5s %s" % ("prob", "new", "mean", "sdev", "msgid")
    for msg in getmbox(unsure):
        msgid = msg['message-id']
        if msgids is not None and msgid not in msgids:
            continue
        msgprob = cls.spamprob(tokenize(msg))
        if skipspam and msgprob >= spam_cutoff:
            continue
        n = j = 0
        h.train(msg, True)
        total = 0.0
        probs = []
        for trial in getmbox(unsure):
            if trial['message-id'] in okalready:
                continue
            n += 1
            if n % 10 == 0:
                counter("", n)
            prob = cls.spamprob(tokenize(trial))
            probs.append(prob)
            total += prob
            if prob >= spam_cutoff:
                j += 1
        counter("", n)
        h.untrain(msg, True)
        mean = total/n
        meankey = round(mean, 3)
        scores.setdefault(meankey, []).append(msgid)
        sdev = math.sqrt(sum([(mean-prob)**2 for prob in probs])/n)
        print "\r%.3f %3d %.3f %.3f %s" % (msgprob, j, mean, sdev, msgid)
Example #24
0
 def main(fp):

    charset = locale.getdefaultlocale()[1]

    if not charset:

        charset = 'us-ascii'

    db = pspam.database.open()

    r = db.open().root()

    p = r["profile"]

    msg = email.message_from_file(fp)

    prob, evidence = p.classifier.spamprob(tokenize(msg), True)

    print "Score:", prob

    print

    print "Clues"

    print "-----"

    for clue, prob in evidence:

        if isinstance(clue, UnicodeType):

            clue = clue.encode(charset, 'replace')

        print clue, prob
Example #25
0
def main(args):
    """Parse the command line and print spam counts for the tokens.

    Tokens come from the remaining command-line arguments, or from
    tokenizing stdin when -t/--tokenize is given.  Returns 0 on success
    (and for -h/--help) and 1 on a usage error.
    """
    try:
        opts, args = getopt.getopt(args, "hrto:",
                                   ["help", "re", "tokenize", "option="])
    except getopt.GetoptError as msg:
        usage(msg)
        return 1

    usere = tokenizestdin = False
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        if opt in ("-r", "--re"):
            usere = True
        elif opt in ("-t", "--tokenize"):
            tokenizestdin = True
        elif opt in ("-o", "--option"):
            options.set_from_cmdline(arg, sys.stderr)

    if usere and tokenizestdin:
        usage("-r and -t may not be used at the same time")
        return 1

    dbname, usedb = database_type(opts)
    db = open_storage(dbname, usedb)

    if tokenizestdin:
        args = tokenize(sys.stdin)

    if not args:
        usage("need tokens on cmd line or -t w/ msg on stdin")
        return 1
    print_spamcounts(args, db, usere)
    return 0
Example #26
0
def filter_message(hamdir, spamdir):
    """Read a message from stdin, score it and file it under new/.

    The message is copied to a temporary file obtained from
    maketmp(hamdir), preceded by the DTLINE environment value.  Messages
    smaller than SIZE_LIMIT are scored with the classifier stored in
    DB_FILE; larger ones get a score of 0.0.  The temporary file is then
    renamed into "<spamdir>/new" when the score exceeds SPAM_CUTOFF and
    into "<hamdir>/new" otherwise.

    On any exception the temporary file is removed and the exception is
    re-raised.
    """
    # Exit with status 1 if the whole operation takes more than a day.
    signal.signal(signal.SIGALRM, lambda s, f: sys.exit(1))
    signal.alarm(24 * 60 * 60)
    tmpfile, pathname, filename = maketmp(hamdir)
    try:
        tmpfile.write(os.environ.get("DTLINE", "")) # delivered-to line
        nbytes = 0  # renamed from `bytes`, which shadowed the builtin
        blocks = []
        while True:
            block = sys.stdin.read(BLOCK_SIZE)
            if not block:
                break
            nbytes += len(block)
            # Only keep the text in memory while it may still be scored;
            # the temporary file always receives everything.
            if nbytes < SIZE_LIMIT:
                blocks.append(block)
            tmpfile.write(block)
        tmpfile.close()
        if nbytes < SIZE_LIMIT:
            msgdata = ''.join(blocks)
            del blocks
            msg = email.message_from_string(msgdata)
            del msgdata
            dbfile = open(DB_FILE, 'rb')
            try:
                prob = CdbClassifier(dbfile).spamprob(tokenize(msg))
            finally:
                # The score is a plain number by now, so the database
                # file can be closed instead of being leaked.
                dbfile.close()
        else:
            prob = 0.0
        if prob > SPAM_CUTOFF:
            os.rename(pathname, "%s/new/%s" % (spamdir, filename))
        else:
            os.rename(pathname, "%s/new/%s" % (hamdir, filename))
    except:
        # Deliberately bare: the temporary file must be removed on every
        # kind of exit, including the SystemExit raised by the alarm
        # handler; the exception is then re-raised unchanged.
        os.unlink(pathname)
        raise
Example #27
0
def print_message_score(msg_name, msg_fp):
    msg = email.message_from_file(msg_fp)
    bayes = CdbClassifier(open(DB_FILE, 'rb'))
    prob, evidence = bayes.spamprob(tokenize(msg), evidence=True)
    print msg_name, prob
    for word, prob in evidence:
        print '  ', repr(word), prob
Example #28
0
def train(text, isSpam):
    """Tokenize `text` and have the classifier learn it as spam or ham."""
    bayes.learn(tokenizer.tokenize(text), isSpam)
Example #29
0
 def classify(text):
     """Return the classifier's spam probability for `text`."""
     return bayes.spamprob(tokenizer.tokenize(text))
Example #30
0
def processAndTrain(v, vmoveto, bayes, is_spam, notesindex, log):
    if is_spam:
        header_str = options["Headers", "header_spam_string"]
    else:
        header_str = options["Headers", "header_ham_string"]
    print "Training %s" % (header_str,)
    docstomove = []
    doc = v.GetFirstDocument()
    while doc:
        message = getMessage(doc)
        options["Tokenizer", "generate_long_skips"] = False
        tokens = tokenizer.tokenize(message)
        nid = doc.NOTEID
        if notesindex.has_key(nid):
            trainedas = notesindex[nid]
            if trainedas == options["Headers", "header_spam_string"] and \
               not is_spam:
                bayes.unlearn(tokens, True)
            elif trainedas == options["Headers", "header_ham_string"] and \
                 is_spam:
                bayes.unlearn(tokens, False)
        bayes.learn(tokens, is_spam)
        notesindex[nid] = header_str
        docstomove += [doc]
        doc = v.GetNextDocument(doc)
    for doc in docstomove:
        doc.RemoveFromFolder(v.Name)
        doc.PutInFolder(vmoveto.Name)
    print "%s documents trained" % (len(docstomove),)
    if log:
        log.LogAction("%s documents trained" % (len(docstomove),))
Example #31
0
def classifyInbox(v, vmoveto, bayes, ldbname, notesindex, log):
    if len(notesindex.keys()) == 0:
        firsttime = 1
    else:
        firsttime = 0
    docstomove = []
    numham = 0
    numspam = 0
    numuns = 0
    numdocs = 0
    doc = v.GetFirstDocument()
    while doc:
        nid = doc.NOTEID
        if firsttime:
            notesindex[nid] = 'never classified'
        else:
            if not notesindex.has_key(nid):
                numdocs += 1
                message = getMessage(doc)
                options["Tokenizer", "generate_long_skips"] = False
                tokens = tokenizer.tokenize(message)
                prob, clues = bayes.spamprob(tokens, evidence=True)
                if prob < options["Categorization", "ham_cutoff"]:
                    disposition = options["Headers", "header_ham_string"]
                    numham += 1
                elif prob > options["Categorization", "spam_cutoff"]:
                    disposition = options["Headers", "header_spam_string"]
                    docstomove += [doc]
                    numspam += 1
                else:
                    disposition = options["Headers", "header_unsure_string"]
                    numuns += 1
                notesindex[nid] = 'classified'
                try:
                    print "%s spamprob is %s" % (subj[:30], prob)
                    if log:
                        log.LogAction("%s spamprob is %s" % (subj[:30],
                                                             prob))
                except UnicodeError:
                    print "<subject not printed> spamprob is %s" % (prob)
                    if log:
                        log.LogAction("<subject not printed> spamprob " \
                                      "is %s" % (prob,))
                item = doc.ReplaceItemValue("Spam", prob)
                item.IsSummary = True
                doc.save(False, True, False)
        doc = v.GetNextDocument(doc)
    for doc in docstomove:
        doc.RemoveFromFolder(v.Name)
        doc.PutInFolder(vmoveto.Name)
    print "%s documents processed" % (numdocs,)
    print "   %s classified as spam" % (numspam,)
    print "   %s classified as ham" % (numham,)
    print "   %s classified as unsure" % (numuns,)
    if log:
        log.LogAction("%s documents processed" % (numdocs,))
        log.LogAction("   %s classified as spam" % (numspam,))
        log.LogAction("   %s classified as ham" % (numham,))
        log.LogAction("   %s classified as unsure" % (numuns,))
Example #32
0
     def untrain(self, msg, is_spam):
         """Remove a message's training from the classifier.

         msg can be a string, a file object, or a Message object.
         is_spam should be True if the message was trained as spam and
         False if it was trained as ham.
         """
         tokens = tokenize(msg)
         self.bayes.unlearn(tokens, is_spam)
Example #33
0
def untrain_message(msg, cdata):
    """Undo any earlier training of ``msg``.

    Returns True if the message had been trained as spam, False if it had
    been trained as ham, and None (without unlearning anything) if it had
    not been trained at all.  When something is unlearned, the message is
    also removed from ``cdata.message_db`` and ``cdata.dirty`` is set.
    """
    from spambayes.tokenizer import tokenize
    stream = msg.GetEmailPackageObject()
    cdata.message_db.load_msg(msg)

    # Work out which way (if any) the message was trained.
    if been_trained_as_spam(msg):
        assert not been_trained_as_ham(msg), "Can't have been both!"
        was_spam = True
    elif been_trained_as_ham(msg):
        assert not been_trained_as_spam(msg), "Can't have been both!"
        was_spam = False
    else:
        # Never trained: nothing to undo.
        return None

    cdata.bayes.unlearn(tokenize(stream), was_spam)
    cdata.message_db.remove_msg(msg)
    cdata.dirty = True
    return was_spam
Example #34
0
 def test_pickle_export(self):
     # Create a pickled classifier to export, with a couple of messages
     # in it so it's not empty.
     bayes = PickledClassifier(TEMP_PICKLE_NAME)
     bayes.learn(tokenize(spam1), True)
     bayes.learn(tokenize(good1), False)
     bayes.store()
     # Export.
     sb_dbexpimp.runExport(TEMP_PICKLE_NAME, "pickle", TEMP_CSV_NAME)
     # Verify that the CSV holds all the original data (and, by using
     # the CSV module to open it, that it is valid CSV data).
     fp = open(TEMP_CSV_NAME, "rb")
     try:
         reader = sb_dbexpimp.csv.reader(fp)
         # The first row carries the message counts.
         (nham, nspam) = reader.next()
         self.assertEqual(int(nham), bayes.nham)
         self.assertEqual(int(nspam), bayes.nspam)
         # Every following row is (word, hamcount, spamcount).
         for (word, hamcount, spamcount) in reader:
             word = sb_dbexpimp.uunquote(word)
             self.assert_(word in bayes._wordinfokeys())
             wi = bayes._wordinfoget(word)
             self.assertEqual(int(hamcount), wi.hamcount)
             self.assertEqual(int(spamcount), wi.spamcount)
     finally:
         # Always release the handle, even when an assertion fails, so
         # the temporary CSV file is not left open.
         fp.close()
Example #35
0
 def train(bayes, msgs, is_spam):
    """Train bayes with all messages from a mailbox.

    msgs is handed to mboxutils.getmbox(); each message it yields is
    tokenized and learned with the given is_spam flag.
    """
    for message in mboxutils.getmbox(msgs):
        token_stream = tokenize(message)
        bayes.learn(token_stream, is_spam)
Example #36
0
 def test_train_spam(self):
     # Train a single spam message, then check the message counters and
     # the per-token counts stored in the classifier.
     self.h.open('c')
     self.h.train_spam(spam1)
     classifier = self.h.h.bayes
     self.assertEqual(classifier.nham, 0)
     self.assertEqual(classifier.nspam, 1)
     for token in tokenize(spam1):
         info = self.h.h.bayes._wordinfoget(token)
         self.assertEqual(info.hamcount, 0)
         self.assertEqual(info.spamcount, 1)
 def test_merge_to_dbm(self):
     # Start from a dbm classifier that already holds some training
     # data, so the import has something to merge with.
     bayes = DBDictClassifier(TEMP_DBM_NAME)
     bayes.learn(tokenize(spam1), True)
     bayes.learn(tokenize(good1), False)
     # Remember the pre-import counts and per-word records.
     original_nham, original_nspam = bayes.nham, bayes.nspam
     original_data = {}
     for key in bayes._wordinfokeys():
         original_data[key] = bayes._wordinfoget(key)
     # Save & close before the importer touches the file.
     bayes.store()
     bayes.close()
     # Write the CSV file to import: a "nham,nspam" line followed by
     # one "word,hamcount,spamcount" line per word.
     nham, nspam = 3, 4
     csv_data = {"this":(2,1), "is":(0,1), "a":(3,4), 'test':(1,1),
                 "of":(1,0), "the":(1,2), "import":(3,1)}
     temp = open(TEMP_CSV_NAME, "wb")
     temp.write("%d,%d\n" % (nham, nspam))
     for word, counts in csv_data.items():
         temp.write("%s,%s,%s\n" % (word, counts[0], counts[1]))
     temp.close()
     sb_dbexpimp.runImport(TEMP_DBM_NAME, "dbm", False, TEMP_CSV_NAME)
     # Reopen the database (which also proves it is still a valid dbm
     # file) and check it now holds the CSV data added to the original
     # data.
     merged = open_storage(TEMP_DBM_NAME, "dbm")
     self.assertEqual(merged.nham, nham + original_nham)
     self.assertEqual(merged.nspam, nspam + original_nspam)
     words = list(original_data.keys()) + list(csv_data.keys())
     for word in words:
         word = sb_dbexpimp.uquote(word)
         self.assert_(word in merged._wordinfokeys())
         # Expected counts are the CSV counts plus whatever the word
         # already had in the original database.
         expected_ham, expected_spam = csv_data.get(word, (0,0))
         previous = original_data.get(word, None)
         if previous:
             expected_ham += previous.hamcount
             expected_spam += previous.spamcount
         merged_info = merged._wordinfoget(word)
         self.assertEqual(expected_ham, merged_info.hamcount)
         self.assertEqual(expected_spam, merged_info.spamcount)
Example #38
0
 def test_train_spam(self):
     # Training one spam message must leave nham at 0, nspam at 1, and
     # every token of that message counted once as spam and never as ham.
     self.h.open('c')
     self.h.train_spam(spam1)
     trained = self.h.h.bayes
     self.assertEqual(trained.nham, 0)
     self.assertEqual(trained.nspam, 1)
     for tok in tokenize(spam1):
         record = self.h.h.bayes._wordinfoget(tok)
         self.assertEqual(record.hamcount, 0)
         self.assertEqual(record.spamcount, 1)
Example #39
0
    def untrain(self, msg, is_spam):
        """Untrain bayes with a message.

        msg can be a string, a file object, or a Message object; it is
        tokenized and the tokens are passed to self.bayes.unlearn().

        is_spam should be True if the message is spam, False if not.
        """
        tokens = tokenize(msg)
        self.bayes.unlearn(tokens, is_spam)
Example #40
0
     def _scoremsg(self, msg, evidence=False):
        """Score a Message.

        msg can be a string, a file object, or a Message object.
        Returns the probability the message is spam.  If evidence is
        true, returns a tuple: (probability, clues), where clues is a
        list of the words which contributed to the score.
        """
        token_stream = tokenize(msg)
        result = self.bayes.spamprob(token_stream, evidence)
        return result
Example #41
0
 def test_train_spam(self):
     # Check that train_spam() feeds the message to the classifier:
     # one spam message counted, no ham, and each token recorded with
     # a spam count of 1 and a ham count of 0.
     self.h.open('c')
     self.h.train_spam(spam1)
     bayes = self.h.h.bayes
     self.assertEqual(bayes.nham, 0)
     self.assertEqual(bayes.nspam, 1)
     for word in tokenize(spam1):
         word_info = self.h.h.bayes._wordinfoget(word)
         self.assertEqual(word_info.hamcount, 0)
         self.assertEqual(word_info.spamcount, 1)
Example #42
0
 def __init__(self, dir, name):
     """Load the message stored at ``dir + "/" + name``.

     The path is kept in ``self.tag``.  The file is read in binary
     mode, tokenized, and ``self.guts`` ends up holding the distinct
     tokens joined with the "`~`" separator.  The score-related
     attributes start out empty.
     """
     path = dir + "/" + name
     self.tag = path
     f = open(path, 'rb')
     try:
         self.guts = f.read()
     finally:
         # Release the handle even if the read fails.
         f.close()
     self.prob = None
     self.probdiff = None
     self.allclues = []
     self.clues = []
     # set() drops duplicate tokens (and therefore any ordering).
     self.guts = "`~`".join(set(tokenize(self.guts)))
Example #43
0
    def test_newdb(self):
        # Seed a dbm database under the shared temp name so there is
        # existing data to be replaced.
        seeded = open_storage(TEMP_DBM_NAME, "dbm")
        seeded.learn(tokenize(spam1), True)
        seeded.learn(tokenize(good1), False)
        seeded.store()
        seeded.close()

        # Ask for a fresh classifier.
        self.h.newdb()

        # newdb() must not leave a classifier open.
        self.assertEqual(self.h.h, None)

        # The database that was there before must have been
        # overwritten: reopening it shows no trained messages.
        reopened = open_storage(TEMP_DBM_NAME, "dbm")
        self.assertEqual(reopened.nham, 0)
        self.assertEqual(reopened.nspam, 0)
        reopened.close()
Example #44
0
    def _scoremsg(self, msg, evidence=False):
        """Score a Message.

        msg can be a string, a file object, or a Message object; it is
        tokenized before being scored.

        Returns the probability the message is spam.  If evidence is
        true, returns a tuple: (probability, clues), where clues is a
        list of the words which contributed to the score.
        """
        tokens = tokenize(msg)
        return self.bayes.spamprob(tokens, evidence)
Example #45
0
File: msgs.py Project: lsabc/KARMA
 def __init__(self, dir, name, index=None):
     """Load the message stored at ``dir + "/" + name``.

     The path is kept in ``self.tag`` and ``index`` is stored
     unchanged in ``self.index``.  The file is read in binary mode,
     tokenized, and ``self.guts`` ends up holding the distinct tokens
     joined with the "`~`" separator.  The score-related attributes
     start out empty.
     """
     path = dir + "/" + name
     self.tag = path
     f = open(path, 'rb')
     try:
         self.guts = f.read()
     finally:
         # Release the handle even if the read fails.
         f.close()
     self.prob = None
     self.probdiff = None
     self.allclues = []
     self.clues = []
     # Distinct tokens of the email, joined with "`~`"; set() drops
     # duplicates (and therefore any ordering).
     self.guts = "`~`".join(set(tokenize(self.guts)))
     self.index = index
Example #46
0
    def message_parse_error(self, buf):
        """Log an unparseable message, score it, and return it with the
        score header inserted as its second line.
        """
        # We got an error parsing the message.  We've already told the
        # client to expect more bytes than this buffer contains, but
        # there's no clean way to add the header.
        self.server.log.write("# error: %s\n" % repr(buf))

        # XXX what to do?  Let's just add it after the first line.
        score = self.server.classifier.spamprob(tokenize(buf))

        lines = buf.split("\n")
        lines[1:1] = [HEADER % score]
        return "\n".join(lines)
Example #47
0
def main(fp):
    charset = locale.getdefaultlocale()[1]
    if not charset:
        charset = 'us-ascii'

    db = pspam.database.open()
    r = db.open().root()

    p = r["profile"]

    msg = email.message_from_file(fp)
    prob, evidence = p.classifier.spamprob(tokenize(msg), True)
    print "Score:", prob
    print
    print "Clues"
    print "-----"
    for clue, prob in evidence:
        if isinstance(clue, UnicodeType):
            clue = clue.encode(charset, 'replace')
        print clue, prob
Example #48
0
    def train(self, msg, is_spam, add_header=False):
        """Train bayes with a message.

        msg can be a string, a file object, or a Message object.

        is_spam should be 1 if the message is spam, 0 if not.

        If add_header is True, replace the message's "trained" header
        with one recording how it was trained (in case we need to
        untrain later).  This step uses ``del msg[...]`` and
        ``msg.add_header()``.
        """
        self.bayes.learn(tokenize(msg), is_spam)
        if not add_header:
            return

        if is_spam:
            verdict_option = "header_spam_string"
        else:
            verdict_option = "header_ham_string"
        trained = options["Headers", verdict_option]

        # Drop any existing header before adding the new one.
        header_name = options["Headers", "trained_header_name"]
        del msg[header_name]
        msg.add_header(header_name, trained)
Example #49
0
 def setUp(self):
     # Message under test, parsed as an SBHeaderMessage.
     self.msg = email.message_from_string(spam1, _class=SBHeaderMessage)
     # Collect three probabilities from a scratch classifier:
     #   u_prob - good1 scored before any training,
     #   g_prob - good1 scored after learning it as ham,
     #   s_prob - spam1 scored after unlearning good1 and learning
     #            spam1 as spam (its clues are kept as well).
     classifier = Classifier()
     self.u_prob, unused_clues = classifier.spamprob(tokenize(good1), True)
     classifier.learn(tokenize(good1), False)
     self.g_prob, unused_clues = classifier.spamprob(tokenize(good1), True)
     classifier.unlearn(tokenize(good1), False)
     classifier.learn(tokenize(spam1), True)
     self.s_prob, self.clues = classifier.spamprob(tokenize(spam1), True)
     # The configured classification strings.
     self.ham = options['Headers', 'header_ham_string']
     self.spam = options['Headers', 'header_spam_string']
     self.unsure = options['Headers', 'header_unsure_string']
     # Give the message a "to" header.
     self.to = "[email protected];[email protected]"
     self.msg["to"] = self.to
def processAndTrain(v, vmoveto, bayes, is_spam, notesindex, log):
    if is_spam:
        header_str = options["Headers", "header_spam_string"]
    else:
        header_str = options["Headers", "header_ham_string"]

    print "Training %s" % (header_str,)

    docstomove = []
    doc = v.GetFirstDocument()
    while doc:
        message = getMessage(doc)

        options["Tokenizer", "generate_long_skips"] = False
        tokens = tokenizer.tokenize(message)

        nid = doc.NOTEID
        if notesindex.has_key(nid):
            trainedas = notesindex[nid]
            if trainedas == options["Headers", "header_spam_string"] and \
               not is_spam:
                # msg is trained as spam, is to be retrained as ham
                bayes.unlearn(tokens, True)
            elif trainedas == options["Headers", "header_ham_string"] and \
                 is_spam:
                # msg is trained as ham, is to be retrained as spam
                bayes.unlearn(tokens, False)

        bayes.learn(tokens, is_spam)

        notesindex[nid] = header_str
        docstomove += [doc]
        doc = v.GetNextDocument(doc)

    for doc in docstomove:
        doc.RemoveFromFolder(v.Name)
        doc.PutInFolder(vmoveto.Name)

    print "%s documents trained" % (len(docstomove),)
    if log:
        log.LogAction("%s documents trained" % (len(docstomove),))
Example #51
0
    def score_mime(self, msg_text, encoding):
        """Score a message representing a MIME document.

        msg_text is a string in the given encoding.  It is normalised
        to UTF-8, parsed as an SBHeaderMessage, scored, given the
        SpamBayes headers, and the classification is recorded in
        self.state.  Returns the spam probability.
        """

        # XXX Much of this probably belongs in the core server...

        if self.state.bayes is None:
            self.state.create_workers()

        # Canonical string form: byte strings are decoded with the
        # caller's encoding, then everything is re-encoded as utf-8.
        if isinstance(msg_text, str):
            msg_text = unicode(msg_text, encoding)
        if isinstance(msg_text, unicode):
            msg_text = msg_text.encode("utf-8")
        msg = message_from_string(msg_text,
                                  _class=spambayes.message.SBHeaderMessage)

        token_stream = tokenize(msg)

        # XXX Maybe from here on down...

        prob, clues = self.state.bayes.spamprob(token_stream, evidence=True)
        msg.addSBHeaders(prob, clues)

        self.state.record_classification(msg.GetClassification(), prob)

        # Caching is skipped in test mode or when the option is off.
        if self.state.is_test or not options["Storage", "cache_messages"]:
            return prob

        # Store the message in the Unknown cache under a new name.
        msg.setId(self.state.getNewMessageName())
        corpus = self.state.unknownCorpus
        cached = corpus.makeMessage(msg.getId(), msg.as_string())
        self.state.unknownCorpus.addMessage(cached)
        return prob
Example #52
0
def filter_message(hamdir, spamdir):
    """Read a message from stdin and file it under hamdir or spamdir.

    The message is written to a temporary file created by
    maketmp(hamdir), scored unless it reaches SIZE_LIMIT bytes (in
    which case it gets a probability of 0.0), and the temporary file
    is renamed into "<spamdir>/new/" when the score exceeds SPAM_CUTOFF,
    otherwise into "<hamdir>/new/".  On any failure the temporary file
    is removed and the exception is re-raised.
    """
    # Exit with status 1 if the alarm fires, 24 hours from now.
    signal.signal(signal.SIGALRM, lambda s, f: sys.exit(1))
    signal.alarm(24 * 60 * 60)

    # write message to temporary file (must be on same partition)
    tmpfile, pathname, filename = maketmp(hamdir)
    try:
        tmpfile.write(os.environ.get("DTLINE", "")) # delivered-to line
        total_size = 0
        kept_blocks = []
        while True:
            block = sys.stdin.read(BLOCK_SIZE)
            if not block:
                break
            total_size += len(block)
            # Only keep the data in memory while it is still below the
            # limit; the temporary file always gets every block.
            if total_size < SIZE_LIMIT:
                kept_blocks.append(block)
            tmpfile.write(block)
        tmpfile.close()

        if total_size >= SIZE_LIMIT:
            prob = 0.0
        else:
            msgdata = ''.join(kept_blocks)
            del kept_blocks
            msg = email.message_from_string(msgdata)
            del msgdata
            bayes = CdbClassifier(open(DB_FILE, 'rb'))
            prob = bayes.spamprob(tokenize(msg))

        if prob > SPAM_CUTOFF:
            destdir = spamdir
        else:
            destdir = hamdir
        os.rename(pathname, "%s/new/%s" % (destdir, filename))
    except:
        # Deliberately bare: whatever went wrong, do not leave the
        # temporary file behind, then let the error propagate.
        os.unlink(pathname)
        raise
Example #53
0
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-r", "--re"):
            usere = True
        elif opt in ("-t", "--tokenize"):
            tokenizestdin = True
        elif opt in ('-o', '--option'):
            options.set_from_cmdline(arg, sys.stderr)

    if usere and tokenizestdin:
        usage("-r and -t may not be used at the same time")
        return 1

    dbname, usedb = database_type(opts)
    db = open_storage(dbname, usedb)

    if tokenizestdin:
        args = tokenize(sys.stdin)

    if args:
        print_spamcounts(args, db, usere)
        return 0
    else:
        usage("need tokens on cmd line or -t w/ msg on stdin")
        return 1


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
Example #54
0
 def __iter__(self):
     """Iterate over the result of tokenize() applied to self.guts."""
     token_stream = tokenize(self.guts)
     return token_stream
Example #55
0
 def test_tokenize(self):
     # The message's own tokenize() must yield exactly what the
     # module-level tokenize() yields for the same text.
     actual = self.msg.tokenize()
     expected = tuple(tokenize(spam1))
     self.assertEqual(expected, tuple(actual))
Example #56
0
def train(store, hambox, spambox, maxmsgs, maxrounds, tdict, reverse, verbose,
          ratio):
    round = 0
    ham_cutoff = Options.options["Categorization", "ham_cutoff"]
    spam_cutoff = Options.options["Categorization", "spam_cutoff"]

    # list-ify ham and spam iterators immediately.  We don't really want to
    # fetch the messages multiple times, and this is no worse than what happened
    # before when -R was passed.
    hambone_ = list(mboxutils.getmbox(hambox))
    spamcan_ = list(mboxutils.getmbox(spambox))

    if reverse:
        hambone_ = list(reversed(hambone_))
        spamcan_ = list(reversed(spamcan_))

    nspam, nham = len(spamcan_), len(hambone_)
    if ratio:
        rspam, rham = ratio
        # If the actual ratio of spam to ham in the database is better than
        # what was asked for, use that better ratio.
        if (rspam > rham) == (rspam * nham > rham * nspam):
            rspam, rham = nspam, nham

    # define some indexing constants
    ham = 0
    spam = 1
    name = ('ham', 'spam')
    misses = [0, 0]

    misclassified = lambda is_spam, score: (is_spam and score < spam_cutoff or
                                            not is_spam and score > ham_cutoff)

    while round < maxrounds and (misses[ham] or misses[spam] or round == 0):
        round += 1
        if verbose:
            print >> sys.stderr, "*** round", round, "***"

        start = datetime.datetime.now()
        hambone = iter(hambone_)
        spamcan = iter(spamcan_)

        i = [0, 0]
        msgs_processed = 0
        misses = [0, 0]
        training_sets = [hambone, spamcan]

        while not maxmsgs or msgs_processed < maxmsgs:

            # should the next message come from hambone or spamcan?
            train_spam = i[ham] * rspam > i[spam] * rham

            try:
                train_msg = training_sets[train_spam].next()
            except StopIteration:
                break

            i[train_spam] += 1
            msgs_processed += 1
            sys.stdout.write("\r%5d" % msgs_processed)
            sys.stdout.flush()

            tokens = list(tokenize(train_msg))
            score = store.spamprob(tokens)
            selector = train_msg["message-id"] or train_msg["subject"]

            if misclassified(train_spam, score) and selector is not None:
                if verbose:
                    print >> sys.stderr, "\tmiss %s: %.6f %s" % (
                        name[train_spam], score, selector)

                misses[train_spam] += 1
                tdict[train_msg["message-id"]] = True
                store.learn(tokens, train_spam)

        delta = datetime.datetime.now() - start
        seconds = delta.seconds + delta.microseconds / 1000000

        print "\rround: %2d, msgs: %4d, ham misses: %3d, spam misses: %3d, %.1fs" % \
              (round, msgs_processed, misses[0], misses[1], seconds)

    training_sets = [hambone, spamcan]

    # We count all untrained messages so the user knows what was skipped.
    # We also tag them for saving so we don't lose messages which might have
    # value in a future run
    for is_spam in ham, spam:
        nleft = 0
        try:
            while True:
                msg = training_sets[is_spam].next()
                score = store.spamprob(tokenize(msg))

                if misclassified(is_spam, score):
                    tdict[msg["message-id"]] = True
                    nleft += 1

        except StopIteration:
            if nleft:
                print nleft, "untrained %ss" % name[is_spam]
Example #57
0
 def tokenize(self):
     """Return the result of the module-level tokenize() for this object."""
     token_stream = tokenize(self)
     return token_stream
def classifyInbox(v, vmoveto, bayes, ldbname, notesindex, log):

    # the notesindex hash ensures that a message is looked at only once

    if len(notesindex.keys()) == 0:
        firsttime = 1
    else:
        firsttime = 0

    docstomove = []
    numham = 0
    numspam = 0
    numuns = 0
    numdocs = 0

    doc = v.GetFirstDocument()
    while doc:
        nid = doc.NOTEID
        if firsttime:
            notesindex[nid] = 'never classified'
        else:
            if not notesindex.has_key(nid):
                numdocs += 1

                # Notes returns strings in unicode, and the Python
                # decoder has trouble with these strings when
                # you try to print them.  So don't...

                message = getMessage(doc)

                # generate_long_skips = True blows up on occasion,
                # probably due to this unicode problem.
                options["Tokenizer", "generate_long_skips"] = False
                tokens = tokenizer.tokenize(message)
                prob = bayes.spamprob(tokens)

                if prob < options["Categorization", "ham_cutoff"]:
                    numham += 1
                elif prob > options["Categorization", "spam_cutoff"]:
                    docstomove += [doc]
                    numspam += 1
                else:
                    numuns += 1

                notesindex[nid] = 'classified'
                subj = message["subject"]
                try:
                    print "%s spamprob is %s" % (subj[:30], prob)
                    if log:
                        log.LogAction("%s spamprob is %s" % (subj[:30],
                                                             prob))
                except UnicodeError:
                    print "<subject not printed> spamprob is %s" % (prob)
                    if log:
                        log.LogAction("<subject not printed> spamprob " \
                                      "is %s" % (prob,))

                item = doc.ReplaceItemValue("Spam", prob)
                item.IsSummary = True
                doc.save(False, True, False)

        doc = v.GetNextDocument(doc)

    # docstomove list is built because moving documents in the middle of
    # the classification loop loses the iterator position
    for doc in docstomove:
        doc.RemoveFromFolder(v.Name)
        doc.PutInFolder(vmoveto.Name)

    print "%s documents processed" % (numdocs,)
    print "   %s classified as spam" % (numspam,)
    print "   %s classified as ham" % (numham,)
    print "   %s classified as unsure" % (numuns,)
    if log:
        log.LogAction("%s documents processed" % (numdocs,))
        log.LogAction("   %s classified as spam" % (numspam,))
        log.LogAction("   %s classified as ham" % (numham,))
        log.LogAction("   %s classified as unsure" % (numuns,))
Example #59
0
def train(bayes, msgs, is_spam):
    """Train bayes with all messages from a mailbox.

    msgs is opened with mboxutils.getmbox(); every message it yields is
    tokenized and learned with the given is_spam flag.
    """
    mailbox = mboxutils.getmbox(msgs)
    for message in mailbox:
        tokens = tokenize(message)
        bayes.learn(tokens, is_spam)