Example No. 1
# Imports needed by this snippet (bayes and toolkit are project-local modules).
from collections import defaultdict
import math
import operator

import bayes
import toolkit


def best_features():
    ham_train = bayes.HAM + bayes.TRAIN
    spam_train = bayes.SPAM + bayes.TRAIN
    # Vocabulary of each training set: every word wrapped as '/word/', the
    # pattern form later handed to toolkit.countre.
    ham_words = map(lambda s: '/' + s + '/',
            set.union(*map(toolkit.word_bag, toolkit.get_files(ham_train))))
    spam_words = map(lambda s: '/' + s + '/',
            set.union(*map(toolkit.word_bag, toolkit.get_files(spam_train))))
    all_words = set(ham_words + spam_words)

    # p(w_i | c)
    p_w_ham = defaultdict(lambda: toolkit.ZERO, zip(ham_words, 
            toolkit.countre(ham_train, ham_words, smoothing=toolkit.ONE)[0]))
    p_w_spam = defaultdict(lambda: toolkit.ZERO, zip(spam_words,
            toolkit.countre(spam_train, spam_words, smoothing=toolkit.ONE)[0]))
    mut_inf = dict()
    maxlog = math.log(toolkit.MAX)

    def no_error(x, y):
        """log(x / y), falling back to log(toolkit.MAX) on division by zero
        or log of a non-positive value."""
        try:
            return math.log(x / y)
        except Exception:
            return maxlog
    for word in all_words:
        p_w_h = p_w_ham[word]           # p(w | ham)
        p_nw_h = toolkit.ONE - p_w_h    # p(not w | ham)
        p_w_s = p_w_spam[word]          # p(w | spam)
        p_nw_s = toolkit.ONE - p_w_s    # p(not w | spam)
        # Marginals under the class priors: p(w) and p(not w).
        p_w = p_w_h * toolkit.PRIOR_HAM + p_w_s * toolkit.PRIOR_SPAM
        p_nw = toolkit.ONE - p_w
        log_w_h = no_error(p_w, p_w_h)
        log_nw_h = no_error(p_nw, p_nw_h)
        log_w_s = no_error(p_w, p_w_s)
        log_nw_s = no_error(p_nw, p_nw_s)
        # Accumulate the four terms of the word's mutual information with the class.
        mut_inf[word] = p_w_h * toolkit.PRIOR_HAM * log_w_h
        mut_inf[word] += p_nw_h * toolkit.PRIOR_HAM * log_nw_h
        mut_inf[word] += p_w_s * toolkit.PRIOR_SPAM * log_w_s
        mut_inf[word] += p_nw_s * toolkit.PRIOR_SPAM * log_nw_s
    return sorted(mut_inf.iteritems(), key=operator.itemgetter(1), reverse=True)
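
The score accumulated in mut_inf is, up to sign, the mutual information between the event "word w occurs" and the class, weighted by the priors PRIOR_HAM and PRIOR_SPAM. In its usual form:

    I(W; C) = \sum_{w \in \{0, 1\}} \sum_{c \in \{ham, spam\}} P(w \mid c) P(c) \log \frac{P(w \mid c)}{P(w)}

Note that no_error is called as no_error(p_w, p_w_h), i.e. with the ratio inverted relative to the formula above, so (apart from the maxlog fallback for zero probabilities) each stored score is the negated mutual information. That is why Example No. 3 below keeps the tail ([-300:]) of the descending sort rather than the head.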
Example No. 2
# Imports needed by this snippet (features and toolkit are project-local modules).
import features
import toolkit


def test_classify():
    feats = features.get_features()
    print feats
    good = 0
    wrong = 0
    spam_files = toolkit.get_files('spam/train')

    # A spam file counts as correctly classified when classify_wrap returns True.
    for sf in spam_files:
        if classify_wrap(sf, feats, 0):
            good += 1
        else:
            wrong += 1

    print "After SPAM: good: %d, wrong: %d" % (good, wrong)

    # A ham file counts as correctly classified when classify_wrap returns False.
    # The counters are not reset, so the last line reports the combined totals.
    ham_files = toolkit.get_files('ham/train')
    for hf in ham_files:
        if not classify_wrap(hf, feats, 0):
            good += 1
        else:
            wrong += 1
    print "Total: good: %d, wrong: %d" % (good, wrong)
Example No. 3
# Imports needed by this snippet (bayes, features and toolkit are project-local modules).
from collections import defaultdict
from time import gmtime, strftime
import operator

import bayes
import features
import toolkit


def validate_classification(roc_step=toolkit.NUM('0.001')):
    """Compute the ROC curves for ham and spam and write them to
    ham_roc.dat and spam_roc.dat."""
    true = dict()
    false = dict()
    true[bayes.HAM] = defaultdict(lambda: toolkit.NUM(0))
    true[bayes.SPAM] = defaultdict(lambda: toolkit.NUM(0))
    false[bayes.HAM] = defaultdict(lambda: toolkit.NUM(0))
    false[bayes.SPAM] = defaultdict(lambda: toolkit.NUM(0))
    ham_roc = list()
    spam_roc = list()

    # Compute best features.
    print "Computing the 300 most characteristic features"
    print "Therefore we compute the mutual information of each word with
    respect to the classification"
    print ""
    # features.best_features() returns (word, score) pairs sorted in descending
    # order; the scores are the negated mutual information (see Example No. 1),
    # so the tail of the list holds the 300 most informative words.
    best_features = features.best_features()[-300:]
    best_features = map(operator.itemgetter(0), best_features)

    ham_files = toolkit.get_files(bayes.HAM + bayes.TEST)
    spam_files = toolkit.get_files(bayes.SPAM + bayes.TEST)
    ham_count = len(ham_files)
    spam_count = len(spam_files)
    test_samples = zip(ham_files + spam_files, ham_count * [bayes.HAM] +
            spam_count * [bayes.SPAM])
    ham_count = toolkit.NUM(ham_count)
    spam_count = toolkit.NUM(spam_count)

    from math import log, ceil
    total_count = ham_count + spam_count
    # Width of the file counter, so the progress messages line up.
    total_digits = int(ceil(log(total_count, 10)))
    print_msg = "[%%s] Processing file %%0%dd of %d: %%s " % (total_digits, total_count)
    count = 0
    # For every test file, sweep the decision threshold over [0, 1] and tally
    # correct (true) and incorrect (false) classifications per threshold.
    for filename, clss in test_samples:
        count += 1
        print print_msg % (strftime("%H:%M:%S", gmtime()), count, filename)
        threshold = toolkit.ZERO
        while threshold <= toolkit.ONE:
            classification = bayes.classify(filename, best_features, threshold)
            if (classification == clss):
                true[classification][threshold] += toolkit.ONE
            else:
                false[classification][threshold] += toolkit.ONE
            threshold += roc_step
    # Convert the per-threshold counts into (false rate, true rate) ROC points.
    threshold = toolkit.ZERO
    while threshold <= toolkit.ONE:
        total_false = toolkit.NUM(false[bayes.HAM][threshold] +
                false[bayes.SPAM][threshold])
        total_true = toolkit.NUM(true[bayes.HAM][threshold] + true[bayes.SPAM][threshold])
        ham_roc.append((false[bayes.HAM][threshold] / total_false,
            true[bayes.HAM][threshold] / total_true))
        spam_roc.append((false[bayes.SPAM][threshold] / total_false,
            true[bayes.SPAM][threshold] / total_true))
        threshold += roc_step
    print len(ham_roc)
    hamfile = open('ham_roc.dat', 'w')
    spamfile = open('spam_roc.dat', 'w')
    for h_e, s_e in zip(ham_roc, spam_roc):
        hamfile.write(str(h_e[0]) + ' ' + str(h_e[1]) + '\n')
        hamfile.flush()
        spamfile.write(str(s_e[0]) + ' ' + str(s_e[1]) + '\n')
        spamfile.flush()
    hamfile.close()
    spamfile.close()
    return true, false, ham_roc, spam_roc
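
ham_roc.dat and spam_roc.dat end up holding one "false rate / true rate" pair per threshold step. A quick way to look at the two curves is the sketch below (it assumes matplotlib is available; gnuplot or any tool that reads two whitespace-separated columns works just as well):

    import matplotlib.pyplot as plt

    for name in ('ham_roc.dat', 'spam_roc.dat'):
        xs, ys = [], []
        with open(name) as datfile:
            for line in datfile:
                x, y = line.split()
                xs.append(float(x))
                ys.append(float(y))
        plt.plot(xs, ys, label=name)

    plt.xlabel('false rate')
    plt.ylabel('true rate')
    plt.legend()
    plt.show()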