def best_features():
    """Score every word of the training corpora and return them ranked.

    Each word found by ``toolkit.word_bag`` in the ham or spam training
    files is wrapped in slashes (``'/word/'``) and handed to
    ``toolkit.countre`` with a smoothing of ``toolkit.ONE``; the first
    element of that result is used as the per-class value P(w | c) for the
    words of that class.  A word never seen in a class gets
    ``toolkit.ZERO`` for that class.

    The score of a word is the sum, over both classes and over
    presence/absence of the word, of::

        P(x | c) * P(c) * log(P(x) / P(x | c))

    where ``P(x) = P(x | ham) * PRIOR_HAM + P(x | spam) * PRIOR_SPAM``.
    Whenever the logarithm or the division fails, ``log(toolkit.MAX)`` is
    substituted for the log term.

    NOTE(review): the ratio inside the logarithm is marginal / conditional,
    i.e. the reciprocal of the one in the textbook mutual-information
    formula, so the most informative words appear to end up at the *tail*
    of the returned list -- confirm before changing callers or the formula.

    Returns:
        A list of ``(pattern, score)`` pairs sorted by score, highest first.
    """
    ham_dir = bayes.HAM + bayes.TRAIN
    spam_dir = bayes.SPAM + bayes.TRAIN

    def vocabulary(directory):
        # Union of the word bags of every file, each word wrapped in slashes.
        bags = map(toolkit.word_bag, toolkit.get_files(directory))
        return ['/' + token + '/' for token in set.union(*bags)]

    def likelihoods(directory, patterns):
        # p(w_i | c); patterns unknown to this class fall back to ZERO.
        values = toolkit.countre(directory, patterns,
                                 smoothing=toolkit.ONE)[0]
        return defaultdict(lambda: toolkit.ZERO, zip(patterns, values))

    ham_patterns = vocabulary(ham_dir)
    spam_patterns = vocabulary(spam_dir)
    every_pattern = set(ham_patterns + spam_patterns)

    given_ham_table = likelihoods(ham_dir, ham_patterns)
    given_spam_table = likelihoods(spam_dir, spam_patterns)

    fallback_log = math.log(toolkit.MAX)

    def log_ratio(numerator, denominator):
        # Any arithmetic failure (zero denominator, log of zero, ...) is
        # replaced by the fallback value instead of propagating.
        try:
            return math.log(numerator / denominator)
        except Exception:
            return fallback_log

    scores = dict()
    for pattern in every_pattern:
        present_ham = given_ham_table[pattern]
        absent_ham = toolkit.ONE - present_ham
        present_spam = given_spam_table[pattern]
        absent_spam = toolkit.ONE - present_spam

        present = (present_ham * toolkit.PRIOR_HAM
                   + present_spam * toolkit.PRIOR_SPAM)
        absent = toolkit.ONE - present

        # Summed left to right in the same order as the four class/presence
        # combinations: ham-present, ham-absent, spam-present, spam-absent.
        scores[pattern] = (
            present_ham * toolkit.PRIOR_HAM * log_ratio(present, present_ham)
            + absent_ham * toolkit.PRIOR_HAM * log_ratio(absent, absent_ham)
            + present_spam * toolkit.PRIOR_SPAM * log_ratio(present, present_spam)
            + absent_spam * toolkit.PRIOR_SPAM * log_ratio(absent, absent_spam)
        )

    ranking = list(scores.items())
    ranking.sort(key=operator.itemgetter(1), reverse=True)
    return ranking
def test_classify():
    """Print how well ``classify_wrap`` labels the training mails.

    The feature set comes from ``features.get_features()`` and is printed
    first.  Every file under ``'spam/train'`` counts as "good" when
    ``classify_wrap`` returns a truthy value for it, every file under
    ``'ham/train'`` counts as "good" when the result is falsy.  A running
    total is printed after the spam files and again after the ham files.

    The third argument passed to ``classify_wrap`` is always ``0``
    (presumably a threshold -- confirm against ``classify_wrap``).
    """
    feats = features.get_features()
    print(feats)

    # tally[True] -> correctly classified, tally[False] -> misclassified
    tally = {True: 0, False: 0}

    spam_paths = toolkit.get_files('spam/train')
    for path in spam_paths:
        tally[bool(classify_wrap(path, feats, 0))] += 1
    print("After SPAM: good: %d, wrong: %d" % (tally[True], tally[False]))

    ham_paths = toolkit.get_files('ham/train')
    for path in ham_paths:
        tally[not classify_wrap(path, feats, 0)] += 1
    print("good: %d, wrong: %d" % (tally[True], tally[False]))
def validate_classification(roc_step=toolkit.NUM('0.001')):
    """Sweep the decision threshold over the test set and record ROC data.

    The last 300 entries of ``features.best_features()`` are used as the
    feature set.  Every ham and spam test file is classified with
    ``bayes.classify(filename, features, threshold)`` for each threshold
    from ``toolkit.ZERO`` up to and including ``toolkit.ONE`` in increments
    of ``roc_step``.  A result equal to the file's real class is counted in
    ``true[predicted][threshold]``, any other result in
    ``false[predicted][threshold]``.

    For every threshold one point per class is appended to the curves::

        (false[c][t] / (false[ham][t] + false[spam][t]),
         true[c][t]  / (true[ham][t]  + true[spam][t]))

    A ratio whose denominator is zero (no false, or no true, classification
    at that threshold) is recorded as ``toolkit.ZERO`` instead of raising.

    NOTE(review): the ratios are normalised by the total number of false /
    true classifications, not by the number of ham / spam samples as in the
    usual FPR/TPR definition -- confirm this is intended.

    Side effects: prints progress to stdout and (over)writes
    ``ham_roc.dat`` and ``spam_roc.dat`` in the current directory, one
    ``"<x> <y>"`` line per threshold.

    Args:
        roc_step: positive increment between two consecutive thresholds.

    Returns:
        The tuple ``(true, false, ham_roc, spam_roc)``.

    Raises:
        ValueError: if ``roc_step`` is not greater than ``toolkit.ZERO``
            (the sweep would otherwise never terminate).
    """
    if not roc_step > toolkit.ZERO:
        raise ValueError("roc_step must be positive, got %r" % (roc_step,))

    def ratio(part, whole):
        # 0/0 happens when no sample fell into this outcome at a threshold.
        if whole == toolkit.ZERO:
            return toolkit.ZERO
        return part / whole

    true = dict()
    false = dict()
    for label in (bayes.HAM, bayes.SPAM):
        true[label] = defaultdict(lambda: toolkit.NUM(0))
        false[label] = defaultdict(lambda: toolkit.NUM(0))
    ham_roc = list()
    spam_roc = list()

    # Compute best features.
    print("Computing the 300 most characteristic features")
    print("Therefore we compute the mutual information of each word with respect to the classification")
    print("")
    # A real list (not a lazy iterator) because it is reused for every
    # single classify call below.
    best_features = [entry[0] for entry in features.best_features()[-300:]]

    ham_files = toolkit.get_files(bayes.HAM + bayes.TEST)
    spam_files = toolkit.get_files(bayes.SPAM + bayes.TEST)
    ham_count = len(ham_files)
    spam_count = len(spam_files)
    test_samples = zip(ham_files + spam_files,
                       ham_count * [bayes.HAM] + spam_count * [bayes.SPAM])

    # Width of the zero-padded file counter.  Counting the digits of the
    # total is exact for powers of ten and also works for an empty test set.
    total_count = ham_count + spam_count
    total_digits = len(str(total_count))
    print_msg = "[%%s] Processing file %%0%dd of %d: %%s " % (total_digits, total_count)

    # Build the threshold grid once; the very same key objects are used for
    # counting and for reading the counts back.
    thresholds = []
    threshold = toolkit.ZERO
    while threshold <= toolkit.ONE:
        thresholds.append(threshold)
        threshold += roc_step

    count = 0
    for filename, clss in test_samples:
        count += 1
        print(print_msg % (strftime("%H:%M:%S", gmtime()), count, filename))
        for threshold in thresholds:
            classification = bayes.classify(filename, best_features, threshold)
            if classification == clss:
                true[classification][threshold] += toolkit.ONE
            else:
                false[classification][threshold] += toolkit.ONE

    for threshold in thresholds:
        false_ham = false[bayes.HAM][threshold]
        false_spam = false[bayes.SPAM][threshold]
        true_ham = true[bayes.HAM][threshold]
        true_spam = true[bayes.SPAM][threshold]
        total_false = toolkit.NUM(false_ham + false_spam)
        total_true = toolkit.NUM(true_ham + true_spam)
        ham_roc.append((ratio(false_ham, total_false),
                        ratio(true_ham, total_true)))
        spam_roc.append((ratio(false_spam, total_false),
                         ratio(true_spam, total_true)))

    print(len(ham_roc))

    # Context managers guarantee both files are closed even if a write fails.
    with open('ham_roc.dat', 'w') as hamfile:
        with open('spam_roc.dat', 'w') as spamfile:
            for h_e, s_e in zip(ham_roc, spam_roc):
                hamfile.write(str(h_e[0]) + ' ' + str(h_e[1]) + '\n')
                spamfile.write(str(s_e[0]) + ' ' + str(s_e[1]) + '\n')

    return true, false, ham_roc, spam_roc