def _log(message):
    """Print *message* prefixed with the current wall-clock time (12-hour HH:MM:SS)."""
    # A parenthesised single-argument print behaves identically as a
    # Python 2 print statement and as a Python 3 print() call.
    print(datetime.datetime.now().strftime("%I:%M:%S") + " : " + message)


def main():
    """Run the spam/ham classification experiment end to end.

    The function performs two passes over the same labelled data:

    1. A lambda sweep: word counts are built from the training data, then
       for every value in ``la_set`` the word probabilities are computed,
       the test data is classified and the findings are summarised.
    2. A feature-selection pass: counts and probabilities are recomputed
       with ``la = 2.0``, words are ranked by ``compute_mi``, the first 200
       entries are kept (plus the ``"*"`` entry of ``word_probs``), and the
       test data is classified using only those word probabilities.

    Progress messages are written to standard output with a timestamp.
    Raises ``ZeroDivisionError`` if the spam and ham counts are both zero.
    """
    # train_data = {
    #     'test/spam1' : 'spam',
    #     'test/spam2' : 'spam',
    #     'test/spam3' : 'spam',
    #     'test/spam4' : 'spam',
    #     'test/ham1' : 'ham',
    #     'test/ham2' : 'ham',
    #     'test/ham3' : 'ham',
    #     'test/ham4' : 'ham',
    # }

    # Setup
    train_data, test_data = parse_labels()
    stats(test_data)
    # NOTE(review): the parsed test set is replaced by a single hard-coded
    # entry right after its stats are reported. This looks like a debugging
    # override -- confirm it is intended before trusting the results below.
    test_data = {
        'test/test0': 'spam',
    }

    _log(" Generating Counts")
    word_dict, spam_count, ham_count = create_word_counts(train_data)
    _log(" Done.")

    # The priors depend only on the document counts, not on lambda, so they
    # are computed once instead of on every iteration of the sweep.
    # float() keeps this a true division under Python 2 as well.
    doc_total = spam_count + ham_count
    spam_prior_prob = float(spam_count) / doc_total
    ham_prior_prob = float(ham_count) / doc_total

    # Iterate through possible values of lambda for lambda smoothing.
    # la_set = [0.005, 0.1, 0.5, 1.0, 2.0]
    la_set = [0]
    for la in la_set:
        _log(" Computing Probs for lambda={0}".format(la))
        word_probs = compute_probs(word_dict, spam_count, ham_count, la)
        _log(" Done")
        TP, FP, TN, FN, problems = classify(
            test_data, word_probs, spam_prior_prob, ham_prior_prob)
        summarize_findings(la, TP, FP, TN, FN, problems)

    # NOTE(review): from here on the helpers are used with different shapes
    # than above: create_word_counts is unpacked into four values (three
    # above), classify into four (five above), and summarize_findings takes
    # four arguments (six above). Both forms cannot match the same helper
    # definitions; this looks like two revisions of the script merged
    # together -- confirm which one is current and drop the other.
    _log(" Generating Counts")
    # The total document count is returned but not needed in this pass.
    word_dict, _total_doc_count, spam_doc_count, ham_doc_count = \
        create_word_counts(train_data)
    _log(" Done.")

    la = 2.0
    _log(" Computing Probs for lambda={0}".format(la))
    word_probs = compute_probs(word_dict, spam_doc_count, ham_doc_count, la)

    labelled_docs = spam_doc_count + ham_doc_count
    spam_prior_prob = float(spam_doc_count) / labelled_docs
    ham_prior_prob = float(ham_doc_count) / labelled_docs
    _log(" Spam Prior Prob : {0}".format(spam_prior_prob))
    _log(" Ham Prior Prob : {0}".format(ham_prior_prob))

    # Keep only the first 200 (word, score) pairs returned by compute_mi.
    best_mi = compute_mi(word_probs, spam_prior_prob, ham_prior_prob)
    top_200 = best_mi[0:200]
    print([word[0] for word in top_200])

    # Restrict the probability table to the selected words; the "*" entry of
    # word_probs is always carried over alongside them.
    mi_word_probs = {word: word_probs[word] for word, _score in top_200}
    mi_word_probs["*"] = word_probs["*"]

    TP, FP, TN, FN = classify(
        test_data, mi_word_probs, spam_prior_prob, ham_prior_prob)
    summarize_findings(TP, FP, TN, FN)
    print("Done.")


if __name__ == "__main__":
    main()