Beispiel #1
0
def main():
    """Train on the labelled corpus and evaluate it over several lambdas.

    Parses the labels into train/test splits, builds word counts from the
    training split and prints corpus statistics. Then, for every smoothing
    value in ``la_set``, computes word probabilities, classifies the test
    split and passes the four resulting counts to ``summarize_findings``.
    """
    def log(message):
        # Progress line prefixed with the current time (12-hour clock, no AM/PM).
        print(datetime.datetime.now().strftime("%I:%M:%S") + " : " + message)

    # Setup: parse trec06p-cs280/labels into two dictionaries, train_data and
    # test_data. Each is {file: spam_or_ham}, where file is a path string such
    # as 'data/001/001' and spam_or_ham is the label, e.g. 'spam' or 'ham'.
    train_data, test_data = parse_labels()
    print("There are %d documents for training, and %d documents for testing" % (len(train_data), len(test_data)))

    log(" Generating Counts")
    # Create a word_dict (i.e. {word: (spam count, ham count)}) and get the
    # total number of documents, of spam documents and of ham documents.
    word_dict, total_doc_count, spam_docs_count, ham_docs_count = create_word_counts(train_data)
    log(" Done.")
    print("There are a total of %d documents of which %d are spam and %d are ham." % (total_doc_count, spam_docs_count, ham_docs_count))

    # The class priors depend only on the training counts, not on lambda, so
    # they are computed once here and reused for every smoothing value below.
    spam_prior_prob = spam_docs_count / float(total_doc_count)
    ham_prior_prob = ham_docs_count / float(total_doc_count)
    print("Prior probability for spam is: %f" % spam_prior_prob)
    print("Prior probability for ham is: %f" % ham_prior_prob)
    print("The vocabulary extracted from training totals %d words" % len(word_dict))

    # Split that gets classified in the loop below.
    data_set = test_data

    # Iterate through possible values of lambda for lambda smoothing.
    la_set = [0.00, 0.005, 0.1, 0.5, 1.0, 2.0]
    for la in la_set:
        log(" Computing Probs for lambda={0}".format(la))
        word_probs = compute_probs(word_dict, spam_docs_count, ham_docs_count, la)
        log(" Done")

        # TP/FP/TN/FN are presumably true/false positive/negative counts;
        # they are forwarded unchanged to the summary routine.
        TP, FP, TN, FN = classify(data_set, word_probs, spam_prior_prob, ham_prior_prob)
        summarize_findings(TP, FP, TN, FN)
Beispiel #2
0
def main():
    """Print the 200 leading entries of the mutual-information ranking.

    Builds word counts from the training split, computes word probabilities
    with a fixed smoothing value of 2.0, reports the class priors, and then
    prints the first element of each of the first 200 items returned by
    ``compute_mi`` (presumably the words, best first -- confirm against
    ``compute_mi``).
    """
    def stamp():
        # Current time formatted for the progress prefix (12-hour clock).
        return datetime.datetime.now().strftime("%I:%M:%S")

    # Only the training split is used here; the test split is ignored.
    train_data, _test_data = parse_labels()

    print(stamp() + " : " + " Generating Counts")
    counts = create_word_counts(train_data)
    word_dict, _unused_total, spam_doc_count, ham_doc_count = counts
    print(stamp() + " : " + " Done.")

    # Fixed smoothing parameter for this run.
    la = 2.000

    print(stamp() + " : " + " Computing Probs for lambda={0}".format(la))
    word_probs = compute_probs(word_dict, spam_doc_count, ham_doc_count, la)

    # Priors are taken over spam + ham documents rather than the reported total.
    labelled_docs = spam_doc_count + ham_doc_count
    spam_prior_prob = float(spam_doc_count) / labelled_docs
    ham_prior_prob = float(ham_doc_count) / labelled_docs
    print(stamp() + " : " + " Spam Prior Prob : {0}".format(spam_prior_prob))
    print(stamp() + " : " + " Ham Prior Prob : {0}".format(ham_prior_prob))

    ranking = compute_mi(word_probs, spam_prior_prob, ham_prior_prob)
    print([entry[0] for entry in ranking[:200]])
Beispiel #3
0
def main():
    """Classify a single hard-coded document with an unsmoothed model.

    Parses the labels, reports statistics on the parsed test split, then
    replaces that split with one fixed entry. Word counts come from the
    training split; for every value in ``la_set`` the word probabilities
    are computed, the test entry is classified and the outcome is handed
    to ``summarize_findings``.
    """
    def log(message):
        # Progress line prefixed with the current time (12-hour clock, no AM/PM).
        print(datetime.datetime.now().strftime("%I:%M:%S") + " : " + message)

    # Alternative toy training set kept from the original (presumably for
    # debugging without the full corpus):
    # train_data = {
    #     'test/spam1' : 'spam',
    #     'test/spam2' : 'spam',
    #     'test/spam3' : 'spam',
    #     'test/spam4' : 'spam',
    #     'test/ham1' : 'ham',
    #     'test/ham2' : 'ham',
    #     'test/ham3' : 'ham',
    #     'test/ham4' : 'ham',
    # }

    # Setup
    train_data, test_data = parse_labels()
    stats(test_data)

    # NOTE(review): the parsed test split is discarded here in favour of a
    # single hard-coded document -- looks like a debugging leftover; confirm
    # before trusting the reported metrics.
    test_data = {
        'test/test0': 'spam',
    }

    log(" Generating Counts")
    word_dict, spam_count, ham_count = create_word_counts(train_data)
    log(" Done.")

    # Iterate through possible values of lambda for lambda smoothing.
    # Full sweep: la_set = [0.005, 0.1, 0.5, 1.0, 2.0]
    la_set = [0]

    # The class priors do not depend on lambda, so compute them once
    # instead of on every loop iteration.
    labelled_docs = spam_count + ham_count
    spam_prior_prob = float(spam_count) / labelled_docs
    ham_prior_prob = float(ham_count) / labelled_docs

    for la in la_set:
        log(" Computing Probs for lambda={0}".format(la))
        word_probs = compute_probs(word_dict, spam_count, ham_count, la)
        log(" Done")

        # The classification results are forwarded unchanged, together with
        # the lambda that produced them.
        TP, FP, TN, FN, problems = classify(test_data, word_probs, spam_prior_prob, ham_prior_prob)
        summarize_findings(la, TP, FP, TN, FN, problems)