Ejemplo n.º 1
0
def part2():
    """Fit both likelihood models on the training emails and save them.

    Training emails whose ``label`` equals 0 form the regular class; every
    other email goes to the spam class. ``multinomial`` and ``bernouilli``
    are then called on each class, and each class's ``m_likelihood`` and
    ``b_likelihood`` attributes are written to four text files using
    relative paths. Finally the testing emails are loaded.
    """
    regular_emails = []
    spam_emails = []
    for email in readTrainingEmails():
        if email.label == 0:
            regular_emails.append(email)
        else:
            spam_emails.append(email)

    # Index 0 is the regular class, index 1 the spam class.
    regular_class = emailClass(regular_emails)
    spam_class = emailClass(spam_emails)
    emails_classes = [regular_class, spam_class]

    for email_class in emails_classes:
        multinomial(email_class)
    for email_class in emails_classes:
        bernouilli(email_class)

    likelihood_dumps = [
        ("multinomial_regular.txt", regular_class.m_likelihood),
        ("multinomial_spam.txt", spam_class.m_likelihood),
        ("bernouilli_regular.txt", regular_class.b_likelihood),
        ("bernouilli_spam.txt", spam_class.b_likelihood),
    ]
    for path, likelihood in likelihood_dumps:
        # ``with`` guarantees the handle is closed even if the write fails.
        with open(path, "w") as handle:
            handle.write("%s\n" % (likelihood,))

    # Loaded but not used any further in this function.
    actual_labels, testing_emails = readTestingEmails()
Ejemplo n.º 2
0
def main():
    """Train and evaluate both naive Bayes models on the review data set.

    Steps, in order:

    1. Split the training reviews by ``label``: 1 goes to the positive
       class, everything else to the negative class.
    2. Call ``multinomial`` and ``bernouilli`` on each class and write each
       class's ``m_likelihood`` / ``b_likelihood`` to a text file.
    3. Classify every testing review with ``multinomial_classifier`` and
       ``bernouilli_classifier``, collecting the MAP estimate and label
       that each call returns.
    4. Print one confusion matrix per model (row = actual class, column =
       predicted class, values are percentages of the row).
    5. Print likelihood rankings and the error rate of each model.
    6. Write the MAP estimates, predicted labels and actual labels to text
       files.

    The classifiers' labels are used as 0/1 matrix indices, while the
    actual labels are expected to be 1 / -1; the conversions below map
    +1 <-> 0 and -1 <-> 1.
    """
    negative_reviews = []
    positive_reviews = []
    for review in readTrainingReviews():
        if review.label == 1:
            positive_reviews.append(review)
        else:
            negative_reviews.append(review)

    # Index 0 is the negative class, index 1 the positive class.
    negative_class = emailClass(negative_reviews)
    positive_class = emailClass(positive_reviews)
    emails_classes = [negative_class, positive_class]

    for review_class in emails_classes:
        multinomial(review_class)
    for review_class in emails_classes:
        bernouilli(review_class)

    likelihood_dumps = [
        ("multinomial_negative.txt", negative_class.m_likelihood),
        ("multinomial_positive.txt", positive_class.m_likelihood),
        ("bernouilli_negative.txt", negative_class.b_likelihood),
        ("bernouilli_positive.txt", positive_class.b_likelihood),
    ]
    for path, likelihood in likelihood_dumps:
        # ``with`` guarantees the handle is closed even if the write fails.
        with open(path, "w") as handle:
            handle.write("%s\n" % (likelihood,))

    actual_labels, testing_emails = readTestingReviews()

    hypothetical_m_classifier = []
    hypothetical_b_classifier = []
    map_m_estimate = []
    map_b_estimate = []
    for review in testing_emails:
        map_estimate, label = multinomial_classifier(
            review, negative_class, positive_class)
        map_estimate2, label2 = bernouilli_classifier(
            review, negative_class, positive_class)
        hypothetical_m_classifier.append(label)
        hypothetical_b_classifier.append(label2)
        map_m_estimate.append(map_estimate)
        map_b_estimate.append(map_estimate2)

    # Convert the 1 / -1 actual labels to matrix indices (+1 -> 0, -1 -> 1).
    # A new list is built so ``actual_labels`` itself is never modified.
    temp_actual = [
        0 if actual == 1 else (1 if actual == -1 else actual)
        for actual in actual_labels
    ]

    confusionMatrixM = zeros((2, 2))
    confusionMatrixB = zeros((2, 2))
    for actual, predicted_m, predicted_b in zip(
            temp_actual, hypothetical_m_classifier,
            hypothetical_b_classifier):
        confusionMatrixM[actual][predicted_m] += 1
        confusionMatrixB[actual][predicted_b] += 1

    # Rows are indexed by the actual class, so each row is normalised by the
    # number of test items that really belong to that class. Non-empty rows
    # then sum to 100; an empty row stays all-zero instead of dividing by 0.
    for matrix in (confusionMatrixM, confusionMatrixB):
        for row in range(2):
            row_total = matrix[row].sum()
            if row_total:
                matrix[row] = matrix[row] * 100.0 / row_total

    print(confusionMatrixM)
    print(confusionMatrixB)

    # Convert predicted indices back to review labels (1 -> -1, 0 -> +1) so
    # they can be compared with ``actual_labels``.
    hypothetical_m_classifier = [
        -1 if label == 1 else (1 if label == 0 else label)
        for label in hypothetical_m_classifier
    ]
    hypothetical_b_classifier = [
        -1 if label == 1 else (1 if label == 0 else label)
        for label in hypothetical_b_classifier
    ]

    # A non-zero difference marks a misclassified review.
    error = array(actual_labels) - array(hypothetical_m_classifier)
    error_b = array(actual_labels) - array(hypothetical_b_classifier)
    error_value = float(count_nonzero(error)) / len(testing_emails)
    error_b_value = float(count_nonzero(error_b)) / len(testing_emails)

    by_value = operator.itemgetter(1)
    sorted_m_positive = sorted(positive_class.m_likelihood.items(),
                               key=by_value, reverse=True)
    sorted_b_negative = sorted(negative_class.b_likelihood.items(),
                               key=by_value, reverse=True)
    sorted_b_positive = sorted(positive_class.b_likelihood.items(),
                               key=by_value, reverse=True)

    # Slicing prints at most 20 entries and copes with smaller vocabularies.
    # NOTE(review): there is no multinomial ranking for the negative class,
    # while the Bernoulli model prints both classes - confirm this is wanted.
    for word, value in sorted_m_positive[:20]:
        print("Word:  %s  Value:  %s" % (word, value))

    print("Booyah")
    for word, value in sorted_b_negative[:20]:
        print("Word:  %s  Value:  %s" % (word, value))
    print("Huzzah!")
    for word, value in sorted_b_positive[:20]:
        print("Word:  %s  Value:  %s" % (word, value))

    print("The multinomial error is  %s" % error_value)

    print("The bernouilli error is  %s" % error_b_value)

    result_dumps = [
        ("map_m_estimate.txt", map_m_estimate),
        ("map_b_estimate.txt", map_b_estimate),
        ("hypothetical_m_classifier.txt", hypothetical_m_classifier),
        ("hypothetical_b_classifier.txt", hypothetical_b_classifier),
        ("actual_labels.txt", actual_labels),
    ]
    for path, values in result_dumps:
        with open(path, "w") as handle:
            handle.write("%s\n" % (values,))
Ejemplo n.º 3
0
def part2():
    """Train and evaluate both naive Bayes models on the email data set.

    Steps, in order:

    1. Split the training emails by ``label``: 0 goes to the regular class,
       everything else to the spam class.
    2. Call ``multinomial`` and ``bernouilli`` on each class and write each
       class's ``m_likelihood`` / ``b_likelihood`` to a text file.
    3. Classify every testing email with ``multinomial_classifier`` and
       ``bernouilli_classifier``, collecting the MAP estimate and label
       that each call returns.
    4. Print one confusion matrix per model (row = actual label, column =
       predicted label, values are percentages of the row).
    5. Print likelihood rankings and the error rate of each model.
    6. Write the MAP estimates, predicted labels and actual labels to text
       files.

    Both the actual and the predicted labels are used directly as 0/1
    matrix indices.
    """
    regular_emails = []
    spam_emails = []
    for email in readTrainingEmails():
        if email.label == 0:
            regular_emails.append(email)
        else:
            spam_emails.append(email)

    # Index 0 is the regular class, index 1 the spam class.
    regular_class = emailClass(regular_emails)
    spam_class = emailClass(spam_emails)
    emails_classes = [regular_class, spam_class]

    for email_class in emails_classes:
        multinomial(email_class)
    for email_class in emails_classes:
        bernouilli(email_class)

    likelihood_dumps = [
        ("multinomial_regular.txt", regular_class.m_likelihood),
        ("multinomial_spam.txt", spam_class.m_likelihood),
        ("bernouilli_regular.txt", regular_class.b_likelihood),
        ("bernouilli_spam.txt", spam_class.b_likelihood),
    ]
    for path, likelihood in likelihood_dumps:
        # ``with`` guarantees the handle is closed even if the write fails.
        with open(path, "w") as handle:
            handle.write("%s\n" % (likelihood,))

    actual_labels, testing_emails = readTestingEmails()

    hypothetical_m_classifier = []
    hypothetical_b_classifier = []
    map_m_estimate = []
    map_b_estimate = []
    for email in testing_emails:
        # The spam class is passed first, the regular class second.
        map_estimate, label = multinomial_classifier(
            email, spam_class, regular_class)
        map_estimate2, label2 = bernouilli_classifier(
            email, spam_class, regular_class)
        hypothetical_m_classifier.append(label)
        hypothetical_b_classifier.append(label2)
        map_m_estimate.append(map_estimate)
        map_b_estimate.append(map_estimate2)

    confusionMatrixM = zeros((2, 2))
    confusionMatrixB = zeros((2, 2))
    for actual, predicted_m, predicted_b in zip(
            actual_labels, hypothetical_m_classifier,
            hypothetical_b_classifier):
        confusionMatrixM[actual][predicted_m] += 1
        confusionMatrixB[actual][predicted_b] += 1

    # Rows are indexed by the actual label, so each row is normalised by the
    # number of test items that really carry that label. Non-empty rows then
    # sum to 100; an empty row stays all-zero instead of dividing by 0.
    for matrix in (confusionMatrixM, confusionMatrixB):
        for row in range(2):
            row_total = matrix[row].sum()
            if row_total:
                matrix[row] = matrix[row] * 100.0 / row_total

    print(confusionMatrixM)
    print(confusionMatrixB)

    # A non-zero difference marks a misclassified email.
    error = array(actual_labels) - array(hypothetical_m_classifier)
    error_b = array(actual_labels) - array(hypothetical_b_classifier)
    error_value = float(count_nonzero(error)) / len(testing_emails)
    error_b_value = float(count_nonzero(error_b)) / len(testing_emails)

    by_value = operator.itemgetter(1)
    sorted_m_reg = sorted(regular_class.m_likelihood.items(),
                          key=by_value, reverse=True)
    sorted_m_spam = sorted(spam_class.m_likelihood.items(),
                           key=by_value, reverse=True)
    sorted_b_reg = sorted(regular_class.b_likelihood.items(),
                          key=by_value, reverse=True)
    sorted_b_spam = sorted(spam_class.b_likelihood.items(),
                           key=by_value, reverse=True)

    # Slicing prints at most the requested entries and copes with
    # vocabularies smaller than 20 words.
    # NOTE(review): this ranking starts at index 1, so the single most likely
    # word is not shown, unlike the three rankings below - confirm whether
    # skipping it is intended.
    for word, value in sorted_m_reg[1:20]:
        print("Word:  %s  Value:  %s" % (word, value))

    print("WAAAHOOO")
    for word, value in sorted_m_spam[:20]:
        print("Word:  %s  Value:  %s" % (word, value))

    print("Booyah")
    for word, value in sorted_b_reg[:20]:
        print("Word:  %s  Value:  %s" % (word, value))
    print("Huzzah!")
    for word, value in sorted_b_spam[:20]:
        print("Word:  %s  Value:  %s" % (word, value))

    print("The multinomial error is  %s" % error_value)

    print("The bernouilli error is  %s" % error_b_value)

    result_dumps = [
        ("map_m_estimate.txt", map_m_estimate),
        ("map_b_estimate.txt", map_b_estimate),
        ("hypothetical_m_classifier.txt", hypothetical_m_classifier),
        ("hypothetical_b_classifier.txt", hypothetical_b_classifier),
        ("actual_labels.txt", actual_labels),
    ]
    for path, values in result_dumps:
        with open(path, "w") as handle:
            handle.write("%s\n" % (values,))
Ejemplo n.º 4
0
def main():
    """Train and evaluate both naive Bayes models on the review data set.

    Steps, in order:

    1. Split the training reviews by ``label``: 1 goes to the positive
       class, everything else to the negative class.
    2. Call ``multinomial`` and ``bernouilli`` on each class and write each
       class's ``m_likelihood`` / ``b_likelihood`` to a text file.
    3. Classify every testing review with ``multinomial_classifier`` and
       ``bernouilli_classifier``, collecting the MAP estimate and label
       that each call returns.
    4. Print one confusion matrix per model (row = actual class, column =
       predicted class, values are percentages of the row).
    5. Print likelihood rankings and the error rate of each model.
    6. Write the MAP estimates, predicted labels and actual labels to text
       files.

    The classifiers' labels are used as 0/1 matrix indices, while the
    actual labels are expected to be 1 / -1; the conversions below map
    +1 <-> 0 and -1 <-> 1.
    """
    negative_reviews = []
    positive_reviews = []
    for review in readTrainingReviews():
        if review.label == 1:
            positive_reviews.append(review)
        else:
            negative_reviews.append(review)

    # Index 0 is the negative class, index 1 the positive class.
    negative_class = emailClass(negative_reviews)
    positive_class = emailClass(positive_reviews)
    emails_classes = [negative_class, positive_class]

    for review_class in emails_classes:
        multinomial(review_class)
    for review_class in emails_classes:
        bernouilli(review_class)

    likelihood_dumps = [
        ("multinomial_negative.txt", negative_class.m_likelihood),
        ("multinomial_positive.txt", positive_class.m_likelihood),
        ("bernouilli_negative.txt", negative_class.b_likelihood),
        ("bernouilli_positive.txt", positive_class.b_likelihood),
    ]
    for path, likelihood in likelihood_dumps:
        # ``with`` guarantees the handle is closed even if the write fails.
        with open(path, "w") as handle:
            handle.write("%s\n" % (likelihood,))

    actual_labels, testing_emails = readTestingReviews()

    hypothetical_m_classifier = []
    hypothetical_b_classifier = []
    map_m_estimate = []
    map_b_estimate = []
    for review in testing_emails:
        map_estimate, label = multinomial_classifier(
            review, negative_class, positive_class)
        map_estimate2, label2 = bernouilli_classifier(
            review, negative_class, positive_class)
        hypothetical_m_classifier.append(label)
        hypothetical_b_classifier.append(label2)
        map_m_estimate.append(map_estimate)
        map_b_estimate.append(map_estimate2)

    # Convert the 1 / -1 actual labels to matrix indices (+1 -> 0, -1 -> 1).
    # A new list is built so ``actual_labels`` itself is never modified.
    temp_actual = [
        0 if actual == 1 else (1 if actual == -1 else actual)
        for actual in actual_labels
    ]

    confusionMatrixM = zeros((2, 2))
    confusionMatrixB = zeros((2, 2))
    for actual, predicted_m, predicted_b in zip(
            temp_actual, hypothetical_m_classifier,
            hypothetical_b_classifier):
        confusionMatrixM[actual][predicted_m] += 1
        confusionMatrixB[actual][predicted_b] += 1

    # Rows are indexed by the actual class, so each row is normalised by the
    # number of test items that really belong to that class. Non-empty rows
    # then sum to 100; an empty row stays all-zero instead of dividing by 0.
    for matrix in (confusionMatrixM, confusionMatrixB):
        for row in range(2):
            row_total = matrix[row].sum()
            if row_total:
                matrix[row] = matrix[row] * 100.0 / row_total

    print(confusionMatrixM)
    print(confusionMatrixB)

    # Convert predicted indices back to review labels (1 -> -1, 0 -> +1) so
    # they can be compared with ``actual_labels``.
    hypothetical_m_classifier = [
        -1 if label == 1 else (1 if label == 0 else label)
        for label in hypothetical_m_classifier
    ]
    hypothetical_b_classifier = [
        -1 if label == 1 else (1 if label == 0 else label)
        for label in hypothetical_b_classifier
    ]

    # A non-zero difference marks a misclassified review.
    error = array(actual_labels) - array(hypothetical_m_classifier)
    error_b = array(actual_labels) - array(hypothetical_b_classifier)
    error_value = float(count_nonzero(error)) / len(testing_emails)
    error_b_value = float(count_nonzero(error_b)) / len(testing_emails)

    by_value = operator.itemgetter(1)
    sorted_m_positive = sorted(positive_class.m_likelihood.items(),
                               key=by_value, reverse=True)
    sorted_b_negative = sorted(negative_class.b_likelihood.items(),
                               key=by_value, reverse=True)
    sorted_b_positive = sorted(positive_class.b_likelihood.items(),
                               key=by_value, reverse=True)

    # Slicing prints at most 20 entries and copes with smaller vocabularies.
    # NOTE(review): there is no multinomial ranking for the negative class,
    # while the Bernoulli model prints both classes - confirm this is wanted.
    for word, value in sorted_m_positive[:20]:
        print("Word:  %s  Value:  %s" % (word, value))

    print("Booyah")
    for word, value in sorted_b_negative[:20]:
        print("Word:  %s  Value:  %s" % (word, value))
    print("Huzzah!")
    for word, value in sorted_b_positive[:20]:
        print("Word:  %s  Value:  %s" % (word, value))

    print("The multinomial error is  %s" % error_value)

    print("The bernouilli error is  %s" % error_b_value)

    result_dumps = [
        ("map_m_estimate.txt", map_m_estimate),
        ("map_b_estimate.txt", map_b_estimate),
        ("hypothetical_m_classifier.txt", hypothetical_m_classifier),
        ("hypothetical_b_classifier.txt", hypothetical_b_classifier),
        ("actual_labels.txt", actual_labels),
    ]
    for path, values in result_dumps:
        with open(path, "w") as handle:
            handle.write("%s\n" % (values,))
Ejemplo n.º 5
0
def part2():
    """Train and evaluate both naive Bayes models on the email data set.

    Steps, in order:

    1. Split the training emails by ``label``: 0 goes to the regular class,
       everything else to the spam class.
    2. Call ``multinomial`` and ``bernouilli`` on each class and write each
       class's ``m_likelihood`` / ``b_likelihood`` to a text file.
    3. Classify every testing email with ``multinomial_classifier`` and
       ``bernouilli_classifier``, collecting the MAP estimate and label
       that each call returns.
    4. Print one confusion matrix per model (row = actual label, column =
       predicted label, values are percentages of the row).
    5. Print likelihood rankings and the error rate of each model.
    6. Write the MAP estimates, predicted labels and actual labels to text
       files.

    Both the actual and the predicted labels are used directly as 0/1
    matrix indices.
    """
    regular_emails = []
    spam_emails = []
    for email in readTrainingEmails():
        if email.label == 0:
            regular_emails.append(email)
        else:
            spam_emails.append(email)

    # Index 0 is the regular class, index 1 the spam class.
    regular_class = emailClass(regular_emails)
    spam_class = emailClass(spam_emails)
    emails_classes = [regular_class, spam_class]

    for email_class in emails_classes:
        multinomial(email_class)
    for email_class in emails_classes:
        bernouilli(email_class)

    likelihood_dumps = [
        ("multinomial_regular.txt", regular_class.m_likelihood),
        ("multinomial_spam.txt", spam_class.m_likelihood),
        ("bernouilli_regular.txt", regular_class.b_likelihood),
        ("bernouilli_spam.txt", spam_class.b_likelihood),
    ]
    for path, likelihood in likelihood_dumps:
        # ``with`` guarantees the handle is closed even if the write fails.
        with open(path, "w") as handle:
            handle.write("%s\n" % (likelihood,))

    actual_labels, testing_emails = readTestingEmails()

    hypothetical_m_classifier = []
    hypothetical_b_classifier = []
    map_m_estimate = []
    map_b_estimate = []
    for email in testing_emails:
        # The spam class is passed first, the regular class second.
        map_estimate, label = multinomial_classifier(
            email, spam_class, regular_class)
        map_estimate2, label2 = bernouilli_classifier(
            email, spam_class, regular_class)
        hypothetical_m_classifier.append(label)
        hypothetical_b_classifier.append(label2)
        map_m_estimate.append(map_estimate)
        map_b_estimate.append(map_estimate2)

    confusionMatrixM = zeros((2, 2))
    confusionMatrixB = zeros((2, 2))
    for actual, predicted_m, predicted_b in zip(
            actual_labels, hypothetical_m_classifier,
            hypothetical_b_classifier):
        confusionMatrixM[actual][predicted_m] += 1
        confusionMatrixB[actual][predicted_b] += 1

    # Rows are indexed by the actual label, so each row is normalised by the
    # number of test items that really carry that label. Non-empty rows then
    # sum to 100; an empty row stays all-zero instead of dividing by 0.
    for matrix in (confusionMatrixM, confusionMatrixB):
        for row in range(2):
            row_total = matrix[row].sum()
            if row_total:
                matrix[row] = matrix[row] * 100.0 / row_total

    print(confusionMatrixM)
    print(confusionMatrixB)

    # A non-zero difference marks a misclassified email.
    error = array(actual_labels) - array(hypothetical_m_classifier)
    error_b = array(actual_labels) - array(hypothetical_b_classifier)
    error_value = float(count_nonzero(error)) / len(testing_emails)
    error_b_value = float(count_nonzero(error_b)) / len(testing_emails)

    by_value = operator.itemgetter(1)
    sorted_m_reg = sorted(regular_class.m_likelihood.items(),
                          key=by_value, reverse=True)
    sorted_m_spam = sorted(spam_class.m_likelihood.items(),
                           key=by_value, reverse=True)
    sorted_b_reg = sorted(regular_class.b_likelihood.items(),
                          key=by_value, reverse=True)
    sorted_b_spam = sorted(spam_class.b_likelihood.items(),
                           key=by_value, reverse=True)

    # Slicing prints at most the requested entries and copes with
    # vocabularies smaller than 20 words.
    # NOTE(review): this ranking starts at index 1, so the single most likely
    # word is not shown, unlike the three rankings below - confirm whether
    # skipping it is intended.
    for word, value in sorted_m_reg[1:20]:
        print("Word:  %s  Value:  %s" % (word, value))

    print("WAAAHOOO")
    for word, value in sorted_m_spam[:20]:
        print("Word:  %s  Value:  %s" % (word, value))

    print("Booyah")
    for word, value in sorted_b_reg[:20]:
        print("Word:  %s  Value:  %s" % (word, value))
    print("Huzzah!")
    for word, value in sorted_b_spam[:20]:
        print("Word:  %s  Value:  %s" % (word, value))

    print("The multinomial error is  %s" % error_value)

    print("The bernouilli error is  %s" % error_b_value)

    result_dumps = [
        ("map_m_estimate.txt", map_m_estimate),
        ("map_b_estimate.txt", map_b_estimate),
        ("hypothetical_m_classifier.txt", hypothetical_m_classifier),
        ("hypothetical_b_classifier.txt", hypothetical_b_classifier),
        ("actual_labels.txt", actual_labels),
    ]
    for path, values in result_dumps:
        with open(path, "w") as handle:
            handle.write("%s\n" % (values,))