Example #1
0
 def __init__(self, train_data, beta=1):
     """Fit naive Bayes log-odds weights from labelled training rows.

     Rows are read from ``util.get_rows(train_data)``. Each row's ``"spam"``
     entry is tested for truthiness to pick the class, then deleted, and
     the remaining items are added to that class's feature counts with
     ``Counter.update``.
     NOTE(review): presumably the remaining values are 0/1 feature
     indicators -- confirm against ``util.get_rows``.

     The result is stored in ``self.weights``: ``"base"`` holds the prior
     log-odds plus the summed "feature absent" log-ratios, and every
     feature key holds the log-odds adjustment for that feature.

     Args:
         train_data: Passed unchanged to ``util.get_rows``.
         beta: Smoothing parameter. ``beta - 1`` is added to every count
             and ``2 * beta - 2`` to every denominator, so ``beta=1``
             applies no smoothing.

     Raises:
         ZeroDivisionError, ValueError: If an estimated probability is
             exactly 0 or 1 (possible when ``beta`` is 1, e.g. a class
             with no rows), since the log-odds are then undefined.
     """
     self.counts = Counter()
     spam = 0
     total = 0
     spam_counts = Counter()
     ham_counts = Counter()
     for row in util.get_rows(train_data):
         is_spam = row["spam"]
         # Drop the label so only feature entries are counted.
         del row["spam"]
         if is_spam:
             spam += 1
             spam_counts.update(row)
         else:
             ham_counts.update(row)
         total += 1
     ham = total - spam
     # prior[0] is P(ham), prior[1] is P(spam).
     prior = (
         (ham + beta - 1) / (total + 2 * beta - 2),
         (spam + beta - 1) / (total + 2 * beta - 2),
     )
     self.weights = {}
     self.weights["base"] = math.log(prior[1] / prior[0])
     # Smooth over the union of features seen in either class. Smoothing
     # only the keys already present in each Counter would leave a feature
     # seen in just one class at probability 0 in the other (Counter's
     # default), breaking the logs below even when beta > 1.
     features = set(spam_counts) | set(ham_counts)
     spam_prior = {
         k: (spam_counts[k] + beta - 1) / (spam + 2 * beta - 2)
         for k in features
     }
     ham_prior = {
         k: (ham_counts[k] + beta - 1) / (ham + 2 * beta - 2)
         for k in features
     }
     for k in features:
         # Log-ratio of "feature absent" likelihoods: folded into the base
         # so that a present feature only needs its own weight added.
         absent_ratio = math.log((1 - spam_prior[k]) / (1 - ham_prior[k]))
         self.weights["base"] += absent_ratio
         self.weights[k] = math.log(spam_prior[k] / ham_prior[k]) - absent_ratio
Example #2
0
def main(train, test, beta, model):
    """Train a classifier, save it, and report its accuracy on a test set.

    Builds ``nb.Classifier(train, beta)``, writes it out with
    ``util.dump_model(classifier, model)``, then classifies every row from
    ``util.get_rows(test)``. For each row the probability is printed
    rounded to 4 places, and the row counts as correct when the probability
    rounded to the nearest integer equals the second value returned by
    ``classifier.classify``.

    Args:
        train: Training data, passed to ``nb.Classifier``.
        test: Test data, passed to ``util.get_rows``.
        beta: Smoothing parameter, passed to ``nb.Classifier``.
        model: Destination passed to ``util.dump_model``.
    """
    classifier = nb.Classifier(train, beta)
    util.dump_model(classifier, model)
    count = correct = 0
    for row in util.get_rows(test):
        prob, is_spam = classifier.classify(row)
        print(round(prob, 4))
        if round(prob) == is_spam:
            correct += 1
        count += 1
    if count:
        print('\nAccuracy: {:.2%}'.format(correct/count))
    else:
        # An empty test set has no defined accuracy; avoid dividing by zero.
        print('\nAccuracy: n/a (no test rows)')