Example #1
0
 def __init__(self, train_data, beta=1):
     """Train a Naive Bayes spam classifier with a Beta(beta, beta) prior.

     Counts feature occurrences per class over the rows of train_data,
     converts them to MAP-smoothed Bernoulli probabilities, and stores the
     resulting log-odds in self.weights ('base' holds the bias term).

     Args:
         train_data: handle accepted by util.get_rows; each row is a dict of
             feature counts plus a 'spam' label key.
         beta: Beta-prior hyperparameter (beta=2 gives Laplace smoothing;
             the default beta=1 applies no smoothing, so a feature seen in
             only one class still yields log(0) — pass beta >= 2 to avoid).
     """
     self.counts = Counter()
     beta = int(beta)
     spam = 0
     total = 0
     spam_counts = Counter()
     ham_counts = Counter()
     for row in util.get_rows(train_data):
         if row['spam']:
             spam += 1
             del row['spam']
             spam_counts.update(row)
         else:
             del row['spam']
             ham_counts.update(row)
         total += 1
     ham = total - spam
     # Class priors, MAP-smoothed: (ham_prob, spam_prob).
     prior = ((ham + beta - 1) / (total + 2*beta - 2),
              (spam + beta - 1) / (total + 2*beta - 2))
     # Smooth over the UNION of features seen in either class. The original
     # code smoothed each Counter's own keys only, so a feature present in
     # just one class kept the Counter default of 0 on the other side and
     # the log-odds computation below crashed on log(0) even with beta > 1.
     features = set(spam_counts) | set(ham_counts)
     spam_prior = {k: (spam_counts[k] + beta - 1) / (spam + 2*beta - 2)
                   for k in features}
     ham_prior = {k: (ham_counts[k] + beta - 1) / (ham + 2*beta - 2)
                  for k in features}
     self.weights = {}
     self.weights['base'] = math.log(prior[1]/prior[0])
     for k in features:
         # Absent features contribute their (1 - p) odds to the bias...
         self.weights['base'] += math.log((1 - spam_prior[k]) /
                                          (1 - ham_prior[k]))
         # ...and each present feature flips that contribution to p's odds.
         self.weights[k] = (math.log(spam_prior[k] / ham_prior[k]) -
                            math.log((1 - spam_prior[k]) /
                                     (1 - ham_prior[k])))
Example #2
0
def main(classifier, test_file, model_file):
    """Persist the trained model, then score it on a labelled test set.

    Prints the classifier's spam probability (4 decimal places) for every
    row of test_file, followed by overall rounded-prediction accuracy.
    """
    classifier.dump(model_file)
    total = 0
    hits = 0
    for row in util.get_rows(test_file):
        label = row['spam']
        p = classifier.get_prob(row)
        print(round(p, 4))
        # Rounding p thresholds at 0.5; a bool compare adds 0 or 1.
        hits += (round(p) == label)
        total += 1
    print('\nAccuracy: {:.2%}'.format(hits/total))
Example #3
0
 def __init__(self, train_data, eta, sigma):
     """Fit logistic-regression weights by per-row stochastic gradient
     ascent with Gaussian (L2) regularization.

     Args:
         train_data: handle accepted by util.get_rows; rows are dicts of
             feature values plus a 'spam' target key.
         eta: learning rate.
         sigma: Gaussian prior scale; the penalty term is weight / sigma**2.
     """
     eta = float(eta)
     sigma = float(sigma)
     # 'base' is the bias feature; every weight starts at 0.
     headers = ['base'] + util.get_headers(train_data)
     self.weights = dict.fromkeys(headers, 0)
     for i in range(100):  # at most 100 epochs over the (shuffled) data
         # NOTE(review): gradient is reset once per EPOCH but overwritten
         # per row below, so features absent from the current row keep a
         # stale gradient from an earlier row when the weights are updated.
         # Possibly intentional sparse-update behavior — confirm.
         gradient = defaultdict(float)
         for row in util.get_rows(train_data, shuffle=True):
             target = row['spam']
             del row['spam']
             row['base'] = 1  # inject the bias feature with value 1
             # Residual: target minus predicted probability (presumably
             # cond_log_likelihood returns P(spam=1 | row) — verify).
             w = target - cond_log_likelihood(self.weights, row)
             for f, x in row.items():
                 # Per-feature ascent direction minus the L2 penalty.
                 gradient[f] = x*w - (self.weights[f] / (sigma**2))
             # NOTE(review): this break only exits the inner row loop; the
             # outer epoch loop continues. Convergence is judged on the
             # mixed stale/fresh gradient described above.
             if magnitude(gradient.values()) < 0.01:
                 break
             # Weight update uses every weight's gradient entry, including
             # stale ones for features not in this row (see NOTE above).
             for f in self.weights:
                 self.weights[f] += eta * gradient[f]
Example #4
0
 def __init__(self, train_data, eta, false=0):
     """Train a perceptron on the rows of train_data.

     Runs up to 1000 passes over the data, applying the classic perceptron
     update eta * (target - output) on every misclassified row, and stops
     early after the first error-free pass. Prints the 1-based pass number
     at the start of each pass. Note the 'base' bias feature is added to
     the row only AFTER self.classify is called — presumably classify
     accounts for the bias itself; verify against its implementation.
     """
     eta = float(eta)
     self.false = int(false)
     headers = ['base'] + util.get_headers(train_data)
     self.weights = dict.fromkeys(headers, 0)
     for epoch in range(1000):
         print(epoch + 1)
         mistakes = False
         for row in util.get_rows(train_data, false=self.false):
             label = row.pop('spam')
             prediction = self.classify(row)
             if prediction == label:
                 continue
             mistakes = True
             step = eta * (label - prediction)
             row['base'] = 1
             for feature, value in row.items():
                 self.weights[feature] += step * value
         if not mistakes:
             break