def __init__(self, train_data, beta=1):
    """Fit naive Bayes log-odds weights from the rows of *train_data*.

    Each row from ``util.get_rows(train_data)`` is assigned to the spam or
    ham class by the truthiness of its ``'spam'`` entry. The remaining
    entries are summed per feature and per class, turned into smoothed
    per-class probabilities, and stored in ``self.weights`` as log-odds:
    ``'base'`` holds the prior log-odds plus the contribution of every
    feature being absent, and each feature key holds the extra log-odds
    added when that feature is present.

    Args:
        train_data: Passed unchanged to ``util.get_rows``.
        beta: Smoothing parameter, truncated with ``int``. ``beta - 1`` is
            added to every count and ``2 * beta - 2`` to every denominator,
            so the default ``beta=1`` applies no smoothing.

    Raises:
        ZeroDivisionError, ValueError: If an estimated probability is
            exactly 0 or 1, or a class is empty, which is possible whenever
            smoothing is disabled (``beta=1``).
    """
    # Initialised by this constructor but not used within it; kept because
    # code outside this block may read it.
    self.counts = Counter()
    beta = int(beta)

    spam = 0
    total = 0
    spam_counts = Counter()
    ham_counts = Counter()
    for row in util.get_rows(train_data):
        is_spam = row['spam']
        del row['spam']  # the label must not be counted as a feature
        if is_spam:
            spam += 1
            spam_counts.update(row)
        else:
            ham_counts.update(row)
        total += 1
    ham = total - spam

    # (P(ham), P(spam)) with the same smoothing as the feature estimates.
    prior = ((ham + beta - 1) / (total + 2*beta - 2),
             (spam + beta - 1) / (total + 2*beta - 2))

    # Smooth over the union of features. A feature observed in only one
    # class (or any feature when a class has no rows) must still receive
    # the smoothed estimate; falling back to the Counter default of 0
    # would make the logarithms below fail even with beta >= 2.
    features = set(spam_counts) | set(ham_counts)
    spam_prob = {
        k: (spam_counts[k] + beta - 1) / (spam + 2*beta - 2)
        for k in features
    }
    ham_prob = {
        k: (ham_counts[k] + beta - 1) / (ham + 2*beta - 2)
        for k in features
    }

    self.weights = {}
    self.weights['base'] = math.log(prior[1] / prior[0])
    for k in features:
        # Log-odds contributed by feature k being absent; folded into the
        # bias so that only present features need a per-feature weight.
        absent = math.log((1 - spam_prob[k]) / (1 - ham_prob[k]))
        self.weights['base'] += absent
        self.weights[k] = math.log(spam_prob[k] / ham_prob[k]) - absent
def main(classifier, test_file, model_file):
    """Dump the model, then print per-row probabilities and overall accuracy.

    Args:
        classifier: Object providing ``dump(model_file)`` and
            ``get_prob(row)``.
        test_file: Passed unchanged to ``util.get_rows``; every row must
            carry a ``'spam'`` entry holding the true label.
        model_file: Passed unchanged to ``classifier.dump``.
    """
    classifier.dump(model_file)

    count = correct = 0
    for row in util.get_rows(test_file):
        # Read the label before scoring, in the same order as the original.
        is_spam = row['spam']
        prob = classifier.get_prob(row)
        print(round(prob, 4))
        # round() turns the probability into a 0/1 prediction that is
        # compared with the label (Python 3 rounds exactly 0.5 down to 0).
        if round(prob) == is_spam:
            correct += 1
        count += 1

    if not count:
        # An empty test set has no defined accuracy; report that instead of
        # failing with ZeroDivisionError.
        print('\nAccuracy: n/a (no test rows)')
        return
    print('\nAccuracy: {:.2%}'.format(correct/count))
# Gradient-based trainer (looks like regularised logistic regression -- confirm).
# eta and sigma are coerced with float(). Every header from
# util.get_headers(train_data), plus a constant 'base' feature, starts with
# weight 0. Up to 100 passes are made over
# util.get_rows(train_data, shuffle=True). For each row the 'spam' label is
# removed, row['base'] is set to 1, and for every feature f with value x:
#     gradient[f] = x * (target - cond_log_likelihood(self.weights, row))
#                   - self.weights[f] / sigma**2
# Training stops when magnitude(gradient.values()) < 0.01; otherwise each
# weight is moved by eta * gradient[f].
# NOTE(review): the weights[f] / sigma**2 term is presumably a Gaussian-prior
# (L2) penalty with standard deviation sigma -- confirm.
# NOTE(review): cond_log_likelihood and magnitude are defined outside this
# block; what they return cannot be verified from here.
# NOTE(review): the indentation of this block has been lost, so it is unclear
# whether the convergence check and the weight update run once per row or once
# per pass. gradient[f] is also assigned rather than accumulated, so if a
# per-pass (batch) update was intended only the last row of each pass
# contributes. Restore the original indentation before changing this code.
def __init__(self, train_data, eta, sigma): eta = float(eta) sigma = float(sigma) headers = ['base'] + util.get_headers(train_data) self.weights = dict.fromkeys(headers, 0) for i in range(100): gradient = defaultdict(float) for row in util.get_rows(train_data, shuffle=True): target = row['spam'] del row['spam'] row['base'] = 1 w = target - cond_log_likelihood(self.weights, row) for f, x in row.items(): gradient[f] = x*w - (self.weights[f] / (sigma**2)) if magnitude(gradient.values()) < 0.01: break for f in self.weights: self.weights[f] += eta * gradient[f]
def __init__(self, train_data, eta, false=0, *, max_epochs=1000):
    """Train perceptron weights on the rows of *train_data*.

    Every header from ``util.get_headers(train_data)``, plus a constant
    ``'base'`` feature, starts with weight 0. Each epoch prints its
    1-based number, classifies every row with ``self.classify`` and, for
    each misclassified row, moves the weights by
    ``eta * (target - output) * value``. Training stops after the first
    epoch without a misclassification, or after *max_epochs* epochs.

    Args:
        train_data: Passed unchanged to ``util.get_headers`` and
            ``util.get_rows``.
        eta: Learning rate, coerced with ``float``.
        false: Coerced with ``int``, stored as ``self.false`` and forwarded
            to ``util.get_rows(..., false=...)``; its meaning is defined by
            that helper.
        max_epochs: Upper bound on the number of passes over the data,
            coerced with ``int``. Defaults to 1000, the previously
            hard-coded limit.
    """
    eta = float(eta)
    self.false = int(false)
    max_epochs = int(max_epochs)
    headers = ['base'] + util.get_headers(train_data)
    self.weights = dict.fromkeys(headers, 0)

    for epoch in range(1, max_epochs + 1):
        print(epoch)
        error = False
        for row in util.get_rows(train_data, false=self.false):
            target = row['spam']
            del row['spam']  # the label must not reach the classifier as a feature
            output = self.classify(row)
            if output == target:
                # delta would be 0 here, so the update could not move any weight.
                continue
            error = True
            delta = eta * (target - output)
            # The bias is added only after classify() has seen the row, as in
            # the original; it is updated like any other feature.
            row['base'] = 1
            for x, value in row.items():
                self.weights[x] += delta * value
        if not error:
            # A full pass without mistakes: the weights separate the data.
            break