/
main.py
60 lines (39 loc) · 1.61 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from helpers import *
import perceptron
import pegasos
# Train a perceptron and a family of Pegasos classifiers on spam_train.txt,
# then report the error of each of them on spam_val.txt.
#
# read_instances, build_vocabulary, build_labels, build_features, test and
# Counter are expected to come from `from helpers import *`.
#
# Every print below uses a single pre-formatted string so that the script
# produces the same output under the Python 2 print statement and the
# Python 3 print function.

# Limit perceptron training iterations
PERCEPTRON_MAX_ITER = 30
# Vocabulary entries whose count is below this value are discarded
MIN_WORD_COUNT = 30
# Pegasos is trained once per regularization value lambda = 2 ** exponent.
# To train with a single hard-coded lambda instead, use e.g. [-7].
PEGASOS_LAMBDA_EXPONENTS = list(range(-9, 9))

# ---------------------------------------------------------------- training
print("Reading training file...")
train_lines = read_instances("spam_train.txt")

print("Counting words...")
vocabulary = build_vocabulary(train_lines)

print("Building labels...")
train_labels = build_labels(train_lines)

print("Filtering instances...")
vocabulary = Counter({
    word: count
    for word, count in vocabulary.items()
    if count >= MIN_WORD_COUNT
})

print("Buiding feature vectors...")
train_features = build_features(train_lines, vocabulary)

print("Calling perceptron train")
(w, k, it) = perceptron.train(train_features, train_labels, PERCEPTRON_MAX_ITER)
print("Number of iterations (perceptron): %s" % (it,))
print("Number of mistakes (perceptron): %s" % (k,))
print("15 most positive words (perceptron): %s" % (w.most_common(15),))
# Reversed slice of the full ranking: the 15 lowest-weighted entries,
# lowest first.
print("15 most negative words: (perceptron) %s" % (w.most_common()[:-15 - 1:-1],))

print("Calling pegasos train")
pegasos_weights = [
    pegasos.train(train_features, train_labels, 2 ** exponent)
    for exponent in PEGASOS_LAMBDA_EXPONENTS
]

# -------------------------------------------------------------- validation
print("Reading validation file...")
val_lines = read_instances("spam_val.txt")

print("Building feature vectors (validation)...")
# The validation features are built with the filtered training vocabulary.
val_features = build_features(val_lines, vocabulary)

print("Building labels (validation)...")
val_labels = build_labels(val_lines)

print("Calling perceptron test...")
error = test(w, val_features, val_labels)
print("Perceptron error on validation: %s %%" % (error,))

# Report each Pegasos model together with the lambda it was trained with,
# so the best regularization value can be read off the output. A dedicated
# loop variable keeps the perceptron weights in `w` intact.
for exponent, pegasos_w in zip(PEGASOS_LAMBDA_EXPONENTS, pegasos_weights):
    print("Calling pegasos test...")
    error = test(pegasos_w, val_features, val_labels)
    print("Pegasos Error on validation (lambda = 2**%d): %s %%" % (exponent, error))