# train.py (forked from turian/parser-model)
# executable file · 156 lines (134 loc) · 5.43 KB
#!/usr/bin/env python
# Script setup: load the hyperparameters, create the run directory, seed the
# random number generators and derive the network dimensions.
# NOTE(review): imports are interleaved with calls below; `examples` and
# `vocabulary` are imported only after reparse() has run. Keep this order
# unless it is confirmed that those modules do not depend on it.
import common.hyperparameters, common.options
# Read the hyperparameter set named "attardi07_english_ptb".
HYPERPARAMETERS = common.hyperparameters.read("attardi07_english_ptb")
# Presumably applies command-line overrides to HYPERPARAMETERS -- TODO confirm.
common.options.reparse(HYPERPARAMETERS)
import common.dump
# Directory for this run; handed to state.save() whenever a checkpoint is written.
rundir = common.dump.create_canonical_directory(HYPERPARAMETERS)
import examples
# NOTE(review): wildcard import; `featuremap` and `labelmap` used below are
# presumably provided by it -- verify against vocabulary.py.
from vocabulary import *
from common.stats import stats
from common.file import myopen
import sys
import numpy as N
import math
import random
#from common import movingaverage
import common.movingaverage
# Seed both the stdlib and the NumPy generators from the same hyperparameter.
random.seed(HYPERPARAMETERS["random seed"])
N.random.seed(HYPERPARAMETERS["random seed"])
# Input and output dimensions, taken from the feature map and the label map.
IDIM = featuremap.len
ODIM = labelmap.len
# Hidden-layer width and learning rate.
HID = HYPERPARAMETERS["hidden dimensions"]
LR = HYPERPARAMETERS["learning rate"]
# Code in this file branches on HLAYERS == 2; any other value takes the
# single-hidden-layer path.
HLAYERS = HYPERPARAMETERS["hidden layers"]
from pylearn.algorithms.weights import random_weights
# Model parameters: random weight matrices paired with zero bias vectors.
# The random_weights() calls stay in the order input -> (hidden) -> output so
# the seeded generator is consumed in the same sequence as before.
w1, b1 = random_weights(IDIM, HID), N.zeros(HID)
if HLAYERS == 2:
    # Extra hidden-to-hidden layer, only present in the two-layer setup.
    wh, bh = random_weights(HID, HID), N.zeros(HID)
w2, b2 = random_weights(HID, ODIM), N.zeros(ODIM)
import graph
import state
def abs_prehidden(prehidden, str="Prehidden"):
    """Log the median and the five largest absolute values of ``prehidden``.

    One line is written to stderr, prefixed with the module-level example
    counter ``cnt``.

    Args:
        prehidden: array-like whose ``tolist()`` form contains exactly one
            row (for example shape ``(1, k)``); this is asserted.
        str: label inserted after "Abs" in the message. The name shadows the
            builtin but is kept so existing keyword callers keep working.
    """
    magnitudes = N.abs(prehidden)
    med = N.median(magnitudes)
    rows = magnitudes.tolist()
    assert len(rows) == 1
    # Ascending sort followed by a reversal, exactly like the previous
    # sort()/reverse() pair, then keep the five largest entries.
    largest = sorted(rows[0])[::-1][:5]
    # sys.stderr.write instead of the Python-2-only ``print >>`` statement;
    # the emitted text is unchanged and matches the logging style used by
    # the rest of this file.
    sys.stderr.write("%s Abs%s median = %s max = %s\n" % (cnt, str, med, largest))
# Best validation accuracy seen so far and the training-example count at
# which it was reached. The training loop updates both, reports them in its
# log line, passes them to state.save(), and uses best_validation_at in its
# early-stopping test.
best_validation_accuracy = 0.
best_validation_at = 0
def validate():
    """Score the current parameters on the validation examples.

    Each example contributes 1.0 when the predicted argmax equals its label
    and 0.0 otherwise. For the first five examples the pre-hidden activation
    magnitudes are also logged through ``abs_prehidden``.

    Returns:
        ``(mean, std)`` of the per-example 0/1 scores.
    """
    two_layers = HLAYERS == 2
    hits = []
    for idx, (x, y) in enumerate(examples.get_validation_example()):
        # Target is the bare label index when locally normalizing, otherwise
        # a one-hot vector over the output labels.
        if not HYPERPARAMETERS["locally normalize"]:
            targety = N.zeros(ODIM)
            targety[y] = 1.
        else:
            targety = N.array([y])

        # Only the rows of w1 selected by x.indices are passed to the graph.
        if two_layers:
            kl, softmax, argmax, prehidden1, prehidden2 = graph.validatefn(
                [x.data], targety, w1[x.indices], b1, wh, bh, w2, b2)
        else:
            kl, softmax, argmax, prehidden = graph.validatefn(
                [x.data], targety, w1[x.indices], b1, w2, b2)

        hits.append(1. if argmax == y else 0.)

        # Diagnostics are limited to the first five validation examples.
        if idx >= 5:
            continue
        if two_layers:
            abs_prehidden(prehidden1, "Prehidden1")
            abs_prehidden(prehidden2, "Prehidden2")
        else:
            abs_prehidden(prehidden)
    return N.mean(hits), N.std(hits)
# Online training loop: one gradient step per training example, periodic
# validation with checkpointing of the best parameters, and early stopping.
#
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been lost. In particular it assumes (a) the parameter
# updates run for every example, including one whose loss is NaN, and (b) the
# early-stopping test is the `elif` of the "new best" check, so it is only
# evaluated right after a validation pass. Confirm against upstream.

# Moving averages of per-example training accuracy (0 or 100) and loss.
mvgavg_accuracy = common.movingaverage.MovingAverage()
mvgavg_loss = common.movingaverage.MovingAverage()

# Number of training examples consumed so far; abs_prehidden() reads this
# module-level name when it logs.
cnt = 0
#if HLAYERS == 2:
#    state.save((w1, b1, wh, bh, w2, b2), rundir, best_validation_accuracy, best_validation_at)
#else:
#    state.save((w1, b1, w2, b2), rundir, best_validation_accuracy, best_validation_at)
for (x, y) in examples.get_training_example():
    cnt += 1
#    print x, y
#    print "Target y =", y

    # Target is the bare label index when locally normalizing, otherwise a
    # one-hot vector over the output labels.
    if HYPERPARAMETERS["locally normalize"]:
        targety = N.array([y])
    else:
        targety = N.zeros(ODIM)
        targety[y] = 1.

    # Forward/backward pass. Only the rows of w1 selected by x.indices are
    # passed in, so gw1 is the gradient for exactly those rows.
    if HLAYERS == 2:
        o = graph.trainfn([x.data], targety, w1[x.indices], b1, wh, bh, w2, b2)
        (kl, softmax, argmax, prehidden1, prehidden2, gw1, gb1, gwh, gbh, gw2, gb2) = o
    else:
        o = graph.trainfn([x.data], targety, w1[x.indices], b1, w2, b2)
        (kl, softmax, argmax, prehidden, gw1, gb1, gw2, gb2) = o
#    print "old KL=%.3f, softmax=%s, argmax=%d" % (kl, softmax, argmax)
#    print "old KL=%.3f, argmax=%d" % (kl, argmax)

    if argmax == y: this_accuracy = 100.
    else: this_accuracy = 0.
    mvgavg_accuracy.add(this_accuracy)

    # A NaN loss is reported but kept out of the moving average.
    if N.isnan(kl).any():
        # sys.stderr.write rather than the Python-2-only ``print >>``
        # statement, matching the other log lines in this loop; same text.
        sys.stderr.write("Got a NaN loss\n")
    else:
        mvgavg_loss.add(kl)

    # Only sum the gradient along the non-zeroes.
    # How do we implement this as C code?
#    for idx in x.indices:
#        w1[idx,:] -= gw1[idx,:] * LR
    w1[x.indices] -= gw1 * LR
#    w1 -= gw1 * LR
    b1 -= gb1 * LR
    if HLAYERS == 2:
        wh -= gwh * LR
        bh -= gbh * LR
    w2 -= gw2 * LR
    b2 -= gb2 * LR

#    o = graph.validatefn(x, N.array([y]), w1, b1, w2, b2)
#    (kl, softmax, argmax, presquashh) = o
##    print "new KL=%.3f, softmax=%s, argmax=%d" % (kl, softmax, argmax)
#    print "new KL=%.3f, argmax=%d" % (kl, argmax)

    # Validate every "examples per validation" training examples.
    if cnt % HYPERPARAMETERS["examples per validation"] == 0:
        valacc, valstd = validate()
        sys.stderr.write("After %d training examples, validation accuracy: %.2f%%, stddev: %.2f%% (former best=%.2f%% at %d)\n" % (cnt, valacc*100, valstd*100, best_validation_accuracy*100, best_validation_at))
        if best_validation_accuracy < valacc:
            # New best: record it and checkpoint the current parameters.
            best_validation_accuracy = valacc
            best_validation_at = cnt
            sys.stderr.write("NEW BEST VALIDATION ACCURACY. Saving state.\n")
            if HLAYERS == 2:
                state.save((w1, b1, wh, bh, w2, b2), rundir, best_validation_accuracy, best_validation_at)
            else:
                state.save((w1, b1, w2, b2), rundir, best_validation_accuracy, best_validation_at)
        elif cnt > 2*best_validation_at and cnt >= HYPERPARAMETERS["minimum training updates"]:
            # Stop once more than twice as many examples have been seen as
            # at the last improvement, and the minimum update count is met.
            sys.stderr.write("Have not beaten best validation accuracy for a while. Terminating training...\n")
            sys.stderr.write(stats() + "\n")
            break

    # Progress report every 10000 training examples.
    if cnt % 10000 == 0:
        sys.stderr.write("After %d training examples, training accuracy %s\n" % (cnt, mvgavg_accuracy))
        sys.stderr.write("After %d training examples, training loss %s\n" % (cnt, mvgavg_loss))
        sys.stderr.write(stats() + "\n")

#graph.COMPILE_MODE.print_summary()