"""
Copyright 2017 Neural Networks and Deep Learning lab, MIPT
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import tensorflow as tf
from tf_layers import cudnn_lstm
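
# Overview (added note): this module defines an LSTM language model built on
# cuDNN LSTM layers via the project-local `cudnn_lstm` helper from tf_layers.
# Input and output token embeddings are tied through a single embedding matrix,
# variational dropout is applied between layers, and an optional backward pass
# turns the model into a bidirectional LM whose forward and backward losses are
# averaged.
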
class LM:
    def __init__(self,
                 vocab_size,
                 tok_emb_mat,
                 emb_dim=256,
                 n_hidden=512,
                 n_layers=1,
                 n_unroll=70,
                 model_name='test_model',
                 gpu=1,
                 bidirectional=False,
                 dropout_keep_prob=0.7):
        tf.reset_default_graph()
        self.learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name='lr')
        self._dropout_ph = tf.placeholder_with_default(1.0, shape=[], name='drop')
        self.tok_ph = tf.placeholder(dtype=tf.int32, shape=[None, None], name='tok_idxs')
        self.mask_ph = tf.placeholder_with_default(tf.ones_like(self.tok_ph, dtype=tf.float32), shape=[None, None])
        self.model_name = model_name
        self.vocab_size = vocab_size
        self.n_unroll = n_unroll
        self.dropout_keep_prob = dropout_keep_prob
        # Embeddings
        emb_mat = tf.Variable(tok_emb_mat, name='Embeddings_Mat', trainable=True)
        embs = tf.nn.embedding_lookup(emb_mat, self.tok_ph)

        # Forward LSTM
        with tf.variable_scope('Forward'):
            units = embs[:, :-1, :]
            units = self._variational_dropout(units, self._dropout_ph)
            for n in range(n_layers):
                with tf.variable_scope('LSTM_' + str(n)):
                    units, _ = cudnn_lstm(units, n_hidden)
                if n != n_layers - 1:
                    units = self._variational_dropout(units, self._dropout_ph)
            if n_hidden != emb_dim:
                units = tf.layers.dense(units, emb_dim, name='Output_Projection')
            units = self._variational_dropout(units, self._dropout_ph)
            logits_fw = tf.tensordot(units, emb_mat, (2, 1))

        targets = tf.one_hot(self.tok_ph, self.vocab_size)
        fw_loss = tf.losses.softmax_cross_entropy(targets[:, 1:, :], logits_fw,
                                                  reduction=tf.losses.Reduction.NONE)
        fw_loss = self.mask_ph[:, 1:] * fw_loss
        self.loss = fw_loss
        if bidirectional:
            # Backward LSTM
            # Lengths assumed to be equal to n_unroll + n_hist
            lengths = tf.cast(tf.reduce_sum(self.mask_ph, 1), tf.int32)
            embs_bw = tf.reverse_sequence(embs, lengths, seq_axis=1, batch_axis=0)
            with tf.variable_scope('Backward'):
                units = embs_bw[:, :-1, :]
                for n in range(n_layers):
                    with tf.variable_scope('LSTM_' + str(n)):
                        units, _ = cudnn_lstm(units, n_hidden)
                    if n != n_layers - 1:
                        units = self._variational_dropout(units, self._dropout_ph)
                if n_hidden != emb_dim:
                    units = tf.layers.dense(units, emb_dim, name='Output_Projection')
                units = self._variational_dropout(units, self._dropout_ph)
                logits_bw = tf.tensordot(units, emb_mat, (2, 1))

            targets_bw = tf.one_hot(tf.reverse_sequence(self.tok_ph, lengths, seq_axis=1, batch_axis=0),
                                    self.vocab_size)
            bw_loss = tf.losses.softmax_cross_entropy(targets_bw[:, 1:, :], logits_bw,
                                                      reduction=tf.losses.Reduction.NONE)
            bw_loss = self.mask_ph[:, 1:] * bw_loss
            self.loss = (self.loss + bw_loss) / 2

        self.loss = tf.reduce_sum(self.loss) / tf.reduce_sum(self.mask_ph)
        # Summary
        tf.summary.scalar('log_loss', self.loss)
        self.summary = tf.summary.merge_all()

        # Predictions
        self.pred = tf.argmax(logits_fw, axis=-1)
        if bidirectional:
            self.pred_bw = tf.argmax(tf.reverse_sequence(logits_bw, lengths, seq_axis=1, batch_axis=0), axis=-1)

        # Train ops
        self.train_op = self.get_train_op(self.loss, self.learning_rate_ph, clip_norm=5.0,
                                          optimizer_scope_name='Optimizer')

        # The session
        config = tf.ConfigProto()
        config.gpu_options.visible_device_list = str(gpu)
        self.sess = tf.Session(config=config)

        # Init variables
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        # self.saver.restore(self.sess, 'model/reddit_lm.ckpt')
        self.summary_writer = tf.summary.FileWriter('model/' + self.model_name, self.sess.graph)

    def get_train_op(self,
                     loss,
                     learning_rate,
                     optimizer=None,
                     clip_norm=None,
                     learnable_scopes=None,
                     optimizer_scope_name=None):
        """ Get train operation for a given loss

        Args:
            loss: loss, tf tensor or scalar
            learning_rate: scalar or placeholder
            optimizer: tf.train.Optimizer subclass to instantiate, defaults to AdamOptimizer
            clip_norm: clip each gradient's norm to clip_norm
            learnable_scopes: which scopes are trainable (None for all)
            optimizer_scope_name: variable scope name for the optimizer's variables

        Returns:
            train_op
        """
        if optimizer_scope_name is None:
            opt_scope = tf.variable_scope('Optimizer')
        else:
            opt_scope = tf.variable_scope(optimizer_scope_name)
        with opt_scope:
            if learnable_scopes is None:
                variables_to_train = tf.trainable_variables()
            else:
                variables_to_train = []
                for scope_name in learnable_scopes:
                    for var in tf.trainable_variables():
                        if scope_name in var.name:
                            variables_to_train.append(var)

            if optimizer is None:
                optimizer = tf.train.AdamOptimizer

            # For batch norm it is necessary to update running averages
            extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(extra_update_ops):
                opt = optimizer(learning_rate)
                grads_and_vars = opt.compute_gradients(loss, var_list=variables_to_train)
                if clip_norm is not None:
                    grads_and_vars = [(tf.clip_by_norm(grad, clip_norm), var)
                                      for grad, var in grads_and_vars if grad is not None]
                train_op = opt.apply_gradients(grads_and_vars)
        return train_op
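
    # Usage sketch for get_train_op (illustrative only; `some_loss` and `lr_ph`
    # are hypothetical names, not attributes defined in this module):
    #
    #     lr_ph = tf.placeholder(tf.float32, shape=[])
    #     train_op = self.get_train_op(some_loss, lr_ph,
    #                                  clip_norm=5.0,
    #                                  learnable_scopes=['Forward'],
    #                                  optimizer_scope_name='Optimizer')
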
    @staticmethod
    def _variational_dropout(units, keep_prob):
        # Variational dropout: the same dropout mask is reused at every time step
        # (noise is broadcast over axis 1), so whole feature channels are dropped.
        noise_shape = [tf.shape(units)[0], 1, tf.shape(units)[2]]
        return tf.nn.dropout(units, keep_prob, noise_shape)

    def train(self, corp, batch_size=32, lr=3e-3, every_n=10000, n_epochs=10):
        total_loss = 0
        best_loss = 1e10
        best_val_loss = 1e10
        count = 0
        for epoch in range(n_epochs):
            print('Epoch {}'.format(epoch))
            for n, (x, mask) in enumerate(corp.batch_generator(batch_size, self.n_unroll)):
                loss, summary, _ = self.sess.run([self.loss, self.summary, self.train_op],
                                                 {self.tok_ph: x,
                                                  self.learning_rate_ph: lr,
                                                  self._dropout_ph: self.dropout_keep_prob,
                                                  self.mask_ph: mask})
                self.summary_writer.add_summary(summary, count + n)
                total_loss += loss
                if n % every_n == every_n - 1:
                    print(total_loss / every_n)
                    if total_loss / every_n < best_loss:
                        best_loss = total_loss / every_n
                        print('New best loss: {}, model saved'.format(best_loss))
                        self.saver.save(self.sess, 'model/' + self.model_name + '.ckpt')
                    total_loss = 0

                    val_loss = 0
                    n_val = 0
                    for x, mask in corp.batch_generator(batch_size, self.n_unroll):
                        loss = self.sess.run(self.loss, {self.tok_ph: x,
                                                         self.learning_rate_ph: lr,
                                                         self._dropout_ph: 1.0,
                                                         self.mask_ph: mask})
                        val_loss += loss
                        n_val += 1
                    print('Validation loss: {}'.format(val_loss / n_val))
                    if val_loss / n_val < best_val_loss:
                        best_val_loss = val_loss / n_val
                        print('New best val loss: {}, model saved'.format(best_val_loss))
                        self.saver.save(self.sess, 'model/' + self.model_name + '_val.ckpt')
                    val_loss = 0
                    break
            count += n
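

# ---------------------------------------------------------------------------
# Minimal usage sketch (an assumption-laden example, not part of the original
# pipeline). It assumes a `corpus` object exposing
# batch_generator(batch_size, n_unroll) that yields (token_idxs, mask) pairs
# of shape [batch_size, n_unroll], and a pre-built token embedding matrix;
# neither is defined in this module.
#
#     import numpy as np
#
#     vocab_size, emb_dim = 10000, 256
#     tok_emb_mat = np.random.randn(vocab_size, emb_dim).astype(np.float32)
#     lm = LM(vocab_size, tok_emb_mat, emb_dim=emb_dim, n_hidden=512,
#             n_layers=1, bidirectional=True, model_name='demo_lm', gpu=0)
#     lm.train(corpus, batch_size=32, lr=3e-3, every_n=10000, n_epochs=10)
# ---------------------------------------------------------------------------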