# -*- coding: utf-8 -*-
"""
Created on Wed Aug 14 10:45:16 2019

@author: lizhenping
"""
import codecs
import os

import numpy as np
import tensorflow as tf
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import math_ops

# NOTE: eager execution must stay disabled; everything below relies on
# graph-mode constructs (variable_scope / get_variable / tf.Session).
def load_embed_txt(embed_file, vocab):
    """Load a GloVe-style text embedding file into a {word: vector} dict,
    keeping only words present in `vocab`. Returns the dict and the
    embedding size."""
    emb_dict = dict()
    emb_size = None
    with codecs.getreader("utf-8")(tf.gfile.GFile(embed_file, 'rb')) as f:
        for line in f:
            tokens = line.strip().split(" ")
            word = tokens[0]
            if word not in vocab._word_to_id:
                continue
            vec = list(map(float, tokens[1:]))
            emb_dict[word] = vec
            if emb_size:
                assert emb_size == len(vec), "All embedding sizes should be the same."
            else:
                emb_size = len(vec)
    return emb_dict, emb_size
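
# A minimal usage sketch (added; not part of the original code). `embed_file`
# is assumed to be GloVe-style text -- one token per line followed by its
# floats -- and `vocab` any object exposing a `_word_to_id` dict as above:
#
#     emb_dict, emb_size = load_embed_txt("glove/glove.6B.200d.txt", vocab)
#     print(len(emb_dict), "in-vocab vectors of size", emb_size)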
def _create_pretrained_emb_from_vocab(vocab, embed_file, dtype=tf.float32, name=None):
    """Build an embedding matrix from a pretrained file; tokens missing from
    the file get random vectors. Every vocab token is treated as trainable
    here, so the constant slice below is empty (see _demo_emb_split)."""
    trainable_tokens = vocab._word_to_id
    emb_dict, emb_size = load_embed_txt(embed_file, vocab)
    for token in trainable_tokens:
        if token not in emb_dict:
            # NOTE: hard-coded to 200 dimensions; arguably should use emb_size.
            emb_dict[token] = np.random.normal(size=(200))
    emb_mat = np.array(
        [emb_dict[token] for token in vocab._word_to_id], dtype=dtype.as_numpy_dtype())
    num_trainable_tokens = emb_mat.shape[0]
    emb_size = emb_mat.shape[1]
    emb_mat = tf.constant(emb_mat)
    # Slice off the non-trainable tail (empty when every token is trainable).
    emb_mat_const = tf.slice(emb_mat, [num_trainable_tokens, 0], [-1, -1])
    emb_mat_var = tf.get_variable(name, [num_trainable_tokens, emb_size])
    return tf.concat([emb_mat_var, emb_mat_const], 0)
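
# Illustrative sketch (added; an assumption, not part of the original model):
# because every vocab token above counts as trainable, the tf.slice that
# carves out the "constant" tail begins past the last row and yields an empty
# block, so the returned embedding matrix is effectively all trainable.
def _demo_emb_split():
    emb = tf.constant(np.arange(12, dtype=np.float32).reshape(4, 3))
    const_part = tf.slice(emb, [4, 0], [-1, -1])  # begins past the last row
    assert const_part.get_shape().as_list()[0] == 0  # empty constant slice
    return const_part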
def linear(args, output_size, bias, bias_start=0.0, scope=None):
    """Linear map over a 2D tensor or a list/tuple of 2D tensors, each of
    shape [batch, dim]: concat(args) * Matrix (+ Bias)."""
    if args is None or (isinstance(args, (list, tuple)) and not args):
        raise ValueError("`args` must be specified")
    if not isinstance(args, (list, tuple)):
        args = [args]
    # Calculate the total size of arguments on dimension 1.
    total_arg_size = 0
    if isinstance(args, tuple):
        # A MultiRNNCell decoder state arrives here as a tuple of
        # LSTMStateTuples; keep only the top layer's (c, h) pair so the
        # projection sees two [batch, hidden] tensors.
        args = args[1]
    shapes = [a.get_shape().as_list() for a in args]
    # e.g. shapes == [[32, 200], [32, 400]] when projecting [input, context]
    for shape in shapes:
        if len(shape) != 2:
            raise ValueError("Linear is expecting 2D arguments: %s" % str(shapes))
        if not shape[1]:
            raise ValueError("Linear expects shape[1] of arguments: %s" % str(shapes))
        else:
            total_arg_size += shape[1]
    # Now the computation.
    with tf.variable_scope(scope or "Linear"):
        matrix = tf.get_variable("Matrix", [total_arg_size, output_size])
        if len(args) == 1:
            res = tf.matmul(args[0], matrix)
        else:
            # e.g. matrix shape (600, 200), concatenated values (32, 600)
            res = tf.matmul(tf.concat(axis=1, values=args), matrix)
        if not bias:
            return res
        bias_term = tf.get_variable(
            "Bias", [output_size], initializer=tf.constant_initializer(bias_start))
        return res + bias_term
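
# Illustrative sketch (added): the shape contract of `linear` as used in the
# decoder loop below -- a [32, 200] input and a [32, 400] context vector are
# concatenated on axis 1 and projected back to 200 units.
def _demo_linear_shapes():
    with tf.variable_scope("demo_linear", reuse=tf.AUTO_REUSE):
        inp = tf.zeros([32, 200])
        context = tf.zeros([32, 400])
        out = linear([inp, context], 200, True)  # Matrix is [600, 200]
    assert out.get_shape().as_list() == [32, 200]
    return out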
def attention_decoder(decoder_inputs, initial_state, encoder_states, enc_padding_mask,
                      cell, initial_state_attention=False):
    with variable_scope.variable_scope("attention_decoder") as scope:
        # If this line fails, it's because the batch size isn't defined.
        batch_size = encoder_states.get_shape()[0].value
        # If this line fails, it's because the attention length isn't defined.
        attn_size = encoder_states.get_shape()[2].value
        # Insert a dummy axis so the encoder states can be fed through conv2d:
        # shape (batch_size, attn_len, 1, attn_size).
        encoder_states = tf.expand_dims(encoder_states, axis=2)
        attention_vec_size = attn_size
        # W_h has shape (1, 1, attn_size, attention_vec_size), e.g. (1, 1, 400, 400).
        W_h = variable_scope.get_variable("W_h", [1, 1, attn_size, attention_vec_size])
        # The 1x1 convolution with W_h applies the same linear map W_h * h_i to
        # every encoder state h_i -- a position-wise dense layer, not a mere
        # reshape; the values change (see _demo_conv_equals_matmul after this
        # function). Result shape: (batch_size, attn_len, 1, attention_vec_size).
        # get_variable is called without an explicit initializer, so TF falls
        # back to its default (glorot_uniform in TF 1.x) -- the same default
        # tf.layers.conv2d inherits from tf.keras.layers.Conv.
        encoder_features = nn_ops.conv2d(encoder_states, W_h, [1, 1, 1, 1], "SAME")
        # Get the weight vector v of v^T tanh(...) -- Bahdanau-style attention.
        v = variable_scope.get_variable("v", [attention_vec_size])

        def attention(decoder_state):
            """One step of Bahdanau attention over the encoder states."""
            with variable_scope.variable_scope("attention"):
                # Pass the decoder state through a linear layer: W_s s_t + b_attn.
                # Shape (batch_size, attention_vec_size), e.g. (32, 400).
                decoder_features = linear(decoder_state, attention_vec_size, True)
                # Reshape to (batch_size, 1, 1, attention_vec_size) so it
                # broadcasts against encoder_features over the attn_len axis.
                decoder_features = tf.expand_dims(tf.expand_dims(decoder_features, 1), 1)

                def masked_attention(e):
                    """Take softmax of e, apply enc_padding_mask, re-normalize
                    (see _demo_masked_attention after this function)."""
                    attn_dist = nn_ops.softmax(e)  # shape (batch_size, attn_length)
                    attn_dist *= enc_padding_mask  # zero out padding positions
                    masked_sums = tf.reduce_sum(attn_dist, axis=1)  # shape (batch_size,)
                    return attn_dist / tf.reshape(masked_sums, [-1, 1])  # re-normalize

                # Attention score: e_i = v^T tanh(W_h h_i + W_s s_t + b_attn),
                # i.e. score = FC(tanh(FC(encoder_output) + FC(decoder_state))).
                e = math_ops.reduce_sum(
                    v * math_ops.tanh(encoder_features + decoder_features), [2, 3])
                # Calculate the attention distribution.
                attn_dist = masked_attention(e)
                # Context vector: the attention-weighted sum of encoder states.
                # attn_dist is reshaped to (batch_size, attn_len, 1, 1) so it
                # broadcasts against encoder_states.
                context_vector = math_ops.reduce_sum(
                    array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) * encoder_states,
                    [1, 2])  # shape (batch_size, attn_size)
                context_vector = array_ops.reshape(context_vector, [-1, attn_size])
            return context_vector, attn_dist

        outputs = []
        attn_dists = []
        state = initial_state
        context_vector = array_ops.zeros([batch_size, attn_size])
        # Ensure the second shape of attention vectors is set.
        context_vector.set_shape([None, attn_size])
        if initial_state_attention:  # true in decode mode
            # Re-calculate the context vector from the previous step so that we
            # can pass it through a linear layer with this step's input to get
            # a modified version of the input.
            context_vector, _ = attention(initial_state)
        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # Merge input and previous attentions into one vector x of the
            # same size as inp.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s" % inp.name)
            x = linear([inp] + [context_vector], input_size, True)
            cell_output, state = cell(x, state)
            # Run the attention mechanism.
            if i == 0 and initial_state_attention:  # always true in decode mode
                with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                                   reuse=True):
                    context_vector, attn_dist = attention(state)
            else:
                context_vector, attn_dist = attention(state)
            attn_dists.append(attn_dist)
            # Concatenate the cell output (= decoder state) and the context
            # vector, and pass them through a linear layer.
            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([cell_output] + [context_vector], cell.output_size, True)
            outputs.append(output)
        return outputs, state, attn_dists
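
# Illustrative sketches (added; assumptions, not part of the original model),
# referenced from the comments in attention_decoder above.

# 1) The 1x1 conv2d over the expanded encoder states is exactly a
#    position-wise linear map: convolving (batch, attn_len, 1, attn_size)
#    states with a (1, 1, attn_size, attention_vec_size) kernel equals
#    multiplying every state h_i by the same [attn_size, vec_size] matrix.
def _demo_conv_equals_matmul():
    states = tf.random_normal([2, 5, 4])              # (batch, attn_len, attn_size)
    W = tf.random_normal([4, 4])
    conv_out = nn_ops.conv2d(tf.expand_dims(states, 2),
                             tf.reshape(W, [1, 1, 4, 4]),
                             [1, 1, 1, 1], "SAME")    # (2, 5, 1, 4)
    matmul_out = tf.einsum("bld,de->ble", states, W)  # (2, 5, 4)
    diff = tf.reduce_max(tf.abs(tf.squeeze(conv_out, 2) - matmul_out))
    return diff  # ~0 up to float error

# 2) masked_attention in numbers: softmax first, then zero the padded
#    positions with the 0/1 mask, then re-normalize each row to sum to 1.
def _demo_masked_attention():
    e = tf.constant([[1.0, 2.0, 3.0]])
    mask = tf.constant([[1.0, 1.0, 0.0]])             # last position is padding
    attn = nn_ops.softmax(e) * mask
    attn = attn / tf.reshape(tf.reduce_sum(attn, axis=1), [-1, 1])
    return attn  # rows sum to 1 with zero weight on the padded position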
class baseModel(object):
    """Base class for seq2seq models."""

    def __init__(self):
        # hps holds the parsed command-line arguments (hyperparameters).
        # Declare the attributes this class fills in later.
        self.arg_dec_outputs = None
        self._dec_out_state = None
        self.attn_dists = None

    def _add_embedding(self):
        # Load the pretrained embeddings (e.g. glove/glove.6B.200d.txt).
        if os.path.exists(self.hps.embed_path):
            embedding_encoder = _create_pretrained_emb_from_vocab(
                self._src_vocab, self.hps.embed_path, name="embedding_src")
            embedding_decoder = _create_pretrained_emb_from_vocab(
                self._tgt_vocab, self.hps.embed_path, name="embedding_tgt")
        else:
            # If no pretrained file is found, fall back to a randomly
            # initialized matrix of the same size.
            # NOTE: this branch only defines the encoder embedding; it would
            # also need an 'embedding_tgt' variable for the decoder lookups below.
            embedding_encoder = tf.get_variable('embedding_src',
                                                [self._src_vocab.size(), self.hps.emb_dim],
                                                dtype=tf.float32,
                                                initializer=self.trunc_norm_init)
        # embedding_lookup maps each word id in the batch to its vector.
        self.emb_enc_inputs = tf.nn.embedding_lookup(embedding_encoder, self._enc_batch)
        self.emb_arg_dec_inputs = [tf.nn.embedding_lookup(embedding_decoder, x)
                                   for x in tf.unstack(self._arg_dec_batch, axis=1)]
        if self.hps.model in ["sep_dec", "shd_dec"]:
            self.emb_kp_dec_inputs = [tf.nn.embedding_lookup(embedding_decoder, x)
                                      for x in tf.unstack(self._kp_dec_batch, axis=1)]
    def _attention_decoder(self):
        """Smoke test: run attention_decoder on constant dummy tensors."""
        self.rand_unif_init = tf.random_uniform_initializer(-0.02, 0.02, seed=123)
        # Define a 2-layer LSTM network for the decoder.
        cell1 = tf.contrib.rnn.LSTMCell(
            200, state_is_tuple=True, initializer=self.rand_unif_init)
        cell2 = tf.contrib.rnn.LSTMCell(
            200, state_is_tuple=True, initializer=self.rand_unif_init)
        cell = tf.contrib.rnn.MultiRNNCell([cell1, cell2])
        # (A real model would build 'embedding_tgt' here and embed the target
        # batch; this smoke test feeds constant dummy tensors instead.)
        # Dummy decoder inputs: 100 steps of shape (batch=32, emb_dim=200).
        emb_arg_dec_inputs = tf.unstack(tf.fill([100, 32, 200], 0.4), axis=0)
        # Dummy initial state: MultiRNNCell expects one LSTMStateTuple (c, h)
        # per layer, each of shape (32, 200).
        _dec_in_state = tuple(
            tf.contrib.rnn.LSTMStateTuple(tf.fill([32, 200], 0.6),
                                          tf.fill([32, 200], 0.6))
            for _ in range(2))
        # Dummy encoder outputs (batch=32, attn_len=100, 2*hidden=400) and a
        # padding mask (1.0 = real token, 0.0 = padding; all real here).
        encoder_outputs = tf.fill([32, 100, 400], 0.7)
        enc_padding_mask = tf.ones([32, 100], dtype=tf.float32)
        self.arg_dec_outputs, self._dec_out_state, self.attn_dists = attention_decoder(
            emb_arg_dec_inputs, _dec_in_state, encoder_outputs,
            enc_padding_mask, cell, False)
def main():
    l1 = baseModel()
    l1._attention_decoder()  # build the graph before initializing variables
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        outputs = sess.run(l1.arg_dec_outputs)
        print("decoder steps:", len(outputs), "each of shape", outputs[0].shape)

if __name__ == '__main__':
    main()