def _fuse(self):
    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        C = tf.tile(tf.expand_dims(self.c_embed_encoding, 2),
                    [1, 1, self.max_q_len, 1])
        Q = tf.tile(tf.expand_dims(self.q_embed_encoding, 1),
                    [1, self.max_p_len, 1, 1])
        S = trilinear([C, Q, C * Q], input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        # `dim` is deprecated in tf.nn.softmax; use `axis`
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, self.q_embed_encoding)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), self.c_embed_encoding)
        self.attention_outputs = [
            self.c_embed_encoding,
            self.c2q,
            self.c_embed_encoding * self.c2q,
            self.c_embed_encoding * self.q2c
        ]

    # self._params() returns, in order:
    #   self.config.batch_size if not self.demo else 1,
    #   self.max_p_len,
    #   self.max_q_len,
    #   self.config.max_ch_len,
    #   self.config.hidden_size,
    #   self.config.char_embed_size,
    #   self.config.head_size
    N, PL, QL, CL, d, dc, nh = self._params()
    if self.config.fix_pretrained_vector:
        dc = self.char_mat.get_shape()[-1]

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(self.attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every two blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=1,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=True,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))
        for i, _ in enumerate(self.enc):
            self.enc[i] = tf.reshape(self.enc[i],
                                     [N, -1, self.enc[i].get_shape()[-1]])
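# NOTE: `mask_logits` (used throughout these snippets) is assumed to follow
# the usual QANet definition: padded positions get a very large negative logit
# so the subsequent softmax assigns them effectively zero probability. A
# minimal sketch, assuming a -1e30 fill value:
def mask_logits(inputs, mask, mask_value=-1e30):
    # mask == 1 keeps the logit; mask == 0 replaces it with mask_value
    mask = tf.cast(mask, tf.float32)
    return inputs * mask + mask_value * (1.0 - mask)
# With this in place, S_ row-normalizes the similarity matrix over query
# positions and S_T (before its transpose) over context positions, which is
# exactly what the c2q and q2c matmuls above expect.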
def _fuse(self):
    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        C = tf.tile(tf.expand_dims(self.c_embed_encoding, 2),
                    [1, 1, self.max_q_len, 1])
        Q = tf.tile(tf.expand_dims(self.q_embed_encoding, 1),
                    [1, self.max_p_len, 1, 1])
        S = trilinear([C, Q, C * Q], input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        # `dim` is deprecated in tf.nn.softmax; use `axis`
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, self.q_embed_encoding)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), self.c_embed_encoding)
        self.attention_outputs = [
            self.c_embed_encoding,
            self.c2q,
            self.c_embed_encoding * self.c2q,
            self.c_embed_encoding * self.q2c
        ]

    PL, QL, CL, d, dc, nh = self._params()

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(self.attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every two blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=3,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))
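# NOTE: trilinear([C, Q, C * Q], ...) scores each (context, query) pair as
# w . [c_i; q_j; c_i * q_j], but materializing the tiled C and Q tensors costs
# O(N * PL * QL * d) memory. A sketch of an equivalent formulation that avoids
# the tiling (an alternative, not the `trilinear` helper used above; the
# variable names are ours):
def trilinear_similarity(c, q):
    # c: (N, PL, d), q: (N, QL, d)  ->  S: (N, PL, QL)
    d = c.get_shape().as_list()[-1]
    w_c = tf.get_variable("w_c", [d, 1])
    w_q = tf.get_variable("w_q", [d, 1])
    w_cq = tf.get_variable("w_cq", [1, 1, d])
    part_c = tf.tensordot(c, w_c, axes=[[2], [0]])       # (N, PL, 1)
    part_q = tf.transpose(
        tf.tensordot(q, w_q, axes=[[2], [0]]), (0, 2, 1))  # (N, 1, QL)
    part_cq = tf.matmul(c * w_cq, q, transpose_b=True)   # (N, PL, QL)
    return part_c + part_q + part_cq  # broadcast-add to (N, PL, QL)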
def forward(self):
    config = self.config
    N = config.batch_size if not self.demo else 1
    PL, QL, CL = self.c_maxlen, self.q_maxlen, config.char_limit
    d, dc, nh = config.hidden, config.char_dim, config.num_heads

    with tf.variable_scope("Input_Embedding_Layer"):
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # BiDAF-style conv-highway encoder
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=None)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv", reuse=True)
        ch_emb = tf.reduce_max(ch_emb, axis=1)
        qh_emb = tf.reduce_max(qh_emb, axis=1)
        ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
        qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c), 1.0 - self.dropout)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q), 1.0 - self.dropout)
        c_emb = tf.concat([c_emb, ch_emb], axis=2)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)
        c_emb = highway(c_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=None)
        q_emb = highway(q_emb, size=d, scope="highway",
                        dropout=self.dropout, reuse=True)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        q = residual_block(
            q_emb,
            num_blocks=1,
            num_conv_layers=4,
            kernel_size=7,
            mask=self.q_mask,
            num_filters=d,
            num_heads=nh,
            seq_len=self.q_len,
            scope="Encoder_Residual_Block",
            reuse=True,  # share the weights between passage and question
            bias=False,
            dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        C = tf.tile(tf.expand_dims(c, 2), [1, 1, self.q_maxlen, 1])
        Q = tf.tile(tf.expand_dims(q, 1), [1, self.c_maxlen, 1, 1])
        S = trilinear([C, Q, C * Q], input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        # `dim` is deprecated in tf.nn.softmax; use `axis`
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q]
        if config.q2c:
            attention_outputs.append(c * self.q2c)

    with tf.variable_scope("Model_Encoder_Layer"):
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Output_Layer"):
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1,
                 bias=False, name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1,
                 bias=False, name="end_pointer"), -1)
        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask)
        ]
        logits1, logits2 = self.logits

        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, 15)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

        losses = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits1, labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

    if config.l2_norm is not None:
        # `regularizer` is assumed to be defined at module level, e.g.
        # tf.contrib.layers.l2_regularizer(scale=config.l2_norm)
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(regularizer,
                                                         variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)
            self.shadow_vars = []
            self.global_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                if v:
                    self.shadow_vars.append(v)
                    self.global_vars.append(var)
            self.assign_vars = []
            for g, v in zip(self.global_vars, self.shadow_vars):
                self.assign_vars.append(tf.assign(g, v))
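# NOTE: the span decoding above maximizes P(start=i) * P(end=j) subject to
# 0 <= j - i <= 15 (tf.matrix_band_part(outer, 0, 15) zeroes everything else,
# capping answers at 16 tokens). The same rule for one example in NumPy,
# purely illustrative (the probabilities are made up):
import numpy as np

p_start = np.array([0.10, 0.60, 0.25, 0.05])  # softmax(logits1)
p_end = np.array([0.05, 0.15, 0.30, 0.50])    # softmax(logits2)
outer = np.outer(p_start, p_end)              # outer[i, j] = P(start=i) * P(end=j)
outer = np.triu(outer)                        # keep j >= i; band_part also caps j - i
yp1 = int(outer.max(axis=1).argmax())         # best start index -> 1
yp2 = int(outer.max(axis=0).argmax())         # best end index   -> 3
assert yp2 >= yp1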
def forward(self):
    self.c_words = tf.placeholder(tf.int32, [None, self.config.context_len],
                                  'context-words')
    self.c_chars = tf.placeholder(
        tf.int32, [None, self.config.context_len, self.config.max_char_len],
        'context-chars')
    self.c_mask = tf.sign(self.c_words)
    self.q_words = tf.placeholder(tf.int32, [None, self.config.question_len],
                                  'query-words')
    self.q_chars = tf.placeholder(
        tf.int32, [None, self.config.question_len, self.config.max_char_len],
        'query-chars')
    self.q_mask = tf.sign(self.q_words)
    self.c_len = tf.cast(tf.reduce_sum(self.c_mask, -1), tf.int32)
    self.q_len = tf.cast(tf.reduce_sum(self.q_mask, -1), tf.int32)
    self.start = tf.placeholder(tf.int32, [None], 'start-index')
    self.end = tf.placeholder(tf.int32, [None], 'end-index')

    with tf.variable_scope('input-embedding'):
        c_w = tf.nn.embedding_lookup(self.word_embed, self.c_words)
        q_w = tf.nn.embedding_lookup(self.word_embed, self.q_words)
        c_ch = layers.char_embed(self.c_chars, self.char_embed,
                                 dropout=self.dropout)
        q_ch = layers.char_embed(self.q_chars, self.char_embed,
                                 dropout=self.dropout, reuse=True)
        c = tf.concat([c_w, c_ch], -1)
        q = tf.concat([q_w, q_ch], -1)

    with tf.variable_scope('rnn'):
        c_rnn = layers.birnn(c, self.c_len, self.config.cell_size,
                             self.config.cell_type, self.dropout)
        q_rnn = layers.birnn(q, self.q_len, self.config.cell_size,
                             self.config.cell_type, self.dropout, reuse=True)

    with tf.variable_scope('attention'):
        attention = layers.bi_attention(c_rnn, q_rnn,
                                        layers.trilinear(c_rnn, q_rnn),
                                        self.c_mask, self.q_mask)
        attention = tf.layers.conv1d(attention, self.config.cell_size * 2, 1,
                                     padding='same')

    with tf.variable_scope('memory1'):
        memory1 = layers.birnn(attention, self.c_len, self.config.cell_size,
                               self.config.cell_type, self.dropout)

    with tf.variable_scope('self-attention') as scope:
        x = memory1
        self_attention = layers.bi_attention(x, x, layers.trilinear(x, x),
                                             self.c_mask, self.c_mask,
                                             only_c2q=True)
        res = tf.layers.dense(self_attention, self.config.cell_size * 2,
                              activation=tf.nn.relu)
        res = tf.layers.dropout(res, rate=self.config.dropout,
                                training=self.config.training)
        res += attention

    with tf.variable_scope('memory2'):
        memory2 = layers.birnn(res, self.c_len, self.config.cell_size,
                               self.config.cell_type, self.dropout)

    with tf.variable_scope('start-index') as scope:
        self.start_linear = tf.squeeze(tf.layers.dense(memory2, 1), -1)
        self.pred_start = tf.nn.softmax(self.start_linear)

    with tf.variable_scope('end-index') as scope:
        end_input = tf.concat(
            [tf.expand_dims(self.start_linear, -1), memory2], -1)
        memory3 = layers.birnn(end_input, self.c_len, self.config.cell_size,
                               self.config.cell_type, self.dropout)
        self.end_linear = tf.squeeze(tf.layers.dense(memory3, 1), -1)
        self.pred_end = tf.nn.softmax(self.end_linear)

    with tf.variable_scope('loss') as scope:
        loss1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.start_linear, labels=self.start)
        loss2 = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.end_linear, labels=self.end)
        loss = tf.reduce_mean(loss1 + loss2)
        lossL2 = tf.add_n([
            tf.nn.l2_loss(v) for v in tf.trainable_variables()
            if 'bias' not in v.name
        ]) * self.config.l2
        self.loss = loss + lossL2

    with tf.variable_scope('optimizer') as scope:
        optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        grads = tf.gradients(self.loss, tf.trainable_variables())
        grads, _ = tf.clip_by_global_norm(grads, self.config.grad_clip)
        grads_and_vars = zip(grads, tf.trainable_variables())
        self.optimize = optimizer.apply_gradients(
            grads_and_vars, global_step=self.global_step)

    if self.config.ema_decay > 0:
        with tf.variable_scope('ema') as scope:
            ema = tf.train.ExponentialMovingAverage(
                decay=self.config.ema_decay)
            ema_op = ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)
                assign_vars = []
                for var in tf.global_variables():
                    v = ema.average(var)
                    if v:
                        assign_vars.append(tf.assign(var, v))
                self.assign_vars = assign_vars
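# NOTE: because self.loss is re-created behind a control dependency on ema_op,
# the shadow averages only update when the loss tensor is fetched; the train
# op alone was built before the re-wiring. A sketch of the intended train/eval
# usage (`sess`, `model`, and the feed dicts are assumptions, not part of the
# snippet above):
#
#   # training: fetch the loss alongside the train op so ema_op actually runs
#   sess.run([model.optimize, model.loss], feed_dict=train_feed)
#
#   # evaluation: overwrite the live weights with their smoothed EMA values
#   sess.run(model.assign_vars)
#   starts, ends = sess.run([model.pred_start, model.pred_end],
#                           feed_dict=eval_feed)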
def forward(self):
    config = self.config
    N = config.batch_size if not self.demo else 1
    PL = self.c_maxlen
    QL = self.q_maxlen
    CL = config.char_limit   # 16
    d = config.hidden        # 96
    dc = config.char_dim     # 64
    nh = config.num_heads    # 1

    with tf.variable_scope("Input_Embedding_Layer"):
        '''
        self.ch : (N, c_maxlen, 16)
        self.qh : (N, q_maxlen, 16)
        '''
        ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                            [N * PL, CL, dc])   # (N*c_maxlen, 16, 64)
        qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                            [N * QL, CL, dc])   # (N*q_maxlen, 16, 64)
        ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
        qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

        # BiDAF-style conv-highway encoder: convolve over the chars of each
        # word in a batch of passages
        ch_emb = conv(ch_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv",
                      reuse=None)               # (N*c_maxlen, 16-5+1, 96)
        qh_emb = conv(qh_emb, d, bias=True, activation=tf.nn.relu,
                      kernel_size=5, name="char_conv",
                      reuse=True)               # (N*q_maxlen, 16-5+1, 96)
        ch_emb = tf.reduce_max(ch_emb, axis=1)  # (N*c_maxlen, 96)
        qh_emb = tf.reduce_max(qh_emb, axis=1)  # (N*q_maxlen, 96)
        ch_emb = tf.reshape(ch_emb,
                            [N, PL, ch_emb.shape[-1]])  # (N, c_maxlen, 96)
        qh_emb = tf.reshape(qh_emb,
                            [N, QL, ch_emb.shape[-1]])  # (N, q_maxlen, 96)

        '''
        self.c : (N, c_maxlen)
        self.q : (N, q_maxlen)
        '''
        c_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.c),
            1.0 - self.dropout)                 # (N, c_maxlen, 300)
        q_emb = tf.nn.dropout(
            tf.nn.embedding_lookup(self.word_mat, self.q),
            1.0 - self.dropout)                 # (N, q_maxlen, 300)
        c_emb = tf.concat([c_emb, ch_emb], axis=2)  # (N, c_maxlen, 396)
        q_emb = tf.concat([q_emb, qh_emb], axis=2)  # (N, q_maxlen, 396)
        c_emb = highway(c_emb, size=d, scope="highway", dropout=self.dropout,
                        reuse=None)             # (N, c_maxlen, 96)
        q_emb = highway(q_emb, size=d, scope="highway", dropout=self.dropout,
                        reuse=True)             # (N, q_maxlen, 96)

    with tf.variable_scope("Embedding_Encoder_Layer"):
        '''
        -> positional encoding
        -> layer normalization
        -> depthwise separable convolution
        -> self-attention
        -> feed-forward network
        In the paper, the total number of encoder blocks is 1.
        '''
        # (N, c_maxlen, 96)
        c = residual_block(c_emb,
                           num_blocks=1,
                           num_conv_layers=4,
                           kernel_size=7,
                           mask=self.c_mask,
                           num_filters=d,
                           num_heads=nh,
                           seq_len=self.c_len,
                           scope="Encoder_Residual_Block",
                           bias=False,
                           dropout=self.dropout)
        # (N, q_maxlen, 96)
        q = residual_block(
            q_emb,
            num_blocks=1,
            num_conv_layers=4,
            kernel_size=7,
            mask=self.q_mask,
            num_filters=d,
            num_heads=nh,
            seq_len=self.q_len,
            scope="Encoder_Residual_Block",
            reuse=True,  # share the weights between passage and question
            bias=False,
            dropout=self.dropout)

    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        '''
        tf.tile(input, multiples, name=None) creates a new tensor by
        replicating `input` `multiples` times: the output tensor's i-th
        dimension has input.dims(i) * multiples[i] elements, and the values
        of `input` are replicated multiples[i] times along the i-th dimension.

        c:        (N, c_maxlen, d)
        q:        (N, q_maxlen, d)
        ch_emb:   (N, c_maxlen, d)
        qh_emb:   (N, q_maxlen, d)
        C:        (N, c_maxlen, q_maxlen, d)
        Q:        (N, c_maxlen, q_maxlen, d)
        S:        (N, c_maxlen, q_maxlen)
        mask_q:   (N, 1, q_maxlen)
        mask_c:   (N, c_maxlen, 1)
        S_:       (N, c_maxlen, q_maxlen)
        S_T:      (N, q_maxlen, c_maxlen)
        self.c2q: (N, c_maxlen, d) = tf.matmul(S_, q)
        self.q2c: (N, c_maxlen, d) = tf.matmul(tf.matmul(S_, S_T), c)
        '''
        C = tf.tile(tf.expand_dims(c, 2), [1, 1, self.q_maxlen, 1])
        Q = tf.tile(tf.expand_dims(q, 1), [1, self.c_maxlen, 1, 1])
        S = trilinear([C, Q, C * Q], input_keep_prob=1.0 - self.dropout)
        mask_q = tf.expand_dims(self.q_mask, 1)
        S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
        mask_c = tf.expand_dims(self.c_mask, 2)
        S_T = tf.transpose(
            tf.nn.softmax(mask_logits(S, mask=mask_c), axis=1), (0, 2, 1))
        self.c2q = tf.matmul(S_, q)
        self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, self.c2q, c * self.c2q]
        if config.q2c:
            attention_outputs.append(c * self.q2c)

    with tf.variable_scope("Model_Encoder_Layer"):
        '''
        Paper: the layer parameters are the same as in the Embedding Encoder
        Layer, except that the convolution layer number is 2 within a block
        and the total number of blocks is 7.
        '''
        inputs = tf.concat(attention_outputs, axis=-1)
        self.enc = [conv(inputs, d, name="input_projection")]  # d = hidden = 96
        for i in range(3):
            if i % 2 == 0:  # dropout every 2 blocks
                self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
            self.enc.append(
                residual_block(self.enc[i],
                               num_blocks=7,
                               num_conv_layers=2,
                               kernel_size=5,
                               mask=self.c_mask,
                               num_filters=d,
                               num_heads=nh,
                               seq_len=self.c_len,
                               scope="Model_Encoder",
                               bias=False,
                               reuse=True if i > 0 else None,
                               dropout=self.dropout))

    with tf.variable_scope("Output_Layer"):
        '''
        tf.matrix_band_part: copy a tensor, setting everything outside a
        central band in each innermost matrix to zero.

        self.enc[i]:  (N, c_maxlen, d)
        start_logits: (N, c_maxlen)
        end_logits:   (N, c_maxlen)
        logits1:      (N, c_maxlen)
        logits2:      (N, c_maxlen)
        outer:        (N, c_maxlen, c_maxlen)
        yp1, yp2, losses, losses2: (N,)
        '''
        start_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[2]], axis=-1), 1,
                 bias=False, name="start_pointer"), -1)
        end_logits = tf.squeeze(
            conv(tf.concat([self.enc[1], self.enc[3]], axis=-1), 1,
                 bias=False, name="end_pointer"), -1)
        self.logits = [
            mask_logits(start_logits, mask=self.c_mask),
            mask_logits(end_logits, mask=self.c_mask)
        ]
        logits1, logits2 = self.logits

        losses = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits1, labels=self.y1)
        losses2 = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits2, labels=self.y2)
        self.loss = tf.reduce_mean(losses + losses2)

        # find the max-score span
        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, 15)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

        # DEBUG
        self.debug_ops.extend([
            self.enc[1], start_logits, end_logits, logits1, logits2, outer,
            self.yp1, self.yp2, losses, losses2, self.loss
        ])

    if config.l2_norm is not None:
        # `regularizer` is assumed to be defined at module level, e.g.
        # tf.contrib.layers.l2_regularizer(scale=config.l2_norm)
        variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        l2_loss = tf.contrib.layers.apply_regularization(regularizer,
                                                         variables)
        self.loss += l2_loss

    if config.decay is not None:
        self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)
            self.shadow_vars = []
            self.global_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                if v:
                    self.shadow_vars.append(v)
                    self.global_vars.append(var)
            self.assign_vars = []
            for g, v in zip(self.global_vars, self.shadow_vars):
                self.assign_vars.append(tf.assign(g, v))
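# NOTE: the encoder blocks above chain positional encoding, layer norm,
# depthwise separable convolutions, self-attention, and a feed-forward layer.
# The separable convolution is what keeps the parameter count low; a minimal
# sketch of that sub-layer (the helper name and variable shapes are ours,
# assuming a channel multiplier of 1):
def depthwise_separable_conv(inputs, kernel_size, num_filters, scope):
    # (N, L, d) -> (N, L, num_filters) with k*d + d*num_filters weights
    # instead of k*d*num_filters for a full convolution
    with tf.variable_scope(scope):
        x = tf.expand_dims(inputs, 2)  # (N, L, 1, d): treat length as height
        d = x.get_shape().as_list()[-1]
        depthwise = tf.get_variable("depthwise_filter",
                                    [kernel_size, 1, d, 1])
        pointwise = tf.get_variable("pointwise_filter",
                                    [1, 1, d, num_filters])
        x = tf.nn.separable_conv2d(x, depthwise, pointwise,
                                   strides=[1, 1, 1, 1], padding="SAME")
        return tf.squeeze(x, 2)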
def forward(self):
    self.c_words = tf.placeholder(tf.int32, [None, self.config.context_len],
                                  'context-words')
    self.c_chars = tf.placeholder(
        tf.int32, [None, self.config.context_len, self.config.max_char_len],
        'context-chars')
    self.c_mask = tf.sign(self.c_words)
    self.q_words = tf.placeholder(tf.int32, [None, self.config.question_len],
                                  'query-words')
    self.q_chars = tf.placeholder(
        tf.int32, [None, self.config.question_len, self.config.max_char_len],
        'query-chars')
    self.q_mask = tf.sign(self.q_words)
    self.c_len = tf.cast(tf.reduce_sum(self.c_mask, -1), tf.int32)
    self.q_len = tf.cast(tf.reduce_sum(self.q_mask, -1), tf.int32)
    self.start = tf.placeholder(tf.int32, [None], 'start-index')
    self.end = tf.placeholder(tf.int32, [None], 'end-index')

    with tf.variable_scope('input-embedding'):
        c_w = tf.nn.embedding_lookup(self.word_embed, self.c_words)
        q_w = tf.nn.embedding_lookup(self.word_embed, self.q_words)
        c_ch = layers.char_embed(self.c_chars, self.char_embed,
                                 dropout=self.dropout)
        q_ch = layers.char_embed(self.q_chars, self.char_embed,
                                 dropout=self.dropout, reuse=True)
        c = tf.concat([c_w, c_ch], -1)
        q = tf.concat([q_w, q_ch], -1)

    with tf.variable_scope('highway-1'):
        c = layers.highway(c, self.config.embed_size, dropout=self.dropout)
        q = layers.highway(q, self.config.embed_size, dropout=self.dropout,
                           reuse=True)

    with tf.variable_scope('highway-2'):
        c = layers.highway(c, self.config.embed_size, dropout=self.dropout)
        q = layers.highway(q, self.config.embed_size, dropout=self.dropout,
                           reuse=True)

    with tf.variable_scope('projection'):
        c = tf.layers.conv1d(c, self.config.filters, 1, padding='same')
        q = tf.layers.conv1d(q, self.config.filters, 1, padding='same',
                             reuse=True)

    with tf.variable_scope('input-encoder'):
        c = layers.encoder_block(c, num_blocks=1, num_convolutions=4,
                                 kernel=7, mask=self.c_mask,
                                 dropout=self.dropout)
        q = layers.encoder_block(q, num_blocks=1, num_convolutions=4,
                                 kernel=7, mask=self.q_mask,
                                 dropout=self.dropout, reuse=True)

    with tf.variable_scope('attention'):
        attention = layers.bi_attention(c, q, layers.trilinear(c, q),
                                        self.c_mask, self.q_mask)
        attention = tf.layers.conv1d(attention, self.config.filters, 1,
                                     padding='same')

    modeling = [attention]
    for i in range(3):
        reuse = i > 0
        m = layers.encoder_block(modeling[i], num_blocks=7,
                                 num_convolutions=2, kernel=5,
                                 mask=self.c_mask, dropout=self.dropout,
                                 reuse=reuse)
        if i % 2 == 0:
            m = tf.nn.dropout(m, 1.0 - self.dropout)
        modeling.append(m)

    with tf.variable_scope('start-index') as scope:
        self.start_linear = tf.concat([modeling[-3], modeling[-2]], -1)
        self.start_linear = tf.squeeze(
            tf.layers.dense(self.start_linear, 1, use_bias=False), -1)
        self.pred_start = tf.nn.softmax(self.start_linear, name='pred-start')

    with tf.variable_scope('end-index') as scope:
        self.end_linear = tf.concat([modeling[-3], modeling[-1]], -1)
        self.end_linear = tf.squeeze(
            tf.layers.dense(self.end_linear, 1, use_bias=False), -1)
        self.pred_end = tf.nn.softmax(self.end_linear, name='pred-end')

    with tf.variable_scope('loss') as scope:
        loss1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.start_linear, labels=self.start)
        loss2 = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.end_linear, labels=self.end)
        loss = tf.reduce_mean(loss1 + loss2)
        lossL2 = tf.add_n([
            tf.nn.l2_loss(v) for v in tf.trainable_variables()
            if 'bias' not in v.name
        ]) * self.config.l2
        self.loss = loss + lossL2

    with tf.variable_scope('optimizer') as scope:
        optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        grads = tf.gradients(self.loss, tf.trainable_variables())
        grads, _ = tf.clip_by_global_norm(grads, self.config.grad_clip)
        grads_and_vars = zip(grads, tf.trainable_variables())
        self.optimize = optimizer.apply_gradients(
            grads_and_vars, global_step=self.global_step)

    if self.config.ema_decay > 0:
        with tf.variable_scope('ema') as scope:
            ema = tf.train.ExponentialMovingAverage(
                decay=self.config.ema_decay)
            ema_op = ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)
                assign_vars = []
                for var in tf.global_variables():
                    v = ema.average(var)
                    if v:
                        assign_vars.append(tf.assign(var, v))
                self.assign_vars = assign_vars
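# NOTE: layers.highway (used twice above, in 'highway-1' and 'highway-2') is
# not shown. A minimal sketch of the standard formulation
# H(x) * T(x) + x * (1 - T(x)), assuming the input was already projected to
# `size` so the residual term type-checks; the dense-layer details and the
# negative gate bias are our assumptions:
def highway(x, size, dropout=0.0, reuse=None, scope="highway"):
    with tf.variable_scope(scope, reuse=reuse):
        # gate T starts near 0 (bias -1), so the layer begins close to identity
        T = tf.layers.dense(x, size, activation=tf.sigmoid, name="gate",
                            bias_initializer=tf.constant_initializer(-1.0))
        H = tf.layers.dense(x, size, activation=tf.nn.relu, name="transform")
        H = tf.nn.dropout(H, 1.0 - dropout)
        return H * T + x * (1.0 - T)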