def sample(self):
    """Sampling function used during inference; reuses the parameters defined in decode()."""
    with tf.variable_scope("Decoder_Layer", reuse=True):
        memory = self.enc
        # init the decoder's state
        h = tf.nn.tanh(
            _linear(self.init_h, output_size=self.d, bias=False, scope="h_initial"))
        c = tf.nn.tanh(
            _linear(self.init_c, output_size=self.d, bias=False, scope="c_initial"))
        hh = tf.zeros((self.N, self.d))  # the attention vector from the previous step
        state = (c, h) if self.layer == 1 else [(c, h) for _ in range(self.layer)]
        prev, attn_w = None, None
        symbols = []  # the sampled word at each step
        prev_probs = tf.zeros(self.N)  # the accumulated log probabilities of the samples
        # the ground-truth question; only the start token will be used
        oups = tf.split(self.que, [1] * (self.QL + 2), 1)
        for i, inp in enumerate(oups):
            einp = tf.reshape(
                tf.nn.embedding_lookup(self.plus_word_mat, inp), [self.N, self.dw])
            if prev is not None:  # from the second step on, feed back the sampled word
                with tf.variable_scope("loop_function", reuse=True):
                    einp, prev_symbol, prev_probs = self._loop_function_sample(
                        prev, attn_w, prev_probs, i)
                symbols.append(prev_symbol)
            cinp = tf.concat([einp, hh], 1)
            # update cell
            h, state = self.decoder_cell(cinp, state)
            # compute context vector
            attn, _, attn_w = bilinear_attention(
                tf.expand_dims(h, 1),
                units=self.d,
                num_heads=1,
                # attns=tf.expand_dims(tf.reduce_sum(coverage, 0), 1),
                memory=memory,
                scope="temporal_attention",
                mask=self.para_mask,
                bias=False,
                return_weights=True)
            attn_dim = attn.get_shape().as_list()[-1]
            attn = tf.reshape(attn, [-1, attn_dim])
            attn_w = tf.reshape(attn_w, [-1, self.PL])
            # attention vector
            hh = tf.nn.tanh(
                _linear(tf.concat([attn, h], 1), output_size=self.d, bias=False, scope="hh"))
            with tf.variable_scope("AttnOutputProjection"):
                # maxout
                output = _linear(tf.concat([attn, h], 1), output_size=2 * self.dw, bias=False, scope="maxout")
                output = tf.reshape(output, [-1, self.dw, 2])
                output = tf.reduce_max(output, 2)
            prev = output
        # process the last symbol
        einp, prev_symbol, prev_probs = self._loop_function_sample(prev, attn_w, prev_probs, i)
        symbols.append(prev_symbol)
        return symbols, tf.expand_dims(prev_probs, 1)
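
# Hedged sketch (an assumption, not the project's actual helper): a minimal
# _loop_function_sample, to illustrate what sample() expects from it. The
# real method also receives the attention logits `attn_w` and the step index
# `i` (e.g. for copy/pointer handling), which are ignored here; tying the
# output projection to the embedding matrix is also an assumption.
def _loop_function_sample_sketch(prev, word_mat, prev_probs):
    import tensorflow as tf
    # prev: [N, dw] maxout output; project to vocabulary logits (tied embeddings assumed)
    logits = tf.matmul(prev, word_mat, transpose_b=True)           # [N, V]
    # draw one word id per batch element from the categorical distribution
    prev_symbol = tf.reshape(tf.multinomial(logits, 1), [-1])      # [N], int64
    # accumulate the log probability of the sampled word
    log_probs = tf.nn.log_softmax(logits)                          # [N, V]
    idx = tf.stack([tf.range(tf.shape(logits)[0]),
                    tf.cast(prev_symbol, tf.int32)], axis=1)       # [N, 2]
    prev_probs += tf.gather_nd(log_probs, idx)                     # [N]
    # embed the sampled word as the next step's input
    einp = tf.nn.embedding_lookup(word_mat, prev_symbol)           # [N, dw]
    return einp, prev_symbol, prev_probs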
def decode(self, que, reuse=None):
    """Decoding function used during training; the decoder is a 2-layer unidirectional LSTM."""
    with tf.variable_scope("Decoder_Layer", reuse=reuse):
        memory = self.enc
        # init the decoder's state
        h = tf.nn.tanh(
            _linear(self.init_h, output_size=self.d, bias=False, scope="h_initial"))
        c = tf.nn.tanh(
            _linear(self.init_c, output_size=self.d, bias=False, scope="c_initial"))
        hh = tf.zeros((self.N, self.d))  # the attention vector from the previous step
        state = (c, h) if self.layer == 1 else [(c, h) for _ in range(self.layer)]
        attn_ws = []  # save every step's attention logits
        outputs = []  # save every step's output vectors
        # the ground-truth question
        oups = tf.split(que, [1] * (self.QL + 2), 1)
        for i, inp in enumerate(oups):
            if i > 0:
                tf.get_variable_scope().reuse_variables()
            # word embedding + previous attention vector
            einp = tf.reshape(
                tf.nn.embedding_lookup(self.plus_word_mat, inp), [self.N, self.dw])
            cinp = tf.concat([einp, hh], 1)
            # update cell
            h, state = self.decoder_cell(cinp, state)
            # attention: obtain the context vector and attention logits
            attn, _, attn_w = bilinear_attention(
                tf.expand_dims(h, 1),
                units=self.d,
                num_heads=1,
                memory=memory,
                scope="temporal_attention",
                mask=self.para_mask,
                bias=False,
                return_weights=True)
            attn_dim = attn.get_shape().as_list()[-1]
            attn = tf.reshape(attn, [-1, attn_dim])
            attn_w = tf.reshape(attn_w, [-1, self.PL])
            attn_ws.append(attn_w)
            # attention vector
            hh = tf.nn.tanh(
                _linear(tf.concat([attn, h], 1), output_size=self.d, bias=False, scope="hh"))
            with tf.variable_scope("AttnOutputProjection"):
                # maxout
                output = _linear(tf.concat([attn, h], 1), output_size=2 * self.dw, bias=False, scope="maxout")
                output = tf.reshape(output, [-1, self.dw, 2])
                output = tf.reduce_max(output, 2)
            outputs.append(output)
        return outputs, oups, attn_ws
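
# Hedged sketch (an assumption): _linear is not defined in this file. Judging
# from the call sites above, it is presumably the classic TF1 seq2seq-style
# projection y = x W (+ b), creating its variables under `scope` so that
# decode(), sample(), and search() can share weights via variable_scope reuse.
def _linear_sketch(x, output_size, bias=True, scope=None):
    import tensorflow as tf
    with tf.variable_scope(scope or "linear"):
        input_size = x.get_shape().as_list()[-1]
        weight = tf.get_variable("weight", [input_size, output_size])
        out = tf.matmul(x, weight)
        if bias:
            out += tf.get_variable("bias", [output_size],
                                   initializer=tf.zeros_initializer())
        return out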
def search(self, beam_size, prev_probs=None):
    """Beam search function used during inference; reuses the parameters defined in decode()."""
    with tf.variable_scope("Decoder_Layer", reuse=True):
        memory = self.enc
        # specify the loop function, either for standard beam search or diverse beam search
        loop_function = self._loop_function_diverse_search if self.diverse_beam else self._loop_function_search
        # init the decoder's state
        h = tf.nn.tanh(
            _linear(self.init_h, output_size=self.d, bias=False, scope="h_initial"))
        c = tf.nn.tanh(
            _linear(self.init_c, output_size=self.d, bias=False, scope="c_initial"))
        hh = tf.zeros((self.N, 1, self.d))  # the attention vector from the previous step
        state = (c, h) if self.layer == 1 else [(c, h) for _ in range(self.layer)]
        prev, attn_w = None, None  # the output vector and attention logits from the previous step
        # the accumulated log probabilities of the beam
        prev_probs = prev_probs if prev_probs is not None else tf.zeros((self.N, 1))
        finished = tf.cast(tf.zeros((self.N, 1)), tf.bool)  # whether </S> has been encountered
        symbols = []  # the output words at each step in the beam
        attn_ws = []  # the attention logits at each step in the beam
        # the decoder states at each step in the beam
        hs = [tf.reshape(h, [self.N, 1, self.d])]
        # the ground-truth question; only the start token will be used
        oups = tf.split(self.que, [1] * (self.QL + 2), 1)
        for i, inp in enumerate(oups):
            einp = tf.nn.embedding_lookup(self.plus_word_mat, inp)
            if prev is not None:  # from the second step on
                with tf.variable_scope("loop_function", reuse=True):
                    einp, prev_probs, index, prev_symbol, finished = loop_function(
                        beam_size, prev, attn_w, prev_probs, finished, i)
                hh = tf.gather_nd(hh, index)  # update prev attention vector
                state = tuple(tf.gather_nd(s, index) for s in state) if self.layer == 1 else \
                    [tuple(tf.gather_nd(s, index) for s in sta) for sta in state]  # update prev state
                for j, symbol in enumerate(symbols):
                    symbols[j] = tf.gather_nd(symbol, index)  # update prev symbols
                symbols.append(prev_symbol)
                for j, hsi in enumerate(hs):
                    hs[j] = tf.gather_nd(hsi, index)
            # update cell
            state = tuple(tf.reshape(s, [-1, self.d]) for s in state) if self.layer == 1 else \
                [tuple(tf.reshape(s, [-1, self.d]) for s in sta) for sta in state]
            cinp = tf.concat([einp, hh], -1)
            cinp_dim = cinp.get_shape().as_list()[-1]
            h, state = self.decoder_cell(tf.reshape(cinp, [-1, cinp_dim]), state)
            # compute context vector
            attn, _, attn_w = bilinear_attention(
                tf.reshape(h, [self.N, -1, self.d]),
                units=self.d,
                num_heads=1,
                memory=memory,
                mask=self.para_mask,
                scope="temporal_attention",
                bias=False,
                return_weights=True)
            attn_dim = attn.get_shape().as_list()[-1]
            attn = tf.reshape(attn, [-1, attn_dim])
            attn_w = tf.reshape(attn_w, [self.N, -1, self.PL])
            attn_ws.append(attn_w)
            # attention vector
            hh = tf.nn.tanh(
                _linear(tf.concat([attn, h], -1), output_size=self.d, bias=False, scope="hh"))
            hh = tf.reshape(hh, [self.N, -1, self.d])
            # reshape for next step's indexing convenience
            state = tuple(tf.reshape(s, [self.N, -1, self.d]) for s in state) if self.layer == 1 else \
                [tuple(tf.reshape(s, [self.N, -1, self.d]) for s in sta) for sta in state]
            hs.append(tf.reshape(h, [self.N, -1, self.d]))
            with tf.variable_scope("AttnOutputProjection"):
                # maxout
                output = _linear(tf.concat([attn, h], -1), output_size=2 * self.dw, bias=False, scope="maxout")
                output = tf.reshape(output, [self.N, -1, self.dw, 2])
                output = tf.reduce_max(output, -1)
            prev = output
        # process the last symbol
        einp, prev_probs, index, prev_symbol, finished = loop_function(
            beam_size, prev, attn_w, prev_probs, finished, i)
        for j, symbol in enumerate(symbols):
            symbols[j] = tf.gather_nd(symbol, index)  # update prev symbols
        symbols.append(prev_symbol)
        return symbols, prev_probs
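
# Standalone toy demo (not part of the model class): how the tf.gather_nd
# re-indexing in search() reorders per-beam tensors. `index` carries
# (batch, parent_beam) pairs chosen by the loop function, so every cached
# tensor (hh, the states, past symbols) is re-selected along the beam axis.
def _beam_reindex_demo():
    import tensorflow as tf
    N, B, d = 2, 3, 4                                # toy batch / beam / hidden sizes
    hh = tf.reshape(tf.range(N * B * d, dtype=tf.float32), [N, B, d])
    # suppose beam 2 spawned two survivors in batch 0, beam 1 two in batch 1
    parent = tf.constant([[2, 2, 0], [0, 1, 1]])     # [N, B] surviving parent beam ids
    batch = tf.tile(tf.expand_dims(tf.range(N), 1), [1, B])
    index = tf.stack([batch, parent], axis=-1)       # [N, B, 2] (batch, beam) pairs
    reordered = tf.gather_nd(hh, index)              # [N, B, d], beams re-selected
    with tf.Session() as sess:
        print(sess.run(reordered)[0, 0])             # equals hh[0, 2]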
def output(self, input):
    """Output projection: a ReLU hidden layer followed by a 2-unit linear prediction."""
    with tf.variable_scope("Output_Layer"):
        hidden = tf.nn.relu(
            _linear(input, output_size=self.d, bias=True, scope="hidden"))
        pred = _linear(hidden, output_size=2, bias=True, scope="output")
        return pred
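
# Hypothetical usage (the placeholder names and the loss choice are
# assumptions, not taken from this file): the 2-unit prediction from
# output() is a pair of logits, typically consumed by a softmax
# cross-entropy loss for a binary decision.
def _output_usage_sketch(model):
    import tensorflow as tf
    features = tf.placeholder(tf.float32, [None, model.d])  # [N, d] input features
    labels = tf.placeholder(tf.int32, [None])               # [N] gold class ids (0 or 1)
    pred = model.output(features)                           # [N, 2] logits
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=pred))
    return loss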