def _compute_logits(self, rnn_out):
  """Projects RNN outputs to vocabulary logits with a 1- or 2-layer MLP."""
  if self._num_layers == 1 and self._weights is not None:
    assert tensor_utils.shape(rnn_out, -1) == self._hidden_dim
  if self._num_layers == 1:
    with tf.variable_scope("mlp1", reuse=self._reuse):
      if self._weights is None:
        # Uniform init on [-sqrt(3/h), sqrt(3/h)], i.e. variance 1/h.
        scale = (3.0 / self._hidden_dim) ** 0.5
        weight_initializer = tf.random_uniform_initializer(
            minval=-scale, maxval=scale)
        self._linear1 = Linear(
            rnn_out, self._output_size, True, weights=None,
            weight_initializer=weight_initializer)
      else:
        # Tied output weights were passed in explicitly.
        self._linear1 = Linear(
            rnn_out, self._output_size, True, weights=self._weights)
      logits = self._linear1(rnn_out)
  else:
    assert self._num_layers == 2
    with tf.variable_scope("mlp1", reuse=self._reuse):
      if self._linear1 is None:
        self._linear1 = Linear(
            rnn_out, self._hidden_dim, True, weights=None,
            weight_initializer=tf.contrib.layers.xavier_initializer())
      hidden = self._linear1(rnn_out)
      if self._activation:
        hidden = self._activation(hidden)
      if self._mode == tf.estimator.ModeKeys.TRAIN and self._dropout > 0.:
        hidden = tf.nn.dropout(hidden, keep_prob=1. - self._dropout)
    with tf.variable_scope("mlp2", reuse=self._reuse):
      if self._linear2 is None:
        if self._weights is None:
          scale = (3.0 / self._hidden_dim) ** 0.5
          weight_initializer = tf.random_uniform_initializer(
              minval=-scale, maxval=scale)
          self._linear2 = Linear(
              hidden, self._output_size, True, weights=None,
              weight_initializer=weight_initializer)
        else:
          self._linear2 = Linear(
              hidden, self._output_size, True, weights=self._weights)
      logits = self._linear2(hidden)
  return logits
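
# A minimal sketch (not part of the original module) of what the two-layer
# projection above computes, in plain NumPy, so the shapes and the
# initializer bound are concrete: scale = sqrt(3 / hidden_dim) gives
# uniform weights with variance 1 / hidden_dim. The function name and
# arguments here are illustrative assumptions.
import numpy as np


def np_compute_logits(rnn_out, hidden_dim, output_size, seed=0):
  """Two-layer version: tanh hidden layer, then a linear map to logits."""
  rng = np.random.RandomState(seed)
  scale = (3.0 / hidden_dim) ** 0.5
  w1 = rng.uniform(-scale, scale, size=(rnn_out.shape[-1], hidden_dim))
  w2 = rng.uniform(-scale, scale, size=(hidden_dim, output_size))
  hidden = np.tanh(rnn_out.dot(w1))  # mlp1; biases omitted for brevity
  return hidden.dot(w2)              # mlp2 -> [batch, output_size]


# Example: np_compute_logits(np.ones((4, 8)), hidden_dim=8, output_size=10)
# returns an array of shape (4, 10).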
def beam_decoder(features, mode, vocab, encoder_outputs, hps):
  """Beam search decoder.

  Args:
    features: Dictionary of input Tensors.
    mode: train or eval. Keys from tf.estimator.ModeKeys.
    vocab: Vocabulary object (provides word2id and size).
    encoder_outputs: Output tensors from the encoder.
    hps: Hyperparameters.

  Returns:
    DecoderOutputs holding the decoded beams and their lengths.
  """
  assert mode != tf.estimator.ModeKeys.TRAIN, "Not using beam in training."
  embeddings = encoder_outputs.embeddings
  mem_input = encoder_outputs.mem_input
  batch_size = tensor_utils.shape(features["src_len"], 0)
  src_len, src_inputs = features["src_len"], features["src_inputs"]
  src_mask = tf.sequence_mask(src_len, tf.shape(src_inputs)[1])
  if hps.att_neighbor:
    neighbor_len, neighbor_inputs = (features["neighbor_len"],
                                     features["neighbor_inputs"])
    neighbor_mask = tf.sequence_mask(neighbor_len,
                                     tf.shape(neighbor_inputs)[1])
    inputs = tf.concat([src_inputs, neighbor_inputs], 1)
    mask = tf.concat([src_mask, neighbor_mask], axis=1)
    # Attend over the full concatenated sequence; padding is handled by mask.
    # lens = src_len + neighbor_len
    lens = tf.shape(mask)[1] * tf.ones([batch_size], tf.int32)
  else:
    inputs = features["src_inputs"]
    lens = features["src_len"]
    mask = src_mask
  sparse_inputs = None
  float_mask = tf.cast(mask, dtype=tf.float32)
  if hps.use_copy:
    sparse_inputs = sparse_map(tf.expand_dims(inputs, axis=2),
                               tf.expand_dims(float_mask, axis=2),
                               vocab.size())
    sparse_inputs = sparse_tile_batch(sparse_inputs,
                                      multiplier=hps.beam_width)
  tiled_mask = tf.contrib.seq2seq.tile_batch(mask, multiplier=hps.beam_width)
  inputs = tf.contrib.seq2seq.tile_batch(inputs, multiplier=hps.beam_width)
  lens = tf.contrib.seq2seq.tile_batch(lens, multiplier=hps.beam_width)

  def _beam_decode(cell):
    """Runs beam search decoding with the fully wrapped cell."""
    with tf.variable_scope("beam_decoder"):
      initial_state = cell.zero_state(
          batch_size=batch_size * hps.beam_width, dtype=tf.float32)
      if hps.use_bridge:
        h_state, c_state = encoder_outputs.states
        h_state = tf.contrib.seq2seq.tile_batch(
            h_state, multiplier=hps.beam_width)
        c_state = tf.contrib.seq2seq.tile_batch(
            c_state, multiplier=hps.beam_width)
        initial_cell_state = tf.contrib.rnn.LSTMStateTuple(h_state, c_state)
        initial_state = initial_state.clone(
            cell_state=(initial_cell_state,))
      decoder = tf.contrib.seq2seq.BeamSearchDecoder(
          cell=cell,
          embedding=embeddings,
          start_tokens=tf.fill([batch_size],
                               vocab.word2id(data.START_DECODING)),
          end_token=vocab.word2id(data.STOP_DECODING),
          initial_state=initial_state,
          beam_width=hps.beam_width,
          length_penalty_weight=hps.length_norm,
          coverage_penalty_weight=hps.cp)
      with tf.variable_scope("dynamic_decode", reuse=tf.AUTO_REUSE):
        decoder_outputs, _, decoder_len = tf.contrib.seq2seq.dynamic_decode(
            decoder=decoder, maximum_iterations=hps.max_dec_steps)
      return decoder_outputs, decoder_len

  # [batch_size * beam_width, src_len, encoder_dim]
  att_context = tf.contrib.seq2seq.tile_batch(encoder_outputs.att_context,
                                              multiplier=hps.beam_width)
  copy_context = None
  if hps.use_copy:
    copy_context = tf.contrib.seq2seq.tile_batch(
        encoder_outputs.copy_context, multiplier=hps.beam_width)
  with tf.variable_scope("attention", reuse=tf.AUTO_REUSE):
    if hps.att_type == "luong":
      attention = tf.contrib.seq2seq.LuongAttention(
          num_units=hps.decoder_dim,
          memory=att_context,
          memory_sequence_length=lens)
    elif hps.att_type == "bahdanau":
      attention = tf.contrib.seq2seq.BahdanauAttention(
          num_units=hps.decoder_dim,
          memory=att_context,
          memory_sequence_length=lens)
    elif hps.att_type == "hyper":
      attention = HyperAttention(
          num_units=hps.decoder_dim,
          mem_input=mem_input,
          hps=hps,
          memory=att_context,
          use_beam=True,
          memory_sequence_length=lens)
    elif hps.att_type == "my":
      attention = MyAttention(
          num_units=hps.decoder_dim,
          memory=att_context,
          memory_sequence_length=lens,
          mask=tiled_mask)
  with tf.variable_scope("rnn_decoder", reuse=tf.AUTO_REUSE):
    decoder_cell = get_rnn_cell(
        mode=mode,
        hps=hps,
        input_dim=hps.decoder_dim + hps.emb_dim,
        num_units=hps.decoder_dim,
        num_layers=hps.num_decoder_layers,
        dropout=hps.decoder_drop,
        mem_input=mem_input,
        use_beam=True,
        cell_type=hps.rnn_cell)
  with tf.variable_scope("attention_wrapper", reuse=tf.AUTO_REUSE):
    decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
        decoder_cell,
        attention,
        attention_layer_size=hps.decoder_dim,
        alignment_history=hps.use_copy)
  with tf.variable_scope("output_projection", reuse=tf.AUTO_REUSE):
    weights = tf.transpose(embeddings) if hps.tie_embedding else None
    hidden_dim = hps.emb_dim if hps.tie_embedding else hps.decoder_dim
    decoder_cell = OutputWrapper(
        decoder_cell,
        num_layers=hps.num_mlp_layers,
        hidden_dim=hidden_dim,
        output_size=vocab.size() if hps.tie_embedding else hps.output_size,
        weights=weights,
        dropout=hps.out_drop,
        use_copy=hps.use_copy,
        encoder_emb=copy_context,
        sparse_inputs=sparse_inputs,
        mask=tf.cast(tiled_mask, dtype=tf.float32),
        hps=hps,
        mode=mode,
        reuse=tf.AUTO_REUSE)
  decoder_outputs, decoder_len = _beam_decode(decoder_cell)
  return DecoderOutputs(decoder_outputs=decoder_outputs,
                        decoder_len=decoder_len)
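
# A hedged sketch (assumption, not from the original module) of the
# tile_batch semantics beam_decoder relies on: every per-batch tensor
# (inputs, lens, mask, att_context) is repeated beam_width times, with each
# example's copies kept adjacent, before BeamSearchDecoder sees it. Requires
# TF 1.x with tf.contrib; the demo function name is illustrative.
def _tile_batch_demo():
  x = tf.constant([[1, 2], [3, 4]])  # batch_size=2
  tiled = tf.contrib.seq2seq.tile_batch(x, multiplier=3)
  with tf.Session() as sess:
    print(sess.run(tiled))
    # [[1 2] [1 2] [1 2] [3 4] [3 4] [3 4]]: batch_size * multiplier rows,
    # matching the batch_size * hps.beam_width zero_state in _beam_decode.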
def basic_decoder(features, mode, vocab, encoder_outputs, hps):
  """Basic decoder for training (teacher forcing) and greedy decoding.

  Args:
    features: Dictionary of input Tensors.
    mode: train or eval. Keys from tf.estimator.ModeKeys.
    vocab: Vocabulary object (provides word2id and size).
    encoder_outputs: Output tensors from the encoder.
    hps: Hyperparameters.

  Returns:
    A pair of DecoderOutputs. In train mode the second element is None;
    otherwise it holds the teacher-forced outputs used to compute the loss.
  """
  embeddings = encoder_outputs.embeddings
  mem_input = encoder_outputs.mem_input
  batch_size = tensor_utils.shape(mem_input, 0)
  src_len, src_inputs = features["src_len"], features["src_inputs"]
  src_mask = tf.sequence_mask(src_len, tf.shape(src_inputs)[1])
  if hps.att_neighbor:
    neighbor_len, neighbor_inputs = (features["neighbor_len"],
                                     features["neighbor_inputs"])
    neighbor_mask = tf.sequence_mask(neighbor_len,
                                     tf.shape(neighbor_inputs)[1])
    inputs = tf.concat([src_inputs, neighbor_inputs], 1)
    lens = src_len + neighbor_len
    mask = tf.concat([src_mask, neighbor_mask], axis=1)
  else:
    inputs = features["src_inputs"]
    lens = features["src_len"]
    mask = src_mask
  sparse_inputs = None
  float_mask = tf.cast(mask, dtype=tf.float32)
  if hps.use_copy:
    sparse_inputs = sparse_map(tf.expand_dims(inputs, axis=2),
                               tf.expand_dims(float_mask, axis=2),
                               vocab.size())
  # [batch_size, dec_len]
  decoder_inputs = features["decoder_inputs"]
  # [batch_size, dec_len, emb_dim]
  decoder_input_emb = tf.nn.embedding_lookup(embeddings, decoder_inputs)
  if mode == tf.estimator.ModeKeys.TRAIN and hps.emb_drop > 0.:
    decoder_input_emb = tf.nn.dropout(decoder_input_emb,
                                      keep_prob=1.0 - hps.emb_drop)

  def _decode(cell, helper):
    """Decode function.

    Args:
      cell: RNN cell.
      helper: A helper instance from tf.contrib.seq2seq.

    Returns:
      Decoded outputs and lengths.
    """
    with tf.variable_scope("decoder"):
      initial_state = cell.zero_state(batch_size, tf.float32)
      if hps.use_bridge:
        h_state, c_state = encoder_outputs.states
        initial_cell_state = tf.contrib.rnn.LSTMStateTuple(h_state, c_state)
        initial_state = initial_state.clone(
            cell_state=(initial_cell_state,))
      decoder = tf.contrib.seq2seq.BasicDecoder(
          cell=cell, helper=helper, initial_state=initial_state)
      with tf.variable_scope("dynamic_decode", reuse=tf.AUTO_REUSE):
        decoder_outputs, _, decoder_len = tf.contrib.seq2seq.dynamic_decode(
            decoder=decoder, maximum_iterations=hps.max_dec_steps)
      return decoder_outputs, decoder_len

  att_context = encoder_outputs.att_context
  with tf.variable_scope("attention"):
    if hps.att_type == "luong":
      attention = tf.contrib.seq2seq.LuongAttention(
          num_units=hps.decoder_dim,
          memory=att_context,
          memory_sequence_length=lens)
    elif hps.att_type == "bahdanau":
      attention = tf.contrib.seq2seq.BahdanauAttention(
          num_units=hps.decoder_dim,
          memory=att_context,
          memory_sequence_length=lens)
    elif hps.att_type == "hyper":
      attention = HyperAttention(
          num_units=hps.decoder_dim,
          mem_input=mem_input,
          hps=hps,
          memory=att_context,
          use_beam=False,
          memory_sequence_length=lens)
    elif hps.att_type == "my":
      attention = MyAttention(
          num_units=hps.decoder_dim,
          memory=att_context,
          memory_sequence_length=lens,
          mask=mask)
  with tf.variable_scope("rnn_decoder"):
    decoder_cell = get_rnn_cell(
        mode=mode,
        hps=hps,
        input_dim=hps.decoder_dim + hps.emb_dim,
        num_units=hps.decoder_dim,
        num_layers=hps.num_decoder_layers,
        dropout=hps.decoder_drop,
        mem_input=mem_input,
        use_beam=False,
        cell_type=hps.rnn_cell)
  with tf.variable_scope("attention_wrapper"):
    decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
        decoder_cell,
        attention,
        attention_layer_size=hps.decoder_dim,
        alignment_history=hps.use_copy)
  with tf.variable_scope("output_projection"):
    weights = tf.transpose(embeddings) if hps.tie_embedding else None
    hidden_dim = hps.emb_dim if hps.tie_embedding else hps.decoder_dim
    decoder_cell = OutputWrapper(
        decoder_cell,
        num_layers=hps.num_mlp_layers,
        hidden_dim=hidden_dim,
        output_size=hps.output_size,
        # output_size=vocab.size(),
        weights=weights,
        dropout=hps.out_drop,
        use_copy=hps.use_copy,
        encoder_emb=encoder_outputs.copy_context,
        sparse_inputs=sparse_inputs,
        mask=float_mask,
        mode=mode,
        hps=hps)
  if mode == tf.estimator.ModeKeys.TRAIN:
    if hps.sampling_probability > 0.:
      helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
          inputs=decoder_input_emb,
          sequence_length=features["decoder_len"],
          embedding=embeddings,
          sampling_probability=hps.sampling_probability)
    else:
      helper = tf.contrib.seq2seq.TrainingHelper(decoder_input_emb,
                                                 features["decoder_len"])
    decoder_outputs, _ = _decode(decoder_cell, helper=helper)
    return DecoderOutputs(decoder_outputs=decoder_outputs,
                          decoder_len=features["decoder_len"]), None

  # Teacher-forced pass, used to compute the loss at eval time.
  teacher_helper = tf.contrib.seq2seq.TrainingHelper(decoder_input_emb,
                                                     features["decoder_len"])
  teacher_decoder_outputs, _ = _decode(decoder_cell, helper=teacher_helper)
  helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
      embedding=embeddings,
      start_tokens=tf.fill([batch_size], vocab.word2id(data.START_DECODING)),
      end_token=vocab.word2id(data.STOP_DECODING))
  decoder_outputs, decoder_len = _decode(decoder_cell, helper=helper)
  return (DecoderOutputs(decoder_outputs=decoder_outputs,
                         decoder_len=decoder_len),
          DecoderOutputs(decoder_outputs=teacher_decoder_outputs,
                         decoder_len=features["decoder_len"]))
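
# A hedged call-site sketch (assumption; build_decoder is illustrative and
# not part of the original module). It shows the contract of the two
# decoders above: basic_decoder returns a (decoded, teacher-forced) pair,
# while beam_decoder returns a single DecoderOutputs and asserts that it is
# never run in training.
def build_decoder(features, mode, vocab, encoder_outputs, hps):
  if mode == tf.estimator.ModeKeys.TRAIN or hps.beam_width <= 1:
    return basic_decoder(features, mode, vocab, encoder_outputs, hps)
  return beam_decoder(features, mode, vocab, encoder_outputs, hps), None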
def __init__(self,
             cell,
             num_layers,
             hidden_dim,
             output_size,
             weights=None,
             activation=tf.tanh,
             dropout=0.,
             use_copy=False,
             encoder_emb=None,
             sparse_inputs=None,
             mask=None,
             hps=None,
             mode=tf.estimator.ModeKeys.EVAL,
             reuse=None):
  """Create a cell with output projection.

  Args:
    cell: An RNNCell; a projection to output_size is added to it.
    num_layers: Number of MLP layers (at most 2).
    hidden_dim: Hidden size of the MLP.
    output_size: Integer, the size of the output after projection.
    weights: (optional) A tensor of tied output weights.
    activation: (optional) An activation function for the hidden layer.
    dropout: Dropout rate applied at the output layer.
    use_copy: Whether to use the copy mechanism.
    encoder_emb: Outputs of the encoder.
    sparse_inputs: Sparse inputs.
    mask: Mask over the encoder positions.
    hps: Hyperparameters.
    mode: train or eval. Keys from tf.estimator.ModeKeys.
    reuse: (optional) Python boolean describing whether to reuse variables
      in an existing scope. If not `True`, and the existing scope already
      has the given variables, an error is raised.

  Raises:
    TypeError: if cell is not an RNNCell.
    ValueError: if output_size is not positive.
  """
  super(OutputWrapper, self).__init__(_reuse=reuse)
  if output_size < 1:
    raise ValueError("Parameter output_size must be > 0: %d." % output_size)
  self._cell = cell
  self._num_layers = num_layers
  self._activation = activation
  self._weights = weights
  if self._weights is None:
    self._output_size = output_size
  else:
    self._output_size = tensor_utils.shape(self._weights, 1)
  self._hidden_dim = hidden_dim
  self._dropout = dropout
  self._reuse = reuse
  self._mode = mode
  self._sigmoid = tf.sigmoid
  self._linear1, self._linear2, self._linear_copy = None, None, None
  assert self._num_layers <= 2
  self._use_copy = use_copy
  self._encoder_emb = encoder_emb
  self._sparse_inputs, self._mask = sparse_inputs, mask
  self._reuse_attention = hps.reuse_attention
  if self._use_copy:
    assert self._sparse_inputs is not None
    assert self._mask is not None
    if not self._reuse_attention:
      assert self._encoder_emb is not None
      # Check the static shape; tf.shape() would return a Tensor, and
      # comparing a Tensor with != would never run the intended check.
      encoder_dim = tensor_utils.shape(self._encoder_emb, -1)
      assert encoder_dim == self._hidden_dim
  self._eps = 1e-8
  self._vocab_offset = hps.output_size
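
# A hedged construction sketch (assumption): the minimal arguments for a
# plain softmax projection without the copy mechanism. base_cell, my_hps,
# and the dimensions below are placeholders, not values from this codebase;
# my_hps must still provide reuse_attention and output_size.
#
#   base_cell = tf.contrib.rnn.LSTMCell(256)
#   projected = OutputWrapper(
#       base_cell,
#       num_layers=2,        # asserted to be at most 2 above
#       hidden_dim=256,
#       output_size=30000,   # e.g. vocab.size() when embeddings are not tied
#       dropout=0.1,
#       hps=my_hps,
#       mode=tf.estimator.ModeKeys.TRAIN)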