def forward( self, # type: ignore input: torch.Tensor, sequence_length: Optional[Union[torch.LongTensor, List[int]]] = None, dtype: Optional[torch.dtype] = None) -> torch.Tensor: r"""Feeds forward inputs through the network layers and returns outputs. Args: input: The inputs to the network, which is a 3D tensor. sequence_length (optional): A :tensor:`LongTensor` of shape ``[batch_size]`` or a Python list containing the length of each element in :attr:`input`. If given, time steps beyond the length will first be masked out before feeding to the layers. dtype (optional): Type of the inputs. If not provided, infers from inputs automatically. Returns: The output of the final layer. """ if sequence_length is not None: input = mask_sequences(input, sequence_length, dtype=dtype, time_major=False) return super().forward(input)
def _discount_reward_tensor_1d(reward, sequence_length, discount=1., dtype=None): if sequence_length is None: raise ValueError('sequence_length must not be `None` for 1D reward.') batch_size = tf.shape(reward)[0] max_seq_length = tf.reduce_max(sequence_length) dtype = dtype or reward.dtype if discount == 1.: dmat = tf.ones(tf.concat([[batch_size], [max_seq_length]], 0), dtype=dtype) else: mask = tf.sequence_mask(sequence_length, dtype=dtype) mask = tf.concat([mask[:, 1:], tf.zeros_like(mask[:, -1:])], axis=1) # Make each row = [discount, ..., discount, 1, ..., 1] dmat = mask * discount + (1 - mask) dmat = tf.cumprod(dmat, axis=1, reverse=True) disc_reward = dmat * tf.expand_dims(reward, -1) disc_reward = mask_sequences(disc_reward, sequence_length, dtype=dtype, tensor_rank=2) return disc_reward
def _build( self, # pylint: disable=arguments-differ inputs, sequence_length=None, dtype=None, mode=None): """Feeds forward inputs through the network layers and returns outputs. Args: inputs: The inputs to the network, which is a 3D tensor. sequence_length (optional): An int tensor of shape `[batch_size]` containing the length of each element in :attr:`inputs`. If given, time steps beyond the length will first be masked out before feeding to the layers. dtype (optional): Type of the inputs. If not provided, infers from inputs automatically. mode (optional): A tensor taking value in :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`, including `TRAIN`, `EVAL`, and `PREDICT`. If `None`, :func:`texar.global_mode` is used. Returns: The output of the final layer. """ if sequence_length is not None: inputs = mask_sequences(inputs, sequence_length, dtype=dtype, time_major=False, tensor_rank=3) return super(Conv1DNetwork, self)._build(inputs, mode=mode)
def _discount_reward_py_1d(reward, sequence_length, discount=1., dtype=None): if sequence_length is None: raise ValueError('sequence_length must not be `None` for 1D reward.') reward = np.array(reward) sequence_length = np.array(sequence_length) batch_size = reward.shape[0] max_seq_length = np.max(sequence_length) dtype = dtype or reward.dtype if discount == 1.: dmat = np.ones([batch_size, max_seq_length], dtype=dtype) else: steps = np.tile(np.arange(max_seq_length), [batch_size, 1]) mask = np.asarray(steps < (sequence_length - 1)[:, None], dtype=dtype) # Make each row = [discount, ..., discount, 1, ..., 1] dmat = mask * discount + (1 - mask) dmat = np.cumprod(dmat[:, ::-1], axis=1)[:, ::-1] disc_reward = dmat * reward[:, None] disc_reward = mask_sequences(disc_reward, sequence_length, dtype=dtype) #mask = np.asarray(steps < sequence_length[:, None], dtype=dtype) #disc_reward = mask * disc_reward return disc_reward
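# Illustrative sketch (not part of the library): a direct, loop-based
# reference for what `_discount_reward_py_1d` computes. A scalar reward per
# sequence is spread backwards over time, so step t of a length-L sequence
# receives reward * discount**(L - 1 - t), and steps at or beyond L are zero.
import numpy as np

def _discounted_1d_reference(reward, sequence_length, discount):
    reward = np.asarray(reward, dtype=np.float64)
    sequence_length = np.asarray(sequence_length)
    out = np.zeros([reward.shape[0], int(sequence_length.max())])
    for b in range(reward.shape[0]):
        for t in range(int(sequence_length[b])):
            out[b, t] = reward[b] * discount ** (int(sequence_length[b]) - 1 - t)
    return out

# reward=[1., 2.], lengths=[3, 2], discount=0.5 gives
# [[0.25, 0.5, 1.0], [1.0, 2.0, 0.0]], matching the vectorized version above.
print(_discounted_1d_reference([1., 2.], [3, 2], 0.5))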
def _forward_output_layers(inputs, input_size, output_layer, time_major, hparams, mode, sequence_length=None): """Forwards inputs through the output layers. Args: inputs: A Tensor of shape `[batch_size, max_time] + input_size` if :attr:`time_major=False`, or shape `[max_time, batch_size] + input_size` if :attr:`time_major=True`. Returns: A pair :attr:`(outputs, outputs_size), where - :attr:`outputs`: A Tensor of shape \ `[batch_size, max_time] + outputs_size`. - :attr:`outputs_size`: An `int` or 1D `int` array representing the \ output size. """ if output_layer is None: return inputs, input_size if hparams is None: # output_layer was passed in from the constructor if isinstance(output_layer, (list, tuple)): raise ValueError('output_layer must not be a list or tuple.') output, output_size = _forward_single_output_layer( inputs, input_size, output_layer) else: # output_layer was built based on hparams output_layer = _to_list(output_layer) dropout_layer_ids = _to_list(hparams.dropout_layer_ids) if len(dropout_layer_ids) > 0: training = is_train_mode(mode) output = inputs output_size = input_size for i, layer in enumerate(output_layer): if i in dropout_layer_ids: output = _apply_dropout(output, time_major, hparams, training) output, output_size = _forward_single_output_layer( output, output_size, layer) if len(output_layer) in dropout_layer_ids: output = _apply_dropout(output, time_major, hparams, training) if sequence_length is not None: output = mask_sequences(output, sequence_length, time_major=time_major, tensor_rank=3) return output, output_size
def test_mask_sequences(self): """Tests :func:`texar.utils.shapes.mask_sequences`. """ seq = np.ones([3, 4, 3], dtype=np.int32) seq_length = np.array([3, 2, 1], dtype=np.int32) masked_seq = shapes.mask_sequences(seq, seq_length) self.assertEqual(masked_seq.shape, seq.shape) seq_sum = np.sum(masked_seq, axis=(1, 2)) np.testing.assert_array_equal(seq_sum, seq_length * 3)
def test_mask_sequences(self): r"""Tests :func:`texar.utils.shapes.mask_sequences`. """ seq = torch.ones(3, 4, 3, dtype=torch.int32) seq_length = torch.tensor([3, 2, 1], dtype=torch.int32) masked_seq = shapes.mask_sequences(seq, seq_length) np.testing.assert_array_equal(masked_seq.shape, seq.shape) seq_sum = torch.sum(masked_seq, dim=(1, 2)) np.testing.assert_array_equal(seq_sum, seq_length * 3)
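# For orientation, a minimal PyTorch sketch (illustrative only; the real
# implementation lives in texar.utils.shapes) of the semantics the two tests
# above exercise: entries at time steps >= sequence_length are zeroed out.
import torch

def _mask_sequences_reference(sequence, sequence_length):
    # sequence: [batch_size, max_time, ...]; sequence_length: [batch_size]
    steps = torch.arange(sequence.size(1), device=sequence.device)
    mask = (steps.unsqueeze(0) < sequence_length.unsqueeze(1)).to(sequence.dtype)
    # Broadcast the [batch_size, max_time] mask over any trailing dimensions.
    mask = mask.view(mask.size(0), mask.size(1),
                     *([1] * (sequence.dim() - 2)))
    return sequence * mask

seq = torch.ones(3, 4, 3, dtype=torch.int32)
lengths = torch.tensor([3, 2, 1])
print(_mask_sequences_reference(seq, lengths).sum(dim=(1, 2)))  # tensor([9, 6, 3])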
def _build(self, # pylint: disable=arguments-differ inputs, sequence_length=None, dtype=None, time_major=False, mode=None): if sequence_length is not None: inputs = mask_sequences( inputs, sequence_length, dtype=dtype, time_major=time_major, tensor_rank=3) return super(Conv1DNetwork, self)._build(inputs, mode=mode)
def _discount_reward_py_2d(reward, sequence_length=None, discount=1., dtype=None): if sequence_length is not None: reward = mask_sequences(reward, sequence_length, dtype=dtype) dtype = dtype or reward.dtype if discount == 1.: disc_reward = np.cumsum( reward[:, ::-1], axis=1, dtype=dtype)[:, ::-1] else: disc_reward = np.copy(reward) for i in range(reward.shape[1]-2, -1, -1): disc_reward[:, i] += disc_reward[:, i+1] * discount return disc_reward
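# Quick numeric check (illustrative only) of the recurrence implemented above:
# with per-step rewards r and discount d, entry t becomes
# r[t] + d*r[t+1] + d**2*r[t+2] + ...
import numpy as np

reward = np.array([[1., 1., 1.]])
disc, d = np.copy(reward), 0.5
for i in range(reward.shape[1] - 2, -1, -1):
    disc[:, i] += d * disc[:, i + 1]
print(disc)  # [[1.75 1.5  1.  ]]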
def forward(self, # type: ignore positions: Optional[torch.LongTensor] = None, sequence_length: Optional[torch.LongTensor] = None, **kwargs) \ -> torch.Tensor: r"""Embeds. Either :attr:`positions` or :attr:`sequence_length` is required: - If both are given, :attr:`sequence_length` is used to mask out embeddings of those time steps beyond the respective sequence lengths. - If only :attr:`sequence_length` is given, then positions from `0` to `sequence_length - 1` are embedded. Args: positions (optional): An :tensor:`LongTensor` containing the position IDs to embed. sequence_length (optional): An :tensor:`LongTensor` of shape ``[batch_size]``. Time steps beyond the respective sequence lengths will have zero-valued embeddings. Returns: A Tensor of shape ``[batch_size, position_size, dim]``. """ if positions is None: if sequence_length is None: raise ValueError( 'Either `positions` or `sequence_length` is required.') max_length = sequence_length.max() batch_size = sequence_length.size(0) inputs = torch.arange(max_length).to(device=sequence_length.device) inputs = inputs.expand(batch_size, max_length) else: inputs = positions if self._cache_embeddings: outputs = F.embedding(inputs, self.signal, **kwargs) else: outputs = self._compute_embeddings(inputs, self.inv_timescales) if sequence_length is not None: outputs = mask_sequences(outputs, sequence_length) return outputs
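# A rough sketch, for intuition only, of how a sinusoid position-signal table
# such as `self.signal` can be precomputed for lookup with `F.embedding`.
# This follows the standard "Attention Is All You Need" formulation; the
# embedder's actual timescale parameterization is driven by its hparams and
# may differ in detail.
import math
import torch

def _sinusoid_signal_sketch(max_position: int, dim: int,
                            max_timescale: float = 1.0e4) -> torch.Tensor:
    position = torch.arange(max_position, dtype=torch.float).unsqueeze(1)
    inv_timescales = torch.exp(
        torch.arange(0, dim, 2, dtype=torch.float)
        * -(math.log(max_timescale) / dim))
    signal = torch.zeros(max_position, dim)
    signal[:, 0::2] = torch.sin(position * inv_timescales)
    signal[:, 1::2] = torch.cos(position * inv_timescales)
    return signal  # [max_position, dim], one row per position id

print(_sinusoid_signal_sketch(max_position=10, dim=8).shape)  # torch.Size([10, 8])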
def _forward_output_layers( inputs: torch.Tensor, output_layer: Optional[nn.Module], time_major: bool, sequence_length: Optional[Union[torch.LongTensor, List[int]]] = None) \ -> Tuple[torch.Tensor, int]: r"""Forwards inputs through the output layers. Args: inputs: A Tensor of shape ``[batch_size, max_time] + input_size`` if :attr:`time_major` is `False`, or shape ``[max_time, batch_size] + input_size`` if :attr:`time_major` is `True`. output_layer (optional): :torch_nn:`Sequential` or :torch_nn:`Module` of output layers. time_major (bool): The shape format of the :attr:`inputs` and :attr:`outputs` Tensors. If `True`, these tensors are of shape `[max_time, batch_size, input_size]`. If `False` (default), these tensors are of shape `[batch_size, max_time, input_size]`. sequence_length (optional): A 1D :tensor:`LongTensor` of shape ``[batch_size]``. Sequence lengths of the batch inputs. Used to copy-through state and zero-out outputs when past a batch element's sequence length. Returns: A pair :attr:`(outputs, outputs_size), where - :attr:`outputs`: A Tensor of shape `[batch_size, max_time] + outputs_size`. - :attr:`outputs_size`: An `int` representing the output size. """ if output_layer is None: return inputs, inputs.shape[-1] output = output_layer(inputs) if sequence_length is not None: output = mask_sequences(output, sequence_length, time_major=time_major) output_size = output.shape[-1] return output, output_size
def _discount_reward_tensor_1d(reward: torch.Tensor, sequence_length: torch.LongTensor, discount: float = 1.) -> torch.Tensor: r"""Computes discounted reward. Args: reward: 1D Tensor with shape `[batch_size]`. sequence_length: A Tensor of shape `[batch_size]`. Time steps beyond the respective sequence lengths will be masked. discount (float): A scalar. The discount factor. Returns: A 2D Tensor of the discounted reward. """ if sequence_length is None: raise ValueError('sequence_length must not be `None` for 1D reward.') if not isinstance(sequence_length, torch.Tensor): sequence_length = torch.tensor(sequence_length, dtype=torch.int64, device=reward.device) batch_size = reward.shape[0] max_seq_length = torch.max(sequence_length) dtype: torch.dtype = reward.dtype if discount == 1.: disc_reward = torch.unsqueeze(reward, -1).expand(batch_size, max_seq_length) else: mask = sequence_mask(sequence_length, dtype=dtype) mask = torch.cat((mask[:, 1:], torch.zeros_like(mask[:, -1:])), dim=1) # Make each row = [discount, ..., discount, 1, ..., 1] dmat = mask * discount + (1 - mask) dmat = torch.flip(dmat, (1, )) dmat = torch.cumprod(dmat, dim=1) dmat = torch.flip(dmat, (1, )) disc_reward = dmat * torch.unsqueeze(reward, -1) disc_reward = mask_sequences(disc_reward, sequence_length, dtype=dtype) return disc_reward
def _discount_reward_tensor_2d(reward, sequence_length=None, discount=1., dtype=None): if sequence_length is not None: reward = mask_sequences( reward, sequence_length, dtype=dtype, tensor_rank=2) if discount == 1.: disc_reward = tf.cumsum(reward, axis=1, reverse=True) else: # [max_time, batch_size] rev_reward_T = tf.transpose(tf.reverse(reward, [1]), [1, 0]) rev_reward_T_cum = tf.scan( fn=lambda acc, cur: cur + discount * acc, elems=rev_reward_T, initializer=tf.zeros_like(reward[:, 1]), back_prop=False) disc_reward = tf.reverse( tf.transpose(rev_reward_T_cum, [1, 0]), [1]) return disc_reward
def _discount_reward_tensor_2d(reward: torch.Tensor, sequence_length: Optional[ torch.LongTensor] = None, discount: float = 1.) -> torch.Tensor: r"""Computes discounted reward. Args: reward: 2D Tensor with shape `[batch_size, max_time]`. sequence_length (optional): A Tensor of shape `[batch_size]`. Time steps beyond the respective sequence lengths will be masked. discount (float): A scalar. The discount factor. Returns: A 2D Tensor of the discounted reward. """ dtype: torch.dtype = reward.dtype if sequence_length is not None: reward = mask_sequences(reward, sequence_length, dtype=dtype) if discount == 1.: reward = torch.flip(reward, (1, )) disc_reward = torch.cumsum(reward, dim=1) disc_reward = torch.flip(disc_reward, (1, )) else: # [max_time, batch_size] rev_reward_T = torch.flip(reward, (1, )).permute(1, 0) res = [] acc = torch.zeros_like(reward[:, 1]) for i in range(rev_reward_T.shape[0]): cur = rev_reward_T[i] acc = cur + discount * acc res.append(acc) rev_reward_T_cum = torch.stack(res, dim=0) disc_reward = torch.flip(rev_reward_T_cum.permute(1, 0), (1, )) return disc_reward
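# Sanity sketch (illustrative only): for discount == 1 the flip/cumsum/flip
# pattern above is simply a reversed cumulative sum over the time dimension.
import torch

reward = torch.tensor([[1., 2., 3.]])
rev_cumsum = torch.flip(torch.cumsum(torch.flip(reward, (1,)), dim=1), (1,))
print(rev_cumsum)  # tensor([[6., 5., 3.]])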
def _build(self, # pylint: disable=arguments-differ memory, memory_sequence_length=None, memory_attention_bias=None, inputs=None, sequence_length=None, decoding_strategy='train_greedy', beam_width=1, alpha=0, start_tokens=None, end_token=None, max_decoding_length=None, mode=None): """Performs decoding. The decoder supports 4 decoding strategies. For the first 3 strategies, set :attr:`decoding_strategy` to the respective string. - **"train_greedy"**: decoding in teacher-forcing fashion \ (i.e., feeding \ ground truth to decode the next step), and for each step sample \ is obtained by taking the `argmax` of logits. \ Argument :attr:`inputs` is required for this strategy. \ :attr:`sequence_length` is optional. - **"infer_greedy"**: decoding in inference fashion (i.e., feeding \ `generated` sample to decode the next step), and for each step sample is obtained by taking the `argmax` of logits.\ Arguments :attr:`(start_tokens, end_token)` are \ required for this strategy, and argument \ :attr:`max_decoding_length` is optional. - **"infer_sample"**: decoding in inference fashion, and for each step\ sample is obtained by `random sampling` from the logits. Arguments :attr:`(start_tokens, end_token)` are \ required for this strategy, and argument \ :attr:`max_decoding_length` is optional. - **Beam Search**: set :attr:`beam_width` to > 1 to use beam search \ decoding.\ Arguments :attr:`(start_tokens, end_token)` are \ required, and argument \ :attr:`max_decoding_length` is optional. Args: memory: The memory to attend, e.g., the output of an RNN encoder. A Tensor of shape `[batch_size, memory_max_time, dim]`. memory_sequence_length (optional): A Tensor of shape `[batch_size]` containing the sequence lengths for the batch entries in memory. Used to create attention bias of :attr:`memory_attention_bias` is not given. Ignored if `memory_attention_bias` is provided. memory_attention_bias (optional): A Tensor of shape `[batch_size, num_heads, memory_max_time, dim]`. An attention bias typically sets the value of a padding position to a large negative value for masking. If not given, :attr:`memory_sequence_length` is used to automatically create an attention bias. inputs (optional): Input tensor for teacher forcing decoding, of shape `[batch_size, target_max_time, emb_dim]` containing the target sequence word embeddings. Used when :attr:`decoding_strategy` is set to "train_greedy". sequence_length (optional): A Tensor of shape `[batch_size]`, containing the sequence length of :attr:`inputs`. Tokens beyond the respective sequence length are masked out. Used when :attr:`decoding_strategy` is set to "train_greedy". decoding_strategy (str): A string specifying the decoding strategy, including "train_greedy", "infer_greedy", "infer_sample". Different arguments are required based on the strategy. See above for details. Ignored if :attr:`beam_width` > 1. beam_width (int): Set to > 1 to use beam search. alpha (float): Length penalty coefficient. Refer to https://arxiv.org/abs/1609.08144 for more details. tart_tokens (optional): An int Tensor of shape `[batch_size]`, containing the start tokens. Used when `decoding_strategy` = "infer_greedy" or "infer_sample", or `beam_width` > 1. end_token (optional): An int 0D Tensor, the token that marks end of decoding. Used when `decoding_strategy` = "infer_greedy" or "infer_sample", or `beam_width` > 1. max_decoding_length (optional): An int scalar Tensor indicating the maximum allowed number of decoding steps. 
If `None` (default), use "max_decoding_length" defined in :attr:`hparams`. Ignored in "train_greedy" decoding. mode (optional): A tensor taking value in :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`, including `TRAIN`, `EVAL`, and `PREDICT`. Controls dropout mode. If `None` (default), :func:`texar.global_mode` is used. Returns: - For **"train_greedy"** decoding, returns an instance of \ :class:`~texar.modules.TransformerDecoderOutput` which contains\ `sample_id` and `logits`. - For **"infer_greedy"** and **"infer_sample"** decoding, returns\ a tuple `(outputs, sequence_lengths)`, where `outputs` is an \ instance of :class:`~texar.modules.TransformerDecoderOutput` as\ in "train_greedy", and `sequence_lengths` is a Tensor of shape\ `[batch_size]` containing the length of each sample. - For **beam_search** decoding, returns a `dict` containing keys\ "sample_id" and "log_prob". - **"sample_id"** is an int Tensor of shape \ `[batch_size, max_time, beam_width]` containing generated\ token indexes. `sample_id[:,:,0]` is the highest-probable \ sample. - **"log_porb"** is a float Tensor of shape \ `[batch_size, beam_width]` containing the log probability \ of each sequence sample. """ if memory_attention_bias is None: if memory_sequence_length is None: raise ValueError( "`memory_sequence_length` is required if " "`memory_attention_bias` is not given.") #enc_padding = 1 - mask_sequences(tf.ones_like(memory), # memory_sequence_length, # tensor_rank=3)[:, :, 0] enc_padding = 1 - tf.sequence_mask( memory_sequence_length, tf.shape(memory)[1], dtype=tf.float32) memory_attention_bias = attn.attention_bias_ignore_padding( enc_padding) if beam_width <= 1 and decoding_strategy == 'train_greedy': if sequence_length is not None: inputs = mask_sequences(inputs, sequence_length, tensor_rank=3) decoder_self_attention_bias = ( attn.attention_bias_lower_triangle( shape_list(inputs)[1])) target_inputs = inputs * self._hparams.dim**0.5 _, lengths, channels = shape_list(target_inputs) pos_embeds = self.position_embedder(lengths, channels) inputs = target_inputs + pos_embeds decoder_output = self._self_attention_stack( inputs, memory, decoder_self_attention_bias=decoder_self_attention_bias, memory_attention_bias=memory_attention_bias, cache=None, mode=mode) logits = self.output_layer(decoder_output) preds = tf.to_int32(tf.argmax(logits, axis=-1)) output = TransformerDecoderOutput( logits=logits, sample_id=preds ) rets = output else: # Inference decoding if max_decoding_length is None: max_decoding_length = self._hparams.max_decoding_length if beam_width <= 1: logits, preds, sequence_length = self._infer_decoding( self._prepare_tokens_to_embeds, start_tokens, end_token, decode_length=max_decoding_length, memory=memory, memory_attention_bias=memory_attention_bias, decoding_strategy=decoding_strategy, ) output = TransformerDecoderOutput( logits=logits, sample_id=preds) rets = output, sequence_length else: # The output format is different when running beam search sample_id, log_prob = self._beam_decode( self._prepare_tokens_to_embeds, start_tokens, end_token, beam_width=beam_width, alpha=alpha, decode_length=max_decoding_length, memory=memory, memory_attention_bias=memory_attention_bias, ) predictions = { 'sample_id':sample_id, 'log_prob': log_prob } rets = predictions if not self._built: self._add_internal_trainable_variables() self._built = True return rets
def _build(self, positions=None, sequence_length=None, mode=None, **kwargs): """Embeds the positions. Either :attr:`position` or :attr:`sequence_length` is required: - If both are given, :attr:`sequence_length` is used to mask out \ embeddings of those time steps beyond the respective sequence \ lengths. - If only :attr:`sequence_length` is given, then positions \ from `0` to `sequence_length-1` are embedded. Args: positions (optional): An integer tensor containing the position ids to embed. sequence_length (optional): An integer tensor of shape `[batch_size]`. Time steps beyond the respective sequence lengths will have zero-valued embeddings. mode (optional): A tensor taking value in :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`, including `TRAIN`, `EVAL`, and `PREDICT`. If `None`, dropout will be controlled by :func:`texar.global_mode`. kwargs: Additional keyword arguments for :tf_main:`tf.nn.embedding_lookup <nn/embedding_lookup>` besides :attr:`params` and :attr:`ids`. Returns: A `Tensor` of shape `shape(inputs) + embedding dimension`. """ # Gets embedder inputs inputs = positions if positions is None: if sequence_length is None: raise ValueError( 'Either `positions` or `sequence_length` is required.') max_length = tf.reduce_max(sequence_length) single_inputs = tf.range(start=0, limit=max_length, dtype=tf.int32) # Expands `single_inputs` to have shape [batch_size, max_length] expander = tf.expand_dims(tf.ones_like(sequence_length), -1) inputs = expander * tf.expand_dims(single_inputs, 0) ids_rank = len(inputs.shape.dims) embedding = self._embedding is_training = is_train_mode(mode) # Gets dropout strategy st = self._hparams.dropout_strategy if positions is None and st == 'item': # If `inputs` is based on `sequence_length`, then dropout # strategies 'item' and 'item_type' have the same effect, we # use 'item_type' to avoid unknown noise_shape in the 'item' # strategy st = 'item_type' # Dropouts as 'item_type' before embedding if st == 'item_type': dropout_layer = self._get_dropout_layer(self._hparams, dropout_strategy=st) if dropout_layer: embedding = dropout_layer.apply(inputs=embedding, training=is_training) # Embeds outputs = tf.nn.embedding_lookup(embedding, inputs, **kwargs) # Dropouts as 'item' or 'elements' after embedding if st != 'item_type': dropout_layer = self._get_dropout_layer(self._hparams, ids_rank=ids_rank, dropout_input=outputs, dropout_strategy=st) if dropout_layer: outputs = dropout_layer.apply(inputs=outputs, training=is_training) # Optionally masks if sequence_length is not None: outputs = mask_sequences(outputs, sequence_length, tensor_rank=len(inputs.shape.dims) + self._dim_rank) return outputs
def _build(self, inputs, sequence_length, mode=None): """Encodes the inputs. Args: inputs: A 3D Tensor of shape `[batch_size, max_time, dim]`, containing the word embeddings of input sequences. Note that the embedding dimension `dim` must equal "dim" in :attr:`hparams`. sequence_length: A 1D Tensor of shape `[batch_size]`. Input tokens beyond respective sequence lengths are masked out automatically. mode (optional): A tensor taking value in :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`, including `TRAIN`, `EVAL`, and `PREDICT`. Used to toggle dropout. If `None` (default), :func:`texar.global_mode` is used. Returns: A Tensor of shape `[batch_size, max_time, dim]` containing the encoded vectors. """ # Multiply input embedding with the sqrt of its dimension for # normalization if not self._hparams.use_bert_config: inputs = inputs * self._hparams.dim**0.5 inputs = mask_sequences(inputs, sequence_length, tensor_rank=3) _, lengths, _ = shape_list(inputs) inputs_padding = 1 - tf.sequence_mask( sequence_length, tf.shape(inputs)[1], dtype=tf.float32) if self._hparams.use_bert_config: ignore_padding = attn.attention_bias_ignore_padding( inputs_padding, bias_value=-1e4) else: ignore_padding = attn.attention_bias_ignore_padding( inputs_padding) encoder_self_attention_bias = ignore_padding positions = tf.expand_dims(tf.range(lengths, dtype=tf.int32), 0) pos_embeds = self.position_embedder(positions) input_embedding = inputs + pos_embeds if self._hparams.use_bert_config: x = layers.layer_normalize(input_embedding) x = tf.layers.dropout(x, rate=self._hparams.embedding_dropout, training=is_train_mode(mode)) else: x = tf.layers.dropout(input_embedding, rate=self._hparams.embedding_dropout, training=is_train_mode(mode)) # Just to keep consistent with BERT, actually makes no difference if self._hparams.use_bert_config: pad_remover = None else: pad_remover = utils.transformer_utils.PadRemover(inputs_padding) for i in range(self._hparams.num_blocks): with tf.variable_scope("layer_{}".format(i)): multihead_attention = self.multihead_attention_list[i] # trivial difference between BERT and original Transformer if self._hparams.use_bert_config: _queries_input = x else: _queries_input = layers.layer_normalize(x) attention_output = multihead_attention( queries=_queries_input, memory=_queries_input, memory_attention_bias=encoder_self_attention_bias, mode=mode, ) attention_output = tf.layers.dropout( attention_output, rate=self._hparams.residual_dropout, training=is_train_mode(mode), ) x = x + attention_output with tf.variable_scope('output'): if self._hparams.use_bert_config: x = layers.layer_normalize(x) y = x else: y = layers.layer_normalize(x) poswise_network = self.poswise_networks[i] with tf.variable_scope(poswise_network.variable_scope): original_shape = shape_list(y) y = tf.reshape(y, [-1, self._hparams.dim]) if pad_remover: y = tf.expand_dims(pad_remover.remove(y), axis=0) # [1, batch_size*seq_length, hidden_dim] layer_output = poswise_network(y, mode=mode) sub_output = tf.layers.dropout( layer_output, rate=self._hparams.residual_dropout, training=is_train_mode(mode) ) if pad_remover: sub_output = tf.reshape(pad_remover.restore(tf.squeeze(\ sub_output, axis=0)), original_shape \ ) else: sub_output = tf.reshape(sub_output, original_shape) x = x + sub_output if self._hparams.use_bert_config: x = layers.layer_normalize(x) if not self._hparams.use_bert_config: x = layers.layer_normalize(x) if not self._built: self._add_internal_trainable_variables() self._built = True return x
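# Framework-agnostic sketch (numpy, illustrative only) of the "ignore padding"
# attention bias built above: padding positions receive a large negative
# additive bias, so softmax assigns them essentially zero attention weight.
# (The BERT-style configuration above passes bias_value=-1e4 instead of the
# default large negative value.)
import numpy as np

lengths, max_time = np.array([3, 1]), 4
padding = (np.arange(max_time)[None, :] >= lengths[:, None]).astype(np.float32)
bias = padding * -1e9                                 # [batch, memory_time]
scores = np.zeros([2, max_time], dtype=np.float32)    # pretend attention logits
weights = np.exp(scores + bias)
weights /= weights.sum(axis=-1, keepdims=True)
print(np.round(weights, 3))
# [[0.333 0.333 0.333 0.   ]
#  [1.    0.    0.    0.   ]]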
def forward(self, # type: ignore inputs: Optional[torch.Tensor] = None, sequence_length: Optional[torch.LongTensor] = None, memory: Optional[torch.Tensor] = None, memory_sequence_length: Optional[torch.LongTensor] = None, memory_attention_bias: Optional[torch.Tensor] = None, context: Optional[torch.Tensor] = None, context_sequence_length: Optional[torch.LongTensor] = None, helper: Optional[Helper] = None, decoding_strategy: str = 'train_greedy', max_decoding_length: Optional[int] = None, impute_finished: bool = False, infer_mode: Optional[bool] = None, beam_width: Optional[int] = None, length_penalty: float = 0., **kwargs) \ -> Union[ TransformerDecoderOutput, Tuple[TransformerDecoderOutput, torch.LongTensor], Dict[str, torch.Tensor]]: r"""Performs decoding. The interface is very similar to that of RNN decoders (:class:`texar.modules.RNNDecoderBase`). In particular, the function provides **3 ways** to specify the decoding method, with varying flexibility: 1. The :attr:`decoding_strategy` argument. - **"train_greedy"**: decoding in teacher-forcing fashion (i.e., feeding ground truth to decode the next step), and for each step sample is obtained by taking the `argmax` of logits. Argument :attr:`inputs` is required for this strategy. :attr:`sequence_length` is optional. - **"infer_greedy"**: decoding in inference fashion (i.e., feeding `generated` sample to decode the next step), and for each step sample is obtained by taking the `argmax` of logits. Arguments :attr:`(start_tokens, end_token)` are required for this strategy, and argument :attr:`max_decoding_length` is optional. - **"infer_sample"**: decoding in inference fashion, and for each step sample is obtained by `random sampling` from the logits. Arguments :attr:`(start_tokens, end_token)` are required for this strategy, and argument :attr:`max_decoding_length` is optional. This argument is used only when arguments :attr:`helper` and :attr:`beam_width` are both `None`. 2. The :attr:`helper` argument: An instance of subclass of :class:`texar.modules.decoders.Helper`. This provides a superset of decoding strategies than above. The interface is the same as in RNN decoders. Please refer to :meth:`texar.modules.RNNDecoderBase.forward` for detailed usage and examples. Note that, here, though using a :class:`~texar.decoder.TrainingHelper` corresponding to the ``"train_greedy"`` strategy above, the implementation is *slower* than directly setting ``decoding_strategy="train_greedy"`` (though output results are the same). Argument :attr:`max_decoding_length` is optional. 3. **Beam search**: set :attr:`beam_width` to use beam search decoding. Arguments :attr:`(start_tokens, end_token)` are required, and argument :attr:`max_decoding_length` is optional. .. warning:: Beam search is not yet implemented. Setting :attr:`beam_width` to any value greater than 1 would raise a :exc:`NotImplementedError` Args: memory (optional): The memory to attend, e.g., the output of an RNN encoder. A :tensor:`Tensor` of shape ``[batch_size, memory_max_time, dim]``. memory_sequence_length (optional): A :tensor:`Tensor` of shape ``[batch_size]`` containing the sequence lengths for the batch entries in memory. Used to create attention bias of :attr:`memory_attention_bias` is not given. Ignored if :attr:`memory_attention_bias` is provided. memory_attention_bias (optional): A :tensor:`Tensor` of shape ``[batch_size, num_heads, memory_max_time, dim]``. An attention bias typically sets the value of a padding position to a large negative value for masking. 
If not given, :attr:`memory_sequence_length` is used to automatically create an attention bias. inputs (optional): Input tensor for teacher forcing decoding, of shape ``[batch_size, target_max_time, emb_dim]`` containing the target sequence word embeddings. Used when :attr:`decoding_strategy` is set to ``"train_greedy"``. sequence_length (optional): A :tensor:`LongTensor` of shape ``[batch_size]``, containing the sequence length of :attr:`inputs`. Tokens beyond the respective sequence length are masked out. Used when :attr:`decoding_strategy` is set to ``"train_greedy"``. decoding_strategy (str): A string specifying the decoding strategy, including ``"train_greedy"``, ``"infer_greedy"``, ``"infer_sample"``. Different arguments are required based on the strategy. See above for details. Ignored if :attr:`beam_width` or :attr:`helper` is set. beam_width (int): Set to use beam search. If given, :attr:`decoding_strategy` is ignored. length_penalty (float): Length penalty coefficient used in beam search decoding. Refer to https://arxiv.org/abs/1609.08144 for more details. It should be larger if longer sentences are desired. context (optional): An :tensor:`LongTensor` of shape ``[batch_size, length]``, containing the starting tokens for decoding. If context is set, ``start_tokens`` of the :class:`~texar.modules.Helper` will be ignored. context_sequence_length (optional): Specify the length of context. max_decoding_length (int, optional): The maximum allowed number of decoding steps. If `None` (default), use ``"max_decoding_length"`` defined in :attr:`hparams`. Ignored in ``"train_greedy"`` decoding. impute_finished (bool): If `True`, then states for batch entries which are marked as finished get copied through and the corresponding outputs get zeroed out. This causes some slowdown at each time step, but ensures that the final state and outputs have the correct values and that backprop ignores time steps that were marked as finished. Ignored in ``"train_greedy"`` decoding. helper (optional): An instance of :class:`texar.modules.decoders.Helper` that defines the decoding strategy. If given, ``decoding_strategy`` and helper configurations in :attr:`hparams` are ignored. infer_mode (optional): If not `None`, overrides mode given by :attr:`self.training`. Returns: - For **"train_greedy"** decoding, returns an instance of :class:`~texar.modules.TransformerDecoderOutput` which contains `sample_id` and `logits`. - For **"infer_greedy"** and **"infer_sample"** decoding or decoding with :attr:`helper`, returns a tuple ``(outputs, sequence_lengths)``, where ``outputs`` is an instance of :class:`~texar.modules.TransformerDecoderOutput` as in `"train_greedy"`, and ``sequence_lengths`` is a :tensor:`LongTensor` of shape ``[batch_size]`` containing the length of each sample. - For **beam search** decoding, returns a ``dict`` containing keys ``"sample_id"`` and ``"log_prob"``. - ``"sample_id"`` is a :tensor:`LongTensor` of shape ``[batch_size, max_time, beam_width]`` containing generated token indexes. ``sample_id[:,:,0]`` is the highest-probable sample. - ``"log_prob"`` is a :tensor:`Tensor` of shape ``[batch_size, beam_width]`` containing the log probability of each sequence sample. 
""" if memory is not None: if memory_attention_bias is None: if memory_sequence_length is None: raise ValueError("`memory_sequence_length` is required if " "`memory_attention_bias` is not given.") enc_padding = 1 - sequence_mask(memory_sequence_length, memory.size(1), dtype=torch.float32) memory_attention_bias = attn.attention_bias_ignore_padding( enc_padding) # record the context, which will be used in step function # for dynamic_decode if context is not None: if context_sequence_length is None: raise ValueError("'context_sequence_length' must not be None" "when 'context' is specified.") self._state_context = context[:, 1:] self._state_context_sequence_length = context_sequence_length - 1 else: self._state_context = None self._state_context_sequence_length = None # Faster code path for teacher-forcing training if (helper is None and beam_width is None and decoding_strategy == 'train_greedy'): if inputs is None: raise ValueError( "'input' must not be none " "when using 'train_greedy' decoding strategy.") if sequence_length is not None: inputs = mask_sequences(inputs, sequence_length) decoder_self_attention_bias = (attn.attention_bias_lower_triangle( inputs.size(1))) decoder_output = self._self_attention_stack( inputs, memory, decoder_self_attention_bias, memory_attention_bias, cache=None) logits = self._output_layer(decoder_output) sample_id = torch.argmax(logits, dim=-1) return TransformerDecoderOutput(logits, sample_id) # Inference code path. if max_decoding_length is None: max_decoding_length = self._hparams.max_decoding_length self._state_max_decoding_length = max_decoding_length if beam_width is None or beam_width == 1: # Inference-like decoding # Prepare helper if helper is None: kwargs.update(decoding_strategy=decoding_strategy) if context is not None: kwargs.update(start_tokens=context[:, 0]) helper = self._create_or_get_helper(infer_mode, **kwargs) assert isinstance(helper, EmbeddingHelper) self._state_cache = self._init_cache(memory, memory_attention_bias, beam_search_decoding=False, batch_size=helper.batch_size) if context is not None: assert self._state_context is not None pad_length = max_decoding_length - self._state_context.size(1) if pad_length > 0: self._state_context = torch.cat( (self._state_context, self._state_context.new_zeros( self._state_context.size(0), pad_length)), dim=1) outputs, cache, sequence_lengths = self.dynamic_decode( helper, inputs=None, sequence_length=None, initial_state=None, max_decoding_length=max_decoding_length, impute_finished=impute_finished) del cache # not used if context is not None: # Here the length of sample_id will be larger than that # of logit by 1, because there will be a additional # start_token in the returned sample_id. # the start_id should be the first token of the # given context start_tokens = context[:, 0] outputs = TransformerDecoderOutput( logits=outputs.logits, sample_id=torch.cat( [start_tokens.unsqueeze(1), outputs.sample_id], dim=1)) sequence_lengths = sequence_lengths + 1 return outputs, sequence_lengths else: # Beam-search decoding # Ignore `decoding_strategy` and # assume `helper` is not set. 
if helper is not None: raise ValueError("Must not set 'beam_width' and 'helper' " "simultaneously.") if context is not None: start_tokens = context[:, 0] else: if 'start_tokens' not in kwargs: raise ValueError( "'start_tokens' must be specified when using " "beam search decoding.") start_tokens = kwargs['start_tokens'] _batch_size = start_tokens.size(0) self._state_cache = self._init_cache(memory, memory_attention_bias, beam_search_decoding=True, batch_size=_batch_size) end_token: int = kwargs.get('end_token') # type: ignore # The output format is different when running beam search. sample_id, log_prob = self._beam_decode( start_tokens, end_token, embedding_fn=kwargs['embedding'], beam_width=beam_width, length_penalty=length_penalty, decode_length=max_decoding_length) return {'sample_id': sample_id, 'log_prob': log_prob}
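# Illustrative sketch of the lower-triangular self-attention bias used in the
# "train_greedy" branch above (via `attn.attention_bias_lower_triangle`):
# position t may only attend to positions <= t, enforced with an additive
# large-negative bias on the disallowed entries (the exact value here is for
# illustration only).
import torch

length = 4
allowed = torch.tril(torch.ones(length, length))
bias = (1.0 - allowed) * -1e9   # 0 where attention is allowed, -1e9 elsewhere
print(bias)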
def mask_and_reduce(sequence, sequence_length, rank=2, average_across_batch=True, average_across_timesteps=False, average_across_remaining=False, sum_over_batch=False, sum_over_timesteps=True, sum_over_remaining=True, dtype=None, time_major=False): """Masks out sequence entries that are beyond the respective sequence lengths, and reduces (average or sum) away dimensions. This is a combined function of :func:`~texar.utils.shapes.mask_sequences` and :func:`~texar.losses.losses_utils.reduce_batch_time`. Args: sequence: A Tensor of sequence values. If `time_major=False` (default), this must be a Tensor of shape: `[batch_size, max_time, d_2, ..., d_rank]`, where the rank of the Tensor is specified with :attr:`rank`. If `time_major=True`, this must be a Tensor of shape: `[max_time, batch_size, d_2, ..., d_rank]`. sequence_length: A Tensor of shape `[batch_size]`. Time steps beyond the respective sequence lengths will be made zero. If `None`, no masking is performed. rank (int): The rank of :attr:`sequence`. Must be >= 2. Default is 2, i.e., :attr:`sequence` is a 2D Tensor consisting of batch and time dimensions. average_across_timesteps (bool): If set, average the sequence across the time dimension. Must not set :attr:`average_across_timesteps` and :attr:`sum_over_timesteps` at the same time. average_across_batch (bool): If set, average the sequence across the batch dimension. Must not set :attr:`average_across_batch` and :attr:`sum_over_batch` at the same time. average_across_remaining (bool): If set, average the sequence across the remaining dimensions. Must not set :attr:`average_across_remaining` and :attr:`sum_over_remaining` at the same time. sum_over_timesteps (bool): If set, sum the sequence across the time dimension. Must not set :attr:`average_across_timesteps` and :attr:`sum_over_timesteps` at the same time. sum_over_batch (bool): If set, sum the sequence across the batch dimension. Must not set :attr:`average_across_batch` and :attr:`sum_over_batch` at the same time. sum_over_remaining (bool): If set, sum the sequence across the remaining dimension. Must not set :attr:`average_across_remaining` and :attr:`sum_over_remaining` at the same time. time_major (bool): The shape format of the inputs. If `True`, :attr:`sequence` must have shape `[max_time, batch_size, ...]`. If `False` (default), :attr:`sequence` must have shape `[batch_size, max_time, ...]`. dtype (dtype): Type of :attr:`sequence`. If `None`, infer from :attr:`sequence` automatically. """ if rank < 2: raise ValueError('`rank` must be >= 2.') if time_major: sequence = rnn._transpose_batch_time(sequence) if sequence_length is not None: sequence = mask_sequences(sequence, sequence_length, dtype=dtype, time_major=False, tensor_rank=rank) if rank > 2: if average_across_remaining and sum_over_remaining: raise ValueError("Only one of `average_across_remaining` and " "`sum_over_remaining` can be set.") if average_across_remaining: sequence = tf.reduce_mean(sequence, axis=range(2, rank)) elif sum_over_remaining: sequence = tf.reduce_sum(sequence, axis=range(2, rank)) sequence = reduce_batch_time(sequence, sequence_length, average_across_batch, average_across_timesteps, sum_over_batch, sum_over_timesteps) reduce_time = average_across_timesteps or sum_over_timesteps reduce_batch = average_across_batch or sum_over_batch if not reduce_time and not reduce_batch and time_major: sequence = rnn._transpose_batch_time(sequence) return sequence
def forward( self, # type: ignore positions: Optional[torch.LongTensor] = None, sequence_length: Optional[torch.LongTensor] = None, **kwargs): r"""Embeds the positions. Either :attr:`positions` or :attr:`sequence_length` is required: - If both are given, :attr:`sequence_length` is used to mask out embeddings of those time steps beyond the respective sequence lengths. - If only :attr:`sequence_length` is given, then positions from 0 to ``sequence_length - 1`` are embedded. Args: positions (optional): A :tensor:`LongTensor` containing the position IDs to embed. sequence_length (optional): An :tensor:`LongTensor` of shape ``[batch_size]``. Time steps beyond the respective sequence lengths will have zero-valued embeddings. kwargs: Additional keyword arguments for :torch_nn:`functional.embedding` besides :attr:`params` and :attr:`ids`. Returns: A `Tensor` of shape `shape(inputs) + embedding dimension`. """ # Gets embedder inputs if positions is None: if sequence_length is None: raise ValueError( 'Either `positions` or `sequence_length` is required.') max_length = torch.max(sequence_length) single_inputs = torch.arange(start=0, end=max_length) # Expands `single_inputs` to have shape [batch_size, max_length] inputs = single_inputs.unsqueeze(0) inputs = inputs.expand(len(sequence_length), -1).contiguous() else: inputs = positions ids_rank = inputs.dim() embedding = self._embedding inputs = inputs.to(device=embedding.device) # Gets dropout strategy st = self._hparams.dropout_strategy # Dropouts as 'item_type' before embedding if st == 'item_type': noise_shape = self._get_noise_shape(dropout_strategy=st, dropout_input=embedding) embedding = self._dropout_layer(embedding, noise_shape) # Embeds outputs = torch.nn.functional.embedding(inputs.type(torch.long), embedding, **kwargs) # Dropouts as 'item' or 'elements' after embedding if st != 'item_type': noise_shape = self._get_noise_shape(dropout_strategy=st, dropout_input=outputs, ids_rank=ids_rank) outputs = self._dropout_layer(outputs, noise_shape) # Optionally masks if sequence_length is not None: outputs = mask_sequences(outputs, sequence_length) return outputs
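# Quick illustration (not library code) of the position-id construction above:
# when only `sequence_length` is given, ids 0..max_len-1 are tiled over the
# batch, and masking is applied after the embedding lookup.
import torch

sequence_length = torch.tensor([4, 2])
max_len = int(sequence_length.max())
positions = torch.arange(max_len).unsqueeze(0).expand(len(sequence_length), -1)
print(positions)
# tensor([[0, 1, 2, 3],
#         [0, 1, 2, 3]])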
def _build(self, inputs, sequence_length, mode=None): """Encodes the inputs. Args: inputs: A 3D Tensor of shape `[batch_size, max_time, dim]`, containing the word embeddings of input sequences. Note that the embedding dimension `dim` must equal "dim" in :attr:`hparams`. sequence_length: A 1D Tensor of shape `[batch_size]`. Input tokens beyond respective sequence lengths are masked out automatically. mode (optional): A tensor taking value in :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`, including `TRAIN`, `EVAL`, and `PREDICT`. Used to toggle dropout. If `None` (default), :func:`texar.global_mode` is used. Returns: A Tensor of shape `[batch_size, max_time, dim]` containing the encoded vectors. """ # Multiply input embedding with the sqrt of its dimension for # normalization inputs = inputs * self._hparams.dim**0.5 inputs = mask_sequences(inputs, sequence_length, tensor_rank=3) _, lengths, _ = shape_list(inputs) inputs_padding = 1 - tf.sequence_mask( sequence_length, tf.shape(inputs)[1], dtype=tf.float32) ignore_padding = attn.attention_bias_ignore_padding(inputs_padding) encoder_self_attention_bias = ignore_padding pos_embeds = self.position_embedder(lengths, self._hparams.dim) input_embedding = inputs + pos_embeds x = tf.layers.dropout(input_embedding, rate=self._hparams.embedding_dropout, training=is_train_mode(mode)) pad_remover = utils.transformer_utils.PadRemover(inputs_padding) for i in range(self._hparams.num_blocks): with tf.variable_scope("layer_{}".format(i)): with tf.variable_scope('self_attention'): selfatt_output = attn.multihead_attention( queries=layers.layer_normalize(x), memory=None, memory_attention_bias=encoder_self_attention_bias, num_heads=self._hparams.num_heads, dropout_rate=self._hparams.attention_dropout, num_units=self._hparams.dim, scope='multihead_attention') x = x + tf.layers.dropout( selfatt_output, rate=self._hparams.residual_dropout, training=is_train_mode(mode), ) poswise_network = FeedForwardNetwork( hparams=self._hparams['poswise_feedforward']) with tf.variable_scope(poswise_network.variable_scope): y = layers.layer_normalize(x) original_shape = shape_list(y) y = tf.reshape(y, [-1, self._hparams.dim]) y = tf.expand_dims(pad_remover.remove(y), axis=0) # [1, batch_size*seq_length, hidden_dim] sub_output = tf.layers.dropout( poswise_network(y), rate=self._hparams.residual_dropout, training=is_train_mode(mode)) sub_output = tf.reshape(pad_remover.restore(tf.squeeze(\ sub_output, axis=0)), original_shape \ ) x = x + sub_output encoder_output = layers.layer_normalize(x) if not self._built: self._add_internal_trainable_variables() self._built = True return encoder_output
def _dynamic_rnn_loop(cell: RNNCellBase[State], inputs: torch.Tensor, initial_state: State, sequence_length: torch.LongTensor) \ -> Tuple[torch.Tensor, State]: r"""Internal implementation of Dynamic RNN. Args: cell: An instance of RNNCell. inputs: A ``Tensor`` of shape ``[time, batch_size, input_size]``, or a nested tuple of such elements. initial_state: A ``Tensor`` of shape ``[batch_size, state_size]``, or if ``cell.state_size`` is a tuple, then this should be a tuple of tensors having shapes ``[batch_size, s]`` for ``s`` in ``cell.state_size``. sequence_length: (optional) An ``int32`` ``Tensor`` of shape ``[batch_size]``. Returns: Tuple ``(final_outputs, final_state)``. final_outputs: A ``Tensor`` of shape ``[time, batch_size, cell.output_size]``. If ``cell.output_size`` is a (possibly nested) tuple of ints or ``TensorShape`` objects, then this returns a (possibly nested) tuple of Tensors matching the corresponding shapes. final_state: A ``Tensor``, or possibly nested tuple of Tensors, matching in length and shapes to ``initial_state``. """ state = initial_state time_steps = inputs.shape[0] all_outputs = [] all_state: MaybeTuple[List[torch.Tensor]] if isinstance(state, tuple): all_state = ([], []) else: all_state = [] for i in range(time_steps): output, state = cell(inputs[i], state) all_outputs.append(output) if isinstance(state, tuple): all_state[0].append(state[0]) all_state[1].append(state[1]) else: all_state.append(state) # type: ignore # pylint: disable=fixme # TODO: Do not compute everything regardless of sequence_length final_outputs = torch.stack(all_outputs, dim=0) final_outputs = mask_sequences(final_outputs, sequence_length=sequence_length, time_major=True) final_state: MaybeTuple[List[torch.Tensor]] if isinstance(state, tuple): final_state = ([], []) else: final_state = [] for batch_idx, time_idx in enumerate(sequence_length.tolist()): if time_idx > 0: if isinstance(state, tuple): final_state[0].append(all_state[0][time_idx - 1][batch_idx]) final_state[1].append(all_state[1][time_idx - 1][batch_idx]) else: final_state.append( # type: ignore all_state[time_idx - 1][batch_idx]) else: if isinstance(initial_state, tuple): final_state[0].append(initial_state[0][batch_idx]) final_state[1].append(initial_state[1][batch_idx]) else: final_state.append(initial_state[batch_idx]) # type: ignore if isinstance(state, tuple): final_state = (torch.stack(final_state[0], dim=0), torch.stack(final_state[1], dim=0)) else: final_state = torch.stack(final_state, dim=0) # type: ignore return final_outputs, final_state
def mask_and_reduce(sequence: torch.Tensor, sequence_length: Optional[torch.LongTensor], rank: int = 2, average_across_batch: bool = True, average_across_timesteps: bool = False, average_across_remaining: bool = False, sum_over_batch: bool = False, sum_over_timesteps: bool = True, sum_over_remaining: bool = True, dtype: Optional[torch.dtype] = None, time_major: bool = False) -> torch.Tensor: r"""Masks out sequence entries that are beyond the respective sequence lengths, and reduces (average or sum) away dimensions. This is a combination of :func:`~texar.utils.shapes.mask_sequences` and :func:`~texar.losses.losses_utils.reduce_batch_time`. Args: sequence: A tensor of sequence values. If `time_major=False` (default), this must be a tensor of shape `[batch_size, max_time, d_2, ..., d_rank]`, where the rank of the tensor is specified with :attr:`rank`. The batch and time dimensions are exchanged if `time_major` is True. sequence_length: A tensor of shape `[batch_size]`. Time steps beyond the respective sequence lengths will be made zero. If `None`, no masking is performed. rank (int): The rank of :attr:`sequence`. Must be >= 2. Default is 2, i.e., `sequence` is a 2D Tensor consisting of batch and time dimensions. average_across_timesteps (bool): If set, average the sequence across the time dimension. Must not set `average_across_timesteps` and `sum_over_timesteps` at the same time. average_across_batch (bool): If set, average the sequence across the batch dimension. Must not set `average_across_batch`' and `sum_over_batch` at the same time. average_across_remaining (bool): If set, average the sequence across the remaining dimensions. Must not set `average_across_remaining`' and `sum_over_remaining` at the same time. sum_over_timesteps (bool): If set, sum the sequence across the time dimension. Must not set `average_across_timesteps` and `sum_over_timesteps` at the same time. sum_over_batch (bool): If set, sum the sequence across the batch dimension. Must not set `average_across_batch` and `sum_over_batch` at the same time. sum_over_remaining (bool): If set, sum the sequence across the remaining dimension. Must not set `average_across_remaining` and `sum_over_remaining` at the same time. dtype (torch.dtype): The dtype of the returned mask. time_major (bool): The shape format of the inputs. If `True`, :attr:`sequence` must have shape `[max_time, batch_size, ...]`. If `False` (default), `sequence` must have shape `[batch_size, max_time, ...]`. Returns: A tensor containing the masked and reduced sequence. """ if rank < 2: raise ValueError('`rank` must be >= 2.') if time_major: sequence = transpose_batch_time(sequence) if sequence_length is not None: sequence = mask_sequences(sequence, sequence_length, dtype=dtype, time_major=False) if rank > 2: if average_across_remaining and sum_over_remaining: raise ValueError("Only one of `average_across_remaining` and " "`sum_over_remaining` can be set.") if average_across_remaining: for axis in sorted(list(range(2, rank)), reverse=True): sequence = torch.mean(sequence, dim=axis) elif sum_over_remaining: for axis in sorted(list(range(2, rank)), reverse=True): sequence = torch.sum(sequence, dim=axis) sequence = reduce_batch_time(sequence, sequence_length, average_across_batch, average_across_timesteps, sum_over_batch, sum_over_timesteps) reduce_time = average_across_timesteps or sum_over_timesteps reduce_batch = average_across_batch or sum_over_batch if not reduce_time and not reduce_batch and time_major: sequence = transpose_batch_time(sequence) return sequence
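# A small self-contained illustration (not library code) of the default
# reduction performed above: mask entries beyond each length, sum over the
# time dimension, then average over the batch dimension.
import torch

sequence = torch.tensor([[1., 1., 1.],
                         [2., 2., 2.]])
lengths = torch.tensor([3, 1])
mask = (torch.arange(3).unsqueeze(0) < lengths.unsqueeze(1)).float()
masked = sequence * mask             # [[1., 1., 1.], [2., 0., 0.]]
per_example = masked.sum(dim=1)      # tensor([3., 2.])  (sum_over_timesteps)
print(per_example.mean())            # tensor(2.5000)    (average_across_batch)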
def _dynamic_rnn_loop(cell: RNNCellBase[State], inputs: torch.Tensor, initial_state: State, sequence_length: torch.LongTensor) \ -> Tuple[torch.Tensor, State]: r"""Internal implementation of Dynamic RNN. Args: cell: An instance of RNNCell. inputs: A ``Tensor`` of shape ``[time, batch_size, input_size]``, or a nested tuple of such elements. initial_state: A ``Tensor`` of shape ``[batch_size, state_size]``, or if ``cell.state_size`` is a tuple, then this should be a tuple of tensors having shapes ``[batch_size, s]`` for ``s`` in ``cell.state_size``. sequence_length: (optional) An ``int32`` ``Tensor`` of shape ``[batch_size]``. Returns: Tuple ``(final_outputs, final_state)``. final_outputs: A ``Tensor`` of shape ``[time, batch_size, cell.output_size]``. If ``cell.output_size`` is a (possibly nested) tuple of ints or ``torch.Size`` objects, then this returns a (possibly nested) tuple of Tensors matching the corresponding shapes. final_state: A ``Tensor``, or possibly nested tuple of Tensors, matching in length and shapes to ``initial_state``. """ state = initial_state time_steps = inputs.shape[0] all_outputs = [] all_state = map_structure(lambda _: no_map(list), state) for i in range(time_steps): output, state = cell(inputs[i], state) all_outputs.append(output) map_structure_zip(lambda xs, x: xs.append(x), (all_state, state)) # TODO: Do not compute everything regardless of sequence_length final_outputs = torch.stack(all_outputs, dim=0) final_outputs = mask_sequences(final_outputs, sequence_length=sequence_length, time_major=True) final_state = map_structure(lambda _: no_map(list), state) # pylint: disable=cell-var-from-loop # Our use case is fine because the function is called immediately and # exclusively in the current iteration of the loop. for batch_idx, time_idx in enumerate(sequence_length.tolist()): if time_idx > 0: map_structure_zip( lambda xs, x: xs.append(x[time_idx - 1][batch_idx]), (final_state, all_state)) else: map_structure_zip(lambda xs, x: xs.append(x[batch_idx]), (final_state, initial_state)) # pylint: enable=cell-var-from-loop final_state = map_structure(lambda x: torch.stack(x, dim=0), final_state) return final_outputs, final_state
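# Illustrative sketch of the final-state gathering above: for each batch
# element, take the state recorded at step `length - 1`, falling back to the
# initial state for empty sequences.
import torch

all_states = torch.randn(5, 2, 4)        # [time, batch, hidden]
initial_state = torch.zeros(2, 4)
lengths = torch.tensor([3, 0])
final = torch.stack(
    [all_states[t - 1, b] if t > 0 else initial_state[b]
     for b, t in enumerate(lengths.tolist())], dim=0)
print(final.shape)                       # torch.Size([2, 4])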
def _build( self, # pylint: disable=arguments-differ memory=None, memory_sequence_length=None, memory_attention_bias=None, inputs=None, sequence_length=None, decoding_strategy='train_greedy', beam_width=None, length_penalty=0., start_tokens=None, end_token=None, context=None, context_sequence_length=None, softmax_temperature=None, max_decoding_length=None, impute_finished=False, helper=None, mode=None): """Performs decoding. The interface is very similar to that of RNN decoders (:meth:`texar.modules.RNNDecoderBase._build`). In particular, the function provides **3 ways** to specify the decoding method, with varying flexibility: 1. The :attr:`decoding_strategy` argument. - **"train_greedy"**: decoding in teacher-forcing fashion (i.e., feeding ground truth to decode the next step), and for each step sample is obtained by taking the `argmax` of logits. Argument :attr:`inputs` is required for this strategy. :attr:`sequence_length` is optional. - **"infer_greedy"**: decoding in inference fashion (i.e., feeding `generated` sample to decode the next step), and for each step sample is obtained by taking the `argmax` of logits. Arguments :attr:`(start_tokens, end_token)` are required for this strategy, and argument :attr:`max_decoding_length` is optional. - **"infer_sample"**: decoding in inference fashion, and for each step sample is obtained by `random sampling` from the logits. Arguments :attr:`(start_tokens, end_token)` are required for this strategy, and argument :attr:`max_decoding_length` is optional. This argument is used only when arguments :attr:`helper` and :attr:`beam_width` are both `None`. 2. The :attr:`helper` argument: An instance of subclass of :tf_main:`tf.contrib.seq2seq.Helper <contrib/seq2seq/Helper>`. This provides a superset of decoding strategies than above. The interface is the same as in RNN decoders. Please refer to :meth:`texar.modules.RNNDecoderBase._build` for detailed usage and examples. Note that, here, though using a :tf_main:`TrainingHelper <contrib/seq2seq/TrainingHelper>` corresponding to the "train_greedy" strategy above, the implementation is *slower* than directly setting `decoding_strategy="train_greedy"` (though the output results are the same). Argument :attr:`max_decoding_length` is optional. 3. **Beam search**: set :attr:`beam_width` to use beam search decoding. Arguments :attr:`(start_tokens, end_token)` are required, and argument :attr:`max_decoding_length` is optional. Args: memory (optional): The memory to attend, e.g., the output of an RNN encoder. A Tensor of shape `[batch_size, memory_max_time, dim]`. memory_sequence_length (optional): A Tensor of shape `[batch_size]` containing the sequence lengths for the batch entries in memory. Used to create attention bias of :attr:`memory_attention_bias` is not given. Ignored if `memory_attention_bias` is provided. memory_attention_bias (optional): A Tensor of shape `[batch_size, num_heads, memory_max_time, dim]`. An attention bias typically sets the value of a padding position to a large negative value for masking. If not given, :attr:`memory_sequence_length` is used to automatically create an attention bias. inputs (optional): Input tensor for teacher forcing decoding, of shape `[batch_size, target_max_time, emb_dim]` containing the target sequence word embeddings. Used when :attr:`decoding_strategy` is set to "train_greedy". sequence_length (optional): A Tensor of shape `[batch_size]`, containing the sequence length of :attr:`inputs`. Tokens beyond the respective sequence length are masked out. 
Used when :attr:`decoding_strategy` is set to "train_greedy". decoding_strategy (str): A string specifying the decoding strategy, including "train_greedy", "infer_greedy", "infer_sample". Different arguments are required based on the strategy. See above for details. Ignored if :attr:`beam_width` or :attr:`helper` is set. beam_width (int): Set to use beam search. If given, :attr:`decoding_strategy` is ignored. length_penalty (float): Length penalty coefficient used in beam search decoding. Refer to https://arxiv.org/abs/1609.08144 for more details. It Should be larger if longer sentences are wanted. start_tokens (optional): An int Tensor of shape `[batch_size]`, containing the start tokens. Used when :attr:`decoding_strategy` = "infer_greedy" or "infer_sample", or :attr:`beam_width` is set. Ignored when context is set. end_token (optional): An int 0D Tensor, the token that marks end of decoding. Used when :attr:`decoding_strategy` = "infer_greedy" or "infer_sample", or :attr:`beam_width` is set. context (optional): An int Tensor of shape `[batch_size, length]`, containing the starting tokens for decoding. If context is set, the start_tokens will be ignored. context_sequence_length (optional): specify the length of context. softmax_temperature (optional): A float 0D Tensor, value to divide the logits by before computing the softmax. Larger values (above 1.0) result in more random samples. Must > 0. If `None`, 1.0 is used. Used when :attr:`decoding_strategy` = "infer_sample"`. max_decoding_length (optional): An int scalar Tensor indicating the maximum allowed number of decoding steps. If `None` (default), use "max_decoding_length" defined in :attr:`hparams`. Ignored in "train_greedy" decoding. impute_finished (bool): If `True`, then states for batch entries which are marked as finished get copied through and the corresponding outputs get zeroed out. This causes some slowdown at each time step, but ensures that the final state and outputs have the correct values and that backprop ignores time steps that were marked as finished. Ignored in "train_greedy" decoding. helper (optional): An instance of :tf_main:`Helper <contrib/seq2seq/Helper>` that defines the decoding strategy. If given, :attr:`decoding_strategy` is ignored. mode (optional): A tensor taking value in :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`, including `TRAIN`, `EVAL`, and `PREDICT`. Controls dropout mode. If `None` (default), :func:`texar.global_mode` is used. Returns: - For **"train_greedy"** decoding, returns an instance of \ :class:`~texar.modules.TransformerDecoderOutput` which contains\ `sample_id` and `logits`. - For **"infer_greedy"** and **"infer_sample"** decoding or\ decoding with :attr:`helper`, returns\ a tuple `(outputs, sequence_lengths)`, where `outputs` is an \ instance of :class:`~texar.modules.TransformerDecoderOutput` as\ in "train_greedy", and `sequence_lengths` is a Tensor of shape\ `[batch_size]` containing the length of each sample. - For **beam search** decoding, returns a `dict` containing keys\ "sample_id" and "log_prob". - **"sample_id"** is an int Tensor of shape \ `[batch_size, max_time, beam_width]` containing generated\ token indexes. `sample_id[:,:,0]` is the highest-probable \ sample. - **"log_prob"** is a float Tensor of shape \ `[batch_size, beam_width]` containing the log probability \ of each sequence sample. 
""" if memory is not None: if memory_attention_bias is None: if memory_sequence_length is None: raise ValueError("`memory_sequence_length` is required if " "`memory_attention_bias` is not given.") enc_padding = 1 - tf.sequence_mask(memory_sequence_length, tf.shape(memory)[1], dtype=tf.float32) memory_attention_bias = attn.attention_bias_ignore_padding( enc_padding) # record the context, which will be used in step function # for dynamic_decode if context is not None: start_tokens = context[:, 0] self.context = context[:, 1:] self.context_sequence_length = context_sequence_length - 1 else: self.context = None if helper is None and beam_width is None and \ decoding_strategy == 'train_greedy': # Teacher-forcing if sequence_length is not None: inputs = mask_sequences(inputs, sequence_length, tensor_rank=3) decoder_self_attention_bias = (attn.attention_bias_lower_triangle( shape_list(inputs)[1])) if self._hparams.scale_embeds: target_inputs = inputs * self._hparams.dim**0.5 else: target_inputs = inputs _, lengths, _ = shape_list(target_inputs) positions = tf.expand_dims(tf.range(lengths, dtype=tf.int32), 0) pos_embeds = self.position_embedder(positions) inputs = target_inputs + pos_embeds decoder_output = self._self_attention_stack( inputs, memory, decoder_self_attention_bias=decoder_self_attention_bias, memory_attention_bias=memory_attention_bias, cache=None, mode=mode) logits = self.output_layer(decoder_output) preds = tf.to_int32(tf.argmax(logits, axis=-1)) rets = TransformerDecoderOutput(logits=logits, sample_id=preds) else: if max_decoding_length is None: max_decoding_length = self._hparams.max_decoding_length self._inputs_to_outputs = self._inputs_to_outputs_fn( max_decoding_length + 1) if beam_width is None: #Inference-like decoding # Prepare helper if helper is not None: # ignore `decoding_strategy` pass else: if decoding_strategy == "infer_greedy": helper = tf.contrib.seq2seq.GreedyEmbeddingHelper( self._embedding, start_tokens, end_token) elif decoding_strategy == "infer_sample": helper = tf.contrib.seq2seq.SampleEmbeddingHelper( self._embedding, start_tokens, end_token, softmax_temperature) else: raise ValueError( "Unknown decoding strategy: {}".format( decoding_strategy)) self._helper = helper self._cache = self._init_cache(memory, memory_attention_bias, beam_search_decoding=False) if context is not None: self.context = tf.pad( self.context, [[0, 0], [0, max_decoding_length - tf.shape(self.context)[1]]]) outputs, cache, sequence_lengths = dynamic_decode( decoder=self, impute_finished=impute_finished, maximum_iterations=max_decoding_length, output_time_major=False, scope=self.variable_scope) if context is not None: # Here the length of sample_id will be larger than that # of logit by 1, because there will be a additional # start_token in the returned sample_id. 
# the start_id should be the first token of the # given context outputs = TransformerDecoderOutput( logits=outputs.logits, sample_id=tf.concat([ tf.expand_dims(start_tokens, 1), outputs.sample_id ], axis=1)) sequence_lengths = sequence_lengths + 1 rets = outputs, sequence_lengths else: #Beam-search decoding # ignore `decoding_strategy` # assume `helper` is not set if helper is not None: raise ValueError("Must not set 'beam_width' and 'helper' " "simultaneously.") _batch_size = tf.shape(start_tokens)[0] self._cache = self._init_cache(memory, memory_attention_bias, beam_search_decoding=True, batch_size=_batch_size) # The output format is different when running beam search sample_id, log_prob = self._beam_decode( start_tokens, end_token, beam_width=beam_width, length_penalty=length_penalty, decode_length=max_decoding_length, ) rets = {'sample_id': sample_id, 'log_prob': log_prob} if not self._built: self._add_internal_trainable_variables() self._built = True return rets