def _self_attention_stack(self,
                          inputs,
                          memory,
                          decoder_self_attention_bias=None,
                          memory_attention_bias=None,
                          cache=None,
                          mode=None):
    """Stacked multihead attention module."""
    inputs = tf.layers.dropout(inputs,
                               rate=self._hparams.embedding_dropout,
                               training=is_train_mode(mode))
    if cache is not None:
        memory_attention_bias = cache['memory_attention_bias']
    else:
        assert decoder_self_attention_bias is not None

    x = inputs
    for i in range(self._hparams.num_blocks):
        layer_name = 'layer_{}'.format(i)
        layer_cache = cache[layer_name] if cache is not None else None
        with tf.variable_scope(layer_name):
            with tf.variable_scope("self_attention"):
                selfatt_output = attn.multihead_attention(
                    queries=layers.layer_normalize(x),
                    memory=None,
                    memory_attention_bias=decoder_self_attention_bias,
                    num_units=self._hparams.dim,
                    num_heads=self._hparams.num_heads,
                    dropout_rate=self._hparams.attention_dropout,
                    cache=layer_cache,
                    scope="multihead_attention",
                )
                x = x + tf.layers.dropout(
                    selfatt_output,
                    rate=self._hparams.residual_dropout,
                    training=is_train_mode(mode),
                )
            if memory is not None:
                with tf.variable_scope('encdec_attention'):
                    encdec_output = attn.multihead_attention(
                        queries=layers.layer_normalize(x),
                        memory=memory,
                        memory_attention_bias=memory_attention_bias,
                        num_units=self._hparams.dim,
                        num_heads=self._hparams.num_heads,
                        dropout_rate=self._hparams.attention_dropout,
                        scope="multihead_attention")
                    x = x + tf.layers.dropout(
                        encdec_output,
                        rate=self._hparams.residual_dropout,
                        training=is_train_mode(mode))
            poswise_network = FeedForwardNetwork(
                hparams=self._hparams['poswise_feedforward'])
            with tf.variable_scope(poswise_network.variable_scope):
                sub_output = tf.layers.dropout(
                    poswise_network(layers.layer_normalize(x)),
                    rate=self._hparams.residual_dropout,
                    training=is_train_mode(mode),
                )
                x = x + sub_output

    return layers.layer_normalize(x)
def _self_attention_stack(self,
                          inputs,
                          memory,
                          decoder_self_attention_bias=None,
                          memory_attention_bias=None,
                          cache=None,
                          mode=None):
    """Stacked multihead attention module."""
    inputs = tf.layers.dropout(inputs,
                               rate=self._hparams.embedding_dropout,
                               training=is_train_mode(mode))
    if cache is not None:
        memory_attention_bias = cache['memory_attention_bias']
    else:
        assert decoder_self_attention_bias is not None

    x = inputs
    for i in range(self._hparams.num_blocks):
        layer_name = 'layer_{}'.format(i)
        layer_cache = cache[layer_name] if cache is not None else None
        with tf.variable_scope(layer_name):
            with tf.variable_scope("self_attention"):
                multihead_attention = \
                    self.multihead_attentions['self_att'][i]
                selfatt_output = multihead_attention(
                    queries=layers.layer_normalize(x),
                    memory=None,
                    memory_attention_bias=decoder_self_attention_bias,
                    cache=layer_cache,
                    mode=mode,
                )
                x = x + tf.layers.dropout(
                    selfatt_output,
                    rate=self._hparams.residual_dropout,
                    training=is_train_mode(mode),
                )
            if memory is not None:
                with tf.variable_scope('encdec_attention'):
                    multihead_attention = \
                        self.multihead_attentions['encdec_att'][i]
                    encdec_output = multihead_attention(
                        queries=layers.layer_normalize(x),
                        memory=memory,
                        memory_attention_bias=memory_attention_bias,
                        mode=mode,
                    )
                    x = x + tf.layers.dropout(
                        encdec_output,
                        rate=self._hparams.residual_dropout,
                        training=is_train_mode(mode))
            poswise_network = self.poswise_networks[i]
            with tf.variable_scope('past_poswise_ln'):
                sub_output = tf.layers.dropout(
                    poswise_network(layers.layer_normalize(x)),
                    rate=self._hparams.residual_dropout,
                    training=is_train_mode(mode),
                )
                x = x + sub_output

    return layers.layer_normalize(x)
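# Hedged sketch (not part of the original module): how a causal
# `decoder_self_attention_bias`, as consumed by the stacks above, is
# typically built. Each position may attend only to itself and earlier
# positions, so a large negative value is added to the upper triangle of
# the attention logits before softmax. The bias value (-1e9) and the
# broadcast shape [1, 1, length, length] follow the tensor2tensor
# convention; the actual texar helper
# (attn.attention_bias_lower_triangle) may differ in details.
import tensorflow as tf

def causal_self_attention_bias(length):
    # Lower-triangular matrix of ones: entry (i, j) == 1 iff j <= i.
    lower_triangle = tf.matrix_band_part(
        tf.ones([length, length]), -1, 0)
    # 0 where attention is allowed, -1e9 where it must be blocked.
    bias = -1e9 * (1.0 - lower_triangle)
    # Add singleton batch and head axes for broadcasting against
    # attention logits of shape [batch, heads, len_q, len_k].
    return tf.reshape(bias, [1, 1, length, length])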
def _build(self, inputs, sequence_length, mode=None):
    """Encodes the inputs.

    Args:
        inputs: A 3D Tensor of shape `[batch_size, max_time, dim]`,
            containing the word embeddings of input sequences. Note that
            the embedding dimension `dim` must equal "dim" in
            :attr:`hparams`.
        sequence_length: A 1D Tensor of shape `[batch_size]`. Input tokens
            beyond respective sequence lengths are masked out
            automatically.
        mode (optional): A tensor taking value in
            :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`,
            including `TRAIN`, `EVAL`, and `PREDICT`. Used to toggle
            dropout. If `None` (default), :func:`texar.global_mode` is
            used.

    Returns:
        A Tensor of shape `[batch_size, max_time, dim]` containing the
        encoded vectors.
    """
    # Multiply input embedding with the sqrt of its dimension for
    # normalization
    if not self._hparams.use_bert_config:
        inputs = inputs * self._hparams.dim**0.5
    inputs = mask_sequences(inputs, sequence_length, tensor_rank=3)

    _, lengths, _ = shape_list(inputs)

    inputs_padding = 1 - tf.sequence_mask(
        sequence_length, tf.shape(inputs)[1], dtype=tf.float32)
    if self._hparams.use_bert_config:
        ignore_padding = attn.attention_bias_ignore_padding(
            inputs_padding, bias_value=-1e4)
    else:
        ignore_padding = attn.attention_bias_ignore_padding(inputs_padding)
    encoder_self_attention_bias = ignore_padding

    positions = tf.expand_dims(tf.range(lengths, dtype=tf.int32), 0)
    pos_embeds = self.position_embedder(positions)
    input_embedding = inputs + pos_embeds

    if self._hparams.use_bert_config:
        x = layers.layer_normalize(input_embedding)
        x = tf.layers.dropout(x,
                              rate=self._hparams.embedding_dropout,
                              training=is_train_mode(mode))
    else:
        x = tf.layers.dropout(input_embedding,
                              rate=self._hparams.embedding_dropout,
                              training=is_train_mode(mode))

    # Just to keep consistent with BERT; actually makes no difference
    if self._hparams.use_bert_config:
        pad_remover = None
    else:
        pad_remover = utils.transformer_utils.PadRemover(inputs_padding)

    for i in range(self._hparams.num_blocks):
        with tf.variable_scope("layer_{}".format(i)):
            multihead_attention = self.multihead_attention_list[i]
            # Trivial difference between BERT and the original Transformer
            if self._hparams.use_bert_config:
                _queries_input = x
            else:
                _queries_input = layers.layer_normalize(x)
            attention_output = multihead_attention(
                queries=_queries_input,
                memory=_queries_input,
                memory_attention_bias=encoder_self_attention_bias,
                mode=mode,
            )
            attention_output = tf.layers.dropout(
                attention_output,
                rate=self._hparams.residual_dropout,
                training=is_train_mode(mode),
            )
            x = x + attention_output
            with tf.variable_scope('output'):
                if self._hparams.use_bert_config:
                    x = layers.layer_normalize(x)
                    y = x
                else:
                    y = layers.layer_normalize(x)
            poswise_network = self.poswise_networks[i]
            with tf.variable_scope(poswise_network.variable_scope):
                original_shape = shape_list(y)
                y = tf.reshape(y, [-1, self._hparams.dim])
                if pad_remover:
                    # [1, batch_size*seq_length, hidden_dim]
                    y = tf.expand_dims(pad_remover.remove(y), axis=0)
                layer_output = poswise_network(y, mode=mode)
                sub_output = tf.layers.dropout(
                    layer_output,
                    rate=self._hparams.residual_dropout,
                    training=is_train_mode(mode))
                if pad_remover:
                    sub_output = tf.reshape(
                        pad_remover.restore(tf.squeeze(sub_output, axis=0)),
                        original_shape)
                else:
                    sub_output = tf.reshape(sub_output, original_shape)
                x = x + sub_output
                if self._hparams.use_bert_config:
                    x = layers.layer_normalize(x)

    if not self._hparams.use_bert_config:
        x = layers.layer_normalize(x)

    if not self._built:
        self._add_internal_trainable_variables()
        self._built = True

    return x
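# Hedged sketch (illustration only): the padding bias built above maps a
# {0, 1} padding indicator to an additive term on the attention logits,
# so padded positions receive (near-)zero attention weight after softmax.
# The -1e9 default and the [batch, 1, 1, length] broadcast shape follow
# the tensor2tensor convention; the BERT branch above passes -1e4 as
# bias_value instead.
import tensorflow as tf

def attention_bias_ignore_padding_sketch(memory_padding, bias_value=-1e9):
    # memory_padding: float Tensor [batch_size, length], 1.0 at PAD slots.
    bias = memory_padding * bias_value
    # Insert singleton head and query-position axes for broadcasting
    # against attention logits of shape [batch, heads, len_q, len_k].
    return tf.expand_dims(tf.expand_dims(bias, axis=1), axis=1)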
def _layer_norm(x, scope):
    return layers.layer_normalize(x, reuse=tf.AUTO_REUSE, scope=scope)
def _build(self, inputs, sequence_length, mode=None):
    """Encodes the inputs.

    Args:
        inputs: A 3D Tensor of shape `[batch_size, max_time, dim]`,
            containing the word embeddings of input sequences. Note that
            the embedding dimension `dim` must equal "dim" in
            :attr:`hparams`.
        sequence_length: A 1D Tensor of shape `[batch_size]`. Input tokens
            beyond respective sequence lengths are masked out
            automatically.
        mode (optional): A tensor taking value in
            :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`,
            including `TRAIN`, `EVAL`, and `PREDICT`. Used to toggle
            dropout. If `None` (default), :func:`texar.global_mode` is
            used.

    Returns:
        A Tensor of shape `[batch_size, max_time, dim]` containing the
        encoded vectors.
    """
    # Multiply input embedding with the sqrt of its dimension for
    # normalization
    inputs = inputs * self._hparams.dim**0.5
    inputs = mask_sequences(inputs, sequence_length, tensor_rank=3)

    _, lengths, _ = shape_list(inputs)

    inputs_padding = 1 - tf.sequence_mask(
        sequence_length, tf.shape(inputs)[1], dtype=tf.float32)
    ignore_padding = attn.attention_bias_ignore_padding(inputs_padding)
    encoder_self_attention_bias = ignore_padding

    pos_embeds = self.position_embedder(lengths, self._hparams.dim)
    input_embedding = inputs + pos_embeds

    x = tf.layers.dropout(input_embedding,
                          rate=self._hparams.embedding_dropout,
                          training=is_train_mode(mode))
    pad_remover = utils.transformer_utils.PadRemover(inputs_padding)

    for i in range(self._hparams.num_blocks):
        with tf.variable_scope("layer_{}".format(i)):
            with tf.variable_scope('self_attention'):
                selfatt_output = attn.multihead_attention(
                    queries=layers.layer_normalize(x),
                    memory=None,
                    memory_attention_bias=encoder_self_attention_bias,
                    num_heads=self._hparams.num_heads,
                    dropout_rate=self._hparams.attention_dropout,
                    num_units=self._hparams.dim,
                    scope='multihead_attention')
                x = x + tf.layers.dropout(
                    selfatt_output,
                    rate=self._hparams.residual_dropout,
                    training=is_train_mode(mode),
                )
            poswise_network = FeedForwardNetwork(
                hparams=self._hparams['poswise_feedforward'])
            with tf.variable_scope(poswise_network.variable_scope):
                y = layers.layer_normalize(x)
                original_shape = shape_list(y)
                y = tf.reshape(y, [-1, self._hparams.dim])
                # [1, batch_size*seq_length, hidden_dim]
                y = tf.expand_dims(pad_remover.remove(y), axis=0)
                sub_output = tf.layers.dropout(
                    poswise_network(y),
                    rate=self._hparams.residual_dropout,
                    training=is_train_mode(mode))
                sub_output = tf.reshape(
                    pad_remover.restore(tf.squeeze(sub_output, axis=0)),
                    original_shape)
                x = x + sub_output

    encoder_output = layers.layer_normalize(x)

    if not self._built:
        self._add_internal_trainable_variables()
        self._built = True

    return encoder_output
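# Hedged sketch (illustration only): what the PadRemover round trip above
# does conceptually. Padded time steps are dropped before the (expensive)
# position-wise feed-forward layer and re-inserted as zeros afterwards,
# so the FFN only processes real tokens. The actual texar PadRemover
# honors the same remove/restore contract, but its internals may differ.
import tensorflow as tf

class PadRemoverSketch(object):
    def __init__(self, pad_mask):
        # pad_mask: float Tensor [batch_size, length], 1.0 at PAD slots.
        flat_mask = tf.reshape(pad_mask, [-1])
        # Indices of non-pad positions, shape [num_nonpad, 1].
        self._nonpad_ids = tf.to_int32(tf.where(flat_mask < 1e-9))
        self._flat_len = tf.shape(flat_mask)[0]

    def remove(self, x):
        # x: [batch_size * length, dim] -> [num_nonpad, dim]
        return tf.gather_nd(x, self._nonpad_ids)

    def restore(self, x):
        # Scatter non-pad rows back into place; padded rows become zeros.
        out_shape = tf.concat(
            [tf.expand_dims(self._flat_len, 0), tf.shape(x)[1:]], axis=0)
        return tf.scatter_nd(self._nonpad_ids, x, out_shape)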
def _build(self,
           inputs,
           memory,
           sequence_length,
           memory_sequence_length,
           adjs,
           encoder_output,
           mode=None):
    """Encodes the inputs.

    Args:
        inputs: A 3D Tensor of shape `[batch_size, max_time, dim]`,
            containing the embedding of input sequences. Note that the
            embedding dimension `dim` must equal "dim" in :attr:`hparams`.
            The input embedding is typically an aggregation of word
            embedding and position embedding.
        memory: A 3D Tensor of shape `[batch_size, memory_max_time, dim]`,
            containing the embedding of memory sequences. Note that the
            embedding dimension `dim` must equal "dim" in :attr:`hparams`.
            The input embedding is typically an aggregation of word
            embedding and position embedding.
        sequence_length: A 1D Tensor of shape `[batch_size]`. Input tokens
            beyond respective sequence lengths are masked out
            automatically.
        memory_sequence_length: A 1D Tensor of shape `[batch_size]`.
            Memory tokens beyond respective sequence lengths are masked
            out automatically.
        adjs: A 3D Tensor of shape `[batch_size, max_time, max_time]`,
            containing the adjacency matrices of input sequences.
        encoder_output: bool. If `True`, returns encoder-like embeddings;
            otherwise returns a
            `CrossGraphTransformerFixedLengthDecoderOutput`.
        mode (optional): A tensor taking value in
            :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`,
            including `TRAIN`, `EVAL`, and `PREDICT`. Used to toggle
            dropout. If `None` (default), :func:`texar.global_mode` is
            used.

    Returns:
        A Tensor of shape `[batch_size, max_time, dim]` containing the
        encoded vectors if `encoder_output` is `True`, otherwise a
        `CrossGraphTransformerFixedLengthDecoderOutput`.
    """
    # Get adjacency masks from adjs
    adj_masks = 1 - tf.cast(tf.equal(adjs, 0), dtype=tf.float32)

    # Multiply input embedding with the sqrt of its dimension for
    # normalization
    inputs_padding = 1 - tf.sequence_mask(
        sequence_length, tf.shape(inputs)[1], dtype=tf.float32)
    if self._hparams.use_bert_config:
        ignore_padding = attn.attention_bias_ignore_padding(
            inputs_padding, bias_value=-1e4)
    else:
        ignore_padding = attn.attention_bias_ignore_padding(inputs_padding)
    encoder_self_attention_bias = ignore_padding

    input_embedding = inputs  # shape (batch_size, max_time, dim)

    if self._hparams.use_bert_config:
        x = layers.layer_normalize(input_embedding)
        x = tf.layers.dropout(x,
                              rate=self._hparams.embedding_dropout,
                              training=is_train_mode(mode))
    else:
        x = tf.layers.dropout(input_embedding,
                              rate=self._hparams.embedding_dropout,
                              training=is_train_mode(mode))

    # Just to keep consistent with BERT; actually makes no difference
    if self._hparams.use_bert_config:
        pad_remover = None
    else:
        pad_remover = utils.transformer_utils.PadRemover(inputs_padding)

    for i in range(self._hparams.num_blocks):
        with tf.variable_scope("layer_{}".format(i)):
            graph_multihead_attention = \
                self.graph_multihead_attention_list[i]
            # Trivial difference between BERT and the original Transformer
            if self._hparams.use_bert_config:
                _queries_input = x
            else:
                _queries_input = layers.layer_normalize(x)
            attention_output = graph_multihead_attention(
                queries=_queries_input,
                memory=memory,
                adj_masks=adj_masks,
                memory_attention_bias=encoder_self_attention_bias,
                mode=mode,
            )
            attention_output = tf.layers.dropout(
                attention_output,
                rate=self._hparams.residual_dropout,
                training=is_train_mode(mode),
            )
            # attention_output: weighted sum of V of memory with weights
            # determined by querying keys of memory
            x = x + attention_output
            with tf.variable_scope('output'):
                if self._hparams.use_bert_config:
                    x = layers.layer_normalize(x)
                    y = x
                else:
                    y = layers.layer_normalize(x)
            poswise_network = self.poswise_networks[i]
            with tf.variable_scope(poswise_network.variable_scope):
                original_shape = shape_list(y)
                y = tf.reshape(y, [-1, self._hparams.dim])
                if pad_remover:
                    # [1, batch_size*seq_length, hidden_dim]
                    y = tf.expand_dims(pad_remover.remove(y), axis=0)
                layer_output = poswise_network(y, mode=mode)
                sub_output = tf.layers.dropout(
                    layer_output,
                    rate=self._hparams.residual_dropout,
                    training=is_train_mode(mode))
                if pad_remover:
                    sub_output = tf.reshape(
                        pad_remover.restore(tf.squeeze(sub_output, axis=0)),
                        original_shape)
                else:
                    sub_output = tf.reshape(sub_output, original_shape)
                x = x + sub_output
                if self._hparams.use_bert_config:
                    x = layers.layer_normalize(x)

    if not self._hparams.use_bert_config:
        x = layers.layer_normalize(x)

    if not self._built:
        self._add_internal_trainable_variables()
        self._built = True

    if encoder_output:
        return x

    logits = self._output_layer(x)
    sample_ids = tf.to_int32(tf.argmax(logits, axis=-1))
    probs = ''
    # probs = GumbelSoftmax(self._tau, logits=logits).sample()
    # probs = tf.nn.softmax(logits / self._tau)  # vanilla softmax
    rets = CrossGraphTransformerFixedLengthDecoderOutput(
        logits=logits, sample_id=sample_ids, probs=probs)

    return rets
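# Hedged sketch (illustration only): one way the `adj_masks` computed
# above can gate attention in a graph-aware attention layer. Entry (i, j)
# of the mask is 1 when nodes i and j are connected in `adjs`, so
# multiplying the post-softmax attention weights by the mask zeroes out
# attention between unconnected nodes. How the actual
# graph_multihead_attention applies the mask may differ (e.g. additive
# masking on the logits before softmax).
import tensorflow as tf

adjs = tf.constant([[[1., 1., 0.],
                     [1., 1., 1.],
                     [0., 1., 1.]]])             # [batch=1, 3, 3]
adj_masks = 1 - tf.cast(tf.equal(adjs, 0), tf.float32)

attention_weights = tf.nn.softmax(tf.random_normal([1, 3, 3]), axis=-1)
masked_weights = attention_weights * adj_masks   # zero where no edge
# Renormalize each row to sum to 1 again (assumes every node has at
# least one edge, so no row is all zeros).
masked_weights /= tf.reduce_sum(masked_weights, axis=-1, keepdims=True)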
def _build(self, inputs, mode=None):
    """Encodes the inputs with transformer encoder.

    Args:
        inputs: A 2D Tensor of shape `[batch_size, max_time]`.
        mode (optional): A tensor taking value in
            :tf_main:`tf.estimator.ModeKeys <estimator/ModeKeys>`.
    """
    # pylint: disable=too-many-locals
    encoder_padding = tf.to_float(tf.equal(inputs, 0))
    self.enc = tf.nn.embedding_lookup(self._embedding, inputs)
    _, _, channels = shape_list(self.enc)
    if self._hparams.multiply_embedding_mode == 'sqrt_depth':
        self.enc = self.enc * channels**0.5

    ignore_padding = attentions.attention_bias_ignore_padding(
        encoder_padding)
    encoder_self_attention_bias = ignore_padding
    encoder_decoder_attention_bias = ignore_padding

    if self.target_symbol_embedding:
        emb_target_space = tf.reshape(
            self.target_symbol_embedding, [1, 1, -1])
        self.enc = self.enc + emb_target_space

    lengths = shape_list(self.enc)[1]
    channels = shape_list(self.enc)[2]
    pos_embeds = self.position_embedder(lengths, channels)
    input_embedding = self.enc + pos_embeds

    x = tf.layers.dropout(input_embedding,
                          rate=self._hparams.embedding_dropout,
                          training=context.global_mode_train())
    pad_remover = utils.transformer_utils.PadRemover(encoder_padding)

    for i in range(self._hparams.num_blocks):
        with tf.variable_scope("layer_{}".format(i)):
            with tf.variable_scope('self_attention'):
                selfatt_output = attentions.multihead_attention(
                    queries=layers.layer_normalize(x),
                    memory=None,
                    memory_attention_bias=encoder_self_attention_bias,
                    num_heads=self._hparams.num_heads,
                    dropout_rate=self._hparams.attention_dropout,
                    num_units=self._hparams.num_units,
                    scope='multihead_attention')
                x = x + tf.layers.dropout(
                    selfatt_output,
                    rate=self._hparams.residual_dropout,
                    training=context.global_mode_train())
            poswise_network = FeedForwardNetwork(
                hparams=self._hparams['poswise_feedforward'])
            with tf.variable_scope(poswise_network.variable_scope):
                y = layers.layer_normalize(x)
                original_shape = shape_list(y)
                y = tf.reshape(y, [-1, self._hparams.num_units])
                # [1, batch_size*seq_length, hidden_dim]
                y = tf.expand_dims(pad_remover.remove(y), axis=0)
                sub_output = tf.layers.dropout(
                    poswise_network(y),
                    rate=self._hparams.residual_dropout,
                    training=context.global_mode_train())
                sub_output = tf.reshape(
                    pad_remover.restore(tf.squeeze(sub_output, axis=0)),
                    original_shape)
                x = x + sub_output

    self.stack_output = x
    encoder_output = layers.layer_normalize(x)

    if not self._built:
        self._add_internal_trainable_variables()
        self._built = True

    return encoder_output, encoder_decoder_attention_bias
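# Hedged sketch (illustration only): a typical implementation of the
# `shape_list` helper used throughout these modules, as popularized by
# tensor2tensor. It returns static dimensions where they are known at
# graph-construction time and dynamic `tf.shape` values otherwise, which
# is why it can be unpacked like `_, _, channels = shape_list(x)` above.
# The actual texar helper may differ in details; assumes the rank of `x`
# is known.
import tensorflow as tf

def shape_list_sketch(x):
    x = tf.convert_to_tensor(x)
    static = x.get_shape().as_list()
    dynamic = tf.shape(x)
    # Fall back to the dynamic dimension wherever the static one is None.
    return [dynamic[i] if dim is None else dim
            for i, dim in enumerate(static)]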