    def _get_attn_bias_with_dropout(self, seq_len, uniform_low=0):
        """Builds the attention bias, randomly treating one extra position per
        sequence as padding ("token dropout") while training.

        With probability `noiser_ident_prob` a sequence is left untouched;
        otherwise a single position sampled uniformly from
        [uniform_low, seq_len) is masked out in addition to the real padding.
        """
        default_padding = model_utils.get_padding_by_seq_len(seq_len)

        # At inference time return the plain padding bias, with no dropout.
        if not graph_utils.is_training():
            attention_bias = model_utils.get_padding_bias(
                None, default_padding)
            return attention_bias, default_padding

        batch_size = tf.shape(default_padding)[0]
        max_seq_len = tf.shape(default_padding)[1]

        # Sample one candidate position per sequence and turn it into a
        # one-hot "extra padding" mask of shape [batch, max_seq_len].
        u = tfp.distributions.Uniform(low=uniform_low,
                                      high=tf.cast(seq_len, dtype=tf.float32))
        remove_indices = tf.cast(u.sample(), tf.int32)
        remove_padding = tf.one_hot(remove_indices,
                                    depth=max_seq_len,
                                    dtype=tf.float32)

        # With probability `noiser_ident_prob` the sequence stays identical,
        # i.e. the sampled position is *not* dropped.
        identical_prob = self.params.get('noiser_ident_prob', .99)
        b = tfp.distributions.Binomial(total_count=1, probs=identical_prob)
        identical_mask = b.sample(sample_shape=(batch_size, ))  # [batch_size]
        non_identical_mask = 1. - identical_mask
        non_identical_mask = tf.tile(tf.reshape(non_identical_mask, [-1, 1]),
                                     [1, max_seq_len])

        masked_remove_padding = non_identical_mask * remove_padding

        # [batch, length]: real padding plus the randomly dropped position.
        padding = default_padding + masked_remove_padding

        attention_bias = model_utils.get_padding_bias(None, padding=padding)

        return attention_bias, padding

    def call(self, seq, seq_len, **kwargs):
        inputs = self._prepare_inputs(seq)

        padding = model_utils.get_padding_by_seq_len(seq_len)
        attention_bias = model_utils.get_padding_bias(None, padding=padding)

        encoded = self.encoder(inputs, attention_bias, padding)

        return encoded, attention_bias
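
For reference, the padding-dropout idea in _get_attn_bias_with_dropout above can be reduced to a few lines. The sketch below is a standalone approximation, not the project's model_utils code: the helper name padding_dropout_bias is illustrative, and it assumes the usual bias convention of adding a large negative value at masked positions, broadcastable as [batch, 1, 1, length].

import tensorflow as tf
import tensorflow_probability as tfp

def padding_dropout_bias(seq_len, max_len, ident_prob=0.99, neg_inf=-1e9):
    """Attention bias that also masks one random position per sequence
    with probability (1 - ident_prob)."""
    # 1.0 at padded positions, 0.0 at real tokens: [batch, max_len]
    base_padding = 1.0 - tf.sequence_mask(seq_len, max_len, dtype=tf.float32)

    # Pick one real position per sequence as the dropout candidate.
    u = tfp.distributions.Uniform(low=0., high=tf.cast(seq_len, tf.float32))
    drop_idx = tf.cast(u.sample(), tf.int32)
    drop_padding = tf.one_hot(drop_idx, depth=max_len, dtype=tf.float32)

    # Keep the sequence identical with probability ident_prob.
    keep = tfp.distributions.Bernoulli(probs=ident_prob, dtype=tf.float32)
    apply_drop = 1.0 - keep.sample(tf.shape(seq_len))        # [batch]
    padding = base_padding + apply_drop[:, None] * drop_padding

    # Broadcastable over heads and query positions: [batch, 1, 1, max_len]
    return padding[:, None, None, :] * neg_inf
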
Example #3
    def call(self, src_word_ids, tgt_word_ids, insert_word_ids,
             common_word_ids, src_len, tgt_len, iw_len, cw_len, **kwargs):
        with tf.variable_scope('edit_encoder'):
            outputs = self.mev_extractor(src_word_ids, tgt_word_ids, src_len,
                                         tgt_len)
            cnx_tgt, tgt_attn_bias, pooled_src, micro_evs_st = outputs

            src_padding = model_utils.get_padding_by_seq_len(src_len)
            src_attn_bias = model_utils.get_padding_bias(None, src_padding)

            return tf.constant([[0.]]), (micro_evs_st, micro_evs_st,
                                         src_attn_bias), (tf.constant([[0.]]),
                                                          tf.constant([[0.]]),
                                                          tf.constant([[0.]]))
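
Example #3 leans on the same two helpers. For readers without the repository at hand, this is roughly what they are assumed to compute; the shape conventions follow the reference TF Transformer, and the real model_utils implementation may differ in details such as the exact negative constant.

import tensorflow as tf

_NEG_INF = -1e9  # assumed "minus infinity" used to switch attention off

def get_padding_by_seq_len(seq_len, max_len=None):
    # [batch, max_len] with 1.0 at padded positions and 0.0 at real tokens.
    return 1.0 - tf.sequence_mask(seq_len, maxlen=max_len, dtype=tf.float32)

def get_padding_bias(inputs, padding=None):
    # Either derive the padding mask from token ids (id 0 == PAD) or use the
    # mask that was passed in directly, as the examples above do.
    if padding is None:
        padding = tf.cast(tf.equal(inputs, 0), tf.float32)
    # [batch, 1, 1, length], broadcastable over heads and query positions.
    return tf.expand_dims(tf.expand_dims(padding * _NEG_INF, axis=1), axis=1)
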
Example #4
    def __call__(self, inputs, targets=None):
        """Calculate target logits or inferred target sequences.

        Args:
          inputs: int tensor with shape [batch_size, input_length].
          targets: None or int tensor with shape [batch_size, target_length].

        Returns:
          If targets is defined, then return logits for each word in the target
          sequence. float tensor with shape [batch_size, target_length, vocab_size]
          If targets is None, then generate output sequence one token at a time.
            returns a dictionary {
              output: [batch_size, decoded length]
              score: [batch_size, float]}
        """
        # Variance scaling is used here because it seems to work in many problems.
        # Other reasonable initializers may also work just as well.
        initializer = tf.variance_scaling_initializer(
            self.params["initializer_gain"],
            mode="fan_avg",
            distribution="uniform")
        with tf.variable_scope("Transformer", initializer=initializer):
            # Calculate attention bias for encoder self-attention and decoder
            # multi-headed attention layers.
            attention_bias = model_utils.get_padding_bias(inputs)

            # Run the inputs through the encoder layer to map the symbol
            # representations to continuous representations.
            encoder_outputs = self.encode(inputs, attention_bias)

            # Generate output sequence if targets is None, or return logits if target
            # sequence is known.
            if targets is None:
                return self.predict(encoder_outputs, attention_bias)
            else:
                logits = self.decode(targets, encoder_outputs, attention_bias)
                return logits
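
The docstring above implies two call modes. A hypothetical usage sketch follows; the constructor signature and the `params` object are assumptions for illustration and are not shown in the snippet itself.

import tensorflow as tf

# Hypothetical setup; `params` and the constructor arguments are assumptions.
model = Transformer(params)

inputs = tf.constant([[4, 7, 9, 2, 0, 0]])     # [batch_size, input_length]
targets = tf.constant([[5, 8, 3, 2, 0]])       # [batch_size, target_length]

# Training / scoring: logits for every target position.
logits = model(inputs, targets)                # [batch, target_length, vocab_size]

# Inference: autoregressive decoding, one token at a time.
result = model(inputs)                         # {"output": ..., "score": ...}
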
    def call(self, src, tgt, src_len, tgt_len, **kwargs):
        # Add [REMOVE] token to the beginning of the target sequence
        # [batch, length, hidden_size]
        embedded_tgt = self.embedding_layer(tgt)

        # [batch, length+1, hidden_size]
        extended_embedded_tgt = self._add_token_to_beginning(embedded_tgt, self.rm_tok_embedding)
        extended_embedded_tgt += model_utils.get_position_encoding(
            tf.shape(extended_embedded_tgt)[1],
            self.params.hidden_size
        )
        extended_tgt_len = tgt_len + 1

        if self.params.get('noiser_ident_prob', 1) < 1:
            extended_tgt_attention_bias, extended_tgt_padding = self._get_attn_bias_with_dropout(
                extended_tgt_len, uniform_low=1)
        else:
            extended_tgt_padding = model_utils.get_padding_by_seq_len(extended_tgt_len)
            extended_tgt_attention_bias = model_utils.get_padding_bias(None, extended_tgt_padding)

        # Add [CLS] token to the beginning of the source sequence
        # [batch, length, hidden_size]
        embedded_src = self.embedding_layer(src)

        # [batch, length+1, hidden_size]
        extended_embedded_src = self._add_cls_token(embedded_src)
        extended_embedded_src += model_utils.get_position_encoding(
            tf.shape(extended_embedded_src)[1],
            self.params.hidden_size
        )
        extended_src_len = src_len + 1

        if self.params.get('noiser_ident_prob', 1) < 1:
            extended_src_attention_bias, extended_src_padding = self._get_attn_bias_with_dropout(
                extended_src_len, uniform_low=1)
        else:
            extended_src_padding = model_utils.get_padding_by_seq_len(extended_src_len)
            extended_src_attention_bias = model_utils.get_padding_bias(None, extended_src_padding)

        # Encode Target
        # [batch, length+1, hidden_size]
        encoded_tgt = self._encode_tgt(extended_embedded_tgt, extended_tgt_padding, extended_tgt_attention_bias)

        # Decode source using the encoded target
        # [batch, length+1, hidden_size]
        decoder_output = self._decode_micro_edit_vectors(extended_embedded_src, extended_src_padding,
                                                         extended_src_attention_bias,
                                                         encoded_tgt, extended_tgt_attention_bias)

        if not graph_utils.is_training() and self.params.save_attentions:
            tf.add_to_collection('TransformerMicroEditExtractor_Attentions', [
                self.target_encoder.self_attn_alignment_history,
                self.mev_decoder.self_attn_alignment_history,
                self.mev_decoder.enc_dec_attn_alignment_history,
            ])

        with tf.name_scope("pooler"):
            # We "pool" the model by simply taking the hidden state corresponding
            # to the first token.
            first_token_tensor = tf.squeeze(decoder_output[:, 0:1, :], axis=1)
            pooled = self.pooling_layer(first_token_tensor)

        # [batch, length, hidden_size]
        micro_ev = self.mev_projection(decoder_output[:, 1:, :])

        return encoded_tgt[:, 1:, :], extended_tgt_attention_bias[:, :, :, 1:], pooled, micro_ev
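
As a shape sanity check on the values returned above, assuming the usual [batch, 1, 1, length] layout for the attention bias; the concrete tensors here are placeholders, not the model's real outputs.

import tensorflow as tf

batch, length, hidden = 2, 5, 8

# Placeholders standing in for the real encoder/decoder outputs.
encoded_tgt = tf.zeros([batch, length + 1, hidden])     # includes the [REMOVE] slot
tgt_bias = tf.zeros([batch, 1, 1, length + 1])          # bias over the extended target
decoder_output = tf.zeros([batch, length + 1, hidden])  # includes the [CLS] slot

pooled = decoder_output[:, 0, :]                        # [batch, hidden], the [CLS] state
micro_ev = decoder_output[:, 1:, :]                     # [batch, length, hidden]

# Slicing off the prepended token keeps outputs and bias aligned:
assert encoded_tgt[:, 1:, :].shape == (batch, length, hidden)
assert tgt_bias[:, :, :, 1:].shape == (batch, 1, 1, length)
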