Example #1
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.config = Config.merge_to_new(
            [config.editor.transformer, config.editor.decoder])

        # Change the transformer's hidden_size to match the augmented input
        self.config.put('orig_hidden_size', self.config.hidden_size)
        self.config.put(
            'hidden_size',
            self.config.hidden_size + config.editor.edit_encoder.edit_dim)

        # Project embedding to transformer's hidden_size if needed
        embedding_layer = EmbeddingSharedWeights.get_from_graph()
        self.vocab_size = embedding_layer.vocab_size
        if config.editor.word_dim != self.config.orig_hidden_size:
            self.embedding_layer = embedding_layer.get_projected(
                self.config.orig_hidden_size)
        else:
            self.embedding_layer = embedding_layer

        # Since the EmbeddingSharedWeights class supports a linear projection over
        # embeddings, we reuse it to compute the model's logits.
        self.project_back = tf.layers.Dense(config.editor.word_dim,
                                            activation=None,
                                            name='project_back')
        self.vocab_projection = embedding_layer

        # Transformer stack
        self.decoder_stack = MultiSourceDecoderStack(self.config.to_json(),
                                                     graph_utils.is_training())
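
A hedged sketch of how the widened hidden_size in Example #1 would be used at the output end: the decoder runs on inputs augmented with an edit vector of size edit_dim, project_back maps its states back to word_dim, and the shared embedding table produces the vocabulary logits. All shapes below are made up, word_dim is assumed equal to orig_hidden_size for simplicity, and the transposed-matmul step is an assumption about what EmbeddingSharedWeights does with its [vocab_size, word_dim] table.

import tensorflow as tf

batch, length, word_dim, edit_dim, vocab_size = 8, 20, 512, 64, 32000

# Decoder states live in the widened space (orig_hidden_size + edit_dim).
decoder_out = tf.placeholder(tf.float32, [batch, length, word_dim + edit_dim])

# Project back to word_dim ...
project_back = tf.layers.Dense(word_dim, activation=None, name='project_back')
projected = project_back(decoder_out)                           # [batch, length, word_dim]

# ... then score against the shared embedding table to get vocabulary logits.
shared_embedding = tf.get_variable('shared_embedding', [vocab_size, word_dim])
logits = tf.einsum('blh,vh->blv', projected, shared_embedding)  # [batch, length, vocab_size]
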
Example #2
    def _get_attn_bias_with_dropout(self, seq_len, uniform_low=0):
        default_padding = model_utils.get_padding_by_seq_len(seq_len)

        if not graph_utils.is_training():
            attention_bias = model_utils.get_padding_bias(
                None, default_padding)
            return attention_bias, default_padding

        batch_size = tf.shape(default_padding)[0]
        max_seq_len = tf.shape(default_padding)[1]

        u = tfp.distributions.Uniform(low=uniform_low,
                                      high=tf.cast(seq_len, dtype=tf.float32))
        remove_indices = tf.cast(u.sample(), tf.int32)
        remove_padding = tf.one_hot(remove_indices,
                                    depth=max_seq_len,
                                    dtype=tf.float32)

        identical_prob = self.params.get('noiser_ident_prob', .99)
        b = tfp.distributions.Binomial(total_count=1, probs=identical_prob)
        non_identical_mask = b.sample(
            sample_shape=(batch_size, ))  # [batch_size]
        non_identical_mask = tf.tile(tf.reshape(non_identical_mask, [-1, 1]),
                                     [1, max_seq_len])
        non_identical_mask = 1. - non_identical_mask

        masked_remove_padding = non_identical_mask * remove_padding

        # [batch, length]
        padding = default_padding + masked_remove_padding

        attention_bias = model_utils.get_padding_bias(None, padding=padding)

        return attention_bias, padding
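
At training time the method above drops one randomly chosen in-sequence position from the attention mask, and does so only for a fraction of examples (with probability 1 - noiser_ident_prob). A NumPy re-implementation of just that masking logic, for illustration; the model_utils helpers are replaced here by an explicit padding computation, which is an assumption rather than the project's code.

import numpy as np

def dropout_padding(seq_len, max_len, ident_prob=0.99, uniform_low=0, seed=None):
    rng = np.random.default_rng(seed)
    batch = seq_len.shape[0]
    # 1.0 marks padded positions past each sequence's length (the "default padding").
    base = (np.arange(max_len)[None, :] >= seq_len[:, None]).astype(np.float32)
    # Pick one in-sequence position per example, like the Uniform sample above.
    remove_idx = rng.integers(uniform_low, seq_len)             # [batch]
    remove = np.eye(max_len, dtype=np.float32)[remove_idx]      # [batch, max_len] one-hot
    # Apply the removal per example only with probability 1 - ident_prob.
    apply_removal = (rng.random(batch) >= ident_prob).astype(np.float32)
    padding = base + apply_removal[:, None] * remove
    # The attention bias would then be padding * -1e9, broadcast over heads/queries.
    return padding

print(dropout_padding(np.array([4, 6]), max_len=6, ident_prob=0.5, seed=0))
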
Example #3
    def __init__(self, config, **kwargs):
        config.put('editor.edit_encoder.edit_dim', 0)
        super().__init__(config, **kwargs)

        # Transformer stack
        del self.decoder_stack
        self.decoder_stack = StraightAttentionDecoderStack(
            self.config.to_json(), graph_utils.is_training())
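
Note the pattern in Example #3: the parent constructor runs with editor.edit_encoder.edit_dim forced to 0, so hidden_size is not widened, and the decoder stack the parent built is then discarded with del and replaced by a StraightAttentionDecoderStack constructed from the same merged config.
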
Example #4
    def __init__(self, config, embedding_layer=None, **kwargs):
        super().__init__(**kwargs)
        self.config = config

        if embedding_layer is None:
            embedding_layer = EmbeddingSharedWeights.get_from_graph()

        if embedding_layer.word_dim != config.hidden_size:
            self.embedding_layer = embedding_layer.get_projected(config.hidden_size)
        else:
            self.embedding_layer = embedding_layer

        self.encoder = EncoderStack(config.to_json(), graph_utils.is_training())
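
get_projected is not shown in these examples. A speculative sketch of what it might return, assuming it wraps the shared embedding lookup with a learned linear map from word_dim to hidden_size; the class and attribute names below are illustrative, not the project's API.

import tensorflow as tf

class ProjectedEmbedding(object):
    """Hypothetical wrapper: embedding lookup followed by a dense projection."""

    def __init__(self, embedding_layer, hidden_size):
        self.embedding_layer = embedding_layer
        self.projection = tf.layers.Dense(hidden_size, use_bias=False,
                                          name='embedding_projection')

    def __call__(self, token_ids, *args, **kwargs):
        # [batch, length, word_dim] -> [batch, length, hidden_size]
        return self.projection(self.embedding_layer(token_ids, *args, **kwargs))
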
Example #5
    def call(self, src, tgt=None, src_len=None, tgt_len=None, **kwargs):
        assert src is not None \
               and tgt is not None \
               and src_len is not None \
               and tgt_len is not None

        initializer = tf.variance_scaling_initializer(
            self.params.initializer_gain,
            mode="fan_avg",
            distribution="uniform")

        with tf.variable_scope("TMEV", initializer=initializer):
            embedded_tgt = self.embedding_layer(tgt, tgt_len)
            tgt_padding = model_utils.get_padding(tgt)
            tgt_attention_bias = model_utils.get_padding_bias(
                None, tgt_padding)

            embedded_src = self.embedding_layer(src, src_len)
            src_padding = model_utils.get_padding(src)
            src_attention_bias = model_utils.get_padding_bias(
                None, src_padding)

            encoded_tgt = self._encode_tgt(embedded_tgt, tgt_padding,
                                           tgt_attention_bias)

            micro_ev = self._decode_micro_edit_vectors(embedded_src,
                                                       src_padding,
                                                       src_attention_bias,
                                                       encoded_tgt,
                                                       tgt_attention_bias)

            if not graph_utils.is_training():
                tf.add_to_collection(
                    'TransformerMicroEditExtractor_Attentions', [
                        self.target_encoder.self_attn_alignment_history,
                        self.mev_decoder.self_attn_alignment_history,
                        self.mev_decoder.enc_dec_attn_alignment_history,
                    ])

            return encoded_tgt, micro_ev
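
model_utils.get_padding and model_utils.get_padding_bias appear throughout these examples but are not shown. They resemble the helpers in the official TensorFlow Transformer; a minimal re-implementation under that assumption, where the two-argument get_padding_bias(None, padding) form simply skips recomputing the padding:

import tensorflow as tf

def get_padding(x, padding_value=0):
    # 1.0 at padded positions (token id == padding_value), 0.0 at real tokens.
    return tf.cast(tf.equal(x, padding_value), tf.float32)

def get_padding_bias(x, padding=None):
    # Large negative bias added to attention logits so padded keys get ~zero weight.
    if padding is None:
        padding = get_padding(x)
    bias = padding * -1e9
    return tf.expand_dims(tf.expand_dims(bias, axis=1), axis=1)  # [batch, 1, 1, length]
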
Example #6
    def __init__(self, embedding_layer, mev_projection, params, **kwargs):
        super().__init__(**kwargs)
        self.params = params
        is_training = graph_utils.is_training()

        encoder_config = Config.merge_to_new([params, params.encoder])
        decoder_config = Config.merge_to_new([params, params.decoder])

        self.target_encoder = EncoderStack(encoder_config.to_json(),
                                           is_training, params.save_attentions)
        self.mev_decoder = DecoderStack(decoder_config.to_json(), is_training,
                                        params.save_attentions)

        self.embedding_layer = embedding_layer
        self.mev_projection = mev_projection

        self.cls_tok_embedding = self.add_weight('cls_tok_embedding',
                                                 (self.params.hidden_size, ),
                                                 dtype=tf.float32,
                                                 trainable=True)

        self.pooling_layer = tf.layers.Dense(self.params.hidden_size,
                                             activation='tanh',
                                             name='pooling_layer')
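
The cls_tok_embedding created above is a single learned vector of size hidden_size; Example #7 prepends it (and an analogous [REMOVE] vector) to every sequence. A sketch of what such an _add_token_to_beginning helper presumably does; the helper itself is not shown in these examples.

import tensorflow as tf

def add_token_to_beginning(embedded, tok_embedding):
    # embedded: [batch, length, hidden], tok_embedding: [hidden]
    batch_size = tf.shape(embedded)[0]
    tok = tf.reshape(tok_embedding, [1, 1, -1])
    tok = tf.tile(tok, [batch_size, 1, 1])         # [batch, 1, hidden]
    return tf.concat([tok, embedded], axis=1)      # [batch, length + 1, hidden]
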
Example #7
    def call(self, src, tgt, src_len, tgt_len, **kwargs):
        # Add the [REMOVE] token to the beginning of the target sequence
        # [batch, length, hidden_size]
        embedded_tgt = self.embedding_layer(tgt)

        # [batch, length+1, hidden_size]
        extended_embedded_tgt = self._add_token_to_beginning(embedded_tgt, self.rm_tok_embedding)
        extended_embedded_tgt += model_utils.get_position_encoding(
            tf.shape(extended_embedded_tgt)[1],
            self.params.hidden_size
        )
        extended_tgt_len = tgt_len + 1

        if self.params.get('noiser_ident_prob', 1) < 1:
            extended_tgt_attention_bias, extended_tgt_padding = self._get_attn_bias_with_dropout(
                extended_tgt_len, uniform_low=1)
        else:
            extended_tgt_padding = model_utils.get_padding_by_seq_len(extended_tgt_len)
            extended_tgt_attention_bias = model_utils.get_padding_bias(None, extended_tgt_padding)

        # Add the [CLS] token to the beginning of the source sequence
        # [batch, length, hidden_size]
        embedded_src = self.embedding_layer(src)

        # [batch, length+1, hidden_size]
        extended_embedded_src = self._add_cls_token(embedded_src)
        extended_embedded_src += model_utils.get_position_encoding(
            tf.shape(extended_embedded_src)[1],
            self.params.hidden_size
        )
        extended_src_len = src_len + 1

        if self.params.get('noiser_ident_prob', 1) < 1:
            extended_src_attention_bias, extended_src_padding = self._get_attn_bias_with_dropout(
                extended_src_len, uniform_low=1)
        else:
            extended_src_padding = model_utils.get_padding_by_seq_len(extended_src_len)
            extended_src_attention_bias = model_utils.get_padding_bias(None, extended_src_padding)

        # Encode Target
        # [batch, length+1, hidden_size]
        encoded_tgt = self._encode_tgt(extended_embedded_tgt, extended_tgt_padding, extended_tgt_attention_bias)

        # Decode source using the encoded target
        # [batch, length+1, hidden_size]
        decoder_output = self._decode_micro_edit_vectors(extended_embedded_src, extended_src_padding,
                                                         extended_src_attention_bias,
                                                         encoded_tgt, extended_tgt_attention_bias)

        if not graph_utils.is_training() and self.params.save_attentions:
            tf.add_to_collection('TransformerMicroEditExtractor_Attentions', [
                self.target_encoder.self_attn_alignment_history,
                self.mev_decoder.self_attn_alignment_history,
                self.mev_decoder.enc_dec_attn_alignment_history,
            ])

        with tf.name_scope("pooler"):
            # We "pool" the model by simply taking the hidden state corresponding
            # to the first token.
            first_token_tensor = tf.squeeze(decoder_output[:, 0:1, :], axis=1)
            pooled = self.pooling_layer(first_token_tensor)

        # [batch, length, hidden_size]
        micro_ev = self.mev_projection(decoder_output[:, 1:, :])

        return encoded_tgt[:, 1:, :], extended_tgt_attention_bias[:, :, :, 1:], pooled, micro_ev
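
The slicing at the end keeps the outputs aligned with the unextended inputs: encoded_tgt[:, 1:, :] and extended_tgt_attention_bias[:, :, :, 1:] drop the prepended token position, pooled is the tanh projection of the first ([CLS]) decoder state with shape [batch, hidden_size], and micro_ev holds one projected micro edit vector per original source token.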