def __init__(self,
                 name: str,
                 n_heads: int,
                 keys_encoder: Attendable,
                 values_encoder: Attendable = None,
                 dropout_keep_prob: float = 1.0,
                 reuse: ModelPart = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        check_argument_types()
        BaseAttention.__init__(self, name, reuse, save_checkpoint,
                               load_checkpoint, initializers)

        self.n_heads = n_heads
        self.dropout_keep_prob = dropout_keep_prob

        if self.n_heads <= 0:
            raise ValueError("Number of heads must be greater than zero.")

        if self.dropout_keep_prob <= 0.0 or self.dropout_keep_prob > 1.0:
            raise ValueError("Dropout keep prob must be inside (0,1].")

        if values_encoder is None:
            values_encoder = keys_encoder

        self.attention_keys = get_attention_states(keys_encoder)
        self.attention_mask = get_attention_mask(keys_encoder)
        self.attention_values = get_attention_states(values_encoder)

        self._variable_scope.set_initializer(
            tf.variance_scaling_initializer(mode="fan_avg",
                                            distribution="uniform"))
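
For orientation, a minimal sketch of what the attendable interface supplies to the attention object above, with hypothetical tensors in the same TF 1.x style as these examples: get_attention_states yields a [batch, time, dim] state tensor and get_attention_mask a [batch, time] float mask over valid positions.

import tensorflow as tf

# Hypothetical stand-ins for an encoder's attention states and mask:
# a [batch, time, dim] tensor of states and a [batch, time] padding mask.
batch, time, dim = 2, 5, 8
attention_states = tf.random_normal([batch, time, dim])
attention_mask = tf.sequence_mask([5, 3], maxlen=time, dtype=tf.float32)
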
Example #2
    def __init__(self,
                 name: str,
                 n_heads: int,
                 keys_encoder: Attendable,
                 values_encoder: Attendable = None,
                 dropout_keep_prob: float = 1.0,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        check_argument_types()
        BaseAttention.__init__(self, name, save_checkpoint, load_checkpoint,
                               initializers)

        self.n_heads = n_heads
        self.dropout_keep_prob = dropout_keep_prob

        if self.n_heads <= 0:
            raise ValueError("Number of heads must be greater than zero.")

        if self.dropout_keep_prob <= 0.0 or self.dropout_keep_prob > 1.0:
            raise ValueError("Dropout keep prob must be inside (0,1].")

        if values_encoder is None:
            values_encoder = keys_encoder

        self.attention_keys = get_attention_states(keys_encoder)
        self.attention_mask = get_attention_mask(keys_encoder)
        self.attention_values = get_attention_states(values_encoder)
Example #3
    def cross_attention_sublayer(self, queries: tf.Tensor) -> tf.Tensor:
        assert self.cross_attention_sublayer is not None
        assert self.n_cross_att_heads is not None
        assert self.input_for_cross_attention is not None

        encoder_att_states = get_attention_states(
            self.input_for_cross_attention)
        encoder_att_mask = get_attention_mask(self.input_for_cross_attention)

        # Layer normalization
        normalized_queries = layer_norm(queries)

        encoder_context, _ = attention(
            queries=normalized_queries,
            keys=encoder_att_states,
            values=encoder_att_states,
            keys_mask=encoder_att_mask,
            num_heads=self.n_cross_att_heads,
            dropout_callback=lambda x: dropout(
                x, self.attention_dropout_keep_prob, self.train_mode),
            use_bias=self.use_att_transform_bias)

        # Apply dropout
        encoder_context = dropout(
            encoder_context, self.dropout_keep_prob, self.train_mode)

        # Add residual connections
        return encoder_context + queries
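
The attention helper used above is not shown in this snippet; as a rough standalone sketch of what it computes (single head, hypothetical function name, not the project's own implementation), masked scaled dot-product attention looks like this:

import math
import tensorflow as tf

def scaled_dot_product_attention(queries, keys, values, keys_mask):
    # queries: [batch, q_len, dim]; keys, values: [batch, k_len, dim];
    # keys_mask: [batch, k_len] with 1.0 for real tokens, 0.0 for padding.
    dim = queries.get_shape()[-1].value
    logits = tf.matmul(queries, keys, transpose_b=True) / math.sqrt(dim)
    # Push padded key positions towards -inf before the softmax.
    logits += (1.0 - tf.expand_dims(keys_mask, 1)) * -1e9
    weights = tf.nn.softmax(logits)
    return tf.matmul(weights, values)
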
Example #4
    def cross_attention_sublayer(self, queries: tf.Tensor) -> tf.Tensor:
        assert self.cross_attention_sublayer is not None
        encoder_att_states = get_attention_states(
            self.input_for_cross_attention)
        encoder_att_mask = get_attention_mask(self.input_for_cross_attention)

        # Layer normalization
        normalized_queries = layer_norm(queries)

        encoder_context, _ = attention(
            queries=normalized_queries,
            keys=encoder_att_states,
            values=encoder_att_states,
            keys_mask=encoder_att_mask,
            num_heads=self.n_cross_att_heads,
            dropout_callback=lambda x: dropout(
                x, self.attention_dropout_keep_prob, self.train_mode),
            use_bias=self.use_att_transform_bias)

        # Apply dropout
        encoder_context = dropout(encoder_context, self.dropout_keep_prob,
                                  self.train_mode)

        # Add residual connections
        return encoder_context + queries
Example #5
    def __init__(self,
                 name: str,
                 input_sequence: Attendable,
                 hidden_size: int,
                 num_heads: int,
                 output_size: int = None,
                 state_proj_size: int = None,
                 dropout_keep_prob: float = 1.0,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        """Initialize an instance of the encoder."""
        check_argument_types()
        ModelPart.__init__(self, name, save_checkpoint, load_checkpoint,
                           initializers)

        self.input_sequence = input_sequence
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.output_size = output_size
        self.state_proj_size = state_proj_size
        self.dropout_keep_prob = dropout_keep_prob

        if self.dropout_keep_prob <= 0.0 or self.dropout_keep_prob > 1.0:
            raise ValueError("Dropout keep prob must be inside (0,1].")

        with self.use_scope():
            self._attention_states_dropped = dropout(
                get_attention_states(self.input_sequence),
                self.dropout_keep_prob,
                self.train_mode)
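
The dropout(x, keep_prob, train_mode) calls throughout these examples gate dropout on a boolean train_mode tensor; a minimal sketch of that pattern (an approximation, not necessarily the project's own helper):

import tensorflow as tf

def dropout(variable, keep_prob, train_mode):
    # Dropout disabled: return the tensor unchanged.
    if keep_prob == 1.0:
        return variable
    # Apply dropout only when train_mode evaluates to True at run time.
    dropped = tf.nn.dropout(variable, keep_prob)
    return tf.cond(train_mode, lambda: dropped, lambda: variable)
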
Example #6
    def encoder_attention_sublayer(self, queries: tf.Tensor) -> tf.Tensor:
        """Create the encoder-decoder attention sublayer."""

        encoder_att_states = get_attention_states(self.encoder)
        encoder_att_mask = get_attention_mask(self.encoder)

        # Layer normalization
        normalized_queries = layer_norm(queries)

        # Attend to the encoder
        # TODO handle histories
        encoder_context, _ = attention(
            queries=normalized_queries,
            keys=encoder_att_states,
            values=encoder_att_states,
            keys_mask=encoder_att_mask,
            num_heads=self.n_heads_enc,
            dropout_callback=lambda x: dropout(
                x, self.attention_dropout_keep_prob, self.train_mode),
            use_bias=self.use_att_transform_bias)

        # Apply dropout
        encoder_context = dropout(encoder_context, self.dropout_keep_prob,
                                  self.train_mode)

        # Add residual connections
        return encoder_context + queries
Example #7
    def __init__(self,
                 name: str,
                 encoders: List[Attendable],
                 attention_state_size: int,
                 share_attn_projections: bool = False,
                 use_sentinels: bool = False,
                 reuse: ModelPart = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        check_argument_types()
        MultiAttention.__init__(self,
                                name=name,
                                attention_state_size=attention_state_size,
                                share_attn_projections=share_attn_projections,
                                use_sentinels=use_sentinels,
                                reuse=reuse,
                                save_checkpoint=save_checkpoint,
                                load_checkpoint=load_checkpoint,
                                initializers=initializers)
        self._encoders = encoders

        # pylint: disable=protected-access
        self._encoders_tensors = [
            get_attention_states(e) for e in self._encoders
        ]
        self._encoders_masks = [get_attention_mask(e) for e in self._encoders]
        # pylint: enable=protected-access

        for e_m in self._encoders_masks:
            assert_shape(e_m, [-1, -1])

        for e_t in self._encoders_tensors:
            assert_shape(e_t, [-1, -1, -1])

        with self.use_scope():
            self.encoder_projections_for_logits = \
                self.get_encoder_projections("logits_projections")

            self.encoder_attn_biases = [
                get_variable(name="attn_bias_{}".format(i),
                             shape=[],
                             initializer=tf.zeros_initializer())
                for i in range(len(self._encoders_tensors))
            ]

            if self._share_projections:
                self.encoder_projections_for_ctx = \
                    self.encoder_projections_for_logits
            else:
                self.encoder_projections_for_ctx = \
                    self.get_encoder_projections("context_projections")

            if self._use_sentinels:
                self._encoders_masks.append(
                    tf.ones([tf.shape(self._encoders_masks[0])[0], 1]))

            self.masks_concat = tf.concat(self._encoders_masks, 1)
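
To illustrate the sentinel handling at the end of this constructor, with hypothetical shapes: a column of ones is appended so that the sentinel position is never masked out when the per-encoder masks are concatenated along the time axis.

import tensorflow as tf

# Two hypothetical encoder masks for a batch of 3 sentences.
masks = [tf.ones([3, 7]), tf.ones([3, 5])]
sentinel_mask = tf.ones([tf.shape(masks[0])[0], 1])
masks_concat = tf.concat(masks + [sentinel_mask], 1)  # shape [3, 13]
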
Example #8
    def model_dimension(self) -> int:
        dim = self.input_sequence.dimension

        if self.input_for_cross_attention is not None:
            cross_att_dim = get_attention_states(
                self.input_for_cross_attention).get_shape()[-1].value
            if cross_att_dim != dim:
                raise ValueError(
                    "The input for cross-attention must be of the same "
                    "dimension as the model, was {}.".format(cross_att_dim))

        return dim
Example #10
    def __init__(self,
                 name: str,
                 n_heads: int,
                 keys_encoder: Attendable,
                 values_encoder: Attendable = None,
                 dropout_keep_prob: float = 1.0,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None) -> None:
        check_argument_types()
        BaseAttention.__init__(self, name, save_checkpoint, load_checkpoint)

        self.n_heads = n_heads
        self.dropout_keep_prob = dropout_keep_prob

        if self.n_heads <= 0:
            raise ValueError("Number of heads must be greater than zero.")

        if self.dropout_keep_prob <= 0.0 or self.dropout_keep_prob > 1.0:
            raise ValueError("Dropout keep prob must be inside (0,1].")

        if values_encoder is None:
            values_encoder = keys_encoder

        self.attention_keys = get_attention_states(keys_encoder)
        self.attention_values = get_attention_states(values_encoder)
        self.attention_mask = get_attention_mask(keys_encoder)

        self._dimension = self.attention_keys.get_shape()[-1].value

        if self._dimension % self.n_heads != 0:
            raise ValueError("Model dimension ({}) must be divisible by the "
                             "number of attention heads ({})".format(
                                 self._dimension, self.n_heads))

        self._head_dim = int(self._dimension / self.n_heads)
        self._scaling_factor = 1 / math.sqrt(self._head_dim)
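
A quick worked check of the derived quantities, assuming a hypothetical 512-dimensional model with 8 heads:

import math

dimension, n_heads = 512, 8               # hypothetical values
assert dimension % n_heads == 0
head_dim = dimension // n_heads           # 64
scaling_factor = 1 / math.sqrt(head_dim)  # 0.125
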
Example #11
    def encoder_attention(self, level: int, queries: tf.Tensor) -> tf.Tensor:

        with tf.variable_scope("dec_inter_att_level_{}".format(level),
                               reuse=tf.AUTO_REUSE):
            encoder_att_states = get_attention_states(self.encoder)
            encoder_att_mask = get_attention_mask(self.encoder)

            # TODO handle histories
            encoder_context, _ = attention(
                queries=queries,
                keys=encoder_att_states,
                values=encoder_att_states,
                keys_mask=encoder_att_mask,
                num_heads=self.n_heads_enc,
                dropout_callback=lambda x: dropout(
                    x, self.attention_dropout_keep_prob, self.train_mode))

            return dropout(
                encoder_context, self.dropout_keep_prob, self.train_mode)
Example #12
    def attention_states(self) -> tf.Tensor:
        return dropout(get_attention_states(self.encoder),
                       self.dropout_keep_prob, self.train_mode)
Example #13
    def _encoders_tensors(self) -> List[tf.Tensor]:
        tensors = [get_attention_states(e) for e in self._encoders]
        for e_t in tensors:
            assert_shape(e_t, [-1, -1, -1])
        return tensors
Example #14
    def __init__(self,
                 name: str,
                 input_sequence: TemporalStateful,
                 ff_hidden_size: int,
                 depth: int,
                 n_heads: int,
                 dropout_keep_prob: float = 1.0,
                 attention_dropout_keep_prob: float = 1.0,
                 target_space_id: int = None,
                 use_att_transform_bias: bool = False,
                 use_positional_encoding: bool = True,
                 input_for_cross_attention: Attendable = None,
                 n_cross_att_heads: int = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None) -> None:
        """Create an encoder of the Transformer model.

        Described in Vaswani et al. (2017), arxiv.org/abs/1706.03762

        Arguments:
            input_sequence: Embedded input sequence.
            name: Name of the encoder. Should be unique across all Neural
                Monkey objects.
            dropout_keep_prob: Probability of keeping a value during dropout.
            target_space_id: Specifies the modality of the target space.
            use_att_transform_bias: Add bias when transforming qkv vectors
                for attention.
            use_positional_encoding: If True, position encoding signal is added
                to the input.

        Keyword arguments:
            ff_hidden_size: Size of the feedforward sublayers.
            n_heads: Number of the self-attention heads.
            depth: Number of sublayers.
            attention_dropout_keep_prob: Probability of keeping a value
                during dropout on the attention output.
            input_for_cross_attention: An attendable model part that is
                attended using cross-attention on every layer of the encoder,
                analogously to how the encoder is attended in the decoder.
            n_cross_att_heads: Number of heads used in the cross-attention.

        """
        check_argument_types()
        ModelPart.__init__(self, name, save_checkpoint, load_checkpoint)

        self.input_sequence = input_sequence
        self.model_dimension = self.input_sequence.dimension
        self.ff_hidden_size = ff_hidden_size
        self.depth = depth
        self.n_heads = n_heads
        self.dropout_keep_prob = dropout_keep_prob
        self.attention_dropout_keep_prob = attention_dropout_keep_prob
        self.target_space_id = target_space_id
        self.use_att_transform_bias = use_att_transform_bias
        self.use_positional_encoding = use_positional_encoding
        self.input_for_cross_attention = input_for_cross_attention
        self.n_cross_att_heads = n_cross_att_heads

        if self.depth <= 0:
            raise ValueError("Depth must be a positive integer.")

        if self.ff_hidden_size <= 0:
            raise ValueError("Feed forward hidden size must be a "
                             "positive integer.")

        if self.dropout_keep_prob <= 0.0 or self.dropout_keep_prob > 1.0:
            raise ValueError("Dropout keep prob must be inside (0,1].")

        if (self.attention_dropout_keep_prob <= 0.0
                or self.attention_dropout_keep_prob > 1.0):
            raise ValueError("Dropout keep prob for attn must be in (0,1].")

        if self.target_space_id is not None and (self.target_space_id >= 32
                                                 or self.target_space_id < 0):
            raise ValueError(
                "If provided, the target space ID should be between 0 and 31. "
                "Was: {}".format(self.target_space_id))

        if (input_for_cross_attention is None) ^ (n_cross_att_heads is None):
            raise ValueError(
                "Either both input_for_cross_attention and n_cross_att_heads "
                "must be provided or none of them.")

        if input_for_cross_attention is not None:
            cross_att_dim = get_attention_states(
                input_for_cross_attention).get_shape()[-1].value
            if cross_att_dim != self.model_dimension:
                raise ValueError(
                    "The input for cross-attention must be of the same "
                    "dimension as the model, was {}.".format(cross_att_dim))

        self.train_mode = tf.placeholder(tf.bool, [], "train_mode")
        log("Output op: {}".format(self.output))
Example #15
    def attention_states(self) -> tf.Tensor:
        return dropout(get_attention_states(self.encoder),
                       self.dropout_keep_prob,
                       self.train_mode)
Example #16
    def __init__(
            self,
            name: str,
            encoder: Attendable,
            vocabulary: Vocabulary,
            data_id: str,
            # TODO infer the default for these three from the encoder
            ff_hidden_size: int,
            n_heads_self: int,
            n_heads_enc: int,
            depth: int,
            max_output_len: int,
            dropout_keep_prob: float = 1.0,
            embedding_size: int = None,
            embeddings_source: EmbeddedSequence = None,
            tie_embeddings: bool = True,
            label_smoothing: float = None,
            attention_dropout_keep_prob: float = 1.0,
            use_att_transform_bias: bool = False,
            supress_unk: bool = False,
            save_checkpoint: str = None,
            load_checkpoint: str = None) -> None:
        """Create a decoder of the Transformer model.

        Described in Vaswani et al. (2017), arxiv.org/abs/1706.03762

        Arguments:
            encoder: Input encoder of the decoder.
            vocabulary: Target vocabulary.
            data_id: Target data series.
            name: Name of the decoder. Should be unique across all Neural
                Monkey objects.
            max_output_len: Maximum length of an output sequence.
            dropout_keep_prob: Probability of keeping a value during dropout.
            embedding_size: Size of embedding vectors for target words.
            embeddings_source: Embedded sequence to take embeddings from.
            tie_embeddings: Use decoder.embedding_matrix also in place
                of the output decoding matrix.

        Keyword arguments:
            ff_hidden_size: Size of the feedforward sublayers.
            n_heads_self: Number of the self-attention heads.
            n_heads_enc: Number of the attention heads over the encoder.
            depth: Number of sublayers.
            label_smoothing: A label smoothing parameter for cross entropy
                loss computation.
            attention_dropout_keep_prob: Probability of keeping a value
                during dropout on the attention output.
            supress_unk: If True, the decoder will not produce symbols for
                unknown tokens.
        """
        check_argument_types()
        AutoregressiveDecoder.__init__(self,
                                       name=name,
                                       vocabulary=vocabulary,
                                       data_id=data_id,
                                       max_output_len=max_output_len,
                                       dropout_keep_prob=dropout_keep_prob,
                                       embedding_size=embedding_size,
                                       embeddings_source=embeddings_source,
                                       tie_embeddings=tie_embeddings,
                                       label_smoothing=label_smoothing,
                                       supress_unk=supress_unk,
                                       save_checkpoint=save_checkpoint,
                                       load_checkpoint=load_checkpoint)

        self.encoder = encoder
        self.ff_hidden_size = ff_hidden_size
        self.n_heads_self = n_heads_self
        self.n_heads_enc = n_heads_enc
        self.depth = depth
        self.attention_dropout_keep_prob = attention_dropout_keep_prob
        self.use_att_transform_bias = use_att_transform_bias

        self.encoder_states = get_attention_states(self.encoder)
        self.encoder_mask = get_attention_mask(self.encoder)
        self.dimension = (
            self.encoder_states.get_shape()[2].value)  # type: ignore

        if self.embedding_size != self.dimension:
            raise ValueError("Model dimension and input embedding size"
                             "do not match")

        self._variable_scope.set_initializer(
            tf.variance_scaling_initializer(mode="fan_avg",
                                            distribution="uniform"))

        log("Decoder cost op: {}".format(self.cost))
        self._variable_scope.reuse_variables()
        log("Runtime logits: {}".format(self.runtime_logits))
Example #17
    def attention_values(self) -> tf.Tensor:
        return get_attention_states(self.values_encoder)
Example #18
    def attention_keys(self) -> tf.Tensor:
        return get_attention_states(self.keys_encoder)
Example #20
    def __init__(self,
                 name: str,
                 encoders: List[Attendable],
                 vocabulary: Vocabulary,
                 data_id: str,
                 # TODO infer the default for these three from the encoder
                 ff_hidden_size: int,
                 n_heads_self: int,
                 n_heads_enc: Union[List[int], int],
                 depth: int,
                 max_output_len: int,
                 attention_combination_strategy: str = "serial",
                 n_heads_hier: int = None,
                 dropout_keep_prob: float = 1.0,
                 embedding_size: int = None,
                 embeddings_source: EmbeddedSequence = None,
                 tie_embeddings: bool = True,
                 label_smoothing: float = None,
                 self_attention_dropout_keep_prob: float = 1.0,
                 attention_dropout_keep_prob: Union[float, List[float]] = 1.0,
                 use_att_transform_bias: bool = False,
                 supress_unk: bool = False,
                 reuse: ModelPart = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        """Create a decoder of the Transformer model.

        Described in Vaswani et al. (2017), arxiv.org/abs/1706.03762

        Arguments:
            encoders: Input encoders for the decoder.
            vocabulary: Target vocabulary.
            data_id: Target data series.
            name: Name of the decoder. Should be unique across all Neural
                Monkey objects.
            max_output_len: Maximum length of an output sequence.
            dropout_keep_prob: Probability of keeping a value during dropout.
            embedding_size: Size of embedding vectors for target words.
            embeddings_source: Embedded sequence to take embeddings from.
            tie_embeddings: Use decoder.embedding_matrix also in place
                of the output decoding matrix.
            ff_hidden_size: Size of the feedforward sublayers.
            n_heads_self: Number of the self-attention heads.
            n_heads_enc: Number of the attention heads over each encoder.
                Either a list whose length equals the number of ``encoders``,
                or a single integer. In the latter case, the same number of
                heads is used for all encoders.
            attention_combination_strategy: One of ``serial``, ``parallel``,
                ``flat``, ``hierarchical``. Controls the attention combination
                strategy for enc-dec attention.
            n_heads_hier: Number of the attention heads for the second
                attention in the ``hierarchical`` attention combination.
            depth: Number of sublayers.
            label_smoothing: A label smoothing parameter for cross entropy
                loss computation.
            attention_dropout_keep_prob: Probability of keeping a value
                during dropout on the attention output.
            supress_unk: If True, the decoder will not produce symbols for
                unknown tokens.
            reuse: Reuse the variables from the given model part.
        """
        check_argument_types()
        AutoregressiveDecoder.__init__(
            self,
            name=name,
            vocabulary=vocabulary,
            data_id=data_id,
            max_output_len=max_output_len,
            dropout_keep_prob=dropout_keep_prob,
            embedding_size=embedding_size,
            embeddings_source=embeddings_source,
            tie_embeddings=tie_embeddings,
            label_smoothing=label_smoothing,
            supress_unk=supress_unk,
            reuse=reuse,
            save_checkpoint=save_checkpoint,
            load_checkpoint=load_checkpoint)

        self.encoders = encoders
        self.ff_hidden_size = ff_hidden_size
        self.n_heads_self = n_heads_self

        if isinstance(n_heads_enc, int):
            if attention_combination_strategy == "flat":
                self.n_heads_enc = [n_heads_enc]
            else:
                self.n_heads_enc = [n_heads_enc for _ in self.encoders]
        else:
            self.n_heads_enc = n_heads_enc

        self.depth = depth
        if isinstance(attention_dropout_keep_prob, float):
            self.attention_dropout_keep_prob = [
                attention_dropout_keep_prob for _ in encoders]
        else:
            self.attention_dropout_keep_prob = attention_dropout_keep_prob
        self.self_att_dropout_keep_prob = self_attention_dropout_keep_prob
        self.use_att_transform_bias = use_att_transform_bias
        self.attention_combination_strategy = attention_combination_strategy
        self.n_heads_hier = n_heads_hier

        self.encoder_states = lambda: [get_attention_states(e)
                                       for e in self.encoders]
        self.encoder_masks = lambda: [get_attention_mask(e)
                                      for e in self.encoders]

        if self.attention_combination_strategy not in STRATEGIES:
            raise ValueError(
                "Unknown attention combination strategy '{}'. "
                "Allowed: {}.".format(self.attention_combination_strategy,
                                      ", ".join(STRATEGIES)))

        if (self.attention_combination_strategy == "hierarchical"
                and self.n_heads_hier is None):
            raise ValueError(
                "You must provide n_heads_hier when using the hierarchical "
                "attention combination strategy.")

        if (self.attention_combination_strategy != "hierarchical"
                and self.n_heads_hier is not None):
            warn("Ignoring n_heads_hier parameter -- use the hierarchical "
                 "attention combination strategy instead.")

        if (self.attention_combination_strategy == "flat"
                and len(self.n_heads_enc) != 1):
            raise ValueError(
                "For the flat attention combination strategy, only a single "
                "value is permitted in n_heads_enc.")

        self._variable_scope.set_initializer(tf.variance_scaling_initializer(
            mode="fan_avg", distribution="uniform"))
Example #21
    def _attention_states_dropped(self) -> tf.Tensor:
        return dropout(get_attention_states(self.input_sequence),
                       self.dropout_keep_prob, self.train_mode)
Example #22
    def attention_keys(self) -> tf.Tensor:
        return get_attention_states(self.keys_encoder)
Example #23
    def attention_values(self) -> tf.Tensor:
        return get_attention_states(self.values_encoder)