Example #1
    def get_initial_loop_state(self) -> LoopState:

        default_ls = AutoregressiveDecoder.get_initial_loop_state(self)
        # feedables = default_ls.feedables._asdict()
        histories = default_ls.histories._asdict()

        histories["self_attention_histories"] = [
            empty_multi_head_loop_state(self.n_heads_self)
            for a in range(self.depth)]

        histories["inter_attention_histories"] = [
            empty_multi_head_loop_state(self.n_heads_enc)
            for a in range(self.depth)]

        histories["decoded_symbols"] = tf.TensorArray(
            dtype=tf.int32, dynamic_size=True, size=0,
            clear_after_read=False, name="decoded_symbols")

        input_mask = tf.TensorArray(
            dtype=tf.float32, dynamic_size=True, size=0,
            clear_after_read=False, name="input_mask")

        histories["input_mask"] = input_mask.write(
            0, tf.ones_like(self.go_symbols, dtype=tf.float32))

        # TransformerHistories is a type and should be callable
        # pylint: disable=not-callable
        tr_histories = TransformerHistories(**histories)
        # pylint: enable=not-callable

        return LoopState(
            histories=tr_histories,
            constants=[],
            feedables=default_ls.feedables)
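A minimal sketch of the `NamedTuple` plumbing this example (and #2-#3 below) relies on: `_asdict()` dumps the default fields into a mutable dict, the architecture-specific entries are added, and a richer tuple is rebuilt from the dict. `BaseHistories` and `ExtHistories` are hypothetical stand-ins for the real history types.

    from collections import namedtuple

    # Hypothetical stand-ins; the real types are typed NamedTuple classes.
    BaseHistories = namedtuple("BaseHistories", ["logits", "mask"])
    ExtHistories = namedtuple("ExtHistories", ["logits", "mask", "input_mask"])

    base = BaseHistories(logits=[], mask=[])

    # Dump the base fields, add the new one, rebuild the richer tuple.
    fields = base._asdict()
    fields["input_mask"] = [1.0]

    ext = ExtHistories(**fields)
    print(ext.input_mask)  # [1.0]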
Example #2
    def get_initial_loop_state(self) -> LoopState:
        default_ls = AutoregressiveDecoder.get_initial_loop_state(self)
        feedables = default_ls.feedables._asdict()
        histories = default_ls.histories._asdict()

        feedables["prev_contexts"] = [
            tf.zeros([self.batch_size, a.context_vector_size])
            for a in self.attentions
        ]

        feedables["prev_rnn_state"] = self.initial_state
        feedables["prev_rnn_output"] = self.initial_state

        histories["attention_histories"] = [
            a.initial_loop_state() for a in self.attentions if a is not None
        ]

        histories["decoder_outputs"] = tf.zeros(
            shape=[0, self.batch_size, self.rnn_size],
            dtype=tf.float32,
            name="hist_decoder_outputs")

        # pylint: disable=not-callable
        rnn_feedables = RNNFeedables(**feedables)
        rnn_histories = RNNHistories(**histories)
        # pylint: enable=not-callable

        return LoopState(histories=rnn_histories,
                         constants=default_ls.constants,
                         feedables=rnn_feedables)
Example #3
    def get_initial_loop_state(self) -> LoopState:

        default_ls = AutoregressiveDecoder.get_initial_loop_state(self)
        histories = default_ls.histories._asdict()

        #        histories["self_attention_histories"] = [
        #            empty_multi_head_loop_state(self.batch_size, self.n_heads_self)
        #            for a in range(self.depth)]

        #        histories["inter_attention_histories"] = [
        #            empty_multi_head_loop_state(self.batch_size, self.n_heads_enc)
        #            for a in range(self.depth)]

        histories["decoded_symbols"] = tf.zeros(shape=[0, self.batch_size],
                                                dtype=tf.int32,
                                                name="decoded_symbols")

        histories["input_mask"] = tf.zeros(shape=[0, self.batch_size],
                                           dtype=tf.float32,
                                           name="input_mask")

        # TransformerHistories is a type and should be callable
        # pylint: disable=not-callable
        tr_histories = TransformerHistories(**histories)
        # pylint: enable=not-callable

        return LoopState(histories=tr_histories,
                         constants=[],
                         feedables=default_ls.feedables)
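The zero-length tensors above (`shape=[0, self.batch_size]`) act as accumulators: each decoding step concatenates one time slice onto the leading axis inside `tf.while_loop`. A minimal TensorFlow 1.x sketch of that growth pattern, with illustrative names and a fixed three-step loop:

    import tensorflow as tf  # TensorFlow 1.x

    batch_size = 2

    def cond(step, history):
        return step < 3

    def body(step, history):
        # One new time slice of shape (1, batch) per step.
        new_symbols = tf.fill([1, batch_size], step)
        return step + 1, tf.concat([history, new_symbols], axis=0)

    # shape_invariants lets the time axis grow across iterations.
    _, decoded = tf.while_loop(
        cond, body,
        loop_vars=[tf.constant(0), tf.zeros([0, batch_size], tf.int32)],
        shape_invariants=[tf.TensorShape([]),
                          tf.TensorShape([None, batch_size])])

    with tf.Session() as sess:
        print(sess.run(decoded))  # shape (3, 2), row i filled with i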
Example #4
    def train_loop_result(self) -> LoopState:
        # We process all the decoding steps together during training.
        # However, we still want to pretend that a proper decoding_loop
        # was called.
        decoder_ls = AutoregressiveDecoder.get_initial_loop_state(self)

        input_sequence = self.embed_input_symbols(self.train_input_symbols)
        input_mask = tf.transpose(self.train_mask)

        last_layer = self.layer(
            self.depth, input_sequence, input_mask)

        tr_feedables = TransformerFeedables(
            input_sequence=input_sequence,
            input_mask=tf.expand_dims(input_mask, -1))

        # t_states shape: (batch, time, channels)
        # dec_w shape: (channels, vocab)
        last_layer_shape = tf.shape(last_layer.temporal_states)
        last_layer_states = tf.reshape(
            last_layer.temporal_states,
            [-1, last_layer_shape[-1]])

        # shape (batch, time, vocab)
        logits = tf.reshape(
            tf.matmul(last_layer_states, self.decoding_w),
            [last_layer_shape[0], last_layer_shape[1], len(self.vocabulary)])
        logits += tf.reshape(self.decoding_b, [1, 1, -1])

        # TODO: record histories properly
        tr_histories = tf.zeros([])
        # tr_histories = TransformerHistories(
        #    self_attention_histories=[
        #        empty_multi_head_loop_state(self.batch_size,
        #                                    self.n_heads_self)
        #        for a in range(self.depth)],
        #    encoder_attention_histories=[
        #        empty_multi_head_loop_state(self.batch_size,
        #                                    self.n_heads_enc)
        #        for a in range(self.depth)])

        feedables = DecoderFeedables(
            step=last_layer_shape[1],
            finished=tf.ones([self.batch_size], dtype=tf.bool),
            embedded_input=self.embed_input_symbols(tf.tile(
                [END_TOKEN_INDEX], [self.batch_size])),
            other=tr_feedables)

        histories = DecoderHistories(
            logits=tf.transpose(logits, perm=[1, 0, 2]),
            output_states=tf.transpose(
                last_layer.temporal_states, [1, 0, 2]),
            output_mask=self.train_mask,
            output_symbols=self.train_inputs,
            other=tr_histories)

        return LoopState(
            feedables=feedables,
            histories=histories,
            constants=decoder_ls.constants)
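The reshape/matmul pair above is the standard trick for applying one projection matrix to every position of a `(batch, time, channels)` tensor: flatten to `(batch * time, channels)`, project once, then restore the time axis. A self-contained sketch with made-up sizes:

    import tensorflow as tf  # TensorFlow 1.x

    batch, time, channels, vocab = 2, 5, 8, 11

    # Stand-ins for last_layer.temporal_states, decoding_w and decoding_b.
    states = tf.random_normal([batch, time, channels])
    dec_w = tf.random_normal([channels, vocab])
    dec_b = tf.zeros([vocab])

    flat = tf.reshape(states, [-1, channels])    # (batch*time, channels)
    logits = tf.reshape(tf.matmul(flat, dec_w),
                        [batch, time, vocab])    # (batch, time, vocab)
    logits += tf.reshape(dec_b, [1, 1, -1])      # broadcast the bias

    with tf.Session() as sess:
        print(sess.run(tf.shape(logits)))  # [ 2  5 11]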
Example #5
    def get_initial_feedables(self) -> DecoderFeedables:
        feedables = AutoregressiveDecoder.get_initial_feedables(self)

        rnn_feedables = RNNFeedables(
            prev_contexts=[tf.zeros([self.batch_size, a.context_vector_size])
                           for a in self.attentions],
            prev_rnn_state=self.initial_state,
            prev_rnn_output=self.initial_state)

        return feedables._replace(other=rnn_feedables)
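This example and the ones that follow swap the tuple-rebuilding of Examples #1-#3 for `_replace(other=...)`: the generic feedables stay untouched and the architecture-specific payload is slotted into a dedicated `other` field. A sketch under the assumption that `DecoderFeedables` is a `NamedTuple` with such a field (the real type has more fields):

    from collections import namedtuple

    # Assumed field layout, for illustration only.
    DecoderFeedables = namedtuple("DecoderFeedables",
                                  ["step", "finished", "other"])
    RNNFeedables = namedtuple("RNNFeedables",
                              ["prev_rnn_state", "prev_rnn_output"])

    generic = DecoderFeedables(step=0, finished=False, other=None)

    # _replace returns a copy with only the named field swapped out.
    specific = generic._replace(
        other=RNNFeedables(prev_rnn_state=0.0, prev_rnn_output=0.0))
    print(specific.other.prev_rnn_state)  # 0.0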
Example #6
    def get_initial_feedables(self) -> DecoderFeedables:
        feedables = AutoregressiveDecoder.get_initial_feedables(self)

        rnn_feedables = RNNFeedables(
            prev_contexts=[
                tf.zeros([self.batch_size, a.context_vector_size])
                for a in self.attentions],
            prev_rnn_state=self.initial_state,
            prev_rnn_output=self.initial_state)

        return feedables._replace(other=rnn_feedables)
Example #7
    def get_initial_histories(self) -> DecoderHistories:
        histories = AutoregressiveDecoder.get_initial_histories(self)

        rnn_histories = RNNHistories(
            rnn_outputs=tf.zeros(
                shape=[0, self.batch_size, self.rnn_size],
                dtype=tf.float32,
                name="hist_rnn_output_states"),
            attention_histories=[a.initial_loop_state()
                                 for a in self.attentions if a is not None])

        return histories._replace(other=rnn_histories)
Example #8
    def get_initial_histories(self) -> DecoderHistories:
        histories = AutoregressiveDecoder.get_initial_histories(self)

        rnn_histories = RNNHistories(
            rnn_outputs=tf.zeros(shape=[0, self.batch_size, self.rnn_size],
                                 dtype=tf.float32,
                                 name="hist_rnn_output_states"),
            attention_histories=[
                a.initial_loop_state() for a in self.attentions
                if a is not None
            ])

        return histories._replace(other=rnn_histories)
Example #9
    def get_initial_feedables(self) -> DecoderFeedables:
        feedables = AutoregressiveDecoder.get_initial_feedables(self)

        tr_feedables = TransformerFeedables(
            input_sequence=tf.zeros(
                shape=[self.batch_size, 0, self.dimension],
                dtype=tf.float32,
                name="input_sequence"),
            input_mask=tf.zeros(
                shape=[self.batch_size, 0, 1],
                dtype=tf.float32,
                name="input_mask"))

        return feedables._replace(other=tr_feedables)
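Note the axis layout here: unlike the time-major histories (`[0, batch, ...]`), these feedables are batch-major with an empty time axis in the middle (`[batch, 0, ...]`), so they grow along axis 1. A small TensorFlow 1.x sketch of one such step, with illustrative names:

    import tensorflow as tf  # TensorFlow 1.x

    batch_size, dimension = 2, 4

    seq = tf.zeros([batch_size, 0, dimension])        # empty time axis
    step_embedding = tf.ones([batch_size, 1, dimension])
    grown = tf.concat([seq, step_embedding], axis=1)  # append one step

    with tf.Session() as sess:
        print(sess.run(tf.shape(grown)))  # [2 1 4]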
Example #10
    def get_initial_histories(self) -> DecoderHistories:
        histories = AutoregressiveDecoder.get_initial_histories(self)

        # TODO: record histories properly
        tr_histories = tf.zeros([])
        # tr_histories = TransformerHistories(
        #    self_attention_histories=[
        #        empty_multi_head_loop_state(self.batch_size,
        #                                    self.n_heads_self)
        #        for a in range(self.depth)],
        #    encoder_attention_histories=[
        #        empty_multi_head_loop_state(self.batch_size,
        #                                    self.n_heads_enc)
        #        for a in range(self.depth)])

        return histories._replace(other=tr_histories)
Example #11
    def __init__(self,
                 encoders: List[Stateful],
                 vocabulary: Vocabulary,
                 data_id: str,
                 name: str,
                 max_output_len: int,
                 dropout_keep_prob: float = 1.0,
                 embedding_size: int = None,
                 embeddings_source: EmbeddedSequence = None,
                 tie_embeddings: bool = False,
                 label_smoothing: float = None,
                 rnn_size: int = None,
                 output_projection: OutputProjectionSpec = None,
                 encoder_projection: EncoderProjection = None,
                 attentions: List[BaseAttention] = None,
                 attention_on_input: bool = False,
                 rnn_cell: str = "GRU",
                 conditional_gru: bool = False,
                 supress_unk: bool = False,
                 reuse: ModelPart = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        """Create a refactored version of monster decoder.

        Arguments:
            encoders: Input encoders of the decoder.
            vocabulary: Target vocabulary.
            data_id: Target data series.
            name: Name of the decoder. Should be unique across all Neural
                Monkey objects.
            max_output_len: Maximum length of an output sequence.
            dropout_keep_prob: Probability of keeping a value during dropout.
            embedding_size: Size of embedding vectors for target words.
            embeddings_source: Embedded sequence to take embeddings from.
            tie_embeddings: Use decoder.embedding_matrix also in place
                of the output decoding matrix.
            rnn_size: Size of the decoder hidden state, if None set
                according to encoders.
            output_projection: How to generate distribution over vocabulary
                from decoder_outputs.
            encoder_projection: How to construct initial state from encoders.
            attentions: The attention objects to use. Optional.
            rnn_cell: RNN cell used by the decoder ('GRU', 'LSTM', or
                'NematusGRU').
            conditional_gru: Flag whether to use the Conditional GRU
                architecture.
            attention_on_input: Flag whether attention from previous decoding
                step should be combined with the input in the next step.
            supress_unk: If true, decoder will not produce symbols for unknown
                tokens.
            reuse: Reuse the model variables from the given model part.
        """
        check_argument_types()
        AutoregressiveDecoder.__init__(self,
                                       name=name,
                                       vocabulary=vocabulary,
                                       data_id=data_id,
                                       max_output_len=max_output_len,
                                       dropout_keep_prob=dropout_keep_prob,
                                       embedding_size=embedding_size,
                                       embeddings_source=embeddings_source,
                                       tie_embeddings=tie_embeddings,
                                       label_smoothing=label_smoothing,
                                       supress_unk=supress_unk,
                                       reuse=reuse,
                                       save_checkpoint=save_checkpoint,
                                       load_checkpoint=load_checkpoint,
                                       initializers=initializers)

        self.encoders = encoders
        self.output_projection_spec = output_projection
        self._conditional_gru = conditional_gru
        self._attention_on_input = attention_on_input
        self._rnn_cell_str = rnn_cell

        self.attentions = []  # type: List[BaseAttention]
        if attentions is not None:
            self.attentions = attentions

        if rnn_size is not None:
            self.rnn_size = rnn_size

        if encoder_projection is not None:
            self.encoder_projection = encoder_projection
        elif not self.encoders:
            log("No direct encoder input. Using empty initial state")
            self.encoder_projection = empty_initial_state
        elif rnn_size is None:
            log("No rnn_size or encoder_projection: Using concatenation of"
                " encoded states")
            self.encoder_projection = concat_encoder_projection
            self.rnn_size = sum(e.output.get_shape()[1].value
                                for e in encoders)
        else:
            log("Using linear projection of encoders as the initial state")
            self.encoder_projection = linear_encoder_projection(
                self.dropout_keep_prob)

        assert self.rnn_size is not None

        if self._rnn_cell_str not in RNN_CELL_TYPES:
            raise ValueError("RNN cell must be a either 'GRU', 'LSTM', or "
                             "'NematusGRU'. Not {}".format(self._rnn_cell_str))

        if self.output_projection_spec is None:
            log("No output projection specified - using tanh projection")
            self.output_projection = nonlinear_output(self.rnn_size,
                                                      tf.tanh)[0]
            self.output_projection_size = self.rnn_size
        elif isinstance(self.output_projection_spec, tuple):
            self.output_projection_spec = cast(Tuple[OutputProjection, int],
                                               self.output_projection_spec)
            (self.output_projection,
             self.output_projection_size) = self.output_projection_spec
        else:
            self.output_projection = cast(OutputProjection,
                                          self.output_projection_spec)
            self.output_projection_size = self.rnn_size

        if self._attention_on_input:
            self.input_projection = self.input_plus_attention
        else:
            self.input_projection = self.embed_input_symbol

        with self.use_scope():
            with tf.variable_scope("attention_decoder") as self.step_scope:
                pass

        self._variable_scope.set_initializer(
            tf.random_normal_initializer(stddev=0.001))

        # TODO when it is possible, remove the printing of the cost var
        log("Decoder initalized. Cost var: {}".format(str(self.cost)))
        log("Runtime logits tensor: {}".format(str(self.runtime_logits)))
Example #12
    def __init__(
            self,
            name: str,
            encoder: Attendable,
            vocabulary: Vocabulary,
            data_id: str,
            # TODO infer the default for these three from the encoder
            ff_hidden_size: int,
            n_heads_self: int,
            n_heads_enc: int,
            depth: int,
            max_output_len: int,
            dropout_keep_prob: float = 1.0,
            embedding_size: int = None,
            embeddings_source: EmbeddedSequence = None,
            tie_embeddings: bool = True,
            label_smoothing: float = None,
            attention_dropout_keep_prob: float = 1.0,
            use_att_transform_bias: bool = False,
            supress_unk: bool = False,
            save_checkpoint: str = None,
            load_checkpoint: str = None) -> None:
        """Create a decoder of the Transformer model.

        Described in Vaswani et al. (2017), arxiv.org/abs/1706.03762

        Arguments:
            encoder: Input encoder of the decoder.
            vocabulary: Target vocabulary.
            data_id: Target data series.
            name: Name of the decoder. Should be unique across all Neural
                Monkey objects.
            max_output_len: Maximum length of an output sequence.
            dropout_keep_prob: Probability of keeping a value during dropout.
            embedding_size: Size of embedding vectors for target words.
            embeddings_source: Embedded sequence to take embeddings from.
            tie_embeddings: Use decoder.embedding_matrix also in place
                of the output decoding matrix.

        Keyword arguments:
            ff_hidden_size: Size of the feedforward sublayers.
            n_heads_self: Number of the self-attention heads.
            n_heads_enc: Number of the attention heads over the encoder.
            depth: Number of sublayers.
            label_smoothing: A label smoothing parameter for cross entropy
                loss computation.
            attention_dropout_keep_prob: Probability of keeping a value
                during dropout on the attention output.
            supress_unk: If true, decoder will not produce symbols for unknown
                tokens.
        """
        check_argument_types()
        AutoregressiveDecoder.__init__(self,
                                       name=name,
                                       vocabulary=vocabulary,
                                       data_id=data_id,
                                       max_output_len=max_output_len,
                                       dropout_keep_prob=dropout_keep_prob,
                                       embedding_size=embedding_size,
                                       embeddings_source=embeddings_source,
                                       tie_embeddings=tie_embeddings,
                                       label_smoothing=label_smoothing,
                                       supress_unk=supress_unk,
                                       save_checkpoint=save_checkpoint,
                                       load_checkpoint=load_checkpoint)

        self.encoder = encoder
        self.ff_hidden_size = ff_hidden_size
        self.n_heads_self = n_heads_self
        self.n_heads_enc = n_heads_enc
        self.depth = depth
        self.attention_dropout_keep_prob = attention_dropout_keep_prob
        self.use_att_transform_bias = use_att_transform_bias

        self.encoder_states = get_attention_states(self.encoder)
        self.encoder_mask = get_attention_mask(self.encoder)
        self.dimension = (
            self.encoder_states.get_shape()[2].value)  # type: ignore

        if self.embedding_size != self.dimension:
            raise ValueError("Model dimension and input embedding size"
                             "do not match")

        self._variable_scope.set_initializer(
            tf.variance_scaling_initializer(mode="fan_avg",
                                            distribution="uniform"))

        log("Decoder cost op: {}".format(self.cost))
        self._variable_scope.reuse_variables()
        log("Runtime logits: {}".format(self.runtime_logits))
Example #13
    def __init__(self,
                 encoders: List[Stateful],
                 vocabulary: Vocabulary,
                 data_id: str,
                 name: str,
                 max_output_len: int,
                 dropout_keep_prob: float = 1.0,
                 rnn_size: int = None,
                 embedding_size: int = None,
                 output_projection: OutputProjectionSpec = None,
                 encoder_projection: EncoderProjection = None,
                 attentions: List[BaseAttention] = None,
                 embeddings_source: EmbeddedSequence = None,
                 attention_on_input: bool = True,
                 rnn_cell: str = "GRU",
                 conditional_gru: bool = False,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None) -> None:
        """Create a refactored version of monster decoder.

        Arguments:
            encoders: Input encoders of the decoder
            vocabulary: Target vocabulary
            data_id: Target data series
            name: Name of the decoder. Should be unique across all Neural
                Monkey objects
            max_output_len: Maximum length of an output sequence
            dropout_keep_prob: Probability of keeping a value during dropout

        Keyword arguments:
            rnn_size: Size of the decoder hidden state, if None set
                according to encoders.
            embedding_size: Size of embedding vectors for target words
            output_projection: How to generate distribution over vocabulary
                from decoder_outputs
            encoder_projection: How to construct initial state from encoders
            attentions: The attention objects to use. Optional.
            embeddings_source: Embedded sequence to take embeddings from
            rnn_cell: RNN cell used by the decoder ('GRU', 'LSTM', or
                'NematusGRU')
            conditional_gru: Flag whether to use the Conditional GRU
                architecture
            attention_on_input: Flag whether attention from previous decoding
                step should be combined with the input in the next step.
        """
        check_argument_types()
        AutoregressiveDecoder.__init__(self,
                                       name=name,
                                       vocabulary=vocabulary,
                                       data_id=data_id,
                                       max_output_len=max_output_len,
                                       dropout_keep_prob=dropout_keep_prob,
                                       save_checkpoint=save_checkpoint,
                                       load_checkpoint=load_checkpoint)

        self.encoders = encoders
        self.embedding_size = embedding_size
        self.rnn_size = rnn_size
        self.output_projection_spec = output_projection
        self.encoder_projection = encoder_projection
        self.attentions = attentions
        self.embeddings_source = embeddings_source
        self._conditional_gru = conditional_gru
        self._attention_on_input = attention_on_input
        self._rnn_cell_str = rnn_cell

        if self.attentions is None:
            self.attentions = []

        if self.embedding_size is None and self.embeddings_source is None:
            raise ValueError("You must specify either embedding size or the "
                             "embedded sequence from which to reuse the "
                             "embeddings (e.g. set either 'embedding_size' or "
                             " 'embeddings_source' parameter)")

        if self.embeddings_source is not None:
            if self.embedding_size is not None:
                warn("Overriding the embedding_size parameter with the"
                     " size of the reused embeddings from the encoder.")

            self.embedding_size = (
                self.embeddings_source.embedding_matrix.get_shape()[1].value)

        if self.encoder_projection is None:
            if not self.encoders:
                log("No encoder - language model only.")
                self.encoder_projection = empty_initial_state
            elif rnn_size is None:
                log("No rnn_size or encoder_projection: Using concatenation of"
                    " encoded states")
                self.encoder_projection = concat_encoder_projection
                self.rnn_size = sum(e.output.get_shape()[1].value
                                    for e in encoders)
            else:
                log("Using linear projection of encoders as the initial state")
                self.encoder_projection = linear_encoder_projection(
                    self.dropout_keep_prob)

        assert self.rnn_size is not None

        if self._rnn_cell_str not in RNN_CELL_TYPES:
            raise ValueError("RNN cell must be a either 'GRU', 'LSTM', or "
                             "'NematusGRU'. Not {}".format(self._rnn_cell_str))

        if self.output_projection_spec is None:
            log("No output projection specified - using tanh projection")
            self.output_projection = nonlinear_output(self.rnn_size,
                                                      tf.tanh)[0]
            self.output_projection_size = self.rnn_size
        elif isinstance(self.output_projection_spec, tuple):
            self.output_projection_spec = cast(Tuple[OutputProjection, int],
                                               self.output_projection_spec)
            (self.output_projection,
             self.output_projection_size) = self.output_projection_spec
        else:
            self.output_projection = cast(OutputProjection,
                                          self.output_projection_spec)
            self.output_projection_size = self.rnn_size

        if self._attention_on_input:
            self.input_projection = self.input_plus_attention
        else:
            self.input_projection = self.embed_input_symbol

        with self.use_scope():
            with tf.variable_scope("attention_decoder") as self.step_scope:
                pass

        # TODO when it is possible, remove the printing of the cost var
        log("Decoder initalized. Cost var: {}".format(str(self.cost)))
        log("Runtime logits tensor: {}".format(str(self.runtime_logits)))
Example #14
    def __init__(self,
                 encoders: List[Stateful],
                 vocabulary: Vocabulary,
                 data_id: str,
                 name: str,
                 max_output_len: int,
                 dropout_keep_prob: float = 1.0,
                 embedding_size: int = None,
                 embeddings_source: EmbeddedSequence = None,
                 tie_embeddings: bool = False,
                 label_smoothing: float = None,
                 rnn_size: int = None,
                 output_projection: OutputProjectionSpec = None,
                 encoder_projection: EncoderProjection = None,
                 attentions: List[BaseAttention] = None,
                 attention_on_input: bool = False,
                 rnn_cell: str = "GRU",
                 conditional_gru: bool = False,
                 supress_unk: bool = False,
                 reuse: ModelPart = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        """Create a refactored version of monster decoder.

        Arguments:
            encoders: Input encoders of the decoder.
            vocabulary: Target vocabulary.
            data_id: Target data series.
            name: Name of the decoder. Should be unique across all Neural
                Monkey objects.
            max_output_len: Maximum length of an output sequence.
            dropout_keep_prob: Probability of keeping a value during dropout.
            embedding_size: Size of embedding vectors for target words.
            embeddings_source: Embedded sequence to take embeddings from.
            tie_embeddings: Use decoder.embedding_matrix also in place
                of the output decoding matrix.
            rnn_size: Size of the decoder hidden state, if None set
                according to encoders.
            output_projection: How to generate distribution over vocabulary
                from decoder_outputs.
            encoder_projection: How to construct initial state from encoders.
            attentions: The attention objects to use. Optional.
            rnn_cell: RNN cell used by the decoder ('GRU', 'LSTM', or
                'NematusGRU').
            conditional_gru: Flag whether to use the Conditional GRU
                architecture.
            attention_on_input: Flag whether attention from previous decoding
                step should be combined with the input in the next step.
            supress_unk: If true, decoder will not produce symbols for unknown
                tokens.
            reuse: Reuse the model variables from the given model part.
        """
        check_argument_types()
        AutoregressiveDecoder.__init__(self,
                                       name=name,
                                       vocabulary=vocabulary,
                                       data_id=data_id,
                                       max_output_len=max_output_len,
                                       dropout_keep_prob=dropout_keep_prob,
                                       embedding_size=embedding_size,
                                       embeddings_source=embeddings_source,
                                       tie_embeddings=tie_embeddings,
                                       label_smoothing=label_smoothing,
                                       supress_unk=supress_unk,
                                       reuse=reuse,
                                       save_checkpoint=save_checkpoint,
                                       load_checkpoint=load_checkpoint,
                                       initializers=initializers)

        self.encoders = encoders
        self._output_projection_spec = output_projection
        self._conditional_gru = conditional_gru
        self._attention_on_input = attention_on_input
        self._rnn_cell_str = rnn_cell
        self._rnn_size = rnn_size
        self._encoder_projection = encoder_projection

        self.attentions = []  # type: List[BaseAttention]
        if attentions is not None:
            self.attentions = attentions

        if not rnn_size and not encoder_projection and not encoders:
            raise ValueError(
                "No RNN size, no encoders and no encoder_projection specified")

        if self._rnn_cell_str not in RNN_CELL_TYPES:
            raise ValueError("RNN cell must be a either 'GRU', 'LSTM', or "
                             "'NematusGRU'. Not {}".format(self._rnn_cell_str))

        if self._attention_on_input:
            self.input_projection = self.input_plus_attention
        else:
            self.input_projection = (
                lambda *args: LoopState(*args).feedables.embedded_input)

        with self.use_scope():
            with tf.variable_scope("attention_decoder") as self.step_scope:
                pass

        self._variable_scope.set_initializer(
            tf.random_normal_initializer(stddev=0.001))
Example #15
    def __init__(self,
                 encoders: List[Stateful],
                 vocabulary: Vocabulary,
                 data_id: str,
                 name: str,
                 max_output_len: int,
                 dropout_keep_prob: float = 1.0,
                 embedding_size: int = None,
                 embeddings_source: EmbeddedSequence = None,
                 tie_embeddings: bool = False,
                 label_smoothing: float = None,
                 rnn_size: int = None,
                 output_projection: OutputProjectionSpec = None,
                 encoder_projection: EncoderProjection = None,
                 attentions: List[BaseAttention] = None,
                 attention_on_input: bool = False,
                 rnn_cell: str = "GRU",
                 conditional_gru: bool = False,
                 supress_unk: bool = False,
                 reuse: ModelPart = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        """Create a refactored version of monster decoder.

        Arguments:
            encoders: Input encoders of the decoder.
            vocabulary: Target vocabulary.
            data_id: Target data series.
            name: Name of the decoder. Should be unique across all Neural
                Monkey objects.
            max_output_len: Maximum length of an output sequence.
            dropout_keep_prob: Probability of keeping a value during dropout.
            embedding_size: Size of embedding vectors for target words.
            embeddings_source: Embedded sequence to take embeddings from.
            tie_embeddings: Use decoder.embedding_matrix also in place
                of the output decoding matrix.
            rnn_size: Size of the decoder hidden state, if None set
                according to encoders.
            output_projection: How to generate distribution over vocabulary
                from decoder_outputs.
            encoder_projection: How to construct initial state from encoders.
            attentions: The attention objects to use. Optional.
            rnn_cell: RNN cell used by the decoder ('GRU', 'LSTM', or
                'NematusGRU').
            conditional_gru: Flag whether to use the Conditional GRU
                architecture.
            attention_on_input: Flag whether attention from previous decoding
                step should be combined with the input in the next step.
            supress_unk: If true, decoder will not produce symbols for unknown
                tokens.
            reuse: Reuse the model variables from the given model part.
        """
        check_argument_types()
        AutoregressiveDecoder.__init__(
            self,
            name=name,
            vocabulary=vocabulary,
            data_id=data_id,
            max_output_len=max_output_len,
            dropout_keep_prob=dropout_keep_prob,
            embedding_size=embedding_size,
            embeddings_source=embeddings_source,
            tie_embeddings=tie_embeddings,
            label_smoothing=label_smoothing,
            supress_unk=supress_unk,
            reuse=reuse,
            save_checkpoint=save_checkpoint,
            load_checkpoint=load_checkpoint,
            initializers=initializers)

        self.encoders = encoders
        self._output_projection_spec = output_projection
        self._conditional_gru = conditional_gru
        self._attention_on_input = attention_on_input
        self._rnn_cell_str = rnn_cell
        self._rnn_size = rnn_size
        self._encoder_projection = encoder_projection

        self.attentions = []  # type: List[BaseAttention]
        if attentions is not None:
            self.attentions = attentions

        if not rnn_size and not encoder_projection and not encoders:
            raise ValueError(
                "No RNN size, no encoders and no encoder_projection specified")

        if self._rnn_cell_str not in RNN_CELL_TYPES:
            raise ValueError("RNN cell must be a either 'GRU', 'LSTM', or "
                             "'NematusGRU'. Not {}".format(self._rnn_cell_str))

        if self._attention_on_input:
            self.input_projection = self.input_plus_attention
        else:
            self.input_projection = (
                lambda *args: LoopState(*args).feedables.embedded_input)

        with self.use_scope():
            with tf.variable_scope("attention_decoder") as self.step_scope:
                pass

        self._variable_scope.set_initializer(
            tf.random_normal_initializer(stddev=0.001))
Example #16
    def __init__(self,
                 name: str,
                 encoders: List[Attendable],
                 vocabulary: Vocabulary,
                 data_id: str,
                 # TODO infer the default for these three from the encoder
                 ff_hidden_size: int,
                 n_heads_self: int,
                 n_heads_enc: Union[List[int], int],
                 depth: int,
                 max_output_len: int,
                 attention_combination_strategy: str = "serial",
                 n_heads_hier: int = None,
                 dropout_keep_prob: float = 1.0,
                 embedding_size: int = None,
                 embeddings_source: EmbeddedSequence = None,
                 tie_embeddings: bool = True,
                 label_smoothing: float = None,
                 self_attention_dropout_keep_prob: float = 1.0,
                 attention_dropout_keep_prob: Union[float, List[float]] = 1.0,
                 use_att_transform_bias: bool = False,
                 supress_unk: bool = False,
                 reuse: ModelPart = None,
                 save_checkpoint: str = None,
                 load_checkpoint: str = None,
                 initializers: InitializerSpecs = None) -> None:
        """Create a decoder of the Transformer model.

        Described in Vaswani et al. (2017), arxiv.org/abs/1706.03762

        Arguments:
            encoders: Input encoders for the decoder.
            vocabulary: Target vocabulary.
            data_id: Target data series.
            name: Name of the decoder. Should be unique across all Neural
                Monkey objects.
            max_output_len: Maximum length of an output sequence.
            dropout_keep_prob: Probability of keeping a value during dropout.
            embedding_size: Size of embedding vectors for target words.
            embeddings_source: Embedded sequence to take embeddings from.
            tie_embeddings: Use decoder.embedding_matrix also in place
                of the output decoding matrix.
            ff_hidden_size: Size of the feedforward sublayers.
            n_heads_self: Number of the self-attention heads.
            n_heads_enc: Number of the attention heads over each encoder.
                Either a list whose length equals the number of
                ``encoders``, or a single integer. In the latter case, the
                number of heads is equal for all encoders.
            attention_combination_strategy: One of ``serial``, ``parallel``,
                ``flat``, ``hierarchical``. Controls the attention combination
                strategy for enc-dec attention.
            n_heads_hier: Number of the attention heads for the second
                attention in the ``hierarchical`` attention combination.
            depth: Number of sublayers.
            label_smoothing: A label smoothing parameter for cross entropy
                loss computation.
            attention_dropout_keep_prob: Probability of keeping a value
                during dropout on the attention output.
            supress_unk: If true, decoder will not produce symbols for unknown
                tokens.
            reuse: Reuse the variables from the given model part.
        """
        check_argument_types()
        AutoregressiveDecoder.__init__(
            self,
            name=name,
            vocabulary=vocabulary,
            data_id=data_id,
            max_output_len=max_output_len,
            dropout_keep_prob=dropout_keep_prob,
            embedding_size=embedding_size,
            embeddings_source=embeddings_source,
            tie_embeddings=tie_embeddings,
            label_smoothing=label_smoothing,
            supress_unk=supress_unk,
            reuse=reuse,
            save_checkpoint=save_checkpoint,
            load_checkpoint=load_checkpoint)

        self.encoders = encoders
        self.ff_hidden_size = ff_hidden_size
        self.n_heads_self = n_heads_self

        if isinstance(n_heads_enc, int):
            if attention_combination_strategy == "flat":
                self.n_heads_enc = [n_heads_enc]
            else:
                self.n_heads_enc = [n_heads_enc for _ in self.encoders]
        else:
            self.n_heads_enc = n_heads_enc

        self.depth = depth
        if isinstance(attention_dropout_keep_prob, float):
            self.attention_dropout_keep_prob = [
                attention_dropout_keep_prob for _ in encoders]
        else:
            self.attention_dropout_keep_prob = attention_dropout_keep_prob
        self.self_att_dropout_keep_prob = self_attention_dropout_keep_prob
        self.use_att_transform_bias = use_att_transform_bias
        self.attention_combination_strategy = attention_combination_strategy
        self.n_heads_hier = n_heads_hier

        self.encoder_states = lambda: [get_attention_states(e)
                                       for e in self.encoders]
        self.encoder_masks = lambda: [get_attention_mask(e)
                                      for e in self.encoders]

        if self.attention_combination_strategy not in STRATEGIES:
            raise ValueError(
                "Unknown attention combination strategy '{}'. "
                "Allowed: {}.".format(self.attention_combination_strategy,
                                      ", ".join(STRATEGIES)))

        if (self.attention_combination_strategy == "hierarchical"
                and self.n_heads_hier is None):
            raise ValueError(
                "You must provide n_heads_hier when using the hierarchical "
                "attention combination strategy.")

        if (self.attention_combination_strategy != "hierarchical"
                and self.n_heads_hier is not None):
            warn("Ignoring n_heads_hier parameter -- use the hierarchical "
                 "attention combination strategy instead.")

        if (self.attention_combination_strategy == "flat"
                and len(self.n_heads_enc) != 1):
            raise ValueError(
                "For the flat attention combination strategy, only a single "
                "value is permitted in n_heads_enc.")

        self._variable_scope.set_initializer(tf.variance_scaling_initializer(
            mode="fan_avg", distribution="uniform"))