def get_initial_loop_state(self) -> LoopState:
    default_ls = AutoregressiveDecoder.get_initial_loop_state(self)
    # feedables = default_ls.feedables._asdict()
    histories = default_ls.histories._asdict()

    histories["self_attention_histories"] = [
        empty_multi_head_loop_state(self.n_heads_self)
        for _ in range(self.depth)]

    histories["inter_attention_histories"] = [
        empty_multi_head_loop_state(self.n_heads_enc)
        for _ in range(self.depth)]

    histories["decoded_symbols"] = tf.TensorArray(
        dtype=tf.int32, dynamic_size=True, size=0,
        clear_after_read=False, name="decoded_symbols")

    input_mask = tf.TensorArray(
        dtype=tf.float32, dynamic_size=True, size=0,
        clear_after_read=False, name="input_mask")

    histories["input_mask"] = input_mask.write(
        0, tf.ones_like(self.go_symbols, dtype=tf.float32))

    # TransformerHistories is a type and should be callable
    # pylint: disable=not-callable
    tr_histories = TransformerHistories(**histories)
    # pylint: enable=not-callable

    return LoopState(
        histories=tr_histories,
        constants=[],
        feedables=default_ls.feedables)
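# A minimal self-contained sketch of the TensorArray pattern used above:
# with dynamic_size=True the array grows one step at a time, and
# clear_after_read=False lets later steps re-read earlier entries, which
# the recorded histories rely on. Names and values here are illustrative.
import tensorflow as tf

symbols = tf.TensorArray(
    dtype=tf.int32, dynamic_size=True, size=0,
    clear_after_read=False, name="example_symbols")

# Each write returns a new TensorArray handle; keep the returned value.
symbols = symbols.write(0, tf.constant([1, 2, 3]))
symbols = symbols.write(1, tf.constant([4, 5, 6]))

first = symbols.read(0)    # still readable thanks to clear_after_read=False
stacked = symbols.stack()  # shape (time, batch) == (2, 3)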
def get_initial_loop_state(self) -> LoopState:
    default_ls = AutoregressiveDecoder.get_initial_loop_state(self)
    feedables = default_ls.feedables._asdict()
    histories = default_ls.histories._asdict()

    feedables["prev_contexts"] = [
        tf.zeros([self.batch_size, a.context_vector_size])
        for a in self.attentions]
    feedables["prev_rnn_state"] = self.initial_state
    feedables["prev_rnn_output"] = self.initial_state

    histories["attention_histories"] = [
        a.initial_loop_state() for a in self.attentions if a is not None]
    histories["decoder_outputs"] = tf.zeros(
        shape=[0, self.batch_size, self.rnn_size],
        dtype=tf.float32,
        name="hist_decoder_outputs")

    # pylint: disable=not-callable
    rnn_feedables = RNNFeedables(**feedables)
    rnn_histories = RNNHistories(**histories)
    # pylint: enable=not-callable

    return LoopState(
        histories=rnn_histories,
        constants=default_ls.constants,
        feedables=rnn_feedables)
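# The loop-state extension above relies on the namedtuple protocol:
# _asdict() turns the default state into a mutable dict, decoder-specific
# fields are added, and the dict is splatted back into a wider namedtuple
# type. A self-contained sketch of the same pattern (types hypothetical):
from collections import namedtuple

Base = namedtuple("Base", ["step", "finished"])
Extended = namedtuple("Extended", ["step", "finished", "prev_output"])

base = Base(step=0, finished=False)
fields = base._asdict()            # OrderedDict of the existing fields
fields["prev_output"] = None       # add the subclass-specific field
extended = Extended(**fields)      # rebuild as the wider state type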
def get_initial_loop_state(self) -> LoopState:
    default_ls = AutoregressiveDecoder.get_initial_loop_state(self)
    histories = default_ls.histories._asdict()

    # histories["self_attention_histories"] = [
    #     empty_multi_head_loop_state(self.batch_size, self.n_heads_self)
    #     for _ in range(self.depth)]

    # histories["inter_attention_histories"] = [
    #     empty_multi_head_loop_state(self.batch_size, self.n_heads_enc)
    #     for _ in range(self.depth)]

    histories["decoded_symbols"] = tf.zeros(
        shape=[0, self.batch_size],
        dtype=tf.int32,
        name="decoded_symbols")

    histories["input_mask"] = tf.zeros(
        shape=[0, self.batch_size],
        dtype=tf.float32,
        name="input_mask")

    # TransformerHistories is a type and should be callable
    # pylint: disable=not-callable
    tr_histories = TransformerHistories(**histories)
    # pylint: enable=not-callable

    return LoopState(
        histories=tr_histories,
        constants=[],
        feedables=default_ls.feedables)
def train_loop_result(self) -> LoopState:
    # We process all the decoding steps together during training, but we
    # still want to pretend that a proper decoding_loop was called.
    decoder_ls = AutoregressiveDecoder.get_initial_loop_state(self)

    input_sequence = self.embed_input_symbols(self.train_input_symbols)
    input_mask = tf.transpose(self.train_mask)

    last_layer = self.layer(self.depth, input_sequence, input_mask)

    tr_feedables = TransformerFeedables(
        input_sequence=input_sequence,
        input_mask=tf.expand_dims(input_mask, -1))

    # temporal_states shape: (batch, time, channels)
    # decoding_w shape: (channels, vocab)
    last_layer_shape = tf.shape(last_layer.temporal_states)
    last_layer_states = tf.reshape(
        last_layer.temporal_states, [-1, last_layer_shape[-1]])

    # logits shape: (batch, time, vocab)
    logits = tf.reshape(
        tf.matmul(last_layer_states, self.decoding_w),
        [last_layer_shape[0], last_layer_shape[1], len(self.vocabulary)])
    logits += tf.reshape(self.decoding_b, [1, 1, -1])

    # TODO: record histories properly
    tr_histories = tf.zeros([])
    # tr_histories = TransformerHistories(
    #     self_attention_histories=[
    #         empty_multi_head_loop_state(self.batch_size,
    #                                     self.n_heads_self)
    #         for _ in range(self.depth)],
    #     encoder_attention_histories=[
    #         empty_multi_head_loop_state(self.batch_size,
    #                                     self.n_heads_enc)
    #         for _ in range(self.depth)])

    feedables = DecoderFeedables(
        step=last_layer_shape[1],
        finished=tf.ones([self.batch_size], dtype=tf.bool),
        embedded_input=self.embed_input_symbols(
            tf.tile([END_TOKEN_INDEX], [self.batch_size])),
        other=tr_feedables)

    histories = DecoderHistories(
        logits=tf.transpose(logits, perm=[1, 0, 2]),
        output_states=tf.transpose(last_layer.temporal_states, [1, 0, 2]),
        output_mask=self.train_mask,
        output_symbols=self.train_inputs,
        other=tr_histories)

    return LoopState(
        feedables=feedables,
        histories=histories,
        constants=decoder_ls.constants)
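# A shape-only sketch of the logits computation above: a (channels, vocab)
# output projection is applied to (batch, time, channels) states by
# flattening the leading dimensions, multiplying, and restoring the shape.
# The dimension sizes below are arbitrary example values.
import tensorflow as tf

batch, time, channels, vocab = 2, 5, 8, 11
states = tf.zeros([batch, time, channels])
dec_w = tf.zeros([channels, vocab])
dec_b = tf.zeros([vocab])

flat = tf.reshape(states, [-1, channels])     # (batch * time, channels)
logits = tf.reshape(tf.matmul(flat, dec_w),
                    [batch, time, vocab])     # (batch, time, vocab)
logits += tf.reshape(dec_b, [1, 1, -1])       # broadcast the bias over batch and time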
def get_initial_feedables(self) -> DecoderFeedables:
    feedables = AutoregressiveDecoder.get_initial_feedables(self)

    rnn_feedables = RNNFeedables(
        prev_contexts=[tf.zeros([self.batch_size, a.context_vector_size])
                       for a in self.attentions],
        prev_rnn_state=self.initial_state,
        prev_rnn_output=self.initial_state)

    return feedables._replace(other=rnn_feedables)
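# The feedables returned by the parent are a namedtuple with an `other`
# slot for subclass-specific state; _replace produces a copy with just
# that field swapped. A minimal sketch of the idiom (types hypothetical):
from collections import namedtuple

DecoderFeedablesDemo = namedtuple("DecoderFeedablesDemo",
                                  ["step", "finished", "other"])

parent_state = DecoderFeedablesDemo(step=0, finished=False, other=None)
rnn_state = {"prev_rnn_output": None}  # stands in for RNNFeedables
child_state = parent_state._replace(other=rnn_state)

assert child_state.step == parent_state.step  # untouched fields carry over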
def get_initial_histories(self) -> DecoderHistories:
    histories = AutoregressiveDecoder.get_initial_histories(self)

    rnn_histories = RNNHistories(
        rnn_outputs=tf.zeros(
            shape=[0, self.batch_size, self.rnn_size],
            dtype=tf.float32,
            name="hist_rnn_output_states"),
        attention_histories=[a.initial_loop_state()
                             for a in self.attentions if a is not None])

    return histories._replace(other=rnn_histories)
def get_initial_feedables(self) -> DecoderFeedables:
    feedables = AutoregressiveDecoder.get_initial_feedables(self)

    tr_feedables = TransformerFeedables(
        input_sequence=tf.zeros(
            shape=[self.batch_size, 0, self.dimension],
            dtype=tf.float32,
            name="input_sequence"),
        input_mask=tf.zeros(
            shape=[self.batch_size, 0, 1],
            dtype=tf.float32,
            name="input_mask"))

    return feedables._replace(other=tr_feedables)
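# The empty (batch, 0, dim) tensors above act as seeds that a decoding
# loop can extend along the time axis with tf.concat. This is a sketch of
# that growth pattern, not the actual loop body from this module.
import tensorflow as tf

batch_size, dimension = 2, 4
input_sequence = tf.zeros([batch_size, 0, dimension])   # no steps yet

new_step = tf.ones([batch_size, 1, dimension])          # one decoded step
input_sequence = tf.concat([input_sequence, new_step], axis=1)
# input_sequence now has shape (batch_size, 1, dimension)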
def get_initial_histories(self) -> DecoderHistories:
    histories = AutoregressiveDecoder.get_initial_histories(self)

    # TODO: record histories properly
    tr_histories = tf.zeros([])
    # tr_histories = TransformerHistories(
    #     self_attention_histories=[
    #         empty_multi_head_loop_state(self.batch_size,
    #                                     self.n_heads_self)
    #         for _ in range(self.depth)],
    #     encoder_attention_histories=[
    #         empty_multi_head_loop_state(self.batch_size,
    #                                     self.n_heads_enc)
    #         for _ in range(self.depth)])

    return histories._replace(other=tr_histories)
def __init__(self,
             encoders: List[Stateful],
             vocabulary: Vocabulary,
             data_id: str,
             name: str,
             max_output_len: int,
             dropout_keep_prob: float = 1.0,
             embedding_size: int = None,
             embeddings_source: EmbeddedSequence = None,
             tie_embeddings: bool = False,
             label_smoothing: float = None,
             rnn_size: int = None,
             output_projection: OutputProjectionSpec = None,
             encoder_projection: EncoderProjection = None,
             attentions: List[BaseAttention] = None,
             attention_on_input: bool = False,
             rnn_cell: str = "GRU",
             conditional_gru: bool = False,
             supress_unk: bool = False,
             reuse: ModelPart = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    """Create a refactored version of monster decoder.

    Arguments:
        encoders: Input encoders of the decoder.
        vocabulary: Target vocabulary.
        data_id: Target data series.
        name: Name of the decoder. Should be unique across all Neural
            Monkey objects.
        max_output_len: Maximum length of an output sequence.
        dropout_keep_prob: Probability of keeping a value during dropout.
        embedding_size: Size of embedding vectors for target words.
        embeddings_source: Embedded sequence to take embeddings from.
        tie_embeddings: Use decoder.embedding_matrix also in place of the
            output decoding matrix.
        rnn_size: Size of the decoder hidden state, if None set
            according to encoders.
        output_projection: How to generate distribution over vocabulary
            from decoder_outputs.
        encoder_projection: How to construct initial state from encoders.
        attentions: The attention objects to use. Optional.
        rnn_cell: RNN Cell used by the decoder (GRU or LSTM).
        conditional_gru: Flag whether to use the Conditional GRU
            architecture.
        attention_on_input: Flag whether attention from previous decoding
            step should be combined with the input in the next step.
        supress_unk: If true, decoder will not produce symbols for
            unknown tokens.
        reuse: Reuse the model variables from the given model part.
    """
    check_argument_types()
    AutoregressiveDecoder.__init__(
        self,
        name=name,
        vocabulary=vocabulary,
        data_id=data_id,
        max_output_len=max_output_len,
        dropout_keep_prob=dropout_keep_prob,
        embedding_size=embedding_size,
        embeddings_source=embeddings_source,
        tie_embeddings=tie_embeddings,
        label_smoothing=label_smoothing,
        supress_unk=supress_unk,
        reuse=reuse,
        save_checkpoint=save_checkpoint,
        load_checkpoint=load_checkpoint,
        initializers=initializers)

    self.encoders = encoders
    self.output_projection_spec = output_projection
    self._conditional_gru = conditional_gru
    self._attention_on_input = attention_on_input
    self._rnn_cell_str = rnn_cell

    self.attentions = []  # type: List[BaseAttention]
    if attentions is not None:
        self.attentions = attentions

    if rnn_size is not None:
        self.rnn_size = rnn_size

    if encoder_projection is not None:
        self.encoder_projection = encoder_projection
    elif not self.encoders:
        log("No direct encoder input. Using empty initial state")
        self.encoder_projection = empty_initial_state
    elif rnn_size is None:
        log("No rnn_size or encoder_projection: Using concatenation of"
            " encoded states")
        self.encoder_projection = concat_encoder_projection
        self.rnn_size = sum(e.output.get_shape()[1].value
                            for e in encoders)
    else:
        log("Using linear projection of encoders as the initial state")
        self.encoder_projection = linear_encoder_projection(
            self.dropout_keep_prob)

    assert self.rnn_size is not None

    if self._rnn_cell_str not in RNN_CELL_TYPES:
        raise ValueError("RNN cell must be either 'GRU', 'LSTM', or "
                         "'NematusGRU'. Not {}".format(self._rnn_cell_str))

    if self.output_projection_spec is None:
        log("No output projection specified - using tanh projection")
        self.output_projection = nonlinear_output(
            self.rnn_size, tf.tanh)[0]
        self.output_projection_size = self.rnn_size
    elif isinstance(self.output_projection_spec, tuple):
        self.output_projection_spec = cast(
            Tuple[OutputProjection, int], self.output_projection_spec)
        (self.output_projection,
         self.output_projection_size) = self.output_projection_spec
    else:
        self.output_projection = cast(OutputProjection,
                                      self.output_projection_spec)
        self.output_projection_size = self.rnn_size

    if self._attention_on_input:
        self.input_projection = self.input_plus_attention
    else:
        self.input_projection = self.embed_input_symbol

    with self.use_scope():
        with tf.variable_scope("attention_decoder") as self.step_scope:
            pass

    self._variable_scope.set_initializer(
        tf.random_normal_initializer(stddev=0.001))

    # TODO when it is possible, remove the printing of the cost var
    log("Decoder initialized. Cost var: {}".format(str(self.cost)))
    log("Runtime logits tensor: {}".format(str(self.runtime_logits)))
def __init__(self,
             name: str,
             encoder: Attendable,
             vocabulary: Vocabulary,
             data_id: str,
             # TODO infer the default for these three from the encoder
             ff_hidden_size: int,
             n_heads_self: int,
             n_heads_enc: int,
             depth: int,
             max_output_len: int,
             dropout_keep_prob: float = 1.0,
             embedding_size: int = None,
             embeddings_source: EmbeddedSequence = None,
             tie_embeddings: bool = True,
             label_smoothing: float = None,
             attention_dropout_keep_prob: float = 1.0,
             use_att_transform_bias: bool = False,
             supress_unk: bool = False,
             save_checkpoint: str = None,
             load_checkpoint: str = None) -> None:
    """Create a decoder of the Transformer model.

    Described in Vaswani et al. (2017), arxiv.org/abs/1706.03762

    Arguments:
        encoder: Input encoder of the decoder.
        vocabulary: Target vocabulary.
        data_id: Target data series.
        name: Name of the decoder. Should be unique across all Neural
            Monkey objects.
        max_output_len: Maximum length of an output sequence.
        dropout_keep_prob: Probability of keeping a value during dropout.
        embedding_size: Size of embedding vectors for target words.
        embeddings_source: Embedded sequence to take embeddings from.
        tie_embeddings: Use decoder.embedding_matrix also in place of the
            output decoding matrix.

    Keyword arguments:
        ff_hidden_size: Size of the feedforward sublayers.
        n_heads_self: Number of the self-attention heads.
        n_heads_enc: Number of the attention heads over the encoder.
        depth: Number of sublayers.
        label_smoothing: A label smoothing parameter for cross entropy
            loss computation.
        attention_dropout_keep_prob: Probability of keeping a value
            during dropout on the attention output.
        supress_unk: If true, decoder will not produce symbols for
            unknown tokens.
    """
    check_argument_types()
    AutoregressiveDecoder.__init__(
        self,
        name=name,
        vocabulary=vocabulary,
        data_id=data_id,
        max_output_len=max_output_len,
        dropout_keep_prob=dropout_keep_prob,
        embedding_size=embedding_size,
        embeddings_source=embeddings_source,
        tie_embeddings=tie_embeddings,
        label_smoothing=label_smoothing,
        supress_unk=supress_unk,
        save_checkpoint=save_checkpoint,
        load_checkpoint=load_checkpoint)

    self.encoder = encoder
    self.ff_hidden_size = ff_hidden_size
    self.n_heads_self = n_heads_self
    self.n_heads_enc = n_heads_enc
    self.depth = depth
    self.attention_dropout_keep_prob = attention_dropout_keep_prob
    self.use_att_transform_bias = use_att_transform_bias

    self.encoder_states = get_attention_states(self.encoder)
    self.encoder_mask = get_attention_mask(self.encoder)
    self.dimension = (
        self.encoder_states.get_shape()[2].value)  # type: ignore

    if self.embedding_size != self.dimension:
        raise ValueError("Model dimension and input embedding size "
                         "do not match")

    self._variable_scope.set_initializer(tf.variance_scaling_initializer(
        mode="fan_avg", distribution="uniform"))

    log("Decoder cost op: {}".format(self.cost))
    self._variable_scope.reuse_variables()
    log("Runtime logits: {}".format(self.runtime_logits))
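# A hypothetical instantiation of this constructor, assuming the class is
# exported as TransformerDecoder and that `encoder` and `vocabulary` were
# built elsewhere; the values are illustrative, not recommended settings.
decoder = TransformerDecoder(
    name="decoder",
    encoder=encoder,        # any Attendable model part
    vocabulary=vocabulary,
    data_id="target",
    ff_hidden_size=2048,
    n_heads_self=8,
    n_heads_enc=8,
    depth=6,
    max_output_len=50,
    dropout_keep_prob=0.9,
    embedding_size=512)     # must equal the encoder state dimension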
def __init__(self,
             encoders: List[Stateful],
             vocabulary: Vocabulary,
             data_id: str,
             name: str,
             max_output_len: int,
             dropout_keep_prob: float = 1.0,
             rnn_size: int = None,
             embedding_size: int = None,
             output_projection: OutputProjectionSpec = None,
             encoder_projection: EncoderProjection = None,
             attentions: List[BaseAttention] = None,
             embeddings_source: EmbeddedSequence = None,
             attention_on_input: bool = True,
             rnn_cell: str = "GRU",
             conditional_gru: bool = False,
             save_checkpoint: str = None,
             load_checkpoint: str = None) -> None:
    """Create a refactored version of monster decoder.

    Arguments:
        encoders: Input encoders of the decoder
        vocabulary: Target vocabulary
        data_id: Target data series
        name: Name of the decoder. Should be unique across all Neural
            Monkey objects
        max_output_len: Maximum length of an output sequence
        dropout_keep_prob: Probability of keeping a value during dropout

    Keyword arguments:
        rnn_size: Size of the decoder hidden state, if None set
            according to encoders.
        embedding_size: Size of embedding vectors for target words
        output_projection: How to generate distribution over vocabulary
            from decoder_outputs
        encoder_projection: How to construct initial state from encoders
        attentions: The attention objects to use. Optional.
        embeddings_source: Embedded sequence to take embeddings from
        rnn_cell: RNN Cell used by the decoder (GRU or LSTM)
        conditional_gru: Flag whether to use the Conditional GRU
            architecture
        attention_on_input: Flag whether attention from previous decoding
            step should be combined with the input in the next step.
    """
    check_argument_types()
    AutoregressiveDecoder.__init__(
        self,
        name=name,
        vocabulary=vocabulary,
        data_id=data_id,
        max_output_len=max_output_len,
        dropout_keep_prob=dropout_keep_prob,
        save_checkpoint=save_checkpoint,
        load_checkpoint=load_checkpoint)

    self.encoders = encoders
    self.embedding_size = embedding_size
    self.rnn_size = rnn_size
    self.output_projection_spec = output_projection
    self.encoder_projection = encoder_projection
    self.attentions = attentions
    self.embeddings_source = embeddings_source
    self._conditional_gru = conditional_gru
    self._attention_on_input = attention_on_input
    self._rnn_cell_str = rnn_cell

    if self.attentions is None:
        self.attentions = []

    if self.embedding_size is None and self.embeddings_source is None:
        raise ValueError("You must specify either embedding size or the "
                         "embedded sequence from which to reuse the "
                         "embeddings (e.g. set either 'embedding_size' "
                         "or 'embeddings_source' parameter)")

    if self.embeddings_source is not None:
        if self.embedding_size is not None:
            warn("Overriding the embedding_size parameter with the"
                 " size of the reused embeddings from the encoder.")

        self.embedding_size = (
            self.embeddings_source.embedding_matrix.get_shape()[1].value)

    if self.encoder_projection is None:
        if not self.encoders:
            log("No encoder - language model only.")
            self.encoder_projection = empty_initial_state
        elif rnn_size is None:
            log("No rnn_size or encoder_projection: Using concatenation"
                " of encoded states")
            self.encoder_projection = concat_encoder_projection
            self.rnn_size = sum(e.output.get_shape()[1].value
                                for e in encoders)
        else:
            log("Using linear projection of encoders as the initial state")
            self.encoder_projection = linear_encoder_projection(
                self.dropout_keep_prob)

    assert self.rnn_size is not None

    if self._rnn_cell_str not in RNN_CELL_TYPES:
        raise ValueError("RNN cell must be either 'GRU', 'LSTM', or "
                         "'NematusGRU'. Not {}".format(self._rnn_cell_str))

    if self.output_projection_spec is None:
        log("No output projection specified - using tanh projection")
        self.output_projection = nonlinear_output(
            self.rnn_size, tf.tanh)[0]
        self.output_projection_size = self.rnn_size
    elif isinstance(self.output_projection_spec, tuple):
        self.output_projection_spec = cast(
            Tuple[OutputProjection, int], self.output_projection_spec)
        (self.output_projection,
         self.output_projection_size) = self.output_projection_spec
    else:
        self.output_projection = cast(OutputProjection,
                                      self.output_projection_spec)
        self.output_projection_size = self.rnn_size

    if self._attention_on_input:
        self.input_projection = self.input_plus_attention
    else:
        self.input_projection = self.embed_input_symbol

    with self.use_scope():
        with tf.variable_scope("attention_decoder") as self.step_scope:
            pass

    # TODO when it is possible, remove the printing of the cost var
    log("Decoder initialized. Cost var: {}".format(str(self.cost)))
    log("Runtime logits tensor: {}".format(str(self.runtime_logits)))
def __init__(self,
             encoders: List[Stateful],
             vocabulary: Vocabulary,
             data_id: str,
             name: str,
             max_output_len: int,
             dropout_keep_prob: float = 1.0,
             embedding_size: int = None,
             embeddings_source: EmbeddedSequence = None,
             tie_embeddings: bool = False,
             label_smoothing: float = None,
             rnn_size: int = None,
             output_projection: OutputProjectionSpec = None,
             encoder_projection: EncoderProjection = None,
             attentions: List[BaseAttention] = None,
             attention_on_input: bool = False,
             rnn_cell: str = "GRU",
             conditional_gru: bool = False,
             supress_unk: bool = False,
             reuse: ModelPart = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    """Create a refactored version of monster decoder.

    Arguments:
        encoders: Input encoders of the decoder.
        vocabulary: Target vocabulary.
        data_id: Target data series.
        name: Name of the decoder. Should be unique across all Neural
            Monkey objects.
        max_output_len: Maximum length of an output sequence.
        dropout_keep_prob: Probability of keeping a value during dropout.
        embedding_size: Size of embedding vectors for target words.
        embeddings_source: Embedded sequence to take embeddings from.
        tie_embeddings: Use decoder.embedding_matrix also in place of the
            output decoding matrix.
        rnn_size: Size of the decoder hidden state, if None set
            according to encoders.
        output_projection: How to generate distribution over vocabulary
            from decoder_outputs.
        encoder_projection: How to construct initial state from encoders.
        attentions: The attention objects to use. Optional.
        rnn_cell: RNN Cell used by the decoder (GRU or LSTM).
        conditional_gru: Flag whether to use the Conditional GRU
            architecture.
        attention_on_input: Flag whether attention from previous decoding
            step should be combined with the input in the next step.
        supress_unk: If true, decoder will not produce symbols for
            unknown tokens.
        reuse: Reuse the model variables from the given model part.
    """
    check_argument_types()
    AutoregressiveDecoder.__init__(
        self,
        name=name,
        vocabulary=vocabulary,
        data_id=data_id,
        max_output_len=max_output_len,
        dropout_keep_prob=dropout_keep_prob,
        embedding_size=embedding_size,
        embeddings_source=embeddings_source,
        tie_embeddings=tie_embeddings,
        label_smoothing=label_smoothing,
        supress_unk=supress_unk,
        reuse=reuse,
        save_checkpoint=save_checkpoint,
        load_checkpoint=load_checkpoint,
        initializers=initializers)

    self.encoders = encoders
    self._output_projection_spec = output_projection
    self._conditional_gru = conditional_gru
    self._attention_on_input = attention_on_input
    self._rnn_cell_str = rnn_cell
    self._rnn_size = rnn_size
    self._encoder_projection = encoder_projection

    self.attentions = []  # type: List[BaseAttention]
    if attentions is not None:
        self.attentions = attentions

    if not rnn_size and not encoder_projection and not encoders:
        raise ValueError(
            "No RNN size, no encoders and no encoder_projection specified")

    if self._rnn_cell_str not in RNN_CELL_TYPES:
        raise ValueError("RNN cell must be either 'GRU', 'LSTM', or "
                         "'NematusGRU'. Not {}".format(self._rnn_cell_str))

    if self._attention_on_input:
        self.input_projection = self.input_plus_attention
    else:
        self.input_projection = (
            lambda *args: LoopState(*args).feedables.embedded_input)

    with self.use_scope():
        with tf.variable_scope("attention_decoder") as self.step_scope:
            pass

    self._variable_scope.set_initializer(
        tf.random_normal_initializer(stddev=0.001))
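# A hypothetical instantiation of the refactored RNN decoder above,
# assuming the class is exported as Decoder and that `encoder`,
# `attention`, and `vocabulary` exist; the values are illustrative only.
decoder = Decoder(
    name="decoder",
    encoders=[encoder],
    vocabulary=vocabulary,
    data_id="target",
    max_output_len=50,
    embedding_size=300,
    rnn_size=256,            # or leave None and rely on encoder_projection
    attentions=[attention],
    rnn_cell="GRU",
    dropout_keep_prob=0.8)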
def __init__(self,
             name: str,
             encoders: List[Attendable],
             vocabulary: Vocabulary,
             data_id: str,
             # TODO infer the default for these three from the encoder
             ff_hidden_size: int,
             n_heads_self: int,
             n_heads_enc: Union[List[int], int],
             depth: int,
             max_output_len: int,
             attention_combination_strategy: str = "serial",
             n_heads_hier: int = None,
             dropout_keep_prob: float = 1.0,
             embedding_size: int = None,
             embeddings_source: EmbeddedSequence = None,
             tie_embeddings: bool = True,
             label_smoothing: float = None,
             self_attention_dropout_keep_prob: float = 1.0,
             attention_dropout_keep_prob: Union[float, List[float]] = 1.0,
             use_att_transform_bias: bool = False,
             supress_unk: bool = False,
             reuse: ModelPart = None,
             save_checkpoint: str = None,
             load_checkpoint: str = None,
             initializers: InitializerSpecs = None) -> None:
    """Create a decoder of the Transformer model.

    Described in Vaswani et al. (2017), arxiv.org/abs/1706.03762

    Arguments:
        encoders: Input encoders for the decoder.
        vocabulary: Target vocabulary.
        data_id: Target data series.
        name: Name of the decoder. Should be unique across all Neural
            Monkey objects.
        max_output_len: Maximum length of an output sequence.
        dropout_keep_prob: Probability of keeping a value during dropout.
        embedding_size: Size of embedding vectors for target words.
        embeddings_source: Embedded sequence to take embeddings from.
        tie_embeddings: Use decoder.embedding_matrix also in place of the
            output decoding matrix.
        ff_hidden_size: Size of the feedforward sublayers.
        n_heads_self: Number of the self-attention heads.
        n_heads_enc: Number of the attention heads over each encoder.
            Either a list whose size must be equal to ``encoders``, or a
            single integer. In the latter case, the number of heads is
            equal for all encoders.
        attention_combination_strategy: One of ``serial``, ``parallel``,
            ``flat``, ``hierarchical``. Controls the attention combination
            strategy for enc-dec attention.
        n_heads_hier: Number of the attention heads for the second
            attention in the ``hierarchical`` attention combination.
        depth: Number of sublayers.
        label_smoothing: A label smoothing parameter for cross entropy
            loss computation.
        attention_dropout_keep_prob: Probability of keeping a value
            during dropout on the attention output.
        supress_unk: If true, decoder will not produce symbols for
            unknown tokens.
        reuse: Reuse the variables from the given model part.
    """
    check_argument_types()
    AutoregressiveDecoder.__init__(
        self,
        name=name,
        vocabulary=vocabulary,
        data_id=data_id,
        max_output_len=max_output_len,
        dropout_keep_prob=dropout_keep_prob,
        embedding_size=embedding_size,
        embeddings_source=embeddings_source,
        tie_embeddings=tie_embeddings,
        label_smoothing=label_smoothing,
        supress_unk=supress_unk,
        reuse=reuse,
        save_checkpoint=save_checkpoint,
        load_checkpoint=load_checkpoint)

    self.encoders = encoders
    self.ff_hidden_size = ff_hidden_size
    self.n_heads_self = n_heads_self

    if isinstance(n_heads_enc, int):
        if attention_combination_strategy == "flat":
            self.n_heads_enc = [n_heads_enc]
        else:
            self.n_heads_enc = [n_heads_enc for _ in self.encoders]
    else:
        self.n_heads_enc = n_heads_enc

    self.depth = depth

    if isinstance(attention_dropout_keep_prob, float):
        self.attention_dropout_keep_prob = [
            attention_dropout_keep_prob for _ in encoders]
    else:
        self.attention_dropout_keep_prob = attention_dropout_keep_prob

    self.self_att_dropout_keep_prob = self_attention_dropout_keep_prob
    self.use_att_transform_bias = use_att_transform_bias
    self.attention_combination_strategy = attention_combination_strategy
    self.n_heads_hier = n_heads_hier

    self.encoder_states = lambda: [get_attention_states(e)
                                   for e in self.encoders]
    self.encoder_masks = lambda: [get_attention_mask(e)
                                  for e in self.encoders]

    if self.attention_combination_strategy not in STRATEGIES:
        raise ValueError(
            "Unknown attention combination strategy '{}'. "
            "Allowed: {}.".format(self.attention_combination_strategy,
                                  ", ".join(STRATEGIES)))

    if (self.attention_combination_strategy == "hierarchical"
            and self.n_heads_hier is None):
        raise ValueError(
            "You must provide n_heads_hier when using the hierarchical "
            "attention combination strategy.")

    if (self.attention_combination_strategy != "hierarchical"
            and self.n_heads_hier is not None):
        warn("Ignoring n_heads_hier parameter -- it is only used with "
             "the hierarchical attention combination strategy.")

    if (self.attention_combination_strategy == "flat"
            and len(self.n_heads_enc) != 1):
        raise ValueError(
            "For the flat attention combination strategy, only a single "
            "value is permitted in n_heads_enc.")

    self._variable_scope.set_initializer(tf.variance_scaling_initializer(
        mode="fan_avg", distribution="uniform"))