def _add_multi_head_attention_layer(self, keys, queries, values, key_seq,
                                    value_seq):
    # Multi-Head Attention
    x = layers.SelfAttentionLayer(
        input_size=self.nb_heads * self.hidden_size,
        hidden_size=self.hidden_size,
        key_seq=key_seq,
        value_seq=value_seq,
        nb_heads=self.nb_heads,
        causality=False,
        dtype=self.dtype)(keys=keys, queries=queries, values=values)
    # Dropout
    x = self._dropout(x)
    # Add & Norm (residual connection around the attention sub-layer)
    h = layers.LayerNorm(
        hidden_size=self.nb_heads * self.hidden_size,
        dtype=self.dtype)(x=x + queries)
    # 2-layer Feed Forward (ReLU hidden layer, linear output layer)
    x = layers.FeedForwardLayer(
        hidden_size=self.nb_heads * self.hidden_size,
        activation=tf.nn.relu,
        dtype=self.dtype)(x=h)
    x = layers.FeedForwardLayer(
        hidden_size=self.nb_heads * self.hidden_size,
        activation=lambda x: x,
        dtype=self.dtype)(x=x)
    # Dropout
    x = self._dropout(x)
    # Add & Norm (residual connection around the feed-forward sub-layer)
    x = layers.LayerNorm(
        hidden_size=self.nb_heads * self.hidden_size,
        dtype=self.dtype)(x=x + h)
    return x
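# The method above follows the post-norm Transformer sub-layer pattern:
# out = LayerNorm(Dropout(sublayer(x)) + x), applied once around the
# attention block and once around the feed-forward block. A minimal numpy
# sketch of that pattern (kept in a comment so it does not execute here;
# the toy shapes and the tanh stand-in for a sub-layer are illustrative
# assumptions, not part of this codebase):
#
#     import numpy as np
#
#     def layer_norm(x, eps=1e-6):
#         mean = x.mean(axis=-1, keepdims=True)
#         var = x.var(axis=-1, keepdims=True)
#         return (x - mean) / np.sqrt(var + eps)
#
#     x = np.random.randn(2, 5, 8)      # [batch, time, nb_heads * hidden_size]
#     sublayer_out = np.tanh(x)         # stand-in for attention / feed-forward
#     h = layer_norm(sublayer_out + x)  # "Add & Norm"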
def __init__(self, config, batch_size, dropout_embedding, dropout_hidden,
             hidden_to_logits_W=None):
    self.config = config

    # Linear projections of the previous target embedding, the decoder
    # state, and the attended source context into a shared hidden space.
    with tf.variable_scope("prev_emb_to_hidden"):
        self.prev_emb_to_hidden = layers.FeedForwardLayer(
            in_size=config.target_embedding_size,
            out_size=config.target_embedding_size,
            batch_size=batch_size,
            non_linearity=lambda y: y,
            use_layer_norm=config.use_layer_norm,
            dropout_input=dropout_embedding)

    with tf.variable_scope("state_to_hidden"):
        self.state_to_hidden = layers.FeedForwardLayer(
            in_size=config.state_size,
            out_size=config.target_embedding_size,
            batch_size=batch_size,
            non_linearity=lambda y: y,
            use_layer_norm=config.use_layer_norm,
            dropout_input=dropout_hidden)

    with tf.variable_scope("attended_context_to_hidden"):
        self.att_ctx_to_hidden = layers.FeedForwardLayer(
            in_size=2 * config.state_size,
            out_size=config.target_embedding_size,
            batch_size=batch_size,
            non_linearity=lambda y: y,
            use_layer_norm=config.use_layer_norm,
            dropout_input=dropout_hidden)

    if config.output_hidden_activation == 'prelu':
        with tf.variable_scope("hidden_prelu"):
            self.hidden_prelu = PReLU(in_size=config.target_embedding_size)

    # Projection from the hidden space to the output vocabulary; the weight
    # matrix can be passed in (e.g. tied embeddings).
    with tf.variable_scope("hidden_to_logits"):
        self.hidden_to_logits = layers.FeedForwardLayer(
            in_size=config.target_embedding_size,
            out_size=config.target_vocab_size,
            batch_size=batch_size,
            non_linearity=lambda y: y,
            W=hidden_to_logits_W,
            dropout_input=dropout_embedding)

    # Mixture-of-softmaxes components (only when softmax_mixture_size > 1).
    if config.softmax_mixture_size > 1:
        with tf.variable_scope("hidden_to_pi_logits"):
            self.hidden_to_pi_logits = layers.FeedForwardLayer(
                in_size=config.target_embedding_size,
                out_size=config.softmax_mixture_size,
                batch_size=batch_size,
                non_linearity=lambda y: y,
                dropout_input=dropout_embedding)
        self.hidden_to_mos_hidden = []
        for k in range(config.softmax_mixture_size):
            with tf.variable_scope("hidden_to_mos_hidden_{}".format(k)):
                layer = layers.FeedForwardLayer(
                    in_size=config.target_embedding_size,
                    out_size=config.target_embedding_size,
                    batch_size=batch_size,
                    use_layer_norm=config.use_layer_norm,
                    dropout_input=dropout_embedding)
                self.hidden_to_mos_hidden.append(layer)
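# When config.softmax_mixture_size > 1, the layers above form a mixture of
# softmaxes (MoS): each of the K per-component hidden projections feeds the
# shared hidden_to_logits layer, and hidden_to_pi_logits yields the mixture
# weights. A hedged numpy sketch of the resulting probability computation
# (toy shapes, random weights, and the tanh non-linearity are assumptions):
#
#     import numpy as np
#
#     def softmax(z):
#         z = z - z.max(axis=-1, keepdims=True)
#         e = np.exp(z)
#         return e / e.sum(axis=-1, keepdims=True)
#
#     K, emb, vocab, batch = 3, 4, 10, 2
#     hidden = np.random.randn(batch, emb)
#     W_pi = np.random.randn(emb, K)                # hidden_to_pi_logits
#     W_out = np.random.randn(emb, vocab)           # shared hidden_to_logits
#     W_k = [np.random.randn(emb, emb) for _ in range(K)]
#
#     pi = softmax(hidden @ W_pi)                   # [batch, K] mixture weights
#     comps = np.stack([softmax(np.tanh(hidden @ Wk) @ W_out) for Wk in W_k],
#                      axis=1)                      # [batch, K, vocab]
#     probs = (pi[:, :, None] * comps).sum(axis=1)  # mixture of softmaxes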
def __init__(self, config, context, x_embs, x_mask, dropout_target,
             dropout_embedding, dropout_hidden,
             encoder_embedding_layer=None):
    self.dropout_target = dropout_target
    batch_size = tf.shape(x_mask)[1]

    # Initialize the decoder state from the masked mean of the encoder
    # context vectors.
    with tf.variable_scope("initial_state_constructor"):
        context_sum = tf.reduce_sum(
            context * tf.expand_dims(x_mask, axis=2), axis=0)
        context_mean = context_sum / tf.expand_dims(
            tf.reduce_sum(x_mask, axis=0), axis=1)
        self.init_state_layer = layers.FeedForwardLayer(
            in_size=config.state_size * 2,
            out_size=config.state_size,
            batch_size=batch_size,
            use_layer_norm=config.rnn_layer_normalization,
            dropout_input=dropout_hidden)
        self.init_state = self.init_state_layer.forward(context_mean)

    self.x_embs = x_embs
    self.translation_maxlen = config.translation_maxlen
    self.embedding_size = config.target_embedding_size
    self.state_size = config.state_size
    self.target_vocab_size = config.target_vocab_size

    # Target-side embeddings, optionally shared with the encoder.
    with tf.variable_scope("embedding"):
        if encoder_embedding_layer is None:
            self.y_emb_layer = layers.EmbeddingLayer(
                vocabulary_sizes=[config.target_vocab_size],
                dim_per_factor=[config.target_embedding_size])
        else:
            self.y_emb_layer = encoder_embedding_layer

    # Base recurrent layer: GRU step, attention step, then a
    # deep-transition GRU step.
    with tf.variable_scope("base"):
        with tf.variable_scope("gru0"):
            if config.theano_compat:
                bias_type = layers.LegacyBiasType.THEANO_A
            else:
                bias_type = layers.LegacyBiasType.NEMATUS_COMPAT_FALSE
            self.grustep1 = layers.GRUStep(
                input_size=config.target_embedding_size,
                state_size=config.state_size,
                batch_size=batch_size,
                use_layer_norm=config.rnn_layer_normalization,
                legacy_bias_type=bias_type,
                dropout_input=dropout_embedding,
                dropout_state=dropout_hidden)
        with tf.variable_scope("attention"):
            self.attstep = layers.AttentionStep(
                context=context,
                context_state_size=2 * config.state_size,
                context_mask=x_mask,
                state_size=config.state_size,
                hidden_size=2 * config.state_size,
                use_layer_norm=config.rnn_layer_normalization,
                dropout_context=dropout_hidden,
                dropout_state=dropout_hidden)
        if config.theano_compat:
            bias_type = layers.LegacyBiasType.THEANO_B
        else:
            bias_type = layers.LegacyBiasType.NEMATUS_COMPAT_TRUE
        self.grustep2 = layers.DeepTransitionGRUStep(
            input_size=2 * config.state_size,
            state_size=config.state_size,
            batch_size=batch_size,
            use_layer_norm=config.rnn_layer_normalization,
            legacy_bias_type=bias_type,
            dropout_input=dropout_hidden,
            dropout_state=dropout_hidden,
            transition_depth=config.rnn_dec_base_transition_depth - 1,
            var_scope_fn=lambda i: "gru{0}".format(i + 1))

    # Optional stack of higher GRU layers with residual connections.
    with tf.variable_scope("high"):
        if config.rnn_dec_depth == 1:
            self.high_gru_stack = None
        else:
            if config.theano_compat:
                bias_type = layers.LegacyBiasType.THEANO_A
            else:
                bias_type = layers.LegacyBiasType.NEMATUS_COMPAT_TRUE
            self.high_gru_stack = layers.GRUStack(
                input_size=config.state_size,
                state_size=config.state_size,
                batch_size=batch_size,
                use_layer_norm=config.rnn_layer_normalization,
                legacy_bias_type=bias_type,
                dropout_input=dropout_hidden,
                dropout_state=dropout_hidden,
                stack_depth=config.rnn_dec_depth - 1,
                transition_depth=config.rnn_dec_high_transition_depth,
                context_state_size=(2 * config.state_size
                                    if config.rnn_dec_deep_context else 0),
                residual_connections=True,
                first_residual_output=0)

    if config.rnn_lexical_model:
        with tf.variable_scope("lexical"):
            self.lexical_layer = layers.LexicalModel(
                in_size=config.embedding_size,
                out_size=config.embedding_size,
                batch_size=batch_size,
                use_layer_norm=config.rnn_layer_normalization,
                dropout_embedding=dropout_embedding,
                dropout_hidden=dropout_hidden)
    else:
        self.lexical_layer = None

    with tf.variable_scope("next_word_predictor"):
        W = None
        if config.tie_decoder_embeddings:
            # Reuse the (transposed) target embedding matrix as the
            # hidden-to-logits weight.
            W = self.y_emb_layer.get_embeddings(factor=0)
            W = tf.transpose(W)
        self.predictor = Predictor(config, batch_size, dropout_embedding,
                                   dropout_hidden, hidden_to_logits_W=W)
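# With config.tie_decoder_embeddings, the target embedding matrix of shape
# [target_vocab_size, target_embedding_size] is transposed to
# [target_embedding_size, target_vocab_size] and reused as hidden_to_logits_W,
# so the input embeddings and the output projection share parameters. A
# minimal numpy illustration (toy sizes are assumptions):
#
#     import numpy as np
#
#     vocab_size, emb_size, batch = 8, 4, 2
#     W_emb = np.random.randn(vocab_size, emb_size)  # embedding lookup table
#     hidden = np.random.randn(batch, emb_size)      # predictor hidden state
#     logits = hidden @ W_emb.T                      # tied output projection
#     assert logits.shape == (batch, vocab_size)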