Example #1
0
    def __init__(self, n_input, n_embed, n_hidden, n_output, seq_len,
                 cell_type='gru'):
        """Build the symbolic graph: embedding -> RNN encoder -> softmax.

        Creates three symbolic inputs (``input``, ``y`` and ``encode_mask``),
        chains the three layers together and exposes the combined parameter
        list plus the softmax layer's ``loss`` and ``pred`` on the instance.

        Args:
            n_input: forwarded to the embedding layer as ``n_input``
                (presumably the vocabulary size -- confirm in ``layers``).
            n_embed: embedding layer ``n_output`` and encoder ``n_input``.
            n_hidden: encoder ``n_hidden`` and softmax layer ``n_input``.
            n_output: softmax layer ``n_output``.
            seq_len: forwarded to the RNN encoder.
            cell_type: forwarded to the RNN encoder; defaults to ``'gru'``.
        """
        # Symbolic placeholders: integer token matrix, integer target vector
        # and a float mask handed to the encoder.
        self.input = T.imatrix('input')
        self.y = T.ivector('y')
        self.encode_mask = T.fmatrix('encode_mask')

        # Keep every hyper-parameter on the instance.
        self.n_input, self.n_embed = n_input, n_embed
        self.n_hidden, self.n_output = n_hidden, n_output
        self.seq_len = seq_len
        self.cell_type = cell_type

        print('Building model ...')

        # 1) Embedding lookup over the input matrix.
        embed = layers.EmbeddingLayer(
            input=self.input, n_input=n_input, n_output=n_embed)
        self.embed = embed

        # 2) Recurrent encoder over the embedded sequence.
        encoder = layers.RNNLayer(
            input=embed.out,
            mask=self.encode_mask,
            n_input=n_embed,
            n_hidden=n_hidden,
            seq_len=seq_len,
            cell_type=cell_type)
        self.encoder = encoder

        # 3) Softmax layer on the encoder's ``feat`` output; it provides both
        #    the loss and the prediction.
        softmax = layers.SoftmaxLayer(
            input=encoder.feat, y=self.y, n_input=n_hidden, n_output=n_output)
        self.ls = softmax

        # Parameter order (encoder, softmax, embedding) is kept as-is; code
        # outside this method may depend on it.
        self.params = encoder.params + softmax.params + embed.params

        self.loss = softmax.loss
        self.pred = softmax.pred
Example #2
0
 def _build(self):
     """Assemble the stacked RNN encoder/decoder and pass it to scoring.

     The encoder runs ``self.nb_layers`` recurrent layers over the embedded
     input; the value returned by ``_compute_last_hidden_representation``
     for each encoder layer becomes the ``init`` of the decoder layer at
     the same depth. The decoder output and an (un-applied) ``W_out``
     embedding layer are finally given to ``self._compute_scores``.
     """
     width = self.hidden_size

     # ----- Encoder ---------------------------------------------------
     enc_lookup = layers.EmbeddingLayer(input_size=self.input_dim,
                                        output_size=width,
                                        name='W_enc',
                                        dtype=self.dtype)
     enc_embedded = enc_lookup(ids=self.input)
     # Swap the first two axes; the original author annotates the result
     # as (T, bs, nb_lat).
     enc_seq = tf.transpose(enc_embedded, [1, 0, 2])

     layer_summaries = []
     for _ in range(self.nb_layers):
         enc_cell = self.rnn_layer(input_size=width,
                                   hidden_size=width,
                                   init=self.c_t,
                                   dtype=self.dtype)
         enc_seq = self._dropout(enc_cell(h_t=enc_seq))
         layer_summaries.append(
             self._compute_last_hidden_representation(enc_seq))

     # ----- Decoder ---------------------------------------------------
     dec_lookup = layers.EmbeddingLayer(input_size=self.output_dim,
                                        output_size=width,
                                        name='W_dec',
                                        dtype=self.dtype,
                                        pad_zero=True)
     dec_embedded = dec_lookup(ids=self.shifted_gt)
     dec_seq = tf.transpose(dec_embedded, [1, 0, 2])  # same axis swap

     # One decoder layer per encoder layer, seeded with that layer's summary.
     for enc_summary in layer_summaries:
         dec_cell = self.rnn_layer(input_size=width,
                                   hidden_size=width,
                                   init=enc_summary,
                                   dtype=self.dtype)
         dec_seq = self._dropout(dec_cell(h_t=dec_seq))

     # Undo the axis swap; annotated by the original author as
     # (bs, T, nb_lat).
     dec_seq = tf.transpose(dec_seq, [1, 0, 2])

     # ----- Output projection -----------------------------------------
     # The layer object itself (not its output) is handed to the scorer.
     out_projection = layers.EmbeddingLayer(input_size=self.output_dim,
                                            output_size=width,
                                            name='W_out',
                                            dtype=self.dtype)
     self._compute_scores(dec_seq, out_projection)
Example #3
0
    def __init__(self, config, batch_size, dropout_source, dropout_embedding,
                 dropout_hidden):
        """Create the embedding layer and the two GRU stacks of the encoder.

        Args:
            config: settings object; this method reads
                ``source_vocab_sizes``, ``dim_per_factor``, ``theano_compat``,
                ``embedding_size``, ``state_size``,
                ``rnn_layer_normalization``, ``rnn_enc_depth`` and
                ``rnn_enc_transition_depth``.
            batch_size: forwarded to both GRU stacks.
            dropout_source: stored on the instance; not used here.
            dropout_embedding: passed to the stacks as ``dropout_input``.
            dropout_hidden: passed to the stacks as ``dropout_state``.
        """
        self.dropout_source = dropout_source

        with tf.variable_scope("embedding"):
            self.emb_layer = layers.EmbeddingLayer(
                config.source_vocab_sizes, config.dim_per_factor)

        bias_type = (layers.LegacyBiasType.THEANO_A
                     if config.theano_compat
                     else layers.LegacyBiasType.NEMATUS_COMPAT_FALSE)

        # Settings common to both directions; the backward stack only adds
        # ``reverse_alternation=True`` on top of these.
        stack_kwargs = dict(
            input_size=config.embedding_size,
            state_size=config.state_size,
            batch_size=batch_size,
            use_layer_norm=config.rnn_layer_normalization,
            legacy_bias_type=bias_type,
            dropout_input=dropout_embedding,
            dropout_state=dropout_hidden,
            stack_depth=config.rnn_enc_depth,
            transition_depth=config.rnn_enc_transition_depth,
            alternating=True,
            residual_connections=True,
            first_residual_output=1,
        )

        # Each stack is built inside its own variable scope.
        with tf.variable_scope("forward-stack"):
            self.forward_encoder = layers.GRUStack(**stack_kwargs)

        with tf.variable_scope("backward-stack"):
            self.backward_encoder = layers.GRUStack(
                reverse_alternation=True, **stack_kwargs)
Example #4
0
    def __init__(self, config, batch_size, dropout_source, dropout_embedding,
                 dropout_hidden):
        """Create the embedding layer and the two GRU stacks of the encoder.

        Args:
            config: settings object; this method reads
                ``source_vocab_sizes``, ``dim_per_factor``,
                ``embedding_size``, ``state_size``, ``use_layer_norm``,
                ``enc_depth`` and ``enc_recurrence_transition_depth``.
            batch_size: forwarded to both GRU stacks.
            dropout_source: stored on the instance; not used here.
            dropout_embedding: passed to the stacks as ``dropout_input``.
            dropout_hidden: passed to the stacks as ``dropout_state``.
        """
        self.dropout_source = dropout_source

        def make_stack(**extra):
            # Both directions share every setting below; ``extra`` carries
            # the per-direction additions.
            return layers.GRUStack(
                input_size=config.embedding_size,
                state_size=config.state_size,
                batch_size=batch_size,
                use_layer_norm=config.use_layer_norm,
                nematus_compat=False,
                dropout_input=dropout_embedding,
                dropout_state=dropout_hidden,
                stack_depth=config.enc_depth,
                transition_depth=config.enc_recurrence_transition_depth,
                alternating=True,
                residual_connections=True,
                first_residual_output=1,
                **extra)

        with tf.variable_scope("embedding"):
            self.emb_layer = layers.EmbeddingLayer(
                config.source_vocab_sizes, config.dim_per_factor)

        # The factory is invoked inside each scope so the stacks are still
        # constructed under "forward-stack" / "backward-stack".
        with tf.variable_scope("forward-stack"):
            self.forward_encoder = make_stack()

        with tf.variable_scope("backward-stack"):
            # Differs from the forward stack only by reverse_alternation.
            self.backward_encoder = make_stack(reverse_alternation=True)
Example #5
0
    def __init__(self,
                 config,
                 context,
                 x_embs,
                 x_mask,
                 dropout_target,
                 dropout_embedding,
                 dropout_hidden,
                 encoder_embedding_layer=None):
        """Construct every sub-layer of the attention decoder.

        Builds, each under its own variable scope: the initial-state
        feed-forward layer, the target embedding layer, the base recurrence
        (GRU step, attention step, deep-transition GRU step), an optional
        stack of higher GRU layers, an optional lexical model, and the
        next-word predictor.

        Args:
            config: settings object. This method reads ``state_size``,
                ``rnn_layer_normalization``, ``translation_maxlen``,
                ``target_embedding_size``, ``target_vocab_size``,
                ``theano_compat``, ``rnn_dec_base_transition_depth``,
                ``rnn_dec_depth``, ``rnn_dec_high_transition_depth``,
                ``rnn_dec_deep_context``, ``rnn_lexical_model``,
                ``embedding_size`` and ``tie_decoder_embeddings``.
            context: encoder output tensor. It is mask-averaged over axis 0
                to seed the initial state and is also handed to the
                attention step.
            x_embs: stored on the instance; not otherwise used here.
            x_mask: source mask. Its dimension 1 is taken as the batch size
                and it is broadcast against ``context`` along a new axis 2.
            dropout_target: stored on the instance; not otherwise used here.
            dropout_embedding: dropout setting forwarded to the embedding-
                facing layers (first GRU step, lexical model, predictor).
            dropout_hidden: dropout setting forwarded to the hidden-state-
                facing layers.
            encoder_embedding_layer: optional existing embedding layer to
                reuse as the target embedding layer. When ``None`` (the
                default) a new ``layers.EmbeddingLayer`` is created.
        """

        self.dropout_target = dropout_target
        # Batch size is dynamic: read from the mask's second dimension.
        batch_size = tf.shape(x_mask)[1]

        with tf.variable_scope("initial_state_constructor"):
            # Sum of the context over axis 0, with masked positions zeroed.
            context_sum = tf.reduce_sum(context *
                                        tf.expand_dims(x_mask, axis=2),
                                        axis=0)

            # Divide by the number of unmasked positions to get a mean.
            # NOTE(review): a batch column whose mask sums to zero would
            # divide by zero here -- presumably callers never pass one.
            context_mean = context_sum / tf.expand_dims(
                tf.reduce_sum(x_mask, axis=0), axis=1)
            self.init_state_layer = layers.FeedForwardLayer(
                in_size=config.state_size * 2,
                out_size=config.state_size,
                batch_size=batch_size,
                use_layer_norm=config.rnn_layer_normalization,
                dropout_input=dropout_hidden)
            self.init_state = self.init_state_layer.forward(context_mean)
            self.x_embs = x_embs

            self.translation_maxlen = config.translation_maxlen
            self.embedding_size = config.target_embedding_size
            self.state_size = config.state_size
            self.target_vocab_size = config.target_vocab_size

        with tf.variable_scope("embedding"):
            # Identity test, not ``==``: equality would dispatch to a
            # user-defined ``__eq__`` on the supplied layer object.
            if encoder_embedding_layer is None:
                self.y_emb_layer = layers.EmbeddingLayer(
                    vocabulary_sizes=[config.target_vocab_size],
                    dim_per_factor=[config.target_embedding_size])
            else:
                self.y_emb_layer = encoder_embedding_layer

        with tf.variable_scope("base"):
            with tf.variable_scope("gru0"):
                if config.theano_compat:
                    bias_type = layers.LegacyBiasType.THEANO_A
                else:
                    bias_type = layers.LegacyBiasType.NEMATUS_COMPAT_FALSE
                self.grustep1 = layers.GRUStep(
                    input_size=config.target_embedding_size,
                    state_size=config.state_size,
                    batch_size=batch_size,
                    use_layer_norm=config.rnn_layer_normalization,
                    legacy_bias_type=bias_type,
                    dropout_input=dropout_embedding,
                    dropout_state=dropout_hidden)
            with tf.variable_scope("attention"):
                self.attstep = layers.AttentionStep(
                    context=context,
                    context_state_size=2 * config.state_size,
                    context_mask=x_mask,
                    state_size=config.state_size,
                    hidden_size=2 * config.state_size,
                    use_layer_norm=config.rnn_layer_normalization,
                    dropout_context=dropout_hidden,
                    dropout_state=dropout_hidden)
            # The second GRU step lives directly under "base"; its inner
            # scopes are named by ``var_scope_fn`` ("gru1", "gru2", ...).
            if config.theano_compat:
                bias_type = layers.LegacyBiasType.THEANO_B
            else:
                bias_type = layers.LegacyBiasType.NEMATUS_COMPAT_TRUE
            self.grustep2 = layers.DeepTransitionGRUStep(
                input_size=2 * config.state_size,
                state_size=config.state_size,
                batch_size=batch_size,
                use_layer_norm=config.rnn_layer_normalization,
                legacy_bias_type=bias_type,
                dropout_input=dropout_hidden,
                dropout_state=dropout_hidden,
                transition_depth=config.rnn_dec_base_transition_depth - 1,
                var_scope_fn=lambda i: "gru{0}".format(i + 1))

        with tf.variable_scope("high"):
            # A depth of 1 means the base recurrence is the whole decoder.
            if config.rnn_dec_depth == 1:
                self.high_gru_stack = None
            else:
                if config.theano_compat:
                    bias_type = layers.LegacyBiasType.THEANO_A
                else:
                    bias_type = layers.LegacyBiasType.NEMATUS_COMPAT_TRUE
                self.high_gru_stack = layers.GRUStack(
                    input_size=config.state_size,
                    state_size=config.state_size,
                    batch_size=batch_size,
                    use_layer_norm=config.rnn_layer_normalization,
                    legacy_bias_type=bias_type,
                    dropout_input=dropout_hidden,
                    dropout_state=dropout_hidden,
                    stack_depth=config.rnn_dec_depth - 1,
                    transition_depth=config.rnn_dec_high_transition_depth,
                    # A context size of 0 is passed unless deep context is on.
                    context_state_size=(2 * config.state_size
                                        if config.rnn_dec_deep_context else 0),
                    residual_connections=True,
                    first_residual_output=0)

        if config.rnn_lexical_model:
            with tf.variable_scope("lexical"):
                self.lexical_layer = layers.LexicalModel(
                    in_size=config.embedding_size,
                    out_size=config.embedding_size,
                    batch_size=batch_size,
                    use_layer_norm=config.rnn_layer_normalization,
                    dropout_embedding=dropout_embedding,
                    dropout_hidden=dropout_hidden)
        else:
            self.lexical_layer = None

        with tf.variable_scope("next_word_predictor"):
            # With tied embeddings, the transposed factor-0 embedding matrix
            # is handed to the predictor as its hidden-to-logits weight.
            W = None
            if config.tie_decoder_embeddings:
                W = self.y_emb_layer.get_embeddings(factor=0)
                W = tf.transpose(W)
            self.predictor = Predictor(config,
                                       batch_size,
                                       dropout_embedding,
                                       dropout_hidden,
                                       hidden_to_logits_W=W)
Example #6
0
 def _build(self):
     """Assemble the attention-based encoder/decoder and pass it to scoring.

     Both sides start from an embedding (with ``pos_enc``) followed by
     dropout. The encoder stacks ``self.nb_layers`` multi-head attention
     layers over its own previous output. Each decoder layer applies causal
     self-attention, dropout, a residual add with layer normalisation, and
     then multi-head attention whose keys/values come from the encoder.
     The last decoder output and an (un-applied) ``W_out`` embedding layer
     are given to ``self._compute_scores``.
     """
     # Width shared by every embedding / attention / norm layer below.
     model_dim = self.nb_heads * self.hidden_size

     # ----- Encoder ---------------------------------------------------
     src_lookup = layers.EmbeddingLayer(input_size=self.input_dim,
                                        output_size=model_dim,
                                        name='W_enc',
                                        dtype=self.dtype,
                                        pos_enc=self.pos_enc)
     src_embedded = self._dropout(src_lookup(ids=self.input))

     # Index 0 holds the embedded input; index i holds layer i's output.
     enc_outputs = [src_embedded]
     for _ in range(self.nb_layers):
         previous = enc_outputs[-1]
         attended = self._add_multi_head_attention_layer(
             keys=previous,
             queries=previous,
             values=previous,
             key_seq=self.input,
             value_seq=self.input)
         enc_outputs.append(attended)

     # ----- Decoder ---------------------------------------------------
     tgt_lookup = layers.EmbeddingLayer(input_size=self.output_dim,
                                        output_size=model_dim,
                                        name='W_dec',
                                        dtype=self.dtype,
                                        pad_zero=True,
                                        pos_enc=self.pos_enc)
     tgt_embedded = self._dropout(tgt_lookup(ids=self.shifted_gt))

     dec_outputs = [tgt_embedded]
     for depth in range(self.nb_layers):
         residual = dec_outputs[-1]

         # Causal self-attention over the decoder's previous output.
         self_attention = layers.SelfAttentionLayer(
             input_size=model_dim,
             hidden_size=self.hidden_size,
             key_seq=self.gt,
             value_seq=self.gt,
             nb_heads=self.nb_heads,
             causality=True,
             dtype=self.dtype)
         hidden = self_attention(keys=residual,
                                 queries=residual,
                                 values=residual)
         hidden = self._dropout(hidden)

         # Residual connection followed by layer normalisation.
         layer_norm = layers.LayerNorm(hidden_size=model_dim,
                                       dtype=self.dtype)
         hidden = layer_norm(x=hidden + residual)

         # Attention over the encoder side.
         # NOTE(review): layer ``depth`` reads enc_outputs[depth], so the
         # first decoder layer sees the embedded input and the final
         # encoder output is never read in this method -- confirm that
         # this pairing is intended.
         dec_outputs.append(
             self._add_multi_head_attention_layer(
                 keys=enc_outputs[depth],
                 queries=hidden,
                 values=enc_outputs[depth],
                 key_seq=self.input,
                 value_seq=self.gt))

     # ----- Output projection -----------------------------------------
     # The layer object itself (not its output) is handed to the scorer.
     out_projection = layers.EmbeddingLayer(input_size=self.output_dim,
                                            output_size=model_dim,
                                            name='W_out',
                                            dtype=self.dtype)
     self._compute_scores(dec_outputs[-1], out_projection)