def __init__(self, n_input, n_embed, n_hidden, n_output, seq_len, cell_type='gru'):
    # Symbolic model inputs and targets
    self.input = T.imatrix('input')
    self.y = T.ivector('y')
    self.encode_mask = T.fmatrix('encode_mask')

    self.cell_type = cell_type
    self.n_input = n_input
    self.n_embed = n_embed
    self.seq_len = seq_len
    self.n_hidden = n_hidden
    self.n_output = n_output

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('Building model ...')

    ########## Embedding ##############
    self.embed = layers.EmbeddingLayer(input=self.input,
                                       n_input=self.n_input,
                                       n_output=self.n_embed)

    ########## Encoder ##############
    self.encoder = layers.RNNLayer(input=self.embed.out,
                                   mask=self.encode_mask,
                                   n_input=self.n_embed,
                                   n_hidden=self.n_hidden,
                                   seq_len=self.seq_len,
                                   cell_type=self.cell_type)

    ########## Predictor ##############
    # Loss layer
    self.ls = layers.SoftmaxLayer(input=self.encoder.feat,
                                  y=self.y,
                                  n_input=self.n_hidden,
                                  n_output=self.n_output)

    # Combine parameters
    self.params = self.encoder.params + self.ls.params + self.embed.params
    self.loss = self.ls.loss
    self.pred = self.ls.pred
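# Hedged usage sketch (not in the original source): it assumes the constructor
# above belongs to a class named `RNNClassifier` and that the `layers` module it
# relies on is importable. The class name, hyper-parameters and learning rate
# below are illustrative only.
import theano
import theano.tensor as T

model = RNNClassifier(n_input=10000, n_embed=128, n_hidden=256,
                      n_output=5, seq_len=50, cell_type='gru')

# Plain SGD over all parameters exposed by the model.
lr = 0.01
grads = T.grad(model.loss, model.params)
updates = [(p, p - lr * g) for p, g in zip(model.params, grads)]

train_fn = theano.function(inputs=[model.input, model.encode_mask, model.y],
                           outputs=model.loss,
                           updates=updates)
predict_fn = theano.function(inputs=[model.input, model.encode_mask],
                             outputs=model.pred)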
def _build(self):
    # Input embedding
    h_enc = layers.EmbeddingLayer(input_size=self.input_dim,
                                  output_size=self.hidden_size,
                                  name='W_enc',
                                  dtype=self.dtype)(ids=self.input)

    # Encoder part
    encoder = tf.transpose(h_enc, [1, 0, 2])  # (T, bs, nb_lat)
    last_h = []
    for k in range(self.nb_layers):
        encoder = self.rnn_layer(input_size=self.hidden_size,
                                 hidden_size=self.hidden_size,
                                 init=self.c_t,
                                 dtype=self.dtype)(h_t=encoder)
        encoder = self._dropout(encoder)
        last_h.append(self._compute_last_hidden_representation(encoder))

    # Output embedding
    h_dec = layers.EmbeddingLayer(input_size=self.output_dim,
                                  output_size=self.hidden_size,
                                  name='W_dec',
                                  dtype=self.dtype,
                                  pad_zero=True)(ids=self.shifted_gt)

    # Decoder part, each layer initialised with the matching encoder state
    decoder = tf.transpose(h_dec, [1, 0, 2])  # (T, bs, nb_lat)
    for k in range(self.nb_layers):
        decoder = self.rnn_layer(input_size=self.hidden_size,
                                 hidden_size=self.hidden_size,
                                 init=last_h[k],
                                 dtype=self.dtype)(h_t=decoder)
        decoder = self._dropout(decoder)
    decoder = tf.transpose(decoder, [1, 0, 2])  # (bs, T, nb_lat)

    # Output linear transformation layer
    W_out = layers.EmbeddingLayer(input_size=self.output_dim,
                                  output_size=self.hidden_size,
                                  name='W_out',
                                  dtype=self.dtype)
    self._compute_scores(decoder, W_out)
def __init__(self, config, batch_size, dropout_source, dropout_embedding,
             dropout_hidden):
    self.dropout_source = dropout_source

    with tf.variable_scope("embedding"):
        self.emb_layer = layers.EmbeddingLayer(config.source_vocab_sizes,
                                               config.dim_per_factor)

    if config.theano_compat:
        bias_type = layers.LegacyBiasType.THEANO_A
    else:
        bias_type = layers.LegacyBiasType.NEMATUS_COMPAT_FALSE

    with tf.variable_scope("forward-stack"):
        self.forward_encoder = layers.GRUStack(
            input_size=config.embedding_size,
            state_size=config.state_size,
            batch_size=batch_size,
            use_layer_norm=config.rnn_layer_normalization,
            legacy_bias_type=bias_type,
            dropout_input=dropout_embedding,
            dropout_state=dropout_hidden,
            stack_depth=config.rnn_enc_depth,
            transition_depth=config.rnn_enc_transition_depth,
            alternating=True,
            residual_connections=True,
            first_residual_output=1)

    with tf.variable_scope("backward-stack"):
        self.backward_encoder = layers.GRUStack(
            input_size=config.embedding_size,
            state_size=config.state_size,
            batch_size=batch_size,
            use_layer_norm=config.rnn_layer_normalization,
            legacy_bias_type=bias_type,
            dropout_input=dropout_embedding,
            dropout_state=dropout_hidden,
            stack_depth=config.rnn_enc_depth,
            transition_depth=config.rnn_enc_transition_depth,
            alternating=True,
            reverse_alternation=True,
            residual_connections=True,
            first_residual_output=1)
def __init__(self, config, batch_size, dropout_source, dropout_embedding,
             dropout_hidden):
    self.dropout_source = dropout_source

    with tf.variable_scope("embedding"):
        self.emb_layer = layers.EmbeddingLayer(config.source_vocab_sizes,
                                               config.dim_per_factor)

    with tf.variable_scope("forward-stack"):
        self.forward_encoder = layers.GRUStack(
            input_size=config.embedding_size,
            state_size=config.state_size,
            batch_size=batch_size,
            use_layer_norm=config.use_layer_norm,
            nematus_compat=False,
            dropout_input=dropout_embedding,
            dropout_state=dropout_hidden,
            stack_depth=config.enc_depth,
            transition_depth=config.enc_recurrence_transition_depth,
            alternating=True,
            residual_connections=True,
            first_residual_output=1)

    with tf.variable_scope("backward-stack"):
        self.backward_encoder = layers.GRUStack(
            input_size=config.embedding_size,
            state_size=config.state_size,
            batch_size=batch_size,
            use_layer_norm=config.use_layer_norm,
            nematus_compat=False,
            dropout_input=dropout_embedding,
            dropout_state=dropout_hidden,
            stack_depth=config.enc_depth,
            transition_depth=config.enc_recurrence_transition_depth,
            alternating=True,
            reverse_alternation=True,
            residual_connections=True,
            first_residual_output=1)
def __init__(self, config, context, x_embs, x_mask, dropout_target,
             dropout_embedding, dropout_hidden, encoder_embedding_layer=None):
    self.dropout_target = dropout_target
    batch_size = tf.shape(x_mask)[1]

    with tf.variable_scope("initial_state_constructor"):
        # Mean of the encoder context over time, used to initialise the decoder.
        context_sum = tf.reduce_sum(context * tf.expand_dims(x_mask, axis=2),
                                    axis=0)
        context_mean = context_sum / tf.expand_dims(
            tf.reduce_sum(x_mask, axis=0), axis=1)
        self.init_state_layer = layers.FeedForwardLayer(
            in_size=config.state_size * 2,
            out_size=config.state_size,
            batch_size=batch_size,
            use_layer_norm=config.rnn_layer_normalization,
            dropout_input=dropout_hidden)
        self.init_state = self.init_state_layer.forward(context_mean)

    self.x_embs = x_embs
    self.translation_maxlen = config.translation_maxlen
    self.embedding_size = config.target_embedding_size
    self.state_size = config.state_size
    self.target_vocab_size = config.target_vocab_size

    with tf.variable_scope("embedding"):
        if encoder_embedding_layer is None:
            self.y_emb_layer = layers.EmbeddingLayer(
                vocabulary_sizes=[config.target_vocab_size],
                dim_per_factor=[config.target_embedding_size])
        else:
            self.y_emb_layer = encoder_embedding_layer

    with tf.variable_scope("base"):
        with tf.variable_scope("gru0"):
            if config.theano_compat:
                bias_type = layers.LegacyBiasType.THEANO_A
            else:
                bias_type = layers.LegacyBiasType.NEMATUS_COMPAT_FALSE
            self.grustep1 = layers.GRUStep(
                input_size=config.target_embedding_size,
                state_size=config.state_size,
                batch_size=batch_size,
                use_layer_norm=config.rnn_layer_normalization,
                legacy_bias_type=bias_type,
                dropout_input=dropout_embedding,
                dropout_state=dropout_hidden)
        with tf.variable_scope("attention"):
            self.attstep = layers.AttentionStep(
                context=context,
                context_state_size=2 * config.state_size,
                context_mask=x_mask,
                state_size=config.state_size,
                hidden_size=2 * config.state_size,
                use_layer_norm=config.rnn_layer_normalization,
                dropout_context=dropout_hidden,
                dropout_state=dropout_hidden)
        if config.theano_compat:
            bias_type = layers.LegacyBiasType.THEANO_B
        else:
            bias_type = layers.LegacyBiasType.NEMATUS_COMPAT_TRUE
        self.grustep2 = layers.DeepTransitionGRUStep(
            input_size=2 * config.state_size,
            state_size=config.state_size,
            batch_size=batch_size,
            use_layer_norm=config.rnn_layer_normalization,
            legacy_bias_type=bias_type,
            dropout_input=dropout_hidden,
            dropout_state=dropout_hidden,
            transition_depth=config.rnn_dec_base_transition_depth - 1,
            var_scope_fn=lambda i: "gru{0}".format(i + 1))

    with tf.variable_scope("high"):
        if config.rnn_dec_depth == 1:
            self.high_gru_stack = None
        else:
            if config.theano_compat:
                bias_type = layers.LegacyBiasType.THEANO_A
            else:
                bias_type = layers.LegacyBiasType.NEMATUS_COMPAT_TRUE
            self.high_gru_stack = layers.GRUStack(
                input_size=config.state_size,
                state_size=config.state_size,
                batch_size=batch_size,
                use_layer_norm=config.rnn_layer_normalization,
                legacy_bias_type=bias_type,
                dropout_input=dropout_hidden,
                dropout_state=dropout_hidden,
                stack_depth=config.rnn_dec_depth - 1,
                transition_depth=config.rnn_dec_high_transition_depth,
                context_state_size=(2 * config.state_size
                                    if config.rnn_dec_deep_context else 0),
                residual_connections=True,
                first_residual_output=0)

    if config.rnn_lexical_model:
        with tf.variable_scope("lexical"):
            self.lexical_layer = layers.LexicalModel(
                in_size=config.embedding_size,
                out_size=config.embedding_size,
                batch_size=batch_size,
                use_layer_norm=config.rnn_layer_normalization,
                dropout_embedding=dropout_embedding,
                dropout_hidden=dropout_hidden)
    else:
        self.lexical_layer = None

    with tf.variable_scope("next_word_predictor"):
        W = None
        if config.tie_decoder_embeddings:
            W = self.y_emb_layer.get_embeddings(factor=0)
            W = tf.transpose(W)
        self.predictor = Predictor(config, batch_size, dropout_embedding,
                                   dropout_hidden, hidden_to_logits_W=W)
def _build(self):
    # Input embedding
    h_enc = layers.EmbeddingLayer(input_size=self.input_dim,
                                  output_size=self.nb_heads * self.hidden_size,
                                  name='W_enc',
                                  dtype=self.dtype,
                                  pos_enc=self.pos_enc)(ids=self.input)
    # Dropout
    h_enc = self._dropout(h_enc)

    # Encoder
    encoder = [h_enc]
    for _ in range(self.nb_layers):
        # Multi-Head Attention
        encoder.append(
            self._add_multi_head_attention_layer(keys=encoder[-1],
                                                 queries=encoder[-1],
                                                 values=encoder[-1],
                                                 key_seq=self.input,
                                                 value_seq=self.input))

    # Output embedding
    h_dec = layers.EmbeddingLayer(input_size=self.output_dim,
                                  output_size=self.nb_heads * self.hidden_size,
                                  name='W_dec',
                                  dtype=self.dtype,
                                  pad_zero=True,
                                  pos_enc=self.pos_enc)(ids=self.shifted_gt)
    # Dropout
    h_dec = self._dropout(h_dec)

    # Decoder
    decoder = [h_dec]
    for k in range(self.nb_layers):
        # Self Attention (causal, so the decoder cannot attend to future tokens)
        h_dec = layers.SelfAttentionLayer(
            input_size=self.nb_heads * self.hidden_size,
            hidden_size=self.hidden_size,
            key_seq=self.gt,
            value_seq=self.gt,
            nb_heads=self.nb_heads,
            causality=True,
            dtype=self.dtype)(keys=decoder[-1],
                              queries=decoder[-1],
                              values=decoder[-1])
        # Dropout
        h_dec = self._dropout(h_dec)
        # Add & Norm
        h_dec = layers.LayerNorm(hidden_size=self.nb_heads * self.hidden_size,
                                 dtype=self.dtype)(x=h_dec + decoder[-1])
        # Multi-Head Attention over the encoder output
        decoder.append(
            self._add_multi_head_attention_layer(keys=encoder[k],
                                                 queries=h_dec,
                                                 values=encoder[k],
                                                 key_seq=self.input,
                                                 value_seq=self.gt))

    # Output linear transformation layer
    W_out = layers.EmbeddingLayer(input_size=self.output_dim,
                                  output_size=self.nb_heads * self.hidden_size,
                                  name='W_out',
                                  dtype=self.dtype)
    self._compute_scores(decoder[-1], W_out)
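# Hedged illustration (not part of the original source): the helper
# `_add_multi_head_attention_layer` above is assumed to wrap scaled dot-product
# attention in the style of "Attention Is All You Need". A minimal single-head
# version with an optional causal mask might look like the sketch below; all
# names are illustrative, not taken from the original code base.
import tensorflow as tf

def scaled_dot_product_attention(queries, keys, values, causal=False):
    """queries/keys/values: (batch, time, depth) tensors."""
    depth = tf.cast(tf.shape(keys)[-1], queries.dtype)
    # Similarity scores between every query and every key, scaled by sqrt(depth).
    scores = tf.matmul(queries, keys, transpose_b=True) / tf.sqrt(depth)
    if causal:
        # Lower-triangular mask: position t may only attend to positions <= t.
        length = tf.shape(scores)[-1]
        mask = tf.linalg.band_part(tf.ones((length, length), queries.dtype), -1, 0)
        scores += (1.0 - mask) * -1e9
    weights = tf.nn.softmax(scores, axis=-1)
    return tf.matmul(weights, values)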