def build_graph(self):
        """Build the ELMo-augmented reader and pointer-decoder graph.

        Steps, in order:
          1. Run ``self.elmo_bilm`` over the memory (context) and query token
             ids and mix the biLM layers with ``weight_layers``; the query
             reuses the context's mixing weights via ``reuse=True``.
          2. Look up word embeddings (on CPU) and concatenate the ELMo
             features onto the memory and query embeddings.
          3. Run ``self.opt.num_attn_hops`` rounds of BiLSTM encoding followed
             by BiDAF attention of the memory over the query.
          4. Add a residual self-attention block over the memory.
          5. Build a training decoder and a greedy prediction decoder that
             share variables under the ``'decode'`` scope.

        Sets ``self.train_final_lengths``, ``self.pred_final_lengths``,
        ``self.preds``, ``self.loss``, ``self.eval_loss`` and finally
        ``self.graph_built = True``. Returns nothing.
        """
        elmo_bilm = self.elmo_bilm

        context_elmo_embeddings_op = elmo_bilm(self.memory_elmo_token_ids)
        query_elmo_embeddings_op = elmo_bilm(self.query_elmo_token_ids)

        with tf.variable_scope("elmo_encodings_input"):
            elmo_context_input = weight_layers('input',
                                               context_elmo_embeddings_op,
                                               l2_coef=0.0)['weighted_op']

            # Trim the ELMo output to the time dimension of the word ids so
            # both can be concatenated below.
            context_len = tf.shape(self.memory_vectors)[1]
            elmo_context_input = elmo_context_input[:, :context_len]

        # Same scope with reuse=True: the query shares the layer-mixing
        # weights created for the context above.
        with tf.variable_scope("elmo_encodings_input", reuse=True):
            elmo_query_input = weight_layers('input',
                                             query_elmo_embeddings_op,
                                             l2_coef=0.0)['weighted_op']

            query_len = tf.shape(self.encoder_inputs)[1]
            elmo_query_input = elmo_query_input[:, :query_len]

        print("ELMo shapes:")
        print(elmo_context_input.get_shape().as_list())
        print(elmo_query_input.get_shape().as_list())

        with tf.device("/cpu:0"):
            with tf.variable_scope("embedding"):
                embedded_input_seq = tf.nn.embedding_lookup(
                    self.emb, self.encoder_inputs)
                embedded_dec_input_seq = tf.nn.embedding_lookup(
                    self.emb, self.decoder_inputs)
                # NOTE(review): not used further down in this method.
                embedded_dec_target_seq = tf.nn.embedding_lookup(
                    self.emb, self.decoder_targets)
                embedded_memory_vectors = tf.nn.embedding_lookup(
                    self.emb, self.memory_vectors)

        enc_hidden_sz = self.opt.hidden_size_encoder
        enc_num_layers = self.opt.num_layers_encoder

        # add elmo
        embedded_memory_vectors = tf.concat(
            [embedded_memory_vectors, elmo_context_input], -1)
        embedded_input_seq = tf.concat([embedded_input_seq, elmo_query_input],
                                       -1)

        mem_rep = embedded_memory_vectors

        print(mem_rep.get_shape().as_list())

        for i in range(self.opt.num_attn_hops):
            with tf.variable_scope("attn_layer_%d" % i):
                with tf.variable_scope("mem_encoder"):
                    mem_rep, _ = bi_cudnn_rnn_encoder(
                        'lstm', enc_hidden_sz, enc_num_layers,
                        self.opt.dropout_rate, mem_rep,
                        self.memory_vector_lengths, self.is_training)

                # The question is re-encoded from its embeddings in every hop
                # (each hop has its own "ques_encoder" variables).
                with tf.variable_scope("ques_encoder"):
                    ques_inp, _ = bi_cudnn_rnn_encoder(
                        'lstm', enc_hidden_sz, enc_num_layers,
                        self.opt.dropout_rate, embedded_input_seq,
                        self.encoder_input_lengths, self.is_training)

                # attend
                mem_rep = bidaf_attention(mem_rep, ques_inp,
                                          self.memory_vector_lengths,
                                          self.encoder_input_lengths,
                                          tri_linear_attention)

                print(mem_rep.get_shape().as_list())

        with tf.variable_scope("res_self_attn"):
            units = mem_rep.get_shape().as_list()[-1]
            print(units)

            mem_proj = tf.layers.dense(inputs=mem_rep,
                                       units=units,
                                       activation=tf.nn.relu,
                                       name="self_attn_input_proj")

            print(mem_proj.get_shape().as_list())

            with tf.variable_scope("input_proj"):
                self_attn_mem_input, _ = bi_cudnn_rnn_encoder(
                    'lstm', enc_hidden_sz, enc_num_layers,
                    self.opt.dropout_rate, mem_proj,
                    self.memory_vector_lengths, self.is_training)

            self_attn_mem = self_attention_encoder(
                x=self_attn_mem_input,
                sim_func=tri_linear_attention,
                mask=self.memory_vector_lengths,
                merge_function=concat_with_product)

            print(self_attn_mem.get_shape().as_list())

            with tf.variable_scope("output_proj"):
                # Integer division: a hidden size must be an int. Under
                # Python 3 ``units / 2`` would be a float.
                # Presumably half of ``units`` per direction so the
                # bidirectional output matches mem_rep for the residual
                # sum below -- TODO confirm against bi_cudnn_rnn_encoder.
                self_attn_output_proj, _ = bi_cudnn_rnn_encoder(
                    'lstm', units // 2, enc_num_layers, self.opt.dropout_rate,
                    self_attn_mem, self.memory_vector_lengths,
                    self.is_training)

            # Residual connection around the self-attention block.
            mem_rep = mem_rep + self_attn_output_proj

            print(mem_rep.get_shape().as_list())

        batch_size = self.opt.batch_size
        sos_id = self.vocab.start_token_id
        eos_id = self.vocab.end_token_id  # unused while the hack below stands

        # NOTE(review): the decoder hidden size is read from
        # ``hidden_size_encoder``; confirm this is intentional.
        dec_hidden_sz = self.opt.hidden_size_encoder
        dec_num_layers = self.opt.num_layers_decoder

        train_helper = tf.contrib.seq2seq.TrainingHelper(
            embedded_dec_input_seq, self.decoder_input_lengths)

        pred_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            self.emb, start_tokens=tf.fill([batch_size], sos_id),
            end_token=-1)  # XXX hack here to allow correct loss #eos_id)

        def decode(helper, scope, reuse=None):
            """Build one attention/pointer decoder driven by ``helper``.

            Returns the tuple produced by ``dynamic_decode``.
            """
            with tf.variable_scope(scope, reuse=reuse):
                attention_over_context = tf.contrib.seq2seq.BahdanauAttention(
                    num_units=self.opt.decoder_attn_size,
                    memory=mem_rep,
                    memory_sequence_length=self.memory_vector_lengths)

                decoder_cell = create_multi_rnn('basic_lstm', dec_hidden_sz,
                                                dec_num_layers, self.keep_prob)

                projection_layer = layers_core.Dense(self.vocab.size(),
                                                     use_bias=True,
                                                     name='output_projection')

                decoder_cell = AttnPointerWrapper(
                    cell=decoder_cell,
                    attention_mechanism=attention_over_context,
                    output_layer=projection_layer,
                    max_oovs=self.opt.max_oovs,
                    batch_size=self.opt.batch_size,
                    memory_full_vocab=self.memory_vectors_full_vocab,
                    # Integer division keeps the layer size an int on
                    # Python 3 as well.
                    attention_layer_size=self.opt.decoder_attn_size // 2,
                    alignment_history=True,
                    output_combined_distribution=True,
                    unk_id=self.vocab.unk_token_id)

                decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell=decoder_cell,
                    helper=helper,
                    initial_state=decoder_cell.zero_state(
                        batch_size=self.opt.batch_size, dtype=tf.float32))

                outputs = tf.contrib.seq2seq.dynamic_decode(
                    decoder=decoder,
                    output_time_major=False,
                    maximum_iterations=self.max_decoder_length)

                return outputs

        # Both decoders live in the 'decode' scope; the second call reuses
        # the variables created by the first.
        train_outputs, train_state, self.train_final_lengths = decode(
            train_helper, 'decode')
        pred_outputs, pred_state, self.pred_final_lengths = decode(pred_helper,
                                                                   'decode',
                                                                   reuse=True)

        # The per-step distributions are read from the wrapper state's
        # ``final_dist_history``; the transpose swaps the first two axes
        # (presumably time-major -> batch-major; confirm in AttnPointerWrapper).
        train_logits = tf.transpose(train_state.final_dist_history.stack(),
                                    [1, 0, 2])
        pred_logits = tf.transpose(pred_state.final_dist_history.stack(),
                                   [1, 0, 2])

        self.preds = tf.argmax(pred_logits, axis=2)

        output_mask = tf.sequence_mask(self.decoder_target_lengths,
                                       dtype=tf.float32,
                                       maxlen=self.max_decoder_length)

        self.loss = tf.contrib.seq2seq.sequence_loss(
            logits=train_logits,
            targets=self.decoder_targets,
            weights=output_mask,
            softmax_loss_function=sparse_cross_entropy_with_probs)

        self.eval_loss = tf.contrib.seq2seq.sequence_loss(
            logits=pred_logits,
            targets=self.decoder_targets,
            weights=output_mask,
            softmax_loss_function=sparse_cross_entropy_with_probs)

        self.graph_built = True
Exemple #2
0
def model_fn(features, labels, mode, params):
    """Estimator ``model_fn`` for a char-LSTM + word-embedding CRF tagger.

    The character-LSTM word vectors and the word-level BiLSTM output are
    stacked as two "layers" and mixed with ``weight_layers`` before the CRF.

    Args:
        features: ``((words, nwords), (chars, nchars))`` or, for serving, a
            dict with the keys 'words', 'nwords', 'chars' and 'nchars'.
        labels: tag strings; only looked up when ``mode`` is not PREDICT.
        mode: a ``tf.estimator.ModeKeys`` value.
        params: dict read for 'dropout', 'words', 'chars', 'tags',
            'num_oov_buckets', 'dim_chars', 'char_lstm_size', 'glove',
            'dim' and 'lstm_size'.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for PREDICT, EVAL or TRAIN.
    """
    # Serving hands over a dict; bring it into the nested-tuple layout.
    if isinstance(features, dict):
        features = ((features['words'], features['nwords']),
                    (features['chars'], features['nchars']))

    # Inputs and vocabularies
    word_part, char_part = features
    words, nwords = word_part
    chars, nchars = char_part
    drop_rate = params['dropout']
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    oov_buckets = params['num_oov_buckets']
    word_table = tf.contrib.lookup.index_table_from_file(
        params['words'], num_oov_buckets=oov_buckets)
    char_table = tf.contrib.lookup.index_table_from_file(
        params['chars'], num_oov_buckets=oov_buckets)
    # Line numbers of every tag other than 'O'; used as positive classes
    # by the precision / recall / f1 helpers below.
    positive_tag_ids = []
    with Path(params['tags']).open() as tag_file:
        for tag_id, tag_line in enumerate(tag_file):
            if tag_line.strip() != 'O':
                positive_tag_ids.append(tag_id)
    num_tags = len(positive_tag_ids) + 1
    with Path(params['chars']).open() as char_file:
        num_chars = len(char_file.readlines()) + oov_buckets

    # Character embeddings
    char_ids = char_table.lookup(chars)
    char_matrix = tf.get_variable('chars_embeddings',
                                  [num_chars, params['dim_chars']],
                                  tf.float32)
    char_vectors = tf.layers.dropout(
        tf.nn.embedding_lookup(char_matrix, char_ids),
        rate=drop_rate,
        training=is_training)

    # Character BiLSTM: fold the word axis into the batch axis, run
    # time-major, and keep only the final hidden states.
    words_per_sentence = tf.shape(char_vectors)[1]
    chars_per_word = tf.shape(char_vectors)[2]
    per_word = tf.reshape(char_vectors,
                          [-1, chars_per_word, params['dim_chars']])
    per_word_tm = tf.transpose(per_word, perm=[1, 0, 2])
    char_fw_cell = tf.contrib.rnn.LSTMBlockFusedCell(params['char_lstm_size'])
    char_bw_cell = tf.contrib.rnn.TimeReversedFusedRNN(
        tf.contrib.rnn.LSTMBlockFusedCell(params['char_lstm_size']))
    _, (_, char_fw_last) = char_fw_cell(
        per_word_tm,
        dtype=tf.float32,
        sequence_length=tf.reshape(nchars, [-1]))
    _, (_, char_bw_last) = char_bw_cell(
        per_word_tm,
        dtype=tf.float32,
        sequence_length=tf.reshape(nchars, [-1]))
    char_last = tf.concat([char_fw_last, char_bw_last], axis=-1)
    char_embeddings = tf.reshape(
        char_last, [-1, words_per_sentence, 2 * params['char_lstm_size']])

    # Word embeddings: frozen pre-trained matrix plus one extra zero row.
    word_ids = word_table.lookup(words)
    glove = np.load(params['glove'])['embeddings']  # np.array
    word_matrix = tf.Variable(np.vstack([glove, [[0.] * params['dim']]]),
                              dtype=tf.float32,
                              trainable=False)
    word_embeddings = tf.nn.embedding_lookup(word_matrix, word_ids)

    # Word + char features
    token_inputs = tf.layers.dropout(
        tf.concat([word_embeddings, char_embeddings], axis=-1),
        rate=drop_rate,
        training=is_training)

    # Word-level BiLSTM (the fused cells need time-major input)
    token_inputs_tm = tf.transpose(token_inputs, perm=[1, 0, 2])
    word_fw_cell = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
    word_bw_cell = tf.contrib.rnn.TimeReversedFusedRNN(
        tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size']))
    word_fw_out, _ = word_fw_cell(token_inputs_tm,
                                  dtype=tf.float32,
                                  sequence_length=nwords)
    word_bw_out, _ = word_bw_cell(token_inputs_tm,
                                  dtype=tf.float32,
                                  sequence_length=nwords)
    lstm_output = tf.transpose(
        tf.concat([word_fw_out, word_bw_out], axis=-1), perm=[1, 0, 2])

    # Stack the two representations on a new axis 1, in the layout that
    # weight_layers reads from the 'lm_embeddings' entry.
    stacked_layers = [char_embeddings, lstm_output]
    lm_embeddings = tf.concat(
        [tf.expand_dims(layer, axis=1) for layer in stacked_layers], axis=1)

    bilm_ops = {
        'lm_embeddings': lm_embeddings,
        'mask': tf.sequence_mask(nwords),
    }

    mixed = weight_layers('elmo_input1',
                          bilm_ops,
                          l2_coef=1.0,
                          do_layer_norm=True,
                          use_top_only=False)

    crf_inputs = tf.layers.dropout(mixed['weighted_op'],
                                   rate=drop_rate,
                                   training=is_training)

    # CRF
    logits = tf.layers.dense(crf_inputs, num_tags)
    crf_params = tf.get_variable("crf", [num_tags, num_tags], dtype=tf.float32)
    pred_ids, _ = tf.contrib.crf.crf_decode(logits, crf_params, nwords)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Map predicted ids back to tag strings.
        id_to_tag = tf.contrib.lookup.index_to_string_table_from_file(
            params['tags'])
        predictions = {
            'pred_ids': pred_ids,
            'tags': id_to_tag.lookup(tf.to_int64(pred_ids)),
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # Loss: mean negative CRF log-likelihood of the gold tags.
    tag_table = tf.contrib.lookup.index_table_from_file(params['tags'])
    tags = tag_table.lookup(labels)
    log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
        logits, tags, nwords, crf_params)
    loss = tf.reduce_mean(-log_likelihood)

    # Metrics, restricted to real (non-padding) positions.
    token_mask = tf.sequence_mask(nwords)
    metrics = {
        'acc': tf.metrics.accuracy(tags, pred_ids, token_mask),
        'precision': precision(tags, pred_ids, num_tags, positive_tag_ids,
                               token_mask),
        'recall': recall(tags, pred_ids, num_tags, positive_tag_ids,
                         token_mask),
        'f1': f1(tags, pred_ids, num_tags, positive_tag_ids, token_mask),
    }
    for metric_name, metric_ops in metrics.items():
        tf.summary.scalar(metric_name, metric_ops[1])

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode,
                                          loss=loss,
                                          eval_metric_ops=metrics)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = tf.train.AdamOptimizer().minimize(
            loss, global_step=tf.train.get_or_create_global_step())
        return tf.estimator.EstimatorSpec(mode,
                                          loss=loss,
                                          train_op=train_op)
Exemple #3
0
    def _check_weighted_layer(self, l2_coef, do_layer_norm, use_top_only):
        """Compare ``weight_layers`` output against a NumPy re-computation.

        Two weighted layers (named '0' and '1') are built on the fixture biLM
        and their variables are set to known values. The check covers the
        number of trainable variables, the number of regularization losses,
        the returned mask, and the weighted output of each layer.
        """
        # Batcher for the fixture vocabulary
        batcher = Batcher(os.path.join(FIXTURES, 'vocab_test.txt'), 50)

        # biLM built from the fixture options / weights
        options_file = os.path.join(FIXTURES, 'options.json')
        weight_file = os.path.join(FIXTURES, 'lm_weights.hdf5')
        character_ids = tf.placeholder('int32', (None, None, 50))
        model = BidirectionalLanguageModel(
            options_file, weight_file, max_batch_size=4)
        bilm_ops = model(character_ids)

        weighted_ops = [
            weight_layers(str(k), bilm_ops, l2_coef=l2_coef,
                          do_layer_norm=do_layer_norm,
                          use_top_only=use_top_only)
            for k in range(2)
        ]

        self.sess.run(tf.global_variables_initializer())

        # Each weighted layer owns a gamma, plus a W unless use_top_only.
        has_layer_weights = int(not use_top_only)
        self.assertEqual(len(tf.trainable_variables()),
                         2 * (1 + has_layer_weights))
        # One regularizer per weighted layer, only when W exists.
        self.assertEqual(
            len(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)),
            2 * has_layer_weights,
        )

        # Known values for (W, gamma) of each weighted layer.
        weights = [[np.array([0.1, 0.3, 0.5]), np.array([1.1])],
                   [np.array([0.2, 0.4, 0.6]), np.array([0.88])]]
        for k, (layer_w, gamma_value) in enumerate(weights):
            with tf.variable_scope('', reuse=True):
                if not use_top_only:
                    w_var = tf.get_variable('{}_ELMo_W'.format(k))
                    self.sess.run([w_var.assign(layer_w)])
                gamma_var = tf.get_variable('{}_ELMo_gamma'.format(k))
                self.sess.run([gamma_var.assign(gamma_value)])

        # A small batch with sentences of different lengths
        sentences = [
            ['The', 'first', 'sentence', '.'],
            ['The', 'second'],
            ['Third']
        ]
        X_chars = batcher.batch_sentences(sentences)

        lm_ops = model(character_ids)
        fetches = [lm_ops['lm_embeddings'], lm_ops['mask']]
        fetches.extend(op['weighted_op'] for op in weighted_ops)
        lm_embeddings, mask, weighted0, weighted1 = self.sess.run(
            fetches, feed_dict={character_ids: X_chars})

        # The mask must mark exactly the real tokens.
        expected_mask = [[True, True, True, True],
                         [True, True, False, False],
                         [True, False, False, False]]
        self.assertTrue((expected_mask == mask).all())

        # Recompute each weighted layer with NumPy and compare.
        for (layer_w, gamma_value), actual in zip(weights,
                                                  (weighted0, weighted1)):
            expected_elmo = np.zeros((3, 4, lm_embeddings.shape[-1]))
            if use_top_only:
                expected_elmo += lm_embeddings[:, -1, :, :]
            else:
                # softmax over (W + 1/3)
                shifted = np.exp(layer_w + 1.0 / 3)
                normed_weights = shifted / np.sum(shifted)
                for j in range(3):  # number of LM layers
                    layer = lm_embeddings[:, j, :, :]
                    if do_layer_norm:
                        # masked layer normalization: statistics come from
                        # the unmasked positions only
                        mean = np.mean(layer[mask])
                        std = np.std(layer[mask])
                        layer = (layer - mean) / (std + 1E-12)
                    expected_elmo += normed_weights[j] * layer

            # the scale parameter
            expected_elmo *= gamma_value
            self.assertTrue(
                np.allclose(expected_elmo, actual, atol=1e-6)
            )
Exemple #4
0
# Script fragment: builds the biLM once and derives four ELMo tensors from it
# (context/question x input/output). `context_character_ids`, `options_file`
# and `weight_file` are not defined in this fragment and must already exist.

# Character-id placeholder for the question; the first two dimensions are
# left open and the last is fixed at 50.
question_character_ids = tf.placeholder('int32', shape=(None, None, 50))

# Build the biLM graph.
bilm = BidirectionalLanguageModel(options_file, weight_file)

# Get ops to compute the LM embeddings.
context_embeddings_op = bilm(context_character_ids)
question_embeddings_op = bilm(question_character_ids)

# Get an op to compute ELMo (weighted average of the internal biLM layers)
# Our SQuAD model includes ELMo at both the input and output layers
# of the task GRU, so we need 4x ELMo representations for the question
# and context at each of the input and output.
# We use the same ELMo weights for both the question and context
# at each of the input and output.
# The context call comes first so that it creates the 'input' weights.
elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)
with tf.variable_scope('', reuse=True):
    # the reuse=True scope reuses weights from the context for the question
    elmo_question_input = weight_layers(
        'input', question_embeddings_op, l2_coef=0.0
    )

# Same pattern for the 'output' weights: create with the context, then
# reuse for the question.
elmo_context_output = weight_layers(
    'output', context_embeddings_op, l2_coef=0.0
)
with tf.variable_scope('', reuse=True):
    # the reuse=True scope reuses weights from the context for the question
    elmo_question_output = weight_layers(
        'output', question_embeddings_op, l2_coef=0.0
    )
Exemple #5
0
def model_fn(features, labels, mode, params):
    """Estimator ``model_fn`` for a two-branch (char-LSTM / char-CNN) CRF tagger.

    Two word-level BiLSTMs are run, one over word + char-LSTM features and
    one over word + char-CNN features. Their outputs are stacked as two
    "layers", mixed with ``weight_layers`` and decoded with a CRF.

    Args:
        features: ``((words, nwords), (chars, nchars))`` or, for serving, a
            dict with the keys 'words', 'nwords', 'chars' and 'nchars'.
        labels: tag strings; only looked up when ``mode`` is not PREDICT.
        mode: a ``tf.estimator.ModeKeys`` value.
        params: dict read for 'dropout', 'words', 'chars', 'tags',
            'num_oov_buckets', 'dim_chars', 'char_lstm_size', 'filters',
            'kernel_size', 'glove', 'dim', 'lstm_size_lstm' and
            'lstm_size_cnn'. Because the two BiLSTM outputs are stacked on a
            new axis, 'lstm_size_lstm' and 'lstm_size_cnn' must be equal.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for PREDICT, EVAL or TRAIN.
    """
    # For serving features are a bit different
    if isinstance(features, dict):
        features = ((features['words'], features['nwords']),
                    (features['chars'], features['nchars']))

    # Read vocabs and inputs
    (words, nwords), (chars, nchars) = features

    dropout = params['dropout']
    training = (mode == tf.estimator.ModeKeys.TRAIN)
    vocab_words = tf.contrib.lookup.index_table_from_file(
        params['words'], num_oov_buckets=params['num_oov_buckets'])
    vocab_chars = tf.contrib.lookup.index_table_from_file(
        params['chars'], num_oov_buckets=params['num_oov_buckets'])
    with Path(params['tags']).open() as f:
        # Line numbers of every tag other than 'O' (the positive classes).
        indices = [idx for idx, tag in enumerate(f) if tag.strip() != 'O']
        num_tags = len(indices) + 1
    with Path(params['chars']).open() as f:
        num_chars = sum(1 for _ in f) + params['num_oov_buckets']

    # Char Embeddings
    # Map each character to its vocabulary index.
    char_ids = vocab_chars.lookup(chars)
    # Trainable table with one row of size params['dim_chars'] per char id.
    variable = tf.get_variable('chars_embeddings',
                               [num_chars, params['dim_chars']],
                               tf.float32)
    # Each id selects the matching row of the table, which appends one
    # trailing dimension of size params['dim_chars'].
    char_embeddings = tf.nn.embedding_lookup(variable, char_ids)
    # Dropout rate comes from params['dropout']; active only in training.
    char_embeddings = tf.layers.dropout(char_embeddings,
                                        rate=dropout,
                                        training=training)

    # Char LSTM
    # Assumes `chars` is [batch, max words, max chars] -- TODO confirm with
    # the input_fn. Then axis 1 is the number of words per sentence and
    # axis 2 the number of characters per word.
    dim_words = tf.shape(char_embeddings)[1]
    dim_chars = tf.shape(char_embeddings)[2]

    # Fold the word axis into the batch axis: one character sequence per row.
    flat = tf.reshape(char_embeddings, [-1, dim_chars, params['dim_chars']])
    # The fused LSTM cells expect time-major input.
    t = tf.transpose(flat, perm=[1, 0, 2])
    lstm_cell_fw = tf.contrib.rnn.LSTMBlockFusedCell(params['char_lstm_size'])
    lstm_cell_bw = tf.contrib.rnn.LSTMBlockFusedCell(params['char_lstm_size'])
    lstm_cell_bw = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw)
    # Only the final hidden state of each direction is kept.
    _, (_, output_fw) = lstm_cell_fw(t,
                                     dtype=tf.float32,
                                     sequence_length=tf.reshape(nchars, [-1]))
    _, (_, output_bw) = lstm_cell_bw(t,
                                     dtype=tf.float32,
                                     sequence_length=tf.reshape(nchars, [-1]))
    # Concatenate both directions on the feature axis.
    output = tf.concat([output_fw, output_bw], axis=-1)

    # Unfold back to one vector per word.
    char_embeddings_lstm = tf.reshape(
        output, [-1, dim_words, params['char_lstm_size'] * 2])

    # Char 1d convolution
    weights = tf.sequence_mask(nchars)
    char_embeddings_cnn = masked_conv1d_and_max(char_embeddings, weights,
                                                params['filters'],
                                                params['kernel_size'])

    # Word Embeddings
    # Map each word to its vocabulary index.
    word_ids = vocab_words.lookup(words)
    glove = np.load(params['glove'])['embeddings']  # np.array
    # Append one all-zero row below the pre-trained matrix (vstack stacks
    # along axis 0).
    variable = np.vstack([glove, [[0.] * params['dim']]])
    variable = tf.Variable(variable, dtype=tf.float32, trainable=False)
    word_embeddings = tf.nn.embedding_lookup(variable, word_ids)

    # Concatenate Word and Char (LSTM) Embeddings on the feature axis
    embeddings = tf.concat([word_embeddings, char_embeddings_lstm], axis=-1)
    embeddings = tf.layers.dropout(embeddings, rate=dropout,
                                   training=training)

    # BiLSTM over the char-LSTM branch
    t = tf.transpose(embeddings, perm=[1, 0, 2])  # Need time-major
    lstm_cell_fw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size_lstm'])
    lstm_cell_bw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size_lstm'])
    lstm_cell_bw = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw)
    output_fw, _ = lstm_cell_fw(t, dtype=tf.float32, sequence_length=nwords)
    output_bw, _ = lstm_cell_bw(t, dtype=tf.float32, sequence_length=nwords)
    output = tf.concat([output_fw, output_bw], axis=-1)
    output = tf.transpose(output, perm=[1, 0, 2])  # back to batch-major

    # Concatenate Word and Char (CNN) Embeddings on the feature axis
    embeddings2 = tf.concat([word_embeddings, char_embeddings_cnn], axis=-1)
    # Fix: dropout is applied to the CNN-branch features. Previously
    # `embeddings` was passed here, which discarded the CNN features.
    embeddings2 = tf.layers.dropout(embeddings2,
                                    rate=dropout,
                                    training=training)

    # BiLSTM over the char-CNN branch
    t2 = tf.transpose(embeddings2, perm=[1, 0, 2])  # Need time-major
    lstm_cell_fw2 = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size_cnn'])
    lstm_cell_bw2 = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size_cnn'])
    lstm_cell_bw2 = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw2)
    # Fix: use this branch's own cells. Previously the first branch's cells
    # were called again, so the cells created just above were never used.
    output_fw2, _ = lstm_cell_fw2(t2, dtype=tf.float32, sequence_length=nwords)
    output_bw2, _ = lstm_cell_bw2(t2, dtype=tf.float32, sequence_length=nwords)
    output2 = tf.concat([output_fw2, output_bw2], axis=-1)
    output2 = tf.transpose(output2, perm=[1, 0, 2])  # back to batch-major

    # Stack both branch outputs on a new axis 1. This needs equal feature
    # sizes, i.e. params['lstm_size_lstm'] == params['lstm_size_cnn'].
    layers = [output, output2]
    lm_embeddings = tf.concat(
        [tf.expand_dims(layer, axis=1) for layer in layers], axis=1)

    weights = tf.sequence_mask(nwords)

    bilm_ops = {'lm_embeddings': lm_embeddings, 'mask': weights}

    weight_sum = weight_layers('elmo_input',
                               bilm_ops,
                               l2_coef=1.0,
                               do_layer_norm=True,
                               use_top_only=False)

    output = tf.layers.dropout(weight_sum['weighted_op'],
                               rate=dropout,
                               training=training)

    # CRF
    # Dense projection to one unary score per tag at every position.
    logits = tf.layers.dense(output, num_tags)
    # num_tags x num_tags CRF parameter matrix.
    crf_params = tf.get_variable("crf", [num_tags, num_tags],
                                 dtype=tf.float32)
    # Highest-scoring tag index per position, given the sequence lengths.
    pred_ids, _ = tf.contrib.crf.crf_decode(logits, crf_params, nwords)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Predictions: map tag indices back to tag strings.
        reverse_vocab_tags = tf.contrib.lookup.index_to_string_table_from_file(
            params['tags'])
        pred_strings = reverse_vocab_tags.lookup(tf.to_int64(pred_ids))
        predictions = {'pred_ids': pred_ids, 'tags': pred_strings}
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    else:
        # Loss
        vocab_tags = tf.contrib.lookup.index_table_from_file(params['tags'])
        tags = vocab_tags.lookup(labels)  # replace labels by their indices
        # Log-likelihood of the gold tag sequences.
        log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
            logits, tags, nwords, crf_params)
        loss = tf.reduce_mean(-log_likelihood)

        # Metrics
        # Boolean mask that is True on real tokens and False on padding.
        weights = tf.sequence_mask(nwords)
        metrics = {
            'acc': tf.metrics.accuracy(tags, pred_ids, weights),
            # `indices` lists the positive (non-'O') classes.
            'precision': precision(tags, pred_ids, num_tags, indices, weights),
            'recall': recall(tags, pred_ids, num_tags, indices, weights),
            'f1': f1(tags, pred_ids, num_tags, indices, weights),
        }
        for metric_name, op in metrics.items():
            # For TensorBoard. tf.metrics.accuracy returns (value, update_op),
            # so op[1] is the update op; the project metric helpers are
            # presumably shaped the same way -- TODO confirm.
            tf.summary.scalar(metric_name, op[1])

        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(mode,
                                              loss=loss,
                                              eval_metric_ops=metrics)

        elif mode == tf.estimator.ModeKeys.TRAIN:
            # Adam step on the loss; also increments the global step.
            train_op = tf.train.AdamOptimizer(learning_rate=0.001).minimize(
                loss, global_step=tf.train.get_or_create_global_step())
            return tf.estimator.EstimatorSpec(mode,
                                              loss=loss,
                                              train_op=train_op)