Example #1
    def __init__(self,
                 num_tokens,
                 embeddings,
                 embeddings_size,
                 train_embeddings,
                 dropout_input,
                 rnn_hidden_size,
                 id2token,
                 token2id,
                 id2label,
                 label2id,
                 mode='teacher',
                 vocab_proj_dim=None):
        super(eVSNLI_net, self).__init__()

        self.mode = mode
        assert mode in ('teacher', 'forloop')

        self.num_tokens = num_tokens

        self.lstm_cell = DropoutWrapper(
            tf.nn.rnn_cell.LSTMCell(rnn_hidden_size),
            input_keep_prob=dropout_input,
            output_keep_prob=dropout_input)

        if embeddings is not None:
            self.embedding_matrix = tf.get_variable(
                "embedding_matrix",
                shape=(num_tokens, embeddings_size),
                initializer=glove_embeddings_initializer(embeddings),
                trainable=train_embeddings)
            print("Loaded GloVe embeddings!")
        else:
            self.embedding_matrix = tf.get_variable(
                "embedding_matrix",
                shape=(num_tokens, embeddings_size),
                initializer=tf.random_normal_initializer(stddev=0.05),
                trainable=train_embeddings)

        # vocab_proj_dim: optional dimensionality for the decoder's vocabulary projection
        #self.decoder = RNN_Decoder(embeddings_size, rnn_hidden_size, num_tokens, vocab_proj_dim)
        self.decoder = RNN_Decoder(self.embedding_matrix, rnn_hidden_size,
                                   num_tokens, vocab_proj_dim)

        keys = list(token2id.keys())
        values = [token2id[k] for k in keys]
        self.token2id_table = tf.contrib.lookup.HashTable(
            tf.contrib.lookup.KeyValueTensorInitializer(keys,
                                                        values,
                                                        key_dtype=tf.string,
                                                        value_dtype=tf.int64),
            -1)

        mapping_token = tf.constant(list(id2token.values()), dtype=tf.string)
        self.id2token_table = tf.contrib.lookup.index_to_string_table_from_tensor(
            mapping_token, default_value="#unk#", name=None)

        mapping_label = tf.constant(list(id2label.values()), dtype=tf.string)
        self.id2label_table = tf.contrib.lookup.index_to_string_table_from_tensor(
            mapping_label, default_value="#unk#", name=None)
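
The builders in these examples rely on a few project helpers that are not shown, in particular glove_embeddings_initializer. A minimal sketch of what it presumably does, assuming `embeddings` is a NumPy array of pre-trained GloVe vectors with shape (num_tokens, embeddings_size); the real helper may differ:

import numpy as np
import tensorflow as tf


def glove_embeddings_initializer(embeddings):
    # Hypothetical implementation: wrap the pre-trained matrix so that
    # tf.get_variable initializes the embedding variable with it.
    embeddings = np.asarray(embeddings, dtype=np.float32)

    def _initializer(shape, dtype=tf.float32, partition_info=None):
        # The requested variable shape must match the pre-trained matrix.
        assert tuple(shape) == embeddings.shape
        return tf.constant(embeddings, dtype=dtype)

    return _initializer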
Example #2
def build_simple_ic_model(sentence_input, img_features_input, dropout_input,
                          num_tokens, num_labels, embeddings, embeddings_size,
                          train_embeddings, rnn_hidden_size,
                          multimodal_fusion_hidden_size,
                          classification_hidden_size):
    sentence_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(sentence_input,
                             tf.zeros_like(sentence_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)
    if embeddings is not None:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=glove_embeddings_initializer(embeddings),
            trainable=train_embeddings)
        print("Loaded GloVe embeddings!")
    else:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=tf.random_normal_initializer(stddev=0.05),
            trainable=train_embeddings)
    sentence_embeddings = tf.nn.embedding_lookup(embedding_matrix,
                                                 sentence_input)
    lstm_cell = DropoutWrapper(tf.nn.rnn_cell.LSTMCell(rnn_hidden_size),
                               input_keep_prob=dropout_input,
                               output_keep_prob=dropout_input)
    sentence_outputs, sentence_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=sentence_embeddings,
        sequence_length=sentence_length,
        dtype=tf.float32)
    normalized_img_features = tf.nn.l2_normalize(img_features_input, dim=1)
    gated_sentence_hidden_layer = tf.nn.dropout(gated_tanh(
        sentence_final_states.h, multimodal_fusion_hidden_size),
                                                keep_prob=dropout_input)
    gated_img_hidden_layer = tf.nn.dropout(gated_tanh(
        normalized_img_features, multimodal_fusion_hidden_size),
                                           keep_prob=dropout_input)
    sentence_img_multimodal_fusion = tf.multiply(gated_sentence_hidden_layer,
                                                 gated_img_hidden_layer)
    gated_first_layer = tf.nn.dropout(gated_tanh(
        sentence_img_multimodal_fusion, classification_hidden_size),
                                      keep_prob=dropout_input)
    gated_second_layer = tf.nn.dropout(gated_tanh(gated_first_layer,
                                                  classification_hidden_size),
                                       keep_prob=dropout_input)
    gated_third_layer = tf.nn.dropout(gated_tanh(gated_second_layer,
                                                 classification_hidden_size),
                                      keep_prob=dropout_input)

    return tf.contrib.layers.fully_connected(gated_third_layer,
                                             num_labels,
                                             activation_fn=None)
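
The sentence_length expression above (and the analogous premise/hypothesis lengths in the later examples) simply counts the non-padding, i.e. non-zero, token ids in each row. A toy NumPy illustration of the same computation:

import numpy as np

# Two padded sequences of token ids; 0 is assumed to be the padding id.
batch = np.array([[5, 2, 9, 0, 0],
                  [7, 0, 0, 0, 0]], dtype=np.int32)

# Equivalent to tf.reduce_sum(tf.cast(tf.not_equal(batch, 0), tf.int64), 1):
lengths = (batch != 0).sum(axis=1)  # -> array([3, 1])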
def build_simple_te_model_h(premise_input, hypothesis_input, dropout_input,
                            num_tokens, num_labels, embeddings,
                            embeddings_size, train_embeddings, rnn_hidden_size,
                            classification_hidden_size):
    hypothesis_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(hypothesis_input,
                             tf.zeros_like(hypothesis_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)
    if embeddings is not None:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=glove_embeddings_initializer(embeddings),
            trainable=train_embeddings)
        print("Loaded GloVe embeddings!")
    else:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=tf.random_normal_initializer(stddev=0.05),
            trainable=train_embeddings)
    hypothesis_embeddings = tf.nn.embedding_lookup(embedding_matrix,
                                                   hypothesis_input)
    lstm_cell = DropoutWrapper(tf.nn.rnn_cell.LSTMCell(rnn_hidden_size),
                               input_keep_prob=dropout_input,
                               output_keep_prob=dropout_input)
    hypothesis_outputs, hypothesis_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=hypothesis_embeddings,
        sequence_length=hypothesis_length,
        dtype=tf.float32)
    gated_first_layer = tf.nn.dropout(gated_tanh(hypothesis_final_states.h,
                                                 classification_hidden_size),
                                      keep_prob=dropout_input)
    gated_second_layer = tf.nn.dropout(gated_tanh(gated_first_layer,
                                                  classification_hidden_size),
                                       keep_prob=dropout_input)
    gated_third_layer = tf.nn.dropout(gated_tanh(gated_second_layer,
                                                 classification_hidden_size),
                                      keep_prob=dropout_input)

    return tf.contrib.layers.fully_connected(gated_third_layer,
                                             num_labels,
                                             activation_fn=None)
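
gated_tanh is another project helper not shown in these snippets. Judging from how it is called (sometimes with only an output size, sometimes with pre-built W_plus_b / W_plus_b_prime layers so weights can be shared), it is presumably the gated tanh unit y = tanh(Wx + b) * sigmoid(W'x + b') from the bottom-up/top-down attention literature. A sketch under that assumption; the parameter names are guesses:

import tensorflow as tf


def gated_tanh(x, output_size=None, W_plus_b=None, W_plus_b_prime=None):
    # If no affine layers are supplied, create fresh fully connected layers of
    # width output_size; otherwise reuse the provided ones (this is how the
    # multi-task model below shares weights across its branches).
    if W_plus_b is None:
        W_plus_b = lambda t: tf.contrib.layers.fully_connected(
            t, output_size, activation_fn=None)
    if W_plus_b_prime is None:
        W_plus_b_prime = lambda t: tf.contrib.layers.fully_connected(
            t, output_size, activation_fn=None)
    y_tilde = tf.tanh(W_plus_b(x))      # candidate activation
    g = tf.sigmoid(W_plus_b_prime(x))   # gate in (0, 1)
    return tf.multiply(y_tilde, g)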
Example #4
def build_tl_mt_model(sentence_input, premise_input, hypothesis_input,
                      img_features_input, dropout_input, num_tokens,
                      num_ic_labels, num_vte_labels, embeddings,
                      embeddings_size, num_img_features, img_features_size,
                      train_embeddings, rnn_hidden_size,
                      multimodal_fusion_hidden_size,
                      classification_hidden_size):
    sentence_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(sentence_input,
                             tf.zeros_like(sentence_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)
    premise_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(premise_input,
                             tf.zeros_like(premise_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)
    hypothesis_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(hypothesis_input,
                             tf.zeros_like(hypothesis_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)
    if embeddings is not None:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=glove_embeddings_initializer(embeddings),
            trainable=train_embeddings)
        print("Loaded GloVe embeddings!")
    else:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=tf.random_normal_initializer(stddev=0.05),
            trainable=train_embeddings)
    sentence_embeddings = tf.nn.embedding_lookup(embedding_matrix,
                                                 sentence_input)
    premise_embeddings = tf.nn.embedding_lookup(embedding_matrix,
                                                premise_input)
    hypothesis_embeddings = tf.nn.embedding_lookup(embedding_matrix,
                                                   hypothesis_input)
    lstm_cell = DropoutWrapper(tf.nn.rnn_cell.LSTMCell(rnn_hidden_size),
                               input_keep_prob=dropout_input,
                               output_keep_prob=dropout_input)
    sentence_outputs, sentence_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=sentence_embeddings,
        sequence_length=sentence_length,
        dtype=tf.float32)
    premise_outputs, premise_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=premise_embeddings,
        sequence_length=premise_length,
        dtype=tf.float32)
    hypothesis_outputs, hypothesis_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=hypothesis_embeddings,
        sequence_length=hypothesis_length,
        dtype=tf.float32)
    normalized_img_features = tf.nn.l2_normalize(img_features_input, dim=2)

    reshaped_sentence = tf.reshape(
        tf.tile(sentence_final_states.h, [1, num_img_features]),
        [-1, num_img_features, rnn_hidden_size])
    img_sentence_concatenation = tf.concat(
        [normalized_img_features, reshaped_sentence], -1)
    gated_img_sentence_concatenation = tf.nn.dropout(gated_tanh(
        img_sentence_concatenation, rnn_hidden_size),
                                                     keep_prob=dropout_input)
    att_wa_sentence = lambda x: tf.nn.dropout(
        tf.contrib.layers.fully_connected(
            x, 1, activation_fn=None, biases_initializer=None),
        keep_prob=dropout_input)
    a_sentence = att_wa_sentence(gated_img_sentence_concatenation)
    a_sentence = tf.nn.softmax(tf.squeeze(a_sentence))
    v_head_sentence = tf.squeeze(
        tf.matmul(tf.expand_dims(a_sentence, 1), normalized_img_features))

    with tf.variable_scope(
            "gated_sentence_scope_W_plus_b") as gated_sentence_scope_W_plus_b:
        gated_sentence_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_sentence_scope_W_plus_b)
    with tf.variable_scope("gated_sentence_scope_W_plus_b_prime"
                           ) as gated_sentence_scope_W_plus_b_prime:
        gated_sentence_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_sentence_scope_W_plus_b_prime)
    gated_sentence = tf.nn.dropout(
        gated_tanh(sentence_final_states.h,
                   multimodal_fusion_hidden_size,
                   W_plus_b=gated_sentence_W_plus_b,
                   W_plus_b_prime=gated_sentence_W_plus_b_prime),
        keep_prob=dropout_input,
    )

    v_head_sentence.set_shape(
        (sentence_embeddings.get_shape()[0], img_features_size))
    with tf.variable_scope("gated_img_features_sentence_scope_W_plus_b"
                           ) as gated_img_features_sentence_scope_W_plus_b:
        gated_img_features_sentence_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_img_features_sentence_scope_W_plus_b)
    with tf.variable_scope(
            "gated_img_features_sentence_scope_W_plus_b_prime"
    ) as gated_img_features_sentence_scope_W_plus_b_prime:
        gated_img_features_sentence_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_img_features_sentence_scope_W_plus_b_prime)
    gated_img_features_sentence = tf.nn.dropout(gated_tanh(
        v_head_sentence,
        multimodal_fusion_hidden_size,
        W_plus_b=gated_img_features_sentence_W_plus_b,
        W_plus_b_prime=gated_img_features_sentence_W_plus_b_prime),
                                                keep_prob=dropout_input)

    h_sentence_img = tf.multiply(gated_sentence, gated_img_features_sentence)

    with tf.variable_scope("gated_first_layer_scope_W_plus_b"
                           ) as gated_first_layer_scope_W_plus_b:
        gated_first_layer_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            classification_hidden_size,
            activation_fn=None,
            scope=gated_first_layer_scope_W_plus_b)
    with tf.variable_scope("gated_first_layer_scope_W_plus_b_prime"
                           ) as gated_first_layer_scope_W_plus_b_prime:
        gated_first_layer_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            classification_hidden_size,
            activation_fn=None,
            scope=gated_first_layer_scope_W_plus_b_prime)
    gated_first_layer = tf.nn.dropout(gated_tanh(
        h_sentence_img,
        W_plus_b=gated_first_layer_W_plus_b,
        W_plus_b_prime=gated_first_layer_W_plus_b_prime),
                                      keep_prob=dropout_input)

    gated_second_layer = tf.nn.dropout(gated_tanh(gated_first_layer,
                                                  classification_hidden_size),
                                       keep_prob=dropout_input)
    gated_third_layer = tf.nn.dropout(gated_tanh(gated_second_layer,
                                                 classification_hidden_size),
                                      keep_prob=dropout_input)

    ic_classification = tf.nn.dropout(tf.contrib.layers.fully_connected(
        gated_third_layer, num_ic_labels, activation_fn=None),
                                      keep_prob=dropout_input)

    reshaped_premise = tf.reshape(
        tf.tile(premise_final_states.h, [1, num_img_features]),
        [-1, num_img_features, rnn_hidden_size])
    img_premise_concatenation = tf.concat(
        [normalized_img_features, reshaped_premise], -1)
    gated_img_premise_concatenation = tf.nn.dropout(gated_tanh(
        img_premise_concatenation, rnn_hidden_size),
                                                    keep_prob=dropout_input)
    att_wa_premise = lambda x: tf.nn.dropout(tf.contrib.layers.fully_connected(
        x, 1, activation_fn=None, biases_initializer=None),
                                             keep_prob=dropout_input)
    a_premise = att_wa_premise(gated_img_premise_concatenation)
    a_premise = tf.nn.softmax(tf.squeeze(a_premise))
    v_head_premise = tf.squeeze(
        tf.matmul(tf.expand_dims(a_premise, 1), normalized_img_features))

    reshaped_hypothesis = tf.reshape(
        tf.tile(hypothesis_final_states.h, [1, num_img_features]),
        [-1, num_img_features, rnn_hidden_size])
    img_hypothesis_concatenation = tf.concat(
        [normalized_img_features, reshaped_hypothesis], -1)
    gated_img_hypothesis_concatenation = tf.nn.dropout(gated_tanh(
        img_hypothesis_concatenation, rnn_hidden_size),
                                                       keep_prob=dropout_input)
    att_wa_hypothesis = lambda x: tf.nn.dropout(
        tf.contrib.layers.fully_connected(
            x, 1, activation_fn=None, biases_initializer=None),
        keep_prob=dropout_input)
    a_hypothesis = att_wa_hypothesis(gated_img_hypothesis_concatenation)
    a_hypothesis = tf.nn.softmax(tf.squeeze(a_hypothesis))
    v_head_hypothesis = tf.squeeze(
        tf.matmul(tf.expand_dims(a_hypothesis, 1), normalized_img_features))

    with tf.variable_scope(
            "gated_sentence_scope_W_plus_b") as gated_sentence_scope_W_plus_b:
        gated_premise_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_sentence_scope_W_plus_b,
            reuse=True)
    with tf.variable_scope("gated_sentence_scope_W_plus_b_prime"
                           ) as gated_sentence_scope_W_plus_b_prime:
        gated_premise_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_sentence_scope_W_plus_b_prime,
            reuse=True)
    gated_premise = tf.nn.dropout(
        gated_tanh(premise_final_states.h,
                   multimodal_fusion_hidden_size,
                   W_plus_b=gated_premise_W_plus_b,
                   W_plus_b_prime=gated_premise_W_plus_b_prime),
        keep_prob=dropout_input,
    )

    with tf.variable_scope(
            "gated_sentence_scope_W_plus_b") as gated_sentence_scope_W_plus_b:
        gated_hypothesis_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_sentence_scope_W_plus_b,
            reuse=True)
    with tf.variable_scope("gated_sentence_scope_W_plus_b_prime"
                           ) as gated_sentence_scope_W_plus_b_prime:
        gated_hypothesis_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_sentence_scope_W_plus_b_prime,
            reuse=True)
    gated_hypothesis = tf.nn.dropout(
        gated_tanh(hypothesis_final_states.h,
                   multimodal_fusion_hidden_size,
                   W_plus_b=gated_hypothesis_W_plus_b,
                   W_plus_b_prime=gated_hypothesis_W_plus_b_prime),
        keep_prob=dropout_input,
    )

    v_head_premise.set_shape(
        (premise_embeddings.get_shape()[0], img_features_size))
    with tf.variable_scope("gated_img_features_sentence_scope_W_plus_b"
                           ) as gated_img_features_sentence_scope_W_plus_b:
        gated_img_features_premise_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_img_features_sentence_scope_W_plus_b,
            reuse=True)
    with tf.variable_scope(
            "gated_img_features_sentence_scope_W_plus_b_prime"
    ) as gated_img_features_sentence_scope_W_plus_b_prime:
        gated_img_features_premise_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_img_features_sentence_scope_W_plus_b_prime,
            reuse=True)
    gated_img_features_premise = tf.nn.dropout(gated_tanh(
        v_head_premise,
        multimodal_fusion_hidden_size,
        W_plus_b=gated_img_features_premise_W_plus_b,
        W_plus_b_prime=gated_img_features_premise_W_plus_b_prime),
                                               keep_prob=dropout_input)

    v_head_hypothesis.set_shape(
        (hypothesis_embeddings.get_shape()[0], img_features_size))
    with tf.variable_scope("gated_img_features_sentence_scope_W_plus_b"
                           ) as gated_img_features_sentence_scope_W_plus_b:
        gated_img_features_hypothesis_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_img_features_sentence_scope_W_plus_b,
            reuse=True)
    with tf.variable_scope(
            "gated_img_features_sentence_scope_W_plus_b_prime"
    ) as gated_img_features_sentence_scope_W_plus_b_prime:
        gated_img_features_hypothesis_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_img_features_sentence_scope_W_plus_b_prime,
            reuse=True)
    gated_img_features_hypothesis = tf.nn.dropout(gated_tanh(
        v_head_hypothesis,
        multimodal_fusion_hidden_size,
        W_plus_b=gated_img_features_hypothesis_W_plus_b,
        W_plus_b_prime=gated_img_features_hypothesis_W_plus_b_prime),
                                                  keep_prob=dropout_input)

    h_premise_img = tf.multiply(gated_premise, gated_img_features_premise)
    h_hypothesis_img = tf.multiply(gated_hypothesis,
                                   gated_img_features_hypothesis)

    with tf.variable_scope("gated_first_layer_scope_W_plus_b"
                           ) as gated_first_layer_scope_W_plus_b:
        gated_h_premise_img_hidden_layer_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            classification_hidden_size,
            activation_fn=None,
            scope=gated_first_layer_scope_W_plus_b,
            reuse=True)
    with tf.variable_scope("gated_first_layer_scope_W_plus_b_prime"
                           ) as gated_first_layer_scope_W_plus_b_prime:
        gated_h_premise_hidden_layer_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            classification_hidden_size,
            activation_fn=None,
            scope=gated_first_layer_scope_W_plus_b_prime,
            reuse=True)
    gated_h_premise_img_hidden_layer = tf.nn.dropout(gated_tanh(
        h_premise_img,
        W_plus_b=gated_h_premise_img_hidden_layer_W_plus_b,
        W_plus_b_prime=gated_h_premise_hidden_layer_W_plus_b_prime),
                                                     keep_prob=dropout_input)

    with tf.variable_scope("gated_first_layer_scope_W_plus_b"
                           ) as gated_first_layer_scope_W_plus_b:
        gated_h_hypothesis_img_hidden_layer_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            classification_hidden_size,
            activation_fn=None,
            scope=gated_first_layer_scope_W_plus_b,
            reuse=True)
    with tf.variable_scope("gated_first_layer_scope_W_plus_b_prime"
                           ) as gated_first_layer_scope_W_plus_b_prime:
        gated_h_hypothesis_hidden_layer_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            classification_hidden_size,
            activation_fn=None,
            scope=gated_first_layer_scope_W_plus_b_prime,
            reuse=True)
    gated_h_hypothesis_img_hidden_layer = tf.nn.dropout(
        gated_tanh(
            h_hypothesis_img,
            W_plus_b=gated_h_hypothesis_img_hidden_layer_W_plus_b,
            W_plus_b_prime=gated_h_hypothesis_hidden_layer_W_plus_b_prime),
        keep_prob=dropout_input)

    final_concatenation = tf.concat([
        gated_h_premise_img_hidden_layer, gated_h_hypothesis_img_hidden_layer
    ], 1)

    gated_first_layer = tf.nn.dropout(gated_tanh(final_concatenation,
                                                 classification_hidden_size),
                                      keep_prob=dropout_input)
    gated_second_layer = tf.nn.dropout(gated_tanh(gated_first_layer,
                                                  classification_hidden_size),
                                       keep_prob=dropout_input)

    vte_classification = tf.nn.dropout(tf.contrib.layers.fully_connected(
        gated_second_layer, num_vte_labels, activation_fn=None),
                                       keep_prob=dropout_input)

    return ic_classification, vte_classification
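
The multi-task builder above shares its fusion and classification weights between the image-classification and VTE branches by re-opening the same variable scopes and calling fully_connected with that scope plus reuse=True. The pattern in isolation; names and sizes here are illustrative, not taken from the original code:

import tensorflow as tf

# First branch: create the shared layer's variables.
with tf.variable_scope("shared_fc") as shared_fc_scope:
    pass  # only capture the scope object
first = tf.contrib.layers.fully_connected(
    tf.zeros([2, 4]), 8, activation_fn=None, scope=shared_fc_scope)

# Second branch: re-open the scope by name and reuse the same variables
# instead of creating new ones.
with tf.variable_scope("shared_fc") as shared_fc_scope:
    pass
second = tf.contrib.layers.fully_connected(
    tf.ones([2, 4]), 8, activation_fn=None, scope=shared_fc_scope, reuse=True)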
Example #5
def build_bottom_up_top_down_ic_model(sentence_input,
                                      img_features_input,
                                      dropout_input,
                                      num_tokens,
                                      num_labels,
                                      embeddings,
                                      embeddings_size,
                                      num_img_features,
                                      img_features_size,
                                      train_embeddings,
                                      rnn_hidden_size,
                                      multimodal_fusion_hidden_size,
                                      classification_hidden_size):
    sentence_length = tf.cast(
        tf.reduce_sum(
            tf.cast(tf.not_equal(sentence_input, tf.zeros_like(sentence_input, dtype=tf.int32)), tf.int64),
            1
        ),
        tf.int32
    )
    if embeddings is not None:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=glove_embeddings_initializer(embeddings),
            trainable=train_embeddings
        )
        print("Loaded GloVe embeddings!")
    else:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=tf.random_normal_initializer(stddev=0.05),
            trainable=train_embeddings
        )
    sentence_embeddings = tf.nn.embedding_lookup(embedding_matrix, sentence_input)
    lstm_cell = DropoutWrapper(
        tf.nn.rnn_cell.LSTMCell(rnn_hidden_size),
        input_keep_prob=dropout_input,
        output_keep_prob=dropout_input
    )
    sentence_outputs, sentence_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=sentence_embeddings,
        sequence_length=sentence_length,
        dtype=tf.float32
    )
    normalized_img_features = tf.nn.l2_normalize(img_features_input, dim=2)

    reshaped_sentence = tf.reshape(tf.tile(sentence_final_states.h, [1, num_img_features]), [-1, num_img_features, rnn_hidden_size])
    img_sentence_concatenation = tf.concat([normalized_img_features, reshaped_sentence], -1)
    gated_img_sentence_concatenation = gated_tanh(img_sentence_concatenation, rnn_hidden_size)
    att_wa_sentence = lambda x: tf.contrib.layers.fully_connected(x, 1, activation_fn=None, biases_initializer=None)
    a_sentence = att_wa_sentence(gated_img_sentence_concatenation)
    a_sentence = tf.nn.softmax(tf.squeeze(a_sentence))
    v_head_sentence = tf.squeeze(tf.matmul(tf.expand_dims(a_sentence, 1), normalized_img_features))
    v_head_sentence.set_shape((sentence_embeddings.get_shape()[0], img_features_size))

    gated_sentence = tf.nn.dropout(
        gated_tanh(sentence_final_states.h, multimodal_fusion_hidden_size),
        keep_prob=dropout_input
    )
    gated_img_features_sentence = tf.nn.dropout(
        gated_tanh(v_head_sentence, multimodal_fusion_hidden_size),
        keep_prob=dropout_input
    )
    h_sentence_img = tf.multiply(gated_sentence, gated_img_features_sentence)
    gated_first_layer = tf.nn.dropout(
        gated_tanh(h_sentence_img, classification_hidden_size),
        keep_prob=dropout_input
    )
    gated_second_layer = tf.nn.dropout(
        gated_tanh(gated_first_layer, classification_hidden_size),
        keep_prob=dropout_input
    )
    gated_third_layer = tf.nn.dropout(
        gated_tanh(gated_second_layer, classification_hidden_size),
        keep_prob=dropout_input
    )

    return tf.contrib.layers.fully_connected(
        gated_third_layer,
        num_labels,
        activation_fn=None
    )
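
The attention block above scores each image region against the sentence representation, softmax-normalizes the scores, and takes the weighted average of the region features (the expand_dims / matmul / squeeze sequence that produces v_head_sentence). The same pooling in plain NumPy, with made-up numbers:

import numpy as np

scores = np.array([2.0, 0.5, -1.0])                      # one score per region (K = 3)
weights = np.exp(scores) / np.exp(scores).sum()          # softmax over the K regions
regions = np.arange(12, dtype=np.float32).reshape(3, 4)  # K = 3 regions, D = 4 features
v_head = weights @ regions                               # attended feature, shape (4,)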
Example #6
def build_lstm_vte_model(premise_input,
                         hypothesis_input,
                         img_features_input,
                         dropout_input,
                         num_tokens,
                         num_labels,
                         embeddings,
                         embeddings_size,
                         train_embeddings,
                         rnn_hidden_size,
                         multimodal_fusion_hidden_size,
                         classification_hidden_size):
    premise_length = tf.cast(
        tf.reduce_sum(
            tf.cast(tf.not_equal(premise_input, tf.zeros_like(premise_input, dtype=tf.int32)), tf.int64),
            1
        ),
        tf.int32
    )
    hypothesis_length = tf.cast(
        tf.reduce_sum(
            tf.cast(tf.not_equal(hypothesis_input, tf.zeros_like(hypothesis_input, dtype=tf.int32)), tf.int64),
            1
        ),
        tf.int32
    )
    if embeddings is not None:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=glove_embeddings_initializer(embeddings),
            trainable=train_embeddings
        )
        print("Loaded GloVe embeddings!")
    else:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=tf.random_normal_initializer(stddev=0.05),
            trainable=train_embeddings
        )
    premise_embeddings = tf.nn.embedding_lookup(embedding_matrix, premise_input)
    hypothesis_embeddings = tf.nn.embedding_lookup(embedding_matrix, hypothesis_input)
    lstm_cell = DropoutWrapper(
        tf.nn.rnn_cell.LSTMCell(rnn_hidden_size),
        input_keep_prob=dropout_input,
        output_keep_prob=dropout_input
    )
    premise_outputs, premise_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=premise_embeddings,
        sequence_length=premise_length,
        dtype=tf.float32
    )
    # premise_last = extract_axis_1(premise_outputs, premise_length - 1)
    hypothesis_outputs, hypothesis_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=hypothesis_embeddings,
        sequence_length=hypothesis_length,
        dtype=tf.float32
    )
    # hypothesis_last = extract_axis_1(hypothesis_outputs, hypothesis_length - 1)
    normalized_img_features = tf.nn.l2_normalize(img_features_input, dim=1)
    premise_hidden_features = tf.contrib.layers.fully_connected(
        premise_final_states.h,
        multimodal_fusion_hidden_size,
        activation_fn=tf.nn.relu
    )
    hypothesis_hidden_features = tf.contrib.layers.fully_connected(
        hypothesis_final_states.h,
        multimodal_fusion_hidden_size,
        activation_fn=tf.nn.relu
    )
    img_hidden_features = tf.contrib.layers.fully_connected(
        normalized_img_features,
        multimodal_fusion_hidden_size,
        activation_fn=tf.nn.relu
    )
    premise_img_multimodal_fusion = tf.multiply(premise_hidden_features, img_hidden_features)
    hypothesis_img_multimodal_fusion = tf.multiply(hypothesis_hidden_features, img_hidden_features)
    final_concatenation = tf.concat([premise_img_multimodal_fusion, hypothesis_img_multimodal_fusion], axis=1)
    return tf.contrib.layers.fully_connected(
        tf.contrib.layers.fully_connected(
            tf.contrib.layers.fully_connected(
                tf.contrib.layers.fully_connected(
                    final_concatenation,
                    classification_hidden_size,
                    activation_fn=tf.nn.relu
                ),
                classification_hidden_size,
                activation_fn=tf.nn.relu
            ),
            classification_hidden_size,
            activation_fn=tf.nn.relu
        ),
        num_labels,
        activation_fn=None
    )
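
A minimal usage sketch for the builder above. The placeholder shapes follow the function body; the vocabulary size, feature size and hidden sizes are illustrative assumptions, and pre-trained embeddings are skipped (embeddings=None):

import tensorflow as tf

premise_input = tf.placeholder(tf.int32, (None, None), name="premise_input")
hypothesis_input = tf.placeholder(tf.int32, (None, None), name="hypothesis_input")
img_features_input = tf.placeholder(tf.float32, (None, 2048), name="img_features_input")
dropout_input = tf.placeholder(tf.float32, name="dropout_input")
label_input = tf.placeholder(tf.int32, (None,), name="label_input")

logits = build_lstm_vte_model(
    premise_input, hypothesis_input, img_features_input, dropout_input,
    num_tokens=10000, num_labels=3, embeddings=None, embeddings_size=300,
    train_embeddings=True, rnn_hidden_size=512,
    multimodal_fusion_hidden_size=512, classification_hidden_size=512)

label_loss = tf.losses.sparse_softmax_cross_entropy(labels=label_input,
                                                    logits=logits)
train_op = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(label_loss)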
Example #7
def build_model(config,
                embeddings,
                mode,
                ilabel2itoken=None,
                inference_batch=None):
    """Basic setup.

    Args:
      config: Object containing configuration parameters.
      mode: "train" or "inference".
      inference_batch: if mode is 'inference', we will need to provide the batch_size of input data. Otherwise, leave it as None. 
      glove_vocab: if we need to use glove word2vec to initialize our vocab embeddings, we will provide with a matrix of [config.vocab_size, config.embedding_size]. If not, we leave it as None. 
    """
    assert mode in ["train", "inference"]
    if mode == 'inference' and inference_batch is None:
        raise ValueError(
            "In inference mode, inference_batch must be provided!")

    # To match the "Show and Tell" paper we initialize all variables with a
    # random uniform initializer.
    initializer = tf.random_uniform_initializer(
        minval=-config.initializer_scale, maxval=config.initializer_scale)

    ### Inputs for VQA model ###

    hypothesis_input = tf.placeholder(tf.int32, (None, None),
                                      name="hypothesis_input")
    img_features_input = tf.placeholder(
        tf.float32, (None, config.num_img_features, config.img_features_size),
        name="img_features_input")
    label_input = tf.placeholder(tf.int32, (None, ), name="label_input")
    dropout_input = tf.placeholder(tf.float32, name="dropout_input")

    ### Inputs for explanation generation ###

    # An int32 Tensor with shape [batch_size, padded_length].
    input_seqs = tf.placeholder(tf.int32, [None, None], name='input_seqs')

    # An int32 Tensor with shape [batch_size, padded_length].
    target_seqs = tf.placeholder(tf.int32, [None, None], name='target_seqs')

    # A float32 scalar Tensor: dropout keep probability for the explanation LSTM.
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    # An int32 0/1 Tensor with shape [batch_size, padded_length].
    input_mask = tf.placeholder(tf.int32, [None, None], name='input_mask')

    # A float32 Tensor with shape [batch_size, image_feature_size].
    image_feature = tf.placeholder(tf.float32,
                                   [None, config.image_feature_size],
                                   name='image_feature')

    # A float32 Tensor with shape [batch_size, padded_length, embedding_size].
    seq_embedding = None

    # A float32 scalar Tensor; the total loss for the trainer to optimize.
    total_loss = None

    # A float32 Tensor with shape [batch_size * padded_length].
    target_cross_entropy_losses = None

    # A float32 Tensor with shape [batch_size * padded_length].
    target_cross_entropy_loss_weights = None

    # Collection of variables from the inception submodel.
    inception_variables = []

    # Global step Tensor.
    global_step = tf.Variable(
        initial_value=0,
        name="global_step",
        trainable=False,
        collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])

    # Dynamic batch size
    batch_size = tf.shape(hypothesis_input)[0]

    # Table to map label_id to token_id
    if ilabel2itoken:
        keys = list(ilabel2itoken.keys())
        values = [ilabel2itoken[k] for k in keys]
        ilabel2itoken_table = tf.contrib.lookup.HashTable(
            tf.contrib.lookup.KeyValueTensorInitializer(keys,
                                                        values,
                                                        key_dtype=tf.int32,
                                                        value_dtype=tf.int32),
            -1)

    ### Builds the input sequence embeddings ###
    # Inputs:
    #   input_seqs
    # Outputs:
    #   seq_embedding
    ############################################

#     with tf.variable_scope("seq_embedding"), tf.device("/cpu:0"):
#         if glove_vocab is None:
#             embedding_map = tf.get_variable(
#                 name="map",
#                 shape=[config.vocab_size, config.embedding_size],
#                 initializer=initializer)
#         else:
#             init = tf.constant(glove_vocab.astype('float32'))
#             embedding_map = tf.get_variable(
#                 name="map",
#                 initializer=init)
#         seq_embedding = tf.nn.embedding_lookup(embedding_map, input_seqs)

    with tf.variable_scope("hypothesis_embeddings"), tf.device("/cpu:0"):
        if embeddings is not None:
            embedding_map = tf.get_variable(
                "map",
                shape=[config.vocab_size, config.embedding_size],
                initializer=glove_embeddings_initializer(embeddings),
                trainable=config.train_embeddings)
            print("Loaded GloVe embeddings!")
        else:
            embedding_map = tf.get_variable(
                "map",
                shape=[config.vocab_size, config.embedding_size],
                initializer=tf.random_normal_initializer(stddev=0.05),
                trainable=config.train_embeddings  #TODO
            )
        hypothesis_embeddings = tf.nn.embedding_lookup(embedding_map,
                                                       hypothesis_input)

    ############ Builds the model ##############
    # Inputs:
    #   image_feature
    #   seq_embedding
    #   target_seqs (training and eval only)
    #   input_mask (training and eval only)
    # Outputs:
    #   total_loss (training and eval only)
    #   target_cross_entropy_losses (training and eval only)
    #   target_cross_entropy_loss_weights (training and eval only)
    ############################################

    ############ VQA part ######################

    hypothesis_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(hypothesis_input,
                             tf.zeros_like(hypothesis_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)

    lstm_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.LSTMCell(
        config.num_lstm_units),
                                              input_keep_prob=dropout_input,
                                              output_keep_prob=dropout_input)
    hypothesis_outputs, hypothesis_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=hypothesis_embeddings,
        sequence_length=hypothesis_length,
        dtype=tf.float32)
    normalized_img_features = tf.nn.l2_normalize(img_features_input, dim=2)

    reshaped_hypothesis = tf.reshape(
        tf.tile(hypothesis_final_states.h, [1, config.num_img_features]),
        [-1, config.num_img_features, config.num_lstm_units])
    img_hypothesis_concatenation = tf.concat(
        [normalized_img_features, reshaped_hypothesis], -1)
    gated_img_hypothesis_concatenation = tf.nn.dropout(gated_tanh(
        img_hypothesis_concatenation, config.num_lstm_units),
                                                       keep_prob=dropout_input)
    att_wa_hypothesis = lambda x: tf.nn.dropout(
        tf.contrib.layers.fully_connected(
            x, 1, activation_fn=None, biases_initializer=None),
        keep_prob=dropout_input)
    a_hypothesis = att_wa_hypothesis(gated_img_hypothesis_concatenation)
    a_hypothesis = tf.nn.softmax(tf.squeeze(a_hypothesis, axis=-1))

    v_head_hypothesis = tf.squeeze(tf.matmul(tf.expand_dims(a_hypothesis, 1),
                                             normalized_img_features),
                                   axis=1)

    gated_hypothesis = tf.nn.dropout(gated_tanh(
        hypothesis_final_states.h, config.multimodal_fusion_hidden_size),
                                     keep_prob=dropout_input)
    v_head_hypothesis.set_shape(
        (hypothesis_embeddings.get_shape()[0], config.img_features_size))
    gated_img_features_hypothesis = tf.nn.dropout(gated_tanh(
        v_head_hypothesis, config.multimodal_fusion_hidden_size),
                                                  keep_prob=dropout_input)
    h_hypothesis_img = tf.multiply(gated_hypothesis,
                                   gated_img_features_hypothesis)

    final_concatenation = tf.concat([h_hypothesis_img], 1)
    gated_first_layer = tf.nn.dropout(gated_tanh(
        final_concatenation, config.classification_hidden_size),
                                      keep_prob=dropout_input)
    gated_second_layer = tf.nn.dropout(gated_tanh(
        gated_first_layer, config.classification_hidden_size),
                                       keep_prob=dropout_input)
    gated_third_layer = tf.nn.dropout(gated_tanh(
        gated_second_layer, config.classification_hidden_size),
                                      keep_prob=dropout_input)

    label_logits = tf.contrib.layers.fully_connected(gated_third_layer,
                                                     config.num_labels,
                                                     activation_fn=None)

    ############## Explanation generation part ######################
    multimodal_feature = final_concatenation

    if mode == 'train' and ilabel2itoken:
        # prepend gold label
        # done outside of the build function in inference mode
        pre_labels = ilabel2itoken_table.lookup(label_input)
        input_seqs = tf.concat([tf.expand_dims(pre_labels, 1), input_seqs],
                               axis=1)

    with tf.variable_scope("seq_embedding"), tf.device("/cpu:0"):
        seq_embedding = tf.nn.embedding_lookup(embedding_map, input_seqs)

    lstm_cell_expl = tf.nn.rnn_cell.LSTMCell(num_units=config.num_lstm_units,
                                             state_is_tuple=True)

    lstm_cell_expl = tf.nn.rnn_cell.DropoutWrapper(lstm_cell_expl,
                                                   input_keep_prob=keep_prob,
                                                   output_keep_prob=keep_prob)

    # TODO: attention?
    #attn_meca = tf.contrib.seq2seq.BahdanauAttention(config.num_lstm_units, multimodal_feature)
    #attn_cell = tf.contrib.seq2seq.AttentionWrapper(lstm_cell_expl, attn_meca, output_attention=False)

    with tf.variable_scope("lstm", initializer=initializer) as lstm_scope:

        # Feed the image embeddings to set the initial LSTM state.
        if mode == 'train':
            zero_state = lstm_cell_expl.zero_state(batch_size=batch_size,
                                                   dtype=tf.float32)
            #zero_state = attn_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        elif mode == 'inference':
            zero_state = lstm_cell_expl.zero_state(batch_size=inference_batch,
                                                   dtype=tf.float32)
            #zero_state = attn_cell.zero_state(batch_size=inference_batch, dtype=tf.float32)

        with tf.variable_scope('multimodal_embeddings'):
            multimodal_embeddings = tf.contrib.layers.fully_connected(
                inputs=multimodal_feature,
                num_outputs=config.embedding_size,
                activation_fn=None,
                weights_initializer=initializer,
                biases_initializer=None)

        _, initial_state = lstm_cell_expl(multimodal_embeddings, zero_state)
        #_, initial_state = attn_cell(multimodal_embeddings, zero_state)

        # Allow the LSTM variables to be reused.
        lstm_scope.reuse_variables()

        # Run the batch of sequence embeddings through the LSTM.
        sequence_length = tf.reduce_sum(input_mask, 1)
        lstm_outputs, final_state = tf.nn.dynamic_rnn(
            cell=lstm_cell_expl,
            inputs=seq_embedding,
            sequence_length=sequence_length,
            initial_state=initial_state,
            dtype=tf.float32,
            scope=lstm_scope)

        #         lstm_outputs, final_state = tf.nn.dynamic_rnn(cell=attn_cell,
        #                                                     inputs=seq_embedding,
        #                                                     sequence_length=sequence_length,
        #                                                     initial_state=initial_state,
        #                                                     dtype=tf.float32,
        #                                                     scope=lstm_scope)

        # Stack batches vertically.
        lstm_outputs = tf.reshape(
            lstm_outputs,
            [-1, lstm_cell_expl.output_size])  # output_size == 256
        #lstm_outputs = tf.reshape(lstm_outputs, [-1, attn_cell.output_size]) # output_size == 256

    with tf.variable_scope('logits'):
        W = tf.get_variable('W',
                            [lstm_cell_expl.output_size, config.vocab_size],
                            initializer=initializer)
        #W = tf.get_variable('W', [attn_cell.output_size, config.vocab_size], initializer=initializer)
        b = tf.get_variable('b', [config.vocab_size],
                            initializer=tf.constant_initializer(0.0))

        logits = tf.matmul(
            lstm_outputs,
            W) + b  # logits: [batch_size * padded_length, config.vocab_size]

    ###### for inference & validation only #######
    softmax = tf.nn.softmax(logits)
    preds = tf.argmax(softmax, 1)
    ##############################################

    # for training only below
    targets = tf.reshape(target_seqs, [-1])
    weights = tf.to_float(tf.reshape(input_mask, [-1]))

    # Compute losses.

    label_loss = tf.losses.sparse_softmax_cross_entropy(
        label_input, label_logits)
    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets,
                                                            logits=logits)

    explanation_loss = tf.div(tf.reduce_sum(tf.multiply(losses, weights)),
                              tf.reduce_sum(weights),
                              name="explanation_loss")
    batch_loss = (1 -
                  config.alpha) * explanation_loss + config.alpha * label_loss
    tf.contrib.losses.add_loss(batch_loss)
    total_loss = tf.contrib.losses.get_total_loss()

    # target_cross_entropy_losses = losses  # Used in evaluation.
    # target_cross_entropy_loss_weights = weights  # Used in evaluation.

    # TODO: what else should I return?

    return dict(total_loss=total_loss,
                global_step=global_step,
                image_feature=image_feature,
                input_mask=input_mask,
                target_seqs=target_seqs,
                input_seqs=input_seqs,
                final_state=final_state,
                initial_state=initial_state,
                softmax=softmax,
                preds=preds,
                keep_prob=keep_prob,
                saver=tf.train.Saver(),
                hypothesis_input=hypothesis_input,
                img_features_input=img_features_input,
                label_input=label_input,
                dropout_input=dropout_input,
                label_logits=label_logits,
                explanation_loss=explanation_loss,
                attention_output=a_hypothesis)
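
A minimal sketch of instantiating the combined classification + explanation model above; the config fields mirror the attributes referenced in build_model, and every value is an illustrative assumption:

from types import SimpleNamespace

import tensorflow as tf

config = SimpleNamespace(
    initializer_scale=0.08,
    num_img_features=36,
    img_features_size=2048,
    image_feature_size=2048,
    vocab_size=10000,
    embedding_size=300,
    train_embeddings=True,
    num_lstm_units=512,
    multimodal_fusion_hidden_size=512,
    classification_hidden_size=512,
    num_labels=3,
    alpha=0.5)

model = build_model(config, embeddings=None, mode="train")
train_op = tf.train.AdamOptimizer(1e-4).minimize(model["total_loss"],
                                                 global_step=model["global_step"])

with tf.Session() as sess:
    # tables_initializer matters once ilabel2itoken is passed and the lookup
    # table is created; with no tables it is a harmless no-op.
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])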