Example #1
def build_simple_ic_model(sentence_input, img_features_input, dropout_input,
                          num_tokens, num_labels, embeddings, embeddings_size,
                          train_embeddings, rnn_hidden_size,
                          multimodal_fusion_hidden_size,
                          classification_hidden_size):
    sentence_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(sentence_input,
                             tf.zeros_like(sentence_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)
    if embeddings is not None:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=glove_embeddings_initializer(embeddings),
            trainable=train_embeddings)
        print("Loaded GloVe embeddings!")
    else:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=tf.random_normal_initializer(stddev=0.05),
            trainable=train_embeddings)
    sentence_embeddings = tf.nn.embedding_lookup(embedding_matrix,
                                                 sentence_input)
    lstm_cell = DropoutWrapper(tf.nn.rnn_cell.LSTMCell(rnn_hidden_size),
                               input_keep_prob=dropout_input,
                               output_keep_prob=dropout_input)
    sentence_outputs, sentence_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=sentence_embeddings,
        sequence_length=sentence_length,
        dtype=tf.float32)
    normalized_img_features = tf.nn.l2_normalize(img_features_input, dim=1)
    gated_sentence_hidden_layer = tf.nn.dropout(gated_tanh(
        sentence_final_states.h, multimodal_fusion_hidden_size),
                                                keep_prob=dropout_input)
    gated_img_hidden_layer = tf.nn.dropout(gated_tanh(
        normalized_img_features, multimodal_fusion_hidden_size),
                                           keep_prob=dropout_input)
    sentence_img_multimodal_fusion = tf.multiply(gated_sentence_hidden_layer,
                                                 gated_img_hidden_layer)
    gated_first_layer = tf.nn.dropout(gated_tanh(
        sentence_img_multimodal_fusion, classification_hidden_size),
                                      keep_prob=dropout_input)
    gated_second_layer = tf.nn.dropout(gated_tanh(gated_first_layer,
                                                  classification_hidden_size),
                                       keep_prob=dropout_input)
    gated_third_layer = tf.nn.dropout(gated_tanh(gated_second_layer,
                                                 classification_hidden_size),
                                      keep_prob=dropout_input)

    return tf.contrib.layers.fully_connected(gated_third_layer,
                                             num_labels,
                                             activation_fn=None)
Example #2
def build_simple_te_model_h(premise_input, hypothesis_input, dropout_input,
                            num_tokens, num_labels, embeddings,
                            embeddings_size, train_embeddings, rnn_hidden_size,
                            classification_hidden_size):
    hypothesis_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(hypothesis_input,
                             tf.zeros_like(hypothesis_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)
    if embeddings is not None:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=glove_embeddings_initializer(embeddings),
            trainable=train_embeddings)
        print("Loaded GloVe embeddings!")
    else:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=tf.random_normal_initializer(stddev=0.05),
            trainable=train_embeddings)
    hypothesis_embeddings = tf.nn.embedding_lookup(embedding_matrix,
                                                   hypothesis_input)
    lstm_cell = DropoutWrapper(tf.nn.rnn_cell.LSTMCell(rnn_hidden_size),
                               input_keep_prob=dropout_input,
                               output_keep_prob=dropout_input)
    hypothesis_outputs, hypothesis_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=hypothesis_embeddings,
        sequence_length=hypothesis_length,
        dtype=tf.float32)
    gated_first_layer = tf.nn.dropout(gated_tanh(hypothesis_final_states.h,
                                                 classification_hidden_size),
                                      keep_prob=dropout_input)
    gated_second_layer = tf.nn.dropout(gated_tanh(gated_first_layer,
                                                  classification_hidden_size),
                                       keep_prob=dropout_input)
    gated_third_layer = tf.nn.dropout(gated_tanh(gated_second_layer,
                                                 classification_hidden_size),
                                      keep_prob=dropout_input)

    return tf.contrib.layers.fully_connected(gated_third_layer,
                                             num_labels,
                                             activation_fn=None)
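A minimal sketch of how this hypothesis-only entailment builder could be wired into a trainable graph; the placeholder names, vocabulary size, and hyper-parameters below are illustrative assumptions, not values from the source.

# Hypothetical wiring; shapes and sizes are assumptions.
premise_input = tf.placeholder(tf.int32, (None, None), name="premise_input")
hypothesis_input = tf.placeholder(tf.int32, (None, None), name="hypothesis_input")
dropout_input = tf.placeholder(tf.float32, name="dropout_input")
label_input = tf.placeholder(tf.int32, (None,), name="label_input")

logits = build_simple_te_model_h(
    premise_input, hypothesis_input, dropout_input,
    num_tokens=20000, num_labels=3, embeddings=None, embeddings_size=300,
    train_embeddings=True, rnn_hidden_size=512,
    classification_hidden_size=512)

loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label_input,
                                                   logits=logits))
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)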
Example #3
def build_tl_mt_model(sentence_input, premise_input, hypothesis_input,
                      img_features_input, dropout_input, num_tokens,
                      num_ic_labels, num_vte_labels, embeddings,
                      embeddings_size, num_img_features, img_features_size,
                      train_embeddings, rnn_hidden_size,
                      multimodal_fusion_hidden_size,
                      classification_hidden_size):
    sentence_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(sentence_input,
                             tf.zeros_like(sentence_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)
    premise_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(premise_input,
                             tf.zeros_like(premise_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)
    hypothesis_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(hypothesis_input,
                             tf.zeros_like(hypothesis_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)
    if embeddings is not None:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=glove_embeddings_initializer(embeddings),
            trainable=train_embeddings)
        print("Loaded GloVe embeddings!")
    else:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=tf.random_normal_initializer(stddev=0.05),
            trainable=train_embeddings)
    sentence_embeddings = tf.nn.embedding_lookup(embedding_matrix,
                                                 sentence_input)
    premise_embeddings = tf.nn.embedding_lookup(embedding_matrix,
                                                premise_input)
    hypothesis_embeddings = tf.nn.embedding_lookup(embedding_matrix,
                                                   hypothesis_input)
    lstm_cell = DropoutWrapper(tf.nn.rnn_cell.LSTMCell(rnn_hidden_size),
                               input_keep_prob=dropout_input,
                               output_keep_prob=dropout_input)
    sentence_outputs, sentence_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=sentence_embeddings,
        sequence_length=sentence_length,
        dtype=tf.float32)
    premise_outputs, premise_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=premise_embeddings,
        sequence_length=premise_length,
        dtype=tf.float32)
    hypothesis_outputs, hypothesis_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=hypothesis_embeddings,
        sequence_length=hypothesis_length,
        dtype=tf.float32)
    normalized_img_features = tf.nn.l2_normalize(img_features_input, dim=2)

    reshaped_sentence = tf.reshape(
        tf.tile(sentence_final_states.h, [1, num_img_features]),
        [-1, num_img_features, rnn_hidden_size])
    img_sentence_concatenation = tf.concat(
        [normalized_img_features, reshaped_sentence], -1)
    gated_img_sentence_concatenation = tf.nn.dropout(gated_tanh(
        img_sentence_concatenation, rnn_hidden_size),
                                                     keep_prob=dropout_input)
    att_wa_sentence = lambda x: tf.nn.dropout(
        tf.contrib.layers.fully_connected(
            x, 1, activation_fn=None, biases_initializer=None),
        keep_prob=dropout_input)
    a_sentence = att_wa_sentence(gated_img_sentence_concatenation)
    a_sentence = tf.nn.softmax(tf.squeeze(a_sentence))
    v_head_sentence = tf.squeeze(
        tf.matmul(tf.expand_dims(a_sentence, 1), normalized_img_features))

    with tf.variable_scope(
            "gated_sentence_scope_W_plus_b") as gated_sentence_scope_W_plus_b:
        gated_sentence_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_sentence_scope_W_plus_b)
    with tf.variable_scope("gated_sentence_scope_W_plus_b_prime"
                           ) as gated_sentence_scope_W_plus_b_prime:
        gated_sentence_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_sentence_scope_W_plus_b_prime)
    gated_sentence = tf.nn.dropout(
        gated_tanh(sentence_final_states.h,
                   multimodal_fusion_hidden_size,
                   W_plus_b=gated_sentence_W_plus_b,
                   W_plus_b_prime=gated_sentence_W_plus_b_prime),
        keep_prob=dropout_input,
    )

    v_head_sentence.set_shape(
        (sentence_embeddings.get_shape()[0], img_features_size))
    with tf.variable_scope("gated_img_features_sentence_scope_W_plus_b"
                           ) as gated_img_features_sentence_scope_W_plus_b:
        gated_img_features_sentence_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_img_features_sentence_scope_W_plus_b)
    with tf.variable_scope(
            "gated_img_features_sentence_scope_W_plus_b_prime"
    ) as gated_img_features_sentence_scope_W_plus_b_prime:
        gated_img_features_sentence_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_img_features_sentence_scope_W_plus_b_prime)
    gated_img_features_sentence = tf.nn.dropout(gated_tanh(
        v_head_sentence,
        multimodal_fusion_hidden_size,
        W_plus_b=gated_img_features_sentence_W_plus_b,
        W_plus_b_prime=gated_img_features_sentence_W_plus_b_prime),
                                                keep_prob=dropout_input)

    h_premise_img = tf.multiply(gated_sentence, gated_img_features_sentence)

    with tf.variable_scope("gated_first_layer_scope_W_plus_b"
                           ) as gated_first_layer_scope_W_plus_b:
        gated_first_layer_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            classification_hidden_size,
            activation_fn=None,
            scope=gated_first_layer_scope_W_plus_b)
    with tf.variable_scope("gated_first_layer_scope_W_plus_b_prime"
                           ) as gated_first_layer_scope_W_plus_b_prime:
        gated_first_layer_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            classification_hidden_size,
            activation_fn=None,
            scope=gated_first_layer_scope_W_plus_b_prime)
    gated_first_layer = tf.nn.dropout(gated_tanh(
        h_premise_img,
        W_plus_b=gated_first_layer_W_plus_b,
        W_plus_b_prime=gated_first_layer_W_plus_b_prime),
                                      keep_prob=dropout_input)

    gated_second_layer = tf.nn.dropout(gated_tanh(gated_first_layer,
                                                  classification_hidden_size),
                                       keep_prob=dropout_input)
    gated_third_layer = tf.nn.dropout(gated_tanh(gated_second_layer,
                                                 classification_hidden_size),
                                      keep_prob=dropout_input)

    ic_classification = tf.nn.dropout(tf.contrib.layers.fully_connected(
        gated_third_layer, num_ic_labels, activation_fn=None),
                                      keep_prob=dropout_input)

    reshaped_premise = tf.reshape(
        tf.tile(premise_final_states.h, [1, num_img_features]),
        [-1, num_img_features, rnn_hidden_size])
    img_premise_concatenation = tf.concat(
        [normalized_img_features, reshaped_premise], -1)
    gated_img_premise_concatenation = tf.nn.dropout(gated_tanh(
        img_premise_concatenation, rnn_hidden_size),
                                                    keep_prob=dropout_input)
    att_wa_premise = lambda x: tf.nn.dropout(tf.contrib.layers.fully_connected(
        x, 1, activation_fn=None, biases_initializer=None),
                                             keep_prob=dropout_input)
    a_premise = att_wa_premise(gated_img_premise_concatenation)
    a_premise = tf.nn.softmax(tf.squeeze(a_premise))
    v_head_premise = tf.squeeze(
        tf.matmul(tf.expand_dims(a_premise, 1), normalized_img_features))

    reshaped_hypothesis = tf.reshape(
        tf.tile(hypothesis_final_states.h, [1, num_img_features]),
        [-1, num_img_features, rnn_hidden_size])
    img_hypothesis_concatenation = tf.concat(
        [normalized_img_features, reshaped_hypothesis], -1)
    gated_img_hypothesis_concatenation = tf.nn.dropout(gated_tanh(
        img_hypothesis_concatenation, rnn_hidden_size),
                                                       keep_prob=dropout_input)
    att_wa_hypothesis = lambda x: tf.nn.dropout(
        tf.contrib.layers.fully_connected(
            x, 1, activation_fn=None, biases_initializer=None),
        keep_prob=dropout_input)
    a_hypothesis = att_wa_hypothesis(gated_img_hypothesis_concatenation)
    a_hypothesis = tf.nn.softmax(tf.squeeze(a_hypothesis))
    v_head_hypothesis = tf.squeeze(
        tf.matmul(tf.expand_dims(a_hypothesis, 1), normalized_img_features))

    with tf.variable_scope(
            "gated_sentence_scope_W_plus_b") as gated_sentence_scope_W_plus_b:
        gated_premise_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_sentence_scope_W_plus_b,
            reuse=True)
    with tf.variable_scope("gated_sentence_scope_W_plus_b_prime"
                           ) as gated_sentence_scope_W_plus_b_prime:
        gated_premise_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_sentence_scope_W_plus_b_prime,
            reuse=True)
    gated_premise = tf.nn.dropout(
        gated_tanh(premise_final_states.h,
                   multimodal_fusion_hidden_size,
                   W_plus_b=gated_premise_W_plus_b,
                   W_plus_b_prime=gated_premise_W_plus_b_prime),
        keep_prob=dropout_input,
    )

    with tf.variable_scope(
            "gated_sentence_scope_W_plus_b") as gated_sentence_scope_W_plus_b:
        gated_hypothesis_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_sentence_scope_W_plus_b,
            reuse=True)
    with tf.variable_scope("gated_sentence_scope_W_plus_b_prime"
                           ) as gated_sentence_scope_W_plus_b_prime:
        gated_hypothesis_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_sentence_scope_W_plus_b_prime,
            reuse=True)
    gated_hypothesis = tf.nn.dropout(
        gated_tanh(hypothesis_final_states.h,
                   multimodal_fusion_hidden_size,
                   W_plus_b=gated_hypothesis_W_plus_b,
                   W_plus_b_prime=gated_hypothesis_W_plus_b_prime),
        keep_prob=dropout_input,
    )

    v_head_premise.set_shape(
        (premise_embeddings.get_shape()[0], img_features_size))
    with tf.variable_scope("gated_img_features_sentence_scope_W_plus_b"
                           ) as gated_img_features_sentence_scope_W_plus_b:
        gated_img_features_premise_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_img_features_sentence_scope_W_plus_b,
            reuse=True)
    with tf.variable_scope(
            "gated_img_features_sentence_scope_W_plus_b_prime"
    ) as gated_img_features_sentence_scope_W_plus_b_prime:
        gated_img_features_premise_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_img_features_sentence_scope_W_plus_b_prime,
            reuse=True)
    gated_img_features_premise = tf.nn.dropout(gated_tanh(
        v_head_premise,
        multimodal_fusion_hidden_size,
        W_plus_b=gated_img_features_premise_W_plus_b,
        W_plus_b_prime=gated_img_features_premise_W_plus_b_prime),
                                               keep_prob=dropout_input)

    v_head_hypothesis.set_shape(
        (hypothesis_embeddings.get_shape()[0], img_features_size))
    with tf.variable_scope("gated_img_features_sentence_scope_W_plus_b"
                           ) as gated_img_features_sentence_scope_W_plus_b:
        gated_img_features_hypothesis_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_img_features_sentence_scope_W_plus_b,
            reuse=True)
    with tf.variable_scope(
            "gated_img_features_sentence_scope_W_plus_b_prime"
    ) as gated_img_features_sentence_scope_W_plus_b_prime:
        gated_img_features_hypothesis_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_img_features_sentence_scope_W_plus_b_prime,
            reuse=True)
    gated_img_features_hypothesis = tf.nn.dropout(gated_tanh(
        v_head_hypothesis,
        multimodal_fusion_hidden_size,
        W_plus_b=gated_img_features_hypothesis_W_plus_b,
        W_plus_b_prime=gated_img_features_hypothesis_W_plus_b_prime),
                                                  keep_prob=dropout_input)

    h_premise_img = tf.multiply(gated_premise, gated_img_features_premise)
    h_hypothesis_img = tf.multiply(gated_hypothesis,
                                   gated_img_features_hypothesis)

    with tf.variable_scope("gated_first_layer_scope_W_plus_b"
                           ) as gated_first_layer_scope_W_plus_b:
        gated_h_premise_img_hidden_layer_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            classification_hidden_size,
            activation_fn=None,
            scope=gated_first_layer_scope_W_plus_b,
            reuse=True)
    with tf.variable_scope("gated_first_layer_scope_W_plus_b_prime"
                           ) as gated_first_layer_scope_W_plus_b_prime:
        gated_h_premise_hidden_layer_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            classification_hidden_size,
            activation_fn=None,
            scope=gated_first_layer_scope_W_plus_b_prime,
            reuse=True)
    gated_h_premise_img_hidden_layer = tf.nn.dropout(gated_tanh(
        h_premise_img,
        W_plus_b=gated_h_premise_img_hidden_layer_W_plus_b,
        W_plus_b_prime=gated_h_premise_hidden_layer_W_plus_b_prime),
                                                     keep_prob=dropout_input)

    with tf.variable_scope("gated_first_layer_scope_W_plus_b"
                           ) as gated_first_layer_scope_W_plus_b:
        gated_h_hypothesis_img_hidden_layer_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            classification_hidden_size,
            activation_fn=None,
            scope=gated_first_layer_scope_W_plus_b,
            reuse=True)
    with tf.variable_scope("gated_first_layer_scope_W_plus_b_prime"
                           ) as gated_first_layer_scope_W_plus_b_prime:
        gated_h_hypothesis_hidden_layer_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            classification_hidden_size,
            activation_fn=None,
            scope=gated_first_layer_scope_W_plus_b_prime,
            reuse=True)
    gated_h_hypothesis_img_hidden_layer = tf.nn.dropout(
        gated_tanh(
            h_hypothesis_img,
            W_plus_b=gated_h_hypothesis_img_hidden_layer_W_plus_b,
            W_plus_b_prime=gated_h_hypothesis_hidden_layer_W_plus_b_prime),
        keep_prob=dropout_input)

    final_concatenation = tf.concat([
        gated_h_premise_img_hidden_layer, gated_h_hypothesis_img_hidden_layer
    ], 1)

    gated_first_layer = tf.nn.dropout(gated_tanh(final_concatenation,
                                                 classification_hidden_size),
                                      keep_prob=dropout_input)
    gated_second_layer = tf.nn.dropout(gated_tanh(gated_first_layer,
                                                  classification_hidden_size),
                                       keep_prob=dropout_input)

    vte_classification = tf.nn.dropout(tf.contrib.layers.fully_connected(
        gated_second_layer, num_vte_labels, activation_fn=None),
                                       keep_prob=dropout_input)

    return ic_classification, vte_classification
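In build_tl_mt_model the sentence, premise, and hypothesis branches share their fusion and first-layer weights: the scopes created for the sentence branch (e.g. gated_sentence_scope_W_plus_b) are reopened by name and the fully connected layers are called with reuse=True. A tiny standalone sketch of that weight-sharing pattern follows; the scope name, sizes, and placeholders are illustrative.

# Hypothetical demonstration of the scope-reuse pattern used above.
inputs_a = tf.placeholder(tf.float32, (None, 4), name="inputs_a")
inputs_b = tf.placeholder(tf.float32, (None, 4), name="inputs_b")

# First use creates the variables inside the scope.
with tf.variable_scope("shared_W_plus_b") as create_scope:
    shared_fc = lambda x: tf.contrib.layers.fully_connected(
        x, 8, activation_fn=None, scope=create_scope)
out_a = shared_fc(inputs_a)

# Reopening the scope with reuse=True makes the second branch read the
# variables created above instead of allocating new ones.
with tf.variable_scope("shared_W_plus_b") as reuse_scope:
    shared_fc_reused = lambda x: tf.contrib.layers.fully_connected(
        x, 8, activation_fn=None, scope=reuse_scope, reuse=True)
out_b = shared_fc_reused(inputs_b)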
Example #4
def build_bottom_up_top_down_ic_model(sentence_input,
                                      img_features_input,
                                      dropout_input,
                                      num_tokens,
                                      num_labels,
                                      embeddings,
                                      embeddings_size,
                                      num_img_features,
                                      img_features_size,
                                      train_embeddings,
                                      rnn_hidden_size,
                                      multimodal_fusion_hidden_size,
                                      classification_hidden_size):
    sentence_length = tf.cast(
        tf.reduce_sum(
            tf.cast(tf.not_equal(sentence_input, tf.zeros_like(sentence_input, dtype=tf.int32)), tf.int64),
            1
        ),
        tf.int32
    )
    if embeddings is not None:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=glove_embeddings_initializer(embeddings),
            trainable=train_embeddings
        )
        print("Loaded GloVe embeddings!")
    else:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=tf.random_normal_initializer(stddev=0.05),
            trainable=train_embeddings
        )
    sentence_embeddings = tf.nn.embedding_lookup(embedding_matrix, sentence_input)
    lstm_cell = DropoutWrapper(
        tf.nn.rnn_cell.LSTMCell(rnn_hidden_size),
        input_keep_prob=dropout_input,
        output_keep_prob=dropout_input
    )
    sentence_outputs, sentence_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=sentence_embeddings,
        sequence_length=sentence_length,
        dtype=tf.float32
    )
    normalized_img_features = tf.nn.l2_normalize(img_features_input, dim=2)

    reshaped_sentence = tf.reshape(tf.tile(sentence_final_states.h, [1, num_img_features]), [-1, num_img_features, rnn_hidden_size])
    img_sentence_concatenation = tf.concat([normalized_img_features, reshaped_sentence], -1)
    gated_img_sentence_concatenation = gated_tanh(img_sentence_concatenation, rnn_hidden_size)
    att_wa_sentence = lambda x: tf.contrib.layers.fully_connected(x, 1, activation_fn=None, biases_initializer=None)
    a_sentence = att_wa_sentence(gated_img_sentence_concatenation)
    a_sentence = tf.nn.softmax(tf.squeeze(a_sentence))
    v_head_sentence = tf.squeeze(tf.matmul(tf.expand_dims(a_sentence, 1), normalized_img_features))
    v_head_sentence.set_shape((sentence_embeddings.get_shape()[0], img_features_size))

    gated_sentence = tf.nn.dropout(
        gated_tanh(sentence_final_states.h, multimodal_fusion_hidden_size),
        keep_prob=dropout_input
    )
    gated_img_features_sentence = tf.nn.dropout(
        gated_tanh(v_head_sentence, multimodal_fusion_hidden_size),
        keep_prob=dropout_input
    )
    h_sentence_img = tf.multiply(gated_sentence, gated_img_features_sentence)
    gated_first_layer = tf.nn.dropout(
        gated_tanh(h_sentence_img, classification_hidden_size),
        keep_prob=dropout_input
    )
    gated_second_layer = tf.nn.dropout(
        gated_tanh(gated_first_layer, classification_hidden_size),
        keep_prob=dropout_input
    )
    gated_third_layer = tf.nn.dropout(
        gated_tanh(gated_second_layer, classification_hidden_size),
        keep_prob=dropout_input
    )

    return tf.contrib.layers.fully_connected(
        gated_third_layer,
        num_labels,
        activation_fn=None
    )
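A hedged wiring sketch for this image-classification builder; the 36-region, 2048-dimensional image features and the other numbers are assumptions chosen only to show the expected shapes.

# Hypothetical wiring; placeholder shapes and hyper-parameters are assumptions.
sentence_input = tf.placeholder(tf.int32, (None, None), name="sentence_input")
img_features_input = tf.placeholder(tf.float32, (None, 36, 2048),
                                    name="img_features_input")
dropout_input = tf.placeholder(tf.float32, name="dropout_input")
label_input = tf.placeholder(tf.int32, (None,), name="label_input")

logits = build_bottom_up_top_down_ic_model(
    sentence_input, img_features_input, dropout_input,
    num_tokens=20000, num_labels=1000, embeddings=None, embeddings_size=300,
    num_img_features=36, img_features_size=2048, train_embeddings=True,
    rnn_hidden_size=512, multimodal_fusion_hidden_size=512,
    classification_hidden_size=512)

loss = tf.losses.sparse_softmax_cross_entropy(label_input, logits)
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)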
Example #5
def build_model(config,
                embeddings,
                mode,
                ilabel2itoken=None,
                inference_batch=None):
    """Basic setup.

    Args:
      config: Object containing configuration parameters.
      mode: "train" or "inference".
      inference_batch: if mode is 'inference', we will need to provide the batch_size of input data. Otherwise, leave it as None. 
      glove_vocab: if we need to use glove word2vec to initialize our vocab embeddings, we will provide with a matrix of [config.vocab_size, config.embedding_size]. If not, we leave it as None. 
    """
    assert mode in ["train", "inference"]
    if mode == 'inference' and inference_batch is None:
        raise ValueError(
            "When inference mode, inference_batch must be provided!")
    config = config

    # To match the "Show and Tell" paper we initialize all variables with a
    # random uniform initializer.
    initializer = tf.random_uniform_initializer(
        minval=-config.initializer_scale, maxval=config.initializer_scale)

    ### Inputs for VQA model ###

    hypothesis_input = tf.placeholder(tf.int32, (None, None),
                                      name="hypothesis_input")
    img_features_input = tf.placeholder(
        tf.float32, (None, config.num_img_features, config.img_features_size),
        name="img_features_input")
    label_input = tf.placeholder(tf.int32, (None, ), name="label_input")
    dropout_input = tf.placeholder(tf.float32, name="dropout_input")

    ### Inputs for explanation generation ###

    # An int32 Tensor with shape [batch_size, padded_length].
    input_seqs = tf.placeholder(tf.int32, [None, None], name='input_seqs')

    # An int32 Tensor with shape [batch_size, padded_length].
    target_seqs = tf.placeholder(tf.int32, [None, None], name='target_seqs')

    # A float32 Tensor with shape [1]
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    # An int32 0/1 Tensor with shape [batch_size, padded_length].
    input_mask = tf.placeholder(tf.int32, [None, None], name='input_mask')

    # A float32 Tensor with shape [batch_size, image_feature_size].
    image_feature = tf.placeholder(tf.float32,
                                   [None, config.image_feature_size],
                                   name='image_feature')

    # A float32 Tensor with shape [batch_size, padded_length, embedding_size].
    seq_embedding = None

    # A float32 scalar Tensor; the total loss for the trainer to optimize.
    total_loss = None

    # A float32 Tensor with shape [batch_size * padded_length].
    target_cross_entropy_losses = None

    # A float32 Tensor with shape [batch_size * padded_length].
    target_cross_entropy_loss_weights = None

    # Collection of variables from the inception submodel.
    inception_variables = []

    # Global step Tensor.
    global_step = None
    """Sets up the global step Tensor."""
    global_step = tf.Variable(
        initial_value=0,
        name="global_step",
        trainable=False,
        collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])

    # Dynamic batch size
    batch_size = tf.shape(hypothesis_input)[0]

    # Table to map label_id to token_id
    if ilabel2itoken:
        keys = list(ilabel2itoken.keys())
        values = [ilabel2itoken[k] for k in keys]
        ilabel2itoken_table = tf.contrib.lookup.HashTable(
            tf.contrib.lookup.KeyValueTensorInitializer(keys,
                                                        values,
                                                        key_dtype=tf.int32,
                                                        value_dtype=tf.int32),
            -1)

    ### Builds the input sequence embeddings ###
    # Inputs:
    #   self.input_seqs
    # Outputs:
    #   self.seq_embeddings
    ############################################

#     with tf.variable_scope("seq_embedding"), tf.device("/cpu:0"):
#         if glove_vocab is None:
#             embedding_map = tf.get_variable(
#                 name="map",
#                 shape=[config.vocab_size, config.embedding_size],
#                 initializer=initializer)
#         else:
#             init = tf.constant(glove_vocab.astype('float32'))
#             embedding_map = tf.get_variable(
#                 name="map",
#                 initializer=init)
#         seq_embedding = tf.nn.embedding_lookup(embedding_map, input_seqs)

    with tf.variable_scope("hypothesis_embeddings"), tf.device("/cpu:0"):
        if embeddings is not None:
            embedding_map = tf.get_variable(
                "map",
                shape=[config.vocab_size, config.embedding_size],
                initializer=glove_embeddings_initializer(embeddings),
                trainable=config.train_embeddings)
            print("Loaded GloVe embeddings!")
        else:
            embedding_map = tf.get_variable(
                "map",
                shape=[config.vocab_size, config.embedding_size],
                initializer=tf.random_normal_initializer(stddev=0.05),
                trainable=config.train_embeddings  #TODO
            )
        hypothesis_embeddings = tf.nn.embedding_lookup(embedding_map,
                                                       hypothesis_input)

    ############ Builds the model ##############
    # Inputs:
    #   self.image_feature
    #   self.seq_embeddings
    #   self.target_seqs (training and eval only)
    #   self.input_mask (training and eval only)
    # Outputs:
    #   self.total_loss (training and eval only)
    #   self.target_cross_entropy_losses (training and eval only)
    #   self.target_cross_entropy_loss_weights (training and eval only)
    ############################################

    ############ VQA part ######################

    hypothesis_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(hypothesis_input,
                             tf.zeros_like(hypothesis_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)

    lstm_cell = tf.nn.rnn_cell.DropoutWrapper(tf.nn.rnn_cell.LSTMCell(
        config.num_lstm_units),
                                              input_keep_prob=dropout_input,
                                              output_keep_prob=dropout_input)
    hypothesis_outputs, hypothesis_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=hypothesis_embeddings,
        sequence_length=hypothesis_length,
        dtype=tf.float32)
    normalized_img_features = tf.nn.l2_normalize(img_features_input, dim=2)

    reshaped_hypothesis = tf.reshape(
        tf.tile(hypothesis_final_states.h, [1, config.num_img_features]),
        [-1, config.num_img_features, config.num_lstm_units])
    img_hypothesis_concatenation = tf.concat(
        [normalized_img_features, reshaped_hypothesis], -1)
    gated_img_hypothesis_concatenation = tf.nn.dropout(gated_tanh(
        img_hypothesis_concatenation, config.num_lstm_units),
                                                       keep_prob=dropout_input)
    att_wa_hypothesis = lambda x: tf.nn.dropout(
        tf.contrib.layers.fully_connected(
            x, 1, activation_fn=None, biases_initializer=None),
        keep_prob=dropout_input)
    a_hypothesis = att_wa_hypothesis(gated_img_hypothesis_concatenation)
    a_hypothesis = tf.nn.softmax(tf.squeeze(a_hypothesis, axis=-1))

    v_head_hypothesis = tf.squeeze(tf.matmul(tf.expand_dims(a_hypothesis, 1),
                                             normalized_img_features),
                                   axis=1)

    gated_hypothesis = tf.nn.dropout(gated_tanh(
        hypothesis_final_states.h, config.multimodal_fusion_hidden_size),
                                     keep_prob=dropout_input)
    v_head_hypothesis.set_shape(
        (hypothesis_embeddings.get_shape()[0], config.img_features_size))
    gated_img_features_hypothesis = tf.nn.dropout(gated_tanh(
        v_head_hypothesis, config.multimodal_fusion_hidden_size),
                                                  keep_prob=dropout_input)
    h_hypothesis_img = tf.multiply(gated_hypothesis,
                                   gated_img_features_hypothesis)

    final_concatenation = tf.concat([h_hypothesis_img], 1)
    gated_first_layer = tf.nn.dropout(gated_tanh(
        final_concatenation, config.classification_hidden_size),
                                      keep_prob=dropout_input)
    gated_second_layer = tf.nn.dropout(gated_tanh(
        gated_first_layer, config.classification_hidden_size),
                                       keep_prob=dropout_input)
    gated_third_layer = tf.nn.dropout(gated_tanh(
        gated_second_layer, config.classification_hidden_size),
                                      keep_prob=dropout_input)

    label_logits = tf.contrib.layers.fully_connected(gated_third_layer,
                                                     config.num_labels,
                                                     activation_fn=None)

    ############## Explanation generation part ######################
    multimodal_feature = final_concatenation

    if mode == 'train' and ilabel2itoken:
        # prepend gold label
        # done outside of the build function in inference mode
        pre_labels = ilabel2itoken_table.lookup(label_input)
        input_seqs = tf.concat([tf.expand_dims(pre_labels, 1), input_seqs],
                               axis=1)

    with tf.variable_scope("seq_embedding"), tf.device("/cpu:0"):
        seq_embedding = tf.nn.embedding_lookup(embedding_map, input_seqs)

    lstm_cell_expl = tf.nn.rnn_cell.LSTMCell(num_units=config.num_lstm_units,
                                             state_is_tuple=True)

    lstm_cell_expl = tf.nn.rnn_cell.DropoutWrapper(lstm_cell_expl,
                                                   input_keep_prob=keep_prob,
                                                   output_keep_prob=keep_prob)

    # TODO: attention?
    #attn_meca = tf.contrib.seq2seq.BahdanauAttention(config.num_lstm_units, multimodal_feature)
    #attn_cell = tf.contrib.seq2seq.AttentionWrapper(lstm_cell_expl, attn_meca, output_attention=False)

    with tf.variable_scope("lstm", initializer=initializer) as lstm_scope:

        # Feed the image embeddings to set the initial LSTM state.
        if mode == 'train':
            zero_state = lstm_cell_expl.zero_state(batch_size=batch_size,
                                                   dtype=tf.float32)
            #zero_state = attn_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

        elif mode == 'inference':
            zero_state = lstm_cell_expl.zero_state(batch_size=inference_batch,
                                                   dtype=tf.float32)
            #zero_state = attn_cell.zero_state(batch_size=inference_batch, dtype=tf.float32)

        with tf.variable_scope('multimodal_embeddings'):
            multimodal_embeddings = tf.contrib.layers.fully_connected(
                inputs=multimodal_feature,
                num_outputs=config.embedding_size,
                activation_fn=None,
                weights_initializer=initializer,
                biases_initializer=None)

        _, initial_state = lstm_cell_expl(multimodal_embeddings, zero_state)
        #_, initial_state = attn_cell(multimodal_embeddings, zero_state)

        # Allow the LSTM variables to be reused.
        lstm_scope.reuse_variables()

        # Run the batch of sequence embeddings through the LSTM.
        sequence_length = tf.reduce_sum(input_mask, 1)
        lstm_outputs, final_state = tf.nn.dynamic_rnn(
            cell=lstm_cell_expl,
            inputs=seq_embedding,
            sequence_length=sequence_length,
            initial_state=initial_state,
            dtype=tf.float32,
            scope=lstm_scope)

        #         lstm_outputs, final_state = tf.nn.dynamic_rnn(cell=attn_cell,
        #                                                     inputs=seq_embedding,
        #                                                     sequence_length=sequence_length,
        #                                                     initial_state=initial_state,
        #                                                     dtype=tf.float32,
        #                                                     scope=lstm_scope)

        # Stack batches vertically.
        lstm_outputs = tf.reshape(
            lstm_outputs,
            [-1, lstm_cell_expl.output_size])  # output_size == 256
        #lstm_outputs = tf.reshape(lstm_outputs, [-1, attn_cell.output_size]) # output_size == 256

    with tf.variable_scope('logits'):
        W = tf.get_variable('W',
                            [lstm_cell_expl.output_size, config.vocab_size],
                            initializer=initializer)
        #W = tf.get_variable('W', [attn_cell.output_size, config.vocab_size], initializer=initializer)
        b = tf.get_variable('b', [config.vocab_size],
                            initializer=tf.constant_initializer(0.0))

        logits = tf.matmul(
            lstm_outputs,
            W) + b  # logits: [batch_size * padded_length, config.vocab_size]

    ###### for inference & validation only #######
    softmax = tf.nn.softmax(logits)
    preds = tf.argmax(softmax, 1)
    ##############################################

    # for training only below
    targets = tf.reshape(target_seqs, [-1])
    weights = tf.to_float(tf.reshape(input_mask, [-1]))

    # Compute losses.

    label_loss = tf.losses.sparse_softmax_cross_entropy(
        label_input, label_logits)
    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets,
                                                            logits=logits)

    explanation_loss = tf.div(tf.reduce_sum(tf.multiply(losses, weights)),
                              tf.reduce_sum(weights),
                              name="explanation_loss")
    batch_loss = (1 -
                  config.alpha) * explanation_loss + config.alpha * label_loss
    tf.contrib.losses.add_loss(batch_loss)
    total_loss = tf.contrib.losses.get_total_loss()

    # target_cross_entropy_losses = losses  # Used in evaluation.
    # target_cross_entropy_loss_weights = weights  # Used in evaluation.

    # TODO; what else should I return?

    return dict(total_loss=total_loss,
                global_step=global_step,
                image_feature=image_feature,
                input_mask=input_mask,
                target_seqs=target_seqs,
                input_seqs=input_seqs,
                final_state=final_state,
                initial_state=initial_state,
                softmax=softmax,
                preds=preds,
                keep_prob=keep_prob,
                saver=tf.train.Saver(),
                hypothesis_input=hypothesis_input,
                img_features_input=img_features_input,
                label_input=label_input,
                dropout_input=dropout_input,
                label_logits=label_logits,
                explanation_loss=explanation_loss,
                attention_output=a_hypothesis)
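A hypothetical training step using the dict returned by build_model; `config`, the ilabel2itoken mapping, the optimizer, and the dummy batches below are assumptions, not part of the source.

import numpy as np

# Assumed label-id -> token-id mapping and config object.
ilabel2itoken = {0: 5, 1: 6, 2: 7}
model = build_model(config, embeddings=None, mode="train",
                    ilabel2itoken=ilabel2itoken)
train_op = tf.train.AdamOptimizer(1e-3).minimize(
    model["total_loss"], global_step=model["global_step"])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())  # initializes the label-to-token HashTable
    _, loss_value = sess.run(
        [train_op, model["total_loss"]],
        feed_dict={
            model["hypothesis_input"]: np.zeros((8, 12), dtype=np.int32),
            model["img_features_input"]: np.zeros(
                (8, config.num_img_features, config.img_features_size),
                dtype=np.float32),
            model["label_input"]: np.zeros((8,), dtype=np.int32),
            model["dropout_input"]: 0.5,
            model["input_seqs"]: np.zeros((8, 15), dtype=np.int32),
            model["target_seqs"]: np.zeros((8, 15), dtype=np.int32),
            model["input_mask"]: np.ones((8, 15), dtype=np.int32),
            model["keep_prob"]: 0.5,
        })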
Example #6
    def call(self, premise_input, hypothesis_input, img_features_input,
             label_input, target_expl, target_length, dropout_input,
             num_labels, num_img_features, img_features_size, rnn_hidden_size,
             multimodal_fusion_hidden_size, classification_hidden_size,
             max_length):

        hypothesis_length = tf.cast(
            tf.reduce_sum(
                tf.cast(
                    tf.not_equal(
                        hypothesis_input,
                        tf.zeros_like(hypothesis_input, dtype=tf.int32)),
                    tf.int64), 1), tf.int32)

        hypothesis_embeddings = tf.nn.embedding_lookup(self.embedding_matrix,
                                                       hypothesis_input)

        hypothesis_outputs, hypothesis_final_states = tf.nn.dynamic_rnn(
            cell=self.lstm_cell,
            inputs=hypothesis_embeddings,
            sequence_length=hypothesis_length,
            dtype=tf.float32)
        normalized_img_features = tf.nn.l2_normalize(img_features_input, dim=2)

        reshaped_hypothesis = tf.reshape(
            tf.tile(hypothesis_final_states.h, [1, num_img_features]),
            [-1, num_img_features, rnn_hidden_size])
        img_hypothesis_concatenation = tf.concat(
            [normalized_img_features, reshaped_hypothesis], -1)
        gated_img_hypothesis_concatenation = tf.nn.dropout(
            gated_tanh(img_hypothesis_concatenation, rnn_hidden_size),
            keep_prob=dropout_input)
        att_wa_hypothesis = lambda x: tf.nn.dropout(
            tf.contrib.layers.fully_connected(
                x, 1, activation_fn=None, biases_initializer=None),
            keep_prob=dropout_input)
        a_hypothesis = att_wa_hypothesis(gated_img_hypothesis_concatenation)
        a_hypothesis = tf.nn.softmax(tf.squeeze(a_hypothesis))
        v_head_hypothesis = tf.squeeze(
            tf.matmul(tf.expand_dims(a_hypothesis, 1),
                      normalized_img_features))

        gated_hypothesis = tf.nn.dropout(gated_tanh(
            hypothesis_final_states.h, multimodal_fusion_hidden_size),
                                         keep_prob=dropout_input)
        v_head_hypothesis.set_shape(
            (hypothesis_embeddings.get_shape()[0], img_features_size))

        gated_img_features_hypothesis = tf.nn.dropout(gated_tanh(
            v_head_hypothesis, multimodal_fusion_hidden_size),
                                                      keep_prob=dropout_input)
        h_hypothesis_img = tf.multiply(gated_hypothesis,
                                       gated_img_features_hypothesis)

        # Features used to classify label and generate explanation
        final_concatenation = tf.concat([h_hypothesis_img], 1)

        # Classifier
        gated_first_layer = tf.nn.dropout(gated_tanh(
            final_concatenation, classification_hidden_size),
                                          keep_prob=dropout_input)
        gated_second_layer = tf.nn.dropout(gated_tanh(
            gated_first_layer, classification_hidden_size),
                                           keep_prob=dropout_input)
        gated_third_layer = tf.nn.dropout(gated_tanh(
            gated_second_layer, classification_hidden_size),
                                          keep_prob=dropout_input)

        pred_label = tf.contrib.layers.fully_connected(gated_third_layer,
                                                       num_labels,
                                                       activation_fn=None)

        # insert GRU here to generate explanations
        # expl= (bs, T, 300)

        start_token = tf.constant('<start>', dtype=tf.string)
        end_token = tf.constant('<end>', dtype=tf.string)

        batch_size = tf.shape(hypothesis_input)[0]

        #if tf.reduce_all(tf.math.equal(mode, tf.constant('teacher', dtype=tf.string))):
        # teacher forcing
        if self.mode == 'teacher':

            print("teacher")

            hidden_t = self.decoder.reset_state(batch_size=batch_size)

            batch_start_token = tf.fill([batch_size], '<start>')
            batch_end_token = tf.fill([batch_size], '<end>')
            dec_input_t = tf.expand_dims(
                self.token2id_table.lookup(batch_start_token), 1)

            all_predictions = []

            # TODO: why target_expl.shape[1] gives None?
            # replacing with max_length but bad

            #for t in range(1, tf.shape(target_expl)[1])
            #for t in tf.range(self.explanation_length_input):
            for t in range(1, max_length + 1):
                # passing the features through the decoder
                predictions, hidden_t, attention_weights = self.decoder(
                    dec_input_t, tf.expand_dims(final_concatenation, 1),
                    hidden_t)

                #prepend label
                if t == 1 and label_input is not None:
                    labels = self.id2label_table.lookup(label_input)  #(bs,)
                    dec_input_t = self.token2id_table.lookup(labels)  #(bs,)
                    dec_input_t = tf.expand_dims(dec_input_t, 1)  #(bs,1)

                # using teacher forcing
                #if t < max_length:
                elif t < max_length:
                    dec_input_t = tf.expand_dims(target_expl[:, t], 1)
                else:
                    dec_input_t = tf.expand_dims(
                        self.token2id_table.lookup(batch_end_token), 1)

                # predictions: (bs, 1, n_vocab)
                all_predictions.append(predictions)

            # all_predictions: (bs, T, n_vocab)
            all_predictions = tf.stack(all_predictions, axis=1)

            return pred_label, all_predictions

        else:
            print("forloop")
            #all_predictions = []

            ##TODO: attention shape
            #attention_features_shape = 36
            #all_attention_plots = []

            # pred_expls is a list of strings of size batch_size
            #pred_expls = [""] * batch_size
            #finished = [False] * batch_size

            #pred_expls = tf.fill([batch_size], "")
            #finished = tf.fill([batch_size], False)

            #TODO
            pred_expls = []
            pred_expls_words = []
            #finished = tf.zeros((batch_size))

            t = 0
            hidden_t = self.decoder.reset_state(batch_size=batch_size)

            batch_start_token = tf.fill([batch_size], '<start>')
            dec_input_t = tf.expand_dims(
                self.token2id_table.lookup(batch_start_token), 1)

            #TODO:
            #while t < max_length and tf.reduce_sum(finished) != batch_size:
            while t < max_length:
                t += 1

                #dec_output_t: (bs, max_vocab)
                #dec_output_t: (bs * max_length, max_vocab)
                dec_output_t, hidden_t, attention_weights = self.decoder(
                    dec_input_t, tf.expand_dims(final_concatenation, 1),
                    hidden_t)

                #predicted_id: (bs* max_length) or (bs*max_length, 1)
                predicted_id = tf.argmax(dec_output_t, axis=1)
                pred_expls.append(predicted_id)
                pred_expls_words.append(
                    self.id2token_table.lookup(predicted_id))

                # TODO
                #completed = tf.where(predicted_id == self.token2id_table.lookup(end_token))
                #finished[completed] = 1

                if t > 1:
                    #if True:
                    #dec_input_t = tf.expand_dims(predicted_id, 1)
                    dec_input_t = tf.reshape(predicted_id, [batch_size, 1])

                else:
                    out_labels = tf.argmax(pred_label, axis=1)
                    # pred_label IDs --> labels words --> embeddings
                    labels = self.id2label_table.lookup(out_labels)  #(bs,)
                    dec_input_t = self.token2id_table.lookup(labels)  #(bs,)
                    dec_input_t = tf.expand_dims(dec_input_t, 1)  #(bs,1)

                #all_predictions.append(dec_output_t)

            #all_predictions = tf.stack(all_predictions, axis=1)
            pred_expls = tf.stack(pred_expls, axis=1)
            pred_expls_words = tf.stack(pred_expls_words, axis=1)
            return pred_label, pred_expls_words
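This method depends on attributes built elsewhere in the class (self.embedding_matrix, self.lstm_cell, self.decoder, self.mode) and on three lookup tables. A hedged sketch of how those tables could be constructed, mirroring the tf.contrib.lookup.HashTable pattern from Example #5; the vocabulary, label set, and dtypes are illustrative and may need to match the int32/int64 ids actually produced at lookup time.

# Hypothetical construction of token2id_table, id2token_table, id2label_table.
vocab = ["<pad>", "<start>", "<end>", "a", "person", "standing"]   # assumed vocabulary
labels = ["neutral", "entailment", "contradiction"]                # assumed label set

token2id_table = tf.contrib.lookup.HashTable(
    tf.contrib.lookup.KeyValueTensorInitializer(
        vocab, list(range(len(vocab))),
        key_dtype=tf.string, value_dtype=tf.int64),
    default_value=-1)
id2token_table = tf.contrib.lookup.HashTable(
    tf.contrib.lookup.KeyValueTensorInitializer(
        list(range(len(vocab))), vocab,
        key_dtype=tf.int64, value_dtype=tf.string),
    default_value="<unk>")
id2label_table = tf.contrib.lookup.HashTable(
    tf.contrib.lookup.KeyValueTensorInitializer(
        list(range(len(labels))), labels,
        key_dtype=tf.int64, value_dtype=tf.string),
    default_value="<unk>")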