Example #1
    def __init__(self,
                 hidden_size=10,
                 vocab_size=42791,
                 embedding_size=50,
                 embedding_matrix=None):
        #tf.set_random_seed(1234)

        # Placeholders
        # ==================================================
        # (batch_size * max_sentence_count x max_sentence_length)
        self.sentences = tf.placeholder(tf.int32, [None, None],
                                        name="sentences")
        self.questions = tf.placeholder(tf.int32, [None, None],
                                        name="questions")
        self.labels = tf.placeholder(tf.int32, [
            None,
        ], name="labels")

        batch_size = tf.shape(self.questions)[0]
        #self.batch_size = tf.placeholder(tf.int32, name="batch_size")
        sentences = self.sentences
        questions = self.questions
        labels = self.labels

        # (batch_size * max_sentence_count x max_sentence_length)
        sentence_mask = tf.cast(sentences >= 0, tf.int32)
        self.sentence_mask = sentence_mask
        #sentence_mask = tf.sequence_mask(doc_lengths, dtype=tf.int32)
        masked_sentences = tf.mul(sentences, sentence_mask)

        max_sent_per_doc = tf.cast(
            tf.shape(sentence_mask)[0] / batch_size, tf.int32)
        self.max_sent_per_doc = max_sent_per_doc

        # Input Preparation
        # ==================================================
        with tf.variable_scope("embeddings"):
            if embedding_matrix is None:
                self.W_embeddings = tf.get_variable(shape=[vocab_size, embedding_size], \
                                               initializer=tf.random_uniform_initializer(-0.01, 0.01),\
                                               name="W_embeddings", dtype=tf.float32)
            else:
                ################## option to use pre-trained embeddings ##################
                self.W_embeddings = tf.Variable(embedding_matrix, \
                                               name="W_embeddings", dtype=tf.float32)

            # SENTENCES MASKED
            # (batch_size x max_sent_per_doc)
            batch_mask = tf.reshape(tf.reduce_max(sentence_mask, 1),
                                    [batch_size, -1])
            # (batch_size * max_sent_per_doc x 1 x 1)
            sentence_batch_mask = tf.cast(tf.reshape(batch_mask, [-1, 1, 1]),
                                          tf.float32)

            # batch_size * max_sent_per_doc x max_sentence_length x embedding_size
            sentence_embeddings = tf.gather(self.W_embeddings,
                                            masked_sentences)
            masked_sentence_embeddings = tf.mul(
                sentence_embeddings,
                tf.cast(tf.expand_dims(sentence_mask, -1), tf.float32))

            # QUERY MASKED
            # create mask (batch_size x max_question_length)
            question_mask = tf.cast(questions > 0, tf.int32)
            self.question_mask = question_mask
            masked_question = tf.mul(question_mask, questions)
            self.masked_question = masked_question

            # (batch_size x max_question_length x embedding_size)
            question_embeddings = tf.gather(self.W_embeddings, masked_question)
            self.question_embeddings = question_embeddings
            question_mask_float = tf.expand_dims(
                tf.cast(question_mask, tf.float32), -1)
            masked_question_embeddings = tf.mul(question_embeddings,
                                                question_mask_float)
            self.masked_question_embeddings = masked_question_embeddings

        # CBOW Sentence Representation
        # ==================================================
        with tf.variable_scope("sentence-representation"):

            self.forward_cell_d = GRUCell(state_size=hidden_size,
                                          input_size=embedding_size,
                                          scope="GRU-Forward-D")
            self.backward_cell_d = GRUCell(state_size=hidden_size,
                                           input_size=embedding_size,
                                           scope="GRU-Backward-D")

            self.hidden_states_d, last_state_d = bidirectional_rnn(self.forward_cell_d, self.backward_cell_d, \
                sentence_embeddings, tf.cast(sentence_mask, tf.float32), concatenate=True)

            doc_sentences = tf.reshape(last_state_d,
                                       [batch_size, -1, hidden_size * 2])
            self.cbow_sentences = doc_sentences

            # # (batch_size * max_sentence_count x embedding_size)
            # cbow_sentences = tf.reduce_mean(masked_sentence_embeddings, 1)
            # self.cbow_sentences = cbow_sentences
            # # reshape batch to (batch_size x max_doc_length x embedding_size)
            # doc_sentences = tf.reshape(cbow_sentences, [batch_size, -1, embedding_size])

        # Query Representation
        # ==================================================
        with tf.variable_scope("query-representation"):
            # easy baseline: cbow
            # (batch_size x embedding_size)

            #question_cbow = tf.reduce_mean(masked_question_embeddings, 1)
            #self.question_cbow = question_cbow

            self.forward_cell_q = GRUCell(state_size=hidden_size,
                                          input_size=embedding_size,
                                          scope="GRU-Forward-Q")
            self.backward_cell_q = GRUCell(state_size=hidden_size,
                                           input_size=embedding_size,
                                           scope="GRU-Backward-Q")

            self.hidden_states_q, last_state_q = bidirectional_rnn(self.forward_cell_q, self.backward_cell_q, \
                question_embeddings, tf.cast(question_mask, tf.float32), concatenate=True)

            self.question_cbow = last_state_q

            # can use RNN representation as well*************************************

        # Similarity Scoring
        # ==================================================
        # Using simple dot product/cosine similarity as of now (https://arxiv.org/pdf/1605.07427v1.pdf)

        with tf.variable_scope("similarity-scoring"):
            # (batch_size x max_sent_per_doc)
            #"""
            # dot_prod = tf.squeeze(tf.batch_matmul(doc_sentences, tf.expand_dims(question_cbow, -1)), [-1])
            # self.dot_prod = dot_prod
            #
            # # softmax
            # numerator = tf.exp(tf.sub(dot_prod, tf.expand_dims(tf.reduce_max(dot_prod, 1), -1))) * tf.cast(batch_mask, tf.float32)
            # denom = tf.reduce_sum(numerator, 1)
            #
            # # Dimensions (batch x time)
            # probabilities = tf.div(numerator, tf.expand_dims(denom, 1))
            #"""

            # #(batch_size x max_sent_per_doc)
            # sentence_norm = tf.sqrt(tf.reduce_sum(tf.mul(doc_sentences, doc_sentences), -1))
            # self.sentence_norm = sentence_norm
            # # (batch_size)
            # question_norm = tf.sqrt(tf.reduce_sum(tf.mul(question_cbow, question_cbow), 1))
            # self.question_norm = question_norm
            #
            # denom = tf.mul(sentence_norm, tf.expand_dims(question_norm, -1))+1e-30
            # self.denom = denom
            # # (batch_size x max_sent_per_doc) - scalars between -1 and +1
            # cosine_similarity = tf.div(dot_prod, denom)
            # self.cosine_similarity = cosine_similarity
            #
            # masked_pos_cos_sim = tf.sub(tf.add(cosine_similarity, 1), tf.cast(batch_mask < 1, tf.float32))
            # self.masked_pos_cos_sim = masked_pos_cos_sim
            # normalized_cos_sim = tf.div(masked_pos_cos_sim, tf.expand_dims(tf.reduce_sum(masked_pos_cos_sim, 1), -1))

            #"""
            attention = BilinearFunction(attending_size=hidden_size * 2,
                                         attended_size=hidden_size * 2)
            alpha_weights, attend_result = attention(self.question_cbow, attended=doc_sentences, \
                time_mask=tf.cast(batch_mask, tf.float32))
            probabilities = alpha_weights

            #"""

            #probabilities = tf.abs(dot_prod) #normalized_cos_sim
            self.probabilities = probabilities

        with tf.variable_scope("prediction"):
            one_hot_labels = tf.one_hot(labels,
                                        max_sent_per_doc,
                                        dtype=tf.float32)

            likelihoods = tf.reduce_sum(tf.mul(probabilities, one_hot_labels),
                                        1)
            log_likelihoods = tf.log(likelihoods + 1e-20)
            self.loss = tf.mul(tf.reduce_sum(log_likelihoods), -1)
            correct_vector = tf.cast(tf.equal(labels, tf.cast(tf.argmax(probabilities, 1), tf.int32)), \
                tf.float32, name="correct_vector")
            self.accuracy = tf.reduce_mean(correct_vector)
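
A minimal usage sketch for the snippet above (not part of the original): the class name `SentenceSelectionModel`, the optimizer, and the dummy batch are assumptions, since the snippet only shows `__init__`; `GRUCell`, `bidirectional_rnn`, and `BilinearFunction` are project-local helpers, and the graph targets the old TF 0.x API.

# Hypothetical usage sketch -- class name, optimizer, and data shapes are assumptions.
import numpy as np
import tensorflow as tf

batch_size, max_sent_per_doc, max_sent_len, max_ques_len = 2, 4, 7, 5
model = SentenceSelectionModel(hidden_size=10, vocab_size=42791, embedding_size=50)
train_op = tf.train.AdamOptimizer(1e-3).minimize(model.loss)

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())  # TF 0.x-style initialization
    feed = {
        # one row per sentence, grouped by document, padded with -1
        model.sentences: np.random.randint(-1, 42791, (batch_size * max_sent_per_doc, max_sent_len)).astype(np.int32),
        model.questions: np.random.randint(0, 42791, (batch_size, max_ques_len)).astype(np.int32),
        # index of the answer sentence within each document
        model.labels: np.random.randint(0, max_sent_per_doc, (batch_size,)).astype(np.int32),
    }
    _, loss, acc = sess.run([train_op, model.loss, model.accuracy], feed_dict=feed)
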
Example #2
masked_question = tf.mul(question_mask, questions)

# easy baseline: cbow
# (batch_size x max_question_length x embedding_size)
question_embeddings = tf.gather(W_embeddings, masked_question)
question_mask_float = tf.expand_dims(tf.cast(question_mask, tf.float32), -1)
masked_question_embeddings = tf.mul(question_embeddings, question_mask_float)
# (batch_size x embedding_size)
question_cbow = tf.reduce_mean(masked_question_embeddings, 1)

# can use RNN representation as well*************************************

# ATTENTION/SIMILARITY SCORING --------------------------------------------------
# Using simple dot product/cosine similarity as of now (https://arxiv.org/pdf/1605.07427v1.pdf)

attention = BilinearFunction(attending_size=embedding_size,
                             attended_size=embedding_size)
alpha_weights, attend_result = attention(attending=question_cbow, attended=doc_sentences, \
    time_mask=tf.cast(batch_mask, tf.float32))

# (batch_size x max_sent_per_doc)
dot_prod = tf.squeeze(
    tf.batch_matmul(doc_sentences, tf.expand_dims(question_cbow, -1)), [-1])

# (batch_size x max_sent_per_doc)
sentence_norm = tf.sqrt(tf.reduce_sum(tf.mul(doc_sentences, doc_sentences),
                                      -1))
# (batch_size)
question_norm = tf.sqrt(tf.reduce_sum(tf.mul(question_cbow, question_cbow), 1))

denom = tf.mul(sentence_norm, tf.expand_dims(question_norm, -1)) + 1e-30
# (batch_size x max_sent_per_doc) - scalars between -1 and +1
cosine_similarity = tf.div(dot_prod, denom)
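
As a standalone illustration of the scoring described by the comments above (not part of the original snippet), a toy NumPy sketch of the dot-product/cosine computation with made-up shapes:

# Toy NumPy sketch of the cosine scoring above (made-up shapes, not the TF graph).
import numpy as np

batch_size, max_sent_per_doc, embedding_size = 2, 3, 4
doc_sentences = np.random.rand(batch_size, max_sent_per_doc, embedding_size)
question_cbow = np.random.rand(batch_size, embedding_size)

# (batch_size x max_sent_per_doc): dot product of each sentence with its question
dot_prod = np.einsum('bse,be->bs', doc_sentences, question_cbow)

sentence_norm = np.sqrt((doc_sentences ** 2).sum(-1))   # (batch_size x max_sent_per_doc)
question_norm = np.sqrt((question_cbow ** 2).sum(1))    # (batch_size,)
denom = sentence_norm * question_norm[:, None] + 1e-30

cosine_similarity = dot_prod / denom                    # values in [-1, 1]
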
Example #3
    def __init__(self, hidden_size=128, vocab_size=42791, embedding_size=50, embedding_matrix=None, \
        embedding_trainable=False, sentence_rep_str="cbow", question_rep_str="cbow"):
        tf.set_random_seed(1234)

        # Placeholders
        # ==================================================
        # (batch_size * max_sentence_count x max_sentence_length)
        self.sentences = tf.placeholder(tf.int32, [None, None], name="sentences")
        self.questions = tf.placeholder(tf.int32, [None, None], name="questions")
        self.labels = tf.placeholder(tf.int32, [None, ], name="labels")

        attending_size = hidden_size

        # Input Preparation (Mask Creation)
        # ==================================================
        with tf.variable_scope("masks"):
            # MASK SENTENCES
            # (batch_size * max_sentence_count x max_sentence_length)
            sentence_mask = tf.cast(self.sentences >= 0, tf.int32)
            #sentence_mask = tf.sequence_mask(doc_lengths, dtype=tf.int32)
            masked_sentences = tf.mul(self.sentences, sentence_mask)

            batch_size = tf.shape(self.questions)[0]

            # (batch_size x max_sent_per_doc)
            batch_mask = tf.reshape(tf.reduce_max(sentence_mask, 1), [batch_size, -1])
            answer_counts = tf.cast(tf.reduce_sum(batch_mask, 1), tf.float32)

            # MASK QUESTIONS
            # create mask (batch_size x max_question_length)
            question_mask = tf.cast(self.questions >= 0, tf.int32)
            masked_question = tf.mul(question_mask, self.questions)
            question_mask_float = tf.expand_dims(tf.cast(question_mask, tf.float32), -1)

            max_sent_per_doc = tf.cast(tf.shape(sentence_mask)[0]/batch_size, tf.int32)
            max_sent_len = tf.shape(self.sentences)[1]
            max_ques_len = tf.shape(self.questions)[1]

            document_mask = tf.cast(tf.reshape(sentence_mask, [batch_size, max_sent_per_doc * max_sent_len]), tf.float32)

        # Embeddings
        # ==================================================
        with tf.variable_scope("embeddings"):
            if embedding_matrix is None:
                self.W_embeddings = tf.get_variable(shape=[vocab_size, embedding_size], \
                                               initializer=tf.random_uniform_initializer(-0.01, 0.01),\
                                               name="W_embeddings", dtype=tf.float32)
            else:
                ################## option to use pre-trained embeddings ##################
                self.W_embeddings = tf.Variable(embedding_matrix, \
                                               name="W_embeddings", dtype=tf.float32, trainable=embedding_trainable)

            masked_documents = tf.reshape(masked_sentences, [batch_size, max_sent_per_doc * max_sent_len])
            document_embeddings = tf.gather(self.W_embeddings, masked_documents)
            masked_document_embeddings = tf.mul(document_embeddings, tf.expand_dims(document_mask, -1))

            # (batch_size x max_question_length x embedding_size)
            question_embeddings = tf.gather(self.W_embeddings, masked_question)
            masked_question_embeddings = tf.mul(question_embeddings, question_mask_float)

        # Query Representation (CBOW or RNN)
        # ==================================================
        with tf.variable_scope("query-representation"):

            # RNN Attention on sentence embeddings -----------------------------------------
            doc_sentences = masked_document_embeddings

            question_cbow = tf.reduce_mean(masked_question_embeddings, 1)

            self.decoder_cell = GRUCell(state_size=hidden_size, input_size=embedding_size*2, scope="GRU_decoder")
            self.bilinearf = BilinearFunction(attending_size=hidden_size, attended_size=embedding_size)

            hidden_states_decoder, last_state_decoder, alpha_weights_time = \
                rnn_decoder_attention(cell=self.decoder_cell,
                                            start_state=question_cbow, # ZEROS
                                            inputs=question_embeddings,
                                            inputs_mask=tf.cast(question_mask, tf.float32),
                                            attentionf=self.bilinearf,
                                            attended=document_embeddings, #doc_sentences,
                                            attended_mask=tf.cast(document_mask, tf.float32)
                                          )
            # alpha_weights originally (batch x context_time x max_ques_len)
            alpha_weights_time_prime = tf.reshape(alpha_weights_time, [batch_size, max_sent_len, max_sent_per_doc, max_ques_len]) #
            # (batch_size, max_sent_per_doc, max_ques_len)
            alpha_weights_per_qword = tf.reduce_sum(alpha_weights_time_prime, 1)
            # (batch_size, max_sent_per_doc)
            sent_score = tf.reduce_sum(alpha_weights_per_qword, 2)

            # normalize
            self.probabilities = tf.div(sent_score, tf.reduce_sum(sent_score))

        # Loss
        # ==================================================
        with tf.variable_scope("prediction"):

            one_hot_labels = tf.one_hot(self.labels, max_sent_per_doc, dtype=tf.float32)

            likelihoods = tf.reduce_sum(tf.mul(self.probabilities, one_hot_labels), 1)
            likelihoods = tf.div(likelihoods, answer_counts)
            log_likelihoods = tf.log(likelihoods + 1e-20)
            self.loss = tf.div(tf.mul(tf.reduce_sum(log_likelihoods), -1), tf.cast(batch_size, tf.float32))
            correct_vector = tf.cast(tf.equal(self.labels, tf.cast(tf.argmax(self.probabilities, 1), tf.int32)), \
                tf.float32, name="correct_vector")
            self.accuracy = tf.reduce_mean(correct_vector)
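
The distinctive step in this variant is pooling the decoder's per-question-word attention over document word positions into one score per sentence (the reshape and the two reduce_sums above). A toy NumPy sketch of that idea, with made-up shapes; for readability the sketch groups the flattened word axis sentence-major and normalizes each document's scores separately, whereas the snippet divides by the sum over the whole batch.

# Toy NumPy sketch of the attention-pooling idea above (made-up shapes).
import numpy as np

batch_size, max_sent_per_doc, max_sent_len, max_ques_len = 2, 3, 4, 5
# one attention distribution over all document word positions per question word
alpha_weights_time = np.random.rand(batch_size, max_sent_per_doc * max_sent_len, max_ques_len)

# group the flat word axis by sentence, then sum the attention mass that falls
# inside each sentence across word positions and question words
alpha = alpha_weights_time.reshape(batch_size, max_sent_per_doc, max_sent_len, max_ques_len)
sent_score = alpha.sum(axis=(2, 3))                                 # (batch_size x max_sent_per_doc)
probabilities = sent_score / sent_score.sum(axis=1, keepdims=True)  # one distribution per document
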
Example #4
    forward_cell_q = GRUCell(state_size, input_size, scope="GRU-Forward-Q")
    backward_cell_q = GRUCell(state_size, input_size, scope="GRU-Backward-Q")

    # hidden_states_d, last_state_d = rnn(forward_cell_d, document_embedding, seq_lens_d)
    # hidden_states_q, last_state_q = rnn(forward_cell_q, question_embedding, seq_lens_q)

    hidden_states_d, last_state_d = bidirectional_rnn(forward_cell_d, backward_cell_d, \
        document_embedding, seq_lens_d, concatenate=True)

    hidden_states_q, last_state_q = bidirectional_rnn(forward_cell_q, backward_cell_q, \
        question_embedding, seq_lens_q, concatenate=True)

with tf.variable_scope("attention"):
    time_mask = tf.sequence_mask(seq_lens_d, dtype=tf.float32)
    # Attention Layer
    attention = BilinearFunction(attending_size=state_size * 2,
                                 attended_size=state_size * 2)
    alpha_weights, attend_result = attention(attending=last_state_q, attended=hidden_states_d, \
       seq_lens=seq_lens_d)

with tf.variable_scope("prediction"):
    W_predict = tf.get_variable(name="predict_weight", shape=[state_size*2, max_entities], \
        initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1))
    b_predict = tf.get_variable(name="predict_bias",
                                shape=[max_entities],
                                initializer=tf.random_normal_initializer(
                                    mean=0.0, stddev=0.1))
    # Dimensions (batch_size x state_size*2)
    prediction_probs_unnormalized = tf.matmul(attend_result,
                                              W_predict) + b_predict

    # Custom Softmax b/c need to use time_mask --------------------
    def __init__(self,
                 max_entities,
                 hidden_size=128,
                 vocab_size=50000,
                 embedding_dim=100,
                 batch_size=32):
        self.max_entities = max_entities
        tf.set_random_seed(1234)

        # Placeholders
        # can add assert statements to ensure shared None dimensions are equal (batch_size)
        self.input_d = tf.placeholder(tf.int32, [None, None], name="input_d")
        self.input_q = tf.placeholder(tf.int32, [None, None], name="input_q")
        self.input_a = tf.placeholder(tf.int32, [
            None,
        ], name="input_a")
        self.input_m = tf.placeholder(tf.int32, [
            None,
        ], name="input_m")
        self.cbow_mask = tf.placeholder(tf.float32, [None, None, None],
                                        name="cbow_mask")
        self.window_size = tf.placeholder(tf.int32, name="window_size")
        self.num_docs = tf.placeholder(tf.int32, name="num_docs")
        self.max_length = tf.placeholder(tf.int32, name="max_length")

        seq_lens_d = tf.reduce_sum(tf.cast(self.input_d >= 0, tf.int32), 1)
        seq_lens_q = tf.reduce_sum(tf.cast(self.input_q >= 0, tf.int32), 1)

        mask_d = tf.cast(tf.sequence_mask(seq_lens_d), tf.int32)
        mask_q = tf.cast(tf.sequence_mask(seq_lens_q), tf.int32)
        mask_m = tf.cast(tf.sequence_mask(self.input_m, maxlen=max_entities),
                         dtype=tf.float32)

        # Document and Query embeddings; One-hot-encoded answers
        masked_d = tf.mul(self.input_d, mask_d)
        masked_q = tf.mul(self.input_q, mask_q)
        one_hot_a = tf.one_hot(self.input_a, self.max_entities)

        # Building Graph (Network Layers)
        # ==================================================
        with tf.device('/cpu:0'), tf.variable_scope("embedding"):
            W_embeddings = tf.get_variable(shape=[vocab_size, embedding_dim], \
                                           initializer=tf.random_uniform_initializer(-0.01, 0.01),\
                                           name="W_embeddings")
            ################## Make option to use pre-trained embeddings ##################

            # Dimensions: batch x max_length x embedding_dim
            document_embedding = tf.gather(W_embeddings, masked_d)
            question_embedding = tf.gather(W_embeddings, masked_q)

            #document_embedding = tf.reshape(document_embedding, shape = [self.num_docs, self.window_size, self.max_length / self.window_size, embedding_dim])
            #document_cbow = tf.reduce_mean(document_embedding, 1)

            # Experimental aspect. Combining the CBOW of a sequence of 20 words in every document.
            #document_cbow = tf.batch_matmul(self.cbow_mask, document_embedding)

        with tf.variable_scope("bidirection_rnn"):
            #seq_lens_cbow = tf.reshape(tf.div(seq_lens_d, self.window_size), [-1])

            # float32 masks for the document and question bidirectional RNNs
            mask_d = tf.cast(tf.sequence_mask(seq_lens_d), tf.float32)
            mask_q = tf.cast(tf.sequence_mask(seq_lens_q), tf.float32)

            # Bidirectional RNNs for Document and Question
            forward_cell_d = GRUCell(state_size=hidden_size,
                                     input_size=embedding_dim,
                                     scope="GRU-Forward-D")
            backward_cell_d = GRUCell(state_size=hidden_size,
                                      input_size=embedding_dim,
                                      scope="GRU-Backward-D")

            forward_cell_q = GRUCell(state_size=hidden_size,
                                     input_size=embedding_dim,
                                     scope="GRU-Forward-Q")
            backward_cell_q = GRUCell(state_size=hidden_size,
                                      input_size=embedding_dim,
                                      scope="GRU-Backward-Q")

            hidden_states_d, last_state_d = bidirectional_rnn(forward_cell_d, backward_cell_d, \
                document_embedding, mask_d, concatenate=True)

            hidden_states_q, last_state_q = bidirectional_rnn(forward_cell_q, backward_cell_q, \
                question_embedding, mask_q, concatenate=True)

        with tf.variable_scope("attention"):
            # Attention Layer
            attention = BilinearFunction(attending_size=hidden_size * 2,
                                         attended_size=hidden_size * 2)
            self.alpha_weights, self.attend_result = attention(attending=last_state_q, attended=hidden_states_d, \
                time_mask=mask_d)

        with tf.variable_scope("prediction"):
            W_predict = tf.get_variable(name="predict_weight", shape=[hidden_size*2, self.max_entities], \
                initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1))
            b_predict = tf.get_variable(
                name="predict_bias",
                shape=[self.max_entities],
                initializer=tf.constant_initializer(0.0))
            # Dimensions (batch_size x max_entities)
            predict_probs = (tf.matmul(self.attend_result, W_predict) +
                             b_predict) * mask_m

            # Custom Softmax b/c need to use time_mask --------------------
            # Also numerical stability:

            # e_x = exp(x - x.max(axis=1))
            # out = e_x / e_x.sum(axis=1)
            numerator = tf.exp(
                tf.sub(predict_probs,
                       tf.expand_dims(tf.reduce_max(predict_probs, 1),
                                      -1))) * mask_m
            denom = tf.reduce_sum(numerator, 1)

            # Transpose so broadcasting scalar division works properly
            # Dimensions (batch x max_entities)
            #self.predict_probs_normalized = tf.transpose(tf.div(tf.transpose(numerator), denom))
            predict_probs_normalized = tf.div(numerator,
                                              tf.expand_dims(denom, 1))
            likelihoods = tf.reduce_sum(
                tf.mul(predict_probs_normalized, one_hot_a), 1)
            log_likelihoods = tf.log(likelihoods + 1e-20)
            # Negative log-likelihood loss
            self.loss = tf.mul(tf.reduce_sum(log_likelihoods), -1)
            correct_vector = tf.cast(tf.equal(tf.argmax(one_hot_a, 1), tf.argmax(predict_probs_normalized, 1)), \
                tf.float32, name="correct_vector")
            self.accuracy = tf.reduce_mean(correct_vector)
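
A standalone NumPy sketch (not from the original) of the masked, numerically stable softmax used in the prediction block above: subtract the per-row maximum before exponentiating, zero out padded entity slots with the mask, then normalize.

# Toy NumPy version of the masked softmax in the prediction block above.
import numpy as np

def masked_softmax(logits, mask):
    # logits, mask: (batch_size x max_entities); mask is 1.0 for valid entities, 0.0 for padding
    shifted = logits - logits.max(axis=1, keepdims=True)   # numerical stability
    e_x = np.exp(shifted) * mask                           # drop padded entity slots
    return e_x / e_x.sum(axis=1, keepdims=True)

probs = masked_softmax(np.array([[2.0, 1.0, -1.0]]), np.array([[1.0, 1.0, 0.0]]))
print(probs)  # probability mass sums to 1 over the unmasked entities only
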
Example #6
    def __init__(self, hidden_size=128, vocab_size=65011, embedding_size=50, embedding_matrix=None, \
        embedding_trainable=False, sentence_rep_str="cbow", question_rep_str="cbow"):
        tf.set_random_seed(1234)

        # Placeholders
        # ==================================================
        # (batch_size * max_sentence_count x max_sentence_length)
        self.sentences = tf.placeholder(tf.int32, [None, None], name="sentences")
        self.questions = tf.placeholder(tf.int32, [None, None], name="questions")
        self.labels = tf.placeholder(tf.int32, [None, ], name="labels")

        # initialize dimension variables based on constructor arguments
        if sentence_rep_str == "cbow":
            attended_size = embedding_size
        elif sentence_rep_str == "rnn":
            attended_size = hidden_size*2
        else:
            raise ValueError("Invalid `sentence_rep_str` argument; choose 'cbow' or 'rnn'.")

        if question_rep_str == "cbow":
            attending_size = embedding_size
        elif question_rep_str == "rnn":
            attending_size = hidden_size*2
        else:
            raise ValueError("Invalid `question_rep_str` argument; choose 'cbow' or 'rnn'.")

        # Input Preparation (Mask Creation)
        # ==================================================
        with tf.variable_scope("masks"):
            # MASK SENTENCES
            # (batch_size * max_sentence_count x max_sentence_length)
            sentence_mask = tf.cast(self.sentences >= 0, tf.int32)
            #sentence_mask = tf.sequence_mask(doc_lengths, dtype=tf.int32)
            masked_sentences = tf.mul(self.sentences, sentence_mask)

            batch_size = tf.shape(self.questions)[0]

            # RESHAPE SENTENCE MASK
            # (batch_size x max_sent_per_doc)
            batch_mask = tf.reshape(tf.reduce_max(sentence_mask, 1), [batch_size, -1])
            answer_counts = tf.cast(tf.reduce_sum(batch_mask, 1), tf.float32)
            # (batch_size * max_sent_per_doc x 1 x 1)
            sentence_batch_mask = tf.cast(tf.reshape(batch_mask, [-1, 1, 1]), tf.float32)

            # MASK QUESTIONS
            # create mask (batch_size x max_question_length)
            question_mask = tf.cast(self.questions >= 0, tf.int32)
            masked_question = tf.mul(question_mask, self.questions)
            question_mask_float = tf.expand_dims(tf.cast(question_mask, tf.float32), -1)

            max_sent_per_doc = tf.cast(tf.shape(sentence_mask)[0]/batch_size, tf.int32)

        # Embeddings
        # ==================================================
        with tf.variable_scope("embeddings"):
            if embedding_matrix is None:
                self.W_embeddings = tf.get_variable(shape=[vocab_size, embedding_size], \
                                               initializer=tf.random_uniform_initializer(-0.01, 0.01),\
                                               name="W_embeddings", dtype=tf.float32)
            else:
                ################## option to use pre-trained embeddings ##################
                self.W_embeddings = tf.Variable(embedding_matrix, \
                                               name="W_embeddings", dtype=tf.float32, trainable=embedding_trainable)

            # batch_size * max_sent_per_doc x max_sentence_length x embedding_size
            sentence_embeddings = tf.gather(self.W_embeddings, masked_sentences)
            masked_sentence_embeddings = tf.mul(sentence_embeddings, tf.cast(tf.expand_dims(sentence_mask, -1), tf.float32))

            # (batch_size x max_question_length x embedding_size)
            question_embeddings = tf.gather(self.W_embeddings, masked_question)
            masked_question_embeddings = tf.mul(question_embeddings, question_mask_float)

        # Sentence Representation (CBOW or RNN)
        # ==================================================
        with tf.variable_scope("sentence-representation"):

            # CBOW -----------------------------------------
            if sentence_rep_str == "cbow":
                # (batch_size * max_sentence_count x embedding_size)
                cbow_sentences = tf.reduce_mean(masked_sentence_embeddings, 1)
                # reshape batch to (batch_size x max_doc_length x embedding_size)
                doc_sentences = tf.reshape(cbow_sentences, [batch_size, -1, embedding_size])

                self.sentence_representation = cbow_sentences

            # RNN -----------------------------------------
            elif sentence_rep_str == "rnn":
                self.forward_cell_d = GRUCell(state_size=hidden_size, input_size=embedding_size, scope="GRU-Forward-D")
                self.backward_cell_d = GRUCell(state_size=hidden_size, input_size=embedding_size, scope="GRU-Backward-D")

                self.hidden_states_d, last_state_d = bidirectional_rnn(self.forward_cell_d, self.backward_cell_d, \
                    sentence_embeddings, tf.cast(sentence_mask, tf.float32), concatenate=True)

                doc_sentences = tf.reshape(last_state_d, [batch_size, -1, hidden_size*2])

                self.sentence_representation = last_state_d

        # Query Representation (CBOW or RNN)
        # ==================================================
        with tf.variable_scope("query-representation"):

            # CBOW -----------------------------------------
            if question_rep_str == "cbow":
                # (batch_size x embedding_size)
                question_cbow = tf.reduce_mean(masked_question_embeddings, 1)
                self.question_representation = question_cbow

            # RNN -----------------------------------------
            elif question_rep_str == "rnn":
                self.forward_cell_q = GRUCell(state_size=hidden_size, input_size=embedding_size, scope="GRU-Forward-Q")
                self.backward_cell_q = GRUCell(state_size=hidden_size, input_size=embedding_size, scope="GRU-Backward-Q")

                self.hidden_states_q, last_state_q = bidirectional_rnn(self.forward_cell_q, self.backward_cell_q, \
                    question_embeddings, tf.cast(question_mask, tf.float32), concatenate=True)

                #tf.reduce_mean(self.hidden_states_q, )

                self.question_representation = last_state_q


        # Similarity Scoring
        # ==================================================
        # Using simple dot product/cosine similarity as of now (https://arxiv.org/pdf/1605.07427v1.pdf)

        with tf.variable_scope("similarity-scoring"):

            # (batch_size x max_sent_per_doc)
            attention = BilinearFunction(attending_size=attending_size, attended_size=attended_size)
            alpha_weights, attend_result = attention(self.question_representation, attended=doc_sentences, \
                 time_mask=tf.cast(batch_mask, tf.float32))
            self.probabilities = alpha_weights

        # Loss
        # ==================================================
        with tf.variable_scope("prediction"):

            one_hot_labels = tf.one_hot(self.labels, max_sent_per_doc, dtype=tf.float32)

            likelihoods = tf.reduce_sum(tf.mul(self.probabilities, one_hot_labels), 1)
            likelihoods = tf.div(likelihoods, answer_counts)
            log_likelihoods = tf.log(likelihoods + 1e-20)
            self.loss = tf.div(tf.mul(tf.reduce_sum(log_likelihoods), -1), tf.cast(batch_size, tf.float32))
            self.correct_vector = tf.cast(tf.equal(self.labels, tf.cast(tf.argmax(self.probabilities, 1), tf.int32)), tf.float64, name="correct_vector")
            self.predict_labels = tf.argmax(self.probabilities, 1)
            self.accuracy = tf.reduce_mean(self.correct_vector)
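
A minimal construction sketch for the snippet above (not part of the original): the class name `SentenceSelector` and the pre-trained matrix are assumptions, since the snippet only shows `__init__`. The `sentence_rep_str`/`question_rep_str` switches decide whether the bilinear attention operates on embedding_size (cbow) or 2*hidden_size (rnn) vectors.

# Hypothetical construction sketch -- class name and embedding matrix are assumptions.
import numpy as np

pretrained = np.random.uniform(-0.01, 0.01, (65011, 50)).astype(np.float32)

model = SentenceSelector(hidden_size=128,
                         vocab_size=65011,
                         embedding_size=50,
                         embedding_matrix=pretrained,   # pre-trained vectors instead of random init
                         embedding_trainable=False,     # keep them frozen
                         sentence_rep_str="rnn",        # bi-GRU sentence encoder (attended_size = 256)
                         question_rep_str="cbow")       # mean-pooled question (attending_size = 50)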