    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final answer-class distribution.

        Defines:
          self.logits_class, self.probdist_class: Both tensors shape (batch_size, 4).
            logits_class holds the values fed into the softmax function, and probdist_class is the
            resulting probability distribution over the four answer classes (each row sums to 1).
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)
        
        if self.FLAGS.attention_type == 'dot_product':
            print("<<<<<<<< Adding dot_poduct attention >>>")         
            attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2)
    
            # Concat attn_output to context_hiddens to get blended_reps
            blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4)
        
        elif self.FLAGS.attention_type == 'self_attention':
            print("<<<<<<<<< Adding Self attention over basic attention >>>>>>>")
            basic_attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            _, basic_attn_output = basic_attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2)
            
            self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.self_attn_zsize, self.FLAGS.hidden_size*2)
            _, self_attn_output = self_attn_layer.build_graph(basic_attn_output, self.context_mask)
            concated_basic_self = tf.concat([basic_attn_output,self_attn_output], axis=2) #(bs,N,4h)
            
            self_attn_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            blended_reps = self_attn_encoder.build_graph(concated_basic_self, self.context_mask, scope_name="self_attn_encoder") # (batch_size, N, hidden_size*2)
        
        elif self.FLAGS.attention_type == 'bidaf':
            print("<<<<<<<<< Adding BIDAF attention >>>>>>>")
            attn_layer = BidafAttn(self.keep_prob, self.FLAGS.hidden_size*2)
            c2q_attention, q2c_attention = attn_layer.build_graph(context_hiddens, question_hiddens, self.qn_mask, self.context_mask)
            
            # Combine tensors to get the final output
            body_c2q_attention_mult = context_hiddens*c2q_attention # (batch_size, num_keys(N), 2h)
            q2c_expanded = tf.expand_dims(q2c_attention, 1) #(bs,1,2h)
            body_q2c_attention_mult = context_hiddens*q2c_expanded # (batch_size, num_keys(N), 2h)
            blended_reps = tf.concat([c2q_attention, body_c2q_attention_mult, body_q2c_attention_mult], axis=2) #(bs,N,6h) # context_hiddens removed
            blended_reps = tf.nn.dropout(blended_reps, self.keep_prob)
        
        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size)
        
        with vs.variable_scope("ClassProb"):
            softmax_layer_class = CustomSimpleSoftmaxLayer()
            
            # Both have dimensions: shape (batch_size, 4)
            self.logits_class, self.probdist_class =  softmax_layer_class.build_graph(blended_reps_final, self.context_mask, self.FLAGS.reduction_type)
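# --- Illustrative sketch (not part of the original model) ---
# The docstrings above note that the logits are set to "-large" in the pad
# locations before the softmax, so padded positions receive ~zero probability
# and do not contribute to the cross-entropy loss. Below is a minimal sketch of
# such a masked softmax, assuming TensorFlow 1.x as in the surrounding code;
# this helper is an illustration, not necessarily the project's own
# SimpleSoftmaxLayer / masked_softmax implementation.
import tensorflow as tf

def masked_softmax_sketch(logits, mask):
    """logits, mask: same shape; mask is 1 at real tokens, 0 at padding.
    Returns the masked logits and the probability distribution over the last axis."""
    exp_mask = (1.0 - tf.cast(mask, tf.float32)) * (-1e30)  # -large where padded
    masked_logits = logits + exp_mask
    prob_dist = tf.nn.softmax(masked_logits)                # normalizes over the last axis
    return masked_logits, prob_dist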
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """
        print("Building SelfAttention Model")
        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers=self.FLAGS.num_layers, mode=self.FLAGS.rnn_cell)
        context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        bidaf_layer = BidirectionAttn(self.keep_prob, self.FLAGS.hidden_size)
        self.c2q_attn_dist, self.q2c_attn_dist, bidaf_output = bidaf_layer.build_graph(
          question_hiddens,
          self.qn_mask,
          context_hiddens,
          self.context_mask)

        # attn_output is shape (batch_size, context_len, hidden_size*6)
        bidaf_output = tf.concat([context_hiddens, bidaf_output], axis=2) # bs, c_l, 8h
        self_attn_layer = SelfAttn(self.keep_prob, 8 * self.FLAGS.hidden_size, self.FLAGS.selfattn_size)
        self.self_attn_dist, self_attn_output = self_attn_layer.build_graph(bidaf_output, self.context_mask) # batch_size, context_len, 2 * hidden_size

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([bidaf_output, self_attn_output], axis=2) # (batch_size, context_len, hidden_size*10)

        self_attention_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers= 2 * self.FLAGS.num_layers, name="AttentionEncoder")
        blended_reps = self_attention_encoder.build_graph(blended_reps, self.context_mask) # batch_size, context_len, hidden_size * 2

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        # blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps, self.context_mask)

        end_pointer = tf.concat([tf.expand_dims(self.probdist_start, -1), blended_reps], axis=2)
        # end_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers= self.FLAGS.num_layers, name="EndEncoder")
        # end_pointer = end_encoder.build_graph(end_pointer, self.context_mask)
        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(end_pointer, self.context_mask)
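# --- Illustrative sketch (not part of the original model) ---
# BidirectionAttn above produces context-to-question (C2Q) and question-to-
# context (Q2C) attention in the BiDAF style. Below is a rough sketch of that
# computation under simplifying assumptions: a plain dot-product similarity
# instead of the trilinear function from the BiDAF paper, no masking of context
# positions, and TensorFlow 1.x as in the surrounding code. It is not the
# project's BidirectionAttn.
import tensorflow as tf

def bidaf_attention_sketch(context_hiddens, question_hiddens, qn_mask):
    """context_hiddens: (bs, N, 2h); question_hiddens: (bs, M, 2h); qn_mask: (bs, M)."""
    sim = tf.matmul(context_hiddens, question_hiddens, transpose_b=True)     # (bs, N, M)
    qn_bias = (1.0 - tf.cast(tf.expand_dims(qn_mask, 1), tf.float32)) * (-1e30)
    c2q_dist = tf.nn.softmax(sim + qn_bias)                                  # over question positions
    c2q = tf.matmul(c2q_dist, question_hiddens)                              # (bs, N, 2h)
    # Q2C: take each context word's best match in the question, softmax over
    # context positions, and pool the context hidden states with that weight.
    best_match = tf.reduce_max(sim + qn_bias, axis=2)                        # (bs, N)
    q2c_dist = tf.nn.softmax(best_match)                                     # (bs, N)
    q2c = tf.squeeze(tf.matmul(tf.expand_dims(q2c_dist, 1), context_hiddens), axis=1)  # (bs, 2h)
    return c2q, q2c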
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """
        print("Building Pointer Model")
        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size,
                             self.keep_prob,
                             num_layers=self.FLAGS.num_layers,
                             mode=self.FLAGS.rnn_cell)
        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states

        # BIDAF LAYER
        bidaf_layer = BidirectionAttn(self.keep_prob, self.FLAGS.hidden_size)
        _, _, bidaf_output = bidaf_layer.build_graph(question_hiddens,
                                                     self.qn_mask,
                                                     context_hiddens,
                                                     self.context_mask)
        # attn_output is shape (batch_size, context_len, hidden_size*6)
        bidaf_output = tf.concat([context_hiddens, bidaf_output],
                                 axis=2)  # bs, c_l, 8h

        #SELF ATTENTION LAYER
        self_attn_layer = SelfAttn(self.keep_prob, 8 * self.FLAGS.hidden_size,
                                   self.FLAGS.selfattn_size)
        _, self_attn_output = self_attn_layer.build_graph(
            bidaf_output,
            self.context_mask)  # batch_size, context_len, 8 * hidden_size

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat(
            [bidaf_output, self_attn_output],
            axis=2)  # (batch_size, context_len, hidden_size*16)
        self_attention_encoder = RNNEncoder(self.FLAGS.hidden_size,
                                            self.keep_prob,
                                            num_layers=self.FLAGS.num_layers,
                                            name="AttentionEncoder")
        blended_reps = self_attention_encoder.build_graph(
            blended_reps,
            self.context_mask)  # batch_size, context_len, hidden_size * 2

        # MODELING LAYER
        modeling_encoder = RNNEncoder(self.FLAGS.hidden_size,
                                      self.keep_prob,
                                      num_layers=self.FLAGS.num_layers,
                                      name="ModelingEncoder")
        modeling_output = modeling_encoder.build_graph(blended_reps,
                                                       self.context_mask)
        modeling_encoder_two = RNNEncoder(self.FLAGS.hidden_size,
                                          self.keep_prob,
                                          num_layers=self.FLAGS.num_layers,
                                          name="ModelingEncoder2")
        modeling_output_two = modeling_encoder_two.build_graph(
            modeling_output, self.context_mask)

        total_reps_start = tf.concat([blended_reps, modeling_output], axis=2)
        total_reps_end = tf.concat([blended_reps, modeling_output_two], axis=2)

        # OUTPUT LAYER
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                total_reps_start, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                total_reps_end, self.context_mask)
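# --- Illustrative sketch (not part of the original model) ---
# The SelfAttn layer above lets every context position attend over all other
# positions of the same (already question-aware) context representation. Below
# is a minimal dot-product self-attention sketch, assuming TensorFlow 1.x; the
# project's SelfAttn, with its selfattn_size hyper-parameter, may well use an
# additive or gated scoring function instead.
import tensorflow as tf

def self_attention_sketch(reps, mask):
    """reps: (bs, N, d); mask: (bs, N) with 1 at real tokens, 0 at padding."""
    scores = tf.matmul(reps, reps, transpose_b=True)                        # (bs, N, N)
    key_bias = (1.0 - tf.cast(tf.expand_dims(mask, 1), tf.float32)) * (-1e30)
    attn_dist = tf.nn.softmax(scores + key_bias)                            # over key positions
    attn_output = tf.matmul(attn_dist, reps)                                # (bs, N, d)
    return attn_dist, attn_output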
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                               self.FLAGS.hidden_size * 2)
        _, attn_output = attn_layer.build_graph(
            question_hiddens, self.qn_mask, context_hiddens
        )  # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat(
            [context_hiddens, attn_output],
            axis=2)  # (batch_size, context_len, hidden_size*4)

        #Applying Self attention to blended_reps
        self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 4)
        self_attn_output = self_attn_layer.build_graph(blended_reps,
                                                       self.context_mask)

        # Concat self_attn_output to blended_reps to get attn_reps
        attn_reps = tf.concat([blended_reps, self_attn_output], axis=2)

        #Feeding attn_reps to a Bidirectional GRU to get h
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob,
                             "RNNEncoder2")
        h = encoder.build_graph(attn_reps, self.context_mask)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(
            h, num_outputs=self.FLAGS.hidden_size
        )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                blended_reps_final, self.context_mask)
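# --- Illustrative sketch (not part of the original model) ---
# BasicAttn above is the baseline dot-product attention: every context hidden
# state attends over the question hidden states and collects a weighted sum of
# them, which is then concatenated onto context_hiddens. Below is a minimal
# sketch, assuming TensorFlow 1.x; the real BasicAttn also applies dropout to
# the attention output.
import tensorflow as tf

def basic_attn_sketch(context_hiddens, question_hiddens, qn_mask):
    """context_hiddens: (bs, N, 2h); question_hiddens: (bs, M, 2h); qn_mask: (bs, M)."""
    scores = tf.matmul(context_hiddens, question_hiddens, transpose_b=True)  # (bs, N, M)
    qn_bias = (1.0 - tf.cast(tf.expand_dims(qn_mask, 1), tf.float32)) * (-1e30)
    attn_dist = tf.nn.softmax(scores + qn_bias)            # softmax over question positions
    attn_output = tf.matmul(attn_dist, question_hiddens)   # (bs, N, 2h)
    return attn_dist, attn_output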
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.
        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        if self.FLAGS.model == "baseline" :
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        elif self.FLAGS.model == "bidaf" or self.FLAGS.model == "bidaf_dynamic" or self.FLAGS.model=="bidaf_self_attn" or self.FLAGS.model=="bidaf_dynamic_self_attn":
            print("INSIDE the BIDAF model")
            encoder = RNNEncoder_LSTM(self.FLAGS.hidden_size, self.keep_prob)
        elif self.FLAGS.model == "coatt" or self.FLAGS.model == "coatt_dynamic" or self.FLAGS.model=="coatt_dynamic_self_attn":
            encoder = LSTMEncoder(self.FLAGS.hidden_size, self.keep_prob)

        if self.FLAGS.model != "coatt" and self.FLAGS.model != "coatt_dynamic" and self.FLAGS.model!="coatt_dynamic_self_attn":
            context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
            question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)

        # Attention model
        # Use context hidden states to attend to question hidden states
        if self.FLAGS.model == "baseline" :
            attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
            _,attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)
            # Concat attn_output to context_hiddens to get blended_reps
            blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)
            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
            blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final,self.context_mask)

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with vs.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final,self.context_mask)

        # Attention model
        # Use context hidden states to attend to question hidden states
        if self.FLAGS.model == "coatt" :
            #context_hiddens = encoder.build_graph(self.context_embs, self.context_mask, "context") # (batch_size, context_len, hidden_size*2)
            #question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask, "question") # (batch_size, question_len, hidden_size*2)
            context_hiddens, question_hiddens = encoder.build_graph1(self.context_embs, self.qn_embs, self.context_mask, self.qn_mask)

            attn_layer = CoAttention(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
            blended_reps_final = attn_output
            #blended_reps = tf.concat([context_hiddens, attn_output], axis=2)
            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
            #blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final,self.context_mask)

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with vs.variable_scope("EndDist"):
                contextLen = tf.reduce_sum(self.context_mask, axis=1)
                cell = tf.contrib.rnn.LSTMBlockCell(2 * self.FLAGS.hidden_size)
                (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(cell, cell, attn_output, contextLen, dtype = tf.float32)
                U_1 = tf.concat([fw_out, bw_out], axis=2)
                out = tf.nn.dropout(U_1, self.keep_prob)
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(out,self.context_mask)


        elif self.FLAGS.model =="bidaf"  or self.FLAGS.model=="bidaf_self_attn":
            attn_layer = BiDafAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            attn_output_tmp = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*8)
            # This produces a set of query-aware feature vectors for each word in the context
            #blended_reps = attn_output  #(batch_size, num_keys, 4*value_vec_size)

            if self.FLAGS.model == "bidaf_self_attn":
                self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8)
                _,self_attn_output = self_attn_layer.build_graph(attn_output_tmp, self.context_mask) #(batch_size, context_len, 8*hidden_size)
                attn_output = tf.concat([attn_output_tmp, self_attn_output], axis=2) #(batch_size, context_len, 16*hidden_size)
            else:
                attn_output = attn_output_tmp


            # In BiDAF the attention output is fed to a modeling layer.
            # The modeling layer is a 2-layer LSTM.
            mod_layer = MODEL_LAYER_BIDAF(self.FLAGS.hidden_size, self.keep_prob)
            mod_layer_out = mod_layer.build_graph(attn_output, self.context_mask)  # (batch_size, context_len, hidden_size*2)
            blended_reps_start = tf.concat([attn_output,mod_layer_out], axis=2)  # (batch_size, context_len, hidden_size*10)


            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_start, self.context_mask)



            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)


            with vs.variable_scope("EndDist"):
                # Concatenate the start logits with the modelling layer output to get the input to the
                # end word lstm
                #self.logits_start has a shape of #(batch_size, context_len)
                logits_start_expand = tf.expand_dims(self.logits_start, axis=2) #(batch_size, context_len, 1)
                end_lstm_input = tf.concat([logits_start_expand, mod_layer_out], axis=2) #(batch_size, context_len, 1 + hidden_size*2)

                # LSTM
                end_layer = END_WORD_LAYER(self.FLAGS.hidden_size, self.keep_prob)
                blended_reps_end = end_layer.build_graph(end_lstm_input, self.context_mask)

                blended_reps_end_final = tf.concat([attn_output, blended_reps_end], axis=2)
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_end_final, self.context_mask)

        elif self.FLAGS.model =="bidaf_dynamic" or self.FLAGS.model =="bidaf_dynamic_self_attn":
            attn_layer = BiDafAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            attn_output_tmp = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # attn_output is shape (batch_size, context_len, hidden_size*8)

            if self.FLAGS.model == "bidaf_dynamic_self_attn":
                self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8)
                _,self_attn_output = self_attn_layer.build_graph(attn_output_tmp,self.context_mask)  # (batch_size, context_len, 8*hidden_size)
                attn_output = tf.concat([attn_output_tmp, self_attn_output], axis=2) #(batch_size, context_len, 16*hidden_size)
            else:
                attn_output = attn_output_tmp

            # This produces a set of query-aware feature vectors for each word in the context
            #blended_reps = attn_output  #(batch_size, num_keys, 4*value_vec_size)

            # In BiDAF the attention output is fed to a modeling layer.
            # The modeling layer is a 2-layer LSTM.
            mod_layer = MODEL_LAYER_BIDAF(self.FLAGS.hidden_size, self.keep_prob)
            mod_layer_out = mod_layer.build_graph(attn_output, self.context_mask)  # (batch_size, context_len, hidden_size*2)
            blended_reps_start = tf.concat([attn_output,mod_layer_out], axis=2)  # (batch_size, context_len, hidden_size*10)

            # We now feed this to the dynamic decoder module implemented in ANSWER_DECODER.
            # The outputs of the decoder are start, end, alpha_logits and beta_logits.
            # start and end have shape (batch_size, num_iterations);
            # alpha_logits and beta_logits have shape (batch_size, num_iterations, input_dim).
            decoder = ANSWER_DECODER(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.num_iterations, self.FLAGS.max_pool, self.FLAGS.batch_size)

            u_s_init = mod_layer_out[:,0,:]
            u_e_init = mod_layer_out[:,0,:]
            start_location, end_location, alpha_logits, beta_logits = decoder.build_graph(mod_layer_out, self.context_mask, u_s_init, u_e_init)


            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                #softmax_layer_start = SimpleSoftmaxLayer()
                logits_start_tmp = [masked_softmax(logits, self.context_mask,1) for logits in alpha_logits]
                self.alpha_logits , alpha_logits_probs = zip(*logits_start_tmp)
                self.logits_start, self.probdist_start = self.alpha_logits[self.FLAGS.num_iterations -1], alpha_logits_probs[self.FLAGS.num_iterations -1]

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)


            with vs.variable_scope("EndDist"):
                logits_end_tmp = [masked_softmax(logits, self.context_mask,1) for logits in beta_logits]
                self.beta_logits , beta_logits_probs = zip(*logits_end_tmp)
                self.logits_end, self.probdist_end = self.beta_logits[self.FLAGS.num_iterations -1], beta_logits_probs[self.FLAGS.num_iterations -1]

        elif self.FLAGS.model =="coatt_dynamic" or self.FLAGS.model == "coatt_dynamic_self_attn":
            context_hiddens, question_hiddens = encoder.build_graph1(self.context_embs, self.qn_embs, self.context_mask, self.qn_mask)

            attn_layer = CoAttention(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)

            if self.FLAGS.model == "coatt_dynamic_self_attn":
                CoATT = attn_layer.build_graph1(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
                self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8)
                _, self_attn_output = self_attn_layer.build_graph(CoATT, self.context_mask)  # (batch_size, context_len, 8*hidden_size)
                attn_output = tf.concat([CoATT, self_attn_output], axis=2) #(batch_size, context_len, 16*hidden_size)
            else:
                U = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
                attn_output = U
            #blended_reps = tf.concat([context_hiddens, attn_output], axis=2)
            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
            decoder = ANSWER_DECODER(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.num_iterations, self.FLAGS.max_pool, self.FLAGS.batch_size)

            u_s_init = attn_output[:,0,:]
            u_e_init = attn_output[:,0,:]
            start_location, end_location, alpha_logits, beta_logits = decoder.build_graph(attn_output, self.context_mask, u_s_init, u_e_init)


            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                #softmax_layer_start = SimpleSoftmaxLayer()
                logits_start_tmp = [masked_softmax(logits, self.context_mask,1) for logits in alpha_logits]
                self.alpha_logits , alpha_logits_probs = zip(*logits_start_tmp)
                self.logits_start, self.probdist_start = self.alpha_logits[self.FLAGS.num_iterations -1], alpha_logits_probs[self.FLAGS.num_iterations -1]

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)


            with vs.variable_scope("EndDist"):
                logits_end_tmp = [masked_softmax(logits, self.context_mask,1) for logits in beta_logits]
                self.beta_logits , beta_logits_probs = zip(*logits_end_tmp)
                self.logits_end, self.probdist_end = self.beta_logits[self.FLAGS.num_iterations -1], beta_logits_probs[self.FLAGS.num_iterations -1]
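# --- Illustrative sketch (not part of the original model) ---
# The CoAttention layer used in the "coatt" branches follows the Dynamic
# Coattention Network idea: build an affinity matrix between context and
# question, attend in both directions, and carry a second-level "coattention"
# summary back to every context position. Below is a rough sketch under
# simplifying assumptions (no sentinel vectors, no masking, TensorFlow 1.x);
# the project's CoAttention / build_graph1 will differ in detail.
import tensorflow as tf

def coattention_sketch(context_hiddens, question_hiddens):
    """context_hiddens: (bs, N, 2h); question_hiddens: (bs, M, 2h)."""
    L = tf.matmul(context_hiddens, question_hiddens, transpose_b=True)  # affinity (bs, N, M)
    A_c = tf.nn.softmax(L)                             # per context word, over question words
    A_q = tf.nn.softmax(tf.transpose(L, [0, 2, 1]))    # per question word, over context words
    C_q = tf.matmul(A_q, context_hiddens)              # (bs, M, 2h) context summaries per question word
    # Second-level attention: bring the question and its context summaries
    # back to every context position.
    coatt = tf.matmul(A_c, tf.concat([question_hiddens, C_q], axis=2))  # (bs, N, 4h)
    return tf.concat([context_hiddens, coatt], axis=2)                  # (bs, N, 6h)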
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        if self.FLAGS.self_attention:
            encoder = RNNEncoder(self.FLAGS.hidden_size_encoder,
                                 self.keep_prob)
        else:
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)

        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        if self.FLAGS.simple_attention:
            attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                                   self.FLAGS.hidden_size * 2)
            _, attn_output = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens
            )  # attn_output is shape (batch_size, context_len, hidden_size*2)

            # Concat attn_output to context_hiddens to get blended_reps
            blended_reps = tf.concat(
                [context_hiddens, attn_output],
                axis=2)  # (batch_size, context_len, hidden_size*4)

            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
            blended_reps_final = tf.contrib.layers.fully_connected(
                blended_reps, num_outputs=self.FLAGS.hidden_size
            )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        if self.FLAGS.co_attention:
            # This step sends the question embeddings through a fully-connected layer to allow for variation between the question embedding and document embedding spaces
            question_hiddens_t = tf.transpose(
                question_hiddens,
                perm=[0, 2, 1])  #(batch_size,hidden_size*2,question_len)
            trans_question_hiddens_t = tf.contrib.layers.fully_connected(
                question_hiddens_t,
                num_outputs=self.FLAGS.question_len,
                activation_fn=tf.nn.tanh
            )  #(batch_size,hidden_size*2,question_len)
            trans_question_hiddens = tf.transpose(
                trans_question_hiddens_t,
                perm=[0, 2, 1])  #(batch_size,question_len,hidden_size*2)

            #Computing the coattention context
            co_attn_layer = CoAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                                   self.FLAGS.hidden_size * 2)
            co_attn_output = co_attn_layer.build_graph(
                trans_question_hiddens, self.qn_mask, self.context_mask,
                context_hiddens)  #(batch_size,context_len,6*hidden_size)

            # Perform the fusion of temporal information into the coattention context via a bidirectional LSTM encoder
            with tf.variable_scope("co-attn-encoder"):
                co_attn_encoder = LSTMEncoder(self.FLAGS.hidden_size,
                                              self.keep_prob)
                blended_reps_final = co_attn_encoder.build_graph(
                    co_attn_output, self.context_mask)

        if self.FLAGS.self_attention:
            # Implementation of the self-attention from the R-Net paper

            self_attention_encoder = SelfAttn(self.FLAGS.hidden_size_encoder,
                                              self.FLAGS.hidden_size_qp,
                                              self.FLAGS.hidden_size_pp,
                                              self.keep_prob)
            v_p = self_attention_encoder.build_graph_qp(
                context_hiddens, question_hiddens, self.context_mask,
                self.qn_mask, self.FLAGS.context_len, self.FLAGS.question_len)
            h_p = self_attention_encoder.build_graph_pp(
                context_hiddens, question_hiddens, self.context_mask,
                self.qn_mask, v_p, self.FLAGS.context_len,
                self.FLAGS.question_len)
            blended_reps_final = tf.concat(
                [context_hiddens, v_p, h_p],
                axis=2)  #(batch_size,context_len,5*hidden_size)

        if self.FLAGS.answer_pointer:
            # Implementation of the answer pointer as used in the R-Net paper
            if self.FLAGS.co_attention:
                hidden_size_attn = self.FLAGS.hidden_size * 2
            elif self.FLAGS.self_attention:
                hidden_size_attn = 2 * self.FLAGS.hidden_size_encoder + self.FLAGS.hidden_size_qp + 2 * self.FLAGS.hidden_size_pp
            else:
                hidden_size_attn = self.FLAGS.hidden_size

            answer_decoder = AnswerPointer(self.FLAGS.hidden_size_encoder,
                                           hidden_size_attn,
                                           self.FLAGS.question_len,
                                           self.keep_prob)
            p, logits = answer_decoder.build_graph_answer_pointer(
                question_hiddens, context_hiddens, blended_reps_final,
                self.FLAGS.question_len, self.FLAGS.context_len, self.qn_mask,
                self.context_mask)

            self.logits_start = logits[0]
            self.probdist_start = p[0]

            self.logits_end = logits[1]
            self.probdist_end = p[1]

        if self.FLAGS.simple_softmax:
            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    blended_reps_final, self.context_mask)

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with vs.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    blended_reps_final, self.context_mask)
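# --- Illustrative sketch (not part of the original model) ---
# The AnswerPointer used in the answer_pointer branch follows the R-Net idea of
# a pointer: summarize the question into a query vector and point into the
# context with an attention distribution over positions, once for the start and
# once for the end. Below is a heavily simplified sketch (mean-pooled question
# query, dot-product scoring, no recurrent query update, and both inputs assumed
# to share the same dimensionality d), assuming TensorFlow 1.x; the project's
# AnswerPointer is considerably more elaborate.
import tensorflow as tf

def answer_pointer_sketch(question_hiddens, qn_mask, context_reps, context_mask):
    """question_hiddens: (bs, M, d); context_reps: (bs, N, d); masks: (bs, M) / (bs, N)."""
    qn_mask_f = tf.cast(tf.expand_dims(qn_mask, 2), tf.float32)
    query = tf.reduce_sum(question_hiddens * qn_mask_f, axis=1) / \
            tf.maximum(tf.reduce_sum(qn_mask_f, axis=1), 1.0)               # (bs, d) mean question vector
    pad_bias = (1.0 - tf.cast(context_mask, tf.float32)) * (-1e30)          # (bs, N)
    logits_start = tf.reduce_sum(context_reps * tf.expand_dims(query, 1), axis=2) + pad_bias
    probdist_start = tf.nn.softmax(logits_start)
    # Update the query with the context summary pointed at by the start step
    # (the R-Net paper uses a GRU state update here), then point again for the end.
    start_summary = tf.squeeze(tf.matmul(tf.expand_dims(probdist_start, 1), context_reps), axis=1)
    logits_end = tf.reduce_sum(context_reps * tf.expand_dims(query + start_summary, 1), axis=2) + pad_bias
    return (logits_start, probdist_start), (logits_end, tf.nn.softmax(logits_end))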
    def build_graph(self):
        """
        Builds the main part of the graph for the model
        
         Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # NOTE CHANGE: concatenate the GloVe and ELMo embeddings.
        # To handle the mismatch between the ELMo and GloVe context lengths,
        # the context_ids are built without a fixed maximum context_len.
        context_embs_concat = tf.concat(
            [self.elmo_context_input, self.context_embs],
            2)  #(batch_size, context_len, 1024+self.FLAGS.embedding_size)
        qn_embs_concat = tf.concat(
            [self.elmo_question_input, self.qn_embs],
            2)  #(batch_size, qn_len, 1024+self.FLAGS.embedding_size)

        # Set static shapes so the tensors can be passed to the dynamic LSTM
        context_embs_concat.set_shape(
            (None, None, 1024 + self.FLAGS.embedding_size))
        qn_embs_concat.set_shape(
            (None, None, 1024 + self.FLAGS.embedding_size))
        self.qn_mask.set_shape((None, None))
        self.context_mask.set_shape((None, None))

        with tf.variable_scope("biLSTM"):
            Encoder = RNNEncoder(self.FLAGS.hidden_size,
                                 keep_prob=self.keep_prob,
                                 cell_type="lstm",
                                 input_size=1024 + self.FLAGS.embedding_size)
            #shared weights (same scope)
            context_hiddens = Encoder.build_graph(
                context_embs_concat,
                self.context_mask,
                scope="context_question_encoder"
            )  #(batch_size, context_len, hidden_size*2)
            question_hiddens = Encoder.build_graph(
                qn_embs_concat, self.qn_mask, scope="context_question_encoder"
            )  #(batch_size, question_len, hidden_size*2)

        with tf.variable_scope("bidaf"):
            bidaf_object = Bidaf(self.FLAGS.hidden_size * 2, self.keep_prob)
            b = bidaf_object.build_graph(
                context_hiddens, question_hiddens, self.context_mask,
                self.qn_mask)  #(batch_size, context_len, hidden_size*8)

        with tf.variable_scope("self_attn_layer"):
            SelfAttn_object = SelfAttn(self.FLAGS.hidden_size,
                                       self.FLAGS.hidden_size * 2,
                                       self.keep_prob,
                                       input_size=self.FLAGS.hidden_size * 2)
            M = SelfAttn_object.build_graph(
                b, self.context_mask,
                cell_type="lstm")  #(batch_size, context_len, hidden_size*2)

        #Make prediction
        with tf.variable_scope('prediction_layer'):
            #Encode the self-attended context first
            with tf.variable_scope("final_lstm_layer"):
                final_lstm_object = RNNEncoder(
                    self.FLAGS.hidden_size,
                    keep_prob=self.keep_prob,
                    cell_type="lstm",
                    input_size=self.FLAGS.hidden_size * 2)
                M_prime = final_lstm_object.build_graph(
                    M, self.context_mask,
                    scope="final_lstm")  #(batch_size, context_len, h*2)

            #Get start distribution
            with tf.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    M_prime,
                    self.context_mask)  #both are (batch_size, context_len)

            with tf.variable_scope("EndDist"):
                logit_start_expand = tf.expand_dims(
                    self.logits_start, axis=2)  #(batch_size, context_len, 1)
                blended_end_rnn_input = tf.concat(
                    [logit_start_expand, M_prime],
                    axis=2)  #(batch_size, context_len, hidden_size*2 + 1)
                end_dist_rnn = RNNEncoder(self.FLAGS.hidden_size,
                                          keep_prob=self.keep_prob,
                                          direction="unidirectional")
                end_rnn_output = end_dist_rnn.build_graph(
                    blended_end_rnn_input,
                    self.context_mask,
                    scope="end_dist_rnn")

                # Get the end dist
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    end_rnn_output, self.context_mask)
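# --- Illustrative sketch (not part of the original model) ---
# The RNNEncoder wrappers above (and the explicit bidirectional_dynamic_rnn
# call in the coattention branch of an earlier example) need the true, unpadded
# length of each sequence so the RNN stops at the last real token. The usual
# trick is to sum the padding mask. Below is a minimal sketch, assuming
# TensorFlow 1.x; the GRU cell and hidden size are illustrative choices, not
# the project's.
import tensorflow as tf

def masked_birnn_sketch(inputs, mask, hidden_size=100):
    """inputs: (bs, T, d); mask: (bs, T) with 1 at real tokens, 0 at padding."""
    seq_lens = tf.reduce_sum(tf.cast(mask, tf.int32), axis=1)               # (bs,)
    cell_fw = tf.nn.rnn_cell.GRUCell(hidden_size)
    cell_bw = tf.nn.rnn_cell.GRUCell(hidden_size)
    (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw, cell_bw, inputs, sequence_length=seq_lens, dtype=tf.float32)
    return tf.concat([fw_out, bw_out], axis=2)                              # (bs, T, 2*hidden_size)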
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
	with vs.variable_scope("encoder_layer1", reuse=tf.AUTO_REUSE):
            if self.FLAGS.use_stacked_encoder:
                encoder = StackedRNNEncoder(self.FLAGS.hidden_size, self.FLAGS.num_encoding_layers, self.keep_prob)
            else:
                encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            context_hiddens = encoder.build_graph(self.context_embs, self.context_mask) # (batch_size, context_len, hidden_size*2)
            question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask) # (batch_size, question_len, hidden_size*2)
	if self.FLAGS.num_encoding_layers > 1:
	    with vs.variable_scope("encoder_layer2", reuse=tf.AUTO_REUSE):
		encoder2 = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
	        context_hiddens = encoder2.build_graph(context_hiddens, self.context_mask)
	        question_hiddens = encoder2.build_graph(question_hiddens, self.qn_mask)

        # Use context hidden states to attend to question hidden states
        if self.FLAGS.bidaf:
            attn_layer = BiDirAttnFlow(self.keep_prob, self.FLAGS.hidden_size*2)
            blended_reps = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask) # (batch_size, context_len, hidden_size*8)
        else:
            attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
            _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens) # attn_output is shape (batch_size, context_len, hidden_size*2)
            # Concat attn_output to context_hiddens to get blended_reps
            blended_reps = tf.concat([context_hiddens, attn_output], axis=2) # (batch_size, context_len, hidden_size*4)

        # Self-attention layer
        if self.FLAGS.self_attend:
            self_attn_layer = SelfAttn(self.keep_prob, blended_reps.shape[-1], self.FLAGS.self_attend_hidden_sz)
            blended_reps = self_attn_layer.build_graph(blended_reps, self.context_mask) # (batch_size, context_len, 2*self_attend_hidden_sz)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size) # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # TODO: Modeling layer from BiDAF. We can add another RNN (two stacked
        #       from BiDAF paper) to the hidden states from the attention layer.
        if self.FLAGS.modeling_layer:
            with vs.variable_scope("Model_Layer", reuse=tf.AUTO_REUSE):
                model_layer = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
                blended_reps_final = model_layer.build_graph(blended_reps_final, self.context_mask)
        if self.FLAGS.modeling_layer and self.FLAGS.num_model_rnn_layers > 1:
            with vs.variable_scope("Model_layer2", reuse=tf.AUTO_REUSE):
                model_layer2 = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
                blended_reps_final = model_layer2.build_graph(blended_reps_final, self.context_mask)
            # modeling_layer = StackedRNNEncoder(blended_reps_final.shape[-1], self.FLAGS.num_model_rnn_layers, self.keep_prob)
            # blended_reps_final = modeling_layer.build_graph(blended_reps_final, self.context_mask)

        if self.FLAGS.pointer_network: #TODO: define flag
            with vs.variable_scope("OutputLayer", reuse=tf.AUTO_REUSE):
                pointer_network = PointerNetwork(self.keep_prob, blended_reps_final.shape[-1].value, self.FLAGS.hidden_size)
                (self.logits_start, self.probdist_start, _, self.logits_end, self.probdist_end, _) = \
                    pointer_network.build_graph(blended_reps_final, self.context_mask)
        else:
            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer()
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with vs.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer()
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
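# --- Illustrative sketch (not part of the original model) ---
# Several examples above note that the RNNEncoder is "shared (i.e. the weights
# are the same) between the context and the question". In TF1 that sharing
# comes from variable scoping: entering the same scope with reuse enabled
# returns the same variables on the second call. Below is a minimal sketch with
# a dense layer standing in for the encoder, assuming TensorFlow 1.x; the scope
# and layer names are illustrative.
import tensorflow as tf

def shared_projection_sketch(context_embs, question_embs, hidden_size=100):
    """Applies one and the same weight matrix to both inputs by reusing a scope."""
    with tf.variable_scope("shared_encoder", reuse=tf.AUTO_REUSE):
        context_proj = tf.layers.dense(context_embs, hidden_size, name="proj")
    with tf.variable_scope("shared_encoder", reuse=tf.AUTO_REUSE):
        question_proj = tf.layers.dense(question_embs, hidden_size, name="proj")  # same weights as above
    return context_proj, question_proj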
    def build_graph(self):
        """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

        Defines:
          self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
            These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
            Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
          self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
            These are the result of taking (masked) softmax of logits_start and logits_end.
        """

        # Use a RNN to get hidden states for the context and the question
        # Note: here the RNNEncoder is shared (i.e. the weights are the same)
        # between the context and the question.
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(
            self.context_embs,
            self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(
            self.qn_embs,
            self.qn_mask)  # (batch_size, question_len, hidden_size*2)

        # Use context hidden states to attend to question hidden states
        if self.FLAGS.model == 'bidaf':
            bidaf_layer = BiDAF(self.FLAGS.hidden_size, self.keep_prob,
                                self.FLAGS.hidden_size * 2,
                                self.FLAGS.hidden_size * 2)
            g_m, g_m2 = bidaf_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens,
                self.context_mask)  # (batch_size, context_len, hidden_size*10)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer(1 -
                                                         self.FLAGS.dropout)
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    g_m, self.context_mask)
            with vs.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    g_m2, self.context_mask)
        elif self.FLAGS.model == 'bicoattn':
            bicoattn_layer = BiCoattn(self.FLAGS.batch_size,
                                      self.FLAGS.context_len,
                                      self.FLAGS.hidden_size, self.keep_prob,
                                      self.FLAGS.hidden_size * 2,
                                      self.FLAGS.hidden_size * 2)
            g_m, g_m2, self.attn_output = bicoattn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens,
                self.context_mask)  # (batch_size, context_len, hidden_size*14)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer(1 -
                                                         self.FLAGS.dropout)
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    g_m, self.context_mask)
            with vs.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    g_m2, self.context_mask)
        elif self.FLAGS.model == 'transformernetwork':
            transformernetwork_layer = TransformerNetwork(
                self.FLAGS.hidden_size, self.keep_prob,
                self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2, 3, 8)
            g_m, g_m2 = transformernetwork_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens,
                self.context_mask, self.FLAGS.is_training
            )  # (batch_size, context_len, hidden_size*2)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer(1 -
                                                         self.FLAGS.dropout)
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    g_m, self.context_mask)
            with vs.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    g_m2, self.context_mask)
        elif self.FLAGS.model == 'bctn':
            bctn_layer = BCTN(self.FLAGS.hidden_size, self.keep_prob,
                              self.FLAGS.hidden_size * 2,
                              self.FLAGS.hidden_size * 2, 2, 8)
            g_m, g_m2 = bctn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens,
                self.context_mask, self.FLAGS.is_training
            )  # (batch_size, context_len, hidden_size*2)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer(1 -
                                                         self.FLAGS.dropout)
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    g_m, self.context_mask)
            with vs.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    g_m2, self.context_mask)

        elif self.FLAGS.model == 'rnet':
            with vs.variable_scope("Contextual"):
                encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
                context_hiddens = encoder.build_graph(self.context_embs,
                                                      self.context_mask)
                question_hiddens = encoder.build_graph(self.qn_embs,
                                                       self.qn_mask)
            print "GatedAttn"
            with vs.variable_scope("GatedAttn"):
                attn_layer_gated = GatedAttn(self.keep_prob,
                                             self.FLAGS.hidden_size * 2,
                                             self.FLAGS.hidden_size * 2,
                                             self.FLAGS.hidden_size)
                context_hiddens_gated, self.a_t = attn_layer_gated.build_graph(
                    question_hiddens, self.qn_mask,
                    context_hiddens)  # (batch_size, context_len, hidden_size)
            print "SelfAttn"
            with vs.variable_scope("SelfAttn"):
                attn_layer_self = SelfAttn(self.keep_prob,
                                           self.FLAGS.hidden_size,
                                           self.FLAGS.hidden_size)
                attn_output_self, self.attn_output = attn_layer_self.build_graph(
                    context_hiddens_gated, self.context_mask
                )  # (batch_size, context_len, hidden_size * 2)

            print "Output"
            with vs.variable_scope("Output"):
                output_layer = Output_Rnet(self.keep_prob,
                                           self.FLAGS.hidden_size * 2,
                                           self.FLAGS.hidden_size * 2,
                                           self.FLAGS.hidden_size)
                self.logits_start, self.probdist_start, self.logits_end, self.probdist_end, self.a = output_layer.build_graph(
                    attn_output_self, question_hiddens, self.context_mask,
                    self.qn_mask)
        elif self.FLAGS.model == 'basicattnplusone':
            attn_layer = BasicAttnPlusOne(self.FLAGS.batch_size,
                                          self.FLAGS.context_len,
                                          self.FLAGS.hidden_size,
                                          self.keep_prob,
                                          self.FLAGS.hidden_size * 2,
                                          self.FLAGS.hidden_size * 2)
            _, attn_output = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens,
                self.context_mask
            )  # attn_output is shape (batch_size, context_len, hidden_size*4)

            blended_reps = tf.concat(
                [context_hiddens, attn_output],
                axis=2)  # (batch_size, context_len, hidden_size*4)

            blended_reps_final = tf.contrib.layers.fully_connected(
                blended_reps, num_outputs=self.FLAGS.hidden_size
            )  # blended_reps_final is shape (batch_size, context_len, hidden_size)

            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer(1 -
                                                         self.FLAGS.dropout)
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    blended_reps_final, self.context_mask)

            with vs.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    blended_reps_final, self.context_mask)
        elif self.FLAGS.model == 'basicattnplustwo':
            attn_layer = BasicAttnPlusTwo(self.FLAGS.batch_size,
                                          self.FLAGS.context_len,
                                          self.FLAGS.hidden_size,
                                          self.keep_prob,
                                          self.FLAGS.hidden_size * 2,
                                          self.FLAGS.hidden_size * 2)
            g_m, g_m2 = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens,
                self.context_mask
            )  # g_m and g_m2 each have shape (batch_size, context_len, hidden_size*4)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer(1 -
                                                         self.FLAGS.dropout)
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    g_m, self.context_mask)
            with vs.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    g_m2, self.context_mask)
        else:
            attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2,
                                   self.FLAGS.hidden_size * 2)
            _, attn_output = attn_layer.build_graph(
                question_hiddens, self.qn_mask, context_hiddens
            )  # attn_output is shape (batch_size, context_len, hidden_size*2)

            # Concat attn_output to context_hiddens to get blended_reps
            blended_reps = tf.concat(
                [context_hiddens, attn_output],
                axis=2)  # (batch_size, context_len, hidden_size*4)

            # Apply fully connected layer to each blended representation
            # Note, blended_reps_final corresponds to b' in the handout
            # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
            blended_reps_final = tf.contrib.layers.fully_connected(
                blended_reps, num_outputs=self.FLAGS.hidden_size
            )  # blended_reps_final is shape (batch_size, context_len, hidden_size)
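            # For reference (not a change to the code): tf.contrib.layers.fully_connected
            # defaults to activation_fn=tf.nn.relu, so the call above is equivalent to
            #   tf.contrib.layers.fully_connected(blended_reps,
            #       num_outputs=self.FLAGS.hidden_size, activation_fn=tf.nn.relu)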

            # Use softmax layer to compute probability distribution for start location
            # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
            with vs.variable_scope("StartDist"):
                softmax_layer_start = SimpleSoftmaxLayer(1 -
                                                         self.FLAGS.dropout)
                self.logits_start, self.probdist_start = softmax_layer_start.build_graph(
                    blended_reps_final, self.context_mask)

            # Use softmax layer to compute probability distribution for end location
            # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
            with vs.variable_scope("EndDist"):
                softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
                self.logits_end, self.probdist_end = softmax_layer_end.build_graph(
                    blended_reps_final, self.context_mask)
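            # For reference, a rough sketch of what SimpleSoftmaxLayer is assumed
            # to do above (an illustration/assumption, not the layer's actual code):
            # project each position down to one logit, push padded positions
            # towards -inf using the mask, then softmax:
            #   logits = tf.squeeze(tf.contrib.layers.fully_connected(
            #       inputs, num_outputs=1, activation_fn=None), axis=[2])
            #   logits += (1.0 - tf.cast(mask, tf.float32)) * (-1e30)
            #   probdist = tf.nn.softmax(logits)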
    def build_graph(self):
        self._define_embedding()

        self.inputs = tf.placeholder(
            tf.int32, [None, self.max_length])
        if self.char_embed:
            self.char_inputs = tf.placeholder(
                tf.int32, [None, self.max_length, self.word_length])

        self.ex_lengths = tf.placeholder(tf.int32, [None])

        # Outputs as usual:
        self.outputs = tf.placeholder(
            tf.float32, shape=[None, self.output_dim])

        # This converts the inputs to a list of lists of dense vector
        # representations:
        self.feats = tf.nn.embedding_lookup(
            self.embedding, self.inputs)
        if self.char_embed:
            # shape (?, max_len, word_len, char_embed_dim)
            self.char_feats = tf.nn.embedding_lookup(
                self.char_embedding, self.char_inputs)
            # shape (? * max_len, word_len, char_embed_dim)
            self.char_feats = tf.reshape(self.char_feats, 
                                         (-1, self.word_length, self.char_embed_dim))
            self.char_feats = tf.layers.conv1d(self.char_feats, 
                                               self.num_char_filters, 
                                               self.char_kernel_size, 
                                               padding="same")
            # shape (?, max_len, word_len, num_char_filters)
            self.char_feats = tf.reshape(self.char_feats, 
                                         (-1, self.max_length, self.word_length, self.num_char_filters))
            # shape (?, max_len, num_char_filters)
            self.char_feats = tf.reduce_max(self.char_feats, axis=2)
            # concatenate char embeds with word embeds to get final representation
            self.feats = tf.concat([self.feats, self.char_feats], 2)
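            # Shape walk-through with hypothetical sizes (for illustration only):
            # with max_length=50, word_length=16, char_embed_dim=8,
            # num_char_filters=100 and a 300-d word embedding:
            #   char_feats: (?, 50, 16, 8) -> (?*50, 16, 8) -> conv1d -> (?*50, 16, 100)
            #               -> (?, 50, 16, 100) -> max over chars -> (?, 50, 100)
            #   feats:      concat word + char -> (?, 50, 400)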

        # Defines the RNN structure:
        self.cell = self.cell_class(
            self.hidden_dim, activation=self.hidden_activation)
        self.cell = DropoutWrapper(self.cell, self.dropout)
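        # Note: DropoutWrapper's second positional argument is input_keep_prob,
        # so self.dropout is treated as a keep probability here (assuming that
        # is the intent).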
        # If a bidirectional RNN is used, define a second RNN cell for the
        # backward direction. Alternatively we could share one cell between
        # fw and bw, but we don't do that for now.
        if self.bidir_rnn:
            self.bw_cell = self.cell_class(
                self.hidden_dim, activation=self.hidden_activation)
            self.bw_cell = DropoutWrapper(self.bw_cell, self.dropout)
            
        if self.stacked and self.bidir_rnn:
            self.cell2 = self.cell_class(
                self.hidden_dim, activation=self.hidden_activation)
            self.cell2 = DropoutWrapper(self.cell2, self.dropout)
            self.bw_cell2 = self.cell_class(
                self.hidden_dim, activation=self.hidden_activation)
            self.bw_cell2 = DropoutWrapper(self.bw_cell2, self.dropout)

        # Run the RNN:
        if self.bidir_rnn:
            with tf.variable_scope("lstm1", reuse=tf.AUTO_REUSE):
                outputs, output_states = tf.nn.bidirectional_dynamic_rnn(
                    self.cell,
                    self.bw_cell,
                    self.feats,
                    dtype=tf.float32,
                    sequence_length=self.ex_lengths)
                # concatenate the fw and bw outputs along the feature axis
                # -> (batch, time, 2 * hidden_dim)
                out = tf.concat(outputs, 2)
            
            if self.stacked:
                with tf.variable_scope("lstm2", reuse=tf.AUTO_REUSE):
                    outputs2, output_states2 = tf.nn.bidirectional_dynamic_rnn(
                        self.cell2,
                        self.bw_cell2,
                        out,
                        dtype=tf.float32,
                        sequence_length=self.ex_lengths)
            
            # Let the last state be the concatenation of the fw and bw
            # final states. Note that output_states[0] is the FW
            # (c, h) pair and output_states[1] is the BW (c, h) pair, where
            # c is the cell state and h is the hidden state (the output).
            if self.stacked:
                self.last = tf.concat([output_states2[0][1], output_states2[1][1]], 1)
            else:
                if self.cell_class == tf.nn.rnn_cell.LSTMCell:
                    self.last = tf.concat([output_states[0][1], output_states[1][1]], 1)
                elif self.cell_class == tf.nn.rnn_cell.GRUCell:
                    self.last = tf.concat(output_states, 1)
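            # Shape check (illustration, with batch size B): for LSTMCell,
            # output_states == (LSTMStateTuple(c_fw, h_fw), LSTMStateTuple(c_bw, h_bw)),
            # each part of shape (B, hidden_dim); for GRUCell, output_states is
            # simply (h_fw, h_bw). Either way self.last ends up (B, 2 * hidden_dim).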
        else:
            outputs, state = tf.nn.dynamic_rnn(
                self.cell,
                self.feats,
                dtype=tf.float32,
                sequence_length=self.ex_lengths)

        # Attention layer
        if self.self_attend:
            # concatenate fw/bw outputs along the feature axis (a no-op in the
            # unidirectional case, where `outputs` is a single tensor)
            out = tf.concat(outputs, 2)
            print(out)
            self_attn_layer = SelfAttn(self.dropout, out.shape[-1], self.attention_dim)
            outputs = self_attn_layer.build_graph(out, self.ex_lengths)

        # How can I be sure that I have found the last true state? This
        # first option seems to work for all cell types but sometimes
        # leads to indexing errors and is in general pretty complex:
        #
        # self.last = self._get_last_non_masked(outputs, self.ex_lengths)
        #
        # This option is more reliable, but is it definitely getting
        # the final true state?
        #
        # Note that we set self.last above for the BiDir RNN case.
        if not self.bidir_rnn:
            self.last = self._get_final_state(self.cell, state)
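        # A minimal alternative sketch (not used here) for picking out the last
        # non-padded output directly from `outputs`, assuming it has shape
        # (batch, time, hidden):
        #   idx = tf.stack([tf.range(tf.shape(outputs)[0]), self.ex_lengths - 1], axis=1)
        #   last = tf.gather_nd(outputs, idx)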

        # Softmax classifier on the final hidden state:
        if self.bidir_rnn:
            self.W_hy = self.weight_init(
                2 * self.hidden_dim, self.output_dim, 'W_hy')
        else:
            self.W_hy = self.weight_init(
                self.hidden_dim, self.output_dim, 'W_hy')
        self.b_y = self.bias_init(self.output_dim, 'b_y')
        self.model = tf.matmul(self.last, self.W_hy) + self.b_y
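        # Illustration (not part of this graph): with one-hot labels in
        # self.outputs, a typical loss on these logits would be
        #   cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
        #       labels=self.outputs, logits=self.model))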
def build_graph():
    def bilm_build_graph(options_file, weight_file):
        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(options_file, weight_file)

        # Get ops to compute the LM embeddings.
        context_embeddings_op = bilm(context_elmo)
        question_embeddings_op = bilm(question_elmo)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        # Our SQuAD model includes ELMo at both the input and output layers
        # of the task GRU, so we need 4x ELMo representations for the question
        # and context at each of the input and output.
        # We use the same ELMo weights for both the question and context
        # at each of the input and output.
        elmo_context_input = weight_layers('input',
                                           context_embeddings_op,
                                           l2_coef=0.0)['weighted_op']
        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            elmo_question_input = weight_layers('input',
                                                question_embeddings_op,
                                                l2_coef=0.0)['weighted_op']
        """
        elmo_context_output = weight_layers(
            'output', context_embeddings_op, l2_coef=0.0
        )['weighted_op']

        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            elmo_question_output = weight_layers(
                'output', question_embeddings_op, l2_coef=0.0
            )

        """
        return elmo_context_input, elmo_question_input

    def add_embedding_layer(emb_matrix):
        with tf.variable_scope("embeddings"):
            # set to a constant so it's untrainable
            embedding_matrix = tf.constant(
                emb_matrix, dtype=tf.float32,
                name="emb_matrix")  # shape (400002, embedding_size)

            # Get the word embeddings for the context and question,
            context_embs = tf.nn.embedding_lookup(embedding_matrix,
                                                  context_ids)
            qn_embs = tf.nn.embedding_lookup(embedding_matrix, qn_ids)
        return context_embs, qn_embs

    # start the main graph
    context_embs, qn_embs = add_embedding_layer(emb_matrix)
    elmo_context_input, elmo_question_input = bilm_build_graph(
        options_file, weight_file)

    context_embs_concat = tf.concat([elmo_context_input, context_embs],
                                    2)  #(2, context_len, 1074)
    qn_embs_concat = tf.concat([elmo_question_input, qn_embs],
                               2)  #(2, question_len, 1074)

    context_embs_concat.set_shape((None, None, 1074))
    qn_embs_concat.set_shape((None, None, 1074))
    qn_mask.set_shape((None, None))
    context_mask.set_shape((None, None))
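    # Presumably the static shapes are pinned here because the ELMo ops leave
    # them partially unknown, while the encoders below expect a known last
    # dimension (1074 = 1024-d ELMo + 50-d word embeddings is an assumption).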

    with tf.variable_scope("biLSTM"):
        print("Starting biLSTM...")
        LSTMencoder_context = RNNEncoder(hidden_size,
                                         keep_prob=keep_prob,
                                         cell_type="lstm",
                                         input_size=1074)
        LSTMencoder_question = RNNEncoder(hidden_size,
                                          keep_prob=keep_prob,
                                          cell_type="lstm",
                                          input_size=1074)
        # weights are shared between the context and question encoders via scope reuse
        context_hiddens = LSTMencoder_context.build_graph(
            context_embs_concat,
            context_mask,
            scope="context_question_encoder",
            reuse=False)
        question_hiddens = LSTMencoder_question.build_graph(
            qn_embs_concat,
            qn_mask,
            scope="context_question_encoder",
            reuse=True)

    with tf.variable_scope("bidaf_layer"):
        print("Starting bidaf...")
        bidaf_object = Bidaf(hidden_size * 2, keep_prob)
        b = bidaf_object.build_graph(context_hiddens, question_hiddens,
                                     context_mask, qn_mask)

    with tf.variable_scope("self_attn_layer"):
        SelfAttn_object = SelfAttn(hidden_size,
                                   hidden_size * 2,
                                   keep_prob,
                                   input_size=hidden_size * 2)
        M = SelfAttn_object.build_graph(
            b, context_mask,
            cell_type="lstm")  #(batch_size, context_len, hidden_size*2)

    with tf.variable_scope("final_lstm_layer"):
        final_lstm_object = RNNEncoder(hidden_size,
                                       keep_prob=keep_prob,
                                       cell_type="lstm",
                                       input_size=hidden_size * 2)
        M_prime = final_lstm_object.build_graph(M,
                                                context_mask,
                                                scope="final_lstm",
                                                reuse=False)

    with tf.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        logits_start, probdist_start = softmax_layer_start.build_graph(
            M_prime, context_mask)

    with tf.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        logits_end, probdist_end = softmax_layer_end.build_graph(
            M_prime, context_mask)

    return logits_start, probdist_start, logits_end, probdist_end
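
# --- Illustration (not part of the original code) ---------------------------
# A minimal sketch of how the returned probdist_start / probdist_end might be
# turned into a predicted answer span at inference time: pick the (start, end)
# pair with the highest joint probability subject to start <= end and a cap on
# answer length. All names below are hypothetical helpers for this sketch.
import numpy as np

def pick_answer_span(probdist_start, probdist_end, max_answer_len=30):
    """probdist_start, probdist_end: arrays of shape (batch_size, context_len).
    Returns (starts, ends), each of shape (batch_size,)."""
    batch_size, context_len = probdist_start.shape
    starts = np.zeros(batch_size, dtype=np.int64)
    ends = np.zeros(batch_size, dtype=np.int64)
    for b in range(batch_size):
        best_prob = -1.0
        for s in range(context_len):
            # only consider end positions within max_answer_len of the start
            e_hi = min(context_len, s + max_answer_len)
            e = s + int(np.argmax(probdist_end[b, s:e_hi]))
            prob = probdist_start[b, s] * probdist_end[b, e]
            if prob > best_prob:
                best_prob = prob
                starts[b], ends[b] = s, e
    return starts, ends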