def build_graph(self):
    """Builds the main part of the graph for the model, starting from the input embeddings to the final answer-class distribution.

    Defines:
      self.logits_class, self.probdist_class: Both tensors shape (batch_size, 4).
        logits_class holds the values that are fed into the softmax function, set to -large in the pad
        locations (necessary for when we feed into the cross entropy function), and probdist_class is
        the result of taking the (masked) softmax of logits_class.
    """

    # Use a RNN to get hidden states for the context and the question
    # Note: here the RNNEncoder is shared (i.e. the weights are the same)
    # between the context and the question.
    encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    if self.FLAGS.attention_type == 'dot_product':
        print("<<<<<<<< Adding dot_product attention >>>")
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)

    elif self.FLAGS.attention_type == 'self_attention':
        print("<<<<<<<<< Adding self-attention over basic attention >>>>>>>")
        basic_attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        _, basic_attn_output = basic_attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # (batch_size, context_len, hidden_size*2)

        self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.self_attn_zsize, self.FLAGS.hidden_size*2)
        _, self_attn_output = self_attn_layer.build_graph(basic_attn_output, self.context_mask)

        concated_basic_self = tf.concat([basic_attn_output, self_attn_output], axis=2)  # (batch_size, N, 4h)

        self_attn_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        blended_reps = self_attn_encoder.build_graph(concated_basic_self, self.context_mask, scope_name="self_attn_encoder")  # (batch_size, N, hidden_size*2)

    elif self.FLAGS.attention_type == 'bidaf':
        print("<<<<<<<<< Adding BiDAF attention >>>>>>>")
        attn_layer = BidafAttn(self.keep_prob, self.FLAGS.hidden_size*2)
        c2q_attention, q2c_attention = attn_layer.build_graph(context_hiddens, question_hiddens, self.qn_mask, self.context_mask)

        # Combine the tensors to get the final output
        body_c2q_attention_mult = context_hiddens * c2q_attention  # (batch_size, num_keys(N), 2h)
        q2c_expanded = tf.expand_dims(q2c_attention, 1)  # (batch_size, 1, 2h)
        body_q2c_attention_mult = context_hiddens * q2c_expanded  # (batch_size, num_keys(N), 2h)
        blended_reps = tf.concat([c2q_attention, body_c2q_attention_mult, body_q2c_attention_mult], axis=2)  # (batch_size, N, 6h); context_hiddens removed

    blended_reps = tf.nn.dropout(blended_reps, self.keep_prob)

    # Apply fully connected layer to each blended representation
    # Note, blended_reps_final corresponds to b' in the handout
    # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
    blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

    with vs.variable_scope("ClassProb"):
        softmax_layer_class = CustomSimpleSoftmaxLayer()
        # Both have dimensions: shape (batch_size, 4)
        self.logits_class, self.probdist_class = softmax_layer_class.build_graph(blended_reps_final, self.context_mask, self.FLAGS.reduction_type)
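# The docstrings in this file describe a masked softmax: pad positions get a very large negative
# value added to their logits so they receive roughly zero probability. The helper below is a
# minimal sketch of that idea, assuming TensorFlow 1.x as used by the surrounding code; the
# project's own masked_softmax / softmax-layer classes may differ in mask dtype or naming.
import tensorflow as tf


def masked_softmax_sketch(logits, mask, dim):
    """logits, mask: broadcast-compatible shapes; mask is 1 for real tokens, 0 for padding."""
    exp_mask = (1.0 - tf.cast(mask, tf.float32)) * (-1e30)  # -large where padded, 0 elsewhere
    masked_logits = tf.add(logits, exp_mask)                # pad logits pushed to -large
    prob_dist = tf.nn.softmax(masked_logits, dim)           # rows sum to 1 over the real tokens
    return masked_logits, prob_dist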
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
        These are the result of taking (masked) softmax of logits_start and logits_end.
    """
    print("Building SelfAttention Model")

    # Use a RNN to get hidden states for the context and the question
    # Note: here the RNNEncoder is shared (i.e. the weights are the same)
    # between the context and the question.
    encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers=self.FLAGS.num_layers, mode=self.FLAGS.rnn_cell)
    context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # Use context hidden states to attend to question hidden states
    bidaf_layer = BidirectionAttn(self.keep_prob, self.FLAGS.hidden_size)
    self.c2q_attn_dist, self.q2c_attn_dist, bidaf_output = bidaf_layer.build_graph(
        question_hiddens, self.qn_mask, context_hiddens, self.context_mask)  # (batch_size, context_len, hidden_size*6)
    bidaf_output = tf.concat([context_hiddens, bidaf_output], axis=2)  # (batch_size, context_len, hidden_size*8)

    self_attn_layer = SelfAttn(self.keep_prob, 8 * self.FLAGS.hidden_size, self.FLAGS.selfattn_size)
    self.self_attn_dist, self_attn_output = self_attn_layer.build_graph(bidaf_output, self.context_mask)  # (batch_size, context_len, hidden_size*2)

    # Concat self_attn_output to bidaf_output to get blended_reps
    blended_reps = tf.concat([bidaf_output, self_attn_output], axis=2)  # (batch_size, context_len, hidden_size*10)

    self_attention_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers=2 * self.FLAGS.num_layers, name="AttentionEncoder")
    blended_reps = self_attention_encoder.build_graph(blended_reps, self.context_mask)  # (batch_size, context_len, hidden_size*2)

    # Apply fully connected layer to each blended representation
    # Note, blended_reps_final corresponds to b' in the handout
    # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
    # blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # (batch_size, context_len, hidden_size)

    # Use softmax layer to compute probability distribution for start location
    # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps, self.context_mask)

    end_pointer = tf.concat([tf.expand_dims(self.probdist_start, -1), blended_reps], axis=2)
    # end_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers=self.FLAGS.num_layers, name="EndEncoder")
    # end_pointer = end_encoder.build_graph(end_pointer, self.context_mask)

    # Use softmax layer to compute probability distribution for end location
    # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(end_pointer, self.context_mask)
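# Several branches in this file call SimpleSoftmaxLayer().build_graph(reps, mask) to turn a
# (batch_size, context_len, d) representation into per-position logits plus a masked probability
# distribution. Below is a minimal sketch of such a layer, assuming TensorFlow 1.x is imported as tf
# (as elsewhere in this file); the real SimpleSoftmaxLayer may differ in scoping and naming.
class SimpleSoftmaxLayerSketch(object):
    def build_graph(self, inputs, masks):
        # Project each position down to a single logit: (batch_size, context_len, 1)
        logits = tf.contrib.layers.fully_connected(inputs, num_outputs=1, activation_fn=None)
        logits = tf.squeeze(logits, axis=[2])  # (batch_size, context_len)
        # Mask out the pad positions, then softmax over the context dimension
        exp_mask = (1.0 - tf.cast(masks, tf.float32)) * (-1e30)
        masked_logits = tf.add(logits, exp_mask)
        prob_dist = tf.nn.softmax(masked_logits, 1)
        return masked_logits, prob_dist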
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
        These are the result of taking (masked) softmax of logits_start and logits_end.
    """
    print("Building Pointer Model")

    # Use a RNN to get hidden states for the context and the question
    # Note: here the RNNEncoder is shared (i.e. the weights are the same)
    # between the context and the question.
    encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers=self.FLAGS.num_layers, mode=self.FLAGS.rnn_cell)
    context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # BIDAF LAYER
    # Use context hidden states to attend to question hidden states
    bidaf_layer = BidirectionAttn(self.keep_prob, self.FLAGS.hidden_size)
    _, _, bidaf_output = bidaf_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)  # (batch_size, context_len, hidden_size*6)
    bidaf_output = tf.concat([context_hiddens, bidaf_output], axis=2)  # (batch_size, context_len, hidden_size*8)

    # SELF ATTENTION LAYER
    self_attn_layer = SelfAttn(self.keep_prob, 8 * self.FLAGS.hidden_size, self.FLAGS.selfattn_size)
    _, self_attn_output = self_attn_layer.build_graph(bidaf_output, self.context_mask)  # (batch_size, context_len, hidden_size*8)

    # Concat self_attn_output to bidaf_output to get blended_reps
    blended_reps = tf.concat([bidaf_output, self_attn_output], axis=2)  # (batch_size, context_len, hidden_size*16)

    self_attention_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers=self.FLAGS.num_layers, name="AttentionEncoder")
    blended_reps = self_attention_encoder.build_graph(blended_reps, self.context_mask)  # (batch_size, context_len, hidden_size*2)

    # MODELING LAYER
    modeling_encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers=self.FLAGS.num_layers, name="ModelingEncoder")
    modeling_output = modeling_encoder.build_graph(blended_reps, self.context_mask)

    modeling_encoder_two = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, num_layers=self.FLAGS.num_layers, name="ModelingEncoder2")
    modeling_output_two = modeling_encoder_two.build_graph(modeling_output, self.context_mask)

    total_reps_start = tf.concat([blended_reps, modeling_output], axis=2)
    total_reps_end = tf.concat([blended_reps, modeling_output_two], axis=2)

    # OUTPUT LAYER
    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(total_reps_start, self.context_mask)

    # Use softmax layer to compute probability distribution for end location
    # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(total_reps_end, self.context_mask)
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
        These are the result of taking (masked) softmax of logits_start and logits_end.
    """

    # Use a RNN to get hidden states for the context and the question
    # Note: here the RNNEncoder is shared (i.e. the weights are the same)
    # between the context and the question.
    encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # Use context hidden states to attend to question hidden states
    attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
    _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)

    # Concat attn_output to context_hiddens to get blended_reps
    blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)

    # Apply self-attention to blended_reps
    self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 4)
    self_attn_output = self_attn_layer.build_graph(blended_reps, self.context_mask)

    # Concat self_attn_output to blended_reps to get attn_reps
    attn_reps = tf.concat([blended_reps, self_attn_output], axis=2)

    # Feed attn_reps to a bidirectional GRU to get h
    encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob, "RNNEncoder2")
    h = encoder.build_graph(attn_reps, self.context_mask)

    # Apply fully connected layer to each blended representation
    # Note, blended_reps_final corresponds to b' in the handout
    # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
    blended_reps_final = tf.contrib.layers.fully_connected(h, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

    # Use softmax layer to compute probability distribution for start location
    # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
    with vs.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

    # Use softmax layer to compute probability distribution for end location
    # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
    with vs.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
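# The baseline branches use BasicAttn(keep_prob, key_vec_size, value_vec_size) as plain dot-product
# attention from context positions (keys) to question positions (values). The sketch below shows the
# usual formulation under that assumption (TensorFlow 1.x imported as tf, as elsewhere in this file);
# the project's BasicAttn may differ in where dropout is applied or how the mask is represented.
class BasicAttnSketch(object):
    def __init__(self, keep_prob, key_vec_size, value_vec_size):
        self.keep_prob = keep_prob
        self.key_vec_size = key_vec_size
        self.value_vec_size = value_vec_size

    def build_graph(self, values, values_mask, keys):
        # Attention logits: dot product of every key with every value
        values_t = tf.transpose(values, perm=[0, 2, 1])    # (batch_size, value_vec_size, num_values)
        attn_logits = tf.matmul(keys, values_t)            # (batch_size, num_keys, num_values)
        attn_logits_mask = tf.expand_dims(values_mask, 1)  # (batch_size, 1, num_values)
        # Mask pad positions in the values, then softmax over the values dimension
        exp_mask = (1.0 - tf.cast(attn_logits_mask, tf.float32)) * (-1e30)
        attn_dist = tf.nn.softmax(attn_logits + exp_mask, 2)
        # Weighted sum of values for each key position
        output = tf.matmul(attn_dist, values)              # (batch_size, num_keys, value_vec_size)
        output = tf.nn.dropout(output, self.keep_prob)
        return attn_dist, output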
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
        These are the result of taking (masked) softmax of logits_start and logits_end.
    """

    # Use a RNN to get hidden states for the context and the question
    # Note: here the RNNEncoder is shared (i.e. the weights are the same)
    # between the context and the question.
    if self.FLAGS.model == "baseline":
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    elif self.FLAGS.model == "bidaf" or self.FLAGS.model == "bidaf_dynamic" or self.FLAGS.model == "bidaf_self_attn" or self.FLAGS.model == "bidaf_dynamic_self_attn":
        print("INSIDE the BIDAF model")
        encoder = RNNEncoder_LSTM(self.FLAGS.hidden_size, self.keep_prob)
    elif self.FLAGS.model == "coatt" or self.FLAGS.model == "coatt_dynamic" or self.FLAGS.model == "coatt_dynamic_self_attn":
        encoder = LSTMEncoder(self.FLAGS.hidden_size, self.keep_prob)

    if self.FLAGS.model != "coatt" and self.FLAGS.model != "coatt_dynamic" and self.FLAGS.model != "coatt_dynamic_self_attn":
        context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # Attention model
    # Use context hidden states to attend to question hidden states
    if self.FLAGS.model == "baseline":
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)

    # Attention model
    # Use context hidden states to attend to question hidden states
    if self.FLAGS.model == "coatt":
        # context_hiddens = encoder.build_graph(self.context_embs, self.context_mask, "context")  # (batch_size, context_len, hidden_size*2)
        # question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask, "question")  # (batch_size, question_len, hidden_size*2)
        context_hiddens, question_hiddens = encoder.build_graph1(self.context_embs, self.qn_embs, self.context_mask, self.qn_mask)

        attn_layer = CoAttention(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
        blended_reps_final = attn_output
        # blended_reps = tf.concat([context_hiddens, attn_output], axis=2)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        # blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            contextLen = tf.reduce_sum(self.context_mask, axis=1)
            cell = tf.contrib.rnn.LSTMBlockCell(2 * self.FLAGS.hidden_size)
            (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(cell, cell, attn_output, contextLen, dtype=tf.float32)
            U_1 = tf.concat([fw_out, bw_out], axis=2)
            out = tf.nn.dropout(U_1, self.keep_prob)
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(out, self.context_mask)

    elif self.FLAGS.model == "bidaf" or self.FLAGS.model == "bidaf_self_attn":
        attn_layer = BiDafAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        attn_output_tmp = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)  # attn_output is shape (batch_size, context_len, hidden_size*8)

        # Set of vectors which produces a set of query-aware feature vectors for each word in the context
        # blended_reps = attn_output  # (batch_size, num_keys, 4*value_vec_size)

        if self.FLAGS.model == "bidaf_self_attn":
            self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8)
            _, self_attn_output = self_attn_layer.build_graph(attn_output_tmp, self.context_mask)  # (batch_size, context_len, 8*hidden_size)
            attn_output = tf.concat([attn_output_tmp, self_attn_output], axis=2)  # (batch_size, context_len, 16*hidden_size)
        else:
            attn_output = attn_output_tmp

        # In BiDAF the attention output is fed to a modeling layer
        # The modeling layer is a 2-layer LSTM
        mod_layer = MODEL_LAYER_BIDAF(self.FLAGS.hidden_size, self.keep_prob)
        mod_layer_out = mod_layer.build_graph(attn_output, self.context_mask)  # (batch_size, context_len, hidden_size*2)
        blended_reps_start = tf.concat([attn_output, mod_layer_out], axis=2)  # (batch_size, context_len, hidden_size*10)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_start, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            # Concatenate the start logits with the modeling layer output to get the input to the
            # end-word LSTM
            # self.logits_start has a shape of (batch_size, context_len)
            logits_start_expand = tf.expand_dims(self.logits_start, axis=2)  # (batch_size, context_len, 1)
            end_lstm_input = tf.concat([logits_start_expand, mod_layer_out], axis=2)  # (batch_size, context_len, 1 + hidden_size*2)

            # LSTM
            end_layer = END_WORD_LAYER(self.FLAGS.hidden_size, self.keep_prob)
            blended_reps_end = end_layer.build_graph(end_lstm_input, self.context_mask)

            blended_reps_end_final = tf.concat([attn_output, blended_reps_end], axis=2)
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_end_final, self.context_mask)

    elif self.FLAGS.model == "bidaf_dynamic" or self.FLAGS.model == "bidaf_dynamic_self_attn":
        attn_layer = BiDafAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        attn_output_tmp = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)  # attn_output is shape (batch_size, context_len, hidden_size*8)

        if self.FLAGS.model == "bidaf_dynamic_self_attn":
            self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8)
            _, self_attn_output = self_attn_layer.build_graph(attn_output_tmp, self.context_mask)  # (batch_size, context_len, 8*hidden_size)
            attn_output = tf.concat([attn_output_tmp, self_attn_output], axis=2)  # (batch_size, context_len, 16*hidden_size)
        else:
            attn_output = attn_output_tmp

        # Set of vectors which produces a set of query-aware feature vectors for each word in the context
        # blended_reps = attn_output  # (batch_size, num_keys, 4*value_vec_size)

        # In BiDAF the attention output is fed to a modeling layer
        # The modeling layer is a 2-layer LSTM
        mod_layer = MODEL_LAYER_BIDAF(self.FLAGS.hidden_size, self.keep_prob)
        mod_layer_out = mod_layer.build_graph(attn_output, self.context_mask)  # (batch_size, context_len, hidden_size*2)
        blended_reps_start = tf.concat([attn_output, mod_layer_out], axis=2)  # (batch_size, context_len, hidden_size*10)

        # We now feed this to the dynamic decoder module coded in ANSWER_DECODER.
        # The outputs of the decoder are start, end, alpha_logits and beta_logits:
        # start and end have a shape of (batch_size, num_iterations);
        # alpha_logits and beta_logits have a shape of (batch_size, num_iterations, input_dim).
        decoder = ANSWER_DECODER(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.num_iterations, self.FLAGS.max_pool, self.FLAGS.batch_size)

        u_s_init = mod_layer_out[:, 0, :]
        u_e_init = mod_layer_out[:, 0, :]
        start_location, end_location, alpha_logits, beta_logits = decoder.build_graph(mod_layer_out, self.context_mask, u_s_init, u_e_init)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            # softmax_layer_start = SimpleSoftmaxLayer()
            logits_start_tmp = [masked_softmax(logits, self.context_mask, 1) for logits in alpha_logits]
            self.alpha_logits, alpha_logits_probs = zip(*logits_start_tmp)
            self.logits_start, self.probdist_start = self.alpha_logits[self.FLAGS.num_iterations - 1], alpha_logits_probs[self.FLAGS.num_iterations - 1]

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            logits_end_tmp = [masked_softmax(logits, self.context_mask, 1) for logits in beta_logits]
            self.beta_logits, beta_logits_probs = zip(*logits_end_tmp)
            self.logits_end, self.probdist_end = self.beta_logits[self.FLAGS.num_iterations - 1], beta_logits_probs[self.FLAGS.num_iterations - 1]

    elif self.FLAGS.model == "coatt_dynamic" or self.FLAGS.model == "coatt_dynamic_self_attn":
        context_hiddens, question_hiddens = encoder.build_graph1(self.context_embs, self.qn_embs, self.context_mask, self.qn_mask)

        attn_layer = CoAttention(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        if self.FLAGS.model == "coatt_dynamic_self_attn":
            CoATT = attn_layer.build_graph1(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
            self_attn_layer = SelfAttn(self.keep_prob, self.FLAGS.hidden_size * 8, self.FLAGS.hidden_size * 8)
            _, self_attn_output = self_attn_layer.build_graph(CoATT, self.context_mask)  # (batch_size, context_len, 8*hidden_size)
            attn_output = tf.concat([CoATT, self_attn_output], axis=2)  # (batch_size, context_len, 16*hidden_size)
        else:
            U = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)
            attn_output = U
        # blended_reps = tf.concat([context_hiddens, attn_output], axis=2)

        # Feed the attention output to the dynamic decoder
        decoder = ANSWER_DECODER(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.num_iterations, self.FLAGS.max_pool, self.FLAGS.batch_size)

        u_s_init = attn_output[:, 0, :]
        u_e_init = attn_output[:, 0, :]
        start_location, end_location, alpha_logits, beta_logits = decoder.build_graph(attn_output, self.context_mask, u_s_init, u_e_init)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            # softmax_layer_start = SimpleSoftmaxLayer()
            logits_start_tmp = [masked_softmax(logits, self.context_mask, 1) for logits in alpha_logits]
            self.alpha_logits, alpha_logits_probs = zip(*logits_start_tmp)
            self.logits_start, self.probdist_start = self.alpha_logits[self.FLAGS.num_iterations - 1], alpha_logits_probs[self.FLAGS.num_iterations - 1]

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            logits_end_tmp = [masked_softmax(logits, self.context_mask, 1) for logits in beta_logits]
            self.beta_logits, beta_logits_probs = zip(*logits_end_tmp)
            self.logits_end, self.probdist_end = self.beta_logits[self.FLAGS.num_iterations - 1], beta_logits_probs[self.FLAGS.num_iterations - 1]
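# All of the model variants above start from an encoder that runs a shared bidirectional RNN over
# the (masked) context and question embeddings and concatenates the forward and backward outputs.
# A minimal GRU version is sketched here as a reference, assuming TensorFlow 1.x imported as tf;
# the RNNEncoder / RNNEncoder_LSTM / LSTMEncoder classes used above may add LSTM cells, multiple
# layers, or extra scope arguments.
class RNNEncoderSketch(object):
    def __init__(self, hidden_size, keep_prob):
        self.hidden_size = hidden_size
        self.keep_prob = keep_prob
        self.rnn_cell_fw = tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.GRUCell(hidden_size), input_keep_prob=keep_prob)
        self.rnn_cell_bw = tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.GRUCell(hidden_size), input_keep_prob=keep_prob)

    def build_graph(self, inputs, masks):
        with tf.variable_scope("RNNEncoderSketch"):
            input_lens = tf.reduce_sum(masks, axis=1)  # true (unpadded) length of each example
            (fw_out, bw_out), _ = tf.nn.bidirectional_dynamic_rnn(
                self.rnn_cell_fw, self.rnn_cell_bw, inputs, input_lens, dtype=tf.float32)
            out = tf.concat([fw_out, bw_out], 2)  # (batch_size, seq_len, hidden_size*2)
            return tf.nn.dropout(out, self.keep_prob)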
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
        These are the result of taking (masked) softmax of logits_start and logits_end.
    """

    # Use a RNN to get hidden states for the context and the question
    # Note: here the RNNEncoder is shared (i.e. the weights are the same)
    # between the context and the question.
    if self.FLAGS.self_attention:
        encoder = RNNEncoder(self.FLAGS.hidden_size_encoder, self.keep_prob)
    else:
        encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)

    context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # Use context hidden states to attend to question hidden states
    if self.FLAGS.simple_attention:
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

    if self.FLAGS.co_attention:
        # Send the question hidden states through a fully connected layer to allow for variation
        # between the question embedding space and the document embedding space
        question_hiddens_t = tf.transpose(question_hiddens, perm=[0, 2, 1])  # (batch_size, hidden_size*2, question_len)
        trans_question_hiddens_t = tf.contrib.layers.fully_connected(
            question_hiddens_t, num_outputs=self.FLAGS.question_len, activation_fn=tf.nn.tanh)  # (batch_size, hidden_size*2, question_len)
        trans_question_hiddens = tf.transpose(trans_question_hiddens_t, perm=[0, 2, 1])  # (batch_size, question_len, hidden_size*2)

        # Compute the coattention context
        co_attn_layer = CoAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        co_attn_output = co_attn_layer.build_graph(trans_question_hiddens, self.qn_mask, self.context_mask, context_hiddens)  # (batch_size, context_len, 6*hidden_size)

        # Fuse temporal information into the coattention context via a bidirectional GRU
        with tf.variable_scope("co-attn-encoder"):
            co_attn_encoder = LSTMEncoder(self.FLAGS.hidden_size, self.keep_prob)
            blended_reps_final = co_attn_encoder.build_graph(co_attn_output, self.context_mask)

    if self.FLAGS.self_attention:
        # Implementation of the self-attention from the R-Net paper
        self_attention_encoder = SelfAttn(self.FLAGS.hidden_size_encoder, self.FLAGS.hidden_size_qp, self.FLAGS.hidden_size_pp, self.keep_prob)
        v_p = self_attention_encoder.build_graph_qp(context_hiddens, question_hiddens, self.context_mask, self.qn_mask, self.FLAGS.context_len, self.FLAGS.question_len)
        h_p = self_attention_encoder.build_graph_pp(context_hiddens, question_hiddens, self.context_mask, self.qn_mask, v_p, self.FLAGS.context_len, self.FLAGS.question_len)
        blended_reps_final = tf.concat([context_hiddens, v_p, h_p], axis=2)  # (batch_size, context_len, 5*hidden_size)

    if self.FLAGS.answer_pointer:
        # Implementation of the answer pointer as used in the R-Net paper
        if self.FLAGS.co_attention:
            hidden_size_attn = self.FLAGS.hidden_size * 2
        elif self.FLAGS.self_attention:
            hidden_size_attn = 2 * self.FLAGS.hidden_size_encoder + self.FLAGS.hidden_size_qp + 2 * self.FLAGS.hidden_size_pp
        else:
            hidden_size_attn = self.FLAGS.hidden_size

        answer_decoder = AnswerPointer(self.FLAGS.hidden_size_encoder, hidden_size_attn, self.FLAGS.question_len, self.keep_prob)
        p, logits = answer_decoder.build_graph_answer_pointer(
            question_hiddens, context_hiddens, blended_reps_final,
            self.FLAGS.question_len, self.FLAGS.context_len, self.qn_mask, self.context_mask)

        self.logits_start = logits[0]
        self.probdist_start = p[0]
        self.logits_end = logits[1]
        self.probdist_end = p[1]

    if self.FLAGS.simple_softmax:
        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
def build_graph(self):
    """Builds the main part of the graph for the model.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
        These are the result of taking (masked) softmax of logits_start and logits_end.
    """
    # NOTE CHANGE: concatenate the GloVe and ELMo embeddings.
    # How to handle the ELMo context_len and GloVe context_len mismatch?
    # Just make the context_ids have no max context_len.
    context_embs_concat = tf.concat([self.elmo_context_input, self.context_embs], 2)  # (batch_size, context_len, 1024 + self.FLAGS.embedding_size)
    qn_embs_concat = tf.concat([self.elmo_question_input, self.qn_embs], 2)  # (batch_size, qn_len, 1024 + self.FLAGS.embedding_size)

    # Set shapes so the tensors can be passed to the dynamic LSTM
    context_embs_concat.set_shape((None, None, 1024 + self.FLAGS.embedding_size))
    qn_embs_concat.set_shape((None, None, 1024 + self.FLAGS.embedding_size))
    self.qn_mask.set_shape((None, None))
    self.context_mask.set_shape((None, None))

    with tf.variable_scope("biLSTM"):
        Encoder = RNNEncoder(self.FLAGS.hidden_size, keep_prob=self.keep_prob, cell_type="lstm", input_size=1024 + self.FLAGS.embedding_size)
        # Shared weights (same scope)
        context_hiddens = Encoder.build_graph(context_embs_concat, self.context_mask, scope="context_question_encoder")  # (batch_size, context_len, hidden_size*2)
        question_hiddens = Encoder.build_graph(qn_embs_concat, self.qn_mask, scope="context_question_encoder")  # (batch_size, question_len, hidden_size*2)

    with tf.variable_scope("bidaf"):
        bidaf_object = Bidaf(self.FLAGS.hidden_size * 2, self.keep_prob)
        b = bidaf_object.build_graph(context_hiddens, question_hiddens, self.context_mask, self.qn_mask)  # (batch_size, context_len, hidden_size*8)

    with tf.variable_scope("self_attn_layer"):
        SelfAttn_object = SelfAttn(self.FLAGS.hidden_size, self.FLAGS.hidden_size * 2, self.keep_prob, input_size=self.FLAGS.hidden_size * 2)
        M = SelfAttn_object.build_graph(b, self.context_mask, cell_type="lstm")  # (batch_size, context_len, hidden_size*2)

    # Make prediction
    with tf.variable_scope('prediction_layer'):
        # Encode the self-attended context first
        with tf.variable_scope("final_lstm_layer"):
            final_lstm_object = RNNEncoder(self.FLAGS.hidden_size, keep_prob=self.keep_prob, cell_type="lstm", input_size=self.FLAGS.hidden_size * 2)
            M_prime = final_lstm_object.build_graph(M, self.context_mask, scope="final_lstm")  # (batch_size, context_len, hidden_size*2)

        # Get start distribution
        with tf.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(M_prime, self.context_mask)  # both are (batch_size, context_len)

        with tf.variable_scope("EndDist"):
            logit_start_expand = tf.expand_dims(self.logits_start, axis=2)  # (batch_size, context_len, 1)
            blended_end_rnn_input = tf.concat([logit_start_expand, M_prime], axis=2)  # (batch_size, context_len, hidden_size*2 + 1)

            end_dist_rnn = RNNEncoder(self.FLAGS.hidden_size, keep_prob=self.keep_prob, direction="unidirectional")
            end_rnn_output = end_dist_rnn.build_graph(blended_end_rnn_input, self.context_mask, scope="end_dist_rnn")

            # Get the end dist
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(end_rnn_output, self.context_mask)
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
        These are the result of taking (masked) softmax of logits_start and logits_end.
    """

    # Use a RNN to get hidden states for the context and the question
    # Note: here the RNNEncoder is shared (i.e. the weights are the same)
    # between the context and the question.
    with vs.variable_scope("encoder_layer1", reuse=tf.AUTO_REUSE):
        if self.FLAGS.use_stacked_encoder:
            encoder = StackedRNNEncoder(self.FLAGS.hidden_size, self.FLAGS.num_encoding_layers, self.keep_prob)
        else:
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
        context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
        question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    if self.FLAGS.num_encoding_layers > 1:
        with vs.variable_scope("encoder_layer2", reuse=tf.AUTO_REUSE):
            encoder2 = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            context_hiddens = encoder2.build_graph(context_hiddens, self.context_mask)
            question_hiddens = encoder2.build_graph(question_hiddens, self.qn_mask)

    # Use context hidden states to attend to question hidden states
    if self.FLAGS.bidaf:
        attn_layer = BiDirAttnFlow(self.keep_prob, self.FLAGS.hidden_size*2)
        blended_reps = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)  # (batch_size, context_len, hidden_size*8)
    else:
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size*2, self.FLAGS.hidden_size*2)
        _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)

    # Self-attention layer
    if self.FLAGS.self_attend:
        self_attn_layer = SelfAttn(self.keep_prob, blended_reps.shape[-1], self.FLAGS.self_attend_hidden_sz)
        blended_reps = self_attn_layer.build_graph(blended_reps, self.context_mask)  # (batch_size, context_len, 2*self_attend_hidden_sz)

    # Apply fully connected layer to each blended representation
    # Note, blended_reps_final corresponds to b' in the handout
    # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
    blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

    # TODO: Modeling layer from BiDAF. We can add another RNN (two stacked
    # from the BiDAF paper) to the hidden states from the attention layer.
    if self.FLAGS.modeling_layer:
        with vs.variable_scope("Model_Layer", reuse=tf.AUTO_REUSE):
            model_layer = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            blended_reps_final = model_layer.build_graph(blended_reps_final, self.context_mask)
    if self.FLAGS.modeling_layer and self.FLAGS.num_model_rnn_layers > 1:
        with vs.variable_scope("Model_layer2", reuse=tf.AUTO_REUSE):
            model_layer2 = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            blended_reps_final = model_layer2.build_graph(blended_reps_final, self.context_mask)
    # modeling_layer = StackedRNNEncoder(blended_reps_final.shape[-1], self.FLAGS.num_model_rnn_layers, self.keep_prob)
    # blended_reps_final = modeling_layer.build_graph(blended_reps_final, self.context_mask)

    if self.FLAGS.pointer_network:  # TODO: define flag
        with vs.variable_scope("OutputLayer", reuse=tf.AUTO_REUSE):
            pointer_network = PointerNetwork(self.keep_prob, blended_reps_final.shape[-1].value, self.FLAGS.hidden_size)
            (self.logits_start, self.probdist_start, _, self.logits_end, self.probdist_end, _) = \
                pointer_network.build_graph(blended_reps_final, self.context_mask)
    else:
        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer()
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer()
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
def build_graph(self):
    """Builds the main part of the graph for the model, starting from the input embeddings to the final distributions for the answer span.

    Defines:
      self.logits_start, self.logits_end: Both tensors shape (batch_size, context_len).
        These are the logits (i.e. values that are fed into the softmax function) for the start and end distribution.
        Important: these are -large in the pad locations. Necessary for when we feed into the cross entropy function.
      self.probdist_start, self.probdist_end: Both shape (batch_size, context_len). Each row sums to 1.
        These are the result of taking (masked) softmax of logits_start and logits_end.
    """

    # Use a RNN to get hidden states for the context and the question
    # Note: here the RNNEncoder is shared (i.e. the weights are the same)
    # between the context and the question.
    encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
    context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)  # (batch_size, context_len, hidden_size*2)
    question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)  # (batch_size, question_len, hidden_size*2)

    # Use context hidden states to attend to question hidden states
    if self.FLAGS.model == 'bidaf':
        bidaf_layer = BiDAF(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        g_m, g_m2 = bidaf_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)  # (batch_size, context_len, hidden_size*10)

        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(g_m, self.context_mask)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(g_m2, self.context_mask)

    elif self.FLAGS.model == 'bicoattn':
        bicoattn_layer = BiCoattn(self.FLAGS.batch_size, self.FLAGS.context_len, self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        g_m, g_m2, self.attn_output = bicoattn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)  # (batch_size, context_len, hidden_size*14)

        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(g_m, self.context_mask)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(g_m2, self.context_mask)

    elif self.FLAGS.model == 'transformernetwork':
        transformernetwork_layer = TransformerNetwork(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2, 3, 8)
        g_m, g_m2 = transformernetwork_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask, self.FLAGS.is_training)  # (batch_size, context_len, hidden_size*2)

        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(g_m, self.context_mask)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(g_m2, self.context_mask)

    elif self.FLAGS.model == 'bctn':
        bctn_layer = BCTN(self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2, 2, 8)
        g_m, g_m2 = bctn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask, self.FLAGS.is_training)  # (batch_size, context_len, hidden_size*2)

        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(g_m, self.context_mask)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(g_m2, self.context_mask)

    elif self.FLAGS.model == 'rnet':
        with vs.variable_scope("Contextual"):
            encoder = RNNEncoder(self.FLAGS.hidden_size, self.keep_prob)
            context_hiddens = encoder.build_graph(self.context_embs, self.context_mask)
            question_hiddens = encoder.build_graph(self.qn_embs, self.qn_mask)

        print("GatedAttn")
        with vs.variable_scope("GatedAttn"):
            attn_layer_gated = GatedAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size)
            context_hiddens_gated, self.a_t = attn_layer_gated.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # (batch_size, context_len, hidden_size)

        print("SelfAttn")
        with vs.variable_scope("SelfAttn"):
            attn_layer_self = SelfAttn(self.keep_prob, self.FLAGS.hidden_size, self.FLAGS.hidden_size)
            attn_output_self, self.attn_output = attn_layer_self.build_graph(context_hiddens_gated, self.context_mask)  # (batch_size, context_len, hidden_size*2)

        print("Output")
        with vs.variable_scope("Output"):
            output_layer = Output_Rnet(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size)
            self.logits_start, self.probdist_start, self.logits_end, self.probdist_end, self.a = output_layer.build_graph(attn_output_self, question_hiddens, self.context_mask, self.qn_mask)

    elif self.FLAGS.model == 'basicattnplusone':
        attn_layer = BasicAttnPlusOne(self.FLAGS.batch_size, self.FLAGS.context_len, self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)  # attn_output is shape (batch_size, context_len, hidden_size*4)

        blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)
        blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)

    elif self.FLAGS.model == 'basicattnplustwo':
        attn_layer = BasicAttnPlusTwo(self.FLAGS.batch_size, self.FLAGS.context_len, self.FLAGS.hidden_size, self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        g_m, g_m2 = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens, self.context_mask)  # attn_output is shape (batch_size, context_len, hidden_size*4)

        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(g_m, self.context_mask)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(g_m2, self.context_mask)

    else:
        attn_layer = BasicAttn(self.keep_prob, self.FLAGS.hidden_size * 2, self.FLAGS.hidden_size * 2)
        _, attn_output = attn_layer.build_graph(question_hiddens, self.qn_mask, context_hiddens)  # attn_output is shape (batch_size, context_len, hidden_size*2)

        # Concat attn_output to context_hiddens to get blended_reps
        blended_reps = tf.concat([context_hiddens, attn_output], axis=2)  # (batch_size, context_len, hidden_size*4)

        # Apply fully connected layer to each blended representation
        # Note, blended_reps_final corresponds to b' in the handout
        # Note, tf.contrib.layers.fully_connected applies a ReLU non-linearity here by default
        blended_reps_final = tf.contrib.layers.fully_connected(blended_reps, num_outputs=self.FLAGS.hidden_size)  # blended_reps_final is shape (batch_size, context_len, hidden_size)

        # Use softmax layer to compute probability distribution for start location
        # Note this produces self.logits_start and self.probdist_start, both of which have shape (batch_size, context_len)
        with vs.variable_scope("StartDist"):
            softmax_layer_start = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
            self.logits_start, self.probdist_start = softmax_layer_start.build_graph(blended_reps_final, self.context_mask)

        # Use softmax layer to compute probability distribution for end location
        # Note this produces self.logits_end and self.probdist_end, both of which have shape (batch_size, context_len)
        with vs.variable_scope("EndDist"):
            softmax_layer_end = SimpleSoftmaxLayer(1 - self.FLAGS.dropout)
            self.logits_end, self.probdist_end = softmax_layer_end.build_graph(blended_reps_final, self.context_mask)
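# Several of the branches above (BiDAF, BiDafAttn, BiDirAttnFlow, BidirectionAttn) are variants of
# bidirectional attention flow. As a reference for how the context-to-question (c2q) and
# question-to-context (q2c) summaries are usually formed, here is a sketch of the published BiDAF
# similarity matrix, assuming TensorFlow 1.x imported as tf and statically known hidden dimensions;
# the layer classes used above may combine these outputs differently (e.g. into 6h or 8h tensors).
def bidaf_attention_sketch(c, q, c_mask, q_mask):
    """c: (batch, N, 2h) context states; q: (batch, M, 2h) question states; masks are 1/0."""
    d = c.shape[-1].value  # 2h, must be statically known
    w_sim = tf.get_variable("w_sim", shape=[3 * d], initializer=tf.contrib.layers.xavier_initializer())
    w_c, w_q, w_cq = tf.split(w_sim, 3)
    # S_ij = w_c . c_i + w_q . q_j + w_cq . (c_i * q_j), shape (batch, N, M)
    S = (tf.expand_dims(tf.tensordot(c, w_c, axes=1), 2)
         + tf.expand_dims(tf.tensordot(q, w_q, axes=1), 1)
         + tf.matmul(c * w_cq, tf.transpose(q, perm=[0, 2, 1])))
    # c2q: for each context position, attend over the (unpadded) question positions
    q_exp_mask = (1.0 - tf.cast(tf.expand_dims(q_mask, 1), tf.float32)) * (-1e30)
    alpha = tf.nn.softmax(S + q_exp_mask, 2)                        # (batch, N, M)
    c2q = tf.matmul(alpha, q)                                       # (batch, N, 2h)
    # q2c: attend over context positions using the max similarity of each row
    c_exp_mask = (1.0 - tf.cast(c_mask, tf.float32)) * (-1e30)
    beta = tf.nn.softmax(tf.reduce_max(S, axis=2) + c_exp_mask, 1)  # (batch, N)
    q2c = tf.matmul(tf.expand_dims(beta, 1), c)                     # (batch, 1, 2h)
    return c2q, q2c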
def build_graph(self):
    self._define_embedding()
    self.inputs = tf.placeholder(tf.int32, [None, self.max_length])
    if self.char_embed:
        self.char_inputs = tf.placeholder(tf.int32, [None, self.max_length, self.word_length])
    self.ex_lengths = tf.placeholder(tf.int32, [None])

    # Outputs as usual:
    self.outputs = tf.placeholder(tf.float32, shape=[None, self.output_dim])

    # This converts the inputs to a list of lists of dense vector
    # representations:
    self.feats = tf.nn.embedding_lookup(self.embedding, self.inputs)

    if self.char_embed:
        # shape (?, max_len, word_len, char_embed_dim)
        self.char_feats = tf.nn.embedding_lookup(self.char_embedding, self.char_inputs)
        # shape (? * max_len, word_len, char_embed_dim)
        self.char_feats = tf.reshape(self.char_feats, (-1, self.word_length, self.char_embed_dim))
        self.char_feats = tf.layers.conv1d(self.char_feats, self.num_char_filters, self.char_kernel_size, padding="same")
        # shape (?, max_len, word_len, num_char_filters)
        self.char_feats = tf.reshape(self.char_feats, (-1, self.max_length, self.word_length, self.num_char_filters))
        # shape (?, max_len, num_char_filters)
        self.char_feats = tf.reduce_max(self.char_feats, axis=2)
        # Concatenate char embeddings with word embeddings to get the final representation
        self.feats = tf.concat([self.feats, self.char_feats], 2)

    # Defines the RNN structure:
    self.cell = self.cell_class(self.hidden_dim, activation=self.hidden_activation)
    self.cell = DropoutWrapper(self.cell, self.dropout)

    # If a bidirectional RNN is used, define a second RNN cell.
    # Alternatively we could share cells for fw/bw, but not for now.
    if self.bidir_rnn:
        self.bw_cell = self.cell_class(self.hidden_dim, activation=self.hidden_activation)
        self.bw_cell = DropoutWrapper(self.bw_cell, self.dropout)
    if self.stacked and self.bidir_rnn:
        self.cell2 = self.cell_class(self.hidden_dim, activation=self.hidden_activation)
        self.cell2 = DropoutWrapper(self.cell2, self.dropout)
        self.bw_cell2 = self.cell_class(self.hidden_dim, activation=self.hidden_activation)
        self.bw_cell2 = DropoutWrapper(self.bw_cell2, self.dropout)

    # Run the RNN:
    if self.bidir_rnn:
        with tf.variable_scope("lstm1", reuse=tf.AUTO_REUSE):
            outputs, output_states = tf.nn.bidirectional_dynamic_rnn(
                self.cell, self.bw_cell, self.feats, dtype=tf.float32, sequence_length=self.ex_lengths)
            out = tf.concat(outputs, 1)
        if self.stacked:
            with tf.variable_scope("lstm2", reuse=tf.AUTO_REUSE):
                outputs2, output_states2 = tf.nn.bidirectional_dynamic_rnn(
                    self.cell2, self.bw_cell2, out, dtype=tf.float32, sequence_length=self.ex_lengths)
        # Let the last state be the concatenation of the fw and bw
        # final ``outputs''. Note that output_states[0] is the FW
        # (c, h) pair, and output_states[1] is the BW (c, h) pair, where
        # c is the cell state and h is the hidden state (the output).
        if self.stacked:
            self.last = tf.concat([output_states2[0][1], output_states2[1][1]], 1)
        else:
            if self.cell_class == tf.nn.rnn_cell.LSTMCell:
                self.last = tf.concat([output_states[0][1], output_states[1][1]], 1)
            elif self.cell_class == tf.nn.rnn_cell.GRUCell:
                self.last = tf.concat(output_states, 1)
    else:
        outputs, state = tf.nn.dynamic_rnn(
            self.cell, self.feats, dtype=tf.float32, sequence_length=self.ex_lengths)

    # Attention layer
    if self.self_attend:
        out = tf.concat(outputs, 1)
        print(out)
        self_attn_layer = SelfAttn(self.dropout, out.shape[-1], self.attention_dim)
        outputs = self_attn_layer.build_graph(out, self.ex_lengths)

    # How can I be sure that I have found the last true state? This
    # first option seems to work for all cell types but sometimes
    # leads to indexing errors and is in general pretty complex:
    #
    # self.last = self._get_last_non_masked(outputs, self.ex_lengths)
    #
    # This option is more reliable, but is it definitely getting
    # the final true state?
    #
    # Note that we set self.last above for the BiDir RNN case.
    if not self.bidir_rnn:
        self.last = self._get_final_state(self.cell, state)

    # Softmax classifier on the final hidden state:
    if self.bidir_rnn:
        self.W_hy = self.weight_init(2 * self.hidden_dim, self.output_dim, 'W_hy')
    else:
        self.W_hy = self.weight_init(self.hidden_dim, self.output_dim, 'W_hy')
    self.b_y = self.bias_init(self.output_dim, 'b_y')
    self.model = tf.matmul(self.last, self.W_hy) + self.b_y
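# The comment block above asks how to reliably recover the last non-padded timestep. One common
# option, shown here purely as an illustration (it is not the _get_final_state / _get_last_non_masked
# helpers this class actually uses), is to gather the output at index ex_lengths - 1 for each example
# with tf.gather_nd:
def last_relevant_output_sketch(outputs, ex_lengths):
    """outputs: (batch_size, max_length, hidden_dim); ex_lengths: (batch_size,) int32 true lengths."""
    batch_size = tf.shape(outputs)[0]
    indices = tf.stack([tf.range(batch_size), ex_lengths - 1], axis=1)  # (batch_size, 2)
    return tf.gather_nd(outputs, indices)  # (batch_size, hidden_dim)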
def build_graph():

    def bilm_build_graph(options_file, weight_file):
        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(options_file, weight_file)

        # Get ops to compute the LM embeddings.
        context_embeddings_op = bilm(context_elmo)
        question_embeddings_op = bilm(question_elmo)

        # Get an op to compute ELMo (weighted average of the internal biLM layers).
        # Our SQuAD model includes ELMo at both the input and output layers
        # of the task GRU, so we need 4x ELMo representations for the question
        # and context at each of the input and output.
        # We use the same ELMo weights for both the question and context
        # at each of the input and output.
        elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)['weighted_op']
        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            elmo_question_input = weight_layers('input', question_embeddings_op, l2_coef=0.0)['weighted_op']

        """
        elmo_context_output = weight_layers(
            'output', context_embeddings_op, l2_coef=0.0
        )['weighted_op']
        with tf.variable_scope('', reuse=True):
            # the reuse=True scope reuses weights from the context for the question
            elmo_question_output = weight_layers(
                'output', question_embeddings_op, l2_coef=0.0
            )
        """

        return elmo_context_input, elmo_question_input

    def add_embedding_layer(emb_matrix):
        with tf.variable_scope("embeddings"):
            # Set to a constant so it's untrainable
            embedding_matrix = tf.constant(emb_matrix, dtype=tf.float32, name="emb_matrix")  # shape (400002, embedding_size)

            # Get the word embeddings for the context and question
            context_embs = tf.nn.embedding_lookup(embedding_matrix, context_ids)
            qn_embs = tf.nn.embedding_lookup(embedding_matrix, qn_ids)
        return context_embs, qn_embs

    # Start the main graph
    context_embs, qn_embs = add_embedding_layer(emb_matrix)
    elmo_context_input, elmo_question_input = bilm_build_graph(options_file, weight_file)

    context_embs_concat = tf.concat([elmo_context_input, context_embs], 2)  # (2, context_len, 1074)
    qn_embs_concat = tf.concat([elmo_question_input, qn_embs], 2)  # (2, question_len, 1074)

    context_embs_concat.set_shape((None, None, 1074))
    qn_embs_concat.set_shape((None, None, 1074))
    qn_mask.set_shape((None, None))
    context_mask.set_shape((None, None))

    with tf.variable_scope("biLSTM"):
        print("Starting biLSTM...")
        LSTMencoder_context = RNNEncoder(hidden_size, keep_prob=keep_prob, cell_type="lstm", input_size=1074)
        LSTMencoder_question = RNNEncoder(hidden_size, keep_prob=keep_prob, cell_type="lstm", input_size=1074)
        # Shared weights
        context_hiddens = LSTMencoder_context.build_graph(context_embs_concat, context_mask, scope="context_question_encoder", reuse=False)
        question_hiddens = LSTMencoder_question.build_graph(qn_embs_concat, qn_mask, scope="context_question_encoder", reuse=True)

    with tf.variable_scope("bidaf_layer"):
        print("Starting bidaf...")
        bidaf_object = Bidaf(hidden_size * 2, keep_prob)
        b = bidaf_object.build_graph(context_hiddens, question_hiddens, context_mask, qn_mask)

    with tf.variable_scope("self_attn_layer"):
        SelfAttn_object = SelfAttn(hidden_size, hidden_size * 2, keep_prob, input_size=hidden_size * 2)
        M = SelfAttn_object.build_graph(b, context_mask, cell_type="lstm")  # (batch_size, context_len, hidden_size*2)

    with tf.variable_scope("final_lstm_layer"):
        final_lstm_object = RNNEncoder(hidden_size, keep_prob=keep_prob, cell_type="lstm", input_size=hidden_size * 2)
        M_prime = final_lstm_object.build_graph(M, context_mask, scope="final_lstm", reuse=False)

    with tf.variable_scope("StartDist"):
        softmax_layer_start = SimpleSoftmaxLayer()
        logits_start, probdist_start = softmax_layer_start.build_graph(M_prime, context_mask)

    with tf.variable_scope("EndDist"):
        softmax_layer_end = SimpleSoftmaxLayer()
        logits_end, probdist_end = softmax_layer_end.build_graph(M_prime, context_mask)

    return logits_start, probdist_start, logits_end, probdist_end
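# build_graph above closes over context_elmo / question_elmo, which the bilm package expects to be
# character-id tensors. As an illustration of how those ids are typically produced with the standard
# bilm-tf API (the vocab_file path and the tokenized batches below are assumptions, not shown in this
# file), one might write:
#
#   from bilm import Batcher
#   batcher = Batcher(vocab_file, 50)  # 50 = max characters per token
#   context_elmo = tf.placeholder('int32', shape=(None, None, 50))
#   question_elmo = tf.placeholder('int32', shape=(None, None, 50))
#   ...
#   feed_dict = {
#       context_elmo: batcher.batch_sentences(tokenized_context_batch),
#       question_elmo: batcher.batch_sentences(tokenized_question_batch),
#   }
#
# The hard-coded 1074 above then corresponds to the 1024-dimensional ELMo vectors concatenated with
# the 50-dimensional word embeddings taken from emb_matrix.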