def forward(self, premise_x, hypothesis_x, \
                pre_pos, hyp_pos, premise_char_vectors, hypothesis_char_vectors, \
                premise_exact_match, hypothesis_exact_match):
        prem_seq_lengths, prem_mask = blocks.length(premise_x)  # mask [N, L , 1]
        hyp_seq_lengths, hyp_mask = blocks.length(hypothesis_x)    	

        premise_in = F.dropout(self.emb(premise_x), p=self.dropout_rate, training=self.training)
        hypothesis_in = F.dropout(self.emb(hypothesis_x), p=self.dropout_rate, training=self.training)

        conv_pre, conv_hyp = self.char_emb(premise_char_vectors, hypothesis_char_vectors)

        premise_in = torch.cat([premise_in, conv_pre], 2) #[70, 48, 300], [70, 48, 100] --> [70,48,400]
        hypothesis_in = torch.cat([hypothesis_in, conv_hyp], 2)

        premise_in = torch.cat([premise_in, pre_pos], 2) # 70*48*447
        hypothesis_in = torch.cat([hypothesis_in, hyp_pos], 2)

        premise_exact_match = torch.unsqueeze(premise_exact_match,2) #70*48*1
        premise_in = torch.cat([premise_in, premise_exact_match], 2) #70*48*448
        hypothesis_exact_match = torch.unsqueeze(hypothesis_exact_match,2)
        hypothesis_in = torch.cat([hypothesis_in, hypothesis_exact_match], 2) #70*48*448
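        # Note: each token is now represented by its word embedding (300),
        # char-CNN features (100), POS features (47) and a binary exact-match
        # flag (1), i.e. 448 dims per token, matching the shape comments above.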
        

        premise_in = highway_network(self.highway_network_linear, premise_in, self.config.highway_num_layers, True, wd=self.config.wd, is_train = self.training)    
        hypothesis_in = highway_network(self.highway_network_linear, hypothesis_in, self.config.highway_num_layers, True, wd=self.config.wd, is_train = self.training)

        pre = premise_in  #[70, 48, 448]
        hyp = hypothesis_in
        
        for i in range(self.config.self_att_enc_layers):
            pre = self_attention_layer(self.self_attention_linear_p, self.fuse_gate_linear_p1, self.fuse_gate_linear_p2, self.fuse_gate_linear_p3, self.fuse_gate_linear_p4, self.fuse_gate_linear_p5, self.fuse_gate_linear_p6, self.config, self.training, pre, input_drop_prob=self.dropout_rate, p_mask=prem_mask) # [N, len, dim]    
            hyp = self_attention_layer(self.self_attention_linear_h, self.fuse_gate_linear_h1, self.fuse_gate_linear_h2, self.fuse_gate_linear_h3, self.fuse_gate_linear_h4, self.fuse_gate_linear_h5, self.fuse_gate_linear_h6, self.config, self.training, hyp, input_drop_prob=self.dropout_rate, p_mask=hyp_mask)  # mask the hypothesis with its own mask

        bi_att_mx = bi_attention_mx(self.config, self.training, pre, hyp, p_mask=prem_mask, h_mask=hyp_mask) # [N, PL, HL] 70,448,48,48

        bi_att_mx = F.dropout(bi_att_mx, p=self.dropout_rate, training=self.training)

        fm = self.interaction_cnn(bi_att_mx) # [70, 134, 48, 48]
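        # Note: bi_att_mx holds the element-wise interaction of every
        # premise/hypothesis position pair ([N, d, PL, HL]); judging by the
        # shape comments, interaction_cnn only scales the channel dimension
        # down (448 -> 134) while preserving the 48x48 interaction map before
        # the DenseNet feature extractor.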
  
        if self.config.first_scale_down_layer_relu:
            fm = F.relu(fm)

        premise_final = self.dense_net(fm)

        premise_final = premise_final.view(self.config.batch_size, -1)
        print("premise_final", premise_final.size())
        logits = linear(self.final_linear, [premise_final], self.pred_size ,True, bias_start=0.0, squeeze=False, wd=self.config.wd, input_drop_prob=self.config.keep_rate,
                                is_train=self.training)

        return logits
# Example #2
    def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
        ## Define hyperparameters
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.attention_size = 128
        self.mlp_size = self.dim
        self.sequence_length = seq_length
        self.lam = 0.01
        self.epsilon = 1e-10

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
        self.hypothesis_x = tf.placeholder(tf.int32,
                                           [None, self.sequence_length])
        self.y = tf.placeholder(tf.int32, [None])
        self.keep_rate_ph = tf.placeholder(tf.float32, [])

        ## Define parameters
        self.E = tf.Variable(embeddings, trainable=emb_train)

        self.W_mlp = tf.Variable(
            tf.random_normal([self.dim * 4, self.mlp_size], stddev=0.1))
        self.b_mlp = tf.Variable(tf.random_normal([self.mlp_size], stddev=0.1))

        ## Function for embedding lookup and dropout at embedding layer
        def emb_drop(x):
            emb = tf.nn.embedding_lookup(self.E, x)
            emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, prem_mask = blocks.length(self.premise_x)
        hyp_seq_lengths, hyp_mask = blocks.length(self.hypothesis_x)

        ### BiLSTM layer ###
        premise_in = emb_drop(self.premise_x)
        hypothesis_in = emb_drop(self.hypothesis_x)

        ############# MY CODE STARTS ########

        premise_outs, premise_final = blocks.biLSTM(premise_in,
                                                    dim=self.dim,
                                                    seq_len=prem_seq_lengths,
                                                    name='premise')
        attention_outs_pre, self.alphas_pre = blocks.attention(
            premise_outs,
            self.attention_size,
            return_alphas=True,
            mask=tf.squeeze(prem_mask))
        drop_pre = tf.nn.dropout(attention_outs_pre, self.keep_rate_ph)
        #drop_pre = attention_outs_pre

        hypothesis_outs, hypothesis_final = blocks.biLSTM(
            hypothesis_in,
            dim=self.dim,
            seq_len=hyp_seq_lengths,
            name='hypothesis')
        attention_outs_hyp, self.alphas_hyp = blocks.attention(
            hypothesis_outs,
            self.attention_size,
            return_alphas=True,
            mask=tf.squeeze(hyp_mask))
        drop_hyp = tf.nn.dropout(attention_outs_hyp, self.keep_rate_ph)
        #drop_hyp = attention_outs_hyp

        # Concatenate the premise and hypothesis attention outputs
        drop = tf.concat([drop_pre, drop_hyp], axis=1)
        h_mlp = tf.nn.relu(tf.matmul(drop, self.W_mlp) + self.b_mlp)

        ############# MY CODE ENDS ########

        ############# Hex Part #########
        ############  MY CODE STARTS #########

        attention_outs_pre_hex, self.alphas_pre_hex = blocks.attention(
            premise_outs,
            self.attention_size,
            return_alphas=True,
            mask=tf.squeeze(prem_mask))
        drop_pre_hex = tf.nn.dropout(attention_outs_pre_hex, self.keep_rate_ph)
        #drop_pre = attention_outs_pre

        attention_outs_hyp_hex, self.alphas_hyp_hex = blocks.attention(
            hypothesis_outs,
            self.attention_size,
            return_alphas=True,
            mask=tf.squeeze(hyp_mask))
        drop_hyp_hex = tf.nn.dropout(attention_outs_hyp_hex, self.keep_rate_ph)
        #drop_hyp = attention_outs_hyp

        # Concatenate the premise and hypothesis attention outputs (hex branch)
        bag_of_word_in = tf.concat([drop_pre_hex, drop_hyp_hex], axis=1)

        # Hex component inputs

        h_fc1 = h_mlp  # (?, 300)

        h_fc2 = bag_of_word_in  # (?, 1200)

        # Hex layer definition
        self.W_cl_1 = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
        self.W_cl_2 = tf.Variable(tf.random_normal([1200, 3]), trainable=True)
        self.b_cl = tf.Variable(tf.random_normal((3, )), trainable=True)
        self.W_cl = tf.concat([self.W_cl_1, self.W_cl_2], 0)

        # Compute prediction using  [h_fc1, 0(pad)]
        pad = tf.zeros_like(h_fc2, tf.float32)
        # print(pad.shape) -> (?, 1200)

        yconv_contact_pred = tf.nn.dropout(tf.concat([h_fc1, pad], 1),
                                           self.keep_rate_ph)
        y_conv_pred = tf.matmul(yconv_contact_pred, self.W_cl) + self.b_cl

        self.logits = y_conv_pred  # Prediction
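        # Note: at prediction time the superficial branch h_fc2 is replaced by
        # zeros, so the logits above depend only on the main branch h_fc1
        # through the shared classifier weights W_cl.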

        # Compute loss using [h_fc1, h_fc2] and [0(pad2), h_fc2]
        pad2 = tf.zeros_like(h_fc1, tf.float32)

        yconv_contact_H = tf.nn.dropout(tf.concat([pad2, h_fc2], 1),
                                        self.keep_rate_ph)
        y_conv_H = tf.matmul(yconv_contact_H, self.W_cl) + self.b_cl  # get Fg

        yconv_contact_loss = tf.nn.dropout(tf.concat([h_fc1, h_fc2], 1),
                                           self.keep_rate_ph)
        y_conv_loss = tf.matmul(yconv_contact_loss,
                                self.W_cl) + self.b_cl  # get Fb

        self.temp = y_conv_H
        temp = tf.matmul(y_conv_H, y_conv_H, transpose_a=True)

        y_conv_loss = y_conv_loss - tf.matmul(
            tf.matmul(tf.matmul(y_conv_H, tf.matrix_inverse(temp)),
                      y_conv_H,
                      transpose_b=True), y_conv_loss)  # get loss

        cost_logits = y_conv_loss
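        # Illustrative sketch (not part of the model): the subtraction above is
        # the HEX-style orthogonal projection
        #     Fb_proj = (I - Fg (Fg^T Fg)^{-1} Fg^T) Fb,
        # which removes from Fb (y_conv_loss) any component lying in the column
        # space of Fg (y_conv_H). The helper below is a minimal NumPy check of
        # that identity; it is defined but never called.
        def _hex_projection_sketch():
            import numpy as np
            Fg = np.random.randn(8, 3)   # stands in for y_conv_H
            Fb = np.random.randn(8, 3)   # stands in for y_conv_loss
            proj = Fg @ np.linalg.inv(Fg.T @ Fg) @ Fg.T
            Fb_proj = Fb - proj @ Fb
            # no component of the projected logits remains along Fg
            assert np.allclose(Fg.T @ Fb_proj, 0.0, atol=1e-8)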

        # Regularize hex attention
        alphas_pre_loss_hex = self.alphas_pre_hex + self.epsilon
        alphas_hyp_loss_hex = self.alphas_hyp_hex + self.epsilon
        reg1 = tf.reduce_mean(-tf.reduce_sum(
            alphas_pre_loss_hex * tf.log(alphas_pre_loss_hex), axis=1))
        reg2 = tf.reduce_mean(-tf.reduce_sum(
            alphas_hyp_loss_hex * tf.log(alphas_hyp_loss_hex), axis=1))
        reg = reg1 + reg2

        self.total_cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y,
                                                           logits=y_conv_loss))
# Example #3
    def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
        ## Define hyperparameters
        lambd = 0.05

        ## note: embedding_dim and hidden_dim are both 300, used interchangeably
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.sequence_length = seq_length

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
        self.hypothesis_x = tf.placeholder(tf.int32,
                                           [None, self.sequence_length])
        self.y = tf.placeholder(tf.int32, [None])
        self.keep_rate_ph = tf.placeholder(tf.float32, [])

        ## Define parameters
        self.E = tf.Variable(embeddings, trainable=emb_train)

        self.W_mlp = tf.Variable(
            tf.random_normal([self.dim * 8, self.dim], stddev=0.1))
        self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1))

        self.W_cl = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
        self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

        ## Function for embedding lookup and dropout at embedding layer
        def emb_drop(x):
            emb = tf.nn.embedding_lookup(self.E, x)
            emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, mask_prem = blocks.length(self.premise_x)
        hyp_seq_lengths, mask_hyp = blocks.length(self.hypothesis_x)

        ### First biLSTM layer ###

        premise_in = emb_drop(self.premise_x)
        hypothesis_in = emb_drop(self.hypothesis_x)

        premise_outs, c1 = blocks.biLSTM(premise_in,
                                         dim=self.dim,
                                         seq_len=prem_seq_lengths,
                                         name='premise')
        hypothesis_outs, c2 = blocks.biLSTM(hypothesis_in,
                                            dim=self.dim,
                                            seq_len=hyp_seq_lengths,
                                            name='hypothesis')

        premise_bi = tf.concat(premise_outs, axis=2)
        hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

        premise_list = tf.unstack(premise_bi, axis=1)
        hypothesis_list = tf.unstack(hypothesis_bi, axis=1)

        ### Attention ###

        scores_all = []
        premise_attn = []
        alphas = []
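        # Note: score_ij below is the dot product between BiLSTM states,
        # e_ij = premise_i . hypothesis_j; alpha_i is a masked softmax of these
        # scores over hypothesis positions, and a_tilde_i is the resulting
        # attention-weighted summary of the hypothesis (ESIM soft alignment).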

        for i in range(self.sequence_length):

            scores_i_list = []
            for j in range(self.sequence_length):
                score_ij = tf.reduce_sum(tf.multiply(premise_list[i],
                                                     hypothesis_list[j]),
                                         1,
                                         keep_dims=True)
                scores_i_list.append(score_ij)

            scores_i = tf.stack(scores_i_list, axis=1)
            alpha_i = blocks.masked_softmax(scores_i, mask_hyp)
            a_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, hypothesis_bi), 1)
            premise_attn.append(a_tilde_i)

            scores_all.append(scores_i)
            alphas.append(alpha_i)

        scores_stack = tf.stack(scores_all, axis=2)
        scores_list = tf.unstack(scores_stack, axis=1)

        hypothesis_attn = []
        betas = []
        for j in range(self.sequence_length):
            scores_j = scores_list[j]
            beta_j = blocks.masked_softmax(scores_j, mask_prem)
            b_tilde_j = tf.reduce_sum(tf.multiply(beta_j, premise_bi), 1)
            hypothesis_attn.append(b_tilde_j)

            betas.append(beta_j)

        # Make attention-weighted sentence representations into one tensor,
        premise_attns = tf.stack(premise_attn, axis=1)
        hypothesis_attns = tf.stack(hypothesis_attn, axis=1)

        # For making attention plots,
        self.alpha_s = tf.stack(alphas, axis=2)
        self.beta_s = tf.stack(betas, axis=2)

        ### Subcomponent Inference ###

        prem_diff = tf.subtract(premise_bi, premise_attns)
        prem_mul = tf.multiply(premise_bi, premise_attns)
        hyp_diff = tf.subtract(hypothesis_bi, hypothesis_attns)
        hyp_mul = tf.multiply(hypothesis_bi, hypothesis_attns)

        m_a = tf.concat([premise_bi, premise_attns, prem_diff, prem_mul], 2)
        m_b = tf.concat([hypothesis_bi, hypothesis_attns, hyp_diff, hyp_mul],
                        2)

        ### Inference Composition ###

        v1_outs, c3 = blocks.biLSTM(m_a,
                                    dim=self.dim,
                                    seq_len=prem_seq_lengths,
                                    name='v1')
        v2_outs, c4 = blocks.biLSTM(m_b,
                                    dim=self.dim,
                                    seq_len=hyp_seq_lengths,
                                    name='v2')

        v1_bi = tf.concat(v1_outs, axis=2)
        v2_bi = tf.concat(v2_outs, axis=2)

        ### Pooling Layer ###

        v_1_sum = tf.reduce_sum(v1_bi, 1)
        v_1_ave = tf.div(
            v_1_sum, tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1))

        v_2_sum = tf.reduce_sum(v2_bi, 1)
        v_2_ave = tf.div(
            v_2_sum, tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1))

        v_1_max = tf.reduce_max(v1_bi, 1)
        v_2_max = tf.reduce_max(v2_bi, 1)

        v = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max], 1)
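        # Note: each pooled vector is 2*dim wide (forward + backward states),
        # so v is 8*dim wide, matching W_mlp's input size of self.dim * 8.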

        # MLP layer
        h_mlp = tf.nn.tanh(tf.matmul(v, self.W_mlp) + self.b_mlp)

        ############### MY CODE STARTS #####

        # Define layer size
        self.bow_layer_size = 600

        # LSTM layer (final layer of the original esim model)
        h_fc1 = h_mlp

        # Bag-of-word input (averaging word embeddings)
        bow_pre = premise_in
        bow_hyp = hypothesis_in
        # print(bow_pre.shape) -> (?, 50, 300)
        bag_of_word_pre = tf.reduce_mean(bow_pre, 1)
        bag_of_word_hyp = tf.reduce_mean(bow_hyp, 1)
        # print(bag_of_word_pre.shape) -> (?, 300)
        bag_of_word_in = tf.concat([bag_of_word_pre, bag_of_word_hyp], 1)
        # print(bag_of_word_in.shape) -> (?, 600)

        # Bag-of-word input layer params
        h_fc2 = bag_of_word_in
        # print( h_fc2.shape) -> (?, 600)

        # Bag-of-word output layer params
        weights_from_split = np.load(
            "../../rearrangingDS/rearranged_even_seqlen50/weights.npy")
        # (600, 3)
        bias_from_split = np.load(
            "../../rearrangingDS/rearranged_even_seqlen50/bias.npy")
        # (3,)

        self.W_cl_1 = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
        self.W_cl_2 = tf.Variable(tf.random_normal([600, 3]), trainable=True)
        self.b_cl = tf.Variable(tf.random_normal((3, )), trainable=True)
        self.W_cl = tf.concat([self.W_cl_1, self.W_cl_2], 0)

        reg = lambd * tf.reduce_sum(tf.abs(self.W_cl_2)) / (2 * 50)
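        # Note: reg is an L1 penalty on the bag-of-words half of the classifier
        # weights (W_cl_2 only), scaled by lambd; it is added to the
        # cross-entropy cost below.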

        # Compute prediction using  [h_fc1, 0(pad)]
        pad = tf.zeros_like(h_fc2, tf.float32)
        # print(pad.shape) -> (?, 600)

        yconv_contact_pred = tf.nn.dropout(tf.concat([h_fc1, pad], 1),
                                           self.keep_rate_ph)
        y_conv_pred = tf.matmul(yconv_contact_pred, self.W_cl) + self.b_cl

        self.logits = y_conv_pred  # Prediction

        # Compute loss using [h_fc1, h_fc2] and [0(pad2), h_fc2]
        pad2 = tf.zeros_like(h_fc1, tf.float32)

        yconv_contact_H = tf.nn.dropout(tf.concat([pad2, h_fc2], 1),
                                        self.keep_rate_ph)
        y_conv_H = tf.matmul(yconv_contact_H, self.W_cl) + self.b_cl  # get Fg

        yconv_contact_loss = tf.nn.dropout(tf.concat([h_fc1, h_fc2], 1),
                                           self.keep_rate_ph)
        y_conv_loss = tf.matmul(yconv_contact_loss,
                                self.W_cl) + self.b_cl  # get Fb

        y_conv_loss = y_conv_loss - tf.matmul(
            tf.matmul(tf.matmul(
                y_conv_H,
                tf.matrix_inverse(
                    tf.matmul(y_conv_H, y_conv_H, transpose_a=True))),
                      y_conv_H,
                      transpose_b=True), y_conv_loss)  # get loss

        cost_logits = y_conv_loss
        self.total_cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.y, logits=y_conv_loss)) + reg  # Cost
# Example #4
    def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
        ## Define hyperparameters
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.attention_size = 128
        self.mlp_size = self.dim
        self.sequence_length = seq_length
        self.lam = 0.01
        self.epsilon = 1e-10

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
        self.hypothesis_x = tf.placeholder(tf.int32,
                                           [None, self.sequence_length])
        self.y = tf.placeholder(tf.int32, [None])
        self.keep_rate_ph = tf.placeholder(tf.float32, [])

        ## Define parameters
        self.E = tf.Variable(embeddings, trainable=emb_train)

        self.W_mlp = tf.Variable(
            tf.random_normal([self.dim * 4, self.mlp_size], stddev=0.1))
        self.b_mlp = tf.Variable(tf.random_normal([self.mlp_size], stddev=0.1))

        self.W_cl = tf.Variable(
            tf.random_normal([self.mlp_size, 3], stddev=0.1))
        self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

        ## Function for embedding lookup and dropout at embedding layer
        def emb_drop(x):
            emb = tf.nn.embedding_lookup(self.E, x)
            emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, prem_mask = blocks.length(self.premise_x)
        hyp_seq_lengths, hyp_mask = blocks.length(self.hypothesis_x)

        ### BiLSTM layer ###
        premise_in = emb_drop(self.premise_x)
        hypothesis_in = emb_drop(self.hypothesis_x)

        ############# MY CODE STARTS ########

        premise_outs, premise_final = blocks.biLSTM(premise_in,
                                                    dim=self.dim,
                                                    seq_len=prem_seq_lengths,
                                                    name='premise')
        attention_outs_pre, self.alphas_pre = blocks.attention(
            premise_outs, self.attention_size, return_alphas=True)
        drop_pre = tf.nn.dropout(attention_outs_pre, self.keep_rate_ph)
        #drop_pre = attention_outs_pre

        hypothesis_outs, hypothesis_final = blocks.biLSTM(
            hypothesis_in,
            dim=self.dim,
            seq_len=hyp_seq_lengths,
            name='hypothesis')
        attention_outs_hyp, self.alphas_hyp = blocks.attention(
            hypothesis_outs, self.attention_size, return_alphas=True)
        drop_hyp = tf.nn.dropout(attention_outs_hyp, self.keep_rate_ph)
        #drop_hyp = attention_outs_hyp

        # Concatenate the premise and hypothesis attention outputs
        drop = tf.concat([drop_pre, drop_hyp], axis=1)

        # Add a small constant
        alphas_pre_loss = self.alphas_pre * tf.squeeze(
            prem_mask) + self.epsilon
        alphas_hyp_loss = self.alphas_hyp * tf.squeeze(hyp_mask) + self.epsilon

        # Calculate entropy
        reg1 = tf.reduce_mean(
            -tf.reduce_sum(alphas_pre_loss * tf.log(alphas_pre_loss), axis=1))
        reg2 = tf.reduce_mean(
            -tf.reduce_sum(alphas_hyp_loss * tf.log(alphas_hyp_loss), axis=1))
        reg = reg1 + reg2
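        # Illustrative sketch (not part of the model): reg is the
        # epsilon-smoothed Shannon entropy of the masked attention weights,
        # averaged over the batch. Because the cost below adds self.lam * reg,
        # minimizing it also pushes the attention distributions toward peaked
        # (low-entropy) ones. The helper is defined but never called.
        def _attention_entropy_sketch():
            import numpy as np

            def entropy(a):
                return -np.sum(a * np.log(a + 1e-10))

            uniform = np.full(4, 0.25)                   # maximal entropy, ~log(4)
            peaked = np.array([0.97, 0.01, 0.01, 0.01])  # low entropy
            assert entropy(peaked) < entropy(uniform)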

        # MLP layer
        h_mlp = tf.nn.relu(tf.matmul(drop, self.W_mlp) + self.b_mlp)

        ############# MY CODE ENDS ########

        # Get prediction
        self.logits = tf.matmul(h_mlp, self.W_cl) + self.b_cl

        # Define the cost function
        self.total_cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.y, logits=self.logits) + self.lam * reg)
# Example #5
    def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
        ## Define hyperparameters
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.sequence_length = seq_length

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
        self.hypothesis_x = tf.placeholder(tf.int32,
                                           [None, self.sequence_length])
        self.y = tf.placeholder(tf.int32, [None])
        self.keep_rate_ph = tf.placeholder(tf.float32, [])

        ## Define parameters
        self.E = tf.Variable(embeddings, trainable=emb_train)

        self.W_mlp = tf.Variable(
            tf.random_normal([self.dim * 8, self.dim], stddev=0.1))
        self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1))

        self.W_cl = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
        self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

        ## Function for embedding lookup and dropout at embedding layer
        def emb_drop(x):
            emb = tf.nn.embedding_lookup(self.E, x)
            emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, mask_prem = blocks.length(self.premise_x)
        hyp_seq_lengths, mask_hyp = blocks.length(self.hypothesis_x)

        ### First biLSTM layer ###

        premise_in = emb_drop(self.premise_x)
        hypothesis_in = emb_drop(self.hypothesis_x)

        premise_outs, c1 = blocks.biLSTM(premise_in,
                                         dim=self.dim,
                                         seq_len=prem_seq_lengths,
                                         name='premise')
        hypothesis_outs, c2 = blocks.biLSTM(hypothesis_in,
                                            dim=self.dim,
                                            seq_len=hyp_seq_lengths,
                                            name='hypothesis')

        premise_bi = tf.concat(premise_outs, axis=2)
        hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

        premise_list = tf.unstack(premise_bi, axis=1)
        hypothesis_list = tf.unstack(hypothesis_bi, axis=1)

        ### Attention ###

        scores_all = []
        premise_attn = []
        alphas = []

        for i in range(self.sequence_length):

            scores_i_list = []
            for j in range(self.sequence_length):
                score_ij = tf.reduce_sum(tf.multiply(premise_list[i],
                                                     hypothesis_list[j]),
                                         1,
                                         keep_dims=True)
                scores_i_list.append(score_ij)

            scores_i = tf.stack(scores_i_list, axis=1)
            alpha_i = blocks.masked_softmax(scores_i, mask_hyp)
            a_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, hypothesis_bi), 1)
            premise_attn.append(a_tilde_i)

            scores_all.append(scores_i)
            alphas.append(alpha_i)

        scores_stack = tf.stack(scores_all, axis=2)
        scores_list = tf.unstack(scores_stack, axis=1)

        hypothesis_attn = []
        betas = []
        for j in range(self.sequence_length):
            scores_j = scores_list[j]
            beta_j = blocks.masked_softmax(scores_j, mask_prem)
            b_tilde_j = tf.reduce_sum(tf.multiply(beta_j, premise_bi), 1)
            hypothesis_attn.append(b_tilde_j)

            betas.append(beta_j)

        # Make attention-weighted sentence representations into one tensor,
        premise_attns = tf.stack(premise_attn, axis=1)
        hypothesis_attns = tf.stack(hypothesis_attn, axis=1)

        # For making attention plots,
        self.alpha_s = tf.stack(alphas, axis=2)
        self.beta_s = tf.stack(betas, axis=2)

        ### Subcomponent Inference ###

        prem_diff = tf.subtract(premise_bi, premise_attns)
        prem_mul = tf.multiply(premise_bi, premise_attns)
        hyp_diff = tf.subtract(hypothesis_bi, hypothesis_attns)
        hyp_mul = tf.multiply(hypothesis_bi, hypothesis_attns)

        m_a = tf.concat([premise_bi, premise_attns, prem_diff, prem_mul], 2)
        m_b = tf.concat([hypothesis_bi, hypothesis_attns, hyp_diff, hyp_mul],
                        2)

        ### Inference Composition ###

        v1_outs, c3 = blocks.biLSTM(m_a,
                                    dim=self.dim,
                                    seq_len=prem_seq_lengths,
                                    name='v1')
        v2_outs, c4 = blocks.biLSTM(m_b,
                                    dim=self.dim,
                                    seq_len=hyp_seq_lengths,
                                    name='v2')

        v1_bi = tf.concat(v1_outs, axis=2)
        v2_bi = tf.concat(v2_outs, axis=2)

        ### Pooling Layer ###

        v_1_sum = tf.reduce_sum(v1_bi, 1)
        v_1_ave = tf.div(
            v_1_sum, tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1))

        v_2_sum = tf.reduce_sum(v2_bi, 1)
        v_2_ave = tf.div(
            v_2_sum, tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1))

        v_1_max = tf.reduce_max(v1_bi, 1)
        v_2_max = tf.reduce_max(v2_bi, 1)

        v = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max], 1)

        # MLP layer
        h_mlp = tf.nn.tanh(tf.matmul(v, self.W_mlp) + self.b_mlp)

        # Dropout applied to classifier
        h_drop = tf.nn.dropout(h_mlp, self.keep_rate_ph)

        # Get prediction
        self.logits = tf.matmul(h_drop, self.W_cl) + self.b_cl

        # Define the cost function
        self.total_cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y,
                                                           logits=self.logits))
# Example #6
    def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
        ## Define hyperparameters
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.sequence_length = seq_length

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
        self.hypothesis_x = tf.placeholder(tf.int32,
                                           [None, self.sequence_length])
        self.premise_pos = tf.placeholder(tf.int32,
                                          [None, self.sequence_length, 47],
                                          name='premise_pos')
        self.hypothesis_pos = tf.placeholder(tf.int32,
                                             [None, self.sequence_length, 47],
                                             name='hypothesis_pos')
        self.y = tf.placeholder(tf.int32, [None])
        self.keep_rate_ph = tf.placeholder(tf.float32, [])

        ## Define parameters
        self.E = tf.Variable(embeddings, trainable=emb_train)

        self.W_mlp = tf.Variable(
            tf.random_normal([self.dim * 12, self.dim], stddev=0.1))
        self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1))

        self.W_cl = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
        self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

        # ## Define External Knowledge dictionary para.
        # self.exterKnowledge_dic = exterKnowledge_dic
        ## Define R_matrix
        self.R_mat = tf.placeholder(
            tf.float32, [None, self.sequence_length, self.sequence_length])

        ## Function for embedding lookup and dropout at embedding layer
        def emb_drop(x):
            emb = tf.nn.embedding_lookup(self.E, x)
            emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, mask_prem = blocks.length(self.premise_x)
        hyp_seq_lengths, mask_hyp = blocks.length(self.hypothesis_x)

        ### First biLSTM layer ###

        premise_in = tf.concat(
            [emb_drop(self.premise_x),
             tf.cast(self.premise_pos, tf.float32)],
            axis=2)
        hypothesis_in = tf.concat([
            emb_drop(self.hypothesis_x),
            tf.cast(self.hypothesis_pos, tf.float32)
        ],
                                  axis=2)

        premise_outs, c1 = blocks.biLSTM(premise_in,
                                         dim=self.dim,
                                         seq_len=prem_seq_lengths,
                                         name='premise')
        hypothesis_outs, c2 = blocks.biLSTM(hypothesis_in,
                                            dim=self.dim,
                                            seq_len=hyp_seq_lengths,
                                            name='hypothesis')

        premise_bi = tf.concat(premise_outs, axis=2)
        hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

        premise_list = tf.unstack(premise_bi, axis=1)
        hypothesis_list = tf.unstack(hypothesis_bi, axis=1)

        ### self-attention ###
        premise_project = blocks.dense(premise_bi, 600)
        premise_project_list = tf.unstack(premise_project, axis=1)
        premise_self_attn = []
        alphas = []

        for i in range(self.sequence_length):
            scores_i_list = []
            for j in range(self.sequence_length):
                score_ij = tf.reduce_sum(tf.multiply(premise_project_list[i],
                                                     premise_project_list[j]),
                                         1,
                                         keep_dims=True)
                scores_i_list.append(score_ij)
            scores_i = tf.stack(scores_i_list, axis=1)
            alpha_i = blocks.masked_softmax(scores_i, mask_prem)
            p_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, premise_bi), 1)
            premise_self_attn.append(p_tilde_i)

        hypothesis_project = blocks.dense(hypothesis_bi, 600)
        hypothesis_project_list = tf.unstack(hypothesis_project, axis=1)
        hypothesis_self_attn = []
        for i in range(self.sequence_length):
            scores_i_list = []
            for j in range(self.sequence_length):
                score_ij = tf.reduce_sum(tf.multiply(
                    hypothesis_project_list[i], hypothesis_project_list[j]),
                                         1,
                                         keep_dims=True)
                scores_i_list.append(score_ij)
            scores_i = tf.stack(scores_i_list, axis=1)
            beta_i = blocks.masked_softmax(scores_i, mask_hyp)
            h_tilde_i = tf.reduce_sum(tf.multiply(beta_i, hypothesis_bi), 1)
            hypothesis_self_attn.append(h_tilde_i)

        premise_self_attns = tf.stack(premise_self_attn, axis=1)
        hypothesis_self_attns = tf.stack(hypothesis_self_attn, axis=1)

        ### Attention ###

        scores_all = []
        premise_attn = []
        alphas = []
        r_alpha = []
        r_all = []
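        # Note: in this variant each alignment score is augmented with an
        # external-knowledge relation term from R_mat, score_ij = p_i . h_j + r_ij,
        # and the attention-weighted relation features r_alpha / r_beta are
        # kept for the inference-composition and pooling stages below.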

        for i in range(self.sequence_length):
            scores_i_list = []
            r_i_list = []
            for j in range(self.sequence_length):
                # calculate similarity score_ij (e_ij)

                score_ij_ori = tf.reduce_sum(tf.multiply(
                    premise_list[i], hypothesis_list[j]),
                                             1,
                                             keep_dims=True)
                ext_r = tf.expand_dims(self.R_mat[:, i, j], axis=1)
                score_ij = score_ij_ori + ext_r
                scores_i_list.append(score_ij)
                r_ij = self.R_mat[:, i, j]
                r_i_list.append(r_ij)
                #pdb.set_trace()
            scores_i = tf.stack(scores_i_list, axis=1)
            r_i = tf.expand_dims(tf.stack(r_i_list, axis=1), 2)
            # alpha_i: attention weights over hypothesis_bi
            alpha_i = blocks.masked_softmax(scores_i, mask_hyp)
            a_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, hypothesis_bi), 1)
            premise_attn.append(a_tilde_i)

            r_alpha_i = tf.reduce_sum(tf.multiply(r_i, alpha_i), 1)

            scores_all.append(scores_i)
            alphas.append(alpha_i)
            r_alpha.append(r_alpha_i)
            r_all.append(r_i)

        scores_stack = tf.stack(scores_all, axis=2)
        scores_list = tf.unstack(scores_stack,
                                 axis=1)  #turn i index to j index

        r_stack = tf.stack(r_all, axis=2)
        r_list = tf.unstack(r_stack, axis=1)  #turn i index to j index

        hypothesis_attn = []
        betas = []
        r_beta = []
        for j in range(self.sequence_length):
            scores_j = scores_list[j]
            beta_j = blocks.masked_softmax(scores_j, mask_prem)
            b_tilde_j = tf.reduce_sum(tf.multiply(beta_j, premise_bi), 1)
            hypothesis_attn.append(b_tilde_j)

            r_j = r_list[j]
            r_beta_j = tf.reduce_sum(tf.multiply(r_j, beta_j), 1)
            r_beta.append(r_beta_j)

            betas.append(beta_j)
        # Stack r_alpha and r_beta into tensors
        r_alphas = tf.stack(r_alpha, axis=1)
        r_betas = tf.stack(r_beta, axis=1)

        # Make attention-weighted sentence representations into one tensor,
        premise_attns = tf.stack(premise_attn, axis=1)
        hypothesis_attns = tf.stack(hypothesis_attn, axis=1)

        # For making attention plots,
        self.alpha_s = tf.stack(alphas, axis=2)
        self.beta_s = tf.stack(betas, axis=2)

        ### Subcomponent Inference ###
        prem_self_diff = tf.subtract(premise_bi, premise_self_attns)
        prem_self_mul = tf.multiply(premise_bi, premise_self_attns)
        hyp_self_diff = tf.subtract(hypothesis_bi, hypothesis_self_attns)
        hyp_self_mul = tf.multiply(hypothesis_bi, hypothesis_self_attns)

        prem_diff = tf.subtract(premise_bi, premise_attns)
        prem_mul = tf.multiply(premise_bi, premise_attns)
        hyp_diff = tf.subtract(hypothesis_bi, hypothesis_attns)
        hyp_mul = tf.multiply(hypothesis_bi, hypothesis_attns)

        ### Factorization Machine ###

        FM_premise_self_attns = tf.expand_dims(
            blocks.factorize_machine(
                tf.concat([premise_bi, premise_self_attns], 2)), 2)
        FM_prem_self_diff = tf.expand_dims(
            blocks.factorize_machine(prem_self_diff), 2)
        FM_prem_self_mul = tf.expand_dims(
            blocks.factorize_machine(prem_self_mul), 2)

        FM_hypothesis_self_attns = tf.expand_dims(
            blocks.factorize_machine(
                tf.concat([hypothesis_bi, hypothesis_self_attns], 2)), 2)
        FM_hyp_self_diff = tf.expand_dims(
            blocks.factorize_machine(hyp_self_diff), 2)
        FM_hyp_self_mul = tf.expand_dims(
            blocks.factorize_machine(hyp_self_mul), 2)

        FM_premise_attns = tf.expand_dims(
            blocks.factorize_machine(tf.concat([premise_bi, premise_attns],
                                               2)), 2)
        FM_prem_diff = tf.expand_dims(blocks.factorize_machine(prem_diff), 2)
        FM_prem_mul = tf.expand_dims(blocks.factorize_machine(prem_mul), 2)

        FM_hypothesis_attns = tf.expand_dims(
            blocks.factorize_machine(
                tf.concat([hypothesis_bi, hypothesis_attns], 2)), 2)
        FM_hyp_diff = tf.expand_dims(blocks.factorize_machine(hyp_diff), 2)
        FM_hyp_mul = tf.expand_dims(blocks.factorize_machine(hyp_mul), 2)

        m_a = tf.concat([
            premise_bi, FM_premise_attns, FM_prem_diff, FM_prem_mul,
            FM_premise_self_attns, FM_prem_self_diff, FM_prem_self_mul,
            r_alphas
        ], 2)
        m_b = tf.concat([
            hypothesis_bi, FM_hypothesis_attns, FM_hyp_diff, FM_hyp_mul,
            FM_hypothesis_self_attns, FM_hyp_self_diff, FM_hyp_self_mul,
            r_betas
        ], 2)

        ### Inference Composition ###

        v1_outs, c3 = blocks.biLSTM(m_a,
                                    dim=self.dim,
                                    seq_len=prem_seq_lengths,
                                    name='v1')
        v2_outs, c4 = blocks.biLSTM(m_b,
                                    dim=self.dim,
                                    seq_len=hyp_seq_lengths,
                                    name='v2')

        v1_bi = tf.concat(v1_outs, axis=2)
        v2_bi = tf.concat(v2_outs, axis=2)

        ### Pooling Layer ###

        v_1_sum = tf.reduce_sum(v1_bi, 1)
        v_1_ave = tf.div(
            v_1_sum, tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1))

        v_2_sum = tf.reduce_sum(v2_bi, 1)
        v_2_ave = tf.div(
            v_2_sum, tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1))

        v_1_max = tf.reduce_max(v1_bi, 1)
        v_2_max = tf.reduce_max(v2_bi, 1)

        alpha_w = blocks.masked_softmax(blocks.dense(r_alphas, 1), mask_prem)
        a_w = tf.reduce_sum(tf.multiply(alpha_w, v1_bi), 1)

        beta_w = blocks.masked_softmax(blocks.dense(r_betas, 1), mask_hyp)
        b_w = tf.reduce_sum(tf.multiply(beta_w, v2_bi), 1)

        v = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max, a_w, b_w], 1)
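        # Note: four pooled vectors plus the two relation-weighted summaries
        # a_w and b_w, each 2*dim wide, give a 12*dim feature vector, matching
        # W_mlp's input size of self.dim * 12.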

        # MLP layer
        h_mlp = tf.nn.tanh(tf.matmul(v, self.W_mlp) + self.b_mlp)

        # Dropout applied to classifier
        h_drop = tf.nn.dropout(h_mlp, self.keep_rate_ph)

        # Get prediction
        self.logits = tf.matmul(h_drop, self.W_cl) + self.b_cl

        # Define the cost function
        self.total_cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y,
                                                           logits=self.logits))
# Example #7
    def __init__(self,
                 config,
                 seq_length,
                 emb_dim,
                 hidden_dim,
                 emb_train,
                 embeddings=None,
                 pred_size=3,
                 context_seq_len=None,
                 query_seq_len=None):
        ## Define hyperparameters
        # tf.reset_default_graph()
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.sequence_length = seq_length
        self.pred_size = pred_size
        self.context_seq_len = context_seq_len
        self.query_seq_len = query_seq_len
        # self.config = config

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length],
                                        name='premise')
        self.hypothesis_x = tf.placeholder(tf.int32,
                                           [None, self.sequence_length],
                                           name='hypothesis')
        self.premise_pos = tf.placeholder(tf.int32,
                                          [None, self.sequence_length, 47],
                                          name='premise_pos')
        self.hypothesis_pos = tf.placeholder(tf.int32,
                                             [None, self.sequence_length, 47],
                                             name='hypothesis_pos')
        self.premise_char = tf.placeholder(
            tf.int32, [None, self.sequence_length, config.char_in_word_size],
            name='premise_char')
        self.hypothesis_char = tf.placeholder(
            tf.int32, [None, self.sequence_length, config.char_in_word_size],
            name='hypothesis_char')
        self.premise_exact_match = tf.placeholder(
            tf.int32, [None, self.sequence_length, 1],
            name='premise_exact_match')
        self.hypothesis_exact_match = tf.placeholder(
            tf.int32, [None, self.sequence_length, 1],
            name='hypothesis_exact_match')

        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        self.dropout_keep_rate = tf.train.exponential_decay(
            config.keep_rate,
            self.global_step,
            config.dropout_decay_step,
            config.dropout_decay_rate,
            staircase=False,
            name='dropout_keep_rate')
        config.keep_rate = self.dropout_keep_rate
        tf.summary.scalar('dropout_keep_rate', self.dropout_keep_rate)
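        # Note: with staircase=False this is a smoothly decaying keep rate,
        # keep_rate * dropout_decay_rate ** (global_step / dropout_decay_step),
        # logged to TensorBoard above.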

        self.y = tf.placeholder(tf.int32, [None], name='label_y')
        self.keep_rate_ph = tf.placeholder(tf.float32, [], name='keep_prob')
        self.is_train = tf.placeholder('bool', [], name='is_train')

        ## Function for embedding lookup and dropout at embedding layer
        def emb_drop(E, x):
            emb = tf.nn.embedding_lookup(E, x)
            emb_drop = tf.cond(self.is_train,
                               lambda: tf.nn.dropout(emb, config.keep_rate),
                               lambda: emb)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, prem_mask = blocks.length(
            self.premise_x)  # mask [N, L , 1]
        hyp_seq_lengths, hyp_mask = blocks.length(self.hypothesis_x)
        self.prem_mask = prem_mask
        self.hyp_mask = hyp_mask

        ### Embedding layer ###
        with tf.variable_scope("emb"):
            with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
                self.E = tf.Variable(embeddings, trainable=emb_train)
                premise_in = emb_drop(self.E, self.premise_x)  #P
                hypothesis_in = emb_drop(self.E, self.hypothesis_x)  #H

        with tf.variable_scope("char_emb"):
            char_emb_mat = tf.get_variable(
                "char_emb_mat",
                shape=[config.char_vocab_size, config.char_emb_size])
            with tf.variable_scope("char") as scope:
                char_pre = tf.nn.embedding_lookup(char_emb_mat,
                                                  self.premise_char)
                char_hyp = tf.nn.embedding_lookup(char_emb_mat,
                                                  self.hypothesis_char)

                filter_sizes = list(
                    map(int, config.out_channel_dims.split(',')))  #[100]
                heights = list(map(int,
                                   config.filter_heights.split(',')))  #[5]
                assert sum(filter_sizes) == config.char_out_size, (
                    filter_sizes, config.char_out_size)
                with tf.variable_scope("conv") as scope:
                    conv_pre = multi_conv1d(char_pre,
                                            filter_sizes,
                                            heights,
                                            "VALID",
                                            self.is_train,
                                            config.keep_rate,
                                            scope='conv')
                    scope.reuse_variables()
                    conv_hyp = multi_conv1d(char_hyp,
                                            filter_sizes,
                                            heights,
                                            "VALID",
                                            self.is_train,
                                            config.keep_rate,
                                            scope='conv')
                    conv_pre = tf.reshape(
                        conv_pre,
                        [-1, self.sequence_length, config.char_out_size])
                    conv_hyp = tf.reshape(
                        conv_hyp,
                        [-1, self.sequence_length, config.char_out_size])
            premise_in = tf.concat([premise_in, conv_pre], axis=2)
            hypothesis_in = tf.concat([hypothesis_in, conv_hyp], axis=2)
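            # Note: the char-CNN (shared between premise and hypothesis via
            # scope.reuse_variables) maps each word's character sequence to
            # config.char_out_size features, which are concatenated onto the
            # word embeddings here.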

        premise_in = tf.concat(
            (premise_in, tf.cast(self.premise_pos, tf.float32)), axis=2)
        hypothesis_in = tf.concat(
            (hypothesis_in, tf.cast(self.hypothesis_pos, tf.float32)), axis=2)

        premise_in = tf.concat(
            [premise_in,
             tf.cast(self.premise_exact_match, tf.float32)],
            axis=2)
        hypothesis_in = tf.concat(
            [hypothesis_in,
             tf.cast(self.hypothesis_exact_match, tf.float32)],
            axis=2)

        with tf.variable_scope("highway") as scope:
            premise_in = highway_network(premise_in,
                                         config.highway_num_layers,
                                         True,
                                         wd=config.wd,
                                         is_train=self.is_train)
            scope.reuse_variables()
            hypothesis_in = highway_network(hypothesis_in,
                                            config.highway_num_layers,
                                            True,
                                            wd=config.wd,
                                            is_train=self.is_train)

        with tf.variable_scope("prepro") as scope:
            pre = premise_in
            hyp = hypothesis_in
            for i in range(config.self_att_enc_layers):
                with tf.variable_scope(tf.get_variable_scope(), reuse=False):
                    p = self_attention_layer(
                        config,
                        self.is_train,
                        pre,
                        p_mask=prem_mask,
                        scope="{}_layer_self_att_enc".format(
                            i))  # [N, len, dim]
                    h = self_attention_layer(
                        config,
                        self.is_train,
                        hyp,
                        p_mask=hyp_mask,
                        scope="{}_layer_self_att_enc_h".format(i))
                    pre = p
                    hyp = h
                    variable_summaries(p,
                                       "p_self_enc_summary_layer_{}".format(i))
                    variable_summaries(h,
                                       "h_self_enc_summary_layer_{}".format(i))

        with tf.variable_scope("main") as scope:

            def model_one_side(config, main, support, main_length,
                               support_length, main_mask, support_mask, scope):
                bi_att_mx = bi_attention_mx(config,
                                            self.is_train,
                                            main,
                                            support,
                                            p_mask=main_mask,
                                            h_mask=support_mask)  # [N, PL, HL]

                bi_att_mx = tf.cond(
                    self.is_train,
                    lambda: tf.nn.dropout(bi_att_mx, config.keep_rate),
                    lambda: bi_att_mx)
                out_final = dense_net(config, bi_att_mx, self.is_train)

                return out_final

            premise_final = model_one_side(config,
                                           p,
                                           h,
                                           prem_seq_lengths,
                                           hyp_seq_lengths,
                                           prem_mask,
                                           hyp_mask,
                                           scope="premise_as_main")
            f0 = premise_final
            print('f0:', f0.get_shape().as_list())

        self.logits = linear(f0,
                             self.pred_size,
                             True,
                             bias_start=0.0,
                             scope="logit",
                             squeeze=False,
                             wd=config.wd,
                             input_keep_prob=config.keep_rate,
                             is_train=self.is_train)

        tf.summary.histogram('logit_histogram', self.logits)

        # Define the cost function
        self.total_cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y,
                                                           logits=self.logits))
        self.acc = tf.reduce_mean(
            tf.cast(
                tf.equal(tf.arg_max(self.logits, dimension=1),
                         tf.cast(self.y, tf.int64)), tf.float32))
        tf.summary.scalar('acc', self.acc)

        tf.summary.scalar('loss', self.total_cost)

        # calculate acc

        # L2 Loss
        if config.l2_loss:
            if config.sigmoid_growing_l2loss:
                weights_added = tf.add_n([
                    tf.nn.l2_loss(tensor)
                    for tensor in tf.trainable_variables()
                    if tensor.name.endswith("weights:0")
                    and not tensor.name.endswith("weighted_sum/weights:0")
                    or tensor.name.endswith('kernel:0')
                ])
                full_l2_step = tf.constant(config.weight_l2loss_step_full_reg,
                                           dtype=tf.int32,
                                           shape=[],
                                           name='full_l2reg_step')
                full_l2_ratio = tf.constant(config.l2_regularization_ratio,
                                            dtype=tf.float32,
                                            shape=[],
                                            name='l2_regularization_ratio')
                gs_flt = tf.cast(self.global_step, tf.float32)
                half_l2_step_flt = tf.cast(full_l2_step / 2, tf.float32)

                # (self.global_step - full_l2_step / 2)
                # tf.cast((self.global_step - full_l2_step / 2) * 8, tf.float32) / tf.cast(full_l2_step / 2 ,tf.float32)
                # l2loss_ratio = tf.sigmoid( tf.cast((self.global_step - full_l2_step / 2) * 8, tf.float32) / tf.cast(full_l2_step / 2 ,tf.float32)) * full_l2_ratio
                l2loss_ratio = tf.sigmoid(((gs_flt - half_l2_step_flt) * 8) /
                                          half_l2_step_flt) * full_l2_ratio
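                # Note: this schedule ramps the L2 weight from roughly zero
                # (sigmoid(-8) ~ 3e-4 at step 0) through 0.5 * full_l2_ratio at
                # half of weight_l2loss_step_full_reg, up to ~full_l2_ratio
                # once that many steps have been taken.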
                tf.summary.scalar('l2loss_ratio', l2loss_ratio)
                l2loss = weights_added * l2loss_ratio
            else:
                l2loss = tf.add_n([
                    tf.nn.l2_loss(tensor)
                    for tensor in tf.trainable_variables() if tensor.name.
                    endswith("weights:0") or tensor.name.endswith('kernel:0')
                ]) * tf.constant(config.l2_regularization_ratio,
                                 dtype='float',
                                 shape=[],
                                 name='l2_regularization_ratio')
            tf.summary.scalar('l2loss', l2loss)
            self.total_cost += l2loss

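        # Note: when the premise/hypothesis self-attention encoders (or highway
        # layers) are not shared, this block collects the differences between
        # the corresponding left/right kernels and adds an L2 penalty on them,
        # softly tying the two encoders together.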
        if config.wo_enc_sharing or config.wo_highway_sharing_but_penalize_diff:
            diffs = []
            for i in range(config.self_att_enc_layers):
                for tensor in tf.trainable_variables():
                    print(tensor.name)
                    if tensor.name == "prepro/{}_layer_self_att_enc/self_attention/h_logits/first/kernel:0".format(
                            i):
                        l_lg = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_attention/h_logits/first/kernel:0".format(
                            i):
                        r_lg = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/lhs_1/kernel:0".format(
                            i):
                        l_fg_lhs_1 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/lhs_1/kernel:0".format(
                            i):
                        r_fg_lhs_1 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/rhs_1/kernel:0".format(
                            i):
                        l_fg_rhs_1 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/rhs_1/kernel:0".format(
                            i):
                        r_fg_rhs_1 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/lhs_2/kernel:0".format(
                            i):
                        l_fg_lhs_2 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/lhs_2/kernel:0".format(
                            i):
                        r_fg_lhs_2 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/rhs_2/kernel:0".format(
                            i):
                        l_fg_rhs_2 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/rhs_2/kernel:0".format(
                            i):
                        r_fg_rhs_2 = tensor

                    if config.two_gate_fuse_gate:
                        if tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/lhs_3/kernel:0".format(
                                i):
                            l_fg_lhs_3 = tensor
                        elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/lhs_3/kernel:0".format(
                                i):
                            r_fg_lhs_3 = tensor
                        elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/rhs_3/kernel:0".format(
                                i):
                            l_fg_rhs_3 = tensor
                        elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/rhs_3/kernel:0".format(
                                i):
                            r_fg_rhs_3 = tensor

                diffs += [
                    l_lg - r_lg, l_fg_lhs_1 - r_fg_lhs_1,
                    l_fg_rhs_1 - r_fg_rhs_1, l_fg_lhs_2 - r_fg_lhs_2,
                    l_fg_rhs_2 - r_fg_rhs_2
                ]
                if config.two_gate_fuse_gate:
                    diffs += [l_fg_lhs_3 - r_fg_lhs_3, l_fg_rhs_3 - r_fg_rhs_3]

            diff_loss = tf.add_n([tf.nn.l2_loss(tensor)
                                  for tensor in diffs]) * tf.constant(
                                      config.diff_penalty_loss_ratio,
                                      dtype='float',
                                      shape=[],
                                      name='diff_penalty_loss_ratio')
            tf.summary.scalar('diff_penalty_loss', diff_loss)
            self.total_cost += diff_loss

        self.summary = tf.summary.merge_all()

        total_parameters = 0
        for v in tf.global_variables():
            if not v.name.endswith("weights:0") and not v.name.endswith(
                    "biases:0") and not v.name.endswith(
                        'kernel:0') and not v.name.endswith('bias:0'):
                continue
            print(v.name)
            # print(type(v.name))
            shape = v.get_shape().as_list()
            param_num = 1
            for dim in shape:
                param_num *= dim
            print(param_num)
            total_parameters += param_num
        print(total_parameters)
# Example #8
    def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
        ## Define hyperparameters
        ## note: embedding_dim and hidden_dim are both 300, used interchangeably
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.sequence_length = seq_length

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
        self.hypothesis_x = tf.placeholder(tf.int32,
                                           [None, self.sequence_length])
        self.y = tf.placeholder(tf.int32, [None])
        self.keep_rate_ph = tf.placeholder(tf.float32, [])

        ## Define parameters
        self.E = tf.Variable(embeddings, trainable=emb_train)

        self.W_mlp = tf.Variable(
            tf.random_normal([self.dim * 8, self.dim], stddev=0.1))
        self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1))

        self.W_cl = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
        self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

        ## Function for embedding lookup and dropout at embedding layer
        def emb_drop(x):
            emb = tf.nn.embedding_lookup(self.E, x)
            emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, mask_prem = blocks.length(self.premise_x)
        hyp_seq_lengths, mask_hyp = blocks.length(self.hypothesis_x)

        ### First biLSTM layer ###

        premise_in = emb_drop(self.premise_x)
        hypothesis_in = emb_drop(self.hypothesis_x)

        premise_outs, c1 = blocks.biLSTM(premise_in,
                                         dim=self.dim,
                                         seq_len=prem_seq_lengths,
                                         name='premise')
        hypothesis_outs, c2 = blocks.biLSTM(hypothesis_in,
                                            dim=self.dim,
                                            seq_len=hyp_seq_lengths,
                                            name='hypothesis')

        premise_bi = tf.concat(premise_outs, axis=2)
        hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

        premise_list = tf.unstack(premise_bi, axis=1)
        hypothesis_list = tf.unstack(hypothesis_bi, axis=1)

        ### Attention ###

        scores_all = []
        premise_attn = []
        alphas = []

        for i in range(self.sequence_length):

            scores_i_list = []
            for j in range(self.sequence_length):
                score_ij = tf.reduce_sum(tf.multiply(premise_list[i],
                                                     hypothesis_list[j]),
                                         1,
                                         keep_dims=True)
                scores_i_list.append(score_ij)

            scores_i = tf.stack(scores_i_list, axis=1)
            alpha_i = blocks.masked_softmax(scores_i, mask_hyp)
            a_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, hypothesis_bi), 1)
            premise_attn.append(a_tilde_i)

            scores_all.append(scores_i)
            alphas.append(alpha_i)

        scores_stack = tf.stack(scores_all, axis=2)
        scores_list = tf.unstack(scores_stack, axis=1)

        hypothesis_attn = []
        betas = []
        for j in range(self.sequence_length):
            scores_j = scores_list[j]
            beta_j = blocks.masked_softmax(scores_j, mask_prem)
            b_tilde_j = tf.reduce_sum(tf.multiply(beta_j, premise_bi), 1)
            hypothesis_attn.append(b_tilde_j)

            betas.append(beta_j)

        # Make attention-weighted sentence representations into one tensor,
        premise_attns = tf.stack(premise_attn, axis=1)
        hypothesis_attns = tf.stack(hypothesis_attn, axis=1)

        # For making attention plots,
        self.alpha_s = tf.stack(alphas, axis=2)
        self.beta_s = tf.stack(betas, axis=2)

        ### Subcomponent Inference ###

        prem_diff = tf.subtract(premise_bi, premise_attns)
        prem_mul = tf.multiply(premise_bi, premise_attns)
        hyp_diff = tf.subtract(hypothesis_bi, hypothesis_attns)
        hyp_mul = tf.multiply(hypothesis_bi, hypothesis_attns)

        m_a = tf.concat([premise_bi, premise_attns, prem_diff, prem_mul], 2)
        m_b = tf.concat([hypothesis_bi, hypothesis_attns, hyp_diff, hyp_mul],
                        2)

        ### Inference Composition ###

        v1_outs, c3 = blocks.biLSTM(m_a,
                                    dim=self.dim,
                                    seq_len=prem_seq_lengths,
                                    name='v1')
        v2_outs, c4 = blocks.biLSTM(m_b,
                                    dim=self.dim,
                                    seq_len=hyp_seq_lengths,
                                    name='v2')

        v1_bi = tf.concat(v1_outs, axis=2)
        v2_bi = tf.concat(v2_outs, axis=2)

        ### Pooling Layer ###

        v_1_sum = tf.reduce_sum(v1_bi, 1)
        v_1_ave = tf.div(
            v_1_sum, tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1))

        v_2_sum = tf.reduce_sum(v2_bi, 1)
        v_2_ave = tf.div(
            v_2_sum, tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1))

        v_1_max = tf.reduce_max(v1_bi, 1)
        v_2_max = tf.reduce_max(v2_bi, 1)

        v = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max], 1)

        # MLP layer
        h_mlp = tf.nn.tanh(tf.matmul(v, self.W_mlp) + self.b_mlp)

        ############### MY CODE STARTS #####

        # Define layer size
        self.bow_layer_size = 300

        # Final MLP output of the original ESIM branch; it is zeroed out so
        # that only its shape is kept (the ESIM features are not used)
        h_fc1 = tf.zeros_like(h_mlp)  # Don't need the ESIM output

        # Bag-of-words input: average the (dropped-out) word embeddings of the
        # premise and hypothesis over the concatenated time axis
        bow_pre = premise_in
        bow_hyp = hypothesis_in
        bag_of_word_in = tf.reduce_mean(tf.concat([bow_pre, bow_hyp], 1), 1)

        # Bag-of-words hidden layer (embedding_dim == self.dim here, so the
        # averaged embeddings match W_fc2's input width)
        W_fc2 = tf.Variable(
            tf.random_normal([self.dim, self.bow_layer_size], stddev=0.1))
        b_fc2 = tf.Variable(tf.zeros([self.bow_layer_size]))
        h_fc2 = tf.nn.relu(tf.matmul(bag_of_word_in, W_fc2) + b_fc2)

        # Classifier parameters, re-defined so the input width covers the
        # (zeroed) ESIM slot plus the bag-of-words features
        self.W_cl = tf.Variable(
            tf.random_normal([self.dim + self.bow_layer_size, 3], stddev=0.1))
        self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

        # Zero padding that stands in for the (unused) ESIM features
        pad2 = tf.zeros_like(h_fc1, tf.float32)

        # Concatenate the zero padding with the bag-of-words features, apply
        # dropout, and compute both prediction and cost from the result
        yconv_concat_H = tf.nn.dropout(tf.concat([pad2, h_fc2], 1),
                                       self.keep_rate_ph)
        y_conv_H = tf.matmul(yconv_concat_H, self.W_cl) + self.b_cl

        self.logits = y_conv_H  # Prediction
        self.total_cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.y, logits=self.logits))  # Cost
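# Illustrative sketch (added, not part of the original example): because h_fc1
# is zeroed out above, the classifier effectively sees only the averaged word
# embeddings. A self-contained bag-of-words classifier in the same TF1 style;
# all names below (bag_of_words_logits, premise_emb, hypothesis_emb, ...) are
# hypothetical.
import tensorflow as tf

def bag_of_words_logits(premise_emb, hypothesis_emb, emb_dim, hidden_size,
                        keep_rate, num_classes=3):
    """premise_emb / hypothesis_emb: [batch, seq_len, emb_dim] embedded tokens."""
    # Average all token embeddings of both sentences into one feature vector
    features = tf.reduce_mean(tf.concat([premise_emb, hypothesis_emb], 1), 1)
    W_h = tf.Variable(tf.random_normal([emb_dim, hidden_size], stddev=0.1))
    b_h = tf.Variable(tf.zeros([hidden_size]))
    hidden = tf.nn.dropout(tf.nn.relu(tf.matmul(features, W_h) + b_h), keep_rate)
    W_out = tf.Variable(tf.random_normal([hidden_size, num_classes], stddev=0.1))
    b_out = tf.Variable(tf.random_normal([num_classes], stddev=0.1))
    return tf.matmul(hidden, W_out) + b_out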
    def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
        ## Define hyperparameters
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.sequence_length = seq_length

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
        self.hypothesis_x = tf.placeholder(tf.int32,
                                           [None, self.sequence_length])
        self.y = tf.placeholder(tf.int32, [None])
        self.keep_rate_ph = tf.placeholder(tf.float32, [])

        ## Define parameters
        self.E = tf.Variable(embeddings, trainable=emb_train)

        self.W_mlp = tf.Variable(
            tf.random_normal([self.dim * 8, self.dim], stddev=0.1))
        self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1))

        self.W_cl = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
        self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

        ## Function for embedding lookup and dropout at embedding layer
        def emb_drop(x):
            emb = tf.nn.embedding_lookup(self.E, x)
            emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, prem_mask = blocks.length(self.premise_x)
        hyp_seq_lengths, hyp_mask = blocks.length(self.hypothesis_x)

        ### BiLSTM layer ###
        premise_in = emb_drop(self.premise_x)
        hypothesis_in = emb_drop(self.hypothesis_x)

        premise_outs, c1 = blocks.biLSTM(premise_in,
                                         dim=self.dim,
                                         seq_len=prem_seq_lengths,
                                         name='premise')
        hypothesis_outs, c2 = blocks.biLSTM(hypothesis_in,
                                            dim=self.dim,
                                            seq_len=hyp_seq_lengths,
                                            name='hypothesis')

        premise_bi = tf.concat(premise_outs, axis=2)
        hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

        #premise_final = blocks.last_output(premise_bi, prem_seq_lengths)
        #hypothesis_final =  blocks.last_output(hypothesis_bi, hyp_seq_lengths)

        ### Mean pooling
        premise_sum = tf.reduce_sum(premise_bi, 1)
        premise_ave = tf.div(
            premise_sum,
            tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1))

        hypothesis_sum = tf.reduce_sum(hypothesis_bi, 1)
        hypothesis_ave = tf.div(
            hypothesis_sum,
            tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1))

        ### Mou et al. concat layer ###
        diff = tf.subtract(premise_ave, hypothesis_ave)
        mul = tf.multiply(premise_ave, hypothesis_ave)
        h = tf.concat([premise_ave, hypothesis_ave, diff, mul], 1)

        # MLP layer
        h_mlp = tf.nn.relu(tf.matmul(h, self.W_mlp) + self.b_mlp)
        # Dropout applied to classifier
        h_drop = tf.nn.dropout(h_mlp, self.keep_rate_ph)

        # Get prediction
        self.logits = tf.matmul(h_drop, self.W_cl) + self.b_cl

        # Define the cost function
        self.total_cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y,
                                                           logits=self.logits))
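# Illustrative sketch (added): the mean pooling above sums the BiLSTM outputs
# over time and divides by the true (unpadded) sequence length, so padding
# positions do not dilute the average. The same pattern in isolation; the
# helper name masked_mean_pool is hypothetical, and it assumes the padded
# steps are already zero (as produced by a length-masked RNN).
import tensorflow as tf

def masked_mean_pool(outputs, seq_lengths):
    """outputs: [batch, seq_len, dim] with zeros at padded steps;
    seq_lengths: [batch] integer lengths of the unpadded sequences."""
    summed = tf.reduce_sum(outputs, 1)                             # [batch, dim]
    lengths = tf.expand_dims(tf.cast(seq_lengths, tf.float32), -1) # [batch, 1]
    return summed / lengths                                        # [batch, dim]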
Exemple #10
    def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
        ## Define hyperparameters
        self.embedding_dim = emb_dim
        self.dim = hidden_dim
        self.sequence_length = seq_length

        ## Define the placeholders
        self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
        self.hypothesis_x = tf.placeholder(tf.int32,
                                           [None, self.sequence_length])
        self.y = tf.placeholder(tf.int32, [None])
        self.keep_rate_ph = tf.placeholder(tf.float32, [])

        ## Define parameters
        self.E = tf.Variable(embeddings, trainable=emb_train)

        self.W_mlp = tf.Variable(
            tf.random_normal([self.dim * 8, self.dim], stddev=0.1))
        self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1))

        self.W_cl = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
        self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

        # Function for embedding lookup and dropout at embedding layer
        # (dropout randomly ignores a subset of feature detectors by setting
        # some hidden units to 0)
        def emb_drop(x):
            emb = tf.nn.embedding_lookup(self.E, x)
            emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
            return emb_drop

        # Get lengths of unpadded sentences
        prem_seq_lengths, mask_prem = blocks.length(self.premise_x)
        hyp_seq_lengths, mask_hyp = blocks.length(self.hypothesis_x)

        # --------------------------- Input encoding stage ---------------------------

        premise_in = emb_drop(self.premise_x)
        hypothesis_in = emb_drop(self.hypothesis_x)

        # Re-encode each word together with its context using a BiLSTM
        premise_outs, c1 = blocks.biLSTM(premise_in,
                                         dim=self.dim,
                                         seq_len=prem_seq_lengths,
                                         name='premise')
        hypothesis_outs, c2 = blocks.biLSTM(hypothesis_in,
                                            dim=self.dim,
                                            seq_len=hyp_seq_lengths,
                                            name='hypothesis')
        print('premise_outs: ', premise_outs)

        premise_bi = tf.concat(premise_outs, axis=2)
        hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

        premise_list = tf.unstack(premise_bi, axis=1)
        hypothesis_list = tf.unstack(hypothesis_bi, axis=1)
        print('hypothesis_list: ', hypothesis_list)

        # Attention mechanism
        scores_all = []
        premise_attn = []
        alphas = []

        for i in range(self.sequence_length):

            scores_i_list = []
            for j in range(self.sequence_length):
                # Dot-product similarity between the i-th premise word and the
                # j-th hypothesis word; this score is e_ij in the ESIM paper
                score_ij = tf.reduce_sum(tf.multiply(premise_list[i],
                                                     hypothesis_list[j]),
                                         1,
                                         keep_dims=True)
                scores_i_list.append(score_ij)

            scores_i = tf.stack(scores_i_list, axis=1)
            # Normalize the scores into attention weights with a masked softmax
            alpha_i = blocks.masked_softmax(scores_i, mask_hyp)
            # Represent the i-th premise word as the attention-weighted sum of
            # the hypothesis word vectors
            a_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, hypothesis_bi), 1)
            premise_attn.append(a_tilde_i)

            scores_all.append(scores_i)
            alphas.append(alpha_i)

        # Regroup the scores into a list indexed by hypothesis position
        scores_stack = tf.stack(scores_all, axis=2)
        scores_list = tf.unstack(scores_stack, axis=1)

        # Repeat the same procedure in the other direction, for the hypothesis
        hypothesis_attn = []
        betas = []
        for j in range(self.sequence_length):
            scores_j = scores_list[j]
            beta_j = blocks.masked_softmax(scores_j, mask_prem)
            b_tilde_j = tf.reduce_sum(tf.multiply(beta_j, premise_bi), 1)
            hypothesis_attn.append(b_tilde_j)

            betas.append(beta_j)

        # Make attention-weighted sentence representations into one tensor,
        premise_attns = tf.stack(premise_attn, axis=1)
        hypothesis_attns = tf.stack(hypothesis_attn, axis=1)

        # For making attention plots,
        self.alpha_s = tf.stack(alphas, axis=2)
        self.beta_s = tf.stack(betas, axis=2)

        # Enhancement of local inference information: element-wise difference
        # and product highlight where the aligned representations disagree
        prem_diff = tf.subtract(premise_bi, premise_attns)
        prem_mul = tf.multiply(premise_bi, premise_attns)
        hyp_diff = tf.subtract(hypothesis_bi, hypothesis_attns)
        hyp_mul = tf.multiply(hypothesis_bi, hypothesis_attns)

        m_a = tf.concat([premise_bi, premise_attns, prem_diff, prem_mul], 2)
        m_b = tf.concat([hypothesis_bi, hypothesis_attns, hyp_diff, hyp_mul],
                        2)

        # Inference Composition: BiLSTMs compose the overall inference
        # relationship between the premise and the hypothesis
        v1_outs, c3 = blocks.biLSTM(m_a,
                                    dim=self.dim,
                                    seq_len=prem_seq_lengths,
                                    name='v1')
        v2_outs, c4 = blocks.biLSTM(m_b,
                                    dim=self.dim,
                                    seq_len=hyp_seq_lengths,
                                    name='v2')

        v1_bi = tf.concat(v1_outs, axis=2)
        v2_bi = tf.concat(v2_outs, axis=2)

        # Pooling Layer
        v_1_sum = tf.reduce_sum(v1_bi, 1)
        v_1_ave = tf.div(
            v_1_sum, tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1))

        v_2_sum = tf.reduce_sum(v2_bi, 1)
        v_2_ave = tf.div(
            v_2_sum, tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1))

        v_1_max = tf.reduce_max(v1_bi, 1)
        v_2_max = tf.reduce_max(v2_bi, 1)

        v = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max], 1)

        # Final MLP layer for classification
        h_mlp = tf.nn.tanh(tf.matmul(v, self.W_mlp) + self.b_mlp)

        # Dropout applied to classifier
        h_drop = tf.nn.dropout(h_mlp, self.keep_rate_ph)

        # Get prediction
        self.logits = tf.matmul(h_drop, self.W_cl) + self.b_cl

        # Define the cost function
        self.total_cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y,
                                                           logits=self.logits))
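# Illustrative sketch (added): the nested Python loops above build the
# premise-hypothesis score matrix e_ij one entry at a time, which creates
# O(seq_len^2) graph ops. The same score matrix can be computed with a single
# batched matmul; the helper name attention_scores is hypothetical, and the
# masked softmax (blocks.masked_softmax) is still applied afterwards as above.
import tensorflow as tf

def attention_scores(premise_bi, hypothesis_bi):
    """premise_bi: [batch, L_p, 2*dim], hypothesis_bi: [batch, L_h, 2*dim].
    Returns e with e[b, i, j] = <premise_bi[b, i], hypothesis_bi[b, j]>."""
    return tf.matmul(premise_bi, hypothesis_bi, transpose_b=True)  # [batch, L_p, L_h]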