def forward(self, premise_x, hypothesis_x, pre_pos, hyp_pos,
            premise_char_vectors, hypothesis_char_vectors,
            premise_exact_match, hypothesis_exact_match):
    prem_seq_lengths, prem_mask = blocks.length(premise_x)  # mask [N, L, 1]
    hyp_seq_lengths, hyp_mask = blocks.length(hypothesis_x)

    # Word embeddings with dropout
    premise_in = F.dropout(self.emb(premise_x), p=self.dropout_rate, training=self.training)
    hypothesis_in = F.dropout(self.emb(hypothesis_x), p=self.dropout_rate, training=self.training)

    # Character-level convolutional features
    conv_pre, conv_hyp = self.char_emb(premise_char_vectors, hypothesis_char_vectors)
    premise_in = torch.cat([premise_in, conv_pre], 2)  # [70, 48, 300] + [70, 48, 100] --> [70, 48, 400]
    hypothesis_in = torch.cat([hypothesis_in, conv_hyp], 2)

    # POS features
    premise_in = torch.cat([premise_in, pre_pos], 2)  # 70*48*447
    hypothesis_in = torch.cat([hypothesis_in, hyp_pos], 2)

    # Exact-match binary features
    premise_exact_match = torch.unsqueeze(premise_exact_match, 2)  # 70*48*1
    premise_in = torch.cat([premise_in, premise_exact_match], 2)  # 70*48*448
    hypothesis_exact_match = torch.unsqueeze(hypothesis_exact_match, 2)
    hypothesis_in = torch.cat([hypothesis_in, hypothesis_exact_match], 2)  # 70*48*448

    # Highway encoder
    premise_in = highway_network(self.highway_network_linear, premise_in,
                                 self.config.highway_num_layers, True,
                                 wd=self.config.wd, is_train=self.training)
    hypothesis_in = highway_network(self.highway_network_linear, hypothesis_in,
                                    self.config.highway_num_layers, True,
                                    wd=self.config.wd, is_train=self.training)

    # Self-attention encoder layers
    pre = premise_in  # [70, 48, 448]
    hyp = hypothesis_in
    for i in range(self.config.self_att_enc_layers):
        pre = self_attention_layer(
            self.self_attention_linear_p, self.fuse_gate_linear_p1, self.fuse_gate_linear_p2,
            self.fuse_gate_linear_p3, self.fuse_gate_linear_p4, self.fuse_gate_linear_p5,
            self.fuse_gate_linear_p6, self.config, self.training, pre,
            input_drop_prob=self.dropout_rate, p_mask=prem_mask)  # [N, len, dim]
        hyp = self_attention_layer(
            self.self_attention_linear_h, self.fuse_gate_linear_h1, self.fuse_gate_linear_h2,
            self.fuse_gate_linear_h3, self.fuse_gate_linear_h4, self.fuse_gate_linear_h5,
            self.fuse_gate_linear_h6, self.config, self.training, hyp,
            input_drop_prob=self.dropout_rate, p_mask=hyp_mask)  # use the hypothesis mask, as in the TF version

    # Interaction tensor and feature extraction
    bi_att_mx = bi_attention_mx(self.config, self.training, pre, hyp,
                                p_mask=prem_mask, h_mask=hyp_mask)  # [N, d, PL, HL] = [70, 448, 48, 48]
    bi_att_mx = F.dropout(bi_att_mx, p=self.dropout_rate, training=self.training)
    fm = self.interaction_cnn(bi_att_mx)  # [70, 134, 48, 48]
    if self.config.first_scale_down_layer_relu:
        fm = F.relu(fm)

    # DenseNet feature extractor and final classifier
    premise_final = self.dense_net(fm)
    premise_final = premise_final.view(self.config.batch_size, -1)
    print("premise_final", premise_final.size())
    logits = linear(self.final_linear, [premise_final], self.pred_size, True,
                    bias_start=0.0, squeeze=False, wd=self.config.wd,
                    input_drop_prob=self.config.keep_rate, is_train=self.training)
    return logits
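# A minimal NumPy sketch (not the repo's bi_attention_mx) of the DIIN-style
# interaction tensor that the forward pass above feeds into interaction_cnn:
# every premise/hypothesis position pair is compared by an elementwise product,
# giving a channels-first [N, d, PL, HL] feature map. Masking and the scale-down
# layer are omitted; shapes mirror the comments above.
import numpy as np

def interaction_tensor(p, h):
    # p: [N, PL, d] premise encoding, h: [N, HL, d] hypothesis encoding
    p = p[:, :, None, :]                   # [N, PL, 1, d]
    h = h[:, None, :, :]                   # [N, 1, HL, d]
    m = p * h                              # [N, PL, HL, d] elementwise interaction
    return np.transpose(m, (0, 3, 1, 2))   # [N, d, PL, HL] for the CNN

p = np.random.randn(2, 48, 448)
h = np.random.randn(2, 48, 448)
print(interaction_tensor(p, h).shape)  # (2, 448, 48, 48)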
def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
    ## Define hyperparameters
    self.embedding_dim = emb_dim
    self.dim = hidden_dim
    self.attention_size = 128
    self.mlp_size = self.dim
    self.sequence_length = seq_length
    self.lam = 0.01
    self.epsilon = 1e-10

    ## Define the placeholders
    self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
    self.hypothesis_x = tf.placeholder(tf.int32, [None, self.sequence_length])
    self.y = tf.placeholder(tf.int32, [None])
    self.keep_rate_ph = tf.placeholder(tf.float32, [])

    ## Define parameters
    self.E = tf.Variable(embeddings, trainable=emb_train)
    self.W_mlp = tf.Variable(tf.random_normal([self.dim * 4, self.mlp_size], stddev=0.1))
    self.b_mlp = tf.Variable(tf.random_normal([self.mlp_size], stddev=0.1))

    ## Function for embedding lookup and dropout at embedding layer
    def emb_drop(x):
        emb = tf.nn.embedding_lookup(self.E, x)
        emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
        return emb_drop

    # Get lengths of unpadded sentences
    prem_seq_lengths, prem_mask = blocks.length(self.premise_x)
    hyp_seq_lengths, hyp_mask = blocks.length(self.hypothesis_x)

    ### BiLSTM layer ###
    premise_in = emb_drop(self.premise_x)
    hypothesis_in = emb_drop(self.hypothesis_x)

    ############# MY CODE STARTS ########
    premise_outs, premise_final = blocks.biLSTM(premise_in, dim=self.dim,
                                                seq_len=prem_seq_lengths, name='premise')
    attention_outs_pre, self.alphas_pre = blocks.attention(
        premise_outs, self.attention_size, return_alphas=True, mask=tf.squeeze(prem_mask))
    drop_pre = tf.nn.dropout(attention_outs_pre, self.keep_rate_ph)
    # drop_pre = attention_outs_pre

    hypothesis_outs, hypothesis_final = blocks.biLSTM(hypothesis_in, dim=self.dim,
                                                      seq_len=hyp_seq_lengths, name='hypothesis')
    attention_outs_hyp, self.alphas_hyp = blocks.attention(
        hypothesis_outs, self.attention_size, return_alphas=True, mask=tf.squeeze(hyp_mask))
    drop_hyp = tf.nn.dropout(attention_outs_hyp, self.keep_rate_ph)
    # drop_hyp = attention_outs_hyp

    # Concatenate the premise and hypothesis outputs
    drop = tf.concat([drop_pre, drop_hyp], axis=1)
    h_mlp = tf.nn.relu(tf.matmul(drop, self.W_mlp) + self.b_mlp)
    ############# MY CODE ENDS ########

    ############# Hex Part #########
    ############ MY CODE STARTS #########
    attention_outs_pre_hex, self.alphas_pre_hex = blocks.attention(
        premise_outs, self.attention_size, return_alphas=True, mask=tf.squeeze(prem_mask))
    drop_pre_hex = tf.nn.dropout(attention_outs_pre_hex, self.keep_rate_ph)
    # drop_pre = attention_outs_pre

    attention_outs_hyp_hex, self.alphas_hyp_hex = blocks.attention(
        hypothesis_outs, self.attention_size, return_alphas=True, mask=tf.squeeze(hyp_mask))
    drop_hyp_hex = tf.nn.dropout(attention_outs_hyp_hex, self.keep_rate_ph)
    # drop_hyp = attention_outs_hyp

    # Concatenate the premise and hypothesis outputs
    bag_of_word_in = tf.concat([drop_pre_hex, drop_hyp_hex], axis=1)

    # Hex component inputs
    h_fc1 = h_mlp           # (?, 300)
    h_fc2 = bag_of_word_in  # (?, 1200)

    # Hex layer definition
    self.W_cl_1 = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
    self.W_cl_2 = tf.Variable(tf.random_normal([1200, 3]), trainable=True)
    self.b_cl = tf.Variable(tf.random_normal((3,)), trainable=True)
    self.W_cl = tf.concat([self.W_cl_1, self.W_cl_2], 0)

    # Compute prediction using [h_fc1, 0(pad)]
    pad = tf.zeros_like(h_fc2, tf.float32)  # print(pad.shape) -> (?, 1200)
    yconv_contact_pred = tf.nn.dropout(tf.concat([h_fc1, pad], 1), self.keep_rate_ph)
    y_conv_pred = tf.matmul(yconv_contact_pred, self.W_cl) + self.b_cl
    self.logits = y_conv_pred  # Prediction

    # Compute loss using [h_fc1, h_fc2] and [0(pad2), h_fc2]
    pad2 = tf.zeros_like(h_fc1, tf.float32)
    yconv_contact_H = tf.nn.dropout(tf.concat([pad2, h_fc2], 1), self.keep_rate_ph)
    y_conv_H = tf.matmul(yconv_contact_H, self.W_cl) + self.b_cl  # get Fg
    yconv_contact_loss = tf.nn.dropout(tf.concat([h_fc1, h_fc2], 1), self.keep_rate_ph)
    y_conv_loss = tf.matmul(yconv_contact_loss, self.W_cl) + self.b_cl  # get Fb
    self.temp = y_conv_H
    temp = tf.matmul(y_conv_H, y_conv_H, transpose_a=True)
    y_conv_loss = y_conv_loss - tf.matmul(
        tf.matmul(tf.matmul(y_conv_H, tf.matrix_inverse(temp)), y_conv_H, transpose_b=True),
        y_conv_loss)  # get loss
    cost_logits = y_conv_loss

    # Regularize hex attention
    alphas_pre_loss_hex = self.alphas_pre_hex + self.epsilon
    alphas_hyp_loss_hex = self.alphas_hyp_hex + self.epsilon
    reg1 = tf.reduce_mean(-tf.reduce_sum(alphas_pre_loss_hex * tf.log(alphas_pre_loss_hex), axis=1))
    reg2 = tf.reduce_mean(-tf.reduce_sum(alphas_hyp_loss_hex * tf.log(alphas_hyp_loss_hex), axis=1))
    reg = reg1 + reg2

    self.total_cost = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=y_conv_loss))
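# A minimal NumPy sketch of the HEX-style projection built above: the logits of
# the full branch (Fb) are projected onto the orthogonal complement of the column
# space of the superficial-branch logits (Fg), i.e.
# F_L = (I - Fg (Fg^T Fg)^{-1} Fg^T) Fb. Names below are illustrative only;
# dropout and the trainable W_cl split are omitted.
import numpy as np

def hex_project(F_g, F_b):
    # F_g: [N, C] logits from the superficial (attention-only / bag-of-words) branch
    # F_b: [N, C] logits from the joint branch
    inv = np.linalg.inv(F_g.T @ F_g)          # [C, C]
    return F_b - F_g @ inv @ F_g.T @ F_b      # remove the part explainable by F_g

F_g = np.random.randn(8, 3)
F_b = np.random.randn(8, 3)
F_L = hex_project(F_g, F_b)
print(np.abs(F_g.T @ F_L).max())  # ~0: projected logits are orthogonal to F_g's columns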
def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
    ## Define hyperparameters
    lambd = 0.05
    ## note: embedding_dim and hidden_dim are both 300, used interchangeably
    self.embedding_dim = emb_dim
    self.dim = hidden_dim
    self.sequence_length = seq_length

    ## Define the placeholders
    self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
    self.hypothesis_x = tf.placeholder(tf.int32, [None, self.sequence_length])
    self.y = tf.placeholder(tf.int32, [None])
    self.keep_rate_ph = tf.placeholder(tf.float32, [])

    ## Define parameters
    self.E = tf.Variable(embeddings, trainable=emb_train)
    self.W_mlp = tf.Variable(tf.random_normal([self.dim * 8, self.dim], stddev=0.1))
    self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1))
    self.W_cl = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
    self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

    ## Function for embedding lookup and dropout at embedding layer
    def emb_drop(x):
        emb = tf.nn.embedding_lookup(self.E, x)
        emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
        return emb_drop

    # Get lengths of unpadded sentences
    prem_seq_lengths, mask_prem = blocks.length(self.premise_x)
    hyp_seq_lengths, mask_hyp = blocks.length(self.hypothesis_x)

    ### First biLSTM layer ###
    premise_in = emb_drop(self.premise_x)
    hypothesis_in = emb_drop(self.hypothesis_x)

    premise_outs, c1 = blocks.biLSTM(premise_in, dim=self.dim, seq_len=prem_seq_lengths, name='premise')
    hypothesis_outs, c2 = blocks.biLSTM(hypothesis_in, dim=self.dim, seq_len=hyp_seq_lengths, name='hypothesis')

    premise_bi = tf.concat(premise_outs, axis=2)
    hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

    premise_list = tf.unstack(premise_bi, axis=1)
    hypothesis_list = tf.unstack(hypothesis_bi, axis=1)

    ### Attention ###
    scores_all = []
    premise_attn = []
    alphas = []
    for i in range(self.sequence_length):
        scores_i_list = []
        for j in range(self.sequence_length):
            score_ij = tf.reduce_sum(tf.multiply(premise_list[i], hypothesis_list[j]), 1, keep_dims=True)
            scores_i_list.append(score_ij)
        scores_i = tf.stack(scores_i_list, axis=1)
        alpha_i = blocks.masked_softmax(scores_i, mask_hyp)
        a_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, hypothesis_bi), 1)
        premise_attn.append(a_tilde_i)
        scores_all.append(scores_i)
        alphas.append(alpha_i)

    scores_stack = tf.stack(scores_all, axis=2)
    scores_list = tf.unstack(scores_stack, axis=1)

    hypothesis_attn = []
    betas = []
    for j in range(self.sequence_length):
        scores_j = scores_list[j]
        beta_j = blocks.masked_softmax(scores_j, mask_prem)
        b_tilde_j = tf.reduce_sum(tf.multiply(beta_j, premise_bi), 1)
        hypothesis_attn.append(b_tilde_j)
        betas.append(beta_j)

    # Make attention-weighted sentence representations into one tensor,
    premise_attns = tf.stack(premise_attn, axis=1)
    hypothesis_attns = tf.stack(hypothesis_attn, axis=1)

    # For making attention plots,
    self.alpha_s = tf.stack(alphas, axis=2)
    self.beta_s = tf.stack(betas, axis=2)

    ### Subcomponent Inference ###
    prem_diff = tf.subtract(premise_bi, premise_attns)
    prem_mul = tf.multiply(premise_bi, premise_attns)
    hyp_diff = tf.subtract(hypothesis_bi, hypothesis_attns)
    hyp_mul = tf.multiply(hypothesis_bi, hypothesis_attns)

    m_a = tf.concat([premise_bi, premise_attns, prem_diff, prem_mul], 2)
    m_b = tf.concat([hypothesis_bi, hypothesis_attns, hyp_diff, hyp_mul], 2)

    ### Inference Composition ###
    v1_outs, c3 = blocks.biLSTM(m_a, dim=self.dim, seq_len=prem_seq_lengths, name='v1')
    v2_outs, c4 = blocks.biLSTM(m_b, dim=self.dim, seq_len=hyp_seq_lengths, name='v2')
    v1_bi = tf.concat(v1_outs, axis=2)
    v2_bi = tf.concat(v2_outs, axis=2)

    ### Pooling Layer ###
    v_1_sum = tf.reduce_sum(v1_bi, 1)
    v_1_ave = tf.div(v_1_sum, tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1))
    v_2_sum = tf.reduce_sum(v2_bi, 1)
    v_2_ave = tf.div(v_2_sum, tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1))
    v_1_max = tf.reduce_max(v1_bi, 1)
    v_2_max = tf.reduce_max(v2_bi, 1)
    v = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max], 1)

    # MLP layer
    h_mlp = tf.nn.tanh(tf.matmul(v, self.W_mlp) + self.b_mlp)

    ############### MY CODE STARTS #####
    # Define layer size
    self.bow_layer_size = 600

    # LSTM layer (final layer of the original ESIM model)
    h_fc1 = h_mlp

    # Bag-of-word input (averaging word embeddings)
    bow_pre = premise_in
    bow_hyp = hypothesis_in
    # print(bow_pre.shape) -> (?, 50, 300)
    bag_of_word_pre = tf.reduce_mean(bow_pre, 1)
    bag_of_word_hyp = tf.reduce_mean(bow_hyp, 1)
    # print(bag_of_word_pre.shape) -> (?, 300)
    bag_of_word_in = tf.concat([bag_of_word_pre, bag_of_word_hyp], 1)
    # print(bag_of_word_in.shape) -> (?, 600)

    # Bag-of-word input layer params
    h_fc2 = bag_of_word_in
    # print(h_fc2.shape) -> (?, 600)

    # Bag-of-word output layer params
    weights_from_split = np.load("../../rearrangingDS/rearranged_even_seqlen50/weights.npy")  # (600, 3)
    bias_from_split = np.load("../../rearrangingDS/rearranged_even_seqlen50/bias.npy")  # (3,)
    self.W_cl_1 = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
    self.W_cl_2 = tf.Variable(tf.random_normal([600, 3]), trainable=True)
    self.b_cl = tf.Variable(tf.random_normal((3,)), trainable=True)
    self.W_cl = tf.concat([self.W_cl_1, self.W_cl_2], 0)

    reg = lambd * tf.reduce_sum(tf.abs(self.W_cl_2)) / (2 * 50)

    # Compute prediction using [h_fc1, 0(pad)]
    pad = tf.zeros_like(h_fc2, tf.float32)
    # print(pad.shape) -> (?, 600)
    yconv_contact_pred = tf.nn.dropout(tf.concat([h_fc1, pad], 1), self.keep_rate_ph)
    y_conv_pred = tf.matmul(yconv_contact_pred, self.W_cl) + self.b_cl
    self.logits = y_conv_pred  # Prediction

    # Compute loss using [h_fc1, h_fc2] and [0(pad2), h_fc2]
    pad2 = tf.zeros_like(h_fc1, tf.float32)
    yconv_contact_H = tf.nn.dropout(tf.concat([pad2, h_fc2], 1), self.keep_rate_ph)
    y_conv_H = tf.matmul(yconv_contact_H, self.W_cl) + self.b_cl  # get Fg
    yconv_contact_loss = tf.nn.dropout(tf.concat([h_fc1, h_fc2], 1), self.keep_rate_ph)
    y_conv_loss = tf.matmul(yconv_contact_loss, self.W_cl) + self.b_cl  # get Fb
    y_conv_loss = y_conv_loss - tf.matmul(
        tf.matmul(tf.matmul(y_conv_H, tf.matrix_inverse(tf.matmul(y_conv_H, y_conv_H, transpose_a=True))),
                  y_conv_H, transpose_b=True),
        y_conv_loss)  # get loss
    cost_logits = y_conv_loss

    self.total_cost = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=y_conv_loss)) + reg  # Cost
def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
    ## Define hyperparameters
    self.embedding_dim = emb_dim
    self.dim = hidden_dim
    self.attention_size = 128
    self.mlp_size = self.dim
    self.sequence_length = seq_length
    self.lam = 0.01
    self.epsilon = 1e-10

    ## Define the placeholders
    self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
    self.hypothesis_x = tf.placeholder(tf.int32, [None, self.sequence_length])
    self.y = tf.placeholder(tf.int32, [None])
    self.keep_rate_ph = tf.placeholder(tf.float32, [])

    ## Define parameters
    self.E = tf.Variable(embeddings, trainable=emb_train)
    self.W_mlp = tf.Variable(tf.random_normal([self.dim * 4, self.mlp_size], stddev=0.1))
    self.b_mlp = tf.Variable(tf.random_normal([self.mlp_size], stddev=0.1))
    self.W_cl = tf.Variable(tf.random_normal([self.mlp_size, 3], stddev=0.1))
    self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

    ## Function for embedding lookup and dropout at embedding layer
    def emb_drop(x):
        emb = tf.nn.embedding_lookup(self.E, x)
        emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
        return emb_drop

    # Get lengths of unpadded sentences
    prem_seq_lengths, prem_mask = blocks.length(self.premise_x)
    hyp_seq_lengths, hyp_mask = blocks.length(self.hypothesis_x)

    ### BiLSTM layer ###
    premise_in = emb_drop(self.premise_x)
    hypothesis_in = emb_drop(self.hypothesis_x)

    ############# MY CODE STARTS ########
    premise_outs, premise_final = blocks.biLSTM(premise_in, dim=self.dim,
                                                seq_len=prem_seq_lengths, name='premise')
    attention_outs_pre, self.alphas_pre = blocks.attention(
        premise_outs, self.attention_size, return_alphas=True)
    drop_pre = tf.nn.dropout(attention_outs_pre, self.keep_rate_ph)
    # drop_pre = attention_outs_pre

    hypothesis_outs, hypothesis_final = blocks.biLSTM(hypothesis_in, dim=self.dim,
                                                      seq_len=hyp_seq_lengths, name='hypothesis')
    attention_outs_hyp, self.alphas_hyp = blocks.attention(
        hypothesis_outs, self.attention_size, return_alphas=True)
    drop_hyp = tf.nn.dropout(attention_outs_hyp, self.keep_rate_ph)
    # drop_hyp = attention_outs_hyp

    # Concatenate the premise and hypothesis outputs
    drop = tf.concat([drop_pre, drop_hyp], axis=1)

    # Add a small constant
    alphas_pre_loss = self.alphas_pre * tf.squeeze(prem_mask) + self.epsilon
    alphas_hyp_loss = self.alphas_hyp * tf.squeeze(hyp_mask) + self.epsilon
    # Calculate entropy
    reg1 = tf.reduce_mean(-tf.reduce_sum(alphas_pre_loss * tf.log(alphas_pre_loss), axis=1))
    reg2 = tf.reduce_mean(-tf.reduce_sum(alphas_hyp_loss * tf.log(alphas_hyp_loss), axis=1))
    reg = reg1 + reg2

    # MLP layer
    h_mlp = tf.nn.relu(tf.matmul(drop, self.W_mlp) + self.b_mlp)
    ############# MY CODE ENDS ########

    # Get prediction
    self.logits = tf.matmul(h_mlp, self.W_cl) + self.b_cl

    # Define the cost function
    self.total_cost = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=self.logits) + self.lam * reg)
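# A minimal NumPy sketch of the attention-entropy term used above: epsilon keeps
# log() finite on masked-out (zero) weights, and the regularizer is the mean
# entropy of the attention distributions, which is scaled by self.lam and added
# to the cross-entropy cost.
import numpy as np

def attention_entropy(alphas, epsilon=1e-10):
    # alphas: [N, L] attention weights (each row sums to ~1 over real tokens)
    a = alphas + epsilon
    return np.mean(-np.sum(a * np.log(a), axis=1))

alphas = np.array([[0.7, 0.2, 0.1, 0.0],        # fairly peaked -> low entropy
                   [0.25, 0.25, 0.25, 0.25]])   # uniform -> high entropy
print(attention_entropy(alphas))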
def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
    ## Define hyperparameters
    self.embedding_dim = emb_dim
    self.dim = hidden_dim
    self.sequence_length = seq_length

    ## Define the placeholders
    self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
    self.hypothesis_x = tf.placeholder(tf.int32, [None, self.sequence_length])
    self.y = tf.placeholder(tf.int32, [None])
    self.keep_rate_ph = tf.placeholder(tf.float32, [])

    ## Define parameters
    self.E = tf.Variable(embeddings, trainable=emb_train)
    self.W_mlp = tf.Variable(tf.random_normal([self.dim * 8, self.dim], stddev=0.1))
    self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1))
    self.W_cl = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
    self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

    ## Function for embedding lookup and dropout at embedding layer
    def emb_drop(x):
        emb = tf.nn.embedding_lookup(self.E, x)
        emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
        return emb_drop

    # Get lengths of unpadded sentences
    prem_seq_lengths, mask_prem = blocks.length(self.premise_x)
    hyp_seq_lengths, mask_hyp = blocks.length(self.hypothesis_x)

    ### First biLSTM layer ###
    premise_in = emb_drop(self.premise_x)
    hypothesis_in = emb_drop(self.hypothesis_x)

    premise_outs, c1 = blocks.biLSTM(premise_in, dim=self.dim, seq_len=prem_seq_lengths, name='premise')
    hypothesis_outs, c2 = blocks.biLSTM(hypothesis_in, dim=self.dim, seq_len=hyp_seq_lengths, name='hypothesis')

    premise_bi = tf.concat(premise_outs, axis=2)
    hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

    premise_list = tf.unstack(premise_bi, axis=1)
    hypothesis_list = tf.unstack(hypothesis_bi, axis=1)

    ### Attention ###
    scores_all = []
    premise_attn = []
    alphas = []
    for i in range(self.sequence_length):
        scores_i_list = []
        for j in range(self.sequence_length):
            score_ij = tf.reduce_sum(tf.multiply(premise_list[i], hypothesis_list[j]), 1, keep_dims=True)
            scores_i_list.append(score_ij)
        scores_i = tf.stack(scores_i_list, axis=1)
        alpha_i = blocks.masked_softmax(scores_i, mask_hyp)
        a_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, hypothesis_bi), 1)
        premise_attn.append(a_tilde_i)
        scores_all.append(scores_i)
        alphas.append(alpha_i)

    scores_stack = tf.stack(scores_all, axis=2)
    scores_list = tf.unstack(scores_stack, axis=1)

    hypothesis_attn = []
    betas = []
    for j in range(self.sequence_length):
        scores_j = scores_list[j]
        beta_j = blocks.masked_softmax(scores_j, mask_prem)
        b_tilde_j = tf.reduce_sum(tf.multiply(beta_j, premise_bi), 1)
        hypothesis_attn.append(b_tilde_j)
        betas.append(beta_j)

    # Make attention-weighted sentence representations into one tensor,
    premise_attns = tf.stack(premise_attn, axis=1)
    hypothesis_attns = tf.stack(hypothesis_attn, axis=1)

    # For making attention plots,
    self.alpha_s = tf.stack(alphas, axis=2)
    self.beta_s = tf.stack(betas, axis=2)

    ### Subcomponent Inference ###
    prem_diff = tf.subtract(premise_bi, premise_attns)
    prem_mul = tf.multiply(premise_bi, premise_attns)
    hyp_diff = tf.subtract(hypothesis_bi, hypothesis_attns)
    hyp_mul = tf.multiply(hypothesis_bi, hypothesis_attns)

    m_a = tf.concat([premise_bi, premise_attns, prem_diff, prem_mul], 2)
    m_b = tf.concat([hypothesis_bi, hypothesis_attns, hyp_diff, hyp_mul], 2)

    ### Inference Composition ###
    v1_outs, c3 = blocks.biLSTM(m_a, dim=self.dim, seq_len=prem_seq_lengths, name='v1')
    v2_outs, c4 = blocks.biLSTM(m_b, dim=self.dim, seq_len=hyp_seq_lengths, name='v2')
    v1_bi = tf.concat(v1_outs, axis=2)
    v2_bi = tf.concat(v2_outs, axis=2)

    ### Pooling Layer ###
    v_1_sum = tf.reduce_sum(v1_bi, 1)
    v_1_ave = tf.div(v_1_sum, tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1))
    v_2_sum = tf.reduce_sum(v2_bi, 1)
    v_2_ave = tf.div(v_2_sum, tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1))
    v_1_max = tf.reduce_max(v1_bi, 1)
    v_2_max = tf.reduce_max(v2_bi, 1)
    v = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max], 1)

    # MLP layer
    h_mlp = tf.nn.tanh(tf.matmul(v, self.W_mlp) + self.b_mlp)
    # Dropout applied to classifier
    h_drop = tf.nn.dropout(h_mlp, self.keep_rate_ph)
    # Get prediction
    self.logits = tf.matmul(h_drop, self.W_cl) + self.b_cl

    # Define the cost function
    self.total_cost = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=self.logits))
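# A minimal NumPy sketch of the ESIM pooling layer above: sum the composition-layer
# outputs over time and divide by the true (unpadded) length for the average, and
# take an elementwise max over time; the pooled premise and hypothesis vectors are
# then concatenated. This assumes padded time steps are already zero, as with a
# length-aware RNN.
import numpy as np

def pool(v_bi, seq_lengths):
    # v_bi: [N, L, 2d] composition-layer outputs, seq_lengths: [N] true lengths
    v_ave = v_bi.sum(axis=1) / seq_lengths[:, None].astype(np.float32)  # [N, 2d]
    v_max = v_bi.max(axis=1)                                            # [N, 2d]
    return v_ave, v_max

v1_bi = np.random.randn(4, 50, 600)
lengths = np.array([50, 32, 7, 50])
v_1_ave, v_1_max = pool(v1_bi, lengths)
print(v_1_ave.shape, v_1_max.shape)  # (4, 600) (4, 600)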
def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
    ## Define hyperparameters
    self.embedding_dim = emb_dim
    self.dim = hidden_dim
    self.sequence_length = seq_length

    ## Define the placeholders
    self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
    self.hypothesis_x = tf.placeholder(tf.int32, [None, self.sequence_length])
    self.premise_pos = tf.placeholder(tf.int32, [None, self.sequence_length, 47], name='premise_pos')
    self.hypothesis_pos = tf.placeholder(tf.int32, [None, self.sequence_length, 47], name='hypothesis_pos')
    self.y = tf.placeholder(tf.int32, [None])
    self.keep_rate_ph = tf.placeholder(tf.float32, [])

    ## Define parameters
    self.E = tf.Variable(embeddings, trainable=emb_train)
    self.W_mlp = tf.Variable(tf.random_normal([self.dim * 12, self.dim], stddev=0.1))
    self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1))
    self.W_cl = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
    self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

    # ## Define External Knowledge dictionary para.
    # self.exterKnowledge_dic = exterKnowledge_dic

    ## Define R_matrix
    self.R_mat = tf.placeholder(tf.float32, [None, self.sequence_length, self.sequence_length])

    ## Function for embedding lookup and dropout at embedding layer
    def emb_drop(x):
        emb = tf.nn.embedding_lookup(self.E, x)
        emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
        return emb_drop

    # Get lengths of unpadded sentences
    prem_seq_lengths, mask_prem = blocks.length(self.premise_x)
    hyp_seq_lengths, mask_hyp = blocks.length(self.hypothesis_x)

    ### First biLSTM layer ###
    premise_in = tf.concat([emb_drop(self.premise_x), tf.cast(self.premise_pos, tf.float32)], axis=2)
    hypothesis_in = tf.concat([emb_drop(self.hypothesis_x), tf.cast(self.hypothesis_pos, tf.float32)], axis=2)

    premise_outs, c1 = blocks.biLSTM(premise_in, dim=self.dim, seq_len=prem_seq_lengths, name='premise')
    hypothesis_outs, c2 = blocks.biLSTM(hypothesis_in, dim=self.dim, seq_len=hyp_seq_lengths, name='hypothesis')

    premise_bi = tf.concat(premise_outs, axis=2)
    hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

    premise_list = tf.unstack(premise_bi, axis=1)
    hypothesis_list = tf.unstack(hypothesis_bi, axis=1)

    ### Self-attention ###
    premise_project = blocks.dense(premise_bi, 600)
    premise_project_list = tf.unstack(premise_project, axis=1)
    premise_self_attn = []
    alphas = []
    for i in range(self.sequence_length):
        scores_i_list = []
        for j in range(self.sequence_length):
            score_ij = tf.reduce_sum(tf.multiply(premise_project_list[i], premise_project_list[j]),
                                     1, keep_dims=True)
            scores_i_list.append(score_ij)
        scores_i = tf.stack(scores_i_list, axis=1)
        alpha_i = blocks.masked_softmax(scores_i, mask_prem)
        p_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, premise_bi), 1)
        premise_self_attn.append(p_tilde_i)

    hypothesis_project = blocks.dense(hypothesis_bi, 600)
    hypothesis_project_list = tf.unstack(hypothesis_project, axis=1)
    hypothesis_self_attn = []
    for i in range(self.sequence_length):
        scores_i_list = []
        for j in range(self.sequence_length):
            score_ij = tf.reduce_sum(tf.multiply(hypothesis_project_list[i], hypothesis_project_list[j]),
                                     1, keep_dims=True)
            scores_i_list.append(score_ij)
        scores_i = tf.stack(scores_i_list, axis=1)
        beta_i = blocks.masked_softmax(scores_i, mask_hyp)
        h_tilde_i = tf.reduce_sum(tf.multiply(beta_i, hypothesis_bi), 1)
        hypothesis_self_attn.append(h_tilde_i)

    premise_self_attns = tf.stack(premise_self_attn, axis=1)
    hypothesis_self_attns = tf.stack(hypothesis_self_attn, axis=1)

    ### Attention ###
    scores_all = []
    premise_attn = []
    alphas = []
    r_alpha = []
    r_all = []
    for i in range(self.sequence_length):
        scores_i_list = []
        r_i_list = []
        for j in range(self.sequence_length):
            # Calculate the similarity score e_ij
            score_ij_ori = tf.reduce_sum(tf.multiply(premise_list[i], hypothesis_list[j]), 1, keep_dims=True)
            ext_r = tf.expand_dims(self.R_mat[:, i, j], axis=1)
            score_ij = score_ij_ori + ext_r
            scores_i_list.append(score_ij)
            r_ij = self.R_mat[:, i, j]
            r_i_list.append(r_ij)
        # pdb.set_trace()
        scores_i = tf.stack(scores_i_list, axis=1)
        r_i = tf.expand_dims(tf.stack(r_i_list, axis=1), 2)
        # alpha_i: attention weights over hypothesis_bi
        alpha_i = blocks.masked_softmax(scores_i, mask_hyp)
        a_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, hypothesis_bi), 1)
        premise_attn.append(a_tilde_i)
        r_alpha_i = tf.reduce_sum(tf.multiply(r_i, alpha_i), 1)
        scores_all.append(scores_i)
        alphas.append(alpha_i)
        r_alpha.append(r_alpha_i)
        r_all.append(r_i)

    scores_stack = tf.stack(scores_all, axis=2)
    scores_list = tf.unstack(scores_stack, axis=1)  # turn the i index into the j index
    r_stack = tf.stack(r_all, axis=2)
    r_list = tf.unstack(r_stack, axis=1)  # turn the i index into the j index

    hypothesis_attn = []
    betas = []
    r_beta = []
    for j in range(self.sequence_length):
        scores_j = scores_list[j]
        beta_j = blocks.masked_softmax(scores_j, mask_prem)
        b_tilde_j = tf.reduce_sum(tf.multiply(beta_j, premise_bi), 1)
        hypothesis_attn.append(b_tilde_j)
        r_j = r_list[j]
        r_beta_j = tf.reduce_sum(tf.multiply(r_j, beta_j), 1)
        r_beta.append(r_beta_j)
        betas.append(beta_j)

    # Make r_alpha and r_beta into tensors
    r_alphas = tf.stack(r_alpha, axis=1)
    r_betas = tf.stack(r_beta, axis=1)

    # Make attention-weighted sentence representations into one tensor,
    premise_attns = tf.stack(premise_attn, axis=1)
    hypothesis_attns = tf.stack(hypothesis_attn, axis=1)

    # For making attention plots,
    self.alpha_s = tf.stack(alphas, axis=2)
    self.beta_s = tf.stack(betas, axis=2)

    ### Subcomponent Inference ###
    prem_self_diff = tf.subtract(premise_bi, premise_self_attns)
    prem_self_mul = tf.multiply(premise_bi, premise_self_attns)
    hyp_self_diff = tf.subtract(hypothesis_bi, hypothesis_self_attns)
    hyp_self_mul = tf.multiply(hypothesis_bi, hypothesis_self_attns)

    prem_diff = tf.subtract(premise_bi, premise_attns)
    prem_mul = tf.multiply(premise_bi, premise_attns)
    hyp_diff = tf.subtract(hypothesis_bi, hypothesis_attns)
    hyp_mul = tf.multiply(hypothesis_bi, hypothesis_attns)

    ### Factorization Machine ###
    FM_premise_self_attns = tf.expand_dims(
        blocks.factorize_machine(tf.concat([premise_bi, premise_self_attns], 2)), 2)
    FM_prem_self_diff = tf.expand_dims(blocks.factorize_machine(prem_self_diff), 2)
    FM_prem_self_mul = tf.expand_dims(blocks.factorize_machine(prem_self_mul), 2)
    FM_hypothesis_self_attns = tf.expand_dims(
        blocks.factorize_machine(tf.concat([hypothesis_bi, hypothesis_self_attns], 2)), 2)
    FM_hyp_self_diff = tf.expand_dims(blocks.factorize_machine(hyp_self_diff), 2)
    FM_hyp_self_mul = tf.expand_dims(blocks.factorize_machine(hyp_self_mul), 2)

    FM_premise_attns = tf.expand_dims(
        blocks.factorize_machine(tf.concat([premise_bi, premise_attns], 2)), 2)
    FM_prem_diff = tf.expand_dims(blocks.factorize_machine(prem_diff), 2)
    FM_prem_mul = tf.expand_dims(blocks.factorize_machine(prem_mul), 2)
    FM_hypothesis_attns = tf.expand_dims(
        blocks.factorize_machine(tf.concat([hypothesis_bi, hypothesis_attns], 2)), 2)
    FM_hyp_diff = tf.expand_dims(blocks.factorize_machine(hyp_diff), 2)
    FM_hyp_mul = tf.expand_dims(blocks.factorize_machine(hyp_mul), 2)

    m_a = tf.concat([premise_bi, FM_premise_attns, FM_prem_diff, FM_prem_mul,
                     FM_premise_self_attns, FM_prem_self_diff, FM_prem_self_mul, r_alphas], 2)
    m_b = tf.concat([hypothesis_bi, FM_hypothesis_attns, FM_hyp_diff, FM_hyp_mul,
                     FM_hypothesis_self_attns, FM_hyp_self_diff, FM_hyp_self_mul, r_betas], 2)

    ### Inference Composition ###
    v1_outs, c3 = blocks.biLSTM(m_a, dim=self.dim, seq_len=prem_seq_lengths, name='v1')
    v2_outs, c4 = blocks.biLSTM(m_b, dim=self.dim, seq_len=hyp_seq_lengths, name='v2')
    v1_bi = tf.concat(v1_outs, axis=2)
    v2_bi = tf.concat(v2_outs, axis=2)

    ### Pooling Layer ###
    v_1_sum = tf.reduce_sum(v1_bi, 1)
    v_1_ave = tf.div(v_1_sum, tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1))
    v_2_sum = tf.reduce_sum(v2_bi, 1)
    v_2_ave = tf.div(v_2_sum, tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1))
    v_1_max = tf.reduce_max(v1_bi, 1)
    v_2_max = tf.reduce_max(v2_bi, 1)

    alpha_w = blocks.masked_softmax(blocks.dense(r_alphas, 1), mask_prem)
    a_w = tf.reduce_sum(tf.multiply(alpha_w, v1_bi), 1)
    beta_w = blocks.masked_softmax(blocks.dense(r_betas, 1), mask_hyp)
    b_w = tf.reduce_sum(tf.multiply(beta_w, v2_bi), 1)

    v = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max, a_w, b_w], 1)

    # MLP layer
    h_mlp = tf.nn.tanh(tf.matmul(v, self.W_mlp) + self.b_mlp)
    # Dropout applied to classifier
    h_drop = tf.nn.dropout(h_mlp, self.keep_rate_ph)
    # Get prediction
    self.logits = tf.matmul(h_drop, self.W_cl) + self.b_cl

    # Define the cost function
    self.total_cost = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=self.logits))
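# A minimal NumPy sketch of a standard second-order factorization machine, which
# is what the blocks.factorize_machine calls above are assumed to apply per time
# step, collapsing a feature vector to a single interaction-aware scalar. The
# repo's implementation may differ in its exact parameterization; names and
# sizes below are illustrative only.
import numpy as np

def factorization_machine(x, w0, w, V):
    # x: [d] features, w0: scalar bias, w: [d] linear weights, V: [d, k] factor matrix
    linear = w0 + x @ w
    interactions = 0.5 * np.sum((x @ V) ** 2 - (x ** 2) @ (V ** 2))
    return linear + interactions

d, k = 1200, 16
x = np.random.randn(d)
print(factorization_machine(x, 0.0, np.random.randn(d) * 0.01, np.random.randn(d, k) * 0.01))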
def __init__(self, config, seq_length, emb_dim, hidden_dim, emb_train, embeddings=None,
             pred_size=3, context_seq_len=None, query_seq_len=None):
    ## Define hyperparameters
    # tf.reset_default_graph()
    self.embedding_dim = emb_dim
    self.dim = hidden_dim
    self.sequence_length = seq_length
    self.pred_size = pred_size
    self.context_seq_len = context_seq_len
    self.query_seq_len = query_seq_len
    # self.config = config

    ## Define the placeholders
    self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length], name='premise')
    self.hypothesis_x = tf.placeholder(tf.int32, [None, self.sequence_length], name='hypothesis')
    self.premise_pos = tf.placeholder(tf.int32, [None, self.sequence_length, 47], name='premise_pos')
    self.hypothesis_pos = tf.placeholder(tf.int32, [None, self.sequence_length, 47], name='hypothesis_pos')
    self.premise_char = tf.placeholder(tf.int32, [None, self.sequence_length, config.char_in_word_size],
                                       name='premise_char')
    self.hypothesis_char = tf.placeholder(tf.int32, [None, self.sequence_length, config.char_in_word_size],
                                          name='hypothesis_char')
    self.premise_exact_match = tf.placeholder(tf.int32, [None, self.sequence_length, 1],
                                              name='premise_exact_match')
    self.hypothesis_exact_match = tf.placeholder(tf.int32, [None, self.sequence_length, 1],
                                                 name='hypothesis_exact_match')

    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    self.dropout_keep_rate = tf.train.exponential_decay(config.keep_rate, self.global_step,
                                                        config.dropout_decay_step,
                                                        config.dropout_decay_rate,
                                                        staircase=False, name='dropout_keep_rate')
    config.keep_rate = self.dropout_keep_rate
    tf.summary.scalar('dropout_keep_rate', self.dropout_keep_rate)

    self.y = tf.placeholder(tf.int32, [None], name='label_y')
    self.keep_rate_ph = tf.placeholder(tf.float32, [], name='keep_prob')
    self.is_train = tf.placeholder('bool', [], name='is_train')

    ## Function for embedding lookup and dropout at embedding layer
    def emb_drop(E, x):
        emb = tf.nn.embedding_lookup(E, x)
        emb_drop = tf.cond(self.is_train, lambda: tf.nn.dropout(emb, config.keep_rate), lambda: emb)
        return emb_drop

    # Get lengths of unpadded sentences
    prem_seq_lengths, prem_mask = blocks.length(self.premise_x)  # mask [N, L, 1]
    hyp_seq_lengths, hyp_mask = blocks.length(self.hypothesis_x)
    self.prem_mask = prem_mask
    self.hyp_mask = hyp_mask

    ### Embedding layer ###
    with tf.variable_scope("emb"):
        with tf.variable_scope("emb_var"), tf.device("/cpu:0"):
            self.E = tf.Variable(embeddings, trainable=emb_train)
            premise_in = emb_drop(self.E, self.premise_x)        # P
            hypothesis_in = emb_drop(self.E, self.hypothesis_x)  # H

    with tf.variable_scope("char_emb"):
        char_emb_mat = tf.get_variable("char_emb_mat",
                                       shape=[config.char_vocab_size, config.char_emb_size])
        with tf.variable_scope("char") as scope:
            char_pre = tf.nn.embedding_lookup(char_emb_mat, self.premise_char)
            char_hyp = tf.nn.embedding_lookup(char_emb_mat, self.hypothesis_char)

            filter_sizes = list(map(int, config.out_channel_dims.split(',')))  # [100]
            heights = list(map(int, config.filter_heights.split(',')))  # [5]
            assert sum(filter_sizes) == config.char_out_size, (filter_sizes, config.char_out_size)

            with tf.variable_scope("conv") as scope:
                conv_pre = multi_conv1d(char_pre, filter_sizes, heights, "VALID",
                                        self.is_train, config.keep_rate, scope='conv')
                scope.reuse_variables()
                conv_hyp = multi_conv1d(char_hyp, filter_sizes, heights, "VALID",
                                        self.is_train, config.keep_rate, scope='conv')
                conv_pre = tf.reshape(conv_pre, [-1, self.sequence_length, config.char_out_size])
                conv_hyp = tf.reshape(conv_hyp, [-1, self.sequence_length, config.char_out_size])

    premise_in = tf.concat([premise_in, conv_pre], axis=2)
    hypothesis_in = tf.concat([hypothesis_in, conv_hyp], axis=2)

    premise_in = tf.concat((premise_in, tf.cast(self.premise_pos, tf.float32)), axis=2)
    hypothesis_in = tf.concat((hypothesis_in, tf.cast(self.hypothesis_pos, tf.float32)), axis=2)

    premise_in = tf.concat([premise_in, tf.cast(self.premise_exact_match, tf.float32)], axis=2)
    hypothesis_in = tf.concat([hypothesis_in, tf.cast(self.hypothesis_exact_match, tf.float32)], axis=2)

    with tf.variable_scope("highway") as scope:
        premise_in = highway_network(premise_in, config.highway_num_layers, True,
                                     wd=config.wd, is_train=self.is_train)
        scope.reuse_variables()
        hypothesis_in = highway_network(hypothesis_in, config.highway_num_layers, True,
                                        wd=config.wd, is_train=self.is_train)

    with tf.variable_scope("prepro") as scope:
        pre = premise_in
        hyp = hypothesis_in
        for i in range(config.self_att_enc_layers):
            with tf.variable_scope(tf.get_variable_scope(), reuse=False):
                p = self_attention_layer(config, self.is_train, pre, p_mask=prem_mask,
                                         scope="{}_layer_self_att_enc".format(i))  # [N, len, dim]
                h = self_attention_layer(config, self.is_train, hyp, p_mask=hyp_mask,
                                         scope="{}_layer_self_att_enc_h".format(i))
                pre = p
                hyp = h
                variable_summaries(p, "p_self_enc_summary_layer_{}".format(i))
                variable_summaries(h, "h_self_enc_summary_layer_{}".format(i))

    with tf.variable_scope("main") as scope:

        def model_one_side(config, main, support, main_length, support_length,
                           main_mask, support_mask, scope):
            bi_att_mx = bi_attention_mx(config, self.is_train, main, support,
                                        p_mask=main_mask, h_mask=support_mask)  # [N, PL, HL]
            bi_att_mx = tf.cond(self.is_train,
                                lambda: tf.nn.dropout(bi_att_mx, config.keep_rate),
                                lambda: bi_att_mx)
            out_final = dense_net(config, bi_att_mx, self.is_train)
            return out_final

        premise_final = model_one_side(config, p, h, prem_seq_lengths, hyp_seq_lengths,
                                       prem_mask, hyp_mask, scope="premise_as_main")
        f0 = premise_final
        print('f0:', f0.get_shape().as_list())

    self.logits = linear(f0, self.pred_size, True, bias_start=0.0, scope="logit", squeeze=False,
                         wd=config.wd, input_keep_prob=config.keep_rate, is_train=self.is_train)
    tf.summary.histogram('logit_histogram', self.logits)

    # Define the cost function
    self.total_cost = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=self.logits))

    # Calculate accuracy
    self.acc = tf.reduce_mean(
        tf.cast(tf.equal(tf.arg_max(self.logits, dimension=1), tf.cast(self.y, tf.int64)), tf.float32))
    tf.summary.scalar('acc', self.acc)
    tf.summary.scalar('loss', self.total_cost)

    # L2 Loss
    if config.l2_loss:
        if config.sigmoid_growing_l2loss:
            weights_added = tf.add_n([
                tf.nn.l2_loss(tensor) for tensor in tf.trainable_variables()
                if tensor.name.endswith("weights:0")
                and not tensor.name.endswith("weighted_sum/weights:0")
                or tensor.name.endswith('kernel:0')
            ])
            full_l2_step = tf.constant(config.weight_l2loss_step_full_reg, dtype=tf.int32,
                                       shape=[], name='full_l2reg_step')
            full_l2_ratio = tf.constant(config.l2_regularization_ratio, dtype=tf.float32,
                                        shape=[], name='l2_regularization_ratio')
            gs_flt = tf.cast(self.global_step, tf.float32)
            half_l2_step_flt = tf.cast(full_l2_step / 2, tf.float32)
            # l2loss_ratio = tf.sigmoid(tf.cast((self.global_step - full_l2_step / 2) * 8, tf.float32)
            #                           / tf.cast(full_l2_step / 2, tf.float32)) * full_l2_ratio
            l2loss_ratio = tf.sigmoid(((gs_flt - half_l2_step_flt) * 8) / half_l2_step_flt) * full_l2_ratio
            tf.summary.scalar('l2loss_ratio', l2loss_ratio)
            l2loss = weights_added * l2loss_ratio
        else:
            l2loss = tf.add_n([
                tf.nn.l2_loss(tensor) for tensor in tf.trainable_variables()
                if tensor.name.endswith("weights:0") or tensor.name.endswith('kernel:0')
            ]) * tf.constant(config.l2_regularization_ratio, dtype='float', shape=[],
                             name='l2_regularization_ratio')
        tf.summary.scalar('l2loss', l2loss)
        self.total_cost += l2loss

    if config.wo_enc_sharing or config.wo_highway_sharing_but_penalize_diff:
        diffs = []
        for i in range(config.self_att_enc_layers):
            for tensor in tf.trainable_variables():
                print(tensor.name)
                if tensor.name == "prepro/{}_layer_self_att_enc/self_attention/h_logits/first/kernel:0".format(i):
                    l_lg = tensor
                elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_attention/h_logits/first/kernel:0".format(i):
                    r_lg = tensor
                elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/lhs_1/kernel:0".format(i):
                    l_fg_lhs_1 = tensor
                elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/lhs_1/kernel:0".format(i):
                    r_fg_lhs_1 = tensor
                elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/rhs_1/kernel:0".format(i):
                    l_fg_rhs_1 = tensor
                elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/rhs_1/kernel:0".format(i):
                    r_fg_rhs_1 = tensor
                elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/lhs_2/kernel:0".format(i):
                    l_fg_lhs_2 = tensor
                elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/lhs_2/kernel:0".format(i):
                    r_fg_lhs_2 = tensor
                elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/rhs_2/kernel:0".format(i):
                    l_fg_rhs_2 = tensor
                elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/rhs_2/kernel:0".format(i):
                    r_fg_rhs_2 = tensor
                if config.two_gate_fuse_gate:
                    if tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/lhs_3/kernel:0".format(i):
                        l_fg_lhs_3 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/lhs_3/kernel:0".format(i):
                        r_fg_lhs_3 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc/self_att_fuse_gate/rhs_3/kernel:0".format(i):
                        l_fg_rhs_3 = tensor
                    elif tensor.name == "prepro/{}_layer_self_att_enc_h/self_att_fuse_gate/rhs_3/kernel:0".format(i):
                        r_fg_rhs_3 = tensor
            diffs += [l_lg - r_lg, l_fg_lhs_1 - r_fg_lhs_1, l_fg_rhs_1 - r_fg_rhs_1,
                      l_fg_lhs_2 - r_fg_lhs_2, l_fg_rhs_2 - r_fg_rhs_2]
            if config.two_gate_fuse_gate:
                diffs += [l_fg_lhs_3 - r_fg_lhs_3, l_fg_rhs_3 - r_fg_rhs_3]

        diff_loss = tf.add_n([tf.nn.l2_loss(tensor) for tensor in diffs]) * tf.constant(
            config.diff_penalty_loss_ratio, dtype='float', shape=[], name='diff_penalty_loss_ratio')
        tf.summary.scalar('diff_penalty_loss', diff_loss)
        self.total_cost += diff_loss

    self.summary = tf.summary.merge_all()

    # Count trainable parameters
    total_parameters = 0
    for v in tf.global_variables():
        if not v.name.endswith("weights:0") and not v.name.endswith("biases:0") \
                and not v.name.endswith('kernel:0') and not v.name.endswith('bias:0'):
            continue
        print(v.name)
        # print(type(v.name))
        shape = v.get_shape().as_list()
        param_num = 1
        for dim in shape:
            param_num *= dim
        print(param_num)
        total_parameters += param_num
    print(total_parameters)
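# A minimal NumPy sketch of the sigmoid-growing L2 schedule above: the ratio
# ramps from ~0 toward the full l2_regularization_ratio around the midpoint of
# weight_l2loss_step_full_reg training steps. The concrete values below are
# illustrative, not the repo's defaults.
import numpy as np

def l2loss_ratio(global_step, full_l2_step, full_l2_ratio):
    half = full_l2_step / 2.0
    # ratio * sigmoid((step - half) * 8 / half)
    return full_l2_ratio / (1.0 + np.exp(-(global_step - half) * 8.0 / half))

for step in (0, 25000, 50000, 100000):
    print(step, l2loss_ratio(step, full_l2_step=100000, full_l2_ratio=9e-5))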
def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
    ## Define hyperparameters
    ## note: embedding_dim and hidden_dim are both 300, used interchangeably
    self.embedding_dim = emb_dim
    self.dim = hidden_dim
    self.sequence_length = seq_length

    ## Define the placeholders
    self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
    self.hypothesis_x = tf.placeholder(tf.int32, [None, self.sequence_length])
    self.y = tf.placeholder(tf.int32, [None])
    self.keep_rate_ph = tf.placeholder(tf.float32, [])

    ## Define parameters
    self.E = tf.Variable(embeddings, trainable=emb_train)
    self.W_mlp = tf.Variable(tf.random_normal([self.dim * 8, self.dim], stddev=0.1))
    self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1))
    self.W_cl = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
    self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

    ## Function for embedding lookup and dropout at embedding layer
    def emb_drop(x):
        emb = tf.nn.embedding_lookup(self.E, x)
        emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
        return emb_drop

    # Get lengths of unpadded sentences
    prem_seq_lengths, mask_prem = blocks.length(self.premise_x)
    hyp_seq_lengths, mask_hyp = blocks.length(self.hypothesis_x)

    ### First biLSTM layer ###
    premise_in = emb_drop(self.premise_x)
    hypothesis_in = emb_drop(self.hypothesis_x)

    premise_outs, c1 = blocks.biLSTM(premise_in, dim=self.dim, seq_len=prem_seq_lengths, name='premise')
    hypothesis_outs, c2 = blocks.biLSTM(hypothesis_in, dim=self.dim, seq_len=hyp_seq_lengths, name='hypothesis')

    premise_bi = tf.concat(premise_outs, axis=2)
    hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

    premise_list = tf.unstack(premise_bi, axis=1)
    hypothesis_list = tf.unstack(hypothesis_bi, axis=1)

    ### Attention ###
    scores_all = []
    premise_attn = []
    alphas = []
    for i in range(self.sequence_length):
        scores_i_list = []
        for j in range(self.sequence_length):
            score_ij = tf.reduce_sum(tf.multiply(premise_list[i], hypothesis_list[j]), 1, keep_dims=True)
            scores_i_list.append(score_ij)
        scores_i = tf.stack(scores_i_list, axis=1)
        alpha_i = blocks.masked_softmax(scores_i, mask_hyp)
        a_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, hypothesis_bi), 1)
        premise_attn.append(a_tilde_i)
        scores_all.append(scores_i)
        alphas.append(alpha_i)

    scores_stack = tf.stack(scores_all, axis=2)
    scores_list = tf.unstack(scores_stack, axis=1)

    hypothesis_attn = []
    betas = []
    for j in range(self.sequence_length):
        scores_j = scores_list[j]
        beta_j = blocks.masked_softmax(scores_j, mask_prem)
        b_tilde_j = tf.reduce_sum(tf.multiply(beta_j, premise_bi), 1)
        hypothesis_attn.append(b_tilde_j)
        betas.append(beta_j)

    # Make attention-weighted sentence representations into one tensor,
    premise_attns = tf.stack(premise_attn, axis=1)
    hypothesis_attns = tf.stack(hypothesis_attn, axis=1)

    # For making attention plots,
    self.alpha_s = tf.stack(alphas, axis=2)
    self.beta_s = tf.stack(betas, axis=2)

    ### Subcomponent Inference ###
    prem_diff = tf.subtract(premise_bi, premise_attns)
    prem_mul = tf.multiply(premise_bi, premise_attns)
    hyp_diff = tf.subtract(hypothesis_bi, hypothesis_attns)
    hyp_mul = tf.multiply(hypothesis_bi, hypothesis_attns)

    m_a = tf.concat([premise_bi, premise_attns, prem_diff, prem_mul], 2)
    m_b = tf.concat([hypothesis_bi, hypothesis_attns, hyp_diff, hyp_mul], 2)

    ### Inference Composition ###
    v1_outs, c3 = blocks.biLSTM(m_a, dim=self.dim, seq_len=prem_seq_lengths, name='v1')
    v2_outs, c4 = blocks.biLSTM(m_b, dim=self.dim, seq_len=hyp_seq_lengths, name='v2')
    v1_bi = tf.concat(v1_outs, axis=2)
    v2_bi = tf.concat(v2_outs, axis=2)

    ### Pooling Layer ###
    v_1_sum = tf.reduce_sum(v1_bi, 1)
    v_1_ave = tf.div(v_1_sum, tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1))
    v_2_sum = tf.reduce_sum(v2_bi, 1)
    v_2_ave = tf.div(v_2_sum, tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1))
    v_1_max = tf.reduce_max(v1_bi, 1)
    v_2_max = tf.reduce_max(v2_bi, 1)
    v = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max], 1)

    # MLP layer
    h_mlp = tf.nn.tanh(tf.matmul(v, self.W_mlp) + self.b_mlp)

    ############### MY CODE STARTS #####
    # Define layer size
    self.bow_layer_size = 300

    # LSTM layer (final layer of the original ESIM model)
    h_fc1 = h_mlp
    h_fc1 = tf.zeros_like(h_mlp)  # Don't need the ESIM output

    # Bag-of-word input (averaging word embeddings)
    bow_pre = premise_in
    bow_hyp = hypothesis_in
    bag_of_word_in = tf.reduce_mean(tf.concat([bow_pre, bow_hyp], 1), 1)

    # Bag-of-word input layer params
    W_fc2 = tf.Variable(tf.random_normal([self.dim, self.bow_layer_size], stddev=0.1))
    b_fc2 = tf.Variable(tf.zeros([self.bow_layer_size]))
    h_fc2 = tf.nn.relu(tf.matmul(bag_of_word_in, W_fc2) + b_fc2)

    # Bag-of-word output layer params
    self.W_cl = tf.Variable(tf.random_normal([self.dim + self.bow_layer_size, 3], stddev=0.1))
    self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

    pad2 = tf.zeros_like(h_fc1, tf.float32)
    # Compute both cost and prediction using yconv_contact_H
    yconv_contact_H = tf.nn.dropout(tf.concat([pad2, h_fc2], 1), self.keep_rate_ph)
    y_conv_H = tf.matmul(yconv_contact_H, self.W_cl) + self.b_cl
    y_conv_pred = y_conv_H
    y_conv_loss = y_conv_H

    self.logits = y_conv_pred  # Prediction
    self.total_cost = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=self.logits))  # Cost
def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
    ## Define hyperparameters
    self.embedding_dim = emb_dim
    self.dim = hidden_dim
    self.sequence_length = seq_length

    ## Define the placeholders
    self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
    self.hypothesis_x = tf.placeholder(tf.int32, [None, self.sequence_length])
    self.y = tf.placeholder(tf.int32, [None])
    self.keep_rate_ph = tf.placeholder(tf.float32, [])

    ## Define parameters
    self.E = tf.Variable(embeddings, trainable=emb_train)
    self.W_mlp = tf.Variable(tf.random_normal([self.dim * 8, self.dim], stddev=0.1))
    self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1))
    self.W_cl = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
    self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

    ## Function for embedding lookup and dropout at embedding layer
    def emb_drop(x):
        emb = tf.nn.embedding_lookup(self.E, x)
        emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
        return emb_drop

    # Get lengths of unpadded sentences
    prem_seq_lengths, prem_mask = blocks.length(self.premise_x)
    hyp_seq_lengths, hyp_mask = blocks.length(self.hypothesis_x)

    ### BiLSTM layer ###
    premise_in = emb_drop(self.premise_x)
    hypothesis_in = emb_drop(self.hypothesis_x)

    premise_outs, c1 = blocks.biLSTM(premise_in, dim=self.dim, seq_len=prem_seq_lengths, name='premise')
    hypothesis_outs, c2 = blocks.biLSTM(hypothesis_in, dim=self.dim, seq_len=hyp_seq_lengths, name='hypothesis')

    premise_bi = tf.concat(premise_outs, axis=2)
    hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

    # premise_final = blocks.last_output(premise_bi, prem_seq_lengths)
    # hypothesis_final = blocks.last_output(hypothesis_bi, hyp_seq_lengths)

    ### Mean pooling ###
    premise_sum = tf.reduce_sum(premise_bi, 1)
    premise_ave = tf.div(premise_sum, tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1))
    hypothesis_sum = tf.reduce_sum(hypothesis_bi, 1)
    hypothesis_ave = tf.div(hypothesis_sum, tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1))

    ### Mou et al. concat layer ###
    diff = tf.subtract(premise_ave, hypothesis_ave)
    mul = tf.multiply(premise_ave, hypothesis_ave)
    h = tf.concat([premise_ave, hypothesis_ave, diff, mul], 1)

    # MLP layer
    h_mlp = tf.nn.relu(tf.matmul(h, self.W_mlp) + self.b_mlp)
    # Dropout applied to classifier
    h_drop = tf.nn.dropout(h_mlp, self.keep_rate_ph)
    # Get prediction
    self.logits = tf.matmul(h_drop, self.W_cl) + self.b_cl

    # Define the cost function
    self.total_cost = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=self.logits))
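# A minimal NumPy sketch of the Mou et al. concat layer above: the two mean-pooled
# sentence vectors are combined by concatenation, elementwise difference and
# elementwise product before the ReLU MLP classifier.
import numpy as np

def mou_match(premise_ave, hypothesis_ave):
    # premise_ave, hypothesis_ave: [N, 2d] mean-pooled biLSTM states
    diff = premise_ave - hypothesis_ave
    mul = premise_ave * hypothesis_ave
    return np.concatenate([premise_ave, hypothesis_ave, diff, mul], axis=1)  # [N, 8d]

p = np.random.randn(4, 600)
h = np.random.randn(4, 600)
print(mou_match(p, h).shape)  # (4, 2400)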
def __init__(self, seq_length, emb_dim, hidden_dim, embeddings, emb_train):
    ## Define hyperparameters
    self.embedding_dim = emb_dim
    self.dim = hidden_dim
    self.sequence_length = seq_length

    ## Define the placeholders
    self.premise_x = tf.placeholder(tf.int32, [None, self.sequence_length])
    self.hypothesis_x = tf.placeholder(tf.int32, [None, self.sequence_length])
    self.y = tf.placeholder(tf.int32, [None])
    self.keep_rate_ph = tf.placeholder(tf.float32, [])

    ## Define parameters
    self.E = tf.Variable(embeddings, trainable=emb_train)
    self.W_mlp = tf.Variable(tf.random_normal([self.dim * 8, self.dim], stddev=0.1))
    self.b_mlp = tf.Variable(tf.random_normal([self.dim], stddev=0.1))
    self.W_cl = tf.Variable(tf.random_normal([self.dim, 3], stddev=0.1))
    self.b_cl = tf.Variable(tf.random_normal([3], stddev=0.1))

    # Function for embedding lookup and dropout at embedding layer
    # Dropout ignores a subset of feature detectors (sets some hidden units to zero)
    def emb_drop(x):
        emb = tf.nn.embedding_lookup(self.E, x)
        emb_drop = tf.nn.dropout(emb, self.keep_rate_ph)
        return emb_drop

    # Get lengths of unpadded sentences
    prem_seq_lengths, mask_prem = blocks.length(self.premise_x)
    hyp_seq_lengths, mask_hyp = blocks.length(self.hypothesis_x)

    # -------- Input encoding stage --------
    premise_in = emb_drop(self.premise_x)
    hypothesis_in = emb_drop(self.hypothesis_x)

    # Re-encode each word in its context with a BiLSTM
    premise_outs, c1 = blocks.biLSTM(premise_in, dim=self.dim, seq_len=prem_seq_lengths, name='premise')
    hypothesis_outs, c2 = blocks.biLSTM(hypothesis_in, dim=self.dim, seq_len=hyp_seq_lengths, name='hypothesis')
    print('premise_outs: ', premise_outs)

    premise_bi = tf.concat(premise_outs, axis=2)
    hypothesis_bi = tf.concat(hypothesis_outs, axis=2)

    premise_list = tf.unstack(premise_bi, axis=1)
    hypothesis_list = tf.unstack(hypothesis_bi, axis=1)
    print('hypothesis_list: ', hypothesis_list)

    # Attention mechanism
    scores_all = []
    premise_attn = []
    alphas = []
    for i in range(self.sequence_length):
        scores_i_list = []
        for j in range(self.sequence_length):
            # Compute the similarity (dot product) between word i of the premise
            # and every word of the hypothesis; this score is the e_ij in the paper
            score_ij = tf.reduce_sum(tf.multiply(premise_list[i], hypothesis_list[j]), 1, keep_dims=True)
            scores_i_list.append(score_ij)
        scores_i = tf.stack(scores_i_list, axis=1)
        alpha_i = blocks.masked_softmax(scores_i, mask_hyp)  # normalize into weights with a masked softmax
        # Represent word i of the premise as a weighted sum of the hypothesis word vectors
        a_tilde_i = tf.reduce_sum(tf.multiply(alpha_i, hypothesis_bi), 1)
        premise_attn.append(a_tilde_i)
        scores_all.append(scores_i)
        alphas.append(alpha_i)

    # Convert the stacked scores back into a list
    scores_stack = tf.stack(scores_all, axis=2)
    scores_list = tf.unstack(scores_stack, axis=1)

    # Repeat the same procedure for the hypothesis
    hypothesis_attn = []
    betas = []
    for j in range(self.sequence_length):
        scores_j = scores_list[j]
        beta_j = blocks.masked_softmax(scores_j, mask_prem)
        b_tilde_j = tf.reduce_sum(tf.multiply(beta_j, premise_bi), 1)
        hypothesis_attn.append(b_tilde_j)
        betas.append(beta_j)

    # Make attention-weighted sentence representations into one tensor,
    premise_attns = tf.stack(premise_attn, axis=1)
    hypothesis_attns = tf.stack(hypothesis_attn, axis=1)

    # For making attention plots,
    self.alpha_s = tf.stack(alphas, axis=2)
    self.beta_s = tf.stack(betas, axis=2)

    # Enhancement of local inference information: analyze the differences below
    prem_diff = tf.subtract(premise_bi, premise_attns)
    prem_mul = tf.multiply(premise_bi, premise_attns)
    hyp_diff = tf.subtract(hypothesis_bi, hypothesis_attns)
    hyp_mul = tf.multiply(hypothesis_bi, hypothesis_attns)

    m_a = tf.concat([premise_bi, premise_attns, prem_diff, prem_mul], 2)
    m_b = tf.concat([hypothesis_bi, hypothesis_attns, hyp_diff, hyp_mul], 2)

    # Inference composition: a BiLSTM models the overall inference relationship
    # between the premise and the hypothesis
    v1_outs, c3 = blocks.biLSTM(m_a, dim=self.dim, seq_len=prem_seq_lengths, name='v1')
    v2_outs, c4 = blocks.biLSTM(m_b, dim=self.dim, seq_len=hyp_seq_lengths, name='v2')
    v1_bi = tf.concat(v1_outs, axis=2)
    v2_bi = tf.concat(v2_outs, axis=2)

    # Pooling Layer
    v_1_sum = tf.reduce_sum(v1_bi, 1)
    v_1_ave = tf.div(v_1_sum, tf.expand_dims(tf.cast(prem_seq_lengths, tf.float32), -1))
    v_2_sum = tf.reduce_sum(v2_bi, 1)
    v_2_ave = tf.div(v_2_sum, tf.expand_dims(tf.cast(hyp_seq_lengths, tf.float32), -1))
    v_1_max = tf.reduce_max(v1_bi, 1)
    v_2_max = tf.reduce_max(v2_bi, 1)
    v = tf.concat([v_1_ave, v_2_ave, v_1_max, v_2_max], 1)

    # Finally, classify with an MLP layer
    h_mlp = tf.nn.tanh(tf.matmul(v, self.W_mlp) + self.b_mlp)
    # Dropout applied to classifier
    h_drop = tf.nn.dropout(h_mlp, self.keep_rate_ph)
    # Get prediction
    self.logits = tf.matmul(h_drop, self.W_cl) + self.b_cl

    # Define the cost function
    self.total_cost = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y, logits=self.logits))
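# A minimal NumPy sketch of what blocks.masked_softmax is assumed to do in the
# attention loops above: exponentiate the scores, zero out padded positions via
# the mask, and renormalize so the weights over real tokens sum to 1. The exact
# implementation in blocks.py may differ (e.g. adding a large negative number to
# masked scores instead).
import numpy as np

def masked_softmax(scores, mask):
    # scores: [N, L, 1] similarity scores, mask: [N, L, 1] with 1 for real tokens, 0 for padding
    e = np.exp(scores - scores.max(axis=1, keepdims=True)) * mask
    return e / (e.sum(axis=1, keepdims=True) + 1e-13)

scores = np.random.randn(2, 5, 1)
mask = np.array([[1, 1, 1, 0, 0], [1, 1, 0, 0, 0]], dtype=np.float32)[:, :, None]
print(masked_softmax(scores, mask)[0, :, 0])  # padded positions get weight 0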