def __init__(self, batch_size, num_unroll_steps, embeddings, embedding_size,
             rnn_size, num_rnn_layers, max_grad_norm, l2_reg_lambda=0.0,
             adjust_weight=False, label_weight=None, is_training=True):
    """Build the TF1 graph of a shared-weight LSTM question/answer matcher.

    The constructor stores the hyper-parameters, creates the input
    placeholders, looks the token ids up in a trainable embedding matrix,
    encodes every input with ``LSTM`` under the ``"LSTM_scope"`` variable
    scope (created once, then re-entered with ``reuse=True``), pools each
    encoding with ``tanh(max_pooling(...))`` and exposes similarity, loss
    and accuracy tensors.

    Args:
        batch_size: Forwarded to every ``LSTM(...)`` call.
        num_unroll_steps: Second dimension of all token-id placeholders.
        embeddings: Initial value of the embedding matrix ``W`` (converted
            with ``tf.to_float``).
        embedding_size: Stored on the instance; not used here.
        rnn_size: Forwarded to every ``LSTM(...)`` call.
        num_rnn_layers: Stored on the instance; not used here.
        max_grad_norm: Stored on the instance; not used here.
        l2_reg_lambda: Stored on the instance; not used here.
        adjust_weight: Stored on the instance; not used here.
        label_weight: Stored on the instance; not used here. ``None``
            (the default) is replaced by a fresh empty list.
        is_training: Stored on the instance; not used here.

    Attributes created for callers: ``keep_prob``, ``lr``, ``new_lr``,
    ``_lr_update``, the five ``*_input_*`` placeholders, ``ori_cand``,
    ``ori_neg``, ``loss``, ``acc`` and ``test_q_a``.
    """
    # define input variable
    self.batch_size = batch_size
    self.embeddings = embeddings
    self.embedding_size = embedding_size
    self.adjust_weight = adjust_weight
    # A literal ``[]`` default would be one list object shared by every
    # instance; build a new list per instance instead.
    self.label_weight = [] if label_weight is None else label_weight
    self.rnn_size = rnn_size
    self.num_rnn_layers = num_rnn_layers
    self.num_unroll_steps = num_unroll_steps
    self.max_grad_norm = max_grad_norm
    self.l2_reg_lambda = l2_reg_lambda
    self.is_training = is_training

    self.keep_prob = tf.placeholder(tf.float32, name="keep_drop")

    # Non-trainable learning-rate variable, updated by running
    # ``_lr_update`` with ``new_lr`` fed.
    self.lr = tf.Variable(0.0, trainable=False)
    self.new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
    self._lr_update = tf.assign(self.lr, self.new_lr)

    # Token-id inputs: training triple (ori / cand / neg) and test pair.
    self.ori_input_quests = tf.placeholder(tf.int32, shape=[None, self.num_unroll_steps])
    self.cand_input_quests = tf.placeholder(tf.int32, shape=[None, self.num_unroll_steps])
    self.neg_input_quests = tf.placeholder(tf.int32, shape=[None, self.num_unroll_steps])
    self.test_input_q = tf.placeholder(tf.int32, shape=[None, self.num_unroll_steps])
    self.test_input_a = tf.placeholder(tf.int32, shape=[None, self.num_unroll_steps])

    # embedding layer
    with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
        W = tf.Variable(tf.to_float(self.embeddings), trainable=True, name="W")
        ori_quests = tf.nn.embedding_lookup(W, self.ori_input_quests)
        cand_quests = tf.nn.embedding_lookup(W, self.cand_input_quests)
        neg_quests = tf.nn.embedding_lookup(W, self.neg_input_quests)
        test_q = tf.nn.embedding_lookup(W, self.test_input_q)
        test_a = tf.nn.embedding_lookup(W, self.test_input_a)

    # build LSTM network: the first scope creates the variables ...
    with tf.variable_scope("LSTM_scope", reuse=None):
        ori_q = LSTM(ori_quests, self.rnn_size, self.batch_size)
        ori_q_feat = tf.nn.tanh(max_pooling(ori_q))

    # ... and every other input re-uses them.
    with tf.variable_scope("LSTM_scope", reuse=True):
        cand_a = LSTM(cand_quests, self.rnn_size, self.batch_size)
        neg_a = LSTM(neg_quests, self.rnn_size, self.batch_size)
        cand_q_feat = tf.nn.tanh(max_pooling(cand_a))
        neg_q_feat = tf.nn.tanh(max_pooling(neg_a))

        test_q_out = LSTM(test_q, self.rnn_size, self.batch_size)
        test_q_out = tf.nn.tanh(max_pooling(test_q_out))
        test_a_out = LSTM(test_a, self.rnn_size, self.batch_size)
        test_a_out = tf.nn.tanh(max_pooling(test_a_out))

    # Similarities of the question with the candidate / negative answer,
    # and the loss / accuracy derived from that pair.
    self.ori_cand = feature2cos_sim(ori_q_feat, cand_q_feat)
    self.ori_neg = feature2cos_sim(ori_q_feat, neg_q_feat)
    self.loss, self.acc = cal_loss_and_acc(self.ori_cand, self.ori_neg)

    self.test_q_a = feature2cos_sim(test_q_out, test_a_out)
def forward(self, A, X, seq_len, seqs, hidden):
    """Score the next items for every basket sequence in the batch.

    Steps, in order:
      1. Run ``self.layers`` over ``A``; from the second layer on the
         previous output is passed through ``self.normalization`` first.
      2. Apply ``self.gcn`` + ReLU once per channel of the final layer
         output, concatenate the results on dim 1 and project them with
         ``self.linear1``.
      3. Embed every non-empty basket row of ``seqs`` with
         ``utils.max_pooling`` over the rows of the projected features
         selected by the basket's non-zero entries; empty baskets stay
         zero.
      4. Feed the basket embeddings to ``self.lstm`` and pick, per
         sequence, the output at position ``seq_len - 1``.
      5. Map that output through ``self.h2item_score`` and a sigmoid.

    Args:
        A: Input handed to every layer in ``self.layers``.
        X: Node features handed to ``self.gcn``.
        seq_len: Per-sequence lengths; ``seq_len - 1`` indexes the last
            real step (presumably a 1-D integer tensor -- TODO confirm).
        seqs: Basket tensor whose first dimension is the batch size and
            which can be viewed as ``(-1, self.nb_items)``.
        hidden: Initial state passed to ``self.lstm``.

    Returns:
        Tensor of sigmoid scores, one row per sequence.
    """
    n_seqs = seqs.shape[0]

    # --- 1. stacked graph layers --------------------------------------
    # The per-layer weights are collected but not returned.
    layer_weights = []
    graphs = None
    for depth in range(self.num_layers):
        layer = self.layers[depth]
        if depth:
            graphs, weight = layer(A, self.normalization(graphs))
        else:
            graphs, weight = layer(A)
        layer_weights.append(weight)

    # --- 2. one graph convolution per channel --------------------------
    node_feats = None
    for channel in range(self.num_channels):
        edge_index, edge_weight = graphs[channel][0], graphs[channel][1]
        channel_feats = F.relu(
            self.gcn(X, edge_index=edge_index.detach(), edge_weight=edge_weight)
        )
        if channel == 0:
            node_feats = channel_feats
        else:
            node_feats = torch.cat((node_feats, channel_feats), dim=1)
    node_feats = self.linear1(node_feats)

    # --- 3. basket embeddings ------------------------------------------
    basket_embeds = torch.zeros(
        n_seqs * self.max_seq_length,
        self.w_out,
        dtype=self.dtype,
        device=self.device,
    )
    flat_baskets = seqs.contiguous().view(-1, self.nb_items)
    for row, basket in enumerate(flat_baskets):
        if torch.sum(basket) > 0:
            chosen = torch.nonzero(basket, as_tuple=True)
            basket_embeds[row] = utils.max_pooling(node_feats[chosen])
    basket_embeds = basket_embeds.contiguous().view(
        -1, self.max_seq_length, self.w_out
    )

    # --- 4. sequence model, keep only the last real step ---------------
    lstm_out, _state = self.lstm(basket_embeds, hidden)
    seq_starts = torch.arange(0, n_seqs) * self.max_seq_length
    last_step = seq_starts + (seq_len - 1)
    last_out = lstm_out.reshape(-1, self.rnn_units)[last_step]

    # --- 5. predict next items score ------------------------------------
    return torch.sigmoid(self.h2item_score(last_out))
def build(self, input, is_dropout=False):
    """Assemble the VGG16 graph and return the ``fc_8`` output tensor.

    Five convolution stages (2, 2, 3, 3, 3 ``conv3_3`` layers with 64,
    128, 256, 512, 512 output channels), each followed by
    ``max_pooling``, feed three fully connected layers. The convolution
    layers receive ``self.data_dict_VGG16`` and ``self.finetune``; the
    fully connected layers are always built with ``finetune=False``.

    Args:
        input: Input tensor of the first convolution.
        is_dropout: Whether to apply ``tf.nn.dropout(..., 0.5)`` after the
            ReLU of ``fc_6`` and ``fc_7``.

    Returns:
        The output of the last fully connected layer (``fc_8``), which has
        ``self.n_classes`` units and no activation applied here.
    """
    # (output channels, number of conv layers) for stages conv1 .. conv5.
    vgg16_stages = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

    # NOTE: no local may be called ``conv3_3``. Assigning to that name
    # would make it function-local and every call to the ``conv3_3``
    # helper in this function would raise UnboundLocalError.
    net = input
    for stage, (channels, n_convs) in enumerate(vgg16_stages, 1):
        for idx in range(1, n_convs + 1):
            net = conv3_3(net, channels, 'conv%d_%d' % (stage, idx),
                          self.data_dict_VGG16, finetune=self.finetune)
        net = max_pooling(net, 'pool%d' % stage)

    # fully connected layer
    flatten = tf.reshape(net, [self.batchsize, -1])

    fc_6 = fc(flatten, 4096, 'fc_6', finetune=False)
    fc_6 = tf.nn.relu(fc_6)
    if is_dropout:
        fc_6 = tf.nn.dropout(fc_6, 0.5)

    fc_7 = fc(fc_6, 4096, 'fc_7', finetune=False)
    fc_7 = tf.nn.relu(fc_7)
    if is_dropout:
        fc_7 = tf.nn.dropout(fc_7, 0.5)

    fc_8 = fc(fc_7, self.n_classes, 'fc_8', finetune=False)
    return fc_8
def __init__(self, batch_size, num_unroll_steps, embeddings, embedding_size,
             rnn_size, num_rnn_layers, max_grad_norm, attention_matrix_size,
             loss_ratio, l2_reg_lambda=0.0, adjust_weight=False,
             label_weight=None, is_training=True, m=0.1):
    """Build the TF1 graph of a biLSTM QA matcher with a category head.

    Every input is embedded, encoded with ``biLSTM`` under the shared
    ``"LSTM_scope"`` variable scope and pooled with
    ``tanh(max_pooling(...))``. Two objectives are combined:

    * ``cal_loss_and_acc`` on the question/candidate and question/negative
      similarities (weight ``1 - loss_ratio``), and
    * a softmax cross-entropy category loss computed from the pooled
      question feature through a single linear layer (weight
      ``loss_ratio``).

    Args:
        batch_size: Stored on the instance; not used here.
        num_unroll_steps: Second dimension of all token-id placeholders.
        embeddings: Initial value of the embedding matrix ``W``.
        embedding_size: Stored on the instance; not used here.
        rnn_size: Forwarded to every ``biLSTM(...)`` call.
        num_rnn_layers: Stored on the instance; not used here.
        max_grad_norm: Stored on the instance; not used here.
        attention_matrix_size: Unused; only the disabled attention
            variant referenced it.
        loss_ratio: Weight of the category loss in ``self.loss``.
        l2_reg_lambda: Stored on the instance; not used here.
        adjust_weight: Stored on the instance; not used here.
        label_weight: Stored on the instance; not used here. ``None``
            (the default) is replaced by a fresh empty list.
        is_training: Stored on the instance; not used here.
        m: Third argument of ``cal_loss_and_acc`` (presumably the ranking
            margin -- TODO confirm against that helper).
    """
    # define input variable
    self.batch_size = batch_size
    self.embeddings = embeddings
    self.embedding_size = embedding_size
    self.adjust_weight = adjust_weight
    # A literal ``[]`` default would be one list object shared by every
    # instance; build a new list per instance instead.
    self.label_weight = [] if label_weight is None else label_weight
    self.rnn_size = rnn_size
    self.num_rnn_layers = num_rnn_layers
    self.num_unroll_steps = num_unroll_steps
    self.max_grad_norm = max_grad_norm
    self.l2_reg_lambda = l2_reg_lambda
    self.is_training = is_training

    self.keep_prob = tf.placeholder(tf.float32, name="keep_drop")

    # Non-trainable learning-rate variable, updated by running
    # ``_lr_update`` with ``new_lr`` fed.
    self.lr = tf.Variable(0.0, trainable=False)
    self.new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
    self._lr_update = tf.assign(self.lr, self.new_lr)

    # Token-id inputs: training triple (ori / cand / neg) and test pair.
    self.ori_input_quests = tf.placeholder(
        tf.int32, shape=[None, self.num_unroll_steps])
    self.cand_input_quests = tf.placeholder(
        tf.int32, shape=[None, self.num_unroll_steps])
    self.neg_input_quests = tf.placeholder(
        tf.int32, shape=[None, self.num_unroll_steps])
    self.test_input_q = tf.placeholder(
        tf.int32, shape=[None, self.num_unroll_steps], name='test_q')
    self.test_input_a = tf.placeholder(
        tf.int32, shape=[None, self.num_unroll_steps], name='test_a')
    # Category labels, one column per category; argmax over axis 1 is
    # taken as the true class below.
    self.cat_ids = tf.placeholder(tf.int32, [None, CAT_NUMBER], name='cat_ids')

    # embedding layer
    with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
        W = tf.Variable(tf.to_float(self.embeddings), trainable=True, name="W")
        ori_quests = tf.nn.embedding_lookup(W, self.ori_input_quests)
        cand_quests = tf.nn.embedding_lookup(W, self.cand_input_quests)
        neg_quests = tf.nn.embedding_lookup(W, self.neg_input_quests)
        test_q = tf.nn.embedding_lookup(W, self.test_input_q)
        test_a = tf.nn.embedding_lookup(W, self.test_input_a)

    # run lstm without attention
    with tf.variable_scope("LSTM_scope") as scope:
        ori_q = biLSTM(ori_quests, self.rnn_size)
        ori_q_feat = tf.nn.tanh(max_pooling(ori_q))

        # Every encoder built after this point shares the variables
        # created by the first biLSTM call.
        scope.reuse_variables()

        cand_a = biLSTM(cand_quests, self.rnn_size)
        neg_a = biLSTM(neg_quests, self.rnn_size)
        cand_q_feat = tf.nn.tanh(max_pooling(cand_a))
        neg_q_feat = tf.nn.tanh(max_pooling(neg_a))

        test_q_out = biLSTM(test_q, self.rnn_size)
        test_q_out = tf.nn.tanh(max_pooling(test_q_out))
        test_a_out = biLSTM(test_a, self.rnn_size)
        test_a_out = tf.nn.tanh(max_pooling(test_a_out))

    # NOTE: an attention-based variant (an ``att_W`` dict of Wam/Wqm/Wms
    # matrices consumed by ``get_feature``) used to be sketched here as
    # commented-out code; it is disabled, which is why
    # ``attention_matrix_size`` is unused.

    # multitasking
    with tf.name_scope("multitasking"):
        feature_size = int(ori_q_feat.get_shape()[1])
        w = tf.get_variable(name='weights',
                            shape=(feature_size, CAT_NUMBER),
                            initializer=tf.random_normal_initializer())
        b = tf.get_variable(name='bias',
                            shape=(1, CAT_NUMBER),
                            initializer=tf.zeros_initializer())
        logits = tf.matmul(ori_q_feat, w) + b
        entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=self.cat_ids, name='loss')
        loss_multitask = tf.reduce_mean(entropy)

    # acc
    self.ori_cand_score = feature2cos_sim(ori_q_feat, cand_q_feat)
    self.ori_neg_score = feature2cos_sim(ori_q_feat, neg_q_feat)
    loss_origin, self.acc = cal_loss_and_acc(self.ori_cand_score,
                                             self.ori_neg_score, m)
    self.loss = loss_origin * (1 - loss_ratio) + loss_multitask * loss_ratio

    self.test_q_a = feature2cos_sim(test_q_out, test_a_out)

    # multitasking_acc
    with tf.name_scope("multi_acc"):
        self.preds = tf.nn.softmax(logits)
        self.correct_preds = tf.equal(tf.argmax(self.preds, 1),
                                      tf.argmax(self.cat_ids, 1))
        # This is the number of correct predictions in the batch (a sum,
        # not a mean).
        self.multi_acc = tf.reduce_sum(
            tf.cast(self.correct_preds, tf.float32))
def forward(self, inputs):
    """Compute classification logits for premise/hypothesis pairs.

    The word-level inputs (``inputs[0]``, ``inputs[1]``) always run
    through embed -> encode -> attention -> enhancement -> projection ->
    composition -> masked average / max pooling. When
    ``self.args.use_char_emb`` is true, the character-level inputs
    (``inputs[2]``, ``inputs[3]``) run through the same pipeline with the
    ``char_*`` modules, and their pooled vectors are appended.

    Args:
        inputs: Indexable of index tensors: premise and hypothesis word
            ids and, when character embeddings are enabled, premise and
            hypothesis character ids. Index 0 is treated as padding.

    Returns:
        The output of ``self._classification`` applied to the
        concatenated pooled vectors.

    Note:
        With ``use_char_emb`` disabled this method used to fail with a
        NameError because the final concatenation always referenced the
        character-level tensors. It now concatenates the four word-level
        vectors only; ``self._classification`` must be sized for that
        input width.
    """

    def pooled_features(premise_ids, hypothesis_ids, embed, encode,
                        project, compose):
        # Run one premise/hypothesis pipeline and return
        # (premise_avg, premise_max, hypothesis_avg, hypothesis_max).
        premise_len = torch.sum(premise_ids != 0, dim=-1)
        hypothesis_len = torch.sum(hypothesis_ids != 0, dim=-1)
        premise_mask = get_mask(premise_ids, premise_len).to(
            self.args.device)
        hypothesis_mask = get_mask(hypothesis_ids, hypothesis_len).to(
            self.args.device)

        premise = embed(premise_ids)
        hypothesis = embed(hypothesis_ids)
        if self.dropout:
            premise = self._rnn_dropout(premise)
            hypothesis = self._rnn_dropout(hypothesis)

        premise_enc = encode(premise, premise_len)
        hypothesis_enc = encode(hypothesis, hypothesis_len)

        premise_att, hypothesis_att = self._attention(
            premise_enc, premise_mask, hypothesis_enc, hypothesis_mask)

        # Enhancement: [a, a~, a - a~, a * a~] on the last dimension.
        premise_enh = torch.cat([
            premise_enc, premise_att,
            premise_enc - premise_att,
            premise_enc * premise_att,
        ], dim=-1)
        hypothesis_enh = torch.cat([
            hypothesis_enc, hypothesis_att,
            hypothesis_enc - hypothesis_att,
            hypothesis_enc * hypothesis_att,
        ], dim=-1)

        premise_proj = project(premise_enh)
        hypothesis_proj = project(hypothesis_enh)
        if self.dropout:
            premise_proj = self._rnn_dropout(premise_proj)
            hypothesis_proj = self._rnn_dropout(hypothesis_proj)

        v_a = compose(premise_proj, premise_len)
        v_b = compose(hypothesis_proj, hypothesis_len)

        # Masked mean over dim 1: zero the padded steps, then divide by
        # the number of unmasked steps.
        a_avg = torch.sum(
            v_a * premise_mask.unsqueeze(1).transpose(2, 1), dim=1
        ) / torch.sum(premise_mask, dim=1, keepdim=True)
        b_avg = torch.sum(
            v_b * hypothesis_mask.unsqueeze(1).transpose(2, 1), dim=1
        ) / torch.sum(hypothesis_mask, dim=1, keepdim=True)

        a_max, _ = max_pooling(v_a, premise_mask, dim=1)
        b_max, _ = max_pooling(v_b, hypothesis_mask, dim=1)
        return a_avg, a_max, b_avg, b_max

    # Word-level branch (always present).
    features = list(pooled_features(
        inputs[0], inputs[1], self.embed, self._encoding,
        self._projection, self._composition))

    # Character-level branch (optional).
    if self.args.use_char_emb:
        features.extend(pooled_features(
            inputs[2], inputs[3], self.char_embed, self._char_encoding,
            self._char_projection, self._char_composition))

    v = torch.cat(features, dim=1)
    logits = self._classification(v)
    return logits
def forward(self, inputs):
    """Compute classification logits for premise/hypothesis pairs.

    The word-level inputs (``inputs[0]``, ``inputs[1]``) always run
    through embed -> dropout -> encode -> pairwise co-attention ->
    enhancement -> projection -> dropout -> composition -> masked
    average / max pooling. When ``self.args.use_char_emb`` is true, the
    character-level inputs (``inputs[2]``, ``inputs[3]``) run through the
    same pipeline with the character modules, and their pooled vectors
    are appended.

    Args:
        inputs: Indexable of index tensors: premise and hypothesis word
            ids and, when character embeddings are enabled, premise and
            hypothesis character ids. Index 0 is treated as padding.

    Returns:
        The output of ``self._classification`` applied to the
        concatenated pooled vectors.

    Behaviour changes compared with the previous revision:
        * Attention masking is fixed: the softmax over hypothesis
          positions now masks padded *hypothesis* tokens and the softmax
          over premise positions masks padded *premise* tokens (the two
          masks were swapped). Outputs change for padded batches, so
          models trained with the old masking may need re-validation.
        * With ``use_char_emb`` disabled the method no longer raises
          NameError; it concatenates the four word-level vectors only
          (``self._classification`` must be sized accordingly).
        * Four pooled combinations that were computed but never used
          have been removed.
    """

    def pooled_features(premise_ids, hypothesis_ids, embed, encode,
                        score, project, compose):
        # Run one premise/hypothesis pipeline and return
        # (premise_avg, premise_max, hypothesis_avg, hypothesis_max).
        premise_len = torch.sum(premise_ids != 0, dim=-1)
        hypothesis_len = torch.sum(hypothesis_ids != 0, dim=-1)
        premise_mask = get_mask(premise_ids, premise_len).to(
            self.args.device)
        hypothesis_mask = get_mask(hypothesis_ids, hypothesis_len).to(
            self.args.device)

        premise_emb = embed(premise_ids)
        hypothesis_emb = embed(hypothesis_ids)
        premise_emb = self._rnn_dropout(premise_emb)
        hypothesis_emb = self._rnn_dropout(hypothesis_emb)

        # ----Encoder Layer----
        premise_enc = encode(premise_emb, premise_len)
        hypothesis_enc = encode(hypothesis_emb, hypothesis_len)

        # ----Co-Attention Layer----
        n_p = premise_enc.size(1)
        n_h = hypothesis_enc.size(1)
        # Both masks broadcast to (batch, n_p, n_h).
        hypothesis_grid_mask = hypothesis_mask.unsqueeze(1).expand(-1, n_p, -1)
        premise_grid_mask = premise_mask.unsqueeze(2).expand(-1, -1, n_h)

        premise_grid = premise_enc.unsqueeze(2).expand(-1, -1, n_h, -1)
        hypothesis_grid = hypothesis_enc.unsqueeze(1).expand(-1, n_p, -1, -1)
        pair = torch.cat([
            premise_grid, hypothesis_grid,
            premise_grid - hypothesis_grid,
            premise_grid * hypothesis_grid,
        ], dim=-1)
        similarity = score(pair).squeeze(-1)  # (batch, n_p, n_h)

        # Push padded positions towards -inf before the softmax. The mask
        # must cover the axis being normalised: hypothesis padding for
        # dim=2, premise padding for dim=1.
        over_hypothesis = similarity + (
            -999999 * (hypothesis_grid_mask == 0).float())
        over_premise = similarity + (
            -999999 * (premise_grid_mask == 0).float())
        attention_a = F.softmax(over_hypothesis, dim=2)
        attention_b = F.softmax(over_premise, dim=1)

        premise_att = torch.bmm(attention_a, hypothesis_enc)
        hypothesis_att = torch.bmm(attention_b.transpose(1, 2), premise_enc)

        # ----Enhancement Layer----: [a, a~, a - a~, a * a~]
        premise_enh = torch.cat([
            premise_enc, premise_att,
            premise_enc - premise_att,
            premise_enc * premise_att,
        ], dim=-1)
        hypothesis_enh = torch.cat([
            hypothesis_enc, hypothesis_att,
            hypothesis_enc - hypothesis_att,
            hypothesis_enc * hypothesis_att,
        ], dim=-1)

        premise_proj = project(premise_enh)
        hypothesis_proj = project(hypothesis_enh)
        premise_proj = self._rnn_dropout(premise_proj)
        hypothesis_proj = self._rnn_dropout(hypothesis_proj)

        premise_out = compose(premise_proj, premise_len)
        hypothesis_out = compose(hypothesis_proj, hypothesis_len)

        # Masked mean over dim 1: zero the padded steps, then divide by
        # the number of unmasked steps.
        premise_avg = torch.sum(
            premise_out * premise_mask.unsqueeze(1).transpose(2, 1), dim=1
        ) / torch.sum(premise_mask, dim=1, keepdim=True)
        hypothesis_avg = torch.sum(
            hypothesis_out * hypothesis_mask.unsqueeze(1).transpose(2, 1),
            dim=1
        ) / torch.sum(hypothesis_mask, dim=1, keepdim=True)

        premise_max, _ = max_pooling(premise_out, premise_mask, dim=1)
        hypothesis_max, _ = max_pooling(hypothesis_out, hypothesis_mask, dim=1)
        return premise_avg, premise_max, hypothesis_avg, hypothesis_max

    # Word-level branch (always present).
    features = list(pooled_features(
        inputs[0], inputs[1], self.embed, self.sentence_encoder,
        self._trans, self._projection, self._composition))

    # Character-level branch (optional).
    if self.args.use_char_emb:
        features.extend(pooled_features(
            inputs[2], inputs[3], self.cembed, self.char_encoder,
            self.c_trans, self.char_projection, self._char_composition))

    v = torch.cat(features, dim=1)
    logits = self._classification(v)
    return logits
def __init__(self, batch_size, quest_len, answer_len, embeddings,
             embedding_size, rnn_size, num_rnn_layers, max_grad_norm,
             loss_ratio, l2_reg_lambda=0.0, adjust_weight=False,
             label_weight=None, is_training=True, m=0.1):
    """Build the TF1 graph of an attentive biLSTM QA matcher + category head.

    Questions and answers are embedded and encoded with ``biLSTM`` under
    the shared ``"LSTM_scope"`` variable scope. For each question/answer
    pair a bilinear score matrix ``Q . U . A^T`` is formed; its max over
    axis 2 / axis 1, passed through a softmax, re-weights the question /
    answer encodings before ``max_pooling``. Two objectives are combined:

    * ``cal_loss_and_acc`` on the question/candidate and question/negative
      similarities (weight ``1 - loss_ratio``), and
    * a softmax cross-entropy category loss computed from the pooled
      question feature through three dense layers (weight ``loss_ratio``).

    Args:
        batch_size: Number of copies of ``U`` tiled for the batched
            matmul; the graph therefore expects exactly this many rows
            per feed.
        quest_len: Second dimension of the question placeholders.
        answer_len: Second dimension of the answer placeholders.
        embeddings: Initial value of the embedding matrix ``W``.
        embedding_size: Stored on the instance; not used here.
        rnn_size: Forwarded to ``biLSTM``; ``U`` is
            ``[2 * rnn_size, 2 * rnn_size]``.
        num_rnn_layers: Stored on the instance; not used here.
        max_grad_norm: Stored on the instance; not used here.
        loss_ratio: Weight of the category loss in ``self.loss``.
        l2_reg_lambda: Stored on the instance; not used here.
        adjust_weight: Stored on the instance; not used here.
        label_weight: Stored on the instance; not used here. ``None``
            (the default) is replaced by a fresh empty list.
        is_training: Stored on the instance; not used here.
        m: Third argument of ``cal_loss_and_acc`` (presumably the ranking
            margin -- TODO confirm against that helper).
    """
    # define input variable
    self.batch_size = batch_size
    self.embeddings = embeddings
    self.embedding_size = embedding_size
    self.adjust_weight = adjust_weight
    # A literal ``[]`` default would be one list object shared by every
    # instance; build a new list per instance instead.
    self.label_weight = [] if label_weight is None else label_weight
    self.rnn_size = rnn_size
    self.num_rnn_layers = num_rnn_layers
    self.quest_len = quest_len
    self.answer_len = answer_len
    self.max_grad_norm = max_grad_norm
    self.l2_reg_lambda = l2_reg_lambda
    self.is_training = is_training

    self.keep_prob = tf.placeholder(tf.float32, name="keep_drop")

    # Non-trainable learning-rate variable, updated by running
    # ``_lr_update`` with ``new_lr`` fed.
    self.lr = tf.Variable(0.0, trainable=False)
    self.new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
    self._lr_update = tf.assign(self.lr, self.new_lr)

    # Token-id inputs: training triple (ori / cand / neg) and test pair.
    self.ori_input_quests = tf.placeholder(tf.int32, shape=[None, self.quest_len], name="ori_quest")
    self.cand_input_quests = tf.placeholder(tf.int32, shape=[None, self.answer_len], name="cand_quest")
    self.neg_input_quests = tf.placeholder(tf.int32, shape=[None, self.answer_len], name="neg_quest")
    self.test_input_q = tf.placeholder(tf.int32, shape=[None, self.quest_len], name="test_input_q")
    self.test_input_a = tf.placeholder(tf.int32, shape=[None, self.answer_len], name="test_input_a")
    # Category labels, one column per category; argmax over axis 1 is
    # taken as the true class below.
    self.cat_ids = tf.placeholder(tf.int32, [None, CAT_NUMBER], name='cat_ids')

    # embedding layer
    with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
        W = tf.Variable(tf.to_float(self.embeddings), trainable=True, name="W")
        ori_quests = tf.nn.embedding_lookup(W, self.ori_input_quests)
        cand_quests = tf.nn.embedding_lookup(W, self.cand_input_quests)
        neg_quests = tf.nn.embedding_lookup(W, self.neg_input_quests)
        test_quest = tf.nn.embedding_lookup(W, self.test_input_q)
        test_answer = tf.nn.embedding_lookup(W, self.test_input_a)

    # build LSTM network: the first scope creates the variables, the
    # second one re-uses them for every other input.
    with tf.variable_scope("LSTM_scope", reuse=None):
        ori_q = biLSTM(ori_quests, self.rnn_size)
    with tf.variable_scope("LSTM_scope", reuse=True):
        cand_a = biLSTM(cand_quests, self.rnn_size)
        neg_a = biLSTM(neg_quests, self.rnn_size)
        test_q = biLSTM(test_quest, self.rnn_size)
        test_a = biLSTM(test_answer, self.rnn_size)

    # ----------------------------- cal attention -----------------------------
    def bilinear_scores(question_states, answer_states):
        # Q . U . A^T per batch row; U is tiled because the batched
        # matmul needs one copy of U per row.
        tiled_u = tf.tile(tf.expand_dims(U, 0), [batch_size, 1, 1])
        return tf.matmul(tf.matmul(question_states, tiled_u),
                         answer_states, adjoint_b=True)

    with tf.variable_scope("attention", reuse=None):
        U = tf.get_variable(
            "U", [2 * self.rnn_size, 2 * rnn_size],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        G = bilinear_scores(ori_q, cand_a)
        delta_q = tf.nn.softmax(tf.reduce_max(G, 2))
        delta_a = tf.nn.softmax(tf.reduce_max(G, 1))

        neg_G = bilinear_scores(ori_q, neg_a)
        delta_neg_q = tf.nn.softmax(tf.reduce_max(neg_G, 2))
        delta_neg_a = tf.nn.softmax(tf.reduce_max(neg_G, 1))

    with tf.variable_scope("attention", reuse=True):
        test_G = bilinear_scores(test_q, test_a)
        delta_test_q = tf.nn.softmax(tf.reduce_max(test_G, 2))
        delta_test_a = tf.nn.softmax(tf.reduce_max(test_G, 1))

    # -------------------------- recalculate lstm output ----------------------
    # Scale every time step by its attention weight, then max-pool.
    ori_q_feat = max_pooling(tf.multiply(ori_q, tf.reshape(delta_q, [-1, self.quest_len, 1])))
    cand_q_feat = max_pooling(tf.multiply(cand_a, tf.reshape(delta_a, [-1, self.answer_len, 1])))
    # NOTE(review): ``neg_ori_q_feat`` (the question attended against the
    # negative answer) is computed but never used; ``ori_neg_score`` below
    # pairs ``neg_q_feat`` with ``ori_q_feat`` instead. Confirm whether
    # that is intended before changing it, as it alters the training
    # objective.
    neg_ori_q_feat = max_pooling(tf.multiply(ori_q, tf.reshape(delta_neg_q, [-1, self.quest_len, 1])))
    neg_q_feat = max_pooling(tf.multiply(neg_a, tf.reshape(delta_neg_a, [-1, self.answer_len, 1])))
    test_q_out = max_pooling(tf.multiply(test_q, tf.reshape(delta_test_q, [-1, self.quest_len, 1])))
    test_a_out = max_pooling(tf.multiply(test_a, tf.reshape(delta_test_a, [-1, self.answer_len, 1])))
    # -------------------------- recalculate lstm output end ------------------

    # multitasking
    with tf.name_scope("multitasking"):
        feature_size = int(ori_q_feat.get_shape()[1])
        fc1 = tf.layers.dense(ori_q_feat, feature_size * 2, activation=tf.nn.relu, name='fc1')
        fc2 = tf.layers.dense(fc1, feature_size, activation=tf.nn.relu, name='fc2')
        # NOTE(review): the sigmoid squashes these "logits" into (0, 1)
        # before they reach softmax_cross_entropy_with_logits and
        # tf.nn.softmax, both of which expect unscaled logits. Left
        # unchanged because removing it changes the behaviour of trained
        # models; consider ``activation=None`` when retraining.
        logits = tf.layers.dense(fc2, CAT_NUMBER, activation=tf.nn.sigmoid)
        entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=self.cat_ids, name='loss')
        loss_multitask = tf.reduce_mean(entropy)

    # acc
    self.ori_cand_score = feature2cos_sim(ori_q_feat, cand_q_feat)
    self.ori_neg_score = feature2cos_sim(ori_q_feat, neg_q_feat)
    loss_origin, self.acc = cal_loss_and_acc(self.ori_cand_score, self.ori_neg_score, m)
    self.loss = loss_origin * (1 - loss_ratio) + loss_multitask * loss_ratio

    self.test_q_a = feature2cos_sim(test_q_out, test_a_out)

    # multitasking_acc
    with tf.name_scope("multi_acc"):
        self.preds = tf.nn.softmax(logits)
        self.correct_preds = tf.equal(tf.argmax(self.preds, 1), tf.argmax(self.cat_ids, 1))
        # This is the number of correct predictions in the batch (a sum,
        # not a mean).
        self.multi_acc = tf.reduce_sum(tf.cast(self.correct_preds, tf.float32))
def __init__(self, batch_size, quest_len, answer_len, embeddings,
             embedding_size, rnn_size, num_rnn_layers, max_grad_norm,
             l2_reg_lambda=0.0, adjust_weight=False, label_weight=None,
             is_training=True):
    """Build the attentive-pooling BiLSTM graph for question/answer matching.

    Questions and answers are embedded, encoded by a shared BILSTM, and
    re-weighted per time step by a bilinear attention between the two
    sequences before max-pooling. The pooled features are compared with
    ``feature2cos_sim``; ``cal_loss_and_acc`` turns the positive and
    negative similarities into ``self.loss`` / ``self.acc``.

    Args:
        batch_size: Number of rows the attention matrix ``U`` is tiled to;
            fed batches are therefore expected to contain this many rows.
        quest_len: Fixed token length of every question.
        answer_len: Fixed token length of every answer.
        embeddings: Initial value of the trainable embedding matrix.
        embedding_size: Stored on the instance; not used while building.
        rnn_size: Hidden size passed to ``BILSTM``. The encoder output is
            assumed to be ``2 * rnn_size`` wide (shape of ``U``).
        num_rnn_layers: Stored on the instance; not used while building.
        max_grad_norm: Stored on the instance; not used while building.
        l2_reg_lambda: Stored on the instance; not used while building.
        adjust_weight: Stored on the instance; not used while building.
        label_weight: Stored on the instance; ``None`` means an empty list.
        is_training: Stored on the instance; not used while building.

    Graph outputs exposed as attributes: ``ori_cand``, ``ori_neg``,
    ``test_q_a``, ``loss`` and ``acc``.
    """
    # define input variable
    self.batch_size = batch_size
    self.embeddings = embeddings
    self.embedding_size = embedding_size
    self.adjust_weight = adjust_weight
    # A fresh list per instance: a literal [] default would be shared by
    # every model built without an explicit label_weight.
    self.label_weight = [] if label_weight is None else label_weight
    self.rnn_size = rnn_size
    self.num_rnn_layers = num_rnn_layers
    self.quest_len = quest_len
    self.answer_len = answer_len
    self.max_grad_norm = max_grad_norm
    self.l2_reg_lambda = l2_reg_lambda
    self.is_training = is_training

    # Dropout is currently disabled, so keep_prob is declared but not
    # consumed anywhere in this constructor.
    self.keep_prob = tf.placeholder(tf.float32, name="keep_drop")

    # Learning rate lives in the graph and is changed through _lr_update.
    self.lr = tf.Variable(0.0, trainable=False)
    self.new_lr = tf.placeholder(tf.float32, shape=[],
                                 name="new_learning_rate")
    self._lr_update = tf.assign(self.lr, self.new_lr)

    self.ori_input_quests = tf.placeholder(
        tf.int32, shape=[None, self.quest_len], name="ori_quest")
    self.cand_input_quests = tf.placeholder(
        tf.int32, shape=[None, self.answer_len], name="cand_quest")
    self.neg_input_quests = tf.placeholder(
        tf.int32, shape=[None, self.answer_len], name="neg_quest")
    self.test_input_quests = tf.placeholder(
        tf.int32, shape=[None, self.quest_len], name="test_quest")
    self.test_input_answer = tf.placeholder(
        tf.int32, shape=[None, self.answer_len], name="test_cand_quest")

    # embedding layer
    with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
        W = tf.Variable(tf.to_float(self.embeddings), trainable=True,
                        name="W")
        ori_quests = tf.nn.embedding_lookup(W, self.ori_input_quests)
        cand_quests = tf.nn.embedding_lookup(W, self.cand_input_quests)
        neg_quests = tf.nn.embedding_lookup(W, self.neg_input_quests)
        test_quest = tf.nn.embedding_lookup(W, self.test_input_quests)
        test_answer = tf.nn.embedding_lookup(W, self.test_input_answer)

    # build LSTM network: the first call creates the encoder variables,
    # every later call reuses them.
    with tf.variable_scope("LSTM_scope", reuse=None):
        ori_q = BILSTM(ori_quests, self.rnn_size)
    with tf.variable_scope("LSTM_scope", reuse=True):
        cand_a = BILSTM(cand_quests, self.rnn_size)
        neg_a = BILSTM(neg_quests, self.rnn_size)
        test_q = BILSTM(test_quest, self.rnn_size)
        test_a = BILSTM(test_answer, self.rnn_size)

    def attention_weights(quest_out, answer_out, bilinear):
        """Return (question-step, answer-step) softmax attention weights."""
        # tf.matmul(..., adjoint_b=True) replaces the removed
        # tf.batch_matmul(..., adj_y=True); the affinity matrix has one
        # row per question step and one column per answer step.
        affinity = tf.nn.tanh(
            tf.matmul(tf.matmul(quest_out, bilinear), answer_out,
                      adjoint_b=True))
        return (tf.nn.softmax(tf.reduce_max(affinity, 2)),
                tf.nn.softmax(tf.reduce_max(affinity, 1)))

    def attentive_max_pool(outputs, weights, length):
        """Scale every time step by its attention weight, then max-pool."""
        # tf.multiply replaces the removed tf.mul.
        return max_pooling(
            tf.multiply(outputs, tf.reshape(weights, [-1, length, 1])))

    # ----------------------------- cal attention ---------------------------
    with tf.variable_scope("attention", reuse=None):
        U = tf.get_variable(
            "U", [2 * self.rnn_size, 2 * rnn_size],
            initializer=tf.truncated_normal_initializer(stddev=0.1))
        # One copy of U per batch row, built once and shared by the
        # positive, negative and test attention computations.
        U_batch = tf.tile(tf.expand_dims(U, 0), [batch_size, 1, 1])
        delta_q, delta_a = attention_weights(ori_q, cand_a, U_batch)
        delta_neg_q, delta_neg_a = attention_weights(ori_q, neg_a, U_batch)
    with tf.variable_scope("attention", reuse=True):
        delta_test_q, delta_test_a = attention_weights(
            test_q, test_a, U_batch)

    # -------------------------- recalculate lstm output --------------------
    ori_q_feat = attentive_max_pool(ori_q, delta_q, self.quest_len)
    cand_q_feat = attentive_max_pool(cand_a, delta_a, self.answer_len)
    # The question is pooled a second time with the weights obtained
    # against the negative answer.
    neg_ori_q_feat = attentive_max_pool(ori_q, delta_neg_q, self.quest_len)
    neg_q_feat = attentive_max_pool(neg_a, delta_neg_a, self.answer_len)
    test_q_feat = attentive_max_pool(test_q, delta_test_q, self.quest_len)
    test_a_feat = attentive_max_pool(test_a, delta_test_a, self.answer_len)
    # -------------------------- recalculate lstm output end ----------------

    # cal cosine similarity
    self.ori_cand = feature2cos_sim(ori_q_feat, cand_q_feat)
    self.ori_neg = feature2cos_sim(neg_ori_q_feat, neg_q_feat)
    self.test_q_a = feature2cos_sim(test_q_feat, test_a_feat)
    self.loss, self.acc = cal_loss_and_acc(self.ori_cand, self.ori_neg)
def __init__(self, batch_size, num_unroll_steps, embeddings, embedding_size,
             rnn_size, num_rnn_layers, max_grad_norm, attention_matrix_size,
             loss_ratio, l2_reg_lambda=0.0, adjust_weight=False,
             label_weight=None, is_training=True, m=0.1):
    """Build the multi-task attention biLSTM answer-selection graph.

    Follows "LSTM-based deep learning models for non-factoid answer
    selection": the pooled question feature gates every answer token
    embedding before the answer is encoded by the shared biLSTM. On top of
    the ranking loss, a small dense head classifies the question feature
    into ``CAT_NUMBER`` categories; both losses are blended by
    ``loss_ratio``.

    Args:
        batch_size: Stored on the instance; not used while building.
        num_unroll_steps: Fixed token length of every input sequence.
        embeddings: Initial value of the trainable embedding matrix.
        embedding_size: Width of one token embedding.
        rnn_size: Hidden size passed to ``biLSTM``. The pooled feature is
            assumed to be ``2 * rnn_size`` wide (shape of ``U``).
        num_rnn_layers: Stored on the instance; not used while building.
        max_grad_norm: Stored on the instance; not used while building.
        attention_matrix_size: Accepted but not used by this constructor.
        loss_ratio: Weight of the category loss; the ranking loss gets
            ``1 - loss_ratio``.
        l2_reg_lambda: Stored on the instance; not used while building.
        adjust_weight: Stored on the instance; not used while building.
        label_weight: Stored on the instance; ``None`` means an empty list.
        is_training: Stored on the instance; not used while building.
        m: Third argument of ``cal_loss_and_acc`` (presumably the ranking
            margin; confirm against that helper).
    """
    # define input variable
    self.batch_size = batch_size
    self.embeddings = embeddings
    self.embedding_size = embedding_size
    self.adjust_weight = adjust_weight
    # A fresh list per instance: a literal [] default would be shared by
    # every model built without an explicit label_weight.
    self.label_weight = [] if label_weight is None else label_weight
    self.rnn_size = rnn_size
    self.num_rnn_layers = num_rnn_layers
    self.num_unroll_steps = num_unroll_steps
    self.max_grad_norm = max_grad_norm
    self.l2_reg_lambda = l2_reg_lambda
    self.is_training = is_training

    # Declared for callers; not consumed anywhere in this constructor.
    self.keep_prob = tf.placeholder(tf.float32, name="keep_drop")

    # Learning rate lives in the graph and is changed via assign_new_lr.
    self.lr = tf.Variable(0.0, trainable=False)
    self.new_lr = tf.placeholder(tf.float32, shape=[],
                                 name="new_learning_rate")
    self._lr_update = tf.assign(self.lr, self.new_lr)

    seq_shape = [None, self.num_unroll_steps]
    self.ori_input_quests = tf.placeholder(tf.int32, shape=seq_shape)
    self.cand_input_quests = tf.placeholder(tf.int32, shape=seq_shape)
    self.neg_input_quests = tf.placeholder(tf.int32, shape=seq_shape)
    self.test_input_q = tf.placeholder(tf.int32, shape=seq_shape)
    self.test_input_a = tf.placeholder(tf.int32, shape=seq_shape)
    self.cat_ids = tf.placeholder(tf.int32, [None, CAT_NUMBER],
                                  name='cat_ids')

    # embedding layer
    with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
        W = tf.Variable(tf.to_float(self.embeddings), trainable=True,
                        name="W")
        ori_quests = tf.nn.embedding_lookup(W, self.ori_input_quests)
        cand_quests = tf.nn.embedding_lookup(W, self.cand_input_quests)
        neg_quests = tf.nn.embedding_lookup(W, self.neg_input_quests)
        test_q = tf.nn.embedding_lookup(W, self.test_input_q)
        test_a = tf.nn.embedding_lookup(W, self.test_input_a)

    # build LSTM network
    # U projects a pooled question feature into embedding space so it can
    # be scored against every answer token embedding.
    U = tf.Variable(
        tf.truncated_normal([2 * self.rnn_size, self.embedding_size],
                            stddev=0.1),
        name="U")

    def gate_by_question(answer_emb, quest_feat):
        """Scale each answer token embedding by its sigmoid question score."""
        query = tf.reshape(
            tf.expand_dims(tf.matmul(quest_feat, U), 1),
            [-1, self.embedding_size, 1])
        # One scalar weight per answer token.
        att_weight = tf.sigmoid(tf.matmul(answer_emb, query))
        return tf.multiply(
            answer_emb,
            tf.tile(att_weight, [1, 1, self.embedding_size]))

    with tf.variable_scope("LSTM_scope", reuse=None):
        ori_q = biLSTM(ori_quests, self.rnn_size)
        ori_q_feat = tf.nn.tanh(max_pooling(ori_q))
    with tf.variable_scope("LSTM_scope", reuse=True):
        cand_gated = gate_by_question(cand_quests, ori_q_feat)
        neg_gated = gate_by_question(neg_quests, ori_q_feat)
        cand_a = biLSTM(cand_gated, self.rnn_size)
        neg_a = biLSTM(neg_gated, self.rnn_size)
        cand_q_feat = tf.nn.tanh(max_pooling(cand_a))
        neg_q_feat = tf.nn.tanh(max_pooling(neg_a))

        test_q_out = biLSTM(test_q, self.rnn_size)
        test_q_out = tf.nn.tanh(max_pooling(test_q_out))
        test_a_out = biLSTM(gate_by_question(test_a, test_q_out),
                            self.rnn_size)
        test_a_out = tf.nn.tanh(max_pooling(test_a_out))

    # multitasking
    with tf.name_scope("multitasking"):
        feature_size = int(ori_q_feat.get_shape()[1])
        fc1 = tf.layers.dense(ori_q_feat, feature_size * 2,
                              activation=tf.nn.relu, name='fc1')
        fc2 = tf.layers.dense(fc1, feature_size, activation=tf.nn.relu,
                              name='fc2')
        # No activation here: softmax_cross_entropy_with_logits applies
        # its own softmax and expects unscaled scores. Squashing them with
        # a sigmoid first confines every score to (0, 1), which caps how
        # confident the classifier can ever become.
        logits = tf.layers.dense(fc2, CAT_NUMBER)
        # cat_ids is fed as int32 while the loss is computed in the logits
        # dtype, so cast the labels explicitly.
        entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=logits, labels=tf.cast(self.cat_ids, logits.dtype),
            name='loss')
        loss_multitask = tf.reduce_mean(entropy)

    # acc
    self.ori_cand_score = feature2cos_sim(ori_q_feat, cand_q_feat)
    self.ori_neg_score = feature2cos_sim(ori_q_feat, neg_q_feat)
    loss_origin, self.acc = cal_loss_and_acc(self.ori_cand_score,
                                             self.ori_neg_score, m)
    self.loss = loss_origin * (1 - loss_ratio) + loss_multitask * loss_ratio
    self.test_q_a = feature2cos_sim(test_q_out, test_a_out)

    # multitasking_acc
    with tf.name_scope("multi_acc"):
        self.preds = tf.nn.softmax(logits)
        self.correct_preds = tf.equal(tf.argmax(self.preds, 1),
                                      tf.argmax(self.cat_ids, 1))
        # Number of correctly classified rows in the batch (a count, not
        # a ratio).
        self.multi_acc = tf.reduce_sum(
            tf.cast(self.correct_preds, tf.float32))

def assign_new_lr(self, session, lr_value):
    """Set the graph's learning-rate variable to ``lr_value``.

    Args:
        session: Session used to run the assignment op.
        lr_value: Value fed into the ``new_learning_rate`` placeholder.
    """
    session.run(self._lr_update, feed_dict={self.new_lr: lr_value})
def forward(self, inputs, labels=None):
    """Encode a batch, pool it, classify it and, in training, apply mixup.

    :param inputs: [bsz, max_seq_leng] token ids; entries ``> 0`` are
        treated as real tokens (presumably 0 is the padding id).
    :param labels: [bsz, num_class] when ``self.multi_label`` is set;
        otherwise they are passed through ``to_onehot``. Read
        unconditionally in training mode, so the ``None`` default is only
        usable in eval mode.
    :return: ``(logits, mix_logits, mix_labels)`` when ``self.training``
        is true, otherwise ``(logits, hidden)`` where ``hidden`` is the
        pooled, normalized representation.
    """
    # The transpose makes the first axis the sequence axis; the mask and
    # the length sum over dim 0 below are computed on that layout.
    inputs = inputs.t()
    mask = (inputs > 0).float()
    inputs_len = (inputs > 0).int().sum(dim=0)
    hidden = self.encoder(inputs, mask, inputs_len)

    # Collect one summary per configured pooling type. A name that matches
    # none of the branches contributes nothing.
    pool_values = []
    for pool in self.summary_type:
        if pool == 'max':
            val = max_pooling(hidden, mask)
            pool_values.append(val)
        elif pool == 'mean':
            val = mean_pooling(hidden, inputs_len, mask)
            pool_values.append(val)
        elif pool == 'first':
            # Encoder output is unpacked as (seq_len, bsz, dim).
            seq_len, bsz, dim = hidden.size()
            val = hidden[0, :, :].view(bsz, -1).contiguous()
            pool_values.append(val)
        elif pool == 'last':
            seq_len, bsz, dim = hidden.size()
            val = hidden[-1, :, :].view(bsz, -1).contiguous()
            pool_values.append(val)
        elif pool == 'struct_att':
            # The attention map returned alongside the values is unused.
            val, att = self.strut_att(hidden, mask)
            bsz, head_num, dim = val.size()
            # Flatten all heads into one vector per sample.
            val = val.contiguous().view(bsz, -1)
            pool_values.append(val)
        elif pool == 'none':
            # NOTE(review): this appends the unpooled encoder output; the
            # 2-D unpack below would then fail - confirm intended use.
            pool_values.append(hidden)

    if len(self.summary_type) == 1:
        hidden = pool_values[0]
    else:
        hidden = torch.cat(pool_values, dim=-1).contiguous()
    # [bsz, hid_dim]
    bsz, hid_dim = hidden.size()
    # Dropout before the classifier is disabled:
    # logits = self.cls(self.dropout(hidden))
    hidden = self.normalize(hidden)
    logits = self.cls(hidden)

    if self.training:
        # Mixup: pair every sample with a randomly permuted partner.
        indices = torch.randperm(bsz, device=logits.device)
        shuf_labels = torch.index_select(labels, 0, indices)
        shuf_hidden = torch.index_select(hidden, 0, indices)
        if self.mixup_type == 'mixup':
            # Same coefficient for features and labels.
            lam = self.beta_dist.sample(sample_shape=(bsz, 1))
            lam = lam.to(inputs.device)
            lam_x, lam_y = lam, lam
        elif self.mixup_type == 'prior_mix':
            lam_x = self.beta_dist.sample(sample_shape=(bsz,))
            lam_x = lam_x.to(inputs.device)
            lam_y = self.prior_mixup(labels, shuf_labels)
            # Harmonic mean of the sampled and the prior-based ratio.
            lam_y = 2. * lam_x * lam_y / (lam_x + lam_y)
        else:
            raise Exception('Unsupported mixup type %s' % self.mixup_type)
        mix_hidden = lam_x * hidden + (1 - lam_x) * shuf_hidden
        if not self.multi_label:
            onehot_label = to_onehot(labels, self.num_class)
            onehot_shuf_label = to_onehot(shuf_labels, self.num_class)
        else:
            onehot_label = labels
            onehot_shuf_label = shuf_labels
        # NOTE(review): the original indentation was lost; this line is
        # assumed to run for both label modes. The resulting shapes depend
        # on beta_dist and prior_mixup, which are not visible here - confirm
        # lam_x / lam_y broadcast as intended for both mixup types.
        lam_y = lam_y.unsqueeze(-1)
        mix_labels = lam_y * onehot_label + (1 - lam_y) * onehot_shuf_label
        mix_logits = self.cls(mix_hidden)
        return logits, mix_logits, mix_labels
    return logits, hidden
def __init__(self, batch_size, num_unroll_steps, embeddings, embedding_size,
             rnn_size, num_rnn_layers, max_grad_norm, attention_matrix_size,
             l2_reg_lambda=0.0, adjust_weight=False, label_weight=None,
             is_training=True):
    """Build the attention biLSTM answer-selection graph.

    Follows "LSTM-based deep learning models for non-factoid answer
    selection": the pooled question feature gates every answer token
    embedding before the answer is encoded by the shared biLSTM. The
    pooled question and answer features are compared with
    ``feature2cos_sim`` and ``cal_loss_and_acc`` produces ``self.loss``
    and ``self.acc``.

    Args:
        batch_size: Stored on the instance; not used while building.
        num_unroll_steps: Fixed token length of every input sequence.
        embeddings: Initial value of the trainable embedding matrix.
        embedding_size: Width of one token embedding.
        rnn_size: Hidden size passed to ``biLSTM``. The pooled feature is
            assumed to be ``2 * rnn_size`` wide (shape of ``U``).
        num_rnn_layers: Stored on the instance; not used while building.
        max_grad_norm: Stored on the instance; not used while building.
        attention_matrix_size: Accepted but not used by this constructor.
        l2_reg_lambda: Stored on the instance; not used while building.
        adjust_weight: Stored on the instance; not used while building.
        label_weight: Stored on the instance; ``None`` means an empty list.
        is_training: Stored on the instance; not used while building.
    """
    # define input variable
    self.batch_size = batch_size
    self.embeddings = embeddings
    self.embedding_size = embedding_size
    self.adjust_weight = adjust_weight
    # A fresh list per instance: a literal [] default would be shared by
    # every model built without an explicit label_weight.
    self.label_weight = [] if label_weight is None else label_weight
    self.rnn_size = rnn_size
    self.num_rnn_layers = num_rnn_layers
    self.num_unroll_steps = num_unroll_steps
    self.max_grad_norm = max_grad_norm
    self.l2_reg_lambda = l2_reg_lambda
    self.is_training = is_training

    # Declared for callers; not consumed anywhere in this constructor.
    self.keep_prob = tf.placeholder(tf.float32, name="keep_drop")

    # Learning rate lives in the graph and is changed through _lr_update.
    self.lr = tf.Variable(0.0, trainable=False)
    self.new_lr = tf.placeholder(tf.float32, shape=[],
                                 name="new_learning_rate")
    self._lr_update = tf.assign(self.lr, self.new_lr)

    seq_shape = [None, self.num_unroll_steps]
    self.ori_input_quests = tf.placeholder(tf.int32, shape=seq_shape)
    self.cand_input_quests = tf.placeholder(tf.int32, shape=seq_shape)
    self.neg_input_quests = tf.placeholder(tf.int32, shape=seq_shape)
    self.test_input_q = tf.placeholder(tf.int32, shape=seq_shape)
    self.test_input_a = tf.placeholder(tf.int32, shape=seq_shape)

    # embedding layer
    with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
        W = tf.Variable(tf.to_float(self.embeddings), trainable=True,
                        name="W")
        ori_quests = tf.nn.embedding_lookup(W, self.ori_input_quests)
        cand_quests = tf.nn.embedding_lookup(W, self.cand_input_quests)
        neg_quests = tf.nn.embedding_lookup(W, self.neg_input_quests)
        test_q = tf.nn.embedding_lookup(W, self.test_input_q)
        test_a = tf.nn.embedding_lookup(W, self.test_input_a)

    # build LSTM network
    # U projects a pooled question feature into embedding space so it can
    # be scored against every answer token embedding.
    U = tf.Variable(
        tf.truncated_normal([2 * self.rnn_size, self.embedding_size],
                            stddev=0.1),
        name="U")

    def gate_by_question(answer_emb, quest_feat):
        """Scale each answer token embedding by its sigmoid question score."""
        # tf.matmul / tf.multiply replace tf.batch_matmul / tf.mul, which
        # were removed in TensorFlow 1.0.
        query = tf.reshape(
            tf.expand_dims(tf.matmul(quest_feat, U), 1),
            [-1, self.embedding_size, 1])
        # One scalar weight per answer token.
        att_weight = tf.sigmoid(tf.matmul(answer_emb, query))
        return tf.multiply(
            answer_emb,
            tf.tile(att_weight, [1, 1, self.embedding_size]))

    with tf.variable_scope("LSTM_scope", reuse=None):
        ori_q = biLSTM(ori_quests, self.rnn_size)
        ori_q_feat = tf.nn.tanh(max_pooling(ori_q))
    with tf.variable_scope("LSTM_scope", reuse=True):
        cand_gated = gate_by_question(cand_quests, ori_q_feat)
        neg_gated = gate_by_question(neg_quests, ori_q_feat)
        cand_a = biLSTM(cand_gated, self.rnn_size)
        neg_a = biLSTM(neg_gated, self.rnn_size)
        cand_q_feat = tf.nn.tanh(max_pooling(cand_a))
        neg_q_feat = tf.nn.tanh(max_pooling(neg_a))

        test_q_out = biLSTM(test_q, self.rnn_size)
        test_q_out = tf.nn.tanh(max_pooling(test_q_out))
        test_a_out = biLSTM(gate_by_question(test_a, test_q_out),
                            self.rnn_size)
        test_a_out = tf.nn.tanh(max_pooling(test_a_out))

    self.ori_cand = feature2cos_sim(ori_q_feat, cand_q_feat)
    self.ori_neg = feature2cos_sim(ori_q_feat, neg_q_feat)
    self.loss, self.acc = cal_loss_and_acc(self.ori_cand, self.ori_neg)
    self.test_q_a = feature2cos_sim(test_q_out, test_a_out)