Example #1
    def __init__(self, batch_size, num_unroll_steps, embeddings, embedding_size, rnn_size, num_rnn_layers, max_grad_norm, l2_reg_lambda=0.0, adjust_weight=False, label_weight=[], is_training=True):
        # define input variable
        self.batch_size = batch_size
        self.embeddings = embeddings
        self.embedding_size = embedding_size
        self.adjust_weight = adjust_weight
        self.label_weight = label_weight
        self.rnn_size = rnn_size
        self.num_rnn_layers = num_rnn_layers
        self.num_unroll_steps = num_unroll_steps
        self.max_grad_norm = max_grad_norm
        self.l2_reg_lambda = l2_reg_lambda
        self.is_training = is_training

        self.keep_prob = tf.placeholder(tf.float32, name="keep_drop")
        
        self.lr = tf.Variable(0.0, trainable=False)
        self.new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
        self._lr_update = tf.assign(self.lr, self.new_lr)

        self.ori_input_quests = tf.placeholder(tf.int32, shape=[None, self.num_unroll_steps])
        self.cand_input_quests = tf.placeholder(tf.int32, shape=[None, self.num_unroll_steps])
        self.neg_input_quests = tf.placeholder(tf.int32, shape=[None, self.num_unroll_steps])

        self.test_input_q = tf.placeholder(tf.int32, shape=[None, self.num_unroll_steps])
        self.test_input_a = tf.placeholder(tf.int32, shape=[None, self.num_unroll_steps])


        #embedding layer
        with tf.device("/cpu:0"),tf.name_scope("embedding_layer"):
            W = tf.Variable(tf.to_float(self.embeddings), trainable=True, name="W")
            ori_quests =tf.nn.embedding_lookup(W, self.ori_input_quests)
            cand_quests =tf.nn.embedding_lookup(W, self.cand_input_quests)
            neg_quests =tf.nn.embedding_lookup(W, self.neg_input_quests)

            test_q =tf.nn.embedding_lookup(W, self.test_input_q)
            test_a =tf.nn.embedding_lookup(W, self.test_input_a)

        #build LSTM network
        with tf.variable_scope("LSTM_scope", reuse=None):
            ori_q = LSTM(ori_quests, self.rnn_size, self.batch_size)
            ori_q_feat = tf.nn.tanh(max_pooling(ori_q))
        with tf.variable_scope("LSTM_scope", reuse=True):
            cand_a = LSTM(cand_quests, self.rnn_size, self.batch_size)
            neg_a = LSTM(neg_quests, self.rnn_size, self.batch_size)
            cand_q_feat = tf.nn.tanh(max_pooling(cand_a))
            neg_q_feat = tf.nn.tanh(max_pooling(neg_a))

            test_q_out = LSTM(test_q, self.rnn_size, self.batch_size)
            test_q_out = tf.nn.tanh(max_pooling(test_q_out))
            test_a_out = LSTM(test_a, self.rnn_size, self.batch_size)
            test_a_out = tf.nn.tanh(max_pooling(test_a_out))

        self.ori_cand = feature2cos_sim(ori_q_feat, cand_q_feat)
        self.ori_neg = feature2cos_sim(ori_q_feat, neg_q_feat)
        self.loss, self.acc = cal_loss_and_acc(self.ori_cand, self.ori_neg)

        self.test_q_a = feature2cos_sim(test_q_out, test_a_out)
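
The helpers used above (LSTM, max_pooling, feature2cos_sim, cal_loss_and_acc) are defined elsewhere in the project. As a rough orientation, a minimal max_pooling over the time axis of the LSTM output could look like the sketch below, assuming the output has shape [batch, num_unroll_steps, rnn_size]; the real helper may differ.

import tensorflow as tf

def max_pooling(lstm_out):
    # lstm_out: [batch, num_unroll_steps, rnn_size]
    # keep the strongest activation of every unit across all time steps
    return tf.reduce_max(lstm_out, axis=1)  # [batch, rnn_size]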
Example #2
    def forward(self, A, X, seq_len, seqs, hidden):
        batch_size = seqs.shape[0]
        Ws = []
        for i in range(self.num_layers):
            if i == 0:
                H, W = self.layers[i](A)
            else:
                H = self.normalization(H)
                H, W = self.layers[i](A, H)
            Ws.append(W)
        for i in range(self.num_channels):
            if i == 0:
                edge_index, edge_weight = H[i][0], H[i][1]
                X_ = self.gcn(X,
                              edge_index=edge_index.detach(),
                              edge_weight=edge_weight)
                X_ = F.relu(X_)
            else:
                edge_index, edge_weight = H[i][0], H[i][1]
                X_ = torch.cat((X_,
                                F.relu(
                                    self.gcn(X,
                                             edge_index=edge_index.detach(),
                                             edge_weight=edge_weight))),
                               dim=1)

        X_ = self.linear1(X_)

        basket_seqs = torch.zeros(batch_size * self.max_seq_length,
                                  self.w_out,
                                  dtype=self.dtype,
                                  device=self.device)
        seqs = seqs.contiguous().view(-1, self.nb_items)
        for i, basket in enumerate(seqs, 0):
            if torch.sum(basket) > 0:
                item_idx = torch.nonzero(basket, as_tuple=True)
                basket_embed = utils.max_pooling(X_[item_idx])
                basket_seqs[i] = basket_embed

        basket_seqs = basket_seqs.contiguous().view(-1, self.max_seq_length,
                                                    self.w_out)
        lstm_out, (h_n, c_n) = self.lstm(basket_seqs, hidden)
        actual_index = torch.arange(0, batch_size) * self.max_seq_length + (
            seq_len - 1)
        actual_lstm_out = lstm_out.reshape(-1, self.rnn_units)[actual_index]

        hidden_to_score = self.h2item_score(actual_lstm_out)
        # print(hidden_to_score)

        # predict next items score
        next_item_probs = torch.sigmoid(hidden_to_score)

        # loss = self.loss(next_item_probs, target_basket)
        # return loss, target_basket, Ws
        return next_item_probs
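
utils.max_pooling is used here to collapse the embeddings of all items in one basket into a single basket vector. A plausible minimal version is sketched below; this is an assumption, not necessarily the project's implementation.

import torch

def max_pooling(item_embeddings):
    # item_embeddings: [num_items_in_basket, w_out]
    # element-wise max over the items yields one basket embedding of size w_out
    return torch.max(item_embeddings, dim=0)[0]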
Example #3
    def build(self, input, is_dropout=False):  # is_dropout: whether to apply dropout
        conv1_1 = conv3_3(input, 64, 'conv1_1', self.data_dict_VGG16, finetune=self.finetune)
        conv1_2 = conv3_3(conv1_1, 64, 'conv1_2', self.data_dict_VGG16, finetune=self.finetune)
        pool1 = max_pooling(conv1_2, 'pool1')
        # conv2
        conv2_1 = conv3_3(pool1, 128, 'conv2_1', self.data_dict_VGG16, finetune=self.finetune)
        conv2_2 = conv3_3(conv2_1, 128, 'conv2_2', self.data_dict_VGG16, finetune=self.finetune)
        pool2 = max_pooling(conv2_2, 'pool2')
        # conv3
        conv3_1 = conv3_3(pool2, 256, 'conv3_1', self.data_dict_VGG16, finetune=self.finetune)
        conv3_2 = conv3_3(conv3_1, 256, 'conv3_2', self.data_dict_VGG16, finetune=self.finetune)
        # result stored under a different name so the conv3_3 helper is not shadowed
        # before the conv4/conv5 blocks below
        conv3_3_out = conv3_3(conv3_2, 256, 'conv3_3', self.data_dict_VGG16, finetune=self.finetune)
        pool3 = max_pooling(conv3_3_out, 'pool3')
        # conv4
        conv4_1 = conv3_3(pool3, 512, 'conv4_1', self.data_dict_VGG16, finetune=self.finetune)
        conv4_2 = conv3_3(conv4_1, 512, 'conv4_2', self.data_dict_VGG16, finetune=self.finetune)
        conv4_3 = conv3_3(conv4_2, 512, 'conv4_3', self.data_dict_VGG16, finetune=self.finetune)
        pool4 = max_pooling(conv4_3, 'pool4')

        # conv5
        conv5_1 = conv3_3(pool4, 512, 'conv5_1', self.data_dict_VGG16, finetune=self.finetune)
        conv5_2 = conv3_3(conv5_1, 512, 'conv5_2', self.data_dict_VGG16, finetune=self.finetune)
        conv5_3 = conv3_3(conv5_2, 512, 'conv5_3', self.data_dict_VGG16, finetune=self.finetune)
        pool5 = max_pooling(conv5_3, 'pool5')

        # fully connected layer
        flatten = tf.reshape(pool5, [self.batchsize, -1])
        fc_6 = fc(flatten, 4096, 'fc_6', finetune=False)
        fc_6 = tf.nn.relu(fc_6)
        if is_dropout: fc_6 = tf.nn.dropout(fc_6, 0.5)

        fc_7 = fc(fc_6, 4096, 'fc_7', finetune=False)
        fc_7 = tf.nn.relu(fc_7)
        if is_dropout: fc_7 = tf.nn.dropout(fc_7, 0.5)

        fc_8 = fc(fc_7, self.n_classes, 'fc_8', finetune=False)
        return fc_8
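
In this VGG-style network, max_pooling takes a feature map and a layer name; it is presumably a thin wrapper around tf.nn.max_pool with a 2x2 window and stride 2. The sketch below rests on that assumption.

import tensorflow as tf

def max_pooling(x, name):
    # x: [batch, height, width, channels]; halves the spatial resolution
    return tf.nn.max_pool(x,
                          ksize=[1, 2, 2, 1],
                          strides=[1, 2, 2, 1],
                          padding='SAME',
                          name=name)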
Example #4
    def __init__(self,
                 batch_size,
                 num_unroll_steps,
                 embeddings,
                 embedding_size,
                 rnn_size,
                 num_rnn_layers,
                 max_grad_norm,
                 attention_matrix_size,
                 loss_ratio,
                 l2_reg_lambda=0.0,
                 adjust_weight=False,
                 label_weight=[],
                 is_training=True,
                 m=0.1):
        # define input variable
        self.batch_size = batch_size
        self.embeddings = embeddings
        self.embedding_size = embedding_size
        self.adjust_weight = adjust_weight
        self.label_weight = label_weight
        self.rnn_size = rnn_size
        self.num_rnn_layers = num_rnn_layers
        self.num_unroll_steps = num_unroll_steps
        self.max_grad_norm = max_grad_norm
        self.l2_reg_lambda = l2_reg_lambda
        self.is_training = is_training

        self.keep_prob = tf.placeholder(tf.float32, name="keep_drop")

        self.lr = tf.Variable(0.0, trainable=False)
        self.new_lr = tf.placeholder(tf.float32,
                                     shape=[],
                                     name="new_learning_rate")
        self._lr_update = tf.assign(self.lr, self.new_lr)

        self.ori_input_quests = tf.placeholder(
            tf.int32, shape=[None, self.num_unroll_steps])
        self.cand_input_quests = tf.placeholder(
            tf.int32, shape=[None, self.num_unroll_steps])
        self.neg_input_quests = tf.placeholder(
            tf.int32, shape=[None, self.num_unroll_steps])

        self.test_input_q = tf.placeholder(tf.int32,
                                           shape=[None, self.num_unroll_steps],
                                           name='test_q')
        self.test_input_a = tf.placeholder(tf.int32,
                                           shape=[None, self.num_unroll_steps],
                                           name='test_a')
        self.cat_ids = tf.placeholder(tf.int32, [None, CAT_NUMBER],
                                      name='cat_ids')

        #embedding layer
        with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
            W = tf.Variable(tf.to_float(self.embeddings),
                            trainable=True,
                            name="W")
            ori_quests = tf.nn.embedding_lookup(W, self.ori_input_quests)
            cand_quests = tf.nn.embedding_lookup(W, self.cand_input_quests)
            neg_quests = tf.nn.embedding_lookup(W, self.neg_input_quests)

            test_q = tf.nn.embedding_lookup(W, self.test_input_q)
            test_a = tf.nn.embedding_lookup(W, self.test_input_a)

        # run lstm without attention
        with tf.variable_scope("LSTM_scope") as scope:
            ori_q = biLSTM(ori_quests, self.rnn_size)
            ori_q_feat = tf.nn.tanh(max_pooling(ori_q))

            scope.reuse_variables()

            cand_a = biLSTM(cand_quests, self.rnn_size)
            neg_a = biLSTM(neg_quests, self.rnn_size)
            cand_q_feat = tf.nn.tanh(max_pooling(cand_a))
            neg_q_feat = tf.nn.tanh(max_pooling(neg_a))

            test_q_out = biLSTM(test_q, self.rnn_size)
            test_q_out = tf.nn.tanh(max_pooling(test_q_out))
            test_a_out = biLSTM(test_a, self.rnn_size)
            test_a_out = tf.nn.tanh(max_pooling(test_a_out))

        # build LSTM network
        # with tf.variable_scope("LSTM_scope") as scope:
        #     ori_q = biLSTM(ori_quests, self.rnn_size)
        #     #ori_q_feat = tf.nn.tanh(max_pooling(ori_q))
        #
        #     scope.reuse_variables()
        #
        #     cand_a = biLSTM(cand_quests, self.rnn_size)
        #     neg_a = biLSTM(neg_quests, self.rnn_size)
        #     #cand_q_feat = tf.nn.tanh(max_pooling(cand_a))
        #     #neg_q_feat = tf.nn.tanh(max_pooling(neg_a))
        #
        #     test_q_out = biLSTM(test_q, self.rnn_size)
        #     #test_q_out = tf.nn.tanh(max_pooling(test_q_out))
        #     test_a_out = biLSTM(test_a, self.rnn_size)
        #     #test_a_out = tf.nn.tanh(max_pooling(test_a_out))

        # with tf.name_scope("att_weight"):
        #     # attention params
        #     att_W = {
        #     	'Wam': tf.Variable(tf.truncated_normal([2 * self.rnn_size, attention_matrix_size], stddev=0.1)),
        #     	'Wqm': tf.Variable(tf.truncated_normal([2 * self.rnn_size, attention_matrix_size], stddev=0.1)),
        #     	'Wms': tf.Variable(tf.truncated_normal([attention_matrix_size, 1], stddev=0.1))
        #     }
        #     ori_q_feat, cand_q_feat = get_feature(ori_q, cand_a, att_W)
        #     ori_nq_feat, neg_q_feat = get_feature(ori_q, neg_a, att_W)
        #     test_q_out, test_a_out = get_feature(test_q_out, test_a_out, att_W)

        # multitasking
        with tf.name_scope("multitasking"):

            feature_size = int(ori_q_feat.get_shape()[1])

            w = tf.get_variable(name='weights',
                                shape=(feature_size, CAT_NUMBER),
                                initializer=tf.random_normal_initializer())
            b = tf.get_variable(name='bias',
                                shape=(1, CAT_NUMBER),
                                initializer=tf.zeros_initializer())

            # positive_qa = tf.concat([out_ori,out_cand],1,name="embedding_for_multitask")

            logits = tf.matmul(ori_q_feat, w) + b

            entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits, labels=self.cat_ids, name='loss')
            loss_multitask = tf.reduce_mean(entropy)

        # acc
        self.ori_cand_score = feature2cos_sim(ori_q_feat, cand_q_feat)
        self.ori_neg_score = feature2cos_sim(ori_q_feat, neg_q_feat)
        loss_origin, self.acc = cal_loss_and_acc(self.ori_cand_score,
                                                 self.ori_neg_score, m)

        self.loss = loss_origin * (1 -
                                   loss_ratio) + loss_multitask * loss_ratio

        self.test_q_a = feature2cos_sim(test_q_out, test_a_out)

        #multitasking_acc
        with tf.name_scope("multi_acc"):
            self.preds = tf.nn.softmax(logits)
            self.correct_preds = tf.equal(tf.argmax(self.preds, 1),
                                          tf.argmax(self.cat_ids, 1))
            self.multi_acc = tf.reduce_sum(
                tf.cast(self.correct_preds, tf.float32))
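
feature2cos_sim scores a question/answer pair by the cosine similarity of their pooled feature vectors; a minimal sketch under that assumption follows (the small epsilon guards against division by zero).

import tensorflow as tf

def feature2cos_sim(feat_q, feat_a):
    # feat_q, feat_a: [batch, feature_size]
    norm_q = tf.sqrt(tf.reduce_sum(tf.square(feat_q), 1))
    norm_a = tf.sqrt(tf.reduce_sum(tf.square(feat_a), 1))
    dot = tf.reduce_sum(tf.multiply(feat_q, feat_a), 1)
    return dot / (norm_q * norm_a + 1e-8)  # [batch]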
Example #5
    def forward(self, inputs):
        premises_indices = inputs[0]
        hypothesis_indices = inputs[1]
        premises_lengths = torch.sum(premises_indices != 0, dim=-1)
        hypothesis_lengths = torch.sum(hypothesis_indices != 0, dim=-1)
        premise_mask = get_mask(premises_indices,
                                premises_lengths).to(self.args.device)
        hypothesis_mask = get_mask(hypothesis_indices,
                                   hypothesis_lengths).to(self.args.device)

        embed_premises = self.embed(premises_indices)
        embed_hypothesis = self.embed(hypothesis_indices)

        if self.dropout:
            embed_premises = self._rnn_dropout(embed_premises)
            embed_hypothesis = self._rnn_dropout(embed_hypothesis)

        encoded_premises = self._encoding(embed_premises, premises_lengths)
        encoded_hypothesis = self._encoding(embed_hypothesis,
                                            hypothesis_lengths)

        attended_premises, attended_hypothesis = self._attention(
            encoded_premises, premise_mask, encoded_hypothesis,
            hypothesis_mask)
        enhanced_premise = torch.cat([
            encoded_premises, attended_premises, encoded_premises -
            attended_premises, encoded_premises * attended_premises
        ],
                                     dim=-1)
        enhanced_hypothesis = torch.cat([
            encoded_hypothesis, attended_hypothesis, encoded_hypothesis -
            attended_hypothesis, encoded_hypothesis * attended_hypothesis
        ],
                                        dim=-1)

        projected_premises = self._projection(enhanced_premise)
        projected_hypothesis = self._projection(enhanced_hypothesis)

        if self.dropout:
            projected_premises = self._rnn_dropout(projected_premises)
            projected_hypothesis = self._rnn_dropout(projected_hypothesis)

        v_ai = self._composition(projected_premises, premises_lengths)
        v_bj = self._composition(projected_hypothesis, hypothesis_lengths)

        v_a_avg = torch.sum(v_ai * premise_mask.unsqueeze(1)\
                            .transpose(2, 1), dim=1) / torch.sum(premise_mask, dim=1, keepdim=True)
        v_b_avg = torch.sum(
            v_bj * hypothesis_mask.unsqueeze(1).transpose(2, 1),
            dim=1) / torch.sum(hypothesis_mask, dim=1, keepdim=True)

        # v_a_max, _ = replace_masked(v_ai, premise_mask, -1e7).max(dim=1)
        # v_b_max, _ = replace_masked(v_bj, hypothesis_mask, -1e7).max(dim=1)
        v_a_max, _ = max_pooling(v_ai, premise_mask, dim=1)
        v_b_max, _ = max_pooling(v_bj, hypothesis_mask, dim=1)

        if self.args.use_char_emb:
            premises_char_indices = inputs[2]
            hypothesis_char_indices = inputs[3]
            premises_char_lengths = torch.sum(premises_char_indices != 0,
                                              dim=-1)
            hypothesis_char_lengths = torch.sum(hypothesis_char_indices != 0,
                                                dim=-1)
            premise_char_mask = get_mask(premises_char_indices,
                                         premises_char_lengths).to(
                                             self.args.device)
            hypothesis_char_mask = get_mask(hypothesis_char_indices,
                                            hypothesis_char_lengths).to(
                                                self.args.device)

            embed_char_premises = self.char_embed(premises_char_indices)
            embed_char_hypothesis = self.char_embed(hypothesis_char_indices)

            if self.dropout:
                embed_char_premises = self._rnn_dropout(embed_char_premises)
                embed_char_hypothesis = self._rnn_dropout(
                    embed_char_hypothesis)

            encoded_char_premises = self._char_encoding(
                embed_char_premises, premises_char_lengths)
            encoded_char_hypothesis = self._char_encoding(
                embed_char_hypothesis, hypothesis_char_lengths)

            attended_char_premises, attended_char_hypothesis = self._attention(
                encoded_char_premises, premise_char_mask,
                encoded_char_hypothesis, hypothesis_char_mask)
            enhanced_char_premise = torch.cat([
                encoded_char_premises, attended_char_premises,
                encoded_char_premises - attended_char_premises,
                encoded_char_premises * attended_char_premises
            ],
                                              dim=-1)
            enhanced_char_hypothesis = torch.cat([
                encoded_char_hypothesis, attended_char_hypothesis,
                encoded_char_hypothesis - attended_char_hypothesis,
                encoded_char_hypothesis * attended_char_hypothesis
            ],
                                                 dim=-1)

            projected_char_premises = self._char_projection(
                enhanced_char_premise)
            projected_char_hypothesis = self._char_projection(
                enhanced_char_hypothesis)

            if self.dropout:
                projected_char_premises = self._rnn_dropout(
                    projected_char_premises)
                projected_char_hypothesis = self._rnn_dropout(
                    projected_char_hypothesis)

            cv_ai = self._char_composition(projected_char_premises,
                                           premises_char_lengths)
            cv_bj = self._char_composition(projected_char_hypothesis,
                                           hypothesis_char_lengths)

            cv_a_avg = torch.sum(cv_ai * premise_char_mask.unsqueeze(1) \
                                .transpose(2, 1), dim=1) / torch.sum(premise_char_mask, dim=1, keepdim=True)
            cv_b_avg = torch.sum(
                cv_bj * hypothesis_char_mask.unsqueeze(1).transpose(2, 1),
                dim=1) / torch.sum(hypothesis_char_mask, dim=1, keepdim=True)

            # cv_a_max, _ = replace_masked(cv_ai, premise_char_mask, -1e7).max(dim=1)
            # cv_b_max, _ = replace_masked(cv_bj, hypothesis_char_mask, -1e7).max(dim=1)
            cv_a_max, _ = max_pooling(cv_ai, premise_char_mask, dim=1)
            cv_b_max, _ = max_pooling(cv_bj, hypothesis_char_mask, dim=1)

        # note: the cv_* character-level features above only exist when
        # self.args.use_char_emb is True, so this concatenation assumes it is enabled
        v = torch.cat([
            v_a_avg, v_a_max, v_b_avg, v_b_max, cv_a_avg, cv_a_max, cv_b_avg,
            cv_b_max
        ], dim=1)

        logits = self._classification(v)

        return logits
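
max_pooling in this snippet is a masked variant that behaves like Tensor.max and returns a (values, indices) pair, with padded positions pushed to a large negative value before pooling; a sketch under those assumptions:

import torch

def max_pooling(tensor, mask, dim=1):
    # tensor: [batch, seq_len, hidden]; mask: [batch, seq_len], 1 for real tokens
    masked = tensor.masked_fill((mask == 0).unsqueeze(-1), -1e7)
    return masked.max(dim=dim)  # (values, indices), like Tensor.max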
Example #6
    def forward(self, inputs):
        premises_indices = inputs[0]
        hypothesis_indices = inputs[1]
        # print(premises_indices.size())
        # batch, 1
        premises_lengths = torch.sum(premises_indices != 0, dim=-1)
        hypothesis_lengths = torch.sum(hypothesis_indices != 0, dim=-1)
        # print(premises_lengths.size())
        # batch, seq_len
        premise_mask = get_mask(premises_indices, premises_lengths).to(self.args.device)
        hypothesis_mask = get_mask(hypothesis_indices, hypothesis_lengths).to(self.args.device)
        # print(premise_mask.size())

        embed_premise = self.embed(premises_indices)
        embed_hypothesis = self.embed(hypothesis_indices)
        # batch, seq_len, embed_dim

        embed_premise = self._rnn_dropout(embed_premise)
        embed_hypothesis = self._rnn_dropout(embed_hypothesis)
        # ----Encoder Layer----
        # (batch, seq_len, 2*hidden_size)
        encode_premise = self.sentence_encoder(embed_premise, premises_lengths)
        encode_hypothesis = self.sentence_encoder(embed_hypothesis, hypothesis_lengths)
        # print(encode_premise.size())
        # Co-Attention Layer
        # encode_premise,_ = self.average_attention(encode_premise , premise_mask)
        # encode_hypothesis,_ = self.average_attention(encode_hypothesis, hypothesis_mask)

        # attended_premise, attended_hypothesis = self._attention(encode_premise, premise_mask,
        #                                                         encode_hypothesis, hypothesis_mask)
        seq_len_p = encode_premise.size(1)
        seq_len_h = encode_hypothesis.size(1)

        _hypothesis_mask = hypothesis_mask.unsqueeze(1).expand(-1, seq_len_p, -1)  # batch, p_seq_len, h_seq_len
        _premise_mask = premise_mask.unsqueeze(2).expand(-1, -1, seq_len_h)  # batch, p_seq_len, h_seq_len
        # print(premise_mask.size())

        _encode_premise = encode_premise.unsqueeze(2).expand(-1, -1, seq_len_h, -1)
        _encode_hypothesis = encode_hypothesis.unsqueeze(1).expand(-1, seq_len_p, -1, -1)
        # print(_encode_premise.size())

        p_h = torch.cat([_encode_premise, _encode_hypothesis,
                   _encode_premise - _encode_hypothesis,
                   _encode_premise * _encode_hypothesis], dim=-1)  # batch, seq_len1, seq_len2, 4*2*hidden_size

        p_h = self._trans(p_h).squeeze(-1)  # batch, seq_len1, seq_len2
        # print(p_h.size())

        similarity_matrix_hyp = p_h + (-999999 * (_hypothesis_mask == 0).float())
        similarity_matrix_pre = p_h + (-999999 * (_premise_mask == 0).float())
        # softmax attention weight

        attention_a = F.softmax(similarity_matrix_pre, dim=2)  # batch, p_seq_len, h_seq_len
        attention_b = F.softmax(similarity_matrix_hyp, dim=1)  # batch,

        attended_premise = torch.bmm(attention_a, encode_hypothesis)  # batch, p_seq_len, hidden_size
        attended_hypothesis = torch.bmm(attention_b.transpose(1, 2), encode_premise)  # batch, h_seq_len, hidden_size

        # the enhancement layer
        # (batch, seq_len, 2*4*hidden_size)
        premise_enhanced = torch.cat([encode_premise, attended_premise,
                                      encode_premise - attended_premise,
                                      encode_premise * attended_premise], dim=-1)
        hypothesis_enhanced = torch.cat([encode_hypothesis, attended_hypothesis,
                                         encode_hypothesis - attended_hypothesis,
                                         encode_hypothesis * attended_hypothesis], dim=-1)
        # (batch, seq_len, hidden_size)
        projected_enhanced_premise = self._projection(premise_enhanced)
        projected_enhanced_hypothesis = self._projection(hypothesis_enhanced)

        # (batch, seq_len, 2*hidden_size)
        # premise = self.pair_encoder(projected_enhanced_premise, projected_enhanced_hypothesis, hypothesis_mask)
        # hypothesis = self.pair_encoder(projected_enhanced_hypothesis, projected_enhanced_premise, premise_mask)
        projected_enhanced_premise = self._rnn_dropout(projected_enhanced_premise)
        projected_enhanced_hypothesis = self._rnn_dropout(projected_enhanced_hypothesis)

        premise = self._composition(projected_enhanced_premise, premises_lengths)
        hypothesis = self._composition(projected_enhanced_hypothesis, hypothesis_lengths)
        # batch, seq_len, 2*hidden_size
        # premise = self.mulhead_attention(premise.transpose(1, 2), premise_mask).transpose(1, 2)
        # hypothesis = self.mulhead_attention(hypothesis.transpose(1, 2), hypothesis_mask).transpose(1, 2)
        # premise,_ = self.average_attention(premise, mask=premise_mask)
        # hypothesis,_ = self.average_attention(hypothesis, hypothesis_mask)

        if self.args.use_char_emb:
            cpremises_indices = inputs[2]
            chypothesis_indices = inputs[3]
            # batch, 1
            cpremises_lengths = torch.sum(cpremises_indices != 0, dim=-1)
            chypothesis_lengths = torch.sum(chypothesis_indices != 0, dim=-1)
            # batch, seq_len
            cpremise_mask = get_mask(cpremises_indices, cpremises_lengths).to(self.args.device)
            chypothesis_mask = get_mask(chypothesis_indices, chypothesis_lengths).to(self.args.device)

            cembed_premise = self.cembed(cpremises_indices)
            cembed_hypothesis = self.cembed(chypothesis_indices)
            # batch, seq_len, embed_dim
            """
            embed_premise = embed_premise.transpose(0, 1)
            embed_hypothesis = embed_hypothesis.transpose(0, 1)
            # seq_len, batch
            premise_mask = premise_mask.transpose(0, 1)
            hypothesis_mask = hypothesis_mask.transpose(0, 1)
            """

            cembed_premise = self._rnn_dropout(cembed_premise)
            cembed_hypothesis = self._rnn_dropout(cembed_hypothesis)
            # ----Encoder Layer----
            # (batch, seq_len, 2*hidden_size)
            cencode_premise = self.char_encoder(cembed_premise, cpremises_lengths)
            cencode_hypothesis = self.char_encoder(cembed_hypothesis, chypothesis_lengths)
            # (batch, seq_len, 2*4*hidden_size)
            # Co-Attention Layer
            # cencode_premise,_ = self.caverage_attention(cencode_premise, cpremise_mask)
            # cencode_hypothesis,_ = self.caverage_attention(cencode_hypothesis, chypothesis_mask)

            # cattended_premise, cattended_hypothesis = self._attention(cencode_premise, cpremise_mask,
            #                                                        cencode_hypothesis, chypothesis_mask)
            cseq_len_p = cencode_premise.size(1)
            cseq_len_h = cencode_hypothesis.size(1)

            _chypothesis_mask = chypothesis_mask.unsqueeze(1).expand(-1, cseq_len_p, -1)  # batch, p_seq_len, h_seq_len
            _cpremise_mask = cpremise_mask.unsqueeze(2).expand(-1, -1, cseq_len_h)  # batch, p_seq_len, h_seq_len
            # print(premise_mask.size())

            _cencode_premise = cencode_premise.unsqueeze(2).expand(-1, -1, cseq_len_h, -1)
            _cencode_hypothesis = cencode_hypothesis.unsqueeze(1).expand(-1, cseq_len_p, -1, -1)

            cp_h = torch.cat([_cencode_premise, _cencode_hypothesis,
                             _cencode_premise - _cencode_hypothesis,
                             _cencode_premise * _cencode_hypothesis], dim=-1)  # batch, seq_len1, seq_len2, 4*2*hidden_size

            cp_h = self.c_trans(cp_h).squeeze(-1)  # batch, seq_len1, seq_len2
            # print(cp_h.size())

            csimilarity_matrix_hyp = cp_h + (-999999 * (_chypothesis_mask == 0).float())
            csimilarity_matrix_pre = cp_h + (-999999 * (_cpremise_mask == 0).float())
            # softmax attention weight

            cattention_a = F.softmax(csimilarity_matrix_pre, dim=2)  # batch, p_seq_len, h_seq_len
            cattention_b = F.softmax(csimilarity_matrix_hyp, dim=1)  # batch,

            cattended_premise = torch.bmm(cattention_a, cencode_hypothesis)  # batch, p_seq_len, hidden_size
            cattended_hypothesis = torch.bmm(cattention_b.transpose(1, 2),
                                            cencode_premise)  # batch, h_seq_len, hidden_size

            # the enhancement layer
            # (batch, seq_len, 2*4*hidden_size)
            cpremise_enhanced = torch.cat([cencode_premise, cattended_premise,
                                          cencode_premise - cattended_premise,
                                          cencode_premise * cattended_premise], dim=-1)
            chypothesis_enhanced = torch.cat([cencode_hypothesis, cattended_hypothesis,
                                             cencode_hypothesis - cattended_hypothesis,
                                             cencode_hypothesis * cattended_hypothesis], dim=-1)
            # (batch, seq_len, hidden_size)
            cprojected_enhanced_premise = self.char_projection(cpremise_enhanced)
            cprojected_enhanced_hypothesis = self.char_projection(chypothesis_enhanced)

            # (batch, seq_len, 2*hidden_size)
            # cpremise = self.char_pair_encoder(cprojected_enhanced_premise, cprojected_enhanced_hypothesis, chypothesis_mask)
            # chypothesis = self.char_pair_encoder(cprojected_enhanced_hypothesis, cprojected_enhanced_premise, cpremise_mask)
            cprojected_enhanced_premise = self._rnn_dropout(cprojected_enhanced_premise)
            cprojected_enhanced_hypothesis = self._rnn_dropout(cprojected_enhanced_hypothesis)

            cpremise = self._char_composition(cprojected_enhanced_premise, cpremises_lengths)
            chypothesis = self._char_composition(cprojected_enhanced_hypothesis, chypothesis_lengths)

            # cpremise = self.cmulhead_attention(cpremise.transpose(1, 2), cpremise_mask).transpose(1, 2)
            # chypothesis = self.cmulhead_attention(chypothesis.transpose(1, 2), chypothesis_mask).transpose(1, 2)

            # cpremise,_ = self.average_attention(cpremise, cpremise_mask)
            # chypothesis,_ = self.average_attention(chypothesis, chypothesis_mask)

            cpremise_avg = torch.sum(cpremise * cpremise_mask.unsqueeze(1).transpose(2, 1), dim=1) / torch.sum(cpremise_mask,
                                                                                                        dim=1,
                                                                                                        keepdim=True)
            chypothesis_avg = torch.sum(chypothesis * chypothesis_mask.unsqueeze(1).
                                   transpose(2, 1), dim=1) / torch.sum(chypothesis_mask, dim=1, keepdim=True)

            cpremise_max, _ = max_pooling(cpremise, cpremise_mask, dim=1)
            chypothesis_max, _ = max_pooling(chypothesis, chypothesis_mask, dim=1)

            # batch, 2*2*hidden
            c_premise_max_avg = torch.cat([cpremise_avg-cpremise_max, cpremise_avg*cpremise_max], dim=1)
            c_hypothesis_max_avg = torch.cat([chypothesis_avg-chypothesis_max, chypothesis_avg*chypothesis_max], dim=1)


        # premise = self.self_match_encoder(premise, premise, premise_mask)
        # hypothesis = self.self_match_encoder(hypothesis, hypothesis, hypothesis_mask)

        premise_avg = torch.sum(premise*premise_mask.unsqueeze(1).transpose(2, 1), dim=1) / torch.sum(premise_mask, dim=1, keepdim=True)
        hypothesis_avg = torch.sum(hypothesis*hypothesis_mask.unsqueeze(1).
                                   transpose(2, 1),dim=1) / torch.sum(hypothesis_mask, dim=1, keepdim=True)

        premise_max, _ = max_pooling(premise, premise_mask, dim=1)
        hypothesis_max, _ = max_pooling(hypothesis, hypothesis_mask, dim=1)


        premise_avg_max = torch.cat([premise_avg-premise_max, premise_avg*premise_max], dim=1)
        hypothesis_avg_max = torch.cat([hypothesis_avg-hypothesis_max, hypothesis_avg*hypothesis_max], dim=1)

        # note: the c* character-level features above only exist when
        # self.args.use_char_emb is True, so this concatenation assumes it is enabled
        v = torch.cat([premise_avg, premise_max, hypothesis_avg, hypothesis_max,
                       cpremise_avg, cpremise_max, chypothesis_avg, chypothesis_max], dim=1)
        logits = self._classification(v)

        return logits
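
get_mask turns padded index sequences into a float mask; since the padding index is 0, the lengths argument is not strictly needed for this sketch (an assumption, the project's helper may build the mask from the lengths instead).

import torch

def get_mask(sequences_batch, sequences_lengths):
    # sequences_batch: [batch, seq_len] of token ids, 0 marks padding
    # returns a float mask of the same shape with 1.0 at real tokens
    return (sequences_batch != 0).float()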
Example #7
    def __init__(self, batch_size, quest_len, answer_len, embeddings, embedding_size, rnn_size, num_rnn_layers, max_grad_norm, loss_ratio, l2_reg_lambda=0.0, adjust_weight=False, label_weight=[], is_training=True, m=0.1):
        # define input variable
        self.batch_size = batch_size
        self.embeddings = embeddings
        self.embedding_size = embedding_size
        self.adjust_weight = adjust_weight
        self.label_weight = label_weight
        self.rnn_size = rnn_size
        self.num_rnn_layers = num_rnn_layers
        self.quest_len = quest_len 
        self.answer_len = answer_len 
        self.max_grad_norm = max_grad_norm
        self.l2_reg_lambda = l2_reg_lambda
        self.is_training = is_training

        self.keep_prob = tf.placeholder(tf.float32, name="keep_drop")
        
        self.lr = tf.Variable(0.0, trainable=False)
        self.new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
        self._lr_update = tf.assign(self.lr, self.new_lr)

        self.ori_input_quests = tf.placeholder(tf.int32, shape=[None, self.quest_len], name="ori_quest")
        self.cand_input_quests = tf.placeholder(tf.int32, shape=[None, self.answer_len], name="cand_quest")
        self.neg_input_quests = tf.placeholder(tf.int32, shape=[None, self.answer_len], name="neg_quest")
        self.test_input_q = tf.placeholder(tf.int32, shape=[None, self.quest_len], name="test_input_q")
        self.test_input_a = tf.placeholder(tf.int32, shape=[None, self.answer_len], name="test_input_a")
        self.cat_ids = tf.placeholder(tf.int32, [None, CAT_NUMBER], name='cat_ids')

        #embedding layer
        with tf.device("/cpu:0"),tf.name_scope("embedding_layer"):
            W = tf.Variable(tf.to_float(self.embeddings), trainable=True, name="W")
            ori_quests =tf.nn.embedding_lookup(W, self.ori_input_quests)
            cand_quests =tf.nn.embedding_lookup(W, self.cand_input_quests)
            neg_quests =tf.nn.embedding_lookup(W, self.neg_input_quests)
            test_quest =tf.nn.embedding_lookup(W, self.test_input_q)
            test_answer =tf.nn.embedding_lookup(W, self.test_input_a)

        #ori_quests = tf.nn.dropout(ori_quests, self.keep_prob)
        #cand_quests = tf.nn.dropout(cand_quests, self.keep_prob)
        #neg_quests = tf.nn.dropout(neg_quests, self.keep_prob)


        #build LSTM network
        with tf.variable_scope("LSTM_scope", reuse=None):
            ori_q = biLSTM(ori_quests, self.rnn_size)
        with tf.variable_scope("LSTM_scope", reuse=True):
            cand_a = biLSTM(cand_quests, self.rnn_size)
            neg_a = biLSTM(neg_quests, self.rnn_size)
            test_q = biLSTM(test_quest, self.rnn_size)
            test_a = biLSTM(test_answer, self.rnn_size)

        #----------------------------- cal attention -------------------------------
        with tf.variable_scope("attention", reuse=None) as scope:
            U = tf.get_variable("U", [2 * self.rnn_size, 2 * rnn_size], initializer=tf.truncated_normal_initializer(stddev=0.1))
            G = tf.matmul(tf.matmul(ori_q, tf.tile(tf.expand_dims(U, 0), [batch_size, 1, 1])), cand_a, adjoint_b=True)
            delta_q = tf.nn.softmax(tf.reduce_max(G, 2))
            delta_a = tf.nn.softmax(tf.reduce_max(G, 1))
            neg_G = tf.matmul(tf.matmul(ori_q, tf.tile(tf.expand_dims(U, 0), [batch_size, 1, 1])), neg_a, adjoint_b=True)
            delta_neg_q = tf.nn.softmax(tf.reduce_max(neg_G, 2))
            delta_neg_a = tf.nn.softmax(tf.reduce_max(neg_G, 1))
        with tf.variable_scope("attention", reuse=True) as scope:
            test_G = tf.matmul(tf.matmul(test_q, tf.tile(tf.expand_dims(U, 0), [batch_size, 1, 1])), test_a, adjoint_b=True)
            delta_test_q = tf.nn.softmax(tf.reduce_max(test_G, 2))
            delta_test_a = tf.nn.softmax(tf.reduce_max(test_G, 1))

        #-------------------------- recalculate lstm output -------------------------
        #ori_q_feat = tf.squeeze(tf.matmul(ori_q, tf.reshape(delta_q, [-1, self.quest_len, 1]), adjoint_a=True))
        #cand_q_feat = tf.squeeze(tf.matmul(cand_a, tf.reshape(delta_a, [-1, self.answer_len, 1]), adjoint_a=True))
        #neg_ori_q_feat = tf.squeeze(tf.matmul(ori_q, tf.reshape(delta_neg_q, [-1, self.quest_len, 1]), adjoint_a=True))
        #neg_q_feat = tf.squeeze(tf.matmul(neg_a, tf.reshape(delta_neg_a, [-1, self.answer_len, 1]), adjoint_a=True))
        #test_q_out = tf.squeeze(tf.matmul(test_q, tf.reshape(delta_test_q, [-1, self.quest_len, 1]), adjoint_a=True))
        #test_a_out = tf.squeeze(tf.matmul(test_a, tf.reshape(delta_test_a, [-1, self.answer_len, 1]), adjoint_a=True))
        ori_q_feat = max_pooling(tf.multiply(ori_q, tf.reshape(delta_q, [-1, self.quest_len, 1])))
        cand_q_feat = max_pooling(tf.multiply(cand_a, tf.reshape(delta_a, [-1, self.answer_len, 1])))
        neg_ori_q_feat = max_pooling(tf.multiply(ori_q, tf.reshape(delta_neg_q, [-1, self.quest_len, 1])))
        neg_q_feat = max_pooling(tf.multiply(neg_a, tf.reshape(delta_neg_a, [-1, self.answer_len, 1])))
        test_q_out = max_pooling(tf.multiply(test_q, tf.reshape(delta_test_q, [-1, self.quest_len, 1])))
        test_a_out = max_pooling(tf.multiply(test_a, tf.reshape(delta_test_a, [-1, self.answer_len, 1])))

        #-------------------------- recalculate lstm output end ---------------------
        # dropout
        #self.out_ori = tf.nn.dropout(self.out_ori, self.keep_prob)
        #self.out_cand = tf.nn.dropout(self.out_cand, self.keep_prob)
        #self.out_neg = tf.nn.dropout(self.out_neg, self.keep_prob)

        # multitasking
        with tf.name_scope("multitasking"):
            feature_size = int(ori_q_feat.get_shape()[1])

            fc1 = tf.layers.dense(ori_q_feat, feature_size * 2, activation=tf.nn.relu, name='fc1')
            fc2 = tf.layers.dense(fc1, feature_size, activation=tf.nn.relu, name='fc2')
            logits = tf.layers.dense(fc2, CAT_NUMBER, activation=tf.nn.sigmoid)

            # feature_size = int(ori_q_feat.get_shape()[1])

            # w = tf.get_variable(name='weights', shape=(feature_size, CAT_NUMBER, initializer=tf.random_normal_initializer())
            # b = tf.get_variable(name='bias', shape=(1, CAT_NUMBER), initializer=tf.zeros_initializer())

            # positive_qa = tf.concat([out_ori,out_cand],1,name="embedding_for_multitask")

            # logits = tf.matmul(ori_q_feat, w) + b

            entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=self.cat_ids, name='loss')
            loss_multitask = tf.reduce_mean(entropy)

        # acc
        self.ori_cand_score = feature2cos_sim(ori_q_feat, cand_q_feat)
        self.ori_neg_score = feature2cos_sim(ori_q_feat, neg_q_feat)
        loss_origin, self.acc = cal_loss_and_acc(self.ori_cand_score, self.ori_neg_score, m)

        self.loss = loss_origin * (1 - loss_ratio) + loss_multitask * loss_ratio

        self.test_q_a = feature2cos_sim(test_q_out, test_a_out)

        # multitasking_acc
        with tf.name_scope("multi_acc"):
            self.preds = tf.nn.softmax(logits)
            self.correct_preds = tf.equal(tf.argmax(self.preds, 1), tf.argmax(self.cat_ids, 1))
            self.multi_acc = tf.reduce_sum(tf.cast(self.correct_preds, tf.float32))
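
cal_loss_and_acc here takes the two cosine scores and the margin m; under the usual pairwise-ranking reading, it is a hinge loss that asks the positive pair to beat the negative pair by at least m. The sketch below follows that assumption and is not necessarily the project's exact implementation.

import tensorflow as tf

def cal_loss_and_acc(ori_cand, ori_neg, m):
    # ori_cand, ori_neg: [batch] cosine similarities of positive and negative pairs
    zeros = tf.zeros_like(ori_cand)
    losses = tf.maximum(zeros, m - (ori_cand - ori_neg))
    loss = tf.reduce_sum(losses)
    # a pair counts as correct when the margin is already satisfied
    acc = tf.reduce_mean(tf.cast(tf.equal(zeros, losses), tf.float32))
    return loss, acc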
Example #8
    def __init__(self,
                 batch_size,
                 quest_len,
                 answer_len,
                 embeddings,
                 embedding_size,
                 rnn_size,
                 num_rnn_layers,
                 max_grad_norm,
                 l2_reg_lambda=0.0,
                 adjust_weight=False,
                 label_weight=[],
                 is_training=True):
        # define input variable
        self.batch_size = batch_size
        self.embeddings = embeddings
        self.embedding_size = embedding_size
        self.adjust_weight = adjust_weight
        self.label_weight = label_weight
        self.rnn_size = rnn_size
        self.num_rnn_layers = num_rnn_layers
        self.quest_len = quest_len
        self.answer_len = answer_len
        self.max_grad_norm = max_grad_norm
        self.l2_reg_lambda = l2_reg_lambda
        self.is_training = is_training

        self.keep_prob = tf.placeholder(tf.float32, name="keep_drop")

        self.lr = tf.Variable(0.0, trainable=False)
        self.new_lr = tf.placeholder(tf.float32,
                                     shape=[],
                                     name="new_learning_rate")
        self._lr_update = tf.assign(self.lr, self.new_lr)

        self.ori_input_quests = tf.placeholder(tf.int32,
                                               shape=[None, self.quest_len],
                                               name="ori_quest")
        self.cand_input_quests = tf.placeholder(tf.int32,
                                                shape=[None, self.answer_len],
                                                name="cand_quest")
        self.neg_input_quests = tf.placeholder(tf.int32,
                                               shape=[None, self.answer_len],
                                               name="neg_quest")
        self.test_input_quests = tf.placeholder(tf.int32,
                                                shape=[None, self.quest_len],
                                                name="test_quest")
        self.test_input_answer = tf.placeholder(tf.int32,
                                                shape=[None, self.answer_len],
                                                name="test_cand_quest")

        #embedding layer
        with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
            W = tf.Variable(tf.to_float(self.embeddings),
                            trainable=True,
                            name="W")
            ori_quests = tf.nn.embedding_lookup(W, self.ori_input_quests)
            cand_quests = tf.nn.embedding_lookup(W, self.cand_input_quests)
            neg_quests = tf.nn.embedding_lookup(W, self.neg_input_quests)
            test_quest = tf.nn.embedding_lookup(W, self.test_input_quests)
            test_answer = tf.nn.embedding_lookup(W, self.test_input_answer)

        #ori_quests = tf.nn.dropout(ori_quests, self.keep_prob)
        #cand_quests = tf.nn.dropout(cand_quests, self.keep_prob)
        #neg_quests = tf.nn.dropout(neg_quests, self.keep_prob)

        #build LSTM network
        with tf.variable_scope("LSTM_scope", reuse=None):
            ori_q = BILSTM(ori_quests, self.rnn_size)
        with tf.variable_scope("LSTM_scope", reuse=True):
            cand_a = BILSTM(cand_quests, self.rnn_size)
            neg_a = BILSTM(neg_quests, self.rnn_size)
            test_q = BILSTM(test_quest, self.rnn_size)
            test_a = BILSTM(test_answer, self.rnn_size)

        #----------------------------- cal attention -------------------------------
        with tf.variable_scope("attention", reuse=None) as scope:
            U = tf.get_variable(
                "U", [2 * self.rnn_size, 2 * rnn_size],
                initializer=tf.truncated_normal_initializer(stddev=0.1))
            G = tf.nn.tanh(
                tf.batch_matmul(tf.batch_matmul(
                    ori_q, tf.tile(tf.expand_dims(U, 0), [batch_size, 1, 1])),
                                cand_a,
                                adj_y=True))
            delta_q = tf.nn.softmax(tf.reduce_max(G, 2))
            delta_a = tf.nn.softmax(tf.reduce_max(G, 1))
            neg_G = tf.nn.tanh(
                tf.batch_matmul(tf.batch_matmul(
                    ori_q, tf.tile(tf.expand_dims(U, 0), [batch_size, 1, 1])),
                                neg_a,
                                adj_y=True))
            delta_neg_q = tf.nn.softmax(tf.reduce_max(neg_G, 2))
            delta_neg_a = tf.nn.softmax(tf.reduce_max(neg_G, 1))
        with tf.variable_scope("attention", reuse=True) as scope:
            test_G = tf.nn.tanh(
                tf.batch_matmul(tf.batch_matmul(
                    test_q, tf.tile(tf.expand_dims(U, 0), [batch_size, 1, 1])),
                                test_a,
                                adj_y=True))
            delta_test_q = tf.nn.softmax(tf.reduce_max(test_G, 2))
            delta_test_a = tf.nn.softmax(tf.reduce_max(test_G, 1))

        #-------------------------- recalculate lstm output -------------------------
        #ori_q_feat = tf.squeeze(tf.batch_matmul(ori_q, tf.reshape(delta_q, [-1, self.quest_len, 1]), adj_x=True))
        #cand_q_feat = tf.squeeze(tf.batch_matmul(cand_a, tf.reshape(delta_a, [-1, self.answer_len, 1]), adj_x=True))
        #neg_ori_q_feat = tf.squeeze(tf.batch_matmul(ori_q, tf.reshape(delta_neg_q, [-1, self.quest_len, 1]), adj_x=True))
        #neg_q_feat = tf.squeeze(tf.batch_matmul(neg_a, tf.reshape(delta_neg_a, [-1, self.answer_len, 1]), adj_x=True))
        #test_q_feat = tf.squeeze(tf.batch_matmul(test_q, tf.reshape(delta_test_q, [-1, self.quest_len, 1]), adj_x=True))
        #test_a_feat = tf.squeeze(tf.batch_matmul(test_a, tf.reshape(delta_test_a, [-1, self.answer_len, 1]), adj_x=True))
        ori_q_feat = max_pooling(
            tf.mul(ori_q, tf.reshape(delta_q, [-1, self.quest_len, 1])))
        cand_q_feat = max_pooling(
            tf.mul(cand_a, tf.reshape(delta_a, [-1, self.answer_len, 1])))
        neg_ori_q_feat = max_pooling(
            tf.mul(ori_q, tf.reshape(delta_neg_q, [-1, self.quest_len, 1])))
        neg_q_feat = max_pooling(
            tf.mul(neg_a, tf.reshape(delta_neg_a, [-1, self.answer_len, 1])))
        test_q_feat = max_pooling(
            tf.mul(test_q, tf.reshape(delta_test_q, [-1, self.quest_len, 1])))
        test_a_feat = max_pooling(
            tf.mul(test_a, tf.reshape(delta_test_a, [-1, self.answer_len, 1])))

        #-------------------------- recalculate lstm output end ---------------------
        # dropout
        #self.out_ori = tf.nn.dropout(self.out_ori, self.keep_prob)
        #self.out_cand = tf.nn.dropout(self.out_cand, self.keep_prob)
        #self.out_neg = tf.nn.dropout(self.out_neg, self.keep_prob)

        # cal cosine simulation
        self.ori_cand = feature2cos_sim(ori_q_feat, cand_q_feat)
        self.ori_neg = feature2cos_sim(neg_ori_q_feat, neg_q_feat)
        self.test_q_a = feature2cos_sim(test_q_feat, test_a_feat)
        self.loss, self.acc = cal_loss_and_acc(self.ori_cand, self.ori_neg)
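
This variant still uses the pre-1.0 TensorFlow API (tf.batch_matmul with adj_y=True, tf.mul). On TensorFlow 1.x the same graph is written with tf.matmul(..., adjoint_b=True) and tf.multiply, as Example #7 does; a minimal illustration of the equivalence, with assumed shapes only:

import tensorflow as tf

a = tf.placeholder(tf.float32, [None, 4, 6])
b = tf.placeholder(tf.float32, [None, 5, 6])
# tf.batch_matmul(a, b, adj_y=True)  ->  tf.matmul(a, b, adjoint_b=True)
g = tf.matmul(a, b, adjoint_b=True)   # [None, 4, 5]
# tf.mul(x, y)                        ->  tf.multiply(x, y)
weighted = tf.multiply(a, tf.ones_like(a))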
Example #9
    def __init__(self,
                 batch_size,
                 num_unroll_steps,
                 embeddings,
                 embedding_size,
                 rnn_size,
                 num_rnn_layers,
                 max_grad_norm,
                 attention_matrix_size,
                 loss_ratio,
                 l2_reg_lambda=0.0,
                 adjust_weight=False,
                 label_weight=[],
                 is_training=True,
                 m=0.1):
        """
        LSTM-BASED DEEP LEARNING MODELS FOR NON-FACTOID ANSWER SELECTION
        """
        # define input variable
        self.batch_size = batch_size
        self.embeddings = embeddings
        self.embedding_size = embedding_size
        self.adjust_weight = adjust_weight
        self.label_weight = label_weight
        self.rnn_size = rnn_size
        self.num_rnn_layers = num_rnn_layers
        self.num_unroll_steps = num_unroll_steps
        self.max_grad_norm = max_grad_norm
        self.l2_reg_lambda = l2_reg_lambda
        self.is_training = is_training

        self.keep_prob = tf.placeholder(tf.float32, name="keep_drop")

        self.lr = tf.Variable(0.0, trainable=False)
        self.new_lr = tf.placeholder(tf.float32,
                                     shape=[],
                                     name="new_learning_rate")
        self._lr_update = tf.assign(self.lr, self.new_lr)

        self.ori_input_quests = tf.placeholder(
            tf.int32, shape=[None, self.num_unroll_steps])
        self.cand_input_quests = tf.placeholder(
            tf.int32, shape=[None, self.num_unroll_steps])
        self.neg_input_quests = tf.placeholder(
            tf.int32, shape=[None, self.num_unroll_steps])
        self.test_input_q = tf.placeholder(tf.int32,
                                           shape=[None, self.num_unroll_steps])
        self.test_input_a = tf.placeholder(tf.int32,
                                           shape=[None, self.num_unroll_steps])
        self.cat_ids = tf.placeholder(tf.int32, [None, CAT_NUMBER],
                                      name='cat_ids')

        #embedding layer
        with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
            W = tf.Variable(tf.to_float(self.embeddings),
                            trainable=True,
                            name="W")
            ori_quests = tf.nn.embedding_lookup(W, self.ori_input_quests)
            cand_quests = tf.nn.embedding_lookup(W, self.cand_input_quests)
            neg_quests = tf.nn.embedding_lookup(W, self.neg_input_quests)
            test_q = tf.nn.embedding_lookup(W, self.test_input_q)
            test_a = tf.nn.embedding_lookup(W, self.test_input_a)

        #build LSTM network
        U = tf.Variable(tf.truncated_normal(
            [2 * self.rnn_size, self.embedding_size], stddev=0.1),
                        name="U")
        with tf.variable_scope("LSTM_scope", reuse=None):
            ori_q = biLSTM(ori_quests, self.rnn_size)
            ori_q_feat = tf.nn.tanh(max_pooling(ori_q))
        with tf.variable_scope("LSTM_scope", reuse=True):
            cand_att_weight = tf.sigmoid(
                tf.matmul(
                    cand_quests,
                    tf.reshape(tf.expand_dims(tf.matmul(ori_q_feat, U), 1),
                               [-1, self.embedding_size, 1])))
            neg_att_weight = tf.sigmoid(
                tf.matmul(
                    neg_quests,
                    tf.reshape(tf.expand_dims(tf.matmul(ori_q_feat, U), 1),
                               [-1, self.embedding_size, 1])))
            cand_a = biLSTM(
                tf.multiply(
                    cand_quests,
                    tf.tile(cand_att_weight, [1, 1, self.embedding_size])),
                self.rnn_size)
            neg_a = biLSTM(
                tf.multiply(
                    neg_quests,
                    tf.tile(neg_att_weight, [1, 1, self.embedding_size])),
                self.rnn_size)
            cand_q_feat = tf.nn.tanh(max_pooling(cand_a))
            neg_q_feat = tf.nn.tanh(max_pooling(neg_a))
            test_q_out = biLSTM(test_q, self.rnn_size)
            test_q_out = tf.nn.tanh(max_pooling(test_q_out))
            test_att_weight = tf.sigmoid(
                tf.matmul(
                    test_a,
                    tf.reshape(tf.expand_dims(tf.matmul(test_q_out, U), 1),
                               [-1, self.embedding_size, 1])))
            test_a_out = biLSTM(
                tf.multiply(
                    test_a,
                    tf.tile(test_att_weight, [1, 1, self.embedding_size])),
                self.rnn_size)
            test_a_out = tf.nn.tanh(max_pooling(test_a_out))

        # multitasking
        with tf.name_scope("multitasking"):
            feature_size = int(ori_q_feat.get_shape()[1])

            fc1 = tf.layers.dense(ori_q_feat,
                                  feature_size * 2,
                                  activation=tf.nn.relu,
                                  name='fc1')
            fc2 = tf.layers.dense(fc1,
                                  feature_size,
                                  activation=tf.nn.relu,
                                  name='fc2')
            logits = tf.layers.dense(fc2, CAT_NUMBER, activation=tf.nn.sigmoid)

            # feature_size = int(ori_q_feat.get_shape()[1])

            # w = tf.get_variable(name='weights', shape=(feature_size, CAT_NUMBER, initializer=tf.random_normal_initializer())
            # b = tf.get_variable(name='bias', shape=(1, CAT_NUMBER), initializer=tf.zeros_initializer())

            # positive_qa = tf.concat([out_ori,out_cand],1,name="embedding_for_multitask")

            # logits = tf.matmul(ori_q_feat, w) + b

            entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits, labels=self.cat_ids, name='loss')
            loss_multitask = tf.reduce_mean(entropy)

        # acc
        self.ori_cand_score = feature2cos_sim(ori_q_feat, cand_q_feat)
        self.ori_neg_score = feature2cos_sim(ori_q_feat, neg_q_feat)
        loss_origin, self.acc = cal_loss_and_acc(self.ori_cand_score,
                                                 self.ori_neg_score, m)

        self.loss = loss_origin * (1 -
                                   loss_ratio) + loss_multitask * loss_ratio

        self.test_q_a = feature2cos_sim(test_q_out, test_a_out)

        # multitasking_acc
        with tf.name_scope("multi_acc"):
            self.preds = tf.nn.softmax(logits)
            self.correct_preds = tf.equal(tf.argmax(self.preds, 1),
                                          tf.argmax(self.cat_ids, 1))
            self.multi_acc = tf.reduce_sum(
                tf.cast(self.correct_preds, tf.float32))

    def assign_new_lr(self, session, lr_value):
        session.run(self._lr_update, feed_dict={self.new_lr: lr_value})
Example #10
    def forward(self, inputs, labels=None):
        """
        :param inputs: [bsz, max_seq_leng]
        :param labels: [bsz, num_class]
        :return:
        """
        inputs = inputs.t()
        mask = (inputs > 0).float()
        inputs_len = (inputs > 0).int().sum(dim=0)

        hidden = self.encoder(inputs, mask, inputs_len)

        pool_values = []
        for pool in self.summary_type:
            if pool == 'max':
                val = max_pooling(hidden, mask)
                pool_values.append(val)
            elif pool == 'mean':
                val = mean_pooling(hidden, inputs_len, mask)
                pool_values.append(val)
            elif pool == 'first':
                seq_len, bsz, dim = hidden.size()
                val = hidden[0, :, :].view(bsz, -1).contiguous()
                pool_values.append(val)
            elif pool == 'last':
                seq_len, bsz, dim = hidden.size()
                val = hidden[-1, :, :].view(bsz, -1).contiguous()
                pool_values.append(val)
            elif pool == 'struct_att':
                val, att = self.strut_att(hidden, mask)
                bsz, head_num, dim = val.size()
                val = val.contiguous().view(bsz, -1)
                pool_values.append(val)
            elif pool == 'none':
                pool_values.append(hidden)

        if len(self.summary_type) == 1:
            hidden = pool_values[0]
        else:
            hidden = torch.cat(pool_values, dim=-1).contiguous()

        # [bsz, hid_dim]
        bsz, hid_dim = hidden.size()
        # logits = self.cls(self.dropout(hidden))
        hidden = self.normalize(hidden)
        logits = self.cls(hidden)

        if self.training:
            # Mixup
            indices = torch.randperm(bsz, device=logits.device)
            shuf_labels = torch.index_select(labels, 0, indices)
            shuf_hidden = torch.index_select(hidden, 0, indices)

            if self.mixup_type == 'mixup':
                lam = self.beta_dist.sample(sample_shape=(bsz, 1))
                lam = lam.to(inputs.device)
                lam_x, lam_y = lam, lam

            elif self.mixup_type == 'prior_mix':
                lam_x = self.beta_dist.sample(sample_shape=(bsz,))
                lam_x = lam_x.to(inputs.device)
                lam_y = self.prior_mixup(labels, shuf_labels)
                lam_y = 2. * lam_x * lam_y / (lam_x + lam_y)

            else:
                raise Exception('Unsupported mixup type %s' % self.mixup_type)

            mix_hidden = lam_x * hidden + (1 - lam_x) * shuf_hidden

            if not self.multi_label:
                onehot_label = to_onehot(labels, self.num_class)
                onehot_shuf_label = to_onehot(shuf_labels, self.num_class)
            else:
                onehot_label = labels
                onehot_shuf_label = shuf_labels

            lam_y = lam_y.unsqueeze(-1)
            mix_labels = lam_y * onehot_label + (1 - lam_y) * onehot_shuf_label

            mix_logits = self.cls(mix_hidden)

            return logits, mix_logits, mix_labels

        return logits, hidden
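
In this model the encoder output hidden is time-major ([seq_len, bsz, dim]) and mask is [seq_len, bsz]; plausible time-major pooling helpers consistent with how they are called above are sketched below (assumptions, the project's own utilities may differ).

import torch

def max_pooling(hidden, mask):
    # hidden: [seq_len, bsz, dim], mask: [seq_len, bsz] with 1.0 at real tokens
    masked = hidden.masked_fill((mask == 0).unsqueeze(-1), -1e7)
    return masked.max(dim=0)[0]  # [bsz, dim]

def mean_pooling(hidden, inputs_len, mask):
    # average over the unpadded time steps only
    summed = (hidden * mask.unsqueeze(-1)).sum(dim=0)   # [bsz, dim]
    return summed / inputs_len.unsqueeze(-1).float()    # [bsz, dim]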
Example #11
    def __init__(self,
                 batch_size,
                 num_unroll_steps,
                 embeddings,
                 embedding_size,
                 rnn_size,
                 num_rnn_layers,
                 max_grad_norm,
                 attention_matrix_size,
                 l2_reg_lambda=0.0,
                 adjust_weight=False,
                 label_weight=[],
                 is_training=True):
        """
        LSTM-BASED DEEP LEARNING MODELS FOR NON-FACTOID ANSWER SELECTION
        """
        # define input variable
        self.batch_size = batch_size
        self.embeddings = embeddings
        self.embedding_size = embedding_size
        self.adjust_weight = adjust_weight
        self.label_weight = label_weight
        self.rnn_size = rnn_size
        self.num_rnn_layers = num_rnn_layers
        self.num_unroll_steps = num_unroll_steps
        self.max_grad_norm = max_grad_norm
        self.l2_reg_lambda = l2_reg_lambda
        self.is_training = is_training

        self.keep_prob = tf.placeholder(tf.float32, name="keep_drop")

        self.lr = tf.Variable(0.0, trainable=False)
        self.new_lr = tf.placeholder(tf.float32,
                                     shape=[],
                                     name="new_learning_rate")
        self._lr_update = tf.assign(self.lr, self.new_lr)

        self.ori_input_quests = tf.placeholder(
            tf.int32, shape=[None, self.num_unroll_steps])
        self.cand_input_quests = tf.placeholder(
            tf.int32, shape=[None, self.num_unroll_steps])
        self.neg_input_quests = tf.placeholder(
            tf.int32, shape=[None, self.num_unroll_steps])
        self.test_input_q = tf.placeholder(tf.int32,
                                           shape=[None, self.num_unroll_steps])
        self.test_input_a = tf.placeholder(tf.int32,
                                           shape=[None, self.num_unroll_steps])

        #embedding layer
        with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
            W = tf.Variable(tf.to_float(self.embeddings),
                            trainable=True,
                            name="W")
            ori_quests = tf.nn.embedding_lookup(W, self.ori_input_quests)
            cand_quests = tf.nn.embedding_lookup(W, self.cand_input_quests)
            neg_quests = tf.nn.embedding_lookup(W, self.neg_input_quests)
            test_q = tf.nn.embedding_lookup(W, self.test_input_q)
            test_a = tf.nn.embedding_lookup(W, self.test_input_a)

        #build LSTM network
        U = tf.Variable(tf.truncated_normal(
            [2 * self.rnn_size, self.embedding_size], stddev=0.1),
                        name="U")
        with tf.variable_scope("LSTM_scope", reuse=None):
            ori_q = biLSTM(ori_quests, self.rnn_size)
            ori_q_feat = tf.nn.tanh(max_pooling(ori_q))
        with tf.variable_scope("LSTM_scope", reuse=True):
            cand_att_weight = tf.sigmoid(
                tf.batch_matmul(
                    cand_quests,
                    tf.reshape(
                        tf.expand_dims(tf.batch_matmul(ori_q_feat, U), 1),
                        [-1, self.embedding_size, 1])))
            neg_att_weight = tf.sigmoid(
                tf.batch_matmul(
                    neg_quests,
                    tf.reshape(
                        tf.expand_dims(tf.batch_matmul(ori_q_feat, U), 1),
                        [-1, self.embedding_size, 1])))
            cand_a = biLSTM(
                tf.mul(cand_quests,
                       tf.tile(cand_att_weight, [1, 1, self.embedding_size])),
                self.rnn_size)
            neg_a = biLSTM(
                tf.mul(neg_quests,
                       tf.tile(neg_att_weight, [1, 1, self.embedding_size])),
                self.rnn_size)
            cand_q_feat = tf.nn.tanh(max_pooling(cand_a))
            neg_q_feat = tf.nn.tanh(max_pooling(neg_a))
            test_q_out = biLSTM(test_q, self.rnn_size)
            test_q_out = tf.nn.tanh(max_pooling(test_q_out))
            test_att_weight = tf.sigmoid(
                tf.batch_matmul(
                    test_a,
                    tf.reshape(
                        tf.expand_dims(tf.batch_matmul(test_q_out, U), 1),
                        [-1, self.embedding_size, 1])))
            test_a_out = biLSTM(
                tf.mul(test_a,
                       tf.tile(test_att_weight, [1, 1, self.embedding_size])),
                self.rnn_size)
            test_a_out = tf.nn.tanh(max_pooling(test_a_out))

        self.ori_cand = feature2cos_sim(ori_q_feat, cand_q_feat)
        self.ori_neg = feature2cos_sim(ori_q_feat, neg_q_feat)
        self.loss, self.acc = cal_loss_and_acc(self.ori_cand, self.ori_neg)

        self.test_q_a = feature2cos_sim(test_q_out, test_a_out)