Example #1
0
    def get_char_by_vector(self, vector: np.array):
        """
        Look up the stored char that is most similar to ``vector``.

        Works for both feature encodings and one-hot embeddings; which table
        is searched depends on ``self.__ortho``.

        Parameters
        ----------
        vector
            The numpy array representing the char to look up. When
            ``self.__ortho`` is set only ``vector[0,]`` is compared.

        Returns
        -------
        The key of the table entry with the highest cosine similarity
        (presumably a Char object -- confirm against the table contents).

        Raises
        ------
        ValueError
            If the searched table is empty.
        """
        if self.__ortho:
            # The one-hot table is compared against the first row of the input
            # only; the original author notes that the feature encodings lack
            # this extra leading dimension.
            scores = {
                char: cos_sim(embedding, vector[0,])
                for char, embedding in self.__char_embeddings.items()
            }
        else:
            scores = {
                char: cos_sim(vector, features)
                for char, features in self.__dict.items()
            }

        # First entry wins on ties, following the table's iteration order.
        best_char, _ = max(scores.items(), key=lambda pair: pair[1])
        return best_char
Example #2
0
def validate(valid_dataset, model, pool_size, pad_index):
    """
    Simple retrieval validation inside fixed-size code pools.

    The dataset is encoded batch by batch (the batch size is read from the
    module-level ``batch_size``), then cut into consecutive pools of
    ``pool_size`` (description, code) pairs. Inside each pool every
    description is scored against every code vector with ``cos_sim`` and the
    i-th code is treated as the expected match of the i-th description.

    @param valid_dataset: dataset handed to ``get_batch``
    @param model: model exposing ``eval``, ``code_encode`` and ``desc_encode``
    @param pool_size: number of pairs per pool; a trailing incomplete pool is
        skipped. NOTE(review): earlier documentation mentioned ``-1`` meaning
        "use the whole set" -- that case is not implemented here.
    @param pad_index: padding index forwarded to ``get_batch``
    @return: tuple ``(mean ACC, mean MRR, mean MAP, mean NDCG)``
    """

    model.eval()
    processd_num = 0  # record the number of processed data
    accs, mrrs, maps, ndcgs = [], [], [], []
    code_reprs, desc_reprs = [], []
    # NOTE(review): the strict "<" with "- batch_size" leaves the final batch
    # of the dataset unprocessed; kept as in the original.
    while processd_num < len(valid_dataset)-batch_size:
        # batch:code_tokens, code_tokens_len, ast_seq, ast_seq_len, desc_pos, desc_pos_len, desc_neg, desc_neg_len
        batch = get_batch(valid_dataset, processd_num, batch_size, pad_index)
        processd_num += batch_size

        code_batch = batch[:4]
        desc_batch = batch[4:6]

        with torch.no_grad():
            code_repr = model.code_encode(*code_batch).data.cpu().numpy().astype(np.float32)
            desc_repr = model.desc_encode(*desc_batch).data.cpu().numpy().astype(np.float32)

        code_reprs.append(code_repr)
        desc_reprs.append(desc_repr)

    code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs)

    assert len(code_reprs) == len(desc_reprs)
    n_samples = len(code_reprs)

    bar = tqdm(range(0, n_samples, pool_size))
    bar.set_description("start valid")
    for k in bar:
        # Stop at the trailing incomplete pool. The bound must be the number
        # of encoded samples: len(bar) is only the number of pools, and
        # comparing against it ended the loop before any pool was scored.
        if k + pool_size > n_samples:
            break
        code_matrix = code_reprs[k:k+pool_size]
        desc_matrix = desc_reprs[k:k+pool_size]
        real = list(range(pool_size))
        sims = cos_sim(desc_matrix, code_matrix)  # use description to search code
        negsim = np.negative(sims)
        # NOTE(review): argpartition with kth=pool_size-1 only guarantees the
        # position of the last element; the remaining order is undefined, so
        # rank-based metrics may need np.argsort here -- confirm intent.
        predict = np.argpartition(negsim, kth=pool_size-1)
        predict = predict[:pool_size]

        for i in range(len(real)):
            accs.append(ACC([real[i]], list(predict[i])))
            mrrs.append(MRR([real[i]], list(predict[i])))
            maps.append(MAP([real[i]], list(predict[i])))
            ndcgs.append(NDCG([real[i]], list(predict[i])))

    return np.mean(accs), np.mean(mrrs), np.mean(maps), np.mean(ndcgs)
Example #3
0
    def get_char_by_embedding(self, vec: np.array):
        """
        Look up the char whose one-hot embedding is most similar to ``vec``.

        Parameters
        ----------
        vec
            The numpy array representing the localist encoding of the char

        Returns
        -------
        The key of the ``__char_embeddings`` entry with the highest cosine
        similarity to ``vec``.

        Raises
        ------
        ValueError
            If no embeddings are stored.
        """
        similarity = {
            char: cos_sim(vec, embedding)
            for char, embedding in self.__char_embeddings.items()
        }
        # First entry wins on ties, following the table's iteration order.
        best_char, _ = max(similarity.items(), key=lambda pair: pair[1])
        return best_char
Example #4
0
    def get_char_by_feature_vector(self, vec: np.array):
        """
        Find the char whose stored feature vector is closest to ``vec``.

        Closeness is measured with cosine similarity.

        Parameters
        ----------
        vec
            The numpy array representing the features of the char

        Returns
        -------
        The key of the ``__dict`` entry with the highest cosine similarity to
        ``vec``.

        Raises
        ------
        ValueError
            If no feature vectors are stored.
        """
        candidates = list(self.__dict.items())
        scores = [cos_sim(vec, features) for _, features in candidates]
        # Index of the best score; the earliest entry wins on ties.
        winner = max(range(len(scores)), key=scores.__getitem__)
        return candidates[winner][0]
Example #5
0
 def _init_similarity(self, user_id, another_user_id):
     """
     Description
         Computes the similarity between two users from three item counts:
         the items each user co-rates with themselves and the items the
         two users co-rate with each other.
     Arguments
         :param user_id: The first user.
         :type user_id: int
         :param another_user_id: The second user.
         :type another_user_id: int
     Returns
         Whatever ``cos_sim`` returns for (co-rated count, first user's
         count, second user's count).
     """
     # Evaluated in this order: first user alone, second user alone, both.
     user_pairs = (
         (user_id, user_id),
         (another_user_id, another_user_id),
         (user_id, another_user_id),
     )
     own_count, other_count, shared_count = [
         len(self.co_rated_between(left, right)) for left, right in user_pairs
     ]
     return cos_sim(shared_count, own_count, other_count)
Example #6
0
def time_augmented_evaluate_model(mode, model, db_gen, l_utt, save_dir, epoch,
                                  l_trial, args, device):
    """
    Score a trial list with averaged embeddings and return the EER.

    Every item yielded by ``db_gen`` is itself an iterable of batches; the
    model outputs of all of them are averaged into one embedding per item.
    The embeddings are paired with ``l_utt`` by position, each trial is
    scored with ``cos_sim`` and the scores are written to
    ``<save_dir>results/<mode>_epoch<epoch>.txt``.

    Parameters
    ----------
    mode : str
        Either ``'val'`` or ``'eval'``; only used in the result file name.
    model
        Called as ``model(x=batch, is_test=True)``; put into eval mode here.
    db_gen
        Sized iterable yielding iterables of batches (each with ``.to``).
    l_utt
        Utterance keys, one per item of ``db_gen``, in the same order.
    save_dir : str
        Prefix of the results path (concatenated as-is).
    epoch
        Used in the result file name.
    l_trial
        Iterable of lines ``"<target> <utt_a> <utt_b>"``.
    args
        Unused in this function.
    device
        Device the batches are moved to.

    Returns
    -------
    The equal error rate found on the ROC curve of the trial scores.

    Raises
    ------
    ValueError
        If ``mode`` is invalid, or if the number of extracted embeddings
        does not match ``len(l_utt)``.
    """
    if mode not in ['val', 'eval']:
        raise ValueError('mode should be either "val" or "eval"')
    model.eval()
    with torch.set_grad_enabled(False):
        # 1st, extract speaker embeddings.
        l_embeddings = []
        with tqdm(total=len(db_gen), ncols=70) as pbar:
            for m_batch in db_gen:
                l_code = []
                for batch in m_batch:
                    batch = batch.to(device)
                    code = model(x=batch, is_test=True)
                    l_code.extend(code.cpu().numpy())
                # one embedding per item: mean over all outputs of its batches
                l_embeddings.append(np.mean(l_code, axis=0))
                pbar.update(1)
        if len(l_utt) != len(l_embeddings):
            # Fail loudly instead of exit(), which ends the process with a
            # success status and hides the mismatch from calling scripts.
            raise ValueError(
                'number of utterances ({}) does not match number of '
                'embeddings ({})'.format(len(l_utt), len(l_embeddings)))
        d_embeddings = dict(zip(l_utt, l_embeddings))

        # 2nd, calculate EER
        y_score = []  # score for each sample
        y = []  # label for each sample
        # The context manager closes the file even when a trial line is
        # malformed or references an unknown utterance.
        with open(save_dir + 'results/{}_epoch{}.txt'.format(mode, epoch),
                  'w') as f_res:
            for line in l_trial:
                trg, utt_a, utt_b = line.strip().split(' ')
                y.append(int(trg))
                y_score.append(
                    cos_sim(d_embeddings[utt_a], d_embeddings[utt_b]))
                f_res.write('{score} {target}\n'.format(score=y_score[-1],
                                                        target=y[-1]))
        fpr, tpr, _ = roc_curve(y, y_score, pos_label=1)
        # Build the interpolator once; brentq evaluates the lambda many times.
        tpr_at = interp1d(fpr, tpr)
        eer = brentq(lambda x: 1. - x - tpr_at(x), 0., 1.)
    return eer
Example #7
0
    def forward(self, src_context_output, src_word_output, src_word_len, 
                KG_word_output, KG_word_len, KG_word_seq, tgt_word_input, combine_knowledge = False):
        """Decode the target sequence step by step and return per-step log scores.

        At every target position the decoder attends over the source words,
        then over the KG words (queried with last hidden state + source
        attention), feeds both attention results together with the target
        word embedding into the RNN cell and turns the RNN output into a
        vocabulary distribution. With ``combine_knowledge`` the distribution
        is mixed with a copy distribution scattered from the KG attention
        scores.

        Args:
            src_context_output (FloatTensor) : (batch_size, context_size)
            src_word_output (FloatTensor)    : (batch_size, src_max_word_len, word_size)
            src_word_len (LongTensor)        : (batch_size)
            KG_word_output (FloatTensor)     : (batch_size, KG_max_word_len, KG_word_size)
            KG_word_len (LongTensor)         : (batch_size)
            KG_word_seq (LongTensor)         : documented by the author as
                (batch_size, src_max_word_len); it is used as the scatter
                index for the KG attention scores, so presumably
                (batch_size, KG_max_word_len) -- TODO confirm
            tgt_word_input (LongTensor)      : (batch_size, tgt_word_len)
            combine_knowledge (bool)         : when ``== True``, mix the
                generated distribution with the KG copy distribution
        Returns:
            logit (FloatTensor)              : (batch_size, tgt_word_len, num_vocab),
                the element-wise ``log`` of the per-step word distribution

        NOTE(review): an earlier version of this docstring also listed a
        ``coverage`` return value. A coverage score is still computed per
        step below, but it is neither collected nor returned.
        """
        # all inputs must share the same batch dimension
        assert src_context_output.size(0) == src_word_output.size(0)
        assert src_context_output.size(0) == tgt_word_input.size(0)
        assert src_context_output.size(0) == KG_word_output.size(0)  
        batch_size = src_context_output.size(0)
        max_src_len = src_word_output.size(1)
        max_KG_len = KG_word_output.size(1)
        max_tgt_len = tgt_word_input.size(1)

        # prepare the source word and KG word outputs
        # src_word_output : [src_word_len, batch_size, src_word_size]
        src_word_output = src_word_output.permute(1, 0, 2)
        # src_word_mask : [max_src_len, batch_size]
        src_word_mask = generate_mask_by_length(src_word_len, max_src_len) 
        # KG_word_output : [KG_word_len, batch_size, KG_word_size]
        KG_word_output = KG_word_output.permute(1, 0, 2)
        # KG_word_mask : [max_KG_len, batch_size]
        KG_word_mask = generate_mask_by_length(KG_word_len, max_KG_len) 

        # obtain word embedding and initial hidden states
        # tgt_word_emb : [batch_size, tgt_word_len, emb_size]
        tgt_word_emb = self.word_embedding(tgt_word_input)
        # hidden : [batch_size, num_layer, rnn_size]
        hidden = self.context2hidden(src_context_output).view(batch_size, self.num_layers, self.rnn_size)
        # hidden : [num_layer, batch_size, rnn_size]
        hidden = hidden.permute(1, 0, 2)
         
        logit_word_list = []
        # NOTE(review): coverage_list is never appended to or read afterwards
        coverage_list = []
        for word_index in range(max_tgt_len):
            # recurrence
            # last_hidden : [batch_size, rnn_size] (top layer of the previous step)
            last_hidden = hidden[-1]
            # attn_src_word : [batch_size, src_word_size] 
            attn_src_word,_ = self.attention(last_hidden, src_word_output, src_word_output, src_word_mask)
            # attn_KG_word : [batch_size, src_KG_size] 
            # attn_KG_scores : [max_KG_len ,batch_size]
            # the KG attention query adds the source attention result to the
            # hidden state, so both must have the same size
            KG_attention_query = last_hidden + attn_src_word
            attn_KG_word, attn_KG_scores = self.attention(KG_attention_query, KG_word_output, KG_word_output, KG_word_mask)
            # rnn_inputs : [batch_size, emb_size + src_word_size + KG_word_size]
            rnn_inputs = torch.cat([tgt_word_emb[:,word_index], attn_src_word, attn_KG_word], dim = 1)
            # rnn_output : [batch_size, rnn_size] ; hidden : [num_layer, batch_size, rnn_size]
            rnn_output, hidden = self.rnn_cell(rnn_inputs, hidden)
            # prob_word : [batch_size, num_vocab]
            # (presumably already a probability distribution, since its log is
            # taken below -- confirm against self.output)
            prob_word = self.output(rnn_output)
            if combine_knowledge == True:
                # copy_prob : [batch_size, num_vocab]
                # accumulate each KG position's attention score onto the
                # vocabulary id found at that position in KG_word_seq
                copy_prob = torch.zeros(batch_size, self.num_vocab, device = src_context_output.device)
                copy_prob = torch.scatter_add(input = copy_prob, 
                                              dim = 1, 
                                              index = KG_word_seq, src = attn_KG_scores.permute(1, 0))
                # gen_dist_trans_input : [batch_size, emb_size + KG_rnn_size + rnn_size]
                gen_dist_trans_input = torch.cat([tgt_word_emb[:, word_index], attn_KG_word, rnn_output], dim = 1)
                # gen_dist: sigmoid gate weighting generation against copying
                gen_dist = self.gen_dist_trans(gen_dist_trans_input)
                gen_dist = torch.sigmoid(gen_dist)
                # combined_prob_word: [batch_size, num_vocab]
                combined_prob_word = prob_word * gen_dist + (1 - gen_dist) * copy_prob
            else:
                combined_prob_word = prob_word
            # logit_word : [batch_size, num_vocab] (log of the word distribution)
            logit_word = combined_prob_word.log()
            # coverage_score : [batch_size]
            # NOTE(review): similarity between previous and new top-layer
            # hidden state; the result is discarded (not stored, not returned)
            coverage_score = cos_sim(last_hidden, hidden[-1])
            coverage_score = F.relu(coverage_score)
            
            logit_word_list.append(logit_word)
        # logit : [batch_size, max_tgt_len, num_vocab]
        logit = torch.stack(logit_word_list, dim = 1)

        return logit
Example #8
0
    def __init__(self, batch_size, seq_len, embeddings, char_embeddings,
                 embedding_size, filter_size, num_filters, num_features,
                 num_layers, rnn_size=100, unknown_id=7447, num_classes=2,
                 l2_reg_lambda=4e-4, model_type="ABCNN3", adjust_weight=False,
                 label_weight=None, is_training=True):
        """Build the graph of a CNN-based sentence-pair classifier.

        Both inputs are embedded (word ids and char ids concatenated along the
        last axis), encoded by ``CNN`` with variables shared through the
        ``"cnn"`` scope, and the two encodings plus the hand-crafted
        ``features`` are fed to a linear layer trained with sparse softmax
        cross-entropy.

        The graph uses the pre-1.0 TensorFlow call conventions
        (``tf.concat(axis, values)``, ``tf.pack``, ``tf.to_float``).

        Args:
            batch_size: stored on the instance; not used while building.
            seq_len: length of every input sequence (placeholder width).
            embeddings: initial value of the word embedding matrix ``W``.
            char_embeddings: initial value of the ``char_W`` matrix.
            embedding_size: embedding width; the CNN is given
                ``embedding_size * 2`` because word and char lookups are
                concatenated.
            filter_size: forwarded to ``CNN``.
            num_filters: forwarded to ``CNN``.
            num_features: width of the ``features`` placeholder.
            num_layers: stored on the instance; not used while building.
            rnn_size: stored on the instance; not used while building.
            unknown_id: accepted for compatibility; unused here.
            num_classes: number of output classes.
            l2_reg_lambda: stored on the instance; not used while building.
            model_type: stored on the instance; not used while building.
            adjust_weight: stored on the instance; not used while building.
            label_weight: optional list stored on the instance. Defaults to a
                fresh empty list per instance (a ``[]`` default would be one
                list shared by every instance).
            is_training: stored on the instance; not used while building.
        """
        # define input variable
        self.batch_size = batch_size
        self.seq_len = seq_len
        self.embeddings = embeddings
        self.char_embeddings = char_embeddings
        self.embedding_size = embedding_size
        self.filter_size = filter_size
        self.num_filters = num_filters
        self.num_features = num_features
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.l2_reg_lambda = l2_reg_lambda
        self.model_type = model_type
        self.adjust_weight = adjust_weight
        self.label_weight = [] if label_weight is None else label_weight
        self.is_training = is_training
        self.rnn_size = rnn_size

        self.ori_input_quests = tf.placeholder(tf.int32, shape=[None, self.seq_len], name="ori_input")
        self.cand_input_quests = tf.placeholder(tf.int32, shape=[None, self.seq_len], name="cand_input")
        self.ori_input_quests_char = tf.placeholder(tf.int32, shape=[None, self.seq_len], name="ori_input_var")
        self.cand_input_quests_char = tf.placeholder(tf.int32, shape=[None, self.seq_len], name="cand_input_var")
        self.labels = tf.placeholder(tf.int32, shape=[None], name="labels")
        self.features = tf.placeholder(tf.float32, shape=[None, num_features], name="features")
        self.keep_prob = tf.placeholder(tf.float32, name="keep_drop")

        # learning rate is a non-trainable variable updated through an assign op
        self.new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
        self.lr = tf.Variable(0.0, trainable=False)
        self._lr_update = tf.assign(self.lr, self.new_lr)

        # embedding layer
        with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
            W = tf.Variable(tf.to_float(self.embeddings), trainable=True, name="W")
            char_W = tf.Variable(tf.to_float(self.char_embeddings), trainable=True, name="char_W")

        ori_quests = tf.nn.embedding_lookup(W, self.ori_input_quests)
        cand_quests = tf.nn.embedding_lookup(W, self.cand_input_quests)
        # NOTE(review): the char ids are looked up in W; char_W is created
        # above but never read in this constructor. Confirm whether char_W
        # was intended here before changing it (it alters the trained model).
        ori_quests_char = tf.nn.embedding_lookup(W, self.ori_input_quests_char)
        cand_quests_char = tf.nn.embedding_lookup(W, self.cand_input_quests_char)

        # concatenate word and char lookups along the embedding axis
        ori_emb = tf.concat(2, [ori_quests, ori_quests_char])
        cand_emb = tf.concat(2, [cand_quests, cand_quests_char])

        # add a trailing channel axis for the convolution
        x1_expanded = tf.expand_dims(ori_emb, -1)
        x2_expanded = tf.expand_dims(cand_emb, -1)
        # the second call reuses the variables created by the first one
        with tf.variable_scope("cnn", reuse=None) as scope:
            LO_1 = CNN(x1_expanded, self.seq_len, self.embedding_size*2, self.filter_size, self.num_filters)
        with tf.variable_scope("cnn", reuse=True) as scope:
            RO_1 = CNN(x2_expanded, self.seq_len, self.embedding_size*2, self.filter_size, self.num_filters)

        self.sims = [cos_sim(LO_1, RO_1)]

        with tf.variable_scope("output_layer") as scope:
            # This tensor is replaced by self.estimation further down; the op
            # is still created so the graph stays identical.
            self.output_features = tf.concat(1, [self.features, tf.pack(self.sims, axis=1)], name="output_features")
        self.lstm_features = tf.concat(1, [tf.concat(1, [LO_1, RO_1]), self.features])

        with tf.variable_scope("fully_connected"):
            feature_len = int(self.lstm_features.get_shape()[1])
            softmax_w = tf.get_variable("softmax_w", initializer=tf.truncated_normal([feature_len, self.num_classes], stddev=0.1))
            softmax_b = tf.get_variable("softmax_b", initializer=tf.constant(0., shape=[self.num_classes]))
            self.estimation = tf.matmul(self.lstm_features, softmax_w) + softmax_b
            self.output_features = self.estimation

        with tf.name_scope("loss"):
            self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(self.estimation, self.labels)
            self.cost = tf.reduce_mean(self.loss)
Example #9
0
    ppl = 1
    for i in range(1, length-1):
        input_seq = torch.LongTensor(sentence2id[:i]).view(1, -1)
        target = torch.LongTensor([sentence2id[i+1]])

        with torch.no_grad():
            _, output = model(input_seq)  # 1*5262
            prob = F.softmax(output.squeeze(), dim=-1)

            prob = torch.index_select(prob, 0, target)

        ppl *= (1/prob.item())

    ppl = pow(ppl, 1/(length-1))

    return ppl


if __name__ == '__main__':
    # Demo: encode two space-tokenised sentences and report their similarity.
    # (An alternative demo, sentence_complement('我 喜欢 吃 火'), is disabled.)
    first_sentence = '你 把 衣服 脱 光 了'
    second_sentence = '你 把 衣服 脱 了'

    started_at = time.time()
    first_state = get_hidden_state(first_sentence)
    # elapsed time covers the first encoding only
    elapsed = time.time() - started_at
    print(elapsed)
    second_state = get_hidden_state(second_sentence)

    similarity = cos_sim(first_state, second_state)
    report = '{}\t与\t{}\t的相似度为:{}'
    print(report.format(first_sentence.replace(' ', ''),
                        second_sentence.replace(' ', ''),
                        similarity))
Example #10
0
    for q_id, q_text in total_q_dics.items():
        a = collect_a(str(q_id))
        if a != None:
            QA.append((q_text, a))

    '''
    Qの検索結果とAの検索結果の類似度を計算して,そのスコアをQとタプルにする
    '''
    Q_score = []
    for q_text, a_text in QA:
       #QとAのtf辞書を作成する
       q_dic = tfdic_by_bing(q_text, 50, ["Description"])
       a_dic = tfdic_by_bing(a_text, 50, ["Description"]) 

       #QとAの辞書を類似度を測る
       sim_score = cos_sim(q_dic, a_dic)

       Q_score.append((q_text, sim_score))


    '''
    Q_scoreを降順にソートする
    '''
    sorted_Q = sorted(Q_score, key=lambda x:x[1])
    for q in sorted_Q:
        print(q)