Code example #1
    def find_topics(self, similarity_matrix, distribution_matrix):
        """Main function that hierarchically finds topics"""
        # Preallocate vectors for assessing grouping quality
        if self.do_silhouette:
            # simil_shape = similarity_matrix.shape
            word_to_topics_distance = np.subtract(1.0, np.copy(similarity_matrix))
            tokens_distribution = np.copy(distribution_matrix)

        number_of_topics = len(self.topics)

        max_index = np.unravel_index(np.argmax(similarity_matrix, axis=None), similarity_matrix.shape)
        max_simil = similarity_matrix[max_index]

        while (len(self.topics) > 2) and (max_simil >= self.min_association):
            self.max_simil_history.append(max_simil)

            # Update information about topics
            new_topics = self.topics[max_index[0]] + self.topics[max_index[1]]
            self.topics = delete_indices(self.topics, max_index)
            self.topics = [new_topics] + self.topics
            number_of_topics -= 1

            # Update distribution of new topic
            new_dist = np.copy(distribution_matrix[max_index, :])
            new_dist = np.sum(new_dist, axis=0, keepdims=True)

            # Delete rows or cols associated with merged topics
            distribution_matrix = np.delete(distribution_matrix, max_index, axis=0)
            similarity_matrix = np.delete(similarity_matrix, max_index, axis=0)
            similarity_matrix = np.delete(similarity_matrix, max_index, axis=1)

            # New similarity
            new_simil = cos(new_dist, distribution_matrix)

            # Update matrices
            distribution_matrix = np.concatenate([new_dist, distribution_matrix])
            new_similarity_matrix = np.zeros((number_of_topics, number_of_topics))
            new_similarity_matrix[1:, 1:] = similarity_matrix
            new_similarity_matrix[0, 1:] = new_simil
            new_similarity_matrix[1:, 0] = new_simil
            similarity_matrix = new_similarity_matrix

            # Silhouette algorithm
            if self.do_silhouette and (len(self.topics[0]) > 1):
                word_to_topics_distance = np.delete(word_to_topics_distance, max_index, axis=1)
                new_word_to_topic_distance = np.subtract(1.0, cos(tokens_distribution, new_dist))
                word_to_topics_distance = np.concatenate([new_word_to_topic_distance,
                                                          word_to_topics_distance], axis=1)
                self.silhouette(word_to_topics_distance, tokens_distribution, distribution_matrix)

            # Find new max
            max_index = np.unravel_index(np.argmax(similarity_matrix, axis=None), similarity_matrix.shape)
            max_simil = similarity_matrix[max_index]

        if self.do_silhouette:
            self.topics = self.best_topics
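The snippet relies on a `delete_indices` helper that is not shown. A minimal sketch of what it presumably does (drop the two merged topics, given the index pair returned by `np.unravel_index`) could look like this:

def delete_indices(items, indices):
    """Hypothetical reconstruction: return a copy of `items` without the
    positions listed in `indices` (here, the two merged topic indices)."""
    to_drop = set(indices)
    return [item for position, item in enumerate(items) if position not in to_drop]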
Code example #2
def edge_cos(a, b, x, eps):
    '''Real-valued cosine similarity with fade-out. For `c = (a + b) / 2` it equals `cos_sim(x - c, b - c) / exp(norm(x - c))`. Positive when `x` lies in the same half-space as `b` relative to `c`.'''
    c = (a + b) / 2
    dx = x - c
    cs = cos(dx.reshape(1, -1), (b - c).reshape(1, -1))[0][0]
    val = cs / np.exp(vector_norm(dx))
    return val
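A small usage sketch for `edge_cos`, under the assumption that `cos` is sklearn's `cosine_similarity` and `vector_norm` is the Euclidean norm (neither import appears in the snippet); note that the `eps` argument is accepted but unused in the body shown above:

import numpy as np
from numpy.linalg import norm as vector_norm
from sklearn.metrics.pairwise import cosine_similarity as cos

a = np.array([0.0, 0.0])
b = np.array([2.0, 0.0])
x = np.array([3.0, 1.0])

# x lies on b's side of the midpoint, so the value is positive,
# damped by exp(-distance) from the midpoint
print(edge_cos(a, b, x, eps=1e-8))  # roughly 0.0956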
Code example #3
def topic_attribution(tf_matrix, embeddings, embeddings_vocab, topics, sites):
    embeddings = sp.csr_matrix(np.copy(embeddings))
    tf_matrix = tf_matrix.transpose()
    corpus_embeddings = tf_matrix.dot(embeddings).toarray()
    embeddings_arr = embeddings.toarray()
    for index, topic in enumerate(topics):
        words = topic["word"]
        words_ids = [embeddings_vocab[word] for word in words]
        topic_embedding = np.sum(embeddings_arr[words_ids, :], axis=0, keepdims=True)
        df = pd.DataFrame({'corpus_simil': cos(corpus_embeddings, topic_embedding).flatten().tolist(),
                           'counts': np.array(tf_matrix.sum(axis=1)).flatten().tolist(),
                           'site': sites})
        mean_simil = df.groupby('site').apply(_weighted_mean, 'corpus_simil', 'counts')
        topics[index]['mean_simil'] = {'site': mean_simil.index.values,
                                       'mean_simil': mean_simil.values}
    # # Sites similarity
    # sites_dict = {}
    # for index, element in enumerate(sites):
    #     if element in sites_dict.keys():
    #         sites_dict[element].append(index)
    #     else:
    #         sites_dict[element] = [index]
    #
    # sites_corpus = np.zeros((len(sites_dict.keys()), corpus_embeddings.shape[1]))
    # site_names = []
    # for index, (site, indices) in enumerate(sites_dict.items()):
    #     site_names.append(site)
    #     sites_corpus[index, :] = np.sum(corpus_embeddings[indices, :], axis=0, keepdims=True)
    # sites_simil = cos(sites_corpus)

    return topics
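`_weighted_mean` is referenced but not defined in the snippet; a plausible reconstruction, given how it is called through `groupby(...).apply(...)` with the two column names, is:

import numpy as np

def _weighted_mean(group, value_column, weight_column):
    """Hypothetical helper: counts-weighted mean of `value_column` within one site group."""
    return np.average(group[value_column], weights=group[weight_column])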
Code example #4
    def silhouette(self, tokens_to_topics_distance, tokens_distribution,
                   distribution_matrix):
        """
        Calculation of silhouette value in a particular iteration.
        Function updates inner similarity only for the new topic.

        :param tokens_to_topics_distance numpy array of distance between tokens and topics
        :param tokens_distribution numpy array of tokens embeddings
        :param distribution_matrix numpy array of topics embeddings
        """
        # Outer similarity
        for topic_index, topic in enumerate(self.topics):
            tokens_to_topics = np.copy(tokens_to_topics_distance)
            tokens_to_topics = np.delete(tokens_to_topics, topic_index, axis=1)
            for token_index in topic:
                self.outer_distance[token_index] = np.amin(
                    tokens_to_topics[token_index, :], keepdims=False)

        # Inner similarity - only the topic created in the last iteration is updated
        for token_index in self.topics[0]:
            token_distribution = np.copy(tokens_distribution[token_index, :])
            reference_distribution = np.subtract(
                np.copy(distribution_matrix[0, :]), token_distribution)
            # reference_distribution = np.copy(distribution_matrix[0, :])
            token_distribution = np.reshape(token_distribution,
                                            (-1, token_distribution.shape[0]))
            reference_distribution = np.reshape(
                reference_distribution, (-1, reference_distribution.shape[0]))
            self.inner_distance[token_index] = np.subtract(
                1.0, cos(reference_distribution, token_distribution))

        # Calculate silhouette values and their mean
        silhouette_values = np.zeros((len(self.outer_distance), ))
        selected_indices = np.where((self.inner_distance != 0.0)
                                    & (self.outer_distance != 0.0))
        selected_inner = self.inner_distance[selected_indices]
        selected_outer = self.outer_distance[selected_indices]
        silhouette_values[selected_indices] = np.subtract(
            selected_outer, selected_inner)
        maximum_values = np.maximum(selected_outer, selected_inner)
        silhouette_values[selected_indices] = np.divide(
            silhouette_values[selected_indices], maximum_values)

        # Calculate penalty - number of single tokens * penalty_value (0.1)
        single_tokens = self.tokens_len - (self.tokens_len - len(self.topics))
        penalty = (single_tokens * self.singularity_penalty) / self.tokens_len
        mean_silhouette = np.mean(silhouette_values) + penalty

        # Update best topic and history
        if mean_silhouette > self.max_silhouette:
            self.max_silhouette = mean_silhouette
            self.best_topics = self.topics

        self.silhouette_history.append(mean_silhouette)
Code example #5
def create_embeddings(articles,
                      lambda_statistics,
                      log_lambda_statistics_df,
                      pipeline=False,
                      embedding_size=256):
    """
    This function creates embeddings for each word in data frame.
    
    :param articles data frame with pasted lematizated text
    :param lambda_statistics data frame with lambda statistics and selected words that are 
    important in a particular 'day'
    :param pipeline bool for True function outputs only files needed in next steps of pipeline
    """

    selected_words = list(lambda_statistics["word"])
    corpus = list(articles["text"])

    vectorizer = CountVectorizer(lowercase=False,
                                 tokenizer=space_tokenizer,
                                 min_df=1,
                                 encoding="UTF-8")
    tfidf_matrix = vectorizer.fit_transform(corpus)
    tfidf_matrix = tfidf_matrix.transpose()

    svd = TruncatedSVD(n_components=embedding_size, n_iter=15, random_state=42)
    embeddings = svd.fit_transform(tfidf_matrix)

    # Scale embeddings by log lambda
    lambda_log_array = set_lambda_order(log_lambda_statistics_df,
                                        vectorizer.vocabulary_)
    embeddings = np.multiply(embeddings, lambda_log_array, out=embeddings)

    selected_words.sort()
    selected_words_indices = []
    for word in selected_words:
        try:
            selected_words_indices.append(vectorizer.vocabulary_[word])
        except KeyError:
            print(word)

    selected_embeddings = embeddings[selected_words_indices, :]
    similarity_matrix = cos(selected_embeddings)
    np.fill_diagonal(similarity_matrix, 0)

    distribution_matrix = tfidf_matrix[selected_words_indices, :].todense()

    if pipeline:
        return (similarity_matrix, distribution_matrix, selected_words,
                selected_embeddings, embeddings, vectorizer.vocabulary_)
    else:
        return (similarity_matrix, np.transpose(svd.components_),
                distribution_matrix, selected_words, selected_embeddings)
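`space_tokenizer` and `set_lambda_order` are assumed by the snippet but not shown. A minimal sketch, assuming the statistics frame has 'word' and 'log_lambda' columns (the column names are a guess):

import numpy as np

def space_tokenizer(text):
    """Split already-lemmatized text on single spaces, with no further processing."""
    return text.split(" ")

def set_lambda_order(log_lambda_statistics_df, vocabulary):
    """Return log-lambda weights as an (n_words, 1) column aligned with the
    CountVectorizer vocabulary order, so it broadcasts over the embedding matrix."""
    weights = dict(zip(log_lambda_statistics_df["word"],
                       log_lambda_statistics_df["log_lambda"]))
    ordered = np.ones((len(vocabulary), 1))
    for word, index in vocabulary.items():
        ordered[index, 0] = weights.get(word, 1.0)
    return ordered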
Code example #6
def pairwise(segmentpool, N, threshold):
    result = []
    summs = [[] for i in range(N)]
    for seg in segmentpool:
        segment = segmentpool[seg]
        doc = segment.docid - 1
        summs[doc].append(segment)
    summ_pairs = combinations(summs, 2)
    scores = []

    for summ_pair in summ_pairs:
        for segment in summ_pair[0]:
            # Make sure the vector is a 2-D (1, dim) array so cosine_similarity accepts it
            if isinstance(segment.vec, list):
                segment.vec = np.asarray(segment.vec).reshape(1, -1)
            for seg in summ_pair[1]:
                if isinstance(seg.vec, list):
                    seg.vec = np.asarray(seg.vec).reshape(1, -1)
                sc = cos(segment.vec, seg.vec)[0][0]
                scores.append(sc)
                result.append({
                    'seg1id': segment.id,
                    'seg2id': seg.id,
                    'seg1': segment.seg,
                    'seg2': seg.seg,
                    'WAS': sc * 2
                })
    Q3 = np.percentile(np.asarray(scores), threshold)
    fifty = np.percentile(np.asarray(scores), 50)
    print(fifty)
    rresult = []
    for res in result:
        if (res['WAS'] / 2) > Q3:
            rresult.append(res)
    return rresult
Code example #7
def cos_compare(word, sentences, embeddings):
    """Embed `word` in every sentence with flair and return the cosine
    similarities between its third collected occurrence and all occurrences."""
    ss = [Sentence(s.lower()) for s in sentences]  # Convert to flair Sentence objects
    compare = []

    for s in ss:
        embeddings.embed(s)
        for tok in s:
            if tok.text == word:
                compare.append(tok.embedding)

    compare = torch.stack(compare).cpu().clone().numpy()

    return cos(compare, compare)[2]
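A usage sketch for `cos_compare`. It assumes `cos` is sklearn's `cosine_similarity`, that flair and torch are installed, and that the target word occurs in at least three sentences (the function returns row [2] of the similarity matrix). With static GloVe vectors every occurrence embeds identically, so a contextual flair embedding is what makes the comparison interesting:

import torch
from flair.data import Sentence
from flair.embeddings import WordEmbeddings
from sklearn.metrics.pairwise import cosine_similarity as cos

embeddings = WordEmbeddings('glove')  # swap in a contextual flair embedding for non-trivial results
sentences = [
    "She sat on the bank of the river",
    "He opened a bank account yesterday",
    "The bank approved the loan",
]
print(cos_compare("bank", sentences, embeddings))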
Code example #8
def pairwise_test(segmentpool, N):
    result = []
    summs = [[] for i in range(N)]
    for seg in segmentpool:
        segment = segmentpool[seg]
        doc = segment.docid - 1
        summs[doc].append(segment)
    summ_pairs = combinations(summs, 2)

    for summ_pair in summ_pairs:
        for segment in summ_pair[0]:
            # Make sure the vector is a 2-D (1, dim) array so cosine_similarity accepts it
            if isinstance(segment.vec, list):
                segment.vec = np.asarray(segment.vec).reshape(1, -1)
            for seg in summ_pair[1]:
                if isinstance(seg.vec, list):
                    seg.vec = np.asarray(seg.vec).reshape(1, -1)
                sc = cos(segment.vec, seg.vec)[0][0]
                result.append(sc)
    with open("scores.txt", 'w') as f:
        for i in result:
            lines = str(i) + "\n"
            f.write(lines)
    return result
Code example #9
File: CGExpan.py  Project: wayne9qiu/CGExpan
    def class_guided_filter(self, query_set, expanded_set, pos_cname, neg_cnames, cname2count):

        cnames = [pos_cname] + list(neg_cnames)
        cname2idx = {cname:i for i, cname in enumerate(cnames)}
        cnames_rep = np.vstack([self.get_cname_rep(cname) for cname in cnames])

        filter_out = set()
        for eid in expanded_set:
            emb = self.get_emb(self.eid2idx[eid])
            sims = cos(cnames_rep, emb)
            cnt = 0
            for i in range(len(self.ranking_templates)):
                scores = np.mean(np.partition(sims[[j*6+i for j in range(len(cnames))]], -self.k, axis=1)[:, -self.k:], axis=1)
                if np.argmax(scores) != cname2idx[pos_cname]:
                    cnt += 1
            if cnt > 2:
                filter_out.add(eid)
        temp = set([cn for cn in cname2count if cname2count[cn] >= GENERATION_SAMPLE_SIZE * len(self.generation_templates) / 6.])
        temp.update([self.inflect.plural(cn) for cn in temp])
        filter_out.update([eid for eid in expanded_set if self.eid2name[eid].lower() in temp])
        return [eid for eid in expanded_set if eid not in filter_out], filter_out
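The CGExpan methods in this listing repeatedly score an entity by averaging its top-k similarities with the `np.partition(..., -k)` idiom; a standalone illustration (not project code):

import numpy as np

# np.partition moves the k largest values of each row into the last k positions
# (in arbitrary order), which is enough to average them without a full sort.
sims = np.array([[0.1, 0.9, 0.4, 0.8],
                 [0.2, 0.3, 0.7, 0.6]])
k = 2
top_k_mean = np.mean(np.partition(sims, -k, axis=1)[:, -k:], axis=1)
print(top_k_mean)  # [0.85 0.65]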
Code example #10
def construct_graph(dataset, features, topk):
    fname = '../data/' + dataset + '/knn/tmp.txt'
    print(fname)
    f = open(fname, 'w')
    ##### Kernel
    # dist = -0.5 * pair(features) ** 2
    # dist = np.exp(dist)

    #### Cosine
    dist = cos(features)
    inds = []
    for i in range(dist.shape[0]):
        ind = np.argpartition(dist[i, :], -(topk + 1))[-(topk + 1):]
        inds.append(ind)

    for i, v in enumerate(inds):
        for vv in v:
            if vv != i:
                f.write('{} {}\n'.format(i, vv))
    f.close()
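A usage sketch for `construct_graph`; it assumes `cos` is sklearn's `cosine_similarity`, that the `../data/<dataset>/knn/` directory exists (it is created here for the demo), and it uses a made-up dataset name:

import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity as cos

os.makedirs('../data/demo/knn', exist_ok=True)
features = np.random.rand(100, 32)           # 100 nodes with 32-dimensional features
construct_graph('demo', features, topk=5)    # writes one "i j" edge per line to tmp.txt
edges = np.loadtxt('../data/demo/knn/tmp.txt', dtype=int)
print(edges.shape)                           # roughly (100 * 5, 2)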
Code example #11
    def averageSimilarity(self, segment_embedding):
        normalizer = len(self.embeddings)
        if normalizer == 0:
            print(self.id)
        similarity = 0
        # Reshape to (1, dim) so cosine_similarity accepts single vectors
        segment_embedding = np.asarray(segment_embedding).reshape(1, -1)
        for embedding in self.embeddings:
            embedding = np.asarray(embedding).reshape(1, -1)
            similarity += cos(embedding, segment_embedding)[0][0]
        # if similarity / normalizer < 4.35146819363:  # earlier testing threshold
        if similarity / normalizer < 0.5:
            return None
        else:
            return [similarity / normalizer, self.weight]
Code example #12
File: CGExpan.py  Project: wayne9qiu/CGExpan
    def class_name_ranking(self, cname2count, query_set, expanded_set, neg_cnames, prev_cn, margin):
        current_set = query_set + expanded_set
        ids = []
        cnames = [cname for cname in cname2count if cname2count[cname] >= self.gen_thres]
        cnames += [cn for cn in prev_cn if cn not in cnames]
        cname2idx = {cname:i for i, cname in enumerate(cnames)}
        cnames_rep = np.vstack([self.get_cname_rep(cname) for cname in cnames])
        scores = np.zeros((len(current_set), len(cnames)))
        for i, eid in enumerate(current_set):
            emb = self.get_emb(self.eid2idx[eid])
            if len(emb) < self.k:
                continue
            sims = cos(cnames_rep, emb)
            for j in range(len(cnames)):
                scores[i, j] = np.mean(np.partition(np.amax(sims[j*6:(j+1)*6], axis=0), -self.k)[-self.k:])
        cname2mrr = ddict(float)
        for eid, score in zip(current_set, scores):
            r = 0.
            for i in np.argsort(-score):
                cname = cnames[i]
                if cname2count[cname] < min(GENERATION_SAMPLE_SIZE*len(self.generation_templates)*POS_CNAME_THRES, max(cname2count.values())) and cname not in prev_cn:
                    continue
                r += 1
                cname2mrr[cname] += 1 / r
        pos_cname = sorted(cname2mrr.keys(), key=lambda x: cname2mrr[x], reverse=True)[0]

        # find negative entities
        uni_cnames = [cname for cname in cnames if len(cname.split(' ')) == 1 and not pos_cname.endswith(cname)]
        this_neg_cnames = set(uni_cnames)
        for eid, score in zip(query_set, scores):
            ranked_uni_cnames = sorted([pos_cname]+uni_cnames, key=lambda x: score[cname2idx[x]], reverse=True)
            for i, cname in enumerate(ranked_uni_cnames):
                if cname == pos_cname:
                    break
            this_neg_cnames = this_neg_cnames & set(ranked_uni_cnames[i+1+margin:])
        return pos_cname, neg_cnames | this_neg_cnames
Code example #13
File: CGExpan.py  Project: sysu17363098/EXpanTest
    def class_guided_expansion(self, pos_cname, current_set, set_text,
                               neg_set):
        global_idx_generator = self.rand_idx(len(current_set))
        local_idx_generator = self.rand_idx(len(current_set))
        global_scores = cos(
            self.means[[self.eid2idx[eid] for eid in current_set]], self.means)

        # 1. Combine the pattern with the entity names to form the query texts
        ids = []
        for _ in range(EXPANSION_SAMPLE_SIZE):
            for template in self.expansion_templates:
                indices = []  # randomly pick 3 entities
                for n in local_idx_generator:
                    if n not in indices:
                        indices.append(n)
                        if len(indices) == 3:
                            break
                fill_in = [self.tokenizer.mask_token
                           ] + [set_text[i] for i in indices]
                fill_in = np.random.permutation(fill_in)
                # fill_in: ['MASK', 'entity1_name', 'entity2_name', 'entity3_name'] in random order
                text = template[0] + pos_cname + template[1]
                text = text.format(*fill_in)
                ids.append(self.tokenizer.encode(text, max_length=512))

        mask_rep = self.get_mask_rep(ids)  # mask_rep is the set of candidate-entity representations

        eid2mrr = ddict(float)
        for local_rep in mask_rep:
            indices = []
            for n in global_idx_generator:
                if n not in indices:
                    indices.append(n)
                    if len(indices) == 3:
                        break
            this_global_score = np.mean(global_scores[indices], axis=0)
            this_global_score_ranking = np.argsort(-this_global_score)

            # keywords is the set of eids
            this_keywords = [
                self.keywords[i] for i in this_global_score_ranking[:500]
            ]  # eid
            this_global_score = [
                this_global_score[i] for i in this_global_score_ranking[:500]
            ]
            this_embs = [
                self.get_emb(i)
                for i in [self.eid2idx[eid] for eid in this_keywords]
            ]  # eid->index->embedding
            this_entity_pos = [0] + list(
                np.cumsum([len(emb) for emb in this_embs]))
            this_embs = np.vstack(this_embs)

            raw_local_scores = cos(local_rep[np.newaxis, :], this_embs)[0]

            local_scores = np.zeros((500, ))
            for i in range(500):
                start_pos = this_entity_pos[i]
                end_pos = this_entity_pos[i + 1]
                if end_pos - start_pos < self.k:
                    local_scores[i] = 1e-8
                else:
                    local_scores[i] = np.mean(
                        np.partition(raw_local_scores[start_pos:end_pos],
                                     -self.k)[-self.k:])

            scores = 5 * np.log(local_scores) + np.log(this_global_score)

            r = 0.
            for i in np.argsort(-scores):
                eid = this_keywords[i]
                if eid not in set(current_set) and eid not in neg_set:
                    r += 1
                    eid2mrr[eid] += 1 / r
                if r >= 20:
                    break

        eid_rank = sorted(eid2mrr, key=lambda x: eid2mrr[x], reverse=True)
        for i, eid in enumerate(eid_rank):
            if eid2mrr[eid] < EXPANSION_SAMPLE_SIZE * len(
                    self.expansion_templates) * 0.2:
                break
        return eid_rank[:max(15, i)]  # candidates for this round
Code example #14
def main(args):
    if len(args) < 4:
        sys.stderr.write(
            'Four required arguments: <cui vecs path> <glove vecs path> <MRCONSO file> <output file>\n'
        )
        sys.exit(-1)

    num_epochs = 500
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    example_pairs = [('C0021400', 'influenza'), ('C0006826', 'cancer'),
                     ('C0004057', 'aspirin'), ('C0027497', 'nausea'),
                     ('C0030193', 'pain')]

    print("Reading cui vectors from %s" % (args[0]))
    cui_vecs = gensim.models.KeyedVectors.load(args[0])
    print("Reading word vectors from %s" % (args[1]))
    glove_vecs = gensim.models.KeyedVectors.load(args[1])

    print("Finding one-word terms in UMLS with CUIs in our cui vectors")
    cui_word_pairs = build_cuiword_pairs(args[2], cui_vecs, glove_vecs)
    align_size = len(cui_word_pairs)
    print('  Found %d pairs of cuis and words' % (len(cui_word_pairs)))

    ## Build reduced w2v matrices for computing the linear projection
    print("Filling reduced glove and cui matrices with %d rows" % (align_size))
    cui_matrix = np.zeros([align_size, cui_vecs.vector_size], dtype='float32')
    glove_matrix = np.zeros([align_size, glove_vecs.vector_size],
                            dtype='float32')
    for row in range(align_size):
        cui_matrix[row, :] += cui_vecs[cui_word_pairs[row][0]]
        glove_matrix[row, :] += glove_vecs[cui_word_pairs[row][1]]

    ## Build full cui matrix for computing projections
    print("Building full cui matrix for computing projections")
    full_cui_matrix = torch.zeros([len(cui_vecs.vocab),
                                   cui_vecs.vector_size]).to(device)
    for row in range(len(cui_vecs.vocab)):
        cui = cui_vecs.index2word[row]
        full_cui_matrix[row, :] += torch.tensor(cui_vecs[cui]).to(device)

    cui_matrix = torch.tensor(cui_matrix).to(device)
    glove_matrix = torch.tensor(glove_matrix).to(device)

    projection = torch.zeros([cui_vecs.vector_size,
                              glove_vecs.vector_size]).to(device)
    projection.normal_()
    projection.requires_grad_()

    loss = nn.MSELoss()
    optimizer = optim.SGD([projection],
                          lr=1.0,
                          weight_decay=0.01,
                          momentum=0.9)

    for epoch in range(num_epochs + 1):
        if epoch % 100 == 0:
            full_projection = torch.matmul(full_cui_matrix,
                                           projection).detach().cpu().numpy()
            for pair in example_pairs:
                cui, word = pair
                cui_vector = full_projection[cui_vecs.vocab[cui].index]
                word_vector = glove_vecs[word]
                sim = cos(cui_vector.reshape(1, -1),
                          word_vector.reshape(1, -1))
                print("Similarity between %s and %s is %f" % (cui, word, sim))
        optimizer.zero_grad()
        similarity = torch.matmul(cui_matrix, projection) - glove_matrix
        epoch_loss = loss(similarity, torch.zeros_like(glove_matrix))
        epoch_loss.backward()
        optimizer.step()
        if epoch % 100 == 0:
            print("Loss in epoch %d is %f" % (epoch, epoch_loss))

    # Now project our GLOVE matrix with this learned projection and write as gensim 100d model
    print("Projecting full cui matrix into learned space")

    full_projection = torch.matmul(full_cui_matrix,
                                   projection).detach().cpu().numpy()

    print("Writing gensim file to %s" % (args[3]))
    tf = tempfile.NamedTemporaryFile(mode='wt')
    tf.write('%d %d\n' % (full_projection.shape[0], full_projection.shape[1]))
    for cui_ind in range(full_projection.shape[0]):
        cui = cui_vecs.index2word[cui_ind]
        vec = list(full_projection[cui_ind, :])
        str_vec = [str(x) for x in vec]
        tf.write('%s %s\n' % (cui, ' '.join(str_vec)))

    tf.seek(0)
    gs_new_vecs = gensim.models.KeyedVectors.load_word2vec_format(tf.name)
    gs_new_vecs.save(args[3])
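Once `main` has run, the saved vectors live in GloVe space and can be compared directly against word vectors. A small follow-up sketch with placeholder paths (`cos` again assumed to be sklearn's `cosine_similarity`):

import gensim
from sklearn.metrics.pairwise import cosine_similarity as cos

projected_cuis = gensim.models.KeyedVectors.load('projected_cuis.kv')  # args[3] from main
glove_vecs = gensim.models.KeyedVectors.load('glove_vecs.kv')          # args[1] from main

sim = cos(projected_cuis['C0021400'].reshape(1, -1),
          glove_vecs['influenza'].reshape(1, -1))[0][0]
print('C0021400 vs influenza:', sim)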
Code example #15
def STS_eval(sentset, model, data_path):
    """
    Evaluate the similarities of 
    :param sentset: string, sentence dataset
    :param model: sentence embedding model
    :return: cosine similarity, of all pairs of sentences
             pearson & spearman coefficients compared to gold standard
    """
    sent_file = open(data_path + 'sts-en-test-gs-2014/STS.input.' + sentset +
                     '.txt')
    sent_data = sent_file.readlines()
    sent_file.close()
    gs_file = open(data_path + 'sts-en-test-gs-2014/STS.gs.' + sentset +
                   '.txt')
    gs_data = np.array(gs_file.readlines(), dtype=float)
    gs_file.close()
    splited_sent = []
    n = len(sent_data)
    for i in range(n):
        splited_sent.append(re.split(r'\t+', sent_data[i]))
    splited_sent = np.array(splited_sent)
    sent_1 = splited_sent[:, 0]
    sent_2 = splited_sent[:, 1]
    x_1, x_2, y, ls = sort_length_embedding_sts(sent_1, sent_2, gs_data, model)

    # Correlations are computed separately on four fixed slices of the data
    boundaries = [(0, 81), (81, 162), (162, 227), (227, len(y))]
    pearsons = []
    spearmanrs = []
    for start, end in boundaries:
        sims = [cos([x_1[i]], [x_2[i]])[0][0] for i in range(start, end)]
        gold = y[start:end]
        pearsons.append(pearsonr(sims, gold)[0])
        spearmanrs.append(spearmanr(sims, gold)[0])

    return pearsons, spearmanrs
Code example #16
def sklearn_experiment(training_space,
                       test_space,
                       target_contexts,
                       nn=1,
                       diag_value=None,
                       extra_info=False):
    """
    :param training_space:  a dictionary of dictionaries mapping each word to all the contexts it co-occurred with in
                            the training set, and then to the corresponding co-occurrence count
    :param test_space:      a dictionary of dictionaries mapping each word to all the contexts it co-occurred with in
                            the test set, and then to the corresponding co-occurrence count
    :param target_contexts: an iterable containing all the contexts that were used to collect co-occurrences
    :param nn:              the number of nearest neighbours to be considered when categorizing a test word
    :param diag_value:      the value to which all the cells on the main diagonal of the matrix of cosine similarities
                            between test and training vectors are set (default is None, which leaves the diagonal
                            untouched; setting it to 0 means that cells on the main diagonal don't impact the nearest
                            neighbour computation). This option makes it possible to force the model to categorize a
                            test word while ignoring the vector from the training space that corresponds to the same
                            word type, thus enforcing generalization
    :param extra_info:      if True, each word in the output dictionary is not only mapped to its correct category, its
                            predicted category, and the categorization accuracy, but also to the list of nearest
                            neighbors, to the cosine distance within which the nearest neighbors are located, and to the
                            distribution of classes of the set of nearest neighbor(s)
    :return hits:           a dictionary mapping each word in the test set to three fields and the corresponding value:
                            'predicted' is the PoS tag that the learner predicted for a test word
                            'correct' is the correct PoS tag as found in the CHILDES corpus
                            'accuracy' is a binary value indicating if 'predicted' and 'correct' match (1) or not (0)
    """

    hits = defaultdict(dict)

    # First get the set of words to be categorized (those in the test set) and the union with the words in the
    # training set. Then get numerical indices for all the words and the target contexts. Finally, store the training
    # and test input spaces in two NumPy 2-dimensional arrays and compute the cosine similarity between words in the
    # test space and words in the training space, setting the values in the diagonal to the desired value. Words from
    # the test set will be the columns, words in the training set will be the rows.
    test_words = set(test_space.keys())
    words = test_words.union(set(training_space.keys()))
    context_indices = sort_words(target_contexts)
    word_indices = sort_words(words)
    inverted_word_indices = {v: k for k, v in word_indices.items()}
    training_matrix = dict2matrix(training_space, word_indices,
                                  context_indices)
    test_matrix = dict2matrix(test_space, word_indices, context_indices)
    cosine_similarities = cos(training_matrix, test_matrix)
    if diag_value is not None:
        cosine_similarities[np.diag_indices_from(
            cosine_similarities)] = diag_value

    # Use the derived cosine similarities to find which words from the training set are closer to each word in the test
    # set to be able to categorize the latter ones. Nearest neighbors are computed using a nearest distance approach,
    # meaning that when two or more words from the training set are at the same closest distance from a test word, they
    # are all considered to assign a PoS tag to the test word (using a majority voting). In case the majority voting
    # also results in a tie, random sampling of one of the PoS tags is performed.
    for word in test_words:
        # get the column index of the test word to be categorized, and get the indices of all the rows that have a
        # cosine similarity to the word to be categorized that is at least as high as the closest distance (if k is 1,
        # otherwise get the cosine similarity value corresponding to the second closest distance (k=2), third closest
        # distance (k=3), and so on)
        c_idx = word_indices[word]
        nearest_indices, closest_distance = knn.get_nearest_indices(
            cosine_similarities, c_idx, nn=nn)

        # get all the word strings having a high enough cosine similarity value to the word to be categorized
        nearest_neighbors = knn.get_nearest_neighbors(nearest_indices[0],
                                                      inverted_word_indices)

        # store the PoS tags of the nearest neighbors. if a mapping dictionary is passed, store the PoS tags as
        # indicated in the mapping, otherwise store the PoS tags as found in the strings (it is assumed that wordforms
        # and PoS tags are separated by a tilde ('~'). Count how many times each PoS tag occurs across the nearest
        # neighbors and tally PoS tags by frequency
        tallied_tags = knn.tally_tags(nearest_neighbors)

        # count how many times every PoS tag occurring in the list of nearest neighbors occur, tally PoS tags by
        # frequency and select the PoS tag that occurs more often among the nearest neighbors.
        predicted = knn.categorize(tallied_tags, nearest_neighbors,
                                   training_matrix, word_indices)
        hits[word]['predicted'] = predicted
        hits[word]['correct'] = word.split('|')[0]
        hits[word]['accuracy'] = 1 if hits[word]['predicted'] == hits[word][
            'correct'] else 0
        if extra_info:
            hits[word]['neighbors'] = nearest_neighbors
            hits[word]['cosine'] = closest_distance
            hits[word]['tag_distribution'] = tallied_tags

    return hits, cosine_similarities, word_indices
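The `knn` helpers used above are not part of the listing. Purely as an illustration of the behaviour described in the comments (return every training row whose similarity reaches the nn-th closest distance, plus that distance), a hypothetical `get_nearest_indices` might look like this; the real project helper may differ:

import numpy as np

def get_nearest_indices(cosine_similarities, c_idx, nn=1):
    """Hypothetical sketch: indices of all rows whose similarity to column
    `c_idx` is at least the nn-th highest distinct similarity value."""
    column = cosine_similarities[:, c_idx]
    distinct = np.sort(np.unique(column))[::-1]           # distinct values, descending
    closest_distance = distinct[min(nn, len(distinct)) - 1]
    nearest_indices = np.where(column >= closest_distance)
    return nearest_indices, closest_distance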
Code example #17
def sklearn_experiment(training_space,
                       training_words,
                       test_space=None,
                       test_words=None,
                       contexts=None,
                       pos_mapping=None,
                       nn=1,
                       diag_value=None,
                       plot=''):
    """
    :param training_space:  a 2d NumPy array storing word-context co-occurrence counts derived from the training corpus
    :param training_words:  a dictionary mapping words from the training space to the corresponding row indices in the
                            training space
    :param test_space:      a 2d NumPy array storing word-context co-occurrence counts derived from the test corpus
    :param test_words:      a dictionary mapping words from the test space to the corresponding row indices in the
                            test space. If a test space is passed, test_words has to be assigned a value, otherwise the
                            function will throw an error
    :param contexts:        a dictionary mapping contexts to their column indices in the training and test spaces;
                            default is None, because this mapping is only used in the train-test setting to keep the
                            alignment between training and test spaces
    :param pos_mapping:     a dictionary mapping CHILDES PoS tags to custom, coarser tags
    :param nn:              the number of nearest neighbours to be considered when categorizing a test word
    :param diag_value:      the value to which all the cells on the main diagonal of the matrix of cosine similarities
                            between test and training vectors are set (default is None, which leaves the diagonal
                            untouched; setting it to 0 means that cells on the main diagonal don't impact the nearest
                            neighbour computation). This option makes it possible to force the model to categorize a
                            test word while ignoring the vector from the training space that corresponds to the same
                            word type, thus enforcing generalization
    :param plot:            a string indicating the path where the plot showing the cosine similarity matrix is saved
                            The default is the empty string, meaning that no plot is created
    :return hits:           a dictionary mapping each word in the test set to three fields and the corresponding value:
                            'predicted' is the PoS tag that the learner predicted for a test word
                            'correct' is the correct PoS tag as found in the CHILDES corpus
                            'accuracy' is a binary value indicating if 'predicted' and 'correct' match (1) or not (0)
    """

    t = 1 if test_space is not None else 0
    w = 1 if test_words is not None else 0
    c = 1 if contexts is not None else 0
    if sum([t, w, c]) not in [0, 3]:
        raise ValueError(
            'Unsure whether to use a leave-one-out or training-test approach! '
            'If you want to run a leave-one-out experiment, do not provide any argument to the parameters'
            ' test_space, test_words, and contexts. If, however, you want to perform an experiment in the'
            ' training-test setting, provide appropriate arguments to all three parameters.'
        )

    hits = defaultdict(dict)

    if test_space is not None:
        # use a training-test setting, where words from the test set are categorized by retrieving nearest neighbours in
        # the training set
        target_words = test_words
        words = set(training_words.keys()).union(set(test_words.keys()))

        # map every word occurring in either the training space, the test space, or both to a numerical index and get
        # an inverted mapping from indices to strings
        word_indices = sort_items(words)
        inverted_word_indices = {v: k for k, v in word_indices.items()}

        # create a training matrix and a test matrix that have as many rows as there are words in total, and the same
        # columns as the original matrices; then compute pairwise similarities between each pair of training-test words
        training_space = make_matrix(training_space, word_indices,
                                     training_words, contexts)
        test_space = make_matrix(test_space, word_indices, test_words,
                                 contexts)
        cosine_similarities = cos(training_space, test_space)

        # if so specified in the function call, set the diagonal values to the desired number
        # the idea is to 'silence' the diagonal by setting it to 0: this because the diagonal cells correspond to the
        # cosine similarity between equal types in the training and test set (e.g. dog in the training set and dog in
        # the test set). The cosine will not be 1 because the vectors of co-occurrence will differ (they have been
        # harvested in two different corpora); yet, we can expect same types to have more similar co-occurrence patterns
        # then different types. This could bias the retrieval of nearest neighbours: dog (from the training set) will be
        # retrieved as nearest neighbour of dog (from the test set). This is not a problem per se, but it can be in some
        # experimental settings: the diag-Value allows to get rid of this by force the diagonal values to 0, so that no
        # same word from training word will be retrieved as nearest neighbour for any test item
        if diag_value is not None:
            cosine_similarities[np.diag_indices_from(
                cosine_similarities)] = diag_value

    else:
        # use a leave-one-out setting, where words from the training set are categorized by retrieving nearest
        # neighbours from the training set, excluding the vector of the word being categorized from the pool of possible
        # neighbours
        target_words = training_words
        words = training_words
        word_indices = sort_items(words)
        inverted_word_indices = {v: k for k, v in word_indices.items()}
        cosine_similarities = cos(training_space)

        # in a leave-one-out setting, the diagonal is always set to 0 because otherwise categorization would be perfect:
        # the same vectors would be compared, resulting in a cosine similarity of 1, which will always be the maximum.
        # To avoid this, the diagonal cells are forced to 0.
        cosine_similarities[np.diag_indices_from(cosine_similarities)] = 0

    if plot:
        plot_matrix(cosine_similarities, neighbors=10, output_path=plot)

    # Use the derived cosine similarities to find which words from the training set are closer to each of the target
    # words (which words are used as targets depend on whether a test space is passed: if it is, target words are test
    # words, if it's not, target words are training words) to be able to categorize the target words. Nearest neighbors
    # are retrieved using a nearest distance approach, meaning that when two or more words from the training set are at
    # the same closest distance from a target word, they are all considered as nearest neighbors to assign a PoS tag to
    # the target word. Ties are broken by looking for the most frequent neighbour in the training set. If there is a tie
    # a word is sampled randomly from the pool of most frequent words among the neighbours.
    for word in target_words:
        # get the column index of the test word to be categorized, and get the indices of all the rows that have a
        # cosine similarity to the word to be categorized that is at least as high as the closest distance (if k is 1,
        # otherwise get the cosine similarity value corresponding to the second closest distance (k=2), third closest
        # distance (k=3), and so on)
        c_idx = word_indices[word]
        nearest_indices = get_nearest_indices(cosine_similarities,
                                              c_idx,
                                              nn=nn)

        # get all the word strings having a high enough cosine similarity value to the word to be categorized
        nearest_neighbors = get_nearest_neighbors(nearest_indices[0],
                                                  inverted_word_indices)

        # if more than one neighbour is found at the closest distance, pick the one with the highest frequency of
        # occurrence in the training set; if more than a word has the same frequency count, pick randomly
        predicted = categorize(nearest_neighbors,
                               training_space,
                               word_indices,
                               pos_mapping=pos_mapping)
        hits[word]['predicted'] = predicted
        hits[word]['correct'] = pos_mapping[word.split(
            '~')[0]] if pos_mapping else word.split('~')[0]
        hits[word]['accuracy'] = 1 if hits[word]['predicted'] == hits[word][
            'correct'] else 0

    return hits, cosine_similarities, word_indices
Code example #18
    def sentences_selection(self, topic_words):
        """
        Returns sentences ids that summarise the topic.
        Ranking is done with the use of PageRank

        :topic_words list of words in a topic
        """

        # Check which articles contain topic words
        topic_words_indices = [
            self.vectorizer_articles.vocabulary_[word] for word in topic_words
        ]
        is_topic_article = self.tf_matrix_articles[:, topic_words_indices].sum(
            axis=1) > 0
        topic_articles = np.where(is_topic_article)[0]

        # Delete articles that have too few key words
        ix_grid = np.ix_(topic_articles, topic_words_indices)
        topic_words_sums = self.tf_matrix_articles[ix_grid].sum(axis=1)
        all_words_sums = self.tf_matrix_articles[topic_articles, :].sum(axis=1)
        topic_words_freq = np.divide(topic_words_sums, all_words_sums)
        all_articles_mean = np.divide(np.sum(topic_words_sums),
                                      np.sum(all_words_sums))
        selected_articles = np.where(
            topic_words_freq > all_articles_mean * self.min_key_freq)[0]
        topic_articles = [
            article_index for index, article_index in enumerate(topic_articles)
            if index in selected_articles
        ]

        # Select topic sentences
        topic_sentences = [
            article_sentence for article_index, article_sentence in enumerate(
                self.sentences_in_articles) if article_index in topic_articles
        ]
        topic_sentences = list(chain.from_iterable(topic_sentences))
        topic_sentences_tf_matrix = self.tf_matrix_sentences[
            topic_sentences, :].copy()

        # Calculate cosine similarity between words and topic
        # Embeddings have already been weighted by lambda during embedding creation
        topic_embedding = np.sum(self.embeddings[topic_words_indices, :],
                                 axis=0,
                                 keepdims=True)

        # Multiplication of sentence TF matrix by log_lambda_statistic and cosine
        # similarity between words and topic
        if self.use_sparse:
            topic_sentences_tf_matrix = topic_sentences_tf_matrix.dot(
                self.sparse_embeddings)
        else:
            topic_sentences_tf_matrix = np.dot(topic_sentences_tf_matrix,
                                               self.embeddings)

        # Similarity between sentences and topic
        topic_sentences_tf_matrix = topic_sentences_tf_matrix.toarray()
        sentences_topic_simil = cos(topic_sentences_tf_matrix, topic_embedding)
        ranking = sentences_topic_simil
        ranking = ranking.flatten()

        # Select x% the most similar sentences to the topic
        order = np.argsort(ranking)[::-1]
        selected_number = max(
            math.ceil(len(order) * self.freq_to_lex_rank),
            min(self.min_sent_to_lexrank, math.ceil(len(order) * 0.5)))
        order = order[:selected_number]
        ranking_simil = ranking[order]
        topic_sentences = [topic_sentences[_id] for _id in order]

        # PageRank on selected sentences
        topic_sentences_tf_matrix = topic_sentences_tf_matrix[order, :]
        simil_matrix = cos(topic_sentences_tf_matrix)
        np.fill_diagonal(simil_matrix, 0.0)
        # negative_values = simil_matrix < 0.0
        # simil_matrix[negative_values] = 0.0
        ranking = page_rank(simil_matrix)
        ranking = ranking.flatten()

        # Multiply PageRank ranking by similarity to the topic
        ranking = np.multiply(ranking, ranking_simil, out=ranking)

        # Scale ranking
        topic_sentences_tf_matrix = self.tf_matrix_sentences[
            topic_sentences, :]
        # Weighting by lambda
        if self.weighted_unique_scaling:
            unique_topic_words = topic_sentences_tf_matrix[:,
                                                           topic_words_indices] > 0
            unique_topic_words = unique_topic_words.todense()
            unique_topic_words = np.dot(
                unique_topic_words,
                self.log_lambda_statistics[topic_words_indices])
            unique_topic_words = np.divide(
                unique_topic_words,
                np.sum(self.log_lambda_statistics[topic_words_indices]))
        # Unweighted scaling needs a correction for the number of tokens
        else:
            unique_topic_words = topic_sentences_tf_matrix[:,
                                                           topic_words_indices].getnnz(
                                                               axis=1)
            unique_topic_words = np.divide(unique_topic_words,
                                           len(topic_words))

        all_words_sums = topic_sentences_tf_matrix.sum(axis=1)
        ranking = self.scale_ranking(ranking, unique_topic_words,
                                     all_words_sums)

        # Order TF matrix by scaled ranking
        order = np.argsort(ranking)[::-1]
        topic_sentences_tf_matrix = topic_sentences_tf_matrix[order, :]

        # Select sentences with non-duplicated meaning
        simil_matrix = cos(topic_sentences_tf_matrix)
        selected_sentences_ids = self.select_non_duplicated_sentences(
            simil_matrix, ranking, topic_sentences, order)

        return selected_sentences_ids
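`page_rank` is called above but not shown; a hypothetical sketch of a LexRank-style power iteration over the sentence-similarity matrix (the project's actual implementation may differ):

import numpy as np

def page_rank(simil_matrix, damping=0.85, max_iter=100, tol=1e-6):
    """Hypothetical sketch: PageRank scores for the rows of `simil_matrix`."""
    n = simil_matrix.shape[0]
    row_sums = simil_matrix.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1.0                  # avoid division by zero for isolated rows
    transition = simil_matrix / row_sums           # row-stochastic transition matrix
    scores = np.full(n, 1.0 / n)
    for _ in range(max_iter):
        new_scores = (1 - damping) / n + damping * transition.T.dot(scores)
        if np.abs(new_scores - scores).sum() < tol:
            break
        scores = new_scores
    return scores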