def find_topics(self, similarity_matrix, distribution_matrix): """Main function that hierarchically finds topics""" # Prealocate vectors for assessing grouping quality if self.do_silhouette: # simil_shape = similarity_matrix.shape word_to_topics_distance = np.subtract(1.0, np.copy(similarity_matrix)) tokens_distribution = np.copy(distribution_matrix) number_of_topics = len(self.topics) max_index = np.unravel_index(np.argmax(similarity_matrix, axis=None), similarity_matrix.shape) max_simil = similarity_matrix[max_index] while (len(self.topics) > 2) and (max_simil >= self.min_association): self.max_simil_history.append(max_simil) # Update information about topics new_topics = self.topics[max_index[0]] + self.topics[max_index[1]] self.topics = delete_indices(self.topics, max_index) self.topics = [new_topics] + self.topics number_of_topics -= 1 # Update distribution of new topic new_dist = np.copy(distribution_matrix[max_index, :]) new_dist = np.sum(new_dist, axis=0, keepdims=True) # Delete rows or cols associated with merged topics distribution_matrix = np.delete(distribution_matrix, max_index, axis=0) similarity_matrix = np.delete(similarity_matrix, max_index, axis=0) similarity_matrix = np.delete(similarity_matrix, max_index, axis=1) # New similarity new_simil = cos(new_dist, distribution_matrix) # Update matrices distribution_matrix = np.concatenate([new_dist, distribution_matrix]) new_similarity_matrix = np.zeros((number_of_topics, number_of_topics)) new_similarity_matrix[1:, 1:] = similarity_matrix new_similarity_matrix[0, 1:] = new_simil new_similarity_matrix[1:, 0] = new_simil similarity_matrix = new_similarity_matrix # Silhouette algorithm if self.do_silhouette and (len(self.topics[0]) > 1): word_to_topics_distance = np.delete(word_to_topics_distance, max_index, axis=1) new_word_to_topic_distance = np.subtract(1.0, cos(tokens_distribution, new_dist)) word_to_topics_distance = np.concatenate([new_word_to_topic_distance, word_to_topics_distance], axis=1) 
self.silhouette(word_to_topics_distance, tokens_distribution, distribution_matrix) # Find new max max_index = np.unravel_index(np.argmax(similarity_matrix, axis=None), similarity_matrix.shape) max_simil = similarity_matrix[max_index] if self.do_silhouette: self.topics = self.best_topics
def edge_cos(a, b, x, eps):
    '''Cosine similarity around the midpoint of `ab`, faded out with distance.

    With `c = (a + b) / 2` the value is `cos_sim(x - c, b - c) / exp(norm(x - c))`,
    so it is positive when `x - c` points into the same half-space as `b - c`.
    `eps` is accepted but not used by the computation.
    '''
    midpoint = (a + b) / 2
    offset = x - midpoint
    direction = b - midpoint
    # cos() works on 2-D inputs, hence the single-row reshapes.
    similarity = cos(offset.reshape(1, -1), direction.reshape(1, -1))[0][0]
    return similarity / np.exp(vector_norm(offset))
def topic_attribution(tf_matrix, embeddings, embeddings_vocab, topics, sites):
    """Attach a per-site weighted mean similarity to every topic.

    The transposed term-frequency matrix is multiplied by the embeddings to
    get one embedding per row; each topic embedding is the sum of the
    embeddings of its words. The similarities between both are averaged per
    site with ``_weighted_mean``, using the row sums of the transposed
    term-frequency matrix as weights.

    :param tf_matrix sparse term-frequency matrix; it is transposed before
        being multiplied by the embeddings
    :param embeddings array with one embedding per row
    :param embeddings_vocab mapping from word to its row in ``embeddings``
    :param topics list of dicts, each with a "word" entry holding its words
    :param sites one site label per row of the transposed ``tf_matrix``
    :return the same ``topics`` list; every element gains a 'mean_simil' dict
        with the keys 'site' and 'mean_simil'
    """
    embeddings = sp.csr_matrix(np.copy(embeddings))
    tf_matrix = tf_matrix.transpose()
    corpus_embeddings = tf_matrix.dot(embeddings).toarray()
    embeddings_arr = embeddings.toarray()
    # The weights do not depend on the topic, so compute them once.
    counts = np.array(tf_matrix.sum(axis=1)).flatten().tolist()
    for index, topic in enumerate(topics):
        words_ids = [embeddings_vocab[word] for word in topic["word"]]
        topic_embedding = np.sum(embeddings_arr[words_ids, :],
                                 axis=0,
                                 keepdims=True)
        df = pd.DataFrame({
            'corpus_simil':
            cos(corpus_embeddings, topic_embedding).flatten().tolist(),
            'counts': counts,
            'site': sites
        })
        mean_simil = df.groupby('site').apply(_weighted_mean, 'corpus_simil',
                                              'counts')
        topics[index]['mean_simil'] = {
            'site': mean_simil.index.values,
            'mean_simil': mean_simil.values
        }
    return topics
def silhouette(self, tokens_to_topics_distance, tokens_distribution, distribution_matrix):
    """
    Calculation of silhouette value in a particular iteration.
    Function updates inner similarity only for the new topic.

    The mean silhouette plus a penalty term is appended to
    ``self.silhouette_history``; when it beats ``self.max_silhouette`` the
    current ``self.topics`` is remembered as ``self.best_topics``.

    :param tokens_to_topics_distance numpy array of distance between tokens and topics
    :param tokens_distribution numpy array of tokens embeddings
    :param distribution_matrix numpy array of topics embeddings
    """
    # Outer similarity: for every token, the smallest distance to any topic
    # other than the one it currently belongs to.
    for topic_index, topic in enumerate(self.topics):
        tokens_to_topics = np.copy(tokens_to_topics_distance)
        # Drop the column of the token's own topic before taking the minimum
        tokens_to_topics = np.delete(tokens_to_topics, topic_index, axis=1)
        for token_index in topic:
            self.outer_distance[token_index] = np.amin(
                tokens_to_topics[token_index, :], keepdims=False)
    # Inner similarity - updated is only topic created in last iteration
    for token_index in self.topics[0]:
        token_distribution = np.copy(tokens_distribution[token_index, :])
        # The token's own row is subtracted from the topic row, so the token
        # is compared with the rest of its topic rather than with itself.
        reference_distribution = np.subtract(
            np.copy(distribution_matrix[0, :]), token_distribution)
        # reference_distribution = np.copy(distribution_matrix[0, :])
        # Reshape both to a single row, the 2-D form passed to cos()
        # (assumes the rows above are 1-D - TODO confirm)
        token_distribution = np.reshape(token_distribution,
                                        (-1, token_distribution.shape[0]))
        reference_distribution = np.reshape(
            reference_distribution, (-1, reference_distribution.shape[0]))
        self.inner_distance[token_index] = np.subtract(
            1.0, cos(reference_distribution, token_distribution))
    # Calculate silhouette values and their mean
    silhouette_values = np.zeros((len(self.outer_distance), ))
    # Only tokens with both distances non-zero get a value; the others keep
    # the 0 they were initialised with.
    selected_indices = np.where((self.inner_distance != 0.0)
                                & (self.outer_distance != 0.0))
    selected_inner = self.inner_distance[selected_indices]
    selected_outer = self.outer_distance[selected_indices]
    # (outer - inner) / max(outer, inner)
    silhouette_values[selected_indices] = np.subtract(
        selected_outer, selected_inner)
    maximum_values = np.maximum(selected_outer, selected_inner)
    silhouette_values[selected_indices] = np.divide(
        silhouette_values[selected_indices], maximum_values)
    # Calculate penalty - number of single tokens * penalty_value (0.1)
    # NOTE(review): this expression simplifies to len(self.topics), i.e. the
    # number of topics, not the number of single-token topics - confirm intent.
    single_tokens = self.tokens_len - (self.tokens_len - len(self.topics))
    penalty = (single_tokens * self.singularity_penalty) / self.tokens_len
    mean_silhouette = np.mean(silhouette_values) + penalty
    # Update best topic and history
    if mean_silhouette > self.max_silhouette:
        self.max_silhouette = mean_silhouette
        self.best_topics = self.topics
    self.silhouette_history.append(mean_silhouette)
def create_embeddings(articles,
                      lambda_statistics,
                      log_lambda_statistics_df,
                      pipeline=False,
                      embedding_size=256):
    """
    This function creates embeddings for each word in data frame.

    Term counts are computed with ``CountVectorizer``, transposed so that
    rows are words, reduced with ``TruncatedSVD`` and scaled by the array
    returned by ``set_lambda_order``.

    :param articles data frame with pasted lemmatized text in column "text"
    :param lambda_statistics data frame with lambda statistics and selected
        words (column "word") that are important in a particular 'day'
    :param log_lambda_statistics_df data passed to ``set_lambda_order``
        together with the vocabulary to build the scaling array
    :param pipeline bool; for True function outputs only files needed in
        next steps of pipeline
    :param embedding_size number of SVD components
    :return for ``pipeline=True``: (similarity_matrix, distribution_matrix,
        selected_words, selected_embeddings, embeddings, vocabulary);
        otherwise: (similarity_matrix, transposed SVD components,
        distribution_matrix, selected_words, selected_embeddings).
        ``selected_words`` is sorted and holds only words found in the
        vocabulary, so position ``i`` matches row ``i`` of the matrices.
    """
    selected_words = list(lambda_statistics["word"])
    corpus = list(articles["text"])
    vectorizer = CountVectorizer(lowercase=False,
                                 tokenizer=space_tokenizer,
                                 min_df=1,
                                 encoding="UTF-8")
    tfidf_matrix = vectorizer.fit_transform(corpus)
    tfidf_matrix = tfidf_matrix.transpose()
    svd = TruncatedSVD(n_components=embedding_size,
                       n_iter=15,
                       random_state=42)
    embeddings = svd.fit_transform(tfidf_matrix)
    # Scale embeddings by log lambda
    vocabulary = vectorizer.vocabulary_
    lambda_log_array = set_lambda_order(log_lambda_statistics_df, vocabulary)
    embeddings = np.multiply(embeddings, lambda_log_array, out=embeddings)
    selected_words.sort()
    # Words missing from the vocabulary are reported and dropped from BOTH
    # lists; keeping them in selected_words (as before) shifted every later
    # word by one row relative to the matrices returned below.
    known_words = []
    selected_words_indices = []
    for word in selected_words:
        try:
            selected_words_indices.append(vocabulary[word])
        except KeyError:
            print(word)
        else:
            known_words.append(word)
    selected_words = known_words
    selected_embeddings = embeddings[selected_words_indices, :]
    similarity_matrix = cos(selected_embeddings)
    # A word must never be selected as its own nearest neighbour
    np.fill_diagonal(similarity_matrix, 0)
    distribution_matrix = tfidf_matrix[selected_words_indices, :].todense()
    if pipeline:
        return (similarity_matrix, distribution_matrix, selected_words,
                selected_embeddings, embeddings, vocabulary)
    return (similarity_matrix, np.transpose(svd.components_),
            distribution_matrix, selected_words, selected_embeddings)
def pairwise(segmentpool, N, threshold):
    """Score every cross-document segment pair and keep the most similar ones.

    Segments are grouped by document (``segment.docid`` is 1-based), every
    pair of segments coming from two different documents is scored with
    ``cos`` and the pairs whose score lies strictly above the
    ``threshold``-th percentile of all scores are returned. The median score
    is printed.

    As a side effect ``segment.vec`` is replaced by a 2-D array when it was a
    list or a 1-D array.

    :param segmentpool: mapping from key to segment object (attributes used:
        docid, vec, id, seg)
    :param N: number of documents
    :param threshold: percentile in [0, 100] used as the cut-off
    :return: list of dicts with keys 'seg1id', 'seg2id', 'seg1', 'seg2' and
        'WAS' (twice the cosine score); empty when there is no pair to score
    """
    summs = [[] for _ in range(N)]
    for seg in segmentpool:
        segment = segmentpool[seg]
        summs[segment.docid - 1].append(segment)

    result = []
    scores = []
    for summ_a, summ_b in combinations(summs, 2):
        for segment in summ_a:
            # cos() needs 2-D input. The former ``vec.reshape(-1, 1)`` calls
            # discarded their result, so 1-D arrays were never converted.
            segment.vec = np.atleast_2d(segment.vec)
            for seg in summ_b:
                seg.vec = np.atleast_2d(seg.vec)
                sc = cos(segment.vec, seg.vec)[0][0]
                scores.append(sc)
                result.append({
                    'seg1id': segment.id,
                    'seg2id': seg.id,
                    'seg1': segment.seg,
                    'seg2': seg.seg,
                    'WAS': sc * 2
                })

    if not scores:
        # A percentile of nothing is undefined; there is nothing to select.
        return []

    scores_arr = np.asarray(scores)
    Q3 = np.percentile(scores_arr, threshold)
    fifty = np.percentile(scores_arr, 50)
    # Parenthesised form prints the same under Python 2 and Python 3.
    print(fifty)
    return [res for res, sc in zip(result, scores) if sc > Q3]
def cos_compare(word, sentences, embeddings):
    """Compare the contextual embeddings of every occurrence of ``word``.

    Each sentence is lower-cased and wrapped in a ``Sentence``, embedded with
    ``embeddings.embed`` and the embeddings of all tokens whose text equals
    ``word`` are collected. The pairwise cosine similarities of those
    embeddings are computed and the row with index 2 is returned.

    NOTE(review): index 2 is hard-coded, so at least three occurrences are
    needed; presumably the third occurrence is the reference - confirm.
    """
    flair_sentences = [Sentence(text.lower()) for text in sentences]
    vectors = []
    for sentence in flair_sentences:
        embeddings.embed(sentence)
        vectors.extend(token.embedding for token in sentence
                       if token.text == word)
    stacked = torch.stack(vectors).cpu().clone().numpy()
    similarities = cos(stacked, stacked)
    return similarities[2]
def pairwise_test(segmentpool, N):
    """Score every cross-document segment pair and dump the scores to a file.

    Segments are grouped by document (``segment.docid`` is 1-based) and every
    pair of segments coming from two different documents is scored with
    ``cos``. The scores are written one per line to ``scores.txt`` in the
    current working directory.

    As a side effect ``segment.vec`` is replaced by a 2-D array when it was a
    list or a 1-D array.

    :param segmentpool: mapping from key to segment object (attributes used:
        docid, vec)
    :param N: number of documents
    :return: list of cosine scores in the order they were computed
    """
    summs = [[] for _ in range(N)]
    for seg in segmentpool:
        segment = segmentpool[seg]
        summs[segment.docid - 1].append(segment)

    result = []
    for summ_a, summ_b in combinations(summs, 2):
        for segment in summ_a:
            # cos() needs 2-D input. The former ``vec.reshape(-1, 1)`` calls
            # discarded their result, so 1-D arrays were never converted.
            segment.vec = np.atleast_2d(segment.vec)
            for seg in summ_b:
                seg.vec = np.atleast_2d(seg.vec)
                result.append(cos(segment.vec, seg.vec)[0][0])

    with open("scores.txt", 'w') as f:
        f.writelines(str(score) + "\n" for score in result)
    return result
def class_guided_filter(self, query_set, expanded_set, pos_cname, neg_cnames, cname2count):
    """Remove expanded entities that fit a negative class name better.

    For every entity and every ranking template, the mean of the ``self.k``
    largest similarities is computed per class name (the rows of one class
    name are spaced 6 apart in the representation matrix). An entity is
    filtered out when the positive class name is not the best one for more
    than two templates, or when its lower-cased name equals a frequently
    generated class name or that name's plural.

    ``query_set`` is accepted but not used.

    Returns a pair: the entities of ``expanded_set`` that survive, in their
    original order, and the set of filtered-out entity ids.
    """
    cnames = [pos_cname]
    cnames.extend(neg_cnames)
    cname2idx = {}
    for position, name in enumerate(cnames):
        cname2idx[name] = position
    pos_idx = cname2idx[pos_cname]
    cnames_rep = np.vstack([self.get_cname_rep(name) for name in cnames])
    n_cnames = len(cnames)

    filter_out = set()
    for eid in expanded_set:
        emb = self.get_emb(self.eid2idx[eid])
        sims = cos(cnames_rep, emb)
        disagreements = 0
        for template_idx in range(len(self.ranking_templates)):
            rows = [cname_idx * 6 + template_idx for cname_idx in range(n_cnames)]
            # Keep the k largest similarities of every selected row
            top_sims = np.partition(sims[rows], -self.k, axis=1)[:, -self.k:]
            scores = np.mean(top_sims, axis=1)
            if np.argmax(scores) != pos_idx:
                disagreements += 1
        if disagreements > 2:
            filter_out.add(eid)

    # Class names generated often enough are treated as non-entities
    min_count = GENERATION_SAMPLE_SIZE * len(self.generation_templates) / 6.
    frequent = set()
    for cn in cname2count:
        if cname2count[cn] >= min_count:
            frequent.add(cn)
    frequent.update([self.inflect.plural(cn) for cn in frequent])
    for eid in expanded_set:
        if self.eid2name[eid].lower() in frequent:
            filter_out.add(eid)

    kept = [eid for eid in expanded_set if eid not in filter_out]
    return kept, filter_out
def construct_graph(dataset, features, topk):
    """Write a cosine-similarity kNN edge list for ``features``.

    For every row ``i`` the indices of the ``topk + 1`` most similar rows are
    selected (the extra one leaves room for the row itself) and every
    selected index other than ``i`` is written as a line ``"i j"`` to
    ``../data/<dataset>/knn/tmp.txt``.

    :param dataset: name of the data set directory under ``../data/``
    :param features: 2-D feature matrix, one row per node
    :param topk: number of neighbours to keep per node
    """
    fname = '../data/' + dataset + '/knn/tmp.txt'
    print(fname)
    ##### Kernel
    # dist = -0.5 * pair(features) ** 2
    # dist = np.exp(dist)
    #### Cosine
    dist = cos(features)
    inds = []
    for i in range(dist.shape[0]):
        # argpartition puts the topk + 1 largest entries last (unordered)
        ind = np.argpartition(dist[i, :], -(topk + 1))[-(topk + 1):]
        inds.append(ind)
    # The file is opened only once the neighbours are known and is closed by
    # the context manager even when a write fails.
    with open(fname, 'w') as f:
        for i, v in enumerate(inds):
            for vv in v:
                if vv != i:
                    f.write('{} {}\n'.format(i, vv))
def averageSimilarity(self, segment_embedding):
    """Return the mean cosine similarity between a segment and this object.

    The segment embedding is compared with every vector in
    ``self.embeddings`` and the similarities are averaged.

    :param segment_embedding: 1-D embedding (list or array) of the segment
    :return: ``[mean_similarity, self.weight]`` when the mean is at least
        0.5, otherwise ``None``. ``None`` is also returned, after printing
        ``self.id``, when ``self.embeddings`` is empty (this used to raise
        ``ZeroDivisionError``).
    """
    normalizer = len(self.embeddings)
    if normalizer == 0:
        # Parenthesised form prints the same under Python 2 and Python 3.
        print(self.id)
        return None
    # cos() needs 2-D input: wrap every vector into a single-row matrix.
    segment_embedding = np.array([segment_embedding])
    similarity = 0
    for embedding in self.embeddings:
        embedding = np.array([embedding])
        similarity += cos(embedding, segment_embedding)[0][0]
    average = similarity / normalizer
    if average < 0.5:
        return None
    return [average, self.weight]
def class_name_ranking(self, cname2count, query_set, expanded_set, neg_cnames, prev_cn, margin):
    """Pick the positive class name and extend the set of negative ones.

    Every candidate class name is scored against every entity of
    ``query_set + expanded_set``; the class name with the highest summed
    reciprocal rank becomes the positive one. Single-word class names that
    every query entity ranks more than ``margin`` places below the positive
    class name are added to the negative class names.

    Returns ``(pos_cname, neg_cnames | this_neg_cnames)``.
    """
    current_set = query_set + expanded_set
    # NOTE(review): `ids` is never used in this function.
    ids = []
    # Candidates: class names generated often enough, plus all previous ones
    cnames = [cname for cname in cname2count if cname2count[cname] >= self.gen_thres]
    cnames += [cn for cn in prev_cn if cn not in cnames]
    cname2idx = {cname:i for i, cname in enumerate(cnames)}
    cnames_rep = np.vstack([self.get_cname_rep(cname) for cname in cnames])
    # scores[i, j]: score of class name j for entity i (rows stay 0 when the
    # entity is skipped below)
    scores = np.zeros((len(current_set), len(cnames)))
    for i, eid in enumerate(current_set):
        emb = self.get_emb(self.eid2idx[eid])
        if len(emb) < self.k:
            continue
        sims = cos(cnames_rep, emb)
        for j in range(len(cnames)):
            # Rows j*6 .. j*6+5 belong to class name j: take the column-wise
            # maximum over them, then average the k largest values.
            scores[i, j] = np.mean(np.partition(np.amax(sims[j*6:(j+1)*6], axis=0), -self.k)[-self.k:])
    cname2mrr=ddict(float)
    for eid, score in zip(current_set, scores):
        r = 0.
        for i in np.argsort(-score):
            cname = cnames[i]
            # Rarely generated class names are skipped unless they were
            # already used before.
            # NOTE(review): cname2count[cname] is looked up for names that
            # may come only from prev_cn - presumably cname2count is a
            # defaultdict/Counter; confirm.
            if cname2count[cname] < min(GENERATION_SAMPLE_SIZE*len(self.generation_templates)*POS_CNAME_THRES, max(cname2count.values())) and cname not in prev_cn:
                continue
            r += 1
            cname2mrr[cname] += 1 / r
    pos_cname = sorted(cname2mrr.keys(), key=lambda x: cname2mrr[x], reverse=True)[0]
    # find negative entities
    # Single-word class names that are not a suffix of the positive one
    uni_cnames = [cname for cname in cnames if len(cname.split(' ')) == 1 and not pos_cname.endswith(cname)]
    this_neg_cnames = set(uni_cnames)
    # zip() pairs query_set with the first len(query_set) rows of scores,
    # which are the query entities because current_set starts with query_set.
    for eid, score in zip(query_set, scores):
        ranked_uni_cnames = sorted([pos_cname]+uni_cnames, key=lambda x: score[cname2idx[x]], reverse=True)
        # After the loop `i` is the rank of the positive class name
        for i, cname in enumerate(ranked_uni_cnames):
            if cname == pos_cname:
                break
        # Keep only names ranked more than `margin` places below it
        this_neg_cnames = this_neg_cnames & set(ranked_uni_cnames[i+1+margin:])
    return pos_cname, neg_cnames | this_neg_cnames
def class_guided_expansion(self, pos_cname, current_set, set_text, neg_set):
    """Rank candidate entities for expanding ``current_set``.

    Masked queries are built from the expansion templates, the positive class
    name and three sampled entity names. Every resulting mask representation
    is compared with the embeddings of the 500 globally most similar
    keywords; local and global scores are combined and turned into a summed
    reciprocal rank per candidate.

    Returns the candidate entity ids sorted by that score, cut to at least 15
    entries (see the final loop).
    """
    global_idx_generator = self.rand_idx(len(current_set))
    local_idx_generator = self.rand_idx(len(current_set))
    # Similarity between the mean vector of every current entity and all means
    global_scores = cos(
        self.means[[self.eid2idx[eid] for eid in current_set]], self.means)
    # 1. Combine the pattern and the entity names into the query text
    ids = []
    for _ in range(EXPANSION_SAMPLE_SIZE):
        for template in self.expansion_templates:
            indices = []
            # Draw 3 distinct entities at random
            for n in local_idx_generator:
                if n not in indices:
                    indices.append(n)
                    if len(indices) == 3:
                        break
            fill_in = [self.tokenizer.mask_token
                       ] + [set_text[i] for i in indices]
            fill_in = np.random.permutation(fill_in)
            # fill_in: ['MASK','entity1_name','entity2_name','entity3_name'] in shuffled order
            text = template[0] + pos_cname + template[1]
            text = text.format(*fill_in)
            ids.append(self.tokenizer.encode(text, max_length=512))
    mask_rep = self.get_mask_rep(ids)
    # mask_rep: one representation per query, used to score candidate entities
    eid2mrr = ddict(float)
    for local_rep in mask_rep:
        indices = []
        # Again draw 3 distinct entities, this time for the global score
        for n in global_idx_generator:
            if n not in indices:
                indices.append(n)
                if len(indices) == 3:
                    break
        this_global_score = np.mean(global_scores[indices], axis=0)
        this_global_score_ranking = np.argsort(-this_global_score)
        # keywords is the collection of eids
        # NOTE(review): 500 is hard-coded here and below; the loop over
        # range(500) assumes at least 500 keywords - confirm.
        this_keywords = [
            self.keywords[i] for i in this_global_score_ranking[:500]
        ]  # eid
        this_global_score = [
            this_global_score[i] for i in this_global_score_ranking[:500]
        ]
        this_embs = [
            self.get_emb(i)
            for i in [self.eid2idx[eid] for eid in this_keywords]
        ]  # eid->index->embedding
        # Start offset of every entity inside the stacked embedding matrix
        this_entity_pos = [0] + list(
            np.cumsum([len(emb) for emb in this_embs]))
        this_embs = np.vstack(this_embs)
        raw_local_scores = cos(local_rep[np.newaxis, :], this_embs)[0]
        local_scores = np.zeros((500, ))
        for i in range(500):
            start_pos = this_entity_pos[i]
            end_pos = this_entity_pos[i + 1]
            if end_pos - start_pos < self.k:
                # Too few embeddings: tiny score, so the log below is very
                # negative
                local_scores[i] = 1e-8
            else:
                # Mean of the k largest similarities of this entity
                local_scores[i] = np.mean(
                    np.partition(raw_local_scores[start_pos:end_pos],
                                 -self.k)[-self.k:])
        # Local score weighted 5x against the global score, in log space
        scores = 5 * np.log(local_scores) + np.log(this_global_score)
        r = 0.
        for i in np.argsort(-scores):
            eid = this_keywords[i]
            # NOTE(review): set(current_set) is rebuilt on every iteration.
            if eid not in set(current_set) and eid not in neg_set:
                r += 1
                eid2mrr[eid] += 1 / r
            # Only the 20 best new candidates of every query are counted
            if r >= 20:
                break
    eid_rank = sorted(eid2mrr, key=lambda x: eid2mrr[x], reverse=True)
    # After the loop `i` is the position of the first entity below the cut-off
    # NOTE(review): `i` comes from the loop above when eid_rank is empty.
    for i, eid in enumerate(eid_rank):
        if eid2mrr[eid] < EXPANSION_SAMPLE_SIZE * len(
                self.expansion_templates) * 0.2:
            break
    return eid_rank[:max(15, i)]  # 10 candidates for one round
def main(args):
    """Learn a linear projection from CUI vectors into the GloVe space.

    The projection is fitted with SGD on (cui, word) pairs returned by
    ``build_cuiword_pairs``, minimising the mean squared error between the
    projected CUI vectors and the word vectors. The projected full CUI matrix
    is written in word2vec text format to a temporary file, reloaded with
    gensim and saved to the output path.

    :param args: ``[cui vecs path, glove vecs path, MRCONSO file, output file]``
    Exits with status -1 when fewer than four arguments are given.
    """
    if len(args) < 4:
        sys.stderr.write(
            'Four required arguments: <cui vecs path> <glove vecs path> <MRCONSO file> <output file>\n'
        )
        sys.exit(-1)

    num_epochs = 500
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Pairs whose similarity is printed every 100 epochs as a sanity check
    example_pairs = [('C0021400', 'influenza'), ('C0006826', 'cancer'),
                     ('C0004057', 'aspirin'), ('C0027497', 'nausea'),
                     ('C0030193', 'pain')]

    print("Reading cui vectors from %s" % (args[0]))
    cui_vecs = gensim.models.KeyedVectors.load(args[0])
    print("Reading word vectors from %s" % (args[1]))
    glove_vecs = gensim.models.KeyedVectors.load(args[1])

    print("Finding one-word terms in UMLS with CUIs in our cui vectors")
    cui_word_pairs = build_cuiword_pairs(args[2], cui_vecs, glove_vecs)
    align_size = len(cui_word_pairs)
    print(' Found %d pairs of cuis and words' % (len(cui_word_pairs)))

    ## Build reduced w2v matrices for computing the linear projection
    print("Filling reduced glove and cui matrices with %d rows" %
          (align_size))
    cui_matrix = np.zeros([align_size, cui_vecs.vector_size],
                          dtype='float32')
    glove_matrix = np.zeros([align_size, glove_vecs.vector_size],
                            dtype='float32')
    for row in range(align_size):
        cui_matrix[row, :] += cui_vecs[cui_word_pairs[row][0]]
        glove_matrix[row, :] += glove_vecs[cui_word_pairs[row][1]]

    ## Build full cui matrix for computing projections
    print("Building full cui matrix for computing projections")
    full_cui_matrix = torch.zeros([len(cui_vecs.vocab),
                                   cui_vecs.vector_size]).to(device)
    for row in range(len(cui_vecs.vocab)):
        cui = cui_vecs.index2word[row]
        full_cui_matrix[row, :] += torch.tensor(cui_vecs[cui]).to(device)

    cui_matrix = torch.tensor(cui_matrix).to(device)
    glove_matrix = torch.tensor(glove_matrix).to(device)

    projection = torch.zeros([cui_vecs.vector_size,
                              glove_vecs.vector_size]).to(device)
    projection.normal_()
    projection.requires_grad_()

    loss = nn.MSELoss()
    optimizer = optim.SGD([projection],
                          lr=1.0,
                          weight_decay=0.01,
                          momentum=0.9)

    for epoch in range(num_epochs + 1):
        if epoch % 100 == 0:
            full_projection = torch.matmul(
                full_cui_matrix, projection).detach().cpu().numpy()
            for pair in example_pairs:
                cui, word = pair
                cui_vector = full_projection[cui_vecs.vocab[cui].index]
                word_vector = glove_vecs[word]
                sim = cos(cui_vector.reshape(1, -1),
                          word_vector.reshape(1, -1))
                # cos() returns a (1, 1) array; format its single element
                # instead of relying on implicit array-to-float conversion.
                print("Similarity between %s and %s is %f" %
                      (cui, word, sim[0][0]))

        optimizer.zero_grad()
        similarity = torch.matmul(cui_matrix, projection) - glove_matrix
        epoch_loss = loss(similarity, torch.zeros_like(glove_matrix))
        epoch_loss.backward()
        optimizer.step()
        if epoch % 100 == 0:
            print("Loss in epoch %d is %f" % (epoch, epoch_loss))

    # Now project our GLOVE matrix with this learned projection and write as gensim 100d model
    print("Projecting full cui matrix into learned space")
    full_projection = torch.matmul(full_cui_matrix,
                                   projection).detach().cpu().numpy()

    print("Writing gensim file to %s" % (args[3]))
    # The context manager closes (and thereby deletes) the temporary file
    # even when loading fails; it used to stay open until garbage collection.
    with tempfile.NamedTemporaryFile(mode='wt') as tf:
        tf.write('%d %d\n' %
                 (full_projection.shape[0], full_projection.shape[1]))
        for cui_ind in range(full_projection.shape[0]):
            cui = cui_vecs.index2word[cui_ind]
            vec = list(full_projection[cui_ind, :])
            str_vec = [str(x) for x in vec]
            tf.write('%s %s\n' % (cui, ' '.join(str_vec)))
        # gensim re-opens the file by name, so buffered text must be on disk.
        tf.flush()
        gs_new_vecs = gensim.models.KeyedVectors.load_word2vec_format(tf.name)
    gs_new_vecs.save(args[3])
def STS_eval(sentset, model, data_path, bucket_bounds=(81, 162, 227)):
    """
    Evaluate sentence similarities on an STS 2014 test set, per length bucket.

    The sentence pairs and gold scores are read from
    ``<data_path>sts-en-test-gs-2014/``, embedded and sorted by
    ``sort_length_embedding_sts`` and split into consecutive buckets. For
    every bucket the cosine similarity of each sentence pair is correlated
    with the gold standard.

    :param sentset: string, sentence dataset
    :param model: sentence embedding model
    :param data_path: string, directory prefix of the STS data (must end with
        a path separator because it is concatenated as is)
    :param bucket_bounds: increasing indices at which the sorted pairs are
        cut into buckets; the default gives the buckets
        [0, 81), [81, 162), [162, 227) and [227, end)
    :return: two lists with one entry per bucket: the pearson and the
        spearman coefficients compared to gold standard
    """
    with open(data_path + 'sts-en-test-gs-2014/STS.input.' + sentset +
              '.txt') as sent_file:
        sent_data = sent_file.readlines()
    with open(data_path + 'sts-en-test-gs-2014/STS.gs.' + sentset +
              '.txt') as gs_file:
        gs_data = np.array(gs_file.readlines(), dtype=float)

    # Every input line holds the two sentences separated by tabs
    splited_sent = np.array([re.split(r'\t+', line) for line in sent_data])
    sent_1 = splited_sent[:, 0]
    sent_2 = splited_sent[:, 1]
    x_1, x_2, y, _ = sort_length_embedding_sts(sent_1, sent_2, gs_data, model)

    # A trailing None makes the last slice run to the end of the data
    bounds = (0, ) + tuple(bucket_bounds) + (None, )
    pearsons = []
    spearmanrs = []
    for start, stop in zip(bounds[:-1], bounds[1:]):
        gold = y[start:stop]
        sims = [
            cos([v1], [v2])[0][0]
            for v1, v2 in zip(x_1[start:stop], x_2[start:stop])
        ]
        pearsons.append(pearsonr(sims, gold)[0])
        spearmanrs.append(spearmanr(sims, gold)[0])
    return pearsons, spearmanrs
def sklearn_experiment(training_space, test_space, target_contexts, nn=1, diag_value=None, extra_info=False):
    """
    Categorize every test word from its nearest neighbours in the training space.

    :param training_space:  a dictionary of dictionaries mapping each word to all the contexts it co-occurred
                            with in the training set, and then to the corresponding co-occurrence count
    :param test_space:      same structure as training_space, harvested from the test set
    :param target_contexts: an iterable containing all the contexts that were used to collect co-occurrences
    :param nn:              the number of nearest neighbours to be considered when categorizing a test word
    :param diag_value:      when not None, every cell on the main diagonal of the cosine similarity matrix is
                            overwritten with this value (the default None leaves the matrix untouched). Diagonal
                            cells compare the training and test vector of the same word, so a low value keeps a
                            word from being retrieved as its own neighbour
    :param extra_info:      if True, each word in the output is also mapped to its nearest neighbours
                            ('neighbors'), the closest distance ('cosine') and the tallied tags of the
                            neighbours ('tag_distribution')
    :return:                a tuple (hits, cosine_similarities, word_indices). hits maps each test word to
                            'predicted' (the predicted PoS tag), 'correct' (the part of the word string
                            before the first '|') and 'accuracy' (1 if they match, else 0)
    """

    # Index the union of training and test words, and the contexts, so that both spaces can be stored as
    # matrices sharing the same row and column layout.
    test_words = set(test_space.keys())
    all_words = test_words.union(set(training_space.keys()))
    context_indices = sort_words(target_contexts)
    word_indices = sort_words(all_words)
    index_to_word = {}
    for word_string, word_index in word_indices.items():
        index_to_word[word_index] = word_string

    training_matrix = dict2matrix(training_space, word_indices, context_indices)
    test_matrix = dict2matrix(test_space, word_indices, context_indices)

    # Rows are training words, columns are test words.
    cosine_similarities = cos(training_matrix, test_matrix)
    if diag_value is not None:
        diagonal = np.diag_indices_from(cosine_similarities)
        cosine_similarities[diagonal] = diag_value

    hits = defaultdict(dict)
    for word in test_words:
        # Neighbours are all training words found within the nn-th closest distance of the test word.
        column = word_indices[word]
        nearest_indices, closest_distance = knn.get_nearest_indices(cosine_similarities, column, nn=nn)
        nearest_neighbors = knn.get_nearest_neighbors(nearest_indices[0], index_to_word)

        # Tally the PoS tags of the neighbours and let knn.categorize pick the predicted tag.
        tallied_tags = knn.tally_tags(nearest_neighbors)
        predicted = knn.categorize(tallied_tags, nearest_neighbors, training_matrix, word_indices)

        outcome = hits[word]
        outcome['predicted'] = predicted
        outcome['correct'] = word.split('|')[0]
        if outcome['predicted'] == outcome['correct']:
            outcome['accuracy'] = 1
        else:
            outcome['accuracy'] = 0

        if extra_info:
            outcome['neighbors'] = nearest_neighbors
            outcome['cosine'] = closest_distance
            outcome['tag_distribution'] = tallied_tags

    return hits, cosine_similarities, word_indices
def sklearn_experiment(training_space, training_words, test_space=None, test_words=None, contexts=None,
                       pos_mapping=None, nn=1, diag_value=None, plot=''):
    """
    Categorize words from their nearest neighbours, in a training-test or a leave-one-out setting.

    :param training_space:  a 2d NumPy array storing word-context co-occurrence counts derived from the
                            training corpus
    :param training_words:  a dictionary mapping words from the training space to the corresponding row
                            indices in the training space
    :param test_space:      a 2d NumPy array storing word-context co-occurrence counts derived from the
                            test corpus
    :param test_words:      a dictionary mapping words from the test space to the corresponding row
                            indices in the test space
    :param contexts:        a dictionary mapping contexts to their column indices in the training and test
                            spaces
    :param pos_mapping:     a dictionary mapping the PoS tags found in the word strings to custom tags
    :param nn:              the number of nearest neighbours to be considered when categorizing a word
    :param diag_value:      training-test setting only: when not None, every cell on the main diagonal of
                            the cosine similarity matrix is overwritten with this value (the default None
                            leaves it untouched). In the leave-one-out setting the diagonal is always set
                            to 0
    :param plot:            a string indicating the path where the plot showing the cosine similarity
                            matrix is saved; the default empty string means that no plot is created
    :return:                a tuple (hits, cosine_similarities, word_indices). hits maps each target word
                            to 'predicted', 'correct' (the part of the word string before the first '~',
                            passed through pos_mapping when given) and 'accuracy' (1 if they match, else 0)
    :raises ValueError:     if some but not all of test_space, test_words and contexts are provided
    """

    # Either none of the three test-related arguments is given (leave-one-out) or all of them are
    # (training-test); anything in between is ambiguous.
    provided = [test_space is not None, test_words is not None, contexts is not None]
    if sum(provided) not in (0, 3):
        raise ValueError(
            'Unsure whether to use a leave-one-out or training-test approach! '
            'If you want to run a leave-one-out experiment, do not provide any argument to the parameters'
            ' test_space, test_words, and contexts. If, however, you want to perform an experiment in the'
            ' training-test setting, provide appropriate arguments to all three parameters.'
        )

    hits = defaultdict(dict)

    if test_space is None:
        # Leave-one-out: training words are categorized from the other training words. The diagonal
        # compares each vector with itself (cosine 1), so it is forced to 0 to keep a word from being
        # its own nearest neighbour.
        target_words = training_words
        words = training_words
        word_indices = sort_items(words)
        inverted_word_indices = {idx: item for item, idx in word_indices.items()}
        cosine_similarities = cos(training_space)
        cosine_similarities[np.diag_indices_from(cosine_similarities)] = 0
    else:
        # Training-test: test words are categorized from their neighbours in the training set. Both
        # spaces are rebuilt over the union of the two vocabularies so rows line up across matrices.
        target_words = test_words
        words = set(training_words.keys()).union(set(test_words.keys()))
        word_indices = sort_items(words)
        inverted_word_indices = {idx: item for item, idx in word_indices.items()}
        training_space = make_matrix(training_space, word_indices, training_words, contexts)
        test_space = make_matrix(test_space, word_indices, test_words, contexts)
        cosine_similarities = cos(training_space, test_space)
        # Diagonal cells compare the training and the test vector of the same word type; overwriting
        # them (typically with 0) keeps a word from being retrieved as its own neighbour.
        if diag_value is not None:
            cosine_similarities[np.diag_indices_from(cosine_similarities)] = diag_value

    if plot:
        plot_matrix(cosine_similarities, neighbors=10, output_path=plot)

    for word in target_words:
        # Neighbours are all words found within the nn-th closest distance of the target word.
        column = word_indices[word]
        nearest_indices = get_nearest_indices(cosine_similarities, column, nn=nn)
        nearest_neighbors = get_nearest_neighbors(nearest_indices[0], inverted_word_indices)

        # categorize() picks the predicted tag from the retrieved neighbours.
        predicted = categorize(nearest_neighbors, training_space, word_indices, pos_mapping=pos_mapping)

        outcome = hits[word]
        outcome['predicted'] = predicted
        if pos_mapping:
            outcome['correct'] = pos_mapping[word.split('~')[0]]
        else:
            outcome['correct'] = word.split('~')[0]
        if outcome['predicted'] == outcome['correct']:
            outcome['accuracy'] = 1
        else:
            outcome['accuracy'] = 0

    return hits, cosine_similarities, word_indices
def sentences_selection(self, topic_words):
    """Return the ids of the sentences that summarise the topic.

    Steps, in order:

    1. keep the articles that contain topic words and whose share of topic
       words exceeds ``self.min_key_freq`` times the pooled share;
    2. rank the sentences of those articles by cosine similarity between
       their embedding and the summed embedding of the topic words;
    3. re-rank the most similar sentences with PageRank, multiplied by the
       similarity to the topic;
    4. scale the ranking with ``self.scale_ranking`` and hand the result to
       ``self.select_non_duplicated_sentences``.

    :param topic_words: list of words in a topic; each word is looked up in
        ``self.vectorizer_articles.vocabulary_``.
    :return: the value returned by ``self.select_non_duplicated_sentences``.
    """
    # Check which articles contain topic words
    topic_words_indices = [
        self.vectorizer_articles.vocabulary_[word] for word in topic_words
    ]
    is_topic_article = (
        self.tf_matrix_articles[:, topic_words_indices].sum(axis=1) > 0
    )
    topic_articles = np.where(is_topic_article)[0]

    # Delete articles that have too few key words
    ix_grid = np.ix_(topic_articles, topic_words_indices)
    topic_words_sums = self.tf_matrix_articles[ix_grid].sum(axis=1)
    all_words_sums = self.tf_matrix_articles[topic_articles, :].sum(axis=1)
    topic_words_freq = np.divide(topic_words_sums, all_words_sums)
    all_articles_mean = np.divide(np.sum(topic_words_sums),
                                  np.sum(all_words_sums))
    selected_articles = np.where(
        topic_words_freq > all_articles_mean * self.min_key_freq)[0]
    # Sets give O(1) membership tests; the previous array/list lookups made
    # both filters below quadratic in the number of articles.
    selected_positions = set(np.asarray(selected_articles).ravel().tolist())
    kept_articles = {
        article_index
        for position, article_index in enumerate(topic_articles)
        if position in selected_positions
    }

    # Select topic sentences (article order is preserved)
    topic_sentences = list(chain.from_iterable(
        article_sentences
        for article_index, article_sentences in enumerate(
            self.sentences_in_articles)
        if article_index in kept_articles
    ))
    topic_sentences_tf_matrix = self.tf_matrix_sentences[topic_sentences, :]

    # Embeddings have already been weighted by lambda during embedding
    # creation, so the topic embedding is a plain sum over its words.
    topic_embedding = np.sum(self.embeddings[topic_words_indices, :],
                             axis=0,
                             keepdims=True)

    # Project the sentence TF matrix into the embedding space. The matrix's
    # own ``dot`` is used in both branches: ``np.dot`` is not aware of scipy
    # sparse matrices and does not compute a matrix product for them.
    if self.use_sparse:
        sentence_embeddings = topic_sentences_tf_matrix.dot(
            self.sparse_embeddings)
    else:
        sentence_embeddings = topic_sentences_tf_matrix.dot(self.embeddings)
    # The product is sparse only when both operands are sparse; densify
    # accordingly instead of calling ``toarray`` unconditionally.
    if hasattr(sentence_embeddings, "toarray"):
        sentence_embeddings = sentence_embeddings.toarray()
    else:
        sentence_embeddings = np.asarray(sentence_embeddings)

    # Similarity between sentences and topic
    ranking = cos(sentence_embeddings, topic_embedding).flatten()

    # Select x% the most similar sentences to the topic
    order = np.argsort(ranking)[::-1]
    selected_number = max(
        math.ceil(len(order) * self.freq_to_lex_rank),
        min(self.min_sent_to_lexrank, math.ceil(len(order) * 0.5)))
    order = order[:selected_number]
    ranking_simil = ranking[order]
    topic_sentences = [topic_sentences[_id] for _id in order]

    # PageRank on selected sentences; self-similarity is zeroed so that a
    # sentence does not vote for itself.
    sentence_embeddings = sentence_embeddings[order, :]
    simil_matrix = cos(sentence_embeddings)
    np.fill_diagonal(simil_matrix, 0.0)
    ranking = page_rank(simil_matrix)
    ranking = ranking.flatten()

    # Multiply PageRank ranking by similarity to the topic
    ranking = np.multiply(ranking, ranking_simil, out=ranking)

    # Scale ranking, working on the raw TF rows of the selected sentences
    topic_sentences_tf_matrix = self.tf_matrix_sentences[topic_sentences, :]
    topic_columns = topic_sentences_tf_matrix[:, topic_words_indices]
    if self.weighted_unique_scaling:
        # Weighting by lambda: share of the topic's total log-lambda mass
        # covered by the distinct topic words present in each sentence.
        topic_lambdas = self.log_lambda_statistics[topic_words_indices]
        unique_topic_words = (topic_columns > 0).todense()
        unique_topic_words = np.dot(unique_topic_words, topic_lambdas)
        unique_topic_words = np.divide(unique_topic_words,
                                       np.sum(topic_lambdas))
    else:
        # Unweighted scaling needs correction for the number of tokens
        unique_topic_words = topic_columns.getnnz(axis=1)
        unique_topic_words = np.divide(unique_topic_words, len(topic_words))
    all_words_sums = topic_sentences_tf_matrix.sum(axis=1)
    ranking = self.scale_ranking(ranking, unique_topic_words, all_words_sums)

    # Order TF matrix by scaled ranking
    order = np.argsort(ranking)[::-1]
    topic_sentences_tf_matrix = topic_sentences_tf_matrix[order, :]

    # Select sentences with non duplicated meaning
    simil_matrix = cos(topic_sentences_tf_matrix)
    return self.select_non_duplicated_sentences(simil_matrix, ranking,
                                                topic_sentences, order)