import argparse
import re
import string
from math import log

import krovetz
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

ks = krovetz.PyKrovetzStemmer()
stop_words = set(stopwords.words('english'))  # | set(string.punctuation)
# word_df = 1


def preprocess(text):
    # Lowercase, strip non-alphanumeric characters, tokenize, remove stopwords, then stem.
    word_tokens = word_tokenize(re.sub(r'[^a-zA-Z0-9]', ' ', text.lower()))
    processed = [ks.stem(w) for w in word_tokens if w not in stop_words]
    return processed


def bm25(qtext, docs_id, docs_body, vocab_words_df, num_docs_collection, avg_docs_len):
    rel_scores = [0 for _ in range(len(docs_id))]
    N = num_docs_collection
    k1 = 1.4
    b = 0.75
    for q in qtext:
        n_q = vocab_words_df.get(q, 0)
        idf_q = log((N - n_q + 0.5) / (n_q + 0.5) + 1)
        for i in range(len(docs_id)):
            # docid = docs_id[i]
            # The original excerpt was cut off here; the standard BM25 term
            # contribution is filled in below, assuming docs_body holds
            # preprocessed token lists.
            tf = docs_body[i].count(q)
            doc_len = len(docs_body[i])
            rel_scores[i] += idf_q * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * doc_len / avg_docs_len))
    return rel_scores
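# Usage sketch (toy data, not from the original file): build document frequencies
# and score a query with bm25() above. Assumes the NLTK 'punkt' and 'stopwords'
# data are installed and relies on the scoring loop completed above.
docs_id = ['d1', 'd2']
docs_body = [preprocess('The quick brown fox jumps over the lazy dog'),
             preprocess('Foxes are quick and clever animals')]
vocab_words_df = {}
for body in docs_body:
    for term in set(body):
        vocab_words_df[term] = vocab_words_df.get(term, 0) + 1
avg_docs_len = sum(len(b) for b in docs_body) / len(docs_body)
query = preprocess('quick fox')
print(bm25(query, docs_id, docs_body, vocab_words_df, len(docs_id), avg_docs_len))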
import re
from urllib.request import urlopen

import krovetz
from bs4 import BeautifulSoup


def scrap_keywords():
    stemmer = krovetz.PyKrovetzStemmer()
    # keyword_list_TERM: {key: {converted_word, count}}
    keyword_list_TERM = {}
    html_page = urlopen("https://en.wikipedia.org/wiki/List_of_Microsoft_software")
    soup = BeautifulSoup(html_page, features="lxml")
    div = soup.find('div', attrs={'id': 'mw-content-text'})
    footer = str(div.contents).rfind("Misc.")
    for link in div.findAll('a', attrs={'href': re.compile("/wiki/")}):  # "^https://")}):
        link_text = link.text.replace('+', '').lower().strip()
        # lemmatize
        link_text = stemmer.stem(link_text)
        # if link_text == 'Microsoft Edge'.lower():
        #     print(stemmer.stem(link_text))
        if link_text in keyword_list_TERM.keys():
            keyword_list_TERM[link_text]['count'] += 1
        else:
            keyword_list_TERM[link_text] = {'converted_word': link_text.replace(" ", "_")}
            keyword_list_TERM[link_text]['count'] = 1

        link_text = link.text.replace('+', '').lower().replace('microsoft', 'ms').strip()
        # lemmatize
        link_text = stemmer.stem(link_text)
        if link_text in keyword_list_TERM.keys():
            keyword_list_TERM[link_text]['count'] += 1
        else:
            keyword_list_TERM[link_text] = {'converted_word': link_text.replace(" ", "_")}
            keyword_list_TERM[link_text]['count'] = 1

        link_text = link.text.replace('+', '').lower().replace('microsoft', '').strip()
        # lemmatize
        link_text = stemmer.stem(link_text)
        if link_text in keyword_list_TERM.keys():
            keyword_list_TERM[link_text]['count'] += 1
        else:
            keyword_list_TERM[link_text] = {'converted_word': link_text.replace(" ", "_")}
            keyword_list_TERM[link_text]['count'] = 1
        # print(link_text)
        if link_text == 'Windows To Go'.lower():
            break

    link_text = 'window'
    if link_text in keyword_list_TERM.keys():
        keyword_list_TERM[link_text]['count'] += 1
    else:
        keyword_list_TERM[link_text] = {'converted_word': link_text}
        keyword_list_TERM[link_text]['count'] = 1

    key_list = list(keyword_list_TERM.keys())
    for key1 in key_list:
        if key1.startswith('list of '):
            del keyword_list_TERM[key1]

    # get Ubuntu Glossaries
    html_page = urlopen("https://help.ubuntu.com/community/Glossary")
    soup = BeautifulSoup(html_page, features="lxml")
    for link in soup.findAll('p', attrs={'class': re.compile("^line")}):  # "^https://")}):
        strong_tag = link.find('strong')
        if strong_tag is not None:
            strong_tag_text = strong_tag.text.lower().strip()
            strong_tag_text = stemmer.stem(strong_tag_text)
            if strong_tag_text == 'Contributors:'.lower():
                break
            if strong_tag_text.find('(or') >= 0 and strong_tag_text.endswith(')'):
                # print('two_glossary={}'.format(strong_tag_text))
                two_glossary = strong_tag_text.split('(or')
                for item in two_glossary:
                    item = item.strip().replace(")", '')
                    item = stemmer.stem(item)
                    if item in keyword_list_TERM.keys():
                        keyword_list_TERM[item]['count'] += 1
                    else:
                        keyword_list_TERM[item] = {'converted_word': item.replace(" ", "_")}
                        keyword_list_TERM[item]['count'] = 1
            else:
                if strong_tag_text.find('(also') >= 0:
                    strong_tag_text = strong_tag_text[:strong_tag_text.find('(also')]
                if strong_tag_text.find('ubuntu') >= 0:
                    strong_tag_text = 'ubuntu'
                if strong_tag_text in keyword_list_TERM.keys():
                    keyword_list_TERM[strong_tag_text]['count'] += 1
                else:
                    keyword_list_TERM[strong_tag_text] = {'converted_word': strong_tag_text.replace(" ", "_")}
                    keyword_list_TERM[strong_tag_text]['count'] = 1
                # print(strong_tag_text)

    print(len(keyword_list_TERM.keys()))

    # Save Keywords
    import pickle
    with open("keyword_list.pickle", 'wb') as f:
        pickle.dump(keyword_list_TERM, f)
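# Usage sketch (not in the original file): reload the dictionary that
# scrap_keywords() pickled above and inspect the most frequent keywords.
import pickle

with open("keyword_list.pickle", 'rb') as f:
    keyword_list_TERM = pickle.load(f)
print(sorted(keyword_list_TERM.items(), key=lambda kv: kv[1]['count'], reverse=True)[:10])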
def krovetz_alternative_stemming_algorithm(tokens):
    ks = krovetz.PyKrovetzStemmer()
    # print("alternative: ", list(map(lambda word: ks.stem(word), tokens)))
    return list(map(lambda word: ks.stem(word), tokens))
def krovetz_stemming_algorithm(tokens):
    ks = krovetz.PyKrovetzStemmer()
    # print("original: ", [ks.stem(word) for word in tokens])
    return [ks.stem(word) for word in tokens]
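# The two wrappers above are equivalent (map vs. list comprehension); a quick
# equivalence check on a small, hypothetical token list.
tokens = ['walked', 'stemming', 'algorithms']
assert krovetz_stemming_algorithm(tokens) == krovetz_alternative_stemming_algorithm(tokens)
print(krovetz_stemming_algorithm(tokens))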
import re

import numpy as np
import pandas as pd
import krovetz as ks
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth
from prefixspan import PrefixSpan

# NOTE: cos_similarity and stopwordlist are helpers defined elsewhere in the
# original project; the imports above are inferred for this excerpt.


def main():
    dblp_data = pd.read_csv(r'DBLP_Dataset.csv', encoding="ISO-8859-1")
    author_title = dblp_data
    dataset = author_title.to_numpy()
    list1 = dataset[:, 2].tolist()

    # convert authors to lower case
    list2 = []
    for i in list1:
        sublist = i.lower().split()
        list2.append(sublist)

    te = TransactionEncoder()
    te_ary = te.fit(list2).transform(list2)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent = fpgrowth(df, min_support=0.001, use_colnames=True)
    frequent = frequent[frequent['itemsets'].str.len() > 1]

    freqauth_list = []
    for i in frequent['itemsets']:
        freqauth_list.append([x for x in i])

    freqauth_dict = {}
    for i in freqauth_list:
        title_idx_sublist = []
        for idx, j in enumerate(list2):
            if set(i).issubset(j):
                title_idx_sublist.append(idx)
        freqauth_dict.update({tuple(i): title_idx_sublist})

    freqauth_title_dict = {}
    kstem = ks.PyKrovetzStemmer()
    for key, value in freqauth_dict.items():
        title_df = author_title.iloc[value]['title']
        title_sublist = list(title_df)
        title_sublists = []
        temp_list = []
        for x in title_sublist:
            tempx = re.sub(r'[.]', '', x)
            temp_list = re.sub(r'[^\x00-\x7F]+', '', tempx).lower().split()
            temp_list2 = []
            if isinstance(temp_list, list):
                temp_list2.append([kstem.stem(z) for z in temp_list if z not in stopwordlist])
                title_sublists.extend(temp_list2)
            else:
                if temp_list not in stopwordlist:
                    title_sublists.extend([kstem.stem(temp_list)])
        freqauth_title_dict.update({key: title_sublists})

    # Closed / Top k titles of frequent authors
    freqauth_title_dict_closed = {}
    for k, v in freqauth_title_dict.items():
        ps = PrefixSpan(v)
        closed_Seq_pattern = ps.topk(5, closed=True)
        freqauth_title_dict_closed.update({k: closed_Seq_pattern})

    # To get frequent author's context indicators
    frequentlist = freqauth_list
    cleanedList = list2
    new_author_list = []
    for i in range(0, len(frequentlist)):
        temp_author_list = []
        authorlist = list(frequentlist[i])
        found = 0
        for k in range(0, len(cleanedList)):
            for j in range(0, len(authorlist)):
                if authorlist[j] in cleanedList[k]:
                    found = 1
                else:
                    found = 0
                    break
            if found == 1:
                for jj in range(0, len(authorlist)):
                    if authorlist[jj] in cleanedList[k]:
                        cleanedList[k].remove(authorlist[jj])
                temp_author_list.append(cleanedList[k])
        new_author_list.append(temp_author_list)

    context_indicator_list = []
    for i in range(0, len(new_author_list)):
        te = TransactionEncoder()
        te_ary = te.fit(new_author_list[i]).transform(new_author_list[i])
        df = pd.DataFrame(te_ary, columns=te.columns_)
        frequent_author_list = fpgrowth(df, min_support=0.5, use_colnames=True)
        supp = frequent_author_list.support.unique()  # all unique support count

        # Dictionary storing itemset with same support count key
        freq_dic = {}
        for i in range(len(supp)):
            inset = list(frequent_author_list.loc[frequent_author_list.support == supp[i]]['itemsets'])
            freq_dic[supp[i]] = inset

        # Dictionary storing itemset with support count <= key
        freq_dic2 = {}
        for i in range(len(supp)):
            inset2 = list(frequent_author_list.loc[frequent_author_list.support <= supp[i]]['itemsets'])
            freq_dic2[supp[i]] = inset2

        # Find Closed frequent itemset
        close_freq = []
        for index, row in frequent_author_list.iterrows():
            isclose = True
            cli = row['itemsets']
            cls = row['support']
            checkset = freq_dic[cls]
            for i in checkset:
                if cli != i:
                    if frozenset.issubset(cli, i):
                        isclose = False
                        break
            if isclose:
                close_freq.append([x for x in row['itemsets']])
        context_indicator_list.append(close_freq)

    freqauth_context_ind_dict = {}
    for authpair, titlelist in freqauth_title_dict_closed.items():
        cleantitlelist = []
        for i in titlelist:
            if isinstance(i, tuple):
                if isinstance(i[1], list):
                    listtostring = ' '.join(i[1])
                    cleantitlelist.append(listtostring)
        freqauth_context_ind_dict.update({authpair: cleantitlelist})

    # Merging both titles and Context indicator author for frequent pattern authors
    for idx, key in enumerate(freqauth_context_ind_dict):
        newval = []
        if len(context_indicator_list[idx]) > 0:
            for i in context_indicator_list[idx]:
                if len(i) > 0:
                    tempstr = '&'.join(i)
                    newval = freqauth_context_ind_dict[key]
                    newval.append(tempstr)
            freqauth_context_ind_dict.update({key: newval})

    # Context Indicator Weighting
    CI_list = list(freqauth_context_ind_dict.values())
    freqauth_context_in_weights = {}
    for key, value in freqauth_context_ind_dict.items():
        freq_auth_CI_list = value
        length_of_CI = len(value)
        temp_dict = {}
        for i in freq_auth_CI_list:
            count_tmp = 0
            for j in CI_list:
                if i in j:
                    count_tmp += 1
            weight = round(1 - ((count_tmp - 1) / count_tmp), 2)
            if weight > 0.1:
                temp_dict.update({i: weight})
        sorted_weights_dict = sorted(temp_dict.items(), key=lambda x: x[1], reverse=True)
        freqauth_context_in_weights.update({key: sorted_weights_dict})

    freq_auth_transactions = {}
    list_of_freq_auth = list(freqauth_context_in_weights.keys())
    for i in range(0, len(freqauth_title_dict)):
        temp_dict = {}
        title_list = freqauth_title_dict.get(list_of_freq_auth[i])
        CI_list = freqauth_context_in_weights[list_of_freq_auth[i]]
        CI_list_auth = []
        for n, c in enumerate(CI_list):
            CI_list_auth.append(c[0])
        for j in range(0, len(title_list)):
            cos_sim = cos_similarity(CI_list_auth, title_list[j])
            cos_sim = round(cos_sim, 3)
            t_title = ' '.join(freqauth_title_dict[list_of_freq_auth[i]][j])
            temp_dict.update({t_title: cos_sim})
        sorted_title_dict = sorted(temp_dict.items(), key=lambda x: x[1], reverse=True)
        t_len = len(list(temp_dict.values()))
        max_len = t_len
        if t_len > 4:
            max_len = 4
        sorted_title_dict1 = dict(list(sorted_title_dict)[0:max_len])
        freq_auth_transactions.update({list_of_freq_auth[i]: sorted_title_dict1})

    # To find the strongest SSP - Match against similarity of the context units
    freq_auth_SSPs = {}
    list_of_freq_auth = list(freqauth_context_ind_dict.keys())
    list_of_freq_auth_CI = list(freqauth_context_ind_dict.values())
    len_list_of_freq_auth_CI = len(list_of_freq_auth_CI)
    context_indicator_similarity = np.zeros([len_list_of_freq_auth_CI, len_list_of_freq_auth_CI], dtype=float)
    for i in range(0, len_list_of_freq_auth_CI):
        for j in range(0, len_list_of_freq_auth_CI):
            cos_sim = cos_similarity(list_of_freq_auth_CI[i], list_of_freq_auth_CI[j])
            cos_sim = round(cos_sim, 3)
            if i != j:
                context_indicator_similarity[i][j] = cos_sim
                context_indicator_similarity[j][i] = cos_sim

    context_indicator_similarity_idx = np.zeros([len_list_of_freq_auth_CI, 3], dtype=int)
    for i in range(0, len(context_indicator_similarity)):
        context_indicator_similarity_idx[i] = np.argsort(context_indicator_similarity[i])[-3:]

    SSP_Author_List = []
    for i in range(0, len(list_of_freq_auth)):
        temp_author_list_ssp = []
        for j in range(0, len(context_indicator_similarity_idx[i])):
            temp_author_list_ssp.append(list_of_freq_auth[context_indicator_similarity_idx[i][j]])
        SSP_Author_List.append(temp_author_list_ssp)

    SSP_Title_List = []
    CI_list_title = list(freqauth_title_dict_closed.values())
    CI_list1 = []
    for i in CI_list_title:
        temp_list3 = []
        for j in i:
            CI_str = ' '.join(j[1])
            temp_list3.append(CI_str)
        CI_list1.append(list(set(temp_list3)))
    for i in range(0, len(CI_list1)):
        temp_title_list_ssp = []
        for j in range(0, len(context_indicator_similarity_idx[i])):
            ssp_str = CI_list1[context_indicator_similarity_idx[i][j]]
            temp_title_list_ssp.extend(ssp_str)
        SSP_Title_List.append(list(set(temp_title_list_ssp)))

    # Write the output to a CSV file
    #  a) list_of_freq_auth
    #  b) list_of_freq_auth_CI / freqauth_context_in_weights
    #  c) freq_auth_transactions
    #  d) SSP_Author_List
    #  e) SSP_Title_List
    # for i in range(0, frequent_author_list):
    #     print(len(SSP_Title_List))
    #     print(SSP_Title_List)
    titles_list_with_weight = list(freq_auth_transactions.values())

    # Joining SSP authors
    SSP_authors_formatted = []
    for i in range(0, len(SSP_Author_List)):
        temp_list = []
        for j in range(0, len(SSP_Author_List[i])):
            authors = '&'.join(list(SSP_Author_List[i][j]))
            temp_list.append(authors)
        SSP_authors_formatted.append(temp_list)

    with open("./output.txt", 'w', encoding="utf-8") as f:
        f.write('Pattern' + '||' + 'Context Indicator' + '||' + 'Transaction 1' + '||' +
                'Transaction 2' + '||' + 'Transaction 3' + '||' + 'Transaction 4' + '||' +
                'SSP - Co-Author' + '||' + 'SSP - Title' + '\n')
        for i in range(0, len(list_of_freq_auth)):
            authors = ' '.join(list(list_of_freq_auth[i]))
            f.write(authors + '||')
            Context_indicators = '; '.join(list_of_freq_auth_CI[i])
            f.write(Context_indicators + '||')
            for j in titles_list_with_weight[i].keys():
                f.write(j + '||')
            ssp_authors = '; '.join(SSP_authors_formatted[i])
            f.write(ssp_authors + '||')
            ssp_titles = '; '.join(SSP_Title_List[i])
            f.write(ssp_titles)
            f.write('\n')
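# main() above relies on a cos_similarity helper (and a stopwordlist) defined
# elsewhere in the original project. A minimal sketch of what such a helper
# could look like, assuming plain token-count vectors over the combined vocabulary:
from collections import Counter
from math import sqrt


def cos_similarity(tokens_a, tokens_b):
    ca, cb = Counter(tokens_a), Counter(tokens_b)
    vocab = set(ca) | set(cb)
    dot = sum(ca[t] * cb[t] for t in vocab)
    norm_a = sqrt(sum(v * v for v in ca.values()))
    norm_b = sqrt(sum(v * v for v in cb.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)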
def stemmer(words):
    ks = krovetz.PyKrovetzStemmer()
    for w in range(len(words)):
        words[w] = ks.stem(words[w])
    return words
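# Example call (hypothetical input); note that the list is stemmed in place
# and also returned.
tokens = ['walked', 'walking', 'walks']
print(stemmer(tokens))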
def run():
    plt.rcParams['figure.dpi'] = 300
    kstemmer = krovetz.PyKrovetzStemmer()

    print('loading w2v model')
    wi_path = 'data/w2v/word_index_stemmed'
    we_path = 'data/w2v/word_embeddings_matrix'
    w2v_model, w2v_wi = load_w2v_model(wi_path, we_path)

    print('loading PWE')
    word_dict_path = 'data/word_index_json'
    wi = load_json(word_dict_path)
    embs_path = 'data/embeddings_dim_50_margin_2.0'
    dict_pt = torch.load(embs_path, map_location='cpu')
    pwe_model = dict_pt["embeddings"]
    """
    keys, sims = get_top_k_closest_words_w2v(kstemmer.stem('cuba'), 30, w2v_wi, w2v_model)
    print('closes words to cuba (w2v): %s' % ', '.join(keys))
    keys, sims = compute_top_k_closest_words_to('cuba', 30, wi, w)
    print('closes words to cuba (pwe): %s' % ', '.join(keys))

    keys, sims = get_top_k_closest_words_w2v(kstemmer.stem('sugar'), 30, w2v_wi, w2v_model)
    print('closes words to sugar (w2v): %s' % ', '.join(keys))
    keys, sims = compute_top_k_closest_words_to('sugar', 30, wi, w)
    print('closes words to sugar (pwe): %s' % ', '.join(keys))

    keys, sims = get_top_k_closest_words_w2v(kstemmer.stem('export'), 30, w2v_wi, w2v_model)
    print('closes words to export (w2v): %s' % ', '.join(keys))
    keys, sims = compute_top_k_closest_words_to('export', 30, wi, w)
    print('closes words to export (pwe): %s' % ', '.join(keys))
    """

    # ['disease', 'osteoporosis', 'fracture', 'bone', 'diet', 'health', 'drug']
    # w_lists = [['osteoporosis', 'disease', 'fracture', 'bone', 'magnesium', 'diet']]
    w_list = ['osteoporosis', 'disease', 'fracture', 'bone', 'magnesium', 'diet']
    # plot_ellipses([w_list], pwe_model, wi)
    plot_ellipses_alt(w_list, pwe_model, wi)
    exit()

    plot_pwe_sim_m([kstemmer.stem(w) for w in w_list], w_list, pwe_model, wi)
    for w in w_list:
        stemmed = kstemmer.stem(w)
        sim = compute_words_sim_pwe(kstemmer.stem('osteoporosis'), stemmed, pwe_model, wi)
        print('PWE similarity between osteoporosis and %s: %2.5f' % (w, sim))
    print()

    print('loading fasttext model')
    ftext_model_path = 'data/wiki.en.bin'
    f = load_model(ftext_model_path)

    print('loading wn embs')
    wn2vec_model = gensim.models.KeyedVectors.load_word2vec_format('data/wn2vec.txt')

    plot_we_scatterplots(w_list, f, model_name='ft')
    plot_we_scatterplots(w_list, wn2vec_model, model_name='wn')
    plot_we_scatterplots(w_list, w2v_model, model_name='w2v', wi=w2v_wi)

    for w in w_list:
        stemmed = kstemmer.stem(w)
        sim = compute_word_sim_wnwe('osteoporosis', w, wn2vec_model)
        print('WNE similarity between osteoporosis and %s: %2.5f' % (w, sim))
    print()

    for w in w_list:
        stemmed = kstemmer.stem(w)
        sim = compute_word_sim_ftt(kstemmer.stem('osteoporosis'), stemmed, f)
        print('FTE similarity between osteoporosis and %s: %2.5f' % (w, sim))
    print()

    for w in w_list:
        stemmed = kstemmer.stem(w)
        sim = compute_word_sim_wnwe(w2v_wi[kstemmer.stem('osteoporosis')],
                                    w2v_wi[kstemmer.stem(w)], w2v_model)
        print('W2V similarity between osteoporosis and %s: %2.5f' % (w, sim))
import io
import json
import os
import pickle
import platform
import string
import subprocess

import krovetz
import numpy as np
from tqdm import tqdm
from whoosh.analysis import StemmingAnalyzer, StandardAnalyzer

kstemmer = krovetz.PyKrovetzStemmer()

# choose correct variable values according to what pc I am using
TREC_EVAL_PATH = '../../trec_eval.8.1/trec_eval'

with open('../data/indri_stoplist_eng.txt', 'r') as slf:
    sw = slf.readlines()
sw = [word.strip() for word in sw]


def save_json(model, output_path):
    with open(output_path, 'w') as outfile:
        json.dump(model, outfile)


def load_json(path):
    with open(path, 'r') as json_file:
        # The excerpt ended here; the obvious counterpart to save_json is filled in.
        return json.load(json_file)
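# A minimal usage sketch (hypothetical helper, not part of the original module)
# combining the kstemmer instance and the Indri stoplist loaded above to
# normalise a piece of text before indexing or evaluation.
def normalise(text):
    tokens = text.lower().split()
    return [kstemmer.stem(t) for t in tokens if t not in sw]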
def plot_ellipses(word_lists, w, wi):
    kstemmer = krovetz.PyKrovetzStemmer()
    np.random.seed(0)
    dim = 50
    plt.grid(True)
    ax = plt.gca()
    for words in word_lists:
        ws = [kstemmer.stem(w.lower()) for w in words]
        basis = None
        widths = []
        heights = []
        angles = []
        xy = []
        labels = []
        basis_eigenvalues = None
        for i in range(len(words)):
            label = words[i]
            w1 = ws[i]
            index = wi[w1]
            w1_m, w1_c = (w[index, 0:dim].view(-1, dim), w[index, dim:].view((-1, dim, dim)))
            w1_c = np.reshape(w1_c.detach().numpy(), newshape=(dim, dim))
            w1_m = np.reshape(w1_m.detach().numpy(), newshape=-1)
            prec_matr = np.linalg.inv(w1_c * w1_c.T)
            eigs = np.linalg.eig(prec_matr)
            norms = [np.linalg.norm(v) for v in eigs[1]]
            eigenvalues = np.abs(eigs[0])
            eigenvectors = np.array([eigs[1][i] / norms[i] for i in range(len(norms))])
            sorted_v = eigenvectors[np.argsort(-eigenvalues)][0:2]
            width = np.sqrt(np.abs(eigenvalues[np.argsort(-eigenvalues)][0]))
            height = np.sqrt(np.abs(eigenvalues[np.argsort(-eigenvalues)][1]))
            if basis is None:
                sorted_eigenvalues = eigenvalues[np.argsort(-eigenvalues)]
                expl_variance = np.abs(np.sum(sorted_eigenvalues[0:2])) / np.sum(sorted_eigenvalues[2:]) * 100
                print('PWE: explained variance: %2.5f' % expl_variance)
                basis_eigenvalues = (width, height)
                basis = sorted_v
                angle = 0
                # center = (0, 0)
                center = (basis_eigenvalues[0] * np.dot(basis[0], w1_m) /
                          (np.linalg.norm(basis[0]) * np.linalg.norm(w1_m)),
                          basis_eigenvalues[1] * (np.dot(basis[1], w1_m) /
                                                  (np.linalg.norm(basis[1]) * np.linalg.norm(w1_m))))
            else:
                proj0 = project(basis, sorted_v[0])
                # proj1 = project(basis, sorted_v[1])
                angle = np.arccos(np.dot(proj0, basis[0]) /
                                  (np.linalg.norm(proj0) * np.linalg.norm(basis[0])))
                # mean = (np.dot(basis[0], w1_m), np.dot(basis[1], w1_m))
                width *= np.linalg.norm(project(basis, eigs[1][0]))
                height *= np.linalg.norm(project(basis, eigs[1][1]))
                center = (width * np.dot(basis[0], w1_m) /
                          (np.linalg.norm(basis[0]) * np.linalg.norm(w1_m)),
                          height * (np.dot(basis[1], w1_m) /
                                    (np.linalg.norm(basis[1]) * np.linalg.norm(w1_m))))
            widths.append(width)
            heights.append(height)
            angles.append(angle)
            xy.append(center)
            labels.append(label)

        texts = []
        colors = []
        xy = [(xy[i][0] * 1, xy[i][1] * 1) for i in range(len(xy))]
        for i in range(len(widths)):
            # widths = [w / max(widths) for w in widths]
            # heights = [h / max(heights) for h in heights]
            # xy = [(xy[i][0] * 2.2, xy[i][1] * 2.2) for i in range(len(xy))]
            print(labels[i])
            print('width=%2.8f, height=%2.8f, x=%2.5f, y=%2.5f' %
                  (widths[i], heights[i], xy[i][0], xy[i][1]))
            ell_color = np.random.rand(3)
            ell = matplotlib.patches.Ellipse(xy=xy[i], width=widths[i], height=heights[i],
                                             angle=angles[i], facecolor=ell_color,
                                             edgecolor=ell_color, fill=True)
            ax.add_patch(ell)
            ell.set_zorder(-1)
            ell.set_alpha(np.random.rand())
            ell.set_alpha(0.5)
            ell.set(label=labels[i], clip_box=ax.bbox)
            # ell.set_edgecolor(ell_color)
            colors.append(ell_color)
            ell.set_facecolor(ell_color)
            texts.append(ax.text(xy[i][0], xy[i][1], labels[i], fontsize=18))
            plt.scatter(xy[i][0], xy[i][1], s=80, alpha=0.8, color=ell_color,
                        edgecolor=ell_color, marker='+')
        adjust_text(texts, expand_text=(1.5, 2.5), expand_points=(2.5, 2.5),
                    expand_objects=(1.9, 2.8), expand_align=(1.8, 1.7),
                    arrowprops=dict(arrowstyle="-|>", color='r', alpha=0.8))

        xmax = -100
        xmin = 100
        ymin = 100
        ymax = -100
        wmax = -100
        hmax = -100
        for i in range(len(xy)):
            coord = xy[i]
            if wmax < widths[i]:
                wmax = widths[i]
            if hmax < heights[i]:
                hmax = heights[i]
            if coord[0] > xmax:
                xmax = coord[0]
            if coord[0] < xmin:
                xmin = coord[0]
            if coord[1] > ymax:
                ymax = coord[1]
            if coord[1] < ymin:
                ymin = coord[1]
        ax.set_xlim(left=int(xmin - wmax / 2), right=int(xmax + wmax / 2))
        ax.set_ylim(bottom=int(ymin - hmax / 2), top=int(ymax + hmax / 2))

    ax.legend()
    plt.show()
    plt.close()
def plot_ellipses_alt(words, w, wi):
    kstemmer = krovetz.PyKrovetzStemmer()
    np.random.seed(0)
    dim = 50
    plt.grid(True)
    plt.axhspan(0, 0, linewidth=2, color='#1f77b4')
    plt.axvline(0)
    ax = plt.gca()
    ws = [kstemmer.stem(w.lower()) for w in words]
    labels = []
    mean_vectors = []
    all_eigenvectors_eigenvalues = []
    for i in range(len(words)):
        w1 = ws[i]
        index = wi[w1]
        w1_m, w1_c = (w[index, 0:dim].view(-1, dim), w[index, dim:].view((-1, dim, dim)))
        w1_c = np.reshape(w1_c.detach().numpy(), newshape=(dim, dim))
        w1_m = np.reshape(w1_m.detach().numpy(), newshape=-1)
        prec_matr = np.linalg.inv(w1_c * w1_c.T)
        eigs = np.linalg.eig(prec_matr)
        norms = [np.linalg.norm(v) for v in eigs[1]]
        eigenvalues = np.abs(eigs[0])
        eigenvectors = np.array([eigs[1][i] / norms[i] for i in range(len(norms))])
        sorted_eigenvalues = eigenvalues[np.argsort(-eigenvalues)]
        sorted_v = eigenvectors[np.argsort(-eigenvalues)][0:2]
        all_eigenvectors_eigenvalues.append((sorted_v, sorted_eigenvalues))
        mean_vectors.append(w1_m)

    reducer = PCA(2)
    # mean_vectors_all = np.array([np.reshape(w[index, 0:dim].view(-1, dim).detach().numpy(), -1)
    #                              for index in range(len(wi.items()))])
    # reducer.fit(mean_vectors_all)
    # centers = reducer.transform(np.array(mean_vectors))
    centers = reducer.fit_transform(np.array(mean_vectors))
    basis = reducer.transform(reducer.components_)

    widths = []
    heights = []
    angles = []
    xy = []
    max_eig0 = np.NINF
    max_eig1 = np.NINF
    for i in range(len(all_eigenvectors_eigenvalues)):
        eigenvectors, eigenvalues = all_eigenvectors_eigenvalues[i]
        if eigenvalues[0] > max_eig0:
            max_eig0 = eigenvalues[0]
        if eigenvalues[1] > max_eig1:
            max_eig1 = eigenvalues[1]
    max_overall = max_eig0
    if max_eig1 > max_overall:
        max_overall = max_eig1

    for i in range(len(all_eigenvectors_eigenvalues)):
        label = words[i]
        eigenvectors, eigenvalues = all_eigenvectors_eigenvalues[i]
        proj_eigs = reducer.transform(np.array(eigenvectors))
        width = np.linalg.norm(proj_eigs[0]) * eigenvalues[0] / max_eig0
        height = np.linalg.norm(proj_eigs[1]) * eigenvalues[1] / max_eig1
        angle = np.arccos(np.dot(proj_eigs[0], basis[0]) /
                          (np.linalg.norm(proj_eigs[0]) * np.linalg.norm(basis[0])))
        widths.append(width)
        heights.append(height)
        angles.append(angle)
        x = centers[i][0]
        y = centers[i][1]
        xy.append((x, y))
        labels.append(label)

    texts = []
    colors = []
    xy = [(xy[i][0] * 1, xy[i][1] * 1) for i in range(len(xy))]
    for i in range(len(widths)):
        # widths = [w / max(widths) for w in widths]
        # heights = [h / max(heights) for h in heights]
        # xy = [(xy[i][0] * 2.2, xy[i][1] * 2.2) for i in range(len(xy))]
        print(labels[i])
        print('width=%2.8f, height=%2.8f, x=%2.5f, y=%2.5f' %
              (widths[i], heights[i], xy[i][0], xy[i][1]))
        ell_color = np.random.rand(3)
        ell = matplotlib.patches.Ellipse(xy=xy[i], width=widths[i], height=heights[i],
                                         angle=angles[i], facecolor=ell_color,
                                         edgecolor=ell_color, fill=True)
        ax.add_patch(ell)
        ell.set_zorder(-1)
        ell.set_alpha(np.random.rand())
        ell.set_alpha(0.5)
        ell.set(label=labels[i], clip_box=ax.bbox)
        colors.append(ell_color)
        ell.set_facecolor(ell_color)
        texts.append(ax.text(xy[i][0], xy[i][1], labels[i], fontsize=18))
        plt.scatter(xy[i][0], xy[i][1], s=80, alpha=0.8, color=ell_color,
                    edgecolor=ell_color, marker='+')
    adjust_text(texts, expand_text=(1.5, 2.5), expand_points=(2.5, 2.5),
                expand_objects=(1.9, 2.8), expand_align=(1.8, 1.7),
                arrowprops=dict(arrowstyle="-|>", color='r', alpha=0.8))

    xmax = -100
    xmin = 100
    ymin = 100
    ymax = -100
    wmax = -100
    hmax = -100
    for i in range(len(xy)):
        coord = xy[i]
        if wmax < widths[i]:
            wmax = widths[i]
        if hmax < heights[i]:
            hmax = heights[i]
        if coord[0] > xmax:
            xmax = coord[0]
        if coord[0] < xmin:
            xmin = coord[0]
        if coord[1] > ymax:
            ymax = coord[1]
        if coord[1] < ymin:
            ymin = coord[1]
    ax.set_xlim(left=xmin - wmax / 2 - 0.05, right=xmax + wmax / 2 + 0.05)
    ax.set_ylim(bottom=ymin - hmax / 2 - 0.05, top=ymax + hmax / 2 + 0.05)
    ax.legend(prop={'size': 13})
    plt.savefig('ellipses_final.png')
    plt.show()
    plt.close()
def plot_we_scatterplots(word_list, model, model_name='ft', wi=None):
    labels = word_list
    kstemmer = krovetz.PyKrovetzStemmer()
    reducer = PCA(2)
    if model_name == 'ft':
        vecs = reducer.fit_transform(
            np.array([model.get_word_vector(kstemmer.stem(w)) for w in word_list]))
    elif model_name == 'wn':
        word_list = [w for w in word_list if w in model.wv.vocab]
        labels = word_list
        vecs = reducer.fit_transform(model[[w for w in word_list]])
    else:
        vecs = reducer.fit_transform([model[wi[kstemmer.stem(w)]] for w in word_list])

    fig = plt.figure()
    plt.grid(True)
    plt.axhspan(0, 0, linewidth=2, color='#1f77b4')
    plt.axvline(0)
    ax = plt.gca()
    ax.set_xlim(left=-10, right=10)
    ax.set_ylim(bottom=-10, top=10)
    texts = []
    xy = []
    for i in range(len(vecs)):
        reduced = vecs[i]
        label = labels[i]
        ax.scatter(reduced[0], reduced[1], s=100, alpha=1.0, color='b', marker='+')
        # texts.append(ax.text(reduced[0], reduced[1], label, fontsize=14))
        if label == 'osteoporosis' and model_name == 'wn':
            ax.annotate(label, (reduced[0], reduced[1]), fontsize=18, ha='right')
        else:
            ax.annotate(label, (reduced[0], reduced[1]), fontsize=18)
        xy.append(reduced)
    # adjust_text(texts, expand_text=(0.01, 0.02), arrowprops=dict(arrowstyle="-|>", color='r', alpha=0.0))

    if model_name == 'ft':
        plt.title('FTE')
    elif model_name == 'wn':
        plt.title('WNE')
    else:
        plt.title('W2V')

    xmax = -100
    xmin = 100
    ymin = 100
    ymax = -100
    for i in range(len(xy)):
        coord = xy[i]
        if coord[0] > xmax:
            xmax = coord[0]
        if coord[0] < xmin:
            xmin = coord[0]
        if coord[1] > ymax:
            ymax = coord[1]
        if coord[1] < ymin:
            ymin = coord[1]
    ax.set_xlim(left=xmin - 1, right=xmax + 1)
    ax.set_ylim(bottom=ymin - 1, top=ymax + 1)
    plt.show()
    plt.close(fig)
def test_do_simple_stem(self):
    ks = krovetz.PyKrovetzStemmer()
    self.assertEqual(ks.stem("walked"), "walk")
    self.assertEqual(ks.stem("run"), "run")
def test_stem(benchmark, word):
    ks = krovetz.PyKrovetzStemmer()
    result = benchmark(stem_many, ks, word)
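# The pytest-benchmark test above relies on a stem_many helper and a
# parametrised word fixture defined elsewhere. A minimal sketch of such a
# helper, assuming it simply stems the word in a loop so the benchmark measures
# steady-state stemming cost rather than setup:
def stem_many(ks, word, n=1000):
    for _ in range(n):
        ks.stem(word)
    return ks.stem(word)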