import argparse
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import krovetz
import string
import re
from math import log

ks = krovetz.PyKrovetzStemmer()
stop_words = set(stopwords.words('english'))  # | set(string.punctuation)

#word_df = 1


def preprocess(text):
    word_tokens = word_tokenize(re.sub(r'[^a-zA-Z0-9]', ' ', text.lower()))
    processed = [ks.stem(w) for w in word_tokens if w not in stop_words]
    return processed


def bm25(qtext, docs_id, docs_body, vocab_words_df, num_docs_collection,
         avg_docs_len):
    rel_scores = [0 for i in range(len(docs_id))]
    N = num_docs_collection
    k1 = 1.4
    b = 0.75
    for q in qtext:
        n_q = vocab_words_df.get(q, 0)
        idf_q = log(((N - n_q + 0.5) / (n_q + 0.5) + 1))
        for i in range(len(docs_id)):
            # docid = docs_id[i]
            # The original snippet was truncated here; the following lines are
            # a hedged completion using the standard BM25 term score, assuming
            # docs_body[i] holds the preprocessed tokens of docs_id[i].
            f_qd = docs_body[i].count(q)
            doc_len = len(docs_body[i])
            rel_scores[i] += idf_q * (f_qd * (k1 + 1)) / (
                f_qd + k1 * (1 - b + b * doc_len / avg_docs_len))
    return rel_scores
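
# Hedged usage sketch (not in the original snippet): a toy in-memory
# collection showing how preprocess() and bm25() fit together. The documents,
# query, and derived statistics below are illustrative only.
def _bm25_demo():
    docs_id = ['d1', 'd2']
    docs_body = [preprocess('Cats chased the mouse in the garden.'),
                 preprocess('Dogs chase cats around the house.')]
    # Document frequency of each stemmed term in the toy collection.
    vocab_words_df = {}
    for body in docs_body:
        for term in set(body):
            vocab_words_df[term] = vocab_words_df.get(term, 0) + 1
    avg_len = sum(len(b) for b in docs_body) / len(docs_body)
    scores = bm25(preprocess('cat chasing'), docs_id, docs_body,
                  vocab_words_df, len(docs_body), avg_len)
    print(dict(zip(docs_id, scores)))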
Example #2
# Imports assumed by this example (not shown in the original snippet):
from urllib.request import urlopen
from bs4 import BeautifulSoup


def scrap_keywords():
    stemmer = krovetz.PyKrovetzStemmer()
    # keyword_list_TERM: {key:{converted_word, count}}
    keyword_list_TERM = {}
    html_page = urlopen(
        "https://en.wikipedia.org/wiki/List_of_Microsoft_software")
    soup = BeautifulSoup(html_page, features="lxml")
    div = soup.find('div', attrs={'id': 'mw-content-text'})
    footer = str(div.contents).rfind("Misc.")

    for link in div.findAll('a',
                            attrs={'href':
                                   re.compile("/wiki/")}):  #"^https://")}):
        link_text = link.text.replace('+', '').lower().strip()
        # lemmatize
        link_text = stemmer.stem(link_text)
        #if link_text=='Microsoft Edge'.lower():
        #    print(stemmer.stem(link_text))
        if link_text in keyword_list_TERM.keys():
            keyword_list_TERM[link_text]['count'] += 1
        else:
            keyword_list_TERM[link_text] = {
                'converted_word': link_text.replace(" ", "_")
            }
            keyword_list_TERM[link_text]['count'] = 1

        link_text = link.text.replace('+',
                                      '').lower().replace('microsoft',
                                                          'ms').strip()
        # lemmatize
        link_text = stemmer.stem(link_text)
        if link_text in keyword_list_TERM.keys():
            keyword_list_TERM[link_text]['count'] += 1
        else:
            keyword_list_TERM[link_text] = {
                'converted_word': link_text.replace(" ", "_")
            }
            keyword_list_TERM[link_text]['count'] = 1

        link_text = link.text.replace('+',
                                      '').lower().replace('microsoft',
                                                          '').strip()
        # lemmatize
        link_text = stemmer.stem(link_text)
        if link_text in keyword_list_TERM.keys():
            keyword_list_TERM[link_text]['count'] += 1
        else:
            keyword_list_TERM[link_text] = {
                'converted_word': link_text.replace(" ", "_")
            }
            keyword_list_TERM[link_text]['count'] = 1

        #print (link_text)
        if link_text == 'Windows To Go'.lower():
            break

    link_text = 'window'
    if link_text in keyword_list_TERM.keys():
        keyword_list_TERM[link_text]['count'] += 1
    else:
        keyword_list_TERM[link_text] = {'converted_word': link_text}
        keyword_list_TERM[link_text]['count'] = 1
    key_list = list(keyword_list_TERM.keys())

    for key1 in key_list:
        if key1.startswith('list of '):
            del keyword_list_TERM[key1]

    # get Ubuntu Glossaries
    html_page = urlopen("https://help.ubuntu.com/community/Glossary")
    soup = BeautifulSoup(html_page, features="lxml")
    for link in soup.findAll('p',
                             attrs={'class':
                                    re.compile("^line")}):  #"^https://")}):
        strong_tag = link.find('strong')
        if not strong_tag == None:
            strong_tag_text = strong_tag.text.lower().strip()
            strong_tag_text = stemmer.stem(strong_tag_text)
            if strong_tag_text == 'Contributors:'.lower():
                break
            if strong_tag_text.find('(or') >= 0 and strong_tag_text.endswith(
                    ')'):
                #print('two_glossary={}'.format(strong_tag_text))
                two_glossary = strong_tag_text.split('(or')
                for item in two_glossary:
                    item = item.strip().replace(")", '')
                    item = stemmer.stem(item)
                    if item in keyword_list_TERM.keys():
                        keyword_list_TERM[item]['count'] += 1
                    else:
                        keyword_list_TERM[item] = {
                            'converted_word': item.replace(" ", "_")
                        }
                        keyword_list_TERM[item]['count'] = 1
            else:
                if strong_tag_text.find('(also') >= 0:
                    strong_tag_text = strong_tag_text[:strong_tag_text.
                                                      find('(also')]
                strong_tag_text = strong_tag_text
                if strong_tag_text.find('ubuntu') >= 0:
                    strong_tag_text = 'ubuntu'
                if strong_tag_text in keyword_list_TERM.keys():
                    keyword_list_TERM[strong_tag_text]['count'] += 1
                else:
                    keyword_list_TERM[strong_tag_text] = {
                        'converted_word': strong_tag_text.replace(" ", "_")
                    }
                    keyword_list_TERM[strong_tag_text]['count'] = 1
            #print (strong_tag_text)

    print(len(keyword_list_TERM.keys()))
    # Save Keywords
    import pickle
    with open("keyword_list.pickle", 'wb') as f:
        pickle.dump(keyword_list_TERM, f)
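
# Hedged refactoring sketch (not in the original): the three near-identical
# count-or-insert blocks in scrap_keywords() could be collapsed into a helper.
def add_keyword(keyword_list, term):
    # Increment the count for an existing term, or register it with a
    # whitespace-to-underscore "converted_word" form and a count of 1.
    if term in keyword_list:
        keyword_list[term]['count'] += 1
    else:
        keyword_list[term] = {'converted_word': term.replace(" ", "_"),
                              'count': 1}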


def krovetz_alternative_stemming_algorithm(tokens):
    ks = krovetz.PyKrovetzStemmer()
    # print("alternative: ", list(map(lambda word: ks.stem(word), tokens)))
    return list(map(lambda word: ks.stem(word), tokens))


def krovetz_stemming_algorithm(tokens):
    ks = krovetz.PyKrovetzStemmer()
    # print("original: ", [ks.stem(word) for word in tokens])
    return [ks.stem(word) for word in tokens]

def main():
    # Assumed imports/helpers for this example (not shown in the snippet):
    # pandas as pd, numpy as np, mlxtend's TransactionEncoder and fpgrowth,
    # prefixspan's PrefixSpan, plus `stopwordlist` and `cos_similarity`
    # defined elsewhere in the original file.

    dblp_data = pd.read_csv(r'DBLP_Dataset.csv', encoding="ISO-8859-1")
    author_title = dblp_data
    dataset = author_title.to_numpy()
    list1 = dataset[:,2].tolist()

    #convert authors to lower case
    list2 = []
    for i in list1:
        sublist = i.lower().split()
        list2.append(sublist)
    
    te = TransactionEncoder()
    te_ary = te.fit(list2).transform(list2)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent = fpgrowth(df, min_support=0.001, use_colnames=True)
    frequent = frequent[frequent['itemsets'].str.len()>1]

    freqauth_list = []
    for i in frequent['itemsets']:
        freqauth_list.append([x for x in i])

    freqauth_dict = {}
    for i in freqauth_list:
        title_idx_sublist = []
        for idx, j in enumerate(list2):
            if set(i).issubset(j):
                title_idx_sublist.append(idx)
        freqauth_dict.update({tuple(i):title_idx_sublist})

    freqauth_title_dict = {}
    kstem = krovetz.PyKrovetzStemmer()  # `ks` above is already an instance, not the module
    for key, value in freqauth_dict.items():
        title_df = author_title.iloc[value]['title']
        title_sublist = list(title_df)
        title_sublists = []
        temp_list = []
        for x in title_sublist:
            tempx     = re.sub(r'[.]','', x)
            temp_list = re.sub(r'[^\x00-\x7F]+','', tempx).lower().split()
            temp_list2 = []
            if isinstance(temp_list, list):
                temp_list2.append([kstem.stem(z) for z in temp_list if not z in stopwordlist])
                title_sublists.extend(temp_list2)
            else:
                if not temp_list in stopwordlist:
                    title_sublists.extend([kstem.stem(temp_list)])
        freqauth_title_dict.update({key:title_sublists})

    # Closed / Top k titles of frequent authors
    freqauth_title_dict_closed = {}
    for k, v in freqauth_title_dict.items():
        ps = PrefixSpan(v)
        closed_Seq_pattern = ps.topk(5, closed=True)
        freqauth_title_dict_closed.update({k:closed_Seq_pattern})

    # To get frequent author's context indicators
    frequentlist = freqauth_list
    cleanedList  = list2

    new_author_list = []
    for i in range(0,len(frequentlist)):
        temp_author_list = []
        authorlist = list(frequentlist[i])
        found = 0
        for k in range(0,len(cleanedList)):
            for j in range(0, len(authorlist)):
                if (authorlist[j] in(cleanedList[k])):
                    found = 1
                else:
                    found = 0
                    break
                    
            if found == 1:
                for jj in range(0,len(authorlist)):
                    if (authorlist[jj] in(cleanedList[k])):
                        cleanedList[k].remove(authorlist[jj])
                temp_author_list.append(cleanedList[k])

        new_author_list.append(temp_author_list)

    context_indicator_list = []
    for i in range(0,len(new_author_list)):
        te = TransactionEncoder()
        te_ary = te.fit(new_author_list[i]).transform(new_author_list[i])
        df = pd.DataFrame(te_ary, columns=te.columns_)
        frequent_author_list = fpgrowth(df, min_support=0.5, use_colnames=True)

        supp = frequent_author_list.support.unique()  # all unique support count
        # Dictionary storing itemset with same support count key
        freq_dic = {}
        for i in range(len(supp)):
            inset = list(frequent_author_list.loc[frequent_author_list.support == supp[i]]['itemsets'])
            freq_dic[supp[i]] = inset
        # Dictionary storing itemset with  support count <= key
        freq_dic2 = {}
        for i in range(len(supp)):
            inset2 = list(frequent_author_list.loc[frequent_author_list.support <= supp[i]]['itemsets'])
            freq_dic2[supp[i]] = inset2

        # Find Closed frequent itemset
        close_freq = []
        for index, row in frequent_author_list.iterrows():
            isclose = True
            cli = row['itemsets']
            cls = row['support']
            checkset = freq_dic[cls]
            for i in checkset:
                if (cli != i):
                    if (frozenset.issubset(cli, i)):
                        isclose = False
                        break

            if (isclose):
                close_freq.append([x for x in  (row['itemsets'])])
        context_indicator_list.append(close_freq)
    
    freqauth_context_ind_dict = {}
    for authpair, titlelist in freqauth_title_dict_closed.items():
        cleantitlelist = []
        for i in titlelist:
            if isinstance(i, tuple):
                if isinstance(i[1], list):
                    listtostring = ' '.join(i[1])
                    cleantitlelist.append(listtostring)
        freqauth_context_ind_dict.update({authpair:cleantitlelist})

    # Merging both titles and Context indicator author for frequent pattern authors 
    for idx, key in enumerate(freqauth_context_ind_dict):
        newval = []
        if len(context_indicator_list[idx])> 0:
            for i in context_indicator_list[idx]:
                if len(i) > 0:                
                    tempstr = '&'.join(i)
                    newval = freqauth_context_ind_dict[key]
                    newval.append(tempstr)
                    freqauth_context_ind_dict.update({key:newval})

    # Context Indicator Weighting
    CI_list = list(freqauth_context_ind_dict.values())
    freqauth_context_in_weights = {}
    for key, value in freqauth_context_ind_dict.items():
        freq_auth_CI_list = value
        length_of_CI = len(value)
        temp_dict = {}
        for i in freq_auth_CI_list:
            count_tmp = 0
            for j in CI_list:
                if (i in (j)):
                    count_tmp += 1
            weight = round(1 - ((count_tmp - 1) /  count_tmp), 2)
            if (weight > 0.1):
                temp_dict.update({i:weight})
        sorted_weights_dict = sorted(temp_dict.items(), key=lambda x: x[1], reverse=True)
        freqauth_context_in_weights.update({key:sorted_weights_dict})

    freq_auth_transactions = {}
    list_of_freq_auth = list(freqauth_context_in_weights.keys())
    for i in range(0, len(freqauth_title_dict)):
        temp_dict = {}
        title_list = freqauth_title_dict.get(list_of_freq_auth[i])
        CI_list = freqauth_context_in_weights[list_of_freq_auth[i]]
        CI_list_auth = []
        for n, c in enumerate(CI_list):
            CI_list_auth.append(c[0])
        for j in range(0, len(title_list)):
            cos_sim = cos_similarity(CI_list_auth,title_list[j])
            cos_sim = round(cos_sim, 3)
            t_title = ' '.join(freqauth_title_dict[list_of_freq_auth[i]][j])
            temp_dict.update({t_title:cos_sim})

        sorted_title_dict = sorted(temp_dict.items(), key=lambda x: x[1], reverse=True)
        t_len = len(list(temp_dict.values()))
        max_len = t_len
        if (t_len > 4):
            max_len = 4
        sorted_title_dict1 = dict(list(sorted_title_dict)[0:max_len])
        freq_auth_transactions.update({list_of_freq_auth[i]:sorted_title_dict1})

    # To find the strongest SSP - Match against similarity of the context units

    freq_auth_SSPs = {}
    list_of_freq_auth = list(freqauth_context_ind_dict.keys())
    list_of_freq_auth_CI =  list(freqauth_context_ind_dict.values())
    len_list_of_freq_auth_CI = len(list_of_freq_auth_CI)

    context_indicator_similarity = np.zeros([len_list_of_freq_auth_CI, len_list_of_freq_auth_CI],dtype = float)
    for i in range (0,len_list_of_freq_auth_CI):
        for j in range (0,len_list_of_freq_auth_CI):
            cos_sim = cos_similarity(list_of_freq_auth_CI[i],list_of_freq_auth_CI[j])
            cos_sim = round(cos_sim, 3)
            if (i != j):
                context_indicator_similarity[i][j] = cos_sim
                context_indicator_similarity[j][i] = cos_sim

    context_indicator_similarity_idx = np.zeros([len_list_of_freq_auth_CI, 3], dtype=int)
    for i in range(0,len(context_indicator_similarity)):
        context_indicator_similarity_idx[i] = np.argsort(context_indicator_similarity[i])[-3:]

    SSP_Author_List = []
    for i in range(0,len(list_of_freq_auth)):
        temp_author_list_ssp = []
        for j in range(0,len(context_indicator_similarity_idx[i])):
           temp_author_list_ssp.append(list_of_freq_auth[context_indicator_similarity_idx[i][j]])
        SSP_Author_List.append(temp_author_list_ssp)

    SSP_Title_List = []

    CI_list_title = list(freqauth_title_dict_closed.values())
    CI_list1 = []
    for i in (CI_list_title):
        temp_list3 = []
        for j in i:
            CI_str = ' '.join(j[1])
            temp_list3.append(CI_str)
        CI_list1.append(list(set(temp_list3)))

    for i in range(0,len(CI_list1)):
        temp_title_list_ssp = []
        for j in range(0,len(context_indicator_similarity_idx[i])):
            ssp_str = CI_list1[context_indicator_similarity_idx[i][j]]
            temp_title_list_ssp.extend(ssp_str)
        SSP_Title_List.append(list(set(temp_title_list_ssp)))

    # Write the output to a CSV file
    # a) list_of_freq_auth
    # b) list_of_freq_auth_CI / freqauth_context_in_weights
    # c) freq_auth_transactions
    # d) SSP_Author_List
    # e) SSP_Title_List
    #for i in range(0, frequent_author_list):
    #print(len(SSP_Title_List))
    #print(SSP_Title_List)
    titles_list_with_weight = list(freq_auth_transactions.values())
    # Joining SSP authors
    SSP_authors_formatted = []
    for i in range(0,len(SSP_Author_List)):
        temp_list = []
        for j in range(0, len(SSP_Author_List[i])):
            authors = '&'.join(list(SSP_Author_List[i][j]))
            temp_list.append(authors)
        SSP_authors_formatted.append(temp_list)

    with open("./output.txt", 'w', encoding="utf-8") as f:
        f.write('Pattern' + '||' + 'Context Indicator' + '||' + 'Transaction 1' + '||' +
                'Transaction 2' + '||'  + 'Transaction 3' + '||'  + 'Transaction 4' + '||' + 'SSP - Co-Author' +
                '||' + 'SSP - Title' + '\n')
        for i in range(0, len(list_of_freq_auth)):
            authors = ' '.join(list(list_of_freq_auth[i]))
            f.write(authors + '||')
            Context_indicators = '; '.join(list_of_freq_auth_CI[i])
            f.write(Context_indicators + '||')
            for j in (titles_list_with_weight[i].keys()):
                f.write(j + '||')
            ssp_authors = '; '.join(SSP_authors_formatted[i])
            f.write(ssp_authors + '||')
            ssp_titles = '; '.join(SSP_Title_List[i])
            f.write(ssp_titles)
            f.write('\n')
Example #6
def stemmer(words):
    ks = krovetz.PyKrovetzStemmer()
    for w in range(len(words)):
        words[w] = ks.stem(words[w])
    return words
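
# Illustrative call (not from the original); ks.stem("walked") == "walk" per
# the test example at the end of this listing:
# >>> stemmer(['walked', 'run'])
# ['walk', 'run']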
Example #7
def run():
    # Assumed imports/helpers for this example (not shown in the snippet):
    # matplotlib.pyplot as plt, torch, gensim, fastText's load_model, and the
    # load_w2v_model / load_json / plotting / similarity helpers used below.

    plt.rcParams['figure.dpi'] = 300
    kstemmer = krovetz.PyKrovetzStemmer()

    print('loading w2v model')
    wi_path = 'data/w2v/word_index_stemmed'
    we_path = 'data/w2v/word_embeddings_matrix'
    w2v_model, w2v_wi = load_w2v_model(wi_path, we_path)

    print('loading PWE')
    word_dict_path = 'data/word_index_json'
    wi = load_json(word_dict_path)
    embs_path = 'data/embeddings_dim_50_margin_2.0'
    dict_pt = torch.load(embs_path, map_location='cpu')
    pwe_model = dict_pt["embeddings"]
    """
    keys, sims = get_top_k_closest_words_w2v(kstemmer.stem('cuba'), 30, w2v_wi, w2v_model)
    print('closest words to cuba (w2v): %s' % ', '.join(keys))

    keys, sims = compute_top_k_closest_words_to('cuba', 30, wi, w)
    print('closest words to cuba (pwe): %s' % ', '.join(keys))

    keys, sims = get_top_k_closest_words_w2v(kstemmer.stem('sugar'), 30, w2v_wi, w2v_model)
    print('closest words to sugar (w2v): %s' % ', '.join(keys))

    keys, sims = compute_top_k_closest_words_to('sugar', 30, wi, w)
    print('closest words to sugar (pwe): %s' % ', '.join(keys))

    keys, sims = get_top_k_closest_words_w2v(kstemmer.stem('export'), 30, w2v_wi, w2v_model)
    print('closest words to export (w2v): %s' % ', '.join(keys))

    keys, sims = compute_top_k_closest_words_to('export', 30, wi, w)
    print('closest words to export (pwe): %s' % ', '.join(keys))
    """

    # ['disease', 'osteoporosis', 'fracture', 'bone', 'diet', 'health', 'drug']
    # w_lists = [['osteoporosis', 'disease', 'fracture', 'bone', 'magnesium', 'diet']]
    w_list = [
        'osteoporosis', 'disease', 'fracture', 'bone', 'magnesium', 'diet'
    ]
    # plot_ellipses([w_list], pwe_model, wi)
    plot_ellipses_alt(w_list, pwe_model, wi)

    exit()

    plot_pwe_sim_m([kstemmer.stem(w) for w in w_list], w_list, pwe_model, wi)

    for w in w_list:
        stemmed = kstemmer.stem(w)
        sim = compute_words_sim_pwe(kstemmer.stem('osteoporosis'), stemmed,
                                    pwe_model, wi)
        print('PWE similarity between osteoporosis and %s: %2.5f' % (w, sim))

    print()

    print('loading fasttext model')
    ftext_model_path = 'data/wiki.en.bin'
    f = load_model(ftext_model_path)

    print('loading wn embs')
    wn2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
        'data/wn2vec.txt')

    plot_we_scatterplots(w_list, f, model_name='ft')
    plot_we_scatterplots(w_list, wn2vec_model, model_name='wn')
    plot_we_scatterplots(w_list, w2v_model, model_name='w2v', wi=w2v_wi)

    for w in w_list:
        stemmed = kstemmer.stem(w)
        sim = compute_word_sim_wnwe('osteoporosis', w, wn2vec_model)
        print('WNE similarity between osteoporosis and %s: %2.5f' % (w, sim))

    print()

    for w in w_list:
        stemmed = kstemmer.stem(w)
        sim = compute_word_sim_ftt(kstemmer.stem('osteoporosis'), stemmed, f)
        print('FTE similarity between osteoporosis and %s: %2.5f' % (w, sim))

    print()

    for w in w_list:
        stemmed = kstemmer.stem(w)
        sim = compute_word_sim_wnwe(w2v_wi[kstemmer.stem('osteoporosis')],
                                    w2v_wi[kstemmer.stem(w)], w2v_model)
        print('W2V similarity between osteoporosis and %s: %2.5f' % (w, sim))
Example #8
import io
import json
import os
import pickle
import platform
import subprocess

import numpy as np
import krovetz

import string

from tqdm import tqdm
from whoosh.analysis import StemmingAnalyzer, StandardAnalyzer

kstemmer = krovetz.PyKrovetzStemmer()
# choose the correct variable values depending on which machine is being used
TREC_EVAL_PATH = '../../trec_eval.8.1/trec_eval'

with open('../data/indri_stoplist_eng.txt', 'r') as slf:
    sw = slf.readlines()
    sw = [word.strip() for word in sw]


def save_json(model, output_path):
    with open(output_path, 'w') as outfile:
        json.dump(model, outfile)


def load_json(path):
    with open(path, 'r') as json_file:
        # The snippet was truncated here; returning the parsed JSON is the
        # natural completion, mirroring save_json above.
        return json.load(json_file)
Example #9
def plot_ellipses(word_lists, w, wi):
    # Assumed imports/helpers for this example (not shown in the snippet):
    # numpy as np, matplotlib and matplotlib.pyplot as plt, adjustText's
    # adjust_text, and a project() helper defined elsewhere in the file.
    kstemmer = krovetz.PyKrovetzStemmer()
    np.random.seed(0)
    dim = 50
    plt.grid(True)
    ax = plt.gca()

    for words in word_lists:
        ws = [kstemmer.stem(w.lower()) for w in words]
        basis = None
        widths = []
        heights = []
        angles = []
        xy = []
        labels = []
        basis_eigenvalues = None
        for i in range(len(words)):
            label = words[i]
            w1 = ws[i]
            index = wi[w1]
            w1_m, w1_c = (w[index, 0:dim].view(-1, dim), w[index, dim:].view(
                (-1, dim, dim)))
            w1_c = np.reshape(w1_c.detach().numpy(), newshape=(dim, dim))
            w1_m = np.reshape(w1_m.detach().numpy(), newshape=-1)

            prec_matr = np.linalg.inv(w1_c * w1_c.T)

            eigs = np.linalg.eig(prec_matr)
            norms = [np.linalg.norm(v) for v in eigs[1]]
            eigenvalues = np.abs(eigs[0])
            eigenvectors = np.array(
                [eigs[1][i] / norms[i] for i in range(len(norms))])

            sorted_v = eigenvectors[np.argsort(-eigenvalues)][0:2]
            width = np.sqrt(np.abs(eigenvalues[np.argsort(-eigenvalues)][0]))
            height = np.sqrt(np.abs(eigenvalues[np.argsort(-eigenvalues)][1]))

            if basis is None:
                sorted_eigenvalues = eigenvalues[np.argsort(-eigenvalues)]
                expl_variance = np.abs(np.sum(
                    sorted_eigenvalues[0:2])) / np.sum(
                        sorted_eigenvalues[2:]) * 100

                print('PWE: explained variance: %2.5f' % expl_variance)
                basis_eigenvalues = (width, height)
                basis = sorted_v
                angle = 0
                # center = (0, 0)
                center = (basis_eigenvalues[0] * np.dot(basis[0], w1_m) /
                          (np.linalg.norm(basis[0]) * np.linalg.norm(w1_m)),
                          basis_eigenvalues[1] *
                          (np.dot(basis[1], w1_m) /
                           (np.linalg.norm(basis[1]) * np.linalg.norm(w1_m))))

            else:
                proj0 = project(basis, sorted_v[0])
                # proj1 = project(basis, sorted_v[1])
                angle = np.arccos(
                    np.dot(proj0, basis[0]) /
                    (np.linalg.norm(proj0) * np.linalg.norm(basis[0])))
                # mean = (np.dot(basis[0], w1_m), np.dot(basis[1], w1_m))

                width *= np.linalg.norm(project(basis, eigs[1][0]))
                height *= np.linalg.norm(project(basis, eigs[1][1]))

                center = (width * np.dot(basis[0], w1_m) /
                          (np.linalg.norm(basis[0]) * np.linalg.norm(w1_m)),
                          height *
                          (np.dot(basis[1], w1_m) /
                           (np.linalg.norm(basis[1]) * np.linalg.norm(w1_m))))

            widths.append(width)
            heights.append(height)
            angles.append(angle)
            xy.append(center)
            labels.append(label)

        texts = []
        colors = []
        xy = [(xy[i][0] * 1, xy[i][1] * 1) for i in range(len(xy))]
        for i in range(len(widths)):
            # widths = [w / max(widths) for w in widths]
            # heights = [h / max(heights) for h in heights]
            # xy = [(xy[i][0] * 2.2, xy[i][1] * 2.2) for i in range(len(xy))]
            print(labels[i])
            print('width=%2.8f, height=%2.8f, x=%2.5f, y=%2.5f' %
                  (widths[i], heights[i], xy[i][0], xy[i][1]))
            ell_color = np.random.rand(3)
            ell = matplotlib.patches.Ellipse(xy=xy[i],
                                             width=widths[i],
                                             height=heights[i],
                                             angle=angles[i],
                                             facecolor=ell_color,
                                             edgecolor=ell_color,
                                             fill=True)
            ax.add_patch(ell)
            ell.set_zorder(-1)
            ell.set_alpha(np.random.rand())
            ell.set_alpha(0.5)
            ell.set(label=labels[i], clip_box=ax.bbox)
            # ell.set_edgecolor(ell_color)
            colors.append(ell_color)
            ell.set_facecolor(ell_color)

            texts.append(ax.text(xy[i][0], xy[i][1], labels[i], fontsize=18))
            plt.scatter(xy[i][0],
                        xy[i][1],
                        s=80,
                        alpha=0.8,
                        color=ell_color,
                        edgecolor=ell_color,
                        marker='+')

        adjust_text(texts,
                    expand_text=(1.5, 2.5),
                    expand_points=(2.5, 2.5),
                    expand_objects=(1.9, 2.8),
                    expand_align=(1.8, 1.7),
                    arrowprops=dict(arrowstyle="-|>", color='r', alpha=0.8))

        xmax = -100
        xmin = 100
        ymin = 100
        ymax = -100
        wmax = -100
        hmax = -100
        for i in range(len(xy)):
            coord = xy[i]
            if wmax < widths[i]:
                wmax = widths[i]
            if hmax < heights[i]:
                hmax = heights[i]

            if coord[0] > xmax:
                xmax = coord[0]
            if coord[0] < xmin:
                xmin = coord[0]

            if coord[1] > ymax:
                ymax = coord[1]
            if coord[1] < ymin:
                ymin = coord[1]

        ax.set_xlim(left=int(xmin - wmax / 2), right=int(xmax + wmax / 2))
        ax.set_ylim(bottom=int(ymin - hmax / 2), top=int(ymax + hmax / 2))
    ax.legend()
    plt.show()
    plt.close()
Example #10
def plot_ellipses_alt(words, w, wi):
    # Same assumed imports as the previous example, plus sklearn's PCA
    # (from sklearn.decomposition import PCA).
    kstemmer = krovetz.PyKrovetzStemmer()
    np.random.seed(0)
    dim = 50
    plt.grid(True)
    plt.axhspan(0, 0, linewidth=2, color='#1f77b4')
    plt.axvline(0)
    ax = plt.gca()
    ws = [kstemmer.stem(w.lower()) for w in words]
    labels = []
    mean_vectors = []
    all_eigenvectors_eigenvalues = []
    for i in range(len(words)):
        w1 = ws[i]
        index = wi[w1]
        w1_m, w1_c = (w[index, 0:dim].view(-1, dim), w[index, dim:].view(
            (-1, dim, dim)))
        w1_c = np.reshape(w1_c.detach().numpy(), newshape=(dim, dim))
        w1_m = np.reshape(w1_m.detach().numpy(), newshape=-1)

        prec_matr = np.linalg.inv(w1_c * w1_c.T)
        eigs = np.linalg.eig(prec_matr)
        norms = [np.linalg.norm(v) for v in eigs[1]]
        eigenvalues = np.abs(eigs[0])
        eigenvectors = np.array(
            [eigs[1][i] / norms[i] for i in range(len(norms))])
        sorted_eigenvalues = eigenvalues[np.argsort(-eigenvalues)]
        sorted_v = eigenvectors[np.argsort(-eigenvalues)][0:2]

        all_eigenvectors_eigenvalues.append((sorted_v, sorted_eigenvalues))
        mean_vectors.append(w1_m)

    reducer = PCA(2)

    # mean_vectors_all = np.array([np.reshape(w[index, 0:dim].view(-1, dim).detach().numpy(), -1) for index in range(len(wi.items()))])
    # reducer.fit(mean_vectors_all)
    # centers = reducer.transform(np.array(mean_vectors))

    centers = reducer.fit_transform(np.array(mean_vectors))
    basis = reducer.transform(reducer.components_)

    widths = []
    heights = []
    angles = []
    xy = []
    max_eig0 = np.NINF
    max_eig1 = np.NINF
    for i in range(len(all_eigenvectors_eigenvalues)):
        eigenvectors, eigenvalues = all_eigenvectors_eigenvalues[i]
        if eigenvalues[0] > max_eig0:
            max_eig0 = eigenvalues[0]

        if eigenvalues[1] > max_eig1:
            max_eig1 = eigenvalues[1]
    max_overall = max_eig0
    if max_eig1 > max_overall:
        max_overall = max_eig1

    for i in range(len(all_eigenvectors_eigenvalues)):
        label = words[i]
        eigenvectors, eigenvalues = all_eigenvectors_eigenvalues[i]
        proj_eigs = reducer.transform(np.array(eigenvectors))

        width = np.linalg.norm(proj_eigs[0]) * eigenvalues[0] / max_eig0
        height = np.linalg.norm(proj_eigs[1]) * eigenvalues[1] / max_eig1
        angle = np.arccos(
            np.dot(proj_eigs[0], basis[0]) /
            (np.linalg.norm(proj_eigs[0]) * np.linalg.norm(basis[0])))

        widths.append(width)
        heights.append(height)
        angles.append(angle)
        x = centers[i][0]
        y = centers[i][1]
        xy.append((x, y))
        labels.append(label)

    texts = []
    colors = []
    xy = [(xy[i][0] * 1, xy[i][1] * 1) for i in range(len(xy))]
    for i in range(len(widths)):
        # widths = [w / max(widths) for w in widths]
        # heights = [h / max(heights) for h in heights]
        # xy = [(xy[i][0] * 2.2, xy[i][1] * 2.2) for i in range(len(xy))]
        print(labels[i])
        print('width=%2.8f, height=%2.8f, x=%2.5f, y=%2.5f' %
              (widths[i], heights[i], xy[i][0], xy[i][1]))
        ell_color = np.random.rand(3)
        ell = matplotlib.patches.Ellipse(xy=xy[i],
                                         width=widths[i],
                                         height=heights[i],
                                         angle=angles[i],
                                         facecolor=ell_color,
                                         edgecolor=ell_color,
                                         fill=True)
        ax.add_patch(ell)
        ell.set_zorder(-1)
        ell.set_alpha(np.random.rand())
        ell.set_alpha(0.5)
        ell.set(label=labels[i], clip_box=ax.bbox)
        colors.append(ell_color)
        ell.set_facecolor(ell_color)

        texts.append(ax.text(xy[i][0], xy[i][1], labels[i], fontsize=18))
        plt.scatter(xy[i][0],
                    xy[i][1],
                    s=80,
                    alpha=0.8,
                    color=ell_color,
                    edgecolor=ell_color,
                    marker='+')

    adjust_text(texts,
                expand_text=(1.5, 2.5),
                expand_points=(2.5, 2.5),
                expand_objects=(1.9, 2.8),
                expand_align=(1.8, 1.7),
                arrowprops=dict(arrowstyle="-|>", color='r', alpha=0.8))

    xmax = -100
    xmin = 100
    ymin = 100
    ymax = -100
    wmax = -100
    hmax = -100
    for i in range(len(xy)):
        coord = xy[i]
        if wmax < widths[i]:
            wmax = widths[i]
        if hmax < heights[i]:
            hmax = heights[i]

        if coord[0] > xmax:
            xmax = coord[0]
        if coord[0] < xmin:
            xmin = coord[0]

        if coord[1] > ymax:
            ymax = coord[1]
        if coord[1] < ymin:
            ymin = coord[1]

    ax.set_xlim(left=xmin - wmax / 2 - 0.05, right=xmax + wmax / 2 + 0.05)
    ax.set_ylim(bottom=ymin - hmax / 2 - 0.05, top=ymax + hmax / 2 + 0.05)
    ax.legend(prop={'size': 13})
    plt.savefig('ellipses_final.png')
    plt.show()
    plt.close()
Example #11
def plot_we_scatterplots(word_list, model, model_name='ft', wi=None):
    labels = word_list
    kstemmer = krovetz.PyKrovetzStemmer()
    reducer = PCA(2)
    if model_name == 'ft':
        vecs = reducer.fit_transform(
            np.array(
                [model.get_word_vector(kstemmer.stem(w)) for w in word_list]))
    elif model_name == 'wn':
        word_list = [w for w in word_list if w in model.wv.vocab]
        labels = word_list
        vecs = reducer.fit_transform(model[[w for w in word_list]])
    else:
        vecs = reducer.fit_transform(
            [model[wi[kstemmer.stem(w)]] for w in word_list])

    fig = plt.figure()
    plt.grid(True)
    plt.axhspan(0, 0, linewidth=2, color='#1f77b4')
    plt.axvline(0)
    ax = plt.gca()
    ax.set_xlim(left=-10, right=10)
    ax.set_ylim(bottom=-10, top=10)
    texts = []
    xy = []
    for i in range(len(vecs)):
        reduced = vecs[i]
        label = labels[i]
        ax.scatter(reduced[0],
                   reduced[1],
                   s=100,
                   alpha=1.0,
                   color='b',
                   marker='+')
        # texts.append(ax.text(reduced[0], reduced[1], label, fontsize=14))
        if label == 'osteoporosis' and model_name == 'wn':
            ax.annotate(label, (reduced[0], reduced[1]),
                        fontsize=18,
                        ha='right')
        else:
            ax.annotate(label, (reduced[0], reduced[1]), fontsize=18)

        xy.append(reduced)
    # adjust_text(texts, expand_text=(0.01, 0.02), arrowprops=dict(arrowstyle="-|>", color='r', alpha=0.0))

    if model_name == 'ft':
        plt.title('FTE')
    elif model_name == 'wn':
        plt.title('WNE')
    else:
        plt.title('W2V')

    xmax = -100
    xmin = 100
    ymin = 100
    ymax = -100
    for i in range(len(xy)):
        coord = xy[i]
        if coord[0] > xmax:
            xmax = coord[0]
        if coord[0] < xmin:
            xmin = coord[0]

        if coord[1] > ymax:
            ymax = coord[1]
        if coord[1] < ymin:
            ymin = coord[1]

    ax.set_xlim(left=xmin - 1, right=xmax + 1)
    ax.set_ylim(bottom=ymin - 1, top=ymax + 1)

    plt.show()
    plt.close(fig)
Example #12
import unittest

class TestKrovetzStemmer(unittest.TestCase):
    # Wrapper class and import are assumed; the original showed only the method.
    def test_do_simple_stem(self):
        ks = krovetz.PyKrovetzStemmer()
        self.assertEqual(ks.stem("walked"), "walk")
        self.assertEqual(ks.stem("run"), "run")


def test_stem(benchmark, word):
    # `benchmark` is the pytest-benchmark fixture; `word` and `stem_many` are
    # assumed to be defined/parametrized elsewhere in the original test module.
    ks = krovetz.PyKrovetzStemmer()
    result = benchmark(stem_many, ks, word)
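

# Hedged sketch (not in the original): a stem_many helper compatible with the
# benchmark call above might simply stem the same word repeatedly.
def stem_many(ks, word, n=1000):
    for _ in range(n):
        ks.stem(word)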