Code Example #1
import argparse
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import krovetz
import string
import re
from math import log

ks = krovetz.PyKrovetzStemmer()
stop_words = set(stopwords.words('english'))  # | set(string.punctuation)

#word_df = 1


def preprocess(text):
    word_tokens = word_tokenize(re.sub(r'[^a-zA-Z0-9]', ' ', text.lower()))
    processed = [ks.stem(w) for w in word_tokens if w not in stop_words]
    return processed


def bm25(qtext, docs_id, docs_body, vocab_words_df, num_docs_collection,
         avg_docs_len):
    rel_scores = [0 for i in range(len(docs_id))]
    N = num_docs_collection
    k1 = 1.4
    b = 0.75
    for q in qtext:
        n_q = vocab_words_df.get(q, 0)
        idf_q = log(((N - n_q + 0.5) / (n_q + 0.5) + 1))
        for i in range(len(docs_id)):
            # docid = docs_id[i]
            # Standard BM25 per-term contribution; assumes docs_body[i] is the
            # preprocessed token list for document i.
            f_q = docs_body[i].count(q)
            doc_len = len(docs_body[i])
            rel_scores[i] += idf_q * (f_q * (k1 + 1)) / (
                f_q + k1 * (1 - b + b * doc_len / avg_docs_len))
    return rel_scores
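
A minimal usage sketch for preprocess() and bm25() above (the toy documents and query are illustrative, not part of the original example; assumes the NLTK punkt and stopwords data are downloaded):

raw_docs = {"d1": "Cats chase mice.", "d2": "Dogs chase cats and mice."}
docs_id = list(raw_docs.keys())
docs_body = [preprocess(text) for text in raw_docs.values()]

# Document frequency of each term across the toy collection.
vocab_words_df = {}
for body in docs_body:
    for term in set(body):
        vocab_words_df[term] = vocab_words_df.get(term, 0) + 1

avg_docs_len = sum(len(body) for body in docs_body) / len(docs_body)
query = preprocess("cats and mice")
print(bm25(query, docs_id, docs_body, vocab_words_df, len(docs_body), avg_docs_len))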
Code Example #2
# Imports required by this excerpt (Python 3):
import re
from urllib.request import urlopen

import krovetz
from bs4 import BeautifulSoup


def scrap_keywords():
    stemmer = krovetz.PyKrovetzStemmer()
    # keyword_list_TERM: {key:{converted_word, count}}
    keyword_list_TERM = {}
    html_page = urlopen(
        "https://en.wikipedia.org/wiki/List_of_Microsoft_software")
    soup = BeautifulSoup(html_page, features="lxml")
    div = soup.find('div', attrs={'id': 'mw-content-text'})
    footer = str(div.contents).rfind("Misc.")

    for link in div.findAll('a',
                            attrs={'href':
                                   re.compile("/wiki/")}):  #"^https://")}):
        link_text = link.text.replace('+', '').lower().strip()
        # lemmatize
        link_text = stemmer.stem(link_text)
        #if link_text=='Microsoft Edge'.lower():
        #    print(stemmer.stem(link_text))
        if link_text in keyword_list_TERM.keys():
            keyword_list_TERM[link_text]['count'] += 1
        else:
            keyword_list_TERM[link_text] = {
                'converted_word': link_text.replace(" ", "_")
            }
            keyword_list_TERM[link_text]['count'] = 1

        link_text = link.text.replace('+',
                                      '').lower().replace('microsoft',
                                                          'ms').strip()
        # lemmatize
        link_text = stemmer.stem(link_text)
        if link_text in keyword_list_TERM.keys():
            keyword_list_TERM[link_text]['count'] += 1
        else:
            keyword_list_TERM[link_text] = {
                'converted_word': link_text.replace(" ", "_")
            }
            keyword_list_TERM[link_text]['count'] = 1

        link_text = link.text.replace('+',
                                      '').lower().replace('microsoft',
                                                          '').strip()
        # lemmatize
        link_text = stemmer.stem(link_text)
        if link_text in keyword_list_TERM.keys():
            keyword_list_TERM[link_text]['count'] += 1
        else:
            keyword_list_TERM[link_text] = {
                'converted_word': link_text.replace(" ", "_")
            }
            keyword_list_TERM[link_text]['count'] = 1

        #print (link_text)
        if link_text == 'Windows To Go'.lower():
            break

    link_text = 'window'
    if link_text in keyword_list_TERM.keys():
        keyword_list_TERM[link_text]['count'] += 1
    else:
        keyword_list_TERM[link_text] = {'converted_word': link_text}
        keyword_list_TERM[link_text]['count'] = 1
    key_list = list(keyword_list_TERM.keys())

    for key1 in key_list:
        if key1.startswith('list of '):
            del keyword_list_TERM[key1]

    # get Ubuntu Glossaries
    html_page = urlopen("https://help.ubuntu.com/community/Glossary")
    soup = BeautifulSoup(html_page, features="lxml")
    for link in soup.findAll('p',
                             attrs={'class':
                                    re.compile("^line")}):  #"^https://")}):
        strong_tag = link.find('strong')
        if strong_tag is not None:
            strong_tag_text = strong_tag.text.lower().strip()
            strong_tag_text = stemmer.stem(strong_tag_text)
            if strong_tag_text == 'Contributors:'.lower():
                break
            if strong_tag_text.find('(or') >= 0 and strong_tag_text.endswith(
                    ')'):
                #print('two_glossary={}'.format(strong_tag_text))
                two_glossary = strong_tag_text.split('(or')
                for item in two_glossary:
                    item = item.strip().replace(")", '')
                    item = stemmer.stem(item)
                    if item in keyword_list_TERM.keys():
                        keyword_list_TERM[item]['count'] += 1
                    else:
                        keyword_list_TERM[item] = {
                            'converted_word': item.replace(" ", "_")
                        }
                        keyword_list_TERM[item]['count'] = 1
            else:
                if strong_tag_text.find('(also') >= 0:
                    strong_tag_text = strong_tag_text[:strong_tag_text.find('(also')]
                if strong_tag_text.find('ubuntu') >= 0:
                    strong_tag_text = 'ubuntu'
                if strong_tag_text in keyword_list_TERM.keys():
                    keyword_list_TERM[strong_tag_text]['count'] += 1
                else:
                    keyword_list_TERM[strong_tag_text] = {
                        'converted_word': strong_tag_text.replace(" ", "_")
                    }
                    keyword_list_TERM[strong_tag_text]['count'] = 1
            #print (strong_tag_text)

    print(len(keyword_list_TERM.keys()))
    # Save Keywords
    import pickle
    with open("keyword_list.pickle", 'wb') as f:
        pickle.dump(keyword_list_TERM, f)
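
A minimal sketch of reading back the pickle written above (same file name as in the function):

import pickle

with open("keyword_list.pickle", "rb") as f:
    keyword_list_TERM = pickle.load(f)
print(len(keyword_list_TERM))
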
def krovetz_alternative_stemming_algorithm(tokens):
    ks = krovetz.PyKrovetzStemmer()
    # print("alternative: ", list(map(lambda word : ks.stem(word), tokens)))
    return list(map(lambda word : ks.stem(word), tokens))
def krovetz_stemming_algorithm(tokens):
    ks = krovetz.PyKrovetzStemmer()
    # print("original: ", [ks.stem(word) for word in tokens])
    return [ks.stem(word) for word in tokens]
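
A quick usage sketch; both helpers return the same stems for a given token list (expected values follow the Krovetz behaviour shown in Code Example #12):

tokens = ["walked", "run"]
print(krovetz_stemming_algorithm(tokens))              # ['walk', 'run']
print(krovetz_alternative_stemming_algorithm(tokens))  # ['walk', 'run']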
Code Example #5
def main():
    # This excerpt assumes the enclosing module provides: pandas as pd, numpy as np, re,
    # krovetz as ks, TransactionEncoder (mlxtend.preprocessing),
    # fpgrowth (mlxtend.frequent_patterns), PrefixSpan (prefixspan),
    # and the project helpers cos_similarity() and stopwordlist.
    dblp_data = pd.read_csv(r'DBLP_Dataset.csv', encoding="ISO-8859-1")
    author_title = dblp_data
    dataset = author_title.to_numpy()
    list1 = dataset[:,2].tolist()

    #convert authors to lower case
    list2 = []
    for i in list1:
        sublist = i.lower().split()
        list2.append(sublist)
    
    te = TransactionEncoder()
    te_ary = te.fit(list2).transform(list2)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent = fpgrowth(df, min_support=0.001, use_colnames=True)
    frequent = frequent[frequent['itemsets'].str.len()>1]

    freqauth_list = []
    for i in frequent['itemsets']:
        freqauth_list.append([x for x in i])

    freqauth_dict = {}
    for i in freqauth_list:
        title_idx_sublist = []
        for idx, j in enumerate(list2):
            if set(i).issubset(j):
                title_idx_sublist.append(idx)
        freqauth_dict.update({tuple(i):title_idx_sublist})

    freqauth_title_dict = {}
    kstem = ks.PyKrovetzStemmer()
    for key, value in freqauth_dict.items():
        title_df = author_title.iloc[value]['title']
        title_sublist = list(title_df)
        title_sublists = []
        temp_list = []
        for x in title_sublist:
            tempx     = re.sub(r'[.]','', x)
            temp_list = re.sub(r'[^\x00-\x7F]+','', tempx).lower().split()
            temp_list2 = []
            if isinstance(temp_list, list):
                temp_list2.append([kstem.stem(z) for z in temp_list if not z in stopwordlist])
                title_sublists.extend(temp_list2)
            else:
                if not temp_list in stopwordlist:
                    title_sublists.extend([kstem.stem(temp_list)])
        freqauth_title_dict.update({key:title_sublists})

    # Closed / Top k titles of frequent authors
    freqauth_title_dict_closed = {}
    for k, v in freqauth_title_dict.items():
        ps = PrefixSpan(v)
        closed_Seq_pattern = ps.topk(5, closed=True)
        freqauth_title_dict_closed.update({k:closed_Seq_pattern})

    # To get frequent author's context indicators
    frequentlist = freqauth_list
    cleanedList  = list2

    new_author_list = []
    for i in range(0,len(frequentlist)):
        temp_author_list = []
        authorlist = list(frequentlist[i])
        found = 0
        for k in range(0,len(cleanedList)):
            for j in range(0, len(authorlist)):
                if (authorlist[j] in(cleanedList[k])):
                    found = 1
                else:
                    found = 0
                    break
                    
            if found == 1:
                for jj in range(0,len(authorlist)):
                    if (authorlist[jj] in(cleanedList[k])):
                        cleanedList[k].remove(authorlist[jj])
                temp_author_list.append(cleanedList[k])

        new_author_list.append(temp_author_list)

    context_indicator_list = []
    for i in range(0,len(new_author_list)):
        te = TransactionEncoder()
        te_ary = te.fit(new_author_list[i]).transform(new_author_list[i])
        df = pd.DataFrame(te_ary, columns=te.columns_)
        frequent_author_list = fpgrowth(df, min_support=0.5, use_colnames=True)

        supp = frequent_author_list.support.unique()  # all unique support count
        # Dictionary storing itemset with same support count key
        freq_dic = {}
        for i in range(len(supp)):
            inset = list(frequent_author_list.loc[frequent_author_list.support == supp[i]]['itemsets'])
            freq_dic[supp[i]] = inset
        # Dictionary storing itemset with  support count <= key
        freq_dic2 = {}
        for i in range(len(supp)):
            inset2 = list(frequent_author_list.loc[frequent_author_list.support <= supp[i]]['itemsets'])
            freq_dic2[supp[i]] = inset2

        # Find Closed frequent itemset
        close_freq = []
        for index, row in frequent_author_list.iterrows():
            isclose = True
            cli = row['itemsets']
            cls = row['support']
            checkset = freq_dic[cls]
            for i in checkset:
                if (cli != i):
                    if (frozenset.issubset(cli, i)):
                        isclose = False
                        break

            if (isclose):
                close_freq.append([x for x in  (row['itemsets'])])
        context_indicator_list.append(close_freq)
    
    freqauth_context_ind_dict = {}
    for authpair, titlelist in freqauth_title_dict_closed.items():
        cleantitlelist = []
        for i in titlelist:
            if isinstance(i, tuple):
                if isinstance(i[1], list):
                    listtostring = ' '.join(i[1])
                    cleantitlelist.append(listtostring)
        freqauth_context_ind_dict.update({authpair:cleantitlelist})

    # Merging both titles and Context indicator author for frequent pattern authors 
    for idx, key in enumerate(freqauth_context_ind_dict):
        newval = []
        if len(context_indicator_list[idx])> 0:
            for i in context_indicator_list[idx]:
                if len(i) > 0:                
                    tempstr = '&'.join(i)
                    newval = freqauth_context_ind_dict[key]
                    newval.append(tempstr)
                    freqauth_context_ind_dict.update({key:newval})

    # Context Indicator Weighting
    CI_list = list(freqauth_context_ind_dict.values())
    freqauth_context_in_weights = {}
    for key, value in freqauth_context_ind_dict.items():
        freq_auth_CI_list = value
        length_of_CI = len(value)
        temp_dict = {}
        for i in freq_auth_CI_list:
            count_tmp = 0
            for j in CI_list:
                if (i in (j)):
                    count_tmp += 1
            weight = round(1 - ((count_tmp - 1) /  count_tmp), 2)
            if (weight > 0.1):
                temp_dict.update({i:weight})
        sorted_weights_dict = sorted(temp_dict.items(), key=lambda x: x[1], reverse=True)
        freqauth_context_in_weights.update({key:sorted_weights_dict})

    freq_auth_transactions = {}
    list_of_freq_auth = list(freqauth_context_in_weights.keys())
    for i in range(0, len(freqauth_title_dict)):
        temp_dict = {}
        title_list = freqauth_title_dict.get(list_of_freq_auth[i])
        CI_list = freqauth_context_in_weights[list_of_freq_auth[i]]
        CI_list_auth = []
        for n, c in enumerate(CI_list):
            CI_list_auth.append(c[0])
        for j in range(0, len(title_list)):
            cos_sim = cos_similarity(CI_list_auth,title_list[j])
            cos_sim = round(cos_sim, 3)
            t_title = ' '.join(freqauth_title_dict[list_of_freq_auth[i]][j])
            temp_dict.update({t_title:cos_sim})

        sorted_title_dict = sorted(temp_dict.items(), key=lambda x: x[1], reverse=True)
        t_len = len(list(temp_dict.values()))
        max_len = t_len
        if (t_len > 4):
            max_len = 4
        sorted_title_dict1 = dict(list(sorted_title_dict)[0:max_len])
        freq_auth_transactions.update({list_of_freq_auth[i]:sorted_title_dict1})

    # To find the strongest SSP - Match against similarity of the context units

    freq_auth_SSPs = {}
    list_of_freq_auth = list(freqauth_context_ind_dict.keys())
    list_of_freq_auth_CI =  list(freqauth_context_ind_dict.values())
    len_list_of_freq_auth_CI = len(list_of_freq_auth_CI)

    context_indicator_similarity = np.zeros([len_list_of_freq_auth_CI, len_list_of_freq_auth_CI],dtype = float)
    for i in range (0,len_list_of_freq_auth_CI):
        for j in range (0,len_list_of_freq_auth_CI):
            cos_sim = cos_similarity(list_of_freq_auth_CI[i],list_of_freq_auth_CI[j])
            cos_sim = round(cos_sim, 3)
            if (i != j):
                context_indicator_similarity[i][j] = cos_sim
                context_indicator_similarity[j][i] = cos_sim

    context_indicator_similarity_idx = np.zeros([len_list_of_freq_auth_CI, 3], dtype=int)
    for i in range(0,len(context_indicator_similarity)):
        context_indicator_similarity_idx[i] = np.argsort(context_indicator_similarity[i])[-3:]

    SSP_Author_List = []
    for i in range(0,len(list_of_freq_auth)):
        temp_author_list_ssp = []
        for j in range(0,len(context_indicator_similarity_idx[i])):
           temp_author_list_ssp.append(list_of_freq_auth[context_indicator_similarity_idx[i][j]])
        SSP_Author_List.append(temp_author_list_ssp)

    SSP_Title_List = []

    CI_list_title = list(freqauth_title_dict_closed.values())
    CI_list1 = []
    for i in (CI_list_title):
        temp_list3 = []
        for j in i:
            CI_str = ' '.join(j[1])
            temp_list3.append(CI_str)
        CI_list1.append(list(set(temp_list3)))

    for i in range(0,len(CI_list1)):
        temp_title_list_ssp = []
        for j in range(0,len(context_indicator_similarity_idx[i])):
            ssp_str = CI_list1[context_indicator_similarity_idx[i][j]]
            temp_title_list_ssp.extend(ssp_str)
        SSP_Title_List.append(list(set(temp_title_list_ssp)))

    # Write the output to a CSV file
    # a) list_of_freq_auth
    # b) list_of_freq_auth_CI / freqauth_context_in_weights
    # c) freq_auth_transactions
    # d) SSP_Author_List
    # e) SSP_Title_List
    #for i in range(0, frequent_author_list):
    #print(len(SSP_Title_List))
    #print(SSP_Title_List)
    titles_list_with_weight = list(freq_auth_transactions.values())
    # Joining SSP authors
    SSP_authors_formatted = []
    for i in range(0,len(SSP_Author_List)):
        temp_list = []
        for j in range(0, len(SSP_Author_List[i])):
            authors = '&'.join(list(SSP_Author_List[i][j]))
            temp_list.append(authors)
        SSP_authors_formatted.append(temp_list)

    with open("./output.txt", 'w', encoding="utf-8") as f:
        f.write('Pattern' + '||' + 'Context Indicator' + '||' + 'Transaction 1' + '||' +
                'Transaction 2' + '||'  + 'Transaction 3' + '||'  + 'Transaction 4' + '||' + 'SSP - Co-Author' +
                '||' + 'SSP - Title' + '\n')
        for i in range(0, len(list_of_freq_auth)):
            authors = ' '.join(list(list_of_freq_auth[i]))
            f.write(authors + '||')
            Context_indicators = '; '.join(list_of_freq_auth_CI[i])
            f.write(Context_indicators + '||')
            for j in (titles_list_with_weight[i].keys()):
                f.write(j + '||')
            ssp_authors = '; '.join(SSP_authors_formatted[i])
            f.write(ssp_authors + '||')
            ssp_titles = '; '.join(SSP_Title_List[i])
            f.write(ssp_titles )
            f.write('\n')
Code Example #6
File: indexer.py  Project: c-loh/search-engine
def stemmer(words):
    ks = krovetz.PyKrovetzStemmer()
    for w in range(len(words)):
        words[w] = ks.stem(words[w])
    return words
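
A minimal usage sketch (assumes import krovetz at the top of indexer.py; expected stems follow the assertions in Code Example #12):

tokens = ["walked", "run"]
print(stemmer(tokens))  # ['walk', 'run'] -- note the input list is also modified in place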
Code Example #7
def run():
    # Assumes module-level imports not shown in this excerpt: matplotlib.pyplot as plt,
    # torch, gensim, krovetz, fastText's load_model, and the project helpers used below
    # (load_w2v_model, load_json, compute_words_sim_pwe, plot_ellipses_alt, etc.).
    plt.rcParams['figure.dpi'] = 300
    kstemmer = krovetz.PyKrovetzStemmer()

    print('loading w2v model')
    wi_path = 'data/w2v/word_index_stemmed'
    we_path = 'data/w2v/word_embeddings_matrix'
    w2v_model, w2v_wi = load_w2v_model(wi_path, we_path)

    print('loading PWE')
    word_dict_path = 'data/word_index_json'
    wi = load_json(word_dict_path)
    embs_path = 'data/embeddings_dim_50_margin_2.0'
    dict_pt = torch.load(embs_path, map_location='cpu')
    pwe_model = dict_pt["embeddings"]
    """
    keys, sims = get_top_k_closest_words_w2v(kstemmer.stem('cuba'), 30, w2v_wi, w2v_model)
    print('closes words to cuba (w2v): %s' % ', '.join(keys))

    keys, sims = compute_top_k_closest_words_to('cuba', 30, wi, w)
    print('closes words to cuba (pwe): %s' % ', '.join(keys))

    keys, sims = get_top_k_closest_words_w2v(kstemmer.stem('sugar'), 30, w2v_wi, w2v_model)
    print('closes words to sugar (w2v): %s' % ', '.join(keys))

    keys, sims = compute_top_k_closest_words_to('sugar', 30, wi, w)
    print('closes words to sugar (pwe): %s' % ', '.join(keys))

    keys, sims = get_top_k_closest_words_w2v(kstemmer.stem('export'), 30, w2v_wi, w2v_model)
    print('closes words to export (w2v): %s' % ', '.join(keys))

    keys, sims = compute_top_k_closest_words_to('export', 30, wi, w)
    print('closes words to export (pwe): %s' % ', '.join(keys))
    """

    # ['disease', 'osteoporosis', 'fracture', 'bone', 'diet', 'health', 'drug']
    # w_lists = [['osteoporosis', 'disease', 'fracture', 'bone', 'magnesium', 'diet']]
    w_list = [
        'osteoporosis', 'disease', 'fracture', 'bone', 'magnesium', 'diet'
    ]
    # plot_ellipses([w_list], pwe_model, wi)
    plot_ellipses_alt(w_list, pwe_model, wi)

    exit()  # everything below this point in run() is unreachable

    plot_pwe_sim_m([kstemmer.stem(w) for w in w_list], w_list, pwe_model, wi)

    for w in w_list:
        stemmed = kstemmer.stem(w)
        sim = compute_words_sim_pwe(kstemmer.stem('osteoporosis'), stemmed,
                                    pwe_model, wi)
        print('PWE similarity between osteoporosis and %s: %2.5f' % (w, sim))

    print()

    print('loading fasttext model')
    ftext_model_path = 'data/wiki.en.bin'
    f = load_model(ftext_model_path)

    print('loading wn embs')
    wn2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
        'data/wn2vec.txt')

    plot_we_scatterplots(w_list, f, model_name='ft')
    plot_we_scatterplots(w_list, wn2vec_model, model_name='wn')
    plot_we_scatterplots(w_list, w2v_model, model_name='w2v', wi=w2v_wi)

    for w in w_list:
        stemmed = kstemmer.stem(w)
        sim = compute_word_sim_wnwe('osteoporosis', w, wn2vec_model)
        print('WNE similarity between osteoporosis and %s: %2.5f' % (w, sim))

    print()

    for w in w_list:
        stemmed = kstemmer.stem(w)
        sim = compute_word_sim_ftt(kstemmer.stem('osteoporosis'), stemmed, f)
        print('FTE similarity between osteoporosis and %s: %2.5f' % (w, sim))

    print()

    for w in w_list:
        stemmed = kstemmer.stem(w)
        sim = compute_word_sim_wnwe(w2v_wi[kstemmer.stem('osteoporosis')],
                                    w2v_wi[kstemmer.stem(w)], w2v_model)
        print('W2V similarity between osteoporosis and %s: %2.5f' % (w, sim))
Code Example #8
File: util.py  Project: albpurpura/PE4IR
import io
import json
import os
import pickle
import platform
import subprocess

import numpy as np
import krovetz

import string

from tqdm import tqdm
from whoosh.analysis import StemmingAnalyzer, StandardAnalyzer

kstemmer = krovetz.PyKrovetzStemmer()
# choose correct variable values according to what pc I am using
TREC_EVAL_PATH = '../../trec_eval.8.1/trec_eval'

with open('../data/indri_stoplist_eng.txt', 'r') as slf:
    sw = slf.readlines()
    sw = [word.strip() for word in sw]


def save_json(model, output_path):
    with open(output_path, 'w') as outfile:
        json.dump(model, outfile)


def load_json(path):
    with open(path, 'r') as json_file:
        return json.load(json_file)
Code Example #9
def plot_ellipses(word_lists, w, wi):
    kstemmer = krovetz.PyKrovetzStemmer()
    np.random.seed(0)
    dim = 50
    plt.grid(True)
    ax = plt.gca()

    for words in word_lists:
        ws = [kstemmer.stem(w.lower()) for w in words]
        basis = None
        widths = []
        heights = []
        angles = []
        xy = []
        labels = []
        basis_eigenvalues = None
        for i in range(len(words)):
            label = words[i]
            w1 = ws[i]
            index = wi[w1]
            w1_m, w1_c = (w[index, 0:dim].view(-1, dim), w[index, dim:].view(
                (-1, dim, dim)))
            w1_c = np.reshape(w1_c.detach().numpy(), newshape=(dim, dim))
            w1_m = np.reshape(w1_m.detach().numpy(), newshape=-1)

            prec_matr = np.linalg.inv(w1_c * w1_c.T)

            eigs = np.linalg.eig(prec_matr)
            norms = [np.linalg.norm(v) for v in eigs[1]]
            eigenvalues = np.abs(eigs[0])
            eigenvectors = np.array(
                [eigs[1][i] / norms[i] for i in range(len(norms))])

            sorted_v = eigenvectors[np.argsort(-eigenvalues)][0:2]
            width = np.sqrt(np.abs(eigenvalues[np.argsort(-eigenvalues)][0]))
            height = np.sqrt(np.abs(eigenvalues[np.argsort(-eigenvalues)][1]))

            if basis is None:
                sorted_eigenvalues = eigenvalues[np.argsort(-eigenvalues)]
                expl_variance = np.abs(np.sum(
                    sorted_eigenvalues[0:2])) / np.sum(
                        sorted_eigenvalues[2:]) * 100

                print('PWE: explained variance: %2.5f' % expl_variance)
                basis_eigenvalues = (width, height)
                basis = sorted_v
                angle = 0
                # center = (0, 0)
                center = (basis_eigenvalues[0] * np.dot(basis[0], w1_m) /
                          (np.linalg.norm(basis[0]) * np.linalg.norm(w1_m)),
                          basis_eigenvalues[1] *
                          (np.dot(basis[1], w1_m) /
                           (np.linalg.norm(basis[1]) * np.linalg.norm(w1_m))))

            else:
                proj0 = project(basis, sorted_v[0])
                # proj1 = project(basis, sorted_v[1])
                angle = np.arccos(
                    np.dot(proj0, basis[0]) /
                    (np.linalg.norm(proj0) * np.linalg.norm(basis[0])))
                # mean = (np.dot(basis[0], w1_m), np.dot(basis[1], w1_m))

                width *= np.linalg.norm(project(basis, eigs[1][0]))
                height *= np.linalg.norm(project(basis, eigs[1][1]))

                center = (width * np.dot(basis[0], w1_m) /
                          (np.linalg.norm(basis[0]) * np.linalg.norm(w1_m)),
                          height *
                          (np.dot(basis[1], w1_m) /
                           (np.linalg.norm(basis[1]) * np.linalg.norm(w1_m))))

            widths.append(width)
            heights.append(height)
            angles.append(angle)
            xy.append(center)
            labels.append(label)

        texts = []
        colors = []
        xy = [(xy[i][0] * 1, xy[i][1] * 1) for i in range(len(xy))]
        for i in range(len(widths)):
            # widths = [w / max(widths) for w in widths]
            # heights = [h / max(heights) for h in heights]
            # xy = [(xy[i][0] * 2.2, xy[i][1] * 2.2) for i in range(len(xy))]
            print(labels[i])
            print('width=%2.8f, height=%2.8f, x=%2.5f, y=%2.5f' %
                  (widths[i], heights[i], xy[i][0], xy[i][1]))
            ell_color = np.random.rand(3)
            ell = matplotlib.patches.Ellipse(xy=xy[i],
                                             width=widths[i],
                                             height=heights[i],
                                             angle=angles[i],
                                             facecolor=ell_color,
                                             edgecolor=ell_color,
                                             fill=True)
            ax.add_patch(ell)
            ell.set_zorder(-1)
            ell.set_alpha(np.random.rand())
            ell.set_alpha(0.5)
            ell.set(label=labels[i], clip_box=ax.bbox)
            # ell.set_edgecolor(ell_color)
            colors.append(ell_color)
            ell.set_facecolor(ell_color)

            texts.append(ax.text(xy[i][0], xy[i][1], labels[i], fontsize=18))
            plt.scatter(xy[i][0],
                        xy[i][1],
                        s=80,
                        alpha=0.8,
                        color=ell_color,
                        edgecolor=ell_color,
                        marker='+')

        adjust_text(texts,
                    expand_text=(1.5, 2.5),
                    expand_points=(2.5, 2.5),
                    expand_objects=(1.9, 2.8),
                    expand_align=(1.8, 1.7),
                    arrowprops=dict(arrowstyle="-|>", color='r', alpha=0.8))

        xmax = -100
        xmin = 100
        ymin = 100
        ymax = -100
        wmax = -100
        hmax = -100
        for i in range(len(xy)):
            coord = xy[i]
            if wmax < widths[i]:
                wmax = widths[i]
            if hmax < heights[i]:
                hmax = heights[i]

            if coord[0] > xmax:
                xmax = coord[0]
            if coord[0] < xmin:
                xmin = coord[0]

            if coord[1] > ymax:
                ymax = coord[1]
            if coord[1] < ymin:
                ymin = coord[1]

        ax.set_xlim(left=int(xmin - wmax / 2), right=int(xmax + wmax / 2))
        ax.set_ylim(bottom=int(ymin - hmax / 2), top=int(ymax + hmax / 2))
    ax.legend()
    plt.show()
    plt.close()
Code Example #10
def plot_ellipses_alt(words, w, wi):
    kstemmer = krovetz.PyKrovetzStemmer()
    np.random.seed(0)
    dim = 50
    plt.grid(True)
    plt.axhspan(0, 0, linewidth=2, color='#1f77b4')
    plt.axvline(0)
    ax = plt.gca()
    ws = [kstemmer.stem(w.lower()) for w in words]
    labels = []
    mean_vectors = []
    all_eigenvectors_eigenvalues = []
    for i in range(len(words)):
        w1 = ws[i]
        index = wi[w1]
        w1_m, w1_c = (w[index, 0:dim].view(-1, dim), w[index, dim:].view(
            (-1, dim, dim)))
        w1_c = np.reshape(w1_c.detach().numpy(), newshape=(dim, dim))
        w1_m = np.reshape(w1_m.detach().numpy(), newshape=-1)

        prec_matr = np.linalg.inv(w1_c * w1_c.T)
        eigs = np.linalg.eig(prec_matr)
        norms = [np.linalg.norm(v) for v in eigs[1]]
        eigenvalues = np.abs(eigs[0])
        eigenvectors = np.array(
            [eigs[1][i] / norms[i] for i in range(len(norms))])
        sorted_eigenvalues = eigenvalues[np.argsort(-eigenvalues)]
        sorted_v = eigenvectors[np.argsort(-eigenvalues)][0:2]

        all_eigenvectors_eigenvalues.append((sorted_v, sorted_eigenvalues))
        mean_vectors.append(w1_m)

    reducer = PCA(2)

    # mean_vectors_all = np.array([np.reshape(w[index, 0:dim].view(-1, dim).detach().numpy(), -1) for index in range(len(wi.items()))])
    # reducer.fit(mean_vectors_all)
    # centers = reducer.transform(np.array(mean_vectors))

    centers = reducer.fit_transform(np.array(mean_vectors))
    basis = reducer.transform(reducer.components_)

    widths = []
    heights = []
    angles = []
    xy = []
    max_eig0 = np.NINF
    max_eig1 = np.NINF
    for i in range(len(all_eigenvectors_eigenvalues)):
        eigenvectors, eigenvalues = all_eigenvectors_eigenvalues[i]
        if eigenvalues[0] > max_eig0:
            max_eig0 = eigenvalues[0]

        if eigenvalues[1] > max_eig1:
            max_eig1 = eigenvalues[1]
    max_overall = max_eig0
    if max_eig1 > max_overall:
        max_overall = max_eig1

    for i in range(len(all_eigenvectors_eigenvalues)):
        label = words[i]
        eigenvectors, eigenvalues = all_eigenvectors_eigenvalues[i]
        proj_eigs = reducer.transform(np.array(eigenvectors))

        width = np.linalg.norm(proj_eigs[0]) * eigenvalues[0] / max_eig0
        height = np.linalg.norm(proj_eigs[1]) * eigenvalues[1] / max_eig1
        angle = np.arccos(
            np.dot(proj_eigs[0], basis[0]) /
            (np.linalg.norm(proj_eigs[0]) * np.linalg.norm(basis[0])))

        widths.append(width)
        heights.append(height)
        angles.append(angle)
        x = centers[i][0]
        y = centers[i][1]
        xy.append((x, y))
        labels.append(label)

    texts = []
    colors = []
    xy = [(xy[i][0] * 1, xy[i][1] * 1) for i in range(len(xy))]
    for i in range(len(widths)):
        # widths = [w / max(widths) for w in widths]
        # heights = [h / max(heights) for h in heights]
        # xy = [(xy[i][0] * 2.2, xy[i][1] * 2.2) for i in range(len(xy))]
        print(labels[i])
        print('width=%2.8f, height=%2.8f, x=%2.5f, y=%2.5f' %
              (widths[i], heights[i], xy[i][0], xy[i][1]))
        ell_color = np.random.rand(3)
        ell = matplotlib.patches.Ellipse(xy=xy[i],
                                         width=widths[i],
                                         height=heights[i],
                                         angle=angles[i],
                                         facecolor=ell_color,
                                         edgecolor=ell_color,
                                         fill=True)
        ax.add_patch(ell)
        ell.set_zorder(-1)
        ell.set_alpha(np.random.rand())
        ell.set_alpha(0.5)
        ell.set(label=labels[i], clip_box=ax.bbox)
        colors.append(ell_color)
        ell.set_facecolor(ell_color)

        texts.append(ax.text(xy[i][0], xy[i][1], labels[i], fontsize=18))
        plt.scatter(xy[i][0],
                    xy[i][1],
                    s=80,
                    alpha=0.8,
                    color=ell_color,
                    edgecolor=ell_color,
                    marker='+')

    adjust_text(texts,
                expand_text=(1.5, 2.5),
                expand_points=(2.5, 2.5),
                expand_objects=(1.9, 2.8),
                expand_align=(1.8, 1.7),
                arrowprops=dict(arrowstyle="-|>", color='r', alpha=0.8))

    xmax = -100
    xmin = 100
    ymin = 100
    ymax = -100
    wmax = -100
    hmax = -100
    for i in range(len(xy)):
        coord = xy[i]
        if wmax < widths[i]:
            wmax = widths[i]
        if hmax < heights[i]:
            hmax = heights[i]

        if coord[0] > xmax:
            xmax = coord[0]
        if coord[0] < xmin:
            xmin = coord[0]

        if coord[1] > ymax:
            ymax = coord[1]
        if coord[1] < ymin:
            ymin = coord[1]

    ax.set_xlim(left=xmin - wmax / 2 - 0.05, right=xmax + wmax / 2 + 0.05)
    ax.set_ylim(bottom=ymin - hmax / 2 - 0.05, top=ymax + hmax / 2 + 0.05)
    ax.legend(prop={'size': 13})
    plt.savefig('ellipses_final.png')
    plt.show()
    plt.close()
Code Example #11
def plot_we_scatterplots(word_list, model, model_name='ft', wi=None):
    labels = word_list
    kstemmer = krovetz.PyKrovetzStemmer()
    reducer = PCA(2)
    if model_name == 'ft':
        vecs = reducer.fit_transform(
            np.array(
                [model.get_word_vector(kstemmer.stem(w)) for w in word_list]))
    elif model_name == 'wn':
        word_list = [w for w in word_list if w in model.wv.vocab]
        labels = word_list
        vecs = reducer.fit_transform(model[[w for w in word_list]])
    else:
        vecs = reducer.fit_transform(
            [model[wi[kstemmer.stem(w)]] for w in word_list])

    fig = plt.figure()
    plt.grid(True)
    plt.axhspan(0, 0, linewidth=2, color='#1f77b4')
    plt.axvline(0)
    ax = plt.gca()
    ax.set_xlim(left=-10, right=10)
    ax.set_ylim(bottom=-10, top=10)
    texts = []
    xy = []
    for i in range(len(vecs)):
        reduced = vecs[i]
        label = labels[i]
        ax.scatter(reduced[0],
                   reduced[1],
                   s=100,
                   alpha=1.0,
                   color='b',
                   marker='+')
        # texts.append(ax.text(reduced[0], reduced[1], label, fontsize=14))
        if label == 'osteoporosis' and model_name == 'wn':
            ax.annotate(label, (reduced[0], reduced[1]),
                        fontsize=18,
                        ha='right')
        else:
            ax.annotate(label, (reduced[0], reduced[1]), fontsize=18)

        xy.append(reduced)
    # adjust_text(texts, expand_text=(0.01, 0.02), arrowprops=dict(arrowstyle="-|>", color='r', alpha=0.0))

    if model_name == 'ft':
        plt.title('FTE')
    elif model_name == 'wn':
        plt.title('WNE')
    else:
        plt.title('W2V')

    xmax = -100
    xmin = 100
    ymin = 100
    ymax = -100
    for i in range(len(xy)):
        coord = xy[i]
        if coord[0] > xmax:
            xmax = coord[0]
        if coord[0] < xmin:
            xmin = coord[0]

        if coord[1] > ymax:
            ymax = coord[1]
        if coord[1] < ymin:
            ymin = coord[1]

    ax.set_xlim(left=xmin - 1, right=xmax + 1)
    ax.set_ylim(bottom=ymin - 1, top=ymax + 1)

    plt.show()
    plt.close(fig)
Code Example #12
def test_do_simple_stem(self):
    ks = krovetz.PyKrovetzStemmer()
    self.assertEqual(ks.stem("walked"), "walk")
    self.assertEqual(ks.stem("run"), "run")
Code Example #13
def test_stem(benchmark, word):
    ks = krovetz.PyKrovetzStemmer()
    result = benchmark(stem_many, ks, word)
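
Here benchmark is the pytest-benchmark fixture; stem_many and word are not shown in this excerpt. A hypothetical sketch of what such a helper might look like (illustrative only):

def stem_many(ks, word, n=1000):
    # Repeatedly stem the same word so the benchmark measures steady-state stemming cost.
    for _ in range(n):
        ks.stem(word)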