# imports required by this snippet; extract_topn_from_vector is a helper
# defined elsewhere in the source project
from collections import defaultdict

import spacy.lang.en.stop_words
from lemmagen3 import Lemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer


def extract_keywords(target_word, word_clustered_data, max_df, topn):
    lemmatizer = Lemmatizer('en')
    l_sent_clust_dict = defaultdict(list)
    sent_clust_dict = defaultdict(list)
    for i, row in word_clustered_data.iterrows():
        l_sent_clust_dict[row['label']].append(row['sentence'])

    for label, sents in l_sent_clust_dict.items():
        sent_clust_dict[label] = " ".join(sents)

    stop1 = list(spacy.lang.en.stop_words.STOP_WORDS)
    stop2 = stopwords.words('english')
    # scikit-learn expects a list of stop words; deduplicate the two sources
    stop = list(set(stop1 + stop2))

    labels, clusters = list(sent_clust_dict.keys()), list(
        sent_clust_dict.values())

    tfidf_transformer = TfidfVectorizer(smooth_idf=True,
                                        use_idf=True,
                                        ngram_range=(1, 2),
                                        max_df=max_df,
                                        stop_words=stop,
                                        max_features=10000)
    tfidf_transformer.fit(clusters)
    # get_feature_names() was removed in scikit-learn 1.2; the replacement
    # below is available since 1.0
    feature_names = tfidf_transformer.get_feature_names_out()

    keyword_clusters = {}
    for label, cluster in zip(labels, clusters):
        # generate the tf-idf vector for this cluster's concatenated text
        tf_idf_vector = tfidf_transformer.transform([cluster])
        # sort the tf-idf scores in descending order
        coo = tf_idf_vector.tocoo()
        sorted_items = sorted(zip(coo.col, coo.data),
                              key=lambda x: (x[1], x[0]),
                              reverse=True)
        # extract only the top n
        keywords = extract_topn_from_vector(feature_names, sorted_items,
                                            topn * 5)
        keywords = sorted(keywords.items(), key=lambda x: x[1], reverse=True)
        keywords = [x[0] for x in keywords]
        # filter out unigrams that appear in bigrams and remove duplicates
        all_bigrams = " ".join([kw for kw in keywords if len(kw.split()) == 2])
        already_in = set()
        filtered_keywords = []
        for kw in keywords:
            is_unigram = len(kw.split()) == 1
            # substring check: drop a unigram already covered by a kept bigram
            if is_unigram and kw in all_bigrams:
                continue
            if is_unigram:
                kw = lemmatizer.lemmatize(kw)
            if kw not in already_in and kw != target_word:
                filtered_keywords.append(kw)
                already_in.add(kw)

        keyword_clusters[label] = filtered_keywords[:topn]

    return keyword_clusters
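

# A minimal usage sketch for extract_keywords (hypothetical data; assumes the
# imports above and the project's extract_topn_from_vector helper):
import pandas as pd

clustered = pd.DataFrame({
    'label': [0, 0, 1],
    'sentence': ['banks along the river flooded',
                 'the river bank eroded quickly',
                 'the bank approved the loan'],
})
# one keyword list per sense cluster of the target word 'bank'
keyword_clusters = extract_keywords('bank', clustered, max_df=0.8, topn=5)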


def get_lemmatizer(lang):
    # return a token -> lemma/stem callable for the given language code
    if lang == 'hr':
        return Lemmatizer('hr').lemmatize
    if lang == 'ee':
        # 'ee' is used here for Estonian (the ISO 639-1 code is 'et')
        return Lemmatizer('et').lemmatize
    if lang == 'ru':
        return Lemmatizer('ru').lemmatize
    if lang == 'lv':
        return LatvianStemmer.stem
    raise ValueError('unsupported language: {}'.format(lang))
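
# Usage sketch: obtain a token -> lemma callable (the example word is arbitrary)
lemmatize_hr = get_lemmatizer('hr')
print(lemmatize_hr('mačke'))  # expected lemma: 'mačka'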
def test_lemmagen(self):
    normalizer = preprocess.LemmagenLemmatizer('Slovenian')
    sentence = 'Gori na gori hiša gori'
    with self.corpus.unlocked():
        self.corpus.metas[0, 0] = sentence
    self.assertEqual(
        [Lemmatizer("sl").lemmatize(t) for t in sentence.split()],
        normalizer(self.corpus).tokens[0],
    )
Example #4
class LemmagenLemmatizer(BaseNormalizer):
    name = 'Lemmagen Lemmatizer'
    lemmagen_languages = {
        "Bulgarian": "bg",
        "Croatian": "hr",
        "Czech": "cs",
        "English": "en",
        "Estonian": "et",
        "Farsi/Persian": "fa",
        "French": "fr",
        "German": "de",
        "Hungarian": "hu",
        "Italian": "it",
        "Macedonian": "mk",
        "Polish": "pl",
        "Romanian": "ro",
        "Russian": "ru",
        "Serbian": "sr",
        "Slovak": "sk",
        "Slovenian": "sl",
        "Spanish": "es",
        "Ukrainian": "uk"
    }

    def __init__(self, language='English'):
        super().__init__()
        self.language = language
        self.lemmatizer = None

    def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
        # lemmagen3 lemmatizer is not picklable, define it on call and discard it afterward
        self.lemmatizer = Lemmatizer(self.lemmagen_languages[self.language])
        output_corpus = super().__call__(corpus, callback)
        self.lemmatizer = None
        return output_corpus

    def normalizer(self, token):
        assert self.lemmatizer is not None
        t = self.lemmatizer.lemmatize(token)
        # Lemmagen sometimes returns an empty string; fall back to the
        # original token in that case
        return t if t else token
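
# Why the lemmatizer is created inside __call__ rather than held as a
# long-lived attribute: lemmagen3's Lemmatizer wraps a native object that
# cannot be pickled (see the comment in __call__ above). A quick standalone
# check (sketch, not part of the original class):
import pickle
from lemmagen3 import Lemmatizer

try:
    pickle.dumps(Lemmatizer('en'))
except Exception as exc:  # typically a TypeError raised by pickle
    print('Lemmatizer is not picklable:', exc)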
Example #5
# imports required by this snippet (file_to_df, train_test and eval are
# project-specific helpers defined elsewhere in the repository)
import argparse
import os

import numpy as np
import pandas as pd
import sentencepiece as spm
import torch
from lemmagen3 import Lemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn import model_selection


def run_model(batch_size, learning_rate, n_ctx, n_head, n_embd, n_layer,
              adaptive, bpe, masked_lm, classification, bpe_model_path,
              datasets, lm_corpus_file, transfer_learning, pos_tags, dict_path,
              rnn, crf, lm_id, output_path):
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=batch_size)
    parser.add_argument("--length", type=int, default=-1)
    # sampling temperature; float so that values like 0.7 are accepted
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional',
                        action='store_true',
                        help='If true, unconditional generation.')

    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--lr_warmup', type=float, default=0.002)
    parser.add_argument('--lr', type=float, default=learning_rate)
    parser.add_argument('--b1', type=float, default=0.9)
    parser.add_argument('--b2', type=float, default=0.999)
    parser.add_argument('--e', type=float, default=1e-8)
    parser.add_argument('--l2', type=float, default=0.01)
    parser.add_argument('--vector_l2', action='store_true')
    parser.add_argument('--max_grad_norm', type=int, default=1)

    parser.add_argument("--initializer_range", type=float, default=0.02)
    parser.add_argument("--layer_norm_epsilon", type=float, default=1e-6)

    parser.add_argument("--n_ctx", type=int, default=n_ctx)
    parser.add_argument("--n_positions", type=int, default=n_ctx)
    parser.add_argument("--n_embd", type=int, default=n_embd)
    parser.add_argument("--n_head", type=int, default=n_head)
    parser.add_argument("--n_layer", type=int, default=n_layer)
    parser.add_argument("--max_vocab_size",
                        type=int,
                        default=0,
                        help='Zero means no limit.')

    parser.add_argument('--max_step',
                        type=int,
                        default=100000,
                        help='upper epoch limit')
    parser.add_argument('--eta_min',
                        type=float,
                        default=0.0,
                        help='min learning rate for cosine scheduler')
    parser.add_argument('--clip',
                        type=float,
                        default=0.25,
                        help='gradient clipping')
    parser.add_argument('--kw_cut',
                        type=int,
                        default=10,
                        help='Precision and recall @ this cutoff')

    parser.add_argument("--num_epoch", type=int, default=10)

    parser.add_argument('--data_path', type=str, default='data')
    parser.add_argument('--result_path',
                        type=str,
                        default='results_512_sorted_big.txt')

    parser.add_argument('--adaptive',
                        action='store_true',
                        help='If true, use adaptive softmax.')
    parser.add_argument('--bpe',
                        action='store_true',
                        help='If true, use byte pair encoding.')
    parser.add_argument(
        '--masked_lm',
        action='store_true',
        help=
        'If true, use masked language model objective for pretraining instead of regular language model.'
    )
    parser.add_argument('--transfer_learning',
                        action='store_true',
                        help='If true, use a pretrained language model.')
    parser.add_argument('--POS_tags',
                        action='store_true',
                        help='If true, use POS tags as additional input.')
    parser.add_argument('--classification',
                        action='store_true',
                        help='If true, train a classifier.')
    parser.add_argument(
        '--rnn',
        action='store_true',
        help='If true, use an RNN with attention in the classification head.')
    parser.add_argument(
        '--crf',
        action='store_true',
        help=
        'If true, use a CRF instead of the custom loss function in the classification head.'
    )

    parser.add_argument('--bpe_model_path', type=str, default=bpe_model_path)
    parser.add_argument('--datasets', type=str, default=datasets)
    parser.add_argument('--lm_corpus_file', type=str, default=lm_corpus_file)
    parser.add_argument('--trained_language_models_dir',
                        type=str,
                        default='trained_language_models')
    parser.add_argument('--trained_classification_models_dir',
                        type=str,
                        default='trained_classification_models')

    parser.add_argument('--dict_path',
                        type=str,
                        default=dict_path,
                        help='Path to dictionary')
    parser.add_argument('--lang',
                        type=str,
                        default='english',
                        help='Language of the corpus')
    parser.add_argument('--lm_id',
                        type=str,
                        default=lm_id,
                        help='Path to language model')
    parser.add_argument('--output_path',
                        type=str,
                        default=output_path,
                        help='Output designator')
    parser.add_argument('--cuda',
                        action='store_false',
                        help='GPU is used by default; pass this flag to disable it.')

    args = parser.parse_args()
    args.adaptive = adaptive
    args.classification = classification
    args.transfer_learning = transfer_learning
    args.POS_tags = pos_tags
    args.bpe = bpe
    args.masked_lm = masked_lm
    args.rnn = rnn
    args.crf = crf
    args.cuda = True  # note: this overrides the --cuda flag parsed above

    if not os.path.exists(args.trained_classification_models_dir):
        os.makedirs(args.trained_classification_models_dir)

    if not os.path.exists(args.trained_language_models_dir):
        os.makedirs(args.trained_language_models_dir)

    if args.bpe:
        sp = spm.SentencePieceProcessor()
        sp.Load(args.bpe_model_path)
    else:
        sp = None

    # the CRF and RNN classification heads are mutually exclusive
    assert not (args.crf and args.rnn)

    if args.classification:
        assert args.trained_classification_models_dir != args.trained_language_models_dir
        assert not args.adaptive
        if args.transfer_learning:
            l_models = os.listdir(args.trained_language_models_dir)
            for l_model in l_models:
                if args.lm_id in l_model:
                    args.language_model_path = os.path.join(
                        args.trained_language_models_dir, l_model)
            print('Classification, using language model: ',
                  args.language_model_path)
            print()

    if not args.transfer_learning:
        assert not os.path.exists(args.dict_path)

    print(args)

    if args.lang == 'english':
        stemmer = PorterStemmer()
    elif args.lang == 'estonian':
        stemmer = Lemmatizer('et')
    elif args.lang == 'croatian':
        stemmer = Lemmatizer('hr')
    elif args.lang == 'russian':
        stemmer = Lemmatizer('ru')

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    if not args.classification:
        df_data = file_to_df(os.path.join(args.data_path, args.lm_corpus_file),
                             classification=False)
        df_data = df_data.sample(frac=1, random_state=2019)
        val_idx = int(0.8 * df_data.shape[0])
        test_idx = int(0.9 * df_data.shape[0])
        df_train = df_data[:val_idx]
        df_valid = df_data[val_idx:test_idx]
        df_test = df_data[test_idx:]

        print(
            '------------------------------------------------------------------------------------------------------'
        )
        print('Training language model on all data')
        print("Train size: ", df_train.shape, "Valid size: ", df_valid.shape,
              "Test size: ", df_test.shape)
        print(
            '------------------------------------------------------------------------------------------------------'
        )
        print()
        train_test(df_train, df_valid, df_test, args, stemmer, sp)

    else:
        result_file = open(args.result_path, 'a', encoding='utf8')
        result_file.write("Classification results using language model " +
                          args.lm_id + " and config " + args.output_path +
                          ":\n\n")
        result_file.write("Parameters:\n")
        result_file.write(
            str(args) + '\n------------------------------------------------\n')

        for folder in args.datasets.split(';'):

            print(
                '------------------------------------------------------------------------------------------------------'
            )
            print('Training on: ', folder)
            print(
                '------------------------------------------------------------------------------------------------------'
            )

            if folder == 'duc' or folder == 'nus':
                #cross validation
                kf = model_selection.KFold(n_splits=10)
                df_data = file_to_df(os.path.join(args.data_path, folder,
                                                  folder + '_test.json'),
                                     classification=True)
                df_data = df_data.sample(frac=1, random_state=2019)
                print()
                print('Cross validation on', folder)

                fold_counter = 0

                total_pred = []
                total_true = []

                for train_index, test_index in kf.split(df_data):
                    fold_counter += 1
                    df_train, df_test = df_data.iloc[
                        train_index], df_data.iloc[test_index]
                    sep_idx = int(df_train.shape[0] / 10)
                    df_valid = df_train[:sep_idx]
                    df_train = df_train[sep_idx:]

                    print("Train fold ", fold_counter, "fold size: ",
                          df_train.shape, "Valid fold size: ", df_valid.shape,
                          "Test fold  size: ", df_test.shape)
                    print()

                    fold_pred, fold_true, num_parameters = train_test(
                        df_train, df_valid, df_test, args, stemmer, sp, folder)
                    total_pred.extend(fold_pred)
                    total_true.extend(fold_true)
                print()
                print(
                    '--------------------------------------------------------------------'
                )
                print('Final CV results:')
                print()

            else:
                df_train = file_to_df(os.path.join(args.data_path, folder,
                                                   folder + '_valid.json'),
                                      classification=True)
                df_train = df_train.sample(frac=1, random_state=2019)
                val_idx = int(0.8 * df_train.shape[0])
                df_valid = df_train[val_idx:]
                df_train = df_train[:val_idx]
                df_test = file_to_df(os.path.join(args.data_path, folder,
                                                  folder + '_test.json'),
                                     classification=True)

                print("Train size: ", df_train.shape, "Valid size: ",
                      df_valid.shape, "Test size: ", df_test.shape)
                print()

                total_pred, total_true, num_parameters = train_test(
                    df_train, df_valid, df_test, args, stemmer, sp, folder)

            p_5, r_5, f_5, p_10, r_10, f_10, p_k, r_k, f_k, p_M, r_M, f_M = eval(
                total_pred, total_true, lang=args.lang)

            result_file.write("Dataset: " + folder + '\n')
            result_file.write('Precision@5: ' + str(p_5) + ' Recall@5: ' +
                              str(r_5) + ' F1@5: ' + str(f_5) + '\n')
            result_file.write('Precision@10: ' + str(p_10) + ' Recall@10: ' +
                              str(r_10) + ' F1@10: ' + str(f_10) + '\n')
            result_file.write('Precision@k: ' + str(p_k) + ' Recall@k: ' +
                              str(r_k) + ' F1@k: ' + str(f_k) + '\n')
            result_file.write('Precision@M: ' + str(p_M) + ' Recall@M: ' +
                              str(r_M) + ' F1@M: ' + str(f_M) + '\n')
            result_file.write('Num. trainable parameters: ' +
                              str(num_parameters) + '\n')

            outputs = []

            for pred, true in zip(total_pred, total_true):
                pred = ";".join(list(pred))
                true = ";".join(list(true))
                outputs.append((pred, true))

            df_preds = pd.DataFrame(outputs, columns=['Predicted', 'True'])
            df_preds.to_csv('predictions/' + folder + '_' + args.output_path +
                            '.csv',
                            sep=',',
                            encoding='utf8')

        result_file.write(
            "\n-----------------------------------------------------------\n")
        result_file.write(
            "\n-----------------------End of the run----------------------\n")
        result_file.write(
            "\n-----------------------------------------------------------\n")
        result_file.close()
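
# Hypothetical invocation sketch (placeholder values, not the authors'
# settings); note that run_model still calls parser.parse_args() internally,
# so it should be launched without unrelated CLI flags:
if __name__ == '__main__':
    run_model(batch_size=16, learning_rate=3e-4, n_ctx=512, n_head=8,
              n_embd=512, n_layer=8, adaptive=False, bpe=True, masked_lm=False,
              classification=True, bpe_model_path='bpe.model',
              datasets='kp20k', lm_corpus_file='lm_corpus.txt',
              transfer_learning=True, pos_tags=False, dict_path='dict.ptb',
              rnn=False, crf=True, lm_id='lm_best', output_path='run1')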
import scipy
import numpy as np
import pandas as pd

from lemmagen3 import Lemmatizer
from nltk.translate.bleu_score import sentence_bleu as bleu

stemmer = Lemmatizer('hr').lemmatize
#stemmer = LatvianStemmer()


def stem_word_list(word_list):
    return [stemmer(w.strip()) for w in word_list]


def macro_averaged_score(precisionlist, recalllist):
    precision = np.average(precisionlist)
    recall = np.average(recalllist)
    f_score = 0
    # guard against division by zero when both precision and recall are 0
    if precision or recall:
        f_score = round((2 * (precision * recall)) / (precision + recall), 4)
    return precision, recall, f_score
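
# Quick sanity check (hypothetical numbers): the averages give P=0.5 and
# R=0.25, so F1 = 2 * 0.5 * 0.25 / 0.75 = 0.3333
print(macro_averaged_score([0.4, 0.6], [0.2, 0.3]))  # (0.5, 0.25, 0.3333)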


def get_match_result(true_seqs, pred_seqs, do_stem=True, type='exact'):
    '''
    If type='exact', returns a list of booleans indicating if a pred has a matching tgt.
    If type='partial', returns a 2D matrix where each value v_ij is a float in [0, 1]
        indicating the (Jaccard) similarity between pred_i and tgt_j.
    '''
Example #7
import argparse

from lemmagen3 import Lemmatizer
from nltk.stem.porter import PorterStemmer
import pke


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--datasets', type=str, default='data/croatian/croatian_test.json',
                        help='paths to datasets separated with ;')
    parser.add_argument('--lang', type=str, default='croatian', help='language')
    parser.add_argument('--num_keywords', type=int, default=10, help='Number of keywords')
    args = parser.parse_args()

    if args.lang == 'english':
        stemmer = PorterStemmer()
    elif args.lang == 'estonian':
        stemmer = Lemmatizer('et')
    elif args.lang == 'croatian':
        stemmer = Lemmatizer('hr')

    language = args.lang
    numOfKeywords = args.num_keywords
    input_paths = args.datasets.split(';')

    for input_path in input_paths:
        all_preds = []
        all_true = []
        counter = 0

        num_tokens = 0
        num_kw = 0
Example #8
    def tokenize_doc(self, df, max_length, valid=False):

        if self.lang == 'english':
            stemmer = PorterStemmer()
        elif self.lang == 'estonian':
            stemmer = Lemmatizer('et')
        elif self.lang == 'croatian':
            stemmer = Lemmatizer('hr')
        stemmed_string = ""

        docs = []
        for idx, line in df.iterrows():
            if self.pos:
                words, pos_tags = self.preprocess_line(line, self.pos)
            else:
                words = self.preprocess_line(line, self.pos)
            if self.lang == 'english':
                stems = " ".join([stemmer.stem(w.lower()) for w in words])
            elif self.lang == 'estonian' or self.lang == 'croatian':
                stems = " ".join([stemmer.lemmatize(w.lower()) for w in words])
            stemmed_string += stems + " "

            tokenized_keywords = []
            keywords = line['keyword'].lower()
            keywords = keywords.replace('-', ' ')
            keywords = keywords.replace('/', ' ')
            keywords = keywords.replace('∗', ' ')

            for kw in keywords.split(';'):
                if not self.bpe:
                    kw = kw.split()
                else:
                    kw = self.sp.tokenize(kw)
                tokenized_keywords.append(kw)

            if self.pos:
                docs.append([words, pos_tags, tokenized_keywords])
            else:
                docs.append([words, tokenized_keywords])

        docs = sorted(docs, key=lambda x: len(x[0]))

        x = torch.zeros([len(docs), max_length], dtype=torch.long)
        y = torch.zeros([len(docs), max_length], dtype=torch.long)
        if self.pos:
            x_pos = torch.zeros([len(docs), max_length], dtype=torch.long)

        all_keywords = {}
        not_in_text = defaultdict(int)
        present_kw = 0
        all_kw = 0
        copies = 0
        max_lkw = 4

        for i, doc in enumerate(docs):
            if self.pos:
                words, pos_tags, kws = doc
            else:
                words, kws = doc

            length = len(words)
            kw_in_paper = []
            stemmed_kw_in_paper = []

            for j, word in enumerate(words):
                if word in self.dictionary.word2idx:
                    idx = self.dictionary.word2idx[word]

                    for kw in kws:
                        lkw = len(kw)

                        is_keyword = False
                        if j + lkw < length:
                            for k in range(lkw):
                                w = words[j + k]

                                if self.lang == 'english':
                                    if stemmer.stem(w.lower()) != stemmer.stem(
                                            kw[k].lower()):
                                        break
                                elif self.lang == 'estonian' or self.lang == 'croatian':
                                    if stemmer.lemmatize(
                                            w.lower()) != stemmer.lemmatize(
                                                kw[k].lower()):
                                        break

                            # for/else: reached only if the loop above did not
                            # break, i.e. every token of the keyword matched
                            else:
                                is_keyword = True
                        if is_keyword:

                            # B/I-style labels: 3 marks the first token of a
                            # keyword, 2 a continuation token (1 = regular
                            # token, 0 = padding; see below)
                            for k in range(lkw):
                                if j + k < max_length:
                                    y[i][j + k] = 3 if k == 0 else 2

                            kw_in_paper.append(" ".join(kw))

                            if self.lang == 'english':
                                stemmed_kw = " ".join(
                                    [stemmer.stem(w.lower()) for w in kw])
                            elif self.lang == 'estonian' or self.lang == 'croatian':
                                stemmed_kw = " ".join(
                                    [stemmer.lemmatize(w.lower()) for w in kw])

                            stemmed_kw_in_paper.append(stemmed_kw)

                else:
                    idx = self.dictionary.word2idx[unk_token]
                if j < max_length:
                    x[i][j] = idx
                    if y[i][j] == 0:
                        # tokens not covered by any keyword get label 1
                        y[i][j] = 1

            if self.pos:
                for j, pt in enumerate(pos_tags):
                    if pt in self.dictionary.word2idx:
                        idx = self.dictionary.word2idx[pt]
                    else:
                        idx = self.dictionary.word2idx[unk_token]
                    if j < max_length:
                        x_pos[i][j] = idx

            # fingerprint the document by its non-padding token ids so that
            # duplicate inputs can be detected
            key = x[i].numpy()
            key = "".join([str(idx) for idx in key if idx != 0])

            # remove keywords that don't appear in the text
            num_all_kw = len(kws)
            not_kws = [
                " ".join(x) for x in kws if " ".join(x) not in kw_in_paper
            ]
            kws = [x for x in kws if " ".join(x) in kw_in_paper]

            for k in not_kws:
                not_in_text[k] += 1

            all_kw += num_all_kw
            present_kw += len(kws)

            if key not in all_keywords:
                all_keywords[key] = kws
            else:
                copies += 1
                #print('TWO identical keys!')
                #print(key)
                #print([self.dictionary.idx2word[idx] for idx in x[i].numpy()])

        print('Num all keywords: ', all_kw)
        print('Percentage of kw. present: ', present_kw / all_kw)

        print('Num identical keys: ', copies)

        missing = sorted(not_in_text.items(), key=lambda x: x[1], reverse=True)
        print('Num. keywords that do not appear inside text: ', len(missing))
        print('Most common out of text kw: ', missing[:100])
        print('Max kw length: ', max_lkw)

        print('X Y size: ', x.size(), y.size())
        if self.pos:
            return x, x_pos, y, all_keywords, stemmed_string
        return x, y, all_keywords, stemmed_string
Example #10
import config, sys
from install.module import *
try:
    # https://pypi.org/project/lemmagen3/
    from lemmagen3 import Lemmatizer
    moduleInstalled = True
except ImportError:
    moduleInstalled = False

# Install essential module if it is absent
if not moduleInstalled:
    installmodule("lemmagen3")
if "lemmagen3" not in sys.modules:
    try:
        from lemmagen3 import Lemmatizer
    except ImportError:
        config.mainWindow.displayMessage(
            "This plugin is not enabled.\nRun 'pip3 install {0}' to install essential module first."
            .format("lemmagen3"))

if config.pluginContext:
    lemma = Lemmatizer('en').lemmatize(config.pluginContext)
    config.mainWindow.runTextCommand("SPEAK:::en-gb:::{0}".format(lemma))
    # Search multiple third-party dictionaries
    for thirdDict in ("webster", ):
        config.mainWindow.runTextCommand(
            "SEARCHTHIRDDICTIONARY:::{0}:::{1}".format(thirdDict, lemma))
else:
    config.contextSource.messageNoSelection()
import scipy
from nltk.stem.porter import *
import numpy as np
import pandas as pd
from lemmagen3 import Lemmatizer
import LatvianStemmer

from nltk.translate.bleu_score import sentence_bleu as bleu

stemmer_en = PorterStemmer()
stemmer_et = Lemmatizer('et')
stemmer_hr = Lemmatizer('hr')
stemmer_ru = Lemmatizer('ru')
stemmer_lv = LatvianStemmer


def stem_word_list(word_list, lang):
    if lang == 'english':
        return [stemmer_en.stem(w.strip()) for w in word_list]
    elif lang == 'latvian':
        return [stemmer_lv.stem(w.strip()) for w in word_list]
    elif lang == 'estonian':
        return [stemmer_et.lemmatize(w.strip()) for w in word_list]
    elif lang == 'croatian':
        return [stemmer_hr.lemmatize(w.strip()) for w in word_list]
    elif lang == 'russian':
        return [stemmer_ru.lemmatize(w.strip()) for w in word_list]
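
# Quick check (hypothetical tokens): Porter stemming of English words
print(stem_word_list(['cats', 'running'], 'english'))  # ['cat', 'run']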


def macro_averaged_score(precisionlist, recalllist):
    precision = np.average(precisionlist)
Example #12
import argparse

# (Analogy and load_emb are project-specific helpers defined elsewhere
# in this repository)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-e',
                        '--embeddings',
                        required=True,
                        help="Path to embeddings file.")
    parser.add_argument('-n',
                        '--n',
                        type=int,
                        default=25,
                        help="Search among top n candidates.")
    parser.add_argument('-o', '--output', required=True, help="Output prefix.")
    parser.add_argument('-i',
                        '--input',
                        default="Poklici_enobesedni.csv",
                        help="Analogies .tsv file")
    parser.add_argument('-l',
                        '--lemmatize',
                        action="store_true",
                        help="Lemmatize the embeddings.")
    parser.add_argument(
        '--avginput',
        action="store_true",
        help=
        "Take averages of male and female vectors as input instead of just words 'man' and 'woman'."
    )
    args = parser.parse_args()
    entries = []
    if args.lemmatize:
        nlp = Lemmatizer('sl')
    else:
        nlp = False

    with open(args.input, 'r') as reader:
        reader.readline()
        for line in reader:
            entries.append(Analogy(line, args.n))
    id2word, word2id, embmatrix = load_emb(args.embeddings)

    m_vector = embmatrix[word2id['moški']]
    f_vector = embmatrix[word2id['ženska']]
    if args.avginput:
        genderpairs = [('gospod', 'gospa'), ('fant', 'dekle'),
                       ('fant', 'punca'), ('deček', 'deklica'),
                       ('brat', 'sestra'), ('oče', 'mati'), ('sin', 'hči'),
                       ('dedek', 'babica'), ('mož', 'žena'), ('stric', 'teta'),
                       ('on', 'ona')]
        for p in genderpairs:
            m_vector += embmatrix[word2id[p[0]]]
            f_vector += embmatrix[word2id[p[1]]]
        # +1 because the original 'moški'/'ženska' vectors count as one pair
        m_vector /= (len(genderpairs) + 1)
        f_vector /= (len(genderpairs) + 1)

    correct_m_input = {1: 0, 5: 0, 10: 0, 20: 0}
    correct_f_input = {1: 0, 5: 0, 10: 0, 20: 0}
    correct_m_input_filtered = {1: 0, 5: 0, 10: 0, 20: 0}
    correct_f_input_filtered = {1: 0, 5: 0, 10: 0, 20: 0}
    m_input_coverage = 0
    f_input_coverage = 0
    with open(args.output + '.1.csv',
              'w') as writer1, open(args.output + '.2.csv', 'w') as writer2:
        writer1.write('PoklicM,KandidatŽ,rank_KŽ,cos_similarity\n')
        writer2.write('PoklicŽ,KandidatM,rank_KM,cos_similarity\n')
        for e in entries:
            #print(e.poklicm1, e.poklicf1, e.poklicf2, e.countf1, e.countf2)
            e.m_input(word2id, embmatrix, id2word, args.n,
                      [m_vector, f_vector], nlp)
            e.f_input(word2id, embmatrix, id2word, args.n,
                      [m_vector, f_vector], nlp)
            if e.m_in_f_candidates[0] != 'N/A':
                m_input_coverage += 1
            if e.f_in_m_candidates[0] != 'N/A':
                f_input_coverage += 1
            # previously: list(filter(lambda x: x not in [e.poklicm1, e.poklicm2, 'moški', 'ženska'], e.m_in_f_candidates))
            f_candidates_filtered = e.filter_female()
            # previously: list(filter(lambda x: x not in [e.poklicf1, e.poklicf2, 'moški', 'ženska'], e.f_in_m_candidates))
            m_candidates_filtered = e.filter_male()
            fcount = 0
            mcount = 0
            for c in f_candidates_filtered[:10]:
                j = e.m_in_f_candidates.index(c)
                writer1.write(e.poklicm1 + ',' + c + ',' + str(j + 1) + ',' +
                              str(e.f_candidates_dist[j]) + '\n')
            for c in m_candidates_filtered[:10]:
                j = e.f_in_m_candidates.index(c)
                writer2.write(e.poklicf1 + ',' + c + ',' + str(j + 1) + ',' +
                              str(e.m_candidates_dist[j]) + '\n')
            #for j in range(args.n):
            #    if e.m_in_f_candidates[j] in f_candidates_filtered and fcount < 10:
            #        writer1.write(e.poklicm1+','+e.m_in_f_candidates[j]+','+str(j+1)+','+str(e.f_candidates_dist[j])+'\n')
            #        #print(j)
            #        fcount += 1
            #    if e.f_in_m_candidates[j] in m_candidates_filtered and mcount < 10:
            #        writer2.write(e.poklicf1+','+e.f_in_m_candidates[j]+','+str(j+1)+','+str(e.m_candidates_dist[j])+'\n')
            #        #print(j)
            #        mcount += 1
            for i in [1, 5, 10, 20]:
                if (e.poklicf1 in e.m_in_f_candidates[:i]
                        or e.poklicf2 in e.m_in_f_candidates[:i]):
                    correct_m_input[i] += 1
                if (e.poklicm1 in e.f_in_m_candidates[:i]
                        or e.poklicm2 in e.f_in_m_candidates[:i]):
                    correct_f_input[i] += 1
                if (e.poklicf1 in f_candidates_filtered[:i]
                        or e.poklicf2 in f_candidates_filtered[:i]):
                    correct_m_input_filtered[i] += 1
                if (e.poklicm1 in m_candidates_filtered[:i]
                        or e.poklicm2 in m_candidates_filtered[:i]):
                    correct_f_input_filtered[i] += 1

    with open(args.output + '.inputrank.csv', 'w') as writer:
        writer.write('PoklicVhod,rankPVnaIzhodu\n')
        for e in entries:
            writer.write(','.join(e.Moutputrank) + '\n')
            writer.write(','.join(e.Foutputrank) + '\n')

    with open(args.output + '.condensed.txt', 'w') as writer:
        # (Slovene) "Coverage: what share of occupations appears in the embeddings"
        writer.write(
            'Coverage (kolikšen delež poklicev se pojavi v embeddingih):\n')
        # 'moški poklici' = male occupations, 'ženski poklici' = female occupations
        writer.write('moški poklici: ' + str(m_input_coverage / len(entries)) +
                     '\n')
        writer.write('ženski poklici: ' +
                     str(f_input_coverage / len(entries)) + '\n')
        # (Slovene) "all = accuracy over all items, a missing occupation counts
        # as incorrect; covered = only over occupations present in the
        # embeddings, missing ones are not counted"
        writer.write(
            '\r\nAnalogy accuracy, all = uspešnost med vsemi, če se poklic ne pojavi se šteje za nepravilno določen, covered = samo med tistimi, ki se pojavijo, če se poklic ne pojavi, se ne upošteva.\n'
        )
        writer.write('m input, f output: (all / covered)\n')
        for i in [1, 5, 10, 20]:
            printstring = 'acc@' + str(i) + ' = ' + str(
                correct_m_input[i] / len(entries)) + ' / ' + str(
                    correct_m_input[i] / m_input_coverage)
            writer.write(printstring + '\n')
        writer.write('\nf input, m output: (all / covered)\n')
        for i in [1, 5, 10, 20]:
            printstring = 'acc@' + str(i) + ' = ' + str(
                correct_f_input[i] / len(entries)) + ' / ' + str(
                    correct_f_input[i] / f_input_coverage)
            writer.write(printstring + '\n')
        writer.write('\nm input, f output, filtered: (all / covered)\n')
        for i in [1, 5, 10, 20]:
            printstring = 'acc@' + str(i) + ' = ' + str(
                correct_m_input_filtered[i] / len(entries)) + ' / ' + str(
                    correct_m_input_filtered[i] / m_input_coverage)
            writer.write(printstring + '\n')
        writer.write('\nf input, m output, filtered: (all / covered)\n')
        for i in [1, 5, 10, 20]:
            printstring = 'acc@' + str(i) + ' = ' + str(
                correct_f_input_filtered[i] / len(entries)) + ' / ' + str(
                    correct_f_input_filtered[i] / f_input_coverage)
            writer.write(printstring + '\n')
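
# Hypothetical command line for this script (file name and argument values
# are placeholders):
#   python analogies.py -e embeddings.vec -o results -n 25 --lemmatize
if __name__ == '__main__':
    main()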
Example #13
from lemmagen3 import Lemmatizer

print(Lemmatizer.list_supported_languages())

a = Lemmatizer('en')
word = 'cats'
print('{}->{}'.format(word, a.lemmatize(word)))

b = Lemmatizer('sl')
word = 'ljudje'
print('{}->{}'.format(word, b.lemmatize(word)))
Example #14
from lemmagen3 import Lemmatizer

# first, list all supported languages
print(Lemmatizer.list_supported_languages())

# then, create a few lemmatizer objects using ISO 639-1 language codes
# (English, Slovene and Russian)

lem_en = Lemmatizer('en')
lem_sl = Lemmatizer('sl')
lem_ru = Lemmatizer('ru')

# now lemmatize one word in each of the three languages
print(lem_en.lemmatize('cats'))  # expected: cat
print(lem_sl.lemmatize('je'))    # expected: biti
print(lem_ru.lemmatize('коты'))  # expected: кот