from collections import defaultdict

import spacy.lang.en.stop_words
from lemmagen3 import Lemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer


def extract_keywords(target_word, word_clustered_data, max_df, topn):
    lemmatizer = Lemmatizer('en')
    l_sent_clust_dict = defaultdict(list)
    sent_clust_dict = defaultdict(list)
    for i, row in word_clustered_data.iterrows():
        l_sent_clust_dict[row['label']].append(row['sentence'])
    for label, sents in l_sent_clust_dict.items():
        sent_clust_dict[label] = " ".join(sents)

    stop1 = list(spacy.lang.en.stop_words.STOP_WORDS)
    stop2 = stopwords.words('english')
    stop = set(stop1 + stop2)

    labels, clusters = list(sent_clust_dict.keys()), list(sent_clust_dict.values())

    tfidf_transformer = TfidfVectorizer(smooth_idf=True,
                                        use_idf=True,
                                        ngram_range=(1, 2),
                                        max_df=max_df,
                                        stop_words=stop,
                                        max_features=10000)
    tfidf_transformer.fit(clusters)
    # note: newer scikit-learn versions use get_feature_names_out() instead
    feature_names = tfidf_transformer.get_feature_names()

    keyword_clusters = {}
    for label, cluster in zip(labels, clusters):
        # generate the tf-idf vector for this cluster
        tf_idf_vector = tfidf_transformer.transform([cluster])

        # sort the tf-idf scores in descending order
        tuples = zip(tf_idf_vector.tocoo().col, tf_idf_vector.tocoo().data)
        sorted_items = sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

        # extract only the top n
        keywords = extract_topn_from_vector(feature_names, sorted_items, topn * 5)
        keywords = sorted(keywords.items(), key=lambda x: x[1], reverse=True)
        keywords = [x[0] for x in keywords]

        # filter unigrams that appear in bigrams and remove duplicates
        all_bigrams = " ".join([kw for kw in keywords if len(kw.split()) == 2])
        already_in = set()
        filtered_keywords = []
        for kw in keywords:
            if len(kw.split()) == 1 and kw in all_bigrams:
                continue
            else:
                if len(kw.split()) == 1:
                    kw = lemmatizer.lemmatize(kw)
                if kw not in already_in and kw != target_word:
                    filtered_keywords.append(kw)
                    already_in.add(kw)
        keyword_clusters[label] = filtered_keywords[:topn]
    return keyword_clusters
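# extract_topn_from_vector is not defined in the snippet above. A minimal sketch,
# reconstructed from how it is called (feature names, (column, score) pairs sorted by
# score, and a cut-off); the original helper may differ in details such as rounding:
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the top-n (column index, tf-idf score) pairs back to feature names."""
    results = {}
    for idx, score in sorted_items[:topn]:
        results[feature_names[idx]] = round(score, 3)
    return results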
from lemmagen3 import Lemmatizer
import LatvianStemmer


def get_lemmatizer(lang):
    lemmatizer = -1
    if lang == 'hr':
        lemmatizer = Lemmatizer('hr').lemmatize
    elif lang == 'ee':
        lemmatizer = Lemmatizer('et').lemmatize
    elif lang == 'ru':
        lemmatizer = Lemmatizer('ru').lemmatize
    elif lang == 'lv':
        lemmatizer = LatvianStemmer.stem
    assert not lemmatizer == -1
    return lemmatizer
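# Usage sketch for get_lemmatizer above; the Croatian word below is only an
# illustrative input, not taken from the original code.
lemmatize = get_lemmatizer('hr')
print(lemmatize('mačke'))  # prints the lemma returned by lemmagen3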
def test_lemmagen(self):
    normalizer = preprocess.LemmagenLemmatizer('Slovenian')
    sentence = 'Gori na gori hiša gori'
    with self.corpus.unlocked():
        self.corpus.metas[0, 0] = sentence
    self.assertEqual(
        [Lemmatizer("sl").lemmatize(t) for t in sentence.split()],
        normalizer(self.corpus).tokens[0],
    )
class LemmagenLemmatizer(BaseNormalizer):
    name = 'Lemmagen Lemmatizer'
    lemmagen_languages = {
        "Bulgarian": "bg",
        "Croatian": "hr",
        "Czech": "cs",
        "English": "en",
        "Estonian": "et",
        "Farsi/Persian": "fa",
        "French": "fr",
        "German": "de",
        "Hungarian": "hu",
        "Italian": "it",
        "Macedonian": "mk",
        "Polish": "pl",
        "Romanian": "ro",
        "Russian": "ru",
        "Serbian": "sr",
        "Slovak": "sk",
        "Slovenian": "sl",
        "Spanish": "es",
        "Ukrainian": "uk",
    }

    def __init__(self, language='English'):
        super().__init__()
        self.language = language
        self.lemmatizer = None

    def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
        # the lemmagen3 lemmatizer is not picklable, so create it on call and discard it afterwards
        self.lemmatizer = Lemmatizer(self.lemmagen_languages[self.language])
        output_corpus = super().__call__(corpus, callback)
        self.lemmatizer = None
        return output_corpus

    def normalizer(self, token):
        assert self.lemmatizer is not None
        t = self.lemmatizer.lemmatize(token)
        # Lemmagen sometimes returns an empty string; return the original token in that case
        return t if t else token
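# A minimal, standalone sketch of the fallback implemented in normalizer() above:
# lemmagen3 may return an empty string for some inputs, in which case the original
# token is kept. Only lemmagen3 is needed here; no Orange objects are involved.
from lemmagen3 import Lemmatizer

lem = Lemmatizer('en')
for token in ['cats', 'running', '...', '42']:
    lemma = lem.lemmatize(token)
    print(token, '->', lemma if lemma else token)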
def run_model(batch_size, learning_rate, n_ctx, n_head, n_embd, n_layer,
              adaptive, bpe, masked_lm, classification, bpe_model_path,
              datasets, lm_corpus_file, transfer_learning, pos_tags, dict_path,
              rnn, crf, lm_id, output_path):
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=batch_size)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=int, default=1)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional', action='store_true',
                        help='If true, unconditional generation.')
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--lr_warmup', type=float, default=0.002)
    parser.add_argument('--lr', type=float, default=learning_rate)
    parser.add_argument('--b1', type=float, default=0.9)
    parser.add_argument('--b2', type=float, default=0.999)
    parser.add_argument('--e', type=float, default=1e-8)
    parser.add_argument('--l2', type=float, default=0.01)
    parser.add_argument('--vector_l2', action='store_true')
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument("--initializer_range", type=float, default=0.02)
    parser.add_argument("--layer_norm_epsilon", type=float, default=1e-6)
    parser.add_argument("--n_ctx", type=int, default=n_ctx)
    parser.add_argument("--n_positions", type=int, default=n_ctx)
    parser.add_argument("--n_embd", type=int, default=n_embd)
    parser.add_argument("--n_head", type=int, default=n_head)
    parser.add_argument("--n_layer", type=int, default=n_layer)
    parser.add_argument("--max_vocab_size", type=int, default=0,
                        help='Zero means no limit.')
    parser.add_argument('--max_step', type=int, default=100000,
                        help='upper epoch limit')
    parser.add_argument('--eta_min', type=float, default=0.0,
                        help='min learning rate for cosine scheduler')
    parser.add_argument('--clip', type=float, default=0.25,
                        help='gradient clipping')
    parser.add_argument('--kw_cut', type=int, default=10,
                        help='Precision and recall @')
    parser.add_argument("--num_epoch", type=int, default=10)
    parser.add_argument('--data_path', type=str, default='data')
    parser.add_argument('--result_path', type=str,
                        default='results_512_sorted_big.txt')
    parser.add_argument('--adaptive', action='store_true',
                        help='If true, use adaptive softmax.')
    parser.add_argument('--bpe', action='store_true',
                        help='If true, use byte pair encoding.')
    parser.add_argument('--masked_lm', action='store_true',
                        help='If true, use a masked language model objective for '
                             'pretraining instead of a regular language model.')
    parser.add_argument('--transfer_learning', action='store_true',
                        help='If true, use a pretrained language model.')
    parser.add_argument('--POS_tags', action='store_true', help='POS tags')
    parser.add_argument('--classification', action='store_true',
                        help='If true, train a classifier.')
    parser.add_argument('--rnn', action='store_true',
                        help='If true, use an RNN with attention in the classification head.')
    parser.add_argument('--crf', action='store_true',
                        help='If true, use a CRF instead of the custom loss function '
                             'in the classification head.')
    parser.add_argument('--bpe_model_path', type=str, default=bpe_model_path)
    parser.add_argument('--datasets', type=str, default=datasets)
    parser.add_argument('--lm_corpus_file', type=str, default=lm_corpus_file)
    parser.add_argument('--trained_language_models_dir', type=str,
                        default='trained_language_models')
    parser.add_argument('--trained_classification_models_dir', type=str,
                        default='trained_classification_models')
    parser.add_argument('--dict_path', type=str, default=dict_path,
                        help='Path to dictionary')
    parser.add_argument('--lang', type=str, default='english', help='Language')
    parser.add_argument('--lm_id', type=str, default=lm_id,
                        help='Path to language model')
    parser.add_argument('--output_path', type=str, default=output_path,
                        help='Output designator')
    parser.add_argument('--cuda', action='store_false', help='If true, use gpu.')
    args = parser.parse_args()

    args.adaptive = adaptive
    args.classification = classification
    args.transfer_learning = transfer_learning
    args.POS_tags = pos_tags
    args.bpe = bpe
    args.masked_lm = masked_lm
    args.rnn = rnn
    args.crf = crf
    args.cuda = True

    if not os.path.exists(args.trained_classification_models_dir):
        os.makedirs(args.trained_classification_models_dir)
    if not os.path.exists(args.trained_language_models_dir):
        os.makedirs(args.trained_language_models_dir)

    if args.bpe:
        sp = spm.SentencePieceProcessor()
        sp.Load(args.bpe_model_path)
    else:
        sp = None

    if args.crf:
        assert not args.rnn
    if args.rnn:
        assert not args.crf

    if args.classification:
        assert args.trained_classification_models_dir != args.trained_language_models_dir
        assert not args.adaptive
        if args.transfer_learning:
            l_models = os.listdir(args.trained_language_models_dir)
            for l_model in l_models:
                if args.lm_id in l_model:
                    args.language_model_path = os.path.join(
                        args.trained_language_models_dir, l_model)
                    print('Classification, using language model: ',
                          args.language_model_path)
                    print()

    if not args.transfer_learning:
        assert not os.path.exists(args.dict_path)

    print(args)

    if args.lang == 'english':
        stemmer = PorterStemmer()
    elif args.lang == 'estonian':
        stemmer = Lemmatizer('et')
    elif args.lang == 'croatian':
        stemmer = Lemmatizer('hr')
    elif args.lang == 'russian':
        stemmer = Lemmatizer('ru')

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    if not args.classification:
        df_data = file_to_df(os.path.join(args.data_path, args.lm_corpus_file),
                             classification=False)
        df_data = df_data.sample(frac=1, random_state=2019)
        val_idx = int(0.8 * df_data.shape[0])
        test_idx = int(0.9 * df_data.shape[0])
        df_train = df_data[:val_idx]
        df_valid = df_data[val_idx:test_idx]
        df_test = df_data[test_idx:]

        print('------------------------------------------------------------------------------------------------------')
        print('Training language model on all data')
        print("Train size: ", df_train.shape, "Valid size: ", df_valid.shape,
              "Test size: ", df_test.shape)
        print('------------------------------------------------------------------------------------------------------')
        print()

        train_test(df_train, df_valid, df_test, args, stemmer, sp)
    else:
        result_file = open(args.result_path, 'a', encoding='utf8')
        result_file.write("Classification results using language model " + args.lm_id +
                          " and config " + args.output_path + ":\n\n")
        result_file.write("Parameters:\n")
        result_file.write(str(args) +
                          '\n------------------------------------------------\n')

        for folder in args.datasets.split(';'):
            print('------------------------------------------------------------------------------------------------------')
            print('Training on: ', folder)
            print('------------------------------------------------------------------------------------------------------')

            if folder == 'duc' or folder == 'nus':
                # cross validation
                kf = model_selection.KFold(n_splits=10)
                df_data = file_to_df(os.path.join(args.data_path, folder,
                                                  folder + '_test.json'),
                                     classification=True)
                df_data = df_data.sample(frac=1, random_state=2019)
                print()
                print('Cross validation on', folder)

                fold_counter = 0
                total_pred = []
                total_true = []

                for train_index, test_index in kf.split(df_data):
                    fold_counter += 1
                    df_train, df_test = df_data.iloc[train_index], df_data.iloc[test_index]
                    sep_idx = int(df_train.shape[0] / 10)
                    df_valid = df_train[:sep_idx]
                    df_train = df_train[sep_idx:]

                    print("Train fold ", fold_counter, "fold size: ", df_train.shape,
                          "Valid fold size: ", df_valid.shape,
                          "Test fold size: ", df_test.shape)
                    print()

                    fold_pred, fold_true, num_parameters = train_test(
                        df_train, df_valid, df_test, args, stemmer, sp, folder)
                    total_pred.extend(fold_pred)
                    total_true.extend(fold_true)

                print()
                print('--------------------------------------------------------------------')
                print('Final CV results:')
                print()
            else:
                df_train = file_to_df(os.path.join(args.data_path, folder,
                                                   folder + '_valid.json'),
                                      classification=True)
                df_train = df_train.sample(frac=1, random_state=2019)
                val_idx = int(0.8 * df_train.shape[0])
                df_valid = df_train[val_idx:]
                df_train = df_train[:val_idx]
                df_test = file_to_df(os.path.join(args.data_path, folder,
                                                  folder + '_test.json'),
                                     classification=True)

                print("Train size: ", df_train.shape, "Valid size: ", df_valid.shape,
                      "Test size: ", df_test.shape)
                print()

                total_pred, total_true, num_parameters = train_test(
                    df_train, df_valid, df_test, args, stemmer, sp, folder)

            p_5, r_5, f_5, p_10, r_10, f_10, p_k, r_k, f_k, p_M, r_M, f_M = eval(
                total_pred, total_true, lang=args.lang)

            result_file.write("Dataset: " + folder + '\n')
            result_file.write('Precision@5: ' + str(p_5) + ' Recall@5: ' + str(r_5) +
                              ' F1@5: ' + str(f_5) + '\n')
            result_file.write('Precision@10: ' + str(p_10) + ' Recall@10: ' + str(r_10) +
                              ' F1@10: ' + str(f_10) + '\n')
            result_file.write('Precision@k: ' + str(p_k) + ' Recall@k: ' + str(r_k) +
                              ' F1@k: ' + str(f_k) + '\n')
            result_file.write('Precision@M: ' + str(p_M) + ' Recall@M: ' + str(r_M) +
                              ' F1@M: ' + str(f_M) + '\n')
            result_file.write('Num. trainable parameters: ' + str(num_parameters) + '\n')

            outputs = []
            for pred, true in zip(total_pred, total_true):
                pred = ";".join(list(pred))
                true = ";".join(list(true))
                outputs.append((pred, true))

            df_preds = pd.DataFrame(outputs, columns=['Predicted', 'True'])
            df_preds.to_csv('predictions/' + folder + '_' + args.output_path + '.csv',
                            sep=',', encoding='utf8')

        result_file.write("\n-----------------------------------------------------------\n")
        result_file.write("\n-----------------------End of the run----------------------\n")
        result_file.write("\n-----------------------------------------------------------\n")
        result_file.close()
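# Hypothetical invocation sketch for run_model: the parameter values below are
# illustrative only and are not taken from the original configuration.
if __name__ == '__main__':
    run_model(batch_size=8, learning_rate=3e-4, n_ctx=256, n_head=8, n_embd=512,
              n_layer=8, adaptive=False, bpe=False, masked_lm=False,
              classification=True, bpe_model_path='bpe/model.bpe',
              datasets='kp20k', lm_corpus_file='lm_corpus.json',
              transfer_learning=False, pos_tags=False, dict_path='dict.ptb',
              rnn=True, crf=False, lm_id='lm_english', output_path='run_01')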
import scipy
import numpy as np
import pandas as pd
from lemmagen3 import Lemmatizer
from nltk.translate.bleu_score import sentence_bleu as bleu

stemmer = Lemmatizer('hr').lemmatize
#stemmer = LatvianStemmer()


def stem_word_list(word_list):
    return [stemmer(w.strip()) for w in word_list]


def macro_averaged_score(precisionlist, recalllist):
    precision = np.average(precisionlist)
    recall = np.average(recalllist)
    f_score = 0
    if (precision or recall):
        f_score = round((2 * (precision * recall)) / (precision + recall), 4)
    return precision, recall, f_score


def get_match_result(true_seqs, pred_seqs, do_stem=True, type='exact'):
    '''
    If type='exact', returns a list of booleans indicating whether each pred has a matching tgt.
    If type='partial', returns a 2D matrix where each value v_ij is a float in the range [0, 1]
    indicating the (Jaccard) similarity between pred_i and tgt_j.
    '''
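# The body of get_match_result is not shown in this excerpt. A minimal sketch of the
# 'exact' behaviour described by its docstring (hypothetical reconstruction; the real
# implementation also covers the 'partial'/Jaccard case):
def exact_match_result(true_seqs, pred_seqs, do_stem=True):
    if do_stem:
        true_seqs = [" ".join(stem_word_list(t.split())) for t in true_seqs]
        pred_seqs = [" ".join(stem_word_list(p.split())) for p in pred_seqs]
    return [pred in true_seqs for pred in pred_seqs]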
import argparse

from lemmagen3 import Lemmatizer
from nltk.stem.porter import PorterStemmer
import pke

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--datasets', type=str,
                        default='data/croatian/croatian_test.json',
                        help='paths to datasets separated with ;')
    parser.add_argument('--lang', type=str, default='croatian', help='language')
    parser.add_argument('--num_keywords', type=int, default=10,
                        help='Number of keywords')
    args = parser.parse_args()

    if args.lang == 'english':
        stemmer = PorterStemmer()
    elif args.lang == 'estonian':
        stemmer = Lemmatizer('et')
    elif args.lang == 'croatian':
        stemmer = Lemmatizer('hr')

    language = args.lang
    numOfKeywords = args.num_keywords

    input_paths = args.datasets.split(';')

    for input_path in input_paths:
        all_preds = []
        all_true = []
        counter = 0
        num_tokens = 0
        num_kw = 0
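# The evaluation loop above is truncated. For reference, a typical pke baseline call
# looks like the sketch below (TextRank is chosen here only as an illustration; the
# original script may use a different pke extractor and configuration).
import pke

extractor = pke.unsupervised.TextRank()
extractor.load_document(
    input='Keyword extraction is a natural language processing task.',
    language='en')
extractor.candidate_selection()
extractor.candidate_weighting()
print(extractor.get_n_best(n=10))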
def tokenize_doc(self, df, max_length, valid=False):
    if self.lang == 'english':
        stemmer = PorterStemmer()
    elif self.lang == 'estonian':
        stemmer = Lemmatizer('et')
    elif self.lang == 'croatian':
        stemmer = Lemmatizer('hr')

    stemmed_string = ""
    docs = []

    for idx, line in df.iterrows():
        if self.pos:
            words, pos_tags = self.preprocess_line(line, self.pos)
        else:
            words = self.preprocess_line(line, self.pos)

        if self.lang == 'english':
            stems = " ".join([stemmer.stem(w.lower()) for w in words])
        elif self.lang == 'estonian' or self.lang == 'croatian':
            stems = " ".join([stemmer.lemmatize(w.lower()) for w in words])
        stemmed_string += stems + " "

        tokenized_keywords = []
        keywords = line['keyword'].lower()
        keywords = keywords.replace('-', ' ')
        keywords = keywords.replace('/', ' ')
        keywords = keywords.replace('∗', ' ')
        for kw in keywords.split(';'):
            if not self.bpe:
                kw = kw.split()
            else:
                kw = self.sp.tokenize(kw)
            tokenized_keywords.append(kw)

        if self.pos:
            docs.append([words, pos_tags, tokenized_keywords])
        else:
            docs.append([words, tokenized_keywords])

    docs = sorted(docs, key=lambda x: len(x[0]))

    x = torch.zeros([len(docs), max_length], dtype=torch.long)
    y = torch.zeros([len(docs), max_length], dtype=torch.long)
    if self.pos:
        x_pos = torch.zeros([len(docs), max_length], dtype=torch.long)

    all_keywords = {}
    not_in_text = defaultdict(int)
    present_kw = 0
    all_kw = 0
    copies = 0
    max_lkw = 4

    for i, doc in enumerate(docs):
        if self.pos:
            words, pos_tags, kws = doc
        else:
            words, kws = doc
        length = len(words)
        kw_in_paper = []
        stemmed_kw_in_paper = []

        for j, word in enumerate(words):
            if word in self.dictionary.word2idx:
                idx = self.dictionary.word2idx[word]
                for kw in kws:
                    lkw = len(kw)
                    is_keyword = False
                    if j + lkw < length:
                        for k in range(lkw):
                            w = words[j + k]
                            if self.lang == 'english':
                                if stemmer.stem(w.lower()) != stemmer.stem(kw[k].lower()):
                                    break
                            elif self.lang == 'estonian' or self.lang == 'croatian':
                                if stemmer.lemmatize(w.lower()) != stemmer.lemmatize(kw[k].lower()):
                                    break
                        else:
                            is_keyword = True
                    if is_keyword:
                        for k in range(lkw):
                            if j + k < max_length:
                                y[i][j + k] = 3 if k == 0 else 2
                        kw_in_paper.append(" ".join(kw))
                        if self.lang == 'english':
                            stemmed_kw = " ".join([stemmer.stem(w.lower()) for w in kw])
                        elif self.lang == 'estonian' or self.lang == 'croatian':
                            stemmed_kw = " ".join([stemmer.lemmatize(w.lower()) for w in kw])
                        stemmed_kw_in_paper.append(stemmed_kw)
            else:
                idx = self.dictionary.word2idx[unk_token]
            if j < max_length:
                x[i][j] = idx
                if y[i][j] == 0:
                    y[i][j] = 1

        if self.pos:
            for j, pt in enumerate(pos_tags):
                if pt in self.dictionary.word2idx:
                    idx = self.dictionary.word2idx[pt]
                else:
                    idx = self.dictionary.word2idx[unk_token]
                if j < max_length:
                    x_pos[i][j] = idx

        key = x[i].numpy()
        key = "".join([str(idx) for idx in key if idx != 0])

        # remove keywords that don't appear
        num_all_kw = len(kws)
        not_kws = [" ".join(x) for x in kws if " ".join(x) not in kw_in_paper]
        kws = [x for x in kws if " ".join(x) in kw_in_paper]
        for k in not_kws:
            not_in_text[k] += 1
        all_kw += num_all_kw
        present_kw += len(kws)

        if key not in all_keywords:
            all_keywords[key] = kws
        else:
            copies += 1
            #print('TWO identical keys!')
            #print(key)
            #print([self.dictionary.idx2word[idx] for idx in x[i].numpy()])

    print('Num all keywords: ', all_kw)
    print('Percentage of kw. present: ', present_kw / all_kw)
    print('Num identical keys: ', copies)
    l = sorted(not_in_text.items(), key=lambda x: x[1], reverse=True)
    print('Num. keywords that do not appear inside text: ', len(l))
    print('Most common out of text kw: ', l[:100])
    print('Max kw length: ', max_lkw)
    print('X Y size: ', x.size(), y.size())

    if self.pos:
        return x, x_pos, y, all_keywords, stemmed_string
    return x, y, all_keywords, stemmed_string
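# A minimal, self-contained illustration of the labelling scheme implied by the
# assignments above (y[i][j+k] = 3 if k == 0 else 2, with y[i][j] = 1 as the default):
# 0 = padding, 1 = not a keyword, 3 = first token of a keyword, 2 = continuation token.
# The example sentence and keyword are hypothetical.
tokens = ['the', 'neural', 'keyword', 'extraction', 'model']
labels = [1, 3, 2, 2, 1]  # 'neural keyword extraction' marked as a three-token keyword
for tok, lab in zip(tokens, labels):
    print(tok, lab)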
import config, sys
from install.module import *

try:
    # https://pypi.org/project/lemmagen3/
    from lemmagen3 import Lemmatizer
    moduleInstalled = True
except:
    moduleInstalled = False

# Install the essential module if it is absent
if not moduleInstalled:
    installmodule("lemmagen3")

if not "lemmagen3" in sys.modules:
    try:
        from lemmagen3 import Lemmatizer
    except:
        config.mainWindow.displayMessage(
            "This plugin is not enabled.\nRun 'pip3 install {0}' to install essential module first."
            .format("lemmagen3"))

if config.pluginContext:
    lemma = Lemmatizer('en').lemmatize(config.pluginContext)
    config.mainWindow.runTextCommand("SPEAK:::en-gb:::{0}".format(lemma))
    # Search multiple third-party dictionaries
    for thridDict in ("webster",):
        config.mainWindow.runTextCommand(
            "SEARCHTHIRDDICTIONARY:::{0}:::{1}".format(thridDict, lemma))
else:
    config.contextSource.messageNoSelection()
import scipy
from nltk.stem.porter import *
import numpy as np
import pandas as pd
from lemmagen3 import Lemmatizer
import LatvianStemmer
from nltk.translate.bleu_score import sentence_bleu as bleu

stemmer_en = PorterStemmer()
stemmer_et = Lemmatizer('et')
stemmer_hr = Lemmatizer('hr')
stemmer_ru = Lemmatizer('ru')
stemmer_lv = LatvianStemmer


def stem_word_list(word_list, lang):
    if lang == 'english':
        return [stemmer_en.stem(w.strip()) for w in word_list]
    if lang == 'latvian':
        return [stemmer_lv.stem(w.strip()) for w in word_list]
    elif lang == 'estonian':
        return [stemmer_et.lemmatize(w.strip()) for w in word_list]
    elif lang == 'croatian':
        return [stemmer_hr.lemmatize(w.strip()) for w in word_list]
    elif lang == 'russian':
        return [stemmer_ru.lemmatize(w.strip()) for w in word_list]


def macro_averaged_score(precisionlist, recalllist):
    precision = np.average(precisionlist)
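# Example usage of stem_word_list above (illustrative inputs, assuming the modules
# imported in the snippet are installed):
print(stem_word_list(['cats', 'running'], 'english'))   # Porter stems
print(stem_word_list(['mačke', 'kuće'], 'croatian'))    # lemmagen3 lemmas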
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--embeddings', required=True,
                        help="Path to embeddings file.")
    parser.add_argument('-n', '--n', type=int, default=25,
                        help="Search among top n candidates.")
    parser.add_argument('-o', '--output', required=True, help="Output prefix.")
    parser.add_argument('-i', '--input', default="Poklici_enobesedni.csv",
                        help="Analogies .tsv file")
    parser.add_argument('-l', '--lemmatize', action="store_true",
                        help="Lemmatize the embeddings.")
    parser.add_argument('--avginput', action="store_true",
                        help="Take averages of male and female vectors as input "
                             "instead of just the words 'man' and 'woman'.")
    args = parser.parse_args()

    entries = []
    if args.lemmatize:
        nlp = Lemmatizer('sl')
    else:
        nlp = False

    with open(args.input, 'r') as reader:
        reader.readline()
        for line in reader:
            entries.append(Analogy(line, args.n))

    id2word, word2id, embmatrix = load_emb(args.embeddings)
    m_vector = embmatrix[word2id['moški']]   # 'moški' = man
    f_vector = embmatrix[word2id['ženska']]  # 'ženska' = woman

    if args.avginput:
        # additional male/female word pairs used to average the gender vectors
        genderpairs = [('gospod', 'gospa'), ('fant', 'dekle'), ('fant', 'punca'),
                       ('deček', 'deklica'), ('brat', 'sestra'), ('oče', 'mati'),
                       ('sin', 'hči'), ('dedek', 'babica'), ('mož', 'žena'),
                       ('stric', 'teta'), ('on', 'ona')]
        for p in genderpairs:
            m_vector += embmatrix[word2id[p[0]]]
            f_vector += embmatrix[word2id[p[1]]]
        m_vector /= (len(genderpairs) + 1)
        f_vector /= (len(genderpairs) + 1)

    correct_m_input = {1: 0, 5: 0, 10: 0, 20: 0}
    correct_f_input = {1: 0, 5: 0, 10: 0, 20: 0}
    correct_m_input_filtered = {1: 0, 5: 0, 10: 0, 20: 0}
    correct_f_input_filtered = {1: 0, 5: 0, 10: 0, 20: 0}
    m_input_coverage = 0
    f_input_coverage = 0

    with open(args.output + '.1.csv', 'w') as writer1, \
            open(args.output + '.2.csv', 'w') as writer2:
        writer1.write('PoklicM,KandidatŽ,rank_KŽ,cos_similarity\n')
        writer2.write('PoklicŽ,KandidatM,rank_KM,cos_similarity\n')
        for e in entries:
            #print(e.poklicm1, e.poklicf1, e.poklicf2, e.countf1, e.countf2)
            e.m_input(word2id, embmatrix, id2word, args.n, [m_vector, f_vector], nlp)
            e.f_input(word2id, embmatrix, id2word, args.n, [m_vector, f_vector], nlp)
            if e.m_in_f_candidates[0] != 'N/A':
                m_input_coverage += 1
            if e.f_in_m_candidates[0] != 'N/A':
                f_input_coverage += 1

            f_candidates_filtered = e.filter_female()  # list(filter(lambda x: x not in [e.poklicm1, e.poklicm2, 'moški', 'ženska'], e.m_in_f_candidates))
            m_candidates_filtered = e.filter_male()    # list(filter(lambda x: x not in [e.poklicf1, e.poklicf2, 'moški', 'ženska'], e.f_in_m_candidates))

            fcount = 0
            mcount = 0
            for c in f_candidates_filtered[:10]:
                j = e.m_in_f_candidates.index(c)
                writer1.write(e.poklicm1 + ',' + c + ',' + str(j + 1) + ',' +
                              str(e.f_candidates_dist[j]) + '\n')
            for c in m_candidates_filtered[:10]:
                j = e.f_in_m_candidates.index(c)
                writer2.write(e.poklicf1 + ',' + c + ',' + str(j + 1) + ',' +
                              str(e.m_candidates_dist[j]) + '\n')
            #for j in range(args.n):
            #    if e.m_in_f_candidates[j] in f_candidates_filtered and fcount < 10:
            #        writer1.write(e.poklicm1+','+e.m_in_f_candidates[j]+','+str(j+1)+','+str(e.f_candidates_dist[j])+'\n')
            #        fcount += 1
            #    if e.f_in_m_candidates[j] in m_candidates_filtered and mcount < 10:
            #        writer2.write(e.poklicf1+','+e.f_in_m_candidates[j]+','+str(j+1)+','+str(e.m_candidates_dist[j])+'\n')
            #        mcount += 1

            for i in [1, 5, 10, 20]:
                if e.poklicf1 in e.m_in_f_candidates[:i] or e.poklicf2 in e.m_in_f_candidates[:i]:
                    correct_m_input[i] += 1
                if e.poklicm1 in e.f_in_m_candidates[:i] or e.poklicm2 in e.f_in_m_candidates[:i]:
                    correct_f_input[i] += 1
                if e.poklicf1 in f_candidates_filtered[:i] or e.poklicf2 in f_candidates_filtered[:i]:
                    correct_m_input_filtered[i] += 1
                if e.poklicm1 in m_candidates_filtered[:i] or e.poklicm2 in m_candidates_filtered[:i]:
                    correct_f_input_filtered[i] += 1

    with open(args.output + '.inputrank.csv', 'w') as writer:
        writer.write('PoklicVhod,rankPVnaIzhodu\n')
        for e in entries:
            writer.write(','.join(e.Moutputrank) + '\n')
            writer.write(','.join(e.Foutputrank) + '\n')

    with open(args.output + '.condensed.txt', 'w') as writer:
        # the report below is written in Slovene: "coverage = share of occupations
        # that appear in the embeddings"
        writer.write('Coverage (kolikšen delež poklicev se pojavi v embeddingih):\n')
        writer.write('moški poklici: ' + str(m_input_coverage / len(entries)) + '\n')
        writer.write('ženski poklici: ' + str(f_input_coverage / len(entries)) + '\n')
        # "all = accuracy over all occupations (a missing occupation counts as incorrect),
        #  covered = accuracy only over occupations that appear in the embeddings"
        writer.write('\r\nAnalogy accuracy, all = uspešnost med vsemi, če se poklic ne '
                     'pojavi se šteje za nepravilno določen, covered = samo med tistimi, '
                     'ki se pojavijo, če se poklic ne pojavi, se ne upošteva.\n')
        writer.write('m input, f output: (all / covered)\n')
        for i in [1, 5, 10, 20]:
            printstring = 'acc@' + str(i) + ' = ' + str(correct_m_input[i] / len(entries)) + \
                          ' / ' + str(correct_m_input[i] / m_input_coverage)
            writer.write(printstring + '\n')
        writer.write('\nf input, m output: (all / covered)\n')
        for i in [1, 5, 10, 20]:
            printstring = 'acc@' + str(i) + ' = ' + str(correct_f_input[i] / len(entries)) + \
                          ' / ' + str(correct_f_input[i] / f_input_coverage)
            writer.write(printstring + '\n')
        writer.write('\nm input, f output, filtered: (all / covered)\n')
        for i in [1, 5, 10, 20]:
            printstring = 'acc@' + str(i) + ' = ' + str(correct_m_input_filtered[i] / len(entries)) + \
                          ' / ' + str(correct_m_input_filtered[i] / m_input_coverage)
            writer.write(printstring + '\n')
        writer.write('\nf input, m output, filtered: (all / covered)\n')
        for i in [1, 5, 10, 20]:
            printstring = 'acc@' + str(i) + ' = ' + str(correct_f_input_filtered[i] / len(entries)) + \
                          ' / ' + str(correct_f_input_filtered[i] / f_input_coverage)
            writer.write(printstring + '\n')
from lemmagen3 import Lemmatizer

print(Lemmatizer.list_supported_languages())

a = Lemmatizer('en')
word = 'cats'
print('{}->{}'.format(word, a.lemmatize(word)))

b = Lemmatizer('sl')
word = 'ljudje'
print('{}->{}'.format(word, b.lemmatize(word)))
from lemmagen3 import Lemmatizer

# first, list all supported languages
print(Lemmatizer.list_supported_languages())

# then, create a few lemmatizer objects using ISO 639-1 language codes
# (English, Slovene and Russian)
lem_en = Lemmatizer('en')
lem_sl = Lemmatizer('sl')
lem_ru = Lemmatizer('ru')

# now lemmatize a word in each of the three languages
print(lem_en.lemmatize('cats'))
print(lem_sl.lemmatize('je'))
print(lem_ru.lemmatize('коты'))