def process_frequency_list():
    freq_dict = {}
    print("Töötlen sagedusloendit")
    with open("freq_list_raw.txt", "r", encoding="UTF-8-SIG") as file:
        count = 0
        for line in file:
            # skip entries with unknown analyses
            if "?" in line:
                continue
            count += 1
            print(count)
            parts = line.strip().split(" ")
            freq = int(parts[0])
            lemma = parts[1]
            word_analysis = estnltk.Text(lemma).tag_analysis()
            synsets = wn.synsets(lemma)
            if is_tag_allowed(word_analysis) and len(synsets) > 0 and has_relations(synsets):
                freq_dict[lemma] = freq
    print("Koostan töödeldud sagedusloendit")
    with open("freq_list_processed.txt", "w", encoding="UTF-8") as file:
        for word in sorted(freq_dict, key=freq_dict.get, reverse=True):
            file.write(word + " " + str(freq_dict[word]) + "\n")
    freq_dict.clear()
    print("Sagedusloend töödeldud")
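# The filter predicates is_tag_allowed and has_relations are used throughout
# these list-processing functions but are not defined in this section. Below
# is a minimal sketch of what they could look like, assuming estnltk 1.x
# morphological analysis output; ALLOWED_POS_TAGS and both bodies are
# assumptions, not the author's actual implementation.
ALLOWED_POS_TAGS = {"S", "A", "V", "D"}  # assumed: nouns, adjectives, verbs, adverbs

def is_tag_allowed(word_analysis):
    # tag_analysis() returns the Text itself; each word carries an
    # "analysis" list of readings with a "partofspeech" field
    for word in word_analysis["words"]:
        for analysis in word["analysis"]:
            if analysis["partofspeech"] in ALLOWED_POS_TAGS:
                return True
    return False

def has_relations(synsets):
    # keep only words whose synsets take part in at least one
    # hypernymy/hyponymy relation, so they are usable later
    return any(len(s.hypernyms()) > 0 or len(s.hyponyms()) > 0 for s in synsets)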
def get_wordnet_distractors(keyword, sent):
    text = Text(sent)
    word_lower = Text(keyword).lower().text
    dataframe = text.get.word_texts.lemmas.postags.postag_descriptions.as_dict
    word_index = dataframe["word_texts"].index(word_lower)
    pos_tag = dataframe["postags"][word_index]
    pos_final = pos_mapping[pos_tag]
    print("pos ", pos_final, ", Estonian tag: ", pos_tag)
    if pos_final is not None and pos_final != "NUM":
        sets = wn.synsets(word_lower, pos=pos_final)
        print("initial synsets ", sets)
        if len(sets) == 0:
            # the surface form gave no synsets; retry with the lemma
            lemma = dataframe["lemmas"][word_index]
            lemma_lower = Text(lemma).lower().text
            sets = wn.synsets(lemma_lower, pos=pos_final)
            print("modified synsets ", sets)
    else:
        sets = wn.synsets(word_lower)
    distractors = []
    for syn in sets[:1]:
        # co-hyponyms of the keyword (siblings under the same hypernym)
        # make plausible distractors; compare the plain lemma part so
        # duplicates are actually caught
        for hypernym in syn.hypernyms():
            for hyponym in hypernym.hyponyms():
                name = hyponym.name.split(".")[0]
                if name not in distractors:
                    distractors.append(name)
    if word_lower in distractors:
        distractors.remove(word_lower)
    return distractors
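# pos_mapping above maps estnltk morphological tags to estnltk WordNet POS
# constants, but it is not shown in this section. A plausible sketch, with
# assumed tag names from the estnltk 1.x analyser; the "NUM" entries match
# the explicit pos_final != "NUM" guard, and the defaultdict makes unmapped
# tags yield None, matching the `is not None` guard. This is an assumption,
# not the author's actual mapping.
from collections import defaultdict

pos_mapping = defaultdict(lambda: None)
pos_mapping.update({
    "S": wn.NOUN,   # noun
    "V": wn.VERB,   # verb
    "A": wn.ADJ,    # adjective
    "D": wn.ADV,    # adverb
    "N": "NUM",     # cardinal numeral, skipped by the guard above
    "O": "NUM",     # ordinal numeral, skipped by the guard above
})

# Usage sketch (the sentence is an illustrative example; the distractors
# returned depend on the Estonian WordNet version installed with estnltk):
distractors = get_wordnet_distractors("koer", "Koer jookseb pargis.")
print(distractors)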
def process_foreign_list():
    basic = dictionaries.get_basic_list("basic_processed.txt")
    foreign_words = {}
    count = 0
    print("Töötlen selgitustega võõrsõnade loendit")
    with open("foreign_meaning.txt", "r", encoding="UTF-8") as file:
        for line in file:
            print(count)
            count += 1
            parts = line.strip().split("\t")
            word = parts[0]
            if len(parts) > 1:
                # keep only the definition text before any parenthesised note
                definition = parts[1].split("(")[0]
                foreign_words[word] = definition
            else:
                foreign_words[word] = None
    print("Töötlen märksõnadega võõrsõnade loendit")
    with open("foreign_keywords.txt", "r", encoding="UTF-8") as file:
        count = 0
        for word in file:
            print(count)
            count += 1
            word = word.strip()
            # skip prefix/suffix entries such as "anti-" or "-loogia"
            if word[0] == "-" or word[-1] == "-":
                continue
            word_analysis = estnltk.Text(word).tag_analysis()
            synsets = wn.synsets(word)
            # filter out all unsuitable words
            if is_tag_allowed(word_analysis) and word not in basic and word not in foreign_words and has_relations(synsets):
                foreign_words[word] = None
    print("Koostan töödeldud võõrsõnade loendit")
    with open("foreign_processed.txt", "w", encoding="UTF-8") as file:
        for word in foreign_words:
            if len(word.split(" ")) <= 1:
                if foreign_words[word] is None:
                    file.write(word + "\n")
                else:
                    file.write(word + " " + foreign_words[word] + "\n")
    foreign_words.clear()
    print("Võõrsõnade loend töödeldud")
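# dictionaries.get_basic_list is imported from elsewhere in the project and
# not shown here. Assumed behaviour: it reads the processed basic-vocabulary
# file (one word per line, as written by process_basic_list below) back into
# a set, which keeps the `word not in basic` membership test fast.
def get_basic_list(path):
    with open(path, "r", encoding="UTF-8") as file:
        return {line.strip() for line in file}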
def process_basic_list():
    basic_list = []
    print("Töötlen põhisõnavara loendit")
    count = 0
    with open("basic_raw.txt", "r", encoding="UTF-8") as file:
        for line in file:
            print(count)
            count += 1
            word = line.strip()
            word_analysis = estnltk.Text(word).tag_analysis()
            synsets = wn.synsets(word)
            if is_tag_allowed(word_analysis) and len(synsets) > 0 and has_relations(synsets):
                basic_list.append(word)
    print("Koostan töödeldud põhisõnavara loendit")
    with open("basic_processed.txt", "w", encoding="UTF-8") as file:
        for word in basic_list:
            file.write(word + "\n")
    basic_list.clear()
    print("Põhisõnavara loend töödeldud")
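# Assumed call order for the three list-processing steps above:
# process_basic_list must run first, because process_foreign_list filters
# its keywords against basic_processed.txt; the frequency list is
# independent of the other two.
if __name__ == "__main__":
    process_basic_list()
    process_foreign_list()
    process_frequency_list()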
def hyper(text):
    global sent_count, hp_count
    WN_POS = {u'A', u'S', u'V', u'D'}
    words = text.words
    pos = text.postags
    lemmas_ = text.lemmas
    lemmas = []
    for lemma in lemmas_:
        # keep only the first reading of an ambiguous lemma ("a|b" -> "a")
        if "|" in lemma:
            lemma = lemma[:lemma.index("|")]
        lemmas.append(lemma)
    lemma_pos = zip(lemmas, pos)
    pos2lemmas = defaultdict(set)
    for lemma, pos in lemma_pos:
        if pos in WN_POS:
            pos2lemmas[pos].add(lemma)
    pos2pairs = dict()
    for pos in pos2lemmas:
        if len(pos2lemmas[pos]) > 1:
            combs = combinations(pos2lemmas[pos], 2)
            pos2pairs[pos] = [comb for comb in combs]
    pos2pairs2 = {}
    for pos in pos2pairs:
        pairs_for_pos = []
        for lemma1, lemma2 in pos2pairs[pos]:
            synsets1 = wn.synsets(lemma1)
            synsets2 = wn.synsets(lemma2)
            pairs_for_pos.append([x for x in itertools.product(synsets1, synsets2)])
        pos2pairs2[pos] = pairs_for_pos
    # takes the pair dictionary and returns three hypernym levels upward for both synsets
    syn_hyper = hyper_level3(pos2pairs2)
    sent_count += 1
    for key in pos2pairs:
        for idx in range(len(pos2pairs2[key])):
            value = pos2pairs2[key][idx]
            try:
                for syn1, syn2 in value:
                    if syn1 in syn_hyper[syn2]:
                        obj = {}
                        hp_count += 1
                        print("HP", syn_hyper[syn2].index(syn1) + 1)
                        print(syn_hyper[syn2])
                        lemma1, lemma2 = pos2pairs[key][idx]
                        print("WORDS", words[lemmas.index(lemma1)], words[lemmas.index(lemma2)])
                        print("LEMMAS", lemma1, lemma2)
                        print(text)
                        obj["type"] = "HP" + str(syn_hyper[syn2].index(syn1) + 1)
                        obj["lemmas"] = lemma1, lemma2
                        obj["start"] = words[lemmas.index(lemma1)]["start"], words[lemmas.index(lemma2)]["start"]
                        obj["end"] = words[lemmas.index(lemma1)]["end"], words[lemmas.index(lemma2)]["end"]
                        return obj
                    elif syn2 in syn_hyper[syn1]:
                        obj = {}
                        hp_count += 1
                        print("HP", syn_hyper[syn1].index(syn2) + 1)
                        print(syn_hyper[syn1])
                        lemma1, lemma2 = pos2pairs[key][idx]
                        print("WORDS", words[lemmas.index(lemma1)], words[lemmas.index(lemma2)])
                        print("LEMMAS", lemma2, lemma1)
                        print(text)
                        obj["type"] = "HP" + str(syn_hyper[syn1].index(syn2) + 1)
                        obj["lemmas"] = lemma2, lemma1
                        obj["start"] = words[lemmas.index(lemma1)]["start"], words[lemmas.index(lemma2)]["start"]
                        obj["end"] = words[lemmas.index(lemma1)]["end"], words[lemmas.index(lemma2)]["end"]
                        return obj
            except IndexError:
                continue
    print(sent_count)
    print("HP protsent", (hp_count / sent_count) * 100)
    return None
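# hyper_level3 is not defined in this section. A sketch of its assumed
# behaviour, inferred from how hyper() uses its result: for every synset in
# the candidate pairs it collects hypernyms up to three levels upward, so
# that syn_hyper[s] is an ordered list whose index roughly encodes the
# distance (HP1..HP3). The body below is an assumption, not the author's code.
from collections import defaultdict

def hyper_level3(pos2pairs2):
    syn_hyper = defaultdict(list)
    seen = set()
    for pos in pos2pairs2:
        for pair_list in pos2pairs2[pos]:
            for pair in pair_list:
                for syn in pair:
                    if syn in seen:
                        continue
                    seen.add(syn)
                    level = syn.hypernyms()
                    for _ in range(3):  # up to three levels upward
                        syn_hyper[syn].extend(level)
                        level = [h for s in level for h in s.hypernyms()]
    return syn_hyper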
from estnltk.wordnet import wn
from pprint import pprint

print(len(wn.all_synsets()))
pprint(wn.synsets("koer", pos=wn.VERB))
pprint(wn.synsets('koer'))

synset = wn.synset("king.n.01")
pprint(synset.name)
pprint(synset.pos)
pprint(synset.definition())
pprint(synset.examples())
pprint(synset.hypernyms())
pprint(synset.hyponyms())
pprint(synset.meronyms())
pprint(synset.holonyms())
pprint(synset.get_related_synsets('fuzzynym'))

target_synset = wn.synset('kinnas.n.01')
pprint(synset.path_similarity(target_synset))
pprint(synset.lch_similarity(target_synset))
pprint(synset.wup_similarity(target_synset))
pprint(synset.lowest_common_hypernyms(target_synset))
# Check whether the word is a foreign word and whether it has a native-word equivalent
if is_foreign(word[LEMMA]) and foreign_dict[word[LEMMA]] is not None:
    # Add the lemma to the result; the frequency lower bound is 0, because this is a reliable replacement
    add_lemma_to_result(foreign_dict[word[LEMMA]], replacement_list, similarity=0)
# Part of speech
tag = word[POSTAG]
# Check whether the part of speech is suitable
if tag in wn_pos:
    # Check whether the word's lemma needs simplification
    if needs_replacing(word[LEMMA]):
        # Find the set of full synonyms
        syn_sets = wn.synsets(word[LEMMA], pos=wn_pos[tag])
        # If there are no full synonyms, there is nothing to analyse either
        if len(syn_sets) > 0:
            # Order the synsets by similarity
            ordered_synsets = get_best_syn_set_from_prev_and_next(prev_word, next_word, syn_sets)
            # Go through all the synsets in the ordered list
            for syn_set in ordered_synsets:
                # Check whether a suitable replacement has already been found; if not, try to find one
                if needs_further_simplification(replacement_list):
                    # Look for a simpler lemma
                    find_replacement(syn_set, replacement_list)
                # Once a suitable word has been found, stop analysing
                else:
                    break
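# needs_replacing is one of several helpers (is_foreign, add_lemma_to_result,
# get_best_syn_set_from_prev_and_next, needs_further_simplification,
# find_replacement) defined elsewhere in the project. A minimal sketch of
# needs_replacing, assuming basic_words is the set loaded with get_basic_list
# and freq_dict is the processed frequency list built earlier: a lemma needs
# simplification when it is neither basic vocabulary nor frequent enough.
# FREQUENCY_THRESHOLD is an assumed cut-off, not taken from the source.
FREQUENCY_THRESHOLD = 100

def needs_replacing(lemma):
    return lemma not in basic_words and freq_dict.get(lemma, 0) < FREQUENCY_THRESHOLD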