def semantic_analysis():
    wikiwordnet = WikiWordnet()
    text = calculated_text.get(1.0, END)
    text = text.replace('\n', '')
    if text == '':
        return None
    start = time.time()
    if letters_in_the_word(text):
        hyponyms = []
        # A synonym ring (synset) is a group of data elements that are treated as
        # semantically equivalent for information-retrieval purposes.
        synsets2 = wikiwordnet.get_synsets(text)
        text = ''
        lemmas2 = [x.lemma() for x in synsets2[0].get_words()]
        # A synset groups lemmas that share the same meaning; a lemma is a single word form.
        for lemma in lemmas2:
            text += lemma + ' '
        synset2 = synsets2[0]
        for hypernym in wikiwordnet.get_hypernyms(synset2):
            for w in hypernym.get_words():
                text += w.lemma() + ' '
        for hyponym in wikiwordnet.get_hyponyms(synset2):
            for w in hyponym.get_words():
                text += w.lemma() + ' '
        word_cloud(text)
        end = time.time()
        print("Total time: {:.1f}".format(end - start))
    else:
        messagebox.showwarning('Warning!!!', 'One word!', type='ok')
def semantic_analysis(text):
    wikiwordnet = WikiWordnet()
    tree_text = re.sub('-', ',', text)
    if tree_text == '':
        return None
    if letters_in_the_word(tree_text):
        hyponyms = []
        # A synonym ring (synset) is a group of data elements that are treated as
        # semantically equivalent for information-retrieval purposes.
        synsets2 = wikiwordnet.get_synsets(tree_text)
        text = ''
        lemmas2 = [x.lemma() for x in synsets2[0].get_words()]
        # A synset groups lemmas that share the same meaning; a lemma is a single word form.
        for lemma in lemmas2:
            text += lemma + ' '
        synset2 = synsets2[0]
        for hypernym in wikiwordnet.get_hypernyms(synset2):
            for w in hypernym.get_words():
                text += w.lemma() + ' '
        for hyponym in wikiwordnet.get_hyponyms(synset2):
            for w in hyponym.get_words():
                text += w.lemma() + ' '
        return word_cloud(text)
def view_window():
    text = calculated_text.get(1.0, END)
    text = text.replace('\n', '')
    if text != '':
        check = check_word(text)
        if not check:
            messagebox.showwarning('!', 'Необходимо ввести одно слово', type='ok')  # "You need to enter a single word"
        else:
            wiki_wordnet = WikiWordnet()
            syn = wiki_wordnet.get_synsets(text.lower())
            text = ''
            for l in syn[0].get_words():
                text += l.lemma() + ' '
            for i in wiki_wordnet.get_hyponyms(syn[0]):
                for hyponym in i.get_words():
                    text += hyponym.lemma() + ' '
            for j in wiki_wordnet.get_hypernyms(syn[0]):
                for hypernym in j.get_words():
                    text += hypernym.lemma() + ' '
            wordcloud = WordCloud(relative_scaling=1.0).generate(text)
            plt.imshow(wordcloud)
            plt.axis("off")
            plt.show()
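All three helpers above (both semantic_analysis variants and view_window) drive the same handful of wiki_ru_wordnet calls: get_synsets on a single word, get_words/lemma on the first synset, and get_hypernyms/get_hyponyms around it. A minimal standalone sketch of that call sequence; the query word «дом» is purely illustrative:

from wiki_ru_wordnet import WikiWordnet

wikiwordnet = WikiWordnet()
synsets = wikiwordnet.get_synsets('дом')  # illustrative query word
if synsets:
    first = synsets[0]
    synonyms = [w.lemma() for w in first.get_words()]
    hypernyms = [w.lemma() for h in wikiwordnet.get_hypernyms(first) for w in h.get_words()]
    hyponyms = [w.lemma() for h in wikiwordnet.get_hyponyms(first) for w in h.get_words()]
    # This space-joined string is what the word-cloud helpers above feed to WordCloud.
    print(' '.join(synonyms + hypernyms + hyponyms))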
def __init__(self):
    # Elasticsearch client instance
    self.elastic_search = Elasticsearch()
    # WikiWordnet instance (off-the-shelf Russian wordnet/synonym library)
    self.wiki_wordnet = WikiWordnet()
    # Path to the file with the parsed data
    self.json_path = 'C:\\Users\\ledya\\PycharmProjects\\ir-vzsar\\vzsar\\output.json'
    # Name of the search index
    self.index_name = 'news'
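The constructor above only sets up the clients. As a hedged illustration of how such an indexer might push the parsed documents into the 'news' index, here is a sketch assuming elasticsearch-py 7.x and that output.json holds a JSON array of documents (neither detail is stated in the source):

import json
from elasticsearch import Elasticsearch

es = Elasticsearch()
# Assumption: output.json is a JSON array of dict-like documents.
with open('output.json', encoding='utf-8') as f:
    documents = json.load(f)
for doc in documents:
    es.index(index='news', body=doc)  # 7.x-style body= call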
def CreateHypernymCorpus(self, language="ru", verbose=False):
    if language == "ru":
        wikiwordnet = WikiWordnet()
        self.fulldata_words['hypernym'] = self.fulldata_words['synset'][0].apply(
            lambda w: wikiwordnet.get_hypernyms(w))
    elif language == "en":
        raise NotImplementedError
    else:
        raise NotImplementedError
    return self.fulldata_words
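A hedged sketch of the column-wise hypernym lookup this method performs, assuming fulldata_words['synset'] holds one wiki_ru_wordnet synset object per row; the seed word and the one-row frame are illustrative only:

import pandas as pd
from wiki_ru_wordnet import WikiWordnet

wikiwordnet = WikiWordnet()
# Assumption: one synset object per row, here seeded from an illustrative word.
df = pd.DataFrame({'synset': wikiwordnet.get_synsets('дом')[:1]})
df['hypernym'] = df['synset'].apply(lambda s: wikiwordnet.get_hypernyms(s))
print(df['hypernym'].iloc[0])  # set of hypernym synsets for the first row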
def semantic_parse(text):
    wiki_wordnet = WikiWordnet()
    syn = wiki_wordnet.get_synsets(text.lower())
    if syn:
        synonyms = [l.lemma() for l in syn[0].get_words()]
        hyponyms = [
            hyponym.lemma()
            for i in wiki_wordnet.get_hyponyms(syn[0])
            for hyponym in i.get_words()
        ]
        hypernyms = [
            hypernym.lemma()
            for j in wiki_wordnet.get_hypernyms(syn[0])
            for hypernym in j.get_words()
        ]
    else:
        synonyms = hyponyms = hypernyms = []
    return {'synonyms': synonyms, 'hyponyms': hyponyms, 'hypernyms': hypernyms}
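A quick usage sketch for semantic_parse; the query word is illustrative and the returned lists depend entirely on the installed wiki_ru_wordnet data:

result = semantic_parse('кошка')  # illustrative query word
print(result['synonyms'])
print(result['hyponyms'])
print(result['hypernyms'])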
def get_synsets_dict(texts, return_graph=True):
    wikiwordnet = WikiWordnet()
    G = nx.Graph()
    for text in texts:
        s = set()
        synsets = wikiwordnet.get_synsets(text)
        for synset in synsets:
            for w in synset.get_words():
                if (text != w.lemma()) & (w.lemma() in texts.values):
                    G.add_edge(text, w.lemma())
    d = {}
    for g in nx.connected_components(G):
        default_lemma = g.pop()
        d.update(dict.fromkeys(g, default_lemma))
        d[default_lemma] = default_lemma
    if return_graph:
        return d, G
    else:
        return d
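get_synsets_dict iterates texts and checks membership via texts.values, so it expects a pandas Series of lemmas; it returns a mapping that collapses each connected group of synonyms onto one representative lemma, plus the underlying networkx graph. A minimal sketch with illustrative lemmas, assuming pandas is available:

import pandas as pd

texts = pd.Series(['дом', 'здание', 'жилище'])  # illustrative lemmas
mapping, graph = get_synsets_dict(texts)
print(mapping)         # each lemma mapped to its group's representative lemma
print(graph.edges())   # synonym edges found among the input lemmas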
# -*- coding: utf-8 -*-
import pymorphy2
import nltk
import copy
import collections
import math
from wiki_ru_wordnet import WikiWordnet

wikiwordnet = WikiWordnet()
morph = pymorphy2.MorphAnalyzer()


################
def synonymscheck(inlist, keylist):
    corrans = 0  # correct answers
    for i in inlist:
        synsets = wikiwordnet.get_synsets(i)
        for n in wikiwordnet.get_synsets(keylist[inlist.index(i)]):
            if n in synsets:
                corrans += 1
                print(corrans, '/3')
            else:
                pass


###############
def getlemma(inlist):
    corrans = 0  # correct answers
    for i in inlist:
        synsets = wikiwordnet.get_synsets(i)
        for synset in wikiwordnet.get_synsets(i):
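synonymscheck appears to compare a list of answers against a parallel key of three expected synonyms, counting pairs whose wiki_ru_wordnet synsets overlap. A hedged call sketch; both word lists are illustrative:

answers = ['дом', 'машина', 'еда']        # illustrative answers
key = ['здание', 'автомобиль', 'пища']    # illustrative expected synonyms
synonymscheck(answers, key)               # prints the running match count out of 3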
def __call__(self):
    root = etree.parse(
        r'C:\Users\kiva0319\PycharmProjects\Diploma2020\processed\paraphrases.xml')
    root = root.getroot()
    corpus = etree.SubElement(root, "corpus")

    result_xml = etree.Element('raw_data')
    result_doc = etree.ElementTree(result_xml)
    corpus_info = etree.SubElement(result_xml, 'head')
    etree.SubElement(corpus_info, 'description').text = "—"
    etree.SubElement(corpus_info, 'date').text = str(date.today())
    articles_list = etree.SubElement(result_xml, 'corpus')

    count = 0
    for element in root[1]:
        id = element[0].text
        old_id = element[1].text
        id_1 = element[2].text
        id_2 = element[3].text
        title_1 = element[4].text
        title_2 = element[5].text
        text_1 = element[6].text
        text_2 = element[7].text
        words_title_1 = int(element[8].text)
        words_title_2 = int(element[9].text)
        words_article_1 = int(element[10].text)
        words_article_2 = int(element[11].text)
        num_of_paragraphs_1 = int(element[12].text)
        num_of_paragraphs_2 = int(element[13].text)
        element_paragraphs_1 = element[14].text
        element_paragraphs_2 = element[15].text
        jaccard = element[16].text
        clas = element[17].text
        print(count, id, flush=True)

        # words_max = max(words_max, words_article_1)
        # words_max = max(words_max, words_article_2)
        # chars_max = max(chars_max, len(text_1))
        # chars_max = max(chars_max, len(text_2))
        # continue

        paraphrase = etree.SubElement(articles_list, 'paraphrase')
        etree.SubElement(paraphrase, 'value', name="id").text = id
        etree.SubElement(paraphrase, 'value', name="old_id").text = old_id
        etree.SubElement(paraphrase, 'value', name="id_1").text = id_1
        etree.SubElement(paraphrase, 'value', name="id_2").text = id_2
        etree.SubElement(paraphrase, 'value', name="title_1").text = title_1
        etree.SubElement(paraphrase, 'value', name="title_2").text = title_2
        etree.SubElement(paraphrase, 'value', name="jaccard").text = jaccard
        etree.SubElement(paraphrase, 'value', name="class").text = clas

        # words and paragraphs diff
        etree.SubElement(paraphrase, 'words_title_diff').text = str(
            abs(words_title_1 - words_title_2))
        etree.SubElement(paraphrase, 'words_article_diff').text = str(
            abs(words_article_1 - words_article_2))
        etree.SubElement(paraphrase, 'paragraphs_diff').text = str(
            abs(num_of_paragraphs_1 - num_of_paragraphs_2))

        # flesch_reading_ease
        textstat.textstat.set_lang("ru")
        etree.SubElement(paraphrase, 'flesch_reading_ease_title_1').text = str(
            textstat.flesch_reading_ease(" ".join(title_1.split(";"))))
        etree.SubElement(paraphrase, 'flesch_reading_ease__title_2').text = str(
            textstat.flesch_reading_ease(" ".join(title_2.split(";"))))
        etree.SubElement(paraphrase, 'flesch_reading_ease_article_1').text = str(
            textstat.flesch_reading_ease(" ".join(text_1.split(";"))) / num_of_paragraphs_1)
        etree.SubElement(paraphrase, 'flesch_reading_ease_article_2').text = str(
            textstat.flesch_reading_ease(" ".join(text_2.split(";"))) / num_of_paragraphs_2)

        # BLEU (the element names below keep the original "BLUE" spelling)
        weights1 = (1, 0, 0, 0)
        weights2 = (0.5, 0.5, 0, 0)
        weights3 = (0.33, 0.33, 0.33, 0)
        weights4 = (0.25, 0.25, 0.25, 0.25)
        list_title_1 = title_1.split(";")
        list_title_2 = title_2.split(";")
        list_text_1 = text_1.split(";")
        list_text_2 = text_2.split(";")
        etree.SubElement(paraphrase, 'BLUE_w1_titles').text = str(
            sentence_bleu([list_title_1], list_title_2, weights=weights1))
        etree.SubElement(paraphrase, 'BLUE_w2_titles').text = str(
            sentence_bleu([list_title_1], list_title_2, weights=weights2))
        etree.SubElement(paraphrase, 'BLUE_w3_titles').text = str(
            sentence_bleu([list_title_1], list_title_2, weights=weights3))
        etree.SubElement(paraphrase, 'BLUE_w4_titles').text = str(
            sentence_bleu([list_title_1], list_title_2, weights=weights4))
        etree.SubElement(paraphrase, 'BLUE_w1_articles').text = str(
            sentence_bleu([list_text_1], list_text_2, weights=weights1))
        etree.SubElement(paraphrase, 'BLUE_w2_articles').text = str(
            sentence_bleu([list_text_1], list_text_2, weights=weights2))
        etree.SubElement(paraphrase, 'BLUE_w3_articles').text = str(
            sentence_bleu([list_text_1], list_text_2, weights=weights3))
        etree.SubElement(paraphrase, 'BLUE_w4_articles').text = str(
            sentence_bleu([list_text_1], list_text_2, weights=weights4))

        # NIST
        nist_1_titles = 0
        nist_1_articles = 0
        nist_2_titles = 0
        nist_2_articles = 0
        nist_3_titles = 0
        nist_3_articles = 0
        try:
            nist_1_titles = sentence_nist([list_title_1], list_title_2, n=1)
        except ZeroDivisionError:
            print("ZeroDivisionError id =", count)
        try:
            nist_1_articles = sentence_nist([list_text_1], list_text_2, n=1)
        except ZeroDivisionError:
            print("ZeroDivisionError id =", count)
        try:
            nist_2_titles = sentence_nist([list_title_1], list_title_2, n=2)
        except ZeroDivisionError:
            print("ZeroDivisionError id =", count)
        try:
            nist_2_articles = sentence_nist([list_text_1], list_text_2, n=2)
        except ZeroDivisionError:
            print("ZeroDivisionError id =", count)
        try:
            nist_3_titles = sentence_nist([list_title_1], list_title_2, n=3)
        except ZeroDivisionError:
            print("ZeroDivisionError id =", count)
        try:
            nist_3_articles = sentence_nist([list_text_1], list_text_2, n=3)
        except ZeroDivisionError:
            print("ZeroDivisionError id =", count)
        etree.SubElement(paraphrase, 'nist_1_titles').text = str(nist_1_titles)
        etree.SubElement(paraphrase, 'nist_1_articles').text = str(nist_1_articles)
        etree.SubElement(paraphrase, 'nist_2_titles').text = str(nist_2_titles)
        etree.SubElement(paraphrase, 'nist_2_articles').text = str(nist_2_articles)
        etree.SubElement(paraphrase, 'nist_3_titles').text = str(nist_3_titles)
        etree.SubElement(paraphrase, 'nist_3_articles').text = str(nist_3_articles)
        etree.SubElement(paraphrase, 'nist_1_diff').text = str(nist_1_titles - nist_1_articles)
        etree.SubElement(paraphrase, 'nist_2_diff').text = str(nist_2_titles - nist_2_articles)
        etree.SubElement(paraphrase, 'nist_3_diff').text = str(nist_3_titles - nist_3_articles)

        # ROUGE
        title_1_space = title_1.replace(";", " ")
        title_2_space = title_2.replace(";", " ")
        text_1_space = text_1.replace(";", " ")
        text_2_space = text_2.replace(";", " ")
        rouge = Rouge()
        title_score = rouge.get_scores(title_1_space, title_2_space)[0]
        article_score = rouge.get_scores(text_1_space, text_2_space)[0]
        etree.SubElement(paraphrase, 'rouge-1_titles').text = str(title_score['rouge-1']['f'])
        etree.SubElement(paraphrase, 'rouge-2_titles').text = str(title_score['rouge-2']['f'])
        etree.SubElement(paraphrase, 'rouge-L_titles').text = str(title_score['rouge-l']['f'])
        etree.SubElement(paraphrase, 'rouge-1_articles').text = str(article_score['rouge-1']['f'])
        etree.SubElement(paraphrase, 'rouge-2_articles').text = str(article_score['rouge-2']['f'])
        etree.SubElement(paraphrase, 'rouge-L_articles').text = str(article_score['rouge-l']['f'])

        # METEOR
        stemmer = SnowballStemmer("russian")
        wikiwordnet = WikiWordnet()
        etree.SubElement(paraphrase, 'meteor_title').text = str(
            meteor_score([title_1_space], title_2_space, stemmer=stemmer, wordnet=wikiwordnet))
        etree.SubElement(paraphrase, 'meteor_article').text = str(
            meteor_score([text_1_space], text_2_space, stemmer=stemmer, wordnet=wikiwordnet))

        count += 1

    outFile = open("processed/metrics.xml", 'wb')
    result_doc.write(outFile, xml_declaration=True, encoding='utf-8', pretty_print=True)
"""
CLASSIFIER
Created on Thu Jan 21 17:16:32 2021

@author: Katarina
"""
import nltk
import nltk.classify
import re
from pymystem3 import Mystem
import random
from wiki_ru_wordnet import WikiWordnet

mystem = Mystem()
wwnet = WikiWordnet()

ALL_FEATURES = {}


def load_all_data():
    # Returns a tuple (trainingdata, testdata); each element is a list of (tweet, label) tuples.
    training_data = []
    test_data = []
    n_train = 0
    n_total = 0
    with open('imperfective_masked_cleaned_masterfile.txt', encoding='utf-8') as file:
        lines = ""
        for line in file:
            if re.search('@[a-zA-Z0-9_]+', line) is not None:
def __init__(self, prefix_trie=None):
    self.wikiwordnet = WikiWordnet()
    # TODO: move this into the bot later and initialize it before startup, like the other models
    self.prefix_trie = get_prefix_trie()
def eval_rus():
    simlex = read_files.read_simlex_rus_file()
    wordsim = read_files.read_wordsim_rus_file()
    wn = WikiWordnet()

    """
    # evaluation of wordnet
    matches = 0
    for key, value in wordsim.items():
        synset = wn.get_synsets(key)
        for syn in synset:
            for w in syn.get_words():
                word = w.lemma()
                if value == word:
                    matches = matches + 1
    """

    # evaluation of word vectors
    word_vectors = api.load('word2vec-ruscorpora-300')
    matches1 = 0
    matches2 = 0
    matches3 = 0
    matches4 = 0
    for key, value in simlex.items():
        try:
            sim_words3 = word_vectors.most_similar(key + '_NOUN', topn=3)
            sim_words10 = word_vectors.most_similar(key + '_NOUN', topn=10)
        except:
            try:
                sim_words3 = word_vectors.most_similar(key + '_VERB', topn=3)
                sim_words10 = word_vectors.most_similar(key + '_VERB', topn=10)
            except:
                try:
                    sim_words3 = word_vectors.most_similar(key + '_ADJ', topn=3)
                    sim_words10 = word_vectors.most_similar(key + '_ADJ', topn=10)
                except:
                    continue
        # print(sim_words10)
        for word in sim_words3:
            if word[0].split('_')[0] == value:
                matches1 = matches1 + 1
        for word in sim_words10:
            if word[0].split('_')[0] == value:
                matches2 = matches2 + 1
    for key, value in wordsim.items():
        try:
            sim_words3 = word_vectors.most_similar(key + '_NOUN', topn=3)
            sim_words10 = word_vectors.most_similar(key + '_NOUN', topn=10)
        except:
            try:
                sim_words3 = word_vectors.most_similar(key + '_VERB', topn=3)
                sim_words10 = word_vectors.most_similar(key + '_VERB', topn=10)
            except:
                try:
                    sim_words3 = word_vectors.most_similar(key + '_ADJ', topn=3)
                    sim_words10 = word_vectors.most_similar(key + '_ADJ', topn=10)
                except:
                    continue
        for word in sim_words3:
            if word[0].split('_')[0] == value:
                matches3 = matches3 + 1
        for word in sim_words10:
            if word[0].split('_')[0] == value:
                matches4 = matches4 + 1
    return [matches1, matches2, matches3, matches4]