Code Example #1
def semantic_analysis():
    wikiwordnet = WikiWordnet()
    text = calculated_text.get(1.0, END)
    text = text.replace('\n', '')
    if text == '':
        return None
    start = time.time()
    if letters_in_the_word(text):
        # A synonym ring, or synset, is a group of data elements considered
        # semantically equivalent for information-retrieval purposes.
        synsets2 = wikiwordnet.get_synsets(text)
        if not synsets2:  # guard: the word may be missing from the wordnet
            return None
        text = ''
        lemmas2 = [x.lemma() for x in synsets2[0].get_words()]

        # A synset groups lemmas that share one meaning; a lemma is an
        # individual word form.
        for lemma in lemmas2:
            text += lemma + ' '

        synset2 = synsets2[0]
        for hypernym in wikiwordnet.get_hypernyms(synset2):
            for w in hypernym.get_words():
                text += w.lemma() + ' '

        for hyponym in wikiwordnet.get_hyponyms(synset2):
            for w in hyponym.get_words():
                text += w.lemma() + ' '

        word_cloud(text)
        end = time.time()
        print("Total time: {:.1f}".format(end - start))
    else:
        messagebox.showwarning('Warning!!!', 'One word!', type='ok')
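
letters_in_the_word() is not shown in the snippet; judging by the warning branch, it checks that the input is a single word. A minimal sketch under that assumption:

    def letters_in_the_word(text):
        # True for a single word containing letters only.
        return text.isalpha()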
Code Example #2
File: worldcloud.py  Project: Valery1508/NLIP-lab4
def semantic_analysis(text):
    wikiwordnet = WikiWordnet()
    tree_text = re.sub('-', ',', text)
    if tree_text == '':
        return None
    if letters_in_the_word(tree_text):
        # A synonym ring, or synset, is a group of data elements considered
        # semantically equivalent for information-retrieval purposes.
        synsets2 = wikiwordnet.get_synsets(tree_text)
        if not synsets2:  # guard: the word may be missing from the wordnet
            return None
        text = ''
        lemmas2 = [x.lemma() for x in synsets2[0].get_words()]
        # A synset groups lemmas that share one meaning; a lemma is an
        # individual word form.
        for lemma in lemmas2:
            text += lemma + ' '

        synset2 = synsets2[0]
        for hypernym in wikiwordnet.get_hypernyms(synset2):
            for w in hypernym.get_words():
                text += w.lemma() + ' '

        for hyponym in wikiwordnet.get_hyponyms(synset2):
            for w in hyponym.get_words():
                text += w.lemma() + ' '
        return word_cloud(text)
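
word_cloud() is not defined in either snippet; Code Example #3 below inlines the same rendering step, so a matching sketch would be:

    from wordcloud import WordCloud
    import matplotlib.pyplot as plt

    def word_cloud(text):
        cloud = WordCloud(relative_scaling=1.0).generate(text)
        plt.imshow(cloud)
        plt.axis("off")
        plt.show()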
Code Example #3
def view_window():
    text = calculated_text.get(1.0, END)
    text = text.replace('\n', '')
    if text != '':
        check = check_word(text)
        if not check:
            messagebox.showwarning('!',
                                   'You must enter a single word',
                                   type='ok')
        else:
            wiki_wordnet = WikiWordnet()
            syn = wiki_wordnet.get_synsets(text.lower())
            if not syn:  # guard: the word may be missing from the wordnet
                return None
            text = ''
            for w in syn[0].get_words():
                text += w.lemma() + ' '
            for i in wiki_wordnet.get_hyponyms(syn[0]):
                for hyponym in i.get_words():
                    text += hyponym.lemma() + ' '
            for j in wiki_wordnet.get_hypernyms(syn[0]):
                for hypernym in j.get_words():
                    text += hypernym.lemma() + ' '
            wordcloud = WordCloud(relative_scaling=1.0, ).generate(text)
            plt.imshow(wordcloud)
            plt.axis("off")
            plt.show()
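
check_word() is not shown in the snippet; given the warning message, it presumably verifies that the input is a single word. A minimal sketch under that assumption:

    def check_word(text):
        return len(text.split()) == 1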
Code Example #4
File: search.py  Project: ledyaykina/Searcher_Vzsar
    def __init__(self):
        # Initialize the Elasticsearch client
        self.elastic_search = Elasticsearch()
        # Initialize WikiWordnet (a ready-made synonym library)
        self.wiki_wordnet = WikiWordnet()
        # Path to the file with the parsed data
        self.json_path = 'C:\\Users\\ledya\\PycharmProjects\\ir-vzsar\\vzsar\\output.json'
        # Name of the search index
        self.index_name = 'news'
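
A hedged sketch of how this searcher might use WikiWordnet for query expansion; expand_query is an illustrative assumption, not a method shown in the original project:

    def expand_query(self, word):
        # Collect the query word plus every lemma from its synsets.
        terms = {word}
        for synset in self.wiki_wordnet.get_synsets(word):
            for w in synset.get_words():
                terms.add(w.lemma())
        return ' '.join(sorted(terms))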
Code Example #5
File: homonymfeatures.py  Project: Biatris/Homonym
    def CreateHypernymCorpus(self, language="ru", verbose=False):
        if language == "ru":
            wikiwordnet = WikiWordnet()
            self.fulldata_words['hypernym'] = self.fulldata_words['synset'][
                0].apply(lambda w: wikiwordnet.get_hypernyms(w))

        elif language == "en":
            raise NotImplementedError
        else:
            raise NotImplementedError

        return self.fulldata_words
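
A hypothetical usage sketch, assuming the enclosing class (named HomonymFeatures here only for illustration) holds a pandas DataFrame fulldata_words whose 'synset' column contains wiki_ru_wordnet synsets:

    features = HomonymFeatures(...)  # hypothetical constructor
    corpus = features.CreateHypernymCorpus(language="ru")
    print(corpus['hypernym'].head())  # hypernym synsets per word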
Code Example #6
def semantic_parse(text):
    wiki_wordnet = WikiWordnet()
    syn = wiki_wordnet.get_synsets(text.lower())
    if syn:
        synonyms = [l.lemma() for l in syn[0].get_words()]
        hyponyms = [
            hyponym.lemma() for i in wiki_wordnet.get_hyponyms(syn[0])
            for hyponym in i.get_words()
        ]
        hypernyms = [
            hypernym.lemma() for j in wiki_wordnet.get_hypernyms(syn[0])
            for hypernym in j.get_words()
        ]
    else:
        synonyms = hyponyms = hypernyms = []
    return {'synonyms': synonyms, 'hyponyms': hyponyms, 'hypernyms': hypernyms}
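
An example invocation; the exact lists depend on the installed wiki_ru_wordnet data:

    result = semantic_parse('собака')
    print(result['synonyms'])   # lemmas from the word's first synset
    print(result['hypernyms'])  # lemmas of its hypernym synsets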
Code Example #7
File: krv_nltk.py  Project: motokazmin/tips-triks
def get_synsets_dict(texts, return_graph=True):
    wikiwordnet = WikiWordnet()
    G = nx.Graph()

    for text in texts:
        synsets = wikiwordnet.get_synsets(text)
        for synset in synsets:
            for w in synset.get_words():
                # Link two words that appear in the same synset; texts.values
                # implies texts is a pandas Series of words.
                if text != w.lemma() and w.lemma() in texts.values:
                    G.add_edge(text, w.lemma())

    d = {}
    for g in nx.connected_components(G):
        default_lemma = g.pop()
        d.update(dict.fromkeys(g, default_lemma))
        d[default_lemma] = default_lemma

    if return_graph:
        return d, G
    else:
        return d
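
The test w.lemma() in texts.values implies texts is a pandas Series of words; a minimal usage sketch under that assumption:

    import pandas as pd

    words = pd.Series(['дом', 'здание', 'машина'])
    mapping, graph = get_synsets_dict(words)
    # mapping sends every word in a connected synonym component to one
    # canonical lemma; graph is the underlying networkx graph.
    print(mapping)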
Code Example #8
# -*- coding: utf-8 -*-
import pymorphy2
import nltk
import copy
import collections
import math
from wiki_ru_wordnet import WikiWordnet

wikiwordnet = WikiWordnet()
morph = pymorphy2.MorphAnalyzer()


################
def synonymscheck(inlist, keylist):
    corrans = 0  # correct answers (matching synonym pairs)
    for answer, key in zip(inlist, keylist):
        answer_synsets = wikiwordnet.get_synsets(answer)
        for n in wikiwordnet.get_synsets(key):
            if n in answer_synsets:
                corrans += 1
                print(corrans, '/3')
    return corrans
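
# A hypothetical call of the checker above; the answer and key lists must be
# the same length, and the Russian words here are illustrative only:
#
#     synonymscheck(['пёс', 'машина', 'дом'], ['собака', 'автомобиль', 'здание'])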


###############
def getlemma(inlist):
    corrans = 0  # correct answers
    for i in inlist:
        for synset in wikiwordnet.get_synsets(i):
            ...  # remainder of this function not shown in the source

Code Example #9
    def __call__(self):
        root = etree.parse(
            r'C:\Users\kiva0319\PycharmProjects\Diploma2020\processed\paraphrases.xml'
        )
        root = root.getroot()
        corpus = etree.SubElement(root, "corpus")

        result_xml = etree.Element('raw_data')
        result_doc = etree.ElementTree(result_xml)

        corpus_info = etree.SubElement(result_xml, 'head')
        etree.SubElement(corpus_info, 'description').text = "—"
        etree.SubElement(corpus_info, 'date').text = str(date.today())
        articles_list = etree.SubElement(result_xml, 'corpus')

        count = 0

        for element in root[1]:
            id = element[0].text
            old_id = element[1].text
            id_1 = element[2].text
            id_2 = element[3].text
            title_1 = element[4].text
            title_2 = element[5].text
            text_1 = element[6].text
            text_2 = element[7].text
            words_title_1 = int(element[8].text)
            words_title_2 = int(element[9].text)
            words_article_1 = int(element[10].text)
            words_article_2 = int(element[11].text)
            num_of_paragraphs_1 = int(element[12].text)
            num_of_paragraphs_2 = int(element[13].text)
            element_paragraphs_1 = element[14].text
            element_paragraphs_2 = element[15].text
            jaccard = element[16].text
            clas = element[17].text

            print(count, id, flush=True)

            # words_max = max(words_max, words_article_1)
            # words_max = max(words_max, words_article_2)
            # chars_max = max(chars_max, len(text_1))
            # chars_max = max(chars_max, len(text_2))
            # continue

            paraphrase = etree.SubElement(articles_list, 'paraphrase')
            etree.SubElement(paraphrase, 'value', name="id").text = id
            etree.SubElement(paraphrase, 'value', name="old_id").text = old_id
            etree.SubElement(paraphrase, 'value', name="id_1").text = id_1
            etree.SubElement(paraphrase, 'value', name="id_2").text = id_2
            etree.SubElement(paraphrase, 'value',
                             name="title_1").text = title_1
            etree.SubElement(paraphrase, 'value',
                             name="title_2").text = title_2
            etree.SubElement(paraphrase, 'value',
                             name="jaccard").text = jaccard
            etree.SubElement(paraphrase, 'value', name="class").text = clas

            # words and paragraphs diff
            etree.SubElement(paraphrase, 'words_title_diff').text = str(
                abs(words_title_1 - words_title_2))
            etree.SubElement(paraphrase, 'words_article_diff').text = str(
                abs(words_article_1 - words_article_2))
            etree.SubElement(paraphrase, 'paragraphs_diff').text = str(
                abs(num_of_paragraphs_1 - num_of_paragraphs_2))

            # flesch_reading_ease
            textstat.textstat.set_lang("ru")
            etree.SubElement(paraphrase,
                             'flesch_reading_ease_title_1').text = str(
                                 textstat.flesch_reading_ease(" ".join(
                                     title_1.split(";"))))
            etree.SubElement(paraphrase,
                             'flesch_reading_ease_title_2').text = str(
                                 textstat.flesch_reading_ease(" ".join(
                                     title_2.split(";"))))
            etree.SubElement(
                paraphrase, 'flesch_reading_ease_article_1').text = str(
                    textstat.flesch_reading_ease(" ".join(text_1.split(";"))) /
                    num_of_paragraphs_1)
            etree.SubElement(
                paraphrase, 'flesch_reading_ease_article_2').text = str(
                    textstat.flesch_reading_ease(" ".join(text_2.split(";"))) /
                    num_of_paragraphs_2)

            # BLEU (the XML tag names below keep the original "BLUE" spelling)
            weights1 = (1, 0, 0, 0)
            weights2 = (0.5, 0.5, 0, 0)
            weights3 = (0.33, 0.33, 0.33, 0)
            weights4 = (0.25, 0.25, 0.25, 0.25)

            list_title_1 = title_1.split(";")
            list_title_2 = title_2.split(";")
            list_text_1 = text_1.split(";")
            list_text_2 = text_2.split(";")

            etree.SubElement(paraphrase, 'BLUE_w1_titles').text = str(
                sentence_bleu([list_title_1], list_title_2, weights=weights1))
            etree.SubElement(paraphrase, 'BLUE_w2_titles').text = str(
                sentence_bleu([list_title_1], list_title_2, weights=weights2))
            etree.SubElement(paraphrase, 'BLUE_w3_titles').text = str(
                sentence_bleu([list_title_1], list_title_2, weights=weights3))
            etree.SubElement(paraphrase, 'BLUE_w4_titles').text = str(
                sentence_bleu([list_title_1], list_title_2, weights=weights4))

            etree.SubElement(paraphrase, 'BLUE_w1_articles').text = str(
                sentence_bleu([list_text_1], list_text_2, weights=weights1))
            etree.SubElement(paraphrase, 'BLUE_w2_articles').text = str(
                sentence_bleu([list_text_1], list_text_2, weights=weights2))
            etree.SubElement(paraphrase, 'BLUE_w3_articles').text = str(
                sentence_bleu([list_text_1], list_text_2, weights=weights3))
            etree.SubElement(paraphrase, 'BLUE_w4_articles').text = str(
                sentence_bleu([list_text_1], list_text_2, weights=weights4))

            # NIST: nltk's sentence_nist raises ZeroDivisionError when the
            # hypothesis shares no n-grams with the reference, so each call
            # below is guarded and falls back to 0.
            nist_1_titles = 0
            nist_1_articles = 0

            nist_2_titles = 0
            nist_2_articles = 0

            nist_3_titles = 0
            nist_3_articles = 0

            try:
                nist_1_titles = sentence_nist([list_title_1],
                                              list_title_2,
                                              n=1)
            except ZeroDivisionError:
                print("ZeroDivisionError id =", count)

            try:
                nist_1_articles = sentence_nist([list_text_1],
                                                list_text_2,
                                                n=1)
            except ZeroDivisionError:
                print("ZeroDivisionError id =", count)

            try:
                nist_2_titles = sentence_nist([list_title_1],
                                              list_title_2,
                                              n=2)
            except ZeroDivisionError:
                print("ZeroDivisionError id =", count)

            try:
                nist_2_articles = sentence_nist([list_text_1],
                                                list_text_2,
                                                n=2)
            except ZeroDivisionError:
                print("ZeroDivisionError id =", count)

            try:
                nist_3_titles = sentence_nist([list_title_1],
                                              list_title_2,
                                              n=3)
            except ZeroDivisionError:
                print("ZeroDivisionError id =", count)

            try:
                nist_3_articles = sentence_nist([list_text_1],
                                                list_text_2,
                                                n=3)
            except ZeroDivisionError:
                print("ZeroDivisionError id =", count)

            etree.SubElement(paraphrase,
                             'nist_1_titles').text = str(nist_1_titles)
            etree.SubElement(paraphrase,
                             'nist_1_articles').text = str(nist_1_articles)

            etree.SubElement(paraphrase,
                             'nist_2_titles').text = str(nist_2_titles)
            etree.SubElement(paraphrase,
                             'nist_2_articles').text = str(nist_2_articles)

            etree.SubElement(paraphrase,
                             'nist_3_titles').text = str(nist_3_titles)
            etree.SubElement(paraphrase,
                             'nist_3_articles').text = str(nist_3_articles)

            etree.SubElement(paraphrase,
                             'nist_1_diff').text = str(nist_1_titles -
                                                       nist_1_articles)
            etree.SubElement(paraphrase,
                             'nist_2_diff').text = str(nist_2_titles -
                                                       nist_2_articles)
            etree.SubElement(paraphrase,
                             'nist_3_diff').text = str(nist_3_titles -
                                                       nist_3_articles)

            # ROUGE
            title_1_space = title_1.replace(";", " ")
            title_2_space = title_2.replace(";", " ")
            text_1_space = text_1.replace(";", " ")
            text_2_space = text_2.replace(";", " ")

            rouge = Rouge()
            title_score = rouge.get_scores(title_1_space, title_2_space)[0]
            article_score = rouge.get_scores(text_1_space, text_2_space)[0]

            etree.SubElement(paraphrase, 'rouge-1_titles').text = str(
                title_score['rouge-1']['f'])
            etree.SubElement(paraphrase, 'rouge-2_titles').text = str(
                title_score['rouge-2']['f'])
            etree.SubElement(paraphrase, 'rouge-L_titles').text = str(
                title_score['rouge-l']['f'])

            etree.SubElement(paraphrase, 'rouge-1_articles').text = str(
                article_score['rouge-1']['f'])
            etree.SubElement(paraphrase, 'rouge-2_articles').text = str(
                article_score['rouge-2']['f'])
            etree.SubElement(paraphrase, 'rouge-L_articles').text = str(
                article_score['rouge-l']['f'])

            # METEOR. Note: nltk's meteor_score looks up wordnet.synsets(),
            # while WikiWordnet exposes get_synsets(), so a thin adapter may
            # be needed for this call to work as written.
            stemmer = SnowballStemmer("russian")
            wikiwordnet = WikiWordnet()
            etree.SubElement(paraphrase, 'meteor_title').text = str(
                meteor_score([title_1_space],
                             title_2_space,
                             stemmer=stemmer,
                             wordnet=wikiwordnet))
            etree.SubElement(paraphrase, 'meteor_article').text = str(
                meteor_score([text_1_space],
                             text_2_space,
                             stemmer=stemmer,
                             wordnet=wikiwordnet))

            count += 1

        # Use a context manager so the file handle is closed even on error.
        with open("processed/metrics.xml", 'wb') as out_file:
            result_doc.write(out_file,
                             xml_declaration=True,
                             encoding='utf-8',
                             pretty_print=True)
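
A minimal standalone check of the BLEU call pattern used above: nltk's sentence_bleu takes a list of reference token lists plus one hypothesis token list, which is why the titles and texts are split on ';' before scoring:

    from nltk.translate.bleu_score import sentence_bleu

    reference = ['мама', 'мыла', 'раму']
    hypothesis = ['мама', 'мыла', 'раму']
    # Identical token lists with unigram-only weights give a score of 1.0.
    print(sentence_bleu([reference], hypothesis, weights=(1, 0, 0, 0)))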
Code Example #10
File: classifier.py  Project: katarina-L/txmm
"""
CLASSIFIER

Created on Thu Jan 21 17:16:32 2021

@author: Katarina
"""
import nltk
import nltk.classify
import re
from pymystem3 import Mystem
import random
from wiki_ru_wordnet import WikiWordnet

mystem = Mystem()
wwnet = WikiWordnet()
ALL_FEATURES = {}


def load_all_data():
    # Returns a tuple (training_data, test_data); each element is a list of
    # (tweet, label) tuples.
    training_data = []
    test_data = []
    n_train = 0
    n_total = 0
    with open('imperfective_masked_cleaned_masterfile.txt',
              encoding='utf-8') as file:
        lines = ""
        for line in file:
            if re.search('@[a-zA-Z0-9_]+', line) is not None:
                ...
Code Example #11
    def __init__(self, prefix_trie=None):
        self.wikiwordnet = WikiWordnet()
        # TODO: move this into the bot later and initialize it before
        # startup, like the other models. Honor the argument if one is passed.
        self.prefix_trie = prefix_trie if prefix_trie is not None else get_prefix_trie()
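
get_prefix_trie() is not shown in the snippet; a minimal sketch of such a helper, assuming a nested-dict prefix trie rather than the project's actual structure:

    def get_prefix_trie(words=()):
        trie = {}
        for word in words:
            node = trie
            for ch in word:
                node = node.setdefault(ch, {})
            node['$'] = word  # terminal marker storing the full word
        return trie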
Code Example #12
def eval_rus():

    simlex = read_files.read_simlex_rus_file()
    wordsim = read_files.read_wordsim_rus_file()

    wn = WikiWordnet()
    """
	#evaluation of wordnet
	matches = 0
	for key,value in wordsim.items():
		synset = wn.get_synsets(key)
		for syn in synset:
			for w in syn.get_words():
				word = w.lemma()
				if value == word:
					matches = matches + 1
	"""

    #evaluation of word vectors
    word_vectors = api.load('word2vec-ruscorpora-300')

    # gensim raises KeyError when a tagged key is missing from the
    # vocabulary, so fall back through the part-of-speech tags in turn.
    def most_similar_tagged(key):
        for pos in ('_NOUN', '_VERB', '_ADJ'):
            try:
                return (word_vectors.most_similar(key + pos, topn=3),
                        word_vectors.most_similar(key + pos, topn=10))
            except KeyError:
                continue
        return None

    matches = []
    for dataset in (simlex, wordsim):
        top3_hits = 0
        top10_hits = 0
        for key, value in dataset.items():
            result = most_similar_tagged(key)
            if result is None:
                continue
            sim_words3, sim_words10 = result
            top3_hits += sum(1 for word in sim_words3
                             if word[0].split('_')[0] == value)
            top10_hits += sum(1 for word in sim_words10
                              if word[0].split('_')[0] == value)
        matches.extend([top3_hits, top10_hits])

    # [simlex top-3, simlex top-10, wordsim top-3, wordsim top-10]
    return matches
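
A hypothetical driver, assuming read_files provides the two readers used above and the gensim model has already been downloaded:

    if __name__ == '__main__':
        top3_simlex, top10_simlex, top3_wordsim, top10_wordsim = eval_rus()
        print(top3_simlex, top10_simlex, top3_wordsim, top10_wordsim)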