Code Example #1
def build_corpora(directory):
    signs = ['.', ',', '«', '»', '(', ')', '-', ':', ';', '?', '!', '@']

    out = open(out_file, mode='w', encoding='utf-8')
    out_lem = open(file=out_lemmatized, encoding='utf-8', mode='w')
    out_filenames = open(out_names, encoding='utf-8', mode='w')

    #printing corpora
    print("Printing corpora")
    for file in os.scandir("./" + directory):
        if file.is_file() and file.name != '.DS_Store':
            text = open(file, mode='r', encoding='utf-8').read()
            out.write(file.name + clean_text(text) + '\n')
            out_filenames.write(file.name + '\n')
            #printing lemmed
            print("Printing lemmatized:", file.name)
            lemmer = mystem.Mystem()
            lemmatized = lemmer.lemmatize(clean_text(text))
            out_lem.write(file.name + ' ')
            for lemma in lemmatized:
                if lemma not in signs:
                    out_lem.write(lemma)
                    out_lem.write(' ')

    #printing dictionary
    print("Printing morphological dictionary")
    subprocess.run(args_dictionary)
    #printing ngrams
    print("Printing ngrams")
    subprocess.run(args_turbotopics, stdout=None)
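This snippet comes from a larger script and leans on names defined elsewhere in its project (out_file, out_lemmatized, out_names, args_dictionary, args_turbotopics, clean_text, the mystem import alias). Purely as an assumption, a minimal clean_text could look like this:

# Hypothetical helper, not shown in the original project: collapse a raw
# document into one whitespace-normalized line so it can be written as a
# single corpus record.
def clean_text(text):
    return ' ' + ' '.join(text.split())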
Code Example #2
File: text.py Project: Temirlan/t-model
def save_bag_of_words(path: str):
    """ algorithm preprocessing text """
    stop_words = nltk.corpus.stopwords.words('russian')
    mystem = pymystem3.Mystem()

    file_object = open(BAG_OF_WORDS_PATH, 'w', encoding="utf-8")

    text = " "
    #converterUTF8(filename)
    with codecs.open(path, encoding='UTF-8') as f_manager:
        for line in f_manager:
            if line.strip():
                text = text + " " + line

        word = nltk.word_tokenize(text)
        word_ws = [w.lower() for w in word if w.isalpha()]
        word_w = [w for w in word_ws if w not in stop_words]

        lem = mystem.lemmatize((" ").join(word_w))
        lema = [w for w in lem if w.isalpha() and len(w) > 1]
        freq = nltk.FreqDist(lema)

        results = [(key + ":" + str(val)) for key, val in freq.items()
                   if val > 1]

        file_object.write("|text" + " " + (" ").join(results) + '\n')

    file_object.close()

    return freq.items()
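A minimal way to exercise the function, assuming BAG_OF_WORDS_PATH is a writable path and the NLTK Russian stop words are already downloaded; the input file name is made up for illustration:

# Writes one Vowpal Wabbit-style line "|text lemma:count ..." for the document
# and returns the lemma frequency items.
freq_items = save_bag_of_words('raw_review.txt')
print(sorted(freq_items, key=lambda kv: kv[1], reverse=True)[:10])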
Code Example #3
def prepare_group(mypath):
    files = []
    stem = pymystem3.Mystem()

    for r, d, f in os.walk(mypath):
        for file in f:
            files.append(os.path.join(r, file))

    docs = []
    full = ''
    wordset = set()

    for filepath in files:
        corpus = []
        with open(filepath) as file:
            for line in file:
                corpus.append(line)
        corpus = [el for el in corpus if el != '\n']

        corpus = [re.sub(r'[^\w\s]', '', el)[:-1] for el in corpus]

        for el in corpus:
            words = el.lower()
            proc = stem.lemmatize(words)
            proc = [w for w in proc if (w.strip() != '') and (w != '\n')]
            docs.append(proc)

            for word in proc:
                full += ' ' + word
                wordset.add(word)
        print(filepath)

    return docs, wordset, full
Code Example #4
 def __init__(self):
     # lemmatizer
     self.lemmatizer = pymystem3.Mystem()
     # nltk stopwords
     self.stopwords = stopwords.words('russian')
     self.stopwords.extend(['', ' ', '\n', '«', '»'])
     self.stopwords.extend([p for p in string.punctuation])
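Only the constructor is shown here. A hypothetical companion method, sketched under the assumption that the class exposes something like it, would combine the lemmatizer and stop-word list set up above:

 # Hypothetical method, not part of the original class: lemmatize a raw string
 # and drop the stop words and punctuation collected in __init__.
 def preprocess(self, text):
     lemmas = self.lemmatizer.lemmatize(text.lower())
     return [lemma for lemma in lemmas
             if lemma.strip() and lemma not in self.stopwords]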
Code Example #5
def get_lexemas_from_text(cursor, atext=""):
    term_extractor = TermExtractor()
    mystem = pymystem3.Mystem()
    lexemas = []
    for term in term_extractor(atext):
        for lexema in str(term.normalized).split(" "):
            lexema = mystem.analyze(lexema)[0]['analysis'][0]['lex']
            id_lexema = lexema_id_by_inf(cursor, lexema)
            lexemas += [id_lexema]
    return lexemas
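One caveat (an observation, not part of the original code): for tokens Mystem cannot analyze, such as numbers or Latin strings, the 'analysis' entry may be empty or missing, so the chained indexing above can raise. A hedged defensive variant:

def safe_lex(mystem, token):
    # Assumed helper: fall back to the raw token when Mystem returns no
    # analysis for it.
    parsed = mystem.analyze(token)
    analyses = parsed[0].get('analysis') if parsed else None
    return analyses[0]['lex'] if analyses else token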
Code Example #6
def test(model):
    while True:
        in_word = input("Введите слово или q: ")
        if in_word == "q": break
        in_word = pymystem3.Mystem().lemmatize(in_word)[0]
        try:
            for word in model.most_similar(positive=[in_word], topn=10):
                print(word)
        except KeyError:
            print("word '{}' not in vocabulary".format(in_word))
Code Example #7
def stem_russian_batches(batch_filename):
    if not hasattr(stem_russian_batches, "stemmer"):
        stem_russian_batches.stemmer = pymystem3.Mystem()

    batch_stem_path = './stem/'

    print(batch_filename + " loading...")
    batch = artm.library.Library().LoadBatch(batch_filename)
    print(batch_filename + " loading done.")
    batch_stem = artm.messages_pb2.Batch()
    # stem tokens
    token_list = list()
    for token in batch.token:
        token_list.append(token)
    text = ' '.join(token_list)
    text_stem = stem_russian_batches.stemmer.lemmatize(text)
    token_stem_list = ''.join(text_stem).strip().split(' ')

    token_id_to_token_stem_id = dict()
    token_stem_to_token_stem_id = dict()
    for (token_id, token_stem) in enumerate(token_stem_list):
        #print(token_id, token_stem)
        if token_stem not in token_stem_to_token_stem_id:
            token_stem_to_token_stem_id[token_stem] = len(batch_stem.token)
            batch_stem.token.append(token_stem)
        token_id_to_token_stem_id[token_id] = token_stem_to_token_stem_id[
            token_stem]
    print(batch_filename + " " + str(len(batch.token)) + " -> " +
          str(len(batch_stem.token)))
    # convert items
    for item in batch.item:
        # print item.title
        # add item
        item_stem = batch_stem.item.add()
        item_stem.id = item.id
        item_stem.title = item.title
        # add fields
        for field in item.field:
            field_stem_dict = defaultdict(int)
            for token_num in range(len(field.token_id)):
                token_id = field.token_id[token_num]
                token_stem_id = token_id_to_token_stem_id[token_id]
                token_count = field.token_count[token_num]
                field_stem_dict[token_stem_id] += token_count

            field_stem = item_stem.field.add()
            field_stem.name = field.name
            for token_stem_id in field_stem_dict:
                field_stem.token_id.append(token_stem_id)
                field_stem.token_count.append(field_stem_dict[token_stem_id])
    # save batch
    print(batch_filename + " saving result...")
    artm.library.Library().SaveBatch(batch_stem, batch_stem_path)
    print(batch_filename + " saving done.")
    return 0
Code Example #8
File: app.py Project: neverix/2020-ai-viz
def prep_words(words):
    lemmer = pymystem3.Mystem()
    stopwords = set(nltk.corpus.stopwords.words("russian") + ["весь", "это"])
    tokens = lemmer.lemmatize(' '.join(words).lower())
    tokens = [token.replace(' ', '') for token in tokens]
    tokens = [
        token for token in tokens
        if token and token not in stopwords and not all(
            char in punctuation or char.isnumeric() for char in token)
    ]
    return tokens
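A quick usage sketch, assuming the snippet's imports (nltk, pymystem3, string.punctuation) are in place; the word list is made up:

# Lemmatizes, lowercases, and drops stop words, punctuation and purely
# numeric tokens.
print(prep_words(["Мамы", "мыли", "рамы", "123", "и", "окна"]))
# prints something close to ['мама', 'мыть', 'рама', 'окно']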
Code Example #9
def one_word_production(word, model, sets, fnames, tfidf, coef, coef2):
    primal = word
    stem = pymystem3.Mystem()
    proc = stem.lemmatize(word)[0]
    word = proc.strip()
    if word == '':
        return 0, primal
    if choose_by_tfidf(fnames, tfidf, word):
        chose, pos = select_by_cos(word, sets, model, tfidf, fnames, coef,
                                   coef2)
        return 1, chose
    else:
        return 0, primal
Code Example #10
    def __init__(self, config_file):
        self.config = json.load(open(config_file, 'r'))

        self.__tokenizer = TreebankWordTokenizer()
        #self.stemmer = SnowballStemmer('russian')

        self.__mystem = pymystem3.Mystem()
        self.__ft_c = pickle.load(open(self.config['word_embeddings'], 'rb'))
        self.__class_names = pickle.load(open(self.config['class_names'],
                                              'rb'))
        self.__old2new = pickle.load(open(self.config['old2new'], 'rb'))
        self.__new2old = pickle.load(open(self.config['new2old'], 'rb'))
        self.__num2title = pickle.load(open(self.config['num2title'], 'rb'))
        self.__build_net(mtype='cnn')
Code Example #11
def part_production(string, model, sets, fnames, tfidf, coef, coef2):
    start = string.split()
    res = []
    stem = pymystem3.Mystem()
    proc = stem.lemmatize(string)
    corpus = [re.sub(r'[^\w\s]', '', el) for el in proc]
    corpus = [el for el in corpus if el.strip() != '']

    for i, word in enumerate(corpus):
        if choose_by_tfidf(fnames, tfidf, word):
            chose, pos = select_by_cos(word, sets, model, tfidf, fnames, coef,
                                       coef2)
            res.append([word, chose, i])
    return res
Code Example #12
    def __init__(self, settings, task_type=None):
        """
        Arguments
        ---------
            settings : dict
            task_type : str
        """
        self.mystem = pymystem3.Mystem(entire_input=False)
        self.settings = settings

        if (task_type is not None):
            key = task_type + '_stop_words'
            self.task_specific_stop_words = self.settings[key]
        else:
            self.task_specific_stop_words = []
Code Example #13
def preprocess_document(document, russian_stop_words):

    document = document.lower()
    document = re.sub(u'\xa0|\n', ' ', document)
    document = re.sub('[^а-яa-z ]', '', document)

    mystem = pymystem3.Mystem()
    tokens = mystem.lemmatize(document)

    tokens = [
        token for token in tokens if ((token not in russian_stop_words) and (
            token.strip() not in string.punctuation) and (len(token) > 2))
    ]

    document = ' '.join(tokens)

    return document
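For reference, a hypothetical call; the stop-word list would normally come from NLTK, and note that a fresh Mystem process is spawned on every invocation, so for large corpora it is cheaper to hoist the analyzer out of the function:

# Assumed usage with NLTK's Russian stop words.
russian_stop_words = nltk.corpus.stopwords.words('russian')
print(preprocess_document('Кошки часто спят на окне.', russian_stop_words))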
Code Example #14
def checkExecTimeMystemOneText(texts):
    lol = lambda lst, sz: [lst[i:i+sz] for i in range(0, len(lst), sz)]
    txtpart = lol(texts, 1000)
    res = []
    m = Stem.Mystem()  # start mystem once instead of once per batch
    for txtp in txtpart:
        alltexts = ' '.join([txt + ' br ' for txt in txtp])
        words = m.lemmatize(alltexts)
        doc = []
        for txt in words:
            if txt != '\n' and txt.strip() != '':
                if txt == 'br':
                    res.append(doc)
                    doc = []
                else:
                    doc.append(" " + txt + " ")
    return res
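The 'br' separator trick here joins many texts into a single lemmatize call to amortize the cost of piping data through the mystem binary. A hedged usage sketch with made-up input:

# Each inner list of the result corresponds to one input text.
texts = ["Мама мыла раму.", "Кошка спала на окне."]
print(checkExecTimeMystemOneText(texts))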
Code Example #15
File: _annotating_file.py Project: strategy155/hws
def _soup_parsing(output_soup, input_soup, freq_dictionary):
    _stemmer = pymystem3.Mystem()
    _line_counter = 0
    _word_counter = -1
    _words_array = re.findall(r"[а-яА-ЯёЁ]+|\n", input_soup.text)
    _words_set = set(_words_array)
    _some_threads = []
    _word_annotation_dict = {}
    new_q = queue.Queue()
    start = time.perf_counter()
    thread_num = 8
    for i in range(thread_num):
        t = threading.Thread(target=_get_word_annotation,
                             args=(new_q, _word_annotation_dict, _stemmer))
        t.start()
        _some_threads.append(t)
    for word in _words_set:
        new_q.put(word)
    # for word in _words_array:
    #     if word == '\n':
    #         _line_counter += 1
    #         _append_to_tag(output_soup, "body", "p")
    #     else:
    #         _word_counter += 1
    #         t = threading.Thread(target=_add_word_with_annotation,args=(_line_counter,_word_counter,
    #                                                                   word,_stemmer,output_soup,
    #                                                                   freq_dictionary))
    #         t.start()
    #         try:
    #             freq_dictionary[word.lower()] += 1
    #         except KeyError:
    #             freq_dictionary[word.lower()] = 1
    #         _some_threads.append(t)
    new_q.join()
    for i in range(thread_num):
        new_q.put(None)
    for t in _some_threads:
        t.join()
    stop = time.perf_counter()
    print(stop - start)
    return output_soup
Code Example #16
def utterance_to_bow(utterance):
    stop_words = nltk.corpus.stopwords.words('russian')
    # build a proper set: substring checks against one long string would
    # wrongly match short words contained inside longer stop words
    stoplist = set(stop_words) | {"пожалуйста", "здоавствуйте"}

    utterance = utterance.lower()
    utterance = utterance.replace("тк", "").replace(
        "сбербанк",
        "банк").replace("сбер",
                        "банк").replace("сбербанка",
                                        "банк").replace("банка", "банк")
    utterance = re.sub(r'[^а-яА-Я ]+', '', utterance)
    tokens = [word for word in str(utterance).split() if word not in stoplist]

    mystem = pymystem3.Mystem()
    utterance = [mystem.lemmatize(token)[0] for token in tokens]

    bow = dictionary.doc2bow(utterance)
    return bow
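dictionary here is a module-level gensim dictionary; a sketch of how it might be built beforehand (an assumption about the surrounding script):

# Hypothetical setup: build the gensim Dictionary that doc2bow relies on from
# already tokenized training utterances.
from gensim import corpora

tokenized_utterances = [["банк", "карта", "заблокировать"], ["перевод", "банк"]]
dictionary = corpora.Dictionary(tokenized_utterances)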
Code Example #17
 def _lemmatize_words(self, texts):
     mystem = pymystem3.Mystem()
     return [[mystem.lemmatize(token)[0] for token in text]
             for text in texts]
Code Example #18
 def __init__(self):
     self.mystem_gr_vocab = self.mystem_gr_tokens.split('|')
     self.mystemmer = pymystem3.Mystem()
     self.mystemmer_cache = {}
     self.mystem_gr_tokenizer = RegexpTokenizer(self.mystem_gr_tokens)
     self.mystem_gr_vectorizer = CountVectorizer(
         tokenizer=self.mystem_gr_tokenizer.tokenize,
         vocabulary=self.mystem_gr_vocab,
         binary=True)
Code Example #19
def print_most_common(items_list):
    print('Lemma frequency:')
    print(',\n'.join("'{}' : {}".format(str(elt[0]), str(elt[1]))
                     for elt in items_list))


def print_csv(item_list):
    with open('word_bag_stat.csv', 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=';')
        for item in item_list:
            csv_writer.writerow(item)


file = open('word_bag_text.txt', 'r', encoding='utf-8')
text = file.read()
file.close()

stemmer = pymystem3.Mystem(entire_input=False, speedup=True)
statistics = Counter()

if not text.strip():
    print('Your text is empty.')
else:
    lemmas = stemmer.lemmatize(text)
    statistics += Counter(lemmas)

print()
stat_sorted_list = statistics.most_common()
print_most_common(stat_sorted_list)
print_csv(stat_sorted_list)
Code Example #20
 def setUpClass(cls):
     mystem = pymystem3.Mystem(entire_input=False, disambiguation=True)
     cls.text_analyzer = Analyzer(mystem)
Code Example #21
It copes better with bastard (out-of-dictionary) word forms and includes a morphological analyzer.
It has no homonymy disambiguation, but other people have built that: PyPI project rnnmorph"""

import re
from collections import defaultdict, namedtuple

import pandas as pd
import numpy as np

import pymystem3

import constants

import write_lemmas_to_file

MYSTEM = pymystem3.Mystem(entire_input=False, disambiguation=True)

# Frequency distribution of lemmas
LEMMAS = defaultdict(lambda: 0)

# Lemmas and their hashes
LEMMAS_HASHES = dict()

# Filter for non-proper Nouns (S) and all Verbs (V)
PAT = re.compile('([SV]),(?!имя,|фам,|сокр=|гео)')
""" FIXME: Лемматизация происходит для двух задач: 
    - составление ключ слов
    - классификация строк описаний на основе ключевых слов
    """
# Добавит в дату просто колонку с леммами
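The function that this comment announces is cut off here. Purely as an assumption, adding a lemma column with the module-level MYSTEM instance and the PAT filter could look roughly like this:

# Hypothetical sketch, not the original function: lemmatize a text column of a
# pandas DataFrame, keeping only lemmas whose grammatical info matches PAT.
def add_lemma_column(df, text_column='description'):
    def lemmas_for(text):
        kept = []
        for item in MYSTEM.analyze(text):
            analysis = item.get('analysis')
            if analysis and PAT.search(analysis[0].get('gr', '')):
                kept.append(analysis[0]['lex'])
        return ' '.join(kept)

    df['lemmas'] = df[text_column].apply(lemmas_for)
    return df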
Code Example #22
 def __init__(self):
     wikipedia.set_lang("en")
     self.stem = pymystem3.Mystem()
     self.sparql = SPARQLWrapper("http://dbpedia.org/sparql")
     self.sparql.setReturnFormat(JSON)
     self.ner = ner_detector.NerDetector()
Code Example #23
 def __init__(self, vocab_filename=None):
     self.__tokenizer = TreebankWordTokenizer()
     self.__mystem = pymystem3.Mystem()
Code Example #24
File: processor_mystem.py Project: dvzubarev/isanlp
 def init(self):
     if self._mystem is None:
         self._mystem = pymystem3.Mystem()
         self._mystem.start()
Code Example #25
File: topics.py Project: Serenitas/topic-modeller
        else:
            index = doc.split('|')[0]
            prob = doc.split('|')[1]
            docs[int(index)].append(topic_name + '|' + prob)

all_tokens = model_artm.score_tracker['top_tokens_score'].last_tokens
ready_tokens = model_artm.score_tracker['top_tokens_score'].last_tokens
ngrams_tokens = model_artm.score_tracker['top_tokens_score'].last_tokens
for topic in ngrams_tokens:
    ngrams_tokens[topic] = []
#for topic in all_tokens.keys():
#    tokens = all_tokens[topic]
#    ready_tokens[topic] = tokens[:10]

ngrams = open('adapted.txt', mode='r', encoding='utf-8').read().split('\n')
lemmer = mystem.Mystem()
topicfile = open('topics.txt', mode='w', encoding='utf-8')
tokens = []

for ngram in ngrams:
    if ngram == '':
        continue
    for topic in all_tokens.keys():
        tokens = all_tokens[topic]
        all_in = True
        for word in ngram.strip('\n').split(' '):
            if lemmer.lemmatize(word)[0] not in tokens:
                all_in = False
                break
        if all_in:
            ngrams_tokens[topic].append(ngram)
Code Example #26
def parsing(BATCH_SIZE, dsl_dict, short, dialect):

    m = pymystem3.Mystem()

    idx_word = []
    words = []
    infos = []
    pos_list = []
    for i in range(3, 60001):
        s = dsl_dict[i]
        if not s.startswith('\t'):
            idx_word.append(i)
            words.append(s.strip())

    # deleting by index while iterating shifts positions; filter instead
    short = [s for s in short if s not in ('см.', 'что-л.')]

    print('Кол-во слов:', len(words))
    for q in range(len(idx_word) - 1):

        if q % 100 == 0:
            print(q, datetime.now())

        s = ' '.join(
            [w.strip() for w in dsl_dict[idx_word[q] + 1:idx_word[q + 1]]])
        s = re.search(r'\[.*?\].*?\[.*?\](.*?)\[.*?\]', s).group(1)
        s_list = s.split(' ')
        start = 0
        end = 0
        st_ch = 0
        if '2)' not in s_list and '2.' not in s_list:
            for i in range(len(s_list) - 1):
                for d in dialect:
                    if s_list[i].startswith(d):
                        start = i
                        st_ch = 1
                for l in short:
                    if s_list[i].startswith(l):
                        start = i
                        st_ch = 1
            if 'см.' in s_list:
                end = s_list.index('см.')
        if '2)' in s_list or '2.' in s_list:
            try:
                end = s_list.index('2)')
            except ValueError:
                end = s_list.index('2.')
            for i in range(end - 1):
                for d in dialect:
                    if s_list[i].startswith(d):
                        start = i
                        st_ch = 1
                for l in short:
                    if s_list[i].startswith(l):
                        start = i
                        st_ch = 1
            if 'см.' in s_list:
                if end > s_list.index('см.'):
                    end = s_list.index('см.')

        result = []

        if start == 0 and st_ch == 0:
            start = -1
        if end != 0:
            if start > end:
                start = -1
            for i in range(start + 1, end):
                result.append(s_list[i])
        else:
            for i in range(start + 1, len(s_list) - 1):
                result.append(s_list[i])

        for r in result:
            if r.endswith(')'):
                result = result[result.index(r) + 1:]

        info = ' '.join(result)
        info = info.split(';')[0]
        if info.startswith('3 '):
            info = info.split('3 ')[1]
        if info.startswith('1.'):
            info = info.split('1.')[1]
        if info.endswith('.'):
            info = info.split('.')[0]

        infos.append(info.strip())

        if 'межд.' in s_list:
            pos = 'ij'
        else:
            if words[len(infos) - 1].endswith('-мӣ'):
                pos = 'v'
            else:
                pos = detect_pos(m, info)
        pos_list.append(pos)

    final_list = []
    work_list = []
    for j in range(len(pos_list) - 1):
        if pos_list[j] == "nothing":
            work_list.append(j)
    work_list = list(chunk(work_list, BATCH_SIZE))
    for el in work_list:
        text = []
        for q in el:
            text.append(infos[q].split(' ')[0])
        pos_l = detect_pos(m, ' '.join(text))
        for p in range(len(pos_l)):
            pos_list[el[p]] = pos_l[p]

    for i in range(len(words) - 1):
        final_list.append(words[i] + '\t' + infos[i] + '\t' + pos_list[i])

    return final_list
Code Example #27
File: test.py Project: Fatalll/KBQA
# coding=utf-8

from SPARQLWrapper import SPARQLWrapper, JSON
import pymystem3
import requests
import json
import re

m = pymystem3.Mystem()

relations = dict()

sparql = SPARQLWrapper("http://dbpedia.org/sparql")
sparql.setQuery("""
SELECT distinct ((SUBSTR(str(?property), 29)) as ?property) ((SUBSTR(str(?equals), 32)) as ?equals) WHERE {{
         ?instance a dbo:Person . 
         ?instance ?property ?obj .
         ?property owl:equivalentProperty ?equals .
         FILTER (SUBSTR(str(?equals), 1, 31) = "http://www.wikidata.org/entity/") .
}}
""")
sparql.addParameter("timeout", "30000")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

for z in results['results']['bindings']:
    relations[z['property']['value']] = z['equals']['value']

print(len(relations), relations)
counter = 0
Code Example #28
 def __init__(self):
     self.__analyzer = pymystem3.Mystem()
Code Example #29
File: lemma_counter.py Project: deer95/Meanist

def stemming(text):
    lemmas = stemmer.lemmatize(text, speedup=True)
    lemma_stat = Counter(lemmas).most_common()
    for lemma, count in lemma_stat:
        print('{}: {}'.format(count, lemma))
    print('\n#####\n')
    over_file.write('Number of tokens: {}\n'.format(len(lemmas)))
    over_file.write('Number of lemmas: {}\n\n'.format(len(lemma_stat)))


texts_file = open('SampleRU.txt', 'r', encoding='utf-8')
over_file = open('overview.txt', 'w', encoding='utf-8')

stemmer = pymystem3.Mystem(disambiguation=False, entire_input=False)
new_text = True
text_str = ''

for line in texts_file:
    # when the separator between texts is found, reset the per-text variables
    if line == '####\n':
        stemming(text_str)
        new_text = True
        text_str = ''
        continue
    # record the text's title
    elif new_text:
        over_file.write(line.upper())
        new_text = False
    # accumulate the text into a single line
Code Example #30
File: utils.py Project: lopuhin/WSI
    'вилка',
    'винт',
    'горшок',
    # single sense in the dictionary
    'вата',
    'бык',
    'байка',
    'баян',
    'бомба',
    # really single sense
    'борщ',
    'воск',
    'бухгалтер',
]

MyStem = pymystem3.Mystem()


def load_stopwords():
    with open('stopwords.txt') as f:
        return {line.strip().split()[0] for line in f if line.strip()}


stopwords = load_stopwords()


def load_contexts(root, word, window=None):
    with open(os.path.join(root, '{}.txt'.format(word))) as f:
        contexts = []
        for line in f:
            left, _, right = line.split('\t')