Code Example #1
    def __init__(self):
        self.file_name = self.INPUTFILE
        self.csvlength = 0
        self.lemmatiser = Mystem()
        #self.freq_dict = {}
        self.fd = defaultdict(dict)
Code Example #2
from pymystem3 import Mystem

from question_game.jaccard import jaccard
from question_game.nltk_bleu_score import sentence_bleu
from question_game.nltk_bleu_score import SmoothingFunction

stemmer = Mystem()
chencherry = SmoothingFunction()


class Question:
    """Provide a class to keep a single step in the game."""
    def __init__(self, qa_pair: tuple):
        self._question, self._ref_trans = qa_pair
        self._guesses = []  # List[Tuple(guess, mask, score), ]
        self._ref_analysis = stemmer.analyze(self._ref_trans)
        self._ref_lemmas = []
        self._ref_text = []
        self._get_ref_lemmas_and_text()
        self._guess_lemmas = ''

    def add_guess(self, guess):
        self._guess_lemmas = stemmer.lemmatize(guess)  # incl. punct. etc

        mask = self._get_mask()
        score = self._calc_score(guess)

        self._guesses.append((guess, mask, score))

    def get_question(self):
        return self._question
Code Example #3
def preprocessing_raw_data(**kwargs):
    import re

    from airflow.models import Variable
    from elasticsearch.helpers import streaming_bulk
    from elasticsearch_dsl import Search, Q
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_CUSTOM_DICTIONARY_WORD
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from pymorphy2 import MorphAnalyzer
    from pymystem3 import Mystem
    from stop_words import get_stop_words

    from util.service_es import search, update_generator
    from util.util import is_latin, is_word

    start = kwargs['start']
    end = kwargs['end']

    number_of_documents = Variable.get("lemmatize_number_of_documents",
                                       default_var=None)
    if number_of_documents is None:
        raise Exception("No variable!")
    number_of_documents = int(number_of_documents)

    s = search(ES_CLIENT,
               ES_INDEX_DOCUMENT,
               query={},
               source=['text'],
               sort=['id'],
               get_search_obj=True)
    s = s.query(~Q('exists', field="text_lemmatized_yandex")
                | ~Q('exists', field="text_lemmatized"))
    s = s[int(start / 100 *
              number_of_documents):int(end / 100 * number_of_documents) + 1]
    documents = s.execute()

    print('!!! len docs', len(documents))
    stopwords_ru = set(get_stop_words('ru'))
    stopwords_eng = set(get_stop_words('en') + stopwords.words('english'))

    lemmatizer = WordNetLemmatizer()
    morph = MorphAnalyzer()
    m = Mystem()

    s = Search(using=ES_CLIENT, index=ES_INDEX_CUSTOM_DICTIONARY_WORD)
    r = s[:1000000].scan()
    custom_dict = dict((w.word, w.word_normal) for w in r)

    for doc in documents:
        cleaned_doc = " ".join(x.lower() for x in ' '.join(
            re.sub('([^А-Яа-яa-zA-ZӘәҒғҚқҢңӨөҰұҮүІі-]|[^ ]*[*][^ ]*)', ' ',
                   doc.text).split()).split())
        if is_latin(cleaned_doc):
            cleaned_words_list = [
                lemmatizer.lemmatize(word) for word in cleaned_doc.split()
                if len(word) > 3 and word not in stopwords_eng
            ]
            doc['text_lemmatized_yandex'] = ""
        else:
            cleaned_words_list = [
                morph_with_dictionary(morph, word, custom_dict)
                for word in cleaned_doc.split()
                if len(word) > 2 and word not in stopwords_ru
            ]
            cwl_yandex = filter(
                lambda word: is_word(word) and len(word) > 2 and word not in
                stopwords_ru, m.lemmatize(cleaned_doc))
            cleaned_doc_yandex = " ".join(cwl_yandex)
            doc['text_lemmatized_yandex'] = cleaned_doc_yandex
        cleaned_doc = " ".join(cleaned_words_list)
        doc['text_lemmatized'] = cleaned_doc

    documents_processed = 0
    failed = 0
    for ok, result in streaming_bulk(ES_CLIENT,
                                     update_generator(ES_INDEX_DOCUMENT,
                                                      documents),
                                     index=ES_INDEX_DOCUMENT,
                                     chunk_size=5000,
                                     raise_on_error=True,
                                     max_retries=10):
        if not ok:
            failed += 1
        if failed > 5:
            raise Exception("Too many failed ES!!!")
        documents_processed += 1
    return f"{documents_processed} Processed, {known_counter} in pymorphy dict, {custom_dict_counter} in custom dict, {not_in_dict_counter} not found"
Code Example #4
import logging
import sys
import os
import nltk
from pymystem3 import Mystem


ROOT_LOGGER = logging.getLogger()
ROOT_LOGGER.setLevel(logging.DEBUG)

handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
ROOT_LOGGER.addHandler(handler)

ROOT_LOGGER.info("Setting up project environment...")

PROJECT_ROOT = "."
DB_NAME = "question_pairs.db"
logging.info("Make sure that you have an internet connection to download data for vectorization.")
logging.info("Otherwise this program will freeze!")
nltk.download('punkt')
MORPH = Mystem()
CORPUS_SIZE = 100000

FASTTEXT_MODEL = os.path.join(os.path.dirname(os.path.abspath(__file__)), "w2v", "model.model")
FASTTEXT_CACHE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "w2v", "fast_text_matrix.npy")
ELMO_MODEL = os.path.join(os.path.dirname(os.path.abspath(__file__)), "elmo", "elmo")
Code Example #5
from pymystem3 import Mystem
from gram_filter import gram_filter
from ngram import train
from random import uniform

ma = Mystem()


def nachform(word):
    try:
        token = ma.analyze(word)[0]['analysis'][0]['lex']
    except (KeyError, IndexError):
        token = ''
    return token


def unirand(seq, kutoken):
    freq_, res = 0, None
    rnd = uniform(0, 1)
    for token, freq in seq:
        if kutoken == nachform(token):
            freq_ += freq * 2
        else:
            freq_ += freq / 2
        #print(1, rnd, freq_, token, kutoken)
        if rnd < freq_:
            res = (token, nachform(token))
    if res is None:
        freq_ = 0
        for token, freq in seq:
            freq_ += freq
            #print(2, rnd, freq_, token)
Code Example #6
    def __init__(self):
        self.mystem = Mystem()
Code Example #7
File: Parser.py Project: SoftIlnyr/info_search
def get_stem_text(text):
    m = Mystem()
    text_raw = re.sub(u'[^a-zA-Zа-яА-ЯйЙёЁ _]+', '', text)
    text_stem = m.lemmatize(text_raw)
    text_res = ''.join(text_stem).strip()
    return text_res
Code Example #8
from re import sub
from pymystem3 import Mystem
from json import loads, dumps
from nltk.corpus import stopwords
from os.path import dirname, abspath

d = dirname(dirname(abspath(__file__)))

rus_lemmatizer = Mystem()
rus_stopwords = stopwords.words('russian')

FILENAME = 'train7.json'
FILEPATH = f'{d}/resources/{FILENAME}'
OUTPATH = f'{d}/regression/{FILENAME}'
FIELDS = ['author', 'summary', 'title']


def preprocess_text(text):
    return [
        token for token in rus_lemmatizer.lemmatize(text.lower())
        if token not in rus_stopwords and token.strip() != ''
    ]


def make_ranking(query, results):
    ranks = [0 for _ in range(len(results))]
    for token in query:
        for it, result in enumerate(results):
            for field in FIELDS:
                if token in result[field]:
                    ranks[it] += 1
Code Example #9
from nltk.corpus import stopwords as nltk_stopwords
from stop_words import get_stop_words as py_stopwords
from itertools import islice
import re
from number_utils import _num2words, _ordinal_to_cardinal
import pandas as pd
import utils
from multiprocessing import Pool
import math
import numba
import dask.dataframe as dd
from dask.multiprocessing import get

# missing imports used by the module-level code below
import csv
import sys
from pymystem3 import Mystem


csv.field_size_limit(sys.maxsize)
stemmer = Mystem()  # already has caching
stopwords = None
pattern = re.compile(r'[^\w ]')
NUM_ROWS_TO_PROCESS = 582167
ROWS_BUFFER_SIZE = 10
CORE_NUM = 8
BLOCK_SIZE = 2e9


def get_stopwords():
    global stopwords
    if stopwords is None:
        stopwords = nltk_stopwords.words('russian')
        with open('./stopwords.txt', 'r') as f:
            stopwords.extend(map(lambda x: x.replace('\n', ''), f.readlines()))
        stopwords = set(stopwords)
Code Example #10
import pybase64

from bs4 import BeautifulSoup, Comment
from pymystem3 import Mystem

mystem = Mystem(disambiguation=False)


class Document:
    def __init__(self, content, doc_id, url):
        self.doc_id = doc_id
        self.url = url
        self.content = content
        self.parse_html()
        stemmed_words = mystem.lemmatize(self.text)
        self.words = [word.lower() for word in stemmed_words if word.isalnum()]

    def calc_doc_stats(self):
        text_bytes_size = string_size_in_bytes(self.text)
        html_bytes_size = string_size_in_bytes(self.content)
        ratio = text_bytes_size / html_bytes_size
        return DocumentStat(self.doc_id, self.url, self.content_urls, len(self.words), text_bytes_size, ratio)

    def parse_html(self):
        def decompose_comments(soup):
            comments = soup.findAll(text=lambda text: isinstance(text, Comment))
            for comment in comments:
                comment.extract()
            return soup

        def decompose_js(soup):
Code Example #11
def MakeItLex(plaintext):
    m = Mystem()
    lemmas = m.lemmatize(plaintext)
    return ''.join(lemmas)
Code Example #12
def freq_jarg_stats(scrap_file, freq_file, ngram_file, jargon_file,
                    stats_file):
    freq_dict = load_dict()

    df = pd.read_csv(scrap_file,
                     sep=',',
                     header=0,
                     index_col=None,
                     usecols=['author'])

    # Get the number of unique authors
    authors_dict = {}
    for author in df['author']:
        if author not in authors_dict:
            authors_dict[author] = 1
    author_count = len(authors_dict)

    df = pd.read_csv(scrap_file,
                     sep=',',
                     header=0,
                     index_col=None,
                     usecols=['timestamp', 'text'])
    with open(jargon_file, 'w') as f_jarg:
        writer = csv.writer(f_jarg)
        writer.writerow(['timestamp', 'date', 'text', 'lemma', 'context'])
        lemmas_dict = {}
        ngrams_dict = [{} for i in ngram_file]
        mystem = Mystem()
        message_count = len(df['text'])
        lemma_count = 0
        ngrams_count = [0 for i in ngrams_dict]
        for timestamp, text in zip(df['timestamp'], df['text']):
            # Split the message text into sentences
            context_ind = 0
            str_text = str(text)
            #sentences = tsplit(str_text, ['.', '?', '!'])
            sentences = [str_text]
            for sentence in sentences:
                cur_ngrams = [[] for i in ngrams_count]
                sent_analysis = mystem.analyze(sentence)
                for a in sent_analysis:
                    # If the token has no morphological analysis, skip it
                    if ('analysis' not in a) \
                        and (a['text'][0] not in ['0','1','2','3','4','5','6','7','8','9']):
                        continue
                    # Count the lemmas
                    cur_ngrams[1].append(a['text'].lower())
                    if (a['text'][0] in digits):
                        continue
                    lemma_count += 1
                    lemma = a['text']
                    if (len(a['analysis']) > 0) and (len(
                            a['analysis'][0]) > 0) and ('lex'
                                                        in a['analysis'][0]):
                        lemma = a['analysis'][0]['lex']
                    if lemma in lemmas_dict:
                        lemmas_dict[lemma] += 1
                    else:
                        lemmas_dict[lemma] = 1
                    # If the token is not in the frequency dictionary, add it to the list of potential jargon words:
                    if (len(a['analysis']) == 0) or (a['analysis'][0]['lex']
                                                     in freq_dict):
                        continue
                    # Find the context containing the word form being analyzed
                    context_ind = text.find(a['text'], context_ind + 1)
                    context = str_text[max(0, context_ind -
                                           85):min(len(text), context_ind +
                                                   95)]
                    # Write all the data about the unusual word form as a row
                    writer.writerow([
                        str(timestamp),
                        datetime.datetime.fromtimestamp(timestamp).strftime(
                            '%Y-%m-%d'), a['text'], a['analysis'][0]['lex'],
                        context
                    ])

                # Build the 2-, 3-, etc. -grams from the 1-grams
                for n in range(1, len(cur_ngrams)):
                    for v, w in zip(cur_ngrams[n - 1][:-1],
                                    cur_ngrams[1][n - 1:]):
                        cur_ngrams[n].append(v + ' ' + w)
                # Count the n-grams
                for n in range(1, len(ngrams_dict)):
                    for ngram in cur_ngrams[n]:
                        ngrams_count[n] += 1
                        if ngram in ngrams_dict[n]:
                            ngrams_dict[n][ngram] += 1
                        else:
                            ngrams_dict[n][ngram] = 1

    words_list = []
    for lemma, count in lemmas_dict.items():
        words_list.append((count, lemma))
    words_list.sort(reverse=True)

    with open(freq_file, 'w') as f:
        writer = csv.writer(f)
        writer.writerow([
            'lemma', 'pos', 'pos_dict', 'count', 'ipm', 'ipm_from_dict',
            'ipm_ratio'
        ])
        for w in words_list:
            ipm = w[0] * 1000000. / lemma_count
            ipm_from_dict = 0.
            ipm_ratio = 0.
            pos_dict = 'NA'
            if w[1] in freq_dict:
                ipm_from_dict = freq_dict[w[1]][0]
                pos_dict = freq_dict[w[1]][1]
            pos = mystem.analyze(w[1])
            if (len(pos)>0) and ('analysis' in pos[0]) and (len(pos[0]['analysis'])>0)\
            and ('gr' in pos[0]['analysis'][0]):
                pos = pos[0]['analysis'][0]['gr']
            else:
                pos = 'NA'
            if ipm_from_dict > 0:
                ipm_ratio = ipm / ipm_from_dict
            writer.writerow([
                w[1], pos, pos_dict,
                str(w[0]),
                str(round(ipm, 3)),
                str(round(ipm_from_dict, 3)),
                str(round(ipm_ratio, 3))
            ])

    # Min and max timestamp
    df = pd.read_csv(scrap_file,
                     sep=',',
                     header=0,
                     index_col=None,
                     usecols=['timestamp'])
    df = list(df['timestamp'])
    first = datetime.datetime.fromtimestamp(min(df)).strftime('%Y-%m-%d')
    last = datetime.datetime.fromtimestamp(max(df)).strftime('%Y-%m-%d')

    with open(stats_file, 'w') as f:
        f.write('authors,messages,lemmas')
        for n in range(1, len(ngram_file)):
            f.write(',' + str(n) + 'grams')
        f.write(',first,last\n')
        f.write(
            str(author_count) + ',' + str(message_count) + ',' +
            str(lemma_count))
        for n in range(1, len(ngram_file)):
            f.write(',' + str(ngrams_count[n]))
        f.write(',' + first + ',' + last + '\n')

    for n in range(1, len(ngram_file)):
        ngrams_list = []
        for ngram, count in ngrams_dict[n].items():
            ngrams_list.append((count, ngram))
        ngrams_list.sort(reverse=True)

        with open(ngram_file[n], 'w') as f:
            writer = csv.writer(f)
            writer.writerow([str(n)+'gram', 'count', 'ipm']\
               + (['ipm_from_dict'] if n==1 else []))
            for w in ngrams_list:
                ipm = w[0] * 1000000. / ngrams_count[n]
                if n == 1:
                    an = mystem.lemmatize(w[1])
                    ipm_from_dict = 0.
                    if an[0] in freq_dict:
                        ipm_from_dict = freq_dict[an[0]][0]
                if (n == 1) and (an[0][0] in digits):
                    continue
                writer.writerow([w[1], str(w[0]), str(round(ipm, 3))]\
                    + ([str(ipm_from_dict)] if n==1 else []))
Code Example #13
File: views.py Project: TizJourney/2025_backend
    def __init__(self):
        self.stemmer = Mystem()
        self.stop_words = set(nltk.corpus.stopwords.words('russian'))
Code Example #14
def get_mystem():
    global m
    m = Mystem()
Code Example #15
# coding=utf-8
""" Stemming for russian texts """
import argparse
import datetime
import logging
import os
import time

from pymystem3 import Mystem

# Creating a stemmer instance takes a lot of time and resources, so we create it once.
# If you use multithreading, create an instance in every thread and pass it to the stemming function.

main_stemmer = Mystem()


def parse_args(test_args=None):
    """ Argument parser. ['-v', 'DEBUG'] for console testing
    Args:
        test_args(Optional[list]): Use for testing purposes. Defaults to None.
            Used instead of command line arguments
    Returns:
        argparse.Namespace: Command line args if test_args = None, parsed test_args otherwise
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '-L',
        choices=['FATAL', 'CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'],
        dest='logging_level',
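
The comments in the example above recommend one long-lived stemmer, or one instance per thread when multithreading. Below is a minimal sketch of that per-thread pattern; it is not part of the original project, and the stem_texts helper and sample sentences are made up for illustration.

import threading

from pymystem3 import Mystem


def stem_texts(texts, results, idx):
    # each thread owns its own Mystem instance, as the comment above suggests
    stemmer = Mystem()
    results[idx] = [stemmer.lemmatize(text) for text in texts]


if __name__ == '__main__':
    batches = [["мама мыла раму"], ["коты спят на диване"]]  # hypothetical sample data
    results = [None] * len(batches)
    threads = [threading.Thread(target=stem_texts, args=(batch, results, i))
               for i, batch in enumerate(batches)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print(results)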
Code Example #16
def __mystemWrapper(text):
    """
    wrapper for parsing text with mystem
    :param text: the text to parse
    :type text: str
    :return:
        - textAnalytics - analysis data for the whole text (words plus other symbols)
        - wordsAnalytics - analysis data for the words of the text
        - lemmas - lemmas of the words of the text
    :rtype: tuple
    """
    # characters that split text blocks within a sentence
    blockBreakers = [",", ":", ";", "(", ")", "\"", "-"]
    # characters that split sentences
    sentenceBreakers = [".", "!", "?"]
    # characters that split words within a block
    wordsBreakers = [" ", "-"]
    m = Mystem()
    rawAnalytics = m.analyze(text)

    # analysis of the whole text, including unrecognized elements
    textAnalytics = list()
    # analysis of words only
    wordsAnalytics = list()
    # word lemmas
    lemmas = list()

    # iterate over mystem's output
    for rawAnalytic in rawAnalytics:
        # if the analysis contains data, dig into it
        if 'analysis' in rawAnalytic and len(rawAnalytic['analysis']) > 0:
            # first of all, determine the part of speech
            POS = rawAnalytic['analysis'][0]['gr'].split(',')[0]
            if '=' in POS:
                POS = POS.split('=')[0]

            # then the obscenity and personal-name flags
            isObscene = "обсц" in rawAnalytic['analysis'][0]['gr']
            isPersonal = "имя" in rawAnalytic['analysis'][0]['gr']

            # then the lemma
            lemma = rawAnalytic['analysis'][0]['lex']

            # build the resulting dictionary
            analytic = {
                'POS': POS,
                'text': lemma,
                'rawText': rawAnalytic['text'],
                'isObscene': isObscene,
                'isPersonal': isPersonal
            }

            # for verbs, extend it with additional flags
            if POS == 'V':
                isImperative = "пов" in rawAnalytic['analysis'][0]['gr']
                isIndicative = "изъяв" in rawAnalytic['analysis'][0]['gr']
                isGerund = "деепр" in rawAnalytic['analysis'][0]['gr']
                isParticiple = "прич" in rawAnalytic['analysis'][0]['gr']
                isInfinitive = "инф" in rawAnalytic['analysis'][0]['gr']

                analytic['verbsCharacteristics'] = {
                    'isImperative': isImperative,
                    'isIndicative': isIndicative,
                    'isGerund': isGerund,
                    'isParticiple': isParticiple,
                    'isInfinitive': isInfinitive
                }

            wordsAnalytics.append(analytic)
            lemmas.append(lemma)
            analytic['type'] = 'word'
            textAnalytics.append(analytic)
        else:
            # for text that mystem did not parse as a word, determine its type
            char = rawAnalytic['text']
            charTrimmed = char.strip()
            charType = "unrecognized"
            if char in wordsBreakers:
                charType = "space"
                charTrimmed = char
            elif charTrimmed in blockBreakers:
                charType = "blockBreaker"
            elif charTrimmed in sentenceBreakers:
                charType = "sentenceBreaker"
            elif re.match(r"^[a-zA-Z]+$", char):
                charType = "enText"
                # mystem does not handle English words, so we add them to the lemmas manually
                lemmas.append(char)
            elif re.match(r"^[а-яА-ЯёЁ]+$", char):
                charType = "ruText"
                # unrecognized Russian words
                lemmas.append(char)
            elif re.match(r"^[0-9]+$", char):
                charType = "number"
                # a number
                lemmas.append(char)

            analytic = {'type': charType, 'rawText': char, 'text': charTrimmed}
            textAnalytics.append(analytic)

    return textAnalytics, wordsAnalytics, lemmas
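
A minimal usage sketch for the wrapper above; the sample sentence is invented, and it assumes the call happens in the same module, with re and Mystem imported as the function requires.

if __name__ == '__main__':
    text_analytics, words_analytics, lemmas = __mystemWrapper("Мама быстро мыла раму!")
    for word in words_analytics:
        # each word entry carries the POS tag, the original form, and the lemma
        print(word['POS'], word['rawText'], '->', word['text'])
    print(lemmas)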
Code Example #17
def Lemmatize(query):
    m = Mystem()
    lemmas = m.lemmatize(query)
    return ''.join(lemmas)
Code Example #18
import re
import json
import math
import numpy as np
from pymystem3 import Mystem

from search_course.hw5 import do_ranking, write_to_file

LA = np.linalg
ARTICLES = json.load(open('result2.json'))
DOC_COLLECTION_SIZE = 10
MYSTEM_INSTANCE = Mystem()


def _mystem(text):
    return re.sub(r'[^\w ]+', '', ''.join(MYSTEM_INSTANCE.lemmatize(text)).strip())


def _put_to_dict(words_dict, word, counter):
    if word not in words_dict:
        words_dict[word] = {}

    counter_dict = words_dict[word]
    counter_dict[counter] = counter_dict.get(counter, 0) + 1


def _idf(q):
    idx = json.load(open('index_mystem.json'))

    term_idfs = {}
Code Example #19
    def __init__(self, mapping):
        self.m = Mystem()
        self.mapping = mapping
Code Example #20
def set_mystem():
    global mystem
    mystem = Mystem()
Code Example #21
File: morphology.py Project: birch-group/elan2folia
"""
BiRCh Morphology Module
[email protected]

Input: list of tokens
Output: morphological analysis of input tokens

References:
...
"""
from pickle import load
from tokenization import *
from pymystem3 import Mystem
import re

# exclude non-word tokens (e.g.{'text':' '} or {'text':'\n'}) from mystem's result list
m = Mystem(entire_input=False)

# info of dict_of_dims
# key: first two letters of a dims word
# value: set of dims words
# with open('dict_of_dims.pkl','rb') as f:
with open('dict_of_dims_lower.pkl', 'rb') as f:  # handle upper/lower cases
    dod = load(f)
for k in dod:
    dod[k] = set(re.sub(r'ё', r'е', w) for w in dod[k])

# sep_mystem = re.compile(r'[,=|]')
# def analyze_mystem_gr(gr_value): # type(gr_value): str
#     """ -> tuple of (pos, features) """
#     temp = sep_mystem.search(gr_value)
#     if temp:
Code Example #22
                           'рассказать про Ваш опыт и обстоятельства',
                           'дать совет. Что нужно/необходимо/важно/обязательно сделать в данном случае?',
                           'если это уместно, рассказать про долг и правила. Что правильно сделать в данном случае?']

HREF_WORDS = ['href', 'nofollow', 'rel', '_blank', 'target', 'https', 'www', 'http', 'com', 'youtube', 'youtu', 'be']
IMG_WORDS = ['img', 'src', 'imgsmail', 'data', 'gif', 'big', 'jpg', 'download', 'otvet']
VIDEO_WORDS = ['amp', 'video']
ALL_STOPWORDS = stopwords.words('russian') + [
    'это', 'наш', 'тыс', 'млн', 'млрд', 'также', 'т', 'д',
    'который', 'прошлый', 'сей', 'свой', 'наш', 'мочь', 'такой', 'очень'
]

NUM_ADVICES_FOR_NORM = 5
RU_WORDS = re.compile("[А-Яа-я]+")
WORDS = re.compile(r"(\w+)")
M = Mystem()

MODEL = utils.get_pickle_file(config.path_to_model)
TFIDF = utils.get_pickle_file(config.path_to_tfidf)


def only_words(text):
    """
    Remove extraneous characters so that only words remain in the text
    """
    return " ".join(WORDS.findall(text))


def lemmatize(text):
    """
    Lemmatize the text
Code Example #23
import pandas as pd
from argparse import ArgumentParser
from pymystem3 import Mystem
from tqdm import tqdm
from gensim.models import FastText

morph = Mystem()

if __name__ == '__main__':
    parser = ArgumentParser(description='Enrich model with UD contexts')
    parser.add_argument('model', help='FastText model directory')
    parser.add_argument('vectors', help='where to save vectors')
    args = parser.parse_args()

    comp = pd.read_csv('compounds_select_1000.csv')

    chast1 = list(comp['Часть 1'].values)
    chast2 = list(comp['Часть 2'].values)

    print('loading model')
    model = FastText.load(args.model)
    print('loaded')
    print(len(model.wv.vocab))

    part1_lem = []
    part2_lem = []

    print('Starting lemmatize')

    for w1, w2 in zip(chast1, chast2):
        lem_w1 = morph.lemmatize(w1)[0]
Code Example #24
import xml.etree.ElementTree as Elem

from pymystem3 import Mystem

from utils import get_words

tree = Elem.parse('issue.xml')
root = tree.getroot()
articles = root.find('articles').findall('article')
result_docs = set(range(len(articles)))
substr_list = set()
porter_word_map = get_words('index_mystem.xml')
query = input("add query:")
query = ''.join(Mystem().lemmatize(query)).strip()
for word in query.split():
    if word[0] == '-':
        substr_list.update(set(porter_word_map.get(word[1:], list())))
    elif porter_word_map.get(word, None) is not None:
        result_docs.intersection_update(set(porter_word_map.get(word)))
    else:
        result_docs = set()
        break
    result_docs.difference_update(substr_list)
print(result_docs)
Code Example #25
File: main.py Project: kgabbasova/doc_recognition
    def __init__(self):
        self._mystem = Mystem(entire_input=True)
        self._stopwords = self._read_stopwords()
        self._vect_word = None
        self._vect_char = None
        self._model = None
Code Example #26
# coding: utf-8
from bs4 import BeautifulSoup
import os
import csv
import regex as re
from nltk import word_tokenize
from pymystem3 import Mystem
import pickle

MULTINUCLEAR_RELATIONS = [
    'comparison', 'contrast', 'joint', 'restatement', 'same-unit', 'sequence'
]
analyzer = Mystem()
pos_map = {
    'A': 'ADJ',
    'ADV': 'ADV',
    'ADVPRO': 'PRON',
    'ANUM': 'ADJ',
    'APRO': 'PRON',
    'COM': 'NOUN',
    'CONJ': 'CCONJ',
    'INTJ': 'PART',
    'NUM': 'NUM',
    'PART': 'PART',
    'PR': 'ADP',
    'S': 'NOUN',
    'SPRO': 'PRON',
    'V': 'VERB'
}

Code Example #27
    words = re.split(r'\W+', text)
    words = [w.lower() for w in words if w.isalpha()]

    stop_words = set(stopwords.words('russian'))
    words = [w for w in words if not w in stop_words]

    stemmer_eng = SnowballStemmer('english')
    stemmer_rus = SnowballStemmer('russian')
    stemmed = [stemmer_rus.stem(stemmer_eng.stem(word)) for word in words]

    for w in stemmed:
        words_porter.append((str(uuid.uuid4()), w, item[0]))

    # I added the mystem binary itself to gitignore because of its size; the wrapper can download MyStem on its own if no binary path is given
    stemmer_ya = Mystem(mystem_bin='./mystem')
    lemmas = stemmer_ya.lemmatize(' '.join(words))
    lemmas = [l for l in lemmas if len(l.strip()) > 0]

    for w in lemmas:
        words_mystem.append((str(uuid.uuid4()), w, item[0]))

    conn.commit()

for w in words_porter:
    cur.execute("INSERT INTO words_porter VALUES(%s, %s, %s)", w)

for w in words_mystem:
    cur.execute("INSERT INTO words_mystem VALUES(%s, %s, %s)", w)

conn.commit()
Code Example #28
import codecs
import collections

fd = codecs.open("heptral.txt", 'r', 'utf-16')
sqlFile = str(fd.read())

# print (sqlFile)

from pymystem3 import Mystem
import re

RU = Mystem()
lemmas = RU.lemmatize(sqlFile)
analysis = RU.analyze(sqlFile)
type_pattern = re.compile("^([A-Z]+)*")
# print (lemmas)
# for i in range(lemmas):

# ROS_dict = {i.get('analysis')[0].get('lex'): \
#                 type_pattern.findall(i.get('analysis')[0].get('gr'))[0] \
#             for i in analysis if 'analysis' in i.keys()}

wordcount = {}
for word in lemmas:
    if len(word) > 5:
        if word not in wordcount:
            wordcount[word] = 1
        else:
            wordcount[word] += 1
n_print = int(input("How many most common words to print: "))
print("\nOK. The {} most common words are as follows\n".format(n_print))
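
The script above already imports collections but counts words by hand; the same tally could be produced with collections.Counter. A hedged sketch of that variant, reusing lemmas and n_print from the example and assuming the truncated ending simply prints the top words:

wordcount = collections.Counter(word for word in lemmas if len(word) > 5)
for word, count in wordcount.most_common(n_print):
    print(word, count)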
Code Example #29
import json
from urllib.request import urlopen
from urllib.parse import quote, urlencode
from flatten_dict import flatten
from pymystem3 import Mystem
import re

m = Mystem()
url_base = 'https://ru.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=json&&titles='
#url = 'https://ru.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=json&&titles=Рассказы_братьев_Стругацких'
url_title = 'Наполеон I'
# jsonurl = urlopen(url)
# file = json.loads(jsonurl.read())
# article = str(file)
# # for k, v in flatten(file).items():
# #     print('key:', k, '\n', v, type(v))
# print(article.replace(r'\n', '\n'))

print (quote(url_title.encode('utf8')))
# print(flatten(file))
# for k, v in
# article.replace('\\xa0',' ')
# with open('https://ru.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=json&&titles=%D0%A0%D0%B0%D1%81%D1%81%D0%BA%D0%B0%D0%B7%D1%8B_%D0%B1%D1%80%D0%B0%D1%82%D1%8C%D0%B5%D0%B2_%D0%A1%D1%82%D1%80%D1%83%D0%B3%D0%B0%D1%86%D0%BA%D0%B8%D1%85',encoding=('unicode-escape')) as f:
#        article = f.read()
#        #
#        #print(article.decode('latin1'))
# print(article)
# title_regex = re.compile(r'\"title\".*?\"(?P<title>.*?)\"')
# match = title_regex.search(article)
# title = match.group('title')
#
Code Example #30
import json
from pymystem3 import Mystem
from pyasn1.compat.octets import null
from xml.etree import ElementTree

# Options from Sergei Gladilin:   mystem_ruscorpora --format=json -i -c -g --eng-gr
# -w is also suggested if we handle unknown words satisfactorily
# For the full set of options, take pymystem from git: pip install git+https://github.com/nlpub/pymystem3 (see https://github.com/nlpub/pymystem3).
m = Mystem(mystem_bin='./mystem_ruscorpora.linux',
           grammar_info=True,
           disambiguation=False,
           entire_input=True,
           glue_grammar_info=True,
           use_english_names=True)
# Mystem variant with an imperfective-aspect lemma for perfective verbs
#masp = Mystem(grammar_info=True, disambiguation=False, entire_input=True, glue_grammar_info=True, use_english_names=True)


class MystemFixLists(object):
    def __init__(self):
        self.add_fix = self.load_add_cfg()
        self.del_fix = self.load_del_cfg()

    def load_add_cfg(self):
        result = {}
        with open('add.cfg', 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                if line[0] == '+':