def __init__(self):
    self.file_name = self.INPUTFILE
    self.csvlength = 0
    self.lemmatiser = Mystem()
    # self.freq_dict = {}
    self.fd = defaultdict(dict)
from pymystem3 import Mystem

from question_game.jaccard import jaccard
from question_game.nltk_bleu_score import sentence_bleu
from question_game.nltk_bleu_score import SmoothingFunction

stemmer = Mystem()
chencherry = SmoothingFunction()


class Question:
    """Provide a class to keep a single step in the game."""

    def __init__(self, qa_pair: tuple):
        self._question, self._ref_trans = qa_pair
        self._guesses = []  # List[Tuple(guess, mask, score), ]
        self._ref_analysis = stemmer.analyze(self._ref_trans)
        self._ref_lemmas = []
        self._ref_text = []
        self._get_ref_lemmas_and_text()
        self._guess_lemmas = ''

    def add_guess(self, guess):
        self._guess_lemmas = stemmer.lemmatize(guess)  # incl. punct. etc.
        mask = self._get_mask()
        score = self._calc_score(guess)
        self._guesses.append((guess, mask, score))

    def get_question(self):
        return self._question
def preprocessing_raw_data(**kwargs):
    import re

    from airflow.models import Variable
    from elasticsearch.helpers import streaming_bulk
    from elasticsearch_dsl import Search, Q
    from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT, ES_INDEX_CUSTOM_DICTIONARY_WORD
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    from pymorphy2 import MorphAnalyzer
    from pymystem3 import Mystem
    from stop_words import get_stop_words
    from util.service_es import search, update_generator
    from util.util import is_latin, is_word

    start = kwargs['start']
    end = kwargs['end']

    # check for None before int(), otherwise a missing variable raises TypeError
    number_of_documents = Variable.get("lemmatize_number_of_documents", default_var=None)
    if number_of_documents is None:
        raise Exception("No variable!")
    number_of_documents = int(number_of_documents)

    s = search(ES_CLIENT, ES_INDEX_DOCUMENT, query={}, source=['text'], sort=['id'], get_search_obj=True)
    s = s.query(~Q('exists', field="text_lemmatized_yandex") | ~Q('exists', field="text_lemmatized"))
    s = s[int(start / 100 * number_of_documents):int(end / 100 * number_of_documents) + 1]
    documents = s.execute()
    print('!!! len docs', len(documents))

    stopwords_ru = set(get_stop_words('ru'))
    stopwords_eng = set(get_stop_words('en') + stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    morph = MorphAnalyzer()
    m = Mystem()

    s = Search(using=ES_CLIENT, index=ES_INDEX_CUSTOM_DICTIONARY_WORD)
    r = s[:1000000].scan()
    custom_dict = dict((w.word, w.word_normal) for w in r)

    for doc in documents:
        # keep only Cyrillic/Latin/Kazakh letters and hyphens, drop tokens with '*', lowercase
        cleaned_doc = " ".join(x.lower() for x in ' '.join(
            re.sub('([^А-Яа-яa-zA-ZӘәҒғҚқҢңӨөҰұҮүІі-]|[^ ]*[*][^ ]*)', ' ', doc.text).split()).split())
        if is_latin(cleaned_doc):
            cleaned_words_list = [
                lemmatizer.lemmatize(word)
                for word in cleaned_doc.split()
                if len(word) > 3 and word not in stopwords_eng
            ]
            doc['text_lemmatized_yandex'] = ""
        else:
            # morph_with_dictionary is assumed to be defined elsewhere in this module
            cleaned_words_list = [
                morph_with_dictionary(morph, word, custom_dict)
                for word in cleaned_doc.split()
                if len(word) > 2 and word not in stopwords_ru
            ]
            cwl_yandex = filter(
                lambda word: is_word(word) and len(word) > 2 and word not in stopwords_ru,
                m.lemmatize(cleaned_doc))
            cleaned_doc_yandex = " ".join(cwl_yandex)
            doc['text_lemmatized_yandex'] = cleaned_doc_yandex
        cleaned_doc = " ".join(cleaned_words_list)
        doc['text_lemmatized'] = cleaned_doc

    documents_processed = 0
    failed = 0
    for ok, result in streaming_bulk(ES_CLIENT, update_generator(ES_INDEX_DOCUMENT, documents),
                                     index=ES_INDEX_DOCUMENT, chunk_size=5000,
                                     raise_on_error=True, max_retries=10):
        if not ok:
            failed += 1
        if failed > 5:
            raise Exception("Too many failed ES!!!")
        documents_processed += 1
    # known_counter, custom_dict_counter and not_in_dict_counter are assumed to be
    # module-level counters maintained inside morph_with_dictionary
    return f"{documents_processed} Processed, {known_counter} in pymorphy dict, " \
           f"{custom_dict_counter} in custom dict, {not_in_dict_counter} not found"
import logging
import os
import sys

import nltk
from pymystem3 import Mystem

ROOT_LOGGER = logging.getLogger()
ROOT_LOGGER.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
ROOT_LOGGER.addHandler(handler)

ROOT_LOGGER.info("Setting up project environment...")

PROJECT_ROOT = "."
DB_NAME = "question_pairs.db"

logging.info("Make sure that you have an internet connection to download data for vectorization.")
logging.info("Otherwise this program will freeze!")
nltk.download('punkt')

MORPH = Mystem()
CORPUS_SIZE = 100000
FASTTEXT_MODEL = os.path.join(os.path.dirname(os.path.abspath(__file__)), "w2v", "model.model")
FASTTEXT_CACHE = os.path.join(os.path.dirname(os.path.abspath(__file__)), "w2v", "fast_text_matrix.npy")
ELMO_MODEL = os.path.join(os.path.dirname(os.path.abspath(__file__)), "elmo", "elmo")
from random import uniform

from pymystem3 import Mystem

from gram_filter import gram_filter
from ngram import train

ma = Mystem()


def nachform(word):
    """Return the dictionary (base) form of a word, or '' if mystem cannot analyze it."""
    try:
        token = ma.analyze(word)[0]['analysis'][0]['lex']
    except (KeyError, IndexError):
        token = ''
    return token


def unirand(seq, kutoken):
    freq_, res = 0, None
    rnd = uniform(0, 1)
    for token, freq in seq:
        # boost tokens whose base form matches kutoken, dampen the rest
        if kutoken == nachform(token):
            freq_ += freq * 2
        else:
            freq_ += freq / 2
        # print(1, rnd, freq_, token, kutoken)
        if rnd < freq_:
            res = (token, nachform(token))
    if res is None:
        freq_ = 0
        for token, freq in seq:
            freq_ += freq
            # print(2, rnd, freq_, token)
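# For reference, a minimal sketch of the Mystem.analyze output that nachform above
# indexes into: a list of dicts, where analyzable words carry an 'analysis' list whose
# first candidate holds the lemma under 'lex'. The grammar string shown is illustrative;
# exact tags depend on the mystem version.
from pymystem3 import Mystem

ma = Mystem()
result = ma.analyze("кошки")
# roughly: [{'text': 'кошки',
#            'analysis': [{'lex': 'кошка', 'gr': 'S,жен,од=им,мн'}]},
#           {'text': '\n'}]
lemma = result[0]['analysis'][0]['lex']  # 'кошка'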
def __init__(self):
    self.mystem = Mystem()
import re

from pymystem3 import Mystem


def get_stem_text(text):
    # note: constructing a Mystem instance per call is expensive; reuse one where possible
    m = Mystem()
    # keep only Latin/Cyrillic letters, spaces and underscores
    text_raw = re.sub(u'[^a-zA-Zа-яА-ЯйЙёЁ _]+', '', text)
    text_stem = m.lemmatize(text_raw)
    text_res = ''.join(text_stem).strip()
    return text_res
from json import loads, dumps
from os.path import dirname, abspath
from re import sub

from nltk.corpus import stopwords
from pymystem3 import Mystem

d = dirname(dirname(abspath(__file__)))

rus_lemmatizer = Mystem()
rus_stopwords = stopwords.words('russian')

FILENAME = 'train7.json'
FILEPATH = f'{d}/resources/{FILENAME}'
OUTPATH = f'{d}/regression/{FILENAME}'
FIELDS = ['author', 'summary', 'title']


def preprocess_text(text):
    return [
        token for token in rus_lemmatizer.lemmatize(text.lower())
        if token not in rus_stopwords and token.strip() != ''
    ]


def make_ranking(query, results):
    ranks = [0 for _ in range(len(results))]
    for token in query:
        for it, result in enumerate(results):
            for field in FIELDS:
                if token in result[field]:
                    ranks[it] += 1
    return ranks  # assumed return; the original fragment ends without one
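# A quick, hedged usage sketch of preprocess_text above: stop words and the whitespace
# tokens mystem emits are filtered out. Exact lemmas depend on the mystem version.
print(preprocess_text("Мама мыла новую раму"))
# expected roughly: ['мама', 'мыть', 'новый', 'рама']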
import csv
import math
import re
import sys
from itertools import islice
from multiprocessing import Pool

import dask.dataframe as dd
import numba
import pandas as pd
from dask.multiprocessing import get
from nltk.corpus import stopwords as nltk_stopwords
from pymystem3 import Mystem
from stop_words import get_stop_words as py_stopwords

import utils
from number_utils import _num2words, _ordinal_to_cardinal

csv.field_size_limit(sys.maxsize)

stemmer = Mystem()  # already has caching
stopwords = None
pattern = re.compile(r'[^\w ]')

NUM_ROWS_TO_PROCESS = 582167
ROWS_BUFFER_SIZE = 10
CORE_NUM = 8
BLOCK_SIZE = 2e9


def get_stopwords():
    # lazily build the stop word set once per process
    global stopwords
    if stopwords is None:
        stopwords = nltk_stopwords.words('russian')
        with open('./stopwords.txt', 'r') as f:
            stopwords.extend(map(lambda x: x.replace('\n', ''), f.readlines()))
        stopwords = set(stopwords)
import pybase64
from bs4 import BeautifulSoup, Comment
from pymystem3 import Mystem

mystem = Mystem(disambiguation=False)


class Document:
    def __init__(self, content, doc_id, url):
        self.doc_id = doc_id
        self.url = url
        self.content = content
        self.parse_html()
        stemmed_words = mystem.lemmatize(self.text)
        self.words = [word.lower() for word in stemmed_words if word.isalnum()]

    def calc_doc_stats(self):
        text_bytes_size = string_size_in_bytes(self.text)
        html_bytes_size = string_size_in_bytes(self.content)
        ratio = text_bytes_size / html_bytes_size
        return DocumentStat(self.doc_id, self.url, self.content_urls,
                            len(self.words), text_bytes_size, ratio)

    def parse_html(self):
        def decompose_comments(soup):
            comments = soup.findAll(text=lambda text: isinstance(text, Comment))
            for comment in comments:
                comment.extract()
            return soup

        def decompose_js(soup):
def MakeItLex(plaintext):
    m = Mystem()
    lemmas = m.lemmatize(plaintext)
    return ''.join(lemmas)
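# Mystem.lemmatize returns a list of lemmas interleaved with the original whitespace
# plus a trailing newline, which is why wrappers like MakeItLex re-join the pieces.
# A minimal sketch (output shape as in the pymystem3 README; exact tokens may vary
# by mystem version):
from pymystem3 import Mystem

m = Mystem()
lemmas = m.lemmatize("Красивая мама красиво мыла раму")
print(lemmas)
# roughly: ['красивый', ' ', 'мама', ' ', 'красиво', ' ', 'мыть', ' ', 'рама', '\n']
print(''.join(lemmas).strip())
# "красивый мама красиво мыть рама"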
import csv
import datetime
from string import digits

import pandas as pd
from pymystem3 import Mystem


def freq_jarg_stats(scrap_file, freq_file, ngram_file, jargon_file, stats_file):
    freq_dict = load_dict()  # project helper, assumed to be defined elsewhere

    df = pd.read_csv(scrap_file, sep=',', header=0, index_col=None, usecols=['author'])
    # Get the number of unique authors
    authors_dict = {}
    for author in df['author']:
        if author not in authors_dict:
            authors_dict[author] = 1
    author_count = len(authors_dict)

    df = pd.read_csv(scrap_file, sep=',', header=0, index_col=None, usecols=['timestamp', 'text'])
    with open(jargon_file, 'w') as f_jarg:
        writer = csv.writer(f_jarg)
        writer.writerow(['timestamp', 'date', 'text', 'lemma', 'context'])

        lemmas_dict = {}
        ngrams_dict = [{} for i in ngram_file]
        mystem = Mystem()
        message_count = len(df['text'])
        lemma_count = 0
        ngrams_count = [0 for i in ngrams_dict]
        for timestamp, text in zip(df['timestamp'], df['text']):
            # Split the message text into sentences
            context_ind = 0
            str_text = str(text)
            # sentences = tsplit(str_text, ['.', '?', '!'])
            sentences = [str_text]
            for sentence in sentences:
                cur_ngrams = [[] for i in ngrams_count]
                sent_analysis = mystem.analyze(sentence)
                for a in sent_analysis:
                    # Skip tokens that mystem could not analyze morphologically
                    if ('analysis' not in a) \
                            and (a['text'][0] not in ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']):
                        continue
                    # Count lemmas
                    cur_ngrams[1].append(a['text'].lower())
                    if a['text'][0] in digits:
                        continue
                    lemma_count += 1
                    lemma = a['text']
                    if (len(a['analysis']) > 0) and (len(a['analysis'][0]) > 0) \
                            and ('lex' in a['analysis'][0]):
                        lemma = a['analysis'][0]['lex']
                    if lemma in lemmas_dict:
                        lemmas_dict[lemma] += 1
                    else:
                        lemmas_dict[lemma] = 1
                    # If the token is absent from the frequency dictionary,
                    # record it as a potential jargon word:
                    if (len(a['analysis']) == 0) or (a['analysis'][0]['lex'] in freq_dict):
                        continue
                    # Find the context in which the analyzed word form occurs
                    context_ind = text.find(a['text'], context_ind + 1)
                    context = str_text[max(0, context_ind - 85):min(len(text), context_ind + 95)]
                    # Write out all the data about the unusual word form
                    writer.writerow([
                        str(timestamp),
                        datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d'),
                        a['text'],
                        a['analysis'][0]['lex'],
                        context,
                    ])
                # Build the 2-, 3-, etc. -grams from the 1-grams
                for n in range(1, len(cur_ngrams)):
                    for v, w in zip(cur_ngrams[n - 1][:-1], cur_ngrams[1][n - 1:]):
                        cur_ngrams[n].append(v + ' ' + w)
                # Count the n-grams
                for n in range(1, len(ngrams_dict)):
                    for ngram in cur_ngrams[n]:
                        ngrams_count[n] += 1
                        if ngram in ngrams_dict[n]:
                            ngrams_dict[n][ngram] += 1
                        else:
                            ngrams_dict[n][ngram] = 1

    words_list = []
    for lemma, count in lemmas_dict.items():
        words_list.append((count, lemma))
    words_list.sort(reverse=True)

    with open(freq_file, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['lemma', 'pos', 'pos_dict', 'count', 'ipm', 'ipm_from_dict', 'ipm_ratio'])
        for w in words_list:
            ipm = w[0] * 1000000. / lemma_count
            ipm_from_dict = 0.
            ipm_ratio = 0.
            pos_dict = 'NA'
            if w[1] in freq_dict:
                ipm_from_dict = freq_dict[w[1]][0]
                pos_dict = freq_dict[w[1]][1]
            pos = mystem.analyze(w[1])
            if (len(pos) > 0) and ('analysis' in pos[0]) and (len(pos[0]['analysis']) > 0) \
                    and ('gr' in pos[0]['analysis'][0]):
                pos = pos[0]['analysis'][0]['gr']
            else:
                pos = 'NA'
            if ipm_from_dict > 0:
                ipm_ratio = ipm / ipm_from_dict
            writer.writerow([
                w[1], pos, pos_dict,
                str(w[0]),
                str(round(ipm, 3)),
                str(round(ipm_from_dict, 3)),
                str(round(ipm_ratio, 3)),
            ])

    # Min and max timestamp
    df = pd.read_csv(scrap_file, sep=',', header=0, index_col=None, usecols=['timestamp'])
    df = list(df['timestamp'])
    first = datetime.datetime.fromtimestamp(min(df)).strftime('%Y-%m-%d')
    last = datetime.datetime.fromtimestamp(max(df)).strftime('%Y-%m-%d')

    with open(stats_file, 'w') as f:
        f.write('authors,messages,lemmas')
        for n in range(1, len(ngram_file)):
            f.write(',' + str(n) + 'grams')
        f.write(',first,last\n')
        f.write(str(author_count) + ',' + str(message_count) + ',' + str(lemma_count))
        for n in range(1, len(ngram_file)):
            f.write(',' + str(ngrams_count[n]))
        f.write(',' + first + ',' + last + '\n')

    for n in range(1, len(ngram_file)):
        ngrams_list = []
        for ngram, count in ngrams_dict[n].items():
            ngrams_list.append((count, ngram))
        ngrams_list.sort(reverse=True)
        with open(ngram_file[n], 'w') as f:
            writer = csv.writer(f)
            writer.writerow([str(n) + 'gram', 'count', 'ipm']
                            + (['ipm_from_dict'] if n == 1 else []))
            for w in ngrams_list:
                ipm = w[0] * 1000000. / ngrams_count[n]
                if n == 1:
                    an = mystem.lemmatize(w[1])
                    ipm_from_dict = 0.
                    if an[0] in freq_dict:
                        ipm_from_dict = freq_dict[an[0]][0]
                if (n == 1) and (an[0][0] in digits):
                    continue
                writer.writerow([w[1], str(w[0]), str(round(ipm, 3))]
                                + ([str(ipm_from_dict)] if n == 1 else []))
def __init__(self):
    self.stemmer = Mystem()
    self.stop_words = set(nltk.corpus.stopwords.words('russian'))
def get_mystem():
    global m
    m = Mystem()
# coding=utf-8
"""
Stemming for Russian texts
"""
import argparse
import datetime
import logging
import os
import time

from pymystem3 import Mystem

# Creating a stemmer instance takes a lot of time and resources, so we create it once.
# If you use multithreading, create an instance in every thread and pass it to the stemming function.
main_stemmer = Mystem()


def parse_args(test_args=None):
    """
    Argument parser. ['-v', 'DEBUG'] for console testing

    Args:
        test_args(Optional[list]): Use for testing purposes. Defaults to None.
            Used instead of command line arguments

    Returns:
        argparse.Namespace: Command line args if test_args = None, parsed test_args otherwise
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '-L',
        choices=['FATAL', 'CRITICAL', 'ERROR', 'WARNING', 'INFO', 'DEBUG'],
        dest='logging_level',
import re

from pymystem3 import Mystem


def __mystemWrapper(text):
    """
    Wrapper for analyzing text with mystem.

    :param text: text to analyze
    :type text: str
    :return:
        - textAnalytics - analysis data for the whole text (words plus other symbols)
        - wordsAnalytics - analysis data for the words of the text
        - lemmas - lemmas of the words of the text
    :rtype: tuple
    """
    # symbols that break up blocks of text inside a sentence
    blockBreakers = [",", ":", ";", "(", ")", "\"", "-"]
    # symbols that break up sentences
    sentenceBreakers = [".", "!", "?"]
    # symbols that break up words inside a block
    wordsBreakers = [" ", "-"]

    m = Mystem()
    rawAnalytics = m.analyze(text)

    # analytics for the whole text, including unrecognized elements
    textAnalytics = list()
    # analytics for words only
    wordsAnalytics = list()
    # word lemmas
    lemmas = list()

    # walk over the mystem output
    for rawAnalytic in rawAnalytics:
        # if the analysis carries data, dig into it
        if 'analysis' in rawAnalytic and len(rawAnalytic['analysis']) > 0:
            # first determine the part of speech
            POS = rawAnalytic['analysis'][0]['gr'].split(',')[0]
            if '=' in POS:
                POS = POS.split('=')[0]
            # then the obscenity and personal-name flags
            isObscene = "обсц" in rawAnalytic['analysis'][0]['gr']
            isPersonal = "имя" in rawAnalytic['analysis'][0]['gr']
            # then the lemma
            lemma = rawAnalytic['analysis'][0]['lex']
            # build the resulting dictionary
            analytic = {
                'POS': POS,
                'text': lemma,
                'rawText': rawAnalytic['text'],
                'isObscene': isObscene,
                'isPersonal': isPersonal
            }
            # for verbs, extend it with additional features
            if POS == 'V':
                isImperative = "пов" in rawAnalytic['analysis'][0]['gr']
                isIndicative = "изъяв" in rawAnalytic['analysis'][0]['gr']
                isGerund = "деепр" in rawAnalytic['analysis'][0]['gr']
                isParticiple = "прич" in rawAnalytic['analysis'][0]['gr']
                isInfinitive = "инф" in rawAnalytic['analysis'][0]['gr']
                analytic['verbsCharacteristics'] = {
                    'isImperative': isImperative,
                    'isIndicative': isIndicative,
                    'isGerund': isGerund,
                    'isParticiple': isParticiple,
                    'isInfinitive': isInfinitive
                }
            wordsAnalytics.append(analytic)
            lemmas.append(lemma)
            analytic['type'] = 'word'
            textAnalytics.append(analytic)
        else:
            # for text mystem did not recognize as a word, determine its type
            char = rawAnalytic['text']
            charTrimmed = char.strip()
            charType = "unrecognized"
            if char in wordsBreakers:
                charType = "space"
                charTrimmed = char
            elif charTrimmed in blockBreakers:
                charType = "blockBreaker"
            elif charTrimmed in sentenceBreakers:
                charType = "sentenceBreaker"
            elif re.match(r"^[a-zA-Z]+$", char):
                charType = "enText"
                # mystem does not process English words, so record them as lemmas manually
                lemmas.append(char)
            elif re.match(r"^[а-яА-ЯёЁ]+$", char):
                charType = "ruText"
                # unrecognized Russian words
                lemmas.append(char)
            elif re.match(r"^[0-9]+$", char):
                charType = "number"
                lemmas.append(char)
            analytic = {'type': charType, 'rawText': char, 'text': charTrimmed}
            textAnalytics.append(analytic)
    return textAnalytics, wordsAnalytics, lemmas
def Lemmatize(query):
    m = Mystem()
    lemmas = m.lemmatize(query)
    return ''.join(lemmas)
import json
import math
import re

import numpy as np
from pymystem3 import Mystem

from search_course.hw5 import do_ranking, write_to_file

LA = np.linalg
ARTICLES = json.load(open('result2.json'))
DOC_COLLECTION_SIZE = 10
MYSTEM_INSTANCE = Mystem()


def _mystem(text):
    return re.sub(r'[^\w ]+', '', ''.join(MYSTEM_INSTANCE.lemmatize(text)).strip())


def _put_to_dict(words_dict, word, counter):
    if word not in words_dict:
        words_dict[word] = {}
    counter_dict = words_dict[word]
    counter_dict[counter] = counter_dict.get(counter, 0) + 1


def _idf(q):
    idx = json.load(open('index_mystem.json'))
    term_idfs = {}
def __init__(self, mapping):
    self.m = Mystem()
    self.mapping = mapping
def set_mystem():
    global mystem
    mystem = Mystem()
"""
BiRCh Morphology Module
[email protected]
Input: list of tokens
Output: morphological analysis of input tokens
References: ...
"""
from pickle import load
import re

from pymystem3 import Mystem

from tokenization import *

# exclude non-word tokens (e.g. {'text': ' '} or {'text': '\n'}) from mystem's result list
m = Mystem(entire_input=False)

# info of dict_of_dims
#   key: first two letters of a dims word
#   value: set of dims words
# with open('dict_of_dims.pkl', 'rb') as f:
with open('dict_of_dims_lower.pkl', 'rb') as f:  # handle upper/lower cases
    dod = load(f)
for k in dod:
    dod[k] = set(re.sub(r'ё', r'е', w) for w in dod[k])

# sep_mystem = re.compile(r'[,=|]')
# def analyze_mystem_gr(gr_value):  # type(gr_value): str
#     """ -> tuple of (pos, features) """
#     temp = sep_mystem.search(gr_value)
#     if temp:
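# A small sketch of what entire_input=False changes, per the comment above
# (outputs are approximate and depend on the mystem version):
from pymystem3 import Mystem

print(Mystem(entire_input=False).lemmatize("мама мыла раму!"))
# roughly: ['мама', 'мыть', 'рама'] - non-word tokens dropped
print(Mystem(entire_input=True).lemmatize("мама мыла раму!"))
# roughly: ['мама', ' ', 'мыть', ' ', 'рама', '!', '\n'] - whole input preserved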
              'рассказать про Ваш опыт и обстоятельства',
              'дать совет. Что нужно/необходимо/важно/обязательно сделать в данном случае?',
              'если это уместно, рассказать про долг и правила. Что правильно сделать в данном случае?']
HREF_WORDS = ['href', 'nofollow', 'rel', '_blank', 'target', 'https', 'www', 'http', 'com',
              'youtube', 'youtu', 'be']
IMG_WORDS = ['img', 'src', 'imgsmail', 'data', 'gif', 'big', 'jpg', 'download', 'otvet']
VIDEO_WORDS = ['amp', 'video']
ALL_STOPWORDS = stopwords.words('russian') + [
    'это', 'наш', 'тыс', 'млн', 'млрд', 'также', 'т', 'д', 'который', 'прошлый',
    'сей', 'свой', 'наш', 'мочь', 'такой', 'очень'
]
NUM_ADVICES_FOR_NORM = 5
RU_WORDS = re.compile("[А-Яа-я]+")
WORDS = re.compile(r"(\w+)")

M = Mystem()
MODEL = utils.get_pickle_file(config.path_to_model)
TFIDF = utils.get_pickle_file(config.path_to_tfidf)


def only_words(text):
    """Remove extraneous symbols; only words remain in the text."""
    return " ".join(WORDS.findall(text))


def lemmatize(text):
    """Lemmatize the text
from argparse import ArgumentParser

import pandas as pd
from gensim.models import FastText
from pymystem3 import Mystem
from tqdm import tqdm

morph = Mystem()

if __name__ == '__main__':
    parser = ArgumentParser(description='Enrich model with UD contexts')
    parser.add_argument('model', help='FastText model directory')
    parser.add_argument('vectors', help='where to save vectors')
    args = parser.parse_args()

    comp = pd.read_csv('compounds_select_1000.csv')
    chast1 = list(comp['Часть 1'].values)
    chast2 = list(comp['Часть 2'].values)

    print('loading model')
    model = FastText.load(args.model)
    print('loaded')
    print(len(model.wv.vocab))

    part1_lem = []
    part2_lem = []
    print('Starting lemmatize')
    for w1, w2 in zip(chast1, chast2):
        lem_w1 = morph.lemmatize(w1)[0]
import xml.etree.ElementTree as Elem

from pymystem3 import Mystem

from utils import get_words

tree = Elem.parse('issue.xml')
root = tree.getroot()
articles = root.find('articles').findall('article')
result_docs = set(range(len(articles)))
substr_list = set()
porter_word_map = get_words('index_mystem.xml')

query = input("add query:")
query = ''.join(Mystem().lemmatize(query)).strip()
for word in query.split():
    if word[0] == '-':
        # negated term: collect its documents for exclusion
        substr_list.update(set(porter_word_map.get(word[1:], list())))
    elif porter_word_map.get(word, None) is not None:
        result_docs.intersection_update(set(porter_word_map.get(word)))
    else:
        result_docs = set()
        break
result_docs.difference_update(substr_list)
print(result_docs)
def __init__(self):
    self._mystem = Mystem(entire_input=True)
    self._stopwords = self._read_stopwords()
    self._vect_word = None
    self._vect_char = None
    self._model = None
# coding: utf-8
import csv
import os
import pickle

import regex as re
from bs4 import BeautifulSoup
from nltk import word_tokenize
from pymystem3 import Mystem

MULTINUCLEAR_RELATIONS = [
    'comparison', 'contrast', 'joint', 'restatement', 'same-unit', 'sequence'
]

analyzer = Mystem()

# map mystem part-of-speech tags to UD-style tags
pos_map = {
    'A': 'ADJ',
    'ADV': 'ADV',
    'ADVPRO': 'PRON',
    'ANUM': 'ADJ',
    'APRO': 'PRON',
    'COM': 'NOUN',
    'CONJ': 'CCONJ',
    'INTJ': 'PART',
    'NUM': 'NUM',
    'PART': 'PART',
    'PR': 'ADP',
    'S': 'NOUN',
    'SPRO': 'PRON',
    'V': 'VERB'
}
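# A small sketch of how such a map is typically applied to a mystem 'gr' string
# (the sample grammar string below is illustrative, not from the original code):
gr = 'S,жен,од=им,мн'  # hypothetical mystem grammar string for a noun
mystem_pos = gr.split(',')[0].split('=')[0]  # -> 'S'
print(pos_map.get(mystem_pos, 'X'))  # -> 'NOUN'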
words = re.split(r'\W+', text)
words = [w.lower() for w in words if w.isalpha()]
stop_words = set(stopwords.words('russian'))
words = [w for w in words if w not in stop_words]

stemmer_eng = SnowballStemmer('english')
stemmer_rus = SnowballStemmer('russian')
stemmed = [stemmer_rus.stem(stemmer_eng.stem(word)) for word in words]
for w in stemmed:
    words_porter.append((str(uuid.uuid4()), w, item[0]))

# The mystem binary itself is gitignored because of its size; the wrapper can
# download MyStem on its own if no binary path is given.
stemmer_ya = Mystem(mystem_bin='./mystem')
lemmas = stemmer_ya.lemmatize(' '.join(words))
lemmas = [l for l in lemmas if len(l.strip()) > 0]
for w in lemmas:
    words_mystem.append((str(uuid.uuid4()), w, item[0]))

conn.commit()
for w in words_porter:
    cur.execute("INSERT INTO words_porter VALUES(%s, %s, %s)", w)
for w in words_mystem:
    cur.execute("INSERT INTO words_mystem VALUES(%s, %s, %s)", w)
conn.commit()
import codecs
import collections
import re

from pymystem3 import Mystem

fd = codecs.open("heptral.txt", 'r', 'utf-16')
sqlFile = str(fd.read())
# print(sqlFile)

RU = Mystem()
lemmas = RU.lemmatize(sqlFile)
analysis = RU.analyze(sqlFile)
type_pattern = re.compile("^([A-Z]+)*")
# print(lemmas)
# ROS_dict = {i.get('analysis')[0].get('lex'): \
#             type_pattern.findall(i.get('analysis')[0].get('gr'))[0] \
#             for i in analysis if 'analysis' in i.keys()}

wordcount = {}
for word in lemmas:
    if len(word) > 5:
        if word not in wordcount:
            wordcount[word] = 1
        else:
            wordcount[word] += 1

n_print = int(input("How many most common words to print: "))
print("\nOK. The {} most common words are as follows\n".format(n_print))
import json
import re
from urllib.request import urlopen
from urllib.parse import quote, urlencode

from flatten_dict import flatten
from pymystem3 import Mystem

m = Mystem()

url_base = 'https://ru.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=json&&titles='
# url = 'https://ru.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=json&&titles=Рассказы_братьев_Стругацких'
url_title = 'Наполеон I'

# jsonurl = urlopen(url)
# file = json.loads(jsonurl.read())
# article = str(file)
# # for k, v in flatten(file).items():
# #     print('key:', k, '\n', v, type(v))
# print(article.replace(r'\n', '\n'))

print(quote(url_title.encode('utf8')))

# print(flatten(file))
# article.replace('\\xa0', ' ')
# with open('https://ru.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=json&&titles=%D0%A0%D0%B0%D1%81%D1%81%D0%BA%D0%B0%D0%B7%D1%8B_%D0%B1%D1%80%D0%B0%D1%82%D1%8C%D0%B5%D0%B2_%D0%A1%D1%82%D1%80%D1%83%D0%B3%D0%B0%D1%86%D0%BA%D0%B8%D1%85', encoding=('unicode-escape')) as f:
#     article = f.read()
# # print(article.decode('latin1'))
# print(article)
# title_regex = re.compile(r'\"title\".*?\"(?P<title>.*?)\"')
# match = title_regex.search(article)
# title = match.group('title')
import json
from xml.etree import ElementTree

from pyasn1.compat.octets import null
from pymystem3 import Mystem

# Options from Sergey Gladilin: mystem_ruscorpora --format=json -i -c -g --eng-gr
# -w is also suggested, if we handle unknown words well enough.
# For the full option set, take pymystem from git:
#   pip install git+https://github.com/nlpub/pymystem3 (see https://github.com/nlpub/pymystem3).
m = Mystem(mystem_bin='./mystem_ruscorpora.linux',
           grammar_info=True,
           disambiguation=False,
           entire_input=True,
           glue_grammar_info=True,
           use_english_names=True)

# Mystem variant that gives perfective verbs an imperfective-aspect lemma
# masp = Mystem(grammar_info=True, disambiguation=False, entire_input=True,
#               glue_grammar_info=True, use_english_names=True)


class MystemFixLists(object):
    def __init__(self):
        self.add_fix = self.load_add_cfg()
        self.del_fix = self.load_del_cfg()

    def load_add_cfg(self):
        result = {}
        with open('add.cfg', 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                if line[0] == '+':