def post(self, request):
    morph = get_morph(join(settings.PROJECT_DIR, 'morph'))
    last_name = self.DATA['last_name']
    first_name = self.DATA['first_name']
    patronymic = self.DATA['patronymic']
    # to inflect the last name we need to determine the gender
    try:
        sex = morph.get_graminfo(first_name.upper())[0]['info'].split(',', 1)[0]
    except IndexError:  # get_graminfo() returned []
        # print 'get_graminfo failed on ', first_name
        sex = u'жр'
    # last name
    last_name_inflated = firstcaps(lastnames_ru.inflect(morph, last_name.upper(), sex + u',дт'))
    # first name
    first_name_inflated = firstcaps(morph.inflect_ru(first_name.upper(), u'дт'))
    # patronymic
    patronymic_inflated = firstcaps(morph.inflect_ru(patronymic.upper(), sex + u',дт'))
    return {
        'last_name': last_name_inflated,
        'first_name': first_name_inflated,
        'patronymic': patronymic_inflated,
        'user': self.get_user(),
    }
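The gender check above relies on get_graminfo() returning a list of analyses whose 'info' field is a comma-separated grammeme string; a minimal sketch of that assumption (the input word is hypothetical, and the exact grammemes depend on the loaded dictionaries):

# Sketch: take the first grammeme of the first analysis as the gender.
info = morph.get_graminfo(u'ИВАН')  # hypothetical first name
if info:
    sex = info[0]['info'].split(',', 1)[0]  # e.g. u'мр' or u'жр'
else:
    sex = u'жр'  # fallback, as in the view above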
def formalize(filename, morph_dict=DICTS_DIR, stop_dict=stop_dict_path):
    morph = get_morph(morph_dict)
    stop_words = get_stop_words()
    freqs = {}  # renamed from `dict` to avoid shadowing the builtin
    words = 0.0
    try:
        f = open(filename)
    except IOError:
        raise
    for line in f.readlines():
        for word in line.split():
            word = word.decode("utf-8", 'ignore')
            word = word.strip(u'[,.:;\"\')$«»(?<>!-_—//=]\n\t')
            word = word.replace('.', '_')
            word = morph.normalize(word.upper())
            if isinstance(word, set):
                word = word.pop()
            else:
                continue
            word = word.lower()
            words += 1
            if word in stop_words or not word:
                continue
            if word not in freqs:
                freqs[word] = 1.0
            else:
                freqs[word] += 1.0
    for key in freqs:
        freqs[key] /= words
    return freqs
def __init__(self, corpus=None):
    """Initialize your data structures in the constructor."""
    self.unigramCounts = collections.defaultdict(lambda: 0)
    self.bigramCounts = collections.defaultdict(lambda: 0)
    self.V = 0
    self.morph = get_morph(DICTS_DIR)
    if corpus:
        self.train(corpus)
def __init__(self, page, working_list, mutex, dbname='crawler_db'):
    Db_manager.__init__(self, dbname)
    threading.Thread.__init__(self)
    self.working_list = working_list
    self.page = page
    self.mutex = mutex
    self.morph = get_morph('dicts')
    with self.mutex:
        working_list.append(page)
def download_morph():
    # download and use a dictionary that provides grammatical information
    # about a word (part of speech)
    path_to_dictionary = os.path.realpath(os.path.curdir)
    morph_path = join(path_to_dictionary, 'morph_dicts')
    if not os.path.exists(morph_path):
        subprocess.call(['wget', 'https://bitbucket.org/kmike/pymorphy/downloads/ru.sqlite-json.zip'])
        subprocess.call(['unzip', 'ru.sqlite-json.zip', '-d', 'morph_dicts'])
    morph = get_morph(morph_path)
    return morph
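A minimal usage sketch for the helper above, assuming the pymorphy 0.5.x API used throughout these examples (normalize() may return a set of lemmas):

# Hypothetical usage of download_morph(); output depends on the dictionaries.
morph = download_morph()
lemmas = morph.normalize(u'ЛЮДЕЙ')
print lemmas.pop() if isinstance(lemmas, set) else lemmas  # expected: ЧЕЛОВЕК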
def get(self, request, *args, **kwargs):
    params = request.GET
    COUNT_ELEMENTS = 5
    errors = []
    limit = COUNT_ELEMENTS
    offset = 0
    form = forms.SearchForm(params)
    if form.is_valid():
        # needed by the 'new'/'popular' branches below (was commented out)
        pointsreq = MainModels.Person.objects
        name = form.cleaned_data.get("s")
        users_list = []
        morph = get_morph('/home/tenoclock/yasenput/dicts')
        if name:
            #pointsreq = MainModels.Person.search.query(params.get("s"))
            #search = SphinxSearch()
            search = SphinxQuerySet(index="auth_user")
            name_morph = morph.normalize(name.upper())
            file1 = open('file1.txt', 'w')
            file1.write(str(list(name_morph)))
            file1.close()
            phrase_list = name.split(' ')
            for phrase in phrase_list:
                if phrase != '':
                    name_morph = morph.normalize(phrase.upper())
                    for name_m in name_morph:
                        search_query = search.query(name_m)
                        for splited_item in search_query:
                            person = MainModels.Person.objects.get(id=splited_item['id'])
                            if person not in users_list:
                                users_list.append(person)
        content = form.cleaned_data.get("content")
        if content == 'new':
            pointsreq = pointsreq.order_by('-id')
        elif content == "popular":
            pointsreq = pointsreq.annotate(usfiliwers=Count('followers__id')).order_by('-usfiliwers', '-id')
        else:
            pointsreq = users_list
        points = users_list[offset:limit]
        YpJson = YpSerialiser()
        return HttpResponse(YpJson.serialize(points, fields=("username", "first_name", "last_name")),
                            mimetype="application/json")
    else:
        e = form.errors
        for er in e:
            errors.append(er + ':' + e[er][0])
        return JsonHTTPResponse({"status": 0, "txt": ", ".join(errors)})
def __init__(self):
    super(NumWordRU, self).__init__()
    # initializing morphology module for inflecting
    from pymorphy import get_morph
    import ConfigParser
    config = ConfigParser.RawConfigParser()
    config.read('/home/soshial/text-normalization/normalization.cfg')
    dicts_folder = config.get('lms', 'dicts')
    import os
    if not os.path.exists(dicts_folder):
        quit('Please put existing dictionaries into "' + dicts_folder + '" folder!')
    self.morph = get_morph(dicts_folder)
    self.inflection_case = u"им"  # todo: add gender for the ending of numeral ('жр')
def __init__(self):
    super(NumWordRU, self).__init__()
    # initializing morphology module for inflecting
    from pymorphy import get_morph
    import ConfigParser
    config = ConfigParser.RawConfigParser()
    config.read('normalization.cfg')
    dicts_folder = config.get('lms', 'dicts')
    import os
    if not os.path.exists(dicts_folder):
        quit('Please put existing dictionaries into "' + dicts_folder + '" folder!')
    self.morph = get_morph(dicts_folder)
    self.inflection_case = u"им"  # todo: add gender for the ending of numeral ('жр')
def misc_utilites():
    morpher = get_morph('static/res/pymorphy/')

    def pluralize(number, word):
        return morpher.pluralize_inflected_ru(word.upper(), number).lower()

    def is_logged_in():
        try:
            int(session['logged'])
            return True
        except (ValueError, KeyError):
            return False

    return {'pluralize': pluralize, 'is_logged_in': is_logged_in}
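pluralize_inflected_ru() picks the word form that agrees with a number, which is all the pluralize helper above wraps; a standalone sketch (the dictionary path is an assumption):

# Hypothetical standalone check; results depend on the dictionaries.
from pymorphy import get_morph
morph = get_morph('dicts/ru')  # assumed dictionary location
print morph.pluralize_inflected_ru(u'ЧАС', 1).lower()  # expected: час
print morph.pluralize_inflected_ru(u'ЧАС', 5).lower()  # expected: часов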
def main():
    (options, args) = parser.parse_args()
    if not options.word or not options.dict:
        print 'inflect -h for help.'
        return
    morph = get_morph(options.dict)
    word = options.word.decode(chardet.detect(options.word)['encoding']).upper()
    word = unicode(word)
    a = morph.inflect_ru(word, u'пр', u'С')
    print a.encode('utf8')
def process(self, interval):
    jk_re = re.compile(r"%s" % u'(.*?)[\w_\-]+[\s]{1,3}\( (.*?) \)(.*?)Ссылка$')
    cit_re = re.compile(r"%s" % u">>-----Цитата---->>(.*?)<<-----Цитата----<<", re.DOTALL)
    rus = u"йцукенгшщзхъёфывапролджэячсмитьбю."
    morph = get_morph('dicts')
    finished = True
    for m in MessageStore.objects.filter(is_processed=False).order_by("-id")[:interval]:
        finished = False
        user = m.ms.user
        #print user
        data = strip_tags(m.text).strip()
        data = re.sub(cit_re, "", data)
        jk = jk_re.findall(data)
        try:
            line = jk[0][0]
        except IndexError:
            print "ERROR: [%s]" % data
            line = data.replace(u">>> Подробности", "")[:-33]
        tokens = nltk.word_tokenize(line.lower())
        #text = nltk.word_tokenize(line.lower())
        #print nltk.pos_tag(text)
        m.is_processed = True
        m.save(update_fields=['is_processed'])
        for t in tokens:
            if len(t) > 35 or len(t) < 4:
                continue
            if t in string.punctuation or t in string.letters or t in rus:
                print "%s skipped" % t
            else:
                tok = morph.normalize(t.upper())
                if isinstance(tok, unicode):
                    word = tok.lower()
                else:
                    word = list(tok)[0].lower()
                #print word
                w, c = Word.objects.get_or_create(name=word)
                if not c:
                    w.count += 1
                    w.save(update_fields=["count"])
                wu, c = UserWord.objects.get_or_create(word=w, user=user)
                if not c:
                    wu.count += 1
                    wu.save(update_fields=["count"])
    return finished
def handle(self, *args, **options):
    morph = get_morph(join(settings.PROJECT_DIR, 'morph'))
    self.dialog = Dialog()
    listeners = Listener.objects.filter(first_name__exact=u'')
    total = listeners.count()
    index = 0
    self.dialog.gauge_start()
    for listener in listeners:
        listener.normalize_name(morph)
        text = u'Склонение: %s %s %s' % (listener.last_name, listener.first_name, listener.patronymic)
        self.dialog.gauge_update(int(float(index) / total * 100),
                                 text=text.encode('utf-8'), update_text=True)
        index += 1
    self.dialog.gauge_stop()
def initializeResources(self):
    """Pre-initialization"""
    self.animationTimer = ()
    self.progressTimer = ()
    self.grid_layout = ()
    """Initialize Options"""
    self.options = Options()
    """Initialize Statistics"""
    self.stats = Stats()
    """Config Here"""
    self.initializeComposition()
    self.initializeComponents()
    self.setMenus()
    self.trayIcon.show()
    #self.startTrayLoading()
    """Initialize Dictionaries (will take some time!)"""
    time_start = datetime.now()
    self.dict = EdictParser()
    self.dict.loadDict()
    self.morphy = get_morph(PATH_TO_RES + DICT_EN)
    self.trayIcon.showMessage('Loading...', 'Initializing dictionaries',
                              QSystemTrayIcon.MessageIcon.Information, 20000)
    #TODO: change into loading dialog... or not
    """Initializing srs system"""
    self.trayIcon.showMessage('Loading...', 'Initializing databases',
                              QSystemTrayIcon.MessageIcon.Information, 20000)
    self.srs = srsScheduler()
    self.srs.initializeAll()
    self.srs.initializeCurrentSession(self.options.getSessionSize())
    """Global hotkeys hook"""
    #TODO: add multiple hotkeys and fix stop()
    #self.hooker = GlobalHotkeyManager(toggleQDictFlag, 'Q')
    #self.hooker = GlobalHotkeyManager(toggleWidgetFlag(self.qdict), 'Q')
    #self.hooker.setDaemon(True)  # temporarily, should work using stop()
    #self.hooker.start()
    time_end = datetime.now()
    self.loadingTime = time_end - time_start
def create_triads(path_item, path_rel, path_attr):
    dicts = "c:\\Python27\\Lib\\site-packages\\pymorphy\\ru.sqlite-json\\"
    morph = get_morph(dicts)
    # read items
    with open(path_item) as f:
        items = f.readlines()
    # read relations
    with open(path_rel) as f:
        relations = f.readlines()
    # read attributes
    with open(path_attr) as f:
        attributes = f.readlines()
    # split attributes according to different parts of speech
    attrsN, attrsV, attrsAdj, attrsIs = [], [], [], []
    for at in attributes:
        if 'N' in at:
            attrsN.append(re.split(',', at)[0].decode('cp1251').lower())
        if 'V' in at:
            attrsV.append(re.split(',', at)[0].decode('cp1251').lower())
        if 'Adj' in at:
            attrsAdj.append(re.split(',', at)[0].decode('cp1251').lower())
        if 'Is' in at:
            attrsIs.append(re.split(',', at)[0].decode('cp1251').lower())
    # assemble triads
    triads = []
    for it in items:
        it = it.replace('\n', '').decode('cp1251')
        for rel in relations:
            rel = rel.replace('\n', '').decode('cp1251')
            if rel == u'может':
                for attr in attrsV:
                    triads.append([it, rel, attr])
            if rel == u'имеет':
                for attr in attrsN:
                    triads.append([it, rel, attr])
            if rel == u'является':
                for attr in attrsIs:
                    triads.append([it, rel, attr])
            if u'как' in rel:
                for attr in attrsAdj:
                    triads.append([it, '', attr])
    # test
    for triad in triads:
        print triad[0] + ', ' + triad[1] + ', ' + triad[2]
    return triads
def get_words(file_name, index):
    morph = get_morph('')
    print "Getting words from " + file_name + "..."
    words = []
    pattern = re.compile("(([\w]+[-'])*[\w']+'?)", re.U)
    # try:
    f = open(file_name, 'r')
    file_text = f.read()
    f.close()
    file_text = unicode(file_text, 'utf8').upper()
    file_text = file_text.replace('--', ' -- ')
    tokens = file_text.split()
    previous_percentage = -1
    for idx, token in enumerate(tokens):
        m = pattern.match(token)
        if m:
            word = m.group()
            info = morph.get_graminfo(word)
            if len(info) < 2:
                continue
            if not info[0]['class'] in [u"П", u"С", u"Г"]:
                continue
            norm = info[0]['norm']
            words.append(norm)
            if norm in index:
                index[norm] += 1
            else:
                index[norm] = 1
            percentage = 100 * idx / len(tokens)
            if percentage != previous_percentage and percentage % 5 == 0:
                print "Getting words: " + str(percentage) + "% done"
                previous_percentage = percentage
    # except:
    #     print "error occured"
    return words
# coding: utf-8
import os
import tempfile

import pymorphy

from unittest import TestCase

from textgen.words import Noun, Adjective, Verb, NounGroup, Fake, Participle, ShortParticiple, Pronoun
from textgen.templates import Args, Template, Dictionary, Vocabulary
from textgen.conf import APP_DIR, textgen_settings
from textgen.logic import import_texts
from textgen.exceptions import NormalFormNeeded

morph = pymorphy.get_morph(textgen_settings.PYMORPHY_DICTS_DIRECTORY)


class NounTest(TestCase):

    def test_create_from_baseword(self):
        noun = Noun.create_from_baseword(morph, u'обезьянка')
        self.assertEqual(noun.normalized, u'обезьянка')
        self.assertEqual(noun.properties, (u'жр', ))
        self.assertEqual(noun.forms,
                         (u'обезьянка', u'обезьянки', u'обезьянке', u'обезьянку', u'обезьянкой', u'обезьянке',
                          u'обезьянки', u'обезьянок', u'обезьянкам', u'обезьянок', u'обезьянками', u'обезьянках'))

    def test_pluralize(self):
        noun = Noun.create_from_baseword(morph, u'монета')
        self.assertEqual(noun.normalized, u'монета')
nargs="?", type=argparse.FileType('w'), default='past.sents', help="output sentences") parser.add_argument('-c', dest='c', nargs="?", type=argparse.FileType('w'), default='past.info', help="output information") args = parser.parse_args() sent_id = 0 morph = get_morph( '/people/burlot/prog/wmt17/analysis/pydict/kmike-pymorphy-3d1a3f962d0e/dicts/converted/en' ) en_dict = pickle.load( open("/vol/work1/burlot/wmt17/analysis/news/words_en.pkl", 'r')) for sent, tags in zip(args.i, args.t): sent = sent.split() tags = tags.split() sent_init = list(sent) # no interrogative if sent[-1] == '?': continue in_prog = False for i in range(len(sent)): # skip complicated cases
# -*- coding: utf-8 -*-
import random, re
import logging

from pymorphy import get_morph
morph = get_morph('dict')


def rev(x):
    revmap = {
        u"Я": u"ТЫ",
        u"МЕНЯ": u"ТЕБЯ",
        u"МНЕ": u"ТЕБЕ",
        u"МНОЙ": u"ТОБОЙ",
        u"МЫ": u"ВЫ",
        u"НАС": u"ВАС",
        u"НАМ": u"ВАМ",
        u"НАМИ": u"ВАМИ",
    }
    for k, v in revmap.items():
        revmap[v] = k
    xstr = x.replace(',', '').replace('.', '')
    if xstr in revmap:
        return x.replace(xstr, revmap[xstr]).lower()
    global morph
    info = morph.get_graminfo(x)
    if len(info):
# coding=utf-8
from pymorphy import get_morph

morph = get_morph('/home/ilya/github/ru.sqlite-json')  # dict path

ins = open("adjective_opinion_words.txt", "r")
array = []
for line in ins:
    ind = line.find(' ')  # find() returns -1 when missing; index() would raise
    if ind != -1:
        line = line[0:ind]
    array.append(line)
ins.close()

file = open("pyDict", "w")
for i in range(len(array)):
    word = array[i]
    word = word.decode("utf-8").upper()
    info1 = morph.inflect_ru(unicode(word), u'мр')
    info2 = morph.inflect_ru(unicode(word), u'жр')
    info3 = morph.inflect_ru(unicode(word), u'ср')
    res = word.lower().encode("utf-8") + " " + info1.lower().encode("utf-8") + " " + \
          info2.lower().encode("utf-8") + " " + info3.lower().encode("utf-8")
    # print res
    file.write(res + "\n")
def get_data_and_statistic(file_path, morph, rd):
    for adj_noun_list in data_gathering_iterator(file_path, morph):
        for adj_noun in adj_noun_list:
            adj_noun_str = u" ".join(adj_noun)
            rd.incr(adj_noun_str, 1)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file", dest="file_path", type=str, required=True,
                        help="File to read the text from")
    parser.add_argument("-m", "--morph_dir", dest="morph_dir", type=str, required=True,
                        help="Directory with the pymorphy dictionaries")
    parser.add_argument("-s", "--host", dest="host", default="localhost", type=str,
                        help="Redis host. Defaults to 'localhost'")
    parser.add_argument("-p", "--port", dest="port", default=6379, type=int,
                        help="Redis port. Defaults to 6379")
    parser.add_argument("-d", "--db", dest="db", default=0, type=int,
                        help="Redis database number. Defaults to 0")
    args = parser.parse_args()

    morph = get_morph(args.morph_dir)
    rd = Redis(host=args.host, port=args.port, db=args.db)
    rd.flushdb()
    get_data_and_statistic(args.file_path, morph, rd)
    rd.save()
def __init__(self, pathPY, pathOPC):
    self.morph = pm.get_morph(pathPY)
    self.opcorpdict = oci.OpCorpDict(pathOPC)
# -*- coding: utf-8 -*-
from pytagcloud import create_tag_image, create_html_data, make_tags, \
    LAYOUT_HORIZONTAL, LAYOUTS, LAYOUT_MIX, LAYOUT_MOST_HORIZONTAL
from pytagcloud.colors import COLOR_SCHEMES
from pytagcloud.lang.counter import get_tag_counts
import os, time, sys

from pymorphy import get_morph

morph = get_morph('dicts/ru', 'sqlite')

COLOR_MAP = ((232, 43, 30), (200, 188, 107), (85, 122, 102), (69, 54, 37), (160, 182, 136))


def update_text(t):
    info = morph.get_graminfo(t.upper())
    if len(info) > 0:
        return info[0]['norm'].lower()
    return t


def process_tags(taglist):
    allt = {}
    for t, v in taglist:
        w = update_text(t)
        if allt.has_key(w):
            allt[w] += v
        else:
            allt[w] = v
    d = sorted(allt.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)
    return d
# -*- coding: utf-8 -*-
import nltk, re, pymorphy, csv
from nltk.tokenize import RegexpTokenizer

# load the morphology module
from pymorphy import get_morph
morf_storage = 'morf_storage/ru.shelve'
morph = get_morph(morf_storage, backend='shelve')


def declause(sentence):
    # check whether the sentence is a content clause (i.e. contains the conjunction "что")
    claused_sent = {'type': 'nonclaused', 'clauses': sentence}
    if re.search(u',и?\s*что\s+', sentence):
        regexp_pattern = u',и?\s*что\s+'
        # create a tokenizer with the given regexp
        clauses = RegexpTokenizer(regexp_pattern, gaps=True).tokenize(sentence)
        claused_sent = {'type': 'subord', 'clauses': clauses}
    elif re.search(u",\s*(и)|(но)|(а)|(или)\s+", sentence):
        # split the sentence into non-content clauses at coordinating conjunction boundaries
        regexp_pattern = u',\s*((и)|(но)|(а)|(или)|(потому что))\s+'
        # for each non-content <coord> clause
        clauses = RegexpTokenizer(regexp_pattern, gaps=True).tokenize(sentence)
        claused_sent = {'type': 'coord', 'clauses': clauses}
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Desc: Telegram bot. Python 2.7
# Author: Dimitry Lukin, [email protected]
# Version: 2019041900

from telegram.ext import Updater, MessageHandler, Filters
import re
import pycurl
import StringIO
from pymorphy import get_morph

morph = get_morph('/var/lib/pymorphy')
SIGNIFICANTWORDS = 4
SREF = 'https://sampowiki.club/doku.php?do=search&id=start&sf=1&q='
STOPLIST = [
    u'сампо', u'какой', u'сосед', u"привет", u"знать", u"подсказать",
    u"здравствовать", u"делать", u"я", u"он", u"есть", u"здесь", u'она',
    u'куда', u'ехать', u'сейчас'
]


def xmlmethod(mtype, mstring):
    if mtype == 'search':
        request = '<?xml version="1.0"?><methodCall><methodName>dokuwiki.search</methodName><params><param><value><string>' + mstring + '</string></value></param></params></methodCall>'
    elif mtype == 'pageList':
        request = '<?xml version="1.0"?><methodCall><methodName>dokuwiki.getPagelist</methodName><params><param><value><string></string></value></param></params></methodCall>'
# coding=utf-8
__author__ = 'artemii'

from pymorphy import get_morph
from pymorphy.contrib import tokenizers

f = open('negative_words.txt', 'r')
resultFile = open('negative_words_normalized.txt', 'a')
morph = get_morph('.')

#normalized = morph.normalize('тнрнюооюпюрю'.decode("utf-8"))
#print normalized.pop().lower().encode("utf-8")

for line in f:
    # word = raw_input()
    words = tokenizers.extract_words(line.decode("utf-8"))
    word = words.next()
    normalized = morph.normalize(word.upper())
    resultFile.write(normalized.pop().lower().encode("utf-8") + '\n')
    # print normalized.pop().lower()

# for word pairs
#for line in f:
#    words = tokenizers.extract_words(line.decode("utf-8"))
#    normalized_fst = morph.normalize(words.next().upper())
#    normalized_snd = morph.normalize(words.next().upper())
#    resultFile.write(normalized_fst.pop().lower().encode("utf-8") + ' ' + normalized_snd.pop().lower().encode("utf-8") + '\n')
# -*- coding: utf-8 -*-
from pymorphy import get_morph

morph = get_morph('dicts/ru')
print morph.normalize(u"ЛЮДЕЙ")
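Note that normalize() may return either a unicode string or a set of candidate lemmas, as the isinstance() handling in several snippets here shows; a small helper that flattens both cases (a sketch, not taken from any project above):

# Sketch: pick a single lowercase lemma whatever normalize() returns.
def lemma(morph, word):
    result = morph.normalize(word.upper())
    if isinstance(result, set):
        result = list(result)[0]
    return result.lower()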
# -*- coding: utf-8 -*-
import os
from transl import translit, check_lang
from itertools import permutations

from pymorphy import get_morph

morph = get_morph(os.path.join(os.path.abspath(os.path.dirname(__file__)), 'morph'))


def permutations_strings(strings, remove='no'):
    per = set([])
    for s in strings:  # renamed from `str` to avoid shadowing the builtin
        per.update(map(lambda x: ' '.join(x), permutations(s.split())))
        if remove == 'yes':
            per.remove(s)
    return list(per)


lst = ['abc asd', 'def dfg', 'try']
print permutations_strings(lst, 'yes')
# -*- coding: utf-8 -*-
__author__ = 'Astakhov D. A.'

import pymorphy
import subprocess
from pymorphy import get_morph
from pymorphy.contrib import tokenizers

morph = get_morph('/home/bliq/PycharmProjects/Dictionary')  # load the dictionaries

# part-of-speech tag legend
partOfSpeech = '+---------------------------------------------------+\n\
|   Части речи    Расшифровка                        |\n\
|---------------------------------------------------|\n\
|   С             существительное                    |\n\
|   П             прилагательное                     |\n\
|   МС            местоимение-существительное        |\n\
|   Г             глагол в личной форме              |\n\
|   ПРИЧАСТИЕ     причастие                          |\n\
|   ДЕЕПРИЧАСТИЕ  деепричастие                       |\n\
|   ИНФИНИТИВ     инфинитив                          |\n\
|   МС-ПРЕДК      местоимение-предикатив             |\n\
|   МС-П          местоименное прилагательное        |\n\
|   ЧИСЛ          числительное (количественное)      |\n\
|   ЧИСЛ-П        порядковое числительное            |\n\
|   Н             наречие                            |\n\
|   ПРЕДК         предикатив                         |\n\
|   ПРЕДЛ         предлог                            |\n\
|   СОЮЗ          союз                               |\n\
|   МЕЖД          междометие                         |\n\
|   ЧАСТ          частица                            |\n\
|   ВВОДН         вводное слово                      |\n\
    (u'SOCIAL', u'ACTIVITY'),
    (u'CURRENT', u'FRESHMAN'),
    (u'CURRENT', u'SOPHOMORE'),
    (u'FOUR-YEAR', u'UNIVERSITY'),
    (u'ACADEMIC', u'RECORD'),
    (u'DEMONSTRATE', u'PASSION'),
    (u'HIGH', u'STUDENT'),
    (u'POTENTIAL', u'STUDENT'),
    (u'EXCITING', u'PROGRAM'),
    (u'FAST-PACED', u'PROGRAM'),
    (u'INTERACTIVE', u'COURCE'),
    (u'FORMER', u'CAMPER'),
    (u'MANY', u'INFORMATION')
]

morph = get_morph("../dicts/en")


def get_grammars_precision_and_recall(grammar, dir_path):
    retrieved = 0.0
    relevant = 0.0
    for root, dirs, files in os.walk(dir_path):
        for file_name in files:
            path = os.path.join(root, file_name)
            for result in data_gathering_iterator(path, morph, grammar):
                for subresult in result:
                    if subresult in normilized_right_result_list:
                        relevant += 1.0
                    retrieved += 1.0
    return relevant / retrieved, relevant / len(normilized_right_result_list)
# -*- coding: utf-8 -*-
from django.contrib import admin
from django.utils.translation import ugettext_lazy as _
from django.utils.text import capfirst
from django.db.models.base import ModelBase
from django.conf import settings

from pymorphy import get_morph
import re

morph = get_morph(settings.PYMORPHY_DICTS['ru']['dir'])


class I18nLabel():
    def __init__(self, function):
        self.target = function
        self.app_label = u''

    def rename(self, f, name=u''):
        def wrapper(*args, **kwargs):
            extra_context = kwargs.get('extra_context', {})
            if 'delete_view' != f.__name__:
                extra_context['title'] = self.get_title_by_name(f.__name__, args[1], name)
            else:
                extra_context['object_name'] = morph.inflect_ru(name, u'вн').lower()
            kwargs['extra_context'] = extra_context
            return f(*args, **kwargs)
        return wrapper
# -*- coding: utf-8 -*-
import nltk
import string
import pymorphy
#from nltk.corpus import stopwords
#import os
import pymysql

morph = pymorphy.get_morph(r'K:\databases\BAR\pymorfy')  # raw string: the path contains backslashes

file = open('pushkin/pushkin_aleksandr_polnoe_sobranie_stihotvoreniy.txt', 'r')
text = file.read()
file.close()
text = text.decode('utf-8')

# tokenization
tokens = nltk.word_tokenize(text)
tokens = [i for i in tokens if i not in string.punctuation]

# making dictionary
pushkin_dict = {}
for i in tokens:
    key = i.replace('.', '')
    key = key.replace(u"«", '')
    key = key.replace(u'»', '')
    key = key.replace(u'…', '')
    key = key.lower()
    pushkin_dict[key] = 0

# put all in mysql
conn = pymysql.connect(host='127.0.0.1', port=3306, user='******', passwd='',
                       db='pushkin', charset='utf8')
#print "Skip sentence (bracket mismatch)" #print string return False # check that there is at most one 'special' punctuation sign if tags.count(':') > 1 or tags.count('SYM') > 1: #print "Skip sentence (too much punctuation)" #print string return False if sentence[-1][0] == "." and sentence[-2][0] == "Sept": return False return True morph = pymorphy.get_morph('/wrk/yvessche/wmt18/testsuite/pymorphy_en') # Extended PTB set: https://corpling.uis.georgetown.edu/ptb_tags.html # Pymorphy tagset documentation: https://github.com/kmike/pymorphy/blob/master/dicts/src/Dicts/Morph/egramtab.tab def complex_np(sentence): global nplist if nplist == []: return None out_sentence = [x[0] for x in sentence] for i, (word, tag) in enumerate(sentence): if (tag == 'PP') and word.lower() in ['him', 'her']: if [x[0] for x in sentence].count('him') + [x[0] for x in sentence].count('her') > 1: return None if word[0].isupper():
# coding: utf-8
import os

import pymorphy

from textgen.conf import APP_DIR, textgen_settings
from textgen.logic import import_texts

morph = pymorphy.get_morph(textgen_settings.PYMORPHY_DICTS_DIRECTORY)

import_texts(
    morph,
    source_dir=os.path.join(APP_DIR, "fixtures", "texts_src"),
    tech_vocabulary_path=os.path.join(APP_DIR, "fixtures", "vocabulary.json"),
    voc_storage="/tmp/voc.json",
    dict_storage="/tmp/dict.json",
    print_undefined_words=True,
)
def __init__(self):
    super(Morph, self).__init__()
    self.morph = pymorphy.get_morph(self.dicts_dir, self.dicts_type)
# -*- coding: utf-8 -*-
from south.utils import datetime_utils as datetime
from south.db import db
from south.v2 import DataMigration
from django.db import models

from lyrics.models import Song, IndexElement

# ---------------------------------------------------------------------- pymorphy
from pymorphy.contrib.tokenizers import extract_words
from pymorphy import get_morph  # morphological analyzer: https://pythonhosted.org/pymorphy/intro.html
from ..pymorphy_dicts_dir import ret

morph = get_morph(ret())  # directory with the pymorphy dictionaries


class Migration(DataMigration):

    def forwards(self, orm):
        "Write your forwards methods here."
        for song in Song.objects.all():
            to_write = list()
            try:
                for i, word in enumerate(extract_words(song.lyrics)):
                    for term in morph.normalize(word.upper()):
                        to_write.append('1 ' + str(i) + " " + term)
            except TypeError:
                pass
            try:
                for i, word in enumerate(extract_words(song.artist)):
                    for term in morph.normalize(word.upper()):
                        to_write.append('2 ' + str(i) + " " + term)
            except TypeError:
import re

import pymorphy
import pymorphy.utils

text = u'''
Сяпала Калуша по напушке и увазила бутявку. И волит:
— Калушата, калушаточки! Бутявка!
Калушата присяпали и бутявку стрямкали. И подудонились.
А Калуша волит:
— Оее, оее! Бутявка-то некузявая!
Калушата бутявку вычучили. Бутявка вздребезнулась, сопритюкнулась и усяпала с напушки.
А Калуша волит:
— Бутявок не трямкают. Бутявки дюбые и зюмо-зюмо некузявые. От бутявок дудонятся.
А бутявка волит за напушкой:
— Калушата подудонились! Калушата подудонились! Зюмо некузявые! Пуськи бятые!
'''

r = re.compile('[\W+-]', re.U)
words = r.split(text.upper())

# put the path to the dictionaries folder here
morph = pymorphy.get_morph('dicts/converted/ru')

for word in words:
    if word:
        print word
        info = morph.get_graminfo(word)
        for form in info:
            pymorphy.utils.pprint(form)
def __init__(self, dbname='crawler_db'):
    self.conn = MySQLdb.connect(user='******', db=dbname, passwd='crawler',
                                unix_socket='/var/run/mysqld/mysqld.sock')
    self.cursor = self.conn.cursor()
    self.morph = get_morph('dicts')
import nltk
import string
import pymorphy
#from pymorphy.contrib import tokenizers
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from collections import Counter
import numpy as np
import cv2

corpus_dir = 'corpus_1836'
#corpus_dir = 'poems'
CLUSTERS_COUNT = 2

morph = pymorphy.get_morph('C:/DB/ru.sqlite-json')
stop_words = stopwords.words('russian')
stop_words.extend(['что', 'это', 'так', 'вот', 'быть', 'как'])

corpus = {}
res_txt = []


def tokenizeMe(sentence):
    all_words = []
    #sentence = "меня! это,. что? похоже; на - пирамиду"
    #sentence = sentence.encode('utf-8').upper()
    #sentence = sentence.upper()
    #print(sentence)
    #
    #words = tokenizers.extract_words(sentence)
    #print(words)
import os
import sys
import time
from datetime import datetime

from pymorphy import get_morph

from dater import Dater
from tagger import Tagger

filename = os.path.join(os.path.dirname(sys.argv[0]), "corpora/somecorpus.txt")
traincorpus = os.path.join(os.path.dirname(sys.argv[0]), "dicts/ruscorpora.txt.lemma")
trainsuffcorpus = traincorpus + ".suffs"

print "STARTED:", str(datetime.now())
start = time.time()

# load the Russian dictionary
morph = get_morph(os.path.join(os.path.dirname(sys.argv[0]), "pydicts").decode("UTF8"))
# load the Russian dictionary, without prefix guessing
morph_simple = get_morph(os.path.join(os.path.dirname(sys.argv[0]), "pydicts").decode("UTF8"),
                         check_prefixes=False)
# load the date handler
dater = Dater()
# load the tagger
tagger = Tagger(morph, morph_simple, dater)
# load suffix statistics for the tagger
tagger.load_statistics(traincorpus, trainsuffcorpus)
# lemmatize in chunks
tagger.parse_chunks(filename, sent_marks=True)

print "FINISHED:", str(datetime.now())
print "Time elapsed: ", time.time() - start
# -*- coding: cp1251 -*-
from pymorphy import get_morph  # import the required modules
from pymorphy.contrib import tokenizers

morph = get_morph('C:/Python27/ru.sqlite-json/')  # create a pymorphy.Morph object

text = (raw_input("Входные данные: ")).decode('cp1251')

d = {}          # maps a normal form to the list of words sharing it
a = set()       # set of normal forms, duplicates not allowed
part_d = {}     # grammatical information of a word
part_a = set()  # set of words, duplicates not allowed

for word in tokenizers.extract_tokens(text):  # extract tokens, find the part of speech
    if word.isalpha():  # filter out punctuation
        info = morph.get_graminfo(word.upper())  # grammatical information of the word
        part = info[0]['class']      # part of speech
        gram_info = info[0]['info']  # grammatical information
        nf = info[0]['norm']         # normal form
        print str('{0}({1})').format(word.encode('cp1251'), part.encode('cp1251')),
        # consider only nouns, verbs and adjectives
        if part == u'С' or part == u'Г' or part == u'П' or part == u'ИНФИНИТИВ':
            # do not pick a synonym for a first name
            if u'имя' in info[0]['info']:
                name = 1
            else:
                len_ = len(a)
                a.add(nf)
                if len_ == len(a):  # normal form already seen?
                    l = d[nf]       # if so, append the word to its list
                    l.append(word)
                    d[nf] = l
# -*- coding: UTF-8 -*-
import littlebrother.ident.config as ident_config
import os.path

import pymorphy

db_encoding = ident_config.geo_db.get('encoding', 'UTF-16')
cities_db = ident_config.geo_db.get('cities', 'cities_ru/db/cities.shelve')
regions_db = ident_config.geo_db.get('regions', 'cities_ru/db/regions.shelve')
world_db = ident_config.geo_db.get('world', 'cities_ru/db/db/regions.shelve')

cities_db = os.path.join('..', cities_db)
regions_db = os.path.join('..', regions_db)
world_db = os.path.join('..', world_db)

morph = pymorphy.get_morph(
    os.path.join('..', ident_config.dicts.get('path', 'dicts/ru/shelve45')),
    ident_config.dicts.get('backend', 'shelve'))
from django import template
from django.conf import settings

from pymorphy import get_morph

register = template.Library()
morph = get_morph(settings.PYMORPHY_DICTS['ru']['dir'], 'cdb')


@register.filter
def plural_from_object(source, obj):
    l = len(obj[0])
    if l == 1:
        return source
    return morph.pluralize_inflected_ru(source.upper(), l).lower()
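Since register.filter returns the function unchanged, the filter can be exercised directly from Python; a hypothetical check (the rows value is illustrative, and obj[0] is assumed to be a sized collection, matching len(obj[0]) above):

rows = [[1, 2, 3]]  # hypothetical queryset-like value
print plural_from_object(u'запись', rows)  # picks the form agreeing with 3, e.g. записи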
# -*- coding: utf-8 -*-
from south.utils import datetime_utils as datetime
from south.db import db
from south.v2 import DataMigration
from django.db import models
import os
import re

from lyrics.models import Song, IndexElement

# ---------------------------------------------------------------------- pymorphy
from pymorphy.contrib import tokenizers
from pymorphy import get_morph  # morphological analyzer: https://pythonhosted.org/pymorphy/intro.html

morph = get_morph('/home/antre/pymorphy_dicts/')  # directory with the pymorphy dictionaries

DIR_OF_COLLECT_DATA = os.path.dirname(os.path.abspath(__file__))[:-17] + "collect_data/"
FILE_WITH_SYNONYMS = DIR_OF_COLLECT_DATA + "synonims.txt"


class Migration(DataMigration):

    def forwards(self, orm):
        fi = open(FILE_WITH_SYNONYMS, "rb")
        for line in fi:
            try:
                term, synonyms = line.decode("cp1251")[:-2].upper().split('|', 2)
                print term
            except ValueError:
                continue
            synonyms = synonyms.split('.')[0]  # drop antonyms and the like
            synonyms = re.sub('\([^)]*\)', '()', synonyms)  # drop parenthesized words