Example no. 1
    def post(self, request):
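        # get_morph() loads the pymorphy dictionary bundle from a directory on disk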
        morph = get_morph(join(settings.PROJECT_DIR, 'morph'))
        last_name = self.DATA['last_name']
        first_name = self.DATA['first_name']
        patronymic = self.DATA['patronymic']

        # to decline the last name we first need to determine the gender
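        # get_graminfo() returns a list of candidate parses; each parse's 'info'
        # field is a comma-separated grammeme string that, for nouns, starts with
        # the gender tag (u'мр' = masculine, u'жр' = feminine)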
        try:
            sex = morph.get_graminfo(first_name.upper())[0]['info'].split(',', 1)[0]
        except IndexError:
            # get_graminfo() returned []
            # print 'get_graminfo failed on ', first_name
            sex = u'жр'
        # last name
        last_name_inflated = firstcaps(lastnames_ru.inflect(morph,
            last_name.upper(), sex + u',дт'))
        # first name
        first_name_inflated = firstcaps(morph.inflect_ru(first_name.upper(), u'дт'))
        # patronymic
        patronymic_inflated = firstcaps(morph.inflect_ru(patronymic.upper(), sex + u',дт'))
        return {
            'last_name': last_name_inflated,
            'first_name': first_name_inflated,
            'patronymic': patronymic_inflated,
            'user': self.get_user(),
        }
Example no. 2
def formalize(filename, morph_dict=DICTS_DIR, stop_dict=stop_dict_path):
    morph = get_morph(morph_dict)
    stop_words = get_stop_words()
    freqs = {}
    words = 0.0
    try:
        f = open(filename)
    except IOError:
        raise
    for line in f:
        for word in line.split():
            word = word.decode("utf-8", 'ignore')
            word = word.strip(u'[,.:;\"\')$«»(?<>!-_—//=]\n\t')
            word = word.replace('.', '_')
            word = morph.normalize(word.upper())
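            # normalize() yields a set of candidate normal forms for dictionary
            # words; a non-set result means the lookup failed, so skip the token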
            if isinstance(word, set):
                word = word.pop()
            else:
                continue
            word = word.lower()
            words += 1
            if word in stop_words or not word:
                continue
            if word not in freqs:
                freqs[word] = 1.0
            else:
                freqs[word] += 1.0
    f.close()
    for key in freqs:
        freqs[key] /= words
    return freqs
Example no. 3
 def __init__(self, corpus=None):
     """Initialize your data structures in the constructor."""
     self.unigramCounts = collections.defaultdict(lambda: 0)
     self.bigramCounts = collections.defaultdict(lambda: 0)
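     # defaultdict(lambda: 0) returns a zero count for any unseen unigram/bigram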
     self.V = 0
     self.morph = get_morph(DICTS_DIR)
     if corpus:
         self.train(corpus)
Example no. 4
 def __init__(self, page, working_list, mutex, dbname = 'crawler_db'):
     Db_manager.__init__(self, dbname)
     threading.Thread.__init__(self)
     self.working_list = working_list
     self.page = page
     self.mutex = mutex
     self.morph = get_morph('dicts')
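     # the mutex guards the shared working_list against concurrent crawler threads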
     with self.mutex:
         working_list.append(page)
Example no. 5
def download_morph():
    # download and use a dictionary to obtain grammatical information about a word (its part of speech)
    path_to_dictionary = os.path.realpath(os.path.curdir)
    morph_path = join(path_to_dictionary, 'morph_dicts')
    if not os.path.exists(morph_path):
        subprocess.call(['wget', 'https://bitbucket.org/kmike/pymorphy/downloads/ru.sqlite-json.zip'])
        subprocess.call(['unzip', 'ru.sqlite-json.zip', '-d', 'morph_dicts'])
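        # assumes wget and unzip are available on PATH; the archive holds the
        # ru.sqlite-json dictionary bundle, unpacked into morph_dicts/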
    morph = get_morph(morph_path)
    return morph
Example no. 6
    def get(self, request, *args, **kwargs):
        params = request.GET
        COUNT_ELEMENTS = 5
        errors = []

        limit = COUNT_ELEMENTS
        offset = 0

        form = forms.SearchForm(params)
        if form.is_valid():
            #pointsreq = MainModels.Person.objects;

            name = form.cleaned_data.get("s")
            users_list = []
            morph = get_morph('/home/tenoclock/yasenput/dicts')
            if name:
                #pointsreq = MainModels.Person.search.query(params.get("s"))
                #search = SphinxSearch()
                search = SphinxQuerySet(index="auth_user")
                name_morph = morph.normalize(name.upper())
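                # dump the normalized forms to file1.txt for debugging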
                file1 = open('file1.txt','w')
                file1.write(str(list(name_morph)))
                file1.close()
                phrase_list = name.split(' ')
                for phrase in phrase_list:
                    if phrase != '':
                        name_morph = morph.normalize(phrase.upper())
                        for name_m in name_morph:
                            search_query = search.query(name_m)
                            for splited_item in search_query:
                                if not MainModels.Person.objects.get(id=splited_item['id']) in users_list:
                                    users_list.append(MainModels.Person.objects.get(id=splited_item['id']))

            content = form.cleaned_data.get("content")
            if content == 'new':
                pointsreq = pointsreq.order_by('-id')
            elif content == "popular":
                pointsreq = pointsreq.annotate(usfiliwers=Count('followers__id')).order_by('-usfiliwers', '-id')
            else:
                pointsreq = users_list


            points = users_list[offset:limit]

            YpJson = YpSerialiser()
            return HttpResponse(YpJson.serialize(points, fields=("username", "first_name", "last_name")),
                                mimetype="application/json")
        else:
            e = form.errors
            for er in e:
                errors.append(er + ':' + e[er][0])
            return JsonHTTPResponse({"status": 0, "txt": ", ".join(errors)})
Example no. 7
 def __init__(self):
     super(NumWordRU,self).__init__()
     # initializing morphology module for inflecting
     from pymorphy import get_morph
     import ConfigParser
     config = ConfigParser.RawConfigParser()
     config.read('/home/soshial/text-normalization/normalization.cfg')
     dicts_folder = config.get('lms','dicts')
     import os
     if not os.path.exists(dicts_folder): quit('Please put existing dictionaries into "'+dicts_folder+'" folder!')
     self.morph = get_morph(dicts_folder)
     self.inflection_case = u"им" # todo add gender for the ending of numeral ('жр')
Example no. 8
 def __init__(self):
     super(NumWordRU,self).__init__()
     # initializing morphology module for inflecting
     from pymorphy import get_morph
     import ConfigParser
     config = ConfigParser.RawConfigParser()
     config.read('normalization.cfg')
     dicts_folder = config.get('lms','dicts')
     import os
     if not os.path.exists(dicts_folder): quit('Please put existing dictionaries into "'+dicts_folder+'" folder!')
     self.morph = get_morph(dicts_folder)
     self.inflection_case = u"им" # todo add gender for the ending of numeral ('жр')
Example no. 9
def misc_utilites():
	morpher = get_morph('static/res/pymorphy/')

	def pluralize(number, word):
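		# pluralize_inflected_ru() picks the noun form that agrees with the number;
		# pymorphy's dictionaries are upper-case, hence the .upper()/.lower() round-trip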
		return morpher.pluralize_inflected_ru(word.upper(), number).lower()

	def is_logged_in():
		try:
			int(session['logged'])
			return True
		except (ValueError, KeyError):
			return False

	return {'pluralize': pluralize, 'is_logged_in': is_logged_in }
Example no. 10
def main():
	(options, args) = parser.parse_args()

	if not options.word or not options.dict:
		print 'inflect -h for help.'
		return
		
	morph = get_morph(options.dict)

	word 	= options.word.decode(chardet.detect(options.word)['encoding']).upper()
	word 	= unicode(word)

	a 		= morph.inflect_ru(word, u'пр', u'С')
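	# u'пр' requests the prepositional case; the extra u'С' argument apparently
	# restricts the parse to nouns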
	print a.encode('utf8')
Example no. 11
def main():
    (options, args) = parser.parse_args()

    if not options.word or not options.dict:
        print 'inflect -h for help.'
        return

    morph = get_morph(options.dict)

    word = options.word.decode(chardet.detect(
        options.word)['encoding']).upper()
    word = unicode(word)

    a = morph.inflect_ru(word, u'пр', u'С')
    print a.encode('utf8')
Example no. 12
 def process(self, interval):
     jk_re = re.compile(r"%s" % u'(.*?)[\w_\-]+[\s]{1,3}\( (.*?) \)(.*?)Ссылка$')
     cit_re = re.compile(r"%s" % u">>-----Цитата---->>(.*?)<<-----Цитата----<<", re.DOTALL)
     rus = u"йцукенгшщзхъёфывапролджэячсмитьбю."
     morph = get_morph('dicts')
     finished = True
     for m in MessageStore.objects.filter(is_processed=False).order_by("-id")[:interval]:
         finished = False
         user = m.ms.user
         #print user
         
         data = strip_tags(m.text).strip()
         data = re.sub(cit_re, "", data)
         jk = jk_re.findall(data)
         try:
             line = jk[0][0]
         except IndexError:
             print "ERROR: [%s]" % data
             line = data.replace(u">>> Подробности", "")[:-33]
             
         tokens = nltk.word_tokenize(line.lower())
         #text = nltk.word_tokenize(line.lower())
         #print nltk.pos_tag(text)
         m.is_processed = True
         m.save(update_fields=['is_processed'])
         
         for t in tokens:
             if len(t) > 35 or len(t) < 4:
                 continue
             if t in string.punctuation or t in string.letters or t in rus:
                 print "%s skipped" % t
             else:
                 tok = morph.normalize(t.upper())
                 if isinstance(tok, unicode):
                     word = tok.lower()
                 else:
                     word = list(tok)[0].lower()
                 #print word
                 w, c = Word.objects.get_or_create(name=word)
                 if not c:
                     w.count += 1
                     w.save(update_fields=["count"])
                 wu, c = UserWord.objects.get_or_create(word=w, user=user)
                 if not c:
                     wu.count += 1
                     wu.save(update_fields=["count"])
     return finished
Example no. 13
    def handle(self, *args, **options):
        morph = get_morph(join(settings.PROJECT_DIR, 'morph'))
        self.dialog = Dialog()
        listeners = Listener.objects.filter(first_name__exact=u'')

        total = listeners.count()
        index = 0
        self.dialog.gauge_start()

        for listener in listeners:
            listener.normalize_name(morph)
            text = u'Склонение: %s %s %s' % (listener.last_name, listener.first_name, listener.patronymic)
            self.dialog.gauge_update(int(float(index)/total*100),
                text=text.encode('utf-8'),
                update_text=True)
            index += 1

        self.dialog.gauge_stop()
Example no. 14
    def initializeResources(self):
        
        """Pre-initialization"""
        self.animationTimer = ()
        self.progressTimer = ()
        self.grid_layout = ()
                       
        """Initialize Options"""
        self.options = Options()
        
        """Initialize Statistics"""
        self.stats = Stats()
        
        """Config Here"""
        self.initializeComposition()
        self.initializeComponents()
        self.setMenus()
        self.trayIcon.show()
        #self.startTrayLoading()
        
        """"Initialize Dictionaries    (will take a some time!)"""
        time_start = datetime.now()
        self.dict = EdictParser()
        self.dict.loadDict()
        
        self.morphy = get_morph(PATH_TO_RES + DICT_EN)
        
        self.trayIcon.showMessage('Loading...', 'Initializing dictionaries', QSystemTrayIcon.MessageIcon.Information, 20000 )     #TODO: change into loading dialog... or not
        
        """Initializing srs system"""
        self.trayIcon.showMessage('Loading...', 'Initializing databases', QSystemTrayIcon.MessageIcon.Information, 20000 )
        self.srs = srsScheduler()
        self.srs.initializeAll()
        self.srs.initializeCurrentSession(self.options.getSessionSize())
        
        """Global hotkeys hook"""
        #TODO: add multiple hotkeys and fix stop()
        #self.hooker = GlobalHotkeyManager(toggleQDictFlag, 'Q')
#        self.hooker = GlobalHotkeyManager(toggleWidgetFlag(self.qdict), 'Q')
#        self.hooker.setDaemon(True) #temporarily, should work using stop()
#        self.hooker.start()
        
        time_end = datetime.now()
        self.loadingTime = time_end - time_start
Example no. 15
    def initializeResources(self):
        """Pre-initialization"""
        self.animationTimer = ()
        self.progressTimer = ()
        self.grid_layout = ()
        """Initialize Options"""
        self.options = Options()
        """Initialize Statistics"""
        self.stats = Stats()
        """Config Here"""
        self.initializeComposition()
        self.initializeComponents()
        self.setMenus()
        self.trayIcon.show()
        #self.startTrayLoading()
        """"Initialize Dictionaries    (will take a some time!)"""
        time_start = datetime.now()
        self.dict = EdictParser()
        self.dict.loadDict()

        self.morphy = get_morph(PATH_TO_RES + DICT_EN)

        self.trayIcon.showMessage(
            'Loading...', 'Initializing dictionaries',
            QSystemTrayIcon.MessageIcon.Information,
            20000)  #TODO: change into loading dialog... or not
        """Initializing srs system"""
        self.trayIcon.showMessage('Loading...', 'Initializing databases',
                                  QSystemTrayIcon.MessageIcon.Information,
                                  20000)
        self.srs = srsScheduler()
        self.srs.initializeAll()
        self.srs.initializeCurrentSession(self.options.getSessionSize())
        """Global hotkeys hook"""
        #TODO: add multiple hotkeys and fix stop()
        #self.hooker = GlobalHotkeyManager(toggleQDictFlag, 'Q')
        #        self.hooker = GlobalHotkeyManager(toggleWidgetFlag(self.qdict), 'Q')
        #        self.hooker.setDaemon(True) #temporarily, should work using stop()
        #        self.hooker.start()

        time_end = datetime.now()
        self.loadingTime = time_end - time_start
Example no. 16
def create_triads(path_item, path_rel, path_attr):
    dicts = "c:\\Python27\\Lib\\site-packages\\pymorphy\\ru.sqlite-json\\"
    morph = get_morph(dicts)

    # read items
    with open(path_item) as f:
        items = f.readlines()
    # read relations
    with open(path_rel) as f:
        relations = f.readlines()
    # read attributes
    with open(path_attr) as f:
        attributes = f.readlines()

    # split attributes according to different parts of speech
    attrsN, attrsV, attrsAdj, attrsIs = [[],[],[],[]]
    for at in attributes:
        if 'N' in at: attrsN.append(re.split(',', at)[0].decode('cp1251').lower())
        if 'V' in at: attrsV.append(re.split(',', at)[0].decode('cp1251').lower())
        if 'Adj' in at: attrsAdj.append(re.split(',', at)[0].decode('cp1251').lower())
        if 'Is' in at: attrsIs.append(re.split(',', at)[0].decode('cp1251').lower())

    # assemble triads
    triads = []
    for it in items:
        it = it.replace('\n', '').decode('cp1251')
        for rel in relations:
            rel = rel.replace('\n', '').decode('cp1251')
            if rel == u'может':
                for attr in attrsV: triads.append([it, rel, attr])
            if rel == u'имеет':
                for attr in attrsN: triads.append([it, rel, attr])
            if rel == u'является':
                for attr in attrsIs: triads.append([it, rel, attr])
            if u'как' in rel:
                for attr in attrsAdj: triads.append([it, '', attr])

    # test
    for triad in triads:
        print triad[0] + ', ' + triad[1] + ', ' + triad[2]

    return triads
Example no. 17
def get_words(file_name, index):
    
    morph = get_morph('')
    print "Getting words from " + file_name + "..."

    words = []
    pattern = re.compile("(([\w]+[-'])*[\w']+'?)", re.U)

    # try:
    f = open(file_name, 'r')
    file_text = f.read()
    f.close()
    file_text = unicode(file_text, 'utf8').upper()
    file_text = file_text.replace('--', ' -- ')
    tokens = file_text.split()
    previous_percentage = -1
    for idx, token in enumerate(tokens):
        m = pattern.match(token)
        if m:
            word = m.group()
            info = morph.get_graminfo(word)
            if len(info) < 2:
                continue
            if not info[0]['class'] in [u"П", u"С", u"Г"]:
                continue
            norm = info[0]['norm']
            words.append(norm)
            if norm in index:
                index[norm] += 1
            else:
                index[norm] = 1
        percentage = 100 * idx / len(tokens)
        if percentage != previous_percentage and percentage % 5 == 0:
            print "Getting words: " + str(percentage) + "% done"
            previous_percentage = percentage
    # except:
    #     print "error occured"

    return words
Example no. 18
def get_words(file_name, index):

    morph = get_morph('')
    print "Getting words from " + file_name + "..."

    words = []
    pattern = re.compile("(([\w]+[-'])*[\w']+'?)", re.U)

    # try:
    f = open(file_name, 'r')
    file_text = f.read()
    f.close()
    file_text = unicode(file_text, 'utf8').upper()
    file_text = file_text.replace('--', ' -- ')
    tokens = file_text.split()
    previous_percentage = -1
    for idx, token in enumerate(tokens):
        m = pattern.match(token)
        if m:
            word = m.group()
            info = morph.get_graminfo(word)
            if len(info) < 2:
                continue
            if not info[0]['class'] in [u"П", u"С", u"Г"]:
                continue
            norm = info[0]['norm']
            words.append(norm)
            if norm in index:
                index[norm] += 1
            else:
                index[norm] = 1
        percentage = 100 * idx / len(tokens)
        if percentage != previous_percentage and percentage % 5 == 0:
            print "Getting words: " + str(percentage) + "% done"
            previous_percentage = percentage
    # except:
    #     print "error occured"

    return words
Example no. 19
# coding: utf-8
import os
import tempfile

import pymorphy

from unittest import TestCase

from textgen.words import Noun, Adjective, Verb, NounGroup, Fake, Participle, ShortParticiple, Pronoun
from textgen.templates import Args, Template, Dictionary, Vocabulary
from textgen.conf import APP_DIR, textgen_settings
from textgen.logic import import_texts
from textgen.exceptions import NormalFormNeeded

morph = pymorphy.get_morph(textgen_settings.PYMORPHY_DICTS_DIRECTORY)
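# (the Morph instance above is created once at module import; dictionary loading
# is slow, so all test cases below share it)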


class NounTest(TestCase):
    def test_create_from_baseword(self):
        noun = Noun.create_from_baseword(morph, u'обезьянка')
        self.assertEqual(noun.normalized, u'обезьянка')
        self.assertEqual(noun.properties, (u'жр', ))
        self.assertEqual(
            noun.forms,
            (u'обезьянка', u'обезьянки', u'обезьянке', u'обезьянку',
             u'обезьянкой', u'обезьянке', u'обезьянки', u'обезьянок',
             u'обезьянкам', u'обезьянок', u'обезьянками', u'обезьянках'))

    def test_pluralize(self):
        noun = Noun.create_from_baseword(morph, u'монета')
        self.assertEqual(noun.normalized, u'монета')
Example no. 20
                    nargs="?",
                    type=argparse.FileType('w'),
                    default='past.sents',
                    help="output sentences")
parser.add_argument('-c',
                    dest='c',
                    nargs="?",
                    type=argparse.FileType('w'),
                    default='past.info',
                    help="output information")
args = parser.parse_args()

sent_id = 0

morph = get_morph(
    '/people/burlot/prog/wmt17/analysis/pydict/kmike-pymorphy-3d1a3f962d0e/dicts/converted/en'
)
en_dict = pickle.load(
    open("/vol/work1/burlot/wmt17/analysis/news/words_en.pkl", 'r'))

for sent, tags in zip(args.i, args.t):
    sent = sent.split()
    tags = tags.split()
    sent_init = list(sent)
    # no interrogative
    if sent[-1] == '?':
        continue
    in_prog = False
    for i in range(len(sent)):

        # skip complicated cases
Example no. 21
# coding=utf-8

from pymorphy import get_morph

morph = get_morph('/home/ilya/github/ru.sqlite-json')  #dict path

ins = open("adjective_opinion_words.txt", "r")

array = []
for line in ins:
    ind = line.find(' ')
    if ind != -1:
        line = line[0:ind]
        array.append(line)

ins.close()

file = open("pyDict", "w")

for i in range(len(array)):
    word = array[i]
    word = word.decode("utf-8").upper()

    info1 = morph.inflect_ru(unicode(word), u'мр')
    info2 = morph.inflect_ru(unicode(word), u'жр')
    info3 = morph.inflect_ru(unicode(word), u'ср')
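    # u'мр' / u'жр' / u'ср' request the masculine, feminine and neuter forms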

    res = word.lower().encode("utf-8") + " " + info1.lower().encode(
        "utf-8") + " " + info2.lower().encode(
            "utf-8") + " " + info3.lower().encode("utf-8")
    #    print res
Example no. 22
# -*- coding: utf-8 -*-

import random, re
import logging
from pymorphy import get_morph

morph = get_morph('dict')


def rev(x):
    revmap = {
        u"Я": u"ТЫ",
        u"МЕНЯ": u"ТЕБЯ",
        u"МНЕ": u"ТЕБЕ",
        u"МНОЙ": u"ТОБОЙ",
        u"МЫ": u"ВЫ",
        u"НАС": u"ВАС",
        u"НАМ": u"ВАМ",
        u"НАМИ": u"ВАМИ"
    }
    for k, v in revmap.items():
        revmap[v] = k

    xstr = x.replace(',', '').replace('.', '')
    if xstr in revmap:
        return x.replace(xstr, revmap[xstr]).lower()

    global morph
    info = morph.get_graminfo(x)
    if len(info):
Example no. 23
# -*- coding: cp1251 -*-
from pymorphy import get_morph
from pymorphy.contrib import tokenizers

morph = get_morph('C:/Python27/ru.sqlite-json/')

text = (raw_input()).decode('cp1251')

d = {}
a = set()
part_d = {}
part_a = set()

for word in tokenizers.extract_tokens(text):
    if word.isalpha() == True:
        info = morph.get_graminfo(word.upper())
        part = info[0]['class']
        gram_info = info[0]['info']
        nf = info[0]['norm']
        print str('{0}({1})').format(word.encode('cp1251'),
                                     part.encode('cp1251')),

        if part == u'С' or part == u'Г' or part == u'П' or part == u'ИНФИНИТИВ':
            if u'имя' in info[0]['info']: name = 1
            else:
                len_ = len(a)
                a.add(nf)
                if len_ == len(a):
                    l = d[nf]
                    l.append(word)
                    d[nf] = l
Example no. 24
# coding=utf-8

from pymorphy import get_morph

morph = get_morph('/home/ilya/github/ru.sqlite-json') #dict path

ins = open("adjective_opinion_words.txt", "r")

array = []
for line in ins:
    ind = line.find(' ')
    if ind != -1:
        line = line[0:ind]
        array.append(line)

ins.close()

file = open("pyDict", "w")

for i in range(len(array)):
    word = array[i]
    word = word.decode("utf-8").upper()

    info1 = morph.inflect_ru(unicode(word), u'мр')
    info2 = morph.inflect_ru(unicode(word), u'жр')
    info3 = morph.inflect_ru(unicode(word), u'ср')

    res = word.lower().encode("utf-8")+" "+info1.lower().encode("utf-8")+" "+info2.lower().encode("utf-8")+" "+info3.lower().encode("utf-8")
#    print res
    file.write(res+"\n")
Example no. 25
def get_data_and_statistic(file_path, morph, rd):
    for adj_noun_list in data_gathering_iterator(file_path, morph):
        for adj_noun in adj_noun_list:
            adj_noun_str = u" ".join(adj_noun)
            rd.incr(adj_noun_str, 1)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("-f", "--file", dest="file_path", type=str, required=True,
                        help="Файл из которого будет браться текст")
    parser.add_argument("-m", "--morph_dir", dest="morph_dir", type=str, required=True,
                        help="Директория в которой лежат словари pymorphy")
    parser.add_argument("-s", "--host", dest="host", default="localhost", type=str,
                        help="Хост на котором находится Redis. По умолчанию 'localhost'")
    parser.add_argument("-p", "--port", dest="port", default=6379, type=int,
                        help="Порт на котором находится Redis. По умолчанию 6379")
    parser.add_argument("-d", "--db", dest="db", default=0, type=int,
                        help="БД в редисе. По умолчанию - 0")

    args = parser.parse_args()
    morph = get_morph(args.morph_dir)
    rd = Redis(host=args.host, port=args.port, db=args.db)
    rd.flushdb()
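    # start from an empty Redis DB so counts from a previous run are not mixed in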

    get_data_and_statistic(args.file_path, morph, rd)

    rd.save()
Example no. 26
 def __init__(self, pathPY, pathOPC):
     self.morph = pm.get_morph(pathPY)
     self.opcorpdict = oci.OpCorpDict(pathOPC)
Example no. 27
# -*- coding: utf-8 -*-
from pytagcloud import create_tag_image, create_html_data, make_tags, \
    LAYOUT_HORIZONTAL, LAYOUTS, LAYOUT_MIX, LAYOUT_MOST_HORIZONTAL
from pytagcloud.colors import COLOR_SCHEMES
from pytagcloud.lang.counter import get_tag_counts
import os, time, sys
from pymorphy import get_morph
morph = get_morph('dicts/ru', 'sqlite')
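# the optional second get_morph() argument selects the dictionary storage
# backend: 'sqlite' here, while other examples use 'shelve' and 'cdb'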

COLOR_MAP = ((232, 43, 30), (200, 188, 107), (85, 122, 102), (69, 54, 37),
             (160, 182, 136))


def update_text(t):
    info = morph.get_graminfo(t.upper())
    if len(info) > 0:
        return info[0]['norm'].lower()
    return t


def process_tags(taglist):
    allt = {}
    for t, v in taglist:
        w = update_text(t)
        if allt.has_key(w):
            allt[w] += v
        else:
            allt[w] = v
    d = sorted(allt.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)
    return d
Example no. 28
# -*- coding: utf-8 -*-
import nltk, re, pymorphy, csv
from nltk.tokenize import RegexpTokenizer

# load the morphology module
from pymorphy import get_morph
morf_storage = 'morf_storage/ru.shelve'
morph = get_morph(morf_storage, backend='shelve')


def declause(sentence):
    # check whether the sentence is a content clause, i.e. whether it contains the conjunction "что"

    claused_sent = {'type': 'nonclaused', 'clauses': sentence}

    if re.search(u',и?\s*что\s+', sentence):
        regexp_pattern = u',и?\s*что\s+'

        # build a tokenizer with the given regexp
        clauses = RegexpTokenizer(regexp_pattern, gaps=True).tokenize(sentence)

        claused_sent = {'type': 'subord', 'clauses': clauses}

    elif re.search(u",\s*(и)|(но)|(а)|(или)\s+", sentence):

        # split the sentence into non-content clauses at coordinating-conjunction boundaries
        regexp_pattern = u',\s*((и)|(но)|(а)|(или)|(потому что))\s+'

        # for each non-content <coord> clause
        clauses = RegexpTokenizer(regexp_pattern, gaps=True).tokenize(sentence)
        claused_sent = {'type': 'coord', 'clauses': clauses}
Example no. 29
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#Desc: Telegram bot. Python 2.7
#Author: Dimitry Lukin, [email protected]
#Version: 2019041900

from telegram.ext import Updater, MessageHandler, Filters
import re

import pycurl
import StringIO
from pymorphy import get_morph
morph = get_morph('/var/lib/pymorphy')

SIGNIFICANTWORDS = 4

SREF = 'https://sampowiki.club/doku.php?do=search&id=start&sf=1&q='

STOPLIST = [
    u'сампо', u'какой', u'сосед', u"привет", u"знать", u"подсказать",
    u"здравствовать", u"делать", u"я", u"он", u"есть", u"здесь", u'она',
    u'куда', u'ехать', u'сейчас'
]


def xmlmethod(mtype, mstring):
    if (mtype == 'search'):
        request = '<?xml version="1.0"?><methodCall><methodName>dokuwiki.search</methodName><params><param><value><string>' + mstring + '</string></value></param></params></methodCall>'
    elif (mtype == 'pageList'):
        request = '<?xml version="1.0"?><methodCall><methodName>dokuwiki.getPagelist</methodName><params><param><value><string></string></value></param></params></methodCall>'
Example no. 30
# coding=utf-8

__author__ = 'artemii'
from pymorphy import get_morph
from pymorphy.contrib import tokenizers

f = open('negative_words.txt', 'r')
resultFile = open('negative_words_normalized.txt', 'a')
morph = get_morph('.')

#normalized = morph.normalize('тнрнюооюпюрю'.decode("utf-8"))
#print normalized.pop().lower().encode("utf-8")

for line in f:
    #    word = raw_input()
    words = tokenizers.extract_words(line.decode("utf-8"))
    word = words.next()
    normalized = morph.normalize(word.upper())
    resultFile.write(normalized.pop().lower().encode("utf-8") + '\n')
#    print normalized.pop().lower()

# for word pairs
#for line in f :
##    word = raw_input()
#    words = tokenizers.extract_words(line.decode("utf-8"))
#    normalized_fst = morph.normalize(words.next().upper())
#    normalized_snd = morph.normalize(words.next().upper())
#    resultFile.write(normalized_fst.pop().lower().encode("utf-8") + ' ' + normalized_snd.pop().lower().encode("utf-8") + '\n')
Example no. 31
# -*- coding: utf-8 -*-
from pymorphy import get_morph
morph = get_morph('dicts/ru')
print morph.normalize(u"ЛЮДЕЙ")
Example no. 32
# -*- coding: utf-8 -*-
import os
from transl import translit, check_lang
from itertools import permutations
from pymorphy import get_morph

morph = get_morph(
    os.path.join(os.path.abspath(os.path.dirname(__file__)), 'morph'))


def permutations_strings(strings, remove='no'):
    per = set([])
    for s in strings:
        per.update(map(lambda x: ' '.join(x), permutations(s.split())))
        if remove == 'yes': per.remove(s)
    return list(per)


lst = ['abc asd', 'def dfg', 'try']

print permutations_strings(lst, 'yes')
Example no. 33
# -*- coding: utf-8 -*-

import random, re
import logging
from pymorphy import get_morph

morph = get_morph("dict")


def rev(x):
    revmap = {
        u"Я": u"ТЫ",
        u"МЕНЯ": u"ТЕБЯ",
        u"МНЕ": u"ТЕБЕ",
        u"МНОЙ": u"ТОБОЙ",
        u"МЫ": u"ВЫ",
        u"НАС": u"ВАС",
        u"НАМ": u"ВАМ",
        u"НАМИ": u"ВАМИ",
    }
    for k, v in revmap.items():
        revmap[v] = k

    xstr = x.replace(",", "").replace(".", "")
    if xstr in revmap:
        return x.replace(xstr, revmap[xstr]).lower()

    global morph
    info = morph.get_graminfo(x)
    if len(info):
Example no. 34
File: morth.py Project: dmg61/Morth
# -*- coding: utf-8 -*-
__author__ = 'Astakhov D. A.'

import pymorphy
import subprocess
from pymorphy import get_morph
from pymorphy.contrib import tokenizers
morph = get_morph('/home/bliq/PycharmProjects/Dictionary') # load the dictionaries

# Part-of-speech tag legend
partOfSpeech = '+---------------------------------------------------+\n\
|   Части речи 	    Расшифровка                     |\n\
|---------------------------------------------------|\n\
|   C 	            существительное                 |\n\
|   П 	            прилагательное                  |\n\
|   МС 	            местоимение-существительное     |\n\
|   Г 	            глагол в личной форме           |\n\
|   ПРИЧАСТИЕ 	    причастие                       |\n\
|   ДЕЕПРИЧАСТИЕ    деепричастие                    |\n\
|   ИНФИНИТИВ 	    инфинитив                       |\n\
|   МС-ПРЕДК 	    местоимение-предикатив          |\n\
|   МС-П 	        местоименное прилагательное     |\n\
|   ЧИСЛ 	        числительное (количественное)   |\n\
|   ЧИСЛ-П 	        порядковое числительное         |\n\
|   Н 	            наречие                         |\n\
|   ПРЕДК 	        предикатив                      |\n\
|   ПРЕДЛ 	        предлог                         |\n\
|   СОЮЗ 	        союз                            |\n\
|   МЕЖД 	        междометие                      |\n\
|   ЧАСТ 	        частица                         |\n\
|   ВВОДН 	        вводное слово                   |\n\
Example no. 35
    (u'SOCIAL', u'ACTIVITY'),
    (u'CURRENT', u'FRESHMAN'),
    (u'CURRENT', u'SOPHOMORE'),
    (u'FOUR-YEAR', u'UNIVERSITY'),
    (u'ACADEMIC', u'RECORD'),
    (u'DEMONSTRATE', u'PASSION'),
    (u'HIGH', u'STUDENT'),
    (u'POTENTIAL', u'STUDENT'),
    (u'EXCITING', u'PROGRAM'),
    (u'FAST-PACED', u'PROGRAM'),
    (u'INTERACTIVE', u'COURCE'),
    (u'FORMER', u'CAMPER'),
    (u'MANY', u'INFORMATION')
]

morph = get_morph("../dicts/en")


def get_grammars_precision_and_recall(grammar, dir_path):
    retrieved = 0.0
    relevant = 0.0
    for root, dirs, files in os.walk(dir_path):
        for file_name in files:
            path = os.path.join(root, file_name)
            for result in data_gathering_iterator(path, morph, grammar):
                for subresult in result:
                    if subresult in normilized_right_result_list:
                        relevant += 1.0
                    retrieved += 1.0

    return relevant / retrieved, relevant / len(normilized_right_result_list)
Example no. 36
# -*- coding: utf-8 -*-

from django.contrib import admin
from django.utils.translation import ugettext_lazy as _
from django.utils.text import capfirst
from django.db.models.base import ModelBase
from django.conf import settings
from pymorphy import get_morph
import re

morph = get_morph(settings.PYMORPHY_DICTS['ru']['dir'])
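# (PYMORPHY_DICTS in the Django settings maps each language code to its dictionary directory)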


class I18nLabel():
    def __init__(self, function):
        self.target = function
        self.app_label = u''

    def rename(self, f, name=u''):
        def wrapper(*args, **kwargs):
            extra_context = kwargs.get('extra_context', {})
            if 'delete_view' != f.__name__:
                extra_context['title'] = self.get_title_by_name(
                    f.__name__, args[1], name)
            else:
                extra_context['object_name'] = morph.inflect_ru(name,
                                                                u'вн').lower()
            kwargs['extra_context'] = extra_context
            return f(*args, **kwargs)

        return wrapper
Example no. 37
# -*- coding: utf-8 -*-
import nltk
import string
import pymorphy
#from nltk.corpus import stopwords
#import os
import pymysql

morph = pymorphy.get_morph('K:\databases\BAR\pymorfy')

file = open('pushkin/pushkin_aleksandr_polnoe_sobranie_stihotvoreniy.txt', 'r')
text = file.read()
file.close()
text = text.decode('utf-8')

#tokenization
tokens = nltk.word_tokenize(text)
tokens = [i for i in tokens if (i not in string.punctuation)]

#making dictionary
pushkin_dict = {}
for i in tokens:
    key = i.replace('.', '')
    key = key.replace(u"«", '')
    key = key.replace(u'»', '')
    key = key.replace(u'…', '')
    key = key.lower()
    pushkin_dict[key] = 0

#put all in mysql
conn = pymysql.connect(host='127.0.0.1', port=3306, user='******', passwd='', db='pushkin', charset='utf8')
Example no. 38
		#print "Skip sentence (bracket mismatch)"
		#print string
		return False
	# check that there is at most one 'special' punctuation sign
	if tags.count(':') > 1 or tags.count('SYM') > 1:
		#print "Skip sentence (too much punctuation)"
		#print string
		return False

	if sentence[-1][0] == "." and sentence[-2][0] == "Sept":
		return False
	
	return True


morph = pymorphy.get_morph('/wrk/yvessche/wmt18/testsuite/pymorphy_en')

# Extended PTB set: https://corpling.uis.georgetown.edu/ptb_tags.html
# Pymorphy tagset documentation: https://github.com/kmike/pymorphy/blob/master/dicts/src/Dicts/Morph/egramtab.tab


def complex_np(sentence):
	global nplist
	if nplist == []:
		return None
	out_sentence = [x[0] for x in sentence]
	for i, (word, tag) in enumerate(sentence):
		if (tag == 'PP') and word.lower() in ['him', 'her']:
			if [x[0] for x in sentence].count('him') + [x[0] for x in sentence].count('her') > 1:
				return None
			if word[0].isupper():
Example no. 39
# coding: utf-8
import os
import pymorphy

from textgen.conf import APP_DIR, textgen_settings
from textgen.logic import import_texts

morph = pymorphy.get_morph(textgen_settings.PYMORPHY_DICTS_DIRECTORY)

import_texts(
    morph,
    source_dir=os.path.join(APP_DIR, "fixtures", "texts_src"),
    tech_vocabulary_path=os.path.join(APP_DIR, "fixtures", "vocabulary.json"),
    voc_storage="/tmp/voc.json",
    dict_storage="/tmp/dict.json",
    print_undefined_words=True,
)
Example no. 40
	def __init__(self):
		super(Morph, self).__init__()

		self.morph = pymorphy.get_morph(self.dicts_dir, self.dicts_type)
Example no. 41
# -*- coding: utf-8 -*-
from south.utils import datetime_utils as datetime
from south.db import db
from south.v2 import DataMigration
from django.db import models
from lyrics.models import Song, IndexElement

# -----------------------------------------------------------------------------pymorphy
from pymorphy.contrib.tokenizers import extract_words
from pymorphy import get_morph                      # morphological analyzer: https://pythonhosted.org/pymorphy/intro.html
from ..pymorphy_dicts_dir import ret
morph = get_morph(ret())    # directory holding the pymorphy dictionaries


class Migration(DataMigration):

    def forwards(self, orm):
        "Write your forwards methods here."
        for song in Song.objects.all():
            to_write = list()
            try:
                for i, word in enumerate(extract_words(song.lyrics)):
                    for term in morph.normalize(word.upper()):
                        to_write.append('1 ' + str(i) + " " + term)
            except TypeError:
                pass
            try:
                for i, word in enumerate(extract_words(song.artist)):
                    for term in morph.normalize(word.upper()):
                        to_write.append('2 ' + str(i) + " " + term)
            except TypeError:
Example no. 42
import re
import pymorphy
import pymorphy.utils

text = u'''
Сяпала Калуша по напушке и увазила бутявку. И волит:
— Калушата, калушаточки! Бутявка!
Калушата присяпали и бутявку стрямкали. И подудонились.
А Калуша волит:
— Оее, оее! Бутявка-то некузявая!
Калушата бутявку вычучили.
Бутявка вздребезнулась, сопритюкнулась и усяпала с напушки.
А Калуша волит:
— Бутявок не трямкают. Бутявки дюбые и зюмо-зюмо некузявые. От бутявок дудонятся.
А бутявка волит за напушкой:
— Калушата подудонились! Калушата подудонились! Зюмо некузявые! Пуськи бятые!
'''

r = re.compile('[\W+-]', re.U)
words = r.split(text.upper())

# set the path to the dictionary folder here
morph = pymorphy.get_morph('dicts/converted/ru')

for word in words:
    if word:
        print word
        info = morph.get_graminfo(word)
        for form in info:
            pymorphy.utils.pprint(form)
Example no. 43
 def __init__(self, dbname = 'crawler_db'):
     self.conn = MySQLdb.connect(user = '******', db = dbname, passwd = 'crawler', unix_socket = '/var/run/mysqld/mysqld.sock')
     self.cursor = self.conn.cursor()
     self.morph = get_morph('dicts')
Example no. 44
import nltk
import string
import pymorphy
#from pymorphy.contrib import tokenizers
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from collections import Counter

import numpy as np
import cv2

corpus_dir = 'corpus_1836'
#corpus_dir = 'poems'
CLUSTERS_COUNT = 2
morph = pymorphy.get_morph('C:/DB/ru.sqlite-json')
stop_words = stopwords.words('russian')
stop_words.extend(['что', 'это', 'так', 'вот', 'быть', 'как'])
corpus = {}
res_txt = []

def tokenizeMe(sentence):
    all_words = []
    #sentence = "меня! это,. что? похоже; на - пирамиду"
    #sentence = sentence.encode('utf-8').upper()
    #sentence = sentence.upper()
    #print(sentence)
    #
    #words = tokenizers.extract_words(sentence)
    #print(words)
Example no. 45
# -*- coding: utf-8 -*-
import nltk
import string
import pymorphy
#from nltk.corpus import stopwords
#import os
import pymysql

morph = pymorphy.get_morph('K:\databases\BAR\pymorfy')

file = open('pushkin/pushkin_aleksandr_polnoe_sobranie_stihotvoreniy.txt', 'r')
text = file.read()
file.close()
text = text.decode('utf-8')

#tokenization
tokens = nltk.word_tokenize(text)
tokens = [i for i in tokens if (i not in string.punctuation)]

#making dictionary
pushkin_dict = {}
for i in tokens:
    key = i.replace('.', '')
    key = key.replace(u"«", '')
    key = key.replace(u'»', '')
    key = key.replace(u'…', '')
    key = key.lower()
    pushkin_dict[key] = 0

#put all in mysql
conn = pymysql.connect(host='127.0.0.1',
Example no. 46
import os
import sys
import time
from datetime import datetime
from pymorphy import get_morph
from dater import Dater
from tagger import Tagger

filename = os.path.join(os.path.dirname(sys.argv[0]), "corpora/somecorpus.txt")
traincorpus = os.path.join(os.path.dirname(sys.argv[0]),
                           "dicts/ruscorpora.txt.lemma")
trainsuffcorpus = traincorpus + ".suffs"

print "STARTED:", str(datetime.now())
start = time.time()

morph = get_morph(
    os.path.join(os.path.dirname(sys.argv[0]),
                 "pydicts").decode("UTF8"))  # load the Russian dictionary
morph_simple = get_morph(os.path.join(os.path.dirname(sys.argv[0]),
                                      "pydicts").decode("UTF8"),
                         check_prefixes=False)  # load the Russian dictionary
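# check_prefixes=False presumably turns off pymorphy's prefix-stripping guesses
# for out-of-dictionary words, trading coverage for speed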
# Load the date handler
dater = Dater()
# Load the tagger
tagger = Tagger(morph, morph_simple, dater)
# Load the suffix statistics for the tagger
tagger.load_statistics(traincorpus, trainsuffcorpus)
# Lemmatize chunk by chunk
tagger.parse_chunks(filename, sent_marks=True)

print "FINISHED:", str(datetime.now())
print "Time elapsed: ", time.time() - start
Example no. 47
# -*- coding: cp1251 -*-
from pymorphy import get_morph # import the required modules
from pymorphy.contrib import tokenizers

morph = get_morph('C:/Python27/ru.sqlite-json/') # create a pymorphy.Morph object

text = (raw_input("Входные данные:  ")).decode('cp1251')

d = {} # dictionary of word lists that share one normal form
a = set() # set that keeps normal forms unique
part_d = {} # dictionary for each word's grammatical information
part_a = set() # set that keeps words unique

for word in tokenizers.extract_tokens(text): # extract tokens to learn each word's part of speech
    if word.isalpha() == True: # filter out punctuation
        info = morph.get_graminfo(word.upper()) # look up the word's grammatical information
        part = info[0]['class'] # the word's part of speech
        gram_info = info[0]['info'] # the word's grammatical information
        nf = info[0]['norm'] # the word's normal form
        print str('{0}({1})').format(word.encode('cp1251'), part.encode('cp1251')),
        # consider only nouns, verbs and adjectives
        if part == u'С' or part == u'Г' or part == u'П' or part == u'ИНФИНИТИВ':
            # do not pick a synonym for a proper name
            if u'имя' in info[0]['info']: name = 1
            else:
                len_ = len(a)
                a.add(nf)
                if len_ == len(a): # does this normal form already exist?
                    l = d[nf]      # if so, append the word to the end of its list
                    l.append(word)
                    d[nf] = l
Example no. 48
#-*- coding: UTF-8

import littlebrother.ident.config as ident_config
import os.path
import pymorphy

db_encoding = ident_config.geo_db.get('encoding', 'UTF-16')
cities_db = ident_config.geo_db.get('cities', 'cities_ru/db/cities.shelve')
regions_db = ident_config.geo_db.get('regions', 'cities_ru/db/regions.shelve')
world_db = ident_config.geo_db.get('world', 'cities_ru/db/db/regions.shelve')

cities_db = os.path.join('..', cities_db)
regions_db = os.path.join('..', regions_db)
world_db = os.path.join('..', world_db)

morph = pymorphy.get_morph(
	os.path.join('..', ident_config.dicts.get('path', 'dicts/ru/shelve45')), 
	ident_config.dicts.get('backend', 'shelve'))

Example no. 49
from django import template
from django.conf import settings
from pymorphy import get_morph

register = template.Library()
morph = get_morph(settings.PYMORPHY_DICTS['ru']['dir'], 'cdb')

@register.filter
def plural_from_object(source, obj):
    l = len(obj[0])
    if l == 1:
        return source
    return morph.pluralize_inflected_ru(source.upper(), l).lower()
Example no. 50
# -*- coding: utf-8 -*-
from south.utils import datetime_utils as datetime
from south.db import db
from south.v2 import DataMigration
from django.db import models
import os
import re
from lyrics.models import Song, IndexElement

# -----------------------------------------------------------------------------pymorphy
from pymorphy.contrib import tokenizers
from pymorphy import get_morph                      # morphological analyzer: https://pythonhosted.org/pymorphy/intro.html
morph = get_morph('/home/antre/pymorphy_dicts/')    # directory holding the pymorphy dictionaries

DIR_OF_COLLECT_DATA = os.path.dirname(os.path.abspath(__file__))[:-17] + "collect_data/"
FILE_WITH_SYNONYMS = DIR_OF_COLLECT_DATA + "synonims.txt"


class Migration(DataMigration):

    def forwards(self, orm):

        fi = open(FILE_WITH_SYNONYMS, "rb")
        for line in fi:
            try:
                term, synonyms = line.decode("cp1251")[:-2].upper().split('|', 2)
                print term
            except ValueError:
                continue
            synonyms = synonyms.split('.')[0]                   # cut off antonyms and the like
            synonyms = re.sub('\([^)]*\)', '()', synonyms)      # drop parenthesized words