def normalisation(new_data):
    """Lemmatize every text in *new_data* with Mystem.

    A Mystem instance is created and started for this call. Each item is
    passed to ``lemmatize``; the returned lemmas are joined back into a
    single string and all newline characters are removed from it.

    Args:
        new_data: iterable of texts to lemmatize. It does not need to
            support ``len()``; progress is displayed through tqdm with the
            label "normalisation".

    Returns:
        A list with one lemmatized string per input item, in input order.
    """
    m = Mystem()
    m.start()
    return [
        ''.join(m.lemmatize(text)).replace("\n", "")
        for text in tqdm(new_data, desc="normalisation")
    ]
Beispiel #2
0
import torch
from pymystem3 import Mystem

from itertools import count

import onmt.io
import onmt.translate
import onmt
import onmt.ModelConstructor
import onmt.modules
import sys

from onmt.io.IO import build_dataset_request

# Module-level Mystem instance; start() is called here, when the module is
# first executed.
m = Mystem()
m.start()

opt = Namespace(
    alpha=0.0,
    attn_debug=False,
    batch_size=1,
    beam_size=10,
    beta=-0.0,
    data_type='text',
    dump_beam='',
    dynamic_dict=False,
    gpu=-1,
    max_length=100,
    max_sent_length=None,
    min_length=0,
    model=os.path.dirname(os.path.abspath(__file__)) +
import re, os
import gensim
from gensim import corpora, models
import nltk
from nltk import FreqDist
from nltk.collocations import *
from pymystem3 import Mystem
from stop_words import get_stop_words

# Stop-word collection returned by stop_words for the 'ru' language code.
ru_stop = get_stop_words('ru')

# Module-level Mystem instance; start() is called here, when the module is
# first executed.
mystem_object = Mystem()
mystem_object.start()

# Punctuation pattern text. Written as a raw string so the backslash before
# "$" is a literal backslash instead of an invalid escape sequence; the
# resulting string value is unchanged.
# NOTE(review): if this is used as a regex, the character class closes at the
# first "]", leaving the trailing "*-–—...]" outside it — confirm at the
# usage site that this is intended.
puncts = r"[«–»—!\$%&'()*+,./:;<=>?@^_`{|}~']*-–—...]"

# Extra Russian words (base forms).
# Presumably filtered out in addition to ru_stop — verify at the usage site.
extra_words = [
    "понимать",
    "знать",
    "хотеть",
    "глаз",
    "рука",
    "голова",
    "увидеть",
    "что-то",
    "смотреть",
    "нога",
    "свой",
    'видеть',
    'становиться',
    'остаться',
    'давать',
    'стоять',
    'оставаться',
    'оказываться',
    'думать',
]


#Fantasy
def processFileFantasy(file):
    doc = []
    with open(file, 'r', encoding='utf-8') as f:
        #print(file)
        text = f.read()
        #print(len(text))
        words = text.split()
        for word in words:
Beispiel #4
0
 def make_lemmantisation(text):
     """Return the Mystem lemmas of *text*.

     *text* is converted with ``str()`` before being handed to a freshly
     created and started Mystem instance; the result of its ``lemmatize``
     call is returned unchanged.
     """
     analyzer = Mystem()
     analyzer.start()
     return analyzer.lemmatize(str(text))
Beispiel #5
0
# Command-line interface of the "RE to CONLL" script. Arguments are parsed
# immediately at module level, so `args` is available to the code below.
parser = argparse.ArgumentParser(description='RE to CONLL')
# Path of the file that lists the regular expressions (read by build_re).
parser.add_argument('--re', type=str, help='REs to apply')
parser.add_argument('--data_dir', type=str, help='Folder with docs')
parser.add_argument('--file', type=str, help='Source file')
parser.add_argument('--lines', action='store_true', help='Lines as docs')
# Help text corrected from "BEOES" to match the flag name.
parser.add_argument('--bioes', action='store_true', help='Output BIOES encoding')
args = parser.parse_args()

def build_re():
	"""Compile the regular expressions listed in the file ``args.re``.

	The file is read as UTF-8, one pattern per line. Lines that start with
	'#' and lines consisting only of whitespace are skipped; every other
	line is stripped of surrounding whitespace and compiled with the
	UNICODE, MULTILINE and DOTALL flags.

	Returns:
		A list of compiled pattern objects, in file order.

	Raises:
		IOError/OSError: if the file cannot be opened.
		re.error: if a line is not a valid regular expression.
	"""
	# Local import: io.open decodes text identically on Python 2 and 3,
	# which replaces the Python-2-only ``str.decode('utf8')`` call.
	import io

	flags = re.U | re.M | re.S
	# The context manager closes the file; the original left it open.
	with io.open(args.re, encoding='utf8') as pattern_file:
		return [
			re.compile(line.strip(), flags=flags)
			for line in pattern_file
			if line and not line.startswith('#') and not line.isspace()
		]

from pymystem3 import Mystem
# Module-level Mystem instance created with grammar_info and disambiguation
# switched off; start() is called here, when the module is first executed.
mystem = Mystem(grammar_info=False, disambiguation=False)
mystem.start()

def parse_doc(mystem, text):
	morph_parse = mystem.analyze(text)
	current_pos = 0
	offsets = []
	lemmas = []
	words = []
	all_words = []
	for word_parse in morph_parse:
		word = word_parse['text']
		all_words.append(word)
		sword = word.strip(' ').replace('\n', u'\u2028')
		if re.search("\w", sword, flags=re.U):
			words.append(sword)
			analysis = word_parse.get('analysis')