Example #1
def wordforms(word):
    morph = MorphAnalyzer()
    lexeme = morph.parse(word)[0].lexeme
    return {form.word for form in lexeme}
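A minimal usage sketch, assuming MorphAnalyzer has been imported from pymorphy2 and the Russian dictionaries are installed; the exact set of forms depends on the dictionary version:

print(wordforms('стол'))  # e.g. {'стол', 'стола', 'столу', 'столом', ...}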
Example #2
def read_test_corpus(fn):
    m = MorphAnalyzer()
    for line in fn:
        line = line.rstrip('\n')
# we assume the text is already tokenized
#        line = word_tokenize(line)
        line = line.decode('utf-8').split()
# parse the words against the dictionary; take only the first pymorphy parse
        parses = [m.parse(token) for token in line]
        if line:
            yield [(p[0].word, p[0].tag) for p in parses]
def agree(w1, w2, t1, t2):
    if t1 == "comma" or t2 == "comma":
        return w1, w2

    morph = MorphAnalyzer()
    raw_cur_tags = morph.tag(w1)[-1]
    raw_next_tags = morph.tag(w2)[-1]

    cur_tags = re.findall(r"\w+", str(raw_cur_tags))
    next_tags = re.findall(r"\w+", str(raw_next_tags))

    if t1[:-2] == "person":
        if t2[:-2] == "verb_right":
            if morph.normal_forms(w2)[0] in dative_verbs:
                w1 = morph.parse(w1)[0].inflect({"datv"}).word

    if t1[:-2] == "verb_right":
        if t2[:-2] == "property":
            pass
        if t2[:-2] == "person":
            if cur_tags[3] == "tran":
                w2 = morph.parse(w2)[0].inflect({"accs"}).word
            else:
                w2 = morph.parse(w2)[0].inflect({"nomn"}).word
                #gender with nomn only
                gender = next_tags[2]
                if gender == "inan":
                    gender = next_tags[3]
                w1 = morph.parse(w1)[0].inflect({gender}).word

    if t1[:-2] == "adjective":
        if t2[:-2] == "property":
            #gender
            gender = next_tags[2]
            if gender == "inan":
                gender = next_tags[3]
            try:
                w1 = morph.parse(w1)[0].inflect({gender}).word
            except Exception:
                print("f**k")
                print(w1, w2)

    if t1[:-2] == "property":
        if t2[:-2] == "person":
            pass
        if t2[:-2] == "adjective":
            gender = cur_tags[2]
            if gender == "inan":
                gender = cur_tags[3]
            try:
                w2 = morph.parse(w2)[0].inflect({gender}).word
            except Exception:
                print("f**k")
                print(w1, w2)


    #w1 = morph.parse(w1)[0].inflect({}).word
    return w1, w2
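For reference, a minimal, self-contained sketch of the pymorphy2 calls the function above relies on (parse, inflect, normal_forms); the word and grammemes are illustrative only:

from pymorphy2 import MorphAnalyzer

morph = MorphAnalyzer()
best = morph.parse('кошка')[0]             # best-ranked parse of the word
print(best.inflect({'datv'}).word)         # dative form, e.g. 'кошке'
print(best.inflect({'accs'}).word)         # accusative form, e.g. 'кошку'
print(morph.normal_forms('кошке')[0])      # lemma, e.g. 'кошка'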
Example #4
class MorphTest(unittest.TestCase):
    def __init__(self, document_vector):
        self.document = None
        self.documents = document_vector
        self.morph = MorphAnalyzer()

    # def setUp(self):
    #     self.document = documents[randint(0, len(documents))]

    def testMorph(self):
        self.document = self.document if self.document is not None else self.documents[0]
        morph_array = [self.morph.parse(word)[0].normal_form for word in self.document]
        print(morph_array)
        self.assertTrue(True, msg=None)
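Because the constructor does not follow the standard unittest signature, the test is presumably driven by hand; a hedged sketch with a placeholder tokenized document:

documents = [['мама', 'мыла', 'раму']]  # placeholder tokenized documents
test = MorphTest(documents)
test.testMorph()  # prints the normal forms of the first document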
Example #5
class MorphAnalyzer(object):
    def __init__(self):
        self.raw = PymorphyAnalyzer()

    def check_gram(self, gram):
        if not self.raw.TagClass.grammeme_is_known(gram):
            raise ValueError(gram)

    def __call__(self, word):
        records = self.raw.parse(word)
        return [prepare_form(_) for _ in records]

    def normalized(self, word):
        return {_.normalized for _ in self(word)}
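A hedged usage sketch; PymorphyAnalyzer is presumably an alias for pymorphy2.MorphAnalyzer, and prepare_form is defined elsewhere in the source project:

analyzer = MorphAnalyzer()
analyzer.check_gram('NOUN')          # passes silently for a known grammeme
print(analyzer.normalized('столы'))  # set of normalized values produced by prepare_form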
Example #6
class Analyzer:
    """
    Analyzes the incoming text, parses each word into lexemes, and removes punctuation and everything except nouns,
    verbs, and adjectives, as well as words from the forbidden-word list. Returns the 10 most frequent of the remaining words.
    """

    def __init__(self, text_array):
        self.text_array = text_array
        self.morph = MorphAnalyzer()
        # Compiled with the help of glvrd.ru, going through all 20 * 1700 words together and by hand
        self.trash_list = \
            {"она", "они", "что", "это", "быть", "аплодисменты", "этот", "как", "если", "быть", "если", "для", "все",
             "этот", "чтобы", "так", "для", "который", "тот", "такой", "мой", "смех", "красивый", "дорогой", "уютный",
             "роскошный", "активный", "школа", "должный", "сделать", "наш", "мочь", "один", "весь", "свой", "речь",
             "человек", "слайд", "разный", "хотеть", "промышленность", "пытаться", "хороший", "позволять", "ваш",
             "решать", "общий", "продажа", "модуль", "множество", "оставлять", "важный", "решение", "заниматься",
             "служить", "реальность", "самка", "самец", "проводить", "известный", "таинственность", "быстрый",
             "большинство", "позволять", "обучение", "население", "настоящий", "необходимо", "любой", "большой",
             "форма", "успешный", "обычный", "оказываться", "высокий", "потрясающий", "богатый", "документ", "мелкий",
             "оказывать", "возможность", "простой", "крупный", "колония", "система", "реальный", "плохой", "мечтание",
             "огромный", "электрический", "ландшафт", "изломанность", "интерактивный", "суть", "позволять", "наличие",
             "иметься", "проводить", "обычный", "мощный", "аналогия", "различный", "самый", "эффективность", "низкий",
             "реальность", "определенный", "являться", "пользование", "исторический", "элементарный", "обеспечение",
             "наблюдаться", "обладать", "важный", "известняк", "хотеться", "продолжать", "год", "время", "мир", "жизнь",
             "дело", "проблема", "ребенок", "вопрос", "день", "друг", "работа", "идея", "история", "место", "часть",
             "вещь", "страна", "технология", "раз", "женщина", "слово", "вода", "вид", "проект", "информация", "мозг",
             "земля", "миллион", "город", "исследование", "помощь", "компания", "образ", "рука", "результат", "момент",
             "конец", "пример", "доллар", "дом", "книга", "музыка", "машина", "сторона", "случай", "процесс", "группа",
             "способ", "мужчина", "уровень", "тысяча", "интернет", "деньги", "семья", "компьютер", "энергия", "видео",
             "программа", "свет", "модель", "сила", "планета", "клетка", "движение", "тело", "наука", "общество",
             "язык", "фотография", "причина", "война", "пациент", "неделя", "миллиард", "будущее", "сеть", "точка",
             "сша", "игра", "отец", "природа", "изменение", "фильм", "цель", "устройство", "образование", "материал",
             "путь", "глаз", "студент", "африка", "отношение", "правительство", "болезнь", "связь", "количество",
             "звук", "парень", "искусство", "пространство", "организация", "ответ", "лицо", "час", "дизайн", "право",
             "поведение", "эксперимент", "лечение", "индия", "месяц", "мама", "карта", "мать", "здание", "изображение",
             "океан", "родитель", "внимание", "улица", "продукт", "развитие", "песня", "структура", "рынок", "процент",
             "голова", "минута", "чувство", "нога", "пара", "объект", "создание", "закон", "учитель", "действие"}

    def start(self):
        res = list(filter(
            lambda x: len(x) > 2 and self.pymorphy_analyze(x) and re.match("[а-яА-Я]", x) and x not in self.trash_list,
            self.text_array))
        return [x[0] for x in Counter(res).most_common(10)]

    def pymorphy_analyze(self, word):
        lexem = self.morph.parse(word)
        x = lexem[0].tag.POS
        if x == ("NOUN" or "ADJF" or "INFN"):
            return True
        return False
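A hedged usage sketch; the token list is a placeholder and assumes re, collections.Counter and pymorphy2.MorphAnalyzer are imported as in the source module:

tokens = ['кошка', 'кошка', 'собака', 'бежать', 'окно', 'окно', 'окно']  # placeholder tokens
analyzer = Analyzer(tokens)
print(analyzer.start())  # up to 10 most frequent nouns/adjectives/infinitives not in trash_list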
Example #7
def read_tab_corpus(inc):
    m = MorphAnalyzer()
    sent = []
    for t in inc:
        t = t.rstrip().decode('utf-8')
        if not t:
            continue
        if t == u'sent':
            sent = []
            continue
        if t == u'/sent' or t == u'SENT':
            sent = [x[0] for x in sent]
            parses = [m.parse(token) for token in sent]
            if sent:
                yield [(p[0].word, p[0].tag) for p in parses]
            continue
        t = t.split('\t')
        try:
            token = (t[1], ' '.join(t[2].split(' ')[2:]))
            sent.append(token)
        except IndexError:
            continue
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 18 13:38:13 2019

@author: dan
"""


def is_formula(token):
    tags = {''}
    for var in token:
        tags |= var.tag.grammemes
    if {'LATN', 'UNKN'} & tags:
        chars = set(token[0].word)
        return chars
    else:
        return 0


if __name__ == '__main__':
    from pymorphy2 import MorphAnalyzer
    mo = MorphAnalyzer()
    for x in [
            '1/r²', '2*2=4', r'\frac{2,2}', 'help', 'what is love',
            'лингви́стика'
    ]:
        print(x, is_formula(mo.parse(x)))
Example #9
class Event():
	"""
	Event object - class for working with event candidates.
	Collects all data on event candidate, stores it between clustering slices; merges slices, if required.
	TBD: constructs and saves description, scores texts and media, scores and describes the event itself
	(probability that the candidate is real, event buzz, event category).

	Attributes:
		self.created (datetime): creation timestamp
		self.updated (datetime): last update timestamp
		self.start (datetime): timestamp of the first message in the self.messages dict
		self.end (datetime): timestamp of the last message in the self.messages dict
		self.messages (Dict[dict]): raw tweets from database, enriched with weight, is_core params (on init), tokens (after add_stem_texts)
		self.media (Dict[dict]): raw media objects from database
		self.cores (Dict[list]): tokens, that form the most common vocabulary for the event; computed in create_core() method
		self.entropy (float): entropy for authorship: 0 for mono-authored cluster; computed in event_summary_stats() method
		self.ppa (float): average number of posts per one author; computed in event_summary_stats() method
		self.authors (int): number of unique authors for event
		self.most_active_author (float): share of messages, written by one (most active author)
		self.authors_share (float): number of authors divided by number of messages
		self.relevant_messages_share (float): share of messages with token_score above zero
		self.duration (int): total seconds from self.start to self.end
		self.classifier (Object): classifier for deciding, whether event is real
		self.validity (bool): Classifier verdict, whether event is real or not
		self.verification (bool): Manual verification of event quality

	Methods:
		self.event_update: commands to calculate all data on event, based on messages and media
		self.is_successor: checks whether the current event has common messages with the specified event slice
		self.is_valid: method for the classifier to determine whether the event is a real event and not a random collection of messages
		self.classifier_row: unified method for creating the classifier data row
		self.merge: merge current event with another event, update stat attributes
		self.add_slice: add messages and media to the event, recompute statistics
		self.load / self.dump: serialize/deserialize event and put/get it to Redis
		self.backup / self.restore: dump/restore event to/from MySQL long-term storage
		self.get_messages_data: get MySQL data for messages ids
		self.get_media_data: get MySQL data for media using existing messages ids
		self.event_summary_stats: calculate statistics and start/end time for event
		self.add_stem_texts: add tokens lists to self.messages
		self.create_core: create vocabulary of most important words for the event
		self.score_messages_by_text: method calculates token_score for messages. TF/IDF likelihood with core is used

	Message keys:
		cluster (int): legacy from DBSCAN - number of cluster (event ancestor)
		id (str): DB message id; unique
		is_core (bool): True, if tweet belongs to the core of ancestor cluster
		iscopy (int): 1, if message is shared from another network
		lat (float): latitude
		lng (float): longitude
		network (int): 2 for Instagram, 1 for Twitter, 3 for VKontakte
		text (str): raw text of the message
		tokens (Set[str]): collection of stemmed tokens from raw text; created in add_stem_texts()
		tstamp (datetime): 'created at' timestamp
		user (int): user id, absolutely unique for one network, but matches between networks are possible
		token_score (float): agreement estimation with average cluster text
		weight (float): standard deviations below average
	"""

	def __init__(self, mysql_con, redis_con, tokenizer = None, morph = None, classifier = None, points = []):
		"""
		Initialization.

		Args:
			mysql_con (PySQLPoolConnection): MySQL connection Object
			redis_con (StrictRedis): RedisDB connection Object
			tokenizer (NLTK.TreebankWordTokenizer): object to split tweets into words
			morph (pymorphy2.MorphAnalyzer): word analyzer - converts word tokens to normalized form. Requires a lot of memory, so it is not created for every event object.
			classifier (Object): scikit trained classifier to detect real and fake events
			points (list[dict]): raw messages from event detector
		"""
		self.mysql = mysql_con
		self.redis = redis_con

		if morph:
			self.morph = morph
		else:
			self.morph = MorphAnalyzer()
		if tokenizer:
			self.tokenizer = tokenizer
		else:
			self.tokenizer = TreebankWordTokenizer()
		self.word = compile(r'^\w+$', flags = UNICODE | IGNORECASE)
		self.url_re = compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

		self.validity = None
		self.verification = None
		self.cores = {}
		self.classifier = classifier

		if points:
			self.id = str(uuid4())
			self.created = datetime.now()
			self.updated = datetime.now()

			self.messages = { x['id']:x for x in points }
			self.get_messages_data()
			self.media = {}
			self.get_media_data()
			self.event_update()

	def __str__(self):
		txt = "<Event {}: {} msgs [{} -- {}]>".format(self.id, len(self.messages), self.start.strftime("%Y-%m-%d %H:%M"), self.end.strftime("%H:%M"))
		return txt

	def __unicode__(self):
		return unicode(self.__str__())

	def __repr__(self):
		return self.__str__()

	def event_update(self):
		"""
		Commands to calculate all data on event, based on messages and media.
		"""
		self.add_stem_texts()
		self.create_core(deviation_threshold = 1)
		self.create_core(deviation_threshold = 2)
		self.create_core(deviation_threshold = 3)
		self.score_messages_by_text()
		self.event_summary_stats()
		self.is_valid()

	def is_successor(self, slice_ids, sim_index = 0.3, only_relevant = True):
		"""
		Method checks whether the current event has common messages with the specified event slice.

		Args:
			slice_ids (Set): set of message ids to compare with
			sim_index (float): minimal share of messages that should match in slice to be detected as a successor
			only_relevant (bool): use only messages with non-zero token_score (to exclude spam)
		"""
		if only_relevant:
			event_ids = set([k for k, v in self.messages.items() if v['token_score'] > 0])
			if not event_ids:
				event_ids = set(self.messages.keys())
		else:
			event_ids = set(self.messages.keys())
		#if float(len(event_ids.intersection(slice_ids)))/len(event_ids.union(slice_ids)) >= jaccard:
		if float(len(event_ids.intersection(slice_ids)))/min((len(event_ids), len(slice_ids))) >= sim_index:
			return True
		return False
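		# Worked example with hypothetical ids: event_ids = {1, 2, 3, 4} and
		# slice_ids = {3, 4, 5} share two ids, and 2 / min(4, 3) ≈ 0.67 >= 0.3
		# (the default sim_index), so the event counts as a successor of that slice.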

	def is_valid(self):
		"""
		Method for the classifier to determine whether the event is a real event and not a random collection of messages.
		"""
		if self.validity:
			return True
		if self.classifier:
			self.validity = bool(self.classifier.predict([self.classifier_row()])[0])
		return self.validity

	def classifier_row(self):
		"""
		Unified method for creating the classifier data row. Every variable used in prediction is listed here, and only here.
		"""
		row = [
			len(self.messages.values()), 
			len(self.media.values()), 
			self.authors, 
			self.most_active_author, 
			self.authors_share, 
			self.entropy, 
			self.ppa, 
			self.relevant_messages_share, 
			self.duration
		]
		return row

	def merge(self, other_event):
		"""
		Method merges the current event with another event and updates stat attributes.

		Args:
			other_event (Event): another event object - to merge with
		"""
		self.messages.update(other_event.messages)
		self.media.update(other_event.media)
		self.event_update()
		self.updated = datetime.now()
		self.created = min((self.created, other_event.created))

	def add_slice(self, new_slice):
		"""
		Method adds messages and media to the event and recomputes statistics.

		Args:
			new_slice (List[dict]): initial list with messages to be added
		"""
		self.messages.update({ x['id']:x for x in new_slice })
		self.get_messages_data([x['id'] for x in new_slice])
		self.get_media_data([x['id'] for x in new_slice])
		self.event_update()
		self.updated = datetime.now()

	def backup(self):
		"""
		Method dumps event to MySQL long-term storage, used for non-evaluating events.
		"""
		if self.verification is None:
			ver = 'NULL'
		else:
			ver = int(self.verification)
		if self.validity is None:
			val = 'NULL'
		else:
			val = int(self.validity)
		msg_string = self.pack()
		q = b'''INSERT INTO events(id, start, end, msgs, description, dumps, verification, validity) VALUES ("{}", "{}", "{}", {}, "{}", "{}", {}, {}) ON DUPLICATE KEY UPDATE `start`=VALUES(`start`), `end`=VALUES(`end`), `msgs`=VALUES(`msgs`), `description`=VALUES(`description`), `dumps`=VALUES(`dumps`), `verification`=VALUES(`verification`), `validity`=VALUES(`validity`);'''.format(self.id, self.start, self.end, len(self.messages.keys()), escape_string(', '.join([x.encode('utf-8') for x in self.cores[2]])), escape_string(msg_string), ver, val)
		exec_mysql(q, self.mysql)
		self.redis.delete("event:{}".format(self.id))

	def restore(self, event_id):
		"""
		Method restores event from MySQL table using event_id parameter.

		Args:
			event_id (str): unique event identifier
		"""
		q = '''SELECT dumps FROM events WHERE id="{}"'''.format(event_id)
		event_data = exec_mysql(q, self.mysql)[0][0]['dumps']
		self.unpack(event_data)

	def load(self, event_id, redis_prefix='event'):
		"""
		Method for deserializing and loading event from Redis database.

		Args:
			event_id (str): unique event identifier
			redis_prefix (str): prefix used in Redis database
		"""
		try:
			event_data = self.redis.hget('{}:{}'.format(redis_prefix, event_id), 'dumps')
		except ResponseError:
			event_data = self.redis.get('{}:{}'.format(redis_prefix, event_id))
		self.unpack(event_data)

	def dump(self, redis_prefix='event'):
		"""
		Method for serializing and dumping event to Redis database.

		Args:
			redis_prefix (str): prefix to use, when storing new key in Redis database
		"""
		if self.verification is None:
			ver = 'NULL'
		else:
			ver = int(self.verification)
		if self.validity is None:
			val = 'NULL'
		else:
			val = int(self.validity)
		msg_string = self.pack()
		event = {'start':self.start.strftime("%Y-%m-%d %H:%M:%S"), 'end':self.end.strftime("%Y-%m-%d %H:%M:%S"), 'msgs':len(self.messages.keys()), 'description':', '.join([x.encode('utf-8') for x in self.cores[2]]), 'dumps':msg_string, 'verification':ver, 'validity':val}
		self.redis.hmset("{}:{}".format(redis_prefix, self.id), event)

	def pack(self, complete=False):
		"""
		Method for serializing event to string.

		Args:
			complete (bool): whether to pack all available data for the event (full-text messages, media links, and cores).
		"""
		todump = {
			'id':self.id,
			'created':int(mktime(self.created.timetuple())),
			'updated':int(mktime(self.updated.timetuple())),
			'verification':self.verification,
			'messages':[{'id':x['id'], 'is_core':x.get('is_core'), 'token_score':x.get('token_score'), 'weight':x.get('weight')} for x in self.messages.values()]
		}

		if complete:
			todump['media'] = self.media
			todump['validity'] = self.validity
			for i in range(len(todump['messages'])):
				msg = self.messages[todump['messages'][i]['id']]
				todump['messages'][i].update({'iscopy':msg['iscopy'], 'lat':msg['lat'], 'lng':msg['lng'], 'network':msg['network'], 'text':msg['text'], 'tstamp':int(mktime(msg['tstamp'].timetuple())), 'user':msg['user']})
		return packb(todump)

	def unpack(self, data, complete=False):
		"""
		Method for deserializing event from string. msgpack lib is used (considered to be faster than pickle).

		Args:
			data (str): msgpack dump of event-required parameters.
			complete (bool): whether to unpack all available data for the event (full-text messages, media links, and cores), or compute these parameters on the fly.
		"""
		data = unpackb(data)
		self.id = data['id']
		self.created = datetime.fromtimestamp(data['created'])
		self.updated = datetime.fromtimestamp(data['updated'])
		self.verification = data['verification']
		self.messages = {x['id']:x for x in data['messages']}

		if complete:
			self.validity = data['validity']
			self.media = data['media']
			for k in self.messages.keys():
				self.messages[k]['tstamp'] = datetime.fromtimestamp(self.messages[k]['tstamp'])

		else:
			self.get_messages_data()
			self.media = {}
			self.get_media_data()

		self.event_update()

	def get_messages_data(self, ids=None):
		"""
		Method loads MySQL data for message ids and adds it to the self.messages attribute.

		Args:
			ids (List[str]): list of messages ids to load. If not provided, all ids from self.messages are used 
		"""
		if not ids:
			ids = [x['id'] for x in self.messages.values()]
		q = '''SELECT * FROM tweets WHERE id in ({});'''.format(','.join(['"'+str(x)+'"' for x in ids]))
		data = exec_mysql(q, self.mysql)[0]
		for item in data:
			self.messages[item['id']].update(item)

	def get_media_data(self, ids=None):
		"""
		Method loads MySQL data for media using existing message ids and adds it to the self.media attribute.

		Args:
			ids (List[str]): list of messages ids to load. If not provided, all ids from self.messages are used 
		"""
		if not ids:
			ids = [x['id'] for x in self.messages.values()]
		q = '''SELECT * FROM media WHERE tweet_id in ({});'''.format(','.join(['"'+str(x)+'"' for x in ids]))
		data = exec_mysql(q, self.mysql)[0]
		for item in data:
			self.media[item['id']] = item

	def event_summary_stats(self):
		"""
		Method calculates several statistics, updates self.start and self.end timestamps.
		"""
		authorship_stats = [len(tuple(i[1])) for i in groupby(sorted(self.messages.values(), key=lambda x:x['user']), lambda z: z['user'])]
		self.authors = len(authorship_stats)
		self.most_active_author = max(authorship_stats)/float(len(self.messages.values()))
		self.authors_share = float(self.authors)/len(self.messages.values())
		self.entropy = entropy(authorship_stats)
		self.ppa = mean(authorship_stats)
		self.relevant_messages_share = float(len([x for x in self.messages.values() if x['token_score'] > 0]))/len(self.messages.values())
		self.start = min([x['tstamp'] for x in self.messages.values()])
		self.end = max([x['tstamp'] for x in self.messages.values()])
		self.duration = int((self.end - self.start).total_seconds())

	def add_stem_texts(self):
		"""
		Method adds tokens lists to self.messages.
		"""
		for i in self.messages.keys():
			if 'tokens' not in self.messages[i].keys():
				txt = self.messages[i].get('text', '')
				txt = sub(self.url_re, '', txt)
				self.messages[i]['tokens'] = {self.morph.parse(token.decode('utf-8'))[0].normal_form for token in self.tokenizer.tokenize(txt) if match(self.word, token.decode('utf-8'))}

	def create_core(self, deviation_threshold=2, min_token=3):
		"""
		Method creates the core of important words for the event.

		Args:
			deviation_threshold (int): number of standard deviations that separate core tokens from average tokens
			min_token (int): minimal length of token, to exclude prepositions/conjunctions
		"""
		texts_by_authors = [set().union(*[msg['tokens'] for msg in list(y[1])]) for y in groupby(sorted(self.messages.values(), key=lambda x:x['user']), lambda z:z['user'])]
		top_words = {}
		for doc in texts_by_authors:
			for token in doc:
				if len(token) >= min_token:
					try:
						top_words[token] += 1
					except KeyError:
						top_words[token] = 1
		th_vals = [x[1] for x in top_words.items()]
		threshold = mean(th_vals) + deviation_threshold * std(th_vals)
		self.cores[deviation_threshold] = [k for k,v in top_words.items() if v > threshold]

	def score_messages_by_text(self, deviation_threshold=2):
		"""
		Method calculates token_score parameter for self.messages.

		Args:
			deviation_threshold (int): number of standard deviations that separate core tokens from average tokens
		"""
		texts = [x['tokens'] for x in self.messages.values()]
		if not sum([bool(x) for x in texts]) or len(set([frozenset(x) for x in texts])) == 1:
			for k in self.messages.keys():
				self.messages[k]['token_score'] = 0
			return
		dictionary = Dictionary(texts)
		corpus = [dictionary.doc2bow(text) for text in texts]
		tfidf = TfidfModel(corpus, id2word=dictionary)
		index = MatrixSimilarity(tfidf[corpus])
		try:
			scores = index[dictionary.doc2bow(self.cores[deviation_threshold])]
		except IndexError:
			error('Index error in token scoring for event {}'.format(self.id))
			scores = [0]*len(self.messages.values())
		for i in range(len(scores)):
			self.messages.values()[i]['token_score'] = float(scores[i])
class PymorphyVectorizer(WordIndexVectorizer):
    """
    Transforms russian words into 0-1 vector of its possible Universal Dependencies tags.
    Tags are obtained using Pymorphy analyzer (pymorphy2.readthedocs.io)
    and transformed to UD2.0 format using russian-tagsets library (https://github.com/kmike/russian-tagsets).
    All UD2.0 tags that are compatible with produced tags are memorized.
    The list of possible Universal Dependencies tags is read from a file,
    which contains all the labels that occur in UD2.0 SynTagRus dataset.

    Args:
        save_path: path to save the tags list,
        load_path: path to load the list of tags,
        max_pymorphy_variants: maximal number of pymorphy parses to be used. If -1, all parses are used.
    """

    USELESS_KEYS = ["Abbr"]
    VALUE_MAP = {"Ptan": "Plur", "Brev": "Short"}

    def __init__(self,
                 save_path: str,
                 load_path: str,
                 max_pymorphy_variants: int = -1,
                 **kwargs) -> None:
        super().__init__(save_path, load_path, **kwargs)
        self.max_pymorphy_variants = max_pymorphy_variants
        self.load()
        self.memorized_word_indexes = dict()
        self.memorized_tag_indexes = dict()
        self.analyzer = MorphAnalyzer()
        self.converter = converters.converter('opencorpora-int', 'ud20')

    @property
    def dim(self):
        return len(self._t2i)

    def save(self) -> None:
        """Saves the dictionary to self.save_path"""
        with self.save_path.open("w", encoding="utf8") as fout:
            fout.write("\n".join(self._i2t))

    def load(self) -> None:
        """Loads the dictionary from self.load_path"""
        self._i2t = []
        with self.load_path.open("r", encoding="utf8") as fin:
            for line in fin:
                line = line.strip()
                if line == "":
                    continue
                self._i2t.append(line)
        self._t2i = {tag: i for i, tag in enumerate(self._i2t)}
        self._make_tag_trie()

    def _make_tag_trie(self):
        self._nodes = [defaultdict(dict)]
        self._start_nodes_for_pos = dict()
        self._data = [None]
        for tag, code in self._t2i.items():
            if "," in tag:
                pos, tag = tag.split(",", maxsplit=1)
                tag = sorted(
                    [tuple(elem.split("=")) for elem in tag.split("|")])
            else:
                pos, tag = tag, []
            start = self._start_nodes_for_pos.get(pos)
            if start is None:
                start = self._start_nodes_for_pos[pos] = len(self._nodes)
                self._nodes.append(defaultdict(dict))
                self._data.append(None)
            for key, value in tag:
                values_dict = self._nodes[start][key]
                child = values_dict.get(value)
                if child is None:
                    child = values_dict[value] = len(self._nodes)
                    self._nodes.append(defaultdict(dict))
                    self._data.append(None)
                start = child
            self._data[start] = code
        return self

    def find_compatible(self, tag: str) -> List[int]:
        """
        Transforms a Pymorphy tag to a list of indexes of compatible UD tags.

        Args:
            tag: input Pymorphy tag

        Returns:
            indexes of compatible UD tags
        """
        if " " in tag and "_" not in tag:
            pos, tag = tag.split(" ", maxsplit=1)
            tag = sorted([tuple(elem.split("=")) for elem in tag.split("|")])
        else:
            pos, tag = tag.split()[0], []
        if pos not in self._start_nodes_for_pos:
            return []
        tag = [(key, self.VALUE_MAP.get(value, value)) for key, value in tag
               if key not in self.USELESS_KEYS]
        if len(tag) > 0:
            curr_nodes = [(0, self._start_nodes_for_pos[pos])]
            final_nodes = []
        else:
            final_nodes = [self._start_nodes_for_pos[pos]]
            curr_nodes = []
        while len(curr_nodes) > 0:
            i, node_index = curr_nodes.pop()
            # key, value = tag[i]
            node = self._nodes[node_index]
            if len(node) == 0:
                final_nodes.append(node_index)
            for curr_key, curr_values_dict in node.items():
                curr_i, curr_node_index = i, node_index
                while curr_i < len(tag) and tag[curr_i][0] < curr_key:
                    curr_i += 1
                if curr_i == len(tag):
                    final_nodes.extend(curr_values_dict.values())
                    continue
                key, value = tag[curr_i]
                if curr_key < key:
                    for child in curr_values_dict.values():
                        curr_nodes.append((curr_i, child))
                else:
                    child = curr_values_dict.get(value)
                    if child is not None:
                        if curr_i < len(tag) - 1:
                            curr_nodes.append((curr_i + 1, child))
                        else:
                            final_nodes.append(child)
        answer = []
        while len(final_nodes) > 0:
            index = final_nodes.pop()
            if self._data[index] is not None:
                answer.append(self._data[index])
            for elem in self._nodes[index].values():
                final_nodes.extend(elem.values())
        return answer
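
    # Illustration (hypothetical values): a UD-style tag string produced by the
    # russian-tagsets converter, such as "NOUN Case=Nom|Gender=Masc|Number=Sing",
    # is split into its POS and feature pairs by find_compatible() and matched
    # against the trie; the returned list holds the indexes of every memorized
    # UD tag whose features do not contradict the input tag.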

    def _get_word_indexes(self, word):
        answer = self.memorized_word_indexes.get(word)
        if answer is None:
            parse = self.analyzer.parse(word)
            if self.max_pymorphy_variants > 0:
                parse = parse[:self.max_pymorphy_variants]
            tag_indexes = set()
            for elem in parse:
                tag_indexes.update(set(self._get_tag_indexes(elem.tag)))
            answer = self.memorized_word_indexes[word] = list(tag_indexes)
        return answer

    def _get_tag_indexes(self, pymorphy_tag):
        answer = self.memorized_tag_indexes.get(pymorphy_tag)
        if answer is None:
            tag = self.converter(str(pymorphy_tag))
            answer = self.memorized_tag_indexes[
                pymorphy_tag] = self.find_compatible(tag)
        return answer
Example #11
import json
import os
import re

from pymorphy2 import MorphAnalyzer


m = MorphAnalyzer()
lemma = lambda word: m.parse(word)[0].normal_form

def to_json(cont):
	return json.dumps(cont, ensure_ascii=False, indent='\t')


text_all = ''
k = 0

for i in os.listdir('data/history/'):
	if i[-4:] == 'json':
		try:
			k += 1
			with open('data/history/{}'.format(i), 'r') as file:
				for j in file:
					text = json.loads(j)['body'].strip()
					if text:
						text_all += text + '\n'
		except Exception:
			pass

print(k)
Example #12
class RNNMorphPredictor(Predictor):
    """
    RNN-based POS tagger.
    """
    def __init__(self,
                 language="ru",
                 eval_model_config_path: str=None,
                 eval_model_weights_path: str=None,
                 gram_dict_input: str=None,
                 gram_dict_output: str=None,
                 word_vocabulary: str=None,
                 char_set_path: str=None,
                 build_config: str=None):
        if eval_model_config_path is None:
            eval_model_config_path = MODELS_PATHS[language]["eval_model_config"]
        if eval_model_weights_path is None:
            eval_model_weights_path = MODELS_PATHS[language]["eval_model_weights"]
        if gram_dict_input is None:
            gram_dict_input = MODELS_PATHS[language]["gram_input"]
        if gram_dict_output is None:
            gram_dict_output = MODELS_PATHS[language]["gram_output"]
        if word_vocabulary is None:
            word_vocabulary = MODELS_PATHS[language]["word_vocabulary"]
        if char_set_path is None:
            char_set_path = MODELS_PATHS[language]["char_set"]
        if build_config is None:
            build_config = MODELS_PATHS[language]["build_config"]

        self.language = language
        self.converter = converters.converter('opencorpora-int', 'ud14') if language == "ru" else None
        self.morph = MorphAnalyzer() if language == "ru" else None
        if self.language == "en":
            nltk.download("wordnet")
            nltk.download('averaged_perceptron_tagger')
            nltk.download('universal_tagset')

        self.build_config = BuildModelConfig()
        self.build_config.load(build_config)

        self.model = LSTMMorphoAnalysis(language=language)
        self.model.prepare(gram_dict_input, gram_dict_output, word_vocabulary, char_set_path)
        self.model.load_eval(self.build_config, eval_model_config_path, eval_model_weights_path)

    def predict(self, words: List[str], include_all_forms: bool=False) -> List[WordFormOut]:
        words_probabilities = self.model.predict_probabilities([words], 1, self.build_config)[0]
        return self.__get_sentence_forms(words, words_probabilities, include_all_forms)

    def predict_sentences(self, sentences: List[List[str]], batch_size: int=64,
                          include_all_forms: bool=False) -> List[List[WordFormOut]]:
        sentences_probabilities = self.model.predict_probabilities(sentences, batch_size, self.build_config)
        answers = []
        for words, words_probabilities in zip(sentences, sentences_probabilities):
            answers.append(self.__get_sentence_forms(words, words_probabilities, include_all_forms))
        return answers

    def __get_sentence_forms(self, words: List[str], words_probabilities: List[List[float]],
                             include_all_forms: bool) -> List[WordFormOut]:
        """
        Get tags and forms.

        :param words: words.
        :param words_probabilities: tag probabilities for the words.
        :param include_all_forms: flag that includes all parse variants.
        :return: probabilities and forms for all word variants.
        """
        result = []
        for word, word_prob in zip(words, words_probabilities[-len(words):]):
            result.append(self.__compose_out_form(word, word_prob[1:], include_all_forms))
        return result

    def __compose_out_form(self, word: str, probabilities: List[float],
                           include_all_forms: bool) -> WordFormOut:
        """
        Build the output form from the tag index in the vectorizer and the word.

        :param word: word.
        :param probabilities: probabilities of the different forms.
        :param include_all_forms: flag that includes all parse variants.
        :return: form.
        """
        word_forms = None
        if self.language == "ru":
            word_forms = self.morph.parse(word)

        vectorizer = self.model.grammeme_vectorizer_output
        tag_num = int(np.argmax(probabilities))
        score = probabilities[tag_num]
        full_tag = vectorizer.get_name_by_index(tag_num)
        pos, tag = full_tag.split("#")[0], full_tag.split("#")[1]
        lemma = self.__get_lemma(word, pos, tag, word_forms)
        vector = np.array(vectorizer.get_vector(full_tag))
        result_form = WordFormOut(word=word, normal_form=lemma, pos=pos, tag=tag, vector=vector, score=score)

        if include_all_forms:
            weighted_vector = np.zeros_like(vector, dtype='float64')
            for tag_num, prob in enumerate(probabilities):
                full_tag = vectorizer.get_name_by_index(tag_num)
                pos, tag = full_tag.split("#")[0], full_tag.split("#")[1]
                lemma = self.__get_lemma(word, pos, tag, word_forms)
                vector = np.array(vectorizer.get_vector(full_tag), dtype='float64')
                weighted_vector += vector * prob

                form = WordFormOut(word=word, normal_form=lemma, pos=pos, tag=tag, vector=vector, score=prob)
                result_form.possible_forms.append(form)

            result_form.weighted_vector = weighted_vector
        return result_form

    def __get_lemma(self, word: str, pos_tag: str, gram: str, word_forms=None,
                    enable_normalization: bool=True):
        """
        Get the lemma.

        :param word: word.
        :param pos_tag: part of speech.
        :param gram: grammatical value.
        :param enable_normalization: whether to apply normalization as in the ГИКРЯ corpus.
        :return: lemma.
        """
        if '_' in word:
            return word
        if self.language == "ru":
            if word_forms is None:
                word_forms = self.morph.parse(word)
            guess = ""
            max_common_tags = 0
            for word_form in word_forms:
                word_form_pos_tag, word_form_gram = convert_from_opencorpora_tag(self.converter, word_form.tag, word)
                word_form_gram = process_gram_tag(word_form_gram)
                common_tags_len = len(set(word_form_gram.split("|")).intersection(set(gram.split("|"))))
                if common_tags_len > max_common_tags and word_form_pos_tag == pos_tag:
                    max_common_tags = common_tags_len
                    guess = word_form
            if guess == "":
                guess = word_forms[0]
            if enable_normalization:
                lemma = self.__normalize_for_gikrya(guess)
            else:
                lemma = guess.normal_form
            return lemma
        elif self.language == "en":
            lemmatizer = nltk.stem.WordNetLemmatizer()
            pos_map = defaultdict(lambda: 'n')
            pos_map.update({
                'ADJ': 'a',
                'ADV': 'r',
                'NOUN': 'n',
                'VERB': 'v'
            })
            return lemmatizer.lemmatize(word, pos=pos_map[pos_tag])
        else:
            assert False

    @staticmethod
    def __normalize_for_gikrya(form):
        """
        Obtain the lemma using rules as close as possible to those of the ГИКРЯ corpus.

        :param form: pymorphy2 form.
        :return: lemma.
        """
        if form.tag.POS == 'NPRO':
            if form.normal_form == 'она':
                return 'он'
            if form.normal_form == 'они':
                return 'он'
            if form.normal_form == 'оно':
                return 'он'

        if form.word == 'об':
            return 'об'
        if form.word == 'тот':
            return 'то'
        if form.word == 'со':
            return 'со'

        if form.tag.POS in {'PRTS', 'PRTF'}:
            return form.inflect({'PRTF', 'sing', 'masc', 'nomn'}).word

        return form.normal_form
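A hedged usage sketch, assuming the pretrained Russian model files referenced by MODELS_PATHS are available locally; the sentence is a placeholder:

predictor = RNNMorphPredictor(language="ru")
for form in predictor.predict(['мама', 'мыла', 'раму']):
    print(form.word, form.pos, form.tag, form.normal_form)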
Example #13
class Parser:
    def __init__(self):
        self.url = 'https://ru.wiktionary.org/wiki/'
        self.headers = {
            'accept':
            '*/*',
            'user-agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
        }
        self.session = requests.Session()
        self.morph = MorphAnalyzer()
        text = open('olds.json', 'r').read()
        self.olds = json.loads(text)

    def find(self, content):

        variants = self.morph.parse(content)

        word = variants[0].normal_form

        request = self.session.get(self.url + word.lower())
        soup = bs(request.text, 'html.parser')

        response = soup.find('ol').find('li')

        if response is not None:

            response_dict = {'text': response.text, 'title': content}

            response_dict['text'] = response_dict['text'].replace(
                '\n', 'dp-trans')
            return json.dumps(response_dict)
        else:
            response_dict = {
                'text': "Информация не найденна(",
                'title': content
            }
            return json.dumps(response_dict)

    def parse_current_page(self, url):
        word_dict = {}
        request = self.session.get(url, headers=self.headers)
        soup = bs(request.text, 'html.parser')
        poems = soup.find_all(attrs={"class": 'dpast__content'})

        dialects = {'words': []}

        def superrost(words):
            return ''.join(
                filter(
                    lambda x: ord(x) in range(ord('а'),
                                              ord('я') + 1) or ord(x) in range(
                                                  ord('А'),
                                                  ord('Я') + 1) or x == ' ',
                    list(words.replace('\\n', ' '))))

        tokenizer = TweetTokenizer()
        analyzer = MorphAnalyzer()

        def preprocess(text):
            w = text.lower().split()

            filtered_words = [
                word for word in w if word not in stopwords.words('russian')
            ]

            words = tokenizer.tokenize(' '.join(filtered_words))

            for i in range(len(words)):
                k = analyzer.parse(words[i])[0].normal_form
                word_dict[k] = words[i]
                words[i] = k
            return ' '.join(words)

        for poem in poems:
            poem = poem.text
            poem = preprocess(superrost(poem)).split()

            for w in poem:
                if w in self.olds:
                    dialects['words'].append(word_dict[w])
        return json.dumps(dialects)

    def parse_current_page_chrome(self, url):
        request = self.session.get(url, headers=self.headers)
        soup = bs(request.text, 'html.parser')
        [s.extract() for s in soup('script')]
        text = soup.text

        word_dict = {}

        dialects = {'words': []}

        def superrost(words):
            return ''.join(
                filter(
                    lambda x: ord(x) in range(ord('а'),
                                              ord('я') + 1) or ord(x) in range(
                                                  ord('А'),
                                                  ord('Я') + 1) or x == ' ',
                    list(words.replace('\\n', ' '))))

        tokenizer = TweetTokenizer()
        analyzer = MorphAnalyzer()

        def preprocess(text):
            w = text.lower().split()

            filtered_words = [
                word for word in w if word not in stopwords.words('russian')
            ]

            words = tokenizer.tokenize(' '.join(filtered_words))

            for i in range(len(words)):
                k = analyzer.parse(words[i])[0].normal_form
                word_dict[k] = words[i]
                words[i] = k
            return ' '.join(words)

        poem = text
        poem = preprocess(superrost(poem)).split()

        for w in poem:
            if w in self.olds:

                dialects['words'].append(word_dict[w])
        dialects['words'] = list(set(dialects['words']))
        return json.dumps(dialects)
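A hedged usage sketch; it needs network access and an olds.json file next to the script, as assumed by the constructor above:

parser = Parser()
print(parser.find('столами'))  # JSON string with the Wiktionary definition of the lemma 'стол'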
Example #14
articles = os.listdir('./articles')
for article in articles:
    if article.endswith('.txt'):
        with open('./articles/' + article, 'r', encoding='utf-8-sig') as f:
            all_text = f.read()
        link = extract_link.search(all_text).group(1)
        title = extract_title.search(all_text).group(1)
        text = extract_text.search(all_text)
        if text is not None:
            text = text.group(1)
            words = [
                x.lower().strip(string.punctuation + '»«–…')
                for x in word_tokenize(text)
            ]
            lemmas = [
                m.parse(x)[0].normal_form for x in words
                if x and x not in set(stopwords.words('russian'))
            ]
            collection[article] = lemmas
            article_info[article] = (link, title, len(lemmas))
            avdl += len(lemmas)

inverted_index = inv_index(collection)
avdl = avdl / len(collection)

with open('inverted_index.json', 'w', encoding='utf-8-sig') as f:
    s = json.dumps(inverted_index, ensure_ascii=False)
    f.write(s)

with open('article_info.json', 'w', encoding='utf-8-sig') as f:
    s = json.dumps(article_info, ensure_ascii=False, indent=2)
    f.write(s)
Example #15
def process(inpt_dir, otpt_dir, gold):
    # Create the output directory in case it does not exist
    os.makedirs(otpt_dir, exist_ok=True)
    # If the input directory does not exist, an exception is raised here
    os.chdir(inpt_dir)
    # If the directory exists, everything is fine - the program starts working
    print('Please wait. Python is processing your data...')

    morph = MorphAnalyzer()
    files = glob.glob('*.txt')

    gold_file = open(gold, mode='r', encoding='utf-8', newline='')

    # Process the text files one by one
    for file in files:
        f = open(file, mode='r', encoding='windows-1251')
        lines = f.readlines()
        root = etree.Element('text')

        # Dictionary for statistics
        stat = {
            'breaks on start': 0,
            'breaks on end': 0,
            'regular breaks': 0,
            'fallbacks': 0,
            'terminal <pc>\'s': 0
        }
        # List for fallbacks
        log_list = []

        for i, line in enumerate(lines):
            # List of tokens
            line_tokens = nltk.word_tokenize(line)
            # List of ordered dicts of the form {parse: lemma}
            line_parses = format_parse_list(
                [morph.parse(token) for token in line_tokens])

            p = etree.SubElement(root, 'p')
            p.set('n', str(i + 1))
            prev_ana = ''

            for j, ana in enumerate(line_parses):
                gold_file.seek(0)
                gold_reader = csv.reader(gold_file, delimiter=';')
                # parses = all parse variants of a single word
                parses = list(ana.keys())
                check = False

                if parses[0].startswith('PM'):
                    elem = etree.SubElement(p, 'pc')
                else:
                    elem = etree.SubElement(p, 'w')
                elem.text = line_tokens[j]

                for row in gold_reader:

                    # Skip trigrams with frequency < 4
                    if row[3] == '3':
                        break

                    # If the current element is unambiguously a terminal punctuation mark, searching for a trigram with it is pointless
                    if parses[0] == 'PM,Tr,_':
                        elem.set('ana', 'PM,Tr,_')
                        elem.set('lemma', ana['PM,Tr,_'])
                        prev_ana = 'PM,Tr,_'

                        stat['terminal <pc>\'s'] += 1
                        check = True

                    else:
                        # If we are at the very beginning of a sentence/chunk, consider left bigrams
                        if j == 0 or prev_ana == 'PM,Tr,_':

                            # Fall back to pymorphy2 if the current element is the last one in the sentence
                            if j + 1 == len(line_parses):
                                break
                            else:
                                if row[0] in parses and row[1] in line_parses[
                                        j + 1]:
                                    elem.set('ana', row[0])
                                    elem.set('lemma', ana[row[0]])
                                    prev_ana = row[0]

                                    stat['breaks on start'] += 1
                                    check = True

                        # If the current element is the last one in the sentence, consider right bigrams
                        elif j + 1 == len(line_parses):
                            if prev_ana == row[1] and row[2] in parses:
                                elem.set('ana', row[2])
                                elem.set('lemma', ana[row[2]])
                                prev_ana = row[2]

                                stat['breaks on end'] += 1
                                check = True

                        # In other cases, consider full trigrams
                        else:
                            if row[0] == prev_ana and row[1] in parses and row[
                                    2] in line_parses[j + 1]:
                                ##                                print('row[0]', row[0])
                                ##                                print('row[1]',row[1])
                                elem.set('ana', row[1])
                                elem.set('lemma', ana[row[1]])
                                prev_ana = row[1]

                                stat['regular breaks'] += 1
                                check = True

                    if check:
                        break

                # Fallback if no suitable trigram was found in the gold standard
                if not check:
                    elem.set('ana', parses[0])
                    elem.set('lemma', ana[parses[0]])
                    prev_ana = parses[0]

                    # Record the trigrams on which the fallback happened
                    if j == 0 and len(line_tokens) == 1:
                        log_data = '''\
{
    %s: %s,
};
''' % (str(line_tokens[j]), str(parses))
                    elif j == 0:
                        log_data = '''\
{
    %s: %s,
    %s: %s,
};
''' % (str(line_tokens[j]), str(parses), str(line_tokens[j + 1]),
                        str(list(line_parses[j + 1].keys())))
                    elif j + 1 == len(line_parses):
                        log_data = '''\
{
    %s: %s,
    %s: %s,
};
''' % (str(line_tokens[j - 1]), str(prev_ana), str(
                            line_tokens[j]), str(parses))
                    else:
                        log_data = '''\
{
    %s: %s,
    %s: %s,
    %s: %s,
};
''' % (str(line_tokens[j - 1]), str(prev_ana), str(
                            line_tokens[j]), str(parses),
                        str(line_tokens[j + 1]), str(list(line_parses[j + 1].keys())))

                    log_list.append(log_data)
                    stat['fallbacks'] += 1

        # Move to the output directory
        os.chdir(otpt_dir)

        # Write to XML
        with open(file[:-3] + 'xml', mode='w', encoding='utf-8') as out:
            xml = etree.tostring(root, method='xml', encoding='utf-8')
            pretty = parseString(xml).toprettyxml(indent='  ',
                                                  encoding='utf-8')
            out.write(pretty.decode())

        # Write the fallbacks to a log file
        with open(file[:-4] + '_log_trg.txt', mode='w',
                  encoding='utf-8') as log:
            for line in log_list:
                log.write(str(line) + '\n')

        # Print per-file statistics
        print(file)
        for key in stat:
            print('    %d %s' % (stat[key], key))

        # Return to the input directory - to the files waiting in the queue
        os.chdir(inpt_dir)
        f.close()

    gold_file.close()
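A hedged invocation sketch; the directory names and the gold-standard CSV filename are placeholders (the gold file is opened relative to the input directory):

process('input_texts', 'tagged_xml', 'gold_trigrams.csv')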
Example #16
class RNNMorphPredictor(Predictor):
    """
    RNN-based POS tagger.
    """
    def __init__(self,
                 model_config_path: str = RU_MORPH_DEFAULT_MODEL_CONFIG,
                 model_weights_path: str = RU_MORPH_DEFAULT_MODEL_WEIGHTS,
                 gramm_dict_input: str = RU_MORPH_GRAMMEMES_DICT,
                 gramm_dict_output: str = RU_MORPH_GRAMMEMES_DICT_OUTPUT):
        self.model = LSTMMorphoAnalysis()
        self.model.prepare(gramm_dict_input, gramm_dict_output)
        self.model.load(model_config_path, model_weights_path)
        self.morph = MorphAnalyzer()

    def predict_sentence_tags(self, words: List[str]) -> List[WordFormOut]:
        tags = self.model.predict([words], batch_size=1)[0]
        return [
            self.__compose_out_form(tag_num, word)
            for tag_num, word in zip(tags, words)
        ]

    def predict_sentences_tags(
            self,
            sentences: List[List[str]],
            batch_size: int = 64) -> List[List[WordFormOut]]:
        sentences_tags = self.model.predict(sentences, batch_size)
        answers = []
        for tags, words in zip(sentences_tags, sentences):
            answers.append([
                self.__compose_out_form(tag_num, word)
                for tag_num, word in zip(tags, words)
            ])
        return answers

    def predict_sentence_tags_proba(
            self, words: List[str]) -> List[List[Tuple[float, WordFormOut]]]:
        words_probabilities = self.model.predict_proba([words],
                                                       batch_size=1)[0]
        return self.__get_sentence_forms_probs(words, words_probabilities)

    def predict_sentences_tags_proba(
            self,
            sentences: List[List[str]],
            batch_size: int = 64
    ) -> List[List[List[Tuple[float, WordFormOut]]]]:
        result = []
        sentences_probabilities = self.model.predict_proba(
            sentences, batch_size)
        for sentence, words_probabilities in zip(sentences,
                                                 sentences_probabilities):
            result.append(
                self.__get_sentence_forms_probs(sentence, words_probabilities))
        return result

    def __get_sentence_forms_probs(self, words: List[str], words_probabilities: List[List[float]]) -> \
            List[List[Tuple[float, WordFormOut]]]:
        """
        Get tags and forms.

        :param words: words.
        :param words_probabilities: tag probabilities for the words.
        :return: probabilities and forms for all word variants.
        """
        result = []
        for word, word_prob in zip(words, words_probabilities[-len(words):]):
            word_prob = word_prob[1:]
            word_forms = [(grammeme_prob,
                           self.__compose_out_form(tag_num, word))
                          for tag_num, grammeme_prob in enumerate(word_prob)]
            result.append(word_forms)
        return result

    def __compose_out_form(self, tag_num: int, word: str) -> WordFormOut:
        """
        Build the output form from the tag index in the vectorizer and the word.

        :param tag_num: tag index.
        :param word: word.
        :return: form.
        """
        vectorizer = self.model.grammeme_vectorizer_output
        tag = vectorizer.get_name_by_index(tag_num)
        pos_tag = tag.split("#")[0]
        gram = tag.split("#")[1]
        lemma = self.__get_lemma(word, pos_tag, gram)
        return WordForm(lemma=lemma, gram_vector_index=tag_num,
                        text=word).get_out_form(vectorizer)

    def __get_lemma(self,
                    word: str,
                    pos_tag: str,
                    gram: str,
                    enable_gikrya_normalization: bool = True):
        """
        Get the lemma.

        :param word: word.
        :param pos_tag: part of speech.
        :param gram: grammatical value.
        :param enable_gikrya_normalization: whether to apply normalization as in the ГИКРЯ corpus.
        :return: lemma.
        """
        if '_' in word:
            return word
        to_ud = converters.converter('opencorpora-int', 'ud14')
        guess = ""
        max_common_tags = 0
        for word_form in self.morph.parse(word):
            word_form_pos_tag, word_form_gram = convert_from_opencorpora_tag(
                to_ud, word_form.tag, word)
            word_form_gram = process_gram_tag(word_form_gram)
            common_tags_len = len(
                set(word_form_gram.split("|")).intersection(
                    set(gram.split("|"))))
            if common_tags_len > max_common_tags and word_form_pos_tag == pos_tag:
                max_common_tags = common_tags_len
                guess = word_form
        if guess == "":
            guess = self.morph.parse(word)[0]
        if enable_gikrya_normalization:
            lemma = self.__normalize_for_gikrya(guess)
        else:
            lemma = guess.normal_form
        return lemma

    @staticmethod
    def __normalize_for_gikrya(form):
        """
        Поучение леммы по правилам, максимально близким к тем, которые в корпусе ГИКРЯ.
        
        :param form: форма из pymorphy2.
        :return: леммма.
        """
        if form.tag.POS == 'NPRO':
            if form.normal_form == 'она':
                return 'он'
            if form.normal_form == 'они':
                return 'он'
            if form.normal_form == 'оно':
                return 'он'

        if form.word == 'об':
            return 'об'
        if form.word == 'тот':
            return 'то'
        if form.word == 'со':
            return 'со'

        if form.tag.POS in {'PRTS', 'PRTF'}:
            return form.inflect({'PRTF', 'sing', 'masc', 'nomn'}).word

        return form.normal_form
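
The parse-selection heuristic in __get_lemma can be illustrated on its own. A simplified sketch (an assumption-laden reduction: it skips the OpenCorpora-to-UD conversion and compares raw pymorphy2 grammemes against a target set):

from pymorphy2 import MorphAnalyzer

morph = MorphAnalyzer()

def pick_lemma(word, target_grammemes):
    # choose the parse whose grammemes overlap the target set the most,
    # falling back to the first parse when nothing matches
    best, best_overlap = morph.parse(word)[0], -1
    for parse in morph.parse(word):
        overlap = len(set(parse.tag.grammemes) & set(target_grammemes))
        if overlap > best_overlap:
            best, best_overlap = parse, overlap
    return best.normal_form

print(pick_lemma('стали', {'VERB', 'past', 'plur'}))  # expected: 'стать'
print(pick_lemma('стали', {'NOUN', 'gent', 'sing'}))  # expected: 'сталь'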
Beispiel #18
0
        'C:\\Users\\iburmistrov\\Documents\\Texts_Analyze\\url_list.txt')
]
for url in urls:
    try:
        response = requests.get(url, timeout=None)
    except:
        continue
    soup = BeautifulSoup(response.content, "html.parser")

    for pp in soup.select("p"):
        print(pp.text)

#reduce all words to their base form, set up CountVectorizer to count n-grams of 2 to 4 words, and write the result to a file.
cvn = CountVectorizer(ngram_range=(2, 4), stop_words=stop_words)
words_nf = [
    ' '.join([m.parse(word)[0].normal_form for word in x.split()])
    for x in texts
]
ngrams = cvn.fit_transform(words_nf)
vb = cvn.vocabulary_
count_values = ngrams.toarray().sum(axis=0)

for ng_count, ng_text in sorted([(count_values[i], k) for k, i in vb.items()],
                                reverse=True):
    print(ng_text, ng_count, sep='\t')

#All that is left is to write all the results into a single Excel file:
info_data = pd.read_csv('C:\\Users\\evgen\\Documents\\result.txt',
                        encoding='utf-8',
                        sep="\t",
                        header=None)
Beispiel #19
0
                    new_file.write(elem_clear)

with open('not_spam_file.txt', encoding='utf8') as a_src:
    text = a_src.read()
morph = MorphAnalyzer()

not_spam_m = 0  #number of words in the training sample
not_spam_nk = 0  #number of words in the training sample excluding stopwords
stops = stopwords.words("russian")
articles_texts = []
articles_texts.append(text)
articles_preprocessed = []
for a_text in articles_texts:
    a_tokens = wordpunct_tokenize(a_text)
    a_lemmatized = ' '.join(
        [morph.parse(item)[0].normal_form for item in a_tokens])
    articles_preprocessed.append(a_lemmatized)
    for token in a_tokens:
        p = morph.parse(token)[0]
        if p.tag.POS:
            not_spam_m += 1
        if p.tag.POS and token not in stops:
            not_spam_nk += 1

tfidf = TfidfVectorizer(analyzer="word", stop_words=stops)

articles_tfidf = tfidf.fit_transform(articles_preprocessed)

feature_names = np.array(tfidf.get_feature_names())

not_spam_triggers = {}
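
The snippet cuts off right after not_spam_triggers is declared. One plausible continuation is sketched below (an assumption, not the original author's code): rank the terms of the single preprocessed document by tf-idf weight and keep the top ones.

scores = articles_tfidf.toarray()[0]
for idx in scores.argsort()[::-1][:20]:
    if scores[idx] > 0:
        not_spam_triggers[feature_names[idx]] = float(scores[idx])
print(sorted(not_spam_triggers.items(), key=lambda kv: -kv[1])[:5])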
Beispiel #20
0
            if lo in low_reg:
                low_reg[lo] += cnt
            else:
                low_reg[lo] = cnt
    just_ru = {
        k: v
        for (k, v) in low_reg.items() if match(u"^[\u0400-\u0500]+$", k)
    }
    ru_postings = sum(just_ru.values())
    morph = MorphAnalyzer()
    c = 0
    for k, v in just_ru.items():
        if c % 100000 == 0:
            print(c)
        c += 1
        lem = morph.parse(k)[0].normal_form
        if lem in lemmatized:
            lemmatized[lem] += int(v)
        else:
            lemmatized[lem] = int(v)
    with open("stopwords", "r") as st:
        stops = set(st.read().split('\n'))
        for k, v in just_ru.items():
            if not k in stops:
                no_stops_postings += v
print("Raw dictionary size = {0}\n"
      "Without numbers = {1}\n"
      "Lowered = {2}\n"
      "Just russian = {3}\n".format(all, nonnum, len(low_reg), len(just_ru)))
print("Lemmatized = {0}\n\n".format(len(lemmatized)))
print("All postings = {0}\n"
class Extractor:
    def __init__(self):
        self.word2vec = None
        self.morph = MorphAnalyzer()

    @lru_cache(20000)
    def _morph_parse(self, word):
        return self.morph.parse(word)

    def _tokenize(self, text):
        tokens = word_tokenize(text.lower())
        result = []
        for token in tokens:
            morph = self._morph_parse(token)
            if len(morph) > 0:
                if morph[0].tag.POS is not None:
                    result.append(morph[0])
        return [
            "{0}_{1}".format(morph.word, morph.tag.POS) for morph in result
        ]

    def fit(self, texts, word2vec_params):
        converted_texts = [self._tokenize(text) for text in texts]
        self.word2vec = Word2Vec(converted_texts, **word2vec_params)

    def _tfidf_order_features(self, tfidf, matrix):
        mean_features = np.asarray(matrix.mean(axis=0))[0]
        ordered_features = mean_features.argsort()[::-1]
        feature_names = tfidf.get_feature_names()
        result = []
        for feature in ordered_features:
            result.append(feature_names[feature])
        return np.array(result)

    def _tfidf_feature_filter(self, features):
        ignorance_filter = lambda text: bool(re.match(".*prep", text)) or \
                                        bool(re.match(".*infn", text)) or \
                                        bool(re.match(".*verb", text))
        feature_filter = lambda text: bool(re.match(".*adjf .*noun", text)
                                           ) and not ignorance_filter(text)
        return [feature for feature in features if feature_filter(feature)]

    def _document_distance(self, doc1, doc2):
        doc1vec = np.array([np.zeros([self.word2vec.vector_size])] + [
            self.word2vec[token]
            for token in doc1.split(" ") if token in self.word2vec
        ]).sum(axis=0)
        doc2vec = np.array([np.zeros([self.word2vec.vector_size])] + [
            self.word2vec[token]
            for token in doc2.split(" ") if token in self.word2vec
        ]).sum(axis=0)
        return cosine(doc1vec, doc2vec)

    def _top_features(self, converted_texts, ngram_min, ngram_max,
                      top_tfidf_features):
        features = []
        for size in range(ngram_min, ngram_max + 1):
            tfidf = TfidfVectorizer(ngram_range=(ngram_min, ngram_max))
            tfidf_transformed_texts = tfidf.fit_transform(converted_texts)
            tfidf_features = self._tfidf_order_features(
                tfidf, tfidf_transformed_texts)
            top_features = self._tfidf_feature_filter(
                tfidf_features)[:top_tfidf_features]
            features += top_features
        features = list(set(features))
        features.sort()
        return features

    def _feature_distances(self, features):
        distances = np.zeros([len(features), len(features)])
        for i, feature1 in enumerate(features):
            for j, feature2 in enumerate(features):
                features_distance = self._document_distance(feature1, feature2)
                distances[i, j] = features_distance
                distances[j, i] = features_distance
        return distances

    def _cluster_features(self, features, distances):
        dbscan = DBSCAN(0.2, min_samples=1, metric="precomputed")
        clusters = dbscan.fit_predict(distances)
        items = {}
        for cluster, feature in zip(clusters, features):
            items[cluster] = items.get(cluster, []) + [feature]
        return items

    def _choose_features(self, features, distances):
        choosen_ngrams = []
        for key, values in self._cluster_features(features, distances).items():
            values_indices = np.array([features.index(val) for val in values])
            if len(values) < 2:
                continue
            values_distances = distances[values_indices, :][:, values_indices]
            index_mean_distances = np.zeros([len(values_indices)])
            for i in range(0, len(values_indices)):
                index_mean_distances[i] = np.delete(values_distances[i],
                                                    i,
                                                    axis=0).mean()
            choosen_ngram = values[index_mean_distances.argmin()]
            choosen_ngrams.append(choosen_ngram)
        return choosen_ngrams

    def _apply_rules(self, rules, text):
        if isinstance(text, list):
            return [self._apply_rules(rules, item) for item in text]
        for rule in rules:
            text = rule(text)
        return text

    def _norm(self, text):
        tokens = wordpunct_tokenize(text)
        result = []
        for token in tokens:
            parse = self.morph.parse(token)
            if len(parse) > 0 and parse[0].tag.POS:
                inflect = parse[0].inflect({"nomn"})
                if inflect:
                    result.append(inflect.word)
                else:
                    result.append(token)
            else:
                result.append(token)
        return " ".join(result).replace(" ,", ",")

    def transform(self, texts, ngram_min, ngram_max, top_tfidf_features):
        _texts = []
        for text in texts:
            add_texts = re.split("[,.]|([ ]+и[ ]+)", text)
            _texts += [item for item in add_texts if item is not None]
        texts = _texts
        converted_texts = [" ".join(self._tokenize(text)) for text in texts]
        features = self._top_features(converted_texts, ngram_min, ngram_max,
                                      top_tfidf_features)
        if len(features) == 0:
            return []
        distances = self._feature_distances(features)
        choosen_features = self._choose_features(features, distances)

        rules = [
            lambda text: re.sub("_adjf+ (\w+)_intj", ", \g<1>", text),
            lambda text: re.sub("_noun+ (\w+)_adjf", ", \g<1>", text),
            lambda text: re.sub("^\w+_conj", "", text),
            lambda text: re.sub("\w+_conj$", "", text),
            lambda text: re.sub("^\w+_pred", "", text),
            lambda text: re.sub("\w+_pred$", "", text),
            lambda text: re.sub("^\w+_precl", "", text),
            lambda text: re.sub("\w+_precl$", "", text),
            lambda text: re.sub("_[a-z]+", "", text),
            lambda text: text.strip(), lambda text: self._norm(text)
        ]
        return self._apply_rules(rules, choosen_features)
Beispiel #22
0
# pprint(messages)

ngrams = []
n, m = 0, 0
t = int(time())
l = len(messages)

for message in messages:
    if message == "<|BEGIN|>":
        ngram = []
    elif message == "<|END|>":
        phrases = []
        for phrase in ngram:
            terms = set(te(phrase, strings=1, nested=1))
            words = list(
                set([ma.parse(w)[0].normal_form
                     for w in wpt.tokenize(phrase)]))
            idx = []
            for word in words:
                w = 1 if word in terms else .5
                idx += [(w, word)]
            phrases += [(idx, phrase)]
        ngrams += [phrases]
    else:
        ngram += [message]
    n += 1
    if time() - t > 1:
        print("%s of %s, %s / sec" % (m, l, n))
        m += n
        n = 0
        t = int(time())
Beispiel #23
0
class SearchEngine:
    """
    Search engine
    """
    def __init__(self):
        self.morph = MorphAnalyzer()
        self.inverted_index_dict = {
            'phrases_index': {},
            'counter_dict': {},
            'categories': []
        }

        if os.path.isfile('inverted_index.pickle'):
            with open('inverted_index.pickle', 'rb') as handle:
                self.inverted_index_dict = pickle.load(handle)
        else:
            with open('docs.json', encoding='utf8') as f:
                self.docs = json.load(f)
            self.build_inverted_index(self.docs)
            with open('inverted_index.pickle', 'wb') as handle:
                pickle.dump(self.inverted_index_dict,
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
            del self.docs

    async def convert_sentence(self, sentence):
        """
        Each word converts to normal form and returned as a list.
        """
        sentence = re.sub(r'\W|\d', ' ', sentence)
        tokens = sentence.lower().split()
        result = [self.morph.parse(word)[0].normal_form for word in tokens]
        return result

    def build_inverted_index(self, docs):
        """
        Initializes creation of inverted index and category list.
        """
        for document_idx, doc in enumerate(docs):
            self.build_dictionary(document_idx, doc, 'phrases',
                                  self.inverted_index_dict['phrases_index'])

        self.inverted_index_dict['categories'] = [
            i['category'] for i in self.docs
        ]

    def build_dictionary(self, doc_idx, doc, section, dictionary):
        """
        Builds an inverted index dictionary.
        """
        if section in doc:
            for sentence_ind, sentence in enumerate(doc[section]):
                splitted_sentence = self.tokenize(sentence)

                if doc_idx in self.inverted_index_dict['counter_dict']:
                    self.inverted_index_dict['counter_dict'][doc_idx] += (
                        len(splitted_sentence), )
                else:
                    self.inverted_index_dict['counter_dict'][doc_idx] = (
                        len(splitted_sentence), )
                for word in splitted_sentence:
                    if word not in dictionary:
                        dictionary[word] = {}
                    if doc_idx in dictionary[word]:
                        dictionary[word][doc_idx] += (sentence_ind, )
                    else:
                        dictionary[word][doc_idx] = (sentence_ind, )

    async def get_categories(self, sentence):
        """
        Сoroutine get the sentence and returns json with the list of categories.
        """
        result = {'categories': ()}
        _categories = ()
        _links = []

        # calling coroutine, bringing each word to normal form.
        converted_words = await self.convert_sentence(sentence)
        for word in converted_words:
            if word in self.inverted_index_dict['phrases_index']:
                # tuple with category_ids in index for word
                _categories += tuple(
                    self.inverted_index_dict['phrases_index'][word].keys())
                # dicts with categories_id and phrases_id in index  for word
                _links.append(self.inverted_index_dict['phrases_index'][word])

        for category in set(_categories):
            # count the number of words for each of the categories
            _word_ids = sum([x[category] for x in _links if x.get(category)],
                            ())
            for i in set(_word_ids):
                real_count = _word_ids.count(i)
                # if the number of query words found in the phrase equals the phrase length,
                # the phrase matches the search query
                if self.inverted_index_dict['counter_dict'][category][
                        i] == real_count:
                    result['categories'] += (
                        self.inverted_index_dict['categories'][category], )
        return result

    def tokenize(self, sentence):
        sentence = re.sub(r'\W|\d', ' ', sentence)
        tokens = sentence.lower().split()
        result = [self.morph.parse(word)[0].normal_form for word in tokens]
        return result
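
A minimal usage sketch for the class above (assumptions: a docs.json file with 'phrases' and 'category' fields is present, and Python 3.7+ is available for asyncio.run):

import asyncio

engine = SearchEngine()
print(asyncio.run(engine.get_categories('как оформить возврат товара')))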
Beispiel #24
0
from pymorphy2 import MorphAnalyzer

# Create the analyzer object
morph = MorphAnalyzer()

# Example word
word = 'струбинов'
lst_case = ['nomn', 'gent', 'datv', 'accs', 'ablt', 'loct']

# Analyze the word
word_parsed = morph.parse(word)
for par in word_parsed:
    if {'masc','Surn'} in par.tag:
        print(par.inflect({'gent'}))


# print(len(word_parsed))
# for par in word_parsed:
#     print(par)
#     print()
#     print(par.lexeme)
# # # print(word_parsed.tag.gender)
# # # print(word_parsed.lexeme)
# # # for case in lst_case:
# # #     print(word_parsed)
# # #     print(word_parsed.inflect({case}).word)
# # print(len(word_parsed.lexeme))
# # for par in word_parsed.lexeme:
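
The lst_case list above is declared but never used. A minimal sketch of how it could drive declension of the matched surname parse (guarding against inflect() returning None, which pymorphy2 may do for impossible forms):

for par in word_parsed:
    if {'masc', 'Surn'} in par.tag:
        for case in lst_case:
            inflected = par.inflect({case})
            if inflected is not None:
                print(case, inflected.word)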
Beispiel #25
0
class Substs_loader:
    def __init__(self,
                 data_name,
                 lemmatizing_method,
                 max_examples=None,
                 delete_word_parts=False,
                 drop_duplicates=True,
                 count_lemmas_weights=False,
                 limit=None):
        self.data_name = data_name
        self.lemmatizing_method = lemmatizing_method
        self.max_examples = max_examples
        self.delete_word_parts = delete_word_parts
        self.drop_duplicates = drop_duplicates
        self.count_lemmas_weights = count_lemmas_weights
        self.translation = str.maketrans('', '', string.punctuation)

        self.dfs = dict()
        self.nf_cnts = dict()
        self.cache = dict()

        if lemmatizing_method is not None and lemmatizing_method != 'none':
            if 'ru' in data_name:
                self.analyzer = MorphAnalyzer()
            elif 'german' in data_name:
                self.analyzer = spacy.load("de_core_news_sm",
                                           disable=['ner', 'parser'])
            elif 'english' in data_name:
                self.analyzer = spacy.load("en_core_web_sm",
                                           disable=['ner', 'parser'])
            else:
                assert "unknown data name %s" % data_name

    def get_nf_cnt(self, substs_probs):
        nf_cnt = Counter(nf for l in substs_probs for p, s in l
                         for nf in self.analyze_russian_word(s))
        return nf_cnt

    def analyze_russian_word(self, word, nf_cnt=None):
        word = word.strip()
        if word not in self.cache:
            self.cache[word] = {
                i.normal_form
                for i in self.analyzer.parse(word)
            }

        if nf_cnt is not None and len(
                self.cache[word]) > 1:  # select most common normal form
            h_weights = [nf_cnt[h] for h in self.cache[word]]
            max_weight = max(h_weights)
            res = {
                h
                for i, h in enumerate(self.cache[word])
                if h_weights[i] == max_weight
            }
        else:
            res = self.cache[word]

        return sorted(list(res))

    def analyze(self, word):
        if not word:
            return ['']

        if not word in self.cache:
            spacyed = self.analyzer(word)
            lemma = spacyed[0].lemma_ if spacyed[
                0].lemma_ != '-PRON-' else spacyed[0].lower_
            self.cache[word] = [lemma]
        return self.cache[word]

    def get_lemmas(self, word, nf_cnt=None):
        if 'ru' in self.data_name:
            return self.analyze_russian_word(word, nf_cnt)
        else:
            return self.analyze(word)

    def get_single_lemma(self, word, nf_cnt):
        return self.get_lemmas(word, nf_cnt)[0]

    def preprocess_substitutes(self,
                               substs_probs,
                               target_word,
                               nf_cnt,
                               topk,
                               exclude_lemmas=set(),
                               delete_word_parts=False):
        """
        1) leaves only topk substitutes without spaces inside
        2) applies lemmatization
        3) excludes unwanted lemmas (if any)
        4) returns string of space separated substitutes
        """
        exclude = exclude_lemmas.union({target_word})

        if delete_word_parts:
            res = [
                word.strip() for prob, word in substs_probs[:topk]
                if word.strip() and ' ' not in word.strip() and word[0] == ' '
            ]
        else:
            res = [
                word.strip() for prob, word in substs_probs[:topk]
                if word.strip() and ' ' not in word.strip()
            ]

        # TODO: optimise!
        if exclude:
            if self.lemmatizing_method != 'none':
                res = [
                    s for s in res
                    if not set(self.get_lemmas(s)).intersection(exclude)
                ]
            else:
                res = [s for s in res if not s in exclude]

        if self.lemmatizing_method == 'single':
            res = [self.get_single_lemma(word.strip(), nf_cnt) for word in res]
        elif self.lemmatizing_method == 'all':
            res = [
                ' '.join(self.get_lemmas(word.strip(), nf_cnt)) for word in res
            ]
        else:
            assert self.lemmatizing_method == 'none', "unrecognized lemmatization method %s" % self.lemmatizing_method

        return ' '.join(res)

    def get_substitutes(self, path, topk, data_name=None):

        if data_name is None:
            data_name = self.data_name

        if data_name in self.dfs:
            assert data_name in self.nf_cnts
            subst = self.dfs[data_name]
            nf_cnt = self.nf_cnts[data_name]

        else:
            subst = load_substs(path,
                                data_name=data_name,
                                drop_duplicates=self.drop_duplicates,
                                limit=self.max_examples)

            if self.lemmatizing_method != 'none' and self.count_lemmas_weights and 'ru' in self.data_name:
                nf_cnt = self.get_nf_cnt(subst['substs_probs'])
            else:
                nf_cnt = None

            self.dfs[data_name] = subst
            self.nf_cnts[data_name] = nf_cnt

        subst['substs'] = subst.apply(lambda x: self.preprocess_substitutes(
            x.substs_probs,
            x.word,
            nf_cnt,
            topk,
            delete_word_parts=self.delete_word_parts),
                                      axis=1)
        subst['word'] = subst['word'].apply(lambda x: x.replace('ё', 'е'))

        return subst

    def get_substs_pair(self, path1, path2, topk):
        """
        loads subs from path1, path2 and applies preprocessing
        """
        return self.get_substitutes(path1, topk=topk, data_name=self.data_name + '_1'), \
               self.get_substitutes(path2, topk=topk, data_name=self.data_name + '_2' )
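
The Russian normal-form caching can be exercised without any substitute files. A minimal sketch (assumptions: the snippet's missing imports such as string and MorphAnalyzer are in place; 'ru_test' and lemmatizing_method='all' are illustrative values):

loader = Substs_loader(data_name='ru_test', lemmatizing_method='all')
print(loader.analyze_russian_word('мыла'))  # all normal forms, e.g. ['мыло', 'мыть']
print(loader.get_lemmas('стали'))           # routed to the Russian analyzer, e.g. ['сталь', 'стать']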
Beispiel #26
0
bot = telebot.TeleBot(conf.TOKEN, threaded=False)

bot.remove_webhook()
bot.set_webhook(url=WEBHOOK_URL_BASE+WEBHOOK_URL_PATH)

app = flask.Flask(__name__)

@bot.message_handler(commands=['start', 'help'])
def send_welcome(message):
    bot.send_message(message.chat.id, "Здравствуйте! Это бот, с которым можно разговаривать.")

@bot.message_handler(func=lambda m: True)
def send(message):
    reply = ''
    for word in message.text.split(' '):
        ana = morph.parse(word.strip('.,:;?!()""'))[0]
        if ana.tag.POS in pos_files_dict and ana.tag.POS not in ['NOUN', 'NPRO']:
            file = pos_files_dict[ana.tag.POS]
            words = open(file, 'r').read().split(' ')
            word_replace = random.choice(words)
            # collect the grammemes of the original word to inflect the replacement
            grammemes = set()
            grammemes.add(ana.tag.case)
            grammemes.add(ana.tag.gender)
            grammemes.add(ana.tag.mood)
            grammemes.add(ana.tag.number)
            grammemes.add(ana.tag.person)
            grammemes.add(ana.tag.tense)
            grammemes.add(ana.tag.voice)
            grammemes.discard(None)  # missing categories come back as None
            word_replace = morph.parse(word_replace)[0].inflect(grammemes).word
        elif ana.tag.POS in ['NOUN', 'NPRO'] and ana.tag.gender is not None:
Beispiel #27
0
inverted_index = {}
page_occurrences = {}

pages = os.listdir(path=PAGES_PATH)

for index, page in enumerate(pages):
    file = open(PAGES_PATH + page, 'r', encoding="utf-8")
    text = file.read()

    tokens = tokenizers.simple_word_tokenize(text)

    page_occurrences[index] = len(tokens)

    for token in tokens:
        lemma = morph.parse(token)[0].normal_form.lower()
        value = inverted_index.get(lemma)

        if value is None:
            inverted_index[lemma] = {index: 1}
        elif inverted_index[lemma].get(index) is None:
            inverted_index[lemma][index] = 1
        else:
            inverted_index[lemma][index] += 1

inverted_index_file = open("inverted_index.pkl", "wb")
pickle.dump(inverted_index, inverted_index_file)
inverted_index_file.close()

page_occurrences_file = open("page_occurrences.pkl", "wb")
pickle.dump(page_occurrences, page_occurrences_file)
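
A minimal follow-up sketch (assumption: the pickle files written above are in the current directory): reload the inverted index and look up a lemma.

import pickle

with open("inverted_index.pkl", "rb") as f:
    index = pickle.load(f)
# maps lemma -> {page_index: number of occurrences on that page}
print(index.get("кот", {}))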
Beispiel #28
0
class PymorphyVectorizer(WordIndexVectorizer):
    """
    Transforms russian words into 0-1 vector of its possible Universal Dependencies tags.
    Tags are obtained using Pymorphy analyzer (pymorphy2.readthedocs.io)
    and transformed to UD2.0 format using russian-tagsets library (https://github.com/kmike/russian-tagsets).
    All UD2.0 tags that are compatible with produced tags are memorized.
    The list of possible Universal Dependencies tags is read from a file,
    which contains all the labels that occur in UD2.0 SynTagRus dataset.

    Args:
        save_path: path to save the tags list,
        load_path: path to load the list of tags,
        max_pymorphy_variants: maximal number of pymorphy parses to be used. If -1, all parses are used.
    """

    USELESS_KEYS = ["Abbr"]
    VALUE_MAP = {"Ptan": "Plur", "Brev": "Short"}

    def __init__(self, save_path: str, load_path: str, max_pymorphy_variants: int = -1, **kwargs) -> None:
        super().__init__(save_path, load_path, **kwargs)
        self.max_pymorphy_variants = max_pymorphy_variants
        self.load()
        self.memorized_word_indexes = dict()
        self.memorized_tag_indexes = dict()
        self.analyzer = MorphAnalyzer()
        self.converter = converters.converter('opencorpora-int', 'ud20')

    @property
    def dim(self):
        return len(self._t2i)

    def save(self) -> None:
        """Saves the dictionary to self.save_path"""
        with self.save_path.open("w", encoding="utf8") as fout:
            fout.write("\n".join(self._i2t))

    def load(self) -> None:
        """Loads the dictionary from self.load_path"""
        self._i2t = []
        with self.load_path.open("r", encoding="utf8") as fin:
            for line in fin:
                line = line.strip()
                if line == "":
                    continue
                self._i2t.append(line)
        self._t2i = {tag: i for i, tag in enumerate(self._i2t)}
        self._make_tag_trie()

    def _make_tag_trie(self):
        self._nodes = [defaultdict(dict)]
        self._start_nodes_for_pos = dict()
        self._data = [None]
        for tag, code in self._t2i.items():
            if "," in tag:
                pos, tag = tag.split(",", maxsplit=1)
                tag = sorted([tuple(elem.split("=")) for elem in tag.split("|")])
            else:
                pos, tag = tag, []
            start = self._start_nodes_for_pos.get(pos)
            if start is None:
                start = self._start_nodes_for_pos[pos] = len(self._nodes)
                self._nodes.append(defaultdict(dict))
                self._data.append(None)
            for key, value in tag:
                values_dict = self._nodes[start][key]
                child = values_dict.get(value)
                if child is None:
                    child = values_dict[value] = len(self._nodes)
                    self._nodes.append(defaultdict(dict))
                    self._data.append(None)
                start = child
            self._data[start] = code
        return self

    def find_compatible(self, tag: str) -> List[int]:
        """
        Transforms a Pymorphy tag to a list of indexes of compatible UD tags.

        Args:
            tag: input Pymorphy tag

        Returns:
            indexes of compatible UD tags
        """
        if " " in tag and "_" not in tag:
            pos, tag = tag.split(" ", maxsplit=1)
            tag = sorted([tuple(elem.split("=")) for elem in tag.split("|")])
        else:
            pos, tag = tag.split()[0], []
        if pos not in self._start_nodes_for_pos:
            return []
        tag = [(key, self.VALUE_MAP.get(value, value)) for key, value in tag
               if key not in self.USELESS_KEYS]
        if len(tag) > 0:
            curr_nodes = [(0, self._start_nodes_for_pos[pos])]
            final_nodes = []
        else:
            final_nodes = [self._start_nodes_for_pos[pos]]
            curr_nodes = []
        while len(curr_nodes) > 0:
            i, node_index = curr_nodes.pop()
            # key, value = tag[i]
            node = self._nodes[node_index]
            if len(node) == 0:
                final_nodes.append(node_index)
            for curr_key, curr_values_dict in node.items():
                curr_i, curr_node_index = i, node_index
                while curr_i < len(tag) and tag[curr_i][0] < curr_key:
                    curr_i += 1
                if curr_i == len(tag):
                    final_nodes.extend(curr_values_dict.values())
                    continue
                key, value = tag[curr_i]
                if curr_key < key:
                    for child in curr_values_dict.values():
                        curr_nodes.append((curr_i, child))
                else:
                    child = curr_values_dict.get(value)
                    if child is not None:
                        if curr_i < len(tag) - 1:
                            curr_nodes.append((curr_i + 1, child))
                        else:
                            final_nodes.append(child)
        answer = []
        while len(final_nodes) > 0:
            index = final_nodes.pop()
            if self._data[index] is not None:
                answer.append(self._data[index])
            for elem in self._nodes[index].values():
                final_nodes.extend(elem.values())
        return answer

    def _get_word_indexes(self, word):
        answer = self.memorized_word_indexes.get(word)
        if answer is None:
            parse = self.analyzer.parse(word)
            if self.max_pymorphy_variants > 0:
                parse = parse[:self.max_pymorphy_variants]
            tag_indexes = set()
            for elem in parse:
                tag_indexes.update(set(self._get_tag_indexes(elem.tag)))
            answer = self.memorized_word_indexes[word] = list(tag_indexes)
        return answer

    def _get_tag_indexes(self, pymorphy_tag):
        answer = self.memorized_tag_indexes.get(pymorphy_tag)
        if answer is None:
            tag = self.converter(str(pymorphy_tag))
            answer = self.memorized_tag_indexes[pymorphy_tag] = self.find_compatible(tag)
        return answer
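
The OpenCorpora-to-UD2.0 conversion the class relies on can be shown in isolation. A minimal sketch (assuming pymorphy2 and russian-tagsets are installed; the printed formats are indicative):

from pymorphy2 import MorphAnalyzer
from russian_tagsets import converters

morph = MorphAnalyzer()
to_ud20 = converters.converter('opencorpora-int', 'ud20')
parse = morph.parse('кошками')[0]
print(parse.tag)                # OpenCorpora tag, e.g. NOUN,anim,femn plur,ablt
print(to_ud20(str(parse.tag)))  # roughly 'NOUN Animacy=Anim|Case=Ins|Gender=Fem|Number=Plur'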
Beispiel #29
0
class RussianLemmatizer(Lemmatizer):
    def __init__(
        self,
        vocab: Vocab,
        model: Optional[Model],
        name: str = "lemmatizer",
        *,
        mode: str = "pymorphy2",
        overwrite: bool = False,
        scorer: Optional[Callable] = lemmatizer_score,
    ) -> None:
        if mode == "pymorphy2":
            try:
                from pymorphy2 import MorphAnalyzer
            except ImportError:
                raise ImportError(
                    "The Russian lemmatizer mode 'pymorphy2' requires the "
                    "pymorphy2 library. Install it with: pip install pymorphy2"
                ) from None
            if getattr(self, "_morph", None) is None:
                self._morph = MorphAnalyzer()
        super().__init__(vocab,
                         model,
                         name,
                         mode=mode,
                         overwrite=overwrite,
                         scorer=scorer)

    def pymorphy2_lemmatize(self, token: Token) -> List[str]:
        string = token.text
        univ_pos = token.pos_
        morphology = token.morph.to_dict()
        if univ_pos == "PUNCT":
            return [PUNCT_RULES.get(string, string)]
        if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN",
                            "VERB"):
            # Skip unchangeable pos
            return [string.lower()]
        analyses = self._morph.parse(string)
        filtered_analyses = []
        for analysis in analyses:
            if not analysis.is_known:
                # Skip suggested parse variant for unknown word for pymorphy
                continue
            analysis_pos, _ = oc2ud(str(analysis.tag))
            if analysis_pos == univ_pos or (analysis_pos in ("NOUN", "PROPN")
                                            and univ_pos in ("NOUN", "PROPN")):
                filtered_analyses.append(analysis)
        if not len(filtered_analyses):
            return [string.lower()]
        if morphology is None or (len(morphology) == 1 and POS in morphology):
            return list(
                dict.fromkeys(
                    [analysis.normal_form for analysis in filtered_analyses]))
        if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
            features_to_compare = ["Case", "Number", "Gender"]
        elif univ_pos == "NUM":
            features_to_compare = ["Case", "Gender"]
        elif univ_pos == "PRON":
            features_to_compare = ["Case", "Number", "Gender", "Person"]
        else:  # VERB
            features_to_compare = [
                "Aspect",
                "Gender",
                "Mood",
                "Number",
                "Tense",
                "VerbForm",
                "Voice",
            ]
        analyses, filtered_analyses = filtered_analyses, []
        for analysis in analyses:
            _, analysis_morph = oc2ud(str(analysis.tag))
            for feature in features_to_compare:
                if (feature in morphology and feature in analysis_morph
                        and morphology[feature].lower() !=
                        analysis_morph[feature].lower()):
                    break
            else:
                filtered_analyses.append(analysis)
        if not len(filtered_analyses):
            return [string.lower()]
        return list(
            dict.fromkeys(
                [analysis.normal_form for analysis in filtered_analyses]))

    def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
        string = token.text
        analyses = self._morph.parse(string)
        if len(analyses) == 1:
            return [analyses[0].normal_form]
        return [string]
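
A minimal usage sketch of this lemmatizer through a trained Russian pipeline (assumptions: ru_core_news_sm is installed, e.g. via python -m spacy download ru_core_news_sm; depending on the spaCy version the model is configured with the pymorphy2 or pymorphy3 mode):

import spacy

nlp = spacy.load("ru_core_news_sm")
doc = nlp("Кошки спали на подоконниках")
print([(token.text, token.lemma_) for token in doc])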
Beispiel #30
0
class MagicWorker():
    def __init__(self, path_stopwords, path_clusters):
        #self.classifier = ft.load(path_to_fasttext_model)
        self.__stopwords = self.__get_stopwords__(path_stopwords)
        self.__analyzer = MorphAnalyzer()
        self.__classes, self.__answers = self.__get_classes__(path_clusters)

    def __get_classes__(self, path_to_clusters):
        classes, answers = [], []
        with open(path_to_clusters, 'r') as f:
            for line in f:
                tmp = list(line.split(', '))
                tmp[-1] = tmp[-1].replace('\n', '')
                classes.append(tmp[:-1])
                answers.append(tmp[-1])
        return classes, answers

    def __get_stopwords__(self, path_to_stopwords):
        with open(path_to_stopwords, 'r') as f:
            stopwords = list(f.read().split('\n'))
        return stopwords

    def __process_request__(self, request: str):
        request = request.lower()
        letters = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя '
        spec_syms = ',./<>?;":[]{}!@#$%^&*()-=_+|'
        for sym in spec_syms:
            request = request.replace(sym, ' ')
        request = re.sub(r'\s+', ' ', request)
        request = request.replace('ё', 'е')
        result = ''
        for letter in request:
            if letter in letters:
                result += letter
        temp = []
        for word in result.split():
            temp.append(self.__analyzer.parse(word)[0].normal_form)
        result = ' '.join(temp)

        tmp_ = []
        for t in result.split(' '):
            if not (t in self.__stopwords):
                tmp_.append(t)
        result = ' '.join(tmp_)
        return result

    def __analize__request__(self, request: str):
        processed_request = self.__process_request__(request)
        count_of_entries = [0 for _ in range(len(self.__classes))]
        for word in processed_request.split(' '):
            for i in range(len(self.__classes)):
                if word in self.__classes[i]:
                    count_of_entries[i] += 1
        persents_of_entries = [
            int(count_of_entries[i] / len(self.__classes[i]) * 100)
            for i in range(len(count_of_entries))
        ]
        return persents_of_entries

    def predict(self, request: str):
        persents = self.__analize__request__(request)
        ans = 'Попробуйте переформулировать вопрос'
        max_persents_index = 0
        for i in range(1, len(persents)):
            if persents[i] > persents[max_persents_index]:
                max_persents_index = i
        if (persents[max_persents_index] > 10
            ):  # and (persents.count(persents[max_persents_index]) == 1):
            ans = self.__answers[max_persents_index]
        return ans
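
A minimal usage sketch (assumptions: hypothetical stopwords.txt and clusters.txt files in the formats expected by __get_stopwords__ and __get_classes__):

worker = MagicWorker('stopwords.txt', 'clusters.txt')
print(worker.predict('как поменять пароль от личного кабинета'))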
Beispiel #31
0
    def get_first_data(self):

        morp = MorphAnalyzer()
        udpipe_data = pd.DataFrame(
            columns=['id', 'form', 'lemma', 'UPosTag', 'XPosTag', 'Feats', 'Head', 'DepRel', 'Deps', 'Misc'])


        for text in self.df['first_or_propn'].values.tolist():

            if len(text.split(' ')) == 1:
                processed = pipeline.process(text)
            elif len(text.split(' ')) > 1:
                final_word = None
                for word in text.split(' '):
                    p = morp.parse(word)[0]
                    if str(p.tag.POS) == 'NOUN' or str(p.tag.POS) == 'PRON':
                        final_word = word
                        #processed = pipeline.process(word)
                        break
                if final_word is None:
                    processed = pipeline.process(text.split(' ')[0])
                else:
                    processed = pipeline.process(final_word)
            else:
                processed = pipeline.process(text)

            processed = processed.split('\n')
            for line in processed:
                if '#' not in line and line != '':
                    udpipe_data = udpipe_data.append({
                        'id': line.split('\t')[0],
                        'form': line.split('\t')[1],
                        'lemma': line.split('\t')[2],
                        'UPosTag': line.split('\t')[3],
                        'XPosTag': line.split('\t')[4],
                        'Feats': line.split('\t')[5],
                        'Head': line.split('\t')[6],
                        'DepRel': line.split('\t')[7],
                        'Deps': line.split('\t')[8],
                        'Misc': line.split('\t')[9]
                    }, ignore_index=True)

        # on the first pass, collect all the distinct feature columns that occur
        cols = list()
        for i in range(udpipe_data.shape[0]):
            if '|' in udpipe_data.Feats[i]:
                tmp_data = udpipe_data.Feats[i].split('|')
                for j in tmp_data:
                    cols.append(j.split('=')[0])

        cols = list(set(cols))
        for i in range(len(cols)):
            udpipe_data[cols[i]] = pd.np.nan

        for i in range(udpipe_data.shape[0]):
            if '|' in udpipe_data.Feats[i]:
                tmp_data = udpipe_data.Feats[i].split('|')
                for j in tmp_data:
                    udpipe_data[j.split('=')[0]][i] = j.split('=')[1]

        udpipe_data.insert(1, 'number_of_sent', pd.np.nan)

        n = 1
        for i in range(udpipe_data.shape[0] - 1):
            if udpipe_data.iloc[i, 0] < udpipe_data.iloc[i + 1, 0]:
                udpipe_data.iloc[i, 1] = n
            else:
                udpipe_data.iloc[i, 1] = n
                n += 1
        udpipe_data.iloc[udpipe_data.shape[0] - 1, 1] = n

        udpipe_data = udpipe_data.drop(['id', 'number_of_sent', 'form', 'lemma', 'XPosTag', 'Feats', 'Head', 'DepRel', 'Deps', 'Misc'], axis=1)

        need_cols = ['UPosTag', 'Animacy', 'Number', 'Case', 'Gender']
        for col in need_cols:
            if col not in udpipe_data:
                udpipe_data[col] = pd.np.nan

        udpipe_data = udpipe_data[['UPosTag', 'Animacy', 'Number', 'Case', 'Gender']]

        return udpipe_data
Beispiel #32
0
 def convert_words(self, word, number):
     a = MorphAnalyzer()
     conv_word = a.parse(word)[0]
     return conv_word.make_agree_with_number(number).word
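
A minimal usage sketch (the instance name obj is hypothetical; any instance of the enclosing class works, since the method does not rely on other attributes):

print(obj.convert_words('рубль', 5))  # expected: 'рублей'
print(obj.convert_words('рубль', 2))  # expected: 'рубля'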
Beispiel #33
0
from nltk.corpus import stopwords
import pandas as pd

# text reading, cleaning and splitting into words
if len(sys.argv) > 1:
    if path.exists(sys.argv[1]):
        with open(sys.argv[1]) as f:
            text = re.split(r'\b[\W\d\s]+\b', f.read())
    else:
        raise FileNotFoundError("File {} not found".format(sys.argv[1]))
else:
    raise Exception("Missing commad line parameter")

# word normalization
morph = MorphAnalyzer()
normal_words = [morph.parse(word)[0].normal_form for word in text]

# stopwords removing
try:
    set_stopwords = set(stopwords.words('russian'))
except Exception:
    import nltk
    nltk.download('stopwords')
    set_stopwords = set(stopwords.words('russian'))

set_words = set(normal_words) - set_stopwords

# Words counting and sorting
counted_words = pd.Series(Counter(normal_words))
sorted_words = counted_words[set_words].sort_values(ascending=False)
Beispiel #34
0
    model = gensim.models.KeyedVectors.load_word2vec_format(m, binary=False)
elif m.endswith('.bin.gz'):
    model = gensim.models.KeyedVectors.load_word2vec_format(m, binary=True)
else:
    model = gensim.models.KeyedVectors.load(m)

model.init_sims(replace=True)

#take the 4 words nearest by cosine similarity and add them to the dictionary
dop_words = []
for word in words_:
    if word in model:
        for i in model.most_similar(positive=[word], topn=4):
            # word + cosine similarity coefficient
            wrd = re.sub('[^а-я|\s|А-Я]', '', i[0])
            wrd = morph.parse(wrd)[0].normal_form
            dop_words.append(wrd)

words_ = ["режиссёр", "кино", "премьера", "показ", "блокбастер"]
words_ = dop_words + words_


def for_bar_dict(fr_dict):  # dictionary for building a bar chart
    graph_dict = {}
    for word in words_:
        try:
            graph_dict[word] = fr_dict[word]
        except:
            graph_dict[word] = 0
    return graph_dict
Beispiel #35
0
class UDPymorphyLemmatizer(BasicLemmatizer):
    """
    A class that returns a normal form of a Russian word given its morphological tag in UD format.
    Lemma is selected from one of PyMorphy parses,
    the parse whose tag resembles the most a known UD tag is chosen.
    """

    RARE_FEATURES = ["Fixd", "Litr"]
    SPECIAL_FEATURES = ["Patr", "Surn"]

    def __init__(self,
                 save_path: Optional[str] = None,
                 load_path: Optional[str] = None,
                 rare_grammeme_penalty: float = 1.0,
                 long_lemma_penalty: float = 1.0,
                 **kwargs) -> None:
        self.rare_grammeme_penalty = rare_grammeme_penalty
        self.long_lemma_penalty = long_lemma_penalty
        self._reset()
        self.analyzer = MorphAnalyzer()
        self.converter = converters.converter("opencorpora-int", "ud20")
        super().__init__(save_path, load_path, **kwargs)

    def save(self, *args, **kwargs):
        pass

    def load(self, *args, **kwargs):
        pass

    def _reset(self):
        self.memo = dict()

    def _extract_lemma(self, parse: Parse) -> str:
        special_feats = [x for x in self.SPECIAL_FEATURES if x in parse.tag]
        if len(special_feats) == 0:
            return parse.normal_form
        # here we process surnames and patronyms since PyMorphy lemmatizes them incorrectly
        for other in parse.lexeme:
            tag = other.tag
            if any(x not in tag for x in special_feats):
                continue
            if tag.case == "nomn" and tag.gender == parse.tag.gender and tag.number == "sing":
                return other.word
        return parse.normal_form

    def _lemmatize(self, word: str, tag: Optional[str] = None) -> str:
        lemma = self.memo.get((word, tag))
        if lemma is not None:
            return lemma
        parses = self.analyzer.parse(word)
        best_lemma, best_distance = word, np.inf
        for i, parse in enumerate(parses):
            curr_tag = self.converter(str(parse.tag))
            distance = get_tag_distance(tag, curr_tag)
            for feat in self.RARE_FEATURES:
                if feat in parse.tag:
                    distance += self.rare_grammeme_penalty
                    break
            if len(word) == 1 and len(parse.normal_form) > 1:
                distance += self.long_lemma_penalty
            if distance < best_distance:
                best_lemma, best_distance = self._extract_lemma(
                    parse), distance
                if distance == 0:
                    break
        self.memo[(word, tag)] = best_lemma
        return best_lemma
Beispiel #36
0
def read_text_lemmas(fileobj):
    m = MorphAnalyzer()
    for line in fileobj:
        yield ' '.join((m.parse(t)[0].normal_form for t in simple_word_tokenize(line.decode('utf-8'))))
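
A minimal usage sketch (assumptions: a corpus.txt file exists and simple_word_tokenize comes from pymorphy2.tokenizers; the file is opened in binary mode because the generator decodes each line itself):

with open('corpus.txt', 'rb') as f:
    for lemmatized_line in read_text_lemmas(f):
        print(lemmatized_line)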
Beispiel #37
0
 def lemmatize(self, tokens):
     """
     :param tokens: a list of tokens to lemmatize
     """
     analyzer = MorphAnalyzer()
     return Counter([analyzer.parse(token)[0].normal_form for token in tokens if len(token) > 1])
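
A minimal usage sketch (the instance name prep is hypothetical; any instance of the enclosing class works, since the method builds its own MorphAnalyzer):

tokens = ['коты', 'кот', 'котам', 'и']
print(prep.lemmatize(tokens))  # roughly Counter({'кот': 3}); single-character tokens are skipped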
Beispiel #38
0
class LamaBot(object):
    def __init__(self, app_id, mail_manager,
                 chat_id=1, number_of_seconds_for_the_rest=60, chat_id_for_mails=None, admins=None, **kwargs):
        """
        Initializes Lama Bot.

        Expects login/password or access_token as named parameters

        :param mail_manager: A manager for retrieving mails
        :type mail_manager: AbstractMailManager

        :param chat_id: Chat identifier
        :type chat_id: int

        :param chat_id_for_mails: Chat for mails. Same as chat_id, if not presented
        :type chat_id_for_mails: int

        :raise ValueError: When neither login/password nor access_token was provided
        """
        self.exit_event = Event()
        self.morph = MorphAnalyzer()
        self.version = '0.1.1'
        self.app_id = app_id
        self.access_token = None
        self.password = None
        self.login = None
        self.vkapi = ThreadSafeVkApi(app_id=app_id, **kwargs)
        self.commands = {}
        self._plugins = []
        self.mail_manager = mail_manager
        self.number_of_seconds_for_the_rest = number_of_seconds_for_the_rest
        self.chat_id = chat_id
        self.chat_id_for_mails = chat_id_for_mails or self.chat_id
        self.admins = admins or []

        self.initialize_commands()

    def initialize_commands(self):
        self.commands = {
            'post_to_dialog': lambda args, m: self.safe_post_message_and_log_if_failed(args),
            'ping': self.pong_to_admins
        }

    def safe_notify_about_unread_mails(self):
        for m in self.safe_unread_mails:
            if self.safe_post_mail_and_log_if_failed(m):
                self.mail_manager.safe_mark_mail_as_read_and_log_if_failed(m)

    def safe_process_directed_dialog_message(self, message):
        logging.debug(u'Processing message with body {}'.format(message.body))
        words = self.split_to_words(message.body)
        logging.debug(u'Words in the body: {}'.format(words))
        self.safe_process_plugins(message, words)
        self.safe_mark_message_as_read_and_log_if_failed(message)

    def safe_process_private_message(self, message):
        if self.safe_execute_and_log_if_failed(message):
            self.safe_mark_message_as_read_and_log_if_failed(message)

    @safe_call_and_log_if_failed
    def safe_process_plugins(self, message, words):
        normalized_words = self.normalize_words(words)
        for p in self.plugins:
            p.process_input(message.body, words, normalized_words, message)

    def long_pool_loop(self, exit_event):
        server, key, ts = self.extract_server_key_and_timestamp_from_get_long_poll_server_response()

        while not exit_event.is_set():
            response = self.send_long_poll_request(server, key, ts)
            if 'failed' in response:
                server, key, ts = self.extract_server_key_and_timestamp_from_get_long_poll_server_response()
            else:
                self.process_long_poll_response(response)
                ts = self.get_timestamp(response, ts)

    def extract_server_key_and_timestamp_from_get_long_poll_server_response(self):
        response = self.vkapi.messages_get_long_poll_server()
        while not all(x in response for x in ('server', 'key', 'ts')):
            logging.error('Could not retrieve credentials for connecting to long poll server', response)
            response = self.vkapi.messages_get_long_poll_server()
        return response['server'], response['key'], response['ts']

    @safe_call_and_log_if_failed(default={'failed': True})
    def send_long_poll_request(self, server, key, ts, act='a_check', wait=25, mode=2):
        params = {
            'act': act,
            'key': key,
            'ts': ts,
            'wait': wait,
            'mode': mode
        }
        return requests.get('http://{server}'.format(server=server), params=params).json()

    def process_long_poll_response(self, response):
        if response:
            for update in response.get('updates', []):
                self.process_long_poll_update(update)

    def process_long_poll_update(self, update):
        functions = {
            4: self.process_long_poll_new_message
        }
        function = functions.get(update[0])
        if function:
            function(update)

    def process_long_poll_new_message(self, update):
        chat_id = self.get_chat_id_from_long_poll_new_message_update(update)
        fwd_messages = self.get_fwd_messages_from_long_poll_new_message_update(update)
        self.process_new_message(VkMessage({'id': update[1],
                                            'user_id': None,
                                            'read_state': (update[2] + 1) % 2,
                                            'chat_id': chat_id,
                                            'title': update[5],
                                            'body': update[6],
                                            'fwd_messages': fwd_messages,
                                            'out': (update[2] & 2) >> 1}))

    def process_new_message(self, message):
        if message.is_unread and message.is_inbox:
            if message.chat_id == self.chat_id and self.message_is_directed(message):
                self.safe_process_directed_dialog_message(message)
            elif message.is_private:
                self.safe_process_private_message(message)

    def get_fwd_messages_from_long_poll_new_message_update(self, update):
        return map(self.convert_fwd_from_long_poll_new_message_update_to_fwd_message,
                   ifilter(None,
                           self.get_attachments_from_long_poll_new_message_update(update).get('fwd', '').split(',')))

    @staticmethod
    def convert_fwd_from_long_poll_new_message_update_to_fwd_message(fwd):
        regex = re.compile(r'(?P<user_id>\d+)_(?P<msg_id>\d+)')
        m = regex.match(fwd)
        return {
            'id': m.group('msg_id'),
            'user_id': m.group('user_id')
        }

    @staticmethod
    def get_chat_id_from_long_poll_new_message_update(update):
        """
        The message was sent from chat if user_id is greater than 2000000000
        :param update:
        :return:
        """
        return update[3] - 2000000000 if update[3] > 2000000000 else None

    def get_user_id_from_long_poll_new_message_update(self, update):
        """
        Retrieves user_id from update according to documentation
        https://vk.com/pages?oid=-17680044&p=Connecting_to_the_LongPoll_Server
        :param update:
        :return:
        """
        return self.get_attachments_from_long_poll_new_message_update(update).get('from')

    @staticmethod
    def get_attachments_from_long_poll_new_message_update(update):
        return update[7] if len(update) > 7 else {}

    @staticmethod
    def get_timestamp(response, default):
        return response.get('ts', default) if response else default

    @property
    def unread_mails(self):
        return self.mail_manager.unread_mails

    @property
    def safe_unread_mails(self):
        """
        Just delegates the work to the mail manager
        :return:
        """
        return self.mail_manager.safe_unread_mails

    @property
    def vkapi_messages_get(self):
        return self.vkapi.messages_get()

    @property
    def plugins(self):
        """

        :rtype : a list of LamaPlugin
        """
        return self._plugins

    def vkapi_messages_set_activity_in_chat(self):
        return self.vkapi.messages_set_activity(chat_id=self.chat_id, type='typing')

    def post_mail(self, mail):
        """
        Posts mail to VK. Loads and attaches documents, if any.
        :param mail:
        :return:
        """
        documents = None
        if mail.attachments:
            documents = filter(None, imap(self.safe_upload_attachment, mail.attachments))
        self.post_message_to_mail_dialog(self.wrap_mail(mail), attachments=documents)

    @safe_call_and_log_if_failed(default=False)
    def safe_post_mail_and_log_if_failed(self, mail):
        """
        :param mail:
        :return: True if no error, False otherwise
        """
        self.post_mail(mail)
        return True

    @safe_call_and_log_if_failed()
    def safe_post_message_and_log_if_failed(self, message):
        self.post_message_to_dialog(message)

    @safe_call_and_log_if_failed
    def pong_to_admins(self, _, message):
        self.post_message_to_admins('Pong', forward_messages=[message])

    @safe_call_and_log_if_failed
    def safe_post_message_with_forward_messages(self, message, forward_messages):
        self.post_message_to_dialog(message, forward_messages=forward_messages)

    def execute(self, message):
        s = message.body
        command, args = self.split_to_command_and_argument(s)
        if command in self.commands:
            self.commands[command](args, message)
        else:
            self.command_not_found(command)

    @safe_call_and_log_if_failed(default=False)
    def safe_execute_and_log_if_failed(self, message):
        self.execute(message)
        return True

    @staticmethod
    def split_to_command_and_argument(command):
        values = command.split(':', 1)
        if len(values) != 2:
            values.append(None)
        return values[0], values[1]

    def _post_message_to_dialog(self, chat_id, message, attachments=None, forward_messages=None):
        """
        Posts message to dialog. Attaches attachments, if any.
        :param forward_messages: Messages to be forwarded
        :type forward_messages: [VkMessage]
        :param attachments:Documents to be attached
        :type attachments: [VkDocument]
        :param message:
        """
        attachments = attachments or []
        forward_messages = forward_messages or []
        attachment = ','.join(map(lambda d: d.attachment_string, attachments))
        forward_messages_str = ','.join(map(lambda m: str(m.id), forward_messages))
        self.vkapi.messages_send(chat_id=chat_id,
                                 message=message,
                                 attachment=attachment,
                                 forward_messages=forward_messages_str)

    def post_message_to_dialog(self, message, attachments=None, forward_messages=None):
        self._post_message_to_dialog(self.chat_id, message, attachments=attachments, forward_messages=forward_messages)

    def post_message_to_mail_dialog(self, message, attachments=None, forward_messages=None):
        self._post_message_to_dialog(self.chat_id_for_mails, message,
                                     attachments=attachments, forward_messages=forward_messages)

    def post_startup_message_to_admins(self):
        self.post_message_to_admins('The Lama is ready to work! (version {0})'.format(self.version))

    @safe_call_and_log_if_failed
    def post_message_to_admins(self, message, forward_messages=None):
        forward_messages = forward_messages or []
        forward_messages_str = ','.join(map(lambda m: str(m.id), forward_messages))
        for user_id in self.admins:
            self.vkapi.messages_send(user_id=user_id, message=message, forward_messages=forward_messages_str)

    def command_not_found(self, command):
        message = u'Command `{}` not found'.format(command).encode('utf-8')
        logging.warning(message)

    def run(self, post_welcome_message_to_dialog=True):
        if post_welcome_message_to_dialog:
            self.post_startup_message_to_admins()

        long_poll = Thread(target=self.long_pool_loop, args=(self.exit_event,))
        long_poll.start()

        while True:
            self.safe_notify_about_unread_mails()
            time.sleep(self.number_of_seconds_for_the_rest)

    def stop_running(self):
        self.exit_event.set()

    @safe_call_and_log_if_failed
    def safe_upload_attachment(self, attachment):
        """
        Uploads given attachment

        :type attachment: Attachment
        :rtype: VkDocument
        """
        if attachment.is_loaded:
            url = self.safe_docs_get_upload_server()
            file_string = self.safe_upload_file_to_server(url, self.create_attachment_filename(attachment.filename),
                                                          attachment.data, attachment.mime_type)
            return self.safe_save_doc_file(file_string, attachment.filename)

    @safe_call_and_log_if_failed
    def safe_upload_message_photo(self, image_file_path):
        if image_file_path is not None:
            url = self.safe_get_upload_server_for_private_message_photo()
            data = self.safe_upload_photo_to_server(url, self.create_attachment_filename(image_file_path),
                                                    self.get_image_data(image_file_path),
                                                    self.get_mime_type(image_file_path))
            photo_name = os.path.basename(image_file_path)
            return self.safe_save_photo_file(data['photo'], data['server'], data['hash'], photo_name)

    @staticmethod
    def get_image_data(image_filename):
        with open(image_filename, 'rb') as f:
            data = f.read()
        return data

    @staticmethod
    def get_mime_type(image_filename):
        # guess_type() returns a (type, encoding) tuple; only the MIME type string is needed here
        return mimetypes.guess_type(image_filename)[0]

    @safe_call_and_log_if_failed
    def safe_save_photo_file(self, photo, server, hash, title):
        if photo:
            responses = self.vkapi.photos_save_message_photo(photo=photo, server=server, hash=hash, title=title)
            return VkPhoto(responses[0])

    @safe_call_and_log_if_failed
    def safe_get_upload_server_for_private_message_photo(self):
        """
        Retrieves upload_url for storing files
        """
        return self.vkapi.photos_get_messages_upload_server()['upload_url']

    @staticmethod
    def create_attachment_filename(filename):
        _, extension = os.path.splitext(filename)
        return 'attachment' + extension

    @safe_call_and_log_if_failed
    def safe_upload_to_server(self, url, filename, data, mime_type, post_name):
        """
        Uploads data to given url and saves it with given filename and mime_type

        :return: Raw response, returned by post request
        """
        if url:
            request = requests.post(url, files={post_name: (filename or 'NoName', data, mime_type)})
            response = json.loads(request.text)
            if 'error' in response:
                raise Exception(response['error'])
            else:
                return response

    def safe_upload_file_to_server(self, url, filename, data, mime_type):
        return self.safe_upload_to_server(url, filename, data, mime_type, 'file')['file']

    def safe_upload_photo_to_server(self, url, filename, data, mime_type):
        return self.safe_upload_to_server(url, filename, data, mime_type, 'photo')

    @safe_call_and_log_if_failed
    def safe_save_doc_file(self, file_string, title):
        """
        Saves file on VK server by given string

        :param file_string: String, returned after uploading file
        :return: Saved document
        :rtype: VkDocument
        """
        if file_string:
            responses = self.vkapi.docs_save(file=file_string, title=title)
            return VkDocument(responses[0])

    @safe_call_and_log_if_failed
    def safe_docs_get_upload_server(self):
        """
        Retrieves upload_url for storing files
        """
        return self.vkapi.docs_get_upload_server()['upload_url']

    def retrieve_users_by_ids(self, *user_ids):
        return map(VkUser, self.vkapi.users_get(user_id=','.join(imap(str, user_ids))))

    @staticmethod
    def wrap_mail(mail):
        return LamaBeautifier.get_random_mail_pattern().format(subject=mail.subject, sender=mail.sender, body=mail.body)

    @staticmethod
    def message_is_directed(message):
        return message.body is not None and message.body.encode('utf-8').startswith('Лама, ')

    @staticmethod
    def message_has_body(message):
        return message.body is not None

    def mark_message_as_read(self, message):
        self.mark_message_as_read_by_id(message.id)

    @safe_call_and_log_if_failed(default=False)
    def safe_mark_message_as_read_and_log_if_failed(self, message):
        self.mark_message_as_read(message)
        return True

    def mark_message_as_read_by_id(self, message_ids):
        self.vkapi.messages_mark_as_read(message_ids=message_ids)

    def register_plugin(self, plugin):
        self._plugins.append(plugin)
        plugin.bot = self

    def split_to_words(self, body):
        return body.encode('utf-8').translate(string.maketrans('', ''), string.punctuation).split()

    def normalize_words(self, words):
        return map(self.normalize_word, words)

    def normalize_word(self, word):
        return self.morph.parse(word.decode('utf8'))[0].normal_form.encode('utf8')
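A minimal standalone sketch of the split-and-normalize step used above, written for Python 3 and assuming only that pymorphy2 is installed; the free-standing helper names are illustrative, not part of the bot class:

import string

from pymorphy2 import MorphAnalyzer

morph = MorphAnalyzer()


def split_to_words(body):
    # Strip ASCII punctuation, then split on whitespace.
    return body.translate(str.maketrans('', '', string.punctuation)).split()


def normalize_words(words):
    # Take the first (most probable) pymorphy2 parse of each word and keep its normal form.
    return [morph.parse(w)[0].normal_form for w in words]


print(normalize_words(split_to_words('Лама, проверь почту!')))  # e.g. ['лама', 'проверить', 'почта']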
Beispiel #39
0
class Timer(AppDaemon):

    def initialize(self):

        self.morph = MorphAnalyzer()

        self.ok_phrases = ["Без проблем, таймер сработает через {time}",
                           "Ок, таймер сработает через {time}",
                           "Сделано, таймер сработает через {time}",
                           "Готово, сработает через {time}",
                           "Ставлю таймер на {time}",
                           "Таймер на {time} - сделано"]

        self.remaining_phrases = ["Осталось {remain}.",
                                  "Таймер сработает через {remain}.",
                                  "{remain} до конца."]

        self.not_set_phrases = ["Таймер не установлен.",
                                "Я не засекала.",
                                "Прости, я не засекала.",
                                "Таймер? Не знаю. Не засекала."]

        self.already_set_phrases = ["Таймер уже установлен.",
                                    "Ты уже поставил таймер."]

        self.ok_remove_phrases = ["Отменяю.",
                                  "Хорошо, таймер отключен.",
                                  "Таймер выключен"]

        self.timer_ended_phrases = ["Время вышло!",
                                    "Сработал таймер",
                                    "Время!"]

        engine = self.get_app("brain").engine
        keyword = ["таймер", "таймеры"]
        self.set_timer_words = ["поставить", "установить"]
        self.reset_timer_words = ["сбросить", "отменить", "остановить"]
        self.state_timer_words = ["как", "остаться", "состояние"]
        re_hours = "(?P<TimerHours>[0-9]+) (час|часы)"
        re_minutes = "(?P<TimerMinutes>[0-9]+) минута"
        re_seconds = "(?P<TimerSeconds>[0-9]+) секунда"
        for k in keyword:
            engine.register_entity(k, "TimerKeyword")
        for a in self.set_timer_words:
            engine.register_entity(a, "TimerAction")
        for a in self.reset_timer_words:
            engine.register_entity(a, "TimerAction")
        for a in self.state_timer_words:
            engine.register_entity(a, "TimerAction")
        engine.register_regex_entity(re_hours)
        engine.register_regex_entity(re_minutes)
        engine.register_regex_entity(re_seconds)
        timer_intent = IntentBuilder("timer")\
            .require("TimerKeyword")\
            .optionally("TimerAction")\
            .optionally("TimerHours")\
            .optionally("TimerMinutes")\
            .optionally("TimerSeconds")\
            .build()
        engine.register_intent_parser(timer_intent)

        self.context_sensitive = True
        self.context_blacklist = ["TimerHours", "TimerSeconds", "TimerMinutes"]

        self.timer_handler = threading.Timer(10, self.timer_ended)

        print("timer initialized")

    def handle(self, intent_dict):
        action = intent_dict.get("TimerAction", "состояние")
        # if "TimerHours" in intent_dict or "TimerMinutes" in intent_dict or "TimerSeconds" in intent_dict:
        # action = "поставить"
        if action in self.set_timer_words:
            print("setting timer")
            return self.start_timer(intent_dict)
        elif action in self.reset_timer_words:
            print("stopping timer")
            return self.stop_timer()
        elif action in self.state_timer_words:
            print("getting state")
            return self.state_timer()
        else:
            return "Прости, что то не так пошло с этим таймером"

    def start_timer(self, intent_dict):
        if self.timer_handler.is_alive():
            return choice(self.already_set_phrases)
        hours = int(intent_dict.get("TimerHours", 0))
        print("hours:", hours)
        minutes = int(intent_dict.get("TimerMinutes", 0))
        print("minutes:", minutes)
        seconds = int(intent_dict.get("TimerSeconds", 0))
        print("seconds:", seconds)
        time = hours * 3600 + minutes * 60 + seconds
        self.timer_handler = threading.Timer(time, self.timer_ended)
        self.timer_handler.start()
        self.timer_end_time = datetime.datetime.now() + datetime.timedelta(seconds=time)
        print(self.timer_handler.is_alive())
        return choice(self.ok_phrases).format(time=self.pron_time(hours, minutes, seconds))

    def state_timer(self):
        if not self.timer_handler.is_alive():
            return choice(self.not_set_phrases)
        timer_delta = self.timer_end_time - datetime.datetime.now()
        hours, secs = divmod(timer_delta.seconds, 3600)
        minutes, seconds = divmod(secs, 60)
        return choice(self.remaining_phrases).format(remain=self.pron_time(hours, minutes, seconds))

    def stop_timer(self):
        # A Timer object is always created in initialize(), so check liveness instead of None.
        if not self.timer_handler.is_alive():
            return choice(self.not_set_phrases)
        self.timer_handler.cancel()
        return choice(self.ok_remove_phrases)

    def timer_ended(self):
        print("Timer!!!")
        # yandex tts say some phrase

    def pron_time(self, hours, minutes, seconds):
        # Agree each unit noun with its number, e.g. "1 час", "2 часа", "5 часов".
        phours = (str(hours) + " " + self.morph.parse("час")[0].make_agree_with_number(hours).word
                  if hours > 0 else "")
        pminutes = (str(minutes) + " " + self.morph.parse("минута")[0].make_agree_with_number(minutes).word
                    if minutes > 0 else "")
        pseconds = (str(seconds) + " " + self.morph.parse("секунда")[0].make_agree_with_number(seconds).word
                    if seconds > 0 else "")
        # Skip empty parts so the result contains no stray spaces.
        return " ".join(p for p in (phours, pminutes, pseconds) if p)
Beispiel #40
0
class TextProcessing:  # TODO: add stemming
    """
        Performs the usual text processing steps: tokenizing, lemmatizing, and removing stopwords.
        :param token_pat: regex pattern used to extract tokens from text
        :param mode: 'normal' provides the usual tokenization; 'nospace' is a probability-based mode that recovers
        words from a given sequence of characters and requires the counter argument to be passed
        :param counter: Counter object with word frequencies
        :param threshold: maximum number of parts a 'nospace' split may produce and still be accepted
        :param allowed_pos: iterable of parts of speech to keep after lemmatizing with pymorphy2.MorphAnalyzer;
        words with other POS tags are dropped
        :param stop_words: if None, the default Russian stopwords are used
        :param stop_cities: if True, Russian city names are dropped as well
    """
    def __init__(self,
                 token_pat="[а-я]+",
                 mode="normal",
                 counter=None,
                 threshold=3,
                 allowed_pos=None,
                 stop_words=None,
                 stop_cities=False):
        self.token = token_pat
        self.mode = mode

        if self.mode not in {"normal", "nospace"}:
            raise ValueError("Unknown mode")
        elif self.mode == "nospace":
            if not isinstance(counter, Counter):
                raise ValueError(
                    "In 'nospace' mode the counter attribute should be passed")
            self.counter = counter
            self.nospace = NoSpaceSplitter(counter)
            self.threshold = threshold

        self.morph = MorphAnalyzer()
        self.allowed_pos = allowed_pos
        self.stop_words = stop_words or STOPWORDS
        if stop_cities:
            # union() returns a new set, so rebind instead of discarding the result
            self.stop_words = self.stop_words.union(CITIES)

    def tokenize(self, doc):
        """
        :param doc: must be a string or an iterable; a string is split into tokens,
        anything else is left unchanged
        :return: list of tokens
        """
        if isinstance(doc, str):
            doc = re.findall(self.token, doc.lower())
        elif not isinstance(doc, Iterable):
            raise ValueError("The doc must be a string or iterable")
        if self.mode == "nospace":
            return self._no_space_split(doc)
        return doc

    def _no_space_split(self, doc):
        res = []
        for w in doc:
            split = self.nospace.segment(w)
            if len(split) <= self.threshold:
                res.extend(split)
            else:
                # Too many parts: keep the original token rather than splitting it into characters.
                res.append(w)
        return res

    def lemmatize(self, doc):
        """
        :param doc: iterable, list of words
        :return: most probable normal forms of words in doc
        """
        res = []
        for w in doc:
            parsed = self.morph.parse(w)[0]
            # compare the lemma string (the parse itself is a Parse object, not a string)
            if parsed.normal_form in SPECIAL_WORDS:
                continue
            if self.allowed_pos:
                if parsed.tag.POS in self.allowed_pos:
                    res.append(parsed.normal_form)
                else:
                    continue
            else:
                res.append(parsed.normal_form)
        return res

    def clear_stop_words(self, doc):
        """
        :param doc: iterable, list of words
        :return: doc without stopwords
        """
        return [w for w in doc if w not in self.stop_words]

    def transform(self, corpora: "pd.Series"):
        """
        Process full pipeline: tokenizing, deleting stopwords, lemmatizing
        :param corpora: pd.Series to process
        :return: processed data
        """
        data = (corpora.map(self.tokenize).map(self.clear_stop_words).map(
            self.lemmatize))
        return data
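A hedged usage sketch for the pipeline above, assuming the class and its module-level constants (STOPWORDS, SPECIAL_WORDS) are importable and that the default 'normal' mode is sufficient; the data is made up and the printed output is only indicative:

import pandas as pd

# Hypothetical corpus; transform() tokenizes, drops stopwords, then lemmatizes each row.
corpora = pd.Series(["Кошки пьют молоко", "Дети читают интересные книги"])
tp = TextProcessing(allowed_pos={"NOUN", "VERB"})
print(tp.transform(corpora).tolist())
# e.g. [['кошка', 'пить', 'молоко'], ['ребёнок', 'читать', 'книга']]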
Beispiel #41
0
class Analyser():
    def __init__(self):
        # Load the model configuration files
        main_model_config_path = MODELS_PATHS["main_model_config"]
        main_model_weights_path = MODELS_PATHS["main_model_weights"]
        gram_dict_input = MODELS_PATHS["gram_input"]
        gram_dict_output = MODELS_PATHS["gram_output"]
        word_dictionary = MODELS_PATHS["word_dictionary"]
        char_set_path = MODELS_PATHS["char_set"]
        build_config = MODELS_PATHS["build_config"]

        self.converter = converters.converter('opencorpora-int', 'ud14')
        self.morph = MorphAnalyzer()  #Pymorphy2

        self.build_config = BuildModelConfig()
        self.build_config.load(build_config)

        self.model = LSTMModel()
        self.model.prepare(gram_dict_input, gram_dict_output, word_dictionary,
                           char_set_path)
        self.model.load_main_model(self.build_config, main_model_config_path,
                                   main_model_weights_path)

    def analyse(self, words: List[str]) -> List[WordForm]:
        """
        Grammatical analysis of the input text (without splitting it into sentences)
        """
        words_predicts = self.model.predict_gram_analysis([words], 1,
                                                          self.build_config)[0]
        return self.predictionsParsing(words, words_predicts)

    def analyse_sentences(self,
                          sentences: List[List[str]],
                          batch_size: int = 64) -> List[List[WordForm]]:
        """
        Grammatical analysis of a text sample (split into individual sentences)
        """
        sentences_predicts = self.model.predict_gram_analysis(
            sentences, batch_size, self.build_config)
        answers = []
        for words, words_predicts in zip(sentences, sentences_predicts):
            answers.append(self.predictionsParsing(words, words_predicts))
        return answers

    def split_text_on_words(self, text: str) -> List[str]:
        """
        Splits the text into words and punctuation marks
        """
        words = []
        separators = {
            ",", ".", ";", "-", "\"", ":", "'", "—", "(", ")", "?", "!"
        }
        for word in text.split(" "):
            if word == "":
                continue
            count = 0
            for s in word:
                if s in separators:
                    if count > 0:
                        words.append(word[0:count])
                    words.append(word[count])
                    word = word[count + 1:len(word)]
                    count = 0
                else:
                    count += 1
            if len(word) > 0:
                words.append(word)
        return words

    def get_word_dictionary_for_text(
            self, wordForms: List[WordForm]) -> List[WordForm]:
        """
        Builds a dictionary of unique words keyed by normal form and computes how often each one occurs in the text
        """
        uniqieWordsDictionary = []
        uniqueWords = []
        for wordForm in wordForms:
            normalForm = wordForm.normal_form
            if normalForm not in uniqueWords:
                uniqueWords.append(normalForm)
                uniqieWordsDictionary.append(wordForm)

        for uniqueWord in uniqieWordsDictionary:
            frequency = 0
            for wordForm in wordForms:
                if uniqueWord.normal_form == wordForm.normal_form:
                    frequency += 1
            uniqueWord.frequency = frequency
            uniqueWord.pos = self.translatePos(uniqueWord.pos)
        return uniqieWordsDictionary

    def translatePos(self, pos: str) -> str:
        # Map UD POS tags to their Russian abbreviations; unknown tags are returned unchanged.
        pos_translations = {
            "NOUN": "сущ.", "ADJ": "прил.", "VERB": "гл.", "NUM": "числит.",
            "CONJ": "союз", "INTJ": "междом.", "ADP": "предлог", "DET": "местоим.",
            "ADV": "нареч.", "PUNCT": "пункт.", "PART": "частица",
            "PRON": "местоим.", "PROPN": "имя собств.",
        }
        return pos_translations.get(pos, pos)

    def predictionsParsing(
            self, words: List[str],
            words_predicts: List[List[float]]) -> List[WordForm]:
        """
        Converts a raw model prediction into a usable form (a WordForm object).
        """
        result = []
        for word, word_prob in zip(words, words_predicts[-len(words):]):
            result.append(self.wordFormBuilding(word, word_prob[1:]))
        return result

    def wordFormBuilding(self, word: str, predicts: List[float]) -> WordForm:
        """
        Builds a WordForm from the word and the tag index produced by the vectorizer.
        """
        word_forms = self.morph.parse(word)

        vectorizer = self.model.grammeme_vectorizer_output
        # index of the grammatical tag with the highest predicted probability
        tag_num = int(np.argmax(predicts))
        score = predicts[tag_num]
        full_tag = vectorizer.get_name_by_index(tag_num)
        pos, tag = full_tag.split("#")[0], full_tag.split("#")[1]
        lemma = self.getWordNormalForm(word, pos, tag, word_forms)
        vector = np.array(vectorizer.get_vector(full_tag))
        result_form = WordForm(word=word,
                               normal_form=lemma,
                               pos=pos,
                               tag=tag,
                               vector=vector,
                               score=score)
        return result_form

    def getWordNormalForm(self,
                          word: str,
                          pos_tag: str,
                          gram: str,
                          word_forms=None):
        """
        Determines the lemma of the word using pymorphy2
        """
        if word_forms is None:
            word_forms = self.morph.parse(word)
        guess = ""
        max_common_tags = 0
        for word_form in word_forms:
            word_form_pos_tag, word_form_gram = convert_from_opencorpora_tag(
                self.converter, word_form.tag, word)
            word_form_gram = filter_gram_tag(word_form_gram)
            common_tags_len = len(
                set(word_form_gram.split("|")).intersection(
                    set(gram.split("|"))))
            if common_tags_len > max_common_tags and word_form_pos_tag == pos_tag:
                max_common_tags = common_tags_len
                guess = word_form
        # Fall back to the first pymorphy2 parse if no form matched the predicted POS and grammemes.
        if guess == "":
            guess = word_forms[0]
        return guess.normal_form
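Since split_text_on_words above never touches self, its behaviour can be illustrated without loading any model files; a hedged sketch calling it unbound:

# Illustration only: pass None for `self` so the heavyweight __init__ is skipped.
print(Analyser.split_text_on_words(None, "Привет, мир!"))
# -> ['Привет', ',', 'мир', '!']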
Beispiel #42
0
df = pd.DataFrame(list(zip(titles, texts)),
                  columns=['titles', 'texts'])
df.head()


# In[ ]:


df.to_csv('textbase.csv')


# In[96]:


word_form_instance = {}
parse_inst = morph.parse('синхрофазатроны')[0]
word_form_instance['word'] = parse_inst.word
word_form_instance['lemma'] = parse_inst.normal_form
word_form_instance['form'] = parse_inst.tag
word_form_instance['POS'] = parse_inst.tag.POS


# In[450]:


toy_df = pd.DataFrame({
   'EmployeeId': ['001', '002', '003', '004'],
   'City': ['я хорошая и дружелюбная сорока , меня хвалят. ', 'бегает. прыгает. ', 'смешной , и. расплывчатый', 'кто ты. кто я. '] 
})
toy_df
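A hedged continuation of the cells above: collecting the same four fields for every whitespace-separated token of toy_df['City'], assuming the notebook's morph analyzer and the pandas import are still in scope (the tokenization is deliberately naive):

# In[ ]:


records = []
for text in toy_df['City']:
    for token in text.split():
        p = morph.parse(token)[0]
        records.append({'word': p.word, 'lemma': p.normal_form,
                        'form': str(p.tag), 'POS': p.tag.POS})
parsed_df = pd.DataFrame(records)
parsed_df.head()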
Beispiel #43
0
def read_lemmas(fileobj):
    # each line contains one (already tokenized) sentence
    m = MorphAnalyzer()
    for line in fileobj:
        yield [m.parse(t)[0].normal_form for t in line.decode('utf-8').split()[1:]]
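A hedged usage sketch: the file name is hypothetical, the file is opened in binary mode because the generator decodes each line itself, and the [1:] slice assumes the first token of every line is an ID-like field that should be skipped:

with open('sentences.txt', 'rb') as fileobj:
    for lemmas in read_lemmas(fileobj):
        print(lemmas)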
Beispiel #44
0
def tokenize_ru(file_text):
    tokens = word_tokenize(file_text)
    tokens = [i for i in tokens if (i not in string.punctuation)]
    tokens = [
        i for i in tokens if (ma.parse(i)[0].tag.POS not in functors_pos)
    ]
    tokens = [i.replace("«", "").replace("»", "") for i in tokens]
    return tokens


tokens1 = wt.tokenize(text1)
tokens2 = wt.tokenize(text2)
functors_pos = {'INTJ', 'PRCL', 'CONJ', 'PREP'}  # function-word POS tags to drop
# Remove punctuation
tokens1 = [i for i in tokens1 if (i not in string.punctuation)]
tokens2 = [i for i in tokens2 if (i not in string.punctuation)]
# Drop function words
tokens1 = [i for i in tokens1 if (ma.parse(i)[0].tag.POS not in functors_pos)]
tokens2 = [i for i in tokens2 if (ma.parse(i)[0].tag.POS not in functors_pos)]
# Convert to lowercase
tokens1 = [t.lower() for t in tokens1]
tokens2 = [t.lower() for t in tokens2]

sentences = [tokenize_ru(sent) for sent in sent_tokenize(text, 'russian')]
model = gensim.models.Word2Vec(sentences,
                               size=500,
                               window=5,
                               min_count=1,