def wordforms(word):
    """Return the set of all inflected forms of a word."""
    arr = []
    morph = MorphAnalyzer()
    lex = morph.parse(word)[0].lexeme
    for l in lex:
        arr.append(l.word)
    return set(arr)
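# A minimal usage sketch for wordforms() (assumes pymorphy2 and its Russian
# dictionaries are installed; the exact set of forms depends on the
# dictionary version):
if __name__ == '__main__':
    from pymorphy2 import MorphAnalyzer
    print(wordforms('стол'))  # e.g. {'стол', 'стола', 'столу', 'столом', ...}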
def read_test_corpus(fn):
    m = MorphAnalyzer()
    for line in fn:
        line = line.rstrip('\n')
        # We assume the text is already tokenized
        # line = word_tokenize(line)
        line = line.decode('utf-8').split()
        # Parse words against the dictionary; take only the first pymorphy parse
        parses = [m.parse(token) for token in line]
        if line:
            yield [(p[0].word, p[0].tag) for p in parses]
def agree(w1, w2, t1, t2):
    if t1 == "comma" or t2 == "comma":
        return w1, w2
    morph = MorphAnalyzer()
    raw_cur_tags = morph.tag(w1)[-1]
    raw_next_tags = morph.tag(w2)[-1]  # fixed: was morph.tag(w1), which ignored w2
    cur_tags = re.findall(r"\w+", str(raw_cur_tags))
    next_tags = re.findall(r"\w+", str(raw_next_tags))
    if t1[:-2] == "person":
        if t2[:-2] == "verb_right":
            if morph.normal_forms(w2)[0] in dative_verbs:
                w1 = morph.parse(w1)[0].inflect({"datv"}).word
    if t1[:-2] == "verb_right":
        if t2[:-2] == "property":
            pass
        if t2[:-2] == "person":
            if cur_tags[3] == "tran":
                w2 = morph.parse(w2)[0].inflect({"accs"}).word
            else:
                w2 = morph.parse(w2)[0].inflect({"nomn"}).word
                # agree gender with the nominative form only
                gender = next_tags[2]
                if gender == "inan":
                    gender = next_tags[3]
                w1 = morph.parse(w1)[0].inflect({gender}).word
    if t1[:-2] == "adjective":
        if t2[:-2] == "property":
            # gender agreement
            gender = next_tags[2]
            if gender == "inan":
                gender = next_tags[3]
            try:
                w1 = morph.parse(w1)[0].inflect({gender}).word
            except Exception:
                print("inflection failed:", w1, w2)
    if t1[:-2] == "property":
        if t2[:-2] == "person":
            pass
        if t2[:-2] == "adjective":
            gender = cur_tags[2]
            if gender == "inan":
                gender = cur_tags[3]
            try:
                w2 = morph.parse(w2)[0].inflect({gender}).word
            except Exception:
                print("inflection failed:", w1, w2)
    return w1, w2
class MorphTest(unittest.TestCase):
    def __init__(self, document_vector):
        super().__init__()
        self.document = None
        self.documents = document_vector
        self.morph = MorphAnalyzer()

    # def setUp(self):
    #     self.document = documents[randint(0, len(documents))]

    def testMorph(self):
        # fixed: `self.document if not None` always evaluated to self.document
        self.document = self.document if self.document is not None else self.documents[0]
        morph_array = [self.morph.parse(word)[0].normal_form for word in self.document]
        print(morph_array)
        self.assertTrue(True, msg=None)
class MorphAnalyzer(object):
    def __init__(self):
        self.raw = PymorphyAnalyzer()

    def check_gram(self, gram):
        if not self.raw.TagClass.grammeme_is_known(gram):
            raise ValueError(gram)

    def __call__(self, word):
        records = self.raw.parse(word)
        return [prepare_form(_) for _ in records]

    def normalized(self, word):
        return {_.normalized for _ in self(word)}
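# For reference, a minimal sketch of the raw pymorphy2 calls a wrapper like
# the one above delegates to (PymorphyAnalyzer / prepare_form are not shown
# in this snippet, so this uses pymorphy2 directly):
from pymorphy2 import MorphAnalyzer as PymorphyAnalyzer

raw = PymorphyAnalyzer()
records = raw.parse('стекло')
print({r.normal_form for r in records})  # e.g. {'стекло', 'стечь'}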
class Analyzer:
    """
    Analyzes the incoming text: parses each word into lexemes, strips
    punctuation and everything except nouns, verbs and adjectives, and drops
    words from the stop list. Returns the 10 most frequent remaining words.
    """
    def __init__(self, text_array):
        self.text_array = text_array
        self.morph = MorphAnalyzer()
        # Compiled with glvrd.ru, going through all 20 * 1700 words together and by hand
        self.trash_list = \
            {"она", "они", "что", "это", "быть", "аплодисменты", "этот", "как",
             "если", "быть", "если", "для", "все", "этот", "чтобы", "так", "для",
             "который", "тот", "такой", "мой", "смех", "красивый", "дорогой",
             "уютный", "роскошный", "активный", "школа", "должный", "сделать",
             "наш", "мочь", "один", "весь", "свой", "речь", "человек", "слайд",
             "разный", "хотеть", "промышленность", "пытаться", "хороший",
             "позволять", "ваш", "решать", "общий", "продажа", "модуль",
             "множество", "оставлять", "важный", "решение", "заниматься",
             "служить", "реальность", "самка", "самец", "проводить", "известный",
             "таинственность", "быстрый", "большинство", "позволять", "обучение",
             "население", "настоящий", "необходимо", "любой", "большой", "форма",
             "успешный", "обычный", "оказываться", "высокий", "потрясающий",
             "богатый", "документ", "мелкий", "оказывать", "возможность",
             "простой", "крупный", "колония", "система", "реальный", "плохой",
             "мечтание", "огромный", "электрический", "ландшафт", "изломанность",
             "интерактивный", "суть", "позволять", "наличие", "иметься",
             "проводить", "обычный", "мощный", "аналогия", "различный", "самый",
             "эффективность", "низкий", "реальность", "определенный", "являться",
             "пользование", "исторический", "элементарный", "обеспечение",
             "наблюдаться", "обладать", "важный", "известняк", "хотеться",
             "продолжать", "год", "время", "мир", "жизнь", "дело", "проблема",
             "ребенок", "вопрос", "день", "друг", "работа", "идея", "история",
             "место", "часть", "вещь", "страна", "технология", "раз", "женщина",
             "слово", "вода", "вид", "проект", "информация", "мозг", "земля",
             "миллион", "город", "исследование", "помощь", "компания", "образ",
             "рука", "результат", "момент", "конец", "пример", "доллар", "дом",
             "книга", "музыка", "машина", "сторона", "случай", "процесс",
             "группа", "способ", "мужчина", "уровень", "тысяча", "интернет",
             "деньги", "семья", "компьютер", "энергия", "видео", "программа",
             "свет", "модель", "сила", "планета", "клетка", "движение", "тело",
             "наука", "общество", "язык", "фотография", "причина", "война",
             "пациент", "неделя", "миллиард", "будущее", "сеть", "точка", "сша",
             "игра", "отец", "природа", "изменение", "фильм", "цель",
             "устройство", "образование", "материал", "путь", "глаз", "студент",
             "африка", "отношение", "правительство", "болезнь", "связь",
             "количество", "звук", "парень", "искусство", "пространство",
             "организация", "ответ", "лицо", "час", "дизайн", "право",
             "поведение", "эксперимент", "лечение", "индия", "месяц", "мама",
             "карта", "мать", "здание", "изображение", "океан", "родитель",
             "внимание", "улица", "продукт", "развитие", "песня", "структура",
             "рынок", "процент", "голова", "минута", "чувство", "нога", "пара",
             "объект", "создание", "закон", "учитель", "действие"}

    def start(self):
        res = list(filter(
            lambda x: len(x) > 2 and self.pymorphy_analyze(x)
            and re.match("[а-яА-Я]", x) and x not in self.trash_list,
            self.text_array))
        return [x[0] for x in Counter(res).most_common(10)]

    def pymorphy_analyze(self, word):
        lexem = self.morph.parse(word)
        x = lexem[0].tag.POS
        # fixed: `x == ("NOUN" or "ADJF" or "INFN")` only ever compared with "NOUN"
        return x in ("NOUN", "ADJF", "INFN")
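# A minimal usage sketch for Analyzer (assumptions: the imports it relies on
# -- re, collections.Counter, pymorphy2.MorphAnalyzer -- are in scope, and
# the input is a list of lemmatized tokens, since trash_list holds normal
# forms):
tokens = ['кот', 'кот', 'кот', 'коробка', 'коробка', 'на']
print(Analyzer(tokens).start())  # e.g. ['кот', 'коробка']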
def read_tab_corpus(inc):
    m = MorphAnalyzer()
    sent = []
    for t in inc:
        t = t.rstrip().decode('utf-8')
        if not t:
            continue
        if t == u'sent':
            sent = []
            continue
        if t == u'/sent' or t == u'SENT':
            sent = [x[0] for x in sent]
            parses = [m.parse(token) for token in sent]
            if sent:
                yield [(p[0].word, p[0].tag) for p in parses]
            continue
        t = t.split('\t')
        try:
            token = (t[1], ' '.join(t[2].split(' ')[2:]))
            sent.append(token)
        except IndexError:
            continue
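# A minimal usage sketch for read_tab_corpus() (assumptions: 'corpus.tab' is
# a placeholder filename for a tab-separated corpus with sent/SENT markers;
# the file is opened in binary mode so the .decode('utf-8') call above works):
with open('corpus.tab', 'rb') as inc:
    for tagged_sentence in read_tab_corpus(inc):
        print(tagged_sentence)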
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 18 13:38:13 2019

@author: dan
"""


def is_formula(token):
    tags = {''}
    for var in token:
        tags |= var.tag.grammemes
    if {'LATN', 'UNKN'} & tags:
        chars = set(token[0].word)
        return chars
    else:
        return 0


if __name__ == '__main__':
    from pymorphy2 import MorphAnalyzer
    mo = MorphAnalyzer()
    # fixed: r-prefix added, '\f' in '\frac' was a form-feed escape
    for x in ['1/r²', '2*2=4', r'\frac{2,2}', 'help', 'what is love',
              'лингви́стика']:
        print(x, is_formula(mo.parse(x)))
class Event():
    """
    Event object - class for working with event candidates.
    Collects all data on an event candidate, stores it between clustering
    slices; merges slices, if required.
    TBD: constructs and saves description, scores texts and media, scores
    and describes the event itself (probability that the candidate is real,
    event buzz, event category).

    Attributes:
        self.created (datetime): creation timestamp
        self.updated (datetime): last update timestamp
        self.start (datetime): timestamp of the first message in the self.messages dict
        self.end (datetime): timestamp of the last message in the self.messages dict
        self.messages (Dict[dict]): raw tweets from database, enriched with weight,
            is_core params (on init), tokens (after add_stem_texts)
        self.media (Dict[dict]): raw media objects from database
        self.cores (Dict[list]): tokens that form the most common vocabulary for
            the event; computed in the create_core() method
        self.entropy (float): entropy for authorship: 0 for a mono-authored cluster;
            computed in the event_summary_stats() method
        self.ppa (float): average number of posts per author; computed in the
            event_summary_stats() method
        self.authors (int): number of unique authors for the event
        self.most_active_author (float): share of messages written by the most
            active author
        self.authors_share (float): number of authors divided by number of messages
        self.relevant_messages_share (float): share of messages with token_score
            above zero
        self.duration (int): total seconds from self.start to self.end
        self.classifier (Object): classifier for deciding whether the event is real
        self.validity (bool): classifier verdict, whether the event is real or not
        self.verification (bool): manual verification of event quality

    Methods:
        self.event_update: commands to calculate all data on the event, based on
            messages and media
        self.is_successor: checks whether the current event has common messages
            with a specified event slice
        self.is_valid: method for the classifier to determine whether the event
            is an actual event, and not a random collection of messages
        self.classifier_row: unified method for creating a classifier data row
        self.merge: merge the current event with another event, update stat attributes
        self.add_slice: add messages and media to the event, recompute statistics
        self.load / self.dump: serialize/deserialize event and put/get it to Redis
        self.backup / self.restore: dump/restore event to/from MySQL long-term storage
        self.get_messages_data: get MySQL data for message ids
        self.get_media_data: get MySQL data for media using existing message ids
        self.event_summary_stats: calculate statistics and start/end time for the event
        self.add_stem_texts: add token lists to self.messages
        self.create_core: create the vocabulary of the most important words for the event
        self.score_messages_by_text: calculates token_score for messages;
            TF-IDF likelihood with the core is used

    Message keys:
        cluster (int): legacy from DBSCAN - number of cluster (event ancestor)
        id (str): DB message id; unique
        is_core (bool): True, if the tweet belongs to the core of the ancestor cluster
        iscopy (int): 1, if the message is shared from another network
        lat (float): latitude
        lng (float): longitude
        network (int): 2 for Instagram, 1 for Twitter, 3 for VKontakte
        text (str): raw text of the message
        tokens (Set[str]): collection of stemmed tokens from raw text;
            created in add_stem_texts()
        tstamp (datetime): 'created at' timestamp
        user (int): user id, absolutely unique for one network, but matches
            between networks are possible
        token_score (float): agreement estimation with the average cluster text
        weight (float): standard deviations below average
    """

    def __init__(self, mysql_con, redis_con, tokenizer=None, morph=None,
                 classifier=None, points=[]):
        """
        Initialization.

        Args:
            mysql_con (PySQLPoolConnection): MySQL connection object
            redis_con (StrictRedis): RedisDB connection object
            tokenizer (NLTK.TreebankWordTokenizer): object to split tweets into words
            morph (pymorphy2.MorphAnalyzer): word analyzer - converts word tokens
                to normalized form. Requires a lot of memory, so it is not created
                for every event object.
            classifier (Object): scikit-learn trained classifier to detect real
                and fake events
            points (list[dict]): raw messages from the event detector
        """
        self.mysql = mysql_con
        self.redis = redis_con
        if morph:
            self.morph = morph
        else:
            self.morph = MorphAnalyzer()
        if tokenizer:
            self.tokenizer = tokenizer
        else:
            self.tokenizer = TreebankWordTokenizer()
        self.word = compile(r'^\w+$', flags=UNICODE | IGNORECASE)
        self.url_re = compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        self.validity = None
        self.verification = None
        self.cores = {}
        self.classifier = classifier
        if points:
            self.id = str(uuid4())
            self.created = datetime.now()
            self.updated = datetime.now()
            self.messages = {x['id']: x for x in points}
            self.get_messages_data()
            self.media = {}
            self.get_media_data()
            self.event_update()

    def __str__(self):
        txt = "<Event {}: {} msgs [{} -- {}]>".format(
            self.id, len(self.messages),
            self.start.strftime("%Y-%m-%d %H:%M"), self.end.strftime("%H:%M"))
        return txt

    def __unicode__(self):
        return unicode(self.__str__())

    def __repr__(self):
        return self.__str__()

    def event_update(self):
        """
        Commands to calculate all data on the event, based on messages and media.
        """
        self.add_stem_texts()
        self.create_core(deviation_threshold=1)
        self.create_core(deviation_threshold=2)
        self.create_core(deviation_threshold=3)
        self.score_messages_by_text()
        self.event_summary_stats()
        self.is_valid()

    def is_successor(self, slice_ids, sim_index=0.3, only_relevant=True):
        """
        Method checks whether the current event has common messages with the
        specified event slice.

        Args:
            slice_ids (Set): set of message ids to compare with
            sim_index (float): minimal share of messages that should match in a
                slice for it to be detected as a successor
            only_relevant (bool): use only messages with non-zero token_score
                (to exclude spam)
        """
        if only_relevant:
            event_ids = set([k for k, v in self.messages.items() if v['token_score'] > 0])
            if not event_ids:
                event_ids = set(self.messages.keys())
        else:
            event_ids = set(self.messages.keys())
        #if float(len(event_ids.intersection(slice_ids)))/len(event_ids.union(slice_ids)) >= jaccard:
        if float(len(event_ids.intersection(slice_ids)))/min((len(event_ids), len(slice_ids))) >= sim_index:
            return True
        return False

    def is_valid(self):
        """
        Method for the classifier to determine whether the event is an actual
        event, and not a random collection of messages.
        """
        if self.validity:
            return True
        if self.classifier:
            self.validity = bool(self.classifier.predict([self.classifier_row()])[0])
        return self.validity

    def classifier_row(self):
        """
        Unified method for creating a classifier data row. Every variable used
        in prediction is listed here, and only here.
        """
        row = [
            len(self.messages.values()),
            len(self.media.values()),
            self.authors,
            self.most_active_author,
            self.authors_share,
            self.entropy,
            self.ppa,
            self.relevant_messages_share,
            self.duration
        ]
        return row

    def merge(self, other_event):
        """
        Method merges the current event with another event and updates stat
        attributes.

        Args:
            other_event (Event): another event object - to merge with
        """
        self.messages.update(other_event.messages)
        self.media.update(other_event.media)
        self.event_update()
        self.updated = datetime.now()
        self.created = min((self.created, other_event.created))

    def add_slice(self, new_slice):
        """
        Method adds messages and media to the event and recomputes statistics.

        Args:
            new_slice (List[dict]): initial list with messages to be added
        """
        self.messages.update({x['id']: x for x in new_slice})
        self.get_messages_data([x['id'] for x in new_slice])
        self.get_media_data([x['id'] for x in new_slice])
        self.event_update()
        self.updated = datetime.now()

    def backup(self):
        """
        Method dumps the event to MySQL long-term storage; used for
        non-evaluating events.
        """
        if self.verification is None:
            ver = 'NULL'
        else:
            ver = int(self.verification)
        if self.validity is None:
            val = 'NULL'
        else:
            val = int(self.validity)
        msg_string = self.pack()
        q = b'''INSERT INTO events(id, start, end, msgs, description, dumps, verification, validity) VALUES ("{}", "{}", "{}", {}, "{}", "{}", {}, {}) ON DUPLICATE KEY UPDATE `start`=VALUES(`start`), `end`=VALUES(`end`), `msgs`=VALUES(`msgs`), `description`=VALUES(`description`), `dumps`=VALUES(`dumps`), `verification`=VALUES(`verification`), `validity`=VALUES(`validity`);'''.format(
            self.id, self.start, self.end, len(self.messages.keys()),
            escape_string(', '.join([x.encode('utf-8') for x in self.cores[2]])),
            escape_string(msg_string), ver, val)
        exec_mysql(q, self.mysql)
        self.redis.delete("event:{}".format(self.id))

    def restore(self, event_id):
        """
        Method restores the event from a MySQL table using the event_id parameter.

        Args:
            event_id (str): unique event identifier
        """
        q = '''SELECT dumps FROM events WHERE id="{}"'''.format(event_id)
        event_data = exec_mysql(q, self.mysql)[0][0]['dumps']
        self.unpack(event_data)

    def load(self, event_id, redis_prefix='event'):
        """
        Method for deserializing and loading the event from the Redis database.

        Args:
            event_id (str): unique event identifier
            redis_prefix (str): prefix used in the Redis database
        """
        try:
            event_data = self.redis.hget('{}:{}'.format(redis_prefix, event_id), 'dumps')
        except ResponseError:
            event_data = self.redis.get('{}:{}'.format(redis_prefix, event_id))
        self.unpack(event_data)

    def dump(self, redis_prefix='event'):
        """
        Method for serializing and dumping the event to the Redis database.

        Args:
            redis_prefix (str): prefix to use when storing a new key in the
                Redis database
        """
        if self.verification is None:
            ver = 'NULL'
        else:
            ver = int(self.verification)
        if self.validity is None:
            val = 'NULL'
        else:
            val = int(self.validity)
        msg_string = self.pack()
        event = {'start': self.start.strftime("%Y-%m-%d %H:%M:%S"),
                 'end': self.end.strftime("%Y-%m-%d %H:%M:%S"),
                 'msgs': len(self.messages.keys()),
                 'description': ', '.join([x.encode('utf-8') for x in self.cores[2]]),
                 'dumps': msg_string,
                 'verification': ver,
                 'validity': val}
        self.redis.hmset("{}:{}".format(redis_prefix, self.id), event)

    def pack(self, complete=False):
        """
        Method for serializing the event to a string.

        Args:
            complete (bool): whether to pack all available data for the event
                (full-text messages, media links, and cores).
        """
        todump = {
            'id': self.id,
            'created': int(mktime(self.created.timetuple())),
            'updated': int(mktime(self.updated.timetuple())),
            'verification': self.verification,
            'messages': [{'id': x['id'], 'is_core': x.get('is_core'),
                          'token_score': x.get('token_score'),
                          'weight': x.get('weight')} for x in self.messages.values()]
        }
        if complete:
            todump['media'] = self.media
            todump['validity'] = self.validity
            for i in range(len(todump['messages'])):
                msg = self.messages[todump['messages'][i]['id']]
                todump['messages'][i].update({'iscopy': msg['iscopy'],
                                              'lat': msg['lat'],
                                              'lng': msg['lng'],
                                              'network': msg['network'],
                                              'text': msg['text'],
                                              'tstamp': int(mktime(msg['tstamp'].timetuple())),
                                              'user': msg['user']})
        return packb(todump)

    def unpack(self, data, complete=False):
        """
        Method for deserializing the event from a string. The msgpack lib is
        used (considered to be faster than pickle).

        Args:
            data (str): msgpack dump of event-required parameters.
            complete (bool): whether to unpack all available data for the event
                (full-text messages, media links, and cores), or compute these
                parameters on the fly.
        """
        data = unpackb(data)
        self.id = data['id']
        self.created = datetime.fromtimestamp(data['created'])
        self.updated = datetime.fromtimestamp(data['updated'])
        self.verification = data['verification']
        self.messages = {x['id']: x for x in data['messages']}
        if complete:
            self.validity = data['validity']
            self.media = data['media']
            for k in self.messages.keys():
                self.messages[k]['tstamp'] = datetime.fromtimestamp(self.messages[k]['tstamp'])
        else:
            self.get_messages_data()
            self.media = {}
            self.get_media_data()
            self.event_update()

    def get_messages_data(self, ids=None):
        """
        Method loads MySQL data for message ids and adds it to the
        self.messages attribute.

        Args:
            ids (List[str]): list of message ids to load. If not provided,
                all ids from self.messages are used.
        """
        if not ids:
            ids = [x['id'] for x in self.messages.values()]
        q = '''SELECT * FROM tweets WHERE id in ({});'''.format(
            ','.join(['"' + str(x) + '"' for x in ids]))
        data = exec_mysql(q, self.mysql)[0]
        for item in data:
            self.messages[item['id']].update(item)

    def get_media_data(self, ids=None):
        """
        Method loads MySQL data for media using existing message ids and adds
        it to the self.media attribute.

        Args:
            ids (List[str]): list of message ids to load. If not provided,
                all ids from self.messages are used.
        """
        if not ids:
            ids = [x['id'] for x in self.messages.values()]
        q = '''SELECT * FROM media WHERE tweet_id in ({});'''.format(
            ','.join(['"' + str(x) + '"' for x in ids]))
        data = exec_mysql(q, self.mysql)[0]
        for item in data:
            self.media[item['id']] = item

    def event_summary_stats(self):
        """
        Method calculates several statistics and updates the self.start and
        self.end timestamps.
        """
        authorship_stats = [len(tuple(i[1])) for i in groupby(
            sorted(self.messages.values(), key=lambda x: x['user']),
            lambda z: z['user'])]
        self.authors = len(authorship_stats)
        self.most_active_author = max(authorship_stats)/float(len(self.messages.values()))
        self.authors_share = float(self.authors)/len(self.messages.values())
        self.entropy = entropy(authorship_stats)
        self.ppa = mean(authorship_stats)
        self.relevant_messages_share = float(len(
            [x for x in self.messages.values() if x['token_score'] > 0]))/len(self.messages.values())
        self.start = min([x['tstamp'] for x in self.messages.values()])
        self.end = max([x['tstamp'] for x in self.messages.values()])
        self.duration = int((self.end - self.start).total_seconds())

    def add_stem_texts(self):
        """
        Method adds token lists to self.messages.
        """
        for i in self.messages.keys():
            if 'tokens' not in self.messages[i].keys():
                txt = self.messages[i].get('text', '')
                txt = sub(self.url_re, '', txt)
                self.messages[i]['tokens'] = {
                    self.morph.parse(token.decode('utf-8'))[0].normal_form
                    for token in self.tokenizer.tokenize(txt)
                    if match(self.word, token.decode('utf-8'))}

    def create_core(self, deviation_threshold=2, min_token=3):
        """
        Method creates a core of important words for the event.

        Args:
            deviation_threshold (int): number of standard deviations that
                separates core tokens from average tokens
            min_token (int): minimal length of a token, to exclude
                prepositions/conjunctions
        """
        texts_by_authors = [
            set().union(*[msg['tokens'] for msg in list(y[1])])
            for y in groupby(sorted(self.messages.values(), key=lambda x: x['user']),
                             lambda z: z['user'])]
        top_words = {}
        for doc in texts_by_authors:
            for token in doc:
                if len(token) >= min_token:
                    try:
                        top_words[token] += 1
                    except KeyError:
                        top_words[token] = 1
        th_vals = [x[1] for x in top_words.items()]
        threshold = mean(th_vals) + deviation_threshold * std(th_vals)
        self.cores[deviation_threshold] = [k for k, v in top_words.items() if v > threshold]

    def score_messages_by_text(self, deviation_threshold=2):
        """
        Method calculates the token_score parameter for self.messages.

        Args:
            deviation_threshold (int): number of standard deviations that
                separates core tokens from average tokens
        """
        texts = [x['tokens'] for x in self.messages.values()]
        if not sum([bool(x) for x in texts]) or len(set([frozenset(x) for x in texts])) == 1:
            for k in self.messages.keys():
                self.messages[k]['token_score'] = 0
            return
        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        tfidf = TfidfModel(corpus, id2word=dictionary)
        index = MatrixSimilarity(tfidf[corpus])
        try:
            scores = index[dictionary.doc2bow(self.cores[deviation_threshold])]
        except IndexError:
            error('Index error in token scoring for event {}'.format(self.id))
            scores = [0]*len(self.messages.values())
        for i in range(len(scores)):
            # note: indexing .values() like this relies on Python 2 list semantics
            self.messages.values()[i]['token_score'] = float(scores[i])
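# A self-contained sketch of the core-selection rule used in create_core():
# a token enters the core when its author-level frequency exceeds
# mean + deviation_threshold * std over all token frequencies (the toy
# counts below are invented for illustration):
from numpy import mean, std

token_counts = {'пожар': 14, 'центр': 9, 'кошка': 1, 'погода': 2}
deviation_threshold = 1
th_vals = list(token_counts.values())
threshold = mean(th_vals) + deviation_threshold * std(th_vals)
core = [k for k, v in token_counts.items() if v > threshold]
print(core)  # ['пожар'] with these toy counts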
class PymorphyVectorizer(WordIndexVectorizer):
    """
    Transforms Russian words into a 0-1 vector of their possible Universal
    Dependencies tags. Tags are obtained using the Pymorphy analyzer
    (pymorphy2.readthedocs.io) and transformed to UD2.0 format using the
    russian-tagsets library (https://github.com/kmike/russian-tagsets).
    All UD2.0 tags that are compatible with the produced tags are memorized.
    The list of possible Universal Dependencies tags is read from a file
    that contains all the labels occurring in the UD2.0 SynTagRus dataset.

    Args:
        save_path: path to save the tags list,
        load_path: path to load the list of tags,
        max_pymorphy_variants: maximal number of pymorphy parses to be used.
            If -1, all parses are used.
    """
    USELESS_KEYS = ["Abbr"]
    VALUE_MAP = {"Ptan": "Plur", "Brev": "Short"}

    def __init__(self, save_path: str, load_path: str,
                 max_pymorphy_variants: int = -1, **kwargs) -> None:
        super().__init__(save_path, load_path, **kwargs)
        self.max_pymorphy_variants = max_pymorphy_variants
        self.load()
        self.memorized_word_indexes = dict()
        self.memorized_tag_indexes = dict()
        self.analyzer = MorphAnalyzer()
        self.converter = converters.converter('opencorpora-int', 'ud20')

    @property
    def dim(self):
        return len(self._t2i)

    def save(self) -> None:
        """Saves the dictionary to self.save_path"""
        with self.save_path.open("w", encoding="utf8") as fout:
            fout.write("\n".join(self._i2t))

    def load(self) -> None:
        """Loads the dictionary from self.load_path"""
        self._i2t = []
        with self.load_path.open("r", encoding="utf8") as fin:
            for line in fin:
                line = line.strip()
                if line == "":
                    continue
                self._i2t.append(line)
        self._t2i = {tag: i for i, tag in enumerate(self._i2t)}
        self._make_tag_trie()

    def _make_tag_trie(self):
        self._nodes = [defaultdict(dict)]
        self._start_nodes_for_pos = dict()
        self._data = [None]
        for tag, code in self._t2i.items():
            if "," in tag:
                pos, tag = tag.split(",", maxsplit=1)
                tag = sorted([tuple(elem.split("=")) for elem in tag.split("|")])
            else:
                pos, tag = tag, []
            start = self._start_nodes_for_pos.get(pos)
            if start is None:
                start = self._start_nodes_for_pos[pos] = len(self._nodes)
                self._nodes.append(defaultdict(dict))
                self._data.append(None)
            for key, value in tag:
                values_dict = self._nodes[start][key]
                child = values_dict.get(value)
                if child is None:
                    child = values_dict[value] = len(self._nodes)
                    self._nodes.append(defaultdict(dict))
                    self._data.append(None)
                start = child
            self._data[start] = code
        return self

    def find_compatible(self, tag: str) -> List[int]:
        """
        Transforms a Pymorphy tag to a list of indexes of compatible UD tags.

        Args:
            tag: input Pymorphy tag

        Returns:
            indexes of compatible UD tags
        """
        if " " in tag and "_" not in tag:
            pos, tag = tag.split(" ", maxsplit=1)
            tag = sorted([tuple(elem.split("=")) for elem in tag.split("|")])
        else:
            pos, tag = tag.split()[0], []
        if pos not in self._start_nodes_for_pos:
            return []
        tag = [(key, self.VALUE_MAP.get(value, value)) for key, value in tag
               if key not in self.USELESS_KEYS]
        if len(tag) > 0:
            curr_nodes = [(0, self._start_nodes_for_pos[pos])]
            final_nodes = []
        else:
            final_nodes = [self._start_nodes_for_pos[pos]]
            curr_nodes = []
        while len(curr_nodes) > 0:
            i, node_index = curr_nodes.pop()
            # key, value = tag[i]
            node = self._nodes[node_index]
            if len(node) == 0:
                final_nodes.append(node_index)
            for curr_key, curr_values_dict in node.items():
                curr_i, curr_node_index = i, node_index
                while curr_i < len(tag) and tag[curr_i][0] < curr_key:
                    curr_i += 1
                if curr_i == len(tag):
                    final_nodes.extend(curr_values_dict.values())
                    continue
                key, value = tag[curr_i]
                if curr_key < key:
                    for child in curr_values_dict.values():
                        curr_nodes.append((curr_i, child))
                else:
                    child = curr_values_dict.get(value)
                    if child is not None:
                        if curr_i < len(tag) - 1:
                            curr_nodes.append((curr_i + 1, child))
                        else:
                            final_nodes.append(child)
        answer = []
        while len(final_nodes) > 0:
            index = final_nodes.pop()
            if self._data[index] is not None:
                answer.append(self._data[index])
            for elem in self._nodes[index].values():
                final_nodes.extend(elem.values())
        return answer

    def _get_word_indexes(self, word):
        answer = self.memorized_word_indexes.get(word)
        if answer is None:
            parse = self.analyzer.parse(word)
            if self.max_pymorphy_variants > 0:
                parse = parse[:self.max_pymorphy_variants]
            tag_indexes = set()
            for elem in parse:
                tag_indexes.update(set(self._get_tag_indexes(elem.tag)))
            answer = self.memorized_word_indexes[word] = list(tag_indexes)
        return answer

    def _get_tag_indexes(self, pymorphy_tag):
        answer = self.memorized_tag_indexes.get(pymorphy_tag)
        if answer is None:
            tag = self.converter(str(pymorphy_tag))
            answer = self.memorized_tag_indexes[pymorphy_tag] = self.find_compatible(tag)
        return answer
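# A minimal sketch of the pymorphy -> UD2.0 tag conversion that
# _get_tag_indexes() relies on (assumes pymorphy2 and russian-tagsets are
# installed; the exact output string depends on library versions):
from pymorphy2 import MorphAnalyzer
from russian_tagsets import converters

morph = MorphAnalyzer()
to_ud20 = converters.converter('opencorpora-int', 'ud20')
pymorphy_tag = morph.parse('столами')[0].tag
print(to_ud20(str(pymorphy_tag)))  # e.g. 'NOUN Case=Ins|...|Number=Plur'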
import json
import os
import re

from pymorphy2 import MorphAnalyzer

m = MorphAnalyzer()
lemma = lambda word: m.parse(word)[0].normal_form


def to_json(cont):
    return json.dumps(cont, ensure_ascii=False, indent='\t')


text_all = ''
k = 0
for i in os.listdir('data/history/'):
    if i[-4:] == 'json':
        try:
            k += 1
            with open('data/history/{}'.format(i), 'r') as file:
                for j in file:
                    text = json.loads(j)['body'].strip()
                    if text:
                        text_all += text + '\n'
        except Exception:  # skip files with malformed JSON lines
            pass
print(k)
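# Sanity check for the `lemma` helper above (assuming pymorphy2's Russian
# dictionaries are installed):
# >>> lemma('столами')
# 'стол'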
class RNNMorphPredictor(Predictor):
    """
    RNN-based POS tagger.
    """
    def __init__(self, language="ru",
                 eval_model_config_path: str = None,
                 eval_model_weights_path: str = None,
                 gram_dict_input: str = None,
                 gram_dict_output: str = None,
                 word_vocabulary: str = None,
                 char_set_path: str = None,
                 build_config: str = None):
        if eval_model_config_path is None:
            eval_model_config_path = MODELS_PATHS[language]["eval_model_config"]
        if eval_model_weights_path is None:
            eval_model_weights_path = MODELS_PATHS[language]["eval_model_weights"]
        if gram_dict_input is None:
            gram_dict_input = MODELS_PATHS[language]["gram_input"]
        if gram_dict_output is None:
            gram_dict_output = MODELS_PATHS[language]["gram_output"]
        if word_vocabulary is None:
            word_vocabulary = MODELS_PATHS[language]["word_vocabulary"]
        if char_set_path is None:
            char_set_path = MODELS_PATHS[language]["char_set"]
        if build_config is None:
            build_config = MODELS_PATHS[language]["build_config"]

        self.language = language
        self.converter = converters.converter('opencorpora-int', 'ud14') if language == "ru" else None
        self.morph = MorphAnalyzer() if language == "ru" else None
        if self.language == "en":
            nltk.download("wordnet")
            nltk.download('averaged_perceptron_tagger')
            nltk.download('universal_tagset')

        self.build_config = BuildModelConfig()
        self.build_config.load(build_config)

        self.model = LSTMMorphoAnalysis(language=language)
        self.model.prepare(gram_dict_input, gram_dict_output, word_vocabulary, char_set_path)
        self.model.load_eval(self.build_config, eval_model_config_path, eval_model_weights_path)

    def predict(self, words: List[str], include_all_forms: bool = False) -> List[WordFormOut]:
        words_probabilities = self.model.predict_probabilities([words], 1, self.build_config)[0]
        return self.__get_sentence_forms(words, words_probabilities, include_all_forms)

    def predict_sentences(self, sentences: List[List[str]], batch_size: int = 64,
                          include_all_forms: bool = False) -> List[List[WordFormOut]]:
        sentences_probabilities = self.model.predict_probabilities(
            sentences, batch_size, self.build_config)
        answers = []
        for words, words_probabilities in zip(sentences, sentences_probabilities):
            answers.append(self.__get_sentence_forms(words, words_probabilities, include_all_forms))
        return answers

    def __get_sentence_forms(self, words: List[str], words_probabilities: List[List[float]],
                             include_all_forms: bool) -> List[WordFormOut]:
        """
        Get tags and forms.

        :param words: words.
        :param words_probabilities: tag probabilities for the words.
        :param include_all_forms: flag that includes all parse variants.
        :return: probabilities and forms for all word variants.
        """
        result = []
        for word, word_prob in zip(words, words_probabilities[-len(words):]):
            result.append(self.__compose_out_form(word, word_prob[1:], include_all_forms))
        return result

    def __compose_out_form(self, word: str, probabilities: List[float],
                           include_all_forms: bool) -> WordFormOut:
        """
        Build a form from the tag index in the vectorizer and the word.

        :param word: word.
        :param probabilities: probabilities of the different forms.
        :param include_all_forms: flag that includes all parse variants.
        :return: form.
        """
        word_forms = None
        if self.language == "ru":
            word_forms = self.morph.parse(word)
        vectorizer = self.model.grammeme_vectorizer_output
        tag_num = int(np.argmax(probabilities))
        score = probabilities[tag_num]
        full_tag = vectorizer.get_name_by_index(tag_num)
        pos, tag = full_tag.split("#")[0], full_tag.split("#")[1]
        lemma = self.__get_lemma(word, pos, tag, word_forms)
        vector = np.array(vectorizer.get_vector(full_tag))
        result_form = WordFormOut(word=word, normal_form=lemma, pos=pos, tag=tag,
                                  vector=vector, score=score)
        if include_all_forms:
            weighted_vector = np.zeros_like(vector, dtype='float64')
            for tag_num, prob in enumerate(probabilities):
                full_tag = vectorizer.get_name_by_index(tag_num)
                pos, tag = full_tag.split("#")[0], full_tag.split("#")[1]
                lemma = self.__get_lemma(word, pos, tag, word_forms)
                vector = np.array(vectorizer.get_vector(full_tag), dtype='float64')
                weighted_vector += vector * prob
                form = WordFormOut(word=word, normal_form=lemma, pos=pos, tag=tag,
                                   vector=vector, score=prob)
                result_form.possible_forms.append(form)
            result_form.weighted_vector = weighted_vector
        return result_form

    def __get_lemma(self, word: str, pos_tag: str, gram: str, word_forms=None,
                    enable_normalization: bool = True):
        """
        Get the lemma.

        :param word: word.
        :param pos_tag: part of speech.
        :param gram: grammatical value.
        :param enable_normalization: whether to normalize as in the GICR corpus.
        :return: lemma.
        """
        if '_' in word:
            return word
        if self.language == "ru":
            if word_forms is None:
                word_forms = self.morph.parse(word)
            guess = ""
            max_common_tags = 0
            for word_form in word_forms:
                word_form_pos_tag, word_form_gram = convert_from_opencorpora_tag(
                    self.converter, word_form.tag, word)
                word_form_gram = process_gram_tag(word_form_gram)
                common_tags_len = len(set(word_form_gram.split("|")).intersection(set(gram.split("|"))))
                if common_tags_len > max_common_tags and word_form_pos_tag == pos_tag:
                    max_common_tags = common_tags_len
                    guess = word_form
            if guess == "":
                guess = word_forms[0]
            if enable_normalization:
                lemma = self.__normalize_for_gikrya(guess)
            else:
                lemma = guess.normal_form
            return lemma
        elif self.language == "en":
            lemmatizer = nltk.stem.WordNetLemmatizer()
            pos_map = defaultdict(lambda: 'n')
            pos_map.update({'ADJ': 'a', 'ADV': 'r', 'NOUN': 'n', 'VERB': 'v'})
            return lemmatizer.lemmatize(word, pos=pos_map[pos_tag])
        else:
            assert False

    @staticmethod
    def __normalize_for_gikrya(form):
        """
        Derive the lemma by rules as close as possible to those used in the
        GICR corpus.

        :param form: form from pymorphy2.
        :return: lemma.
        """
        if form.tag.POS == 'NPRO':
            if form.normal_form == 'она':
                return 'он'
            if form.normal_form == 'они':
                return 'он'
            if form.normal_form == 'оно':
                return 'он'
        if form.word == 'об':
            return 'об'
        if form.word == 'тот':
            return 'то'
        if form.word == 'со':
            return 'со'
        if form.tag.POS in {'PRTS', 'PRTF'}:
            return form.inflect({'PRTF', 'sing', 'masc', 'nomn'}).word
        return form.normal_form
class Parser:
    def __init__(self):
        self.url = 'https://ru.wiktionary.org/wiki/'
        self.headers = {
            'accept': '*/*',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
        }
        self.session = requests.Session()
        self.morph = MorphAnalyzer()
        text = open('olds.json', 'r').read()
        self.olds = json.loads(text)

    def find(self, content):
        variants = self.morph.parse(content)
        word = variants[0].normal_form
        request = self.session.get(self.url + word.lower())
        soup = bs(request.text, 'html.parser')
        response = soup.find('ol').find('li')
        if response is not None:
            response_dict = {'text': response.text, 'title': content}
            response_dict['text'] = response_dict['text'].replace('\n', 'dp-trans')
            return json.dumps(response_dict)
        else:
            response_dict = {
                'text': "Информация не найдена(",  # "No information found("
                'title': content
            }
            return json.dumps(response_dict)

    def parse_current_page(self, url):
        word_dict = {}
        request = self.session.get(url, headers=self.headers)
        soup = bs(request.text, 'html.parser')
        poems = soup.find_all(attrs={"class": 'dpast__content'})
        dialects = {'words': []}

        def superrost(words):
            # keep only Cyrillic letters and spaces
            return ''.join(
                filter(
                    lambda x: ord(x) in range(ord('а'), ord('я') + 1)
                    or ord(x) in range(ord('А'), ord('Я') + 1) or x == ' ',
                    list(words.replace('\\n', ' '))))

        tokenizer = TweetTokenizer()
        analyzer = MorphAnalyzer()

        def preprocess(text):
            w = text.lower().split()
            filtered_words = [
                word for word in w
                if word not in stopwords.words('russian')
            ]
            words = tokenizer.tokenize(' '.join(filtered_words))
            for i in range(len(words)):
                k = analyzer.parse(words[i])[0].normal_form
                word_dict[k] = words[i]
                words[i] = k
            return ' '.join(words)

        for poem in poems:
            poem = poem.text
            poem = preprocess(superrost(poem)).split()
            for w in poem:
                if w in self.olds:
                    dialects['words'].append(word_dict[w])
        return json.dumps(dialects)

    def parse_current_page_chrome(self, url):
        request = self.session.get(url, headers=self.headers)
        soup = bs(request.text, 'html.parser')
        [s.extract() for s in soup('script')]
        text = soup.text
        word_dict = {}
        dialects = {'words': []}

        def superrost(words):
            return ''.join(
                filter(
                    lambda x: ord(x) in range(ord('а'), ord('я') + 1)
                    or ord(x) in range(ord('А'), ord('Я') + 1) or x == ' ',
                    list(words.replace('\\n', ' '))))

        tokenizer = TweetTokenizer()
        analyzer = MorphAnalyzer()

        def preprocess(text):
            w = text.lower().split()
            filtered_words = [
                word for word in w
                if word not in stopwords.words('russian')
            ]
            words = tokenizer.tokenize(' '.join(filtered_words))
            for i in range(len(words)):
                k = analyzer.parse(words[i])[0].normal_form
                word_dict[k] = words[i]
                words[i] = k
            return ' '.join(words)

        poem = text
        poem = preprocess(superrost(poem)).split()
        for w in poem:
            if w in self.olds:
                dialects['words'].append(word_dict[w])
        dialects['words'] = list(set(dialects['words']))
        return json.dumps(dialects)
articles = os.listdir('./articles')
russian_stopwords = set(stopwords.words('russian'))  # hoisted out of the loop

for article in articles:
    if article.endswith('.txt'):
        with open('./articles/' + article, 'r', encoding='utf-8-sig') as f:
            all_text = f.read()
        link = extract_link.search(all_text).group(1)
        title = extract_title.search(all_text).group(1)
        text = extract_text.search(all_text)
        if text is not None:
            text = text.group(1)
            words = [
                x.lower().strip(string.punctuation + '»«–…')
                for x in word_tokenize(text)
            ]
            lemmas = [
                m.parse(x)[0].normal_form for x in words
                if x and x not in russian_stopwords
            ]
            collection[article] = lemmas
            article_info[article] = (link, title, len(lemmas))
            avdl += len(lemmas)

inverted_index = inv_index(collection)
avdl = avdl / len(collection)

with open('inverted_index.json', 'w', encoding='utf-8-sig') as f:
    s = json.dumps(inverted_index, ensure_ascii=False)
    f.write(s)

with open('article_info.json', 'w', encoding='utf-8-sig') as f:
    s = json.dumps(article_info, ensure_ascii=False, indent=2)
    f.write(s)  # added: the dump was built but never written
def process(inpt_dir, otpt_dir, gold):
    # Create the output directory in case it does not exist
    os.makedirs(otpt_dir, exist_ok=True)
    # If the input directory does not exist, an exception is raised here
    os.chdir(inpt_dir)
    # If the directory exists, everything is fine - the program starts working
    print('Please wait. Python is processing your data...')
    morph = MorphAnalyzer()
    files = glob.glob('*.txt')
    gold_file = open(gold, mode='r', encoding='utf-8', newline='')
    # Process the text files one by one
    for file in files:
        f = open(file, mode='r', encoding='windows-1251')
        lines = f.readlines()
        root = etree.Element('text')
        # Dictionary for statistics
        stat = {
            'breaks on start': 0,
            'breaks on end': 0,
            'regular breaks': 0,
            'fallbacks': 0,
            'terminal <pc>\'s': 0
        }
        # List for fallbacks
        log_list = []
        for i, line in enumerate(lines):
            # List of tokens
            line_tokens = nltk.word_tokenize(line)
            # List of ordered dicts of the form {parse: lemma}
            line_parses = format_parse_list(
                [morph.parse(token) for token in line_tokens])
            p = etree.SubElement(root, 'p')
            p.set('n', str(i + 1))
            prev_ana = ''
            for j, ana in enumerate(line_parses):
                gold_file.seek(0)
                gold_reader = csv.reader(gold_file, delimiter=';')
                # parses = all parse variants of a single word
                parses = list(ana.keys())
                check = False
                if parses[0].startswith('PM'):
                    elem = etree.SubElement(p, 'pc')
                else:
                    elem = etree.SubElement(p, 'w')
                elem.text = line_tokens[j]
                for row in gold_reader:
                    # Cut off trigrams with frequency < 4
                    if row[3] == '3':
                        break
                    # If the current element is unambiguously a terminal punctuation
                    # mark, searching for a trigram with it is pointless
                    if parses[0] == 'PM,Tr,_':
                        elem.set('ana', 'PM,Tr,_')
                        elem.set('lemma', ana['PM,Tr,_'])
                        prev_ana = 'PM,Tr,_'
                        stat['terminal <pc>\'s'] += 1
                        check = True
                    else:
                        # At the absolute start of a sentence/chunk, consider left bigrams
                        if j == 0 or prev_ana == 'PM,Tr,_':
                            # Fall back to pymorphy2 if the current element is
                            # the last in the sentence
                            if j + 1 == len(line_parses):
                                break
                            else:
                                if row[0] in parses and row[1] in line_parses[j + 1]:
                                    elem.set('ana', row[0])
                                    elem.set('lemma', ana[row[0]])
                                    prev_ana = row[0]
                                    stat['breaks on start'] += 1
                                    check = True
                        # If the current element is the last in the sentence,
                        # consider right bigrams
                        elif j + 1 == len(line_parses):
                            if prev_ana == row[1] and row[2] in parses:
                                elem.set('ana', row[2])
                                elem.set('lemma', ana[row[2]])
                                prev_ana = row[2]
                                stat['breaks on end'] += 1
                                check = True
                        # In all other cases, consider full trigrams
                        else:
                            if row[0] == prev_ana and row[1] in parses and row[2] in line_parses[j + 1]:
                                elem.set('ana', row[1])
                                elem.set('lemma', ana[row[1]])
                                prev_ana = row[1]
                                stat['regular breaks'] += 1
                                check = True
                    if check:
                        break
                # Fallback if no suitable trigram was found in the gold standard
                if not check:
                    elem.set('ana', parses[0])
                    elem.set('lemma', ana[parses[0]])
                    prev_ana = parses[0]
                    # Record the trigrams on which the fallback happened
                    if j == 0 and len(line_tokens) == 1:
                        log_data = '{\n%s: %s,\n};\n' % (
                            str(line_tokens[j]), str(parses))
                    elif j == 0:
                        log_data = '{\n%s: %s,\n%s: %s,\n};\n' % (
                            str(line_tokens[j]), str(parses),
                            str(line_tokens[j + 1]), str(list(line_parses[j + 1].keys())))
                    elif j + 1 == len(line_parses):
                        log_data = '{\n%s: %s,\n%s: %s,\n};\n' % (
                            str(line_tokens[j - 1]), str(prev_ana),
                            str(line_tokens[j]), str(parses))
                    else:
                        log_data = '{\n%s: %s,\n%s: %s,\n%s: %s,\n};\n' % (
                            str(line_tokens[j - 1]), str(prev_ana),
                            str(line_tokens[j]), str(parses),
                            str(line_tokens[j + 1]), str(list(line_parses[j + 1].keys())))
                    log_list.append(log_data)
                    stat['fallbacks'] += 1
        # Step into the output directory
        os.chdir(otpt_dir)
        # Write the XML
        with open(file[:-3] + 'xml', mode='w', encoding='utf-8') as out:
            xml = etree.tostring(root, method='xml', encoding='utf-8')
            pretty = parseString(xml).toprettyxml(indent='  ', encoding='utf-8')
            out.write(pretty.decode())
        # Write the fallbacks to the log file
        with open(file[:-4] + '_log_trg.txt', mode='w', encoding='utf-8') as log:
            for line in log_list:
                log.write(str(line) + '\n')
        # Print per-file statistics
        print(file)
        for key in stat:
            print('  %d %s' % (stat[key], key))
        # Return to the input directory - to the files in the queue
        os.chdir(inpt_dir)
        f.close()
    gold_file.close()
class RNNMorphPredictor(Predictor):
    """
    RNN-based POS tagger.
    """
    def __init__(self,
                 model_config_path: str = RU_MORPH_DEFAULT_MODEL_CONFIG,
                 model_weights_path: str = RU_MORPH_DEFAULT_MODEL_WEIGHTS,
                 gramm_dict_input: str = RU_MORPH_GRAMMEMES_DICT,
                 gramm_dict_output: str = RU_MORPH_GRAMMEMES_DICT_OUTPUT):
        self.model = LSTMMorphoAnalysis()
        self.model.prepare(gramm_dict_input, gramm_dict_output)
        self.model.load(model_config_path, model_weights_path)
        self.morph = MorphAnalyzer()

    def predict_sentence_tags(self, words: List[str]) -> List[WordFormOut]:
        tags = self.model.predict([words], batch_size=1)[0]
        return [
            self.__compose_out_form(tag_num, word)
            for tag_num, word in zip(tags, words)
        ]

    def predict_sentences_tags(self, sentences: List[List[str]],
                               batch_size: int = 64) -> List[List[WordFormOut]]:
        sentences_tags = self.model.predict(sentences, batch_size)
        answers = []
        for tags, words in zip(sentences_tags, sentences):
            answers.append([
                self.__compose_out_form(tag_num, word)
                for tag_num, word in zip(tags, words)
            ])
        return answers

    def predict_sentence_tags_proba(
            self, words: List[str]) -> List[List[Tuple[float, WordFormOut]]]:
        words_probabilities = self.model.predict_proba([words], batch_size=1)[0]
        return self.__get_sentence_forms_probs(words, words_probabilities)

    def predict_sentences_tags_proba(
            self, sentences: List[List[str]],
            batch_size: int = 64) -> List[List[List[Tuple[float, WordFormOut]]]]:
        result = []
        sentences_probabilities = self.model.predict_proba(sentences, batch_size)
        for sentence, words_probabilities in zip(sentences, sentences_probabilities):
            result.append(self.__get_sentence_forms_probs(sentence, words_probabilities))
        return result

    def __get_sentence_forms_probs(self, words: List[str],
                                   words_probabilities: List[List[float]]) -> \
            List[List[Tuple[float, WordFormOut]]]:
        """
        Get tags and forms.

        :param words: words.
        :param words_probabilities: tag probabilities for the words.
        :return: probabilities and forms for all word variants.
        """
        result = []
        for word, word_prob in zip(words, words_probabilities[-len(words):]):
            word_prob = word_prob[1:]
            word_forms = [(grammeme_prob, self.__compose_out_form(tag_num, word))
                          for tag_num, grammeme_prob in enumerate(word_prob)]
            result.append(word_forms)
        return result

    def __compose_out_form(self, tag_num: int, word: str) -> WordFormOut:
        """
        Build a form from the tag index in the vectorizer and the word.

        :param tag_num: tag index.
        :param word: word.
        :return: form.
        """
        vectorizer = self.model.grammeme_vectorizer_output
        tag = vectorizer.get_name_by_index(tag_num)
        pos_tag = tag.split("#")[0]
        gram = tag.split("#")[1]
        lemma = self.__get_lemma(word, pos_tag, gram)
        return WordForm(lemma=lemma, gram_vector_index=tag_num,
                        text=word).get_out_form(vectorizer)

    def __get_lemma(self, word: str, pos_tag: str, gram: str,
                    enable_gikrya_normalization: bool = True):
        """
        Get the lemma.

        :param word: word.
        :param pos_tag: part of speech.
        :param gram: grammatical value.
        :param enable_gikrya_normalization: whether to normalize as in the GICR corpus.
        :return: lemma.
        """
        if '_' in word:
            return word
        to_ud = converters.converter('opencorpora-int', 'ud14')
        guess = ""
        max_common_tags = 0
        for word_form in self.morph.parse(word):
            word_form_pos_tag, word_form_gram = convert_from_opencorpora_tag(
                to_ud, word_form.tag, word)
            word_form_gram = process_gram_tag(word_form_gram)
            common_tags_len = len(
                set(word_form_gram.split("|")).intersection(set(gram.split("|"))))
            if common_tags_len > max_common_tags and word_form_pos_tag == pos_tag:
                max_common_tags = common_tags_len
                guess = word_form
        if guess == "":
            guess = self.morph.parse(word)[0]
        if enable_gikrya_normalization:
            lemma = self.__normalize_for_gikrya(guess)
        else:
            lemma = guess.normal_form
        return lemma

    @staticmethod
    def __normalize_for_gikrya(form):
        """
        Derive the lemma by rules as close as possible to those used in the
        GICR corpus.

        :param form: form from pymorphy2.
        :return: lemma.
        """
        if form.tag.POS == 'NPRO':
            if form.normal_form == 'она':
                return 'он'
            if form.normal_form == 'они':
                return 'он'
            if form.normal_form == 'оно':
                return 'он'
        if form.word == 'об':
            return 'об'
        if form.word == 'тот':
            return 'то'
        if form.word == 'со':
            return 'со'
        if form.tag.POS in {'PRTS', 'PRTF'}:
            return form.inflect({'PRTF', 'sing', 'masc', 'nomn'}).word
        return form.normal_form
        nonnum += 1
        alpha_postings += cnt
        lo = word.lower()
        if lo in low_reg:
            low_reg[lo] += cnt
        else:
            low_reg[lo] = cnt

just_ru = {k: v for (k, v) in low_reg.items() if match(u"^[\u0400-\u0500]+$", k)}
ru_postings = sum(just_ru.values())

morph = MorphAnalyzer()
c = 0
for k, v in just_ru.items():
    if c % 100000 == 0:
        print(c)
    c += 1
    lem = morph.parse(k)[0].normal_form
    if lem in lemmatized:
        lemmatized[lem] += int(v)
    else:
        lemmatized[lem] = int(v)

with open("stopwords", "r") as st:
    stops = set(st.read().split('\n'))

for k, v in just_ru.items():
    if k not in stops:
        no_stops_postings += v

print("Raw dictionary size = {0}\n"
      "Without numbers = {1}\n"
      "Lowered = {2}\n"
      "Just russian = {3}\n".format(all, nonnum, len(low_reg), len(just_ru)))
print("Lemmatized = {0}\n\n".format(len(lemmatized)))
print("All postings = {0}\n"
    'C:\\Users\\iburmistrov\\Documents\\Texts_Analyze\\url_list.txt')
]

for url in urls:
    try:
        response = requests.get(url, timeout=None)
    except Exception:
        continue
    soup = BeautifulSoup(response.content, "html.parser")
    for pp in soup.select("p"):
        print(pp.text)

# Reduce all words to their base form, configure CountVectorizer to count
# N-grams of 2 to 4 words, and write the result to a file.
cvn = CountVectorizer(ngram_range=(2, 4), stop_words=stop_words)
words_nf = [
    ' '.join([m.parse(word)[0].normal_form for word in x.split()])
    for x in texts
]
ngrams = cvn.fit_transform(words_nf)
vb = cvn.vocabulary_
count_values = ngrams.toarray().sum(axis=0)
for ng_count, ng_text in sorted([(count_values[i], k) for k, i in vb.items()],
                                reverse=True):
    print(ng_text, ng_count, sep='\t')

# All that remains is to write all the results into one Excel file:
info_data = pd.read_csv('C:\\Users\\evgen\\Documents\\result.txt',
                        encoding='utf-8', sep="\t", header=None)
    new_file.write(elem_clear)

with open('not_spam_file.txt', encoding='utf8') as a_src:
    text = a_src.read()

morph = MorphAnalyzer()
not_spam_m = 0   # number of words in the training sample
not_spam_nk = 0  # number of words in the training sample without stop words
stops = stopwords.words("russian")
articles_texts = []
articles_texts.append(text)
articles_preprocessed = []
for a_text in articles_texts:
    a_tokens = wordpunct_tokenize(a_text)
    a_lemmatized = ' '.join(
        [morph.parse(item)[0].normal_form for item in a_tokens])
    articles_preprocessed.append(a_lemmatized)
    for token in a_tokens:
        p = morph.parse(token)[0]
        if p.tag.POS:
            not_spam_m += 1
        if p.tag.POS and token not in stops:
            not_spam_nk += 1

tfidf = TfidfVectorizer(analyzer="word", stop_words=stops)
articles_tfidf = tfidf.fit_transform(articles_preprocessed)
feature_names = np.array(tfidf.get_feature_names())
not_spam_triggers = {}
class Extractor:
    def __init__(self):
        self.word2vec = None
        self.morph = MorphAnalyzer()

    @lru_cache(20000)
    def _morph_parse(self, word):
        return self.morph.parse(word)

    def _tokenize(self, text):
        tokens = word_tokenize(text.lower())
        result = []
        for token in tokens:
            morph = self._morph_parse(token)
            if len(morph) > 0:
                if morph[0].tag.POS is not None:
                    result.append(morph[0])
        return ["{0}_{1}".format(morph.word, morph.tag.POS) for morph in result]

    def fit(self, texts, word2vec_params):
        converted_texts = [self._tokenize(text) for text in texts]
        self.word2vec = Word2Vec(converted_texts, **word2vec_params)

    def _tfidf_order_features(self, tfidf, matrix):
        mean_features = np.asarray(matrix.mean(axis=0))[0]
        ordered_features = mean_features.argsort()[::-1]
        feature_names = tfidf.get_feature_names()
        result = []
        for feature in ordered_features:
            result.append(feature_names[feature])
        return np.array(result)

    def _tfidf_feature_filter(self, features):
        ignorance_filter = lambda text: bool(re.match(".*prep", text)) or \
            bool(re.match(".*infn", text)) or \
            bool(re.match(".*verb", text))
        feature_filter = lambda text: bool(re.match(".*adjf .*noun", text)) \
            and not ignorance_filter(text)
        return [feature for feature in features if feature_filter(feature)]

    def _document_distance(self, doc1, doc2):
        doc1vec = np.array([np.zeros([self.word2vec.vector_size])] + [
            self.word2vec[token] for token in doc1.split(" ")
            if token in self.word2vec
        ]).sum(axis=0)
        doc2vec = np.array([np.zeros([self.word2vec.vector_size])] + [
            self.word2vec[token] for token in doc2.split(" ")
            if token in self.word2vec
        ]).sum(axis=0)
        return cosine(doc1vec, doc2vec)

    def _top_features(self, converted_texts, ngram_min, ngram_max, top_tfidf_features):
        features = []
        for size in range(ngram_min, ngram_max + 1):
            # fixed: the loop variable was unused, so every iteration built
            # the same (ngram_min, ngram_max) vectorizer
            tfidf = TfidfVectorizer(ngram_range=(size, size))
            tfidf_transformed_texts = tfidf.fit_transform(converted_texts)
            tfidf_features = self._tfidf_order_features(tfidf, tfidf_transformed_texts)
            top_features = self._tfidf_feature_filter(tfidf_features)[:top_tfidf_features]
            features += top_features
        features = list(set(features))
        features.sort()
        return features

    def _feature_distances(self, features):
        distances = np.zeros([len(features), len(features)])
        for i, feature1 in enumerate(features):
            for j, feature2 in enumerate(features):
                features_distance = self._document_distance(feature1, feature2)
                distances[i, j] = features_distance
                distances[j, i] = features_distance
        return distances

    def _cluster_features(self, features, distances):
        dbscan = DBSCAN(0.2, min_samples=1, metric="precomputed")
        clusters = dbscan.fit_predict(distances)
        items = {}
        for cluster, feature in zip(clusters, features):
            items[cluster] = items.get(cluster, []) + [feature]
        return items

    def _choose_features(self, features, distances):
        chosen_ngrams = []
        for key, values in self._cluster_features(features, distances).items():
            values_indices = np.array([features.index(val) for val in values])
            if len(values) < 2:
                continue
            values_distances = distances[values_indices, :][:, values_indices]
            index_mean_distances = np.zeros([len(values_indices)])
            for i in range(0, len(values_indices)):
                index_mean_distances[i] = np.delete(values_distances[i], i, axis=0).mean()
            chosen_ngram = values[index_mean_distances.argmin()]
            chosen_ngrams.append(chosen_ngram)
        return chosen_ngrams

    def _apply_rules(self, rules, text):
        if isinstance(text, list):
            return [self._apply_rules(rules, item) for item in text]
        for rule in rules:
            text = rule(text)
        return text

    def _norm(self, text):
        tokens = wordpunct_tokenize(text)
        result = []
        for token in tokens:
            parse = self.morph.parse(token)
            if len(parse) > 0 and parse[0].tag.POS:
                inflect = parse[0].inflect({"nomn"})
                if inflect:
                    result.append(inflect.word)
                else:
                    result.append(token)
            else:
                result.append(token)
        return " ".join(result).replace(" ,", ",")

    def transform(self, texts, ngram_min, ngram_max, top_tfidf_features):
        _texts = []
        for text in texts:
            add_texts = re.split("[,.]|([ ]+и[ ]+)", text)
            _texts += [item for item in add_texts if item is not None]
        texts = _texts
        converted_texts = [" ".join(self._tokenize(text)) for text in texts]
        features = self._top_features(converted_texts, ngram_min, ngram_max,
                                      top_tfidf_features)
        if len(features) == 0:
            return []
        distances = self._feature_distances(features)
        chosen_features = self._choose_features(features, distances)
        rules = [
            lambda text: re.sub("_adjf+ (\w+)_intj", ", \g<1>", text),
            lambda text: re.sub("_noun+ (\w+)_adjf", ", \g<1>", text),
            lambda text: re.sub("^\w+_conj", "", text),
            lambda text: re.sub("\w+_conj$", "", text),
            lambda text: re.sub("^\w+_pred", "", text),
            lambda text: re.sub("\w+_pred$", "", text),
            lambda text: re.sub("^\w+_precl", "", text),
            lambda text: re.sub("\w+_precl$", "", text),
            lambda text: re.sub("_[a-z]+", "", text),
            lambda text: text.strip(),
            lambda text: self._norm(text)
        ]
        return self._apply_rules(rules, chosen_features)
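# A minimal usage sketch for Extractor (assumptions: gensim < 4.0, where
# Word2Vec takes `size` and supports `model[token]` lookups as used above;
# NLTK punkt data downloaded; `corpus_texts` is a placeholder for a real
# list of Russian strings):
extractor = Extractor()
extractor.fit(corpus_texts, {'size': 50, 'min_count': 1})
print(extractor.transform(corpus_texts, ngram_min=2, ngram_max=2,
                          top_tfidf_features=10))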
# pprint(messages)
ngrams = []
n, m = 0, 0
t = int(time())
l = len(messages)
for message in messages:
    if message == "<|BEGIN|>":
        ngram = []
    elif message == "<|END|>":
        phrases = []
        for phrase in ngram:
            terms = set(te(phrase, strings=1, nested=1))
            words = list(
                set([ma.parse(w)[0].normal_form for w in wpt.tokenize(phrase)]))
            idx = []
            for word in words:
                w = 1 if word in terms else .5
                idx += [(w, word)]
            phrases += [(idx, phrase)]
        ngrams += [phrases]
    else:
        ngram += [message]
    n += 1
    if time() - t > 1:
        print("%s of %s, %s / sec" % (m, l, n))
        m += n
        n = 0
        t = int(time())
class SearchEngine:
    """ Search engine """

    def __init__(self):
        self.morph = MorphAnalyzer()
        self.inverted_index_dict = {
            'phrases_index': {},
            'counter_dict': {},
            'categories': []
        }
        if os.path.isfile('inverted_index.pickle'):
            with open('inverted_index.pickle', 'rb') as handle:
                self.inverted_index_dict = pickle.load(handle)
        else:
            with open('docs.json', encoding='utf8') as f:
                self.docs = json.load(f)
            self.build_inverted_index(self.docs)
            with open('inverted_index.pickle', 'wb') as handle:
                pickle.dump(self.inverted_index_dict, handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
            del self.docs

    async def convert_sentence(self, sentence):
        """ Converts each word to its normal form and returns them as a list. """
        sentence = re.sub(r'\W|\d', ' ', sentence)
        tokens = sentence.lower().split()
        result = [self.morph.parse(word)[0].normal_form for word in tokens]
        return result

    def build_inverted_index(self, docs):
        """ Initializes creation of the inverted index and category list. """
        for document_idx, doc in enumerate(docs):
            self.build_dictionary(document_idx, doc, 'phrases',
                                  self.inverted_index_dict['phrases_index'])
        self.inverted_index_dict['categories'] = [
            i['category'] for i in self.docs
        ]

    def build_dictionary(self, doc_idx, doc, section, dictionary):
        """ Builds an inverted index dictionary. """
        if section in doc:
            for sentence_ind, sentence in enumerate(doc[section]):
                splitted_sentence = self.tokenize(sentence)
                if doc_idx in self.inverted_index_dict['counter_dict']:
                    self.inverted_index_dict['counter_dict'][doc_idx] += (
                        len(splitted_sentence), )
                else:
                    self.inverted_index_dict['counter_dict'][doc_idx] = (
                        len(splitted_sentence), )
                for word in splitted_sentence:
                    if word not in dictionary:
                        dictionary[word] = {}
                    if doc_idx in dictionary[word]:
                        dictionary[word][doc_idx] += (sentence_ind, )
                    else:
                        dictionary[word][doc_idx] = (sentence_ind, )

    async def get_categories(self, sentence):
        """ Coroutine gets the sentence and returns json with the list of categories. """
        result = {'categories': ()}
        _categories = ()
        _links = []
        # calling coroutine, bringing each word to normal form.
        converted_words = await self.convert_sentence(sentence)
        for word in converted_words:
            if word in self.inverted_index_dict['phrases_index']:
                # tuple with category_ids in index for word
                _categories += tuple(
                    self.inverted_index_dict['phrases_index'][word].keys())
                # dicts with categories_id and phrases_id in index for word
                _links.append(self.inverted_index_dict['phrases_index'][word])
        for category in set(_categories):
            # count the number of words for each of the categories
            _word_ids = sum([x[category] for x in _links if x.get(category)], ())
            for i in set(_word_ids):
                real_count = _word_ids.count(i)
                # If the word count in the phrase matches the actual number,
                # the phrase matches the search query.
                if self.inverted_index_dict['counter_dict'][category][i] == real_count:
                    result['categories'] += (
                        self.inverted_index_dict['categories'][category], )
        return result

    def tokenize(self, sentence):
        sentence = re.sub(r'\W|\d', ' ', sentence)
        tokens = sentence.lower().split()
        result = [self.morph.parse(word)[0].normal_form for word in tokens]
        return result
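# A minimal usage sketch for SearchEngine (assumption: a docs.json file
# shaped like [{'category': ..., 'phrases': [...]}, ...] exists, since
# __init__ builds the index from it on first run):
import asyncio

engine = SearchEngine()
result = asyncio.get_event_loop().run_until_complete(
    engine.get_categories('пример запроса'))
print(result)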
from pymorphy2 import MorphAnalyzer

# Create the analyzer object
morph = MorphAnalyzer()

# Example word (a surname)
word = 'струбинов'

lst_case = ['nomn', 'gent', 'datv', 'accs', 'ablt', 'loct']

# Analyze the word; inflect only the parses tagged as masculine surnames
word_parsed = morph.parse(word)
for par in word_parsed:
    if {'masc', 'Surn'} in par.tag:
        print(par.inflect({'gent'}))
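# A natural extension of the same idea: run the surname through every case in
# lst_case. This uses only the pymorphy2 API shown above; the word itself is
# just an example:
for par in morph.parse(word):
    if {'masc', 'Surn'} in par.tag:
        for case in lst_case:
            form = par.inflect({case})
            if form is not None:  # inflect() returns None when the form is missing
                print(case, form.word)
        break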
class Substs_loader:
    def __init__(self, data_name, lemmatizing_method, max_examples=None,
                 delete_word_parts=False, drop_duplicates=True,
                 count_lemmas_weights=False, limit=None):
        self.data_name = data_name
        self.lemmatizing_method = lemmatizing_method
        self.max_examples = max_examples
        self.delete_word_parts = delete_word_parts
        self.drop_duplicates = drop_duplicates
        self.count_lemmas_weights = count_lemmas_weights
        self.translation = str.maketrans('', '', string.punctuation)
        self.dfs = dict()
        self.nf_cnts = dict()
        self.cache = dict()

        if lemmatizing_method is not None and lemmatizing_method != 'none':
            if 'ru' in data_name:
                self.analyzer = MorphAnalyzer()
            elif 'german' in data_name:
                self.analyzer = spacy.load("de_core_news_sm", disable=['ner', 'parser'])
            elif 'english' in data_name:
                self.analyzer = spacy.load("en_core_web_sm", disable=['ner', 'parser'])
            else:
                # the original `assert "unknown data name %s" % data_name` asserted
                # a non-empty string and so never fired; make it actually fail
                assert False, "unknown data name %s" % data_name

    def get_nf_cnt(self, substs_probs):
        nf_cnt = Counter(nf for l in substs_probs for p, s in l
                         for nf in self.analyze_russian_word(s))
        return nf_cnt

    def analyze_russian_word(self, word, nf_cnt=None):
        word = word.strip()
        if word not in self.cache:
            self.cache[word] = {i.normal_form for i in self.analyzer.parse(word)}
        if nf_cnt is not None and len(self.cache[word]) > 1:
            # select the most common normal form
            h_weights = [nf_cnt[h] for h in self.cache[word]]
            max_weight = max(h_weights)
            res = {h for i, h in enumerate(self.cache[word])
                   if h_weights[i] == max_weight}
        else:
            res = self.cache[word]
        return sorted(list(res))

    def analyze(self, word):
        if not word:
            return ['']
        if word not in self.cache:
            spacyed = self.analyzer(word)
            lemma = spacyed[0].lemma_ if spacyed[0].lemma_ != '-PRON-' else spacyed[0].lower_
            self.cache[word] = [lemma]
        return self.cache[word]

    def get_lemmas(self, word, nf_cnt=None):
        if 'ru' in self.data_name:
            return self.analyze_russian_word(word, nf_cnt)
        else:
            return self.analyze(word)

    def get_single_lemma(self, word, nf_cnt):
        return self.get_lemmas(word, nf_cnt)[0]

    def preprocess_substitutes(self, substs_probs, target_word, nf_cnt, topk,
                               exclude_lemmas=set(), delete_word_parts=False):
        """
        1) leaves only the topk substitutes without spaces inside
        2) applies lemmatization
        3) excludes unwanted lemmas (if any)
        4) returns a string of space-separated substitutes
        """
        exclude = exclude_lemmas.union({target_word})
        if delete_word_parts:
            res = [word.strip() for prob, word in substs_probs[:topk]
                   if word.strip() and ' ' not in word.strip() and word[0] == ' ']
        else:
            res = [word.strip() for prob, word in substs_probs[:topk]
                   if word.strip() and ' ' not in word.strip()]
        # TODO: optimise!
        if exclude:
            if self.lemmatizing_method != 'none':
                res = [s for s in res
                       if not set(self.get_lemmas(s)).intersection(exclude)]
            else:
                res = [s for s in res if s not in exclude]

        if self.lemmatizing_method == 'single':
            res = [self.get_single_lemma(word.strip(), nf_cnt) for word in res]
        elif self.lemmatizing_method == 'all':
            res = [' '.join(self.get_lemmas(word.strip(), nf_cnt)) for word in res]
        else:
            assert self.lemmatizing_method == 'none', \
                "unrecognized lemmatization method %s" % self.lemmatizing_method

        return ' '.join(res)

    def get_substitutes(self, path, topk, data_name=None):
        if data_name is None:
            data_name = self.data_name
        if data_name in self.dfs:
            assert data_name in self.nf_cnts
            subst = self.dfs[data_name]
            nf_cnt = self.nf_cnts[data_name]
        else:
            subst = load_substs(path, data_name=data_name,
                                drop_duplicates=self.drop_duplicates,
                                limit=self.max_examples)
            if self.lemmatizing_method != 'none' and self.count_lemmas_weights and 'ru' in self.data_name:
                nf_cnt = self.get_nf_cnt(subst['substs_probs'])
            else:
                nf_cnt = None
            self.dfs[data_name] = subst
            self.nf_cnts[data_name] = nf_cnt

        subst['substs'] = subst.apply(
            lambda x: self.preprocess_substitutes(x.substs_probs, x.word, nf_cnt, topk,
                                                  delete_word_parts=self.delete_word_parts),
            axis=1)
        subst['word'] = subst['word'].apply(lambda x: x.replace('ё', 'е'))
        return subst

    def get_substs_pair(self, path1, path2, topk):
        """
        Loads substitutes from path1 and path2 and applies preprocessing
        """
        return (self.get_substitutes(path1, topk=topk, data_name=self.data_name + '_1'),
                self.get_substitutes(path2, topk=topk, data_name=self.data_name + '_2'))
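# Hypothetical usage sketch for Substs_loader. The dataset name, the file path
# and the load_substs helper are assumptions taken from the surrounding code,
# not a documented API:
loader = Substs_loader('ru_semeval', lemmatizing_method='single',
                       count_lemmas_weights=True)
substs_df = loader.get_substitutes('substs/ru.npz', topk=10)  # path is a placeholder
print(substs_df[['word', 'substs']].head())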
bot = telebot.TeleBot(conf.TOKEN, threaded=False)
bot.remove_webhook()
bot.set_webhook(url=WEBHOOK_URL_BASE + WEBHOOK_URL_PATH)

app = flask.Flask(__name__)


@bot.message_handler(commands=['start', 'help'])
def send_welcome(message):
    bot.send_message(message.chat.id,
                     "Здравствуйте! Это бот, с которым можно разговаривать.")


@bot.message_handler(func=lambda m: True)  # the stray ':' after this decorator was a syntax error
def send(message):
    reply = ''
    for word in message.text.split(' '):  # a telebot Message has no .split(); use .text
        ana = morph.parse(word.strip('.,:;?!()""'))[0]
        if ana.tag.POS in pos_files_dict and ana.tag.POS not in ['NOUN', 'NPRO']:
            file = pos_files_dict[ana.tag.POS]
            words = open(file, 'r').read().split(' ')
            word_replace = random.choice(words)
            # collect the grammemes of the original word so the replacement
            # can be inflected into the same form
            grammemes = set()
            grammemes.add(ana.tag.case)
            grammemes.add(ana.tag.gender)
            grammemes.add(ana.tag.mood)
            grammemes.add(ana.tag.number)
            grammemes.add(ana.tag.person)
            grammemes.add(ana.tag.tense)
            grammemes.add(ana.tag.voice)
            grammemes.discard(None)  # remove() raises KeyError when None is absent
            word_replace = morph.parse(word_replace)[0].inflect(grammemes).word
        elif ana.tag.POS in ['NOUN', 'NPRO'] and ana.tag.gender is not None:
inverted_index = {}
page_occurrences = {}
pages = os.listdir(path=PAGES_PATH)

for index, page in enumerate(pages):
    with open(PAGES_PATH + page, 'r', encoding="utf-8") as file:
        text = file.read()
    tokens = tokenizers.simple_word_tokenize(text)
    page_occurrences[index] = len(tokens)
    for token in tokens:
        lemma = morph.parse(token)[0].normal_form.lower()
        value = inverted_index.get(lemma)
        if value is None:
            inverted_index[lemma] = {index: 1}
        elif inverted_index[lemma].get(index) is None:
            inverted_index[lemma][index] = 1
        else:
            inverted_index[lemma][index] += 1

with open("inverted_index.pkl", "wb") as inverted_index_file:
    pickle.dump(inverted_index, inverted_index_file)
with open("page_occurrences.pkl", "wb") as page_occurrences_file:
    pickle.dump(page_occurrences, page_occurrences_file)
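# A small follow-up sketch: query the pickled index for one lemma and compute a
# per-page term frequency. File names are the ones written above; the query
# word is an arbitrary example:
import pickle
from pymorphy2 import MorphAnalyzer

morph = MorphAnalyzer()
with open("inverted_index.pkl", "rb") as f:
    inverted_index = pickle.load(f)
with open("page_occurrences.pkl", "rb") as f:
    page_occurrences = pickle.load(f)

lemma = morph.parse("страница")[0].normal_form
for page_idx, count in inverted_index.get(lemma, {}).items():
    tf = count / page_occurrences[page_idx]  # occurrences / tokens on the page
    print(page_idx, count, round(tf, 4))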
class PymorphyVectorizer(WordIndexVectorizer):
    """
    Transforms russian words into 0-1 vectors of their possible Universal
    Dependencies tags. Tags are obtained using the Pymorphy analyzer
    (pymorphy2.readthedocs.io) and transformed to UD2.0 format using the
    russian-tagsets library (https://github.com/kmike/russian-tagsets).
    All UD2.0 tags that are compatible with the produced tags are memorized.
    The list of possible Universal Dependencies tags is read from a file,
    which contains all the labels that occur in the UD2.0 SynTagRus dataset.

    Args:
        save_path: path to save the tags list,
        load_path: path to load the list of tags,
        max_pymorphy_variants: maximal number of pymorphy parses to be used.
            If -1, all parses are used.
    """
    USELESS_KEYS = ["Abbr"]
    VALUE_MAP = {"Ptan": "Plur", "Brev": "Short"}

    def __init__(self, save_path: str, load_path: str,
                 max_pymorphy_variants: int = -1, **kwargs) -> None:
        super().__init__(save_path, load_path, **kwargs)
        self.max_pymorphy_variants = max_pymorphy_variants
        self.load()
        self.memorized_word_indexes = dict()
        self.memorized_tag_indexes = dict()
        self.analyzer = MorphAnalyzer()
        self.converter = converters.converter('opencorpora-int', 'ud20')

    @property
    def dim(self):
        return len(self._t2i)

    def save(self) -> None:
        """Saves the dictionary to self.save_path"""
        with self.save_path.open("w", encoding="utf8") as fout:
            fout.write("\n".join(self._i2t))

    def load(self) -> None:
        """Loads the dictionary from self.load_path"""
        self._i2t = []
        with self.load_path.open("r", encoding="utf8") as fin:
            for line in fin:
                line = line.strip()
                if line == "":
                    continue
                self._i2t.append(line)
        self._t2i = {tag: i for i, tag in enumerate(self._i2t)}
        self._make_tag_trie()

    def _make_tag_trie(self):
        self._nodes = [defaultdict(dict)]
        self._start_nodes_for_pos = dict()
        self._data = [None]
        for tag, code in self._t2i.items():
            if "," in tag:
                pos, tag = tag.split(",", maxsplit=1)
                tag = sorted([tuple(elem.split("=")) for elem in tag.split("|")])
            else:
                pos, tag = tag, []
            start = self._start_nodes_for_pos.get(pos)
            if start is None:
                start = self._start_nodes_for_pos[pos] = len(self._nodes)
                self._nodes.append(defaultdict(dict))
                self._data.append(None)
            for key, value in tag:
                values_dict = self._nodes[start][key]
                child = values_dict.get(value)
                if child is None:
                    child = values_dict[value] = len(self._nodes)
                    self._nodes.append(defaultdict(dict))
                    self._data.append(None)
                start = child
            self._data[start] = code
        return self

    def find_compatible(self, tag: str) -> List[int]:
        """
        Transforms a Pymorphy tag to a list of indexes of compatible UD tags.

        Args:
            tag: input Pymorphy tag

        Returns:
            indexes of compatible UD tags
        """
        if " " in tag and "_" not in tag:
            pos, tag = tag.split(" ", maxsplit=1)
            tag = sorted([tuple(elem.split("=")) for elem in tag.split("|")])
        else:
            pos, tag = tag.split()[0], []
        if pos not in self._start_nodes_for_pos:
            return []
        tag = [(key, self.VALUE_MAP.get(value, value))
               for key, value in tag if key not in self.USELESS_KEYS]
        if len(tag) > 0:
            curr_nodes = [(0, self._start_nodes_for_pos[pos])]
            final_nodes = []
        else:
            final_nodes = [self._start_nodes_for_pos[pos]]
            curr_nodes = []
        while len(curr_nodes) > 0:
            i, node_index = curr_nodes.pop()
            node = self._nodes[node_index]
            if len(node) == 0:
                final_nodes.append(node_index)
            for curr_key, curr_values_dict in node.items():
                curr_i, curr_node_index = i, node_index
                while curr_i < len(tag) and tag[curr_i][0] < curr_key:
                    curr_i += 1
                if curr_i == len(tag):
                    final_nodes.extend(curr_values_dict.values())
                    continue
                key, value = tag[curr_i]
                if curr_key < key:
                    for child in curr_values_dict.values():
                        curr_nodes.append((curr_i, child))
                else:
                    child = curr_values_dict.get(value)
                    if child is not None:
                        if curr_i < len(tag) - 1:
                            curr_nodes.append((curr_i + 1, child))
                        else:
                            final_nodes.append(child)
        answer = []
        while len(final_nodes) > 0:
            index = final_nodes.pop()
            if self._data[index] is not None:
                answer.append(self._data[index])
            for elem in self._nodes[index].values():
                final_nodes.extend(elem.values())
        return answer

    def _get_word_indexes(self, word):
        answer = self.memorized_word_indexes.get(word)
        if answer is None:
            parse = self.analyzer.parse(word)
            if self.max_pymorphy_variants > 0:
                parse = parse[:self.max_pymorphy_variants]
            tag_indexes = set()
            for elem in parse:
                tag_indexes.update(set(self._get_tag_indexes(elem.tag)))
            answer = self.memorized_word_indexes[word] = list(tag_indexes)
        return answer

    def _get_tag_indexes(self, pymorphy_tag):
        answer = self.memorized_tag_indexes.get(pymorphy_tag)
        if answer is None:
            tag = self.converter(str(pymorphy_tag))
            answer = self.memorized_tag_indexes[pymorphy_tag] = self.find_compatible(tag)
        return answer
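# Hypothetical usage sketch for the vectorizer above. The tags file must list
# UD2.0 labels (one per line); both the file and the path here are assumptions,
# not shipped artifacts:
vectorizer = PymorphyVectorizer(save_path="ud_tags.txt", load_path="ud_tags.txt",
                                max_pymorphy_variants=3)
indexes = vectorizer._get_word_indexes("мама")  # indexes of compatible UD tags
print(vectorizer.dim, indexes[:5])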
class RussianLemmatizer(Lemmatizer):
    def __init__(
        self,
        vocab: Vocab,
        model: Optional[Model],
        name: str = "lemmatizer",
        *,
        mode: str = "pymorphy2",
        overwrite: bool = False,
        scorer: Optional[Callable] = lemmatizer_score,
    ) -> None:
        if mode == "pymorphy2":
            try:
                from pymorphy2 import MorphAnalyzer
            except ImportError:
                raise ImportError(
                    "The Russian lemmatizer mode 'pymorphy2' requires the "
                    "pymorphy2 library. Install it with: pip install pymorphy2"
                ) from None
            if getattr(self, "_morph", None) is None:
                self._morph = MorphAnalyzer()
        super().__init__(vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer)

    def pymorphy2_lemmatize(self, token: Token) -> List[str]:
        string = token.text
        univ_pos = token.pos_
        morphology = token.morph.to_dict()
        if univ_pos == "PUNCT":
            return [PUNCT_RULES.get(string, string)]
        if univ_pos not in ("ADJ", "DET", "NOUN", "NUM", "PRON", "PROPN", "VERB"):
            # Skip unchangeable pos
            return [string.lower()]
        analyses = self._morph.parse(string)
        filtered_analyses = []
        for analysis in analyses:
            if not analysis.is_known:
                # Skip the parse variants pymorphy suggests for unknown words
                continue
            analysis_pos, _ = oc2ud(str(analysis.tag))
            if analysis_pos == univ_pos or (analysis_pos in ("NOUN", "PROPN")
                                            and univ_pos in ("NOUN", "PROPN")):
                filtered_analyses.append(analysis)
        if not len(filtered_analyses):
            return [string.lower()]
        if morphology is None or (len(morphology) == 1 and POS in morphology):
            return list(dict.fromkeys(
                [analysis.normal_form for analysis in filtered_analyses]))
        if univ_pos in ("ADJ", "DET", "NOUN", "PROPN"):
            features_to_compare = ["Case", "Number", "Gender"]
        elif univ_pos == "NUM":
            features_to_compare = ["Case", "Gender"]
        elif univ_pos == "PRON":
            features_to_compare = ["Case", "Number", "Gender", "Person"]
        else:  # VERB
            features_to_compare = [
                "Aspect", "Gender", "Mood", "Number", "Tense", "VerbForm", "Voice",
            ]
        analyses, filtered_analyses = filtered_analyses, []
        for analysis in analyses:
            _, analysis_morph = oc2ud(str(analysis.tag))
            for feature in features_to_compare:
                if (feature in morphology and feature in analysis_morph
                        and morphology[feature].lower() != analysis_morph[feature].lower()):
                    break
            else:
                filtered_analyses.append(analysis)
        if not len(filtered_analyses):
            return [string.lower()]
        return list(dict.fromkeys(
            [analysis.normal_form for analysis in filtered_analyses]))

    def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
        string = token.text
        analyses = self._morph.parse(string)
        if len(analyses) == 1:
            return [analyses[0].normal_form]
        return [string]
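# Hedged usage sketch: in spaCy v3 the Russian lemmatizer is normally attached
# through the pipeline config rather than instantiated directly. Whether the
# "pymorphy2" mode is available depends on the installed spacy/pymorphy2
# versions, so treat this as a sketch, not a guaranteed recipe:
import spacy

nlp = spacy.blank("ru")
nlp.add_pipe("lemmatizer", config={"mode": "pymorphy2"})
nlp.initialize()
doc = nlp("мамы мыли рамы")
print([token.lemma_ for token in doc])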
class MagicWorker():
    def __init__(self, path_stopwords, path_clusters):
        # self.classifier = ft.load(path_to_fasttext_model)
        self.__stopwords = self.__get_stopwords__(path_stopwords)
        self.__analyzer = MorphAnalyzer()
        self.__classes, self.__answers = self.__get_classes__(path_clusters)

    def __get_classes__(self, path_to_clusters):
        classes, answers = [], []
        with open(path_to_clusters, 'r') as f:
            for line in f:
                tmp = list(line.split(', '))
                tmp[-1] = tmp[-1].replace('\n', '')
                classes.append(tmp[:-1])
                answers.append(tmp[-1])
        return classes, answers

    def __get_stopwords__(self, path_to_stopwords):
        with open(path_to_stopwords, 'r') as f:
            stopwords = list(f.read().split('\n'))
        return stopwords

    def __process_request__(self, request: str):
        request = request.lower()
        letters = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя '
        spec_syms = ',./<>?;":[]{}!@#$%^&*()-=_+|'
        for sym in spec_syms:
            request = request.replace(sym, ' ')
        request = re.sub(r'\s+', ' ', request)
        request = request.replace('ё', 'е')
        # keep only Cyrillic letters and spaces
        result = ''
        for letter in request:
            if letter in letters:
                result += letter
        # lemmatize every word
        temp = []
        for word in result.split():
            temp.append(self.__analyzer.parse(word)[0].normal_form)
        result = ' '.join(temp)
        # drop stopwords
        tmp_ = []
        for t in result.split(' '):
            if t not in self.__stopwords:
                tmp_.append(t)
        result = ' '.join(tmp_)
        return result

    def __analize__request__(self, request: str):
        processed_request = self.__process_request__(request)
        count_of_entries = [0 for _ in range(len(self.__classes))]
        for word in processed_request.split(' '):
            for i in range(len(self.__classes)):
                if word in self.__classes[i]:
                    count_of_entries[i] += 1
        persents_of_entries = [
            int(count_of_entries[i] / len(self.__classes[i]) * 100)
            for i in range(len(count_of_entries))
        ]
        return persents_of_entries

    def predict(self, request: str):
        persents = self.__analize__request__(request)
        ans = 'Попробуйте переформулировать вопрос'
        max_persents_index = 0
        for i in range(1, len(persents)):
            if persents[i] > persents[max_persents_index]:
                max_persents_index = i
        if persents[max_persents_index] > 10:
            # and (persents.count(persents[max_persents_index]) == 1):
            ans = self.__answers[max_persents_index]
        return ans
def get_first_data(self):
    morp = MorphAnalyzer()
    udpipe_data = pd.DataFrame(
        columns=['id', 'form', 'lemma', 'UPosTag', 'XPosTag', 'Feats',
                 'Head', 'DepRel', 'Deps', 'Misc'])
    for text in self.df['first_or_propn'].values.tolist():
        if len(text.split(' ')) == 1:
            processed = pipeline.process(text)
        elif len(text.split(' ')) > 1:
            # in a multi-word value, parse the first noun/pronoun
            # (note: pymorphy2 tags pronouns as 'NPRO', so 'PRON' never matches here)
            final_word = None
            for word in text.split(' '):
                p = morp.parse(word)[0]
                if str(p.tag.POS) == 'NOUN' or str(p.tag.POS) == 'PRON':
                    final_word = word
                    break
            if final_word is None:
                processed = pipeline.process(text.split(' ')[0])
            else:
                processed = pipeline.process(final_word)
        else:
            processed = pipeline.process(text)
        processed = processed.split('\n')
        for line in processed:
            if '#' not in line and line != '':
                fields = line.split('\t')
                # DataFrame.append is removed in pandas 2.x; kept here as in
                # the original, which targets older pandas
                udpipe_data = udpipe_data.append({
                    'id': fields[0],
                    'form': fields[1],
                    'lemma': fields[2],
                    'UPosTag': fields[3],
                    'XPosTag': fields[4],
                    'Feats': fields[5],
                    'Head': fields[6],
                    'DepRel': fields[7],
                    'Deps': fields[8],
                    'Misc': fields[9]
                }, ignore_index=True)
    # first pass: collect every feature name that can become a column
    cols = list()
    for i in range(udpipe_data.shape[0]):
        if '|' in udpipe_data.Feats[i]:
            tmp_data = udpipe_data.Feats[i].split('|')
            for j in tmp_data:
                cols.append(j.split('=')[0])
    cols = list(set(cols))
    for i in range(len(cols)):
        udpipe_data[cols[i]] = np.nan  # assumes `import numpy as np`; pd.np was removed
    for i in range(udpipe_data.shape[0]):
        if '|' in udpipe_data.Feats[i]:
            tmp_data = udpipe_data.Feats[i].split('|')
            for j in tmp_data:
                # .loc avoids the chained-assignment of df[col][i] = ...
                udpipe_data.loc[i, j.split('=')[0]] = j.split('=')[1]
    udpipe_data.insert(1, 'number_of_sent', np.nan)
    n = 1
    for i in range(udpipe_data.shape[0] - 1):
        # CoNLL-U token ids are strings; compare them as integers
        if int(udpipe_data.iloc[i, 0]) < int(udpipe_data.iloc[i + 1, 0]):
            udpipe_data.iloc[i, 1] = n
        else:
            udpipe_data.iloc[i, 1] = n
            n += 1
    udpipe_data.iloc[udpipe_data.shape[0] - 1, 1] = n
    udpipe_data = udpipe_data.drop(
        ['id', 'number_of_sent', 'form', 'lemma', 'XPosTag', 'Feats',
         'Head', 'DepRel', 'Deps', 'Misc'], axis=1)
    need_cols = ['UPosTag', 'Animacy', 'Number', 'Case', 'Gender']
    for col in need_cols:
        if col not in udpipe_data:
            udpipe_data[col] = np.nan
    udpipe_data = udpipe_data[['UPosTag', 'Animacy', 'Number', 'Case', 'Gender']]
    return udpipe_data
def convert_words(self, word, number):
    # NOTE: constructing a MorphAnalyzer per call is expensive; it is usually
    # created once and reused
    a = MorphAnalyzer()
    conv_word = a.parse(word)[0]
    return conv_word.make_agree_with_number(number).word
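# What make_agree_with_number does, on a standalone example (plain pymorphy2
# API; the word is arbitrary):
from pymorphy2 import MorphAnalyzer

morph = MorphAnalyzer()
butterfly = morph.parse('бабочка')[0]
for number in (1, 2, 5):
    print(number, butterfly.make_agree_with_number(number).word)
# -> 1 бабочка / 2 бабочки / 5 бабочек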
import re
import sys
from collections import Counter
from os import path

from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
import pandas as pd

# text reading, cleaning and splitting into words
if len(sys.argv) > 1:
    if path.exists(sys.argv[1]):
        with open(sys.argv[1]) as f:
            text = re.split(r'\b[\W\d\s]+\b', f.read())
    else:
        raise FileNotFoundError("File {} not found".format(sys.argv[1]))
else:
    raise Exception("Missing command line parameter")

# word normalization
morph = MorphAnalyzer()
normal_words = [morph.parse(word)[0].normal_form for word in text]

# stopword removal
try:
    set_stopwords = set(stopwords.words('russian'))
except Exception:
    import nltk
    nltk.download('stopwords')
    set_stopwords = set(stopwords.words('russian'))
set_words = set(normal_words) - set_stopwords

# word counting and sorting (modern pandas rejects sets as indexers, so
# convert to a list first)
counted_words = pd.Series(Counter(normal_words))
sorted_words = counted_words[list(set_words)].sort_values(ascending=False)
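# A possible tail for the script above (hedged — the original stops at the
# sorting step): print the ten most frequent lemmas. The script name in the
# run command is hypothetical:
print(sorted_words.head(10))
# run as:  python word_freq.py some_text.txt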
# (the opening condition is truncated in the source; the usual RusVectores-style
# loader checks for a text-format extension here)
if m.endswith('.vec.gz'):
    model = gensim.models.KeyedVectors.load_word2vec_format(m, binary=False)
elif m.endswith('.bin.gz'):
    model = gensim.models.KeyedVectors.load_word2vec_format(m, binary=True)
else:
    model = gensim.models.KeyedVectors.load(m)
model.init_sims(replace=True)

# take the 4 nearest words by cosine similarity into the dictionary
dop_words = []
for word in words_:
    if word in model:
        for i in model.most_similar(positive=[word], topn=4):
            # i is a (word, cosine similarity) pair
            wrd = re.sub(r'[^а-я|\s|А-Я]', '', i[0])
            wrd = morph.parse(wrd)[0].normal_form
            dop_words.append(wrd)

words_ = ["режиссёр", "кино", "премьера", "показ", "блокбастер"]
words_ = dop_words + words_


def for_bar_dict(fr_dict):
    # dictionary for building a bar chart
    graph_dict = {}
    for word in words_:
        try:
            graph_dict[word] = fr_dict[word]
        except KeyError:
            graph_dict[word] = 0
    return graph_dict
class UDPymorphyLemmatizer(BasicLemmatizer):
    """
    A class that returns the normal form of a Russian word given its
    morphological tag in UD format. The lemma is selected from one of the
    PyMorphy parses; the parse whose tag most resembles a known UD tag is chosen.
    """
    RARE_FEATURES = ["Fixd", "Litr"]
    SPECIAL_FEATURES = ["Patr", "Surn"]

    def __init__(self, save_path: Optional[str] = None,
                 load_path: Optional[str] = None,
                 rare_grammeme_penalty: float = 1.0,
                 long_lemma_penalty: float = 1.0, **kwargs) -> None:
        self.rare_grammeme_penalty = rare_grammeme_penalty
        self.long_lemma_penalty = long_lemma_penalty
        self._reset()
        self.analyzer = MorphAnalyzer()
        self.converter = converters.converter("opencorpora-int", "ud20")
        super().__init__(save_path, load_path, **kwargs)

    def save(self, *args, **kwargs):
        pass

    def load(self, *args, **kwargs):
        pass

    def _reset(self):
        self.memo = dict()

    def _extract_lemma(self, parse: Parse) -> str:
        special_feats = [x for x in self.SPECIAL_FEATURES if x in parse.tag]
        if len(special_feats) == 0:
            return parse.normal_form
        # here we process surnames and patronyms,
        # since PyMorphy lemmatizes them incorrectly
        for other in parse.lexeme:
            tag = other.tag
            if any(x not in tag for x in special_feats):
                continue
            if tag.case == "nomn" and tag.gender == parse.tag.gender and tag.number == "sing":
                return other.word
        return parse.normal_form

    def _lemmatize(self, word: str, tag: Optional[str] = None) -> str:
        lemma = self.memo.get((word, tag))
        if lemma is not None:
            return lemma
        parses = self.analyzer.parse(word)
        best_lemma, best_distance = word, np.inf
        for i, parse in enumerate(parses):
            curr_tag = self.converter(str(parse.tag))
            distance = get_tag_distance(tag, curr_tag)
            for feat in self.RARE_FEATURES:
                if feat in parse.tag:
                    distance += self.rare_grammeme_penalty
                    break
            if len(word) == 1 and len(parse.normal_form) > 1:
                distance += self.long_lemma_penalty
            if distance < best_distance:
                best_lemma, best_distance = self._extract_lemma(parse), distance
                if distance == 0:
                    break
        self.memo[(word, tag)] = best_lemma
        return best_lemma
def read_text_lemmas(fileobj):
    m = MorphAnalyzer()
    for line in fileobj:
        yield ' '.join(m.parse(t)[0].normal_form
                       for t in simple_word_tokenize(line.decode('utf-8')))
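# Usage sketch: the generator calls .decode() on each line, so the file must be
# opened in binary mode. The file name is a placeholder:
with open('corpus.txt', 'rb') as f:
    for lemmatized_line in read_text_lemmas(f):
        print(lemmatized_line)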
def lemmatize(self, tokens):
    """
    :param tokens: a list of tokens to lemmatize
    """
    # NOTE: MorphAnalyzer is expensive to construct; in hot code it is
    # usually created once and shared
    analyzer = MorphAnalyzer()
    return Counter([analyzer.parse(token)[0].normal_form
                    for token in tokens if len(token) > 1])
class LamaBot(object):
    def __init__(self, app_id, mail_manager, chat_id=1,
                 number_of_seconds_for_the_rest=60, chat_id_for_mails=None,
                 admins=None, **kwargs):
        """
        Initializes Lama Bot.

        Expects login/password or access_token as named parameters

        :param mail_manager: A manager for retrieving mails
        :type mail_manager: AbstractMailManager

        :param chat_id: Chat identifier
        :type chat_id: int

        :param chat_id_for_mails: Chat for mails. Same as chat_id, if not presented
        :type chat_id_for_mails: int

        :raise ValueError: When neither login/password nor access_token was provided
        """
        self.exit_event = Event()
        self.morph = MorphAnalyzer()
        self.version = '0.1.1'
        self.app_id = app_id
        self.access_token = None
        self.password = None
        self.login = None
        self.vkapi = ThreadSafeVkApi(app_id=app_id, **kwargs)
        self.commands = {}
        self._plugins = []
        self.mail_manager = mail_manager
        self.number_of_seconds_for_the_rest = number_of_seconds_for_the_rest
        self.chat_id = chat_id
        self.chat_id_for_mails = chat_id_for_mails or self.chat_id
        self.admins = admins or []
        self.initialize_commands()

    def initialize_commands(self):
        self.commands = {
            'post_to_dialog': lambda args, m: self.safe_post_message_and_log_if_failed(args),
            'ping': self.pong_to_admins
        }

    def safe_notify_about_unread_mails(self):
        for m in self.safe_unread_mails:
            if self.safe_post_mail_and_log_if_failed(m):
                self.mail_manager.safe_mark_mail_as_read_and_log_if_failed(m)

    def safe_process_directed_dialog_message(self, message):
        logging.debug(u'Processing message with body {}'.format(message.body))
        words = self.split_to_words(message.body)
        logging.debug(u'Words in the body: {}'.format(words))
        self.safe_process_plugins(message, words)
        self.safe_mark_message_as_read_and_log_if_failed(message)

    def safe_process_private_message(self, message):
        if self.safe_execute_and_log_if_failed(message):
            self.safe_mark_message_as_read_and_log_if_failed(message)

    @safe_call_and_log_if_failed
    def safe_process_plugins(self, message, words):
        normalized_words = self.normalize_words(words)
        for p in self.plugins:
            p.process_input(message.body, words, normalized_words, message)

    def long_pool_loop(self, exit_event):
        server, key, ts = self.extract_server_key_and_timestamp_from_get_long_poll_server_response()
        while not exit_event.is_set():
            response = self.send_long_poll_request(server, key, ts)
            if 'failed' in response:
                server, key, ts = self.extract_server_key_and_timestamp_from_get_long_poll_server_response()
            else:
                self.process_long_poll_response(response)
                ts = self.get_timestamp(response, ts)

    def extract_server_key_and_timestamp_from_get_long_poll_server_response(self):
        response = self.vkapi.messages_get_long_poll_server()
        while not all(x in response for x in ('server', 'key', 'ts')):
            logging.error('Could not retrieve credentials for connecting to long poll server', response)
            response = self.vkapi.messages_get_long_poll_server()
        return response['server'], response['key'], response['ts']

    @safe_call_and_log_if_failed(default={'failed': True})
    def send_long_poll_request(self, server, key, ts, act='a_check', wait=25, mode=2):
        params = {
            'act': act,
            'key': key,
            'ts': ts,
            'wait': wait,
            'mode': mode
        }
        return requests.get('http://{server}'.format(server=server), params=params).json()

    def process_long_poll_response(self, response):
        if response:
            for update in response.get('updates', []):
                self.process_long_poll_update(update)

    def process_long_poll_update(self, update):
        functions = {
            4: self.process_long_poll_new_message
        }
        function = functions.get(update[0])
        if function:
            function(update)

    def process_long_poll_new_message(self, update):
        chat_id = self.get_chat_id_from_long_poll_new_message_update(update)
        fwd_messages = self.get_fwd_messages_from_long_poll_new_message_update(update)
        self.process_new_message(VkMessage({'id': update[1],
                                            'user_id': None,
                                            'read_state': (update[2] + 1) % 2,
                                            'chat_id': chat_id,
                                            'title': update[5],
                                            'body': update[6],
                                            'fwd_messages': fwd_messages,
                                            'out': (update[2] & 2) >> 1}))

    def process_new_message(self, message):
        if message.is_unread and message.is_inbox:
            if message.chat_id == self.chat_id and self.message_is_directed(message):
                self.safe_process_directed_dialog_message(message)
            elif message.is_private:
                self.safe_process_private_message(message)

    def get_fwd_messages_from_long_poll_new_message_update(self, update):
        return map(self.convert_fwd_from_long_poll_new_message_update_to_fwd_message,
                   ifilter(None, self.get_attachments_from_long_poll_new_message_update(update)
                           .get('fwd', '').split(',')))

    @staticmethod
    def convert_fwd_from_long_poll_new_message_update_to_fwd_message(fwd):
        regex = re.compile(r'(?P<user_id>\d+)_(?P<msg_id>\d+)')
        m = regex.match(fwd)
        return {
            'id': m.group('msg_id'),
            'user_id': m.group('user_id')
        }

    @staticmethod
    def get_chat_id_from_long_poll_new_message_update(update):
        """
        The message was sent from a chat if user_id is greater than 2000000000

        :param update:
        :return:
        """
        return update[3] - 2000000000 if update[3] > 2000000000 else None

    def get_user_id_from_long_poll_new_message_update(self, update):
        """
        Retrieves user_id from update according to the documentation
        https://vk.com/pages?oid=-17680044&p=Connecting_to_the_LongPoll_Server

        :param update:
        :return:
        """
        return self.get_attachments_from_long_poll_new_message_update(update).get('from')

    @staticmethod
    def get_attachments_from_long_poll_new_message_update(update):
        return update[7] if len(update) > 7 else {}

    @staticmethod
    def get_timestamp(response, default):
        return response.get('ts', default) if response else default

    @property
    def unread_mails(self):
        return self.mail_manager.unread_mails

    @property
    def safe_unread_mails(self):
        """
        Just delegates the work to the mail manager

        :return:
        """
        return self.mail_manager.safe_unread_mails

    @property
    def vkapi_messages_get(self):
        return self.vkapi.messages_get()

    @property
    def plugins(self):
        """
        :rtype : a list of LamaPlugin
        """
        return self._plugins

    def vkapi_messages_set_activity_in_chat(self):
        return self.vkapi.messages_set_activity(chat_id=self.chat_id, type='typing')

    def post_mail(self, mail):
        """
        Posts mail to VK. Loads and attaches documents, if any.

        :param mail:
        :return:
        """
        documents = None
        if mail.attachments:
            documents = filter(None, imap(self.safe_upload_attachment, mail.attachments))
        self.post_message_to_mail_dialog(self.wrap_mail(mail), attachments=documents)

    @safe_call_and_log_if_failed(default=False)
    def safe_post_mail_and_log_if_failed(self, mail):
        """
        :param mail:
        :return: True if no error, False otherwise
        """
        self.post_mail(mail)
        return True

    @safe_call_and_log_if_failed()
    def safe_post_message_and_log_if_failed(self, message):
        self.post_message_to_dialog(message)

    @safe_call_and_log_if_failed
    def pong_to_admins(self, _, message):
        self.post_message_to_admins('Pong', forward_messages=[message])

    @safe_call_and_log_if_failed
    def safe_post_message_with_forward_messages(self, message, forward_messages):
        self.post_message_to_dialog(message, forward_messages=forward_messages)

    def execute(self, message):
        s = message.body
        command, args = self.split_to_command_and_argument(s)
        if command in self.commands:
            self.commands[command](args, message)
        else:
            self.command_not_found(command)

    @safe_call_and_log_if_failed(default=False)
    def safe_execute_and_log_if_failed(self, message):
        self.execute(message)
        return True

    @staticmethod
    def split_to_command_and_argument(command):
        values = command.split(':', 1)
        if len(values) != 2:
            values.append(None)
        return values[0], values[1]

    def _post_message_to_dialog(self, chat_id, message, attachments=None, forward_messages=None):
        """
        Posts message to dialog. Attaches attachments, if any.

        :param forward_messages: Messages to be forwarded
        :type forward_messages: [VkMessage]

        :param attachments: Documents to be attached
        :type attachments: [VkDocument]

        :param message:
        """
        attachments = attachments or []
        forward_messages = forward_messages or []
        attachment = ','.join(map(lambda d: d.attachment_string, attachments))
        forward_messages_str = ','.join(map(lambda m: str(m.id), forward_messages))
        self.vkapi.messages_send(chat_id=chat_id, message=message,
                                 attachment=attachment,
                                 forward_messages=forward_messages_str)

    def post_message_to_dialog(self, message, attachments=None, forward_messages=None):
        self._post_message_to_dialog(self.chat_id, message,
                                     attachments=attachments,
                                     forward_messages=forward_messages)

    def post_message_to_mail_dialog(self, message, attachments=None, forward_messages=None):
        self._post_message_to_dialog(self.chat_id_for_mails, message,
                                     attachments=attachments,
                                     forward_messages=forward_messages)

    def post_startup_message_to_admins(self):
        self.post_message_to_admins('The Lama is ready to work! (version {0})'.format(self.version))

    @safe_call_and_log_if_failed
    def post_message_to_admins(self, message, forward_messages=None):
        forward_messages = forward_messages or []
        forward_messages_str = ','.join(map(lambda m: str(m.id), forward_messages))
        for user_id in self.admins:
            self.vkapi.messages_send(user_id=user_id, message=message,
                                     forward_messages=forward_messages_str)

    def command_not_found(self, command):
        message = u'Command `{}` not found'.format(command).encode('utf-8')
        logging.warning(message)

    def run(self, post_welcome_message_to_dialog=True):
        if post_welcome_message_to_dialog:
            self.post_startup_message_to_admins()
        long_poll = Thread(target=self.long_pool_loop, args=(self.exit_event,))
        long_poll.start()
        while True:
            self.safe_notify_about_unread_mails()
            time.sleep(self.number_of_seconds_for_the_rest)

    def stop_running(self):
        self.exit_event.set()

    @safe_call_and_log_if_failed
    def safe_upload_attachment(self, attachment):
        """
        Uploads given attachment

        :type attachment: Attachment
        :rtype: VkDocument
        """
        if attachment.is_loaded:
            url = self.safe_docs_get_upload_server()
            file_string = self.safe_upload_file_to_server(
                url,
                self.create_attachment_filename(attachment.filename),
                attachment.data,
                attachment.mime_type)
            return self.safe_save_doc_file(file_string, attachment.filename)

    @safe_call_and_log_if_failed
    def safe_upload_message_photo(self, image_file_path):
        if image_file_path is not None:
            url = self.safe_get_upload_server_for_private_message_photo()
            data = self.safe_upload_photo_to_server(
                url,
                self.create_attachment_filename(image_file_path),
                self.get_image_data(image_file_path),
                self.get_mime_type(image_file_path))
            photo_name = os.path.basename(image_file_path)
            return self.safe_save_photo_file(data['photo'], data['server'], data['hash'], photo_name)

    @staticmethod
    def get_image_data(image_filename):
        with open(image_filename, 'rb') as f:
            data = f.read()
        return data

    @staticmethod
    def get_mime_type(image_filename):
        return mimetypes.guess_type(image_filename)

    @safe_call_and_log_if_failed
    def safe_save_photo_file(self, photo, server, hash, title):
        if photo:
            responses = self.vkapi.photos_save_message_photo(photo=photo, server=server,
                                                             hash=hash, title=title)
            return VkPhoto(responses[0])

    @safe_call_and_log_if_failed
    def safe_get_upload_server_for_private_message_photo(self):
        """Retrieves upload_url for storing photos"""
        return self.vkapi.photos_get_messages_upload_server()['upload_url']

    @staticmethod
    def create_attachment_filename(filename):
        _, extension = os.path.splitext(filename)
        return 'attachment' + extension

    @safe_call_and_log_if_failed
    def safe_upload_to_server(self, url, filename, data, mime_type, post_name):
        """
        Uploads data to given url and saves it with given filename and mime_type

        :return: Raw response, returned by post request
        """
        if url:
            request = requests.post(url, files={post_name: (filename or 'NoName', data, mime_type)})
            response = json.loads(request.text)
            if 'error' in response:
                raise Exception(response['error'])
            else:
                return response

    def safe_upload_file_to_server(self, url, filename, data, mime_type):
        return self.safe_upload_to_server(url, filename, data, mime_type, 'file')['file']

    def safe_upload_photo_to_server(self, url, filename, data, mime_type):
        return self.safe_upload_to_server(url, filename, data, mime_type, 'photo')

    @safe_call_and_log_if_failed
    def safe_save_doc_file(self, file_string, title):
        """
        Saves file on VK server by given string

        :param file_string: String, returned after uploading file
        :return: Saved document
        :rtype: VkDocument
        """
        if file_string:
            responses = self.vkapi.docs_save(file=file_string, title=title)
            return VkDocument(responses[0])

    @safe_call_and_log_if_failed
    def safe_docs_get_upload_server(self):
        """Retrieves upload_url for storing files"""
        return self.vkapi.docs_get_upload_server()['upload_url']

    def retrieve_users_by_ids(self, *user_ids):
        return map(VkUser, self.vkapi.users_get(user_id=','.join(imap(str, user_ids))))

    @staticmethod
    def wrap_mail(mail):
        return LamaBeautifier.get_random_mail_pattern().format(subject=mail.subject,
                                                               sender=mail.sender,
                                                               body=mail.body)

    @staticmethod
    def message_is_directed(message):
        return message.body is not None and message.body.encode('utf-8').startswith('Лама, ')

    @staticmethod
    def message_has_body(message):
        return message.body is not None

    def mark_message_as_read(self, message):
        self.mark_message_as_read_by_id(message.id)

    @safe_call_and_log_if_failed(default=False)
    def safe_mark_message_as_read_and_log_if_failed(self, message):
        self.mark_message_as_read(message)
        return True

    def mark_message_as_read_by_id(self, message_ids):
        self.vkapi.messages_mark_as_read(message_ids=message_ids)

    def register_plugin(self, plugin):
        self._plugins.append(plugin)
        plugin.bot = self

    def split_to_words(self, body):
        return body.encode('utf-8').translate(string.maketrans('', ''), string.punctuation).split()

    def normalize_words(self, words):
        return map(self.normalize_word, words)

    def normalize_word(self, word):
        return self.morph.parse(word.decode('utf8'))[0].normal_form.encode('utf8')
class Timer(AppDaemon):
    def initialize(self):
        self.morph = MorphAnalyzer()
        self.ok_phrases = ["Без проблем, таймер сработает через {time}",
                           "Ок, таймер сработает через {time}",
                           "Сделано, таймер сработает через {time}",
                           "Готово, сработает через {time}",
                           "Ставлю таймер на {time}",
                           "Таймер на {time} - сделано"]
        self.remaining_phrases = ["Осталось {remain}.",
                                  "Таймер сработает через {remain}.",
                                  "{remain} до конца."]
        self.not_set_phrases = ["Таймер не установлен.",
                                "Я не засекала.",
                                "Прости, я не засекала.",
                                "Таймер? Не знаю. Не засекала."]
        self.already_set_phrases = ["Таймер уже установлен.",
                                    "Ты уже поставил таймер."]
        self.ok_remove_phrases = ["Отменяю.",
                                  "Хорошо, таймер отключен.",
                                  "Таймер выключен"]
        self.timer_ended_phrases = ["Время вышло!", "Сработал таймер", "Время!"]

        engine = self.get_app("brain").engine
        keyword = ["таймер", "таймеры"]
        self.set_timer_words = ["поставить", "установить"]
        self.reset_timer_words = ["сбросить", "отменить", "остановить"]
        self.state_timer_words = ["как", "остаться", "состояние"]
        re_hours = "(?P<TimerHours>[0-9]+) (час|часы)"
        re_minutes = "(?P<TimerMinutes>[0-9]+) минута"
        re_seconds = "(?P<TimerSeconds>[0-9]+) секунда"

        for k in keyword:
            engine.register_entity(k, "TimerKeyword")
        for a in self.set_timer_words:
            engine.register_entity(a, "TimerAction")
        for a in self.reset_timer_words:
            engine.register_entity(a, "TimerAction")
        for a in self.state_timer_words:
            engine.register_entity(a, "TimerAction")
        engine.register_regex_entity(re_hours)
        engine.register_regex_entity(re_minutes)
        engine.register_regex_entity(re_seconds)

        timer_intent = IntentBuilder("timer")\
            .require("TimerKeyword")\
            .optionally("TimerAction")\
            .optionally("TimerHours")\
            .optionally("TimerMinutes")\
            .optionally("TimerSeconds")\
            .build()
        engine.register_intent_parser(timer_intent)

        self.context_sensitive = True
        self.context_blacklist = ["TimerHours", "TimerSeconds", "TimerMinutes"]
        self.timer_handler = threading.Timer(10, self.timer_ended)
        print("timer initialized")

    def handle(self, intent_dict):
        # the original default "cостояние" began with a Latin "c" and could
        # never match the Cyrillic state words
        action = intent_dict.get("TimerAction", "состояние")
        # if "TimerHours" in intent_dict or "TimerMinutes" in intent_dict or "TimerSeconds" in intent_dict:
        #     action = "поставить"
        if action in self.set_timer_words:
            print("setting timer")
            return self.start_timer(intent_dict)
        elif action in self.reset_timer_words:
            print("stopping timer")
            return self.stop_timer()
        elif action in self.state_timer_words:
            print("getting state")
            return self.state_timer()
        else:
            return "Прости, что-то не так пошло с этим таймером"

    def start_timer(self, intent_dict):
        if self.timer_handler.is_alive():
            return choice(self.already_set_phrases)
        hours = int(intent_dict.get("TimerHours", 0))
        print("hours:", hours)
        minutes = int(intent_dict.get("TimerMinutes", 0))
        print("minutes:", minutes)
        seconds = int(intent_dict.get("TimerSeconds", 0))
        print("seconds:", seconds)
        time = hours * 3600 + minutes * 60 + seconds
        self.timer_handler = threading.Timer(time, self.timer_ended)
        self.timer_handler.start()
        self.timer_end_time = datetime.datetime.now() + datetime.timedelta(seconds=time)
        print(self.timer_handler.is_alive())
        return choice(self.ok_phrases).format(time=self.pron_time(hours, minutes, seconds))

    def state_timer(self):
        if not self.timer_handler.is_alive():
            return choice(self.not_set_phrases)
        timer_delta = self.timer_end_time - datetime.datetime.now()
        hours, secs = divmod(timer_delta.seconds, 3600)
        minutes, seconds = divmod(secs, 60)
        return choice(self.remaining_phrases).format(remain=self.pron_time(hours, minutes, seconds))

    def stop_timer(self):
        # self.timer_handler is never None here, so the original `is None`
        # check could not fire; check whether the timer is actually running
        if not self.timer_handler.is_alive():
            return choice(self.not_set_phrases)
        self.timer_handler.cancel()
        return choice(self.ok_remove_phrases)

    def timer_ended(self):
        print("Timer!!!")
        # yandex tts: say some phrase

    def pron_time(self, hours, minutes, seconds):
        # make_agree_with_number picks the correct plural form for each unit
        phours = str(hours) + " " + self.morph.parse("час")[0].make_agree_with_number(hours).word if hours > 0 else ""
        pminutes = str(minutes) + " " + self.morph.parse("минута")[0].make_agree_with_number(minutes).word if minutes > 0 else ""
        pseconds = str(seconds) + " " + self.morph.parse("секунда")[0].make_agree_with_number(seconds).word if seconds > 0 else ""
        return " ".join([phours, pminutes, pseconds])
class TextProcessing:
    # TODO: add stemming
    """
    Performs the usual text processing: tokenizing, lemmatizing, stopword removal.

    :param token_pat: regex pattern to split text
    :param mode: 'normal' mode provides the usual tokenize process; 'nospace'
        mode is a probability-based mode that recovers words from a given
        sequence of characters and requires the counter attribute to be passed
    :param counter: Counter object with word frequencies
    :param threshold: max number of divisions in 'nospace' mode to be considered successful
    :param allowed_pos: iterable, parts of speech to be kept by
        pymorphy2.MorphAnalyzer after lemmatizing; others will be dropped
    :param stop_words: if None is passed, default Russian stopwords are used
    :param stop_cities: if True, Russian city names will be dropped
    """

    def __init__(self, token_pat="[а-я]+", mode="normal", counter=None,
                 threshold=3, allowed_pos=None, stop_words=None,
                 stop_cities=False):
        self.token = token_pat
        self.mode = mode
        if self.mode not in {"normal", "nospace"}:
            raise ValueError("Unknown mode")
        elif self.mode == "nospace":
            if not isinstance(counter, Counter):
                raise ValueError("In 'nospace' mode the counter attribute should be passed")
            self.counter = counter
            self.nospace = NoSpaceSplitter(counter)
            self.threshold = threshold
        self.morph = MorphAnalyzer()
        self.allowed_pos = allowed_pos
        self.stop_words = stop_words or STOPWORDS
        if stop_cities:
            # set.union returns a new set, so the result must be assigned back
            self.stop_words = self.stop_words.union(CITIES)

    def tokenize(self, doc):
        """
        :param doc: must be a string or iterable; a string will be split
            into tokens, an iterable is left unchanged
        :return: list of tokens
        """
        if isinstance(doc, str):
            doc = re.findall(self.token, doc.lower())
        elif not isinstance(doc, Iterable):
            raise ValueError("The doc must be a string or iterable")
        if self.mode == "nospace":
            return self._no_space_split(doc)
        return doc

    def _no_space_split(self, doc):
        res = []
        for w in doc:
            split = self.nospace.segment(w)
            if len(split) <= self.threshold:
                res.extend(split)
            else:
                # append, not extend: extend(w) would add single characters
                res.append(w)
        return res

    def lemmatize(self, doc):
        """
        :param doc: iterable, list of words
        :return: most probable normal forms of the words in doc
        """
        res = []
        for w in doc:
            parsed = self.morph.parse(w)[0]
            if parsed in SPECIAL_WORDS:
                continue
            if self.allowed_pos:
                if parsed.tag.POS in self.allowed_pos:
                    res.append(parsed.normal_form)
                else:
                    continue
            else:
                res.append(parsed.normal_form)
        return res

    def clear_stop_words(self, doc):
        """
        :param doc: iterable, list of words
        :return: doc without stopwords
        """
        return [w for w in doc if w not in self.stop_words]

    def transform(self, corpora: "pd.Series"):
        """
        Runs the full pipeline: tokenizing, stopword removal, lemmatizing

        :param corpora: pd.Series to process
        :return: processed data
        """
        data = (corpora.map(self.tokenize)
                       .map(self.clear_stop_words)
                       .map(self.lemmatize))
        return data
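# Usage sketch on a tiny corpus. STOPWORDS, SPECIAL_WORDS and CITIES come from
# the surrounding module and are assumed to be plain sets; the sentences are
# arbitrary examples:
import pandas as pd

tp = TextProcessing(allowed_pos={"NOUN", "VERB", "ADJF"})
corpora = pd.Series(["Мама мыла раму", "Кошки любят молоко"])
print(tp.transform(corpora).tolist())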
class Analyser():
    def __init__(self):
        # load the model configs
        main_model_config_path = MODELS_PATHS["main_model_config"]
        main_model_weights_path = MODELS_PATHS["main_model_weights"]
        gram_dict_input = MODELS_PATHS["gram_input"]
        gram_dict_output = MODELS_PATHS["gram_output"]
        word_dictionary = MODELS_PATHS["word_dictionary"]
        char_set_path = MODELS_PATHS["char_set"]
        build_config = MODELS_PATHS["build_config"]

        self.converter = converters.converter('opencorpora-int', 'ud14')
        self.morph = MorphAnalyzer()  # pymorphy2
        self.build_config = BuildModelConfig()
        self.build_config.load(build_config)

        self.model = LSTMModel()
        self.model.prepare(gram_dict_input, gram_dict_output, word_dictionary, char_set_path)
        self.model.load_main_model(self.build_config, main_model_config_path, main_model_weights_path)

    def analyse(self, words: List[str]) -> List[WordForm]:
        """
        Grammatical analysis of the input text (without sentence splitting)
        """
        words_predicts = self.model.predict_gram_analysis([words], 1, self.build_config)[0]
        return self.predictionsParsing(words, words_predicts)

    def analyse_sentences(self, sentences: List[List[str]],
                          batch_size: int = 64) -> List[List[WordForm]]:
        """
        Grammatical analysis of a text sample (with splitting into sentences)
        """
        sentences_predicts = self.model.predict_gram_analysis(
            sentences, batch_size, self.build_config)
        answers = []
        for words, words_predicts in zip(sentences, sentences_predicts):
            answers.append(self.predictionsParsing(words, words_predicts))
        return answers

    def split_text_on_words(self, text: str) -> List[str]:
        """
        Splits text into words and punctuation
        """
        words = []
        separators = {",", ".", ";", "-", "\"", ":", "'", "—", "(", ")", "?", "!"}
        for word in text.split(" "):
            if word == "":
                continue
            count = 0
            for s in word:
                if s in separators:
                    if count > 0:
                        words.append(word[0:count])
                    words.append(word[count])
                    word = word[count + 1:len(word)]
                    count = 0
                else:
                    count += 1
            if len(word) > 0:
                words.append(word)
        return words

    def get_word_dictionary_for_text(self, wordForms: List[WordForm]) -> List[WordForm]:
        """
        Builds a dictionary of unique words keyed by normal form and
        computes how frequently each is used in the text
        """
        uniqieWordsDictionary = []
        uniqueWords = []
        for wordForm in wordForms:
            normalForm = wordForm.normal_form
            if normalForm not in uniqueWords:
                uniqueWords.append(normalForm)
                uniqieWordsDictionary.append(wordForm)
        for uniqueWord in uniqieWordsDictionary:
            frequency = 0
            for wordForm in wordForms:
                if uniqueWord.normal_form == wordForm.normal_form:
                    frequency += 1
            uniqueWord.frequency = frequency
            uniqueWord.pos = self.translatePos(uniqueWord.pos)
        return uniqieWordsDictionary

    def translatePos(self, pos: str) -> str:
        # maps UD POS tags to their Russian display names; unknown tags
        # are returned unchanged, exactly as the original if/elif chain did
        pos_names = {
            "NOUN": "сущ.", "ADJ": "прил.", "VERB": "гл.", "NUM": "числит.",
            "CONJ": "союз", "INTJ": "междом.", "ADP": "предлог",
            "DET": "местоим.", "ADV": "нареч.", "PUNCT": "пункт.",
            "PART": "частица", "PRON": "местоим.", "PROPN": "имя собств.",
        }
        return pos_names.get(pos, pos)

    def predictionsParsing(self, words: List[str],
                           words_predicts: List[List[float]]) -> List[WordForm]:
        """
        Converts the raw prediction into WordForm objects.
        """
        result = []
        for word, word_prob in zip(words, words_predicts[-len(words):]):
            result.append(self.wordFormBuilding(word, word_prob[1:]))
        return result

    def wordFormBuilding(self, word: str, predicts: List[float]) -> WordForm:
        """
        Builds a WordForm from the word and the tag index in the vectorizer.
        """
        word_forms = self.morph.parse(word)
        vectorizer = self.model.grammeme_vectorizer_output
        # index of the grammatical tag with the highest probability
        tag_num = int(np.argmax(predicts))
        score = predicts[tag_num]
        full_tag = vectorizer.get_name_by_index(tag_num)
        pos, tag = full_tag.split("#")[0], full_tag.split("#")[1]
        lemma = self.getWordNormalForm(word, pos, tag, word_forms)
        vector = np.array(vectorizer.get_vector(full_tag))
        result_form = WordForm(word=word, normal_form=lemma, pos=pos,
                               tag=tag, vector=vector, score=score)
        return result_form

    def getWordNormalForm(self, word: str, pos_tag: str, gram: str, word_forms=None):
        """
        Determines the word's lemma with pymorphy2
        """
        if word_forms is None:
            word_forms = self.morph.parse(word)
        guess = ""
        max_common_tags = 0
        for word_form in word_forms:
            word_form_pos_tag, word_form_gram = convert_from_opencorpora_tag(
                self.converter, word_form.tag, word)
            word_form_gram = filter_gram_tag(word_form_gram)
            common_tags_len = len(set(word_form_gram.split("|"))
                                  .intersection(set(gram.split("|"))))
            if common_tags_len > max_common_tags and word_form_pos_tag == pos_tag:
                max_common_tags = common_tags_len
                guess = word_form
        if guess == "":
            guess = word_forms[0]
        lemma = guess.normal_form
        return lemma
df = pd.DataFrame(list(zip(titles, texts)), columns=['titles', 'texts'])
df.head()

df.to_csv('textbase.csv')

# a single parse, unpacked into a dict of its useful fields
word_form_instance = {}
parse_inst = morph.parse('синхрофазатроны')[0]
word_form_instance['word'] = parse_inst.word
word_form_instance['lemma'] = parse_inst.normal_form
word_form_instance['form'] = parse_inst.tag
word_form_instance['POS'] = parse_inst.tag.POS

toy_df = pd.DataFrame({
    'EmployeeId': ['001', '002', '003', '004'],
    'City': ['я хорошая и дружелюбная сорока , меня хвалят. ',
             'бегает. прыгает. ',
             'смешной , и. расплывчатый',
             'кто ты. кто я. ']
})
toy_df
def read_lemmas(fileobj):
    # each line here is one (already tokenized) sentence
    m = MorphAnalyzer()
    for line in fileobj:
        yield [m.parse(t)[0].normal_form
               for t in line.decode('utf-8').split()[1:]]
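# Usage sketch. Note that split()[1:] drops the first token of every line,
# which suggests a "<id> word word ..." line format — that is an inference,
# not stated by the snippet. The generator decodes bytes, so open in binary:
with open('sentences.txt', 'rb') as f:  # file name is a placeholder
    for lemmas in read_lemmas(f):
        print(lemmas)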
def tokenize_ru(file_text):
    tokens = word_tokenize(file_text)
    tokens = [i for i in tokens if i not in string.punctuation]
    # the original filtered the global `tokens1` here, ignoring this
    # function's own tokens; fixed to filter the local list
    tokens = [i for i in tokens if ma.parse(i)[0].tag.POS not in functors_pos]
    tokens = [i.replace("«", "").replace("»", "") for i in tokens]
    return tokens


tokens1 = wt.tokenize(text1)
tokens2 = wt.tokenize(text2)
functors_pos = {'INTJ', 'PRCL', 'CONJ', 'PREP'}

# remove punctuation
tokens1 = [i for i in tokens1 if i not in string.punctuation]
tokens2 = [i for i in tokens2 if i not in string.punctuation]
# drop function words
tokens1 = [i for i in tokens1 if ma.parse(i)[0].tag.POS not in functors_pos]
tokens2 = [i for i in tokens2 if ma.parse(i)[0].tag.POS not in functors_pos]

# lowercase both token lists
tokens1 = [t.lower() for t in tokens1]
tokens2 = [t.lower() for t in tokens2]

sentences = [tokenize_ru(sent) for sent in sent_tokenize(text, 'russian')]
model = gensim.models.Word2Vec(sentences, size=500, window=5, min_count=1,