def get_popular_tags():
    """Return the cached top-10 tags, refreshing them from the news API when stale.

    Returns:
        list[str]: up to ten most frequent normalized words from the current
        Russian top-headline descriptions (least frequent first), excluding
        stop words. On any refresh failure the previous cache is returned.
    """
    # Serve the cache while it is still fresh.
    if time() - collector.last_request < collector.update_interval:
        return collector.tags
    try:
        # NOTE(review): the filename really contains a space — confirm it
        # matches the data file on disk.
        with open("stop words.txt", "r") as f:
            stop_words = set(f.read().split())
        top_headlines = collector.newsapi.get_top_headlines(
            language='ru')['articles']
        # Keep only letters, spaces and in-word hyphens, then split into words.
        joined = ' '.join(article['description'] for article in top_headlines)
        cleaned = ''.join(c for c in joined if c.isalpha() or c in (' ', '-'))
        words = cleaned.replace('- ', ' ').replace(' -', ' ').split()
        morphy = MorphAnalyzer()
        lemmas = [morphy.parse(w)[0].normal_form for w in words]
        # Frequency count of lemmas.
        uniq = {}
        for word in lemmas:
            uniq[word] = uniq.get(word, 0) + 1
        sorted_uniq = sorted(
            [(key, value) for key, value in uniq.items()
             if key not in stop_words],
            key=itemgetter(1))
        collector.tags = [pair[0] for pair in sorted_uniq[-10:]]
        collector.last_request = time()
    except Exception:
        # Best-effort refresh: keep serving the stale cache on failure.
        # Bug fix: the original `finally: return collector.tags` swallowed
        # every exception, including SystemExit and KeyboardInterrupt.
        pass
    return collector.tags
def wordforms(word):
    """Return the set of all inflected forms of *word*'s first pymorphy parse."""
    analyzer = MorphAnalyzer()
    lexeme = analyzer.parse(word)[0].lexeme
    return {form.word for form in lexeme}
def analyze(response):
    """Extract the target word from a "find a synonym …" style request.

    Returns:
        The token list for single-word input, the target word for a
        recognized synonym request, or None when the phrase is not understood.
    """
    whitelist = ['найти', 'придумать', 'сказать', 'подсказать']
    # Bug fix: raw string avoids the invalid '\-' escape warning.
    text = re.findall(r'([а-яА-Я\-]+)', response)
    if len(text) == 1:
        return text
    parser = MorphAnalyzer()
    a = [(word, parser.parse(word)[0]) for word in text]
    if not a:
        return None
    # Drop a leading verb/infinitive, but only if it is an allowed request verb.
    if {'VERB', 'INFN'} & a[0][1].tag.grammemes:
        verb = a.pop(0)
        if verb[1].normal_form not in whitelist:
            return None
    # Expected shapes: "синоним к/для слова X" or "синоним слова X".
    # Bug fix: length guards prevent the IndexError the original raised on
    # shorter phrases.
    if a and a[0][0] == 'синоним':
        if len(a) > 3 and a[1][0] in ('к', 'для') \
                and a[2][1].normal_form == 'слово':
            return a[3][0]
        if len(a) > 2 and a[1][1].normal_form == 'слово':
            return a[2][0]
    return None
def statistics(folder, filename):
    """Build word-frequency statistics for a chat dump and store them as JSON.

    Writes <filename>_COUNTER.json (all lemma counts plus a total entry) and
    <filename>_WORD_CLOUD.json (counts restricted to the word-cloud vocabulary).
    """
    # Vocabulary for the word cloud, initialized to zero counts.
    with open('{}/words.txt'.format(STATIC_ROOT), 'r', encoding='utf-8') as f:
        cloud_counts = {line.rstrip(): 0 for line in f}
    # Messages to analyze.
    with open('{}/{}/{}.json'.format(MEDIA_ROOT, folder, filename), 'r',
              encoding='utf-8') as f:
        data_json = json.load(f)  # type: dict
    morph = MorphAnalyzer()
    lemmas = [
        morph.parse(token)[0].normal_form
        for text in data_json.values()
        for token in text.split(' ')
    ]
    # Full statistics plus a synthetic total entry.
    counts = Counter(lemmas)
    counts['ВСЕГО СЛОВ'] = len(lemmas)
    with open('{}/{}/{}_COUNTER.json'.format(MEDIA_ROOT, folder, filename),
              'w', encoding='utf-8') as f:
        json.dump(counts, f, ensure_ascii=False)
    # Word-cloud statistics: only lemmas present in the vocabulary.
    for lemma in lemmas:
        if lemma in cloud_counts:
            cloud_counts[lemma] += 1
    with open('{}/{}/{}_WORD_CLOUD.json'.format(MEDIA_ROOT, folder, filename),
              'w', encoding='utf-8') as f:
        json.dump(cloud_counts, f, ensure_ascii=False)
def generate_answer(input_sentence):
    """Print a sentence built by replacing each word with a random same-POS form.

    For every token the lemma pool is read from '<POS>.txt'; inflection is
    retried until pymorphy manages to produce a matching form.
    """
    morph = MorphAnalyzer()
    parts = []
    for token in input_sentence.split():
        parsed = morph.parse(token)[0]
        pos_tag = parsed.tag.POS
        with open(str(pos_tag) + '.txt', 'r', encoding='utf-8') as pool_file:
            lemmas = pool_file.readlines()
        # inflect() returns None when the tag set is not applicable; keep
        # drawing candidate tag sets until one inflects successfully.
        substitute = None
        while substitute is None:
            new_analyze, inf_tags = collect_inf_tags(
                parsed, pos_tag, lemmas, morph)
            substitute = new_analyze.inflect(inf_tags)
        parts.append(substitute.word)
    print(''.join(w + ' ' for w in parts))
def analyzeWord(self, word):
    """Return a list of dicts describing every pymorphy parse of *word*."""
    morph = MorphAnalyzer()
    results = []
    for parse in morph.parse(word):
        tag = parse.tag
        results.append({
            'исходное слово': word,
            'нормальная форма': parse.normal_form,
            'часть речи': tag.POS,
            'одушевленность': tag.animacy,
            'вид': tag.aspect,
            'падеж': tag.case,
            'род': tag.gender,
            'включенность': tag.involvement,
            'наклонение': tag.mood,
            'число': tag.number,
            'лицо': tag.person,
            'время': tag.tense,
            'переходность': tag.transitivity,
            'залог': tag.voice,
            'лексема': [lexeme[0] for lexeme in parse.lexeme],
        })
    return results
def initialize(self):
    """Register the 'say weekday' intent with the brain's intent engine."""
    self.context_sensitive = True
    self.answers = [
        "{now} {weekday}.",
        "{weekday}.",
        "{weekday} вроде бы."
    ]
    # Day offsets relative to today.
    self.times = {"сегодня": 0, "завтра": 1, "послезавтра": 2}
    self.morph = MorphAnalyzer()
    engine = self.get_app("brain").engine
    entity_groups = [
        (["день", "число"], "SayWeekdayKeyword"),
        (["сегодня", "завтра", "послезавтра"], "SayWeekdayDay"),
        (["какой"], "SayWeekdayQuestion"),
    ]
    for entity_words, entity_name in entity_groups:
        for entity_word in entity_words:
            engine.register_entity(entity_word, entity_name)
    sayweekday_intent = IntentBuilder("sayweekday") \
        .require("SayWeekdayKeyword") \
        .optionally("SayWeekdayDay") \
        .optionally("SayWeekdayQuestion") \
        .build()
    engine.register_intent_parser(sayweekday_intent)
    print("sayweekday initialized")
def agree(w1, w2, t1, t2):
    """Agree two words according to their role tags (work in progress).

    Only the person/verb_right combination is implemented; the remaining
    branches are placeholders. Returns the (possibly adjusted) pair.
    """
    morph = MorphAnalyzer()
    # Bug fixes vs. original: the next-word tags were taken from w1 instead
    # of w2, and re.findall was applied to a tag object rather than its
    # string representation.
    raw_cur_tags = morph.tag(w1)[0]
    raw_next_tags = morph.tag(w2)[0]
    cur_tags = re.findall(r"\w+", str(raw_cur_tags))
    next_tags = re.findall(r"\w+", str(raw_next_tags))
    if t1 == "person":
        if t2 == "verb_right":
            # Transitive verb keeps the nominative subject, otherwise dative.
            if next_tags[3] == "tran":
                cur_tags[-1] = "nomn"
            else:
                cur_tags[-1] = "datv"
    if t1 == "verb_right":
        if t2 == "property":
            pass
    if t1 == "adjective":
        if t2 == "property":
            pass
    if t1 == "property":
        if t2 == "person":
            pass
        if t2 == "adjective":
            pass
    return w1, w2
def thanks():
    """Flask view: rebuild the submitted sentence from same-tag dictionary words.

    Reads ?sentence=..., replaces every word with a random dictionary word
    carrying exactly the same grammatical tag, persists the result to
    sentence.txt and renders it. Redirects when no query args are present.
    """
    morph = MorphAnalyzer()
    if request.args:
        sentence = request.args['sentence']
        # Bug fix: the original never closed words.txt or the re-opened
        # sentence.txt; context managers guarantee closure.
        with open('words.txt', 'r', encoding='utf-8') as f:
            words = f.readlines()
        reg = re.compile('[^а-яА-Я ]')
        with open('sentence.txt', 'w', encoding='utf-8') as out:
            for token in sentence.split():
                ana = morph.parse(token)[0]
                random.shuffle(words)
                for word in words:
                    candidate = morph.parse(reg.sub('', word))[0]
                    if candidate.tag == ana.tag:
                        out.write(candidate.word)
                        out.write(' ')
                        break
        with open('sentence.txt', 'r', encoding='utf-8') as f:
            result = f.read()
        return render_template('thanks.html', sentence_answer=result)
    return redirect(url_for(''))
def __init__(self):
    """Set up vectorizers, dictionaries and morphology helpers."""
    # Grammeme vectorizers for network input and output.
    self.grammeme_vectorizer_input = GrammemeVectorizer()
    self.grammeme_vectorizer_output = GrammemeVectorizer()
    # Known words and observed characters.
    self.word_dictionary = WordDictionary()
    self.char_set = set()
    # pymorphy2 analyzer plus an OpenCorpora -> UD 1.4 tag converter.
    self.morph = MorphAnalyzer()  # pyMorphy2
    self.converter = converters.converter('opencorpora-int', 'ud14')
def pymorphying(filename):
    """Read a text file, parse each token with pymorphy and print a nested
    dict that groups words by their chain of grammemes.

    Tokens whose first grammeme is PNCT/UNKN, or that carry a single
    grammeme, are skipped.
    """
    dictionary = dict()
    morph = MorphAnalyzer()
    words_and_grams = list()  # NOTE(review): never used afterwards
    with open(filename, 'r', encoding='utf-8') as file:
        text = file.read()
    tokenized = word_tokenize(text)
    for one in tokenized:
        parsed = morph.parse(one)
        parsed = parsed[0]  # best parse only
        original_word = one
        # Grammemes of the best parse, e.g. "NOUN,anim,masc sing,nomn".
        gram_info = str(parsed.tag).split(',')
        first_gram = str(gram_info[0]).split()[0]
        if first_gram == 'PNCT' or first_gram == 'UNKN':
            continue
        if len(gram_info) == 1:
            continue
        # Walk/extend the nested dict one grammeme at a time.
        loop = dictionary
        counter = 0
        for gram in gram_info:
            counter += 1
            check = gram
            checking = check in loop
            if checking == False:
                add_to_dict(check, loop, counter, len(gram_info))
            if type(loop) != list:
                loop = loop[check]
            try:
                loop.append(original_word)
                loop.sort()
            except AttributeError:
                # NOTE(review): rebinding `loop` to a fresh list does NOT
                # store it back into `dictionary` — the appended word is
                # likely lost here; confirm against add_to_dict's contract.
                loop = list()
                loop.append(original_word)
                loop.sort()
    print(dictionary)
def make_tags(sentence):
    """Tokenize *sentence* and map each token's POS to a coarse tag.

    Returns:
        [tokens, tags] where each tag is one of N/A/V/D/P.

    Raises:
        KeyError: for parts of speech outside the mapping (e.g. punctuation),
        matching the original behavior.
    """
    morph = MorphAnalyzer()
    tags = {'NOUN': 'N', 'NPRO': 'N', 'ADJF': 'A', 'ADJS': 'A', 'PRTF': 'A',
            'PRTS': 'V', 'NUMR': 'A', 'VERB': 'V', 'INFN': 'V', 'GRND': 'V',
            'ADVB': 'D', 'PREP': 'P', 'PRCL': 'P', 'CONJ': 'P'}
    # Perf fix: the original tokenized the sentence twice.
    tokens = nltk.word_tokenize(sentence)
    tokens_tags = [tags[morph.parse(token)[0].tag.POS] for token in tokens]
    return [tokens, tokens_tags]
def parse_text(self, string, flag):
    """Lemmatize the sentences of *string* into self.filtered_docs and flag
    sentences mentioning 'further development/improvements' (when flag is falsy).
    """
    morph = MorphAnalyzer()
    # Normal forms of the marker words we scan for.
    further_development = morph.parse('дальнейшие'.lower())[0].normal_form
    further_improvements = morph.parse('улучшения'.lower())[0].normal_form
    markers = [further_development, further_improvements]
    self.sentences = []
    self.find_further_development = False
    self.filtered_docs = []
    stop_words = set(stopwords.words("russian"))
    self.get_sentences(string, flag)
    for sent in self.sentences:
        lowered = [t.lower() for t in word_tokenize(sent)
                   if t.lower() not in stop_words]
        doc = []
        for token in lowered:
            lemma = morph.parse(token)[0].normal_form
            doc.append(lemma)
            if lemma in markers and not flag:
                self.find_further_development = True
                self.further_dev_sentence = sent
        self.filtered_docs.append(doc)
def __init__(self, token_pat="[а-я]+", mode="normal", counter=None,
             threshold=3, allowed_pos=None, stop_words=None,
             stop_cities=False):
    """Configure the tokenizer/normalizer.

    Args:
        token_pat: regex matching a single token.
        mode: "normal" or "nospace" (the latter requires *counter*).
        counter: Counter of word frequencies, required in "nospace" mode.
        threshold: minimal frequency threshold.
        allowed_pos: iterable of allowed POS tags, or None for all.
        stop_words: custom stop-word collection; defaults to STOPWORDS.
        stop_cities: when True, city names are added to the stop words.

    Raises:
        ValueError: on an unknown mode or a missing/invalid counter.
    """
    self.token = token_pat
    self.mode = mode
    if self.mode not in {"normal", "nospace"}:
        raise ValueError("Unknown mode")
    elif self.mode == "nospace":
        if not isinstance(counter, Counter):
            raise ValueError(
                "In 'nospace' mode the counter attribute should be passed")
        self.counter = counter
        self.nospace = NoSpaceSplitter(counter)
    self.threshold = threshold
    self.morph = MorphAnalyzer()
    self.allowed_pos = allowed_pos
    self.stop_words = stop_words or STOPWORDS
    if stop_cities:
        # Bug fix: set.union returns a new set; the original discarded the
        # result, so CITIES were never actually added. Assigning also avoids
        # mutating the shared STOPWORDS default in place.
        self.stop_words = self.stop_words.union(CITIES)
def search(query):
    """Rank indexed articles against *query* using BM25.

    Returns:
        list of (article-field-0, article-field-1) pairs sorted by relevance,
        best match first.
    """
    relevance = defaultdict(float)
    m = MorphAnalyzer()
    inverted_index, articles, avdl = get_indices()
    N = len(articles)
    words = [
        x.lower().strip(string.punctuation + '»«–…')
        for x in word_tokenize(query)
    ]
    # Perf fix: the original rebuilt the stop-word set for every token
    # inside the comprehension.
    russian_stopwords = set(stopwords.words('russian'))
    lemmas = [
        m.parse(x)[0].normal_form
        for x in words if x and x not in russian_stopwords
    ]
    for lemma in lemmas:
        if lemma not in inverted_index:
            continue
        articles_w_lemma = inverted_index[lemma]
        n = len(articles_w_lemma)
        for a in articles_w_lemma:
            a_info = articles[a[0]]
            qf = a[1]
            dl = a_info[2]
            relevance[(a_info[0], a_info[1])] += score_BM25(
                n, qf, N, dl, avdl)
    ranked = sorted(relevance.items(), key=lambda kv: kv[1], reverse=True)
    return [pair for pair, _ in ranked]
def feminine_checker(self, w):
    '''
    Check if the word is feminine. Necessary for some variants of hieroglyphs

    Args:
        w: str, input Russian word. NOTE(review): the argument is immediately
           overwritten by the first token of self.input_word — confirm this
           is intended.

    Returns:
        sex: str, 'M' or 'F' - gender of a word
    '''
    morph = MorphAnalyzer()
    w = self.input_word.split(' ')[0]
    ana = morph.parse(w)[0]
    gram = str(ana.tag).split(',')
    try:
        # gram[2] holds the gender grammeme when present.
        sex = 'F' if 'femn' in gram[2] else 'M'
    except IndexError:
        # Too few grammemes to read gender: fall back to a vowel-ending
        # heuristic. Bug fix: the original bare `except:` also masked
        # unrelated errors (including KeyboardInterrupt).
        sex = 'F' if w[-1] in ('а', 'я') else 'M'
    self.sex = sex
    return self.sex
def __init__(self, app_id, mail_manager, chat_id=1,
             number_of_seconds_for_the_rest=60, chat_id_for_mails=None,
             admins=None, **kwargs):
    """
    Initializes Lama Bot.

    Expects login/password or access_token as named parameters

    :param mail_manager: A manager for retrieving mails
    :type mail_manager: AbstractMailManager

    :param chat_id: Chat identifier
    :type chat_id: int

    :param chat_id_for_mails: Chat for mails. Same as chat_id, if not presented
    :type chat_id_for_mails: int

    :raise ValueError: When neither login/password nor access_token was provided
        (NOTE(review): no ValueError is raised in this body — presumably
        initialize_commands or the VK API wrapper does it; confirm)
    """
    self.exit_event = Event()
    self.morph = MorphAnalyzer()
    self.version = '0.1.1'
    self.app_id = app_id
    # Credentials are filled in later; only the app id is known here.
    self.access_token = None
    self.password = None
    self.login = None
    self.vkapi = ThreadSafeVkApi(app_id=app_id, **kwargs)
    self.commands = {}
    self._plugins = []
    self.mail_manager = mail_manager
    self.number_of_seconds_for_the_rest = number_of_seconds_for_the_rest
    self.chat_id = chat_id
    # Mails go to their own chat when configured, otherwise the main chat.
    self.chat_id_for_mails = chat_id_for_mails or self.chat_id
    self.admins = admins or []
    self.initialize_commands()
def read_tab_corpus(inc):
    """Yield sentences of a tab-separated corpus as (word, tag) pairs.

    Sentences are delimited by 'sent' ... '/sent' (or 'SENT') marker lines;
    each token line carries the word in column 2. Tags come from pymorphy's
    first parse of every collected token.
    """
    m = MorphAnalyzer()
    sent = []
    for line in inc:
        line = line.rstrip()
        if not line:
            continue
        if line == u'sent':
            # Start of a new sentence.
            sent = []
            continue
        if line == u'/sent' or line == u'SENT':
            # End of sentence: re-parse the collected words and emit them.
            tokens = [pair[0] for pair in sent]
            parses = [m.parse(token) for token in tokens]
            if tokens:
                yield [(p[0].word, p[0].tag) for p in parses]
            continue
        columns = line.split('\t')
        try:
            sent.append((columns[1], ' '.join(columns[2].split(' ')[2:])))
        except IndexError:
            continue
def get_correct_form_of_points_number_name(number: int) -> str:
    """Return the Russian word "Балл" agreed with *number* (Баллов/Балла/Балл).

    Non-int input falls back to the base form "Балл".
    """
    if not isinstance(number, int):  # not a number
        return "Балл"
    # Perf fix: constructing MorphAnalyzer loads large dictionaries; the
    # original rebuilt it on every call. Cache one instance on the function.
    analyzer = getattr(get_correct_form_of_points_number_name, '_morph', None)
    if analyzer is None:
        analyzer = MorphAnalyzer()
        get_correct_form_of_points_number_name._morph = analyzer
    analysis = analyzer.parse("Балл")[0]
    return analysis.make_agree_with_number(number).word
def text_rank(text, language):
    """Rank the sentences of *text* with TextRank.

    Returns triples (sentence-index, rank, sentence) sorted by decreasing
    rank; a single-sentence text yields [(1, 0, sentence)].
    """
    if language == 'ukrainian':
        morph = MorphAnalyzer(lang='uk')
        sentences = sent_tokenizer_ua(text)
        if len(sentences) < 2:
            return [(1, 0, sentences[0])]
        a = tfidf(text, language, sent_tokenizer_ua, stop_words_ua)
    else:
        morph = MorphAnalyzer()
        sentences = sent_tokenizer_ru(text)
        if len(sentences) < 2:
            return [(1, 0, sentences[0])]
        a = tfidf(text, language, sent_tokenizer_ru, stop_words_ru)
    # Similarity for every sentence pair; zero-similarity pairs are dropped.
    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(a[i, :], a[j, :])) for i, j in pairs]
    scores = filter(lambda item: item[2], scores)
    pr = rank_graph(scores)
    # Sort the (index, rank, sentence) triples by decreasing rank.
    ranked = ((i, pr[i], s) for i, s in enumerate(sentences) if i in pr)
    return sorted(ranked, key=lambda item: pr[item[0]], reverse=True)
def addWord():
    """Tokenize the input text, collect non-lemma word forms and display them.

    Each displayed entry is the word's lemma, its cyrillic tag and the set of
    characters that differ from the lemma. Timing info goes to stdout; the
    shared vocabulary buffer is cleared afterwards.
    """
    global vocabulary
    start_time = time.time()
    words = {}
    analyzer = MorphAnalyzer()
    vocabulary.append(inputText.get(1.0, END))
    for word in word_tokenize(vocabulary[0]):
        parse_word = analyzer.parse(word)[0]
        word_word = parse_word.word
        word_lemma = parse_word.normal_form
        # Bug fix: the original compared the strings with `is not`
        # (object identity), which is unreliable; use value inequality.
        if word_word != word_lemma:
            words[word_word] = {
                'lemma': word_lemma,
                'tag': parse_word.tag.cyr_repr,
                'ending': list(set(word_word) - set(word_lemma)),
            }
    for key in sorted(words):
        lexeme = Lexeme((words[key]['lemma']), (words[key]['tag']),
                        (words[key]['ending']))
        outputText.insert(0, str(lexeme.lemma) + ' ' + str(lexeme.tags) + ' '
                          + str(lexeme.endings))
    end_time = time.time()
    print(str(end_time - start_time) + " seconds")
    vocabulary.clear()
def __init__(
    self,
    vocab: Vocab,
    model: Optional[Model],
    name: str = "lemmatizer",
    *,
    mode: str = "pymorphy2",
    overwrite: bool = False,
    scorer: Optional[Callable] = lemmatizer_score,
) -> None:
    """Initialize the Russian lemmatizer component.

    In 'pymorphy2' mode a single MorphAnalyzer is created lazily and shared
    (it is only built when `_morph` is not already set).

    Raises:
        ImportError: when mode is 'pymorphy2' but the library is missing.
    """
    if mode == "pymorphy2":
        try:
            from pymorphy2 import MorphAnalyzer
        except ImportError:
            # `from None` suppresses the original import traceback.
            raise ImportError(
                "The Russian lemmatizer mode 'pymorphy2' requires the "
                "pymorphy2 library. Install it with: pip install pymorphy2"
            ) from None
        # Build the (expensive) analyzer only once.
        if getattr(self, "_morph", None) is None:
            self._morph = MorphAnalyzer()
    super().__init__(vocab, model, name, mode=mode, overwrite=overwrite,
                     scorer=scorer)
class SayTime(AppDaemon):
    """Skill that tells the current time with correctly agreed Russian nouns."""

    def initialize(self):
        """Register the 'say time' intent with the brain's intent engine."""
        self.answers = [
            "Сейчас {hour} {hword} {minute} {mword}.",
            "Московское время - {hour} {hword} {minute} {mword}.",
            "{hour} {hword} {minute} {mword}.",
            "{hour} {hword} {minute} {mword}."
        ]
        self.morph = MorphAnalyzer()
        engine = self.get_app("brain").engine
        for keyword in ["час", "время"]:
            engine.register_entity(keyword, "SayTimeKeyword")
        for question in ["сколько", "который"]:
            engine.register_entity(question, "SayTimeQuestion")
        saytime_intent = IntentBuilder("saytime") \
            .require("SayTimeKeyword") \
            .optionally("SayTimeQuestion") \
            .build()
        engine.register_intent_parser(saytime_intent)
        print("saytime init done")

    def handle(self, intent_dict):
        """Answer with 'час'/'минута' agreed with the current hour/minute."""
        now = datetime.datetime.now()
        hours_word = self.morph.parse("час")[0] \
            .make_agree_with_number(now.hour).word
        minutes_word = self.morph.parse("минута")[0] \
            .make_agree_with_number(now.minute).word
        return choice(self.answers).format(hour=now.hour, minute=now.minute,
                                           hword=hours_word,
                                           mword=minutes_word)
def to_normal_form(file_text):
    """Lowercase, tokenize and lemmatize *file_text*; join lemmas with spaces.

    Non-alphanumeric tokens (punctuation etc.) are dropped.
    """
    morph = MorphAnalyzer()
    lemmas = (
        morph.parse(token)[0].normal_form
        for token in word_tokenize(file_text.lower())
        if token.isalnum()
    )
    return " ".join(lemmas)
def __init__(self, data_name, lemmatizing_method, max_examples=None,
             delete_word_parts=False, drop_duplicates=True,
             count_lemmas_weights=False, limit=None):
    """Configure the dataset preprocessor and pick a language analyzer.

    Args:
        data_name: dataset identifier; must contain 'ru', 'german' or
            'english' when lemmatization is enabled.
        lemmatizing_method: lemmatization strategy name, or None/'none'.
        max_examples: optional cap on examples.
        delete_word_parts / drop_duplicates / count_lemmas_weights: flags.
        limit: optional extra limit (stored by callers; unused here).

    Raises:
        ValueError: when lemmatization is requested for an unknown data name.
    """
    self.data_name = data_name
    self.lemmatizing_method = lemmatizing_method
    self.max_examples = max_examples
    self.delete_word_parts = delete_word_parts
    self.drop_duplicates = drop_duplicates
    self.count_lemmas_weights = count_lemmas_weights
    # Table for stripping all punctuation in one pass.
    self.translation = str.maketrans('', '', string.punctuation)
    self.dfs = dict()
    self.nf_cnts = dict()
    self.cache = dict()
    self.pattern = re.compile(r'\b\w+\b')
    if lemmatizing_method is not None and lemmatizing_method != 'none':
        if 'ru' in data_name:
            self.analyzer = MorphAnalyzer()
        elif 'german' in data_name:
            self.analyzer = spacy.load("de_core_news_sm",
                                       disable=['ner', 'parser'])
        elif 'english' in data_name:
            self.analyzer = spacy.load("en_core_web_sm",
                                       disable=['ner', 'parser'])
        else:
            # Bug fix: the original `assert "unknown data name %s" % ...`
            # asserted a non-empty string and could never fail.
            raise ValueError("unknown data name %s" % data_name)
def place(message):
    """Reply with a (cached) random opinion when *message* is a place name.

    Uses pymorphy's 'гео' grammeme to detect geographic names; returns False
    for anything else. Replies are memoized per place in the global dict.
    """
    global places
    analyzer = MorphAnalyzer()
    parsed = analyzer.parse(message)[0]
    if 'гео' not in parsed.tag.cyr_repr:
        return False
    if message not in places:
        responses = [
            "Отличное место! Бывало, что я заползал туда иногда, раз в месяцок",
            "О да, знаю, там подают таки-и-ие блюда!",
            "Ну, знаешь, насчет этого места. Тут точно дело вкуса, обычному туристу лучше сюда не соваться...",
            "Место, откровенно говоря, так себе...",
            "Это одно из моих любимых мест на планете! Когда будет возможность, обязательно посети",
            "Это место меня отталкивает, даже не планируй туда поездку",
            "Да ладно, нашел место для отдыха!",
            "Погодка там так себе",
            "Ну, ничего, норм выбор",
            "Как тебе вообще в голову пришло туда захотеть поехать?!",
            "Для питона как раз!)",
            "Как-то одним морозным дням я замечательно отдохнул там, но общее впечатление оставляет желать лучшего",
            "Там бывает мокро, но для меня, питона, это естественная среда)",
            "Брррррррр, не нада",
            "Питон одобряет",
            "Не трать время на это",
            "Конечно, там прекрасно!",
            "Что ты там будешь делать?",
            "Хммм, ничего!"
        ]
        places[message] = r.choice(responses)
    return places[message]
def my_function(message):
    # Generates and sends a reply: every input word is replaced with a random
    # word of the same part of speech, inflected to match the input's tags.
    def find_POS(POS, arr):
        # Pick a random normal form with the given part of speech.
        temp = arr[POS]
        idx = random.randint(0, len(temp) - 1)
        return temp[idx]

    def examine(word, arr):
        # Parse *word*, choose a same-POS replacement and build the frozenset
        # of grammemes (everything after the first comma) to inflect it with.
        obj = morph.parse(word)[0]
        POS = obj.tag.POS
        word_to_print = find_POS(POS, arr)
        i = str(obj.tag).find(',')
        r = str(obj.tag).find(' ')
        string = str(obj.tag)[i + 1:]
        string = string.replace(" ", ",")
        if r != -1:
            tag = frozenset(string.split(','))
        else:
            # No space in the tag: nothing beyond POS to agree with.
            tag = frozenset()
        return tag, word_to_print

    morph = MorphAnalyzer()
    text = str(message.text)
    reply = str()
    if not text.isalnum() and ' ' not in text:
        reply = 'Введены непонятные символы :('
    else:
        words_separated = text.split()
        # Build a POS -> [normal forms] pool from the first 10000 unigrams.
        # NOTE(review): the file handle is never closed — confirm acceptable.
        file_str = open('1grams-3.txt', 'r', encoding='utf-8').read()
        words_separated_new = file_str.split()
        arr = dict()
        idx = 0
        for word in words_separated_new:
            idx += 1
            temp = morph.parse(word)[0]
            if temp.tag.POS in arr:
                arr[temp.tag.POS].append(temp.normal_form)
            else:
                arr[temp.tag.POS] = [temp.normal_form]
            if idx == 10000:
                break
        info = None
        for word in words_separated:
            info = None
            # inflect() returns None when the tag set does not apply; keep
            # drawing random candidates until one inflects successfully.
            while info == None:
                tag, word_to_print = examine(word, arr)
                info = morph.parse(word_to_print)[0].inflect(tag)
            reply += info.word + ' '
    bot.send_message(message.chat.id, reply)  # send the reply to the chat
def __init__(self):
    """Fetch stop words, set up text-cleaning helpers and an empty ANN model."""
    nltk.download('stopwords')
    # Pattern of characters to strip: latin letters, digits and punctuation.
    self.patterns = "[A-Za-z0-9!#$%&'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+"
    self.stopwords_ru = stopwords.words("russian")
    self.morph = MorphAnalyzer()
    self.ann_model = keras.Sequential()
    # Prepare the training data up front.
    self.__preparing_data__()
    print("Data Prepared")
def __init__(self, save_path: str, load_path: str,
             max_pymorphy_variants: int = -1, **kwargs) -> None:
    """Load stored resources and set up pymorphy with a UD 2.0 tag converter."""
    super().__init__(save_path, load_path, **kwargs)
    self.max_pymorphy_variants = max_pymorphy_variants
    self.load()
    # Caches for already-seen word and tag indexes.
    self.memorized_word_indexes = dict()
    self.memorized_tag_indexes = dict()
    # pymorphy2 analyzer plus OpenCorpora -> UD 2.0 converter.
    self.analyzer = MorphAnalyzer()
    self.converter = converters.converter('opencorpora-int', 'ud20')
def lemmatization(list_of_strings):
    """Replace every word of every string with its normal form, in place.

    The input list is mutated and also returned for convenience.
    """
    morph = MorphAnalyzer()
    for idx, line in enumerate(list_of_strings):
        lemmas = [morph.parse(token)[0].normal_form for token in line.split()]
        list_of_strings[idx] = ' '.join(lemmas)
    return list_of_strings
def agree(w1, w2, t1, t2):
    """Agree the pair (w1, w2) grammatically according to role tags t1/t2.

    Role tags carry a two-character suffix that is stripped before comparison;
    "comma" tokens pass through untouched. Returns the adjusted pair.
    """
    if t1 == "comma" or t2 == "comma":
        return w1, w2
    morph = MorphAnalyzer()
    raw_cur_tags = morph.tag(w1)[-1]
    # Bug fix: the original derived the next-word tags from w1 as well,
    # so gender agreement always used the wrong word.
    raw_next_tags = morph.tag(w2)[-1]
    cur_tags = re.findall(r"\w+", str(raw_cur_tags))
    next_tags = re.findall(r"\w+", str(raw_next_tags))
    if t1[:-2] == "person":
        if t2[:-2] == "verb_right":
            # Dative-governing verbs put the person into dative case.
            if morph.normal_forms(w2)[0] in dative_verbs:
                w1 = morph.parse(w1)[0].inflect({"datv"}).word
    if t1[:-2] == "verb_right":
        if t2[:-2] == "property":
            pass
        if t2[:-2] == "person":
            if cur_tags[3] == "tran":
                w2 = morph.parse(w2)[0].inflect({"accs"}).word
            else:
                w2 = morph.parse(w2)[0].inflect({"nomn"}).word
                # gender agreement applies with nominative only
                gender = next_tags[2]
                if gender == "inan":
                    gender = next_tags[3]
                w1 = morph.parse(w1)[0].inflect({gender}).word
    if t1[:-2] == "adjective":
        if t2[:-2] == "property":
            # Agree the adjective with the property's gender.
            gender = next_tags[2]
            if gender == "inan":
                gender = next_tags[3]
            try:
                w1 = morph.parse(w1)[0].inflect({gender}).word
            except Exception:
                print("f**k")
                print(w1, w2)
    if t1[:-2] == "property":
        if t2[:-2] == "person":
            pass
        if t2[:-2] == "adjective":
            gender = cur_tags[2]
            if gender == "inan":
                gender = cur_tags[3]
            try:
                w2 = morph.parse(w2)[0].inflect({gender}).word
            except Exception:
                print("f**k")
                print(w1, w2)
    return w1, w2
def read_test_corpus(fn):
    """Yield tokenized lines of a test corpus as (word, tag) pairs.

    Each input line is assumed to be pre-tokenized (whitespace separated);
    only pymorphy's first parse of every token is used.
    """
    m = MorphAnalyzer()
    for line in fn:
        # Bug fix: the original called str-only rstrip('\n') and then
        # .decode() on every line, which fails under Python 3 for both byte
        # and text streams. Decode only when the line really is bytes.
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        # The text is assumed to be tokenized already.
        tokens = line.rstrip('\n').split()
        if tokens:
            # Only the first pymorphy parse of each token is kept.
            parses = [m.parse(token) for token in tokens]
            yield [(p[0].word, p[0].tag) for p in parses]
def __init__(self, text_array):
    """Store the token list and prepare the filter vocabulary.

    Args:
        text_array: iterable of word tokens to analyze later.
    """
    self.text_array = text_array
    self.morph = MorphAnalyzer()
    # Compiled with glvrd.ru by going through all 20 * 1700 words together
    # and by hand: frequent/low-information words to exclude from results.
    self.trash_list = \
        {"она", "они", "что", "это", "быть", "аплодисменты", "этот", "как",
         "если", "быть", "если", "для", "все", "этот", "чтобы", "так", "для",
         "который", "тот", "такой", "мой", "смех", "красивый", "дорогой",
         "уютный", "роскошный", "активный", "школа", "должный", "сделать",
         "наш", "мочь", "один", "весь", "свой", "речь", "человек", "слайд",
         "разный", "хотеть", "промышленность", "пытаться", "хороший",
         "позволять", "ваш", "решать", "общий", "продажа", "модуль",
         "множество", "оставлять", "важный", "решение", "заниматься",
         "служить", "реальность", "самка", "самец", "проводить", "известный",
         "таинственность", "быстрый", "большинство", "позволять", "обучение",
         "население", "настоящий", "необходимо", "любой", "большой", "форма",
         "успешный", "обычный", "оказываться", "высокий", "потрясающий",
         "богатый", "документ", "мелкий", "оказывать", "возможность",
         "простой", "крупный", "колония", "система", "реальный", "плохой",
         "мечтание", "огромный", "электрический", "ландшафт", "изломанность",
         "интерактивный", "суть", "позволять", "наличие", "иметься",
         "проводить", "обычный", "мощный", "аналогия", "различный", "самый",
         "эффективность", "низкий", "реальность", "определенный", "являться",
         "пользование", "исторический", "элементарный", "обеспечение",
         "наблюдаться", "обладать", "важный", "известняк", "хотеться",
         "продолжать", "год", "время", "мир", "жизнь", "дело", "проблема",
         "ребенок", "вопрос", "день", "друг", "работа", "идея", "история",
         "место", "часть", "вещь", "страна", "технология", "раз", "женщина",
         "слово", "вода", "вид", "проект", "информация", "мозг", "земля",
         "миллион", "город", "исследование", "помощь", "компания", "образ",
         "рука", "результат", "момент", "конец", "пример", "доллар", "дом",
         "книга", "музыка", "машина", "сторона", "случай", "процесс",
         "группа", "способ", "мужчина", "уровень", "тысяча", "интернет",
         "деньги", "семья", "компьютер", "энергия", "видео", "программа",
         "свет", "модель", "сила", "планета", "клетка", "движение", "тело",
         "наука", "общество", "язык", "фотография", "причина", "война",
         "пациент", "неделя", "миллиард", "будущее", "сеть", "точка", "сша",
         "игра", "отец", "природа", "изменение", "фильм", "цель",
         "устройство", "образование", "материал", "путь", "глаз", "студент",
         "африка", "отношение", "правительство", "болезнь", "связь",
         "количество", "звук", "парень", "искусство", "пространство",
         "организация", "ответ", "лицо", "час", "дизайн", "право",
         "поведение", "эксперимент", "лечение", "индия", "месяц", "мама",
         "карта", "мать", "здание", "изображение", "океан", "родитель",
         "внимание", "улица", "продукт", "развитие", "песня", "структура",
         "рынок", "процент", "голова", "минута", "чувство", "нога", "пара",
         "объект", "создание", "закон", "учитель", "действие"}
def read_tab_corpus(inc):
    """Yield sentences of a tab-separated corpus as (word, tag) pairs.

    Sentence boundaries are marked by 'sent' ... '/sent' (or 'SENT') lines;
    token lines carry the word in column 2. Tags come from pymorphy's first
    parse of each collected token.
    """
    m = MorphAnalyzer()
    sent = []
    for t in inc:
        t = t.rstrip()
        # Bug fix: the original unconditionally called .decode('utf-8') and
        # crashed with AttributeError on text (str) streams under Python 3.
        if isinstance(t, bytes):
            t = t.decode('utf-8')
        if not t:
            continue
        if t == u'sent':
            # Start of a new sentence.
            sent = []
            continue
        if t == u'/sent' or t == u'SENT':
            # End of sentence: re-parse the collected words and emit them.
            sent = [x[0] for x in sent]
            parses = [m.parse(token) for token in sent]
            if sent:
                yield [(p[0].word, p[0].tag) for p in parses]
            continue
        t = t.split('\t')
        try:
            token = (t[1], ' '.join(t[2].split(' ')[2:]))
            sent.append(token)
        except IndexError:
            continue
class MorphTest(unittest.TestCase):
    """Smoke test: every word of a document can be normalized by pymorphy."""

    def __init__(self, document_vector):
        # Bug fix: TestCase.__init__ must run for the unittest machinery.
        super(MorphTest, self).__init__()
        self.document = None
        self.documents = document_vector
        self.morph = MorphAnalyzer()

    def testMorph(self):
        # Bug fix: the original `self.document if not None else ...` always
        # evaluated to self.document (i.e. None); test the attribute itself.
        if self.document is None:
            self.document = self.documents[0]
        morph_array = [self.morph.parse(word)[0].normal_form
                       for word in self.document]
        # Bug fix: py2 `print morph_array` statement converted to py3 call.
        print(morph_array)
        self.assertTrue(True, msg=None)
class MorphAnalyzer(object):
    """Thin wrapper over pymorphy2's analyzer that returns prepared forms."""

    def __init__(self):
        # Underlying pymorphy2 analyzer.
        self.raw = PymorphyAnalyzer()

    def check_gram(self, gram):
        """Raise ValueError when *gram* is not a known grammeme."""
        if not self.raw.TagClass.grammeme_is_known(gram):
            raise ValueError(gram)

    def __call__(self, word):
        """Return prepared forms for every pymorphy parse of *word*."""
        return [prepare_form(record) for record in self.raw.parse(word)]

    def normalized(self, word):
        """Return the set of normalized variants of *word*."""
        return {form.normalized for form in self(word)}
class Analyzer:
    """
    Analyzes incoming text: parses every word, removes punctuation and
    everything that is not a noun, full adjective or infinitive, filters a
    list of banned words, and yields the 10 most frequent remaining words.
    """

    def __init__(self, text_array):
        """Store the token list and prepare the filter vocabulary."""
        self.text_array = text_array
        self.morph = MorphAnalyzer()
        # Compiled with glvrd.ru by going through all 20 * 1700 words
        # together and by hand: frequent/low-information words to exclude.
        self.trash_list = \
            {"она", "они", "что", "это", "быть", "аплодисменты", "этот",
             "как", "если", "быть", "если", "для", "все", "этот", "чтобы",
             "так", "для", "который", "тот", "такой", "мой", "смех",
             "красивый", "дорогой", "уютный", "роскошный", "активный",
             "школа", "должный", "сделать", "наш", "мочь", "один", "весь",
             "свой", "речь", "человек", "слайд", "разный", "хотеть",
             "промышленность", "пытаться", "хороший", "позволять", "ваш",
             "решать", "общий", "продажа", "модуль", "множество",
             "оставлять", "важный", "решение", "заниматься", "служить",
             "реальность", "самка", "самец", "проводить", "известный",
             "таинственность", "быстрый", "большинство", "позволять",
             "обучение", "население", "настоящий", "необходимо", "любой",
             "большой", "форма", "успешный", "обычный", "оказываться",
             "высокий", "потрясающий", "богатый", "документ", "мелкий",
             "оказывать", "возможность", "простой", "крупный", "колония",
             "система", "реальный", "плохой", "мечтание", "огромный",
             "электрический", "ландшафт", "изломанность", "интерактивный",
             "суть", "позволять", "наличие", "иметься", "проводить",
             "обычный", "мощный", "аналогия", "различный", "самый",
             "эффективность", "низкий", "реальность", "определенный",
             "являться", "пользование", "исторический", "элементарный",
             "обеспечение", "наблюдаться", "обладать", "важный", "известняк",
             "хотеться", "продолжать", "год", "время", "мир", "жизнь",
             "дело", "проблема", "ребенок", "вопрос", "день", "друг",
             "работа", "идея", "история", "место", "часть", "вещь", "страна",
             "технология", "раз", "женщина", "слово", "вода", "вид",
             "проект", "информация", "мозг", "земля", "миллион", "город",
             "исследование", "помощь", "компания", "образ", "рука",
             "результат", "момент", "конец", "пример", "доллар", "дом",
             "книга", "музыка", "машина", "сторона", "случай", "процесс",
             "группа", "способ", "мужчина", "уровень", "тысяча", "интернет",
             "деньги", "семья", "компьютер", "энергия", "видео", "программа",
             "свет", "модель", "сила", "планета", "клетка", "движение",
             "тело", "наука", "общество", "язык", "фотография", "причина",
             "война", "пациент", "неделя", "миллиард", "будущее", "сеть",
             "точка", "сша", "игра", "отец", "природа", "изменение", "фильм",
             "цель", "устройство", "образование", "материал", "путь",
             "глаз", "студент", "африка", "отношение", "правительство",
             "болезнь", "связь", "количество", "звук", "парень", "искусство",
             "пространство", "организация", "ответ", "лицо", "час", "дизайн",
             "право", "поведение", "эксперимент", "лечение", "индия",
             "месяц", "мама", "карта", "мать", "здание", "изображение",
             "океан", "родитель", "внимание", "улица", "продукт", "развитие",
             "песня", "структура", "рынок", "процент", "голова", "минута",
             "чувство", "нога", "пара", "объект", "создание", "закон",
             "учитель", "действие"}

    def start(self):
        """Return the 10 most common valid words from self.text_array."""
        res = list(filter(
            lambda x: len(x) > 2 and self.pymorphy_analyze(x)
            and re.match("[а-яА-Я]", x) and x not in self.trash_list,
            self.text_array))
        return [x[0] for x in Counter(res).most_common(10)]

    def pymorphy_analyze(self, word):
        """True when the best parse is a noun, full adjective or infinitive."""
        lexem = self.morph.parse(word)
        x = lexem[0].tag.POS
        # Bug fix: the original compared with ("NOUN" or "ADJF" or "INFN"),
        # which evaluates to just "NOUN", so adjectives and infinitives were
        # never accepted. Use membership instead.
        return x in ("NOUN", "ADJF", "INFN")
def __init__(self, input_ru, input_en):
    """Build word lists for parallel Russian/English texts and the matching
    graph between them.

    Args:
        input_ru: Russian input text/stream for Pars_sentences.
        input_en: English input text/stream for Pars_sentences.
    """
    self.morph_ru = MorphAnalyzer()
    self.sentences_ru = self.Pars_sentences(input_ru)
    # Russian word pattern (hyphen/apostrophe-joined compounds allowed).
    # NOTE(review): the second class is [а-яА-яёЁ] with a lowercase 'я' in
    # 'А-я' — possibly a typo for 'А-Я'; confirm before changing (kept as-is).
    wordPattern_ru = re.compile("((?:[а-яА-ЯёЁ]+[-']?)*[а-яА-яёЁ]+)")
    self.sentences_list_ru = self.Create_Word_List(
        wordPattern_ru, self.sentences_ru,
        self.Normalize_ru, self.Translate_ru)
    self.word_list_ru = []
    self.sentences_en = self.Pars_sentences(input_en)
    # English-Russian StarDict dictionary.
    self.dict_en_ru = Dictionary('Dict/ER-LingvoUniversal.ifo')
    wordPattern_en = re.compile("((?:[a-zA-Z]+[-']?)*[a-zA-Z]+)")
    self.sentences_list_en = self.Create_Word_List(
        wordPattern_en, self.sentences_en,
        self.Normalize_en, self.Translate_en)
    self.word_list_en = []
    self.Graph = self.Create_Graph()
    # NOTE(review): this local is created and discarded — probably meant to
    # be stored on self (e.g. self.munkres_algorithm); confirm with callers.
    munkres_algorithm = munkres.Munkres()
def __init__(self, mysql_con, redis_con, tokenizer=None, morph=None,
             classifier=None, points=None):
    """
    Initialization.

    Args:
        mysql_con (PySQLPoolConnection): MySQL connection Object
        redis_con (StrictRedis): RedisDB connection Object
        tokenizer (NLTK.TreebankWordTokenizer): object to split tweets into words
        morph (pymorphy2.MorphAnalyzer): word analyzer - converts words tokens
            to normalized form. Requires a lot of memory, so it is not created
            for every event object.
        classifier (Object): scikit trained classifier to detect real and fake
            events
        points (list[dict]): raw messages from event detector
    """
    self.mysql = mysql_con
    self.redis = redis_con
    # Share the heavyweight helpers when provided; build them otherwise.
    if morph:
        self.morph = morph
    else:
        self.morph = MorphAnalyzer()
    if tokenizer:
        self.tokenizer = tokenizer
    else:
        self.tokenizer = TreebankWordTokenizer()
    self.word = compile(r'^\w+$', flags=UNICODE | IGNORECASE)
    self.url_re = compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    self.validity = None
    self.verification = None
    self.cores = {}
    self.classifier = classifier
    # Bug fix: `points=[]` was a mutable default argument; None is the
    # backward-compatible sentinel (the truthiness check is unchanged).
    if points:
        self.id = str(uuid4())
        self.created = datetime.now()
        self.updated = datetime.now()
        self.messages = {x['id']: x for x in points}
        self.get_messages_data()
        self.media = {}
        self.get_media_data()
        self.event_update()
class Event():
    """
    Event object - class for working with event candidates.
    Collects all data on event candidate, stores it between clustering slices;
    merges slices, if required.
    TBD: constructs and saves description, scores texts and media, scores and
    describes event itself (probability, that candidate is real, event buzz,
    event category).

    Attributes:
        self.created (datetime): creation timestamp
        self.updated (datetime): last update timestamp
        self.start (datetime): timestamp of the first message in the self.messages dict
        self.end (datetime): timestamp of the last message in the self.messages dict
        self.messages (Dict[dict]): raw tweets from database, enriched with weight,
            is_core params (on init), tokens (after add_stem_texts)
        self.media (Dict[dict]): raw media objects from database
        self.cores (Dict[list]): tokens, that form the most common vocabulary for
            the event; computed in create_core() method
        self.entropy (float): entropy for authorship: 0 for mono-authored cluster;
            computed in event_summary_stats() method
        self.ppa (float): average number of posts per one author; computed in
            event_summary_stats() method
        self.authors (int): number of unique authors for event
        self.most_active_author (float): share of messages, written by one
            (most active author)
        self.authors_share (float): number of authors divided by number of messages
        self.relevant_messages_share (float): share of messages with token_score
            above zero
        self.duration (int): total seconds from self.start to self.end
        self.classifier (Object): classifier for deciding, whether event is real
        self.validity (bool): Classifier verdict, whether event is real or not
        self.verification (bool): Handmade verification of event quality

    Methods:
        self.event_update: commands to calculate all data on event, based on
            messages and media
        self.is_successor: examines, if current event have common messages with
            specified event slice
        self.is_valid: method for classifier to determine, if event is actually
            event, and not a random messages compilation
        self.classifier_row: unified method for creating classifier data-row
        self.merge: merge current event with another event, update stat Attributes
        self.add_slice: add messages and media to the event, recompute statistics
        self.load / self.dump: serialize/deserialize event and put/get it to Redis
        self.backup / self.restore: dump/restore event to/from MySQL long-term storage
        self.get_messages_data: get MySQL data for messages ids
        self.get_media_data: get MySQL data for media using existing messages ids
        self.event_summary_stats: calculate statistics and start/end time for event
        self.add_stem_texts: add tokens lists to self.messages
        self.create_core: create vocabulary of most important words for the event
        self.score_messages_by_text: method calculates token_score for messages.
            TF/IDF likelihood with core is used

    Message keys:
        cluster (int): legacy from DBSCAN - number of cluster (event ancestor)
        id (str): DB message id; unique
        is_core (bool): True, if tweet belongs to the core of ancestor cluster
        iscopy (int): 1, if message is shared from another network
        lat (float): latitude
        lng (float): longitude
        network (int): 2 for Instagram, 1 for Twitter, 3 for VKontakte
        text (str): raw text of the message
        tokens (Set[str]): collection of stemmed tokens from raw text; created
            in add_stem_texts()
        tstamp (datetime): 'created at' timestamp
        user (int): user id, absolutely unique for one network, but matches
            between networks are possible
        token_score (float): agreement estimation with average cluster text
        weight (float): standard deviations below average

    NOTE(review): this class uses Python-2-only constructs (`unicode`,
    `bytes.format` in backup(), `dict.values()[i]` indexing in
    score_messages_by_text()); it will not run unmodified on Python 3 — confirm
    the target interpreter.
    """
    def __init__(self, mysql_con, redis_con, tokenizer = None, morph = None, classifier = None, points = []):
        """
        Initialization.

        Args:
            mysql_con (PySQLPoolConnection): MySQL connection Object
            redis_con (StrictRedis): RedisDB connection Object
            tokenizer (NLTK.TreebankWordTokenizer): object to split tweets into words
            morph (pymorphy2.MorphAnalyzer): word analyzer - converts words tokens
                to normalized form. Requires a lot of memory, so it is not created
                for every event object.
            classifier (Object): scikit trained classifier to detect real and fake events
            points (list[dict]): raw messages from event detector
        """
        self.mysql = mysql_con
        self.redis = redis_con
        # Reuse caller-supplied heavy objects where possible.
        if morph:
            self.morph = morph
        else:
            self.morph = MorphAnalyzer()
        if tokenizer:
            self.tokenizer = tokenizer
        else:
            self.tokenizer = TreebankWordTokenizer()
        # Token filter and URL stripper used by add_stem_texts().
        self.word = compile(r'^\w+$', flags = UNICODE | IGNORECASE)
        self.url_re = compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        self.validity = None
        self.verification = None
        self.cores = {}
        self.classifier = classifier
        # NOTE(review): `points = []` is a mutable default argument; it is never
        # mutated here, but replacing it with None would be safer.
        if points:
            self.id = str(uuid4())
            self.created = datetime.now()
            self.updated = datetime.now()
            self.messages = { x['id']:x for x in points }
            self.get_messages_data()
            self.media = {}
            self.get_media_data()
            self.event_update()

    def __str__(self):
        txt = "<Event {}: {} msgs [{} -- {}]>".format(self.id, len(self.messages), self.start.strftime("%Y-%m-%d %H:%M"), self.end.strftime("%H:%M"))
        return txt

    def __unicode__(self):
        # Python 2 protocol; `unicode` does not exist on Python 3.
        return unicode(self.__str__())

    def __repr__(self):
        return self.__str__()

    def event_update(self):
        """
        Commands to calculate all data on event, based on messages and media.
        """
        # Order matters: tokens -> cores -> per-message scores -> summary -> verdict.
        self.add_stem_texts()
        self.create_core(deviation_threshold = 1)
        self.create_core(deviation_threshold = 2)
        self.create_core(deviation_threshold = 3)
        self.score_messages_by_text()
        self.event_summary_stats()
        self.is_valid()

    def is_successor(self, slice_ids, sim_index = 0.3, only_relevant = True):
        """
        Method examines, if current event have common messages with specified
        event slice.

        Args:
            slice_ids (Set): set of message id's to compare with
            sim_index (float): minimal share of messages that should match in
                slice to be detected as a successor
            only_relevant (bool): use only messages with non-zero token_score
                (to exclude spam)
        """
        if only_relevant:
            event_ids = set([k for k, v in self.messages.items() if v['token_score'] > 0])
            # Fall back to all messages when nothing scored above zero.
            if not event_ids:
                event_ids = set(self.messages.keys())
        else:
            event_ids = set(self.messages.keys())
        # Overlap is normalized by the smaller of the two sets (not Jaccard);
        # the previous Jaccard variant is kept below for reference.
        #if float(len(event_ids.intersection(slice_ids)))/len(event_ids.union(slice_ids)) >= jaccard:
        if float(len(event_ids.intersection(slice_ids)))/min((len(event_ids), len(slice_ids))) >= sim_index:
            return True
        return False

    def is_valid(self):
        """
        Method for Classifier to determine, if event is actually event, and not
        a random messages compilation.
        """
        # A previously-positive verdict is sticky; re-classification only
        # happens while validity is falsy.
        if self.validity:
            return True
        if self.classifier:
            self.validity = bool(self.classifier.predict([self.classifier_row()])[0])
        return self.validity

    def classifier_row(self):
        """
        Unified method for creating classifier data-row. Every var, used in
        prediction, is listed here, and only here.
        """
        row = [
            len(self.messages.values()),
            len(self.media.values()),
            self.authors,
            self.most_active_author,
            self.authors_share,
            self.entropy,
            self.ppa,
            self.relevant_messages_share,
            self.duration
        ]
        return row

    def merge(self, other_event):
        """
        Method merges current event with another event, update stat Attributes.

        Args:
            other_event (Event): another event object - to merge with
        """
        self.messages.update(other_event.messages)
        self.media.update(other_event.media)
        self.event_update()
        self.updated = datetime.now()
        # Keep the earliest creation time of the two merged events.
        self.created = min((self.created, other_event.created))

    def add_slice(self, new_slice):
        """
        Method adds messages and media to the event, recompute statistics.

        Args:
            new_slice (List[dict]): initial list with messages to be added
        """
        self.messages.update({ x['id']:x for x in new_slice })
        self.get_messages_data([x['id'] for x in new_slice])
        self.get_media_data([x['id'] for x in new_slice])
        self.event_update()
        self.updated = datetime.now()

    def backup(self):
        """
        Method dumps event to MySQL long-term storage, used for non-evaluating
        events. Removes the event's Redis key afterwards.
        """
        # SQL NULL is emitted literally when the flag is unset.
        if self.verification is None:
            ver = 'NULL'
        else:
            ver = int(self.verification)
        if self.validity is None:
            val = 'NULL'
        else:
            val = int(self.validity)
        msg_string = self.pack()
        # NOTE(review): bytes-literal .format() only works on Python 2, where
        # b'...' is str. Values are interpolated via escape_string rather than
        # parameterized queries — assumes exec_mysql offers no placeholder API.
        q = b'''INSERT INTO events(id, start, end, msgs, description, dumps, verification, validity) VALUES ("{}", "{}", "{}", {}, "{}", "{}", {}, {}) ON DUPLICATE KEY UPDATE `start`=VALUES(`start`), `end`=VALUES(`end`), `msgs`=VALUES(`msgs`), `description`=VALUES(`description`), `dumps`=VALUES(`dumps`), `verification`=VALUES(`verification`), `validity`=VALUES(`validity`);'''.format(self.id, self.start, self.end, len(self.messages.keys()), escape_string(', '.join([x.encode('utf-8') for x in self.cores[2]])), escape_string(msg_string), ver, val)
        exec_mysql(q, self.mysql)
        self.redis.delete("event:{}".format(self.id))

    def restore(self, event_id):
        """
        Method restores event from MySQL table using event_id parameter.

        Args:
            event_id (str): unique event identifier
        """
        q = '''SELECT dumps FROM events WHERE id="{}"'''.format(event_id)
        event_data = exec_mysql(q, self.mysql)[0][0]['dumps']
        self.unpack(event_data)

    def load(self, event_id, redis_prefix='event'):
        """
        Method for deserializing and loading event from Redis database.

        Args:
            event_id (str): unique event identifier
            redis_prefix (str): prefix used in Redis database
        """
        # Events may be stored either as a hash (hget) or a plain key (get);
        # ResponseError signals the wrong access mode was tried first.
        try:
            event_data = self.redis.hget('{}:{}'.format(redis_prefix, event_id), 'dumps')
        except ResponseError:
            event_data = self.redis.get('{}:{}'.format(redis_prefix, event_id))
        self.unpack(event_data)

    def dump(self, redis_prefix='event'):
        """
        Method for serializing and dumping event to Redis database.

        Args:
            redis_prefix (str): prefix to use, when storing new key in Redis database
        """
        if self.verification is None:
            ver = 'NULL'
        else:
            ver = int(self.verification)
        if self.validity is None:
            val = 'NULL'
        else:
            val = int(self.validity)
        msg_string = self.pack()
        event = {'start':self.start.strftime("%Y-%m-%d %H:%M:%S"), 'end':self.end.strftime("%Y-%m-%d %H:%M:%S"), 'msgs':len(self.messages.keys()), 'description':', '.join([x.encode('utf-8') for x in self.cores[2]]), 'dumps':msg_string, 'verification':ver, 'validity':val}
        self.redis.hmset("{}:{}".format(redis_prefix, self.id), event)

    def pack(self, complete=False):
        """
        Method for serializing event to string.

        Args:
            complete (bool): whether to pack all available data for the event
                (full texted messages, media links, and cores).
        """
        # Minimal dump keeps only ids and per-message scores; timestamps are
        # stored as epoch seconds for msgpack compatibility.
        todump = {
            'id':self.id,
            'created':int(mktime(self.created.timetuple())),
            'updated':int(mktime(self.updated.timetuple())),
            'verification':self.verification,
            'messages':[{'id':x['id'], 'is_core':x.get('is_core'), 'token_score':x.get('token_score'), 'weight':x.get('weight')} for x in self.messages.values()]
        }
        if complete:
            todump['media'] = self.media
            todump['validity'] = self.validity
            for i in range(len(todump['messages'])):
                msg = self.messages[todump['messages'][i]['id']]
                todump['messages'][i].update({'iscopy':msg['iscopy'], 'lat':msg['lat'], 'lng':msg['lng'], 'network':msg['network'], 'text':msg['text'], 'tstamp':int(mktime(msg['tstamp'].timetuple())), 'user':msg['user']})
        return packb(todump)

    def unpack(self, data, complete=False):
        """
        Method for deserializing event from string. msgpack lib is used
        (considered to be faster than pickle).

        Args:
            data (str): pickle dump of event-required parameters.
            complete (bool): whether to unpack all available data for the event
                (full texted messages, media links, and cores), or compute these
                parameters on the fly.
        """
        data = unpackb(data)
        self.id = data['id']
        self.created = datetime.fromtimestamp(data['created'])
        self.updated = datetime.fromtimestamp(data['updated'])
        self.verification = data['verification']
        self.messages = {x['id']:x for x in data['messages']}
        if complete:
            self.validity = data['validity']
            self.media = data['media']
            for k in self.messages.keys():
                self.messages[k]['tstamp'] = datetime.fromtimestamp(self.messages[k]['tstamp'])
        else:
            # Re-hydrate message/media details from MySQL and recompute stats.
            self.get_messages_data()
            self.media = {}
            self.get_media_data()
            self.event_update()

    def get_messages_data(self, ids=None):
        """
        Method loads MySQL data for messages ids and adds it to the
        self.messages argument.

        Args:
            ids (List[str]): list of messages ids to load. If not provided,
                all ids from self.messages are used
        """
        if not ids:
            ids = [x['id'] for x in self.messages.values()]
        q = '''SELECT * FROM tweets WHERE id in ({});'''.format(','.join(['"'+str(x)+'"' for x in ids]))
        data = exec_mysql(q, self.mysql)[0]
        for item in data:
            self.messages[item['id']].update(item)

    def get_media_data(self, ids=None):
        """
        Method loads MySQL data for media using existing messages ids and adds
        it to the self.media argument.

        Args:
            ids (List[str]): list of messages ids to load. If not provided,
                all ids from self.messages are used
        """
        if not ids:
            ids = [x['id'] for x in self.messages.values()]
        q = '''SELECT * FROM media WHERE tweet_id in ({});'''.format(','.join(['"'+str(x)+'"' for x in ids]))
        data = exec_mysql(q, self.mysql)[0]
        for item in data:
            self.media[item['id']] = item

    def event_summary_stats(self):
        """
        Method calculates several statistics, updates self.start and self.end
        timestamps.
        """
        # groupby requires its input sorted by the same key (user id here);
        # each group length = number of posts by one author.
        authorsip_stats = [len(tuple(i[1])) for i in groupby(sorted(self.messages.values(), key=lambda x:x['user']), lambda z: z['user'])]
        self.authors = len(authorsip_stats)
        self.most_active_author = max(authorsip_stats)/float(len(self.messages.values()))
        self.authors_share = float(self.authors)/len(self.messages.values())
        self.entropy = entropy(authorsip_stats)
        self.ppa = mean(authorsip_stats)
        self.relevant_messages_share = float(len([x for x in self.messages.values() if x['token_score'] > 0]))/len(self.messages.values())
        self.start = min([x['tstamp'] for x in self.messages.values()])
        self.end = max([x['tstamp'] for x in self.messages.values()])
        self.duration = int((self.end - self.start).total_seconds())

    def add_stem_texts(self):
        """
        Method adds tokens lists to self.messages.
        """
        for i in self.messages.keys():
            # Tokens are computed once and cached on the message dict.
            if 'tokens' not in self.messages[i].keys():
                txt = self.messages[i].get('text', '')
                txt = sub(self.url_re, '', txt)
                # NOTE(review): token.decode('utf-8') implies byte strings
                # (Python 2); on Python 3 str has no decode().
                self.messages[i]['tokens'] = {self.morph.parse(token.decode('utf-8'))[0].normal_form for token in self.tokenizer.tokenize(txt) if match(self.word, token.decode('utf-8'))}

    def create_core(self, deviation_threshold=2, min_token=3):
        """
        Method creates core of important words for event.

        Args:
            deviation_threshold (int): number of standard deviations, that
                differs core tokens from average tokens
            min_token (int): minimal length of token, to exclude
                prepositions/conjunctions
        """
        # One merged token set per author, so prolific authors don't dominate
        # the document frequency counts.
        texts_by_authors = [set().union(*[msg['tokens'] for msg in list(y[1])]) for y in groupby(sorted(self.messages.values(), key=lambda x:x['user']), lambda z:z['user'])]
        top_words = {}
        for doc in texts_by_authors:
            for token in doc:
                if len(token) >= min_token:
                    try:
                        top_words[token] += 1
                    except KeyError:
                        top_words[token] = 1
        # Core = tokens whose author-frequency exceeds mean + k*std.
        th_vals = [x[1] for x in top_words.items()]
        threshold = mean(th_vals) + deviation_threshold * std(th_vals)
        self.cores[deviation_threshold] = [k for k,v in top_words.items() if v > threshold]

    def score_messages_by_text(self, deviation_threshold=2):
        """
        Method calculates token_score parameter for self.messages.

        Args:
            deviation_threshold (int): number of standard deviations, that
                differs core tokens from average tokens
        """
        texts = [x['tokens'] for x in self.messages.values()]
        # Degenerate cases (no tokens anywhere, or all token sets identical):
        # similarity is meaningless, so everything scores zero.
        if not sum([bool(x) for x in texts]) or len(set([frozenset(x) for x in texts])) == 1:
            for k in self.messages.keys():
                self.messages[k]['token_score'] = 0
            return
        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        tfidf = TfidfModel(corpus, id2word=dictionary)
        index = MatrixSimilarity(tfidf[corpus])
        try:
            scores = index[dictionary.doc2bow(self.cores[deviation_threshold])]
        except IndexError:
            error('Index error in token scoring for event {}'.format(self.id))
            scores = [0]*len(self.messages.values())
        # NOTE(review): relies on dict.values() being an indexable list and on
        # a stable iteration order matching `texts` — Python 2 semantics.
        for i in range(len(scores)):
            self.messages.values()[i]['token_score'] = float(scores[i])
def __init__(self, document_vector):
    """Keep the supplied document collection and prepare a morphological analyzer.

    The current document starts out unset (None) until one is selected.
    """
    self.morph = MorphAnalyzer()
    self.documents = document_vector
    self.document = None
class PymorphyVectorizer(WordIndexVectorizer):
    """
    Transforms russian words into 0-1 vector of its possible Universal
    Dependencies tags. Tags are obtained using Pymorphy analyzer
    (pymorphy2.readthedocs.io) and transformed to UD2.0 format using
    russian-tagsets library (https://github.com/kmike/russian-tagsets).
    All UD2.0 tags that are compatible with produced tags are memorized.
    The list of possible Universal Dependencies tags is read from a file,
    which contains all the labels that occur in UD2.0 SynTagRus dataset.

    Args:
        save_path: path to save the tags list,
        load_path: path to load the list of tags,
        max_pymorphy_variants: maximal number of pymorphy parses to be used.
            If -1, all parses are used.
    """
    # Pymorphy grammeme keys dropped during conversion, and pymorphy->UD
    # value renames applied in find_compatible().
    USELESS_KEYS = ["Abbr"]
    VALUE_MAP = {"Ptan": "Plur", "Brev": "Short"}

    def __init__(self, save_path: str, load_path: str, max_pymorphy_variants: int = -1, **kwargs) -> None:
        super().__init__(save_path, load_path, **kwargs)
        self.max_pymorphy_variants = max_pymorphy_variants
        self.load()
        # Per-word and per-tag caches for _get_word_indexes/_get_tag_indexes.
        self.memorized_word_indexes = dict()
        self.memorized_tag_indexes = dict()
        self.analyzer = MorphAnalyzer()
        self.converter = converters.converter('opencorpora-int', 'ud20')

    @property
    def dim(self):
        # Output vector dimensionality = number of known UD tags.
        return len(self._t2i)

    def save(self) -> None:
        """Saves the dictionary to self.save_path"""
        with self.save_path.open("w", encoding="utf8") as fout:
            fout.write("\n".join(self._i2t))

    def load(self) -> None:
        """Loads the dictionary from self.load_path"""
        self._i2t = []
        with self.load_path.open("r", encoding="utf8") as fin:
            for line in fin:
                line = line.strip()
                if line == "":
                    continue
                self._i2t.append(line)
        self._t2i = {tag: i for i, tag in enumerate(self._i2t)}
        self._make_tag_trie()

    def _make_tag_trie(self):
        # Builds a trie over (key, value) feature pairs, one sub-trie root per
        # POS. Each node is a defaultdict: key -> {value -> child index};
        # _data[node] holds the tag's index when a full tag ends at that node.
        self._nodes = [defaultdict(dict)]
        self._start_nodes_for_pos = dict()
        self._data = [None]
        for tag, code in self._t2i.items():
            if "," in tag:
                # "POS,Key1=Val1|Key2=Val2" -> sorted feature-pair list.
                pos, tag = tag.split(",", maxsplit=1)
                tag = sorted([tuple(elem.split("=")) for elem in tag.split("|")])
            else:
                pos, tag = tag, []
            start = self._start_nodes_for_pos.get(pos)
            if start is None:
                start = self._start_nodes_for_pos[pos] = len(self._nodes)
                self._nodes.append(defaultdict(dict))
                self._data.append(None)
            for key, value in tag:
                values_dict = self._nodes[start][key]
                child = values_dict.get(value)
                if child is None:
                    child = values_dict[value] = len(self._nodes)
                    self._nodes.append(defaultdict(dict))
                    self._data.append(None)
                start = child
            self._data[start] = code
        return self

    def find_compatible(self, tag: str) -> List[int]:
        """
        Transforms a Pymorphy tag to a list of indexes of compatible UD tags.

        Args:
            tag: input Pymorphy tag

        Returns:
            indexes of compatible UD tags
        """
        if " " in tag and "_" not in tag:
            pos, tag = tag.split(" ", maxsplit=1)
            tag = sorted([tuple(elem.split("=")) for elem in tag.split("|")])
        else:
            pos, tag = tag.split()[0], []
        if pos not in self._start_nodes_for_pos:
            return []
        # Normalize feature pairs: drop ignored keys, rename known values.
        tag = [(key, self.VALUE_MAP.get(value, value)) for key, value in tag if key not in self.USELESS_KEYS]
        if len(tag) > 0:
            curr_nodes = [(0, self._start_nodes_for_pos[pos])]
            final_nodes = []
        else:
            final_nodes = [self._start_nodes_for_pos[pos]]
            curr_nodes = []
        # DFS over the trie: a UD tag is compatible when every query feature it
        # shares a key with has a matching value; extra trie-only keys with
        # smaller keys are skipped (all their children explored).
        while len(curr_nodes) > 0:
            i, node_index = curr_nodes.pop()
            # key, value = tag[i]
            node = self._nodes[node_index]
            if len(node) == 0:
                final_nodes.append(node_index)
            for curr_key, curr_values_dict in node.items():
                curr_i, curr_node_index = i, node_index
                # Advance past query features ordered before this trie key.
                while curr_i < len(tag) and tag[curr_i][0] < curr_key:
                    curr_i += 1
                if curr_i == len(tag):
                    # Query exhausted: any continuation is compatible.
                    final_nodes.extend(curr_values_dict.values())
                    continue
                key, value = tag[curr_i]
                if curr_key < key:
                    # Trie-only feature: branch into every value.
                    for child in curr_values_dict.values():
                        curr_nodes.append((curr_i, child))
                else:
                    # Shared key: values must match exactly.
                    child = curr_values_dict.get(value)
                    if child is not None:
                        if curr_i < len(tag) - 1:
                            curr_nodes.append((curr_i + 1, child))
                        else:
                            final_nodes.append(child)
        # Collect tag codes at final nodes and all of their descendants.
        answer = []
        while len(final_nodes) > 0:
            index = final_nodes.pop()
            if self._data[index] is not None:
                answer.append(self._data[index])
            for elem in self._nodes[index].values():
                final_nodes.extend(elem.values())
        return answer

    def _get_word_indexes(self, word):
        # Memoized: word -> list of compatible UD tag indexes over the first
        # max_pymorphy_variants parses (all parses when the limit is <= 0).
        answer = self.memorized_word_indexes.get(word)
        if answer is None:
            parse = self.analyzer.parse(word)
            if self.max_pymorphy_variants > 0:
                parse = parse[:self.max_pymorphy_variants]
            tag_indexes = set()
            for elem in parse:
                tag_indexes.update(set(self._get_tag_indexes(elem.tag)))
            answer = self.memorized_word_indexes[word] = list(tag_indexes)
        return answer

    def _get_tag_indexes(self, pymorphy_tag):
        # Memoized: pymorphy tag -> compatible UD tag indexes (via converter).
        answer = self.memorized_tag_indexes.get(pymorphy_tag)
        if answer is None:
            tag = self.converter(str(pymorphy_tag))
            answer = self.memorized_tag_indexes[pymorphy_tag] = self.find_compatible(tag)
        return answer
class Parallel_Translate:
    """Aligns sentences of a Russian and an English text via shared vocabulary.

    Builds per-sentence word lists for both languages, translates normalized
    English words through a StarDict EN->RU dictionary, and fills a cost
    matrix (self.Graph) intended for the Munkres assignment algorithm.
    """
    def __init__(self, input_ru, input_en):
        self.morph_ru = MorphAnalyzer()
        self.sentences_ru = self.Pars_sentences(input_ru)
        # NOTE(review): [а-яА-я] is an odd case range — presumably [а-яА-Я]
        # was intended; confirm.
        wordPattern_ru = re.compile("((?:[а-яА-ЯёЁ]+[-']?)*[а-яА-яёЁ]+)")
        self.sentences_list_ru = self.Create_Word_List(wordPattern_ru, self.sentences_ru, self.Normalize_ru, self.Translate_ru)
        self.word_list_ru = []
        self.sentences_en = self.Pars_sentences(input_en)
        self.dict_en_ru = Dictionary('Dict/ER-LingvoUniversal.ifo')
        wordPattern_en = re.compile("((?:[a-zA-Z]+[-']?)*[a-zA-Z]+)")
        self.sentences_list_en = self.Create_Word_List(wordPattern_en, self.sentences_en, self.Normalize_en, self.Translate_en)
        self.word_list_en = []
        self.Graph = self.Create_Graph()
        munkres_algorithm = munkres.Munkres()
        #self.word_matching = munkres_algorithm.compute( self.Graph )

    # Input file? read text and split to sentences
    def Pars_sentences(self, file_name):
        """Read a whole file and split it into sentences with NLTK Punkt."""
        sentences_list = []
        # NOTE(review): mode 'rU' is deprecated (removed in Python 3.11).
        with open(file_name, 'rU') as input_file:
            file_str = input_file.read()
        sentences_tokenize = nltk.tokenize.PunktSentenceTokenizer()
        for sentence in sentences_tokenize.sentences_from_text(file_str):
            sentences_list.append(sentence)
        return sentences_list

    def Create_Word_List(self, wordPattern, sentences, Normalize, Translate):
        """For each sentence, extract words and wrap them in word_info records.

        Returns a list of sentence_info objects (one per input sentence).
        """
        word_list = []
        sentence_num = 0
        sent_list = []
        for sentence in sentences:
            sentence_word_list = []
            for word in wordPattern.findall(sentence):
                word = word.strip()
                word = word.lower()
                n_word = Normalize(word)
                translate_list = Translate(n_word)
                w_info = word_info(word, sentence_num, n_word, translate_list)
                word_list.append(w_info)
                sentence_word_list.append(w_info)
            sent_list.append(sentence_info(sentence, sentence_word_list))
            sentence_num = sentence_num + 1
        return sent_list

    def Translate_ru(self, n_word):
        # Russian words are not translated; alignment uses EN->RU only.
        return []

    def Translate_en(self, n_word):
        """Look up every normal form in the EN->RU dictionary; flatten results."""
        self.re_for_entry = re.compile("<dtrn>(.*?)</dtrn>")
        valueWord = []
        # KeyError from a missing dictionary entry aborts the whole lookup
        # (deliberate best-effort: partial results collected so far are kept).
        try:
            for normal_word in n_word:
                for entry in self.dict_en_ru[normal_word]:
                    result_pars = self.ParsEntry(entry.data)
                    valueWord = valueWord + result_pars
        except KeyError:
            pass
        return valueWord

    def ParsEntry(self, entry_data):
        """Extract translation variants from a raw StarDict entry body."""
        # Split off abbreviated-usage sections, then pull <dtrn> payloads.
        l = entry_data.split("<abr><i><c><co>")
        result_first_step = []
        for data in l:
            result_first_step = result_first_step + self.re_for_entry.findall(data)
        # Keep only text before any nested markup.
        result_second_step = []
        for data in result_first_step:
            temp = data.split("<")
            if temp[0] != "":
                result_second_step.append(temp[0])
        # Split comma/semicolon-separated variants and strip whitespace.
        result = []
        for data in result_second_step:
            for data_prom in data.split(","):
                result = result + data_prom.split(";")
        for i in range(len(result)):
            result[i] = result[i].strip()
        return result

    def Normalize_ru(self, word):
        # NOTE(review): returns a plain string on success but [] on failure —
        # inconsistent with Normalize_en, which always returns a list.
        n_word = self.morph_ru.normal_forms(word)
        if n_word:
            return n_word[0]
        else:
            return []

    def Normalize_en(self, word):
        n_word = wordnet.morphy(word)
        if n_word:
            return [n_word]
        else:
            return []

    def Create_Graph(self):
        """Build the EN x RU sentence cost matrix from translated-word overlap.

        NOTE(review): as visible here, neither `sentence_num`,
        `sentence_left_num` nor `sentence_right_num` is ever advanced inside
        the loops, so both `while` loops would not terminate — update
        statements were most likely lost when this source was collapsed;
        reconstruct them from the original before relying on this method.
        The second matrix assignment also uses `sentence_left_num` where
        `sentence_right_num` appears intended — verify.
        """
        graph_matrix = [[0 for i in range(len(self.sentences_list_ru))] for j in range(len(self.sentences_list_en))]
        # Maximum window: difference in sentence counts between the two texts.
        koef = abs(len(self.sentences_list_en) - len(self.sentences_list_ru))
        sentence_num = 0
        for sentence in self.sentences_list_en:
            sentence_left_num = sentence_num
            sentence_right_num = sentence_num + 1
            # Scan RU sentences to the left of the current position.
            while (sentence_left_num >= 0) and (sentence_num - sentence_left_num <= koef):
                sum_eq_words = 0
                for w_info in sentence.sentence_words:
                    for translate_word in w_info.translate_list:
                        for w_info_ru in self.sentences_list_ru[sentence_left_num]:
                            for w_normal in w_info_ru.normal_form:
                                if w_normal == translate_word:
                                    sum_eq_words = sum_eq_words + 1
                # Negated cost: more shared words and smaller offset = cheaper.
                graph_matrix[sentence_num][sentence_left_num] = -(sum_eq_words - sentence_num + sentence_left_num)
            # Scan RU sentences to the right of the current position.
            while (sentence_right_num < len(self.sentences_list_ru)) and (sentence_right_num - sentence_num <= koef):
                sum_eq_words = 0
                for w_info in sentence.sentence_words:
                    for translate_word in w_info.translate_list:
                        for w_info_ru in self.sentences_list_ru[sentence_right_num]:
                            for w_normal in w_info_ru.normal_form:
                                if w_normal == translate_word:
                                    sum_eq_words = sum_eq_words + 1
                graph_matrix[sentence_num][sentence_right_num] = -(sum_eq_words - sentence_num + sentence_left_num)
        return graph_matrix
def read_text_lemmas(fileobj):
    """Yield each line of *fileobj* as a space-joined string of lemmas.

    Lines are expected to be UTF-8 encoded bytes; each token is replaced by
    the normal form of its most probable pymorphy2 parse.
    """
    analyzer = MorphAnalyzer()
    for raw_line in fileobj:
        tokens = simple_word_tokenize(raw_line.decode('utf-8'))
        lemmas = [analyzer.parse(token)[0].normal_form for token in tokens]
        yield ' '.join(lemmas)
# Pass 1: frequency statistics over a "word count" file (one "word cnt" pair
# per line). NOTE(review): `f`, `all`, `nonnum`, `all_postings`,
# `alpha_postings`, `low_reg`, `lemmatized` and `no_stops_postings` are
# initialized before this fragment — confirm against the preceding code.
# `all` shadows the builtin of the same name.
for line in f.readlines():
    all += 1
    word = line.split(" ")[0]
    cnt = int(line.split(" ")[1])
    all_postings += cnt
    # Keep purely alphabetic words (no digits, no punctuation) and
    # accumulate counts case-insensitively.
    if match("^[^\W\d]+$", word):
        nonnum += 1
        alpha_postings += cnt
        lo = word.lower()
        if lo in low_reg:
            low_reg[lo] += cnt
        else:
            low_reg[lo] = cnt
# Keep only Cyrillic words (U+0400..U+0500 covers the Cyrillic block).
just_ru = {k: v for (k, v) in low_reg.items() if match(u"^[\u0400-\u0500]+$", k)}
ru_postings = sum(just_ru.values())
# Pass 2: lemmatize every Cyrillic word and aggregate counts per lemma;
# progress is printed every 100k words.
morph = MorphAnalyzer()
c = 0
for k, v in just_ru.items():
    if c % 100000 == 0:
        print(c)
    c += 1
    lem = morph.parse(k)[0].normal_form
    if lem in lemmatized:
        lemmatized[lem] += int(v)
    else:
        lemmatized[lem] = int(v)
# Pass 3: total postings excluding stopwords (newline-separated file).
with open("stopwords", "r") as st:
    stops = set(st.read().split('\n'))
for k, v in just_ru.items():
    if not k in stops:
        no_stops_postings += v
def __init__(self):
    # Single wrapped pymorphy analyzer; all morphology queries go through self.raw.
    self.raw = PymorphyAnalyzer()
def lemmatize(self, tokens):
    """
    :param tokens: a list of tokens to lemmatize
    :return: Counter mapping each lemma (normal form of the top pymorphy2
        parse) to its frequency, counting only tokens longer than one character
    """
    # FIX: previously a fresh MorphAnalyzer was built on every call; its
    # construction loads large dictionaries and is expensive, so the instance
    # is now cached on self and reused across calls.
    analyzer = getattr(self, '_lemmatize_analyzer', None)
    if analyzer is None:
        analyzer = MorphAnalyzer()
        self._lemmatize_analyzer = analyzer
    return Counter(analyzer.parse(token)[0].normal_form
                   for token in tokens if len(token) > 1)
def read_lemmas(fileobj):
    # One tokenized sentence per line (UTF-8 bytes); the leading token is
    # skipped — presumably a sentence id, confirm against the data format.
    analyzer = MorphAnalyzer()
    for raw_line in fileobj:
        words = raw_line.decode('utf-8').split()[1:]
        yield [analyzer.parse(word)[0].normal_form for word in words]
class LamaBot(object): def __init__(self, app_id, mail_manager, chat_id=1, number_of_seconds_for_the_rest=60, chat_id_for_mails=None, admins=None, **kwargs): """ Initializes Lama Bot. Expects login/password or access_token as named parameters :param mail_manager: A manager for retrieving mails :type mail_manager: AbstractMailManager :param chat_id: Chat identifier :type chat_id: int :param chat_id_for_mails: Chat for mails. Same as chat_id, if not presented :type chat_id_for_mails: int :raise ValueError: When neither login/password nor access_token was provided """ self.exit_event = Event() self.morph = MorphAnalyzer() self.version = '0.1.1' self.app_id = app_id self.access_token = None self.password = None self.login = None self.vkapi = ThreadSafeVkApi(app_id=app_id, **kwargs) self.commands = {} self._plugins = [] self.mail_manager = mail_manager self.number_of_seconds_for_the_rest = number_of_seconds_for_the_rest self.chat_id = chat_id self.chat_id_for_mails = chat_id_for_mails or self.chat_id self.admins = admins or [] self.initialize_commands() def initialize_commands(self): self.commands = { 'post_to_dialog': lambda args, m: self.safe_post_message_and_log_if_failed(args), 'ping': self.pong_to_admins } def safe_notify_about_unread_mails(self): for m in self.safe_unread_mails: if self.safe_post_mail_and_log_if_failed(m): self.mail_manager.safe_mark_mail_as_read_and_log_if_failed(m) def safe_process_directed_dialog_message(self, message): logging.debug(u'Processing message with body {}'.format(message.body)) words = self.split_to_words(message.body) logging.debug(u'Words in the body: {}'.format(words)) self.safe_process_plugins(message, words) self.safe_mark_message_as_read_and_log_if_failed(message) def safe_process_private_message(self, message): if self.safe_execute_and_log_if_failed(message): self.safe_mark_message_as_read_and_log_if_failed(message) @safe_call_and_log_if_failed def safe_process_plugins(self, message, words): normalized_words = 
self.normalize_words(words) for p in self.plugins: p.process_input(message.body, words, normalized_words, message) def long_pool_loop(self, exit_event): server, key, ts = self.extract_server_key_and_timestamp_from_get_long_poll_server_response() while not exit_event.is_set(): response = self.send_long_poll_request(server, key, ts) if 'failed' in response: server, key, ts = self.extract_server_key_and_timestamp_from_get_long_poll_server_response() else: self.process_long_poll_response(response) ts = self.get_timestamp(response, ts) def extract_server_key_and_timestamp_from_get_long_poll_server_response(self): response = self.vkapi.messages_get_long_poll_server() while not all(x in response for x in ('server', 'key', 'ts')): logging.error('Could not retrieve credentials for connecting to long poll server', response) response = self.vkapi.messages_get_long_poll_server() return response['server'], response['key'], response['ts'] @safe_call_and_log_if_failed(default={'failed': True}) def send_long_poll_request(self, server, key, ts, act='a_check', wait=25, mode=2): params = { 'act': act, 'key': key, 'ts': ts, 'wait': wait, 'mode': mode } return requests.get('http://{server}'.format(server=server), params=params).json() def process_long_poll_response(self, response): if response: for update in response.get('updates', []): self.process_long_poll_update(update) def process_long_poll_update(self, update): functions = { 4: self.process_long_poll_new_message } function = functions.get(update[0]) if function: function(update) def process_long_poll_new_message(self, update): chat_id = self.get_chat_id_from_long_poll_new_message_update(update) fwd_messages = self.get_fwd_messages_from_long_poll_new_message_update(update) self.process_new_message(VkMessage({'id': update[1], 'user_id': None, 'read_state': (update[2] + 1) % 2, 'chat_id': chat_id, 'title': update[5], 'body': update[6], 'fwd_messages': fwd_messages, 'out': (update[2] & 2) >> 1})) def process_new_message(self, 
message): if message.is_unread and message.is_inbox: if message.chat_id == self.chat_id and self.message_is_directed(message): self.safe_process_directed_dialog_message(message) elif message.is_private: self.safe_process_private_message(message) def get_fwd_messages_from_long_poll_new_message_update(self, update): return map(self.convert_fwd_from_long_poll_new_message_update_to_fwd_message, ifilter(None, self.get_attachments_from_long_poll_new_message_update(update).get('fwd', '').split(','))) @staticmethod def convert_fwd_from_long_poll_new_message_update_to_fwd_message(fwd): regex = re.compile('(?P<user_id>\d+)_(?P<msg_id>\d+)') m = regex.match(fwd) return { 'id': m.group('msg_id'), 'user_id': m.group('user_id') } @staticmethod def get_chat_id_from_long_poll_new_message_update(update): """ The message was sent from chat if user_id is greater than 2000000000 :param update: :return: """ return update[3] - 2000000000 if update[3] > 2000000000 else None def get_user_id_from_long_poll_new_message_update(self, update): """ Retrieves user_id from update according to documentation https://vk.com/pages?oid=-17680044&p=Connecting_to_the_LongPoll_Server :param update: :return: """ return self.get_attachments_from_long_poll_new_message_update(update).get('from') @staticmethod def get_attachments_from_long_poll_new_message_update(update): return update[7] if len(update) > 7 else {} @staticmethod def get_timestamp(response, default): return response.get('ts', default) if response else default @property def unread_mails(self): return self.mail_manager.unread_mails @property def safe_unread_mails(self): """ Just delegates the work to the mail manager :return: """ return self.mail_manager.safe_unread_mails @property def vkapi_messages_get(self): return self.vkapi.messages_get() @property def plugins(self): """ :rtype : a list of LamaPlugin """ return self._plugins def vkapi_messages_set_activity_in_chat(self): return self.vkapi.messages_set_activity(chat_id=self.chat_id, 
type='typing') def post_mail(self, mail): """ Posts mail to VK. Loads and attaches documents, if any. :param mail: :return: """ documents = None if mail.attachments: documents = filter(None, imap(self.safe_upload_attachment, mail.attachments)) self.post_message_to_mail_dialog(self.wrap_mail(mail), attachments=documents) @safe_call_and_log_if_failed(default=False) def safe_post_mail_and_log_if_failed(self, mail): """ :param mail: :return: True if no error, False otherwise """ self.post_mail(mail) return True @safe_call_and_log_if_failed() def safe_post_message_and_log_if_failed(self, message): self.post_message_to_dialog(message) @safe_call_and_log_if_failed def pong_to_admins(self, _, message): self.post_message_to_admins('Pong', forward_messages=[message]) @safe_call_and_log_if_failed def safe_post_message_with_forward_messages(self, message, forward_messages): self.post_message_to_dialog(message, forward_messages=forward_messages) def execute(self, message): s = message.body command, args = self.split_to_command_and_argument(s) if command in self.commands: self.commands[command](args, message) else: self.command_not_found(command) @safe_call_and_log_if_failed(default=False) def safe_execute_and_log_if_failed(self, message): self.execute(message) return True @staticmethod def split_to_command_and_argument(command): values = command.split(':', 1) if len(values) != 2: values.append(None) return values[0], values[1] def _post_message_to_dialog(self, chat_id, message, attachments=None, forward_messages=None): """ Posts message to dialog. Attaches attachments, if any. 
:param forward_messages: Messages to be forwarded :type forward_messages: [VkMessage] :param attachments:Documents to be attached :type attachments: [VkDocument] :param message: """ attachments = attachments or [] forward_messages = forward_messages or [] attachment = ','.join(map(lambda d: d.attachment_string, attachments)) forward_messages_str = ','.join(map(lambda m: str(m.id), forward_messages)) self.vkapi.messages_send(chat_id=chat_id, message=message, attachment=attachment, forward_messages=forward_messages_str) def post_message_to_dialog(self, message, attachments=None, forward_messages=None): self._post_message_to_dialog(self.chat_id, message, attachments=attachments, forward_messages=forward_messages) def post_message_to_mail_dialog(self, message, attachments=None, forward_messages=None): self._post_message_to_dialog(self.chat_id_for_mails, message, attachments=attachments, forward_messages=forward_messages) def post_startup_message_to_admins(self): self.post_message_to_admins('The Lama is ready to work! 
(version {0})'.format(self.version)) @safe_call_and_log_if_failed def post_message_to_admins(self, message, forward_messages=None): forward_messages = forward_messages or [] forward_messages_str = ','.join(map(lambda m: str(m.id), forward_messages)) for user_id in self.admins: self.vkapi.messages_send(user_id=user_id, message=message, forward_messages=forward_messages_str) def command_not_found(self, command): message = u'Command `{}` not found'.format(command).encode('utf-8') logging.warning(message) def run(self, post_welcome_message_to_dialog=True): if post_welcome_message_to_dialog: self.post_startup_message_to_admins() long_poll = Thread(target=self.long_pool_loop, args=(self.exit_event,)) long_poll.start() while True: self.safe_notify_about_unread_mails() time.sleep(self.number_of_seconds_for_the_rest) def stop_running(self): self.exit_event.set() @safe_call_and_log_if_failed def safe_upload_attachment(self, attachment): """ Uploads given attachment :type attachment: Attachment :rtype: VkDocument """ if attachment.is_loaded: url = self.safe_docs_get_upload_server() file_string = self.safe_upload_file_to_server(url, self.create_attachment_filename(attachment.filename), attachment.data, attachment.mime_type) return self.safe_save_doc_file(file_string, attachment.filename) @safe_call_and_log_if_failed def safe_upload_message_photo(self, image_file_path): if image_file_path is not None: url = self.safe_get_upload_server_for_private_message_photo() data = self.safe_upload_photo_to_server(url, self.create_attachment_filename(image_file_path), self.get_image_data(image_file_path), self.get_mime_type(image_file_path)) photo_name = os.path.basename(image_file_path) return self.safe_save_photo_file(data['photo'], data['server'], data['hash'], photo_name) @staticmethod def get_image_data(image_filename): with open(image_filename, 'rb') as f: data = f.read() return data @staticmethod def get_mime_type(image_filename): return mimetypes.guess_type(image_filename) 
@safe_call_and_log_if_failed def safe_save_photo_file(self, photo, server, hash, title): if photo: responses = self.vkapi.photos_save_message_photo(photo=photo, server=server, hash=hash, title=title) return VkPhoto(responses[0]) @safe_call_and_log_if_failed def safe_get_upload_server_for_private_message_photo(self): """ Retrieves upload_url for storing files """ return self.vkapi.photos_get_messages_upload_server()['upload_url'] @staticmethod def create_attachment_filename(filename): _, extension = os.path.splitext(filename) return 'attachment' + extension @safe_call_and_log_if_failed def safe_upload_to_server(self, url, filename, data, mime_type, post_name): """ Uploads data to given url and saves it with given filename and mime_type :return: Raw response, returned by post request """ if url: request = requests.post(url, files={post_name: (filename or 'NoName', data, mime_type)}) response = json.loads(request.text) if 'error' in response: raise Exception(response['error']) else: return response def safe_upload_file_to_server(self, url, filename, data, mime_type): return self.safe_upload_to_server(url, filename, data, mime_type, 'file')['file'] def safe_upload_photo_to_server(self, url, filename, data, mime_type): return self.safe_upload_to_server(url, filename, data, mime_type, 'photo') @safe_call_and_log_if_failed def safe_save_doc_file(self, file_string, title): """ Saves file on VK server by given string :param file_string: String, returned after uploading file :return: Saved document :rtype: VkDocument """ if file_string: responses = self.vkapi.docs_save(file=file_string, title=title) return VkDocument(responses[0]) @safe_call_and_log_if_failed def safe_docs_get_upload_server(self): """ Retrieves upload_url for storing files """ return self.vkapi.docs_get_upload_server()['upload_url'] def retrieve_users_by_ids(self, *user_ids): return map(VkUser, self.vkapi.users_get(user_id=','.join(imap(str, user_ids)))) @staticmethod def wrap_mail(mail): return 
LamaBeautifier.get_random_mail_pattern().format(subject=mail.subject, sender=mail.sender, body=mail.body) @staticmethod def message_is_directed(message): return message.body is not None and message.body.encode('utf-8').startswith('Лама, ') @staticmethod def message_has_body(message): return message.body is not None def mark_message_as_read(self, message): self.mark_message_as_read_by_id(message.id) @safe_call_and_log_if_failed(default=False) def safe_mark_message_as_read_and_log_if_failed(self, message): self.mark_message_as_read(message) return True def mark_message_as_read_by_id(self, message_ids): self.vkapi.messages_mark_as_read(message_ids=message_ids) def register_plugin(self, plugin): self._plugins.append(plugin) plugin.bot = self def split_to_words(self, body): return body.encode('utf-8').translate(string.maketrans('', ''), string.punctuation).split() def normalize_words(self, words): return map(self.normalize_word, words) def normalize_word(self, word): return self.morph.parse(word.decode('utf8'))[0].normal_form.encode('utf8')