Code example #1
def get_popular_tags():
    """ Returns a list of strings,
    each string is a tag """
    if (time() - collector.last_request < collector.update_interval):
        return collector.tags
    else:
        try:
            stop_words = []
            with open("stop words.txt", "r") as f:
                stop_words = f.read().split()
            top_headlines = collector.newsapi.get_top_headlines(
                language='ru')['articles']
            descriptions = ''.join([
                x for x in ' '.join(
                    article['description'] for article in top_headlines)
                if (x.isalpha() or x in [' ', '-'])
            ]).replace('- ', ' ').replace(' -', ' ').split()
            morphy = MorphAnalyzer()
            descriptions = [
                morphy.parse(x)[0].normal_form for x in descriptions
            ]
            uniq = {}
            for word in descriptions:
                uniq[word] = uniq.get(word, 0) + 1
            sorted_uniq = sorted(
                [(key, value)
                 for key, value in uniq.items() if key not in stop_words],
                key=itemgetter(1))
            collector.tags = [i[0] for i in sorted_uniq[-10:]]
            collector.last_request = time()
        finally:
            # Returning inside finally: if the refresh fails for any reason,
            # fall back to the previously cached tags.
            return collector.tags
Code example #2
File: flask_app.py Project: evgeniyamiller/python_hw
def wordforms(word):
    arr = []
    morph = MorphAnalyzer()
    lex = morph.parse(word)[0].lexeme 
    for l in lex:
        arr.append(l.word)
    return set(arr)
Code example #3
File: synonym.py Project: bzvr/alice-synonym
def analyze(response):
    whitelist = ['найти', 'придумать', 'сказать', 'подсказать']
    text = re.findall(r'([а-яА-Я\-]+)', response)

    if len(text) == 1:
        return text
    else:
        parser = MorphAnalyzer()
        a = []
        for word in text:
            a.append((word, parser.parse(word)[0]))
        if {'VERB', 'INFN'} & a[0][1].tag.grammemes:
            verb = a.pop(0)
            if verb[1].normal_form not in whitelist:
                return None

        # print(a[0][0])
        if a[0][0] == 'синоним':
            if a[1][0] in ('к', 'для'):
                if a[2][1].normal_form == 'слово':
                    return a[3][0]
            if a[1][1].normal_form == 'слово':
                return a[2][0]

        return None
Code example #4
def statistics(folder, filename):
    # words cloud
    with open('{}/words.txt'.format(STATIC_ROOT), 'r', encoding='utf-8') as f:
        words = [line.rstrip() for line in f]
    words_dict = {i: 0 for i in words}

    # json files to analyze
    with open('{}/{}/{}.json'.format(MEDIA_ROOT, folder, filename),
              'r',
              encoding='utf-8') as f:
        data_json = json.load(f)  # type: dict

    morph = MorphAnalyzer()
    messages_words = []
    for text in data_json.values():
        for w in text.split(' '):
            messages_words.append(morph.parse(w)[0].normal_form)

    # statistics for all words
    messages_words_counts = Counter(messages_words)
    messages_words_counts['ВСЕГО СЛОВ'] = len(messages_words)
    with open('{}/{}/{}_COUNTER.json'.format(MEDIA_ROOT, folder, filename),
              'w',
              encoding='utf-8') as f:
        json.dump(messages_words_counts, f, ensure_ascii=False)

    # statistics for word cloud
    for m in messages_words:
        if m in words_dict:
            words_dict[m] = words_dict[m] + 1

    with open('{}/{}/{}_WORD_CLOUD.json'.format(MEDIA_ROOT, folder, filename),
              'w',
              encoding='utf-8') as f:
        json.dump(words_dict, f, ensure_ascii=False)
Code example #5
def generate_answer(input_sentence):
    morph = MorphAnalyzer()

    words = input_sentence.split()
    answer = ''

    for word in words:
        analyze = morph.parse(word)[0]

        pos_tag = analyze.tag.POS
        filename = str(pos_tag) + '.txt'

        with open(filename, 'r', encoding='utf-8') as file:
            lemmas = file.readlines()

        changed_word = None

        while changed_word is None:
            new_analyze, inf_tags = collect_inf_tags(analyze, pos_tag, lemmas,
                                                     morph)
            changed_word = new_analyze.inflect(inf_tags)

        answer = answer + changed_word.word + ' '

    print(answer)
Code example #6
    def analyzeWord(self, word):
        morph = MorphAnalyzer()
        analysisResults = []

        for p in morph.parse(word):
            curAnalysis = {
                'исходное слово': word,
                'нормальная форма': p.normal_form,
                'часть речи': p.tag.POS,
                'одушевленность': p.tag.animacy,
                'вид': p.tag.aspect,
                'падеж': p.tag.case,
                'род': p.tag.gender,
                'включенность': p.tag.involvement,
                'наклонение': p.tag.mood,
                'число': p.tag.number,
                'лицо': p.tag.person,
                'время': p.tag.tense,
                'переходность': p.tag.transitivity,
                'залог': p.tag.voice,
                'лексема': [lexeme[0] for lexeme in p.lexeme]
            }
            analysisResults.append(curAnalysis)

        return analysisResults
Code example #7
File: dandt.py Project: jumper047/yuki
    def initialize(self):
        self.context_sensitive = True
        self.answers = [
            "{now} {weekday}.", "{weekday}.", "{weekday} вроде бы."
        ]
        self.times = {"сегодня": 0, "завтра": 1, "послезавтра": 2}

        self.morph = MorphAnalyzer()

        engine = self.get_app("brain").engine
        keyword = ["день", "число"]
        day = ["сегодня", "завтра", "послезавтра"]
        question = ["какой"]
        for k in keyword:
            engine.register_entity(k, "SayWeekdayKeyword")
        for d in day:
            engine.register_entity(d, "SayWeekdayDay")
        for q in question:
            engine.register_entity(q, "SayWeekdayQuestion")
        sayweekday_intent = IntentBuilder("sayweekday").\
            require("SayWeekdayKeyword").optionally(
                "SayWeekdayDay").optionally("SayWeekdayQuestion").build()
        engine.register_intent_parser(sayweekday_intent)

        print("sayweekday initialized")
Code example #8
def agree(w1, w2, t1, t2):
    morph = MorphAnalyzer()
    raw_cur_tags = morph.tag(w1)[0]
    raw_next_tags = morph.tag(w2)[0]  # tags of the second word, not the first

    cur_tags = re.findall(r"\w+", str(raw_cur_tags))
    next_tags = re.findall(r"\w+", str(raw_next_tags))

    if t1 == "person":
        if t2 == "verb_right":
            if next_tags[3] == "tran":
                cur_tags[-1] = "nomn"
            else:
                cur_tags[-1] = "datv"

    if t1 == "verb_right":
        if t2 == "property":
            pass

    if t1 == "adjective":
        if t2 == "property":
            pass

    if t1 == "property":
        if t2 == "person":
            pass
        if t2 == "adjective":
            pass


    #w1 = morph.parse(w1)[0].inflect({}).word
    return w1, w2
Code example #9
def thanks():
    morph = MorphAnalyzer()
    if request.args:
        #a = input('Введите предложение: ')
        a = request.args['sentence']
        words = open('words.txt', 'r', encoding='utf-8')
        words = words.readlines()
        reg = re.compile('[^а-яА-Я ]')
        a = a.split()
        new_sent = open('sentence.txt', 'w', encoding='utf-8')
        for i in a:
            ana = morph.parse(i)[0]
            random.shuffle(words)
            for word in words:
                word = reg.sub('', word)
                word = morph.parse(word)[0]
                if word.tag == ana.tag:
                    new_sent.write(word.word)
                    new_sent.write(' ')
                    break
        new_sent.close()
        new_sent1 = open('sentence.txt', 'r', encoding='utf-8')
        new_sent1 = new_sent1.read()
        return render_template('thanks.html', sentence_answer=new_sent1)
        #print(new_sent1)
        #new_sent1.close()
    return redirect(url_for(''))
Code example #10
 def __init__(self):
     self.grammeme_vectorizer_input = GrammemeVectorizer()
     self.grammeme_vectorizer_output = GrammemeVectorizer()
     self.word_dictionary = WordDictionary()
     self.char_set = set()
     self.morph = MorphAnalyzer() # pyMorphy2
     self.converter = converters.converter('opencorpora-int', 'ud14')
Code example #11
def pymorphying(filename):
    dictionary = dict()
    morph = MorphAnalyzer()
    words_and_grams = list()
    with open(filename, 'r', encoding='utf-8') as file:
        text = file.read()
        tokenized = word_tokenize(text)
        for one in tokenized:
            parsed = morph.parse(one)
            parsed = parsed[0]
            original_word = one
            gram_info = str(parsed.tag).split(',')
            first_gram = str(gram_info[0]).split()[0]
            if first_gram == 'PNCT' or first_gram == 'UNKN':
                continue
            if len(gram_info) == 1:
                continue
            loop = dictionary
            counter = 0
            for gram in gram_info:
                counter += 1
                check = gram
                if check not in loop:
                    add_to_dict(check, loop, counter, len(gram_info))
                if not isinstance(loop, list):
                    loop = loop[check]
            try:
                loop.append(original_word)
                loop.sort()
            except AttributeError:
                loop = list()
                loop.append(original_word)
                loop.sort()
    print(dictionary)
Code example #12
def make_tags(sentence):
    morph = MorphAnalyzer()
    tags = {'NOUN': 'N', 'NPRO': 'N', 'ADJF': 'A', 'ADJS': 'A', 'PRTF': 'A', 'PRTS': 'V', 'NUMR': 'A', 'VERB': 'V',
            'INFN': 'V', 'GRND': 'V', 'ADVB': 'D', 'PREP': 'P', 'PRCL': 'P', 'CONJ': 'P'}
    tokens = nltk.word_tokenize(sentence)
    tokens_tags = [tags[morph.parse(token)[0].tag.POS] for token in tokens]
    return [tokens, tokens_tags]
Code example #13
    def parse_text(self, string, flag):
        morph = MorphAnalyzer()
        FURTHER_DEVELOPMENT = morph.parse('дальнейшие'.lower())[0].normal_form
        FURTHER_IMPROVEMENTS = morph.parse('улучшения'.lower())[0].normal_form
        self.sentences = []
        self.find_further_development = False
        self.filtered_docs = []
        stop_words = set(stopwords.words("russian"))
        filtered_doc = []
        self.get_sentences(string, flag)

        for sent in self.sentences:
            token_sent = [
                w.lower() for w in word_tokenize(sent)
                if w.lower() not in stop_words
            ]
            for word in token_sent:
                w = morph.parse(word)[0].normal_form
                filtered_doc.append(w)
                if w in [FURTHER_DEVELOPMENT, FURTHER_IMPROVEMENTS
                         ] and not flag:
                    self.find_further_development = True
                    self.further_dev_sentence = sent
            self.filtered_docs.append(filtered_doc)
            filtered_doc = []
Code example #14
    def __init__(self,
                 token_pat="[а-я]+",
                 mode="normal",
                 counter=None,
                 threshold=3,
                 allowed_pos=None,
                 stop_words=None,
                 stop_cities=False):
        self.token = token_pat
        self.mode = mode

        if self.mode not in {"normal", "nospace"}:
            raise ValueError("Unknown mode")
        elif self.mode == "nospace":
            if not isinstance(counter, Counter):
                raise ValueError(
                    "In 'nospace' mode the counter attribute should be passed")
            self.counter = counter
            self.nospace = NoSpaceSplitter(counter)
            self.threshold = threshold

        self.morph = MorphAnalyzer()
        self.allowed_pos = allowed_pos
        self.stop_words = stop_words or STOPWORDS
        if stop_cities:
            self.stop_words = self.stop_words.union(CITIES)
Code example #15
def search(query):
    relevance = defaultdict(float)
    m = MorphAnalyzer()
    inverted_index, articles, avdl = get_indices()
    N = len(articles)
    words = [
        x.lower().strip(string.punctuation + '»«–…')
        for x in word_tokenize(query)
    ]
    lemmas = [
        m.parse(x)[0].normal_form for x in words
        if x and x not in set(stopwords.words('russian'))
    ]
    for lemma in lemmas:
        if lemma in inverted_index:
            articles_w_lemma = inverted_index[lemma]
            n = len(articles_w_lemma)
            for a in articles_w_lemma:
                a_info = articles[a[0]]
                qf = a[1]
                dl = a_info[2]
                relevance[(a_info[0],
                           a_info[1])] += score_BM25(n, qf, N, dl, avdl)
    res = sorted(relevance.items(), key=lambda x: x[1], reverse=True)
    res = [x[0] for x in res]
    return res
Code example #16
    def feminine_checker(self, w): 
        '''
        Check if the word is feminine. Necessary for some variants of hieroglyphs
        Args:
            w: str, input Russian word
        Returns:
            sex: str, 'M' or 'F' - gender of a word
        '''

        morph = MorphAnalyzer()
        w = self.input_word.split(' ')[0]
        ana = morph.parse(w)[0]
        gram = str(ana.tag).split(',')
#        print(gram)
        try:
            if 'femn' in gram[2]:
                sex = 'F'
            else:
                sex = 'M'
        except IndexError:
            if w[-1] == 'а' or w[-1] == 'я':
                sex = 'F'
            else:
                sex = 'M'
        self.sex = sex
        return self.sex
Code example #17
File: lama_bot.py Project: soon/Lama-Reporter
    def __init__(self, app_id, mail_manager,
                 chat_id=1, number_of_seconds_for_the_rest=60, chat_id_for_mails=None, admins=None, **kwargs):
        """
        Initializes Lama Bot.

        Expects login/password or access_token as named parameters

        :param mail_manager: A manager for retrieving mails
        :type mail_manager: AbstractMailManager

        :param chat_id: Chat identifier
        :type chat_id: int

        :param chat_id_for_mails: Chat for mails. Same as chat_id, if not presented
        :type chat_id_for_mails: int

        :raise ValueError: When neither login/password nor access_token was provided
        """
        self.exit_event = Event()
        self.morph = MorphAnalyzer()
        self.version = '0.1.1'
        self.app_id = app_id
        self.access_token = None
        self.password = None
        self.login = None
        self.vkapi = ThreadSafeVkApi(app_id=app_id, **kwargs)
        self.commands = {}
        self._plugins = []
        self.mail_manager = mail_manager
        self.number_of_seconds_for_the_rest = number_of_seconds_for_the_rest
        self.chat_id = chat_id
        self.chat_id_for_mails = chat_id_for_mails or self.chat_id
        self.admins = admins or []

        self.initialize_commands()
Code example #18
File: util.py Project: turchaev/nltk4russian
def read_tab_corpus(inc):
    m = MorphAnalyzer()
    sent = []
    for t in inc:
        # try:
        #     t = t.rstrip().decode('utf-8')
        # except AttributeError:
        t = t.rstrip()
        if not t:
            continue
        if t == u'sent':
            sent = []
            continue
        if t == u'/sent' or t == u'SENT':
            sent = [x[0] for x in sent]
            parses = [m.parse(token) for token in sent]
            if sent:
                yield [(p[0].word, p[0].tag) for p in parses]
            continue
        t = t.split('\t')
        try:
            token = (t[1], ' '.join(t[2].split(' ')[2:]))
            sent.append(token)
        except IndexError:
            continue
Code example #19
File: market_operations.py Project: gt005/it-class
def get_correct_form_of_points_number_name(number: int) -> str:
    """ Returns the correct Russian word form (Баллов/Балла/Балл) for the given number """
    if not isinstance(number, int):  # the input is not a number
        return "Балл"

    analysis = MorphAnalyzer().parse("Балл")[0]
    return analysis.make_agree_with_number(number).word
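
A minimal usage sketch (not part of the original project) to illustrate the helper above; pymorphy2's make_agree_with_number is expected to pick forms roughly like 1 -> "балл", 2 -> "балла", 5 -> "баллов":

for n in (1, 2, 5, 21):
    # hypothetical demonstration; the exact output depends on the pymorphy2 dictionaries
    print(n, get_correct_form_of_points_number_name(n))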
Code example #20
def text_rank(text, language):
    sentences = []
    a = []
    if (language == 'ukrainian'):
        morph = MorphAnalyzer(lang='uk')
        sentences = sent_tokenizer_ua(text)
        if len(sentences) < 2:
            s = sentences[0]
            return [(1, 0, s)]
        a = tfidf(text, language, sent_tokenizer_ua, stop_words_ua)
    else:
        morph = MorphAnalyzer()
        sentences = sent_tokenizer_ru(text)
        if len(sentences) < 2:
            s = sentences[0]
            return [(1, 0, s)]
        a = tfidf(text, language, sent_tokenizer_ru, stop_words_ru)

    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(a[i, :], a[j, :])) for i, j in pairs]
    scores = filter(lambda x: x[2], scores)

    pr = rank_graph(scores)

    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True)  # sort the triples by rank, descending
Code example #21
File: main.py Project: VadzimIlyukevich/vocabulary
def addWord():
    global vocabulary
    start_time = time.time()
    words = {}
    analyzer = MorphAnalyzer()
    vocabulary.append(inputText.get(1.0, END))
    tokenize_sentence = word_tokenize(vocabulary[0])
    for word in tokenize_sentence:
        parse_word = analyzer.parse(word)[0]
        word_word = parse_word.word
        word_lemma = parse_word.normal_form
        word_tags = parse_word.tag.cyr_repr
        word_ending = list(set(word_word) - set(word_lemma))
        if word_word != word_lemma:
            words.update({
                word_word: {
                    'lemma': word_lemma,
                    'tag': word_tags,
                    'ending': word_ending
                }
            })
    sorted_words = sorted(words)
    for key in sorted_words:
        lexeme = Lexeme((words[key]['lemma']), (words[key]['tag']),
                        (words[key]['ending']))
        outputText.insert(0, str(lexeme.lemma) + '      ' + str(lexeme.tags) + '      ' \
                          + str(lexeme.endings))
    end_time = time.time()
    result_time = end_time - start_time
    print(str(result_time) + " seconds")
    vocabulary.clear()
Code example #22
File: lemmatizer.py Project: xettrisomeman/spaCy
 def __init__(
     self,
     vocab: Vocab,
     model: Optional[Model],
     name: str = "lemmatizer",
     *,
     mode: str = "pymorphy2",
     overwrite: bool = False,
     scorer: Optional[Callable] = lemmatizer_score,
 ) -> None:
     if mode == "pymorphy2":
         try:
             from pymorphy2 import MorphAnalyzer
         except ImportError:
             raise ImportError(
                 "The Russian lemmatizer mode 'pymorphy2' requires the "
                 "pymorphy2 library. Install it with: pip install pymorphy2"
             ) from None
         if getattr(self, "_morph", None) is None:
             self._morph = MorphAnalyzer()
     super().__init__(vocab,
                      model,
                      name,
                      mode=mode,
                      overwrite=overwrite,
                      scorer=scorer)
Code example #23
class SayTime(AppDaemon):
    def initialize(self):
        self.answers = [
            "Сейчас {hour} {hword} {minute} {mword}.",
            "Московское время - {hour} {hword} {minute} {mword}.",
            "{hour} {hword} {minute} {mword}.",
            "{hour} {hword} {minute} {mword}."
        ]
        self.morph = MorphAnalyzer()

        engine = self.get_app("brain").engine
        keyword = ["час", "время"]
        question = ["сколько", "который"]
        for k in keyword:
            engine.register_entity(k, "SayTimeKeyword")
        for q in question:
            engine.register_entity(q, "SayTimeQuestion")
        saytime_intent = IntentBuilder("saytime").\
            require("SayTimeKeyword").optionally("SayTimeQuestion").build()
        engine.register_intent_parser(saytime_intent)

        print("saytime init done")

    def handle(self, intent_dict):
        now = datetime.datetime.now()
        hword = "час"
        ahword = self.morph.parse(hword)[0].make_agree_with_number(
            now.hour).word
        mword = "минута"
        amword = self.morph.parse(mword)[0].make_agree_with_number(
            now.minute).word
        return choice(self.answers).format(hour=now.hour,
                                           minute=now.minute,
                                           hword=ahword,
                                           mword=amword)
Code example #24
def to_normal_form(file_text):
    morph = MorphAnalyzer()
    out = []
    for word in word_tokenize(file_text.lower()):
        if word.isalnum():
            out.append(morph.parse(word)[0].normal_form)
    return " ".join(out)
Code example #25
    def __init__(self,
                 data_name,
                 lemmatizing_method,
                 max_examples=None,
                 delete_word_parts=False,
                 drop_duplicates=True,
                 count_lemmas_weights=False,
                 limit=None):
        self.data_name = data_name
        self.lemmatizing_method = lemmatizing_method
        self.max_examples = max_examples
        self.delete_word_parts = delete_word_parts
        self.drop_duplicates = drop_duplicates
        self.count_lemmas_weights = count_lemmas_weights
        self.translation = str.maketrans('', '', string.punctuation)

        self.dfs = dict()
        self.nf_cnts = dict()
        self.cache = dict()
        self.pattern = re.compile(r'\b\w+\b')

        if lemmatizing_method is not None and lemmatizing_method != 'none':
            if 'ru' in data_name:
                self.analyzer = MorphAnalyzer()
            elif 'german' in data_name:
                self.analyzer = spacy.load("de_core_news_sm",
                                           disable=['ner', 'parser'])
            elif 'english' in data_name:
                self.analyzer = spacy.load("en_core_web_sm",
                                           disable=['ner', 'parser'])
            else:
                raise ValueError("unknown data name %s" % data_name)
Code example #26
def place(message):
    global places
    m = MorphAnalyzer()
    word = m.parse(message)[0]
    if 'гео' in word.tag.cyr_repr:
        if message not in places:
            places[message] = r.choice([
                "Отличное место! Бывало, что я заползал туда иногда, раз в месяцок",
                "О да, знаю, там подают таки-и-ие блюда!",
                "Ну, знаешь, насчет этого места. Тут точно дело вкуса, обычному туристу лучше сюда не соваться...",
                "Место, откровенно говоря, так себе...",
                "Это одно из моих любимых мест на планете! Когда будет возможность, обязательно посети",
                "Это место меня отталкивает, даже не планируй туда поездку",
                "Да ладно, нашел место для отдыха!", "Погодка там так себе",
                "Ну, ничего, норм выбор",
                "Как тебе вообще в голову пришло туда захотеть поехать?!",
                "Для питона как раз!)",
                "Как-то одним морозным дням я замечательно отдохнул там, но общее впечатление оставляет желать лучшего",
                "Там бывает мокро, но для меня, питона, это естественная среда)",
                "Брррррррр, не нада", "Питон одобряет",
                "Не трать время на это", "Конечно, там прекрасно!",
                "Что ты там будешь делать?", "Хммм, ничего!"
            ])
        return places[message]
    return False
Code example #27
File: hw_02_06.py Project: astafyevai/project
def my_function(message):
    # the code that generates the reply goes here
    def find_POS(POS, arr):
        temp = arr[POS]
        idx = random.randint(0, len(temp) - 1)
        return temp[idx]

    def examine(word, arr):
        obj = morph.parse(word)[0]
        POS = obj.tag.POS
        word_to_print = find_POS(POS, arr)
        i = str(obj.tag).find(',')
        r = str(obj.tag).find(' ')
        string = str(obj.tag)[i + 1:]
        string = string.replace(" ", ",")

        if r != -1:
            tag = frozenset(string.split(','))
        else:
            tag = frozenset()
        return tag, word_to_print

    morph = MorphAnalyzer()

    text = str(message.text)
    reply = str()

    if not text.isalnum() and ' ' not in text:
        reply = 'Введены непонятные символы :('
    else:
        words_separated = text.split()

        file_str = open('1grams-3.txt', 'r', encoding='utf-8').read()

        words_separated_new = file_str.split()

        arr = dict()
        idx = 0

        for word in words_separated_new:
            idx += 1
            temp = morph.parse(word)[0]
            if temp.tag.POS in arr:
                arr[temp.tag.POS].append(temp.normal_form)
            else:
                arr[temp.tag.POS] = [temp.normal_form]
            if idx == 10000:
                break

        info = None

        for word in words_separated:
            info = None
            while info is None:
                tag, word_to_print = examine(word, arr)
                info = morph.parse(word_to_print)[0].inflect(tag)
            reply += info.word + ' '

    bot.send_message(message.chat.id, reply)  # send the reply to the chat
Code example #28
 def __init__(self):
     nltk.download('stopwords')
     self.patterns = r"[A-Za-z0-9!#$%&'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+"
     self.stopwords_ru = stopwords.words("russian")
     self.morph = MorphAnalyzer()
     self.ann_model = keras.Sequential()
     self.__preparing_data__()
     print("Data Prepared")
Code example #29
File: word_vectorizer.py Project: netsafe/DeepVesnin
 def __init__(self, save_path: str, load_path: str, max_pymorphy_variants: int = -1, **kwargs) -> None:
     super().__init__(save_path, load_path, **kwargs)
     self.max_pymorphy_variants = max_pymorphy_variants
     self.load()
     self.memorized_word_indexes = dict()
     self.memorized_tag_indexes = dict()
     self.analyzer = MorphAnalyzer()
     self.converter = converters.converter('opencorpora-int', 'ud20')
Code example #30
def lemmatization(list_of_strings):
    morph = MorphAnalyzer()
    for i in range(len(list_of_strings)):
        words = list_of_strings[i].split()
        for k in range(len(words)):
            words[k] = morph.parse(words[k])[0].normal_form
        list_of_strings[i] = ' '.join(words)
    return list_of_strings
Code example #31
def agree(w1, w2, t1, t2):
    if t1 == "comma" or t2 == "comma":
        return w1, w2

    morph = MorphAnalyzer()
    raw_cur_tags = morph.tag(w1)[-1]
    raw_next_tags = morph.tag(w2)[-1]  # tags of the second word, not the first

    cur_tags = re.findall(r"\w+", str(raw_cur_tags))
    next_tags = re.findall(r"\w+", str(raw_next_tags))

    if t1[:-2] == "person":
        if t2[:-2] == "verb_right":
            if morph.normal_forms(w2)[0] in dative_verbs:
                w1 = morph.parse(w1)[0].inflect({"datv"}).word

    if t1[:-2] == "verb_right":
        if t2[:-2] == "property":
            pass
        if t2[:-2] == "person":
            if cur_tags[3] == "tran":
                w2 = morph.parse(w2)[0].inflect({"accs"}).word
            else:
                w2 = morph.parse(w2)[0].inflect({"nomn"}).word
                #gender with nomn only
                gender = next_tags[2]
                if gender == "inan":
                    gender = next_tags[3]
                w1 = morph.parse(w1)[0].inflect({gender}).word

    if t1[:-2] == "adjective":
        if t2[:-2] == "property":
            #gender
            gender = next_tags[2]
            if gender == "inan":
                gender = next_tags[3]
            try:
                w1 = morph.parse(w1)[0].inflect({gender}).word
            except Exception:
                print("f**k")
                print(w1, w2)

    if t1[:-2] == "property":
        if t2[:-2] == "person":
            pass
        if t2[:-2] == "adjective":
            gender = cur_tags[2]
            if gender == "inan":
                gender = cur_tags[3]
            try:
                w2 = morph.parse(w2)[0].inflect({gender}).word
            except Exception:
                print("f**k")
                print(w1, w2)


    #w1 = morph.parse(w1)[0].inflect({}).word
    return w1, w2
Code example #32
File: util.py Project: 0623forbidden/nltk4russian
def read_test_corpus(fn):
    m = MorphAnalyzer()
    for line in fn:
        line = line.rstrip('\n')
# assume the text is already tokenized
#        line = word_tokenize(line)
        line = line.decode('utf-8').split()
# parse the words against the dictionary; take only the first pymorphy parse
        parses = [m.parse(token) for token in line]
        if line:
            yield [(p[0].word, p[0].tag) for p in parses]
Code example #33
 def __init__(self, text_array):
     self.text_array = text_array
     self.morph = MorphAnalyzer()  # Compiled with glvrd.ru, going through all 20 * 1700 words together and by hand
     self.trash_list = \
         {"она", "они", "что", "это", "быть", "аплодисменты", "этот", "как", "если", "быть", "если", "для", "все",
          "этот", "чтобы", "так", "для", "который", "тот", "такой", "мой", "смех", "красивый", "дорогой", "уютный",
          "роскошный", "активный", "школа", "должный", "сделать", "наш", "мочь", "один", "весь", "свой", "речь",
          "человек", "слайд", "разный", "хотеть", "промышленность", "пытаться", "хороший", "позволять", "ваш",
          "решать", "общий", "продажа", "модуль", "множество", "оставлять", "важный", "решение", "заниматься",
          "служить", "реальность", "самка", "самец", "проводить", "известный", "таинственность", "быстрый",
          "большинство", "позволять", "обучение", "население", "настоящий", "необходимо", "любой", "большой",
          "форма", "успешный", "обычный", "оказываться", "высокий", "потрясающий", "богатый", "документ", "мелкий",
          "оказывать", "возможность", "простой", "крупный", "колония", "система", "реальный", "плохой", "мечтание",
          "огромный", "электрический", "ландшафт", "изломанность", "интерактивный", "суть", "позволять", "наличие",
          "иметься", "проводить", "обычный", "мощный", "аналогия", "различный", "самый", "эффективность", "низкий",
          "реальность", "определенный", "являться", "пользование", "исторический", "элементарный", "обеспечение",
          "наблюдаться", "обладать", "важный", "известняк", "хотеться", "продолжать", "год", "время", "мир", "жизнь",
          "дело", "проблема", "ребенок", "вопрос", "день", "друг", "работа", "идея", "история", "место", "часть",
          "вещь", "страна", "технология", "раз", "женщина", "слово", "вода", "вид", "проект", "информация", "мозг",
          "земля", "миллион", "город", "исследование", "помощь", "компания", "образ", "рука", "результат", "момент",
          "конец", "пример", "доллар", "дом", "книга", "музыка", "машина", "сторона", "случай", "процесс", "группа",
          "способ", "мужчина", "уровень", "тысяча", "интернет", "деньги", "семья", "компьютер", "энергия", "видео",
          "программа", "свет", "модель", "сила", "планета", "клетка", "движение", "тело", "наука", "общество",
          "язык", "фотография", "причина", "война", "пациент", "неделя", "миллиард", "будущее", "сеть", "точка",
          "сша", "игра", "отец", "природа", "изменение", "фильм", "цель", "устройство", "образование", "материал",
          "путь", "глаз", "студент", "африка", "отношение", "правительство", "болезнь", "связь", "количество",
          "звук", "парень", "искусство", "пространство", "организация", "ответ", "лицо", "час", "дизайн", "право",
          "поведение", "эксперимент", "лечение", "индия", "месяц", "мама", "карта", "мать", "здание", "изображение",
          "океан", "родитель", "внимание", "улица", "продукт", "развитие", "песня", "структура", "рынок", "процент",
          "голова", "минута", "чувство", "нога", "пара", "объект", "создание", "закон", "учитель", "действие"}
Code example #34
 def __init__(self, save_path: str, load_path: str, max_pymorphy_variants: int = -1, **kwargs) -> None:
     super().__init__(save_path, load_path, **kwargs)
     self.max_pymorphy_variants = max_pymorphy_variants
     self.load()
     self.memorized_word_indexes = dict()
     self.memorized_tag_indexes = dict()
     self.analyzer = MorphAnalyzer()
     self.converter = converters.converter('opencorpora-int', 'ud20')
Code example #35
File: util.py Project: 0623forbidden/nltk4russian
def read_tab_corpus(inc):
    m = MorphAnalyzer()
    sent = []
    for t in inc:
        t = t.rstrip().decode('utf-8')
        if not t:
            continue
        if t == u'sent':
            sent = []
            continue
        if t == u'/sent' or t == u'SENT':
            sent = [x[0] for x in sent]
            parses = [m.parse(token) for token in sent]
            if sent:
                yield [(p[0].word, p[0].tag) for p in parses]
            continue
        t = t.split('\t')
        try:
            token = (t[1], ' '.join(t[2].split(' ')[2:]))
            sent.append(token)
        except IndexError:
            continue
Code example #36
File: morphTest.py Project: Serafim-End/HackDay
class MorphTest(unittest.TestCase):
    def __init__(self, document_vector):
        self.document = None
        self.documents = document_vector
        self.morph = MorphAnalyzer()

    # def setUp(self):
    #     self.document = documents[randint(0, len(documents))]

    def testMorph(self):
        self.document = self.document if self.document is not None else self.documents[0]
        morph_array = [self.morph.parse(word)[0].normal_form for word in self.document]
        print(morph_array)
        self.assertTrue(True, msg=None)
Code example #37
File: morph.py Project: bureaucratic-labs/yargy
class MorphAnalyzer(object):
    def __init__(self):
        self.raw = PymorphyAnalyzer()

    def check_gram(self, gram):
        if not self.raw.TagClass.grammeme_is_known(gram):
            raise ValueError(gram)

    def __call__(self, word):
        records = self.raw.parse(word)
        return [prepare_form(_) for _ in records]

    def normalized(self, word):
        return {_.normalized for _ in self(word)}
Code example #38
class Analyzer:
    """
    Analyzes the incoming text, parses every word into lexemes, and removes punctuation and everything except
    nouns, verbs and adjectives, as well as words from the stop list. Returns the 10 most frequent of the remaining words.
    """

    def __init__(self, text_array):
        self.text_array = text_array
        self.morph = MorphAnalyzer()  # Compiled with glvrd.ru, going through all 20 * 1700 words together and by hand
        self.trash_list = \
            {"она", "они", "что", "это", "быть", "аплодисменты", "этот", "как", "если", "быть", "если", "для", "все",
             "этот", "чтобы", "так", "для", "который", "тот", "такой", "мой", "смех", "красивый", "дорогой", "уютный",
             "роскошный", "активный", "школа", "должный", "сделать", "наш", "мочь", "один", "весь", "свой", "речь",
             "человек", "слайд", "разный", "хотеть", "промышленность", "пытаться", "хороший", "позволять", "ваш",
             "решать", "общий", "продажа", "модуль", "множество", "оставлять", "важный", "решение", "заниматься",
             "служить", "реальность", "самка", "самец", "проводить", "известный", "таинственность", "быстрый",
             "большинство", "позволять", "обучение", "население", "настоящий", "необходимо", "любой", "большой",
             "форма", "успешный", "обычный", "оказываться", "высокий", "потрясающий", "богатый", "документ", "мелкий",
             "оказывать", "возможность", "простой", "крупный", "колония", "система", "реальный", "плохой", "мечтание",
             "огромный", "электрический", "ландшафт", "изломанность", "интерактивный", "суть", "позволять", "наличие",
             "иметься", "проводить", "обычный", "мощный", "аналогия", "различный", "самый", "эффективность", "низкий",
             "реальность", "определенный", "являться", "пользование", "исторический", "элементарный", "обеспечение",
             "наблюдаться", "обладать", "важный", "известняк", "хотеться", "продолжать", "год", "время", "мир", "жизнь",
             "дело", "проблема", "ребенок", "вопрос", "день", "друг", "работа", "идея", "история", "место", "часть",
             "вещь", "страна", "технология", "раз", "женщина", "слово", "вода", "вид", "проект", "информация", "мозг",
             "земля", "миллион", "город", "исследование", "помощь", "компания", "образ", "рука", "результат", "момент",
             "конец", "пример", "доллар", "дом", "книга", "музыка", "машина", "сторона", "случай", "процесс", "группа",
             "способ", "мужчина", "уровень", "тысяча", "интернет", "деньги", "семья", "компьютер", "энергия", "видео",
             "программа", "свет", "модель", "сила", "планета", "клетка", "движение", "тело", "наука", "общество",
             "язык", "фотография", "причина", "война", "пациент", "неделя", "миллиард", "будущее", "сеть", "точка",
             "сша", "игра", "отец", "природа", "изменение", "фильм", "цель", "устройство", "образование", "материал",
             "путь", "глаз", "студент", "африка", "отношение", "правительство", "болезнь", "связь", "количество",
             "звук", "парень", "искусство", "пространство", "организация", "ответ", "лицо", "час", "дизайн", "право",
             "поведение", "эксперимент", "лечение", "индия", "месяц", "мама", "карта", "мать", "здание", "изображение",
             "океан", "родитель", "внимание", "улица", "продукт", "развитие", "песня", "структура", "рынок", "процент",
             "голова", "минута", "чувство", "нога", "пара", "объект", "создание", "закон", "учитель", "действие"}

    def start(self):
        res = list(filter(
            lambda x: len(x) > 2 and self.pymorphy_analyze(x) and re.match("[а-яА-Я]", x) and x not in self.trash_list,
            self.text_array))
        return [x[0] for x in Counter(res).most_common(10)]

    def pymorphy_analyze(self, word):
        lexem = self.morph.parse(word)
        x = lexem[0].tag.POS
        return x in ("NOUN", "ADJF", "INFN")
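
A hypothetical usage sketch (not from the original project) for the class above: it expects an already lemmatized word list and returns up to ten keywords; the word list and printed result here are illustrative only.

tokens = ["кошка", "кошка", "бежать", "собака", "и", "в", "бежать", "кошка"]
# short and non-Cyrillic tokens are filtered out, the rest are counted
print(Analyzer(tokens).start())  # e.g. ['кошка', 'бежать', 'собака']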
Code example #39
File: main.py Project: antonfait/SerchEngine
    def __init__(self, input_ru, input_en):

        self.morph_ru = MorphAnalyzer()

        self.sentences_ru = self.Pars_sentences( input_ru )
        wordPattern_ru = re.compile( "((?:[а-яА-ЯёЁ]+[-']?)*[а-яА-яёЁ]+)" )
        self.sentences_list_ru = self.Create_Word_List( wordPattern_ru, self.sentences_ru,
                                                   self.Normalize_ru, self.Translate_ru )
        self.word_list_ru = []

        self.sentences_en = self.Pars_sentences( input_en )
        self.dict_en_ru = Dictionary('Dict/ER-LingvoUniversal.ifo')
        wordPattern_en = re.compile("((?:[a-zA-Z]+[-']?)*[a-zA-Z]+)")
        self.sentences_list_en = self.Create_Word_List( wordPattern_en, self.sentences_en,
                                                   self.Normalize_en, self.Translate_en )
        self.word_list_en = []
        self.Graph = self.Create_Graph()

        munkres_algorithm = munkres.Munkres()
Code example #40
File: event.py Project: city-pulse/mskpulse.backend
	def __init__(self, mysql_con, redis_con, tokenizer = None, morph = None, classifier = None, points = []):
		"""
		Initialization.

		Args:
			mysql_con (PySQLPoolConnection): MySQL connection Object
			redis_con (StrictRedis): RedisDB connection Object
			tokenizer (NLTK.TreebankWordTokenizer): object to split tweets into words
			morph (pymorphy2.MorphAnalyzer): word analyzer - converts words tokens to normalized form. Requires a lot of memory, so it is not created for every event object. 
			classifier (Object): scikit trained classifier to detect real and fake events
			points (list[dict]): raw messages from event detector
		"""
		self.mysql = mysql_con
		self.redis = redis_con

		if morph:
			self.morph = morph
		else:
			self.morph = MorphAnalyzer()
		if tokenizer:
			self.tokenizer = tokenizer
		else:
			self.tokenizer = TreebankWordTokenizer()
		self.word = compile(r'^\w+$', flags = UNICODE | IGNORECASE)
		self.url_re = compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

		self.validity = None
		self.verification = None
		self.cores = {}
		self.classifier = classifier

		if points:
			self.id = str(uuid4())
			self.created = datetime.now()
			self.updated = datetime.now()

			self.messages = { x['id']:x for x in points }
			self.get_messages_data()
			self.media = {}
			self.get_media_data()
			self.event_update()
Code example #41
File: event.py Project: city-pulse/mskpulse.backend
class Event():
	"""
	Event object - class for working with event candidates.
	Collects all data on event candidate, stores it between clustering slices; merges slices, if required.
	TBD: constructs and saves description, scores texts and media, scores and describes event itself
	(probability, that candidate is real, event buzz, event category).

	Attributes:
		self.created (datetime): creation timestamp
		self.updated (datetime): last update timestamp
		self.start (datetime): timestamp of the first message in the self.messages dict
		self.end (datetime): timestamp of the last message in the self.messages dict
		self.messages (Dict[dict]): raw tweets from database, enriched with weight, is_core params (on init), tokens (after add_stem_texts)
		self.media (Dict[dict]): raw media objects from database
		self.cores (Dict[list]): tokens, that form the most common vocabulary for the event; computed in create_core() method
		self.entropy (float): entropy for authorship: 0 for mono-authored cluster; computed in event_summary_stats() method
		self.ppa (float): average number of posts per one author; computed in event_summary_stats() method
		self.authors (int): number of unique authors for event
		self.most_active_author (float): share of messages, written by one (most active author)
		self.authors_share (float): number of authors divided by number of messages
		self.relevant_messages_share (float): share of messages with token_score above zero
		self.duration (int): total seconds from self.start to self.end
		self.classifier (Object): classifier for deciding, whether event is real
		self.validity (bool): Classifier verdict, whether event is real or not
		self.verification (bool): Handmade verification of event quality

	Methods:
		self.event_update: commands to calculate all data on event, based on messages and media
		self.is_successor: examines, if current event have common messages with specified event slice
		self.is_valid: method for the classifier to determine whether the event is a real event and not a random compilation of messages
		self.classifier_row: unified method for creating the classifier data-row
		self.merge: merge current event with another event, update stat Attributes
		self.add_slice: add messages and media to the event, recompute statistics
		self.load / self.dump: serialize/deserialize event and put/get it to Redis
		self.backup / self.restore: dump/restore event to/from MySQL long-term storage
		self.get_messages_data: get MySQL data for messages ids
		self.get_media_data: get MySQL data for media using existing messages ids
		self.event_summary_stats: calculate statistics and start/end time for event
		self.add_stem_texts: add tokens lists to self.messages
		self.create_core: create vocabulary of most important words for the event
		self.score_messages_by_text: method calculates token_score for messages. TF/IDF likelihood with core is used

	Message keys:
		cluster (int): legacy from DBSCAN - number of cluster (event ancestor)
		id (str): DB message id; unique
		is_core (bool): True, if tweet belongs to the core of ancestor cluster
		iscopy (int): 1, if message is shared from another network
		lat (float): latitude
		lng (float): longitude
		network (int): 2 for Instagram, 1 for Twitter, 3 for VKontakte
		text (str): raw text of the message
		tokens (Set[str]): collection of stemmed tokens from raw text; created in add_stem_texts()
		tstamp (datetime): 'created at' timestamp
		user (int): user id, absolutely unique for one network, but matches between networks are possible
		token_score (float): agreement estimation with average cluster text
		weight (float): standart deviations below average
	"""

	def __init__(self, mysql_con, redis_con, tokenizer = None, morph = None, classifier = None, points = []):
		"""
		Initialization.

		Args:
			mysql_con (PySQLPoolConnection): MySQL connection Object
			redis_con (StrictRedis): RedisDB connection Object
			tokenizer (NLTK.TreebankWordTokenizer): object to split tweets into words
			morph (pymorphy2.MorphAnalyzer): word analyzer - converts words tokens to normalized form. Requires a lot of memory, so it is not created for every event object. 
			classifier (Object): scikit trained classifier to detect real and fake events
			points (list[dict]): raw messages from event detector
		"""
		self.mysql = mysql_con
		self.redis = redis_con

		if morph:
			self.morph = morph
		else:
			self.morph = MorphAnalyzer()
		if tokenizer:
			self.tokenizer = tokenizer
		else:
			self.tokenizer = TreebankWordTokenizer()
		self.word = compile(r'^\w+$', flags = UNICODE | IGNORECASE)
		self.url_re = compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

		self.validity = None
		self.verification = None
		self.cores = {}
		self.classifier = classifier

		if points:
			self.id = str(uuid4())
			self.created = datetime.now()
			self.updated = datetime.now()

			self.messages = { x['id']:x for x in points }
			self.get_messages_data()
			self.media = {}
			self.get_media_data()
			self.event_update()

	def __str__(self):
		txt = "<Event {}: {} msgs [{} -- {}]>".format(self.id, len(self.messages), self.start.strftime("%Y-%m-%d %H:%M"), self.end.strftime("%H:%M"))
		return txt

	def __unicode__(self):
		return unicode(self.__str__())

	def __repr__(self):
		return self.__str__()

	def event_update(self):
		"""
		Commands to calculate all data on event, based on messages and media.
		"""
		self.add_stem_texts()
		self.create_core(deviation_threshold = 1)
		self.create_core(deviation_threshold = 2)
		self.create_core(deviation_threshold = 3)
		self.score_messages_by_text()
		self.event_summary_stats()
		self.is_valid()

	def is_successor(self, slice_ids, sim_index = 0.3, only_relevant = True):
		"""
		Method examines whether the current event has common messages with the specified event slice.

		Args:
			slice_ids (Set): set if message id's to compare with
			sim_index (float): minimal share of messages that should match in slice to be detected as a successor
			only_relevant (bool): use only messages with non-zero token_score (to exclude spam)
		"""
		if only_relevant:
			event_ids = set([k for k, v in self.messages.items() if v['token_score'] > 0])
			if not event_ids:
				event_ids = set(self.messages.keys())
		else:
			event_ids = set(self.messages.keys())
		#if float(len(event_ids.intersection(slice_ids)))/len(event_ids.union(slice_ids)) >= jaccard:
		if float(len(event_ids.intersection(slice_ids)))/min((len(event_ids), len(slice_ids))) >= sim_index:
			return True
		return False

	def is_valid(self):
		"""
		Method for the classifier to determine whether the event is a real event and not a random compilation of messages.
		"""
		if self.validity:
			return True
		if self.classifier:
			self.validity = bool(self.classifier.predict([self.classifier_row()])[0])
		return self.validity

	def classifier_row(self):
		"""
		Unified method for creating the classifier data-row. Every variable used in prediction is listed here, and only here.
		"""
		row = [
			len(self.messages.values()), 
			len(self.media.values()), 
			self.authors, 
			self.most_active_author, 
			self.authors_share, 
			self.entropy, 
			self.ppa, 
			self.relevant_messages_share, 
			self.duration
		]
		return row

	def merge(self, other_event):
		"""
		Method merges current event with another event, update stat Attributes.

		Args:
			other_event (Event): another event object - to merge with
		"""
		self.messages.update(other_event.messages)
		self.media.update(other_event.media)
		self.event_update()
		self.updated = datetime.now()
		self.created = min((self.created, other_event.created))

	def add_slice(self, new_slice):
		"""
		Method adds messages and media to the event, recompute statistics.

		Args:
			new_slice (List[dict]): initial list with messages to be added
		"""
		self.messages.update({ x['id']:x for x in new_slice })
		self.get_messages_data([x['id'] for x in new_slice])
		self.get_media_data([x['id'] for x in new_slice])
		self.event_update()
		self.updated = datetime.now()

	def backup(self):
		"""
		Method dumps event to MySQL long-term storage, used for non-evaluating events.
		"""
		if self.verification is None:
			ver = 'NULL'
		else:
			ver = int(self.verification)
		if self.validity is None:
			val = 'NULL'
		else:
			val = int(self.validity)
		msg_string = self.pack()
		q = b'''INSERT INTO events(id, start, end, msgs, description, dumps, verification, validity) VALUES ("{}", "{}", "{}", {}, "{}", "{}", {}, {}) ON DUPLICATE KEY UPDATE `start`=VALUES(`start`), `end`=VALUES(`end`), `msgs`=VALUES(`msgs`), `description`=VALUES(`description`), `dumps`=VALUES(`dumps`), `verification`=VALUES(`verification`), `validity`=VALUES(`validity`);'''.format(self.id, self.start, self.end, len(self.messages.keys()), escape_string(', '.join([x.encode('utf-8') for x in self.cores[2]])), escape_string(msg_string), ver, val)
		exec_mysql(q, self.mysql)
		self.redis.delete("event:{}".format(self.id))

	def restore(self, event_id):
		"""
		Method restores event from MySQL table using event_id parameter.

		Args:
			event_id (str): unique event identifier
		"""
		q = '''SELECT dumps FROM events WHERE id="{}"'''.format(event_id)
		event_data = exec_mysql(q, self.mysql)[0][0]['dumps']
		self.unpack(event_data)

	def load(self, event_id, redis_prefix='event'):
		"""
		Method for deserializing and loading event from Redis database.

		Args:
			event_id (str): unique event identifier
			redis_prefix (str): prefix used in Redis database
		"""
		try:
			event_data = self.redis.hget('{}:{}'.format(redis_prefix, event_id), 'dumps')
		except ResponseError:
			event_data = self.redis.get('{}:{}'.format(redis_prefix, event_id))
		self.unpack(event_data)

	def dump(self, redis_prefix='event'):
		"""
		Method for serializing and dumping event to Redis database.

		Args:
			redis_prefix (str): prefix to use, when storing new key in Redis database
		"""
		if self.verification is None:
			ver = 'NULL'
		else:
			ver = int(self.verification)
		if self.validity is None:
			val = 'NULL'
		else:
			val = int(self.validity)
		msg_string = self.pack()
		event = {'start':self.start.strftime("%Y-%m-%d %H:%M:%S"), 'end':self.end.strftime("%Y-%m-%d %H:%M:%S"), 'msgs':len(self.messages.keys()), 'description':', '.join([x.encode('utf-8') for x in self.cores[2]]), 'dumps':msg_string, 'verification':ver, 'validity':val}
		self.redis.hmset("{}:{}".format(redis_prefix, self.id), event)

	def pack(self, complete=False):
		"""
		Method for serializing event to string.

		Args:
			complete (bool): whether to pack all available data for the event (full texted messages, media links, and cores).
		"""
		todump = {
			'id':self.id,
			'created':int(mktime(self.created.timetuple())),
			'updated':int(mktime(self.updated.timetuple())),
			'verification':self.verification,
			'messages':[{'id':x['id'], 'is_core':x.get('is_core'), 'token_score':x.get('token_score'), 'weight':x.get('weight')} for x in self.messages.values()]
		}

		if complete:
			todump['media'] = self.media
			todump['validity'] = self.validity
			for i in range(len(todump['messages'])):
				msg = self.messages[todump['messages'][i]['id']]
				todump['messages'][i].update({'iscopy':msg['iscopy'], 'lat':msg['lat'], 'lng':msg['lng'], 'network':msg['network'], 'text':msg['text'], 'tstamp':int(mktime(msg['tstamp'].timetuple())), 'user':msg['user']})
		return packb(todump)

	def unpack(self, data, complete=False):
		"""
		Method for deserializing event from string. msgpack lib is used (considered to be faster than pickle).

		Args:
			data (str): pickle dump of event-required parameters.
			complete (bool): whether to unpack all available data for the event (full texted messages, media links, and cores), or compute these parameters on the fly.
		"""
		data = unpackb(data)
		self.id = data['id']
		self.created = datetime.fromtimestamp(data['created'])
		self.updated = datetime.fromtimestamp(data['updated'])
		self.verification = data['verification']
		self.messages = {x['id']:x for x in data['messages']}

		if complete:
			self.validity = data['validity']
			self.media = data['media']
			for k in self.messages.keys():
				self.messages[k]['tstamp'] = datetime.fromtimestamp(self.messages[k]['tstamp'])

		else:
			self.get_messages_data()
			self.media = {}
			self.get_media_data()

		self.event_update()

	def get_messages_data(self, ids=None):
		"""
		Method loads MySQL data for message ids and adds it to the self.messages attribute.

		Args:
			ids (List[str]): list of messages ids to load. If not provided, all ids from self.messages are used 
		"""
		if not ids:
			ids = [x['id'] for x in self.messages.values()]
		q = '''SELECT * FROM tweets WHERE id in ({});'''.format(','.join(['"'+str(x)+'"' for x in ids]))
		data = exec_mysql(q, self.mysql)[0]
		for item in data:
			self.messages[item['id']].update(item)

	def get_media_data(self, ids=None):
		"""
		Method loads MySQL data for media using existing messages ids and adds it to the self.media argument.

		Args:
			ids (List[str]): list of messages ids to load. If not provided, all ids from self.messages are used 
		"""
		if not ids:
			ids = [x['id'] for x in self.messages.values()]
		q = '''SELECT * FROM media WHERE tweet_id in ({});'''.format(','.join(['"'+str(x)+'"' for x in ids]))
		data = exec_mysql(q, self.mysql)[0]
		for item in data:
			self.media[item['id']] = item

	def event_summary_stats(self):
		"""
		Method calculates several statistics, updates self.start and self.end timestamps.
		"""
		authorsip_stats = [len(tuple(i[1])) for i in groupby(sorted(self.messages.values(), key=lambda x:x['user']), lambda z: z['user'])]
		self.authors = len(authorsip_stats)
		self.most_active_author = max(authorsip_stats)/float(len(self.messages.values()))
		self.authors_share = float(self.authors)/len(self.messages.values())
		self.entropy = entropy(authorsip_stats)
		self.ppa = mean(authorsip_stats)
		self.relevant_messages_share = float(len([x for x in self.messages.values() if x['token_score'] > 0]))/len(self.messages.values())
		self.start = min([x['tstamp'] for x in self.messages.values()])
		self.end = max([x['tstamp'] for x in self.messages.values()])
		self.duration = int((self.end - self.start).total_seconds())

	def add_stem_texts(self):
		"""
		Method adds tokens lists to self.messages.
		"""
		for i in self.messages.keys():
			if 'tokens' not in self.messages[i].keys():
				txt = self.messages[i].get('text', '')
				txt = sub(self.url_re, '', txt)
				self.messages[i]['tokens'] = {self.morph.parse(token.decode('utf-8'))[0].normal_form for token in self.tokenizer.tokenize(txt) if match(self.word, token.decode('utf-8'))}

	def create_core(self, deviation_threshold=2, min_token=3):
		"""
		Method creates the core of important words for the event.

		Args:
			deviation_threshold (int): number of standard deviations that separate core tokens from average tokens
			min_token (int): minimal length of token, to exclude prepositions/conjunctions
		"""
		texts_by_authors = [set().union(*[msg['tokens'] for msg in list(y[1])]) for y in groupby(sorted(self.messages.values(), key=lambda x:x['user']), lambda z:z['user'])]
		top_words = {}
		for doc in texts_by_authors:
			for token in doc:
				if len(token) >= min_token:
					try:
						top_words[token] += 1
					except KeyError:
						top_words[token] = 1
		th_vals = [x[1] for x in top_words.items()]
		threshold = mean(th_vals) + deviation_threshold * std(th_vals)
		self.cores[deviation_threshold] = [k for k,v in top_words.items() if v > threshold]

	def score_messages_by_text(self, deviation_threshold=2):
		"""
		Method calculates token_score parameter for self.messages.

		Args:
			deviation_threshold (int): number of standard deviations that separate core tokens from average tokens
		"""
		texts = [x['tokens'] for x in self.messages.values()]
		if not sum([bool(x) for x in texts]) or len(set([frozenset(x) for x in texts])) == 1:
			for k in self.messages.keys():
				self.messages[k]['token_score'] = 0
			return
		dictionary = Dictionary(texts)
		corpus = [dictionary.doc2bow(text) for text in texts]
		tfidf = TfidfModel(corpus, id2word=dictionary)
		index = MatrixSimilarity(tfidf[corpus])
		try:
			scores = index[dictionary.doc2bow(self.cores[deviation_threshold])]
		except IndexError:
			error('Index error in token scoring for event {}'.format(self.id))
			scores = [0]*len(self.messages.values())
		for i in range(len(scores)):
			self.messages.values()[i]['token_score'] = float(scores[i])
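For reference, a minimal self-contained sketch of the same gensim TF-IDF pattern used above: build a dictionary and TF-IDF model over the message token lists, then rank every message by cosine similarity against the event core. The token lists and the core below are made-up toy data, not values from the original project.

from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import MatrixSimilarity

# hypothetical token sets: three messages and a core such as create_core() might select
texts = [['пожар', 'центр', 'город'], ['концерт', 'парк'], ['пожар', 'дым', 'улица']]
core = ['пожар', 'дым']

dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = TfidfModel(corpus, id2word=dictionary)
index = MatrixSimilarity(tfidf[corpus])
scores = index[dictionary.doc2bow(core)]   # one cosine similarity per message
print(list(enumerate(scores)))             # messages sharing core tokens score highest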
Code example #42
0
File: morphTest.py Project: Serafim-End/HackDay
 def __init__(self, document_vector):
     self.document = None
     self.documents = document_vector
     self.morph = MorphAnalyzer()
Code example #43
0
class PymorphyVectorizer(WordIndexVectorizer):
    """
    Transforms Russian words into a 0-1 vector of their possible Universal Dependencies tags.
    Tags are obtained with the Pymorphy analyzer (pymorphy2.readthedocs.io)
    and converted to UD2.0 format using the russian-tagsets library (https://github.com/kmike/russian-tagsets).
    All UD2.0 tags that are compatible with the produced tags are memorized.
    The list of possible Universal Dependencies tags is read from a file
    that contains all the labels occurring in the UD2.0 SynTagRus dataset.

    Args:
        save_path: path to save the tags list,
        load_path: path to load the list of tags,
        max_pymorphy_variants: maximal number of pymorphy parses to be used. If -1, all parses are used.
    """

    USELESS_KEYS = ["Abbr"]
    VALUE_MAP = {"Ptan": "Plur", "Brev": "Short"}

    def __init__(self, save_path: str, load_path: str, max_pymorphy_variants: int = -1, **kwargs) -> None:
        super().__init__(save_path, load_path, **kwargs)
        self.max_pymorphy_variants = max_pymorphy_variants
        self.load()
        self.memorized_word_indexes = dict()
        self.memorized_tag_indexes = dict()
        self.analyzer = MorphAnalyzer()
        self.converter = converters.converter('opencorpora-int', 'ud20')

    @property
    def dim(self):
        return len(self._t2i)

    def save(self) -> None:
        """Saves the dictionary to self.save_path"""
        with self.save_path.open("w", encoding="utf8") as fout:
            fout.write("\n".join(self._i2t))

    def load(self) -> None:
        """Loads the dictionary from self.load_path"""
        self._i2t = []
        with self.load_path.open("r", encoding="utf8") as fin:
            for line in fin:
                line = line.strip()
                if line == "":
                    continue
                self._i2t.append(line)
        self._t2i = {tag: i for i, tag in enumerate(self._i2t)}
        self._make_tag_trie()

    def _make_tag_trie(self):
        self._nodes = [defaultdict(dict)]
        self._start_nodes_for_pos = dict()
        self._data = [None]
        for tag, code in self._t2i.items():
            if "," in tag:
                pos, tag = tag.split(",", maxsplit=1)
                tag = sorted([tuple(elem.split("=")) for elem in tag.split("|")])
            else:
                pos, tag = tag, []
            start = self._start_nodes_for_pos.get(pos)
            if start is None:
                start = self._start_nodes_for_pos[pos] = len(self._nodes)
                self._nodes.append(defaultdict(dict))
                self._data.append(None)
            for key, value in tag:
                values_dict = self._nodes[start][key]
                child = values_dict.get(value)
                if child is None:
                    child = values_dict[value] = len(self._nodes)
                    self._nodes.append(defaultdict(dict))
                    self._data.append(None)
                start = child
            self._data[start] = code
        return self

    def find_compatible(self, tag: str) -> List[int]:
        """
        Transforms a Pymorphy tag to a list of indexes of compatible UD tags.

        Args:
            tag: input Pymorphy tag

        Returns:
            indexes of compatible UD tags
        """
        if " " in tag and "_" not in tag:
            pos, tag = tag.split(" ", maxsplit=1)
            tag = sorted([tuple(elem.split("=")) for elem in tag.split("|")])
        else:
            pos, tag = tag.split()[0], []
        if pos not in self._start_nodes_for_pos:
            return []
        tag = [(key, self.VALUE_MAP.get(value, value)) for key, value in tag
               if key not in self.USELESS_KEYS]
        if len(tag) > 0:
            curr_nodes = [(0, self._start_nodes_for_pos[pos])]
            final_nodes = []
        else:
            final_nodes = [self._start_nodes_for_pos[pos]]
            curr_nodes = []
        while len(curr_nodes) > 0:
            i, node_index = curr_nodes.pop()
            # key, value = tag[i]
            node = self._nodes[node_index]
            if len(node) == 0:
                final_nodes.append(node_index)
            for curr_key, curr_values_dict in node.items():
                curr_i, curr_node_index = i, node_index
                while curr_i < len(tag) and tag[curr_i][0] < curr_key:
                    curr_i += 1
                if curr_i == len(tag):
                    final_nodes.extend(curr_values_dict.values())
                    continue
                key, value = tag[curr_i]
                if curr_key < key:
                    for child in curr_values_dict.values():
                        curr_nodes.append((curr_i, child))
                else:
                    child = curr_values_dict.get(value)
                    if child is not None:
                        if curr_i < len(tag) - 1:
                            curr_nodes.append((curr_i + 1, child))
                        else:
                            final_nodes.append(child)
        answer = []
        while len(final_nodes) > 0:
            index = final_nodes.pop()
            if self._data[index] is not None:
                answer.append(self._data[index])
            for elem in self._nodes[index].values():
                final_nodes.extend(elem.values())
        return answer

    def _get_word_indexes(self, word):
        answer = self.memorized_word_indexes.get(word)
        if answer is None:
            parse = self.analyzer.parse(word)
            if self.max_pymorphy_variants > 0:
                parse = parse[:self.max_pymorphy_variants]
            tag_indexes = set()
            for elem in parse:
                tag_indexes.update(set(self._get_tag_indexes(elem.tag)))
            answer = self.memorized_word_indexes[word] = list(tag_indexes)
        return answer

    def _get_tag_indexes(self, pymorphy_tag):
        answer = self.memorized_tag_indexes.get(pymorphy_tag)
        if answer is None:
            tag = self.converter(str(pymorphy_tag))
            answer = self.memorized_tag_indexes[pymorphy_tag] = self.find_compatible(tag)
        return answer
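As a point of reference, here is a small standalone sketch (not part of the repository) of the conversion step the vectorizer builds on: pymorphy2 yields OpenCorpora tags, and russian-tagsets rewrites each of them as a UD2.0 string that the tag trie can then match. The example word is arbitrary.

from pymorphy2 import MorphAnalyzer
from russian_tagsets import converters

analyzer = MorphAnalyzer()
to_ud20 = converters.converter('opencorpora-int', 'ud20')

# 'стали' is ambiguous (past tense of 'стать' / a form of 'сталь'), so several parses come back
for parse in analyzer.parse('стали')[:3]:
    ud_tag = to_ud20(str(parse.tag))   # e.g. 'VERB Aspect=Perf|Mood=Ind|...'
    print(u'{} -> {}'.format(parse.normal_form, ud_tag))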
Code example #44
0
File: main.py Project: antonfait/SerchEngine
class Parallel_Translate:
    def __init__(self, input_ru, input_en):

        self.morph_ru = MorphAnalyzer()

        self.sentences_ru = self.Pars_sentences( input_ru )
        wordPattern_ru = re.compile( "((?:[а-яА-ЯёЁ]+[-']?)*[а-яА-ЯёЁ]+)" )
        self.sentences_list_ru = self.Create_Word_List( wordPattern_ru, self.sentences_ru,
                                                   self.Normalize_ru, self.Translate_ru )
        self.word_list_ru = []

        self.sentences_en = self.Pars_sentences( input_en )
        self.dict_en_ru = Dictionary('Dict/ER-LingvoUniversal.ifo')
        wordPattern_en = re.compile("((?:[a-zA-Z]+[-']?)*[a-zA-Z]+)")
        self.sentences_list_en = self.Create_Word_List( wordPattern_en, self.sentences_en,
                                                   self.Normalize_en, self.Translate_en )
        self.word_list_en = []
        self.Graph = self.Create_Graph()

        munkres_algorithm = munkres.Munkres()
        #self.word_matching = munkres_algorithm.compute( self.Graph )




# Read the input file and split its text into sentences
    def Pars_sentences(self,file_name ) :
        sentences_list = []

        with open(file_name, 'rU') as input_file:
            file_str = input_file.read()
            sentences_tokenize = nltk.tokenize.PunktSentenceTokenizer()
            for sentence in sentences_tokenize.sentences_from_text( file_str ):
                sentences_list.append(  sentence )

        return sentences_list



    def Create_Word_List(self, wordPattern, sentences, Normalize, Translate ):
        word_list = []
        sentence_num = 0
        sent_list = []
        for sentence in sentences:
            sentence_word_list = []
            for word in wordPattern.findall( sentence ):
                word = word.strip()
                word = word.lower()
                n_word = Normalize( word )
                translate_list = Translate( n_word )
                w_info = word_info( word, sentence_num, n_word, translate_list )
                word_list.append( w_info )
                sentence_word_list.append(w_info)
            sent_list.append( sentence_info( sentence, sentence_word_list ) )
            sentence_num = sentence_num + 1
        return sent_list



    def Translate_ru( self, n_word ):
        return []

    def Translate_en( self, n_word ):

        self.re_for_entry = re.compile("<dtrn>(.*?)</dtrn>")

        valueWord = []
        try:
            for normal_word in n_word:
                for entry in self.dict_en_ru[ normal_word ]:
                    result_pars = self.ParsEntry( entry.data )
                    valueWord = valueWord + result_pars
        except KeyError:
            pass
        return valueWord

    def ParsEntry( self, entry_data  ) :
        l = entry_data.split( "<abr><i><c><co>" )
        result_first_step = []
        for data in l:
            result_first_step = result_first_step + self.re_for_entry.findall(data)
        result_second_step = []
        for data in result_first_step:
            temp = data.split("<")
            if temp[0] != "":
                result_second_step.append(temp[0])
        result = []
        for data in result_second_step:
            for data_prom in data.split(","):
                result = result + data_prom.split(";")
        for i in range( len( result ) ):
            result[i] = result[i].strip()
        return result


    def Normalize_ru( self, word ):
        n_word = self.morph_ru.normal_forms( word )
        if n_word:
            return n_word[0]
        else:
            return []

    def Normalize_en( self, word ):
        n_word = wordnet.morphy( word )
        if n_word:
            return [ n_word ]
        else:
            return []

    def Create_Graph(self):
        graph_matrix = [ [ 0 for i in range( len( self.sentences_list_ru ) ) ]
                            for j in range( len( self.sentences_list_en ) ) ]
        koef = abs( len( self.sentences_list_en ) - len( self.sentences_list_ru ) )
        sentence_num = 0
        for sentence in self.sentences_list_en:

            sentence_left_num = sentence_num
            sentence_right_num = sentence_num +1

            while (sentence_left_num >= 0) and (sentence_num - sentence_left_num <= koef):

                sum_eq_words = 0
                for w_info in sentence.sentence_words:

                    for translate_word in w_info.translate_list:

                        for w_info_ru in self.sentences_list_ru[sentence_left_num]:

                            for w_normal in w_info_ru.normal_form:

                                if w_normal == translate_word:
                                    sum_eq_words = sum_eq_words + 1

                graph_matrix[sentence_num][sentence_left_num] = -( sum_eq_words - sentence_num + sentence_left_num )
                sentence_left_num = sentence_left_num - 1  # move to the next candidate sentence on the left

            while (sentence_right_num < len( self.sentences_list_ru ) ) and ( sentence_right_num - sentence_num <= koef):

                sum_eq_words = 0
                for w_info in sentence.sentence_words:

                    for translate_word in w_info.translate_list:

                        for w_info_ru in self.sentences_list_ru[sentence_right_num]:

                            for w_normal in w_info_ru.normal_form:

                                if w_normal == translate_word:
                                    sum_eq_words = sum_eq_words + 1

                graph_matrix[sentence_num][sentence_right_num] = -( sum_eq_words - sentence_right_num + sentence_num )
                sentence_right_num = sentence_right_num + 1  # move to the next candidate sentence on the right

            sentence_num = sentence_num + 1

        return graph_matrix
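The commented-out munkres call in __init__ is where the actual alignment would happen, so here is a toy illustration of that assignment step with made-up overlap counts: the Hungarian algorithm picks the minimum-cost pairing of English and Russian sentences, which is why Create_Graph stores negated shared-word scores.

import munkres

# hypothetical shared-word counts between English sentence i and Russian sentence j
overlap = [
    [5, 1, 0],
    [1, 4, 1],
    [0, 2, 6],
]
# turn 'maximize shared words' into a minimization problem, as the negated scores above do
cost_matrix = munkres.make_cost_matrix(overlap, lambda x: 10 - x)
matching = munkres.Munkres().compute(cost_matrix)
print(matching)   # [(0, 0), (1, 1), (2, 2)] -> aligned sentence pairs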
Code example #45
0
File: utils.py Project: named-entity/lexpagerank
def read_text_lemmas(fileobj):
    m = MorphAnalyzer()
    for line in fileobj:
        yield ' '.join((m.parse(t)[0].normal_form for t in simple_word_tokenize(line.decode('utf-8'))))
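A hypothetical call for reference: the generator yields one lemmatized line per input line, so any file-like object producing byte strings will do; the in-memory stream below is just a stand-in for a real file.

import io

sample = io.BytesIO(u'Кошки любят молоко\n'.encode('utf-8'))
for lemmas in read_text_lemmas(sample):
    print(lemmas)   # expected to be roughly: 'кошка любить молоко'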
Code example #46
0
 for line in f.readlines():
     all += 1
     word = line.split(" ")[0]
     cnt = int(line.split(" ")[1])
     all_postings += cnt
     if match("^[^\W\d]+$", word):
         nonnum += 1
         alpha_postings += cnt
         lo = word.lower()
         if lo in low_reg:
             low_reg[lo] += cnt
         else:
             low_reg[lo] = cnt
 just_ru = {k: v for (k, v) in low_reg.items() if match(u"^[\u0400-\u0500]+$", k)}
 ru_postings = sum(just_ru.values())
 morph = MorphAnalyzer()
 c = 0
 for k, v in just_ru.items():
     if c % 100000 == 0:
         print(c)
     c += 1
     lem = morph.parse(k)[0].normal_form
     if lem in lemmatized:
         lemmatized[lem] += int(v)
     else:
         lemmatized[lem] = int(v)
 with open("stopwords", "r") as st:
     stops = set(st.read().split('\n'))
     for k, v in just_ru.items():
         if k not in stops:
             no_stops_postings += v
Code example #47
0
File: morph.py Project: bureaucratic-labs/yargy
 def __init__(self):
     self.raw = PymorphyAnalyzer()
Code example #48
0
File: learning_news.py Project: Sereni/assignments
 def lemmatize(self, tokens):
     """
     :param tokens: a list of tokens to lemmatize
     """
     analyzer = MorphAnalyzer()
     return Counter([analyzer.parse(token)[0].normal_form for token in tokens if len(token) > 1])
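A quick hypothetical usage of the logic above, written as a standalone snippet: because counting happens over pymorphy2 normal forms, inflected variants of the same word collapse into a single Counter key, and the length filter drops one-letter tokens.

from collections import Counter
from pymorphy2 import MorphAnalyzer

analyzer = MorphAnalyzer()
tokens = [u'кошки', u'кошку', u'столы', u'стола', u'и']
print(Counter(analyzer.parse(t)[0].normal_form for t in tokens if len(t) > 1))
# expected to be roughly: Counter({u'кошка': 2, u'стол': 2})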
Code example #49
0
File: utils.py Project: named-entity/lexpagerank
def read_lemmas(fileobj):
    # each line here contains one (tokenized) sentence
    m = MorphAnalyzer()
    for line in fileobj:
        yield [m.parse(t)[0].normal_form for t in line.decode('utf-8').split()[1:]]
Code example #50
0
File: lama_bot.py Project: soon/Lama-Reporter
class LamaBot(object):
    def __init__(self, app_id, mail_manager,
                 chat_id=1, number_of_seconds_for_the_rest=60, chat_id_for_mails=None, admins=None, **kwargs):
        """
        Initializes Lama Bot.

        Expects login/password or access_token as named parameters

        :param mail_manager: A manager for retrieving mails
        :type mail_manager: AbstractMailManager

        :param chat_id: Chat identifier
        :type chat_id: int

        :param chat_id_for_mails: Chat for mails. Same as chat_id if not provided
        :type chat_id_for_mails: int

        :raise ValueError: When neither login/password nor access_token was provided
        """
        self.exit_event = Event()
        self.morph = MorphAnalyzer()
        self.version = '0.1.1'
        self.app_id = app_id
        self.access_token = None
        self.password = None
        self.login = None
        self.vkapi = ThreadSafeVkApi(app_id=app_id, **kwargs)
        self.commands = {}
        self._plugins = []
        self.mail_manager = mail_manager
        self.number_of_seconds_for_the_rest = number_of_seconds_for_the_rest
        self.chat_id = chat_id
        self.chat_id_for_mails = chat_id_for_mails or self.chat_id
        self.admins = admins or []

        self.initialize_commands()

    def initialize_commands(self):
        self.commands = {
            'post_to_dialog': lambda args, m: self.safe_post_message_and_log_if_failed(args),
            'ping': self.pong_to_admins
        }

    def safe_notify_about_unread_mails(self):
        for m in self.safe_unread_mails:
            if self.safe_post_mail_and_log_if_failed(m):
                self.mail_manager.safe_mark_mail_as_read_and_log_if_failed(m)

    def safe_process_directed_dialog_message(self, message):
        logging.debug(u'Processing message with body {}'.format(message.body))
        words = self.split_to_words(message.body)
        logging.debug(u'Words in the body: {}'.format(words))
        self.safe_process_plugins(message, words)
        self.safe_mark_message_as_read_and_log_if_failed(message)

    def safe_process_private_message(self, message):
        if self.safe_execute_and_log_if_failed(message):
            self.safe_mark_message_as_read_and_log_if_failed(message)

    @safe_call_and_log_if_failed
    def safe_process_plugins(self, message, words):
        normalized_words = self.normalize_words(words)
        for p in self.plugins:
            p.process_input(message.body, words, normalized_words, message)

    def long_poll_loop(self, exit_event):
        server, key, ts = self.extract_server_key_and_timestamp_from_get_long_poll_server_response()

        while not exit_event.is_set():
            response = self.send_long_poll_request(server, key, ts)
            if 'failed' in response:
                server, key, ts = self.extract_server_key_and_timestamp_from_get_long_poll_server_response()
            else:
                self.process_long_poll_response(response)
                ts = self.get_timestamp(response, ts)

    def extract_server_key_and_timestamp_from_get_long_poll_server_response(self):
        response = self.vkapi.messages_get_long_poll_server()
        while not all(x in response for x in ('server', 'key', 'ts')):
            logging.error('Could not retrieve credentials for connecting to long poll server', response)
            response = self.vkapi.messages_get_long_poll_server()
        return response['server'], response['key'], response['ts']

    @safe_call_and_log_if_failed(default={'failed': True})
    def send_long_poll_request(self, server, key, ts, act='a_check', wait=25, mode=2):
        params = {
            'act': act,
            'key': key,
            'ts': ts,
            'wait': wait,
            'mode': mode
        }
        return requests.get('http://{server}'.format(server=server), params=params).json()

    def process_long_poll_response(self, response):
        if response:
            for update in response.get('updates', []):
                self.process_long_poll_update(update)

    def process_long_poll_update(self, update):
        functions = {
            4: self.process_long_poll_new_message
        }
        function = functions.get(update[0])
        if function:
            function(update)

    def process_long_poll_new_message(self, update):
        chat_id = self.get_chat_id_from_long_poll_new_message_update(update)
        fwd_messages = self.get_fwd_messages_from_long_poll_new_message_update(update)
        self.process_new_message(VkMessage({'id': update[1],
                                            'user_id': None,
                                            'read_state': (update[2] + 1) % 2,
                                            'chat_id': chat_id,
                                            'title': update[5],
                                            'body': update[6],
                                            'fwd_messages': fwd_messages,
                                            'out': (update[2] & 2) >> 1}))

    def process_new_message(self, message):
        if message.is_unread and message.is_inbox:
            if message.chat_id == self.chat_id and self.message_is_directed(message):
                self.safe_process_directed_dialog_message(message)
            elif message.is_private:
                self.safe_process_private_message(message)

    def get_fwd_messages_from_long_poll_new_message_update(self, update):
        return map(self.convert_fwd_from_long_poll_new_message_update_to_fwd_message,
                   ifilter(None,
                           self.get_attachments_from_long_poll_new_message_update(update).get('fwd', '').split(',')))

    @staticmethod
    def convert_fwd_from_long_poll_new_message_update_to_fwd_message(fwd):
        regex = re.compile(r'(?P<user_id>\d+)_(?P<msg_id>\d+)')
        m = regex.match(fwd)
        return {
            'id': m.group('msg_id'),
            'user_id': m.group('user_id')
        }

    @staticmethod
    def get_chat_id_from_long_poll_new_message_update(update):
        """
        The message was sent from a chat if user_id is greater than 2000000000.
        :param update:
        :return:
        """
        return update[3] - 2000000000 if update[3] > 2000000000 else None

    def get_user_id_from_long_poll_new_message_update(self, update):
        """
        Retrieves user_id from update according to documentation
        https://vk.com/pages?oid=-17680044&p=Connecting_to_the_LongPoll_Server
        :param update:
        :return:
        """
        return self.get_attachments_from_long_poll_new_message_update(update).get('from')

    @staticmethod
    def get_attachments_from_long_poll_new_message_update(update):
        return update[7] if len(update) > 7 else {}

    @staticmethod
    def get_timestamp(response, default):
        return response.get('ts', default) if response else default

    @property
    def unread_mails(self):
        return self.mail_manager.unread_mails

    @property
    def safe_unread_mails(self):
        """
        Just delegates the work to the mail manager
        :return:
        """
        return self.mail_manager.safe_unread_mails

    @property
    def vkapi_messages_get(self):
        return self.vkapi.messages_get()

    @property
    def plugins(self):
        """

        :rtype: list of LamaPlugin
        """
        return self._plugins

    def vkapi_messages_set_activity_in_chat(self):
        return self.vkapi.messages_set_activity(chat_id=self.chat_id, type='typing')

    def post_mail(self, mail):
        """
        Posts mail to VK. Loads and attaches documents, if any.
        :param mail:
        :return:
        """
        documents = None
        if mail.attachments:
            documents = filter(None, imap(self.safe_upload_attachment, mail.attachments))
        self.post_message_to_mail_dialog(self.wrap_mail(mail), attachments=documents)

    @safe_call_and_log_if_failed(default=False)
    def safe_post_mail_and_log_if_failed(self, mail):
        """
        :param mail:
        :return: True if no error, False otherwise
        """
        self.post_mail(mail)
        return True

    @safe_call_and_log_if_failed()
    def safe_post_message_and_log_if_failed(self, message):
        self.post_message_to_dialog(message)

    @safe_call_and_log_if_failed
    def pong_to_admins(self, _, message):
        self.post_message_to_admins('Pong', forward_messages=[message])

    @safe_call_and_log_if_failed
    def safe_post_message_with_forward_messages(self, message, forward_messages):
        self.post_message_to_dialog(message, forward_messages=forward_messages)

    def execute(self, message):
        s = message.body
        command, args = self.split_to_command_and_argument(s)
        if command in self.commands:
            self.commands[command](args, message)
        else:
            self.command_not_found(command)

    @safe_call_and_log_if_failed(default=False)
    def safe_execute_and_log_if_failed(self, message):
        self.execute(message)
        return True

    @staticmethod
    def split_to_command_and_argument(command):
        values = command.split(':', 1)
        if len(values) != 2:
            values.append(None)
        return values[0], values[1]

    def _post_message_to_dialog(self, chat_id, message, attachments=None, forward_messages=None):
        """
        Posts message to dialog. Attaches attachments, if any.
        :param forward_messages: Messages to be forwarded
        :type forward_messages: [VkMessage]
        :param attachments: Documents to be attached
        :type attachments: [VkDocument]
        :param message:
        """
        attachments = attachments or []
        forward_messages = forward_messages or []
        attachment = ','.join(map(lambda d: d.attachment_string, attachments))
        forward_messages_str = ','.join(map(lambda m: str(m.id), forward_messages))
        self.vkapi.messages_send(chat_id=chat_id,
                                 message=message,
                                 attachment=attachment,
                                 forward_messages=forward_messages_str)

    def post_message_to_dialog(self, message, attachments=None, forward_messages=None):
        self._post_message_to_dialog(self.chat_id, message, attachments=attachments, forward_messages=forward_messages)

    def post_message_to_mail_dialog(self, message, attachments=None, forward_messages=None):
        self._post_message_to_dialog(self.chat_id_for_mails, message,
                                     attachments=attachments, forward_messages=forward_messages)

    def post_startup_message_to_admins(self):
        self.post_message_to_admins('The Lama is ready to work! (version {0})'.format(self.version))

    @safe_call_and_log_if_failed
    def post_message_to_admins(self, message, forward_messages=None):
        forward_messages = forward_messages or []
        forward_messages_str = ','.join(map(lambda m: str(m.id), forward_messages))
        for user_id in self.admins:
            self.vkapi.messages_send(user_id=user_id, message=message, forward_messages=forward_messages_str)

    def command_not_found(self, command):
        message = u'Command `{}` not found'.format(command).encode('utf-8')
        logging.warning(message)

    def run(self, post_welcome_message_to_dialog=True):
        if post_welcome_message_to_dialog:
            self.post_startup_message_to_admins()

        long_poll = Thread(target=self.long_poll_loop, args=(self.exit_event,))
        long_poll.start()

        while True:
            self.safe_notify_about_unread_mails()
            time.sleep(self.number_of_seconds_for_the_rest)

    def stop_running(self):
        self.exit_event.set()

    @safe_call_and_log_if_failed
    def safe_upload_attachment(self, attachment):
        """
        Uploads given attachment

        :type attachment: Attachment
        :rtype: VkDocument
        """
        if attachment.is_loaded:
            url = self.safe_docs_get_upload_server()
            file_string = self.safe_upload_file_to_server(url, self.create_attachment_filename(attachment.filename),
                                                          attachment.data, attachment.mime_type)
            return self.safe_save_doc_file(file_string, attachment.filename)

    @safe_call_and_log_if_failed
    def safe_upload_message_photo(self, image_file_path):
        if image_file_path is not None:
            url = self.safe_get_upload_server_for_private_message_photo()
            data = self.safe_upload_photo_to_server(url, self.create_attachment_filename(image_file_path),
                                                    self.get_image_data(image_file_path),
                                                    self.get_mime_type(image_file_path))
            photo_name = os.path.basename(image_file_path)
            return self.safe_save_photo_file(data['photo'], data['server'], data['hash'], photo_name)

    @staticmethod
    def get_image_data(image_filename):
        with open(image_filename, 'rb') as f:
            data = f.read()
        return data

    @staticmethod
    def get_mime_type(image_filename):
        return mimetypes.guess_type(image_filename)[0]  # guess_type returns (type, encoding); keep only the type

    @safe_call_and_log_if_failed
    def safe_save_photo_file(self, photo, server, hash, title):
        if photo:
            responses = self.vkapi.photos_save_message_photo(photo=photo, server=server, hash=hash, title=title)
            return VkPhoto(responses[0])

    @safe_call_and_log_if_failed
    def safe_get_upload_server_for_private_message_photo(self):
        """
        Retrieves upload_url for storing files
        """
        return self.vkapi.photos_get_messages_upload_server()['upload_url']

    @staticmethod
    def create_attachment_filename(filename):
        _, extension = os.path.splitext(filename)
        return 'attachment' + extension

    @safe_call_and_log_if_failed
    def safe_upload_to_server(self, url, filename, data, mime_type, post_name):
        """
        Uploads data to given url and saves it with given filename and mime_type

        :return: Raw response, returned by post request
        """
        if url:
            request = requests.post(url, files={post_name: (filename or 'NoName', data, mime_type)})
            response = json.loads(request.text)
            if 'error' in response:
                raise Exception(response['error'])
            else:
                return response

    def safe_upload_file_to_server(self, url, filename, data, mime_type):
        return self.safe_upload_to_server(url, filename, data, mime_type, 'file')['file']

    def safe_upload_photo_to_server(self, url, filename, data, mime_type):
        return self.safe_upload_to_server(url, filename, data, mime_type, 'photo')

    @safe_call_and_log_if_failed
    def safe_save_doc_file(self, file_string, title):
        """
        Saves file on VK server by given string

        :param file_string: String, returned after uploading file
        :return: Saved document
        :rtype: VkDocument
        """
        if file_string:
            responses = self.vkapi.docs_save(file=file_string, title=title)
            return VkDocument(responses[0])

    @safe_call_and_log_if_failed
    def safe_docs_get_upload_server(self):
        """
        Retrieves upload_url for storing files
        """
        return self.vkapi.docs_get_upload_server()['upload_url']

    def retrieve_users_by_ids(self, *user_ids):
        return map(VkUser, self.vkapi.users_get(user_id=','.join(imap(str, user_ids))))

    @staticmethod
    def wrap_mail(mail):
        return LamaBeautifier.get_random_mail_pattern().format(subject=mail.subject, sender=mail.sender, body=mail.body)

    @staticmethod
    def message_is_directed(message):
        return message.body is not None and message.body.encode('utf-8').startswith('Лама, ')

    @staticmethod
    def message_has_body(message):
        return message.body is not None

    def mark_message_as_read(self, message):
        self.mark_message_as_read_by_id(message.id)

    @safe_call_and_log_if_failed(default=False)
    def safe_mark_message_as_read_and_log_if_failed(self, message):
        self.mark_message_as_read(message)
        return True

    def mark_message_as_read_by_id(self, message_ids):
        self.vkapi.messages_mark_as_read(message_ids=message_ids)

    def register_plugin(self, plugin):
        self._plugins.append(plugin)
        plugin.bot = self

    def split_to_words(self, body):
        return body.encode('utf-8').translate(string.maketrans('', ''), string.punctuation).split()

    def normalize_words(self, words):
        return map(self.normalize_word, words)

    def normalize_word(self, word):
        return self.morph.parse(word.decode('utf8'))[0].normal_form.encode('utf8')
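To make the moving parts above concrete, a hypothetical wiring sketch follows; SomeMailManager and SomePlugin stand in for concrete AbstractMailManager and LamaPlugin implementations, and the ids and token are placeholders rather than values from the original project.

bot = LamaBot(app_id=123456,                     # placeholder VK application id
              mail_manager=SomeMailManager(),    # assumption: your AbstractMailManager subclass
              chat_id=1,
              admins=[42],
              access_token='PLACEHOLDER_TOKEN')  # forwarded to ThreadSafeVkApi via **kwargs
bot.register_plugin(SomePlugin())                # assumption: a LamaPlugin implementation
bot.run()   # posts a startup message to admins and starts the long poll loop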