Ejemplo n.º 1
0
def main():
    """Load the morphological analyzer, then serve HTTP until interrupted.

    The analyzer is built from the module-level ``TAGGER`` and ``LEMMATIZER``
    settings and published through the module-level ``ANALYZER`` global.
    If loading fails for any reason, the error is logged and the function
    returns without starting the server.

    The server listens on ``(HOST, PORT)`` and runs until a
    ``KeyboardInterrupt`` arrives; the listening socket is closed on every
    exit path.
    """
    global ANALYZER

    try:
        log('Loading tagger: %s, lemmatizer: %s', TAGGER, LEMMATIZER)
        ANALYZER = maru.get_analyzer(tagger=TAGGER, lemmatizer=LEMMATIZER)
    except Exception as error:
        # Top-level boundary: report the failure and give up on startup.
        log('Can not load analyzer: "%s"', error)
        return

    address = (HOST, PORT)
    server = HTTPServer(address, HTTPHandler)
    try:
        log('Listening http://%s:%d', HOST, PORT)
        server.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop the server.
        log('Quiting')
    finally:
        server.server_close()
Ejemplo n.º 2
0
    def FitWordNetFeature(self,
                          train_indices,
                          num_clues,
                          language="ru",
                          verbose=False):
        """Build per-class clue-word lists from the synsets of training tokens.

        For every distinct ``target_pos_label`` in ``self.fulldata_words``,
        the training rows of that class (excluding ``self.target_word`` and
        keeping only rows whose ``POS1`` is noun/verb/adj/adv) are scanned.
        Each entry of the ``synset`` column is iterated, the first
        ``'~'``-separated part of every word's ``definition()`` is POS-tagged
        with maru, and words tagged noun/verb/adj are lemmatized with
        pymorphy2. Lemmas are ranked by frequency (``value_counts`` order),
        words shared with other classes are filtered out, and the first 30
        remaining lemmas per class are kept.

        Args:
            train_indices: values of ``sentence_num`` that make up the
                training split.
            num_clues: currently unused.
                NOTE(review): the cap below is hard-coded to 30; confirm
                whether it was meant to be ``num_clues``.
            language: currently unused; the taggers used are Russian ones.
            verbose: when true, print every accepted word and the final
                clue-word mapping.

        Returns:
            dict mapping each class label to its list of clue lemmas. The
            same mapping is stored on ``self.all_clue_words``.
        """
        max_clues = 30

        morph = pymorphy2.MorphAnalyzer()
        analyzer = maru.get_analyzer(tagger='linear', lemmatizer='dummy')

        def get_pos(q):
            # Tag a single token and return its POS name in lower case.
            analyzed = analyzer.analyze([q])
            return list(analyzed)[0].tag.pos.value.casefold()

        pos_classes = self.fulldata_words['target_pos_label'].unique()
        train_words = self.fulldata_words[
            self.fulldata_words['sentence_num'].isin(train_indices)]

        all_clue_words = {}
        for pos_class in pos_classes:
            class_words = train_words[
                train_words['target_pos_label'] == pos_class]
            class_words = class_words[class_words['token'] != self.target_word]
            class_words = class_words[class_words['POS1'].isin(
                ['noun', 'verb', 'adj', 'adv'])]

            lemmas = []
            for synsets in class_words['synset'].values:
                for synset in synsets:
                    for sense in synset.get_words():
                        word = sense.definition().split('~')[0]
                        pos = get_pos(word)
                        if pos not in ('noun', 'verb', 'adj'):
                            continue
                        # Parse once and reuse the lemma for both the
                        # result list and the optional debug output.
                        lemma = morph.parse(word)[0].normal_form
                        lemmas.append(lemma)
                        if verbose:
                            print(word, pos, lemma)
            # value_counts() orders lemmas from most to least frequent.
            all_clue_words[pos_class] = (
                pd.Series(lemmas).value_counts().index.values)

        # Drop lemmas that also occur in another class. Classes are processed
        # one after another and updated in place, so a later class is compared
        # against the already-filtered lists of earlier classes (a lemma
        # shared by two classes therefore survives in the later one).
        for pos_class in list(all_clue_words):
            taken = {
                lemma
                for other, other_lemmas in all_clue_words.items()
                if other != pos_class
                for lemma in other_lemmas
            }
            all_clue_words[pos_class] = [
                lemma for lemma in all_clue_words[pos_class]
                if lemma not in taken
            ]

        all_clue_words = {
            label: lemmas[:max_clues]
            for label, lemmas in all_clue_words.items()
        }
        self.all_clue_words = all_clue_words
        if verbose:
            print(all_clue_words)
        return all_clue_words
Ejemplo n.º 3
0
    def CreatePosCorpus(self, language='ru', verbose=False):
        """Add a ``POS1`` column with a lower-cased POS tag for every token.

        Each value of ``self.fulldata_words['token']`` is tagged on its own
        with maru's linear tagger and the resulting POS name is stored,
        case-folded, in ``self.fulldata_words['POS1']``.

        Args:
            language: corpus language; only ``"ru"`` is supported.
            verbose: currently unused.

        Returns:
            ``self.fulldata_words`` with the new ``POS1`` column.

        Raises:
            NotImplementedError: for any language other than ``"ru"``.
        """
        # Fail loudly for every unsupported language. Previously only "en"
        # raised; other values fell through and returned the frame untagged.
        if language != "ru":
            raise NotImplementedError(
                'POS tagging is not implemented for language %r' % (language,))

        analyzer = maru.get_analyzer(tagger='linear', lemmatizer='dummy')

        def get_pos(q):
            # Tag a single token and return its POS name in lower case.
            analyzed = analyzer.analyze([q])
            return list(analyzed)[0].tag.pos.value.casefold()

        self.fulldata_words['POS1'] = self.fulldata_words['token'].apply(
            get_pos)
        return self.fulldata_words
Ejemplo n.º 4
0
 def test2():
     """Preprocessor test 2: digits and punctuation kept, no tokenizer.

     Configuration under test:
         * regexp lists Cyrillic letters, digits, ``,.!?;-`` and space
         * ``tokenizer=None``
         * ``remove_stop_words=False``
         * ``lemmatize=True`` and ``with_pos_tag=True``

     Each output line is checked against the expected ``token_POS`` list
     with ``compare``.
     """
     print('running preprocessor test 2 ...')

     config = preprocessor.Config(
         regexp=re.compile(r'[^а-яА-я0-9,.!?;\- ё]'),
         stopwords=stopwords_set,
         analyzer=maru.get_analyzer(tagger='linear'),
         with_pos_tag=True,
         remove_stop_words=False,
         lemmatize=True,
         tokenizer=None,
     )
     pipeline = preprocessor.Preprocessor(config)

     sentences = [
         'Так говорила в июле 1805 года известная',
         '— Как можно быть здоровой... когда нравственно страдаешь?',
         'праздник отменен, Je vous avoue que toutes ces fêtes',
     ]
     # With no tokenizer, punctuation stays attached to the preceding word
     # in the expected output.
     expected = [
         [
             'так_ADV', 'говорить_VERB', 'в_ADP', 'июль_NOUN', '1805_NUM',
             'год_NOUN', 'известный_ADJ',
         ],
         [
             'как_CONJ', 'можно_ADJ', 'быть_VERB', 'здоровой..._CONJ',
             'когда_CONJ', 'нравственно_ADV', 'страдаешь?_PRON',
         ],
         ['праздник_NOUN', 'отменен,_VERB'],
     ]

     actual = pipeline.fit(sentences).transform(sentences)
     for actual_line, wanted_line in zip(actual, expected):
         assert compare(actual_line, wanted_line), \
             'failed with {} and {}'.format(actual_line, wanted_line)
     print('test 2 passed')
Ejemplo n.º 5
0
 def test4():
     """Preprocessor test 4: razdel tokenizer, no POS tags, no lemmas.

     Configuration under test:
         * regexp lists Cyrillic letters, digits, ``,.!?;-`` and space
         * ``tokenizer=razdel``
         * ``remove_stop_words=False``
         * ``with_pos_tag=False`` and ``lemmatize=False``

     Each output line is checked against the expected list of plain
     lower-cased tokens with ``compare``.
     """
     # The start banner used to say "test 3", which did not match this
     # function or its "test 4 passed" message.
     print('running preprocessor test 4 ...')
     pattern = re.compile(r'[^а-яА-я0-9,.!?;\- ё]')
     analyzer = maru.get_analyzer(tagger='linear')
     tokenizer = razdel
     config = preprocessor.Config(regexp=pattern,
                                  stopwords=stopwords_set,
                                  analyzer=analyzer,
                                  with_pos_tag=False,
                                  remove_stop_words=False,
                                  lemmatize=False,
                                  tokenizer=tokenizer)
     pipeline = preprocessor.Preprocessor(config)
     case = [
         'Так говорила в июле 1805 года известная',
         '— Как можно быть здоровой... когда нравственно страдаешь?',
         'праздник отменен, Je vous avoue que toutes ces fêtes'
     ]
     # With the tokenizer enabled, punctuation appears as separate tokens.
     expected = [
         ['так', 'говорила', 'в', 'июле', '1805', 'года', 'известная'],
         [
             'как', 'можно', 'быть', 'здоровой', '...', 'когда',
             'нравственно', 'страдаешь', '?'
         ],
         ['праздник', 'отменен', ','],
     ]
     res = pipeline.fit(case).transform(case)
     for res_line, expected_line in zip(res, expected):
         assert compare(res_line, expected_line), \
             'failed with {} and {}'.format(res_line, expected_line)
     print('test 4 passed')
Ejemplo n.º 6
0
 def test1():
     """Preprocessor test 1: the full pipeline.

     Configuration under test:
         * regexp lists only Cyrillic letters and space
         * ``tokenizer=None``
         * ``remove_stop_words=True``
         * ``lemmatize=True`` and ``with_pos_tag=True``

     Each output line is checked against the expected ``lemma_POS`` list
     with ``compare``.
     """
     print('running preprocessor test 1 ...')

     config = preprocessor.Config(
         regexp=re.compile(r'[^а-яА-я ё]'),
         stopwords=stopwords_set,
         analyzer=maru.get_analyzer(tagger='linear'),
         with_pos_tag=True,
         remove_stop_words=True,
         lemmatize=True,
         tokenizer=None,
     )
     pipeline = preprocessor.Preprocessor(config)

     sentences = [
         'Так говорила в июле 1805 года известная',
         '— Как можно быть здоровой... когда нравственно страдаешь?',
         'праздник отменен, Je vous avoue que toutes ces fêtes',
     ]

     first_line = ['говорить_VERB', 'июль_NOUN', 'год_NOUN', 'известный_ADJ']
     second_line = ['здоровый_ADJ', 'нравственно_ADV', 'страдать_VERB']
     # Known case where lemmatization fails: the expectation pins the
     # current output for 'отменен' rather than the ideal lemma.
     third_line = ['праздник_NOUN', 'отменный_ADJ']
     expected = [first_line, second_line, third_line]

     actual = pipeline.fit(sentences).transform(sentences)
     for actual_line, wanted_line in zip(actual, expected):
         assert compare(actual_line, wanted_line), \
             'failed with {} and {}'.format(actual_line, wanted_line)
     print('test 1 passed')
Ejemplo n.º 7
0
 def __init__(self):
     """Create the two morphology backends used by this object.

     ``self.morph`` is a pymorphy2 ``MorphAnalyzer``; ``self.analyzer`` is a
     maru analyzer built with the ``'crf'`` tagger and the ``'pymorphy'``
     lemmatizer.
     """
     morph = pymorphy2.MorphAnalyzer()
     self.morph = morph

     analyzer_options = {'tagger': 'crf', 'lemmatizer': 'pymorphy'}
     self.analyzer = maru.get_analyzer(**analyzer_options)
Ejemplo n.º 8
0
import sys
import re
import maru

# https://github.com/chomechome/maru

# Analyzer with the RNN tagger and the pymorphy lemmatizer, built once and
# reused for every input line.
analyzer = maru.get_analyzer(tagger="rnn", lemmatizer="pymorphy")

# Read whitespace-separated tokens from stdin, one sentence per line, and
# print the tokens, lemmas and tags for each line.
for line in sys.stdin:

    # split() with no argument collapses runs of whitespace and returns an
    # empty list for a blank line, so no empty-string tokens reach the
    # analyzer (split(" ") produced '' for blank lines and double spaces).
    tokens = line.split()

    # Materialize the result once; it is read three times below. Blank lines
    # skip the analyzer but still produce an output group, so output stays
    # aligned with the input line by line.
    analyzed = list(analyzer.analyze(tokens)) if tokens else []

    words = [morph.word for morph in analyzed]
    lemmas = [morph.lemma for morph in analyzed]
    tags = [morph.tag for morph in analyzed]

    print("tokens =", tokens)
    print("lemmas =", lemmas)
    print("tags =", tags)