import maru
from http.server import HTTPServer


def main():
    try:
        log('Loading tagger: %s, lemmatizer: %s', TAGGER, LEMMATIZER)
        global ANALYZER
        ANALYZER = maru.get_analyzer(tagger=TAGGER, lemmatizer=LEMMATIZER)
    except Exception as error:
        log('Cannot load analyzer: "%s"', error)
        return

    server = HTTPServer((HOST, PORT), HTTPHandler)
    try:
        log('Listening http://%s:%d', HOST, PORT)
        server.serve_forever()
    except KeyboardInterrupt:
        log('Quitting')
    finally:
        server.server_close()

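# A minimal sketch of the module-level names main() relies on; every value
# and the handler below are assumptions, not the original module's code:
import json
import logging
from http.server import BaseHTTPRequestHandler

TAGGER = 'linear'        # hypothetical default tagger
LEMMATIZER = 'pymorphy'  # hypothetical default lemmatizer
HOST, PORT = 'localhost', 8080  # hypothetical bind address
ANALYZER = None          # populated by main()

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__).info


class HTTPHandler(BaseHTTPRequestHandler):
    # Hypothetical handler: treat the POST body as whitespace-separated
    # tokens and answer with their lemmas and tags as JSON.
    def do_POST(self):
        length = int(self.headers.get('Content-Length', 0))
        tokens = self.rfile.read(length).decode('utf-8').split()
        result = [{'word': m.word, 'lemma': m.lemma, 'tag': str(m.tag)}
                  for m in ANALYZER.analyze(tokens)]
        body = json.dumps(result, ensure_ascii=False).encode('utf-8')
        self.send_response(200)
        self.send_header('Content-Type', 'application/json')
        self.end_headers()
        self.wfile.write(body)
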
def FitWordNetFeature(self, train_indices, num_clues, language="ru",
                      verbose=False):
    morph = pymorphy2.MorphAnalyzer()
    all_clue_words = {}
    pos_classes = self.fulldata_words['target_pos_label'].unique()
    fulldata_words_train = self.fulldata_words[
        self.fulldata_words['sentence_num'].isin(train_indices)]
    analyzer = maru.get_analyzer(tagger='linear', lemmatizer='dummy')

    def get_pos(q):
        # Universal POS tag of a single token, lower-cased.
        analyzed = analyzer.analyze([q])
        return list(analyzed)[0].tag.pos.value.casefold()

    for pos_class in pos_classes:
        all_syn = []
        class_words = fulldata_words_train[
            fulldata_words_train['target_pos_label'] == pos_class]
        class_words = class_words[class_words['token'] != self.target_word]
        class_words = class_words[class_words['POS1'].isin(
            ['noun', 'verb', 'adj', 'adv'])]
        for s in class_words['synset'].values:
            for w in s:
                for r in w.get_words():
                    word = r.definition().split('~')[0]
                    pos = get_pos(word)
                    if pos in ['noun', 'verb', 'adj']:
                        all_syn.append(morph.parse(word)[0].normal_form)
                        if verbose:
                            print(word, pos, morph.parse(word)[0].normal_form)
        # Rank candidate clue words for this POS class by frequency.
        clue_words = pd.Series(all_syn).value_counts().index.values
        all_clue_words[pos_class] = clue_words

    # Keep only clue words unique to a single POS class. Note the dict is
    # filtered in place, so earlier classes are already pruned by the time
    # later ones are compared against them.
    for key in all_clue_words:
        other_words = {
            x for k in all_clue_words if k != key for x in all_clue_words[k]
        }
        all_clue_words[key] = [
            y for y in all_clue_words[key] if y not in other_words
        ]

    # Keep the top `num_clues` clues per class.
    all_clue_words = {k: v[:num_clues] for k, v in all_clue_words.items()}
    self.all_clue_words = all_clue_words
    if verbose:
        print(all_clue_words)
    return all_clue_words

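# Hypothetical usage sketch (the names here are assumptions, not part of the
# source): the owning object is expected to carry a `fulldata_words`
# DataFrame with 'sentence_num', 'token', 'target_pos_label', 'POS1' and
# 'synset' columns, filled before fitting.
#
#   extractor.CreatePosCorpus(language='ru')
#   clues = extractor.FitWordNetFeature(train_indices=train_ids,
#                                       num_clues=30)
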
def CreatePosCorpus(self, language='ru', verbose=False):
    if language == "ru":
        analyzer = maru.get_analyzer(tagger='linear', lemmatizer='dummy')

        def get_pos(q):
            # Universal POS tag of a single token, lower-cased.
            analyzed = analyzer.analyze([q])
            return list(analyzed)[0].tag.pos.value.casefold()

        self.fulldata_words['POS1'] = self.fulldata_words['token'].apply(
            get_pos)
    elif language == "en":
        raise NotImplementedError
    else:
        raise NotImplementedError
    return self.fulldata_words

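# Example of the per-token POS lookup defined above (maru emits Universal
# POS tags; the output shown is illustrative):
#
#   analyzer = maru.get_analyzer(tagger='linear', lemmatizer='dummy')
#   morph = next(iter(analyzer.analyze(['кошка'])))
#   morph.tag.pos.value.casefold()  # -> 'noun'
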
def test2():
    '''
    ===========================================================
    Following steps:
    1. Add numbers and punct
    2. No tokenizer
    3. Keep stop words
    4. POS tag
    ===========================================================
    '''
    print('running preprocessor test 2 ...')
    pattern = re.compile(r'[^а-яА-Я0-9,.!?;\- ё]')
    analyzer = maru.get_analyzer(tagger='linear')
    config = preprocessor.Config(regexp=pattern,
                                 stopwords=stopwords_set,
                                 analyzer=analyzer,
                                 with_pos_tag=True,
                                 remove_stop_words=False,
                                 lemmatize=True,
                                 tokenizer=None)
    pipeline = preprocessor.Preprocessor(config)
    case = [
        'Так говорила в июле 1805 года известная',
        '— Как можно быть здоровой... когда нравственно страдаешь?',
        'праздник отменен, Je vous avoue que toutes ces fêtes'
    ]
    expected = [[
        'так_ADV', 'говорить_VERB', 'в_ADP', 'июль_NOUN', '1805_NUM',
        'год_NOUN', 'известный_ADJ'
    ], [
        'как_CONJ', 'можно_ADJ', 'быть_VERB', 'здоровой..._CONJ',
        'когда_CONJ', 'нравственно_ADV', 'страдаешь?_PRON'
    ], ['праздник_NOUN', 'отменен,_VERB']]
    res = pipeline.fit(case).transform(case)
    for res_line, expected_line in zip(res, expected):
        assert compare(res_line, expected_line), \
            'failed with {} and {}'.format(res_line, expected_line)
    print('test 2 passed')

def test4():
    '''
    ===========================================================
    Following steps:
    1. Add numbers and punct
    2. Razdel tokenizer
    3. Keep stop words
    4. No POS tag
    5. No lemmatization
    ===========================================================
    '''
    print('running preprocessor test 4 ...')
    pattern = re.compile(r'[^а-яА-Я0-9,.!?;\- ё]')
    analyzer = maru.get_analyzer(tagger='linear')
    tokenizer = razdel
    config = preprocessor.Config(regexp=pattern,
                                 stopwords=stopwords_set,
                                 analyzer=analyzer,
                                 with_pos_tag=False,
                                 remove_stop_words=False,
                                 lemmatize=False,
                                 tokenizer=tokenizer)
    pipeline = preprocessor.Preprocessor(config)
    case = [
        'Так говорила в июле 1805 года известная',
        '— Как можно быть здоровой... когда нравственно страдаешь?',
        'праздник отменен, Je vous avoue que toutes ces fêtes'
    ]
    expected = [[
        'так', 'говорила', 'в', 'июле', '1805', 'года', 'известная'
    ], [
        'как', 'можно', 'быть', 'здоровой', '...', 'когда', 'нравственно',
        'страдаешь', '?'
    ], ['праздник', 'отменен', ',']]
    res = pipeline.fit(case).transform(case)
    for res_line, expected_line in zip(res, expected):
        assert compare(res_line, expected_line), \
            'failed with {} and {}'.format(res_line, expected_line)
    print('test 4 passed')

def test1():
    '''
    ===========================================================
    Full house:
    1. Leave only alphabet characters
    2. Remove stop words
    3. Lemmatize and add POS tags
    ===========================================================
    '''
    print('running preprocessor test 1 ...')
    pattern = re.compile(r'[^а-яА-Я ё]')
    analyzer = maru.get_analyzer(tagger='linear')
    config = preprocessor.Config(regexp=pattern,
                                 stopwords=stopwords_set,
                                 analyzer=analyzer,
                                 with_pos_tag=True,
                                 remove_stop_words=True,
                                 lemmatize=True,
                                 tokenizer=None)
    pipeline = preprocessor.Preprocessor(config)
    case = [
        'Так говорила в июле 1805 года известная',
        '— Как можно быть здоровой... когда нравственно страдаешь?',
        'праздник отменен, Je vous avoue que toutes ces fêtes'
    ]
    expected = [
        ['говорить_VERB', 'июль_NOUN', 'год_NOUN', 'известный_ADJ'],
        ['здоровый_ADJ', 'нравственно_ADV', 'страдать_VERB'],
        # a case where lemmatization fails: 'отменен' comes out as the
        # adjective 'отменный' rather than a form of 'отменить'
        ['праздник_NOUN', 'отменный_ADJ']
    ]
    res = pipeline.fit(case).transform(case)
    for res_line, expected_line in zip(res, expected):
        assert compare(res_line, expected_line), \
            'failed with {} and {}'.format(res_line, expected_line)
    print('test 1 passed')

def __init__(self):
    self.morph = pymorphy2.MorphAnalyzer()
    self.analyzer = maru.get_analyzer(tagger='crf', lemmatizer='pymorphy')

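# Hypothetical companion method (an assumption, not part of the source
# class): maru supplies context-aware lemmas and tags, while the pymorphy2
# instance stays available for direct dictionary lookups.
def lemmatize(self, tokens):
    return [morph.lemma for morph in self.analyzer.analyze(tokens)]
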
import sys

import maru  # https://github.com/chomechome/maru

analyzer = maru.get_analyzer(tagger="rnn", lemmatizer="pymorphy")

for line in sys.stdin:
    tokens = line.strip().split(" ")
    # analyze() takes a sequence of tokens and yields one Morph
    # (word, lemma, tag) per token.
    lemmas = []
    tags = []
    for morph in analyzer.analyze(tokens):
        lemmas.append(morph.lemma)
        tags.append(morph.tag)

    print("tokens =", tokens)
    print("lemmas =", lemmas)
    print("tags =", tags)

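# Example run (input: one whitespace-tokenized sentence per line; the
# script name and the exact lemmas shown are illustrative, not verified
# output):
#
#   $ echo "мама мыла раму" | python tag_stdin.py
#   tokens = ['мама', 'мыла', 'раму']
#   lemmas = ['мама', 'мыть', 'рама']
#   tags = [...]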