Exemple #1
0
 def __init__(self):
     """Build the text-processing pipeline: tokenizer, POS tagger, lemmatizer.

     Each component is constructed and then load()-ed immediately, so the
     instance is ready to use as soon as __init__ returns.
     """
     tokenizer = Tokenizer()
     tokenizer.load()
     self.tokenizer = tokenizer

     tagger = rupostagger.RuPosTagger()
     tagger.load()
     self.tagger = tagger

     lemm = rulemma.Lemmatizer()
     lemm.load()
     self.lemm = lemm
Exemple #2
0
 def __init__(self):
     """Set up tokenization, lemmatization and tagging resources.

     Only the tokenizer is load()-ed here; the other components are
     constructed but left for later initialization by the caller.
     """
     tokenizer = Tokenizer()
     tokenizer.load()
     self.tokenizer = tokenizer

     self.lemmatizer = Mystem()
     self.lexicon = Word2Lemmas()
     self.language_resources = LanguageResources()
     self.postagger = rupostagger.RuPosTagger()
     self.gg_dictionaries = GenerativeGrammarDictionaries()

     # Vocabulary accumulated at runtime; starts empty.
     self.known_words = set()
Exemple #3
0
def test_posstager():
    """Smoke-test RuPosTagger: tag a two-word phrase and print the labels."""
    import rupostagger

    tagger = rupostagger.RuPosTagger()
    tagger.load()
    tagged = tagger.tag(u'кошки спят'.split())
    for token, label in tagged:
        print(u'{} -> {}'.format(token, label))


#test_posstager()
Exemple #4
0
 def __init__(self):
     """Create every NLP pipeline component used by this engine.

     The tokenizer is load()-ed immediately; syntactic analyzer and word
     embeddings are left as None and must be attached by the caller.
     """
     tokenizer = Tokenizer()
     tokenizer.load()
     self.tokenizer = tokenizer

     self.lexicon = Word2Lemmas()
     self.language_resources = LanguageResources()
     self.postagger = rupostagger.RuPosTagger()
     self.chunker = ruchunker.Chunker()
     self.word2tags = ruword2tags.RuWord2Tags()
     self.flexer = ruword2tags.RuFlexer()
     self.syntan = None
     self.gg_dictionaries = GenerativeGrammarDictionaries()

     # Vocabulary accumulated at runtime; starts empty.
     self.known_words = set()
     self.lemmatizer = rulemma.Lemmatizer()
     self.word_embeddings = None
Exemple #5
0
	def test(self):
		"""Run tokenize -> tag -> lemmatize on a sample sentence and print it."""
		lemmatizer = rulemma.Lemmatizer()
		lemmatizer.load()

		tokenizer = rutokenizer.Tokenizer()
		tokenizer.load()

		tagger = rupostagger.RuPosTagger()
		tagger.load()

		sample = u'Мяукая, голодные кошки ловят жирненьких хрюнделей'
		tagged = tagger.tag(tokenizer.tokenize(sample))
		# lemmatize() yields (word, tags, lemma, ...) tuples; trailing fields ignored.
		for token, pos_tags, lemma, *_ in lemmatizer.lemmatize(tagged):
			print(u'{:15}\t{:15}\t{}'.format(token, lemma, pos_tags))
Exemple #6
0
def main():
    """Load the global POS tagger, then serve HTTP requests until interrupted."""
    global TAGGER
    try:
        log('Loading tagger')
        TAGGER = rupostagger.RuPosTagger()
        TAGGER.load()
    except Exception as error:
        # Without a tagger the service cannot do anything useful: report and bail.
        log('Can not load tagger: "%s"', error)
        return

    httpd = HTTPServer((HOST, PORT), HTTPHandler)
    try:
        log('Listening http://%s:%d', HOST, PORT)
        httpd.serve_forever()
    except KeyboardInterrupt:
        log('Quiting')
    finally:
        # Always release the listening socket, even on Ctrl-C.
        httpd.server_close()
Exemple #7
0
    def load(self, model_dir=None):
        """Load the chunker configuration, optional resources and CRF model.

        When *model_dir* is None, a sibling '../tmp' directory next to this
        module is preferred; if it does not exist, the module's own folder
        is used instead.
        """
        if model_dir is None:
            here = str(pathlib.Path(__file__).resolve().parent)
            candidate = os.path.join(here, '../tmp')
            model_dir = candidate if os.path.exists(candidate) else here

        self.chunker_params = ChunkerCrfParams.load(
            os.path.join(model_dir, 'chunker_NP.config'))

        # Optional helper resources, switched on by flags in the config.
        if self.chunker_params.use_gren:
            self.word2tags = ruword2tags.RuWord2Tags()
            self.word2tags.load()

        if self.chunker_params.use_postagger:
            self.postagger = rupostagger.RuPosTagger()
            self.postagger.load()

        self.crf_tagger = pycrfsuite.Tagger()
        model_path = os.path.join(model_dir, self.chunker_params.model_filename)
        self.crf_tagger.open(model_path)
Exemple #8
0
    def __init__(self, stopwordsList=None, lang='russian', *args, **kwargs):
        """Prepare language-specific lemmatization, tagging and stopword lists.

        Parameters
        ----------
        stopwordsList : iterable of str, optional
            Extra stopwords appended to the language's default NLTK list.
        lang : str
            'russian' selects the rulemma/rutokenizer/rupostagger pipeline;
            any other value falls back to NLTK's WordNet lemmatizer.
        """
        nltk.download("stopwords")
        # nltk.download("punkt")  # enable if sentence tokenization is needed
        self.mystem = Mystem()
        self.useLemmas = False
        self.language = lang

        # Map Penn-style tag prefixes to WordNet POS constants (English path).
        self.tag_dict = {
            "J": wordnet.ADJ,
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "R": wordnet.ADV
        }

        # Single merged branch: the original checked lang == 'russian' twice.
        if lang == 'russian':
            self.lemmatizer = rulemma.Lemmatizer()
            self.lemmatizer.load()

            self.tokenizer = rutokenizer.Tokenizer()
            self.tokenizer.load()

            self.tagger = rupostagger.RuPosTagger()
            self.tagger.load()

            self.stopwords = stopwords.words("russian")
            # Fix: the original constant omitted the letters "Рр".
            alphabet = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
        else:
            self.lemmatizer = WordNetLemmatizer()
            self.stopwords = stopwords.words('english')
            alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

        # Treat bare single letters as stopwords too.
        self.stopwords.extend(list(alphabet))

        if stopwordsList is not None:
            self.stopwords.extend(stopwordsList)
def vectorize_data(samples, vectorizer, params):
    """Turn (phrase1, phrase2, label) samples into feature matrices and labels.

    When params['nlp_transform'] == 'lemmatize', both phrase columns are
    lemmatized (each distinct phrase only once) before vectorization.
    """
    y_data = np.asarray([sample[2] for sample in samples])
    phrases1 = [sample[0] for sample in samples]
    phrases2 = [sample[1] for sample in samples]

    if params['nlp_transform'] != 'lemmatize':
        return vectorize_data2(phrases1, phrases2, vectorizer, params), y_data

    tagger = rupostagger.RuPosTagger()
    tagger.load()

    lemmatizer = rulemma.Lemmatizer()
    lemmatizer.load()

    # Lemmatize each distinct phrase once, then map both columns through it.
    phrase2lemma = {
        phrase: lemmatize_phrase(phrase, tagger, lemmatizer)
        for phrase in set(phrases1) | set(phrases2)
    }
    lemmas1 = [phrase2lemma[phrase] for phrase in phrases1]
    lemmas2 = [phrase2lemma[phrase] for phrase in phrases2]
    return vectorize_data2(lemmas1, lemmas2, vectorizer, params), y_data
Exemple #10
0
import operator
import rutokenizer
import rupostagger

import rulemma

if __name__ == '__main__':
    # Load the three pipeline stages: lemmatizer (from an explicit model
    # file), tokenizer and POS tagger.
    print('Loading dictionaries and models...')
    lemmatizer = rulemma.Lemmatizer()
    lemmatizer.load('../tmp/rulemma.dat')

    tokenizer = rutokenizer.Tokenizer()
    tokenizer.load()

    tagger = rupostagger.RuPosTagger()
    tagger.load()
    print('Loading finished')

    # Demo: tokenize -> tag -> lemmatize a sample sentence and print a table.
    sent = u'Мяукая, голодные кошки ловят жирненьких хрюнделей'
    tokens = tokenizer.tokenize(sent)
    tags = tagger.tag(tokens)
    lemmas = lemmatizer.lemmatize(tags)
    # NOTE(review): the loop variable `tags` shadows the tagger output above;
    # harmless here because `tags` is not reused afterwards.
    for word, tags, lemma, *_ in lemmas:
        print(u'{:15}\t{:15}\t{}'.format(word, lemma, tags))

    # (sentence, expected lemmatization) pairs; presumably consumed by code
    # past the end of this excerpt — not used in the visible lines.
    tests = [(u'я вижу хрюнделя', u'я видеть хрюндель'),
             (u'Мяукая, голодные кошки ловят жирненьких мышек',
              u'мяукать , голодный кошка ловить жирненький мышка'),
             (u'Мы спрашивали про уроки и оценки',
              u'я спрашивать про урок и оценка'),
             (u'Куда же улетели облачка?', u'куда же улететь облачко ?')]
Exemple #11
0
    def __init__(self):
        # RNN-based morphology predictor configured for Russian.
        self.predictor = RNNMorphPredictor(language="ru")

        # CRF POS tagger; load() is called here so the instance is
        # ready to tag immediately after construction.
        self.tagger = rupostagger.RuPosTagger()
        self.tagger.load()