Example #1
def __init__(self):
    # Load the tokenizer, POS tagger and lemmatizer models
    self.tokenizer = Tokenizer()
    self.tokenizer.load()
    #self.lemmatizer = Mystem()
    self.tagger = rupostagger.RuPosTagger()
    self.tagger.load()
    self.lemm = rulemma.Lemmatizer()
    self.lemm.load()
Example #2
def __init__(self):
    # Tokenizer plus lexicon, language resources and grammar dictionaries
    self.tokenizer = Tokenizer()
    self.tokenizer.load()
    self.lexicon = Word2Lemmas()
    self.language_resources = LanguageResources()
    self.postagger = rupostagger.RuPosTagger()
    self.gg_dictionaries = GenerativeGrammarDictionaries()
    self.known_words = set()
    #self.lemmatizer = Mystem()
    self.lemmatizer = rulemma.Lemmatizer()
    self.lemmatizer.load()
Example #3
def __init__(self):
    # Full NLP stack: tokenizer, lexicon, POS tagger, chunker and word/tag dictionaries
    self.tokenizer = Tokenizer()
    self.tokenizer.load()
    self.lexicon = Word2Lemmas()
    self.language_resources = LanguageResources()
    self.postagger = rupostagger.RuPosTagger()
    self.chunker = ruchunker.Chunker()
    self.word2tags = ruword2tags.RuWord2Tags()
    self.flexer = ruword2tags.RuFlexer()
    self.syntan = None
    self.gg_dictionaries = GenerativeGrammarDictionaries()
    self.known_words = set()
    #self.lemmatizer = Mystem()
    self.lemmatizer = rulemma.Lemmatizer()
    self.word_embeddings = None
Example #4
def test(self):
    # Load the lemmatizer, tokenizer and POS tagger models
    lemmatizer = rulemma.Lemmatizer()
    lemmatizer.load()

    tokenizer = rutokenizer.Tokenizer()
    tokenizer.load()

    tagger = rupostagger.RuPosTagger()
    tagger.load()

    # Tokenize, POS-tag and lemmatize a sample sentence
    sent = u'Мяукая, голодные кошки ловят жирненьких хрюнделей'
    tokens = tokenizer.tokenize(sent)
    tags = tagger.tag(tokens)
    lemmas = lemmatizer.lemmatize(tags)
    for word, tags, lemma, *_ in lemmas:
        print(u'{:15}\t{:15}\t{}'.format(word, lemma, tags))
Example #5
def __init__(self, stopwordsList=None, lang='russian', *args, **kwargs):
    nltk.download("stopwords")
    #nltk.download("punkt")
    self.mystem = Mystem()
    self.useLemmas = False

    if lang == 'russian':
        # Russian pipeline: rulemma lemmatizer, rutokenizer and rupostagger
        self.lemmatizer = rulemma.Lemmatizer()
        self.lemmatizer.load()

        self.tokenizer = rutokenizer.Tokenizer()
        self.tokenizer.load()

        self.tagger = rupostagger.RuPosTagger()
        self.tagger.load()
    else:
        # English pipeline: NLTK WordNet lemmatizer
        self.lemmatizer = WordNetLemmatizer()

    alphabet = []
    self.language = lang

    # Map the first letter of a Penn Treebank POS tag to a WordNet POS constant
    self.tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV
    }

    if lang == 'russian':
        self.stopwords = stopwords.words("russian")
        alphabet = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
    else:
        self.stopwords = stopwords.words('english')
        alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # Treat single letters as stopwords as well
    self.stopwords.extend(list(alphabet))

    if stopwordsList is not None:
        self.stopwords.extend(stopwordsList)
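The tag_dict above suggests the usual NLTK pattern for the English branch: POS-tag a word, map the first letter of its Penn Treebank tag to a WordNet constant, and pass that constant to WordNetLemmatizer. A minimal sketch of that pattern follows; the helper name lemmatize_en is hypothetical and not part of the original class, and nltk.pos_tag additionally requires the averaged_perceptron_tagger data.

import nltk
from nltk.corpus import wordnet

def lemmatize_en(self, word):
    # Hypothetical helper: take the first letter of the Penn Treebank tag, e.g. 'VBZ' -> 'V'
    tag = nltk.pos_tag([word])[0][1][0].upper()
    # Fall back to NOUN when the tag letter is not in tag_dict
    pos = self.tag_dict.get(tag, wordnet.NOUN)
    return self.lemmatizer.lemmatize(word, pos)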
Example #6
def vectorize_data(samples, vectorizer, params):
    # Each sample is a (phrase1, phrase2, label) triple
    labels = [s[2] for s in samples]
    y_data = np.asarray(labels)

    phrases1 = [s[0] for s in samples]
    phrases2 = [s[1] for s in samples]

    if params['nlp_transform'] == 'lemmatize':
        # Lemmatize every distinct phrase once before vectorization
        tagger = rupostagger.RuPosTagger()
        tagger.load()

        lemmatizer = rulemma.Lemmatizer()
        lemmatizer.load()

        all_phrases = list(set(phrases1) | set(phrases2))
        phrase2lemma = dict(
            (phrase, lemmatize_phrase(phrase, tagger, lemmatizer))
            for phrase in all_phrases)
        lphrases1 = [phrase2lemma[f] for f in phrases1]
        lphrases2 = [phrase2lemma[f] for f in phrases2]
        return vectorize_data2(lphrases1, lphrases2, vectorizer,
                               params), y_data
    else:
        return vectorize_data2(phrases1, phrases2, vectorizer, params), y_data
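Example #6 calls a lemmatize_phrase helper that is not part of the excerpt. Based on the tokenize, tag and lemmatize pipeline shown in Example #4, a plausible sketch looks like this; the tokenizer setup is an assumption, and the original project may tokenize differently.

import rutokenizer

def lemmatize_phrase(phrase, tagger, lemmatizer):
    # Tokenize, POS-tag and lemmatize the phrase, then join the lemmas back into one string
    tokenizer = rutokenizer.Tokenizer()
    tokenizer.load()
    tokens = tokenizer.tokenize(phrase)
    tags = tagger.tag(tokens)
    lemmas = lemmatizer.lemmatize(tags)
    return u' '.join(lemma for word, tags, lemma, *_ in lemmas)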
Example #7
import operator
import rutokenizer
import rupostagger

import rulemma

if __name__ == '__main__':
    print('Loading dictionaries and models...')
    lemmatizer = rulemma.Lemmatizer()
    lemmatizer.load('../tmp/rulemma.dat')

    tokenizer = rutokenizer.Tokenizer()
    tokenizer.load()

    tagger = rupostagger.RuPosTagger()
    tagger.load()
    print('Loading finished')

    sent = u'Мяукая, голодные кошки ловят жирненьких хрюнделей'
    tokens = tokenizer.tokenize(sent)
    tags = tagger.tag(tokens)
    lemmas = lemmatizer.lemmatize(tags)
    for word, tags, lemma, *_ in lemmas:
        print(u'{:15}\t{:15}\t{}'.format(word, lemma, tags))

    # (input sentence, expected lemmatized form) pairs
    tests = [(u'я вижу хрюнделя', u'я видеть хрюндель'),
             (u'Мяукая, голодные кошки ловят жирненьких мышек',
              u'мяукать , голодный кошка ловить жирненький мышка'),
             (u'Мы спрашивали про уроки и оценки',
              u'я спрашивать про урок и оценка'),
             (u'Куда же улетели облачка?', u'куда же улететь облачко ?')]
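
A plausible continuation, not shown in the excerpt, would check each pair with the objects loaded above, assuming the (word, tags, lemma, ...) tuple format used earlier:

    for sent, required in tests:
        # Run the same tokenize -> tag -> lemmatize pipeline and compare with the expected string
        tokens = tokenizer.tokenize(sent)
        tags = tagger.tag(tokens)
        lemmas = lemmatizer.lemmatize(tags)
        produced = u' '.join(lemma for word, tags, lemma, *_ in lemmas)
        assert produced == required, u'{} != {}'.format(produced, required)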