def __init__(self):
    self.tokenizer = Tokenizer()
    self.tokenizer.load()
    #self.lemmatizer = Mystem()
    self.tagger = rupostagger.RuPosTagger()
    self.tagger.load()
    self.lemm = rulemma.Lemmatizer()
    self.lemm.load()
def __init__(self):
    self.tokenizer = Tokenizer()
    self.tokenizer.load()
    self.lexicon = Word2Lemmas()
    self.language_resources = LanguageResources()
    self.postagger = rupostagger.RuPosTagger()
    self.gg_dictionaries = GenerativeGrammarDictionaries()
    self.known_words = set()
    #self.lemmatizer = Mystem()
    self.lemmatizer = rulemma.Lemmatizer()
    self.lemmatizer.load()
def __init__(self):
    self.tokenizer = Tokenizer()
    self.tokenizer.load()
    self.lexicon = Word2Lemmas()
    self.language_resources = LanguageResources()
    self.postagger = rupostagger.RuPosTagger()
    self.chunker = ruchunker.Chunker()
    self.word2tags = ruword2tags.RuWord2Tags()
    self.flexer = ruword2tags.RuFlexer()
    self.syntan = None
    self.gg_dictionaries = GenerativeGrammarDictionaries()
    self.known_words = set()
    #self.lemmatizer = Mystem()
    self.lemmatizer = rulemma.Lemmatizer()
    self.word_embeddings = None
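The constructor above also wires in ruword2tags, whose usage does not appear in the snippet. A minimal lookup sketch, based on my reading of the ruword2tags README (treat the indexing API as an assumption, not as confirmed by this source):

import ruword2tags

word2tags = ruword2tags.RuWord2Tags()
word2tags.load()

# Each lookup yields one or more tag strings for the word form;
# u'кошки' ("cats") is ambiguous between nominative plural and genitive singular.
for i, tagset in enumerate(word2tags[u'кошки']):
    print('{} -> {}'.format(i, tagset))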
def test(self):
    lemmatizer = rulemma.Lemmatizer()
    lemmatizer.load()

    tokenizer = rutokenizer.Tokenizer()
    tokenizer.load()

    tagger = rupostagger.RuPosTagger()
    tagger.load()

    # "Meowing, the hungry cats catch plump khryundels" (nonsense word to exercise OOV handling)
    sent = u'Мяукая, голодные кошки ловят жирненьких хрюнделей'
    tokens = tokenizer.tokenize(sent)
    tags = tagger.tag(tokens)
    lemmas = lemmatizer.lemmatize(tags)
    # Each lemmatized item is a (word, tags, lemma, ...) tuple; note that the
    # loop variable `tags` shadows the tagger output above.
    for word, tags, lemma, *_ in lemmas:
        print(u'{:15}\t{:15}\t{}'.format(word, lemma, tags))
def __init__(self, stopwordsList=None, lang='russian', *args, **kwargs):
    nltk.download("stopwords")
    #nltk.download("punkt")
    self.mystem = Mystem()
    self.useLemmas = False
    if lang == 'russian':
        # Russian pipeline: rutokenizer + rupostagger + rulemma.
        self.lemmatizer = rulemma.Lemmatizer()
        self.lemmatizer.load()
        self.tokenizer = rutokenizer.Tokenizer()
        self.tokenizer.load()
        self.tagger = rupostagger.RuPosTagger()
        self.tagger.load()
    else:
        self.lemmatizer = WordNetLemmatizer()
    self.language = lang
    # Maps the first letter of an NLTK POS tag (e.g. 'JJ' -> 'J') to the
    # WordNet POS constant expected by WordNetLemmatizer.
    self.tag_dict = {"J": wordnet.ADJ,
                     "N": wordnet.NOUN,
                     "V": wordnet.VERB,
                     "R": wordnet.ADV}
    if lang == 'russian':
        self.stopwords = stopwords.words("russian")
        # Single letters are treated as stopwords; the original string omitted "Рр".
        alphabet = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
    else:
        self.stopwords = stopwords.words('english')
        alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    self.stopwords.extend(list(alphabet))
    if stopwordsList is not None:
        self.stopwords.extend(stopwordsList)
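The constructor above loads two divergent toolchains, but no method that consumes them appears in the snippet. A hypothetical lemmatize_tokens method (name and body are mine, not from the original) showing how each branch would plausibly be used:

def lemmatize_tokens(self, text):
    if self.language == 'russian':
        # rulemma pipeline, as in the other examples in this section.
        tokens = self.tokenizer.tokenize(text)
        tags = self.tagger.tag(tokens)
        return [lemma for word, tags, lemma, *_ in self.lemmatizer.lemmatize(tags)]
    else:
        # tag_dict translates the first letter of an NLTK POS tag into the
        # WordNet POS constant; nltk.pos_tag needs the
        # 'averaged_perceptron_tagger' data package to be downloaded.
        tagged = nltk.pos_tag(text.split())
        return [self.lemmatizer.lemmatize(word, self.tag_dict.get(tag[0], wordnet.NOUN))
                for word, tag in tagged]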
def vectorize_data(samples, vectorizer, params):
    labels = [s[2] for s in samples]
    y_data = np.asarray(labels)

    phrases1 = [s[0] for s in samples]
    phrases2 = [s[1] for s in samples]

    if params['nlp_transform'] == 'lemmatize':
        tagger = rupostagger.RuPosTagger()
        tagger.load()
        lemmatizer = rulemma.Lemmatizer()
        lemmatizer.load()
        # Lemmatize each distinct phrase once, then map the originals through the cache.
        all_phrases = list(set(phrases1) | set(phrases2))
        phrase2lemma = dict((phrase, lemmatize_phrase(phrase, tagger, lemmatizer))
                            for phrase in all_phrases)
        lphrases1 = [phrase2lemma[f] for f in phrases1]
        lphrases2 = [phrase2lemma[f] for f in phrases2]
        return vectorize_data2(lphrases1, lphrases2, vectorizer, params), y_data
    else:
        return vectorize_data2(phrases1, phrases2, vectorizer, params), y_data
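lemmatize_phrase is called above but not defined in this snippet. A minimal sketch consistent with the pipeline used in the surrounding examples (the whitespace tokenization is an assumption; the original may use rutokenizer instead):

def lemmatize_phrase(phrase, tagger, lemmatizer):
    # Tag the tokens, lemmatize, and glue the lemmas back into one string.
    tags = tagger.tag(phrase.split())
    lemmas = lemmatizer.lemmatize(tags)
    return u' '.join(lemma for word, tags, lemma, *_ in lemmas)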
import operator

import rutokenizer
import rupostagger
import rulemma


if __name__ == '__main__':
    print('Loading dictionaries and models...')
    lemmatizer = rulemma.Lemmatizer()
    lemmatizer.load('../tmp/rulemma.dat')

    tokenizer = rutokenizer.Tokenizer()
    tokenizer.load()

    tagger = rupostagger.RuPosTagger()
    tagger.load()
    print('Loading finished')

    # "Meowing, the hungry cats catch plump khryundels" (nonsense word to exercise OOV handling)
    sent = u'Мяукая, голодные кошки ловят жирненьких хрюнделей'
    tokens = tokenizer.tokenize(sent)
    tags = tagger.tag(tokens)
    lemmas = lemmatizer.lemmatize(tags)
    for word, tags, lemma, *_ in lemmas:
        print(u'{:15}\t{:15}\t{}'.format(word, lemma, tags))

    # (input sentence, expected lemmatization) pairs;
    # e.g. u'я вижу хрюнделя' = "I see a khryundel".
    tests = [(u'я вижу хрюнделя', u'я видеть хрюндель'),
             (u'Мяукая, голодные кошки ловят жирненьких мышек', u'мяукать , голодный кошка ловить жирненький мышка'),
             (u'Мы спрашивали про уроки и оценки', u'я спрашивать про урок и оценка'),
             (u'Куда же улетели облачка?', u'куда же улететь облачко ?')]
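The snippet ends with tests defined but unused. A plausible continuation (my assumption, not recovered from the original) would run each sentence through the same pipeline and compare the result against the expected lemmatization:

# Hypothetical check loop over the test pairs above (not part of the original snippet).
for sent, expected in tests:
    tokens = tokenizer.tokenize(sent)
    tags = tagger.tag(tokens)
    lemmas = lemmatizer.lemmatize(tags)
    produced = u' '.join(lemma for word, tags, lemma, *_ in lemmas)
    assert produced == expected, u'{} != {}'.format(produced, expected)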