def test(self):
    # Load the lemmatizer, tokenizer and POS tagger models.
    lemmatizer = rulemma.Lemmatizer()
    lemmatizer.load()

    tokenizer = rutokenizer.Tokenizer()
    tokenizer.load()

    tagger = rupostagger.RuPosTagger()
    tagger.load()

    # Tokenize, tag and lemmatize a sample sentence, then print word / lemma / tags.
    sent = u'Мяукая, голодные кошки ловят жирненьких хрюнделей'
    tokens = tokenizer.tokenize(sent)
    tags = tagger.tag(tokens)
    lemmas = lemmatizer.lemmatize(tags)
    for word, tags, lemma, *_ in lemmas:
        print(u'{:15}\t{:15}\t{}'.format(word, lemma, tags))
def __init__(self, stopwordsList=None, lang='russian', *args, **kwargs):
    nltk.download("stopwords")
    # nltk.download("punkt")
    self.mystem = Mystem()
    self.useLemmas = False

    # For Russian, use the rulemma/rutokenizer/rupostagger pipeline;
    # otherwise fall back to NLTK's WordNet lemmatizer.
    if lang == 'russian':
        self.lemmatizer = rulemma.Lemmatizer()
        self.lemmatizer.load()
        self.tokenizer = rutokenizer.Tokenizer()
        self.tokenizer.load()
        self.tagger = rupostagger.RuPosTagger()
        self.tagger.load()
    else:
        self.lemmatizer = WordNetLemmatizer()

    self.language = lang

    # Map Penn Treebank tag prefixes to WordNet POS constants.
    self.tag_dict = {"J": wordnet.ADJ,
                     "N": wordnet.NOUN,
                     "V": wordnet.VERB,
                     "R": wordnet.ADV}

    # Single letters of the alphabet are added to the stopword list as well.
    if lang == 'russian':
        self.stopwords = stopwords.words("russian")
        alphabet = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
    else:
        self.stopwords = stopwords.words('english')
        alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    self.stopwords.extend(list(alphabet))

    if stopwordsList is not None:
        self.stopwords.extend(stopwordsList)
import operator

import rutokenizer
import rupostagger
import rulemma


if __name__ == '__main__':
    print('Loading dictionaries and models...')
    lemmatizer = rulemma.Lemmatizer()
    lemmatizer.load('../tmp/rulemma.dat')

    tokenizer = rutokenizer.Tokenizer()
    tokenizer.load()

    tagger = rupostagger.RuPosTagger()
    tagger.load()
    print('Loading finished')

    # Tokenize, POS-tag and lemmatize a sample sentence, then print word / lemma / tags.
    sent = u'Мяукая, голодные кошки ловят жирненьких хрюнделей'
    tokens = tokenizer.tokenize(sent)
    tags = tagger.tag(tokens)
    lemmas = lemmatizer.lemmatize(tags)
    for word, tags, lemma, *_ in lemmas:
        print(u'{:15}\t{:15}\t{}'.format(word, lemma, tags))

    # Pairs of (input sentence, expected lemmatized output).
    tests = [(u'я вижу хрюнделя', u'я видеть хрюндель'),
             (u'Мяукая, голодные кошки ловят жирненьких мышек', u'мяукать , голодный кошка ловить жирненький мышка'),
             (u'Мы спрашивали про уроки и оценки', u'я спрашивать про урок и оценка'),
             (u'Куда же улетели облачка?', u'куда же улететь облачко ?')]
def load_samples(data_folder):
    logging.info('Loading samples...')
    tokenizer = rutokenizer.Tokenizer()
    tokenizer.load()

    samples = []
    emitted = set()

    # Samples with invalid syntax get label 0.
    with io.open(os.path.join(data_folder, 'invalid_syntax_dataset.txt'), 'r', encoding='utf-8') as rdr:
        for line in rdr:
            if not line.startswith('#'):
                words = tokenizer.tokenize(line.strip().lower())
                if len(words) > 0:
                    words = remove_terminator(words)
                    key = u' '.join(words)
                    if key not in emitted:
                        samples.append((words, 0))
                        emitted.add(key)

    # A separate file holds valid (though not always sensible) samples.
    with io.open(os.path.join(data_folder, 'valid_syntax_dataset.txt'), 'r', encoding='utf-8') as rdr:
        for line in rdr:
            if not line.startswith('#'):
                words = tokenizer.tokenize(line.strip().lower())
                if len(words) > 0:
                    words = remove_terminator(words)
                    key = u' '.join(words)
                    if key not in emitted:
                        samples.append((words, 1))
                        emitted.add(key)

    # We assume the text corpus used for the N-grams contains good samples.
    with io.open(os.path.join(data_folder, 'ngrams_corpus.txt'), 'r', encoding='utf-8') as rdr:
        for line in rdr:
            if not line.startswith('#'):
                words = tokenizer.tokenize(line.strip().lower())
                if len(words) > 0:
                    words = remove_terminator(words)
                    key = u' '.join(words)
                    if key not in emitted:
                        samples.append((words, 0))
                        emitted.add(key)

    for inpath in ['paraphrases.txt', 'pqa_all.dat']:
        with io.open(os.path.join(data_folder, inpath), 'r', encoding='utf-8') as rdr:
            for line in rdr:
                if not line.startswith('#'):
                    s = clean_input(line)
                    words = tokenizer.tokenize(s)
                    if len(words) > 0:
                        if u'ты' in words:
                            words = remove_terminator(words)
                            key = u' '.join(words)
                            if key not in emitted:
                                emitted.add(key)
                                samples.append((words, 1))

    print('sample count={}'.format(len(samples)))

    # Class balance and maximum sentence length, for information.
    nb0 = sum((label == 0) for (words, label) in samples)
    nb1 = sum((label == 1) for (words, label) in samples)
    print('nb0={} nb1={}'.format(nb0, nb1))

    max_wordseq_len = max(len(words) for (words, label) in samples)
    print('max_wordseq_len={}'.format(max_wordseq_len))

    return samples
def __init__(self):
    import rutokenizer
    self.tokenizer = rutokenizer.Tokenizer()
    self.tokenizer.load()