Example #1
	def test(self):
		lemmatizer = rulemma.Lemmatizer()
		lemmatizer.load()

		tokenizer = rutokenizer.Tokenizer()
		tokenizer.load()

		tagger = rupostagger.RuPosTagger()
		tagger.load()

		sent = u'Мяукая, голодные кошки ловят жирненьких хрюнделей'
		tokens = tokenizer.tokenize(sent)
		tags = tagger.tag(tokens)
		lemmas = lemmatizer.lemmatize(tags)
		for word, tags, lemma, *_ in lemmas:
			print(u'{:15}\t{:15}\t{}'.format(word, lemma, tags))
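The tuples yielded by lemmatize() carry the surface form, the tag string and the lemma in that order, so a reusable helper can rebuild the lemmatized sentence by joining the third field. The function below is a sketch based only on the calls used above; the name lemmatize_text is an assumption, not part of the original test.

def lemmatize_text(text, tokenizer, tagger, lemmatizer):
    # Same pipeline as in the test: tokenize, tag, lemmatize, then join the
    # lemma field of each (word, tags, lemma, ...) tuple.
    tokens = tokenizer.tokenize(text)
    tags = tagger.tag(tokens)
    lemmas = lemmatizer.lemmatize(tags)
    return u' '.join(lemma for _word, _tags, lemma, *_ in lemmas)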
Example #2
    def __init__(self, stopwordsList=None, lang='russian', *args, **kwargs):
        nltk.download("stopwords")
        #nltk.download("punkt")
        self.mystem = Mystem()
        self.useLemmas = False

        if lang == 'russian':
            self.lemmatizer = rulemma.Lemmatizer()
            self.lemmatizer.load()

            self.tokenizer = rutokenizer.Tokenizer()
            self.tokenizer.load()

            self.tagger = rupostagger.RuPosTagger()
            self.tagger.load()
        else:
            self.lemmatizer = WordNetLemmatizer()

        alphabet = []
        self.language = lang

        self.tag_dict = {
            "J": wordnet.ADJ,
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "R": wordnet.ADV
        }

        if lang == 'russian':
            self.stopwords = stopwords.words("russian")
            alphabet = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
        else:
            self.stopwords = stopwords.words('english')
            alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
        self.stopwords.extend(list(alphabet))

        if stopwordsList is not None:
            self.stopwords.extend(stopwordsList)
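The constructor only loads the components. A companion method like the one sketched below shows how the loaded tokenizer, tagger, lemmatizer and stop-word list would typically be combined; the name preprocess and the simplified English branch are assumptions, not code from the original class.

    def preprocess(self, text):
        # Sketch of a companion method (hypothetical, not in the original class):
        # lemmatize the text and drop stop words using the objects built in __init__.
        if self.language == 'russian':
            # Same tokenize -> tag -> lemmatize pipeline as in Example #1.
            tokens = self.tokenizer.tokenize(text)
            tags = self.tagger.tag(tokens)
            lemmas = self.lemmatizer.lemmatize(tags)
            words = [lemma for _word, _tags, lemma, *_ in lemmas]
        else:
            # Simplified English branch: WordNet lemmatization with the default
            # noun POS; a fuller version would combine self.tag_dict with a POS tagger.
            words = [self.lemmatizer.lemmatize(w) for w in text.lower().split()]
        return [w for w in words if w not in self.stopwords]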
Example #3
import operator
import rutokenizer
import rupostagger

import rulemma

if __name__ == '__main__':
    print('Loading dictionaries and models...')
    lemmatizer = rulemma.Lemmatizer()
    lemmatizer.load('../tmp/rulemma.dat')

    tokenizer = rutokenizer.Tokenizer()
    tokenizer.load()

    tagger = rupostagger.RuPosTagger()
    tagger.load()
    print('Loading finished')

    sent = u'Мяукая, голодные кошки ловят жирненьких хрюнделей'
    tokens = tokenizer.tokenize(sent)
    tags = tagger.tag(tokens)
    lemmas = lemmatizer.lemmatize(tags)
    for word, tags, lemma, *_ in lemmas:
        print(u'{:15}\t{:15}\t{}'.format(word, lemma, tags))

    tests = [(u'я вижу хрюнделя', u'я видеть хрюндель'),
             (u'Мяукая, голодные кошки ловят жирненьких мышек',
              u'мяукать , голодный кошка ловить жирненький мышка'),
             (u'Мы спрашивали про уроки и оценки',
              u'я спрашивать про урок и оценка'),
             (u'Куда же улетели облачка?', u'куда же улететь облачко ?')]
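The tests list pairs input sentences with their expected lemmatization, and the excerpt stops there. A check loop along the following lines (a sketch, not part of the excerpt) would push each sentence through the same pipeline and compare the joined lemmas with the expected string.

    # Assumed continuation: verify each test sentence against its expected lemmatization.
    for sent, expected in tests:
        tokens = tokenizer.tokenize(sent)
        tags = tagger.tag(tokens)
        lemmas = lemmatizer.lemmatize(tags)
        produced = u' '.join(lemma for _word, _tags, lemma, *_ in lemmas)
        print(u'{} -> {} (expected: {})'.format(sent, produced, expected))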
Example #4
def load_samples(data_folder):
    logging.info('Loading samples...')
    tokenizer = rutokenizer.Tokenizer()
    tokenizer.load()

    samples = []
    emitted = set()

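    # Syntactically invalid samples, labeled 0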
    with io.open(os.path.join(data_folder, 'invalid_syntax_dataset.txt'),
                 'r',
                 encoding='utf-8') as rdr:
        for line in rdr:
            if not line.startswith('#'):
                words = tokenizer.tokenize(line.strip().lower())
                if len(words) > 0:
                    words = remove_terminator(words)
                    key = u' '.join(words)
                    if key not in emitted:
                        samples.append((words, 0))
                        emitted.add(key)

    # A separate file holds the valid (though not always sensible) samples
    with io.open(os.path.join(data_folder, 'valid_syntax_dataset.txt'),
                 'r',
                 encoding='utf-8') as rdr:
        for line in rdr:
            if not line.startswith('#'):
                words = tokenizer.tokenize(line.strip().lower())
                if len(words) > 0:
                    words = remove_terminator(words)
                    key = u' '.join(words)
                    if key not in emitted:
                        samples.append((words, 1))
                        emitted.add(key)

    # We assume the N-gram text corpus contains good samples
    with io.open(os.path.join(data_folder, 'ngrams_corpus.txt'),
                 'r',
                 encoding='utf-8') as rdr:
        for line in rdr:
            if not line.startswith('#'):
                words = tokenizer.tokenize(line.strip().lower())
                if len(words) > 0:
                    words = remove_terminator(words)
                    key = u' '.join(words)
                    if key not in emitted:
                        samples.append((words, 0))
                        emitted.add(key)

    for inpath in ['paraphrases.txt', 'pqa_all.dat']:
        with io.open(os.path.join(data_folder, inpath), 'r',
                     encoding='utf-8') as rdr:
            for line in rdr:
                if not line.startswith('#'):
                    s = clean_input(line)
                    words = tokenizer.tokenize(s)
                    if len(words) > 0:
                        if u'ты' in words:
                            words = remove_terminator(words)
                            key = u' '.join(words)
                            if key not in emitted:
                                emitted.add(key)
                                samples.append((words, 1))

    print('sample count={}'.format(len(samples)))

    nb0 = sum((label == 0) for (words, label) in samples)
    nb1 = sum((label == 1) for (words, label) in samples)
    print('nb0={} nb1={}'.format(nb0, nb1))

    max_wordseq_len = max(len(words) for (words, label) in samples)
    print('max_wordseq_len={}'.format(max_wordseq_len))

    return samples
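load_samples returns a list of (words, label) pairs. A caller would typically split them into token sequences and labels before vectorization; the snippet below is an illustrative sketch, and the variable names and placeholder path are assumptions, not taken from the original script.

# Illustrative caller (not from the original script): separate token sequences
# and labels from the (words, label) pairs produced by load_samples.
samples = load_samples(data_folder)  # data_folder is a placeholder path
word_seqs = [words for words, _label in samples]
labels = [label for _words, label in samples]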
Example #5
    def __init__(self):
        import rutokenizer

        self.tokenizer = rutokenizer.Tokenizer()
        self.tokenizer.load()
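A thin wrapper method is the usual counterpart to this constructor; the sketch below simply delegates to the loaded rutokenizer instance, and the method name is an assumption rather than code from the original class.

    def tokenize(self, text):
        # Delegate to the rutokenizer.Tokenizer instance loaded in __init__.
        return self.tokenizer.tokenize(text)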