Example #1
def __init__(self):
    # Word tokenizer for Russian text
    self.tokenizer = Tokenizer()
    self.tokenizer.load()
    #self.lemmatizer = Mystem()
    # Part-of-speech tagger and dictionary-based lemmatizer
    self.tagger = rupostagger.RuPosTagger()
    self.tagger.load()
    self.lemm = rulemma.Lemmatizer()
    self.lemm.load()
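The three components loaded above are normally chained as tokenize, then POS-tag, then lemmatize. A minimal standalone sketch, assuming Tokenizer is the rutokenizer tokenizer and that rupostagger and rulemma expose their usual tag() and lemmatize() methods:

import rutokenizer
import rupostagger
import rulemma

tokenizer = rutokenizer.Tokenizer()
tokenizer.load()
tagger = rupostagger.RuPosTagger()
tagger.load()
lemmatizer = rulemma.Lemmatizer()
lemmatizer.load()

tokens = tokenizer.tokenize(u'кошки спят на диване')  # split the phrase into words
tags = tagger.tag(tokens)                             # list of (word, POS tags) pairs
lemmas = lemmatizer.lemmatize(tags)                   # attaches a lemma to each tagged word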
Example #2
def __init__(self):
    self.tokenizer = Tokenizer()
    self.tokenizer.load()
    self.lemmatizer = Mystem()
    self.lexicon = Word2Lemmas()
    self.language_resources = LanguageResources()
    self.postagger = rupostagger.RuPosTagger()
    self.gg_dictionaries = GenerativeGrammarDictionaries()
    self.known_words = set()
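Unlike Example #1, this constructor keeps the Mystem lemmatizer active. A brief usage sketch, assuming Mystem is the pymystem3 wrapper around Yandex Mystem:

from pymystem3 import Mystem

mystem = Mystem()
# lemmatize() returns lemmas interleaved with whitespace tokens; drop the blanks
lemmas = [t for t in mystem.lemmatize(u'кошки спят на диване') if t.strip()]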
Example #3
def __init__(self):
    # Tokenization and lexical resources
    self.tokenizer = Tokenizer()
    self.tokenizer.load()
    self.lexicon = Word2Lemmas()
    self.language_resources = LanguageResources()
    # Morphological and syntactic analysis components
    self.postagger = rupostagger.RuPosTagger()
    self.chunker = ruchunker.Chunker()
    self.word2tags = ruword2tags.RuWord2Tags()
    self.flexer = ruword2tags.RuFlexer()
    self.syntan = None
    self.gg_dictionaries = GenerativeGrammarDictionaries()
    self.known_words = set()
    #self.lemmatizer = Mystem()
    self.lemmatizer = rulemma.Lemmatizer()
    self.word_embeddings = None
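The morphological dictionary created here (note that this excerpt omits the load() calls for most components) is typically queried by word form. A sketch assuming the usual ruword2tags interface, where indexing by a word yields its possible morphological tag strings:

import ruword2tags

word2tags = ruword2tags.RuWord2Tags()
word2tags.load()

# Each result is one possible set of morphological tags for the word form
for tagset in word2tags[u'кошки']:
    print(tagset)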
Example #4
def load_dataset(params):
    tokenizer = Tokenizer()
    tokenizer.load()

    # The dataset must be prepared in advance by the script ./preparation/prepare_req_interpretation_classif.py
    df = pd.read_csv(os.path.join(data_folder,
                                  'req_interpretation_dataset.csv'),
                     sep='\t',
                     encoding='utf-8')
    samples = [
        Sample(row['text'], int(row['label'])) for i, row in df.iterrows()
    ]

    # Tokenize the samples
    for sample in samples:
        sample.words = tokenizer.tokenize(sample.phrase)

    nb_0 = sum(sample.y == 0 for sample in samples)
    nb_1 = sum(sample.y == 1 for sample in samples)
    logging.info('nb_0={} nb_1={}'.format(nb_0, nb_1))

    max_wordseq_len = max(len(sample.words) for sample in samples)
    logging.info('max_wordseq_len={}'.format(max_wordseq_len))

    if params['padding'] == 'left':
        for sample in samples:
            sample.words = lpad_wordseq(sample.words, max_wordseq_len)
    else:
        for sample in samples:
            sample.words = rpad_wordseq(sample.words, max_wordseq_len)

    computed_params = {
        'max_wordseq_len': max_wordseq_len,
        'nb_0': nb_0,
        'nb_1': nb_1
    }

    return samples, computed_params
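load_dataset relies on lpad_wordseq and rpad_wordseq helpers defined elsewhere in the project. A minimal sketch of what such padding helpers look like, assuming an empty-string padding token (the actual token used by the project may differ):

PAD_WORD = u''  # assumed padding token

def lpad_wordseq(words, n):
    """Left-pad the token list to length n."""
    return [PAD_WORD] * (n - len(words)) + list(words)

def rpad_wordseq(words, n):
    """Right-pad the token list to length n."""
    return list(words) + [PAD_WORD] * (n - len(words))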
Example #5
    line = line.replace(u'Q:', u'')
    line = line.replace(u'A:', u'')
    line = line.replace(u'\t', u' ')
    line = line.replace('.', ' ').replace('?', ' ').replace('!', ' ')
    line = line.replace('  ', ' ').strip().lower()
    line = ru_sanitize(line)
    return line


# ---------------------------------------------------------------

result_path = os.path.join(data_folder, 'premise_question_answer.csv')
pqa_yesno_path = os.path.join(data_folder, 'pqa_yes_no.dat')
pqa_all_path = os.path.join(data_folder, 'pqa_all.dat')

tokenizer = Tokenizer()
tokenizer.load()

records = []  # list of (premise, question, answer, is_handmade_pattern)

added_records_set = set()  # to prevent duplicates


def add_record(premise, question, answer, is_handmade):
    premise = premise.strip()
    question = question.strip()
    answer = answer.strip()

    if not premise or not question or not answer:
        print(
            u'ERROR empty phrase in: premise={} question={} answer={}'.format(
                premise, question, answer))
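The add_record function is cut off after the error message. Written purely as an assumption from the surrounding names (records, added_records_set, is_handmade), the remainder plausibly deduplicates and stores the triple; a hypothetical, self-contained sketch:

def add_record_sketch(premise, question, answer, is_handmade,
                      records, added_records_set):
    # Hypothetical continuation: skip empty phrases, then deduplicate and store.
    premise, question, answer = premise.strip(), question.strip(), answer.strip()
    if not premise or not question or not answer:
        return
    key = (premise, question, answer)
    if key not in added_records_set:
        added_records_set.add(key)
        records.append((premise, question, answer, is_handmade))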
Example #6
def __init__(self):
    self.tokenizer = Tokenizer()
    self.tokenizer.load()
Example #7
def prepare_data(input_path, params, max_samples):
    logging.info('prepare_data for "%s"', get_params_str(params))
    samples3 = []
    df = pd.read_csv(input_path, encoding='utf-8', delimiter='\t', quoting=3)
    for anchor, positive, negative in zip(df['anchor'].values,
                                          df['positive'].values,
                                          df['negative'].values):
        samples3.append(Sample3(anchor, positive, negative))

    if len(samples3) > max_samples:
        samples3 = random.sample(samples3, max_samples)

    computed_params = dict()

    if params['repres'] == 'words':
        embeddings = WordEmbeddings.load_word_vectors(
            params['wordchar2vector_path'], params['word2vector_path'])
        computed_params['embeddings'] = embeddings
        computed_params['word_dims'] = embeddings.vector_size

        tokenizer = Tokenizer()
        tokenizer.load()
        computed_params['tokenizer'] = tokenizer

        max_wordseq_len = 0
        for sample in samples3:
            for phrase in [sample.anchor, sample.positive, sample.negative]:
                words = tokenizer.tokenize(phrase)
                max_wordseq_len = max(max_wordseq_len, len(words))

        logging.info('max_wordseq_len={}'.format(max_wordseq_len))
        computed_params['max_wordseq_len'] = max_wordseq_len

        # Pad all phrases to the same length
        pad_func = lpad_wordseq if params['padding'] == 'left' else rpad_wordseq
        computed_params['pad_func'] = pad_func
        for sample in samples3:
            sample.anchor_words = pad_func(tokenizer.tokenize(sample.anchor),
                                           max_wordseq_len)
            sample.positive_words = pad_func(
                tokenizer.tokenize(sample.positive), max_wordseq_len)
            sample.negative_words = pad_func(
                tokenizer.tokenize(sample.negative), max_wordseq_len)
    elif params['repres'] == 'pieces':
        spm_name = 'spm_synonymy({})'.format(params['spm_items'])
        computed_params['spm_name'] = spm_name

        if not os.path.exists(os.path.join(tmp_folder, spm_name + '.model')):
            # Training a SentencePiece model requires a text corpus; build it
            # from the sentence variants available in the training set
            all_texts = set()
            for sample in samples3:
                all_texts.add(sample.anchor)
                all_texts.add(sample.positive)
                all_texts.add(sample.negative)

            sentencepiece_corpus = os.path.join(tmp_folder,
                                                'sentencepiece_corpus.txt')
            with io.open(sentencepiece_corpus, 'w', encoding='utf-8') as wrt:
                for text in all_texts:
                    wrt.write(text)
                    wrt.write(u'\n')

            # The corpus is ready; train the segmenter
            logging.info('Train SentencePiece model on {}...'.format(
                sentencepiece_corpus))
            spm.SentencePieceTrainer.Train(
                '--input={} --model_prefix={} --vocab_size={} --character_coverage=1.0 --model_type=bpe --input_sentence_size=10000000'
                .format(sentencepiece_corpus, spm_name, params['spm_items']))
            os.rename(spm_name + '.vocab',
                      os.path.join(tmp_folder, spm_name + '.vocab'))
            os.rename(spm_name + '.model',
                      os.path.join(tmp_folder, spm_name + '.model'))

        splitter = spm.SentencePieceProcessor()
        splitter.Load(os.path.join(tmp_folder, spm_name + '.model'))
        computed_params['splitter'] = splitter

        max_wordseq_len = 0
        all_tokens = set([PAD_TOKEN])
        for sample in samples3:
            for phrase in [sample.anchor, sample.positive, sample.negative]:
                tokens = splitter.EncodeAsPieces(phrase)
                max_wordseq_len = max(max_wordseq_len, len(tokens))
                all_tokens.update(tokens)

        logging.info('max_wordseq_len={}'.format(max_wordseq_len))
        computed_params['max_wordseq_len'] = max_wordseq_len

        token2index = {PAD_TOKEN: 0}
        for token in all_tokens:
            if token != PAD_TOKEN:
                token2index[token] = len(token2index)

        computed_params['token2index'] = token2index

        for sample in samples3:
            sample.anchor_words = spm2tokens(splitter, sample.anchor,
                                             max_wordseq_len, token2index)
            sample.positive_words = spm2tokens(splitter, sample.positive,
                                               max_wordseq_len, token2index)
            sample.negative_words = spm2tokens(splitter, sample.negative,
                                               max_wordseq_len, token2index)

    else:
        raise NotImplementedError()

    return samples3, computed_params
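The 'pieces' branch calls an spm2tokens helper that is not shown in this excerpt. A plausible sketch, assuming it segments a phrase with SentencePiece, right-pads to max_wordseq_len with the module-level PAD_TOKEN symbol and maps every piece to its integer index (the real helper may differ):

def spm2tokens(splitter, phrase, max_wordseq_len, token2index):
    # Hypothetical helper: segment, pad to a fixed length, convert to indices.
    pieces = splitter.EncodeAsPieces(phrase)
    pieces = pieces + [PAD_TOKEN] * (max_wordseq_len - len(pieces))
    return [token2index.get(piece, token2index[PAD_TOKEN]) for piece in pieces]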
Example #8
def __init__(self):
    self.tokenizer = Tokenizer()
    self.stemmer = RussianStemmer()
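This variant pairs the tokenizer with a Snowball stemmer instead of a lemmatizer (note that, unlike the other examples, tokenizer.load() is not called in this excerpt). A usage sketch, assuming RussianStemmer is NLTK's Snowball stemmer for Russian:

from nltk.stem.snowball import RussianStemmer

stemmer = RussianStemmer()
words = [u'кошки', u'спят', u'на', u'диване']
stems = [stemmer.stem(w) for w in words]  # crude normalization by stripping endings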