Example no. 1
class PhraseTokenizer(PhraseSplitter):
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.tokenizer.load()

    def tokenize(self, phrase):
        return self.tokenizer.tokenize(phrase)
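
A minimal usage sketch (not part of the original example), assuming Tokenizer is the rutokenizer-style tokenizer used throughout these snippets, i.e. it is loaded once with load() and then splits a phrase into a list of words with tokenize():

# Hypothetical usage of the splitter defined above.
splitter = PhraseTokenizer()
words = splitter.tokenize(u'как тебя зовут?')  # sample phrase for illustration only
print(words)  # a list of word tokens
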
Example no. 2
class PhraseCleaner:
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.tokenizer.load()

    def process(self, phrase):
        return u' '.join(self.tokenizer.tokenize(phrase))
Example no. 3
class PhraseStemmer(PhraseSplitter):
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.tokenizer.load()  # load() is required before tokenize(), as in the other splitters
        self.stemmer = RussianStemmer()

    def tokenize(self, phrase):
        return [self.stemmer.stem(w) for w in self.tokenizer.tokenize(phrase) if len(w.strip()) > 0]
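
A hedged usage sketch for the stemming splitter, assuming RussianStemmer is NLTK's Snowball stemmer (from nltk.stem.snowball import RussianStemmer), which matches the stem() call above:

from nltk.stem.snowball import RussianStemmer  # assumed origin of RussianStemmer

stem_splitter = PhraseStemmer()
stems = stem_splitter.tokenize(u'кошки ловят мышей')  # sample phrase for illustration only
print(stems)  # stems of the non-empty tokens
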
Example no. 4
def load_dataset(params):
    tokenizer = Tokenizer()
    tokenizer.load()

    # The dataset must have been built beforehand by the script ./preparation/prepare_req_interpretation_classif.py
    df = pd.read_csv(os.path.join(data_folder,
                                  'req_interpretation_dataset.csv'),
                     sep='\t',
                     encoding='utf-8')
    samples = [
        Sample(row['text'], int(row['label'])) for i, row in df.iterrows()
    ]

    # Tokenize the samples
    for sample in samples:
        sample.words = tokenizer.tokenize(sample.phrase)

    nb_0 = sum(sample.y == 0 for sample in samples)
    nb_1 = sum(sample.y == 1 for sample in samples)
    logging.info('nb_0={} nb_1={}'.format(nb_0, nb_1))

    max_wordseq_len = max(len(sample.words) for sample in samples)
    logging.info('max_wordseq_len={}'.format(max_wordseq_len))

    if params['padding'] == 'left':
        for sample in samples:
            sample.words = lpad_wordseq(sample.words, max_wordseq_len)
    else:
        for sample in samples:
            sample.words = rpad_wordseq(sample.words, max_wordseq_len)

    computed_params = {
        'max_wordseq_len': max_wordseq_len,
        'nb_0': nb_0,
        'nb_1': nb_1
    }

    return samples, computed_params
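
The lpad_wordseq/rpad_wordseq helpers called above are not shown in this snippet; below is a minimal sketch consistent with the methods of the same name in Example no. 6, where PAD_WORD is assumed to be the project's padding token:

import itertools

PAD_WORD = u''  # assumed padding token; the project may define a different constant

def lpad_wordseq(words, n):
    """ Pad the word sequence with empty words on the left up to length n """
    return list(itertools.chain(itertools.repeat(PAD_WORD, n - len(words)), words))

def rpad_wordseq(words, n):
    """ Pad the word sequence with empty words on the right up to length n """
    return list(itertools.chain(words, itertools.repeat(PAD_WORD, n - len(words))))
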
Example no. 5
class PhraseLemmatizer(PhraseSplitter):
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.tokenizer.load()
        #self.lemmatizer = Mystem()
        self.tagger = rupostagger.RuPosTagger()
        self.tagger.load()
        self.lemm = rulemma.Lemmatizer()
        self.lemm.load()

    def extract_lemma(self, token):
        return token[0] if token[1] == 'PRON' else token[2]

    def tokenize(self, phrase):
        words = self.tokenizer.tokenize(phrase)
        # variant using pymystem
        #wx = u' '.join(words)
        #return [l for l in self.lemmatizer.lemmatize(wx) if len(l.strip()) > 0]

        # variant using our own lemmatizer (rupostagger + rulemma)
        tags = self.tagger.tag(words)
        tokens = self.lemm.lemmatize(tags)
        return [self.extract_lemma(t) for t in tokens]
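
A usage sketch relying only on the calls shown above: rupostagger tags the token list, rulemma lemmatizes the tagged tokens, and each resulting token is a tuple whose first three fields are the surface word, the part-of-speech tag and the lemma (hence the indices in extract_lemma):

lemmatizer = PhraseLemmatizer()
lemmas = lemmatizer.tokenize(u'кошки ловят мышей')  # sample phrase for illustration only
print(lemmas)  # list of lemmas; pronouns keep their surface form (see extract_lemma)
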
Example no. 6
class TextUtils(object):
    def __init__(self):
        self.clause_splitter = rutokenizer.Segmenter()
        self.tokenizer = Tokenizer()
        self.tokenizer.load()
        #self.lexicon = Word2Lemmas()
        self.language_resources = LanguageResources()
        self.postagger = rupostagger.RuPosTagger()
        self.chunker = ruchunker.Chunker()
        self.word2tags = ruword2tags.RuWord2Tags()
        self.flexer = ruword2tags.RuFlexer()
        self.syntan = None
        self.gg_dictionaries = GenerativeGrammarDictionaries()
        #self.known_words = set()
        #self.lemmatizer = Mystem()
        self.lemmatizer = rulemma.Lemmatizer()
        self.word_embeddings = None

    def load_embeddings(self, w2v_dir, wc2v_dir):
        # Load the word embedding models
        self.word_embeddings = WordEmbeddings()
        self.word_embeddings.load_models(w2v_dir, wc2v_dir)

        if wc2v_dir:
            p = os.path.join(wc2v_dir, 'wc2v.kv')
            self.word_embeddings.load_wc2v_model(p)

        p = os.path.join(w2v_dir, 'w2v.kv')
        self.word_embeddings.load_w2v_model(p)

    def load_dictionaries(self, data_folder, models_folder):
        self.lemmatizer.load()

        # Shared dictionary for the generative grammars
        #self.gg_dictionaries.load(os.path.join(models_folder, 'generative_grammar_dictionaries.bin'))

        #word2lemmas_path = os.path.join(data_folder, 'ru_word2lemma.tsv.gz')
        #self.lexicon.load(word2lemmas_path)

        #word2tags_path = os.path.join(data_folder, 'chatbot_word2tags.dat')
        #self.postagger.load(word2tags_path)
        self.postagger.load()

        self.word2tags.load()
        self.flexer.load()
        self.chunker.load()

        # Load the UDPipe dependency parser and its Russian-language model
        model_file = os.path.join(models_folder, 'udpipe_syntagrus.model')
        self.udpipe_model = Model.load(model_file)
        self.udpipe_pipeline = Pipeline(self.udpipe_model, 'tokenize',
                                        Pipeline.DEFAULT, Pipeline.DEFAULT,
                                        'conllu')
        self.udpipe_error = ProcessingError()

        #self.syntan = rusyntax2.Tagger(self.word2tags, w2v, self.postagger)
        #self.syntan.load()

        #rules_path = os.path.join(data_folder, 'rules.yaml')
        #with io.open(rules_path, 'r', encoding='utf-8') as f:
        #data = yaml.safe_load(f)
        #self.no_info_replicas = data['no_relevant_information']
        #self.unknown_order = data['unknown_order']

        #self.language_resources.key2phrase[u'yes'] = data[u'answers'][u'yes']
        #self.language_resources.key2phrase[u'not'] = data[u'answers'][u'not']

        # List of "good words" for the generative grammar
        #with io.open(os.path.join(models_folder, 'dataset_words.txt'), 'r', encoding='utf-8') as rdr:
        #    for line in rdr:
        #        word = line.strip()
        #        self.known_words.add(word)

    def apply_word_function(self, func, constants, words):
        part_of_speech = None
        tag = None
        if func == '$chooseAdjByGender':
            part_of_speech = 'ПРИЛАГАТЕЛЬНОЕ'
            tag = ('РОД', constants['gender'])
        elif func == '$chooseVByGender':
            part_of_speech = 'ГЛАГОЛ'
            tag = ('РОД', constants['gender'])
        elif func == '$chooseNByGender':
            part_of_speech = 'СУЩЕСТВИТЕЛЬНОЕ'
            tag = ('РОД', constants['gender'])
        else:
            raise NotImplementedError()

        tag2 = tag[0] + '=' + tag[1]
        for word in words:
            #tagsets = self.gg_dictionaries.grdict.get_word_tagsets2(word.lower(), part_of_speech)
            for tagset in self.word2tags[word.lower()]:
                if part_of_speech in tagset and tag2 in tagset:
                    return word

        msg = 'Could not choose a word among {}'.format(' '.join(words))
        raise RuntimeError(msg)

    def tag(self, words, with_lemmas=False):
        """ Частеречная разметка для цепочки слов words """
        if with_lemmas:
            return self.lemmatizer.lemmatize(self.postagger.tag(words))
        else:
            return self.postagger.tag(words)

    def canonize_text(self, s):
        """ Удаляем два и более пробелов подряд, заменяя на один """
        s = re.sub("(\\s{2,})", ' ', s.strip())
        return s

    def remove_terminators(self, s):
        """ Убираем финальные пунктуаторы ! ? ."""
        return s[:-1].strip() if s[-1] in u'?!.' else s

    def wordize_text(self, s):
        return u' '.join(self.tokenize(s))

    def ngrams(self, s, n):
        #return [u''.join(z) for z in itertools.izip(*[s[i:] for i in range(n)])]
        return [u''.join(z) for z in zip(*[s[i:] for i in range(n)])]

    def words2str(self, words):
        return u' '.join(
            itertools.chain([BEG_WORD], filter(lambda z: len(z) > 0, words),
                            [END_WORD]))

    def split_clauses(self, s):
        return list(self.clause_splitter.split(s))

    def tokenize(self, s):
        return self.tokenizer.tokenize(s)

    def extract_lemma(self, token):
        return token[0] if token[1] == 'PRON' else token[2]

    def lemmatize(self, s):
        words = self.tokenizer.tokenize(s)
        #wx = u' '.join(words)
        #return [l for l in self.lemmatizer.lemmatize(wx) if len(l.strip()) > 0]
        tokens = self.lemmatizer.lemmatize(self.postagger.tag(words))
        return [self.extract_lemma(t) for t in tokens]

    def lemmatize2(self, s):
        words = self.tokenizer.tokenize(s)
        return self.lemmatizer.lemmatize(self.postagger.tag(words))

    def lpad_wordseq(self, words, n):
        """ Слева добавляем пустые слова """
        return list(
            itertools.chain(itertools.repeat(PAD_WORD, n - len(words)), words))

    def rpad_wordseq(self, words, n):
        """ Справа добавляем пустые слова """
        return list(
            itertools.chain(words, itertools.repeat(PAD_WORD, n - len(words))))

    #def get_lexicon(self):
    #    return self.lexicon

    def is_question_word(self, word):
        return word in u'насколько где кто что почему откуда куда зачем чего кого кем чем кому чему ком чем как сколько ли когда докуда какой какая какое какие какого какую каких каким какими какому какой каков какова каковы'.split(
        )

    def build_output_phrase(self, words):
        s = u' '.join(words)
        s = s.replace(u' ?', u'?').replace(u' !', u'!').replace(u' ,', u',').replace(u' :', u',') \
            .replace(u' .', u'.').replace(u'( ', u'(').replace(u' )', u')')
        s = s[0].upper() + s[1:]
        return s

    def detect_person0(self, words):
        if any((word in (u'ты', u'тебя', u'тебе')) for word in words):
            return 2

        if any((word in (u'я', u'мне', u'меня')) for word in words):
            return 1

        return -1

    def extract_chunks(self, sample):
        tokens = self.tokenizer.tokenize(sample)
        tagsets = list(self.postagger.tag(tokens))
        lemmas = self.lemmatizer.lemmatize(tagsets)
        #edges = syntan.parse(tokens, tagsets)

        phrase_tokens = []
        for word_index, (token, tagset,
                         lemma) in enumerate(zip(tokens, tagsets, lemmas)):
            t = PhraseToken()
            t.word = token
            t.norm_word = token.lower()
            t.lemma = lemma[2]
            t.tagset = tagset[1]
            t.word_index = word_index
            phrase_tokens.append(t)

        chunks = self.chunker.parse(tokens)
        for chunk_index, chunk in enumerate(chunks):
            phrase_tokens[chunk.tokens[0].index].is_chunk_starter = True
            for token in chunk.tokens:
                phrase_tokens[token.index].chunk_index = chunk_index

        return chunks

    def word_similarity(self, word1, word2):
        return self.word_embeddings.word_similarity(word1, word2)

    def parse_syntax(self, text_str):
        processed = self.udpipe_pipeline.process(text_str, self.udpipe_error)
        if self.udpipe_error.occurred():
            logging.error("An error occurred when running run_udpipe: %s",
                          self.udpipe_error.message)
            return None

        parsed_data = pyconll.load_from_string(processed)[0]
        return parsed_data

    def get_udpipe_attr(self, token, tag_name):
        if tag_name in token.feats:
            v = list(token.feats[tag_name])[0]
            return v

        return ''

    def change_verb_gender(self, verb_inf, new_gender):
        """ Изменение формы глагола в прошедшем времени единственном числе """
        required_tags = [('ВРЕМЯ', 'ПРОШЕДШЕЕ'), ('ЧИСЛО', 'ЕД')]
        if new_gender == 'Fem':
            required_tags.append(('РОД', 'ЖЕН'))
        else:
            required_tags.append(('РОД', 'МУЖ'))

        forms = list(self.flexer.find_forms_by_tags(verb_inf, required_tags))
        if forms:
            return forms[0]
        else:
            return None

    def change_adj_gender(self, adj_lemma, new_gender, variant):
        if adj_lemma == 'должен':
            if new_gender == 'Fem':
                return 'должна'
            else:
                return 'должен'

        required_tags = [('ЧИСЛО', 'ЕД')]
        if variant == 'Short':
            required_tags.append(('КРАТКИЙ', '1'))
        else:
            required_tags.append(('КРАТКИЙ', '0'))
            required_tags.append(('ПАДЕЖ', 'ИМ'))

        if new_gender == 'Fem':
            required_tags.append(('РОД', 'ЖЕН'))
        else:
            required_tags.append(('РОД', 'МУЖ'))

        forms = list(self.flexer.find_forms_by_tags(adj_lemma, required_tags))
        if forms:
            return forms[0]
        else:
            return None

    def is_premise_suitable_as_answer(self, premise_text):
        # Check whether the premise text can be used as an answer
        tx = self.tokenize(premise_text)
        if len(tx) > 5:
            return False

        if ',' in tx or 'и' in tx or 'или' in tx:
            return False

        return True
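
A hedged sketch of how the UDPipe-based parsing in this class could be consumed, assuming the ufal.udpipe and pyconll imports used above and that data_folder/models_folder point at the expected resources; token.form and token.feats are part of the pyconll token interface:

text_utils = TextUtils()
text_utils.load_dictionaries(data_folder, models_folder)  # paths assumed to be defined by the caller

parsing = text_utils.parse_syntax(u'Кошка ловит мышей.')
if parsing is not None:
    for token in parsing:
        # print the surface form and the grammatical Number feature, if present
        print(token.form, text_utils.get_udpipe_attr(token, 'Number'))
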
Example no. 7
            if len(line) == 0:
                if len(lines) > 1:
                    questions = []
                    premises = []
                    for line in lines:
                        if line.startswith(
                                u'(+)') or not line.startswith(u'(-)'):
                            s = normalize_qline(line)
                            if line[-1] == u'?':
                                questions.append(s)
                            else:
                                premises.append(s)
                                questions.append(s)

                    for premise in premises:
                        premise_words = tokenizer.tokenize(premise)
                        if any((word in question_words)
                               for word in premise_words):
                            # Skip this phrase: it is a question and will not be used as a premise
                            continue

                        for question in questions:
                            if random.random() < 0.1:
                                add_record(premise, question, u'да', False)
                                nb_paraphrases += 1

                lines = []
            else:
                lines.append(line)

    print('{:<6d} patterns'.format(nb_paraphrases))
Example no. 8
class TextUtils(object):
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.tokenizer.load()
        self.lexicon = Word2Lemmas()
        self.language_resources = LanguageResources()
        self.postagger = rupostagger.RuPosTagger()
        self.chunker = ruchunker.Chunker()
        self.word2tags = ruword2tags.RuWord2Tags()
        self.flexer = ruword2tags.RuFlexer()
        self.syntan = None
        self.gg_dictionaries = GenerativeGrammarDictionaries()
        self.known_words = set()
        #self.lemmatizer = Mystem()
        self.lemmatizer = rulemma.Lemmatizer()
        self.word_embeddings = None

    def load_embeddings(self, w2v_dir, wc2v_dir):
        # Load the word embedding models
        self.word_embeddings = WordEmbeddings()
        self.word_embeddings.load_models(w2v_dir)

        p = os.path.join(wc2v_dir, 'wc2v.kv')
        self.word_embeddings.load_wc2v_model(p)

        p = os.path.join(w2v_dir, 'w2v.kv')
        self.word_embeddings.load_w2v_model(p)

    def load_dictionaries(self, data_folder, models_folder):
        self.lemmatizer.load()

        # Shared dictionary for the generative grammars
        self.gg_dictionaries.load(
            os.path.join(models_folder, 'generative_grammar_dictionaries.bin'))

        word2lemmas_path = os.path.join(data_folder, 'ru_word2lemma.tsv.gz')
        self.lexicon.load(word2lemmas_path)

        #word2tags_path = os.path.join(data_folder, 'chatbot_word2tags.dat')
        #self.postagger.load(word2tags_path)
        self.postagger.load()

        self.word2tags.load()
        self.flexer.load()
        self.chunker.load()

        #self.syntan = rusyntax2.Tagger(self.word2tags, w2v, self.postagger)
        #self.syntan.load()

        rules_path = os.path.join(data_folder, 'rules.yaml')
        with io.open(rules_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
            #self.no_info_replicas = data['no_relevant_information']
            #self.unknown_order = data['unknown_order']

            self.language_resources.key2phrase[u'yes'] = data[u'answers'][
                u'yes']
            self.language_resources.key2phrase[u'not'] = data[u'answers'][
                u'not']

        # List of "good words" for the generative grammar
        with io.open(os.path.join(models_folder, 'dataset_words.txt'),
                     'r',
                     encoding='utf-8') as rdr:
            for line in rdr:
                word = line.strip()
                self.known_words.add(word)

    def apply_word_function(self, func, constants, words):
        part_of_speech = None
        tag = None
        if func == '$chooseAdjByGender':
            part_of_speech = 'ПРИЛАГАТЕЛЬНОЕ'
            tag = ('РОД', constants['gender'])
        elif func == '$chooseVByGender':
            part_of_speech = 'ГЛАГОЛ'
            tag = ('РОД', constants['gender'])
        elif func == '$chooseNByGender':
            part_of_speech = 'СУЩЕСТВИТЕЛЬНОЕ'
            tag = ('РОД', constants['gender'])
        else:
            raise NotImplementedError()

        for word in words:
            tagsets = self.gg_dictionaries.grdict.get_word_tagsets2(
                word.lower(), part_of_speech)
            if any((tag in tagset) for tagset in tagsets):
                return word

        msg = u'Could not choose a word among {}'.format(u' '.join(words))
        raise RuntimeError(msg)

    def tag(self, words, with_lemmas=False):
        """ Частеречная разметка для цепочки слов words """
        if with_lemmas:
            return self.lemmatizer.lemmatize(self.postagger.tag(words))
        else:
            return self.postagger.tag(words)

    def canonize_text(self, s):
        """ Удаляем два и более пробелов подряд, заменяя на один """
        s = re.sub("(\\s{2,})", ' ', s.strip())
        return s

    def remove_terminators(self, s):
        """ Убираем финальные пунктуаторы ! ? ."""
        return s[:-1].strip() if s[-1] in u'?!.' else s

    def wordize_text(self, s):
        return u' '.join(self.tokenize(s))

    def ngrams(self, s, n):
        #return [u''.join(z) for z in itertools.izip(*[s[i:] for i in range(n)])]
        return [u''.join(z) for z in zip(*[s[i:] for i in range(n)])]

    def words2str(self, words):
        return u' '.join(
            itertools.chain([BEG_WORD], filter(lambda z: len(z) > 0, words),
                            [END_WORD]))

    def tokenize(self, s):
        return self.tokenizer.tokenize(s)

    def extract_lemma(self, token):
        return token[0] if token[1] == 'PRON' else token[2]

    def lemmatize(self, s):
        words = self.tokenizer.tokenize(s)
        #wx = u' '.join(words)
        #return [l for l in self.lemmatizer.lemmatize(wx) if len(l.strip()) > 0]
        tokens = self.lemmatizer.lemmatize(self.postagger.tag(words))
        return [self.extract_lemma(t) for t in tokens]

    def lpad_wordseq(self, words, n):
        """ Слева добавляем пустые слова """
        return list(
            itertools.chain(itertools.repeat(PAD_WORD, n - len(words)), words))

    def rpad_wordseq(self, words, n):
        """ Справа добавляем пустые слова """
        return list(
            itertools.chain(words, itertools.repeat(PAD_WORD, n - len(words))))

    def get_lexicon(self):
        return self.lexicon

    def is_question_word(self, word):
        return word in u'насколько где кто что почему откуда куда зачем чего кого кем чем кому чему ком чем как сколько ли когда докуда какой какая какое какие какого какую каких каким какими какому какой'.split(
        )

    def build_output_phrase(self, words):
        s = u' '.join(words)
        s = s.replace(u' ?', u'?').replace(u' !', u'!').replace(u' ,', u',').replace(u' :', u',') \
            .replace(u' .', u'.').replace(u'( ', u'(').replace(u' )', u')')
        s = s[0].upper() + s[1:]
        return s

    def detect_person0(self, words):
        if any((word in (u'ты', u'тебя', u'тебе')) for word in words):
            return 2

        if any((word in (u'я', u'мне', u'меня')) for word in words):
            return 1

        return -1

    def extract_chunks(self, sample):
        tokens = self.tokenizer.tokenize(sample)
        tagsets = list(self.postagger.tag(tokens))
        lemmas = self.lemmatizer.lemmatize(tagsets)
        #edges = syntan.parse(tokens, tagsets)

        phrase_tokens = []
        for word_index, (token, tagset,
                         lemma) in enumerate(zip(tokens, tagsets, lemmas)):
            t = PhraseToken()
            t.word = token
            t.norm_word = token.lower()
            t.lemma = lemma[2]
            t.tagset = tagset[1]
            t.word_index = word_index
            phrase_tokens.append(t)

        chunks = self.chunker.parse(tokens)
        for chunk_index, chunk in enumerate(chunks):
            phrase_tokens[chunk.tokens[0].index].is_chunk_starter = True
            for token in chunk.tokens:
                phrase_tokens[token.index].chunk_index = chunk_index

        return chunks
Example no. 9
    computed_params['embeddings'] = embeddings
    computed_params['word_dims'] = embeddings.vector_size

    logging.info('Restoring model architecture from "%s"', arch_filepath)
    with open(arch_filepath, 'r') as f:
        model = model_from_json(f.read())

    logging.info('Loading model weights from "%s"', weights_path)
    model.load_weights(weights_path)

    tokenizer = Tokenizer()
    tokenizer.load()

    while True:
        phrase = ruchatbot.utils.console_helpers.input_kbd(':> ').strip()
        sample1 = Sample(phrase, 0)
        sample1.words = tokenizer.tokenize(phrase)

        if padding == 'left':
            sample1.words = lpad_wordseq(sample1.words, max_wordseq_len)
        else:
            sample1.words = rpad_wordseq(sample1.words, max_wordseq_len)

        for istep, xy in enumerate(
                generate_rows([sample1], 2, computed_params, 1)):
            x = xy[0]
            y_pred = model.predict(x=x, verbose=0)[0]
            print('y==0 --> {}'.format(y_pred[0]))
            print('y==1 --> {}'.format(y_pred[1]))
            break
Example no. 10
    df = pd.read_csv(input_path, encoding='utf-8', delimiter='\t', quoting=3)

    samples = []  # list of Sample instances

    for phrase1, phrase2, y in zip(df['premise'].values, df['question'].values,
                                   df['relevance'].values):
        samples.append(Sample(phrase1, phrase2, y))

    nb_0 = sum(sample.y == 0 for sample in samples)
    nb_1 = sum(sample.y == 1 for sample in samples)
    logging.info('nb_0={} nb_1={}'.format(nb_0, nb_1))

    max_wordseq_len = 0
    for sample in samples:
        for phrase in [sample.phrase1, sample.phrase2]:
            words = tokenizer.tokenize(phrase)
            max_wordseq_len = max(max_wordseq_len, len(words))

    logging.info('max_wordseq_len={}'.format(max_wordseq_len))

    pad_func = lpad_wordseq if padding == 'left' else rpad_wordseq

    for sample in samples:
        sample.words1 = pad_func(tokenizer.tokenize(sample.phrase1),
                                 max_wordseq_len)
        sample.words2 = pad_func(tokenizer.tokenize(sample.phrase2),
                                 max_wordseq_len)

    # total number of extra features fed to the network
    # in addition to the two individual sentences.
    nb_addfeatures = 0
Example no. 11
def prepare_data(input_path, params, max_samples):
    logging.info('prepare_data for "%s"', get_params_str(params))
    samples3 = []
    df = pd.read_csv(input_path, encoding='utf-8', delimiter='\t', quoting=3)
    for anchor, positive, negative in zip(df['anchor'].values,
                                          df['positive'].values,
                                          df['negative'].values):
        samples3.append(Sample3(anchor, positive, negative))

    if len(samples3) > max_samples:
        samples3 = random.sample(samples3, max_samples)

    computed_params = dict()

    if params['repres'] == 'words':
        embeddings = WordEmbeddings.load_word_vectors(
            params['wordchar2vector_path'], params['word2vector_path'])
        computed_params['embeddings'] = embeddings
        computed_params['word_dims'] = embeddings.vector_size

        tokenizer = Tokenizer()
        tokenizer.load()
        computed_params['tokenizer'] = tokenizer

        max_wordseq_len = 0
        for sample in samples3:
            for phrase in [sample.anchor, sample.positive, sample.negative]:
                words = tokenizer.tokenize(phrase)
                max_wordseq_len = max(max_wordseq_len, len(words))

        logging.info('max_wordseq_len={}'.format(max_wordseq_len))
        computed_params['max_wordseq_len'] = max_wordseq_len

        # Pad all phrases to the same length
        pad_func = lpad_wordseq if padding == 'left' else rpad_wordseq
        computed_params['pad_func'] = pad_func
        for sample in samples3:
            sample.anchor_words = pad_func(tokenizer.tokenize(sample.anchor),
                                           max_wordseq_len)
            sample.positive_words = pad_func(
                tokenizer.tokenize(sample.positive), max_wordseq_len)
            sample.negative_words = pad_func(
                tokenizer.tokenize(sample.negative), max_wordseq_len)
    elif params['repres'] == 'pieces':
        spm_name = 'spm_synonymy({})'.format(params['spm_items'])
        computed_params['spm_name'] = spm_name

        if not os.path.exists(os.path.join(tmp_folder, spm_name + '.model')):
            # Training a SentencePiece model requires a text corpus; build it
            # from the sentence variants available in the training set
            all_texts = set()
            for sample in samples3:
                all_texts.add(sample.anchor)
                all_texts.add(sample.positive)
                all_texts.add(sample.negative)

            sentencepiece_corpus = os.path.join(tmp_folder,
                                                'sentencepiece_corpus.txt')
            with io.open(sentencepiece_corpus, 'w', encoding='utf-8') as wrt:
                for text in all_texts:
                    wrt.write(text)
                    wrt.write(u'\n')

            # The corpus is ready; train the segmenter
            logging.info('Train SentencePiece model on {}...'.format(
                sentencepiece_corpus))
            spm.SentencePieceTrainer.Train(
                '--input={} --model_prefix={} --vocab_size={} --character_coverage=1.0 --model_type=bpe --input_sentence_size=10000000'
                .format(sentencepiece_corpus, spm_name, params['spm_items']))
            os.rename(spm_name + '.vocab',
                      os.path.join(tmp_folder, spm_name + '.vocab'))
            os.rename(spm_name + '.model',
                      os.path.join(tmp_folder, spm_name + '.model'))

        splitter = spm.SentencePieceProcessor()
        splitter.Load(os.path.join(tmp_folder, spm_name + '.model'))
        computed_params['splitter'] = splitter

        max_wordseq_len = 0
        all_tokens = set([PAD_TOKEN])
        for sample in samples3:
            for phrase in [sample.anchor, sample.positive, sample.negative]:
                tokens = splitter.EncodeAsPieces(phrase)
                max_wordseq_len = max(max_wordseq_len, len(tokens))
                all_tokens.update(tokens)

        logging.info('max_wordseq_len={}'.format(max_wordseq_len))
        computed_params['max_wordseq_len'] = max_wordseq_len

        token2index = {PAD_TOKEN: 0}
        for token in all_tokens:
            if token != PAD_TOKEN:
                token2index[token] = len(token2index)

        computed_params['token2index'] = token2index

        for sample in samples3:
            sample.anchor_words = spm2tokens(splitter, sample.anchor,
                                             max_wordseq_len, token2index)
            sample.positive_words = spm2tokens(splitter, sample.positive,
                                               max_wordseq_len, token2index)
            sample.negative_words = spm2tokens(splitter, sample.negative,
                                               max_wordseq_len, token2index)

    else:
        raise NotImplementedError()

    return samples3, computed_params
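
A standalone sketch of the SentencePiece round trip used in the 'pieces' branch above (train a small BPE model on a one-sentence-per-line corpus, load it, and segment a phrase); the file names and vocabulary size here are placeholders:

import sentencepiece as spm

# Train a BPE segmenter on a plain-text corpus, one sentence per line.
spm.SentencePieceTrainer.Train(
    '--input=sentencepiece_corpus.txt --model_prefix=spm_demo '
    '--vocab_size=1000 --character_coverage=1.0 --model_type=bpe')

# Load the trained model and split a phrase into pieces.
splitter = spm.SentencePieceProcessor()
splitter.Load('spm_demo.model')
print(splitter.EncodeAsPieces(u'кошки ловят мышей'))
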
Example no. 12
        word_dims = embeddings.vector_size
        pad_func = lpad_wordseq if padding == 'left' else rpad_wordseq
        tokenizer = Tokenizer()
        tokenizer.load()
    elif repres == 'pieces':
        splitter = spm.SentencePieceProcessor()
        splitter.Load(
            os.path.join(tmp_folder, model_config['spm_name'] + '.model'))

    # Load the reference sentences whose similarity to the sentence entered
    # in the console will be evaluated.
    phrases2 = set()
    if True:
        for phrase in load_strings_from_yaml(
                os.path.join(data_folder, 'rules.yaml')):
            phrase2 = u' '.join(tokenizer.tokenize(phrase))
            phrases2.add(phrase2)

    if True:
        with codecs.open(os.path.join(data_folder, 'intents.txt'), 'r',
                         'utf-8') as rdr:
            for line in rdr:
                phrase = line.strip()
                if len(phrase) > 5 and not phrase.startswith(
                        '#') and u'_' not in phrase:
                    phrase2 = u' '.join(tokenizer.tokenize(phrase))
                    phrases2.add(phrase2)

    if True:
        with codecs.open(os.path.join(data_folder, 'faq2.txt'), 'r',
                         'utf-8') as rdr:
Example no. 13
class TextUtils(object):
    def __init__(self):
        self.tokenizer = Tokenizer()
        self.tokenizer.load()
        self.lemmatizer = Mystem()
        self.lexicon = Word2Lemmas()
        self.language_resources = LanguageResources()
        self.postagger = rupostagger.RuPosTagger()
        self.gg_dictionaries = GenerativeGrammarDictionaries()
        self.known_words = set()

    def load_dictionaries(self, data_folder, models_folder):
        # Shared dictionary for the generative grammars
        self.gg_dictionaries.load(
            os.path.join(models_folder, 'generative_grammar_dictionaries.bin'))

        word2lemmas_path = os.path.join(data_folder, 'ru_word2lemma.tsv.gz')
        self.lexicon.load(word2lemmas_path)

        word2tags_path = os.path.join(data_folder, 'chatbot_word2tags.dat')
        self.postagger.load(word2tags_path)

        rules_path = os.path.join(data_folder, 'rules.yaml')
        with io.open(rules_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
            self.no_info_replicas = data['no_relevant_information']
            self.unknown_order = data['unknown_order']

            self.language_resources.key2phrase[u'yes'] = data[u'answers'][
                u'yes']
            self.language_resources.key2phrase[u'not'] = data[u'answers'][
                u'not']

        # List of "good words" for the generative grammar
        with io.open(os.path.join(models_folder, 'dataset_words.txt'),
                     'r',
                     encoding='utf-8') as rdr:
            for line in rdr:
                word = line.strip()
                self.known_words.add(word)

    def tag(self, words):
        """ Частеречная разметка для цепочки слов words """
        return self.postagger.tag(words)

    def canonize_text(self, s):
        """ Удаляем два и более пробелов подряд, заменяя на один """
        s = re.sub("(\\s{2,})", ' ', s.strip())
        return s

    def remove_terminators(self, s):
        """ Убираем финальные пунктуаторы ! ? ."""
        return s[:-1].strip() if s[-1] in u'?!.' else s

    def wordize_text(self, s):
        return u' '.join(self.tokenize(s))

    def ngrams(self, s, n):
        #return [u''.join(z) for z in itertools.izip(*[s[i:] for i in range(n)])]
        return [u''.join(z) for z in zip(*[s[i:] for i in range(n)])]

    def words2str(self, words):
        return u' '.join(
            itertools.chain([BEG_WORD], filter(lambda z: len(z) > 0, words),
                            [END_WORD]))

    def tokenize(self, s):
        return self.tokenizer.tokenize(s)

    def lemmatize(self, s):
        words = self.tokenizer.tokenize(s)
        wx = u' '.join(words)
        return [l for l in self.lemmatizer.lemmatize(wx) if len(l.strip()) > 0]

    def lpad_wordseq(self, words, n):
        """ Слева добавляем пустые слова """
        return list(
            itertools.chain(itertools.repeat(PAD_WORD, n - len(words)), words))

    def rpad_wordseq(self, words, n):
        """ Справа добавляем пустые слова """
        return list(
            itertools.chain(words, itertools.repeat(PAD_WORD, n - len(words))))

    def get_lexicon(self):
        return self.lexicon

    def is_question_word(self, word):
        return word in u'насколько где кто что почему откуда куда зачем чего кого кем чем кому чему ком чем как сколько ли когда докуда какой какая какое какие какого какую каких каким какими какому какой'.split(
        )

    def build_output_phrase(self, words):
        s = u' '.join(words)
        s = s.replace(u' ?', u'?').replace(u' !', u'!').replace(u' ,', u',').replace(u' :', u',') \
            .replace(u' .', u'.').replace(u'( ', u'(').replace(u' )', u')')
        s = s[0].upper() + s[1:]
        return s
Example no. 14
# ---------------------------------------------------------------

tokenizer = Tokenizer()
tokenizer.load()
random_questions = CorpusSearcher()
random_facts = CorpusSearcher()

# Read the list of random questions from a pre-built file
# (see the C# code at https://github.com/Koziev/chatbot/tree/master/CSharpCode/ExtractFactsFromParsing
# and its output at https://github.com/Koziev/NLP_Datasets/blob/master/Samples/questions4.txt)
print('Loading random questions and facts...')
with codecs.open(questions_path, 'r', 'utf-8') as rdr:
    for line in rdr:
        if len(line) < 40:
            question = line.strip()
            question = ru_sanitize(u' '.join(tokenizer.tokenize(question)))
            random_questions.add_phrase(normalize_qline(question))

# Read the list of random facts, used later to generate negative patterns
for facts_path in [
        'paraphrases.txt',
        'facts4.txt',
        'facts5.txt',
        'facts6.txt',
]:
    with codecs.open(os.path.join(data_folder, facts_path), 'r',
                     'utf-8') as rdr:
        n = 0
        for line in rdr:
            s = line.strip()
            if s:
Example no. 15
            line = line.strip()
            if line:
                tx = line.split('\t')
                word = normalize_word(tx[1])
                known_words.add(word)
                dataset_words.add(word)

print('Parsing {}'.format(yaml_path))
with io.open(yaml_path, 'r', encoding='utf-8') as f:
    data = yaml.safe_load(f)
    strings = collect_strings(data)
    for phrase in strings:
        phrase = phrase.strip()
        if u'_' not in phrase and any(
            (c in u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя') for c in phrase):
            words = tokenizer.tokenize(phrase)
            known_words.update(words)
            dataset_words.update(words)

# Take words from the large text file on which the w2v models are trained.
print('Parsing {}'.format(corpus_path))
with codecs.open(corpus_path, 'r', 'utf-8') as rdr:
    line_count = 0
    for line0 in rdr:
        line = line0.strip()
        words = [normalize_word(w) for w in line.split(u' ')]
        known_words.update(words)
        line_count += 1
        if line_count > 5000000:
            break
    lexicon.load(os.path.join(data_folder, 'dict/word2lemma.dat'))

    grdict = ruword2tags.RuWord2Tags()
    grdict.load()

    flexer = ruword2tags.RuFlexer()
    flexer.load()

    # Augmentation: generate negative samples by picking word forms that differ
    # from those used in the valid answer.
    logging.info('Generating negative samples...')
    all_keys = set(sample.get_key() for sample in samples)
    neg_samples = []
    for sample in samples:
        if sample.label == 1:
            answer_words = tokenizer.tokenize(sample.answer)
            answer_len = len(answer_words)
            if answer_len == 1:
                # Augmentation for a single-word answer:
                # forms of that word other than the one mentioned in the answer
                for lemma, part_of_speech in lexicon.get_lemmas(
                        answer_words[0]):
                    forms = list(lexicon.get_forms(lemma, part_of_speech))
                    forms = np.random.permutation(forms)[:5]
                    for form in forms:
                        if form != answer_words[0]:
                            sample0 = Sample(sample.premises, sample.question,
                                             form, 0)
                            key0 = sample0.get_key()
                            if key0 not in all_keys:
                                all_keys.add(key0)