Example #1
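Substitutes the chunks captured by a matched rule condition into the reply template: each captured group can be re-inflected (e.g. forced into a particular grammatical case) before its words replace the $NP1(...)-style slot in the phrase's raw text.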
    def prepare4saying(self, phrase, condition_matching_results, text_utils):
        utterance = phrase.raw_text

        # Substitute the chunks matched while checking the rule condition.
        if condition_matching_results and condition_matching_results.has_groups() and phrase.has_entries():
            for name, group in condition_matching_results.groups.items():
                group_ancor = name.upper()
                if group_ancor in phrase.name2entry:
                    entry = phrase.name2entry[group_ancor]
                    words = group.words

                    # Does this chunk need to be re-inflected?
                    if entry.tags:
                        tokens = group.phrase_tokens
                        target_tags = dict()
                        for tag in entry.tags:
                            # Russian case tags: ИМ=nominative, ВИН=accusative, РОД=genitive,
                            # ТВОР=instrumental, ДАТ=dative, ПРЕДЛ=prepositional.
                            if tag in ('ИМ', 'ВИН', 'РОД', 'ТВОР', 'ДАТ', 'ПРЕДЛ'):
                                target_tags['ПАДЕЖ'] = tag  # 'ПАДЕЖ' is the CASE feature
                            else:
                                raise NotImplementedError()

                        words = normalize_chunk(tokens,
                                                edges=None,
                                                flexer=text_utils.flexer,
                                                word2tags=text_utils.word2tags,
                                                target_tags=target_tags)

                    # Substitute the chunk's words for the $NP1(...) slot in the utterance.
                    entry_value = ' '.join(words)
                    utterance = utterance.replace(entry.raw_text, entry_value)

        return utterance
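A minimal sketch of how this method might be driven. The rule, bot and engine objects, the phrase template and bot.say are assumptions about the surrounding chatbot framework, not APIs confirmed by the source; only check_condition and prepare4saying come from the examples on this page:

    # Hypothetical driver code, assuming a rule object exposing both methods.
    matching = rule.check_condition(bot, session, interlocutor, interpreted_phrase, engine)
    if matching and matching.has_groups():
        reply = rule.prepare4saying(phrase, matching, engine.text_utils)  # phrase: the rule's output template (assumed)
        bot.say(session, reply)  # assumed output call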
Example #2
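Checks a rule condition against the interpreted phrase: the text is tokenized, POS-tagged and lemmatized, chunk boundaries are marked, and each mask is tried in turn; the first hit yields a RuleConditionMatching carrying the normalized words of every captured group.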
    def check_condition(self, bot, session, interlocutor, interpreted_phrase, answering_engine):
        input_phrase = interpreted_phrase.interpretation
        text_utils = answering_engine.text_utils
        tokens = text_utils.tokenizer.tokenize(input_phrase)
        tagsets = list(text_utils.postagger.tag(tokens))
        lemmas = text_utils.lemmatizer.lemmatize(tagsets)

        #edges = syntan.parse(tokens, tagsets)
        # Stub in place of real syntactic parsing:
        edges = [(word, iword, None, None, None) for (iword, word) in enumerate(tokens)]

        # Wrap each token together with its lemma and POS tagset.
        phrase_tokens = []
        for word_index, (token, tagset, lemma) in enumerate(zip(tokens, tagsets, lemmas)):
            t = PhraseToken()
            t.word = token
            t.norm_word = token.lower()
            t.lemma = lemma[2]
            t.tagset = tagset[1]
            t.word_index = word_index
            phrase_tokens.append(t)

        # Mark which tokens open a chunk and which chunk each token belongs to.
        chunks = text_utils.chunker.parse(tokens)
        for chunk_index, chunk in enumerate(chunks):
            phrase_tokens[chunk.tokens[0].index].is_chunk_starter = True
            for token in chunk.tokens:
                phrase_tokens[token.index].chunk_index = chunk_index

        # Try each mask in turn; the first one that matches wins.
        for mask in self.masks:
            mx = match(phrase_tokens, mask.mask_terms)
            if mx:
                #print('{} groups in matching:'.format(mx.groups_count()))
                res = RuleConditionMatching.create(True)
                for group_name, tokens in mx.index2group.items():
                    normal_words = normalize_chunk(tokens, edges, text_utils.flexer, text_utils.word2tags)
                    #print('{}={} normal={}'.format(group_name, ' '.join(t.word for t in tokens), ' '.join(normal_words)))
                    res.add_group(group_name.upper(), normal_words, tokens)
                return res

        return RuleConditionMatching.create(False)
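For context, a rough sketch of consuming the returned matching. The has_groups(), groups and words accessors are taken from Example #1; treating a failed result as falsy is an assumption:

    # Hypothetical consumer of the matching object.
    res = condition.check_condition(bot, session, interlocutor, interpreted_phrase, engine)
    if res and res.has_groups():
        for name, group in res.groups.items():  # groups were filled via res.add_group(...)
            print(name, '->', ' '.join(group.words))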
Example #3
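A stricter variant of the previous check: it can operate on the raw (uninterpreted) phrase, normalizes each captured chunk both as-is and forced to singular, and additionally rejects the match unless every word2vec similarity constraint attached to a group is satisfied.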
    def check_condition(self, bot, session, interlocutor, interpreted_phrase,
                        answering_engine):
        if self.is_raw:
            input_phrase = interpreted_phrase.raw_phrase
        else:
            input_phrase = interpreted_phrase.interpretation

        text_utils = answering_engine.text_utils
        tokens = text_utils.tokenizer.tokenize(input_phrase)
        tagsets = list(text_utils.postagger.tag(tokens))
        lemmas = text_utils.lemmatizer.lemmatize(tagsets)

        #edges = syntan.parse(tokens, tagsets)
        # Stub in place of real syntactic parsing:
        edges = [(word, iword, None, None, None) for (iword, word) in enumerate(tokens)]

        # Wrap each token together with its lemma and POS tagset.
        phrase_tokens = []
        for word_index, (token, tagset, lemma) in enumerate(zip(tokens, tagsets, lemmas)):
            t = PhraseToken()
            t.word = token
            t.norm_word = token.lower()
            t.lemma = lemma[2]
            t.tagset = tagset[1]
            t.word_index = word_index
            phrase_tokens.append(t)

        # Mark which tokens open a chunk and which chunk each token belongs to.
        chunks = text_utils.chunker.parse(tokens)
        for chunk_index, chunk in enumerate(chunks):
            phrase_tokens[chunk.tokens[0].index].is_chunk_starter = True
            for token in chunk.tokens:
                phrase_tokens[token.index].chunk_index = chunk_index

        # Try each mask in turn; the first one that matches wins.
        for mask in self.masks:
            mx = match(phrase_tokens, mask.mask_terms)
            if mx:
                #print('{} groups in matching:'.format(mx.groups_count()))
                res = RuleConditionMatching.create(True)
                for group_name, tokens in mx.index2group.items():
                    # Normalize the chunk twice: as-is and forced to singular
                    # ('ЧИСЛО': 'ЕД' is the NUMBER=SINGULAR feature), then merge both variants.
                    normal_words1 = normalize_chunk(tokens, edges,
                                                    text_utils.flexer,
                                                    text_utils.word2tags)
                    normal_words2 = normalize_chunk(tokens, edges,
                                                    text_utils.flexer,
                                                    text_utils.word2tags,
                                                    target_tags={'ЧИСЛО': 'ЕД'})
                    normal_words = list(set(normal_words1) | set(normal_words2))

                    # Enforce the word2vec similarity constraints attached to this group:
                    # every constraint must be satisfied by at least one chunk word.
                    if group_name in self.constraints_w2v:
                        constraints_satisfied = True
                        for c in self.constraints_w2v[group_name]:
                            hit = False
                            for chunk_word in normal_words:
                                sim = text_utils.word_similarity(
                                    c.anchor, chunk_word)
                                if sim >= c.sim:
                                    hit = True
                                    break
                            if not hit:
                                constraints_satisfied = False
                                break

                        if not constraints_satisfied:
                            return RuleConditionMatching.create(False)

                    #print('{}={} normal={}'.format(group_name, ' '.join(t.word for t in tokens), ' '.join(normal_words)))
                    res.add_group(group_name.upper(), normal_words, tokens)

                return res

        return RuleConditionMatching.create(False)
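The word2vec gate above boils down to "at least one normalized chunk word must be close enough to the constraint's anchor". A standalone restatement of that predicate, assuming only the c.anchor / c.sim fields and text_utils.word_similarity seen in the example:

    def constraint_hit(c, normal_words, text_utils):
        # True if any chunk word reaches the similarity threshold for this constraint.
        return any(text_utils.word_similarity(c.anchor, w) >= c.sim for w in normal_words)

    # All constraints on a group must hit, mirroring the loop in check_condition:
    ok = all(constraint_hit(c, normal_words, text_utils) for c in constraints)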