def prepare4saying(self, phrase, condition_matching_results, text_utils):
    utterance = phrase.raw_text

    # Substitute the chunks matched during condition checking, if required.
    if condition_matching_results and condition_matching_results.has_groups() and phrase.has_entries():
        for name, group in condition_matching_results.groups.items():
            group_anchor = name.upper()
            if group_anchor in phrase.name2entry:
                entry = phrase.name2entry[group_anchor]
                words = group.words

                # Does the chunk need to be re-inflected?
                if entry.tags:
                    tokens = group.phrase_tokens
                    target_tags = dict()
                    for tag in entry.tags:
                        # Only case tags are supported so far.
                        if tag in ('ИМ', 'ВИН', 'РОД', 'ТВОР', 'ДАТ', 'ПРЕДЛ'):
                            target_tags['ПАДЕЖ'] = tag
                        else:
                            raise NotImplementedError()

                    words = normalize_chunk(tokens, edges=None,
                                            flexer=text_utils.flexer,
                                            word2tags=text_utils.word2tags,
                                            target_tags=target_tags)

                # Substitute the chunk words for the $NP1(...) substring.
                entry_value = ' '.join(words)
                utterance = utterance.replace(entry.raw_text, entry_value)

    return utterance
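# --- Illustrative sketch (not part of the original code) ---------------------
# A minimal, self-contained demo of the substitution step in prepare4saying:
# each anchor such as "$NP1(вин)" in the reply template is replaced with the
# words of the chunk matched by the rule condition. The names `template` and
# `anchor2words` are hypothetical stand-ins for phrase.raw_text and the
# entry/group machinery above.
def substitute_anchors(template, anchor2words):
    """anchor2words maps the raw anchor text to the replacement word list."""
    utterance = template
    for raw_text, words in anchor2words.items():
        utterance = utterance.replace(raw_text, ' '.join(words))
    return utterance

# substitute_anchors('я тоже люблю $NP1(вин)', {'$NP1(вин)': ['красную', 'машину']})
# -> 'я тоже люблю красную машину'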
def check_condition(self, bot, session, interlocutor, interpreted_phrase, answering_engine):
    input_phrase = interpreted_phrase.interpretation
    text_utils = answering_engine.text_utils

    tokens = text_utils.tokenizer.tokenize(input_phrase)
    tagsets = list(text_utils.postagger.tag(tokens))
    lemmas = text_utils.lemmatizer.lemmatize(tagsets)

    #edges = syntan.parse(tokens, tagsets)
    # Stub instead of a real dependency parse:
    edges = [(word, iword, None, None, None) for (iword, word) in enumerate(tokens)]

    # Assemble per-token information: surface form, lemma, POS tagset, position.
    phrase_tokens = []
    for word_index, (token, tagset, lemma) in enumerate(zip(tokens, tagsets, lemmas)):
        t = PhraseToken()
        t.word = token
        t.norm_word = token.lower()
        t.lemma = lemma[2]
        t.tagset = tagset[1]
        t.word_index = word_index
        phrase_tokens.append(t)

    # Annotate tokens with chunk membership; the first token of each chunk
    # is flagged as a chunk starter.
    chunks = text_utils.chunker.parse(tokens)
    for chunk_index, chunk in enumerate(chunks):
        phrase_tokens[chunk.tokens[0].index].is_chunk_starter = True
        for token in chunk.tokens:
            phrase_tokens[token.index].chunk_index = chunk_index

    # The first mask that matches wins.
    for mask in self.masks:
        mx = match(phrase_tokens, mask.mask_terms)
        if mx:
            res = RuleConditionMatching.create(True)
            for group_name, group_tokens in mx.index2group.items():
                normal_words = normalize_chunk(group_tokens, edges, text_utils.flexer, text_utils.word2tags)
                res.add_group(group_name.upper(), normal_words, group_tokens)
            return res

    return RuleConditionMatching.create(False)
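# --- Illustrative sketch (not part of the original code) ---------------------
# Shape of the dependency-parse stub used in check_condition: with
# syntan.parse commented out, each token becomes a 5-tuple
# (word, token_index, None, None, None), which is all that normalize_chunk
# consumes here. The *_demo names are illustrative only.
tokens_demo = ['кошка', 'ловит', 'мышку']
edges_demo = [(word, iword, None, None, None) for (iword, word) in enumerate(tokens_demo)]
# edges_demo == [('кошка', 0, None, None, None),
#                ('ловит', 1, None, None, None),
#                ('мышку', 2, None, None, None)]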
def check_condition(self, bot, session, interlocutor, interpreted_phrase, answering_engine):
    if self.is_raw:
        input_phrase = interpreted_phrase.raw_phrase
    else:
        input_phrase = interpreted_phrase.interpretation

    text_utils = answering_engine.text_utils
    tokens = text_utils.tokenizer.tokenize(input_phrase)
    tagsets = list(text_utils.postagger.tag(tokens))
    lemmas = text_utils.lemmatizer.lemmatize(tagsets)

    #edges = syntan.parse(tokens, tagsets)
    # Stub instead of a real dependency parse:
    edges = [(word, iword, None, None, None) for (iword, word) in enumerate(tokens)]

    phrase_tokens = []
    for word_index, (token, tagset, lemma) in enumerate(zip(tokens, tagsets, lemmas)):
        t = PhraseToken()
        t.word = token
        t.norm_word = token.lower()
        t.lemma = lemma[2]
        t.tagset = tagset[1]
        t.word_index = word_index
        phrase_tokens.append(t)

    chunks = text_utils.chunker.parse(tokens)
    for chunk_index, chunk in enumerate(chunks):
        phrase_tokens[chunk.tokens[0].index].is_chunk_starter = True
        for token in chunk.tokens:
            phrase_tokens[token.index].chunk_index = chunk_index

    for mask in self.masks:
        mx = match(phrase_tokens, mask.mask_terms)
        if mx:
            res = RuleConditionMatching.create(True)
            for group_name, group_tokens in mx.index2group.items():
                # Collect both the plain normalization and the singular form
                # (ЧИСЛО=ЕД), so constraints can match either number.
                normal_words1 = normalize_chunk(group_tokens, edges, text_utils.flexer, text_utils.word2tags)
                normal_words2 = normalize_chunk(group_tokens, edges, text_utils.flexer, text_utils.word2tags,
                                                target_tags={'ЧИСЛО': 'ЕД'})
                normal_words = list(set(normal_words1) | set(normal_words2))

                if group_name in self.constraints_w2v:
                    # Every word2vec constraint must be satisfied by at least
                    # one word of the matched chunk.
                    constraints_satisfied = True
                    for c in self.constraints_w2v[group_name]:
                        hit = False
                        for chunk_word in normal_words:
                            sim = text_utils.word_similarity(c.anchor, chunk_word)
                            if sim >= c.sim:
                                hit = True
                                break
                        if not hit:
                            constraints_satisfied = False
                            break

                    if not constraints_satisfied:
                        return RuleConditionMatching.create(False)

                res.add_group(group_name.upper(), normal_words, group_tokens)
            return res

    return RuleConditionMatching.create(False)
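# --- Illustrative sketch (not part of the original code) ---------------------
# The word2vec constraint check above, restated as a standalone predicate:
# a matched group passes only if every constraint anchor is similar enough to
# at least one normalized chunk word. `toy_similarity` is a hypothetical
# stand-in for text_utils.word_similarity; Constraint mirrors the
# (anchor, sim) entries of self.constraints_w2v.
from collections import namedtuple

Constraint = namedtuple('Constraint', 'anchor sim')

def toy_similarity(word1, word2):
    # Stand-in for a real embedding cosine similarity.
    return 1.0 if word1 == word2 else 0.0

def w2v_constraints_satisfied(constraints, normal_words, similarity=toy_similarity):
    # Every constraint must be "hit" by at least one chunk word.
    return all(
        any(similarity(c.anchor, word) >= c.sim for word in normal_words)
        for c in constraints
    )

# w2v_constraints_satisfied([Constraint('еда', 0.5)], ['еда', 'вкусная'])  -> True
# w2v_constraints_satisfied([Constraint('еда', 0.5)], ['машина'])          -> False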