def check_condition(self, bot, session, interlocutor, interpreted_phrase, answering_engine):
    input_phrase = interpreted_phrase.interpretation
    text_utils = answering_engine.text_utils

    # Run the NLP pipeline: tokens -> POS tagsets -> lemmas.
    tokens = text_utils.tokenizer.tokenize(input_phrase)
    tagsets = list(text_utils.postagger.tag(tokens))
    lemmas = text_utils.lemmatizer.lemmatize(tagsets)

    #edges = syntan.parse(tokens, tagsets)
    # Stub used in place of real dependency parsing:
    edges = [(word, iword, None, None, None) for (iword, word) in enumerate(tokens)]

    phrase_tokens = []
    for word_index, (token, tagset, lemma) in enumerate(zip(tokens, tagsets, lemmas)):
        t = PhraseToken()
        t.word = token
        t.norm_word = token.lower()
        t.lemma = lemma[2]
        t.tagset = tagset[1]
        t.word_index = word_index
        phrase_tokens.append(t)

    # Mark NP chunk boundaries on the per-token annotations.
    chunks = text_utils.chunker.parse(tokens)
    for chunk_index, chunk in enumerate(chunks):
        phrase_tokens[chunk.tokens[0].index].is_chunk_starter = True
        for token in chunk.tokens:
            phrase_tokens[token.index].chunk_index = chunk_index

    # The first mask that matches the phrase wins.
    for mask in self.masks:
        mx = match(phrase_tokens, mask.mask_terms)
        if mx:
            #print('{} groups in matching:'.format(mx.groups_count()))
            res = RuleConditionMatching.create(True)
            for group_name, tokens in mx.index2group.items():
                normal_words = normalize_chunk(tokens, edges, text_utils.flexer, text_utils.word2tags)
                #print('{}={} normal={}'.format(group_name, ' '.join(t.word for t in tokens), ' '.join(normal_words)))
                res.add_group(group_name.upper(), normal_words, tokens)
            return res

    return RuleConditionMatching.create(False)
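# For reference, a minimal reconstruction of the PhraseToken container filled in above.
# This sketch is inferred from the attribute accesses in these functions, not taken from
# the project's actual class definition:
class PhraseToken:
    def __init__(self):
        self.word = None               # surface form of the token
        self.norm_word = None          # lower-cased surface form
        self.lemma = None              # normal form produced by the lemmatizer
        self.tagset = None             # part-of-speech tagset string
        self.word_index = None         # 0-based position of the token in the phrase
        self.chunk_index = None        # index of the chunk covering the token, if any
        self.is_chunk_starter = False  # True for the first token of a chunk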
def extract_chunks(self, sample):
    # Same pipeline as check_condition: tokens -> POS tagsets -> lemmas.
    tokens = self.tokenizer.tokenize(sample)
    tagsets = list(self.postagger.tag(tokens))
    lemmas = self.lemmatizer.lemmatize(tagsets)
    #edges = syntan.parse(tokens, tagsets)

    phrase_tokens = []
    for word_index, (token, tagset, lemma) in enumerate(zip(tokens, tagsets, lemmas)):
        t = PhraseToken()
        t.word = token
        t.norm_word = token.lower()
        t.lemma = lemma[2]
        t.tagset = tagset[1]
        t.word_index = word_index
        phrase_tokens.append(t)

    chunks = self.chunker.parse(tokens)
    for chunk_index, chunk in enumerate(chunks):
        phrase_tokens[chunk.tokens[0].index].is_chunk_starter = True
        for token in chunk.tokens:
            phrase_tokens[token.index].chunk_index = chunk_index

    # NB: only the chunks are returned; the per-token annotations built above
    # are not exposed to the caller.
    return chunks
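# Illustrative call pattern for extract_chunks (assumes the owning object was
# constructed with the tokenizer, postagger, lemmatizer and chunker used above):
#   for chunk in self.extract_chunks('...'):
#       chunk_words = [token.word for token in chunk.tokens]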
def check_condition(self, bot, session, interlocutor, interpreted_phrase, answering_engine):
    if self.is_raw:
        input_phrase = interpreted_phrase.raw_phrase
    else:
        input_phrase = interpreted_phrase.interpretation

    text_utils = answering_engine.text_utils

    # Run the NLP pipeline: tokens -> POS tagsets -> lemmas.
    tokens = text_utils.tokenizer.tokenize(input_phrase)
    tagsets = list(text_utils.postagger.tag(tokens))
    lemmas = text_utils.lemmatizer.lemmatize(tagsets)

    #edges = syntan.parse(tokens, tagsets)
    # Stub used in place of real dependency parsing:
    edges = [(word, iword, None, None, None) for (iword, word) in enumerate(tokens)]

    phrase_tokens = []
    for word_index, (token, tagset, lemma) in enumerate(zip(tokens, tagsets, lemmas)):
        t = PhraseToken()
        t.word = token
        t.norm_word = token.lower()
        t.lemma = lemma[2]
        t.tagset = tagset[1]
        t.word_index = word_index
        phrase_tokens.append(t)

    # Mark NP chunk boundaries on the per-token annotations.
    chunks = text_utils.chunker.parse(tokens)
    for chunk_index, chunk in enumerate(chunks):
        phrase_tokens[chunk.tokens[0].index].is_chunk_starter = True
        for token in chunk.tokens:
            phrase_tokens[token.index].chunk_index = chunk_index

    for mask in self.masks:
        mx = match(phrase_tokens, mask.mask_terms)
        if mx:
            #print('{} groups in matching:'.format(mx.groups_count()))
            res = RuleConditionMatching.create(True)
            for group_name, tokens in mx.index2group.items():
                # Normalize the chunk twice: as-is and forced to singular number
                # (target tag 'ЧИСЛО': 'ЕД'), then keep the union of both variants.
                normal_words1 = normalize_chunk(tokens, edges, text_utils.flexer, text_utils.word2tags)
                normal_words2 = normalize_chunk(tokens, edges, text_utils.flexer, text_utils.word2tags,
                                                target_tags={'ЧИСЛО': 'ЕД'})
                normal_words = list(set(normal_words1) | set(normal_words2))

                if group_name in self.constraints_w2v:
                    # Every word2vec constraint on this group must be satisfied by
                    # at least one word of the chunk.
                    constraints_satisfied = True
                    for c in self.constraints_w2v[group_name]:
                        hit = False
                        for chunk_word in normal_words:
                            sim = text_utils.word_similarity(c.anchor, chunk_word)
                            if sim >= c.sim:
                                hit = True
                                break
                        if not hit:
                            constraints_satisfied = False
                            break
                    if not constraints_satisfied:
                        return RuleConditionMatching.create(False)

                #print('{}={} normal={}'.format(group_name, ' '.join(t.word for t in tokens), ' '.join(normal_words)))
                res.add_group(group_name.upper(), normal_words, tokens)
            return res

    return RuleConditionMatching.create(False)
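# The word2vec constraint test above, re-expressed as a standalone sketch. This is an
# assumption-laden illustration, not the project's code: it uses a gensim KeyedVectors
# model in place of text_utils.word_similarity, and a hypothetical W2VConstraint
# container for the (anchor word, similarity threshold) pairs kept in constraints_w2v.
from collections import namedtuple
from gensim.models import KeyedVectors

W2VConstraint = namedtuple('W2VConstraint', ['anchor', 'sim'])

def chunk_satisfies_constraints(chunk_words, constraints, wv):
    # Every constraint must be satisfied by at least one word of the chunk:
    # its cosine similarity to the anchor word must reach the threshold.
    for c in constraints:
        hit = any(c.anchor in wv and w in wv and wv.similarity(c.anchor, w) >= c.sim
                  for w in chunk_words)
        if not hit:
            return False
    return True

# Usage (the model path is illustrative):
#   wv = KeyedVectors.load_word2vec_format('w2v.bin', binary=True)
#   chunk_satisfies_constraints(['кошка'], [W2VConstraint('кот', 0.5)], wv)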
def do_action(self, bot, session, interlocutor, interpreted_phrase, condition_matching_results, text_utils):
    if self.np_sources:
        if condition_matching_results is None:
            condition_matching_results = RuleConditionMatching.create(True)

        # Fill the $-slot groups with words taken from the bot's own answers
        # to the configured questions.
        for np, question in self.np_sources.items():
            if bot.get_engine().does_bot_know_answer(question, bot, session, interlocutor):
                interpreted_phrase2 = InterpretedPhrase(question)
                answers = bot.get_engine().build_answers(session, bot, interlocutor, interpreted_phrase2)
                if answers:
                    answer = answers[0]
                    tokens = text_utils.tokenize(answer)
                    tagsets = list(text_utils.postagger.tag(tokens))
                    lemmas = text_utils.lemmatizer.lemmatize(tagsets)

                    phrase_tokens = []
                    for word_index, (token, tagset, lemma) in enumerate(zip(tokens, tagsets, lemmas)):
                        t = PhraseToken()
                        t.word = token
                        t.norm_word = token.lower()
                        t.lemma = lemma[2]
                        t.tagset = tagset[1]
                        t.word_index = word_index
                        phrase_tokens.append(t)

                    condition_matching_results.add_group(np, tokens, phrase_tokens)
                else:
                    return None

    # First, drop from the list any utterances the bot has already said.
    new_utterances = []
    for utterance0 in self.phrases:
        utterance = self.prepare4saying(utterance0, condition_matching_results, text_utils)
        if '$' in utterance:
            # A value could not be substituted into one of the $-slots,
            # so the phrase has to be excluded.
            continue
        if session.count_bot_phrase(utterance) == 0:
            if self.known_answer_policy == 'skip' and utterance[-1] == '?':
                # Check that the bot does not already know the answer to this question:
                if bot.does_bot_know_answer(utterance, session, interlocutor):
                    continue
            new_utterances.append(utterance)

    uttered = False
    if len(new_utterances) > 0:
        # Pick one of the remaining phrases.
        if len(new_utterances) == 1:
            bot.say(session, new_utterances[0])
        else:
            bot.say(session, random.choice(new_utterances))
        uttered = True
    else:
        # The bot has already said every phrase.
        # If a list of phrases for the exhausted case is given (something like
        # "I don't know anything else about cats"), emit one of them.
        new_utterances = []
        for utterance0 in self.exhausted_phrases:
            utterance = self.prepare4saying(utterance0, condition_matching_results, text_utils)
            if '$' in utterance:
                # A value could not be substituted into one of the $-slots,
                # so the phrase has to be excluded.
                continue
            if session.count_bot_phrase(utterance) == 0:
                if self.known_answer_policy == 'skip' and utterance[-1] == '?':
                    # Check that the bot does not already know the answer to this question:
                    if bot.does_bot_know_answer(utterance, session, interlocutor):
                        continue
                new_utterances.append(utterance)

        if new_utterances:
            bot.say(session, random.choice(new_utterances))
            uttered = True
        else:
            if self.known_answer_policy == 'skip':
                pass
            else:
                # From this point on, the rule would start re-emitting
                # one of the phrases.
                #for src_phrase in sorted(self.phrases, key=lambda z: random.random()):
                #    random_phrase = self.prepare4saying(src_phrase, condition_matching_results, text_utils)
                #    if '$' not in random_phrase:
                #        bot.say(session, random_phrase)
                #        uttered = True
                #        break
                uttered = False

    return uttered
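# prepare4saying is not shown here; judging by the "'$' in utterance" checks above, it
# substitutes matched-group words into $NAME slots of a phrase template and leaves a
# slot untouched when no value is available. A minimal regex-based sketch of that
# contract (a hypothetical helper, not the project's implementation, which can also
# inflect the substituted words via text_utils):
import re

def substitute_slots(template, groups):
    """groups maps an upper-cased group name to its list of normalized words."""
    def repl(m):
        name = m.group(1)
        return ' '.join(groups[name]) if name in groups else m.group(0)
    return re.sub(r'\$(\w+)', repl, template)

# An unfilled slot keeps its '$', which is exactly what the "if '$' in utterance"
# filter in do_action relies on to discard the phrase.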