def _parse_top_intents(self, text, top_n, intents=None):
    """Return up to ``top_n`` regex-matched intents for ``text``.

    Every intent of ``self.entity_scopes`` that passes the ``intents``
    filter is tried against its regexes; the first regex producing a
    result wins for that intent. Each result then receives a probability
    proportional to ``1 / (1 + number_of_slots)``, normalised over all
    results, and results are returned by decreasing probability.

    Args:
        text: the input text to parse.
        top_n: maximum number of results to return, must be >= 1.
        intents: optional intent filter. A ``str`` is treated as a single
            intent and a ``list`` is turned into a set; ``None`` disables
            filtering.

    Returns:
        A list of at most ``top_n`` results.

    Raises:
        ValueError: if ``top_n`` is lower than 1.
    """
    if isinstance(intents, str):
        intents = {intents}
    elif isinstance(intents, list):
        intents = set(intents)

    if top_n < 1:
        raise ValueError(
            "top_n argument must be greater or equal to 1, but got: %s"
            % top_n)

    def to_placeholder(entity_name):
        return _get_entity_name_placeholder(entity_name, self.language)

    matches = []
    for intent_name, scope in iteritems(self.entity_scopes):
        if not (intents is None or intent_name in intents):
            continue

        # Builtin entities come first in the concatenated list.
        entities = (
            self.builtin_entity_parser.parse(
                text, scope=scope["builtin"], use_cache=True)
            + self.custom_entity_parser.parse(
                text, scope=scope["custom"], use_cache=True)
        )
        ranges_mapping, text_with_placeholders = \
            replace_entities_with_placeholders(
                text, entities, placeholder_fn=to_placeholder)

        plain_candidate = self._preprocess_text(text, intent_name)
        placeholder_candidate = self._preprocess_text(
            text_with_placeholders, intent_name)

        for pattern in self.regexes_per_intent[intent_name]:
            # First attempt: the text in which entities were replaced.
            match = self._get_matching_result(
                text, placeholder_candidate, pattern, intent_name,
                ranges_mapping)
            # Second attempt on the plain text, only when it differs.
            if match is None and plain_candidate != placeholder_candidate:
                match = self._get_matching_result(
                    text, plain_candidate, pattern, intent_name)
            if match is None:
                continue
            # Keep a single result per intent.
            matches.append(match)
            break

    # Several intents may match the same text: the fewer slots a result
    # holds, the larger its share of the probability mass.
    slot_weights = []
    for match in matches:
        slot_weights.append(1.0 / (1.0 + len(match[RES_SLOTS])))
    normalizer = sum(slot_weights)
    for position, match in enumerate(matches):
        match[RES_INTENT][RES_PROBA] = slot_weights[position] / normalizer

    # The sort is stable, so ties keep their discovery order.
    matches.sort(key=lambda m: m[RES_INTENT][RES_PROBA], reverse=True)
    return matches[:top_n]
def test_should_replace_entities(self):
    """Check the text and range mapping produced by entity replacement.

    The fixture holds three entities; the ``my_custom_entity`` range
    (7-12) lies inside the ``snips/ordinal`` range (3-12). The expected
    output contains placeholders and mapping entries for the ordinal and
    the datetime entities only.
    """
    # Given
    text = "Be the first to be there at 9pm"
    ordinal_entity = {
        "entity_kind": "snips/ordinal",
        "value": "the first",
        "range": {"start": 3, "end": 12},
    }
    custom_entity = {
        "entity_kind": "my_custom_entity",
        "value": "first",
        "range": {"start": 7, "end": 12},
    }
    datetime_entity = {
        "entity_kind": "snips/datetime",
        "value": "at 9pm",
        "range": {"start": 25, "end": 31},
    }
    entities = [ordinal_entity, custom_entity, datetime_entity]

    def placeholder_fn(x):
        # Upper-cased concatenation of the tokens, wrapped in '%' signs.
        joined_tokens = "".join(tokenize_light(x, "en"))
        return "%%%s%%" % joined_tokens.upper()

    # When
    replacement = replace_entities_with_placeholders(
        text=text, entities=entities, placeholder_fn=placeholder_fn)
    range_mapping, processed_text = replacement

    # Then
    # Keys are ranges in the processed text, values are ranges in `text`.
    expected_mapping = {
        (3, 17): {START: 3, END: 12},
        (30, 45): {START: 25, END: 31},
    }
    expected_processed_text = \
        "Be %SNIPSORDINAL% to be there %SNIPSDATETIME%"
    self.assertDictEqual(expected_mapping, range_mapping)
    self.assertEqual(expected_processed_text, processed_text)
def _parse_top_intents(self, text, top_n, intents=None):
    """Return up to ``top_n`` regex-matched intents for ``text``.

    Every intent of ``self.entity_scopes`` that passes the ``intents``
    filter is tried against its regexes; the first regex producing a
    result wins for that intent. All returned results share the same
    probability, ``1 / number_of_matched_intents``, computed before the
    list is truncated to ``top_n`` items.

    Args:
        text: the input text to parse.
        top_n: maximum number of results to return, must be >= 1.
        intents: optional intent filter. A ``str`` is treated as a single
            intent and a ``list`` is turned into a set; ``None`` disables
            filtering.

    Returns:
        A list of at most ``top_n`` results, in matching order.

    Raises:
        ValueError: if ``top_n`` is lower than 1.
    """
    if isinstance(intents, str):
        intents = {intents}
    elif isinstance(intents, list):
        intents = set(intents)

    if top_n < 1:
        raise ValueError(
            "top_n argument must be greater or equal to 1, but got: %s"
            % top_n)

    def to_placeholder(entity_name):
        return _get_entity_name_placeholder(entity_name, self.language)

    matches = []
    for intent_name, scope in iteritems(self.entity_scopes):
        if not (intents is None or intent_name in intents):
            continue

        found_builtin = self.builtin_entity_parser.parse(
            text, scope=scope["builtin"], use_cache=True)
        found_custom = self.custom_entity_parser.parse(
            text, scope=scope["custom"], use_cache=True)
        ranges_mapping, text_with_placeholders = \
            replace_entities_with_placeholders(
                text, found_builtin + found_custom,
                placeholder_fn=to_placeholder)

        plain_candidate = self._preprocess_text(text, intent_name)
        placeholder_candidate = self._preprocess_text(
            text_with_placeholders, intent_name)

        for pattern in self.regexes_per_intent[intent_name]:
            # First attempt: the text in which entities were replaced.
            match = self._get_matching_result(
                text, placeholder_candidate, pattern, intent_name,
                ranges_mapping)
            # Second attempt on the plain text, only when it differs.
            if match is None and plain_candidate != placeholder_candidate:
                match = self._get_matching_result(
                    text, plain_candidate, pattern, intent_name)
            if match is None:
                continue
            # Keep a single result per intent.
            matches.append(match)
            break

    # The score depends on how many intents matched, not on how many are
    # returned, hence it is computed before truncating.
    shared_score = 1. / float(len(matches)) if matches else 1.
    kept = matches[:top_n]
    for match in kept:
        match[RES_INTENT][RES_PROBA] = shared_score
    return kept
def _enrich_utterance(self, x, builtin_ents, custom_ents):
    """Turn an utterance into tokens with entities replaced.

    The utterance text is rebuilt from ``x[DATA]``, the given builtin and
    custom entities are replaced with placeholders (using
    ``self._placeholder_fn``), and the resulting text is tokenized for
    ``self.language``. When
    ``self.config.unknown_words_replacement_string`` is set, tokens equal
    to that string are dropped.

    Args:
        x: the utterance, indexed with ``DATA`` to get its chunks.
        builtin_ents: builtin entities found in the utterance.
        custom_ents: custom entities found in the utterance.

    Returns:
        The list of tokens of the enriched utterance.
    """
    raw_text = get_text_from_chunks(x[DATA])
    entities = builtin_ents + custom_ents

    # Only the processed text (second item) is needed, not the mapping.
    replaced = replace_entities_with_placeholders(
        raw_text, entities, self._placeholder_fn)
    tokens = tokenize_light(replaced[1], self.language)

    unknown_word = self.config.unknown_words_replacement_string
    if not unknown_word:
        return tokens
    return [token for token in tokens if token != unknown_word]