def resolve_slots(input, slots, dataset_entities, language, scope):
    builtin_entities = get_builtin_entities(input, language, scope)
    resolved_slots = []
    for slot in slots:
        entity_name = slot[RES_ENTITY]
        raw_value = slot[RES_VALUE]
        if is_builtin_entity(entity_name):
            found = False
            for ent in builtin_entities:
                if ent[ENTITY_KIND] == entity_name and \
                        ent[RES_MATCH_RANGE] == slot[RES_MATCH_RANGE]:
                    resolved_slot = builtin_slot(slot, ent[ENTITY])
                    resolved_slots.append(resolved_slot)
                    found = True
                    break
            if not found:
                builtin_matches = get_builtin_entities(raw_value, language,
                                                       scope=[entity_name])
                if builtin_matches:
                    resolved_slot = builtin_slot(slot,
                                                 builtin_matches[0][VALUE])
                    resolved_slots.append(resolved_slot)
        else:  # custom slot
            entity = dataset_entities[entity_name]
            if raw_value in entity[UTTERANCES]:
                resolved_value = entity[UTTERANCES][raw_value]
            elif entity[AUTOMATICALLY_EXTENSIBLE]:
                resolved_value = raw_value
            else:  # entity is skipped
                resolved_value = None
            if resolved_value is not None:
                resolved_slots.append(custom_slot(slot, resolved_value))
    return resolved_slots
def test_get_builtin_entities_should_support_all_languages(self):
    # Given
    text = ""

    for language in get_all_languages():
        msg = "get_builtin_entities does not support %s." % language
        with self.fail_if_exception(msg):
            # When / Then
            get_builtin_entities(text, language)
def numbers_variations(string, language):
    if not supports_num2words(language):
        return set()

    number_entities = get_builtin_entities(string, language,
                                           scope=[SNIPS_NUMBER])
    number_entities = sorted(number_entities,
                             key=lambda x: x[RES_MATCH_RANGE][START])
    if not number_entities:
        return set()

    digit_values = [digit_value(e) for e in number_entities]
    alpha_values = [alphabetic_value(e, language) for e in number_entities]

    values = [(n[RES_MATCH_RANGE], (d, a)) for (n, d, a)
              in zip(number_entities, digit_values, alpha_values)
              if a is not None]

    n_values = len(values)
    if 2 ** n_values > MAX_ENTITY_VARIATIONS:
        return set()

    combinations = itertools.product(range(2), repeat=n_values)
    variations = set()
    for c in combinations:
        ranges_and_utterances = [(values[i][0], values[i][1][ix])
                                 for i, ix in enumerate(c)]
        variations.add(build_variated_query(string, ranges_and_utterances))
    return variations
def _replace_builtin_entities(text, language):
    builtin_entities = get_builtin_entities(text, language, use_cache=True)
    if not builtin_entities:
        return dict(), text

    range_mapping = dict()
    processed_text = ""
    offset = 0
    current_ix = 0
    builtin_entities = sorted(builtin_entities,
                              key=lambda e: e[RES_MATCH_RANGE][START])
    for ent in builtin_entities:
        ent_start = ent[RES_MATCH_RANGE][START]
        ent_end = ent[RES_MATCH_RANGE][END]
        rng_start = ent_start + offset

        processed_text += text[current_ix:ent_start]

        entity_length = ent_end - ent_start
        entity_place_holder = _get_entity_name_placeholder(ent[ENTITY_KIND],
                                                           language)

        offset += len(entity_place_holder) - entity_length

        processed_text += entity_place_holder
        rng_end = ent_end + offset
        new_range = (rng_start, rng_end)
        range_mapping[new_range] = ent[RES_MATCH_RANGE]
        current_ix = ent_end

    processed_text += text[current_ix:]
    return range_mapping, processed_text
def numbers_variations(string, language):
    variations = set()
    if not supports_num2words(language):
        return variations

    number_entities = get_builtin_entities(
        string, language, scope=[SNIPS_NUMBER])
    number_entities = sorted(number_entities,
                             key=lambda x: x[RES_MATCH_RANGE][START])
    if not number_entities:
        return variations

    digit_values = [digit_value(e) for e in number_entities]
    alpha_values = [alphabetic_value(e, language) for e in number_entities]

    values = [(n[RES_MATCH_RANGE], (d, a)) for (n, d, a)
              in zip(number_entities, digit_values, alpha_values)
              if a is not None]

    combinations = itertools.product(range(2), repeat=len(values))
    for c in combinations:
        ranges_and_utterances = [(values[i][0], values[i][1][ix])
                                 for i, ix in enumerate(c)]
        variations.add(build_variated_query(string, ranges_and_utterances))
    return variations
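# A minimal standalone sketch of the variation expansion performed by the
# numbers_variations variants above: every detected number contributes two
# surface forms (digit and alphabetic), and itertools.product enumerates
# all 2**n ways of picking one form per number. The `values` layout mirrors
# the (match_range, (digit, alpha)) tuples built above; the concrete
# numbers are made up for illustration.
import itertools

values = [((0, 1), ("3", "three")), ((10, 12), ("15", "fifteen"))]
for choice in itertools.product(range(2), repeat=len(values)):
    picked = [(rng, forms[ix]) for (rng, forms), ix in zip(values, choice)]
    print(picked)
# Prints the 4 combinations, from [((0, 1), '3'), ((10, 12), '15')]
# to [((0, 1), 'three'), ((10, 12), 'fifteen')]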
def _replace_builtin_entities(text, language):
    builtin_entities = get_builtin_entities(text, language)
    if not builtin_entities:
        return dict(), text

    range_mapping = dict()
    processed_text = ""
    offset = 0
    current_ix = 0
    builtin_entities = sorted(builtin_entities,
                              key=lambda e: e[RES_MATCH_RANGE][START])
    for ent in builtin_entities:
        ent_start = ent[RES_MATCH_RANGE][START]
        ent_end = ent[RES_MATCH_RANGE][END]
        rng_start = ent_start + offset

        processed_text += text[current_ix:ent_start]

        entity_length = ent_end - ent_start
        entity_place_holder = _get_builtin_entity_name(ent[ENTITY_KIND],
                                                       language)

        offset += len(entity_place_holder) - entity_length

        processed_text += entity_place_holder
        rng_end = ent_end + offset
        new_range = (rng_start, rng_end)
        range_mapping[new_range] = ent[RES_MATCH_RANGE]
        current_ix = ent_end

    processed_text += text[current_ix:]
    return range_mapping, processed_text
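# Self-contained sketch of the placeholder substitution done by the two
# _replace_builtin_entities variants above: each matched span is replaced
# by a placeholder, and a mapping from the new span back to the original
# span is kept so that slots found in the processed text can be projected
# back onto the raw input. Spans are plain (start, end) tuples here, and
# the placeholder string is illustrative; the functions above store spans
# under RES_MATCH_RANGE and derive the placeholder from the entity kind.
def replace_spans(text, spans_and_placeholders):
    mapping, processed, offset, cursor = {}, "", 0, 0
    for (start, end), placeholder in spans_and_placeholders:
        processed += text[cursor:start] + placeholder
        mapping[(start + offset, start + offset + len(placeholder))] = \
            (start, end)
        offset += len(placeholder) - (end - start)
        cursor = end
    return mapping, processed + text[cursor:]

mapping, processed = replace_spans(
    "meet me at 10 pm", [((11, 16), "%SNIPSDATETIME%")])
assert processed == "meet me at %SNIPSDATETIME%"
assert mapping == {(11, 26): (11, 16)}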
def resolve_slots(input, slots, dataset_entities, language, scope):
    # Do not use cached entities here as datetimes must be computed using
    # current context
    builtin_entities = get_builtin_entities(input, language, scope,
                                            use_cache=False)
    resolved_slots = []
    for slot in slots:
        entity_name = slot[RES_ENTITY]
        raw_value = slot[RES_VALUE]
        if is_builtin_entity(entity_name):
            found = False
            for ent in builtin_entities:
                if ent[ENTITY_KIND] == entity_name and \
                        ent[RES_MATCH_RANGE] == slot[RES_MATCH_RANGE]:
                    resolved_slot = builtin_slot(slot, ent[ENTITY])
                    resolved_slots.append(resolved_slot)
                    found = True
                    break
            if not found:
                builtin_matches = get_builtin_entities(raw_value, language,
                                                       scope=[entity_name],
                                                       use_cache=False)
                if builtin_matches:
                    resolved_slot = builtin_slot(
                        slot, builtin_matches[0][VALUE])
                    resolved_slots.append(resolved_slot)
        else:  # custom slot
            entity = dataset_entities[entity_name]
            normalized_raw_value = normalize(raw_value)
            if raw_value in entity[UTTERANCES]:
                resolved_value = entity[UTTERANCES][raw_value]
            elif normalized_raw_value in entity[UTTERANCES]:
                resolved_value = entity[UTTERANCES][normalized_raw_value]
            elif entity[AUTOMATICALLY_EXTENSIBLE]:
                resolved_value = raw_value
            else:  # entity is skipped
                resolved_value = None
            if resolved_value is not None:
                resolved_slots.append(custom_slot(slot, resolved_value))
    return resolved_slots
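# Hedged illustration of the custom-slot resolution order implemented in
# the resolve_slots variants above: exact utterance lookup, then normalized
# lookup, then the raw value if the entity is automatically extensible,
# otherwise the slot is dropped. The entity dict below only mimics the
# UTTERANCES / AUTOMATICALLY_EXTENSIBLE layout assumed by resolve_slots;
# keys and values are hypothetical.
entity = {
    "utterances": {"big apple": "New York"},
    "automatically_extensible": False,
}

def _resolve_custom_value(raw_value, entity, normalize=str.lower):
    # str.lower stands in for snips-nlu's normalize helper
    if raw_value in entity["utterances"]:
        return entity["utterances"][raw_value]
    if normalize(raw_value) in entity["utterances"]:
        return entity["utterances"][normalize(raw_value)]
    if entity["automatically_extensible"]:
        return raw_value
    return None  # slot is skipped

assert _resolve_custom_value("Big Apple", entity) == "New York"
assert _resolve_custom_value("gotham", entity) is None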
def test_get_builtin_entities_should_respect_scope(self):
    # Given
    text = "meet me at 10 p.m."

    # When
    scope = ["snips/number"]
    parse = get_builtin_entities(text, "en", scope=scope)

    # Then
    self.assertEqual(len(parse), 1)
    self.assertEqual(parse[0][ENTITY_KIND], "snips/number")
def _augment_slots(self, text, tokens, tags, builtin_slots_names):
    scope = set(self.slot_name_mapping[slot]
                for slot in builtin_slots_names)
    builtin_entities = [
        be for entity_kind in scope
        for be in get_builtin_entities(
            text, self.language, [entity_kind], use_cache=True)
    ]
    # We remove builtin entities which conflict with custom slots
    # extracted by the CRF
    builtin_entities = _filter_overlapping_builtins(
        builtin_entities, tokens, tags, self.config.tagging_scheme)
    # We resolve conflicts between builtin entities by keeping the longest
    # matches. When two builtin entities span the same range, we keep both.
    builtin_entities = _disambiguate_builtin_entities(builtin_entities)
    # We group builtin entities based on their position
    grouped_entities = (
        list(bes)
        for _, bes in groupby(builtin_entities,
                              key=lambda s: s[RES_MATCH_RANGE][START]))
    grouped_entities = sorted(
        grouped_entities,
        key=lambda entities: entities[0][RES_MATCH_RANGE][START])

    features = self.compute_features(tokens)
    spans_ranges = [entities[0][RES_MATCH_RANGE]
                    for entities in grouped_entities]
    tokens_indexes = _spans_to_tokens_indexes(spans_ranges, tokens)

    # We loop on all possible slots permutations and use the CRF to find
    # the best one in terms of probability
    slots_permutations = _get_slots_permutations(
        grouped_entities, self.slot_name_mapping)
    best_updated_tags = tags
    best_permutation_score = -1
    for slots in slots_permutations:
        updated_tags = copy(tags)
        for slot_index, slot in enumerate(slots):
            indexes = tokens_indexes[slot_index]
            sub_tags_sequence = positive_tagging(
                self.config.tagging_scheme, slot, len(indexes))
            updated_tags[indexes[0]:indexes[-1] + 1] = sub_tags_sequence
        score = self._get_sequence_probability(features, updated_tags)
        if score > best_permutation_score:
            best_updated_tags = updated_tags
            best_permutation_score = score
    slots = tags_to_slots(text, tokens, best_updated_tags,
                          self.config.tagging_scheme,
                          self.slot_name_mapping)
    return _reconciliate_builtin_slots(text, slots, builtin_entities)
def _augment_slots(self, text, tokens, tags, builtin_slots_names):
    scope = set(self.slot_name_mapping[slot]
                for slot in builtin_slots_names)
    builtin_entities = [
        be for entity_kind in scope
        for be in get_builtin_entities(text, self.language, [entity_kind])
    ]
    # We remove builtin entities which conflict with custom slots
    # extracted by the CRF
    builtin_entities = _filter_overlapping_builtins(
        builtin_entities, tokens, tags, self.config.tagging_scheme)
    # We resolve conflicts between builtin entities by keeping the longest
    # matches. When two builtin entities span the same range, we keep both.
    builtin_entities = _disambiguate_builtin_entities(builtin_entities)
    # We group builtin entities based on their position
    grouped_entities = (
        list(bes)
        for _, bes in groupby(builtin_entities,
                              key=lambda s: s[RES_MATCH_RANGE][START]))
    grouped_entities = sorted(
        grouped_entities,
        key=lambda entities: entities[0][RES_MATCH_RANGE][START])

    features = self.compute_features(tokens)
    spans_ranges = [entities[0][RES_MATCH_RANGE]
                    for entities in grouped_entities]
    tokens_indexes = _spans_to_tokens_indexes(spans_ranges, tokens)

    # We loop on all possible slots permutations and use the CRF to find
    # the best one in terms of probability
    slots_permutations = _get_slots_permutations(
        grouped_entities, self.slot_name_mapping)
    best_updated_tags = tags
    best_permutation_score = -1
    for slots in slots_permutations:
        updated_tags = copy(tags)
        for slot_index, slot in enumerate(slots):
            indexes = tokens_indexes[slot_index]
            sub_tags_sequence = positive_tagging(
                self.config.tagging_scheme, slot, len(indexes))
            updated_tags[indexes[0]:indexes[-1] + 1] = sub_tags_sequence
        score = self._get_sequence_probability(features, updated_tags)
        if score > best_permutation_score:
            best_updated_tags = updated_tags
            best_permutation_score = score
    slots = tags_to_slots(text, tokens, best_updated_tags,
                          self.config.tagging_scheme,
                          self.slot_name_mapping)
    return _reconciliate_builtin_slots(text, slots, builtin_entities)
def _augment_slots(self, text, tokens, tags, builtin_slots_names):
    augmented_tags = tags
    scope = [self.slot_name_mapping[slot] for slot in builtin_slots_names]
    builtin_entities = get_builtin_entities(text, self.language, scope)
    builtin_entities = _filter_overlapping_builtins(
        builtin_entities, tokens, tags, self.config.tagging_scheme)
    grouped_entities = groupby(builtin_entities,
                               key=lambda s: s[ENTITY_KIND])
    features = None
    for entity, matches in grouped_entities:
        spans_ranges = [match[RES_MATCH_RANGE] for match in matches]
        num_possible_builtins = len(spans_ranges)
        tokens_indexes = _spans_to_tokens_indexes(spans_ranges, tokens)
        related_slots = list(
            set(s for s in builtin_slots_names
                if self.slot_name_mapping[s] == entity))
        best_updated_tags = augmented_tags
        best_permutation_score = -1
        for slots in _generate_slots_permutations(
                num_possible_builtins, related_slots,
                self.config.exhaustive_permutations_threshold):
            updated_tags = copy(augmented_tags)
            for slot_index, slot in enumerate(slots):
                if slot_index >= len(tokens_indexes):
                    break
                indexes = tokens_indexes[slot_index]
                sub_tags_sequence = positive_tagging(
                    self.config.tagging_scheme, slot, len(indexes))
                updated_tags[indexes[0]:indexes[-1] + 1] = \
                    sub_tags_sequence
            if features is None:
                features = self.compute_features(tokens)
            score = self._get_sequence_probability(features, updated_tags)
            if score > best_permutation_score:
                best_updated_tags = updated_tags
                best_permutation_score = score
        augmented_tags = best_updated_tags
    slots = tags_to_slots(text, tokens, augmented_tags,
                          self.config.tagging_scheme,
                          self.slot_name_mapping)
    return _reconciliate_builtin_slots(text, slots, builtin_entities)
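# Schematic of the permutation search shared by the _augment_slots variants
# above: enumerate candidate slot assignments for the detected builtin
# spans, rewrite the tag sequence for each assignment, and keep the most
# probable one. `score` stands in for the CRF's _get_sequence_probability,
# and the B-/I- tags for positive_tagging; this is a simplified sketch,
# not the snips-nlu implementation (which also considers "outside"
# permutations).
from itertools import product

def best_tagging(base_tags, span_token_indexes, candidate_slots, score):
    best_tags, best_score = base_tags, float("-inf")
    for slots in product(candidate_slots, repeat=len(span_token_indexes)):
        tags = list(base_tags)
        for indexes, slot in zip(span_token_indexes, slots):
            for i in indexes:
                tags[i] = ("B-" if i == indexes[0] else "I-") + slot
        s = score(tags)
        if s > best_score:
            best_tags, best_score = tags, s
    return best_tags
# E.g. with two detected spans and two candidate slots, 4 taggings are
# scored and the argmax is returned.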
def builtin_entity_match(tokens, token_index):
    text = initial_string_from_tokens(tokens)
    start = tokens[token_index].start
    end = tokens[token_index].end

    builtin_entities = get_builtin_entities(
        text, self.language, scope=[builtin_entity])
    builtin_entities = [ent for ent in builtin_entities
                        if entity_filter(ent, start, end)]
    for ent in builtin_entities:
        entity_start = ent[RES_MATCH_RANGE][START]
        entity_end = ent[RES_MATCH_RANGE][END]
        indexes = []
        for index, token in enumerate(tokens):
            # Keep only tokens that lie entirely within the entity span
            if (entity_start <= token.start < entity_end) \
                    and (entity_start < token.end <= entity_end):
                indexes.append(index)
        # The first matching entity determines the tagging scheme prefix
        return get_scheme_prefix(token_index, indexes,
                                 self.tagging_scheme)
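# The double inequality above keeps a token only when it lies entirely
# inside the entity span. A tiny standalone check (Token is a stand-in
# for the tokenizer's token type):
from collections import namedtuple

Token = namedtuple("Token", ["start", "end"])

def tokens_in_span(tokens, span_start, span_end):
    return [i for i, t in enumerate(tokens)
            if span_start <= t.start < span_end
            and span_start < t.end <= span_end]

tokens = [Token(0, 4), Token(5, 7), Token(8, 10)]
assert tokens_in_span(tokens, 5, 10) == [1, 2]  # fully contained tokens
assert tokens_in_span(tokens, 0, 6) == [0]      # partial overlap excluded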
def test_alphabetic_value(self):
    # Given
    language = LANGUAGE_EN
    string = "1 time and 23 times and one thousand and sixty and 1.2"
    entities = get_builtin_entities(string, language)
    entities = sorted(entities, key=lambda x: x[RES_MATCH_RANGE][START])
    expected_values = [
        "one",
        "twenty-three",
        "one thousand and sixty",
        None
    ]

    self.assertEqual(len(entities), len(expected_values))

    for i, ent in enumerate(entities):
        # When
        value = alphabetic_value(ent, language)

        # Then
        self.assertEqual(value, expected_values[i])
def _preprocess_utterance(utterance, language,
                          entity_utterances_to_features_names,
                          word_clusters_name):
    utterance_tokens = tokenize_light(utterance, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [_normalize_stem(t, language)
                                 for t in utterance_tokens]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    builtin_entities = get_builtin_entities(utterance, language)
    entities_ranges = (
        e[RES_MATCH_RANGE] for e in
        sorted(builtin_entities, key=lambda e: e[RES_MATCH_RANGE][START])
    )
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove builtin entities from the utterance to avoid learning
    # specific examples such as '42'
    filtered_utterance = _remove_ranges(utterance, entities_ranges)
    filtered_utterance_tokens = tokenize_light(filtered_utterance, language)
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(t, language)
        for t in filtered_utterance_tokens
    ]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))
    return features
def _preprocess_utterance(utterance, language,
                          entity_utterances_to_features_names,
                          word_clusters_name):
    utterance_text = get_text_from_chunks(utterance[DATA])
    utterance_tokens = tokenize_light(utterance_text, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [
        _normalize_stem(t, language) for t in utterance_tokens
    ]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    builtin_entities = get_builtin_entities(utterance_text, language,
                                            use_cache=True)
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove values of builtin slots from the utterance to avoid
    # learning specific samples such as '42' or 'tomorrow'
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(chunk[TEXT], language)
        for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
    ]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))
    return features
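# Hedged sketch of the feature string assembled above: normalized tokens
# with builtin-slot chunks removed, followed by sorted feature markers for
# the builtin entities that were found. The chunk layout mirrors the
# dataset format used by the second variant; the marker string and the
# "snips/" prefix test are simplified stand-ins for
# _builtin_entity_to_feature and is_builtin_entity.
utterance = {"data": [
    {"text": "set an alarm "},
    {"text": "tomorrow at 9", "entity": "snips/datetime"},
]}
kept_chunks = [chunk["text"] for chunk in utterance["data"]
               if "entity" not in chunk
               or not chunk["entity"].startswith("snips/")]
features = " ".join(" ".join(kept_chunks).split())
features += " " + "builtinentityfeaturesnipsdatetime"  # assumed marker
print(features)  # set an alarm builtinentityfeaturesnipsdatetime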