def _resolve_slots(self, text, slots):
    """Resolve the values of raw extracted slots using the entity parsers.

    Builtin and custom slots are resolved separately: builtin entities are
    parsed without cache (see comment below), custom entities with cache.
    Slots that cannot be resolved are dropped, unless their custom entity is
    automatically extensible, in which case the raw value is kept.
    """
    # Scopes restricted to the entities actually present in the slots
    builtin_scope = [
        slot[RES_ENTITY] for slot in slots
        if is_builtin_entity(slot[RES_ENTITY])
    ]
    custom_scope = [
        slot[RES_ENTITY] for slot in slots
        if not is_builtin_entity(slot[RES_ENTITY])
    ]
    # Do not use cached entities here as datetimes must be computed using
    # current context
    builtin_entities = self.builtin_entity_parser.parse(
        text, builtin_scope, use_cache=False)
    custom_entities = self.custom_entity_parser.parse(
        text, custom_scope, use_cache=True)
    resolved_slots = []
    for slot in slots:
        entity_name = slot[RES_ENTITY]
        raw_value = slot[RES_VALUE]
        is_builtin = is_builtin_entity(entity_name)
        # Select the parser/builder/cache policy matching the slot kind
        if is_builtin:
            entities = builtin_entities
            parser = self.builtin_entity_parser
            slot_builder = builtin_slot
            use_cache = False
            extensible = False
        else:
            entities = custom_entities
            parser = self.custom_entity_parser
            slot_builder = custom_slot
            use_cache = True
            extensible = self.dataset_metadata[ENTITIES][entity_name][
                AUTOMATICALLY_EXTENSIBLE]
        resolved_slot = None
        # First try: an entity match with the same kind and exact same range
        for ent in entities:
            if ent[ENTITY_KIND] == entity_name and \
                    ent[RES_MATCH_RANGE] == slot[RES_MATCH_RANGE]:
                resolved_slot = slot_builder(slot, ent[RESOLVED_VALUE])
                break
        if resolved_slot is None:
            # Second try: parse the raw slot value alone; for custom slots
            # the match must cover the whole raw value to be accepted
            matches = parser.parse(
                raw_value, scope=[entity_name], use_cache=use_cache)
            if matches:
                match = matches[0]
                if is_builtin or len(match[RES_VALUE]) == len(raw_value):
                    resolved_slot = slot_builder(
                        slot, match[RESOLVED_VALUE])
        # Last resort: keep the unresolved value for extensible entities
        if resolved_slot is None and extensible:
            resolved_slot = slot_builder(slot)
        if resolved_slot is not None:
            resolved_slots.append(resolved_slot)
    return resolved_slots
def slot_names_to_entities(self, value):
    """Set the per-intent slot-name -> entity mapping and rebuild the
    per-intent entity scopes (split between builtin and custom entities).
    """
    self._slot_names_to_entities = value
    if value is None:
        # Without a mapping there is nothing to scope
        self.entity_scopes = None
        return
    scopes = dict()
    for intent, slot_mapping in iteritems(value):
        intent_entities = set(itervalues(slot_mapping))
        scopes[intent] = {
            "builtin": {ent for ent in intent_entities
                        if is_builtin_entity(ent)},
            "custom": {ent for ent in intent_entities
                       if not is_builtin_entity(ent)},
        }
    self.entity_scopes = scopes
def validate_and_format_dataset(dataset):
    """Checks that the dataset is valid and format it"""
    # Idempotent: a dataset that was already validated is returned as-is
    if dataset.get(VALIDATED, False):
        return dataset
    # Work on a JSON round-tripped deep copy so the caller's dict is
    # never mutated and only JSON-serializable content remains
    dataset = deepcopy(dataset)
    dataset = json.loads(json.dumps(dataset))
    validate_type(dataset, dict)
    for key in [INTENTS, ENTITIES, LANGUAGE]:
        validate_key(dataset, key, object_label="dataset")
    validate_type(dataset[ENTITIES], dict)
    validate_type(dataset[INTENTS], dict)
    language = dataset[LANGUAGE]
    validate_type(language, str)
    if language not in get_all_languages():
        raise ValueError("Unknown language: '%s'" % language)
    for intent in itervalues(dataset[INTENTS]):
        validate_and_format_intent(intent, dataset[ENTITIES])
    utterance_entities_values = extract_utterance_entities(dataset)
    builtin_entity_parser = BuiltinEntityParser.build(dataset=dataset)
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        entity_values = utterance_entities_values[entity_name]
        if is_builtin_entity(entity_name):
            formatted_entity = validate_and_format_builtin_entity(
                entity, entity_values)
        else:
            formatted_entity = validate_and_format_custom_entity(
                entity, entity_values, language, builtin_entity_parser)
        dataset[ENTITIES][entity_name] = formatted_entity
    dataset[VALIDATED] = True
    return dataset
def get_slots(self, text):
    """Extracts slots from the provided text

    Returns:
        list of dict: The list of extracted slots

    Raises:
        NotTrained: When the slot filler is not fitted
    """
    # Early return if the intent has no slots
    if not self.slot_name_mapping:
        return []
    tokens = tokenize(text, self.language)
    if not tokens:
        return []
    features = self.compute_features(tokens)
    raw_tags = self.crf_model.predict_single(features)
    tags = [_decode_tag(tag) for tag in raw_tags]
    slots = tags_to_slots(text, tokens, tags, self.config.tagging_scheme,
                          self.slot_name_mapping)
    builtin_slots_names = {
        slot_name
        for slot_name, entity in iteritems(self.slot_name_mapping)
        if is_builtin_entity(entity)
    }
    if not builtin_slots_names:
        return slots
    # Replace tags corresponding to builtin entities by outside tags
    tags = _replace_builtin_tags(tags, builtin_slots_names)
    return self._augment_slots(text, tokens, tags, builtin_slots_names)
def _enrich_utterance(self, utterance, builtin_entities, custom_entities,
                      word_clusters):
    """Build the feature string for an utterance: its text (minus builtin
    slot values) followed by sorted entity and word-cluster features.
    """
    # We remove values of builtin slots from the utterance to avoid
    # learning specific samples such as '42' or 'tomorrow'
    kept_chunks = [
        chunk[TEXT] for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
    ]
    enriched = get_default_sep(self.language).join(kept_chunks)
    builtin_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], self.language)
        for ent in builtin_entities
    ]
    custom_features = [
        _entity_name_to_feature(e[ENTITY_KIND], self.language)
        for e in custom_entities
    ]
    # Append feature groups in a fixed order: builtin, custom, clusters
    for feature_group in (builtin_features, custom_features, word_clusters):
        if feature_group:
            enriched += " " + " ".join(sorted(feature_group))
    return enriched
def parse_slot(self, intent, slot, msg):
    """Parse ``msg`` as a value for ``slot`` of ``intent``.

    Returns:
        list of SlotValue: parsed slot values; falls back to the raw
        message when no entity label is known or when the custom entity
        is automatically extensible and nothing matched.
    """
    if not self.is_ready:
        return []
    # Here I still use my own method to parse slots because it gives better
    # results in my benchmarks.
    #
    # However, we should keep an eye on https://github.com/snipsco/snips-nlu/pull/724
    # for when it becomes relevant. For now get_slots returns less results than this
    # homemade method below.
    entity_label = self._slot_mappings.get(intent, {}).get(slot)
    # No label, just returns the given value
    if not entity_label:
        return [SlotValue(msg)]
    result = []
    # If it's a builtin entity, try to parse it
    if is_builtin_entity(entity_label):
        parsed = self._engine.builtin_entity_parser.parse(
            msg, [entity_label])
        for slot_data in parsed:
            # Here we move some keys to keep the returned meta consistent with the parse above
            # We are checking if `rawValue` is already present because snips-nlu seems to keep
            # a cache so to avoid mutating the same dict twice, we check again this added key.
            if RES_RAW_VALUE not in slot_data:
                slot_data[RES_RAW_VALUE] = slot_data[RES_VALUE]
                slot_data[RES_VALUE] = slot_data[RESOLVED_VALUE]
                slot_data[ENTITY] = slot_data[ENTITY_KIND]
            result.append(
                SlotValue(get_entity_value(slot_data[RES_VALUE]),
                          **slot_data))
    else:
        parsed = self._engine.custom_entity_parser.parse(
            msg, [entity_label])
        # The custom parser did not find a match and the entity is
        # extensible? Just return the raw value.
        # NOTE: use a chained .get so an entity label missing from
        # self._entities no longer raises a KeyError; treat it as
        # non-extensible in that case.
        if not parsed and self._entities.get(
                entity_label, {}).get(AUTOMATICALLY_EXTENSIBLE, False):
            return [SlotValue(msg)]
        for slot_data in parsed:
            if RES_RAW_VALUE not in slot_data:
                slot_data[RES_RAW_VALUE] = slot_data[RES_VALUE]
                slot_data[RES_VALUE] = {
                    'kind': 'Custom',
                    RES_VALUE: slot_data[RESOLVED_VALUE],
                }
                slot_data[ENTITY] = slot_data[ENTITY_KIND]
            result.append(
                SlotValue(get_entity_value(slot_data[RES_VALUE]),
                          **slot_data))
    return result
def fit(self, x, dataset):
    """Fits the idf of the vectorizer on the given utterances after
    enriching them with builtin entities matches, custom entities matches
    and the potential word clusters matches

    Args:
        x (list of dict): list of utterances
        dataset (dict): dataset from which x was extracted (needed to
            extract the language and the builtin entity scope)

    Returns:
        :class:`.TfidfVectorizer`: The fitted vectorizer
    """
    language = dataset[LANGUAGE]
    self.load_resources_if_needed(language)
    self.fit_builtin_entity_parser_if_needed(dataset)
    self.fit_custom_entity_parser_if_needed(dataset)
    self._language = language
    self._init_vectorizer(language)
    self.builtin_entity_scope = {
        entity for entity in dataset[ENTITIES] if is_builtin_entity(entity)
    }
    enriched_utterances = [
        self._enrich_utterance(*prepared)
        for prepared in zip(*self._preprocess(x, training=True))
    ]
    return self._tfidf_vectorizer.fit(enriched_utterances)
def fit_transform(self, x, dataset):
    """Fits the idf of the vectorizer on the given utterances after
    enriching them with builtin entities matches, custom entities matches
    and the potential word clusters matches. Returns the featurized
    utterances.

    Args:
        x (list of dict): list of utterances
        dataset (dict): dataset from which x was extracted (needed to
            extract the language and the builtin entity scope)

    Returns:
        :class:`.scipy.sparse.csr_matrix`: A sparse matrix X of shape
            (len(x), len(self.vocabulary)) where X[i, j] contains tfdif of
            the ngram of index j of the vocabulary in the utterance i
    """
    language = dataset[LANGUAGE]
    self.load_resources_if_needed(language)
    self.fit_builtin_entity_parser_if_needed(dataset)
    self.fit_custom_entity_parser_if_needed(dataset)
    self._language = language
    self._init_vectorizer(language)
    self.builtin_entity_scope = {
        entity for entity in dataset[ENTITIES] if is_builtin_entity(entity)
    }
    enriched_utterances = [
        self._enrich_utterance(*prepared)
        for prepared in zip(*self._preprocess(x, training=True))
    ]
    return self._tfidf_vectorizer.fit_transform(enriched_utterances)
def fit(self, x, dataset):
    """Fits the CooccurrenceVectorizer

    Given a list of utterances the CooccurrenceVectorizer will extract
    word pairs appearing in the same utterance. The order in which the
    words appear is kept. Additionally, if self.config.window_size is not
    None then the vectorizer will only look in a context window of
    self.config.window_size after each word.

    Args:
        x (iterable): list of utterances
        dataset (dict): dataset from which x was extracted (needed to
            extract the language and the builtin entity scope)

    Returns:
        :class:`.CooccurrenceVectorizer`: The fitted vectorizer
    """
    language = dataset[LANGUAGE]
    self.load_resources_if_needed(language)
    self.fit_builtin_entity_parser_if_needed(dataset)
    self.fit_custom_entity_parser_if_needed(dataset)
    self._language = language
    self.builtin_entity_scope = {
        ent for ent in dataset[ENTITIES] if is_builtin_entity(ent)
    }
    enriched_utterances = [
        self._enrich_utterance(*prepared)
        for prepared in zip(*self._preprocess(list(x)))
    ]
    unique_pairs = {
        pair
        for utterance in enriched_utterances
        for pair in self._extract_word_pairs(utterance)
    }
    # Deterministic indexing: pairs are sorted before being numbered
    self._word_pairs = {
        pair: index for index, pair in enumerate(sorted(unique_pairs))
    }
    return self
def build(cls, dataset, parser_usage):
    """Build a custom entity parser from a dataset, stemming entity
    utterances according to ``parser_usage``.
    """
    from snips_nlu.dataset import validate_and_format_dataset
    dataset = validate_and_format_dataset(dataset)
    language = dataset[LANGUAGE]
    custom_entities = dict()
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        if not is_builtin_entity(entity_name):
            custom_entities[entity_name] = deepcopy(entity)
    if parser_usage == CustomEntityParserUsage.WITH_AND_WITHOUT_STEMS:
        # Keep both the raw and the stemmed variants of each utterance
        for ent in viewvalues(custom_entities):
            stemmed = _stem_entity_utterances(ent[UTTERANCES], language)
            ent[UTTERANCES] = _merge_entity_utterances(
                ent[UTTERANCES], stemmed)
    elif parser_usage == CustomEntityParserUsage.WITH_STEMS:
        # Replace utterances by their stemmed form
        for ent in viewvalues(custom_entities):
            ent[UTTERANCES] = _stem_entity_utterances(
                ent[UTTERANCES], language)
    elif parser_usage is None:
        raise ValueError("A parser usage must be defined in order to fit "
                         "a CustomEntityParser")
    configuration = _create_custom_entity_parser_configuration(
        custom_entities)
    gazetteer_parser = GazetteerEntityParser.build(configuration)
    return cls(gazetteer_parser, language, parser_usage)
def add_unknown_word_to_utterances(augmented_utterances, replacement_string,
                                   unknown_word_prob, random_state):
    """Randomly replace words of custom-entity chunks by the unknown-word
    placeholder, with probability ``unknown_word_prob`` per chunk.
    Mutates and returns ``augmented_utterances``.
    """
    for utterance in augmented_utterances:
        for chunk in utterance[DATA]:
            # Only custom-entity chunks are eligible for replacement
            if ENTITY not in chunk or is_builtin_entity(chunk[ENTITY]):
                continue
            # rand() is only drawn for eligible chunks, preserving the
            # random_state consumption pattern
            if random_state.rand() >= unknown_word_prob:
                continue
            chunk[TEXT] = WORD_REGEX.sub(replacement_string, chunk[TEXT])
    return augmented_utterances
def remove_builtin_slots(dataset):
    """Return a deep copy of ``dataset`` in which every chunk tagged with
    a builtin entity has been removed from the intent utterances.
    """
    filtered_dataset = deepcopy(dataset)
    for intent_data in itervalues(filtered_dataset[INTENTS]):
        for utterance in intent_data[UTTERANCES]:
            kept_chunks = []
            for chunk in utterance[DATA]:
                if ENTITY in chunk and is_builtin_entity(chunk[ENTITY]):
                    continue
                kept_chunks.append(chunk)
            utterance[DATA] = kept_chunks
    return filtered_dataset
def create_entity(entity_name, utterances=None, automatically_extensible=True,
                  use_synonyms=True):
    """Create a :class:`BuiltinEntity` or :class:`CustomEntity` depending on
    whether ``entity_name`` refers to a builtin entity.
    """
    if is_builtin_entity(entity_name):
        return BuiltinEntity(entity_name)
    # Avoid a shared mutable default for utterances
    custom_utterances = [] if utterances is None else utterances
    return CustomEntity(entity_name, custom_utterances,
                        automatically_extensible, use_synonyms)
def validate_and_format_dataset(dataset):
    """Checks that the dataset is valid and format it

    Raise:
        DatasetFormatError: When the dataset format is wrong
    """
    from snips_nlu_parsers import get_all_languages
    # Accept a Dataset object as well as a plain dict
    if isinstance(dataset, Dataset):
        dataset = dataset.json
    # Make this function idempotent
    if dataset.get(VALIDATED, False):
        return dataset
    # Deep copy + JSON round-trip: never mutate the caller's dict and keep
    # only JSON-serializable content
    dataset = deepcopy(dataset)
    dataset = json.loads(json.dumps(dataset))
    validate_type(dataset, dict, object_label="dataset")
    mandatory_keys = [INTENTS, ENTITIES, LANGUAGE]
    for key in mandatory_keys:
        validate_key(dataset, key, object_label="dataset")
    validate_type(dataset[ENTITIES], dict, object_label="entities")
    validate_type(dataset[INTENTS], dict, object_label="intents")
    language = dataset[LANGUAGE]
    validate_type(language, str, object_label="language")
    if language not in get_all_languages():
        raise DatasetFormatError("Unknown language: '%s'" % language)
    # Rebuild the intents dict in sorted key order for determinism
    dataset[INTENTS] = {
        intent_name: intent_data
        for intent_name, intent_data in sorted(iteritems(dataset[INTENTS]))}
    for intent in itervalues(dataset[INTENTS]):
        _validate_and_format_intent(intent, dataset[ENTITIES])
    utterance_entities_values = extract_utterance_entities(dataset)
    builtin_entity_parser = BuiltinEntityParser.build(dataset=dataset)
    # Rebuild the entities dict in sorted key order for determinism
    dataset[ENTITIES] = {
        intent_name: entity_data
        for intent_name, entity_data in sorted(iteritems(dataset[ENTITIES]))}
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        uterrance_entities = utterance_entities_values[entity_name]
        if is_builtin_entity(entity_name):
            dataset[ENTITIES][entity_name] = \
                _validate_and_format_builtin_entity(
                    entity, uterrance_entities)
        else:
            dataset[ENTITIES][entity_name] = \
                _validate_and_format_custom_entity(
                    entity, uterrance_entities, language,
                    builtin_entity_parser)
    dataset[VALIDATED] = True
    return dataset
def _get_entity_scopes(dataset):
    """Group intents sharing the same entity scope.

    Returns a list of dicts, each with an "intent_group" (list of intent
    names) and the "entity_scope" (builtin/custom entity lists) they share.
    """
    intent_entities = extract_intent_entities(dataset)
    intent_groups = []
    entity_scopes = []
    for intent, entities in sorted(iteritems(intent_entities)):
        scope = {
            "builtin": list(
                {ent for ent in entities if is_builtin_entity(ent)}),
            "custom": list(
                {ent for ent in entities if not is_builtin_entity(ent)})
        }
        try:
            # Intents with an identical scope share the same group
            existing_idx = entity_scopes.index(scope)
        except ValueError:
            entity_scopes.append(scope)
            intent_groups.append([intent])
        else:
            intent_groups[existing_idx].append(intent)
    return [
        {"intent_group": group, "entity_scope": scope}
        for group, scope in zip(intent_groups, entity_scopes)
    ]
def get_dataset_specific_noise(dataset, language):
    """Return a noise list that excludes the dataset entity values"""
    entities_values = set()
    for ent_name, ent in iteritems(dataset[ENTITIES]):
        # Builtin entities have no user-defined values to exclude
        if is_builtin_entity(ent_name):
            continue
        for raw_value, resolved_value in iteritems(ent[UTTERANCES]):
            entities_values.add(raw_value)
            entities_values.add(resolved_value)
    original_noise = get_noise(language)
    specific_noise = [
        word for word in original_noise if word not in entities_values
    ]
    # Avoid returning an empty noise
    return specific_noise if specific_noise else original_noise
def get_entities_iterators(intent_entities, language,
                           add_builtin_entities_examples, random_state):
    """Return an infinite value iterator per entity, values shuffled with
    ``random_state``; builtin entity examples are optionally prepended.
    """
    entities_its = dict()
    for entity_name, entity in iteritems(intent_entities):
        shuffled_values = random_state.permutation(
            list(entity[UTTERANCES]))
        if add_builtin_entities_examples and is_builtin_entity(entity_name):
            entity_examples = get_builtin_entity_examples(
                entity_name, language)
            # Builtin entity examples must be kept first in the iterator to
            # ensure that they are used when augmenting data
            entities_its[entity_name] = cycle(
                entity_examples + list(shuffled_values))
        else:
            entities_its[entity_name] = cycle(shuffled_values)
    return entities_its
def _get_dataset_metadata(dataset):
    """Extract the metadata needed at inference time from a dataset.

    Returns:
        dict: language code, per-custom-entity extensibility flags, and
        the intent slot-name mappings.
    """
    # NOTE: removed a no-op `dataset = dataset` self-assignment that was
    # present here
    entities = dict()
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        # Builtin entities carry no dataset-specific metadata
        if is_builtin_entity(entity_name):
            continue
        entities[entity_name] = {
            AUTOMATICALLY_EXTENSIBLE: entity[AUTOMATICALLY_EXTENSIBLE]
        }
    slot_name_mappings = get_slot_name_mappings(dataset)
    return {
        "language_code": dataset[LANGUAGE],
        "entities": entities,
        "slot_name_mappings": slot_name_mappings
    }
def capitalize_utterances(utterances, entities, language, ratio,
                          random_state):
    """Lowercase every chunk, then re-capitalize a random fraction
    (``ratio``) of the chunks belonging to custom entities flagged with
    CAPITALIZE. Input utterances are not mutated.
    """
    capitalized_utterances = []
    for utterance in utterances:
        new_utterance = deepcopy(utterance)
        for i, chunk in enumerate(new_utterance[DATA]):
            new_utterance[DATA][i][TEXT] = chunk[TEXT].lower()
            if ENTITY not in chunk:
                continue
            entity_label = chunk[ENTITY]
            # rand() is drawn only for capitalizable custom-entity chunks,
            # preserving the random_state consumption pattern
            if (not is_builtin_entity(entity_label)
                    and entities[entity_label][CAPITALIZE]
                    and random_state.rand() <= ratio):
                new_utterance[DATA][i][TEXT] = capitalize(
                    chunk[TEXT], language)
        capitalized_utterances.append(new_utterance)
    return capitalized_utterances
def _preprocess_utterance(utterance, language, builtin_entity_parser,
                          custom_entity_parser, word_clusters_name,
                          use_stemming, unknownword_replacement_string):
    """Turn an annotated utterance into a single feature string: its
    (normalized/stemmed) tokens minus builtin slot values, followed by
    sorted builtin-entity, custom-entity and word-cluster features.
    """
    utterance_text = get_text_from_chunks(utterance[DATA])
    utterance_tokens = tokenize_light(utterance_text, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [_normalize_stem(t, language, use_stemming)
                                 for t in utterance_tokens]
    # Custom entities are parsed on the normalized/stemmed text
    custom_entities = custom_entity_parser.parse(
        " ".join(normalized_stemmed_tokens))
    # Drop matches on the unknown-word placeholder injected by augmentation
    custom_entities = [e for e in custom_entities
                       if e["value"] != unknownword_replacement_string]
    custom_entities_features = [
        _entity_name_to_feature(e[ENTITY_KIND], language)
        for e in custom_entities]
    # Builtin entities are parsed on the raw utterance text
    builtin_entities = builtin_entity_parser.parse(
        utterance_text, use_cache=True)
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]
    # We remove values of builtin slots from the utterance to avoid learning
    # specific samples such as '42' or 'tomorrow'
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(chunk[TEXT], language, use_stemming)
        for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
    ]
    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    # Feature groups are appended in a fixed order: builtin, custom, clusters
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if custom_entities_features:
        features += " " + " ".join(sorted(custom_entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))
    return features
def _reconciliate_builtin_slots(text, slots, builtin_entities):
    """Widen builtin slots to the range of a strictly longer builtin entity
    match of the same kind that fully covers them. Mutates and returns
    ``slots``.
    """
    for slot in slots:
        if not is_builtin_entity(slot[RES_ENTITY]):
            continue
        slot_start = slot[RES_MATCH_RANGE][START]
        slot_end = slot[RES_MATCH_RANGE][END]
        slot_length = slot_end - slot_start
        for entity in builtin_entities:
            if entity[ENTITY_KIND] != slot[RES_ENTITY]:
                continue
            ent_start = entity[RES_MATCH_RANGE][START]
            ent_end = entity[RES_MATCH_RANGE][END]
            covers_slot = ent_start <= slot_start and ent_end >= slot_end
            strictly_longer = (ent_end - ent_start) > slot_length
            if covers_slot and strictly_longer:
                slot[RES_MATCH_RANGE] = {START: ent_start, END: ent_end}
                slot[RES_VALUE] = text[ent_start:ent_end]
                break
    return slots
def _validate_and_format_intent(intent, entities):
    """Validate the structure of an intent dict and check that every custom
    entity referenced by its chunks exists in ``entities``.
    """
    validate_type(intent, dict, "intent")
    validate_key(intent, UTTERANCES, object_label="intent dict")
    validate_type(intent[UTTERANCES], list, object_label="utterances")
    for utterance in intent[UTTERANCES]:
        validate_type(utterance, dict, object_label="utterance")
        validate_key(utterance, DATA, object_label="utterance")
        validate_type(utterance[DATA], list, object_label="utterance data")
        for chunk in utterance[DATA]:
            validate_type(chunk, dict, object_label="utterance chunk")
            validate_key(chunk, TEXT, object_label="chunk")
            # Plain text chunks carry neither entity nor slot name
            if ENTITY not in chunk and SLOT_NAME not in chunk:
                continue
            # A slot chunk must carry both keys
            validate_keys(chunk, [ENTITY, SLOT_NAME], object_label="chunk")
            # Builtin entities need not be declared in the dataset
            if not is_builtin_entity(chunk[ENTITY]):
                validate_key(entities, chunk[ENTITY], object_label=ENTITIES)
    return intent
def _entities_from_utterance(utterance):
    """Collect the entity annotations of an utterance with their character
    ranges, split into (builtin_entities, custom_entities).
    """
    builtin_ents = []
    custom_ents = []
    offset = 0
    for chunk in utterance[DATA]:
        chunk_text = chunk[TEXT]
        chunk_end = offset + len(chunk_text)
        if ENTITY in chunk:
            entity = {
                ENTITY_KIND: chunk[ENTITY],
                RES_VALUE: chunk_text,
                RES_MATCH_RANGE: {
                    START: offset,
                    END: chunk_end
                }
            }
            bucket = (builtin_ents if is_builtin_entity(entity[ENTITY_KIND])
                      else custom_ents)
            bucket.append(entity)
        offset = chunk_end
    return builtin_ents, custom_ents
def fit(self, dataset, intent):
    """Record the dataset language and the list of custom entities used by
    ``intent``. Returns self for chaining.
    """
    self.language = dataset[LANGUAGE]
    custom_entities = extract_intent_entities(
        dataset, lambda e: not is_builtin_entity(e))[intent]
    self.entities = list(custom_entities)
    return self