def _resolve_slots(self, text, slots):
        builtin_scope = [
            slot[RES_ENTITY] for slot in slots
            if is_builtin_entity(slot[RES_ENTITY])
        ]
        custom_scope = [
            slot[RES_ENTITY] for slot in slots
            if not is_builtin_entity(slot[RES_ENTITY])
        ]
        # Do not use cached entities here as datetimes must be computed using
        # current context
        builtin_entities = self.builtin_entity_parser.parse(text,
                                                            builtin_scope,
                                                            use_cache=False)
        custom_entities = self.custom_entity_parser.parse(text,
                                                          custom_scope,
                                                          use_cache=True)

        resolved_slots = []
        for slot in slots:
            entity_name = slot[RES_ENTITY]
            raw_value = slot[RES_VALUE]
            is_builtin = is_builtin_entity(entity_name)
            if is_builtin:
                entities = builtin_entities
                parser = self.builtin_entity_parser
                slot_builder = builtin_slot
                use_cache = False
                extensible = False
            else:
                entities = custom_entities
                parser = self.custom_entity_parser
                slot_builder = custom_slot
                use_cache = True
                extensible = self.dataset_metadata[ENTITIES][entity_name][
                    AUTOMATICALLY_EXTENSIBLE]

            resolved_slot = None
            for ent in entities:
                if ent[ENTITY_KIND] == entity_name and \
                        ent[RES_MATCH_RANGE] == slot[RES_MATCH_RANGE]:
                    resolved_slot = slot_builder(slot, ent[RESOLVED_VALUE])
                    break
            if resolved_slot is None:
                matches = parser.parse(raw_value,
                                       scope=[entity_name],
                                       use_cache=use_cache)
                if matches:
                    match = matches[0]
                    if is_builtin or len(match[RES_VALUE]) == len(raw_value):
                        resolved_slot = slot_builder(slot,
                                                     match[RESOLVED_VALUE])

            if resolved_slot is None and extensible:
                resolved_slot = slot_builder(slot)

            if resolved_slot is not None:
                resolved_slots.append(resolved_slot)

        return resolved_slots
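
The heart of this example is the range-based reconciliation between tagged slots and parsed entities, with a second parser pass over the raw value as a fallback. A minimal self-contained sketch of that matching step, using plain string keys as hypothetical stand-ins for the RES_* constants:

# Sketch of the range-based slot/entity matching above. The keys
# ("entity", "range", "resolved") are stand-ins for RES_ENTITY,
# RES_MATCH_RANGE and RESOLVED_VALUE, not the actual constants.
def match_slot_to_entity(slot, entities):
    for ent in entities:
        if ent["entity"] == slot["entity"] and ent["range"] == slot["range"]:
            return dict(slot, resolved=ent["resolved"])
    return None  # caller falls back to re-parsing the raw slot value

slot = {"entity": "snips/number", "range": (12, 14), "value": "21"}
entities = [{"entity": "snips/number", "range": (12, 14), "resolved": 21.0}]
print(match_slot_to_entity(slot, entities))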
Example #2
 @slot_names_to_entities.setter
 def slot_names_to_entities(self, value):
     self._slot_names_to_entities = value
     if value is None:
         self.entity_scopes = None
     else:
         self.entity_scopes = {
             intent: {
                 "builtin": {ent for ent in itervalues(slot_mapping)
                             if is_builtin_entity(ent)},
                 "custom": {ent for ent in itervalues(slot_mapping)
                            if not is_builtin_entity(ent)}
             }
             for intent, slot_mapping in iteritems(value)}
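
For a concrete picture of what this setter computes, here is the same scope construction on a toy mapping. is_builtin_entity is approximated by the "snips/" naming convention, an assumption made for illustration only:

# Sketch: split each intent's slot mapping into builtin/custom scopes.
def is_builtin(entity):
    # Approximation: snips builtin entities are conventionally prefixed.
    return entity.startswith("snips/")

slot_names_to_entities = {
    "setTemperature": {"room": "room", "temp": "snips/temperature"},
}
entity_scopes = {
    intent: {
        "builtin": {e for e in mapping.values() if is_builtin(e)},
        "custom": {e for e in mapping.values() if not is_builtin(e)},
    }
    for intent, mapping in slot_names_to_entities.items()
}
print(entity_scopes)
# {'setTemperature': {'builtin': {'snips/temperature'}, 'custom': {'room'}}}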
Example #3
def validate_and_format_dataset(dataset):
    """Checks that the dataset is valid and format it"""
    # Make this function idempotent
    if dataset.get(VALIDATED, False):
        return dataset
    dataset = deepcopy(dataset)
    dataset = json.loads(json.dumps(dataset))
    validate_type(dataset, dict)
    mandatory_keys = [INTENTS, ENTITIES, LANGUAGE]
    for key in mandatory_keys:
        validate_key(dataset, key, object_label="dataset")
    validate_type(dataset[ENTITIES], dict)
    validate_type(dataset[INTENTS], dict)
    language = dataset[LANGUAGE]
    validate_type(language, str)
    if language not in get_all_languages():
        raise ValueError("Unknown language: '%s'" % language)

    for intent in itervalues(dataset[INTENTS]):
        validate_and_format_intent(intent, dataset[ENTITIES])

    utterance_entities_values = extract_utterance_entities(dataset)
    builtin_entity_parser = BuiltinEntityParser.build(dataset=dataset)

    for entity_name, entity in iteritems(dataset[ENTITIES]):
        utterance_entities = utterance_entities_values[entity_name]
        if is_builtin_entity(entity_name):
            dataset[ENTITIES][entity_name] = \
                validate_and_format_builtin_entity(entity, utterance_entities)
        else:
            dataset[ENTITIES][entity_name] = validate_and_format_custom_entity(
                entity, utterance_entities, language, builtin_entity_parser)
    dataset[VALIDATED] = True
    return dataset
Example #4
    def get_slots(self, text):
        """Extracts slots from the provided text

        Returns:
            list of dict: The list of extracted slots

        Raises:
            NotTrained: When the slot filler is not fitted
        """
        if not self.slot_name_mapping:
            # Early return if the intent has no slots
            return []

        tokens = tokenize(text, self.language)
        if not tokens:
            return []
        features = self.compute_features(tokens)
        tags = [
            _decode_tag(tag) for tag in self.crf_model.predict_single(features)
        ]
        slots = tags_to_slots(text, tokens, tags, self.config.tagging_scheme,
                              self.slot_name_mapping)

        builtin_slots_names = set(
            slot_name
            for (slot_name, entity) in iteritems(self.slot_name_mapping)
            if is_builtin_entity(entity))
        if not builtin_slots_names:
            return slots

        # Replace tags corresponding to builtin entities by outside tags
        tags = _replace_builtin_tags(tags, builtin_slots_names)
        return self._augment_slots(text, tokens, tags, builtin_slots_names)
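
The tags_to_slots step converts the per-token CRF tags into slots. A toy decoder for a BIO tagging scheme gives the idea; the real helper also supports other tagging schemes and maps slot names through self.slot_name_mapping:

# Toy BIO decoder: group contiguous B-/I- tags into (slot_name, text) slots.
def bio_tags_to_slots(tokens, tags):
    slots, current = [], None
    for token, tag in zip(tokens, tags):
        if tag.startswith("B-"):
            if current:
                slots.append(current)
            current = (tag[2:], [token])
        elif tag.startswith("I-") and current and tag[2:] == current[0]:
            current[1].append(token)
        else:
            if current:
                slots.append(current)
            current = None
    if current:
        slots.append(current)
    return [(name, " ".join(words)) for name, words in slots]

print(bio_tags_to_slots(["set", "temp", "to", "21"],
                        ["O", "O", "O", "B-temperature"]))
# [('temperature', '21')]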
Example #5
    def _enrich_utterance(self, utterance, builtin_entities, custom_entities,
                          word_clusters):
        custom_entities_features = [
            _entity_name_to_feature(e[ENTITY_KIND], self.language)
            for e in custom_entities
        ]

        builtin_entities_features = [
            _builtin_entity_to_feature(ent[ENTITY_KIND], self.language)
            for ent in builtin_entities
        ]

        # We remove values of builtin slots from the utterance to avoid
        # learning specific samples such as '42' or 'tomorrow'
        filtered_tokens = [
            chunk[TEXT] for chunk in utterance[DATA]
            if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
        ]

        features = get_default_sep(self.language).join(filtered_tokens)

        if builtin_entities_features:
            features += " " + " ".join(sorted(builtin_entities_features))
        if custom_entities_features:
            features += " " + " ".join(sorted(custom_entities_features))
        if word_clusters:
            features += " " + " ".join(sorted(word_clusters))

        return features
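
Concretely, the enrichment yields a single whitespace-joined string: the filtered tokens first, then the sorted builtin, custom and cluster features. A toy illustration, where the feature names are invented stand-ins for what _entity_name_to_feature and _builtin_entity_to_feature return:

# Toy feature-string assembly mirroring the method above.
filtered_tokens = ["set", "temp", "to"]  # the builtin value "21" was dropped
builtin_features = ["builtinentityfeaturesnipsnumber"]  # invented name
custom_features = ["entityfeatureroom"]                 # invented name
word_clusters = ["cluster_42"]

features = " ".join(filtered_tokens)
for extra in (builtin_features, custom_features, word_clusters):
    if extra:
        features += " " + " ".join(sorted(extra))
print(features)
# set temp to builtinentityfeaturesnipsnumber entityfeatureroom cluster_42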
Example #6
    def parse_slot(self, intent, slot, msg):
        if not self.is_ready:
            return []

        # Here I still use my own method to parse slots because it gives better
        # results in my benchmarks.
        #
        # However, we should keep an eye on https://github.com/snipsco/snips-nlu/pull/724
        # for when it becomes relevant. For now, get_slots returns fewer
        # results than this homemade method below.

        entity_label = self._slot_mappings.get(intent, {}).get(slot)

        # No label, just return the given value
        if not entity_label:
            return [SlotValue(msg)]

        result = []

        # If it's a builtin entity, try to parse it
        if is_builtin_entity(entity_label):
            parsed = self._engine.builtin_entity_parser.parse(
                msg, [entity_label])

            for slot_data in parsed:
                # Here we move some keys to keep the returned meta consistent
                # with the parse above. We check whether `rawValue` is already
                # present because snips-nlu seems to keep a cache, so this
                # guard avoids mutating the same dict twice.

                if RES_RAW_VALUE not in slot_data:
                    slot_data[RES_RAW_VALUE] = slot_data[RES_VALUE]
                    slot_data[RES_VALUE] = slot_data[RESOLVED_VALUE]
                    slot_data[ENTITY] = slot_data[ENTITY_KIND]

                result.append(
                    SlotValue(get_entity_value(slot_data[RES_VALUE]),
                              **slot_data))
        else:
            parsed = self._engine.custom_entity_parser.parse(
                msg, [entity_label])

            # The custom parser did not find a match and the entity is
            # extensible? Just return the given value
            if not parsed and self._entities.get(
                    entity_label, {}).get(AUTOMATICALLY_EXTENSIBLE):
                return [SlotValue(msg)]

            for slot_data in parsed:
                if RES_RAW_VALUE not in slot_data:
                    slot_data[RES_RAW_VALUE] = slot_data[RES_VALUE]
                    slot_data[RES_VALUE] = {
                        'kind': 'Custom',
                        RES_VALUE: slot_data[RESOLVED_VALUE],
                    }
                    slot_data[ENTITY] = slot_data[ENTITY_KIND]

                result.append(
                    SlotValue(get_entity_value(slot_data[RES_VALUE]),
                              **slot_data))

        return result
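
Assuming an agent object exposing this method with its engine and slot mappings loaded (the agent, intent name and .value attribute below are all assumptions), usage might look like:

# Hypothetical call: resolve the "date" slot of a "getWeather" intent.
values = agent.parse_slot("getWeather", "date", "tomorrow at 9pm")
for slot_value in values:
    print(slot_value.value)  # resolved value, or the raw text as fallback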
Example #7
    def fit(self, x, dataset):
        """Fits the idf of the vectorizer on the given utterances after
        enriching them with builtin entities matches, custom entities matches
        and the potential word clusters matches

        Args:
            x (list of dict): list of utterances
            dataset (dict): dataset from which x was extracted (needed to
                extract the language and the builtin entity scope)

        Returns:
            :class:`.TfidfVectorizer`: The fitted vectorizer
        """
        self.load_resources_if_needed(dataset[LANGUAGE])
        self.fit_builtin_entity_parser_if_needed(dataset)
        self.fit_custom_entity_parser_if_needed(dataset)

        self._language = dataset[LANGUAGE]
        self._init_vectorizer(self._language)
        self.builtin_entity_scope = set(e for e in dataset[ENTITIES]
                                        if is_builtin_entity(e))
        preprocessed_data = self._preprocess(x, training=True)
        utterances = [
            self._enrich_utterance(u, builtin_ents, custom_ents, w_clusters)
            for u, builtin_ents, custom_ents, w_clusters in zip(
                *preprocessed_data)
        ]
        return self._tfidf_vectorizer.fit(utterances)
Example #8
    def fit_transform(self, x, dataset):
        """Fits the idf of the vectorizer on the given utterances after
        enriching them with builtin entities matches, custom entities matches
        and the potential word clusters matches.
        Returns the featurized utterances.

        Args:
            x (list of dict): list of utterances
            dataset (dict): dataset from which x was extracted (needed to
                extract the language and the builtin entity scope)

        Returns:
            :class:`.scipy.sparse.csr_matrix`: A sparse matrix X of shape
            (len(x), len(self.vocabulary)) where X[i, j] contains the tf-idf
            of the ngram of index j of the vocabulary in utterance i
        """
        self.load_resources_if_needed(dataset[LANGUAGE])
        self.fit_builtin_entity_parser_if_needed(dataset)
        self.fit_custom_entity_parser_if_needed(dataset)

        self._language = dataset[LANGUAGE]
        self._init_vectorizer(self._language)
        self.builtin_entity_scope = set(e for e in dataset[ENTITIES]
                                        if is_builtin_entity(e))
        preprocessed_data = self._preprocess(x, training=True)
        utterances = [
            self._enrich_utterance(u, builtin_ents, custom_ents, w_clusters)
            for u, builtin_ents, custom_ents, w_clusters in zip(
                *preprocessed_data)
        ]
        return self._tfidf_vectorizer.fit_transform(utterances)
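
fit and fit_transform share the same preprocessing path, so training code only needs one of them. A hedged usage sketch, assuming vectorizer, utterances and dataset already exist in the snips-nlu dict formats:

# Sketch: featurize the training utterances in a single pass.
X = vectorizer.fit_transform(utterances, dataset)
print(X.shape)  # (len(utterances), vocabulary size)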
Example #9
    def fit(self, x, dataset):
        """Fits the CooccurrenceVectorizer

        Given a list of utterances the CooccurrenceVectorizer will extract word
        pairs appearing in the same utterance. The order in which the words
        appear is kept. Additionally, if self.config.window_size is not None
        then the vectorizer will only look in a context window of
        self.config.window_size after each word.

        Args:
            x (iterable): list of utterances
            dataset (dict): dataset from which x was extracted (needed to
                extract the language and the builtin entity scope)

        Returns:
            :class:`.CooccurrenceVectorizer`: The fitted vectorizer
        """
        self.load_resources_if_needed(dataset[LANGUAGE])
        self.fit_builtin_entity_parser_if_needed(dataset)
        self.fit_custom_entity_parser_if_needed(dataset)

        self._language = dataset[LANGUAGE]
        self.builtin_entity_scope = set(
            e for e in dataset[ENTITIES] if is_builtin_entity(e))

        preprocessed = self._preprocess(list(x))
        utterances = [
            self._enrich_utterance(utterance, builtin_ents, custom_ent)
            for utterance, builtin_ents, custom_ent in zip(*preprocessed)]
        word_pairs = set(
            p for u in utterances for p in self._extract_word_pairs(u))
        self._word_pairs = {
            pair: i for i, pair in enumerate(sorted(word_pairs))
        }
        return self
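
The pair extraction described in the docstring can be sketched independently: with a window of size w, each token is paired, in order, with the tokens in the w positions that follow it. A toy stand-in for the private _extract_word_pairs:

# Toy word-pair extraction with an optional context window.
def extract_word_pairs(tokens, window_size=None):
    pairs = set()
    for i, w1 in enumerate(tokens):
        end = len(tokens) if window_size is None else i + 1 + window_size
        for w2 in tokens[i + 1:end]:
            pairs.add((w1, w2))
    return pairs

print(sorted(extract_word_pairs(["turn", "the", "light", "on"], 2)))
# [('light', 'on'), ('the', 'light'), ('the', 'on'),
#  ('turn', 'light'), ('turn', 'the')]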
Example #10
    def build(cls, dataset, parser_usage):
        from snips_nlu.dataset import validate_and_format_dataset

        dataset = validate_and_format_dataset(dataset)
        language = dataset[LANGUAGE]
        custom_entities = {
            entity_name: deepcopy(entity)
            for entity_name, entity in iteritems(dataset[ENTITIES])
            if not is_builtin_entity(entity_name)
        }
        if parser_usage == CustomEntityParserUsage.WITH_AND_WITHOUT_STEMS:
            for ent in viewvalues(custom_entities):
                stemmed_utterances = _stem_entity_utterances(
                    ent[UTTERANCES], language)
                ent[UTTERANCES] = _merge_entity_utterances(
                    ent[UTTERANCES], stemmed_utterances)
        elif parser_usage == CustomEntityParserUsage.WITH_STEMS:
            for ent in viewvalues(custom_entities):
                ent[UTTERANCES] = _stem_entity_utterances(
                    ent[UTTERANCES], language)
        elif parser_usage is None:
            raise ValueError("A parser usage must be defined in order to fit "
                             "a CustomEntityParser")
        configuration = _create_custom_entity_parser_configuration(
            custom_entities)
        parser = GazetteerEntityParser.build(configuration)
        return cls(parser, language, parser_usage)
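
A hedged usage sketch; the import path is an assumption based on the snips_nlu package layout, and dataset is a snips-nlu dataset dict assumed to exist:

# Hypothetical usage: build a parser matching both raw and stemmed values.
from snips_nlu.entity_parser import (CustomEntityParser,
                                     CustomEntityParserUsage)

parser = CustomEntityParser.build(
    dataset, CustomEntityParserUsage.WITH_AND_WITHOUT_STEMS)
matches = parser.parse("turn on the lights in the living room")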
Example #11
def add_unknown_word_to_utterances(augmented_utterances, replacement_string,
                                   unknown_word_prob, random_state):
    for u in augmented_utterances:
        for chunk in u[DATA]:
            if ENTITY in chunk and not is_builtin_entity(chunk[ENTITY]) \
                    and random_state.rand() < unknown_word_prob:
                chunk[TEXT] = WORD_REGEX.sub(replacement_string, chunk[TEXT])
    return augmented_utterances
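
random_state is expected to be a numpy RandomState, whose rand() draws a float in [0, 1), so a call could look like this (the "data"/"text"/"entity" keys mirror the DATA/TEXT/ENTITY constants):

import numpy as np

# Sketch: with probability 0.3, rewrite the words of custom-entity chunks.
utterances = [{"data": [{"text": "kitchen", "entity": "room"}]}]
rng = np.random.RandomState(42)
augmented = add_unknown_word_to_utterances(
    utterances, replacement_string="unknownword",
    unknown_word_prob=0.3, random_state=rng)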
Example #12
def remove_builtin_slots(dataset):
    filtered_dataset = deepcopy(dataset)
    for intent_data in itervalues(filtered_dataset[INTENTS]):
        for utterance in intent_data[UTTERANCES]:
            utterance[DATA] = [
                chunk for chunk in utterance[DATA]
                if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])]
    return filtered_dataset
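
Its effect on a single utterance, with plain strings standing in for the constants and the "snips/" prefix approximating is_builtin_entity:

# Builtin chunks are dropped entirely from the utterance data.
utterance = {"data": [
    {"text": "remind me "},
    {"text": "tomorrow", "entity": "snips/datetime", "slot_name": "when"},
]}
utterance["data"] = [
    chunk for chunk in utterance["data"]
    if "entity" not in chunk or not chunk["entity"].startswith("snips/")
]
print(utterance)  # {'data': [{'text': 'remind me '}]}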
Example #13
def create_entity(entity_name,
                  utterances=None,
                  automatically_extensible=True,
                  use_synonyms=True):
    if is_builtin_entity(entity_name):
        return BuiltinEntity(entity_name)
    else:
        if utterances is None:
            utterances = []
        return CustomEntity(entity_name, utterances, automatically_extensible,
                            use_synonyms)
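
Usage then reads naturally; the entity names below are made up:

# Hypothetical usage: the builtin branch ignores the remaining arguments.
date_entity = create_entity("snips/datetime")  # -> BuiltinEntity
room_entity = create_entity(
    "room", utterances=["kitchen", "bedroom"], use_synonyms=False)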
Example #14
def validate_and_format_dataset(dataset):
    """Checks that the dataset is valid and format it

    Raise:
        DatasetFormatError: When the dataset format is wrong
    """
    from snips_nlu_parsers import get_all_languages

    if isinstance(dataset, Dataset):
        dataset = dataset.json

    # Make this function idempotent
    if dataset.get(VALIDATED, False):
        return dataset
    dataset = deepcopy(dataset)
    dataset = json.loads(json.dumps(dataset))
    validate_type(dataset, dict, object_label="dataset")
    mandatory_keys = [INTENTS, ENTITIES, LANGUAGE]
    for key in mandatory_keys:
        validate_key(dataset, key, object_label="dataset")
    validate_type(dataset[ENTITIES], dict, object_label="entities")
    validate_type(dataset[INTENTS], dict, object_label="intents")
    language = dataset[LANGUAGE]
    validate_type(language, str, object_label="language")
    if language not in get_all_languages():
        raise DatasetFormatError("Unknown language: '%s'" % language)

    dataset[INTENTS] = {
        intent_name: intent_data
        for intent_name, intent_data in sorted(iteritems(dataset[INTENTS]))}
    for intent in itervalues(dataset[INTENTS]):
        _validate_and_format_intent(intent, dataset[ENTITIES])

    utterance_entities_values = extract_utterance_entities(dataset)
    builtin_entity_parser = BuiltinEntityParser.build(dataset=dataset)

    dataset[ENTITIES] = {
        entity_name: entity_data
        for entity_name, entity_data in sorted(iteritems(dataset[ENTITIES]))}

    for entity_name, entity in iteritems(dataset[ENTITIES]):
        utterance_entities = utterance_entities_values[entity_name]
        if is_builtin_entity(entity_name):
            dataset[ENTITIES][entity_name] = \
                _validate_and_format_builtin_entity(entity, utterance_entities)
        else:
            dataset[ENTITIES][entity_name] = \
                _validate_and_format_custom_entity(
                    entity, utterance_entities, language,
                    builtin_entity_parser)
    dataset[VALIDATED] = True
    return dataset
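
Two details here do real work: the VALIDATED flag makes the function idempotent, and the json.loads(json.dumps(...)) round-trip coerces the payload to plain JSON types before validation. The round-trip in isolation:

import json

# Tuples become lists, and non-serializable values fail fast.
data = {"language": "en", "extra": (1, 2)}
print(json.loads(json.dumps(data))["extra"])  # [1, 2]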
Example #15
def _get_entity_scopes(dataset):
    intent_entities = extract_intent_entities(dataset)
    intent_groups = []
    entity_scopes = []
    for intent, entities in sorted(iteritems(intent_entities)):
        scope = {
            "builtin": list(
                {ent for ent in entities if is_builtin_entity(ent)}),
            "custom": list(
                {ent for ent in entities if not is_builtin_entity(ent)})
        }
        if scope in entity_scopes:
            group_idx = entity_scopes.index(scope)
            intent_groups[group_idx].append(intent)
        else:
            entity_scopes.append(scope)
            intent_groups.append([intent])
    return [
        {
            "intent_group": intent_group,
            "entity_scope": entity_scope
        } for intent_group, entity_scope in zip(intent_groups, entity_scopes)
    ]
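
Intents sharing exactly the same scope end up in the same group, so the result for two alarm intents plus one light intent might look like this (hypothetical intent and entity names):

# Hypothetical output shape: the alarm intents share a scope.
entity_scopes = [
    {"intent_group": ["getAlarm", "setAlarm"],
     "entity_scope": {"builtin": ["snips/datetime"], "custom": []}},
    {"intent_group": ["setLightColor"],
     "entity_scope": {"builtin": [], "custom": ["color", "room"]}},
]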
Example #16
def get_dataset_specific_noise(dataset, language):
    """Return a noise list that excludes the dataset entity values"""
    entities_values = set()
    for ent_name, ent in iteritems(dataset[ENTITIES]):
        if is_builtin_entity(ent_name):
            continue
        for k, v in iteritems(ent[UTTERANCES]):
            entities_values.add(k)
            entities_values.add(v)
    original_noise = get_noise(language)
    specific_noise = [n for n in original_noise if n not in entities_values]
    if not specific_noise:  # Avoid returning an empty noise
        return original_noise
    return specific_noise
Example #17
def get_entities_iterators(intent_entities, language,
                           add_builtin_entities_examples, random_state):
    entities_its = dict()
    for entity_name, entity in iteritems(intent_entities):
        utterance_values = random_state.permutation(list(entity[UTTERANCES]))
        if add_builtin_entities_examples and is_builtin_entity(entity_name):
            entity_examples = get_builtin_entity_examples(
                entity_name, language)
            # Builtin entity examples must be kept first in the iterator to
            # ensure that they are used when augmenting data
            iterator_values = entity_examples + list(utterance_values)
        else:
            iterator_values = utterance_values
        entities_its[entity_name] = cycle(iterator_values)
    return entities_its
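
Since each iterator is an itertools.cycle, drawing values never exhausts it, and the builtin examples placed at the front are consumed first:

from itertools import cycle

# Values repeat forever; the first items come out first.
it = cycle(["at noon", "tomorrow", "kitchen"])
print([next(it) for _ in range(5)])
# ['at noon', 'tomorrow', 'kitchen', 'at noon', 'tomorrow']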
Example #18
def _get_dataset_metadata(dataset):
    entities = dict()
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        if is_builtin_entity(entity_name):
            continue
        entities[entity_name] = {
            AUTOMATICALLY_EXTENSIBLE: entity[AUTOMATICALLY_EXTENSIBLE]
        }
    slot_name_mappings = get_slot_name_mappings(dataset)
    return {
        "language_code": dataset[LANGUAGE],
        "entities": entities,
        "slot_name_mappings": slot_name_mappings
    }
Example #19
def capitalize_utterances(utterances, entities, language, ratio, random_state):
    capitalized_utterances = []
    for utterance in utterances:
        capitalized_utterance = deepcopy(utterance)
        for i, chunk in enumerate(capitalized_utterance[DATA]):
            capitalized_utterance[DATA][i][TEXT] = chunk[TEXT].lower()
            if ENTITY not in chunk:
                continue
            entity_label = chunk[ENTITY]
            if is_builtin_entity(entity_label):
                continue
            if not entities[entity_label][CAPITALIZE]:
                continue
            if random_state.rand() > ratio:
                continue
            capitalized_utterance[DATA][i][TEXT] = capitalize(
                chunk[TEXT], language)
        capitalized_utterances.append(capitalized_utterance)
    return capitalized_utterances
Example #20
def _preprocess_utterance(utterance, language, builtin_entity_parser,
                          custom_entity_parser, word_clusters_name,
                          use_stemming, unknownword_replacement_string):
    utterance_text = get_text_from_chunks(utterance[DATA])
    utterance_tokens = tokenize_light(utterance_text, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [_normalize_stem(t, language, use_stemming)
                                 for t in utterance_tokens]

    custom_entities = custom_entity_parser.parse(
        " ".join(normalized_stemmed_tokens))
    custom_entities = [e for e in custom_entities
                       if e["value"] != unknownword_replacement_string]
    custom_entities_features = [
        _entity_name_to_feature(e[ENTITY_KIND], language)
        for e in custom_entities]

    builtin_entities = builtin_entity_parser.parse(
        utterance_text, use_cache=True)
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove values of builtin slots from the utterance to avoid learning
    # specific samples such as '42' or 'tomorrow'
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(chunk[TEXT], language, use_stemming)
        for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
    ]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if custom_entities_features:
        features += " " + " ".join(sorted(custom_entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))

    return features
Example #21
def _reconciliate_builtin_slots(text, slots, builtin_entities):
    for slot in slots:
        if not is_builtin_entity(slot[RES_ENTITY]):
            continue
        for be in builtin_entities:
            if be[ENTITY_KIND] != slot[RES_ENTITY]:
                continue
            be_start = be[RES_MATCH_RANGE][START]
            be_end = be[RES_MATCH_RANGE][END]
            be_length = be_end - be_start
            slot_start = slot[RES_MATCH_RANGE][START]
            slot_end = slot[RES_MATCH_RANGE][END]
            slot_length = slot_end - slot_start
            if be_start <= slot_start and be_end >= slot_end \
                    and be_length > slot_length:
                slot[RES_MATCH_RANGE] = {START: be_start, END: be_end}
                slot[RES_VALUE] = text[be_start:be_end]
                break
    return slots
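
In other words, when the builtin parser finds a strictly larger match that fully contains a CRF slot, the slot is widened to that match. With toy offsets, a slot "9" inside a builtin match "tomorrow at 9pm":

# Toy offsets: slot [12, 13) ("9") sits inside builtin match [0, 15).
text = "tomorrow at 9pm"
slot_range, be_range = (12, 13), (0, 15)
if be_range[0] <= slot_range[0] and be_range[1] >= slot_range[1] \
        and (be_range[1] - be_range[0]) > (slot_range[1] - slot_range[0]):
    slot_range = be_range  # widen the slot to the builtin match
print(text[slot_range[0]:slot_range[1]])  # 'tomorrow at 9pm'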
Example #22
def _validate_and_format_intent(intent, entities):
    validate_type(intent, dict, "intent")
    validate_key(intent, UTTERANCES, object_label="intent dict")
    validate_type(intent[UTTERANCES], list, object_label="utterances")
    for utterance in intent[UTTERANCES]:
        validate_type(utterance, dict, object_label="utterance")
        validate_key(utterance, DATA, object_label="utterance")
        validate_type(utterance[DATA], list, object_label="utterance data")
        for chunk in utterance[DATA]:
            validate_type(chunk, dict, object_label="utterance chunk")
            validate_key(chunk, TEXT, object_label="chunk")
            if ENTITY in chunk or SLOT_NAME in chunk:
                mandatory_keys = [ENTITY, SLOT_NAME]
                validate_keys(chunk, mandatory_keys, object_label="chunk")
                if is_builtin_entity(chunk[ENTITY]):
                    continue
                else:
                    validate_key(entities, chunk[ENTITY],
                                 object_label=ENTITIES)
    return intent
Example #23
def _entities_from_utterance(utterance):
    builtin_ents = []
    custom_ents = []
    current_ix = 0
    for chunk in utterance[DATA]:
        text = chunk[TEXT]
        text_length = len(text)
        if ENTITY in chunk:
            ent = {
                ENTITY_KIND: chunk[ENTITY],
                RES_VALUE: text,
                RES_MATCH_RANGE: {
                    START: current_ix,
                    END: current_ix + text_length
                }
            }
            if is_builtin_entity(ent[ENTITY_KIND]):
                builtin_ents.append(ent)
            else:
                custom_ents.append(ent)
        current_ix += text_length
    return builtin_ents, custom_ents
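
Running the same logic over a small utterance shows how the character offset accumulates chunk by chunk (plain string keys stand in for the constants):

# "set the temperature to " is 23 chars, so the entity chunk spans [23, 25).
utterance = {"data": [
    {"text": "set the temperature to "},
    {"text": "21", "entity": "snips/temperature"},
]}
ix, ents = 0, []
for chunk in utterance["data"]:
    if "entity" in chunk:
        ents.append((chunk["entity"], (ix, ix + len(chunk["text"]))))
    ix += len(chunk["text"])
print(ents)  # [('snips/temperature', (23, 25))]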
Example #24
 def fit(self, dataset, intent):
     self.language = dataset[LANGUAGE]
     self.entities = extract_intent_entities(
         dataset, lambda e: not is_builtin_entity(e))[intent]
     self.entities = list(self.entities)
     return self