Example #1
def resolve_slots(input, slots, dataset_entities, language, scope):
    builtin_entities = get_builtin_entities(input, language, scope)
    resolved_slots = []
    for slot in slots:
        entity_name = slot[RES_ENTITY]
        raw_value = slot[RES_VALUE]
        if is_builtin_entity(entity_name):
            found = False
            for ent in builtin_entities:
                if ent[ENTITY_KIND] == entity_name and \
                        ent[RES_MATCH_RANGE] == slot[RES_MATCH_RANGE]:
                    resolved_slot = builtin_slot(slot, ent[ENTITY])
                    resolved_slots.append(resolved_slot)
                    found = True
                    break
            if not found:
                builtin_matches = get_builtin_entities(raw_value, language,
                                                       scope=[entity_name])
                if builtin_matches:
                    resolved_slot = builtin_slot(slot,
                                                 builtin_matches[0][VALUE])
                    resolved_slots.append(resolved_slot)
        else:  # custom slot
            entity = dataset_entities[entity_name]
            if raw_value in entity[UTTERANCES]:
                resolved_value = entity[UTTERANCES][raw_value]
            elif entity[AUTOMATICALLY_EXTENSIBLE]:
                resolved_value = raw_value
            else:
                # entity is skipped
                resolved_value = None

            if resolved_value is not None:
                resolved_slots.append(custom_slot(slot, resolved_value))
    return resolved_slots
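
A minimal sketch of the raw slot dictionaries this function consumes. The literal key names below ("value", "entity", "range") are assumptions about what the RES_VALUE, RES_ENTITY and RES_MATCH_RANGE constants resolve to:

raw_slot = {
    "value": "tomorrow",               # RES_VALUE: text matched in the input
    "entity": "snips/datetime",        # RES_ENTITY: entity label of the slot
    "range": {"start": 9, "end": 17},  # RES_MATCH_RANGE: character range
}
# is_builtin_entity("snips/datetime") is True, so resolve_slots first looks
# for a builtin entity match with the same kind and range, and otherwise
# re-runs the builtin entity parser on the raw value alone.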
Example #2
    def get_slots(self, text):
        """Extracts slots from the provided text

        Returns:
            list of dict: The list of extracted slots

        Raises:
            NotTrained: When the slot filler is not fitted
        """
        if not self.fitted:
            raise NotTrained("CRFSlotFiller must be fitted")
        tokens = tokenize(text, self.language)
        if not tokens:
            return []
        features = self.compute_features(tokens)
        tags = [_decode_tag(tag) for tag in
                self.crf_model.predict_single(features)]
        slots = tags_to_slots(text, tokens, tags, self.config.tagging_scheme,
                              self.slot_name_mapping)

        builtin_slots_names = set(slot_name for (slot_name, entity) in
                                  iteritems(self.slot_name_mapping)
                                  if is_builtin_entity(entity))
        if not builtin_slots_names:
            return slots

        # Replace tags corresponding to builtin entities with outside tags
        tags = _replace_builtin_tags(tags, builtin_slots_names)
        return self._augment_slots(text, tokens, tags, builtin_slots_names)
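
A hedged usage sketch, assuming a CRFSlotFiller that has already been fitted on a dataset (the intent name and utterance are illustrative):

# slot_filler = CRFSlotFiller(config)
# slot_filler.fit(dataset, intent="setTemperature")
# slots = slot_filler.get_slots(u"set the temperature to twenty degrees")
# Each returned slot is a dict carrying the matched value, its entity and
# its character range; calling get_slots before fitting raises NotTrained.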
Example #3
def validate_and_format_dataset(dataset):
    """Checks that the dataset is valid and format it"""
    # Make this function idempotent
    if dataset.get(VALIDATED, False):
        return dataset
    dataset = deepcopy(dataset)
    dataset = json.loads(json.dumps(dataset))
    validate_type(dataset, dict)
    mandatory_keys = [INTENTS, ENTITIES, LANGUAGE]
    for key in mandatory_keys:
        validate_key(dataset, key, object_label="dataset")
    validate_type(dataset[ENTITIES], dict)
    validate_type(dataset[INTENTS], dict)
    language = dataset[LANGUAGE]
    validate_type(language, str)
    if language not in get_all_languages():
        raise ValueError("Unknown language: '%s'" % language)

    for intent in itervalues(dataset[INTENTS]):
        validate_and_format_intent(intent, dataset[ENTITIES])

    queries_entities_values = extract_queries_entities(dataset)

    for entity_name, entity in iteritems(dataset[ENTITIES]):
        queries_entities = queries_entities_values[entity_name]
        if is_builtin_entity(entity_name):
            dataset[ENTITIES][entity_name] = \
                validate_and_format_builtin_entity(entity, queries_entities)
        else:
            dataset[ENTITIES][entity_name] = validate_and_format_custom_entity(
                entity, queries_entities, language)
    dataset[VALIDATED] = True
    return dataset
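
A minimal, hypothetical dataset skeleton of the shape this validator expects; the literal keys ("language", "intents", "entities", ...) mirror what the LANGUAGE, INTENTS and ENTITIES constants presumably resolve to:

dataset = {
    "language": "en",
    "intents": {
        "turnLightOn": {
            "utterances": [
                {"data": [
                    {"text": "turn on the light in the "},
                    {"text": "kitchen", "entity": "room",
                     "slot_name": "room"},
                ]}
            ]
        }
    },
    "entities": {
        "room": {"data": [], "use_synonyms": True,
                 "automatically_extensible": True}
    },
}
# validate_and_format_dataset(dataset) deep-copies the input, checks the
# mandatory keys and types, validates every intent and entity, then sets a
# "validated" flag so that a second call returns immediately.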
Example #4
    def json(self):
        intent_datasets_json = {
            d.intent_name: d.json
            for d in self.intent_datasets
        }
        intents = {
            intent_name: {
                "utterances": dataset_json["utterances"]
            }
            for intent_name, dataset_json in iteritems(intent_datasets_json)
        }
        ents = deepcopy(self.entities)
        ents_values = dict()
        for entity_name, entity in iteritems(self.entities):
            ents_values[entity_name] = set(a.value for a in entity.utterances)
            if entity.use_synonyms:
                ents_values[entity_name].update(
                    set(t for s in entity.utterances for t in s.synonyms))

        for dataset in self.intent_datasets:
            for ent_name, ent in iteritems(dataset.entities):
                if ent_name not in ents:
                    ents[ent_name] = ent
                elif not is_builtin_entity(ent_name):
                    for u in ent.utterances:
                        if u.value not in ents_values[ent_name]:
                            ents[ent_name].utterances.append(u)
        ents = {
            entity_name: entity.json
            for entity_name, entity in iteritems(ents)
        }
        return dict(language=self.language, intents=intents, entities=ents)
Example #5
    def json(self):
        intent_datasets_json = {d.intent_name: d.json
                                for d in self.intent_datasets}
        intents = {
            intent_name: {
                "utterances": dataset_json["utterances"]
            }
            for intent_name, dataset_json in iteritems(intent_datasets_json)
        }
        ents = deepcopy(self.entities)
        ents_values = dict()
        for entity_name, entity in iteritems(self.entities):
            ents_values[entity_name] = set(a.value for a in entity.utterances)
            if entity.use_synonyms:
                ents_values[entity_name].update(
                    set(t for s in entity.utterances for t in s.synonyms))

        for dataset in self.intent_datasets:
            for ent_name, ent in iteritems(dataset.entities):
                if ent_name not in ents:
                    ents[ent_name] = ent
                elif not is_builtin_entity(ent_name):
                    for u in ent.utterances:
                        if u.value not in ents_values[ent_name]:
                            ents[ent_name].utterances.append(u)
        ents = {
            entity_name: entity.json
            for entity_name, entity in iteritems(ents)
        }
        return dict(language=self.language, intents=intents, entities=ents)
Example #6
    def get_slots(self, text):
        """Extracts slots from the provided text

        Returns:
            list of dict: The list of extracted slots

        Raises:
            NotTrained: When the slot filler is not fitted
        """
        if not self.fitted:
            raise NotTrained("CRFSlotFiller must be fitted")
        tokens = tokenize(text, self.language)
        if not tokens:
            return []
        features = self.compute_features(tokens)
        tags = [_decode_tag(tag) for tag in
                self.crf_model.predict_single(features)]
        slots = tags_to_slots(text, tokens, tags, self.config.tagging_scheme,
                              self.slot_name_mapping)

        builtin_slots_names = set(slot_name for (slot_name, entity) in
                                  iteritems(self.slot_name_mapping)
                                  if is_builtin_entity(entity))
        if not builtin_slots_names:
            return slots

        # Replace tags corresponding to builtin entities with outside tags
        tags = _replace_builtin_tags(tags, builtin_slots_names)
        return self._augment_slots(text, tokens, tags, builtin_slots_names)
Example #7
def validate_and_format_dataset(dataset):
    """Checks that the dataset is valid and format it"""
    # Make this function idempotent
    if dataset.get(VALIDATED, False):
        return dataset
    dataset = deepcopy(dataset)
    dataset = json.loads(json.dumps(dataset))
    validate_type(dataset, dict)
    mandatory_keys = [INTENTS, ENTITIES, LANGUAGE]
    for key in mandatory_keys:
        validate_key(dataset, key, object_label="dataset")
    validate_type(dataset[ENTITIES], dict)
    validate_type(dataset[INTENTS], dict)
    language = dataset[LANGUAGE]
    validate_type(language, str)
    if language not in get_all_languages():
        raise ValueError("Unknown language: '%s'" % language)

    for intent in itervalues(dataset[INTENTS]):
        validate_and_format_intent(intent, dataset[ENTITIES])

    queries_entities_values = extract_queries_entities(dataset)

    for entity_name, entity in iteritems(dataset[ENTITIES]):
        queries_entities = queries_entities_values[entity_name]
        if is_builtin_entity(entity_name):
            dataset[ENTITIES][entity_name] = \
                validate_and_format_builtin_entity(entity, queries_entities)
        else:
            dataset[ENTITIES][entity_name] = validate_and_format_custom_entity(
                entity, queries_entities, language)
    dataset[VALIDATED] = True
    return dataset
Example #8
def add_unknown_word_to_utterances(augmented_utterances, replacement_string,
                                   unknown_word_prob, random_state):
    for u in augmented_utterances:
        for chunk in u[DATA]:
            if ENTITY in chunk and not is_builtin_entity(chunk[ENTITY]) \
                    and random_state.rand() < unknown_word_prob:
                chunk[TEXT] = WORD_REGEX.sub(replacement_string, chunk[TEXT])
    return augmented_utterances
Example #9
def remove_builtin_slots(dataset):
    filtered_dataset = deepcopy(dataset)
    for intent_data in itervalues(filtered_dataset[INTENTS]):
        for utterance in intent_data[UTTERANCES]:
            utterance[DATA] = [
                chunk for chunk in utterance[DATA]
                if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])]
    return filtered_dataset
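
A standalone illustration of the filtering step, with the DATA and ENTITY constants replaced by their presumed literal values and a toy stand-in for is_builtin_entity that treats the "snips/" prefix as builtin:

def is_builtin(name):  # toy stand-in for is_builtin_entity
    return name.startswith("snips/")

utterance = {"data": [
    {"text": "remind me "},
    {"text": "tomorrow", "entity": "snips/datetime", "slot_name": "date"},
    {"text": " to call "},
    {"text": "John", "entity": "contact", "slot_name": "callee"},
]}
utterance["data"] = [chunk for chunk in utterance["data"]
                     if "entity" not in chunk
                     or not is_builtin(chunk["entity"])]
# The "snips/datetime" chunk is dropped; the custom "contact" chunk stays.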
Example #10
def add_unknown_word_to_utterances(augmented_utterances, replacement_string,
                                   unknown_word_prob, random_state):
    for u in augmented_utterances:
        for chunk in u[DATA]:
            if ENTITY in chunk and not is_builtin_entity(chunk[ENTITY]) \
                    and random_state.rand() < unknown_word_prob:
                chunk[TEXT] = WORD_REGEX.sub(replacement_string, chunk[TEXT])
    return augmented_utterances
Example #11
    def entities(self):
        """Returns all entities in JSON format for datasets"""
        ents = dict()
        for s in self.slots:
            if s.entity not in ents:
                ents[s.entity] = self.mk_entity(s)
            elif not is_builtin_entity(s.entity):
                ents[s.entity].utterances.append(EntityUtterance(s.text))
        return ents
Example #12
    def entities(self):
        """Returns all entities in JSON format for datasets"""
        ents = dict()
        for s in self.slots:
            if s.entity not in ents:
                ents[s.entity] = self.mk_entity(s)
            elif not is_builtin_entity(s.entity):
                ents[s.entity].utterances.append(EntityUtterance(s.text))
        return ents
Example #13
def _get_utterances_to_features_names(dataset, language):
    utterances_to_features = defaultdict(set)
    for entity_name, entity_data in iteritems(dataset[ENTITIES]):
        if is_builtin_entity(entity_name):
            continue
        for u in entity_data[UTTERANCES]:
            utterances_to_features[u].add(
                _entity_name_to_feature(entity_name, language))
    return dict(utterances_to_features)
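
The same aggregation re-run standalone with literal keys; the feature-name strings are illustrative stand-ins for whatever _entity_name_to_feature actually returns:

from collections import defaultdict

dataset = {"entities": {
    "room": {"utterances": {"kitchen": "kitchen",
                            "living room": "living room"}},
    "snips/datetime": {"utterances": {}},
}}
utterances_to_features = defaultdict(set)
for name, data in dataset["entities"].items():
    if name.startswith("snips/"):  # toy stand-in for is_builtin_entity
        continue
    for u in data["utterances"]:
        utterances_to_features[u].add("entityfeature_%s" % name)
# dict(utterances_to_features) ==
# {'kitchen': {'entityfeature_room'}, 'living room': {'entityfeature_room'}}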
Example #14
def extract_queries_entities(dataset):
    entities_values = {ent_name: [] for ent_name in dataset[ENTITIES]}

    for intent in itervalues(dataset[INTENTS]):
        for query in intent[UTTERANCES]:
            for chunk in query[DATA]:
                if ENTITY in chunk and not is_builtin_entity(chunk[ENTITY]):
                    entities_values[chunk[ENTITY]].append(chunk[TEXT])
    return {k: list(v) for k, v in iteritems(entities_values)}
Example #15
def create_entity(entity_name, utterances=None, automatically_extensible=True,
                  use_synonyms=True):
    if is_builtin_entity(entity_name):
        return BuiltinEntity(entity_name)
    else:
        if utterances is None:
            utterances = []
        return CustomEntity(entity_name, utterances, automatically_extensible,
                            use_synonyms)
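
A hedged usage sketch, assuming the BuiltinEntity and CustomEntity classes shown in the surrounding examples:

# create_entity("snips/number")
# -> BuiltinEntity("snips/number")
# create_entity("room", utterances=["kitchen", "bedroom"])
# -> CustomEntity("room", ["kitchen", "bedroom"],
#                 automatically_extensible=True, use_synonyms=True)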
Example #16
def _get_utterances_to_features_names(dataset, language):
    utterances_to_features = defaultdict(set)
    for entity_name, entity_data in iteritems(dataset[ENTITIES]):
        if is_builtin_entity(entity_name):
            continue
        for u in entity_data[UTTERANCES]:
            utterances_to_features[u].add(_entity_name_to_feature(
                entity_name, language))
    return dict(utterances_to_features)
Example #17
def get_intent_custom_entities(dataset, intent):
    intent_entities = set()
    for utterance in dataset[INTENTS][intent][UTTERANCES]:
        for c in utterance[DATA]:
            if ENTITY in c:
                intent_entities.add(c[ENTITY])
    custom_entities = dict()
    for ent in intent_entities:
        if not is_builtin_entity(ent):
            custom_entities[ent] = dataset[ENTITIES][ent]
    return custom_entities
Example #18
def get_intent_custom_entities(dataset, intent):
    intent_entities = set()
    for utterance in dataset[INTENTS][intent][UTTERANCES]:
        for c in utterance[DATA]:
            if ENTITY in c:
                intent_entities.add(c[ENTITY])
    custom_entities = dict()
    for ent in intent_entities:
        if not is_builtin_entity(ent):
            custom_entities[ent] = dataset[ENTITIES][ent]
    return custom_entities
Example #19
def _get_joined_entity_utterances(dataset, language):
    joined_entity_utterances = dict()
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        if is_builtin_entity(entity_name):
            utterances = [_get_builtin_entity_name(entity_name, language)]
        else:
            utterances = list(entity[UTTERANCES])
        utterances_patterns = map(regex_escape, utterances)
        utterances_patterns = (p for p in utterances_patterns if p)
        joined_entity_utterances[entity_name] = r"|".join(
            sorted(utterances_patterns, key=len, reverse=True))
    return joined_entity_utterances
Example #20
    def _is_trainable(self, intent, dataset):
        if len(intent[UTTERANCES]) >= self.config.max_queries:
            return False

        intent_entities = set(chunk[ENTITY] for query in intent[UTTERANCES]
                              for chunk in query[DATA] if ENTITY in chunk)
        total_entities = sum(len(dataset[ENTITIES][ent][UTTERANCES])
                             for ent in intent_entities
                             if not is_builtin_entity(ent))
        if total_entities > self.config.max_entities:
            return False
        return True
Example #21
    def _is_trainable(self, intent, dataset):
        if len(intent[UTTERANCES]) >= self.config.max_queries:
            return False

        intent_entities = set(chunk[ENTITY] for query in intent[UTTERANCES]
                              for chunk in query[DATA] if ENTITY in chunk)
        total_entities = sum(len(dataset[ENTITIES][ent][UTTERANCES])
                             for ent in intent_entities
                             if not is_builtin_entity(ent))
        if total_entities > self.config.max_entities:
            return False
        return True
Example #22
def _get_joined_entity_utterances(dataset, language):
    joined_entity_utterances = dict()
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        if is_builtin_entity(entity_name):
            utterances = [_get_builtin_entity_name(entity_name, language)]
        else:
            utterances = list(entity[UTTERANCES])
        utterances_patterns = map(regex_escape, utterances)
        utterances_patterns = (p for p in utterances_patterns if p)
        joined_entity_utterances[entity_name] = r"|".join(
            sorted(utterances_patterns, key=len, reverse=True))
    return joined_entity_utterances
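
Why the alternation is sorted by length, longest first: regex alternation tries branches left to right, so a shorter value listed first would shadow a longer one. A self-contained demonstration:

import re

values = ["new york", "new york city"]
unordered = "|".join(map(re.escape, values))
ordered = "|".join(sorted(map(re.escape, values), key=len, reverse=True))
text = "i live in new york city"
print(re.search(unordered, text).group())  # 'new york' (truncated match)
print(re.search(ordered, text).group())    # 'new york city'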
Example #23
    def parse_entity(self, msg, intent, slot):
        entity_label = self._engine._dataset_metadata[
            'slot_name_mappings'].get(intent, {}).get(slot)

        # TODO try to find a way to retrieve multiple slot values, that's a hard one
        # Maybe we can try matching on _dataset_metadata['entities']

        if is_builtin_entity(entity_label):
            parsed = self._entity_parser.parse(msg)

            if parsed:
                return get_entity_value(parsed[0]['entity'], msg)

        return msg
Example #24
def get_entities_iterators(intent_entities, language, random_state):
    entities_its = dict()
    for entity_name, entity in iteritems(intent_entities):
        utterance_values = random_state.permutation(list(entity[UTTERANCES]))
        if is_builtin_entity(entity_name):
            entity_examples = get_builtin_entity_examples(entity_name,
                                                          language)
            # Builtin entity examples must be kept first in the iterator to
            # ensure that they are used when augmenting data
            iterator_values = entity_examples + list(utterance_values)
        else:
            iterator_values = utterance_values
        entities_its[entity_name] = cycle(iterator_values)
    return entities_its
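
The iterator mechanics in isolation: itertools.cycle yields its values round-robin forever, so entity values can be drawn indefinitely while augmenting utterances:

from itertools import cycle

it = cycle(["kitchen", "bedroom", "garage"])
print([next(it) for _ in range(5)])
# ['kitchen', 'bedroom', 'garage', 'kitchen', 'bedroom']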
Example #25
def _get_dataset_metadata(dataset):
    entities = dict()
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        if is_builtin_entity(entity_name):
            continue
        ent = deepcopy(entity)
        ent.pop(CAPITALIZE)
        entities[entity_name] = ent
    slot_name_mappings = get_slot_name_mappings(dataset)
    return {
        "language_code": dataset[LANGUAGE],
        "entities": entities,
        "slot_name_mappings": slot_name_mappings
    }
Example #26
def _get_dataset_metadata(dataset):
    entities = dict()
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        if is_builtin_entity(entity_name):
            continue
        ent = deepcopy(entity)
        ent.pop(CAPITALIZE)
        entities[entity_name] = ent
    slot_name_mappings = get_slot_name_mappings(dataset)
    return {
        "language_code": dataset[LANGUAGE],
        "entities": entities,
        "slot_name_mappings": slot_name_mappings
    }
Example #27
def get_entities_iterators(intent_entities, language,
                           add_builtin_entities_examples, random_state):
    entities_its = dict()
    for entity_name, entity in iteritems(intent_entities):
        utterance_values = random_state.permutation(list(entity[UTTERANCES]))
        if add_builtin_entities_examples and is_builtin_entity(entity_name):
            entity_examples = get_builtin_entity_examples(entity_name,
                                                          language)
            # Builtin entity examples must be kept first in the iterator to
            # ensure that they are used when augmenting data
            iterator_values = entity_examples + list(utterance_values)
        else:
            iterator_values = utterance_values
        entities_its[entity_name] = cycle(iterator_values)
    return entities_its
Example #28
def generate_utterance(contexts_iterator, entities_iterators):
    context = deepcopy(next(contexts_iterator))
    context_data = []
    for chunk in context[DATA]:
        if ENTITY in chunk:
            if not is_builtin_entity(chunk[ENTITY]):
                new_chunk = dict(chunk)
                new_chunk[TEXT] = deepcopy(
                    next(entities_iterators[new_chunk[ENTITY]]))
                context_data.append(new_chunk)
            else:
                context_data.append(chunk)
        else:
            context_data.append(chunk)
    context[DATA] = context_data
    return context
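
A standalone sketch of the substitution step with literal keys: custom-entity chunks receive fresh text drawn from their iterator, while builtin-entity and plain-text chunks pass through untouched:

from itertools import cycle

context = {"data": [{"text": "turn on the "},
                    {"text": "kitchen", "entity": "room"}]}
entity_its = {"room": cycle(["bedroom", "garage"])}
new_data = []
for chunk in context["data"]:
    if "entity" in chunk and not chunk["entity"].startswith("snips/"):
        chunk = dict(chunk)  # copy so the original template is untouched
        chunk["text"] = next(entity_its[chunk["entity"]])
    new_data.append(chunk)
# new_data[1]["text"] == "bedroom" on the first draw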
Example #29
def generate_utterance(contexts_iterator, entities_iterators):
    context = deepcopy(next(contexts_iterator))
    context_data = []
    for chunk in context[DATA]:
        if ENTITY in chunk:
            if not is_builtin_entity(chunk[ENTITY]):
                new_chunk = dict(chunk)
                new_chunk[TEXT] = deepcopy(
                    next(entities_iterators[new_chunk[ENTITY]]))
                context_data.append(new_chunk)
            else:
                context_data.append(chunk)
        else:
            context_data.append(chunk)
    context[DATA] = context_data
    return context
Example #30
    def parse_entity(self, msg, intent, slot):
        entity_label = self._engine._dataset_metadata[
            'slot_name_mappings'].get(intent, {}).get(slot)

        # TODO try to find a way to retrieve multiple slot values, that's a hard one
        # Maybe we can try matching on _dataset_metadata['entities']

        if entity_label:
            if is_builtin_entity(entity_label):
                parsed = self._entity_parser.parse(msg)

                if parsed:
                    return [parsed[0]['entity']]

        # TODO if the slot is not auto-extensible, use fuzzy matching against the restricted values

        return super(SnipsInterpreter, self).parse_entity(msg, intent, slot)
Example #31
def resolve_slots(input, slots, dataset_entities, language, scope):
    # Do not use cached entities here as datetimes must be computed using
    # current context
    builtin_entities = get_builtin_entities(input,
                                            language,
                                            scope,
                                            use_cache=False)
    resolved_slots = []
    for slot in slots:
        entity_name = slot[RES_ENTITY]
        raw_value = slot[RES_VALUE]
        if is_builtin_entity(entity_name):
            found = False
            for ent in builtin_entities:
                if ent[ENTITY_KIND] == entity_name and \
                        ent[RES_MATCH_RANGE] == slot[RES_MATCH_RANGE]:
                    resolved_slot = builtin_slot(slot, ent[ENTITY])
                    resolved_slots.append(resolved_slot)
                    found = True
                    break
            if not found:
                builtin_matches = get_builtin_entities(raw_value,
                                                       language,
                                                       scope=[entity_name],
                                                       use_cache=False)
                if builtin_matches:
                    resolved_slot = builtin_slot(slot,
                                                 builtin_matches[0][VALUE])
                    resolved_slots.append(resolved_slot)
        else:  # custom slot
            entity = dataset_entities[entity_name]
            normalized_raw_value = normalize(raw_value)
            if raw_value in entity[UTTERANCES]:
                resolved_value = entity[UTTERANCES][raw_value]
            elif normalized_raw_value in entity[UTTERANCES]:
                resolved_value = entity[UTTERANCES][normalized_raw_value]
            elif entity[AUTOMATICALLY_EXTENSIBLE]:
                resolved_value = raw_value
            else:
                # entity is skipped
                resolved_value = None

            if resolved_value is not None:
                resolved_slots.append(custom_slot(slot, resolved_value))
    return resolved_slots
Example #32
    def parse(self, text, intents=None):
        """Performs intent parsing on the provided *text* by calling its intent
        parsers successively

        Args:
            text (str): Input text
            intents (str or list of str): If provided, reduces the scope of
                intent parsing to the provided list of intents

        Returns:
            dict: The most likely intent along with the extracted slots. See
            :func:`.parsing_result` for the output format.

        Raises:
            NotTrained: When the nlu engine is not fitted
            TypeError: When input type is not unicode
        """
        logging.info("NLU engine parsing: '%s'...", text)
        if not isinstance(text, str):
            raise TypeError("Expected unicode but received: %s" % type(text))

        if not self.fitted:
            raise NotTrained("SnipsNLUEngine must be fitted")

        if isinstance(intents, str):
            intents = [intents]

        language = self._dataset_metadata["language_code"]
        entities = self._dataset_metadata["entities"]

        for parser in self.intent_parsers:
            res = parser.parse(text, intents)
            if is_empty(res):
                continue
            slots = res[RES_SLOTS]
            scope = [
                s[RES_ENTITY] for s in slots
                if is_builtin_entity(s[RES_ENTITY])
            ]
            resolved_slots = resolve_slots(text, slots, entities, language,
                                           scope)
            return parsing_result(text,
                                  intent=res[RES_INTENT],
                                  slots=resolved_slots)
        return empty_result(text)
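
A hedged usage sketch, assuming a SnipsNLUEngine already fitted on a dataset:

# engine = SnipsNLUEngine()
# engine.fit(dataset)
# result = engine.parse(u"Set the temperature to 21 in the bedroom")
# result holds the input text, the winning intent and the resolved slots;
# empty_result(text) is returned when no parser produces a match.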
Example #33
def capitalize_utterances(utterances, entities, language, ratio, random_state):
    capitalized_utterances = []
    for utterance in utterances:
        capitalized_utterance = deepcopy(utterance)
        for i, chunk in enumerate(capitalized_utterance[DATA]):
            capitalized_utterance[DATA][i][TEXT] = chunk[TEXT].lower()
            if ENTITY not in chunk:
                continue
            entity_label = chunk[ENTITY]
            if is_builtin_entity(entity_label):
                continue
            if not entities[entity_label][CAPITALIZE]:
                continue
            if random_state.rand() > ratio:
                continue
            capitalized_utterance[DATA][i][TEXT] = capitalize(
                chunk[TEXT], language)
        capitalized_utterances.append(capitalized_utterance)
    return capitalized_utterances
Example #34
def capitalize_utterances(utterances, entities, language, ratio, random_state):
    capitalized_utterances = []
    for utterance in utterances:
        capitalized_utterance = deepcopy(utterance)
        for i, chunk in enumerate(capitalized_utterance[DATA]):
            capitalized_utterance[DATA][i][TEXT] = chunk[TEXT].lower()
            if ENTITY not in chunk:
                continue
            entity_label = chunk[ENTITY]
            if is_builtin_entity(entity_label):
                continue
            if not entities[entity_label][CAPITALIZE]:
                continue
            if random_state.rand() > ratio:
                continue
            capitalized_utterance[DATA][i][TEXT] = capitalize(
                chunk[TEXT], language)
        capitalized_utterances.append(capitalized_utterance)
    return capitalized_utterances
Example #35
def _reconciliate_builtin_slots(text, slots, builtin_entities):
    for slot in slots:
        if not is_builtin_entity(slot[RES_ENTITY]):
            continue
        for be in builtin_entities:
            if be[ENTITY_KIND] != slot[RES_ENTITY]:
                continue
            be_start = be[RES_MATCH_RANGE][START]
            be_end = be[RES_MATCH_RANGE][END]
            be_length = be_end - be_start
            slot_start = slot[RES_MATCH_RANGE][START]
            slot_end = slot[RES_MATCH_RANGE][END]
            slot_length = slot_end - slot_start
            if be_start <= slot_start and be_end >= slot_end \
                    and be_length > slot_length:
                slot[RES_MATCH_RANGE] = {START: be_start, END: be_end}
                slot[RES_VALUE] = text[be_start:be_end]
                break
    return slots
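
A numeric illustration of the range test: a builtin match spanning [10, 26) fully covers, and is strictly longer than, a CRF-tagged slot spanning [10, 18), so the slot is widened to the builtin match:

be_start, be_end = 10, 26        # builtin entity match range
slot_start, slot_end = 10, 18    # tagged slot range
widen = (be_start <= slot_start and be_end >= slot_end
         and (be_end - be_start) > (slot_end - slot_start))
print(widen)  # True -> the slot takes the builtin range and value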
Example #36
    def parse(self, text, intents=None):
        """Performs intent parsing on the provided *text* by calling its intent
        parsers successively

        Args:
            text (str): Input text
            intents (str or list of str): If provided, reduces the scope of
                intent parsing to the provided list of intents

        Returns:
            dict: The most likely intent along with the extracted slots. See
            :func:`.parsing_result` for the output format.

        Raises:
            NotTrained: When the nlu engine is not fitted
            TypeError: When input type is not unicode
        """

        if not isinstance(text, str):
            raise TypeError("Expected unicode but received: %s" % type(text))

        if not self.fitted:
            raise NotTrained("SnipsNLUEngine must be fitted")

        if isinstance(intents, str):
            intents = [intents]

        language = self._dataset_metadata["language_code"]
        entities = self._dataset_metadata["entities"]

        for parser in self.intent_parsers:
            res = parser.parse(text, intents)
            if is_empty(res):
                continue
            slots = res[RES_SLOTS]
            scope = [s[RES_ENTITY] for s in slots
                     if is_builtin_entity(s[RES_ENTITY])]
            resolved_slots = resolve_slots(text, slots, entities, language,
                                           scope)
            return parsing_result(text, intent=res[RES_INTENT],
                                  slots=resolved_slots)
        return empty_result(text)
Example #37
def validate_and_format_intent(intent, entities):
    validate_type(intent, dict)
    validate_key(intent, UTTERANCES, object_label="intent dict")
    validate_type(intent[UTTERANCES], list)
    for utterance in intent[UTTERANCES]:
        validate_type(utterance, dict)
        validate_key(utterance, DATA, object_label="utterance")
        validate_type(utterance[DATA], list)
        for chunk in utterance[DATA]:
            validate_type(chunk, dict)
            validate_key(chunk, TEXT, object_label="chunk")
            if ENTITY in chunk or SLOT_NAME in chunk:
                mandatory_keys = [ENTITY, SLOT_NAME]
                validate_keys(chunk, mandatory_keys, object_label="chunk")
                if is_builtin_entity(chunk[ENTITY]):
                    continue
                else:
                    validate_key(entities, chunk[ENTITY],
                                 object_label=ENTITIES)
    return intent
Example #38
def _get_joined_entity_utterances(dataset, language):
    joined_entity_utterances = dict()
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        # matches are performed in a case-insensitive manner
        utterances = set(u.lower() for u in entity[UTTERANCES])
        patterns = []
        if is_builtin_entity(entity_name):
            # We add a placeholder value for builtin entities
            placeholder = _get_entity_name_placeholder(entity_name, language)
            patterns.append(regex_escape(placeholder))
        else:
            for utterance in utterances:
                tokens = tokenize_light(utterance, language)
                pattern = WHITESPACE_PATTERN.join(regex_escape(t)
                                                  for t in tokens)
                patterns.append(pattern)
        patterns = (p for p in patterns if p)
        joined_entity_utterances[entity_name] = r"|".join(
            sorted(patterns, key=len, reverse=True))
    return joined_entity_utterances
Example #39
def validate_and_format_intent(intent, entities):
    validate_type(intent, dict)
    validate_key(intent, UTTERANCES, object_label="intent dict")
    validate_type(intent[UTTERANCES], list)
    for utterance in intent[UTTERANCES]:
        validate_type(utterance, dict)
        validate_key(utterance, DATA, object_label="utterance")
        validate_type(utterance[DATA], list)
        for chunk in utterance[DATA]:
            validate_type(chunk, dict)
            validate_key(chunk, TEXT, object_label="chunk")
            if ENTITY in chunk or SLOT_NAME in chunk:
                mandatory_keys = [ENTITY, SLOT_NAME]
                validate_keys(chunk, mandatory_keys, object_label="chunk")
                if is_builtin_entity(chunk[ENTITY]):
                    continue
                else:
                    validate_key(entities,
                                 chunk[ENTITY],
                                 object_label=ENTITIES)
    return intent
Example #40
def augment_utterances(dataset, intent_name, language, min_utterances,
                       capitalization_ratio, random_state):
    contexts_it = get_contexts_iterator(dataset, intent_name, random_state)
    intent_entities = get_intent_entities(dataset, intent_name)
    intent_entities = {
        e: dataset[ENTITIES][e] for e in intent_entities
        if not is_builtin_entity(e)
    }
    entities_its = get_entities_iterators(intent_entities, random_state)
    generated_utterances = []
    nb_to_generate = num_queries_to_generate(dataset, intent_name,
                                             min_utterances)
    while nb_to_generate > 0:
        generated_utterance = generate_utterance(contexts_it, entities_its)
        generated_utterances.append(generated_utterance)
        nb_to_generate -= 1

    generated_utterances = capitalize_utterances(
        generated_utterances, dataset[ENTITIES], language,
        ratio=capitalization_ratio, random_state=random_state)

    return generated_utterances
Example #41
def _reconciliate_builtin_slots(text, slots, builtin_entities):
    for slot in slots:
        if not is_builtin_entity(slot[RES_ENTITY]):
            continue
        for be in builtin_entities:
            if be[ENTITY_KIND] != slot[RES_ENTITY]:
                continue
            be_start = be[RES_MATCH_RANGE][START]
            be_end = be[RES_MATCH_RANGE][END]
            be_length = be_end - be_start
            slot_start = slot[RES_MATCH_RANGE][START]
            slot_end = slot[RES_MATCH_RANGE][END]
            slot_length = slot_end - slot_start
            if be_start <= slot_start and be_end >= slot_end \
                    and be_length > slot_length:
                slot[RES_MATCH_RANGE] = {
                    START: be_start,
                    END: be_end
                }
                slot[RES_VALUE] = text[be_start: be_end]
                break
    return slots
Example #42
def _preprocess_utterance(utterance, language,
                          entity_utterances_to_features_names,
                          word_clusters_name):
    utterance_text = get_text_from_chunks(utterance[DATA])
    utterance_tokens = tokenize_light(utterance_text, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [
        _normalize_stem(t, language) for t in utterance_tokens
    ]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    builtin_entities = get_builtin_entities(utterance_text,
                                            language,
                                            use_cache=True)
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove values of builtin slots from the utterance to avoid learning
    # specific samples such as '42' or 'tomorrow'
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(chunk[TEXT], language) for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
    ]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))

    return features
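
An illustrative shape of the final feature string (all values hypothetical): the normalized, possibly stemmed utterance tokens come first, followed by the sorted builtin-entity, dataset-entity and word-cluster feature names:

# "set temperature degree builtinentityfeaturesnipsnumber entityfeature_room"
# Builtin slot values such as "21" never reach the token list because the
# chunks tagged with builtin entities are filtered out beforehand.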
Example #43
def augment_utterances(dataset, intent_name, language, min_utterances,
                       capitalization_ratio, random_state):
    contexts_it = get_contexts_iterator(dataset, intent_name, random_state)
    intent_entities = get_intent_entities(dataset, intent_name)
    intent_entities = {
        e: dataset[ENTITIES][e]
        for e in intent_entities if not is_builtin_entity(e)
    }
    entities_its = get_entities_iterators(intent_entities, random_state)
    generated_utterances = []
    nb_to_generate = num_queries_to_generate(dataset, intent_name,
                                             min_utterances)
    while nb_to_generate > 0:
        generated_utterance = generate_utterance(contexts_it, entities_its)
        generated_utterances.append(generated_utterance)
        nb_to_generate -= 1

    generated_utterances = capitalize_utterances(generated_utterances,
                                                 dataset[ENTITIES],
                                                 language,
                                                 ratio=capitalization_ratio,
                                                 random_state=random_state)

    return generated_utterances
Example #44
    def __init__(self, name):
        if not is_builtin_entity(name):
            raise LookupError("Invalid builtin entity {}".format(name))
        self.name = name
Example #45
    @classmethod
    def mk_entity(cls, slot, automatically_extensible=True, use_synonyms=True):
        if is_builtin_entity(slot.entity):
            return BuiltinEntity(slot.entity)
        return CustomEntity([EntityUtterance(slot.text)],
                            automatically_extensible, use_synonyms)
Example #46
    @classmethod
    def mk_entity(cls, slot, automatically_extensible=True, use_synonyms=True):
        if is_builtin_entity(slot.entity):
            return BuiltinEntity(slot.entity)
        return CustomEntity([EntityUtterance(slot.text)],
                            automatically_extensible, use_synonyms)