def fit(self, dataset, force_retrain=True):
        """Fit the deterministic intent parser on a Snips dataset.

        The dataset is first passed through ``validate_and_format_dataset``.
        For every intent, patterns are generated from its utterances; a
        pattern is kept only if it is strictly shorter than
        ``self.config.max_pattern_length`` and is not produced by more than
        one intent. At most ``self.config.max_queries`` patterns per intent
        are then compiled as case-insensitive regexes and stored in
        ``self.regexes_per_intent``.

        Args:
            dataset: The dataset to fit on.
            force_retrain: Accepted for interface compatibility; this method
                does not read it.

        Returns:
            The parser itself (``self``).
        """
        logger.info("Fitting deterministic parser...")
        dataset = validate_and_format_dataset(dataset)

        # Resources and entity parsers are prepared before any parser state
        # is reset.
        self.load_resources_if_needed(dataset[LANGUAGE])
        self.fit_builtin_entity_parser_if_needed(dataset)
        self.fit_custom_entity_parser_if_needed(dataset)

        self.language = dataset[LANGUAGE]
        self.regexes_per_intent = dict()
        entity_placeholders = _get_entity_placeholders(dataset, self.language)
        self.slot_names_to_entities = get_slot_name_mappings(dataset)
        self.group_names_to_slot_names = _get_group_names_to_slot_names(
            self.slot_names_to_entities)

        # First pass: gather the candidate patterns of each intent and
        # record every pattern that shows up in more than one intent.
        seen = set()
        shared = set()
        candidates_by_intent = dict()
        for name, intent in iteritems(dataset[INTENTS]):
            candidates = [
                pattern
                for pattern in self._generate_patterns(
                    intent[UTTERANCES], entity_placeholders)
                if len(pattern) < self.config.max_pattern_length
            ]
            shared |= seen.intersection(candidates)
            seen.update(candidates)
            candidates_by_intent[name] = candidates

        # Second pass: drop the ambiguous patterns, cap the number of
        # patterns per intent, and compile what remains.
        for name, candidates in iteritems(candidates_by_intent):
            unambiguous = [
                pattern for pattern in candidates if pattern not in shared
            ]
            self.regexes_per_intent[name] = [
                re.compile(pattern, re.IGNORECASE)
                for pattern in unambiguous[:self.config.max_queries]
            ]
        return self
# Example #2
0
def _get_dataset_metadata(dataset):
    """Build a metadata summary of ``dataset``.

    Args:
        dataset: Mapping indexed by the ``LANGUAGE`` and ``ENTITIES``
            constants; it is also passed to ``get_slot_name_mappings``.

    Returns:
        dict: A dictionary with three keys:

        * ``"language_code"``: the value of ``dataset[LANGUAGE]``.
        * ``"entities"``: for each entity for which ``is_builtin_entity``
          is false, a one-item dict holding its ``AUTOMATICALLY_EXTENSIBLE``
          value.
        * ``"slot_name_mappings"``: the result of
          ``get_slot_name_mappings(dataset)``.
    """
    # Builtin entities are skipped: only the remaining ones are described.
    entities = {
        entity_name: {
            AUTOMATICALLY_EXTENSIBLE: entity[AUTOMATICALLY_EXTENSIBLE]
        }
        for entity_name, entity in iteritems(dataset[ENTITIES])
        if not is_builtin_entity(entity_name)
    }
    return {
        "language_code": dataset[LANGUAGE],
        "entities": entities,
        "slot_name_mappings": get_slot_name_mappings(dataset)
    }