Example #1
    def test_missing_intent_key_should_raise_exception(self):
        # Given
        dataset = {
            "intents": {
                "intent1": {
                    "utterances": [
                        {
                            "data": [
                                {
                                    "text": "unknown entity",
                                    "entity": "unknown_entity"
                                }
                            ]
                        }
                    ]
                }
            },
            "entities": {},
            "language": "en",
            "snips_nlu_version": "1.1.1"
        }

        # When/Then
        with self.assertRaises(KeyError) as ctx:
            validate_and_format_dataset(dataset)
        self.assertEqual(str(ctx.exception.args[0]),
                         "Expected chunk to have key: 'slot_name'")
Example #2
    def test_should_not_require_data_for_builtin_entities(self):
        # Given
        dataset = {
            "intents": {
                "intent1": {
                    "utterances": [
                        {
                            "data": [
                                {
                                    "text": "this is ",
                                },
                                {
                                    "text": "10p.m",
                                    "entity": SNIPS_DATETIME,
                                    "slot_name": "startTime"
                                }
                            ]
                        }
                    ]
                }
            },
            "entities": {
                SNIPS_DATETIME: {}
            },
            "language": "en",
        }

        # When / Then
        with self.fail_if_exception("Could not validate dataset"):
            validate_and_format_dataset(dataset)
Example #3
    def test_missing_intent_key_should_raise_exception(self):
        # Given
        dataset = {
            "intents": {
                "intent1": {
                    "utterances": [
                        {
                            "data": [
                                {
                                    "text": "unknown entity",
                                    "entity": "unknown_entity"
                                }
                            ]
                        }
                    ]
                }
            },
            "entities": {},
            "language": "en",
        }

        # When/Then
        with self.assertRaises(KeyError) as ctx:
            validate_and_format_dataset(dataset)
        self.assertEqual("Expected chunk to have key: 'slot_name'",
                         str(ctx.exception.args[0]))
Example #4
    def test_unknown_entity_should_raise_exception(self):
        # Given
        dataset = {
            "intents": {
                "intent1": {
                    "utterances": [
                        {
                            "data": [
                                {
                                    "text": "unknown entity",
                                    "entity": "unknown_entity",
                                    "slot_name": "unknown_entity_slot"
                                }
                            ]
                        }
                    ]
                }
            },
            "entities": {
                "entity1": {
                    "data": [],
                    "use_synonyms": True,
                    "automatically_extensible": False
                }
            },
            "language": "en",
            "snips_nlu_version": "1.1.1"
        }

        # When/Then
        with self.assertRaises(KeyError) as ctx:
            validate_and_format_dataset(dataset)
        self.assertEqual(str(ctx.exception.args[0]),
                         "Expected entities to have key: 'unknown_entity'")
Example #5
    def test_unknown_entity_should_raise_exception(self):
        # Given
        dataset = {
            "intents": {
                "intent1": {
                    "utterances": [
                        {
                            "data": [
                                {
                                    "text": "unknown entity",
                                    "entity": "unknown_entity",
                                    "slot_name": "unknown_entity_slot"
                                }
                            ]
                        }
                    ]
                }
            },
            "entities": {
                "entity1": {
                    "data": [],
                    "use_synonyms": True,
                    "automatically_extensible": False
                }
            },
            "language": "en",
        }

        # When/Then
        with self.assertRaises(KeyError) as ctx:
            validate_and_format_dataset(dataset)
        self.assertEqual("Expected entities to have key: 'unknown_entity'",
                         str(ctx.exception.args[0]))
Example #6
    def test_should_not_require_data_for_builtin_entities(self):
        # Given
        dataset = {
            "intents": {
                "intent1": {
                    "utterances": [
                        {
                            "data": [
                                {
                                    "text": "this is ",
                                },
                                {
                                    "text": "10p.m",
                                    "entity": SNIPS_DATETIME,
                                    "slot_name": "startTime"
                                }
                            ]
                        }
                    ]
                }
            },
            "entities": {
                SNIPS_DATETIME: {}
            },
            "language": "en",
            "snips_nlu_version": "0.1.0"
        }

        # When / Then
        with self.fail_if_exception("Could not validate dataset"):
            validate_and_format_dataset(dataset)
Example #7
    def test_should_generate_dataset_from_yaml_files(self, mock_io):
        # Given
        intent_file_1 = "whoIsGame.yaml"
        intent_file_2 = "getWeather.yaml"
        entity_file_1 = "location.yaml"

        who_is_game_yaml = """
# whoIsGame Intent
---
type: intent
name: whoIsGame
utterances:
  - who is the [role](president) of [country](France)
  - who is the [role](CEO) of [company](Google) please
        """

        get_weather_yaml = """
# getWeather Intent
---
type: intent
name: getWeather
utterances:
  - what is the weather in [weatherLocation:location](Paris)?
  - is it raining in [weatherLocation] [weatherDate:snips/datetime]
        """

        location_yaml = """
# Location Entity
---
type: entity
name: location
automatically_extensible: true
values:
- [new york, big apple]
- london
        """

        # pylint:disable=unused-argument
        def mock_open(filename, **kwargs):
            if filename == intent_file_1:
                return io.StringIO(who_is_game_yaml)
            if filename == intent_file_2:
                return io.StringIO(get_weather_yaml)
            if filename == entity_file_1:
                return io.StringIO(location_yaml)
            return None

        # pylint:enable=unused-argument

        mock_io.open.side_effect = mock_open
        dataset_files = [intent_file_1, intent_file_2, entity_file_1]

        # When
        dataset = Dataset.from_yaml_files("en", dataset_files)
        dataset_dict = dataset.json

        # Then
        validate_and_format_dataset(dataset_dict)
        self.assertDictEqual(EXPECTED_DATASET_DICT, dataset_dict)
Example #8
    def test_should_generate_dataset_from_file(self):
        # Given
        dataset_path_1 = os.path.join(ROOT_PATH, "snips_nlu_dataset",
                                      "examples", "whoIsGame.txt")
        dataset_path_2 = os.path.join(ROOT_PATH, "snips_nlu_dataset",
                                      "examples", "getWeather.txt")
        dataset = AssistantDataset.from_files("en",
                                              [dataset_path_1, dataset_path_2])
        dataset_dict = dataset.json

        # When / Then
        validate_and_format_dataset(dataset_dict)
Example #9
    def test_should_generate_dataset_from_file(self):
        # Given
        dataset_path_1 = os.path.join(ROOT_PATH, "snips_nlu_dataset",
                                      "examples", "whoIsGame.txt")
        dataset_path_2 = os.path.join(ROOT_PATH, "snips_nlu_dataset",
                                      "examples", "getWeather.txt")
        dataset = AssistantDataset.from_files(
            "en", [dataset_path_1, dataset_path_2])
        dataset_dict = dataset.json

        # When / Then
        validate_and_format_dataset(dataset_dict)
Example #10
    def test_invalid_language_should_raise_exception(self):
        # Given
        dataset = {
            "intents": {},
            "entities": {},
            "language": "eng",
        }

        # When/Then
        with self.assertRaises(ValueError) as ctx:
            validate_and_format_dataset(dataset)
        self.assertEqual("Unknown language: 'eng'", str(ctx.exception.args[0]))
Example #11
    def test_invalid_language_should_raise_exception(self):
        # Given
        dataset = {
            "intents": {},
            "entities": {},
            "language": "eng",
            "snips_nlu_version": "1.1.1"
        }

        # When/Then
        with self.assertRaises(ValueError) as ctx:
            validate_and_format_dataset(dataset)
        self.assertEqual(str(ctx.exception.args[0]), "Unknown language: 'eng'")
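Examples #10 and #11 encode the same rule: the "language" value must be one
of the codes the library supports ("en" in these examples); the three-letter
"eng" is rejected with a ValueError. A minimal passing sketch:

from snips_nlu.dataset import validate_and_format_dataset

dataset = {"intents": {}, "entities": {}, "language": "en"}  # not "eng"
validated = validate_and_format_dataset(dataset)
assert validated["language"] == "en"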
Example #12
    def fit(self, dataset, force_retrain=True):
        """Fits the intent parser with a valid Snips dataset"""
        logger.info("Fitting lookup intent parser...")
        dataset = validate_and_format_dataset(dataset)
        self.load_resources_if_needed(dataset[LANGUAGE])
        self.fit_builtin_entity_parser_if_needed(dataset)
        self.fit_custom_entity_parser_if_needed(dataset)
        self.language = dataset[LANGUAGE]
        self._entity_scopes = _get_entity_scopes(dataset)
        self._map = dict()
        self._stop_words_whitelist = get_stop_words_whitelist(
            dataset, self._stop_words)
        entity_placeholders = _get_entity_placeholders(dataset, self.language)

        ambiguous_keys = set()
        for (key, val) in self._generate_io_mapping(dataset[INTENTS],
                                                    entity_placeholders):
            key = hash_str(key)
            # handle key collisions -*- flag ambiguous entries -*-
            if key in self._map and self._map[key] != val:
                ambiguous_keys.add(key)
            else:
                self._map[key] = val

        # delete ambiguous keys
        for key in ambiguous_keys:
            self._map.pop(key)

        return self
Example #13
    def test_should_support_int_or_float_for_matching_strictness(self):
        # Given
        dataset = {
            "intents": {},
            "entities": {
                "entity1": {
                    "data": [],
                    "automatically_extensible": False,
                    "use_synonyms": True,
                    "matching_strictness": 0.5
                },
                "entity2": {
                    "data": [],
                    "automatically_extensible": False,
                    "use_synonyms": True,
                    "matching_strictness": 1
                }
            },
            "language": "en",
        }

        # When/Then
        dataset = validate_and_format_dataset(dataset)

        self.assertEqual(
            0.5, dataset["entities"]["entity1"].get("matching_strictness"))
        self.assertEqual(
            1, dataset["entities"]["entity2"].get("matching_strictness"))
Example #14
    def test_engine_should_fit_with_builtins_entities(self):
        # Given
        dataset = validate_and_format_dataset({
            "intents": {
                "dummy": {
                    "utterances": [
                        {
                            "data": [
                                {
                                    "text": "10p.m.",
                                    "entity": "snips/datetime",
                                    "slot_name": "startTime"
                                }
                            ]
                        }
                    ]
                }
            },
            "entities": {
                "snips/datetime": {}
            },
            "language": "en",
            "snips_nlu_version": "0.0.1"
        })

        # When / Then
        SnipsNLUEngine().fit(dataset)  # This should not raise any error
Example #15
    def test_should_not_build_builtin_parser_when_provided(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me a [beverage_temperature:Temperature](hot) cup of tea
- make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
- make me [number_of_cups:snips/number](one) cup of coffee please
- brew [number_of_cups] cups of coffee""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        dataset = validate_and_format_dataset(dataset)
        builtin_entity_parser = BuiltinEntityParser.build(language="en")

        # When
        with patch("snips_nlu.entity_parser.builtin_entity_parser"
                   ".BuiltinEntityParser.build") as mocked_build_parser:
            engine = SnipsNLUEngine(
                builtin_entity_parser=builtin_entity_parser)
            engine.fit(dataset)

        # Then
        mocked_build_parser.assert_not_called()
Example #16
    def test_should_be_serializable(self, mock_to_dict):
        # Given
        mocked_dict = {"mocked_featurizer_key": "mocked_featurizer_value"}

        mock_to_dict.return_value = mocked_dict

        dataset = validate_and_format_dataset(SAMPLE_DATASET)

        intent_classifier = LogRegIntentClassifier().fit(dataset)
        coeffs = intent_classifier.classifier.coef_.tolist()
        intercept = intent_classifier.classifier.intercept_.tolist()

        # When
        intent_classifier.persist(self.tmp_file_path)

        # Then
        intent_list = sorted(SAMPLE_DATASET[INTENTS])
        intent_list.append(None)
        expected_dict = {
            "unit_name": "log_reg_intent_classifier",
            "config": LogRegIntentClassifierConfig().to_dict(),
            "coeffs": coeffs,
            "intercept": intercept,
            "t_": 701.0,
            "intent_list": intent_list,
            "featurizer": mocked_dict
        }
        metadata = {"unit_name": "log_reg_intent_classifier"}
        self.assertJsonContent(self.tmp_file_path / "metadata.json", metadata)
        self.assertJsonContent(self.tmp_file_path / "intent_classifier.json",
                               expected_dict)
Example #17
    def test_should_be_serializable(self, mocked_generate_regexes):
        # Given

        # pylint: disable=unused-argument
        def mock_generate_patterns(utterances, joined_entity_utterances,
                                   group_names_to_slot_names, language):
            patterns = ["mocked_regex_%s" % i for i in range(len(utterances))]
            group_to_slot = {"group_0": "dummy slot name"}
            return patterns, group_to_slot

        # pylint: enable=unused-argument

        mocked_generate_regexes.side_effect = mock_generate_patterns
        dataset = validate_and_format_dataset(SAMPLE_DATASET)
        config = DeterministicIntentParserConfig(max_queries=42,
                                                 max_pattern_length=100)
        parser = DeterministicIntentParser(config=config).fit(dataset)

        # When
        parser.persist(self.tmp_file_path)

        # Then
        expected_dict = {
            "unit_name": "deterministic_intent_parser",
            "config": {
                "unit_name": "deterministic_intent_parser",
                "max_queries": 42,
                "max_pattern_length": 100
            },
            "language_code": "en",
            "group_names_to_slot_names": {
                "group_0": "dummy slot name"
            },
            "patterns": {
                "dummy_intent_1": [
                    "mocked_regex_0",
                    "mocked_regex_1",
                    "mocked_regex_2",
                    "mocked_regex_3"
                ],
                "dummy_intent_2": [
                    "mocked_regex_0"
                ]
            },
            "slot_names_to_entities": {
                "dummy_intent_1": {
                    "dummy_slot_name": "dummy_entity_1",
                    "dummy_slot_name3": "dummy_entity_2",
                    "dummy_slot_name2": "dummy_entity_2"
                },
                "dummy_intent_2": {
                    "dummy slot nàme": "dummy_entity_1"
                }
            }
        }
        metadata = {"unit_name": "deterministic_intent_parser"}
        self.assertJsonContent(self.tmp_file_path / "metadata.json",
                               metadata)
        self.assertJsonContent(self.tmp_file_path / "intent_parser.json",
                               expected_dict)
Example #18
    def test_engine_should_fit_with_builtins_entities(self):
        # Given
        dataset = validate_and_format_dataset({
            "intents": {
                "dummy": {
                    "utterances": [
                        {
                            "data": [
                                {
                                    "text": "10p.m.",
                                    "entity": "snips/datetime",
                                    "slot_name": "startTime"
                                }
                            ]
                        }
                    ]
                }
            },
            "entities": {
                "snips/datetime": {}
            },
            "language": "en",
            "snips_nlu_version": "0.0.1"
        })

        # When / Then
        SnipsNLUEngine().fit(dataset)  # This should not raise any error
Example #19
    def test_should_be_serializable(self, mock_to_dict):
        # Given
        mocked_dict = {"mocked_featurizer_key": "mocked_featurizer_value"}

        mock_to_dict.return_value = mocked_dict

        dataset = validate_and_format_dataset(SAMPLE_DATASET)

        intent_classifier = LogRegIntentClassifier().fit(dataset)
        coeffs = intent_classifier.classifier.coef_.tolist()
        intercept = intent_classifier.classifier.intercept_.tolist()

        # When
        classifier_dict = intent_classifier.to_dict()

        # Then
        intent_list = sorted(SAMPLE_DATASET[INTENTS])
        intent_list.append(None)
        expected_dict = {
            "unit_name": "log_reg_intent_classifier",
            "config": LogRegIntentClassifierConfig().to_dict(),
            "coeffs": coeffs,
            "intercept": intercept,
            "t_": 701.0,
            "intent_list": intent_list,
            "featurizer": mocked_dict
        }
        self.assertEqual(expected_dict, classifier_dict)
Example #20
    def build(cls, dataset, parser_usage):
        from snips_nlu.dataset import validate_and_format_dataset

        dataset = validate_and_format_dataset(dataset)
        language = dataset[LANGUAGE]
        custom_entities = {
            entity_name: deepcopy(entity)
            for entity_name, entity in iteritems(dataset[ENTITIES])
            if not is_builtin_entity(entity_name)
        }
        if parser_usage == CustomEntityParserUsage.WITH_AND_WITHOUT_STEMS:
            for ent in viewvalues(custom_entities):
                stemmed_utterances = _stem_entity_utterances(
                    ent[UTTERANCES], language)
                ent[UTTERANCES] = _merge_entity_utterances(
                    ent[UTTERANCES], stemmed_utterances)
        elif parser_usage == CustomEntityParserUsage.WITH_STEMS:
            for ent in viewvalues(custom_entities):
                ent[UTTERANCES] = _stem_entity_utterances(
                    ent[UTTERANCES], language)
        elif parser_usage is None:
            raise ValueError("A parser usage must be defined in order to fit "
                             "a CustomEntityParser")
        configuration = _create_custom_entity_parser_configuration(
            custom_entities)
        parser = GazetteerEntityParser.build(configuration)
        return cls(parser, language, parser_usage)
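A hedged usage sketch for the classmethod above; the import path is an
assumption based on the module names appearing elsewhere in these examples,
and `dataset` stands for any valid Snips dataset dict:

from snips_nlu.entity_parser import (
    CustomEntityParser, CustomEntityParserUsage)

# WITH_STEMS and WITH_AND_WITHOUT_STEMS are the usages referenced above;
# passing None as the usage raises the ValueError shown in the snippet.
parser = CustomEntityParser.build(
    dataset, CustomEntityParserUsage.WITH_STEMS)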
Example #21
    def fit(self, dataset, intent):
        """Fit the slot filler

        Args:
            dataset (dict): A valid Snips dataset
            intent (str): The specific intent of the dataset to train
                the slot filler on

        Returns:
            :class:`CRFSlotFiller`: The same instance, trained
        """
        logger.debug("Fitting %s slot filler...", intent)
        dataset = validate_and_format_dataset(dataset)
        self.fit_builtin_entity_parser_if_needed(dataset)
        self.fit_custom_entity_parser_if_needed(dataset)
        self.language = dataset[LANGUAGE]
        self.intent = intent
        self.slot_name_mapping = get_slot_name_mapping(dataset, intent)

        if not self.slot_name_mapping:
            # No need to train the CRF if the intent has no slots
            return self

        random_state = check_random_state(self.config.random_seed)
        augmented_intent_utterances = augment_utterances(
            dataset,
            self.intent,
            language=self.language,
            random_state=random_state,
            **self.config.data_augmentation_config.to_dict())

        crf_samples = [
            utterance_to_sample(u[DATA], self.config.tagging_scheme,
                                self.language)
            for u in augmented_intent_utterances
        ]

        for factory in self.features_factories:
            factory.fit(dataset, intent)

        # Ensure that X, Y are safe and that the OUTSIDE label is learnt to
        # avoid segfault at inference time
        # pylint: disable=C0103
        X = [
            self.compute_features(sample[TOKENS], drop_out=True)
            for sample in crf_samples
        ]
        Y = [[tag for tag in sample[TAGS]] for sample in crf_samples]
        X, Y = _ensure_safe(X, Y)

        # ensure ascii tags
        Y = [[_encode_tag(tag) for tag in y] for y in Y]

        # pylint: enable=C0103
        self.crf_model = _get_crf_model(self.config.crf_args)
        self.crf_model.fit(X, Y)

        logger.debug("Most relevant features for %s:\n%s", self.intent,
                     DifferedLoggingMessage(self.log_weights))
        return self
Example #22
    def fit(self, dataset, force_retrain=True):
        """Fit the probabilistic intent parser

        Args:
            dataset (dict): A valid Snips dataset
            force_retrain (bool, optional): If *False*, will not retrain the
                intent classifier and slot fillers when they are already
                fitted. Defaults to *True*.

        Returns:
            :class:`ProbabilisticIntentParser`: The same instance, trained
        """
        dataset = validate_and_format_dataset(dataset)
        intents = list(dataset[INTENTS])
        if self.intent_classifier is None:
            self.intent_classifier = build_processing_unit(
                self.config.intent_classifier_config)
        if force_retrain or not self.intent_classifier.fitted:
            self.intent_classifier.fit(dataset)

        if self.slot_fillers is None:
            self.slot_fillers = dict()
        for intent_name in intents:
            # We need to copy the slot filler config as it may be mutated
            if self.slot_fillers.get(intent_name) is None:
                slot_filler_config = deepcopy(self.config.slot_filler_config)
                self.slot_fillers[intent_name] = build_processing_unit(
                    slot_filler_config)
            if force_retrain or not self.slot_fillers[intent_name].fitted:
                self.slot_fillers[intent_name].fit(dataset, intent_name)
        return self
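A sketch of how a parser fitted this way is typically driven, using the class
names from Example #31; the `parse` call is an assumption about the intent
parser interface, not code taken from this listing:

from snips_nlu.dataset import validate_and_format_dataset
from snips_nlu.intent_parser import ProbabilisticIntentParser

dataset = validate_and_format_dataset(BEVERAGE_DATASET)  # as in Example #31
parser = ProbabilisticIntentParser().fit(dataset)
parsing = parser.parse("Make me two cups of tea")  # assumed signature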
Example #23
    def fit(self, dataset, force_retrain=True):
        """Fit the NLU engine

        Args:
            dataset (dict): A valid Snips dataset
            force_retrain (bool, optional): If *False*, will not retrain intent
                parsers when they are already fitted. Defaults to *True*.

        Returns:
            The same object, trained.
        """
        logger.info("Fitting NLU engine...")
        dataset = validate_and_format_dataset(dataset)
        self._dataset_metadata = _get_dataset_metadata(dataset)

        if self.config is None:
            language = self._dataset_metadata["language_code"]
            self.config = self.config_type.from_dict(DEFAULT_CONFIGS[language])

        parsers = []
        for parser_config in self.config.intent_parsers_configs:
            # Re-use existing parsers to allow pre-training
            recycled_parser = None
            for parser in self.intent_parsers:
                if parser.unit_name == parser_config.unit_name:
                    recycled_parser = parser
                    break
            if recycled_parser is None:
                recycled_parser = build_processing_unit(parser_config)
            if force_retrain or not recycled_parser.fitted:
                recycled_parser.fit(dataset, force_retrain)
            parsers.append(recycled_parser)

        self.intent_parsers = parsers
        return self
Example #24
    def fit(self, dataset, force_retrain=True):
        """Fit the NLU engine

        Args:
            dataset (dict): A valid Snips dataset
            force_retrain (bool, optional): If *False*, will not retrain intent
                parsers when they are already fitted. Defaults to *True*.

        Returns:
            The same object, trained.
        """
        dataset = validate_and_format_dataset(dataset)
        self._dataset_metadata = _get_dataset_metadata(dataset)

        if self.config is None:
            language = self._dataset_metadata["language_code"]
            self.config = self.config_type.from_dict(DEFAULT_CONFIGS[language])

        parsers = []
        for parser_config in self.config.intent_parsers_configs:
            # Re-use existing parsers to allow pre-training
            recycled_parser = None
            for parser in self.intent_parsers:
                if parser.unit_name == parser_config.unit_name:
                    recycled_parser = parser
                    break
            if recycled_parser is None:
                recycled_parser = build_processing_unit(parser_config)
            if force_retrain or not recycled_parser.fitted:
                recycled_parser.fit(dataset, force_retrain)
            parsers.append(recycled_parser)

        self.intent_parsers = parsers
        return self
Example #25
    def test_should_normalize_synonyms(self, mocked_get_string_variations):
        # Given
        def mock_get_string_variations(variation, language):
            return {variation.lower(), variation.title()}

        mocked_get_string_variations.side_effect = mock_get_string_variations
        dataset = {
            "intents": {
                "intent1": {
                    "utterances": [{
                        "data": [{
                            "text": "ëNtity",
                            "entity": "entity1",
                            "slot_name": "startTime"
                        }]
                    }]
                }
            },
            "entities": {
                "entity1": {
                    "data": [],
                    "use_synonyms": True,
                    "automatically_extensible": True
                }
            },
            "language": "en",
        }

        expected_dataset = {
            "intents": {
                "intent1": {
                    "utterances": [{
                        "data": [{
                            "text": "ëNtity",
                            "entity": "entity1",
                            "slot_name": "startTime"
                        }]
                    }]
                }
            },
            "entities": {
                "entity1": {
                    "utterances": {
                        "ëntity": "ëNtity",
                        "Ëntity": "ëNtity",
                        "ëNtity": "ëNtity"
                    },
                    "automatically_extensible": True,
                    "capitalize": False
                }
            },
            "language": "en",
            "validated": True
        }

        # When
        dataset = validate_and_format_dataset(dataset)

        # Then
        self.assertDictEqual(expected_dataset, dataset)
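The takeaway of this test, stated as code against its own expected output:
validation flags the dataset and replaces each custom entity's "data" list
with a flat variation-to-reference-value map:

validated = validate_and_format_dataset(dataset)
assert validated["validated"] is True
# every generated variation maps back to the reference value "ëNtity":
assert validated["entities"]["entity1"]["utterances"]["Ëntity"] == "ëNtity"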
Example #26
    def test_should_get_builtin_slots(self):
        # Given
        dataset = validate_and_format_dataset(WEATHER_DATASET)
        config = CRFSlotFillerConfig(random_seed=42)
        intent = "SearchWeatherForecast"
        slot_filler = CRFSlotFiller(config)
        slot_filler.fit(dataset, intent)

        # When
        slots = slot_filler.get_slots("Give me the weather at 9p.m. in Paris")

        # Then
        expected_slots = [
            unresolved_slot(match_range={
                START: 20,
                END: 28
            },
                            value='at 9p.m.',
                            entity='snips/datetime',
                            slot_name='datetime'),
            unresolved_slot(match_range={
                START: 32,
                END: 37
            },
                            value='Paris',
                            entity='weather_location',
                            slot_name='location')
        ]
        self.assertListEqual(expected_slots, slots)
Example #27
    def fit(self, dataset, force_retrain=True):
        """Fit the slot filler

        Args:
            dataset (dict): A valid Snips dataset
            force_retrain (bool, optional): If *False*, will not retrain intent
                classifier and slot fillers when they are already fitted.
                Default to *True*.

        Returns:
            :class:`ProbabilisticIntentParser`: The same instance, trained
        """
        logger.info("Fitting probabilistic intent parser...")
        dataset = validate_and_format_dataset(dataset)
        intents = list(dataset[INTENTS])
        if self.intent_classifier is None:
            self.intent_classifier = build_processing_unit(
                self.config.intent_classifier_config)
        if force_retrain or not self.intent_classifier.fitted:
            self.intent_classifier.fit(dataset)

        if self.slot_fillers is None:
            self.slot_fillers = dict()
        slot_fillers_start = datetime.now()
        for intent_name in intents:
            # We need to copy the slot filler config as it may be mutated
            if self.slot_fillers.get(intent_name) is None:
                slot_filler_config = deepcopy(self.config.slot_filler_config)
                self.slot_fillers[intent_name] = build_processing_unit(
                    slot_filler_config)
            if force_retrain or not self.slot_fillers[intent_name].fitted:
                self.slot_fillers[intent_name].fit(dataset, intent_name)
        logger.debug("Fitted slot fillers in %s",
                     elapsed_since(slot_fillers_start))
        return self
Example #28
    def fit(self, dataset):
        """Fit the intent classifier with a valid Snips dataset

        Returns:
            :class:`LogRegIntentClassifier`: The same instance, trained
        """
        logger.debug("Fitting LogRegIntentClassifier...")
        dataset = validate_and_format_dataset(dataset)
        language = dataset[LANGUAGE]
        random_state = check_random_state(self.config.random_seed)

        data_augmentation_config = self.config.data_augmentation_config
        utterances, classes, intent_list = build_training_data(
            dataset, language, data_augmentation_config, random_state)

        self.intent_list = intent_list
        if len(self.intent_list) <= 1:
            return self

        self.featurizer = Featurizer(
            language,
            data_augmentation_config.unknown_words_replacement_string,
            self.config.featurizer_config)
        self.featurizer = self.featurizer.fit(dataset, utterances, classes)
        if self.featurizer is None:
            return self

        X = self.featurizer.transform(utterances)  # pylint: disable=C0103
        alpha = get_regularization_factor(dataset)
        self.classifier = SGDClassifier(random_state=random_state,
                                        alpha=alpha, **LOG_REG_ARGS)
        self.classifier.fit(X, classes)
        logger.debug("%s", DifferedLoggingMessage(self.log_best_features))
        return self
Example #29
    def test_should_compute_features(self):
        # Given
        features_factories = [
            {
                "factory_name": NgramFactory.name,
                "args": {
                    "n": 1,
                    "use_stemming": False,
                    "common_words_gazetteer_name": None
                },
                "offsets": [0],
                "drop_out": 0.3
            },
        ]
        slot_filler_config = CRFSlotFillerConfig(
            feature_factory_configs=features_factories, random_seed=40)
        slot_filler = CRFSlotFiller(slot_filler_config)

        tokens = tokenize("foo hello world bar", LANGUAGE_EN)
        dataset = validate_and_format_dataset(SAMPLE_DATASET)
        slot_filler.fit(dataset, intent="dummy_intent_1")

        # When
        features_with_drop_out = slot_filler.compute_features(tokens, True)

        # Then
        expected_features = [
            {"ngram_1": "foo"},
            {},
            {"ngram_1": "world"},
            {},
        ]
        self.assertListEqual(expected_features, features_with_drop_out)
Example #30
    def fit(self, dataset, force_retrain=True):
        """Fits the intent parser with a valid Snips dataset"""
        logger.info("Fitting deterministic parser...")
        dataset = validate_and_format_dataset(dataset)
        self.load_resources_if_needed(dataset[LANGUAGE])
        self.fit_builtin_entity_parser_if_needed(dataset)
        self.fit_custom_entity_parser_if_needed(dataset)
        self.language = dataset[LANGUAGE]
        self.regexes_per_intent = dict()
        entity_placeholders = _get_entity_placeholders(dataset, self.language)
        self.slot_names_to_entities = get_slot_name_mappings(dataset)
        self.group_names_to_slot_names = _get_group_names_to_slot_names(
            self.slot_names_to_entities)

        # Do not use ambiguous patterns that appear in more than one intent
        all_patterns = set()
        ambiguous_patterns = set()
        intent_patterns = dict()
        for intent_name, intent in iteritems(dataset[INTENTS]):
            patterns = self._generate_patterns(intent[UTTERANCES],
                                               entity_placeholders)
            patterns = [
                p for p in patterns if len(p) < self.config.max_pattern_length
            ]
            existing_patterns = {p for p in patterns if p in all_patterns}
            ambiguous_patterns.update(existing_patterns)
            all_patterns.update(set(patterns))
            intent_patterns[intent_name] = patterns

        for intent_name, patterns in iteritems(intent_patterns):
            patterns = [p for p in patterns if p not in ambiguous_patterns]
            patterns = patterns[:self.config.max_queries]
            regexes = [re.compile(p, re.IGNORECASE) for p in patterns]
            self.regexes_per_intent[intent_name] = regexes
        return self
Example #31
    def test_fitting_should_be_reproducible_after_serialization(self):
        # Given
        dataset = BEVERAGE_DATASET
        validated_dataset = validate_and_format_dataset(dataset)

        seed1 = 666
        seed2 = 42
        config = ProbabilisticIntentParserConfig(
            intent_classifier_config=LogRegIntentClassifierConfig(
                random_seed=seed1),
            slot_filler_config=CRFSlotFillerConfig(random_seed=seed2))
        parser = ProbabilisticIntentParser(config)
        parser_dict = parser.to_dict()

        # When
        fitted_parser_1 = ProbabilisticIntentParser.from_dict(parser_dict).fit(
            validated_dataset)

        fitted_parser_2 = ProbabilisticIntentParser.from_dict(parser_dict).fit(
            validated_dataset)

        # Then
        feature_weights_1 = fitted_parser_1.slot_fillers[
            "MakeTea"].crf_model.state_features_
        feature_weights_2 = fitted_parser_2.slot_fillers[
            "MakeTea"].crf_model.state_features_
        self.assertEqual(feature_weights_1, feature_weights_2)
Example #32
    def fit(self, dataset):
        """Fits the intent classifier with a valid Snips dataset

        Returns:
            :class:`LogRegIntentClassifier`: The same instance, trained
        """
        from sklearn.linear_model import SGDClassifier
        from sklearn.utils import compute_class_weight

        logger.info("Fitting LogRegIntentClassifier...")
        dataset = validate_and_format_dataset(dataset)
        self.load_resources_if_needed(dataset[LANGUAGE])
        self.fit_builtin_entity_parser_if_needed(dataset)
        self.fit_custom_entity_parser_if_needed(dataset)
        language = dataset[LANGUAGE]

        data_augmentation_config = self.config.data_augmentation_config
        utterances, classes, intent_list = build_training_data(
            dataset, language, data_augmentation_config, self.resources,
            self.random_state)

        self.intent_list = intent_list
        if len(self.intent_list) <= 1:
            return self

        self.featurizer = Featurizer(
            config=self.config.featurizer_config,
            builtin_entity_parser=self.builtin_entity_parser,
            custom_entity_parser=self.custom_entity_parser,
            resources=self.resources,
            random_state=self.random_state,
        )
        self.featurizer.language = language

        none_class = max(classes)
        try:
            x = self.featurizer.fit_transform(dataset, utterances, classes,
                                              none_class)
        except _EmptyDatasetUtterancesError:
            logger.warning("No (non-empty) utterances found in dataset")
            self.featurizer = None
            return self

        alpha = get_regularization_factor(dataset)

        class_weights_arr = compute_class_weight("balanced",
                                                 range(none_class + 1),
                                                 classes)
        # Re-weight the noise class
        class_weights_arr[-1] *= self.config.noise_reweight_factor
        class_weight = {idx: w for idx, w in enumerate(class_weights_arr)}

        self.classifier = SGDClassifier(random_state=self.random_state,
                                        alpha=alpha,
                                        class_weight=class_weight,
                                        **LOG_REG_ARGS)
        self.classifier.fit(x, classes)
        logger.debug("%s", DifferedLoggingMessage(self.log_best_features))
        return self
Example #33
    def test_should_generate_dataset_from_file(self):
        # Given
        dataset_path_1 = os.path.join(ROOT_PATH, "snips_nlu_dataset",
                                      "examples", "whoIsGame.txt")
        dataset_path_2 = os.path.join(ROOT_PATH, "snips_nlu_dataset",
                                      "examples", "getWeather.txt")
        dataset = AssistantDataset.from_files("en",
                                              [dataset_path_1, dataset_path_2])
        dataset_dict = dataset.json

        # When / Then
        validate_and_format_dataset(dataset_dict)
        expected_intents = {"getWeather", "whoIsGame"}
        self.assertEqual(expected_intents, set(dataset_dict[INTENTS]))
        expected_entities = {
            "location", "snips/datetime", "role", "country", "company"
        }
        self.assertEqual(expected_entities, set(dataset_dict[ENTITIES]))
Example #34
    def test_should_be_serializable(self, mock_serialize_crf_model):
        # Given
        mock_serialize_crf_model.return_value = "mocked_crf_model_data"
        features_factories = [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1},
                "offsets": [0]
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0]
            }
        ]
        config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=features_factories)
        dataset = validate_and_format_dataset(SAMPLE_DATASET)

        slot_filler = CRFSlotFiller(config)
        intent = "dummy_intent_1"
        slot_filler.fit(dataset, intent=intent)

        # When
        actual_slot_filler_dict = slot_filler.to_dict()

        # Then
        expected_feature_factories = [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1, "language_code": "en"},
                "offsets": [0]
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0]
            }
        ]
        expected_config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=expected_feature_factories)
        expected_slot_filler_dict = {
            "unit_name": "crf_slot_filler",
            "crf_model_data": "mocked_crf_model_data",
            "language_code": "en",
            "config": expected_config.to_dict(),
            "intent": intent,
            "slot_name_mapping": {
                "dummy_slot_name": "dummy_entity_1",
                "dummy_slot_name2": "dummy_entity_2",
                "dummy_slot_name3": "dummy_entity_2",
            }
        }
        self.assertDictEqual(actual_slot_filler_dict,
                             expected_slot_filler_dict)
Example #35
    def test_missing_entity_key_should_raise_exception(self):
        # Given
        dataset = {
            "intents": {},
            "entities": {
                "entity1": {
                    "data": [],
                    "automatically_extensible": False
                }
            },
            "language": "en",
        }

        # When/Then
        with self.assertRaises(KeyError) as ctx:
            validate_and_format_dataset(dataset)
        self.assertEqual("Expected entity to have key: 'use_synonyms'",
                         str(ctx.exception.args[0]))
Example #36
    def test_should_generate_dataset_from_files(self):
        # Given
        intent_file_1 = "intent_whoIsGame.txt"
        intent_file_2 = "intent_getWeather.txt"
        entity_file_1 = "entity_location.txt"

        who_is_game_txt = """
who is the [role:role](president) of [country:country](France)
who is the [role:role](CEO) of [company:company](Google) please
"""

        get_weather_txt = """
what is the weather in [weatherLocation:location](Paris)?
is it raining in [weatherLocation] [weatherDate:snips/datetime]
"""

        location_txt = """
new york,big apple
london
        """

        # pylint:disable=unused-argument
        def mock_open(self_, *args, **kwargs):
            if str(self_) == intent_file_1:
                return io.StringIO(who_is_game_txt)
            if str(self_) == intent_file_2:
                return io.StringIO(get_weather_txt)
            if str(self_) == entity_file_1:
                return io.StringIO(location_txt)
            return None

        # pylint:enable=unused-argument

        dataset_files = [intent_file_1, intent_file_2, entity_file_1]

        # When
        with patch("pathlib.io") as mock_io:
            mock_io.open.side_effect = mock_open
            dataset = Dataset.from_files("en", dataset_files)
        dataset_dict = dataset.json

        # When / Then
        validate_and_format_dataset(dataset_dict)
        self.assertDictEqual(EXPECTED_DATASET_DICT, dataset_dict)
Example #37
    def test_should_generate_dataset_from_yaml_files(self):
        # Given
        who_is_game_yaml = io.StringIO("""
# whoIsGame Intent
---
type: intent
name: whoIsGame
utterances:
  - who is the [role](president) of [country](France)
  - who is the [role](CEO) of [company](Google) please
        """)

        get_weather_yaml = io.StringIO("""
# getWeather Intent
---
type: intent
name: getWeather
utterances:
  - what is the weather in [weatherLocation:location](Paris)?
  - is it raining in [weatherLocation] [weatherDate:snips/datetime]
        """)

        location_yaml = io.StringIO("""
# Location Entity
---
type: entity
name: location
automatically_extensible: true
values:
- [new york, big apple]
- london
        """)

        dataset_files = [who_is_game_yaml, get_weather_yaml, location_yaml]

        # When
        with mock.patch("snips_nlu_parsers.get_builtin_entity_examples",
                        return_value=["Today"]):
            dataset = Dataset.from_yaml_files("en", dataset_files)

        # Then
        validate_and_format_dataset(dataset)
        self.assertDictEqual(EXPECTED_DATASET_DICT, dataset.json)
Example #38
    def test_should_handle_empty_dataset(self):
        # Given
        dataset = validate_and_format_dataset(get_empty_dataset(LANGUAGE_EN))
        engine = SnipsNLUEngine().fit(dataset)

        # When
        result = engine.parse("hello world")

        # Then
        self.assertEqual(empty_result("hello world"), result)
Example #39
    def test_should_handle_empty_dataset(self):
        # Given
        dataset = validate_and_format_dataset(get_empty_dataset(LANGUAGE_EN))
        engine = SnipsNLUEngine().fit(dataset)

        # When
        result = engine.parse("hello world")

        # Then
        self.assertEqual(empty_result("hello world"), result)
Example #40
    def test_missing_entity_key_should_raise_exception(self):
        # Given
        dataset = {
            "intents": {},
            "entities": {
                "entity1": {
                    "data": [],
                    "automatically_extensible": False
                }
            },
            "language": "en",
            "snips_nlu_version": "1.1.1"
        }

        # When/Then
        with self.assertRaises(KeyError) as ctx:
            validate_and_format_dataset(dataset)
        self.assertEqual(str(ctx.exception.args[0]),
                         "Expected entity to have key: 'use_synonyms'")
Example #41
    def test_should_extract_entity_values(self):
        # Given
        set_light_color_yaml = io.StringIO("""
---
type: intent
name: setLightColor
utterances:
  - set the lights to [color](blue)
  - change the light to [color](yellow) in the [room](bedroom)""")

        turn_light_on_yaml = io.StringIO("""
---
type: intent
name: turnLightOn
utterances:
  - turn the light on in the [room](kitchen)
  - turn the [room](bathroom)'s lights on""")

        color_yaml = io.StringIO("""
type: entity
name: color
values:
- [blue, cyan]
- red""")

        room_yaml = io.StringIO("""
type: entity
name: room
values:
- garage
- [living room, main room]""")

        dataset_files = [
            set_light_color_yaml, turn_light_on_yaml, color_yaml, room_yaml
        ]
        dataset = Dataset.from_yaml_files("en", dataset_files).json
        dataset = validate_and_format_dataset(dataset)

        # When
        entity_values = extract_entity_values(dataset,
                                              apply_normalization=True)

        # Then
        expected_values = {
            "setLightColor": {
                "blue", "yellow", "cyan", "red", "bedroom", "garage",
                "living room", "main room", "kitchen", "bathroom"
            },
            "turnLightOn": {
                "bedroom", "garage", "living room", "main room", "kitchen",
                "bathroom"
            }
        }
        self.assertDictEqual(expected_values, entity_values)
Example #42
    def test_should_get_none_if_empty_dataset(self):
        # Given
        dataset = validate_and_format_dataset(get_empty_dataset(LANGUAGE_EN))
        classifier = LogRegIntentClassifier().fit(dataset)
        text = "this is a dummy query"

        # When
        intent = classifier.get_intent(text)

        # Then
        expected_intent = None
        self.assertEqual(intent, expected_intent)
Example #43
    def test_should_get_intent_after_deserialization(self):
        # Given
        dataset = validate_and_format_dataset(BEVERAGE_DATASET)
        classifier = LogRegIntentClassifier().fit(dataset)
        classifier_dict = classifier.to_dict()

        # When
        loaded_classifier = LogRegIntentClassifier.from_dict(classifier_dict)
        result = loaded_classifier.get_intent("Make me two cups of tea")

        # Then
        expected_intent = "MakeTea"
        self.assertEqual(expected_intent, result[RES_INTENT_NAME])
Example #44
    def test_intent_classifier_should_get_intent(self):
        # Given
        dataset = validate_and_format_dataset(SAMPLE_DATASET)
        classifier = LogRegIntentClassifier().fit(dataset)
        text = "This is a dummy_3 query from another intent"

        # When
        res = classifier.get_intent(text)
        intent = res[RES_INTENT_NAME]

        # Then
        expected_intent = "dummy_intent_2"

        self.assertEqual(intent, expected_intent)
Example #45
    def test_entity_match_factory(self):
        # Given
        config = {
            "factory_name": "entity_match",
            "args": {
                "tagging_scheme_code": TaggingScheme.BILOU.value,
                "use_stemming": False
            },
            "offsets": [0]
        }

        tokens = tokenize("2 dummy a and dummy_c", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in tokens]
        factory = get_feature_factory(config)
        dataset = deepcopy(SAMPLE_DATASET)
        dataset = validate_and_format_dataset(dataset)
        factory.fit(dataset, "dummy_intent_1")

        # When
        features = factory.build_features()
        features = sorted(features, key=lambda f: f.base_name)
        res0 = features[0].compute(0, cache)
        res1 = features[0].compute(1, cache)
        res2 = features[0].compute(2, cache)
        res3 = features[0].compute(3, cache)
        res4 = features[0].compute(4, cache)

        res5 = features[1].compute(0, cache)
        res6 = features[1].compute(1, cache)
        res7 = features[1].compute(2, cache)
        res8 = features[1].compute(3, cache)
        res9 = features[1].compute(4, cache)

        # Then
        self.assertIsInstance(factory, EntityMatchFactory)
        self.assertEqual(len(features), 2)
        self.assertEqual(features[0].base_name, "entity_match_dummy_entity_1")
        self.assertEqual(features[1].base_name, "entity_match_dummy_entity_2")
        self.assertEqual(res0, BEGINNING_PREFIX)
        self.assertEqual(res1, INSIDE_PREFIX)
        self.assertEqual(res2, LAST_PREFIX)
        self.assertEqual(res3, None)
        self.assertEqual(res4, None)

        self.assertEqual(res5, None)
        self.assertEqual(res6, None)
        self.assertEqual(res7, None)
        self.assertEqual(res8, None)
        self.assertEqual(res9, UNIT_PREFIX)
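The assertions map one-to-one onto the BILOU tagging scheme configured above;
spelled out (prefix names as used in this test):

# the three tokens "2 dummy a" match a dummy_entity_1 value:
#   res0 == BEGINNING_PREFIX  -> first token of the match
#   res1 == INSIDE_PREFIX     -> inner token
#   res2 == LAST_PREFIX       -> final token
# the single token "dummy_c" matches a dummy_entity_2 value:
#   res9 == UNIT_PREFIX       -> one-token match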
Example #46
    def test_should_format_dataset_by_adding_synonyms(
            self, mocked_get_string_variations):
        # Given
        def mock_get_string_variations(variation, language):
            return {variation.lower(), variation.title()}

        mocked_get_string_variations.side_effect = mock_get_string_variations
        dataset = {
            "intents": {},
            "entities": {
                "entity1": {
                    "data": [
                        {
                            "value": "Entity_1",
                            "synonyms": ["entity 2"]
                        }
                    ],
                    "use_synonyms": True,
                    "automatically_extensible": False
                }
            },
            "language": "en",
            "snips_nlu_version": "1.1.1"
        }

        expected_dataset = {
            "intents": {},
            "entities": {
                "entity1": {
                    "utterances": {
                        "Entity_1": "Entity_1",
                        "entity_1": "Entity_1",
                        "entity 2": "Entity_1",
                        "Entity 2": "Entity_1",
                    },
                    "automatically_extensible": False,
                    "capitalize": False
                }
            },
            "language": "en",
            "snips_nlu_version": "1.1.1",
            "validated": True
        }

        # When
        dataset = validate_and_format_dataset(dataset)

        # Then
        self.assertDictEqual(dataset, expected_dataset)
Example #47
    def test_empty_vocabulary_should_fit_and_return_none_intent(
            self, mocked_build_training):
        # Given
        language = LANGUAGE_EN
        dataset = {
            "snips_nlu_version": "0.0.1",
            "entities": {
                "dummy_entity_1": {
                    "automatically_extensible": True,
                    "use_synonyms": False,
                    "data": [
                        {
                            "value": "...",
                            "synonyms": [],
                        }
                    ]
                }
            },
            "intents": {
                "dummy_intent_1": {
                    "utterances": [
                        {
                            "data": [
                                {
                                    "text": "...",
                                    "slot_name": "dummy_slot_name",
                                    "entity": "dummy_entity_1"
                                }
                            ]
                        }
                    ]
                }
            },
            "language": language
        }
        dataset = validate_and_format_dataset(dataset)

        text = " "
        noise_size = 6
        utterance = [text] + [text] * noise_size
        labels = [1] + [None] * noise_size
        intent_list = ["dummy_intent_1", None]
        mocked_build_training.return_value = utterance, labels, intent_list

        # When / Then
        intent_classifier = LogRegIntentClassifier().fit(dataset)
        intent = intent_classifier.get_intent("no intent there")
        self.assertEqual(intent, None)
Example #48
    def test_should_build_training_data_with_no_data(self):
        # Given
        language = LANGUAGE_EN
        dataset = validate_and_format_dataset(get_empty_dataset(language))
        random_state = np.random.RandomState(1)

        # When
        data_augmentation_config = LogRegIntentClassifierConfig() \
            .data_augmentation_config
        utterances, _, intent_mapping = build_training_data(
            dataset, language, data_augmentation_config, random_state)

        # Then
        expected_utterances = []
        expected_intent_mapping = []
        self.assertListEqual(utterances, expected_utterances)
        self.assertListEqual(intent_mapping, expected_intent_mapping)
Example #49
    def fit(self, dataset, intent, verbose=False):
        """Fit the slot filler

        Args:
            dataset (dict): A valid Snips dataset
            intent (str): The specific intent of the dataset to train
                the slot filler on
            verbose (bool, optional): If *True*, it will print the weights
                of the CRF once the training is done

        Returns:
            :class:`CRFSlotFiller`: The same instance, trained
        """
        dataset = validate_and_format_dataset(dataset)
        self.intent = intent
        self.slot_name_mapping = get_slot_name_mapping(dataset, intent)
        self.language = dataset[LANGUAGE]
        random_state = check_random_state(self.config.random_seed)
        augmented_intent_utterances = augment_utterances(
            dataset, self.intent, language=self.language,
            random_state=random_state,
            **self.config.data_augmentation_config.to_dict())

        crf_samples = [
            utterance_to_sample(u[DATA], self.config.tagging_scheme,
                                self.language)
            for u in augmented_intent_utterances]

        for factory in self.features_factories:
            factory.fit(dataset, intent)

        # pylint: disable=C0103
        X = [self.compute_features(sample[TOKENS], drop_out=True)
             for sample in crf_samples]
        # ensure ascii tags
        Y = [[_encode_tag(tag) for tag in sample[TAGS]]
             for sample in crf_samples]
        # pylint: enable=C0103
        self.crf_model = _get_crf_model(self.config.crf_args)
        self.crf_model.fit(X, Y)
        if verbose:
            self.print_weights()

        return self
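
A minimal usage sketch for the fit method above, assuming a valid Snips
dataset fixture such as the BEVERAGE_DATASET used in these tests, and that
CRFSlotFiller exposes a get_slots(text) method (the import path may differ
across snips-nlu versions):

from snips_nlu.slot_filler.crf_slot_filler import CRFSlotFiller

# Train the slot filler on a single intent of the dataset
slot_filler = CRFSlotFiller().fit(BEVERAGE_DATASET, intent="MakeTea")
# Extract slots from a query belonging to that intent
slots = slot_filler.get_slots("Make me two cups of tea")
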
    def test_intent_classifier_should_get_intent_when_filter(self):
        # Given
        dataset = validate_and_format_dataset(BEVERAGE_DATASET)
        classifier = LogRegIntentClassifier().fit(dataset)

        # When
        text1 = "Make me two cups of tea"
        res1 = classifier.get_intent(text1, ["MakeCoffee", "MakeTea"])

        text2 = "Make me two cups of tea"
        res2 = classifier.get_intent(text2, ["MakeCoffee"])

        text3 = "bla bla bla"
        res3 = classifier.get_intent(text3, ["MakeCoffee"])

        # Then
        self.assertEqual("MakeTea", res1[RES_INTENT_NAME])
        self.assertEqual("MakeCoffee", res2[RES_INTENT_NAME])
        self.assertEqual(None, res3)
Example #51
0
    def test_dataset_should_handle_synonyms(
            self, mocked_get_string_variations):
        # Given
        def mock_get_string_variations(variation, language):
            return {variation.lower(), variation.title()}

        mocked_get_string_variations.side_effect = mock_get_string_variations
        dataset = {
            "intents": {},
            "entities": {
                "entity1": {
                    "data": [
                        {
                            "value": "Ëntity 1",
                            "synonyms": ["entity 2"]
                        }
                    ],
                    "use_synonyms": True,
                    "automatically_extensible": True
                }
            },
            "language": "en",
            "snips_nlu_version": "1.1.1"
        }

        # When
        dataset = validate_and_format_dataset(dataset)

        expected_entities = {
            "entity1": {
                AUTOMATICALLY_EXTENSIBLE: True,
                UTTERANCES: {
                    "Ëntity 1": "Ëntity 1",
                    "ëntity 1": "Ëntity 1",
                    "entity 2": "Ëntity 1",
                    "Entity 2": "Ëntity 1",
                },
                CAPITALIZE: False
            }
        }

        # Then
        self.assertDictEqual(dataset[ENTITIES], expected_entities)
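
The utterances mapping built by validate_and_format_dataset associates each
accepted variation with its reference value, which is how synonyms are
resolved downstream. A sketch of such a lookup, reusing the ENTITIES and
UTTERANCES constants from the tests above (resolve_value is a hypothetical
helper, not part of snips-nlu):

def resolve_value(dataset, entity_name, surface_form):
    # variation -> canonical value, e.g. "Entity 2" -> "Ëntity 1"
    utterances = dataset[ENTITIES][entity_name][UTTERANCES]
    return utterances.get(surface_form)

resolve_value(dataset, "entity1", "Entity 2")  # -> "Ëntity 1"
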
    def fit(self, dataset, force_retrain=True):
        """Fit the intent parser with a valid Snips dataset"""
        dataset = validate_and_format_dataset(dataset)
        self.language = dataset[LANGUAGE]
        self.regexes_per_intent = dict()
        self.group_names_to_slot_names = dict()
        joined_entity_utterances = _get_joined_entity_utterances(
            dataset, self.language)
        self.slot_names_to_entities = _get_slot_names_mapping(dataset)
        for intent_name, intent in iteritems(dataset[INTENTS]):
            if not self._is_trainable(intent, dataset):
                self.regexes_per_intent[intent_name] = []
                continue
            utterances = [_preprocess_builtin_entities(u, self.language)
                          for u in intent[UTTERANCES]]
            regexes, self.group_names_to_slot_names = _generate_regexes(
                utterances, joined_entity_utterances,
                self.group_names_to_slot_names, self.language)
            self.regexes_per_intent[intent_name] = regexes
        return self
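
A usage sketch of the parser whose fit method appears above; the class name
(RegexIntentParser) and its get_intent method are assumptions here, since
the snippet shows neither:

parser = RegexIntentParser().fit(dataset)
# Returns an intent classification result, or None if no regex matches
res = parser.get_intent("Make me two cups of tea")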
Example #53
0
    def test_should_add_capitalize_field(
            self, mocked_get_string_variations):
        # Given
        def mock_get_string_variations(variation, language):
            return {variation, variation.title()}

        mocked_get_string_variations.side_effect = mock_get_string_variations
        dataset = {
            "intents": {
                "intent1": {
                    "utterances": [
                        {
                            "data": [
                                {
                                    "text": "My entity1",
                                    "entity": "entity1",
                                    "slot_name": "slot0"
                                },
                                {
                                    "text": "entity1",
                                    "entity": "entity1",
                                    "slot_name": "slot2"
                                },
                                {
                                    "text": "entity1",
                                    "entity": "entity1",
                                    "slot_name": "slot2"
                                },
                                {
                                    "text": "entity1",
                                    "entity": "entity1",
                                    "slot_name": "slot3"
                                },
                                {
                                    "text": "My entity2",
                                    "entity": "entity2",
                                    "slot_name": "slot1"
                                },
                                {
                                    "text": "myentity2",
                                    "entity": "entity2",
                                    "slot_name": "slot1"
                                },
                                {
                                    "text": "m_entity3",
                                    "entity": "entity3",
                                    "slot_name": "slot1"
                                }
                            ]
                        }
                    ]
                }
            },
            "entities": {
                "entity1": {
                    "data": [],
                    "use_synonyms": False,
                    "automatically_extensible": True
                },
                "entity2": {
                    "data": [],
                    "use_synonyms": False,
                    "automatically_extensible": True
                },
                "entity3": {
                    "data": [
                        {
                            "value": "Entity3",
                            "synonyms": ["entity3"]
                        }
                    ],
                    "use_synonyms": False,
                    "automatically_extensible": True
                }
            },
            "language": "en",
            "snips_nlu_version": "0.0.1"
        }

        expected_dataset = {
            "intents": {
                "intent1": {
                    "utterances": [
                        {
                            "data": [
                                {
                                    "text": "My entity1",
                                    "entity": "entity1",
                                    "slot_name": "slot0"
                                },
                                {
                                    "text": "entity1",
                                    "entity": "entity1",
                                    "slot_name": "slot2"
                                },
                                {
                                    "text": "entity1",
                                    "entity": "entity1",
                                    "slot_name": "slot2"
                                },
                                {
                                    "text": "entity1",
                                    "entity": "entity1",
                                    "slot_name": "slot3"
                                },
                                {
                                    "text": "My entity2",
                                    "entity": "entity2",
                                    "slot_name": "slot1"
                                },
                                {
                                    "text": "myentity2",
                                    "entity": "entity2",
                                    "slot_name": "slot1"
                                },
                                {
                                    "text": "m_entity3",
                                    "entity": "entity3",
                                    "slot_name": "slot1"
                                }
                            ]
                        }
                    ]
                }
            },
            "entities": {
                "entity1": {
                    "utterances":
                        {
                            "My entity1": "My entity1",
                            "My Entity1": "My entity1",
                            "entity1": "entity1",
                            "Entity1": "entity1",
                        },
                    "automatically_extensible": True,
                    "capitalize": True
                },
                "entity2": {
                    "utterances": {
                        "My entity2": "My entity2",
                        "My Entity2": "My entity2",
                        "myentity2": "myentity2",
                        "Myentity2": "myentity2"
                    },
                    "automatically_extensible": True,
                    "capitalize": True
                },
                "entity3": {
                    "utterances":
                        {
                            "Entity3": "Entity3",
                            "m_entity3": "m_entity3",
                            "M_Entity3": "m_entity3"
                        },
                    "automatically_extensible": True,
                    "capitalize": False
                }
            },
            "language": "en",
            "snips_nlu_version": "0.0.1",
            "validated": True
        }

        # When
        dataset = validate_and_format_dataset(dataset)

        # Then
        self.assertDictEqual(dataset, expected_dataset)
Example #54
0
    def test_should_remove_empty_entities_value_and_empty_synonyms(
            self, mocked_get_string_variations):
        # Given
        def mock_get_string_variations(variation, language):
            return {variation, variation.title()}

        mocked_get_string_variations.side_effect = mock_get_string_variations
        dataset = {
            "intents": {
                "intent1": {
                    "utterances": [
                        {
                            "data": [
                                {
                                    "text": "this is ",
                                },
                                {
                                    "text": "",
                                    "entity": "entity1",
                                    "slot_name": "slot1"
                                }
                            ]
                        },
                        {
                            "data": [
                                {
                                    "text": "this is ",
                                },
                                {
                                    "text": "entity 1",
                                    "entity": "entity1",
                                    "slot_name": "slot1"
                                }
                            ]
                        }
                    ]
                }
            },
            "entities": {
                "entity1": {
                    "data": [
                        {
                            "value": "entity 1",
                            "synonyms": [""]
                        },
                        {
                            "value": "",
                            "synonyms": []
                        }
                    ],
                    "use_synonyms": False,
                    "automatically_extensible": False
                }
            },
            "language": "en",
            "snips_nlu_version": "0.0.1"
        }

        expected_dataset = {
            "intents": {
                "intent1": {
                    "utterances": [
                        {
                            "data": [
                                {
                                    "text": "this is ",
                                },
                                {
                                    "text": "",
                                    "entity": "entity1",
                                    "slot_name": "slot1"
                                }
                            ]
                        },
                        {
                            "data": [
                                {
                                    "text": "this is ",
                                },
                                {
                                    "text": "entity 1",
                                    "entity": "entity1",
                                    "slot_name": "slot1"
                                }
                            ]
                        }
                    ]
                }
            },
            "entities": {
                "entity1": {
                    "utterances":
                        {
                            "entity 1": "entity 1",
                            "Entity 1": "entity 1",
                        },
                    "capitalize": False,
                    "automatically_extensible": False
                }
            },
            "language": "en",
            "snips_nlu_version": "0.0.1",
            "validated": True
        }

        # When
        dataset = validate_and_format_dataset(dataset)

        # Then
        self.assertDictEqual(dataset, expected_dataset)
Example #55
0
    def test_should_normalize_synonyms(
            self, mocked_get_string_variations):
        # Given
        def mock_get_string_variations(variation, language):
            return {variation.lower(), variation.title()}

        mocked_get_string_variations.side_effect = mock_get_string_variations
        dataset = {
            "intents": {
                "intent1": {
                    "utterances": [
                        {
                            "data": [
                                {
                                    "text": "ëNtity",
                                    "entity": "entity1",
                                    "slot_name": "startTime"
                                }
                            ]
                        }
                    ]
                }
            },
            "entities": {
                "entity1": {
                    "data": [],
                    "use_synonyms": True,
                    "automatically_extensible": True
                }
            },
            "language": "en",
            "snips_nlu_version": "0.1.0"
        }

        expected_dataset = {
            "intents": {
                "intent1": {
                    "utterances": [
                        {
                            "data": [
                                {
                                    "text": "ëNtity",
                                    "entity": "entity1",
                                    "slot_name": "startTime"
                                }
                            ]
                        }
                    ]
                }
            },
            "entities": {
                "entity1": {
                    "utterances": {
                        "ëntity": "ëNtity",
                        "Ëntity": "ëNtity",
                    },
                    "automatically_extensible": True,
                    "capitalize": False
                }
            },
            "language": "en",
            "snips_nlu_version": "0.1.0",
            "validated": True
        }

        # When
        dataset = validate_and_format_dataset(dataset)

        # Then
        self.assertDictEqual(dataset, expected_dataset)
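
Once validated and normalized this way, the dataset can be fed to the
higher-level components shown in the other examples; a minimal end-to-end
sketch, assuming the standard snips-nlu engine API:

from snips_nlu import SnipsNLUEngine

engine = SnipsNLUEngine().fit(dataset)
# parse() runs intent classification and slot filling together
result = engine.parse("Make me two cups of tea")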