def test_should_be_serializable(self, mocked_generate_regexes):
        """A fitted parser should persist ``metadata.json`` and
        ``intent_parser.json`` with exactly the expected JSON content."""
        # Given

        # pylint: disable=unused-argument
        def mock_generate_patterns(utterances, joined_entity_utterances,
                                   group_names_to_slot_names, language):
            # Deterministic stand-in for the real regex generation so the
            # persisted patterns can be asserted exactly below.
            patterns = ["mocked_regex_%s" % i for i in range(len(utterances))]
            group_to_slot = {"group_0": "dummy slot name"}
            return patterns, group_to_slot

        # pylint: enable=unused-argument

        mocked_generate_regexes.side_effect = mock_generate_patterns
        dataset = validate_and_format_dataset(SAMPLE_DATASET)
        config = DeterministicIntentParserConfig(max_queries=42,
                                                 max_pattern_length=100)
        parser = DeterministicIntentParser(config=config).fit(dataset)

        # When
        parser.persist(self.tmp_file_path)

        # Then
        # NOTE(review): pattern counts (4 and 1) presumably mirror the number
        # of utterances per intent in SAMPLE_DATASET — confirm against the
        # fixture if it changes.
        expected_dict = {
            "unit_name": "deterministic_intent_parser",
            "config": {
                "unit_name": "deterministic_intent_parser",
                "max_queries": 42,
                "max_pattern_length": 100
            },
            "language_code": "en",
            "group_names_to_slot_names": {
                "group_0": "dummy slot name"
            },
            "patterns": {
                "dummy_intent_1": [
                    "mocked_regex_0",
                    "mocked_regex_1",
                    "mocked_regex_2",
                    "mocked_regex_3"
                ],
                "dummy_intent_2": [
                    "mocked_regex_0"
                ]
            },
            "slot_names_to_entities": {
                "dummy_intent_1": {
                    "dummy_slot_name": "dummy_entity_1",
                    "dummy_slot_name3": "dummy_entity_2",
                    "dummy_slot_name2": "dummy_entity_2"
                },
                "dummy_intent_2": {
                    "dummy slot nàme": "dummy_entity_1"
                }
            }
        }
        metadata = {"unit_name": "deterministic_intent_parser"}
        self.assertJsonContent(self.tmp_file_path / "metadata.json",
                               metadata)
        self.assertJsonContent(self.tmp_file_path / "intent_parser.json",
                               expected_dict)
    def test_should_limit_nb_queries(self):
        """``max_queries`` should cap the number of regexes built per
        intent: the 3-utterance intent is truncated to 2, the 1-utterance
        intent keeps its single regex."""
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: my_first_intent
utterances:
- this is [slot1:entity1](my first entity)
- this is [slot2:entity2](my second entity)
- this is [slot3:entity3](my third entity)

---
type: intent
name: my_second_intent
utterances:
- this is [slot4:entity4](my fourth entity)""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        config = DeterministicIntentParserConfig(max_queries=2,
                                                 max_pattern_length=1000)

        # When
        parser = DeterministicIntentParser(config=config).fit(dataset)

        # Then
        self.assertEqual(len(parser.regexes_per_intent["my_first_intent"]), 2)
        self.assertEqual(len(parser.regexes_per_intent["my_second_intent"]), 1)
    def test_should_be_serializable_before_fitting(self):
        """An unfitted parser should persist a JSON payload whose fitted
        fields (language, patterns, mappings, whitelist) are all null."""
        # Given
        parser_config = DeterministicIntentParserConfig(
            max_queries=42, max_pattern_length=43, ignore_stop_words=True)
        unfitted_parser = DeterministicIntentParser(config=parser_config)

        # When
        unfitted_parser.persist(self.tmp_file_path)

        # Then
        expected_parser_dict = {
            "config": {
                "unit_name": "deterministic_intent_parser",
                "max_queries": 42,
                "max_pattern_length": 43,
                "ignore_stop_words": True
            },
            "language_code": None,
            "group_names_to_slot_names": None,
            "patterns": None,
            "slot_names_to_entities": None,
            "stop_words_whitelist": None
        }

        expected_metadata = {"unit_name": "deterministic_intent_parser"}
        self.assertJsonContent(
            self.tmp_file_path / "metadata.json", expected_metadata)
        self.assertJsonContent(
            self.tmp_file_path / "intent_parser.json", expected_parser_dict)
    def test_should_be_deserializable_before_fitting_with_whitelist(self):
        """Loading an unfitted parser dict (null stop-words whitelist)
        should yield a parser equal to a freshly configured one."""
        # Given
        serialized_parser = {
            "config": {
                "max_queries": 42,
                "max_pattern_length": 43
            },
            "language_code": None,
            "group_names_to_slot_names": None,
            "patterns": None,
            "slot_names_to_entities": None,
            "stop_words_whitelist": None
        }
        self.tmp_file_path.mkdir()
        self.writeJsonContent(
            self.tmp_file_path / "intent_parser.json", serialized_parser)
        self.writeJsonContent(
            self.tmp_file_path / "metadata.json",
            {"unit_name": "deterministic_intent_parser"})

        # When
        loaded_parser = DeterministicIntentParser.from_path(
            self.tmp_file_path)

        # Then
        expected_parser = DeterministicIntentParser(
            config=DeterministicIntentParserConfig(
                max_queries=42, max_pattern_length=43))
        self.assertEqual(loaded_parser.to_dict(), expected_parser.to_dict())
    def test_should_limit_patterns_length(self):
        """Utterances whose generated regex exceeds ``max_pattern_length``
        should be dropped: only 2 of 3 first-intent patterns and 1 of 3
        second-intent patterns fit within 25 characters."""
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: my_first_intent
utterances:
- how are you
- hello how are you?
- what's up

---
type: intent
name: my_second_intent
utterances:
- what is the weather today ?
- does it rain
- will it rain tomorrow""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        config = DeterministicIntentParserConfig(max_queries=1000,
                                                 max_pattern_length=25,
                                                 ignore_stop_words=False)

        # When
        parser = DeterministicIntentParser(config=config).fit(dataset)

        # Then
        self.assertEqual(2, len(parser.regexes_per_intent["my_first_intent"]))
        self.assertEqual(1, len(parser.regexes_per_intent["my_second_intent"]))
    def test_should_be_deserializable_without_stop_words(self):
        """A serialized parser missing the "stop_words_whitelist" key
        (presumably an older artifact format — confirm against the
        serialization history) should still load, defaulting the whitelist
        to an empty dict."""
        # Given
        parser_dict = {
            "config": {
                "max_queries": 42,
                "max_pattern_length": 43
            },
            "language_code": "en",
            "group_names_to_slot_names": {
                "hello_group": "hello_slot",
                "world_group": "world_slot"
            },
            "patterns": {
                "my_intent":
                ["(?P<hello_group>hello?)", "(?P<world_group>world$)"]
            },
            "slot_names_to_entities": {
                "my_intent": {
                    "hello_slot": "hello_entity",
                    "world_slot": "world_entity"
                }
            }
        }
        self.tmp_file_path.mkdir()
        metadata = {"unit_name": "deterministic_intent_parser"}
        self.writeJsonContent(self.tmp_file_path / "intent_parser.json",
                              parser_dict)
        self.writeJsonContent(self.tmp_file_path / "metadata.json", metadata)

        # When
        parser = DeterministicIntentParser.from_path(self.tmp_file_path)

        # Then
        patterns = {
            "my_intent":
            ["(?P<hello_group>hello?)", "(?P<world_group>world$)"]
        }
        group_names_to_slot_names = {
            "hello_group": "hello_slot",
            "world_group": "world_slot"
        }
        slot_names_to_entities = {
            "my_intent": {
                "hello_slot": "hello_entity",
                "world_slot": "world_entity"
            }
        }
        config = DeterministicIntentParserConfig(max_queries=42,
                                                 max_pattern_length=43)
        expected_parser = DeterministicIntentParser(config=config)
        expected_parser.language = LANGUAGE_EN
        expected_parser.group_names_to_slot_names = group_names_to_slot_names
        expected_parser.slot_names_to_entities = slot_names_to_entities
        expected_parser.patterns = patterns
        # The missing key must be restored as an empty whitelist, not None.
        # pylint:disable=protected-access
        expected_parser._stop_words_whitelist = dict()
        # pylint:enable=protected-access

        self.assertEqual(parser.to_dict(), expected_parser.to_dict())
# Example #7 (snippet separator from the aggregation source; score: 0)
    def test_should_limit_patterns_length(self):
        """With a generous 300-character limit, all SAMPLE_DATASET patterns
        should survive fitting (4 for intent 1, 1 for intent 2)."""
        # Given
        sample_dataset = validate_and_format_dataset(SAMPLE_DATASET)
        parser_config = DeterministicIntentParserConfig(
            max_queries=1000, max_pattern_length=300)

        # When
        fitted_parser = DeterministicIntentParser(
            config=parser_config).fit(sample_dataset)

        # Then
        self.assertEqual(
            4, len(fitted_parser.regexes_per_intent["dummy_intent_1"]))
        self.assertEqual(
            1, len(fitted_parser.regexes_per_intent["dummy_intent_2"]))
# Example #8 (snippet separator from the aggregation source; score: 0)
    def __init__(self, intent_parsers_configs=None):
        """Build the engine config; when no parser configs are given,
        default to a deterministic parser followed by a probabilistic one."""
        if intent_parsers_configs is None:
            # Imported lazily, presumably to avoid a circular import at
            # module load time.
            from snips_nlu.pipeline.configs import (
                ProbabilisticIntentParserConfig,
                DeterministicIntentParserConfig)
            intent_parsers_configs = [
                DeterministicIntentParserConfig(),
                ProbabilisticIntentParserConfig()
            ]
        self.intent_parsers_configs = [
            get_processing_unit_config(parser_config)
            for parser_config in intent_parsers_configs
        ]
    def test_should_be_serializable(self, mocked_generate_regexes):
        """``to_dict`` on a fitted parser should expose config, language,
        group mappings, stringified patterns, and a flat slot-to-entity map
        (this snippet predates the per-intent nesting — note the flat
        "slot_names_to_entities" and the legacy ``max_entities`` option)."""
        # Given

        # pylint: disable=unused-argument
        def mock_generate_regexes(utterances, joined_entity_utterances,
                                  group_names_to_slot_names, language):
            # Compiled regex objects here; the serialized dict below holds
            # their pattern strings.
            regexes = [
                re.compile(r"mocked_regex_%s" % i)
                for i in range(len(utterances))
            ]
            group_to_slot = {"group_0": "dummy slot name"}
            return regexes, group_to_slot

        # pylint: enable=unused-argument

        mocked_generate_regexes.side_effect = mock_generate_regexes
        dataset = validate_and_format_dataset(SAMPLE_DATASET)
        config = DeterministicIntentParserConfig(max_queries=42,
                                                 max_entities=100)
        parser = DeterministicIntentParser(config=config).fit(dataset)

        # When
        actual_dict = parser.to_dict()

        # Then
        expected_dict = {
            "unit_name": "deterministic_intent_parser",
            "config": {
                "unit_name": "deterministic_intent_parser",
                "max_queries": 42,
                "max_entities": 100
            },
            "language_code": "en",
            "group_names_to_slot_names": {
                "group_0": "dummy slot name"
            },
            "patterns": {
                "dummy_intent_1": [
                    "mocked_regex_0", "mocked_regex_1", "mocked_regex_2",
                    "mocked_regex_3"
                ],
                "dummy_intent_2": ["mocked_regex_0"]
            },
            "slot_names_to_entities": {
                "dummy_slot_name": "dummy_entity_1",
                "dummy slot nàme": "dummy_entity_1",
                "dummy_slot_name3": "dummy_entity_2",
                "dummy_slot_name2": "dummy_entity_2"
            }
        }

        self.assertDictEqual(actual_dict, expected_dict)
    def test_should_be_deserializable(self):
        """``from_dict`` should rebuild a parser whose state (language,
        group mappings, patterns, slot-to-entity map) matches a manually
        constructed equivalent; note the flat "slot_names_to_entities"
        of this legacy snippet."""
        # Given
        parser_dict = {
            "config": {
                "max_queries": 42,
                "max_pattern_length": 43
            },
            "language_code": "en",
            "group_names_to_slot_names": {
                "hello_group": "hello_slot",
                "world_group": "world_slot"
            },
            "patterns": {
                "intent_name": [
                    "(?P<hello_group>hello?)",
                    "(?P<world_group>world$)"
                ]
            },
            "slot_names_to_entities": {
                "hello_slot": "hello_entity",
                "world_slot": "world_entity"
            }
        }

        # When
        parser = DeterministicIntentParser.from_dict(parser_dict)

        # Then
        patterns = {
            "intent_name": [
                "(?P<hello_group>hello?)",
                "(?P<world_group>world$)"
            ]
        }
        group_names_to_slot_names = {
            "hello_group": "hello_slot",
            "world_group": "world_slot"
        }
        slot_names_to_entities = {
            "hello_slot": "hello_entity",
            "world_slot": "world_entity"
        }
        config = DeterministicIntentParserConfig(max_queries=42,
                                                 max_pattern_length=43)
        expected_parser = DeterministicIntentParser(config=config)
        expected_parser.language = LANGUAGE_EN
        expected_parser.group_names_to_slot_names = group_names_to_slot_names
        expected_parser.slot_names_to_entities = slot_names_to_entities
        expected_parser.patterns = patterns

        self.assertEqual(parser.to_dict(), expected_parser.to_dict())
# Example #11 (snippet separator from the aggregation source; score: 0)
    def test_deterministic_parser_config(self):
        """Config round-trip: ``from_dict`` followed by ``to_dict`` should
        be the identity on a legacy (``max_entities``) config dict."""
        # Given
        original_dict = {
            "unit_name": "deterministic_intent_parser",
            "max_queries": 666,
            "max_entities": 333
        }

        # When
        parsed_config = DeterministicIntentParserConfig.from_dict(
            original_dict)
        round_tripped = parsed_config.to_dict()

        # Then
        self.assertDictEqual(original_dict, round_tripped)
# Example #12 (snippet separator from the aggregation source; score: 0)
    def __init__(self, intent_parsers_configs=None):
        """Build the engine config; when no parser configs are given,
        default to a deterministic parser followed by a probabilistic one,
        then normalize every entry via ``IntentParser.get_config``."""
        from snips_nlu.intent_parser import IntentParser

        if intent_parsers_configs is None:
            # Imported lazily, presumably to avoid a circular import at
            # module load time.
            from snips_nlu.pipeline.configs import (
                ProbabilisticIntentParserConfig,
                DeterministicIntentParserConfig)
            intent_parsers_configs = [
                DeterministicIntentParserConfig(),
                ProbabilisticIntentParserConfig()
            ]
        self.intent_parsers_configs = list(
            map(IntentParser.get_config, intent_parsers_configs))
    def test_should_not_train_intents_too_big(self):
        """Intents with more utterances than ``max_queries`` should end up
        with an empty regex list, while smaller intents are still fitted."""
        # Given
        sample_dataset = validate_and_format_dataset(SAMPLE_DATASET)
        parser_config = DeterministicIntentParserConfig(
            max_queries=2, max_entities=200)

        # When
        fitted_parser = DeterministicIntentParser(
            config=parser_config).fit(sample_dataset)

        # Then
        skipped_intent = "dummy_intent_1"
        trained_intent = "dummy_intent_2"
        self.assertGreater(
            len(fitted_parser.regexes_per_intent[trained_intent]), 0)
        self.assertListEqual(
            fitted_parser.regexes_per_intent[skipped_intent], [])
    def test_should_parse_stop_words_slots(self):
        """Even with ``ignore_stop_words=True``, tokens that are stop words
        ("this", "that") should still be parsed as slot values when they
        appear in a slot position."""
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: search
utterances:
  - search
  - search [search_object](this)
  - search [search_object](a cat)
  
---
type: entity
name: search_object
values:
  - [this thing, that]
  """)

        # Override the language resources so the stop-word set is known
        # and includes the slot values under test.
        resources = self.get_resources("en")
        resources[STOP_WORDS] = {"a", "this", "that"}
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        parser_config = DeterministicIntentParserConfig(ignore_stop_words=True)
        parser = DeterministicIntentParser(config=parser_config,
                                           resources=resources)
        parser.fit(dataset)

        # When
        res_1 = parser.parse("search this")
        res_2 = parser.parse("search that")

        # Then
        expected_intent = intent_classification_result(intent_name="search",
                                                       probability=1.0)
        expected_slots_1 = [
            unresolved_slot(match_range=(7, 11),
                            value="this",
                            entity="search_object",
                            slot_name="search_object")
        ]
        expected_slots_2 = [
            unresolved_slot(match_range=(7, 11),
                            value="that",
                            entity="search_object",
                            slot_name="search_object")
        ]
        self.assertEqual(expected_intent, res_1[RES_INTENT])
        self.assertEqual(expected_intent, res_2[RES_INTENT])
        self.assertListEqual(expected_slots_1, res_1[RES_SLOTS])
        self.assertListEqual(expected_slots_2, res_2[RES_SLOTS])
# Example #15 (snippet separator from the aggregation source; score: 0)
    def test_deterministic_parser_config(self):
        """Config round-trip: ``from_dict`` followed by ``to_dict`` should
        be the identity, preserving ``ignore_stop_words``."""
        # Given
        original_dict = {
            "unit_name": "deterministic_intent_parser",
            "max_queries": 666,
            "max_pattern_length": 333,
            "ignore_stop_words": True
        }

        # When
        parsed_config = DeterministicIntentParserConfig.from_dict(
            original_dict)
        round_tripped = parsed_config.to_dict()

        # Then
        self.assertDictEqual(original_dict, round_tripped)
# Example #16 (snippet separator from the aggregation source; score: 0)
    def test_nlu_config_from_dict(self):
        """NLUEngineConfig round-trip through from_dict/to_dict should be
        lossless, including the nested parser config dicts."""
        # Given
        original_dict = {
            "unit_name": "nlu_engine",
            "intent_parsers_configs": [
                DeterministicIntentParserConfig().to_dict(),
                ProbabilisticIntentParserConfig().to_dict()
            ]
        }

        # When
        engine_config = NLUEngineConfig.from_dict(original_dict)
        round_tripped = engine_config.to_dict()

        # Then
        self.assertDictEqual(original_dict, round_tripped)
    def test_should_parse_intent_with_stop_words(self, mock_get_stop_words):
        """With ``ignore_stop_words=True``, a query padded with stop words
        ("hey", "a") should still match its intent with probability 1."""
        # Given
        mock_get_stop_words.return_value = {"a", "hey"}
        dataset = self.slots_dataset
        parser_config = DeterministicIntentParserConfig(
            ignore_stop_words=True)
        fitted_parser = DeterministicIntentParser(parser_config).fit(dataset)
        text = ("Hey this is dummy_a query with another dummy_c at 10p.m. or "
                "at 12p.m.")

        # When
        parsing = fitted_parser.parse(text)

        # Then
        expected_intent = intent_classification_result(
            intent_name="dummy_intent_1", probability=1.0)

        self.assertEqual(expected_intent, parsing[RES_INTENT])
    def test_should_be_deserializable_before_fitting(self):
        """``from_dict`` on an unfitted parser dict (all fitted fields null)
        should yield a parser equal to a freshly configured one."""
        # Given
        serialized_parser = {
            "config": {
                "max_queries": 42,
                "max_pattern_length": 43
            },
            "language_code": None,
            "group_names_to_slot_names": None,
            "patterns": None,
            "slot_names_to_entities": None
        }

        # When
        loaded_parser = DeterministicIntentParser.from_dict(serialized_parser)

        # Then
        expected_parser = DeterministicIntentParser(
            config=DeterministicIntentParserConfig(
                max_queries=42, max_pattern_length=43))
        self.assertEqual(loaded_parser.to_dict(), expected_parser.to_dict())
    def test_should_be_serializable_before_fitting(self):
        """``to_dict`` on an unfitted parser should expose the config plus
        null fitted fields."""
        # Given
        parser_config = DeterministicIntentParserConfig(
            max_queries=42, max_pattern_length=43)
        unfitted_parser = DeterministicIntentParser(config=parser_config)

        # When
        serialized = unfitted_parser.to_dict()

        # Then
        expected_dict = {
            "unit_name": "deterministic_intent_parser",
            "config": {
                "unit_name": "deterministic_intent_parser",
                "max_queries": 42,
                "max_pattern_length": 43
            },
            "language_code": None,
            "group_names_to_slot_names": None,
            "patterns": None,
            "slot_names_to_entities": None
        }

        self.assertDictEqual(serialized, expected_dict)
    def test_should_be_serializable(self, mock_get_stop_words):
        """With ``ignore_stop_words=True``, the persisted patterns should
        omit the stop words ("a", "me") from the utterances, and entity
        matches should be serialized as %CITY% placeholders."""
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: searchFlight
slots:
  - name: origin
    entity: city
  - name: destination
    entity: city
utterances:
  - find me a flight from [origin](Paris) to [destination](New York)
  - I need a flight to [destination](Berlin)

---
type: entity
name: city
values:
  - london
  - [new york, big apple]
  - [paris, city of lights]
            """)

        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        mock_get_stop_words.return_value = {"a", "me"}
        config = DeterministicIntentParserConfig(max_queries=42,
                                                 max_pattern_length=100,
                                                 ignore_stop_words=True)
        parser = DeterministicIntentParser(config=config).fit(dataset)

        # When
        parser.persist(self.tmp_file_path)

        # Then
        # Stop words "me" and "a" are absent from the expected patterns.
        expected_dict = {
            "config": {
                "unit_name": "deterministic_intent_parser",
                "max_queries": 42,
                "max_pattern_length": 100,
                "ignore_stop_words": True
            },
            "language_code": "en",
            "group_names_to_slot_names": {
                "group0": "destination",
                "group1": "origin",
            },
            "patterns": {
                "searchFlight": [
                    "^\\s*find\\s*flight\\s*from\\s*(?P<group1>%CITY%)\\s*to"
                    "\\s*(?P<group0>%CITY%)\\s*$",
                    "^\\s*i\\s*need\\s*flight\\s*to\\s*(?P<group0>%CITY%)"
                    "\\s*$",
                ]
            },
            "slot_names_to_entities": {
                "searchFlight": {
                    "destination": "city",
                    "origin": "city",
                }
            },
            "stop_words_whitelist": dict()
        }
        metadata = {"unit_name": "deterministic_intent_parser"}
        self.assertJsonContent(self.tmp_file_path / "metadata.json", metadata)
        self.assertJsonContent(self.tmp_file_path / "intent_parser.json",
                               expected_dict)