コード例 #1
0
    def test_should_fit_and_parse_empty_intent(self):
        # Given
        dataset = {
            "intents": {
                "dummy_intent": {
                    "utterances": [
                        {
                            "data": [
                                {
                                    "text": " "
                                }
                            ]
                        }
                    ]
                }
            },
            "language": "en",
            "entities": dict()
        }

        slot_filler = CRFSlotFiller(**self.get_shared_data(dataset))

        # When
        slot_filler.fit(dataset, "dummy_intent")
        slot_filler.get_slots("ya")
コード例 #2
0
    def test_should_be_deserializable_when_fitted_without_slots(self):
        # Given
        dataset = {
            "language": "en",
            "intents": {
                "intent1": {
                    "utterances": [{
                        "data": [{
                            "text":
                            "This is an utterance without "
                            "slots"
                        }]
                    }]
                }
            },
            "entities": {}
        }

        shared = self.get_shared_data(dataset)
        slot_filler = CRFSlotFiller(**shared)
        slot_filler.fit(dataset, intent="intent1")
        slot_filler.persist(self.tmp_file_path)
        loaded_slot_filler = CRFSlotFiller.from_path(self.tmp_file_path,
                                                     **shared)

        # When
        slots = loaded_slot_filler.get_slots(
            "This is an utterance without slots")

        # Then
        self.assertListEqual([], slots)
コード例 #3
0
    def test_should_compute_features(self):
        # Given
        features_factories = [
            {
                "factory_name": NgramFactory.name,
                "args": {
                    "n": 1,
                    "use_stemming": False,
                    "common_words_gazetteer_name": None
                },
                "offsets": [0],
                "drop_out": 0.3
            },
        ]
        slot_filler_config = CRFSlotFillerConfig(
            feature_factory_configs=features_factories, random_seed=40)
        slot_filler = CRFSlotFiller(slot_filler_config)

        tokens = tokenize("foo hello world bar", LANGUAGE_EN)
        dataset = validate_and_format_dataset(SAMPLE_DATASET)
        slot_filler.fit(dataset, intent="dummy_intent_1")

        # When
        features_with_drop_out = slot_filler.compute_features(tokens, True)

        # Then
        expected_features = [
            {"ngram_1": "foo"},
            {},
            {"ngram_1": "world"},
            {},
        ]
        self.assertListEqual(expected_features, features_with_drop_out)
コード例 #4
0
    def test_should_get_sub_builtin_slots(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: PlanBreak
utterances:
- 'I want to leave from [start:snips/datetime](tomorrow) until 
  [end:snips/datetime](next thursday)'
- find me something from [start](9am) to [end](12pm)
- I need a break from [start](2pm) until [end](4pm)
- Can you suggest something from [start](april 4th) until [end](april 6th) ?
- Book me a trip from [start](this friday) to [end](next tuesday)""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        config = CRFSlotFillerConfig(random_seed=42)
        intent = "PlanBreak"
        slot_filler = CRFSlotFiller(config,
                                    **self.get_shared_data(dataset))
        slot_filler.fit(dataset, intent)

        # When
        slots = slot_filler.get_slots("Find me a plan from 5pm to 6pm")

        # Then
        expected_slots = [
            unresolved_slot(match_range={START: 20, END: 23},
                            value="5pm",
                            entity="snips/datetime",
                            slot_name="start"),
            unresolved_slot(match_range={START: 27, END: 30},
                            value="6pm",
                            entity="snips/datetime",
                            slot_name="end")
        ]
        self.assertListEqual(expected_slots, slots)
コード例 #5
0
    def test_should_get_slots_after_deserialization(self):
        # Given
        dataset = BEVERAGE_DATASET
        config = CRFSlotFillerConfig(random_seed=42)
        intent = "MakeTea"
        slot_filler = CRFSlotFiller(config)
        slot_filler.fit(dataset, intent)
        slot_filler.persist(self.tmp_file_path)

        custom_entity_parser = slot_filler.custom_entity_parser
        builtin_entity_parser = slot_filler.builtin_entity_parser

        deserialized_slot_filler = CRFSlotFiller.from_path(
            self.tmp_file_path,
            custom_entity_parser=custom_entity_parser,
            builtin_entity_parser=builtin_entity_parser)

        # When
        slots = deserialized_slot_filler.get_slots("make me two cups of tea")

        # Then
        expected_slots = [
            unresolved_slot(match_range={
                START: 8,
                END: 11
            },
                            value='two',
                            entity='snips/number',
                            slot_name='number_of_cups')
        ]
        self.assertListEqual(expected_slots, slots)
コード例 #6
0
    def test_should_get_slots_after_deserialization(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me [number_of_cups:snips/number](one) cup of tea
- i want [number_of_cups] cups of tea please
- can you prepare [number_of_cups] cups of tea ?""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        intent = "MakeTea"
        shared = self.get_shared_data(dataset)
        shared[RANDOM_STATE] = 42
        slot_filler = CRFSlotFiller(**shared)
        slot_filler.fit(dataset, intent)
        slot_filler.persist(self.tmp_file_path)

        deserialized_slot_filler = CRFSlotFiller.from_path(
            self.tmp_file_path, **shared)

        # When
        slots = deserialized_slot_filler.get_slots("make me two cups of tea")

        # Then
        expected_slots = [
            unresolved_slot(match_range={
                START: 8,
                END: 11
            },
                            value='two',
                            entity='snips/number',
                            slot_name='number_of_cups')
        ]
        self.assertListEqual(expected_slots, slots)
コード例 #7
0
    def test_should_be_serializable(self):
        # Given
        features_factories = [{
            "factory_name": ShapeNgramFactory.name,
            "args": {
                "n": 1
            },
            "offsets": [0]
        }, {
            "factory_name": IsDigitFactory.name,
            "args": {},
            "offsets": [-1, 0]
        }]
        config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=features_factories)
        dataset = SAMPLE_DATASET

        slot_filler = CRFSlotFiller(config)
        intent = "dummy_intent_1"
        slot_filler.fit(dataset, intent=intent)

        # When
        slot_filler.persist(self.tmp_file_path)

        # Then
        metadata_path = self.tmp_file_path / "metadata.json"
        self.assertJsonContent(metadata_path, {"unit_name": "crf_slot_filler"})

        expected_crf_file = Path(slot_filler.crf_model.modelfile.name).name
        self.assertTrue((self.tmp_file_path / expected_crf_file).exists())

        expected_feature_factories = [{
            "factory_name": ShapeNgramFactory.name,
            "args": {
                "n": 1,
                "language_code": "en"
            },
            "offsets": [0]
        }, {
            "factory_name": IsDigitFactory.name,
            "args": {},
            "offsets": [-1, 0]
        }]
        expected_config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=expected_feature_factories)
        expected_slot_filler_dict = {
            "crf_model_file": expected_crf_file,
            "language_code": "en",
            "config": expected_config.to_dict(),
            "intent": intent,
            "slot_name_mapping": {
                "dummy_slot_name": "dummy_entity_1",
                "dummy_slot_name2": "dummy_entity_2",
                "dummy_slot_name3": "dummy_entity_2",
            }
        }
        slot_filler_path = self.tmp_file_path / "slot_filler.json"
        self.assertJsonContent(slot_filler_path, expected_slot_filler_dict)
コード例 #8
0
    def test_should_get_slots(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me [number_of_cups:snips/number](five) cups of tea
- please I want [number_of_cups](two) cups of tea""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        shared = self.get_shared_data(dataset)
        shared[RANDOM_STATE] = 42
        slot_filler = CRFSlotFiller(**shared)
        intent = "MakeTea"
        slot_filler.fit(dataset, intent)

        # When
        slots = slot_filler.get_slots("make me two cups of tea")

        # Then
        expected_slots = [
            unresolved_slot(match_range={
                START: 8,
                END: 11
            },
                            value='two',
                            entity='snips/number',
                            slot_name='number_of_cups')
        ]
        self.assertListEqual(slots, expected_slots)
コード例 #9
0
    def test_should_not_use_crf_when_dataset_with_no_slots(self):
        # Given
        dataset = {
            "language": "en",
            "intents": {
                "intent1": {
                    "utterances": [{
                        "data": [{
                            "text":
                            "This is an utterance without "
                            "slots"
                        }]
                    }]
                }
            },
            "entities": {}
        }
        slot_filler = CRFSlotFiller(**self.get_shared_data(dataset))
        mock_compute_features = MagicMock()
        slot_filler.compute_features = mock_compute_features

        # When
        slot_filler.fit(dataset, "intent1")
        slots = slot_filler.get_slots("This is an utterance without slots")

        # Then
        mock_compute_features.assert_not_called()
        self.assertListEqual([], slots)
コード例 #10
0
    def test_should_get_builtin_slots(self):
        # Given
        dataset = validate_and_format_dataset(WEATHER_DATASET)
        config = CRFSlotFillerConfig(random_seed=42)
        intent = "SearchWeatherForecast"
        slot_filler = CRFSlotFiller(config)
        slot_filler.fit(dataset, intent)

        # When
        slots = slot_filler.get_slots("Give me the weather at 9p.m. in Paris")

        # Then
        expected_slots = [
            unresolved_slot(match_range={
                START: 20,
                END: 28
            },
                            value='at 9p.m.',
                            entity='snips/datetime',
                            slot_name='datetime'),
            unresolved_slot(match_range={
                START: 32,
                END: 37
            },
                            value='Paris',
                            entity='weather_location',
                            slot_name='location')
        ]
        self.assertListEqual(expected_slots, slots)
コード例 #11
0
    def test_should_get_builtin_slots(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: GetWeather
utterances:
- what is the weather [datetime:snips/datetime](at 9pm)
- what's the weather in [location:weather_location](berlin)
- What's the weather in [location](tokyo) [datetime](this weekend)?
- Can you tell me the weather [datetime] please ?
- what is the weather forecast [datetime] in [location](paris)""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        config = CRFSlotFillerConfig(random_seed=42)
        intent = "GetWeather"
        slot_filler = CRFSlotFiller(config, **self.get_shared_data(dataset))
        slot_filler.fit(dataset, intent)

        # When
        slots = slot_filler.get_slots("Give me the weather at 9pm in Paris")

        # Then
        expected_slots = [
            unresolved_slot(match_range={START: 20, END: 26},
                            value='at 9pm',
                            entity='snips/datetime',
                            slot_name='datetime'),
            unresolved_slot(match_range={START: 30, END: 35},
                            value='Paris',
                            entity='weather_location',
                            slot_name='location')
        ]
        self.assertListEqual(expected_slots, slots)
コード例 #12
0
    def test_should_be_serializable(self, mock_serialize_crf_model):
        # Given
        mock_serialize_crf_model.return_value = "mocked_crf_model_data"
        features_factories = [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1},
                "offsets": [0]
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0]
            }
        ]
        config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=features_factories)
        dataset = validate_and_format_dataset(SAMPLE_DATASET)

        slot_filler = CRFSlotFiller(config)
        intent = "dummy_intent_1"
        slot_filler.fit(dataset, intent=intent)

        # When
        actual_slot_filler_dict = slot_filler.to_dict()

        # Then
        expected_feature_factories = [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1, "language_code": "en"},
                "offsets": [0]
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0]
            }
        ]
        expected_config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=expected_feature_factories)
        expected_slot_filler_dict = {
            "unit_name": "crf_slot_filler",
            "crf_model_data": "mocked_crf_model_data",
            "language_code": "en",
            "config": expected_config.to_dict(),
            "intent": intent,
            "slot_name_mapping": {
                "dummy_slot_name": "dummy_entity_1",
                "dummy_slot_name2": "dummy_entity_2",
                "dummy_slot_name3": "dummy_entity_2",
            }
        }
        self.assertDictEqual(actual_slot_filler_dict,
                             expected_slot_filler_dict)
コード例 #13
0
    def test_should_be_serializable_when_fitted_without_slots(self):
        # Given
        features_factories = [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1},
                "offsets": [0]
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0]
            }
        ]
        config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=features_factories)
        dataset = {
            "language": "en",
            "intents": {
                "intent1": {
                    "utterances": [
                        {
                            "data": [
                                {
                                    "text": "This is an utterance without "
                                            "slots"
                                }
                            ]
                        }
                    ]
                }
            },
            "entities": {}
        }

        slot_filler = CRFSlotFiller(config, **self.get_shared_data(dataset))
        slot_filler.fit(dataset, intent="intent1")

        # When
        slot_filler.persist(self.tmp_file_path)

        # Then
        metadata_path = self.tmp_file_path / "metadata.json"
        self.assertJsonContent(metadata_path, {"unit_name": "crf_slot_filler"})
        self.assertIsNone(slot_filler.crf_model)
コード例 #14
0
    def test_should_compute_features(self):
        # Given
        features_factories = [
            {
                "factory_name": NgramFactory.name,
                "args": {
                    "n": 1,
                    "use_stemming": False,
                    "common_words_gazetteer_name": None
                },
                "offsets": [0],
                "drop_out": 0.3
            },
        ]
        slot_filler_config = CRFSlotFillerConfig(
            feature_factory_configs=features_factories, random_seed=40)

        tokens = tokenize("foo hello world bar", LANGUAGE_EN)
        dataset_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
- this is [slot1:entity1](my first entity)
- this is [slot2:entity2](second_entity)""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        shared = self.get_shared_data(dataset,
                                      CustomEntityParserUsage.WITHOUT_STEMS)
        slot_filler = CRFSlotFiller(slot_filler_config, **shared)
        slot_filler.fit(dataset, intent="my_intent")

        # When
        features_with_drop_out = slot_filler.compute_features(tokens, True)

        # Then
        expected_features = [
            {
                "ngram_1": "foo"
            },
            {},
            {
                "ngram_1": "world"
            },
            {},
        ]
        self.assertListEqual(expected_features, features_with_drop_out)
コード例 #15
0
    def test_should_fit_and_parse_with_non_ascii_tags(self):
        # Given
        inputs = ("string%s" % i for i in range(10))
        utterances = [{
            DATA: [{
                TEXT: string,
                ENTITY: "non_ascìi_entïty",
                SLOT_NAME: "non_ascìi_slöt"
            }]
        } for string in inputs]

        # When
        naughty_dataset = {
            "intents": {
                "naughty_intent": {
                    "utterances": utterances
                }
            },
            "entities": {
                "non_ascìi_entïty": {
                    "use_synonyms": False,
                    "automatically_extensible": True,
                    "data": []
                }
            },
            "language": "en",
            "snips_nlu_version": "0.0.1"
        }

        naughty_dataset = validate_and_format_dataset(naughty_dataset)

        # Then
        with self.fail_if_exception("Naughty string make NLU crash"):
            slot_filler = CRFSlotFiller()
            slot_filler.fit(naughty_dataset, "naughty_intent")
            slots = slot_filler.get_slots("string0")
            expected_slot = {
                "entity": "non_ascìi_entïty",
                "range": {
                    "start": 0,
                    "end": 7
                },
                "slotName": u"non_ascìi_slöt",
                "value": u"string0"
            }
            self.assertListEqual([expected_slot], slots)
コード例 #16
0
    def test_should_get_slots(self):
        # Given
        dataset = validate_and_format_dataset(BEVERAGE_DATASET)
        config = CRFSlotFillerConfig(random_seed=42)
        intent = "MakeTea"
        slot_filler = CRFSlotFiller(config)
        slot_filler.fit(dataset, intent)

        # When
        slots = slot_filler.get_slots("make me two cups of tea")

        # Then
        expected_slots = [
            unresolved_slot(match_range={START: 8, END: 11},
                            value='two',
                            entity='snips/number',
                            slot_name='number_of_cups')]
        self.assertListEqual(slots, expected_slots)
コード例 #17
0
    def test_refit(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
- this is [entity1](my first entity)""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        updated_dataset_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
- this is [entity1](my first entity)
- this is [entity1](my first entity) again""")
        updated_dataset = Dataset.from_yaml_files(
            "en", [updated_dataset_stream]).json

        config = CRFSlotFillerConfig(feature_factory_configs=[
            {
                "args": {
                    "common_words_gazetteer_name": "top_10000_words_stemmed",
                    "use_stemming": True,
                    "n": 1
                },
                "factory_name": "ngram",
                "offsets": [-2, -1, 0, 1, 2]
            },
        ])

        # When
        slot_filler = CRFSlotFiller(config).fit(dataset, "my_intent")

        # Then
        slot_filler.fit(updated_dataset, "my_intent")
コード例 #18
0
    def test_training_should_be_reproducible(self):
        # Given
        random_state = 42
        dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me a [beverage_temperature:Temperature](hot) cup of tea
- make me [number_of_cups:snips/number](five) tea cups""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        # When
        slot_filler1 = CRFSlotFiller(random_state=random_state)
        slot_filler1.fit(dataset, "MakeTea")

        slot_filler2 = CRFSlotFiller(random_state=random_state)
        slot_filler2.fit(dataset, "MakeTea")

        # Then
        self.assertDictEqual(slot_filler1.crf_model.state_features_,
                             slot_filler2.crf_model.state_features_)
        self.assertDictEqual(slot_filler1.crf_model.transition_features_,
                             slot_filler2.crf_model.transition_features_)
コード例 #19
0
    def test_should_be_serializable(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
- this is [slot1:entity1](my first entity)
- this is [slot2:entity2](second_entity)""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        features_factories = [{
            "factory_name": ShapeNgramFactory.name,
            "args": {
                "n": 1
            },
            "offsets": [0]
        }, {
            "factory_name": IsDigitFactory.name,
            "args": {},
            "offsets": [-1, 0]
        }]
        config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=features_factories)
        shared = self.get_shared_data(dataset)
        slot_filler = CRFSlotFiller(config, **shared)
        intent = "my_intent"
        slot_filler.fit(dataset, intent=intent)

        # When
        slot_filler.persist(self.tmp_file_path)

        # Then
        metadata_path = self.tmp_file_path / "metadata.json"
        self.assertJsonContent(metadata_path, {"unit_name": "crf_slot_filler"})

        self.assertTrue((self.tmp_file_path / CRF_MODEL_FILENAME).exists())

        expected_feature_factories = [{
            "factory_name": ShapeNgramFactory.name,
            "args": {
                "n": 1,
                "language_code": "en"
            },
            "offsets": [0]
        }, {
            "factory_name": IsDigitFactory.name,
            "args": {},
            "offsets": [-1, 0]
        }]
        expected_config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=expected_feature_factories)
        expected_slot_filler_dict = {
            "crf_model_file": CRF_MODEL_FILENAME,
            "language_code": "en",
            "config": expected_config.to_dict(),
            "intent": intent,
            "slot_name_mapping": {
                "slot1": "entity1",
                "slot2": "entity2",
            }
        }
        slot_filler_path = self.tmp_file_path / "slot_filler.json"
        self.assertJsonContent(slot_filler_path, expected_slot_filler_dict)