def test_should_be_serializable_before_fit(self):
        # Given
        features_factories = [{
            "factory_name": ShapeNgramFactory.name,
            "args": {
                "n": 1
            },
            "offsets": [0]
        }, {
            "factory_name": IsDigitFactory.name,
            "args": {},
            "offsets": [-1, 0]
        }]
        config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=features_factories)

        slot_filler = CRFSlotFiller(config)

        # When
        slot_filler.persist(self.tmp_file_path)

        # Then
        metadata_path = self.tmp_file_path / "metadata.json"
        self.assertJsonContent(metadata_path, {"unit_name": "crf_slot_filler"})

        expected_slot_filler_dict = {
            "crf_model_file": None,
            "language_code": None,
            "config": config.to_dict(),
            "intent": None,
            "slot_name_mapping": None,
        }
        slot_filler_path = self.tmp_file_path / "slot_filler.json"
        self.assertJsonContent(slot_filler_path, expected_slot_filler_dict)
    def test_should_be_serializable_before_fit(self):
        # Given
        features_factories = [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1},
                "offsets": [0]
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0]
            }
        ]
        config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=features_factories)

        slot_filler = CRFSlotFiller(config)

        # When
        actual_slot_filler_dict = slot_filler.to_dict()

        # Then
        expected_slot_filler_dict = {
            "unit_name": "crf_slot_filler",
            "crf_model_data": None,
            "language_code": None,
            "config": config.to_dict(),
            "intent": None,
            "slot_name_mapping": None,
        }
        self.assertDictEqual(actual_slot_filler_dict,
                             expected_slot_filler_dict)
Beispiel #3
0
    def test_should_be_serializable(self):
        # Given
        features_factories = [{
            "factory_name": ShapeNgramFactory.name,
            "args": {
                "n": 1
            },
            "offsets": [0]
        }, {
            "factory_name": IsDigitFactory.name,
            "args": {},
            "offsets": [-1, 0]
        }]
        config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=features_factories)
        dataset = SAMPLE_DATASET

        slot_filler = CRFSlotFiller(config)
        intent = "dummy_intent_1"
        slot_filler.fit(dataset, intent=intent)

        # When
        slot_filler.persist(self.tmp_file_path)

        # Then
        metadata_path = self.tmp_file_path / "metadata.json"
        self.assertJsonContent(metadata_path, {"unit_name": "crf_slot_filler"})

        expected_crf_file = Path(slot_filler.crf_model.modelfile.name).name
        self.assertTrue((self.tmp_file_path / expected_crf_file).exists())

        expected_feature_factories = [{
            "factory_name": ShapeNgramFactory.name,
            "args": {
                "n": 1,
                "language_code": "en"
            },
            "offsets": [0]
        }, {
            "factory_name": IsDigitFactory.name,
            "args": {},
            "offsets": [-1, 0]
        }]
        expected_config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=expected_feature_factories)
        expected_slot_filler_dict = {
            "crf_model_file": expected_crf_file,
            "language_code": "en",
            "config": expected_config.to_dict(),
            "intent": intent,
            "slot_name_mapping": {
                "dummy_slot_name": "dummy_entity_1",
                "dummy_slot_name2": "dummy_entity_2",
                "dummy_slot_name3": "dummy_entity_2",
            }
        }
        slot_filler_path = self.tmp_file_path / "slot_filler.json"
        self.assertJsonContent(slot_filler_path, expected_slot_filler_dict)
    def test_should_be_serializable(self, mock_serialize_crf_model):
        # Given
        mock_serialize_crf_model.return_value = "mocked_crf_model_data"
        features_factories = [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1},
                "offsets": [0]
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0]
            }
        ]
        config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=features_factories)
        dataset = validate_and_format_dataset(SAMPLE_DATASET)

        slot_filler = CRFSlotFiller(config)
        intent = "dummy_intent_1"
        slot_filler.fit(dataset, intent=intent)

        # When
        actual_slot_filler_dict = slot_filler.to_dict()

        # Then
        expected_feature_factories = [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1, "language_code": "en"},
                "offsets": [0]
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0]
            }
        ]
        expected_config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=expected_feature_factories)
        expected_slot_filler_dict = {
            "unit_name": "crf_slot_filler",
            "crf_model_data": "mocked_crf_model_data",
            "language_code": "en",
            "config": expected_config.to_dict(),
            "intent": intent,
            "slot_name_mapping": {
                "dummy_slot_name": "dummy_entity_1",
                "dummy_slot_name2": "dummy_entity_2",
                "dummy_slot_name3": "dummy_entity_2",
            }
        }
        self.assertDictEqual(actual_slot_filler_dict,
                             expected_slot_filler_dict)
    def test_should_get_builtin_slots(self):
        # Given
        dataset = validate_and_format_dataset(WEATHER_DATASET)
        config = CRFSlotFillerConfig(random_seed=42)
        intent = "SearchWeatherForecast"
        slot_filler = CRFSlotFiller(config)
        slot_filler.fit(dataset, intent)

        # When
        slots = slot_filler.get_slots("Give me the weather at 9p.m. in Paris")

        # Then
        expected_slots = [
            unresolved_slot(match_range={
                START: 20,
                END: 28
            },
                            value='at 9p.m.',
                            entity='snips/datetime',
                            slot_name='datetime'),
            unresolved_slot(match_range={
                START: 32,
                END: 37
            },
                            value='Paris',
                            entity='weather_location',
                            slot_name='location')
        ]
        self.assertListEqual(expected_slots, slots)
Beispiel #6
0
    def test_should_get_builtin_slots(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: GetWeather
utterances:
- what is the weather [datetime:snips/datetime](at 9pm)
- what's the weather in [location:weather_location](berlin)
- What's the weather in [location](tokyo) [datetime](this weekend)?
- Can you tell me the weather [datetime] please ?
- what is the weather forecast [datetime] in [location](paris)""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        config = CRFSlotFillerConfig(random_seed=42)
        intent = "GetWeather"
        slot_filler = CRFSlotFiller(config, **self.get_shared_data(dataset))
        slot_filler.fit(dataset, intent)

        # When
        slots = slot_filler.get_slots("Give me the weather at 9pm in Paris")

        # Then
        expected_slots = [
            unresolved_slot(match_range={START: 20, END: 26},
                            value='at 9pm',
                            entity='snips/datetime',
                            slot_name='datetime'),
            unresolved_slot(match_range={START: 30, END: 35},
                            value='Paris',
                            entity='weather_location',
                            slot_name='location')
        ]
        self.assertListEqual(expected_slots, slots)
Beispiel #7
0
    def test_should_get_slots_after_deserialization(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me [number_of_cups:snips/number](one) cup of tea
- i want [number_of_cups] cups of tea please
- can you prepare [number_of_cups] cups of tea ?""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        config = CRFSlotFillerConfig(random_seed=42)
        intent = "MakeTea"
        shared = self.get_shared_data(dataset)
        slot_filler = CRFSlotFiller(config, **shared)
        slot_filler.fit(dataset, intent)
        slot_filler.persist(self.tmp_file_path)

        deserialized_slot_filler = CRFSlotFiller.from_path(
            self.tmp_file_path, **shared)

        # When
        slots = deserialized_slot_filler.get_slots("make me two cups of tea")

        # Then
        expected_slots = [
            unresolved_slot(match_range={START: 8, END: 11},
                            value='two',
                            entity='snips/number',
                            slot_name='number_of_cups')]
        self.assertListEqual(expected_slots, slots)
Beispiel #8
0
    def test_should_get_slots(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me [number_of_cups:snips/number](five) cups of tea
- please I want [number_of_cups](two) cups of tea""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        config = CRFSlotFillerConfig(random_seed=42)
        shared = self.get_shared_data(dataset)
        slot_filler = CRFSlotFiller(config, **shared)
        intent = "MakeTea"
        slot_filler.fit(dataset, intent)

        # When
        slots = slot_filler.get_slots("make me two cups of tea")

        # Then
        expected_slots = [
            unresolved_slot(match_range={START: 8, END: 11},
                            value='two',
                            entity='snips/number',
                            slot_name='number_of_cups')]
        self.assertListEqual(slots, expected_slots)
    def test_should_compute_features(self):
        # Given
        features_factories = [
            {
                "factory_name": NgramFactory.name,
                "args": {
                    "n": 1,
                    "use_stemming": False,
                    "common_words_gazetteer_name": None
                },
                "offsets": [0],
                "drop_out": 0.3
            },
        ]
        slot_filler_config = CRFSlotFillerConfig(
            feature_factory_configs=features_factories, random_seed=40)
        slot_filler = CRFSlotFiller(slot_filler_config)

        tokens = tokenize("foo hello world bar", LANGUAGE_EN)
        dataset = validate_and_format_dataset(SAMPLE_DATASET)
        slot_filler.fit(dataset, intent="dummy_intent_1")

        # When
        features_with_drop_out = slot_filler.compute_features(tokens, True)

        # Then
        expected_features = [
            {"ngram_1": "foo"},
            {},
            {"ngram_1": "world"},
            {},
        ]
        self.assertListEqual(expected_features, features_with_drop_out)
Beispiel #10
0
    def test_fitting_should_be_reproducible_after_serialization(self):
        # Given
        dataset = BEVERAGE_DATASET
        validated_dataset = validate_and_format_dataset(dataset)

        seed1 = 666
        seed2 = 42
        config = ProbabilisticIntentParserConfig(
            intent_classifier_config=LogRegIntentClassifierConfig(
                random_seed=seed1),
            slot_filler_config=CRFSlotFillerConfig(random_seed=seed2))
        parser = ProbabilisticIntentParser(config)
        parser_dict = parser.to_dict()

        # When
        fitted_parser_1 = ProbabilisticIntentParser.from_dict(parser_dict).fit(
            validated_dataset)

        fitted_parser_2 = ProbabilisticIntentParser.from_dict(parser_dict).fit(
            validated_dataset)

        # Then
        feature_weights_1 = fitted_parser_1.slot_fillers[
            "MakeTea"].crf_model.state_features_
        feature_weights_2 = fitted_parser_2.slot_fillers[
            "MakeTea"].crf_model.state_features_
        self.assertEqual(feature_weights_1, feature_weights_2)
Beispiel #11
0
    def test_should_get_slots_after_deserialization(self):
        # Given
        dataset = BEVERAGE_DATASET
        config = CRFSlotFillerConfig(random_seed=42)
        intent = "MakeTea"
        slot_filler = CRFSlotFiller(config)
        slot_filler.fit(dataset, intent)
        slot_filler.persist(self.tmp_file_path)

        custom_entity_parser = slot_filler.custom_entity_parser
        builtin_entity_parser = slot_filler.builtin_entity_parser

        deserialized_slot_filler = CRFSlotFiller.from_path(
            self.tmp_file_path,
            custom_entity_parser=custom_entity_parser,
            builtin_entity_parser=builtin_entity_parser)

        # When
        slots = deserialized_slot_filler.get_slots("make me two cups of tea")

        # Then
        expected_slots = [
            unresolved_slot(match_range={
                START: 8,
                END: 11
            },
                            value='two',
                            entity='snips/number',
                            slot_name='number_of_cups')
        ]
        self.assertListEqual(expected_slots, slots)
Beispiel #12
0
    def test_should_get_sub_builtin_slots(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: PlanBreak
utterances:
- 'I want to leave from [start:snips/datetime](tomorrow) until 
  [end:snips/datetime](next thursday)'
- find me something from [start](9am) to [end](12pm)
- I need a break from [start](2pm) until [end](4pm)
- Can you suggest something from [start](april 4th) until [end](april 6th) ?
- Book me a trip from [start](this friday) to [end](next tuesday)""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        config = CRFSlotFillerConfig(random_seed=42)
        intent = "PlanBreak"
        slot_filler = CRFSlotFiller(config,
                                    **self.get_shared_data(dataset))
        slot_filler.fit(dataset, intent)

        # When
        slots = slot_filler.get_slots("Find me a plan from 5pm to 6pm")

        # Then
        expected_slots = [
            unresolved_slot(match_range={START: 20, END: 23},
                            value="5pm",
                            entity="snips/datetime",
                            slot_name="start"),
            unresolved_slot(match_range={START: 27, END: 30},
                            value="6pm",
                            entity="snips/datetime",
                            slot_name="end")
        ]
        self.assertListEqual(expected_slots, slots)
Beispiel #13
0
 def __init__(self, intent_classifier_config=None, slot_filler_config=None):
     if intent_classifier_config is None:
         from snips_nlu.pipeline.configs import LogRegIntentClassifierConfig
         intent_classifier_config = LogRegIntentClassifierConfig()
     if slot_filler_config is None:
         from snips_nlu.pipeline.configs import CRFSlotFillerConfig
         slot_filler_config = CRFSlotFillerConfig()
     self.intent_classifier_config = get_processing_unit_config(
         intent_classifier_config)
     self.slot_filler_config = get_processing_unit_config(
         slot_filler_config)
Beispiel #14
0
    def test_should_be_deserializable_before_fit(self):
        # Given
        features_factories = [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1},
                "offsets": [0]
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0]
            }
        ]
        slot_filler_config = CRFSlotFillerConfig(
            feature_factory_configs=features_factories)
        slot_filler_dict = {
            "unit_name": "crf_slot_filler",
            "crf_model_file": None,
            "language_code": None,
            "intent": None,
            "slot_name_mapping": None,
            "config": slot_filler_config.to_dict()
        }
        metadata = {"unit_name": "crf_slot_filler"}
        self.tmp_file_path.mkdir()
        self.writeJsonContent(self.tmp_file_path / "metadata.json", metadata)
        self.writeJsonContent(self.tmp_file_path / "slot_filler.json",
                              slot_filler_dict)

        # When
        slot_filler = CRFSlotFiller.from_path(self.tmp_file_path)

        # Then
        expected_features_factories = [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1},
                "offsets": [0]
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0]
            }
        ]
        expected_language = None
        expected_config = CRFSlotFillerConfig(
            feature_factory_configs=expected_features_factories)
        expected_intent = None
        expected_slot_name_mapping = None
        expected_crf_model = None

        self.assertEqual(slot_filler.crf_model, expected_crf_model)
        self.assertEqual(slot_filler.language, expected_language)
        self.assertEqual(slot_filler.intent, expected_intent)
        self.assertEqual(slot_filler.slot_name_mapping,
                         expected_slot_name_mapping)
        self.assertDictEqual(expected_config.to_dict(),
                             slot_filler.config.to_dict())
    def test_should_be_deserializable_before_fit(self,
                                                 mock_deserialize_crf_model):
        # Given
        mock_deserialize_crf_model.return_value = None
        features_factories = [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1},
                "offsets": [0]
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0]
            }
        ]
        slot_filler_config = CRFSlotFillerConfig(
            feature_factory_configs=features_factories)
        slot_filler_dict = {
            "unit_name": "crf_slot_filler",
            "crf_model_data": None,
            "language_code": None,
            "intent": None,
            "slot_name_mapping": None,
            "config": slot_filler_config.to_dict()
        }

        # When
        slot_filler = CRFSlotFiller.from_dict(slot_filler_dict)

        # Then
        expected_features_factories = [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1},
                "offsets": [0]
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0]
            }
        ]
        expected_language = None
        expected_config = CRFSlotFillerConfig(
            feature_factory_configs=expected_features_factories)
        expected_intent = None
        expected_slot_name_mapping = None
        expected_crf_model = None

        self.assertEqual(slot_filler.crf_model, expected_crf_model)
        self.assertEqual(slot_filler.language, expected_language)
        self.assertEqual(slot_filler.intent, expected_intent)
        self.assertEqual(slot_filler.slot_name_mapping,
                         expected_slot_name_mapping)
        self.assertDictEqual(expected_config.to_dict(),
                             slot_filler.config.to_dict())
Beispiel #16
0
    def test_probabilistic_intent_parser_config(self):
        # Given
        config_dict = {
            "unit_name": "probabilistic_intent_parser",
            "intent_classifier_config":
            LogRegIntentClassifierConfig().to_dict(),
            "slot_filler_config": CRFSlotFillerConfig().to_dict(),
        }

        # When
        config = ProbabilisticIntentParserConfig.from_dict(config_dict)
        serialized_config = config.to_dict()

        # Then
        self.assertDictEqual(config_dict, serialized_config)
Beispiel #17
0
    def test_should_compute_features(self):
        # Given
        features_factories = [
            {
                "factory_name": NgramFactory.name,
                "args": {
                    "n": 1,
                    "use_stemming": False,
                    "common_words_gazetteer_name": None
                },
                "offsets": [0],
                "drop_out": 0.3
            },
        ]
        slot_filler_config = CRFSlotFillerConfig(
            feature_factory_configs=features_factories, random_seed=40)

        tokens = tokenize("foo hello world bar", LANGUAGE_EN)
        dataset_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
- this is [slot1:entity1](my first entity)
- this is [slot2:entity2](second_entity)""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        shared = self.get_shared_data(dataset,
                                      CustomEntityParserUsage.WITHOUT_STEMS)
        slot_filler = CRFSlotFiller(slot_filler_config, **shared)
        slot_filler.fit(dataset, intent="my_intent")

        # When
        features_with_drop_out = slot_filler.compute_features(tokens, True)

        # Then
        expected_features = [
            {
                "ngram_1": "foo"
            },
            {},
            {
                "ngram_1": "world"
            },
            {},
        ]
        self.assertListEqual(expected_features, features_with_drop_out)
Beispiel #18
0
    def test_should_be_serializable_when_fitted_without_slots(self):
        # Given
        features_factories = [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1},
                "offsets": [0]
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0]
            }
        ]
        config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=features_factories)
        dataset = {
            "language": "en",
            "intents": {
                "intent1": {
                    "utterances": [
                        {
                            "data": [
                                {
                                    "text": "This is an utterance without "
                                            "slots"
                                }
                            ]
                        }
                    ]
                }
            },
            "entities": {}
        }

        slot_filler = CRFSlotFiller(config, **self.get_shared_data(dataset))
        slot_filler.fit(dataset, intent="intent1")

        # When
        slot_filler.persist(self.tmp_file_path)

        # Then
        metadata_path = self.tmp_file_path / "metadata.json"
        self.assertJsonContent(metadata_path, {"unit_name": "crf_slot_filler"})
        self.assertIsNone(slot_filler.crf_model)
    def test_should_get_slots(self):
        # Given
        dataset = validate_and_format_dataset(BEVERAGE_DATASET)
        config = CRFSlotFillerConfig(random_seed=42)
        intent = "MakeTea"
        slot_filler = CRFSlotFiller(config)
        slot_filler.fit(dataset, intent)

        # When
        slots = slot_filler.get_slots("make me two cups of tea")

        # Then
        expected_slots = [
            unresolved_slot(match_range={START: 8, END: 11},
                            value='two',
                            entity='snips/number',
                            slot_name='number_of_cups')]
        self.assertListEqual(slots, expected_slots)
Beispiel #20
0
    def test_fitting_should_be_reproducible_after_serialization(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me a [beverage_temperature:Temperature](hot) cup of tea
- make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
- make me [number_of_cups:snips/number](one) cup of coffee please
- brew [number_of_cups] cups of coffee""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        seed1 = 666
        seed2 = 42
        config = ProbabilisticIntentParserConfig(
            intent_classifier_config=LogRegIntentClassifierConfig(
                random_seed=seed1),
            slot_filler_config=CRFSlotFillerConfig(random_seed=seed2))
        shared = self.get_shared_data(dataset)
        parser = ProbabilisticIntentParser(config, **shared)
        parser.persist(self.tmp_file_path)

        # When
        fitted_parser_1 = ProbabilisticIntentParser.from_path(
            self.tmp_file_path, **shared).fit(dataset)

        fitted_parser_2 = ProbabilisticIntentParser.from_path(
            self.tmp_file_path, **shared).fit(dataset)

        # Then
        feature_weights_1 = fitted_parser_1.slot_fillers[
            "MakeTea"].crf_model.state_features_
        feature_weights_2 = fitted_parser_2.slot_fillers[
            "MakeTea"].crf_model.state_features_
        self.assertEqual(feature_weights_1, feature_weights_2)
Beispiel #21
0
    def test_crf_slot_filler_config(self):
        # Given
        feature_factories = [{
            "args": {
                "common_words_gazetteer_name": None,
                "use_stemming": True,
                "n": 1
            },
            "factory_name": "get_ngram_fn",
            "offsets": [-2, -1, 0, 1, 2]
        }, {
            "args": {},
            "factory_name": "is_digit",
            "offsets": [-1, 0, 1]
        }]
        config_dict = {
            "unit_name":
            "crf_slot_filler",
            "feature_factory_configs":
            feature_factories,
            "tagging_scheme":
            2,
            "crf_args": {
                "c1": .2,
                "c2": .3,
                "algorithm": "lbfgs"
            },
            "exhaustive_permutations_threshold":
            42,
            "data_augmentation_config":
            SlotFillerDataAugmentationConfig().to_dict(),
            "random_seed":
            43
        }

        # When
        config = CRFSlotFillerConfig.from_dict(config_dict)
        serialized_config = config.to_dict()

        # Then
        self.assertDictEqual(config_dict, serialized_config)
Beispiel #22
0
    def test_should_be_serializable_before_fitting(self):
        # Given
        parser = ProbabilisticIntentParser()

        # When
        parser.persist(self.tmp_file_path)

        # Then
        expected_parser_dict = {
            "config": {
                "unit_name": "probabilistic_intent_parser",
                "slot_filler_config": CRFSlotFillerConfig().to_dict(),
                "intent_classifier_config":
                    LogRegIntentClassifierConfig().to_dict()
            },
            "slot_fillers": []
        }
        metadata = {"unit_name": "probabilistic_intent_parser"}
        self.assertJsonContent(self.tmp_file_path / "metadata.json", metadata)
        self.assertJsonContent(self.tmp_file_path / "intent_parser.json",
                               expected_parser_dict)
Beispiel #23
0
    def test_should_parse_top_intents(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - "[entity1](foo) bar"

---
type: intent
name: intent2
utterances:
  - foo bar [entity2](baz)

---
type: intent
name: intent3
utterances:
  - foz for [entity3](baz)""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        classifier_config = LogRegIntentClassifierConfig(random_seed=42)
        slot_filler_config = CRFSlotFillerConfig(random_seed=42)
        parser_config = ProbabilisticIntentParserConfig(
            classifier_config, slot_filler_config)
        parser = ProbabilisticIntentParser(parser_config)
        parser.fit(dataset)
        text = "foo bar baz"

        # When
        results = parser.parse(text, top_n=2)
        intents = [res[RES_INTENT][RES_INTENT_NAME] for res in results]
        entities = [[s[RES_VALUE] for s in res[RES_SLOTS]] for res in results]

        # Then
        expected_intents = ["intent2", "intent1"]
        expected_entities = [["baz"], ["foo"]]

        self.assertListEqual(expected_intents, intents)
        self.assertListEqual(expected_entities, entities)
Beispiel #24
0
    def test_should_be_serializable_before_fitting(self):
        # Given
        parser = ProbabilisticIntentParser()

        # When
        actual_parser_dict = parser.to_dict()

        # Then
        expected_parser_dict = {
            "unit_name": "probabilistic_intent_parser",
            "config": {
                "unit_name":
                "probabilistic_intent_parser",
                "slot_filler_config":
                CRFSlotFillerConfig().to_dict(),
                "intent_classifier_config":
                LogRegIntentClassifierConfig().to_dict()
            },
            "intent_classifier": None,
            "slot_fillers": dict(),
        }
        self.assertDictEqual(actual_parser_dict, expected_parser_dict)
    def test_refit(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
- this is [entity1](my first entity)""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        updated_dataset_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
- this is [entity1](my first entity)
- this is [entity1](my first entity) again""")
        updated_dataset = Dataset.from_yaml_files(
            "en", [updated_dataset_stream]).json

        config = CRFSlotFillerConfig(feature_factory_configs=[
            {
                "args": {
                    "common_words_gazetteer_name": "top_10000_words_stemmed",
                    "use_stemming": True,
                    "n": 1
                },
                "factory_name": "ngram",
                "offsets": [-2, -1, 0, 1, 2]
            },
        ])

        # When
        slot_filler = CRFSlotFiller(config).fit(dataset, "my_intent")

        # Then
        slot_filler.fit(updated_dataset, "my_intent")
Beispiel #26
0
    def test_should_parse_with_filter(self):
        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - "[slot1:entity1](foo) bar"

---
type: intent
name: intent2
utterances:
  - foo bar [slot2:entity2](baz)

---
type: intent
name: intent3
utterances:
  - foz for [slot3:entity3](baz)""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        classifier_config = LogRegIntentClassifierConfig(random_seed=42)
        slot_filler_config = CRFSlotFillerConfig(random_seed=42)
        parser_config = ProbabilisticIntentParserConfig(
            classifier_config, slot_filler_config)
        parser = ProbabilisticIntentParser(parser_config)
        parser.fit(dataset)
        text = "foo bar baz"

        # When
        result = parser.parse(text, intents=["intent1", "intent3"])

        # Then
        expected_slots = [unresolved_slot((0, 3), "foo", "entity1", "slot1")]

        self.assertEqual("intent1", result[RES_INTENT][RES_INTENT_NAME])
        self.assertEqual(expected_slots, result[RES_SLOTS])
    def test_should_be_deserializable(self):
        # Given
        language = LANGUAGE_EN
        feature_factories = [{
            "factory_name": ShapeNgramFactory.name,
            "args": {
                "n": 1,
                "language_code": language
            },
            "offsets": [0]
        }, {
            "factory_name": IsDigitFactory.name,
            "args": {},
            "offsets": [-1, 0]
        }]
        slot_filler_config = CRFSlotFillerConfig(
            feature_factory_configs=feature_factories)
        slot_filler_dict = {
            "unit_name": "crf_slot_filler",
            "crf_model_file": "foobar.crfsuite",
            "language_code": "en",
            "intent": "dummy_intent_1",
            "slot_name_mapping": {
                "dummy_intent_1": {
                    "dummy_slot_name": "dummy_entity_1",
                }
            },
            "config": slot_filler_config.to_dict()
        }
        metadata = {"unit_name": "crf_slot_filler"}
        self.tmp_file_path.mkdir()
        self.writeJsonContent(self.tmp_file_path / "metadata.json", metadata)
        self.writeJsonContent(self.tmp_file_path / "slot_filler.json",
                              slot_filler_dict)
        self.writeFileContent(self.tmp_file_path / "foobar.crfsuite",
                              "foo bar")

        # When
        slot_filler = CRFSlotFiller.from_path(self.tmp_file_path)

        # Then
        expected_language = LANGUAGE_EN
        expected_feature_factories = [{
            "factory_name": ShapeNgramFactory.name,
            "args": {
                "n": 1,
                "language_code": language
            },
            "offsets": [0]
        }, {
            "factory_name": IsDigitFactory.name,
            "args": {},
            "offsets": [-1, 0]
        }]
        expected_config = CRFSlotFillerConfig(
            feature_factory_configs=expected_feature_factories)
        expected_intent = "dummy_intent_1"
        expected_slot_name_mapping = {
            "dummy_intent_1": {
                "dummy_slot_name": "dummy_entity_1",
            }
        }

        self.assertEqual(slot_filler.language, expected_language)
        self.assertEqual(slot_filler.intent, expected_intent)
        self.assertEqual(slot_filler.slot_name_mapping,
                         expected_slot_name_mapping)
        self.assertDictEqual(expected_config.to_dict(),
                             slot_filler.config.to_dict())
        crf_path = Path(slot_filler.crf_model.modelfile.name)
        self.assertFileContent(crf_path, "foo bar")
    def test_should_be_serializable(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
- this is [slot1:entity1](my first entity)
- this is [slot2:entity2](second_entity)""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        features_factories = [{
            "factory_name": ShapeNgramFactory.name,
            "args": {
                "n": 1
            },
            "offsets": [0]
        }, {
            "factory_name": IsDigitFactory.name,
            "args": {},
            "offsets": [-1, 0]
        }]
        config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=features_factories)
        shared = self.get_shared_data(dataset)
        slot_filler = CRFSlotFiller(config, **shared)
        intent = "my_intent"
        slot_filler.fit(dataset, intent=intent)

        # When
        slot_filler.persist(self.tmp_file_path)

        # Then
        metadata_path = self.tmp_file_path / "metadata.json"
        self.assertJsonContent(metadata_path, {"unit_name": "crf_slot_filler"})

        self.assertTrue((self.tmp_file_path / CRF_MODEL_FILENAME).exists())

        expected_feature_factories = [{
            "factory_name": ShapeNgramFactory.name,
            "args": {
                "n": 1,
                "language_code": "en"
            },
            "offsets": [0]
        }, {
            "factory_name": IsDigitFactory.name,
            "args": {},
            "offsets": [-1, 0]
        }]
        expected_config = CRFSlotFillerConfig(
            tagging_scheme=TaggingScheme.BILOU,
            feature_factory_configs=expected_feature_factories)
        expected_slot_filler_dict = {
            "crf_model_file": CRF_MODEL_FILENAME,
            "language_code": "en",
            "config": expected_config.to_dict(),
            "intent": intent,
            "slot_name_mapping": {
                "slot1": "entity1",
                "slot2": "entity2",
            }
        }
        slot_filler_path = self.tmp_file_path / "slot_filler.json"
        self.assertJsonContent(slot_filler_path, expected_slot_filler_dict)
    def test_should_be_deserializable(self, mock_deserialize_crf_model):
        # Given
        language = LANGUAGE_EN
        mock_deserialize_crf_model.return_value = None
        feature_factories = [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1, "language_code": language},
                "offsets": [0]
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0]
            }
        ]
        slot_filler_config = CRFSlotFillerConfig(
            feature_factory_configs=feature_factories)
        slot_filler_dict = {
            "unit_name": "crf_slot_filler",
            "crf_model_data": "mocked_crf_model_data",
            "language_code": "en",
            "intent": "dummy_intent_1",
            "slot_name_mapping": {
                "dummy_intent_1": {
                    "dummy_slot_name": "dummy_entity_1",
                }
            },
            "config": slot_filler_config.to_dict()
        }
        # When
        slot_filler = CRFSlotFiller.from_dict(slot_filler_dict)

        # Then
        mock_deserialize_crf_model.assert_called_once_with(
            "mocked_crf_model_data")
        expected_language = LANGUAGE_EN
        expected_feature_factories = [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1, "language_code": language},
                "offsets": [0]
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0]
            }
        ]
        expected_config = CRFSlotFillerConfig(
            feature_factory_configs=expected_feature_factories)
        expected_intent = "dummy_intent_1"
        expected_slot_name_mapping = {
            "dummy_intent_1": {
                "dummy_slot_name": "dummy_entity_1",
            }
        }

        self.assertEqual(slot_filler.language, expected_language)
        self.assertEqual(slot_filler.intent, expected_intent)
        self.assertEqual(slot_filler.slot_name_mapping,
                         expected_slot_name_mapping)
        self.assertDictEqual(expected_config.to_dict(),
                             slot_filler.config.to_dict())
    def test_augment_slots(self):
        # Given
        language = LANGUAGE_EN
        text = "Find me a flight before 10pm and after 8pm"
        tokens = tokenize(text, language)
        missing_slots = {"start_date", "end_date"}

        tags = ['O' for _ in tokens]

        def mocked_sequence_probability(_, tags_):
            tags_1 = [
                'O', 'O', 'O', 'O',
                '%sstart_date' % BEGINNING_PREFIX,
                '%sstart_date' % INSIDE_PREFIX, 'O',
                '%send_date' % BEGINNING_PREFIX,
                '%send_date' % INSIDE_PREFIX
            ]

            tags_2 = [
                'O', 'O', 'O', 'O',
                '%send_date' % BEGINNING_PREFIX,
                '%send_date' % INSIDE_PREFIX, 'O',
                '%sstart_date' % BEGINNING_PREFIX,
                '%sstart_date' % INSIDE_PREFIX
            ]

            tags_3 = ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

            tags_4 = [
                'O', 'O', 'O', 'O', 'O', 'O', 'O',
                '%sstart_date' % BEGINNING_PREFIX,
                '%sstart_date' % INSIDE_PREFIX
            ]

            tags_5 = [
                'O', 'O', 'O', 'O', 'O', 'O', 'O',
                '%send_date' % BEGINNING_PREFIX,
                '%send_date' % INSIDE_PREFIX
            ]

            tags_6 = [
                'O', 'O', 'O', 'O',
                '%sstart_date' % BEGINNING_PREFIX,
                '%sstart_date' % INSIDE_PREFIX, 'O', 'O', 'O'
            ]

            tags_7 = [
                'O', 'O', 'O', 'O',
                '%send_date' % BEGINNING_PREFIX,
                '%send_date' % INSIDE_PREFIX, 'O', 'O', 'O'
            ]

            tags_8 = [
                'O', 'O', 'O', 'O',
                '%sstart_date' % BEGINNING_PREFIX,
                '%sstart_date' % INSIDE_PREFIX, 'O',
                '%sstart_date' % BEGINNING_PREFIX,
                '%sstart_date' % INSIDE_PREFIX
            ]

            tags_9 = [
                'O', 'O', 'O', 'O',
                '%send_date' % BEGINNING_PREFIX,
                '%send_date' % INSIDE_PREFIX, 'O',
                '%send_date' % BEGINNING_PREFIX,
                '%send_date' % INSIDE_PREFIX
            ]

            if tags_ == tags_1:
                return 0.6
            elif tags_ == tags_2:
                return 0.8
            elif tags_ == tags_3:
                return 0.2
            elif tags_ == tags_4:
                return 0.2
            elif tags_ == tags_5:
                return 0.99
            elif tags_ == tags_6:
                return 0.0
            elif tags_ == tags_7:
                return 0.0
            elif tags_ == tags_8:
                return 0.5
            elif tags_ == tags_9:
                return 0.5
            else:
                raise ValueError("Unexpected tag sequence: %s" % tags_)

        slot_filler_config = CRFSlotFillerConfig(random_seed=42)
        slot_filler = CRFSlotFiller(config=slot_filler_config)
        slot_filler.language = LANGUAGE_EN
        slot_filler.intent = "intent1"
        slot_filler.slot_name_mapping = {
            "start_date": "snips/datetime",
            "end_date": "snips/datetime",
        }

        # pylint:disable=protected-access
        slot_filler._get_sequence_probability = MagicMock(
            side_effect=mocked_sequence_probability)
        # pylint:enable=protected-access

        slot_filler.compute_features = MagicMock(return_value=None)

        # When
        # pylint: disable=protected-access
        augmented_slots = slot_filler._augment_slots(text, tokens, tags,
                                                     missing_slots)
        # pylint: enable=protected-access

        # Then
        expected_slots = [
            unresolved_slot(value='after 8pm',
                            match_range={
                                START: 33,
                                END: 42
                            },
                            entity='snips/datetime',
                            slot_name='end_date')
        ]
        self.assertListEqual(augmented_slots, expected_slots)