Ejemplo n.º 1
0
    def test_should_fail_loading_unregistered_factory_from_config(self):
        # Given a config whose factory name was never registered
        unknown_config = {
            "factory_name": "my_unknown_feature",
            "offsets": [0],
            "args": {},
        }

        # When / Then: building a factory from it must raise
        with self.assertRaises(NotRegisteredError):
            CRFFeatureFactory.from_config(unknown_config)
Ejemplo n.º 2
0
    def test_ngram_factory_with_gazetteer(self):
        # Given an ngram factory restricted to a common-words gazetteer:
        # tokens outside the gazetteer are replaced by the rare-word token
        ngram_config = {
            "factory_name": "ngram",
            "args": {
                "n": 2,
                "use_stemming": False,
                "common_words_gazetteer_name": "my_gazetteer"
            },
            "offsets": [0]
        }
        resources = {
            GAZETTEERS: {
                "my_gazetteer": {"hello", "beautiful", "world"}
            }
        }
        cache = [
            {TOKEN_NAME: tok}
            for tok in tokenize("hello beautiful foobar world", LANGUAGE_EN)
        ]
        factory = CRFFeatureFactory.from_config(
            ngram_config, resources=resources)
        factory.fit({"language": "en"}, None)
        features = factory.build_features()

        # When
        computed = features[0].compute(1, cache)

        # Then
        self.assertIsInstance(factory, NgramFactory)
        self.assertEqual("ngram_2", features[0].base_name)
        self.assertEqual("beautiful rare_word", computed)
Ejemplo n.º 3
0
    def test_custom_single_feature_factory(self):
        # Given a custom single-feature factory registered under a test name
        # pylint:disable=unused-variable
        @CRFFeatureFactory.register("my_single_feature", override=True)
        class MySingleFeatureFactory(SingleFeatureFactory):
            def compute_feature(self, tokens, token_index):
                return "(%s)[my_feature]" % tokens[token_index].value

        # pylint:enable=unused-variable

        # When building features for offsets 0 and -1
        feature_factory = CRFFeatureFactory.from_config({
            "factory_name": "my_single_feature",
            "args": {},
            "offsets": [0, -1]
        })
        built = feature_factory.build_features()
        base_feature, offset_feature = built[0], built[1]
        cache = [{TOKEN_NAME: tok} for tok in tokenize("hello world", "en")]
        base_value = base_feature.compute(1, cache)
        offset_value = offset_feature.compute(1, cache)

        # Then: the offset feature reads the token one position to the left
        self.assertEqual("my_single_feature", base_feature.name)
        self.assertEqual("my_single_feature[-1]", offset_feature.name)
        self.assertEqual("(world)[my_feature]", base_value)
        self.assertEqual("(hello)[my_feature]", offset_value)
Ejemplo n.º 4
0
    def test_ngram_factory_with_stemming(self):
        # Given an ngram factory configured to stem tokens before joining
        ngram_config = {
            "factory_name": "ngram",
            "args": {
                "n": 2,
                "use_stemming": True,
                "common_words_gazetteer_name": None
            },
            "offsets": [0]
        }
        cache = [
            {TOKEN_NAME: tok}
            for tok in tokenize("hello beautiful world", LANGUAGE_EN)
        ]
        resources = {STEMS: {"beautiful": "beauty"}}
        factory = CRFFeatureFactory.from_config(
            ngram_config, resources=resources)
        factory.fit({"language": "en"}, None)
        features = factory.build_features()

        # When
        computed = features[0].compute(0, cache)

        # Then: "beautiful" is replaced by its stem "beauty"
        self.assertIsInstance(factory, NgramFactory)
        self.assertEqual("ngram_2", features[0].base_name)
        self.assertEqual("hello beauty", computed)
Ejemplo n.º 5
0
    def test_builtin_entity_match_factory(self):
        # Given a mocked entity scope limited to datetime + number for EN
        def mock_builtin_entity_scope(dataset, _):
            if dataset[LANGUAGE] == LANGUAGE_EN:
                return {SNIPS_NUMBER, SNIPS_DATETIME}
            return []

        match_config = {
            "factory_name": "builtin_entity_match",
            "args": {
                "tagging_scheme_code": TaggingScheme.BILOU.value,
            },
            "offsets": [0]
        }

        cache = [
            {TOKEN_NAME: tok}
            for tok in tokenize("one tea tomorrow at 2pm", LANGUAGE_EN)
        ]
        builtin_entity_parser = BuiltinEntityParser.build(language="en")
        factory = CRFFeatureFactory.from_config(
            match_config, builtin_entity_parser=builtin_entity_parser)
        # pylint: disable=protected-access
        factory._get_builtin_entity_scope = mock_builtin_entity_scope
        # pylint: enable=protected-access
        factory.fit({"language": "en"}, None)

        # When: compute both features on every token position
        features = sorted(factory.build_features(),
                          key=lambda f: f.base_name)
        datetime_results = [features[0].compute(i, cache) for i in range(5)]
        number_results = [features[1].compute(i, cache) for i in range(5)]

        # Then: "tomorrow at 2pm" is a BILOU-tagged datetime span and
        # "one" is a unit-length number match
        self.assertIsInstance(factory, BuiltinEntityMatchFactory)
        self.assertEqual(2, len(features))
        self.assertEqual("builtin_entity_match_snips/datetime",
                         features[0].base_name)
        self.assertEqual("builtin_entity_match_snips/number",
                         features[1].base_name)
        self.assertEqual(
            [UNIT_PREFIX, None, BEGINNING_PREFIX, INSIDE_PREFIX, LAST_PREFIX],
            datetime_results)
        self.assertEqual([UNIT_PREFIX, None, None, None, None],
                         number_results)
Ejemplo n.º 6
0
    def get_required_resources(self):
        """Aggregate the resources required by the data augmentation config
        and by every configured feature factory."""
        # Imported lazily to avoid a circular import at module load time
        from snips_nlu.slot_filler.feature_factory import CRFFeatureFactory

        merged = self.data_augmentation_config.get_required_resources()
        for factory_config in self.feature_factory_configs:
            factory = CRFFeatureFactory.from_config(factory_config)
            merged = merge_required_resources(
                merged, factory.get_required_resources())
        return merged
Ejemplo n.º 7
0
 def __init__(self, config=None, **shared):
     """The CRF slot filler can be configured by passing a
     :class:`.CRFSlotFillerConfig`"""
     from copy import deepcopy

     # Deep-copy the config before handing it to the base class: the
     # feature factories mutate it while fitting, and the caller's
     # config object must not be modified as a side effect.
     config = deepcopy(config)
     super(CRFSlotFiller, self).__init__(config, **shared)
     self.crf_model = None
     # One factory per feature config, sharing parsers/resources
     self.features_factories = [
         CRFFeatureFactory.from_config(conf, **shared)
         for conf in self.config.feature_factory_configs
     ]
     # Populated later, when the slot filler is fitted
     self._features = None
     self.language = None
     self.intent = None
     self.slot_name_mapping = None
Ejemplo n.º 8
0
    def test_custom_multi_feature_factory(self):
        # Given a factory that builds two features per configured offset

        # pylint:disable=unused-variable
        @CRFFeatureFactory.register("my_multi_feature_factory", override=True)
        class MyMultiFeature(CRFFeatureFactory):
            def build_features(self):
                built = [
                    Feature("my_first_feature",
                            self.compute_feature_1,
                            offset=o)
                    for o in self.offsets
                ]
                built += [
                    Feature("my_second_feature",
                            self.compute_feature_2,
                            offset=o)
                    for o in self.offsets
                ]
                return built

            @staticmethod
            def compute_feature_1(tokens, token_index):
                return "(%s)[my_feature_1]" % tokens[token_index].value

            @staticmethod
            def compute_feature_2(tokens, token_index):
                return "(%s)[my_feature_2]" % tokens[token_index].value

        # pylint:enable=unused-variable

        # When
        feature_factory = CRFFeatureFactory.from_config({
            "factory_name": "my_multi_feature_factory",
            "args": {},
            "offsets": [-1, 0]
        })
        features = feature_factory.build_features()
        cache = [{TOKEN_NAME: tok} for tok in tokenize("foo bar baz", "en")]

        # Then: first-feature variants come first, offset -1 before 0
        self.assertEqual("my_first_feature[-1]", features[0].name)
        self.assertEqual("(foo)[my_feature_1]", features[0].compute(1, cache))
        self.assertEqual("my_first_feature", features[1].name)
        self.assertEqual("my_second_feature[-1]", features[2].name)
        self.assertEqual("(bar)[my_feature_2]", features[2].compute(2, cache))
        self.assertEqual("my_second_feature", features[3].name)
Ejemplo n.º 9
0
 def __init__(self, config=None, **shared):
     """The CRF slot filler can be configured by passing a
     :class:`.CRFSlotFillerConfig`"""
     # Work on a private copy of the config: it is mutated in place
     # when the feature factories are fitted.
     super(CRFSlotFiller, self).__init__(deepcopy(config), **shared)
     self.crf_model = None
     factories = []
     for factory_config in self.config.feature_factory_configs:
         factories.append(
             CRFFeatureFactory.from_config(factory_config, **shared))
     self.features_factories = factories
     # Filled in during fitting
     self._features = None
     self.language = None
     self.intent = None
     self.slot_name_mapping = None
Ejemplo n.º 10
0
    def test_length_factory(self):
        # Given a length feature factory with a single zero offset
        length_config = {"factory_name": "length", "args": {}, "offsets": [0]}
        cache = [
            {TOKEN_NAME: tok}
            for tok in tokenize("hello beautiful world", LANGUAGE_EN)
        ]
        factory = CRFFeatureFactory.from_config(length_config)
        factory.fit(None, None)
        feature = factory.build_features()[0]

        # When: compute on the third token ("world")
        computed = feature.compute(2, cache)

        # Then: the feature is the token length as a string
        self.assertIsInstance(factory, LengthFactory)
        self.assertEqual("length", feature.base_name)
        self.assertEqual("5", computed)
Ejemplo n.º 11
0
    def test_factory_from_config(self):
        # Given a freshly registered custom factory class
        @CRFFeatureFactory.register("my_custom_feature")
        class MySingleFeatureFactory(SingleFeatureFactory):
            def compute_feature(self, tokens, token_index):
                return "(%s)[my_custom_feature]" % tokens[token_index].value

        # When a factory is built from a config naming it
        factory = CRFFeatureFactory.from_config({
            "factory_name": "my_custom_feature",
            "args": {},
            "offsets": [0]
        })

        # Then the registered class is instantiated
        self.assertIsInstance(factory, MySingleFeatureFactory)
Ejemplo n.º 12
0
    def test_is_first_factory(self):
        # Given an is_first feature factory
        first_config = {"factory_name": "is_first", "args": {},
                        "offsets": [0]}
        cache = [
            {TOKEN_NAME: tok}
            for tok in tokenize("hello beautiful world", LANGUAGE_EN)
        ]
        factory = CRFFeatureFactory.from_config(first_config)
        factory.fit(None, None)
        feature = factory.build_features()[0]

        # When: compute on the first and second tokens
        on_first = feature.compute(0, cache)
        on_second = feature.compute(1, cache)

        # Then: only the first token yields "1"
        self.assertIsInstance(factory, IsFirstFactory)
        self.assertEqual("is_first", feature.base_name)
        self.assertEqual("1", on_first)
        self.assertEqual(None, on_second)
Ejemplo n.º 13
0
    def test_word_cluster_factory(self):
        # Given word-cluster resources mapping two tokens to cluster codes
        resources = {
            WORD_CLUSTERS: {
                "my_word_clusters": {
                    "word1": "00",
                    "word2": "11"
                }
            }
        }

        cluster_config = {
            "factory_name": "word_cluster",
            "args": {
                "cluster_name": "my_word_clusters",
                "use_stemming": False
            },
            "offsets": [0]
        }

        cache = [
            {TOKEN_NAME: tok}
            for tok in tokenize("hello word1 word2", LANGUAGE_EN)
        ]
        factory = CRFFeatureFactory.from_config(
            cluster_config, resources=resources)
        factory.fit({"language": "en"}, None)
        feature = factory.build_features()[0]

        # When: compute on every token
        results = [feature.compute(i, cache) for i in range(3)]

        # Then: unknown words yield None, known words their cluster code
        self.assertIsInstance(factory, WordClusterFactory)
        self.assertEqual("word_cluster_my_word_clusters", feature.base_name)
        self.assertEqual([None, "00", "11"], results)
Ejemplo n.º 14
0
    def test_shape_ngram_factory(self):
        # Given a shape-ngram factory over trigrams
        shape_config = {
            "factory_name": "shape_ngram",
            "args": {
                "n": 3,
            },
            "offsets": [0]
        }

        cache = [
            {TOKEN_NAME: tok}
            for tok in tokenize("hello Beautiful foObar world", LANGUAGE_EN)
        ]
        factory = CRFFeatureFactory.from_config(shape_config)
        factory.fit({"language": "en"}, None)
        feature = factory.build_features()[0]

        # When: compute the shape trigram starting at the second token
        computed = feature.compute(1, cache)

        # Then
        self.assertIsInstance(factory, ShapeNgramFactory)
        self.assertEqual("shape_ngram_3", feature.base_name)
        self.assertEqual("Xxx xX xxx", computed)
Ejemplo n.º 15
0
    def test_entity_match_factory(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
- this is [entity1](my first entity)
- this is [entity2](second_entity)""")

        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        config = {
            "factory_name": "entity_match",
            "args": {
                "tagging_scheme_code": TaggingScheme.BILOU.value,
                "use_stemming": True
            },
            "offsets": [0]
        }

        tokens = tokenize("my first entity and second_entity", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in tokens]
        resources = {STEMS: dict()}
        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITH_STEMS, resources)
        factory = CRFFeatureFactory.from_config(
            config,
            custom_entity_parser=custom_entity_parser,
            resources=resources)
        factory.fit(dataset, "my_intent")

        # When
        features = factory.build_features()
        features = sorted(features, key=lambda f: f.base_name)
        res0 = features[0].compute(0, cache)
        res1 = features[0].compute(1, cache)
        res2 = features[0].compute(2, cache)
        res3 = features[0].compute(3, cache)
        res4 = features[0].compute(4, cache)

        res5 = features[1].compute(0, cache)
        res6 = features[1].compute(1, cache)
        res7 = features[1].compute(2, cache)
        res8 = features[1].compute(3, cache)
        res9 = features[1].compute(4, cache)

        # Then
        self.assertIsInstance(factory, CustomEntityMatchFactory)
        self.assertEqual(len(features), 2)
        self.assertEqual(features[0].base_name, "entity_match_entity1")
        self.assertEqual(features[1].base_name, "entity_match_entity2")
        self.assertEqual(res0, BEGINNING_PREFIX)
        self.assertEqual(res1, INSIDE_PREFIX)
        self.assertEqual(res2, LAST_PREFIX)
        self.assertEqual(res3, None)
        self.assertEqual(res4, None)

        self.assertEqual(res5, None)
        self.assertEqual(res6, None)
        self.assertEqual(res7, None)
        self.assertEqual(res8, None)
        self.assertEqual(res9, UNIT_PREFIX)