Code example #1
    def test_ngram_factory_with_gazetteer(self, mock_get_gazetteer):
        # Given
        config = {
            "factory_name": "ngram",
            "args": {
                "n": 2,
                "use_stemming": False,
                "common_words_gazetteer_name": "mocked_gazetteer"
            },
            "offsets": [0]
        }

        mock_get_gazetteer.return_value = {"hello", "beautiful", "world"}
        tokens = tokenize("hello beautiful foobar world", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in tokens]
        factory = get_feature_factory(config)
        mocked_dataset = {"language": "en"}
        factory.fit(mocked_dataset, None)
        features = factory.build_features()

        # When
        res = features[0].compute(1, cache)

        # Then
        self.assertIsInstance(factory, NgramFactory)
        self.assertEqual(features[0].base_name, "ngram_2")
        self.assertEqual(res, "beautiful rare_word")
Code example #2
    def test_builtin_entity_match_factory(self, mock_supported_entities):
        # Given
        def mocked_supported_entities(language):
            if language == LANGUAGE_EN:
                return {SNIPS_NUMBER, SNIPS_DATETIME}
            return set()

        mock_supported_entities.side_effect = mocked_supported_entities

        config = {
            "factory_name": "builtin_entity_match",
            "args": {
                "tagging_scheme_code": TaggingScheme.BILOU.value,
            },
            "offsets": [0]
        }

        tokens = tokenize("one tea tomorrow at 2pm", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in tokens]
        factory = get_feature_factory(config)
        mocked_dataset = {"language": "en"}
        factory.fit(mocked_dataset, None)

        # When
        features = factory.build_features()
        features = sorted(features, key=lambda f: f.base_name)
        res0 = features[0].compute(0, cache)
        res1 = features[0].compute(1, cache)
        res2 = features[0].compute(2, cache)
        res3 = features[0].compute(3, cache)
        res4 = features[0].compute(4, cache)

        res5 = features[1].compute(0, cache)
        res6 = features[1].compute(1, cache)
        res7 = features[1].compute(2, cache)
        res8 = features[1].compute(3, cache)
        res9 = features[1].compute(4, cache)

        # Then
        self.assertIsInstance(factory, BuiltinEntityMatchFactory)
        self.assertEqual(len(features), 2)
        self.assertEqual(features[0].base_name,
                         "builtin_entity_match_snips/datetime")
        self.assertEqual(features[1].base_name,
                         "builtin_entity_match_snips/number")
        self.assertEqual(res0, UNIT_PREFIX)
        self.assertEqual(res1, None)
        self.assertEqual(res2, BEGINNING_PREFIX)
        self.assertEqual(res3, INSIDE_PREFIX)
        self.assertEqual(res4, LAST_PREFIX)

        self.assertEqual(res5, UNIT_PREFIX)
        self.assertEqual(res6, None)
        self.assertEqual(res7, None)
        self.assertEqual(res8, None)
        self.assertEqual(res9, None)
Code example #3
File: slot_filler.py  Project: yucdong/snips-nlu
    def get_required_resources(self):
        # Import here to avoid circular imports
        from snips_nlu.slot_filler.feature_factory import get_feature_factory

        resources = self.data_augmentation_config.get_required_resources()
        for config in self.feature_factory_configs:
            factory = get_feature_factory(config)
            resources = merge_required_resources(
                resources, factory.get_required_resources())
        return resources
Code example #4
    def test_entity_match_factory(self):
        # Given
        config = {
            "factory_name": "entity_match",
            "args": {
                "tagging_scheme_code": TaggingScheme.BILOU.value,
                "use_stemming": True
            },
            "offsets": [0]
        }

        tokens = tokenize("2 dummy a had dummy_c", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in tokens]
        factory = get_feature_factory(config)
        dataset = deepcopy(SAMPLE_DATASET)
        dataset = validate_and_format_dataset(dataset)
        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITH_STEMS)
        factory.fit(dataset, "dummy_intent_1")

        # When
        features = factory.build_features(
            custom_entity_parser=custom_entity_parser)
        features = sorted(features, key=lambda f: f.base_name)
        res0 = features[0].compute(0, cache)
        res1 = features[0].compute(1, cache)
        res2 = features[0].compute(2, cache)
        res3 = features[0].compute(3, cache)
        res4 = features[0].compute(4, cache)

        res5 = features[1].compute(0, cache)
        res6 = features[1].compute(1, cache)
        res7 = features[1].compute(2, cache)
        res8 = features[1].compute(3, cache)
        res9 = features[1].compute(4, cache)

        # Then
        self.assertIsInstance(factory, CustomEntityMatchFactory)
        self.assertEqual(len(features), 2)
        self.assertEqual(features[0].base_name, "entity_match_dummy_entity_1")
        self.assertEqual(features[1].base_name, "entity_match_dummy_entity_2")
        self.assertEqual(res0, BEGINNING_PREFIX)
        self.assertEqual(res1, INSIDE_PREFIX)
        self.assertEqual(res2, LAST_PREFIX)
        self.assertEqual(res3, None)
        self.assertEqual(res4, None)

        self.assertEqual(res5, None)
        self.assertEqual(res6, None)
        self.assertEqual(res7, None)
        self.assertEqual(res8, None)
        self.assertEqual(res9, UNIT_PREFIX)
Code example #5
    def __init__(self, config=None):
        """The CRF slot filler can be configured by passing a
        :class:`.CRFSlotFillerConfig`"""

        if config is None:
            config = self.config_type()
        super(CRFSlotFiller, self).__init__(config)
        self.crf_model = None
        self.features_factories = [get_feature_factory(conf) for conf in
                                   self.config.feature_factory_configs]
        self._features = None
        self.language = None
        self.intent = None
        self.slot_name_mapping = None
Code example #6
    def test_entity_match_factory(self):
        # Given
        config = {
            "factory_name": "entity_match",
            "args": {
                "tagging_scheme_code": TaggingScheme.BILOU.value,
                "use_stemming": False
            },
            "offsets": [0]
        }

        tokens = tokenize("2 dummy a and dummy_c", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in tokens]
        factory = get_feature_factory(config)
        dataset = deepcopy(SAMPLE_DATASET)
        dataset = validate_and_format_dataset(dataset)
        factory.fit(dataset, "dummy_intent_1")

        # When
        features = factory.build_features()
        features = sorted(features, key=lambda f: f.base_name)
        res0 = features[0].compute(0, cache)
        res1 = features[0].compute(1, cache)
        res2 = features[0].compute(2, cache)
        res3 = features[0].compute(3, cache)
        res4 = features[0].compute(4, cache)

        res5 = features[1].compute(0, cache)
        res6 = features[1].compute(1, cache)
        res7 = features[1].compute(2, cache)
        res8 = features[1].compute(3, cache)
        res9 = features[1].compute(4, cache)

        # Then
        self.assertIsInstance(factory, EntityMatchFactory)
        self.assertEqual(len(features), 2)
        self.assertEqual(features[0].base_name, "entity_match_dummy_entity_1")
        self.assertEqual(features[1].base_name, "entity_match_dummy_entity_2")
        self.assertEqual(res0, BEGINNING_PREFIX)
        self.assertEqual(res1, INSIDE_PREFIX)
        self.assertEqual(res2, LAST_PREFIX)
        self.assertEqual(res3, None)
        self.assertEqual(res4, None)

        self.assertEqual(res5, None)
        self.assertEqual(res6, None)
        self.assertEqual(res7, None)
        self.assertEqual(res8, None)
        self.assertEqual(res9, UNIT_PREFIX)
Code example #7
    def test_length_factory(self):
        # Given
        config = {"factory_name": "length", "args": {}, "offsets": [0]}
        tokens = tokenize("hello beautiful world", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in tokens]
        factory = get_feature_factory(config)
        factory.fit(None, None)
        features = factory.build_features()

        # When
        res = features[0].compute(2, cache)

        # Then
        self.assertIsInstance(factory, LengthFactory)
        self.assertEqual(features[0].base_name, "length")
        self.assertEqual(res, "5")
Code example #8
    def test_is_first_factory(self):
        # Given
        config = {"factory_name": "is_first", "args": {}, "offsets": [0]}
        tokens = tokenize("hello beautiful world", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in tokens]
        factory = get_feature_factory(config)
        factory.fit(None, None)
        features = factory.build_features()

        # When
        res1 = features[0].compute(0, cache)
        res2 = features[0].compute(1, cache)

        # Then
        self.assertIsInstance(factory, IsFirstFactory)
        self.assertEqual(features[0].base_name, "is_first")
        self.assertEqual(res1, "1")
        self.assertEqual(res2, None)
Code example #9
    def test_length_factory(self):
        # Given
        config = {
            "factory_name": "length",
            "args": {},
            "offsets": [0]
        }
        tokens = tokenize("hello beautiful world", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in tokens]
        factory = get_feature_factory(config)
        factory.fit(None, None)
        features = factory.build_features()

        # When
        res = features[0].compute(2, cache)

        # Then
        self.assertIsInstance(factory, LengthFactory)
        self.assertEqual(features[0].base_name, "length")
        self.assertEqual(res, 5)
Code example #10
    def test_word_cluster_factory(self, mock_get_word_clusters):
        # Given
        def mocked_get_word_clusters(language):
            if language == LANGUAGE_EN:
                return {
                    "mocked_cluster": {
                        "word1": "00",
                        "word2": "11"
                    }
                }
            return dict()

        mock_get_word_clusters.side_effect = mocked_get_word_clusters

        config = {
            "factory_name": "word_cluster",
            "args": {
                "cluster_name": "mocked_cluster",
                "use_stemming": False
            },
            "offsets": [0]
        }

        tokens = tokenize("hello word1 word2", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in tokens]
        factory = get_feature_factory(config)
        mocked_dataset = {"language": "en"}
        factory.fit(mocked_dataset, None)
        features = factory.build_features()

        # When
        res0 = features[0].compute(0, cache)
        res1 = features[0].compute(1, cache)
        res2 = features[0].compute(2, cache)

        # Then
        self.assertIsInstance(factory, WordClusterFactory)
        self.assertEqual(features[0].base_name, "word_cluster_mocked_cluster")
        self.assertEqual(res0, None)
        self.assertEqual(res1, "00")
        self.assertEqual(res2, "11")
Code example #11
    def test_shape_ngram_factory(self):
        # Given
        config = {
            "factory_name": "shape_ngram",
            "args": {
                "n": 3,
            },
            "offsets": [0]
        }

        tokens = tokenize("hello Beautiful foObar world", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in tokens]
        factory = get_feature_factory(config)
        mocked_dataset = {"language": "en"}
        factory.fit(mocked_dataset, None)
        features = factory.build_features()

        # When
        res = features[0].compute(1, cache)

        # Then
        self.assertIsInstance(factory, ShapeNgramFactory)
        self.assertEqual(features[0].base_name, "shape_ngram_3")
        self.assertEqual(res, "Xxx xX xxx")