Example #1
    def test_should_be_serializable(self):
        # Given
        parser = CustomEntityParser.build(
            DATASET, CustomEntityParserUsage.WITHOUT_STEMS, resources=dict())
        self.tmp_file_path.mkdir()
        parser_path = self.tmp_file_path / "custom_entity_parser"
        parser.persist(parser_path)
        loaded_parser = CustomEntityParser.from_path(parser_path)

        # When
        scope = ["dummy_entity_1"]
        text = "dummy_entity_1 dummy_1"
        result = loaded_parser.parse(text, scope=scope)

        # Then
        expected_entities = [{
            "value": "dummy_entity_1",
            "resolved_value": "dummy_entity_1",
            "range": {
                "start": 0,
                "end": 14
            },
            "entity_kind": "dummy_entity_1"
        }, {
            "value": "dummy_1",
            "resolved_value": "dummy_entity_1",
            "range": {
                "start": 15,
                "end": 22
            },
            "entity_kind": "dummy_entity_1"
        }]
        self.assertListEqual(expected_entities, result)

    def test_should_parse_with_and_without_stems(self, mocked_stem):
        # Given
        mocked_stem.side_effect = _stem
        parser = CustomEntityParser.build(
            DATASET, CustomEntityParserUsage.WITH_AND_WITHOUT_STEMS)
        scope = ["dummy_entity_1"]
        text = "dummy_entity_ dummy_1"

        # When
        result = parser.parse(text, scope=scope)

        # Then
        expected_entities = [
            {
                "value": "dummy_entity_",
                "resolved_value": "dummy_entity_1",
                "range": {
                    "start": 0,
                    "end": 13
                },
                "entity_kind": "dummy_entity_1"
            },
            {
                "value": "dummy_1",
                "resolved_value": "dummy_entity_1",
                "range": {
                    "start": 14,
                    "end": 21
                },
                "entity_kind": "dummy_entity_1"
            }
        ]
        self.assertListEqual(expected_entities, result)

    def test_should_not_build_custom_parser_when_provided(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me a [beverage_temperature:Temperature](hot) cup of tea
- make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
- make me [number_of_cups:snips/number](one) cup of coffee please
- brew [number_of_cups] cups of coffee""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        resources = load_resources("en")
        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITH_AND_WITHOUT_STEMS, resources)

        # When
        with patch("snips_nlu.entity_parser.custom_entity_parser"
                   ".CustomEntityParser.build") as mocked_build_parser:
            engine = SnipsNLUEngine(
                custom_entity_parser=custom_entity_parser)
            engine.fit(dataset)

        # Then
        mocked_build_parser.assert_not_called()
Example #4
    def test_should_parse_with_stems(self):
        # Given
        resources = {
            STEMS: {
                "dummy_entity_1": "dummy_entity_",
                "dummy_1": "dummy_"
            }
        }
        parser = CustomEntityParser.build(DATASET,
                                          CustomEntityParserUsage.WITH_STEMS,
                                          resources)
        text = "dummy_entity_ dummy_1"
        scope = ["dummy_entity_1"]

        # When
        result = parser.parse(text, scope=scope)

        # Then
        expected_entities = [{
            "value": "dummy_entity_",
            "resolved_value": "dummy_entity_1",
            "range": {
                "start": 0,
                "end": 13
            },
            "entity_kind": "dummy_entity_1"
        }]
        self.assertListEqual(expected_entities, result)
Example #5
    def test_should_parse_with_proper_tokenization(self):
        # Given
        parser = CustomEntityParser.build(
            DATASET, CustomEntityParserUsage.WITHOUT_STEMS, resources=dict())
        text = "  dummy_1?dummy_2"

        # When
        result = parser.parse(text)
        result = sorted(result, key=lambda e: e["range"]["start"])

        # Then
        expected_entities = [{
            "value": "dummy_1",
            "resolved_value": "dummy_entity_1",
            "range": {
                "start": 2,
                "end": 9
            },
            "entity_kind": "dummy_entity_1"
        }, {
            "value": "dummy_2",
            "resolved_value": "dummy_entity_2",
            "range": {
                "start": 10,
                "end": 17
            },
            "entity_kind": "dummy_entity_2"
        }]
        self.assertListEqual(expected_entities, result)

    def from_path(cls, path, **shared):
        """Loads a :class:`SnipsNLUEngine` instance from a directory path

        The data at the given path must have been generated using
        :func:`~SnipsNLUEngine.persist`

        Args:
            path (str): The path where the nlu engine is stored

        Raises:
            LoadingError: when some files are missing
            IncompatibleModelError: when trying to load an engine model which
                is not compatible with the current version of the lib
        """
        directory_path = Path(path)
        model_path = directory_path / "nlu_engine.json"
        if not model_path.exists():
            raise LoadingError("Missing nlu engine model file: %s" %
                               model_path.name)

        with model_path.open(encoding="utf8") as f:
            model = json.load(f)
        model_version = model.get("model_version")
        if model_version is None or model_version != __model_version__:
            raise IncompatibleModelError(model_version)

        dataset_metadata = model["dataset_metadata"]
        if shared.get(RESOURCES) is None and dataset_metadata is not None:
            language = dataset_metadata["language_code"]
            resources_dir = directory_path / "resources" / language
            if resources_dir.is_dir():
                resources = load_resources_from_dir(resources_dir)
                shared[RESOURCES] = resources

        if shared.get(BUILTIN_ENTITY_PARSER) is None:
            path = model["builtin_entity_parser"]
            if path is not None:
                parser_path = directory_path / path
                shared[BUILTIN_ENTITY_PARSER] = BuiltinEntityParser.from_path(
                    parser_path)

        if shared.get(CUSTOM_ENTITY_PARSER) is None:
            path = model["custom_entity_parser"]
            if path is not None:
                parser_path = directory_path / path
                shared[CUSTOM_ENTITY_PARSER] = CustomEntityParser.from_path(
                    parser_path)

        config = cls.config_type.from_dict(model["config"])
        nlu_engine = cls(config=config, **shared)
        nlu_engine.dataset_metadata = dataset_metadata
        intent_parsers = []
        for parser_idx, parser_name in enumerate(model["intent_parsers"]):
            parser_config = config.intent_parsers_configs[parser_idx]
            intent_parser_path = directory_path / parser_name
            intent_parser = IntentParser.load_from_path(
                intent_parser_path, parser_config.unit_name, **shared)
            intent_parsers.append(intent_parser)
        nlu_engine.intent_parsers = intent_parsers
        return nlu_engine
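For reference, the persist / from_path round trip that this docstring describes can be sketched as follows. This is a minimal, hedged sketch of the public snips-nlu API; the dataset file and the engine directory are illustrative placeholders, and the language resources are assumed to be installed beforehand (e.g. via the snips-nlu download en command).

    # Minimal sketch of the persist / from_path round trip (paths are placeholders).
    import json

    from snips_nlu import SnipsNLUEngine

    with open("dataset.json", encoding="utf8") as f:
        dataset = json.load(f)

    engine = SnipsNLUEngine()
    engine.fit(dataset)
    engine.persist("path/to/engine")  # writes nlu_engine.json plus one folder per sub-unit

    loaded_engine = SnipsNLUEngine.from_path("path/to/engine")
    parsing = loaded_engine.parse("Make me two cups of coffee")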
Example #7
    def from_path(cls, path, **shared):
        """Load a :class:`SnipsNLUEngine` instance from a directory path

        The data at the given path must have been generated using
        :func:`~SnipsNLUEngine.persist`

        Args:
            path (str): The path where the nlu engine is
                stored.
        """
        directory_path = Path(path)
        model_path = directory_path / "nlu_engine.json"
        if not model_path.exists():
            raise OSError("Missing nlu engine model file: %s" %
                          model_path.name)

        with model_path.open(encoding="utf8") as f:
            model = json.load(f)
        model_version = model.get("model_version")
        if model_version is None or model_version != __model_version__:
            raise ValueError(
                "Incompatible data model: persisted object=%s, python lib=%s" %
                (model_version, __model_version__))

        dataset_metadata = model["dataset_metadata"]
        if dataset_metadata is not None:
            language = dataset_metadata["language_code"]
            resources_dir = directory_path / "resources" / language
            if resources_dir.is_dir():
                load_resources_from_dir(resources_dir)

        if shared.get(BUILTIN_ENTITY_PARSER) is None:
            path = model["builtin_entity_parser"]
            if path is not None:
                parser_path = directory_path / path
                shared[BUILTIN_ENTITY_PARSER] = BuiltinEntityParser.from_path(
                    parser_path)

        if shared.get(CUSTOM_ENTITY_PARSER) is None:
            path = model["custom_entity_parser"]
            if path is not None:
                parser_path = directory_path / path
                shared[CUSTOM_ENTITY_PARSER] = CustomEntityParser.from_path(
                    parser_path)

        nlu_engine = cls(config=model["config"], **shared)

        # pylint:disable=protected-access
        nlu_engine._dataset_metadata = dataset_metadata
        # pylint:enable=protected-access
        intent_parsers = []
        for intent_parser_name in model["intent_parsers"]:
            intent_parser_path = directory_path / intent_parser_name
            intent_parser = load_processing_unit(intent_parser_path, **shared)
            intent_parsers.append(intent_parser)
        nlu_engine.intent_parsers = intent_parsers
        return nlu_engine
Example #8
    def test_should_respect_scope(self):
        # Given
        parser = CustomEntityParser.build(
            DATASET, CustomEntityParserUsage.WITHOUT_STEMS, resources=dict())
        scope = ["dummy_entity_1"]
        text = "dummy_entity_2"

        # When
        result = parser.parse(text, scope=scope)

        # Then
        self.assertListEqual([], result)
Example #9
    def test_entity_match_factory(self):
        # Given
        config = {
            "factory_name": "entity_match",
            "args": {
                "tagging_scheme_code": TaggingScheme.BILOU.value,
                "use_stemming": True
            },
            "offsets": [0]
        }

        tokens = tokenize("2 dummy a had dummy_c", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in tokens]
        factory = get_feature_factory(config)
        dataset = deepcopy(SAMPLE_DATASET)
        dataset = validate_and_format_dataset(dataset)
        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITH_STEMS)
        factory.fit(dataset, "dummy_intent_1")

        # When
        features = factory.build_features(
            custom_entity_parser=custom_entity_parser)
        features = sorted(features, key=lambda f: f.base_name)
        res0 = features[0].compute(0, cache)
        res1 = features[0].compute(1, cache)
        res2 = features[0].compute(2, cache)
        res3 = features[0].compute(3, cache)
        res4 = features[0].compute(4, cache)

        res5 = features[1].compute(0, cache)
        res6 = features[1].compute(1, cache)
        res7 = features[1].compute(2, cache)
        res8 = features[1].compute(3, cache)
        res9 = features[1].compute(4, cache)

        # Then
        self.assertIsInstance(factory, CustomEntityMatchFactory)
        self.assertEqual(len(features), 2)
        self.assertEqual(features[0].base_name, "entity_match_dummy_entity_1")
        self.assertEqual(features[1].base_name, "entity_match_dummy_entity_2")
        self.assertEqual(res0, BEGINNING_PREFIX)
        self.assertEqual(res1, INSIDE_PREFIX)
        self.assertEqual(res2, LAST_PREFIX)
        self.assertEqual(res3, None)
        self.assertEqual(res4, None)

        self.assertEqual(res5, None)
        self.assertEqual(res6, None)
        self.assertEqual(res7, None)
        self.assertEqual(res8, None)
        self.assertEqual(res9, UNIT_PREFIX)
Example #10
    def test_should_be_serializable(self):
        # Given
        parser = CustomEntityParser.build(
            DATASET, CustomEntityParserUsage.WITHOUT_STEMS, resources=dict())
        self.tmp_file_path.mkdir()
        parser_path = self.tmp_file_path / "custom_entity_parser"
        parser.persist(parser_path)
        loaded_parser = CustomEntityParser.from_path(parser_path)

        # When
        scope = ["dummy_entity_1"]
        text = "dummy_entity_1 dummy_1"
        result = loaded_parser.parse(text, scope=scope)

        # Then
        expected_entities = [{
            "value": "dummy_entity_1",
            "resolved_value": "dummy_entity_1",
            "range": {
                "start": 0,
                "end": 14
            },
            "entity_kind": "dummy_entity_1"
        }, {
            "value": "dummy_1",
            "resolved_value": "dummy_entity_1",
            "range": {
                "start": 15,
                "end": 22
            },
            "entity_kind": "dummy_entity_1"
        }]
        self.assertListEqual(expected_entities, result)
        license_path = parser_path / "parser" / "parser_1" / "LICENSE"
        self.assertTrue(license_path.exists())
        with license_path.open(encoding="utf8") as f:
            license_content = f.read()
        self.assertEqual("some license content here", license_content)

    def test_should_parse_without_stems(self):
        # Given
        parser = CustomEntityParser.build(
            DATASET, CustomEntityParserUsage.WITHOUT_STEMS)
        text = "dummy_entity_1 dummy_1 dummy_entity_2 dummy_2"

        # When
        result = parser.parse(text)
        result = sorted(result, key=lambda e: e["range"]["start"])

        # Then
        expected_entities = [
            {
                "value": "dummy_entity_1",
                "resolved_value": "dummy_entity_1",
                "range": {
                    "start": 0,
                    "end": 14
                },
                "entity_kind": "dummy_entity_1"
            },
            {
                "value": "dummy_1",
                "resolved_value": "dummy_entity_1",
                "range": {
                    "start": 15,
                    "end": 22
                },
                "entity_kind": "dummy_entity_1"
            },
            {
                "value": "dummy_entity_2",
                "resolved_value": "dummy_entity_2",
                "range": {
                    "start": 23,
                    "end": 37
                },
                "entity_kind": "dummy_entity_2"
            },
            {
                "value": "dummy_2",
                "resolved_value": "dummy_entity_2",
                "range": {
                    "start": 38,
                    "end": 45
                },
                "entity_kind": "dummy_entity_2"
            }
        ]
        self.assertListEqual(expected_entities, result)
Example #12
    def test_should_use_cache(self, mocked_parse):
        # Given
        mocked_parse.return_value = []
        parser = CustomEntityParser.build(
            DATASET, CustomEntityParserUsage.WITHOUT_STEMS, resources=dict())

        text = ""

        # When
        parser.parse(text)
        parser.parse(text)

        # Then
        self.assertEqual(1, mocked_parse.call_count)
Example #13
    def get_shared_data(cls, dataset, parser_usage=None):
        from snips_nlu.entity_parser import (BuiltinEntityParser,
                                             CustomEntityParser,
                                             CustomEntityParserUsage)

        if parser_usage is None:
            parser_usage = CustomEntityParserUsage.WITH_AND_WITHOUT_STEMS
        resources = cls.get_resources(dataset["language"])
        builtin_entity_parser = BuiltinEntityParser.build(dataset)
        custom_entity_parser = CustomEntityParser.build(
            dataset, parser_usage, resources)
        return {
            "resources": resources,
            "builtin_entity_parser": builtin_entity_parser,
            "custom_entity_parser": custom_entity_parser
        }
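A typical use of such a shared-data helper is to build the resources and entity parsers once per dataset and then pass them to a processing unit as keyword arguments, exactly as the engine constructors in the surrounding examples do. A hedged sketch follows; the SnipsTest fixture name is illustrative and not taken from the examples above.

    # Hedged sketch: build the shared objects once, then reuse them across units.
    shared = SnipsTest.get_shared_data(dataset)  # "SnipsTest" is an illustrative fixture class
    engine = SnipsNLUEngine(**shared)  # receives resources and both entity parsers
    engine.fit(dataset)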
Example #14
    def test_should_be_serializable_into_bytearray(self):
        # Given
        dataset = BEVERAGE_DATASET
        engine = SnipsNLUEngine().fit(dataset)

        # When
        engine_bytes = engine.to_byte_array()
        builtin_entity_parser = BuiltinEntityParser.build(dataset=dataset)
        custom_entity_parser = CustomEntityParser.build(
            dataset, parser_usage=CustomEntityParserUsage.WITHOUT_STEMS)
        loaded_engine = SnipsNLUEngine.from_byte_array(
            engine_bytes, builtin_entity_parser=builtin_entity_parser,
            custom_entity_parser=custom_entity_parser)
        result = loaded_engine.parse("Make me two cups of coffee")

        # Then
        self.assertEqual(result[RES_INTENT][RES_INTENT_NAME], "MakeCoffee")
Example #15
    def fit_custom_entity_parser_if_needed(self, dataset):
        # We only build a custom entity parser when none was provided or when
        # the unit has already been fitted.
        # In all other cases the parser is provided, already fitted, by
        # another unit.
        required_resources = self.config.get_required_resources()
        if not required_resources or not required_resources.get(
                CUSTOM_ENTITY_PARSER_USAGE):
            # In these cases we need a custom entity parser only to do the
            # final slot resolution step, which must be done without stemming.
            parser_usage = CustomEntityParserUsage.WITHOUT_STEMS
        else:
            parser_usage = required_resources[CUSTOM_ENTITY_PARSER_USAGE]

        if self.custom_entity_parser is None or self.fitted:
            self.custom_entity_parser = CustomEntityParser.build(
                dataset, parser_usage)
        return self
Example #16
    def test_should_get_intent_after_deserialization(self):
        # Given
        dataset = validate_and_format_dataset(BEVERAGE_DATASET)
        classifier = LogRegIntentClassifier().fit(dataset)
        classifier.persist(self.tmp_file_path)

        # When
        builtin_entity_parser = BuiltinEntityParser.build(language="en")
        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITHOUT_STEMS)
        loaded_classifier = LogRegIntentClassifier.from_path(
            self.tmp_file_path,
            builtin_entity_parser=builtin_entity_parser,
            custom_entity_parser=custom_entity_parser)
        result = loaded_classifier.get_intent("Make me two cups of tea")

        # Then
        expected_intent = "MakeTea"
        self.assertEqual(expected_intent, result[RES_INTENT_NAME])
Example #17
    def test_should_be_serializable_into_bytearray(self):
        # Given
        dataset = validate_and_format_dataset(BEVERAGE_DATASET)
        intent_classifier = LogRegIntentClassifier().fit(dataset)

        # When
        intent_classifier_bytes = intent_classifier.to_byte_array()
        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITHOUT_STEMS)
        builtin_entity_parser = BuiltinEntityParser.build(language="en")
        loaded_classifier = LogRegIntentClassifier.from_byte_array(
            intent_classifier_bytes,
            builtin_entity_parser=builtin_entity_parser,
            custom_entity_parser=custom_entity_parser)
        result = loaded_classifier.get_intent("make me two cups of tea")

        # Then
        expected_intent = "MakeTea"
        self.assertEqual(expected_intent, result[RES_INTENT_NAME])

    def test_preprocess_for_training(self):
        # Given
        language = LANGUAGE_EN
        resources = {
            STEMS: {
                "beautiful": "beauty",
                "birdy": "bird",
                "entity": "ent"
            },
            WORD_CLUSTERS: {
                "my_word_clusters": {
                    "beautiful": "cluster_1",
                    "birdy": "cluster_2",
                    "entity": "cluster_3"
                }
            },
            STOP_WORDS: set()
        }

        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
    - dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synonyms: false
matching_strictness: 1.0
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synonyms: true
matching_strictness: 1.0
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITH_STEMS, resources)

        builtin_entity_parser = BuiltinEntityParser.build(dataset, language)
        utterances = [{
            "data": [{
                "text": "hÉllo wOrld "
            }, {
                "text": " yo "
            }, {
                "text": " yo "
            }, {
                "text": "yo "
            }, {
                "text": "Éntity_2",
                "entity": "entity_2"
            }, {
                "text": " "
            }, {
                "text": "Éntity_2",
                "entity": "entity_2"
            }]
        }, {
            "data": [{
                "text": "beauTiful World "
            }, {
                "text": "entity 1",
                "entity": "entity_1"
            }, {
                "text": " "
            }, {
                "text": "2",
                "entity": "snips/number"
            }]
        }, {
            "data": [{
                "text": "Bird bïrdy"
            }]
        }, {
            "data": [{
                "text": "Bird birdy"
            }]
        }]

        config = TfidfVectorizerConfig(use_stemming=True,
                                       word_clusters_name="my_word_clusters")
        vectorizer = TfidfVectorizer(
            config=config,
            custom_entity_parser=custom_entity_parser,
            builtin_entity_parser=builtin_entity_parser,
            resources=resources)
        vectorizer._language = language

        # When
        processed_data = vectorizer._preprocess(utterances, training=True)
        processed_data = list(zip(*processed_data))

        # Then
        u_0 = {
            "data": [{
                "text": "hello world"
            }, {
                "text": "yo"
            }, {
                "text": "yo"
            }, {
                "text": "yo"
            }, {
                "text": "entity_2",
                "entity": "entity_2"
            }, {
                "text": ""
            }, {
                "text": "entity_2",
                "entity": "entity_2"
            }]
        }
        u_1 = {
            "data": [{
                "text": "beauty world"
            }, {
                "text": "ent 1",
                "entity": "entity_1"
            }, {
                "text": ""
            }, {
                "text": "2",
                "entity": "snips/number"
            }]
        }
        u_2 = {"data": [{"text": "bird bird"}]}

        ent_00 = {
            "entity_kind": "entity_2",
            "value": "Éntity_2",
            "range": {
                "start": 23,
                "end": 31
            }
        }
        ent_01 = {
            "entity_kind": "entity_2",
            "value": "Éntity_2",
            "range": {
                "start": 32,
                "end": 40
            }
        }

        ent_1 = {
            "entity_kind": "entity_1",
            "value": "entity 1",
            "range": {
                "start": 16,
                "end": 24
            }
        }
        num_1 = {
            "entity_kind": "snips/number",
            "value": "2",
            "range": {
                "start": 25,
                "end": 26
            }
        }

        expected_data = [(u_0, [], [ent_00, ent_01], []),
                         (u_1, [num_1], [ent_1], ["cluster_1", "cluster_3"]),
                         (u_2, [], [], []), (u_2, [], [], ["cluster_2"])]

        self.assertSequenceEqual(expected_data, processed_data)
Example #19
    def test_preprocess(self):
        # Given
        language = LANGUAGE_EN
        resources = {
            STEMS: {
                "beautiful": "beauty",
                "birdy": "bird",
                "entity": "ent"
            },
            WORD_CLUSTERS: {
                "my_word_clusters": {
                    "beautiful": "cluster_1",
                    "birdy": "cluster_2",
                    "entity": "cluster_3"
                }
            },
            STOP_WORDS: set()
        }

        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
    - dummy utterance

---
type: entity
name: entity_1
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]""")

        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITH_STEMS, resources)

        builtin_entity_parser = BuiltinEntityParser.build(dataset, language)
        utterances = [
            text_to_utterance("hÉllo wOrld Éntity_2"),
            text_to_utterance("beauTiful World entity 1"),
            text_to_utterance("Bird bïrdy"),
            text_to_utterance("Bird birdy"),
        ]

        config = TfidfVectorizerConfig(use_stemming=True,
                                       word_clusters_name="my_word_clusters")
        vectorizer = TfidfVectorizer(
            config=config,
            custom_entity_parser=custom_entity_parser,
            builtin_entity_parser=builtin_entity_parser,
            resources=resources)
        vectorizer._language = language
        vectorizer.builtin_entity_scope = {"snips/number"}

        # When
        processed_data = vectorizer._preprocess(utterances)
        processed_data = list(zip(*processed_data))

        # Then
        u_0 = {"data": [{"text": "hello world entity_2"}]}

        u_1 = {"data": [{"text": "beauty world ent 1"}]}

        u_2 = {"data": [{"text": "bird bird"}]}

        u_3 = {"data": [{"text": "bird bird"}]}

        ent_0 = {
            "entity_kind": "entity_2",
            "value": "entity_2",
            "resolved_value": "Éntity 2",
            "range": {
                "start": 12,
                "end": 20
            }
        }
        num_0 = {
            "entity_kind": "snips/number",
            "value": "2",
            "resolved_value": {
                "value": 2.0,
                "kind": "Number"
            },
            "range": {
                "start": 19,
                "end": 20
            }
        }
        ent_11 = {
            "entity_kind": "entity_1",
            "value": "ent 1",
            "resolved_value": "entity 1",
            "range": {
                "start": 13,
                "end": 18
            }
        }
        ent_12 = {
            "entity_kind": "entity_2",
            "value": "ent 1",
            "resolved_value": "entity 1",
            "range": {
                "start": 13,
                "end": 18
            }
        }
        num_1 = {
            "entity_kind": "snips/number",
            "value": "1",
            "range": {
                "start": 23,
                "end": 24
            },
            "resolved_value": {
                "value": 1.0,
                "kind": "Number"
            },
        }

        expected_data = [(u_0, [num_0], [ent_0], []),
                         (u_1, [num_1], [ent_11,
                                         ent_12], ["cluster_1", "cluster_3"]),
                         (u_2, [], [], []), (u_3, [], [], ["cluster_2"])]

        self.assertSequenceEqual(expected_data, processed_data)
Example #20
    def test_preprocess(self):
        # Given
        language = LANGUAGE_EN
        resources = {
            STEMS: {
                "beautiful": "beauty",
                "birdy": "bird",
                "entity": "ent"
            },
            WORD_CLUSTERS: {
                "my_word_clusters": {
                    "beautiful": "cluster_1",
                    "birdy": "cluster_2",
                    "entity": "cluster_3"
                }
            },
            STOP_WORDS: set()
        }

        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
    - dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synonyms: false
matching_strictness: 1.0
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synonyms: true
matching_strictness: 1.0
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]
    """)
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITHOUT_STEMS, resources)

        builtin_entity_parser = BuiltinEntityParser.build(dataset, language)
        u_0 = text_to_utterance("hÉllo wOrld Éntity_2")
        u_1 = text_to_utterance("beauTiful World entity 1")
        u_2 = text_to_utterance("Bird bïrdy")
        u_3 = text_to_utterance("Bird birdy")
        utterances = [u_0, u_1, u_2, u_3]

        vectorizer = CooccurrenceVectorizer(
            custom_entity_parser=custom_entity_parser,
            builtin_entity_parser=builtin_entity_parser,
            resources=resources)

        vectorizer._language = language

        # When
        processed_data = vectorizer._preprocess(utterances)
        processed_data = list(zip(*processed_data))

        # Then
        ent_0 = {
            "entity_kind": "entity_2",
            "value": "Éntity_2",
            "resolved_value": "Éntity 2",
            "range": {
                "start": 12,
                "end": 20
            }
        }
        num_0 = {
            "entity_kind": "snips/number",
            "value": "2",
            "resolved_value": {
                "value": 2.0,
                "kind": "Number"
            },
            "range": {
                "start": 19,
                "end": 20
            }
        }
        ent_11 = {
            "entity_kind": "entity_1",
            "value": "entity 1",
            "resolved_value": "entity 1",
            "range": {
                "start": 16,
                "end": 24
            }
        }
        ent_12 = {
            "entity_kind": "entity_2",
            "value": "entity 1",
            "resolved_value": "entity 1",
            "range": {
                "start": 16,
                "end": 24
            }
        }
        num_1 = {
            "entity_kind": "snips/number",
            "value": "1",
            "range": {
                "start": 23,
                "end": 24
            },
            "resolved_value": {
                "value": 1.0,
                "kind": "Number"
            }
        }

        expected_data = [(u_0, [num_0], [ent_0]),
                         (u_1, [num_1], [ent_11, ent_12]), (u_2, [], []),
                         (u_3, [], [])]

        self.assertSequenceEqual(expected_data, processed_data)
Example #21
    def test_entity_match_factory(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
- this is [entity1](my first entity)
- this is [entity2](second_entity)""")

        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        config = {
            "factory_name": "entity_match",
            "args": {
                "tagging_scheme_code": TaggingScheme.BILOU.value,
                "use_stemming": True
            },
            "offsets": [0]
        }

        tokens = tokenize("my first entity and second_entity", LANGUAGE_EN)
        cache = [{TOKEN_NAME: token} for token in tokens]
        resources = {STEMS: dict()}
        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITH_STEMS, resources)
        factory = CRFFeatureFactory.from_config(
            config,
            custom_entity_parser=custom_entity_parser,
            resources=resources)
        factory.fit(dataset, "my_intent")

        # When
        features = factory.build_features()
        features = sorted(features, key=lambda f: f.base_name)
        res0 = features[0].compute(0, cache)
        res1 = features[0].compute(1, cache)
        res2 = features[0].compute(2, cache)
        res3 = features[0].compute(3, cache)
        res4 = features[0].compute(4, cache)

        res5 = features[1].compute(0, cache)
        res6 = features[1].compute(1, cache)
        res7 = features[1].compute(2, cache)
        res8 = features[1].compute(3, cache)
        res9 = features[1].compute(4, cache)

        # Then
        self.assertIsInstance(factory, CustomEntityMatchFactory)
        self.assertEqual(len(features), 2)
        self.assertEqual(features[0].base_name, "entity_match_entity1")
        self.assertEqual(features[1].base_name, "entity_match_entity2")
        self.assertEqual(res0, BEGINNING_PREFIX)
        self.assertEqual(res1, INSIDE_PREFIX)
        self.assertEqual(res2, LAST_PREFIX)
        self.assertEqual(res3, None)
        self.assertEqual(res4, None)

        self.assertEqual(res5, None)
        self.assertEqual(res6, None)
        self.assertEqual(res7, None)
        self.assertEqual(res8, None)
        self.assertEqual(res9, UNIT_PREFIX)

    def test_preprocess_utterances(self, mocked_parser_stem,
                                   mocked_featurizer_stem,
                                   mocked_word_cluster):
        # Given
        language = LANGUAGE_EN

        def _stem(t):
            t = normalize(t)
            if t == "beautiful":
                s = "beauty"
            elif t == "birdy":
                s = "bird"
            elif t == "entity":
                s = "ent"
            else:
                s = t
            return s

        def stem_function(text, language):
            return get_default_sep(language).join(
                [_stem(t) for t in tokenize_light(text, language)])

        mocked_word_cluster.return_value = {
            "beautiful": "cluster_1",
            "birdy": "cluster_2",
            "entity": "cluster_3"
        }

        mocked_parser_stem.side_effect = stem_function
        mocked_featurizer_stem.side_effect = stem_function

        dataset = {
            "intents": {
                "intent1": {
                    "utterances": []
                }
            },
            "entities": {
                "entity_1": {
                    "data": [{
                        "value": "entity 1",
                        "synonyms": ["alternative entity 1"]
                    }, {
                        "value": "éntity 1",
                        "synonyms": ["alternative entity 1"]
                    }],
                    "use_synonyms":
                    False,
                    "automatically_extensible":
                    False,
                    "matching_strictness":
                    1.0
                },
                "entity_2": {
                    "data": [{
                        "value": "entity 1",
                        "synonyms": []
                    }, {
                        "value": "Éntity 2",
                        "synonyms": ["Éntity_2", "Alternative entity 2"]
                    }],
                    "use_synonyms":
                    True,
                    "automatically_extensible":
                    False,
                    "matching_strictness":
                    1.0
                },
                "snips/number": {}
            },
            "language": "en",
        }

        dataset = validate_and_format_dataset(dataset)

        utterances = [
            text_to_utterance("hÉllo wOrld Éntity_2"),
            text_to_utterance("beauTiful World entity 1"),
            text_to_utterance("Bird bïrdy"),
        ]

        labeled_utterance = {
            DATA: [{
                TEXT: "beauTiful éntity "
            }, {
                TEXT: "1",
                ENTITY: "snips/number",
                SLOT_NAME: "number"
            }, {
                TEXT: " bIrd Éntity_2"
            }]
        }
        utterances.append(labeled_utterance)
        labels = np.array([0, 0, 1, 1])

        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITH_AND_WITHOUT_STEMS)

        featurizer = Featurizer(language,
                                None,
                                custom_entity_parser=custom_entity_parser,
                                config=FeaturizerConfig(
                                    word_clusters_name="brown_clusters",
                                    use_stemming=True)).fit(
                                        dataset, utterances, labels)

        # When
        utterances = featurizer.preprocess_utterances(utterances)

        # Then
        expected_utterances = [
            "hello world entity_2 builtinentityfeaturesnipsnumber "
            "entityfeatureentity_2",
            "beauty world ent 1 builtinentityfeaturesnipsnumber "
            "entityfeatureentity_1 entityfeatureentity_2 "
            "cluster_1 cluster_3", "bird bird",
            "beauty ent bird entity_2 builtinentityfeaturesnipsnumber "
            "builtinentityfeaturesnipsnumber entityfeatureentity_1 "
            "entityfeatureentity_2 entityfeatureentity_2 cluster_1"
        ]

        self.assertListEqual(utterances, expected_utterances)