Beispiel #1
0
    def test_should_not_disambiguate_grammar_and_gazetteer_entities(self):
        """Overlapping grammar (number) and gazetteer (music track) matches
        must both be returned: the parser performs no disambiguation."""
        # Given
        input_text = "trois nuits par semaine"
        parser = BuiltinEntityParser.build(
            language="fr", gazetteer_entity_scope=["snips/musicTrack"])

        # When
        parsed_entities = parser.parse(input_text)

        # Then
        number_match = {
            "value": "trois",
            "range": {"start": 0, "end": 5},
            "entity": {"kind": "Number", "value": 3.0},
            "entity_kind": "snips/number"
        }
        track_match = {
            "value": "trois nuits par semaine",
            "range": {"start": 0, "end": 23},
            "entity": {"kind": "MusicTrack", "value": "3 nuits par semaine"},
            "entity_kind": "snips/musicTrack"
        }
        self.assertListEqual([number_match, track_match], parsed_entities)
Beispiel #2
0
def validate_and_format_dataset(dataset):
    """Checks that the dataset is valid and format it"""
    # Make this function idempotent
    if dataset.get(VALIDATED, False):
        return dataset
    # Deep copy + JSON round-trip: normalize to plain JSON types without
    # mutating the caller's object
    dataset = json.loads(json.dumps(deepcopy(dataset)))
    validate_type(dataset, dict)
    for mandatory_key in (INTENTS, ENTITIES, LANGUAGE):
        validate_key(dataset, mandatory_key, object_label="dataset")
    validate_type(dataset[ENTITIES], dict)
    validate_type(dataset[INTENTS], dict)
    language = dataset[LANGUAGE]
    validate_type(language, str)
    if language not in get_all_languages():
        raise ValueError("Unknown language: '%s'" % language)

    for intent in itervalues(dataset[INTENTS]):
        validate_and_format_intent(intent, dataset[ENTITIES])

    utterance_entities_values = extract_utterance_entities(dataset)
    builtin_entity_parser = BuiltinEntityParser.build(dataset=dataset)

    for entity_name, entity in iteritems(dataset[ENTITIES]):
        entity_utterance_values = utterance_entities_values[entity_name]
        if is_builtin_entity(entity_name):
            formatted_entity = validate_and_format_builtin_entity(
                entity, entity_utterance_values)
        else:
            formatted_entity = validate_and_format_custom_entity(
                entity, entity_utterance_values, language,
                builtin_entity_parser)
        dataset[ENTITIES][entity_name] = formatted_entity
    dataset[VALIDATED] = True
    return dataset
Beispiel #3
0
    def from_path(cls, path, **shared):
        """Loads a :class:`SnipsNLUEngine` instance from a directory path

        The data at the given path must have been generated using
        :func:`~SnipsNLUEngine.persist`

        Args:
            path (str): The path where the nlu engine is stored
            **shared: Pre-built shared resources (resources, builtin and
                custom entity parsers); any entry provided here takes
                precedence over the persisted one

        Returns:
            The loaded nlu engine instance

        Raises:
            LoadingError: when some files are missing
            IncompatibleModelError: when trying to load an engine model which
                is not compatible with the current version of the lib
        """
        directory_path = Path(path)
        model_path = directory_path / "nlu_engine.json"
        if not model_path.exists():
            raise LoadingError("Missing nlu engine model file: %s" %
                               model_path.name)

        with model_path.open(encoding="utf8") as f:
            model = json.load(f)
        model_version = model.get("model_version")
        if model_version is None or model_version != __model_version__:
            raise IncompatibleModelError(model_version)

        dataset_metadata = model["dataset_metadata"]
        # Load language resources from the persisted model only when the
        # caller did not already provide them
        if shared.get(RESOURCES) is None and dataset_metadata is not None:
            language = dataset_metadata["language_code"]
            resources_dir = directory_path / "resources" / language
            if resources_dir.is_dir():
                shared[RESOURCES] = load_resources_from_dir(resources_dir)

        # Use dedicated locals for the parser sub-paths instead of
        # reassigning the `path` parameter (which shadowed the engine
        # directory argument in the previous version)
        if shared.get(BUILTIN_ENTITY_PARSER) is None:
            builtin_parser_subpath = model["builtin_entity_parser"]
            if builtin_parser_subpath is not None:
                shared[BUILTIN_ENTITY_PARSER] = BuiltinEntityParser.from_path(
                    directory_path / builtin_parser_subpath)

        if shared.get(CUSTOM_ENTITY_PARSER) is None:
            custom_parser_subpath = model["custom_entity_parser"]
            if custom_parser_subpath is not None:
                shared[CUSTOM_ENTITY_PARSER] = CustomEntityParser.from_path(
                    directory_path / custom_parser_subpath)

        config = cls.config_type.from_dict(model["config"])
        nlu_engine = cls(config=config, **shared)
        nlu_engine.dataset_metadata = dataset_metadata
        intent_parsers = []
        # Parser order in the model matches the order of the parser configs
        for parser_idx, parser_name in enumerate(model["intent_parsers"]):
            parser_config = config.intent_parsers_configs[parser_idx]
            intent_parser = IntentParser.load_from_path(
                directory_path / parser_name, parser_config.unit_name,
                **shared)
            intent_parsers.append(intent_parser)
        nlu_engine.intent_parsers = intent_parsers
        return nlu_engine
Beispiel #4
0
 def fit_builtin_entity_parser_if_needed(self, dataset):
     """Ensures a builtin entity parser fitted on *dataset* is available.

     A new parser is built when none exists yet, or when this unit has
     already been fitted (its parser must then be refreshed). Otherwise
     the parser was provided, already fitted, by another unit and is
     kept as-is.
     """
     if self.builtin_entity_parser is not None and not self.fitted:
         # Parser supplied by another unit: nothing to do
         return self
     self.builtin_entity_parser = BuiltinEntityParser.build(dataset=dataset)
     return self
Beispiel #5
0
    def from_path(cls, path, **shared):
        """Load a :class:`SnipsNLUEngine` instance from a directory path

        The data at the given path must have been generated using
        :func:`~SnipsNLUEngine.persist`

        Args:
            path (str): The path where the nlu engine is
                stored.
        """
        directory_path = Path(path)
        model_path = directory_path / "nlu_engine.json"
        if not model_path.exists():
            raise OSError("Missing nlu engine model file: %s" %
                          model_path.name)

        with model_path.open(encoding="utf8") as model_file:
            model = json.load(model_file)
        model_version = model.get("model_version")
        if model_version is None or model_version != __model_version__:
            raise ValueError(
                "Incompatible data model: persisted object=%s, python lib=%s" %
                (model_version, __model_version__))

        dataset_metadata = model["dataset_metadata"]
        if dataset_metadata is not None:
            # Language resources are persisted alongside the engine
            resources_dir = (directory_path / "resources"
                             / dataset_metadata["language_code"])
            if resources_dir.is_dir():
                load_resources_from_dir(resources_dir)

        if shared.get(BUILTIN_ENTITY_PARSER) is None:
            builtin_parser_subpath = model["builtin_entity_parser"]
            if builtin_parser_subpath is not None:
                shared[BUILTIN_ENTITY_PARSER] = BuiltinEntityParser.from_path(
                    directory_path / builtin_parser_subpath)

        if shared.get(CUSTOM_ENTITY_PARSER) is None:
            custom_parser_subpath = model["custom_entity_parser"]
            if custom_parser_subpath is not None:
                shared[CUSTOM_ENTITY_PARSER] = CustomEntityParser.from_path(
                    directory_path / custom_parser_subpath)

        nlu_engine = cls(config=model["config"], **shared)

        # pylint:disable=protected-access
        nlu_engine._dataset_metadata = dataset_metadata
        # pylint:enable=protected-access
        nlu_engine.intent_parsers = [
            load_processing_unit(directory_path / parser_name, **shared)
            for parser_name in model["intent_parsers"]]
        return nlu_engine
Beispiel #6
0
    def test_should_support_all_languages(self):
        """Building a parser and parsing must not raise for any language."""
        # Given
        empty_text = ""

        for language in get_all_languages():
            builtin_parser = BuiltinEntityParser.build(language=language)
            failure_msg = \
                "get_builtin_entities does not support %s." % language
            with self.fail_if_exception(failure_msg):
                # When / Then
                builtin_parser.parse(empty_text)
Beispiel #7
0
    def test_should_respect_scope(self):
        """Only the entity kinds listed in the scope may be returned."""
        # Given
        sentence = "meet me at 10 p.m."
        number_scope = ["snips/number"]

        # When
        en_parser = BuiltinEntityParser.build(language="en")
        matches = en_parser.parse(sentence, scope=number_scope)

        # Then
        self.assertEqual(1, len(matches))
        self.assertEqual("snips/number", matches[0][ENTITY_KIND])
Beispiel #8
0
    def test_should_share_parser(self, mocked_parser):
        """Datasets with an identical gazetteer entity set must reuse the
        same underlying parser instead of building a new one."""
        # Given: the first and third datasets share the same gazetteer
        # entities, so only two distinct parsers should be built in total
        datasets = [
            {
                LANGUAGE: "fr",
                ENTITIES: {
                    "snips/musicArtist": {},
                    "snips/musicTrack": {},
                    "snips/number": {}
                }
            },
            {
                LANGUAGE: "fr",
                ENTITIES: {
                    "snips/musicTrack": {},
                    "snips/musicAlbum": {},
                    "snips/amountOfMoney": {}
                }
            },
            {
                LANGUAGE: "fr",
                ENTITIES: {
                    "snips/musicTrack": {},
                    "snips/musicArtist": {},
                }
            },
        ]

        # When
        for dataset in datasets:
            BuiltinEntityParser.build(dataset=dataset)

        # Then
        self.assertEqual(2, mocked_parser.call_count)
def validate_and_format_dataset(dataset):
    """Checks that the dataset is valid and format it

    Args:
        dataset: The dataset to validate, either a :class:`Dataset` instance
            or a dict following the dataset JSON format

    Returns:
        dict: The validated and formatted dataset, marked as validated so
        this function is idempotent

    Raises:
        DatasetFormatError: When the dataset format is wrong
    """
    from snips_nlu_parsers import get_all_languages

    if isinstance(dataset, Dataset):
        dataset = dataset.json

    # Make this function idempotent
    if dataset.get(VALIDATED, False):
        return dataset
    # Deep copy + JSON round-trip: normalize to plain JSON types without
    # mutating the caller's object
    dataset = deepcopy(dataset)
    dataset = json.loads(json.dumps(dataset))
    validate_type(dataset, dict, object_label="dataset")
    mandatory_keys = [INTENTS, ENTITIES, LANGUAGE]
    for key in mandatory_keys:
        validate_key(dataset, key, object_label="dataset")
    validate_type(dataset[ENTITIES], dict, object_label="entities")
    validate_type(dataset[INTENTS], dict, object_label="intents")
    language = dataset[LANGUAGE]
    validate_type(language, str, object_label="language")
    if language not in get_all_languages():
        raise DatasetFormatError("Unknown language: '%s'" % language)

    # Sort intents by name so the formatted dataset is deterministic
    dataset[INTENTS] = {
        intent_name: intent_data
        for intent_name, intent_data in sorted(iteritems(dataset[INTENTS]))}
    for intent in itervalues(dataset[INTENTS]):
        _validate_and_format_intent(intent, dataset[ENTITIES])

    utterance_entities_values = extract_utterance_entities(dataset)
    builtin_entity_parser = BuiltinEntityParser.build(dataset=dataset)

    # Sort entities by name as well (fixed copy-pasted "intent_*" names)
    dataset[ENTITIES] = {
        entity_name: entity_data
        for entity_name, entity_data in sorted(iteritems(dataset[ENTITIES]))}

    for entity_name, entity in iteritems(dataset[ENTITIES]):
        utterance_entities = utterance_entities_values[entity_name]
        if is_builtin_entity(entity_name):
            dataset[ENTITIES][entity_name] = \
                _validate_and_format_builtin_entity(entity, utterance_entities)
        else:
            dataset[ENTITIES][entity_name] = \
                _validate_and_format_custom_entity(
                    entity, utterance_entities, language,
                    builtin_entity_parser)
    dataset[VALIDATED] = True
    return dataset
Beispiel #10
0
    def test_should_respect_scope_with_gazetteer_entity(self):
        """Each parse call must only yield the gazetteer entity kinds
        requested in its scope, even when more kinds are loaded."""
        # Given
        sentence = "je veux écouter metallica"

        # When
        parser = BuiltinEntityParser.build(
            language="fr",
            gazetteer_entity_scope=["snips/musicArtist", "snips/musicAlbum"])
        artist_parse = parser.parse(sentence, scope=["snips/musicArtist"])
        album_parse = parser.parse(sentence, scope=["snips/musicAlbum"])

        # Then
        def single_match(kind, entity_kind):
            # Both scopes match the same span, only the entity kind differs
            return [{
                "entity": {"kind": kind, "value": "Metallica"},
                "entity_kind": entity_kind,
                "range": {"end": 25, "start": 16},
                "value": "metallica"
            }]

        self.assertEqual(
            single_match("MusicArtist", "snips/musicArtist"), artist_parse)
        self.assertEqual(
            single_match("MusicAlbum", "snips/musicAlbum"), album_parse)
Beispiel #11
0
    def test_should_parse_gazetteer_entities(self):
        """Gazetteer entities in scope are resolved from the input text."""
        # Given
        sentence = "je veux ecouter les daft punk s'il vous plait"
        parser = BuiltinEntityParser.build(
            language="fr", gazetteer_entity_scope=["snips/musicArtist"])

        # When / Then
        matches = parser.parse(sentence)

        expected_matches = [{
            "entity": {"kind": "MusicArtist", "value": "Daft Punk"},
            "entity_kind": "snips/musicArtist",
            "range": {"end": 29, "start": 20},
            "value": "daft punk"
        }]
        self.assertEqual(matches, expected_matches)
Beispiel #12
0
    def test_should_parse_grammar_entities(self):
        """Grammar entities (here a number) are parsed with no explicit
        scope."""
        # Given
        sentence = "we'll be 2 at the meeting"
        parser = BuiltinEntityParser.build(language="en")

        # When / Then
        matches = parser.parse(sentence)

        expected_matches = [{
            "entity": {"kind": "Number", "value": 2.0},
            "entity_kind": "snips/number",
            "range": {"end": 10, "start": 9},
            "value": "2"
        }]
        self.assertEqual(matches, expected_matches)
Beispiel #13
0
    def test_should_share_parser(self, mocked_build_builtin_parser):
        """The underlying builder must only run once per distinct gazetteer
        entity set, so three datasets where two sets coincide yield two
        builder calls."""
        # Given

        # pylint:disable=unused-argument
        def fake_build_builtin_parser(language, gazetteer_entity_scope):
            return None

        # pylint:enable=unused-argument

        mocked_build_builtin_parser.side_effect = fake_build_builtin_parser

        # Datasets 1 and 3 share the same gazetteer entities
        datasets = [
            {
                LANGUAGE: "fr",
                ENTITIES: {
                    "snips/musicArtist": {},
                    "snips/musicTrack": {},
                    "snips/number": {}
                }
            },
            {
                LANGUAGE: "fr",
                ENTITIES: {
                    "snips/musicTrack": {},
                    "snips/musicAlbum": {},
                    "snips/amountOfMoney": {}
                }
            },
            {
                LANGUAGE: "fr",
                ENTITIES: {
                    "snips/musicTrack": {},
                    "snips/musicArtist": {},
                }
            },
        ]

        # When
        for dataset in datasets:
            BuiltinEntityParser.build(dataset=dataset)

        # Then
        self.assertEqual(2, mocked_build_builtin_parser.call_count)