def test_should_not_disambiguate_grammar_and_gazetteer_entities(self):
    # Given
    text = "trois nuits par semaine"
    gazetteer_entities = ["snips/musicTrack"]
    parser = BuiltinEntityParser.build(
        language="fr", gazetteer_entity_scope=gazetteer_entities)

    # When
    result = parser.parse(text)

    # Then
    expected_result = [
        {
            "value": "trois",
            "range": {
                "start": 0,
                "end": 5
            },
            "entity": {
                "kind": "Number",
                "value": 3.0
            },
            "entity_kind": "snips/number"
        },
        {
            "value": "trois nuits par semaine",
            "range": {
                "start": 0,
                "end": 23
            },
            "entity": {
                "kind": "MusicTrack",
                "value": "3 nuits par semaine"
            },
            "entity_kind": "snips/musicTrack"
        }
    ]
    self.assertListEqual(expected_result, result)
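# The test above documents that overlapping grammar and gazetteer matches are
# both returned, without disambiguation. Below is a minimal sketch of how a
# caller could resolve such overlaps by preferring the longest span; this
# helper is hypothetical and not part of the library API.
def keep_longest_matches(parsed_entities):
    """Keeps only the longest match among overlapping parsed entities"""
    # Longest spans first, so they claim their character ranges before
    # shorter overlapping matches are considered
    by_length = sorted(
        parsed_entities,
        key=lambda e: e["range"]["end"] - e["range"]["start"],
        reverse=True)
    kept = []
    for candidate in by_length:
        overlaps = any(
            candidate["range"]["start"] < other["range"]["end"]
            and other["range"]["start"] < candidate["range"]["end"]
            for other in kept)
        if not overlaps:
            kept.append(candidate)
    return sorted(kept, key=lambda e: e["range"]["start"])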
def validate_and_format_dataset(dataset):
    """Checks that the dataset is valid and formats it"""
    # Make this function idempotent
    if dataset.get(VALIDATED, False):
        return dataset
    dataset = deepcopy(dataset)
    dataset = json.loads(json.dumps(dataset))
    validate_type(dataset, dict)
    mandatory_keys = [INTENTS, ENTITIES, LANGUAGE]
    for key in mandatory_keys:
        validate_key(dataset, key, object_label="dataset")
    validate_type(dataset[ENTITIES], dict)
    validate_type(dataset[INTENTS], dict)
    language = dataset[LANGUAGE]
    validate_type(language, str)
    if language not in get_all_languages():
        raise ValueError("Unknown language: '%s'" % language)

    for intent in itervalues(dataset[INTENTS]):
        validate_and_format_intent(intent, dataset[ENTITIES])

    utterance_entities_values = extract_utterance_entities(dataset)
    builtin_entity_parser = BuiltinEntityParser.build(dataset=dataset)

    for entity_name, entity in iteritems(dataset[ENTITIES]):
        utterance_entities = utterance_entities_values[entity_name]
        if is_builtin_entity(entity_name):
            dataset[ENTITIES][entity_name] = \
                validate_and_format_builtin_entity(entity, utterance_entities)
        else:
            dataset[ENTITIES][entity_name] = \
                validate_and_format_custom_entity(
                    entity, utterance_entities, language,
                    builtin_entity_parser)

    dataset[VALIDATED] = True
    return dataset
def from_path(cls, path, **shared):
    """Loads a :class:`SnipsNLUEngine` instance from a directory path

    The data at the given path must have been generated using
    :func:`~SnipsNLUEngine.persist`

    Args:
        path (str): The path where the nlu engine is stored

    Raises:
        LoadingError: when some files are missing
        IncompatibleModelError: when trying to load an engine model which
            is not compatible with the current version of the lib
    """
    directory_path = Path(path)
    model_path = directory_path / "nlu_engine.json"
    if not model_path.exists():
        raise LoadingError("Missing nlu engine model file: %s"
                           % model_path.name)

    with model_path.open(encoding="utf8") as f:
        model = json.load(f)
    model_version = model.get("model_version")
    if model_version is None or model_version != __model_version__:
        raise IncompatibleModelError(model_version)

    dataset_metadata = model["dataset_metadata"]
    if shared.get(RESOURCES) is None and dataset_metadata is not None:
        language = dataset_metadata["language_code"]
        resources_dir = directory_path / "resources" / language
        if resources_dir.is_dir():
            resources = load_resources_from_dir(resources_dir)
            shared[RESOURCES] = resources

    if shared.get(BUILTIN_ENTITY_PARSER) is None:
        path = model["builtin_entity_parser"]
        if path is not None:
            parser_path = directory_path / path
            shared[BUILTIN_ENTITY_PARSER] = BuiltinEntityParser.from_path(
                parser_path)

    if shared.get(CUSTOM_ENTITY_PARSER) is None:
        path = model["custom_entity_parser"]
        if path is not None:
            parser_path = directory_path / path
            shared[CUSTOM_ENTITY_PARSER] = CustomEntityParser.from_path(
                parser_path)

    config = cls.config_type.from_dict(model["config"])
    nlu_engine = cls(config=config, **shared)
    nlu_engine.dataset_metadata = dataset_metadata
    intent_parsers = []
    for parser_idx, parser_name in enumerate(model["intent_parsers"]):
        parser_config = config.intent_parsers_configs[parser_idx]
        intent_parser_path = directory_path / parser_name
        intent_parser = IntentParser.load_from_path(
            intent_parser_path, parser_config.unit_name, **shared)
        intent_parsers.append(intent_parser)

    nlu_engine.intent_parsers = intent_parsers
    return nlu_engine
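# A minimal usage sketch of the persistence round trip, assuming an engine
# was previously trained and saved with `persist`; "path/to/engine" is a
# placeholder directory.
from snips_nlu import SnipsNLUEngine

engine = SnipsNLUEngine.from_path("path/to/engine")
parsing = engine.parse("Turn the lights on in the kitchen")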
def fit_builtin_entity_parser_if_needed(self, dataset):
    # We only fit a builtin entity parser when the unit has already been
    # fitted or when the parser is None.
    # In all other cases the parser is provided, already fitted, by another
    # unit.
    if self.builtin_entity_parser is None or self.fitted:
        self.builtin_entity_parser = BuiltinEntityParser.build(
            dataset=dataset)
    return self
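# A minimal sketch of the sharing pattern this method enables, assuming
# `BuiltinEntityParser` is in scope as above; `DemoUnit` is a hypothetical
# stub standing in for a real processing unit.
class DemoUnit(object):
    def __init__(self, builtin_entity_parser=None):
        self.builtin_entity_parser = builtin_entity_parser
        self.fitted = False

    def fit_builtin_entity_parser_if_needed(self, dataset):
        if self.builtin_entity_parser is None or self.fitted:
            self.builtin_entity_parser = BuiltinEntityParser.build(
                dataset=dataset)
        return self

dataset = {"language": "en", "entities": {"snips/number": {}}}
shared_parser = BuiltinEntityParser.build(dataset=dataset)
# A unit without a parser builds its own; a unit holding the shared parser
# keeps it, since it is neither parser-less nor already fitted
DemoUnit().fit_builtin_entity_parser_if_needed(dataset)
unit = DemoUnit(builtin_entity_parser=shared_parser)
unit.fit_builtin_entity_parser_if_needed(dataset)
assert unit.builtin_entity_parser is shared_parser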
def from_path(cls, path, **shared):
    """Load a :class:`SnipsNLUEngine` instance from a directory path

    The data at the given path must have been generated using
    :func:`~SnipsNLUEngine.persist`

    Args:
        path (str): The path where the nlu engine is stored.
    """
    directory_path = Path(path)
    model_path = directory_path / "nlu_engine.json"
    if not model_path.exists():
        raise OSError("Missing nlu engine model file: %s"
                      % model_path.name)

    with model_path.open(encoding="utf8") as f:
        model = json.load(f)
    model_version = model.get("model_version")
    if model_version is None or model_version != __model_version__:
        raise ValueError(
            "Incompatible data model: persisted object=%s, python lib=%s"
            % (model_version, __model_version__))

    dataset_metadata = model["dataset_metadata"]
    if dataset_metadata is not None:
        language = dataset_metadata["language_code"]
        resources_dir = directory_path / "resources" / language
        if resources_dir.is_dir():
            load_resources_from_dir(resources_dir)

    if shared.get(BUILTIN_ENTITY_PARSER) is None:
        path = model["builtin_entity_parser"]
        if path is not None:
            parser_path = directory_path / path
            shared[BUILTIN_ENTITY_PARSER] = BuiltinEntityParser.from_path(
                parser_path)

    if shared.get(CUSTOM_ENTITY_PARSER) is None:
        path = model["custom_entity_parser"]
        if path is not None:
            parser_path = directory_path / path
            shared[CUSTOM_ENTITY_PARSER] = CustomEntityParser.from_path(
                parser_path)

    nlu_engine = cls(config=model["config"], **shared)
    # pylint:disable=protected-access
    nlu_engine._dataset_metadata = dataset_metadata
    # pylint:enable=protected-access
    intent_parsers = []
    for intent_parser_name in model["intent_parsers"]:
        intent_parser_path = directory_path / intent_parser_name
        intent_parser = load_processing_unit(intent_parser_path, **shared)
        intent_parsers.append(intent_parser)

    nlu_engine.intent_parsers = intent_parsers
    return nlu_engine
def test_should_support_all_languages(self):
    # Given
    text = ""
    for language in get_all_languages():
        parser = BuiltinEntityParser.build(language=language)
        msg = "get_builtin_entities does not support %s." % language
        with self.fail_if_exception(msg):
            # When / Then
            parser.parse(text)
def test_should_respect_scope(self):
    # Given
    text = "meet me at 10 p.m."

    # When
    scope = ["snips/number"]
    parser = BuiltinEntityParser.build(language="en")
    parse = parser.parse(text, scope=scope)

    # Then
    self.assertEqual(len(parse), 1)
    self.assertEqual(parse[0][ENTITY_KIND], "snips/number")
def test_should_share_parser(self, mocked_parser):
    # Given
    dataset1 = {
        LANGUAGE: "fr",
        ENTITIES: {
            "snips/musicArtist": {},
            "snips/musicTrack": {},
            "snips/number": {}
        }
    }
    dataset2 = {
        LANGUAGE: "fr",
        ENTITIES: {
            "snips/musicTrack": {},
            "snips/musicAlbum": {},
            "snips/amountOfMoney": {}
        }
    }
    dataset3 = {
        LANGUAGE: "fr",
        ENTITIES: {
            "snips/musicTrack": {},
            "snips/musicArtist": {},
        }
    }

    # When
    BuiltinEntityParser.build(dataset=dataset1)
    BuiltinEntityParser.build(dataset=dataset2)
    BuiltinEntityParser.build(dataset=dataset3)

    # Then
    self.assertEqual(2, mocked_parser.call_count)
def validate_and_format_dataset(dataset):
    """Checks that the dataset is valid and formats it

    Raises:
        DatasetFormatError: When the dataset format is wrong
    """
    from snips_nlu_parsers import get_all_languages

    if isinstance(dataset, Dataset):
        dataset = dataset.json

    # Make this function idempotent
    if dataset.get(VALIDATED, False):
        return dataset
    dataset = deepcopy(dataset)
    dataset = json.loads(json.dumps(dataset))
    validate_type(dataset, dict, object_label="dataset")
    mandatory_keys = [INTENTS, ENTITIES, LANGUAGE]
    for key in mandatory_keys:
        validate_key(dataset, key, object_label="dataset")
    validate_type(dataset[ENTITIES], dict, object_label="entities")
    validate_type(dataset[INTENTS], dict, object_label="intents")
    language = dataset[LANGUAGE]
    validate_type(language, str, object_label="language")
    if language not in get_all_languages():
        raise DatasetFormatError("Unknown language: '%s'" % language)

    dataset[INTENTS] = {
        intent_name: intent_data
        for intent_name, intent_data in sorted(iteritems(dataset[INTENTS]))}
    for intent in itervalues(dataset[INTENTS]):
        _validate_and_format_intent(intent, dataset[ENTITIES])

    utterance_entities_values = extract_utterance_entities(dataset)
    builtin_entity_parser = BuiltinEntityParser.build(dataset=dataset)

    dataset[ENTITIES] = {
        entity_name: entity_data
        for entity_name, entity_data in sorted(iteritems(dataset[ENTITIES]))}
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        utterance_entities = utterance_entities_values[entity_name]
        if is_builtin_entity(entity_name):
            dataset[ENTITIES][entity_name] = \
                _validate_and_format_builtin_entity(
                    entity, utterance_entities)
        else:
            dataset[ENTITIES][entity_name] = \
                _validate_and_format_custom_entity(
                    entity, utterance_entities, language,
                    builtin_entity_parser)

    dataset[VALIDATED] = True
    return dataset
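# A minimal usage sketch, assuming the documented JSON dataset format; the
# intent name and utterance below are illustrative only.
dataset = {
    "language": "en",
    "intents": {
        "turnLightOn": {
            "utterances": [
                {"data": [{"text": "turn the light on"}]}
            ]
        }
    },
    "entities": {}
}
validated = validate_and_format_dataset(dataset)
# The function is idempotent: a validated dataset is returned unchanged
assert validate_and_format_dataset(validated) is validated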
def test_should_respect_scope_with_gazetteer_entity(self):
    # Given
    text = "je veux écouter metallica"

    # When
    gazetteer_entities = ["snips/musicArtist", "snips/musicAlbum"]
    parser = BuiltinEntityParser.build(
        language="fr", gazetteer_entity_scope=gazetteer_entities)

    scope1 = ["snips/musicArtist"]
    parse1 = parser.parse(text, scope=scope1)
    scope2 = ["snips/musicAlbum"]
    parse2 = parser.parse(text, scope=scope2)

    # Then
    expected_parse1 = [{
        "entity": {
            "kind": "MusicArtist",
            "value": "Metallica"
        },
        "entity_kind": "snips/musicArtist",
        "range": {
            "end": 25,
            "start": 16
        },
        "value": "metallica"
    }]
    expected_parse2 = [{
        "entity": {
            "kind": "MusicAlbum",
            "value": "Metallica"
        },
        "entity_kind": "snips/musicAlbum",
        "range": {
            "end": 25,
            "start": 16
        },
        "value": "metallica"
    }]
    self.assertEqual(expected_parse1, parse1)
    self.assertEqual(expected_parse2, parse2)
def test_should_parse_gazetteer_entities(self):
    # Given
    text = "je veux ecouter les daft punk s'il vous plait"
    parser = BuiltinEntityParser.build(
        language="fr", gazetteer_entity_scope=["snips/musicArtist"])

    # When / Then
    parse = parser.parse(text)
    expected_parse = [{
        "entity": {
            "kind": "MusicArtist",
            "value": "Daft Punk"
        },
        "entity_kind": "snips/musicArtist",
        "range": {
            "end": 29,
            "start": 20
        },
        "value": "daft punk"
    }]
    self.assertEqual(parse, expected_parse)
def test_should_parse_grammar_entities(self):
    # Given
    text = "we'll be 2 at the meeting"
    language = "en"
    parser = BuiltinEntityParser.build(language=language)

    # When / Then
    parse = parser.parse(text)
    expected_parse = [{
        "entity": {
            "kind": "Number",
            "value": 2.0
        },
        "entity_kind": "snips/number",
        "range": {
            "end": 10,
            "start": 9
        },
        "value": "2"
    }]
    self.assertEqual(parse, expected_parse)
def test_should_share_parser(self, mocked_build_builtin_parser):
    # Given
    # pylint:disable=unused-argument
    def mock_build_builtin_parser(language, gazetteer_entity_scope):
        return None
    # pylint:enable=unused-argument

    mocked_build_builtin_parser.side_effect = mock_build_builtin_parser
    dataset1 = {
        LANGUAGE: "fr",
        ENTITIES: {
            "snips/musicArtist": {},
            "snips/musicTrack": {},
            "snips/number": {}
        }
    }
    dataset2 = {
        LANGUAGE: "fr",
        ENTITIES: {
            "snips/musicTrack": {},
            "snips/musicAlbum": {},
            "snips/amountOfMoney": {}
        }
    }
    dataset3 = {
        LANGUAGE: "fr",
        ENTITIES: {
            "snips/musicTrack": {},
            "snips/musicArtist": {},
        }
    }

    # When
    BuiltinEntityParser.build(dataset=dataset1)
    BuiltinEntityParser.build(dataset=dataset2)
    BuiltinEntityParser.build(dataset=dataset3)

    # Then
    self.assertEqual(2, mocked_build_builtin_parser.call_count)
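# The sharing verified above (dataset1 and dataset3 have the same gazetteer
# scope, so only two parsers are built) can be obtained by memoising the
# expensive low-level parser on its build arguments. A minimal sketch of that
# caching pattern, not the library's actual implementation;
# `_build_builtin_entity_parser` is a hypothetical name standing in for the
# builder the test mocks.
_PARSER_CACHE = {}

def build_cached_parser(language, gazetteer_entity_scope):
    # Sort the scope so datasets listing the same gazetteer entities in a
    # different order hit the same cache entry
    cache_key = (language, tuple(sorted(gazetteer_entity_scope)))
    if cache_key not in _PARSER_CACHE:
        _PARSER_CACHE[cache_key] = _build_builtin_entity_parser(
            language, gazetteer_entity_scope)
    return _PARSER_CACHE[cache_key]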