def test_should_be_serializable_before_fit(self):
    """Persist an unfitted slot filler and check the files it writes."""
    # Given
    shape_ngram_config = {
        "factory_name": ShapeNgramFactory.name,
        "args": {"n": 1},
        "offsets": [0],
    }
    is_digit_config = {
        "factory_name": IsDigitFactory.name,
        "args": {},
        "offsets": [-1, 0],
    }
    config = CRFSlotFillerConfig(
        tagging_scheme=TaggingScheme.BILOU,
        feature_factory_configs=[shape_ngram_config, is_digit_config])
    slot_filler = CRFSlotFiller(config)

    # When
    slot_filler.persist(self.tmp_file_path)

    # Then
    self.assertJsonContent(
        self.tmp_file_path / "metadata.json",
        {"unit_name": "crf_slot_filler"})

    # Nothing was fitted, so every model-related field is expected empty.
    expected_json = {
        "crf_model_file": None,
        "language_code": None,
        "config": config.to_dict(),
        "intent": None,
        "slot_name_mapping": None,
    }
    self.assertJsonContent(
        self.tmp_file_path / "slot_filler.json", expected_json)
def test_should_be_serializable_before_fit(self):
    """Check the dict produced by ``to_dict`` on an unfitted slot filler."""
    # Given
    shape_ngram_config = {
        "factory_name": ShapeNgramFactory.name,
        "args": {"n": 1},
        "offsets": [0],
    }
    is_digit_config = {
        "factory_name": IsDigitFactory.name,
        "args": {},
        "offsets": [-1, 0],
    }
    config = CRFSlotFillerConfig(
        tagging_scheme=TaggingScheme.BILOU,
        feature_factory_configs=[shape_ngram_config, is_digit_config])
    slot_filler = CRFSlotFiller(config)

    # When
    serialized = slot_filler.to_dict()

    # Then
    # Nothing was fitted, so every model-related field is expected empty.
    expected = {
        "unit_name": "crf_slot_filler",
        "crf_model_data": None,
        "language_code": None,
        "config": config.to_dict(),
        "intent": None,
        "slot_name_mapping": None,
    }
    self.assertDictEqual(serialized, expected)
def test_should_be_serializable(self):
    """Persist a fitted slot filler and check the files it writes."""
    # Given
    config = CRFSlotFillerConfig(
        tagging_scheme=TaggingScheme.BILOU,
        feature_factory_configs=[
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1},
                "offsets": [0],
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0],
            },
        ])
    intent = "dummy_intent_1"
    slot_filler = CRFSlotFiller(config)
    slot_filler.fit(SAMPLE_DATASET, intent=intent)

    # When
    slot_filler.persist(self.tmp_file_path)

    # Then
    self.assertJsonContent(
        self.tmp_file_path / "metadata.json",
        {"unit_name": "crf_slot_filler"})

    # The CRF model file is expected next to the JSON files, under the
    # base name of the fitted model's file.
    crf_file_name = Path(slot_filler.crf_model.modelfile.name).name
    self.assertTrue((self.tmp_file_path / crf_file_name).exists())

    # The expected config differs from the input one only by the
    # "language_code" argument of the shape ngram factory.
    expected_config = CRFSlotFillerConfig(
        tagging_scheme=TaggingScheme.BILOU,
        feature_factory_configs=[
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1, "language_code": "en"},
                "offsets": [0],
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0],
            },
        ])
    expected_json = {
        "crf_model_file": crf_file_name,
        "language_code": "en",
        "config": expected_config.to_dict(),
        "intent": intent,
        "slot_name_mapping": {
            "dummy_slot_name": "dummy_entity_1",
            "dummy_slot_name2": "dummy_entity_2",
            "dummy_slot_name3": "dummy_entity_2",
        },
    }
    self.assertJsonContent(
        self.tmp_file_path / "slot_filler.json", expected_json)
def test_should_be_serializable(self, mock_serialize_crf_model):
    """Check ``to_dict`` on a fitted slot filler with a mocked CRF dump."""
    # Given
    mock_serialize_crf_model.return_value = "mocked_crf_model_data"
    config = CRFSlotFillerConfig(
        tagging_scheme=TaggingScheme.BILOU,
        feature_factory_configs=[
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1},
                "offsets": [0],
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0],
            },
        ])
    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    intent = "dummy_intent_1"
    slot_filler = CRFSlotFiller(config)
    slot_filler.fit(dataset, intent=intent)

    # When
    serialized = slot_filler.to_dict()

    # Then
    # The expected config differs from the input one only by the
    # "language_code" argument of the shape ngram factory.
    expected_config = CRFSlotFillerConfig(
        tagging_scheme=TaggingScheme.BILOU,
        feature_factory_configs=[
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1, "language_code": "en"},
                "offsets": [0],
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0],
            },
        ])
    expected = {
        "unit_name": "crf_slot_filler",
        "crf_model_data": "mocked_crf_model_data",
        "language_code": "en",
        "config": expected_config.to_dict(),
        "intent": intent,
        "slot_name_mapping": {
            "dummy_slot_name": "dummy_entity_1",
            "dummy_slot_name2": "dummy_entity_2",
            "dummy_slot_name3": "dummy_entity_2",
        },
    }
    self.assertDictEqual(serialized, expected)
def test_should_get_builtin_slots(self):
    """Extract a datetime slot and a location slot from one sentence."""
    # Given
    intent = "SearchWeatherForecast"
    dataset = validate_and_format_dataset(WEATHER_DATASET)
    slot_filler = CRFSlotFiller(CRFSlotFillerConfig(random_seed=42))
    slot_filler.fit(dataset, intent)

    # When
    slots = slot_filler.get_slots("Give me the weather at 9p.m. in Paris")

    # Then
    datetime_slot = unresolved_slot(
        match_range={START: 20, END: 28},
        value='at 9p.m.',
        entity='snips/datetime',
        slot_name='datetime')
    location_slot = unresolved_slot(
        match_range={START: 32, END: 37},
        value='Paris',
        entity='weather_location',
        slot_name='location')
    self.assertListEqual([datetime_slot, location_slot], slots)
def test_should_get_builtin_slots(self):
    """Extract a datetime slot and a location slot from one sentence."""
    # Given
    yaml_stream = io.StringIO("""
---
type: intent
name: GetWeather
utterances:
- what is the weather [datetime:snips/datetime](at 9pm)
- what's the weather in [location:weather_location](berlin)
- What's the weather in [location](tokyo) [datetime](this weekend)?
- Can you tell me the weather [datetime] please ?
- what is the weather forecast [datetime] in [location](paris)""")
    dataset = Dataset.from_yaml_files("en", [yaml_stream]).json
    intent = "GetWeather"
    shared = self.get_shared_data(dataset)
    slot_filler = CRFSlotFiller(CRFSlotFillerConfig(random_seed=42), **shared)
    slot_filler.fit(dataset, intent)

    # When
    slots = slot_filler.get_slots("Give me the weather at 9pm in Paris")

    # Then
    datetime_slot = unresolved_slot(
        match_range={START: 20, END: 26},
        value='at 9pm',
        entity='snips/datetime',
        slot_name='datetime')
    location_slot = unresolved_slot(
        match_range={START: 30, END: 35},
        value='Paris',
        entity='weather_location',
        slot_name='location')
    self.assertListEqual([datetime_slot, location_slot], slots)
def test_should_get_slots_after_deserialization(self):
    """Fit, persist, reload from disk, then extract slots."""
    # Given
    yaml_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me [number_of_cups:snips/number](one) cup of tea
- i want [number_of_cups] cups of tea please
- can you prepare [number_of_cups] cups of tea ?""")
    dataset = Dataset.from_yaml_files("en", [yaml_stream]).json
    intent = "MakeTea"
    shared = self.get_shared_data(dataset)

    original_filler = CRFSlotFiller(
        CRFSlotFillerConfig(random_seed=42), **shared)
    original_filler.fit(dataset, intent)
    original_filler.persist(self.tmp_file_path)

    # The reloaded instance receives the same shared data as the
    # instance that was persisted.
    reloaded_filler = CRFSlotFiller.from_path(self.tmp_file_path, **shared)

    # When
    slots = reloaded_filler.get_slots("make me two cups of tea")

    # Then
    number_slot = unresolved_slot(
        match_range={START: 8, END: 11},
        value='two',
        entity='snips/number',
        slot_name='number_of_cups')
    self.assertListEqual([number_slot], slots)
def test_should_get_slots(self):
    """Extract a number slot from a sentence after fitting on two utterances."""
    # Given
    yaml_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me [number_of_cups:snips/number](five) cups of tea
- please I want [number_of_cups](two) cups of tea""")
    dataset = Dataset.from_yaml_files("en", [yaml_stream]).json
    shared = self.get_shared_data(dataset)
    slot_filler = CRFSlotFiller(CRFSlotFillerConfig(random_seed=42), **shared)
    slot_filler.fit(dataset, "MakeTea")

    # When
    slots = slot_filler.get_slots("make me two cups of tea")

    # Then
    number_slot = unresolved_slot(
        match_range={START: 8, END: 11},
        value='two',
        entity='snips/number',
        slot_name='number_of_cups')
    self.assertListEqual(slots, [number_slot])
def test_should_compute_features(self):
    """Compute ngram features with drop-out enabled and a fixed seed."""
    # Given
    ngram_factory_config = {
        "factory_name": NgramFactory.name,
        "args": {
            "n": 1,
            "use_stemming": False,
            "common_words_gazetteer_name": None,
        },
        "offsets": [0],
        "drop_out": 0.3,
    }
    config = CRFSlotFillerConfig(
        feature_factory_configs=[ngram_factory_config], random_seed=40)
    slot_filler = CRFSlotFiller(config)
    tokens = tokenize("foo hello world bar", LANGUAGE_EN)
    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    slot_filler.fit(dataset, intent="dummy_intent_1")

    # When
    features = slot_filler.compute_features(tokens, True)

    # Then
    # With this seed the features of the 2nd and 4th tokens are expected
    # to come back empty.
    expected_features = [
        {"ngram_1": "foo"},
        {},
        {"ngram_1": "world"},
        {},
    ]
    self.assertListEqual(expected_features, features)
def test_fitting_should_be_reproducible_after_serialization(self):
    """Two parsers rebuilt from one dict must fit to equal CRF weights."""
    # Given
    validated_dataset = validate_and_format_dataset(BEVERAGE_DATASET)
    classifier_seed = 666
    slot_filler_seed = 42
    config = ProbabilisticIntentParserConfig(
        intent_classifier_config=LogRegIntentClassifierConfig(
            random_seed=classifier_seed),
        slot_filler_config=CRFSlotFillerConfig(
            random_seed=slot_filler_seed))
    parser_dict = ProbabilisticIntentParser(config).to_dict()

    def fit_from_serialized():
        # Each call rebuilds a fresh parser from the same serialized dict.
        rebuilt = ProbabilisticIntentParser.from_dict(parser_dict)
        return rebuilt.fit(validated_dataset)

    # When
    first_parser = fit_from_serialized()
    second_parser = fit_from_serialized()

    # Then
    first_weights = first_parser.slot_fillers[
        "MakeTea"].crf_model.state_features_
    second_weights = second_parser.slot_fillers[
        "MakeTea"].crf_model.state_features_
    self.assertEqual(first_weights, second_weights)
def test_should_get_slots_after_deserialization(self):
    """Fit, persist, reload from disk, then extract slots."""
    # Given
    intent = "MakeTea"
    original_filler = CRFSlotFiller(CRFSlotFillerConfig(random_seed=42))
    original_filler.fit(BEVERAGE_DATASET, intent)
    original_filler.persist(self.tmp_file_path)

    # The reloaded instance is handed the entity parsers of the instance
    # that was persisted.
    entity_parsers = {
        "custom_entity_parser": original_filler.custom_entity_parser,
        "builtin_entity_parser": original_filler.builtin_entity_parser,
    }
    reloaded_filler = CRFSlotFiller.from_path(
        self.tmp_file_path, **entity_parsers)

    # When
    slots = reloaded_filler.get_slots("make me two cups of tea")

    # Then
    number_slot = unresolved_slot(
        match_range={START: 8, END: 11},
        value='two',
        entity='snips/number',
        slot_name='number_of_cups')
    self.assertListEqual([number_slot], slots)
def test_should_get_sub_builtin_slots(self):
    """Extract two datetime slots ("start" and "end") from one sentence."""
    # Given
    yaml_stream = io.StringIO("""
---
type: intent
name: PlanBreak
utterances:
- 'I want to leave from [start:snips/datetime](tomorrow) until
  [end:snips/datetime](next thursday)'
- find me something from [start](9am) to [end](12pm)
- I need a break from [start](2pm) until [end](4pm)
- Can you suggest something from [start](april 4th) until [end](april 6th) ?
- Book me a trip from [start](this friday) to [end](next tuesday)""")
    dataset = Dataset.from_yaml_files("en", [yaml_stream]).json
    intent = "PlanBreak"
    shared = self.get_shared_data(dataset)
    slot_filler = CRFSlotFiller(CRFSlotFillerConfig(random_seed=42), **shared)
    slot_filler.fit(dataset, intent)

    # When
    slots = slot_filler.get_slots("Find me a plan from 5pm to 6pm")

    # Then
    start_slot = unresolved_slot(
        match_range={START: 20, END: 23},
        value="5pm",
        entity="snips/datetime",
        slot_name="start")
    end_slot = unresolved_slot(
        match_range={START: 27, END: 30},
        value="6pm",
        entity="snips/datetime",
        slot_name="end")
    self.assertListEqual([start_slot, end_slot], slots)
def __init__(self, intent_classifier_config=None, slot_filler_config=None):
    """Store the intent classifier and slot filler configurations.

    Args:
        intent_classifier_config: configuration of the intent classifier;
            when None, a default ``LogRegIntentClassifierConfig`` is
            instantiated.
        slot_filler_config: configuration of the slot filler; when None,
            a default ``CRFSlotFillerConfig`` is instantiated.

    Both values go through ``get_processing_unit_config`` before being
    stored on the instance.
    """
    classifier_cfg = intent_classifier_config
    if classifier_cfg is None:
        # Imported only when the default is actually needed.
        from snips_nlu.pipeline.configs import LogRegIntentClassifierConfig
        classifier_cfg = LogRegIntentClassifierConfig()

    filler_cfg = slot_filler_config
    if filler_cfg is None:
        # Imported only when the default is actually needed.
        from snips_nlu.pipeline.configs import CRFSlotFillerConfig
        filler_cfg = CRFSlotFillerConfig()

    self.intent_classifier_config = get_processing_unit_config(
        classifier_cfg)
    self.slot_filler_config = get_processing_unit_config(filler_cfg)
def test_should_be_deserializable_before_fit(self):
    """Load an unfitted slot filler from files written by hand."""
    def build_feature_factories():
        # A fresh list on every call, so the input config and the
        # expected config never share objects.
        return [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1},
                "offsets": [0],
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0],
            },
        ]

    # Given
    input_config = CRFSlotFillerConfig(
        feature_factory_configs=build_feature_factories())
    slot_filler_dict = {
        "unit_name": "crf_slot_filler",
        "crf_model_file": None,
        "language_code": None,
        "intent": None,
        "slot_name_mapping": None,
        "config": input_config.to_dict(),
    }
    metadata = {"unit_name": "crf_slot_filler"}
    self.tmp_file_path.mkdir()
    self.writeJsonContent(self.tmp_file_path / "metadata.json", metadata)
    self.writeJsonContent(
        self.tmp_file_path / "slot_filler.json", slot_filler_dict)

    # When
    slot_filler = CRFSlotFiller.from_path(self.tmp_file_path)

    # Then
    expected_config = CRFSlotFillerConfig(
        feature_factory_configs=build_feature_factories())
    self.assertEqual(slot_filler.crf_model, None)
    self.assertEqual(slot_filler.language, None)
    self.assertEqual(slot_filler.intent, None)
    self.assertEqual(slot_filler.slot_name_mapping, None)
    self.assertDictEqual(
        expected_config.to_dict(), slot_filler.config.to_dict())
def test_should_be_deserializable_before_fit(self,
                                             mock_deserialize_crf_model):
    """Build an unfitted slot filler from a dict with a mocked CRF loader."""
    def build_feature_factories():
        # A fresh list on every call, so the input config and the
        # expected config never share objects.
        return [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1},
                "offsets": [0],
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0],
            },
        ]

    # Given
    mock_deserialize_crf_model.return_value = None
    input_config = CRFSlotFillerConfig(
        feature_factory_configs=build_feature_factories())
    slot_filler_dict = {
        "unit_name": "crf_slot_filler",
        "crf_model_data": None,
        "language_code": None,
        "intent": None,
        "slot_name_mapping": None,
        "config": input_config.to_dict(),
    }

    # When
    slot_filler = CRFSlotFiller.from_dict(slot_filler_dict)

    # Then
    expected_config = CRFSlotFillerConfig(
        feature_factory_configs=build_feature_factories())
    self.assertEqual(slot_filler.crf_model, None)
    self.assertEqual(slot_filler.language, None)
    self.assertEqual(slot_filler.intent, None)
    self.assertEqual(slot_filler.slot_name_mapping, None)
    self.assertDictEqual(
        expected_config.to_dict(), slot_filler.config.to_dict())
def test_probabilistic_intent_parser_config(self):
    """A parser config dict must survive a from_dict / to_dict round trip."""
    # Given
    classifier_config_dict = LogRegIntentClassifierConfig().to_dict()
    slot_filler_config_dict = CRFSlotFillerConfig().to_dict()
    config_dict = {
        "unit_name": "probabilistic_intent_parser",
        "intent_classifier_config": classifier_config_dict,
        "slot_filler_config": slot_filler_config_dict,
    }

    # When
    round_tripped = ProbabilisticIntentParserConfig.from_dict(
        config_dict).to_dict()

    # Then
    self.assertDictEqual(config_dict, round_tripped)
def test_should_compute_features(self):
    """Compute ngram features with drop-out enabled and a fixed seed."""
    # Given
    ngram_factory_config = {
        "factory_name": NgramFactory.name,
        "args": {
            "n": 1,
            "use_stemming": False,
            "common_words_gazetteer_name": None,
        },
        "offsets": [0],
        "drop_out": 0.3,
    }
    config = CRFSlotFillerConfig(
        feature_factory_configs=[ngram_factory_config], random_seed=40)
    tokens = tokenize("foo hello world bar", LANGUAGE_EN)
    yaml_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
- this is [slot1:entity1](my first entity)
- this is [slot2:entity2](second_entity)""")
    dataset = Dataset.from_yaml_files("en", [yaml_stream]).json
    shared = self.get_shared_data(
        dataset, CustomEntityParserUsage.WITHOUT_STEMS)
    slot_filler = CRFSlotFiller(config, **shared)
    slot_filler.fit(dataset, intent="my_intent")

    # When
    features = slot_filler.compute_features(tokens, True)

    # Then
    # With this seed the features of the 2nd and 4th tokens are expected
    # to come back empty.
    expected_features = [
        {"ngram_1": "foo"},
        {},
        {"ngram_1": "world"},
        {},
    ]
    self.assertListEqual(expected_features, features)
def test_should_be_serializable_when_fitted_without_slots(self):
    """Persist a slot filler fitted on an intent that has no slot."""
    # Given
    config = CRFSlotFillerConfig(
        tagging_scheme=TaggingScheme.BILOU,
        feature_factory_configs=[
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1},
                "offsets": [0],
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0],
            },
        ])
    slotless_utterance = {
        "data": [{"text": "This is an utterance without slots"}],
    }
    dataset = {
        "language": "en",
        "intents": {
            "intent1": {"utterances": [slotless_utterance]},
        },
        "entities": {},
    }
    shared = self.get_shared_data(dataset)
    slot_filler = CRFSlotFiller(config, **shared)
    slot_filler.fit(dataset, intent="intent1")

    # When
    slot_filler.persist(self.tmp_file_path)

    # Then
    self.assertJsonContent(
        self.tmp_file_path / "metadata.json",
        {"unit_name": "crf_slot_filler"})
    self.assertIsNone(slot_filler.crf_model)
def test_should_get_slots(self):
    """Extract a number slot from a sentence after fitting."""
    # Given
    intent = "MakeTea"
    dataset = validate_and_format_dataset(BEVERAGE_DATASET)
    slot_filler = CRFSlotFiller(CRFSlotFillerConfig(random_seed=42))
    slot_filler.fit(dataset, intent)

    # When
    slots = slot_filler.get_slots("make me two cups of tea")

    # Then
    number_slot = unresolved_slot(
        match_range={START: 8, END: 11},
        value='two',
        entity='snips/number',
        slot_name='number_of_cups')
    self.assertListEqual(slots, [number_slot])
def test_fitting_should_be_reproducible_after_serialization(self):
    """Two parsers reloaded from one path must fit to equal CRF weights."""
    # Given
    yaml_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me a [beverage_temperature:Temperature](hot) cup of tea
- make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
- make me [number_of_cups:snips/number](one) cup of coffee please
- brew [number_of_cups] cups of coffee""")
    dataset = Dataset.from_yaml_files("en", [yaml_stream]).json
    classifier_seed = 666
    slot_filler_seed = 42
    config = ProbabilisticIntentParserConfig(
        intent_classifier_config=LogRegIntentClassifierConfig(
            random_seed=classifier_seed),
        slot_filler_config=CRFSlotFillerConfig(
            random_seed=slot_filler_seed))
    shared = self.get_shared_data(dataset)
    ProbabilisticIntentParser(config, **shared).persist(self.tmp_file_path)

    def fit_from_disk():
        # Each call reloads a fresh parser from the same persisted files.
        reloaded = ProbabilisticIntentParser.from_path(
            self.tmp_file_path, **shared)
        return reloaded.fit(dataset)

    # When
    first_parser = fit_from_disk()
    second_parser = fit_from_disk()

    # Then
    first_weights = first_parser.slot_fillers[
        "MakeTea"].crf_model.state_features_
    second_weights = second_parser.slot_fillers[
        "MakeTea"].crf_model.state_features_
    self.assertEqual(first_weights, second_weights)
def test_crf_slot_filler_config(self):
    """A slot filler config dict must survive a from_dict / to_dict round trip."""
    # Given
    ngram_factory = {
        "args": {
            "common_words_gazetteer_name": None,
            "use_stemming": True,
            "n": 1,
        },
        "factory_name": "get_ngram_fn",
        "offsets": [-2, -1, 0, 1, 2],
    }
    is_digit_factory = {
        "args": {},
        "factory_name": "is_digit",
        "offsets": [-1, 0, 1],
    }
    augmentation_config_dict = SlotFillerDataAugmentationConfig().to_dict()
    config_dict = {
        "unit_name": "crf_slot_filler",
        "feature_factory_configs": [ngram_factory, is_digit_factory],
        "tagging_scheme": 2,
        "crf_args": {
            "c1": 0.2,
            "c2": 0.3,
            "algorithm": "lbfgs",
        },
        "exhaustive_permutations_threshold": 42,
        "data_augmentation_config": augmentation_config_dict,
        "random_seed": 43,
    }

    # When
    round_tripped = CRFSlotFillerConfig.from_dict(config_dict).to_dict()

    # Then
    self.assertDictEqual(config_dict, round_tripped)
def test_should_be_serializable_before_fitting(self):
    """Persist an unfitted parser and check the files it writes."""
    # Given
    parser = ProbabilisticIntentParser()

    # When
    parser.persist(self.tmp_file_path)

    # Then
    expected_config_dict = {
        "unit_name": "probabilistic_intent_parser",
        "slot_filler_config": CRFSlotFillerConfig().to_dict(),
        "intent_classifier_config":
            LogRegIntentClassifierConfig().to_dict(),
    }
    # No slot filler is expected in the dump of an unfitted parser.
    expected_parser_dict = {
        "config": expected_config_dict,
        "slot_fillers": [],
    }
    self.assertJsonContent(
        self.tmp_file_path / "metadata.json",
        {"unit_name": "probabilistic_intent_parser"})
    self.assertJsonContent(
        self.tmp_file_path / "intent_parser.json", expected_parser_dict)
def test_should_parse_top_intents(self):
    """Parse with ``top_n=2`` and check the ranked intents and slot values."""
    # Given
    yaml_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - "[entity1](foo) bar"

---
type: intent
name: intent2
utterances:
  - foo bar [entity2](baz)

---
type: intent
name: intent3
utterances:
  - foz for [entity3](baz)""")
    dataset = Dataset.from_yaml_files("en", [yaml_stream]).json
    parser_config = ProbabilisticIntentParserConfig(
        LogRegIntentClassifierConfig(random_seed=42),
        CRFSlotFillerConfig(random_seed=42))
    parser = ProbabilisticIntentParser(parser_config)
    parser.fit(dataset)
    text = "foo bar baz"

    # When
    results = parser.parse(text, top_n=2)
    intents = [result[RES_INTENT][RES_INTENT_NAME] for result in results]
    entities = [
        [slot[RES_VALUE] for slot in result[RES_SLOTS]]
        for result in results
    ]

    # Then
    self.assertListEqual(["intent2", "intent1"], intents)
    self.assertListEqual([["baz"], ["foo"]], entities)
def test_should_be_serializable_before_fitting(self):
    """Check the dict produced by ``to_dict`` on an unfitted parser."""
    # Given
    parser = ProbabilisticIntentParser()

    # When
    serialized = parser.to_dict()

    # Then
    expected_config_dict = {
        "unit_name": "probabilistic_intent_parser",
        "slot_filler_config": CRFSlotFillerConfig().to_dict(),
        "intent_classifier_config":
            LogRegIntentClassifierConfig().to_dict(),
    }
    # Neither a classifier nor any slot filler is expected before fitting.
    expected = {
        "unit_name": "probabilistic_intent_parser",
        "config": expected_config_dict,
        "intent_classifier": None,
        "slot_fillers": {},
    }
    self.assertDictEqual(serialized, expected)
def test_refit(self):
    """Fit a slot filler, then fit it again on an extended dataset.

    The test contains no assertion: it passes as long as the second
    ``fit`` call does not raise.
    """
    # Given
    first_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
- this is [entity1](my first entity)""")
    dataset = Dataset.from_yaml_files("en", [first_stream]).json

    second_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
- this is [entity1](my first entity)
- this is [entity1](my first entity) again""")
    updated_dataset = Dataset.from_yaml_files("en", [second_stream]).json

    ngram_factory_config = {
        "args": {
            "common_words_gazetteer_name": "top_10000_words_stemmed",
            "use_stemming": True,
            "n": 1,
        },
        "factory_name": "ngram",
        "offsets": [-2, -1, 0, 1, 2],
    }
    config = CRFSlotFillerConfig(
        feature_factory_configs=[ngram_factory_config])

    # When
    slot_filler = CRFSlotFiller(config).fit(dataset, "my_intent")

    # Then
    slot_filler.fit(updated_dataset, "my_intent")
def test_should_parse_with_filter(self):
    """Parse with an ``intents`` filter and check the intent and slots."""
    yaml_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - "[slot1:entity1](foo) bar"

---
type: intent
name: intent2
utterances:
  - foo bar [slot2:entity2](baz)

---
type: intent
name: intent3
utterances:
  - foz for [slot3:entity3](baz)""")
    dataset = Dataset.from_yaml_files("en", [yaml_stream]).json
    parser_config = ProbabilisticIntentParserConfig(
        LogRegIntentClassifierConfig(random_seed=42),
        CRFSlotFillerConfig(random_seed=42))
    parser = ProbabilisticIntentParser(parser_config)
    parser.fit(dataset)
    text = "foo bar baz"

    # When
    # "intent2" is deliberately left out of the allowed intents.
    result = parser.parse(text, intents=["intent1", "intent3"])

    # Then
    foo_slot = unresolved_slot((0, 3), "foo", "entity1", "slot1")
    self.assertEqual("intent1", result[RES_INTENT][RES_INTENT_NAME])
    self.assertEqual([foo_slot], result[RES_SLOTS])
def test_should_be_deserializable(self):
    """Load a slot filler from hand-written files, CRF model file included."""
    # Given
    language = LANGUAGE_EN

    def build_feature_factories():
        # A fresh list on every call, so the input config and the
        # expected config never share objects.
        return [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1, "language_code": language},
                "offsets": [0],
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0],
            },
        ]

    input_config = CRFSlotFillerConfig(
        feature_factory_configs=build_feature_factories())
    slot_filler_dict = {
        "unit_name": "crf_slot_filler",
        "crf_model_file": "foobar.crfsuite",
        "language_code": "en",
        "intent": "dummy_intent_1",
        "slot_name_mapping": {
            "dummy_intent_1": {
                "dummy_slot_name": "dummy_entity_1",
            },
        },
        "config": input_config.to_dict(),
    }
    metadata = {"unit_name": "crf_slot_filler"}
    self.tmp_file_path.mkdir()
    self.writeJsonContent(self.tmp_file_path / "metadata.json", metadata)
    self.writeJsonContent(
        self.tmp_file_path / "slot_filler.json", slot_filler_dict)
    # Placeholder content standing in for the CRF model file.
    self.writeFileContent(
        self.tmp_file_path / "foobar.crfsuite", "foo bar")

    # When
    slot_filler = CRFSlotFiller.from_path(self.tmp_file_path)

    # Then
    expected_config = CRFSlotFillerConfig(
        feature_factory_configs=build_feature_factories())
    expected_slot_name_mapping = {
        "dummy_intent_1": {
            "dummy_slot_name": "dummy_entity_1",
        },
    }
    self.assertEqual(slot_filler.language, LANGUAGE_EN)
    self.assertEqual(slot_filler.intent, "dummy_intent_1")
    self.assertEqual(
        slot_filler.slot_name_mapping, expected_slot_name_mapping)
    self.assertDictEqual(
        expected_config.to_dict(), slot_filler.config.to_dict())

    # The file referenced by the loaded model must hold the content
    # written above.
    loaded_crf_path = Path(slot_filler.crf_model.modelfile.name)
    self.assertFileContent(loaded_crf_path, "foo bar")
def test_should_be_serializable(self):
    """Persist a fitted slot filler and check the files it writes."""
    # Given
    yaml_stream = io.StringIO("""
---
type: intent
name: my_intent
utterances:
- this is [slot1:entity1](my first entity)
- this is [slot2:entity2](second_entity)""")
    dataset = Dataset.from_yaml_files("en", [yaml_stream]).json
    config = CRFSlotFillerConfig(
        tagging_scheme=TaggingScheme.BILOU,
        feature_factory_configs=[
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1},
                "offsets": [0],
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0],
            },
        ])
    shared = self.get_shared_data(dataset)
    slot_filler = CRFSlotFiller(config, **shared)
    intent = "my_intent"
    slot_filler.fit(dataset, intent=intent)

    # When
    slot_filler.persist(self.tmp_file_path)

    # Then
    self.assertJsonContent(
        self.tmp_file_path / "metadata.json",
        {"unit_name": "crf_slot_filler"})
    self.assertTrue((self.tmp_file_path / CRF_MODEL_FILENAME).exists())

    # The expected config differs from the input one only by the
    # "language_code" argument of the shape ngram factory.
    expected_config = CRFSlotFillerConfig(
        tagging_scheme=TaggingScheme.BILOU,
        feature_factory_configs=[
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1, "language_code": "en"},
                "offsets": [0],
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0],
            },
        ])
    expected_json = {
        "crf_model_file": CRF_MODEL_FILENAME,
        "language_code": "en",
        "config": expected_config.to_dict(),
        "intent": intent,
        "slot_name_mapping": {
            "slot1": "entity1",
            "slot2": "entity2",
        },
    }
    self.assertJsonContent(
        self.tmp_file_path / "slot_filler.json", expected_json)
def test_should_be_deserializable(self, mock_deserialize_crf_model):
    """Build a slot filler from a dict with a mocked CRF loader."""
    # Given
    language = LANGUAGE_EN
    mock_deserialize_crf_model.return_value = None

    def build_feature_factories():
        # A fresh list on every call, so the input config and the
        # expected config never share objects.
        return [
            {
                "factory_name": ShapeNgramFactory.name,
                "args": {"n": 1, "language_code": language},
                "offsets": [0],
            },
            {
                "factory_name": IsDigitFactory.name,
                "args": {},
                "offsets": [-1, 0],
            },
        ]

    input_config = CRFSlotFillerConfig(
        feature_factory_configs=build_feature_factories())
    slot_filler_dict = {
        "unit_name": "crf_slot_filler",
        "crf_model_data": "mocked_crf_model_data",
        "language_code": "en",
        "intent": "dummy_intent_1",
        "slot_name_mapping": {
            "dummy_intent_1": {
                "dummy_slot_name": "dummy_entity_1",
            },
        },
        "config": input_config.to_dict(),
    }

    # When
    slot_filler = CRFSlotFiller.from_dict(slot_filler_dict)

    # Then
    # The mocked loader must have received the serialized model data.
    mock_deserialize_crf_model.assert_called_once_with(
        "mocked_crf_model_data")

    expected_config = CRFSlotFillerConfig(
        feature_factory_configs=build_feature_factories())
    expected_slot_name_mapping = {
        "dummy_intent_1": {
            "dummy_slot_name": "dummy_entity_1",
        },
    }
    self.assertEqual(slot_filler.language, LANGUAGE_EN)
    self.assertEqual(slot_filler.intent, "dummy_intent_1")
    self.assertEqual(
        slot_filler.slot_name_mapping, expected_slot_name_mapping)
    self.assertDictEqual(
        expected_config.to_dict(), slot_filler.config.to_dict())
def test_augment_slots(self):
    """Check ``_augment_slots`` against a mocked sequence probability.

    The probability function is replaced by a fixed table of tag
    sequences, so the expected output depends only on that table.
    """
    # Given
    language = LANGUAGE_EN
    text = "Find me a flight before 10pm and after 8pm"
    tokens = tokenize(text, language)
    missing_slots = {"start_date", "end_date"}
    tags = ['O' for _ in tokens]

    def mocked_sequence_probability(_, tags_):
        outside = 'O'
        start_span = [
            '%sstart_date' % BEGINNING_PREFIX,
            '%sstart_date' % INSIDE_PREFIX,
        ]
        end_span = [
            '%send_date' % BEGINNING_PREFIX,
            '%send_date' % INSIDE_PREFIX,
        ]
        no_span = [outside, outside]
        head = [outside] * 4  # tags of "Find me a flight"
        gap = [outside]       # tag of "and"

        # Ordered (tag sequence, probability) pairs, scanned top to bottom.
        scored_sequences = [
            (head + start_span + gap + end_span, 0.6),
            (head + end_span + gap + start_span, 0.8),
            (head + no_span + gap + no_span, 0.2),
            (head + no_span + gap + start_span, 0.2),
            (head + no_span + gap + end_span, 0.99),
            (head + start_span + gap + no_span, 0.0),
            (head + end_span + gap + no_span, 0.0),
            (head + start_span + gap + start_span, 0.5),
            (head + end_span + gap + end_span, 0.5),
        ]
        for candidate, probability in scored_sequences:
            if tags_ == candidate:
                return probability
        raise ValueError("Unexpected tag sequence: %s" % tags_)

    slot_filler_config = CRFSlotFillerConfig(random_seed=42)
    slot_filler = CRFSlotFiller(config=slot_filler_config)
    slot_filler.language = LANGUAGE_EN
    slot_filler.intent = "intent1"
    slot_filler.slot_name_mapping = {
        "start_date": "snips/datetime",
        "end_date": "snips/datetime",
    }

    # pylint:disable=protected-access
    slot_filler._get_sequence_probability = MagicMock(
        side_effect=mocked_sequence_probability)
    # pylint:enable=protected-access

    # Features are irrelevant here: the probability function is mocked.
    slot_filler.compute_features = MagicMock(return_value=None)

    # When
    # pylint: disable=protected-access
    augmented_slots = slot_filler._augment_slots(
        text, tokens, tags, missing_slots)
    # pylint: enable=protected-access

    # Then
    end_date_slot = unresolved_slot(
        value='after 8pm',
        match_range={START: 33, END: 42},
        entity='snips/datetime',
        slot_name='end_date')
    self.assertListEqual(augmented_slots, [end_date_slot])