def generate_dataset(language, *files):
    """Create a Snips NLU dataset from text-friendly files"""
    if any(f.endswith(".yml") or f.endswith(".yaml") for f in files):
        dataset = Dataset.from_yaml_files(language, list(files))
    else:
        dataset = Dataset.from_files(language, list(files))
    print(json.dumps(dataset.json, indent=2, sort_keys=True))
def test_should_parse_with_filter(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - "[slot1:entity1](foo) bar"

---
type: intent
name: intent2
utterances:
  - foo bar [slot2:entity2](baz)

---
type: intent
name: intent3
utterances:
  - foz for [slot3:entity3](baz)""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    shared = self.get_shared_data(dataset)
    shared[RANDOM_STATE] = 42
    parser = ProbabilisticIntentParser(**shared)
    parser.fit(dataset)
    text = "foo bar baz"

    # When
    result = parser.parse(text, intents=["intent1", "intent3"])

    # Then
    expected_slots = [unresolved_slot((0, 3), "foo", "entity1", "slot1")]
    self.assertEqual("intent1", result[RES_INTENT][RES_INTENT_NAME])
    self.assertEqual(expected_slots, result[RES_SLOTS])
def test_should_get_slots_after_deserialization(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
  - make me [number_of_cups:snips/number](one) cup of tea
  - i want [number_of_cups] cups of tea please
  - can you prepare [number_of_cups] cups of tea ?""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    intent = "MakeTea"
    shared = self.get_shared_data(dataset)
    shared[RANDOM_STATE] = 42
    slot_filler = CRFSlotFiller(**shared)
    slot_filler.fit(dataset, intent)
    slot_filler.persist(self.tmp_file_path)
    deserialized_slot_filler = CRFSlotFiller.from_path(
        self.tmp_file_path, **shared)

    # When
    slots = deserialized_slot_filler.get_slots("make me two cups of tea")

    # Then
    expected_slots = [
        unresolved_slot(match_range={START: 8, END: 11},
                        value='two',
                        entity='snips/number',
                        slot_name='number_of_cups')
    ]
    self.assertListEqual(expected_slots, slots)
def test_should_get_intents(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - yala yili

---
type: intent
name: intent2
utterances:
  - yala yili yulu

---
type: intent
name: intent3
utterances:
  - yili yulu yele""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    classifier_config = LogRegIntentClassifierConfig(random_seed=42)
    parser_config = ProbabilisticIntentParserConfig(classifier_config)
    parser = ProbabilisticIntentParser(parser_config).fit(dataset)
    text = "yala yili yulu"

    # When
    results = parser.get_intents(text)
    intents = [res[RES_INTENT_NAME] for res in results]

    # Then
    expected_intents = ["intent2", "intent1", "intent3", None]
    self.assertEqual(expected_intents, intents)
def test_get_slots_should_raise_with_unknown_intent(self):
    # Given
    slots_dataset_stream = io.StringIO("""
---
type: intent
name: greeting1
utterances:
  - Hello [name1](John)

---
type: intent
name: goodbye
utterances:
  - Goodbye [name](Eric)""")
    dataset = Dataset.from_yaml_files("en", [slots_dataset_stream]).json

    # pylint:disable=unused-variable
    @IntentClassifier.register("my_intent_classifier", True)
    class MyIntentClassifier(MockIntentClassifier):
        pass

    @SlotFiller.register("my_slot_filler", True)
    class MySlotFiller(MockSlotFiller):
        pass

    # pylint:enable=unused-variable

    config = ProbabilisticIntentParserConfig(
        intent_classifier_config="my_intent_classifier",
        slot_filler_config="my_slot_filler")
    parser = ProbabilisticIntentParser(config).fit(dataset)

    # When / Then
    with self.assertRaises(IntentNotFoundError):
        parser.get_slots("Hello John", "greeting3")
def test_parse_should_raise_with_unknown_intent_in_filter(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: greeting1
utterances:
  - Hello [name1](John)

---
type: intent
name: goodbye
utterances:
  - Goodbye [name](Eric)""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    # pylint:disable=unused-variable
    @IntentParser.register("my_intent_parser", True)
    class FirstIntentParser(MockIntentParser):
        pass

    # pylint:enable=unused-variable

    config = NLUEngineConfig(["my_intent_parser"])
    nlu_engine = SnipsNLUEngine(config).fit(dataset)

    # When / Then
    with self.assertRaises(IntentNotFoundError):
        nlu_engine.parse("Hello John", intents="greeting3")
    with self.assertRaises(IntentNotFoundError):
        nlu_engine.parse("Hello John", intents=["greeting3"])
def test_should_persist_resources_from_memory(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
  - make me a [beverage_temperature:Temperature](hot) cup of tea
  - make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
  - make me [number_of_cups:snips/number](one) cup of coffee please
  - brew [number_of_cups] cups of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    shared = self.get_shared_data(dataset)
    engine = SnipsNLUEngine(**shared).fit(dataset)
    dir_temp_engine = self.fixture_dir / "temp_engine"
    engine.persist(dir_temp_engine)

    # When
    loaded_engine = SnipsNLUEngine.from_path(dir_temp_engine)
    shutil.rmtree(str(dir_temp_engine))

    # Then
    loaded_engine.to_byte_array()
def test_should_limit_nb_queries(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: my_first_intent
utterances:
  - this is [slot1:entity1](my first entity)
  - this is [slot2:entity2](my second entity)
  - this is [slot3:entity3](my third entity)

---
type: intent
name: my_second_intent
utterances:
  - this is [slot4:entity4](my fourth entity)""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    config = DeterministicIntentParserConfig(max_queries=2,
                                             max_pattern_length=1000)

    # When
    parser = DeterministicIntentParser(config=config).fit(dataset)

    # Then
    self.assertEqual(len(parser.regexes_per_intent["my_first_intent"]), 2)
    self.assertEqual(len(parser.regexes_per_intent["my_second_intent"]), 1)
def test_should_limit_patterns_length(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: my_first_intent
utterances:
  - how are you
  - hello how are you?
  - what's up

---
type: intent
name: my_second_intent
utterances:
  - what is the weather today ?
  - does it rain
  - will it rain tomorrow""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    config = DeterministicIntentParserConfig(max_queries=1000,
                                             max_pattern_length=25,
                                             ignore_stop_words=False)

    # When
    parser = DeterministicIntentParser(config=config).fit(dataset)

    # Then
    self.assertEqual(2, len(parser.regexes_per_intent["my_first_intent"]))
    self.assertEqual(1, len(parser.regexes_per_intent["my_second_intent"]))
def test_should_get_builtin_slots(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: GetWeather
utterances:
  - what is the weather [datetime:snips/datetime](at 9pm)
  - what's the weather in [location:weather_location](berlin)
  - What's the weather in [location](tokyo) [datetime](this weekend)?
  - Can you tell me the weather [datetime] please ?
  - what is the weather forecast [datetime] in [location](paris)""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    config = CRFSlotFillerConfig(random_seed=42)
    intent = "GetWeather"
    slot_filler = CRFSlotFiller(config, **self.get_shared_data(dataset))
    slot_filler.fit(dataset, intent)

    # When
    slots = slot_filler.get_slots("Give me the weather at 9pm in Paris")

    # Then
    expected_slots = [
        unresolved_slot(match_range={START: 20, END: 26},
                        value='at 9pm',
                        entity='snips/datetime',
                        slot_name='datetime'),
        unresolved_slot(match_range={START: 30, END: 35},
                        value='Paris',
                        entity='weather_location',
                        slot_name='location')
    ]
    self.assertListEqual(expected_slots, slots)
def test_should_get_sub_builtin_slots(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: PlanBreak
utterances:
  - 'I want to leave from [start:snips/datetime](tomorrow) until [end:snips/datetime](next thursday)'
  - find me something from [start](9am) to [end](12pm)
  - I need a break from [start](2pm) until [end](4pm)
  - Can you suggest something from [start](april 4th) until [end](april 6th) ?
  - Book me a trip from [start](this friday) to [end](next tuesday)""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    config = CRFSlotFillerConfig(random_seed=42)
    intent = "PlanBreak"
    slot_filler = CRFSlotFiller(config, **self.get_shared_data(dataset))
    slot_filler.fit(dataset, intent)

    # When
    slots = slot_filler.get_slots("Find me a plan from 5pm to 6pm")

    # Then
    expected_slots = [
        unresolved_slot(match_range={START: 20, END: 23},
                        value="5pm",
                        entity="snips/datetime",
                        slot_name="start"),
        unresolved_slot(match_range={START: 27, END: 30},
                        value="6pm",
                        entity="snips/datetime",
                        slot_name="end")
    ]
    self.assertListEqual(expected_slots, slots)
def test_should_parse_top_intents(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - hello world

---
type: intent
name: intent2
utterances:
  - foo bar""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    parser = DeterministicIntentParser().fit(dataset)
    text = "hello world"

    # When
    results = parser.parse(text, top_n=3)

    # Then
    expected_intent = intent_classification_result(
        intent_name="intent1", probability=1.0)
    expected_results = [extraction_result(expected_intent, [])]
    self.assertEqual(expected_results, results)
def setUp(self):
    super(TestCLI, self).setUp()
    if not self.fixture_dir.exists():
        self.fixture_dir.mkdir()

    dataset_stream = io.StringIO(u"""
---
type: intent
name: MakeTea
utterances:
  - make me a [beverage_temperature:Temperature](hot) cup of tea
  - make me [number_of_cups:snips/number](five) tea cups
  - i want [number_of_cups] cups of [beverage_temperature](boiling hot) tea pls
  - can you prepare [number_of_cups] cup of [beverage_temperature](cold) tea ?

---
type: intent
name: MakeCoffee
utterances:
  - make me [number_of_cups:snips/number](one) cup of coffee please
  - brew [number_of_cups] cups of coffee
  - can you prepare [number_of_cups] cup of coffee""")
    beverage_dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    self.beverage_dataset_path = self.fixture_dir / "beverage_dataset.json"
    if self.beverage_dataset_path.exists():
        self.beverage_dataset_path.unlink()
    with self.beverage_dataset_path.open(mode="w") as f:
        f.write(json_string(beverage_dataset))

    self.tmp_file_path = self.fixture_dir / next(
        tempfile._get_candidate_names())
    while self.tmp_file_path.exists():
        self.tmp_file_path = self.fixture_dir / next(
            tempfile._get_candidate_names())
def test_training_should_be_reproducible(self):
    # Given
    random_state = 42
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
  - make me a hot cup of tea
  - make me five tea cups

---
type: intent
name: MakeCoffee
utterances:
  - make me one cup of coffee please
  - brew two cups of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    # When
    engine1 = SnipsNLUEngine(random_state=random_state)
    engine1.fit(dataset)

    engine2 = SnipsNLUEngine(random_state=random_state)
    engine2.fit(dataset)

    # Then
    with temp_dir() as tmp_dir:
        dir_engine1 = tmp_dir / "engine1"
        dir_engine2 = tmp_dir / "engine2"
        engine1.persist(dir_engine1)
        engine2.persist(dir_engine2)
        hash1 = dirhash(str(dir_engine1), 'sha256')
        hash2 = dirhash(str(dir_engine2), 'sha256')
        self.assertEqual(hash1, hash2)
def test_should_not_build_builtin_parser_when_provided(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
  - make me a [beverage_temperature:Temperature](hot) cup of tea
  - make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
  - make me [number_of_cups:snips/number](one) cup of coffee please
  - brew [number_of_cups] cups of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    dataset = validate_and_format_dataset(dataset)
    builtin_entity_parser = BuiltinEntityParser.build(language="en")

    # When
    with patch("snips_nlu.entity_parser.builtin_entity_parser"
               ".BuiltinEntityParser.build") as mocked_build_parser:
        engine = SnipsNLUEngine(
            builtin_entity_parser=builtin_entity_parser)
        engine.fit(dataset)

    # Then
    mocked_build_parser.assert_not_called()
def test_should_ignore_very_ambiguous_utterances(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: intent_1
utterances:
  - "[event_type](meeting) tomorrow"

---
type: intent
name: intent_2
utterances:
  - call [time:snips/datetime](today)

---
type: entity
name: event_type
values:
  - call
  - diner""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    parser = DeterministicIntentParser().fit(dataset)
    text = "call tomorrow"

    # When
    res = parser.parse(text)

    # Then
    self.assertEqual(empty_result(text, 1.0), res)
def test_should_not_build_custom_parser_when_provided(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
  - make me a [beverage_temperature:Temperature](hot) cup of tea
  - make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
  - make me [number_of_cups:snips/number](one) cup of coffee please
  - brew [number_of_cups] cups of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    resources = load_resources("en")
    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITH_AND_WITHOUT_STEMS, resources)

    # When
    with patch("snips_nlu.entity_parser.custom_entity_parser"
               ".CustomEntityParser.build") as mocked_build_parser:
        engine = SnipsNLUEngine(
            custom_entity_parser=custom_entity_parser)
        engine.fit(dataset)

    # Then
    mocked_build_parser.assert_not_called()
def test_should_parse_slightly_ambiguous_utterances(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: intent_1
utterances:
  - call tomorrow

---
type: intent
name: intent_2
utterances:
  - call [time:snips/datetime](today)""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    parser = DeterministicIntentParser().fit(dataset)
    text = "call tomorrow"

    # When
    res = parser.parse(text)

    # Then
    expected_intent = intent_classification_result(
        intent_name="intent_1", probability=2. / 3.)
    expected_result = parsing_result(text, expected_intent, [])
    self.assertEqual(expected_result, res)
def test_should_be_serializable_into_bytearray(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
  - make me a [beverage_temperature:Temperature](hot) cup of tea
  - make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
  - make me [number_of_cups:snips/number](one) cup of coffee please
  - brew [number_of_cups] cups of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    shared = self.get_shared_data(dataset)
    engine = SnipsNLUEngine(**shared).fit(dataset)

    # When
    engine_bytes = engine.to_byte_array()
    loaded_engine = SnipsNLUEngine.from_byte_array(engine_bytes)
    result = loaded_engine.parse("Make me two cups of coffee")

    # Then
    self.assertEqual(result[RES_INTENT][RES_INTENT_NAME], "MakeCoffee")
def test_should_parse_intent(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - foo bar baz

---
type: intent
name: intent2
utterances:
  - foo bar ban""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    parser = DeterministicIntentParser().fit(dataset)
    text = "foo bar ban"

    # When
    parsing = parser.parse(text)

    # Then
    probability = 1.0
    expected_intent = intent_classification_result(
        intent_name="intent2", probability=probability)
    self.assertEqual(expected_intent, parsing[RES_INTENT])
def test_validate_should_accept_dataset_object(self):
    # Given
    dataset_stream = io.StringIO("""
# getWeather Intent
---
type: intent
name: getWeather
utterances:
  - what is the weather in [weatherLocation:location](Paris)?
  - is it raining in [weatherLocation] [weatherDate:snips/datetime]

# Location Entity
---
type: entity
name: location
automatically_extensible: true
values:
  - [new york, big apple]
  - london
""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream])

    # When
    validated_dataset = validate_and_format_dataset(dataset)

    # Then
    self.assertTrue(validated_dataset.get(VALIDATED, False))
def test_should_be_serializable_into_bytearray(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
  - make me [number_of_cups:snips/number](one) cup of tea
  - i want [number_of_cups] cups of tea please
  - can you prepare [number_of_cups] cup of tea ?

---
type: intent
name: MakeCoffee
utterances:
  - make me [number_of_cups:snips/number](two) cups of coffee
  - brew [number_of_cups] cups of coffee
  - can you prepare [number_of_cups] cup of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    shared = self.get_shared_data(dataset)
    intent_parser = DeterministicIntentParser(**shared).fit(dataset)

    # When
    intent_parser_bytes = intent_parser.to_byte_array()
    loaded_intent_parser = DeterministicIntentParser.from_byte_array(
        intent_parser_bytes, **shared)
    result = loaded_intent_parser.parse("make me two cups of coffee")

    # Then
    self.assertEqual("MakeCoffee", result[RES_INTENT][RES_INTENT_NAME])
def test_should_get_intents(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - yala yili

---
type: intent
name: intent2
utterances:
  - yala yili yulu

---
type: intent
name: intent3
utterances:
  - yili yulu yele""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    shared = self.get_shared_data(dataset)
    shared[RANDOM_STATE] = 42
    parser = ProbabilisticIntentParser(**shared).fit(dataset)
    text = "yala yili yulu"

    # When
    results = parser.get_intents(text)
    intents = [res[RES_INTENT_NAME] for res in results]

    # Then
    expected_intents = ["intent2", "intent1", "intent3", None]
    self.assertEqual(expected_intents, intents)
def test_nlu_engine_should_train_and_parse_in_all_languages(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
  - make me a [beverage_temperature:Temperature](hot) cup of tea
  - make me [number_of_cups:snips/number](five) tea cups
  - i want [number_of_cups] cups of [beverage_temperature](boiling hot) tea pls
  - can you prepare [number_of_cups] cup of [beverage_temperature](cold) tea ?

---
type: intent
name: MakeCoffee
utterances:
  - make me [number_of_cups:snips/number](one) cup of coffee please
  - brew [number_of_cups] cups of coffee
  - can you prepare [number_of_cups] cup of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    text = "please brew me a cup of coffee"
    for language in get_all_languages():
        dataset[LANGUAGE] = language
        engine = SnipsNLUEngine()

        # When / Then
        msg = "Could not fit engine in '%s'" % language
        with self.fail_if_exception(msg):
            engine = engine.fit(dataset)

        msg = "Could not parse in '%s'" % language
        with self.fail_if_exception(msg):
            res = engine.parse(text)
        self.assertEqual("MakeCoffee", res[RES_INTENT][RES_INTENT_NAME])
def test_should_not_retrain_intent_classifier_when_no_force_retrain(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
  - make me a [beverage_temperature:Temperature](hot) cup of tea
  - make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
  - make me [number_of_cups:snips/number](one) cup of coffee please
  - brew [number_of_cups] cups of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    parser = ProbabilisticIntentParser()
    intent_classifier = LogRegIntentClassifier()
    intent_classifier.fit(dataset)
    parser.intent_classifier = intent_classifier

    # When / Then
    with patch("snips_nlu.intent_classifier.log_reg_classifier"
               ".LogRegIntentClassifier.fit") as mock_fit:
        parser.fit(dataset, force_retrain=False)
        mock_fit.assert_not_called()
def test_nlu_engine_should_raise_error_with_bytes_input(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
  - make me a [beverage_temperature:Temperature](hot) cup of tea
  - make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
  - make me [number_of_cups:snips/number](one) cup of coffee please
  - brew [number_of_cups] cups of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    bytes_input = b"brew me an espresso"

    # pylint:disable=unused-variable
    @IntentParser.register("my_intent_parser", True)
    class MyIntentParser(MockIntentParser):
        pass

    # pylint:enable=unused-variable

    config = NLUEngineConfig(["my_intent_parser"])
    engine = SnipsNLUEngine(config).fit(dataset)

    # When / Then
    with self.assertRaises(InvalidInputError) as cm:
        engine.parse(bytes_input)
    message = str(cm.exception.args[0])
    self.assertTrue("Expected unicode but received" in message)
def test_should_get_slots(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
  - make me [number_of_cups:snips/number](five) cups of tea
  - please I want [number_of_cups](two) cups of tea""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    shared = self.get_shared_data(dataset)
    shared[RANDOM_STATE] = 42
    slot_filler = CRFSlotFiller(**shared)
    intent = "MakeTea"
    slot_filler.fit(dataset, intent)

    # When
    slots = slot_filler.get_slots("make me two cups of tea")

    # Then
    expected_slots = [
        unresolved_slot(match_range={START: 8, END: 11},
                        value='two',
                        entity='snips/number',
                        slot_name='number_of_cups')
    ]
    self.assertListEqual(slots, expected_slots)
# NOTE: this test assumes a @patch decorator (mocking the resource-loading
# function used by the NLU engine module) that injects `mocked_load_resources`.
def test_should_not_load_resources_when_provided(
        self, mocked_load_resources):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
  - make me a [beverage_temperature:Temperature](hot) cup of tea
  - make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
  - make me [number_of_cups:snips/number](one) cup of coffee please
  - brew [number_of_cups] cups of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    resources = load_resources("en")

    # When
    engine = SnipsNLUEngine(resources=resources)
    engine.fit(dataset)

    # Then
    mocked_load_resources.assert_not_called()
def test_should_be_serializable_into_bytearray(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
  - make me [number_of_cups:snips/number](one) cup of tea
  - i want [number_of_cups] cups of tea please
  - can you prepare [number_of_cups] cups of tea ?""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    shared = self.get_shared_data(dataset)
    slot_filler = CRFSlotFiller(**shared).fit(dataset, "MakeTea")

    # When
    slot_filler_bytes = slot_filler.to_byte_array()
    loaded_slot_filler = CRFSlotFiller.from_byte_array(
        slot_filler_bytes, **shared)
    slots = loaded_slot_filler.get_slots("make me two cups of tea")

    # Then
    expected_slots = [
        unresolved_slot(match_range={START: 8, END: 11},
                        value='two',
                        entity='snips/number',
                        slot_name='number_of_cups')
    ]
    self.assertListEqual(expected_slots, slots)
def test_should_get_intent_when_filter(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
  - make me a cup of tea
  - i want two cups of tea please
  - can you prepare one cup of tea ?

---
type: intent
name: MakeCoffee
utterances:
  - make me a cup of coffee please
  - brew two cups of coffee
  - can you prepare one cup of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    classifier = LogRegIntentClassifier(random_state=42).fit(dataset)

    # When
    text1 = "Make me two cups of tea"
    res1 = classifier.get_intent(text1, ["MakeCoffee", "MakeTea"])

    text2 = "Make me two cups of tea"
    res2 = classifier.get_intent(text2, ["MakeCoffee"])

    text3 = "bla bla bla"
    res3 = classifier.get_intent(text3, ["MakeCoffee"])

    # Then
    self.assertEqual("MakeTea", res1[RES_INTENT_NAME])
    self.assertEqual("MakeCoffee", res2[RES_INTENT_NAME])
    self.assertEqual(None, res3[RES_INTENT_NAME])