def test_enrich_utterance(self):
    # Given
    u = text_to_utterance("a b c d e f")
    builtin_ents = [{
        "value": "e",
        "resolved_value": "e",
        "range": {
            "start": 8,
            "end": 9
        },
        "entity_kind": "the_snips_e_entity"
    }]
    custom_ents = [{
        "value": "c",
        "resolved_value": "c",
        "range": {
            "start": 4,
            "end": 5
        },
        "entity_kind": "the_c_entity"
    }]
    vectorizer = CooccurrenceVectorizer()
    vectorizer._language = "en"

    # When
    preprocessed = vectorizer._enrich_utterance(
        u, builtin_ents, custom_ents)

    # Then
    expected = ["a", "b", "THE_C_ENTITY", "d", "THE_SNIPS_E_ENTITY", "f"]
    self.assertSequenceEqual(expected, preprocessed)
def test_cooccurrence_vectorizer_should_persist(self):
    # Given
    x = [text_to_utterance("yoo yoo")]
    dataset = get_empty_dataset("en")
    shared = self.get_shared_data(dataset)
    vectorizer = CooccurrenceVectorizer(**shared).fit(x, dataset)
    vectorizer.builtin_entity_scope = {"snips/entity"}

    # When
    vectorizer.persist(self.tmp_file_path)

    # Then
    metadata_path = self.tmp_file_path / "metadata.json"
    expected_metadata = {"unit_name": "cooccurrence_vectorizer"}
    self.assertJsonContent(metadata_path, expected_metadata)

    vectorizer_path = self.tmp_file_path / "vectorizer.json"
    expected_vectorizer = {
        "word_pairs": {
            "0": ["yoo", "yoo"]
        },
        "language_code": "en",
        "config": vectorizer.config.to_dict(),
        "builtin_entity_scope": ["snips/entity"]
    }
    self.assertJsonContent(vectorizer_path, expected_vectorizer)
def test_limit_vocabulary_should_raise(self):
    # Given
    vectorizer = CooccurrenceVectorizer()
    vectorizer._word_pairs = {
        ("a", "b"): 0,
        ("a", "c"): 1,
        ("a", "d"): 2,
        ("a", "e"): 3
    }

    # When / Then
    with self.assertRaises(ValueError):
        vectorizer.limit_word_pairs([("a", "f")])
def test_limit_vocabulary(self):
    # Given
    config = CooccurrenceVectorizerConfig(filter_stop_words=False)
    vectorizer = CooccurrenceVectorizer(config=config)
    train_data = [
        text_to_utterance(t) for t in ("a b", "a c", "a d", "a e")
    ]
    data = [text_to_utterance(t) for t in ("a c e", "a d e")]
    vectorizer.fit(train_data, get_empty_dataset("en"))
    x_0 = vectorizer.transform(data)
    pairs = {("a", "b"): 0, ("a", "c"): 1, ("a", "d"): 2, ("a", "e"): 3}
    kept_pairs = [("a", "b"), ("a", "c"), ("a", "d")]
    self.assertDictEqual(pairs, vectorizer.word_pairs)

    # When
    kept_pairs_indexes = [pairs[p] for p in kept_pairs]
    vectorizer.limit_word_pairs(kept_pairs)

    # Then
    expected_pairs = {("a", "b"): 0, ("a", "c"): 1, ("a", "d"): 2}
    self.assertDictEqual(expected_pairs, vectorizer.word_pairs)
    x_1 = vectorizer.transform(data)
    self.assertListEqual(
        x_0[:, kept_pairs_indexes].todense().tolist(),
        x_1.todense().tolist())
def test_cooccurrence_vectorizer_should_load(self):
    # Given
    config = CooccurrenceVectorizerConfig()
    word_pairs = {("a", "b"): 0, ("a", "c"): 12}
    serializable_word_pairs = {0: ["a", "b"], 12: ["a", "c"]}
    vectorizer_dict = {
        "unit_name": "cooccurrence_vectorizer",
        "language_code": "en",
        "word_pairs": serializable_word_pairs,
        "builtin_entity_scope": ["snips/datetime"],
        "config": config.to_dict(),
    }
    self.tmp_file_path.mkdir()
    self.writeJsonContent(
        self.tmp_file_path / "vectorizer.json", vectorizer_dict)

    # When
    vectorizer = CooccurrenceVectorizer.from_path(self.tmp_file_path)

    # Then
    self.assertDictEqual(config.to_dict(), vectorizer.config.to_dict())
    self.assertEqual("en", vectorizer.language)
    self.assertDictEqual(vectorizer.word_pairs, word_pairs)
    self.assertEqual({"snips/datetime"}, vectorizer.builtin_entity_scope)
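# NOTE: the next test takes a "mocked_preprocess" argument, which implies a
# mock.patch decorator that this excerpt does not show ("patch" is assumed
# to be imported, e.g. from unittest.mock). The target below assumes
# CooccurrenceVectorizer lives in snips_nlu.intent_classifier.featurizer;
# adjust the module path if it differs.
@patch("snips_nlu.intent_classifier.featurizer.CooccurrenceVectorizer"
       "._preprocess")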
def test_fit_unordered(self, mocked_preprocess):
    t = "a b c d e f"
    u = text_to_utterance(t)
    builtin_ents = [
        {
            "value": "e",
            "resolved_value": "e",
            "range": {
                "start": 8,
                "end": 9
            },
            "entity_kind": "the_snips_e_entity"
        }
    ]
    custom_ents = [
        {
            "value": "c",
            "resolved_value": "c",
            "range": {
                "start": 4,
                "end": 5
            },
            "entity_kind": "the_c_entity"
        }
    ]
    mocked_preprocess.return_value = [u], [builtin_ents], [custom_ents]

    config = CooccurrenceVectorizerConfig(
        window_size=3,
        unknown_words_replacement_string="b",
        filter_stop_words=False,
        keep_order=False,
    )
    dataset = get_empty_dataset("en")
    shared = self.get_shared_data(dataset)

    # When
    expected_pairs = {
        ("THE_C_ENTITY", "THE_SNIPS_E_ENTITY"): 0,
        ("THE_C_ENTITY", "a"): 1,
        ("THE_C_ENTITY", "d"): 2,
        ("THE_C_ENTITY", "f"): 3,
        ("THE_SNIPS_E_ENTITY", "a"): 4,
        ("THE_SNIPS_E_ENTITY", "d"): 5,
        ("THE_SNIPS_E_ENTITY", "f"): 6,
        ("a", "d"): 7,
        ("d", "f"): 8,
    }
    vectorizer = CooccurrenceVectorizer(config, **shared).fit([u], dataset)

    # Then
    self.assertDictEqual(expected_pairs, vectorizer.word_pairs)
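# Same assumption as above: "mocked_preprocess" implies a mock.patch
# decorator on CooccurrenceVectorizer._preprocess; the module path is
# assumed, not taken from the excerpt.
@patch("snips_nlu.intent_classifier.featurizer.CooccurrenceVectorizer"
       "._preprocess")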
def test_fit_transform(self, mocked_preprocess):
    t = "a b c d e f"
    u = text_to_utterance(t)
    builtin_ents = [
        {
            "value": "e",
            "resolved_value": "e",
            "range": {
                "start": 8,
                "end": 9
            },
            "entity_kind": "the_snips_e_entity"
        }
    ]
    custom_ents = [
        {
            "value": "c",
            "resolved_value": "c",
            "range": {
                "start": 4,
                "end": 5
            },
            "entity_kind": "the_c_entity"
        }
    ]
    mocked_preprocess.return_value = [u], [builtin_ents], [custom_ents]

    config = CooccurrenceVectorizerConfig(
        window_size=3,
        unknown_words_replacement_string="b",
        filter_stop_words=False
    )
    dataset = get_empty_dataset("en")

    builtin_parser = EntityParserMock({t: builtin_ents})
    custom_parser = EntityParserMock({t: custom_ents})
    resources = {STOP_WORDS: set()}
    vectorizer1 = CooccurrenceVectorizer(
        config, builtin_entity_parser=builtin_parser,
        custom_entity_parser=custom_parser, resources=resources)
    vectorizer2 = CooccurrenceVectorizer(
        config, builtin_entity_parser=builtin_parser,
        custom_entity_parser=custom_parser, resources=resources)

    # When
    x = [u]
    x_0 = vectorizer1.fit(x, dataset).transform(x).todense().tolist()
    x_1 = vectorizer2.fit_transform(x, dataset).todense().tolist()

    # Then
    self.assertListEqual(x_0, x_1)
def test_transform(self):
    # Given
    config = CooccurrenceVectorizerConfig(
        filter_stop_words=True,
        window_size=3,
        unknown_words_replacement_string="d")

    t_0 = "yo a b c d e f yo"
    t_1 = "yo a b c d e"
    u_0 = text_to_utterance(t_0)
    u_1 = text_to_utterance(t_1)

    resources = {STOP_WORDS: {"b"}}

    builtin_ents = [{
        "value": "e",
        "resolved_value": "e",
        "range": {
            "start": 11,
            "end": 12
        },
        "entity_kind": "the_snips_e_entity"
    }]
    custom_ents = [{
        "value": "c",
        "resolved_value": "c",
        "range": {
            "start": 7,
            "end": 8
        },
        "entity_kind": "the_c_entity"
    }]
    builtin_parser = EntityParserMock({
        t_0: builtin_ents,
        t_1: builtin_ents
    })
    custom_parser = EntityParserMock({t_0: custom_ents, t_1: custom_ents})

    vectorizer = CooccurrenceVectorizer(
        config, builtin_entity_parser=builtin_parser,
        custom_entity_parser=custom_parser, resources=resources)
    vectorizer._language = "en"
    vectorizer._word_pairs = {
        ("THE_SNIPS_E_ENTITY", "f"): 0,
        ("a", "THE_C_ENTITY"): 1,
        ("a", "THE_SNIPS_E_ENTITY"): 2,
        ("b", "THE_SNIPS_E_ENTITY"): 3,
        ("yo", "yo"): 4,
        ("d", "THE_SNIPS_E_ENTITY"): 5
    }

    data = [u_0, u_1]

    # When
    x = vectorizer.transform(data)

    # Then
    expected = [[1, 1, 1, 0, 0, 0], [0, 1, 1, 0, 0, 0]]
    self.assertEqual(expected, x.todense().tolist())
def test_training_should_be_reproducible(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me a [beverage_temperature:Temperature](hot) cup of tea
- make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
- make me [number_of_cups:snips/number](one) cup of coffee please
- brew [number_of_cups] cups of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    x = [text_to_utterance("please make me two hots cups of tea")]
    shared = self.get_shared_data(dataset)
    shared["random_state"] = 42

    # When
    vectorizer1 = CooccurrenceVectorizer(**shared)
    vectorizer1.fit(x, dataset)

    vectorizer2 = CooccurrenceVectorizer(**shared)
    vectorizer2.fit(x, dataset)

    # Then
    with temp_dir() as tmp_dir:
        dir_vectorizer1 = tmp_dir / "vectorizer1"
        dir_vectorizer2 = tmp_dir / "vectorizer2"
        vectorizer1.persist(dir_vectorizer1)
        vectorizer2.persist(dir_vectorizer2)
        hash1 = dirhash(str(dir_vectorizer1), "sha256")
        hash2 = dirhash(str(dir_vectorizer2), "sha256")
        self.assertEqual(hash1, hash2)
def test_preprocess(self):
    # Given
    language = LANGUAGE_EN
    resources = {
        STEMS: {
            "beautiful": "beauty",
            "birdy": "bird",
            "entity": "ent"
        },
        WORD_CLUSTERS: {
            "my_word_clusters": {
                "beautiful": "cluster_1",
                "birdy": "cluster_2",
                "entity": "cluster_3"
            }
        },
        STOP_WORDS: set()
    }

    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
- dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synonyms: false
matching_strictness: 1.0
values:
- [entity 1, alternative entity 1]
- [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synonyms: true
matching_strictness: 1.0
values:
- entity 1
- [Éntity 2, Éntity_2, Alternative entity 2]
""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITHOUT_STEMS, resources)
    builtin_entity_parser = BuiltinEntityParser.build(dataset, language)

    u_0 = text_to_utterance("hÉllo wOrld Éntity_2")
    u_1 = text_to_utterance("beauTiful World entity 1")
    u_2 = text_to_utterance("Bird bïrdy")
    u_3 = text_to_utterance("Bird birdy")

    utterances = [u_0, u_1, u_2, u_3]

    vectorizer = CooccurrenceVectorizer(
        custom_entity_parser=custom_entity_parser,
        builtin_entity_parser=builtin_entity_parser,
        resources=resources)
    vectorizer._language = language

    # When
    processed_data = vectorizer._preprocess(utterances)
    processed_data = list(zip(*processed_data))

    # Then
    ent_0 = {
        "entity_kind": "entity_2",
        "value": "Éntity_2",
        "resolved_value": "Éntity 2",
        "range": {
            "start": 12,
            "end": 20
        }
    }
    num_0 = {
        "entity_kind": "snips/number",
        "value": "2",
        "resolved_value": {
            "value": 2.0,
            "kind": "Number"
        },
        "range": {
            "start": 19,
            "end": 20
        }
    }
    ent_11 = {
        "entity_kind": "entity_1",
        "value": "entity 1",
        "resolved_value": "entity 1",
        "range": {
            "start": 16,
            "end": 24
        }
    }
    ent_12 = {
        "entity_kind": "entity_2",
        "value": "entity 1",
        "resolved_value": "entity 1",
        "range": {
            "start": 16,
            "end": 24
        }
    }
    num_1 = {
        "entity_kind": "snips/number",
        "value": "1",
        "range": {
            "start": 23,
            "end": 24
        },
        "resolved_value": {
            "value": 1.0,
            "kind": "Number"
        }
    }

    expected_data = [(u_0, [num_0], [ent_0]),
                     (u_1, [num_1], [ent_11, ent_12]),
                     (u_2, [], []),
                     (u_3, [], [])]

    self.assertSequenceEqual(expected_data, processed_data)
def test_preprocess_for_training(self):
    # Given
    language = LANGUAGE_EN
    resources = {
        STEMS: {
            "beautiful": "beauty",
            "birdy": "bird",
            "entity": "ent"
        },
        WORD_CLUSTERS: {
            "my_word_clusters": {
                "beautiful": "cluster_1",
                "birdy": "cluster_2",
                "entity": "cluster_3"
            }
        },
        STOP_WORDS: set()
    }

    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
- dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synonyms: false
matching_strictness: 1.0
values:
- [entity 1, alternative entity 1]
- [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synonyms: true
matching_strictness: 1.0
values:
- entity 1
- [Éntity 2, Éntity_2, Alternative entity 2]
""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITHOUT_STEMS, resources)
    builtin_entity_parser = BuiltinEntityParser.build(dataset, language)

    utterances = [{
        "data": [{
            "text": "hÉllo wOrld "
        }, {
            "text": " yo "
        }, {
            "text": " yo "
        }, {
            "text": "yo "
        }, {
            "text": "Éntity_2",
            "entity": "entity_2"
        }, {
            "text": " "
        }, {
            "text": "Éntity_2",
            "entity": "entity_2"
        }]
    }, {
        "data": [{
            "text": "beauTiful World "
        }, {
            "text": "entity 1",
            "entity": "entity_1"
        }]
    }, {
        "data": [{
            "text": "Bird bïrdy"
        }]
    }, {
        "data": [{
            "text": "Bird birdy"
        }]
    }]

    vectorizer = CooccurrenceVectorizer(
        custom_entity_parser=custom_entity_parser,
        builtin_entity_parser=builtin_entity_parser,
        resources=resources)
    vectorizer._language = language

    # When
    processed_data = vectorizer._preprocess(utterances, training=True)
    processed_data = list(zip(*processed_data))

    # Then
    ent_00 = {
        "entity_kind": "entity_2",
        "value": "Éntity_2",
        "range": {
            "start": 23,
            "end": 31
        }
    }
    ent_01 = {
        "entity_kind": "entity_2",
        "value": "Éntity_2",
        "range": {
            "start": 32,
            "end": 40
        }
    }
    ent_1 = {
        "entity_kind": "entity_1",
        "value": "entity 1",
        "range": {
            "start": 16,
            "end": 24
        }
    }

    expected_data = [(utterances[0], [], [ent_00, ent_01]),
                     (utterances[1], [], [ent_1]),
                     (utterances[2], [], []),
                     (utterances[3], [], [])]

    self.assertSequenceEqual(expected_data, processed_data)