def test_fit_cooccurrence_vectorizer_feature_selection(self, mocked_chi2):
    # Given
    vectorizer_config = CooccurrenceVectorizerConfig(
        filter_stop_words=False)
    config = FeaturizerConfig(
        added_cooccurrence_feature_ratio=.3,
        cooccurrence_vectorizer_config=vectorizer_config)
    featurizer = Featurizer(config)
    mocked_dataset = {"language": "fr", "entities": {}, "intents": {}}
    utterances = [
        text_to_utterance("a b c d e"),
        text_to_utterance("f g h i j"),
        text_to_utterance("none"),
    ]

    mocked_vectorizer = MagicMock()
    mocked_vectorizer.idf_diag = range(10)

    featurizer.tfidf_vectorizer = mocked_vectorizer
    classes = [0, 0, 1]

    # When
    mocked_chi2.return_value = (
        None, [0.1, 1.0, 0.2, 1.0, 0.3, 1.0] + [1.0 for _ in range(100)])
    featurizer._fit_cooccurrence_vectorizer(
        utterances, classes, 1, mocked_dataset)

    # Then
    expected_pairs = {("a", "b"): 0, ("a", "d"): 1, ("b", "c"): 2}
    self.assertDictEqual(expected_pairs,
                         featurizer.cooccurrence_vectorizer.word_pairs)

def test_limit_vocabulary(self):
    # Given
    vectorizer = TfidfVectorizer()
    dataset = get_empty_dataset("en")
    utterances = [
        text_to_utterance("5 55 6 66 666"),
        text_to_utterance("55 66")
    ]

    voca = {"5": 0, "55": 1, "6": 2, "66": 3, "666": 4}
    kept_unigrams = ["5", "6", "666"]

    vectorizer.fit(utterances, dataset)
    self.assertDictEqual(voca, vectorizer.vocabulary)
    diag = vectorizer.idf_diag.copy()

    # When
    vectorizer.limit_vocabulary(kept_unigrams)

    # Then
    expected_voca = {"5": 0, "6": 1, "666": 2}
    self.assertDictEqual(expected_voca, vectorizer.vocabulary)

    expected_diag = diag[[voca[u] for u in kept_unigrams]].tolist()
    self.assertListEqual(expected_diag, vectorizer.idf_diag.tolist())

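# --- Illustrative sketch (not part of the test suite) ---
# The assertions above check two things: limit_vocabulary re-indexes the
# kept unigrams contiguously, and it slices the idf diagonal accordingly.
# The minimal numpy sketch below reproduces just that re-indexing and
# slicing logic, under the assumption that idf_diag is a 1-d array aligned
# with the vocabulary indices; the names (voca, kept_unigrams) mirror the
# test and the values are made up.
import numpy as np

voca = {"5": 0, "55": 1, "6": 2, "66": 3, "666": 4}
kept_unigrams = ["5", "6", "666"]
idf_diag = np.array([1.0, 2.0, 3.0, 4.0, 5.0])

# Re-index the kept unigrams contiguously, preserving their order
limited_voca = {u: i for i, u in enumerate(kept_unigrams)}
# Keep only the idf weights of the surviving unigrams (fancy indexing)
limited_diag = idf_diag[[voca[u] for u in kept_unigrams]]

assert limited_voca == {"5": 0, "6": 1, "666": 2}
assert limited_diag.tolist() == [1.0, 3.0, 5.0]
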
def test_limit_word_pairs(self):
    # Given
    config = CooccurrenceVectorizerConfig(filter_stop_words=False)
    vectorizer = CooccurrenceVectorizer(config=config)
    train_data = [
        text_to_utterance(t) for t in ("a b", "a c", "a d", "a e")
    ]

    data = [text_to_utterance(t) for t in ("a c e", "a d e")]

    vectorizer.fit(train_data, get_empty_dataset("en"))
    x_0 = vectorizer.transform(data)
    pairs = {("a", "b"): 0, ("a", "c"): 1, ("a", "d"): 2, ("a", "e"): 3}
    kept_pairs = [("a", "b"), ("a", "c"), ("a", "d")]
    self.assertDictEqual(pairs, vectorizer.word_pairs)

    # When
    kept_pairs_indexes = [pairs[p] for p in kept_pairs]
    vectorizer.limit_word_pairs(kept_pairs)

    # Then
    expected_pairs = {("a", "b"): 0, ("a", "c"): 1, ("a", "d"): 2}
    self.assertDictEqual(expected_pairs, vectorizer.word_pairs)
    x_1 = vectorizer.transform(data)
    self.assertListEqual(x_0[:, kept_pairs_indexes].todense().tolist(),
                         x_1.todense().tolist())

def test_log_activation_weights(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - foo bar

---
type: intent
name: intent2
utterances:
  - lorem ipsum""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    shared = self.get_shared_data(dataset)
    intent_classifier = LogRegIntentClassifier(**shared)
    text = "yo"
    utterances = [text_to_utterance(text)]
    self.assertIsNone(
        intent_classifier.log_activation_weights(text, None))

    # When
    intent_classifier.fit(dataset)
    x = intent_classifier.featurizer.transform(utterances)[0]
    log = intent_classifier.log_activation_weights(text, x, top_n=42)

    # Then
    self.assertIsInstance(log, str)
    self.assertIn("Top 42", log)

def _get_intents(self, text, intents_filter):
    if isinstance(intents_filter, str):
        intents_filter = {intents_filter}
    elif isinstance(intents_filter, list):
        intents_filter = set(intents_filter)

    if not text or not self.intent_list or not self.featurizer:
        results = [intent_classification_result(None, 1.0)]
        results += [intent_classification_result(i, 0.0)
                    for i in self.intent_list if i is not None]
        return results

    if len(self.intent_list) == 1:
        return [intent_classification_result(self.intent_list[0], 1.0)]

    # pylint: disable=C0103
    X = self.featurizer.transform([text_to_utterance(text)])
    # pylint: enable=C0103
    proba_vec = self._predict_proba(X)
    logger.debug(
        "%s", DifferedLoggingMessage(self.log_activation_weights, text, X))
    results = [
        intent_classification_result(i, proba)
        for i, proba in zip(self.intent_list, proba_vec[0])
        if intents_filter is None or i is None or i in intents_filter]

    return sorted(results, key=lambda res: -res[RES_PROBA])

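# --- Illustrative sketch (not part of the library) ---
# When there is no text (or no trained featurizer), _get_intents above
# short-circuits to a fixed distribution: the None (fallback) intent gets
# probability 1.0 and every real intent gets 0.0. The self-contained
# sketch below shows the shape of that fallback list, using plain dicts
# that mirror intent_classification_result's output; the intent names are
# hypothetical.
intent_list = ["MakeTea", "MakeCoffee", None]  # hypothetical intents
results = [{"intentName": None, "probability": 1.0}]
results += [{"intentName": i, "probability": 0.0}
            for i in intent_list if i is not None]
assert results == [
    {"intentName": None, "probability": 1.0},
    {"intentName": "MakeTea", "probability": 0.0},
    {"intentName": "MakeCoffee", "probability": 0.0},
]
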
def test_generate_noise_utterances(self, mocked_get_noise):
    # Given
    language = LANGUAGE_EN
    num_intents = 2
    noise_factor = 1
    utterances_length = 5

    noise = [str(i) for i in range(utterances_length)]
    mocked_get_noise.return_value = noise

    augmented_utterances = [
        {
            "data": [
                {
                    "text": " ".join(
                        "{}".format(i) for i in range(utterances_length))
                }
            ]
        }
    ]
    num_utterances = 10
    random_state = np.random.RandomState(1)

    augmented_utterances = augmented_utterances * num_utterances
    config = IntentClassifierDataAugmentationConfig(
        noise_factor=noise_factor)

    # When
    noise_utterances = generate_noise_utterances(
        augmented_utterances, noise, num_intents, config, language,
        random_state)

    # Then
    joined_noise = text_to_utterance(" ".join(noise))
    for u in noise_utterances:
        self.assertEqual(u, joined_noise)

def test_cooccurrence_vectorizer_should_persist(self):
    # Given
    x = [text_to_utterance("yoo yoo")]
    dataset = get_empty_dataset("en")
    shared = self.get_shared_data(dataset)
    vectorizer = CooccurrenceVectorizer(**shared).fit(x, dataset)
    vectorizer.builtin_entity_scope = {"snips/entity"}

    # When
    vectorizer.persist(self.tmp_file_path)

    # Then
    metadata_path = self.tmp_file_path / "metadata.json"
    expected_metadata = {"unit_name": "cooccurrence_vectorizer"}
    self.assertJsonContent(metadata_path, expected_metadata)

    vectorizer_path = self.tmp_file_path / "vectorizer.json"
    expected_vectorizer = {
        "word_pairs": {
            "0": ["yoo", "yoo"]
        },
        "language_code": "en",
        "config": vectorizer.config.to_dict(),
        "builtin_entity_scope": ["snips/entity"]
    }
    self.assertJsonContent(vectorizer_path, expected_vectorizer)

def test_enrich_utterance(self):
    # Given
    u = text_to_utterance("a b c d e f")
    builtin_ents = [{
        "value": "e",
        "resolved_value": "e",
        "range": {
            "start": 8,
            "end": 9
        },
        "entity_kind": "the_snips_e_entity"
    }]
    custom_ents = [{
        "value": "c",
        "resolved_value": "c",
        "range": {
            "start": 4,
            "end": 5
        },
        "entity_kind": "the_c_entity"
    }]
    vectorizer = CooccurrenceVectorizer()
    vectorizer._language = "en"

    # When
    preprocessed = vectorizer._enrich_utterance(
        u, builtin_ents, custom_ents)

    # Then
    expected = ["a", "b", "THE_C_ENTITY", "d", "THE_SNIPS_E_ENTITY", "f"]
    self.assertSequenceEqual(expected, preprocessed)

def test_featurizer_should_exclude_replacement_string(self):
    # Given
    language = LANGUAGE_EN
    dataset = {
        "entities": {
            "dummy1": {
                "utterances": {
                    "unknownword": "unknownword",
                    "what": "what"
                }
            }
        }
    }
    replacement_string = "unknownword"
    featurizer = Featurizer(
        language,
        unknown_words_replacement_string=replacement_string,
        config=FeaturizerConfig())
    utterances = [text_to_utterance("hello dude")]
    y = np.array([1])

    # When
    featurizer.fit(dataset, utterances, y)

    # Then
    self.assertNotIn(replacement_string,
                     featurizer.entity_utterances_to_feature_names)

def test_empty_vocabulary_should_fit_and_return_none_intent(
        self, mocked_build_training):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: dummy_intent_1
utterances:
  - "[dummy_slot_name:dummy_entity_1](...)"

---
type: entity
name: dummy_entity_1
automatically_extensible: true
use_synonyms: false
matching_strictness: 1.0
values:
  - ...
""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    text = " "
    noise_size = 6
    utterances = [text] + [text] * noise_size
    utterances = [text_to_utterance(t) for t in utterances]
    labels = [0] + [1] * noise_size
    intent_list = ["dummy_intent_1", None]
    mocked_build_training.return_value = utterances, labels, intent_list

    # When / Then
    intent_classifier = LogRegIntentClassifier().fit(dataset)
    intent = intent_classifier.get_intent("no intent there")
    self.assertEqual(intent_classification_result(None, 1.0), intent)

def test_should_be_serializable(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: dummy_intent
utterances:
  - this is the number [number:snips/number](one)
""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    pvalue_threshold = 0.42
    config = FeaturizerConfig(
        pvalue_threshold=pvalue_threshold,
        added_cooccurrence_feature_ratio=0.2)
    shared = self.get_shared_data(dataset)
    featurizer = Featurizer(config=config, **shared)

    utterances = [
        text_to_utterance("this is the number"),
        text_to_utterance("yo")
    ]
    classes = np.array([0, 1])
    featurizer.fit(dataset, utterances, classes, max(classes))

    # When
    featurizer.persist(self.tmp_file_path)

    # Then
    expected_featurizer_dict = {
        "language_code": "en",
        "tfidf_vectorizer": "tfidf_vectorizer",
        "cooccurrence_vectorizer": "cooccurrence_vectorizer",
        "config": config.to_dict()
    }
    featurizer_dict_path = self.tmp_file_path / "featurizer.json"
    self.assertJsonContent(featurizer_dict_path, expected_featurizer_dict)

    expected_metadata = {"unit_name": "featurizer"}
    metadata_path = self.tmp_file_path / "metadata.json"
    self.assertJsonContent(metadata_path, expected_metadata)

    tfidf_vectorizer_path = self.tmp_file_path / "tfidf_vectorizer"
    self.assertTrue(tfidf_vectorizer_path.exists())

    cooc_vectorizer_path = self.tmp_file_path / "cooccurrence_vectorizer"
    self.assertTrue(cooc_vectorizer_path.exists())

def test_should_build_training_data_with_noise(
        self, mocked_augment_utterances, mocked_get_noise):
    # Given
    mocked_noises = ["mocked_noise_%s" % i for i in range(100)]
    mocked_get_noise.return_value = mocked_noises
    mocked_augment_utterances.side_effect = get_mocked_augment_utterances

    num_intents = 3
    utterances_length = 5
    num_queries_per_intent = 3
    fake_utterance = {
        "data": [{
            "text": " ".join("1" for _ in range(utterances_length))
        }]
    }
    dataset = {
        "intents": {
            str(i): {
                "utterances": [fake_utterance] * num_queries_per_intent
            } for i in range(num_intents)
        }
    }
    random_state = np.random.RandomState(1)

    # When
    np.random.seed(42)
    noise_factor = 2
    data_augmentation_config = IntentClassifierDataAugmentationConfig(
        noise_factor=noise_factor,
        unknown_word_prob=0,
        unknown_words_replacement_string=None)
    utterances, _, intent_mapping = build_training_data(
        dataset, LANGUAGE_EN, data_augmentation_config, random_state)

    # Then
    expected_utterances = [
        utterance for intent in itervalues(dataset[INTENTS])
        for utterance in intent[UTTERANCES]
    ]
    np.random.seed(42)
    noise = list(mocked_noises)
    noise_size = int(min(noise_factor * num_queries_per_intent,
                         len(noise)))
    noise_it = get_noise_it(mocked_noises, utterances_length, 0,
                            random_state)
    noisy_utterances = [
        text_to_utterance(next(noise_it)) for _ in range(noise_size)
    ]
    expected_utterances += noisy_utterances
    expected_intent_mapping = sorted(dataset["intents"])
    expected_intent_mapping.append(None)
    self.assertListEqual(expected_utterances, utterances)
    self.assertListEqual(intent_mapping, expected_intent_mapping)

def test_training_should_be_reproducible(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
  - make me a [beverage_temperature:Temperature](hot) cup of tea
  - make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
  - make me [number_of_cups:snips/number](one) cup of coffee please
  - brew [number_of_cups] cups of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    utterances = [
        text_to_utterance("please make me two hots cups of tea"),
        text_to_utterance("i want a cup of coffee"),
    ]
    classes = np.array([0, 1])
    shared = self.get_shared_data(dataset)
    shared["random_state"] = 42

    # When
    featurizer1 = Featurizer(**shared)
    featurizer1.fit(dataset, utterances, classes, max(classes))

    featurizer2 = Featurizer(**shared)
    featurizer2.fit(dataset, utterances, classes, max(classes))

    # Then
    with temp_dir() as tmp_dir:
        dir_featurizer1 = tmp_dir / "featurizer1"
        dir_featurizer2 = tmp_dir / "featurizer2"
        featurizer1.persist(dir_featurizer1)
        featurizer2.persist(dir_featurizer2)
        hash1 = dirhash(str(dir_featurizer1), 'sha256')
        hash2 = dirhash(str(dir_featurizer2), 'sha256')
        self.assertEqual(hash1, hash2)

def test_limit_vocabulary_should_raise(self):
    # Given
    vectorizer = TfidfVectorizer()
    dataset = {"language": "en", "entities": dict(), "intents": dict()}
    utterances = [text_to_utterance("5 55 6 66 666")]
    vectorizer.fit(utterances, dataset)

    # When / Then
    # "7" and "8" are not in the fitted vocabulary, so limiting to them
    # must raise
    kept_unigrams = ["7", "8"]
    with self.assertRaises(ValueError):
        vectorizer.limit_vocabulary(kept_unigrams)

def test_fit_unordered(self, mocked_preprocess):
    # Given
    t = "a b c d e f"
    u = text_to_utterance(t)
    builtin_ents = [
        {
            "value": "e",
            "resolved_value": "e",
            "range": {
                "start": 8,
                "end": 9
            },
            "entity_kind": "the_snips_e_entity"
        }
    ]
    custom_ents = [
        {
            "value": "c",
            "resolved_value": "c",
            "range": {
                "start": 4,
                "end": 5
            },
            "entity_kind": "the_c_entity"
        }
    ]
    mocked_preprocess.return_value = [u], [builtin_ents], [custom_ents]

    config = CooccurrenceVectorizerConfig(
        window_size=3,
        unknown_words_replacement_string="b",
        filter_stop_words=False,
        keep_order=False,
    )
    dataset = get_empty_dataset("en")
    shared = self.get_shared_data(dataset)

    # When
    expected_pairs = {
        ("THE_C_ENTITY", "THE_SNIPS_E_ENTITY"): 0,
        ("THE_C_ENTITY", "a"): 1,
        ("THE_C_ENTITY", "d"): 2,
        ("THE_C_ENTITY", "f"): 3,
        ("THE_SNIPS_E_ENTITY", "a"): 4,
        ("THE_SNIPS_E_ENTITY", "d"): 5,
        ("THE_SNIPS_E_ENTITY", "f"): 6,
        ("a", "d"): 7,
        ("d", "f"): 8,
    }
    vectorizer = CooccurrenceVectorizer(config, **shared).fit([u], dataset)

    # Then
    self.assertDictEqual(expected_pairs, vectorizer.word_pairs)

def test_fit_transform(self, mocked_preprocess):
    # Given
    t = "a b c d e f"
    u = text_to_utterance(t)
    builtin_ents = [
        {
            "value": "e",
            "resolved_value": "e",
            "range": {
                "start": 8,
                "end": 9
            },
            "entity_kind": "the_snips_e_entity"
        }
    ]
    custom_ents = [
        {
            "value": "c",
            "resolved_value": "c",
            "range": {
                "start": 4,
                "end": 5
            },
            "entity_kind": "the_c_entity"
        }
    ]
    mocked_preprocess.return_value = [u], [builtin_ents], [custom_ents]

    config = CooccurrenceVectorizerConfig(
        window_size=3,
        unknown_words_replacement_string="b",
        filter_stop_words=False)
    dataset = get_empty_dataset("en")

    builtin_parser = EntityParserMock({t: builtin_ents})
    custom_parser = EntityParserMock({t: custom_ents})
    resources = {STOP_WORDS: set()}
    vectorizer1 = CooccurrenceVectorizer(
        config, builtin_entity_parser=builtin_parser,
        custom_entity_parser=custom_parser, resources=resources)
    vectorizer2 = CooccurrenceVectorizer(
        config, builtin_entity_parser=builtin_parser,
        custom_entity_parser=custom_parser, resources=resources)

    # When
    x = [u]
    x_0 = vectorizer1.fit(x, dataset).transform(x).todense().tolist()
    x_1 = vectorizer2.fit_transform(x, dataset).todense().tolist()

    # Then
    self.assertListEqual(x_0, x_1)

def test_add_unknown_word_to_utterances_with_zero_max_unknown_words(self):
    # Given
    utterances = [text_to_utterance("yo")]
    replacement_string = "yo"
    unknown_word_prob = 1
    max_unknown_words = 0
    random_state = np.random.RandomState()

    # When / Then
    with self.fail_if_exception("Failed to augment utterances with "
                                "max_unknown_words=0"):
        add_unknown_word_to_utterances(
            utterances, replacement_string, unknown_word_prob,
            max_unknown_words, random_state)

def test_empty_vocabulary_should_fit_and_return_none_intent(
        self, mocked_build_training):
    # Given
    language = LANGUAGE_EN
    dataset = {
        "entities": {
            "dummy_entity_1": {
                "automatically_extensible": True,
                "use_synonyms": False,
                "data": [
                    {
                        "value": "...",
                        "synonyms": [],
                    }
                ],
                "matching_strictness": 1.0
            }
        },
        "intents": {
            "dummy_intent_1": {
                "utterances": [
                    {
                        "data": [
                            {
                                "text": "...",
                                "slot_name": "dummy_slot_name",
                                "entity": "dummy_entity_1"
                            }
                        ]
                    }
                ]
            }
        },
        "language": language
    }
    dataset = validate_and_format_dataset(dataset)

    text = " "
    noise_size = 6
    utterances = [text] + [text] * noise_size
    utterances = [text_to_utterance(t) for t in utterances]
    labels = [0] + [1] * noise_size
    intent_list = ["dummy_intent_1", None]
    mocked_build_training.return_value = utterances, labels, intent_list

    # When / Then
    intent_classifier = LogRegIntentClassifier().fit(dataset)
    intent = intent_classifier.get_intent("no intent there")
    self.assertEqual(None, intent)

def get_intent(self, text, intents_filter=None):
    """Performs intent classification on the provided *text*

    Args:
        text (str): input
        intents_filter (str or list of str): when defined, it will find
            the most likely intent among the list, otherwise it will use
            the whole list of intents defined in the dataset

    Returns:
        dict or None: the most likely intent along with its probability or
        *None* if no intent was found

    Raises:
        NotTrained: when the intent classifier is not fitted
    """
    if not self.fitted:
        raise NotTrained('LogRegIntentClassifier must be fitted')

    if isinstance(intents_filter, str):
        intents_filter = [intents_filter]

    if not text or not self.intent_list \
            or self.featurizer is None or self.classifier is None:
        return None

    if len(self.intent_list) == 1:
        if self.intent_list[0] is None:
            return None
        return intent_classification_result(self.intent_list[0], 1.0)

    # pylint: disable=C0103
    X = self.featurizer.transform([text_to_utterance(text)])
    # pylint: enable=C0103
    proba_vec = self._predict_proba(X, intents_filter=intents_filter)
    intents_probas = sorted(zip(self.intent_list, proba_vec[0]),
                            key=lambda p: -p[1])
    for intent, proba in intents_probas:
        if intent is None:
            return None
        if intents_filter is None or intent in intents_filter:
            return intent_classification_result(intent, proba)
    return None

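# --- Illustrative sketch (not part of the library) ---
# get_intent above sorts intents by descending probability and returns the
# first one that passes the filter, giving up as soon as the None
# (fallback) intent ranks highest. The self-contained sketch below
# reproduces that selection logic on plain tuples; the probabilities are
# made up and the result dicts merely mirror the shape produced by
# intent_classification_result.
def pick_intent(intents_probas, intents_filter=None):
    # intents_probas: iterable of (intent_name_or_None, probability)
    for intent, proba in sorted(intents_probas, key=lambda p: -p[1]):
        if intent is None:
            return None  # the fallback intent wins: no intent is returned
        if intents_filter is None or intent in intents_filter:
            return {"intentName": intent, "probability": proba}
    return None

assert pick_intent([("MakeTea", 0.2), (None, 0.1), ("MakeCoffee", 0.7)]) \
    == {"intentName": "MakeCoffee", "probability": 0.7}
assert pick_intent([("MakeTea", 0.2), (None, 0.7)]) is None
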
def test_preprocess(self):
    # Given
    language = LANGUAGE_EN
    resources = {
        STEMS: {
            "beautiful": "beauty",
            "birdy": "bird",
            "entity": "ent"
        },
        WORD_CLUSTERS: {
            "my_word_clusters": {
                "beautiful": "cluster_1",
                "birdy": "cluster_2",
                "entity": "cluster_3"
            }
        },
        STOP_WORDS: set()
    }

    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - dummy utterance

---
type: entity
name: entity_1
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITH_STEMS, resources)
    builtin_entity_parser = BuiltinEntityParser.build(dataset, language)
    utterances = [
        text_to_utterance("hÉllo wOrld Éntity_2"),
        text_to_utterance("beauTiful World entity 1"),
        text_to_utterance("Bird bïrdy"),
        text_to_utterance("Bird birdy"),
    ]

    config = TfidfVectorizerConfig(
        use_stemming=True, word_clusters_name="my_word_clusters")
    vectorizer = TfidfVectorizer(
        config=config,
        custom_entity_parser=custom_entity_parser,
        builtin_entity_parser=builtin_entity_parser,
        resources=resources)
    vectorizer._language = language
    vectorizer.builtin_entity_scope = {"snips/number"}

    # When
    processed_data = vectorizer._preprocess(utterances)
    processed_data = list(zip(*processed_data))

    # Then
    u_0 = {"data": [{"text": "hello world entity_2"}]}
    u_1 = {"data": [{"text": "beauty world ent 1"}]}
    u_2 = {"data": [{"text": "bird bird"}]}
    u_3 = {"data": [{"text": "bird bird"}]}

    ent_0 = {
        "entity_kind": "entity_2",
        "value": "entity_2",
        "resolved_value": "Éntity 2",
        "range": {
            "start": 12,
            "end": 20
        }
    }
    num_0 = {
        "entity_kind": "snips/number",
        "value": "2",
        "resolved_value": {
            "value": 2.0,
            "kind": "Number"
        },
        "range": {
            "start": 19,
            "end": 20
        }
    }
    ent_11 = {
        "entity_kind": "entity_1",
        "value": "ent 1",
        "resolved_value": "entity 1",
        "range": {
            "start": 13,
            "end": 18
        }
    }
    ent_12 = {
        "entity_kind": "entity_2",
        "value": "ent 1",
        "resolved_value": "entity 1",
        "range": {
            "start": 13,
            "end": 18
        }
    }
    num_1 = {
        "entity_kind": "snips/number",
        "value": "1",
        "resolved_value": {
            "value": 1.0,
            "kind": "Number"
        },
        "range": {
            "start": 23,
            "end": 24
        }
    }

    expected_data = [
        (u_0, [num_0], [ent_0], []),
        (u_1, [num_1], [ent_11, ent_12], ["cluster_1", "cluster_3"]),
        (u_2, [], [], []),
        (u_3, [], [], ["cluster_2"])
    ]

    self.assertSequenceEqual(expected_data, processed_data)

def test_preprocess(self):
    # Given
    language = LANGUAGE_EN
    resources = {
        STEMS: {
            "beautiful": "beauty",
            "birdy": "bird",
            "entity": "ent"
        },
        WORD_CLUSTERS: {
            "my_word_clusters": {
                "beautiful": "cluster_1",
                "birdy": "cluster_2",
                "entity": "cluster_3"
            }
        },
        STOP_WORDS: set()
    }

    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synonyms: false
matching_strictness: 1.0
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synonyms: true
matching_strictness: 1.0
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]
""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    custom_entity_parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITHOUT_STEMS, resources)
    builtin_entity_parser = BuiltinEntityParser.build(dataset, language)

    u_0 = text_to_utterance("hÉllo wOrld Éntity_2")
    u_1 = text_to_utterance("beauTiful World entity 1")
    u_2 = text_to_utterance("Bird bïrdy")
    u_3 = text_to_utterance("Bird birdy")
    utterances = [u_0, u_1, u_2, u_3]

    vectorizer = CooccurrenceVectorizer(
        custom_entity_parser=custom_entity_parser,
        builtin_entity_parser=builtin_entity_parser,
        resources=resources)
    vectorizer._language = language

    # When
    processed_data = vectorizer._preprocess(utterances)
    processed_data = list(zip(*processed_data))

    # Then
    ent_0 = {
        "entity_kind": "entity_2",
        "value": "Éntity_2",
        "resolved_value": "Éntity 2",
        "range": {
            "start": 12,
            "end": 20
        }
    }
    num_0 = {
        "entity_kind": "snips/number",
        "value": "2",
        "resolved_value": {
            "value": 2.0,
            "kind": "Number"
        },
        "range": {
            "start": 19,
            "end": 20
        }
    }
    ent_11 = {
        "entity_kind": "entity_1",
        "value": "entity 1",
        "resolved_value": "entity 1",
        "range": {
            "start": 16,
            "end": 24
        }
    }
    ent_12 = {
        "entity_kind": "entity_2",
        "value": "entity 1",
        "resolved_value": "entity 1",
        "range": {
            "start": 16,
            "end": 24
        }
    }
    num_1 = {
        "entity_kind": "snips/number",
        "value": "1",
        "resolved_value": {
            "value": 1.0,
            "kind": "Number"
        },
        "range": {
            "start": 23,
            "end": 24
        }
    }

    expected_data = [
        (u_0, [num_0], [ent_0]),
        (u_1, [num_1], [ent_11, ent_12]),
        (u_2, [], []),
        (u_3, [], [])
    ]

    self.assertSequenceEqual(expected_data, processed_data)

def test_enrich_utterance(self):
    # Given
    utterances = [
        {
            "data": [
                {
                    "text": "one",
                    "entity": "snips/number"
                },
                {
                    "text": "beauty world",
                },
                {
                    "text": "ent 1",
                    "entity": "dummy_entity_1"
                },
            ]
        },
        text_to_utterance("one beauty world ent 1"),
        text_to_utterance("hello world entity_2"),
        text_to_utterance("bird bird"),
    ]

    builtin_ents = [
        [{
            "value": "one",
            "resolved_value": 1,
            "range": {
                "start": 0,
                "end": 3
            },
            "entity_kind": "snips/number"
        }],
        [{
            "value": "one",
            "resolved_value": 1,
            "range": {
                "start": 0,
                "end": 3
            },
            "entity_kind": "snips/number"
        }, {
            "value": "1",
            "resolved_value": 1,
            "range": {
                "start": 27,
                "end": 28
            },
            "entity_kind": "snips/number"
        }],
        [{
            "value": "2",
            "resolved_value": 2,
            "range": {
                "start": 19,
                "end": 20
            },
            "entity_kind": "snips/number"
        }],
        []
    ]
    custom_ents = [
        [{
            "value": "ent 1",
            "resolved_value": "entity 1",
            "range": {
                "start": 20,
                "end": 28
            },
            "entity_kind": "dummy_entity_1"
        }],
        [{
            "value": "ent 1",
            "resolved_value": "entity 1",
            "range": {
                "start": 20,
                "end": 28
            },
            "entity_kind": "dummy_entity_1"
        }],
        [{
            "value": "entity_2",
            "resolved_value": "Éntity_2",
            "range": {
                "start": 12,
                "end": 20
            },
            "entity_kind": "dummy_entity_2"
        }],
        []
    ]
    w_clusters = [
        ["111", "112"],
        ["111", "112"],
        [],
        []
    ]

    vectorizer = TfidfVectorizer()
    vectorizer._language = "en"

    # When
    enriched_utterances = [
        vectorizer._enrich_utterance(*data)
        for data in zip(utterances, builtin_ents, custom_ents, w_clusters)
    ]

    # Then
    expected_u0 = "beauty world ent 1 " \
                  "builtinentityfeaturesnipsnumber " \
                  "entityfeaturedummy_entity_1 111 112"
    expected_u1 = "one beauty world ent 1 " \
                  "builtinentityfeaturesnipsnumber " \
                  "builtinentityfeaturesnipsnumber " \
                  "entityfeaturedummy_entity_1 111 112"
    expected_u2 = "hello world entity_2 builtinentityfeaturesnipsnumber " \
                  "entityfeaturedummy_entity_2"
    expected_u3 = "bird bird"
    expected_utterances = [
        expected_u0,
        expected_u1,
        expected_u2,
        expected_u3
    ]
    self.assertEqual(expected_utterances, enriched_utterances)

def test_transform(self):
    # Given
    config = CooccurrenceVectorizerConfig(
        filter_stop_words=True,
        window_size=3,
        unknown_words_replacement_string="d")

    t_0 = "yo a b c d e f yo"
    t_1 = "yo a b c d e"
    u_0 = text_to_utterance(t_0)
    u_1 = text_to_utterance(t_1)

    resources = {STOP_WORDS: {"b"}}

    builtin_ents = [{
        "value": "e",
        "resolved_value": "e",
        "range": {
            "start": 11,
            "end": 12
        },
        "entity_kind": "the_snips_e_entity"
    }]
    custom_ents = [{
        "value": "c",
        "resolved_value": "c",
        "range": {
            "start": 7,
            "end": 8
        },
        "entity_kind": "the_c_entity"
    }]
    builtin_parser = EntityParserMock({
        t_0: builtin_ents,
        t_1: builtin_ents
    })
    custom_parser = EntityParserMock({t_0: custom_ents, t_1: custom_ents})

    vectorizer = CooccurrenceVectorizer(
        config, builtin_entity_parser=builtin_parser,
        custom_entity_parser=custom_parser, resources=resources)
    vectorizer._language = "en"
    vectorizer._word_pairs = {
        ("THE_SNIPS_E_ENTITY", "f"): 0,
        ("a", "THE_C_ENTITY"): 1,
        ("a", "THE_SNIPS_E_ENTITY"): 2,
        ("b", "THE_SNIPS_E_ENTITY"): 3,
        ("yo", "yo"): 4,
        ("d", "THE_SNIPS_E_ENTITY"): 5
    }

    data = [u_0, u_1]

    # When
    x = vectorizer.transform(data)

    # Then
    expected = [[1, 1, 1, 0, 0, 0],
                [0, 1, 1, 0, 0, 0]]
    self.assertEqual(expected, x.todense().tolist())

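# --- Illustrative sketch (not part of the test suite) ---
# The expected matrix above follows from a simple rule: once stop words
# (here "b") and the unknown-words replacement string (here "d") are
# dropped, and entity values are replaced by their placeholders, every
# token is paired with the tokens that follow it within `window_size`
# positions. The self-contained sketch below reproduces that counting
# rule; it is an assumption-level re-implementation for illustration, not
# the library code.
def window_pairs(tokens, window_size, dropped):
    kept = [t for t in tokens if t not in dropped]
    pairs = set()
    for i, first in enumerate(kept):
        # Pair each token with the next `window_size` surviving tokens
        for second in kept[i + 1:i + 1 + window_size]:
            pairs.add((first, second))
    return pairs

# Tokens of t_0 after entity replacement, before filtering
tokens_0 = ["yo", "a", "b", "THE_C_ENTITY", "d",
            "THE_SNIPS_E_ENTITY", "f", "yo"]
pairs_0 = window_pairs(tokens_0, window_size=3, dropped={"b", "d"})
# Matches the first expected row: these tracked pairs fire...
assert ("a", "THE_C_ENTITY") in pairs_0
assert ("a", "THE_SNIPS_E_ENTITY") in pairs_0
assert ("THE_SNIPS_E_ENTITY", "f") in pairs_0
# ...while the two "yo" occurrences are too far apart for the window
assert ("yo", "yo") not in pairs_0
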
def test_preprocess_utterances(self, mocked_stem, mocked_word_cluster):
    # Given
    language = LANGUAGE_EN

    def _stem(t):
        if t == "beautiful":
            s = "beauty"
        elif t == "birdy":
            s = "bird"
        elif t == "entity":
            s = "ent"
        else:
            s = t
        return s

    def stem_function(text, language):
        return get_default_sep(language).join(
            [_stem(t) for t in tokenize_light(text, language)])

    mocked_word_cluster.return_value = {
        "beautiful": "cluster_1",
        "birdy": "cluster_2",
        "entity": "cluster_3"
    }

    mocked_stem.side_effect = stem_function

    dataset = {
        "intents": {
            "intent1": {
                "utterances": []
            }
        },
        "entities": {
            "entity_1": {
                "data": [{
                    "value": "entity 1",
                    "synonyms": ["alternative entity 1"]
                }, {
                    "value": "éntity 1",
                    "synonyms": ["alternative entity 1"]
                }],
                "use_synonyms": False,
                "automatically_extensible": False
            },
            "entity_2": {
                "data": [{
                    "value": "entity 1",
                    "synonyms": []
                }, {
                    "value": "Éntity 2",
                    "synonyms": ["Éntity_2", "Alternative entity 2"]
                }],
                "use_synonyms": True,
                "automatically_extensible": False
            },
            "snips/number": {}
        },
        "language": "en",
    }
    dataset = validate_and_format_dataset(dataset)

    utterances = [
        text_to_utterance("hÉllo wOrld Éntity_2"),
        text_to_utterance("beauTiful World entity 1"),
        text_to_utterance("Bird bïrdy"),
    ]

    labeled_utterance = {
        DATA: [
            {
                TEXT: "beauTiful éntity "
            },
            {
                TEXT: "1",
                ENTITY: "snips/number",
                SLOT_NAME: "number"
            },
            {
                TEXT: " bIrd Éntity_2"
            }
        ]
    }
    utterances.append(labeled_utterance)
    labels = np.array([0, 0, 1, 1])

    featurizer = Featurizer(
        language,
        None,
        config=FeaturizerConfig(word_clusters_name="brown_clusters")
    ).fit(dataset, utterances, labels)

    # When
    utterances = featurizer.preprocess_utterances(utterances)

    # Then
    expected_utterances = [
        "hello world entity_2 builtinentityfeaturesnipsnumber "
        "entityfeatureentity_2",
        "beauty world ent 1 builtinentityfeaturesnipsnumber "
        "entityfeatureentity_1 entityfeatureentity_2 "
        "cluster_1 cluster_3",
        "bird bird",
        "beauty ent bird entity_2 builtinentityfeaturesnipsnumber "
        "builtinentityfeaturesnipsnumber entityfeatureentity_1 "
        "entityfeatureentity_2 entityfeatureentity_2 cluster_1"
    ]
    self.assertListEqual(utterances, expected_utterances)

def test_should_be_serializable(self):
    # Given
    language = LANGUAGE_EN
    tfidf_vectorizer = _get_tfidf_vectorizer(language)

    pvalue_threshold = 0.42
    featurizer = Featurizer(
        language,
        config=FeaturizerConfig(
            pvalue_threshold=pvalue_threshold,
            word_clusters_name="brown_clusters"),
        unknown_words_replacement_string=None,
        tfidf_vectorizer=tfidf_vectorizer)
    dataset = {
        "entities": {
            "entity2": {
                "data": [{
                    "value": "entity1",
                    "synonyms": ["entity1"]
                }],
                "use_synonyms": True,
                "automatically_extensible": True
            }
        },
        "intents": {},
        "language": "en"
    }
    dataset = validate_and_format_dataset(dataset)

    utterances = [
        "hello world",
        "beautiful world",
        "hello here",
        "bird birdy",
        "beautiful bird"
    ]
    utterances = [text_to_utterance(u) for u in utterances]

    classes = np.array([0, 0, 0, 1, 1])

    featurizer.fit(dataset, utterances, classes)

    # When
    serialized_featurizer = featurizer.to_dict()

    # Then
    msg = "Featurizer dict should be json serializable to utf8."
    with self.fail_if_exception(msg):
        dumped = json_string(serialized_featurizer)

    msg = "SnipsNLUEngine should be deserializable from dict with " \
          "unicode values"
    with self.fail_if_exception(msg):
        _ = Featurizer.from_dict(json.loads(dumped))

    vocabulary = tfidf_vectorizer.vocabulary_
    # pylint: disable=W0212
    idf_diag = tfidf_vectorizer._tfidf._idf_diag.data.tolist()
    # pylint: enable=W0212

    best_features = featurizer.best_features
    entity_utterances_to_feature_names = {
        "entity1": ["entityfeatureentity2"]
    }

    expected_serialized = {
        "config": {
            "sublinear_tf": False,
            "pvalue_threshold": pvalue_threshold,
            "word_clusters_name": "brown_clusters"
        },
        "language_code": "en",
        "tfidf_vectorizer": {
            "idf_diag": idf_diag,
            "vocab": vocabulary
        },
        "best_features": best_features,
        "entity_utterances_to_feature_names":
            entity_utterances_to_feature_names,
        "unknown_words_replacement_string": None
    }
    self.assertDictEqual(expected_serialized, serialized_featurizer)