def test_cooccurrence_vectorizer_should_persist(self):
    # Given
    x = [text_to_utterance("yoo yoo")]
    dataset = get_empty_dataset("en")
    shared = self.get_shared_data(dataset)
    vectorizer = CooccurrenceVectorizer(**shared).fit(x, dataset)
    vectorizer.builtin_entity_scope = {"snips/entity"}

    # When
    vectorizer.persist(self.tmp_file_path)

    # Then
    metadata_path = self.tmp_file_path / "metadata.json"
    expected_metadata = {"unit_name": "cooccurrence_vectorizer"}
    self.assertJsonContent(metadata_path, expected_metadata)

    vectorizer_path = self.tmp_file_path / "vectorizer.json"
    expected_vectorizer = {
        "word_pairs": {
            "0": ["yoo", "yoo"]
        },
        "language_code": "en",
        "config": vectorizer.config.to_dict(),
        "builtin_entity_scope": ["snips/entity"]
    }
    self.assertJsonContent(vectorizer_path, expected_vectorizer)
def test_limit_vocabulary(self):
    # Given
    config = CooccurrenceVectorizerConfig(filter_stop_words=False)
    vectorizer = CooccurrenceVectorizer(config=config)
    train_data = [
        text_to_utterance(t) for t in ("a b", "a c", "a d", "a e")
    ]
    data = [text_to_utterance(t) for t in ("a c e", "a d e")]
    vectorizer.fit(train_data, get_empty_dataset("en"))
    x_0 = vectorizer.transform(data)
    pairs = {("a", "b"): 0, ("a", "c"): 1, ("a", "d"): 2, ("a", "e"): 3}
    kept_pairs = [("a", "b"), ("a", "c"), ("a", "d")]
    self.assertDictEqual(pairs, vectorizer.word_pairs)

    # When
    kept_pairs_indexes = [pairs[p] for p in kept_pairs]
    vectorizer.limit_word_pairs(kept_pairs)

    # Then
    expected_pairs = {("a", "b"): 0, ("a", "c"): 1, ("a", "d"): 2}
    self.assertDictEqual(expected_pairs, vectorizer.word_pairs)
    x_1 = vectorizer.transform(data)
    self.assertListEqual(
        x_0[:, kept_pairs_indexes].todense().tolist(),
        x_1.todense().tolist())
def test_limit_vocabulary(self):
    # Given
    vectorizer = TfidfVectorizer()
    dataset = get_empty_dataset("en")
    utterances = [
        text_to_utterance("5 55 6 66 666"),
        text_to_utterance("55 66")
    ]
    voca = {"5": 0, "55": 1, "6": 2, "66": 3, "666": 4}
    kept_unigrams = ["5", "6", "666"]
    vectorizer.fit(utterances, dataset)
    self.assertDictEqual(voca, vectorizer.vocabulary)
    diag = vectorizer.idf_diag.copy()

    # When
    vectorizer.limit_vocabulary(kept_unigrams)

    # Then
    expected_voca = {"5": 0, "6": 1, "666": 2}
    self.assertDictEqual(expected_voca, vectorizer.vocabulary)
    expected_diag = diag[[voca[u] for u in kept_unigrams]].tolist()
    self.assertListEqual(expected_diag, vectorizer.idf_diag.tolist())
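# The two limit_* tests above pin down the same feature-limiting contract:
# kept features are re-indexed contiguously from 0 in their given order, and
# any per-feature state (matrix columns, idf weights) is sliced to match.
# Below is a minimal, self-contained sketch of that re-indexing step; the
# names `vocabulary` and `idf_diag` mirror the attributes asserted on above,
# but `limit_features` is an illustration, not the library implementation.
import numpy as np

def limit_features(vocabulary, idf_diag, kept_features):
    """Restrict `vocabulary` to `kept_features`, slicing `idf_diag` to match."""
    kept_indexes = [vocabulary[f] for f in kept_features]  # KeyError on unknown features
    new_vocabulary = {f: i for i, f in enumerate(kept_features)}
    new_idf_diag = idf_diag[kept_indexes]
    return new_vocabulary, new_idf_diag

# With the vocabulary from the TF-IDF test above:
# limit_features({"5": 0, "55": 1, "6": 2, "66": 3, "666": 4},
#                np.arange(5.0), ["5", "6", "666"])
# -> ({"5": 0, "6": 1, "666": 2}, array([0., 2., 4.]))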
def test_should_handle_empty_dataset(self):
    # Given
    dataset = validate_and_format_dataset(get_empty_dataset(LANGUAGE_EN))
    engine = SnipsNLUEngine().fit(dataset)

    # When
    result = engine.parse("hello world")

    # Then
    self.assertEqual(empty_result("hello world"), result)
def test_should_handle_empty_dataset(self):
    # Given
    dataset = get_empty_dataset(LANGUAGE_EN)
    shared = self.get_shared_data(dataset)
    engine = SnipsNLUEngine(**shared).fit(dataset)

    # When
    result = engine.parse("hello world")

    # Then
    self.assertEqual(empty_result("hello world", 1.0), result)
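# Both variants of this test rely on `empty_result` to build the parsing
# output of an engine trained on an empty dataset: a None intent (with the
# given probability in the second variant) and no slots. A rough sketch of
# that shape is given below; the RES_* constant values and helper bodies are
# assumptions for illustration, not taken from the library.
RES_INPUT, RES_INTENT, RES_SLOTS = "input", "intent", "slots"
RES_INTENT_NAME, RES_PROBA = "intentName", "probability"

def intent_classification_result(intent_name, probability):
    return {RES_INTENT_NAME: intent_name, RES_PROBA: probability}

def empty_result(input_text, probability=1.0):
    return {
        RES_INPUT: input_text,
        RES_INTENT: intent_classification_result(None, probability),
        RES_SLOTS: [],
    }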
def test_fit_with_no_utterance_should_raise(self):
    # Given
    utterances = []
    classes = []
    dataset = get_empty_dataset("en")

    # When / Then
    with self.assertRaises(_EmptyDatasetUtterancesError) as ctx:
        Featurizer().fit_transform(dataset, utterances, classes, None)
    self.assertEqual("Tokenized utterances are empty", str(ctx.exception))
def test_should_get_none_if_empty_dataset(self):
    # Given
    dataset = validate_and_format_dataset(get_empty_dataset(LANGUAGE_EN))
    classifier = LogRegIntentClassifier().fit(dataset)
    text = "this is a dummy query"

    # When
    intent = classifier.get_intent(text)

    # Then
    expected_intent = None
    self.assertEqual(intent, expected_intent)
def test_should_get_intents_when_empty_dataset(self):
    # Given
    dataset = get_empty_dataset(LANGUAGE_EN)
    classifier = LogRegIntentClassifier().fit(dataset)
    text = "this is a dummy query"

    # When
    results = classifier.get_intents(text)

    # Then
    expected_results = [{RES_INTENT_NAME: None, RES_PROBA: 1.0}]
    self.assertEqual(expected_results, results)
def test_should_get_none_intent_when_empty_dataset(self):
    # Given
    dataset = get_empty_dataset(LANGUAGE_EN)
    classifier = LogRegIntentClassifier().fit(dataset)
    text = "this is a dummy query"

    # When
    intent = classifier.get_intent(text)

    # Then
    expected_intent = intent_classification_result(None, 1.0)
    self.assertEqual(intent, expected_intent)
def test_fit_transform(self, mocked_preprocess):
    # Given
    t = "a b c d e f"
    u = text_to_utterance(t)
    builtin_ents = [
        {
            "value": "e",
            "resolved_value": "e",
            "range": {
                "start": 8,
                "end": 9
            },
            "entity_kind": "the_snips_e_entity"
        }
    ]
    custom_ents = [
        {
            "value": "c",
            "resolved_value": "c",
            "range": {
                "start": 4,
                "end": 5
            },
            "entity_kind": "the_c_entity"
        }
    ]
    mocked_preprocess.return_value = [u], [builtin_ents], [custom_ents]

    config = CooccurrenceVectorizerConfig(
        window_size=3,
        unknown_words_replacement_string="b",
        filter_stop_words=False
    )
    dataset = get_empty_dataset("en")

    builtin_parser = EntityParserMock({t: builtin_ents})
    custom_parser = EntityParserMock({t: custom_ents})
    resources = {STOP_WORDS: set()}
    vectorizer1 = CooccurrenceVectorizer(
        config, builtin_entity_parser=builtin_parser,
        custom_entity_parser=custom_parser, resources=resources)
    vectorizer2 = CooccurrenceVectorizer(
        config, builtin_entity_parser=builtin_parser,
        custom_entity_parser=custom_parser, resources=resources)

    # When
    x = [u]
    x_0 = vectorizer1.fit(x, dataset).transform(x).todense().tolist()
    x_1 = vectorizer2.fit_transform(x, dataset).todense().tolist()

    # Then
    self.assertListEqual(x_0, x_1)
def test_fit_unordered(self, mocked_preprocess):
    # Given
    t = "a b c d e f"
    u = text_to_utterance(t)
    builtin_ents = [
        {
            "value": "e",
            "resolved_value": "e",
            "range": {
                "start": 8,
                "end": 9
            },
            "entity_kind": "the_snips_e_entity"
        }
    ]
    custom_ents = [
        {
            "value": "c",
            "resolved_value": "c",
            "range": {
                "start": 4,
                "end": 5
            },
            "entity_kind": "the_c_entity"
        }
    ]
    mocked_preprocess.return_value = [u], [builtin_ents], [custom_ents]

    config = CooccurrenceVectorizerConfig(
        window_size=3,
        unknown_words_replacement_string="b",
        filter_stop_words=False,
        keep_order=False,
    )
    dataset = get_empty_dataset("en")
    shared = self.get_shared_data(dataset)

    # When
    vectorizer = CooccurrenceVectorizer(config, **shared).fit([u], dataset)

    # Then
    expected_pairs = {
        ("THE_C_ENTITY", "THE_SNIPS_E_ENTITY"): 0,
        ("THE_C_ENTITY", "a"): 1,
        ("THE_C_ENTITY", "d"): 2,
        ("THE_C_ENTITY", "f"): 3,
        ("THE_SNIPS_E_ENTITY", "a"): 4,
        ("THE_SNIPS_E_ENTITY", "d"): 5,
        ("THE_SNIPS_E_ENTITY", "f"): 6,
        ("a", "d"): 7,
        ("d", "f"): 8,
    }
    self.assertDictEqual(expected_pairs, vectorizer.word_pairs)
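# The expected pairs above follow from sliding a window of size 3 over the
# entity-normalized tokens ["a", "THE_C_ENTITY", "d", "THE_SNIPS_E_ENTITY",
# "f"]: entities are replaced by uppercase placeholders and the unknown-word
# replacement string "b" is dropped. With keep_order=False, each pair is
# sorted so ("a", "THE_C_ENTITY") and ("THE_C_ENTITY", "a") collapse into a
# single feature. A small sketch of that pairing step, assuming tokens are
# already normalized; illustrative only, not the library implementation:
def unordered_pairs(tokens, window_size):
    pairs = set()
    for i, token in enumerate(tokens):
        # Pair each token with at most `window_size` following tokens,
        # sorting the pair so order does not matter.
        for other in tokens[i + 1:i + 1 + window_size]:
            pairs.add(tuple(sorted((token, other))))
    return {pair: idx for idx, pair in enumerate(sorted(pairs))}

# unordered_pairs(["a", "THE_C_ENTITY", "d", "THE_SNIPS_E_ENTITY", "f"], 3)
# reproduces the nine pairs and indexes asserted in the test above.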
def test_should_build_training_data_with_no_data(self):
    # Given
    language = LANGUAGE_EN
    dataset = validate_and_format_dataset(get_empty_dataset(language))
    random_state = np.random.RandomState(1)

    # When
    data_augmentation_config = LogRegIntentClassifierConfig() \
        .data_augmentation_config
    utterances, _, intent_mapping = build_training_data(
        dataset, language, data_augmentation_config, random_state)

    # Then
    expected_utterances = []
    expected_intent_mapping = []
    self.assertListEqual(utterances, expected_utterances)
    self.assertListEqual(intent_mapping, expected_intent_mapping)