def test_should_be_deserializable(self, mocked_cooccurrence_load, mocked_tfidf_load):
    """Check that ``Featurizer.from_path`` restores the language, both
    vectorizers and the config from a persisted ``featurizer.json``.

    The two vectorizer loaders are mocked so the test only exercises the
    featurizer-level deserialization logic.
    """
    # Given: the mocked loaders return sentinel strings we can assert on
    mocked_tfidf_load.return_value = "tfidf_vectorizer"
    mocked_cooccurrence_load.return_value = "cooccurrence_vectorizer"

    language = LANGUAGE_EN
    config = FeaturizerConfig()
    serialized_featurizer = {
        "language_code": language,
        "tfidf_vectorizer": "tfidf_vectorizer",
        "cooccurrence_vectorizer": "cooccurrence_vectorizer",
        "config": config.to_dict()
    }

    # Write the serialized dict where from_path expects to find it
    self.tmp_file_path.mkdir()
    json_path = self.tmp_file_path / "featurizer.json"
    with json_path.open("w", encoding="utf-8") as out_f:
        out_f.write(json_string(serialized_featurizer))

    # When
    featurizer = Featurizer.from_path(self.tmp_file_path)

    # Then
    self.assertEqual(language, featurizer.language)
    self.assertEqual("tfidf_vectorizer", featurizer.tfidf_vectorizer)
    self.assertEqual("cooccurrence_vectorizer", featurizer.cooccurrence_vectorizer)
    self.assertDictEqual(config.to_dict(), featurizer.config.to_dict())
def test_should_be_serializable_before_fit(self):
    """An unfitted featurizer must persist a JSON dict with null language
    and vectorizers, write its metadata, and create no vectorizer dirs."""
    # Given
    pvalue_threshold = 0.42
    config = FeaturizerConfig(pvalue_threshold=pvalue_threshold,
                              added_cooccurrence_feature_ratio=0.2)
    featurizer = Featurizer(config=config)

    # When
    featurizer.persist(self.tmp_file_path)

    # Then: nothing is fitted yet, so everything but the config is None
    expected_dict = {
        "language_code": None,
        "tfidf_vectorizer": None,
        "cooccurrence_vectorizer": None,
        "config": config.to_dict()
    }
    self.assertJsonContent(self.tmp_file_path / "featurizer.json",
                           expected_dict)
    self.assertJsonContent(self.tmp_file_path / "metadata.json",
                           {"unit_name": "featurizer"})

    # No vectorizer was fitted, hence no vectorizer directory on disk
    for vectorizer_dir in ("tfidf_vectorizer", "cooccurrence_vectorizer"):
        self.assertFalse((self.tmp_file_path / vectorizer_dir).exists())
def test_should_be_serializable(self):
    """A fitted featurizer must persist its JSON dict, metadata, and one
    directory per fitted vectorizer."""
    # Given: a minimal one-intent dataset built from inline YAML
    dataset_stream = io.StringIO("""
---
type: intent
name: dummy_intent
utterances:
  - this is the number [number:snips/number](one)
""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    pvalue_threshold = 0.42
    config = FeaturizerConfig(pvalue_threshold=pvalue_threshold,
                              added_cooccurrence_feature_ratio=0.2)
    shared = self.get_shared_data(dataset)
    featurizer = Featurizer(config=config, **shared)

    utterances = [
        text_to_utterance("this is the number"),
        text_to_utterance("yo")
    ]
    classes = np.array([0, 1])
    featurizer.fit(dataset, utterances, classes, max(classes))

    # When
    featurizer.persist(self.tmp_file_path)

    # Then: fitted state is reflected in the serialized dict
    expected_dict = {
        "language_code": "en",
        "tfidf_vectorizer": "tfidf_vectorizer",
        "cooccurrence_vectorizer": "cooccurrence_vectorizer",
        "config": config.to_dict()
    }
    self.assertJsonContent(self.tmp_file_path / "featurizer.json",
                           expected_dict)
    self.assertJsonContent(self.tmp_file_path / "metadata.json",
                           {"unit_name": "featurizer"})

    # Both vectorizers were fitted, so both directories must exist
    for vectorizer_dir in ("tfidf_vectorizer", "cooccurrence_vectorizer"):
        self.assertTrue((self.tmp_file_path / vectorizer_dir).exists())