Esempio n. 1
0
    def test_should_be_deserializable(self, mocked_cooccurrence_load,
                                      mocked_tfidf_load):
        # Given
        mocked_tfidf_load.return_value = "tfidf_vectorizer"
        mocked_cooccurrence_load.return_value = "cooccurrence_vectorizer"

        language = LANGUAGE_EN
        config = FeaturizerConfig()

        featurizer_dict = {
            "language_code": language,
            "tfidf_vectorizer": "tfidf_vectorizer",
            "cooccurrence_vectorizer": "cooccurrence_vectorizer",
            "config": config.to_dict()
        }

        self.tmp_file_path.mkdir()
        featurizer_path = self.tmp_file_path / "featurizer.json"
        with featurizer_path.open("w", encoding="utf-8") as f:
            f.write(json_string(featurizer_dict))

        # When
        featurizer = Featurizer.from_path(self.tmp_file_path)

        # Then
        self.assertEqual(language, featurizer.language)
        self.assertEqual("tfidf_vectorizer", featurizer.tfidf_vectorizer)
        self.assertEqual("cooccurrence_vectorizer",
                         featurizer.cooccurrence_vectorizer)
        self.assertDictEqual(config.to_dict(), featurizer.config.to_dict())
Esempio n. 2
0
    def test_should_be_serializable_before_fit(self):
        # Given
        pvalue_threshold = 0.42
        config = FeaturizerConfig(pvalue_threshold=pvalue_threshold,
                                  added_cooccurrence_feature_ratio=0.2)
        featurizer = Featurizer(config=config)

        # When
        featurizer.persist(self.tmp_file_path)

        # Then
        expected_featurizer_dict = {
            "language_code": None,
            "tfidf_vectorizer": None,
            "cooccurrence_vectorizer": None,
            "config": config.to_dict()
        }
        featurizer_dict_path = self.tmp_file_path / "featurizer.json"
        self.assertJsonContent(featurizer_dict_path, expected_featurizer_dict)

        expected_metadata = {"unit_name": "featurizer"}
        metadata_path = self.tmp_file_path / "metadata.json"
        self.assertJsonContent(metadata_path, expected_metadata)

        tfidf_vectorizer_path = self.tmp_file_path / "tfidf_vectorizer"
        self.assertFalse(tfidf_vectorizer_path.exists())

        cooc_vectorizer_path = self.tmp_file_path / "cooccurrence_vectorizer"
        self.assertFalse(cooc_vectorizer_path.exists())
Esempio n. 3
0
    def test_should_be_serializable(self):
        # Given

        dataset_stream = io.StringIO("""
---
type: intent
name: dummy_intent
utterances:
  - this is the number [number:snips/number](one)
""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        pvalue_threshold = 0.42
        config = FeaturizerConfig(pvalue_threshold=pvalue_threshold,
                                  added_cooccurrence_feature_ratio=0.2)
        shared = self.get_shared_data(dataset)
        featurizer = Featurizer(config=config, **shared)
        utterances = [
            text_to_utterance("this is the number"),
            text_to_utterance("yo")
        ]
        classes = np.array([0, 1])
        featurizer.fit(dataset, utterances, classes, max(classes))

        # When
        featurizer.persist(self.tmp_file_path)

        # Then
        expected_featurizer_dict = {
            "language_code": "en",
            "tfidf_vectorizer": "tfidf_vectorizer",
            "cooccurrence_vectorizer": "cooccurrence_vectorizer",
            "config": config.to_dict()
        }
        featurizer_dict_path = self.tmp_file_path / "featurizer.json"
        self.assertJsonContent(featurizer_dict_path, expected_featurizer_dict)

        expected_metadata = {"unit_name": "featurizer"}
        metadata_path = self.tmp_file_path / "metadata.json"
        self.assertJsonContent(metadata_path, expected_metadata)

        tfidf_vectorizer_path = self.tmp_file_path / "tfidf_vectorizer"
        self.assertTrue(tfidf_vectorizer_path.exists())

        cooc_vectorizer_path = self.tmp_file_path / "cooccurrence_vectorizer"
        self.assertTrue(cooc_vectorizer_path.exists())