Ejemplo n.º 1
    def test_cooccurrence_vectorizer_should_load(self):
        """Deserializing a vectorizer from disk must restore its config,
        language, word pairs and builtin entity scope."""
        # Given
        expected_config = CooccurrenceVectorizerConfig()
        expected_pairs = {("a", "b"): 0, ("a", "c"): 12}

        vectorizer_dict = {
            "unit_name": "cooccurrence_vectorizer",
            "language_code": "en",
            # On disk, pairs are stored as index -> [word, word]
            "word_pairs": {
                index: list(pair) for pair, index in expected_pairs.items()
            },
            "builtin_entity_scope": ["snips/datetime"],
            "config": expected_config.to_dict(),
        }

        self.tmp_file_path.mkdir()
        self.writeJsonContent(self.tmp_file_path / "vectorizer.json",
                              vectorizer_dict)

        # When
        loaded = CooccurrenceVectorizer.from_path(self.tmp_file_path)

        # Then
        self.assertDictEqual(expected_config.to_dict(),
                             loaded.config.to_dict())
        self.assertEqual("en", loaded.language)
        self.assertDictEqual(loaded.word_pairs, expected_pairs)
        self.assertEqual({"snips/datetime"}, loaded.builtin_entity_scope)
Ejemplo n.º 2
    def test_limit_vocabulary(self):
        """limit_word_pairs must drop the unwanted pairs, re-index the kept
        ones, and keep transform consistent with the retained columns."""
        # Given
        config = CooccurrenceVectorizerConfig(filter_stop_words=False)
        vectorizer = CooccurrenceVectorizer(config=config)
        train_utterances = [
            text_to_utterance(t) for t in ("a b", "a c", "a d", "a e")
        ]
        test_utterances = [text_to_utterance(t) for t in ("a c e", "a d e")]

        vectorizer.fit(train_utterances, get_empty_dataset("en"))
        before = vectorizer.transform(test_utterances)
        initial_pairs = {
            ("a", "b"): 0, ("a", "c"): 1, ("a", "d"): 2, ("a", "e"): 3}
        kept = [("a", "b"), ("a", "c"), ("a", "d")]
        self.assertDictEqual(initial_pairs, vectorizer.word_pairs)

        # When
        # Resolve the column indexes of the kept pairs before limiting,
        # since limiting re-indexes the vocabulary
        kept_indexes = [initial_pairs[pair] for pair in kept]
        vectorizer.limit_word_pairs(kept)

        # Then
        self.assertDictEqual(
            {("a", "b"): 0, ("a", "c"): 1, ("a", "d"): 2},
            vectorizer.word_pairs)
        after = vectorizer.transform(test_utterances)
        self.assertListEqual(before[:, kept_indexes].todense().tolist(),
                             after.todense().tolist())
Ejemplo n.º 3
    def test_fit_cooccurrence_vectorizer_feature_selection(self, mocked_chi2):
        """Only the word pairs whose mocked chi2 p-value is low enough
        should survive the ratio-based feature selection."""
        # Given
        featurizer = Featurizer(FeaturizerConfig(
            added_cooccurrence_feature_ratio=.3,
            cooccurrence_vectorizer_config=CooccurrenceVectorizerConfig(
                filter_stop_words=False)))
        mocked_dataset = {"language": "fr", "entities": {}, "intents": {}}
        utterances = [
            text_to_utterance(text)
            for text in ("a b c d e", "f g h i j", "none")
        ]

        # A fake tfidf vectorizer is enough here; only idf_diag is read
        mocked_tfidf = MagicMock()
        mocked_tfidf.idf_diag = range(10)
        featurizer.tfidf_vectorizer = mocked_tfidf
        classes = [0, 0, 1]

        # When: low p-values at positions 0, 2 and 4, high everywhere else
        pvalues = [0.1, 1.0, 0.2, 1.0, 0.3, 1.0] + [1.0] * 100
        mocked_chi2.return_value = (None, pvalues)
        featurizer._fit_cooccurrence_vectorizer(utterances, classes, 1,
                                                mocked_dataset)

        # Then
        expected_pairs = {("a", "b"): 0, ("a", "d"): 1, ("b", "c"): 2}
        self.assertDictEqual(expected_pairs,
                             featurizer.cooccurrence_vectorizer.word_pairs)
Ejemplo n.º 4
    def test_featurizer_config(self):
        """A FeaturizerConfig must round-trip through from_dict/to_dict."""
        # Given
        config_dict = {
            "unit_name": "featurizer",
            "pvalue_threshold": 0.2,
            "added_cooccurrence_feature_ratio": 0.2,
            "tfidf_vectorizer_config": TfidfVectorizerConfig().to_dict(),
            "cooccurrence_vectorizer_config":
                CooccurrenceVectorizerConfig().to_dict(),
        }

        # When
        roundtripped = FeaturizerConfig.from_dict(config_dict).to_dict()

        # Then
        self.assertDictEqual(config_dict, roundtripped)
    def test_fit_transform(self, mocked_preprocess):
        """fit(...).transform(...) and fit_transform(...) must yield the
        same matrix for the same data."""
        # Given
        text = "a b c d e f"
        utterance = text_to_utterance(text)
        builtin_ents = [{
            "value": "e",
            "resolved_value": "e",
            "range": {"start": 8, "end": 9},
            "entity_kind": "the_snips_e_entity",
        }]
        custom_ents = [{
            "value": "c",
            "resolved_value": "c",
            "range": {"start": 4, "end": 5},
            "entity_kind": "the_c_entity",
        }]
        mocked_preprocess.return_value = (
            [utterance], [builtin_ents], [custom_ents])

        config = CooccurrenceVectorizerConfig(
            window_size=3,
            unknown_words_replacement_string="b",
            filter_stop_words=False,
        )
        dataset = get_empty_dataset("en")

        # Both vectorizers share the exact same parsers and resources
        shared_kwargs = dict(
            builtin_entity_parser=EntityParserMock({text: builtin_ents}),
            custom_entity_parser=EntityParserMock({text: custom_ents}),
            resources={STOP_WORDS: set()},
        )
        vectorizer1 = CooccurrenceVectorizer(config, **shared_kwargs)
        vectorizer2 = CooccurrenceVectorizer(config, **shared_kwargs)

        # When
        data = [utterance]
        x_0 = vectorizer1.fit(data, dataset).transform(data) \
            .todense().tolist()
        x_1 = vectorizer2.fit_transform(data, dataset).todense().tolist()

        # Then
        self.assertListEqual(x_0, x_1)
    def test_fit_unordered(self, mocked_preprocess):
        """With keep_order=False, fitting must produce order-normalized,
        deduplicated word pairs."""
        # Given
        text = "a b c d e f"
        utterance = text_to_utterance(text)
        builtin_ents = [{
            "value": "e",
            "resolved_value": "e",
            "range": {"start": 8, "end": 9},
            "entity_kind": "the_snips_e_entity",
        }]
        custom_ents = [{
            "value": "c",
            "resolved_value": "c",
            "range": {"start": 4, "end": 5},
            "entity_kind": "the_c_entity",
        }]
        mocked_preprocess.return_value = (
            [utterance], [builtin_ents], [custom_ents])

        config = CooccurrenceVectorizerConfig(
            window_size=3,
            unknown_words_replacement_string="b",
            filter_stop_words=False,
            keep_order=False,
        )
        dataset = get_empty_dataset("en")
        shared = self.get_shared_data(dataset)

        # When
        vectorizer = CooccurrenceVectorizer(config, **shared).fit(
            [utterance], dataset)

        # Then
        expected_pairs = {
            ("THE_C_ENTITY", "THE_SNIPS_E_ENTITY"): 0,
            ("THE_C_ENTITY", "a"): 1,
            ("THE_C_ENTITY", "d"): 2,
            ("THE_C_ENTITY", "f"): 3,
            ("THE_SNIPS_E_ENTITY", "a"): 4,
            ("THE_SNIPS_E_ENTITY", "d"): 5,
            ("THE_SNIPS_E_ENTITY", "f"): 6,
            ("a", "d"): 7,
            ("d", "f"): 8,
        }
        self.assertDictEqual(expected_pairs, vectorizer.word_pairs)
Ejemplo n.º 7
    def test_cooccurrence_vectorizer_config(self):
        """A CooccurrenceVectorizerConfig must round-trip through
        from_dict/to_dict."""
        # Given
        config_dict = {
            "unit_name": "cooccurrence_vectorizer",
            "unknown_words_replacement_string": None,
            "window_size": 5,
            "filter_stop_words": True,
            "keep_order": True,
        }

        # When
        roundtripped = CooccurrenceVectorizerConfig \
            .from_dict(config_dict).to_dict()

        # Then
        self.assertDictEqual(config_dict, roundtripped)
Ejemplo n.º 8
    def test_transform(self):
        """transform must activate exactly the fitted pairs present in each
        utterance, honoring stop words, window size and the unknown-word
        replacement string."""
        # Given
        config = CooccurrenceVectorizerConfig(
            filter_stop_words=True,
            window_size=3,
            unknown_words_replacement_string="d")

        text_0 = "yo a b c d e f yo"
        text_1 = "yo a b c d e"
        utterances = [text_to_utterance(text_0), text_to_utterance(text_1)]

        builtin_ents = [{
            "value": "e",
            "resolved_value": "e",
            "range": {"start": 11, "end": 12},
            "entity_kind": "the_snips_e_entity",
        }]
        custom_ents = [{
            "value": "c",
            "resolved_value": "c",
            "range": {"start": 7, "end": 8},
            "entity_kind": "the_c_entity",
        }]

        vectorizer = CooccurrenceVectorizer(
            config,
            builtin_entity_parser=EntityParserMock(
                {text_0: builtin_ents, text_1: builtin_ents}),
            custom_entity_parser=EntityParserMock(
                {text_0: custom_ents, text_1: custom_ents}),
            # "b" is a stop word, so pairs involving it must stay inactive
            resources={STOP_WORDS: {"b"}})

        # Bypass fitting: install the vocabulary directly
        vectorizer._language = "en"
        vectorizer._word_pairs = {
            ("THE_SNIPS_E_ENTITY", "f"): 0,
            ("a", "THE_C_ENTITY"): 1,
            ("a", "THE_SNIPS_E_ENTITY"): 2,
            ("b", "THE_SNIPS_E_ENTITY"): 3,
            ("yo", "yo"): 4,
            ("d", "THE_SNIPS_E_ENTITY"): 5
        }

        # When
        x = vectorizer.transform(utterances)

        # Then
        expected = [[1, 1, 1, 0, 0, 0], [0, 1, 1, 0, 0, 0]]
        self.assertEqual(expected, x.todense().tolist())